コード例 #1
0
    def test_delete_auto_motif_failed(self):
        """Call delete_motif on two SEQ1 motifs and expect a non-None result.

        The SEQ1 motif list is re-read from the DAO before each call; the
        motifs at list positions 0 and 4 are the ones handed to delete_motif.
        NOTE(review): the test name suggests the deletion is expected to be
        refused, but only the non-None-ness of the result is asserted here.
        """
        for position in (0, 4):
            motifs_by_seq = dao.find_motifs_by_seq_ids(['SEQ1'], MOTIF_VERSION)
            target = motifs_by_seq['SEQ1'][position]
            outcome = delete_motif(target.id)
            self.assertIsNotNone(outcome)
コード例 #2
0
ファイル: lrr-search.py プロジェクト: phytolrr/phytolrr
def find_lrr_and_save_db(matrix, seq):
    """Search one sequence for LRR motifs and persist what was found.

    Args:
        matrix: Scoring matrix forwarded to ``motif_tool.lrr_search``.
        seq: Sequence record; ``seq.seq_id`` and ``seq.seq`` are read.

    Returns:
        None. The sequence is skipped (with a log entry) when motifs are
        already stored for it under MOTIF_VERSION, or when it contains
        characters that are not in VALID_AMINO.
    """
    seq_id = seq.seq_id
    stored = dao.find_motifs_by_seq_ids([seq_id], MOTIF_VERSION).get(seq_id, [])
    if len(stored) > 0:
        logging.warning("The sequence {} is already analysied before, skip it".format(seq_id))
        return

    residues = str(seq.seq)
    unknown = set(residues) - VALID_AMINO
    if len(unknown) > 0:
        logging.error("The sequence {} contains invalid amino {}".format(seq_id, unknown))
        return

    logging.info("Begin to find lrr in seq {}, {}".format(seq_id, residues))
    candidates = motif_tool.lrr_search(matrix, residues)
    found = motif_tool.found_no_overlapped_motifs(candidates)
    for motif in found:
        # Freshly detected motifs are attached to this sequence and start
        # out flagged as correct.
        motif.seq_id = seq_id
        motif.correct = True

    logging.info("Found {} LRR motifs in sequence {}".format(len(found), seq_id))
    _print_debug(found, seq)

    # Convert to DAO entities and replace whatever is stored for the sequence.
    entities = []
    for motif in found:
        entities.append(dao.motif_entity.MotifEntityBase(**motif.__dict__))
    logging.debug("Begin to write to db")
    with dao.session_scope() as session:
        dao.motif.replace_motifs_by_seq(session, seq_id, entities, MOTIF_VERSION)
コード例 #3
0
ファイル: motif_service.py プロジェクト: phytolrr/phytolrr
def get_and_check_offset(seq, version):
    """Read ``offset`` from the JSON request body and validate it.

    Args:
        seq: Sequence record; ``seq.seq`` and ``seq.seq_id`` are read.
        version: Motif version passed to ``dao.find_motifs_by_seq_ids``.

    Returns:
        The offset as converted by ``get_and_check_int``.

    Raises:
        ValidationError: if the body or the ``offset`` field is missing, the
            offset is negative, a 16-long window starting at the offset would
            run past the end of the sequence, the offset lies within 15
            positions of a motif flagged correct, or it equals the offset of
            a motif that is not flagged correct.
    """
    body = request.json
    offset = None if body is None else body.get('offset', None)
    if offset is None:
        raise ValidationError(ErrorCode.PARA_NOT_EXISTS.format("offset"))

    # The conversion error message is built from the raw value on purpose;
    # the range error below uses the converted value.
    offset = get_and_check_int(
        offset, ErrorCode.INVALID_PARA.format('offset', offset))
    if offset < 0 or offset + 16 > len(seq.seq):
        raise ValidationError(ErrorCode.INVALID_PARA.format('offset', offset))

    existing = dao.find_motifs_by_seq_ids([seq.seq_id], version).get(
        seq.seq_id, [])
    blocked = set()         # offsets that would overlap a correct motif
    wrong_offsets = set()   # offsets of motifs not flagged correct
    for motif in existing:
        if not motif.correct:
            wrong_offsets.add(motif.offset)
            continue
        blocked.update(range(motif.offset - 15, motif.offset + 16))

    if offset in blocked:
        raise ValidationError(ErrorCode.OFFSET_OVERLAP)
    if offset in wrong_offsets:
        raise ValidationError(ErrorCode.OFFSET_EXISTS_WRONG)
    return offset
コード例 #4
0
    def test_add_manually_motif_before_wrong_area(self):
        """Adding at offset 185 on SEQ2 is refused until the first motif is marked wrong."""
        # Load every sequence so SEQ2's database id can be looked up.
        with dao.query_session() as session:
            all_seqs = dao.sequence.find_all_seqs(session)
            seq_by_id = {s.seq_id: s for s in all_seqs}
        sid = seq_by_id['SEQ2'].id
        seq2_motifs = dao.find_motifs_by_seq_ids(['SEQ2'], MOTIF_VERSION)['SEQ2']
        seq2_motifs.sort(key=lambda m: m.offset)
        first_motif = seq2_motifs[0]

        def add_at_185():
            # Issue the request with a faked JSON body and decode the reply.
            with boddle(json={"offset": 185}):
                raw = lrr_search_web_service.add_manually_motif(
                    MOTIF_VERSION, sid)
            self.assertIsNotNone(raw)
            return json.loads(raw)

        # While the first motif is still in place the request is rejected as
        # an overlap.
        reply = add_at_185()
        self.assertDictEqual({'message': str.format(ErrorCode.OFFSET_OVERLAP)},
                             reply)

        # Mark that motif as wrong.
        mark_wrong(sid, first_motif.id)

        # The same request now succeeds.
        reply = add_at_185()
        self.assertTrue("motifs_16" in reply)
        self.assertEqual(185, reply['motifs_16'][0]['offset'])
コード例 #5
0
def get_sequences(version):
    """Return one page of sequences together with their motifs and N-sites.

    Filters are collected from the request through the ``_get_*_arg``
    helpers, the matching sequences are queried, and each sequence is
    rendered with ``sequence_entity_to_output``.

    Args:
        version: Raw version argument; resolved once via ``get_version_arg``.

    Returns:
        The value of ``response_ok`` for a dict with the keys ``sequences``
        (list of rendered sequences) and ``total``.
    """
    filters = {}
    filters['page_index'], filters['page_size'] = _get_page_arg()
    filters['keyword'] = _get_keyword_arg()
    _get_offset_arg(filters)
    _get_lrr_count_arg(filters)
    _get_species_arg(filters)

    # NOTE(review): presumably turns the page number into a row offset for
    # the query; confirm against dao.query_sequences.
    filters['page_index'] = filters['page_index'] * filters['page_size']
    logging.debug(str.format("Filters: {}", filters))

    # Resolve the version a single time instead of once per use.
    motif_version = get_version_arg(version)
    seqs, total = dao.query_sequences(filters, motif_version)

    # find motifs and nsites
    seq_ids = [seq.seq_id for seq in seqs]
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(seq_ids, motif_version)
    seq_ids_to_nsites = dao.find_nsites_by_seq_ids(seq_ids)

    # find tags, the overlap mark is tagged on version 1 motifs
    if motif_version == 1:
        motif_ids = {
            m.id for motifs in seq_ids_to_motifs.values() for m in motifs
        }
        ids_to_tag_names = dao.find_tags_by_motif_ids(motif_ids)
    else:
        ids_to_tag_names = {}

    sequences = [
        sequence_entity_to_output(seq,
                                  seq_ids_to_motifs.get(seq.seq_id, []),
                                  seq_ids_to_nsites.get(seq.seq_id, []),
                                  ids_to_tag_names)
        for seq in seqs
    ]
    return response_ok({'sequences': sequences, 'total': total}, True)
コード例 #6
0
    def test_add_manually_motif_after_wrong_area(self):
        """Adding at offset 595 on SEQ2 is refused until the last motif is marked wrong."""
        set_up_baseline_seq('SEQ4', '', start=10, count=10, step=24)
        # Load every sequence so SEQ2's database id can be looked up.
        with dao.query_session() as session:
            all_seqs = dao.sequence.find_all_seqs(session)
            seq_by_id = {s.seq_id: s for s in all_seqs}
        sid = seq_by_id['SEQ2'].id
        seq2_motifs = dao.find_motifs_by_seq_ids(['SEQ2'], MOTIF_VERSION)['SEQ2']
        seq2_motifs.sort(key=lambda m: m.offset)
        last_motif = seq2_motifs[-1]

        def add_at_595():
            # Issue the request with a faked JSON body and decode the reply.
            with boddle(json={"offset": 595}):
                raw = lrr_search_web_service.add_manually_motif(
                    MOTIF_VERSION, sid)
            self.assertIsNotNone(raw)
            return json.loads(raw)

        # While the last motif (by offset) is still in place the request is
        # rejected as an overlap.
        reply = add_at_595()
        self.assertDictEqual({'message': str.format(ErrorCode.OFFSET_OVERLAP)},
                             reply)

        # Mark that motif as wrong.
        mark_wrong(sid, last_motif.id)

        # The same request now succeeds and reports exactly one motif.
        reply = add_at_595()
        self.assertEqual(1, len(reply.get('motifs_16', [])))
        self.assertEqual(595, reply['motifs_16'][0].get('offset', -1))
コード例 #7
0
    def test_tag_multiple_times(self):
        """Marking the first motifs of SEQ1 and SEQ2 wrong a second time changes nothing."""
        # Run the tagging test first so the tags are already in place.
        self.test_tag_false_discovery()

        with dao.query_session() as session:
            all_seqs = dao.sequence.find_all_seqs(session)
            seq_by_id = {s.seq_id: s for s in all_seqs}

        seq_names = ('SEQ1', 'SEQ2', 'SEQ3')

        # Tag once more.
        motifs_by_seq = dao.find_motifs_by_seq_ids(set(seq_names),
                                                   MOTIF_VERSION)
        for name in ('SEQ1', 'SEQ2'):
            mark_wrong(seq_by_id[name].id, motifs_by_seq[name][0].id)

        # The outcome is the same as after the first tagging.
        for name, expected_correct in (('SEQ1', 9), ('SEQ2', 19),
                                       ('SEQ3', 30)):
            correct = dao.find_correct_motifs_by_seq_ids([name],
                                                         MOTIF_VERSION)[name]
            self.assertEqual(expected_correct, len(correct))

        motifs_by_seq = dao.find_motifs_by_seq_ids(set(seq_names),
                                                   MOTIF_VERSION)
        self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'}, set(motifs_by_seq.keys()))
        self.assertFalse(motifs_by_seq['SEQ1'][0].correct)
        self.assertFalse(motifs_by_seq['SEQ2'][0].correct)
        self.assertTrue(motifs_by_seq['SEQ3'][0].correct)
        for name, expected_total in (('SEQ1', 10), ('SEQ2', 20),
                                     ('SEQ3', 30)):
            self.assertEqual(expected_total, len(motifs_by_seq[name]))
コード例 #8
0
def main():
    """Report motif differences between versions and re-add missing motifs.

    First prints the (old, new) motif pairs returned by
    ``get_old_wrong_new_exists``. Then, for each motif returned by
    ``get_old_correct_new_not_exists``, scores its 16-long window with a
    PSSM built from the baseline motifs and adds it manually to NEW_VERSION
    unless that would overlap the motifs already stored there.
    """
    wrong_pairs = get_old_wrong_new_exists()
    print(
        "Get wrong in old and still exists in new, Count {}, {}".format(
            len(wrong_pairs), wrong_pairs))
    for _old, current in wrong_pairs:
        print(
            "Mark mid {}, seq_id {}, offset {} as false_discovery in version {}"
            .format(current.id, current.seq_id, current.offset, NEW_VERSION))
        # The update itself is disabled, so this loop only reports:
        # dao.update_false_discovery_by_motif(current.id, True, NEW_VERSION)

    matrix = pssm_matrix.calc_pssm_matrix(dao.find_baseline_motifs())
    missing = get_old_correct_new_not_exists()
    with dao.query_session() as session:
        wanted_ids = {m.seq_id for m in missing}
        found_seqs = dao.sequence.find_seq_by_ids(session, wanted_ids)
        seq_by_id = {s.seq_id: s for s in found_seqs}
        # Every referenced sequence must have been found.
        assert (len(seq_by_id) == len(wanted_ids))

    print(
        "Get correct in old and not exists in new, Count {}, {}".format(
            len(missing), missing))
    for old in missing:
        window = seq_by_id[old.seq_id].seq[old.offset:old.offset + 16]
        assert (len(window) == 16)
        score = motif_tools.calc_pssm_score(window, matrix)
        probability = motif_tools.calc_probability_by_score(score)
        candidate = dao.motif.MotifEntityBase(old.offset,
                                              old.seq_id,
                                              score,
                                              probability,
                                              0,
                                              manually_add=True)

        # Overlap test: append the candidate to the stored motifs (queried
        # with with_wrong=False) and see whether the no-overlap filter drops
        # anything.
        stored = dao.find_motifs_by_seq_ids([old.seq_id],
                                            NEW_VERSION,
                                            with_wrong=False)[old.seq_id]
        stored.append(candidate)
        kept = motif_tools.found_no_overlapped_motifs(stored, 16)
        if len(stored) != len(kept):
            logging.error(
                "Overlap found in seq {}, {}/{}, new offset {}".format(
                    old.seq_id, len(stored), len(kept), candidate.offset))
            continue

        logging.debug(
            "Add motif offset {}, score {}, probability {} to seq {} manually"
            .format(candidate.offset, candidate.score, candidate.probability,
                    candidate.seq_id))
        dao.add_manually_motif(candidate.seq_id, candidate.offset,
                               NEW_VERSION, candidate.score,
                               candidate.probability)
コード例 #9
0
 def _get_manually_motifs(self, seq_id):
     """Return the motifs of ``seq_id`` whose ``manually_add`` flag is set.

     Asserts that the sequence has motifs stored under MOTIF_VERSION and
     that at least one of them was added manually.
     """
     motifs_by_seq = dao.find_motifs_by_seq_ids([seq_id], MOTIF_VERSION)
     self.assertTrue(seq_id in motifs_by_seq)
     manual = [m for m in motifs_by_seq[seq_id] if m.manually_add]
     self.assertTrue(len(manual) > 0)
     return manual
コード例 #10
0
    def test_untag_false_discovery(self):
        """Unmarking SEQ1's first motif restores SEQ1's count of correct motifs."""
        # Run the tagging test first so the tags are already in place.
        self.test_tag_false_discovery()

        with dao.query_session() as session:
            all_seqs = dao.sequence.find_all_seqs(session)
            seq_by_id = {s.seq_id: s for s in all_seqs}

        seq_names = ('SEQ1', 'SEQ2', 'SEQ3')

        # Remove the wrong mark from the first motif of SEQ1.
        motifs_by_seq = dao.find_motifs_by_seq_ids(set(seq_names),
                                                   MOTIF_VERSION)
        unmark_wrong(seq_by_id['SEQ1'].id, motifs_by_seq['SEQ1'][0].id)

        # SEQ1 is back to its full count; SEQ2 still misses one.
        for name, expected_correct in (('SEQ1', 10), ('SEQ2', 19),
                                       ('SEQ3', 30)):
            correct = dao.find_correct_motifs_by_seq_ids([name],
                                                         MOTIF_VERSION)[name]
            self.assertEqual(expected_correct, len(correct))

        motifs_by_seq = dao.find_motifs_by_seq_ids(set(seq_names),
                                                   MOTIF_VERSION)
        self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'}, set(motifs_by_seq.keys()))
        self.assertTrue(motifs_by_seq['SEQ1'][0].correct)
        self.assertFalse(motifs_by_seq['SEQ2'][0].correct)
        self.assertTrue(motifs_by_seq['SEQ3'][0].correct)
コード例 #11
0
 def setUp(self):
     """Prepare the database fixtures and cache the sequences by seq_id.

     Calls ``set_up_db`` and ``set_up_seq`` for SEQ2, SEQ1 and SEQ3 (in that
     order), then checks that exactly those three sequences can be read back.
     """
     set_up_db()
     set_up_seq('SEQ2', 'SEQ2ABCDABCD', start=10, count=20)
     set_up_seq('SEQ1', 'SEQ1ABCDABCD')
     set_up_seq('SEQ3', 'SEQ3ABCDABCD', start=50, count=30)
     with dao.query_session() as session:
         loaded = dao.sequence.find_all_seqs(session)
     self.seq_ids_to_seq = {s.seq_id: s for s in loaded}
     self.assertEqual(3, len(self.seq_ids_to_seq))
     self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'}, set(self.seq_ids_to_seq))
     # The result is not used in the lines visible here; the query is kept.
     seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
         self.seq_ids_to_seq.keys(), MOTIF_VERSION)
コード例 #12
0
ファイル: generate_figs.py プロジェクト: phytolrr/phytolrr
def main():
    """Run the figure-generation steps in order, logging after each one."""
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)

    step1(seqs)
    logging.info("Step 1 end")

    # From step 3.4 on only the subgroups listed in SUBGROUPS are
    # considered, so the sequence list is narrowed down first.
    relevant_seqs = [s for s in seqs if s.subgroup in SUBGROUPS]
    relevant_ids = [s.seq_id for s in relevant_seqs]
    nsites_by_seq = dao.find_nsites_by_seq_ids(relevant_ids)
    lrrs_by_seq = dao.find_motifs_by_seq_ids(relevant_ids,
                                             MOTIF_VERSION,
                                             with_wrong=False)

    # These three steps take the same arguments.
    for step, done_message in ((step3_5, "Step 3.5 end"),
                               (step3_7, "Step 3.7 end"),
                               (step3_9, "Step 3.9 end")):
        step(relevant_seqs, nsites_by_seq, lrrs_by_seq)
        logging.info(done_message)

    step3_10(lrrs_by_seq, seqs)
    logging.info("Step 3.10 end")

    step3_11(seqs, lrrs_by_seq)
    logging.info("Step 3.11 end")

    # The supplement works on every sequence, not only the relevant ones.
    all_ids = [s.seq_id for s in seqs]
    all_nsites_by_seq = dao.find_nsites_by_seq_ids(all_ids)
    all_lrrs_by_seq = dao.find_motifs_by_seq_ids(all_ids,
                                                 MOTIF_VERSION,
                                                 with_wrong=False)
    supplement_for_review(seqs, all_nsites_by_seq, all_lrrs_by_seq)
    logging.info("Step supplement for review end")
コード例 #13
0
def main():
    """Generate all figures: load the sequences, then run each step in turn."""
    with dao.query_session() as session:
        every_seq = dao.sequence.find_all_seqs(session)

    step1(every_seq)
    logging.info("Step 1 end")

    # Starting with 3.4 only the specific subgroups in SUBGROUPS matter, so
    # trim the sequence list down to those.
    subgroup_seqs = []
    subgroup_ids = []
    for seq in every_seq:
        if seq.subgroup in SUBGROUPS:
            subgroup_seqs.append(seq)
            subgroup_ids.append(seq.seq_id)
    nsites = dao.find_nsites_by_seq_ids(subgroup_ids)
    lrrs = dao.find_motifs_by_seq_ids(subgroup_ids,
                                      MOTIF_VERSION,
                                      with_wrong=False)

    step3_5(subgroup_seqs, nsites, lrrs)
    logging.info("Step 3.5 end")

    step3_7(subgroup_seqs, nsites, lrrs)
    logging.info("Step 3.7 end")

    step3_9(subgroup_seqs, nsites, lrrs)
    logging.info("Step 3.9 end")

    step3_10(lrrs, every_seq)
    logging.info("Step 3.10 end")

    step3_11(every_seq, lrrs)
    logging.info("Step 3.11 end")

    # The review supplement is computed over the full sequence set.
    full_ids = [seq.seq_id for seq in every_seq]
    full_nsites = dao.find_nsites_by_seq_ids(full_ids)
    full_lrrs = dao.find_motifs_by_seq_ids(full_ids,
                                           MOTIF_VERSION,
                                           with_wrong=False)
    supplement_for_review(every_seq, full_nsites, full_lrrs)
    logging.info("Step supplement for review end")
コード例 #14
0
def main():
    """Build a per-sequence motif matrix, print it, and export a CSV summary.

    For every sequence returned by ``find_seqs`` the 16-long motif windows
    (motifs queried with ``with_wrong=False``) are cut out in offset order,
    a weblogo is generated, and a Biopython motif matrix is created. One CSV
    row per sequence is written to OUTPUT_FILE with the count and PWM value
    at ``[AMINO][POS]`` plus the newline-joined motif windows.
    """
    seqs = find_seqs()
    seq_ids_to_seq = {seq.seq_id: seq for seq in seqs}
    # logging formats its arguments with %-style placeholders; the previous
    # "{}" placeholders made the record fail to format, so nothing was logged.
    logging.info("Seqs count %s, seq_ids %s", len(seqs),
                 list(seq_ids_to_seq.keys()))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(seq_ids_to_seq.keys(),
                                                   MOTIF_VERSION,
                                                   with_wrong=False)

    seq_ids_to_matrix = {}
    for seq_id, motif_entities in seq_ids_to_motifs.items():
        # Sorted in place, so the CSV export below sees the same order.
        motif_entities.sort(key=lambda m: m.offset)
        seq_str = seq_ids_to_seq[seq_id].seq
        motif_seqs_str = [
            seq_str[m.offset:m.offset + 16] for m in motif_entities
        ]
        generate_weblogo(seq_id, motif_seqs_str)

        motif_seq = [
            Seq(motif_seq_str, IUPAC.protein)
            for motif_seq_str in motif_seqs_str
        ]
        seq_ids_to_matrix[seq_id] = motifs.create(motif_seq, IUPAC.protein)

    rows = []
    for seq_id in sorted(seq_ids_to_matrix):
        matrix = seq_ids_to_matrix[seq_id]
        print(matrix.pwm[AMINO][POS])
        print(str.format("seq {}:\n{}", seq_id, matrix))
        full_seq = seq_ids_to_seq[seq_id].seq
        row = OrderedDict(seq_id=seq_id,
                          s_num=matrix.counts[AMINO][POS],
                          probability=matrix.pwm[AMINO][POS],
                          LRRs='\n'.join([
                              full_seq[m.offset:m.offset + 16]
                              for m in seq_ids_to_motifs[seq_id]
                          ]))
        rows.append(row)

    # The csv module requires newline='' on the file object; without it
    # extra blank rows appear on platforms that translate line endings.
    with open(OUTPUT_FILE, 'w', newline='') as f:
        c = csv.DictWriter(
            f, fieldnames=['seq_id', 's_num', 'probability', 'LRRs'])
        c.writeheader()
        c.writerows(rows)
コード例 #15
0
ファイル: motif_service.py プロジェクト: phytolrr/phytolrr
    def change_motif_false_discovery(version, sid, mid):
        """Set or clear the false-discovery mark of one motif.

        Validation failures are turned into an error response. The checks
        run in this order: argument parsing, sequence/motif consistency,
        manually-added motifs are refused, overlap check when the mark is
        being cleared, and finally versions below 2 are refused.

        Returns:
            ``response_ok`` with the rendered motif on success, otherwise a
            ``response_error``.
        """
        try:
            version = get_version_arg(version)
            seq = get_and_check_sid(sid)
            motif = get_and_check_motif_id(mid, version)

            if seq.seq_id != motif.seq_id:
                raise ValidationError(
                    "The seq_id {} in sequence is different with motif {}"
                    .format(seq.seq_id, motif.seq_id))
            if motif.manually_add:
                raise ValidationError(
                    ("The motif {} offset {} in sequence {} is manually added,"
                     " and should not be tagged wrong again. The motif can be delete directly")
                    .format(mid, motif.offset, motif.seq_id))

            false_discovery = _get_input_false_discovery()
            if not false_discovery:
                # Clearing the mark: this motif's offset, together with the
                # offsets of all motifs currently flagged correct, must not
                # overlap.
                siblings = dao.find_motifs_by_seq_ids([motif.seq_id],
                                                      version)[motif.seq_id]
                offsets = [m.offset for m in siblings if m.correct]
                offsets.append(motif.offset)
                if is_overlapped(offsets):
                    raise ValidationError(
                        "The motif {} can not be unmark from wrong because overlapping was found"
                        .format(mid))
            if version < 2:
                raise ValidationError(
                    "The version 1 is no longer support marking wrong")
            dao.update_false_discovery_by_motif(mid, false_discovery, version)

            # Re-read the motif so the response reflects the stored state.
            updated = dao.find_motif_by_mid(mid, version)
            if updated is None:
                return response_error(
                    "Internal error, can not find the motif {}".format(mid))
            rendered = motif_entity_output([updated], {})
            return response_ok(rendered[0])

        except ValidationError as e:
            return response_error(e.message)
コード例 #16
0
    def test_can_not_unmark_wrong_overlapped_with_manually_added_one(self):
        """A wrong-marked motif cannot be unmarked while a manual motif overlaps it."""
        # If a manually added motif overlaps a motif that was marked wrong,
        # the wrong-marked motif can no longer be unmarked. Two variants are
        # covered: the manual motif lies after, or before, the wrong one.
        # The two tests below mark SEQ2 motifs wrong and add manual motifs.
        self.test_add_manually_motif_after_wrong_area()
        self.test_add_manually_motif_before_wrong_area()

        # Pick the motifs at positions 1 and -2 of the offset-sorted list.
        # NOTE(review): presumably these are the two wrong-marked motifs next
        # to the manual ones; the original comment spoke of index -3, which
        # does not match the code - confirm.
        overlap_suffix = ' can not be unmark from wrong because overlapping was found'
        seq2_motifs = dao.find_motifs_by_seq_ids(['SEQ2'],
                                                 MOTIF_VERSION)['SEQ2']
        seq2_motifs.sort(key=lambda m: m.offset)
        first_motif = seq2_motifs[1]
        last_motif = seq2_motifs[-2]

        with dao.query_session() as session:
            all_seqs = dao.sequence.find_all_seqs(session)
            seq_by_id = {s.seq_id: s for s in all_seqs}
        seq2_sid = seq_by_id['SEQ2'].id

        # Unmarking either motif must fail with the overlap message.
        for motif in (first_motif, last_motif):
            result = unmark_wrong(seq2_sid, motif.id)
            self.assertIsNotNone(result)
            msg = result.get('message', '')
            self.assertTrue(msg.endswith(overlap_suffix), result)

        # Delete the manually added motifs.
        for manual in self._get_manually_motifs('SEQ2'):
            delete_motif(manual.id)

        # Now unmarking both motifs succeeds.
        for motif in (first_motif, last_motif):
            result = unmark_wrong(seq2_sid, motif.id)
            self.assertFalse(result['false_discovery'])
コード例 #17
0
ファイル: generate_figs.py プロジェクト: phytolrr/phytolrr
def step3_10(seq_ids_to_lrrs, seqs):
    """Run the step 3.10 weblogo generation and match statistics.

    Args:
        seq_ids_to_lrrs: Mapping of seq_id to motifs. It is updated in place
            with motifs for aligned sequences it does not contain yet.
        seqs: Sequence records; only those whose seq_id appears in one of the
            alignment files are used.
    """
    # seq_id -> (alignment file, tree file)
    seq_ids_to_file_path = {
        'AT1G55610.2': ("mafft/brl1_homo_10_ali.fasta",
                        "iqtree/BRL1_figtree.tree"),
        'AT4G39400.1': ("mafft/bri1_homo_100_align.fasta",
                        "iqtree/bri1_figtree.tree"),
        'AT5G46330.1': ("mafft/fls2_homo_10_ali.fasta",
                        "iqtree/FLS2_figtree.tree"),
        'AT4G28490.1': ("mafft/hae_top10_id_ali.fasta",
                        "iqtree/HAE_figtree.tree"),
        'AT1G73080.1': ("mafft/pepr1_top10_ali.fasta",
                        "iqtree/pepr1_figtree.tree"),
        'AT2G02220.1': ("mafft/pskr1_top10_ali.fasta",
                        "iqtree/PSKR1_figtree.tree"),
        'AT5G61480.1': ("mafft/PXY_top10_ali.fasta",
                        "iqtree/PXY_figtree.tree"),
        "AT4G26540.1": ("mafft/RGFR1_top10_ali.fasta",
                        "iqtree/RGFR1_figtree.tree"),
    }

    seq_ids_to_data = read_in_ali_seqs(seq_ids_to_file_path)
    aligned_ids = set()
    for data in seq_ids_to_data.values():
        aligned_ids.update(data.seq_ids_to_seq.keys())

    # Fetch motifs for aligned sequences the caller did not supply.
    missing_ids = aligned_ids - set(seq_ids_to_lrrs.keys())
    seq_ids_to_lrrs.update(
        dao.find_motifs_by_seq_ids(missing_ids,
                                   MOTIF_VERSION,
                                   with_wrong=False))

    aligned_seqs = {s.seq_id: s for s in seqs if s.seq_id in aligned_ids}

    generate_step3_10_weblogo_per_offset(seq_ids_to_data, seq_ids_to_lrrs)
    generate_step3_10_weblog_lrrs(seq_ids_to_data, aligned_seqs,
                                  seq_ids_to_lrrs)
    generate_step3_10_weblogo_by_types(seq_ids_to_data, aligned_seqs,
                                       seq_ids_to_lrrs)
    statitics_match_count(seq_ids_to_data, seq_ids_to_lrrs)
コード例 #18
0
ファイル: lrr-search.py プロジェクト: phytolrr/phytolrr
def main():
    """Build a PSSM from the baseline motifs, then scan all sequences in a pool."""
    # Step 1: build the matrix from the baseline sequences' motifs.
    baseline = dao.find_baseline_seqs()
    seq_str_by_id = {s.seq_id: s.seq for s in baseline}
    logging.info("Baseline sequence ids count({}): {}".format(
        len(seq_str_by_id), seq_str_by_id.keys()))

    baseline_motifs = dao.find_motifs_by_seq_ids(seq_str_by_id.keys(),
                                                 BASELINE_MOTIF_VERSION,
                                                 with_wrong=False)
    # Cut the 16-long window of every baseline motif out of its sequence.
    motif_strs = []
    for motif_list in baseline_motifs.values():
        for motif in motif_list:
            start = motif.offset
            motif_strs.append(seq_str_by_id[motif.seq_id][start:start + 16])
    logging.info("Baseline LRR motifs( count {}): {}".format(
        len(motif_strs), motif_strs))

    matrix = pssm_matrix.calc_pssm_matrix(motif_strs)
    logging.info("Matrix: {}".format(matrix))
    logging.info("PSSM: {}".format(matrix.pssm))

    # Step 2: search every sequence with that matrix, 12 worker processes.
    with dao.query_session() as session:
        everything = dao.sequence.find_all_seqs(session)

    tasks = []
    for seq in everything:
        tasks.append(CalculateTask(matrix, seq))

    with Pool(12) as pool:
        pool.map(find_lrr_and_save_db_task, tasks)
    logging.info("All tasks done")
コード例 #19
0
 def test_add_motif(self):
     """Two manually added SEQ1 motifs show up with their offset and score."""
     # Adding motifs to the sequence succeeds (the DAO call returns None).
     for offset, score, probability in ((400, 10.0, 0.1), (420, 20.0, 0.2)):
         self.assertIsNone(
             dao.add_manually_motif('SEQ1', offset, MOTIF_VERSION, score,
                                    probability))

     motifs_by_seq = dao.find_motifs_by_seq_ids(['SEQ1', 'SEQ2'],
                                                MOTIF_VERSION)
     self.assertEqual(2, len(motifs_by_seq))
     self.assertEqual({'SEQ1', 'SEQ2'}, motifs_by_seq.keys())
     seq1_motifs = motifs_by_seq['SEQ1']
     self.assertEqual(12, len(seq1_motifs))
     self.assertEqual(10, len(motifs_by_seq['SEQ2']))

     by_offset = {m.offset: m for m in seq1_motifs}
     # Each added motif must be present with a score inside (low, high).
     for offset, low, high in ((400, 9.999, 10.001), (420, 19.999, 20.001)):
         added = by_offset.get(offset, None)
         self.assertIsNotNone(added)
         self.assertEqual(offset, added.offset)
         self.assertTrue(high > added.score)
         self.assertTrue(low < added.score)
コード例 #20
0
    def test_add_tags_by_names_to_ids_two_mark_on_one_motif(self):
        """Two different wrong-tags on one motif still count it as wrong only once."""

        def all_seq2_motifs():
            # SEQ2 motifs via the query that includes wrong ones.
            by_seq = dao.find_motifs_by_seq_ids(['SEQ2'], MOTIF_VERSION)
            self.assertTrue('SEQ2' in by_seq)
            return by_seq['SEQ2']

        def correct_seq2_count():
            return len(
                dao.find_correct_motifs_by_seq_ids(['SEQ2'],
                                                   MOTIF_VERSION)['SEQ2'])

        def assert_first_motif_not_correct():
            # Re-read motif 0 by id and check that its correct flag is off.
            with query_session() as session:
                reloaded = dao.motif.find_motifs_by_ids(
                    session, [motifs[0].id], MOTIF_VERSION)
            self.assertEqual(1, len(reloaded))
            self.assertFalse(reloaded[0].correct)

        # Before tagging, SEQ2 has 10 motifs.
        motifs = all_seq2_motifs()
        self.assertEqual(10, len(motifs))

        # Tag motif 0 with the overlap tag.
        dao.add_tags_by_names_to_ids({'inner.overlap': [motifs[0].id]},
                                     MOTIF_VERSION)
        assert_first_motif_not_correct()
        # Including wrong motifs, SEQ2 still has 10.
        self.assertEqual(10, len(all_seq2_motifs()))
        # Only 9 of them count as correct.
        self.assertEqual(9, correct_seq2_count())

        # Additionally tag motif 0 as a manual false discovery.
        dao.add_tags_by_names_to_ids({'inner.falsediscovery': [motifs[0].id]},
                                     MOTIF_VERSION)
        assert_first_motif_not_correct()
        # Including wrong motifs, SEQ2 still has 10.
        self.assertEqual(10, len(all_seq2_motifs()))
        # The correct-only query has an entry for SEQ2 with 9 motifs.
        correct_by_seq = dao.find_correct_motifs_by_seq_ids(['SEQ2'],
                                                            MOTIF_VERSION)
        self.assertTrue('SEQ2' in correct_by_seq)
        self.assertEqual(9, len(correct_by_seq['SEQ2']))
        # And the count of correct SEQ2 motifs remains 9.
        self.assertEqual(9, correct_seq2_count())
コード例 #21
0
    def test_add_manually_motif_invalid_input(self):
        """Each kind of invalid add-motif request yields its specific error message."""

        def submit(offset, sid):
            # Issue the request with a faked JSON body and decode the reply.
            with boddle(json={"offset": offset}):
                raw = lrr_search_web_service.add_manually_motif(
                    MOTIF_VERSION, sid)
            self.assertIsNotNone(raw)
            return json.loads(raw)

        # sid is not a number.
        self.assertDictEqual(
            {'message': str.format(ErrorCode.INVALID_PARA, "sid", 'a')},
            submit(10, "a"))

        # sid does not exist.
        self.assertDictEqual(
            {'message': str.format(ErrorCode.OBJECT_NOT_EXISTS, 10240)},
            submit(10, 10240))

        with dao.query_session() as session:
            all_seqs = dao.sequence.find_all_seqs(session)
            seq_by_id = {s.seq_id: s for s in all_seqs}
        seq1_sid = seq_by_id['SEQ1'].id
        seq2_sid = seq_by_id['SEQ2'].id

        # offset is not a number.
        self.assertDictEqual(
            {'message': str.format(ErrorCode.INVALID_PARA, "offset", 'b')},
            submit("b", seq1_sid))

        # offset overlaps an existing motif (SEQ1).
        self.assertDictEqual({'message': str.format(ErrorCode.OFFSET_OVERLAP)},
                             submit(195, seq1_sid))

        # offset overlaps an existing motif (SEQ2).
        self.assertDictEqual({'message': str.format(ErrorCode.OFFSET_OVERLAP)},
                             submit(185, seq2_sid))

        # Adding on an offset that is already marked wrong is refused,
        # because removing the wrong mark is enough in that case.
        seq1_motifs = dao.find_motifs_by_seq_ids(['SEQ1'],
                                                 MOTIF_VERSION)['SEQ1']
        second_motif = seq1_motifs[1]
        mark_wrong(seq1_sid, second_motif.id)
        self.assertDictEqual({'message': ErrorCode.OFFSET_EXISTS_WRONG},
                             submit(second_motif.offset, seq1_sid))
コード例 #22
0
    def unused_test_replace_tags_by_motifs_false_discovery(self):
        """Check how ``dao.replace_tags_by_motifs`` affects SEQ2's motif counts.

        Tags motif 0 with ``inner.falsediscovery`` and motif 5 with
        ``inner.overlap``, then replaces motif 5's tags with an unrelated
        one.  After each step the unfiltered lookup must still return all 10
        motifs while the correct-only lookup must return 9, 8 and 9 motifs
        respectively.

        NOTE(review): the ``unused_`` prefix presumably keeps test loaders
        that collect ``test*`` methods from running this -- confirm before
        re-enabling.
        """
        # Before any tagging, SEQ2 has 10 motifs.
        seq_ids_to_motifs = dao.find_motifs_by_seq_ids(['SEQ2'], MOTIF_VERSION)
        self.assertIn('SEQ2', seq_ids_to_motifs)
        motifs = seq_ids_to_motifs['SEQ2']
        self.assertEqual(10, len(motifs))

        # Tag motif 0 as a false discovery.
        dao.replace_tags_by_motifs(
            {motifs[0].id: ['a', 'inner.falsediscovery']}, MOTIF_VERSION)
        # Checkpoint: re-reading motif 0 shows correct == False.
        with query_session() as session:
            motif_0_in = dao.motif.find_motifs_by_ids(session, [motifs[0].id],
                                                      MOTIF_VERSION)
        self.assertEqual(1, len(motif_0_in))
        self.assertFalse(motif_0_in[0].correct)
        # Checkpoint: the unfiltered lookup still returns 10 motifs.
        seq_ids_to_motifs = dao.find_motifs_by_seq_ids(['SEQ2'], MOTIF_VERSION)
        self.assertIn('SEQ2', seq_ids_to_motifs)
        motifs_after_tag = seq_ids_to_motifs['SEQ2']
        self.assertEqual(10, len(motifs_after_tag))
        # Checkpoint: only 9 motifs of SEQ2 are reported as correct.
        correct_motifs = dao.find_correct_motifs_by_seq_ids(['SEQ2'],
                                                            MOTIF_VERSION)
        self.assertEqual(9, len(correct_motifs['SEQ2']))

        # Tag motif 5 as overlapping; SEQ2 is left with 8 correct motifs.
        dao.replace_tags_by_motifs({motifs[5].id: ['a', 'inner.overlap']},
                                   MOTIF_VERSION)
        # Checkpoint: the correct-only lookup returns 8 motifs.
        seq_ids_to_motifs = dao.find_correct_motifs_by_seq_ids(['SEQ2'],
                                                               MOTIF_VERSION)
        self.assertIn('SEQ2', seq_ids_to_motifs)
        motifs_after_tag = seq_ids_to_motifs['SEQ2']
        self.assertEqual(8, len(motifs_after_tag))

        # Replace motif 5's tags with an unrelated one; SEQ2 is back to 9
        # correct motifs.
        dao.replace_tags_by_motifs({motifs[5].id: ['a']}, MOTIF_VERSION)
        # Re-read motif 0 so the assertions below look at its state after the
        # latest tag change rather than at the result fetched earlier: it must
        # still be flagged as not correct.
        with query_session() as session:
            motif_0_in = dao.motif.find_motifs_by_ids(session, [motifs[0].id],
                                                      MOTIF_VERSION)
        self.assertEqual(1, len(motif_0_in))
        self.assertFalse(motif_0_in[0].correct)
        # Checkpoint: the unfiltered lookup still returns 10 motifs.
        seq_ids_to_motifs = dao.find_motifs_by_seq_ids(['SEQ2'], MOTIF_VERSION)
        self.assertIn('SEQ2', seq_ids_to_motifs)
        motifs_after_tag = seq_ids_to_motifs['SEQ2']
        self.assertEqual(10, len(motifs_after_tag))
        # Checkpoint: 9 motifs of SEQ2 are reported as correct again.
        correct_motifs = dao.find_correct_motifs_by_seq_ids(['SEQ2'],
                                                            MOTIF_VERSION)
        self.assertEqual(9, len(correct_motifs['SEQ2']))
コード例 #23
0
    def test_tag_false_discovery(self):
        """Marking a motif wrong clears ``correct`` and lowers the correct count.

        After ``mark_wrong`` is applied to the first motif of SEQ1 and SEQ2,
        each of those sequences must report one fewer correct motif and the
        marked motif must have ``correct`` set to False.  SEQ3 is never
        touched and must stay unchanged; the unfiltered motif totals must not
        change for any sequence.
        """
        all_seq_ids = ('SEQ1', 'SEQ2', 'SEQ3')

        def check_correct_counts(*expected_counts):
            # One correct-only lookup per sequence, in SEQ1..SEQ3 order.
            for name, expected in zip(all_seq_ids, expected_counts):
                correct_only = dao.find_correct_motifs_by_seq_ids(
                    [name], MOTIF_VERSION)
                self.assertEqual(expected, len(correct_only[name]))

        def fetch_all_motifs():
            # Unfiltered lookup; every sequence must be present in the result.
            found = dao.find_motifs_by_seq_ids(set(all_seq_ids),
                                               MOTIF_VERSION)
            self.assertEqual(set(all_seq_ids), set(found.keys()))
            return found

        with dao.query_session() as session:
            seq_by_name = {
                seq.seq_id: seq
                for seq in dao.sequence.find_all_seqs(session)
            }

        # 1. Before tagging: correct counts are 10/20/30 and the first motif
        #    of every sequence is flagged correct.
        check_correct_counts(10, 20, 30)
        motifs_by_name = fetch_all_motifs()
        for name in all_seq_ids:
            self.assertTrue(motifs_by_name[name][0].correct)

        # 2. Mark the first motif of SEQ1 and SEQ2 as a false discovery.
        for name in ('SEQ1', 'SEQ2'):
            response = mark_wrong(seq_by_name[name].id,
                                  motifs_by_name[name][0].id)
            self.assertIsNotNone(response)
            self.assertTrue(json.loads(response)['false_discovery'])

        # 3. Afterwards: SEQ1/SEQ2 lose one correct motif each and their
        #    first motif is no longer correct; SEQ3 is unaffected.
        check_correct_counts(9, 19, 30)
        motifs_by_name = fetch_all_motifs()
        self.assertFalse(motifs_by_name['SEQ1'][0].correct)
        self.assertFalse(motifs_by_name['SEQ2'][0].correct)
        self.assertTrue(motifs_by_name['SEQ3'][0].correct)
        # The unfiltered totals are unchanged by the marking.
        for name, total in zip(all_seq_ids, (10, 20, 30)):
            self.assertEqual(total, len(motifs_by_name[name]))
コード例 #24
0
def _get_motifs_by_seq_id(seq_id):
    """Return the motifs ``dao.find_motifs_by_seq_ids`` reports for ``seq_id``.

    The lookup is done for ``MOTIF_VERSION``; an empty list is returned when
    the result has no entry for ``seq_id``.
    """
    return dao.find_motifs_by_seq_ids([seq_id], MOTIF_VERSION).get(seq_id, [])