def test_delete_auto_motif_failed(self):
    """Delete two automatically-found motifs of SEQ1 and check a response is returned.

    NOTE(review): the name suggests deletion should *fail* for auto motifs,
    but only non-None-ness of the result is asserted — confirm that
    ``delete_motif`` returns an error payload here.
    """
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ1',
    ], MOTIF_VERSION)
    result = delete_motif(seq_ids_to_motifs['SEQ1'][0].id)
    self.assertIsNotNone(result)
    # Re-query so the second target id reflects the state after the first call.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ1',
    ], MOTIF_VERSION)
    result = delete_motif(seq_ids_to_motifs['SEQ1'][4].id)
    self.assertIsNotNone(result)
def find_lrr_and_save_db(matrix, seq):
    """Find LRR motifs in *seq* using PSSM *matrix* and persist them to the db.

    Skips sequences already analysed under MOTIF_VERSION and sequences
    containing letters outside VALID_AMINO; in both cases nothing is written.
    """
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([seq.seq_id], MOTIF_VERSION)
    if len(seq_ids_to_motifs.get(seq.seq_id, [])) > 0:
        # Fixed typo in the log message ("analysied" -> "analysed").
        logging.warning(
            str.format("The sequence {} is already analysed before, skip it",
                       seq.seq_id))
        return
    seq_str = str(seq.seq)
    invalid_amino = set(seq_str) - VALID_AMINO
    if len(invalid_amino) > 0:
        logging.error(
            str.format("The sequence {} contains invalid amino {}",
                       seq.seq_id, invalid_amino))
        return
    logging.info(
        str.format("Begin to find lrr in seq {}, {}", seq.seq_id, seq_str))
    # Reuse seq_str instead of re-stringifying seq.seq a second time.
    motifs = motif_tool.lrr_search(matrix, seq_str)
    motifs = motif_tool.found_no_overlapped_motifs(motifs)
    for m in motifs:
        m.seq_id = seq.seq_id
        m.correct = True
    logging.info(
        str.format("Found {} LRR motifs in sequence {}", len(motifs),
                   seq.seq_id))
    _print_debug(motifs, seq)
    # Write to db
    motif_entities = [
        dao.motif_entity.MotifEntityBase(**m.__dict__) for m in motifs
    ]
    logging.debug("Begin to write to db")
    with dao.session_scope() as session:
        dao.motif.replace_motifs_by_seq(session, seq.seq_id, motif_entities,
                                        MOTIF_VERSION)
def get_and_check_offset(seq, version):
    """Validate and return the 'offset' field from the request JSON body.

    Raises ValidationError when the offset is missing, not an integer,
    negative, leaves no room for a 16-residue motif, overlaps the
    15-residue neighbourhood of a correct motif, or coincides with a
    motif already marked wrong.
    """
    payload = request.json
    if payload is None:
        raise ValidationError(str.format(ErrorCode.PARA_NOT_EXISTS, "offset"))
    offset = payload.get('offset', None)
    if offset is None:
        raise ValidationError(str.format(ErrorCode.PARA_NOT_EXISTS, "offset"))
    offset = get_and_check_int(
        offset, str.format(ErrorCode.INVALID_PARA, 'offset', offset))
    # Range check: non-negative and a full 16-residue motif must fit.
    if offset < 0 or offset + 16 > len(seq.seq):
        raise ValidationError(
            str.format(ErrorCode.INVALID_PARA, 'offset', offset))
    existing = dao.find_motifs_by_seq_ids([seq.seq_id], version)
    blocked_positions = set()
    wrong_offsets = set()
    for motif in existing.get(seq.seq_id, []):
        if motif.correct:
            blocked_positions.update(range(motif.offset - 15,
                                           motif.offset + 16))
        else:
            wrong_offsets.add(motif.offset)
    if offset in blocked_positions:
        raise ValidationError(ErrorCode.OFFSET_OVERLAP)
    if offset in wrong_offsets:
        raise ValidationError(ErrorCode.OFFSET_EXISTS_WRONG)
    return offset
def test_add_manually_motif_before_wrong_area(self):
    """Adding a motif just before a wrong-marked one: fails while the
    original is correct, succeeds after it is marked wrong."""
    # Fetch all sequences first.
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
        seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    sid = seq_ids_to_seq['SEQ2'].id
    ms = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)['SEQ2']
    ms.sort(key=lambda m: m.offset)
    motif = ms[0]
    # Overlap (new offset lies before the existing motif) — cannot add.
    with boddle(json={"offset": 185}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, sid)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual(
            {'message': str.format(ErrorCode.OFFSET_OVERLAP)}, result)
    # Mark the overlapping motif wrong.
    mark_wrong(sid, motif.id)
    # Adding again now succeeds.
    with boddle(json={"offset": 185}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, sid)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertTrue("motifs_16" in result)
        self.assertEqual(185, result['motifs_16'][0]['offset'])
def get_sequences(version):
    """Return a filtered, paged listing of sequences with motifs, N-sites and tags."""
    filters = {}
    filters['page_index'], filters['page_size'] = _get_page_arg()
    filters['keyword'] = _get_keyword_arg()
    _get_offset_arg(filters)
    _get_lrr_count_arg(filters)
    _get_species_arg(filters)
    # Convert the page number into a row offset, in place.
    filters['page_index'] = filters['page_index'] * filters['page_size']
    logging.debug(str.format("Filters: {}", filters))
    seqs, total = dao.query_sequences(filters, get_version_arg(version))
    # Motifs and N-sites for every sequence on this page.
    seq_ids = [s.seq_id for s in seqs]
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(seq_ids,
                                                   get_version_arg(version))
    seq_ids_to_nsites = dao.find_nsites_by_seq_ids(seq_ids)
    # Tags: the overlap mark only exists on version-1 motifs.
    if get_version_arg(version) == 1:
        motif_ids = {
            m.id
            for motif_list in seq_ids_to_motifs.values() for m in motif_list
        }
        ids_to_tag_names = dao.find_tags_by_motif_ids(motif_ids)
    else:
        ids_to_tag_names = {}
    sequences = [
        sequence_entity_to_output(s, seq_ids_to_motifs.get(s.seq_id, []),
                                  seq_ids_to_nsites.get(s.seq_id, []),
                                  ids_to_tag_names) for s in seqs
    ]
    return response_ok({'sequences': sequences, 'total': total}, True)
def test_add_manually_motif_after_wrong_area(self): set_up_baseline_seq('SEQ4', '', start=10, count=10, step=24) # 先把seq都查出来 with dao.query_session() as session: seqs = dao.sequence.find_all_seqs(session) seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs]) sid = seq_ids_to_seq['SEQ2'].id ms = dao.find_motifs_by_seq_ids([ 'SEQ2', ], MOTIF_VERSION)['SEQ2'] ms.sort(key=lambda m: m.offset) motif = ms[-1] # 重叠(新增的offset在前面面)时无法添加 with boddle(json={"offset": 595}): result = lrr_search_web_service.add_manually_motif( MOTIF_VERSION, sid) self.assertIsNotNone(result) result = json.loads(result) self.assertDictEqual({'message': str.format(ErrorCode.OFFSET_OVERLAP)}, result) # 标记错误 mark_wrong(sid, motif.id) # 再次添加OK with boddle(json={"offset": 595}): result = lrr_search_web_service.add_manually_motif( MOTIF_VERSION, sid) self.assertIsNotNone(result) result = json.loads(result) self.assertEqual(1, len(result.get('motifs_16', []))) self.assertEqual(595, result['motifs_16'][0].get('offset', -1))
def test_tag_multiple_times(self):
    """Marking the same motif wrong twice has the same effect as marking once."""
    # Run the tagging case first so the tags are in place.
    self.test_tag_false_discovery()
    with dao.query_session() as session:
        seq_ids_to_seq = dict([
            (seq.seq_id, seq) for seq in dao.sequence.find_all_seqs(session)
        ])
    # Tag a second time.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        {'SEQ1', 'SEQ2', 'SEQ3'}, MOTIF_VERSION)
    mark_wrong(seq_ids_to_seq['SEQ1'].id, seq_ids_to_motifs['SEQ1'][0].id)
    mark_wrong(seq_ids_to_seq['SEQ2'].id, seq_ids_to_motifs['SEQ2'][0].id)
    # The effect is identical to tagging once.
    self.assertEqual(
        9,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ1',
            ], MOTIF_VERSION)['SEQ1']))
    self.assertEqual(
        19,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
    self.assertEqual(
        30,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ3',
            ], MOTIF_VERSION)['SEQ3']))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        {'SEQ1', 'SEQ2', 'SEQ3'}, MOTIF_VERSION)
    self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'},
                     set(seq_ids_to_motifs.keys()))
    self.assertFalse(seq_ids_to_motifs['SEQ1'][0].correct)
    self.assertFalse(seq_ids_to_motifs['SEQ2'][0].correct)
    self.assertTrue(seq_ids_to_motifs['SEQ3'][0].correct)
    self.assertEqual(10, len(seq_ids_to_motifs['SEQ1']))
    self.assertEqual(20, len(seq_ids_to_motifs['SEQ2']))
    self.assertEqual(30, len(seq_ids_to_motifs['SEQ3']))
def main():
    """Migrate wrong-marks and manual motifs from the old version to NEW_VERSION.

    Phase 1 reports (dry run) motifs wrong in the old version that still
    exist in the new one; phase 2 re-adds motifs that were correct in the
    old version but are missing from the new one, skipping overlaps.
    """
    motifs = get_old_wrong_new_exists()
    print(
        str.format("Get wrong in old and still exists in new, Count {}, {}",
                   len(motifs), motifs))
    for old_m, new_m in motifs:
        print(
            str.format(
                "Mark mid {}, seq_id {}, offset {} as false_discovery in version {}",
                new_m.id, new_m.seq_id, new_m.offset, NEW_VERSION))
        # NOTE: actual update intentionally disabled (dry run) — uncomment to apply:
        #dao.update_false_discovery_by_motif(new_m.id, True, NEW_VERSION)
    matrix = pssm_matrix.calc_pssm_matrix(dao.find_baseline_motifs())
    motifs = get_old_correct_new_not_exists()
    with dao.query_session() as session:
        all_seq_ids = set([m.seq_id for m in motifs])
        seqs = dao.sequence.find_seq_by_ids(session, all_seq_ids)
        seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
        # Every migrated motif's sequence must be resolvable.
        assert (len(seq_ids_to_seq) == len(all_seq_ids))
    print(
        str.format("Get correct in old and not exists in new, Count {}, {}",
                   len(motifs), motifs))
    for m in motifs:
        # Motifs are fixed 16-residue windows into the sequence.
        motif_seq = seq_ids_to_seq[m.seq_id].seq[m.offset:m.offset + 16]
        assert (len(motif_seq) == 16)
        score = motif_tools.calc_pssm_score(motif_seq, matrix)
        probability = motif_tools.calc_probability_by_score(score)
        new_m = dao.motif.MotifEntityBase(m.offset, m.seq_id, score,
                                          probability, 0, manually_add=True)
        # Only add when the candidate does not overlap existing new-version motifs.
        new_motifs = dao.find_motifs_by_seq_ids([
            m.seq_id,
        ], NEW_VERSION, with_wrong=False)[m.seq_id]
        new_motifs.append(new_m)
        new_motifs_no_overlap = motif_tools.found_no_overlapped_motifs(
            new_motifs, 16)
        if len(new_motifs) != len(new_motifs_no_overlap):
            logging.error(
                str.format("Overlap found in seq {}, {}/{}, new offset {}",
                           m.seq_id, len(new_motifs),
                           len(new_motifs_no_overlap), new_m.offset))
        else:
            logging.debug(
                str.format(
                    "Add motif offset {}, score {}, probability {} to seq {} manually",
                    new_m.offset, new_m.score, new_m.probability,
                    new_m.seq_id))
            dao.add_manually_motif(new_m.seq_id, new_m.offset, NEW_VERSION,
                                   new_m.score, new_m.probability)
def _get_manually_motifs(self, seq_id):
    """Return the manually-added motifs of *seq_id*, asserting at least one exists."""
    found = dao.find_motifs_by_seq_ids([
        seq_id,
    ], MOTIF_VERSION)
    self.assertTrue(seq_id in found)
    manually_motifs = [m for m in found[seq_id] if m.manually_add]
    self.assertTrue(len(manually_motifs) > 0)
    return manually_motifs
def test_untag_false_discovery(self):
    """Unmarking a wrong-tagged motif restores the correct-motif count."""
    # Run the tagging case first so the tags are in place.
    self.test_tag_false_discovery()
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
        seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    # Remove the tag from SEQ1's first motif.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        {'SEQ1', 'SEQ2', 'SEQ3'}, MOTIF_VERSION)
    unmark_wrong(seq_ids_to_seq['SEQ1'].id,
                 seq_ids_to_motifs['SEQ1'][0].id)
    # SEQ1's count is restored; SEQ2 stays reduced, SEQ3 untouched.
    self.assertEqual(
        10,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ1',
            ], MOTIF_VERSION)['SEQ1']))
    self.assertEqual(
        19,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
    self.assertEqual(
        30,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ3',
            ], MOTIF_VERSION)['SEQ3']))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        {'SEQ1', 'SEQ2', 'SEQ3'}, MOTIF_VERSION)
    self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'},
                     set(seq_ids_to_motifs.keys()))
    self.assertTrue(seq_ids_to_motifs['SEQ1'][0].correct)
    self.assertFalse(seq_ids_to_motifs['SEQ2'][0].correct)
    self.assertTrue(seq_ids_to_motifs['SEQ3'][0].correct)
def setUp(self):
    """Create a fresh db with SEQ1/SEQ2/SEQ3 (SEQ2: 20 motifs from 10, SEQ3: 30 from 50)."""
    set_up_db()
    set_up_seq('SEQ2', 'SEQ2ABCDABCD', start=10, count=20)
    set_up_seq('SEQ1', 'SEQ1ABCDABCD')
    set_up_seq('SEQ3', 'SEQ3ABCDABCD', start=50, count=30)
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
        self.seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    self.assertEqual(3, len(self.seq_ids_to_seq))
    self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'},
                     set(self.seq_ids_to_seq.keys()))
    # NOTE(review): the result below is unused — presumably a smoke query
    # of the motif store; confirm before removing.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        self.seq_ids_to_seq.keys(), MOTIF_VERSION)
def main():
    """Run the analysis pipeline: step 1 on all sequences, steps 3.5-3.11 on
    the relevant subgroups, then a full-set supplement for review."""
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
    step1(seqs)
    logging.info("Step 1 end")
    # since only five SGs were considered from step 3.4, the seq was redefined
    relevant_seqs = [seq for seq in seqs if seq.subgroup in SUBGROUPS]
    relevant_seq_ids = [seq.seq_id for seq in relevant_seqs]
    seq_ids_to_nsites = dao.find_nsites_by_seq_ids(relevant_seq_ids)
    # Only correct motifs (with_wrong=False) feed the downstream steps.
    seq_ids_to_lrrs = dao.find_motifs_by_seq_ids(relevant_seq_ids,
                                                 MOTIF_VERSION,
                                                 with_wrong=False)
    step3_5(relevant_seqs, seq_ids_to_nsites, seq_ids_to_lrrs)
    logging.info("Step 3.5 end")
    step3_7(relevant_seqs, seq_ids_to_nsites, seq_ids_to_lrrs)
    logging.info("Step 3.7 end")
    step3_9(relevant_seqs, seq_ids_to_nsites, seq_ids_to_lrrs)
    logging.info("Step 3.9 end")
    step3_10(seq_ids_to_lrrs, seqs)
    logging.info("Step 3.10 end")
    step3_11(seqs, seq_ids_to_lrrs)
    logging.info("Step 3.11 end")
    # The review supplement covers the full sequence set, not just SUBGROUPS.
    full_seq_ids = [seq.seq_id for seq in seqs]
    full_seq_ids_to_nsites = dao.find_nsites_by_seq_ids(full_seq_ids)
    full_seq_ids_to_lrrs = dao.find_motifs_by_seq_ids(full_seq_ids,
                                                      MOTIF_VERSION,
                                                      with_wrong=False)
    supplement_for_review(seqs, full_seq_ids_to_nsites, full_seq_ids_to_lrrs)
    logging.info("Step supplement for review end")
def main():
    """Run the analysis pipeline: step 1 on all sequences, steps 3.5-3.11 on
    the relevant subgroups, then a full-set supplement for review."""
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
    step1(seqs)
    logging.info("Step 1 end")
    # From step 3.4 on only five subgroups matter, so the sequence set is narrowed.
    relevant_seqs = [seq for seq in seqs if seq.subgroup in SUBGROUPS]
    relevant_seq_ids = [seq.seq_id for seq in relevant_seqs]
    seq_ids_to_nsites = dao.find_nsites_by_seq_ids(relevant_seq_ids)
    # Only correct motifs (with_wrong=False) feed the downstream steps.
    seq_ids_to_lrrs = dao.find_motifs_by_seq_ids(relevant_seq_ids,
                                                 MOTIF_VERSION,
                                                 with_wrong=False)
    step3_5(relevant_seqs, seq_ids_to_nsites, seq_ids_to_lrrs)
    logging.info("Step 3.5 end")
    step3_7(relevant_seqs, seq_ids_to_nsites, seq_ids_to_lrrs)
    logging.info("Step 3.7 end")
    step3_9(relevant_seqs, seq_ids_to_nsites, seq_ids_to_lrrs)
    logging.info("Step 3.9 end")
    step3_10(seq_ids_to_lrrs, seqs)
    logging.info("Step 3.10 end")
    step3_11(seqs, seq_ids_to_lrrs)
    logging.info("Step 3.11 end")
    # The review supplement covers the full sequence set, not just SUBGROUPS.
    full_seq_ids = [seq.seq_id for seq in seqs]
    full_seq_ids_to_nsites = dao.find_nsites_by_seq_ids(full_seq_ids)
    full_seq_ids_to_lrrs = dao.find_motifs_by_seq_ids(full_seq_ids,
                                                      MOTIF_VERSION,
                                                      with_wrong=False)
    supplement_for_review(seqs, full_seq_ids_to_nsites, full_seq_ids_to_lrrs)
    logging.info("Step supplement for review end")
def main():
    """Generate a per-sequence weblogo and a CSV of PWM statistics at (AMINO, POS)."""
    seqs = find_seqs()
    seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    # BUG FIX: logging.* interpolates %-style lazily; the original passed a
    # "{}" template with extra args, which never interpolated and produced a
    # logging error at emit time. Use the file's str.format convention.
    logging.info(
        str.format("Seqs count {}, seq_ids {}", len(seqs),
                   seq_ids_to_seq.keys()))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(seq_ids_to_seq.keys(),
                                                   MOTIF_VERSION,
                                                   with_wrong=False)
    seq_ids_to_matrix = {}
    for seq_id, motif_entities in seq_ids_to_motifs.items():
        motif_entities.sort(key=lambda m: m.offset)
        seq_str = seq_ids_to_seq[seq_id].seq
        # Motifs are fixed 16-residue windows into the sequence.
        motif_seqs_str = [
            seq_str[m.offset:m.offset + 16] for m in motif_entities
        ]
        generate_weblogo(seq_id, motif_seqs_str)
        motif_seq = [
            Seq(motif_seq_str, IUPAC.protein)
            for motif_seq_str in motif_seqs_str
        ]
        matrix = motifs.create(motif_seq, IUPAC.protein)
        seq_ids_to_matrix[seq_id] = matrix
    seq_ids = list(seq_ids_to_matrix.keys())
    seq_ids.sort()
    rows = []
    for seq_id in seq_ids:
        matrix = seq_ids_to_matrix.get(seq_id)
        print(matrix.pwm[AMINO][POS])
        print(str.format("seq {}:\n{}", seq_id, matrix))
        row = OrderedDict(seq_id=seq_id,
                          s_num=matrix.counts[AMINO][POS],
                          probability=matrix.pwm[AMINO][POS],
                          LRRs='\n'.join([
                              seq_ids_to_seq[seq_id].seq[m.offset:m.offset +
                                                         16]
                              for m in seq_ids_to_motifs[seq_id]
                          ]))
        rows.append(row)
    with open(OUTPUT_FILE, 'w') as f:
        c = csv.DictWriter(
            f, fieldnames=['seq_id', 's_num', 'probability', 'LRRs'])
        c.writeheader()
        c.writerows(rows)
def change_motif_false_discovery(version, sid, mid):
    """Mark or unmark motif *mid* of sequence *sid* as a false discovery.

    Returns the updated motif output on success, or an error response for
    any validation failure (mismatched seq, manual motif, overlap on
    unmark, unsupported version, or missing motif after update).
    """
    try:
        version = get_version_arg(version)
        seq = get_and_check_sid(sid)
        motif = get_and_check_motif_id(mid, version)
        if seq.seq_id != motif.seq_id:
            raise ValidationError(
                str.format(
                    "The seq_id {} in sequence is different with motif {}",
                    seq.seq_id, motif.seq_id))
        # Manually added motifs are deleted outright, never marked wrong.
        if motif.manually_add:
            raise ValidationError(
                str.format(
                    "The motif {} offset {} in sequence {} is manually added,"
                    " and should not be tagged wrong again. The motif can be delete directly",
                    mid, motif.offset, motif.seq_id))
        false_discovery = _get_input_false_discovery()
        if not false_discovery:
            # Unmarking: refuse when restoring this motif would overlap a
            # currently-correct motif of the same sequence.
            seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
                motif.seq_id,
            ], version)
            if is_overlapped([
                    m.offset for m in seq_ids_to_motifs[motif.seq_id]
                    if m.correct
            ] + [motif.offset]):
                raise ValidationError(
                    str.format(
                        "The motif {} can not be unmark from wrong because overlapping was found",
                        mid))
        # NOTE(review): this guard rejects version 1 for both mark and
        # unmark, and runs after the overlap check — confirm intentional.
        if version < 2:
            raise ValidationError(
                "The version 1 is no longer support marking wrong")
        dao.update_false_discovery_by_motif(mid, false_discovery, version)
        motif_entity = dao.find_motif_by_mid(mid, version)
        if motif_entity is None:
            return response_error(
                str.format("Internal error, can not find the motif {}", mid))
        motif_output = motif_entity_output([motif_entity], {})
        return response_ok(motif_output[0])
    except ValidationError as e:
        return response_error(e.message)
def test_can_not_unmark_wrong_overlapped_with_manually_added_one(self):
    """A wrong-marked motif overlapped by a manually added one cannot be unmarked.

    Two sub-cases are covered: the manual offset lies after the wrong
    motif, and before it. After deleting the manual motifs, unmarking
    succeeds again.
    """
    # Mark SEQ2's first and last motifs wrong, then add manual motifs over them.
    self.test_add_manually_motif_after_wrong_area()
    self.test_add_manually_motif_before_wrong_area()
    # Unmarking the first and last originals must fail.  Two manual motifs
    # were added above, so the originals now sit at index 1 and -2.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    seq_ids_to_motifs['SEQ2'].sort(key=lambda m: m.offset)
    first_motif = seq_ids_to_motifs['SEQ2'][1]
    last_motif = seq_ids_to_motifs['SEQ2'][-2]
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
        seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    result = unmark_wrong(seq_ids_to_seq['SEQ2'].id, first_motif.id)
    self.assertIsNotNone(result)
    msg = result.get('message', '')
    self.assertTrue(
        msg.endswith(
            ' can not be unmark from wrong because overlapping was found'),
        result)
    result = unmark_wrong(seq_ids_to_seq['SEQ2'].id, last_motif.id)
    self.assertIsNotNone(result)
    msg = result.get('message', '')
    self.assertTrue(
        msg.endswith(
            ' can not be unmark from wrong because overlapping was found'),
        result)
    # Delete the manually added motifs.
    for m in self._get_manually_motifs('SEQ2'):
        delete_motif(m.id)
    # Now unmarking the first and last motifs succeeds.
    result = unmark_wrong(seq_ids_to_seq['SEQ2'].id, first_motif.id)
    self.assertFalse(result['false_discovery'])
    result = unmark_wrong(seq_ids_to_seq['SEQ2'].id, last_motif.id)
    self.assertFalse(result['false_discovery'])
def step3_10(seq_ids_to_lrrs, seqs):
    """Step 3.10: weblogos and match statistics over aligned homolog sets
    of eight receptor sequences."""
    # Each receptor seq_id maps to (mafft alignment fasta, iqtree figtree file).
    seq_ids_to_file_path = {
        'AT1G55610.2':
        ("mafft/brl1_homo_10_ali.fasta", "iqtree/BRL1_figtree.tree"),
        'AT4G39400.1':
        ("mafft/bri1_homo_100_align.fasta", "iqtree/bri1_figtree.tree"),
        'AT5G46330.1':
        ("mafft/fls2_homo_10_ali.fasta", "iqtree/FLS2_figtree.tree"),
        'AT4G28490.1':
        ("mafft/hae_top10_id_ali.fasta", "iqtree/HAE_figtree.tree"),
        'AT1G73080.1':
        ("mafft/pepr1_top10_ali.fasta", "iqtree/pepr1_figtree.tree"),
        'AT2G02220.1':
        ("mafft/pskr1_top10_ali.fasta", "iqtree/PSKR1_figtree.tree"),
        'AT5G61480.1':
        ("mafft/PXY_top10_ali.fasta", "iqtree/PXY_figtree.tree"),
        "AT4G26540.1":
        ("mafft/RGFR1_top10_ali.fasta", "iqtree/RGFR1_figtree.tree")
    }
    seq_ids_to_data = read_in_ali_seqs(seq_ids_to_file_path)
    relevant_seq_ids = set([
        seq_id for data in seq_ids_to_data.values()
        for seq_id in data.seq_ids_to_seq.keys()
    ])
    # Homologs from the alignments may be missing from the supplied LRR map;
    # fetch those and merge them in (mutates the caller's dict).
    lost_seq_ids = relevant_seq_ids - set(seq_ids_to_lrrs.keys())
    lost_seq_ids_to_lrrs = dao.find_motifs_by_seq_ids(lost_seq_ids,
                                                      MOTIF_VERSION,
                                                      with_wrong=False)
    seq_ids_to_lrrs.update(lost_seq_ids_to_lrrs)
    seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs
                           if seq.seq_id in relevant_seq_ids])
    generate_step3_10_weblogo_per_offset(seq_ids_to_data, seq_ids_to_lrrs)
    generate_step3_10_weblog_lrrs(seq_ids_to_data, seq_ids_to_seq,
                                  seq_ids_to_lrrs)
    generate_step3_10_weblogo_by_types(seq_ids_to_data, seq_ids_to_seq,
                                       seq_ids_to_lrrs)
    statitics_match_count(seq_ids_to_data, seq_ids_to_lrrs)
def main():
    """Build the baseline PSSM matrix and run LRR search over all sequences
    in a 12-worker process pool."""
    # generate the matrix
    seqs = dao.find_baseline_seqs()
    seq_ids_to_seq_str = dict([(seq.seq_id, seq.seq) for seq in seqs])
    logging.info(
        str.format("Baseline sequence ids count({}): {}",
                   len(seq_ids_to_seq_str), seq_ids_to_seq_str.keys()))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(seq_ids_to_seq_str.keys(),
                                                   BASELINE_MOTIF_VERSION,
                                                   with_wrong=False)
    # Motifs are fixed 16-residue windows into the baseline sequences.
    motif_strs = [
        seq_ids_to_seq_str[m.seq_id][m.offset:m.offset + 16]
        for motifs in seq_ids_to_motifs.values() for m in motifs
    ]
    logging.info(
        str.format("Baseline LRR motifs( count {}): {}", len(motif_strs),
                   motif_strs))
    matrix = pssm_matrix.calc_pssm_matrix(motif_strs)
    logging.info(str.format("Matrix: {}", matrix))
    logging.info(str.format("PSSM: {}", matrix.pssm))
    # find the lrr
    with dao.query_session() as session:
        all_seqs = dao.sequence.find_all_seqs(session)
    tasks = [CalculateTask(matrix, seq) for seq in all_seqs]
    with Pool(12) as pool:
        pool.map(find_lrr_and_save_db_task, tasks)
    logging.info("All tasks done")
def test_add_motif(self):
    """Manually add two motifs to SEQ1 and verify they persist with the given scores."""
    # Adding motifs should succeed (None means no error was returned).
    self.assertIsNone(
        dao.add_manually_motif('SEQ1', 400, MOTIF_VERSION, 10.0, 0.1))
    self.assertIsNone(
        dao.add_manually_motif('SEQ1', 420, MOTIF_VERSION, 20.0, 0.2))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(['SEQ1', 'SEQ2'],
                                                   MOTIF_VERSION)
    self.assertEqual(2, len(seq_ids_to_motifs))
    self.assertEqual({'SEQ1', 'SEQ2'}, seq_ids_to_motifs.keys())
    seq1_motifs = seq_ids_to_motifs['SEQ1']
    # SEQ1 had 10 motifs; two manual additions make 12. SEQ2 is untouched.
    self.assertEqual(12, len(seq1_motifs))
    self.assertEqual(10, len(seq_ids_to_motifs['SEQ2']))
    offsets_to_motif = dict([(m.offset, m) for m in seq1_motifs])
    m1 = offsets_to_motif.get(400, None)
    self.assertIsNotNone(m1)
    self.assertEqual(400, m1.offset)
    # assertAlmostEqual gives a clear failure message, unlike the original
    # paired strict inequalities (10.001 > score and 9.999 < score).
    self.assertAlmostEqual(10.0, m1.score, delta=0.001)
    m2 = offsets_to_motif.get(420, None)
    self.assertIsNotNone(m2)
    self.assertEqual(420, m2.offset)
    self.assertAlmostEqual(20.0, m2.score, delta=0.001)
def test_add_tags_by_names_to_ids_two_mark_on_one_motif(self):
    """Applying two wrong-marking tags to one motif keeps it a single wrong motif."""
    # Before tagging, SEQ2 has 10 motifs.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs = seq_ids_to_motifs['SEQ2']
    self.assertEqual(10, len(motifs))
    # Tag motif 0 with the overlap mark.
    dao.add_tags_by_names_to_ids({'inner.overlap': [motifs[0].id]},
                                 MOTIF_VERSION)
    # Check: re-queried motif 0 has correct == False.
    with query_session() as session:
        motif_0_in = dao.motif.find_motifs_by_ids(session, [motifs[0].id],
                                                  MOTIF_VERSION)
        self.assertEqual(1, len(motif_0_in))
        self.assertFalse(motif_0_in[0].correct)
    # Check: querying SEQ2 including wrong motifs still yields 10.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs_after_tag = seq_ids_to_motifs['SEQ2']
    self.assertEqual(10, len(motifs_after_tag))
    # Check: SEQ2 has 9 correct motifs.
    self.assertEqual(
        9,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
    # Additionally tag motif 0 as a manual false discovery.
    dao.add_tags_by_names_to_ids({'inner.falsediscovery': [motifs[0].id]},
                                 MOTIF_VERSION)
    # Check: re-queried motif 0 still has correct == False.
    with query_session() as session:
        motif_0_in = dao.motif.find_motifs_by_ids(session, [motifs[0].id],
                                                  MOTIF_VERSION)
        self.assertEqual(1, len(motif_0_in))
        self.assertFalse(motif_0_in[0].correct)
    # Check: querying SEQ2 including wrong motifs still yields 10.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs_after_tag = seq_ids_to_motifs['SEQ2']
    self.assertEqual(10, len(motifs_after_tag))
    # Check: querying only correct motifs yields 9.
    seq_ids_to_motifs = dao.find_correct_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs_after_tag = seq_ids_to_motifs['SEQ2']
    self.assertEqual(9, len(motifs_after_tag))
    # Check: SEQ2 still has 9 correct motifs.
    self.assertEqual(
        9,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
def test_add_manually_motif_invalid_input(self):
    """Reject manual motif additions with invalid sid/offset inputs."""
    # sid is not a number.
    with boddle(json={"offset": 10}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, "a")
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual(
            {'message': str.format(ErrorCode.INVALID_PARA, "sid", 'a')},
            result)
    # sid does not exist.
    with boddle(json={"offset": 10}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, 10240)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual(
            {'message': str.format(ErrorCode.OBJECT_NOT_EXISTS, 10240)},
            result)
    # offset is not a number.
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
        seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    seq1_sid = seq_ids_to_seq['SEQ1'].id
    seq2_sid = seq_ids_to_seq['SEQ2'].id
    with boddle(json={"offset": "b"}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, seq1_sid)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual(
            {'message': str.format(ErrorCode.INVALID_PARA, "offset", 'b')},
            result)
    # offset overlaps an existing motif.
    with boddle(json={"offset": 195}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, seq1_sid)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual(
            {'message': str.format(ErrorCode.OFFSET_OVERLAP)}, result)
    # offset overlaps an existing motif (second sequence).
    with boddle(json={"offset": 185}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, seq2_sid)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual(
            {'message': str.format(ErrorCode.OFFSET_OVERLAP)}, result)
    # Adding on an offset already marked wrong is forbidden — simply
    # removing the wrong mark is the right operation instead.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ1',
    ], MOTIF_VERSION)
    seq1_second_motif = seq_ids_to_motifs['SEQ1'][1]
    mark_wrong(seq1_sid, seq1_second_motif.id)
    with boddle(json={"offset": seq1_second_motif.offset}):
        result = lrr_search_web_service.add_manually_motif(
            MOTIF_VERSION, seq1_sid)
        self.assertIsNotNone(result)
        result = json.loads(result)
        self.assertDictEqual({'message': ErrorCode.OFFSET_EXISTS_WRONG},
                             result)
def unused_test_replace_tags_by_motifs_false_discovery(self):
    """(Disabled) replace_tags_by_motifs drives the correct flag via inner.* tags.

    NOTE(review): near the end this re-asserts ``motif_0_in``, which was
    queried *before* the last replace and is stale — possibly why the test
    is prefixed ``unused_``.
    """
    # Before tagging, SEQ2 has 10 motifs.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs = seq_ids_to_motifs['SEQ2']
    self.assertEqual(10, len(motifs))
    # Tag motif 0 as a false discovery.
    dao.replace_tags_by_motifs(
        {motifs[0].id: ['a', 'inner.falsediscovery']}, MOTIF_VERSION)
    # Check: re-queried motif 0 has correct == False.
    with query_session() as session:
        motif_0_in = dao.motif.find_motifs_by_ids(session, [motifs[0].id],
                                                  MOTIF_VERSION)
        self.assertEqual(1, len(motif_0_in))
        self.assertFalse(motif_0_in[0].correct)
    # Check: querying SEQ2 including wrong motifs still yields 10.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs_after_tag = seq_ids_to_motifs['SEQ2']
    self.assertEqual(10, len(motifs_after_tag))
    # Check: SEQ2 has 9 correct motifs.
    self.assertEqual(
        9,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
    # Tag motif 5 with the overlap mark; SEQ2 drops to 8 correct motifs.
    dao.replace_tags_by_motifs({motifs[5].id: ['a', 'inner.overlap']},
                               MOTIF_VERSION)
    # Check: querying only correct motifs yields 8.
    seq_ids_to_motifs = dao.find_correct_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs_after_tag = seq_ids_to_motifs['SEQ2']
    self.assertEqual(8, len(motifs_after_tag))
    # Replace motif 5's tags with an unrelated one; 9 correct motifs remain.
    dao.replace_tags_by_motifs({motifs[5].id: ['a']}, MOTIF_VERSION)
    self.assertEqual(1, len(motif_0_in))
    self.assertFalse(motif_0_in[0].correct)
    # Check: querying SEQ2 including wrong motifs still yields 10.
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids([
        'SEQ2',
    ], MOTIF_VERSION)
    self.assertTrue('SEQ2' in seq_ids_to_motifs)
    motifs_after_tag = seq_ids_to_motifs['SEQ2']
    self.assertEqual(10, len(motifs_after_tag))
    # Check: SEQ2 has 9 correct motifs.
    self.assertEqual(
        9,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
def test_tag_false_discovery(self):
    """Marking a motif as inner.falsediscovery flips its correct flag and
    reduces the sequence's correct-motif count by one."""
    with dao.query_session() as session:
        seqs = dao.sequence.find_all_seqs(session)
        seq_ids_to_seq = dict([(seq.seq_id, seq) for seq in seqs])
    # 1. Before tagging: check SEQ1/2/3 LRR counts and first-motif correctness.
    self.assertEqual(
        10,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ1',
            ], MOTIF_VERSION)['SEQ1']))
    self.assertEqual(
        20,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
    self.assertEqual(
        30,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ3',
            ], MOTIF_VERSION)['SEQ3']))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        {'SEQ1', 'SEQ2', 'SEQ3'}, MOTIF_VERSION)
    self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'},
                     set(seq_ids_to_motifs.keys()))
    self.assertTrue(seq_ids_to_motifs['SEQ1'][0].correct)
    self.assertTrue(seq_ids_to_motifs['SEQ2'][0].correct)
    self.assertTrue(seq_ids_to_motifs['SEQ3'][0].correct)
    # 2. Tag the first motif of SEQ1 and SEQ2 with inner.falsediscovery.
    result = mark_wrong(seq_ids_to_seq['SEQ1'].id,
                        seq_ids_to_motifs['SEQ1'][0].id)
    self.assertIsNotNone(result)
    result = json.loads(result)
    self.assertTrue(result['false_discovery'])
    result = mark_wrong(seq_ids_to_seq['SEQ2'].id,
                        seq_ids_to_motifs['SEQ2'][0].id)
    self.assertIsNotNone(result)
    result = json.loads(result)
    self.assertTrue(result['false_discovery'])
    # 3. Re-check: SEQ1/SEQ2 counts drop by one, their first motifs are
    # now incorrect; SEQ3 is unchanged.
    self.assertEqual(
        9,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ1',
            ], MOTIF_VERSION)['SEQ1']))
    self.assertEqual(
        19,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ2',
            ], MOTIF_VERSION)['SEQ2']))
    self.assertEqual(
        30,
        len(
            dao.find_correct_motifs_by_seq_ids([
                'SEQ3',
            ], MOTIF_VERSION)['SEQ3']))
    seq_ids_to_motifs = dao.find_motifs_by_seq_ids(
        {'SEQ1', 'SEQ2', 'SEQ3'}, MOTIF_VERSION)
    self.assertEqual({'SEQ1', 'SEQ2', 'SEQ3'},
                     set(seq_ids_to_motifs.keys()))
    self.assertFalse(seq_ids_to_motifs['SEQ1'][0].correct)
    self.assertFalse(seq_ids_to_motifs['SEQ2'][0].correct)
    self.assertTrue(seq_ids_to_motifs['SEQ3'][0].correct)
    self.assertEqual(10, len(seq_ids_to_motifs['SEQ1']))
    self.assertEqual(20, len(seq_ids_to_motifs['SEQ2']))
    self.assertEqual(30, len(seq_ids_to_motifs['SEQ3']))
def _get_motifs_by_seq_id(seq_id):
    """Return every motif recorded for *seq_id* under MOTIF_VERSION, or []."""
    found = dao.find_motifs_by_seq_ids([
        seq_id,
    ], MOTIF_VERSION)
    return found.get(seq_id, [])