def test_min_scores_filter(self):
    """We can keep the hits whose scores pass the given threshold.

    The ``score_threshold`` filter is run twice over the ``blast.xml``
    fixture: once with a ``max_score`` on ``expect`` and once with a
    ``min_score`` on ``similarity``. Each time the summary of the filtered
    matches is compared with the expected per-query values.
    """
    # Both parsers read from the same handle, so it is kept open for the
    # whole test; the context manager closes it even when a check fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        # with evalue
        filters = [{'kind': 'score_threshold',
                    'score_key': 'expect',
                    'max_score': 1e-34,
                    }]
        expected = {'cCL1Contig2': 2, 'cCL1Contig3': 0,
                    'cCL1Contig4': 2, 'cCL1Contig5': 2}
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)

        # with similarity
        filters = [{'kind': 'score_threshold',
                    'score_key': 'similarity',
                    'min_score': 92,
                    }]
        expected = {'cCL1Contig2': 0, 'cCL1Contig3': 0,
                    'cCL1Contig4': 1, 'cCL1Contig5': 2}
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)
def _look_for_blast_matches(self, seqrecords, blastdb):
    """Blast the sequences, filter the results and index the match parts.

    ``_do_blast_2`` is run with ``self.program`` and ``self.params``. When
    ``self.filters`` is not None the alignments go through
    ``filter_alignments``. When ``self.elongate_for_global`` is true the
    match parts of every match are elongated with
    ``elongate_match_parts_till_global`` (``align_completely=SUBJECT``).

    Returns a dict that maps every query name to a list holding the match
    parts of all the matches found for that query. Queries without matches
    get no entry.
    """
    blasts, blast_fhand = _do_blast_2(blastdb, seqrecords, self.program, params=self.params)
    # The blast output handle must be released even if the filtering or the
    # parsing fails half way through.
    try:
        if self.filters is not None:
            blasts = filter_alignments(blasts, config=self.filters)

        indexed_match_parts = {}
        for blast in blasts:
            query = blast["query"]
            for match in blast["matches"]:
                subject = match["subject"]
                if self.elongate_for_global:
                    # The return value is ignored, so the elongation is
                    # expected to happen in place on the given match parts.
                    elongate_match_parts_till_global(
                        match["match_parts"],
                        query_length=query["length"],
                        subject_length=subject["length"],
                        align_completely=SUBJECT,
                    )
                # Accumulate in a list owned by the index, so the match's own
                # list is not grown by the parts of later matches.
                indexed_match_parts.setdefault(query["name"], []).extend(match["match_parts"])
    finally:
        blast_fhand.close()
    return indexed_match_parts
def _look_for_blast_matches(self, seqrecords, blastdb, dbtype):
    """Blast the sequences, filter the results and index them by query name.

    ``_do_blast_2`` is run with ``self.program``, ``self.params``, the given
    ``dbtype`` and ``self._remote``. When ``self.filters`` is not None the
    alignments go through ``filter_alignments``.

    Returns a dict that maps every query name to its blast result.
    """
    blasts, blast_fhand = _do_blast_2(
        blastdb, seqrecords, self.program, params=self.params, dbtype=dbtype, remote=self._remote
    )
    # Close the blast output even when the filtering or the parsing raises.
    try:
        if self.filters is not None:
            blasts = filter_alignments(blasts, config=self.filters)
        # The dict is fully built before the handle is closed; presumably the
        # blasts are read lazily from blast_fhand -- TODO confirm.
        return {blast["query"]["name"]: blast for blast in blasts}
    finally:
        blast_fhand.close()
def test_min_length_filter(self):
    """We can keep the hits whose length is above the given one.

    The ``min_length`` filter is run three times over the ``blast.xml``
    fixture: with a minimum number of residues, with a minimum percentage
    measured on the query and with a minimum percentage measured on the
    subject. Each summary is compared with the expected per-query values.
    """
    # The three parsers share the handle; the context manager closes it even
    # when one of the checks fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        # min length given as a number of residues, measured in the query
        filters = [{'kind': 'min_length',
                    'min_num_residues': 500,
                    'length_in_query': True
                    }]
        expected = {'cCL1Contig2': 3, 'cCL1Contig3': 0,
                    'cCL1Contig4': 1, 'cCL1Contig5': 1}
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)

        # min length given as a percentage, measured in the query
        filters = [{'kind': 'min_length',
                    'min_percentage': 70,
                    'length_in_query': True
                    }]
        expected = {'cCL1Contig2': 0, 'cCL1Contig3': 0,
                    'cCL1Contig4': 2, 'cCL1Contig5': 0}
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)

        # min length given as a percentage, measured in the subject
        filters = [{'kind': 'min_length',
                    'min_percentage': 0.002,
                    'length_in_query': False
                    }]
        expected = {'cCL1Contig2': 3, 'cCL1Contig3': 0,
                    'cCL1Contig4': 1, 'cCL1Contig5': 2}
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)
def test_best_scores_filter(self):
    """We can keep the hits with the best expects.

    The ``best_scores`` filter (on ``expect``, with a ``max_score`` and a
    ``score_tolerance``) is run over the ``blast.xml`` fixture and the
    summary of the filtered matches is compared with the expected one.
    """
    # The context manager closes the fixture even when the check fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        filters = [{'kind': 'best_scores',
                    'score_key': 'expect',
                    'max_score': 1e-4,
                    'score_tolerance': 10
                    }]
        expected = {'cCL1Contig2': 2, 'cCL1Contig3': 1,
                    'cCL1Contig4': 1, 'cCL1Contig5': 2}
        blasts = BlastParser(fhand=blast_file)
        filtered_blasts = filter_alignments(blasts, config=filters)
        match_summary = _summarize_matches(filtered_blasts)
        _check_match_summary(match_summary, expected)
def _look_for_blast_matches(self, seqrecords, blastdb, dbtype):
    """Blast the sequences, filter the results and index them by query name.

    ``_do_blast_2`` is run with ``self.program``, ``self.params``, the given
    ``dbtype`` and ``self._remote``. When ``self.filters`` is not None the
    alignments go through ``filter_alignments``.

    Returns a dict that maps every query name to its blast result.
    """
    blasts, blast_fhand = _do_blast_2(blastdb, seqrecords, self.program,
                                      params=self.params, dbtype=dbtype,
                                      remote=self._remote)
    # Close the blast output even when the filtering or the parsing raises.
    try:
        if self.filters is not None:
            blasts = filter_alignments(blasts, config=self.filters)
        # The dict is fully built before the handle is closed; presumably the
        # blasts are read lazily from blast_fhand -- TODO confirm.
        return {blast['query']['name']: blast for blast in blasts}
    finally:
        blast_fhand.close()
def test_blast_no_result(self):
    """An empty blast output yields no alignments.

    An empty temporary file is parsed and filtered; asking the filtered
    result for its first item must raise ``StopIteration``.
    """
    blast_file = NamedTemporaryFile()
    try:
        blasts = BlastParser(fhand=blast_file)
        filters = [{'kind': 'best_scores',
                    'score_key': 'expect',
                    'max_score': 1e-4,
                    'score_tolerance': 10
                    }]
        filt_b = filter_alignments(blasts, config=filters)
        # The next() builtin works for both old-style (.next) and new-style
        # (__next__) iterators, unlike calling filt_b.next() directly.
        self.assertRaises(StopIteration, next, filt_b)
    finally:
        blast_file.close()
def _look_for_blast_matches(self, seq_fpath, oligos, seqs_type):
    """Look for the oligos in the given sequence file.

    The sequences in ``seq_fpath`` are written with ``seqio`` as fasta into
    a temporary directory and that file is used as the blast database, with
    the oligos as queries (``self.program``, ``self.params``,
    ``dbtype=seqs_type``). When ``self.filters`` is not None the alignments
    go through ``filter_alignments``. When ``self.elongate_for_global`` is
    true the match parts are elongated with
    ``elongate_match_parts_till_global`` (``align_completely=QUERY``).

    Returns a dict that maps every subject (read) name to a list holding
    the match parts of all the matches found on that read.
    """
    # The temporary dir and the blast output are kept around until all the
    # matches have been read, because they're temp files and otherwise they
    # might be removed. They are released even if something fails.
    temp_dir = TemporaryDir()
    try:
        dbpath = os.path.join(temp_dir.name, os.path.basename(seq_fpath))
        # Close both files explicitly so the database is completely written
        # before blast reads it, without relying on garbage collection.
        with open(seq_fpath) as in_fhand, open(dbpath, 'w') as db_fhand:
            seqio([in_fhand], db_fhand, out_format='fasta',
                  copy_if_same_format=False)

        blasts, blast_fhand = _do_blast_2(dbpath, oligos, params=self.params,
                                          program=self.program,
                                          dbtype=seqs_type)
        try:
            if self.filters is not None:
                blasts = filter_alignments(blasts, config=self.filters)

            # Which are the regions covered in each sequence?
            indexed_match_parts = {}
            for blast in blasts:
                oligo = blast['query']
                for match in blast['matches']:
                    read = match['subject']
                    if self.elongate_for_global:
                        elongate_match_parts_till_global(
                            match['match_parts'],
                            query_length=oligo['length'],
                            subject_length=read['length'],
                            align_completely=QUERY)
                    # Accumulate in a list owned by the index, so the
                    # match's own list is not grown by later matches.
                    indexed_match_parts.setdefault(read['name'], []).extend(
                        match['match_parts'])
        finally:
            blast_fhand.close()
    finally:
        temp_dir.close()
    return indexed_match_parts
def _look_for_blast_matches(self, seq_fpath, oligos):
    """Look for the oligos in the given sequence file.

    The sequences in ``seq_fpath`` are written with ``seqio`` as fasta into
    a temporary directory and that file is used as the blast database, with
    the oligos as queries (``self.program``, ``self.params``). When
    ``self.filters`` is not None the alignments go through
    ``filter_alignments``. When ``self.elongate_for_global`` is true the
    match parts are elongated with ``elongate_match_parts_till_global``
    (``align_completely=QUERY``).

    Returns a dict that maps every subject (read) name to a list of match
    parts. With a single oligo the entry of a read is the match parts of
    the last match seen for it; with several oligos the match parts of all
    the matches on that read are accumulated.
    """
    # The temporary dir and the blast output are kept around until all the
    # matches have been read, because they're temp files and otherwise they
    # might be removed. They are released even if something fails.
    temp_dir = TemporaryDir()
    try:
        dbpath = os.path.join(temp_dir.name, os.path.basename(seq_fpath))
        # Close both files explicitly so the database is completely written
        # before blast reads it, without relying on garbage collection.
        with open(seq_fpath) as in_fhand, open(dbpath, "w") as db_fhand:
            seqio([in_fhand], [db_fhand], out_format="fasta", copy_if_same_format=False)

        blasts, blast_fhand = _do_blast_2(dbpath, oligos, params=self.params, program=self.program)
        try:
            if self.filters is not None:
                blasts = filter_alignments(blasts, config=self.filters)

            # Which are the regions covered in each sequence?
            indexed_match_parts = {}
            one_oligo = len(oligos) == 1
            for blast in blasts:
                oligo = blast["query"]
                for match in blast["matches"]:
                    read = match["subject"]
                    if self.elongate_for_global:
                        elongate_match_parts_till_global(
                            match["match_parts"],
                            query_length=oligo["length"],
                            subject_length=read["length"],
                            align_completely=QUERY,
                        )
                    match_parts = match["match_parts"]
                    if one_oligo:
                        # A single oligo: the read's entry is replaced, not
                        # extended.
                        indexed_match_parts[read["name"]] = match_parts
                    else:
                        indexed_match_parts.setdefault(read["name"], []).extend(match_parts)
        finally:
            blast_fhand.close()
    finally:
        temp_dir.close()
    return indexed_match_parts
def _look_for_blast_matches(self, seq_fpath, oligos, seqs_type):
    """Look for the oligos in the given sequence file.

    The sequences in ``seq_fpath`` are written with ``seqio`` as fasta into
    a temporary directory and that file is used as the blast database, with
    the oligos as queries (``self.program``, ``self.params``,
    ``dbtype=seqs_type``). When ``self.filters`` is not None the alignments
    go through ``filter_alignments``. When ``self.elongate_for_global`` is
    true the match parts are elongated with
    ``elongate_match_parts_till_global`` (``align_completely=QUERY``).

    Returns a dict that maps every subject (read) name to a list holding
    the match parts of all the matches found on that read.
    """
    # The temporary dir and the blast output are kept around until all the
    # matches have been read, because they're temp files and otherwise they
    # might be removed. They are released even if something fails.
    temp_dir = TemporaryDir()
    try:
        dbpath = os.path.join(temp_dir.name, os.path.basename(seq_fpath))
        # Close both files explicitly so the database is completely written
        # before blast reads it, without relying on garbage collection.
        with open(seq_fpath) as in_fhand, open(dbpath, 'w') as db_fhand:
            seqio([in_fhand], db_fhand, out_format='fasta',
                  copy_if_same_format=False)

        blasts, blast_fhand = _do_blast_2(dbpath, oligos, params=self.params,
                                          program=self.program,
                                          dbtype=seqs_type)
        try:
            if self.filters is not None:
                blasts = filter_alignments(blasts, config=self.filters)

            # Which are the regions covered in each sequence?
            indexed_match_parts = {}
            for blast in blasts:
                oligo = blast['query']
                for match in blast['matches']:
                    read = match['subject']
                    if self.elongate_for_global:
                        elongate_match_parts_till_global(
                            match['match_parts'],
                            query_length=oligo['length'],
                            subject_length=read['length'],
                            align_completely=QUERY)
                    # Accumulate in a list owned by the index, so the
                    # match's own list is not grown by later matches.
                    indexed_match_parts.setdefault(read['name'], []).extend(
                        match['match_parts'])
        finally:
            blast_fhand.close()
    finally:
        temp_dir.close()
    return indexed_match_parts
def test_min_length_mapper(self):
    """The min_length filter keeps the matches that are long enough.

    Five configurations are exercised on hand-built alignments: a minimum
    number of residues measured in the query (with and without filtering
    the individual match parts), the same minimum measured in the subject,
    and a minimum percentage measured in the query and in the subject.
    """

    def part(query_end, subject_end):
        # Every match part used here starts at 0 in query and subject.
        return {'query_start': 0, 'query_end': query_end,
                'subject_start': 0, 'subject_end': subject_end}

    def span(end, subject_end):
        # Expected limits of a surviving match.
        return {'start': 0, 'end': end,
                'subject_start': 0, 'subject_end': subject_end}

    def apply_filter(aligns, filter_conf):
        return list(filter_alignments(aligns, config=[filter_conf]))

    # --- minimum number of residues, measured in the query
    residues_in_query = {'kind': 'min_length',
                         'min_num_residues': 100,
                         'length_in_query': True}
    first_match = {'match_parts': [part(100, 100), part(50, 50)]}
    second_match = {'match_parts': [part(50, 100)]}
    alignments = [{'matches': [first_match, second_match]}]

    result = apply_filter(alignments, residues_in_query)
    _check_blast(result[0], {'matches': [span(100, 100)]})
    assert len(result) == 1
    assert len(result[0]['matches'][0]['match_parts']) == 2

    # Now filtering every match part (the same alignments are reused)
    residues_per_part = {'kind': 'min_length',
                         'min_num_residues': 100,
                         'length_in_query': True,
                         'filter_match_parts': True}
    result = apply_filter(alignments, residues_per_part)
    _check_blast(result[0], {'matches': [span(100, 100)]})
    assert len(result) == 1
    assert len(result[0]['matches'][0]['match_parts']) == 1

    # --- minimum number of residues, measured in the subject
    # (the same alignments are reused once more)
    residues_in_subject = {'kind': 'min_length',
                           'min_num_residues': 100,
                           'length_in_query': False}
    result = apply_filter(alignments, residues_in_subject)
    _check_blast(result[0], {'matches': [span(100, 100), span(50, 100)]})
    assert len(result) == 1

    # --- minimum percentage, measured in the query
    percent_of_query = {'kind': 'min_length',
                        'min_percentage': 90,
                        'length_in_query': True}
    alignments = [{'query': {'length': 100},
                   'matches': [{'match_parts': [part(90, 100)]},
                               {'match_parts': [part(50, 100)]}]}]
    result = apply_filter(alignments, percent_of_query)
    _check_blast(result[0], {'matches': [span(90, 100)]})
    assert len(result) == 1

    # --- minimum percentage, measured in the subject
    percent_of_subject = {'kind': 'min_length',
                          'min_percentage': 90,
                          'length_in_query': False}
    alignments = [{'matches': [{'subject': {'length': 100},
                                'match_parts': [part(100, 90)]},
                               {'subject': {'length': 100},
                                'match_parts': [part(100, 89)]}]}]
    result = apply_filter(alignments, percent_of_subject)
    _check_blast(result[0], {'matches': [span(100, 90)]})
    assert len(result) == 1
def test_max_score_mapper(self):
    """The best_scores filter drops what is above the max expect.

    Two hand-built alignments are filtered with a ``max_score`` of 1e-3 on
    ``expect``. The test expects the first alignment to keep only the
    matches and match parts with an expect of 1e-3 or less, with the limits
    of the first match shrunk accordingly, and the second alignment to
    disappear.
    """

    def scored(expect):
        return {'scores': {'expect': expect}}

    def located_part(expect, start, end):
        # Query and subject coordinates coincide in all the located parts.
        item = scored(expect)
        item['query_start'] = start
        item['query_end'] = end
        item['subject_start'] = start
        item['subject_end'] = end
        return item

    def located_match(expect, end, parts):
        item = scored(expect)
        item['start'] = 0
        item['end'] = end
        item['subject_start'] = 0
        item['subject_end'] = end
        item['match_parts'] = parts
        return item

    def bare_match(expect):
        # A match with just one match part that has the same score.
        item = scored(expect)
        item['match_parts'] = [scored(expect)]
        return item

    best_filter = {'kind': 'best_scores',
                   'score_key': 'expect',
                   'max_score': 1e-3}

    input_parts = [located_part(1e-4, 0, 10),
                   located_part(5e-4, 30, 40),
                   located_part(1e-3, 50, 60),
                   located_part(1e-2, 80, 100)]
    align1 = {'matches': [located_match(1e-4, 100, input_parts),
                          bare_match(1e-3),
                          bare_match(1e-2)]}
    align2 = {'matches': [bare_match(1e-2)]}

    result = list(filter_alignments([align1, align2], config=[best_filter]))

    # The expectation is built from fresh dicts, never from the input ones.
    kept_parts = [located_part(1e-4, 0, 10),
                  located_part(5e-4, 30, 40),
                  located_part(1e-3, 50, 60)]
    expected_align1 = {'matches': [located_match(1e-4, 60, kept_parts),
                                   bare_match(1e-3)]}
    _check_blast(result[0], expected_align1)
    assert len(result) == 1
def test_min_score_mapper(self):
    """We keep the matches with the scores above the threshold.

    Two hand-built alignments are filtered with a ``min_score`` of 100 on
    ``score``. The test expects the first alignment to keep only its first
    match, reduced to the two match parts that score 400 and 300, and the
    second alignment to disappear.
    """

    def scored(score):
        return {'scores': {'score': score}}

    def located_part(score, start, end):
        # Query and subject coordinates coincide in all the located parts.
        item = scored(score)
        item['query_start'] = start
        item['query_end'] = end
        item['subject_start'] = start
        item['subject_end'] = end
        return item

    def located_match(score, end, parts):
        item = scored(score)
        item['start'] = 0
        item['end'] = end
        item['subject_start'] = 0
        item['subject_end'] = end
        item['match_parts'] = parts
        return item

    def bare_match(score):
        # A match with just one match part that has the same score.
        item = scored(score)
        item['match_parts'] = [scored(score)]
        return item

    threshold_filter = {'kind': 'score_threshold',
                        'score_key': 'score',
                        'min_score': 100}

    input_parts = [located_part(400, 0, 10),
                   located_part(300, 30, 40),
                   located_part(50, 50, 60),
                   located_part(40, 80, 100)]
    align1 = {'matches': [located_match(400, 100, input_parts),
                          bare_match(20),
                          bare_match(90)]}
    align2 = {'matches': [bare_match(20)]}

    result = list(filter_alignments([align1, align2],
                                    config=[threshold_filter]))

    # The expectation is built from fresh dicts, never from the input ones.
    kept_parts = [located_part(400, 0, 10),
                  located_part(300, 30, 40)]
    expected_align1 = {'matches': [located_match(400, 40, kept_parts)]}
    _check_blast(result[0], expected_align1)
    assert len(result) == 1