def test_min_scores_filter(self):
    'We can keep the hits that pass a score threshold (expect or similarity)'
    # Each case pairs a score_threshold filter configuration with the
    # number of matches expected for every query found in blast.xml.
    cases = [
        # with evalue
        ([{'kind': 'score_threshold',
           'score_key': 'expect',
           'max_score': 1e-34}],
         {'cCL1Contig2': 2, 'cCL1Contig3': 0,
          'cCL1Contig4': 2, 'cCL1Contig5': 2}),
        # with similarity
        ([{'kind': 'score_threshold',
           'score_key': 'similarity',
           'min_score': 92}],
         {'cCL1Contig2': 0, 'cCL1Contig3': 0,
          'cCL1Contig4': 1, 'cCL1Contig5': 2}),
    ]

    # The context manager closes blast.xml even when a check fails.
    with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
        for filters, expected in cases:
            # The same handle is given to a fresh parser for every case.
            # NOTE(review): presumably BlastParser rewinds it - confirm.
            blasts = BlastParser(fhand=blast_file)
            filtered_blasts = filter_alignments(blasts, config=filters)
            match_summary = _summarize_matches(filtered_blasts)
            _check_match_summary(match_summary, expected)
Beispiel #2
0
def similar_sequences_for_blast(blast_fhand, filters=None):
    """It looks for similar sequences in a blast result.

    The blast is parsed with the "blast+" alignment parser and filtered with
    ``filters``. When ``filters`` is None a default is used: similarity of at
    least 90 and at least 100 residues measured in the query.

    Only the first alignment that survives the filtering is used. It returns
    a list with one dict per match of that alignment (keys: name,
    subject_start, subject_end, query_start, query_end), or an empty list
    when no alignment passes the filters.
    """
    # now we parse the blast
    blast_parser = get_alignment_parser("blast+")
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [
            {"kind": "score_threshold", "score_key": "similarity", "min_score": 90},
            {"kind": "min_length", "min_num_residues": 100, "length_in_query": True},
        ]
    alignments = filter_alignments(blast_result, config=filters)

    # next() is the builtin spelling of iterator.next(); it works on
    # Python 2.6+ and on Python 3, where the .next() method is gone.
    try:
        alignment = next(alignments)
    except StopIteration:
        return []

    # to which sequences is our query similar?
    return [
        {
            "name": match["subject"].name,
            "subject_start": match["subject_start"],
            "subject_end": match["subject_end"],
            "query_start": match["start"],
            "query_end": match["end"],
        }
        for match in alignment["matches"]
    ]
Beispiel #3
0
def similar_sequences_for_blast(blast_fhand, filters=None):
    '''It looks for similar sequences in a blast result.

    The blast is parsed with the 'blast+' alignment parser and the result is
    filtered with the given filters. With filters=None the default keeps the
    matches with a similarity of at least 90 that span at least 100 residues
    of the query.

    It returns a list with a dict for every match of the first alignment
    yielded by the filtering, or an empty list if nothing is yielded.
    '''
    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [{'kind'     : 'score_threshold',
                    'score_key': 'similarity',
                    'min_score': 90,
                   },
                   {'kind'            : 'min_length',
                    'min_num_residues': 100,
                    'length_in_query' : True
                   }
                  ]
    alignments = filter_alignments(blast_result, config=filters)
    try:
        # builtin next() instead of the Python 2 only .next() method
        alignment = next(alignments)
    except StopIteration:
        return []

    similar_seqs = []
    for match in alignment['matches']:
        #to which sequence our query is similar?
        similar_seqs.append({'name':          match['subject'].name,
                             'subject_start': match['subject_start'],
                             'subject_end':   match['subject_end'],
                             'query_start':   match['start'],
                             'query_end':     match['end']
                             })
    return similar_seqs
Beispiel #4
0
def get_hit_pairs_fom_blast(blast_fhand, sub_def_as_acc=None, filters=None):
    '''It yields (query, subject) tuples for the hits found in the blast.

    The blast is read with BlastParser and filtered with the given filters;
    with filters=None a best_scores filter on the expect is used. Both
    members of every tuple are the first whitespace-delimited token of the
    sequence id (or of its name when the sequence has no id attribute).
    '''

    def _short_id(seq):
        'It returns the first token of the id, or of the name if no id'
        try:
            identifier = seq.id
        except AttributeError:
            identifier = seq.name
        return identifier.split()[0]

    parser = BlastParser(fhand=blast_fhand,
                         subj_def_as_accesion=sub_def_as_acc)
    if filters is None:
        filters = [{'kind': 'best_scores',
                    'score_key': 'expect',
                    'max_score': 1e-20,
                    'score_tolerance': 10}]

    for alignment in filter_alignments(parser, config=filters):
        query_id = _short_id(alignment['query'])
        for hit in alignment['matches']:
            yield (query_id, _short_id(hit['subject']))
    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous.

        For every snv feature without a previous 'uniq_contiguous' result
        the fragment of the sequence within `distance` of the snv is blasted
        and the result is stored with _add_filter_result:
            - False when no alignment passes match_filters, or when the only
              match has no inferred introns.
            - True when there is more than one match, or when introns are
              inferred for the only match.
        distance, blast_runner, blast_parser, match_filters,
        genomic_seqs_index and genomic_db are taken from the enclosing
        scope. It returns the given sequence, or None if sequence is None.
        '''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            #we make a blast with the sequence around the snv
            location = snv.location.start.position
            # the fragment can not start before the sequence does
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']
            # try/finally: the blast output is closed even if the parsing or
            # the intron inference fails
            try:
                #now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                #are there any similar sequences?
                try:
                    # builtin next() instead of the Python 2 only .next()
                    alignment = next(alignments)
                except StopIteration:
                    #if there is no similar sequence we assume that is unique
                    result = False
                else:
                    #how many matches, it should be only one
                    if len(alignment['matches']) > 1:
                        result = True
                    else:
                        #how many match parts have the first match?
                        #we could do it with the blast result, but blast is
                        #not very good aligning, so we realign with
                        #est2genome
                        blast_fhand.seek(0)
                        sim_seqs = similar_sequences_for_blast(blast_fhand)
                        sim_seq = sim_seqs[0] if sim_seqs else None

                        introns = infer_introns_for_cdna(
                                        sequence=seq_fragment,
                                        genomic_seqs_index=genomic_seqs_index,
                                        similar_sequence=sim_seq,
                                        genomic_db=genomic_db)
                        result = bool(introns)
            finally:
                blast_fhand.close()
            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence
 def do_alignment(self, query):
     '''It aligns the query against every subject with sw_align.

     It returns the list of alignments, or the result of filter_alignments
     on that list when the aligner has filters configured.
     '''
     alignments = [sw_align(query, subject) for subject in self._subjects]
     if self._filters is None:
         return alignments
     return filter_alignments(alignments, config=self._filters)
    def do_alignment(self, query):
        '''It runs the aligner with the query and returns the alignments.

        The aligner output for the configured program is parsed and, when
        filters are configured, the parsed alignments are filtered.
        '''
        # the aligner result is indexed by program name
        raw_output = self._aligner(query)[self._program]
        parsed_alignments = self._parser(raw_output)

        if self._filters is None:
            return parsed_alignments
        return filter_alignments(parsed_alignments, config=self._filters)
Beispiel #8
0
def get_hit_pairs_from_blast(blast_fhand, sub_def_as_acc=None, filters=None):
    """It returns the query-subject pairs of the hits in the blast.

    The blast is read with BlastParser, filtered with ``filters`` (a
    best_scores filter on the expect when None) and the filtered alignments
    are handed to get_pairs_from_alignments, whose result is returned.
    """
    if filters is None:
        best_expects = {
            "kind": "best_scores",
            "score_key": "expect",
            "max_score": 1e-20,
            "score_tolerance": 10,
        }
        filters = [best_expects]

    parser = BlastParser(fhand=blast_fhand, subj_def_as_accesion=sub_def_as_acc)
    return get_pairs_from_alignments(filter_alignments(parser, config=filters))
    def test_min_length_filter(self):
        'We can keep the hits whose length is above the given one'
        # Each case pairs a min_length filter configuration with the number
        # of matches expected for every query found in blast.xml.
        cases = [
            #with the min length given in base pairs
            ([{'kind': 'min_length',
               'min_num_residues': 500,
               'length_in_query': True}],
             {'cCL1Contig2': 3, 'cCL1Contig3': 0,
              'cCL1Contig4': 1, 'cCL1Contig5': 1}),
            #with the min length given in query %
            ([{'kind': 'min_length',
               'min_percentage': 70,
               'length_in_query': True}],
             {'cCL1Contig2': 0, 'cCL1Contig3': 0,
              'cCL1Contig4': 2, 'cCL1Contig5': 0}),
            #with the min length given in subject %
            ([{'kind': 'min_length',
               'min_percentage': 0.002,
               'length_in_query': False}],
             {'cCL1Contig2': 3, 'cCL1Contig3': 0,
              'cCL1Contig4': 1, 'cCL1Contig5': 2}),
        ]

        # The context manager closes blast.xml even when a check fails.
        with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
            for filters, expected in cases:
                # The same handle is given to a fresh parser for every case.
                # NOTE(review): presumably BlastParser rewinds it - confirm.
                blasts = BlastParser(fhand=blast_file)
                filtered_blasts = filter_alignments(blasts, config=filters)
                match_summary = _summarize_matches(filtered_blasts)
                _check_match_summary(match_summary, expected)
 def test_best_scores_filter(self):
     'We can keep the hits with the best expects'
     filters = [{'kind'           : 'best_scores',
                 'score_key'      : 'expect',
                 'max_score'      : 1e-4,
                 'score_tolerance': 10
                }]
     # number of matches expected for every query found in blast.xml
     expected = {'cCL1Contig2':2, 'cCL1Contig3':1,
                 'cCL1Contig4':1, 'cCL1Contig5':2}
     # The context manager closes blast.xml even when the check fails.
     with open(os.path.join(TEST_DATA_DIR, 'blast.xml')) as blast_file:
         blasts = BlastParser(fhand=blast_file)
         filtered_blasts = filter_alignments(blasts, config=filters)
         match_summary = _summarize_matches(filtered_blasts)
         _check_match_summary(match_summary, expected)
 def _filter(sequence):
     'Giving a sequence it returns true or False depending on the exonerate'
     if sequence is None:
         return False
     source_result    = run_align_for_seq(sequence)[aligner_cmd]
     results          = parser(source_result)
     filtered_results = filter_alignments(results, config=match_filters)
     try:
         #only one sequence -> only one result
         filtered_results.next()
     except StopIteration:
         #there was no result for this sequence
         return False
     return True
    def test_blast_no_result(self):
        'It tests that the xml output can be an empty string'
        filters = [{'kind'           : 'best_scores',
                    'score_key'      : 'expect',
                    'max_score'      : 1e-4,
                    'score_tolerance': 10
                   }]
        # A fresh NamedTemporaryFile is an empty file; the with block closes
        # it when the test ends, even if the test fails.
        with NamedTemporaryFile() as blast_file:
            blasts = BlastParser(fhand=blast_file)
            filt_b = filter_alignments(blasts, config=filters)
            try:
                # builtin next() instead of the Python 2 only .next() method
                next(filt_b)
            except StopIteration:
                pass
            else:
                self.fail('An empty blast should not yield any alignment')
def _get_descriptions_from_blasts(blasts):
    '''It gets a description from a list of blast outputs.
    Blast description in the xml may be modified to remove trash. This depends
    on blast xml, so the item of the list can be a blast or a dict with the
    blast and the function to modify the description field.

    It tries to find the name in the first file, after in the second, etc'''

    seq_annot = {}
    filters = [{'kind'           : 'best_scores',
                'score_key'      : 'expect',
                'max_score'      : 1e-20,
                'score_tolerance': 10}]
    for blast in blasts:
        blast_fhand = blast['blast']
        if 'modifier' in blast:
            modifier = blast['modifier']
        else:
            modifier = None
        blast_fhand = get_fhand(blast_fhand)
        blast = BlastParser(fhand=blast_fhand)
        filtered_results = filter_alignments(blast, config=filters)
        db_name = blast.db_name
        try:
            for match in filtered_results:
                try:
                    query = match['query'].id
                except AttributeError:
                    query = match['query'].name
                if query not in seq_annot:
                    match_hit = match['matches'][0]
                    description = match_hit['subject'].description
                    subject_name = match_hit['subject'].name
                    if modifier is not None:
                        description = modifier(description)
                    if description != "<unknown description>":
                        seq_annot[query] = {'description':description.strip(),
                                            'db_name':db_name,
                                            'subj_name': subject_name}
        except ExpatError as error:
            msg = str(error) + ':%s' % blast_fhand.name
            raise ExpatError(msg)
    return seq_annot
def similar_sequences_for_blast(blast_fhand, filters):
    '''It looks for similar sequences in a blast result.

    The blast is parsed with the 'blast+' alignment parser and filtered with
    the given filters. It yields a dict for every match of every alignment
    that passes the filters, with the subject name and description, the
    query name, the match limits in query and subject and the expect,
    identity and similarity scores. Every score is given as a str, or as
    None when the match lacks that score.
    '''

    def _score_as_str(scores, key):
        'It returns the score as a str, or None if the score is missing'
        return str(scores[key]) if key in scores else None

    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    alignments = filter_alignments(blast_result, config=filters)
    for alignment in alignments:
        query_name = alignment['query'].name
        for match in alignment['matches']:
            # a leftover debug print of every match was removed from here
            #to which sequence our query is similar?
            subject = match['subject']
            scores = match['scores']
            yield {'name':                subject.name,
                   'subject_description': subject.description,
                   'query_name':          query_name,
                   'subject_start':       match['subject_start'],
                   'subject_end':         match['subject_end'],
                   'query_start':         match['start'],
                   'query_end':           match['end'],
                   'evalue':              _score_as_str(scores, 'expect'),
                   'identity':            _score_as_str(scores, 'identity'),
                   'similarity':          _score_as_str(scores, 'similarity'),
                   }
    def test_min_score_mapper(self):
        'We keep the matches and match parts that reach the min score'

        # The factories build brand new dicts on every call, so the input
        # and the expected alignments never share an object.
        def part(score, start, end):
            'A match part with the same limits in query and subject'
            return {'scores': {'score': score},
                    'query_start': start, 'query_end': end,
                    'subject_start': start, 'subject_end': end}

        def located_match(score, end, parts):
            'A match that starts at 0 in both query and subject'
            return {'scores': {'score': score},
                    'start': 0, 'end': end,
                    'subject_start': 0, 'subject_end': end,
                    'match_parts': parts}

        def bare_match(score):
            'A match with just a score and one scored match part'
            return {'scores': {'score': score},
                    'match_parts': [{'scores': {'score': score}}]}

        threshold_filter = {'kind': 'score_threshold',
                            'score_key': 'score',
                            'min_score': 100}

        # only the first match, with its first two parts, reaches the score
        first_alignment = {'matches': [
            located_match(400, 100, [part(400, 0, 10),
                                     part(300, 30, 40),
                                     part(50, 50, 60),
                                     part(40, 80, 100)]),
            bare_match(20),
            bare_match(90),
        ]}
        # no match of the second alignment reaches the score
        second_alignment = {'matches': [bare_match(20)]}

        alignments = [first_alignment, second_alignment]
        filtered_alignments = list(filter_alignments(
                                            alignments,
                                            config=[threshold_filter]))

        # the limits of the surviving match shrink to its surviving parts
        expected_alignment = {'matches': [
            located_match(400, 40, [part(400, 0, 10),
                                    part(300, 30, 40)]),
        ]}
        _check_blast(filtered_alignments[0], expected_alignment)
        assert len(filtered_alignments) == 1
    def test_max_score_mapper(self):
        'We keep the matches and match parts with an expect up to the max'

        # The factories build brand new dicts on every call, so the input
        # and the expected alignments never share an object.
        def part(expect, start, end):
            'A match part with the same limits in query and subject'
            return {'scores': {'expect': expect},
                    'query_start': start, 'query_end': end,
                    'subject_start': start, 'subject_end': end}

        def located_match(expect, end, parts):
            'A match that starts at 0 in both query and subject'
            return {'scores': {'expect': expect},
                    'start': 0, 'end': end,
                    'subject_start': 0, 'subject_end': end,
                    'match_parts': parts}

        def bare_match(expect):
            'A match with just an expect and one scored match part'
            return {'scores': {'expect': expect},
                    'match_parts': [{'scores': {'expect': expect}}]}

        best_filter = {'kind': 'best_scores',
                       'score_key': 'expect',
                       'max_score': 1e-3}

        first_alignment = {'matches': [
            located_match(1e-4, 100, [part(1e-4, 0, 10),
                                      part(5e-4, 30, 40),
                                      part(1e-3, 50, 60),
                                      part(1e-2, 80, 100)]),
            bare_match(1e-3),
            bare_match(1e-2),
        ]}
        # the only match of the second alignment is above the max expect
        second_alignment = {'matches': [bare_match(1e-2)]}

        alignments = [first_alignment, second_alignment]
        filtered_alignments = list(filter_alignments(alignments,
                                                     config=[best_filter]))

        # the 1e-2 part and the 1e-2 match are gone and the limits of the
        # first match shrink to the parts that are left
        expected_alignment = {'matches': [
            located_match(1e-4, 60, [part(1e-4, 0, 10),
                                     part(5e-4, 30, 40),
                                     part(1e-3, 50, 60)]),
            bare_match(1e-3),
        ]}
        _check_blast(filtered_alignments[0], expected_alignment)
        assert len(filtered_alignments) == 1
    def test_min_length_mapper(self):
        'We can filter the matches according to their length'
        # The filters get their own names instead of rebinding "filter",
        # which shadowed the builtin of the same name.

        # min length in residues, measured in the query
        residues_in_query = {'kind'            : 'min_length',
                             'min_num_residues': 100,
                             'length_in_query' : True,
                            }
        align1 = {'matches': [{'match_parts':[{'query_start':0, 'query_end':100,
                                               'subject_start':0,
                                               'subject_end':100, }]},
                              {'match_parts':[{'query_start':0, 'query_end':50,
                                               'subject_start':0,
                                               'subject_end':100, }]},
                             ]
                 }
        alignments = [align1]

        filtered_alignments = list(filter_alignments(alignments,
                                                 config=[residues_in_query]))
        expected_align1 = {'matches': [{'start':0, 'end':100,
                                        'subject_start':0, 'subject_end':100, },
                                      ]
                          }
        _check_blast(filtered_alignments[0], expected_align1)
        assert len(filtered_alignments) == 1

        # min length in residues, measured in the subject; the alignments of
        # the previous scenario are filtered again
        residues_in_subject = {'kind'            : 'min_length',
                               'min_num_residues': 100,
                               'length_in_query' : False,
                              }
        filtered_alignments = list(filter_alignments(alignments,
                                               config=[residues_in_subject]))
        expected_align1 = {'matches': [{'start':0, 'end':100,
                                        'subject_start':0, 'subject_end':100, },
                                       {'start':0, 'end':50,
                                        'subject_start':0, 'subject_end':100, },
                                      ]
                          }
        _check_blast(filtered_alignments[0], expected_align1)
        assert len(filtered_alignments) == 1

        # min length as a percentage of the query
        percentage_of_query = {'kind'           : 'min_length',
                               'min_percentage' : 90,
                               'length_in_query': True,
                              }
        align1 = {'query':UnknownSeq(100),
                  'matches': [{'match_parts':[{'query_start':0, 'query_end':90,
                                               'subject_start':0,
                                               'subject_end':100, }]},
                              {'match_parts':[{'query_start':0, 'query_end':50,
                                               'subject_start':0,
                                               'subject_end':100, }]},
                             ]
                 }
        alignments = [align1]

        filtered_alignments = list(filter_alignments(alignments,
                                               config=[percentage_of_query]))
        expected_align1 = {'matches': [{'start':0, 'end':90,
                                        'subject_start':0, 'subject_end':100, },
                                      ]
                          }
        _check_blast(filtered_alignments[0], expected_align1)
        assert len(filtered_alignments) == 1

        # min length as a percentage of the subject
        percentage_of_subject = {'kind'           : 'min_length',
                                 'min_percentage' : 90,
                                 'length_in_query': False,
                                }
        align1 = {'matches': [{'subject': UnknownSeq(100),
                               'match_parts':[{'query_start':0, 'query_end':100,
                                               'subject_start':0,
                                               'subject_end':90, }]},
                              {'subject': UnknownSeq(100),
                               'match_parts':[{'query_start':0, 'query_end':100,
                                               'subject_start':0,
                                               'subject_end':89, }]},
                             ]
                 }
        alignments = [align1]

        filtered_alignments = list(filter_alignments(alignments,
                                             config=[percentage_of_subject]))
        expected_align1 = {'matches': [{'start':0, 'end':100,
                                        'subject_start':0, 'subject_end':90, },
                                      ]
                          }
        _check_blast(filtered_alignments[0], expected_align1)
        assert len(filtered_alignments) == 1