def __init__(self, subject=None, database=None, program='blastn',
                 parameters=None, filters=None):
        '''It inits the class.

        Query should be a sequence and subject can be one or several.
        Exactly one of subject or database has to be given.

        subject    -- an fhand (fasta) or a string. It is converted with
                      _seq_to_fasta_fhand and handed to the runner by file
                      name. The output is requested with alig_format 0 and
                      parsed with the 'blast_text' parser.
        database   -- value passed to the runner as its 'database' parameter.
                      The output is requested with alig_format 5 and parsed
                      with the 'blast' parser.
        program    -- name of the tool given to create_runner.
        parameters -- optional dict with extra parameters for the runner.
                      It is copied, the dict of the caller is not modified.
        filters    -- stored untouched in self._filters.

        It raises a ValueError when neither or both of subject and database
        are given.
        '''
        if subject is None and database is None:
            raise ValueError('Either subject or database should be given')
        if subject is not None and database is not None:
            msg = 'subject and database can not be given at the same time'
            raise ValueError(msg)

        # Work on a private copy: the keys added below must not leak into
        # the dict owned by the caller.
        parameters = {} if parameters is None else dict(parameters)
        self._filters = filters

        # After the checks above exactly one of subject/database is set.
        if subject is not None:
            parameters['alig_format'] = 0
            self._parser = get_alignment_parser('blast_text')
            self._subject_fhand = _seq_to_fasta_fhand(subject)
            parameters['subject'] = self._subject_fhand.name
        else:
            parameters['database'] = database
            parameters['alig_format'] = 5
            self._parser = get_alignment_parser('blast')
        self._program = program
        self._aligner = create_runner(tool=program, parameters=parameters)
# Example #2
0
def similar_sequences_for_blast(blast_fhand, filters=None):
    '''It looks for similar sequences in a blast result.

    blast_fhand -- blast output, parsed with the 'blast+' alignment parser.
    filters     -- configuration handed to filter_alignments. When None a
                   default is used: similarity score of at least 90 and at
                   least 100 residues measured in the query.

    Only the first alignment that passes the filters is looked at. It returns
    a list with one dict per match of that alignment (keys: name,
    subject_start, subject_end, query_start and query_end), or an empty list
    when no alignment passes the filters.
    '''
    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [{'kind'     : 'score_threshold',
                    'score_key': 'similarity',
                    'min_score': 90,
                   },
                   {'kind'            : 'min_length',
                    'min_num_residues': 100,
                    'length_in_query' : True
                   }
                  ]
    alignments = filter_alignments(blast_result, config=filters)
    try:
        # the next() builtin works with any iterator, the .next() method is
        # only available in python 2 iterators
        alignment = next(alignments)
    except StopIteration:
        return []

    #to which sequences our query is similar?
    return [{'name':          match['subject'].name,
             'subject_start': match['subject_start'],
             'subject_end':   match['subject_end'],
             'query_start':   match['start'],
             'query_end':     match['end']
            } for match in alignment['matches']]
# Example #3
0
def similar_sequences_for_blast(blast_fhand, filters=None):
    """It looks for similar sequences in a blast result.

    blast_fhand -- blast output, parsed with the "blast+" alignment parser.
    filters     -- configuration handed to filter_alignments. When None a
                   default is used: similarity score of at least 90 and at
                   least 100 residues measured in the query.

    Only the first alignment that passes the filters is looked at. It returns
    a list with one dict per match of that alignment (keys: name,
    subject_start, subject_end, query_start and query_end), or an empty list
    when no alignment passes the filters.
    """
    # now we parse the blast
    blast_parser = get_alignment_parser("blast+")
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [
            {"kind": "score_threshold", "score_key": "similarity", "min_score": 90},
            {"kind": "min_length", "min_num_residues": 100, "length_in_query": True},
        ]
    alignments = filter_alignments(blast_result, config=filters)
    try:
        # The next() builtin works with any iterator; the .next() method
        # does not exist on python 3 iterators.
        alignment = next(alignments)
    except StopIteration:
        return []

    # to which sequences our query is similar?
    return [
        {
            "name": match["subject"].name,
            "subject_start": match["subject_start"],
            "subject_end": match["subject_end"],
            "query_start": match["start"],
            "query_end": match["end"],
        }
        for match in alignment["matches"]
    ]
def create_aligner_filter(aligner_cmd, cmd_parameters, match_filters=None,
                          environment=None):
    '''A function factory that creates aligner filters.

    aligner_cmd    -- name of the aligner. It is used to choose the parser,
                      to create the runner and as the key to look for the
                      output in the result returned by the runner.
    cmd_parameters -- parameters passed to create_runner.
    match_filters  -- configuration passed to filter_alignments.
    environment    -- passed to create_runner.

    It returns a function that will accept a sequence and it will return
    True when at least one alignment passes the match filters and False
    otherwise. For a None sequence the returned function gives False without
    running the aligner.
    '''
    parser = get_alignment_parser(aligner_cmd)

    run_align_for_seq = create_runner(tool=aligner_cmd, environment=environment,
                                      parameters=cmd_parameters)

    def _filter(sequence):
        'Given a sequence it returns True or False depending on the alignment'
        if sequence is None:
            return False
        source_result = run_align_for_seq(sequence)[aligner_cmd]
        results = parser(source_result)
        filtered_results = filter_alignments(results, config=match_filters)
        try:
            #only one sequence -> only one result
            #next() works with any iterator, .next() is python 2 only
            next(filtered_results)
        except StopIteration:
            #there was no result for this sequence
            return False
        return True
    return _filter
    def __init__(self, subject, parameters=None, filters=None):
        '''It inits the class.

        subject    -- converted with _seq_to_fasta_fhand; the name of the
                      resulting file is given to the runner as 'target'.
        parameters -- optional dict with extra parameters for the runner.
                      It is copied, the dict of the caller is not modified.
        filters    -- stored untouched in self._filters.
        '''
        # Work on a private copy: the 'target' key added below must not leak
        # into the dict owned by the caller.
        parameters = {} if parameters is None else dict(parameters)
        self._filters = filters

        self._parser = get_alignment_parser('exonerate')

        self._subject_fhand = _seq_to_fasta_fhand(subject)
        parameters['target'] = self._subject_fhand.name
        self._aligner = create_runner(tool='exonerate', parameters=parameters)
def similar_sequences_for_blast(blast_fhand, filters):
    '''It looks for similar sequences in a blast result.

    blast_fhand -- blast output, parsed with the 'blast+' alignment parser.
    filters     -- configuration handed to filter_alignments.

    It is a generator. For every match of every alignment that passes the
    filters it yields a dict with the keys: name, subject_description,
    query_name, subject_start, subject_end, query_start, query_end, evalue,
    identity and similarity. The last three are the 'expect', 'identity' and
    'similarity' scores of the match converted to str, or None when the
    match does not have that score.
    '''
    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    # output key -> key of the score in match['scores']
    optional_scores = (('evalue', 'expect'),
                       ('identity', 'identity'),
                       ('similarity', 'similarity'))

    alignments = filter_alignments(blast_result, config=filters)
    for alignment in alignments:
        query_name = alignment['query'].name
        for match in alignment['matches']:
            #to which sequence our query is similar?
            subject = match['subject']
            similar_seq = {'name':                subject.name,
                           'subject_description': subject.description,
                           'query_name':          query_name,
                           'subject_start':       match['subject_start'],
                           'subject_end':         match['subject_end'],
                           'query_start':         match['start'],
                           'query_end':           match['end'],
                          }
            #the scores are optional, the missing ones are reported as None
            scores = match['scores']
            for out_key, score_key in optional_scores:
                if score_key in scores:
                    similar_seq[out_key] = str(scores[score_key])
                else:
                    similar_seq[out_key] = None
            yield similar_seq
def create_unique_contiguous_region_filter(distance, genomic_db,
                                           genomic_seqs_fpath):
    '''It returns a filter that removes snv in a region that give more than one
    match or more than one match_parts.

    distance           -- number of positions taken at each side of the snv
                          to build the fragment that is blasted. It is also
                          the threshold under which the result is stored.
    genomic_db         -- blast database given to the blastn runner.
    genomic_seqs_fpath -- path to the file with the genomic sequences, it is
                          indexed with SeqIO.index.

    It raises a ValueError if genomic_db or genomic_seqs_fpath are empty.

    The returned function takes a sequence, stores a 'uniq_contiguous' filter
    result in each of its snv features and returns the same sequence (or
    None if the sequence is None).
    '''
    # The arguments are checked before they are used to build anything
    if not genomic_seqs_fpath:
        msg = 'No genomic sequence file defined for unique SNV filter'
        raise ValueError(msg)
    if not genomic_db:
        msg = 'No genomic blast database defined for unique SNV filter'
        raise ValueError(msg)

    parameters = {'database': genomic_db}
    blast_runner = create_runner(tool='blastn', parameters=parameters)
    blast_parser = get_alignment_parser('blast')
    match_filters = [{'kind'     : 'score_threshold',
                      'score_key': 'similarity',
                      'min_score': 90,
                     },
                     {'kind'            : 'min_length',
                      'min_num_residues': 20,
                      'length_in_query' : True
                     }
                    ]

    # The fhand is only required to guess the format, the index is built
    # from the path, so the fhand is closed as soon as the format is known.
    # NOTE(review): assumes guess_seq_file_format returns a value that does
    # not need the fhand to remain open -- confirm
    with open(genomic_seqs_fpath) as genomic_seqs_fhand:
        genomic_seqs_name = genomic_seqs_fhand.name
        genomic_seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_name, genomic_seqs_format)

    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous'''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            #we make a blast
            #with the sequence around the snv
            location = snv.location.start.position
            start = max(0, location - distance)
            end = location + distance
            seq_fragment = sequence[start:end]
            blast_fhand = blast_runner(seq_fragment)['blastn']
            # the blast output is closed even if something fails
            try:
                #now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)
                #are there any similar sequences?
                try:
                    alignment = next(alignments)
                except StopIteration:
                    #if there is no similar sequence we assume that is unique
                    result = False
                else:
                    #how many matches, it should be only one
                    if len(alignment['matches']) > 1:
                        result = True
                    else:
                        #how many match parts have the first match?
                        #we could do it with the blast result, but blast is
                        #not very good aligning, so we realign with
                        #est2genome
                        blast_fhand.seek(0)
                        sim_seqs = similar_sequences_for_blast(blast_fhand)
                        sim_seq = sim_seqs[0] if sim_seqs else None

                        introns = infer_introns_for_cdna(
                                         sequence=seq_fragment,
                                         genomic_seqs_index=genomic_seqs_index,
                                         similar_sequence=sim_seq,
                                         genomic_db=genomic_db)
                        result = bool(introns)
            finally:
                blast_fhand.close()
            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence

    return unique_contiguous_region_filter