def __init__(self, subject=None, database=None, program='blastn',
             parameters=None, filters=None):
    '''It inits the class.

    Query should be a sequence and subject can be one or several.
    Exactly one of subject or database has to be given.

    subject could be an fhand (fasta) or a string; it is converted with
    _seq_to_fasta_fhand and the name of the resulting file is handed to the
    runner as the 'subject' parameter. In this mode 'alig_format' is set to
    0 and the 'blast_text' parser is used.
    database is handed to the runner as the 'database' parameter. In this
    mode 'alig_format' is set to 5 and the 'blast' parser is used.
    program is the tool name given to create_runner.
    parameters is an optional dict with extra runner parameters. It is
    copied, so the dict owned by the caller is never modified.
    filters is stored as is in self._filters.

    It raises a ValueError if neither or both of subject and database are
    given.
    '''
    if subject is None and database is None:
        raise ValueError('Either subject or database should be given')
    if subject is not None and database is not None:
        msg = 'subject and database can not be given at the same time'
        raise ValueError(msg)

    # We work on a private copy: the keys added below should not leak into
    # the dict that the caller passed in.
    parameters = {} if parameters is None else dict(parameters)

    self._filters = filters

    if subject is not None:
        parameters['alig_format'] = 0
        self._parser = get_alignment_parser('blast_text')
        self._subject_fhand = _seq_to_fasta_fhand(subject)
        parameters['subject'] = self._subject_fhand.name
    else:
        # the checks above guarantee that database is not None here
        parameters['database'] = database
        parameters['alig_format'] = 5
        self._parser = get_alignment_parser('blast')

    self._program = program
    self._aligner = create_runner(tool=program, parameters=parameters)
def similar_sequences_for_blast(blast_fhand, filters=None):
    '''It looks for similar sequences in a blast result.

    blast_fhand is parsed with the 'blast+' alignment parser.
    filters is the config given to filter_alignments. If it is None a
    default is used: similarity score of at least 90 and at least 100
    residues, with the length taken in the query.

    Only the first alignment that passes the filters is used. It returns a
    list with one dict per match with the keys name, subject_start,
    subject_end, query_start and query_end. The list is empty when no
    alignment passes the filters.
    '''
    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [{'kind': 'score_threshold',
                    'score_key': 'similarity',
                    'min_score': 90,
                   },
                   {'kind': 'min_length',
                    'min_num_residues': 100,
                    'length_in_query': True
                   }
                  ]
    alignments = filter_alignments(blast_result, config=filters)

    # next() works with python 2.6+ and python 3, the .next() method is
    # python 2 only
    try:
        alignment = next(alignments)
    except StopIteration:
        return []

    #to which sequences is our query similar?
    return [{'name': match['subject'].name,
             'subject_start': match['subject_start'],
             'subject_end': match['subject_end'],
             'query_start': match['start'],
             'query_end': match['end']
            }
            for match in alignment['matches']]
def similar_sequences_for_blast(blast_fhand, filters=None):
    """It looks for similar sequences in a blast result.

    blast_fhand is parsed with the "blast+" alignment parser and the result
    is filtered with filter_alignments using filters as its config. When
    filters is None a default config is used: a similarity score of at
    least 90 and a minimum of 100 residues measured in the query.

    Only the first alignment that survives the filtering is looked at.

    It returns a list of dicts, one per match of that alignment, with the
    keys name, subject_start, subject_end, query_start and query_end. An
    empty list is returned when nothing survives the filtering.
    """
    # now we parse the blast
    blast_parser = get_alignment_parser("blast+")
    blast_result = blast_parser(blast_fhand)

    # We filter the results with appropriate filters
    if filters is None:
        filters = [
            {"kind": "score_threshold", "score_key": "similarity", "min_score": 90},
            {"kind": "min_length", "min_num_residues": 100, "length_in_query": True},
        ]
    alignments = filter_alignments(blast_result, config=filters)

    # The next() builtin is used instead of the python 2 only .next()
    # method; the default avoids the try/except for the empty case.
    alignment = next(alignments, None)
    if alignment is None:
        return []

    similar_seqs = []
    for match in alignment["matches"]:
        # to which sequence our query is similar?
        similar_seqs.append(
            {
                "name": match["subject"].name,
                "subject_start": match["subject_start"],
                "subject_end": match["subject_end"],
                "query_start": match["start"],
                "query_end": match["end"],
            }
        )
    return similar_seqs
def create_aligner_filter(aligner_cmd, cmd_parameters, match_filters=None,
                          environment=None):
    '''A function factory that creates aligner filters.

    It returns a function that will accept a sequence and it will return
    True or False depending on the outcome of the aligner.

    aligner_cmd is the name of the tool. It is used to get the alignment
    parser, to create the runner and as the key to get the result out of
    what the runner returns.
    cmd_parameters is the parameters dict given to create_runner.
    match_filters is the config given to filter_alignments.
    environment is given as is to create_runner.
    '''
    parser = get_alignment_parser(aligner_cmd)
    run_align_for_seq = create_runner(tool=aligner_cmd,
                                      environment=environment,
                                      parameters=cmd_parameters)

    def _filter(sequence):
        '''Given a sequence it returns True or False depending on the aligner.

        It returns False for a None sequence and when no alignment passes
        the match filters, True otherwise.
        '''
        if sequence is None:
            return False

        # NOTE(review): source_result is never closed here. If the runner
        # returns a file handle it might be worth closing it, to be
        # confirmed against create_runner.
        source_result = run_align_for_seq(sequence)[aligner_cmd]
        results = parser(source_result)
        filtered_results = filter_alignments(results, config=match_filters)

        #only one sequence -> only one result, so it is enough to check
        #whether the first one exists. next() is used because the .next()
        #method is python 2 only
        try:
            next(filtered_results)
        except StopIteration:
            #there was no result for this sequence
            return False
        return True

    return _filter
def __init__(self, subject, parameters=None, filters=None):
    '''It inits the class.

    subject is converted with _seq_to_fasta_fhand and the name of the
    resulting file is handed to the exonerate runner as the 'target'
    parameter.
    parameters is an optional dict with extra runner parameters. It is
    copied, so the dict owned by the caller is never modified.
    filters is stored as is in self._filters.
    '''
    # We work on a private copy: the 'target' key added below should not
    # leak into the dict that the caller passed in.
    parameters = {} if parameters is None else dict(parameters)

    self._filters = filters
    self._parser = get_alignment_parser('exonerate')

    self._subject_fhand = _seq_to_fasta_fhand(subject)
    parameters['target'] = self._subject_fhand.name

    self._aligner = create_runner(tool='exonerate', parameters=parameters)
def similar_sequences_for_blast(blast_fhand, filters):
    '''It looks for similar sequences in a blast result.

    blast_fhand is parsed with the 'blast+' alignment parser and the result
    is filtered with filter_alignments using filters as its config.

    It is a generator. For every match of every alignment that passes the
    filters it yields a dict with the keys name, subject_description,
    query_name, subject_start, subject_end, query_start, query_end, evalue,
    identity and similarity. The last three are the 'expect', 'identity'
    and 'similarity' scores of the match converted to str, or None when the
    match lacks that score.
    '''
    #now we parse the blast
    blast_parser = get_alignment_parser('blast+')
    blast_result = blast_parser(blast_fhand)

    alignments = filter_alignments(blast_result, config=filters)
    for alignment in alignments:
        query_name = alignment['query'].name
        for match in alignment['matches']:
            #to which sequence our query is similar?
            subject = match['subject']
            scores = match['scores']

            #the scores are optional, a missing one is reported as None
            evalue = str(scores['expect']) if 'expect' in scores else None
            if 'identity' in scores:
                identity = str(scores['identity'])
            else:
                identity = None
            if 'similarity' in scores:
                similarity = str(scores['similarity'])
            else:
                similarity = None

            yield {'name': subject.name,
                   'subject_description': subject.description,
                   'query_name': query_name,
                   'subject_start': match['subject_start'],
                   'subject_end': match['subject_end'],
                   'query_start': match['start'],
                   'query_end': match['end'],
                   'evalue': evalue,
                   'identity': identity,
                   'similarity': similarity
                  }
def create_unique_contiguous_region_filter(distance, genomic_db,
                                           genomic_seqs_fpath):
    '''It returns a filter that removes snv in a region that give more than one
    match or more than one match_parts.

    distance is the number of residues taken at each side of the snv to
    build the fragment that is blasted. It is also the threshold under
    which the filter result is stored.
    genomic_db is the blast database used by the blastn runner.
    genomic_seqs_fpath is the path to the genomic sequences file. It is
    indexed with SeqIO.index.

    It raises a ValueError if genomic_seqs_fpath or genomic_db are not
    given.
    '''
    # The arguments are checked before anything is built with them
    if not genomic_seqs_fpath:
        msg = 'No genomic sequence file defined for unique SNV filter'
        raise ValueError(msg)
    if not genomic_db:
        msg = 'No genomic blast database defined for unique SNV filter'
        raise ValueError(msg)

    parameters = {'database': genomic_db}
    blast_runner = create_runner(tool='blastn', parameters=parameters)
    blast_parser = get_alignment_parser('blast')
    match_filters = [{'kind': 'score_threshold',
                      'score_key': 'similarity',
                      'min_score': 90,
                     },
                     {'kind': 'min_length',
                      'min_num_residues': 20,
                      'length_in_query': True
                     }
                    ]

    # The file is opened only to guess its format, so we close it as soon
    # as the format is known. SeqIO.index gets the path, not the handle.
    with open(genomic_seqs_fpath) as genomic_seqs_fhand:
        genomic_seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_fpath, genomic_seqs_format)

    def unique_contiguous_region_filter(sequence):
        '''It filters out the snv in regions repeated in the genome or
        discontiguous.

        For every snv feature without a previous 'uniq_contiguous' result
        it stores True when the region around the snv has more than one
        blast hit or when introns are inferred for its only hit, and False
        otherwise. It returns the given sequence, or None for a None
        sequence.
        '''
        if sequence is None:
            return None

        for snv in sequence.get_features(kind='snv'):
            # Check if it is already done
            previous_result = _get_filter_result(snv, 'uniq_contiguous',
                                                 threshold=distance)
            if previous_result is not None:
                continue

            #we make a blast with the sequence around the snv
            location = snv.location.start.position
            start = max(location - distance, 0)
            end = location + distance
            seq_fragment = sequence[start:end]

            blast_fhand = blast_runner(seq_fragment)['blastn']
            # the blast result is closed even if something fails below
            try:
                #now we parse the blast
                blast_result = blast_parser(blast_fhand)
                alignments = filter_alignments(blast_result,
                                               config=match_filters)

                #are there any similar sequences?
                alignment = next(alignments, None)
                if alignment is None:
                    #if there is no similar sequence we assume that is
                    #unique
                    result = False
                elif len(alignment['matches']) > 1:
                    #how many matches, it should be only one
                    result = True
                else:
                    #how many match parts have the first match?
                    #we could do it with the blast result, but blast is not
                    #very good aligning, so we realign with est2genome
                    blast_fhand.seek(0)
                    sim_seqs = similar_sequences_for_blast(blast_fhand)
                    sim_seq = sim_seqs[0] if sim_seqs else None

                    introns = infer_introns_for_cdna(sequence=seq_fragment,
                                        genomic_seqs_index=genomic_seqs_index,
                                        similar_sequence=sim_seq,
                                        genomic_db=genomic_db)
                    result = bool(introns)
            finally:
                blast_fhand.close()

            _add_filter_result(snv, 'uniq_contiguous', result, distance)
        return sequence

    return unique_contiguous_region_filter