def get_orthologs(blast1_fhand, blast2_fhand, sub1_def_as_acc=None,
                  sub2_def_as_acc=None):
    """Yield the hit pairs that are reciprocal between two blast results.

    Both blasts are expected to be the xml blast outputs of the two pools.
    Every hit pair of the first blast is collected into a set. Each pair
    of the second blast is then swapped, ``(pair[1], pair[0])``, and the
    swapped pair is yielded when it is also present in that set.

    sub1_def_as_acc and sub2_def_as_acc are forwarded untouched to
    get_hit_pairs_from_blast for the first and second blast respectively.
    """
    # The first blast has to be read completely before the second one is
    # opened, because membership is tested against all of its pairs.
    forward_pairs = set(
        get_hit_pairs_from_blast(get_fhand(blast1_fhand),
                                 sub_def_as_acc=sub1_def_as_acc))

    backward_pairs = get_hit_pairs_from_blast(get_fhand(blast2_fhand),
                                              sub_def_as_acc=sub2_def_as_acc)
    for pair in backward_pairs:
        # Swap the members so the pair is comparable with the first blast.
        swapped = (pair[1], pair[0])
        if swapped in forward_pairs:
            yield swapped
def create_cdna_intron_annotator(genomic_db, genomic_seqs_fhand):
    """Build a function that annotates introns on cdna sequences.

    genomic_db is handed to infer_introns_for_cdna and is also stored in
    the qualifiers of every intron feature created. genomic_seqs_fhand is
    the genomic sequence file; it is indexed once here with SeqIO.index
    (by file name, using the guessed file format) and the index is shared
    by every call of the returned function.
    """
    genomic_seqs_fhand = get_fhand(genomic_seqs_fhand)
    genomic_seqs_path = genomic_seqs_fhand.name
    genomic_seqs_format = guess_seq_file_format(genomic_seqs_fhand)
    genomic_seqs_index = SeqIO.index(genomic_seqs_path, genomic_seqs_format)

    def annotate_intron(sequence):
        """Append one 'intron' SeqFeature per inferred intron position.

        Returns None for a None sequence, otherwise the same sequence
        object with the new features appended. A KeyError raised while
        inferring the introns is re-raised as a RuntimeError.
        """
        if sequence is None:
            return None

        try:
            intron_positions = infer_introns_for_cdna(
                sequence=sequence,
                genomic_db=genomic_db,
                genomic_seqs_index=genomic_seqs_index)
        except KeyError as error:
            # str() of a KeyError is the repr of its key: drop the unicode
            # prefix and the surrounding quotes to get a readable message.
            message = str(error).lstrip('u').strip("'")
            if 'not found' in message:
                message += ' in seq file %s, but present in blast db %s' % \
                                         (genomic_seqs_fhand.name, genomic_db)
            raise RuntimeError(message)

        for position in intron_positions:
            # An intron is stored as a zero-length location at its position.
            location = FeatureLocation(position, position)
            intron = SeqFeature(location=location, type='intron',
                                qualifiers={'genomic_db': genomic_db})
            sequence.features.append(intron)
        return sequence

    return annotate_intron
def create_vector_striper(vectors, vectors_are_blastdb=False):
    """Return a function capable of detecting vector sequences.

    vectors can be an fhand to a fasta file or, when vectors_are_blastdb
    is true, a blast database. Detection is delegated to
    _create_vector_striper with the 'blastn' aligner, long-sequence
    settings and no elongation of the matches.
    """
    if not vectors_are_blastdb:
        vectors_fhand = get_fhand(vectors)
        # NOTE(review): MAX_ADAPTOR_LENGTH is passed in the position where
        # create_adaptor_striper passes MIN_ADAPTOR_LENGTH; presumably it
        # acts as the minimum length accepted for a vector -- confirm
        # against check_sequences_length.
        check_sequences_length(vectors_fhand, MAX_ADAPTOR_LENGTH)

    striper_options = {
        'aligner': 'blastn',
        'vectors_are_blastdb': vectors_are_blastdb,
        'seqs_are_short': False,
        'elongate_match_to_complete_adaptor': False,
    }
    return _create_vector_striper(vectors, **striper_options)
def create_adaptor_striper(adaptors, elongate_match_to_complete_adaptor=True):
    """Return a function capable of detecting adaptor sequences.

    adaptors should be an fhand to a fasta file holding the adaptors.
    Their lengths are checked against MIN_ADAPTOR_LENGTH and
    MAX_ADAPTOR_LENGTH before the striper is built. Detection is delegated
    to _create_vector_striper with the 'blast_short' aligner and
    short-sequence settings.
    """
    adaptors_fhand = get_fhand(adaptors)
    check_sequences_length(adaptors_fhand, MIN_ADAPTOR_LENGTH,
                           MAX_ADAPTOR_LENGTH)

    striper_options = {
        'vectors': adaptors,
        'aligner': 'blast_short',
        'vectors_are_blastdb': False,
        'seqs_are_short': True,
        'elongate_match_to_complete_adaptor':
                                        elongate_match_to_complete_adaptor,
    }
    return _create_vector_striper(**striper_options)
def create_snv_annotator(bam_fhand, min_quality=45, default_sanger_quality=25,
                         min_mapq=15, min_num_alleles=1, max_maf=None,
                         read_edge_conf=None, default_bam_platform=None,
                         min_num_reads_for_allele=None, ploidy=2):
    """Build an annotator that adds the snvs found in a bam to a sequence.

    create_bam_index is called on the bam path and the bam is then opened
    once with pysam; that handle is shared by every call of the returned
    function. read_edge_conf is normalized before use. When
    min_num_reads_for_allele or ploidy is None the module defaults are
    used. The remaining arguments are forwarded to _snvs_in_bam.
    """
    bam_fhand = get_fhand(bam_fhand)
    # The bam has to be indexed before it is opened.
    create_bam_index(bam_fpath=bam_fhand.name)
    read_edge_conf = _normalize_read_edge_conf(read_edge_conf)
    bam = pysam.Samfile(bam_fhand.name, 'rb')

    # Fall back to the module defaults for the values left unset.
    if min_num_reads_for_allele is None:
        min_num_reads_for_allele = DEFAUL_MIN_NUM_READS_PER_ALLELE
    if ploidy is None:
        ploidy = DEFAULT_PLOIDY

    # These settings do not change between sequences: collect them once.
    snv_settings = {
        'min_quality': min_quality,
        'default_sanger_quality': default_sanger_quality,
        'min_mapq': min_mapq,
        'min_num_alleles': min_num_alleles,
        'max_maf': max_maf,
        'read_edge_conf': read_edge_conf,
        'default_bam_platform': default_bam_platform,
        'min_num_reads_for_allele': min_num_reads_for_allele,
    }

    def annotate_snps(sequence):
        """Append an 'snv' SeqFeature per snv found and return the sequence."""
        for raw_snv in _snvs_in_bam(bam, reference=sequence, **snv_settings):
            summary = _summarize_snv(raw_snv)
            position = summary['ref_position']
            qualifiers = {
                'alleles': summary['alleles'],
                'reference_allele': summary['reference_allele'],
                'read_groups': summary['read_groups'],
                'mapping_quality': summary['mapping_quality'],
                'quality': summary['quality'],
            }
            # The snv is stored as a zero-length location at its position.
            snv_feat = SeqFeature(location=FeatureLocation(position, position),
                                  type='snv', qualifiers=qualifiers)
            annotate_pic(snv_feat)
            annotate_heterozygosity(snv_feat, ploidy=ploidy)
            sequence.features.append(snv_feat)
        return sequence

    return annotate_snps
def num_seqs_in_file(seq_fhand, format=None):
    """Count the sequences stored in a sequence file.

    seq_fhand is passed through get_fhand before use. When format is None
    it is guessed from the file with guess_seq_file_format.

    Counting strategy by format:
      - 'fasta': lines matching '^>'.
      - 'repr': lines starting with the SeqWithQuality class name.
      - any format containing 'fastq': delegated to _num_seqs_in_fastq.

    Raises NotImplementedError for any other format.
    """
    seq_fhand = get_fhand(seq_fhand)
    if format is None:
        format = guess_seq_file_format(seq_fhand)

    if format == 'fasta':
        return count_str_in_file(seq_fhand, '^>')
    if format == 'repr':
        # Use the name of the class itself. SeqWithQuality.__class__ is its
        # metaclass, so the previous lookup produced the metaclass name
        # ("type" for a regular class) and the pattern never matched the
        # class name.
        class_name = SeqWithQuality.__name__
        return count_str_in_file(seq_fhand, '^%s' % class_name)
    if 'fastq' in format:
        return _num_seqs_in_fastq(seq_fhand)
    raise NotImplementedError('I can not count this format: %s' % format)
def _get_descriptions_from_blasts(blasts):
    """Collect one description per query from a list of blast outputs.

    Every item of blasts is indexed with the key 'blast' (the blast
    output) and may hold an optional 'modifier': a function applied to the
    description found in the blast, useful to remove trash from it.

    The blasts are processed in the given order and a query keeps the
    first description found for it, so earlier blasts take precedence.
    Only the first match of each filtered result is looked at, and the
    description "<unknown description>" is skipped.

    Returns a dict keyed by query name whose values are dicts with the
    keys 'description', 'db_name' and 'subj_name'. An ExpatError raised
    while reading a blast is re-raised with the blast file name appended.
    """
    seq_annot = {}
    filters = [{'kind': 'best_scores',
                'score_key': 'expect',
                'max_score': 1e-20,
                'score_tolerance': 10}]

    for blast_item in blasts:
        blast_source = blast_item['blast']
        modifier = blast_item['modifier'] if 'modifier' in blast_item else None

        blast_fhand = get_fhand(blast_source)
        parser = BlastParser(fhand=blast_fhand)
        filtered_results = filter_alignments(parser, config=filters)
        db_name = parser.db_name

        try:
            for match in filtered_results:
                query_seq = match['query']
                try:
                    query = query_seq.id
                except AttributeError:
                    query = query_seq.name

                # A description found in a previous blast wins.
                if query in seq_annot:
                    continue

                subject = match['matches'][0]['subject']
                description = subject.description
                subject_name = subject.name
                if modifier is not None:
                    description = modifier(description)
                if description == "<unknown description>":
                    continue

                seq_annot[query] = {'description': description.strip(),
                                    'db_name': db_name,
                                    'subj_name': subject_name}
        except ExpatError as error:
            # Tell the user which blast file could not be parsed.
            raise ExpatError(str(error) + ':%s' % blast_fhand.name)

    return seq_annot
def _create_vector_striper(vectors, aligner, vectors_are_blastdb=False,
                           seqs_are_short=False,
                           elongate_match_to_complete_adaptor=False):
    """Build a function that looks for vectors in a sequence with blast.

    vectors is passed through get_fhand. When aligner is 'blast_short' or
    'blastn' a BlastAligner is created, using vectors as the blast
    database if vectors_are_blastdb is true and as the blast subject
    otherwise. seqs_are_short selects the short or the long set of blast
    parameters and filters. The filters apply to whole matches, not to
    match parts.

    NOTE(review): another definition of _create_vector_striper appears
    later in this file and rebinds the name -- confirm which one is meant
    to be used.
    """
    # exonerate fails with sequences below 20 bp.
    # blast_short starts to fail below 15 bases with 2% errors (although
    # not as badly as exonerate with 19).
    vectors = get_fhand(vectors)

    # The long blast parameter values are taken from the vecscreen ones:
    # http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    long_parameters = {
        'gapextend': '3',
        'gapopen': '3',
        'penalty': '-5',
        'expect': '700',
        'dust': '20 1 64',
        'searchsp': '1750000000000',
    }
    short_parameters = {
        'task': 'blastn-short',
        'expect': '0.0001',
        'subject': vectors,
        'alig_format': 6,
    }
    long_filters = [
        {'kind': 'score_threshold', 'score_key': 'identity', 'min_score': 96},
        {'kind': 'min_length', 'min_num_residues': MIN_ADAPTOR_LENGTH,
         'length_in_query': False},
    ]
    short_filters = [
        {'kind': 'score_threshold', 'score_key': 'identity', 'min_score': 89},
        {'kind': 'min_length', 'min_num_residues': 13,
         'length_in_query': False},
    ]

    if vectors is None:
        aligner = None
    elif aligner in ('blast_short', 'blastn'):
        if seqs_are_short:
            blast_parameters, blast_filters = short_parameters, short_filters
        else:
            blast_parameters, blast_filters = long_parameters, long_filters
        # The vectors are either a ready made database or a plain subject.
        vectors_kwarg = 'database' if vectors_are_blastdb else 'subject'
        aligner = BlastAligner(parameters=blast_parameters,
                               filters=blast_filters,
                               **{vectors_kwarg: vectors})

    def strip_vector_by_alignment(sequence):
        """Align the sequence against the vectors and add trim segments.

        Returns None for a None sequence and the untouched sequence when
        there are no vectors. Otherwise the locations given by
        _get_non_matched_locations are handed to _add_trim_segments and
        the same sequence object is returned.
        """
        if sequence is None or vectors is None:
            return sequence

        alignments = list(aligner.do_alignment(sequence))
        if elongate_match_to_complete_adaptor:
            _elongate_matches_to_complete_subject(alignments)
        _add_trim_segments(_get_non_matched_locations(alignments), sequence)
        return sequence

    return strip_vector_by_alignment
def _create_vector_striper(vectors, aligner, vectors_are_blastdb=False,
                           seqs_are_short=False,
                           elongate_match_to_complete_adaptor=False):
    """Build a function that looks for vectors in a sequence with blast.

    vectors is passed through get_fhand. When aligner is 'blast_short' or
    'blastn' a BlastAligner is created, using vectors as the blast
    database if vectors_are_blastdb is true and as the blast subject
    otherwise. seqs_are_short selects the short or the long set of blast
    parameters and filters. The filters apply to whole matches, not to
    match parts.
    """
    # exonerate fails with sequences below 20 bp.
    # blast_short starts to fail below 15 bases with 2% errors (although
    # not as badly as exonerate with 19).
    vectors = get_fhand(vectors)

    # The long blast parameter values are taken from the vecscreen ones:
    # http://www.ncbi.nlm.nih.gov/VecScreen/VecScreen_docs.html
    long_parameters = {
        'gapextend': '3',
        'gapopen': '3',
        'penalty': '-5',
        'expect': '700',
        'dust': '20 1 64',
    }
    short_parameters = {
        'task': 'blastn-short',
        'expect': '0.0001',
        'subject': vectors,
        'alig_format': 6,
    }
    long_filters = [
        {'kind': 'score_threshold', 'score_key': 'identity', 'min_score': 96},
        {'kind': 'min_length', 'min_num_residues': MIN_ADAPTOR_LENGTH,
         'length_in_query': False},
    ]
    short_filters = [
        {'kind': 'score_threshold', 'score_key': 'identity', 'min_score': 89},
        {'kind': 'min_length', 'min_num_residues': 13,
         'length_in_query': False},
    ]

    if vectors is None:
        aligner = None
    elif aligner in ('blast_short', 'blastn'):
        if seqs_are_short:
            blast_parameters, blast_filters = short_parameters, short_filters
        else:
            blast_parameters, blast_filters = long_parameters, long_filters
        # The vectors are either a ready made database or a plain subject.
        vectors_kwarg = 'database' if vectors_are_blastdb else 'subject'
        aligner = BlastAligner(parameters=blast_parameters,
                               filters=blast_filters,
                               **{vectors_kwarg: vectors})

    def strip_vector_by_alignment(sequence):
        """Add the trim segments around the longest region without vector.

        Returns None for a None sequence and the untouched sequence when
        there are no vectors. It also returns None when
        _get_longest_non_matched_seq_region_limits finds no region.
        Otherwise the segments outside the longest non matched region are
        handed to _add_trim_segments and the same sequence object is
        returned.
        """
        if sequence is None or vectors is None:
            return sequence

        alignments = list(aligner.do_alignment(sequence))
        if elongate_match_to_complete_adaptor:
            _elongate_matches_to_complete_subject(alignments)

        non_matched = _get_non_matched_locations(alignments)
        longest_region = _get_longest_non_matched_seq_region_limits(
            sequence, non_matched)
        if longest_region is None:
            return None

        # Everything outside the region that is kept has to be trimmed.
        trim_segments = _get_non_matched_from_matched_locations(
            [longest_region], len(sequence))
        _add_trim_segments(trim_segments, sequence)
        return sequence

    return strip_vector_by_alignment