def test_remove_site_with_partial_feature_overlap__downstream(self):
    """Check removal of a site whose downstream part lies inside a feature.

    The restriction site starts two bases before the CDS and runs into it.
    After the fix the site must be gone while the sequence extracted for
    the CDS is exactly what it was before.
    """
    enzyme = Restriction.BsmBI
    site = enzyme.site
    upstream = 'CCCCCCCCCCCCCCCCCCCCCCCC'
    downstream = 'AAAAAAAAAAAAAAAA'
    full_seq = Seq(upstream + site + downstream, generic_dna)
    seq_record = SeqRecord(full_seq)

    # The CDS begins two bases into the site and runs to the record end.
    cds = SeqFeature(
        FeatureLocation(len(upstream) + 2, len(full_seq), strand=1),
        type='CDS', id='1')
    seq_record.features.append(cds)
    cds_seq_before = cds.extract(str(seq_record.seq))

    sites_before = find_restriction_site_occurrences(seq_record, enzyme)
    self.assertEqual(1, len(sites_before))

    seq_record = _remove_site_with_partial_feature_overlap(
        seq_record, sites_before[0], cds)
    cds_seq_after = cds.extract(str(seq_record.seq))

    sites_after = find_restriction_site_occurrences(seq_record, enzyme)
    self.assertEqual(0, len(sites_after))
    self.assertEqual(cds_seq_before, cds_seq_after)
def test_deletion__overlapping_features(self):
    """Delete the bases shared by two overlapping CDS features.

    After the deletion, the record sequence and the sequence extracted for
    each feature must no longer contain the shared bases.
    """
    # Example based on intersection of nei and arbB genes in MG1655.
    left = 'GCCCTGGCTGCCAGCA'
    shared = 'CTAG'
    right = 'GCCGACCGCTTCGG'
    full = left + shared + right
    seq_record = SeqRecord(Seq(full, generic_dna))

    # First CDS: start of the record through the shared bases.
    first_cds = SeqFeature(
        FeatureLocation(0, len(left) + len(shared), strand=1),
        type='CDS', id='1')
    seq_record.features.append(first_cds)

    # Second CDS: the shared bases through the end of the record.
    second_cds = SeqFeature(
        FeatureLocation(len(left), len(full), strand=1),
        type='CDS', id='2')
    seq_record.features.append(second_cds)

    maker = VCFToGenbankMaker(seq_record, None, None)
    maker._update_genome_record_for_variant(len(left), shared, '')

    # The sequence itself loses the shared bases.
    self.assertEqual(left + right, str(seq_record.seq))

    # Each feature now covers only its own, non-shared part.
    self.assertEqual(left, str(first_cds.extract(seq_record.seq)))
    self.assertEqual(right, str(second_cds.extract(seq_record.seq)))
def test_update_genome_record_for_variant__overlapping_features(self):
    """Tests handling a record that lands in a region of overlapping
    features.

    The bases shared by two CDS features are swapped for a same-length
    replacement; the record sequence and both feature extractions must
    reflect the replacement.
    """
    # Example based on intersection of nei and arbB genes in MG1655.
    before_overlap = 'GCCCTGGCTGCCAGCA'
    overlap = 'CTAG'
    after_overlap = 'GCCGACCGCTTCGG'
    raw_seq_str = before_overlap + overlap + after_overlap
    seq_record = SeqRecord(Seq(raw_seq_str, generic_dna))

    feature_1_loc = FeatureLocation(
        0, len(before_overlap) + len(overlap), strand=1)
    seq_record.features.append(
        SeqFeature(feature_1_loc, type='CDS', id='1'))

    feature_2_loc = FeatureLocation(
        len(before_overlap), len(raw_seq_str), strand=1)
    seq_record.features.append(
        SeqFeature(feature_2_loc, type='CDS', id='2'))

    maker = VCFToGenbankMaker(seq_record, None, None)
    overlap_replacement = 'TTAA'
    maker._update_genome_record_for_variant(
        len(before_overlap), overlap, overlap_replacement)

    # Features changed, so requery them.
    feature_1 = None
    feature_2 = None
    for feature in seq_record.features:
        if feature.id == '1':
            feature_1 = feature
        elif feature.id == '2':
            feature_2 = feature
    # Use unittest assertions rather than bare ``assert`` so the checks
    # are not stripped when Python runs with -O.
    self.assertIsNotNone(feature_1)
    self.assertIsNotNone(feature_2)

    # Assert the sequence is correct.
    EXPECTED_SEQ = before_overlap + overlap_replacement + after_overlap
    self.assertEqual(EXPECTED_SEQ, str(seq_record.seq))

    # Feature added to represent swap.
    # self.assertEqual(3, len(seq_record.features))

    # Assert the feature annotations are still correct.
    EXPECTED_FEATURE_1_SEQ = before_overlap + overlap_replacement
    self.assertEqual(EXPECTED_FEATURE_1_SEQ,
                     str(feature_1.extract(seq_record.seq)))

    EXPECTED_FEATURE_2_SEQ = overlap_replacement + after_overlap
    self.assertEqual(EXPECTED_FEATURE_2_SEQ,
                     str(feature_2.extract(seq_record.seq)))
def test_remove_site_in_coding_feature(self):
    """Tests removing a restriction enzyme that falls in a coding region.

    The site must disappear while the translation of the CDS stays the
    same.
    """
    RESTRICTION_ENZYME = Restriction.BsmBI
    BEFORE = 'ATGTTTGGGCCCAAATTTGGGAAATTTGGGAAATTTGGGAAATTTGGGAAATTTGGG'
    SITE_SEQ = RESTRICTION_ENZYME.site
    AFTER = 'TAGAAAAAAAAAAAAAAAA'
    SEQ = Seq(BEFORE + SITE_SEQ + AFTER, generic_dna)
    seq_record = SeqRecord(SEQ)
    refactor_context = RefactorContext(seq_record)

    # The CDS runs from the start through the first three bases of AFTER.
    feature_1_loc = FeatureLocation(
        0, len(BEFORE) + len(SITE_SEQ) + 3, strand=1)
    feature_1 = SeqFeature(feature_1_loc, type='CDS', id='1')
    seq_record.features.append(feature_1)
    FEATURE_1_SEQ_ORIG = feature_1.extract(str(seq_record.seq))
    # Floor division: the count is used as a list repetition factor below,
    # which must be an int (true division yields a float on Python 3).
    FEATURE_1_NUM_CODONS = len(feature_1) // 3

    # Compute fake feature profile.
    fake_profile_values_map = {}
    fake_profile_values_map[feature_1.id] = {
        GCContentFeatureProfile.get_name(): [0.2] * FEATURE_1_NUM_CODONS,
        SecondaryStructureFeatureProfile.get_name():
            [-10] * FEATURE_1_NUM_CODONS,
        CodonRarityFeatureProfile.get_name(): [0.5] * FEATURE_1_NUM_CODONS,
    }
    refactor_context.set_feature_id_to_profile_values_map(
        fake_profile_values_map)

    occurrences = find_restriction_site_occurrences(
        seq_record, RESTRICTION_ENZYME)
    self.assertEqual(1, len(occurrences))

    result = _remove_site_in_coding_feature(
        refactor_context, seq_record, occurrences[0], feature_1)
    self.assertTrue(result['is_success'])
    seq_record = result['updated_genome_record']
    FEATURE_1_SEQ_UPDATED = feature_1.extract(str(seq_record.seq))

    occurrences = find_restriction_site_occurrences(
        seq_record, RESTRICTION_ENZYME)
    self.assertEqual(0, len(occurrences))
    self.assertEqual(translate_custom(FEATURE_1_SEQ_ORIG),
                     translate_custom(FEATURE_1_SEQ_UPDATED))
def _findGeneInSeq(self, record):
    """Extract gene sequence from larger sequence (e.g. genomes) by
    searching features.

    The ``gene``, ``gene_synonym`` and ``product`` qualifiers of each
    feature are compared case-insensitively with ``self.gene_names``.
    The first feature sharing a name is extracted from ``record``.

    Returns:
        The extracted sub-record for the first matching feature, or
        ``record`` unchanged when it has no features, nothing matches, or
        extraction of the first match raises ValueError.
    """
    if not record.features:
        # if there aren't any features, just return the record
        return record
    # Normalise the wanted names once instead of once per feature.
    wanted_names = {name.lower() for name in self.gene_names}
    for feature in record.features:
        feature_names = set()
        for key in ('gene', 'gene_synonym', 'product'):
            if key in feature.qualifiers:
                feature_names.update(
                    value.lower() for value in feature.qualifiers[key])
        if not wanted_names & feature_names:
            continue
        try:
            extractor = SeqFeature(feature.location)
            found_seq = extractor.extract(record)
        except ValueError:
            # catch value errors raised for sequences
            # with "fuzzy" positions
            # TODO: what are fuzzy positions and can I use
            # them?
            return record
        else:
            return found_seq
    return record
def test__init(self):
    "Test FeatureMatch object creation"
    # Forward strand: the match prepared by the fixture.
    forward = self.match
    self.assertEqual(forward.direction, "forward")
    self.assertEqual(str(forward.dna), "ATGTACTCCACTATCTGCTGA")
    self.assertEqual(str(forward.long_dna), str(self.feature_seq))
    self.assertEqual(str(forward.promotor_region), "AAA")
    self.assertEqual(str(forward.terminator_region), "TTT")
    self.assertEqual(str(forward.mrna), str(self.feature_seq.transcribe()))
    forward_protein = self.feature.extract(self.seq).translate(to_stop=True)
    self.assertEqual(str(forward.aas), str(forward_protein))

    # Reverse strand: build the same match on the reverse complement.
    inv_seq = self.seq.reverse_complement()
    trimmed = inv_seq[3:-3]
    inv_feature = SeqFeature(
        FeatureLocation(6, 27), type="gene", strand=-1)
    reverse = FeatureMatch(inv_feature, trimmed, -1, 3)
    self.assertEqual(reverse.direction, "reverse")
    self.assertEqual(str(reverse.dna),
                     str(trimmed[3:-3].reverse_complement()))
    self.assertEqual(str(reverse.long_dna), str(self.feature_seq))
    self.assertEqual(str(reverse.promotor_region), "AAA")
    self.assertEqual(str(reverse.terminator_region), "TTT")
    self.assertEqual(str(reverse.mrna),
                     str(trimmed.transcribe().reverse_complement()))
    reverse_protein = inv_feature.extract(inv_seq).translate(to_stop=True)
    self.assertEqual(str(reverse.aas), str(reverse_protein))
def get_longest(seq_record, gene2isoforms):
    """Process the longest isoform of every gene.

    For each gene, the isoform whose parts have the largest summed length
    is selected. Depending on the module-level ``args.format`` it is
    either handed to ``compound_to_bed`` ('bed') or extracted from
    ``seq_record`` and collected ('fasta').

    Args:
        seq_record: record the isoform locations refer to.
        gene2isoforms: mapping of gene name to a list of isoforms, each
            isoform being a sequence of location objects.

    Returns:
        list: extracted records named after their gene; empty unless
        ``args.format`` is 'fasta'.
    """
    extracted = []
    chrom = adjust_name(seq_record.name)
    # ``items()`` works on both Python 2 and Python 3 dictionaries.
    for gene, isoforms in gene2isoforms.items():
        longest = max(
            isoforms, key=lambda parts: sum(len(part) for part in parts))
        if args.format == 'bed':
            compound_to_bed(longest, chrom, gene)
        elif args.format == 'fasta':
            if len(longest) > 1:
                location = CompoundLocation(longest, operator="join")
            else:
                location = longest[0]
            feature = SeqFeature(location=location, type='utr',
                                 strand=longest[0].strand)
            record = feature.extract(seq_record)
            record.name = gene
            record.id = gene
            record.description = gene
            extracted.append(record)
    return extracted
def set_gap_features(self, len_cutoff=10):
    """Annotate runs of N as ``assembly_gap`` features on every record.

    Every run of at least ``len_cutoff`` consecutive N (case-insensitive)
    in each record of ``self.seq_dict`` gets a forward-strand
    ``assembly_gap`` feature appended to that record. The total number of
    gaps is stored in ``self.report["num_assembly_gap"]``.

    Args:
        len_cutoff (int): minimum run length that counts as a gap.
    """
    # Compile once; the pattern does not depend on the record. Iterating
    # the matches (instead of splitting and testing startswith("N"))
    # guarantees that only runs reaching the cutoff are annotated, even
    # when a sequence begins with a shorter run of N.
    gap_pattern = re.compile("(" + "N" * len_cutoff + "+)")
    num_assembly_gap = 0
    for record in self.seq_dict.values():
        seq = str(record.seq).upper()
        for gap in gap_pattern.finditer(seq):
            fragment = gap.group()
            qualifiers = {
                "estimated_length": [len(fragment)],
                "gap_type": ["within scaffold"],
                "linkage_evidence": [self.linkage_evidence]
            }
            location = FeatureLocation(gap.start(), gap.end(), strand=1)
            feature = SeqFeature(location, id=uuid4(), type="assembly_gap",
                                 qualifiers=qualifiers)
            # Sanity check: the feature must cover exactly the run of N.
            assert str(feature.extract(record).seq).upper() == fragment
            record.features.append(feature)
            num_assembly_gap += 1
    self.report["num_assembly_gap"] = num_assembly_gap
def generate_genes(genbank):
    """
    Yield one row dictionary per usable feature of a genbank object.

    Features of type 'source' or 'gene' are skipped, as are features
    without a ``locus_tag`` qualifier. Each row holds the location bounds,
    strand, ref and ref_db of the feature, every qualifier joined into a
    single string, ``_key`` (the locus tag) and ``dna_sequence``.
    """
    skipped_types = ('source', 'gene')
    for feature in genbank.features:
        if feature.type in skipped_types:
            continue
        location = feature.location
        row = {
            'location_start': location.start,
            'location_end': location.end,
            'strand': feature.strand,
            'ref': feature.ref,
            'ref_db': feature.ref_db
        }
        # Qualifier values are lists; join them so that multi-item lists
        # are kept in full.
        for name, values in feature.qualifiers.items():
            row[name] = ', '.join(values)
        locus_tag = row.get('locus_tag')
        if not locus_tag:
            # Only features with a locus tag can be used.
            continue
        row['_key'] = row['locus_tag']
        # Generate the DNA sequence using biopython
        # https://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html#extract
        extractor = SeqFeature(feature.location, feature.type)
        row['dna_sequence'] = str(extractor.extract(genbank.seq))
        yield row
def get_gene_and_201bp_upstream(genefeature, genomeseq):
    """Extract a gene together with the 201 bases upstream of it.

    For a forward-strand gene the window is extended before its start; for
    a reverse-strand gene it is extended past its end. The window is
    clamped to the bounds of ``genomeseq`` so that a gene close to either
    end does not produce a negative or out-of-range location.

    Args:
        genefeature: feature whose ``location`` gives start, end and
            strand.
        genomeseq: sequence (or record) the feature refers to.

    Returns:
        The extracted sequence, as produced by ``SeqFeature.extract``.

    Raises:
        ValueError: if the feature strand is neither 1 nor -1.
    """
    upstream_length = 201
    location = genefeature.location
    strand = location.strand
    if strand == 1:
        start = max(0, location.start - upstream_length)
        end = location.end
    elif strand == -1:
        start = location.start
        end = min(len(genomeseq), location.end + upstream_length)
    else:
        raise ValueError(
            "Cannot determine upstream region: strand is %r" % (strand,))
    # The strand is carried by the location itself.
    window = SeqFeature(FeatureLocation(start, end, strand=strand))
    return window.extract(genomeseq)
def find_cds():
    """Locate, translate and return the CDS of the current record.

    Reads the module-level ``record_dict[keys]`` record, splits its
    description on ``|`` and uses the first field starting with ``CDS``,
    which is parsed as ``<name>:<start>-<end>``. The translation of the
    extracted CDS is printed.

    NOTE(review): the coordinates are taken to be 1-based and inclusive,
    so the half-open location is ``(start - 1, end)``. The previous code
    used ``end - 1``, which dropped the last base of the CDS - confirm
    against the source of the headers.

    Returns:
        tuple: ``(cds_start, cds_end, cds_sequence)`` with the two
        coordinates as the strings found in the description, or ``None``
        when the description has no CDS field.
    """
    record = record_dict[keys]
    for field in str(record.description).split("|"):
        if not re.match("CDS", field):
            continue
        feature, cds_start, cds_end = re.split(":|-", field)
        cds_feature = SeqFeature(
            FeatureLocation(int(cds_start) - 1, int(cds_end)),
            type=str(feature))
        cds_sequence = cds_feature.extract(record.seq)
        print(cds_sequence.translate())
        return cds_start, cds_end, cds_sequence
    return None
def create_fragmented_pseudo(args, fragments, seqrecord):
    """Takes a list of features and concatenates them into a single
    pseudogene feature.

    The new feature spans from the smallest start to the largest end of
    the fragments and is appended to ``seqrecord.features``. Its strand is
    the common strand of the fragments, or 0 when no fragment has a strand
    or the strands disagree (a warning is printed in the latter case).
    """
    start = min(feature.location.start for feature in fragments)
    end = max(feature.location.end for feature in fragments)

    strands = [feature.strand for feature in fragments
               if feature.strand is not None and feature.strand != 0]
    if not strands:
        # Occurs if two intergenic regions are being merged
        strand = 0
    elif all(one_strand == strands[0] for one_strand in strands):
        strand = strands[0]
    else:
        # TODO: This occurs if features from + and - strand are going to be combined. Very rarely does this happen
        # TODO: but there could be biologically relevant reasons for this to occur ie two fragments on + strand
        # TODO: separated by an ORF on the - strand which is an insertion sequence.
        # TODO: We should explore options to handle these cases in the future.
        tags = [feature.qualifiers['locus_tag'][0] for feature in fragments]
        parent_tags = [feature.qualifiers.get('parents')
                       for feature in fragments]
        # TODO: understand why 'tags + parent_tags' results in a nested list and fix the source of the issue rather than forcibly flattening.
        # Order of operations: flatten -> remove duplicates -> remove 'None' -> convert to list
        unique_tags = set(flatten(tags + parent_tags))
        all_tags = list(filter(None, unique_tags))
        common.print_with_time("WARNING: Pseudogene detected which traverses features on (+) and (-) strands.\n"
                               "We recommend manual inspection of this region.\n"
                               "Features involved: %s" % all_tags)
        strand = 0
        # raise RuntimeError("Trying to combine genes on opposite strands.")

    hits = [hit for feature in fragments
            for hit in feature.qualifiers['hits']]

    # Parents already recorded on the fragments come first, followed by
    # the locus tag of every fragment; duplicates are removed afterwards.
    parents = []
    for fragment in fragments:
        inherited = fragment.qualifiers.get('parents')
        if inherited is not None:
            parents.extend(inherited)
    for feature in fragments:
        parents.append(feature.qualifiers['locus_tag'][0])
    parents = list(set(flatten(parents)))

    pseudo = SeqFeature(location=FeatureLocation(start, end),
                        type='pseudogene', strand=strand)
    qualifier_values = [
        ('nucleotide_seq', pseudo.extract(seqrecord.seq)),
        ('contig_id', seqrecord.name),
        ('hits', hits),
        ('locus_tag', ''),
        ('parents', parents),
        ('pseudo_type', PseudoType.fragmented),
        ('note', "Pseudogene candidate. Reason: Predicted fragmentation of a single gene."),
    ]
    for key, value in qualifier_values:
        pseudo.qualifiers[key] = value
    seqrecord.features.append(pseudo)
def test_within(self):
    """Features: write/read simple within locations."""
    s = "N" * 100

    def check(location, strand, feature_type, expected, compare_length=True):
        # Build the feature, check its INSDC location string (and, unless
        # disabled, that extraction yields len(feature) letters), then
        # store it on the record for the final round trip.
        feature = SeqFeature(location, strand=strand, type=feature_type)
        self.assertEqual(_insdc_feature_location_string(feature, 100),
                         expected)
        if compare_length:
            self.assertEqual(len(feature), len(feature.extract(s)))
        self.record.features.append(feature)

    check(FeatureLocation(WithinPosition(2, 6), 10),
          +1, "CDS", "(3.9)..10")
    check(FeatureLocation(WithinPosition(12, 6), WithinPosition(20, 8)),
          +1, "CDS", "(13.19)..(20.28)")
    check(FeatureLocation(25, WithinPosition(30, 3)),
          +1, "misc_feature", "26..(30.33)")
    # This case only checks the location string, not the length.
    check(FeatureLocation(WithinPosition(35, 4), 40),
          -1, "rRNA", "complement((36.40)..40)", compare_length=False)
    check(FeatureLocation(WithinPosition(45, 2), WithinPosition(50, 3)),
          -1, "repeat_region", "complement((46.48)..(50.53))")
    check(FeatureLocation(55, WithinPosition(60, 5)),
          -1, "CDS", "complement(56..(60.65))")
    self.write_read_checks()
def get_seq(genome, intervals):
    """Cut the requested intervals out of a set of genome records.

    Args:
        genome: iterable of sequence records accepted by
            ``SeqIO.to_dict``. It is indexed once, on the first interval,
            so a one-shot iterator (such as the result of ``SeqIO.parse``)
            works for any number of intervals.
        intervals: iterable of ``(seq_id, start, end)``; ``start`` and
            ``end`` are converted with ``int`` and used as a 0-based,
            end-exclusive location.

    Returns:
        list: one SeqRecord per interval, with id
        ``<seq_id>:<start + 1>-<end>`` and an empty description.
    """
    records_by_id = None
    seq_records = []
    for seq_id, start, end in intervals:
        start = int(start)
        end = int(end)
        feature = SeqFeature(FeatureLocation(start, end))
        if records_by_id is None:
            # Built lazily so that an empty interval list never touches
            # (or consumes) the genome.
            records_by_id = SeqIO.to_dict(genome)
        seq = feature.extract(records_by_id[seq_id].seq)
        seq_record = SeqRecord(seq)
        seq_record.id = '{0}:{1}-{2}'.format(seq_id, start + 1, end)
        seq_record.description = ''
        seq_records.append(seq_record)
    return seq_records
def get_seq_record_by_id_location(self, identifier, start=None, end=None, strand_=None):
    """Return a sequence, or a slice of it, by identifier.

    Args:
        identifier: key passed to ``self.get_sequence_by_id``.
        start: start of the wanted location (0-based), or None.
        end: end of the wanted location (exclusive), or None.
        strand_: must be given for a slice to be taken.
            NOTE(review): the value itself is not used when extracting,
            so the slice is always taken from the stored orientation -
            confirm this is intended.

    Returns:
        The string of the extracted slice when ``start``, ``end`` and
        ``strand_`` are all given; otherwise whatever
        ``self.get_sequence_by_id`` returns.

    Raises:
        ValueError: if ``self.indexed_data`` is None.
    """
    if self.indexed_data is None:
        raise ValueError("Genbank Handler not initialized")
    seq = self.get_sequence_by_id(identifier)
    if start is None or end is None or strand_ is None:
        return seq
    feature = SeqFeature(FeatureLocation(start, end), type="exon")
    return str(feature.extract(seq))
def test_deletion__overlapping_features(self):
    """Remove the region shared by two overlapping CDS features.

    Once the shared bases are deleted, neither the record sequence nor the
    sequence extracted for either feature may contain them.
    """
    # Example based on intersection of nei and arbB genes in MG1655.
    prefix = 'GCCCTGGCTGCCAGCA'
    shared_bases = 'CTAG'
    suffix = 'GCCGACCGCTTCGG'
    whole = prefix + shared_bases + suffix
    seq_record = SeqRecord(Seq(whole, generic_dna))

    # CDS '1' ends with the shared bases, CDS '2' starts with them.
    upstream_cds = SeqFeature(
        FeatureLocation(0, len(prefix) + len(shared_bases), strand=1),
        type='CDS', id='1')
    seq_record.features.append(upstream_cds)
    downstream_cds = SeqFeature(
        FeatureLocation(len(prefix), len(whole), strand=1),
        type='CDS', id='2')
    seq_record.features.append(downstream_cds)

    maker = VCFToGenbankMaker(seq_record, None, None)
    maker._update_genome_record_for_variant(len(prefix), shared_bases, '')

    # Assert the sequence is correct.
    self.assertEqual(prefix + suffix, str(seq_record.seq))

    # Assert the feature annotations are still correct.
    self.assertEqual(prefix, str(upstream_cds.extract(seq_record.seq)))
    self.assertEqual(suffix, str(downstream_cds.extract(seq_record.seq)))
def test_avoid_changes_in_shadows(self):
    """Avoid changing bases in CDS features.

    A GC-content fix is requested around position 100 of an AT-only
    sequence. The bases of the CDS must come out unchanged.
    """
    # Sequence is all AT so some will have to change.
    seq = Seq(''.join(random.choice('AT') for _ in range(200)),
              generic_dna)
    orig_seq_record = SeqRecord(seq)

    # We will expect a 60 base shadow from 60 - 120. This is the CDS
    # and 20 bases upstream.
    feature_1_loc = FeatureLocation(80, 120, strand=1)
    feature_1 = SeqFeature(feature_1_loc, type='CDS', id=1)
    orig_seq_record.features.append(feature_1)

    seq_record = copy.deepcopy(orig_seq_record)
    self.assertEqual(0, GC(seq_record.seq))

    # Hit just one position.
    interval_list = [(100, 101)]

    # Aim for 100% to hit all bases possible.
    constraint_obj = GCContentConstraints()
    constraint_obj.local_window_lower_bound = 1.0
    constraint_obj.local_window_upper_bound = 1.1  # no upper bound

    automated_intergenic_gc_fixer(
        seq_record, interval_list,
        gc_content_constraint_obj=constraint_obj)

    # Expect the 100-base window centered at 100 to reach a GC of 40:
    # the 60-base shadow stays AT-only, leaving 40 bases that can change.
    window_seq = seq_record.seq[50:150]
    self.assertEqual(40, GC(window_seq))

    # Make sure shadow seq is unchanged.
    self.assertEqual(str(feature_1.extract(orig_seq_record.seq)),
                     str(feature_1.extract(seq_record.seq)))
def retrieveCompositeSequence(seq_record, seqList):
    """Extract the span covering every node of ``seqList``.

    Each node is a string ``<name>:<start>..<end>``. The smallest and the
    largest coordinate over all nodes delimit the slice cut from
    ``seq_record``.

    Returns:
        SeqRecord: the slice, with id ``<record id>|<start>_<end>`` and an
        empty description.
    """
    # true seq
    coordinates = []
    for node in seqList:
        _name, coord = node.split(":")
        low, high = coord.split("..")
        coordinates.extend((int(float(low)), int(float(high))))

    start = min(coordinates)
    end = max(coordinates)

    extracted = SeqFeature(FeatureLocation(start, end)).extract(seq_record)
    seqId = seq_record.id + "|" + str(start) + "_" + str(end)
    return SeqRecord(seq=extracted.seq, id=seqId, description="")
def __init__(self, coords, global_seq):
    """Init a region's basic information: start, end, globalsequence,
    subsequence.

    Args:
        coords (tuple(int, int)): start and end coordinates of the region;
            the start is shifted down by one when the subsequence is cut.
        global_seq: the original sequence to which the region belongs.
            NOTE(review): the extracted result is read through ``.seq``,
            so this presumably has to be a record-like object rather than
            a bare Bio.Seq.Seq - confirm against the callers.
    """
    region_start = coords[0]
    region_end = coords[1]
    self.start = region_start
    self.end = region_end
    self.globalsequence = global_seq
    # FeatureLocation takes Python slicing-style positions (0-based,
    # end-exclusive), so only the start needs the -1 shift.
    location = FeatureLocation(self.start - 1, self.end)
    self.subsequence = SeqFeature(location).extract(global_seq).seq
def check_sub(feature, sequence):
    """Build CDS features from the sub-features of ``feature``.

    Sub-features that have children of their own are processed
    recursively. The CDS children found directly under ``feature`` are
    merged into one (possibly compound) CDS feature carrying the
    qualifiers they agree on, plus a ``translation`` qualifier.

    Returns:
        list: the features gathered from deeper levels if there are any;
        otherwise a list holding the single merged CDS feature, or an
        empty list when no CDS child exists.
    """
    collected = []
    locations = []
    shared_quals = {}
    mismatched = []
    for sub in feature.sub_features:
        if sub.sub_features:
            # If there are sub_features, go deeper
            collected.extend(check_sub(sub, sequence))
        elif sub.type == 'CDS':
            begin = sub.location.start.real
            finish = sub.location.end.real
            locations.append(
                FeatureLocation(begin, finish, strand=sub.strand))
            # For split features (CDSs), the final feature will have the same qualifiers as the children ONLY if
            # they're the same, i.e.: all children have the same "protein_ID" (key and value).
            for qual in sub.qualifiers.keys():
                value = sub.qualifiers[qual]
                if not shared_quals.setdefault(qual, value) == value:
                    mismatched.append(qual)

    # Pop mismatching qualifiers over split features, then the parent.
    for qual in mismatched:
        shared_quals.pop(qual, None)
    shared_quals.pop('Parent', None)

    # Only works in tip of the tree, when there's no new feature built
    # yet. If there is, the script just came out of a check_sub and it's
    # ready to return.
    if collected or not locations:
        return collected

    if len(locations) == 1:
        merged_location = locations[0]
    else:
        locations.sort(key=lambda part: part.start.real)
        if locations[0].strand == 1:
            merged_location = CompoundLocation(locations)
        else:
            merged_location = CompoundLocation(locations[::-1])

    cds = SeqFeature(merged_location)
    cds.qualifiers = shared_quals
    cds.type = 'CDS'
    protein = cds.extract(sequence.seq).translate(stop_symbol='')
    cds.qualifiers['translation'] = [str(protein)]
    collected.append(cds)
    return collected
def require_sd(data, record, chrom_start, sd_min, sd_max):
    """Yield the putative genes for which the SD caller reports a hit.

    For each entry of ``data`` (indexable as start, end, strand) a window
    between ``sd_min`` and ``sd_max`` bases away from the gene start is
    extracted from ``record`` and handed to ``NaiveSDCaller.list_sds``.

    Yields:
        tuple: the putative gene extended with the window ``(start, end)``
        for every gene with at least one reported hit.
    """
    sd_finder = NaiveSDCaller()
    for putative_gene in data:
        strand = putative_gene[2]
        if strand > 0:
            anchor = chrom_start + putative_gene[0]
            start, end = anchor - sd_max, anchor - sd_min
        else:
            anchor = chrom_start + putative_gene[1]
            start, end = anchor + sd_min, anchor + sd_max

        (start, end) = __ensure_location_in_bounds(
            start=start, end=end, parent_length=len(record))
        window = SeqFeature(
            FeatureLocation(start, end, strand=putative_gene[2]),
            type='domain')
        # Get the sequence
        window_seq = str(window.extract(record.seq))
        if len(sd_finder.list_sds(window_seq)) > 0:
            yield putative_gene + (start, end)
def get_residue_annotations(self, start_resnum, end_resnum=None):
    """Retrieve letter annotations for a residue or a range of residues

    Args:
        start_resnum (int): Residue number
        end_resnum (int): Optional residue number, specify if a range is
            desired; a falsy value selects the single start residue

    Returns:
        dict: Letter annotations for this residue or residues

    """
    last_resnum = end_resnum if end_resnum else start_resnum
    # The start is shifted down by one to form the location.
    window = FeatureLocation(start_resnum - 1, last_resnum)
    return SeqFeature(window).extract(self).letter_annotations
def extract_sequences_one_sample(args):
    '''
    Function for extracting protein sequences given annotated regions and
    produce fasta files that can be used for clustering

    ``args`` is the tuple ``(fasta_path, annotation_path, sample,
    output_dir, domain)``. The extracted records are written to
    ``<output_dir>/forClustering/<sample>.fasta``.
    '''
    (fasta_path, annotation_path, sample, output_dir, domain) = args
    domainInfo = load_annotation_pfam(annotation_path, domain)
    print("Generating domain fasta sequences for " + sample + " ...")
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio.SeqRecord import SeqRecord
    (annot, start, stop, strand, evalue) = domainInfo
    record_dict = index_fasta(fasta_path)
    recordlist = []
    outfilename = output_dir + '/forClustering/' + sample + '.fasta'
    # The context manager closes the file even if extraction fails.
    with open(outfilename, 'w') as outhandle:
        for domainID in annot.keys():
            for i, seq_name in enumerate(annot[domainID]):
                try:
                    seq = record_dict[seq_name]
                except KeyError:
                    print("Error: " + seq_name + " not in fasta file.\n")
                    # NOTE(review): this abandons the remaining entries of
                    # this domain ID, as before - confirm that skipping
                    # only the missing sequence is not what is wanted.
                    break
                a = start[domainID][i]
                b = stop[domainID][i]
                seq_strand = strand[domainID][i]
                seq_evalue = evalue[domainID][i]
                if seq_strand in '+':
                    strand_value = 1
                elif seq_strand in '-':
                    strand_value = -1
                else:
                    # Skip instead of silently reusing the feature of the
                    # previous entry.
                    print("Error: unknown strand " + repr(seq_strand) +
                          " for " + seq_name + ".\n")
                    continue
                # NOTE(review): the end is shifted by -1 as well; with an
                # end-exclusive location this leaves out the position at
                # ``b`` - confirm the coordinate convention of the input.
                domain_feature = SeqFeature(FeatureLocation(a - 1, b - 1),
                                            type="domain",
                                            strand=strand_value)
                feature_seq = domain_feature.extract(seq)
                feature_seq.id = (feature_seq.id + ' ' + domainID + ' ' +
                                  seq_evalue)
                recordlist.append(feature_seq)
        SeqIO.write(recordlist, outhandle, "fasta")
    print("Done")
def retrievePartialSequence(seqList, seq2coord, gi2domain, gi2phylum, gi2species, seqId2seqRecord):
    """Extract one sub-sequence per sequence id named in ``seqList``.

    Nodes are strings ``<seq id>:<coordinates>``. For every distinct
    sequence id that has a record in ``seqId2seqRecord``, the first node
    of that id still present in ``seqList`` is extracted using the
    coordinates in ``seq2coord[node]`` and is then removed from
    ``seqList`` (the caller's list is modified).

    Returns:
        list: SeqRecords with id
        ``<gi>|<phylum>|<species>|<coordinates with ".." replaced by "_">``.
    """
    # true seq
    wanted_ids = set()
    for node in seqList:
        name, _coord = node.split(":")
        wanted_ids.add(name)

    # get fasta
    extracted = []
    for wanted_id in wanted_ids:
        if wanted_id not in seqId2seqRecord:
            continue
        seq_record = seqId2seqRecord[wanted_id]
        for node in seqList:
            gi = node.split(":")[0]
            if seq_record.id != gi:
                continue
            # extracting subseq thanks to coordinates
            begin = seq2coord[node][0]
            finish = seq2coord[node][1]
            window = SeqFeature(FeatureLocation(int(begin), int(finish)))
            piece = window.extract(seq_record)
            species = gi2species[gi]
            phylum = gi2phylum[gi]
            label = (gi + "|" + phylum + "|" + species + "|" +
                     node.split(":")[1].replace("..", "_"))
            extracted.append(
                SeqRecord(seq=piece.seq, id=label, description=""))
            # Only the first matching node is used; leaving the loop right
            # after the removal keeps the iteration safe.
            seqList.remove(node)
            break
    return extracted
def extract_sequences_one_sample(args):
    '''
    Function for extracting protein sequences given annotated regions and
    produce fasta files that can be used for clustering

    ``args`` is the tuple ``(fasta_path, annotation_path, sample,
    output_dir)``. The extracted records are written to
    ``<output_dir>/forClustering/<sample>.fasta``.
    '''
    (fasta_path, annotation_path, sample, output_dir) = args
    domainInfo = load_annotation_pfam(annotation_path)
    print("Generating domain fasta sequences for " + sample + " ...")
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    from Bio.SeqRecord import SeqRecord
    (annot, start, stop, strand, evalue) = domainInfo
    record_dict = index_fasta(fasta_path)
    recordlist = []
    outfilename = output_dir + '/forClustering/' + sample + '.fasta'
    # The context manager closes the file even if extraction fails.
    with open(outfilename, 'w') as outhandle:
        for domainID in annot.keys():
            for i, domain in enumerate(annot[domainID]):
                try:
                    seq = record_dict[domain]
                except KeyError:
                    print("Error: " + domain + " not in fasta file.\n")
                    # NOTE(review): this abandons the remaining entries of
                    # this domain ID, as before - confirm that skipping
                    # only the missing sequence is not what is wanted.
                    break
                a = start[domainID][i]
                b = stop[domainID][i]
                seq_strand = strand[domainID][i]
                seq_evalue = evalue[domainID][i]
                if seq_strand in '+':
                    strand_value = 1
                elif seq_strand in '-':
                    strand_value = -1
                else:
                    # Skip instead of silently reusing the feature of the
                    # previous entry.
                    print("Error: unknown strand " + repr(seq_strand) +
                          " for " + domain + ".\n")
                    continue
                # NOTE(review): the end is shifted by -1 as well; with an
                # end-exclusive location this leaves out the position at
                # ``b`` - confirm the coordinate convention of the input.
                domain_feature = SeqFeature(FeatureLocation(a - 1, b - 1),
                                            type="domain",
                                            strand=strand_value)
                feature_seq = domain_feature.extract(seq)
                feature_seq.id = (feature_seq.id + ' ' + domainID + ' ' +
                                  seq_evalue)
                recordlist.append(feature_seq)
        SeqIO.write(recordlist, outhandle, "fasta")
    print("Done")
def test_get_codon_feature_location__reverse_strand(self):
    """Check codon locations for a CDS on the reverse strand.

    The CDS covers positions 6-18 of the record; codon 0 is expected at
    15-18 and codon 1 at 12-15.
    """
    coding_on_reverse = reverse_complement('ATGTTTGGGTAG')
    seq_record = SeqRecord(
        Seq('CCCCCC' + coding_on_reverse + 'AGTA', generic_dna))

    cds = SeqFeature(FeatureLocation(6, 18), type='CDS', id='1', strand=-1)
    add_feature_to_seq_record(seq_record, cds)

    # Sanity check for the feature sequence.
    extracted = str(cds.extract(seq_record.seq))
    self.assertEqual('ATG', extracted[0:3])
    self.assertEqual('TTT', extracted[3:6])

    expected_bounds = [(15, 18), (12, 15)]
    for codon_index, (exp_start, exp_end) in enumerate(expected_bounds):
        codon_location = get_codon_feature_location(cds, codon_index)
        self.assertEqual(exp_start, codon_location.start)
        self.assertEqual(exp_end, codon_location.end)
def testFeatureUpstream(self, feature, record, sd_min=5, sd_max=15):
    """Look for SD hits in the window upstream of a feature.

    The window lies between ``sd_min`` and ``sd_max`` bases before the
    feature start for a positive strand, and the same distance past the
    feature end otherwise. It is passed through
    ``ensure_location_in_bounds`` before extraction.

    Returns:
        tuple: ``(self.list_sds(seq), start, end, seq)`` for the window.
    """
    # Strand information necessary to getting correct upstream sequence
    # TODO: library?
    strand = feature.location.strand

    # n_bases_upstream
    if strand > 0:
        anchor = feature.location.start
        window = (anchor - sd_max, anchor - sd_min)
    else:
        anchor = feature.location.end
        window = (anchor + sd_min, anchor + sd_max)

    (start, end) = ensure_location_in_bounds(
        start=window[0], end=window[1], parent_length=len(record))

    # Temporary feature used to obtain the correct portion of the genome.
    upstream = SeqFeature(
        FeatureLocation(start, end, strand=strand), type="domain")
    seq = str(upstream.extract(record.seq))
    return self.list_sds(seq), start, end, seq
def get_TIS(fullpath, filename):
    """Extract the TIS window from every record of a FASTA file.

    The first number found in ``filename`` is taken as the count of bases
    upstream of the CDS. The window runs from ``num_bp_upstream_start``
    bases before that position to ``num_bp_downstream_start`` bases after
    it (both are module-level values). The extracted records are written
    to ``extracted_TIS_<filename>.TIS.fasta``.

    Args:
        fullpath: path of the FASTA file to read.
        filename: file name used to derive the offset and the output name.

    Raises:
        IndexError: if ``filename`` contains no number.
    """
    # scans the file name to determine how many bases are upstream of the CDS
    list_of_numbers_in_filename = re.findall(r'\d+', filename)
    num_bp_upstreamcds = int(list_of_numbers_in_filename[0])

    # sets up the TIS coordinates prior to extraction
    TIS_coordinates = SeqFeature(FeatureLocation(
        num_bp_upstreamcds - num_bp_upstream_start,
        num_bp_upstreamcds + num_bp_downstream_start))

    # reads in the FASTA file and extracts the window from each record
    extracted_TIS_list = [TIS_coordinates.extract(record)
                          for record in SeqIO.parse(fullpath, "fasta")]

    SeqIO.write(extracted_TIS_list,
                "extracted_TIS_" + filename + ".TIS.fasta", "fasta")
    return
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Annotate copying from HXB2

    Every feature of the HXB2 reference is located in the reference named
    ``refname`` and appended to it as a new feature.

    Args:
        refname: name of the reference to annotate.
        VERBOSE (int): when >= 1, the id of each feature is printed.

    Returns:
        The loaded reference with the new features appended.

    Raises:
        ValueError: if a feature (other than LTR5', LTR3' and V4) is more
            than 10% shorter or longer than its HXB2 counterpart.
    '''
    hxb2 = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    refs = str(ref.seq)

    def get_sublocation(sublocation):
        # Find where the HXB2 part lands in the new reference.
        hxb2_seq = sublocation.extract(hxb2)
        ref_seq = trim_to_refseq(refs, hxb2_seq).replace('-', '')
        # NOTE(review): ``find`` returns -1 when the trimmed sequence is
        # not present, which yields a location starting at -1 - confirm
        # whether that case can occur.
        start = refs.find(ref_seq)
        end = start + len(ref_seq)
        return FeatureLocation(start, end, strand=+1)

    for fea in hxb2.features:
        if VERBOSE >= 1:
            print(fea.id)

        parts = [get_sublocation(part) for part in fea.location.parts]
        if len(parts) == 1:
            loc = parts[0]
        else:
            loc = CompoundLocation(parts)

        feature = SeqFeature(loc, type=fea.type, id=fea.id)

        # Test length of old and new
        if fea.id not in ["LTR5'", "LTR3'", 'V4']:
            L1 = len(fea.extract(hxb2))
            L2 = len(feature.extract(ref))
            s = str(L2) + ' vs ' + str(L1)
            ratio = 1.0 * L2 / L1
            if ratio < 0.9:
                raise ValueError('Feature: ' + fea.id + ' is too short: ' + s)
            elif ratio > 1.1:
                raise ValueError('Feature: ' + fea.id + ' is too long: ' + s)

        ref.features.append(feature)

    return ref
def annotate_like_HXB2(refname, VERBOSE=0):
    '''Annotate copying from HXB2

    Every feature of the HXB2 reference is located in the reference named
    ``refname`` and appended to it as a new feature.

    Args:
        refname: name of the reference to annotate.
        VERBOSE (int): when >= 1, the id of each feature is printed.

    Returns:
        The loaded reference with the new features appended.

    Raises:
        ValueError: if a feature (other than LTR5', LTR3' and V4) is more
            than 10% shorter or longer than its HXB2 counterpart.
    '''
    hxb2 = load_custom_reference('HXB2', 'gb')
    ref = load_custom_reference(refname, 'fasta')
    refs = str(ref.seq)

    def get_sublocation(sublocation):
        # Find where the HXB2 part lands in the new reference.
        hxb2_seq = sublocation.extract(hxb2)
        ref_seq = trim_to_refseq(refs, hxb2_seq).replace('-', '')
        # NOTE(review): ``find`` returns -1 when the trimmed sequence is
        # not present, which yields a location starting at -1 - confirm
        # whether that case can occur.
        start = refs.find(ref_seq)
        end = start + len(ref_seq)
        return FeatureLocation(start, end, strand=+1)

    for fea in hxb2.features:
        if VERBOSE >= 1:
            print(fea.id)

        parts = [get_sublocation(part) for part in fea.location.parts]
        if len(parts) == 1:
            loc = parts[0]
        else:
            loc = CompoundLocation(parts)

        feature = SeqFeature(loc, type=fea.type, id=fea.id)

        # Test length of old and new
        if fea.id not in ["LTR5'", "LTR3'", 'V4']:
            L1 = len(fea.extract(hxb2))
            L2 = len(feature.extract(ref))
            s = str(L2) + ' vs ' + str(L1)
            ratio = 1.0 * L2 / L1
            if ratio < 0.9:
                raise ValueError('Feature: ' + fea.id + ' is too short: ' + s)
            elif ratio > 1.1:
                raise ValueError('Feature: ' + fea.id + ' is too long: ' + s)

        ref.features.append(feature)

    return ref
def test__init(self):
    "Test FeatureMatch object creation"
    # Forward strand: the match prepared by the fixture.
    fwd_match = self.match
    self.assertEqual(fwd_match.direction, "forward")
    self.assertEqual(str(fwd_match.dna), "ATGTACTCCACTATCTGCTGA")
    self.assertEqual(str(fwd_match.long_dna), str(self.feature_seq))
    self.assertEqual(str(fwd_match.promotor_region), "AAA")
    self.assertEqual(str(fwd_match.terminator_region), "TTT")
    self.assertEqual(str(fwd_match.mrna),
                     str(self.feature_seq.transcribe()))
    fwd_protein = self.feature.extract(self.seq).translate(to_stop=True)
    self.assertEqual(str(fwd_match.aas), str(fwd_protein))

    # Reverse strand: the same match built on the reverse complement.
    inv_seq = self.seq.reverse_complement()
    inv_region = inv_seq[3:-3]
    inv_feature = SeqFeature(
        FeatureLocation(6, 27), type="gene", strand=-1)
    rev_match = FeatureMatch(inv_feature, inv_region, -1, 3)
    self.assertEqual(rev_match.direction, "reverse")
    self.assertEqual(str(rev_match.dna),
                     str(inv_region[3:-3].reverse_complement()))
    self.assertEqual(str(rev_match.long_dna), str(self.feature_seq))
    self.assertEqual(str(rev_match.promotor_region), "AAA")
    self.assertEqual(str(rev_match.terminator_region), "TTT")
    self.assertEqual(str(rev_match.mrna),
                     str(inv_region.transcribe().reverse_complement()))
    rev_protein = inv_feature.extract(inv_seq).translate(to_stop=True)
    self.assertEqual(str(rev_match.aas), str(rev_protein))
def test_oneof(self):
    """Features: write/read simple one-of locations."""
    s = "N" * 100

    def one_of(positions):
        # One-of position built from exact positions.
        return OneOfPosition([ExactPosition(x) for x in positions])

    def check(location, strand, feature_type, expected):
        # Build the feature, check its INSDC location string and that
        # extraction yields len(feature) letters, then store it on the
        # record for the final round trip.
        feature = SeqFeature(location, strand=strand, type=feature_type)
        self.assertEqual(_insdc_feature_location_string(feature, 100),
                         expected)
        self.assertEqual(len(feature), len(feature.extract(s)))
        self.record.features.append(feature)

    check(FeatureLocation(one_of([0, 3, 6]), 21),
          +1, "CDS", "one-of(1,4,7)..21")
    check(FeatureLocation(one_of([10, 13, 16]), one_of([41, 44, 50])),
          +1, "gene", "one-of(11,14,17)..one-of(41,44,50)")
    check(FeatureLocation(27, one_of([30, 33])),
          +1, "gene", "28..one-of(30,33)")
    check(FeatureLocation(one_of([36, 40]), 46),
          -1, "CDS", "complement(one-of(37,41)..46)")
    check(FeatureLocation(one_of([45, 60]), one_of([70, 90])),
          -1, "CDS", "complement(one-of(46,61)..one-of(70,90))")
    check(FeatureLocation(55, one_of([60, 63])),
          -1, "tRNA", "complement(56..one-of(60,63))")
    self.write_read_checks()
def test_after(self):
    """Features: write/read simple after locations.

    Covers an ``AfterPosition`` at the start, at both ends and at the end
    of a CDS feature, on each strand.
    """
    parent = "N" * 200

    def check(start, end, strand, expected):
        feature = SeqFeature(FeatureLocation(start, end),
                             strand=strand, type="CDS")
        self.assertEqual(_insdc_feature_location_string(feature, 100),
                         expected)
        self.assertEqual(len(feature), len(feature.extract(parent)))
        self.record.features.append(feature)

    # forward strand
    check(AfterPosition(5), 10, +1, ">6..10")
    check(AfterPosition(15), AfterPosition(20), +1, ">16..>20")
    check(25, AfterPosition(30), +1, "26..>30")
    # reverse strand
    check(AfterPosition(35), 40, -1, "complement(>36..40)")
    check(AfterPosition(45), AfterPosition(50), -1, "complement(>46..>50)")
    check(55, AfterPosition(60), -1, "complement(56..>60)")
    self.write_read_checks()
def test_before(self):
    """Features: write/read simple before locations.

    Covers a ``BeforePosition`` at the start, at both ends and at the end
    of a CDS feature, on each strand.
    """
    parent = "N" * 200

    def check(start, end, strand, expected):
        feature = SeqFeature(FeatureLocation(start, end),
                             strand=strand, type="CDS")
        self.assertEqual(_insdc_feature_location_string(feature, 100),
                         expected)
        self.assertEqual(len(feature), len(feature.extract(parent)))
        self.record.features.append(feature)

    # forward strand
    check(BeforePosition(5), 10, +1, "<6..10")
    check(BeforePosition(15), BeforePosition(20), +1, "<16..<20")
    check(25, BeforePosition(30), +1, "26..<30")
    # reverse strand
    check(BeforePosition(35), 40, -1, "complement(<36..40)")
    check(BeforePosition(45), BeforePosition(50), -1, "complement(<46..<50)")
    check(55, BeforePosition(60), -1, "complement(56..<60)")
    self.write_read_checks()
class TestFeatureMatch(unittest2.TestCase):
    """Unit tests for FeatureMatch attributes, string form and FASTA output."""

    def setUp(self):
        """Build a forward-strand match over a 33-nt sequence.

        The feature covers positions 6..27; ``feature_seq`` is the sequence
        with 3 nt trimmed from each end, i.e. the feature plus 3 nt on
        either side.
        """
        self.seq = Seq("CCCAAAATGTACTCCACTATCTGCTGATTTGGG", generic_dna)
        self.feature = SeqFeature(FeatureLocation(6, 27), type="gene", strand=1)
        self.feature_seq = self.seq[3:-3]
        # NOTE(review): the trailing (1, 3) are presumably strand and flank
        # length - confirm against FeatureMatch.__init__.
        self.match = FeatureMatch(self.feature, self.feature_seq, 1, 3)

    def test__init(self):
        "Test FeatureMatch object creation"
        # forward strand
        m = self.match
        self.assertEqual(m.direction, "forward")
        self.assertEqual(str(m.dna), "ATGTACTCCACTATCTGCTGA")
        self.assertEqual(str(m.long_dna), str(self.feature_seq))
        self.assertEqual(str(m.promotor_region), "AAA")
        self.assertEqual(str(m.terminator_region), "TTT")
        self.assertEqual(str(m.mrna), str(self.feature_seq.transcribe()))
        self.assertEqual(str(m.aas), str(self.feature.extract(self.seq).translate(to_stop=True)))
        # reverse strand
        inv_seq = self.seq.reverse_complement()
        feature_seq = inv_seq[3:-3]
        inv_feature = SeqFeature(FeatureLocation(6, 27), type="gene", strand=-1)
        m = FeatureMatch(inv_feature, feature_seq, -1, 3)
        self.assertEqual(m.direction, "reverse")
        self.assertEqual(str(m.dna), str(feature_seq[3:-3].reverse_complement()))
        self.assertEqual(str(m.long_dna), str(self.feature_seq))
        self.assertEqual(str(m.promotor_region), "AAA")
        self.assertEqual(str(m.terminator_region), "TTT")
        self.assertEqual(str(m.mrna), str(feature_seq.transcribe().reverse_complement()))
        self.assertEqual(str(m.aas), str(inv_feature.extract(inv_seq).translate(to_stop=True)))

    def test_get_fasta_header(self):
        "Test FeatureMatch FASTA header creation"
        # Qualifiers are added one at a time; each step pins the header
        # produced with the qualifiers present so far.
        expected = ">untagged"
        self.assertEqual(self.match.get_fasta_header(), expected)
        self.match.feature.qualifiers['gene'] = ['fake']
        expected = ">fake"
        self.assertEqual(self.match.get_fasta_header(), expected)
        self.match.feature.qualifiers['locus_tag'] = ['FAKE_0001']
        expected = ">FAKE_0001"
        self.assertEqual(self.match.get_fasta_header(), expected)
        self.match.feature.qualifiers['product'] = ['Mup1']
        expected = ">FAKE_0001|Mup1"
        self.assertEqual(self.match.get_fasta_header(), expected)
        self.match.feature.qualifiers['protein_id'] = ['MUP_0001']
        expected = ">FAKE_0001|Mup1|MUP_0001"
        self.assertEqual(self.match.get_fasta_header(), expected)

    def test__str_(self):
        "Test FeatureMatch string representation"
        # NOTE(review): the expected literals below are reproduced exactly as
        # they appear in this copy of the file; if the original literals span
        # several lines, verify their whitespace against FeatureMatch.__str__.
        expected = """Feature: untagged Strand: forward DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""
        self.assertMultiLineEqual(str(self.match), expected)
        self.match.feature.qualifiers['gene'] = ["fake"]
        expected = """Feature: Tag: fake Strand: forward DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""
        self.assertMultiLineEqual(str(self.match), expected)
        self.match.feature.qualifiers['locus_tag'] = ['FAKE_0001']
        expected = """Feature: Tag: FAKE_0001 Strand: forward DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""
        self.assertMultiLineEqual(str(self.match), expected)
        self.match.feature.qualifiers['product'] = ['Mup1']
        expected = """Feature: Tag: FAKE_0001 Strand: forward Product: Mup1 DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""
        self.assertMultiLineEqual(str(self.match), expected)
        self.match.feature.qualifiers['protein_id'] = ['MUP_0001']
        expected = """Feature: Tag: FAKE_0001 Strand: forward Product: Mup1 Protein ID: MUP_0001 DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""
        self.assertMultiLineEqual(str(self.match), expected)

    # Each *_fasta test below expects "<header>\n<sequence>", built from the
    # match's own header and the corresponding attribute.

    def test_dna_fasta(self):
        "Test FeatureMatch DNA FASTA output"
        expected = "%s\n%s" % (self.match.get_fasta_header(), self.match.dna)
        self.assertMultiLineEqual(self.match.dna_fasta(), expected)

    def test_long_dna_fasta(self):
        "Test FeatureMatch long DNA FASTA output"
        expected = "%s\n%s" % (self.match.get_fasta_header(), self.match.long_dna)
        self.assertMultiLineEqual(self.match.long_dna_fasta(), expected)

    def test_mrna_fasta(self):
        "Test FeatureMatch mRNA FASTA output"
        expected = "%s\n%s" % (self.match.get_fasta_header(), self.match.mrna)
        self.assertMultiLineEqual(self.match.mrna_fasta(), expected)

    def test_protein_fasta(self):
        "Test FeatureMatch protein FASTA output"
        expected = "%s\n%s" % (self.match.get_fasta_header(), self.match.aas)
        self.assertMultiLineEqual(self.match.protein_fasta(), expected)

    def test_promotor_fasta(self):
        "Test FeatureMatch promotor DNA FASTA output"
        expected = "%s\n%s" % (self.match.get_fasta_header(), self.match.promotor_region)
        self.assertMultiLineEqual(self.match.promotor_fasta(), expected)

    def test_terminator_fasta(self):
        "Test FeatureMatch terminator DNA FASTA output"
        expected = "%s\n%s" % (self.match.get_fasta_header(), self.match.terminator_region)
        self.assertMultiLineEqual(self.match.terminator_fasta(), expected)
# if we had a TSS (not ever gene does) if tss: # open the fasta file sequence.fasta for seq_record in SeqIO.parse("sequence.fasta", "fasta"): # check from genestart to TSS is forward or tss to gene start if backwards with -1 or 1 strange appropraitely if direction is "-": # print(tss) test_feature = SeqFeature(FeatureLocation(int(geneStarts[i]), int(tss)), type="gene", strand=-1) else: test_feature = SeqFeature(FeatureLocation(int(tss), int(geneStarts[i])), type="gene", strand=1) # test_feature=SeqFeature(FeatureLocation(40417,TSS),type="gene",strand=strandDirection) # example_seq now contains the sequence which we need to search through example_seq = test_feature.extract(seq_record) print (example_seq.seq) # need to transcribe the sequence sequence = str(example_seq.seq.transcribe()) print sequence letters = ["U", "A", "C", "G"] # default the stalling position to x stallingPosition = ["x", "x", "x", "x"] # for each letter in UACG for letter in letters:
def get_codon(cds: SeqFeature, seq: Seq, region_pos: int) -> Codon:
    """Return the raw data of the codon that contains ``region_pos``.

    ``region_pos`` is an offset into the sequence extracted by ``cds`` from
    ``seq``; it is rounded down to a multiple of three and the three
    positions starting there are returned via the slice's ``_data``.
    """
    codon_start = 3 * (region_pos // 3)
    return cds.extract(seq)[codon_start:codon_start + 3]._data
def name_nrpspks(seq_record, pksnrpsvars, withinclustergenes, options):
    """Classify each gene's NRPS/PKS subtype and annotate it on the record.

    For every feature in ``withinclustergenes`` that has a non-empty entry in
    ``pksnrpsvars.domaindict``:

    * a subtype label is chosen from the gene's domain names and its KS
      subtype counts and appended to the feature's ``sec_met`` qualifier;
    * one motif feature per entry of ``pksnrpsvars.motifdict`` is built
      (type ``options.FeatureTags.pksnrpsmotifs_tag``) and added to
      ``seq_record.features``;
    * one ``"NRPS/PKS Domain: ..."`` line per domain is appended to
      ``sec_met``;
    * the label is stored in ``pksnrpsvars.nrpspkstypedict`` under the gene
      id.

    ``pksnrpsvars.nrpspkstypedict`` is reset to an empty dict on entry.
    Returns None; all results are written onto the arguments.
    """
    pksnrpsvars.nrpspkstypedict = {}
    for feature in withinclustergenes:
        k = utils.get_gene_id(feature)
        # Genes without any detected domain are left untouched.
        if not pksnrpsvars.domaindict.has_key(k):
            continue
        if pksnrpsvars.domaindict[k] == []:
            continue
        #structure of domaindict: domaindict[genename] = [[name,start,end,evalue,score],[name,start,end,evalue,score], etc.]
        # Collect the domain names and count the PKS_KS domains.
        domainlist = []
        nrKSdomains = 0
        for i in pksnrpsvars.domaindict[k]:
            domainlist.append(i[0])
            if i[0] == "PKS_KS":
                nrKSdomains += 1
        # Count how many KS domains were assigned to each KS subtype.
        modKSscore = 0
        traKSscore = 0
        eneKSscore = 0
        iterKSscore = 0
        if pksnrpsvars.ksdomaindict.has_key(k):
            for i in pksnrpsvars.ksdomaindict[k]:
                if i[0] == "Trans-AT-KS":
                    traKSscore += 1
                if i[0] == "Modular-KS":
                    modKSscore += 1
                if i[0] == "Enediyne-KS":
                    eneKSscore += 1
                if i[0] == "Iterative-KS":
                    iterKSscore += 1
        if pksnrpsvars.domaindict.has_key(k):
            # The chain below does not read the loop variable, so every
            # iteration assigns the same label; branches are order-dependent.
            for i in pksnrpsvars.domaindict[k]:
                if "Cglyc" in domainlist and "Epimerization" in domainlist and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
                    nrpspkstype = "Glycopeptide NRPS"
                elif ( "Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist ) and "AMP-binding" in domainlist and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
                    nrpspkstype = "NRPS"
                # NOTE(review): `and` binds tighter than `or`, so this reads
                # as `<any condensation domain> or (AMP-binding and (PKS_KS
                # or PKS_AT))`. A gene with a condensation domain but neither
                # AMP-binding nor a PKS domain is therefore labelled hybrid
                # and never reaches "NRPS-like protein" below - confirm
                # whether `(<condensation> or AMP-binding) and (PKS_KS or
                # PKS_AT)` was intended.
                elif ("Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist) or "AMP-binding" in domainlist and ( "PKS_KS" in domainlist or "PKS_AT" in domainlist):
                    nrpspkstype = "Hybrid PKS-NRPS"
                elif ( "Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist ) and "PKS_KS" in domainlist and "PKS_AT" not in domainlist and "Trans-AT_docking" in domainlist and traKSscore > modKSscore and traKSscore > iterKSscore and traKSscore > eneKSscore:
                    nrpspkstype = "Type I Trans-AT PKS"
                elif ( "Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and iterKSscore > modKSscore and iterKSscore > traKSscore and iterKSscore > eneKSscore and nrKSdomains < 3:
                    nrpspkstype = "Type I Iterative PKS"
                elif ( "Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and eneKSscore > modKSscore and eneKSscore > traKSscore and eneKSscore > iterKSscore and nrKSdomains < 3:
                    nrpspkstype = "Type I Enediyne PKS"
                elif ( "Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist and ( (modKSscore > eneKSscore and modKSscore > traKSscore and modKSscore > iterKSscore) or nrKSdomains > 3):
                    nrpspkstype = "Type I Modular PKS"
                elif ("Condensation_LCL" not in domainlist and "Condensation_DCL" not in domainlist and "Condensation_Starter" not in domainlist and "Cglyc" not in domainlist and "Condensation_Dual" not in domainlist and "AMP-binding" not in domainlist ) and "PKS_KS" in domainlist and "PKS_AT" in domainlist:
                    nrpspkstype = "PKS-like protein"
                elif ( "Condensation_LCL" in domainlist or "Condensation_DCL" in domainlist or "Condensation_Starter" in domainlist or "Cglyc" in domainlist or "Condensation_Dual" in domainlist or "AMP-binding" in domainlist ) and "PKS_KS" not in domainlist and "PKS_AT" not in domainlist:
                    nrpspkstype = "NRPS-like protein"
                else:
                    nrpspkstype = "PKS/NRPS-like protein"
            # Record the subtype on the gene itself.
            if feature.qualifiers.has_key("sec_met"):
                feature.qualifiers['sec_met'].append("NRPS/PKS subtype: " + nrpspkstype)
            else:
                feature.qualifiers['sec_met'] = [ "NRPS/PKS subtype: " + nrpspkstype ]
            #Write motifs to seq_record
            motifFeatures = []
            if pksnrpsvars.motifdict.has_key(k):
                motifs = pksnrpsvars.motifdict[k]
                counter = 1
                for motif in motifs:
                    # motif[1] / motif[2] are multiplied by 3 and offset from
                    # the gene start (strand 1) or subtracted from the gene
                    # end (otherwise) to obtain nucleotide coordinates.
                    if feature.location.strand == 1:
                        start = feature.location.start + (3 * motif[1])
                        end = feature.location.start + (3 * motif[2])
                    else:
                        end = feature.location.end - (3 * motif[1])
                        start = feature.location.end - (3 * motif[2])
                    loc = FeatureLocation(start, end, strand=feature.strand)
                    motifFeature = SeqFeature( loc, type=options.FeatureTags.pksnrpsmotifs_tag)
                    quals = defaultdict(list)
                    quals['label'].append(str(motif[0]))
                    # Fall back to the gene id when the gene has no locus_tag.
                    if feature.qualifiers.has_key('locus_tag'):
                        quals['locus_tag'] = feature.qualifiers['locus_tag']
                    else:
                        quals['locus_tag'] = [k]
                    quals['motif'] = [motif[0]]
                    # counter gives each motif of this gene a 4-digit suffix.
                    quals['asDomain_id'] = [ 'nrpspksmotif_' + '_'.join(quals['locus_tag']) + '_' + '{:04d}'.format(counter) ]
                    counter += 1
                    quals['evalue'] = [str("{:.2E}".format(float(motif[3])))]
                    quals['score'] = [str(motif[4])]
                    quals['aSTool'] = ["pksnrpsmotif"]
                    quals['detection'] = ["hmmscan"]
                    quals['database'] = ["abmotifs"]
                    # Translation table 1 is used when the gene declares none.
                    if feature.qualifiers.has_key('transl_table'):
                        [transl_table] = feature.qualifiers['transl_table']
                    else:
                        transl_table = 1
                    quals['translation'] = [ str( motifFeature.extract(seq_record).seq.translate( table=transl_table)) ]
                    quals['note'].append("NRPS/PKS Motif: " + motif[0] + " (e-value: " + str(motif[3]) + ", bit-score: " + str(motif[4]) + ")")
                    motifFeature.qualifiers = quals
                    motifFeatures.append(motifFeature)
            # Append one sec_met line per detected domain.
            nrpspksdomains = pksnrpsvars.domaindict[k]
            for domain in nrpspksdomains:
                if feature.qualifiers.has_key("sec_met"):
                    feature.qualifiers['sec_met'].append( "NRPS/PKS Domain: %s (%s-%s). E-value: %s. Score: %s;" % (domain[0], str(domain[1]), str( domain[2]), str(domain[3]), str(domain[4])))
                else:
                    feature.qualifiers['sec_met'] = [ "NRPS/PKS Domain: %s (%s-%s). E-value: %s. Score: %s;" % (domain[0], str(domain[1]), str( domain[2]), str(domain[3]), str(domain[4])) ]
            seq_record.features.extend(motifFeatures)
            pksnrpsvars.nrpspkstypedict[k] = nrpspkstype
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Add the gene, mRNA and CDS features of ``gene`` to record ``r``.

    r             -- record whose ``features`` list is extended and whose
                     ``seq`` is used to translate the CDS
    gene          -- gene identifier; key into ``gene2position``
    gene2position -- maps gene to (contig, CDSs, strand, function, frames)
    gene2product  -- maps gene to product name; "hypothetical protein" is
                     used when the gene is absent
    start, end    -- gene boundaries used for the gene and partial-CDS
                     locations
    gcode         -- translation table passed to ``translate``
    partialyes    -- when false, partial CDSs are not added
    verbose       -- when true, warnings are written to stderr

    The gene feature is only added for identifiers ending in ``.t1``. The
    CDS is skipped when its translation contains an internal stop codon.
    Returns ``r``.
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    strand = +1 if gffstrand in ('1', '+') else -1
    if strand == -1:
        CDSs.reverse()
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)

    geneid = gene
    product = gene2product[geneid] if geneid in gene2product else "hypothetical protein"

    def basic_qualifiers():
        # A fresh dict per feature so that features never share qualifiers.
        return {"locus_tag": geneid, "gene": geneid, "product": product}

    # gene feature
    if gene.endswith('.t1'):
        gene_feature = SeqFeature(
            FeatureLocation(BeforePosition(start - 1), AfterPosition(end)),
            strand=strand, type='gene', id=geneid)
        gene_feature.qualifiers = basic_qualifiers()
        r.features.append(gene_feature)

    # mRNA feature
    mrna_feature = SeqFeature(mrnaloc, type='mRNA', id=gene)
    mrna_feature.qualifiers = basic_qualifiers()
    r.features.append(mrna_feature)

    # CDS feature and its translation
    cds_feature = SeqFeature(cdsloc, type='CDS', id=gene)
    cds_seq = cds_feature.extract(r.seq)
    aa = str(cds_seq.translate(table=gcode))

    # trim the boundary by the bases that do not fill a whole codon
    overhang = len(cds_seq) % 3
    if overhang:
        if strand == 1:
            end -= overhang
        else:
            start += overhang

    # partial sequence: no M as first or no * as last amino acid
    no_start = aa[0] != "M"
    no_stop = aa[-1] != "*"
    partial = True
    if no_start and no_stop:
        # both ends partial
        cds_feature.location = FeatureLocation(BeforePosition(start - 1),
                                               AfterPosition(end))
    elif (no_start and strand == 1) or (no_stop and strand == -1):
        # left end partial
        cds_feature.location = FeatureLocation(BeforePosition(start - 1), end)
    elif (no_stop and strand == 1) or (no_start and strand == -1):
        # right end partial
        cds_feature.location = FeatureLocation(start - 1, AfterPosition(end))
    else:
        partial = False

    # drop terminal stop codon(s); anything left over is an internal stop
    aa = aa.strip("*")
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r

    cds_feature.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": aa,
    }
    if function:
        cds_feature.qualifiers['note'] = function

    if partial:
        # partial entries are only kept on request
        if not partialyes:
            return r
        if aa[0] != "M":
            cds_feature.qualifiers['codon_start'] = 1
        cds_feature.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))

    r.features.append(cds_feature)
    return r
def extract_upstream_and_CDS(fullpath, filename):
    """Write every validated CDS plus its upstream flank to a FASTA file.

    Each record of the GenBank file ``fullpath`` is scanned for CDS features
    accepted by ``validate_cds``. For each one, the CDS together with
    ``num_bp_upstreamcds`` bases upstream of it is extracted in coding
    orientation (minus-strand genes are reverse-complemented) and stored
    with the description ``|<protein_id>|``. All sequences are written to
    ``filename + str(num_bp_upstreamcds) + "upstream_and_CDS.fasta"``.

    A CDS is skipped when

    * its location has no single strand (previously such a feature reused
      the sequence extracted for the preceding CDS, or raised NameError), or
    * the full upstream flank does not fit inside the record. The flank is
      checked on the side that is actually upstream for the gene's strand;
      previously the left side was checked for both strands, so minus-strand
      genes could be written with a truncated flank.

    Returns None.
    """
    extracted_cds_list = []
    # reads in a gbk and creates a SeqRecord object per entry
    for record in SeqIO.parse(fullpath, "genbank"):
        record_length = len(record.seq)
        for feature in record.features:
            if feature.type != "CDS":
                continue
            # Exact comparison kept on purpose: validate_cds is defined
            # elsewhere and its return type is not visible here.
            if not (validate_cds(record, feature) == True):  # noqa: E712
                continue

            # get the CDS nucleotide locations
            cds_start_location = feature.location.start.position
            cds_end_location = feature.location.end.position

            # The upstream flank lies left of a plus-strand gene and right
            # of a minus-strand gene (in record coordinates).
            strand = feature.location.strand
            if strand == 1:
                window_start = cds_start_location - num_bp_upstreamcds
                window_end = cds_end_location
            elif strand == -1:
                window_start = cds_start_location
                window_end = cds_end_location + num_bp_upstreamcds
            else:
                # No strand, or a mixed-strand compound location: there is
                # no well-defined upstream side.
                continue

            # Length culling: require the complete flank.
            if window_start < 0 or window_end > record_length:
                continue

            window = SeqFeature(FeatureLocation(window_start, window_end))
            extracted_seq = window.extract(record)
            if strand == -1:
                extracted_seq = extracted_seq.reverse_complement()

            cds_protein_id = str(
                feature.qualifiers.get('protein_id')).strip('\'[]')
            annotated_record = SeqRecord(extracted_seq.seq,
                                         extracted_seq.name,
                                         description="|" + cds_protein_id + "|")
            extracted_cds_list.append(annotated_record)

    SeqIO.write(extracted_cds_list,
                filename + str(num_bp_upstreamcds) + "upstream_and_CDS.fasta",
                "fasta")
    return
def subset_viruses_nextstrain_build(virus, subtype, gene, window, min_seqs,
                                    year_max, year_min):
    """Group aligned strains into time windows and build an outgroup.

    Parameters
    ----------
    virus, subtype, gene
        Select the config entry and fill the ``{virus}``/``{subtype}``/
        ``{gene}`` fields of the reference, alignment and metadata paths.
        When the gene's config has a ``specify_location`` section, the files
        of its ``parent_gene`` are used and the region is cut out of them.
    window
        ``'all'`` for one window spanning every year, otherwise the window
        width in years. Windows cover ``start <= year < start + window`` and
        slide forward one year at a time.
    min_seqs
        Windows with fewer strains are discarded.
    year_max, year_min
        Optional bounds; when truthy, strains outside them are dropped.

    Returns
    -------
    tuple
        ``(virus_time_subset, alignment_time_subset, outgroup_seq,
        outgroup_seq_aa, year_windows, seqs_in_window)``: strain names and
        sequences keyed by ``"<start>-<end>"``, the gap consensus of the
        first window and its translation, and the window labels with their
        strain counts.
    """
    configs = readin_virus_config(virus)
    standard_gene = standardize_gene_name(virus, gene)
    gene_config = configs[standard_gene]
    has_specified_location = 'specify_location' in gene_config

    #Find reference, alignment and meta files (some sub-genic regions may use files from a gene or a whole genome)
    if has_specified_location:
        file_gene = gene_config['specify_location']['parent_gene']
    else:
        file_gene = gene
    reference_file = configs['reference_file'].format(
        virus=virus, subtype=subtype, gene=file_gene)
    alignment_file = configs['alignment_file'].format(
        virus=virus, subtype=subtype, gene=file_gene)
    meta_file = configs['meta_file'].format(
        virus=virus, subtype=subtype, gene=file_gene)
    #some are comma-separated, some are tab-separated
    metafile_sep = configs['metafile_sep']

    #Find gene location, if domain is sub-genic or reference file contains multiple genes
    gene_location = False
    if has_specified_location:
        #If domain is sub-genic, fetch its position (within genome or parent gene) from config file
        if subtype is None:
            gene_location_key = "location"
        else:
            gene_location_key = "location_" + str(subtype)
        gene_location_list = ast.literal_eval(
            gene_config['specify_location'][gene_location_key])
        #Need to deal with domains the are not contiguous
        if len(gene_location_list) == 1:
            gene_location = SeqFeature(
                FeatureLocation(gene_location_list[0][0],
                                gene_location_list[0][1]))
        else:
            gene_location = CompoundLocation([
                FeatureLocation(location[0], location[1])
                for location in gene_location_list
            ])
    else:
        #Find gene location from reference files
        for seq_record in SeqIO.parse(reference_file, "genbank"):
            for feature in seq_record.features:
                if feature.type != 'CDS':
                    continue
                if 'gene' in feature.qualifiers:
                    if feature.qualifiers['gene'][0].lower() == gene.lower():
                        gene_location = feature.location
                elif feature.qualifiers['product'][0].lower() == gene.lower():
                    gene_location = feature.location

    #Subset data based on time windows
    meta = pd.read_csv(meta_file, sep=metafile_sep)
    meta.drop(meta[meta['date'] == '?'].index, inplace=True)
    meta.dropna(subset=['date'], inplace=True)
    meta['year'] = meta['date'].str[:4].astype('int')
    if year_max:
        meta.drop(meta[meta['year'] > year_max].index, inplace=True)
    if year_min:
        meta.drop(meta[meta['year'] < year_min].index, inplace=True)

    #Remove egg- and cell-passaged strains
    meta.drop(meta[meta['strain'].str[-4:] == '-egg'].index, inplace=True)
    meta.drop(meta[meta['strain'].str[-5:] == '-cell'].index, inplace=True)

    #Limit meta data to only strains in alignment file
    with open(alignment_file, "r") as aligned_handle:
        aligned_isolates = [
            isolate.id for isolate in SeqIO.parse(aligned_handle, "fasta")
        ]
    aligned_isolates_df = pd.DataFrame(aligned_isolates, columns=['strain'])
    meta = meta.merge(aligned_isolates_df, on='strain', how='inner')

    #Group viruses by time windows
    virus_time_subset = {}
    if window == 'all':
        years = str(meta['year'].min()) + '-' + str(meta['year'].max())
        virus_time_subset[years] = meta['strain'].tolist()
    else:
        date_window_start = meta['year'].min()
        date_window_end = meta['year'].min() + window
        while date_window_end <= meta['year'].max():
            years = str(date_window_start) + '-' + str(date_window_end)
            strains = meta[(meta['year'] >= date_window_start)
                           & (meta['year'] < date_window_end)]['strain'].tolist()
            virus_time_subset[years] = strains
            #sliding window
            date_window_end += 1
            date_window_start += 1

    #Only use time points with enough data:
    virus_time_subset = {
        k: v for k, v in virus_time_subset.items() if len(v) >= min_seqs
    }

    year_windows = []
    seqs_in_window = []
    #Outgroup sequence is the consensus of the strains at the first time point
    first_window_sequences = []
    alignment_time_subset = {}

    for window_index, (years, subset_viruses) in enumerate(
            virus_time_subset.items()):
        year_windows.append(years)
        seqs_in_window.append(len(subset_viruses))
        window_sequences = alignment_time_subset[years] = []

        # Outgroup records are collected on the first pass only. They used to
        # be re-appended on every window pass, which multiplied the rows of
        # the outgroup alignment without changing the column proportions the
        # consensus is computed from.
        is_first_window = window_index == 0
        # Set lookup instead of scanning the strain list for every record.
        window_strains = set(subset_viruses)

        with open(alignment_file, "r") as aligned_handle:
            for isolate in SeqIO.parse(aligned_handle, "fasta"):
                if isolate.id not in window_strains:
                    continue
                if gene_location:
                    isolate_seq = gene_location.extract(isolate.seq)
                else:
                    isolate_seq = isolate.seq
                window_sequences.append(isolate_seq)
                if is_first_window:
                    first_window_sequences.append(
                        SeqRecord(seq=isolate_seq,
                                  id=isolate.id,
                                  description=gene))

    first_window_alignment = MultipleSeqAlignment(first_window_sequences)
    outgroup_seq = AlignInfo.SummaryInfo(first_window_alignment).gap_consensus(
        ambiguous='N')
    outgroup_seq_aa = outgroup_seq.translate()

    return virus_time_subset, alignment_time_subset, outgroup_seq, outgroup_seq_aa, year_windows, seqs_in_window
def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10,
                     lipobox_maxdist=60):
    """Yield GFF3 records reduced to the genes that carry a lipobox motif.

    The first CDS of every gene is translated with table 11 and searched for
    ``[ACGSILMFTV][^REKD][GASNL]C`` preceded by ``lipobox_mindist`` to
    ``lipobox_maxdist`` residues from the start of the protein. For each
    match a ``Lipobox`` sub-feature covering the four motif residues is
    appended to the gene, and the gene is kept.

    gff3_file       -- GFF3 annotations, parsed with ``GFF.parse``
    fasta_genome    -- FASTA file providing the sequences for those records
    lipobox_mindist -- minimum number of residues before the motif
    lipobox_maxdist -- maximum number of residues before the motif

    Yields a one-element list ``[record]`` per GFF3 record, with
    ``record.features`` replaced by the matching genes. Genes whose CDS
    cannot be translated as a complete CDS are skipped.
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))

    CASES = [
        re.compile('^.{%s,%s}[ACGSILMFTV][^REKD][GASNL]C' %
                   (lipobox_mindist, lipobox_maxdist)),
        # re.compile('^.{%s,%s}AWAC' % (lipobox_mindist, lipobox_maxdist)),
        # Make sure to not have multiple cases that share matches, will introduce duplicate features into gff3 file
    ]

    for record in GFF.parse(gff3_file, base_dict=seq_dict):
        good_features = []

        # Materialise first: gene.sub_features is appended to inside the loop.
        genes = list(
            feature_lambda(record.features,
                           feature_test_type, {'type': 'gene'},
                           subfeatures=True))
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features,
                               feature_test_type, {'type': 'CDS'},
                               subfeatures=False))
            if not cdss:
                continue

            # Someday this will bite me in the arse.
            cds = cdss[0]

            try:
                tmpseq = str(
                    cds.extract(record.seq).translate(table=11,
                                                      cds=True)).replace("*", "")
            except Exception:
                # Best effort: a CDS that does not translate cleanly is
                # skipped. Narrowed from a bare ``except`` so that
                # KeyboardInterrupt and SystemExit still propagate.
                continue

            for case in CASES:
                m = case.search(tmpseq)
                if not m:
                    continue

                # The motif is the last four residues of the match; convert
                # residue offsets to nucleotide coordinates on the CDS strand.
                if cds.location.strand > 0:
                    start = cds.location.start + (3 * (m.end() - 4))
                    end = cds.location.start + (3 * m.end())
                else:
                    start = cds.location.end - (3 * (m.end() - 4))
                    end = cds.location.end - (3 * m.end())

                tmp = SeqFeature(FeatureLocation(min(start, end),
                                                 max(start, end),
                                                 strand=cds.location.strand),
                                 type='Lipobox',
                                 qualifiers={
                                     'source': 'CPT_LipoRy',
                                     'ID': '%s.lipobox' % get_id(gene),
                                 })
                tmp.qualifiers['sequence'] = str(
                    tmp.extract(record).seq.translate())

                gene.sub_features.append(tmp)
                good_features.append(gene)

        record.features = good_features
        yield [record]
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Add the gene, mRNA and CDS features of ``gene`` to record ``r``.

    r             -- record whose ``features`` list is extended and whose
                     ``seq`` is used to translate the CDS
    gene          -- gene identifier; key into ``gene2position``
    gene2position -- maps gene to (contig, CDSs, strand, function, frames)
    gene2product  -- maps gene to product name; "hypothetical protein" is
                     used when the gene is absent
    start, end    -- gene boundaries used for the gene and partial-CDS
                     locations
    gcode         -- translation table passed to ``translate``
    partialyes    -- when false, partial CDSs are not added
    verbose       -- when true, warnings are written to stderr

    The gene feature is only added for identifiers ending in ``.t1``. The
    CDS is skipped when its translation contains an internal stop codon.
    Returns ``r``.
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    strand = +1 if gffstrand in ('1','+') else -1
    if strand == -1:
        CDSs.reverse()
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)

    geneid = gene
    product = gene2product[geneid] if geneid in gene2product else "hypothetical protein"

    def basic_qualifiers():
        # A fresh dict per feature so that features never share qualifiers.
        return {"locus_tag": geneid, "gene": geneid, "product": product}

    # gene feature
    if gene.endswith('.t1'):
        gene_feature = SeqFeature(
            FeatureLocation(BeforePosition(start-1), AfterPosition(end)),
            strand=strand, type='gene', id=geneid)
        gene_feature.qualifiers = basic_qualifiers()
        r.features.append(gene_feature)

    # mRNA feature
    mrna_feature = SeqFeature(mrnaloc, type='mRNA', id=gene)
    mrna_feature.qualifiers = basic_qualifiers()
    r.features.append(mrna_feature)

    # CDS feature and its translation
    cds_feature = SeqFeature(cdsloc, type='CDS', id=gene)
    cds_seq = cds_feature.extract(r.seq)
    aa = str(cds_seq.translate(table=gcode))

    # trim the boundary by the bases that do not fill a whole codon
    overhang = len(cds_seq) % 3
    if overhang:
        if strand == 1:
            end -= overhang
        else:
            start += overhang

    # partial sequence: no M as first or no * as last amino acid
    no_start = aa[0] != "M"
    no_stop = aa[-1] != "*"
    partial = True
    if no_start and no_stop:
        # both ends partial
        cds_feature.location = FeatureLocation(BeforePosition(start-1),
                                               AfterPosition(end))
    elif (no_start and strand == 1) or (no_stop and strand == -1):
        # left end partial
        cds_feature.location = FeatureLocation(BeforePosition(start-1), end)
    elif (no_stop and strand == 1) or (no_start and strand == -1):
        # right end partial
        cds_feature.location = FeatureLocation(start-1, AfterPosition(end))
    else:
        partial = False

    # drop terminal stop codon(s); anything left over is an internal stop
    aa = aa.strip("*")
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r

    cds_feature.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": aa,
    }
    if function:
        cds_feature.qualifiers['note'] = function

    if partial:
        # partial entries are only kept on request
        if not partialyes:
            return r
        if aa[0] != "M":
            cds_feature.qualifiers['codon_start'] = 1
        cds_feature.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))

    r.features.append(cds_feature)
    return r
def write_data_to_seq_record(pksnrpsvars, seq_record, options):
    """Save substrate specificity predictions for NRPS/PKS domains in seq_record.

    For every gene in ``pksnrpsvars.pksnrpscoregenes`` each ``sec_met``
    qualifier containing "NRPS/PKS Domain:" is parsed.  A domain feature of
    type ``options.FeatureTags.pksnrpsdomains_tag`` is built for it and added
    to ``seq_record.features``, and the qualifier text is extended with the
    predictions stored in ``pksnrpsvars`` for AMP-binding, PKS_AT, CAL_domain
    and PKS_KR domains.  Other qualifiers are kept unchanged.

    Afterwards, every cluster with an entry in
    ``pksnrpsvars.compound_pred_dict`` gets the monomer prediction and a
    structure image path appended to its ``note`` qualifier.

    Calls ``sys.exit(1)`` when a domain qualifier cannot be parsed.
    """
    domain_pattern = (r"NRPS/PKS Domain: ([\w-]+) \((\d+)\-(\d+)\)\. "
                      r"E-value: ([\de\.-]+)\. Score: ([\de\.a-]+);")

    # Workaround to extract positional information for CDS_motifs from the sec_met qualifiers
    # NOTE(review): cluster_info is rebound on every iteration, so after the
    # loop it holds the qualifiers of the last cluster only; with no cluster
    # features it stays undefined and the PKS_AT branch raises NameError.
    for f in utils.get_cluster_features(seq_record):
        cluster_info = f.qualifiers

    for feature in pksnrpsvars.pksnrpscoregenes:
        nrat = 0
        nra = 0
        nrcal = 0
        nrkr = 0
        nrXdom = 0
        secmetqualifiers = feature.qualifiers['sec_met']
        updated_secmetqualifiers = []
        # BiosynML: creating object to add detailed substrate predictions
        updated_secmetqualifiers_predictions = []
        domainFeatures = []
        gene_id = utils.get_gene_id(feature)

        for qualifier in secmetqualifiers:
            if "NRPS/PKS Domain:" not in qualifier:
                updated_secmetqualifiers.append(qualifier)
                updated_secmetqualifiers_predictions.append(qualifier)
                continue

            # extract domain type, start and end position from qualifier string
            match_pos_obj = re.search(domain_pattern, qualifier)
            if not match_pos_obj:
                # Not inside an exception handler, so log as a plain error.
                logging.error("Exception: could not extract domain string from qualifier %s:" % qualifier)
                sys.exit(1)
            domain_type = match_pos_obj.group(1)
            start_aa = int(match_pos_obj.group(2))
            end_aa = int(match_pos_obj.group(3))
            evalue = float(match_pos_obj.group(4))
            score = float(match_pos_obj.group(5))

            # calculate respective positions based on aa coordinates
            if feature.location.strand == 1:
                start = feature.location.start + (3 * start_aa)
                end = feature.location.start + (3 * end_aa)
            else:
                end = feature.location.end - (3 * start_aa)
                start = feature.location.end - (3 * end_aa)
            loc = FeatureLocation(start, end, strand=feature.strand)

            # set up new CDS_motif feature
            domainFeature = SeqFeature(loc, type=options.FeatureTags.pksnrpsdomains_tag)
            domainFeature.qualifiers['domain'] = [domain_type]
            if 'locus_tag' in feature.qualifiers:
                domainFeature.qualifiers['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                domainFeature.qualifiers['locus_tag'] = [gene_id]
            domainFeature.qualifiers['detection'] = ["hmmscan"]
            domainFeature.qualifiers['database'] = ["nrpspksdomains.hmm"]
            domainFeature.qualifiers['evalue'] = [str("{:.2E}".format(float(evalue)))]
            domainFeature.qualifiers['score'] = [score]
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            domainFeature.qualifiers['translation'] = [
                str(domainFeature.extract(seq_record).seq.translate(table=transl_table))]

            domainFeature_specificity = []

            if domain_type == "AMP-binding":
                nra += 1
                domainname = gene_id + "_A" + str(nra)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("NRPSpredictor2 SVM: %s" % pksnrpsvars.nrps_svm_preds[domainname])
                domainFeature_specificity.append("Stachelhaus code: %s" % pksnrpsvars.nrps_code_preds[domainname])
                domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_nrps_preds[domainname])
                domainFeature_specificity.append("consensus: %s" % pksnrpsvars.consensuspreds[domainname])
                newqualifier = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (
                    domainname,
                    pksnrpsvars.nrps_svm_preds[domainname],
                    pksnrpsvars.nrps_code_preds[domainname],
                    pksnrpsvars.minowa_nrps_preds[domainname],
                    pksnrpsvars.consensuspreds[domainname])
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                # Argument order follows the labels: SVM first, then Stachelhaus.
                newqualifier_detailed = qualifier + " NRPS/PKS Domain: %s; Substrate specificity predictions: %s (NRPSPredictor2 SVM), %s (Stachelhaus code), %s (Minowa), %s (consensus);" % (
                    domainname,
                    pksnrpsvars.nrps_svm_preds_details[domainname],
                    pksnrpsvars.nrps_code_preds_details[domainname],
                    pksnrpsvars.minowa_nrps_preds_details[domainname],
                    pksnrpsvars.consensuspreds[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)

            elif domain_type == "PKS_AT":
                nrat += 1
                domainname = gene_id + "_AT" + str(nrat)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("PKS signature: %s" % pksnrpsvars.pks_code_preds[domainname])
                domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_pks_preds[domainname])
                # transatpks uses its own consensus table; t1pks, t2pks and
                # t3pks use the general one.
                if 'transatpks' in cluster_info['product'][0]:
                    consensus = pksnrpsvars.consensuspreds_transat[domainname]
                else:
                    consensus = pksnrpsvars.consensuspreds[domainname]
                domainFeature_specificity.append("consensus: %s" % consensus)
                newqualifier = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (
                    pksnrpsvars.pks_code_preds[domainname],
                    pksnrpsvars.minowa_pks_preds[domainname],
                    consensus)
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (PKS signature), %s (Minowa), %s (consensus);" % (
                    pksnrpsvars.pks_code_preds_details[domainname],
                    pksnrpsvars.minowa_pks_preds_details[domainname],
                    consensus)
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)

            elif domain_type == "CAL_domain":
                nrcal += 1
                domainname = gene_id + "_CAL" + str(nrcal)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("Minowa: %s" % pksnrpsvars.minowa_cal_preds[domainname])
                newqualifier = qualifier + " Substrate specificity predictions: %s (Minowa);" % (pksnrpsvars.minowa_cal_preds[domainname])
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + " Substrate specificity predictions: %s (Minowa);" % (pksnrpsvars.minowa_cal_preds_details[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)

            elif domain_type == "PKS_KR":
                nrkr += 1
                domainname = gene_id + "_KR" + str(nrkr)
                domainFeature.qualifiers['label'] = [domainname]
                domainFeature.qualifiers['asDomain_id'] = ["nrpspksdomains_" + domainname]
                domainFeature_specificity.append("KR activity: %s" % pksnrpsvars.kr_activity_preds[domainname])
                domainFeature_specificity.append("KR stereochemistry: %s" % pksnrpsvars.kr_stereo_preds[domainname])
                newqualifier = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (
                    pksnrpsvars.kr_activity_preds[domainname],
                    pksnrpsvars.kr_stereo_preds[domainname])
                # BiosynML: appending substrate prediction data into 'newqualifier_detailed'
                newqualifier_detailed = qualifier + " Predicted KR activity: %s; Predicted KR stereochemistry: %s;" % (
                    pksnrpsvars.kr_activity_preds[domainname],
                    pksnrpsvars.kr_stereo_preds[domainname])
                updated_secmetqualifiers.append(newqualifier)
                updated_secmetqualifiers_predictions.append(newqualifier_detailed)

            else:
                # Any other domain type: numbered per gene, qualifier kept as is.
                nrXdom += 1
                domainFeature.qualifiers['asDomain_id'] = [
                    "nrpspksdomains_" + gene_id.partition(".")[0] + "_Xdom" + '{:02d}'.format(nrXdom)]
                updated_secmetqualifiers.append(qualifier)

            domainFeature.qualifiers['specificity'] = domainFeature_specificity
            if _map_domaintype(domain_type):
                domainFeature.qualifiers['domain_subtype'] = [domain_type]
                domainFeature.qualifiers['domain'] = [_map_domaintype(domain_type)]
            domainFeatures.append(domainFeature)

        feature.qualifiers['sec_met'] = updated_secmetqualifiers
        # BiosynML: creating new 'sec_met_predictions' qualifier
        #feature.qualifiers['sec_met_predictions'] = updated_secmetqualifiers_predictions
        seq_record.features.extend(domainFeatures)

        if gene_id in pksnrpsvars.consensuspred_gene_dict:
            feature.qualifiers[options.QualifierTags.product_prediction] = "-".join(
                pksnrpsvars.consensuspred_gene_dict[gene_id])

    #Save consensus structure + link to structure image to seq_record
    clusters = utils.get_cluster_features(seq_record)
    for cluster in clusters:
        clusternr = utils.get_cluster_number(cluster)
        if clusternr in pksnrpsvars.compound_pred_dict:
            structpred = pksnrpsvars.compound_pred_dict[clusternr]
            cluster.qualifiers['note'].append("Monomers prediction: " + structpred)
            cluster.qualifiers['note'].append("Structure image: structures/genecluster%s.png" % clusternr)
def find_ORFs(seqBio, codonTable, startCodons=('ATG', 'GTG', 'TTG'), verbose=0):
    """
    Find all putative open reading frames (ORFs) in the nucleotide sequence.

    We assume that an ORF start with one of the start codons as defined in
    the argument list and ends with a stop codon as defined from the codon
    table. Every start codon found between two consecutive stop codons of a
    frame yields its own ORF, so ORFs sharing a stop codon are all reported.
    Both strands and all three frames are searched.

    Note that the expression levels from other non-canonical start codons
    different from ATG, GTG or TTG is extremely low, as shown in E. coli [1].

    [1] Hecht, A., Glasgow, J., Jaschke, P. R., Bawazer, L. A., Munson, M. S.,
    Cochran, J. R., … Salit, M. (2017). Measurements of translation initiation
    from all 64 codons in E. coli. Nucleic Acids Research, 1–12.
    http://doi.org/10.1093/nar/gkx070

    Arguments:
        seqBio      -- nucleotide sequence; must support len(), slicing,
                       reverse_complement() and translate()
        codonTable  -- codon table providing ``stop_codons``; also passed to
                       translate()
        startCodons -- collection of codon strings accepted as ORF start
        verbose     -- debugging output is printed when >= 2

    Returns a list of SeqFeature objects of type 'putative ORF' whose
    location includes the stop codon and is expressed in seqBio coordinates.
    Each carries 'translation', 'start_codon' and 'stop_codon' qualifiers.

    NOTE(review): the ORF is translated with cds=True; presumably this makes
    Biopython validate the start codon against codonTable -- confirm that
    every entry of startCodons is a start codon in the table used.
    """
    ORFList = []
    for strand, seq in [(+1, seqBio), (-1, seqBio.reverse_complement())]:
        seqLength = len(seq)
        # We define frame relative to the start of the seq
        for frame in range(3):
            if verbose >= 2:
                # The frame translation is only needed for this debug output.
                translatedSeq = seq[frame:].translate(codonTable)
                print("\n\n#### seq", seq, "\nframe", frame,
                      "\ntranslatedSeq", translatedSeq)
            codonList = list(
                extract_codons_list(seq, frame=frame,
                                    checkLengthMultipleOf3=False))

            # Find positions of all stop codons (compared as strings, the
            # same way start codons are compared below)
            iCodonStopList = [
                iCodon for iCodon, codon in enumerate(codonList)
                if str(codon) in codonTable.stop_codons
            ]
            if verbose >= 2:
                print(iCodonStopList)
                print([codonList[i] for i in iCodonStopList])

            # In each subsequence in between stop codons, search for start codons
            iCodonFirstInChunk = 0
            for iCodonStop in iCodonStopList:
                if verbose >= 2:
                    print("iCodonFirstInChunk", iCodonFirstInChunk,
                          "iCodonStop", iCodonStop)
                codonListChunk = codonList[iCodonFirstInChunk:iCodonStop + 1]
                ORFStopCodon = codonList[iCodonStop]
                if verbose >= 2:
                    print("codonListChunk", [str(c) for c in codonListChunk])

                # Find the start codon positions inside the chunk
                iCodonStartList = [
                    i for i, c in enumerate(codonListChunk)
                    if str(c) in startCodons
                ]
                if verbose >= 2:
                    print("iCodonStartList", iCodonStartList)

                for iStart in iCodonStartList:
                    if verbose >= 2:
                        print("iStart", iStart)
                    iCodonInSeq = iCodonFirstInChunk + iStart
                    ORFStartCodon = codonList[iCodonInSeq]
                    if verbose >= 2:
                        print("iCodonInSeq", iCodonInSeq,
                              "codonList[iCodonInSeq]", codonList[iCodonInSeq])

                    # Position of start and stop codons in the nucleotide sequence
                    iStartInNucleotideSeq = iCodonInSeq * 3 + frame
                    iStopInNucleotideSeq = iCodonStop * 3 + frame
                    if verbose >= 2:
                        print("iStartInNucleotideSeq", iStartInNucleotideSeq,
                              "iStopInNucleotideSeq", iStopInNucleotideSeq)

                    # Define ORF region on the nucleotide sequence, always in
                    # forward (seqBio) coordinates with an exclusive end.
                    if strand == +1:
                        ORFStart = iStartInNucleotideSeq
                        ORFEnd = iStopInNucleotideSeq + 2 + 1
                    else:
                        # Positions were found on the reverse complement:
                        # mirror them back; the stop codon is included.
                        ORFStart = (seqLength - 1) - (iStopInNucleotideSeq + 2)
                        ORFEnd = (seqLength - 1) - iStartInNucleotideSeq + 1
                    if verbose >= 2:
                        print("ORFStart", ORFStart, "ORFEnd", ORFEnd,
                              "strand", strand)

                    # Define ORF as Biopython SeqFeature
                    ORFFeat = SeqFeature(
                        location=FeatureLocation(ORFStart, ORFEnd, strand=strand),
                        type='putative ORF', id=None)

                    # Compute translation of ORF
                    ORFFeat.qualifiers['translation'] = ORFFeat.extract(
                        seqBio).translate(table=codonTable, cds=True)
                    ORFFeat.qualifiers['start_codon'] = str(ORFStartCodon)
                    ORFFeat.qualifiers['stop_codon'] = str(ORFStopCodon)

                    if verbose >= 2:
                        print("ORFFeat", ORFFeat)
                        print("ORFFeat.extract(seqBio)", ORFFeat.extract(seqBio))

                    ORFList.append(ORFFeat)

                # The next chunk starts right after this stop codon.
                iCodonFirstInChunk = iCodonStop + 1

    return ORFList
class TestFeatureMatch(unittest.TestCase):
    """Tests for FeatureMatch attributes, string form and FASTA output."""

    def setUp(self):
        # Gene at 6..27 of a 33 bp sequence; the match is built from the
        # sequence trimmed by 3 bp on each side, strand 1 and the value 3.
        dna = Seq("CCCAAAATGTACTCCACTATCTGCTGATTTGGG", generic_dna)
        gene = SeqFeature(FeatureLocation(6, 27), type="gene", strand=1)
        self.seq = dna
        self.feature = gene
        self.feature_seq = dna[3:-3]
        self.match = FeatureMatch(gene, self.feature_seq, 1, 3)

    def _assert_fasta(self, produced, payload):
        # Shared check: a FASTA record is the header, a newline, the payload.
        wanted = "%s\n%s" % (self.match.get_fasta_header(), payload)
        self.assertMultiLineEqual(produced, wanted)

    def test__init(self):
        "Test FeatureMatch object creation"
        # forward strand
        forward = self.match
        self.assertEqual(forward.direction, "forward")
        self.assertEqual(str(forward.dna), "ATGTACTCCACTATCTGCTGA")
        self.assertEqual(str(forward.long_dna), str(self.feature_seq))
        self.assertEqual(str(forward.promotor_region), "AAA")
        self.assertEqual(str(forward.terminator_region), "TTT")
        self.assertEqual(str(forward.mrna),
                         str(self.feature_seq.transcribe()))
        forward_protein = self.feature.extract(self.seq).translate(to_stop=True)
        self.assertEqual(str(forward.aas), str(forward_protein))

        # reverse strand
        inv_seq = self.seq.reverse_complement()
        inv_feature_seq = inv_seq[3:-3]
        inv_feature = SeqFeature(FeatureLocation(6, 27), type="gene",
                                 strand=-1)
        reverse = FeatureMatch(inv_feature, inv_feature_seq, -1, 3)
        self.assertEqual(reverse.direction, "reverse")
        self.assertEqual(str(reverse.dna),
                         str(inv_feature_seq[3:-3].reverse_complement()))
        self.assertEqual(str(reverse.long_dna), str(self.feature_seq))
        self.assertEqual(str(reverse.promotor_region), "AAA")
        self.assertEqual(str(reverse.terminator_region), "TTT")
        self.assertEqual(str(reverse.mrna),
                         str(inv_feature_seq.transcribe().reverse_complement()))
        reverse_protein = inv_feature.extract(inv_seq).translate(to_stop=True)
        self.assertEqual(str(reverse.aas), str(reverse_protein))

    def test_get_fasta_header(self):
        "Test FeatureMatch FASTA header creation"
        self.assertEqual(self.match.get_fasta_header(), ">untagged")

        # Qualifiers are added one after another; each step changes the header.
        steps = (
            ('gene', 'fake', ">fake"),
            ('locus_tag', 'FAKE_0001', ">FAKE_0001"),
            ('product', 'Mup1', ">FAKE_0001|Mup1"),
            ('protein_id', 'MUP_0001', ">FAKE_0001|Mup1|MUP_0001"),
        )
        for key, value, expected in steps:
            self.match.feature.qualifiers[key] = [value]
            self.assertEqual(self.match.get_fasta_header(), expected)

    def test__str_(self):
        "Test FeatureMatch string representation"
        # NOTE(review): the expected blocks are single-line here; confirm the
        # whitespace matches what FeatureMatch.__str__ actually emits.
        self.assertMultiLineEqual(
            str(self.match),
            """Feature: untagged Strand: forward DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC""")

        steps = (
            ('gene', "fake",
             """Feature: Tag: fake Strand: forward DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""),
            ('locus_tag', 'FAKE_0001',
             """Feature: Tag: FAKE_0001 Strand: forward DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""),
            ('product', 'Mup1',
             """Feature: Tag: FAKE_0001 Strand: forward Product: Mup1 DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""),
            ('protein_id', 'MUP_0001',
             """Feature: Tag: FAKE_0001 Strand: forward Product: Mup1 Protein ID: MUP_0001 DNA: ATGTACTCCACTATCTGCTGA mRNA: AAAAUGUACUCCACUAUCUGCUGAUUU Protein: MYSTIC"""),
        )
        for key, value, expected in steps:
            self.match.feature.qualifiers[key] = [value]
            self.assertMultiLineEqual(str(self.match), expected)

    def test_dna_fasta(self):
        "Test FeatureMatch DNA FASTA output"
        self._assert_fasta(self.match.dna_fasta(), self.match.dna)

    def test_long_dna_fasta(self):
        "Test FeatureMatch long DNA FASTA output"
        self._assert_fasta(self.match.long_dna_fasta(), self.match.long_dna)

    def test_mrna_fasta(self):
        "Test FeatureMatch mRNA FASTA output"
        self._assert_fasta(self.match.mrna_fasta(), self.match.mrna)

    def test_protein_fasta(self):
        "Test FeatureMatch protein FASTA output"
        self._assert_fasta(self.match.protein_fasta(), self.match.aas)

    def test_promotor_fasta(self):
        "Test FeatureMatch promotor DNA FASTA output"
        self._assert_fasta(self.match.promotor_fasta(),
                           self.match.promotor_region)

    def test_terminator_fasta(self):
        "Test FeatureMatch terminator DNA FASTA output"
        self._assert_fasta(self.match.terminator_fasta(),
                           self.match.terminator_region)
def _annotate(seq_record, options, results):
    """Annotate seq_record with CDS_motifs for the result.

    For every HSP in every entry of ``results`` a feature of type
    ``options.FeatureTags.fullhmmer_tag`` is appended to
    ``seq_record.features``.  HSPs are skipped when their bitscore is not
    above the minimum score, their e-value is not below the maximum e-value
    (both derived from ``options``), or their query id has no entry in the
    feature dictionary of ``seq_record``.

    The ``asDomain_id`` counter restarts at 1 for each entry of ``results``
    and only advances for HSPs that produce a feature.  A ``db_xref``
    qualifier is added only when the hit id is known to ``name_to_pfamid``.
    """
    logging.debug("generating feature objects for PFAM hits")
    min_score = _min_score(options)
    max_evalue = _max_evalue(options)

    feature_by_id = utils.get_feature_dict(seq_record)

    for r in results:
        i = 1
        for hsp in r.hsps:
            if hsp.bitscore <= min_score or hsp.evalue >= max_evalue:
                continue

            if hsp.query_id not in feature_by_id:
                continue

            feature = feature_by_id[hsp.query_id]

            start, end = _calculate_start_end(feature, hsp)
            loc = FeatureLocation(start, end, strand=feature.strand)

            newFeature = SeqFeature(location=loc, type=options.FeatureTags.fullhmmer_tag)

            quals = defaultdict(list)

            quals['label'].append(r.id)
            if 'locus_tag' in feature.qualifiers:
                quals['locus_tag'] = feature.qualifiers['locus_tag']
            else:
                quals['locus_tag'] = [hsp.query_id]
            quals['domain'] = [hsp.hit_id]
            quals['asDomain_id'] = ['fullhmmer_' + '_'.join(quals['locus_tag']) + '_' + '{:04d}'.format(i)]
            i += 1

            database = path.basename(r.target)
            quals['evalue'] = [str("{:.2E}".format(float(hsp.evalue)))]
            quals['score'] = [str(hsp.bitscore)]
            quals['aSTool'] = ["fullhmmer"]
            quals['detection'] = ["hmmscan"]
            quals['database'] = [database]
            if 'transl_table' in feature.qualifiers:
                [transl_table] = feature.qualifiers['transl_table']
            else:
                transl_table = 1
            quals['translation'] = [str(newFeature.extract(seq_record.seq).translate(table=transl_table))]

            quals['note'].append("%s-Hit: %s. Score: %s. E-value: %s. Domain range: %s..%s." %
                                 (database, hsp.hit_id, hsp.bitscore, hsp.evalue,
                                  hsp.hit_start, hsp.hit_end))

            quals['description'] = [hsp.hit_description]

            # Hits without a known PFAM id simply get no db_xref qualifier.
            try:
                pfamid = name_to_pfamid[hsp.hit_id]
            except KeyError:
                pass
            else:
                # quals is a fresh defaultdict(list), so appending creates the list.
                quals['db_xref'].append("PFAM: %s" % pfamid)

            newFeature.qualifiers = quals
            seq_record.features.append(newFeature)
type=typ, id=name) else: fea = SeqFeature(CompoundLocation([FeatureLocation(edge[0], edge[1], strand=+1) for edge in edges]), type=typ, id=name) seqnew.features.append(fea) print 'Sanity checks' for fea in seqnew.features: if fea.type in ('gene', 'protein'): feaseq = fea.extract(seqnew) # vpr has an additional T in HXB2 if fea.id == 'vpr': assert ((len(feaseq) - 1) % 3) == 0 T_pos = 5771 prot = (feaseq[: T_pos - fea.location.nofuzzy_start].seq + \ feaseq[T_pos + 1 - fea.location.nofuzzy_start:].seq).translate() else: assert (len(feaseq) % 3) == 0 prot = feaseq.seq.translate() # These genes contain premature stops in HXB2 if fea.id in ('tat', 'nef'): assert prot[-1] == '*'