def _findGeneInSeq(self, record): """Extract gene sequence from larger sequence (e.g. genomes) by searching features.""" if not record.features: # if there aren't any features, just return the record return record for feature in record.features: feature_names = [] if 'gene' in feature.qualifiers.keys(): feature_names.extend(feature.qualifiers['gene']) if 'gene_synonym' in feature.qualifiers.keys(): feature_names.extend(feature.qualifiers['gene_synonym']) if 'product' in feature.qualifiers.keys(): feature_names.extend(feature.qualifiers['product']) gene_names = [e.lower() for e in self.gene_names] feature_names = [e.lower() for e in feature_names] if set(gene_names) & set(feature_names): try: extractor = SeqFeature(feature.location) found_seq = extractor.extract(record) except ValueError: # catch value errors raised for sequences # with "fuzzy" positions # TODO: what are fuzzy positions and can I use # them? return record else: return found_seq return record
def t_write_from_recs(self):
    """Check the GFF3 text written for a SeqRecord holding a gene with two exons.
    """
    record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
    gene_quals = {"source": "prediction", "score": 10.0,
                  "other": ["Some", "annotations"], "ID": "gene1"}
    exon_quals = {"source": "prediction"}
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    # Both exons share the same qualifier dict, as in the hand-written version.
    gene.sub_features = [
        SeqFeature(FeatureLocation(lo, hi), type="exon", strand=1,
                   qualifiers=exon_quals)
        for lo, hi in ((0, 5), (15, 20))
    ]
    record.features = [gene]

    sink = StringIO.StringIO()
    GFF.write([record], sink)
    lines = sink.getvalue().split("\n")

    assert lines[0] == "##gff-version 3"
    assert lines[1] == "##sequence-region ID1 1 20"
    assert lines[2].split("\t") == ['ID1', 'prediction', 'gene', '1', '20',
                                    '10.0', '+', '.',
                                    'other=Some,annotations;ID=gene1']
    assert lines[3].split("\t") == ['ID1', 'prediction', 'exon', '1', '5',
                                    '.', '+', '.', 'Parent=gene1']
def mergeRecords(file):
    """Merge all records of a multi-record sequence file into a single record.

    Adapted from SeqHandler by NF Alikhan (github.com/happykhan/seqhandler),
    a script for merging, converting and splitting sequence files (Genbank,
    EMBL, fasta and others).  Please use it to merge multi-Genbank files
    before running bwast.py.

    Each input record is marked in the merged record by a ``fasta_record``
    feature whose ``note`` qualifier holds the record name.  Existing
    ``source`` features are dropped and replaced by one ``source`` feature
    spanning the merged sequence.  Name, description and annotations come
    from the first record.

    Args:
        file: path of the input sequence file.

    Returns:
        Path of the merged file: ``file`` with its extension replaced by
        ``.merged.fa``.  The file is written in the detected input format.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    filetype = determineFileType(file)  # determine file type
    with open(file, "r") as in_handle:
        records = list(SeqIO.parse(in_handle, filetype))

    mergingFile = records[0]
    contigs = SeqFeature(FeatureLocation(0, len(mergingFile)),
                         type="fasta_record", strand=1)
    contigs.qualifiers["note"] = records[0].name  # contig name of first contig
    mergingFile.features.append(contigs)

    for nextRecord in records[1:]:
        contigs = SeqFeature(
            FeatureLocation(len(mergingFile),
                            len(mergingFile) + len(nextRecord)),
            type="fasta_record", strand=1)
        contigs.qualifiers["note"] = nextRecord.name
        mergingFile.features.append(contigs)
        mergingFile += nextRecord

    mergingFile.name = records[0].name
    mergingFile.description = records[0].description
    mergingFile.annotations = records[0].annotations

    # Filter into a new list: removing items from a list while iterating over
    # it skips the element that follows each removal.
    mergingFile.features[:] = [feature for feature in mergingFile.features
                               if feature.type != 'source']
    contigs = SeqFeature(FeatureLocation(0, len(mergingFile)), type="source",
                         strand=1)
    mergingFile.features.insert(0, contigs)

    merged_file = re.sub(r"\.\w+$", r".merged.fa", file)
    with open(merged_file, "w") as out_handle:
        SeqIO.write(mergingFile, out_handle, filetype)
    return merged_file
def make_protein_feature(feature_name, feature_start, feature_end, feature_type):
    '''
    Build a SeqFeature of the given type spanning start..end (both passed
    through int()).  Only features of type "Region" get a qualifier dict,
    holding the feature name under 'name'.
    '''
    span = FeatureLocation(int(feature_start), int(feature_end))
    protein_feature = SeqFeature(span, type=feature_type)
    if feature_type != "Region":
        return protein_feature
    protein_feature.qualifiers = {'name': [feature_name]}
    return protein_feature
def test_translation_checks_cds(self):
    """Translating a CDS-typed feature must raise TranslationError for this sequence."""
    dna = Seq.Seq("GGTTACACTTACCGATAATGTCTCTGATGA", generic_dna)
    cds = SeqFeature(FeatureLocation(0, 30), type="CDS")
    cds.qualifiers['transl_table'] = [11]
    self.assertRaises(TranslationError, cds.translate, dna)
def mergeMod(args):
    """Merge every record of ``args.input`` into one record and write it out.

    Each input record is marked in the merged record by a ``fasta_record``
    feature whose ``note`` qualifier holds the record name.  Existing
    ``source`` features are replaced by a single one spanning the merged
    sequence.  Name, description and annotations come from the first record.

    Args:
        args: object providing ``input``, ``output``, ``inFormat``,
            ``outFormat``, ``accession`` and ``ver``.  ``accession``
            overrides the record name; ``ver`` sets the id to
            ``<name>.<ver>``; ``outFormat`` falls back to ``inFormat``
            when it is None.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    filetype = args.inFormat
    # Load file as SeqRecord
    with open(args.input, "r") as in_handle:
        recs = list(SeqIO.parse(in_handle, filetype))

    # For each SeqRecord, I.e. complete gbk annotation obj in file
    fgbk = recs[0]
    d = SeqFeature(FeatureLocation(0, len(fgbk)), type="fasta_record",
                   strand=1)
    d.qualifiers["note"] = recs[0].name
    fgbk.features.append(d)
    for rec in recs[1:]:
        d = SeqFeature(FeatureLocation(len(fgbk), len(fgbk) + len(rec)),
                       type="fasta_record", strand=1)
        d.qualifiers["note"] = rec.name
        fgbk.features.append(d)
        fgbk += rec

    fgbk.name = recs[0].name
    fgbk.description = recs[0].description
    fgbk.annotations = recs[0].annotations
    if args.accession is not None:
        fgbk.name = args.accession
    if args.ver is not None:
        fgbk.id = fgbk.name + "." + args.ver

    # Filter into a new list: removing items from a list while iterating over
    # it skips the element that follows each removal.
    fgbk.features[:] = [f for f in fgbk.features if f.type != "source"]
    d = SeqFeature(FeatureLocation(0, len(fgbk)), type="source", strand=1)
    fgbk.features.insert(0, d)

    outtype = filetype if args.outFormat is None else args.outFormat
    with open(args.output, "w") as out_handle:
        SeqIO.write(fgbk, out_handle, outtype)
def get_longest(seq_record, gene2isoforms):
    """Process the longest isoform of every gene according to ``args.format``.

    The longest isoform is the one whose parts have the largest summed
    length.  For 'bed' it is handed to compound_to_bed(); for 'fasta' it is
    extracted from ``seq_record`` and collected, with name, id and
    description set to the gene.  Returns the list of extracted records
    (empty unless the format is 'fasta').
    """
    extracted = []
    chrom = adjust_name(seq_record.name)
    for gene, isoforms in gene2isoforms.iteritems():
        longest = max(isoforms,
                      key=lambda iso: sum(len(part) for part in iso))
        if args.format == 'bed':
            compound_to_bed(longest, chrom, gene)
            continue
        if args.format != 'fasta':
            continue
        if len(longest) > 1:
            location = CompoundLocation(longest, operator="join")
        else:
            location = longest[0]
        utr = SeqFeature(location=location, type='utr',
                         strand=longest[0].strand)
        piece = utr.extract(seq_record)
        piece.name = piece.id = piece.description = gene
        extracted.append(piece)
    return extracted
def seqBlastToFeatures(self, blastDB, blastExe, seqFile, blastType = "blastn",scoreMin = 1e-3, logFile = None):
    '''
    Blast sequence file against blast database and parse the result into
    features.

    This function may not work as well on very large blast comparisons
    because it does a full read of the result for the conversion to features.

    blastType, scoreMin and logFile are forwarded to self.seqBlast; they
    used to be replaced by hard-coded defaults.  scoreMin is also the
    exclusive upper bound on hsp.expect for an HSP to become a feature.

    NOTE(review): blastDB and blastExe are accepted but only self.blastDB
    and self.blastExe are read (in the log line) -- confirm which is intended.

    Returns a list with one entry per blast record, each a list of
    SeqFeature objects located on the subject and carrying "query",
    "subject" and "alignment" qualifiers.
    '''
    print(">blast %s %s %s %s" % (self.blastDB, self.blastExe, seqFile, blastType))
    blastRecords = self.seqBlast(seqFile, blastType=blastType,
                                 scoreMin=scoreMin, logFile=logFile)
    result = []
    for r in blastRecords:
        recordFeatures = []
        for alignment in r.alignments:
            query = r.query
            for hsp in alignment.hsps:
                if hsp.expect < scoreMin:
                    # Second element of hsp.frame is used as the feature strand.
                    (ts, ss) = hsp.frame
                    location = FeatureLocation(hsp.sbjct_start, hsp.sbjct_end)
                    feature = SeqFeature(id=query, location=location,
                                         strand=ss)
                    aMatch = hsp.query + "\n" + hsp.match + "\n" + hsp.sbjct
                    feature.qualifiers["query"] = hsp.query
                    feature.qualifiers["subject"] = hsp.sbjct
                    feature.qualifiers["alignment"] = aMatch
                    recordFeatures.append(feature)
        result.append(recordFeatures)
    return result
def annotate_dna_reference(ref, protein):
    '''Build DNA-coordinate features from the secondary-structure table of a protein.

    Start and end of every table row are multiplied by 3 and offset by the
    start of the protein annotation on ref ('gag' is used for 'gagpol';
    'vpu' is shifted by a further 3).  For 'gagpol', coordinates at or past
    slippage_site are moved back by one.  Returns the list of features.
    '''
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    annotation_table = parse_secondary_structure(protein)
    is_gagpol = protein == 'gagpol'
    anchor = 'gag' if is_gagpol else protein
    start_protein = ref.annotation[anchor].location.nofuzzy_start
    if protein == 'vpu':
        # Uniprot starts one aa downstream of us
        start_protein = start_protein + 3

    def to_dna(aa_pos):
        pos = aa_pos * 3 + start_protein
        # Notice ribosomal slippage site
        if is_gagpol and pos >= slippage_site:
            pos -= 1
        return pos

    features = []
    for _, datum in annotation_table.iterrows():
        span = FeatureLocation(to_dna(datum['start']), to_dna(datum['end']))
        anno = SeqFeature(span, strand=+1)
        anno.type = datum['feature']
        features.append(anno)
    return features
def parse(self):
    """Read the FASTA file at ``self._file`` into one SeqRecord of gene features.

    Every record whose description matches the expected
    ``ref|id|:start-end|description| [gene=..] [locus_tag=..]`` layout
    becomes a ``gene`` feature; the first matching header also supplies the
    id and name of the returned record.  Non-matching headers are reported
    in ``self.errors`` and skipped.
    """
    with open(self._file) as handle:
        genbank = SeqRecord(Seq.UnknownSeq(0))
        header_pattern = re.compile(r"ref\|(?P<id>.*?)\|:(?P<start>[0-9]+)-(?P<end>[0-9]+)\|(?P<description>.*?)\|\s*\[gene=(?P<gene>\S+)\]\s*\[locus_tag=(?P<locus_tag>\S+)\]\s*")
        named_yet = False
        for record in SeqIO.parse(handle, "fasta"):
            header = record.description
            match = header_pattern.match(header)
            if match is None:
                self.errors.append("Invalid header: >" + header)
                continue
            if not named_yet:
                named_yet = True
                genbank.id = match.group("id")
                genbank.name = match.group("id")
            span = FeatureLocation(int(match.group("start")),
                                   int(match.group("end")))
            gene_feature = SeqFeature(span, type="gene")
            gene_feature.qualifiers = {
                "locus_tag": match.group("locus_tag"),
                "gene": match.group("gene"),
                "note": match.group("description"),
                "sequence": record.seq,
            }
            genbank.features.append(gene_feature)
        return genbank
    return None
def to_seqfeature(self):
    """Return a SeqFeature spanning start..end with this object's value as id.

    A 'confidence' qualifier is added when the object has that attribute.
    """
    span = FeatureLocation(self.start, self.end)
    domain_feature = SeqFeature(location=span, id=self.value)
    if not hasattr(self, 'confidence'):
        return domain_feature
    domain_feature.qualifiers['confidence'] = self.confidence
    return domain_feature
def test_deletion__overlapping_features(self):
    """Deleting the region shared by two overlapping CDS features."""
    # Example based on intersection of nei and arbB genes in MG1655.
    upstream = 'GCCCTGGCTGCCAGCA'
    shared = 'CTAG'
    downstream = 'GCCGACCGCTTCGG'
    full_seq = upstream + shared + downstream
    record = SeqRecord(Seq(full_seq, generic_dna))

    cds_1 = SeqFeature(
        FeatureLocation(0, len(upstream) + len(shared), strand=1),
        type='CDS', id='1')
    record.features.append(cds_1)

    cds_2 = SeqFeature(
        FeatureLocation(len(upstream), len(full_seq), strand=1),
        type='CDS', id='2')
    record.features.append(cds_2)

    maker = VCFToGenbankMaker(record, None, None)
    maker._update_genome_record_for_variant(len(upstream), shared, '')

    # Assert the sequence is correct.
    self.assertEqual(upstream + downstream, str(record.seq))

    # Assert the feature annotations are still correct.
    self.assertEqual(upstream, str(cds_1.extract(record.seq)))
    self.assertEqual(downstream, str(cds_2.extract(record.seq)))
def create_gene_feature(gene_name, feature_location, feature_qualifiers):
    """Creates a minimal SeqFeature to represent a gene.

    Args:
        gene_name: stored as the single entry of the 'gene' qualifier.
        feature_location: location passed to SeqFeature.
        feature_qualifiers: mapping of extra qualifiers; its entries win
            over the default 'gene' qualifier on key collision.

    Returns:
        A SeqFeature of type 'gene'.
    """
    gene_feature = SeqFeature(feature_location, type='gene')
    # Build the merged dict with update(): concatenating .items() results
    # only works on Python 2, where they are lists.
    qualifiers = {'gene': [gene_name]}
    qualifiers.update(feature_qualifiers)
    gene_feature.qualifiers = qualifiers
    return gene_feature
def _get_feature(self, feature_dict):
    """Rebuild a Biopython feature from its dictionary representation.

    Reads the 'location' (unpacked into FeatureLocation), 'type', 'id',
    'strand' and 'quals' entries of ``feature_dict``.
    """
    rebuilt = SeqFeature(
        FeatureLocation(*feature_dict['location']),
        feature_dict['type'],
        id=feature_dict['id'],
        strand=feature_dict['strand'],
    )
    rebuilt.qualifiers = feature_dict['quals']
    return rebuilt
def get_gene_and_201bp_upstream(genefeature, genomeseq):
    """Extract a gene together with the 201 bp immediately upstream of it.

    Upstream means before the start on the forward strand and after the end
    on the reverse strand.

    Args:
        genefeature: feature whose ``location`` gives start, end and strand.
        genomeseq: sequence (or record) the feature is located on.

    Returns:
        The extracted sequence, as returned by ``SeqFeature.extract``.

    Raises:
        ValueError: if the feature strand is neither 1 nor -1 (previously
            this surfaced as an UnboundLocalError).
    """
    upstream = 201
    mystart = int(genefeature.location.start)
    myend = int(genefeature.location.end)
    mystrand = genefeature.location.strand
    if mystrand == 1:
        # Clamp at 0: a gene closer than 201 bp to the sequence start would
        # otherwise produce a negative, invalid start coordinate.
        location = FeatureLocation(max(0, mystart - upstream), myend)
    elif mystrand == -1:
        location = FeatureLocation(mystart, myend + upstream)
    else:
        raise ValueError(
            "cannot determine upstream region for strand %r" % (mystrand,))
    newfeature = SeqFeature(location, strand=mystrand)
    return newfeature.extract(genomeseq)
def makeSeqObjectsForTblastnNeighbors(tblastn_id, clusterrunid, cur, N=200000):
    """
    Given a TBLASTN ID, return a list of seq objects for the genes around
    the hit INCLUDING the TBLASTN hit itself (so that we can show that on
    the region drawing).

    The contig name inside the TBLASTN ID is sanitized; it is mapped back to
    the database name through getSanitizedContigList(cur).

    We pick an N large enough to get at least one gene, pick the closest
    one, get all of its neighbors with a call to
    makeSeqFeaturesForGeneNeighbors() and tack the TBLASTN hit onto them.
    When no gene lies within N nucleotides a warning is written to stderr
    and only the hit is returned.
    """
    # Get the contig and start/stop locations (which tell us the strand) out
    # of the TBLASTN id. The original comment notes this raises a ValueError
    # on failure, which the calling function can catch if needed.
    sanitizedToNot = getSanitizedContigList(cur)
    contig, start, stop = splitTblastn(tblastn_id)
    if contig in sanitizedToNot:
        contig = sanitizedToNot[contig]
    start = int(start)
    stop = int(stop)

    # Create a seq object for the TBLASTN hit itself
    # NOTE(review): for reverse-strand hits start > stop is passed to
    # FeatureLocation as-is -- confirm the Biopython version in use accepts it.
    strand = +1 if start < stop else -1
    tblastn_feature = SeqFeature(FeatureLocation(start, stop), strand=strand,
                                 id=tblastn_id)
    tblastn_feature.qualifiers["cluster_id"] = -1

    # Find the neighboring genes.
    neighboring_genes = getGenesInRegion(contig, start - N, stop + N, cur)
    if len(neighboring_genes) == 0:
        sys.stderr.write(
            "WARNING: No neighboring genes found for TBLASTN hit %s within %d nucleotides in contig %s\n"
            % (tblastn_id, N, contig)
        )
        return [tblastn_feature]
    neighboring_geneinfo = getGeneInfo(neighboring_genes, cur)

    # Find the gene closest to the hit (the first one wins on ties); only
    # genes strictly closer than N qualify.
    closest_gene = None
    closest_distance = N
    for geneinfo in neighboring_geneinfo:
        genestart = int(geneinfo[5])
        geneend = int(geneinfo[6])
        distance = min(abs(gene_edge - hit_edge)
                       for hit_edge in (start, stop)
                       for gene_edge in (genestart, geneend))
        if distance < closest_distance:
            closest_gene = geneinfo[0]
            closest_distance = distance

    # Get the clusters for its neighbors based on the specific clusterrunid.
    neighboring_features = makeSeqFeaturesForGeneNeighbors(closest_gene,
                                                           clusterrunid, cur)
    # Add the TBLASTN itself and return it.
    neighboring_features.append(tblastn_feature)
    return neighboring_features
def find_cds():
    """Locate the CDS named in the current record's description and extract it.

    The description of ``record_dict[keys]`` is split on ``|``; a field
    starting with ``CDS`` is expected to look like ``CDS:<start>-<end>``.
    The translation of each CDS found is printed; if several fields match,
    the last one is returned.

    Returns:
        ``(cds_start, cds_end, cds_sequence)`` with start and end as the
        strings found in the header.

    Raises:
        ValueError: if the description has no CDS field (previously an
            UnboundLocalError).
    """
    seq_des = str(record_dict[keys].description).split("|")
    found = None
    for field in seq_des:
        if re.match("CDS", field):
            feature, cds_start, cds_end = re.split(":|-", field)
            # Header coordinates are taken to be 1-based inclusive (as the
            # -1 on the start implies), so only the start shifts when
            # converting to a 0-based half-open location; also subtracting 1
            # from the end dropped the last base of the CDS.
            cds_feature = SeqFeature(
                FeatureLocation(int(cds_start) - 1, int(cds_end)),
                type=str(feature))
            cds_sequence = cds_feature.extract(record_dict[keys].seq)
            print(cds_sequence.translate())
            found = (cds_start, cds_end, cds_sequence)
    if found is None:
        raise ValueError(
            "no CDS field in description of record %r" % (keys,))
    return found
def make_seq_feature(start, end, ftype, quals=None):
    '''
    create a sequence feature from a start, end, and a type. additionally you
    may include other fields, like note, label, evidence, citation, as a dict.

    The feature is on the forward strand and always gets a 'source'
    qualifier of ['splicemod'].  quals is copied, so neither the caller's
    dict nor a shared default is modified (the former ``quals={}`` default
    was mutated and shared by every feature built without qualifiers).
    '''
    qualifiers = dict(quals) if quals else {}
    qualifiers['source'] = ['splicemod']
    seq_feature = SeqFeature(FeatureLocation(start, end), strand=+1,
                             type=ftype)
    seq_feature.qualifiers = qualifiers
    return seq_feature
def test_update_genome_record_for_variant__overlapping_features(self):
    """Replacing the region shared by two overlapping CDS features.
    """
    # Example based on intersection of nei and arbB genes in MG1655.
    before_overlap = 'GCCCTGGCTGCCAGCA'
    overlap = 'CTAG'
    after_overlap = 'GCCGACCGCTTCGG'
    raw_seq_str = before_overlap + overlap + after_overlap
    seq_record = SeqRecord(Seq(raw_seq_str, generic_dna))

    seq_record.features.append(SeqFeature(
        FeatureLocation(0, len(before_overlap) + len(overlap), strand=1),
        type='CDS', id='1'))
    seq_record.features.append(SeqFeature(
        FeatureLocation(len(before_overlap), len(raw_seq_str), strand=1),
        type='CDS', id='2'))

    maker = VCFToGenbankMaker(seq_record, None, None)
    overlap_replacement = 'TTAA'
    maker._update_genome_record_for_variant(
        len(before_overlap), overlap, overlap_replacement)

    # Features changed, so requery them (the last feature with an id wins).
    by_id = {}
    for feature in seq_record.features:
        if feature.id in ('1', '2'):
            by_id[feature.id] = feature
    feature_1 = by_id.get('1')
    feature_2 = by_id.get('2')
    assert feature_1
    assert feature_2

    # Assert the sequence is correct.
    self.assertEqual(before_overlap + overlap_replacement + after_overlap,
                     str(seq_record.seq))

    # Feature added to represent swap.
    # self.assertEqual(3, len(seq_record.features))

    # Assert the feature annotations are still correct.
    self.assertEqual(before_overlap + overlap_replacement,
                     str(feature_1.extract(seq_record.seq)))
    self.assertEqual(overlap_replacement + after_overlap,
                     str(feature_2.extract(seq_record.seq)))
def attach_features(predictions, seqrecord):
    """Append a CDS feature to ``seqrecord`` for each of its confident predictions.

    ``predictions`` is indexed by ``seqrecord.id``; only predictions with a
    ``raw_score`` of at least 1.0 are used.  Each feature carries the
    prediction's ``cds_id`` as its 'locus_tag' qualifier.
    """
    for candidate in predictions[seqrecord.id]:
        if candidate.raw_score < 1.0:
            continue
        tags = {'locus_tag': [candidate.cds_id]}
        cds = SeqFeature(
            location=candidate.location,
            type='CDS',
            strand=candidate.strand,
            qualifiers=tags,
        )
        cds.qualifiers = tags
        seqrecord.features.append(cds)
def makeSeqFeature(geneid, cur):
    '''
    Build a BioPython SeqFeature for the gene with ITEP ID geneid, using
    fields 5, 6 and 8 of its getGeneInfo() row as start, stop and strand.
    '''
    row = getGeneInfo([geneid], cur)[0]
    span = FeatureLocation(int(row[5]), int(row[6]))
    gene_feature = SeqFeature(span, strand=int(row[8]), id=geneid)
    # This can be overwritten by other functions but we need a placeholder.
    gene_feature.qualifiers["cluster_id"] = -1
    return gene_feature
def _add_gff_line(self, rec, gff_parts, parents, children): """Add details from a GFF line to the given SeqRecord. """ gff_parts = [(None if p == '.' else p) for p in gff_parts] assert rec.id == gff_parts[0], "ID mismatch: %s %s" % (rec.id, gff_parts[0]) # collect all of the base qualifiers for this item quals = collections.defaultdict(list) if gff_parts[1]: quals["source"].append(gff_parts[1]) if gff_parts[5]: quals["score"].append(gff_parts[5]) if gff_parts[7]: quals["phase"].append(gff_parts[7]) for key, val in [a.split('=') for a in gff_parts[8].split(';')]: quals[key].extend(val.split(',')) quals = dict(quals) # if we are describing a location, then we are a feature if gff_parts[3] and gff_parts[4]: #if quals.has_key('ID') or quals.has_key('Parent'): # print gff_parts[1:6], quals location = FeatureLocation(int(gff_parts[3]) - 1, int(gff_parts[4])) new_feature = SeqFeature(location, gff_parts[2], id = quals.get('ID', [''])[0], strand = self._strand_map[gff_parts[6]]) new_feature.qualifiers = quals # Handle flat features if not new_feature.id: rec.features.append(new_feature) # features that have parents need to link so we can pick up # the relationship elif new_feature.qualifiers.has_key('Parent'): for parent in new_feature.qualifiers['Parent']: children[parent].append(new_feature) # top level features else: parents[rec.id].append(new_feature) # otherwise, associate these annotations with the full record else: # add these as a list of annotations, checking not to overwrite # current values for key, vals in quals: if rec.annotations.has_key(key): try: rec.annotations[key].extend(vals) except AttributeError: rec.annotations[key] = [rec.annotations[key]] + vals else: rec.annotations[key] = vals return rec, parents, children
def make_join_feature(f_list, ftype="misc_feature"):
    """Wrap the features of f_list as sub-features of one "join" feature.

    Every sub-feature has its type set to ftype and its location_operator
    set to "join".  The parent spans from the start of the first sub-feature
    to the end of the last, and takes their strand when they all agree
    (None otherwise).
    """
    # NOTE - Does NOT reorder the sub-features (which you may
    # want to do for reverse strand features...)
    distinct_strands = set(part.strand for part in f_list)
    strand = f_list[0].strand if len(distinct_strands) == 1 else None
    for part in f_list:
        part.type = ftype
        part.location_operator = "join"
    span = FeatureLocation(f_list[0].location.start,
                           f_list[-1].location.end)
    joined = SeqFeature(span, type=ftype, strand=strand,
                        location_operator="join")
    joined.sub_features = f_list
    return joined
def nrpsSmash(dnaSeq):
    """Run specific_analysis on dnaSeq wrapped as a record with one full-length CDS.

    The folder named by options.raw_predictions_outputfolder is removed
    afterwards; the analysis result is returned.
    """
    options = Namespace()
    options.outputfoldername = "/tmp/nrpspks_predictions_txt"
    # used in NRPSPredictor2.nrpscodepred, check later what to set it to
    options.record_idx = ""
    options.eukaryotic = 0

    whole_cds = SeqFeature(FeatureLocation(0, len(dnaSeq)), type="CDS",
                           strand=1)
    whole_cds.qualifiers = {'gene': ['gene']}

    record = SeqRecord(Seq(dnaSeq, IUPAC.unambiguous_dna),
                       id="seq_id",
                       name="seq_name",
                       description="seq_description")
    record.features = [whole_cds]

    analysis = specific_analysis(record, options)
    # NOTE(review): raw_predictions_outputfolder is not set in this
    # function; presumably specific_analysis sets it -- confirm.
    shutil.rmtree(options.raw_predictions_outputfolder)
    return analysis
def retrieveCompositeSequence(seq_record, seqList):
    """Extract the region of seq_record covering every node of seqList.

    Each node looks like "<name>:<start>..<end>".  The region runs from the
    smallest to the largest coordinate seen.  Returns a SeqRecord whose id
    is "<record id>|<start>_<end>" and whose description is empty.
    """
    positions = []
    for node in seqList:
        _, coord = node.split(":")
        low, high = coord.split("..")
        positions.append(int(float(low)))
        positions.append(int(float(high)))
    region_start = min(positions)
    region_end = max(positions)
    region = SeqFeature(FeatureLocation(region_start, region_end))
    extracted = region.extract(seq_record)
    region_id = seq_record.id + "|" + str(region_start) + "_" + str(region_end)
    return SeqRecord(seq=extracted.seq, id=region_id, description="")
def __repr__(self):
    "Return the BioSeqFeature repr with the qualifiers added before its last character."
    base = BioSeqFeature.__repr__(self)
    return base[:-1] + ", qualifiers=%s " % repr(self.qualifiers) + ")"
def convertReportToFeatures(self,report,conversionMap,startTag,endTag,strandTag,qualTag="gene"):
    '''
    Parse a 2d hash (report) into a list of features, one per row.

    For every row name the start and end columns are read as floats, the
    strand column is passed through self._parseConversion together with
    conversionMap, and the row name is stored under the qualTag qualifier.
    '''
    result = []
    for rowName in report.returnRowNames():
        start = float(report.getElement(rowName,startTag))
        end = float(report.getElement(rowName,endTag))
        location = FeatureLocation(start,end)
        strand = self._parseConversion(report.getElement(rowName,strandTag),conversionMap)
        feature = SeqFeature(id = rowName,location=location,strand=strand)
        feature.qualifiers[qualTag] = [rowName]
        # NOTE(review): this iterates over a two-element list -- the row name
        # and the whole list of column names -- not over the individual
        # column names, and the value is read with report[colName] rather
        # than report.getElement(). Looks unintended; confirm before changing.
        for colName in [rowName,report.returnColumnNames()]:
            if colName in conversionMap.keys():
                newName = conversionMap[colName]
                value = report[colName]
                feature.qualifiers[newName] = [value]
        result.append(feature)
    return result
def parse_smart_domains(file_name):
    """Parse KEY=VALUE domain blocks from file_name into "Region" features.

    A block is a run of non-blank lines (DOMAIN, START, END, TYPE, STATUS)
    ended by a blank line.  A block becomes a feature when its last
    non-blank line was a KEY=VALUE pair, its TYPE is not "PFAM", a STATUS
    line was seen, and its name is not 'low_complexity_region'.  The domain
    name is stored under the 'region_name' qualifier.

    Returns:
        List of SeqFeature objects in file order.
    """
    domains = []
    domain_status = False
    domain_type = False
    is_domain = False
    # Plain "r": the "U" flag is rejected by Python 3.11+, and the context
    # manager closes the handle even if parsing fails.
    with open(file_name, "r") as in_handle:
        for line in in_handle:
            line = line.rstrip()
            if len(line) > 1:
                pairs = line.split("=")
                is_domain = True
                if len(pairs) == 2:
                    key, value = pairs
                    if key == "DOMAIN":
                        domain_name = value
                    elif key == "START":
                        domain_start = int(value)
                    elif key == "END":
                        domain_end = int(value)
                    elif key == "TYPE":
                        domain_type = value != "PFAM"
                    elif key == "STATUS":
                        # Every STATUS value enables the domain; the
                        # original set True in both branches of its
                        # "visible|OK" test.
                        domain_status = True
                else:
                    is_domain = False
            else:
                if is_domain and domain_type and domain_status:
                    d = SeqFeature(FeatureLocation(domain_start, domain_end),
                                   type="Region")
                    d.qualifiers = {'region_name': [domain_name]}
                    if domain_name != 'low_complexity_region':
                        domains.append(d)
                # A blank line ends the block: reset the per-block flags.
                is_domain = False
                domain_type = False
                domain_status = False
    return domains
def main():
    """Annotate the input genome with one feature per row of the SNP CSV file.

    Reads INPUT_GENOME (GenBank), adds a REALIGNED_SNP_TYPE feature for every
    data row of SNP_CSV_DATA_FILE and writes the result to OUTPUT_GENOME.
    Each feature starts at ``position - 1`` and is as long as the ``alt``
    value with square brackets removed; the raw ``ref`` and ``alt`` values
    are stored as qualifiers.
    """
    genome_record = SeqIO.read(INPUT_GENOME, 'genbank')

    with open(SNP_CSV_DATA_FILE) as input_fh:
        reader = csv.DictReader(input_fh, SNP_FIELD_NAMES)
        # Ignore header. next() works on Python 2.6+ and 3, unlike .next().
        next(reader)
        for row in reader:
            feature_alt = row['alt'].replace('[', '').replace(']', '')
            feature_start = int(row['position']) - 1  # pythonic
            feature_end = feature_start + len(feature_alt)
            feature_location = FeatureLocation(feature_start, feature_end)
            feature = SeqFeature(location=feature_location,
                                 type=REALIGNED_SNP_TYPE, strand=1)
            feature.qualifiers['ref'] = row['ref']
            feature.qualifiers['alt'] = row['alt']
            genome_record.features.append(feature)

    with open(OUTPUT_GENOME, 'w') as output_fh:
        SeqIO.write(genome_record, output_fh, 'genbank')
def extract_sequences_one_sample(args):
    '''
    Function for extracting protein sequences given annotated regions and
    produce fasta files that can be used for clustering.

    args is the tuple (fasta_path, annotation_path, sample, output_dir).
    The extracted domains are written to
    <output_dir>/forClustering/<sample>.fasta, each with the domain ID and
    e-value appended to the record id.  A sequence missing from the fasta
    file stops processing of the remaining hits of that domain ID; a hit
    with an unrecognised strand is reported and skipped.
    '''
    (fasta_path, annotation_path, sample, output_dir) = args
    domainInfo = load_annotation_pfam(annotation_path)
    print("Generating domain fasta sequences for "+sample+" ...")
    from Bio import SeqIO
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    (annot, start, stop, strand, evalue) = domainInfo
    record_dict = index_fasta(fasta_path)
    recordlist = []
    outfilename = output_dir +'/forClustering/'+ sample +'.fasta'
    # The context manager closes the output even if extraction fails.
    with open(outfilename, 'w') as outhandle:
        for domainID in annot.keys():
            for i, domain in enumerate(annot[domainID]):
                try:
                    seq = record_dict[domain]
                except KeyError:
                    print("Error: " + domain + " not in fasta file.\n")
                    break
                a = start[domainID][i]
                b = stop[domainID][i]
                seq_strand = strand[domainID][i]
                seq_evalue = evalue[domainID][i]
                if seq_strand in '+':
                    strand_value = 1
                elif seq_strand in '-':
                    strand_value = -1
                else:
                    # Previously the feature of the preceding hit was
                    # silently reused (or the name was unbound).
                    print("Error: unsupported strand " + repr(seq_strand) +
                          " for " + domain + "; skipping.\n")
                    continue
                # NOTE(review): both coordinates are shifted by one; if the
                # annotation is 1-based inclusive the end should stay b.
                domain_feature = SeqFeature(FeatureLocation(a-1, b-1),
                                            type="domain",
                                            strand=strand_value)
                feature_seq = domain_feature.extract(seq)
                feature_seq.id = feature_seq.id+' '+domainID+' '+seq_evalue
                recordlist.append(feature_seq)
        SeqIO.write(recordlist, outhandle, "fasta")
    print("Done")
def add_track_with_sigils(self, **kwargs):
    """Add a track of 18 labelled features cycling strandless/forward/reverse.

    Extra keyword arguments are passed on to every add_feature() call.
    """
    self.gdt_features = self.gdd.new_track(1, greytrack=False)
    self.gds_features = self.gdt_features.new_set()
    # Indexed by i % 3: (strand, name, colour).
    styles = (
        (None, "Strandless", colors.orange),
        (+1, "Forward", colors.red),
        (-1, "Reverse", colors.blue),
    )
    for i in range(18):
        strand, name, color = styles[i % 3]
        start = int((400 * i) / 18.0)
        feature = SeqFeature(FeatureLocation(start, start + 17),
                             strand=strand)
        self.gds_features.add_feature(
            feature, name=name, color=color, label=True, **kwargs
        )
def match_sequence_numbering(self):
    """ SequenceData.match_sequence_numbering
        Assign canonical sequence numbering to structural fragments.

        For every chain with a canonical sequence, each fragment of each
        model gets an extra feature giving its 1-based position in the
        canonical sequence. The model's 'match_numbering' flag is cleared
        when that position differs from the fragment's first feature.
        Returns False when no canonical data is present, True otherwise.
    """
    if not hasattr(self, 'has_canonical'):
        return False
    for ch_id in self.data:
        if not (ch_id in self.has_canonical and self.has_canonical[ch_id]):
            continue
        chain = self.data[ch_id]
        for mod_id in chain['pdb']:
            model = chain['pdb'][mod_id]
            fragments = model['frgs']
            model['match_numbering'] = True
            for fragment in fragments:
                inic = chain['can'].seq.find(fragment.seq) + 1
                fin = inic + len(fragment.seq) - 1
                fragment.features.append(
                    SeqFeature(FeatureLocation(inic, fin))
                )
                first_loc = fragment.features[0].location
                if inic != first_loc.start or fin != first_loc.end:
                    model['match_numbering'] = False
    return True
def add_point_feature(self, resnum, feat_type=None, feat_id=None):
    """Append a single-residue feature to the features list.

    Args:
        resnum (int): Protein sequence residue number
        feat_type (str, optional): Description of the feature type
            (ie. 'catalytic residue'); a generic description is used when
            empty
        feat_id (str, optional): ID of the feature (ie. 'TM1')

    Raises:
        ValueError: If a feature file is associated with the sequence.

    """
    if self.feature_file:
        raise ValueError(
            'Feature file associated with sequence, please remove file association to append '
            'additional features.')

    label = feat_type if feat_type else 'Manually added protein sequence single residue feature'
    # Residue n covers the half-open interval [n - 1, n).
    residue_span = FeatureLocation(ExactPosition(resnum - 1),
                                   ExactPosition(resnum))
    self.features.append(
        SeqFeature(location=residue_span, type=label, id=feat_id))
def t_write_seqrecord(self):
    """Check that a single SeqRecord is written with its ID in the first column.
    """
    record = SeqRecord(Seq("GATCGATCGATCGATCGATC"), "ID1")
    gene_quals = {
        "source": "prediction",
        "score": 10.0,
        "other": ["Some", "annotations"],
        "ID": "gene1"
    }
    gene = SeqFeature(FeatureLocation(0, 20), type="gene", strand=1,
                      qualifiers=gene_quals)
    record.features = [gene]

    sink = StringIO()
    GFF.write([record], sink, include_fasta=True)
    # Third output line is the first one checked by this test.
    third_line = sink.getvalue().split("\n")[2]
    assert third_line.split("\t")[0] == "ID1"
def long_sigils(self, glyph):
    """Check feature sigils within bounding box.

    Draws a forward, a reverse and a strandless feature with the given
    sigil, each on top of a plain box covering the same span, then calls
    self.finish() with "GD_sigil_long_<glyph>".
    """
    def make(strand):
        return SeqFeature(FeatureLocation(25, 375), strand=strand)

    # BIGARROW sigils straddle the axis, so their background box is
    # strandless and the reverse arrow gets a track of its own rather than
    # being drawn on top of the forward one.
    straddles_axis = glyph in ["BIGARROW"]

    # Add a track of features, bigger height to emphasise any sigil errors
    self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
    # We'll just use one feature set for these features if strand specific
    self.gds_features = self.gdt_features.new_set()

    forward = make(+1)
    backdrop = make(None) if straddles_axis else forward
    self.gds_features.add_feature(backdrop, color="lightblue")
    self.gds_features.add_feature(
        forward, name="Forward", sigil=glyph, color="blue", arrowhead_length=2.0
    )

    if straddles_axis:
        self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
        self.gds_features = self.gdt_features.new_set()
    reverse = make(-1)
    backdrop = make(None) if straddles_axis else reverse
    self.gds_features.add_feature(backdrop, color="pink")
    self.gds_features.add_feature(
        reverse, name="Reverse", sigil=glyph, color="red", arrowhead_length=2.0
    )

    # Add another track of features, bigger height to emphasise any sigil errors
    self.gdt_features = self.gdd.new_track(1, greytrack=True, height=3)
    # We'll just use one feature set for these features,
    self.gds_features = self.gdt_features.new_set()
    strandless = make(None)
    self.gds_features.add_feature(strandless, color="lightgreen")
    self.gds_features.add_feature(
        strandless, name="Standless", sigil=glyph, color="green", arrowhead_length=2.0
    )
    self.finish(f"GD_sigil_long_{glyph}")
def check_sub(feature, sequence):
    """Collapse the CDS sub-features of ``feature`` into CDS features with a translation.

    Sub-features that have sub-features of their own are processed
    recursively and their results returned.  Only when no nested result
    exists are the direct CDS children joined into one feature (a compound
    location when there are several) and translated from ``sequence.seq``.
    Returns a list of features, empty when there is nothing to build.
    """
    nested_results = []
    cds_parts = []
    shared_quals = {}
    conflicting = []
    for child in feature.sub_features:
        if child.sub_features:
            # If there are sub_features, go deeper
            nested_results.extend(check_sub(child, sequence))
            continue
        if child.type != 'CDS':
            continue
        begin = child.location.start.real
        finish = child.location.end.real
        cds_parts.append(FeatureLocation(begin, finish, strand=child.strand))
        # For split features (CDSs), the final feature keeps a qualifier
        # ONLY if all children carry the same value for it, i.e. all
        # children have the same "protein_ID" (key and value).
        for key in child.qualifiers.keys():
            if key not in shared_quals:
                shared_quals[key] = child.qualifiers[key]
            if not shared_quals[key] == child.qualifiers[key]:
                conflicting.append(key)

    # Pop mismatching qualifiers over split features, and the parent link.
    for key in conflicting:
        shared_quals.pop(key, None)
    shared_quals.pop('Parent', None)

    # A feature is only built at the tip of the tree. If nested results
    # exist, the recursion already produced the features to return.
    if nested_results or len(cds_parts) == 0:
        return nested_results

    if len(cds_parts) > 1:
        ordered = sorted(cds_parts, key=lambda part: part.start.real)
        if ordered[0].strand != 1:
            ordered = list(reversed(ordered))
        joined_loc = CompoundLocation(ordered)
    else:
        joined_loc = cds_parts[0]

    cds = SeqFeature(joined_loc)
    cds.qualifiers = shared_quals
    cds.type = 'CDS'
    protein = cds.extract(sequence.seq).translate(stop_symbol='')
    cds.qualifiers['translation'] = [str(protein)]
    nested_results.append(cds)
    return nested_results
def setUp(self):
    """Build six reverse-strand exon features, their FusionExons and two FusionEvents."""
    random.seed(1)

    exon_specs = (
        (100, 500, 'exon_1'),
        (800, 1000, 'exon_2'),
        (1500, 2000, 'exon_3'),
        (10000, 15000, 'exon_4'),
        (17000, 20000, 'exon_5'),
        (22000, 25000, 'exon_6'),
    )
    features = [
        SeqFeature(
            FeatureLocation(ExactPosition(lo), ExactPosition(hi), strand=-1),
            type='exon', id=exon_id)
        for lo, hi, exon_id in exon_specs
    ]
    (self.sf1, self.sf2, self.sf3,
     self.sf4, self.sf5, self.sf6) = features

    exons = [FusionExon(feature) for feature in features]
    (self.exon1, self.exon2, self.exon3,
     self.exon4, self.exon5, self.exon6) = exons

    # First three exons form one partner, the last three the other.
    self.sfs1 = features[:3]
    self.sfs2 = features[3:]
    self.exons1 = exons[:3]
    self.exons2 = exons[3:]

    self.FE1 = FusionEvent(self.sfs1, self.sfs2, "+", "-")
    self.FE2 = FusionEvent(self.sfs1, self.sfs2, "-", "+")
def mergeMod(args):
    """Merge every record of ``args.input`` into one record and write it out.

    Each input record is marked in the merged record by a ``fasta_record``
    feature whose ``note`` qualifier holds the record name.  Existing
    ``source`` features are replaced by a single one spanning the merged
    sequence.  Name, description and annotations come from the first record.

    Args:
        args: object providing ``input``, ``output``, ``inFormat``,
            ``outFormat``, ``accession`` and ``ver``.  ``accession``
            overrides the record name; ``ver`` sets the id to
            ``<name>.<ver>``; ``outFormat`` falls back to ``inFormat``
            when it is None.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    filetype = args.inFormat
    # Load file as SeqRecord
    with open(args.input, "r") as in_handle:
        recs = list(SeqIO.parse(in_handle, filetype))

    # For each SeqRecord, I.e. complete gbk annotation obj in file
    fgbk = recs[0]
    d = SeqFeature(FeatureLocation(0, len(fgbk)), type="fasta_record",
                   strand=1)
    d.qualifiers["note"] = recs[0].name
    fgbk.features.append(d)
    for rec in recs[1:]:
        d = SeqFeature(FeatureLocation(len(fgbk), len(fgbk) + len(rec)),
                       type="fasta_record", strand=1)
        d.qualifiers["note"] = rec.name
        fgbk.features.append(d)
        fgbk += rec

    fgbk.name = recs[0].name
    fgbk.description = recs[0].description
    fgbk.annotations = recs[0].annotations
    if args.accession is not None:
        fgbk.name = args.accession
    if args.ver is not None:
        fgbk.id = fgbk.name + '.' + args.ver

    # Filter into a new list: removing items from a list while iterating over
    # it skips the element that follows each removal.
    fgbk.features[:] = [f for f in fgbk.features if f.type != 'source']
    d = SeqFeature(FeatureLocation(0, len(fgbk)), type="source", strand=1)
    fgbk.features.insert(0, d)

    outtype = filetype if args.outFormat is None else args.outFormat
    with open(args.output, "w") as out_handle:
        SeqIO.write(fgbk, out_handle, outtype)
def test_translate(self):
    """Check which SeqRecord attributes translate() carries over.

    With no arguments the translated record gets placeholder identifiers and
    empty annotations; with the copy flags set (and ``cds=True``) the
    requested attributes are preserved and the stop codon is dropped.
    """
    source = SeqRecord(
        Seq("ATGGTGTAA"),
        id="TestID",
        name="TestName",
        description="TestDescription",
        dbxrefs=["TestDbxrefs"],
        features=[SeqFeature(FeatureLocation(0, 3), type="Site")],
        annotations={"organism": "bombyx"},
        letter_annotations={"test": "abcdefghi"},
    )

    # Default call: nothing is copied over.
    plain = source.translate()
    self.assertEqual(plain.seq, "MV*")
    self.assertEqual(plain.id, "<unknown id>")
    self.assertEqual(plain.name, "<unknown name>")
    self.assertEqual(plain.description, "<unknown description>")
    for attribute in ("dbxrefs", "features", "annotations",
                      "letter_annotations"):
        self.assertFalse(getattr(plain, attribute))

    # Explicit copy flags: the listed attributes come from the source record.
    copy_flags = dict(
        cds=True,
        id=True,
        name=True,
        description=True,
        dbxrefs=True,
        annotations=True,
    )
    copied = source.translate(**copy_flags)
    self.assertEqual(copied.seq, "MV")
    self.assertEqual(copied.id, "TestID")
    self.assertEqual(copied.name, "TestName")
    self.assertEqual(copied.description, "TestDescription")
    self.assertEqual(copied.dbxrefs, ["TestDbxrefs"])
    self.assertFalse(copied.features)
    self.assertEqual(copied.annotations, {"organism": "bombyx"})
    self.assertFalse(copied.letter_annotations)
def execute(self, params: dict):
    """Write one GenBank record per assembly to ``example.gb``.

    Args:
        params: container whose item ``0`` is the assembly result set to
            export.

    Every part of an assembly becomes a ``part`` feature on that assembly's
    record.
    """
    records: List[SeqRecord] = []
    assembly_list: AssemblyResultSet = params[0]

    for assembly in assembly_list:
        assembly_sequence = assembly.get_sequence()

        # NOTE(review): the features carry no location. The original code had
        # the FeatureLocation computation disabled; confirm whether part
        # coordinates should be tracked before relying on these annotations.
        features = [
            SeqFeature(
                id=part.identifier,
                qualifiers={'name': part.identifier},
                type='part',
            )
            for part in assembly.get_parts()
        ]

        record = SeqRecord(
            assembly_sequence.seq,
            id='123456789',  # random accession number
            name='Example',
            description='An example GenBank file generated by SuperGSL')
        record.features.extend(features)
        records.append(record)

    # Context manager so the file is flushed and closed even if writing fails.
    with open('example.gb', 'w') as output_file:
        SeqIO.write(records, output_file, 'genbank')
def _easy_seqrec(
    str_seq: str,
    id,
    annotation_type: str = "misc_feature",
    start=0,
    end=None,
    **qualifiers: list,
) -> SeqRecord:
    """Return an annotated SeqRecord from a string and id.

    Args:
        str_seq: sequence of SeqRecord.
        id: Identifier for new part.
        annotation_type (optional): Equivalent to Bio.SeqFeature type e.g. CDS,
            Defaults to "misc_feature".
        start (optional): start of the annotation, Defaults to 0.
        end (optional): end of the annotation; when None it defaults to
            len(str_seq). Defaults to None.
        **qualifiers: equivalent to Bio.SeqFeature.qualifiers for annotation
            e.g. standard_name=["LMP"].

    Returns:
        SeqRecord: A record carrying a single forward-strand feature.
    """
    # Compare against None explicitly so an explicit end of 0 is respected
    # instead of being silently replaced by the sequence length.
    if end is None:
        end = len(str_seq)
    annotation = SeqFeature(
        type=annotation_type,
        location=FeatureLocation(start=start, end=end, strand=+1),
        qualifiers=dict(qualifiers),
    )
    return SeqRecord(Seq(str_seq), id=id, features=[annotation])
def to_record(self, record=None, record_id=None):
    """Return a Biopython SeqRecord describing this quote.

    When ``record`` is given, a deep copy of it is annotated; otherwise a new
    DNA record is built from ``self.sequence``. If the quote has an assembly
    plan, one feature per (segment, sub-quote) pair is put in front of the
    record's existing features.

    >>> record = to_record(solution)
    >>> # Let's plot with DnaVu:
    >>> from dnavu import create_record_plot
    >>> from bokeh.io import output_file, show
    >>> output_file("view.html")
    >>> plot = create_record_plot(record)
    >>> show(plot)
    """
    if record_id is None:
        record_id = self.id

    if record is not None:
        record = deepcopy(record)
    else:
        if has_dna_alphabet:
            # Biopython <1.78
            sequence = Seq(self.sequence, DNAAlphabet())
        else:
            sequence = Seq(self.sequence)
        record = SeqRecord(sequence, id=record_id)
        record.annotations["molecule_type"] = "DNA"

    if self.assembly_plan is None:
        return record

    plan_features = []
    for segment, quote in self.assembly_plan.items():
        location = FeatureLocation(segment[0], segment[1], 1)
        details = {
            "name": quote.id,
            "source": quote.source,
            "price": quote.price,
            "lead_time": quote.lead_time,
        }
        plan_features.append(
            SeqFeature(location, type="Feature", qualifiers=details)
        )
    record.features = plan_features + record.features
    return record
def test_replacer__TGA_codon_mid_seq(self):
    """A mid-sequence TGA must be kept; only the terminal TGA is replaced.

    The expected output keeps the in-frame TGA in the middle of the feature
    (it codes for Selenocysteine) and swaps the final TGA for TAG.
    """
    forbidden_codons = ['TGA']

    # Minimal usage table: the only synonymous pair is the two stop codons.
    usage_memex = CodonUsageMemex({
        '*': {'TGA': {}, 'TAG': {}},
    })

    cds_sequence = 'TTG' + 'GCT' + 'TGA' + 'TTG' + 'TGA'
    record = SeqRecord(Seq(cds_sequence, generic_dna))
    cds_feature = SeqFeature(
        FeatureLocation(0, len(cds_sequence), strand=1), type='CDS', id='1')
    record.features.append(cds_feature)

    replacer = GraphSearchCodonReplacer(forbidden_codons, usage_memex)

    # Run the replacement over the single CDS.
    outcome = replacer.replace_codons_in_feature(cds_feature.id, record)

    self.assertTrue(outcome['is_success'])

    expected_sequence = 'TTG' + 'GCT' + 'TGA' + 'TTG' + 'TAG'
    self.assertEqual(expected_sequence, str(outcome['new_feature_seq']))
def gb_create(sequence, nb_strain, inp, path, name_directory):
    """Write the device carried by one strain as an annotated GenBank file.

    Args:
        sequence: iterable of items indexed as ``(feature name, DNA string,
            strand)``; the DNA strings are concatenated in order.
        nb_strain: strain number as a string (it is concatenated into names).
        inp: suffix appended to the record name.
        path: directory that receives the file.
        name_directory: prefix of the output file name.

    The file is written to ``<path>/<name_directory>_Strain<nb_strain>.gb``.
    """
    # Join once instead of growing the string piece by piece.
    DNA_seq = ''.join(part[1] for part in sequence)

    # creation of the formatted DNA sequence for the genbank file
    seq_final = Seq(DNA_seq, IUPAC.unambiguous_dna)
    record = SeqRecord(
        seq_final,
        id='NA',  # random accession number
        name='Strain' + nb_strain + '_' + inp,
        description='Sequence of the computational device in the strain'
        + nb_strain + ' to implement the Boolean function required')

    # Each part becomes a feature covering exactly its own stretch of the
    # concatenated sequence; len_seq tracks where the next part starts.
    len_seq = 0
    for feat in sequence:
        part_end = len_seq + len(feat[1])
        feature = SeqFeature(FeatureLocation(start=len_seq, end=part_end),
                             id=feat[0], type=feat[0], strand=feat[2])
        record.features.append(feature)
        len_seq = part_end

    # Save as GenBank file; the context manager closes it even on failure.
    target = path + '/' + name_directory + '_Strain' + nb_strain + '.gb'
    with open(target, 'w') as output_file:
        SeqIO.write(record, output_file, 'genbank')
def bed2SeqFeature(bedlist, window, expcount, cnvlimit):
    """Turn (start, end, count) regions into coloured copy-number features.

    For each region the log2 ratio of its count to the expected count is
    computed; regions with no reads get minus infinity. Regions whose ratio
    passes one of the ``cnvlimit`` thresholds are returned as
    ``(SeqFeature, color)`` pairs: blue shades for losses, red shades for
    gains. Regions inside the limits are left out.
    """
    marked = []
    for start, end, count in bedlist:
        if count:
            span = end - start
            if span < window:
                # Region shorter than the window: scale the expectation down.
                expected = 1.0 * span / window * expcount
            else:
                expected = expcount
            ratio = log(count / expected, 2)
        else:
            # No reads at all: treat as a complete deletion.
            ratio = float('-inf')

        feature = SeqFeature(FeatureLocation(start, end))

        shade = 0
        if ratio <= -3*cnvlimit:
            shade = colors.darkblue
        elif ratio <= -2*cnvlimit:
            shade = colors.blue
        elif ratio <= -1*cnvlimit:
            shade = colors.lightblue
        elif ratio >= 3*cnvlimit:
            shade = colors.darkred
        elif ratio >= 2*cnvlimit:
            shade = colors.red
        elif ratio >= 1*cnvlimit:
            shade = colors.lightsalmon

        if not shade:
            continue
        marked.append((feature, shade))
    return marked
def plan_step_to_record(plan_step, record=None, record_id=None):
    """Return a Biopython SeqRecord describing ``plan_step``.

    When ``record`` is given, a deep copy of it is annotated; otherwise a new
    DNA record is built from ``plan_step.sequence``. If the step has an
    assembly plan, one ``misc_feature`` per sub-quote is put in front of the
    record's existing features.

    >>> record = to_SeqRecord(solution)
    >>> # Let's plot with DnaVu:
    >>> from dnavu import create_record_plot
    >>> from bokeh.io import output_file, show
    >>> output_file("view.html")
    >>> plot = create_record_plot(record)
    >>> show(plot)
    """
    if record_id is None:
        record_id = plan_step.id

    if record is not None:
        record = deepcopy(record)
    else:
        if has_dna_alphabet:
            # Biopython <1.78
            sequence = Seq(plan_step.sequence, DNAAlphabet())
        else:
            sequence = Seq(plan_step.sequence)
        record = SeqRecord(sequence, id=record_id)
        record.annotations["molecule_type"] = "DNA"

    if plan_step.assembly_plan is None:
        return record

    plan_features = []
    for q in plan_step.assembly_plan:
        location = FeatureLocation(q.segment_start, q.segment_end, 1)
        details = {
            "label": "%s - From %s" % (q.id, q.source),
            "name": q.id,
            "source": q.source,
            "price": q.price,
            "lead_time": q.lead_time,
        }
        plan_features.append(
            SeqFeature(location, type="misc_feature", qualifiers=details)
        )
    record.features = plan_features + record.features
    return record
def locations_to_biopython_features(
    self,
    feature_type="misc_feature",
    color="red",
    label_prefix="",
    merge_overlapping=False,
):
    """Return this evaluation's locations as a list of Biopython features.

    Parameters
    ----------
    feature_type
      Genbank type given to every feature.

    color
      Value stored in each feature's ``color`` qualifier.

    label_prefix
      Text placed before the specification in each label, giving
      "prefix NameOfSpecification()".

    merge_overlapping
      If true, overlapping locations (0-5, 2-9) are merged into a single
      one (0-9) before the features are built.
    """
    if merge_overlapping:
        spans = Location.merge_overlapping_locations(self.locations)
    else:
        spans = self.locations

    annotations = []
    for span in spans:
        details = dict(
            label=label_prefix + " " + str(self.specification),
            color=color,
        )
        annotations.append(
            SeqFeature(
                span.to_biopython_location(),
                type=feature_type,
                qualifiers=details,
            )
        )
    return annotations
def annotate_record(
    seqrecord, location="full", feature_type="misc_feature", margin=0, **qualifiers
):
    """Append a new feature to a Biopython SeqRecord, in place.

    Parameters
    ----------
    seqrecord
      The Biopython SeqRecord that receives the feature.

    location
      Either (start, end) or (start, end, strand); the strand defaults to +1.
      The default "full" covers the whole record, shrunk by ``margin`` on
      each side.

    feature_type
      The type given to the feature.

    margin
      Number of bases left out at each end; only used when ``location`` is
      "full".

    qualifiers
      Keyword arguments stored as the feature's ``qualifiers`` dictionary.
    """
    span = location
    if span == "full":
        span = (margin, len(seqrecord) - margin)

    if len(span) == 3:
        strand = span[2]
    else:
        strand = 1

    new_feature = SeqFeature(
        FeatureLocation(span[0], span[1], strand),
        qualifiers=qualifiers,
        type=feature_type,
    )
    seqrecord.features.append(new_feature)
def convert_feature(feature, qualifier_transformers=DEFAULT_QUALIFIER_TRANSFORMERS):
    """
    Build a cleaned-up copy of ``feature``.

    :param SeqFeature feature: the feature to convert.
    :param qualifier_transformers: transformation functions applied in order
                                   to clean up the qualifiers; each is called
                                   with the current qualifiers and a dict copy
                                   of them, and its result feeds the next one.
    :return: a new :class:`SeqFeature` with the same location and id, the
             converted feature type, and the qualifiers left after the final
             filtering step for that type.
    """
    type_ = convert_feature_type(feature)

    current = feature.qualifiers
    for transform in qualifier_transformers:
        current = transform(current, dict(current))

    # Last pass: drop whatever does not belong to this feature key.
    feature_key = genbank_feature_key(type_)
    qualifiers = remove_qualifiers_inappropriate_for_feature(
        current, dict(current), feature_key)

    return SeqFeature(
        location=feature.location,
        type=type_,
        id=feature.id,
        qualifiers=qualifiers,
    )
def get_genome(inputfile):
    """Parse a GFF file with an embedded ``##FASTA`` section.

    The lines between the first two header lines and ``##FASTA`` are read as
    feature rows; the line after ``##FASTA`` is the FASTA header and the rest
    is sequence.

    Args:
        inputfile: path of the file to read.

    Returns:
        tuple: ``(genome_id, genome_seq, genome_features)`` - the FASTA header
        without its first character, the sequence as one string, and a list
        of SeqFeature objects with a ``locus_tag`` qualifier.

    Raises:
        ValueError: if the file has no ``##FASTA`` line.
    """
    with open(inputfile) as f:
        lines = f.readlines()

    sepline = lines.index("##FASTA\n")
    genome_id = lines[sepline + 1][1:].strip()

    genome_features = []
    # The first two lines are skipped as headers.
    for row in lines[2:sepline]:
        splitline = row.split()
        # NOTE(review): coordinates are passed through exactly as written in
        # the file - confirm callers expect that numbering.
        location = FeatureLocation(start=int(splitline[3]),
                                   end=int(splitline[4]),
                                   strand=stranddict[splitline[6]])
        # The locus tag is the value of the first key=value attribute.
        locustag = splitline[8].split('=')[1]
        genome_features.append(
            SeqFeature(location, type=splitline[2],
                       qualifiers={'locus_tag': [locustag]}))

    # Join once: repeated += on a genome-sized string is quadratic.
    genome_seq = "".join(row.strip() for row in lines[sepline + 2:])
    return (genome_id, genome_seq, genome_features)
def gb_create(DNA_seq, list_feat, name, directory):
    """Write ``DNA_seq`` and its features to ``<directory>/<name>.gb``.

    Args:
        DNA_seq: the DNA sequence as a string.
        list_feat: iterable of items indexed as ``(start, end, feature name,
            strand)``; the name is used as both feature id and type.
        name: output file name without extension.
        directory: directory that receives the file.
    """
    # creation of the formatted DNA sequence for the genbank file
    seq_final = Seq(DNA_seq, IUPAC.unambiguous_dna)
    record = SeqRecord(
        seq_final,
        id='NA',  # random accession number
        name='',
        description='Synthetic sequence')

    # One GenBank feature per entry of list_feat.
    for feat in list_feat:
        feature = SeqFeature(FeatureLocation(start=feat[0], end=feat[1]),
                             id=feat[2], type=feat[2], strand=feat[3])
        record.features.append(feature)

    # Save as GenBank file; the context manager closes it even on failure.
    with open(directory + '/' + name + '.gb', 'w') as output_file:
        SeqIO.write(record, output_file, 'genbank')
def test_find_features_starting_at(self):
    """find_features_starting_at returns only features starting at the position.

    Checks a position with no feature, the exact start of the CDS, and the
    CDS end queried with a ``['misc']`` type filter.
    """
    before = 'TACTAGTCGAT'
    feature_1_seq = 'ATGAAAGGGATG'
    after = 'CTATCTAGCTAGCT'
    whole_seq = Seq(before + feature_1_seq + after, generic_dna)
    seq_record = SeqRecord(whole_seq)
    feature_1_loc = FeatureLocation(len(before),
                                    len(before) + len(feature_1_seq),
                                    strand=1)
    feature_1 = SeqFeature(feature_1_loc, type='CDS', id='1')
    seq_record.features.append(feature_1)

    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(set([]),
                     set(find_features_starting_at(0, seq_record)))
    self.assertEqual(
        set([feature_1]),
        set(find_features_starting_at(feature_1_loc.start, seq_record)))
    self.assertEqual(
        set(),
        set(find_features_starting_at(feature_1_loc.end, seq_record,
                                      ['misc'])))
def bed2SeqFeature(bed1, expcount1, bed2, expcount2, window):
    """Pair up two BED tracks and return coloured features for drawing.

    Both input lists are sorted in place and walked in lockstep; coordinates
    come from ``bed1``. For each pair:

    - a positive log2 on the ``bed2`` count means the region is left out,
    - otherwise a positive log2 on the ``bed1`` count colours it "orange",
    - otherwise it is coloured "darkgrey".

    Returns a list of ``(SeqFeature, color)`` tuples.
    """
    bed1.sort()
    bed2.sort()

    marked = []
    for (start, end, count_a), (_start_b, _end_b, count_b) in zip(bed1, bed2):
        if get_log2(start, end, count_b, expcount2, window) > 0:
            # No colour assigned for these regions, so nothing is drawn.
            continue
        if get_log2(start, end, count_a, expcount1, window) > 0:
            shade = "orange"
        else:
            shade = "darkgrey"
        marked.append((SeqFeature(FeatureLocation(start, end)), shade))
    return marked
def extract_target(self):
    """Fill the ``target`` column of ``self.inter_df`` with extracted sequence.

    For every row, the genome for ``row['chr']`` is looked up and the region
    ``start``..``end`` on the row's strand is extracted and stored as a
    string. A running row counter is printed as progress output. At the end
    ``self.inter_df`` is replaced by a copy of the updated frame.
    """
    frame = self.inter_df
    for counter, (index, row) in enumerate(frame.iterrows(), 1):
        print(counter)

        genome = self.get_genome_by_id(row['chr'])

        sign = row['strand']
        assert sign in ('+', '-'), "strand value incorrect {}".format(sign)
        strand = 1 if sign == '+' else -1

        # Start is shifted by one so the pair follows Python's numbering.
        region = FeatureLocation(row['start'] - 1, row['end'], strand=strand)
        target = SeqFeature(region).location.extract(genome)
        frame.loc[index, 'target'] = str(target.seq)

    self.inter_df = frame.copy()
def _translate_feature(fi, rec, table):
    """Translate feature number ``fi`` of ``rec`` into a protein record.

    Args:
        fi: index of the feature in ``rec.features``.
        rec: the nucleotide SeqRecord holding the feature.
        table: translation table passed to ``Seq.translate``.

    Returns:
        SeqRecord: the translated sequence, without a trailing ``*``, carrying
        one CDS feature that spans the whole protein. Its id is the feature's
        first ``locus_tag``, or ``<rec.id>_f<fi>`` when there is none.

    Raises:
        RuntimeError: if anything goes wrong while translating or building the
            record; the underlying error is printed first.
    """
    f = rec.features[fi]
    srec = f.extract(rec)
    try:
        tsec = srec.seq.translate(table)
        # Drop the terminal stop codon symbol.
        if tsec[-1] == '*':
            tsec = tsec[:-1]
        try:
            fid = f.qualifiers['locus_tag'][0]
        except KeyError:
            fid = '%s_f%d' % (rec.id, fi)
        trec = SeqRecord(tsec, id=fid, name=rec.name,
                         description=rec.description,
                         annotations=rec.annotations)
        pf = SeqFeature(FeatureLocation(0, len(trec)), id=f.id, type='CDS',
                        qualifiers=f.qualifiers)
        trec.features.append(pf)
        # Hand the record back; it was previously built and then discarded.
        return trec
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        print(e)
        raise RuntimeError('Unable to translate: %s' % str(srec.seq))
def make_gbk_from_gff(infile, outfile):
    """Convert the genome returned by ``get_genome(infile)`` to GenBank.

    The record gets a ``source`` feature over the whole sequence, then for
    every parsed feature a ``gene`` feature (locus tag only) followed by a
    ``CDS`` feature carrying ``transl_table``, ``translation`` and
    ``product`` qualifiers. The result is written to ``outfile``.
    """
    genome_id, genome_text, parsed_features = get_genome(infile)
    genome = Seq.Seq(genome_text, IUPAC.unambiguous_dna)
    record = SeqRecord(genome, id=genome_id, name=genome_id)
    record.features.append(
        SeqFeature(FeatureLocation(0, len(genome)), type="source"))

    for cds in parsed_features:
        cds.qualifiers['transl_table'] = [11]

        # NOTE(review): both bounds are shifted by one, so the last base of
        # the feature is not translated - confirm this is intended.
        where = cds.location
        coding = genome[where.start-1:where.end-1]
        if where.strand != 1:
            coding = coding.reverse_complement()
        protein = coding.translate()

        cds.qualifiers['translation'] = [str(protein)]
        cds.type = "CDS"
        cds.qualifiers['product'] = cds.qualifiers['locus_tag']

        # The gene feature shares the location but keeps only the locus tag.
        gene = copy.deepcopy(cds)
        gene.type = 'gene'
        gene.qualifiers = {'locus_tag': cds.qualifiers['locus_tag']}

        record.features.extend([gene, cds])

    SeqIO.write(record, outfile, 'genbank')
def load_annotation(self, gff):
    """Attach features from a GFF file to the loaded sequences.

    Every tab-separated, nine-column line that contains the text ``gene`` and
    whose first column is a key of ``self.sequences`` is appended to that
    sequence's ``features`` as a SeqFeature typed with the GFF type column.

    Comment lines and lines without exactly nine columns are skipped instead
    of aborting the load on a failed unpacking.

    Args:
        gff: path of the GFF file.

    Raises:
        SystemExit: with status 1 when the file cannot be read.
    """
    print("Loading annotation...")
    strand_codes = {'+': 1, '-': -1}
    try:
        with open(gff, 'r') as g:
            for line in g:
                if line.startswith('#') or 'gene' not in line:
                    continue
                fields = line.rstrip().split('\t')
                if len(fields) != 9:
                    continue
                (scaffold, source, gfftype, start, end,
                 score, strand, phase, attributes) = fields
                if scaffold not in self.sequences:
                    continue
                # Any strand symbol other than +/- leaves the strand unset.
                # NOTE(review): start/end are used exactly as written in the
                # file - confirm the numbering convention callers expect.
                location = FeatureLocation(ExactPosition(start),
                                           ExactPosition(end),
                                           strand=strand_codes.get(strand))
                self.sequences[scaffold].features.append(
                    SeqFeature(location, type=gfftype))
    except IOError:
        print("Can't load annotation from file {}!".format(gff))
        # Non-zero status so calling scripts can detect the failure.
        sys.exit(1)
def add_jaggies(contig_seq, offset, gd_contig_features):
    """Add a JAGGY feature for each run of N (or X) in the sequence.

    Matching is case-insensitive and X counts as N. A run is reported once at
    least MIN_GAP_JAGGY consecutive N are found, and the feature covers the
    whole run, shifted by ``offset``. Nothing happens when MIN_GAP_JAGGY is
    falsy.
    """
    if not MIN_GAP_JAGGY:
        # Jaggies are switched off.
        return

    bases = contig_seq.upper().replace("X", "N")
    total = len(bases)
    seed = "N" * MIN_GAP_JAGGY

    cursor = 0
    while cursor < total:
        run_start = bases.find(seed, cursor)
        if run_start == -1:
            return

        # Extend to the end of the run, which may be longer than the seed.
        run_end = run_start
        while run_end < total and bases[run_end] == "N":
            run_end += 1

        gap = SeqFeature(FeatureLocation(offset + run_start, offset + run_end))
        gd_contig_features.add_feature(gap,
                                       sigil="JAGGY",
                                       color=colors.slategrey,
                                       border=colors.black)
        cursor = run_end + 1
def to_biopython_feature(self, feature_type="misc_feature",
                         role="constraint", colors_dict=None,
                         **qualifiers):
    """Return a Biopython feature representing the specification.

    The feature sits at the specification's location. Its qualifiers are the
    given keyword arguments plus ``role``, and - unless supplied by the
    caller - a ``label`` built from the specification and a ``color`` looked
    up by role in ``colors_dict``.
    """
    palette = colors_dict
    if palette is None:
        palette = {"constraint": "#355c87", "objective": "#f9cd60"}

    qualifiers["role"] = role
    # Only compute the defaults the caller did not provide.
    if "label" not in qualifiers:
        qualifiers['label'] = self.label(role=role, with_location=False,
                                         assignment=':')
    if "color" not in qualifiers:
        qualifiers['color'] = palette[role]

    where = self.location.to_biopython_location()
    return SeqFeature(where, type=feature_type, qualifiers=qualifiers)
def organism_iterator(self, organism, seq_map=None):
    """Yield one annotated SeqRecord per stored contig of ``organism``.

    The sequence comes from ``seq_map`` (looked up by contig name) when one is
    given, otherwise from the stored contig. Every stored feature becomes a
    SeqFeature with a ``locus_tag`` qualifier; when a matching Protein is
    found, its description, gene and EC/GO ontology terms are added too.
    """
    for dbcontig in Contig.objects(organism=organism).no_cache():
        raw_seq = str(seq_map[dbcontig.name].seq) if seq_map else dbcontig.seq
        contig = SeqRecord(id=dbcontig.name, seq=Seq(raw_seq))

        for dbfeature in dbcontig.features:
            qualifiers = {"locus_tag": [dbfeature.locus_tag]}

            matches = list(
                Protein.objects(organism=organism, gene=dbfeature.identifier))
            if matches:
                # Only the first matching protein is used.
                protein = matches[0]
                qualifiers["description"] = [protein.description]
                qualifiers["gene_symbol"] = protein.gene
                qualifiers["Note"] = [protein.description]

                ec_terms = [term.upper() for term in protein.ontologies
                            if term.startswith("ec:")]
                go_terms = [term.upper() for term in protein.ontologies
                            if term.startswith("go:")]
                if ec_terms:
                    qualifiers["EC"] = ec_terms
                if go_terms:
                    qualifiers["GO"] = go_terms

            where = FeatureLocation(start=dbfeature.location.start,
                                    end=dbfeature.location.end,
                                    strand=dbfeature.location.strand)
            contig.features.append(
                SeqFeature(id=dbfeature.identifier,
                           type=dbfeature.type,
                           qualifiers=qualifiers,
                           location=where))
        yield contig