def create_feature_annot(loc_range, featuretype, s):
    """Build a SeqFeature spanning ``loc_range`` with the given type and strand.

    Args:
        loc_range: Indexable pair; item 0 is the start, item 1 the end.
        featuretype: Value stored as the feature's ``type``.
        s: Strand value handed to ``SeqFeature.SeqFeature``.

    Returns:
        The newly created ``SeqFeature.SeqFeature``.
    """
    begin = SeqFeature.ExactPosition(loc_range[0])
    finish = SeqFeature.ExactPosition(loc_range[1])
    span = SeqFeature.FeatureLocation(begin, finish)
    return SeqFeature.SeqFeature(span, type=featuretype, strand=s)
def modify_genbank(gb_file, fasta_file):
    """Write a copy of a GenBank file with updated gene/CDS annotations.

    The genome read from ``fasta_file`` is passed to
    ``get_final_annotations``; the result is looked up by locus tag and its
    ``"start"``, ``"function"`` and ``"translation"`` entries are used to
    rewrite matching features. Gene/CDS features whose locus tag is not in
    the final annotations are dropped; all other features are kept as-is.

    Args:
        gb_file: Path of the input GenBank file. It must match
            ``.../users/.../uploads/<name>.<ext>``.
        fasta_file: Path of a single-record FASTA file with the genome.

    Returns:
        Path of the written file, ``<name>_modified.<ext>``.

    Raises:
        ValueError: If ``gb_file`` does not match the expected path layout.
    """
    # The extension separator must be a literal dot anchored at the end of
    # the path: with an unescaped "." the greedy prefix swallowed the
    # extension and the output name became e.g. "file.g_modified.".
    path_match = re.search(r'(.*/users/.*/uploads/.*)\.(\w*)$', gb_file)
    if path_match is None:
        raise ValueError(
            "Unexpected GenBank upload path: %r" % (gb_file,))
    out_file = path_match.group(1) + '_modified.' + path_match.group(2)

    genome = SeqIO.read(fasta_file, "fasta").seq
    final_annotations = get_final_annotations(genome)

    updated_records = []
    with open(gb_file, "r") as gb_handle:
        for record in SeqIO.parse(gb_handle, "genbank"):
            # Fresh list per record so features never leak between records.
            final_features = []
            for feature in record.features:
                if feature.type in ("gene", "CDS"):
                    locus_tag = feature.qualifiers["locus_tag"][0]
                    if locus_tag not in final_annotations:
                        # Gene/CDS absent from the final annotations: drop it.
                        # NOTE(review): nesting of the original "else:
                        # continue" was reconstructed from collapsed source;
                        # confirm it belongs to the locus_tag check.
                        continue
                    annotation = final_annotations[locus_tag]
                    # Annotation starts are 1-based; locations are 0-based.
                    feature.location = SeqFeature.FeatureLocation(
                        SeqFeature.ExactPosition(annotation["start"] - 1),
                        SeqFeature.ExactPosition(
                            feature.location.end.position),
                        feature.location.strand)
                    if feature.type == "CDS":
                        feature.qualifiers["product"][0] = annotation[
                            "function"]
                        feature.qualifiers["translation"][0] = annotation[
                            "translation"]
                final_features.append(feature)
            record.features = final_features
            updated_records.append(record)

    # Write every parsed record (previously only the last one survived).
    with open(out_file, "w") as new_gb:
        SeqIO.write(updated_records, new_gb, "genbank")
    return out_file
def convert_annotations(block, gb):
    """Append one SeqFeature to ``gb.features`` per annotation of ``block``.

    Annotation keys other than the reserved ones become qualifiers,
    ``color`` is stored as the ``GC_Color`` qualifier, and entries under
    ``notes["genbank"]`` become qualifiers too, except ``type`` which
    overrides the feature type. A location is only set when the annotation
    has a ``start``; its end is ``annotation["end"] + 1``.

    Args:
        block: Dict providing ``block["sequence"]["annotations"]``.
        gb: Object whose ``features`` list receives the new features.
    """
    reserved = ["start", "end", "notes", "strand", "color", "role"]
    # Add My annotations as features
    for annot in block["sequence"]["annotations"]:
        feat = SeqFeature.SeqFeature()
        has_role = "role" in annot and annot["role"] != ""
        feat_type = annot["role"] if has_role else "unknown"

        for name, content in annot.iteritems():
            if name == "color":
                feat.qualifiers["GC_Color"] = content
            elif name == "notes":
                for notes_name, notes_content in annot["notes"].iteritems():
                    if notes_name != "genbank":
                        continue
                    for gb_name, gb_content in notes_content.iteritems():
                        if gb_name == "type":
                            feat_type = gb_content
                        else:
                            feat.qualifiers[gb_name] = gb_content
            elif name not in reserved:
                feat.qualifiers[name] = content

        if "start" in annot:
            on_reverse = "strand" in annot and annot["strand"] == -1
            feat.location = SeqFeature.FeatureLocation(
                annot["start"], annot["end"] + 1, -1 if on_reverse else 1)

        feat.type = feat_type
        gb.features.append(feat)
def read_reference(fname, genemap):
    """Load a reference sequence and translate the regions listed in a gene map.

    Args:
        fname: Path of the reference. It is read as a single-record FASTA
            file; if that fails, the file's stripped lines are concatenated
            instead.
        genemap: Path of a tab-separated file. Lines starting with ``#`` and
            blank lines are skipped. Columns 4 and 5 hold the start and end
            (start is converted with ``start - 1``), column 7 the strand
            (``-`` means reverse) and column 9 ``;``-separated
            ``key value`` attributes.

    Returns:
        ``{"nuc": <reference string>, "translations": {name: protein}}``
        where ``name`` is the unquoted ``gene_name`` attribute, or ``None``
        for rows without one (later rows overwrite earlier ones).
    """
    try:
        ref = str(SeqIO.read(fname, 'fasta').seq)
    except Exception:
        # Not parseable as one FASTA record: treat the file as raw sequence
        # text. (Was a bare "except:", which also hid KeyboardInterrupt.)
        with open(fname, 'r') as fh:
            ref = "".join([x.strip() for x in fh])

    translations = {}
    with open(genemap, 'r') as fh:
        for line in fh:
            if not line.strip() or line[0] == '#':
                continue
            entries = [x.strip() for x in line.strip().split('\t')]
            start = int(entries[3])
            end = int(entries[4])
            strand = entries[6]

            attributes = {}
            for fragment in entries[8].split(';'):
                words = fragment.split()
                if not words:
                    # A trailing ";" leaves an empty fragment behind.
                    continue
                attributes[words[0]] = ' '.join(words[1:])

            if 'gene_name' in attributes:
                name = attributes['gene_name'].strip('"')
            else:
                name = None

            location = SeqFeature.FeatureLocation(
                start - 1, end, strand=-1 if strand == '-' else 1)
            translation = Seq.translate(
                SeqFeature.SeqFeature(location).extract(ref))
            translations[name] = str(translation)

    return {"nuc": ref, "translations": translations}
def create_feature(hit, end):
    '''
    Creates a feature from the hit, setting colours based on orientation
    and noting whether it is a left or right end hit.

    Args:
        hit: Indexable record; items 1 and 2 are converted with int() and
            used as the feature start and stop.
        end: 'five' for a left-end hit or 'three' for a right-end hit.

    Returns:
        The new SeqFeature.

    Raises:
        ValueError: If ``end`` is neither 'five' nor 'three' (previously
            this surfaced as an UnboundLocalError on ``feat_type``).
    '''
    # Annotate with end information and colour accordingly.
    # NOTE(review): the spellings are inconsistent ('left_end'/'left end'
    # versus 'right end'/'right_end'); they are kept unchanged because
    # downstream consumers may rely on the exact strings - confirm.
    if end == 'five':
        quals = {'colour': '2', 'end': 'left_end'}
        feat_type = 'left end'
    elif end == 'three':
        quals = {'colour': '7', 'end': 'right end'}
        feat_type = 'right_end'
    else:
        raise ValueError("end must be 'five' or 'three', got %r" % (end,))

    # Set up coordinates
    start = int(hit[1])
    stop = int(hit[2])
    location = SeqFeature.FeatureLocation(start, stop)

    # Create feature
    return SeqFeature.SeqFeature(location, type=feat_type, qualifiers=quals)
def _parse_feature(element):
    """Convert a feature XML element into a SeqFeature on the parsed record.

    All XML attributes become qualifiers; ``type`` and ``id`` are also
    copied onto the feature. A ``location`` child sets the feature
    location, either from a single ``position`` child or from a
    ``begin``/``end`` pair. The text of every other child is stored as a
    qualifier named after its tag with the namespace removed.

    Args:
        element: The XML element describing the feature.
    """
    feature = SeqFeature.SeqFeature()
    for k, v in element.attrib.items():
        feature.qualifiers[k] = v
    feature.type = element.attrib.get('type', '')
    if 'id' in element.attrib:
        feature.id = element.attrib['id']
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, while plain iteration yields the same children everywhere.
    for feature_element in element:
        if feature_element.tag == NS + 'location':
            position_elements = feature_element.findall(NS + 'position')
            if position_elements:
                position = position_elements[0]
                # -1 turns the start into a Python-style offset.
                start_position = _parse_position(position, -1)
                end_position = _parse_position(position)
            else:
                begin = feature_element.findall(NS + 'begin')[0]
                start_position = _parse_position(begin, -1)
                finish = feature_element.findall(NS + 'end')[0]
                end_position = _parse_position(finish)
            feature.location = SeqFeature.FeatureLocation(
                start_position, end_position)
        else:
            try:
                feature.qualifiers[feature_element.tag.replace(
                    NS, '')] = feature_element.text
            except Exception:
                # Best effort: skip unparsable tag (no longer a bare except).
                pass
    self.ParsedSeqRecord.features.append(feature)
def add_feature(self, search_str, label):
    """Label every occurrence of a search string, on either strand.

    Forward-strand matches get strand 1 and matches of the reverse
    complement get strand -1. Nothing is added when there is no match.
    Does not label features that wrap around circular sequence.

    NOTE(review): ``search_str`` is passed to ``re.finditer`` unescaped, so
    regex metacharacters are interpreted; confirm whether a strictly
    literal match is intended.

    Args:
        search_str (str): string representing feature
        label (str): feature name to display
    """
    searches = [(search_str, 1)]
    if search_str.lower() != dna.revc(search_str.lower()):
        # Don't label palindromes twice
        searches.append((dna.revc(search_str), -1))

    # The strand is paired with each match explicitly. The previous
    # izip_longest(matches, [1]) padded with None after the first match, so
    # later matches were created without a strand.
    for pattern, strand in searches:
        for m in re.finditer(pattern, str(self.seq), re.IGNORECASE):
            feature = SeqFeature.SeqFeature(
                SeqFeature.FeatureLocation(m.start(), m.end(), strand),
                'misc_feature')
            # next() works for both Python 2 and Python 3 iterators.
            color = next(self.colors)
            feature.qualifiers = {
                'label': [label],
                'ApEinfo_fwdcolor': [color],
                'ApEinfo_revcolor': [color],
                'ApEinfo_graphicformat':
                ['arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0']
            }
            self.features.append(feature)
def _parse_feature(element):
    """Turn a feature XML element into a SeqFeature on the parsed record.

    XML attributes are copied into the qualifiers (``type`` and ``id`` also
    onto the feature itself). A ``location`` child provides the location;
    any other child contributes its text as a qualifier keyed by its tag
    without the namespace.
    """
    feature = SeqFeature.SeqFeature()
    feature.qualifiers.update(element.attrib.items())
    feature.type = element.attrib.get("type", "")
    if "id" in element.attrib:
        feature.id = element.attrib["id"]

    for child in element:
        if child.tag != NS + "location":
            try:
                qualifier_name = child.tag.replace(NS, "")
                feature.qualifiers[qualifier_name] = child.text
            except Exception:  # TODO - Which exceptions?
                pass  # skip unparsable tag
            continue

        positions = child.findall(NS + "position")
        if positions:
            # A single position serves as both start and end.
            start_position = _parse_position(positions[0], -1)
            end_position = _parse_position(positions[0])
        else:
            start_position = _parse_position(
                child.findall(NS + "begin")[0], -1)
            end_position = _parse_position(child.findall(NS + "end")[0])
        feature.location = SeqFeature.FeatureLocation(
            start_position, end_position)

    self.ParsedSeqRecord.features.append(feature)
def _retrieve_reference(adaptor, primary_id): # XXX dbxref_qualifier_value refs = adaptor.execute_and_fetchall( "SELECT start_pos, end_pos, " " location, title, authors," " dbname, accession" " FROM bioentry_reference" " JOIN reference USING (reference_id)" " LEFT JOIN dbxref USING (dbxref_id)" " WHERE bioentry_id = %s" " ORDER BY rank", (primary_id,)) references = [] for start, end, location, title, authors, dbname, accession in refs: reference = SeqFeature.Reference() # If the start/end are missing, reference.location is an empty list if (start is not None) or (end is not None): if start is not None: start -= 1 # python counting reference.location = [SeqFeature.FeatureLocation(start, end)] # Don't replace the default "" with None. if authors: reference.authors = authors if title: reference.title = title reference.journal = location if dbname == 'PUBMED': reference.pubmed_id = accession elif dbname == 'MEDLINE': reference.medline_id = accession references.append(reference) if references: return {'references': references} else: return {}
def make_genbank_recs(rec):
    """Add a CDS feature to ``rec`` for every protein predicted on it.

    Proteins are taken from the module-level ``protein_recs`` and selected
    by id prefix ``<rec.id>_``. Each protein description is expected to be
    ``' # '``-separated with the start, end and strand in fields 1-3.

    NOTE(review): the header layout matches Prodigal output, whose
    coordinates are 1-based and inclusive; the start is therefore shifted by
    one to obtain a 0-based location. Confirm the producer of
    ``protein_recs``.

    Args:
        rec: Record to annotate. It is modified in place.

    Returns:
        The same record object, with the CDS features appended.
    """
    new_rec = rec  # alias, not a copy: features are appended to the input
    prefix = new_rec.id + '_'
    for protein_rec in protein_recs:
        if not protein_rec.id.startswith(prefix):
            continue
        # Parse the header once instead of re-splitting it per field.
        fields = protein_rec.description.split(' # ')
        start = int(fields[1])
        end = int(fields[2])
        strand = int(fields[3])

        # 1-based inclusive start -> 0-based start; the inclusive end equals
        # the exclusive 0-based end, so it is used unchanged.
        rec_location = FeatureLocation(
            SeqFeature.ExactPosition(start - 1),
            SeqFeature.ExactPosition(end))
        rec_feature = SeqFeature.SeqFeature(rec_location,
                                            type="CDS",
                                            strand=strand)

        #Add ORF name without genome ID
        rec_feature.qualifiers['protein_id'] = protein_rec.id
        rec_feature.qualifiers['translation'] = protein_rec.seq
        rec_feature.qualifiers['locus_tag'] = protein_rec.description
        new_rec.features.append(rec_feature)
    return new_rec
def createFEATUREannot(loc_range, featuretype, s):
    """Return a SeqFeature whose ExactPosition bounds come from ``loc_range``.

    ``loc_range[0]`` is the start and ``loc_range[1]`` the end;
    ``featuretype`` and ``s`` are passed on as the feature type and strand.
    """
    bounds = [SeqFeature.ExactPosition(loc_range[i]) for i in (0, 1)]
    location = SeqFeature.FeatureLocation(bounds[0], bounds[1])
    new_feature = SeqFeature.SeqFeature(location,
                                        type=featuretype,
                                        strand=s)
    return new_feature
def ins_tag(self,tag_seq,protease_seq,ins_name,ins_sites,side=5):
    """Insert a tag plus protease site into ``self.record`` between two cut points.

    NOTE(review): the nesting in this method was reconstructed from a
    whitespace-collapsed source; confirm the block structure (in particular
    what the ``ins_sites`` guard covers) against the original file.

    Args:
        tag_seq: Tag sequence; concatenated with ``protease_seq`` via ``+``.
        protease_seq: Protease-site sequence.
        ins_name: Stored as the ``note`` qualifier of the new "tag" feature.
        ins_sites: Pair of cut points; the record is rebuilt from
            ``self.record[:ins_sites[0]]`` and ``self.record[ins_sites[1]:]``.
        side: 5 places the tag before the protease site, 3 places it after.
    """
    #the cutpoint is after ins_sites[0] bp and after ins_sites[1] bp
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    # Locate the feature whose first note starts with "mcs" (case-insensitive)
    # and remember its bounds and qualifiers; the last such feature wins.
    for feature in self.record.features:
        if feature.qualifiers.has_key("note")\
            and re.search(r"^mcs",feature.qualifiers["note"][0],re.I):
            mcs_start=int(str(feature.location.start))
            mcs_end=int(str(feature.location.end))
            mcs_qualifiers=feature.qualifiers
    # Only when both cut points lie strictly inside the MCS is a resized
    # "mcs" feature prepared: its end moves by the inserted length minus the
    # length of the replaced stretch.
    # NOTE(review): with this nesting, f_mcs is undefined when the guard is
    # false and the append near the end would raise NameError - confirm.
    if ins_sites[0]>mcs_start and ins_sites[1]<mcs_end:
        f_mcs=SeqFeature(FeatureLocation(mcs_start, mcs_end+ins_sites[0]-ins_sites[1]+len(tag_seq+protease_seq)),type="mcs")
        f_mcs.qualifiers=mcs_qualifiers
    # Build the insert record and a "tag" feature covering the tag part only.
    # Any other value of ``side`` leaves ins_record and f undefined.
    if side==5:
        ins_record=SeqRecord(Seq(tag_seq+protease_seq,IUPAC.ambiguous_dna))
        f=SeqFeature(FeatureLocation(0,len(tag_seq)),type="tag")
    elif side==3:
        ins_record=SeqRecord(Seq(protease_seq+tag_seq,IUPAC.ambiguous_dna))
        f=SeqFeature(FeatureLocation(len(protease_seq),len(protease_seq+tag_seq)),type="tag")
    f.qualifiers["note"]=[ins_name,]
    ins_record.features=[f]
    # The name is saved and restored around the record concatenation.
    old_name=self.record.name
    self.record=self.record[:ins_sites[0]]+ins_record+self.record[ins_sites[1]:]
    self.record.name=old_name
    self.whole_len=len(self.record)
    self.record.features.append(f_mcs)
    # Keep the feature list ordered by start coordinate.
    self.record.features=sorted(self.record.features,key=lambda x:int(str(x.location.start)))
def _make_position(location_string, offset=0):
    """Turn a Swiss location position into a SeqFeature position object (PRIVATE).

    An offset of -1 is used with a start location to make it pythonic.
    Plain integers give an ExactPosition, ``"?"`` an UnknownPosition, and
    the prefixes ``<``, ``>`` and ``?`` give Before-, After- and
    UncertainPosition respectively. Resulting coordinates are clamped at 0.

    Raises:
        NotImplementedError: If the string matches none of these forms.
    """
    if location_string == "?":
        return SeqFeature.UnknownPosition()
    # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0.
    try:
        return SeqFeature.ExactPosition(max(0, offset + int(location_string)))
    except ValueError:
        pass

    fuzzy_kinds = (
        ("<", "BeforePosition"),
        (">", "AfterPosition"),  # e.g. ">13"
        ("?", "UncertainPosition"),  # e.g. "?22"
    )
    for marker, class_name in fuzzy_kinds:
        if not location_string.startswith(marker):
            continue
        try:
            return getattr(SeqFeature, class_name)(
                max(0, offset + int(location_string[1:])))
        except ValueError:
            # Prefix recognised but the remainder is not a number.
            break
    raise NotImplementedError("Cannot parse location '%s'" % location_string)
def al_string2feat(queryseq, ampsdict):  #lib5pr is subjectseq; t7 is queryseq
    '''
    Wrap the subject sequence named by a hit in a SeqRecord and mark where
    the query lies on it.

    queryseq[0] is the query record; queryseq[1] holds the subject key
    (looked up in ampsdict) followed by the hit start and end. A hit with
    start <= end adds a "cutsite_fwd" feature on strand +1, a hit with
    start > end adds a "cutsite_rev" feature on strand -1.

    Returns the SeqRecord carrying the new feature.
    '''
    hit = queryseq[1]
    subjectseq = SeqRecord(ampsdict[hit[0]])
    locstart = hit[1]
    locend = hit[2]

    # Figure out which strand the BLAST hit is on
    if locstart <= locend:
        start = ExactPosition(int(locstart))
        end = ExactPosition(int(locstart + len(queryseq[0].seq) + 1))
        subjectseq.features.append(
            SeqFeature(FeatureLocation(start, end),
                       type=str("cutsite_fwd"),
                       strand=+1))
    if locstart > locend:
        # Reverse hits are anchored on the smaller coordinate (the hit end).
        start = ExactPosition(int(locend))
        end = ExactPosition(start + len(queryseq[0].seq))
        subjectseq.features.append(
            SeqFeature(FeatureLocation(start, end),
                       type=str("cutsite_rev"),
                       strand=-1))
    return subjectseq
def _get_feature(self, feature_dict):
    """Rebuild a Biopython feature from its dictionary form.

    Reads the ``location`` (unpacked into FeatureLocation), ``type``,
    ``id``, ``strand`` and ``quals`` entries of ``feature_dict``.
    """
    span = SeqFeature.FeatureLocation(*feature_dict['location'])
    rebuilt = SeqFeature.SeqFeature(
        span,
        feature_dict['type'],
        id=feature_dict['id'],
        strand=feature_dict['strand'])
    rebuilt.qualifiers = feature_dict['quals']
    return rebuilt
def _make_seqfeature(name, from_res, to_res, description, ft_id):
    """Construct SeqFeature from feature data from parser (PRIVATE).

    ``from_res`` is converted with an offset of -1 and ``to_res`` with an
    offset of 0; a falsy ``ft_id`` is replaced by ``"<unknown id>"``.
    """
    start = _make_position(from_res, -1)
    end = _make_position(to_res, 0)
    # "<unknown id>" is the default in SeqFeature object
    feature_id = ft_id if ft_id else "<unknown id>"
    return SeqFeature.SeqFeature(
        SeqFeature.FeatureLocation(start, end),
        type=name,
        id=feature_id,
        qualifiers={"description": description})
def create_feature(sequence, name, start, end, strand=+1):
    """Append a feature named ``name`` to ``sequence.features``.

    Nothing is added when ``str(name)`` is empty or ``int(end)`` is 0.
    A start of 0 is accepted: the start is passed straight into
    ``FeatureLocation``, so 0 is the first base, and the previous
    truthiness test on ``int(start)`` silently dropped such features.

    Args:
        sequence: Object whose ``features`` list receives the new feature.
        name: Used as the feature type.
        start: Feature start; must be convertible with int().
        end: Feature end; must be convertible with int().
        strand: Strand stored on the location (default +1).

    Returns:
        None.
    """
    if not str(name):
        return
    # Conversion is still performed purely as validation, so non-numeric
    # coordinates raise exactly as before.
    int(start)
    if not int(end):
        return
    my_feature_location = SeqFeature.FeatureLocation(start,
                                                     end,
                                                     strand=strand)
    my_feature = SeqFeature.SeqFeature(my_feature_location, type=name)
    sequence.features.append(my_feature)
def add_features(block, allblocks, gb, start):
    """Recursively add ``block`` and its components to ``gb.features``.

    Filler blocks add nothing and only advance the position by their
    sequence length. A block with a ``current_option`` is replaced by that
    option. Otherwise the block's annotations, then its children, then the
    block itself are added.

    Args:
        block: Block dict to convert.
        allblocks: List of all block dicts, searched by ``id``.
        gb: Record whose ``features`` list is extended.
        start: Position at which this block begins.

    Returns:
        The position at which this block ends.
    """
    def lookup(block_id):
        return [b for b in allblocks if b["id"] == block_id][0]

    # Disregard fillers... don't create features for them
    if is_filler(block):
        return start + block["sequence"]["length"]

    # For handling list blocks!
    if "current_option" in block:
        return add_features(lookup(block["current_option"]), allblocks, gb,
                            start)

    # Add Myself as a feature
    feature = SeqFeature.SeqFeature()
    metadata = block["metadata"]

    # Set the type based on the original type or the role type
    if "genbank" in metadata and "type" in metadata["genbank"]:
        feature.type = metadata["genbank"]["type"]
    else:
        role = None
        if "rules" in block and "role" in block["rules"]:
            role = block["rules"]["role"]
        if role is None or role == "":
            feature.type = "misc_feature"
        else:
            feature.type = role

    # Set up the location of the feature
    feature_strand = metadata["strand"] if "strand" in metadata else 1

    # And copy all the other qualifiers that came originally from genbank
    if "genbank" in metadata:
        for qualifier, content in metadata["genbank"].iteritems():
            if qualifier in ["name_source", "note"]:
                continue
            feature.qualifiers[qualifier] = content

    convert_block_name(feature, block)
    add_GC_info(feature, block, allblocks)
    convert_annotations(block, gb, start)

    # Add my children as features
    cursor = start
    for child_id in block["components"]:
        cursor = add_features(lookup(child_id), allblocks, gb, cursor)

    if cursor != start:
        # The end is where the last child ended...
        end = cursor
    elif "sequence" in block:
        # No children, look at the block's length
        end = start + block["sequence"]["length"]
    else:
        end = start

    feature.location = SeqFeature.FeatureLocation(start,
                                                  end,
                                                  strand=feature_strand)
    gb.features.append(feature)
    return end
def spacersonly(seqs):
    """Generator yielding target records cut out next to "GTTTAAGAG" hits.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; in particular the placement of the quality
    filter, the ``yield`` and the trailing ``break`` should be confirmed
    against the original file.

    Args:
        seqs: Iterator of records, advanced with ``seqs.next()``. The
            records are sliced, get features appended, and must provide
            ``letter_annotations["phred_quality"]``.

    Yields:
        Sliced records that contain no quality score below 30 and are
        longer than 10.
    """
    sgRNAconst = SeqRecord(Seq("GTTTAAGAG"))
    while True:
        # Ends when seqs.next() raises at the end of the input.
        seqrecord = seqs.next()
        #for seqrecord in seqs:
        fwdlocs = []
        revlocs = []
        # Start offsets of the constant sequence on the forward strand.
        fwdlocs = [
            tloc.start()
            for tloc in re.finditer(str(sgRNAconst.seq), str(seqrecord.seq))
        ]
        for item in fwdlocs:
            start = ExactPosition(int(item) + 1)
            end = ExactPosition(int((item) + len(sgRNAconst)))
            location = FeatureLocation(start, end)
            feature = SeqFeature(location, type="sgRNAconst", strand=+1)
            seqrecord.features.append(feature)
        # Same search for the reverse complement of the constant sequence.
        revlocs = [
            tloc.start() for tloc in re.finditer(
                str(sgRNAconst.reverse_complement().seq), str(seqrecord.seq))
        ]
        for item in revlocs:
            start = ExactPosition(int(item) + 1)
            end = ExactPosition(start + len(sgRNAconst) - 1)
            location = FeatureLocation(start, end)
            feature = SeqFeature(location, type="sgRNAconst", strand=-1)
            seqrecord.features.append(feature)
        for feat in seqrecord.features:
            if feat.strand == 1:
                # Take the stretch just before a forward-strand hit.
                tgtstart = int(feat.location.start) - 36  # -21
                tgtend = int(feat.location.start) - 1
                sgtgt = seqrecord[tgtstart:tgtend]
                #yield sgtgt
                #alltgts.append(sgtgt)
                #print "pos \n \n"
            if feat.strand == -1:
                # Take the stretch just after a reverse-strand hit and flip it.
                tgtend = int(feat.location.end) + 36  # +21
                tgtstart = int(feat.location.end)
                sgtgt = seqrecord[tgtstart:tgtend].reverse_complement()
                sgtgt.name = seqrecord.name
                #yield sgtgt
                #alltgts.append(sgtgt)
            bad = 0
            # Drop everything up to and including the first "ACTCACTATAG";
            # when it is absent l[0] raises and the target is kept whole.
            try:
                l = [
                    tloc.end()
                    for tloc in re.finditer("ACTCACTATAG", str(sgtgt.seq))
                ]
                sgtgt = sgtgt[int(l[0]):]
            except:
                None
            # Any base with quality below 30 rejects the target.
            for score in sgtgt.letter_annotations["phred_quality"]:
                if score < 30:
                    bad = 1
            if bad == 0 and len(sgtgt) > 10:
                yield sgtgt
                # Stop scanning this record's features after the first
                # accepted target.
                break
def translateFeatureLocation(location, region, translation=0):
    """Shift ``location`` by ``translation + 1`` and clip it to ``region``.

    Args:
        location: Location object supporting ``+`` with integers.
        region: Object whose ``end`` caps the end of the result.
        translation: Extra shift added on top of the fixed +1.

    Returns:
        A FeatureLocation with its start clamped at 0 and its end capped at
        ``region.end``. If the shifted end is negative, an empty
        ``0..0`` location with strand 0 is returned and a debug message is
        logged.
    """
    shifted = location + translation + 1
    if shifted.end < 0:
        logging.debug('Error-prone feature detected: {}'.format(shifted))
        return SeqFeature.FeatureLocation(start=0, end=0, strand=0)

    clipped_start = max(0, shifted.start)
    clipped_end = min(shifted.end, region.end)
    return SeqFeature.FeatureLocation(start=clipped_start,
                                      end=clipped_end,
                                      strand=shifted.strand)
def addFeatureComplSTF():
    """Append a reverse-strand feature for the current match ``m``.

    Uses ``m``, ``seqLength``, ``feature``, ``featureName`` and
    ``newRecord`` from the enclosing scope. A match ending within
    ``seqLength`` becomes a simple location. A match running past
    ``seqLength`` becomes a two-part compound location: the tail of the
    sequence plus the overhang at the beginning.
    """
    if m.end() <= seqLength:
        location = FeatureLocation(m.start(), m.end(), strand=-1)
    else:
        # The overhang covers the first (m.end() - seqLength) bases, i.e.
        # 0-based [0, m.end() - seqLength). The previous bounds
        # (1, seqLength - m.end()) always had a negative end in this branch.
        location = CompoundLocation([
            FeatureLocation(m.start(), seqLength, strand=-1),
            FeatureLocation(0, m.end() - seqLength, strand=-1)
        ])
    newFeature = SeqFeature(location, type=str(feature))
    newFeature.qualifiers['note'] = featureName
    newRecord.features.append(newFeature)
def test_GenerateFeatLoc__make_start_fuzzy__1(self):
    ''' Check that `GenerateFeatLoc.make_start_fuzzy` returns a
        FeatureLocation whose start has become a BeforePosition when given
        an exact 5..9 location. '''
    from Bio import SeqFeature
    location_object = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(5), SeqFeature.ExactPosition(9))
    generator = GnOps.GenerateFeatLoc()
    out = generator.make_start_fuzzy(location_object)
    # FeatureLocation
    self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
    # Fuzzy Start
    self.assertIsInstance(out.start, Bio.SeqFeature.BeforePosition)
def contig_info(contig_id, contig_seq, species_informations):
    """Build the SeqRecord of one contig from the species information.

    Optional keys of ``species_informations`` are copied into the record
    annotations or into the qualifiers of a ``source`` feature spanning the
    whole contig; ``description`` is required.

    Args:
        contig_id: Used as record id, name, accession and scaffold qualifier.
        contig_seq: Sequence of the contig.
        species_informations: Dict describing the species.

    Returns:
        The populated SeqRecord.
    """
    record = SeqRecord(contig_seq,
                       id=contig_id,
                       name=contig_id,
                       description=species_informations['description'],
                       annotations={"molecule_type": "DNA"})
    if IUPAC:
        record.seq.alphabet = IUPAC.ambiguous_dna

    annotations = record.annotations
    if 'data_file_division' in species_informations:
        annotations['data_file_division'] = species_informations[
            'data_file_division']
    today = datetime.date.today()
    annotations['date'] = today.strftime('%d-%b-%Y').upper()
    if 'topology' in species_informations:
        annotations['topology'] = species_informations['topology']
    annotations['accessions'] = contig_id
    for key in ('organism', 'taxonomy', 'keywords', 'source'):
        if key in species_informations:
            annotations[key] = species_informations[key]

    # Source feature covering the contig from its first to its last base.
    source_location = sf.FeatureLocation(0, len(contig_seq))
    source_feature = sf.SeqFeature(source_location, type="source")
    source_feature.qualifiers['scaffold'] = contig_id
    # db_xref corresponds to the taxon NCBI ID.
    # Important if you want to use Pathway Tools after.
    for key in ('isolate', 'db_xref', 'cell_type', 'dev_stage', 'mol_type'):
        if key in species_informations:
            source_feature.qualifiers[key] = species_informations[key]

    record.features.append(source_feature)
    return record
def _parse_dbReference(element):
    """Record a dbReference element as ``type:id`` in the record's dbxrefs.

    EMBL references additionally contribute an ``EMBL-CDS:<value>`` entry
    for each "protein sequence ID" property. PDB references are parsed into
    per-chain SeqFeature objects, but those are currently not attached to
    the record (the append is commented out).
    """
    self.ParsedSeqRecord.dbxrefs.append(element.attrib["type"] + ":" +
                                        element.attrib["id"])
    if "type" in element.attrib:
        # <dbReference type="EMBL" id="U96180">
        #     <property type="protein sequence ID" value="AAB66902.1"/>
        #     <property type="molecule type" value="mRNA"/>
        # </dbReference>
        if element.attrib["type"] == "EMBL":
            for ref_element in element:
                if "type" in ref_element.attrib and "value" in ref_element.attrib:
                    if ref_element.attrib[
                            "type"] == "protein sequence ID":
                        self.ParsedSeqRecord.dbxrefs.append(
                            "EMBL-CDS:" + ref_element.attrib["value"])
        # e.g.
        # <dbReference type="PDB" key="11" id="2GEZ">
        #     <property value="X-ray" type="method"/>
        #     <property value="2.60 A" type="resolution"/>
        #     <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
        # </dbReference>
        elif element.attrib["type"] == "PDB":
            # method/resolution are only filled in if their properties come
            # before the "chains" property in document order.
            method = ""
            resolution = ""
            for ref_element in element:
                if ref_element.tag == NS + "property":
                    dat_type = ref_element.attrib["type"]
                    if dat_type == "method":
                        method = ref_element.attrib["value"]
                    if dat_type == "resolution":
                        resolution = ref_element.attrib["value"]
                    if dat_type == "chains":
                        # "A/C/E/G=1-192, B/D/F/H=193-325" -> one entry per
                        # comma-separated "chains=start-end" group.
                        pairs = ref_element.attrib["value"].split(",")
                        for elem in pairs:
                            pair = elem.strip().split("=")
                            if pair[1] != "-":
                                # TODO - How best to store these, do SeqFeatures make sense?
                                feature = SeqFeature.SeqFeature()
                                feature.type = element.attrib["type"]
                                feature.qualifiers[
                                    "name"] = element.attrib["id"]
                                feature.qualifiers["method"] = method
                                feature.qualifiers[
                                    "resolution"] = resolution
                                feature.qualifiers["chains"] = pair[
                                    0].split("/")
                                # Range start is shifted by -1; the end is
                                # used as given.
                                start = int(pair[1].split("-")[0]) - 1
                                end = int(pair[1].split("-")[1])
                                feature.location = SeqFeature.FeatureLocation(
                                    start, end)
                                # self.ParsedSeqRecord.features.append(feature)
    # Placeholder loop: property children are visited but nothing is stored.
    for ref_element in element:
        if ref_element.tag == NS + "property":
            pass  # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs
def _get_feature(self, feature_dict):
    """Rebuild a Biopython feature from its dictionary form.

    Reads the ``location``, ``type``, ``id``, ``strand`` and ``quals``
    entries of ``feature_dict`` and guarantees that the result has a
    ``sub_features`` attribute.
    """
    span = SeqFeature.FeatureLocation(*feature_dict['location'])
    rebuilt = SeqFeature.SeqFeature(span,
                                    feature_dict['type'],
                                    id=feature_dict['id'],
                                    strand=feature_dict['strand'])
    # Support for Biopython 1.68 and above, which removed sub_features
    try:
        rebuilt.sub_features
    except AttributeError:
        rebuilt.sub_features = []
    rebuilt.qualifiers = feature_dict['quals']
    return rebuilt
def _trans_loc(loc):
    """Convert a ``[contig, position, direction, length]`` list into a
    FeatureLocation.

    ``loc[0]`` is overwritten with None in place when it equals
    ``current_contig_id``.
    """
    # Don't write the contig ID in the loc line unless it's trans-spliced
    if loc[0] == current_contig_id:
        loc[0] = None

    if loc[2] == "-":
        begin, finish, strand = loc[1] - loc[3], loc[1], -1
    else:
        begin, finish, strand = loc[1] - 1, loc[1] + loc[3] - 1, 1
    return SeqFeature.FeatureLocation(begin, finish, strand, loc[0])
def create_genbank(gene_nucleic_seqs, gene_protein_seqs, annot,
                   go_namespaces, go_alternatives, output_path,
                   species_informations):
    """Write a GenBank file with one record per gene sequence.

    Every record receives a ``gene`` feature and a ``CDS`` feature built by
    ``create_cds_feature``; both carry the gene id as ``locus_tag``.

    Args:
        gene_nucleic_seqs (dict): nucleic sequences (key: sequence id, value: sequence)
        gene_protein_seqs (dict): protein sequences (key: sequence id, value: sequence)
        annot (dict): eggnog-mapper annotation (key: gene_id, value: ['GOs','EC', 'Preferred_name'])
        go_namespaces (dict): GO term namespaces (key: GO Term ID, value: namespace)
        go_alternatives (dict): alternative GO term IDs (key: GO Term ID, value: alternatives)
        output_path (str): output file or directory
        species_informations (dict): information about the species
    """
    # Records are collected first and handed to the SeqIO writer at the end.
    records = []

    for gene_nucleic_id in sorted(gene_nucleic_seqs):
        nucleic_seq = gene_nucleic_seqs[gene_nucleic_id]
        record = record_info(gene_nucleic_id, nucleic_seq,
                             species_informations)

        # Derive the locus tag: purely numeric ids get a prefix, ids with
        # "|" keep only their second field.
        if gene_nucleic_id.isnumeric():
            id_gene = f"gene_{gene_nucleic_id}"
        elif "|" in gene_nucleic_id:
            id_gene = gene_nucleic_id.split("|")[1]
        else:
            id_gene = gene_nucleic_id

        # NOTE(review): start_position is 1 although FeatureLocation takes a
        # 0-based start; confirm how create_cds_feature interprets it.
        start_position, end_position, strand = 1, len(nucleic_seq), 0

        gene_location = sf.FeatureLocation(start_position, end_position,
                                           strand)
        gene_feature = sf.SeqFeature(gene_location, type="gene")
        gene_feature.qualifiers['locus_tag'] = id_gene
        record.features.append(gene_feature)

        cds_feature = create_cds_feature(id_gene, start_position,
                                         end_position, strand, annot,
                                         go_namespaces, go_alternatives,
                                         gene_protein_seqs)
        cds_feature.qualifiers['locus_tag'] = id_gene
        record.features.append(cds_feature)

        records.append(record)

    SeqIO.write(records, output_path, 'genbank')
def test_GenerateFeatLoc__make_start_fuzzy__3(self):
    ''' Check that `GenerateFeatLoc.make_end_fuzzy` returns a
        FeatureLocation whose end has become an AfterPosition when given an
        exact 5..9 location. '''
    from Bio import SeqFeature
    location_object = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(5), SeqFeature.ExactPosition(9))
    generator = GnOps.GenerateFeatLoc()
    out = generator.make_end_fuzzy(location_object)
    # FeatureLocation
    self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
    # Fuzzy End
    self.assertIsInstance(out.end, Bio.SeqFeature.AfterPosition)
def _get_feature(self, feature_dict):
    """Rebuild a Biopython feature from its dictionary form."""
    location_args = feature_dict["location"]
    feature = SeqFeature.SeqFeature(
        SeqFeature.FeatureLocation(*location_args),
        feature_dict["type"],
        id=feature_dict["id"],
        strand=feature_dict["strand"],
    )
    feature.qualifiers = feature_dict["quals"]
    return feature
def convert_annotations(block, gb, start):
    """Append one SeqFeature to ``gb.features`` per annotation of ``block``.

    Does nothing when the block has no ``sequence``. Annotation keys other
    than the reserved ones become qualifiers, as do the entries of
    ``notes["genbank"]`` (``type`` overrides the feature type, ``note`` is
    folded into the JSON note). The ``note`` qualifier holds a JSON
    description with double quotes replaced by single quotes.

    Args:
        block: Block dict; annotations live in ``block["sequence"]["annotations"]``.
        gb: Object whose ``features`` list receives the new features.
        start: Offset of the block, added to annotation coordinates.
    """
    if "sequence" not in block:
        return

    reserved = [
        "start", "end", "notes", "strand", "color", "role", "isForward"
    ]
    # Add My annotations as features
    for annot in block["sequence"]["annotations"]:
        feat = SeqFeature.SeqFeature()
        feat_type = "misc_feature"
        if "role" in annot and annot["role"] != "":
            feat_type = annot["role"]

        for name, content in annot.iteritems():
            if name == "notes":
                if "genbank" in annot["notes"]:
                    genbank_notes = annot["notes"]["genbank"]
                    for gb_name, gb_content in genbank_notes.iteritems():
                        if gb_name == "type":
                            feat_type = gb_content
                        elif gb_name != "note":
                            feat.qualifiers[gb_name] = gb_content
            elif name not in reserved:
                feat.qualifiers[name] = content

        details = {
            "name": annot["name"],
            "type": "annotation",
            "parents": [block["id"]]
        }
        if "color" in annot:
            details["color"] = annot["color"]
        gc_info = {"GC": details}
        if ("notes" in annot and "genbank" in annot["notes"]
                and "note" in annot["notes"]["genbank"]):
            gc_info["note"] = annot["notes"]["genbank"]["note"]
        feat.qualifiers["note"] = json.dumps(gc_info).replace("\"", "'")

        if "start" in annot:
            reverse = "isForward" in annot and annot["isForward"] == -1
            # Remember: annotations start and end are relative to the block
            feat.location = SeqFeature.FeatureLocation(
                annot["start"] + start, annot["end"] + start + 1,
                -1 if reverse else 1)

        feat.type = feat_type
        gb.features.append(feat)
def add_feature(sequence_rec, start_postion, end_position, strand, name,
                feature_type, feature_id):
    """Build a labelled SeqFeature and return it.

    The feature is only returned; ``sequence_rec`` is not modified here.
    The start is shifted by -1 before it is stored.

    Args:
        sequence_rec: Unused by this function.
        start_postion: Start coordinate; ``start_postion - 1`` is stored.
        end_position: End coordinate, stored unchanged.
        strand: Strand of the location.
        name: Stored as the ``label`` qualifier.
        feature_type: Feature type.
        feature_id: Feature id.
    """
    span = SeqFeature.FeatureLocation(start_postion - 1,
                                      end_position,
                                      strand=strand)
    labelled = SeqFeature.SeqFeature(span, type=feature_type, id=feature_id)
    labelled.qualifiers["label"] = name
    return labelled
def to_seq_feature(self):
    """Return this object as a Biopython SeqFeature.

    Qualifiers come from ``self.qualifiers.all()`` (``name`` -> ``data``);
    direction 'f' maps to strand 1, 'r' to -1, anything else to None.
    """
    quals = {q.name: q.data for q in self.qualifiers.all()}
    direction = self.direction
    s = 1 if direction == 'f' else (-1 if direction == 'r' else None)
    span = SeqFeature.FeatureLocation(self.start, self.end)
    return SeqFeature.SeqFeature(location=span,
                                 type=self.type,
                                 strand=s,
                                 qualifiers=quals)
def make_feature(product, blast_qresult, fragment, hit, hsp, fragstart, count):
    """Build a "DOMAIN" feature for one HSP and advance the serial counter.

    For product 'YR' the hit coordinates are shifted by ``int(fragstart)``;
    otherwise they are used as reported. ``contig`` is read from the
    enclosing/module scope.

    Returns:
        Tuple ``(feature, count + 1)``.
    """
    if product == 'YR':
        shift = int(fragstart)
        s = hsp.hit_start - shift
        e = hsp.hit_end - shift
    else:
        s, e = hsp.hit_start, hsp.hit_end

    feature = SeqFeature(FeatureLocation(s, e),
                         type="DOMAIN",
                         strand=hsp.hit_strand)
    quals = feature.qualifiers
    quals['loc_on_contig'] = str(hsp.hit_start + 1) + '..' + str(hsp.hit_end)
    quals['product'] = product
    quals['serial_on_frag'] = count
    quals['program'] = blast_qresult.program + "_" + blast_qresult.version
    quals['evalue'] = hsp.evalue
    quals['assembly'] = blast_qresult.target.split('/')[-1]
    quals['contig'] = contig
    quals['translation'] = feature.extract(fragment.seq).translate()
    return (feature, count + 1)
def writePBS():
    """Annotate primer matches on both strands of ``record``.

    For every variation stored under ``feature`` in
    ``featureStatistic_container``, the last 15 characters of its sequence
    are searched in the record and in its reverse complement, and a feature
    is appended to ``newRecord`` for each hit. Reads ``feature``,
    ``record``, ``newRecord`` and ``featureStatistic_container`` from the
    enclosing/module scope and rebinds the globals declared below.
    """
    global variation, seqRecordToCheck, seqRecordToCheckComplement, difference, newFeature
    for variation in featureStatistic_container[feature]:
        primerSeq = str(variation.seq)
        primerName = variation.note
        # Only the last 15 characters of the primer are used for searching.
        partialPrimerSeq = primerSeq[len(primerSeq) - 15::]
        seqRecordToCheck = str(record.seq)
        seqRecordToCheckComplement = str(reverse_complement(record.seq))
        # Forward strand.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheck, partialPrimerSeq)
        # Index 0 is skipped - presumably because nt_search returns the
        # search pattern as its first element; confirm against Bio.SeqUtils.
        if (len(matchingPrimerPositions) > 1):
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                # Full primer present around the hit: exact end position;
                # otherwise the end is marked fuzzy with AfterPosition.
                if primerSeq == seqRecordToCheck[matchingPrimerPositions[j] - difference: matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], matchingPrimerPositions[j] + len(primerSeq), strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
        # Same procedure on the reverse complement, recorded with strand -1.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheckComplement, partialPrimerSeq)
        if (len(matchingPrimerPositions) > 1):
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                if primerSeq == seqRecordToCheckComplement[matchingPrimerPositions[j] - difference: matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], matchingPrimerPositions[j] + len(primerSeq), strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
def ins_insert(self,vec_5_site,utr_5_seq,ins_seq,utr_3_seq,vec_3_site,ins_name):
    """Replace ``self.record[vec_5_site:vec_3_site]`` by 5'UTR + insert + 3'UTR.

    The insert is annotated with an "insert" feature noted ``ins_name``,
    and an "mcs" feature is added that starts at the old MCS start and whose
    end accounts for the length change. Record name and ``self.whole_len``
    are refreshed afterwards.
    """
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    payload = utr_5_seq + ins_seq + utr_3_seq
    ins_record = SeqRecord(Seq(payload, IUPAC.ambiguous_dna))
    insert_begin = len(utr_5_seq)
    f_i = SeqFeature(
        FeatureLocation(insert_begin, insert_begin + len(ins_seq)),
        type="insert")
    f_i.qualifiers["note"] = [ins_name,]
    ins_record.features = [f_i]

    old_name = self.record.name

    # Remember bounds and qualifiers of the feature whose first note starts
    # with "mcs" (case-insensitive); the last such feature wins.
    for feature in self.record.features:
        if not feature.qualifiers.has_key("note"):
            continue
        if re.search(r"^mcs", feature.qualifiers["note"][0], re.I):
            mcs_start = int(str(feature.location.start))
            mcs_end = int(str(feature.location.end))
            mcs_qualifiers = feature.qualifiers

    upstream = self.record[:vec_5_site]
    downstream = self.record[vec_3_site:]
    self.record = upstream + ins_record + downstream

    new_mcs_end = vec_5_site + len(payload) + mcs_end - vec_3_site
    f_mcs = SeqFeature(FeatureLocation(mcs_start, new_mcs_end), type="mcs")
    f_mcs.qualifiers = mcs_qualifiers
    self.record.features.append(f_mcs)
    # Keep the feature list ordered by start coordinate.
    self.record.features = sorted(
        self.record.features, key=lambda x: int(str(x.location.start)))
    self.record.name = old_name
    self.whole_len = len(self.record)
def writeFeature(strand):
    """Append one feature per search hit in ``occurrence`` to ``newRecord``.

    ``occurrence`` is read from module scope; its first element is skipped
    and every following element is used as a feature start.  Each feature is
    ``len(featureSeq)`` long, lies on ``strand``, has type ``str(feature)``
    and copies the non-None ``product``, ``gene``, ``bound_moiety``,
    ``mobile`` and ``note`` attributes of the global ``variation`` into its
    qualifiers.  The global ``newFeature`` is rebound to each created feature.
    """
    global newFeature

    # Attributes of ``variation`` mirrored into same-named qualifiers.
    qualifierNames = ('product', 'gene', 'bound_moiety', 'mobile', 'note')

    for hitStart in occurrence[1:]:
        location = FeatureLocation(
            hitStart, hitStart + len(featureSeq), strand=strand)
        newFeature = SeqFeature(location, type=str(feature))
        for qualifierName in qualifierNames:
            qualifierValue = getattr(variation, qualifierName)
            if qualifierValue is not None:
                newFeature.qualifiers[qualifierName] = qualifierValue
        newRecord.features.append(newFeature)
# .WithinPosition: position between two nucleotides '(1.5)' in this way: position 1 is lower boundary, extension 4 is range to higher boundary # .OneOfPosition: any of a list of several numbers # .UnknownPosition: position of unknown location. from Bio import SeqFeature start_pos = SeqFeature.AfterPosition(5) end_pos = SeqFeature.BetweenPosition(9, left=8, right=9) mylocation = SeqFeature.FeatureLocation(start_pos, end_pos) print mylocation, mylocation.start, mylocation.end, int(mylocation.end) for feature in record.features: if 4350 in feature: # if position 4350 is in any feature print feature.type, feature.qualifiers.get('db_xref') from Bio.SeqFeature import SeqFeature, FeatureLocation seqParent = Seq('ACCGAGACGGCAAAGGCTAGCATAGGTATGAGACTTCCTTCCTGCCAGTGCTGAGGAACTGGGAGCCTAC') featu = SeqFeature(FeatureLocation(5, 18), type='gene', strand=-1) # location [5:18] in reverse_complement print featu featureSeq = seqParent[featu.location.start:featu.location.end].reverse_complement() print featureSeq print featu.extract(seqParent), len(featu.extract(seqParent)), len(featu), len(featu.location) # extract gets the subseq in location featu from seqParent # References publications that mention it # Bio.SeqFeature.Reference # journal: book, magazine, journal name # title, authors: of the paper # medline_id, pubmed_id: ID en Medline y PubMed # comment: about the reference # location: to specify location in the sequence mentioned in the paper # format: method to output as fasta or genbank formatted seq
if filename == inverted_repeats_file: direction = 'inverted' lines = open(filename, 'r').readlines() readl = 0 for line in lines: if line[0:9] == 'FASTA_HDR': #Get fragment name parts = re.split(r'\s+',line) fragment = ':'.join(parts[1].split(':')[0:-1]) if not fragment in repeat_features.keys(): repeat_features[fragment] = [] elif line[0:8] == 'FEATURES': #Start to read feature lines readl = 1 elif line[0:6] == 'ORIGIN': #Stop reading feature lines and ... readl = 0 if not end_a == 0: #If there were features, put the last one in repeat_features feature = SeqFeature(FeatureLocation(int(start_a)-1, int(end_a)), type="REPEAT", id = direction + '_' + str(repeat_name) + '.1') #Make a SeqFeature object for the first repeat partenr for mod in modifyers.keys(): feature.qualifiers[mod] = modifyers[mod] #add the qualifiers to the SeqFeature object feature.qualifiers['name'] = direction + '_' + str(repeat_name) + '.1' repeat_features[fragment].append(feature) feature = SeqFeature(FeatureLocation(int(start_b)-1, int(end_b)), type="REPEAT", id = direction + '_' + str(repeat_name) + '.2') #Make a SeqFeature object for the second repeat partenr for mod in modifyers.keys(): feature.qualifiers[mod] = modifyers[mod] #add the qualifiers to the SeqFeature object feature.qualifiers['name'] = direction + '_' + str(repeat_name) + '.2' #print ' Got repeat ' + feature.qualifiers['name'] + ' on fragment ' + fragment repeat_features[fragment].append(feature) repeat_name = 1 #Roll back all parameters to null for the next fragment start_a = 0 end_a = 0 start_b = 0 end_b = 0