def _extract_regions(gff_iterator): """Function added by KC Jan 2020. This Extracts regions from the first annotated position to the last annotated position, and updates the locations to correspond to the location in the sequence. """ for rec in gff_iterator: pos = [] loc = min([i.location.start for i in rec.features]) endloc = max([i.location.end for i in rec.features]) for i in range(len(rec.features)): pos += range(int(rec.features[i].location.start), int(rec.features[i].location.end)) rec.features[i].location = SeqFeature.FeatureLocation( SeqFeature.ExactPosition(rec.features[i].location.start - loc), SeqFeature.ExactPosition(rec.features[i].location.end - loc), strand=rec.features[i].strand) for j in range(len(rec.features[i].sub_features)): rec.features[i].sub_features[ j].location = SeqFeature.FeatureLocation( SeqFeature.ExactPosition( rec.features[i].sub_features[j].location.start - loc), SeqFeature.ExactPosition( rec.features[i].sub_features[j].location.end - loc), strand=rec.features[i].sub_features[j].strand) rec.seq = rec.seq[loc:endloc] yield rec
def translateFeatureLocation(location, region, translation=0):
    """Shift *location* by ``translation + 1`` and clamp it to *region*.

    Args:
        location: location object supporting ``+`` with an integer and
            exposing ``start``, ``end`` and ``strand``.
        region: object whose ``end`` bounds the translated location.
        translation: offset added to the location before the extra ``+ 1``.

    Returns:
        A ``FeatureLocation``. When the shifted end is negative the result
        is the empty location ``0..0`` with strand 0 (and a debug message
        is logged); otherwise the start is floored at 0 and the end is
        capped at ``region.end``.
    """
    shifted = location + translation + 1

    if shifted.end < 0:
        logging.debug('Error-prone feature detected: {}'.format(shifted))
        return SeqFeature.FeatureLocation(start=0, end=0, strand=0)

    clamped_start = max(0, shifted.start)
    clamped_end = min(shifted.end, region.end)
    return SeqFeature.FeatureLocation(start=clamped_start,
                                      end=clamped_end,
                                      strand=shifted.strand)
def _trans_loc(loc):
    """Convert a ``[contig, position, orientation, length]`` list to a location.

    ``loc`` is modified in place: its first element is replaced by ``None``
    when it equals ``current_contig_id`` (a name taken from the enclosing
    scope).
    """
    # Don't write the contig ID in the loc line unless it's trans-spliced
    if loc[0] == current_contig_id:
        loc[0] = None

    if loc[2] != "-":
        begin = loc[1] - 1
        finish = loc[1] + loc[3] - 1
        return SeqFeature.FeatureLocation(begin, finish, 1, loc[0])

    begin = loc[1] - loc[3]
    return SeqFeature.FeatureLocation(begin, loc[1], -1, loc[0])
def ss_extract(genbank_in):
    """
    Extract the 5' and 3' splice-site (ss) sequences of the first mRNA.

    Takes an input GenBank file name (genbank_in); appends a summary of the
    selected feature to the file 'seq_log' in the current directory;
    returns the 5' and 3' ss sequences (f_ss_seq, t_ss_seq) as two lists
    of strings.
    """
    seq_record = SeqIO.read(genbank_in, 'genbank')
    sequence = seq_record.seq

    f_loc = []
    t_loc = []

    # Take the part boundaries of the first mRNA feature: part ends go to
    # the 5' list, part starts to the 3' list ('+' strand convention).
    # NOTE(review): ``feature`` is deliberately left bound after the loop,
    # because the log below reads it; with an empty feature table that
    # raises NameError, and with no mRNA it logs the last feature seen.
    for feature in seq_record.features:
        if feature.type != 'mRNA':
            continue
        exon_parts = feature.location.parts
        t_loc = [int(part.start) for part in exon_parts]
        f_loc = [int(part.end) for part in exon_parts]
        break

    # Adding data to log file
    log_entry = (
        genbank_in.split('.')[0],                              # Gene name
        ' ' + feature.type + ' ',                              # Selected feature type
        str(feature.qualifiers.get("db_xref")) + '\n',         # GeneID and DBref feature
        '5` intron end location:' + str(f_loc) + '\n',         # 5' intron ends location
        '3` intron end location:' + str(t_loc) + '\n \n',      # 3' intron ends location
    )
    with open('seq_log', 'a') as file:
        for chunk in log_entry:
            file.write(chunk)

    # 5' ss extraction: window from 3 before to 6 after each part end
    f_list = [SeqFeature.FeatureLocation(n - 3, n + 6)
              for n in f_loc
              if 15 < n < len(sequence)]
    f_ss_seq = [str(window.extract(sequence)) for window in f_list]

    # 3' ss extraction: window from 20 before to 3 after each part start
    # NOTE(review): positions 16-19 pass the ``n > 15`` filter but give a
    # negative window start - confirm whether the threshold should be 20.
    t_list = [SeqFeature.FeatureLocation(n - 20, n + 3)
              for n in t_loc
              if n > 15]
    t_ss_seq = [str(window.extract(sequence)) for window in t_list]

    return f_ss_seq, t_ss_seq
def make_start_fuzzy(self, location_object):
    ''' This function makes the start position of location objects fuzzy.

    Objects without a `parts` attribute are returned unchanged. A
    single-part location is replaced by a new FeatureLocation whose start
    is a BeforePosition; for a multi-part location the first part is
    replaced in place. The strand of the rebuilt (part) location is carried
    over, so it is no longer reset to None.
    '''
    if not hasattr(location_object, 'parts'):
        return location_object

    # Imported only when a location actually has to be rebuilt.
    from Bio import SeqFeature

    n_parts = len(location_object.parts)
    if n_parts == 1:
        new_start_pos = SeqFeature.BeforePosition(location_object.start)
        return SeqFeature.FeatureLocation(new_start_pos,
                                          location_object.end,
                                          strand=location_object.strand)
    if n_parts > 1:
        first_part = location_object.parts[0]
        new_start_pos = SeqFeature.BeforePosition(first_part.start)
        location_object.parts[0] = SeqFeature.FeatureLocation(
            new_start_pos, first_part.end, strand=first_part.strand)
    return location_object
def _append_annotation(my_plasmid, annotation, start, length, strand):
    """Append one feature of *length* bases at *start* to the plasmid."""
    new_feature = sf.SeqFeature(
        sf.FeatureLocation(start, start + length, strand=strand),
        type=annotation['type'])
    # The annotation dict itself becomes the qualifiers (it is not copied).
    new_feature.qualifiers = annotation
    my_plasmid.features.append(new_feature)


def annotate(my_plasmid, annot_dict):
    """
    This function takes a plasmid map (.gb) and a dictionary of features
    (such as created by gb_annot_collector) and adds these features to the
    plasmid if they are found.

    Each key of annot_dict is a sequence to look for; its value is a dict
    providing at least 'label', 'type' and 'strand'. The sequence is
    searched as given first and, if absent, as its reverse complement (the
    strand is then negated). Only the first occurrence is annotated.
    Progress is reported with print().

    Returns the same my_plasmid object, with the new features appended.
    """
    for key, annotation in annot_dict.items():
        name = str(annotation['label'])

        # Search once and reuse the hit position.
        start = my_plasmid.seq.find(key)
        if start > -1:
            _append_annotation(my_plasmid, annotation, start, len(key),
                               annotation['strand'])
            print('found ' + name)
            continue

        # Not on the given strand: try the reverse complement.
        print(name + ' not found, trying complementary')
        comp = Seq(key, generic_dna).reverse_complement()
        start = my_plasmid.seq.find(comp)
        if start > -1:
            _append_annotation(my_plasmid, annotation, start, len(comp),
                               -1 * annotation['strand'])
            print('found ' + name + ' in reverse complementary')
        else:
            # Not found on either strand; let the user know about it.
            print(name + ' not found')
    return my_plasmid
def convert_annotations(block, gb, start=0):
    """Add the annotations of *block* to *gb* as features.

    Args:
        block: dict whose ``block["sequence"]["annotations"]`` is a list of
            annotation dicts.
        gb: record-like object; one feature per annotation is appended to
            ``gb.features``.
        start: offset added to each annotation's ``start``/``end``.
            Defaults to 0, which keeps the coordinates as given.

    Keys other than start/end/notes/strand/color/role are copied into the
    qualifiers; ``color`` is stored as ``GC_Color``; entries under
    ``notes["genbank"]`` are copied as qualifiers, except ``type``, which
    overrides the feature type. The location end is ``end + 1``.
    """
    reserved_keys = ["start", "end", "notes", "strand", "color", "role"]

    # Add My annotations as features
    for annotation in block["sequence"]["annotations"]:
        gb_annot = SeqFeature.SeqFeature()
        annotation_type = "unknown"

        if "role" in annotation and annotation["role"] != "":
            annotation_type = annotation["role"]

        # ``items()`` (rather than ``iteritems()``) works on Python 2 and 3.
        for key, value in annotation.items():
            if key not in reserved_keys:
                gb_annot.qualifiers[key] = value
            elif key == "color":
                gb_annot.qualifiers["GC_Color"] = value
            elif key == "notes":
                for notes_key, notes_value in annotation["notes"].items():
                    if notes_key != "genbank":
                        continue
                    for gb_key, gb_value in notes_value.items():
                        if gb_key == "type":
                            annotation_type = gb_value
                        else:
                            gb_annot.qualifiers[gb_key] = gb_value

        if "start" in annotation:
            strand = 1
            if "strand" in annotation and annotation["strand"] == -1:
                strand = -1
            gb_annot.location = SeqFeature.FeatureLocation(
                annotation["start"] + start,
                annotation["end"] + start + 1,
                strand)

        gb_annot.type = annotation_type
        gb.features.append(gb_annot)
def create_feature_annot(loc_range, featuretype, s):
    """
    Create a new feature annotation at loc_range with featuretype on
    strand s.

    loc_range[0] and loc_range[1] become the exact start and end positions.
    The strand is stored on the location rather than passed to the
    SeqFeature constructor: the feature-level ``strand`` attribute was only
    a proxy for the location's strand, and the constructor keyword is not
    accepted by recent Biopython releases.
    """
    location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(loc_range[0]),
        SeqFeature.ExactPosition(loc_range[1]),
        strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
def extract_location_on_gene(self):
    """
    Derive the gene location from the single CDS feature's 'coded_by'
    qualifier.

    The qualifier is matched against ``self.__location_format``; the named
    groups 'accession', 'complement', 'start' and 'end' are read from the
    match. If ``self._accession_no`` is still None it is filled in from the
    match as a side effect.

    :return: the location, with strand ``self.COMPLEMENT`` when the
        'complement' group matched and None otherwise; or None when there
        is no record, not exactly one CDS feature, not exactly one
        'coded_by' value, or the value does not match.
    :rtype: SeqFeature.FeatureLocation
    """
    if self.__genpept is None:
        return None

    # A plain list is enough here; it avoids requiring hashable features.
    cds_features = [feature for feature in self.__genpept.features
                    if feature.type == 'CDS']
    if len(cds_features) != 1:
        return None

    coding_regions = cds_features[0].qualifiers.get('coded_by', [])
    if len(coding_regions) != 1:
        return None

    res = self.__location_format.search(coding_regions[0])
    if res is None:
        return None

    if self._accession_no is None:
        self._accession_no = res.group("accession")

    strand = self.COMPLEMENT if res.group("complement") else None
    # NOTE(review): start/end are used exactly as written in 'coded_by' -
    # confirm whether callers expect 0-based coordinates here.
    start = int(res.group("start"))
    end = int(res.group("end"))
    return SeqFeature.FeatureLocation(start, end, strand)
def create_feature(hit, end):
    '''
    Creates a feature from the hit, setting colours based on orientation
    and noting whether it is a left or right end hit.

    hit[1] and hit[2] are converted to int and used as start and stop.
    end must be 'five' (left end) or 'three' (right end); any other value
    raises ValueError instead of failing later on an unassigned variable.
    '''
    # Annotate with end information and colour accordingly
    # NOTE(review): the qualifier/type spellings mix 'left_end'/'left end'
    # and 'right end'/'right_end'; they are kept exactly as they were,
    # since other code may match on these strings.
    quals = {}
    if end == 'five':
        quals['colour'] = '2'
        quals['end'] = 'left_end'
        feat_type = 'left end'
    elif end == 'three':
        quals['colour'] = '7'
        quals['end'] = 'right end'
        feat_type = 'right_end'
    else:
        raise ValueError(
            "end must be 'five' or 'three', got {!r}".format(end))

    # Set up coordinates
    start = int(hit[1])
    stop = int(hit[2])
    location = SeqFeature.FeatureLocation(start, stop)

    # Create feature
    return SeqFeature.SeqFeature(location, type=feat_type, qualifiers=quals)
def modify_genbank(gb_file, fasta_file):
    """Rewrite gene/CDS starts of a GenBank file from the final annotations.

    Args:
        gb_file: path of the GenBank file; it must look like
            ``.../users/.../uploads/<name>.<ext>``.
        fasta_file: FASTA file whose sequence is handed to
            ``get_final_annotations``.

    Returns:
        Path of the written file, ``<name>_modified.<ext>`` next to the
        input.

    Raises:
        ValueError: if ``gb_file`` does not match the expected path layout
            or contains no records.

    Gene and CDS features whose locus tag is missing from the final
    annotations are dropped; for the others the start becomes
    ``start - 1`` of the annotation, and CDS features also get their
    product and translation replaced. All other features are kept.
    """
    # The extension dot is escaped: unescaped it matched any character,
    # which produced names such as "file.g_modified." for "file.gb".
    gb_filename = re.search(r'(.*/users/.*/uploads/.*)\.(\w*)', gb_file)
    if gb_filename is None:
        raise ValueError(
            "unexpected GenBank file path: {!r}".format(gb_file))
    out_file = gb_filename.group(1) + '_modified.' + gb_filename.group(2)

    genome = SeqIO.read(fasta_file, "fasta").seq
    final_annotations = get_final_annotations(genome)

    # NOTE(review): this list is shared by every record in the file and
    # only the last record is written - confirm single-record input.
    final_features = []
    record = None
    with open(gb_file, "r") as gb_handle:
        for record in SeqIO.parse(gb_handle, "genbank"):
            for feature in record.features:
                if feature.type in ("gene", "CDS"):
                    locus_tag = feature.qualifiers["locus_tag"][0]
                    if locus_tag not in final_annotations:
                        continue
                    annotation = final_annotations[locus_tag]
                    feature.location = SeqFeature.FeatureLocation(
                        SeqFeature.ExactPosition(annotation["start"] - 1),
                        SeqFeature.ExactPosition(
                            feature.location.end.position),
                        feature.location.strand)
                    if feature.type == "CDS":
                        feature.qualifiers["product"][0] = \
                            annotation["function"]
                        feature.qualifiers["translation"][0] = \
                            annotation["translation"]
                final_features.append(feature)
            # Append final features
            record.features = final_features

    if record is None:
        raise ValueError(
            "no GenBank records found in {!r}".format(gb_file))

    with open(out_file, "w") as new_gb:
        SeqIO.write(record, new_gb, "genbank")
    return out_file
def add_feature(self, search_str, label):
    """Label every match of a search string, failing silently.

    Does not label features that wrap around circular sequence. Forward
    matches get strand 1; unless the search string equals its own reverse
    complement (palindrome), matches of the reverse complement are added
    with strand -1. Every match, not only the first, receives its strand.

    NOTE(review): ``search_str`` is handed to ``re.finditer`` unescaped,
    so regex metacharacters in it are interpreted - confirm this is
    intended for a "literal" match.

    Args:
        search_str (str): string representing feature
        label (str): feature name to display
    """
    sequence = str(self.seq)

    searches = [(search_str, 1)]
    if search_str.lower() != dna.revc(search_str.lower()):
        # Don't label palindromes twice
        searches.append((dna.revc(search_str), -1))

    for pattern, strand in searches:
        for match in re.finditer(pattern, sequence, re.IGNORECASE):
            feature = SeqFeature.SeqFeature(
                SeqFeature.FeatureLocation(match.start(), match.end(),
                                           strand),
                'misc_feature')
            # next() works for both Python 2 and Python 3 iterators.
            color = next(self.colors)
            feature.qualifiers = {
                'label': [label],
                'ApEinfo_fwdcolor': [color],
                'ApEinfo_revcolor': [color],
                'ApEinfo_graphicformat':
                    ['arrow_data {{0 1 2 0 0 -1} {} 0} width 5 offset 0']
            }
            self.features.append(feature)
    return
def createFEATUREannot(loc_range, featuretype, s):
    """ Creates a new SeqFeature with ExactPositions based on range.

    loc_range[0] and loc_range[1] are the start and end, featuretype the
    feature type and s the strand. The strand is set on the location
    instead of being passed to the SeqFeature constructor, whose
    ``strand`` keyword is not accepted by recent Biopython releases; the
    resulting feature is the same.
    """
    begin = SeqFeature.ExactPosition(loc_range[0])
    finish = SeqFeature.ExactPosition(loc_range[1])
    location = SeqFeature.FeatureLocation(begin, finish, strand=s)
    new_feature = SeqFeature.SeqFeature(location, type=featuretype)
    return new_feature
def read_reference(fname, genemap):
    """Load a reference sequence and translate the regions of a gene map.

    Args:
        fname: reference file. It is read as FASTA; if that fails for any
            reason the file is read as plain text with all lines stripped
            and joined.
        genemap: tab-separated annotation file. Columns 4 and 5 give the
            1-based start and end, column 7 the strand ('-' means reverse)
            and column 9 ';'-separated ``key value`` attributes. Lines
            starting with '#' and blank lines are skipped.

    Returns:
        ``{"nuc": <reference string>, "translations": {name: protein}}``
        where ``name`` is the ``gene_name`` attribute without double
        quotes, or None when the attribute is absent.
    """
    try:
        ref = str(SeqIO.read(fname, 'fasta').seq)
    except Exception:
        # Deliberate best-effort fallback: treat the file as raw sequence.
        with open(fname, 'r') as fh:
            ref = "".join([x.strip() for x in fh])

    translations = {}
    with open(genemap, 'r') as fh:
        for line in fh:
            if line[0] == '#' or not line.strip():
                continue
            entries = [x.strip() for x in line.strip().split('\t')]
            start = int(entries[3])
            end = int(entries[4])
            strand = entries[6]

            attributes = {}
            for chunk in entries[8].split(';'):
                fields = chunk.split()
                if not fields:
                    # An empty chunk, e.g. after a trailing ';'.
                    continue
                attributes[fields[0]] = ' '.join(fields[1:])

            if 'gene_name' in attributes:
                name = attributes['gene_name'].strip('"')
            else:
                name = None

            location = SeqFeature.FeatureLocation(
                start - 1, end, strand=-1 if strand == '-' else 1)
            translation = Seq.translate(
                SeqFeature.SeqFeature(location).extract(ref))
            translations[name] = str(translation)

    return {"nuc": ref, "translations": translations}
def _parse_feature(element):
    """Build a SeqFeature from a feature element and store it.

    The element's attributes become qualifiers; ``type`` and ``id`` are
    also copied onto the feature. A child ``location`` element provides
    the location, either from a single ``position`` or from ``begin`` and
    ``end``; the text of every other child becomes a qualifier named after
    its tag without the namespace. The feature is appended to
    ``self.ParsedSeqRecord.features`` (``self`` comes from the enclosing
    scope).
    """
    feature = SeqFeature.SeqFeature()
    for k, v in element.attrib.items():
        feature.qualifiers[k] = v
    feature.type = element.attrib.get('type', '')
    if 'id' in element.attrib:
        feature.id = element.attrib['id']

    # Iterate the element directly: Element.getchildren() no longer exists
    # in current Python versions.
    for feature_element in element:
        if feature_element.tag == NS + 'location':
            position_elements = feature_element.findall(NS + 'position')
            if position_elements:
                position = position_elements[0]
                start_position = _parse_position(position, -1)
                end_position = _parse_position(position)
            else:
                begin = feature_element.findall(NS + 'begin')[0]
                start_position = _parse_position(begin, -1)
                end = feature_element.findall(NS + 'end')[0]
                end_position = _parse_position(end)
            feature.location = SeqFeature.FeatureLocation(
                start_position, end_position)
        else:
            try:
                feature.qualifiers[feature_element.tag.replace(
                    NS, '')] = feature_element.text
            except Exception:
                pass  # skip unparsable tag
    self.ParsedSeqRecord.features.append(feature)
def _retrieve_reference(adaptor, primary_id): # XXX dbxref_qualifier_value refs = adaptor.execute_and_fetchall( "SELECT start_pos, end_pos, " " location, title, authors," " dbname, accession" " FROM bioentry_reference" " JOIN reference USING (reference_id)" " LEFT JOIN dbxref USING (dbxref_id)" " WHERE bioentry_id = %s" " ORDER BY rank", (primary_id,)) references = [] for start, end, location, title, authors, dbname, accession in refs: reference = SeqFeature.Reference() # If the start/end are missing, reference.location is an empty list if (start is not None) or (end is not None): if start is not None: start -= 1 # python counting reference.location = [SeqFeature.FeatureLocation(start, end)] # Don't replace the default "" with None. if authors: reference.authors = authors if title: reference.title = title reference.journal = location if dbname == 'PUBMED': reference.pubmed_id = accession elif dbname == 'MEDLINE': reference.medline_id = accession references.append(reference) if references: return {'references': references} else: return {}
def _parse_feature(element):
    """Turn a feature element into a SeqFeature and record it.

    Attributes are copied into the qualifiers (``type`` and ``id`` are
    also set on the feature), a ``location`` child supplies the location,
    and the text of any other child is stored as a qualifier keyed by its
    tag without the namespace. The result is appended to
    ``self.ParsedSeqRecord.features`` (``self`` comes from the enclosing
    scope).
    """
    attributes = element.attrib
    feature = SeqFeature.SeqFeature()
    for name, value in attributes.items():
        feature.qualifiers[name] = value
    feature.type = attributes.get("type", "")
    if "id" in attributes:
        feature.id = attributes["id"]

    for child in element:
        if child.tag != NS + "location":
            try:
                qualifier_name = child.tag.replace(NS, "")
                feature.qualifiers[qualifier_name] = child.text
            except Exception:  # TODO - Which exceptions?
                pass  # skip unparsable tag
            continue

        single_positions = child.findall(NS + "position")
        if single_positions:
            # One position element supplies both ends.
            node = single_positions[0]
            start_position = _parse_position(node, -1)
            end_position = _parse_position(node)
        else:
            start_position = _parse_position(
                child.findall(NS + "begin")[0], -1)
            end_position = _parse_position(child.findall(NS + "end")[0])
        feature.location = SeqFeature.FeatureLocation(
            start_position, end_position)

    self.ParsedSeqRecord.features.append(feature)
def _make_seqfeature(name, from_res, to_res, description, ft_id):
    """Build a SeqFeature from the parser's feature fields (PRIVATE).

    ``from_res`` and ``to_res`` are converted with ``_make_position``
    (offsets -1 and 0); ``name`` becomes the feature type and
    ``description`` the only qualifier. An empty ``ft_id`` is replaced by
    "<unknown id>".
    """
    start = _make_position(from_res, -1)
    end = _make_position(to_res, 0)
    location = SeqFeature.FeatureLocation(start, end)

    # The default in SeqFeature object
    feature_id = ft_id if ft_id else "<unknown id>"

    qualifiers = {"description": description}
    return SeqFeature.SeqFeature(location,
                                 type=name,
                                 id=feature_id,
                                 qualifiers=qualifiers)
def write_gbk(sequence, matched_orfs, filename, organism):
    """Write *sequence* and its ORFs to *filename* in GenBank format.

    Each entry of ``matched_orfs`` is indexed as
    ``(coord_a, coord_b, family, label)``. The smaller coordinate becomes
    the start, and entries with ``coord_a >= coord_b`` are put on strand
    -1. ORFs with both a family and a label become ``misc_feature``
    entries with a note; the rest become plain ``CDS`` features.
    """
    date = strftime("%d-%b-%Y").upper()
    annotations = {
        "accession": '.',
        "version": '.',
        "organism": '.',
        "date": date,
        "data_file_division": "BCT"
    }
    gbk_record = SeqRecord(Seq(sequence, IUPAC.unambiguous_dna),
                           description=organism + " predicted Tn3 sequence",
                           annotations=annotations)

    features = []
    for orf in matched_orfs:
        family, label = orf[2], orf[3]
        if orf[0] < orf[1]:
            # NOTE(review): forward ORFs get strand 0, not +1 - confirm.
            begin, finish, strand = orf[0], orf[1], 0
        else:
            begin, finish, strand = orf[1], orf[0], -1

        location = SeqFeature.FeatureLocation(begin - 1, finish,
                                              strand=strand)
        if family and label:
            feature = SeqFeature.SeqFeature(
                location,
                type="misc_feature",
                qualifiers={"note": label + " family " + family})
        else:
            feature = SeqFeature.SeqFeature(location, type="CDS")
        features.append(feature)

    gbk_record.features.extend(features)
    SeqIO.write(gbk_record, filename, "gb")
def _get_feature(self, feature_dict):
    """Retrieve a Biopython feature from our dictionary representation.

    ``feature_dict`` provides 'location' (arguments for FeatureLocation),
    'type', 'id', 'strand' and 'quals'. The strand is applied to the
    location instead of being passed to the SeqFeature constructor: the
    constructor keyword only ever set the location's strand and is not
    accepted by recent Biopython releases.
    """
    location = SeqFeature.FeatureLocation(*feature_dict['location'])
    strand = feature_dict['strand']
    if strand is not None:
        location.strand = strand
    new_feature = SeqFeature.SeqFeature(location, feature_dict['type'],
                                        id=feature_dict['id'])
    new_feature.qualifiers = feature_dict['quals']
    return new_feature
def create_feature(sequence, name, start, end, strand=+1):
    """Append a feature of type *name* spanning start..end to *sequence*.

    Nothing is added when ``str(name)`` is empty or when ``int(start)`` or
    ``int(end)`` is zero. Always returns None.

    NOTE(review): the zero check also rejects a feature starting at
    position 0 - confirm this is intended.
    """
    usable = str(name) and int(start) and int(end)
    if not usable:
        return

    location = SeqFeature.FeatureLocation(start, end, strand=strand)
    sequence.features.append(SeqFeature.SeqFeature(location, type=name))
def add_features(block, allblocks, gb, start):
    """Add *block*, its annotations and its children to *gb* as features.

    Args:
        block: block dict to convert.
        allblocks: list of all block dicts, searched by ``"id"``.
        gb: record-like object whose ``features`` list is extended.
        start: position at which this block begins.

    Returns:
        The position where this block ends. This is the end of its last
        child, or ``start`` plus the block's own sequence length when the
        children did not advance the position.

    Filler blocks add no feature and only advance the position. A block
    with ``"current_option"`` is replaced by the referenced option block.
    """
    # Disregard fillers... don't create features for them
    if is_filler(block):
        return start + block["sequence"]["length"]

    # For handling list blocks!
    if "current_option" in block:
        option = [b for b in allblocks
                  if b["id"] == block["current_option"]][0]
        return add_features(option, allblocks, gb, start)

    # Add Myself as a feature
    sf = SeqFeature.SeqFeature()

    # Set the type based on the original type or the role type
    if "genbank" in block["metadata"] and "type" in block["metadata"]["genbank"]:
        sf.type = block["metadata"]["genbank"]["type"]
    elif ("rules" in block and "role" in block["rules"]
            and block["rules"]["role"] is not None
            and block["rules"]["role"] != ""):
        sf.type = block["rules"]["role"]
    else:
        sf.type = "misc_feature"

    # Set up the location of the feature
    feature_strand = 1
    if "strand" in block["metadata"]:
        feature_strand = block["metadata"]["strand"]

    # And copy all the other qualifiers that came originally from genbank
    if "genbank" in block["metadata"]:
        # ``items()`` (rather than ``iteritems()``) works on Python 2 and 3.
        for annot_key, annot_value in block["metadata"]["genbank"].items():
            if annot_key not in ["name_source", "note"]:
                sf.qualifiers[annot_key] = annot_value

    convert_block_name(sf, block)
    add_GC_info(sf, block, allblocks)
    convert_annotations(block, gb, start)

    # Add my children as features
    child_start = start
    for block_id in block["components"]:
        bl = [b for b in allblocks if b["id"] == block_id][0]
        child_start = add_features(bl, allblocks, gb, child_start)

    if child_start != start:
        # The end is where the last child ended...
        end = child_start
    elif "sequence" in block:
        # No children, look at the block's length
        end = start + block["sequence"]["length"]
    else:
        end = start

    sf.location = SeqFeature.FeatureLocation(start, end,
                                             strand=feature_strand)
    gb.features.append(sf)
    return end
def write_gb(main_record_file, add="", destination=""):
    """Build the ePet-cre construct record and write it as GenBank and FASTA.

    Args:
        main_record_file: FASTA file with the main sequence ("~" is
            expanded).
        add: FASTA file whose sequence is merged onto the main one with
            ``concatenate_overlapping_sequences``.
        destination: output path without extension; ``.gb`` and
            ``.fastas`` are appended.

    The record gets a ``CDS`` feature (gene "Cre") covering the original
    main sequence and a ``PolyA`` feature for the 118 positions that
    follow it.
    """
    destination = expanduser(destination)
    main_record_file = expanduser(main_record_file)
    main_record = SeqIO.read(main_record_file, "fasta")
    add = expanduser(add)
    add = SeqIO.read(add, "fasta")

    main_record.seq.alphabet = IUPACAmbiguousDNA()
    cre_end = len(main_record.seq)
    # str() replaces Seq.tostring(), which newer Biopython no longer has.
    main_record.seq = Seq(
        concatenate_overlapping_sequences(str(main_record.seq),
                                          str(add.seq)),
        IUPACAmbiguousDNA())

    main_record.name = "ePet-cre"
    main_record.id = "ePet-cre"
    main_record.description = "ePet-cre construct from doi:10.1038/nn.2623"

    # (start, end, feature type, qualifier key, qualifier value)
    feature_specs = (
        (0, cre_end, "CDS", "gene", "Cre"),
        # Add SV40 PolyA
        (cre_end, cre_end + 118, "PolyA", "PolyA", "SV40-PolyA"),
    )
    for start, end, feature_type, qualifier, value in feature_specs:
        location = SeqFeature.FeatureLocation(
            SeqFeature.ExactPosition(start),
            SeqFeature.ExactPosition(end),
            strand=1)
        feature = SeqFeature.SeqFeature(location, type=feature_type)
        feature.qualifiers[qualifier] = value
        main_record.features.append(feature)

    SeqIO.write(main_record, destination + ".gb", "genbank")
    SeqIO.write(main_record, destination + ".fastas", "fasta")
def test_GenerateFeatLoc__make_start_fuzzy__1(self):
    ''' Test to evaluate function `make_start_fuzzy` of class
        `GenerateFeatLoc`. An exact 5..9 location is passed in; the result
        must still be a FeatureLocation and its start must have become a
        BeforePosition. '''
    from Bio import SeqFeature

    location_object = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(5),
        SeqFeature.ExactPosition(9))

    generator = GnOps.GenerateFeatLoc()
    out = generator.make_start_fuzzy(location_object)

    # FeatureLocation
    self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
    # Fuzzy Start
    self.assertIsInstance(out.start, Bio.SeqFeature.BeforePosition)
def contig_info(contig_id, contig_seq, species_informations):
    """
    Build the SeqRecord of a contig from its id, its sequence and the
    species_informations dictionary.

    'description' is required in species_informations; the other keys
    handled below are copied only when present. The record gets one
    "source" feature covering the whole contig, and that feature carries
    the source qualifiers.
    """

    def copy_present(keys, target):
        # Copy the listed species_informations entries that exist.
        for key in keys:
            if key in species_informations:
                target[key] = species_informations[key]

    record = SeqRecord(contig_seq,
                       id=contig_id,
                       name=contig_id,
                       description=species_informations['description'],
                       annotations={"molecule_type": "DNA"})
    if IUPAC:
        record.seq.alphabet = IUPAC.ambiguous_dna

    annotations = record.annotations
    copy_present(('data_file_division',), annotations)
    annotations['date'] = datetime.date.today().strftime('%d-%b-%Y').upper()
    copy_present(('topology',), annotations)
    annotations['accessions'] = contig_id
    # taxonomy and keywords are copied as provided.
    copy_present(('organism', 'taxonomy', 'keywords', 'source'), annotations)

    whole_contig = sf.FeatureLocation(0, len(contig_seq))
    new_feature_source = sf.SeqFeature(whole_contig, type="source")
    source_qualifiers = new_feature_source.qualifiers
    source_qualifiers['scaffold'] = contig_id
    # db_xref corresponds to the taxon NCBI ID.
    # Important if you want to use Pathway Tools after.
    copy_present(
        ('isolate', 'db_xref', 'cell_type', 'dev_stage', 'mol_type'),
        source_qualifiers)

    record.features.append(new_feature_source)
    return record
def _parse_dbReference(element):
    """Record a dbReference element as database cross-references.

    Always appends "<type>:<id>" to ``self.ParsedSeqRecord.dbxrefs``
    (``self`` comes from the enclosing scope). For EMBL references an
    extra "EMBL-CDS:<value>" entry is added for every
    "protein sequence ID" property. For PDB references a feature is built
    for each chain range, but it is not stored, because the line that
    would append it is commented out.
    """
    self.ParsedSeqRecord.dbxrefs.append(element.attrib["type"] + ":" +
                                        element.attrib["id"])
    if "type" in element.attrib:
        # <dbReference type="EMBL" id="U96180">
        #     <property type="protein sequence ID" value="AAB66902.1"/>
        #     <property type="molecule type" value="mRNA"/>
        # </dbReference>
        if element.attrib["type"] == "EMBL":
            for ref_element in element:
                if "type" in ref_element.attrib and "value" in ref_element.attrib:
                    if ref_element.attrib[
                            "type"] == "protein sequence ID":
                        self.ParsedSeqRecord.dbxrefs.append(
                            "EMBL-CDS:" + ref_element.attrib["value"])
        # e.g.
        # <dbReference type="PDB" key="11" id="2GEZ">
        #     <property value="X-ray" type="method"/>
        #     <property value="2.60 A" type="resolution"/>
        #     <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
        # </dbReference>
        elif element.attrib["type"] == "PDB":
            method = ""
            resolution = ""
            for ref_element in element:
                if ref_element.tag == NS + "property":
                    dat_type = ref_element.attrib["type"]
                    if dat_type == "method":
                        method = ref_element.attrib["value"]
                    if dat_type == "resolution":
                        resolution = ref_element.attrib["value"]
                    if dat_type == "chains":
                        # The value is a comma-separated list of
                        # "<chains>=<start>-<end>" entries.
                        pairs = ref_element.attrib["value"].split(",")
                        for elem in pairs:
                            pair = elem.strip().split("=")
                            # A bare "-" means no residue range is given.
                            if pair[1] != "-":
                                # TODO - How best to store these, do SeqFeatures make sense?
                                feature = SeqFeature.SeqFeature()
                                feature.type = element.attrib["type"]
                                feature.qualifiers[
                                    "name"] = element.attrib["id"]
                                feature.qualifiers["method"] = method
                                feature.qualifiers[
                                    "resolution"] = resolution
                                feature.qualifiers["chains"] = pair[
                                    0].split("/")
                                # Start is shifted down by one; end is
                                # used as written.
                                start = int(pair[1].split("-")[0]) - 1
                                end = int(pair[1].split("-")[1])
                                feature.location = SeqFeature.FeatureLocation(
                                    start, end)
                                # self.ParsedSeqRecord.features.append(feature)
    # This loop has no effect: property elements are visited but ignored.
    for ref_element in element:
        if ref_element.tag == NS + "property":
            pass  # this data cannot be fitted in a seqrecord object with a simple list. however at least ensembl and EMBL parsing can be improved to add entries in dbxrefs
def _get_feature(self, feature_dict):
    """Retrieve a Biopython feature from our dictionary representation.

    ``feature_dict`` provides "location" (arguments for FeatureLocation),
    "type", "id", "strand" and "quals". The strand is applied to the
    location instead of being passed to the SeqFeature constructor: the
    constructor keyword only ever set the location's strand and is not
    accepted by recent Biopython releases.
    """
    location = SeqFeature.FeatureLocation(*feature_dict["location"])
    strand = feature_dict["strand"]
    if strand is not None:
        location.strand = strand
    new_feature = SeqFeature.SeqFeature(
        location,
        feature_dict["type"],
        id=feature_dict["id"],
    )
    new_feature.qualifiers = feature_dict["quals"]
    return new_feature
def test_GenerateFeatLoc__make_start_fuzzy__3(self):
    ''' Test of class `GenerateFeatLoc`. Despite its name, this test calls
        `make_end_fuzzy`: an exact 5..9 location is passed in, the result
        must still be a FeatureLocation and its end must have become an
        AfterPosition. '''
    from Bio import SeqFeature

    location_object = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(5),
        SeqFeature.ExactPosition(9))

    generator = GnOps.GenerateFeatLoc()
    out = generator.make_end_fuzzy(location_object)

    # FeatureLocation
    self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
    # Fuzzy End
    self.assertIsInstance(out.end, Bio.SeqFeature.AfterPosition)
def create_genbank(gene_nucleic_seqs, gene_protein_seqs, annot,
                   go_namespaces, go_alternatives, output_path,
                   species_informations):
    """ Create genbank file from nucleic and protein fasta plus eggnog mapper annotation file.

    Args:
        gene_nucleic_seqs (dict): dictionary of nucleic sequences (key: sequence id, value: sequence)
        gene_protein_seqs (dict): dictionary of protein sequences (key: sequence id, value: sequence)
        annot (dict): dictionary of eggnog-mapper annotation (key: gene_id, value: ['GOs','EC', 'Preferred_name'])
        go_namespaces (dict): dictionary of GO terms namespace (key: GO Term ID, value: namespace associated to GO Term)
        go_alternatives (dict): dictionary of GO terms alternatives ID (key: GO Term ID, value: alternatives GO Term associated to GO Term)
        output_path (str): output file or directory
        species_informations (dict): dictionary containing information about species
    """
    # Every sequence becomes one SeqRecord holding a gene and a CDS
    # feature; the records are written together at the end.
    records = []

    for seq_id in sorted(gene_nucleic_seqs):
        nucleic_seq = gene_nucleic_seqs[seq_id]
        record = record_info(seq_id, nucleic_seq, species_informations)

        # Derive the locus tag: numeric ids get a "gene_" prefix, ids
        # containing "|" keep their second field, others are used as is.
        if seq_id.isnumeric():
            id_gene = f"gene_{seq_id}"
        elif "|" in seq_id:
            id_gene = seq_id.split("|")[1]
        else:
            id_gene = seq_id

        # NOTE(review): a start of 1 is passed to FeatureLocation, which
        # takes 0-based starts elsewhere in this code - confirm intent.
        start_position = 1
        end_position = len(nucleic_seq)
        strand = 0

        gene_location = sf.FeatureLocation(start_position, end_position,
                                           strand)
        gene_feature = sf.SeqFeature(gene_location, type="gene")
        gene_feature.qualifiers['locus_tag'] = id_gene
        record.features.append(gene_feature)

        cds_feature = create_cds_feature(id_gene, start_position,
                                         end_position, strand, annot,
                                         go_namespaces, go_alternatives,
                                         gene_protein_seqs)
        cds_feature.qualifiers['locus_tag'] = id_gene
        record.features.append(cds_feature)

        records.append(record)

    # Create Genbank with the list of SeqRecord.
    SeqIO.write(records, output_path, 'genbank')
def _get_feature(self, feature_dict):
    """Retrieve a Biopython feature from our dictionary representation.

    ``feature_dict`` provides 'location' (arguments for FeatureLocation),
    'type', 'id', 'strand' and 'quals'. The strand is applied to the
    location instead of being passed to the SeqFeature constructor: the
    constructor keyword only ever set the location's strand and is not
    accepted by recent Biopython releases.
    """
    location = SeqFeature.FeatureLocation(*feature_dict['location'])
    strand = feature_dict['strand']
    if strand is not None:
        location.strand = strand
    new_feature = SeqFeature.SeqFeature(location, feature_dict['type'],
                                        id=feature_dict['id'])
    # Support for Biopython 1.68 and above, which removed sub_features
    if not hasattr(new_feature, "sub_features"):
        new_feature.sub_features = []
    new_feature.qualifiers = feature_dict['quals']
    return new_feature