def to_biopython(self, qualifiers: Dict[str, Any] = None) -> List[SeqFeature]:
    """ Converts this feature into one or more SeqFeature instances.

        Subclasses must manage their own attributes and potential extra
        features.

        Arguments:
            qualifiers: optional extra qualifiers to merge into the result;
                        any "note" entries are combined with this feature's
                        own notes, and the mapping itself is left unmodified

        Returns:
            a list containing the single SeqFeature built from this feature
    """
    feature = SeqFeature(self.location, type=self.type)
    quals = self._qualifiers.copy()
    stored_notes = self._qualifiers.get("note", [])
    assert stored_notes is not None
    # work on a fresh list: the shallow copy above still shares the stored
    # "note" list, so extending it in place would grow the feature's own
    # qualifiers every time this method is called
    notes = list(stored_notes)
    notes.extend(self.notes)
    if qualifiers:
        # copy before popping so the caller's mapping keeps its "note" entry
        extras = dict(qualifiers)
        notes.extend(extras.pop("note", []))
        quals.update(extras)
    if notes:
        # sorting helps with consistency and comparison testing
        quals["note"] = sorted(notes)
    if self.created_by_antismash:
        quals["tool"] = ["antismash"]
    if self._original_codon_start is not None:
        start = int(self._original_codon_start)
        # the stored value is zero-based, the qualifier is one-based
        quals["codon_start"] = [str(start + 1)]
        # adjust location back if necessary
        if self.location.strand == -1:
            start *= -1
        if self._original_codon_start != 0:
            feature.location = _adjust_location_by_offset(feature.location, -start)
    # sorted here to match the behaviour of biopython
    for key, val in sorted(quals.items()):
        feature.qualifiers[key] = val
    assert isinstance(feature.qualifiers, dict)
    return [feature]
def _track(track_level):
    """Add one track at ``track_level`` to the enclosing ``diagram``.

    The track receives one labelled forward-strand feature per
    ``(name, interval)`` pair from the enclosing ``names``/``intervals``,
    followed by the first eight features of the enclosing ``record``,
    rescaled and drawn on the reverse strand in alternating colours.
    """
    # Divisor applied to the record's feature coordinates before drawing.
    scale = 4.8045
    new_track = diagram.new_track(track_level, greytrack=False)
    drawn = new_track.new_set()

    # Named intervals, labelled vertically.
    for label, span in zip(names, intervals):
        marker = SeqFeature(FeatureLocation(*span, strand=1))
        drawn.add_feature(marker, name=label, label=True, label_angle=90)

    # NOTE(review): the rescaled location is assigned back onto the record's
    # own feature objects, so calling this function more than once divides
    # the coordinates again each time -- confirm that is intended.
    for index, feature in enumerate(record.features[:8]):
        previous = feature.location
        feature.location = FeatureLocation(
            int(previous.start / scale),
            int(previous.end / scale),
            strand=-1,
        )
        if index % 2 == 0:
            fill = colors.blue
        else:
            fill = colors.lightblue
        drawn.add_feature(feature, color=fill, label=True)
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append gene, mRNA and CDS features for ``gene`` to record ``r``.

    :param r: record exposing ``seq`` and a ``features`` list; modified in place
    :param gene: transcript identifier, used as key into ``gene2position``
    :param gene2position: mapping whose values unpack to
        ``(contig, CDSs, gffstrand, function, frames)``; it is not modified
    :param gene2product: mapping of gene id to product name; genes missing
        from it are labelled "hypothetical protein"
    :param start: feature start (``start - 1`` is used as the location start,
        so presumably 1-based -- confirm against the caller)
    :param end: feature end
    :param gcode: translation table passed to ``translate``
    :param partialyes: when false, CDSs detected as partial are not added
    :param verbose: when true, warnings are written to stderr
    :return: ``r``; the gene/mRNA features may already have been appended
        even when the CDS is skipped
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # Reverse a copy: reversing the list stored in gene2position in place
        # would flip it back on a second call for the same gene.
        CDSs = list(reversed(CDSs))
    # (extending the last CDS by the stop codon is intentionally not done here)
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    if gene.endswith('.t1'):
        # The strand belongs on the location; SeqFeature's own ``strand``
        # argument is deprecated in recent Biopython releases.
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1), AfterPosition(end), strand=strand),
                        type='gene', id=geneid)
        sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    # An empty translation cannot be classified below (aa[0] would fail).
    if not aa:
        if verbose:
            sys.stderr.write("[Warning] Empty translation for: %s. Skipped!\n" % gene)
        return r
    #solve non-triplets issue
    remainder = len(seq) % 3
    if remainder:
        if strand == 1:
            end -= remainder
        else:
            start += remainder
    ##check for partial sequence - no M as first or no * as last aa
    no_start = aa[0] != "M"
    no_stop = aa[-1] != "*"
    partial = 0
    # NOTE(review): a partial CDS gets a single start..end span in place of
    # the location returned by get_locations -- confirm that dropping any
    # multi-part structure is intended. The strand is now kept on the span.
    #both ends partial
    if no_start and no_stop:
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), AfterPosition(end), strand=strand)
    #left end partial
    elif (no_start and strand == 1) or (no_stop and strand == -1):
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), end, strand=strand)
    #right end partial
    elif (no_stop and strand == 1) or (no_start and strand == -1):
        partial = 1
        sf.location = FeatureLocation(start - 1, AfterPosition(end), strand=strand)
    #strip stop codon
    aa = aa.strip("*")
    #skip entries with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r
    sf.qualifiers = {'transl_table': gcode, "locus_tag": geneid, "gene": geneid,
                     "product": product, "translation": aa}
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if partial entries are not allowed
        if not partialyes:
            return r
        # startswith() is safe when stripping left an empty translation
        if not aa.startswith("M"):
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))
    #add to features
    r.features.append(sf)
    return r
# Short stand-in record named after the destination.
placeholder = SeqRecord(
    Seq("cgctatgcgaacaaaattgaactggaacgc", alphabet=IUPAC.unambiguous_dna),
    name=destination,
)

if not args.vector:
    # No template vector: work on the bare placeholder and annotate its
    # whole length with the destination label so it can be looked up below.
    output_filename = destination + ".gb"
    objectives = []
    constraints = []
    naive_construct = placeholder
    whole_seq_feat = SeqFeature()
    whole_seq_feat.type = "misc_feature"
    whole_seq_feat.qualifiers['label'] = [destination]
    whole_seq_feat.location = FeatureLocation(0, len(placeholder), strand=1)
    naive_construct.features.append(whole_seq_feat)
else:
    # A template vector was given: name the output after it and let the
    # template supply the construct together with its objectives/constraints.
    base, ext = os.path.splitext(os.path.basename(args.vector))
    output_filename = base + "_" + destination + ext
    naive_construct, objectives, constraints = load_template(
        args.vector, placeholder, destination
    )

# Locate the annotated destination and add the user-supplied options.
dest_feat = find_annotation(naive_construct, placeholder.name)
dest_loc = Location.from_biopython_location(dest_feat.location)
user_objectives, user_constraints = load_user_options(args, dest_loc)
objectives += user_objectives
constraints += user_constraints

problem = DnaOptimizationProblem(
    str(naive_construct.seq),
    constraints=constraints,
    objectives=objectives,
)
def _new_group(anchor, seed):
    """Start a group feature whose only member so far is ``seed``."""
    # NOTE(review): ``anchor`` is a feature object, not a string id; this
    # mirrors the original ``id=c.features[0]`` -- ``.id`` may have been meant.
    group = SeqFeature(id=anchor,
                       type="grouped_transcription_regulatory_region",
                       location=seed.location)
    group.sub_features = [seed]
    return group


def _close_group(group, strand, number, suffix, warn_span):
    """Set qualifiers and the spanning location of a finished group."""
    members = group.sub_features
    names = {m.qualifiers["description"][0].split(suffix)[0] for m in members}
    group.qualifiers = {
        "description": "_".join(sorted(names)),
        "ID": ["ILEXPARARR" + str(number)],
    }
    group.location = FeatureLocation(
        start=min(m.location.start for m in members),
        end=max(m.location.end for m in members),
        strand=strand)
    assert group.location.start < group.location.end
    # Report unusually long groups so they can be inspected by hand.
    if (group.location.end - group.location.start) > warn_span:
        print(group.qualifiers["ID"])
    return group


def agrupar_sitios(gff_in="/data/organismos/ILEX_PARA2/regulation/ncbi_IP4.gff3.reg",
                   gff_out="/data/organismos/ILEX_PARA2/regulation/grouped.gff",
                   max_gap=1500, warn_span=5000):
    """Group nearby same-strand features of each record and write them as GFF.

    For every record parsed from ``gff_in`` and for each strand, the features
    are sorted by start. A feature joins the current group when it starts
    less than ``max_gap`` from the group's largest end, or when it overlaps
    the group's most recently added member; otherwise the group is closed and
    a new one begins. Each closed group is numbered consecutively
    ("ILEXPARARR<n>") and its ID is printed when it spans more than
    ``warn_span``.

    :param gff_in: path of the GFF file to read
    :param gff_out: path of the GFF file to write
    :param max_gap: distance below which a feature is merged into the group
    :param warn_span: group length above which the group ID is printed
    """
    regiones = list(GFF.parse(gff_in))
    ids = 1
    groups = {}
    for c in tqdm(regiones):
        groups[c.id] = []
        for strand in [1, -1]:
            fs = sorted((f for f in c.features if f.strand == strand),
                        key=lambda x: x.location.start)
            # Checked before touching c.features[0], so a record without
            # features no longer raises IndexError.
            if not fs:
                continue
            anchor = c.features[0]
            group = _new_group(anchor, fs[0])
            for f in fs[1:]:
                end = max(x.location.end for x in group.sub_features)
                last = group.sub_features[-1].location
                near = abs(f.location.start - end) < max_gap
                # Same result as intersecting the two integer ranges, without
                # building a set entry for every base.
                overlaps = (max(f.location.start, last.start)
                            < min(f.location.end, last.end))
                if near or overlaps:
                    group.sub_features.append(f)
                else:
                    # NOTE(review): groups closed here strip
                    # " regulatory region" from descriptions while the last
                    # group of a strand strips " binding site"; kept as in
                    # the original -- confirm which suffix is intended.
                    groups[c.id].append(
                        _close_group(group, strand, ids,
                                     " regulatory region", warn_span))
                    ids += 1
                    group = _new_group(anchor, f)
            # The strand comes from the loop itself: the inner loop variable
            # is stale (or unbound) when a strand holds a single feature.
            groups[c.id].append(
                _close_group(group, strand, ids, " binding site", warn_span))
            ids += 1

    # for _, v in groups.items():
    #     for x in v:
    #         x.sub_features = []

    records = [
        SeqRecord(id=k, name="", description="", seq=Seq(""), features=v)
        for k, v in groups.items()
    ]
    # Close the output handle deterministically once writing is finished.
    with open(gff_out, "w") as handle:
        GFF.write(tqdm(records), handle)
def gene2features(r, gene, gene2position, gene2product, start, end, gcode, partialyes, verbose):
    """Append gene, mRNA and CDS features for ``gene`` to record ``r``.

    :param r: record exposing ``seq`` and a ``features`` list; modified in place
    :param gene: transcript identifier, used as key into ``gene2position``
    :param gene2position: mapping whose values unpack to
        ``(contig, CDSs, gffstrand, function, frames)``; it is not modified
    :param gene2product: mapping of gene id to product name; genes missing
        from it are labelled "hypothetical protein"
    :param start: feature start (``start - 1`` is used as the location start,
        so presumably 1-based -- confirm against the caller)
    :param end: feature end
    :param gcode: translation table passed to ``translate``
    :param partialyes: when false, CDSs detected as partial are not added
    :param verbose: when true, warnings are written to stderr
    :return: ``r``; the gene/mRNA features may already have been appended
        even when the CDS is skipped
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # Reverse a copy: reversing the list stored in gene2position in place
        # would flip it back on a second call for the same gene.
        CDSs = list(reversed(CDSs))
    # (extending the last CDS by the stop codon is intentionally not done here)
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene
    #get product
    product = "hypothetical protein"
    if geneid in gene2product:
        product = gene2product[geneid]
    if gene.endswith('.t1'):
        # The strand belongs on the location; SeqFeature's own ``strand``
        # argument is deprecated in recent Biopython releases.
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1), AfterPosition(end), strand=strand),
                        type='gene', id=geneid)
        sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {"locus_tag": geneid, "gene": geneid, "product": product}
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    # An empty translation cannot be classified below (aa[0] would fail).
    if not aa:
        if verbose:
            sys.stderr.write("[Warning] Empty translation for: %s. Skipped!\n" % gene)
        return r
    #solve non-triplets issue
    remainder = len(seq) % 3
    if remainder:
        if strand == 1:
            end -= remainder
        else:
            start += remainder
    ##check for partial sequence - no M as first or no * as last aa
    no_start = aa[0] != "M"
    no_stop = aa[-1] != "*"
    partial = 0
    # NOTE(review): a partial CDS gets a single start..end span in place of
    # the location returned by get_locations -- confirm that dropping any
    # multi-part structure is intended. The strand is now kept on the span.
    #both ends partial
    if no_start and no_stop:
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), AfterPosition(end), strand=strand)
    #left end partial
    elif (no_start and strand == 1) or (no_stop and strand == -1):
        partial = 1
        sf.location = FeatureLocation(BeforePosition(start - 1), end, strand=strand)
    #right end partial
    elif (no_stop and strand == 1) or (no_start and strand == -1):
        partial = 1
        sf.location = FeatureLocation(start - 1, AfterPosition(end), strand=strand)
    #strip stop codon
    aa = aa.strip("*")
    #skip entries with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" % gene)
        return r
    sf.qualifiers = {'transl_table': gcode, "locus_tag": geneid, "gene": geneid,
                     "product": product, "translation": aa}
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if partial entries are not allowed
        if not partialyes:
            return r
        # startswith() is safe when stripping left an empty translation
        if not aa.startswith("M"):
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene,))
    #add to features
    r.features.append(sf)
    return r
def correct(_gff, _genome):
    """
    Check every mRNA of every gene and try to repair the ones that fail.

    Each mRNA is validated with ``get_cds``. On a ``TranslationError`` a
    repair is chosen from the error message (start codon, phase, stop codon)
    and the repaired mRNA is kept; failures that cannot be repaired are
    recorded by category. Genes that keep at least one mRNA are added to the
    returned records with their bounds widened to cover those mRNAs; for
    genes that keep none, the recorded failures are reported instead.

    :param _gff: gff path
    :param _genome: fasta path
    :return: ``(correct_list, gene_error_dict)`` -- the rebuilt records, and
        a mapping of error category to ``"<gene id> <mRNA id>"`` strings
    """
    _seqs = SeqIO.to_dict(SeqIO.parse(_genome, 'fasta'))
    _gff = list(GFF.parse(_gff, base_dict=_seqs))
    correct_list = []
    gene_error_dict = defaultdict(list)
    # Prefix shared by the punctuated and unpunctuated forms of the message.
    internal_stop = "Extra in frame stop codon"
    for scaffold in _gff:
        correct_scaffold = SeqRecord(seq="", id=scaffold.id, name=scaffold.name,
                                     description=scaffold.description)
        for gene in scaffold.features:
            # The strand travels with the location; SeqFeature's own
            # ``strand`` argument is deprecated in recent Biopython releases.
            strand = gene.location.strand
            correct_gene = SeqFeature(location=gene.location, type='gene', id=gene.id,
                                      qualifiers={'ID': [gene.id]})
            correct_gene.sub_features = []
            error_dict = defaultdict(list)
            for mRNA in gene.sub_features:
                try:
                    get_cds(mRNA, scaffold)
                    correct_gene.sub_features.append(mRNA)
                except TranslationError as e:
                    message = e.args[0]
                    try:
                        if message.startswith("First codon"):
                            _tmp_mrna = correct_start_codon(mRNA, scaffold)
                            get_cds(_tmp_mrna, scaffold)
                            correct_gene.sub_features.append(_tmp_mrna)
                            error_dict['corrected'].append(mRNA.id)
                        elif message.startswith('The phase of first CDS is not 0'):
                            # the translation was checked in function
                            _tmp_mrna = correct_phase(mRNA, scaffold)
                            correct_gene.sub_features.append(_tmp_mrna)
                            error_dict['corrected'].append(mRNA.id)
                        elif message.endswith("is not a stop codon"):
                            _tmp_mrna = correct_stop_codon(mRNA, scaffold)
                            get_cds(_tmp_mrna, scaffold)
                            correct_gene.sub_features.append(_tmp_mrna)
                            error_dict['corrected'].append(mRNA.id)
                        # can not handle for now
                        elif message.startswith(internal_stop):
                            error_dict['internal'].append(mRNA.id)
                        elif message.endswith("is not a multiple of three"):
                            error_dict['three'].append(mRNA.id)
                    except TranslationError as e2:
                        message2 = e2.args[0]
                        if message2.startswith('These mRNAs need another round correction'):
                            correct_gene.sub_features.append(e2.args[1])
                            error_dict['phase'].append(mRNA.id)
                        # for second round
                        # (prefix match: the old exact comparison lacked the
                        # trailing period used in the first round, so it
                        # could not match that message)
                        elif message2.startswith(internal_stop):
                            error_dict['internal'].append(mRNA.id)
                        elif message2.startswith('First codon'):
                            error_dict['first2'].append(mRNA.id)
                        elif message2.endswith("is not a stop codon"):
                            error_dict['final'].append(mRNA.id)
                        elif message2.endswith("is not a multiple of three"):
                            error_dict['three'].append(mRNA.id)
                except Exception as err:
                    # Best effort: report the failing mRNA and carry on.
                    print(err)
                    print(mRNA.id)
            # handle mRNA and gene relationship
            if not correct_gene.sub_features:
                # Report errors for genes whose mRNAs all failed.
                for _key, value in error_dict.items():
                    gene_error_dict[_key] += [gene.id + ' ' + _ for _ in value]
            else:
                # Widen the gene to cover every kept mRNA. Running min/max is
                # needed: comparing each mRNA with the original bounds let
                # the last extending mRNA override an earlier, wider one.
                gene_start, gene_end = gene.location.start, gene.location.end
                for mRNA in correct_gene.sub_features:
                    gene_start = min(gene_start, mRNA.location.start)
                    gene_end = max(gene_end, mRNA.location.end)
                correct_gene.location = FeatureLocation(gene_start, gene_end, strand=strand)
                correct_scaffold.features.append(correct_gene)
        correct_list.append(correct_scaffold)
    # tidy correct list
    return correct_list, gene_error_dict
'label': 'B0015 Double Terminator', 'note': ['color: #ff8eff', '"iGEM Part: BBa_B0015"'] }

# merge Lux pL promoter
# If a feature is returned for 'R0063', its location is copied onto the
# 'Lux pL promoter' feature and the 'R0063' feature is removed.
# NOTE(review): get_features is defined elsewhere; it is used here as an
# iterator (via next) -- presumably yielding features matching the label.
r0063 = next(get_features('R0063'), None)
if r0063 is not None:
    # No default is given here, so a missing 'Lux pL promoter' feature
    # raises StopIteration.
    luxpl = next(get_features('Lux pL promoter'))
    luxpl.location = r0063.location
    gb_archive.features.remove(r0063)

# add LVA ssrA tag
# When SSRA_TAG matches the sequence, annotate the matched span as a
# forward-strand CDS feature.
ssra_match = SSRA_TAG.search(gb_archive.seq)
if ssra_match is not None:
    ssra = SeqFeature(type="CDS")
    # span() supplies the (start, end) of the match
    ssra.location = FeatureLocation(*ssra_match.span(), strand=1)
    ssra.qualifiers = {
        "label": ["ssrA tag (LVA)"],
        "product": [
            "C-terminal peptide that mediates degradation in bacteria through the ClpXP and ClpAP proteases (McGinness et al., 2006)"
        ],
        # NOTE(review): a bare string here, whereas the other qualifier
        # values are lists -- confirm this is intended.
        "translation": "AANDENYALVA",
        "note": [
            "mutant LVA variant that confers accelerated degradation under some conditions (Andersen et al., 1998)",
            "color: #cc99b2",
        ],
    }
    gb_archive.features.append(ssra)

# Replace E0040m with well annotated GFP
print('\n---Exercise 5---')
id = "PAX-6.5"
format = "fasta"
# Read the single record; the context manager closes the input handle,
# which the original left open.
with open(id + "." + format) as handle:
    record = SeqIO.read(handle, format)
print("Record:\n", record)
record.seq.alphabet = IUPAC.unambiguous_dna
print("\nAlphabet altered to IUPAC.unambiguous_dna !!!")
# The fourth '|'-separated field of the record id is taken as the accession.
accNb = record.id.split("|")[3]
print("Access number: ", accNb)
record.name = accNb
record.id = accNb
print("record.name and record.id have been altered !!!")
# Add a reverse-strand gene feature covering positions 18..200.
feature = SeqFeature()
feature.type = "gene"
feature.location = FeatureLocation(18, 200)
feature.strand = -1
record.features.append(feature)
print("record.features: ", record.features)
print("\nRecord:\n", record)
# Write the record as GenBank; closing the handle flushes the output.
with open(id + ".gb", "w") as handle:
    count = SeqIO.write(record, handle, "genbank")
print("Converted %i records" % count)
if 6 in _RunExercise:
    print('\n---ORF---')
    mail = ''
    id = "NC_009926"
    db = "nuccore"
    format = "fasta"