def writeGff3(data, handle, parentGff3):
    """Write the parents of the CDS features listed in ``data`` as GFF3.

    For each record parsed from ``parentGff3`` the features whose ID is a key
    of ``data`` are selected, the record's feature list is replaced by the
    ``fetchParent`` result of each selected feature, and the record is written
    to ``handle``. A selected feature whose ``data`` entry has a truthy
    ``"cleavage"`` value also gets a note formatted from ``"cleavageSite"``.
    """
    for record in GFF.parse(parentGff3):
        selected = list(
            feature_lambda(
                record.features,
                feature_test_id,
                {"id": data.keys()},
                subfeatures=False,
            )
        )
        # Start from an empty feature list; only parents of selected CDSs
        # are written back out.
        record.features = []
        for cds in selected:
            notes = cds.qualifiers.setdefault("note", [])
            entry = data[get_id(cds)]
            if entry["cleavage"]:
                notes.append(
                    "Cleavage between %s and %s" % entry["cleavageSite"]
                )
            record.features.append(fetchParent(cds))
        GFF.write([record], handle)
def shinefind(
    genbank_file,
    gff3_output=None,
    table_output=None,
    lookahead_min=5,
    lookahead_max=15,
    top_only=False,
    add=False,
):
    """Report Shine-Dalgarno hits upstream of every CDS in a GenBank file.

    Three outputs are produced:

    * a tab-separated table written to ``table_output`` (one row per hit, or
      a single row with ``None``/``-1`` for a CDS without hits),
    * each input record, re-written as GenBank to ``sys.stdout``,
    * a GFF3 record holding only the SD features, written to ``gff3_output``.

    Args:
        genbank_file: Handle or path passed to ``SeqIO.parse(..., "genbank")``.
        gff3_output: Handle passed to ``GFF.write`` for the SD-only record.
        table_output: Handle the report table is written to.
        lookahead_min: Passed to ``testFeatureUpstream`` as ``sd_min`` and
            added to each hit's spacing in the table.
        lookahead_max: Passed to ``testFeatureUpstream`` as ``sd_max``.
        top_only: When true, only the first hit per CDS is reported.
        add: When true, reported SD features are also appended to the
            GenBank record that is written to stdout.
    """

    def write_row(columns):
        table_output.write("\t".join(map(str, columns)) + "\n")

    write_row(
        [
            "ID",
            "Name",
            "Terminus",
            "Terminus",
            "Strand",
            "Upstream Sequence",
            "SD",
            "Spacing",
        ]
    )

    sd_finder = NaiveSDCaller()

    for record in SeqIO.parse(genbank_file, "genbank"):
        # Sometimes TWO CDS features share the same start; only handle ONE.
        seen_starts = set()

        # Shinefind's "gff3_output".
        gff3_output_record = SeqRecord(record.seq, record.id)

        # Iterate over a snapshot: with ``add`` set, SD features are appended
        # to record.features below, and a list must not grow while it is
        # being iterated.
        for feature in list(record.features):
            if feature.type != "CDS":
                continue

            strand = feature.location.strand
            # The translation start is location.start on the plus strand and
            # location.end otherwise.
            anchor = feature.location.start if strand > 0 else feature.location.end
            if anchor in seen_starts:
                continue
            seen_starts.add(anchor)

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max
            )
            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(
                sds, strand, start, end, feature_id=feature.id
            )
            human_strand = "+" if strand == 1 else "-"

            log.debug("Found %s SDs", len(sds))

            if not sds:
                # No hit for this CDS: still emit a row so every CDS appears
                # in the table.
                write_row(
                    [
                        feature.id,
                        feature_id,
                        feature.location.start,
                        feature.location.end,
                        human_strand,
                        seq,
                        None,
                        -1,
                    ]
                )
                continue

            for sd, sd_feature in zip(sds, sd_features):
                write_row(
                    [
                        feature.id,
                        feature_id,
                        feature.location.start,
                        feature.location.end,
                        human_strand,
                        sd_finder.highlight_sd(seq, sd["start"], sd["end"]),
                        sd["hit"],
                        int(sd["spacing"]) + lookahead_min,
                    ]
                )

                if add:
                    # Attach the SD feature to the record written to stdout.
                    record.features.append(sd_feature)
                # Also register the feature with the separate GFF3 output.
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    break

        record.features = sorted(record.features, key=lambda x: x.location.start)
        SeqIO.write([record], sys.stdout, "genbank")

        gff3_output_record.features = sorted(
            gff3_output_record.features, key=lambda x: x.location.start
        )
        gff3_output_record.annotations = {}
        GFF.write([gff3_output_record], gff3_output)
top_hits[qseq][fn] = (evalue, sseq, dice) sys.stdout.write("# Query Feature\tLocation\t") sys.stdout.write("\t".join(["%s\tevalue\tdice" % x for x in blast_names])) sys.stdout.write("\n") for rec in GFF.parse(args.gff3): for feat in fsort( feature_lambda(rec.features, feature_test_type, {"types": "CDS"}, subfeatures=False)): sys.stdout.write(feat._parent._parent.qualifiers["Name"][0]) sys.stdout.write("\t") sys.stdout.write(str(feat.location)) for db in blast_names: fid = get_id(feat) if fid in top_hits: if fn in top_hits[fid]: sys.stdout.write("\t") sys.stdout.write(";".join([ "%s %s" % (x, y) for (x, y) in top_hits[fid][fn][1] ])) sys.stdout.write("\t") sys.stdout.write(str(top_hits[fid][fn][0])) sys.stdout.write("\t") sys.stdout.write(str(top_hits[fid][fn][2])) else: sys.stdout.write("\tNone") sys.stdout.write("\tNone") sys.stdout.write("\tNone") else:
def find_lipoprotein(gff3_file, fasta_genome, lipobox_mindist=10, lipobox_maxdist=60):
    """Yield records reduced to the genes whose protein matches a lipobox.

    For each gene, the first CDS sub-feature is translated (table 11) and
    searched with the lipobox pattern(s). For every match a ``Lipobox``
    sub-feature covering the last four matched residues is appended to the
    gene, and the gene is kept. Each record is yielded as a one-element list
    after its features are replaced by the kept genes.

    Args:
        gff3_file: Input passed to ``GFF.parse``.
        fasta_genome: Input passed to ``SeqIO.parse(..., "fasta")``; supplies
            the sequences for the GFF3 records.
        lipobox_mindist: Minimum number of residues before the lipobox.
        lipobox_maxdist: Maximum number of residues before the lipobox.

    Yields:
        ``[record]`` for every record in ``gff3_file``.
    """
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta_genome, "fasta"))

    # Do not add multiple cases that share matches (e.g. an extra
    # '^.{min,max}AWAC' pattern): that would introduce duplicate features
    # into the GFF3 output.
    CASES = [
        re.compile(
            r"^.{%s,%s}[ACGSILMFTV][^REKD][GASNL]C"
            % (lipobox_mindist, lipobox_maxdist)
        ),
    ]

    for record in GFF.parse(gff3_file, base_dict=seq_dict):
        good_features = []

        genes = list(
            feature_lambda(
                record.features, feature_test_type, {"type": "gene"}, subfeatures=True
            )
        )
        for gene in genes:
            cdss = list(
                feature_lambda(
                    gene.sub_features,
                    feature_test_type,
                    {"type": "CDS"},
                    subfeatures=False,
                )
            )
            if not cdss:
                continue

            # Only the first CDS of a gene is considered.
            cds = cdss[0]

            try:
                protein = cds.extract(record.seq).translate(table=11, cds=True)
            except Exception:
                # Best effort: a CDS that cannot be extracted or translated
                # as a complete coding sequence is skipped. Unlike a bare
                # ``except``, this lets KeyboardInterrupt/SystemExit through.
                continue
            tmpseq = str(protein).replace("*", "")

            for case in CASES:
                m = case.search(tmpseq)
                if not m:
                    continue

                # Map the last four matched residues back to nucleotide
                # coordinates, counting from the 5' end of the CDS.
                if cds.location.strand > 0:
                    start = cds.location.start + (3 * (m.end() - 4))
                    end = cds.location.start + (3 * m.end())
                else:
                    start = cds.location.end - (3 * (m.end() - 4))
                    end = cds.location.end - (3 * m.end())

                tmp = SeqFeature(
                    FeatureLocation(
                        min(start, end),
                        max(start, end),
                        strand=cds.location.strand,
                    ),
                    type="Lipobox",
                    qualifiers={
                        "source": "CPT_LipoRy",
                        "ID": "%s.lipobox" % get_id(gene),
                    },
                )
                tmp.qualifiers["sequence"] = str(tmp.extract(record).seq.translate())

                gene.sub_features.append(tmp)
                good_features.append(gene)

        record.features = good_features
        yield [record]
def main(fasta, gff3, feature_filter=None, nodesc=False):
    """Yield one-element lists of ``SeqRecord`` extracted from GFF3 features.

    ``feature_filter`` selects the mode:

    * ``"nice_cds"``: records come from ``gff2gb.gff3_to_genbank``; every CDS
      is yielded, named after its ``locus_tag`` (or ``get_id``), with a
      ``_<n>`` suffix from the second occurrence of a name onwards.
    * ``"unique_cds"``: every CDS is yielded under ``<record id>____<feature
      id>``, made unique with a UUID suffix when already seen; the renamed
      features are also written as GFF3 to ``sys.stderr``.
    * anything else: features of type ``feature_filter`` are yielded under
      their own ID.

    When ``nodesc`` is true the yielded records get an empty description.
    If no FASTA sequence matches a GFF3 record, a message is printed and the
    process exits with status 1.
    """

    def bracketed_description(feat):
        # "[Location=...;Name=...]" summary; Name only when the qualifier exists.
        important_data = {"Location": feat.location}
        if "Name" in feat.qualifiers:
            important_data["Name"] = feat.qualifiers.get("Name", [""])[0]
        pairs = [
            "{key}={value}".format(key=k, value=v)
            for (k, v) in important_data.items()
        ]
        return "[{}]".format(";".join(pairs))

    def require_fasta_match(rec, seq_dict):
        # A record matches when a FASTA ID equals its ID or the Alias of its
        # first feature.
        first_quals = rec.features[0].qualifiers
        lColumn = first_quals["Alias"][0] if "Alias" in first_quals.keys() else ""
        if not any(x == rec.id or x == lColumn for x in seq_dict):
            print("No Fasta ID matches GFF")
            exit(1)

    def by_start(f):
        return f.location.start

    if feature_filter == "nice_cds":
        from gff2gb import gff3_to_genbank as cpt_Gff2Gbk

        for rec in cpt_Gff2Gbk(gff3, fasta, 11):
            occurrences = {}
            if rec.seq[0] == "?":
                print("No Fasta ID matches GFF")
                exit(1)

            for feat in sorted(rec.features, key=by_start):
                if feat.type != "CDS":
                    continue

                def current_label(feat=feat):
                    return str(
                        feat.qualifiers.get("locus_tag", get_id(feat)).replace(
                            " ", "-"
                        )
                    )

                label = current_label()
                count = occurrences.get(label, 0) + 1
                occurrences[label] = count
                # The first occurrence keeps the bare name; later ones get _2, _3, ...
                suffix = "_" + str(count) if count > 1 else ""

                if nodesc:
                    description = ""
                else:
                    feat.qualifiers["ID"] = [feat._ID]
                    product = feat.qualifiers.get("product", "")
                    description = "{1} [Location={0.location};ID={0.qualifiers[ID][0]}]".format(
                        feat, product
                    )

                # The label is recomputed here, after the ID qualifier may
                # have been set above, exactly as the record ID always was.
                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=current_label() + suffix,
                        description=description,
                    )
                ]

    elif feature_filter == "unique_cds":
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
        seen_ids = set()

        for rec in GFF.parse(gff3, base_dict=seq_dict):
            require_fasta_match(rec, seq_dict)

            newfeats = []
            cds_features = feature_lambda(
                rec.features, feature_test_type, {"type": "CDS"}, subfeatures=False
            )
            for feat in sorted(cds_features, key=by_start):
                nid = rec.id + "____" + feat.id
                if nid in seen_ids:
                    nid = nid + "__" + uuid.uuid4().hex
                feat.qualifiers["ID"] = nid
                newfeats.append(feat)
                seen_ids.add(nid)

                description = "" if nodesc else bracketed_description(feat)
                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=nid.replace(" ", "-"),
                        description=description,
                    )
                ]

            rec.features = newfeats
            rec.annotations = {}
            GFF.write([rec], sys.stderr)

    else:
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

        for rec in GFF.parse(gff3, base_dict=seq_dict):
            require_fasta_match(rec, seq_dict)

            matching = feature_lambda(
                rec.features,
                feature_test_type,
                {"type": feature_filter},
                subfeatures=False,
            )
            for feat in sorted(matching, key=by_start):
                # Fall back to get_id() when the feature carries no ID.
                seq_id = feat.id if len(feat.id) != 0 else get_id(feat)

                description = "" if nodesc else bracketed_description(feat)
                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=seq_id.replace(" ", "-"),
                        description=description,
                    )
                ]
def main(fasta, gff3, feature_filter=None, nodesc=False):
    """Yield one-element lists of ``SeqRecord`` extracted from GFF3 features.

    * ``feature_filter == 'nice_cds'``: records come from
      ``gff2gb.gff3_to_genbank``; each CDS is yielded under its ``locus_tag``
      (or ``get_id``) with a ``product``/location/ID description.
    * ``feature_filter == 'unique_cds'``: each CDS is yielded under
      ``<record id>____<feature id>`` (plus a UUID suffix when that ID was
      already used); the renamed features are written as GFF3 to
      ``sys.stderr``.
    * otherwise: features of type ``feature_filter`` are yielded under their
      own ID, or ``get_id`` when that is empty.

    ``nodesc`` replaces every description with an empty string.
    """

    def describe(feature):
        # Build "[Location=...;Name=...]"; Name is included only if present.
        fields = {'Location': feature.location}
        if 'Name' in feature.qualifiers:
            fields['Name'] = feature.qualifiers.get('Name', [''])[0]
        rendered = []
        for (k, v) in fields.items():
            rendered.append('{key}={value}'.format(key=k, value=v))
        return '[{}]'.format(';'.join(rendered))

    def start_of(feature):
        return feature.location.start

    if feature_filter == 'nice_cds':
        from gff2gb import gff3_to_genbank

        for rec in gff3_to_genbank(gff3, fasta):
            ordered = sorted(rec.features, key=start_of)
            for feat in ordered:
                if feat.type != 'CDS':
                    continue

                description = ''
                if not nodesc:
                    feat.qualifiers['ID'] = [feat._ID]
                    product = feat.qualifiers.get('product', '')
                    description = '{1} [Location={0.location};ID={0.qualifiers[ID][0]}]'.format(
                        feat, product)

                name = feat.qualifiers.get('locus_tag', get_id(feat))
                extracted = SeqRecord(
                    feat.extract(rec).seq,
                    id=name.replace(' ', '-'),
                    description=description,
                )
                yield [extracted]
        return

    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    if feature_filter == 'unique_cds':
        seen_ids = set()
        for rec in GFF.parse(gff3, base_dict=seq_dict):
            newfeats = []
            cdss = feature_lambda(
                rec.features, feature_test_type, {'type': 'CDS'}, subfeatures=False
            )
            for feat in sorted(cdss, key=start_of):
                nid = rec.id + '____' + feat.id
                if nid in seen_ids:
                    # Same record/feature ID seen before: disambiguate.
                    nid = nid + '__' + uuid.uuid4().hex
                feat.qualifiers['ID'] = nid
                newfeats.append(feat)
                seen_ids.add(nid)

                description = '' if nodesc else describe(feat)
                yield [
                    SeqRecord(
                        feat.extract(rec).seq,
                        id=nid.replace(' ', '-'),
                        description=description,
                    )
                ]

            rec.features = newfeats
            rec.annotations = {}
            GFF.write([rec], sys.stderr)
        return

    for rec in GFF.parse(gff3, base_dict=seq_dict):
        selected = feature_lambda(
            rec.features, feature_test_type, {'type': feature_filter}, subfeatures=False
        )
        for feat in sorted(selected, key=start_of):
            seq_id = feat.id
            if len(seq_id) == 0:
                seq_id = get_id(feat)

            description = '' if nodesc else describe(feat)
            yield [
                SeqRecord(
                    feat.extract(rec).seq,
                    id=seq_id.replace(' ', '-'),
                    description=description,
                )
            ]
def shinefind(fasta, gff3, gff3_output=None, table_output=None,
              lookahead_min=5, lookahead_max=15, top_only=False, add=False):
    """Report Shine-Dalgarno hits upstream of the CDS of every gene.

    Genes that already carry an RBS annotation (an ``RBS``,
    ``Shine_Dalgarno_sequence`` or ``regulatory`` sub-feature with
    ``regulatory_class`` ``ribosome_binding_site``) and genes without a CDS
    are skipped.

    Three outputs are produced:

    * a tab-separated table written to ``table_output`` (one row per hit, or
      a single row with ``None``/``-1`` for a CDS without hits),
    * each input record, written as GFF3 to ``sys.stdout``,
    * a GFF3 record holding only the SD features, written to ``gff3_output``.

    Args:
        fasta: Input passed to ``SeqIO.parse(..., "fasta")``.
        gff3: Input passed to ``GFF.parse``.
        gff3_output: Handle passed to ``GFF.write`` for the SD-only record.
        table_output: Handle the report table is written to.
        lookahead_min: Passed to ``testFeatureUpstream`` as ``sd_min`` and
            added to each hit's spacing in the table.
        lookahead_max: Passed to ``testFeatureUpstream`` as ``sd_max``.
        top_only: When true, only the first hit per gene is reported.
        add: When true, reported SD features are appended to the gene and
            the gene's boundaries are widened to cover all sub-features.
    """

    def write_row(columns):
        table_output.write('\t'.join(map(str, columns)) + "\n")

    write_row([
        'ID', 'Name', 'Terminus', 'Terminus', 'Strand', 'Upstream Sequence',
        'SD', 'Spacing'
    ])

    sd_finder = NaiveSDCaller()
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Shinefind's "gff3_output".
        gff3_output_record = SeqRecord(record.seq, record.id)

        # Top-level features that do not contain a CDS anywhere below them.
        # NOTE(review): this list is only ever appended to within this
        # function; it is kept because feature_lambda may have side effects
        # on the features it walks - confirm before removing.
        ignored_features = []
        for x in record.features:
            contains_cds = list(
                feature_lambda([x], feature_test_type, {'type': 'CDS'},
                               subfeatures=True))
            if not contains_cds:
                ignored_features.append(x)

        # Loop over all gene features
        for gene in feature_lambda(record.features, feature_test_type,
                                   {'type': 'gene'}, subfeatures=True):
            cdss = list(
                feature_lambda(gene.sub_features, feature_test_type,
                               {'type': 'CDS'}, subfeatures=True))
            if not cdss:
                # Nothing to search upstream of.
                continue
            # We don't expect >1 CDS per gene; only the first is used.
            feature = cdss[0]

            # Three different ways RBSs can be stored that we expect.
            rbs_rbs = list(
                feature_lambda(gene.sub_features, feature_test_type,
                               {'type': 'RBS'}, subfeatures=False))
            rbs_sds = list(
                feature_lambda(gene.sub_features, feature_test_type,
                               {'type': 'Shine_Dalgarno_sequence'},
                               subfeatures=False))
            regulatory_elements = list(
                feature_lambda(gene.sub_features, feature_test_type,
                               {'type': 'regulatory'}, subfeatures=False))
            rbs_regulatory = list(
                feature_lambda(regulatory_elements, feature_test_quals,
                               {'regulatory_class': ['ribosome_binding_site']},
                               subfeatures=False))
            rbss = rbs_rbs + rbs_sds + rbs_regulatory

            # If someone has already annotated an RBS, leave the gene alone.
            if rbss:
                log.debug("Has %s RBSs", len(rbss))
                ignored_features.append(gene)
                continue

            sds, start, end, seq = sd_finder.testFeatureUpstream(
                feature, record, sd_min=lookahead_min, sd_max=lookahead_max)
            feature_id = get_id(feature)
            sd_features = sd_finder.to_features(
                sds, feature.location.strand, start, end,
                feature_id=feature.id)
            human_strand = '+' if feature.location.strand == 1 else '-'

            log.debug('Found %s SDs', len(sds))

            if not sds:
                # No hit for this gene: still emit a row so every searched
                # CDS appears in the table.
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    seq,
                    None,
                    -1,
                ])
                continue

            for (sd, sd_feature) in zip(sds, sd_features):
                write_row([
                    feature.id,
                    feature_id,
                    feature.location.start,
                    feature.location.end,
                    human_strand,
                    sd_finder.highlight_sd(seq, sd['start'], sd['end']),
                    sd['hit'],
                    int(sd['spacing']) + lookahead_min,
                ])

                if add:
                    # Append the RBS to the gene feature
                    gene.sub_features.append(sd_feature)
                    # Pick out start/end locations for all sub_features
                    locations = [x.location.start for x in gene.sub_features] + \
                        [x.location.end for x in gene.sub_features]
                    # Update gene's start/end to be inclusive
                    gene.location._start = min(locations)
                    gene.location._end = max(locations)

                # Also register the feature with the separate GFF3 output
                sd_feature = fix_gene_boundaries(sd_feature)
                gff3_output_record.features.append(sd_feature)

                if top_only:
                    # Only the first (top) hit is wanted.
                    break

        record.annotations = {}
        GFF.write([record], sys.stdout)

        gff3_output_record.features = sorted(gff3_output_record.features,
                                             key=lambda x: x.location.start)
        gff3_output_record.annotations = {}
        GFF.write([gff3_output_record], gff3_output)
def feature_test_id(feature, **kwargs):
    """Return whether the feature's ``get_id`` value is in ``kwargs["id"]``."""
    feature_id = get_id(feature)
    wanted_ids = kwargs["id"]
    return feature_id in wanted_ids