# Shared imports for the snippets below. feature_lambda and the
# feature_test_* helpers are assumed to come from the local gff3 utility
# module these tools share; individual snippets also rely on stdlib modules
# (os, subprocess, tempfile, json, collections, logging) and Bio.Seq helpers
# imported in their original files.
import sys

from Bio import SeqIO
from Bio.SeqFeature import SeqFeature
from BCBio import GFF


def find_introns(gff3, fasta):
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    for rec in GFF.parse(gff3, base_dict=seq_dict):
        genes = list(
            feature_lambda(rec.features, feature_test_type, {"type": "gene"}, subfeatures=True)
        )
        for gene in genes:
            cdss = sorted(
                list(
                    feature_lambda(
                        gene.sub_features,
                        feature_test_type,
                        {"type": "CDS"},
                        subfeatures=False,
                    )
                ),
                key=lambda x: x.location.start,
            )
            if len(cdss) > 1:
                intron = ""
                # Find pairs of CDSs with introns in between
                for i in range(len(cdss) - 1):
                    intron_start = cdss[i].location.end
                    intron_end = cdss[i + 1].location.start
                    # Cast to str: slicing the record yields a SeqRecord, and
                    # writing a Seq object to stdout would raise a TypeError.
                    intron += str(rec[intron_start:intron_end].seq)
                sys.stdout.write(">" + rec.id + "\n")
                sys.stdout.write(intron + "\n")
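

# A minimal driver sketch for find_introns (not part of the original
# snippet); the file names are placeholders.
def _demo_find_introns():
    with open("annotations.gff3") as gff3, open("genome.fa") as fasta:
        find_introns(gff3, fasta)  # intron FASTA records go to stdout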


def require_shinefind(gff3, fasta):
    # Despite its name, this variant keeps only genes with at least one CDS
    # beginning with a canonical start codon (ATG/GTG/TTG).
    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Reopen
        genes = list(
            feature_lambda(record.features, feature_test_type, {'type': 'gene'}, subfeatures=True)
        )
        good_genes = []
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features, feature_test_type, {'type': 'CDS'}, subfeatures=False)
            )
            if len(cdss) == 0:
                continue

            one_good_cds = False
            for cds in cdss:
                if str(cds.extract(record).seq[0:3]).upper() in ('GTG', 'ATG', 'TTG'):
                    one_good_cds = True

            if one_good_cds:
                good_genes.append(gene)

        record.features = good_genes
        record.annotations = {}
        yield record


def fix_ncbi(gff3):
    for rec in GFF.parse(gff3):
        for feature in feature_lambda(rec.features, feature_test_type, {'type': 'gene'}, subfeatures=True):
            CDSs = list(
                feature_lambda(feature.sub_features, feature_test_type, {'type': 'CDS'}, subfeatures=False)
            )
            if len(CDSs) == 1:
                feature.qualifiers.update(safe_qualifiers(CDSs[0].qualifiers))
        GFF.write([rec], sys.stdout)


def main(fasta, gff3):
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))

    codon_usage = {}
    for rec in GFF.parse(gff3, base_dict=seq_dict):
        for feat in feature_lambda(rec.features, feature_test_type, {'type': 'CDS'}, subfeatures=True):
            # The last three bases of each CDS are its stop codon
            seq = str(feat.extract(rec).seq)[-3:]
            try:
                codon_usage[seq] += 1
            except KeyError:
                codon_usage[seq] = 1

    names = {
        'TAG': 'Amber',
        'TAA': 'Ochre',
        'TGA': 'Opal',
    }
    # TODO: print all actg combinations? Or just ones that are there
    print('# Name\tCodon\tCount')
    for key in sorted(codon_usage):
        print('\t'.join((names.get(key.upper(), 'None'), key, str(codon_usage[key]))))


def _orfCalls(self):
    fnmga = self.base_name + ".mga"
    if not os.path.exists(fnmga):
        log.warn("%s does not exist, calling genes in %s", fnmga, self.rec_file.name)
        # Run MGA
        subprocess.check_call(
            ["mga_linux_x64", "-s", self.rec_file.name], stdout=open(fnmga, "w")
        )

    # Convert to gff3
    fn = self.base_name + ".mga.gff3"
    self.mga_gff3 = fn
    with open(fnmga, "r") as handle, open(fn, "w") as output:
        self.rec_file.seek(0)
        for result in mga_to_gff3(handle, self.rec_file):
            # Store GFF3 data in self in order to access later.
            self.mga_rec = result
            GFF.write([result], output)

    # Process a feature id -> feature table in mem.
    self.featureDict = {}
    for f in feature_lambda(self.mga_rec.features, lambda x: True, {}, subfeatures=True):
        self.featureDict[f.qualifiers["ID"][0]] = f

    # Extract
    fnfa = self.base_name + ".mga.fa"
    self.fnfa = fnfa
    subprocess.check_call(
        [
            "python2",
            os.path.join(SCRIPT_DIR, os.pardir, "gff3", "gff3_extract_sequence.py"),
            "--feature_filter",
            "CDS",
            self.rec_file.name,
            fn,
        ],
        stdout=open(fnfa, "w"),
    )

    # Translate
    fnpfa = self.base_name + ".mga.pfa"
    self.fnpfa = fnpfa
    subprocess.check_call(
        [
            "python2",
            os.path.join(SCRIPT_DIR, os.pardir, "fasta", "fasta_translate.py"),
            "--table", "11",
            "--strip_stops",
            "--target", "protein",
            fnfa,
        ],
        stdout=open(fnpfa, "w"),
    )
    return fnpfa


def fixed_feature(rec):
    for idx, feature in enumerate(
        feature_lambda(rec.features, feature_test_type, {"type": "tRNA"}, subfeatures=True)
    ):
        fid = "tRNA-%03d" % (1 + idx)
        name = ["tRNA-" + feature.qualifiers["Codon"][0]]
        gene = SeqFeature(
            location=feature.location,
            type="gene",
            qualifiers={
                "ID": [fid + ".gene"],
                "source": ["aragorn"],
                "Name": name,
            },
        )
        feature.qualifiers["Name"] = name
        # Below the tRNA we have an exon
        exon = SeqFeature(
            location=feature.location,
            type="exon",
            qualifiers={
                "source": ["aragorn"],
                "ID": ["%s.exon" % fid],
                "Name": name,
            },
        )
        feature.qualifiers["ID"] = [fid]
        # gene -> tRNA -> exon
        feature.sub_features = [exon]
        gene.sub_features = [feature]
        yield gene


def fixed_feature(rec):
    import random  # only used to generate unique feature IDs

    for feature in feature_lambda(
        rec.features, feature_test_type, {'type': 'CDS'}, subfeatures=True
    ):
        fid = feature.qualifiers['ID'][0] + '_' + str(random.random())
        gene = SeqFeature(
            location=feature.location,
            type='gene',
            qualifiers={
                'ID': [fid],
                'source': ['cpt.fixModel'],
            },
        )
        # Below that we have an mRNA
        mRNA = SeqFeature(
            location=feature.location,
            type='mRNA',
            qualifiers={
                'source': ['cpt.fixModel'],
                'ID': ['%s.mRNA' % fid],
            },
        )
        feature.qualifiers['ID'] = [fid + '.CDS']
        # gene -> mRNA -> CDS
        mRNA.sub_features = [feature]
        gene.sub_features = [mRNA]
        yield gene


def gff_reopen(gff3, index=1, fasta=None, fasta_output=None):
    # Convert to zero-based
    index -= 1
    it = None
    if fasta:
        seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
        it = GFF.parse(gff3, base_dict=seq_dict)
    else:
        it = GFF.parse(gff3)

    for rec in it:
        # Reopen
        if len(list(
            feature_lambda(rec.features, feature_test_contains, {'index': index}, subfeatures=False)
        )) > 0:
            log.warn(
                "WARNING: Index chosen is in the middle of a feature. This feature will disappear from the output"
            )

        # TODO: This call removes metadata!
        rec = rec[index:] + rec[0:index]
        rec.features = sorted(rec.features, key=lambda x: x.location.start)

        if fasta:
            if len(rec.seq) == rec.seq.count("?"):
                log.error(
                    "ERROR: You have provided a fasta file but the sequence ID in the fasta file DID NOT MATCH THE GFF. THIS IS BAD."
                )
        yield rec
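

# Sketch (not from the original tool): reopen a genome at base 100 and
# write the shifted records back out; file names are placeholders.
def _demo_gff_reopen():
    with open("annotations.gff3") as gff3, open("genome.fa") as fasta:
        for rec in gff_reopen(gff3, index=100, fasta=fasta):
            GFF.write([rec], sys.stdout)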


def parse_gff(gff3):
    """
    Extracts strand and start location to be used in cluster filtering
    """
    log.debug("parse_gff3")
    gff_info = {}
    _rec = None
    for rec in GFF.parse(gff3):
        _rec = rec
        _rec.annotations = {}
        for feat in feature_lambda(rec.features, test_true, {}, subfeatures=False):
            if feat.type == 'CDS':
                gff_info[feat.id] = {
                    'strand': feat.strand,
                    'start': feat.location.start,
                    'loc': feat.location,
                    'feat': feat,
                }
    gff_info = OrderedDict(sorted(gff_info.items(), key=lambda k: k[1]['start']))
    for i, feat_id in enumerate(gff_info):
        gff_info[feat_id].update({'index': i})
    return dict(gff_info), _rec


def merge_interpro(gff3, interpro):
    ipr_additions = {}
    # blacklist = ('Name', 'ID', 'Target', 'date', 'status', 'signature_desc', 'source', 'md5', 'score')
    whitelist = ('Dbxref', 'Ontology_term')

    for rec in GFF.parse(interpro):
        ipr_additions[rec.id] = {}
        for feature in rec.features:
            quals = feature.qualifiers
            for key in quals:
                if key not in ipr_additions[rec.id]:
                    ipr_additions[rec.id][key] = set()
                for value in quals[key]:
                    ipr_additions[rec.id][key].add(value)

        # Cast as a list so we aren't iterating over actual keyset. Otherwise,
        # we'll throw an error for modifying keyset during iteration, which we
        # don't really care about here.
        for key in list(ipr_additions[rec.id]):
            if key not in whitelist:
                del ipr_additions[rec.id][key]

    for rec in GFF.parse(gff3):
        for feature in feature_lambda(rec.features, feature_test_true, None, subfeatures=True):
            if feature.id in ipr_additions:
                for key in ipr_additions[feature.id]:
                    if key not in feature.qualifiers:
                        feature.qualifiers[key] = []
                    feature.qualifiers[key] += list(ipr_additions[feature.id][key])
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
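

# Example invocation (hypothetical paths): fold whitelisted InterPro
# qualifiers into the matching GFF3 features, writing GFF3 to stdout.
def _demo_merge_interpro():
    with open("annotations.gff3") as gff3, open("interpro.gff3") as interpro:
        merge_interpro(gff3, interpro)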


def get_cdss(self):
    return list(
        feature_lambda(
            self.feature.sub_features,
            feature_test_type,
            {"type": "CDS"},
            subfeatures=False,
        )
    )


def gff_filter(gff3):
    for rec in GFF.parse(gff3):
        for feature in feature_lambda(rec.features, test_true, {}, subfeatures=True):
            if feature.type == 'exon' and len(feature) < 20:
                feature.type = 'Shine_Dalgarno_sequence'
        rec.annotations = {}
        GFF.write([rec], sys.stdout)


def gff_filter(gff3):
    cs = ColorScheme()
    for rec in GFF.parse(gff3):
        rec.features = feature_lambda(rec.features, apply_color, {"cs": cs}, subfeatures=False)
        rec.annotations = {}
        GFF.write([rec], sys.stdout)


def require_shinefind(gff3, fasta):
    sd_finder = NaiveSDCaller()

    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Reopen
        genes = list(
            feature_lambda(record.features, feature_test_type, {"type": "gene"}, subfeatures=True)
        )
        good_genes = []
        for gene in genes:
            cdss = sorted(
                list(
                    feature_lambda(
                        gene.sub_features,
                        feature_test_type,
                        {"type": "CDS"},
                        subfeatures=False,
                    )
                ),
                key=lambda x: x.location.start,
            )
            if len(cdss) == 0:
                continue

            cds = cdss[0]
            sds, start, end, seq = sd_finder.testFeatureUpstream(cds, record, sd_min=5, sd_max=15)
            if len(sds) >= 1:
                sd_features = sd_finder.to_features(sds, gene.location.strand, start, end, feature_id=gene.id)
                gene.sub_features.append(sd_features[0])
                good_genes.append(gene)

        record.features = good_genes
        yield record
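

# Driver sketch for require_shinefind (placeholder file names): the
# generator yields mutated records, so the caller writes them out.
def _demo_require_shinefind():
    with open("annotations.gff3") as gff3, open("genome.fa") as fasta:
        for record in require_shinefind(gff3, fasta):
            record.annotations = {}
            GFF.write([record], sys.stdout)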


def getGff3Locations(parent, map_by="ID"):
    featureLocations = {}
    recs = GFF.parse(parent)
    # Only parse first.
    rec = next(recs)
    # Get all the feature locations in this genome
    for feature in feature_lambda(rec.features, feature_test_true, {}):
        id = feature.qualifiers.get(map_by, [feature.id])[0]
        featureLocations[id] = feature.location
    return rec, featureLocations


def require_shinefind(gff3, fasta):
    sd_finder = NaiveSDCaller()

    # Load up sequence(s) for GFF3 data
    seq_dict = SeqIO.to_dict(SeqIO.parse(fasta, "fasta"))
    # Parse GFF3 records
    for record in GFF.parse(gff3, base_dict=seq_dict):
        # Reopen
        genes = list(
            feature_lambda(record.features, feature_test_type, {'type': 'gene'}, subfeatures=True)
        )
        good_genes = []
        for gene in genes:
            cdss = list(
                feature_lambda(gene.sub_features, feature_test_type, {'type': 'CDS'}, subfeatures=False)
            )
            if len(cdss) == 0:
                continue

            # Someday this will bite me in the arse.
            cds = cdss[0]
            sds, start, end, seq = sd_finder.testFeatureUpstream(cds, record, sd_min=5, sd_max=15)
            if len(sds) >= 1:
                # TODO
                # Double plus yuck
                sd_features = sd_finder.to_features(sds, gene.location.strand, start, end, feature_id=gene.id)
                gene.sub_features.append(sd_features[0])
                good_genes.append(gene)

        # Yuck!
        record.features = good_genes
        yield record


def fixed_feature(rec):
    # Get all gene features to remove the mRNAs from
    for feature in feature_lambda(rec.features, feature_test_type, {"type": "gene"}, subfeatures=True):
        gene = feature
        sub_features = []
        # Filter out mRNA subfeatures, save other ones to new gene object.
        for sf in feature_lambda(
            feature.sub_features,
            feature_test_type,
            {"type": "mRNA"},
            subfeatures=True,
            invert=True,
        ):
            sf.qualifiers["Parent"] = gene.qualifiers["ID"]
            sub_features.append(sf)
        # Override original subfeatures with our filtered list
        gene.sub_features = sub_features
        yield gene


def gff3_diff(gff3_1, gff3_2):
    feats1 = {}
    feats2 = {}
    # Index gene features by their anchored end: start for + strand genes,
    # end for - strand genes.
    for rec1 in GFF.parse(gff3_1):
        for feat in feature_lambda(rec1.features, feature_test_type, {"type": "gene"}, subfeatures=True):
            if feat.location.strand == 1:
                feats1[feat.location.start] = feat
            else:
                feats1[feat.location.end] = feat
    for rec2 in GFF.parse(gff3_2):
        for feat in feature_lambda(rec2.features, feature_test_type, {"type": "gene"}, subfeatures=True):
            if feat.location.strand == 1:
                feats2[feat.location.start] = feat
            else:
                feats2[feat.location.end] = feat

    no_match = []
    flags_list = {}
    for i in feats1:
        try:
            diffs = find_differences(feats1[i], feats2[i])
            # need to somehow check for subfeatures
            del feats2[i]
            for d in diffs:
                if diffs[d]:
                    # NOTE: the original assigned an undefined name `flags`
                    # here; recording the diff dict is an assumption about
                    # the intent.
                    flags_list[i] = diffs
                    break
        except KeyError:
            no_match.append(feats1[i])

    print(flags_list)
    for nm in no_match:
        print(nm)
    for f in feats2:
        print(feats2[f])


def handle_non_gene_features(features):
    # These are NON-GENE features (maybe terminators? etc?)
    for feature in feature_lambda(
        features,
        feature_test_type,
        {"type": "gene"},
        subfeatures=False,
        invert=True,
        recurse=False,
    ):
        if feature.type in ("terminator", "tRNA"):
            yield feature


def fminmax(feature):
    fmin = None
    fmax = None
    for sf in feature_lambda([feature], feature_test_true, {}, subfeatures=True):
        if fmin is None:
            fmin = sf.location.start
            fmax = sf.location.end
        if sf.location.start < fmin:
            fmin = sf.location.start
        if sf.location.end > fmax:
            fmax = sf.location.end
    return fmin, fmax
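

# Illustrative check of fminmax with a hand-built feature tree (not part of
# the original code; assumes feature_lambda recurses via sub_features).
def _demo_fminmax():
    from Bio.SeqFeature import FeatureLocation, SeqFeature

    gene = SeqFeature(FeatureLocation(5, 80), type="gene")
    cds = SeqFeature(FeatureLocation(10, 95), type="CDS")
    gene.sub_features = [cds]
    # The CDS runs past the gene's own end, so the span covers both.
    assert fminmax(gene) == (5, 95)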


def validate(gff3):
    results = {}
    for rec in GFF.parse(gff3):
        for feature in feature_lambda(rec.features, feature_test_type, {"type": "gene"}, subfeatures=True):
            checks = []
            graded = []

            # dbxrefs
            if "CPT:283675" in feature.qualifiers.get("Dbxref", []):
                checks.append(True)
                graded.append({})
            else:
                checks.append(False)
                graded.append({"q1": "0"})  # ???

            # Notes
            if "Howdy!" in feature.qualifiers.get("Note", []):
                checks.append(True)
                graded.append({})
            else:
                checks.append(False)
                graded.append({"q2": "0"})  # ???

            owner = feature.qualifiers.get("owner", ["unknown"])[0]
            results[owner] = {
                "checks": checks,
                "graded": graded,
                "score": checks.count(True),
            }

    # Process all students at once
    token = auth(open("/galaxy/creds.json", "r"), GUANINE_URL)
    for email, result in results.items():
        sid = student_id(email, GUANINE_URL, token)
        result = post_result(
            sid,
            result["score"],
            2,
            token,
            GUANINE_URL,
            "a59a5001-57e7-4776-8807-63b544735f3f",
            json.dumps({"raw": result, "graded": result["graded"]}),
        )
        if result.status_code in (200, 201):
            print("Success")
        else:
            print("[Error] user=%s msg=%s" % (email, result.text))


def suppress(genome, annotations, suppress=None):
    if suppress is None:
        raise Exception("Must provide a list of stop codons to suppress")

    seq_dict = SeqIO.to_dict(SeqIO.parse(genome, "fasta"))
    suppressed_features = []
    for record in GFF.parse(annotations, base_dict=seq_dict):
        for feature in feature_lambda(
            record.features,
            feature_test,
            {"type": "CDS", "record": record, "stops": suppress},
            subfeatures=True,
        ):
            log.info("Found matching feature %s", feature.id)
            new_end = None
            codon_idx = 0
            # Walk codon by codon past the current stop until we reach a
            # stop codon that is not being suppressed.
            while new_end is None:
                if feature.strand > 0:
                    cs = feature.location.end + (3 * codon_idx)
                    codon = str(record.seq[cs:cs + 3])
                else:
                    cs = feature.location.start - (3 * (1 + codon_idx))
                    # Cast to str so both strands compare consistently
                    codon = str(reverse_complement(record.seq[cs:cs + 3]))

                if codon not in suppress and translate(codon, 11) == "*":
                    new_end = codon_idx
                    break

                codon_idx += 1
                if codon_idx > 40:
                    log.warn("Could not find a new stop codon")
                    break

            if new_end is not None:
                if feature.strand > 0:
                    feature.location._end += codon_idx * 3
                else:
                    feature.location._start -= codon_idx * 3

                suppressed_features.append(feature)

        record.features = suppressed_features
        record.annotations = {}
        GFF.write([record], sys.stdout)
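

# Sketch of calling suppress to extend CDSs through amber (TAG) stops;
# file names are placeholders, and the GFF3 output goes to stdout.
def _demo_suppress():
    with open("genome.fa") as genome, open("annotations.gff3") as annotations:
        suppress(genome, annotations, suppress=["TAG"])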


def gff_filter(gff3, id_list=None, id="", attribute_field="ID", subfeatures=True):
    attribute_field = attribute_field.split("__cn__")
    if id_list:
        filter_strings = [line.strip() for line in id_list]
    else:
        filter_strings = [x.strip() for x in id.split("__cn__")]
    for rec in GFF.parse(gff3):
        rec.features = feature_lambda(
            rec.features,
            feature_test_qual_value,
            {"qualifier": attribute_field, "attribute_list": filter_strings},
            subfeatures=subfeatures,
        )
        rec.annotations = {}
        GFF.write([rec], sys.stdout)
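

# Hypothetical invocation of gff_filter keeping two features by ID;
# "__cn__" appears to be Galaxy's escaped-newline separator.
def _demo_gff_filter():
    with open("annotations.gff3") as gff3:
        gff_filter(gff3, id="gene001__cn__gene002", attribute_field="ID")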


def generate_annotation_file(gff3):
    # TODO: cleanup
    # mode='w' so text writes work on Python 3 (the default mode is binary)
    t = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.coords')
    for rec in GFF.parse(gff3):
        features = feature_lambda(rec.features, feature_test_type, {'type': 'CDS'}, subfeatures=False)
        for feature in sorted(features, key=lambda x: x.location.start):
            t.write('\t'.join(map(str, [
                feature.id,
                feature.location.start + 1,  # convert to one-based coords
                feature.location.end,
                rec.id,
            ])) + '\n')
    name = t.name
    t.close()
    return name


def genes_all(feature_list, feature_type=["gene"], sort=False):
    """
    Simple filter to extract gene features from the feature set.
    """
    if not sort:
        for x in feature_lambda(feature_list, feature_test_type, {"types": feature_type}, subfeatures=True):
            yield x
    else:
        data = list(genes_all(feature_list, feature_type, sort=False))
        data = sorted(data, key=lambda feature: feature.location.start)
        for x in data:
            yield x
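

# Sketch of iterating genes_all over a parsed record in coordinate order
# (placeholder file name; prints the ID and span of each gene).
def _demo_genes_all():
    for rec in GFF.parse(open("annotations.gff3")):
        for gene in genes_all(rec.features, sort=True):
            print(gene.id, gene.location.start, gene.location.end)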