def main():
    """Select desired genes from a gff based on gene IDs

    Every ID in the list file is selected together with the derived
    transcript IDs "<ID>-mRNA-1" .. "<ID>-mRNA-98". A GFF record is kept
    when its own ID, or any of its parents, is in that set, so listing a
    gene collects all of its isoforms and their sub-features.

    Comment lines (starting with "#") are not copied to the output.

    Usage: select-gene-records.py list-of-transcript-ids.txt gene-records.gff
    Output: gene-records-selected.gff
        ("-selected" is inserted before the last ".gff" of the file name;
        if the name contains no ".gff", "-selected.gff" is appended so the
        input file is never overwritten)
    """
    import os  # local import: only needed to build the output file name

    if len(sys.argv) != 3:
        print(main.__doc__)
        sys.exit(1)
    idlist = sys.argv[1]
    gff = sys.argv[2]

    # Look for a literal ".gff" in the file name only (not in the directory
    # part) and insert the suffix before its last occurrence.
    name_start = len(gff) - len(os.path.basename(gff))
    ext_at = gff.rfind(".gff", name_start)
    if ext_at == -1:
        # No ".gff" to anchor on: writing to the unchanged name would
        # truncate the input before it is read.
        outnam = gff + "-selected.gff"
    else:
        outnam = gff[:ext_at] + "-selected" + gff[ext_at:]

    ids = set()
    with open(idlist, "r") as file_object:
        for line in file_object:
            gene_id = line.rstrip("\r\n")
            if not gene_id:
                # A blank line would otherwise put "" into the set.
                continue
            ids.add(gene_id)
            # Derived transcript names, numbered 1..98.
            ids.update(gene_id + "-mRNA-" + str(i) for i in range(1, 99))

    with open(gff, "r") as file_object, open(outnam, "w") as outfile:
        for line in file_object:
            if line[0] == "#":
                continue
            feature = gfftools.GFF(line)
            if feature.id in ids or any(
                    papa in ids for papa in feature.parent):
                outfile.write(line)
def main():
    """Add locus tags and product descriptions to a GFF and print it.

    All inputs are hard-coded paths. Lines starting with '#' are copied to
    stdout unchanged. Every other line is parsed with gfftools.GFF and
    printed via print_record() after extend_attr() has been called:

    * mRNA records get a description chosen in this order: the best blast
      hit's sname, else the best pfam hit's sname, else
      'hypothetical protein'. flag=True is passed when the blast query id
      is listed in the genevalidator file, and for the hypothetical
      fallback.
    * mRNA and gene records get a locus tag when get_tag() returns one.
    * all other record types are printed with no additions.
    """
    # --- inputs -----------------------------------------------------------
    gene_loci = 'version3-gene-locus-tags.tsv'
    mrna_loci = 'version3-transcript-locus-tags.tsv'
    gff = '/projects/bullfrog_assembly/genome/ARCS/annotation/fresh-Oct2018/maker-second-round/analysis/third-round/gag_output/genome.gff'
    blastp = '/projects/bullfrog_assembly/genome/ARCS/annotation/fresh-Oct2018/submission_prep/blastp-annotated2.tsv'
    pfam = 'pfam-hits.tsv'
    genevalidator = 'genevalidator-90.txt'

    # --- thresholds handed to the loaders ----------------------------------
    # presumably percent positives / query coverage / e-value cutoffs;
    # confirm against load_blast and load_pfam
    POS = 25.0
    QCOV = 50.0
    EVALUE = 0.00001

    mrna_tags = load_tags(mrna_loci)
    gene_tags = load_tags(gene_loci)
    blast_aln = load_blast(blastp, POS, QCOV)
    pfam_aln = load_pfam(pfam, EVALUE)
    gv_good = load_gv(genevalidator)

    with open(gff, 'r') as infile:
        for line in infile:
            # header / comment lines pass straight through
            if line[0] == '#':
                sys.stdout.write(line)
                continue

            rec = gfftools.GFF(line)
            tag = get_tag(rec, mrna_tags, gene_tags)
            kind = rec.type

            if kind == 'mRNA':
                if rec.id in blast_aln:
                    best_prot = blast_aln[rec.id].get_best_blast()
                else:
                    best_prot = ''
                if rec.id in pfam_aln:
                    best_dom = pfam_aln[rec.id].get_best_pfam()
                else:
                    best_dom = ''

                if best_prot:
                    hit_name = best_prot.sname
                    if best_prot.qid in gv_good:
                        rec.add_prot_desc(hit_name, flag=True)
                    else:
                        rec.add_prot_desc(hit_name)
                elif best_dom:
                    rec.add_pfam(best_dom.sname)
                else:
                    rec.add_prot_desc('hypothetical protein', flag=True)

            # only transcripts and genes carry locus tags
            if kind in ('mRNA', 'gene') and tag:
                rec.add_locus_tag(tag)

            # process attributes and print out
            rec.extend_attr()
            print(rec.print_record())
def main():
    """Extract CDS coordinates for each mRNA in a gff file, then print the
    corresponding sequence.

    The subfeatures MUST be sorted by their start coordinate

    For every parent of a CDS feature one FASTA record named
    "<parent>_CDS" is written to stdout. Segments on the "-" strand are
    reverse-complemented and joined from the highest start coordinate to
    the lowest.

    Usage: extract-cds.py genes.gff genome.fa > genes-cds.fa
    """
    # "!=" rather than "is not": identity of int objects is an
    # implementation detail, not a value comparison.
    if len(sys.argv) != 3:
        print(main.__doc__)
        sys.exit(1)
    gff = sys.argv[1]
    fasta = sys.argv[2]

    # parent id -> {'coords': [[start, end], ...], 'scaf': seqid,
    #               'strand': strand}, coords kept in transcript order
    cds_coords = {}
    with open(gff, "r") as file_object:
        for line in file_object:
            if line[0] == "#":
                # NOTE(review): GFF comment lines are echoed into the FASTA
                # output; kept as-is, confirm this is intended.
                sys.stdout.write(line)
                continue
            feature = gfftools.GFF(line)
            if feature.type != "CDS":
                continue
            for parent in feature.parent:
                span = [feature.start, feature.end]
                if parent not in cds_coords:
                    cds_coords[parent] = {
                        'coords': [span],
                        'scaf': feature.seqid,
                        'strand': feature.strand
                    }
                elif feature.strand == "+":
                    cds_coords[parent]['coords'].append(span)
                else:
                    # Input is sorted by start, so prepending yields the
                    # segments in descending order for the reverse strand.
                    cds_coords[parent]['coords'].insert(0, span)

    sequences = {}
    with open(fasta, "r") as file_object:
        for rec in fastatools.fasta_iter(file_object):
            sequences[rec[0]] = rec[1]

    for record in cds_coords:
        entry = cds_coords[record]
        scaffold = sequences[entry['scaf']]
        reverse = entry['strand'] == "-"
        pieces = []
        for start, end in entry['coords']:
            # start - 1 .. end: treats GFF coordinates as 1-based inclusive
            # when turning them into a Python slice.
            piece = scaffold[int(start) - 1:int(end)]
            if reverse:
                piece = fastatools.revcomp(piece)
            pieces.append(piece)
        sys.stdout.write(">" + record + "_CDS\n" + "".join(pieces) + "\n")
import re
import gfftools

# Reads a GFF file and builds one table row per transcript (mRNA / ncRNA),
# then starts computing exon coordinates relative to the transcript start.
# NOTE(review): `sys` is used below but not imported in this excerpt;
# presumably it is imported earlier in the file - confirm.

# n.b. sys.argv[0] == script name
gff = sys.argv[1]

# dicts to hold the exon and cds records
edic = {}
cdic = {}

with open(gff, "r") as infile:
    for line in infile:
        # skip comment / directive lines
        if line[0] != "#":
            rec = gfftools.GFF(line)
            if rec.type == "mRNA" or rec.type == "ncRNA":
                mid = rec.id
                # both coordinates are shifted down by one
                # NOTE(review): the row layout (seqid, start, end, name, 0,
                # strand, start, end, RGB) looks like the leading BED
                # columns - confirm; if so, check whether the end
                # coordinate should also be decremented.
                fstart = int(rec.start) - 1
                fend = int(rec.end) - 1
                edic[mid] = [
                    rec.seqid, fstart, fend, mid, 0, rec.strand, fstart, fend,
                    [255, 0, 0]
                ]
                # if rec.type != "ncRNA":
                #     cdic[mid]=[rec.seqid,fstart,fend,mid,0,rec.strand,fstart,fend,[0,0,255]]
            elif rec.type == "exon":
                # the transcript id is the exon id with ":exon" removed
                mid = re.sub(":exon", "", rec.id)
                # exon coordinates relative to the transcript start stored in
                # edic[mid][1]; raises KeyError if the transcript record has
                # not been seen before its exon
                fstart = int(rec.start) - 1 - edic[mid][1]
                fend = int(rec.end) - 1 - edic[mid][1]
#gffout=open(re.sub(".gff","-prepared.gff",gff),"w") tblout=open(re.sub(".gff","-prepared.tbl",gff),"w") lokey=open(re.sub(".gff","-locus_tag-conversion-key.tsv",gff),"w") ################### # MAIN genes = {} # read in the entries from the gff file with open(gff,"r") as infile: print "Reading the annotations" for line in infile: if line[0] == "#" or line[0] == "-": continue feature = gfftools.GFF(line) if feature.type == "gene": genes[feature.id] = gfftools.Gene(feature) elif feature.type == "mRNA" or feature.type == "ncRNA": # transcripts can only have one parent, so access it explicitly genes[feature.parent[0]].add_transcript(feature) elif feature.type == "exon": if feature.name: genes[feature.name].transcript[feature.parent[0]].add_exon(feature) continue for parent in feature.parent: for gene in genes: if re.findall(genes[gene].id,parent):