def main():
    """Select desired genes from a gff based on gene IDs

    If a given gene has even a single transcript in the to-select list,
    then all isoforms will be collected

    Usage: select-gene-records.py list-of-transcript-ids.txt gene-records.gff
    Output: gene-records-selected.gff

    The output name is the input path with "-selected" inserted before its
    extension, so the output can never be the same file as the input.
    Blank lines in the ID list are ignored. Comment lines (starting with
    "#") in the gff are not copied to the output.
    """
    import os

    idlist = sys.argv[1]
    gff = sys.argv[2]

    # Insert the suffix before the extension instead of regex-replacing
    # ".gff": the unescaped, unanchored pattern rewrote any matching
    # substring of the path and, when nothing matched, left the output name
    # equal to the input, which was then truncated on open.
    root, ext = os.path.splitext(gff)
    outnam = root + "-selected" + ext

    ids = set()
    with open(idlist, "r") as file_object:
        for line in file_object:
            base_id = line.strip("\n")
            if not base_id:
                # an empty ID would otherwise end up in the selection set
                continue
            ids.add(base_id)
            # also accept the derived names <id>-mRNA-1 .. <id>-mRNA-98
            for i in range(1, 99):
                ids.add(base_id + "-mRNA-" + str(i))

    # Both handles are context-managed so the output is flushed and closed
    # even if parsing a record raises.
    with open(gff, "r") as file_object, open(outnam, "w") as outfile:
        for line in file_object:
            if line[0] == "#":
                continue
            feature = gfftools.GFF(line)
            # keep the record if it, or any of its parents, was requested;
            # each line is written at most once
            if feature.id in ids or any(papa in ids
                                        for papa in feature.parent):
                outfile.write(line)
Exemple #2
0
def main():
    """Decorate the records of a GFF file and print them to stdout.

    Comment lines are echoed unchanged. For every other line a GFF record is
    built and a locus tag is looked up. mRNA records receive a protein
    description (best BLAST hit, else best Pfam hit, else
    'hypothetical protein'); mRNA and gene records receive their locus tag
    when one was found. Every record is then expanded and printed.
    """
    # input files
    gene_loci = 'version3-gene-locus-tags.tsv'
    mrna_loci = 'version3-transcript-locus-tags.tsv'
    gff = '/projects/bullfrog_assembly/genome/ARCS/annotation/fresh-Oct2018/maker-second-round/analysis/third-round/gag_output/genome.gff'
    blastp = '/projects/bullfrog_assembly/genome/ARCS/annotation/fresh-Oct2018/submission_prep/blastp-annotated2.tsv'
    pfam = 'pfam-hits.tsv'
    genevalidator = 'genevalidator-90.txt'

    # thresholds handed to the BLAST and Pfam loaders
    POS = 25.0
    QCOV = 50.0
    EVALUE = 0.00001

    mrna_tags = load_tags(mrna_loci)
    gene_tags = load_tags(gene_loci)
    blast_aln = load_blast(blastp, POS, QCOV)
    pfam_aln = load_pfam(pfam, EVALUE)
    gv_good = load_gv(genevalidator)

    with open(gff, 'r') as infile:
        for line in infile:
            # header / comment lines pass straight through
            if line.startswith('#'):
                sys.stdout.write(line)
                continue

            rec = gfftools.GFF(line)
            tag = get_tag(rec, mrna_tags, gene_tags)

            if rec.type == 'mRNA':
                # best hit from each evidence source, '' when there is none
                prot = (blast_aln[rec.id].get_best_blast()
                        if rec.id in blast_aln else '')
                dom = (pfam_aln[rec.id].get_best_pfam()
                       if rec.id in pfam_aln else '')

                if prot:
                    description = prot.sname
                    # the flag is set when the query is in the genevalidator list
                    if prot.qid in gv_good:
                        rec.add_prot_desc(description, flag=True)
                    else:
                        rec.add_prot_desc(description)
                elif dom:
                    rec.add_pfam(dom.sname)
                else:
                    rec.add_prot_desc('hypothetical protein', flag=True)

            # only mRNA and gene records carry locus tags
            if rec.type in ('mRNA', 'gene') and tag:
                rec.add_locus_tag(tag)

            # process attributes and print out (all record types)
            rec.extend_attr()
            print(rec.print_record())
Exemple #3
0
def main():
    """Extract CDS coordinates for each mRNA in a gff file,
    then print the corresponding sequence.
    The subfeatures MUST be sorted by their start coordinate

    Usage: extract-cds.py genes.gff genome.fa > genes-cds.fa
    """

    # The docstring above doubles as the usage message, so its text is
    # runtime output. Compare the argument count by value ("is not" tests
    # object identity) and use the print() call form, which works on both
    # Python 2 and Python 3.
    if len(sys.argv) != 3:
        print(main.__doc__)
        sys.exit(1)

    gff = sys.argv[1]
    fasta = sys.argv[2]

    # parent ID -> {'coords': [[start, end], ...], 'scaf': seqid,
    #               'strand': strand}
    cds_coords = {}

    with open(gff, "r") as file_object:
        for line in file_object:
            if line[0] == "#":
                # NOTE(review): gff comment lines are echoed to stdout and so
                # end up in the FASTA output -- confirm this is intended.
                sys.stdout.write(line)
                continue

            feature = gfftools.GFF(line)
            if feature.type != "CDS":
                continue

            for parent in feature.parent:
                span = [feature.start, feature.end]
                if parent not in cds_coords:
                    cds_coords[parent] = {
                        'coords': [span],
                        'scaf': feature.seqid,
                        'strand': feature.strand
                    }
                elif feature.strand == "+":
                    cds_coords[parent]['coords'].append(span)
                else:
                    # non-plus strand: prepend, so that input sorted by
                    # ascending start ends up in descending order
                    cds_coords[parent]['coords'].insert(0, span)

    # sequence ID -> sequence, for every record in the FASTA file
    sequences = {}

    with open(fasta, "r") as file_object:
        for rec in fastatools.fasta_iter(file_object):
            sequences[rec[0]] = rec[1]

    for record in cds_coords:
        entry = cds_coords[record]
        scaffold = sequences[entry['scaf']]
        whole = ""
        for start, end in entry['coords']:
            # slice [start - 1:end]; assumes gff coordinates are 1-based and
            # end-inclusive (the original marked this "watch for OBO")
            piece = scaffold[int(start) - 1:int(end)]
            if entry['strand'] == "-":
                piece = fastatools.revcomp(piece)
            whole += piece
        # Written once per parent after all pieces are joined. The original
        # used a for/else here, which always ran because the loop has no
        # break.
        sys.stdout.write(">" + record + "_CDS\n" + whole + "\n")
Exemple #4
0
import re

import gfftools

# n.b. sys.argv[0] == script name, so the gff path is the first real argument
gff = sys.argv[1]

# dicts to hold the exon and cds records, keyed by transcript ID
# (in the code shown here only edic is filled; cdic is referenced solely by
# the commented-out lines below)
edic = {}
cdic = {}

# Single pass over the gff; lines starting with "#" are skipped.
with open(gff, "r") as infile:
    for line in infile:
        if line[0] != "#":

            rec = gfftools.GFF(line)

            if rec.type == "mRNA" or rec.type == "ncRNA":
                # One entry per transcript, keyed by its ID.
                mid = rec.id
                # Both coordinates are shifted down by one.
                # NOTE(review): if a 0-based, end-exclusive interval is the
                # goal, the end would normally be left unshifted -- confirm.
                fstart = int(rec.start) - 1
                fend = int(rec.end) - 1
                # NOTE(review): the field order resembles the first nine BED
                # columns (chrom, start, end, name, score, strand, thickStart,
                # thickEnd, itemRgb) -- confirm against the code that writes
                # these entries out.
                edic[mid] = [
                    rec.seqid, fstart, fend, mid, 0, rec.strand, fstart, fend,
                    [255, 0, 0]
                ]
#                if rec.type != "ncRNA":
#                    cdic[mid]=[rec.seqid,fstart,fend,mid,0,rec.strand,fstart,fend,[0,0,255]]
            elif rec.type == "exon":
                # Remove the ":exon" substring from the exon ID to get the key
                # used in edic.
                mid = re.sub(":exon", "", rec.id)
                # Exon coordinates as offsets from the stored transcript start
                # (edic[mid][1]); raises KeyError if that transcript record
                # has not been seen yet.
                fstart = int(rec.start) - 1 - edic[mid][1]
                fend = int(rec.end) - 1 - edic[mid][1]
Exemple #5
0
# Output files are named after the input gff path and opened for writing at
# module level.
# NOTE(review): ".gff" is used as an unanchored regex whose "." matches any
# character, so every such substring of the path is replaced; if the path
# contains no match, the output name equals the input path and opening it
# with "w" would truncate the input -- confirm inputs always end in ".gff".
#gffout=open(re.sub(".gff","-prepared.gff",gff),"w")
tblout=open(re.sub(".gff","-prepared.tbl",gff),"w")
lokey=open(re.sub(".gff","-locus_tag-conversion-key.tsv",gff),"w")

###################
# MAIN

genes = {}
# read in the entries from the gff file
with open(gff,"r") as infile:
    print "Reading the annotations"
    for line in infile:
        if line[0] == "#" or line[0] == "-":
            continue

        feature = gfftools.GFF(line)

        if feature.type == "gene":
            genes[feature.id] = gfftools.Gene(feature)

        elif feature.type == "mRNA" or feature.type == "ncRNA":
            # transcripts can only have one parent, so access it explicitly
            genes[feature.parent[0]].add_transcript(feature)

        elif feature.type == "exon":
            if feature.name:
                genes[feature.name].transcript[feature.parent[0]].add_exon(feature)
                continue
            for parent in feature.parent:
                for gene in genes:
                    if re.findall(genes[gene].id,parent):