def main():
    """Gather the exons of each mRNA in a GFF file and write them out.

    The file named by ``-i/--input_gff`` is scanned row by row.  An ``mRNA``
    row starts a new transcript (recording its ID, molecule and strand) and
    every following ``exon`` row contributes one start/end fragment.  Each
    completed group is passed to ``write_transcript`` together with the
    output handle, which is ``-o/--output_file`` when given and STDOUT
    otherwise.
    """
    # The text is passed as ``description=`` because the first positional
    # parameter of ArgumentParser is ``prog``, which would otherwise put the
    # whole sentence into the usage line.
    # NOTE(review): the wording looks copied from a sibling script -- confirm.
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    parser.add_argument('-i',
                        '--input_gff',
                        type=str,
                        required=True,
                        help='GFF file of source annotation')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    current_mRNA_id = None
    current_mol_id = None
    current_fragments = list()
    current_direction = None

    try:
        with open(args.input_gff) as gff_fh:
            for line in gff_fh:
                cols = line.rstrip().split("\t")

                # anything that is not a 9-column feature row is ignored
                if len(cols) != 9:
                    continue

                feat_type = cols[2]

                if feat_type == 'mRNA':
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')

                    if current_mRNA_id is not None and feat_id != current_mRNA_id:
                        # purge the existing one first
                        write_transcript(fout, current_mol_id,
                                         current_fragments, current_direction)
                        current_fragments = list()

                    current_mRNA_id = feat_id
                    current_mol_id = cols[0]
                    current_direction = cols[6]

                elif feat_type == 'exon':
                    # for any strand other than '+' the two coordinates are
                    # stored swapped
                    if cols[6] == '+':
                        current_fragments.append({'start': cols[3], 'end': cols[4]})
                    else:
                        current_fragments.append({'start': cols[4], 'end': cols[3]})

        # the last transcript has no following mRNA row to trigger its output
        write_transcript(fout, current_mol_id, current_fragments,
                         current_direction)
    finally:
        # only close a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Write only the genes that own at least one mRNA from an ID list.

    ``-l/--id_list`` names a file with one mRNA ID per line (lines of two
    characters or fewer are ignored).  The GFF3 file given by
    ``-i/--input_gff3`` is read row by row: each ``gene`` row starts a new
    group, an ``mRNA`` row switches collection on or off depending on whether
    its ID is listed, and every other row follows the state set by the most
    recent mRNA.  A gene is written only when at least one row was collected
    beneath it.  Output goes to ``-o/--output_file`` or STDOUT and always
    starts with a ``##gff-version 3`` header.
    """
    # ``description=`` is required here: the first positional parameter of
    # ArgumentParser is ``prog``, not the description.
    parser = argparse.ArgumentParser(description='Filter the genes of a GFF3 file by mRNA child IDs')

    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of source molecules' )
    parser.add_argument('-l', '--id_list', type=str, required=True, help='List file of mRNA IDs to keep' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        # a set keeps the per-mRNA membership test constant-time
        ids_to_keep = set()

        with open(args.id_list) as id_fh:
            for entry in id_fh:
                entry = entry.rstrip()
                if len(entry) > 2:
                    ids_to_keep.add(entry)

        fout.write("##gff-version 3\n")

        current_gene_lines = list()
        keep = False

        with open(args.input_gff3) as gff_fh:
            for line in gff_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                feat_type = cols[2]

                if feat_type == 'gene':
                    # purge the current gene, if it has any kept children
                    if len(current_gene_lines) > 1:
                        for li in current_gene_lines:
                            fout.write("{0}\n".format(li))

                    # reset; the keep flag must not leak in from the last
                    # mRNA of the previous gene
                    current_gene_lines = [line]
                    keep = False
                    continue

                if feat_type == 'mRNA':
                    keep = biocodegff.column_9_value(cols[8], 'ID') in ids_to_keep

                if keep:
                    current_gene_lines.append(line)

        # the final gene has no following gene row to trigger its output
        if len(current_gene_lines) > 1:
            for li in current_gene_lines:
                fout.write("{0}\n".format(li))
    finally:
        # only close a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Re-point exon Parent attributes at the most recently seen RNA feature.

    Copies the GFF3 file given by ``-i/--input`` to ``-o/--output``.  Comment
    lines and rows without exactly nine columns are copied through.  The ID
    of every feature whose type ends in ``RNA`` is remembered, and any
    following ``exon`` whose Parent differs from it has its Parent attribute
    replaced (an INFO message is printed for each correction).
    """
    parser = argparse.ArgumentParser(
        description=
        'Updates exon Parent attributes to point at the correct RNA feature')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    # both handles are closed (and the output flushed) when the block exits
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id

            # An exon seen before any RNA has nothing to be corrected against;
            # leave it alone rather than writing a Parent of None.
            elif (cols[2] == 'exon' and last_rna_id is not None
                  and parent != last_rna_id):
                print(
                    "INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})"
                    .format(feat_id, last_rna_id, cols[2]))
                cols[8] = biocodegff.set_column_9_value(
                    cols[8], 'Parent', last_rna_id)
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def _clip_CDSs_to_polypeptide(mRNA, polypeptide):
    """Delete or clip each CDS of *mRNA* according to how it compares with *polypeptide*."""
    # Iterate over a snapshot: delete_CDS() may modify the collection that
    # CDSs() hands back, which would make the loop skip entries.
    for CDS in list(mRNA.CDSs()):
        # NOTE(review): the ordering operators are implemented by the
        # biothings classes; presumably < and > mean "entirely outside" and
        # <= / >= mean "overhanging one end" -- confirm against that module.
        if CDS < polypeptide:
            mRNA.delete_CDS(CDS)
        elif CDS <= polypeptide:
            CDS.location().fmin = polypeptide.location().fmin

        if CDS > polypeptide:
            mRNA.delete_CDS(CDS)
        elif CDS >= polypeptide:
            CDS.location().fmax = polypeptide.location().fmax


def main():
    """Correct the CDS features of a GFF3 file using polypeptide ranges.

    Gene models are loaded from ``canonical.flawed.gff3`` and polypeptide
    rows (keyed by their Parent attribute) from the ILRI GFF file.  For every
    mRNA that is the parent of a polypeptide, its CDSs are deleted or clipped
    against that polypeptide; every gene is then printed in GFF3 format to
    ``canonical.corrected.gff3``.  All file names are hard-coded.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features( flawed_gff_file )

        print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                polypeptide = biothings.Polypeptide( id=feat_id, parent=parent )
                # GFF start is converted to a 0-based fmin
                polypeptide.locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])
                polypeptides[parent] = polypeptide

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)) )

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        # Without a polypeptide there is nothing to clip
                        # against; never reuse one from a previous mRNA.
                        continue

                    _clip_CDSs_to_polypeptide(mRNA, polypeptides[mRNA.id])

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Replace the ID and Parent attributes of a GFF3 file with new IDs.

    Each distinct ID found in column 9 is given a replacement obtained from
    ``get_new_id(prefix, type, mode)`` and remembered, so that later rows
    carrying the same ID, or naming it as Parent, receive the same value.
    Rows without exactly nine columns are copied through unchanged.  Output
    goes to ``-o/--output_file`` when given, otherwise to STDOUT.

    Raises:
        Exception: if a Parent value is met before any row used it as an ID.
    """
    parser = argparse.ArgumentParser( description='Generates new identifiers in GFF3 files following the IGS identifier convention.')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-m', '--mode', type=str, required=False, default='sequential', help='ID modes (see embedded documentation): sequential, uuid, hex8, hex12')

    args = parser.parse_args()
    check_arguments(args)

    # old ID -> newly generated ID
    id_map = dict()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.input_file) as in_fh:
            for line in in_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # grab the ID and Parent attributes, if any
                old_id = biocodegff.column_9_value(cols[8], 'ID')
                old_parent = biocodegff.column_9_value(cols[8], 'Parent')

                if old_id is not None:
                    if old_id not in id_map:
                        id_map[old_id] = get_new_id(args.prefix, cols[2], args.mode)

                    cols[8] = cols[8].replace("ID={0}".format(old_id), "ID={0}".format(id_map[old_id]))

                if old_parent is not None:
                    if old_parent not in id_map:
                        raise Exception("ERROR: parent ({0}) referenced before it was used as an ID".format(old_parent))

                    cols[8] = cols[8].replace("Parent={0}".format(old_parent), "Parent={0}".format(id_map[old_parent]))

                fout.write("\t".join(cols) + "\n")
    finally:
        # only close a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Collect exon fragments per mRNA from a GFF file and emit each group.

    Rows of ``-i/--input_gff`` that do not have nine tab-separated columns
    are skipped.  Every ``mRNA`` row opens a new transcript and remembers its
    ID, molecule (column 1) and strand (column 7); every ``exon`` row adds a
    ``{'start': ..., 'end': ...}`` fragment to the open transcript.  Finished
    transcripts are handed to ``write_transcript``, writing to
    ``-o/--output_file`` if supplied and to STDOUT otherwise.
    """
    # ArgumentParser's first positional parameter is ``prog``; the sentence
    # has to be given as ``description=`` to show up as help text.
    # NOTE(review): the description wording appears to come from another
    # script -- confirm it.
    parser = argparse.ArgumentParser(description='Filter the genes of a GFF3 file by mRNA child IDs')

    parser.add_argument('-i', '--input_gff', type=str, required=True, help='GFF file of source annotation' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writes_to_file = args.output_file is not None
    fout = open(args.output_file, 'wt') if writes_to_file else sys.stdout

    current_mRNA_id = None
    current_mol_id = None
    current_fragments = list()
    current_direction = None

    try:
        with open(args.input_gff) as annotation_fh:
            for row in annotation_fh:
                cols = row.rstrip().split("\t")

                if len(cols) != 9:
                    continue

                if cols[2] == 'mRNA':
                    mRNA_id = biocodegff.column_9_value(cols[8], 'ID')

                    # a different mRNA means the previous one is complete
                    if current_mRNA_id is not None and mRNA_id != current_mRNA_id:
                        write_transcript(fout, current_mol_id, current_fragments, current_direction)
                        current_fragments = list()

                    current_mRNA_id = mRNA_id
                    current_mol_id = cols[0]
                    current_direction = cols[6]

                elif cols[2] == 'exon':
                    # coordinates are stored swapped unless the strand is '+'
                    if cols[6] == '+':
                        fragment = {'start':cols[3], 'end':cols[4]}
                    else:
                        fragment = {'start':cols[4], 'end':cols[3]}

                    current_fragments.append(fragment)

        # nothing follows the last transcript, so write it explicitly
        write_transcript(fout, current_mol_id, current_fragments, current_direction)
    finally:
        # STDOUT is left open; a file opened here is closed
        if writes_to_file:
            fout.close()
def main():
    """Rewrite column 9 of a GFF file with freshly generated ID/Parent values.

    Rows without exactly nine columns are copied to ``-o/--output_gff``
    unchanged.  For every other row a new feature ID (and, when the row has a
    Parent, a new parent ID) is looked up in an old-to-new map or created via
    ``get_new_id``; column 9 is then replaced by just ``ID=...`` or
    ``ID=...;Parent=...``, so any other attributes are dropped.  The optional
    ``-m/--output_map`` handle is passed on to ``get_new_id``.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_gff', type=str, required=True, help='Path to an output GFF file to be created with new IDs' )
    parser.add_argument('-p', '--id_prefix', type=str, required=True, help='Will be used as the base for all IDs generated' )
    parser.add_argument('-m', '--output_map', type=str, required=False, help='This will create a tab-delimited mapping of old and new IDs' )
    args = parser.parse_args()

    # old ID -> new ID
    idmap = dict()

    ofh = open(args.output_gff, 'w')
    map_ofh = None

    try:
        if args.output_map is not None:
            map_ofh = open(args.output_map, 'w')

        with open(args.input_file) as in_fh:
            for line in in_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    ofh.write(line + "\n")
                    continue

                feat_id   = biocodegff.column_9_value( cols[8], 'ID' )
                parent_id = biocodegff.column_9_value( cols[8], 'Parent' )

                if feat_id not in idmap:
                    idmap[feat_id] = get_new_id(args.id_prefix, cols[2], feat_id, map_ofh)

                if parent_id is None:
                    cols[8] = "ID={0}".format(idmap[feat_id])
                else:
                    if parent_id not in idmap:
                        # NOTE(review): a parent not seen yet gets its new ID
                        # built from this child's type column -- confirm that
                        # is intended.
                        idmap[parent_id] = get_new_id(args.id_prefix, cols[2], parent_id, map_ofh)

                    cols[8] = "ID={0};Parent={1}".format(idmap[feat_id], idmap[parent_id])

                ofh.write( "\t".join(cols) + "\n" )
    finally:
        # make sure both outputs are flushed and closed, even on error
        ofh.close()
        if map_ofh is not None:
            map_ofh.close()
# Example #8
# 0
def main():
    """Insert a gene feature ahead of every RNA feature that has no Parent.

    Copies ``-i/--input`` to ``-o/--output``.  For each row whose type ends
    in ``RNA`` and whose column 9 has no Parent, a copy of the row is written
    first with type ``gene`` and ID ``<rna id>.gene``, followed by the RNA
    row with its Parent set to that new gene ID.  All other lines are copied
    through (trailing whitespace is stripped from non-comment lines).
    """
    parser = argparse.ArgumentParser(
        description='Adds gene features for RNAs which lack them')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # both handles are closed (and the output flushed) when the block exits
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            is_parentless_rna = (
                cols[2].endswith('RNA')
                and biocodegff.column_9_value(cols[8], 'Parent') is None)

            if is_parentless_rna:
                rna_id = biocodegff.column_9_value(cols[8], 'ID')
                gene_id = "{0}.gene".format(rna_id)

                # the gene row is a copy of the RNA row with type and ID changed
                gene_cols = list(cols)
                gene_cols[2] = 'gene'
                gene_cols[8] = biocodegff.set_column_9_value(
                    gene_cols[8], 'ID', gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)))

                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent',
                                                        gene_id)
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
# Example #9
# 0
def append_organism_names_to_gff(file_path, poly_orgs):
    """Add a ``top_organism_from_blast`` attribute to polypeptide rows in place.

    The file is rewritten through a sibling ``<file_path>.orgtmp`` file that
    then replaces the original.  The ID of the most recent row whose type
    ends in ``RNA`` is remembered, and a following ``polypeptide`` row gets
    the attribute appended when that ID is a key of *poly_orgs*.  A warning
    is printed if no row was annotated.

    Args:
        file_path: path of the GFF file to update.
        poly_orgs: mapping from RNA feature ID to organism name.
    """
    # we have to write to a temp file and copy over
    tmp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    with open(file_path) as fin, open(tmp_path, 'wt') as fout:
        for line in fin:
            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9:
                if cols[2].endswith('RNA'):
                    last_RNA_id = biocodegff.column_9_value(cols[8], 'ID')
                elif cols[2] == 'polypeptide' and last_RNA_id in poly_orgs:
                    # Write the escaped value: the original computed it but
                    # then formatted the raw name into the attribute instead.
                    organism = biocodegff.escape(poly_orgs[last_RNA_id])
                    cols[8] += ";top_organism_from_blast={0}".format(organism)
                    orgs_found += 1
                    line = "\t".join(cols)

            fout.write("{0}\n".format(line))

    if orgs_found == 0:
        print("WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all.  This might be an error.")

    ## now move the temp file over the original copy
    # os.replace overwrites an existing destination on every platform
    os.replace(tmp_path, file_path)
def append_organism_names_to_gff(file_path, poly_orgs):
    """Annotate polypeptide rows of a GFF file with their BLAST organism.

    Every line of *file_path* is copied to ``<file_path>.orgtmp``, which
    afterwards replaces the original file.  While copying, the ID of the
    latest ``*RNA`` row is tracked; a ``polypeptide`` row whose tracked RNA
    ID is present in *poly_orgs* has ``;top_organism_from_blast=<name>``
    appended to column 9.  Prints a warning when nothing was annotated.

    Args:
        file_path: path of the GFF file to rewrite in place.
        poly_orgs: dict mapping RNA feature IDs to organism names.
    """
    # we have to write to a temp file and copy over
    temp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    with open(temp_path, 'wt') as fout:
        with open(file_path) as gff_fh:
            for raw in gff_fh:
                line = raw.rstrip()
                cols = line.split("\t")
                is_feature_row = len(cols) == 9

                if is_feature_row and cols[2].endswith('RNA'):
                    last_RNA_id = biocodegff.column_9_value(cols[8], 'ID')

                if is_feature_row and cols[2] == 'polypeptide' and last_RNA_id in poly_orgs:
                    # The escaped name is what belongs in column 9; the
                    # earlier code built it but never used it.
                    escaped_name = biocodegff.escape(poly_orgs[last_RNA_id])
                    cols[8] += ";top_organism_from_blast={0}".format(escaped_name)
                    orgs_found += 1
                    line = "\t".join(cols)

                fout.write("{0}\n".format(line))

    if orgs_found == 0:
        print("WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all.  This might be an error.")

    ## now move the temp file over the original copy
    # os.replace (unlike os.rename) overwrites the target on all platforms
    os.replace(temp_path, file_path)
def main():
    """Swap the start and end of CDS rows whose end is smaller than the start.

    Copies ``-i/--input`` to ``-o/--output``.  Comment lines and rows without
    nine columns are copied through.  For a ``CDS`` row where column 5 is
    numerically less than column 4 the two columns are exchanged and the
    feature ID is reported on STDOUT; every other row is written unchanged
    apart from stripped trailing whitespace.
    """
    parser = argparse.ArgumentParser( description='Reverses CDS coodinates where stop < start')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    # both handles are closed (and the output flushed) when the block exits
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9 and cols[2] == 'CDS' and int(cols[4]) < int(cols[3]):
                cols[3], cols[4] = cols[4], cols[3]
                print("CDS reversed: {0}".format(biocodegff.column_9_value(cols[8], 'ID')))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line) )
def main():
    """Make every exon's Parent attribute name the RNA feature preceding it.

    Reads ``-i/--input`` and writes ``-o/--output``.  Comment lines and rows
    that do not split into nine columns are passed through.  Whenever a row
    whose type ends in ``RNA`` is met its ID becomes the expected parent; an
    ``exon`` row with a different Parent is rewritten to use that ID and an
    INFO line is printed.
    """
    parser = argparse.ArgumentParser( description='Updates exon Parent attributes to point at the correct RNA feature')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    last_rna_id = None

    # the with-block guarantees both files are closed and the output flushed
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line) )
                continue

            feat_type = cols[2]
            feat_id   = biocodegff.column_9_value(cols[8], 'ID')
            parent    = biocodegff.column_9_value(cols[8], 'Parent')

            if feat_type.endswith('RNA'):
                last_rna_id = feat_id
            elif feat_type == 'exon' and last_rna_id is not None:
                # Exons that precede every RNA are skipped by the guard above:
                # there is no ID to correct them to, and writing a Parent of
                # None would corrupt the row.
                if parent != last_rna_id:
                    print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})".format(feat_id, last_rna_id, feat_type) )
                    cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                    line = "\t".join(cols)

            ofh.write("{0}\n".format(line) )
def main():
    """Copy a GFF3 file, dropping repeated child features.

    Comment lines are copied through, rows without exactly nine columns are
    skipped, and features with no Parent attribute are always written.  A
    feature with a Parent is written only the first time its combination of
    molecule, parent, type, start and stop is seen; later repeats are
    reported on STDOUT and left out.
    """
    parser = argparse.ArgumentParser(
        description='Removes duplicate features in a GFF3 file')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # just reduce the keys to a string:
    # "molecule__parent__type__start__stop"
    # (kept in a set so each lookup is constant-time instead of a list scan)
    found = set()

    with open(args.input) as infile, open(args.output, 'wt') as outfile:
        for line in infile:
            if line.startswith('#'):
                outfile.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            # NOTE(review): non-feature lines are dropped here, not copied
            if len(cols) != 9:
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is not None:
                id_string = "{0}__{1}__{2}__{3}__{4}".format(
                    cols[0], parent, cols[2], cols[3], cols[4])

                if id_string in found:
                    print("INFO: duplicate feature to be removed:\n{0}\n".format(line))
                    continue

                found.add(id_string)

            outfile.write("{0}\n".format(line))
def main():
    """Give every parentless RNA feature a synthetic gene parent.

    Streams ``-i/--input`` into ``-o/--output``.  A nine-column row whose
    type ends in ``RNA`` and which has no Parent attribute is preceded by a
    copy of itself with type ``gene`` and ID ``<rna id>.gene``, and is itself
    written with Parent set to that ID.  Comment lines are copied verbatim;
    all remaining lines are copied with trailing whitespace removed.
    """
    parser = argparse.ArgumentParser(description="Adds gene features for RNAs which lack them")

    parser.add_argument("-i", "--input", type=str, required=True, help="Path to the input GFF3 file")
    parser.add_argument("-o", "--output", type=str, required=True, help="Output GFF3 file to write")
    args = parser.parse_args()

    # the with-block closes both files, which also flushes the output
    with open(args.input) as infile, open(args.output, "wt") as ofh:
        for line in infile:

            if line.startswith("#"):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            rna_id = biocodegff.column_9_value(cols[8], "ID")
            parent = biocodegff.column_9_value(cols[8], "Parent")

            if not (cols[2].endswith("RNA") and parent is None):
                ofh.write("{0}\n".format(line))
                continue

            gene_id = "{0}.gene".format(rna_id)

            # gene row: same coordinates as the RNA, new type and ID
            gene_cols = list(cols)
            gene_cols[2] = "gene"
            gene_cols[8] = biocodegff.set_column_9_value(gene_cols[8], "ID", gene_id)
            ofh.write("{0}\n".format("\t".join(gene_cols)))

            # RNA row: now a child of the gene just written
            cols[8] = biocodegff.set_column_9_value(cols[8], "Parent", gene_id)
            ofh.write("{0}\n".format("\t".join(cols)))
def main():
    """Write a GFF3 file without its duplicated child features.

    Reads ``-i/--input`` and writes ``-o/--output``.  Comment lines are kept,
    rows that do not have nine columns are left out, and rows with no Parent
    attribute are always kept.  Rows with a Parent are keyed by molecule,
    parent, type, start and stop; only the first row per key is written and
    each later one is reported on STDOUT.
    """
    parser = argparse.ArgumentParser( description='Removes duplicate features in a GFF3 file')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    # just reduce the keys to a string:
    # "molecule__parent__type__start__stop"
    # A set is used so that membership tests do not rescan every key seen.
    found = set()

    with open(args.input) as infile, open(args.output, 'wt') as outfile:
        for line in infile:
            if line.startswith('#'):
                outfile.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            # NOTE(review): lines that are not feature rows are discarded
            if len(cols) != 9:
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is None:
                outfile.write("{0}\n".format(line))
                continue

            mol_id, feat_type, start, stop = cols[0], cols[2], cols[3], cols[4]
            id_string = "{0}__{1}__{2}__{3}__{4}".format(mol_id, parent, feat_type, start, stop)

            if id_string in found:
                print("INFO: duplicate feature to be removed:\n{0}\n".format(line) )
            else:
                found.add(id_string)
                outfile.write("{0}\n".format(line) )
# Example #16
# 0
def main():
    """Put CDS coordinates into ascending order.

    Streams ``-i/--input`` into ``-o/--output``.  Comment lines and rows that
    do not have nine columns are passed through.  When a ``CDS`` row has an
    end coordinate (column 5) numerically below its start (column 4), the two
    values are exchanged and the feature's ID is printed; all other rows are
    written as read, minus trailing whitespace.
    """
    parser = argparse.ArgumentParser(
        description='Reverses CDS coodinates where stop < start')

    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # the with-block closes both files, which also flushes the output
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            needs_swap = cols[2] == 'CDS' and int(cols[4]) < int(cols[3])

            if needs_swap:
                cols[3], cols[4] = cols[4], cols[3]
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                print("CDS reversed: {0}".format(feat_id))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Drop parentless features that no other feature names as its Parent.

    The input is read twice.  The first pass records, per line, whether it
    can be written unconditionally (comments, rows without nine columns, and
    features that have a Parent) and collects every value used as a Parent.
    The second pass writes those lines verbatim; a feature without a Parent
    is written only if its ID was collected as a Parent, otherwise a WARN
    line is printed and the row is left out.
    """
    parser = argparse.ArgumentParser( description='Removes orphaned features in a GFF3 file')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    #parser.add_argument('-t', '--type', type=str, required=False, help='Type of features to remove' )
    args = parser.parse_args()

    # going to try saving memory by tracking line numbers instead of storing all of it
    #  true means keep the line, false means it still has to be decided
    # doing tracking this way since it's technically legal for a feature to have no identifier at all.
    keep_line = list()

    # every value that appeared in a Parent attribute
    referenced_parents = set()

    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                keep_line.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                keep_line.append(True)
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # decided in the second pass, once all children are known
                keep_line.append(False)
            else:
                keep_line.append(True)
                referenced_parents.add(parent)

        # second pass over the same file
        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_line, infile):
                if keep:
                    outfh.write(line)
                    continue

                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) == 9:
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')

                    if feat_id is not None and feat_id in referenced_parents:
                        outfh.write("{0}\n".format(line))
                    else:
                        print("WARN: removing this line: {0}".format(line))
# Example #18
# 0
def _finish_pasa_gene(gene, mRNA, assembly, fmin, fmax, strand, fout, source):
    """Locate a completed gene/mRNA pair on its assembly and print it as GFF3."""
    mRNA.locate_on( target=assembly, fmin=fmin, fmax=fmax, strand=strand )
    gene.locate_on( target=assembly, fmin=fmin, fmax=fmax, strand=strand )
    gene.add_mRNA(mRNA)
    assembly.add_gene( gene )
    gene.print_as(fh=fout, source=source, format='gff3')


def main():
    """Turn the cDNA_match rows of a PASA GFF file into gene models.

    Consecutive rows sharing the same ID form one gene with a single mRNA
    (``<id>.mRNA``); each row adds one CDS and one numbered exon to that
    mRNA.  The gene and mRNA span the smallest start to the largest end of
    their rows and take the strand of the first row.  Each gene is printed in
    GFF3 format to ``-o/--output`` as soon as the next ID begins, and the
    last one after the input ends.  Rows without nine columns are ignored.

    Raises:
        Exception: if a nine-column row has a type other than ``cDNA_match``.
    """
    parser = argparse.ArgumentParser( description='Convert PASA GFF file to canonical gene models')

    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by PASA' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-s', '--source', type=str, required=False, default='PASA', help='Value to use for the 2nd (source) column' )
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNA = None
    gene_fmin = None
    gene_fmax = None
    gene_strand = None

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        with open(args.input) as pasa_fh:
            for line in pasa_fh:
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                mol_id = cols[0]
                feat_type = cols[2]
                feat_id = biocodegff.column_9_value(cols[8], 'ID')

                # we expect all columns to be cDNA_match
                if feat_type != 'cDNA_match':
                    raise Exception("ERROR: expected all columns to be of type 'cDNA_match' but found a {0}".format(feat_type))

                ## initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = biothings.Assembly( id=mol_id )

                # GFF start is converted to a 0-based fmin
                row_fmin = int(cols[3]) - 1
                row_fmax = int(cols[4])
                row_strand = cols[6]

                if gene is None or feat_id != gene.id:
                    if gene is not None:
                        # finish the previous one first; current_assembly
                        # still refers to the assembly of that previous gene
                        _finish_pasa_gene(gene, mRNA, current_assembly, gene_fmin,
                                          gene_fmax, gene_strand, fout, args.source)

                    # now start a new one
                    gene = biothings.Gene( id=feat_id )
                    mRNA = biothings.mRNA( id="{0}.mRNA".format(feat_id), parent=gene )
                    exon_count_by_mRNA[mRNA.id] = 0

                    gene_fmin = row_fmin
                    gene_fmax = row_fmax
                    gene_strand = row_strand

                current_assembly = assemblies[mol_id]

                # each row is a new CDS/exon for the current mRNA
                CDS = biothings.CDS( id="{0}.CDS".format(feat_id), parent=mRNA.id )
                # FIX THIS PHASE
                CDS.locate_on( target=current_assembly, fmin=row_fmin, fmax=row_fmax, strand=row_strand, phase='.' )
                mRNA.add_CDS(CDS)

                exon_count_by_mRNA[mRNA.id] += 1
                exon_id = "{0}.exon{1}".format(mRNA.id, exon_count_by_mRNA[mRNA.id])
                exon = biothings.Exon( id=exon_id, parent=mRNA.id )
                exon.locate_on( target=current_assembly, fmin=row_fmin, fmax=row_fmax, strand=row_strand )
                mRNA.add_exon(exon)

                # widen the gene span to cover this row
                gene_fmin = min(gene_fmin, row_fmin)
                gene_fmax = max(gene_fmax, row_fmax)

        # don't orphan the last one
        if gene is not None:
            _finish_pasa_gene(gene, mRNA, current_assembly, gene_fmin,
                              gene_fmax, gene_strand, fout, args.source)
# Example #19
# 0
def main():
    """Write the IDs of organism1 mRNAs supported by both AAT and BLAST evidence.

    Steps, in order:
      1. Parse the organism1 annotation with biocodegff.get_gff3_features().
      2. Load the AAT FASTA database with biocodeutils.fasta_dict_from_file().
      3. Group the AAT GFF3 rows (nucleotide_to_protein_match / match_part)
         into biothings.Match objects, stored per assembly ID.
      4. Keep each mRNA for which at least one match on the same assembly has
         both coverage values returned by calculate_fragmented_coverage()
         at or above --aat_percent_coverage_cutoff.
      5. Collect the query IDs (column 0) of BLAST btab rows passing the
         e-value and percent identity cutoffs.
      6. Write the mRNA IDs present in both sets, one per line, to
         --output_id_list (a name is derived from the cutoffs if omitted).

    Raises:
        Exception: if a match_part row is seen before any match row, if an
            overlap is larger than the mRNA, or if a match target has no
            entry in the AAT FASTA database.
    """
    parser = argparse.ArgumentParser( description='Selects organism 1 mRNAs supported by both AAT protein alignments and BLAST hits to organism 2')

    # NOTE: argparse %-formats help strings, so literal percent signs must be written as %%
    parser.add_argument('-a', '--organism1_annotation', type=str, required=True, help='Annotation GFF for organism 1' )
    parser.add_argument('-p', '--organism1_aat_alignments', type=str, required=True, help='Path to AAT GFF3 (match/match_part)' )
    parser.add_argument('-aatdb', '--aat_fasta_db', type=str, required=True, help='Path to FASTA database that was used in AAT' )
    parser.add_argument('-b', '--organism1_blast_alignments', type=str, required=True, help='Path to BLASTp btab file vs.organism 2 proteins' )
    parser.add_argument('-be', '--blast_eval_cutoff', type=float, required=False, default=1e-5, help='BLAST e-value cutoff' )
    parser.add_argument('-bpi', '--blast_percent_identity_cutoff', type=float, required=False, default=0, help='BLAST %%identity cutoff' )
    parser.add_argument('-ppc', '--aat_percent_coverage_cutoff', type=float, required=False, default=0, help='%% coverage of the query protein by the AAT match' )
    parser.add_argument('-o', '--output_id_list', type=str, required=False, help='List of IDs from organism1 that passed' )
    args = parser.parse_args()

    # set this to an mRNA ID to restrict the overlap comparison to that one transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(args.blast_eval_cutoff, args.blast_percent_identity_cutoff, args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = biocodegff.get_gff3_features( args.organism1_annotation )

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = biocodeutils.fasta_dict_from_file( args.aat_fasta_db )

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None
    # assembly the match being built belongs to; it can differ from the
    #  assembly of the row that triggers storing it
    current_match_assembly_id = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            aat_matches.setdefault(assembly_id, list())

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = biocodegff.column_9_value(cols[8], 'ID').replace('"', '')
            target = biocodegff.column_9_value(cols[8], 'Target')

            # only the first whitespace-delimited token of Target is kept
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                # store the finished match under the assembly it was located on
                if current_match is not None:
                    aat_matches[current_match_assembly_id].append(current_match)
                    aat_match_count += 1

                current_match = biothings.Match( id=feature_id, target_id=target, subclass='nucleotide_to_protein_match', length=fmax - fmin )
                current_match.locate_on( target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand )
                current_match_assembly_id = assembly_id

            elif cols[2] == 'match_part':
                if current_match is None:
                    raise Exception("ERROR: Found match_part ({0}) before any nucleotide_to_protein_match".format(feature_id))

                parent_id = biocodegff.column_9_value(cols[8], 'Parent').replace('"', '')
                match_part = biothings.MatchPart( id=feature_id, parent=parent_id, length=fmax - fmin )
                match_part.locate_on( target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand )
                current_match.add_part(match_part)

    # don't orphan the last match in the file
    if current_match is not None:
        aat_matches[current_match_assembly_id].append(current_match)
        aat_match_count += 1

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():

                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception("ERROR: overlap size ({0}) > mRNA length ({1})".format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception("ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(mRNA, aat_match, match_target_length)

                    if mRNA_percent_coverage >= args.aat_percent_coverage_cutoff and target_percent_coverage >= args.aat_percent_coverage_cutoff:
                        o1_with_aat.append(mRNA.id)
                        break   # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column 19 is compared against the e-value cutoff
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            # column 10 is compared against the percent identity cutoff
            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    ## IDs of organism 1 mRNAs which have AAT support and also a passing BLAST hit
    o1_with_o2 = [o1_mRNA_id for o1_mRNA_id in o1_with_aat if o1_mRNA_id in top_blast_hits]

    print("INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
# Example #20
# 0
def main():
    """Convert glimmerHMM GFF output into GFF3 with gene, mRNA, exon and CDS rows.

    Comment lines (starting with '#') are copied through unchanged and rows
    without exactly 9 tab-separated columns are dropped.  For every mRNA row a
    'gene' row (a copy of the original columns with only the type changed) is
    written, followed by the mRNA row with '.mRNA' appended to its ID and
    Name.  For every CDS row an 'exon' row with the same coordinates is
    written first, followed by the CDS row; both are re-parented to
    '<Parent>.mRNA'.  Rows of any other type are not written.
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    # output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')

    args = parser.parse_args()

    # number of exon rows generated so far for each original Parent value
    next_exon_num = defaultdict(int)

    # context managers guarantee both handles are flushed and closed
    with open(args.input_file, 'r') as fin, \
            open(args.output_file, 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]
            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # the gene row keeps the original attributes (and so the
                #  original ID); only the mRNA row gets the '.mRNA' suffix
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mRNA_id = "{0}.mRNA".format(feat_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', mRNA_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name',
                                                        mRNA_id)
                cols[8] = biocodegff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # copy before the CDS attributes are rewritten
                exon_cols = list(cols)

                cds_id = "{0}.cds".format(parent)
                new_parent = "{0}.mRNA".format(parent)

                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name',
                                                        cds_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent',
                                                        new_parent)
                cols[8] = biocodegff.order_column_9(cols[8])

                # exon numbering starts at 0 for each parent
                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'  # the phase column is blanked for exons
                exon_cols[8] = biocodegff.set_column_9_value(
                    exon_cols[8], 'ID', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(
                    exon_cols[8], 'Name', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(
                    exon_cols[8], 'Parent', new_parent)
                exon_cols[8] = biocodegff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
def main():
    """Add locus_tag attributes to the gene and RNA rows of a GFF3 file.

    Every 9-column row is written back out; rows with any other column count
    are passed through unchanged.  For 'gene' rows the locus tag is taken from
    the --id_file mapping when the gene ID is present there, and is otherwise
    generated from --prefix, a zero-padded counter and (when --molecule_map
    is given) the molecule token.  Rows whose type ends in 'RNA' inherit the
    locus tag of their Parent gene.

    Raises:
        Exception: if a --custom mode is used without --molecule_map and
            --id_file, if a bmicroti ID doesn't match the expected pattern,
            if a molecule is missing from --molecule_map, if a mapped locus
            tag was already assigned to another gene, or if an RNA row
            references a Parent gene not seen yet.
    """
    parser = argparse.ArgumentParser( description='Adds locus tag identifiers to GFF3 features')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-a', '--padding', type=int, required=True, help='Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.' )
    parser.add_argument('-n', '--interval', type=int, required=False, default=1, help='Interval between generated identifiers' )
    parser.add_argument('-s', '--starting_id', type=int, required=False, default=0, help='Initial numeric portion of IDs to be generated (do not zero-pad)' )
    parser.add_argument('-d', '--id_file', type=str, required=False, help='Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)')
    parser.add_argument('-m', '--molecule_map', type=str, required=False, help='Pass a 2-column file of molecule->token identifiers (see documentation)')
    parser.add_argument('-c', '--custom', type=str, required=False, help='For custom parsing steps.  Most should ignore this.')

    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping  = parse_mapping_file( args.id_file )
    mol_mapping = parse_mapping_file( args.molecule_map )
    # a set, since membership is tested for every gene
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for mapped_key in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r'TP(\d\d)_(\d+)', id_mapping[mapped_key])
            if m:
                id_mapping[mapped_key] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2) )

    elif args.custom == 'bmicroti':
        microti_map = { 'I':'01', 'II':'02', 'III':'03', 'IV':'04' }

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for mapped_key in id_mapping:
            m = re.match(r'BBM_(\D+)(\d+)', id_mapping[mapped_key])
            if m:
                print("Changing id from {0} to ".format(mapped_key))
                id_mapping[mapped_key] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2) )
                print(id_mapping[mapped_key])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[mapped_key]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # numeric suffix of the most recently assigned locus tag (kept as matched text)
    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id) )
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                # NOTE: the lines below are the original author's debugging
                #  notes, kept verbatim; their context isn't visible here.
                # issue
                #
                # 66F4EEF2E3C863C251F831817FF71233
                # 7F1917E4D81A959078C9A38E15488BC0
                # E22888670919A4A888572155F40F2654
                # B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
                # gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel
                #
                # errors on: BmicrotiR1_01g00233 -> BBM_I00233
                #5800A4110A62E4EAE57AFAD1F8D65CB3        BBM_I00233

                if feat_type == 'gene':
                    if feat_id in id_mapping:
                        locus_id = id_mapping[feat_id]

                        # a mapped ID can't be regenerated, so retrying would never terminate
                        if locus_id in loci_assigned:
                            raise Exception("ERROR: locus tag {0} (mapped from ID {1}) was already assigned to another gene".format(locus_id, feat_id))
                    else:
                        if args.molecule_map is not None and cols[0] not in mol_mapping:
                            raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]) )

                        # bmicroti numbering continues from the last numeric
                        #  portion assigned rather than from next_id
                        bmicroti_number = int(last_number_portion_assigned) + 1

                        ## keep generating until we get one which wasn't assigned already
                        #   (possibly conflict between --id_file and an auto-generated ID?)
                        while True:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            elif args.custom == 'bmicroti':
                                locus_id = "{0}_{2}g{1}".format(args.prefix, str(bmicroti_number).zfill(args.padding), mol_mapping[cols[0]])
                                # advance so that a retry produces a different candidate
                                bmicroti_number += 1
                            else:
                                locus_id = "{0}_{2}g{1}".format(args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]])

                            next_id += args.interval

                            if locus_id not in loci_assigned:
                                break

                            print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id) )

                    cols[8] = biocodegff.set_column_9_value(cols[8], 'locus_tag', locus_id )

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = biocodegff.set_column_9_value(cols[8], 'locus_tag', gene_loci[parent] )
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Write only the genes of a GFF3 file that have at least one listed mRNA.

    The ID list is read one ID per line (lines of 2 characters or fewer are
    ignored).  Rows are buffered per gene: the gene row itself, plus every
    following row that belongs to an mRNA whose ID is in the list (the mRNA
    row and the rows after it, until the next mRNA or gene row).  A gene is
    written only if something besides the gene row was buffered.  Rows
    without exactly 9 tab-separated columns are dropped and a
    '##gff-version 3' header is always written first.
    """
    # the text must be passed as description=; the first positional
    #  parameter of ArgumentParser is the program name
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_gff3',
                        type=str,
                        required=True,
                        help='GFF3 file of source molecules')
    parser.add_argument('-l',
                        '--id_list',
                        type=str,
                        required=True,
                        help='List file of mRNA IDs to keep')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    # a set gives constant-time lookups for the per-mRNA membership test
    ids_to_keep = set()

    with open(args.id_list) as id_fh:
        for line in id_fh:
            line = line.rstrip()
            if len(line) > 2:
                ids_to_keep.add(line)

    def write_gene(fh, gene_lines):
        # a gene is only exported if at least one child row was retained
        if len(gene_lines) > 1:
            for gene_line in gene_lines:
                fh.write("{0}\n".format(gene_line))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        fout.write("##gff-version 3\n")

        current_gene_lines = list()
        keep = False

        with open(args.input_gff3) as gff_fh:
            for line in gff_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                feat_type = cols[2]

                if feat_type == 'gene':
                    # purge the current gene, if any
                    write_gene(fout, current_gene_lines)

                    # reset; the keep state of the previous gene's last mRNA
                    #  must not carry over into this gene
                    current_gene_lines = [line]
                    keep = False

                else:
                    if feat_type == 'mRNA':
                        keep = feat_id in ids_to_keep

                    if keep:
                        current_gene_lines.append(line)

        # don't orphan the last gene in the file
        write_gene(fout, current_gene_lines)
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
# Example #23
# 0
def main():
    """Replace the ID and Parent values of a GFF3 file with newly generated IDs.

    Each distinct ID gets one new identifier from get_new_id(prefix, type,
    mode), remembered in a map so that later Parent references to it are
    rewritten to the same value.  Rows without exactly 9 tab-separated
    columns are passed through unchanged.

    Raises:
        Exception: if a Parent value is encountered before a row carrying
            that value as its ID.
    """
    parser = argparse.ArgumentParser(
        description=
        'Generates new identifiers in GFF3 files following the IGS identifier convention.'
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='TA file of source molecules')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Optional output file path (else STDOUT)')
    parser.add_argument('-p',
                        '--prefix',
                        type=str,
                        required=True,
                        help='The prefix portion of IDs to be generated')
    parser.add_argument(
        '-m',
        '--mode',
        type=str,
        required=False,
        default='sequential',
        help=
        'ID modes (see embedded documentation): sequential, uuid, hex8, hex12')

    args = parser.parse_args()
    check_arguments(args)

    # key=original ID, value=newly generated ID
    id_map = dict()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # grab the ID and Parent attributes, if any
                old_id = biocodegff.column_9_value(cols[8], 'ID')
                old_parent = biocodegff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if old_id is not None:
                    # the same ID on several rows must map to a single new ID
                    if old_id not in id_map:
                        id_map[old_id] = get_new_id(args.prefix, feat_type,
                                                    args.mode)

                    cols[8] = cols[8].replace("ID={0}".format(old_id),
                                              "ID={0}".format(id_map[old_id]))

                if old_parent is not None:
                    if old_parent not in id_map:
                        raise Exception(
                            "ERROR: parent ({0}) referenced before it was used as an ID"
                            .format(old_parent))

                    cols[8] = cols[8].replace(
                        "Parent={0}".format(old_parent),
                        "Parent={0}".format(id_map[old_parent]))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Convert a Cufflinks GTF file to GFF3, as gene models or cDNA_match features.

    In 'model' export mode each 'transcript' row becomes an mRNA of the gene
    named by its gene_id (a new gene is started whenever gene_id changes) and
    each 'exon' row becomes both an exon and a CDS on the current mRNA, with a
    running CDS phase.  In 'cDNA_match' mode each transcript becomes a match
    and each exon a match_part.  Finished features are printed with source
    'Cufflinks'.  Rows without exactly 9 tab-separated columns are reported on
    STDOUT and skipped.

    Raises:
        Exception: if --export_mode is neither 'model' nor 'cDNA_match'.
    """
    parser = argparse.ArgumentParser( description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('-e', '--export_mode', type=str, required=False, default='model', help='Export mode for results (model or cDNA_match)' )
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception("ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'")

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_assembly = None
        current_gene = None
        current_RNA = None
        current_match = None

        # number of exon rows seen so far for each transcript ID
        exon_count_by_RNA = defaultdict(int)

        # each line is one GTF feature row
        with open(args.input_file, "r") as ifh:
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = biothings.Assembly( id=mol_id )

                current_assembly = assemblies[mol_id]
                ftype  = cols[2]
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]
                col9 = cols[8]

                # this makes it look like GFF column 9 so I can use biocodegff.column_9_value(str, key)
                col9 = col9.replace(' "', '="')
                gene_id       = biocodegff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = biocodegff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        # a change of gene_id means the previous gene is complete
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = biothings.Gene( id=gene_id )
                            current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                        mRNA = biothings.mRNA( id=transcript_id, parent=current_gene )
                        mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as( fh=ofh, source='Cufflinks', format='gff3' )

                        current_match = biothings.Match( id=transcript_id, subclass='cDNA_match', length=fmax - fmin )
                        current_match.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                elif ftype == 'exon':
                    exon_number = biocodegff.column_9_value(col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                        CDS = biothings.CDS( id=cds_id, parent=current_RNA )
                        CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=current_CDS_phase )
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                        exon = biothings.Exon( id=exon_id, parent=current_RNA )
                        exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = biothings.MatchPart( id=mp_id, parent=current_match, length=fmax - fmin )
                        mp.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_match.add_part(mp)

        # don't forget to do the last gene/match, if there were any.  The source
        #  has to be the same as for every other feature printed above.
        if args.export_mode == 'model':
            if current_gene is not None:
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as( fh=ofh, source='Cufflinks', format='gff3' )
    finally:
        # never close STDOUT, only a file we opened ourselves
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert an Augustus GFF file into GFF3, keeping the per-gene comments.

    Comment lines are buffered and, when a '# end gene ' line is reached,
    written out followed by the current gene printed via print_as() with
    source 'AUGUSTUS'.  Comment content (including protein sequence lines) is
    passed through verbatim and not parsed.  'gene' and 'transcript' rows
    create the gene and its mRNAs; every 'CDS' row adds a CDS plus a derived
    exon with the same coordinates to its Parent mRNA.  Rows without exactly
    9 tab-separated columns are skipped.

    Raises:
        Exception: if two transcripts share an ID, or if a CDS row references
            a Parent that is not an mRNA of the current gene.
    """
    parser = argparse.ArgumentParser( description='Converts Augustus GFF output to GFF3')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    gene = None
    # mRNAs of the gene currently being built, keyed by ID
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.input) as fin, open(args.output, mode='wt', encoding='utf-8') as fout:
        # the header needs its own line, else it fuses with the first comment written
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write( "".join(current_gene_comment_lines) )
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = biocodegff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly( id=mol_id )

            current_assembly = assemblies[mol_id]

            if feat_type == "gene":
                gene = biothings.Gene( id=feat_id )
                gene.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )

            elif feat_type == "transcript":
                if feat_id in exon_count_by_mRNA:
                    raise Exception( "ERROR: two different mRNAs found with same ID: {0}".format(feat_id) )

                mRNA = biothings.mRNA( id=feat_id, parent=gene )
                mRNA.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA
                exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = biocodegff.column_9_value( cols[8], 'Parent' )

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # attach to the mRNA named by Parent, which isn't necessarily
                #  the most recently seen transcript row
                parent_mRNA = mRNAs[parent_id]

                CDS = biothings.CDS( id=feat_id, parent=parent_mRNA )
                CDS.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6], phase=int(cols[7]) )
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = biothings.Exon( id=exon_id, parent=parent_mRNA )
                exon.locate_on( target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6] )
                parent_mRNA.add_exon(exon)
def main():
    """Convert glimmerHMM GFF output into GFF3 with gene, mRNA, exon and CDS rows.

    Comment lines (starting with '#') are copied through unchanged and rows
    without exactly 9 tab-separated columns are dropped.  For every mRNA row a
    'gene' row (a copy of the original columns with only the type changed) is
    written, followed by the mRNA row with '.mRNA' appended to its ID and
    Name.  For every CDS row an 'exon' row with the same coordinates is
    written first, followed by the CDS row; both are re-parented to
    '<Parent>.mRNA'.  Rows of any other type are not written.
    """
    parser = argparse.ArgumentParser( description='Converts glimmerHMM GFF output to GFF3')

    # output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to parse' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )

    args = parser.parse_args()

    # number of exon rows generated so far for each original Parent value
    next_exon_num = defaultdict(int)

    # context managers guarantee both handles are flushed and closed
    with open(args.input_file, 'r') as fin, open(args.output_file, 'w') as fout:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]
            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # the gene row keeps the original attributes (and so the
                #  original ID); only the mRNA row gets the '.mRNA' suffix
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mRNA_id = "{0}.mRNA".format(feat_id)
                cols[8] = biocodegff.set_column_9_value( cols[8], 'ID', mRNA_id )
                cols[8] = biocodegff.set_column_9_value( cols[8], 'Name', mRNA_id )
                cols[8] = biocodegff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write( "{0}\n".format("\t".join(gene_cols)) )
                fout.write( "{0}\n".format("\t".join(cols)) )

            elif feat_type == 'CDS':
                # copy before the CDS attributes are rewritten
                exon_cols = list(cols)

                cds_id = "{0}.cds".format(parent)
                new_parent = "{0}.mRNA".format(parent)

                cols[8] = biocodegff.set_column_9_value( cols[8], 'ID', cds_id )
                cols[8] = biocodegff.set_column_9_value( cols[8], 'Name', cds_id )
                cols[8] = biocodegff.set_column_9_value( cols[8], 'Parent', new_parent )
                cols[8] = biocodegff.order_column_9(cols[8])

                # exon numbering starts at 0 for each parent
                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent] )
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'  # the phase column is blanked for exons
                exon_cols[8] = biocodegff.set_column_9_value( exon_cols[8], 'ID', exon_id )
                exon_cols[8] = biocodegff.set_column_9_value( exon_cols[8], 'Name', exon_id )
                exon_cols[8] = biocodegff.set_column_9_value( exon_cols[8], 'Parent', new_parent )
                exon_cols[8] = biocodegff.order_column_9(exon_cols[8])

                fout.write( "{0}\n".format("\t".join(exon_cols)) )
                fout.write( "{0}\n".format("\t".join(cols)) )
# Example #27
# 0
def main():
    """Copy a GFF3 file while dropping orphaned features.

    A feature row is treated as an orphan when it has no ``Parent``
    attribute and no other row in the file names its ``ID`` as a parent.
    Comment lines, rows without exactly nine tab-separated columns, and
    every row that has a ``Parent`` are always copied through.  Each removed
    row is reported on STDOUT with a ``WARN:`` prefix.

    The input is read twice so that only one boolean per line has to be held
    in memory rather than the file contents.
    """
    parser = argparse.ArgumentParser(
        description='Removes orphaned features in a GFF3 file')

    ## command-line options
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    #parser.add_argument('-t', '--type', type=str, required=False, help='Type of features to remove' )
    args = parser.parse_args()

    # One entry per input line: True means "always keep", False means "keep
    # only if some other feature names this one as its parent".  Decisions
    # are tracked by line position rather than by ID because a feature may
    # legally have no identifier at all.
    keep_line = list()

    # every ID that appears in any row's Parent attribute
    referenced_ids = set()

    with open(args.input) as infile:
        # pass 1: classify each line and collect the referenced parent IDs
        for line in infile:
            if line.startswith('#'):
                keep_line.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                keep_line.append(True)
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # undecided until the whole file has been seen
                keep_line.append(False)
            else:
                keep_line.append(True)
                # GFF3 allows several comma-separated parents on one feature
                referenced_ids.update(parent.split(','))

        # pass 2: re-read from the top and write the survivors
        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_line, infile):
                if keep:
                    outfh.write(line)
                    continue

                # Only nine-column rows are ever marked False, so column 9
                # is guaranteed to exist here.
                line = line.rstrip()
                cols = line.split("\t")
                feat_id = biocodegff.column_9_value(cols[8], 'ID')

                if feat_id is not None and feat_id in referenced_ids:
                    outfh.write("{0}\n".format(line))
                else:
                    print("WARN: removing this line: {0}".format(line))
def main():
    """Rewrite a GFF file so that every feature carries a newly generated ID.

    Rows that do not have exactly nine tab-separated columns are copied
    through unchanged.  For every other row the ``ID`` (and ``Parent``, when
    present) is replaced by an identifier obtained from ``get_new_id`` and
    column 9 is rewritten to hold *only* those one or two attributes; any
    other attributes on the row are dropped.  Each old identifier is handed
    to ``get_new_id`` at most once, so a parent and the children that refer
    to it keep pointing at each other in the output.

    When ``--output_map`` is given, the opened map file is passed to
    ``get_new_id`` for every newly generated ID.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to be read')
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=True,
        help='Path to an output GFF file to be created with new IDs')
    parser.add_argument('-p',
                        '--id_prefix',
                        type=str,
                        required=True,
                        help='Will be used as the base for all IDs generated')
    parser.add_argument(
        '-m',
        '--output_map',
        type=str,
        required=False,
        help='This will create a tab-delimited mapping of old and new IDs')
    args = parser.parse_args()

    # old identifier -> newly generated identifier
    idmap = dict()

    with open(args.input_file) as ifh, open(args.output_gff, 'w') as ofh:
        # The map file is optional, so it is closed by hand in the finally
        # clause instead of joining the with-statement above.
        map_ofh = None

        if args.output_map is not None:
            map_ofh = open(args.output_map, 'w')

        try:
            for line in ifh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    ofh.write(line + "\n")
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent_id = biocodegff.column_9_value(cols[8], 'Parent')

                # NOTE(review): rows lacking an ID presumably yield
                # feat_id = None and would then all share one generated ID
                # through idmap[None] -- confirm that is intended.
                if feat_id not in idmap:
                    idmap[feat_id] = get_new_id(args.id_prefix, cols[2],
                                                feat_id, map_ofh)

                if parent_id is None:
                    cols[8] = "ID={0}".format(idmap[feat_id])
                else:
                    # A parent first met on a child row is registered using
                    # the child's feature type (cols[2]), since that is the
                    # only type known at this point.
                    if parent_id not in idmap:
                        idmap[parent_id] = get_new_id(args.id_prefix, cols[2],
                                                      parent_id, map_ofh)

                    cols[8] = "ID={0};Parent={1}".format(
                        idmap[feat_id], idmap[parent_id])

                ofh.write("\t".join(cols) + "\n")
        finally:
            if map_ofh is not None:
                map_ofh.close()
    last_gene = None
    
    for qry_gene in things:
        if qry_gene.id in handled_ids:
            continue
        
        ## mark this one as handled
        handled_ids[qry_gene.id] = 1
        nonoverlapping_set.append(qry_gene)

<<<<<<< .mine
        current_assembly = assemblies[mol_id]
        rfmin = int(cols[3]) - 1
        rfmax = int(cols[4])
        rstrand = None
        feat_id = biocodegff.column_9_value(cols[8], 'ID')
        parent_id = biocodegff.column_9_value(cols[8], 'Parent')
        parent_feat = None
        
        if parent_id is not None:
            if parent_id in features:
                parent_feat = features[parent_id]
            else:
                raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) )

        #print("Processing feature: ({0})".format(feat_id))

        if cols[6] == '-':
            strand = -1
        elif cols[6] == '+':
            strand = 1
def main():
    """Clip the CDS features of a flawed GFF3 to externally supplied polypeptides.

    All paths are hard-coded.  Genes are loaded from
    ``canonical.flawed.gff3``; polypeptide coordinates are read from the
    ILRI GFF and indexed by their ``Parent`` (mRNA) ID.  For every mRNA that
    has a polypeptide, its CDS features are deleted or trimmed by comparing
    them against that polypeptide.  Every gene is then written to
    ``canonical.corrected.gff3`` with source ``GenBank``.

    An mRNA without a matching polypeptide is reported on STDOUT and its CDS
    features are left untouched.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        polypeptides = _load_ilri_polypeptides(ilri_gff, assemblies)

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(
            len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print(
                            "DEBUG: {0} not found as a parent to any polypeptide".
                            format(mRNA.id))
                        # Nothing to clip against.  Skipping here avoids
                        # reusing the polypeptide of a previously processed
                        # mRNA.
                        continue

                    _clip_CDSs_to_polypeptide(mRNA, polypeptides[mRNA.id])

                gene.print_as(fh=fout, source=source, format='gff3')


def _load_ilri_polypeptides(ilri_gff, assemblies):
    """Return a dict of polypeptides from *ilri_gff*, keyed by their Parent ID.

    Only nine-column rows whose type is ``polypeptide`` are used.  Each
    polypeptide is located on the assembly named in column 1, which must
    already be present in *assemblies* (a KeyError is raised otherwise).
    """
    polypeptides = dict()

    with open(ilri_gff) as ilri_fh:
        for line in ilri_fh:
            # drop the line terminator so it cannot end up inside the last
            # attribute value of column 9
            cols = line.rstrip("\r\n").split("\t")

            if len(cols) != 9 or cols[2] != 'polypeptide':
                continue

            id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')
            polypeptides[parent] = biothings.Polypeptide(id=id, parent=parent)
            polypeptides[parent].locate_on(target=assemblies[cols[0]],
                                           fmin=int(cols[3]) - 1,
                                           fmax=int(cols[4]),
                                           strand=cols[6])

    return polypeptides


def _clip_CDSs_to_polypeptide(mRNA, polypeptide):
    """Delete or trim the CDS features of *mRNA* against *polypeptide*.

    The comparison operators are the ones defined by the feature classes.
    NOTE(review): presumably ``<`` / ``>`` mean "entirely outside" and
    ``<=`` / ``>=`` mean "overhanging that end" -- confirm against biothings.
    """
    # Iterate over a snapshot, because delete_CDS() may shrink the
    # collection that CDSs() hands back.
    for CDS in list(mRNA.CDSs()):
        if CDS < polypeptide:
            mRNA.delete_CDS(CDS)
        elif CDS <= polypeptide:
            CDS.location().fmin = polypeptide.location().fmin

        if CDS > polypeptide:
            mRNA.delete_CDS(CDS)
        elif CDS >= polypeptide:
            CDS.location().fmax = polypeptide.location().fmax
def main():
    """Convert Augustus output (native GTF, or its ``--gff`` flavour) to GFF3.

    Only ``gene``, ``transcript`` and ``CDS`` rows are used; all other rows
    and any row without exactly nine tab-separated columns are ignored.
    GTF-style 9th columns are first rewritten as GFF3 ``ID``/``Parent``
    attributes.  Every CDS additionally produces an exon with the same
    coordinates, named ``<transcript>.exon<N>``.

    Comment lines are buffered and written, followed by the gene itself,
    when a ``# end gene `` comment is reached; a gene that is not followed
    by such a comment is never written.

    Raises:
        Exception: on a malformed GTF 9th column, a duplicated transcript
            ID, or a CDS whose parent transcript has not been seen.
    """
    parser = argparse.ArgumentParser(
        description=
        'Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## command-line options
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to a GFF file created by Augustus')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()

    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.input) as fin, \
            open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                # All comments (protein sequence lines included) are passed
                # through verbatim with the gene they precede.
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            # drop the line terminator so it cannot end up inside the last
            # attribute value of column 9
            cols = line.rstrip("\r\n").split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            # The input can be GTF or GFF; only GTF rows need their 9th
            # column reformatted before the library can parse it.
            if not cols[8].startswith(('ID', 'Parent')):
                cols[8] = _augustus_gtf_column_9_to_gff3(feat_type, cols[8])

            feat_id = biocodegff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type == "gene":
                gene = biothings.Gene(id=feat_id)
                gene.locate_on(target=current_assembly,
                               fmin=int(cols[3]) - 1,
                               fmax=int(cols[4]),
                               strand=cols[6])

            elif feat_type == "transcript":
                mRNA = biothings.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(target=current_assembly,
                               fmin=int(cols[3]) - 1,
                               fmax=int(cols[4]),
                               strand=cols[6])
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception(
                        "ERROR: two different mRNAs found with same ID: {0}".
                        format(feat_id))
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = biocodegff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception(
                        "ERROR: Found CDS column with parent ({0}) mRNA not yet in the file"
                        .format(parent_id))

                # Attach to the transcript named by Parent, not merely to
                # whichever transcript row was read most recently.
                parent_mRNA = mRNAs[parent_id]

                CDS = biothings.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on(target=current_assembly,
                              fmin=int(cols[3]) - 1,
                              fmax=int(cols[4]),
                              strand=cols[6],
                              phase=int(cols[7]))
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id,
                                               exon_count_by_mRNA[parent_id])

                exon = biothings.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on(target=current_assembly,
                               fmin=int(cols[3]) - 1,
                               fmax=int(cols[4]),
                               strand=cols[6])
                parent_mRNA.add_exon(exon)


def _augustus_gtf_column_9_to_gff3(feat_type, column_9):
    """Rewrite an Augustus GTF 9th column as GFF3 ``ID``/``Parent`` attributes.

        gene        g1                                     ->  ID=g1
        transcript  g1.t1                                  ->  ID=g1.t1;Parent=g1
        CDS         transcript_id "g1.t1"; gene_id "g1";   ->  ID=g1.t1.cds;Parent=g1.t1

    Any other *feat_type* is returned unchanged.

    Raises:
        Exception: if *column_9* does not have the layout shown above for
            its feature type.
    """
    if feat_type == 'gene':
        m = re.match(r'(g\d+)', column_9)

        if m:
            return "ID={0}".format(m.group(1))
    elif feat_type == 'transcript':
        m = re.match(r'((g\d+)\.t\d+)', column_9)

        if m:
            return "ID={0};Parent={1}".format(m.group(1), m.group(2))
    elif feat_type == 'CDS':
        m = re.match(r'transcript_id "(g\d+\.t\d+)"; gene_id "g\d+";',
                     column_9)

        if m:
            return "ID={0}.cds;Parent={0}".format(m.group(1))
    else:
        return column_9

    raise Exception(
        "ERROR: GTF detected but {0} row has bad 9th column format: {1}".
        format(feat_type, column_9))
def main():
    """Convert Augustus output (native GTF, or its ``--gff`` flavour) to GFF3.

    Only ``gene``, ``transcript`` and ``CDS`` rows are used; all other rows
    and any row without exactly nine tab-separated columns are ignored.
    GTF-style 9th columns are rewritten as GFF3 attributes first:

        gene        g1                                     ->  ID=g1
        transcript  g1.t1                                  ->  ID=g1.t1;Parent=g1
        CDS         transcript_id "g1.t1"; gene_id "g1";   ->  ID=g1.t1.cds;Parent=g1.t1

    Every CDS additionally produces an exon with the same coordinates, named
    ``<transcript>.exon<N>``.  Comment lines are buffered and written,
    followed by the gene itself, when a ``# end gene `` comment is reached;
    a gene that is not followed by such a comment is never written.

    Raises:
        Exception: on a malformed GTF 9th column, a duplicated transcript
            ID, or a CDS whose parent transcript has not been seen.
    """
    parser = argparse.ArgumentParser(description="Convert native (GTF) or GFF output from Augustus into GFF3 format")

    ## command-line options
    parser.add_argument("-i", "--input", type=str, required=True, help="Path to a GFF file created by Augustus")
    parser.add_argument("-o", "--output", type=str, required=True, help="Path to an output file to be created")
    args = parser.parse_args()

    # Patterns for the GTF flavour of column 9, compiled once up front.
    gtf_gene_re = re.compile(r"(g\d+)")
    gtf_transcript_re = re.compile(r"((g\d+)\.t\d+)")
    gtf_cds_re = re.compile(r'transcript_id "(g\d+\.t\d+)"; gene_id "g\d+";')

    assemblies = dict()

    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.input) as fin, open(args.output, mode="wt", encoding="utf-8") as fout:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                # All comments (protein sequence lines included) are passed
                # through verbatim with the gene they precede.
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))
                    gene.print_as(fh=fout, source="AUGUSTUS", format="gff3")

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            # drop the line terminator so it cannot end up inside the last
            # attribute value of column 9
            cols = line.rstrip("\r\n").split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ["gene", "transcript", "CDS"]:
                continue

            # The input can be GTF or GFF; only GTF rows need their 9th
            # column reformatted before the library can parse it.
            if not cols[8].startswith(("ID", "Parent")):
                if feat_type == "gene":
                    m_gene = gtf_gene_re.match(cols[8])

                    if not m_gene:
                        raise Exception(
                            "ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8])
                        )

                    cols[8] = "ID={0}".format(m_gene.group(1))
                elif feat_type == "transcript":
                    m_transcript = gtf_transcript_re.match(cols[8])

                    if not m_transcript:
                        raise Exception(
                            "ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8])
                        )

                    cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1), m_transcript.group(2))
                elif feat_type == "CDS":
                    m_CDS = gtf_cds_re.match(cols[8])

                    if not m_CDS:
                        raise Exception(
                            "ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8])
                        )

                    cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))

            feat_id = biocodegff.column_9_value(cols[8], "ID")

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            if feat_type == "gene":
                gene = biothings.Gene(id=feat_id)
                gene.locate_on(target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])

            elif feat_type == "transcript":
                mRNA = biothings.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(feat_id))
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = biocodegff.column_9_value(cols[8], "Parent")

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception(
                        "ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id)
                    )

                # Attach to the transcript named by Parent, not merely to
                # whichever transcript row was read most recently.
                parent_mRNA = mRNAs[parent_id]

                CDS = biothings.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on(
                    target=current_assembly,
                    fmin=int(cols[3]) - 1,
                    fmax=int(cols[4]),
                    strand=cols[6],
                    phase=int(cols[7]),
                )
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = biothings.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on(target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])
                parent_mRNA.add_exon(exon)