def main():
    """Re-point exon ``Parent`` attributes at the most recent RNA feature.

    Reads the GFF3 file named by ``-i/--input`` and writes the result to
    ``-o/--output``.  Comment lines and rows that do not have exactly nine
    tab-separated columns are copied through.  Each row whose type ends in
    ``RNA`` becomes the "current" RNA; every following ``exon`` row whose
    ``Parent`` differs from that RNA's ID is rewritten and reported on STDOUT.
    """
    parser = argparse.ArgumentParser(
        description=
        'Updates exon Parent attributes to point at the correct RNA feature')

    ## output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    # context managers make sure both handles are flushed and closed, even
    # if a malformed row raises part-way through
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id

            # An exon seen before any RNA has nothing to be corrected
            # against; leave it alone rather than writing a None parent.
            elif (cols[2] == 'exon' and last_rna_id is not None
                  and parent != last_rna_id):
                print(
                    "INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})"
                    .format(feat_id, last_rna_id, cols[2]))
                cols[8] = gff.set_column_9_value(cols[8], 'Parent',
                                                 last_rna_id)
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Trim or drop CDS features so they fit inside their polypeptide.

    Loads gene models from ``canonical.flawed.gff3``, reads polypeptide
    coordinates (keyed by their ``Parent`` attribute) from the ILRI GFF file,
    then for every mRNA that has a polypeptide compares each CDS against it:
    CDSs comparing ``<`` or ``>`` the polypeptide are deleted, CDSs comparing
    ``<=`` / ``>=`` have their ``fmin`` / ``fmax`` set to the polypeptide's.
    Every gene is then written as GFF3 to ``canonical.corrected.gff3``.

    mRNAs with no polypeptide are reported and left untouched.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    (assemblies, features) = gff.get_gff3_features(flawed_gff_file)

    print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

    # polypeptides indexed by the ID of the feature named in their Parent attribute
    polypeptides = dict()

    with open(ilri_gff) as ilri_fh:
        for line in ilri_fh:
            cols = line.split("\t")

            if len(cols) != 9 or cols[2] != 'polypeptide':
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')
            polypeptides[parent] = things.Polypeptide(id=feat_id, parent=parent)
            # GFF coordinates are 1-based inclusive; fmin is stored 0-based
            polypeptides[parent].locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])

    print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)) )

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        # Without this the loop below would reuse the previous
                        # mRNA's polypeptide (or hit an unbound name).
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # Snapshot the CDS collection first since the loop may
                    # delete entries (assumes CDSs() returns an iterable of
                    # CDS objects -- TODO confirm).
                    for CDS in list(mRNA.CDSs()):
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
Example #3
0
def main():
    """Replace every ID (and matching Parent reference) with a generated one.

    Rows without exactly nine tab-separated columns are copied through.  For
    each feature row the ``ID`` value is swapped for one produced by
    ``get_new_id`` (reused if the same old ID was seen before) and the
    ``Parent`` value is swapped for the new ID of that parent.

    Raises:
        Exception: when a row references a Parent whose ID has not yet
            appeared earlier in the file.
    """
    parser = argparse.ArgumentParser( description='Generates new identifiers in GFF3 files following the IGS identifier convention.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-m', '--mode', type=str, required=False, default='sequential', help='ID modes (see embedded documentation): sequential, uuid, hex8, hex12')

    args = parser.parse_args()
    check_arguments(args)

    # old ID -> newly generated ID
    id_map = dict()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.input_file) as infile:
            for line in infile:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # grab the ID and Parent attributes, if any
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent_id = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_id is not None:
                    if feat_id in id_map:
                        new_id = id_map[feat_id]
                    else:
                        new_id = get_new_id(args.prefix, feat_type, args.mode)
                        id_map[feat_id] = new_id

                    cols[8] = cols[8].replace("ID={0}".format(feat_id), "ID={0}".format(new_id))

                if parent_id is not None:
                    if parent_id not in id_map:
                        raise Exception("ERROR: parent ({0}) referenced before it was used as an ID".format(parent_id))

                    cols[8] = cols[8].replace("Parent={0}".format(parent_id), "Parent={0}".format(id_map[parent_id]))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
Example #4
0
def main():
    """Collect the exon coordinates of each mRNA and hand them to ``write_transcript``.

    Rows without exactly nine tab-separated columns are ignored.  An ``mRNA``
    row with a new ID flushes the fragments gathered so far and starts a new
    transcript; each ``exon`` row appends a ``{'start', 'end'}`` dict, with
    the two coordinates swapped for anything not on the ``+`` strand.  The
    final transcript is flushed after the last line.
    """
    # The text must be passed as description=; the first positional
    # parameter of ArgumentParser is the program name.
    parser = argparse.ArgumentParser(description='Filter the genes of a GFF3 file by mRNA child IDs')

    ## output file to be written
    parser.add_argument('-i', '--input_gff', type=str, required=True, help='GFF file of source annotation' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    current_mRNA_id = None
    current_mol_id = None
    current_fragments = list()
    current_direction = None

    try:
        with open(args.input_gff) as infile:
            for line in infile:
                cols = line.rstrip().split("\t")

                if len(cols) != 9:
                    continue

                feat_type = cols[2]

                if feat_type == 'mRNA':
                    feat_id = gff.column_9_value(cols[8], 'ID')

                    if current_mRNA_id is not None and feat_id != current_mRNA_id:
                        # purge the existing one first
                        write_transcript(fout, current_mol_id, current_fragments, current_direction)
                        current_fragments = list()

                    current_mRNA_id = feat_id
                    current_mol_id = cols[0]
                    current_direction = cols[6]

                elif feat_type == 'exon':
                    if cols[6] == '+':
                        current_fragments.append({'start':cols[3], 'end':cols[4]})
                    else:
                        current_fragments.append({'start':cols[4], 'end':cols[3]})

        write_transcript(fout, current_mol_id, current_fragments, current_direction)
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Insert a ``gene`` row ahead of every RNA row that has no Parent.

    Comment lines and rows without exactly nine tab-separated columns are
    copied through.  For a parentless row whose type ends in ``RNA`` a copy
    of the row is written first with type ``gene`` and ID ``<rna id>.gene``,
    followed by the RNA row itself with ``Parent`` set to that gene ID.
    """
    parser = argparse.ArgumentParser(
        description='Adds gene features for RNAs which lack them')

    ## output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # context managers make sure both handles are closed, even on error
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA') and parent is None:
                gene_id = "{0}.gene".format(feat_id)

                # the gene row is the RNA row with its type and ID replaced
                gene_cols = list(cols)
                gene_cols[2] = 'gene'
                gene_cols[8] = gff.set_column_9_value(gene_cols[8], 'ID',
                                                      gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)))

                cols[8] = gff.set_column_9_value(cols[8], 'Parent', gene_id)
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Rewrite column 9 of every feature row with freshly generated IDs.

    Rows without exactly nine tab-separated columns are copied through.  For
    feature rows the whole attribute column is replaced by ``ID=<new>`` or
    ``ID=<new>;Parent=<new parent>``; any other attributes are dropped.  Old
    IDs map to the same new ID every time they are seen.  The optional map
    file handle is passed to ``get_new_id``.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_gff', type=str, required=True, help='Path to an output GFF file to be created with new IDs' )
    parser.add_argument('-p', '--id_prefix', type=str, required=True, help='Will be used as the base for all IDs generated' )
    parser.add_argument('-m', '--output_map', type=str, required=False, help='This will create a tab-delimited mapping of old and new IDs' )
    args = parser.parse_args()

    map_ofh = None
    if args.output_map is not None:
        map_ofh = open(args.output_map, 'w')

    # old ID -> newly generated ID
    idmap = dict()

    try:
        with open(args.input_file) as infile, open(args.output_gff, 'w') as ofh:
            for line in infile:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    ofh.write(line + "\n")
                    continue

                feat_id   = gff.column_9_value(cols[8], 'ID')
                parent_id = gff.column_9_value(cols[8], 'Parent')

                if feat_id is not None and feat_id in idmap:
                    new_feat_id = idmap[feat_id]
                else:
                    new_feat_id = get_new_id(args.id_prefix, cols[2], feat_id, map_ofh)
                    # Features without an ID must not be cached under None,
                    # or every later ID-less feature would share one new ID.
                    if feat_id is not None:
                        idmap[feat_id] = new_feat_id

                if parent_id is None:
                    cols[8] = "ID={0}".format(new_feat_id)
                else:
                    if parent_id in idmap:
                        new_parent_id = idmap[parent_id]
                    else:
                        # parent not seen yet: its ID is generated from this
                        # (child) row's feature type
                        new_parent_id = get_new_id(args.id_prefix, cols[2], parent_id, map_ofh)
                        idmap[parent_id] = new_parent_id

                    cols[8] = "ID={0};Parent={1}".format(new_feat_id, new_parent_id)

                ofh.write( "\t".join(cols) + "\n" )
    finally:
        if map_ofh is not None:
            map_ofh.close()
def append_organism_names_to_gff(file_path, poly_orgs):
    """Annotate polypeptide rows of a GFF3 file, in place, with an organism name.

    The file is rewritten through a ``<file_path>.orgtmp`` temporary file
    which then replaces the original.  The ID of the most recent row whose
    type ends in ``RNA`` is looked up in ``poly_orgs``; when present, the
    escaped value is appended to the next ``polypeptide`` row's attributes as
    ``top_organism_from_blast``.  A warning is printed when nothing matched.

    Args:
        file_path: path of the GFF3 file to update.
        poly_orgs: mapping of RNA feature ID to organism name.
    """
    tmp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    # we have to write to a temp file and copy over
    with open(file_path) as fin, open(tmp_path, 'wt') as fout:
        for line in fin:
            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9:
                if cols[2].endswith('RNA'):
                    last_RNA_id = gff.column_9_value(cols[8], 'ID')

                elif cols[2] == 'polypeptide' and last_RNA_id in poly_orgs:
                    # Write the escaped name: the raw value could contain
                    # characters that are reserved inside column 9.
                    cols[8] += ";top_organism_from_blast={0}".format(
                        gff.escape(poly_orgs[last_RNA_id]))
                    orgs_found += 1
                    line = "\t".join(cols)

            fout.write("{0}\n".format(line))

    if orgs_found == 0:
        print(
            "WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all.  This might be an error."
        )

    ## now move the temp file over the original copy (os.replace overwrites
    #   an existing destination on every platform)
    os.replace(tmp_path, file_path)
def append_organism_names_to_gff(file_path, poly_orgs):
    """Add ``top_organism_from_blast`` to polypeptide rows of a GFF3 file in place.

    Lines are streamed into ``<file_path>.orgtmp``, which replaces the
    original once complete.  Each ``polypeptide`` row is matched against the
    ID of the closest preceding row whose type ends in ``RNA``; if that ID is
    a key of ``poly_orgs`` the escaped organism name is appended to column 9.
    Prints a warning if no polypeptide row was annotated.

    Args:
        file_path: path of the GFF3 file to update.
        poly_orgs: mapping of RNA feature ID to organism name.
    """
    tmp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    # we have to write to a temp file and copy over
    with open(file_path) as fin, open(tmp_path, 'wt') as fout:
        for line in fin:
            line = line.rstrip()
            cols = line.split("\t")
            is_feature = len(cols) == 9

            if is_feature and cols[2].endswith('RNA'):
                last_RNA_id = gff.column_9_value(cols[8], 'ID')

            if is_feature and cols[2] == 'polypeptide' and last_RNA_id in poly_orgs:
                # use the escaped value; the raw name may hold characters
                # that are reserved inside column 9
                organism = gff.escape(poly_orgs[last_RNA_id])
                cols[8] += ";top_organism_from_blast={0}".format(organism)
                orgs_found += 1
                line = "\t".join(cols)

            fout.write("{0}\n".format(line))

    if orgs_found == 0:
        print("WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all.  This might be an error.")

    ## now move the temp file over the original copy (os.replace overwrites
    #   an existing destination on every platform)
    os.replace(tmp_path, file_path)
def main():
    """Swap start/stop on CDS rows whose stop coordinate is less than the start.

    Comment lines, rows without exactly nine tab-separated columns and all
    other features are copied through.  Each CDS that gets swapped is
    reported on STDOUT by its ID.
    """
    parser = argparse.ArgumentParser( description='Reverses CDS coodinates where stop < start')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    # context managers make sure both handles are closed, even on error
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9 and cols[2] == 'CDS' and int(cols[4]) < int(cols[3]):
                cols[3], cols[4] = cols[4], cols[3]
                feat_id = gff.column_9_value(cols[8], 'ID')
                print("CDS reversed: {0}".format(feat_id))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line) )
def main():
    """Make every exon's ``Parent`` point at the RNA row that precedes it.

    The input GFF3 (``-i``) is streamed to the output (``-o``).  Comment
    lines and rows without exactly nine tab-separated columns pass through
    untouched.  The ID of each row whose type ends in ``RNA`` is remembered;
    any later ``exon`` whose ``Parent`` does not match it is rewritten and an
    INFO line is printed.
    """
    parser = argparse.ArgumentParser( description='Updates exon Parent attributes to point at the correct RNA feature')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    last_rna_id = None

    # context managers make sure both handles are closed, even on error
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line) )
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent  = gff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id

            elif cols[2] == 'exon':
                # With no RNA seen yet there is nothing to correct against;
                # leave the exon alone instead of writing a None parent.
                if last_rna_id is not None and parent != last_rna_id:
                    print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})".format(feat_id, last_rna_id, cols[2]) )
                    cols[8] = gff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                    line = "\t".join(cols)

            ofh.write("{0}\n".format(line) )
def main():
    """Give every parentless RNA row a synthetic ``gene`` parent.

    Comment lines and rows without exactly nine tab-separated columns pass
    through untouched.  When a row's type ends in ``RNA`` and it has no
    ``Parent``, a duplicate of the row is emitted first as a ``gene`` with ID
    ``<rna id>.gene``, then the RNA row is emitted with that ID as ``Parent``.
    """
    parser = argparse.ArgumentParser( description='Adds gene features for RNAs which lack them')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    # context managers make sure both handles are closed, even on error
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line) )
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent  = gff.column_9_value(cols[8], 'Parent')

            if not (cols[2].endswith('RNA') and parent is None):
                ofh.write("{0}\n".format(line) )
                continue

            gene_id = "{0}.gene".format(feat_id)

            # the gene row is the RNA row with its type and ID replaced
            gene_cols = list(cols)
            gene_cols[2] = 'gene'
            gene_cols[8] = gff.set_column_9_value(gene_cols[8], 'ID', gene_id)
            ofh.write("{0}\n".format("\t".join(gene_cols)) )

            cols[8] = gff.set_column_9_value(cols[8], 'Parent', gene_id)
            ofh.write("{0}\n".format("\t".join(cols)) )
def main():
    """Drop repeated child features from a GFF3 file.

    Comment lines are copied through and rows without exactly nine
    tab-separated columns are skipped (not written).  Rows with no ``Parent``
    are always kept.  For the rest, a row is a duplicate when an earlier row
    had the same molecule, Parent, type, start and stop; duplicates are
    reported on STDOUT and omitted from the output.
    """
    parser = argparse.ArgumentParser( description='Removes duplicate features in a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    args = parser.parse_args()

    # just reduce the keys to a string:
    # "molecule__parent__type__start__stop"
    # (a set keeps each membership test constant-time on large files)
    found = set()

    with open(args.input) as infile, open(args.output, 'wt') as outfile:
        for line in infile:
            if line.startswith('#'):
                outfile.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            parent = gff.column_9_value(cols[8], 'Parent')
            feat_type = cols[2]
            mol_id = cols[0]

            if parent is None:
                outfile.write("{0}\n".format(line))
                continue

            id_string = "{0}__{1}__{2}__{3}__{4}".format(mol_id, parent, feat_type, cols[3], cols[4])

            if id_string in found:
                print("INFO: duplicate feature to be removed:\n{0}\n".format(line) )
                continue

            found.add(id_string)
            outfile.write("{0}\n".format(line) )
Example #13
0
def main():
    """Reverse the coordinates of CDS rows where column 5 is less than column 4.

    Streams ``-i/--input`` to ``-o/--output``.  Comment lines, rows without
    exactly nine tab-separated columns and every other feature pass through
    as-is.  A "CDS reversed" message naming the feature ID is printed for
    every row that is changed.
    """
    parser = argparse.ArgumentParser(
        description='Reverses CDS coodinates where stop < start')

    ## output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # context managers make sure both handles are closed, even on error
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            if cols[2] == 'CDS' and int(cols[4]) < int(cols[3]):
                cols[3], cols[4] = cols[4], cols[3]
                feat_id = gff.column_9_value(cols[8], 'ID')
                print("CDS reversed: {0}".format(feat_id))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Convert Augustus output (GTF or GFF flavoured) into GFF3.

    Only ``gene``, ``transcript`` and ``CDS`` rows are used.  When column 9
    does not start with ``ID`` or ``Parent`` the row is treated as GTF and
    column 9 is rewritten into ``ID=...;Parent=...`` form first.  A gene,
    its mRNAs, CDSs and one derived exon per CDS are built with the
    ``things`` classes.  When a ``# end gene `` comment is reached, the
    comment lines accumulated since the previous gene are written, followed
    by the gene itself.

    Raises:
        Exception: on a GTF row whose column 9 does not match the expected
            pattern, a repeated transcript ID, a CDS whose parent transcript
            has not been seen, or an ``# end gene`` comment with no gene.
    """
    parser = argparse.ArgumentParser( description='Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    assemblies = dict()

    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    ## GTF column 9 shapes (raw strings, with the literal '.' escaped):
    #   g1  ->  ID=g1
    #   g1.t1  ->  ID=g1.t1;Parent=g1
    #   transcript_id "g1.t1"; gene_id "g1";  ->  ID=g1.t1.cds;Parent=g1.t1
    gene_re = re.compile(r'(g\d+)')
    transcript_re = re.compile(r'((g\d+)\.t\d+)')
    cds_re = re.compile(r'transcript_id "(g\d+\.t\d+)"; gene_id "g\d+";')

    with open(args.input) as infile, open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        for line in infile:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    if gene is None:
                        raise Exception("ERROR: found an end-of-gene comment without a preceding gene row: {0}".format(line.rstrip()))

                    ## purge the comments, then write the gene
                    fout.write( "".join(current_gene_comment_lines) )
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            # the input can be in GTF or GFF.  We need to reformat the 9th column for the GTF entries
            if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
                if feat_type == 'gene':
                    m_gene = gene_re.match(cols[8])
                    if m_gene:
                        cols[8] = "ID={0}".format(m_gene.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'transcript':
                    m_transcript = transcript_re.match(cols[8])
                    if m_transcript:
                        cols[8] = "ID={0};Parent={1}".format(m_transcript.group(1), m_transcript.group(2))
                    else:
                        raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
                elif feat_type == 'CDS':
                    m_CDS = cds_re.match(cols[8])
                    if m_CDS:
                        cols[8] = "ID={0}.cds;Parent={0}".format(m_CDS.group(1))
                    else:
                        raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))

            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            # GFF coordinates are 1-based inclusive; fmin is stored 0-based
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if feat_type == "gene":
                gene = things.Gene(id=feat_id)
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

            elif feat_type == "transcript":
                mRNA = things.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception( "ERROR: two different mRNAs found with same ID: {0}".format(feat_id) )
                else:
                    exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = gff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # Attach to the transcript named by Parent, not merely the
                # most recently read transcript row.
                parent_mRNA = mRNAs[parent_id]

                CDS = things.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]) )
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = things.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                parent_mRNA.add_exon(exon)
Example #15
0
def main():
    """Add ``locus_tag`` attributes to gene rows and the RNA rows beneath them.

    Rows without exactly nine tab-separated columns are copied through.  A
    ``gene`` row takes its locus tag from the ``--id_file`` mapping when its
    ID is listed there; otherwise one is generated from the prefix, the
    optional molecule token and a zero-padded counter.  Generated tags that
    collide with an already assigned tag are regenerated.  Rows whose type
    ends in ``RNA`` inherit the tag of their parent gene.  All other rows are
    written unchanged.

    Raises:
        Exception: when a ``--custom`` mode lacks its required options, a
            bmicroti mapped ID does not match the expected convention, a
            molecule is missing from ``--molecule_map``, a mapped locus tag
            has already been assigned, or an RNA's parent gene is unknown.
    """
    parser = argparse.ArgumentParser(description="Adds locus tag identifiers to GFF3 features")

    ## output file to be written
    parser.add_argument("-i", "--input_file", type=str, required=True, help="TA file of source molecules")
    parser.add_argument("-o", "--output_file", type=str, required=False, help="Optional output file path (else STDOUT)")
    parser.add_argument("-p", "--prefix", type=str, required=True, help="The prefix portion of IDs to be generated")
    parser.add_argument(
        "-a",
        "--padding",
        type=int,
        required=True,
        help="Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.",
    )
    parser.add_argument(
        "-n", "--interval", type=int, required=False, default=1, help="Interval between generated identifiers"
    )
    parser.add_argument(
        "-s",
        "--starting_id",
        type=int,
        required=False,
        default=0,
        help="Initial numeric portion of IDs to be generated (do not zero-pad)",
    )
    parser.add_argument(
        "-d",
        "--id_file",
        type=str,
        required=False,
        help="Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)",
    )
    parser.add_argument(
        "-m",
        "--molecule_map",
        type=str,
        required=False,
        help="Pass a 2-column file of molecule->token identifiers (see documentation)",
    )
    parser.add_argument(
        "-c", "--custom", type=str, required=False, help="For custom parsing steps.  Most should ignore this."
    )

    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)
    # a set keeps the "already assigned?" test constant-time
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == "joana":
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for old_id in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r"TP(\d\d)_(\d+)", id_mapping[old_id])
            if m:
                id_mapping[old_id] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2))

    elif args.custom == "bmicroti":
        microti_map = {"I": "01", "II": "02", "III": "03", "IV": "04"}

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for old_id in id_mapping:
            m = re.match(r"BBM_(\D+)(\d+)", id_mapping[old_id])
            if m:
                print("Changing id from {0} to ".format(old_id))
                id_mapping[old_id] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2))
                print(id_mapping[old_id])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[old_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as infile:
            for line in infile:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (
                    args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]
                ):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID and Parent attributes, if any
                feat_id = gff.column_9_value(cols[8], "ID")
                parent = gff.column_9_value(cols[8], "Parent")
                feat_type = cols[2]

                if feat_type == "gene":
                    # number of candidates rejected so far for this gene
                    retries = 0

                    while True:
                        if feat_id in id_mapping:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            elif cols[0] not in mol_mapping:
                                raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]))
                            elif args.custom == "bmicroti":
                                # Continue from the last numeric portion
                                # handed out; retries advance the candidate so
                                # a collision cannot repeat forever.
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix,
                                    str(int(last_number_portion_assigned) + 1 + retries).zfill(args.padding),
                                    mol_mapping[cols[0]],
                                )
                            else:
                                locus_id = "{0}_{2}g{1}".format(
                                    args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]]
                                )

                            next_id += args.interval

                        ## make sure this wasn't generated already (possibly conflict between --id_file and an
                        #   auto-generated ID?
                        if locus_id not in loci_assigned:
                            break

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id))

                        # A mapped ID yields the same value on every attempt,
                        # so retrying would loop forever.
                        if feat_id in id_mapping:
                            raise Exception(
                                "ERROR: locus tag ({0}) mapped to gene ({1}) was already assigned".format(
                                    locus_id, feat_id
                                )
                            )

                        retries += 1

                    cols[8] = gff.set_column_9_value(cols[8], "locus_tag", locus_id)
                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith("RNA"):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(cols[8], "locus_tag", gene_loci[parent])
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file we opened ourselves
        if fout is not sys.stdout:
            fout.close()
def main():
    """Convert Cufflinks GTF output into GFF3.

    Command-line options:
      -i/--input_file   path to the Cufflinks GTF file (required)
      -o/--output_file  path of the GFF3 file to create; STDOUT when omitted
      -e/--export_mode  'model' (default) or 'cDNA_match'

    In 'model' mode every 'transcript' row becomes an mRNA attached to a Gene
    keyed by its gene_id, and every 'exon' row adds both a CDS and an Exon to
    the current mRNA.  In 'cDNA_match' mode every 'transcript' row becomes a
    Match and every 'exon' row a MatchPart of it.  Rows that do not have
    exactly nine tab-separated columns are reported and skipped.

    Raises:
        Exception: if --export_mode is not one of the two supported values.
    """
    parser = argparse.ArgumentParser( description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    parser.add_argument('-e', '--export_mode', type=str, required=False, default='model', help='Export mode for results (model or cDNA_match)' )
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception("ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'")

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None
        current_match = None
        current_CDS_phase = 0

        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            # each line is one GTF row
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                # GTF start is 1-based; stored here as 0-based, end-exclusive
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so gff.column_9_value(str, key) can be used
                col9 = cols[8].replace(' "', '="')
                gene_id       = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        # a new gene_id means the previous gene is complete, so export it
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = things.Gene(id=gene_id)
                            current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                        mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                        mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as( fh=ofh, source='Cufflinks', format='gff3' )

                        current_match = things.Match(id=transcript_id, subclass='cDNA_match', length=fmax - fmin)
                        current_match.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                elif ftype == 'exon':
                    exon_number = gff.column_9_value(col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=current_CDS_phase )
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = things.MatchPart(id=mp_id, parent=current_match, length=fmax - fmin)
                        mp.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                        current_match.add_part(mp)

        # don't forget to do the last gene/match, if there were any.  The source
        # label must match the one used for every earlier record.
        if args.export_mode == 'model':
            if current_gene is not None:
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as( fh=ofh, source='Cufflinks', format='gff3' )
    finally:
        # never close STDOUT, only a file this function opened itself
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Remove childless, parentless features from a GFF3 file.

    Command-line options:
      -i/--input   path to the input GFF3 file (required)
      -o/--output  path of the GFF3 file to write (required)

    The input is read twice.  The first pass records, per line, whether the
    line is kept unconditionally (comments, rows without nine columns, and
    any feature that has a Parent attribute) and which IDs are referenced as
    a Parent.  The second pass writes the kept lines and, for features
    without a Parent, writes them only when some other feature named their ID
    as its Parent; everything else is reported and dropped.
    """
    parser = argparse.ArgumentParser(
        description='Removes orphaned features in a GFF3 file')

    ## output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    #parser.add_argument('-t', '--type', type=str, required=False, help='Type of features to remove' )
    args = parser.parse_args()

    # going to try saving memory by tracking one flag per line instead of storing all of it
    #  true means keep the line, false means it still has to prove it has children
    # doing tracking this way since it's technically legal for a feature to have no identifier at all.
    keep_line = list()
    # key = feature ID, value = True once any feature names that ID as its Parent
    has_children = dict()

    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                keep_line.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                keep_line.append(True)
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # decided in the second pass, once all children are known
                keep_line.append(False)

                # register the feature under its own ID, without clobbering
                # a True already set by a child that appeared earlier
                if feat_id is not None and feat_id not in has_children:
                    has_children[feat_id] = False
            else:
                keep_line.append(True)
                has_children[parent] = True

        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_line, infile):
                if keep:
                    outfh.write(line)
                    continue

                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) == 9:
                    feat_id = gff.column_9_value(cols[8], 'ID')

                    if feat_id is not None and has_children.get(feat_id, False):
                        outfh.write("{0}\n".format(line))
                    else:
                        print("WARN: removing this line: {0}".format(line))
Example #18
0
def main():
    """Convert Cufflinks GTF output into GFF3.

    Command-line options:
      -i/--input_file   path to the Cufflinks GTF file (required)
      -o/--output_file  path of the GFF3 file to create; STDOUT when omitted
      -e/--export_mode  'model' (default) or 'cDNA_match'

    In 'model' mode every 'transcript' row becomes an mRNA attached to a Gene
    keyed by its gene_id, and every 'exon' row adds both a CDS and an Exon to
    the current mRNA.  In 'cDNA_match' mode every 'transcript' row becomes a
    Match and every 'exon' row a MatchPart of it.  Rows that do not have
    exactly nine tab-separated columns are reported and skipped.

    Raises:
        Exception: if --export_mode is not one of the two supported values.
    """
    parser = argparse.ArgumentParser(
        description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GTF file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output GFF file to be created')
    parser.add_argument('-e',
                        '--export_mode',
                        type=str,
                        required=False,
                        default='model',
                        help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception(
            "ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'"
        )

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None
        current_match = None
        current_CDS_phase = 0

        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            # each line is one GTF row
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype = cols[2]
                # GTF start is 1-based; stored here as 0-based, end-exclusive
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so gff.column_9_value(str, key) can be used
                col9 = cols[8].replace(' "', '="')
                gene_id = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(
                    col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        # a new gene_id means the previous gene is complete, so export it
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh,
                                                  source='Cufflinks',
                                                  format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = things.Gene(id=gene_id)
                            current_gene.locate_on(target=current_assembly,
                                                   fmin=fmin,
                                                   fmax=fmax,
                                                   strand=strand)

                        mRNA = things.mRNA(id=transcript_id,
                                           parent=current_gene)
                        mRNA.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as(fh=ofh,
                                                   source='Cufflinks',
                                                   format='gff3')

                        current_match = things.Match(id=transcript_id,
                                                     subclass='cDNA_match',
                                                     length=fmax - fmin)
                        current_match.locate_on(target=current_assembly,
                                                fmin=fmin,
                                                fmax=fmax,
                                                strand=strand)

                elif ftype == 'exon':
                    exon_number = gff.column_9_value(
                        col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        CDS = things.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly,
                                      fmin=fmin,
                                      fmax=fmax,
                                      strand=strand,
                                      phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        current_CDS_phase = 3 - ((
                            (fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(
                            current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = things.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly,
                                       fmin=fmin,
                                       fmax=fmax,
                                       strand=strand)
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(
                            transcript_id, exon_number)
                        mp = things.MatchPart(id=mp_id,
                                              parent=current_match,
                                              length=fmax - fmin)
                        mp.locate_on(target=current_assembly,
                                     fmin=fmin,
                                     fmax=fmax,
                                     strand=strand)
                        current_match.add_part(mp)

        # don't forget to do the last gene/match, if there were any.  The source
        # label must match the one used for every earlier record.
        if args.export_mode == 'model':
            if current_gene is not None:
                current_gene.print_as(fh=ofh,
                                      source='Cufflinks',
                                      format='gff3')

        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as(fh=ofh,
                                       source='Cufflinks',
                                       format='gff3')
    finally:
        # never close STDOUT, only a file this function opened itself
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Write the IDs of organism-1 mRNAs supported by both AAT and BLAST.

    An mRNA is selected when (a) at least one AAT protein alignment on the
    same assembly overlaps it with both the mRNA coverage and the target
    coverage at or above --aat_percent_coverage_cutoff, and (b) its ID is the
    query (column 1) of a BLAST btab row passing the e-value and percent
    identity cutoffs.  Selected IDs are written one per line to
    --output_id_list; when that option is omitted a file name is built from
    the three cutoff values.

    Raises:
        Exception: if a match_part row precedes any match row, if an overlap
            is larger than the mRNA itself, or if an alignment target has no
            entry in the FASTA database given via -aatdb.
    """
    parser = argparse.ArgumentParser(description="Put a description of your script here")

    parser.add_argument("-a", "--organism1_annotation", type=str, required=True, help="Annotation GFF for organism 1")
    parser.add_argument(
        "-p", "--organism1_aat_alignments", type=str, required=True, help="Path to AAT GFF3 (match/match_part)"
    )
    parser.add_argument(
        "-aatdb", "--aat_fasta_db", type=str, required=True, help="Path to FASTA database that was used in AAT"
    )
    parser.add_argument(
        "-b",
        "--organism1_blast_alignments",
        type=str,
        required=True,
        help="Path to BLASTp btab file vs.organism 2 proteins",
    )
    parser.add_argument(
        "-be", "--blast_eval_cutoff", type=float, required=False, default=1e-5, help="BLAST e-value cutoff"
    )
    parser.add_argument(
        "-bpi", "--blast_percent_identity_cutoff", type=float, required=False, default=0, help="BLAST %identity cutoff"
    )
    parser.add_argument(
        "-ppc",
        "--aat_percent_coverage_cutoff",
        type=float,
        required=False,
        default=0,
        help="% coverage of the query protein by the AAT match",
    )
    parser.add_argument(
        "-o", "--output_id_list", type=str, required=False, help="List of IDs from organism1 that passed"
    )
    args = parser.parse_args()

    # set to an mRNA ID to restrict the comparison to that single transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff, args.aat_percent_coverage_cutoff
        )

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith("#") or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], "ID").replace('"', "")
            target = gff.column_9_value(cols[8], "Target")
            # keep only the first whitespace-delimited token of the Target value
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == "nucleotide_to_protein_match":
                current_match = things.Match(
                    id=feature_id, target_id=target, subclass="nucleotide_to_protein_match", length=fmax - fmin
                )
                current_match.locate_on(target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand)

                # Register the match on its own assembly right away.  Parts
                # are added to this same object afterwards, so nothing is lost,
                # and neither the final match nor a match preceding an assembly
                # change can be dropped or filed under the wrong assembly.
                aat_matches[assembly_id].append(current_match)
                aat_match_count += 1

            elif cols[2] == "match_part":
                if current_match is None:
                    raise Exception(
                        "ERROR: found match_part {0} before any nucleotide_to_protein_match".format(feature_id)
                    )

                parent_id = gff.column_9_value(cols[8], "Parent").replace('"', "")
                match_part = things.MatchPart(id=feature_id, parent=parent_id, length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id], fmin=fmin, fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():

                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})".format(overlap_size, mRNA.length)
                        )

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(
                                aat_match.target_id
                            )
                        )

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]["s"]) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(
                        mRNA, aat_match, match_target_length
                    )

                    if (
                        mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                        and target_percent_coverage >= args.aat_percent_coverage_cutoff
                    ):
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column index 19 is compared against the e-value cutoff and
            # index 10 against the percent identity cutoff
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [o1_mRNA_id for o1_mRNA_id in o1_with_aat if o1_mRNA_id in top_blast_hits]

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2))
    )

    with open(args.output_id_list, "wt") as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
Example #20
0
    last_gene = None
    
    for qry_gene in things:
        if qry_gene.id in handled_ids:
            continue
        
        ## mark this one as handled
        handled_ids[qry_gene.id] = 1
        nonoverlapping_set.append(qry_gene)

<<<<<<< .mine
        current_assembly = assemblies[mol_id]
        rfmin = int(cols[3]) - 1
        rfmax = int(cols[4])
        rstrand = None
        feat_id = gff.column_9_value(cols[8], 'ID')
        parent_id = gff.column_9_value(cols[8], 'Parent')
        parent_feat = None
        
        if parent_id is not None:
            if parent_id in features:
                parent_feat = features[parent_id]
            else:
                raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) )

        #print("Processing feature: ({0})".format(feat_id))

        if cols[6] == '-':
            strand = -1
        elif cols[6] == '+':
            strand = 1
def _finish_pasa_gene(gene, mRNA, assembly, fmin, fmax, strand, fh, source):
    """Locate a completed gene/mRNA pair on its assembly and write it as GFF3."""
    mRNA.locate_on( target=assembly, fmin=fmin, fmax=fmax, strand=strand )
    gene.locate_on( target=assembly, fmin=fmin, fmax=fmax, strand=strand )
    gene.add_mRNA(mRNA)
    assembly.add_gene( gene )
    gene.print_as(fh=fh, source=source, format='gff3')


def main():
    """Convert a PASA GFF file of cDNA_match rows into gene models.

    Command-line options:
      -i/--input   path to the GFF file created by PASA (required)
      -o/--output  path of the GFF3 file to create (required)
      -s/--source  value for the 2nd (source) column; defaults to 'PASA'

    Consecutive rows sharing the same ID attribute are grouped into one gene
    with a single mRNA; every row contributes one CDS and one exon.  The
    gene and mRNA span the minimum start to the maximum end of their rows and
    take the strand of the first row.  Rows without exactly nine
    tab-separated columns are ignored.

    Raises:
        Exception: if a nine-column row has a type other than 'cDNA_match'.
    """
    parser = argparse.ArgumentParser( description='Convert PASA GFF file to canonical gene models')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by PASA' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    parser.add_argument('-s', '--source', type=str, required=False, default='PASA', help='Value to use for the 2nd (source) column' )
    args = parser.parse_args()

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNA = None
    gene_fmin = None
    gene_fmax = None
    gene_strand = None

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        with open(args.input) as fin:
            for line in fin:
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                mol_id = cols[0]
                feat_type = cols[2]
                feat_id = gff.column_9_value(cols[8], 'ID')

                # we expect all columns to be cDNA_match
                if feat_type != 'cDNA_match':
                    raise Exception("ERROR: expected all columns to be of type 'cDNA_match' but found a {0}".format(feat_type))

                ## initialize this assembly if we haven't seen it yet
                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                # 1-based inclusive start converted to 0-based, end-exclusive
                row_fmin = int(cols[3]) - 1
                row_fmax = int(cols[4])
                row_strand = cols[6]

                if gene is None or feat_id != gene.id:
                    if gene is not None:
                        # finish the previous one first; current_assembly still
                        # refers to the assembly of the previous row here
                        _finish_pasa_gene(gene, mRNA, current_assembly, gene_fmin, gene_fmax,
                                          gene_strand, fout, args.source)

                    # now start a new one
                    gene = things.Gene(id=feat_id)
                    mRNA = things.mRNA(id="{0}.mRNA".format(feat_id), parent=gene)
                    exon_count_by_mRNA[mRNA.id] = 0

                    gene_fmin = row_fmin
                    gene_fmax = row_fmax
                    gene_strand = row_strand

                current_assembly = assemblies[mol_id]

                # each row is a new CDS/exon for the current mRNA
                CDS = things.CDS(id="{0}.CDS".format(feat_id), parent=mRNA.id)
                # FIX THIS PHASE
                CDS.locate_on( target=current_assembly, fmin=row_fmin, fmax=row_fmax, strand=row_strand, phase='.' )
                mRNA.add_CDS(CDS)

                exon_count_by_mRNA[mRNA.id] += 1
                exon_id = "{0}.exon{1}".format(mRNA.id, exon_count_by_mRNA[mRNA.id])
                exon = things.Exon(id=exon_id, parent=mRNA.id)
                exon.locate_on( target=current_assembly, fmin=row_fmin, fmax=row_fmax, strand=row_strand )
                mRNA.add_exon(exon)

                # grow the gene span to cover this row
                gene_fmin = min(gene_fmin, row_fmin)
                gene_fmax = max(gene_fmax, row_fmax)

        # don't orphan the last one
        if gene is not None:
            _finish_pasa_gene(gene, mRNA, current_assembly, gene_fmin, gene_fmax,
                              gene_strand, fout, args.source)
def main():
    """Write the IDs of organism-1 mRNAs supported by both AAT and BLAST.

    An mRNA is selected when (a) at least one AAT protein alignment on the
    same assembly overlaps it with both the mRNA coverage and the target
    coverage at or above --aat_percent_coverage_cutoff, and (b) its ID is the
    query (column 1) of a BLAST btab row passing the e-value and percent
    identity cutoffs.  Selected IDs are written one per line to
    --output_id_list; when that option is omitted a file name is built from
    the three cutoff values.

    Raises:
        Exception: if a match_part row precedes any match row, if an overlap
            is larger than the mRNA itself, or if an alignment target has no
            entry in the FASTA database given via -aatdb.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-a',
                        '--organism1_annotation',
                        type=str,
                        required=True,
                        help='Annotation GFF for organism 1')
    parser.add_argument('-p',
                        '--organism1_aat_alignments',
                        type=str,
                        required=True,
                        help='Path to AAT GFF3 (match/match_part)')
    parser.add_argument('-aatdb',
                        '--aat_fasta_db',
                        type=str,
                        required=True,
                        help='Path to FASTA database that was used in AAT')
    parser.add_argument('-b',
                        '--organism1_blast_alignments',
                        type=str,
                        required=True,
                        help='Path to BLASTp btab file vs.organism 2 proteins')
    parser.add_argument('-be',
                        '--blast_eval_cutoff',
                        type=float,
                        required=False,
                        default=1e-5,
                        help='BLAST e-value cutoff')
    parser.add_argument('-bpi',
                        '--blast_percent_identity_cutoff',
                        type=float,
                        required=False,
                        default=0,
                        help='BLAST %identity cutoff')
    parser.add_argument(
        '-ppc',
        '--aat_percent_coverage_cutoff',
        type=float,
        required=False,
        default=0,
        help='% coverage of the query protein by the AAT match')
    parser.add_argument('-o',
                        '--output_id_list',
                        type=str,
                        required=False,
                        help='List of IDs from organism1 that passed')
    args = parser.parse_args()

    # set to an mRNA ID to restrict the comparison to that single transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = gff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = utils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            if assembly_id not in aat_matches:
                aat_matches[assembly_id] = list()

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = gff.column_9_value(cols[8], 'ID').replace('"', '')
            target = gff.column_9_value(cols[8], 'Target')
            # keep only the first whitespace-delimited token of the Target value
            m = re.search(r"^(\S+)", target)
            if m:
                target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                current_match = things.Match(
                    id=feature_id,
                    target_id=target,
                    subclass='nucleotide_to_protein_match',
                    length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id],
                                        fmin=fmin,
                                        fmax=fmax,
                                        strand=strand)

                # Register the match on its own assembly right away.  Parts
                # are added to this same object afterwards, so nothing is lost,
                # and neither the final match nor a match preceding an assembly
                # change can be dropped or filed under the wrong assembly.
                aat_matches[assembly_id].append(current_match)
                aat_match_count += 1

            elif cols[2] == 'match_part':
                if current_match is None:
                    raise Exception(
                        "ERROR: found match_part {0} before any nucleotide_to_protein_match"
                        .format(feature_id))

                parent_id = gff.column_9_value(cols[8],
                                               'Parent').replace('"', '')
                match_part = things.MatchPart(id=feature_id,
                                              parent=parent_id,
                                              length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id],
                                     fmin=fmin,
                                     fmax=fmax,
                                     strand=strand)
                current_match.add_part(match_part)

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():

                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".
                              format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception(
                            "ERROR: overlap size ({0}) > mRNA length ({1})"
                            .format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception(
                            "ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb"
                            .format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(
                        aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage, target_percent_coverage
                     ) = calculate_fragmented_coverage(
                         mRNA, aat_match, match_target_length)

                    if (mRNA_percent_coverage >= args.aat_percent_coverage_cutoff
                            and target_percent_coverage >= args.aat_percent_coverage_cutoff):
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".
        format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            # column index 19 is compared against the e-value cutoff and
            # index 10 against the percent identity cutoff
            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print(
        "INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    o1_with_o2 = [
        o1_mRNA_id for o1_mRNA_id in o1_with_aat
        if o1_mRNA_id in top_blast_hits
    ]

    print(
        "INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2"
        .format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
def main():
    """Convert glimmerHMM GFF output to GFF3.

    Reads --input_file and writes --output_file.  Lines starting with '#'
    are copied through unchanged; lines without exactly 9 tab-separated
    columns are dropped.

    Each 'mRNA' row is written twice: first as a 'gene' row (identical to
    the input row except for the type column), then as the mRNA row with
    its ID and Name set to "<ID>.mRNA".  Each 'CDS' row is preceded by an
    'exon' row with the same coordinates (phase column set to '.'); both
    rows get Parent "<Parent>.mRNA".  Rows of any other type are not
    written.

    Raises:
        ValueError: if the start or end column of a 9-column row is not an
            integer.
    """
    parser = argparse.ArgumentParser( description='Converts glimmerHMM GFF output to GFF3')

    # input file to be read and output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to parse' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )

    args = parser.parse_args()

    # number of exon rows emitted so far, keyed by the original Parent value
    next_exon_num = defaultdict(int)

    # Context managers make sure both handles are flushed and closed, even
    # when a malformed row raises part-way through the conversion.  The
    # output is opened first, as before.
    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as fin:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # The coordinates are not used below; the conversions are kept so
            # that non-numeric coordinates still raise ValueError as before.
            int(cols[3])
            int(cols[4])

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # the gene row keeps the original attributes (and so the original ID)
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                # NOTE(review): the mRNA row is not given a Parent attribute
                # pointing at the gene row here -- confirm whether downstream
                # consumers need one.
                cols[8] = gff.set_column_9_value(cols[8], 'ID', "{0}.mRNA".format(feat_id))
                cols[8] = gff.set_column_9_value(cols[8], 'Name', "{0}.mRNA".format(feat_id))
                cols[8] = gff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write( "{0}\n".format("\t".join(gene_cols)) )
                fout.write( "{0}\n".format("\t".join(cols)) )

            elif feat_type == 'CDS':
                # copy taken before the CDS attributes are rewritten
                exon_cols = list(cols)

                cols[8] = gff.set_column_9_value(cols[8], 'ID', "{0}.cds".format(parent))
                cols[8] = gff.set_column_9_value(cols[8], 'Name', "{0}.cds".format(parent))
                cols[8] = gff.set_column_9_value(cols[8], 'Parent', "{0}.mRNA".format(parent))
                cols[8] = gff.order_column_9(cols[8])

                # exon numbering starts at 0 for each parent
                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent] )
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'ID', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Name', exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Parent', "{0}.mRNA".format(parent))
                exon_cols[8] = gff.order_column_9(exon_cols[8])

                fout.write( "{0}\n".format("\t".join(exon_cols)) )
                fout.write( "{0}\n".format("\t".join(cols)) )
Example #24
0
def main():
    """Convert GFF output from Prodigal into GFF3 format.

    Reads --input and writes --output (UTF-8), starting with a
    "##gff-version 3" header.  Comment lines and rows without exactly 9
    tab-separated columns are ignored.  Every 'CDS' row becomes a gene
    (ID taken from the row's ID attribute) with one mRNA ("<ID>.t1"), one
    CDS ("<ID>.t1.cds") and one exon ("<ID>.t1.exon1"), all sharing the
    row's coordinates and strand.  The gene is written immediately with
    source 'Prodigal_v2.6.3'.  Rows of any other type produce no output.

    Raises:
        Exception: if two CDS rows yield the same mRNA ID.
    """
    parser = argparse.ArgumentParser(
        description='Convert GFF output from Prodigal into GFF3 format')

    ## input file to be read and output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to a GFF file created by Prodigal')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()

    ## Used for tracking the exon count for each mRNA (for ID purposes)
    exon_count_by_mRNA = dict()

    # context managers guarantee the output is flushed and both files closed
    with open(args.output, mode='wt', encoding='utf-8') as fout, \
            open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            # Only CDS rows define a gene.  Previously any other feature type
            # fell through to gene.print_as() with gene still None and
            # aborted the run with an AttributeError.
            if feat_type != "CDS":
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            mRNA_id = feat_id + '.t1'

            # GFF coordinates are 1-based inclusive; fmin is shifted down by one
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            # gene
            gene = things.Gene(id=feat_id)
            gene.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)

            # mRNA
            mRNA = things.mRNA(id=mRNA_id, parent=gene)
            mRNA.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)
            gene.add_mRNA(mRNA)

            # The counts are keyed by mRNA ID, so the duplicate test has to
            # use the mRNA ID too (it used the bare feature ID before and
            # therefore never matched).
            if mRNA_id in exon_count_by_mRNA:
                raise Exception(
                    "ERROR: two different mRNAs found with same ID: {0}".
                    format(feat_id))
            exon_count_by_mRNA[mRNA_id] = 0

            # CDS
            CDS = things.CDS(id=mRNA_id + '.cds', parent=mRNA)
            CDS.locate_on(target=current_assembly,
                          fmin=fmin,
                          fmax=fmax,
                          strand=strand,
                          phase=int(cols[7]))
            mRNA.add_CDS(CDS)

            # exons weren't explicitly defined in the input file, so we need to derive new IDs for them
            exon_count_by_mRNA[mRNA_id] += 1
            exon_id = "{0}.exon{1}".format(mRNA_id,
                                           exon_count_by_mRNA[mRNA_id])

            exon = things.Exon(id=exon_id, parent=mRNA)
            exon.locate_on(target=current_assembly,
                           fmin=fmin,
                           fmax=fmax,
                           strand=strand)
            mRNA.add_exon(exon)

            gene.print_as(fh=fout, source='Prodigal_v2.6.3', format='gff3')
Example #25
0
def main():
    """Trim the CDSs of 'canonical.flawed.gff3' to the ILRI polypeptide ranges.

    Loads the gene models from the flawed GFF3 file, reads the polypeptide
    rows of the ILRI GFF (keyed by their Parent attribute), and for every
    mRNA that has a polypeptide compares each CDS against it: CDSs that
    compare as strictly less than or greater than the polypeptide are
    deleted, and CDSs that compare as <= / >= have their fmin / fmax set to
    the polypeptide's.  Every gene is then written to
    'canonical.corrected.gff3' with source 'GenBank'.

    The meaning of the comparison operators is defined by the `things`
    classes and is not visible here -- presumably "entirely before/after"
    for < / > and "overlapping the start/end" for <= / >=; confirm there.

    Raises:
        KeyError: if a polypeptide row names a molecule that is not among
            the assemblies loaded from the flawed GFF3 file.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    # the context manager guarantees the output is flushed and closed
    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = gff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        # key = parent (mRNA) ID, value = Polypeptide located on its assembly
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                polypeptide_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                polypeptides[parent] = things.Polypeptide(id=polypeptide_id,
                                                          parent=parent)
                polypeptides[parent].locate_on(target=assemblies[cols[0]],
                                               fmin=int(cols[3]) - 1,
                                               fmax=int(cols[4]),
                                               strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(
            len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print(
                            "DEBUG: {0} not found as a parent to any polypeptide".
                            format(mRNA.id))
                        # Nothing to trim against.  Previously execution fell
                        # through and used the polypeptide of the previous
                        # mRNA (or hit a NameError on the very first one).
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # Iterate over a snapshot: delete_CDS() may modify the
                    # collection CDSs() returns (not visible from here).
                    for CDS in list(mRNA.CDSs()):
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
Example #26
0
def main():
    """Filter the genes of a GFF3 file by the IDs of their mRNA children.

    Reads the mRNA IDs to keep from --id_list (one per line; lines of two
    characters or fewer are ignored), then streams --input_gff3.  Rows
    without exactly 9 tab-separated columns are skipped.  Each 'gene' row
    starts a new buffer; following rows are added to it while the most
    recent mRNA of that gene is in the keep list.  A gene is written only
    if at least one row besides the gene row itself was buffered.  Output
    goes to --output_file, or STDOUT when it is not given, and starts with
    a "##gff-version 3" header.
    """
    # The text must be passed as description=; the first positional
    # parameter of ArgumentParser is `prog` (the program name in usage).
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    ## input files and the (optional) output file to be written
    parser.add_argument('-i',
                        '--input_gff3',
                        type=str,
                        required=True,
                        help='GFF3 file of source molecules')
    parser.add_argument('-l',
                        '--id_list',
                        type=str,
                        required=True,
                        help='List file of mRNA IDs to keep')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    def write_gene(gene_lines):
        # a buffer holding only the gene row means no child was kept
        if len(gene_lines) > 1:
            for li in gene_lines:
                fout.write("{0}\n".format(li))

    try:
        # a set gives constant-time membership tests for every mRNA row
        ids_to_keep = set()

        with open(args.id_list) as id_fh:
            for line in id_fh:
                line = line.rstrip()
                if len(line) > 2:
                    ids_to_keep.add(line)

        fout.write("##gff-version 3\n")

        current_gene_lines = list()
        keep = False

        with open(args.input_gff3) as gff_fh:
            for line in gff_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                feat_id = gff.column_9_value(cols[8], 'ID')
                feat_type = cols[2]

                if feat_type == 'gene':
                    # purge the previous gene, if any
                    write_gene(current_gene_lines)

                    # reset; keep must not carry over from the previous
                    # gene's last mRNA into this gene's rows
                    current_gene_lines = [line]
                    keep = False

                else:
                    if feat_type == 'mRNA':
                        keep = feat_id in ids_to_keep

                    if keep:
                        current_gene_lines.append(line)

        # don't forget the last gene -- nothing follows it to trigger a purge
        write_gene(current_gene_lines)

    finally:
        # never close STDOUT, only a file opened here
        if fout is not sys.stdout:
            fout.close()
Example #27
0
def main():
    """Add locus_tag attributes to the gene and RNA features of a GFF3 file.

    Streams --input_file to --output_file (or STDOUT).  Rows without
    exactly 9 tab-separated columns are copied through.  Each 'gene' row
    receives a locus_tag: the value from --id_file when the gene ID is
    mapped there, otherwise a generated one built from --prefix, a
    zero-padded counter (--starting_id, --interval, --padding) and, when
    --molecule_map is given, the molecule's token.  Generated tags that
    were already assigned are skipped.  Each row whose type ends in 'RNA'
    inherits the locus_tag of its parent gene.  All other rows are written
    unchanged.

    The --custom values 'joana' and 'bmicroti' first rewrite the mapped IDs
    into the "<prefix>_<NN>g<number>" convention.

    Raises:
        Exception: if a --custom mode is used without --molecule_map and
            --id_file, if a 'bmicroti' mapped ID does not match the
            expected pattern, if a molecule is missing from
            --molecule_map, if a mapped locus tag was already assigned,
            or if an RNA row's parent gene has not been seen yet.
    """
    parser = argparse.ArgumentParser( description='Adds locus tag identifiers to GFF3 features')

    ## input file to be read and the options controlling ID generation
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-a', '--padding', type=int, required=True, help='Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.' )
    parser.add_argument('-n', '--interval', type=int, required=False, default=1, help='Interval between generated identifiers' )
    parser.add_argument('-s', '--starting_id', type=int, required=False, default=0, help='Initial numeric portion of IDs to be generated (do not zero-pad)' )
    parser.add_argument('-d', '--id_file', type=str, required=False, help='Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)')
    parser.add_argument('-m', '--molecule_map', type=str, required=False, help='Pass a 2-column file of molecule->token identifiers (see documentation)')
    parser.add_argument('-c', '--custom', type=str, required=False, help='For custom parsing steps.  Most should ignore this.')

    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping  = parse_mapping_file( args.id_file )
    mol_mapping = parse_mapping_file( args.molecule_map )

    # every locus tag handed out so far; a set keeps the duplicate test O(1)
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")
        else:
            ## need to process the ID map to reformat IDs
            for old_id in id_mapping:
                # TP05_0002 -> TpMuguga_05g00002
                # (raw string: '\d' in a plain literal is an invalid escape)
                m = re.match(r'TP(\d\d)_(\d+)', id_mapping[old_id])
                if m:
                    id_mapping[old_id] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2) )

    elif args.custom == 'bmicroti':
        microti_map = { 'I':'01', 'II':'02', 'III':'03', 'IV':'04' }

        if  args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")
        else:
            for old_id in id_mapping:
                m = re.match(r'BBM_(\D+)(\d+)', id_mapping[old_id])
                if m:
                    print("Changing id from {0} to ".format(old_id))
                    id_mapping[old_id] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2) )
                    print(id_mapping[old_id])
                else:
                    raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[old_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # trailing digits of the most recently assigned locus tag (int 0 until one matches)
    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id) )
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID and Parent attributes, if any
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_type == 'gene':
                    if feat_id in id_mapping:
                        locus_id = id_mapping[feat_id]

                        # A mapped tag cannot be "retried": the old retry loop
                        # recomputed the same value forever in this case.
                        if locus_id in loci_assigned:
                            raise Exception("ERROR: locus tag {0} mapped for gene {1} was already assigned".format(locus_id, feat_id))
                    else:
                        # first bmicroti candidate; advanced on every retry so
                        # the loop always makes progress
                        bmicroti_number = int(last_number_portion_assigned) + 1

                        while True:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            elif cols[0] in mol_mapping:
                                if args.custom == 'bmicroti':
                                    locus_id = "{0}_{2}g{1}".format(args.prefix, str(bmicroti_number).zfill(args.padding), mol_mapping[cols[0]])
                                    bmicroti_number += 1
                                else:
                                    locus_id = "{0}_{2}g{1}".format(args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]])
                            else:
                                raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]) )

                            next_id += args.interval

                            ## make sure this wasn't generated already (possibly conflict between --id_file and an
                            #   auto-generated ID?
                            if locus_id not in loci_assigned:
                                break

                            print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id) )

                    cols[8] = gff.set_column_9_value(cols[8], 'locus_tag', locus_id)

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(cols[8], 'locus_tag', gene_loci[parent])
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file opened here
        if fout is not sys.stdout:
            fout.close()
Example #28
0
def main():
    """Convert StringTie GTF output into GFF3.

    Reads --input_file and writes to --output_file (or STDOUT), starting
    with a "##gff-version 3" header.  Lines without exactly 9 tab-separated
    columns are reported on STDERR and skipped.  Consecutive 'transcript'
    rows with the same gene_id are grouped into one gene, located at the
    coordinates of its first transcript; each transcript becomes an mRNA.
    Every 'exon' row adds both an exon and a CDS with the same coordinates
    to the current mRNA, with the CDS phase carried over from the length
    of the preceding CDS.  A gene is written when the next gene starts and
    once more at the end of the input.
    """
    parser = argparse.ArgumentParser( description='A GTF -> GFF3 conversion script for StringTie output')

    ## input file to be read and the (optional) output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None

        # number of exon rows seen so far for each transcript
        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    # diagnostics go to STDERR so they can never end up inside
                    # the GFF3 when the output itself is STDOUT
                    print("SKIPPING: {0}".format(line), file=sys.stderr)
                    continue

                mol_id = cols[0]

                if mol_id not in assemblies:
                    assemblies[mol_id] = things.Assembly(id=mol_id)

                current_assembly = assemblies[mol_id]
                ftype  = cols[2]
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so I can use biocodeutils.column_9_value(str, key)
                col9 = cols[8].replace(' "', '="')
                gene_id       = gff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = gff.column_9_value(col9, 'transcript_id').replace('"', '')

                # 'cov' and 'exon_number' used to be parsed here as well but
                # were never used; their .replace() calls were dropped so they
                # can no longer abort a row that lacks those attributes.

                if ftype == 'transcript':
                    # a new gene_id closes the previous gene
                    if current_gene is not None and current_gene.id != gene_id:
                        current_gene.print_as(fh=ofh, source='StringTie', format='gff3')

                    if current_gene is None or current_gene.id != gene_id:
                        current_gene = things.Gene(id=gene_id)
                        current_gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

                    mRNA = things.mRNA(id=transcript_id, parent=current_gene)
                    mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    current_gene.add_mRNA(mRNA)
                    current_RNA = mRNA
                    exon_count_by_RNA[transcript_id] = 0
                    current_CDS_phase = 0

                elif ftype == 'exon':
                    exon_count_by_RNA[transcript_id] += 1

                    cds_id = "{0}.CDS.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    CDS = things.CDS(id=cds_id, parent=current_RNA)
                    CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=current_CDS_phase )
                    current_RNA.add_CDS(CDS)

                    # calculate the starting phase for the next CDS feature (in case there is one)
                    current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                    if current_CDS_phase == 3:
                        current_CDS_phase = 0

                    exon_id = "{0}.exon.{1}".format( current_RNA.id, exon_count_by_RNA[current_RNA.id] )
                    exon = things.Exon(id=exon_id, parent=current_RNA)
                    exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                    current_RNA.add_exon(exon)

        # don't forget to do the last gene, if there were any
        if current_gene is not None:
            current_gene.print_as(fh=ofh, source='StringTie', format='gff3')
    finally:
        # never close STDOUT, only a file opened here
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert glimmerHMM GFF output to GFF3.

    Reads --input_file and writes --output_file.  Lines starting with '#'
    are copied through unchanged; lines without exactly 9 tab-separated
    columns are dropped.

    Each 'mRNA' row is written twice: first as a 'gene' row (identical to
    the input row except for the type column), then as the mRNA row with
    its ID and Name set to "<ID>.mRNA".  Each 'CDS' row is preceded by an
    'exon' row with the same coordinates (phase column set to '.'); both
    rows get Parent "<Parent>.mRNA".  Rows of any other type are not
    written.

    Raises:
        ValueError: if the start or end column of a 9-column row is not an
            integer.
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    # input file to be read and output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')

    args = parser.parse_args()

    # number of exon rows emitted so far, keyed by the original Parent value
    next_exon_num = defaultdict(int)

    # Context managers make sure both handles are flushed and closed, even
    # when a malformed row raises part-way through the conversion.  The
    # output is opened first, as before.
    with open(args.output_file, 'w') as fout, \
            open(args.input_file, 'r') as fin:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # The coordinates are not used below; the conversions are kept so
            # that non-numeric coordinates still raise ValueError as before.
            int(cols[3])
            int(cols[4])

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # the gene row keeps the original attributes (and so the
                # original ID)
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                # NOTE(review): the mRNA row is not given a Parent attribute
                # pointing at the gene row here -- confirm whether downstream
                # consumers need one.
                mRNA_id = "{0}.mRNA".format(feat_id)
                cols[8] = gff.set_column_9_value(cols[8], 'ID', mRNA_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', mRNA_id)
                cols[8] = gff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # copy taken before the CDS attributes are rewritten
                exon_cols = list(cols)

                cds_id = "{0}.cds".format(parent)
                new_parent = "{0}.mRNA".format(parent)

                cols[8] = gff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Name', cds_id)
                cols[8] = gff.set_column_9_value(cols[8], 'Parent',
                                                 new_parent)
                cols[8] = gff.order_column_9(cols[8])

                # exon numbering starts at 0 for each parent
                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'ID',
                                                      exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Name',
                                                      exon_id)
                exon_cols[8] = gff.set_column_9_value(exon_cols[8], 'Parent',
                                                      new_parent)
                exon_cols[8] = gff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
def main():
    """Remove parentless features that no other feature refers to.

    Makes two passes over --input.  The first pass records, per line,
    whether it can be kept outright (comment lines, lines without exactly
    9 tab-separated columns, and features that have a Parent attribute)
    and which IDs are named as a Parent by some feature.  The second pass
    writes the kept lines to --output and, for each feature without a
    Parent, writes it only if its ID was named as a Parent; otherwise a
    warning with the removed line is printed.
    """
    parser = argparse.ArgumentParser( description='Removes orphaned features in a GFF3 file')

    ## input file to be read and output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write' )
    #parser.add_argument('-t', '--type', type=str, required=False, help='Type of features to remove' )
    args = parser.parse_args()

    # going to try saving memory by tracking line numbers instead of storing all of it
    #  true means keep the line, false means it is decided in the second pass
    # doing tracking this way since it's technically legal for a feature to have no identifier at all.
    keep_line = list()

    # key = feature ID, value = True once some feature names it as Parent
    has_children = dict()

    # context managers guarantee both files are closed (and the output flushed)
    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                keep_line.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                keep_line.append(True)
                continue

            feat_id = gff.column_9_value(cols[8], 'ID')
            parent = gff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # this might be overwritten later
                keep_line.append(False)

                # Register the feature under its own ID (the old code used
                # the Parent value, which is always None in this branch).
                # Never downgrade an ID a child has already claimed.
                if feat_id is not None and feat_id not in has_children:
                    has_children[feat_id] = False
            else:
                keep_line.append(True)
                has_children[parent] = True

        # second pass over the same handle
        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_line, infile):
                if keep:
                    outfh.write(line)
                    continue

                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) == 9:
                    feat_id = gff.column_9_value(cols[8], 'ID')

                    if feat_id is not None and has_children.get(feat_id, False):
                        outfh.write("{0}\n".format(line))
                    else:
                        print("WARN: removing this line: {0}".format(line))
def main():
    """Add locus_tag attributes to the gene and RNA features of a GFF3 file.

    Streams --input_file to --output_file (or STDOUT).  Rows without
    exactly 9 tab-separated columns are copied through.  Each 'gene' row
    receives a locus_tag: the value from --id_file when the gene ID is
    mapped there, otherwise a generated one built from --prefix, a
    zero-padded counter (--starting_id, --interval, --padding) and, when
    --molecule_map is given, the molecule's token.  Generated tags that
    were already assigned are skipped.  Each row whose type ends in 'RNA'
    inherits the locus_tag of its parent gene.  All other rows are written
    unchanged.

    The --custom values 'joana' and 'bmicroti' first rewrite the mapped IDs
    into the "<prefix>_<NN>g<number>" convention.

    Raises:
        Exception: if a --custom mode is used without --molecule_map and
            --id_file, if a 'bmicroti' mapped ID does not match the
            expected pattern, if a molecule is missing from
            --molecule_map, if a mapped locus tag was already assigned,
            or if an RNA row's parent gene has not been seen yet.
    """
    parser = argparse.ArgumentParser(
        description='Adds locus tag identifiers to GFF3 features')

    ## input file to be read and the options controlling ID generation
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='TA file of source molecules')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Optional output file path (else STDOUT)')
    parser.add_argument('-p',
                        '--prefix',
                        type=str,
                        required=True,
                        help='The prefix portion of IDs to be generated')
    parser.add_argument(
        '-a',
        '--padding',
        type=int,
        required=True,
        help=
        'Specify the minimum with to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.'
    )
    parser.add_argument('-n',
                        '--interval',
                        type=int,
                        required=False,
                        default=1,
                        help='Interval between generated identifiers')
    parser.add_argument(
        '-s',
        '--starting_id',
        type=int,
        required=False,
        default=0,
        help='Initial numeric portion of IDs to be generated (do not zero-pad)'
    )
    parser.add_argument(
        '-d',
        '--id_file',
        type=str,
        required=False,
        help=
        'Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)'
    )
    parser.add_argument(
        '-m',
        '--molecule_map',
        type=str,
        required=False,
        help=
        'Pass a 2-column file of molecule->token identifiers (see documentation)'
    )
    parser.add_argument(
        '-c',
        '--custom',
        type=str,
        required=False,
        help='For custom parsing steps.  Most should ignore this.')

    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)

    # every locus tag handed out so far; a set keeps the duplicate test O(1)
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception(
                "ERROR: Expected --molecule_map and --id_file options when using --custom=joana"
            )
        else:
            ## need to process the ID map to reformat IDs
            for old_id in id_mapping:
                # TP05_0002 -> TpMuguga_05g00002
                # (raw string: '\d' in a plain literal is an invalid escape)
                m = re.match(r'TP(\d\d)_(\d+)', id_mapping[old_id])
                if m:
                    id_mapping[old_id] = "{0}_{1}g0{2}".format(
                        args.prefix, m.group(1), m.group(2))

    elif args.custom == 'bmicroti':
        microti_map = {'I': '01', 'II': '02', 'III': '03', 'IV': '04'}

        if args.molecule_map is None or args.id_file is None:
            raise Exception(
                "ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti"
            )
        else:
            for old_id in id_mapping:
                m = re.match(r'BBM_(\D+)(\d+)', id_mapping[old_id])
                if m:
                    print("Changing id from {0} to ".format(old_id))
                    id_mapping[old_id] = "{0}_{1}g{2}".format(
                        args.prefix, microti_map[m.group(1)], m.group(2))
                    print(id_mapping[old_id])
                else:
                    raise Exception(
                        "ERROR: id ({0}) didn't match expected convention.".
                        format(id_mapping[old_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # trailing digits of the most recently assigned locus tag (int 0 until one matches)
    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                if last_molecule is None or (
                        args.molecule_map is not None
                        and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".
                          format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID and Parent attributes, if any
                feat_id = gff.column_9_value(cols[8], 'ID')
                parent = gff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                # Developer notes carried over verbatim from the original
                # author (their relation to the duplicate handling below is
                # not documented -- treat as unresolved):
                #
                # issue

                # 66F4EEF2E3C863C251F831817FF71233
                # 7F1917E4D81A959078C9A38E15488BC0
                # E22888670919A4A888572155F40F2654
                # B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
                # gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel

                # errors on: BmicrotiR1_01g00233 -> BBM_I00233
                #5800A4110A62E4EAE57AFAD1F8D65CB3        BBM_I00233

                if feat_type == 'gene':
                    if feat_id in id_mapping:
                        locus_id = id_mapping[feat_id]

                        # A mapped tag cannot be "retried": the old retry loop
                        # recomputed the same value forever in this case.
                        if locus_id in loci_assigned:
                            raise Exception(
                                "ERROR: locus tag {0} mapped for gene {1} was already assigned"
                                .format(locus_id, feat_id))
                    else:
                        # first bmicroti candidate; advanced on every retry so
                        # the loop always makes progress
                        bmicroti_number = int(last_number_portion_assigned) + 1

                        while True:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(
                                    args.prefix,
                                    str(next_id).zfill(args.padding))
                            elif cols[0] in mol_mapping:
                                if args.custom == 'bmicroti':
                                    locus_id = "{0}_{2}g{1}".format(
                                        args.prefix,
                                        str(bmicroti_number).zfill(
                                            args.padding),
                                        mol_mapping[cols[0]])
                                    bmicroti_number += 1
                                else:
                                    locus_id = "{0}_{2}g{1}".format(
                                        args.prefix,
                                        str(next_id).zfill(args.padding),
                                        mol_mapping[cols[0]])
                            else:
                                raise Exception(
                                    "ERROR: --molecule_map passed but {0} wasn't found in it."
                                    .format(cols[0]))

                            next_id += args.interval

                            ## make sure this wasn't generated already (possibly conflict between --id_file and an
                            #   auto-generated ID?
                            if locus_id not in loci_assigned:
                                break

                            print(
                                "DEBUG: Duplicate ID assigned ({0}), trying again."
                                .format(locus_id))

                    cols[8] = gff.set_column_9_value(cols[8], 'locus_tag',
                                                     locus_id)

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = gff.set_column_9_value(
                            cols[8], 'locus_tag', gene_loci[parent])
                    else:
                        raise Exception(
                            "ERROR: found RNA {0} whose parent {1} wasn't found yet"
                            .format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close STDOUT, only a file opened here
        if fout is not sys.stdout:
            fout.close()
def _augustus_gtf_col9_to_gff3(feat_type, col9):
    """Translate an Augustus GTF-style 9th column into GFF3 attributes.

    The layouts handled are:

        gene row:        g1                                    ->  ID=g1
        transcript row:  g1.t1                                 ->  ID=g1.t1;Parent=g1
        CDS row:         transcript_id "g1.t1"; gene_id "g1";  ->  ID=g1.t1.cds;Parent=g1.t1

    Args:
        feat_type: Value of column 3 for the row ('gene', 'transcript' or 'CDS').
        col9: Raw text of the row's 9th column.

    Returns:
        The GFF3 attribute string for the row.  For any other feature type
        col9 is returned unchanged.

    Raises:
        Exception: If col9 does not have the layout expected for feat_type.
    """
    # Raw strings are required so '\d' reaches the regex engine instead of
    # being treated as an (invalid) string escape; '\.' matches a literal dot.
    if feat_type == 'gene':
        m = re.match(r'(g\d+)', col9)
        if m:
            return "ID={0}".format(m.group(1))
    elif feat_type == 'transcript':
        m = re.match(r'((g\d+)\.t\d+)', col9)
        if m:
            return "ID={0};Parent={1}".format(m.group(1), m.group(2))
    elif feat_type == 'CDS':
        m = re.match(r'transcript_id "(g\d+\.t\d+)"; gene_id "g\d+";', col9)
        if m:
            return "ID={0}.cds;Parent={0}".format(m.group(1))
    else:
        return col9

    raise Exception("ERROR: GTF detected but {0} row has bad 9th column format: {1}".format(feat_type, col9))


def main():
    """Convert Augustus output (native GTF or its --gff variant) into GFF3.

    Reads the file given by -i/--input and writes GFF3 to -o/--output.  Only
    'gene', 'transcript' and 'CDS' rows are used; every other feature row and
    any row without exactly 9 tab-separated columns is ignored.  Because exons
    are not read from the input, one exon is derived from each CDS row using
    the same coordinates.

    Comment lines are buffered and written, followed by the gene itself, when
    that gene's '# end gene ' line is reached.  Comments that come after the
    final '# end gene ' line are therefore never written.

    Raises:
        Exception: On a malformed GTF 9th column, a duplicate transcript ID,
            a transcript or '# end gene ' line with no current gene, or a CDS
            whose parent transcript has not been seen within the current gene.
    """
    parser = argparse.ArgumentParser( description='Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## input and output file paths
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus' )
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    ## one things.Assembly per molecule (column 1), created on first sight
    assemblies = dict()

    ## state for the gene currently being assembled; reset at each '# end gene '
    gene = None
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each mRNA (for ID purposes).  This
    #   is deliberately never reset so duplicate transcript IDs are caught
    #   across the whole file.
    exon_count_by_mRNA = dict()

    ## context managers guarantee both handles are flushed and closed, even
    #   when an exception aborts the conversion.  The output is opened first,
    #   as before, so the header is written before the input is touched.
    with open(args.output, mode='wt', encoding='utf-8') as fout, open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    if gene is None:
                        raise Exception("ERROR: found '{0}' but no gene row preceded it".format(line.rstrip()))

                    ## purge the comments, then write the gene
                    fout.write( "".join(current_gene_comment_lines) )
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                continue

            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ('gene', 'transcript', 'CDS'):
                continue

            ## The output format is GTF by default and (mostly) GFF if the --gff option is used.
            #   A 9th column that doesn't already start with ID or Parent is treated as GTF and
            #   rewritten as GFF3 attributes so the libraries can use it.
            if not cols[8].startswith(('ID', 'Parent')):
                cols[8] = _augustus_gtf_col9_to_gff3(feat_type, cols[8])

            feat_id = gff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = things.Assembly(id=mol_id)

            current_assembly = assemblies[mol_id]

            ## the start coordinate is decremented by one before being handed to
            #   locate_on (presumably 1-based GFF -> 0-based interbase; confirm
            #   against the things module)
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if feat_type == "gene":
                gene = things.Gene(id=feat_id)
                gene.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )

            elif feat_type == "transcript":
                if gene is None:
                    raise Exception("ERROR: found transcript {0} but no gene row preceded it".format(feat_id))

                if feat_id in exon_count_by_mRNA:
                    raise Exception( "ERROR: two different mRNAs found with same ID: {0}".format(feat_id) )

                mRNA = things.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA
                exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = gff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                ## attach to the mRNA named by Parent, not merely the most
                #   recently read transcript
                parent_mRNA = mRNAs[parent_id]

                CDS = things.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]) )
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = things.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on( target=current_assembly, fmin=fmin, fmax=fmax, strand=strand )
                parent_mRNA.add_exon(exon)