예제 #1
0
def main():
    """Add a gene feature for every RNA feature that has no Parent.

    Reads the GFF3 file given by -i/--input and writes the result to
    -o/--output.  Lines starting with '#' are copied through unchanged, and
    lines that do not split into exactly 9 tab-separated columns are written
    back with trailing whitespace stripped.

    For each feature whose type (column 3) ends with 'RNA' and whose column 9
    has no Parent value, two rows are written: a copy of the row retyped as
    'gene' with ID '<rna id>.gene', followed by the RNA row with its Parent
    set to that same gene ID.  All other feature rows pass through as-is.
    """
    parser = argparse.ArgumentParser(
        description='Adds gene features for RNAs which lack them')

    ## input and output file paths
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # Context managers guarantee both handles are flushed and closed, even
    # if a malformed line raises part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA') and parent is None:
                # NOTE(review): if the RNA row has no ID attribute this
                # presumably yields 'None.gene' -- confirm against
                # biocodegff.column_9_value.
                gene_id = "{0}.gene".format(feat_id)

                # The synthetic gene reuses the RNA's coordinates and other
                # columns; only the type and the ID differ.
                gene_cols = list(cols)
                gene_cols[2] = 'gene'
                gene_cols[8] = biocodegff.set_column_9_value(
                    gene_cols[8], 'ID', gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)))

                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent',
                                                        gene_id)
                ofh.write("{0}\n".format("\t".join(cols)))
            else:
                ofh.write("{0}\n".format(line))
def main():
    """Re-point exon Parent attributes at the most recently seen RNA feature.

    Reads the GFF3 file given by -i/--input and writes the result to
    -o/--output.  Lines starting with '#' are copied through unchanged, and
    lines that do not split into exactly 9 tab-separated columns are written
    back with trailing whitespace stripped.

    The ID of each feature whose type ends with 'RNA' is remembered.  Any
    later 'exon' row whose Parent differs from that remembered ID has its
    Parent rewritten to it, and an INFO message is printed to stdout.  This
    relies on exons following their RNA in file order.
    """
    parser = argparse.ArgumentParser(
        description=
        'Updates exon Parent attributes to point at the correct RNA feature')

    ## input and output file paths
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # ID of the closest preceding *RNA feature; None until one is seen.
    last_rna_id = None

    # Context managers guarantee both handles are flushed and closed, even
    # if processing fails part-way through the file.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id

            elif cols[2] == 'exon' and parent != last_rna_id:
                # NOTE(review): an exon seen before any RNA is compared
                # against (and re-parented to) None -- confirm how
                # biocodegff.set_column_9_value handles that value.
                print(
                    "INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})"
                    .format(feat_id, last_rna_id, cols[2]))
                cols[8] = biocodegff.set_column_9_value(
                    cols[8], 'Parent', last_rna_id)
                ofh.write("{0}\n".format("\t".join(cols)))
                continue

            # RNA rows, correctly-parented exons and every other feature
            # type are emitted unchanged.
            ofh.write("{0}\n".format(line))
def main():
    """Write a copy of a GFF3 file in which every parentless RNA gains a gene.

    Command-line options: -i/--input (GFF3 to read) and -o/--output (GFF3 to
    write).  Comment lines ('#' prefix) are copied verbatim; rows that are
    not exactly 9 tab-separated columns are echoed with trailing whitespace
    removed.

    A row whose type ends in "RNA" and which has no Parent attribute is
    preceded by a new "gene" row (same columns, ID "<rna id>.gene") and is
    itself rewritten so its Parent is that gene ID.
    """
    parser = argparse.ArgumentParser(description="Adds gene features for RNAs which lack them")

    ## input and output file paths
    parser.add_argument("-i", "--input", type=str, required=True, help="Path to the input GFF3 file")
    parser.add_argument("-o", "--output", type=str, required=True, help="Output GFF3 file to write")
    args = parser.parse_args()

    # Opening both files in a with-block closes them on every exit path, so
    # the output is always flushed and no descriptors are leaked.
    with open(args.input) as infile, open(args.output, "wt") as ofh:
        for line in infile:

            if line.startswith("#"):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            rna_id = biocodegff.column_9_value(cols[8], "ID")
            parent = biocodegff.column_9_value(cols[8], "Parent")

            if cols[2].endswith("RNA") and parent is None:
                gene_id = "{0}.gene".format(rna_id)

                # Gene row: identical to the RNA row except for type and ID.
                gene_cols = list(cols)
                gene_cols[2] = "gene"
                gene_cols[8] = biocodegff.set_column_9_value(gene_cols[8], "ID", gene_id)
                ofh.write("{0}\n".format("\t".join(gene_cols)))

                # RNA row: now a child of the gene written just above.
                cols[8] = biocodegff.set_column_9_value(cols[8], "Parent", gene_id)
                ofh.write("{0}\n".format("\t".join(cols)))
            else:
                ofh.write("{0}\n".format(line))
def main():
    """Correct exon parentage in a GFF3 file based on feature order.

    Command-line options: -i/--input (GFF3 to read) and -o/--output (GFF3 to
    write).  Comment lines ('#' prefix) are copied verbatim; rows that are
    not exactly 9 tab-separated columns are echoed with trailing whitespace
    removed.

    While scanning, the ID of the latest feature whose type ends in 'RNA' is
    tracked.  Each 'exon' row whose Parent attribute is not that ID gets its
    Parent replaced with it and an INFO line is printed; everything else is
    written through unchanged.
    """
    parser = argparse.ArgumentParser(description='Updates exon Parent attributes to point at the correct RNA feature')

    ## input and output file paths
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # None until the first *RNA row has been read.
    last_rna_id = None

    # The with-block closes both files on every exit path, so the output is
    # always flushed and no descriptors are leaked.
    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:

            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                ofh.write("{0}\n".format(line))
                continue

            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if cols[2].endswith('RNA'):
                last_rna_id = feat_id
                ofh.write("{0}\n".format(line))

            elif cols[2] == 'exon':
                if parent != last_rna_id:
                    print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})".format(feat_id, last_rna_id, cols[2]))
                    cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                    ofh.write("{0}\n".format("\t".join(cols)))
                else:
                    ofh.write("{0}\n".format(line))
            else:
                ofh.write("{0}\n".format(line))
예제 #5
0
def main():
    """Convert glimmerHMM GFF output into a gene/mRNA/exon/CDS GFF3 file.

    Reads -i/--input_file and writes -o/--output_file.  Lines starting with
    '#' are copied through; lines that do not split into exactly 9
    tab-separated columns, and features other than mRNA and CDS, are dropped.

    * Each 'mRNA' row produces a 'gene' row (the original row retyped)
      followed by the mRNA row with ID and Name set to '<id>.mRNA'.
    * Each 'CDS' row produces an 'exon' row (phase column set to '.', ID and
      Name '<parent>.exon.<n>', numbered from 0 per parent) followed by the
      CDS row with ID and Name '<parent>.cds'.  Both get Parent
      '<parent>.mRNA'.

    Raises:
        ValueError: if the start or end column of a 9-column row is not an
            integer.
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    # input and output file paths
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')

    args = parser.parse_args()

    # Next exon ordinal to hand out, keyed by the CDS row's original Parent.
    next_exon_num = defaultdict(int)

    # Output is opened first, as before; the with-block closes both handles
    # even when a malformed row raises.
    with open(args.output_file, 'w') as fout, \
            open(args.input_file, 'r') as fin:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # The coordinates are not used further, but converting them
            # preserves the check that start/end are integers.
            int(cols[3])
            int(cols[4])

            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                # The gene row keeps the mRNA's original attributes.
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                mrna_id = "{0}.mRNA".format(feat_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', mrna_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name',
                                                        mrna_id)
                cols[8] = biocodegff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # Copy before the CDS attributes are rewritten so the exon
                # starts from the original column 9.
                exon_cols = list(cols)

                cds_id = "{0}.cds".format(parent)
                mrna_id = "{0}.mRNA".format(parent)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name',
                                                        cds_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent',
                                                        mrna_id)
                cols[8] = biocodegff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = biocodegff.set_column_9_value(
                    exon_cols[8], 'ID', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(
                    exon_cols[8], 'Name', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(
                    exon_cols[8], 'Parent', mrna_id)
                exon_cols[8] = biocodegff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
def main():
    """Rewrite glimmerHMM GFF predictions as GFF3 gene/mRNA/exon/CDS rows.

    Options: -i/--input_file (file to parse) and -o/--output_file (file to
    create).  '#' lines are copied to the output.  Rows without exactly 9
    tab-separated columns, and rows whose type is neither 'mRNA' nor 'CDS',
    produce no output.

    An 'mRNA' row is emitted twice: first retyped as 'gene' with its original
    attributes, then as the mRNA with ID/Name '<id>.mRNA'.  A 'CDS' row is
    emitted as an 'exon' (column 8 set to '.', ID/Name '<parent>.exon.<n>'
    with n counting from 0 per parent) and then as the CDS (ID/Name
    '<parent>.cds'); both point at Parent '<parent>.mRNA'.

    Raises:
        ValueError: if column 4 or 5 of a 9-column row is not an integer.
    """
    parser = argparse.ArgumentParser(description='Converts glimmerHMM GFF output to GFF3')

    # input and output file paths
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created')

    args = parser.parse_args()

    # Per-parent counter used to number the exons generated from CDS rows.
    next_exon_num = defaultdict(int)

    # Both files are closed by the with-block on every exit path (the
    # output is still opened before the input).
    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as fin:
        for line in fin:
            if line.startswith('#'):
                fout.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            # Kept purely as validation: non-integer coordinates still raise
            # ValueError even though the values are not used below.
            int(cols[3])
            int(cols[4])

            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', "{0}.mRNA".format(feat_id))
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name', "{0}.mRNA".format(feat_id))
                cols[8] = biocodegff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                # Snapshot the row first; the exon is derived from the
                # unmodified CDS attributes.
                exon_cols = list(cols)

                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', "{0}.cds".format(parent))
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name', "{0}.cds".format(parent))
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', "{0}.mRNA".format(parent))
                cols[8] = biocodegff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'ID', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'Name', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'Parent', "{0}.mRNA".format(parent))
                exon_cols[8] = biocodegff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
예제 #7
0
def main():
    """Add locus_tag attributes to the gene and RNA features of a GFF3 file.

    Each 'gene' row receives a locus tag that is either looked up in the
    --id_file mapping (keyed by the gene's ID) or generated from --prefix,
    an optional per-molecule token from --molecule_map, and a zero-padded
    counter that starts at --starting_id and advances by --interval.  Each
    row whose type ends in 'RNA' inherits the locus tag of its Parent gene.
    Rows without exactly 9 tab-separated columns are written through with
    trailing whitespace stripped; other feature types are re-joined and
    written unchanged.  Output goes to --output_file, or STDOUT if omitted.

    --custom=joana and --custom=bmicroti rewrite the values loaded from
    --id_file into '<prefix>_<molecule>g<number>' form before processing.

    Raises:
        Exception: if a --custom mode is used without --molecule_map and
            --id_file, if a bmicroti ID does not match the expected pattern,
            if a gene's molecule is absent from --molecule_map, if an RNA's
            parent gene has not been seen, or if a duplicate locus tag
            cannot be resolved by retrying.
    """
    parser = argparse.ArgumentParser( description='Adds locus tag identifiers to GFF3 features')

    ## command-line options
    # NOTE(review): the --input_file help text says 'TA file of source
    # molecules', but the file is parsed below as 9-column GFF3 -- confirm
    # and reword.
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)' )
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-a', '--padding', type=int, required=True, help='Specify the minimum width to reserve for the numeric portion of the IDs.  Smaller numbers will be zero-padded.' )
    parser.add_argument('-n', '--interval', type=int, required=False, default=1, help='Interval between generated identifiers' )
    parser.add_argument('-s', '--starting_id', type=int, required=False, default=0, help='Initial numeric portion of IDs to be generated (do not zero-pad)' )
    parser.add_argument('-d', '--id_file', type=str, required=False, help='Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)')
    parser.add_argument('-m', '--molecule_map', type=str, required=False, help='Pass a 2-column file of molecule->token identifiers (see documentation)')
    parser.add_argument('-c', '--custom', type=str, required=False, help='For custom parsing steps.  Most should ignore this.')

    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    # presumably parse_mapping_file returns an empty mapping when passed
    # None -- verify against its definition.
    id_mapping  = parse_mapping_file( args.id_file )
    mol_mapping = parse_mapping_file( args.molecule_map )

    # Only ever used for membership tests, so a set keeps each lookup O(1).
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")
        else:
            ## need to process the ID map to reformat IDs
            for mapped_key in id_mapping:
                # TP05_0002 -> TpMuguga_05g00002
                m = re.match(r'TP(\d\d)_(\d+)', id_mapping[mapped_key])
                if m:
                    id_mapping[mapped_key] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2) )

    elif args.custom == 'bmicroti':
        # Roman-numeral molecule label -> two-digit token used in locus tags.
        microti_map = { 'I':'01', 'II':'02', 'III':'03', 'IV':'04' }

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")
        else:
            for mapped_key in id_mapping:
                m = re.match(r'BBM_(\D+)(\d+)', id_mapping[mapped_key])
                if m:
                    print("Changing id from {0} to ".format(mapped_key))
                    id_mapping[mapped_key] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2) )
                    print(id_mapping[mapped_key])
                else:
                    raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[mapped_key]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # Trailing digits of the most recently assigned locus tag; the bmicroti
    # mode numbers new genes relative to this rather than to next_id.
    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # Without --molecule_map this fires only for the first
                # feature row; with it, whenever the molecule token changes.
                if last_molecule is None or (args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id) )
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                # Developer debugging notes retained from the original
                # (appear to relate to a duplicate-ID problem in bmicroti
                # mode):
                #   66F4EEF2E3C863C251F831817FF71233
                #   7F1917E4D81A959078C9A38E15488BC0
                #   E22888670919A4A888572155F40F2654
                #   B9D9CF1F7A8E5A2E1124F0A6C68840DC -> BBM_I00232
                #   gene before is: 6DE6BCCE69CCDC39994A0940B2ED524A - novel
                #   errors on: BmicrotiR1_01g00233 -> BBM_I00233
                #   5800A4110A62E4EAE57AFAD1F8D65CB3        BBM_I00233

                if feat_type == 'gene':
                    previous_attempt = None

                    while True:
                        if feat_id in id_mapping:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            else:
                                if cols[0] in mol_mapping:
                                    if args.custom == 'bmicroti':
                                        locus_id = "{0}_{2}g{1}".format(args.prefix, str(int(last_number_portion_assigned) + 1).zfill(args.padding), mol_mapping[cols[0]])
                                    else:
                                        locus_id = "{0}_{2}g{1}".format(args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]])
                                else:
                                    raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]) )

                            next_id += args.interval

                        cols[8] = biocodegff.set_column_9_value(cols[8], 'locus_tag', locus_id )

                        ## make sure this wasn't generated already (possibly conflict between --id_file and an
                        #   auto-generated ID?
                        if locus_id not in loci_assigned:
                            break

                        # IDs taken from --id_file (and bmicroti-generated
                        # ones) do not change between attempts, so retrying
                        # would spin forever.  Fail loudly instead.
                        if locus_id == previous_attempt:
                            raise Exception("ERROR: Duplicate locus tag ({0}) for gene ({1}) could not be resolved by retrying".format(locus_id, feat_id))

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id) )
                        previous_attempt = locus_id

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = biocodegff.set_column_9_value(cols[8], 'locus_tag', gene_loci[parent] )
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # Close the output only when this function opened it; STDOUT must
        # stay usable for the caller.
        if fout is not sys.stdout:
            fout.close()