def main():
    """Stream a GFF file and emit one transcript record per mRNA feature.

    Exon coordinates are collected for the mRNA currently being read and
    handed to ``write_transcript`` whenever an mRNA with a different ID is
    reached, and once more at the end of the input.  Exons on any strand
    other than '+' are stored with start/end swapped.  Output goes to
    --output_file when given, otherwise to STDOUT.
    """
    # The text has to be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog`` (the program name in usage).
    # NOTE(review): the wording looks copied from a filtering script.
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    parser.add_argument('-i', '--input_gff', type=str, required=True,
                        help='GFF file of source annotation')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    current_mRNA_id = None
    current_mol_id = None
    current_fragments = list()
    current_direction = None

    try:
        with open(args.input_gff) as gff_fh:
            for line in gff_fh:
                cols = line.rstrip().split("\t")

                # only 9-column feature rows are of interest
                if len(cols) != 9:
                    continue

                feat_type = cols[2]

                if feat_type == 'mRNA':
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')

                    if current_mRNA_id is not None and feat_id != current_mRNA_id:
                        # purge the existing one first
                        write_transcript(fout, current_mol_id,
                                         current_fragments, current_direction)
                        current_fragments = list()

                    current_mRNA_id = feat_id
                    current_mol_id = cols[0]
                    current_direction = cols[6]

                elif feat_type == 'exon':
                    if cols[6] == '+':
                        current_fragments.append({'start': cols[3], 'end': cols[4]})
                    else:
                        current_fragments.append({'start': cols[4], 'end': cols[3]})

        # Flush the final transcript, but only if an mRNA row was actually
        # seen; otherwise there is no molecule/strand to report.
        if current_mol_id is not None:
            write_transcript(fout, current_mol_id, current_fragments,
                             current_direction)
    finally:
        # never close the interpreter's STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Write only those genes of a GFF3 file whose mRNAs are in an ID list.

    Rows are buffered per gene.  A gene row starts a new buffer; each mRNA
    row switches ``keep`` on or off depending on whether its ID is listed,
    and every following non-gene row is buffered while ``keep`` is on.  A
    buffer is written only if it holds more than the gene row itself.
    Rows without exactly 9 columns are ignored.
    """
    # Pass the text as ``description``; the first positional parameter of
    # ArgumentParser is ``prog``.
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='GFF3 file of source molecules')
    parser.add_argument('-l', '--id_list', type=str, required=True,
                        help='List file of mRNA IDs to keep')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    def write_gene(gene_lines):
        # a buffer holding only the gene row has no retained children
        if len(gene_lines) > 1:
            for li in gene_lines:
                fout.write("{0}\n".format(li))

    try:
        # a set gives constant-time membership tests for every mRNA row
        ids_to_keep = set()
        with open(args.id_list) as id_fh:
            for line in id_fh:
                line = line.rstrip()
                if len(line) > 2:
                    ids_to_keep.add(line)

        fout.write("##gff-version 3\n")

        current_gene_lines = list()
        keep = False

        with open(args.input_gff3) as gff_fh:
            for line in gff_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                feat_type = cols[2]

                if feat_type == 'gene':
                    # purge the current gene, if any, then start a new buffer
                    write_gene(current_gene_lines)
                    current_gene_lines = [line]
                    # do not let the previous gene's decision leak into
                    # rows that precede this gene's first mRNA
                    keep = False
                else:
                    if feat_type == 'mRNA':
                        feat_id = biocodegff.column_9_value(cols[8], 'ID')
                        keep = feat_id in ids_to_keep

                    if keep:
                        current_gene_lines.append(line)

        # the last gene has no following gene row to trigger its purge
        write_gene(current_gene_lines)
    finally:
        if fout is not sys.stdout:
            fout.close()
def main():
    """Re-point exon Parent attributes at the most recently seen RNA row.

    Comment rows and rows without 9 columns are copied through.  Any row
    whose type ends in 'RNA' becomes the current RNA; each later exon whose
    Parent differs from that RNA's ID is rewritten (and reported on STDOUT).
    """
    parser = argparse.ArgumentParser(
        description='Updates exon Parent attributes to point at the correct RNA feature')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9:
                feat_type = cols[2]

                if feat_type.endswith('RNA'):
                    last_rna_id = biocodegff.column_9_value(cols[8], 'ID')

                # An exon seen before any RNA has nothing to be corrected
                # to; leave it alone rather than writing a Parent of None.
                elif feat_type == 'exon' and last_rna_id is not None:
                    parent = biocodegff.column_9_value(cols[8], 'Parent')

                    if parent != last_rna_id:
                        feat_id = biocodegff.column_9_value(cols[8], 'ID')
                        print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})"
                              .format(feat_id, last_rna_id, cols[2]))
                        cols[8] = biocodegff.set_column_9_value(
                            cols[8], 'Parent', last_rna_id)
                        line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Trim or drop CDSs so they fit the polypeptide ranges of a second file.

    Features are loaded from a hard-coded "flawed" GFF3 file and polypeptide
    coordinates (keyed by their Parent ID) from a hard-coded ILRI GFF file.
    For every mRNA that is the parent of a polypeptide, each CDS is compared
    with that polypeptide using the ordering operators of the biothings
    objects: it is deleted when ``<`` or ``>`` holds, and has its fmin/fmax
    set to the polypeptide's when ``<=`` / ``>=`` holds instead.  Each gene
    is then printed as GFF3 to the hard-coded output file.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)

        print("INFO: loaded {0} assemblies and {1} features".format(
            len(assemblies), len(features)))

        # polypeptides keyed by the ID of their parent (compared with mRNA.id below)
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')

                polypeptides[parent] = biothings.Polypeptide(id=feat_id, parent=parent)
                # GFF start is 1-based; fmin is start - 1
                polypeptides[parent].locate_on(target=assemblies[cols[0]],
                                               fmin=int(cols[3]) - 1,
                                               fmax=int(cols[4]),
                                               strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                    else:
                        polypeptide = polypeptides[mRNA.id]

                        # pull this outside of the iteration since iterating might delete some
                        CDSs = mRNA.CDSs()

                        for CDS in CDSs:
                            if CDS < polypeptide:
                                mRNA.delete_CDS(CDS)
                            elif CDS <= polypeptide:
                                CDS.location().fmin = polypeptide.location().fmin

                            if CDS > polypeptide:
                                mRNA.delete_CDS(CDS)
                            elif CDS >= polypeptide:
                                CDS.location().fmax = polypeptide.location().fmax

                # NOTE(review): assumed to run once per gene, after all of
                # its mRNAs have been processed -- confirm.
                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Replace the ID and Parent values of a GFF3 file with generated IDs.

    Each distinct original ID is mapped once to a new identifier obtained
    from ``get_new_id(prefix, feature_type, mode)``; Parent values are
    rewritten using the same map.  Rows without 9 columns are copied as-is.

    Raises:
        Exception: if a Parent value is met before a row carrying that ID.
    """
    parser = argparse.ArgumentParser(
        description='Generates new identifiers in GFF3 files following the IGS identifier convention.')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='TA file of source molecules')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    parser.add_argument('-p', '--prefix', type=str, required=True,
                        help='The prefix portion of IDs to be generated')
    parser.add_argument('-m', '--mode', type=str, required=False, default='sequential',
                        help='ID modes (see embedded documentation): sequential, uuid, hex8, hex12')
    args = parser.parse_args()
    check_arguments(args)

    # original ID -> generated ID
    id_map = dict()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.input_file) as in_fh:
            for line in in_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # grab the ID and Parent attributes, if any
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_id is not None:
                    if feat_id in id_map:
                        new_id = id_map[feat_id]
                    else:
                        new_id = get_new_id(args.prefix, feat_type, args.mode)
                        id_map[feat_id] = new_id

                    cols[8] = cols[8].replace("ID={0}".format(feat_id),
                                              "ID={0}".format(new_id))

                if parent is not None:
                    if parent not in id_map:
                        raise Exception("ERROR: parent ({0}) referenced before it was used as an ID".format(parent))

                    cols[8] = cols[8].replace("Parent={0}".format(parent),
                                              "Parent={0}".format(id_map[parent]))

                fout.write("\t".join(cols) + "\n")
    finally:
        if fout is not sys.stdout:
            fout.close()
def main():
    """Read a GFF file and write one transcript record per mRNA feature.

    For the mRNA currently being read, exon coordinates are accumulated
    (start/end swapped unless the exon strand is '+').  The accumulated
    fragments are passed to ``write_transcript`` when an mRNA with another
    ID appears and again after the last input row.  Output is written to
    --output_file if supplied, else to STDOUT.
    """
    # ``description=`` is required here: a bare positional string would be
    # taken by ArgumentParser as ``prog``.
    # NOTE(review): the wording looks copied from a filtering script.
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    parser.add_argument('-i', '--input_gff', type=str, required=True,
                        help='GFF file of source annotation')
    parser.add_argument('-o', '--output_file', type=str, required=False,
                        help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    current_mRNA_id = None
    current_mol_id = None
    current_fragments = list()
    current_direction = None

    try:
        with open(args.input_gff) as gff_fh:
            for line in gff_fh:
                cols = line.rstrip().split("\t")

                if len(cols) != 9:
                    continue

                feat_type = cols[2]

                if feat_type == 'mRNA':
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')

                    if current_mRNA_id is not None and feat_id != current_mRNA_id:
                        # purge the existing one first
                        write_transcript(fout, current_mol_id,
                                         current_fragments, current_direction)
                        current_fragments = list()

                    current_mRNA_id = feat_id
                    current_mol_id = cols[0]
                    current_direction = cols[6]

                elif feat_type == 'exon':
                    if cols[6] == '+':
                        current_fragments.append({'start': cols[3], 'end': cols[4]})
                    else:
                        current_fragments.append({'start': cols[4], 'end': cols[3]})

        # Write the trailing transcript only when at least one mRNA row was
        # read; without one there is no molecule or strand to report.
        if current_mol_id is not None:
            write_transcript(fout, current_mol_id, current_fragments,
                             current_direction)
    finally:
        # STDOUT belongs to the interpreter and must stay open
        if fout is not sys.stdout:
            fout.close()
def main():
    """Rewrite column 9 of a GFF file with freshly generated ID/Parent values.

    Every original ID (and every Parent value not yet seen as an ID) is
    mapped once through ``get_new_id(prefix, type, old_id, map_fh)``.  The
    whole 9th column is replaced by ``ID=...`` or ``ID=...;Parent=...``, so
    any other attributes on the row are not carried over.  Rows without 9
    columns are copied unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_gff', type=str, required=True,
                        help='Path to an output GFF file to be created with new IDs')
    parser.add_argument('-p', '--id_prefix', type=str, required=True,
                        help='Will be used as the base for all IDs generated')
    parser.add_argument('-m', '--output_map', type=str, required=False,
                        help='This will create a tab-delimited mapping of old and new IDs')
    args = parser.parse_args()

    ofh = open(args.output_gff, 'w')
    map_ofh = None

    try:
        if args.output_map is not None:
            map_ofh = open(args.output_map, 'w')

        # original ID -> generated ID
        idmap = dict()

        with open(args.input_file) as in_fh:
            for line in in_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    ofh.write(line + "\n")
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent_id = biocodegff.column_9_value(cols[8], 'Parent')

                if feat_id not in idmap:
                    idmap[feat_id] = get_new_id(args.id_prefix, cols[2], feat_id, map_ofh)
                new_feat_id = idmap[feat_id]

                if parent_id is None:
                    cols[8] = "ID={0}".format(new_feat_id)
                else:
                    # a parent not yet seen as an ID gets its new ID here,
                    # generated with this (child) row's feature type
                    if parent_id not in idmap:
                        idmap[parent_id] = get_new_id(args.id_prefix, cols[2], parent_id, map_ofh)

                    cols[8] = "ID={0};Parent={1}".format(new_feat_id, idmap[parent_id])

                ofh.write("\t".join(cols) + "\n")
    finally:
        # make sure both outputs are flushed even if parsing fails midway
        ofh.close()
        if map_ofh is not None:
            map_ofh.close()
def main():
    """Insert a gene row ahead of every RNA feature that has no Parent.

    For each 9-column row whose type ends in 'RNA' and which lacks a Parent
    attribute, a copy of the row is written first with type 'gene' and ID
    ``<rna_id>.gene``; the RNA row then follows with its Parent set to that
    gene ID.  All other rows are copied through.
    """
    parser = argparse.ArgumentParser(
        description='Adds gene features for RNAs which lack them')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9 and cols[2].endswith('RNA'):
                parent = biocodegff.column_9_value(cols[8], 'Parent')

                if parent is None:
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')
                    gene_id = "{0}.gene".format(feat_id)

                    # the gene row shares every column with the RNA except
                    # its type and ID
                    gene_cols = list(cols)
                    gene_cols[2] = 'gene'
                    gene_cols[8] = biocodegff.set_column_9_value(gene_cols[8], 'ID', gene_id)
                    ofh.write("{0}\n".format("\t".join(gene_cols)))

                    cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', gene_id)
                    line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def append_organism_names_to_gff(file_path, poly_orgs):
    """Add a ``top_organism_from_blast`` attribute to polypeptide rows, in place.

    The file is rewritten through a temporary ``<file_path>.orgtmp`` file
    which then replaces the original.  The ID of the most recent row whose
    type ends in 'RNA' is remembered; a following polypeptide row is
    annotated when that ID is a key of ``poly_orgs``.

    Args:
        file_path: Path of the GFF file to update.
        poly_orgs: Mapping of RNA feature ID -> organism name.
    """
    # we have to write to a temp file and copy over
    tmp_path = "{0}.orgtmp".format(file_path)
    orgs_found = 0
    last_RNA_id = None

    with open(file_path) as fin, open(tmp_path, 'wt') as fout:
        for line in fin:
            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9:
                if cols[2].endswith('RNA'):
                    last_RNA_id = biocodegff.column_9_value(cols[8], 'ID')
                elif cols[2] == 'polypeptide' and last_RNA_id in poly_orgs:
                    # write the escaped name so that characters which are
                    # special in column 9 cannot corrupt the attribute list
                    cols[8] += ";top_organism_from_blast={0}".format(
                        biocodegff.escape(poly_orgs[last_RNA_id]))
                    orgs_found += 1
                    line = "\t".join(cols)

            fout.write("{0}\n".format(line))

    if orgs_found == 0:
        print("WARNING: The --export_organism_names option was passed, but parsing failed to find any organism names at all. This might be an error.")

    ## now move the temp file over the original copy
    # os.replace overwrites an existing destination on every platform
    os.replace(tmp_path, file_path)
def main():
    """Swap the start and end of CDS rows whose end is smaller than the start.

    Comment rows and rows without 9 columns are copied unchanged.  Each
    corrected CDS is reported on STDOUT by its ID.
    """
    parser = argparse.ArgumentParser(
        description='Reverses CDS coodinates where stop < start')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9 and cols[2] == 'CDS' and int(cols[4]) < int(cols[3]):
                cols[3], cols[4] = cols[4], cols[3]
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                print("CDS reversed: {0}".format(feat_id))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Make every exon's Parent attribute name the RNA row that precedes it.

    Rows starting with '#' and rows without 9 columns pass through.  The ID
    of the latest row whose type ends in 'RNA' is tracked; an exon whose
    Parent differs from it is rewritten and the correction is printed.
    """
    parser = argparse.ArgumentParser(
        description='Updates exon Parent attributes to point at the correct RNA feature')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    last_rna_id = None

    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9:
                if cols[2].endswith('RNA'):
                    last_rna_id = biocodegff.column_9_value(cols[8], 'ID')

                # Without a preceding RNA there is no correct parent to
                # assign, so such exons are left untouched instead of being
                # given a Parent of None.
                elif cols[2] == 'exon' and last_rna_id is not None:
                    parent = biocodegff.column_9_value(cols[8], 'Parent')

                    if parent != last_rna_id:
                        feat_id = biocodegff.column_9_value(cols[8], 'ID')
                        print("INFO: correcting unexpected parentage for feature ({0}) type {2}.  Expected ({1})".format(feat_id, last_rna_id, cols[2]))
                        cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', last_rna_id)
                        line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Drop repeated child features from a GFF3 file.

    Two rows are duplicates when they share molecule, Parent, type, start
    and end.  The first occurrence is written; later ones are reported on
    STDOUT and skipped.  Comment rows and rows without a Parent are always
    written; non-comment rows without 9 columns are not written.
    """
    parser = argparse.ArgumentParser(
        description='Removes duplicate features in a GFF3 file')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # keys are (molecule, parent, type, start, stop); a set keeps the
    # membership test constant-time however many features were seen
    found = set()

    with open(args.input) as infile, open(args.output, 'wt') as outfile:
        for line in infile:
            if line.startswith('#'):
                outfile.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is not None:
                key = (cols[0], parent, cols[2], cols[3], cols[4])

                if key in found:
                    print("INFO: duplicate feature to be removed:\n{0}\n".format(line))
                    continue

                found.add(key)

            outfile.write("{0}\n".format(line))
def main():
    """Give every parentless RNA feature a gene row of its own.

    A 9-column row whose type ends in "RNA" and which has no Parent
    attribute is preceded in the output by a copy of itself typed "gene"
    with ID "<rna_id>.gene"; the RNA row is then written with that ID as
    its Parent.  Every other row is copied through.
    """
    parser = argparse.ArgumentParser(description="Adds gene features for RNAs which lack them")

    parser.add_argument("-i", "--input", type=str, required=True, help="Path to the input GFF3 file")
    parser.add_argument("-o", "--output", type=str, required=True, help="Output GFF3 file to write")
    args = parser.parse_args()

    with open(args.input) as infile, open(args.output, "wt") as ofh:
        for line in infile:
            if line.startswith("#"):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9 and cols[2].endswith("RNA"):
                parent = biocodegff.column_9_value(cols[8], "Parent")

                if parent is None:
                    feat_id = biocodegff.column_9_value(cols[8], "ID")
                    gene_id = "{0}.gene".format(feat_id)

                    # same coordinates/strand as the RNA; only type and ID differ
                    gene_cols = list(cols)
                    gene_cols[2] = "gene"
                    gene_cols[8] = biocodegff.set_column_9_value(gene_cols[8], "ID", gene_id)
                    ofh.write("{0}\n".format("\t".join(gene_cols)))

                    cols[8] = biocodegff.set_column_9_value(cols[8], "Parent", gene_id)
                    line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Remove duplicated child features from a GFF3 file.

    Rows with a Parent are keyed on molecule, Parent, type, start and end;
    only the first row for each key is written and later ones are reported
    on STDOUT.  Comment rows and Parent-less rows are always written, while
    non-comment rows without 9 columns are left out.
    """
    parser = argparse.ArgumentParser(
        description='Removes duplicate features in a GFF3 file')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # (molecule, parent, type, start, stop) tuples already written; a set
    # avoids rescanning a growing list for every feature
    found = set()

    with open(args.input) as infile, open(args.output, 'wt') as outfile:
        for line in infile:
            if line.startswith('#'):
                outfile.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is not None:
                key = (cols[0], parent, cols[2], cols[3], cols[4])

                if key in found:
                    print("INFO: duplicate feature to be removed:\n{0}\n".format(line))
                    continue

                found.add(key)

            outfile.write("{0}\n".format(line))
def main():
    """Exchange columns 4 and 5 of CDS rows where the end precedes the start.

    Rows beginning with '#' and rows without 9 columns are written as read.
    The ID of each corrected CDS is printed to STDOUT.
    """
    parser = argparse.ArgumentParser(
        description='Reverses CDS coodinates where stop < start')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    with open(args.input) as infile, open(args.output, 'wt') as ofh:
        for line in infile:
            if line.startswith('#'):
                ofh.write(line)
                continue

            line = line.rstrip()
            cols = line.split("\t")

            if len(cols) == 9 and cols[2] == 'CDS' and int(cols[4]) < int(cols[3]):
                cols[3], cols[4] = cols[4], cols[3]
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                print("CDS reversed: {0}".format(feat_id))
                line = "\t".join(cols)

            ofh.write("{0}\n".format(line))
def main():
    """Remove features that have neither a Parent nor any children.

    The input is read twice.  The first pass records, per line, whether it
    is kept unconditionally (comments, rows without 9 columns, rows with a
    Parent) and collects every Parent value seen.  The second pass writes
    the kept lines, plus any Parent-less feature whose ID was referenced as
    a Parent; the remaining Parent-less features are reported and dropped.
    """
    parser = argparse.ArgumentParser(
        description='Removes orphaned features in a GFF3 file')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Output GFF3 file to write')
    args = parser.parse_args()

    # going to try saving memory by tracking line numbers instead of storing all of it
    #  true means keep the line, false means it still has to prove it has children
    #  doing tracking this way since it's technically legal for a feature to have no identifier at all.
    lines = list()

    # every value that appeared in a Parent attribute
    referenced_parents = set()

    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                lines.append(True)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                lines.append(True)
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # decided in the second pass, once all children are known
                lines.append(False)
            else:
                lines.append(True)
                referenced_parents.add(parent)

        infile.seek(0)

        with open(args.output, 'wt') as outfh:
            for keep_line, line in zip(lines, infile):
                if keep_line:
                    outfh.write(line)
                    continue

                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) == 9:
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')

                    if feat_id is not None and feat_id in referenced_parents:
                        outfh.write("{0}\n".format(line))
                    else:
                        print("WARN: removing this line: {0}".format(line))
def main():
    """Build gene/mRNA/CDS/exon models from the cDNA_match rows of a PASA GFF.

    Consecutive rows sharing an ID form one gene with a single mRNA named
    ``<id>.mRNA``.  Every row contributes one CDS and one exon with the
    row's coordinates; the gene and mRNA span the minimum start to the
    maximum end of those rows.  Each completed gene is printed as GFF3.

    Raises:
        Exception: if a 9-column row has a type other than 'cDNA_match'.
    """
    parser = argparse.ArgumentParser(
        description='Convert PASA GFF file to canonical gene models')

    parser.add_argument('-i', '--input', type=str, required=True,
                        help='Path to a GFF file created by PASA')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='Path to an output file to be created')
    parser.add_argument('-s', '--source', type=str, required=False, default='PASA',
                        help='Value to use for the 2nd (source) column')
    args = parser.parse_args()

    def finish_gene(gene, mRNA, assembly, fmin, fmax, strand, fout):
        # place the gene and its mRNA on the assembly, link them, print
        mRNA.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        gene.locate_on(target=assembly, fmin=fmin, fmax=fmax, strand=strand)
        gene.add_mRNA(mRNA)
        assembly.add_gene(gene)
        gene.print_as(fh=fout, source=args.source, format='gff3')

    assemblies = dict()
    current_assembly = None

    gene = None
    mRNA = None
    gene_fmin = None
    gene_fmax = None
    gene_strand = None

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout, open(args.input) as infile:
        fout.write("##gff-version 3\n")

        for line in infile:
            cols = line.split("\t")

            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = biocodegff.column_9_value(cols[8], 'ID')

            # we expect all columns to be cDNA_match
            if feat_type != 'cDNA_match':
                raise Exception("ERROR: expected all columns to be of type 'cDNA_match' but found a {0}".format(feat_type))

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)

            # GFF start is 1-based; fmin is start - 1
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if gene is None or feat_id != gene.id:
                if gene is not None:
                    # finish the previous one first
                    finish_gene(gene, mRNA, current_assembly,
                                gene_fmin, gene_fmax, gene_strand, fout)

                # now start a new one
                gene = biothings.Gene(id=feat_id)
                mRNA = biothings.mRNA(id="{0}.mRNA".format(feat_id), parent=gene)
                exon_count_by_mRNA[mRNA.id] = 0

                gene_fmin = fmin
                gene_fmax = fmax
                gene_strand = strand
                current_assembly = assemblies[mol_id]

            # each row is a new CDS/exon for the current mRNA
            CDS = biothings.CDS(id="{0}.CDS".format(feat_id), parent=mRNA.id)
            # FIX THIS PHASE
            CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                          strand=strand, phase='.')
            mRNA.add_CDS(CDS)

            exon_count_by_mRNA[mRNA.id] += 1
            exon_id = "{0}.exon{1}".format(mRNA.id, exon_count_by_mRNA[mRNA.id])
            exon = biothings.Exon(id=exon_id, parent=mRNA.id)
            exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax,
                           strand=strand)
            mRNA.add_exon(exon)

            # widen the gene span to cover this row
            if fmin < gene_fmin:
                gene_fmin = fmin

            if fmax > gene_fmax:
                gene_fmax = fmax

        # don't orphan the last one
        if gene is not None:
            finish_gene(gene, mRNA, current_assembly,
                        gene_fmin, gene_fmax, gene_strand, fout)
def main():
    """List organism-1 mRNAs supported by both AAT alignments and BLAST hits.

    Steps:
      1. Load the organism-1 annotation and the AAT FASTA database.
      2. Parse AAT alignment chains (nucleotide_to_protein_match rows with
         their match_part rows) per assembly.
      3. Keep each mRNA that overlaps a chain where both the mRNA and the
         target coverage reach --aat_percent_coverage_cutoff.
      4. Keep, of those, the mRNAs with a BLAST row passing the e-value and
         percent-identity cutoffs.
      5. Write the surviving mRNA IDs, one per line.

    Raises:
        Exception: if an overlap exceeds the mRNA length, or a matched
            target ID is missing from the FASTA database.
    """
    parser = argparse.ArgumentParser(
        description='Put a description of your script here')

    # NOTE: argparse %-formats help strings, so a literal percent sign must
    # be written as '%%' or printing the help text fails.
    parser.add_argument('-a', '--organism1_annotation', type=str, required=True,
                        help='Annotation GFF for organism 1')
    parser.add_argument('-p', '--organism1_aat_alignments', type=str, required=True,
                        help='Path to AAT GFF3 (match/match_part)')
    parser.add_argument('-aatdb', '--aat_fasta_db', type=str, required=True,
                        help='Path to FASTA database that was used in AAT')
    parser.add_argument('-b', '--organism1_blast_alignments', type=str, required=True,
                        help='Path to BLASTp btab file vs.organism 2 proteins')
    parser.add_argument('-be', '--blast_eval_cutoff', type=float, required=False, default=1e-5,
                        help='BLAST e-value cutoff')
    parser.add_argument('-bpi', '--blast_percent_identity_cutoff', type=float, required=False, default=0,
                        help='BLAST %%identity cutoff')
    parser.add_argument('-ppc', '--aat_percent_coverage_cutoff', type=float, required=False, default=0,
                        help='%% coverage of the query protein by the AAT match')
    parser.add_argument('-o', '--output_id_list', type=str, required=False,
                        help='List of IDs from organism1 that passed')
    args = parser.parse_args()

    # set to an mRNA ID to restrict the comparison to that transcript
    debugging_transcript = None

    ## if the output file wasn't passed build one from the other parameters
    if args.output_id_list is None:
        args.output_id_list = "training_ids.be_{0}.bpi_{1}.ppc_{2}.list".format(
            args.blast_eval_cutoff, args.blast_percent_identity_cutoff,
            args.aat_percent_coverage_cutoff)

    print("INFO: Parsing organism1 annotation")
    (assemblies, features) = biocodegff.get_gff3_features(args.organism1_annotation)

    print("INFO: Parsing AAT FASTA database")
    aat_seqs = biocodeutils.fasta_dict_from_file(args.aat_fasta_db)

    # keys are assembly IDs, value for each is a list of matches on them
    aat_matches = dict()
    aat_match_count = 0
    current_match = None
    # assembly the match being built lies on; it is stored there, not
    # under whatever assembly the next row happens to name
    current_match_assembly = None

    ## IDs of features in organism 1 which overlap AAT
    o1_with_aat = list()
    o1_with_o2 = list()

    print("INFO: Parsing organism1 AAT protein alignments")
    with open(args.organism1_aat_alignments) as aat_fh:
        for line in aat_fh:
            cols = line.split("\t")

            if line.startswith('#') or len(cols) != 9:
                continue

            assembly_id = cols[0]

            # skip this match if there were not predicted genes on the same assembly
            if assembly_id not in assemblies:
                continue

            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]
            feature_id = biocodegff.column_9_value(cols[8], 'ID').replace('"', '')
            target = biocodegff.column_9_value(cols[8], 'Target')

            # keep only the first whitespace-delimited token of Target
            if target is not None:
                m = re.search(r"^(\S+)", target)
                if m:
                    target = m.group(1)

            if cols[2] == 'nucleotide_to_protein_match':
                if current_match is not None:
                    aat_matches.setdefault(current_match_assembly, list()).append(current_match)
                    aat_match_count += 1

                current_match = biothings.Match(id=feature_id, target_id=target,
                                                subclass='nucleotide_to_protein_match',
                                                length=fmax - fmin)
                current_match.locate_on(target=assemblies[assembly_id],
                                        fmin=fmin, fmax=fmax, strand=strand)
                current_match_assembly = assembly_id

            elif cols[2] == 'match_part':
                parent_id = biocodegff.column_9_value(cols[8], 'Parent').replace('"', '')
                match_part = biothings.MatchPart(id=feature_id, parent=parent_id,
                                                 length=fmax - fmin)
                match_part.locate_on(target=assemblies[assembly_id],
                                     fmin=fmin, fmax=fmax, strand=strand)
                current_match.add_part(match_part)

    # the final chain has no following match row to trigger its storage
    if current_match is not None:
        aat_matches.setdefault(current_match_assembly, list()).append(current_match)
        aat_match_count += 1

    print("INFO: Parsed {0} protein alignment chains".format(aat_match_count))

    print("INFO: Comparing organism1's mRNAs with AAT match coordinates")
    for assembly_id in assemblies:
        if assembly_id not in aat_matches:
            continue

        assembly = assemblies[assembly_id]

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                if debugging_transcript is not None:
                    if mRNA.id == debugging_transcript:
                        print("DEBUG: processing debugging transcript: {0}".format(mRNA.id))
                    else:
                        continue

                for aat_match in aat_matches[assembly_id]:
                    overlap_size = mRNA.overlap_size_with(aat_match)

                    if overlap_size is None:
                        continue

                    # this shouldn't be possible, but check just in case
                    if overlap_size > mRNA.length:
                        raise Exception("ERROR: overlap size ({0}) > mRNA length ({1})".format(overlap_size, mRNA.length))

                    if aat_match.target_id not in aat_seqs:
                        raise Exception("ERROR: Found match with target ID ({0}) but didn't find a FASTA entry for it via -aatdb".format(aat_match.target_id))

                    # this is a protein length, so x3
                    match_target_length = len(aat_seqs[aat_match.target_id]['s']) * 3

                    (mRNA_percent_coverage, target_percent_coverage) = calculate_fragmented_coverage(
                        mRNA, aat_match, match_target_length)

                    if mRNA_percent_coverage >= args.aat_percent_coverage_cutoff and target_percent_coverage >= args.aat_percent_coverage_cutoff:
                        o1_with_aat.append(mRNA.id)
                        break  # only need to see if one matched

    print("INFO: Found {0} mRNAs in org1 with overlapping fungi AAT coordinates".format(len(o1_with_aat)))

    # key=org1_transcript_id, value=org2_transcript_id
    top_blast_hits = dict()

    print("INFO: parsing BLAST results vs. org2")
    with open(args.organism1_blast_alignments) as blast_fh:
        for line in blast_fh:
            cols = line.split("\t")

            if float(cols[19]) > args.blast_eval_cutoff:
                continue

            if float(cols[10]) < args.blast_percent_identity_cutoff:
                continue

            # if we survived until here, this one's good.
            top_blast_hits[cols[0]] = cols[5]

    print("INFO: Comparing overlap between AAT-matched proteins and BLAST ones")
    for o1_mRNA_id in o1_with_aat:
        if o1_mRNA_id in top_blast_hits:
            o1_with_o2.append(o1_mRNA_id)

    print("INFO: Found {0} mRNAs in org1 with overlapping AAT coordinates and BLAST hit to org2".format(len(o1_with_o2)))

    with open(args.output_id_list, 'wt') as id_list_fh:
        for mRNA_id in o1_with_o2:
            id_list_fh.write("{0}\n".format(mRNA_id))
def main():
    """Convert glimmerHMM GFF output to gene/mRNA/exon/CDS rows.

    Comment rows are copied through; other rows without 9 columns are
    dropped.  Each mRNA row yields an unmodified copy typed 'gene' followed
    by the mRNA with ID/Name ``<id>.mRNA``.  Each CDS row yields an exon row
    (ID/Name ``<parent>.exon.<n>``, phase '.') followed by the CDS row with
    ID/Name ``<parent>.cds``; both get Parent ``<parent>.mRNA``.  Exon
    numbering starts at 0 for each parent.  Other feature types are not
    written.
    """
    parser = argparse.ArgumentParser(
        description='Converts glimmerHMM GFF output to GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # next exon number to hand out, per parent ID
    next_exon_num = defaultdict(int)

    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as infile:
        for line in infile:
            if line.startswith('#'):
                fout.write(line)
                continue

            cols = line.rstrip().split("\t")

            if len(cols) != 9:
                continue

            feat_type = cols[2]

            if feat_type == 'mRNA':
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                mrna_id = "{0}.mRNA".format(feat_id)

                # the gene row keeps the original column 9 (and so the original ID)
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', mrna_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name', mrna_id)
                cols[8] = biocodegff.order_column_9(cols[8])

                # print the gene and mRNA
                fout.write("{0}\n".format("\t".join(gene_cols)))
                fout.write("{0}\n".format("\t".join(cols)))

            elif feat_type == 'CDS':
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                mrna_id = "{0}.mRNA".format(parent)
                cds_id = "{0}.cds".format(parent)

                # the exon starts as a copy of the untouched CDS row
                exon_cols = list(cols)

                cols[8] = biocodegff.set_column_9_value(cols[8], 'ID', cds_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Name', cds_id)
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', mrna_id)
                cols[8] = biocodegff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'ID', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'Name', exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'Parent', mrna_id)
                exon_cols[8] = biocodegff.order_column_9(exon_cols[8])

                fout.write("{0}\n".format("\t".join(exon_cols)))
                fout.write("{0}\n".format("\t".join(cols)))
def main():
    """Add ``locus_tag`` attributes to the gene and RNA rows of a GFF3 file.

    Gene rows receive the locus tag mapped to their ID in ``--id_file`` when
    one exists; otherwise a tag is generated from ``--prefix``, a zero-padded
    counter and, when ``--molecule_map`` is given, the token mapped to the
    row's molecule.  Rows whose type ends in ``RNA`` inherit the tag of their
    parent gene.  Every input row is written out; rows without 9 columns are
    passed through unchanged.

    Raises:
        Exception: when a ``--custom`` mode is missing its required options,
            an ID does not match the bmicroti convention, a molecule is
            missing from ``--molecule_map``, an RNA's parent gene has not
            been seen yet, or a duplicate locus tag cannot be resolved.
    """
    parser = argparse.ArgumentParser(description='Adds locus tag identifiers to GFF3 features')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)')
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-a', '--padding', type=int, required=True, help='Specify the minimum with to reserve for the numeric portion of the IDs. Smaller numbers will be zero-padded.')
    parser.add_argument('-n', '--interval', type=int, required=False, default=1, help='Interval between generated identifiers')
    parser.add_argument('-s', '--starting_id', type=int, required=False, default=0, help='Initial numeric portion of IDs to be generated (do not zero-pad)')
    parser.add_argument('-d', '--id_file', type=str, required=False, help='Pass a 2-column file of IDs to retain (in case you have mapped genes, for example)')
    parser.add_argument('-m', '--molecule_map', type=str, required=False, help='Pass a 2-column file of molecule->token identifiers (see documentation)')
    parser.add_argument('-c', '--custom', type=str, required=False, help='For custom parsing steps. Most should ignore this.')
    args = parser.parse_args()
    check_arguments(args)

    # used to store locus_tags associated with each gene (so children can inherit)
    gene_loci = dict()
    next_id = args.starting_id
    last_molecule = None

    id_mapping = parse_mapping_file(args.id_file)
    mol_mapping = parse_mapping_file(args.molecule_map)

    # only ever used for membership tests, so a set keeps each lookup O(1)
    loci_assigned = set()

    ## if using Joana's custom options, check assumptions
    if args.custom == 'joana':
        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=joana")

        ## need to process the ID map to reformat IDs
        for old_id in id_mapping:
            # TP05_0002 -> TpMuguga_05g00002
            m = re.match(r'TP(\d\d)_(\d+)', id_mapping[old_id])
            if m:
                id_mapping[old_id] = "{0}_{1}g0{2}".format(args.prefix, m.group(1), m.group(2))

    elif args.custom == 'bmicroti':
        microti_map = {'I': '01', 'II': '02', 'III': '03', 'IV': '04'}

        if args.molecule_map is None or args.id_file is None:
            raise Exception("ERROR: Expected --molecule_map and --id_file options when using --custom=bmicroti")

        for old_id in id_mapping:
            m = re.match(r'BBM_(\D+)(\d+)', id_mapping[old_id])
            if m:
                print("Changing id from {0} to ".format(old_id))
                id_mapping[old_id] = "{0}_{1}g{2}".format(args.prefix, microti_map[m.group(1)], m.group(2))
                print(id_mapping[old_id])
            else:
                raise Exception("ERROR: id ({0}) didn't match expected convention.".format(id_mapping[old_id]))

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    # trailing digits of the most recent locus tag (bmicroti mode numbers from these)
    last_number_portion_assigned = 0

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # The counter restarts on the first feature row and, when a
                # molecule map is in use, whenever the mapped token changes.
                if last_molecule is None or (args.molecule_map is not None and mol_mapping[cols[0]] != mol_mapping[last_molecule]):
                    print("Found molecule {0}, resetting id counter from {1}".format(cols[0], next_id))
                    next_id = args.starting_id
                    last_molecule = cols[0]

                # grab the ID column if any
                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if feat_type == 'gene':
                    previous_attempt = None

                    while True:
                        if feat_id in id_mapping:
                            locus_id = id_mapping[feat_id]
                        else:
                            if args.molecule_map is None:
                                locus_id = "{0}_{1}".format(args.prefix, str(next_id).zfill(args.padding))
                            elif cols[0] in mol_mapping:
                                if args.custom == 'bmicroti':
                                    locus_id = "{0}_{2}g{1}".format(args.prefix, str(int(last_number_portion_assigned) + 1).zfill(args.padding), mol_mapping[cols[0]])
                                else:
                                    locus_id = "{0}_{2}g{1}".format(args.prefix, str(next_id).zfill(args.padding), mol_mapping[cols[0]])
                            else:
                                raise Exception("ERROR: --molecule_map passed but {0} wasn't found in it.".format(cols[0]))

                            # the counter only advances for auto-generated tags
                            next_id += args.interval

                        cols[8] = biocodegff.set_column_9_value(cols[8], 'locus_tag', locus_id)

                        ## make sure this wasn't generated already (possibly conflict between
                        #  --id_file and an auto-generated ID?)
                        if locus_id not in loci_assigned:
                            break

                        # A retry that reproduces the same tag (a mapped ID,
                        # or bmicroti numbering, which does not advance
                        # inside this loop) can never succeed; fail instead
                        # of looping forever.
                        if locus_id == previous_attempt:
                            raise Exception("ERROR: Duplicate ID assigned ({0}) for gene {1} and no alternative could be generated".format(locus_id, feat_id))

                        print("DEBUG: Duplicate ID assigned ({0}), trying again.".format(locus_id))
                        previous_attempt = locus_id

                    loci_assigned.add(locus_id)
                    gene_loci[feat_id] = locus_id

                    m = re.search(r"(\d+)$", locus_id)
                    if m:
                        last_number_portion_assigned = m.group(1)

                elif feat_type.endswith('RNA'):
                    if parent in gene_loci:
                        cols[8] = biocodegff.set_column_9_value(cols[8], 'locus_tag', gene_loci[parent])
                    else:
                        raise Exception("ERROR: found RNA {0} whose parent {1} wasn't found yet".format(feat_id, parent))

                fout.write("\t".join(cols) + "\n")
    finally:
        # never close the interpreter's STDOUT, only a file we opened
        if fout is not sys.stdout:
            fout.close()
def main():
    """Keep only the genes of a GFF3 file that own at least one listed mRNA.

    ``--id_list`` is read as one mRNA ID per line (lines of 2 characters or
    fewer are ignored).  For each gene the gene row is buffered, followed by
    every listed mRNA row and the rows after it up to the next mRNA or gene.
    A gene is written only when at least one row besides the gene row was
    buffered.  Rows without 9 columns are skipped and a ``##gff-version 3``
    header is always written first.  Output goes to ``--output_file`` or to
    STDOUT when it is omitted.
    """
    # the text is the description; passed positionally it would become `prog`
    parser = argparse.ArgumentParser(
        description='Filter the genes of a GFF3 file by mRNA child IDs')

    ## output file to be written
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='GFF3 file of source molecules')
    parser.add_argument('-l', '--id_list', type=str, required=True, help='List file of mRNA IDs to keep')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    def flush_gene(gene_lines):
        # a gene with nothing kept beneath it (only its own row) is dropped
        if len(gene_lines) > 1:
            for buffered in gene_lines:
                fout.write("{0}\n".format(buffered))

    try:
        # a set makes the per-mRNA membership test O(1)
        with open(args.id_list) as id_fh:
            ids_to_keep = {entry for entry in (raw.rstrip() for raw in id_fh) if len(entry) > 2}

        fout.write("##gff-version 3\n")

        current_gene_lines = list()
        keep = False

        with open(args.input_gff3) as gff_fh:
            for line in gff_fh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                feat_type = cols[2]

                if feat_type == 'gene':
                    # purge the current gene, if any, then start a new buffer
                    flush_gene(current_gene_lines)
                    current_gene_lines = [line]
                    # the keep flag of the previous gene's last mRNA must not
                    # leak onto rows of this gene that precede its first mRNA
                    keep = False
                else:
                    if feat_type == 'mRNA':
                        keep = feat_id in ids_to_keep

                    if keep:
                        current_gene_lines.append(line)

        # the final gene has no following gene row to trigger its purge
        flush_gene(current_gene_lines)
    finally:
        if fout is not sys.stdout:
            fout.close()
def main():
    """Rewrite the ID and Parent attributes of a GFF3 file with new identifiers.

    Each distinct feature ID is mapped once to a value returned by
    ``get_new_id(prefix, feature_type, mode)`` and the same value is reused
    for later occurrences of that ID and for ``Parent`` references to it.
    Rows without 9 columns are copied through unchanged.  Output goes to
    ``--output_file`` or to STDOUT when it is omitted.

    Raises:
        Exception: if a row references a Parent that has not yet appeared as
            a feature ID earlier in the file.
    """
    parser = argparse.ArgumentParser(
        description='Generates new identifiers in GFF3 files following the IGS identifier convention.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='TA file of source molecules')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Optional output file path (else STDOUT)')
    parser.add_argument('-p', '--prefix', type=str, required=True, help='The prefix portion of IDs to be generated')
    parser.add_argument('-m', '--mode', type=str, required=False, default='sequential',
                        help='ID modes (see embedded documentation): sequential, uuid, hex8, hex12')
    args = parser.parse_args()
    check_arguments(args)

    # old feature ID -> newly generated ID
    id_map = dict()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        with open(args.input_file) as fin:
            for line in fin:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    fout.write(line + "\n")
                    continue

                # grab the ID column if any
                old_id = biocodegff.column_9_value(cols[8], 'ID')
                old_parent = biocodegff.column_9_value(cols[8], 'Parent')
                feat_type = cols[2]

                if old_id is not None:
                    if old_id not in id_map:
                        id_map[old_id] = get_new_id(args.prefix, feat_type, args.mode)

                    cols[8] = cols[8].replace("ID={0}".format(old_id), "ID={0}".format(id_map[old_id]))

                if old_parent is not None:
                    # parents must be declared before their children so the
                    # replacement ID is already known at this point
                    if old_parent not in id_map:
                        raise Exception("ERROR: parent ({0}) referenced before it was used as an ID".format(old_parent))

                    cols[8] = cols[8].replace("Parent={0}".format(old_parent), "Parent={0}".format(id_map[old_parent]))

                fout.write("\t".join(cols) + "\n")
    finally:
        # close only a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Convert Cufflinks GTF output to GFF3.

    In ``model`` export mode each ``transcript`` row becomes an mRNA under a
    gene keyed by ``gene_id`` and each ``exon`` row becomes an exon plus a
    CDS with a running phase.  A gene is printed when a transcript with a
    different ``gene_id`` is encountered and again after the last row.  In
    ``cDNA_match`` mode each transcript becomes a match and each exon one of
    its match parts.  Rows without 9 columns are reported on STDOUT and
    skipped.  Output goes to ``--output_file`` or STDOUT.

    Raises:
        Exception: if ``--export_mode`` is neither ``model`` nor ``cDNA_match``.
    """
    parser = argparse.ArgumentParser(description='A GTF -> GFF3 conversion script for Cufflinks output')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GTF file')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output GFF file to be created')
    parser.add_argument('-e', '--export_mode', type=str, required=False, default='model', help='Export mode for results (model or cDNA_match)')
    args = parser.parse_args()

    if args.export_mode not in ['model', 'cDNA_match']:
        raise Exception("ERROR: the only valid values for --export_mode are 'model' or 'cDNA_match'")

    ## output will either be a file or STDOUT
    ofh = sys.stdout
    if args.output_file is not None:
        ofh = open(args.output_file, 'wt')

    try:
        ofh.write("##gff-version 3\n")

        assemblies = dict()
        current_gene = None
        current_RNA = None
        current_match = None
        exon_count_by_RNA = defaultdict(int)

        with open(args.input_file, "r") as ifh:
            for line in ifh:
                cols = line.split("\t")

                if len(cols) != 9:
                    print("SKIPPING: {0}".format(line))
                    continue

                mol_id = cols[0]
                if mol_id not in assemblies:
                    assemblies[mol_id] = biothings.Assembly(id=mol_id)
                current_assembly = assemblies[mol_id]

                ftype = cols[2]
                # 1-based inclusive start -> 0-based start; end is used as-is
                fmin = int(cols[3]) - 1
                fmax = int(cols[4])
                strand = cols[6]

                # this makes it look like GFF column 9 so column_9_value(str, key) can be used
                col9 = cols[8].replace(' "', '="')
                gene_id = biocodegff.column_9_value(col9, 'gene_id').replace('"', '')
                transcript_id = biocodegff.column_9_value(col9, 'transcript_id').replace('"', '')

                if ftype == 'transcript':
                    if args.export_mode == 'model':
                        # a transcript from a different gene closes out the previous gene
                        if current_gene is not None and current_gene.id != gene_id:
                            current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        if current_gene is None or current_gene.id != gene_id:
                            current_gene = biothings.Gene(id=gene_id)
                            current_gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

                        mRNA = biothings.mRNA(id=transcript_id, parent=current_gene)
                        mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_gene.add_mRNA(mRNA)
                        current_RNA = mRNA
                        exon_count_by_RNA[transcript_id] = 0
                        current_CDS_phase = 0

                    elif args.export_mode == 'cDNA_match':
                        if current_match is not None and current_match.id != transcript_id:
                            current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')

                        current_match = biothings.Match(id=transcript_id, subclass='cDNA_match', length=fmax - fmin)
                        current_match.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

                elif ftype == 'exon':
                    exon_number = biocodegff.column_9_value(col9, 'exon_number').replace('"', '')

                    if args.export_mode == 'model':
                        exon_count_by_RNA[transcript_id] += 1

                        cds_id = "{0}.CDS.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        CDS = biothings.CDS(id=cds_id, parent=current_RNA)
                        CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=current_CDS_phase)
                        current_RNA.add_CDS(CDS)

                        # calculate the starting phase for the next CDS feature (in case there is one)
                        current_CDS_phase = 3 - (((fmax - fmin) - current_CDS_phase) % 3)
                        if current_CDS_phase == 3:
                            current_CDS_phase = 0

                        exon_id = "{0}.exon.{1}".format(current_RNA.id, exon_count_by_RNA[current_RNA.id])
                        exon = biothings.Exon(id=exon_id, parent=current_RNA)
                        exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_RNA.add_exon(exon)

                    elif args.export_mode == 'cDNA_match':
                        mp_id = "{0}.match_part.{1}".format(transcript_id, exon_number)
                        mp = biothings.MatchPart(id=mp_id, parent=current_match, length=fmax - fmin)
                        mp.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                        current_match.add_part(mp)

        # Don't forget the last gene/match, if there was one.  The source is
        # 'Cufflinks' here too, so the final feature is labelled like all
        # the others instead of 'GenBank'.
        if args.export_mode == 'model':
            if current_gene is not None:
                current_gene.print_as(fh=ofh, source='Cufflinks', format='gff3')
        elif args.export_mode == 'cDNA_match':
            if current_match is not None:
                current_match.print_as(fh=ofh, source='Cufflinks', format='gff3')
    finally:
        # close only a file we opened ourselves, never STDOUT
        if ofh is not sys.stdout:
            ofh.close()
def main():
    """Convert a GFF file created by Augustus into GFF3.

    Comment lines are buffered per gene; when a ``# end gene`` comment is
    reached the buffered comments are written, followed by the gene built
    from the preceding ``gene``, ``transcript`` and ``CDS`` rows.  Exons are
    not read from the input: one exon is derived from every CDS row and
    named ``<transcript ID>.exon<N>``.  Rows without 9 columns are skipped.

    Raises:
        Exception: if two transcripts share an ID, or a CDS names a parent
            transcript that has not been seen within the current gene.
    """
    parser = argparse.ArgumentParser(description='Convert GFF output from Augustus into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus')
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created')
    args = parser.parse_args()

    assemblies = dict()
    gene = None
    # transcripts of the gene currently being built, keyed by ID
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout, open(args.input) as fin:
        # the header needs its own line, otherwise the first comment is glued onto it
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                # protein-sequence comments are buffered like any other
                # comment; nothing is parsed out of them
                continue

            cols = line.split("\t")
            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]
            feat_id = biocodegff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)
            current_assembly = assemblies[mol_id]

            # 1-based inclusive start -> 0-based start; end is used as-is
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if feat_type == "gene":
                gene = biothings.Gene(id=feat_id)
                gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            elif feat_type == "transcript":
                mRNA = biothings.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(feat_id))
                exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = biocodegff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # attach to the transcript the row names as Parent, which is
                # not necessarily the most recently read transcript
                parent_mRNA = mRNAs[parent_id]

                CDS = biothings.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]))
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = biothings.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                parent_mRNA.add_exon(exon)
def main():
    """Convert glimmerHMM GFF output to GFF3.

    Comment lines from ``--input_file`` are copied to ``--output_file``
    as-is; rows that do not have exactly 9 tab-separated columns are
    discarded.  An ``mRNA`` row produces a ``gene`` row (same column 9)
    followed by the mRNA row with ``.mRNA`` appended to its ID and Name.  A
    ``CDS`` row produces an ``exon`` row followed by the CDS row, both with
    Parent set to ``<Parent>.mRNA``.  No other feature types are written.

    Raises:
        ValueError: if the start or end column of a 9-column row is not an
            integer.
    """
    parser = argparse.ArgumentParser(description='Converts glimmerHMM GFF output to GFF3')

    # output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to parse')
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created')
    args = parser.parse_args()

    # per-parent exon counter; the first exon of each parent is numbered 0
    next_exon_num = defaultdict(int)

    def emit(handle, columns):
        # every feature row is written tab-joined with a single trailing newline
        handle.write("{0}\n".format("\t".join(columns)))

    # `with` closes both files on success and on error; output is opened first, as before
    with open(args.output_file, 'w') as fout, open(args.input_file, 'r') as fin:
        for raw in fin:
            if raw.startswith('#'):
                fout.write(raw)
                continue

            cols = raw.rstrip().split("\t")
            if len(cols) != 9:
                continue

            # unused by the conversion itself; kept so that non-numeric
            # coordinates still abort the run
            int(cols[3])
            int(cols[4])

            feat_type = cols[2]
            feat_id = biocodegff.column_9_value(cols[8], 'ID')
            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if feat_type == 'mRNA':
                gene_cols = list(cols)
                gene_cols[2] = 'gene'

                for key in ('ID', 'Name'):
                    cols[8] = biocodegff.set_column_9_value(cols[8], key, "{0}.mRNA".format(feat_id))
                cols[8] = biocodegff.order_column_9(cols[8])

                # print the gene and mRNA
                emit(fout, gene_cols)
                emit(fout, cols)

            elif feat_type == 'CDS':
                # snapshot the row before its attributes are rewritten for the CDS
                exon_cols = list(cols)

                for key in ('ID', 'Name'):
                    cols[8] = biocodegff.set_column_9_value(cols[8], key, "{0}.cds".format(parent))
                cols[8] = biocodegff.set_column_9_value(cols[8], 'Parent', "{0}.mRNA".format(parent))
                cols[8] = biocodegff.order_column_9(cols[8])

                exon_id = "{0}.exon.{1}".format(parent, next_exon_num[parent])
                next_exon_num[parent] += 1

                exon_cols[2] = 'exon'
                exon_cols[7] = '.'  # phase does not apply to exons
                for key in ('ID', 'Name'):
                    exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], key, exon_id)
                exon_cols[8] = biocodegff.set_column_9_value(exon_cols[8], 'Parent', "{0}.mRNA".format(parent))
                exon_cols[8] = biocodegff.order_column_9(exon_cols[8])

                emit(fout, exon_cols)
                emit(fout, cols)
def main():
    """Remove orphaned features from a GFF3 file.

    A feature row is kept when it has a ``Parent`` attribute, or when its ID
    is named as a Parent by some other row.  Rows with neither are reported
    on STDOUT and omitted.  Comment lines and rows without 9 columns are
    always kept.  The input is read twice: once to decide, once to write.
    """
    parser = argparse.ArgumentParser(description='Removes orphaned features in a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to the input GFF3 file')
    parser.add_argument('-o', '--output', type=str, required=True, help='Output GFF3 file to write')
    args = parser.parse_args()

    # To save memory only one flag per input line is stored, not the lines:
    # True means keep the line as-is, False means decide in the second pass.
    # Tracking is per line because a feature may legally have no ID at all.
    keep_flags = list()

    # every ID that at least one row names as its Parent
    referenced_parents = set()

    with open(args.input) as infile:
        for line in infile:
            if line.startswith('#'):
                keep_flags.append(True)
                continue

            cols = line.rstrip().split("\t")
            if len(cols) != 9:
                keep_flags.append(True)
                continue

            parent = biocodegff.column_9_value(cols[8], 'Parent')

            if parent is None:
                # provisional: kept after all if another row turns out to reference it
                keep_flags.append(False)
            else:
                keep_flags.append(True)
                # GFF3 lets a feature list several comma-separated parents;
                # each of them is referenced, not the joined string
                referenced_parents.update(parent.split(','))

        infile.seek(0)

        # opened only after the first pass, so a failed scan leaves no output behind
        with open(args.output, 'wt') as outfh:
            for keep, line in zip(keep_flags, infile):
                if keep:
                    outfh.write(line)
                    continue

                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) == 9:
                    feat_id = biocodegff.column_9_value(cols[8], 'ID')

                    if feat_id is not None and feat_id in referenced_parents:
                        outfh.write("{0}\n".format(line))
                    else:
                        print("WARN: removing this line: {0}".format(line))
def main():
    """Replace the feature IDs of a GFF file with newly generated ones.

    Every 9-column row has its column 9 rewritten to ``ID=<new>`` or
    ``ID=<new>;Parent=<new parent>``; any other attributes on the row are
    dropped.  New IDs come from
    ``get_new_id(id_prefix, feature_type, old_id, map_filehandle)`` and are
    cached per old ID so references stay consistent.  Rows without 9 columns
    are copied through unchanged.
    """
    parser = argparse.ArgumentParser(description='Reassigns the feature IDs in a GFF file using a common prefix')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_gff', type=str, required=True,
                        help='Path to an output GFF file to be created with new IDs')
    parser.add_argument('-p', '--id_prefix', type=str, required=True, help='Will be used as the base for all IDs generated')
    parser.add_argument('-m', '--output_map', type=str, required=False,
                        help='This will create a tab-delimited mapping of old and new IDs')
    args = parser.parse_args()

    ofh = open(args.output_gff, 'w')
    map_ofh = None

    try:
        if args.output_map is not None:
            map_ofh = open(args.output_map, 'w')

        # old ID -> new ID
        idmap = dict()

        with open(args.input_file) as ifh:
            for line in ifh:
                line = line.rstrip()
                cols = line.split("\t")

                if len(cols) != 9:
                    ofh.write(line + "\n")
                    continue

                feat_id = biocodegff.column_9_value(cols[8], 'ID')
                parent_id = biocodegff.column_9_value(cols[8], 'Parent')

                if feat_id is not None and feat_id in idmap:
                    new_feat_id = idmap[feat_id]
                else:
                    new_feat_id = get_new_id(args.id_prefix, cols[2], feat_id, map_ofh)
                    # Features without an ID are unrelated to one another, so
                    # they must not share one cached replacement: identical
                    # IDs would merge them into a single feature.
                    if feat_id is not None:
                        idmap[feat_id] = new_feat_id

                if parent_id is None:
                    cols[8] = "ID={0}".format(new_feat_id)
                else:
                    if parent_id not in idmap:
                        # Parent referenced before its own row was seen.
                        # NOTE(review): the child's type (cols[2]) is passed
                        # for the parent here -- confirm get_new_id tolerates it.
                        idmap[parent_id] = get_new_id(args.id_prefix, cols[2], parent_id, map_ofh)

                    cols[8] = "ID={0};Parent={1}".format(new_feat_id, idmap[parent_id])

                ofh.write("\t".join(cols) + "\n")
    finally:
        # flush and close both outputs even if a row fails to convert
        ofh.close()
        if map_ofh is not None:
            map_ofh.close()
last_gene = None for qry_gene in things: if qry_gene.id in handled_ids: continue ## mark this one as handled handled_ids[qry_gene.id] = 1 nonoverlapping_set.append(qry_gene) <<<<<<< .mine current_assembly = assemblies[mol_id] rfmin = int(cols[3]) - 1 rfmax = int(cols[4]) rstrand = None feat_id = biocodegff.column_9_value(cols[8], 'ID') parent_id = biocodegff.column_9_value(cols[8], 'Parent') parent_feat = None if parent_id is not None: if parent_id in features: parent_feat = features[parent_id] else: raise Exception("Error in GFF3: Parent {0} referenced by a child feature before it was defined".format(parent_id) ) #print("Processing feature: ({0})".format(feat_id)) if cols[6] == '-': strand = -1 elif cols[6] == '+': strand = 1
def main():
    """Trim the CDS features of a flawed GFF3 to externally supplied polypeptides.

    Loads ``canonical.flawed.gff3``, reads polypeptide rows (keyed by their
    Parent, expected to be an mRNA ID) from the ILRI GFF file, adjusts or
    deletes the CDSs of every mRNA that has a polypeptide, and writes each
    gene to ``canonical.corrected.gff3``.  All file names are hard-coded.

    Raises:
        KeyError: if a polypeptide row names a molecule that is absent from
            the loaded assemblies.
    """
    flawed_gff_file = 'canonical.flawed.gff3'
    ilri_gff = 'Theileria-all-Theileria1_ourids.gff'
    source = 'GenBank'
    out_gff = 'canonical.corrected.gff3'

    with open(out_gff, mode='wt', encoding='utf-8') as fout:
        fout.write("##gff-version 3\n")

        (assemblies, features) = biocodegff.get_gff3_features(flawed_gff_file)
        print("INFO: loaded {0} assemblies and {1} features".format(len(assemblies), len(features)))

        # polypeptides keyed by the ID of their parent
        polypeptides = dict()

        with open(ilri_gff) as ilri_fh:
            for line in ilri_fh:
                cols = line.split("\t")

                if len(cols) != 9 or cols[2] != 'polypeptide':
                    continue

                pep_id = biocodegff.column_9_value(cols[8], 'ID')
                parent = biocodegff.column_9_value(cols[8], 'Parent')

                polypeptides[parent] = biothings.Polypeptide(id=pep_id, parent=parent)
                polypeptides[parent].locate_on(target=assemblies[cols[0]], fmin=int(cols[3]) - 1,
                                               fmax=int(cols[4]), strand=cols[6])

        print("DEBUG: loaded {0} polypeptides from ILRI file".format(len(polypeptides)))

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    if mRNA.id not in polypeptides:
                        print("DEBUG: {0} not found as a parent to any polypeptide".format(mRNA.id))
                        # Nothing to trim against.  Falling through would
                        # compare against the previous mRNA's polypeptide
                        # (or an unbound name on the first mRNA).
                        continue

                    polypeptide = polypeptides[mRNA.id]

                    # pull this outside of the iteration since iterating might delete some
                    CDSs = mRNA.CDSs()

                    # NOTE(review): the comparisons rely on biothings'
                    # location operators; presumably < / > mean "entirely
                    # before / after" and <= / >= mean "overhangs that
                    # side" -- confirm against biothings.
                    for CDS in CDSs:
                        if CDS < polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS <= polypeptide:
                            CDS.location().fmin = polypeptide.location().fmin

                        if CDS > polypeptide:
                            mRNA.delete_CDS(CDS)
                        elif CDS >= polypeptide:
                            CDS.location().fmax = polypeptide.location().fmax

                gene.print_as(fh=fout, source=source, format='gff3')
def main():
    """Convert native (GTF) or GFF output from Augustus into GFF3.

    Only ``gene``, ``transcript`` and ``CDS`` rows are used.  When column 9
    of such a row does not start with ``ID`` or ``Parent`` it is treated as
    GTF and rewritten into ``ID=...;Parent=...`` form first.  Comment lines
    are buffered per gene; on ``# end gene`` the buffered comments are
    written, followed by the assembled gene.  One exon is derived from every
    CDS row and named ``<transcript ID>.exon<N>``.

    Raises:
        Exception: if a GTF row's column 9 does not match the expected
            pattern, two transcripts share an ID, or a CDS names a parent
            transcript not yet seen within the current gene.
    """
    parser = argparse.ArgumentParser(
        description='Convert native (GTF) or GFF output from Augustus into GFF3 format')

    ## output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to a GFF file created by Augustus')
    parser.add_argument('-o', '--output', type=str, required=True, help='Path to an output file to be created')
    args = parser.parse_args()

    # GTF column-9 shapes.  Compiled once, and written as raw strings so
    # that \d is a regex class rather than an invalid string escape.
    #   g1                                      -> ID=g1
    #   g1.t1                                   -> ID=g1.t1;Parent=g1
    #   transcript_id "g1.t1"; gene_id "g1";    -> ID=g1.t1.cds;Parent=g1.t1
    gene_re = re.compile(r'(g\d+)')
    transcript_re = re.compile(r'((g\d+).t\d+)')
    cds_re = re.compile(r'transcript_id "(g\d+.t\d+)"; gene_id "g\d+";')

    assemblies = dict()
    gene = None
    # transcripts of the gene currently being built, keyed by ID
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode='wt', encoding='utf-8') as fout, open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))
                    gene.print_as(fh=fout, source='AUGUSTUS', format='gff3')

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                # other comments (including the protein sequence) are only buffered
                continue

            cols = line.split("\t")
            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in ['gene', 'transcript', 'CDS']:
                continue

            # The input can be GTF or GFF; reformat column 9 of GTF entries
            # so the GFF library calls below can read it.
            if not cols[8].startswith('ID') and not cols[8].startswith('Parent'):
                if feat_type == 'gene':
                    m = gene_re.match(cols[8])
                    if not m:
                        raise Exception("ERROR: GTF detected but gene row has bad 9th column format: {0}".format(cols[8]))
                    cols[8] = "ID={0}".format(m.group(1))

                elif feat_type == 'transcript':
                    m = transcript_re.match(cols[8])
                    if not m:
                        raise Exception("ERROR: GTF detected but transcript row has bad 9th column format: {0}".format(cols[8]))
                    cols[8] = "ID={0};Parent={1}".format(m.group(1), m.group(2))

                elif feat_type == 'CDS':
                    m = cds_re.match(cols[8])
                    if not m:
                        raise Exception("ERROR: GTF detected but CDS row has bad 9th column format: {0}".format(cols[8]))
                    cols[8] = "ID={0}.cds;Parent={0}".format(m.group(1))

            feat_id = biocodegff.column_9_value(cols[8], 'ID')

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)
            current_assembly = assemblies[mol_id]

            # 1-based inclusive start -> 0-based start; end is used as-is
            fmin = int(cols[3]) - 1
            fmax = int(cols[4])
            strand = cols[6]

            if feat_type == "gene":
                gene = biothings.Gene(id=feat_id)
                gene.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)

            elif feat_type == "transcript":
                mRNA = biothings.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(feat_id))
                exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = biocodegff.column_9_value(cols[8], 'Parent')

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception("ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id))

                # attach to the transcript named as Parent, which need not
                # be the transcript that was read most recently
                parent_mRNA = mRNAs[parent_id]

                CDS = biothings.CDS(id=feat_id, parent=parent_mRNA)
                CDS.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand, phase=int(cols[7]))
                parent_mRNA.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = biothings.Exon(id=exon_id, parent=parent_mRNA)
                exon.locate_on(target=current_assembly, fmin=fmin, fmax=fmax, strand=strand)
                parent_mRNA.add_exon(exon)
def main():
    """Convert native (GTF) or GFF output from Augustus into GFF3 format.

    Rows other than ``gene``, ``transcript`` and ``CDS`` are ignored, as are
    rows without 9 columns.  A row whose 9th column starts with neither
    ``ID`` nor ``Parent`` is assumed to be GTF and is converted to GFF-style
    attributes before use.  Comments are collected until ``# end gene``, at
    which point they are written out followed by the gene assembled from the
    preceding rows.  Each CDS row also yields one derived exon named
    ``<transcript ID>.exon<N>``.

    Raises:
        Exception: for a GTF 9th column that does not match the expected
            shape, a repeated transcript ID, or a CDS whose parent
            transcript has not been seen within the current gene.
    """
    parser = argparse.ArgumentParser(description="Convert native (GTF) or GFF output from Augustus into GFF3 format")

    ## output file to be written
    parser.add_argument("-i", "--input", type=str, required=True, help="Path to a GFF file created by Augustus")
    parser.add_argument("-o", "--output", type=str, required=True, help="Path to an output file to be created")
    args = parser.parse_args()

    # Patterns for the GTF form of column 9, keyed by feature type.  They
    # are raw strings (so "\d" is not an invalid string escape) and are
    # compiled once instead of on every row.
    gtf_patterns = {
        "gene": re.compile(r"(g\d+)"),  # g1
        "transcript": re.compile(r"((g\d+).t\d+)"),  # g1.t1
        "CDS": re.compile(r'transcript_id "(g\d+.t\d+)"; gene_id "g\d+";'),
    }

    def gtf_to_gff_attributes(feat_type, col9):
        # Rewrite one GTF column 9 as "ID=...[;Parent=...]" or raise if it is malformed.
        m = gtf_patterns[feat_type].match(col9)
        if not m:
            raise Exception(
                "ERROR: GTF detected but {0} row has bad 9th column format: {1}".format(feat_type, col9)
            )
        if feat_type == "gene":
            return "ID={0}".format(m.group(1))
        if feat_type == "transcript":
            return "ID={0};Parent={1}".format(m.group(1), m.group(2))
        return "ID={0}.cds;Parent={0}".format(m.group(1))

    assemblies = dict()
    gene = None
    # transcripts belonging to the gene under construction, by ID
    mRNAs = dict()
    current_gene_comment_lines = list()

    ## Used for tracking the exon count for each gene (for ID purposes)
    exon_count_by_mRNA = dict()

    with open(args.output, mode="wt", encoding="utf-8") as fout, open(args.input) as fin:
        fout.write("##gff-version 3\n")

        for line in fin:
            if line.startswith("#"):
                current_gene_comment_lines.append(line)

                if line.startswith("# end gene "):
                    ## purge the comments, then write the gene
                    fout.write("".join(current_gene_comment_lines))
                    gene.print_as(fh=fout, source="AUGUSTUS", format="gff3")

                    gene = None
                    mRNAs = dict()
                    current_gene_comment_lines = list()

                # no other comment (protein sequence included) is parsed
                continue

            cols = line.split("\t")
            if len(cols) != 9:
                continue

            mol_id = cols[0]
            feat_type = cols[2]

            if feat_type not in gtf_patterns:
                continue

            # the input can be in GTF or GFF; only GTF entries need their 9th column reformatted
            if not cols[8].startswith("ID") and not cols[8].startswith("Parent"):
                cols[8] = gtf_to_gff_attributes(feat_type, cols[8])

            feat_id = biocodegff.column_9_value(cols[8], "ID")

            ## initialize this assembly if we haven't seen it yet
            if mol_id not in assemblies:
                assemblies[mol_id] = biothings.Assembly(id=mol_id)
            current_assembly = assemblies[mol_id]

            # shared by every feature type: 0-based start, end as given, strand
            placement = dict(target=current_assembly, fmin=int(cols[3]) - 1, fmax=int(cols[4]), strand=cols[6])

            if feat_type == "gene":
                gene = biothings.Gene(id=feat_id)
                gene.locate_on(**placement)

            elif feat_type == "transcript":
                mRNA = biothings.mRNA(id=feat_id, parent=gene)
                mRNA.locate_on(**placement)
                gene.add_mRNA(mRNA)
                mRNAs[mRNA.id] = mRNA

                if feat_id in exon_count_by_mRNA:
                    raise Exception("ERROR: two different mRNAs found with same ID: {0}".format(feat_id))
                exon_count_by_mRNA[feat_id] = 0

            elif feat_type == "CDS":
                parent_id = biocodegff.column_9_value(cols[8], "Parent")

                ## sanity check that we've seen this parent
                if parent_id not in mRNAs:
                    raise Exception(
                        "ERROR: Found CDS column with parent ({0}) mRNA not yet in the file".format(parent_id)
                    )

                # use the transcript the row points at, not whichever one was read last
                owner = mRNAs[parent_id]

                CDS = biothings.CDS(id=feat_id, parent=owner)
                CDS.locate_on(phase=int(cols[7]), **placement)
                owner.add_CDS(CDS)

                ## exons weren't explicitly defined in the input file, so we need to derive new IDs for them
                exon_count_by_mRNA[parent_id] += 1
                exon_id = "{0}.exon{1}".format(parent_id, exon_count_by_mRNA[parent_id])

                exon = biothings.Exon(id=exon_id, parent=owner)
                exon.locate_on(**placement)
                owner.add_exon(exon)