def main():
    """Report mRNAs whose translated CDS contains internal stop codons.

    Loads the features of a GFF3 file (plus an optional genome FASTA),
    translates the CDS of every mRNA and counts those whose translation has a
    '*' anywhere except at its end.  Offending translations can be written to
    a FASTA file (-o) and the first N of them printed to STDOUT (-p).
    """
    # The description previously talked about phase columns, which this
    # script never touches; it now states what the script actually reports.
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## input GFF3 / genome sequence and the optional reporting outputs
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)')
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # translate once and reuse the result for both the check and the report
                    translated_seq = biocodeutils.translate(coding_seq)

                    # a terminal stop is expected, so only '*' left after stripping the tail counts
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                        fasta_out_fh.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))

                    # only the first N offenders are echoed to STDOUT
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id))
    finally:
        # make sure the FASTA report is flushed and released even on failure
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Create the files for an NCBI submission (.tbl and .fna) from a GFF3 file.

    Assembly IDs are rewritten to the 'gnl|WGS:...' form built from the NCBI
    accession prefix unless the input already uses that form.  The feature
    table goes to '<output_base>.tbl' and the sequences to '<output_base>.fna'.
    """
    parser = argparse.ArgumentParser(description='Create a TBL file for submission to NCBI from GFF3')

    ## input GFF3, output base name and the NCBI submission metadata
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read')
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created')
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group')
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI')
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF')
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format.  This is decided for the whole
    #  set BEFORE anything is renamed, so a pre-formatted ID found late in the
    #  iteration can no longer leave earlier assemblies with a rewritten .id
    #  that disagrees with the key they are stored under.
    #  pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        ## old IDs (like tp.assembly.567468735.1) become new ones (like AAGK01000001)
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            assembly = assemblies[asm_id]
            assembly.id = new_id
            new_assemblies[new_id] = assembly

        assemblies = new_assemblies

    # >gi|68352484|gb|AAGK01000001.1|
    # AAGK01000001  NC_007344.1  tp.assembly.567468735.1

    # the context manager guarantees the table is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Filter out gene models whose coding sequence is mostly masked.

    Every gene is kept unless at least one of its mRNAs has a CDS in which the
    percentage of 'N' characters reaches the user-supplied cutoff.  Kept genes
    are written to the output GFF3; removed genes are optionally written to a
    second GFF3.  A one-line summary is printed at the end.
    """
    parser = argparse.ArgumentParser(description='Removes gene models whose sequence has been masked.')

    ## input GFF3 / masked FASTA, the cutoff, and the output files
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-m', '--masked_fasta', type=str, required=True, help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True, help='Genes with an mRNA covered by this percentage of repeats will be excluded')
    parser.add_argument('-o', '--output_gff3', type=str, required=False, help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False, help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # The option is declared optional but the path is always opened; fail with
    # a usage message up front instead of a TypeError after loading the data.
    if args.output_gff3 is None:
        parser.error('an output path is required: pass -o/--output_gff3')

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    gene_count = 0
    kept_count = 0
    rem_out = None

    gff_out = open(args.output_gff3, 'wt')
    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    n_count = coding_seq.count('N')

                    # an mRNA without any CDS residues has nothing masked
                    if len(coding_seq) > 0:
                        perc_repeat = (n_count / len(coding_seq)) * 100
                    else:
                        perc_repeat = 0

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        keep = False

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        # release both handles even if a gene fails to export
        gff_out.close()
        if rem_out is not None:
            rem_out.close()

    # guard the summary against an input without any genes
    if gene_count > 0:
        kept_perc = (kept_count / gene_count) * 100
    else:
        kept_perc = 0.0

    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, kept_perc))
def main():
    """Export the CDS or protein sequence of every mRNA in a GFF3 file as FASTA.

    Each record is named after the mRNA's locus tag when it has one (its ID
    otherwise) and carries the product name of the first annotated polypeptide
    as its description.  With --check_ends, non-canonical start/stop codons
    are reported on STDERR.  Output goes to the -o file or to STDOUT.
    """
    parser = argparse.ArgumentParser(description='Extracts the protein or CDS sequences from a GFF3 file')

    ## input GFF3, output FASTA and export options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one (first one found wins)
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues()

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # ... and the terminal one
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # close only a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Export the CDS or protein sequence of every mRNA in a GFF3 file as FASTA.

    The CDS residues are requested with for_translation=True.  Each record is
    named after the mRNA's locus tag when it has one (its ID otherwise) and
    carries the product name of the first annotated polypeptide as its
    description.  With --check_ends, non-canonical start/stop codons are
    reported on STDERR.  Output goes to the -o file or to STDOUT.
    """
    parser = argparse.ArgumentParser(description='Extracts the protein or CDS sequences from a GFF3 file')

    ## input GFF3, output FASTA and export options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read')
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created')
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.set_defaults(check_ends=False)
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    fout = open(args.output_file, 'wt') if writing_to_file else sys.stdout

    try:
        (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            biocodeutils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                for mRNA in gene.mRNAs():
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = mRNA.id
                    export_header = None

                    if mRNA.locus_tag is not None:
                        export_id = mRNA.locus_tag

                    ## Add the gene product name if there is one (first one found wins)
                    for polypeptide in mRNA.polypeptides():
                        annotation = polypeptide.annotation
                        if annotation is not None and annotation.product_name is not None:
                            export_header = annotation.product_name
                            break

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    coding_seq = mRNA.get_CDS_residues(for_translation=True)

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, mRNA.id))

                        # ... and the terminal one
                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, mRNA.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = biocodeutils.translate(coding_seq)
                        fout.write("{0}\n".format(biocodeutils.wrapped_fasta(translated_seq)))
    finally:
        # close only a file we opened ourselves, never STDOUT
        if writing_to_file:
            fout.close()
def main():
    """Report mRNAs lacking a terminal stop and look for the next in-frame one.

    For every mRNA whose CDS translation does not end in '*', the genomic
    sequence is walked codon by codon from the 3' end of the CDS (downstream
    on the mRNA's strand) until a stop codon is found or the mRNA boundary is
    passed.  Progress and the resulting position are printed to STDOUT.

    NOTE(review): --output_gff is parsed but nothing is written for it yet,
    and the description string still mentions phase columns -- confirm the
    intended wording/behavior with the script owner.
    """
    parser = argparse.ArgumentParser(description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## input GFF3 / genome sequence and the (currently unused) output GFF3
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']

    # used to read reverse-strand codons in their coding orientation
    complement = str.maketrans('ACGTacgt', 'TGCAtgca')

    for assembly_id in assemblies:
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in assembly.genes():
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = biocodeutils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())

                # without a CDS there is no end to extend from
                if not CDSs:
                    print("\tNo CDS features found, nothing to extend")
                    continue

                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax
                else:
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin
                    codon_step_size = -3

                on_reverse = codon_step_size < 0

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos), end='')
                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if on_reverse:
                    CDS_pos += codon_step_size

                while True:
                    passed_limit = CDS_pos < mRNA_limit if on_reverse else CDS_pos > mRNA_limit

                    if passed_limit:
                        print("  Reached the mRNA limit")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                    # The assembly stores the forward strand; a reverse-strand
                    # codon must be reverse-complemented before it can be
                    # compared with the stop codon list.
                    if on_reverse:
                        next_codon = next_codon.translate(complement)[::-1]

                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    # upper() so soft-masked (lower-case) sequence still matches
                    if next_codon.upper() in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with terminal stops: {0}".format(mRNAs_with_terminal_stops))
def main():
    """Create the files for an NCBI submission (.tbl and .fna) from a GFF3 file.

    Assembly IDs are rewritten to the 'gnl|WGS:...' form built from the NCBI
    accession prefix unless the input already uses that form.  The feature
    table goes to '<output_base>.tbl' and the sequences to '<output_base>.fna'.
    """
    parser = argparse.ArgumentParser(
        description='Create a TBL file for submission to NCBI from GFF3')

    ## input GFF3, output base name and the NCBI submission metadata
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to an input file to be read')
    parser.add_argument('-o', '--output_base', type=str, required=True,
                        help='Base name of output files to be created')
    parser.add_argument('-ln', '--lab_name', type=str, required=True,
                        help='Required by NCBI to identify the submitting group')
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True,
                        help='Required and assigned by NCBI')
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False,
                        help='FASTA file of genomic sequence, if not embedded in GFF')
    parser.add_argument('-go', '--go_obo', type=str, required=False,
                        help='GO terms will not be exported unless you pass the path to a GO OBO file')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format.  The decision is made for the
    #  complete set before any assembly is touched; renaming while scanning
    #  could rewrite the .id of earlier assemblies and then abandon the
    #  renaming once a pre-formatted ID showed up.
    #  pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    has_preformatted_id = any(
        asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if not has_preformatted_id:
        ## old IDs (like tp.assembly.567468735.1) become new ones (like AAGK01000001)
        renamed = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(
                args.ncbi_acc_prefix, asm_num)
            assembly = assemblies[asm_id]
            assembly.id = new_id
            renamed[new_id] = assembly

        assemblies = renamed

    # the context manager guarantees the table is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        biocodetbl.print_tbl_from_assemblies(assemblies=assemblies,
                                             ofh=ofh,
                                             go_obo=args.go_obo,
                                             lab_name=args.lab_name)

    mset = biothings.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Report mRNAs whose translated CDS contains internal stop codons.

    Loads the features of a GFF3 file (plus an optional genome FASTA),
    translates the CDS of every mRNA and counts those whose translation has a
    '*' anywhere except at its end.  Offending translations can be written to
    a FASTA file (-o) and the first N of them printed to STDOUT (-p).
    """
    # description: added the missing "to" ("genome sequence to report ...")
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## input GFF3 / genome sequence and the optional reporting outputs
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0,
                        help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)')
    parser.add_argument('-o', '--output_fasta', type=str, required=False,
                        help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = biocodegff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        biocodeutils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    wants_debug = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if wants_debug:
                        print("CDS:{0}".format(coding_seq))

                    # one translation serves both the stop check and the report
                    translated_seq = biocodeutils.translate(coding_seq)
                    has_internal_stop = '*' in translated_seq.rstrip('*')

                    if not has_internal_stop:
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(
                            mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand))
                        fasta_out_fh.write("{0}\n".format(
                            biocodeutils.wrapped_fasta(translated_seq)))

                    if wants_debug:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))

                    # only the first N offenders are echoed to STDOUT
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id))
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(
                            translated_seq, mRNA.id))
    finally:
        # make sure the FASTA report is flushed and released even on failure
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Filter out gene models whose coding sequence is mostly masked.

    Every gene is kept unless at least one of its mRNAs has a CDS in which the
    percentage of 'N' characters reaches the user-supplied cutoff.  Kept genes
    are written to the output GFF3; removed genes are optionally written to a
    second GFF3.  A one-line summary is printed at the end.
    """
    parser = argparse.ArgumentParser(
        description='Removes gene models whose sequence has been masked.')

    ## input GFF3 / masked FASTA, the cutoff, and the output files
    parser.add_argument('-i', '--input_gff3', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-m', '--masked_fasta', type=str, required=True,
                        help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True,
                        help='Genes with an mRNA covered by this percentage of repeats will be excluded')
    parser.add_argument('-o', '--output_gff3', type=str, required=False,
                        help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False,
                        help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # -o is declared optional yet always opened below; report the missing
    # path as a usage error before any expensive parsing happens.
    if args.output_gff3 is None:
        parser.error('an output path is required: pass -o/--output_gff3')

    (assemblies, features) = biocodegff.get_gff3_features(args.input_gff3)
    biocodeutils.add_assembly_fasta(assemblies, args.masked_fasta)

    cutoff = args.percent_repeat_coverage_cutoff
    gene_count = 0
    kept_count = 0
    rem_out = None

    gff_out = open(args.output_gff3, 'wt')
    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    seq_len = len(coding_seq)

                    # no CDS residues means nothing can be masked (avoids dividing by zero)
                    perc_repeat = (coding_seq.count('N') / seq_len) * 100 if seq_len else 0

                    if perc_repeat >= cutoff:
                        keep = False

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        # release both handles even if a gene fails to export
        gff_out.close()
        if rem_out is not None:
            rem_out.close()

    # guard the summary against an input without any genes
    kept_perc = (kept_count / gene_count) * 100 if gene_count else 0.0

    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(
        kept_count, gene_count, kept_perc))