def main():
    """Report mRNAs whose CDS translation contains internal stop codons.

    Loads the features of a GFF3 file (attaching residues from a separate
    FASTA file when one is passed), translates the CDS of every mRNA and
    counts those whose translation still contains a '*' after trailing stops
    are stripped.  Affected translations can be written to a FASTA file (-o)
    and the first N of them printed to STDOUT (-p).
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence report non-terminal internal stops.')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional. Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional. Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and
    # translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None
    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    # try/finally guarantees the FASTA handle is flushed and closed even if a
    # feature raises part-way through the scan.
    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1
                    is_debug_target = debug_mRNA is not None and mRNA.id == debug_mRNA

                    if is_debug_target:
                        print("CDS:{0}".format(coding_seq))

                    # Translate once and reuse the result for both the check
                    # and the report.
                    translated_seq = utils.translate(coding_seq)

                    # Trailing '*' characters are legitimate terminal stops;
                    # anything left over is an internal stop.
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        # fmin is shifted by one for display, fmax is not
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(utils.wrapped_fasta(translated_seq)))

                    if is_debug_target:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    # Only the first N offenders are echoed to STDOUT.
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
def main():
    """Create an NCBI TBL file (plus matching FASTA) from a GFF3 file.

    Writes ``<output_base>.tbl`` and ``<output_base>.fna``.  Unless the
    assembly IDs already use the ``gnl|WGS:`` form, every assembly is renamed
    to ``gnl|WGS:<prefix>|SeqID|gb|<prefix>01NNNNNN`` first.
    """
    parser = argparse.ArgumentParser( description='Create a TBL file for submission to NCBI from GFF3')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created' )
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group' )
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI' )
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF' )
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')

    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    # Pre-formatted IDs are like this: gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    # Decide BEFORE touching any assembly: the previous single-pass loop
    # rewrote the .id of every assembly seen ahead of the first pre-formatted
    # one and then abandoned the rename, leaving objects whose .id no longer
    # matched their dictionary key.
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        # old IDs (like tp.assembly.567468735.1) become new ones (like AAGK01000001)
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            assembly = assemblies[asm_id]
            assembly.id = new_id
            new_assemblies[new_id] = assembly

        assemblies = new_assemblies

    # Context manager so the table is flushed and closed before returning.
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        tbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)

    mset = things.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
def main():
    """Drop gene models whose coding sequence is mostly masked.

    A gene is removed when any of its mRNAs has a CDS in which the percentage
    of 'N' residues reaches the cutoff passed with -p.  Kept genes are written
    to the -o GFF3 file; removed genes are written to the -r file when one is
    given.  A one-line summary is printed at the end.
    """
    parser = argparse.ArgumentParser( description='Removes gene models whose sequence has been masked.')

    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-m', '--masked_fasta', type=str, required=True, help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True, help='Genes with an mRNA covered by this percentage of repeats will be excluded' )
    # The output file is always opened below, so omitting -o used to die with
    # a TypeError from open(None).  Let argparse report the problem instead.
    parser.add_argument('-o', '--output_gff3', type=str, required=True, help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False, help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)
    utils.add_assembly_fasta(assemblies, args.masked_fasta)

    gene_count = 0
    kept_count = 0

    gff_out = open(args.output_gff3, 'wt')
    rem_out = None

    # try/finally so both handles are closed whatever happens while printing.
    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                gene_count += 1
                keep = True

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # An mRNA without CDS residues has no coverage to measure
                    # (and would otherwise divide by zero).
                    if len(coding_seq) == 0:
                        continue

                    perc_repeat = (coding_seq.count('N') / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        # One masked isoform is enough to reject the gene.
                        keep = False
                        break

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        gff_out.close()
        if rem_out is not None:
            rem_out.close()

    # Guard the percentage against an input with no genes at all.
    perc_kept = (kept_count / gene_count) * 100 if gene_count else 0.0
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, perc_kept))
def main():
    """Merge gene models with optional Barrnap / ARAGORN predictions into one GFF3.

    The model GFF is loaded together with the genomic FASTA; features from the
    Barrnap GFF and the ARAGORN raw output are added when those files are
    passed.  The combined annotation is written to -o, or to STDOUT when -o is
    omitted (as the option's help text promises).
    """
    # Local import: only needed for the STDOUT fallback.
    import sys

    parser = argparse.ArgumentParser( description='Creates a single GFF from the output of a few different model prediction tools (coding and non-coding)')

    parser.add_argument('-m', '--model_gff', type=str, required=True, help='Input (pass-through) GFF file' )
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Output file to be written. Default=STDOUT' )
    parser.add_argument('-b', '--barrnap_gff', type=str, required=False, help='GFF file from Barrnap prediction' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True, help='Source genomic FASTA file' )
    parser.add_argument('-a', '--aragorn_out', type=str, required=False, help='Raw output file (with -w) from ARAGORN prediction' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.model_gff)
    utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    if args.barrnap_gff:
        add_barrnap_features(assemblies, features, args.barrnap_gff)

    if args.aragorn_out:
        add_aragorn_features(assemblies, features, args.aragorn_out)

    # open(None) raises TypeError, so the documented STDOUT default has to be
    # handled explicitly.  STDOUT is deliberately left open.
    if args.output_gff is None:
        gff.print_gff3_from_assemblies(ofh=sys.stdout, assemblies=assemblies)
    else:
        with open(args.output_gff, 'wt') as f:
            gff.print_gff3_from_assemblies(ofh=f, assemblies=assemblies)
def main():
    """Report mRNAs lacking a terminal stop and whether one lies just downstream.

    For every mRNA whose CDS translation does not end in '*', the genome is
    scanned codon by codon past the end of the CDS (at most 100 bases beyond
    the mRNA boundary) looking for a stop codon.  Nothing is modified; the
    script only prints a trace and summary counts.
    """
    parser = argparse.ArgumentParser(
        description="Checks the CDS features against a genome sequence to report/correct phase columns."
    )

    parser.add_argument("-i", "--input_file", type=str, required=True, help="Path to the input GFF3")
    parser.add_argument(
        "-g",
        "--genome_fasta",
        type=str,
        required=False,
        help="Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF",
    )
    # NOTE(review): --output_gff is accepted but never used by this function.
    parser.add_argument(
        "-o",
        "--output_gff",
        type=str,
        required=False,
        help="Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop",
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ["TAG", "TAA", "TGA"]
    mRNA_extension_limit = 100
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith("*"):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))

                CDSs = sorted(mRNA.CDSs())
                if not CDSs:
                    # Nothing to extend from; indexing CDSs would raise.
                    print("\tNo CDS features found, skipping")
                    continue

                mRNA_loc = mRNA.location_on(assembly)
                on_forward = mRNA_loc.strand == 1

                if on_forward:
                    codon_step_size = 3
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax + mRNA_extension_limit
                else:
                    codon_step_size = -3
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin - mRNA_extension_limit

                print(
                    "\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                    end="",
                )

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if not on_forward:
                    CDS_pos += codon_step_size

                while True:
                    if (on_forward and CDS_pos > mRNA_limit) or (not on_forward and CDS_pos < mRNA_limit):
                        print(" Reached the mRNA limit")
                        break

                    # A negative start would make the slice wrap around to the
                    # other end of the molecule.
                    if CDS_pos < 0:
                        print(" Reached beginning of the molecule")
                        break

                    next_codon = assembly.residues[CDS_pos : CDS_pos + 3]

                    if len(next_codon) < 3:
                        print(" Reached end of the molecule")
                        break

                    # Residues are read from the forward strand, so a
                    # reverse-strand codon must be reverse-complemented before
                    # it can be compared with the stop codons.
                    if not on_forward:
                        next_codon = utils.reverse_complement(next_codon)

                    print(".{0}({1})".format(next_codon, CDS_pos), end="")

                    if next_codon in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs which can be corrected: {0}".format(mRNAs_corrected))
def main():
    """Extend mRNAs lacking a terminal stop codon to the next in-frame stop.

    Every mRNA whose CDS translation does not end in '*' is scanned codon by
    codon past the in-frame end of its CDS, up to --extension_limit bases
    beyond the mRNA boundary.  When a stop codon is found the mRNA is extended
    to include it.  The (possibly modified) annotation is written to -o when
    that option is passed.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100, help='Optional. Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))

                CDSs = sorted(mRNA.CDSs())
                if not CDSs:
                    # Nothing to extend from; indexing CDSs would raise.
                    print("\tNo CDS features found, skipping")
                    continue

                mRNA_loc = mRNA.location_on(assembly)
                forward = mRNA_loc.strand == 1

                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))

                if forward:
                    codon_step_size = 3
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    codon_step_size = -3
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}. Extending ... \n\t".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand.
                # On the forward strand the codon starting AT CDS_pos is the
                # first one to inspect; stepping there as well (as the code
                # previously did) skipped it, so a stop directly after the CDS
                # was missed and the model was extended past it.
                if not forward:
                    CDS_pos += codon_step_size

                while True:
                    if (forward and CDS_pos > mRNA_limit) or (not forward and CDS_pos < mRNA_limit):
                        print(" Reached the mRNA limit")
                        break

                    # Position 0 is still a valid codon start; only a negative
                    # start runs off the molecule.
                    if CDS_pos < 0:
                        print(" Reached beginning of the molecule")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                    if len(next_codon) < 3:
                        print(" Reached end of the molecule")
                        break

                    # The coordinates printed are those of the slice read above.
                    if forward:
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos + 3), end='')
                    else:
                        next_codon = utils.reverse_complement(next_codon)
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos + 3, CDS_pos), end='')

                    if next_codon in stop_codons:
                        # Forward: new end is just past the stop codon.
                        # Reverse: new end is the low coordinate of the codon.
                        new_end = CDS_pos + 3 if forward else CDS_pos
                        mRNA.extend_stop(on=assembly, to=new_end)
                        print(" Found a stop, extending to: {0} ({1})".format(new_end, mRNA_loc.strand))
                        new_stop_found = True
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # -o is optional: open(None) would raise after all the work was done.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Extend gene models that lack a terminal stop to the next in-frame stop.

    mRNAs whose translated CDS does not end with '*' are walked one codon at a
    time beyond the in-frame CDS end, no further than --extension_limit bases
    past the mRNA boundary.  The first stop codon met becomes the new end of
    the mRNA.  The resulting annotation is written to -o when given.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional. Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        molecule = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   molecule.length))

        for gene in sorted(molecule.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}"
                      .format(coding_seq[-3:], translation[-3:]))

                CDSs = sorted(mRNA.CDSs())
                if not CDSs:
                    # Without CDS features there is no end to extend.
                    print("\tNo CDS features found, skipping")
                    continue

                mRNA_loc = mRNA.location_on(molecule)
                is_forward = mRNA_loc.strand == 1

                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))

                codon_step_size = 3

                if is_forward:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(
                        molecule).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(
                        molecule).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}. Extending ... \n\t"
                      .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                              mRNA_loc.strand),
                      end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand.
                # The forward strand must NOT be stepped: its first candidate
                # codon begins exactly at CDS_pos, and stepping there too
                # meant that codon was never examined.
                if not is_forward:
                    CDS_pos += codon_step_size

                while True:
                    beyond_limit = (CDS_pos > mRNA_limit
                                    if is_forward else CDS_pos < mRNA_limit)

                    if beyond_limit:
                        print(" Reached the mRNA limit")
                        break

                    # Only a negative start is off the molecule; a codon may
                    # legitimately start at position 0.
                    if CDS_pos < 0:
                        print(" Reached beginning of the molecule")
                        break

                    next_codon = molecule.residues[CDS_pos:CDS_pos + 3]

                    if len(next_codon) < 3:
                        print(" Reached end of the molecule")
                        break

                    # Printed coordinates match the slice that was just read.
                    if is_forward:
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos,
                                                     CDS_pos + 3),
                              end='')
                    else:
                        next_codon = utils.reverse_complement(next_codon)
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos + 3,
                                                     CDS_pos),
                              end='')

                    if next_codon in stop_codons:
                        # The stop codon itself is included in the extension.
                        if is_forward:
                            extend_to = CDS_pos + 3
                        else:
                            extend_to = CDS_pos

                        mRNA.extend_stop(on=molecule, to=extend_to)
                        print(" Found a stop, extending to: {0} ({1})".format(
                            extend_to, mRNA_loc.strand))
                        new_stop_found = True
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # The output path is optional, so only write when it was supplied, and
    # close the handle once the annotation has been printed.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Export protein or CDS sequences of a GFF3 file as FASTA.

    One record is written per mRNA (or polypeptide, see --feature_type) to the
    -o file or STDOUT.  The record ID is the feature's locus tag when it has
    one, otherwise the feature ID; the product name is appended to the header
    when available.  Features without CDS residues are skipped with a
    warning.  --check_ends and --check_internal_stops emit warnings on STDERR.

    Raises:
        Exception: if --check_internal_stops is combined with --type cds.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS seqeunces from a GFF3 file')

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='protein',
                        choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help=
        'If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option'
    )
    parser.add_argument(
        '-ft',
        '--feature_type',
        type=str,
        required=False,
        default='mRNA',
        choices=['mRNA', 'polypeptide'],
        help='IDs and coordinates will come from this feature type')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.add_argument('--check_internal_stops',
                        dest='check_internal_stops',
                        action='store_true')
    parser.set_defaults(check_ends=False, check_internal_stops=False)
    args = parser.parse_args()

    # sanity option check -- done before the output file is created so an
    # invalid combination does not leave an empty file behind
    if args.check_internal_stops and args.type == 'cds':
        raise Exception(
            "Error: Checking internal stops for CDS features not currently supported."
        )

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # only doing the standard codon table for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons = ['TAG', 'TAA', 'TGA']

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        utils.add_assembly_fasta(assemblies, args.fasta)

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                if args.feature_type == 'mRNA':
                    feats = gene.mRNAs()
                else:
                    feats = gene.polypeptides()

                for feat in feats:
                    ## initial values of id and header to export (can be
                    ## overridden by available annotation)
                    export_id = feat.id
                    export_header = None

                    if args.feature_type == 'mRNA':
                        ## Add the gene product name if there is one
                        for polypeptide in feat.polypeptides():
                            annot = polypeptide.annotation
                            if annot is not None and annot.product_name is not None:
                                export_header = annot.product_name
                                break

                        coding_seq = feat.get_CDS_residues(
                            for_translation=True)

                        if feat.locus_tag is not None:
                            export_id = feat.locus_tag
                    else:
                        # Same None guard as the mRNA branch: a polypeptide
                        # without annotation must not raise AttributeError.
                        if feat.annotation is not None:
                            export_header = feat.annotation.product_name

                        coding_seq = feat.parent.get_CDS_residues(
                            for_translation=True)

                        if feat.parent.locus_tag is not None:
                            export_id = feat.parent.locus_tag

                    if len(coding_seq) == 0:
                        print(
                            "WARNING: Skipped feature {0} because it had no associated CDS features"
                            .format(export_id),
                            file=sys.stderr)
                        continue

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                                .format(start_codon, feat.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                                .format(stop_codon, feat.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(
                            utils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = utils.translate(coding_seq)

                        if args.check_internal_stops:
                            # the final residue may be the legitimate stop
                            internal_stop_count = translated_seq[:-1].count(
                                '*')
                            if internal_stop_count > 0:
                                sys.stderr.write(
                                    "Found {0} internal stops in mRNA {1}\n".
                                    format(internal_stop_count, feat.id))

                        fout.write("{0}\n".format(
                            utils.wrapped_fasta(translated_seq)))
    finally:
        # Close only what this function opened; STDOUT stays usable.
        if fout is not sys.stdout:
            fout.close()
def main():
    """Report mRNAs without a terminal stop and look for one just downstream.

    Each mRNA whose CDS translation does not end in '*' is followed codon by
    codon beyond its CDS end, up to 100 bases past the mRNA boundary, until a
    stop codon is met.  The annotation itself is left untouched; a trace and
    summary counts are printed.
    """
    parser = argparse.ArgumentParser(
        description=
        'Checks the CDS features against a genome sequence to report/correct phase columns.'
    )

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    # NOTE(review): --output_gff is parsed but nothing below reads it.
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNA_extension_limit = 100
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        molecule = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   molecule.length))

        for gene in sorted(molecule.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))

                CDSs = sorted(mRNA.CDSs())
                if not CDSs:
                    # No CDS to start from; CDSs[-1] / CDSs[0] would raise.
                    print("\tNo CDS features found, skipping")
                    continue

                mRNA_loc = mRNA.location_on(molecule)
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    CDS_pos = CDSs[-1].location_on(molecule).fmax
                    mRNA_limit = mRNA_loc.fmax + mRNA_extension_limit
                else:
                    CDS_pos = CDSs[0].location_on(molecule).fmin
                    mRNA_limit = mRNA_loc.fmin - mRNA_extension_limit
                    codon_step_size = -3

                reverse = codon_step_size < 0

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                      end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if reverse:
                    CDS_pos += codon_step_size

                while True:
                    if (reverse and CDS_pos < mRNA_limit) or (
                            not reverse and CDS_pos > mRNA_limit):
                        print(" Reached the mRNA limit")
                        break

                    # Negative slice bounds would index from the other end
                    # of the molecule, so stop at its start.
                    if CDS_pos < 0:
                        print(" Reached beginning of the molecule")
                        break

                    next_codon = molecule.residues[CDS_pos:CDS_pos + 3]

                    if len(next_codon) < 3:
                        print(" Reached end of the molecule")
                        break

                    # The slice is forward-strand sequence; reverse-strand
                    # codons have to be reverse-complemented before testing
                    # them against the stop codon list.
                    if reverse:
                        next_codon = utils.reverse_complement(next_codon)

                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    if next_codon in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs which can be corrected: {0}".format(mRNAs_corrected))
def main():
    """Write the protein or CDS sequence of each mRNA/polypeptide as FASTA.

    Records go to the -o file or to STDOUT.  The locus tag is used as record
    ID when the feature has one (feature ID otherwise) and the product name,
    when known, follows it in the header.  Features with no CDS residues are
    skipped with a warning on STDERR instead of producing a header-only
    record.  --check_ends / --check_internal_stops add STDERR warnings.

    Raises:
        Exception: if --check_internal_stops is combined with --type cds.
    """
    parser = argparse.ArgumentParser( description='Extracts the protein or CDS seqeunces from a GFF3 file')

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option' )
    parser.add_argument('-ft', '--feature_type', type=str, required=False, default='mRNA', choices=['mRNA', 'polypeptide'], help='IDs and coordinates will come from this feature type' )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.add_argument('--check_internal_stops', dest='check_internal_stops', action='store_true')
    parser.set_defaults(check_ends=False, check_internal_stops=False)
    args = parser.parse_args()

    # sanity option check (before any output file is created)
    if args.check_internal_stops and args.type == 'cds':
        raise Exception("Error: Checking internal stops for CDS features not currently supported.")

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # only doing the standard codon table for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons = ['TAG', 'TAA', 'TGA']

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        utils.add_assembly_fasta(assemblies, args.fasta)

    use_mRNAs = args.feature_type == 'mRNA'

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                feats = gene.mRNAs() if use_mRNAs else gene.polypeptides()

                for feat in feats:
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = feat.id
                    export_header = None

                    if use_mRNAs:
                        ## Add the gene product name if there is one
                        for polypeptide in feat.polypeptides():
                            if polypeptide.annotation is None:
                                continue
                            if polypeptide.annotation.product_name is not None:
                                export_header = polypeptide.annotation.product_name
                                break

                        cds_owner = feat
                    else:
                        # Guard against polypeptides that carry no annotation,
                        # exactly as the mRNA branch does.
                        if feat.annotation is not None:
                            export_header = feat.annotation.product_name

                        cds_owner = feat.parent

                    coding_seq = cds_owner.get_CDS_residues(for_translation=True)

                    if cds_owner.locus_tag is not None:
                        export_id = cds_owner.locus_tag

                    # Without CDS residues there is no sequence to export; a
                    # bare header line would yield an invalid FASTA record.
                    if len(coding_seq) == 0:
                        print("WARNING: Skipped feature {0} because it had no associated CDS features".format(export_id), file=sys.stderr)
                        continue

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, feat.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, feat.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(utils.wrapped_fasta(coding_seq)))
                        continue

                    translated_seq = utils.translate(coding_seq)

                    if args.check_internal_stops:
                        # the last residue may be the legitimate terminal stop
                        internal_stop_count = translated_seq[:-1].count('*')
                        if internal_stop_count > 0:
                            sys.stderr.write("Found {0} internal stops in mRNA {1}\n".format(internal_stop_count, feat.id))

                    fout.write("{0}\n".format(utils.wrapped_fasta(translated_seq)))
    finally:
        # Never close STDOUT, only a file opened here.
        if fout is not sys.stdout:
            fout.close()
def main():
    """Flag gene models as 5' and/or 3' partial based on their CDS ends.

    An mRNA whose CDS translation does not end in '*' is marked 3' partial;
    one whose first codon is not ATG, GTG or TTG is marked 5' partial.  The
    flag is set on the mRNA, its gene, the terminal CDS and the terminal exon
    at the matching end (fmax or fmin, depending on strand).  The annotation
    is written to -o when that option is passed.
    """
    # The description and -o help used to be copied from the
    # "extend to stop" tool and described a different program.
    parser = argparse.ArgumentParser(
        description=
        "Marks gene models as 5' and/or 3' partial when the CDS lacks a canonical start codon or a terminal stop"
    )

    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional. Writes an output GFF3 file with the partial flags applied'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    def mark_partial(assembly, gene_loc, mRNA, mRNA_loc, CDSs, at_fmax):
        """Set the fmax/fmin partial flag on the mRNA, gene, end CDS and end exon."""
        flag = 'fmax_partial' if at_fmax else 'fmin_partial'
        end_index = -1 if at_fmax else 0

        setattr(mRNA_loc, flag, True)
        setattr(gene_loc, flag, True)
        setattr(CDSs[end_index].location_on(assembly), flag, True)

        # The exon is tricky, as there's no direct link between the CDS fragment
        # and the corresponding exon.  The assumption here is that there won't
        # be terminal non-coding exons if the CDS is partial.
        # NOTE(review): CDSs are sorted before indexing but exons are taken
        # in whatever order mRNA.exons() returns them -- confirm that order
        # is by coordinate.
        exons = mRNA.exons()
        if exons:
            setattr(exons[end_index].location_on(assembly), flag, True)

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                CDSs = sorted(mRNA.CDSs())

                # An mRNA without CDS features has no coding ends to judge
                # (and indexing CDSs below would raise IndexError).
                if not CDSs:
                    continue

                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)
                forward = mRNA_loc.strand == 1

                # The 3' end is fmax on the forward strand, fmin otherwise.
                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    mark_partial(assembly, gene_loc, mRNA, mRNA_loc, CDSs,
                                 at_fmax=forward)

                # The 5' end is fmin on the forward strand, fmax otherwise.
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    mark_partial(assembly, gene_loc, mRNA, mRNA_loc, CDSs,
                                 at_fmax=not forward)

    print(
        "Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print(
        "Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # -o is optional; open(None) would raise after all the work was done.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Mark gene models whose CDS is incomplete as 5' and/or 3' partial.

    3' partial: the translated CDS does not end with '*'.
    5' partial: the first CDS codon is not one of ATG, GTG, TTG.
    The partial flag is applied to the locations of the mRNA, its gene, the
    outermost CDS and the outermost exon on the affected side.  When -o is
    passed the resulting annotation is written there as GFF3.
    """
    # Description and -o help previously described the "extend to stop"
    # script this one was derived from.
    parser = argparse.ArgumentParser( description="Marks gene models as 5' and/or 3' partial when the CDS lacks a canonical start codon or a terminal stop")

    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional. Writes an output GFF3 file with the partial flags applied')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        molecule = assemblies[assembly_id]

        def flag_low_end(gene_loc, mRNA_loc, CDSs, exons):
            """Mark the fmin side of the gene, mRNA, first CDS and first exon."""
            mRNA_loc.fmin_partial = True
            gene_loc.fmin_partial = True
            CDSs[0].location_on(molecule).fmin_partial = True
            if exons:
                exons[0].location_on(molecule).fmin_partial = True

        def flag_high_end(gene_loc, mRNA_loc, CDSs, exons):
            """Mark the fmax side of the gene, mRNA, last CDS and last exon."""
            mRNA_loc.fmax_partial = True
            gene_loc.fmax_partial = True
            CDSs[-1].location_on(molecule).fmax_partial = True
            if exons:
                exons[-1].location_on(molecule).fmax_partial = True

        for gene in sorted(molecule.genes()):
            gene_loc = gene.location_on(molecule)

            for mRNA in gene.mRNAs():
                CDSs = sorted(mRNA.CDSs())

                # Skip mRNAs with no CDS: there are no coding ends to assess
                # and CDSs[0] / CDSs[-1] would raise IndexError.
                if not CDSs:
                    continue

                # The exon is tricky, as there's no direct link between the CDS fragment
                # and the corresponding exon.  The assumption here is that there won't
                # be terminal non-coding exons if the CDS is partial.
                # NOTE(review): unlike the CDSs, exons are not sorted here --
                # confirm mRNA.exons() returns them in coordinate order.
                exons = mRNA.exons()

                mRNA_loc = mRNA.location_on(molecule)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)

                # Which coordinate end is 3' / 5' depends on the strand.
                if mRNA_loc.strand == 1:
                    flag_3prime, flag_5prime = flag_high_end, flag_low_end
                else:
                    flag_3prime, flag_5prime = flag_low_end, flag_high_end

                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    flag_3prime(gene_loc, mRNA_loc, CDSs, exons)

                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    flag_5prime(gene_loc, mRNA_loc, CDSs, exons)

    print ("Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print ("Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # The output path is optional: write (and close) only when it was given.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)