def main():
    """Annotate reference genes with dbxrefs naming the query genes they overlap.

    Reads a reference GFF3 and a query GFF3, and for every reference gene that
    has at least one polypeptide, compares it against every query gene on the
    same assembly ID using ``overlaps_with``.  Each overlapping query gene ID
    is recorded as an ``overlaps_old_locusTagID:<id>`` dbxref on the annotation
    of the reference gene's first polypeptide.  The (modified) reference
    assemblies are then written as GFF3 to ``--output_file``.

    Progress and warnings are printed to STDOUT:
      * assemblies present in the reference but absent from the query are
        skipped with a WARN line;
      * reference genes without polypeptides are skipped with a WARN line;
      * genes with one or more overlaps get an INFO line with the count.
    """
    parser = argparse.ArgumentParser(
        description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    ## output file to be written
    parser.add_argument('-r', '--reference_file', type=str, required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument('-q', '--query_file', type=str, required=True,
                        help='GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o', '--output_file', type=str, required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # Only the assembly collections are used; the flat feature indexes that
    # get_gff3_features() also returns are not needed here.
    (ref_assemblies, _) = gff.get_gff3_features(args.reference_file)
    (qry_assemblies, _) = gff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            overlaps = list()
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # The dbxrefs are attached to the first polypeptide's annotation.
            ref_annot = polypeptides[0].annotation

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if overlap:
                    overlaps.append(overlap)
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, len(overlaps)))

    # Use a context manager so the output is flushed and closed deterministically.
    with open(args.output_file, 'w') as ofh:
        gff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
def main():
    """Merge coding and non-coding model predictions into a single GFF3.

    Loads the pass-through GFF (``--model_gff``), attaches the genomic FASTA
    (``--genomic_fasta``) to the loaded assemblies, and then optionally adds
    features from a Barrnap GFF (``--barrnap_gff``) and/or raw ARAGORN output
    (``--aragorn_out``) via ``add_barrnap_features`` / ``add_aragorn_features``.

    The combined assemblies are written as GFF3 to ``--output_gff``; when that
    option is omitted the GFF3 is written to STDOUT, as the option's help text
    promises.
    """
    # Local import: needed only for the STDOUT fallback below.
    import sys

    parser = argparse.ArgumentParser(
        description='Creates a single GFF from the output of a few different model prediction tools (coding and non-coding)')

    ## output file to be written
    parser.add_argument('-m', '--model_gff', type=str, required=True,
                        help='Input (pass-through) GFF file')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Output file to be written. Default=STDOUT')
    parser.add_argument('-b', '--barrnap_gff', type=str, required=False,
                        help='GFF file from Barrnap prediction')
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True,
                        help='Source genomic FASTA file')
    parser.add_argument('-a', '--aragorn_out', type=str, required=False,
                        help='Raw output file (with -w) from ARAGORN prediction')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.model_gff)
    utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    if args.barrnap_gff:
        add_barrnap_features(assemblies, features, args.barrnap_gff)

    if args.aragorn_out:
        add_aragorn_features(assemblies, features, args.aragorn_out)

    if args.output_gff:
        with open(args.output_gff, 'wt') as f:
            gff.print_gff3_from_assemblies(ofh=f, assemblies=assemblies)
    else:
        # No output path given: open(None, 'wt') would raise TypeError, so
        # honour the documented default and stream to STDOUT instead.
        gff.print_gff3_from_assemblies(ofh=sys.stdout, assemblies=assemblies)
def main():
    """Extend mRNAs whose CDS lacks a terminal stop to the next in-frame stop.

    For every mRNA in the input GFF3 the CDS residues are translated.  If the
    translation already ends with ``*`` the mRNA is only counted.  Otherwise
    the genomic sequence is scanned codon by codon (stepping +3 on strand 1,
    -3 otherwise) starting from the in-frame end of the terminal CDS, until
    one of the following happens:

      * a stop codon (TAG/TAA/TGA) is read, in which case ``mRNA.extend_stop``
        is called with the new coordinate;
      * the scan position passes ``--extension_limit`` bases beyond the mRNA
        boundary;
      * the scan position drops below 1.

    A verbose trace of the scan and summary counts are printed to STDOUT.  If
    ``--output_gff`` is supplied, the resulting assemblies are written there
    as GFF3; if it is omitted only the report is produced.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100,
                        help='Optional. Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))

                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())

                # Bases left over after the last complete codon of the CDS.
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))

                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}. Extending ... \n\t".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                # NOTE(review): this initial step is applied on both strands, so
                # on strand 1 the codon beginning at the starting CDS_pos is
                # never examined - confirm that this is intended.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print(" Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        # Also prevents a negative index from wrapping around
                        # in the slice below.
                        print(" Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos - 3), end='')
                        else:
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos - 3, CDS_pos), end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional: open(None, 'wt') would raise TypeError after
    # all the work above, so only write when a path was actually supplied.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Record overlapping query genes as dbxrefs on a reference annotation.

    Both ``--reference_file`` and ``--query_file`` are parsed as GFF3.  Within
    each assembly ID shared by the two, every reference gene that has at least
    one polypeptide is tested against every query gene with ``overlaps_with``.
    For each hit an ``overlaps_old_locusTagID:<query gene id>`` dbxref is added
    to the annotation of the reference gene's first polypeptide.  Finally the
    reference assemblies are written as GFF3 to ``--output_file``.

    WARN lines are printed for assemblies missing from the query set and for
    reference genes without polypeptides (both are skipped); an INFO line is
    printed for each reference gene with one or more overlaps.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies'
    )

    ## output file to be written
    parser.add_argument('-r',
                        '--reference_file',
                        type=str,
                        required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument(
        '-q',
        '--query_file',
        type=str,
        required=True,
        help=
        'GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # The second element of each returned pair (the feature index) is unused.
    (ref_assemblies, _) = gff.get_gff3_features(args.reference_file)
    (qry_assemblies, _) = gff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print(
                "WARN: expected to find assembly_id {0} in both reference and query sets"
                .format(assembly_id))
            continue

        for ref_gene in ref_assemblies[assembly_id].genes():
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".
                      format(ref_gene.id))
                continue

            # dbxrefs go on the annotation of the first polypeptide
            ref_annot = polypeptides[0].annotation
            overlaps = list()

            for qry_gene in qry_assemblies[assembly_id].genes():
                overlap = ref_gene.overlaps_with(qry_gene)

                if not overlap:
                    continue

                overlaps.append(overlap)
                # add a dbxref to the gene
                ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(
                    qry_gene.id))

            if len(overlaps) > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(
                    ref_gene.id, len(overlaps)))

    # Close the output handle explicitly instead of leaking it to the GC.
    with open(args.output_file, 'w') as ofh:
        gff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
def main():
    """Scan past CDSs that lack a stop codon and extend them to the next one.

    Each mRNA's CDS residues are translated; mRNAs whose translation ends in
    ``*`` are counted and left alone.  For the rest, the assembly residues are
    read three bases at a time from the in-frame end of the terminal CDS
    (ascending on strand 1, descending otherwise, reverse-complementing each
    codon on strand -1).  The scan stops when a TAG/TAA/TGA codon is read (the
    mRNA is then extended through ``mRNA.extend_stop``), when it passes
    ``--extension_limit`` bases beyond the mRNA boundary, or when the position
    falls below 1.

    A trace of every codon examined plus summary totals go to STDOUT.  The
    updated assemblies are written as GFF3 only when ``--output_gff`` is given.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional. Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional. Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        print("Assembly {0} has length {1}".format(
            assembly_id, assemblies[assembly_id].length))

        for gene in sorted(assemblies[assembly_id].genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                else:
                    print("gene:{1}, mRNA: {0} is missing a stop".format(
                        mRNA.id, gene.id))
                    print("\tCDS: {0}".format(coding_seq))
                    print(
                        "\tcoding sequence ends with {0}, last three a.a.: {1}"
                        .format(coding_seq[-3:], translation[-3:]))

                    mRNA_loc = mRNA.location_on(assemblies[assembly_id])
                    CDSs = sorted(mRNA.CDSs())

                    # trailing bases that do not form a complete codon
                    CDS_frame_overhang = len(coding_seq) % 3
                    print(
                        "\tCDS frame overhang: {0}".format(CDS_frame_overhang))

                    codon_step_size = 3

                    if mRNA_loc.strand == 1:
                        # get the in-frame end coordinate of the last CDS position
                        CDS_pos = CDSs[-1].location_on(
                            assemblies[assembly_id]).fmax - CDS_frame_overhang
                        mRNA_limit = mRNA_loc.fmax + args.extension_limit
                    else:
                        # get the in-frame end coordinate of the last CDS position
                        CDS_pos = CDSs[0].location_on(
                            assemblies[assembly_id]).fmin + CDS_frame_overhang
                        mRNA_limit = mRNA_loc.fmin - args.extension_limit
                        codon_step_size = -3

                    print(
                        "\tmRNA:{0}-{1} ({3}), CDS end: {2}. Extending ... \n\t"
                        .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                                mRNA_loc.strand),
                        end='')

                    new_stop_found = False

                    # We have to step backwards to start if on the reverse strand
                    # NOTE(review): the step happens on strand 1 as well, which
                    # skips the codon at the starting CDS_pos - verify intent.
                    CDS_pos += codon_step_size

                    while True:
                        if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (
                                mRNA_loc.strand == -1
                                and CDS_pos < mRNA_limit):
                            print(" Reached the mRNA limit")
                            break
                        elif CDS_pos < 1:
                            # guards the slice below against negative indexes
                            print(" Reached beginning of the molecule")
                            break
                        else:
                            next_codon = assemblies[assembly_id].residues[
                                CDS_pos:CDS_pos + 3]

                            if mRNA_loc.strand == -1:
                                next_codon = utils.reverse_complement(
                                    next_codon)
                                print(".{0}({1}-{2})".format(
                                    next_codon, CDS_pos, CDS_pos - 3),
                                      end='')
                            else:
                                print(".{0}({1}-{2})".format(
                                    next_codon, CDS_pos - 3, CDS_pos),
                                      end='')

                            if next_codon in stop_codons:
                                if mRNA_loc.strand == 1:
                                    mRNA.extend_stop(
                                        on=assemblies[assembly_id],
                                        to=(CDS_pos + 3))
                                    print(
                                        " Found a stop, extending to: {0} ({1})"
                                        .format(CDS_pos + 3, mRNA_loc.strand))
                                else:
                                    mRNA.extend_stop(
                                        on=assemblies[assembly_id],
                                        to=CDS_pos)
                                    print(
                                        " Found a stop, extending to: {0} ({1})"
                                        .format(CDS_pos, mRNA_loc.strand))

                                new_stop_found = True
                                break

                        CDS_pos += codon_step_size

                    if new_stop_found:
                        print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                        mRNAs_corrected += 1
                    else:
                        print("\tCDS_pos: SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # The output path is optional; opening None would raise TypeError, so the
    # GFF3 is only written (and the handle closed) when a path was provided.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Flag gene models as 5' and/or 3' partial based on their CDS ends.

    For every mRNA in the input GFF3:

      * if the translation of its CDS residues does not end with ``*`` the
        model is flagged 3' partial;
      * if the first three CDS residues (upper-cased, U->T) are not one of
        ATG/GTG/TTG the model is flagged 5' partial.

    Flagging sets ``fmin_partial`` or ``fmax_partial`` (chosen by strand) to
    True on the locations of the gene, the mRNA, the terminal CDS and the
    terminal exon.  Counts are printed to STDOUT and, when ``--output_gff`` is
    supplied, the assemblies are written there as GFF3.
    """
    parser = argparse.ArgumentParser(
        description=
        "Marks gene models as 5' or 3' partial when the CDS lacks a start or stop codon"
    )

    ## output file to be written
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional. Writes an output GFF3 file after the partial flags have been set'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    def flag_partial(assembly, gene_loc, mRNA, mRNA_loc, at_fmax):
        """Set the fmax (or fmin) partial flag on gene, mRNA, CDS and exon."""
        attr = 'fmax_partial' if at_fmax else 'fmin_partial'
        terminal = -1 if at_fmax else 0
        CDSs = sorted(mRNA.CDSs())

        setattr(mRNA_loc, attr, True)
        setattr(gene_loc, attr, True)
        setattr(CDSs[terminal].location_on(assembly), attr, True)
        # The exon is tricky, as there's no direct link between the CDS fragment
        # and the corresponding exon.  The assumption here is that there won't
        # be terminal non-coding exons if the CDS is partial.
        setattr(mRNA.exons()[terminal].location_on(assembly), attr, True)

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)
                on_forward = mRNA_loc.strand == 1

                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    # the 3' end is fmax on strand 1 and fmin otherwise
                    flag_partial(assembly, gene_loc, mRNA, mRNA_loc,
                                 at_fmax=on_forward)

                start_codon = coding_seq[0:3].upper().replace('U', 'T')

                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    # the 5' end is fmin on strand 1 and fmax otherwise
                    flag_partial(assembly, gene_loc, mRNA, mRNA_loc,
                                 at_fmax=not on_forward)

    print(
        "Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print(
        "Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # --output_gff is optional; open(None, 'wt') would raise TypeError, so
    # write (and close) the file only when a path was given.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Mark gene models whose CDS lacks a start or stop codon as partial.

    Every mRNA in the input GFF3 is checked twice:

      * 3' end - the translated CDS must end with ``*``;
      * 5' end - the first three CDS residues (upper-cased, U->T) must be one
        of ATG, GTG or TTG.

    When a check fails, the strand-appropriate ``fmin_partial`` /
    ``fmax_partial`` attribute is set to True on the gene, mRNA, terminal CDS
    and terminal exon locations.  The number of flagged models is printed to
    STDOUT, and the assemblies are written as GFF3 to ``--output_gff`` when
    that option is supplied.
    """
    parser = argparse.ArgumentParser(
        description="Marks gene models as 5' or 3' partial when the CDS lacks a start or stop codon")

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional. You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional. Writes an output GFF3 file after the partial flags have been set')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)

                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmax_partial = True
                        CDSs[-1].location_on(assembly).fmax_partial = True
                        gene_loc.fmax_partial = True

                        # The exon is tricky, as there's no direct link between the CDS fragment
                        # and the corresponding exon.  The assumption here is that there won't
                        # be terminal non-coding exons if the CDS is partial.
                        mRNA.exons()[-1].location_on(assembly).fmax_partial = True
                    else:
                        mRNA_loc.fmin_partial = True
                        gene_loc.fmin_partial = True
                        CDSs[0].location_on(assembly).fmin_partial = True
                        mRNA.exons()[0].location_on(assembly).fmin_partial = True

                start_codon = coding_seq[0:3].upper().replace('U', 'T')

                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmin_partial = True
                        CDSs[0].location_on(assembly).fmin_partial = True
                        gene_loc.fmin_partial = True

                        # The exon is tricky, as there's no direct link between the CDS fragment
                        # and the corresponding exon.  The assumption here is that there won't
                        # be terminal non-coding exons if the CDS is partial.
                        mRNA.exons()[0].location_on(assembly).fmin_partial = True
                    else:
                        mRNA_loc.fmax_partial = True
                        gene_loc.fmax_partial = True
                        CDSs[-1].location_on(assembly).fmax_partial = True
                        mRNA.exons()[-1].location_on(assembly).fmax_partial = True

    print("Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print("Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # The output path is optional: only open (and reliably close) the file
    # when one was supplied, instead of failing on open(None, 'wt').
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)