def main():
    """Report mRNAs whose CDS translation contains internal (non-terminal) stops.

    Loads the features of a GFF3 file (plus an optional genome FASTA),
    translates the CDS residues of every mRNA and counts those whose
    translation still contains a '*' once all trailing '*' characters have
    been stripped.  The first N offenders can be printed (-p) and all of
    them can be written, translated, to a FASTA file (-o).  Totals are
    printed to STDOUT at the end.
    """
    parser = argparse.ArgumentParser( description='Checks the CDS features against a genome sequence to report non-terminal internal stops.')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-p', '--print_n_with_stops', type=int, required=False, default=0, help='Optional.  Pass the number of sequences with internal stops you want printed (usually for debugging purposes)' )
    parser.add_argument('-o', '--output_fasta', type=str, required=False, help='Optional.  Writes an output (translated) FASTA file for all those features which had internal stops')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_stops = 0

    # If this is set to the ID of any particular mRNA feature, the CDS and translation will be printed for it.
    debug_mRNA = None

    fasta_out_fh = None

    if args.output_fasta is not None:
        fasta_out_fh = open(args.output_fasta, 'wt')

    # try/finally guarantees the FASTA handle is flushed and closed even if
    # a feature lookup or translation fails part-way through.
    try:
        for assembly_id in assemblies:
            assembly = assemblies[assembly_id]

            for gene in assembly.genes():
                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()
                    total_mRNAs += 1

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("CDS:{0}".format(coding_seq))

                    # Translate once and reuse the result for the check and the reports
                    translated_seq = utils.translate(coding_seq)

                    # Trailing '*' are terminal stops; anything left over is internal
                    if '*' not in translated_seq.rstrip('*'):
                        continue

                    mRNAs_with_stops += 1

                    if fasta_out_fh is not None:
                        loc = mRNA.location_on(assembly)
                        # fmin is reported +1 so the header shows a 1-based start coordinate
                        fasta_out_fh.write(">{0} {1} {2}-{3} ({4})\n".format(mRNA.id, assembly_id, loc.fmin + 1, loc.fmax, loc.strand) )
                        fasta_out_fh.write("{0}\n".format(utils.wrapped_fasta(translated_seq)))

                    if debug_mRNA is not None and mRNA.id == debug_mRNA:
                        print("TRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )

                    # Only the first N offenders are echoed to STDOUT
                    if mRNAs_with_stops <= args.print_n_with_stops:
                        print("\nmRNA id: {0}".format(mRNA.id) )
                        print("\tCDS:{0}".format(coding_seq))
                        print("\tTRANSLATION WITH STOP ({1}): {0}".format(translated_seq, mRNA.id) )
    finally:
        if fasta_out_fh is not None:
            fasta_out_fh.close()

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs with embedded stops: {0}".format(mRNAs_with_stops))
Example #2
0
def main():
    """Create an NCBI TBL file and a matching FASTA file from a GFF3 file.

    Writes '<output_base>.tbl' via tbl.print_tbl_from_assemblies() and
    '<output_base>.fna' via things.AssemblySet.write_fasta().  Unless any
    assembly ID already starts with 'gnl|WGS:', every assembly is renamed to
    'gnl|WGS:<prefix>|SeqID|gb|<prefix>01NNNNNN', numbered from 1 in the
    iteration order of the assemblies dict.
    """
    parser = argparse.ArgumentParser( description='Create a TBL file for submission to NCBI from GFF3')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input file to be read' )
    parser.add_argument('-o', '--output_base', type=str, required=True, help='Base name of output files to be created' )
    parser.add_argument('-ln', '--lab_name', type=str, required=True, help='Required by NCBI to identify the submitting group' )
    parser.add_argument('-nap', '--ncbi_acc_prefix', type=str, required=True, help='Required and assigned by NCBI' )
    parser.add_argument('-gf', '--genomic_fasta', type=str, required=False, help='FASTA file of genomic sequence, if not embedded in GFF' )
    parser.add_argument('-go', '--go_obo', type=str, required=False, help='GO terms will not be exported unless you pass the path to a GO OBO file')

    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    if args.genomic_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    ## We need to first check the ID format.  Pre-formatted IDs are like this:
    ##   gnl|WGS:XXXX|SeqID|gb|XXXX01xxxxxx
    ## The decision is made BEFORE touching any assembly so that a
    ## pre-formatted ID found late in the dict cannot leave the earlier
    ## assemblies half-renamed.
    reformat_IDs = not any(asm_id.startswith('gnl|WGS:') for asm_id in assemblies)

    if reformat_IDs:
        ## keyed on the new IDs (like AAGK01000001) instead of the old ones
        ## (like tp.assembly.567468735.1)
        new_assemblies = dict()

        for asm_num, asm_id in enumerate(assemblies, start=1):
            new_id = "gnl|WGS:{0}|SeqID|gb|{0}01{1:06d}".format(args.ncbi_acc_prefix, asm_num)
            new_assemblies[new_id] = assemblies[asm_id]
            new_assemblies[new_id].id = new_id

        assemblies = new_assemblies

    # The context manager makes sure the table is flushed and closed
    with open("{0}.tbl".format(args.output_base), 'wt') as ofh:
        tbl.print_tbl_from_assemblies(assemblies=assemblies, ofh=ofh, go_obo=args.go_obo, lab_name=args.lab_name)

    mset = things.AssemblySet()
    mset.load_from_dict(assemblies)
    mset.write_fasta(path="{0}.fna".format(args.output_base))
Example #3
0
def main():
    """Drop gene models whose coding sequence is mostly masked with 'N'.

    For every gene, the CDS residues of each of its mRNAs are taken from the
    masked FASTA and the percentage of 'N' characters is computed.  If any
    mRNA reaches the cutoff (-p) the gene is excluded from the output GFF3
    and, when -r is given, written to the 'removed' GFF3 instead.  A summary
    line is printed to STDOUT at the end.
    """
    parser = argparse.ArgumentParser( description='Removes gene models whose sequence has been masked.')

    ## command-line options
    parser.add_argument('-i', '--input_gff3', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-m', '--masked_fasta', type=str, required=True, help='FASTA with sequence masked with N characters')
    parser.add_argument('-p', '--percent_repeat_coverage_cutoff', type=int, required=True, help='Genes with an mRNA covered by this percentage of repeats will be excluded' )
    parser.add_argument('-o', '--output_gff3', type=str, required=False, help='Path to GFF3 output file to be created')
    parser.add_argument('-r', '--removed_gff3', type=str, required=False, help='If passed, writes the deleted genes to this file')
    args = parser.parse_args()

    # The option is declared optional but the kept genes are always written
    # to it; fail early with a usage message rather than with a TypeError
    # from open(None) after the inputs have been parsed.
    if args.output_gff3 is None:
        parser.error("-o/--output_gff3 must be passed; the retained gene models are written to it")

    (assemblies, features) = gff.get_gff3_features(args.input_gff3)
    utils.add_assembly_fasta(assemblies, args.masked_fasta)

    gene_count = 0
    kept_count = 0

    gff_out = open(args.output_gff3, 'wt')
    rem_out = None

    # try/finally so both GFF3 files are flushed and closed whatever happens
    try:
        gff_out.write("##gff-version 3\n")

        if args.removed_gff3 is not None:
            rem_out = open(args.removed_gff3, 'wt')
            rem_out.write("##gff-version 3\n")

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                keep = True
                gene_count += 1

                for mRNA in gene.mRNAs():
                    coding_seq = mRNA.get_CDS_residues()

                    # An mRNA without CDS residues has nothing masked; skipping it
                    # also avoids dividing by a zero length below.
                    if len(coding_seq) == 0:
                        continue

                    n_count = coding_seq.count('N')
                    perc_repeat = (n_count / len(coding_seq)) * 100

                    if perc_repeat >= args.percent_repeat_coverage_cutoff:
                        keep = False

                if keep:
                    kept_count += 1
                    gene.print_as(fh=gff_out, source='IGS', format='gff3')
                elif rem_out is not None:
                    gene.print_as(fh=rem_out, source='IGS', format='gff3')
    finally:
        gff_out.close()
        if rem_out is not None:
            rem_out.close()

    # Guard the percentage against an input with no genes at all
    perc_kept = (kept_count / gene_count) * 100 if gene_count > 0 else 0.0
    print("INFO: {0} genes kept out of {1} ({2:.1f}%)".format(kept_count, gene_count, perc_kept))
Example #4
0
def main():
    """Merge gene models with optional Barrnap/ARAGORN predictions into one GFF3.

    Loads the pass-through model GFF, attaches the genomic FASTA, lets
    add_barrnap_features() / add_aragorn_features() add their features when
    the corresponding files were passed, and prints the combined assemblies
    as GFF3 to the output file, or to STDOUT when -o is omitted.
    """
    # Function-scope import: needed for the documented STDOUT default below
    import sys

    parser = argparse.ArgumentParser( description='Creates a single GFF from the output of a few different model prediction tools (coding and non-coding)')

    ## command-line options
    parser.add_argument('-m', '--model_gff', type=str, required=True, help='Input (pass-through) GFF file' )
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Output file to be written.  Default=STDOUT' )
    parser.add_argument('-b', '--barrnap_gff', type=str, required=False, help='GFF file from Barrnap prediction' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True, help='Source genomic FASTA file' )
    parser.add_argument('-a', '--aragorn_out', type=str, required=False, help='Raw output file (with -w) from ARAGORN prediction' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.model_gff)
    utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    if args.barrnap_gff:
        add_barrnap_features(assemblies, features, args.barrnap_gff)

    if args.aragorn_out:
        add_aragorn_features(assemblies, features, args.aragorn_out)

    # Honour the advertised default: without -o the GFF3 goes to STDOUT
    # (previously open(None) raised a TypeError).  STDOUT is not closed.
    if args.output_gff is None:
        gff.print_gff3_from_assemblies(ofh=sys.stdout, assemblies=assemblies)
    else:
        with open(args.output_gff, 'wt') as f:
            gff.print_gff3_from_assemblies(ofh=f, assemblies=assemblies)
Example #5
0
def main():
    """Report mRNAs lacking a terminal stop and whether one lies just downstream.

    For every mRNA whose CDS translation does not end in '*', the genomic
    residues following the CDS are scanned codon by codon (at most
    mRNA_extension_limit bases beyond the mRNA boundary) for a stop codon.
    Progress and totals are printed to STDOUT.  This function only reports;
    it does not modify the features, and --output_gff is parsed but not
    used here.
    """
    parser = argparse.ArgumentParser(
        description="Checks the CDS features against a genome sequence to report/correct phase columns."
    )

    ## command-line options
    parser.add_argument("-i", "--input_file", type=str, required=True, help="Path to the input GFF3")
    parser.add_argument(
        "-g",
        "--genome_fasta",
        type=str,
        required=False,
        help="Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF",
    )
    parser.add_argument(
        "-o",
        "--output_gff",
        type=str,
        required=False,
        help="Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop",
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ["TAG", "TAA", "TGA"]
    # The residues below are sliced straight from the assembly, i.e. always in
    # forward-strand orientation.  For a reverse-strand gene a stop codon
    # therefore shows up as its reverse complement.
    revcomp_stop_codons = ["CTA", "TTA", "TCA"]

    mRNA_extension_limit = 100
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith("*"):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax + mRNA_extension_limit
                    stops_on_strand = stop_codons
                else:
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    mRNA_limit = mRNA_loc.fmin - mRNA_extension_limit
                    codon_step_size = -3
                    stops_on_strand = revcomp_stop_codons

                print(
                    "\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos),
                    end="",
                )

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    if (codon_step_size < 0 and CDS_pos < mRNA_limit) or (
                        codon_step_size > 0 and CDS_pos > mRNA_limit
                    ):
                        print(" Reached the mRNA limit")
                        break

                    # A negative slice start would silently wrap around to the
                    # END of the molecule, so stop at position 0.
                    if CDS_pos < 0:
                        print(" Reached the beginning of the molecule")
                        break

                    # Printed as found on the forward strand
                    next_codon = assembly.residues[CDS_pos : CDS_pos + 3]
                    print(".{0}({1})".format(next_codon, CDS_pos), end="")

                    if next_codon in stops_on_strand:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs which can be corrected: {0}".format(mRNAs_corrected))
Example #6
0
def main():
    """Extend mRNAs that lack a terminal stop to the next in-frame stop codon.

    For every mRNA whose CDS translation does not end in '*', the genome is
    scanned codon by codon, starting at the in-frame end of the CDS and going
    at most --extension_limit bases beyond the mRNA boundary.  When a stop
    codon is found mRNA.extend_stop() is called with the new end coordinate
    (presumably this also adjusts the containing features, as the -o help
    text says -- confirm against the library).  Diagnostics and totals are
    printed to STDOUT; the resulting GFF3 is written only when -o is passed.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100, help='Optional.  Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # Bases left over after the last complete codon
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand,
                # because the slice below begins at the codon's lowest
                # coordinate.  On the forward strand CDS_pos already is the
                # start of the first codon to examine; stepping there too
                # would skip that codon (and a stop sitting right after the CDS).
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print("  Reached beginning of the molecule")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                    # The coordinates printed are those of the slice just read
                    if mRNA_loc.strand == -1:
                        next_codon = utils.reverse_complement(next_codon)
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos + 3, CDS_pos), end='')
                    else:
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos + 3), end='')

                    if next_codon in stop_codons:
                        # Include the stop codon itself in the extended model
                        if mRNA_loc.strand == 1:
                            mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                            print(" Found a stop, extending to: {0} ({1})".format(CDS_pos + 3, mRNA_loc.strand))
                        else:
                            mRNA.extend_stop(on=assembly, to=CDS_pos)
                            print(" Found a stop, extending to: {0} ({1})".format(CDS_pos, mRNA_loc.strand))

                        new_stop_found = True
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))


    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # -o is optional: previously open(None) raised a TypeError after all the
    # work had been done.  The context manager also closes the file.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
Example #7
0
def main():
    """Extend mRNAs that lack a terminal stop to the next in-frame stop codon.

    For every mRNA whose CDS translation does not end in '*', the genome is
    scanned codon by codon, starting at the in-frame end of the CDS and going
    at most --extension_limit bases beyond the mRNA boundary.  When a stop
    codon is found mRNA.extend_stop() is called with the new end coordinate
    (presumably this also adjusts the containing features, as the -o help
    text says -- confirm against the library).  Diagnostics and totals are
    printed to STDOUT; the resulting GFF3 is written only when -o is passed.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional.  Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}"
                      .format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # Bases left over after the last complete codon
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(
                        assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(
                        assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print(
                    "\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t"
                    .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                            mRNA_loc.strand),
                    end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand,
                # because the slice below begins at the codon's lowest
                # coordinate.  On the forward strand CDS_pos already is the
                # start of the first codon to examine; stepping there too
                # would skip that codon (and a stop sitting right after the CDS).
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (
                            mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print("  Reached beginning of the molecule")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                    # The coordinates printed are those of the slice just read
                    if mRNA_loc.strand == -1:
                        next_codon = utils.reverse_complement(next_codon)
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos + 3,
                                                     CDS_pos),
                              end='')
                    else:
                        print(".{0}({1}-{2})".format(next_codon, CDS_pos,
                                                     CDS_pos + 3),
                              end='')

                    if next_codon in stop_codons:
                        # Include the stop codon itself in the extended model
                        if mRNA_loc.strand == 1:
                            mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                            print(" Found a stop, extending to: {0} ({1})"
                                  .format(CDS_pos + 3, mRNA_loc.strand))
                        else:
                            mRNA.extend_stop(on=assembly, to=CDS_pos)
                            print(" Found a stop, extending to: {0} ({1})"
                                  .format(CDS_pos, mRNA_loc.strand))

                        new_stop_found = True
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # -o is optional: previously open(None) raised a TypeError after all the
    # work had been done.  The context manager also closes the file.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Export the protein or CDS sequence of every mRNA/polypeptide as FASTA.

    Reads a GFF3 file (plus an optional FASTA with the assembly residues)
    and writes one FASTA record per feature of the selected type to the
    output file, or to STDOUT when -o is omitted.  Records are identified by
    the locus tag when there is one, otherwise by the feature ID, followed by
    the product name when available.  --check_ends and
    --check_internal_stops emit warnings on STDERR.

    Raises:
        Exception: if --check_internal_stops is combined with --type cds.
    """
    parser = argparse.ArgumentParser(
        description='Extracts the protein or CDS sequences from a GFF3 file')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to an input GFF3 file to be read')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        help='Path to an output FASTA file to be created')
    parser.add_argument('-t',
                        '--type',
                        type=str,
                        required=False,
                        default='protein',
                        choices=['protein', 'cds'],
                        help='Type of features to export')
    parser.add_argument(
        '-f',
        '--fasta',
        type=str,
        required=False,
        help=
        'If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option'
    )
    parser.add_argument(
        '-ft',
        '--feature_type',
        type=str,
        required=False,
        default='mRNA',
        choices=['mRNA', 'polypeptide'],
        help='IDs and coordinates will come from this feature type')
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.add_argument('--check_internal_stops',
                        dest='check_internal_stops',
                        action='store_true')
    parser.set_defaults(check_ends=False, check_internal_stops=False)
    args = parser.parse_args()

    # sanity option check -- done before the output file is opened so that an
    # invalid combination does not leave an empty file behind
    if args.check_internal_stops and args.type == 'cds':
        raise Exception(
            "Error:  Checking internal stops for CDS features not currently supported."
        )

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        (assemblies, features) = gff.get_gff3_features(args.input_file)

        # only doing the standard codon table for now
        start_codons = ['ATG', 'GTG', 'TTG']
        stop_codons = ['TAG', 'TAA', 'TGA']

        ## add sequence residues from external FASTA file if the user passed one
        if args.fasta is not None:
            utils.add_assembly_fasta(assemblies, args.fasta)

        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                if args.feature_type == 'mRNA':
                    feats = gene.mRNAs()
                elif args.feature_type == 'polypeptide':
                    feats = gene.polypeptides()

                for feat in feats:
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = feat.id
                    export_header = None

                    ## Add the gene product name if there is one
                    if args.feature_type == 'mRNA':
                        for polypeptide in feat.polypeptides():
                            if polypeptide.annotation is not None:
                                if polypeptide.annotation.product_name is not None:
                                    export_header = polypeptide.annotation.product_name
                                    break

                        coding_seq = feat.get_CDS_residues(for_translation=True)
                        if feat.locus_tag is not None:
                            export_id = feat.locus_tag

                    elif args.feature_type == 'polypeptide':
                        # Same None guard as the mRNA branch: an unannotated
                        # polypeptide is exported without a header
                        if feat.annotation is not None:
                            export_header = feat.annotation.product_name
                        coding_seq = feat.parent.get_CDS_residues(
                            for_translation=True)
                        if feat.parent.locus_tag is not None:
                            export_id = feat.parent.locus_tag

                    if len(coding_seq) == 0:
                        print(
                            "WARNING: Skipped feature {0} because it had no associated CDS features"
                            .format(export_id),
                            file=sys.stderr)
                        continue

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical start codon ({0}) in mRNA {1}\n"
                                .format(start_codon, feat.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write(
                                "WARN: Non-canonical stop codon ({0}) in mRNA {1}\n"
                                .format(stop_codon, feat.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(
                            utils.wrapped_fasta(coding_seq)))
                        continue

                    translated_seq = utils.translate(coding_seq)

                    if args.check_internal_stops:
                        # The last residue is left out of the count so that a
                        # terminal stop is not reported as internal
                        internal_stop_count = translated_seq[:-1].count('*')
                        if internal_stop_count > 0:
                            sys.stderr.write(
                                "Found {0} internal stops in mRNA {1}\n".
                                format(internal_stop_count, feat.id))

                    fout.write("{0}\n".format(
                        utils.wrapped_fasta(translated_seq)))
    finally:
        # Flush and close a real output file; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
Example #9
0
def main():
    """Report mRNAs whose CDS lacks a terminal stop and look for one nearby.

    Reads the GFF3 given by --input_file (plus the optional --genome_fasta),
    translates the CDS of every mRNA and, for each translation that does not
    end in '*', walks the assembly residues one codon at a time past the 3'
    end of the CDS looking for a stop codon.  The walk ends when a stop is
    found or when it passes the mRNA boundary by more than
    ``mRNA_extension_limit`` bases.  Progress and summary counts are printed
    to STDOUT.

    NOTE: --output_gff is accepted on the command line but is never read
    here; this function only reports which mRNAs could be corrected.
    """
    parser = argparse.ArgumentParser(
        description='Checks the CDS features against a genome sequence to report/correct phase columns.')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0

    # The assembly residues are sliced directly (no reverse-complementing), so
    # a stop codon of a reverse-strand gene shows up in that slice as the
    # reverse complement of TAG / TAA / TGA.
    forward_stop_codons = ('TAG', 'TAA', 'TGA')
    reverse_stop_codons = ('CTA', 'TTA', 'TCA')

    mRNA_extension_limit = 100
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                mRNA_loc = mRNA.location_on(assembly)
                CDSs = sorted(mRNA.CDSs())

                # Without any CDS there is no 3' end to extend from; indexing
                # the empty list below would raise IndexError.
                if not CDSs:
                    print("\tNo CDS features found for this mRNA, skipping")
                    continue

                if mRNA_loc.strand == 1:
                    codon_step_size = 3
                    stop_codons = forward_stop_codons
                    CDS_pos = CDSs[-1].location_on(assembly).fmax
                    mRNA_limit = mRNA_loc.fmax + mRNA_extension_limit
                else:
                    codon_step_size = -3
                    stop_codons = reverse_stop_codons
                    CDS_pos = CDSs[0].location_on(assembly).fmin
                    # Clamp at 0: a negative start index in the slice below
                    # would wrap around and read from the end of the sequence.
                    mRNA_limit = max(0, mRNA_loc.fmin - mRNA_extension_limit)

                print("\tmRNA:{0}-{1}, CDS end: {2}\n\tExtending".format(
                    mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                if codon_step_size < 0:
                    CDS_pos += codon_step_size

                while True:
                    past_limit = (CDS_pos < mRNA_limit) if codon_step_size < 0 else (CDS_pos > mRNA_limit)
                    if past_limit:
                        print(" Reached the mRNA limit")
                        break

                    next_codon = assembly.residues[CDS_pos:CDS_pos + 3]
                    print(".{0}({1})".format(next_codon, CDS_pos), end='')

                    # upper() so lower-case (e.g. soft-masked) residues still match
                    if next_codon.upper() in stop_codons:
                        new_stop_found = True
                        print(" Found a stop")
                        break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs which can be corrected: {0}".format(mRNAs_corrected))
Example #10
0
def main():
    """Export CDS or translated protein sequences from a GFF3 file as FASTA.

    For every mRNA (or polypeptide, per --feature_type) under each gene, one
    FASTA record is written to --output_file, or to STDOUT when no path is
    given.  The record ID is the feature's locus tag when available, else
    its ID; the header carries the product name when one is annotated.

    --check_ends warns on STDERR about start/stop codons outside the standard
    lists, and --check_internal_stops reports '*' characters found before the
    last residue of a translation.

    Raises:
        Exception: if --check_internal_stops is combined with --type cds.
    """
    parser = argparse.ArgumentParser( description='Extracts the protein or CDS seqeunces from a GFF3 file')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to an input GFF3 file to be read' )
    parser.add_argument('-o', '--output_file', type=str, required=False, help='Path to an output FASTA file to be created' )
    parser.add_argument('-t', '--type', type=str, required=False, default='protein', choices=['protein', 'cds'], help='Type of features to export')
    parser.add_argument('-f', '--fasta', type=str, required=False, help='If the FASTA entries for the underlying assemblies is absent from the GFF3 document passed, you will need to specify this option' )
    parser.add_argument('-ft', '--feature_type', type=str, required=False, default='mRNA', choices=['mRNA', 'polypeptide'], help='IDs and coordinates will come from this feature type' )
    parser.add_argument('--check_ends', dest='check_ends', action='store_true')
    parser.add_argument('--check_internal_stops', dest='check_internal_stops', action='store_true')
    parser.set_defaults(check_ends=False, check_internal_stops=False)
    args = parser.parse_args()

    # sanity option check -- done before the output file is opened so that a
    #  bad option combination does not create/truncate it
    if args.check_internal_stops and args.type == 'cds':
        raise Exception("Error:  Checking internal stops for CDS features not currently supported.")

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # only doing the standard codon table for now
    start_codons = ['ATG', 'GTG', 'TTG']
    stop_codons  = ['TAG', 'TAA', 'TGA']

    ## add sequence residues from external FASTA file if the user passed one
    if args.fasta is not None:
        utils.add_assembly_fasta(assemblies, args.fasta)

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for assembly_id in assemblies:
            for gene in assemblies[assembly_id].genes():
                if args.feature_type == 'mRNA':
                    feats = gene.mRNAs()
                elif args.feature_type == 'polypeptide':
                    feats = gene.polypeptides()

                for feat in feats:
                    ## initial values of id and header to export (can be overridden by available annotation)
                    export_id = feat.id
                    export_header = None

                    ## Add the gene product name if there is one
                    if args.feature_type == 'mRNA':
                        for polypeptide in feat.polypeptides():
                            if polypeptide.annotation is not None:
                                if polypeptide.annotation.product_name is not None:
                                    export_header = polypeptide.annotation.product_name
                                    break

                        coding_seq = feat.get_CDS_residues(for_translation=True)
                        if feat.locus_tag is not None:
                            export_id = feat.locus_tag

                    elif args.feature_type == 'polypeptide':
                        # same None guard as the mRNA branch: a polypeptide
                        #  without annotation simply gets no header text
                        if feat.annotation is not None:
                            export_header = feat.annotation.product_name

                        coding_seq = feat.parent.get_CDS_residues(for_translation=True)
                        if feat.parent.locus_tag is not None:
                            export_id = feat.parent.locus_tag

                    fout.write(">{0}".format(export_id))
                    if export_header is not None:
                        fout.write(" {0}\n".format(export_header))
                    else:
                        fout.write("\n")

                    if args.check_ends:
                        # check the starting codon
                        start_codon = coding_seq[0:3].upper()
                        if start_codon not in start_codons:
                            sys.stderr.write("WARN: Non-canonical start codon ({0}) in mRNA {1}\n".format(start_codon, feat.id))

                        stop_codon = coding_seq[-3:].upper()
                        if stop_codon not in stop_codons:
                            sys.stderr.write("WARN: Non-canonical stop codon ({0}) in mRNA {1}\n".format(stop_codon, feat.id))

                    if args.type == 'cds':
                        fout.write("{0}\n".format(utils.wrapped_fasta(coding_seq)))
                    else:
                        translated_seq = utils.translate(coding_seq)

                        if args.check_internal_stops:
                            # the final residue is excluded so a terminal stop is not counted
                            internal_stop_count = translated_seq[:-1].count('*')
                            if internal_stop_count > 0:
                                sys.stderr.write("Found {0} internal stops in mRNA {1}\n".format(internal_stop_count, feat.id))

                        fout.write("{0}\n".format(utils.wrapped_fasta(translated_seq)))
    finally:
        # close (and thereby flush) the file we opened; never close STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """Flag gene models as 5' and/or 3' partial based on their CDS ends.

    For every mRNA in the input GFF3 (with residues from the optional
    --genome_fasta), the CDS is translated.  If the translation does not end
    in '*' the 3' end is flagged partial; if the first codon is not one of
    ATG/GTG/TTG the 5' end is flagged partial.  Flagging sets
    ``fmin_partial`` / ``fmax_partial`` (whichever matches the strand) on
    the locations of the gene, the mRNA, the terminal CDS and the terminal
    exon.  Counts are printed to STDOUT and, when --output_gff is given, the
    updated annotation is written there as GFF3.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Path to the input GFF3')
    parser.add_argument('-g', '--genome_fasta', type=str, required=False,
                        help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False,
                        help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    def mark_partial(assembly, gene_loc, mRNA, mRNA_loc, CDSs, at_fmax):
        """Set the fmax (or fmin) partial flag on gene, mRNA, end CDS and end exon."""
        flag = 'fmax_partial' if at_fmax else 'fmin_partial'
        end_idx = -1 if at_fmax else 0

        setattr(mRNA_loc, flag, True)
        setattr(gene_loc, flag, True)
        setattr(CDSs[end_idx].location_on(assembly), flag, True)

        # The exon is tricky, as there's no direct link between the CDS fragment
        #  and the corresponding exon.  The assumption here is that there won't
        #  be terminal non-coding exons if the CDS is partial.
        # NOTE(review): exons are taken in the order mRNA.exons() returns them
        #  (the CDSs are sorted, the exons are not) -- confirm that order.
        exons = mRNA.exons()
        if exons:
            setattr(exons[end_idx].location_on(assembly), flag, True)

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)
                CDSs = sorted(mRNA.CDSs())

                # Nothing to evaluate or flag without a CDS; indexing the
                # empty list would raise IndexError.
                if not CDSs:
                    continue

                on_forward = mRNA_loc.strand == 1

                # Missing terminal stop: the 3' end is fmax on the forward
                # strand and fmin on the reverse strand.
                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    mark_partial(assembly, gene_loc, mRNA, mRNA_loc, CDSs,
                                 at_fmax=on_forward)

                # Unrecognised start codon: the 5' end is the opposite side.
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    mark_partial(assembly, gene_loc, mRNA, mRNA_loc, CDSs,
                                 at_fmax=not on_forward)

    print(
        "Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print(
        "Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # --output_gff is optional; open(None) would raise TypeError, so only
    # write when a path was actually supplied.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
Example #12
0
def main():
    """Mark gene models as 5'/3' partial when their CDS start or stop is missing.

    Each mRNA's CDS is translated; a translation not ending in '*' marks the
    3' end partial, and a first codon outside ATG/GTG/TTG marks the 5' end
    partial.  The matching ``fmin_partial`` / ``fmax_partial`` attribute is
    set on the gene, mRNA, terminal CDS and terminal exon locations.  Totals
    are printed to STDOUT, and the annotation is written as GFF3 to
    --output_gff when that option is supplied.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    ## output file to be written
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)
                CDSs = sorted(mRNA.CDSs())

                # An mRNA without CDS features has no ends to judge, and
                #  CDSs[0] / CDSs[-1] below would raise IndexError.
                if not CDSs:
                    continue

                # The exon is tricky, as there's no direct link between the CDS fragment
                #  and the corresponding exon.  The assumption here is that there won't
                #  be terminal non-coding exons if the CDS is partial.
                # NOTE(review): exons are used in the order mRNA.exons() returns
                #  them, unlike the CDSs which are sorted -- confirm that order.
                exons = mRNA.exons()

                missing_stop = not translation.endswith('*')
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                bad_start = start_codon not in start_codons

                if missing_stop:
                    newly_marked_3prime_partial += 1
                if bad_start:
                    newly_marked_5prime_partial += 1

                # On the forward strand the 3' end is fmax and the 5' end is
                #  fmin; on the reverse strand the two are swapped.
                if mRNA_loc.strand == 1:
                    flag_fmax, flag_fmin = missing_stop, bad_start
                else:
                    flag_fmax, flag_fmin = bad_start, missing_stop

                if flag_fmax:
                    mRNA_loc.fmax_partial = True
                    gene_loc.fmax_partial = True
                    CDSs[-1].location_on(assembly).fmax_partial = True
                    if exons:
                        exons[-1].location_on(assembly).fmax_partial = True

                if flag_fmin:
                    mRNA_loc.fmin_partial = True
                    gene_loc.fmin_partial = True
                    CDSs[0].location_on(assembly).fmin_partial = True
                    if exons:
                        exons[0].location_on(assembly).fmin_partial = True

    print ("Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print ("Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # --output_gff is declared optional, so skip the write (rather than
    #  failing on open(None)) when it was not supplied.
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)