def main():
    """Tag reference genes with the IDs of the query genes they overlap.

    Loads the reference and query GFF3 files.  For every reference gene that
    has at least one polypeptide, each query gene on the same assembly for
    which ``ref_gene.overlaps_with(qry_gene)`` is truthy adds a dbxref of the
    form ``overlaps_old_locusTagID:<query gene id>`` to the annotation of the
    reference gene's first polypeptide.  The reference assemblies are then
    written as GFF3 to ``--output_file``.

    Reference assemblies absent from the query set, and reference genes
    without polypeptides, are skipped with a WARN line on STDOUT.
    """
    parser = argparse.ArgumentParser( description='Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies')

    # command-line options (all required)
    parser.add_argument('-r', '--reference_file', type=str, required=True, help='GFF3 file of a reference annotation' )
    parser.add_argument('-q', '--query_file', type=str, required=True, help='GFF3 file with alternative annotation (such as an RNA-seq assemby)' )
    parser.add_argument('-o', '--output_file', type=str, required=True, help='Path to an output file to be created' )
    args = parser.parse_args()

    # only the assemblies are used; the second element of each tuple is ignored
    (ref_assemblies, _) = gff.get_gff3_features(args.reference_file)
    (qry_assemblies, _) = gff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print("WARN: expected to find assembly_id {0} in both reference and query sets".format(assembly_id))
            continue

        qry_assembly = qry_assemblies[assembly_id]

        for ref_gene in ref_assemblies[assembly_id].genes():
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".format(ref_gene.id))
                continue

            # dbxrefs are attached to the first polypeptide's annotation
            ref_annot = polypeptides[0].annotation
            overlap_count = 0

            for qry_gene in qry_assembly.genes():
                if ref_gene.overlaps_with(qry_gene):
                    overlap_count += 1
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(qry_gene.id))

            if overlap_count > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(ref_gene.id, overlap_count))

    # use a context manager so the output is flushed and closed deterministically
    with open(args.output_file, 'w') as ofh:
        gff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
Example #2
0
def main():
    """Merge optional Barrnap/ARAGORN predictions into a model GFF3 and write it.

    Loads the pass-through model GFF3 (``--model_gff``), attaches the genomic
    FASTA (``--genomic_fasta``) to the assemblies, then adds features from
    the Barrnap GFF and/or the ARAGORN raw output when those options are
    given.  The combined assemblies are written as GFF3 to ``--output_gff``,
    or to STDOUT when that option is omitted (as the option's help text
    promises).
    """
    # local import: only needed for the STDOUT fallback below
    import sys

    parser = argparse.ArgumentParser( description='Creates a single GFF from the output of a few different model prediction tools (coding and non-coding)')

    # command-line options
    parser.add_argument('-m', '--model_gff', type=str, required=True, help='Input (pass-through) GFF file' )
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Output file to be written.  Default=STDOUT' )
    parser.add_argument('-b', '--barrnap_gff', type=str, required=False, help='GFF file from Barrnap prediction' )
    parser.add_argument('-g', '--genomic_fasta', type=str, required=True, help='Source genomic FASTA file' )
    parser.add_argument('-a', '--aragorn_out', type=str, required=False, help='Raw output file (with -w) from ARAGORN prediction' )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.model_gff)
    utils.add_assembly_fasta(assemblies, args.genomic_fasta)

    if args.barrnap_gff:
        add_barrnap_features(assemblies, features, args.barrnap_gff)

    if args.aragorn_out:
        add_aragorn_features(assemblies, features, args.aragorn_out)

    if args.output_gff is None:
        # open(None, ...) would raise TypeError; honour the documented default
        gff.print_gff3_from_assemblies(ofh=sys.stdout, assemblies=assemblies)
    else:
        with open(args.output_gff, 'wt') as f:
            gff.print_gff3_from_assemblies(ofh=f, assemblies=assemblies)
Example #3
0
def main():
    """Extend mRNAs whose CDS lacks a terminal stop to the next in-frame stop.

    Loads the input GFF3 (plus an optional genome FASTA).  Each mRNA's CDS
    residues are translated; if the translation already ends with ``*`` the
    mRNA is only counted.  Otherwise the assembly residues are scanned codon
    by codon away from the CDS end (downward in coordinates unless the mRNA
    strand is 1) until one of TAG/TAA/TGA is seen, the scan passes
    ``--extension_limit`` bases beyond the mRNA boundary, or the coordinate
    drops below 1.  When a stop is found ``mRNA.extend_stop`` is called with
    the new end coordinate.

    A verbose trace and summary counts are printed to STDOUT.  The modified
    assemblies are written as GFF3 only when ``--output_gff`` is given.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    # command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100, help='Optional.  Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, _) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    # already terminates in a stop; nothing to extend
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # bases left over after the last complete codon
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # any strand other than 1: the CDS end is the lowest
                    # coordinate, so scan toward the start of the molecule
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # Move one codon in the scan direction before the first check.
                # NOTE(review): this step is applied on both strands, and the
                #  forward-strand trace below prints (CDS_pos - 3, CDS_pos)
                #  while the slice taken is [CDS_pos:CDS_pos + 3] -- confirm
                #  the intended forward-strand starting codon.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        # also keeps the slice start from going negative
                        print("  Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos - 3), end='')
                        else:
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos - 3, CDS_pos), end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                # new end is the far side of the stop codon
                                mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional: open(None, ...) would raise TypeError, so only
    # write when a path was supplied, and close the handle deterministically
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Record, on each reference gene, which query genes overlap it.

    Reads a reference GFF3 and a query GFF3.  For each reference assembly
    that also exists in the query set, every reference gene with at least one
    polypeptide is compared against every query gene on that assembly using
    ``overlaps_with``; each truthy result adds an
    ``overlaps_old_locusTagID:<query gene id>`` dbxref to the annotation of
    the reference gene's first polypeptide.  The reference assemblies are
    finally written as GFF3 to ``--output_file``.

    WARN lines are printed (and the item skipped) for assemblies missing
    from the query set and for genes that have no polypeptides.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script for reporting of possible polycistronic genes transcripts based on a reference annotation and RNA-seq transcript assemblies'
    )

    # command-line options (all required)
    parser.add_argument('-r',
                        '--reference_file',
                        type=str,
                        required=True,
                        help='GFF3 file of a reference annotation')
    parser.add_argument(
        '-q',
        '--query_file',
        type=str,
        required=True,
        help=
        'GFF3 file with alternative annotation (such as an RNA-seq assemby)')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=True,
                        help='Path to an output file to be created')
    args = parser.parse_args()

    # the second tuple element returned by the loader is not needed here
    (ref_assemblies, _) = gff.get_gff3_features(args.reference_file)
    (qry_assemblies, _) = gff.get_gff3_features(args.query_file)

    for assembly_id in ref_assemblies:
        # we expect to find this assembly ID in the qry set too
        if assembly_id not in qry_assemblies:
            print(
                "WARN: expected to find assembly_id {0} in both reference and query sets"
                .format(assembly_id))
            continue

        qry_assembly = qry_assemblies[assembly_id]

        for ref_gene in ref_assemblies[assembly_id].genes():
            polypeptides = ref_gene.polypeptides()

            if len(polypeptides) == 0:
                print("WARN: skipped gene {0} because it has no polypeptides".
                      format(ref_gene.id))
                continue

            # reuse the list fetched above rather than querying the gene again
            ref_annot = polypeptides[0].annotation
            overlap_count = 0

            for qry_gene in qry_assembly.genes():
                if ref_gene.overlaps_with(qry_gene):
                    overlap_count += 1
                    # add a dbxref to the gene
                    ref_annot.add_dbxref("overlaps_old_locusTagID:{0}".format(
                        qry_gene.id))

            if overlap_count > 0:
                print("INFO: ref_gene {0} had {1} overlaps".format(
                    ref_gene.id, overlap_count))

    # context manager guarantees the output file is flushed and closed
    with open(args.output_file, 'w') as ofh:
        gff.print_gff3_from_assemblies(assemblies=ref_assemblies, ofh=ofh)
Example #5
0
def main():
    """Extend mRNAs whose CDS lacks a terminal stop to the next in-frame stop.

    Loads the input GFF3 (plus an optional genome FASTA).  Each mRNA's CDS
    residues are translated; if the translation already ends with ``*`` the
    mRNA is only counted.  Otherwise the assembly residues are scanned one
    codon at a time away from the CDS end (toward lower coordinates unless
    the mRNA strand is 1) until TAG/TAA/TGA is found, the scan passes
    ``--extension_limit`` bases beyond the mRNA boundary, or the coordinate
    drops below 1.  On success ``mRNA.extend_stop`` is called with the new
    end coordinate.

    A verbose trace and summary counts go to STDOUT.  The modified
    assemblies are written as GFF3 only when ``--output_gff`` is supplied.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    # command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional.  Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, _) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))

        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    # already terminates in a stop; nothing to extend
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}"
                      .format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # bases left over after the last complete codon
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(
                        assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # any strand other than 1: the CDS end is the lowest
                    # coordinate, so scan toward the start of the molecule
                    CDS_pos = CDSs[0].location_on(
                        assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print(
                    "\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t"
                    .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                            mRNA_loc.strand),
                    end='')

                new_stop_found = False

                # Move one codon in the scan direction before the first check.
                # NOTE(review): this step is applied on both strands, and the
                #  forward-strand trace below prints (CDS_pos - 3, CDS_pos)
                #  while the slice taken is [CDS_pos:CDS_pos + 3] -- confirm
                #  the intended forward-strand starting codon.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (
                            mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        # also keeps the slice start from going negative
                        print("  Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos, CDS_pos - 3),
                                  end='')
                        else:
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos - 3, CDS_pos),
                                  end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                # new end is the far side of the stop codon
                                mRNA.extend_stop(on=assembly,
                                                 to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})"
                                      .format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})"
                                      .format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional: open(None, ...) would raise TypeError, so only
    # write when a path was supplied, and close the handle deterministically
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def main():
    """Flag gene models as 5' and/or 3' partial based on their CDS sequence.

    Loads the input GFF3 (plus an optional genome FASTA).  For each mRNA:

    * if the translation of its CDS residues does not end with ``*`` it is
      treated as 3' partial;
    * if the first three CDS residues (upper-cased, U replaced by T) are not
      one of ATG/GTG/TTG it is treated as 5' partial.

    The matching ``fmin_partial`` / ``fmax_partial`` attribute is set to True
    on the locations of the mRNA, its gene, the terminal CDS (after sorting)
    and the first or last element of ``mRNA.exons()``.  Which end is flagged
    depends on whether the mRNA strand is 1.

    Summary counts are printed to STDOUT.  The assemblies are written as GFF3
    only when ``--output_gff`` is supplied.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    # command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    args = parser.parse_args()

    (assemblies, _) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    # NOTE: both counters are incremented once per mRNA, although the summary
    # messages below say "Genes"
    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)

                # --- 3' end: no terminal stop in the translation ---
                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmax_partial = True
                        CDSs[-1].location_on(assembly).fmax_partial = True
                        gene_loc.fmax_partial = True

                        # The exon is tricky, as there's no direct link between the CDS fragment
                        #  and the corresponding exon.  The assumption here is that there won't
                        #  be terminal non-coding exons if the CDS is partial.
                        # NOTE(review): exons() is indexed without sorting, unlike
                        #  the CDS list -- confirm it is returned in coordinate order.
                        mRNA.exons()[-1].location_on(
                            assembly).fmax_partial = True

                    else:
                        # strand is not 1: the 3' end is the fmin side
                        mRNA_loc.fmin_partial = True
                        gene_loc.fmin_partial = True
                        CDSs[0].location_on(assembly).fmin_partial = True
                        mRNA.exons()[0].location_on(
                            assembly).fmin_partial = True

                # --- 5' end: CDS does not begin with an accepted start codon ---
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmin_partial = True
                        CDSs[0].location_on(assembly).fmin_partial = True
                        gene_loc.fmin_partial = True

                        # Same terminal-exon assumption as for the 3' end above.
                        mRNA.exons()[0].location_on(
                            assembly).fmin_partial = True

                    else:
                        # strand is not 1: the 5' end is the fmax side
                        mRNA_loc.fmax_partial = True
                        gene_loc.fmax_partial = True
                        CDSs[-1].location_on(assembly).fmax_partial = True
                        mRNA.exons()[-1].location_on(
                            assembly).fmax_partial = True

    print(
        "Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print(
        "Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # --output_gff is optional: open(None, ...) would raise TypeError, so only
    # write when a path was supplied, and close the handle deterministically
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
Example #7
0
def main():
    """Flag gene models as 5' and/or 3' partial based on their CDS sequence.

    Loads the input GFF3 (plus an optional genome FASTA).  For each mRNA:

    * if the translation of its CDS residues does not end with ``*`` it is
      treated as 3' partial;
    * if the first three CDS residues (upper-cased, U replaced by T) are not
      one of ATG/GTG/TTG it is treated as 5' partial.

    The matching ``fmin_partial`` / ``fmax_partial`` attribute is set to True
    on the locations of the mRNA, its gene, the terminal CDS (after sorting)
    and the first or last element of ``mRNA.exons()``.  Which end is flagged
    depends on whether the mRNA strand is 1.

    Summary counts are printed to STDOUT.  The assemblies are written as GFF3
    only when ``--output_gff`` is supplied.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    # command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    args = parser.parse_args()

    (assemblies, _) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    start_codons = ['ATG', 'GTG', 'TTG']

    # NOTE: both counters are incremented once per mRNA, although the summary
    # messages below say "Genes"
    newly_marked_5prime_partial = 0
    newly_marked_3prime_partial = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]

        for gene in sorted(assembly.genes()):
            gene_loc = gene.location_on(assembly)

            for mRNA in gene.mRNAs():
                mRNA_loc = mRNA.location_on(assembly)
                coding_seq = mRNA.get_CDS_residues()
                translation = utils.translate(coding_seq)

                # --- 3' end: no terminal stop in the translation ---
                if not translation.endswith('*'):
                    newly_marked_3prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmax_partial = True
                        CDSs[-1].location_on(assembly).fmax_partial = True
                        gene_loc.fmax_partial = True

                        # The exon is tricky, as there's no direct link between the CDS fragment
                        #  and the corresponding exon.  The assumption here is that there won't
                        #  be terminal non-coding exons if the CDS is partial.
                        # NOTE(review): exons() is indexed without sorting, unlike
                        #  the CDS list -- confirm it is returned in coordinate order.
                        mRNA.exons()[-1].location_on(assembly).fmax_partial = True

                    else:
                        # strand is not 1: the 3' end is the fmin side
                        mRNA_loc.fmin_partial = True
                        gene_loc.fmin_partial = True
                        CDSs[0].location_on(assembly).fmin_partial = True
                        mRNA.exons()[0].location_on(assembly).fmin_partial = True

                # --- 5' end: CDS does not begin with an accepted start codon ---
                start_codon = coding_seq[0:3].upper().replace('U', 'T')
                if start_codon not in start_codons:
                    newly_marked_5prime_partial += 1
                    CDSs = sorted(mRNA.CDSs())

                    if mRNA_loc.strand == 1:
                        mRNA_loc.fmin_partial = True
                        CDSs[0].location_on(assembly).fmin_partial = True
                        gene_loc.fmin_partial = True

                        # Same terminal-exon assumption as for the 3' end above.
                        mRNA.exons()[0].location_on(assembly).fmin_partial = True

                    else:
                        # strand is not 1: the 5' end is the fmax side
                        mRNA_loc.fmax_partial = True
                        gene_loc.fmax_partial = True
                        CDSs[-1].location_on(assembly).fmax_partial = True
                        mRNA.exons()[-1].location_on(assembly).fmax_partial = True

    print ("Genes marked as 5' partial: {0}".format(newly_marked_5prime_partial))
    print ("Genes marked as 3' partial: {0}".format(newly_marked_3prime_partial))

    # --output_gff is optional: open(None, ...) would raise TypeError, so only
    # write when a path was supplied, and close the handle deterministically
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)