Python create_gtf_object Beispiele

Programmiersprache: Python

Namespace / Paketname: lib.parsing.gtf_object_tools

Methode / Funktion: create_gtf_object

Beispiele auf hotexamples.com: 6

Python create_gtf_object - 6 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die lib.parsing.gtf_object_tools.create_gtf_object, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Beispiel #1

Datei anzeigen

Datei: transfix_tools.py Projekt: anonconda/TranSuite

def remove_transcripts_without_cds(gtf_file, outfolder):
    """Write a copy of *gtf_file* without the transcripts that lack a CDS.

    Transcripts are split into two groups according to whether their entry
    in ``gtf_obj.trans_cds_dt`` is empty.  Every row of the input file whose
    ``transcript_id`` attribute belongs to a transcript without CDS is
    dropped; all other rows (including rows with no ``transcript_id``) are
    copied unchanged.

    Args:
        gtf_file: Path of the GTF annotation to filter.
        outfolder: Directory in which the filtered file is written.

    Returns:
        A tuple ``(gtf_path, trans_with_cds, trans_without_cds)`` with the
        path of the filtered file and the two sets of transcript IDs.
    """
    print(time.asctime(),
          "Removing invalid transcripts models from annotation file")

    gtf_obj = create_gtf_object(gtf_file)
    trans_with_cds, trans_without_cds = set(), set()
    for trans, trans_cds in gtf_obj.trans_cds_dt.items():
        if trans_cds:
            trans_with_cds.add(trans)
        else:
            trans_without_cds.add(trans)

    # Keep only the rows that do not belong to a CDS-less transcript
    filtered_rows = []
    with open(gtf_file) as fh:
        for line in fh:
            row = line.strip('\n').split('\t')
            # The attributes are the last column of a GTF row
            trans_id = row[-1].split("transcript_id \"")[-1].split("\";")[0]
            if trans_id not in trans_without_cds:
                filtered_rows.append(row)

    # Adding the ".transfix.temp." to the name assure that this file will be removed with the other temporary files
    gtf_name = os.path.basename(gtf_file).replace(".gtf", ".transfix.temp.gtf")
    gtf_path = os.path.join(outfolder, gtf_name)

    # Write the filtered rows (the unfiltered rows were written before, which
    # left the CDS-less transcripts in the "filtered" annotation)
    with open(gtf_path, "w+") as fh:
        for row in filtered_rows:
            fh.write("\t".join(row) + "\n")

    return gtf_path, trans_with_cds, trans_without_cds

Beispiel #2

Datei anzeigen

Datei: transfeat_report.py Projekt: anonconda/TranSuite

def generate_transfeat_summary(gtf_file, transfeat_table):
    """Build and write the summary tables for a TransFeat results table.

    Args:
        gtf_file: Path of the GTF annotation the table was generated from.
        transfeat_table: Path of the TransFeat ``.csv`` table; the summary
            tables are written next to it.

    Raises:
        SystemExit: If either input path is not an existing file.
    """
    # Stop at the first input that is missing (annotation is checked first)
    missing = next(
        (path for path in (gtf_file, transfeat_table)
         if not os.path.isfile(path)),
        None,
    )
    if missing is not None:
        sys.exit(f"File {missing} does not exist.")

    # The summary tables share the folder and base name of the input table
    outpath, table_name = os.path.split(transfeat_table)
    outname = table_name.replace(".csv", "")

    print("\n")
    print(time.asctime(),
          f'Generating summary of TransFeat results ({transfeat_table})',
          flush=True)

    # Transcriptome information, then the per-transcript TransFeat data
    annotation = create_gtf_object(gtf_file)
    trans_by_feature_dt, coding_categories_dt = get_transfeat_data(
        transfeat_table)

    # Group the models by gene category (Mono-exonic / Intron-containing genes, etc)
    # and count the transcripts of each category
    models_by_categories_dt = group_models_into_categories(
        annotation, coding_categories_dt)
    categories_dt = get_categories_numbers(models_by_categories_dt,
                                           trans_by_feature_dt)

    write_transfeat_summary_tables(categories_dt, outpath, outname)

Beispiel #3

Datei anzeigen

def transfix_main(gtf_file, fasta, outpath, outname, iter_th=5, chimeric=None):
    """Run the TransFix analysis and write the re-annotated outputs.

    Transcripts without a CDS are first removed from the annotation.  The
    start codon of the remaining transcripts is then fixed in cycles until
    the annotation produced by a cycle is identical to its input, or until
    ``iter_th`` cycles have run.  Finally the re-annotated GTF, the
    nucleotide/peptide FASTA files and the TransFix tables are written into
    ``<outpath>/<outname>_transfix`` and the temporary files are deleted.

    Args:
        gtf_file: Path of the input GTF annotation.
        fasta: Path of the FASTA file with the transcript sequences.
        outpath: Directory in which the output folder is created.
        outname: Base name of the outputs; a ``.gtf`` extension is removed.
        iter_th: Maximum number of fixing cycles, ``0 < iter_th <= 5``.
        chimeric: Optional table of chimeric models; when given it is passed
            to ``fix_chimeric_start_codon`` to correct those models.

    Returns:
        The output file reported by ``annotate_cds_into_gtf``.

    Raises:
        SystemExit: If ``iter_th`` is not within ``(0, 5]``.
    """
    print("\n")
    print(time.asctime(), "Starting TransFix analysis")

    if not 0 < iter_th <= 5:
        sys.exit(f"ERROR: The number of iterations must be within 0 and 5, not {iter_th}.")

    # If iter_th value is 0, do iterations indefinitely. Disabled for now due to unexpected bug for indefinite iteration
    # (the validation above rejects 0, so the limit is currently always enforced)
    check_iter = bool(iter_th)

    # In case the user pass the name with a file extension, remove it
    if outname.endswith(".gtf"):
        outname = outname.replace(".gtf", "")
    outname += "_transfix"

    # Create output folder
    outfolder = os.path.join(outpath, outname)
    if not os.path.isdir(outfolder):
        os.makedirs(outfolder)

    # Classification categories
    cat_dt = defaultdict(set)

    # Track the fixed start-codon in the 1st fixing cycle to correct chimeric models
    gene_atg_pos_dt = defaultdict(list)

    # Assorted dictionaries to track features
    trans_cds_dt = {}  # To save the CDS coordinates of the translations
    trans_cds_seq_dt, trans_header_dt = {}, {}  # To save the CDS sequences to write them into nucl/peptide fasta files
    cycle_trans_dt = defaultdict(set)  # To track the processed transcripts at each cycle

    # Get transcriptome information
    gtf_obj = create_gtf_object(gtf_file)

    # Remove transcripts that do NOT have an annotated CDS
    # This method introduce the tag ".transfix.temp." that marks the temporary files to be removed further downstream
    gtf_known_cds, trans_with_cds, trans_without_cds = remove_transcripts_without_cds(gtf_file, outfolder)

    cat_dt["cds_not_found"].update(trans_without_cds)  # Track those models without CDS

    # Assign the file with transcripts containing CDS as the starting GTF file
    gtf_1 = gtf_known_cds

    # Create empty file to make first files comparison
    gtf_2 = os.path.join(outfolder, "empty.transfix.temp.gtf")
    with open(gtf_2, "w+"):
        pass

    # Upload transcripts sequences from fasta file
    trans_sequences_dt = get_fasta_sequences(fasta)

    # Categories and per-transcript outputs merged from every gene group
    category_keys = (
        "cds_not_found", "seq_not_present", "atg_not_in_cds", "start_codon_not_atg",
        "cds_not_found_lines", "seq_not_present_lines", "atg_not_in_cds_lines", "start_codon_not_atg_lines",
        "rejected_start_codons", "processed_transcripts",
    )

    # Stays None when no cycle runs (e.g. the starting annotation is empty),
    # so the clean-up below never reads an unbound name
    gff3_file = None

    # Variables to define a maximum number of iterations
    i = 0
    # The iteration will stop either when the previous output is the same as the current one, or the iter limit is reach
    # shallow=False forces a content comparison; the default only compares the os.stat() signatures
    while not filecmp.cmp(gtf_1, gtf_2, shallow=False):

        # This check is done before increasing the counter because
        # if the user specify the value as 1, the clearer meaning of this is to stop after completing 1 iteration
        if check_iter and i >= iter_th:
            break
        i += 1

        print("\n")
        print(time.asctime(), "Iteration number {}:".format(i))
        print(time.asctime(), "Processing annotation file: ", gtf_1)

        print(time.asctime(), "Loading transcriptome information")
        locus_dict = load.get_models_gtf(gtf_1)

        print(time.asctime(), "Converting transcriptome information into GFF3 format")
        gff3_file = load.gtf_to_gff3(gtf_1)

        print(time.asctime(), "Loading GFF3 information")
        gff3_models = load.get_models_gff3(gff3_file)

        print(time.asctime(), "Selecting Genes with AUG start codon")
        _, _, gff3_models = characterise_max_orfs(gff3_models)

        print(time.asctime(), "Fixing transcripts start codon")

        n_genes = len(locus_dict.keys())
        for z, locus_id in enumerate(sorted(locus_dict.keys())):

            print(f'Processing Gene {locus_id}, {(z/n_genes)*100:.1f}% complete ({z+1}/{n_genes})')

            if locus_id not in gff3_models:
                cat_dt["absent_gff3"].add(locus_id)
                continue

            if gff3_models[locus_id].transposon is True:
                cat_dt["retro_transposons"].add(locus_id)
                continue

            # Start CDS
            atg_pos = gff3_models[locus_id].rep_atg

            # Keep only the ATG positions found in the first cycle of fixing
            if not gene_atg_pos_dt[locus_id]:
                gene_atg_pos_dt[locus_id].append(atg_pos)

            trans_data_dt = get_transcript_data_from_gff_obj(locus_id, locus_dict, trans_sequences_dt)

            # Fix the start-codon position for the Gene group
            grp_output_dt, grp_cat_dt = fix_atg_position(trans_data_dt, atg_pos)

            # Update the transcripts categories
            for key in category_keys:
                cat_dt[key].update(grp_cat_dt[key])

            trans_cds_dt.update(grp_output_dt["trans_cds_dt"])
            trans_cds_seq_dt.update(grp_output_dt["trans_cds_seq_dt"])
            trans_header_dt.update(grp_output_dt["trans_header_dt"])

        # Remove transcripts processed in current iteration
        cat_dt["cds_not_found"] -= cat_dt["atg_not_in_cds"] | cat_dt["processed_transcripts"]
        cat_dt["atg_not_in_cds"] -= cat_dt["processed_transcripts"]
        cat_dt["start_codon_not_atg"] -= cat_dt["atg_not_in_cds"] | cat_dt["cds_not_found"] | cat_dt["processed_transcripts"]

        # Filter processed transcripts
        new_gtf = load.filter_gtf(gtf_1, cat_dt["processed_transcripts"], cat_dt["rejected_start_codons"], i, outfolder)
        gtf_1, gtf_2 = new_gtf, gtf_1

        # Track the processed transcripts at each translation cycle
        cycle = f"Cycle_{i}"
        cycle_trans_dt[cycle].update(cat_dt["processed_transcripts"])
    print("\n")

    # Track transcripts that were not processed in the analysis
    removed_st = cat_dt["processed_transcripts"] | cat_dt["cds_not_found"] | cat_dt["atg_not_in_cds"] | \
                 cat_dt["start_codon_not_atg"] | cat_dt["seq_not_present"]

    cat_dt["unprocessed_transcripts"] = trans_with_cds - removed_st
    cat_dt["unprocessed_transcripts_lines"] = sorted(cat_dt["unprocessed_transcripts"])

    # If a table specifying chimeric models is reported by the user, then TransFix can correct the ATG of these models
    if chimeric:
        chimeric_output_dt = fix_chimeric_start_codon(gtf_obj, chimeric, trans_cds_dt, trans_sequences_dt)

        trans_cds_dt.update(chimeric_output_dt["trans_cds_dt"])
        trans_cds_seq_dt.update(chimeric_output_dt["trans_cds_seq_dt"])
        trans_header_dt.update(chimeric_output_dt["trans_header_dt"])

    # Write annotation file with re-annotated CDS
    outfile = os.path.join(outfolder, outname + ".gtf")
    outfile = annotate_cds_into_gtf(gtf_obj, trans_cds_dt, outfile)

    # Write output fasta files
    outfile_fasta = os.path.join(outfolder, outname + "_nuc.fasta")
    write_fasta_file(trans_cds_seq_dt, outfile_fasta, trans_header_dt)

    trans_pep_dt = {}
    for trans, trans_seq in trans_cds_seq_dt.items():
        trans_pep_dt[trans] = trans_seq.translate(to_stop=True)

    outfile_fasta = os.path.join(outfolder, outname + "_pep.fasta")
    write_fasta_file(trans_pep_dt, outfile_fasta, trans_header_dt)

    # Write TransFix related tables
    write_transfix_tables(gtf_obj, cat_dt, cycle_trans_dt, trans_cds_dt, outfolder, outname)

    print(time.asctime(), "Removing temporary files")
    for path, _dirs, files in os.walk(outfolder):
        for fname in files:
            # "empty.transfix.temp.gtf" also carries the temporary-file tag
            if "transfix.temp." in fname or fname == gff3_file:
                # os.walk recurses, so the file lives in `path`, which is not always `outfolder`
                os.remove(os.path.join(path, fname))

    # Return output file for TransAll function
    return outfile

Beispiel #4

Datei anzeigen

def add_features_to_gtf(gtf_file):
    """Rewrite *gtf_file* in place with exon rows plus re-generated feature rows.

    For every transcript indexed by the GTF object only the ``exon`` rows of
    the original file are kept.  ``CDS``, ``five_prime_utr``,
    ``three_prime_utr``, ``start_codon`` and ``stop_codon`` rows are then
    generated from the coordinates stored in the GTF object, re-using the
    seqname, source, strand, frame and attributes of the transcript's first
    row.  Rows are written sorted by gene ID, transcript ID and start
    coordinate.  Gene and transcript rows are intentionally not emitted
    (transcripts visualization on IGV is better without them).

    Args:
        gtf_file: Path of the GTF file to rewrite; it is overwritten.

    Returns:
        The same ``gtf_file`` path.
    """
    gtf_obj = create_gtf_object(gtf_file)

    # Feature tag -> coordinates of that feature for each transcript
    features = [
        ("CDS", gtf_obj.trans_cds_dt),
        ("five_prime_utr", gtf_obj.trans_5utr_dt),
        ("three_prime_utr", gtf_obj.trans_3utr_dt),
        ("start_codon", gtf_obj.trans_start_codon),
        ("stop_codon", gtf_obj.trans_stop_codon),
    ]
    # The value of "start_codon", "stop_codon" is a tuple; thus, it must be converted to a list
    single_coord_tags = {"start_codon", "stop_codon"}
    score = "."

    # This function overwrites its input, so drop any lines linecache still
    # holds from a previous version of the file
    linecache.checkcache(gtf_obj.gtf_path)

    transcripts_lines_dt = defaultdict(list)
    for trans, lines_ix_list in gtf_obj.trans_gtf_lines_index.items():

        line_1 = linecache.getline(gtf_obj.gtf_path, lines_ix_list[0])
        seqname, source, _, _, _, _, strand, frame, attr = line_1.strip(
            '\n').split('\t')

        trans_lines = transcripts_lines_dt[trans]
        seen_lines = set(trans_lines)

        for line_ix in lines_ix_list:
            # Strip before the duplicate check: the stored lines have no trailing newline
            line = linecache.getline(gtf_obj.gtf_path, line_ix).strip('\n')

            # Important! Ignore any line that is not an exon coordinate so as the CDS re-annotation is completely new
            # Other features will be re-added by the function write_gtf_with_features further downstream
            _, _, line_feature, *_ = line.split('\t')
            if line_feature != "exon":
                continue

            if line not in seen_lines:
                seen_lines.add(line)
                trans_lines.append(line)

        for feature_tag, feature_dt in features:
            try:
                coord_list = feature_dt[trans]
                if feature_tag in single_coord_tags:
                    coord_list = [coord_list]

                for coord in coord_list:
                    start, end = sorted(coord)
                    line = f"{seqname}\t{source}\t{feature_tag}\t{start}\t{end}\t{score}\t{strand}\t{frame}\t{attr}"
                    if line not in seen_lines:
                        seen_lines.add(line)
                        trans_lines.append(line)
            except (KeyError, TypeError, ValueError):
                # Feature absent or without usable coordinates for this transcript
                continue

    gtf_lines = [
        line.strip('\n')
        for trans_lines in transcripts_lines_dt.values()
        for line in trans_lines
    ]

    sorted_lines = sorted(gtf_lines,
                          key=lambda l:
                          (get_id(l, 'gene_id'), get_id(l, 'transcript_id'),
                           int(l.split('\t')[3])))

    # Write output file, overwrite input file
    with open(gtf_file, "w+") as fh:
        for line in sorted_lines:
            fh.write(line + '\n')

    return gtf_file

Beispiel #5

Datei anzeigen

def _transcript_csv_row(gtf_obj, trans):
    """Return the "Chromosome,Strand,Gene_ID,Transcript_ID" CSV row of *trans*."""
    # The last character of the stored chromosome value is dropped
    # (presumably a strand suffix -- TODO confirm against the GTF object)
    return f"{gtf_obj.trans_chrom_dt[trans][:-1]},{gtf_obj.trans_sense_dt[trans]}," \
           f"{gtf_obj.trans_gene_dt[trans]},{trans}\n"


def findlorf_main(gtf_file,
                  fasta_file,
                  outpath,
                  outname,
                  cds_th=50,
                  filter_gtf=True):
    """Run the FindLORF analysis: annotate the longest ORF of each transcript as CDS.

    The outputs are written into ``<outpath>/<outname>_transfind``: the
    re-annotated GTF, nucleotide and peptide FASTA files, and tables listing
    the transcripts whose sequence is missing from the FASTA file, whose CDS
    is too short, or for which no CDS was found.

    Args:
        gtf_file: Path of the input GTF annotation.
        fasta_file: Path of the FASTA file with the transcript sequences.
        outpath: Directory in which the output folder is created.
        outname: Base name of the outputs; a ``.gtf`` extension is removed.
        cds_th: Minimum CDS length in amino acids (converted to bp internally).
        filter_gtf: When true the annotation is passed through
            ``filter_gtf_file`` before being loaded.

    Returns:
        A tuple ``(outfile, orf_index_file)`` with the re-annotated GTF
        reported by ``annotate_cds_into_gtf`` and the ORF index file.
    """
    print("\n")
    print(time.asctime(), "Starting FindLORF analysis")

    # In case the user pass the name with a file extension, remove it
    if outname.endswith(".gtf"):
        outname = outname.replace(".gtf", "")
    outname += "_transfind"

    # Create output folder
    outfolder = os.path.join(outpath, outname)
    if not os.path.isdir(outfolder):
        os.makedirs(outfolder)

    # Convert CDS length from AA to bp
    cds_th = cds_th * 3

    # Generate a GTF file only with "valid" fields (only rows containing "exon" coord, and with known (+ or -) strand)
    if filter_gtf:
        gtf_file_filtered = filter_gtf_file(gtf_file)
    else:
        gtf_file_filtered = gtf_file

    # Get information from annotation file
    gtf_obj = create_gtf_object(gtf_file_filtered)

    # Get transcripts nucleotide sequence
    sequences_dt = get_fasta_sequences(fasta_file)

    # Get ORF information of the transcripts
    # NOTE(review): the unfiltered gtf_file is passed here while gtf_obj comes from the filtered file -- confirm intended
    orf_data_dt, orf_index_file = find_transcripts_orf_information(
        gtf_file, sequences_dt, gtf_obj, outfolder)

    trans_cds_dt, cds_not_found_trans, short_cds_trans = \
        assign_longest_orf_as_cds(gtf_obj, sequences_dt, orf_data_dt, cds_th)

    print(time.asctime(), "Writing re-annotated transcriptome annotation")
    outfile = os.path.join(outfolder, f"{outname}.gtf")

    # Annotate the identified/selected CDS into the output file
    outfile = annotate_cds_into_gtf(gtf_obj, trans_cds_dt, outfile)

    print(time.asctime(), "Generating headers for output fasta files")
    trans_header_dt, cds_seq_dt, pep_seq_dt = {}, {}, {}
    # orf_data structure: (orf_start, orf_end, frame, strand, cds_start, cds_end, cds_seq, pep_seq)
    for trans, orf_data in orf_data_dt.items():
        trans_cds_seq = orf_data[6]

        # Ignore transcripts without sequences
        if not trans_cds_seq:
            continue

        cds_seq_dt[trans] = str(trans_cds_seq)
        pep_seq_dt[trans] = str(trans_cds_seq.translate())

        trans_chrom = gtf_obj.trans_chrom_dt[trans][:-1]

        try:
            trans_cds = trans_cds_dt[trans]
            span_start, span_end = trans_cds[0][0], trans_cds[-1][-1]
        except (KeyError, IndexError):
            # No (or empty) CDS annotation: report the exon span instead
            trans_exons = gtf_obj.trans_exons_dt[trans]
            span_start, span_end = trans_exons[0][0], trans_exons[-1][-1]

        trans_header_dt[trans] = \
            f">{trans} | {gtf_obj.trans_gene_dt[trans]} | {trans_chrom}:{span_start}-{span_end}"

    nucl_fasta = os.path.join(outfolder, f"{outname}_nuc.fasta")
    write_fasta_file(cds_seq_dt, nucl_fasta, trans_header_dt)

    pep_fasta = os.path.join(outfolder, f"{outname}_pep.fasta")
    write_fasta_file(pep_seq_dt, pep_fasta, trans_header_dt)

    print(time.asctime(), "Writing output-related tables")
    table_header = "Chromosome,Strand,Gene_ID,Transcript_ID\n"

    # Identify transcripts in the GTF file that are not present in the FASTA file
    fasta_trans = set(sequences_dt.keys())
    trans_seq_absent = {
        trans for trans in gtf_obj.trans_exons_dt.keys()
        if trans not in fasta_trans
    }

    if trans_seq_absent:
        absent_outfile = os.path.join(outfolder,
                                      outname + "_sequence_not_found.csv")
        with open(absent_outfile, "w+") as fh:
            fh.write(table_header)
            # Rows are comma-separated to match the header and the other tables
            for line in sorted(_transcript_csv_row(gtf_obj, trans)
                               for trans in trans_seq_absent):
                fh.write(line)

    # Keep track of removed transcripts due to short CDS
    if short_cds_trans:
        short_cds_table = os.path.join(outfolder,
                                       outname + "_short_CDS_transcripts.csv")
        with open(short_cds_table, "w+") as fh:
            fh.write(table_header)
            for trans in sorted(short_cds_trans):
                fh.write(_transcript_csv_row(gtf_obj, trans))

    # Keep track of the transcript for which a CDS was not found
    if cds_not_found_trans:
        no_cds_table = os.path.join(outfolder,
                                    outname + "_no_CDS_transcripts.csv")
        with open(no_cds_table, "w+") as fh:
            fh.write(table_header)
            for trans in sorted(cds_not_found_trans):
                fh.write(_transcript_csv_row(gtf_obj, trans))

    # Return output file for TransAll function
    return outfile, orf_index_file

Beispiel #6

Datei anzeigen

def transfeat_main(gtf,
                   fasta,
                   outpath,
                   outname,
                   pep_len=50,
                   ptc_len=70,
                   uorf_len=10,
                   sj_dist=50,
                   utr3_len=350,
                   orf_index=None):
    """Run the TransFeat analysis and write its table and FASTA outputs.

    The outputs are written into ``<outpath>/<outname>_transfeat``.

    Args:
        gtf: Path of the GTF annotation to characterise.
        fasta: Path of the FASTA file with the transcript sequences.
        outpath: Directory in which the output folder is created.
        outname: Base name of the outputs; a ``.gtf`` extension is removed.
        pep_len: Peptide length threshold passed to ``is_orf_short`` (one
            residue is added to account for the stop codon).
        ptc_len: Threshold passed to ``is_ptc``.
        uorf_len: Threshold passed to ``identify_uorf``.
        sj_dist: Distance threshold passed to ``is_nmd`` as ``sj_dist_th``.
        utr3_len: Threshold passed to ``is_long_3utr``.
        orf_index: Optional path of an existing ORF index (JSON) file; when
            omitted the index is generated first.

    Returns:
        The TransFeat table reported by ``write_transfeat_table``.

    Raises:
        SystemExit: If the ORF index holds no information.
    """
    print("\n")
    print(time.asctime(), "Starting TransFeat analysis")

    # +1 AA to account for stop codons during the AA length check
    pep_len = pep_len + 1

    # Drop a user-supplied file extension before tagging the output name
    if outname.endswith(".gtf"):
        outname = outname.replace(".gtf", "")
    outname = outname + "_transfeat"

    # Output folder
    outfolder = os.path.join(outpath, outname)
    os.makedirs(outfolder, exist_ok=True)

    # Transcriptome annotation and transcript sequences
    gtf_obj = create_gtf_object(gtf)
    trans_seq_dt = get_fasta_sequences(fasta)

    # Sequence information of the annotated CDS
    fasta_header_dt, cds_seq_dt, pep_seq_dt = translate_transcript_cds(
        trans_seq_dt, gtf_obj)

    print(time.asctime(), "Retrieving ORF information")
    if not orf_index:
        # No index supplied: generate the ORF index file first
        _, orf_index = find_transcripts_orf_information(
            gtf, trans_seq_dt, gtf_obj, outfolder)

    print(time.asctime(), "Uploading ORF information from ORF index file")
    with open(orf_index) as index_fh:
        orf_dt = json.load(index_fh)

    if not orf_dt:
        sys.exit("No ORF information found.")

    # Start-codon relative position per transcript, authentic stop codon per gene
    relative_start_dt = get_transcript_start_codon_relative_position(gtf_obj)
    auth_stop_dt = get_genes_authentic_stop_codon_position(gtf_obj)

    print(time.asctime(), "Retrieving alternative ORFs information")

    # Transcripts with a long downstream ORF, then those with an upstream ORF
    is_longer_dorf_dt, ldorf_data_dt = identify_longer_dorf(
        gtf_obj, relative_start_dt, orf_dt, trans_seq_dt)
    is_uorf_dt, uorf_data_dt, uorf_categories = identify_uorf(
        gtf_obj, relative_start_dt, orf_dt, trans_seq_dt, uorf_len)

    print(time.asctime(), "Identifying Non-Coding features")

    # Transcripts without an annotated CDS
    is_orf_absent_dt = is_orf_absent(gtf_obj)

    # Transcripts with "Premature Termination Codons" (PTC)
    is_ptc_dt = is_ptc(gtf_obj, ptc_len)

    # Short peptides are identified after the PTC check to avoid redundancy of classification
    is_orf_short_dt = is_orf_short(gtf_obj, pep_len)

    is_long_3utr_dt = is_long_3utr(gtf_obj, utr3_len)

    # Transcript groups (PTC transcripts, long 3' UTR transcripts, etc) used for the NMD classification
    ptc_ids = {t_id for t_id, flag in is_ptc_dt.items() if flag is True}
    long_3utr_ids = {
        t_id for t_id, flag in is_long_3utr_dt.items() if flag is True
    }

    is_nmd_dt, is_dssj_dt = is_nmd(
        gtf_obj,
        auth_stop_dt,
        sj_dist_th=sj_dist,
        ptc_trans=ptc_ids,
        long_3utr_trans=long_3utr_ids,
        ov_uorf_trans=uorf_categories["overlapping"],
        uorf_trans=uorf_categories["not_overlapping"],
    )

    nmd_features_dt = generate_nmd_features_lines(
        gtf_obj, is_nmd_dt, is_ptc_dt, is_dssj_dt, is_long_3utr_dt,
        uorf_categories)

    # AS in UTR and NAGNAG features
    as_in_utr_dt, as_utr_location_dt, nagnag_dt = identify_similar_coding_features(
        gtf_obj)

    # Some transcripts are missing from these dictionaries either because:
    # (1) Not present in the FASTA file, (2) it doesn't have a CDS, or (3) it's the only transcript in the overlap group

    # Features to annotate
    feature_dicts = {
        "Auto": gtf_obj.trans_gene_dt,
        "No_ORF": is_orf_absent_dt,
        "Short_ORF": is_orf_short_dt,
        "Long_3UTR": is_long_3utr_dt,
        "PTC": is_ptc_dt,
        "NMD": is_nmd_dt,
        "ds_SJ": is_dssj_dt,
        "NMD_features": nmd_features_dt,
        "uORF": uorf_categories,
        "ldORF": is_longer_dorf_dt,
        "AS_in_UTR": as_in_utr_dt,
        "AS_Location": as_utr_location_dt,
        "NAGNAG": nagnag_dt,
    }

    # Feature tags written into the output table
    coding_potentiality_dt, coding_features_dt, alternative_ORF_dt = generate_feature_tag(
        gtf_obj, feature_dicts)

    # Columns required to write the TransFeat table
    table_info_dicts = {
        "Coding_potentiality": coding_potentiality_dt,
        "Coding_features": coding_features_dt,
        "NMD_features": nmd_features_dt,
        "Alternative_ORF": alternative_ORF_dt,
    }

    # Alternative ORF IDs used to validate the classification on the table
    ldorf_ids = {
        t_id for t_id, flag in is_longer_dorf_dt.items() if flag is True
    }
    uorf_ids = {t_id for t_id, flag in is_uorf_dt.items() if flag is True}

    # TransFeat table output
    transfeat_table = write_transfeat_table(
        gtf_obj,
        table_info_dicts,
        pep_seq_dt,
        outfolder,
        outname,
        ldorf_ids=ldorf_ids,
        uorf_ids=uorf_ids,
        pep_len=pep_len,
    )

    # GENERAL/TOTAL output fasta files
    fasta_prefix = os.path.join(outfolder, outname)
    write_fasta_file(cds_seq_dt, f'{fasta_prefix}_nuc.fasta', fasta_header_dt)
    write_fasta_file(pep_seq_dt, f'{fasta_prefix}_pep.fasta', fasta_header_dt)

    # Sequence data required to write the additional fasta files
    sequences_dicts = {
        "Headers": fasta_header_dt,
        "Exonic_seq": trans_seq_dt,
        "CDS_seq": cds_seq_dt,
        "Peptide_seq": pep_seq_dt,
        "ldORF_data": ldorf_data_dt,
        "uORF_data": uorf_data_dt,
    }

    # ADDITIONAL output fasta files
    write_subcategories_fasta(gtf_obj, transfeat_table, sequences_dicts,
                              outfolder, outname)

    # Tables summarizing the main transfeat data
    generate_transfeat_summary(gtf, transfeat_table)

    # Table with additional NMD information
    write_NMD_table(gtf_obj, feature_dicts, sequences_dicts, outfolder,
                    outname)

    # Return output file for TransAll function
    return transfeat_table