def get_coding_potential_cutoff(ref_gtf, ref_fasta):
    """
    estimate the coding potential cutoff that best classifies
    coding/noncoding transcripts by splitting the reference annotation into
    a test and training set and determining the cutoff where the sensitivity
    and specificity meet

    The training set is used to build the hexamer table and the logit model;
    CPAT is then run on the test set and every cutoff produced by
    ``numpy.arange(0.1, 1, 0.01)`` is graded with ``grade_cpat``. The cutoff
    with the smallest absolute difference between sensitivity and
    specificity wins (the first one, on ties).

    Args:
        ref_gtf: reference annotation, handed to ``gtf.split_gtf``.
        ref_fasta: reference sequence, handed to ``gtf.gtf_to_fasta``.

    Returns:
        A ``(best_cutoff, hexamer_content, logit_model)`` tuple.
        ``best_cutoff`` stays 0 when no graded cutoff gets the
        sensitivity/specificity difference below 1.
    """
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    # NOTE(review): the literal string "test_gtf" is passed here, not the
    # test_gtf variable -- presumably an output prefix; confirm.
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)

    best_score = 1
    best_cutoff = 0
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        # the best cutoff is where sensitivity and specificity are closest
        score = abs(grade["sensitivity"] - grade["specificity"])
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
    return best_cutoff, hexamer_content, logit_model
def get_coding_potential_cutoff(ref_gtf, ref_fasta, data):
    """
    estimate the coding potential cutoff that best classifies
    coding/noncoding transcripts by splitting the reference annotation into
    a test and training set and determining the cutoff where the sensitivity
    and specificity meet

    The training set is used to build the hexamer table and the logit model;
    CPAT is then run on the test set and every cutoff produced by
    ``numpy.arange(0.1, 1, 0.01)`` is graded with ``grade_cpat``. The cutoff
    with the smallest absolute difference between sensitivity and
    specificity wins (the first one, on ties).

    Args:
        ref_gtf: reference annotation, handed to ``gtf.split_gtf``.
        ref_fasta: reference sequence, handed to ``gtf.gtf_to_fasta``.
        data: passed through unchanged to ``hexamer_table``,
            ``make_logit_model`` and ``cpat``.

    Returns:
        A ``(best_cutoff, hexamer_content, logit_model)`` tuple.
        ``best_cutoff`` stays 0 when no graded cutoff gets the
        sensitivity/specificity difference below 1.
    """
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta, data)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    # NOTE(review): the literal string "test_gtf" is passed here, not the
    # test_gtf variable -- presumably an output prefix; confirm.
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, data, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model, data)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)

    best_score = 1
    best_cutoff = 0
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        # the best cutoff is where sensitivity and specificity are closest
        score = abs(grade["sensitivity"] - grade["specificity"])
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
    return best_cutoff, hexamer_content, logit_model
def run(data):
    """Quantitative isoform expression by eXpress

    Returns ``data`` untouched when no transcriptome-mapped BAM file is
    available. Otherwise eXpress is run (only if the ``.xprs`` result does
    not exist yet) and columns 7, 14 and 10 of the result are extracted with
    ``_get_column`` into the effective-count, TPM and FPKM files, which are
    then stored on ``data`` through the ``dd.set_express_*`` setters.

    Args:
        data: sample dictionary; must contain a ``'config'`` key.

    Returns:
        ``data`` with the eXpress count, TPM and FPKM files set.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # All derived files share this stem. Building them from the stem instead
    # of str.replace("xprs", ...) keeps a sample name or work directory that
    # happens to contain "xprs" from being rewritten as well.
    out_base = os.path.join(out_dir, name)
    out_file = out_base + ".xprs"
    express = config_utils.get_program("express", config)
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {bam_file}")
                # NOTE(review): an empty dict, not data, is passed as the
                # third argument of do.run -- confirm this is intended.
                do.run(cmd.format(express=express, tx_out_dir=tx_out_dir,
                                  strand=strand, gtf_fasta=gtf_fasta,
                                  bam_file=bam_file),
                       "Run express on %s." % in_bam, {})
            # results.xprs is picked up from out_dir once the transaction on
            # the directory has finished, and renamed after the sample
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_base + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_base + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_base + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def create_combined_fasta(data):
    """
    if there are genomes to be disambiguated, create a FASTA file of all
    of the transcripts for all genomes

    One transcriptome FASTA per item of ``disambiguate.split([data])`` is
    reused if present or generated with ``gtf.gtf_to_fasta``; the files are
    then concatenated with ``cat`` into a combined file whose name is the
    genome build joined by "-" with the disambiguation builds (if any).
    An already existing combined file is returned as is.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    fasta_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        build_fasta = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if not file_exists(build_fasta):
            build_fasta = gtf.gtf_to_fasta(gtf_file, ref_file,
                                           out_file=build_fasta)
        fasta_files.append(build_fasta)

    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    extra_builds = dd.get_disambiguate(data)
    if extra_builds:
        out_stem = "-".join([out_stem] + extra_builds)
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file

    fasta_file_string = " ".join(fasta_files)
    with file_transaction(data, combined_file) as tx_out_file:
        cmd = "cat {fasta_file_string} > {tx_out_file}".format(
            fasta_file_string=fasta_file_string, tx_out_file=tx_out_file)
        do.run(cmd, "Combining transcriptome FASTA files.")
    return combined_file
def run(data):
    """Quantitative isoform expression by eXpress

    Returns ``data`` untouched when no transcriptome-mapped BAM file is
    available. Otherwise eXpress is run (only if the ``.xprs`` result does
    not exist yet) and columns 7, 14 and 10 of the result are extracted with
    ``_get_column`` into the effective-count, TPM and FPKM files, which are
    then stored on ``data`` through the ``dd.set_express_*`` setters.

    Args:
        data: sample dictionary; its ``'config'`` entry is used to locate
            the ``express`` program.

    Returns:
        ``data`` with the eXpress count, TPM and FPKM files set.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    if not in_bam:
        logger.info(
            "Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # All derived files share this stem. Building them from the stem instead
    # of str.replace("xprs", ...) keeps a sample name or work directory that
    # happens to contain "xprs" from being rewritten as well.
    out_base = os.path.join(out_dir, name)
    out_file = out_base + ".xprs"
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            cmd = (
                "{express} --no-update-check -o {tx_out_dir} {strand} {gtf_fasta} {in_bam}"
            )
            # NOTE(review): an empty dict, not data, is passed as the third
            # argument of do.run -- confirm this is intended.
            do.run(cmd.format(express=express, tx_out_dir=tx_out_dir,
                              strand=strand, gtf_fasta=gtf_fasta,
                              in_bam=in_bam),
                   "Run express on %s." % in_bam, {})
        # results.xprs is picked up from out_dir once the transaction on the
        # directory has finished, and renamed after the sample
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    eff_count_file = _get_column(out_file, out_base + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_base + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_base + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def index_transcriptome(gtf_file, ref_file, data):
    """
    build a bwa index for the transcriptome FASTA derived from a GTF file
    and a reference FASTA file; returns the transcriptome FASTA that was
    indexed
    """
    transcriptome_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    bwa_cmd = config_utils.get_program("bwa", data["config"])
    index_cmd = "{bwa} index {gtf_fasta}".format(
        bwa=bwa_cmd, gtf_fasta=transcriptome_fasta)
    do.run(index_cmd,
           "Creating transcriptome index of %s with bwa." % transcriptome_fasta)
    return transcriptome_fasta
def index_transcriptome(gtf_file, ref_file, data):
    """
    build a bowtie2 index for the transcriptome FASTA derived from a GTF
    file and a reference FASTA file; returns the index prefix, which is the
    transcriptome FASTA path without its extension
    """
    transcriptome_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    index_prefix, _ = os.path.splitext(transcriptome_fasta)
    # the builder is resolved from the bowtie2 program name plus "-build"
    builder = config_utils.get_program("bowtie2", data["config"]) + "-build"
    build_cmd = "{bowtie2_build} --offrate 1 {gtf_fasta} {bowtie2_index}".format(
        bowtie2_build=builder,
        gtf_fasta=transcriptome_fasta,
        bowtie2_index=index_prefix)
    do.run(build_cmd,
           "Creating transcriptome index of %s with bowtie2." % transcriptome_fasta)
    return index_prefix
def classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta, data):
    """
    classify every assembled transcript by its CPAT coding probability

    A transcript whose coding probability is above the cutoff estimated by
    ``get_coding_potential_cutoff`` is labelled "protein_coding". The
    remaining transcripts are labelled by length: "lncRNA" when longer than
    200, "ncRNA" otherwise.

    Args:
        assembled_gtf: GTF of assembled transcripts to classify.
        ref_gtf: reference annotation used to estimate the cutoff.
        ref_fasta: reference sequence, handed to ``gtf.gtf_to_fasta``.
        data: used to look up ``cpat.py`` and passed through to
            ``get_coding_potential_cutoff`` and ``cpat``.

    Returns:
        A dict mapping transcript id to its label, or an empty dict when
        ``cpat.py`` cannot be found.
    """
    cpat_cmd = config_utils.get_program("cpat.py", data)
    if not cpat_cmd:
        return {}
    cutoff, hexamer, logit = get_coding_potential_cutoff(ref_gtf, ref_fasta, data)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    cpat_fn = cpat(assembled_fasta, hexamer, logit, data)
    coding_probabilities = load_cpat_coding_prob(cpat_fn)
    lengths = fasta.sequence_length(assembled_fasta)
    classification = {}
    for transcript, prob in coding_probabilities.items():
        if prob > cutoff:
            classification[transcript] = "protein_coding"
        # elif, not if: the length-based labels only apply to transcripts
        # that were not called coding, otherwise the "protein_coding" label
        # would always be overwritten
        elif lengths[transcript] > 200:
            classification[transcript] = "lncRNA"
        else:
            classification[transcript] = "ncRNA"
    return classification
def classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta):
    """
    classify every assembled transcript by its CPAT coding probability

    A transcript whose coding probability is above the cutoff estimated by
    ``get_coding_potential_cutoff`` is labelled "protein_coding". The
    remaining transcripts are labelled by length: "lncRNA" when longer than
    200, "ncRNA" otherwise.

    Args:
        assembled_gtf: GTF of assembled transcripts to classify.
        ref_gtf: reference annotation used to estimate the cutoff.
        ref_fasta: reference sequence, handed to ``gtf.gtf_to_fasta``.

    Returns:
        A dict mapping transcript id to its label, or an empty dict when
        ``_find_executable`` does not find ``cpat.py``.
    """
    cpat_cmd = _find_executable("cpat.py")
    if not cpat_cmd:
        return {}
    cutoff, hexamer, logit = get_coding_potential_cutoff(ref_gtf, ref_fasta)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    cpat_fn = cpat(assembled_fasta, hexamer, logit)
    coding_probabilities = load_cpat_coding_prob(cpat_fn)
    lengths = fasta.sequence_length(assembled_fasta)
    classification = {}
    for transcript, prob in coding_probabilities.items():
        if prob > cutoff:
            classification[transcript] = "protein_coding"
        # elif, not if: the length-based labels only apply to transcripts
        # that were not called coding, otherwise the "protein_coding" label
        # would always be overwritten
        elif lengths[transcript] > 200:
            classification[transcript] = "lncRNA"
        else:
            classification[transcript] = "ncRNA"
    return classification
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts

    1) transcripts whose id is already in the reference are kept as they are
    2) if a gene is known to code for a protein, remove any *novel* isoforms
       of it that are not themselves marked as protein coding
    3) any other transcript that is not protein coding is marked as lincRNA
       if it is > 200 bp, and as ncRNA otherwise

    The result is written to out_file (default: the assembled GTF name with
    a ".cleaned.gtf" extension); an existing out_file is returned untouched.
    """
    if not out_file:
        out_file = os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file

    ref_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = {}
    for ref_feature in gtf.complete_features(ref_db):
        known_transcript[ref_feature['transcript_id'][0]] = ref_feature.source
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for feature in gtf.complete_features(assembled_db):
                transcript_id = feature['transcript_id'][0]
                gene_id = feature['gene_id'][0]
                if transcript_id not in known_transcript:
                    gene_sources = ref_gene_to_source.get(gene_id, [None])
                    known_coding = "protein_coding" in gene_sources
                    if feature.source != "protein_coding":
                        # novel non-coding isoform of a coding gene: drop it
                        if known_coding:
                            continue
                        if lengths[transcript_id] > 200:
                            feature.source = "lincRNA"
                        else:
                            feature.source = "ncRNA"
                out_handle.write(str(feature) + "\n")
    return out_file
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts

    1) transcripts whose id is already in the reference are kept as they are
    2) if a gene is known to code for a protein, remove any *novel* isoforms
       of it that are not themselves marked as protein coding
    3) any other transcript that is not protein coding is marked as lincRNA
       if it is > 200 bp, and as ncRNA otherwise

    The result is written to out_file (default: the assembled GTF name with
    a ".cleaned.gtf" extension); an existing out_file is returned untouched.
    """
    if not out_file:
        stem, _ = os.path.splitext(assembled_gtf)
        out_file = stem + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file

    ref_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = {
        ref_feature['transcript_id'][0]: ref_feature.source
        for ref_feature in gtf.complete_features(ref_db)
    }
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for feature in gtf.complete_features(assembled_db):
                transcript_id = feature['transcript_id'][0]
                gene_id = feature['gene_id'][0]
                line = None
                if transcript_id in known_transcript:
                    # known transcripts pass through unchanged
                    line = str(feature) + "\n"
                else:
                    known_coding = "protein_coding" in ref_gene_to_source.get(
                        gene_id, [None])
                    if feature.source == "protein_coding":
                        line = str(feature) + "\n"
                    elif not known_coding:
                        feature.source = ("lincRNA" if lengths[transcript_id] > 200
                                          else "ncRNA")
                        line = str(feature) + "\n"
                # line stays None for novel non-coding isoforms of coding genes
                if line is not None:
                    out_handle.write(line)
    return out_file
def make_transcriptome_fasta(gtf_file, org_build):
    """
    write the transcriptome FASTA for a GTF file next to it (same name with
    a ".fa" extension), using the genome FASTA that get_genome_fasta returns
    for org_build; returns whatever gtf_to_fasta returns
    """
    genome_fasta = get_genome_fasta(org_build)
    fasta_path = os.path.splitext(gtf_file)[0] + ".fa"
    return gtf_to_fasta(gtf_file, genome_fasta, out_file=fasta_path)
def index_transcriptome(gtf_file, ref_file, data):
    """
    convert a GTF file and a reference FASTA file into a transcriptome
    FASTA and hand it to build_bwa_index; returns that function's result
    """
    transcriptome_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    bwa_index = build_bwa_index(transcriptome_fasta, data)
    return bwa_index
def make_transcriptome_fasta(gtf_file, genome_fasta):
    """
    write the transcriptome FASTA for a GTF file next to it (same name with
    a ".fa" extension) from the given genome FASTA; returns whatever
    gtf_to_fasta returns
    """
    fasta_path = os.path.splitext(gtf_file)[0] + ".fa"
    return gtf_to_fasta(gtf_file, genome_fasta, out_file=fasta_path)