def get_coding_potential_cutoff(ref_gtf, ref_fasta):
    """
    Estimate the coding-potential cutoff that best separates coding from
    noncoding transcripts.

    Splits the reference annotation into a training and a test set, builds
    a hexamer table and a logit model from the training partition, scores
    the test partition with CPAT, and scans cutoffs in [0.1, 1) to find the
    one where sensitivity and specificity are closest (i.e. where they meet).

    ref_gtf:   reference annotation (GTF) with coding and noncoding features
    ref_fasta: genome FASTA matching ref_gtf
    Returns a (best_cutoff, hexamer_content, logit_model) tuple.
    """
    # NOTE(review): a second definition with the same name (taking an extra
    # `data` argument) appears later in this file and shadows this one at
    # import time — confirm which of the two is actually intended to survive.
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    # CDS-only sequences feed the coding side of the hexamer frequency table
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)
    # score = |sensitivity - specificity|; 1 is the maximum possible value,
    # so any real grade improves on the initial best_score
    best_score = 1
    best_cutoff = 0
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        score = abs(grade["sensitivity"] - grade["specificity"])
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
    return best_cutoff, hexamer_content, logit_model
def get_coding_potential_cutoff(ref_gtf, ref_fasta, data):
    """
    Estimate the coding-potential cutoff that best separates coding from
    noncoding transcripts.

    Splits the reference annotation into a training and a test set, builds
    a hexamer table and a logit model from the training partition, scores
    the test partition with CPAT, and scans cutoffs in [0.1, 1) to find the
    one where sensitivity and specificity are closest (i.e. where they meet).

    ref_gtf:   reference annotation (GTF) with coding and noncoding features
    ref_fasta: genome FASTA matching ref_gtf
    data:      pipeline configuration/state passed through to the CPAT helpers
    Returns a (best_cutoff, hexamer_content, logit_model) tuple.
    """
    # NOTE(review): an earlier definition with the same name (without the
    # `data` argument) appears above in this file; this later definition is
    # the one that survives at import time — confirm the earlier copy is
    # intended to be dead code.
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    # CDS-only sequences feed the coding side of the hexamer frequency table
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta, data)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, data, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model, data)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)
    # score = |sensitivity - specificity|; 1 is the maximum possible value,
    # so any real grade improves on the initial best_score
    best_score = 1
    best_cutoff = 0
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        score = abs(grade["sensitivity"] - grade["specificity"])
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
    return best_cutoff, hexamer_content, logit_model