Example #1
import numpy

# `gtf` and the CPAT helpers (hexamer_table, make_logit_model, cpat,
# load_cpat_coding_prob, grade_cpat) are project-local modules assumed
# to be importable alongside this function.

def get_coding_potential_cutoff(ref_gtf, ref_fasta):
    """
    Estimate the coding potential cutoff that best classifies
    coding/noncoding transcripts by splitting the reference
    annotation into training and test sets and determining
    the cutoff where the sensitivity and specificity meet.
    """
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    # partition the training set into coding and noncoding transcripts
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    # hexamer frequencies of coding CDS vs. noncoding sequence feed the
    # logistic-regression model that CPAT scores transcripts with
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, "test_gtf")
    # score the held-out test set and pull out the coding probabilities
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)
    # scan cutoffs for the point where sensitivity and specificity meet;
    # |sensitivity - specificity| is at most 1, so 1 is a safe ceiling
    best_score = 1
    best_cutoff = 0
    best_sensitivity = 0
    best_specificity = 0
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        score = abs(grade["sensitivity"] - grade["specificity"])
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
            # sensitivity/specificity are tracked for inspection here
            # but are not part of the return value
            best_sensitivity = grade["sensitivity"]
            best_specificity = grade["specificity"]
    return best_cutoff, hexamer_content, logit_model
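
The cutoff search above hinges on grade_cpat, which is not shown in this listing. A minimal sketch of what such a helper could compute, under the assumption that transcripts scoring at or above the cutoff are called coding (the name, signature, and exact tie-breaking behavior of the real helper are assumptions here):

def grade_cpat(coding, noncoding, cpat_prob, cutoff):
    # hypothetical stand-in: sensitivity is the fraction of known coding
    # transcripts called coding at this cutoff, specificity the fraction
    # of known noncoding transcripts called noncoding
    tp = sum(1 for t in coding if cpat_prob.get(t, 0) >= cutoff)
    tn = sum(1 for t in noncoding if cpat_prob.get(t, 0) < cutoff)
    return {"sensitivity": tp / float(len(coding)),
            "specificity": tn / float(len(noncoding))}

# toy check: one of two coding transcripts clears a 0.5 cutoff
print(grade_cpat({"tx1", "tx2"}, {"tx3"},
                 {"tx1": 0.9, "tx2": 0.4, "tx3": 0.1}, 0.5))
# -> {'sensitivity': 0.5, 'specificity': 1.0}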
Example #2
import numpy

def get_coding_potential_cutoff(ref_gtf, ref_fasta, data):
    """
    Estimate the coding potential cutoff that best classifies
    coding/noncoding transcripts by splitting the reference
    annotation into training and test sets and determining
    the cutoff where the sensitivity and specificity meet.
    """
    # same procedure as Example #1, except a `data` context object is
    # threaded through hexamer_table, make_logit_model and cpat
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta, data)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, data, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model, data)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)
    best_score = 1
    best_cutoff = 0
    best_sensitivity = 0
    best_specificity = 0
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        score = abs(grade["sensitivity"] - grade["specificity"])
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
            best_sensitivity = grade["sensitivity"]
            best_specificity = grade["specificity"]
    return best_cutoff, hexamer_content, logit_model
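
Both examples also rely on load_cpat_coding_prob to turn the CPAT result file into a transcript-to-probability mapping. A minimal sketch, assuming CPAT writes a tab-separated table with transcript ids in the first column and a coding_prob column (the column name varies between CPAT versions, so verify it against the installed tool):

import pandas as pd

def load_cpat_coding_prob(cpat_fn):
    # assumption: an R-style tab-separated table whose first column
    # holds transcript ids and which includes a "coding_prob" column
    table = pd.read_csv(cpat_fn, sep="\t", index_col=0)
    return table["coding_prob"].to_dict()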