Exemple #1
0
def get_coding_potential_cutoff(ref_gtf, ref_fasta):
    """
    Estimate the coding potential cutoff that best separates coding
    from noncoding transcripts.

    The reference annotation is split into a training and a test set
    (``sample_size=2000``). A hexamer table and a logit model are built
    from the training transcripts, CPAT is run on the test transcripts,
    and every cutoff from 0.1 up to (but excluding) 1 in steps of 0.01 is
    graded. The cutoff with the smallest absolute difference between
    sensitivity and specificity wins; on ties the lowest cutoff is kept.

    Returns a ``(best_cutoff, hexamer_content, logit_model)`` tuple.
    ``best_cutoff`` stays 0 when no cutoff scores strictly below 1.
    """
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    # NOTE(review): the literal string "test_gtf" is passed here, not the
    # test_gtf variable -- presumably an output prefix; confirm.
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)
    best_score = 1
    best_cutoff = 0
    # iterate the array directly; no need to materialize a list first
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        score = abs(grade["sensitivity"] - grade["specificity"])
        # strict comparison keeps the first (lowest) cutoff on ties
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
    return best_cutoff, hexamer_content, logit_model
Exemple #2
0
def get_coding_potential_cutoff(ref_gtf, ref_fasta, data):
    """
    Estimate the coding potential cutoff that best separates coding
    from noncoding transcripts.

    The reference annotation is split into a training and a test set
    (``sample_size=2000``). A hexamer table and a logit model are built
    from the training transcripts, CPAT is run on the test transcripts,
    and every cutoff from 0.1 up to (but excluding) 1 in steps of 0.01 is
    graded. The cutoff with the smallest absolute difference between
    sensitivity and specificity wins; on ties the lowest cutoff is kept.

    ``data`` is forwarded unchanged to ``hexamer_table``,
    ``make_logit_model`` and ``cpat``.

    Returns a ``(best_cutoff, hexamer_content, logit_model)`` tuple.
    ``best_cutoff`` stays 0 when no cutoff scores strictly below 1.
    """
    train_gtf, test_gtf = gtf.split_gtf(ref_gtf, sample_size=2000)
    coding_gtf = gtf.partition_gtf(train_gtf, coding=True)
    noncoding_gtf = gtf.partition_gtf(train_gtf)
    noncoding_fasta = gtf.gtf_to_fasta(noncoding_gtf, ref_fasta)
    cds_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta, cds=True)
    hexamer_content = hexamer_table(cds_fasta, noncoding_fasta, data)
    coding_fasta = gtf.gtf_to_fasta(coding_gtf, ref_fasta)
    # NOTE(review): the literal string "test_gtf" is passed here, not the
    # test_gtf variable -- presumably an output prefix; confirm.
    logit_model = make_logit_model(coding_fasta, noncoding_fasta,
                                   hexamer_content, data, "test_gtf")
    test_fasta = gtf.gtf_to_fasta(test_gtf, ref_fasta)
    cpat_fn = cpat(test_fasta, hexamer_content, logit_model, data)
    cpat_prob = load_cpat_coding_prob(cpat_fn)
    coding, noncoding = gtf.get_coding_noncoding_transcript_ids(test_gtf)
    best_score = 1
    best_cutoff = 0
    # iterate the array directly; no need to materialize a list first
    for cutoff in numpy.arange(0.1, 1, 0.01):
        grade = grade_cpat(coding, noncoding, cpat_prob, cutoff)
        score = abs(grade["sensitivity"] - grade["specificity"])
        # strict comparison keeps the first (lowest) cutoff on ties
        if score < best_score:
            best_score = score
            best_cutoff = cutoff
    return best_cutoff, hexamer_content, logit_model
Exemple #3
0
def run(data):
    """Quantitative isoform expression by eXpress.

    Skips (returning ``data`` unchanged) when no transcriptome-mapped BAM
    is available. Otherwise runs eXpress unless ``<name>.xprs`` already
    exists, extracts three columns from the result via ``_get_column``
    and records the resulting files on ``data``.

    Returns the updated ``data``.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    config = data['config']
    if not in_bam:
        logger.info("Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # Extension-less stem shared by every per-sample output file.
    out_base = os.path.join(out_dir, name)
    out_file = out_base + ".xprs"
    express = config_utils.get_program("express", config)
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(out_dir) as tx_out_dir:
                bam_file = _prepare_bam_file(in_bam, tmp_dir, config)
                cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} "
                       "{gtf_fasta} {bam_file}").format(
                           express=express, tx_out_dir=tx_out_dir,
                           strand=strand, gtf_fasta=gtf_fasta,
                           bam_file=bam_file)
                do.run(cmd, "Run express on %s." % in_bam, {})
            # eXpress writes results.xprs; rename it to the per-sample name
            shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    # Build sibling names from the stem rather than str.replace on the whole
    # path: replace("xprs", ...) would also rewrite any "xprs" occurring in
    # the sample name or the work directory.
    eff_count_file = _get_column(out_file, out_base + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_base + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_base + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
def create_combined_fasta(data):
    """
    Build one transcript FASTA per item returned by
    ``disambiguate.split([data])`` (reusing files that already exist)
    and concatenate them into a single combined FASTA.

    Returns the path of the combined FASTA; an existing combined file
    is returned as-is.
    """
    transcriptome_dir = os.path.join(dd.get_work_dir(data), "inputs",
                                     "transcriptome")
    per_genome_fastas = []
    for split_item in disambiguate.split([data]):
        genome_data = split_item[0]
        gtf_file = dd.get_gtf_file(genome_data)
        ref_file = dd.get_ref_file(genome_data)
        genome_fasta = os.path.join(
            transcriptome_dir, dd.get_genome_build(genome_data) + ".fa")
        if not file_exists(genome_fasta):
            genome_fasta = gtf.gtf_to_fasta(gtf_file, ref_file,
                                            out_file=genome_fasta)
        per_genome_fastas.append(genome_fasta)

    # combined name: <build>[-<disambiguation genome>...].fa
    stem = os.path.join(transcriptome_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        stem = "-".join([stem] + (dd.get_disambiguate(data) or []))
    combined_file = stem + ".fa"
    if file_exists(combined_file):
        return combined_file

    joined_inputs = " ".join(per_genome_fastas)
    with file_transaction(data, combined_file) as tx_out_file:
        command = "cat {fasta_file_string} > {tx_out_file}".format(
            fasta_file_string=joined_inputs, tx_out_file=tx_out_file)
        do.run(command, "Combining transcriptome FASTA files.")
    return combined_file
Exemple #5
0
def create_combined_fasta(data):
    """
    Create a FASTA file holding the transcripts of every genome involved
    in disambiguation.

    A per-genome transcript FASTA is generated (or reused when present)
    for each item from ``disambiguate.split([data])``; the files are then
    joined with ``cat``. Returns the combined FASTA path.
    """
    base_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    fastas = []
    for entry in disambiguate.split([data]):
        sample = entry[0]
        annotation = dd.get_gtf_file(sample)
        reference = dd.get_ref_file(sample)
        target = os.path.join(base_dir, dd.get_genome_build(sample) + ".fa")
        # only generate the per-genome FASTA when it is not there yet
        fastas.append(target if file_exists(target)
                      else gtf.gtf_to_fasta(annotation, reference,
                                            out_file=target))
    name_parts = [os.path.join(base_dir, dd.get_genome_build(data))]
    if dd.get_disambiguate(data):
        name_parts = ["-".join(name_parts + (dd.get_disambiguate(data) or []))]
    combined_file = name_parts[0] + ".fa"
    if not file_exists(combined_file):
        sources = " ".join(fastas)
        template = "cat {fasta_file_string} > {tx_out_file}"
        with file_transaction(data, combined_file) as tx_out_file:
            do.run(template.format(fasta_file_string=sources,
                                   tx_out_file=tx_out_file),
                   "Combining transcriptome FASTA files.")
    return combined_file
Exemple #6
0
def run(data):
    """Quantitative isoform expression by eXpress.

    Skips (returning ``data`` unchanged) when no transcriptome-mapped BAM
    is available. Otherwise runs eXpress unless ``<name>.xprs`` already
    exists, extracts three columns from the result via ``_get_column``
    and records the resulting files on ``data``.

    Returns the updated ``data``.
    """
    name = dd.get_sample_name(data)
    in_bam = dd.get_transcriptome_bam(data)
    if not in_bam:
        logger.info(
            "Transcriptome-mapped BAM file not found, skipping eXpress.")
        return data
    gtf_fasta = gtf.gtf_to_fasta(dd.get_gtf_file(data), dd.get_ref_file(data))
    out_dir = os.path.join(dd.get_work_dir(data), "express", name)
    # Extension-less stem shared by every per-sample output file.
    out_base = os.path.join(out_dir, name)
    out_file = out_base + ".xprs"
    express = config_utils.get_program("express", data['config'])
    strand = _set_stranded_flag(in_bam, data)
    if not file_exists(out_file):
        with file_transaction(out_dir) as tx_out_dir:
            cmd = ("{express} --no-update-check -o {tx_out_dir} {strand} "
                   "{gtf_fasta} {in_bam}").format(
                       express=express, tx_out_dir=tx_out_dir, strand=strand,
                       gtf_fasta=gtf_fasta, in_bam=in_bam)
            do.run(cmd, "Run express on %s." % in_bam, {})
        # eXpress writes results.xprs; rename it to the per-sample name
        shutil.move(os.path.join(out_dir, "results.xprs"), out_file)
    # Build sibling names from the stem rather than str.replace on the whole
    # path: replace("xprs", ...) would also rewrite any "xprs" occurring in
    # the sample name or the work directory.
    eff_count_file = _get_column(out_file, out_base + "_eff.counts", 7)
    tpm_file = _get_column(out_file, out_base + ".tpm", 14)
    fpkm_file = _get_column(out_file, out_base + ".fpkm", 10)
    data = dd.set_express_counts(data, eff_count_file)
    data = dd.set_express_tpm(data, tpm_file)
    data = dd.set_express_fpkm(data, fpkm_file)
    return data
Exemple #7
0
def index_transcriptome(gtf_file, ref_file, data):
    """
    Derive a transcript FASTA from a GTF file and a reference FASTA and
    run ``bwa index`` on it. Returns the transcript FASTA path.
    """
    transcript_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    bwa_exe = config_utils.get_program("bwa", data["config"])
    do.run("{bwa} index {gtf_fasta}".format(bwa=bwa_exe,
                                            gtf_fasta=transcript_fasta),
           "Creating transcriptome index of %s with bwa." % transcript_fasta)
    return transcript_fasta
Exemple #8
0
def index_transcriptome(gtf_file, ref_file, data):
    """
    Index the transcriptome with bwa: the transcripts described by
    ``gtf_file`` are written as FASTA from ``ref_file`` and that FASTA is
    passed to ``bwa index``. The FASTA path is returned.
    """
    fasta_path = gtf.gtf_to_fasta(gtf_file, ref_file)
    program = config_utils.get_program("bwa", data["config"])
    template = "{bwa} index {gtf_fasta}"
    index_cmd = template.format(bwa=program, gtf_fasta=fasta_path)
    description = "Creating transcriptome index of %s with bwa." % fasta_path
    do.run(index_cmd, description)
    return fasta_path
Exemple #9
0
def index_transcriptome(gtf_file, ref_file, data):
    """
    Derive a transcript FASTA from a GTF file and a reference FASTA and
    build a bowtie2 index from it. Returns the index prefix, which is the
    FASTA path without its extension.
    """
    transcript_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    index_prefix, _ = os.path.splitext(transcript_fasta)
    # the builder executable is the bowtie2 program path plus "-build"
    builder = config_utils.get_program("bowtie2", data["config"]) + "-build"
    command = "{bowtie2_build} --offrate 1 {gtf_fasta} {bowtie2_index}".format(
        bowtie2_build=builder, gtf_fasta=transcript_fasta,
        bowtie2_index=index_prefix)
    do.run(command,
           "Creating transcriptome index of %s with bowtie2." % transcript_fasta)
    return index_prefix
Exemple #10
0
def index_transcriptome(gtf_file, ref_file, data):
    """
    Index the transcriptome with bowtie2: write the transcripts of
    ``gtf_file`` as FASTA from ``ref_file`` and run ``bowtie2-build`` with
    ``--offrate 1`` on the result. The returned value is the index prefix
    (FASTA path with the extension stripped).
    """
    fasta_path = gtf.gtf_to_fasta(gtf_file, ref_file)
    prefix = os.path.splitext(fasta_path)[0]
    build_exe = "%s-build" % config_utils.get_program("bowtie2", data["config"])
    template = "{bowtie2_build} --offrate 1 {gtf_fasta} {bowtie2_index}"
    build_cmd = template.format(bowtie2_build=build_exe,
                                gtf_fasta=fasta_path,
                                bowtie2_index=prefix)
    description = "Creating transcriptome index of %s with bowtie2." % fasta_path
    do.run(build_cmd, description)
    return prefix
Exemple #11
0
def classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta, data):
    """
    Classify assembled transcripts as coding or noncoding using CPAT.

    A cutoff, hexamer table and logit model are obtained from
    ``get_coding_potential_cutoff``; CPAT is then run on the assembled
    transcripts. Each transcript is labelled:

    * ``"protein_coding"`` when its coding probability exceeds the cutoff
    * ``"lncRNA"`` when it does not and its sequence length is > 200
    * ``"ncRNA"`` otherwise

    Returns a dict mapping transcript id to label, or an empty dict when
    the ``cpat.py`` program cannot be located.
    """
    # NOTE(review): other callers pass data["config"] to get_program;
    # here the whole data dict is passed -- confirm this is intended.
    cpat_cmd = config_utils.get_program("cpat.py", data)
    if not cpat_cmd:
        return {}
    cutoff, hexamer, logit = get_coding_potential_cutoff(ref_gtf, ref_fasta, data)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    cpat_fn = cpat(assembled_fasta, hexamer, logit, data)
    coding_probabilities = load_cpat_coding_prob(cpat_fn)
    lengths = fasta.sequence_length(assembled_fasta)
    classification = {}
    for transcript, prob in coding_probabilities.items():
        if prob > cutoff:
            classification[transcript] = "protein_coding"
        # elif, not if: the length-based labels apply only to transcripts
        # that were not called coding, otherwise they would always
        # overwrite the "protein_coding" label assigned above.
        elif lengths[transcript] > 200:
            classification[transcript] = "lncRNA"
        else:
            classification[transcript] = "ncRNA"
    return classification
Exemple #12
0
def classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta):
    """
    Classify assembled transcripts as coding or noncoding using CPAT.

    A cutoff, hexamer table and logit model are obtained from
    ``get_coding_potential_cutoff``; CPAT is then run on the assembled
    transcripts. Each transcript is labelled:

    * ``"protein_coding"`` when its coding probability exceeds the cutoff
    * ``"lncRNA"`` when it does not and its sequence length is > 200
    * ``"ncRNA"`` otherwise

    Returns a dict mapping transcript id to label, or an empty dict when
    the ``cpat.py`` executable cannot be found.
    """
    cpat_cmd = _find_executable("cpat.py")
    if not cpat_cmd:
        return {}
    cutoff, hexamer, logit = get_coding_potential_cutoff(ref_gtf, ref_fasta)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    cpat_fn = cpat(assembled_fasta, hexamer, logit)
    coding_probabilities = load_cpat_coding_prob(cpat_fn)
    lengths = fasta.sequence_length(assembled_fasta)
    classification = {}
    for transcript, prob in coding_probabilities.items():
        if prob > cutoff:
            classification[transcript] = "protein_coding"
        # elif, not if: the length-based labels apply only to transcripts
        # that were not called coding, otherwise they would always
        # overwrite the "protein_coding" label assigned above.
        elif lengths[transcript] > 200:
            classification[transcript] = "lncRNA"
        else:
            classification[transcript] = "ncRNA"
    return classification
Exemple #13
0
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts.

    1) Transcripts whose id is present in the reference are written
       unchanged.
    2) A novel transcript of a gene that the reference marks as
       protein coding is dropped unless the transcript itself has
       source "protein_coding".
    3) Any other novel transcript that is not protein coding has its
       source set to "lincRNA" when its sequence length is > 200 and to
       "ncRNA" otherwise.

    The result is written to ``out_file`` (default: the assembled GTF
    path with its extension replaced by ".cleaned.gtf"). An existing
    output file is reused. Returns the output path.
    """

    out_file = out_file or os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    ref_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = {}
    for ref_feature in gtf.complete_features(ref_db):
        known_transcript[ref_feature['transcript_id'][0]] = ref_feature.source
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for feature in gtf.complete_features(assembled_db):
                transcript_id = feature['transcript_id'][0]
                gene_id = feature['gene_id'][0]
                if transcript_id not in known_transcript:
                    gene_sources = ref_gene_to_source.get(gene_id, [None])
                    known_coding = "protein_coding" in gene_sources
                    if known_coding and feature.source != "protein_coding":
                        # noncoding novel isoform of a coding gene: drop it
                        continue
                    if feature.source != "protein_coding":
                        is_long = lengths[transcript_id] > 200
                        feature.source = "lincRNA" if is_long else "ncRNA"
                out_handle.write(str(feature) + "\n")
    return out_file
Exemple #14
0
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Filter and relabel the transcripts of an assembled GTF file.

    * A transcript already present in the reference annotation is kept
      as it is.
    * A novel, non-protein-coding isoform of a gene that is protein
      coding in the reference is removed.
    * Every remaining novel transcript that is not protein coding is
      marked "lincRNA" if its sequence length is > 200, "ncRNA" if not.

    Output goes to ``out_file``; when it is not given, the name is the
    assembled GTF path with ".cleaned.gtf" in place of its extension. If
    that file already exists nothing is recomputed. Returns the path.
    """

    if not out_file:
        stem, _ = os.path.splitext(assembled_gtf)
        out_file = stem + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    ref_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = dict(
        (ref_feature['transcript_id'][0], ref_feature.source)
        for ref_feature in gtf.complete_features(ref_db))
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for record in gtf.complete_features(assembled_db):
                transcript_id = record['transcript_id'][0]
                gene_id = record['gene_id'][0]
                is_novel = transcript_id not in known_transcript
                if is_novel:
                    ref_sources = ref_gene_to_source.get(gene_id, [None])
                    if record.source != "protein_coding":
                        if "protein_coding" in ref_sources:
                            # the gene is coding in the reference, but this
                            # new isoform is not: leave it out
                            continue
                        if lengths[transcript_id] > 200:
                            record.source = "lincRNA"
                        else:
                            record.source = "ncRNA"
                out_handle.write(str(record) + "\n")
    return out_file
def make_transcriptome_fasta(gtf_file, org_build):
    """
    Write the transcripts of ``gtf_file`` as FASTA next to the GTF file
    (same path, ".fa" extension), using the genome FASTA looked up for
    ``org_build``. Returns whatever ``gtf_to_fasta`` returns.
    """
    genome_fasta = get_genome_fasta(org_build)
    fasta_path = os.path.splitext(gtf_file)[0] + ".fa"
    return gtf_to_fasta(gtf_file, genome_fasta, out_file=fasta_path)
Exemple #16
0
def index_transcriptome(gtf_file, ref_file, data):
    """
    Derive a transcript FASTA from a GTF file and a reference FASTA and
    hand it to ``build_bwa_index``, returning that call's result.
    """
    return build_bwa_index(gtf.gtf_to_fasta(gtf_file, ref_file), data)
def make_transcriptome_fasta(gtf_file, genome_fasta):
    """
    Write the transcripts of ``gtf_file`` as FASTA from ``genome_fasta``.
    The output path is the GTF path with its extension replaced by ".fa".
    Returns whatever ``gtf_to_fasta`` returns.
    """
    stem = os.path.splitext(gtf_file)[0]
    return gtf_to_fasta(gtf_file, genome_fasta, out_file=stem + ".fa")
Exemple #18
0
def index_transcriptome(gtf_file, ref_file, data):
    """
    Index the transcriptome described by a GTF file and a reference
    FASTA: the transcript FASTA is generated first and then passed,
    together with ``data``, to ``build_bwa_index``.
    """
    transcript_fasta = gtf.gtf_to_fasta(gtf_file, ref_file)
    index_result = build_bwa_index(transcript_fasta, data)
    return index_result
Exemple #19
0
def make_transcriptome_fasta(gtf_file, genome_fasta):
    """
    Convert ``gtf_file`` to a transcript FASTA using ``genome_fasta``.
    The FASTA is requested at the GTF path with a ".fa" extension; the
    value returned by ``gtf_to_fasta`` is passed back to the caller.
    """
    root, _ext = os.path.splitext(gtf_file)
    destination = "%s.fa" % root
    result = gtf_to_fasta(gtf_file, genome_fasta, out_file=destination)
    return result