Esempio n. 1
0
def get_genome(data):
    """
    Return the effective genome length for the genome build of a sample.

    The build name is looked up in the table of precomputed effective lengths
    (macs2.HS). When the build is not in that table, the lengths of all
    sequences in the sample's reference file are added up and returned instead.
    """
    from bcbio.chipseq import macs2
    from bcbio.bam import fasta
    build = dd.get_genome_build(data)
    precomputed = macs2.HS
    if build not in precomputed:
        # no precomputed value: fall back to the total length of the reference
        sequence_lengths = fasta.sequence_length(dd.get_ref_file(data))
        return sum(sequence_lengths.values())
    return precomputed[build]
Esempio n. 2
0
def get_genome(data):
    """
    Look up the effective size of the sample's genome.

    Precomputed effective sizes are read from macs2.HS, keyed by genome build.
    Builds missing from that table get the summed length of every sequence in
    the reference file as their size.
    """
    from bcbio.chipseq import macs2
    from bcbio.bam import fasta
    genome_build = dd.get_genome_build(data)
    effective_sizes = macs2.HS
    if genome_build in effective_sizes:
        return effective_sizes[genome_build]
    # fall back to the raw genome length when nothing is precomputed
    total_length = 0
    for seq_length in fasta.sequence_length(dd.get_ref_file(data)).values():
        total_length += seq_length
    return total_length
Esempio n. 3
0
def classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta):
    """
    Classify assembled transcripts with CPAT coding probabilities.

    Returns a dictionary mapping each transcript scored by CPAT to one of
    "protein_coding", "lncRNA" or "ncRNA". An empty dictionary is returned
    when the cpat.py executable cannot be found.

    A transcript whose coding probability is above the cutoff derived from the
    reference annotation is "protein_coding". Every other transcript is
    labelled by the length of its sequence: "lncRNA" when longer than 200
    (presumably bases), "ncRNA" otherwise.
    """
    cpat_cmd = _find_executable("cpat.py")
    if not cpat_cmd:
        return {}
    cutoff, hexamer, logit = get_coding_potential_cutoff(ref_gtf, ref_fasta)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    cpat_fn = cpat(assembled_fasta, hexamer, logit)
    coding_probabilities = load_cpat_coding_prob(cpat_fn)
    lengths = fasta.sequence_length(assembled_fasta)
    classification = {}
    for transcript, prob in coding_probabilities.items():
        # the length-based labels only apply to non-coding transcripts; a
        # plain `if` here would overwrite every "protein_coding" call
        if prob > cutoff:
            classification[transcript] = "protein_coding"
        elif lengths[transcript] > 200:
            classification[transcript] = "lncRNA"
        else:
            classification[transcript] = "ncRNA"
    return classification
Esempio n. 4
0
def classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta, data):
    """
    Classify assembled transcripts with CPAT coding probabilities.

    Returns a dictionary mapping each transcript scored by CPAT to one of
    "protein_coding", "lncRNA" or "ncRNA". An empty dictionary is returned
    when no cpat.py program is configured for the sample in data.

    A transcript whose coding probability is above the cutoff derived from the
    reference annotation is "protein_coding". Every other transcript is
    labelled by the length of its sequence: "lncRNA" when longer than 200
    (presumably bases), "ncRNA" otherwise.
    """
    cpat_cmd = config_utils.get_program("cpat.py", data)
    if not cpat_cmd:
        return {}
    cutoff, hexamer, logit = get_coding_potential_cutoff(ref_gtf, ref_fasta, data)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    cpat_fn = cpat(assembled_fasta, hexamer, logit, data)
    coding_probabilities = load_cpat_coding_prob(cpat_fn)
    lengths = fasta.sequence_length(assembled_fasta)
    classification = {}
    for transcript, prob in coding_probabilities.items():
        # the length-based labels only apply to non-coding transcripts; a
        # plain `if` here would overwrite every "protein_coding" call
        if prob > cutoff:
            classification[transcript] = "protein_coding"
        elif lengths[transcript] > 200:
            classification[transcript] = "lncRNA"
        else:
            classification[transcript] = "ncRNA"
    return classification
Esempio n. 5
0
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Write a cleaned copy of a GTF file of assembled transcripts.

    Transcripts whose ID also appears in the reference GTF are copied through
    unchanged. The remaining (novel) transcripts are handled as follows:
    1) a novel transcript that is not protein coding is dropped when its gene
    is protein coding in the reference.
    2) any other novel transcript that is not protein coding is relabelled by
    length: "lincRNA" when > 200 bp, "ncRNA" otherwise.

    Returns the path of the cleaned GTF, which defaults to the assembled GTF
    path with its extension replaced by ".cleaned.gtf". If that file already
    exists it is returned without being regenerated.
    """

    if not out_file:
        base, _ = os.path.splitext(assembled_gtf)
        out_file = base + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    reference_db = gtf.get_gtf_db(ref_gtf)
    known_transcript = {}
    for ref_feature in gtf.complete_features(reference_db):
        known_transcript[ref_feature['transcript_id'][0]] = ref_feature.source
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for record in gtf.complete_features(assembled_db):
                tx_id = record['transcript_id'][0]
                gene = record['gene_id'][0]
                if tx_id not in known_transcript:
                    gene_sources = ref_gene_to_source.get(gene, [None])
                    if record.source != "protein_coding":
                        # non-coding isoform of a known coding gene: drop it
                        if "protein_coding" in gene_sources:
                            continue
                        if lengths[tx_id] > 200:
                            record.source = "lincRNA"
                        else:
                            record.source = "ncRNA"
                out_handle.write(str(record) + "\n")
    return out_file
Esempio n. 6
0
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Filter and relabel the transcripts of an assembled GTF file.

    Features of transcripts already present in the reference GTF are written
    out untouched. For transcripts that are new to the assembly:
    1) if the gene is protein coding in the reference, isoforms that are not
    themselves protein coding are removed.
    2) otherwise, isoforms that are not protein coding are marked as "lincRNA"
    when > 200 bp and as "ncRNA" when shorter or equal.

    The result is written to out_file (by default the assembled GTF name with
    a ".cleaned.gtf" extension) and that path is returned. An output file that
    already exists is returned as is.
    """

    out_file = out_file or os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    ref_db = gtf.get_gtf_db(ref_gtf)
    reference_transcripts = dict(
        (feat['transcript_id'][0], feat.source)
        for feat in gtf.complete_features(ref_db))
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:
            for feat in gtf.complete_features(assembled_db):
                tid = feat['transcript_id'][0]
                gid = feat['gene_id'][0]
                keep = True
                is_novel = tid not in reference_transcripts
                if is_novel:
                    gene_is_coding = "protein_coding" in ref_gene_to_source.get(gid, [None])
                    if feat.source != "protein_coding":
                        if gene_is_coding:
                            # novel non-coding isoform of a coding gene
                            keep = False
                        elif lengths[tid] > 200:
                            feat.source = "lincRNA"
                        else:
                            feat.source = "ncRNA"
                if keep:
                    out_handle.write(str(feat) + "\n")
    return out_file