def annotate_novel_coding(assembled_gtf, ref_gtf, ref_fasta, data, out_file=None):
    """
    rewrite the source field of every assembled feature with a coding label

    features whose transcript_id appears in ref_gtf take the source stored
    in the reference; every other feature takes the entry for its
    transcript_id in the result of cpat.classify_with_cpat.

    returns the annotated GTF path (default: assembled_gtf with its
    extension replaced by ".annotated.gtf"). if the classification comes
    back empty, nothing is written and assembled_gtf is returned as-is.
    """
    annotated = out_file or os.path.splitext(assembled_gtf)[0] + ".annotated.gtf"
    if file_exists(annotated):
        # a previous run already produced the file
        return annotated
    cpat_calls = cpat.classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta,
                                         data)
    if not cpat_calls:
        logger.info("Protein coding classification of %s was skipped because "
                    "CPAT was not found." % assembled_gtf)
        return assembled_gtf
    ref_db = gtf.get_gtf_db(ref_gtf)
    # transcript_id -> source, for every transcript in the reference
    reference_sources = {}
    for ref_feature in gtf.complete_features(ref_db):
        reference_sources[ref_feature['transcript_id'][0]] = ref_feature.source
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    with file_transaction(annotated) as tx_out_file, \
            open(tx_out_file, 'w') as out_handle:
        for feature in gtf.complete_features(assembled_db):
            tid = feature['transcript_id'][0]
            if tid in reference_sources:
                feature.source = reference_sources[tid]
            else:
                # novel transcript: fall back to the classifier's label
                feature.source = cpat_calls[tid]
            out_handle.write(str(feature) + "\n")
    return annotated
Esempio n. 2
0
def fix_cufflinks_attributes(ref_gtf, merged_gtf, data, out_file=None):
    """
    replace the cufflinks gene_id and transcript_id with the
    gene_id and transcript_id from ref_gtf, where available

    the "oId" attribute of a merged transcript's first child feature is
    looked up as a reference transcript id; when it resolves to a reference
    gene, features carrying that cufflinks gene_id get the reference
    gene_id, and their transcript_id is replaced by the recorded oId.
    "nearest_ref" and "oId" attributes are dropped from every feature.

    returns the path of the fixed GTF (default: merged_gtf with
    ".clean.fixed" inserted before its extension)
    """
    stem, suffix = os.path.splitext(merged_gtf)
    fixed = out_file or stem + ".clean.fixed" + suffix
    if file_exists(fixed):
        return fixed
    ref_db = gtf.get_gtf_db(ref_gtf)
    merged_db = gtf.get_gtf_db(merged_gtf, in_memory=True)

    # reference transcript id -> reference gene id
    ref_tid_to_gid = {ref_tx.id: ref_gene.id
                      for ref_gene in ref_db.features_of_type('gene')
                      for ref_tx in ref_db.children(ref_gene, level=1)}

    # cufflinks transcript id -> cufflinks gene id / recorded oId
    ctid_to_cgid = {}
    ctid_to_oid = {}
    for cuff_gene in merged_db.features_of_type('gene'):
        for cuff_tx in merged_db.children(cuff_gene, level=1):
            ctid_to_cgid[cuff_tx.id] = cuff_gene.id
            first_child = list(merged_db.children(cuff_tx))[0]
            recorded = first_child.attributes.get("oId", [None])[0]
            if recorded:
                ctid_to_oid[cuff_tx.id] = recorded

    # cufflinks gene id -> reference gene id, via the transcript's oId
    cgid_to_gid = {}
    for ctid, oid in ctid_to_oid.items():
        cuff_gid = ctid_to_cgid.get(ctid)
        ref_gid = ref_tid_to_gid.get(oid)
        if cuff_gid and ref_gid:
            cgid_to_gid[cuff_gid] = ref_gid

    with file_transaction(data, fixed) as tmp_fixed_file:
        with open(tmp_fixed_file, "w") as out_handle:
            for cuff_gene in merged_db.features_of_type('gene'):
                for cuff_tx in merged_db.children(cuff_gene, level=1):
                    for feature in merged_db.children(cuff_tx):
                        attrs = feature.attributes
                        ref_gid = cgid_to_gid.get(
                            attrs.get("gene_id", [None])[0])
                        if ref_gid:
                            attrs["gene_id"][0] = ref_gid
                            current_tid = attrs.get("transcript_id",
                                                    [None])[0]
                        else:
                            # the transcript id is only rewritten when the
                            # gene id was
                            current_tid = None
                        ref_tid = ctid_to_oid.get(current_tid)
                        if ref_tid:
                            attrs["transcript_id"][0] = ref_tid
                        for stale_key in ("nearest_ref", "oId"):
                            if stale_key in attrs:
                                del attrs[stale_key]
                        out_handle.write(str(feature) + "\n")
    return fixed
Esempio n. 3
0
def fix_cufflinks_attributes(ref_gtf, merged_gtf, data, out_file=None):
    """
    swap cufflinks-assigned gene_id / transcript_id values for the ids
    used in ref_gtf wherever a mapping can be established

    the mapping goes through the "oId" attribute found on the first child
    of each merged transcript: that value is looked up among the reference
    transcript ids to find the reference gene. features whose gene_id maps
    to a reference gene get that gene_id, plus the oId as transcript_id.
    the "nearest_ref" and "oId" attributes are removed from the output.

    returns the output path; when out_file is not given it is merged_gtf
    with ".clean.fixed" placed in front of the extension
    """
    base, ext = os.path.splitext(merged_gtf)
    if out_file:
        fixed = out_file
    else:
        fixed = base + ".clean.fixed" + ext
    if file_exists(fixed):
        return fixed
    ref_db = gtf.get_gtf_db(ref_gtf)
    merged_db = gtf.get_gtf_db(merged_gtf, in_memory=True)

    ref_gene_of_transcript = {}
    for ref_gene in ref_db.features_of_type('gene'):
        for ref_transcript in ref_db.children(ref_gene, level=1):
            ref_gene_of_transcript[ref_transcript.id] = ref_gene.id

    cuff_gene_of_transcript = {}
    original_id_of_transcript = {}
    for merged_gene in merged_db.features_of_type('gene'):
        for merged_transcript in merged_db.children(merged_gene, level=1):
            tid = merged_transcript.id
            cuff_gene_of_transcript[tid] = merged_gene.id
            sub_features = list(merged_db.children(merged_transcript))
            original_id = sub_features[0].attributes.get("oId", [None])[0]
            if original_id:
                original_id_of_transcript[tid] = original_id

    ref_gene_of_cuff_gene = {}
    for tid in original_id_of_transcript:
        cuff_gene_id = cuff_gene_of_transcript.get(tid, None)
        original_id = original_id_of_transcript.get(tid, None)
        if original_id:
            ref_gene_id = ref_gene_of_transcript.get(original_id, None)
        else:
            ref_gene_id = None
        if cuff_gene_id and ref_gene_id:
            ref_gene_of_cuff_gene[cuff_gene_id] = ref_gene_id

    with file_transaction(data, fixed) as tmp_fixed_file, \
            open(tmp_fixed_file, "w") as out_handle:
        for merged_gene in merged_db.features_of_type('gene'):
            for merged_transcript in merged_db.children(merged_gene, level=1):
                for feature in merged_db.children(merged_transcript):
                    old_gene_id = feature.attributes.get("gene_id", [None])[0]
                    new_gene_id = ref_gene_of_cuff_gene.get(old_gene_id, None)
                    # transcript ids are only considered once the gene id
                    # has been mapped
                    old_tid = None
                    if new_gene_id:
                        feature.attributes["gene_id"][0] = new_gene_id
                        old_tid = feature.attributes.get("transcript_id",
                                                         [None])[0]
                    new_tid = original_id_of_transcript.get(old_tid, None)
                    if new_tid:
                        feature.attributes["transcript_id"][0] = new_tid
                    if "nearest_ref" in feature.attributes:
                        del feature.attributes["nearest_ref"]
                    if "oId" in feature.attributes:
                        del feature.attributes["oId"]
                    out_handle.write(str(feature) + "\n")
    return fixed
Esempio n. 4
0
def annotate_novel_coding(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    write a copy of assembled_gtf whose source field holds a coding label

    a feature whose transcript_id is present in ref_gtf gets the source
    recorded in the reference; any other feature gets the entry for its
    transcript_id in the result of cpat.classify_with_cpat.

    returns the output path, which defaults to assembled_gtf with its
    extension replaced by ".annotated.gtf"
    """
    if out_file:
        target = out_file
    else:
        target = os.path.splitext(assembled_gtf)[0] + ".annotated.gtf"
    if file_exists(target):
        return target
    novel_calls = cpat.classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta)
    ref_db = gtf.get_gtf_db(ref_gtf)
    # transcript_id -> source for everything the reference already knows
    ref_sources = dict((ref_feature['transcript_id'][0], ref_feature.source)
                       for ref_feature in gtf.complete_features(ref_db))
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    with file_transaction(target) as tx_out_file, \
            open(tx_out_file, 'w') as out_handle:
        for feature in gtf.complete_features(assembled_db):
            tid = feature['transcript_id'][0]
            lookup = ref_sources if tid in ref_sources else novel_calls
            feature.source = lookup[tid]
            out_handle.write(str(feature) + "\n")
    return target
Esempio n. 5
0
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts
    1) if a known gene is known to code for a protein, remove any *novel*
    isoforms of that gene that do not also code for a protein.
    2) any other novel feature whose source is not "protein_coding" is
    relabelled by sequence length: longer than 200 becomes "lincRNA",
    otherwise "ncRNA".

    features whose transcript_id is in the reference are written unchanged.
    returns the output path (default: assembled_gtf with its extension
    replaced by ".cleaned.gtf")
    """
    cleaned = out_file or os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(cleaned):
        return cleaned
    ref_db = gtf.get_gtf_db(ref_gtf)
    # only membership is needed, so keep just the reference transcript ids
    reference_tids = {ref_feature['transcript_id'][0]
                      for ref_feature in gtf.complete_features(ref_db)}
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(cleaned) as tx_out_file, \
            open(tx_out_file, 'w') as out_handle:
        for feature in gtf.complete_features(assembled_db):
            tid = feature['transcript_id'][0]
            gid = feature['gene_id'][0]
            if tid not in reference_tids:
                gene_is_coding = "protein_coding" in ref_gene_to_source.get(
                    gid, [None])
                if feature.source != "protein_coding":
                    if gene_is_coding:
                        # non-coding novel isoform of a coding gene: drop it
                        continue
                    if lengths[tid] > 200:
                        feature.source = "lincRNA"
                    else:
                        feature.source = "ncRNA"
            out_handle.write(str(feature) + "\n")
    return cleaned
Esempio n. 6
0
def annotate_novel_coding(assembled_gtf, ref_gtf, ref_fasta, data, out_file=None):
    """
    label each feature of assembled_gtf through its source field

    transcripts found in ref_gtf inherit the reference source; the rest
    receive the value stored for their transcript_id in the mapping
    returned by cpat.classify_with_cpat.

    returns out_file (default "<assembled_gtf without extension>.annotated.gtf").
    when the classification is empty the function logs that the step was
    skipped and returns assembled_gtf without writing anything.
    """
    if not out_file:
        stem = os.path.splitext(assembled_gtf)[0]
        out_file = stem + ".annotated.gtf"
    if file_exists(out_file):
        return out_file
    coding_calls = cpat.classify_with_cpat(assembled_gtf, ref_gtf, ref_fasta, data)
    if not coding_calls:
        skipped_msg = ("Protein coding classification of %s was skipped because "
                       "CPAT was not found." % assembled_gtf)
        logger.info(skipped_msg)
        return assembled_gtf
    ref_db = gtf.get_gtf_db(ref_gtf)
    ref_source_by_tid = dict(
        (ref_feature["transcript_id"][0], ref_feature.source)
        for ref_feature in gtf.complete_features(ref_db)
    )
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in gtf.complete_features(assembled_db):
                tid = feature["transcript_id"][0]
                in_reference = tid in ref_source_by_tid
                # reference source wins; the classifier only covers novel ids
                feature.source = (ref_source_by_tid[tid] if in_reference
                                  else coding_calls[tid])
                out_handle.write(str(feature) + "\n")
    return out_file
Esempio n. 7
0
def cleanup_transcripts(assembled_gtf, ref_gtf, ref_fasta, out_file=None):
    """
    Clean up a GTF file of assembled transcripts
    1) if a known gene is known to code for a protein, remove any *novel*
    isoforms of that gene that do not also code for a protein.
    2) remaining novel features that are not "protein_coding" are marked
    "lincRNA" when their sequence length is > 200 and "ncRNA" otherwise.

    features belonging to transcripts present in ref_gtf pass through
    untouched. returns the path written, by default assembled_gtf with the
    extension swapped for ".cleaned.gtf"
    """
    if not out_file:
        out_file = os.path.splitext(assembled_gtf)[0] + ".cleaned.gtf"
    if file_exists(out_file):
        return out_file
    ref_db = gtf.get_gtf_db(ref_gtf)
    known_ids = set()
    for ref_feature in gtf.complete_features(ref_db):
        known_ids.add(ref_feature['transcript_id'][0])
    ref_gene_to_source = gtf.get_gene_source_set(ref_gtf)
    assembled_db = gtf.get_gtf_db(assembled_gtf)
    assembled_fasta = gtf.gtf_to_fasta(assembled_gtf, ref_fasta)
    lengths = fasta.sequence_length(assembled_fasta)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, 'w') as out_handle:

            def emit(record):
                out_handle.write(str(record) + "\n")

            for feature in gtf.complete_features(assembled_db):
                transcript_id = feature['transcript_id'][0]
                gene_id = feature['gene_id'][0]
                if transcript_id in known_ids:
                    emit(feature)
                    continue
                gene_sources = ref_gene_to_source.get(gene_id, [None])
                known_coding = "protein_coding" in gene_sources
                is_coding = feature.source == "protein_coding"
                if known_coding and not is_coding:
                    # novel non-coding isoform of a known coding gene
                    continue
                if not is_coding:
                    long_enough = lengths[transcript_id] > 200
                    feature.source = "lincRNA" if long_enough else "ncRNA"
                emit(feature)
    return out_file
Esempio n. 8
0
def isoform_to_gene_name(gtf_file, out_file=None):
    """
    produce a table of isoform -> gene mappings for loading into EBSeq

    one "<transcript_id>\\t<gene_id>" line is written per feature of type
    'transcript' in gtf_file. when out_file is not supplied a named
    temporary file (not auto-deleted) is used. returns the table's path.
    """
    table = out_file or tempfile.NamedTemporaryFile(delete=False).name
    if file_exists(table):
        return table
    db = gtf.get_gtf_db(gtf_file)
    with file_transaction(table) as tx_out_file, \
            open(tx_out_file, "w") as out_handle:
        for feature in db.features_of_type('transcript'):
            row = "{transcript}\t{gene}\n".format(
                transcript=feature['transcript_id'][0],
                gene=feature['gene_id'][0])
            out_handle.write(row)
    return table
Esempio n. 9
0
def isoform_to_gene_name(gtf_file, out_file, data):
    """
    produce a table of isoform -> gene mappings for loading into EBSeq

    each feature of type 'transcript' in gtf_file contributes one line of
    the form "<transcript_id>\\t<gene_id>". a falsy out_file is replaced
    by the name of a fresh, non-auto-deleted temporary file. data is
    handed to file_transaction. returns the path of the table.
    """
    if not out_file:
        placeholder = tempfile.NamedTemporaryFile(delete=False)
        out_file = placeholder.name
    if file_exists(out_file):
        return out_file
    db = gtf.get_gtf_db(gtf_file)
    row_template = "{transcript}\t{gene}\n"
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feature in db.features_of_type('transcript'):
                tid = feature['transcript_id'][0]
                gid = feature['gene_id'][0]
                out_handle.write(row_template.format(transcript=tid, gene=gid))
    return out_file
Esempio n. 10
0
def clean_assembly(gtf_file, clean=None, dirty=None):
    """
    clean the likely garbage transcripts from the GTF file including:
    1. any novel single-exon transcripts
    2. any features with an unknown strand

    the decision for each transcript is delegated to is_likely_noise;
    transcripts it flags are written to the dirty file, the rest to the
    clean file.

    clean / dirty default to gtf_file with ".clean" / ".dirty" inserted
    before the extension. returns the (clean, dirty) path tuple; if the
    clean file already exists nothing is recomputed.
    """
    base, ext = os.path.splitext(gtf_file)
    clean = clean if clean else base + ".clean" + ext
    dirty = dirty if dirty else base + ".dirty" + ext
    if file_exists(clean):
        return clean, dirty
    # build the in-memory database only once we know there is work to do;
    # parsing the GTF is wasted effort when the output already exists
    db = gtf.get_gtf_db(gtf_file, in_memory=True)
    with open(clean, "w") as clean_handle, open(dirty, "w") as dirty_handle:
        for gene in db.features_of_type('gene'):
            for transcript in db.children(gene, level=1):
                if is_likely_noise(db, transcript):
                    write_transcript(db, dirty_handle, transcript)
                else:
                    write_transcript(db, clean_handle, transcript)
    return clean, dirty
Esempio n. 11
0
def clean_assembly(gtf_file, clean=None, dirty=None):
    """
    clean the likely garbage transcripts from the GTF file including:
    1. any novel single-exon transcripts
    2. any features with an unknown strand

    every level-1 child of every gene is passed to is_likely_noise, which
    decides whether it goes to the dirty file or the clean file.

    when not given, clean and dirty are derived from gtf_file by inserting
    ".clean" / ".dirty" before the extension. returns (clean, dirty). an
    existing clean file short-circuits the whole function.
    """
    base, ext = os.path.splitext(gtf_file)
    clean = clean or base + ".clean" + ext
    dirty = dirty or base + ".dirty" + ext
    if file_exists(clean):
        return clean, dirty
    # deferred until after the cache check: loading the whole GTF into
    # memory is the expensive step and is pointless on a cache hit
    db = gtf.get_gtf_db(gtf_file, in_memory=True)
    with open(clean, "w") as clean_handle, open(dirty, "w") as dirty_handle:
        for gene in db.features_of_type('gene'):
            for transcript in db.children(gene, level=1):
                noisy = is_likely_noise(db, transcript)
                destination = dirty_handle if noisy else clean_handle
                write_transcript(db, destination, transcript)
    return clean, dirty
Esempio n. 12
0
def make_pizzly_gtf(gtf_file, out_file, data):
    """
    pizzly needs the GTF to be in gene -> transcript -> exon order for each
    gene. it also wants the gene biotype set as the source

    each gene is written followed by all of its children. the biotype is
    taken from the children's "gene_biotype" attribute (the last child
    carrying one wins), falling back to the gene's own attribute. when
    neither is present the source fields are left as they were rather than
    borrowing the biotype of a previously processed gene.
    "transcript_version" is stripped from every child.

    returns out_file; an existing out_file is returned without rewriting.
    """
    if file_exists(out_file):
        return out_file
    db = gtf.get_gtf_db(gtf_file)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for gene in db.features_of_type("gene"):
                children = list(db.children(id=gene))
                # resolve the biotype afresh for every gene so a value found
                # for an earlier gene can never leak into this one
                gene_biotype = gene.attributes.get("gene_biotype", None)
                for child in children:
                    if child.attributes.get("gene_biotype", None):
                        gene_biotype = child.attributes.get("gene_biotype")
                if gene_biotype:
                    gene.attributes['gene_biotype'] = gene_biotype
                    gene.source = gene_biotype[0]
                print(gene, file=out_handle)
                for child in children:
                    if gene_biotype:
                        child.source = gene_biotype[0]
                    # gffread produces a version-less FASTA file
                    child.attributes.pop("transcript_version", None)
                    print(child, file=out_handle)
    return out_file
Esempio n. 13
0
def make_pizzly_gtf(gtf_file, out_file, data):
    """
    pizzly needs the GTF to be in gene -> transcript -> exon order for each
    gene. it also wants the gene biotype set as the source

    for every gene the "gene_biotype" attribute is collected from its
    children (the last child that has one is used) or, failing that, from
    the gene itself. that biotype is stored on the gene and becomes the
    source of the gene and of all its children. a gene with no biotype
    anywhere keeps its original sources instead of inheriting the biotype
    of the gene processed before it. "transcript_version" is removed from
    each child.

    returns out_file, which is left untouched if it already exists.
    """
    if file_exists(out_file):
        return out_file
    db = gtf.get_gtf_db(gtf_file)
    with file_transaction(data, out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for gene in db.features_of_type("gene"):
                children = list(db.children(id=gene))
                # computed per gene: the biotype must not carry over from
                # one loop iteration to the next
                child_biotypes = [child.attributes.get("gene_biotype")
                                  for child in children
                                  if child.attributes.get("gene_biotype", None)]
                if child_biotypes:
                    gene_biotype = child_biotypes[-1]
                else:
                    gene_biotype = gene.attributes.get("gene_biotype", None)
                new_source = gene_biotype[0] if gene_biotype else None
                if new_source is not None:
                    gene.attributes['gene_biotype'] = gene_biotype
                    gene.source = new_source
                print(gene, file=out_handle)
                for child in children:
                    if new_source is not None:
                        child.source = new_source
                    # gffread produces a version-less FASTA file
                    child.attributes.pop("transcript_version", None)
                    print(child, file=out_handle)
    return out_file