Ejemplo n.º 1
0
def getGeneTable(reffile):
    E.info("Loading reference")
    table = defaultdict(dict)
    for ens_gene in GTF.gene_iterator(GTF.iterator(
            IOTools.open_file(reffile))):
        geneid = ens_gene[0][0].gene_id
        table[geneid]["models"] = dict()
        table[geneid]["start_codons"] = defaultdict(list)

        for transcript in ens_gene:

            transcript_id = transcript[0].transcript_id
            table[geneid]["models"][transcript_id] = transcript

            CDS = GTF.asRanges(transcript, "start_codon")
            if len(CDS) == 0:
                continue

            if transcript[0].strand == "-":
                start_codon = max(e[1] for e in CDS)
            else:
                start_codon = min(e[0] for e in CDS)

            table[geneid]["start_codons"][start_codon].append(transcript_id)

    E.info("Reference Loaded")
    return table
Ejemplo n.º 2
0
def transcript2bed12(transcript):

    new_entry = Bed.Bed()
    start = min(entry.start for entry in transcript)
    end = max(entry.end for entry in transcript)

    try:
        thickStart = min(entry.start for entry in transcript
                         if entry.feature == "CDS")
        thickEnd = max(entry.end for entry in transcript
                       if entry.feature == "CDS")
    except ValueError:

        # if there is no CDS, then set first base of transcript as
        # start

        if transcript[0].strand == "-":
            thickStart = end
            thickEnd = end
        else:
            thickStart = start
            thickEnd = start

    exons = GTF.asRanges(transcript, "exon")

    exon_starts = [es - start for (es, ee) in exons]
    exon_lengths = [ee - es for (es, ee) in exons]
    exon_count = len(exons)
    new_entry.contig = transcript[0].contig
    new_entry.start = start
    new_entry.end = end
    new_entry["strand"] = transcript[0].strand
    new_entry["name"] = transcript[0].transcript_id

    new_entry["thickStart"] = thickStart
    new_entry["thickEnd"] = thickEnd

    new_entry["blockCount"] = exon_count
    new_entry["blockStarts"] = ",".join(map(str, exon_starts))
    new_entry["blockSizes"] = ",".join(map(str, exon_lengths))

    return new_entry
Ejemplo n.º 3
0
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--reffile",
                      dest="reffile",
                      type="string",
                      help="Supply reference gtf file name")

    parser.add_option("-d",
                      "--class-file",
                      dest="classfile",
                      type="string",
                      help="Supply database name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="Supply output bed file name")

    parser.add_option("-u",
                      "--indivfile",
                      dest="indivfile",
                      type="string",
                      help="Supply output bed file name for individual utrons")

    parser.add_option("-p",
                      "--partfile",
                      dest="partfile",
                      type="string",
                      help="Supply output bed file name for partnered utrons")
    parser.add_option(
        "-q",
        "--indivpartfile",
        dest="indivpartfile",
        type="string",
        help="Supply output bed file name for individual partnered utrons")
    parser.add_option("-n",
                      "--novel-file",
                      dest="novelfile",
                      type="string",
                      help="Supply output bed file name for novel introns")
    parser.add_option(
        "--novel-transcript",
        dest="novel_id",
        type="string",
        help="DEBUG: Output info for this transcript from the STDIN")
    parser.add_option(
        "--target-transcript",
        dest="target_id",
        type="string",
        help="DEBUG: Output info for this transcript from ref-file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    outlines = []
    individuals = []
    partnered = []
    individualpartnered = []
    novel = []

    db = pandas.read_csv(options.classfile, sep="\t")

    # This keeps just one entry per-transcript - why?
    #db = db.groupby("transcript_id").first()
    db = db.set_index("transcript_id")
    enshashtable = getGeneTable(options.reffile)

    for novel_transcript in GTF.transcript_iterator(GTF.iterator(
            options.stdin)):

        # Why do it on a gene by gene basis rather than transcript by transcript basis?
        transcript_id = novel_transcript[0].transcript_id

        if transcript_id == options.novel_id:
            output_novel = True
        else:
            output_novel = False

        try:
            geneid = db.loc[transcript_id].match_gene_id
        except KeyError:
            if output_novel:
                E.debug("Transcript %s not in class table" % transcript_id)
            continue

        if pandas.isnull(geneid):
            if output_novel:
                E.debug("Transcript %s matches no gene in class table" %
                        transcript_id)
            continue

        ens_gene = enshashtable[geneid]

        all_ref_introns = set()
        novel_transcript_exons = GTF.asRanges(novel_transcript, "exon")
        novel_transcript_introns = GTF.toIntronIntervals(novel_transcript)
        for ref_transcript in ens_gene["models"].values():
            ref_introns = GTF.toIntronIntervals(ref_transcript)
            all_ref_introns.update(ref_introns)

        #Identify comparison set
        def _in_exon(position, exons):
            return any(e[0] <= position <= e[1] for e in exons)

        # check if this ever gets the wrong start_codon.
        filtered_starts = [
            s for s in ens_gene["start_codons"]
            if _in_exon(s, novel_transcript_exons)
        ]

        if len(filtered_starts) == 0:
            if output_novel:
                E.debug("No starts found for %s" % transcript_id)
            continue

        #if novel_transcript[0].strand == "-":
        #    selected_start = max(filtered_starts)
        #else:
        #    selected_start = min(filtered_starts)

        selected_models = list()
        for startc in filtered_starts:
            selected_models.extend(ens_gene["start_codons"][startc])

        if output_novel:
            E.debug("Transcripts with compatible starts are %s" %
                    selected_models)

        for ref_transcript_id in selected_models:

            if output_novel and ref_transcript_id == options.target_id:
                output_ref = True
            else:
                output_ref = False

            second = ens_gene["models"][ref_transcript_id]
            ens_CDS = GTF.asRanges(second, "CDS")

            if len(ens_CDS) == 0:
                if output_ref:
                    E.debug("%s is not coding"
                            )  # ensure only protein-coding transcripts
                continue

            ens_exons = GTF.asRanges(second, "exon")

            first_introns = set(novel_transcript_introns)
            second_introns = set(GTF.toIntronIntervals(second))

            first_CDSintrons = [
                intron for intron in first_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            second_CDSintrons = [
                intron for intron in second_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            first_CDSintrons = set(first_CDSintrons)
            second_CDSintrons = set(second_CDSintrons)

            if not first_CDSintrons == second_CDSintrons:
                if output_ref:
                    E.debug("CDS chains do not match. Chains are:")
                    first_CDSintrons = sorted(list(first_CDSintrons))
                    second_CDSintrons = sorted(list(second_CDSintrons))
                    output = "\n".join(
                        map(str, zip(first_CDSintrons, second_CDSintrons)))
                    E.debug(output)
                continue  # match CDS intron chain

            firstUTRintrons = first_introns - first_CDSintrons

            if len(firstUTRintrons) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            secondUTRintrons = second_introns - second_CDSintrons

            found = False
            for intron in first_introns:
                if (intron[0] < ens_CDS[-1][1] and
                    intron[1] > ens_CDS[-1][1]) or \
                    (intron[0] < ens_CDS[0][0] and
                     intron[1] > ens_CDS[0][0]):

                    found = True
                    break  # ensure pruned transcript doesn't have
                    # introns overlapping start or stop codons in ensembl
                    # transcript
            if found:
                if output_ref:
                    E.debug("Start or stop in intron")
                continue

            if second[0].strand == "+":
                ens_stop = ens_CDS[-1][1]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[0] >= ens_CDS[-1][1]
                    and intron[1] < ens_exons[-1][1]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[0] >= ens_CDS[-1][1]
                    and intron[1] < ens_exons[-1][1]
                ]
            else:
                ens_stop = ens_CDS[0][0]
                UTR3introns = [
                    intron for intron in firstUTRintrons if
                    intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons if
                    intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0]
                ]

            if len(UTR3introns) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            outbed = Bed.Bed()
            outbed.fields = ['.', '.', '.', '.', '.', '.', '.', '.', '.']
            outbed.fromIntervals(UTR3introns)
            outbed.contig = novel_transcript[0].contig
            outbed["name"] = novel_transcript[0].transcript_id
            outbed["strand"] = novel_transcript[0].strand
            outlines.append(outbed)  # get output for each transcript

            for item in UTR3introns:
                outbed2 = Bed.Bed()
                outbed2.fields = ['.', '.', '.', '.']
                outbed2.fromIntervals([item])
                outbed2.contig = novel_transcript[0].contig
                outbed2['name'] = novel_transcript[0].transcript_id
                outbed2["strand"] = novel_transcript[0].strand
                outbed2["thickStart"] = ens_stop
                individuals.append(outbed2)  # get output for each intron

            UTR3introns = set(UTR3introns)
            secondUTR3introns = set(secondUTR3introns)
            extraUTR3introns = list(UTR3introns - secondUTR3introns)

            if output_ref and len(secondUTR3introns - UTR3introns) > 0:
                E.debug("Following introns in UTR of %s but not %s" %
                        (options.target_id, options.novel_id))
                E.debug(secondUTRintrons - UTR3introns)

            # get only introns that are not in matched transcript
            if len(extraUTR3introns) != 0 and len(secondUTR3introns -
                                                  UTR3introns) == 0:
                outbed3 = Bed.Bed()
                outbed3.fields = ['.'] * 9
                outbed3.fromIntervals(extraUTR3introns)
                outbed3.contig = novel_transcript[0].contig
                outbed3["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed3["strand"] = novel_transcript[0].strand
                partnered.append(outbed3)

                for item in extraUTR3introns:
                    outbed4 = Bed.Bed()
                    outbed4.fields = ['.', '.', '.', '.']
                    outbed4.fromIntervals([item])
                    outbed4.contig = novel_transcript[0].contig
                    outbed4["name"] = novel_transcript[
                        0].transcript_id + ":" + second[0].transcript_id
                    outbed4["strand"] = novel_transcript[0].strand
                    outbed4["thickStart"] = ens_stop
                    individualpartnered.append(outbed4)

            if len(all_ref_introns) == 0:
                ens_starts, ens_ends = [], []
            else:
                ens_starts, ens_ends = zip(*all_ref_introns)

            novelEvents = [
                i for i in UTR3introns
                if i[0] not in ens_starts and i[1] not in ens_ends
            ]

            for item in novelEvents:
                outbed5 = Bed.Bed()
                outbed5.fields = ['.'] * 4
                outbed5.fromIntervals([item])
                outbed5.contig = novel_transcript[0].contig
                outbed5["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed5["strand"] = novel_transcript[0].strand
                outbed5["thickStart"] = ens_stop
                novel.append(outbed5)

    with IOTools.open_file(options.outfile, "w") as outf:
        for line in outlines:
            outf.write(str(line) + "\n")

    if options.indivfile is not None:
        with IOTools.open_file(options.indivfile, "w") as outf2:
            for line in individuals:
                outf2.write(str(line) + "\n")

    if options.partfile is not None:
        with IOTools.open_file(options.partfile, "w") as outf3:
            for line in partnered:
                outf3.write(str(line) + "\n")

    if options.indivpartfile is not None:
        with IOTools.open_file(options.indivpartfile, "w") as outf4:
            for line in individualpartnered:
                outf4.write(str(line) + "\n")

    if options.novelfile is not None:
        with IOTools.open_file(options.novelfile, "w") as outf5:
            for line in novel:
                outf5.write(str(line) + "\n")
    # write footer and output benchmark information.
    E.stop()