Example #1
    def parse(self):
        idx = HTSeq.GenomicArrayOfSets("auto", stranded=False)
        if self.gtf_file:  # may be None if no GTF file is provided
            log.info("Loading " + self.gtf_file)

            gtf_file = HTSeq.GFF_Reader(self.gtf_file, end_included=True)
            n = 0

            for feature in gtf_file:
                if feature.type == "gene":
                    if 'gene_name' in feature.attr:
                        name = feature.attr['gene_name']
                    elif 'Name' in feature.attr:
                        name = feature.attr['Name']
                    elif 'gene' in feature.attr:
                        name = feature.attr['gene']
                    else:
                        name = feature.name

                    if feature.iv.chrom[0:3] == 'chr':
                        feature.iv.chrom = feature.iv.chrom[3:]

                    idx[feature.iv] += name
                    n += 1

            log.info("Loaded " + str(n) + " features")
        return idx
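
For context, here is a minimal sketch of how the GenomicArrayOfSets returned by parse() can be queried, following HTSeq's documented indexing semantics; the gene name and coordinates below are made up for illustration:

import HTSeq

idx = HTSeq.GenomicArrayOfSets("auto", stranded=False)
idx[HTSeq.GenomicInterval("21", 42836478, 42903043)] += "TMPRSS2"

# point query: the set of gene names covering a single position
print(idx[HTSeq.GenomicPosition("21", 42870000)])  # -> {'TMPRSS2'}

# range query: iterate over the distinct (interval, gene-set) steps
for iv, genes in idx[HTSeq.GenomicInterval("21", 42800000, 42950000)].steps():
    print(iv, sorted(genes))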
Example #2
    def add_regions_from_bed(self, regions_bed_file):
        log.info("Parsing regions blacklist file: " + str(regions_bed_file))

        header = True
        with open(regions_bed_file, 'r') as fh:
            for line in fh:
                if not header:
                    params = line.strip("\t\n ").split("\t")
                    if len(params) > 1:
                        for i in [1, 2]:
                            params[i] = int(params[i])

                        d = params[2] - params[1]

                        if d < 1:
                            raise ValueError(
                                "Too small region (starts are 0-based, ends are 1-based, like BED):\n"
                                + line)

                        if len(params) >= 5:
                            self.add_region(params[0], params[1], params[2],
                                            params[3], params[4])
                        else:
                            self.add_region(params[0], params[1], params[2],
                                            params[3], None)
                else:
                    header = False

        log.info("Added " + str(self.r) + " regions to the blacklist")
Example #3
    def add_junctions_from_file(self, junction_file):
        log.info("Parsing junction blacklist file: " + str(junction_file))
        header = True
        with open(junction_file, 'r') as fh:
            for line in fh:
                if not header:
                    params = line.strip().split("\t")
                    if len(params) > 1:
                        for i in [1, 2, 5, 6]:
                            params[i] = int(params[i])

                        d1 = params[2] - params[1]
                        d2 = params[6] - params[5]

                        if d1 < 1 or d2 < 1:
                            raise ValueError(
                                "Too small junction (starts are 0-based, ends are 1-based, like BED):\n"
                                + line)

                        if (params[4] < params[0]) or (
                                params[0] == params[4] and params[5] < params[1]):
                            reg1 = (params[4], params[5], params[6], params[7])
                            reg2 = (params[0], params[1], params[2], params[3])
                        else:
                            reg1 = (params[0], params[1], params[2], params[3])
                            reg2 = (params[4], params[5], params[6], params[7])

                        if len(params) >= 9:
                            self.add_junction(reg1, reg2, params[8])
                        else:
                            self.add_junction(reg1, reg2, None)
                else:
                    header = False

        log.info("Added " + str(self.j) + " junctions to the blacklist")
Example #4
    def integrate(self, output_table, gtf_file, fasta_file):
        log.info("Integrating results")

        def insert_in_index(index, entries, score, i):
            if score not in index:
                index[score] = {}

            key = "{}:{}({})-{}:{}({})|{}".format(
                entries[0].chrA, entries[0].posA, entries[0].strandA,
                entries[0].chrB, entries[0].posB, entries[0].strandB, i)
            index[score][key] = entries

        with open(output_table, 'w') as fh_out:
            header = self.header.split("\t")
            header = "\t".join(header[:-5] + [
                'full-gene-dysregulation', 'frameshift=0', 'frameshift=+1',
                'frameshift=+2', 'splice-motif-edit-distance',
                "exons from (5')", "exons to (3')"
            ] + header[-5:])

            fh_out.write("shared-id\tfusion\t" + header)

            # index used to find duplicates
            self.idx = HTSeq.GenomicArrayOfSets("auto", stranded=True)

            # index used to annotate gene names, e.g. TMPRSS2->ERG
            gene_annotation = GeneAnnotation(gtf_file)
            dfs = DetectFrameShifts(gtf_file) if gtf_file else None

            ffs = Fasta(fasta_file) if fasta_file else None

            intronic_linear = []
            remainder = []

            # Find 'duplicates' or fusions that belong to each other
            log.info(
                "Searching for intronic and exonic breaks that belong to the same event"
            )
            for e in self:
                if dfs and e.RNAstrandA != '.' and e.RNAstrandB != '.':
                    done_breaks = set([])

                    if e.donorA > e.donorB:
                        exons_from, exons_to, frame_shifts = dfs.evaluate(
                            [e.chrA, e.posA, e.RNAstrandA],
                            [e.chrB, e.posB, e.RNAstrandB], 2)
                    else:
                        exons_from, exons_to, frame_shifts = dfs.evaluate(
                            [e.chrB, e.posB, e.RNAstrandB],
                            [e.chrA, e.posA, e.RNAstrandA], 2)

                    done_breaks.add(e.chrA + ':' + str(e.posA) + '/' +
                                    str(e.posA + 1) + '(' + e.strandA + ')->' +
                                    e.chrB + ':' + str(e.posB) + '/' +
                                    str(e.posB + 1) + '(' + e.strandB + ')')

                    fgd = [x[0] + '->' + x[1] for x in frame_shifts['fgd']]
                    frameshifts_0 = [
                        x[0][0] + '->' + x[1][0] for x in frame_shifts[0]
                    ]
                    frameshifts_1 = [
                        x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] +
                        '(+' + str(x[1][1]) + ')' for x in frame_shifts[1]
                    ]
                    frameshifts_2 = [
                        x[0][0] + '(+' + str(x[0][1]) + ')->' + x[1][0] +
                        '(+' + str(x[1][1]) + ')' for x in frame_shifts[2]
                    ]

                    for additional_breaks in e.structure.split('&'):
                        if additional_breaks != '':
                            params = additional_breaks.split(':(')
                            n_split_reads = sum([
                                int(x.split(':')[1])
                                for x in params[1].rstrip(')').split(',')
                                if x.split(':')[0] != 'discordant_mates'
                            ])

                            posAB = params[0].split(':')
                            posA, posB = int(posAB[1].split('/')[0]), int(
                                posAB[2].split('/')[0])

                            if params[0] not in done_breaks and n_split_reads > 0:
                                if e.donorA > e.donorB:  # swap in the same way as above if necessary
                                    exons_from_, exons_to_, frame_shifts = dfs.evaluate(
                                        [e.chrA, posA, e.RNAstrandA],
                                        [e.chrB, posB, e.RNAstrandB], 2)
                                else:
                                    exons_from_, exons_to_, frame_shifts = dfs.evaluate(
                                        [e.chrB, posB, e.RNAstrandB],
                                        [e.chrA, posA, e.RNAstrandA], 2)

                                exons_from += exons_from_
                                exons_to += exons_to_
                                del exons_from_, exons_to_

                                fgd += [
                                    x[0] + '->' + x[1]
                                    for x in frame_shifts['fgd']
                                ]
                                frameshifts_0 += [
                                    x[0][0] + '->' + x[1][0]
                                    for x in frame_shifts[0]
                                ]
                                frameshifts_1 += [
                                    x[0][0] + '(+' + str(x[0][1]) + ')->' +
                                    x[1][0] + '(+' + str(x[1][1]) + ')'
                                    for x in frame_shifts[1]
                                ]
                                frameshifts_2 += [
                                    x[0][0] + '(+' + str(x[0][1]) + ')->' +
                                    x[1][0] + '(+' + str(x[1][1]) + ')'
                                    for x in frame_shifts[2]
                                ]

                            done_breaks.add(params[0])

                    e.exons_from = sorted(set(exons_from))
                    e.exons_to = sorted(set(exons_to))
                    del exons_from, exons_to

                    e.fgd = ','.join(sorted(set(fgd)))
                    e.frameshift_0 = ','.join(sorted(set(frameshifts_0)))
                    e.frameshift_1 = ','.join(sorted(set(frameshifts_1)))
                    e.frameshift_2 = ','.join(sorted(set(frameshifts_2)))
                    del fgd, frameshifts_0, frameshifts_1, frameshifts_2

                if ffs:
                    e.is_on_splice_junction_motif(ffs)

                if e.x_onic == 'intronic' and e.circ_lin == 'linear':
                    intronic_linear.append(e)
                else:
                    remainder.append(e)

                def insert(pos, e):
                    if pos[0][0:3] == 'chr':
                        chrom = pos[0][3:]
                    else:
                        chrom = pos[0]

                    # position_accession = HTSeq.GenomicPosition(pos[0], pos[1], pos[2])
                    position_accession = HTSeq.GenomicInterval(
                        chrom, pos[1], pos[1] + 1, pos[2])
                    position = self.idx[position_accession]
                    position += e

                insert((e.chrA, e.posA, e.strandA), e)
                insert((e.chrB, e.posB, e.strandB), e)

            if ffs is not None:
                ffs.close()

            # Reorder
            log.info("Re-order and find matching entries")
            idx2 = {}
            q = 0
            for e in intronic_linear:
                results_split = [set([]), set([])]
                positions = [(e.chrA, e.posA, e.strandA),
                             (e.chrB, e.posB, e.strandB)]

                for pos_i in [0, 1]:
                    pos = positions[pos_i]

                    if pos[2] == '-':
                        pos1 = pos[1] - 200000
                        pos2 = pos[1]
                    else:
                        pos1 = pos[1]
                        pos2 = pos[1] + 200000

                    if pos[0][0:3] == 'chr':
                        chrom = pos[0][3:]
                    else:
                        chrom = pos[0]

                    for step in self.idx[HTSeq.GenomicInterval(
                            chrom, max(0, pos1), pos2, pos[2])].steps():
                        for e2 in [_ for _ in step[1] if _ != e]:
                            if e2.strandA == e.strandA and e2.strandB == e.strandB:
                                results_split[pos_i].add(e2)

                results = results_split[0].intersection(results_split[1])
                top_result = (None, 9999999999999)
                for r in results:
                    d1 = (r.posA - e.posA)
                    d2 = (r.posB - e.posB)
                    sq_d = math.sqrt(pow(d1, 2) + pow(d2, 2))

                    shared_score = math.sqrt(
                        (pow(e.score, 2) + pow(r.score, 2)) * 0.5)
                    penalty = 1.0 * sq_d / shared_score

                    if penalty < top_result[1]:
                        top_result = (r, penalty)

                if top_result[0]:
                    insert_in_index(idx2, [e, top_result[0]],
                                    e.score + top_result[0].score, q)
                else:
                    insert_in_index(idx2, [e], e.score, q)
                q += 1

            for e in remainder:
                insert_in_index(idx2, [e], e.score, q)
                q += 1

            log.info("Determining fusion gene names and generate output")
            # Generate output
            i = 1
            exported = set([])
            for score in sorted(idx2.keys(), reverse=True):
                for key in sorted(idx2[score].keys()):
                    added = 0
                    for entry in idx2[score][key]:
                        if entry not in exported:
                            acceptors_donors = entry.get_donors_acceptors(
                                gene_annotation)
                            line = entry.line[:-5] + [
                                entry.fgd, entry.frameshift_0,
                                entry.frameshift_1, entry.frameshift_2,
                                entry.edit_dist_to_splice_motif, ",".join(
                                    entry.exons_from), ",".join(entry.exons_to)
                            ] + entry.line[-5:]

                            fh_out.write(
                                str(i) + "\t" + acceptors_donors + "\t" +
                                "\t".join(line) + "\n")
                            exported.add(entry)
                            added += 1

                    if added > 0:
                        i += 1
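
The pairing step above merges an intronic-linear break with its nearest neighbour using a distance penalty normalised by score. A minimal standalone sketch of that formula (the function name and values are illustrative):

import math

def pairing_penalty(e_posA, e_posB, e_score, r_posA, r_posB, r_score):
    # Euclidean distance between the two breakpoint pairs ...
    sq_d = math.hypot(r_posA - e_posA, r_posB - e_posB)
    # ... normalised by the quadratic mean of the two scores, so that
    # high-scoring events tolerate larger positional differences
    shared_score = math.sqrt((e_score ** 2 + r_score ** 2) * 0.5)
    return sq_d / shared_score

print(round(pairing_penalty(1000, 5000, 80, 1400, 5300, 120), 3))  # -> 4.903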
Example #5
    def classify(self, output_file, only_valid, blacklist, min_chim_overhang,
                 ffpe_mismatch_ratio):
        log.info("Loading " + output_file + "[only_valid=" + {
            True: 'true',
            False: 'false'
        }[only_valid] + "]")
        n = 0
        k = 0

        with open(output_file, 'w') as fh:
            fh.write(str(self.get_header()))

            for e in self:
                if isinstance(e, str):
                    fh.write(e)
                else:

                    def classify_intronic_exonic():
                        n_edges_max = int(round(0.00575 * e.score + 5.75, 0))

                        if e.n_edges >= n_edges_max:
                            e.x_onic = 'exonic'

                    classify_intronic_exonic()

                    status = []
                    n += 1

                    # all_entropy_min_f1 = 0.705 + (math.atan((e.score - 150.0) * 0.005) * 0.035)
                    # all_entropy_min_f2 = -0.1375 * math.tanh((e.score - 2250.0) / 900.0) + 0.6175
                    all_entropy_min = -0.26 * math.tanh(
                        (e.score - 25.0) / 20.0) + 0.6225
                    all_entropy_max = -1.0 * (max(e.score, 171) - 175.0) / (
                        5.0 + max(e.score, 171) - 175.0) + (1.0 + 0.965)
                    if e.entropy_all_edges < all_entropy_min:
                        status.append("entropy=" + str(e.entropy_all_edges) +
                                      '<' + str(round(all_entropy_min, 4)))
                    if e.entropy_all_edges > all_entropy_max:
                        status.append("entropy=" + str(e.entropy_all_edges) +
                                      '>' + str(round(all_entropy_max, 4)))

                    # @todo subfunc
                    n_disco_min = int(round(pow((e.n_nodes - 2) * 0.22, 1.7)))
                    if e.n_discordant_reads < n_disco_min:
                        status.append("n_discordant_reads=" +
                                      str(e.n_discordant_reads) + "<" +
                                      str(n_disco_min))

                    # @todo subfunc
                    n_support_min = (0.215 *
                                     pow(max(0, e.n_nodes) - 1.0, 1.59)) + 6.5
                    n_support_min = int(round(n_support_min))
                    if e.n_supporting_reads < n_support_min:
                        status.append("n_support=" +
                                      str(e.n_supporting_reads) + "<" +
                                      str(n_support_min))

                    # @todo subfunc
                    n_disco_max = int(
                        round(math.pow(22.0 * e.n_split_reads, 0.9) + 13))
                    if e.n_split_reads < 100:
                        n_disco_min = int(
                            round(math.pow(0.0195 * e.n_split_reads, 1.95)))
                    elif e.n_split_reads >= 100 and e.n_split_reads < 125:
                        n_disco_min = 4
                    elif e.n_split_reads >= 125 and e.n_split_reads < 325:
                        n_disco_min = int(
                            round((0.135 * (e.n_split_reads - 200.0)) + 14.0))
                    else:
                        n_disco_min = int(
                            round(30.875 + (e.n_split_reads - 325) * 0.024))
                    if e.n_discordant_reads > n_disco_max:
                        status.append("n_disco=" + str(e.n_discordant_reads) +
                                      ">" + str(n_disco_max))
                    if e.n_discordant_reads < n_disco_min:
                        status.append("n_disco=" + str(e.n_discordant_reads) +
                                      "<" + str(n_disco_min))

                    # @todo subfunc
                    n_split_min = int(
                        round((0.32 * e.n_supporting_reads) -
                              pow((0.1 * e.n_supporting_reads), 1.15) - 4.0))
                    if e.n_supporting_reads < 385:
                        n_split_max = int(
                            round((0.986 * e.n_supporting_reads) - pow(
                                0.00535 * e.n_supporting_reads, 3.99 -
                                ((1.0 / 15000.0) * e.n_supporting_reads))))
                    else:
                        n_split_max = int(round(0.94 * e.n_supporting_reads))
                    if e.n_split_reads < n_split_min:
                        status.append("n_split=" + str(e.n_split_reads) + "<" +
                                      str(n_split_min))
                    if e.n_split_reads > n_split_max:
                        status.append("n_split=" + str(e.n_split_reads) + ">" +
                                      str(n_split_max))

                    # @todo subfunc
                    slope = 51.0
                    bp_pos_stddev_max = -(slope * e.nodes_edge) + 15 + (2.0 *
                                                                        slope)
                    if e.bp_pos_stddev > bp_pos_stddev_max:
                        status.append("bp_pos_stddev=" + str(e.bp_pos_stddev) +
                                      ">" + str(bp_pos_stddev_max))

                    # @todo subfunc
                    clips_min = (0.19 * e.score) - 25.0
                    clips_max = (0.84 * e.score) + 550.0
                    if e.clips < clips_min:
                        status.append("clips=" + str(e.clips) + "<" +
                                      str(clips_min))
                    if e.clips > clips_max:
                        status.append("clips=" + str(e.clips) + ">" +
                                      str(clips_max))

                    # @todo subfunc
                    blacklisted = blacklist.is_blacklisted(
                        (e.chrA, e.posA, e.strandA),
                        (e.chrB, e.posB, e.strandB))
                    if len(blacklisted) > 0:
                        status.append("blacklist=" + '&'.join(blacklisted))

                    # @todo subfunc
                    log_ratio_slope_max = (3.6 / 2)
                    log_ratio_rvalue_max = (0.8 / 2)
                    log_ratio_slope = abs(
                        math.log(
                            (e.lr_A_slope + 0.0001) / (e.lr_B_slope + 0.0001)))
                    log_ratio_rvalue = abs(
                        math.log((e.lr_A_rvalue + 0.0001) /
                                 (e.lr_B_rvalue + 0.0001)))
                    if log_ratio_slope > log_ratio_slope_max:
                        status.append("log_ratio_slope=" +
                                      str(round(log_ratio_slope, 2)) + ">" +
                                      str(round(log_ratio_slope_max, 2)))
                    if log_ratio_rvalue > log_ratio_rvalue_max:
                        status.append("log_ratio_rvalue=" +
                                      str(round(log_ratio_rvalue, 2)) + ">" +
                                      str(round(log_ratio_rvalue_max, 2)))

                    # @todo subfunc
                    # FFPE material seems to have a substantially higher number of mismatches per base, though randomly distributed.
                    # If we ever make a v2 of dr-disco that incorporates the concordant reads, this variable can be determined with some kind of calibration;
                    # for now we only estimate the MM ratio without per-position entropy.
                    # [CGCGCTATATCTCGATCGCCCTTAGAGATCCTTTCGAGAGAGCTCTAGAGCG] SOME KIND OF REFERENCE SEQUENCE
                    #  CGCG*TATAT*TC                  TTTC*AGAGAGCT*TAG      The more randomly dispersed mismatches are more trustworthy (right side example)
                    #  CGCG*TATAT*TCGAT                TTCGAGAG*GCTCT
                    #   GCG*TATAT*TCG                  T*CGAGAGAG*TCTA
                    #   GCG*TATAT*TCGA                 TTCG*GAGAGCTCTA
                    #    CG*TATAT*TCGAT                TTCGAG*GAGCTCTAG
                    #     G*TATAT*TCG                   TCGA*AGA*CTCT
                    #
                    if ffpe_mismatch_ratio:
                        log_value_max = -6.45 - ((e.score + 6750.0) /
                                                 (4400.0 - (e.score + 6750.0)))
                    else:
                        log_value_max = -4.7
                    log_value = math.log((float(e.mismatches) + 0.0000001) /
                                         float(e.alignment_score))
                    if log_value > log_value_max:
                        status.append("many_muts=" + str(round(log_value, 2)) +
                                      ">" + str(round(log_value_max, 2)))

                    # @todo subfunc
                    lr_a = e.lr_A_pvalue * e.lr_A_intercept
                    lr_b = e.lr_A_pvalue * e.lr_B_intercept
                    lr_symmetry_max = -e.score / (0.11 + 0.0246 * e.score) + 41
                    n_lr_symmetry = pow(pow(lr_a, 2) + pow(lr_b, 2), 0.5)
                    if n_lr_symmetry >= lr_symmetry_max:
                        status.append("n_lr_symmetry=" +
                                      str(round(n_lr_symmetry, 2)) + ">=" +
                                      str(round(lr_symmetry_max, 2)))

                    # @todo subfunc
                    chim_overhang = min(e.break_A_max_AS, e.break_B_max_AS)
                    if chim_overhang < min_chim_overhang:
                        status.append("chim_overhang=" + str(chim_overhang) +
                                      "<" + str(min_chim_overhang))

                    # @todo subfunc
                    if e.score <= 150:
                        lr_intercept_max = (-31.0 * (
                            (e.score + 100.0) /
                            (1800.0 + e.score + 100.0))) + 85.5
                    else:
                        lr_intercept_max = (
                            (e.score - 150.0) * 0.0225) + 81.71951
                    if e.lr_A_intercept > lr_intercept_max:
                        status.append("lr_A_intercept=" +
                                      str(e.lr_A_intercept) + ">" +
                                      "{:.12g}".format(lr_intercept_max))
                    if e.lr_B_intercept > lr_intercept_max:
                        status.append("lr_B_intercept=" +
                                      str(e.lr_B_intercept) + ">" +
                                      "{:.12g}".format(lr_intercept_max))

                    # @todo subfunc
                    sqrt_entropy_bps_ab = pow(
                        pow(e.entropy_disco_bps_A, 2) +
                        pow(e.entropy_disco_bps_B, 2), 0.5)
                    if e.entropy_all_edges <= 0.85:
                        sqrt_entropy_bps_ab_max = 0.475
                    else:
                        sqrt_entropy_bps_ab_max = 3.4 * e.entropy_all_edges - 2.415
                    if sqrt_entropy_bps_ab > sqrt_entropy_bps_ab_max:
                        status.append("sqrt_entropy_bps_ab=" +
                                      "{:.12g}".format(sqrt_entropy_bps_ab) +
                                      ">" +
                                      str(round(sqrt_entropy_bps_ab_max, 5)))

                    if len(status) == 0:
                        e.status = 'valid'
                        fh.write(str(e))
                        k += 1
                    elif not only_valid:
                        e.status = ','.join(status)
                        fh.write(str(e))

        log.info("Classified " + str(k) + "/" + str(n) + " as valid")
Example #6
    def index_gtf(self):
        """
        GTF file must have:
        CDS entries for coding sequences
        each CDS entry must have:
            - source
            - gene_name attribute
            - transcript_id attribute
            - transcript_version attribute
            - exon_number attribute

        Such gtf files are provided by Ensembl
        """

        log.info("Loading GTF file " + self.gtf_file +
                 " for protein frameshift analysis")

        def load_gtf_per_transcript():
            transcript_idx = {}
            gtf_file = HTSeq.GFF_Reader(self.gtf_file, end_included=True)

            for feature in gtf_file:
                gtf_type = feature.type.lower()
                if gtf_type in ['cds', 'exon']:
                    try:
                        transcript_id = "{}({}.{})-{}".format(
                            feature.attr['gene_name'],
                            feature.attr['transcript_id'],
                            feature.attr['transcript_version'], feature.source)

                        if transcript_id not in transcript_idx:
                            transcript_idx[transcript_id] = {}

                        exon_number = int(feature.attr['exon_number'])
                        if exon_number not in transcript_idx[transcript_id]:
                            transcript_idx[transcript_id][exon_number] = {
                                'exon': None,
                                'cds': None
                            }

                        transcript_idx[transcript_id][exon_number][
                            gtf_type] = feature

                    except KeyError:
                        log.warn(
                            "GTF file is missing required attributes (gene_name, transcript_id or transcript_version); skipping frameshift detection. Ensembl GTF files are known to be compatible."
                        )
                        # GFF_Reader has no close(), so break out of the loop instead
                        break

            return transcript_idx

        def insert_transcript_idx(transcript_idx):
            def clean_chrom(chrom):
                if chrom[0:3] == 'chr':
                    return chrom[3:]
                else:
                    return chrom

            def calc_from(feature):
                if feature.iv.strand == '+':
                    return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom),
                                                 feature.iv.end,
                                                 feature.iv.end + 1,
                                                 feature.iv.strand)
                elif feature.iv.strand == '-':
                    return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom),
                                                 feature.iv.start,
                                                 feature.iv.start + 1,
                                                 feature.iv.strand)

            def calc_to(feature):
                if feature.iv.strand == '+':
                    return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom),
                                                 feature.iv.start,
                                                 feature.iv.start + 1,
                                                 feature.iv.strand)
                elif feature.iv.strand == '-':
                    return HTSeq.GenomicInterval(clean_chrom(feature.iv.chrom),
                                                 feature.iv.end,
                                                 feature.iv.end + 1,
                                                 feature.iv.strand)

            """
            @todo change to:

            for exon in exons:
                if there is a CDS with similar exon-id:

                    if there is no stop codon or this exon id is the last exon id:
                        last_coding_exon = exon_id

                    if there is a start_codon with the same exon-id:
                        first_coding_exon = exon_id


                    - see if this is pre-coding
                        - add to pre-coding 'from' idx
                        - add to pre-coding 'to' list - als je naar dit exon fuseert moet transcriptie nog starten
                    - see if this is first coding exon
                        - add to normal 'from' list
                        - add to pre-coding 'to' list
                    - see if this is an inbetween coding exon
                        - add to normal 'from' list
                        - add to normal 'to' list
                    - see if this is the last coding exon
                        - add to normal 'to' list
                    """
            for transcript_id in transcript_idx:
                coding = "pre"

                cumulative_offset = 0
                exon_ids = sorted(transcript_idx[transcript_id].keys())

                for e in exon_ids:
                    exon = transcript_idx[transcript_id][e]

                    if coding == "pre":
                        if exon['cds'] is None:  # - pre coding
                            # distances are not relevant
                            self.gene_annotation_to_fgd[calc_to(
                                exon['exon'])] += transcript_id
                            self.gene_annotation_from_fgd[calc_from(
                                exon['exon'])] += transcript_id
                        else:  # - first coding
                            length = (exon['cds'].iv.end -
                                      exon['cds'].iv.start) + cumulative_offset

                            off1 = length % 3
                            off2 = -length % 3

                            self.gene_annotation_from[calc_from(
                                exon['exon'])] += (transcript_id, off1)
                            self.gene_annotation_to_fgd[calc_to(
                                exon['exon'])] += transcript_id

                            cumulative_offset = off1
                            coding = True
                    elif coding is True:
                        if e == exon_ids[-1] or transcript_idx[transcript_id][
                                e + 1]['cds'] is None:  # - last coding
                            self.gene_annotation_to[calc_to(
                                exon['exon'])] += (transcript_id, off2)

                            coding = "post"
                        else:  # - middle coding
                            self.gene_annotation_to[calc_to(
                                exon['exon'])] += (transcript_id, off2)

                            length = (exon['cds'].iv.end -
                                      exon['cds'].iv.start) + cumulative_offset
                            off1 = length % 3
                            off2 = -length % 3

                            self.gene_annotation_from[calc_from(
                                exon['exon'])] += (transcript_id, off1)

                            cumulative_offset = off1
                    #  else: # - post coding

        transcript_idx = load_gtf_per_transcript()
        insert_transcript_idx(transcript_idx)
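
The frame bookkeeping above hinges on two offsets per coding exon: off1 = length % 3 (coding bases carried over into the next exon) and off2 = -length % 3 (bases still needed to complete the current codon). A worked example:

for length in (99, 100, 101):
    off1 = length % 3   # bases left over after complete codons
    off2 = -length % 3  # bases missing up to the next codon boundary
    print(length, off1, off2)
# 99  -> 0 0: ends exactly on a codon boundary
# 100 -> 1 2: carries one base over, needs two more
# 101 -> 2 1: carries two bases over, needs one more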
Example #7
    def convert(self, bam_file_discordant_fixed, temp_dir):
        basename, ext = os.path.splitext(
            os.path.basename(self.input_alignment_file))
        basename = temp_dir.rstrip("/") + "/" + basename

        # @TODO / consider todo - start straight from sam
        # samtools view -bS samples/7046-004-041_discordant.Chimeric.out.sam > samples/7046-004-041_discordant.Chimeric.out.unsorted.bam

        log.info(
            "Converting into a name-sorted bam file so that all reads with the same name are adjacent"
        )
        pysam.sort("-o", basename + ".name-sorted.bam", "-n",
                   self.input_alignment_file)

        log.info("Fixing sam file")
        sam_file_discordant = pysam.AlignmentFile(
            basename + ".name-sorted.bam", "rb")
        header = sam_file_discordant.header.to_dict()
        header['RG'] = []
        header['PG'] = []

        fh = pysam.AlignmentFile(basename + ".name-sorted.fixed.sam",
                                 "wb",
                                 header=header)
        for read in sam_file_discordant:
            tag = read.get_tag('RG')
            if tag in [
                    'spanning_singleton_1', 'spanning_singleton_1_r',
                    'spanning_singleton_2', 'spanning_singleton_2_r'
            ]:
                read.is_paired = False
                read.is_read1 = False
                read.is_read2 = False
                read.next_reference_id = -1  # -1 marks the mate reference as unset ('*') in pysam
                read.next_reference_start = -1

            read.set_tag('RG', None)
            read.set_tag('SA', None)
            read.set_tag('FI', None)
            read.set_tag('LB', None)
            fh.write(read)
        fh.close()

        log.info("Converting fixed file into BAM")
        pysam.view('-bS', '-o', basename + ".name-sorted.fixed.bam",
                   basename + ".name-sorted.fixed.sam")

        log.info("Sorting position based fixed file")
        pysam.sort("-o", basename + ".sorted.fixed.bam",
                   basename + ".name-sorted.fixed.bam")

        log.info("Indexing the position sorted bam file")
        pysam.index(basename + ".sorted.fixed.bam")

        log.info("Cleaning up temp files")
        for fname in [
                basename + ".name-sorted.bam",
                basename + ".name-sorted.fixed.sam",
                basename + ".name-sorted.fixed.bam"
        ]:
            log.debug("=> " + fname)
            os.remove(fname)

        log.info("Moving to final destination")
        shutil.move(basename + ".sorted.fixed.bam", bam_file_discordant_fixed)
        shutil.move(basename + ".sorted.fixed.bam" + ".bai",
                    bam_file_discordant_fixed + ".bai")
Example #8
    def convert(self, bam_file_discordant_fixed, temp_dir):
        def randstr(n):
            return ''.join(
                random.choice(string.ascii_uppercase + string.ascii_lowercase +
                              string.digits) for _ in range(n))

        h = hashlib.new('sha256')
        h.update(str_to_bytearray(self.input_alignment_file))
        uid = h.hexdigest() + randstr(24)

        basename, ext = os.path.splitext(
            os.path.basename(self.input_alignment_file))
        basename = temp_dir.rstrip("/") + "/" + basename + '-' + uid

        # @TODO / consider todo - start straight from sam
        # samtools view -bS samples/7046-004-041_discordant.Chimeric.out.sam > samples/7046-004-041_discordant.Chimeric.out.unsorted.bam

        log.info(
            "Converting into a name-sorted bam file so that all reads with the same name are adjacent"
        )
        pysam.sort("-o", basename + ".name-sorted.bam", "-n",
                   self.input_alignment_file)

        log.info("Fixing sam file")
        sam_file_discordant = pysam.AlignmentFile(
            basename + ".name-sorted.bam", "rb")
        header = sam_file_discordant.header.to_dict()
        header['RG'] = [
            {'ID': 'discordant_mates',
             'DS': 'This read has a discordant mate pair'},
            {'ID': 'silent_mate',
             'DS': 'Reads of this type are not discordant while their mate is'},
            {'ID': 'spanning_paired_1',
             'DS': 'This read was aligned to two locations and also has an aligned mate'},
            {'ID': 'spanning_paired_1_r',
             'DS': 'This read was aligned to two locations and also has an aligned mate (strand type r)'},
            {'ID': 'spanning_paired_1_s',
             'DS': 'This read was aligned to two locations and also has an aligned mate (strand type s)'},
            {'ID': 'spanning_paired_1_t',
             'DS': 'This read was aligned to two locations and also has an aligned mate (strand type t)'},
            {'ID': 'spanning_paired_2',
             'DS': 'This read was aligned to two locations and also has an aligned mate'},
            {'ID': 'spanning_paired_2_r',
             'DS': 'This read was aligned to two locations and also has an aligned mate (strand type r)'},
            {'ID': 'spanning_paired_2_s',
             'DS': 'This read was aligned to two locations and also has an aligned mate (strand type s)'},
            {'ID': 'spanning_paired_2_t',
             'DS': 'This read was aligned to two locations and also has an aligned mate (strand type t)'},
            {'ID': 'spanning_singleton_1',
             'DS': 'This read was aligned to two locations but has no aligned mate'},
            {'ID': 'spanning_singleton_1_r',
             'DS': 'This read was aligned to two locations but has no aligned mate'},
            {'ID': 'spanning_singleton_2',
             'DS': 'This read was aligned to two locations but has no aligned mate'},
            {'ID': 'spanning_singleton_2_r',
             'DS': 'This read was aligned to two locations but has no aligned mate'}
        ]

        header['PG'] = [{
            'ID': 'drdisco_fix_chimeric',
            'PN': 'drdisco fix-chimeric',
            'CL': '',
            'VN': __version__
        }]

        fh = pysam.AlignmentFile(basename + ".name-sorted.fixed.sam",
                                 "wb",
                                 header=header)
        last_read_name = None
        alignments = []
        for read in sam_file_discordant:
            if read.qname != last_read_name:
                if len(alignments) > 0:
                    self.reconstruct_alignments(alignments,
                                                sam_file_discordant, fh)
                alignments = []
                last_read_name = read.qname
            alignments.append(read)
        if len(alignments) > 0:
            self.reconstruct_alignments(alignments, sam_file_discordant, fh)
        else:
            os.remove(basename + ".name-sorted.bam")
            os.remove(basename + ".name-sorted.fixed.sam")
            err = "No reads were found, fixing empty sam/bam file: " + self.input_alignment_file
            log.error(err)
            raise Exception(err)
        fh.close()

        log.info("Converting fixed file into BAM")
        pysam.view('-bS', '-o', basename + ".name-sorted.fixed.bam",
                   basename + ".name-sorted.fixed.sam")

        log.info("Sorting position based fixed file")
        pysam.sort("-o", basename + ".sorted.fixed.bam",
                   basename + ".name-sorted.fixed.bam")

        log.info("Indexing the position sorted bam file")
        pysam.index(basename + ".sorted.fixed.bam")

        log.info("Cleaning up temp files")
        for fname in [
                basename + ".name-sorted.bam",
                basename + ".name-sorted.fixed.sam",
                basename + ".name-sorted.fixed.bam"
        ]:
            log.debug("=> " + fname)
            os.remove(fname)

        log.info("Moving to final destination")
        shutil.move(basename + ".sorted.fixed.bam", bam_file_discordant_fixed)
        shutil.move(basename + ".sorted.fixed.bam" + ".bai",
                    bam_file_discordant_fixed + ".bai")