Example #1
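# presumed context for this snippet: from collections import defaultdict;
# import pyinter; and ts is the toolshed module (import toolshed as ts),
# whose nopen() opens plain or gzipped files alike.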
def read_exons(gtf):
    transcripts = defaultdict(pyinter.IntervalSet)
    totlen = 0
    names = []
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf) if x[0] != "#"):
        if toks[2] not in ("CDS", "stop_codon") or toks[1] != "protein_coding":
            continue
        #if toks[0] != "1": break
        start, end = map(int, toks[3:5])
        assert start <= end, toks
        transcript = toks[8].split('transcript_id "')[1].split('"', 1)[0]
        transcripts[transcript].add(pyinter.closedopen(start-1, end))

        names.append(toks[8].split('transcript_name "')[1].split('"', 1)[0].rsplit("-", 1)[0])
        ids.append(toks[8].split('gene_id "')[1].split('"', 1)[0])
        trs.append(toks[8].split('transcript_id "')[1].split('"', 1)[0])

    # sort by start so we can do binary search.
    # TODO: need to remove overlapping exons so we don't double-count
    transcripts = dict((k, sorted(v)) for k, v in transcripts.items())
    ints = {}
    lens = pyinter.IntervalSet()
    for tr, ivset in transcripts.items():
        sends = sorted(list(ivset))
        iset = pyinter.IntervalSet(pyinter.closedopen(x.lower_value, x.upper_value)
                                   for x in sends)
        lens = lens.union(iset)
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[tr] = (ss, es)
    totlen = sum(x.upper_value - x.lower_value for x in lens)
    return ints, set(names), set(ids), set(trs), totlen
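The attribute column is parsed by plain string splitting rather than a GTF parser. A minimal sketch of the same trick on a made-up Ensembl-style attribute string:

attrs = 'gene_id "ENSG000001"; transcript_id "ENST000001"; transcript_name "GENE1-001";'
# take the text between the first pair of quotes following the key
transcript = attrs.split('transcript_id "')[1].split('"', 1)[0]
assert transcript == "ENST000001"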
Example #2
def load_genome_gaps(gapsfile, chrom_name):
    gaps = pyinter.IntervalSet()
    with open(gapsfile, 'r') as f:
        # stream the file rather than materializing every line
        for line in f:
            toks = line.split('\t')
            if toks[0] != chrom_name:
                continue
            a, b = int(toks[1]), int(toks[2])
            gaps.add(pyinter.closedopen(a, b))
    return gaps
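A quick way to exercise load_genome_gaps is a small tab-separated file in the chrom/start/end layout the function expects (the file name and contents here are hypothetical):

import pyinter

with open('gaps.bed', 'w') as f:
    f.write('chr1\t100\t200\nchr1\t500\t600\nchr2\t0\t50\n')

gaps = load_genome_gaps('gaps.bed', 'chr1')
assert gaps == pyinter.IntervalSet([pyinter.closedopen(100, 200),
                                    pyinter.closedopen(500, 600)])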
Example #3
def test_get_insertion_overlap_positions():
    blocks = [
        GenomeInterval(1, 0, 100),        # path nodes 0, 1
        GenomeInterval(1, 100, 200),      # 2, 3
        GenomeInterval(1, 210, 300),      # 4, 5
        GenomeInterval(1, 350, 360),      # 6, 7
        GenomeInterval(1, 370, 400),      # 8, 9
        GenomeInterval(1, 0, 100, True),  # 10, 11
        GenomeInterval(1, 0, 10, True),   # 12, 13
    ]
    paths = (list(range(10)),
             [0, 1, 10, 11, 2, 3],
             [0, 1, 2, 3, 10, 11, 2, 3],
             [0, 1, 2, 3, 12, 13, 2, 3],
             [0, 1, 2, 3, 4, 5, 10, 11, 6, 7],
             [0, 1, 2, 3, 4, 5, 12, 13, 6, 7])
    truth = [
        tuple(),
        ((80, 170),),
        ((185, 275),),
        tuple(),
        ((305, 395),),
        tuple()
    ]
    rlen = 50
    m = 20

    for i in range(len(truth)):
        out, _, _ = get_insertion_overlap_positions(paths[i], blocks, rlen, m)
        inter = pyinter.IntervalSet()
        for interval in truth[i]:
            inter.add(pyinter.open(interval[0], interval[1]))
        print('truth: {0}\nresult: {1}\n'.format(inter, out))
        assert out == inter

    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 200, 300),
        GenomeInterval(0, 350, 400),
        GenomeInterval(1, 0, 50, True),
        GenomeInterval(1, 0, 50, True)
    ]
    path = [0, 1, 6, 7, 2, 3, 8, 9, 4, 5]
    truth = [(130, 170), (355, 395)]
    out, _, _ = get_insertion_overlap_positions(path, blocks, rlen, m)
    inter = pyinter.IntervalSet()
    for interval in truth:
        inter.add(pyinter.open(interval[0], interval[1]))
    print('truth: {0}\nresult: {1}\n'.format(inter, out))
    assert out == inter
Example #4
def get_insertion_overlap_positions(path, blocks, read_len, min_mappable=20):
    invalid_read_start_d = pyinter.IntervalSet()
    invalid_read_start_t = pyinter.IntervalSet()
    invalid_window_start = pyinter.IntervalSet()
    m = min_mappable
    R = read_len
    pos = 0

    blocks_gaps = genome_blocks_gaps(blocks, path)
    for b in blocks_gaps:
        if b.is_de_novo and 0 < len(b) - R + 2 * m:
            invalid_read_start_d.add(
                pyinter.open(pos - m, pos + len(b) - R + m))
        elif b.is_translocation and 0 < len(b) - R + 2 * m:
            invalid_read_start_t.add(
                pyinter.open(pos - m, pos + len(b) - R + m))
        if b.is_insertion():
            invalid_window_start.add(pyinter.open(pos - m, pos + len(b)))
        pos += len(b)
    invalid_read_start = pyinter.IntervalSet()
    # the indirection through invalid_window_start lets pyinter merge
    # overlapping insertion windows before each one is trimmed by (R - m)
    for interval in invalid_window_start:
        if interval.lower_value < interval.upper_value - (R - m):
            invalid_read_start.add(
                pyinter.open(interval.lower_value,
                             interval.upper_value - (R - m)))
    # print(invalid_read_start_d)
    # print(invalid_read_start_t)
    # invalid_d_only = invalid_read_start_d.difference(invalid_read_start_t)
    # invalid_t_only = invalid_read_start_t.difference(invalid_read_start_d)
    # invalid_both = invalid_read_start_d.intersection(invalid_read_start_t)
    overlapping_t, overlapping_d = [], []
    for interval in invalid_read_start:
        if any([d.overlaps(interval) for d in invalid_read_start_d]):
            overlapping_d.append(True)
        else:
            overlapping_d.append(False)
        if any([t.overlaps(interval) for t in invalid_read_start_t]):
            overlapping_t.append(True)
        else:
            overlapping_t.append(False)
    return invalid_read_start, overlapping_d, overlapping_t
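The invalid_window_start indirection above leans on pyinter's merge-on-add behaviour: overlapping intervals collapse into one when added to an IntervalSet, so the windows are merged at full width before being trimmed by R - m. A small illustration with arbitrary values:

import pyinter

s = pyinter.IntervalSet()
s.add(pyinter.open(0, 10))
s.add(pyinter.open(5, 20))  # overlaps the first interval
# the set now holds a single merged interval, not two
assert s == pyinter.IntervalSet([pyinter.open(0, 20)])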
Example #5
def get_gap_overlap_positions(path, blocks, read_len, min_mappable=20):
    blocks_gaps = genome_blocks_gaps(blocks, path)
    m = min_mappable

    gap_ref = pyinter.IntervalSet()
    ref = pyinter.IntervalSet()
    pos = 0
    for b in blocks_gaps:
        if len(b) == 0:
            continue
        if not b.is_insertion():
            gap_ref.add(pyinter.closedopen(pos, pos + len(b)))
            if not b.is_gap:
                ref.add(pyinter.closedopen(pos, pos + len(b)))
        pos += len(b)
    # print('gap_ref: {0}\nref: {1}\n'.format(gap_ref, ref))

    A1 = pyinter.IntervalSet()  # i: [i, i+m) contained in gap_ref
    A2 = pyinter.IntervalSet()  # i: [i, i+m) overlaps ref
    for iv in gap_ref:
        if iv.lower_value <= iv.upper_value - m:
            A1.add(pyinter.closed(iv.lower_value, iv.upper_value - m))
    for iv in ref:
        # print(iv)
        A2.add(pyinter.closed(iv.lower_value - m + 1, iv.upper_value - 1))
        # print(A2)

    A3 = A1.intersection(A2)

    A4 = pyinter.IntervalSet()
    A5 = pyinter.IntervalSet()
    for iv in A1:
        A4.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))
    for iv in A3:
        A5.add(pyinter.closed(iv.lower_value - read_len + m, iv.upper_value))

    result = A4.difference(A5)

    # print('A1: {0}\nA2: {1}\nA3: {2}\nA4: {3}\nA5: {4}\n'.format(A1, A2, A3, A4, A5))
    # print('result: {0}'.format(result))
    # print('')

    # remove any empty intervals
    out = pyinter.IntervalSet()
    for iv in result:
        a = iv.lower_value - 1 if iv.lower_value in iv else iv.lower_value
        b = iv.upper_value + 1 if iv.upper_value in iv else iv.upper_value
        # if iv.lower_value in iv or iv.upper_value in iv: # not open
        #     print('A1: {0}\nA2: {1}\nA3: {2}\nA4: {3}\nA5: {4}\n'.format(A1, A2, A3, A4, A5))
        #     print('result: {0}'.format(result))
        #     print(iv)
        #     raise Warning('non-open interval in get_gap_positions')
        if a < b - 1:
            out.add(pyinter.open(a, b))
    return out
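The endpoint juggling at the end rewrites any non-open ends as an equivalent open interval over integer positions, widening a closed endpoint by one so the same integers stay covered. The membership semantics it relies on (values invented):

import pyinter

assert 3 in pyinter.closed(3, 7) and 7 in pyinter.closed(3, 7)
# over the integers, the open interval (2, 8) covers exactly 3..7
assert 3 in pyinter.open(2, 8) and 8 not in pyinter.open(2, 8)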
Example #6
def compute_null_dist(opts, discordant_pairs, dtype, insert_mu, insert_sigma,
                      gap_file, lib_idx, lr_cond):
    nreps = opts['pecluster_null_reps']
    chrom_name, start, end = (opts['chromosome'], opts['region_start'],
                              opts['region_end'])
    gaps_inter = load_genome_gaps(gap_file, chrom_name)
    chrom_inter = pyinter.IntervalSet()
    chrom_inter.add(pyinter.closedopen(start, end))
    non_gaps_inter = chrom_inter.difference(gaps_inter)
    non_gaps = [(i.lower_value, i.upper_value) for i in non_gaps_inter]
    total_len = sum([i[1] - i[0] for i in non_gaps])

    # For deletion null clusters, don't use pairs that are obviously too large.
    # (for normal data the discordant read cutoff for deletion supports
    #  is like mu + 3 sigma ~ mu + .3mu, and we're excluding stuff bigger than 3mu)
    if dtype == 'Del':
        max_null_insert = insert_mu * opts['insert_max_mu_multiple']
    else:
        max_null_insert = np.inf

    null_clusters = []
    lr_null_clusters = np.array([], float)
    for _ in range(nreps):
        shuffled = shuffle_discordant_pairs(discordant_pairs,
                                            total_len,
                                            max_insert_size=max_null_insert)
        clusters_tmp, _ = cluster_pairs(opts, shuffled, dtype, lib_idx,
                                        insert_mu, insert_sigma)
        null_clusters.extend(clusters_tmp)
        lr_tmp = np.fromiter((lr_fun[dtype](c, insert_mu, insert_sigma,
                                            opts['insert_cutoff'], lr_cond)
                              for c in clusters_tmp), float)
        lr_null_clusters = np.append(lr_null_clusters, lr_tmp)
    if opts['verbosity'] > 1:
        print('[compute_null_dist] {0}'.format(dtype))
        print('shuffled lr:')
        print(lr_null_clusters)
        print('')

    outname = ('{0}_{1}_null_cluster_{2}reps.txt'.format(
        opts['library_names'][lib_idx], dtype, nreps))
    fname = os.path.join(opts['outdir'], 'logging', outname)
    write_clustering_results(fname,
                             list(zip(lr_null_clusters, null_clusters)),
                             first_reject=0)

    # print('there were {0} {1} clusters after shuffling'.format(len(clusters),
    #                                                            dtype))

    lr_null_clusters.sort()
    return lr_null_clusters
Example #7
def test_get_gap_overlap_positions():
    rlen = 50
    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 100, 200),
        GenomeInterval(1, 249, 300),
        GenomeInterval(1, 350, 400),
        GenomeInterval(1, 500, 600)
    ]

    paths = ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 7, 6, 8, 9])
    truth = ([(299, 301), (399, 451)], [(299, 326), (424, 451)])

    for i in range(len(truth)):
        out = get_gap_overlap_positions(paths[i], blocks, rlen)
        inter = pyinter.IntervalSet()
        for interval in truth[i]:
            inter.add(pyinter.open(interval[0], interval[1]))
        print('truth: {0}\nresult: {1}\n'.format(inter, out))
        assert out == inter

    blocks = [
        GenomeInterval(1, 0, 100),
        GenomeInterval(1, 200, 300),
        GenomeInterval(0, 350, 400),
        GenomeInterval(1, 0, 50, True),
        GenomeInterval(1, 0, 50, True)
    ]

    path = [0, 1, 6, 7, 2, 3, 8, 9, 4, 5]
    truth = [(99, 131), (169, 201), (349, 356), (394, 401)]
    out = get_gap_overlap_positions(path, blocks, rlen)
    inter = pyinter.IntervalSet()
    for interval in truth:
        inter.add(pyinter.open(interval[0], interval[1]))
    print('truth: {0}\nresult: {1}\n'.format(inter, out))
    assert out == inter
Example #8
def test_persons_availability(self):
    avail = self.p.get_availability(
        self.range_start, self.range_finish)  # type: inter.IntervalSet
    expected = inter.IntervalSet([
        inter.closed(
            1491354000,
            1491368400),  # Wed, 05 Apr 2017 01:00:00 to 05:00:00 GMT
        # inter.closed(1491958800, 1491973200),  # Wed, 12 Apr 2017 01:00:00 to 05:00:00 GMT Second Tuesday!
        inter.closed(
            1492563600,
            1492578000),  # Wed, 19 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1493168400,
            1493182800),  # Wed, 26 Apr 2017 01:00:00 to 05:00:00 GMT
    ])
    self.assertEqual(avail, expected)
Example #9
def test_interval_set(self):
    iset = self.tp.as_interval_set(self.range_start, self.range_finish)
    expected = inter.IntervalSet([
        inter.closed(
            1491354000,
            1491368400),  # Wed, 05 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1491958800,
            1491973200),  # Wed, 12 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1492563600,
            1492578000),  # Wed, 19 Apr 2017 01:00:00 to 05:00:00 GMT
        inter.closed(
            1493168400,
            1493182800),  # Wed, 26 Apr 2017 01:00:00 to 05:00:00 GMT
    ])
    self.assertEqual(iset, expected)
Example #10
def constraints_unknown_sigma(support_directions,
                              RHS_offsets,
                              LHS_offsets,
                              observed_data,
                              direction_of_interest,
                              RSS,
                              RSS_df,
                              value_under_null=0.,
                              tol=1.e-4,
                              DEBUG=False):
    r"""
    Given a quasi-affine constraint $\{z:Az+u \leq \hat{\sigma}b\}$ 
    (elementwise)
    specified with $A$ as `support_directions` and $b$ as
    `support_offset`, a new direction of interest $\eta$, and
    an `observed_data` is Gaussian vector $Z \sim N(\mu,\sigma^2 I)$ 
    with $\sigma$ unknown, this
    function returns $\eta^TZ$ as well as a set
    bounding this value. The value of $\hat{\sigma}$ is taken to be
    sqrt(RSS/RSS_df)

    The interval constructed is such that the endpoints are 
    independent of $\eta^TZ$, hence the 
    selective $T$ distribution of
    of `sample carving`_
    can be used to form an exact pivot.

    To construct the interval, we are in effect conditioning
    on all randomness perpendicular to the direction of interest,
    i.e. $P_{\eta}^{\perp}X$ where $X$ is the Gaussian data vector.

    Notes
    -----

    Covariance is assumed to be an unknown multiple of the identity.

    Parameters
    ----------

    support_directions : np.float
         Matrix specifying constraint, $A$.

    RHS_offsets : np.float
         Offset in RHS of constraint, $b$.

    LHS_offsets : np.float
         Offset in LHS of constraint, $u$.

    observed_data : np.float
         Observations.

    direction_of_interest : np.float
         Direction in which we're interested for the
         contrast.

    RSS : float
        Residual sum of squares.

    RSS_df : int
        Degrees of freedom of RSS.

    tol : float
         Relative tolerance parameter for deciding 
         sign of $Az-b$.

    Returns
    -------

    truncation_set : pyinter.IntervalSet
        Intervals bounding the observed statistic.

    Tobs : float
        Observed value of the studentized statistic.

    """

    # shorthand
    A, b, L, X, w, theta = (support_directions, RHS_offsets, LHS_offsets,
                            observed_data, direction_of_interest,
                            value_under_null)

    # make direction of interest a unit vector

    normw = np.linalg.norm(w)
    w = w / normw
    theta = theta / normw

    sigma_hat = np.sqrt(RSS / RSS_df)

    # compute the sufficient statistics

    U = (w * X).sum() - theta
    V = X - (X * w).sum() * w
    W = sigma_hat**2 * RSS_df + U**2
    Tobs = U / np.sqrt((W - U**2) / RSS_df)
    sqrtW = np.sqrt(W)
    alpha = np.dot(A, w)

    gamma = theta * alpha + np.dot(A, V) + L

    Anorm = np.fabs(A).max()

    intervals = []
    for _a, _b, _c in zip(alpha, b, gamma):
        _a = _a * sqrtW
        _b = _b * sqrtW
        cur_intervals = sqrt_inequality_solver(_a, _c, _b, RSS_df)
        intervals.append(
            pyinter.IntervalSet(
                [pyinter.closed(*i) for i in cur_intervals if i]))

    truncation_set = intervals[0]
    for interv in intervals[1:]:
        truncation_set = truncation_set.intersection(interv)
    if not truncation_set:
        raise ValueError("empty truncation intervals")
    return truncation_set, Tobs
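The truncation set is assembled by repeated IntervalSet.intersection. A toy version of that reduction with invented intervals:

import pyinter

sets = [pyinter.IntervalSet([pyinter.closed(0, 10)]),
        pyinter.IntervalSet([pyinter.closed(5, 20)]),
        pyinter.IntervalSet([pyinter.closed(7, 8), pyinter.closed(30, 40)])]
truncation = sets[0]
for s in sets[1:]:
    truncation = truncation.intersection(s)
assert truncation == pyinter.IntervalSet([pyinter.closed(7, 8)])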
Example #11
def svelter_convert(svelterfile, outdir, reffile, filter_gaps=False, refgapfile=None,
                    flank_size=1000, verbosity=0):
    os.system('mkdir -p %s' % outdir)
    # collect all bps
    # all_bp = []
    # with open(svelterfile, 'r') as svelter:
    #     for line in svelter:
    #         if is_svelter_header(line):
    #             continue
    #         bp_str = line.split('\t')[3].split(':')[1:]
    #         all_bp.extend(int(x) for x in bp_str)
    # all_bp.sort()

    log = open(os.path.join(outdir, 'convert_{0}.log'.format(svelterfile)), 'w')
    data = []

    # it seems some sv can be repeated in svelter output with different scores
    seen_svstring = set()
    seen_id = {}
    skipped_seen = 0
    skipped_refgap = 0

    with open(svelterfile, 'r') as svelter:
        toks_list = [line.rstrip().split('\t') for line in svelter]

    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom) for chrom in chroms}
    else:
        chrom_gaps = None

    for toks in toks_list:
        # check if header
        if toks[0] == 'chr' and toks[1] == 'start':
            continue
        # check if passing score
        if float(toks[6]) == 0:
            continue
        # check if sv is duplicate
        svstring = ' '.join(toks[:6])
        if svstring in seen_svstring:
            skipped_seen += 1
            continue
        else:
            seen_svstring.add(svstring)
        # adjust id if we've seen it before
        id = toks[3]
        num_id_seen = seen_id.get(id, 0)
        seen_id[id] = num_id_seen + 1
        if num_id_seen > 0:
            print('saw {0} again'.format(id))
            id_extra = ';' + str(num_id_seen + 1)
        else:
            id_extra = ''
        chrom = toks[0]
        bp_str = toks[3].split(':')[1:]
        bp = [int(x) for x in bp_str]

        if filter_gaps:
            sv_interval = pyinter.closedopen(bp[0], bp[-1])
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue

        breakpoints = {(x, x): Breakpoint((x, x)) for x in bp}
        # il = bisect_left(all_bp, bp[0])
        # if il > 0:
        #     slop_left = min(all_bp[il] - all_bp[il-1], flank_size)
        # else:
        #     slop_left = flank_size
        # ir = bisect_right(all_bp, bp[-1])
        # if ir < len(all_bp):
        #     slop_right = min(all_bp[ir] - all_bp[ir-1], flank_size)
        # else:
        #     slop_right = flank_size
        slop_left, slop_right = flank_size, flank_size
        start = bp[0] - slop_left
        end = bp[-1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(), chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout
        svelter_strings = toks[5].split('/')
        paths = [svelter_string_to_path(x, len(blocks)) for x in svelter_strings]
        score = float(toks[6])

        this_data = (paths, blocks, left_bp, right_bp, score, 'PASS',
                     id_extra, None, None)  # no extra INFO/FORMAT tags as in the VCF case
        data.append(this_data)
    log.write('skipped_seen\t{0}\n'.format(skipped_seen))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))

    do_sv_processing(data, outdir, reffile, log, verbosity)

    log.close()
Example #12
def generic_vcf_convert(vcffile, outdir, reffile, filter_gaps=False, refgapfile=None,
                        caller=None, flank_size=1000, verbosity=0):
    os.system('mkdir -p %s' % outdir)

    log = open(os.path.join(outdir, 'convert_{0}.log'.format(vcffile)), 'w')
    data = []
    svtype_skipped = {}
    seen_coords_count = {}
    skipped_refgap = 0
    write_extra = False         # need to write FORMAT or INFO to file?

    with open(vcffile, 'r') as vcf:
        toks_list = [line.rstrip().split('\t') for line in vcf if line[0] != '#']

    if filter_gaps:
        chroms = set(toks[0] for toks in toks_list)
        chrom_gaps = {chrom: load_genome_gaps(refgapfile, chrom) for chrom in chroms}
    else:
        chrom_gaps = None

    for toks in toks_list:
        # NOTE not parsing qual; do filtering beforehand for DELLY
        chrom, pos, id, ref, alt, qual, filterstring, info, format, sample1 = toks

        # VCF is 1-indexed, but specifies pos/end positions
        # which are to the left of breakpoints, so no adjustment
        pos = int(pos)

        tags = info.split(';')
        if 'PRECISE' in tags:
            filterstring += ':PRECISE'
        elif 'IMPRECISE' in tags:
            filterstring += ':IMPRECISE'
        elif caller == 'lumpy':  # only includes tags for imprecise events
            filterstring += ':PRECISE'
        tags = [t for t in tags if '=' in t]
        tagd = {t.split('=')[0]: t.split('=')[1] for t in tags}
        end = int(tagd.get('END', -99999))
        svtype = tagd['SVTYPE']
        if caller == 'pindel' and svtype == 'INS':
            inslen = int(tagd['SVLEN'])
        else:
            inslen = int(tagd.get('INSLEN', 0))

        if caller == 'pindel':
            homlen = int(tagd['HOMLEN'])
            if pos + homlen > end or svtype == 'INS':
                print('pos + homlen > end: positions {0}'.format((pos, end)))
                cipos = (0, 0)
                ciend = (0, 0)
            else:
                cipos = (0, homlen)
                ciend = (0, homlen)
        else:
            if 'CIPOS95' in tagd:   # LUMPY
                tmp = tagd['CIPOS95'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            elif 'CIPOS' in tagd:
                tmp = tagd['CIPOS'].split(',')
                cipos = (int(tmp[0]), int(tmp[1]))
            else:
                cipos = (0, 0)
            if 'CIEND95' in tagd:   # LUMPY
                tmp = tagd['CIEND95'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            elif 'CIEND' in tagd:
                tmp = tagd['CIEND'].split(',')
                ciend = (int(tmp[0]), int(tmp[1]))
            else:
                ciend = (0, 0)
        split_support = int(tagd.get('SR', 0))
        pe_support = int(tagd.get('PE', 0))
        # lumpy STRANDS only relevant for inversions
        if caller == 'lumpy' and svtype == 'INV':
            tmp = tagd['STRANDS'].split(',')
            tmpd = {a: b for (a, b) in (p.split(':') for p in tmp)}
            tagd['INV_PLUS'] = tmpd['++']
            tagd['INV_MINUS'] = tmpd['--']
        tagd_used = ('SR', 'PE', 'SVTYPE', 'SVMETHOD', 'END', 'STRANDS',
                     'SVLEN', 'HOMSEQ', 'CONSENSUS', 'CHR2')
        tagd_extra = {k: v for (k, v) in tagd.items() if k not in tagd_used}

        tags2 = {k: v for (k, v) in zip(format.split(':'), sample1.split(':'))}
        if 'AD' in tags2:       # pindel
            split_support = int(tags2['AD'].split(',')[1])

        gt = tags2['GT']

        if gt == './.' or gt == '.|.':
            is_het = False
            filterstring += ':NOGT'
        elif gt in ('0/0', '0|0'):
            is_het = False
            filterstring += ':ZEROGT'
        elif gt in ('0/1', '1/0', '0|1', '1|0'):
            is_het = True
        else:
            assert(gt in ('1/1', '1|1'))
            is_het = False

        tags2_used = ('AD', 'SR', 'PE', 'SU')
        tags2_extra = {k: v for (k, v) in tags2.items() if k not in tags2_used}
        if len(tagd_extra) + len(tags2_extra) > 0:
            write_extra = True

        # cases
        if svtype == 'DEL':
            path = (0, 1, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Del'
        elif svtype == 'INV':
            path = (0, 1, 3, 2, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'InvL'
        elif svtype == 'DUP' or svtype == 'DUP:TANDEM':
            path = (0, 1, 2, 3, 2, 3, 4, 5)
            refpath = (0, 1, 2, 3, 4, 5)
            supptype = 'Dup'
        elif svtype == 'INS':
            # INSERTIONS parse inslen, add insertion block to blocks
            path = (0, 1, 4, 5, 2, 3)
            refpath = (0, 1, 2, 3)
            supptype = 'Ins'
        else:
            # skipping delly TRA
            # skipping BND events as they may be ambiguous, in terms of the path
            svtype_skipped[svtype] = svtype_skipped.get(svtype, 0) + 1
            continue

        # check ref gap overlap
        if filter_gaps and end > pos:
            sv_interval = pyinter.closedopen(pos, end)
            sv_gap_intersection = chrom_gaps[chrom].intersection([sv_interval])
            if len(sv_gap_intersection) > 0:
                skipped_refgap += 1
                continue

        # create breakpoints and blocks, keeping in mind uncertainty and possible insertion
        if caller == 'lumpy' and svtype != 'INS':
            # lumpy intervals are not symmetric. POS and END are each the "best guess" for
            # the breakpoints
            bp = [(pos, pos), (end, end)]
        elif svtype != 'INS':
            # if (cipos[1] != -cipos[0] or ciend[1] != -ciend[0]) and \
            #    (pos + cipos[1] < end + ciend[0]):
            if (pos + cipos[1] < end + ciend[0]):
                bp = [(pos + cipos[0], pos + cipos[1]),
                      (end + ciend[0], end + ciend[1])]
            else:
                bp = [(pos, pos), (end, end)]
                filterstring += ':BPOVERLAP'
        else:
            # if cipos[1] != -cipos[0]:
            if cipos[1] > cipos[0]:
                bp = [(pos + cipos[0], pos + cipos[1])]
            else:
                bp = [(pos, pos)]
        pe = [(x, supptype) for x in range(pe_support)]
        # TODO SupportingSplit
        splits = []
        for i in range(split_support):
            aln_tmp = pysam.AlignedSegment()
            aln_tmp.qname = str(i)  # pysam query names must be strings
            aln_tmp.is_read1 = True
            split_type = supptype + '+'
            splits.append(SupportingSplit(aln_tmp, None, None, None, None, split_type))
        breakpoints = {x: Breakpoint(x, pe=pe, splits=splits) for x in bp}
        slop_left, slop_right = flank_size, flank_size
        start = bp[0][0] - slop_left
        end = bp[-1][1] + slop_right
        cbout = create_blocks(breakpoints, pyinter.IntervalSet(), chrom, start, end, verbosity)
        blocks, _, left_bp, right_bp = cbout

        if svtype == 'INS':
            blocks.append(GenomeInterval(chrom, 0, inslen, is_de_novo=True))

        paths = [path, refpath] if is_het else [path, path]
        score = 0

        coords = (start, end)
        scc = seen_coords_count.get(coords, 0)
        if scc > 0:
            id_extra = chr(ord('a') + scc)
        else:
            id_extra = ''
        seen_coords_count[coords] = scc + 1

        this_data = (paths, blocks, left_bp, right_bp, score, filterstring,
                     id_extra, tagd_extra, tags2_extra)
        data.append(this_data)
    for svtype, count in svtype_skipped.items():
        log.write('skipped_svtype\t{0}\t{1}\n'.format(svtype, count))
    log.write('skipped_refgap\t{0}\n'.format(skipped_refgap))
    do_sv_processing(data, outdir, reffile, log, verbosity, write_extra)

    log.close()
Example #13
def create_blocks(breakpoints, gaps, chrom_name, start, end, verbosity):
    # create list of blocks between breakpoints
    # while adjusting for genome gaps
    gap_indices = set()
    gap_indices.add(0)
    blocks = []
    left_breakpoints = []
    right_breakpoints = []

    breakpoints[(end, end)] = Breakpoint((end, end))

    bploc = list(breakpoints.keys())
    bploc.sort()

    last_end = start
    last_breakpoint = Breakpoint((start, start))

    for bpl in bploc:
        breakpoint = breakpoints[bpl]

        if bpl[0] <= start or bpl[1] > end:
            continue
        iset = pyinter.IntervalSet()
        blockinterval = pyinter.closedopen(last_end, bpl[0])

        iset.add(blockinterval)
        adjusted_blocks = iset.difference(gaps)
        adjusted_blocks = sorted(list(adjusted_blocks))

        if verbosity > 1:
            print('bploc {0}'.format(bpl))
            print('bp {0}'.format(breakpoint))
            print('blockinterval {0}'.format(blockinterval))
            print('adjusted {0}'.format(adjusted_blocks))

        for ab in adjusted_blocks:
            if ab.lower_value == ab.upper_value:  # block completely within a gap
                gap_indices.add(len(blocks))
                break
            else:
                if ab.lower_value != blockinterval.lower_value:
                    gap_indices.add(len(blocks))
                    left_breakpoint = Breakpoint(
                        (ab.lower_value, ab.lower_value))
                else:
                    left_breakpoint = last_breakpoint
                if ab.upper_value != blockinterval.upper_value:
                    gap_indices.add(len(blocks) + 1)
                    right_breakpoint = Breakpoint(
                        (ab.upper_value, ab.upper_value))
                else:
                    right_breakpoint = breakpoint
                if verbosity > 1:
                    print('adding {0}'.format(
                        GenomeInterval(chrom_name, ab.lower_value,
                                       ab.upper_value)))
                    print('\tleft {0}'.format(left_breakpoint))
                    print('\tright {0}'.format(right_breakpoint))
                blocks.append(
                    GenomeInterval(chrom_name, ab.lower_value, ab.upper_value))
                left_breakpoints.append(left_breakpoint)
                right_breakpoints.append(right_breakpoint)
        last_end = bpl[1]
        last_breakpoint = breakpoints[bpl]
    gap_indices.add(len(blocks))
    gap_indices = sorted(list(gap_indices))
    if verbosity > 1:
        print('--creating blocks--')
        print(breakpoints)
        print(blocks)
        print(gap_indices)
        print(left_breakpoints)
        print(right_breakpoints)
    return blocks, gap_indices, left_breakpoints, right_breakpoints
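create_blocks relies on IntervalSet.difference to split each candidate block around assembly gaps, then assigns breakpoints per piece. A minimal sketch of just the splitting step (coordinates invented):

import pyinter

gaps = pyinter.IntervalSet([pyinter.closedopen(150, 250)])
block = pyinter.IntervalSet([pyinter.closedopen(100, 400)])
pieces = sorted(block.difference(gaps))
# the gap splits one block into two: [100, 150) and [250, 400)
assert [(p.lower_value, p.upper_value) for p in pieces] == [(100, 150), (250, 400)]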
Example #14
def get_time_intervals(time_points):
    inter_lst = []
    for start, end in pairwise(time_points):
        inter_lst.append(pyinter.closed(start, end))
    intervalSet = pyinter.IntervalSet(inter_lst)
    return intervalSet
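pairwise is not defined in this snippet; itertools.pairwise (Python 3.10+) or the classic itertools recipe fits. Note also that consecutive closed intervals share an endpoint, so pyinter merges them as they are added:

import pyinter

ivs = get_time_intervals([0, 10, 20, 35])
# closed(0, 10), closed(10, 20) and closed(20, 35) touch, so they merge
assert ivs == pyinter.IntervalSet([pyinter.closed(0, 35)])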
Example #15
    def test_add_variants_to_set_from_bed(self):

        common_entities = create_common_entities()
        project = common_entities['project']
        self.ref_genome_1 = common_entities['reference_genome']

        alignment_group = AlignmentGroup.objects.create(
            label='Alignment 1',
            reference_genome=self.ref_genome_1,
            aligner=AlignmentGroup.ALIGNER.BWA)

        (self.sample_1, created) = ExperimentSample.objects.get_or_create(
            project=project, label=SAMPLE_1_LABEL)

        sample_alignment = ExperimentSampleToAlignment.objects.create(
            alignment_group=alignment_group, experiment_sample=self.sample_1)

        # Create variants in the bed regions from best_test.bed
        for var_poor_map in range(20):
            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(101, 200),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        for var_no_cov in range(20):
            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(301, 400),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

            variant = Variant.objects.create(
                type=Variant.TYPE.TRANSITION,
                reference_genome=self.ref_genome_1,
                chromosome=Chromosome.objects.get(
                    reference_genome=self.ref_genome_1),
                position=random.randint(501, 600),
                ref_value='A')

            vccd = VariantCallerCommonData.objects.create(
                variant=variant,
                source_dataset_id=1,
                alignment_group=alignment_group,
                data={})

        new_bed_path = copy_dataset_to_entity_data_dir(
            entity=sample_alignment, original_source_location=TEST_BED)

        bed_dataset = add_dataset_to_entity(
            sample_alignment,
            dataset_label=Dataset.TYPE.BED_CALLABLE_LOCI,
            dataset_type=Dataset.TYPE.BED_CALLABLE_LOCI,
            filesystem_location=new_bed_path)

        vs_to_v_map = add_variants_to_set_from_bed(sample_alignment,
                                                   bed_dataset)

        variant_set_labels = set([vs.label for vs in vs_to_v_map.keys()])
        self.assertEqual(set(['POOR_MAPPING_QUALITY', 'NO_COVERAGE']),
                         variant_set_labels)

        for variant_set, variants in vs_to_v_map.items():
            for v in variants:
                # POOR MAPPING QUAL should be from 101 to 200
                if variant_set.label == 'POOR_MAPPING_QUALITY':
                    self.assertTrue(v.position in pyinter.closedopen(101, 200))
                # NO COVERAGE should be from 301 to 400, 501 to 600
                elif variant_set.label == 'NO_COVERAGE':
                    self.assertTrue(v.position in pyinter.IntervalSet([
                        pyinter.closedopen(301, 400),
                        pyinter.closedopen(501, 600)
                    ]))
                else:
                    raise AssertionError('bad variant set %s made.' %
                                         variant_set.label)
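Both Interval and IntervalSet support point-membership tests with `in`, which is what the assertions above lean on. A compact illustration with invented positions:

import pyinter

assert 150 in pyinter.closedopen(101, 200)
assert 200 not in pyinter.closedopen(101, 200)  # open upper end
regions = pyinter.IntervalSet([pyinter.closedopen(301, 400),
                               pyinter.closedopen(501, 600)])
assert 350 in regions and 450 not in regions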
Example #16
def len_without_gaps(chrom_name, start, end, gapsfile):
    gaps = load_genome_gaps(gapsfile, chrom_name)
    region = pyinter.IntervalSet()
    region.add(pyinter.closedopen(start, end))
    diff = region.difference(gaps)
    return sum(x.upper_value - x.lower_value for x in diff)