Beispiel #1
0
def Vertex(bbs, dirn, bbids=None, min_seg_len=1, parallel=0):
    """Build a ``_Vertex`` by merging the per-building-block results.

    ``vertex_single`` is submitted once per building block.  Blocks for
    which it returns ``None`` are dropped; the remaining per-block tuples
    are concatenated field by field and extended with the ``inout`` key
    table, the ``inbreaks`` array and the numeric direction pair.

    Args:
        bbs: Sequence of building blocks; only each block's ``_state`` is
            handed to ``vertex_single``.
        dirn: Indexable direction code whose first two items are each one
            of ``'N'``, ``'C'`` or ``'_'``.  Item 0 maps to ``din`` and
            item 1 to ``dout``.
        bbids: Ids paired one-to-one with ``bbs``.  Defaults to
            ``np.arange(len(bbs))``.
        min_seg_len: Forwarded unchanged to ``vertex_single``.
        parallel: When truthy the work is submitted to a
            ``cf.ProcessPoolExecutor``, otherwise to an
            ``InProcessExecutor``.

    Returns:
        The ``_Vertex`` built from the concatenated fields, ``inout``,
        ``inbreaks`` and ``np.array([din, dout], dtype='i4')``.

    Raises:
        KeyError: If a direction character is not ``'N'``, ``'C'`` or ``'_'``.
        ValueError: If no building block produced a result.
    """
    codes = {'N': 0, 'C': 1, '_': 2}
    din, dout = codes[dirn[0]], codes[dirn[1]]
    if bbids is None:
        bbids = np.arange(len(bbs))

    if parallel:
        executor_type = cf.ProcessPoolExecutor
    else:
        executor_type = InProcessExecutor
    with executor_type() as pool:
        pending = [
            pool.submit(vertex_single, bb._state, bid, din, dout, min_seg_len)
            for bb, bid in zip(bbs, bbids)
        ]
        # Collect in submission order so the concatenation below follows
        # the order of ``bbs``.
        per_block = [fut.result() for fut in pending]

    per_block = [res for res in per_block if res is not None]
    if not per_block:
        raise ValueError("no way to make vertex: '" + dirn + "'")

    # Transpose "list of per-block tuples" into "tuple of merged fields".
    fields = tuple(np.concatenate(column) for column in zip(*per_block))
    row_counts = {arr.shape[0] for arr in fields}
    assert len(row_counts) == 1

    # Field 5 is used as the building-block index, field 2 as the residue
    # array whose columns 0 and 1 feed the two key columns below.
    ibblock, ires = fields[5], fields[2]
    key_col0 = util.unique_key(ibblock, ires[:, 0])
    key_col1 = util.unique_key(ibblock, ires[:, 1])
    inout = np.stack([key_col0, key_col1], axis=-1).astype('i4')

    inbreaks = util.contig_idx_breaks(inout[:, 0])
    assert inbreaks.dtype == np.int32

    dirn_codes = np.array([din, dout], dtype='i4')
    return _Vertex(*fields, inout, inbreaks, dirn_codes)
Beispiel #2
0
def get_allowed_splices(
        u,
        ublks,
        v,
        vblks,
        splicedb=None,
        splice_max_rms=0.7,
        splice_ncontact_cut=30,
        splice_clash_d2=4.0**2,  # ca only
        splice_contact_d2=8.0**2,
        splice_rms_range=6,
        splice_clash_contact_range=60,
        splice_clash_contact_by_helix=True,
        splice_ncontact_no_helix_cut=0,
        splice_nhelix_contacted_cut=0,
        splice_max_chain_length=999999,
        skip_on_fail=True,
        parallel=False,
        verbosity=1,
        cache_sync=0.001,
        precache_splices=False,
        pbar=False,
        pbar_interval=10.0,
        **kw):
    """Find which exit residues of ``u`` may be spliced to which entries of ``v``.

    Every (block of ``u``, block of ``v``) pair is scored, either from
    ``splicedb`` when a non-empty cached entry exists or by submitting
    ``_jit_splice_metrics`` to an executor.  Newly computed scores are
    filtered with the ``splice_*`` thresholds, converted to residue pairs
    by ``_splice_respairs`` and, when a ``splicedb`` is given, stored in it.

    Args:
        u: Upstream vertex; its ``dirn``, ``inout``, ``ibblock`` and
            ``ires`` are read.
        ublks: Building blocks indexed by the block ids of ``u``.
        v: Downstream vertex; its ``dirn``, ``ibblock``, ``ires`` and
            ``inbreaks`` are read.
        vblks: Building blocks indexed by the block ids of ``v``.
        splicedb: Optional cache providing ``partial(params, key)``,
            ``add(params, key0, key1, result)`` and ``sync_to_disk()``.
        splice_max_rms: Largest accepted ``rms``; also passed to
            ``_jit_splice_metrics``.
        splice_ncontact_cut: Smallest accepted ``ncontact``.
        splice_clash_d2: Passed to ``_jit_splice_metrics`` (the original
            note says "ca only").
        splice_contact_d2: Passed to ``_jit_splice_metrics``.
        splice_rms_range: Passed to ``_jit_splice_metrics``.
        splice_clash_contact_range: Passed to ``_jit_splice_metrics``.
        splice_clash_contact_by_helix: Passed to ``_jit_splice_metrics``.
        splice_ncontact_no_helix_cut: Smallest accepted ``ncnh``.
        splice_nhelix_contacted_cut: Smallest accepted ``nhc``.
        splice_max_chain_length: Passed to ``_jit_splice_metrics``.
        skip_on_fail: Passed to ``_jit_splice_metrics``.
        parallel: Falsy runs in an ``InProcessExecutor``; otherwise the
            value is used as ``max_workers`` of a ``ProcessPoolExecutor``.
        verbosity: When positive, the cache read time is printed if it
            exceeds one second.
        cache_sync: Probability, per newly computed block pair, of calling
            ``splicedb.sync_to_disk()``; when positive a final sync is
            also performed.
        precache_splices: When true the progress bar is suppressed.
        pbar: Show a ``tqdm`` progress bar over the block pairs.
        pbar_interval: ``mininterval`` given to ``tqdm``.
        **kw: Accepted and ignored.

    Returns:
        ``(valid_splices, nout, nent)`` where ``nout`` is the number of
        exit residues taken from ``u``, ``nent`` the number of entry
        residues taken from ``v``, and ``valid_splices`` is a list of
        ``nout`` lists; ``valid_splices[i]`` holds the entry indices
        accepted for exit ``i`` (both indices count residues grouped by
        block, in the order blocks are first encountered).

    Raises:
        AssertionError: If ``u.dirn[1] + v.dirn[0] != 1``.
    """
    assert (u.dirn[1] + v.dirn[0]) == 1, 'get_allowed_splices dirn mismatch'

    # note: this is duplicated in edge_batch.py and they need to be the same
    params = (splice_max_rms, splice_ncontact_cut, splice_clash_d2,
              splice_contact_d2, splice_rms_range, splice_clash_contact_range,
              splice_clash_contact_by_helix, splice_ncontact_no_helix_cut,
              splice_nhelix_contacted_cut, splice_max_chain_length)

    outidx = _get_outidx(u.inout[:, 1])
    outblk = u.ibblock[outidx]
    outres = u.ires[outidx, 1]

    # One representative row per entry group of v (the start of each run).
    inblk = v.ibblock[v.inbreaks[:-1]]
    inres = v.ires[v.inbreaks[:-1], 0]
    # NOTE(review): the result is never used; the call is retained in case
    # contig_idx_breaks validates its input -- confirm before removing.
    inblk_breaks = contig_idx_breaks(inblk)

    # Group the exit residues of u by building block.
    outblk_res = defaultdict(list)
    for iblk, ires in zip(outblk, outres):
        outblk_res[iblk].append(ires)
    for iblk in outblk_res:
        outblk_res[iblk] = np.array(outblk_res[iblk], 'i4')

    # Group the entry residues of v by building block; each group must
    # already be sorted.
    inblk_res = defaultdict(list)
    for iblk, ires in zip(inblk, inres):
        inblk_res[iblk].append(ires)
    for iblk in inblk_res:
        inblk_res[iblk] = np.array(inblk_res[iblk], 'i4')
        assert np.all(sorted(inblk_res[iblk]) == inblk_res[iblk])

    nout = sum(len(a) for a in outblk_res.values())
    nent = sum(len(a) for a in inblk_res.values())
    valid_splices = [list() for i in range(nout)]

    swapped = False
    if u.dirn[1] == 0:  # swap so N-to-C!
        swapped = True
        u, ublks, v, vblks = v, vblks, u, ublks
        outblk_res, inblk_res = inblk_res, outblk_res
        outblk, inblk = inblk, outblk

    pairs_with_no_valid_splices = 0
    tcache = 0

    exe = InProcessExecutor()
    if parallel:
        exe = cf.ProcessPoolExecutor(max_workers=parallel)
    with exe as pool:
        futures = list()
        ofst0 = 0
        for iblk0, ires0 in outblk_res.items():
            blk0 = ublks[iblk0]
            key0 = blk0.filehash
            t = time()
            cache = splicedb.partial(params, key0) if splicedb else None
            tcache += time() - t
            ofst1 = 0
            for iblk1, ires1 in inblk_res.items():
                blk1 = vblks[iblk1]
                key1 = blk1.filehash
                if cache and key1 in cache and cache[key1]:
                    # Non-empty cached entry: wrap it so it can travel
                    # through the same completion loop as real futures.
                    splices = cache[key1]
                    future = NonFuture(splices, dummy=True)
                else:
                    future = pool.submit(
                        _jit_splice_metrics, blk0.chains, blk1.chains,
                        blk0.ncac, blk1.ncac, blk0.stubs, blk1.stubs,
                        blk0.connections, blk1.connections, blk0.ss, blk1.ss,
                        blk0.cb, blk1.cb, splice_clash_d2, splice_contact_d2,
                        splice_rms_range, splice_clash_contact_range,
                        splice_clash_contact_by_helix, splice_max_rms,
                        splice_max_chain_length, skip_on_fail)
                # Remember where this pair's residues sit in the flat
                # exit/entry numbering.
                fs = (iblk0, iblk1, ofst0, ofst1, ires0, ires1)
                future.stash = fs
                futures.append(future)
                ofst1 += len(ires1)
            ofst0 += len(ires0)

        if verbosity > 0 and tcache > 1.0:
            print('get_allowed_splices read caches time:', tcache)

        future_iter = cf.as_completed(futures)
        if pbar and not precache_splices:
            future_iter = tqdm(future_iter,
                               'checking splices',
                               mininterval=pbar_interval,
                               total=len(futures))
        for future in future_iter:
            iblk0, iblk1, ofst0, ofst1, ires0, ires1 = future.stash
            result = future.result()
            # Compare by value: ``is 5`` only worked through CPython's
            # small-int caching and triggers a SyntaxWarning on 3.8+.
            if len(result) == 5 and isinstance(result[0], np.ndarray):
                # is newly computed result, not from cache
                rms, nclash, ncontact, ncnh, nhc = result
                ok = ((nclash == 0) * (rms <= splice_max_rms) *
                      (ncontact >= splice_ncontact_cut) *
                      (ncnh >= splice_ncontact_no_helix_cut) *
                      (nhc >= splice_nhelix_contacted_cut))
                result = _splice_respairs(ok, ublks[iblk0], vblks[iblk1])
                if np.sum(ok) == 0:
                    print('N no clash', np.sum(nclash == 0))
                    print('N rms', np.sum(rms <= splice_max_rms))
                    print('N contact', np.sum(ncontact >= splice_ncontact_cut))

                if splicedb:
                    key0 = ublks[iblk0].filehash  # C-term side
                    key1 = vblks[iblk1].filehash  # N-term side
                    splicedb.add(params, key0, key1, result)
                    # Sync only occasionally so disk writes do not
                    # dominate the run.
                    if np.random.random() < cache_sync:
                        print('sync_to_disk splices data')
                        splicedb.sync_to_disk()

            if swapped:
                # Undo the N-to-C swap so the indices below refer to the
                # caller's u (exits) and v (entries) again.
                result = result[1], result[0]
                ires0, ires1 = ires1, ires0
                ofst0, ofst1 = ofst1, ofst0

            if len(result[0]) == 0:
                pairs_with_no_valid_splices += 1
                continue
            index_of_ires0 = _index_of_map(ires0, np.max(result[0]))
            index_of_ires1 = _index_of_map(ires1, np.max(result[1]))
            irs = index_of_ires0[result[0]]
            jrs = index_of_ires1[result[1]]
            # Negative lookups are discarded here.
            ok = (irs >= 0) * (jrs >= 0)
            irs = irs[ok] + ofst0
            jrs = jrs[ok] + ofst1
            for ir, jr in zip(irs, jrs):
                valid_splices[ir].append(jr)

    if cache_sync > 0 and splicedb:
        splicedb.sync_to_disk()

    if pairs_with_no_valid_splices:
        print('pairs with no valid splices: ', pairs_with_no_valid_splices,
              'of',
              len(outblk_res) * len(inblk_res))

    return valid_splices, nout, nent
Beispiel #3
0
def test_contig_idx_breaks():
    """Check ``util.contig_idx_breaks`` on two runs of four equal values.

    The expected breaks are the start of each run followed by the total
    length of the input.
    """
    run_ids = np.array([1, 1, 1, 1, 3, 3, 3, 3], dtype='i4')
    expected = [0, 4, 8]
    breaks = util.contig_idx_breaks(run_ids)
    assert np.all(breaks == expected)
Beispiel #4
0
def Vertex(bbs, dirn, bbids=None, min_seg_len=1, verbosity=0):
    """Build a ``_Vertex`` from the per-building-block results, serially.

    ``vertex_single`` is called once per building block in the current
    process.  Blocks for which it returns ``None`` are skipped; the
    remaining per-block tuples are concatenated field by field.

    Args:
        bbs: Sequence of building blocks; only each block's ``_state`` is
            handed to ``vertex_single``.
        dirn: Indexable direction code whose first two items are each one
            of ``"N"``, ``"C"`` or ``"_"``.  Item 0 maps to ``din`` and
            item 1 to ``dout``.
        bbids: Ids paired one-to-one with ``bbs``.  Defaults to
            ``np.arange(len(bbs))``.
        min_seg_len: Forwarded to ``vertex_single`` and to ``_Vertex``.
        verbosity: Forwarded to ``vertex_single`` as a keyword argument.

    Returns:
        The ``_Vertex`` built from the concatenated fields, the ``inout``
        key table, ``inbreaks``, the ``[din, dout]`` array and
        ``min_seg_len``.

    Raises:
        KeyError: If a direction character is not ``"N"``, ``"C"`` or ``"_"``.
        ValueError: If no building block produced a result.
    """
    code_of = {"N": 0, "C": 1, "_": 2}
    din, dout = code_of[dirn[0]], code_of[dirn[1]]
    if bbids is None:
        bbids = np.arange(len(bbs))

    per_block = []
    for bb, bid in zip(bbs, bbids):
        single = vertex_single(bb._state,
                               bid,
                               din,
                               dout,
                               min_seg_len,
                               verbosity=verbosity)
        if single is not None:
            per_block.append(single)

    if not per_block:
        raise ValueError("no way to make vertex: '" + dirn + "'")

    # Transpose "list of per-block tuples" into "tuple of merged fields".
    fields = tuple(np.concatenate(column) for column in zip(*per_block))
    row_counts = {arr.shape[0] for arr in fields}
    assert len(row_counts) == 1
    ibblock, ires = fields[5], fields[2]

    # Only column 0 of ires is checked for ordering; column 1 is not
    # guaranteed to be in order because some rows are pruned by validity
    # checks.
    assert _check_bbires_inorder(ibblock, ires[:, 0])

    key_col0 = util.unique_key_int32s(ibblock, ires[:, 0])
    key_col1 = util.unique_key_int32s(ibblock, ires[:, 1])
    inout = np.stack([key_col0, key_col1], axis=-1).astype("i4")

    inbreaks = util.contig_idx_breaks(inout[:, 0])
    assert inbreaks.dtype == np.int32
    assert np.all(inbreaks <= len(inout))

    dirn_codes = np.array([din, dout], dtype="i4")
    return _Vertex(*fields, inout, inbreaks, dirn_codes, min_seg_len)
Beispiel #5
0
def splice_metrics(u,
                   ublks,
                   v,
                   vblks,
                   clashd2=3.0**2,
                   contactd2=10.0**2,
                   rms_range=9,
                   clash_contact_range=9,
                   rms_cut=1.1,
                   skip_on_fail=True,
                   parallel=False,
                   progressbar=False):
    """Compute splice scores for every exit of ``u`` against every entry of ``v``.

    For each (block of ``u``, block of ``v``) pair ``_jit_splice_metrics``
    is submitted to an executor, and its ``rms``, ``nclash`` and
    ``ncontact`` results are written into the matching slice of the
    output arrays.

    Args:
        u: Upstream vertex; its ``dirn``, ``inout``, ``ibblock`` and
            ``ires`` are read.
        ublks: Building blocks indexed by the block ids of ``u``.
        v: Downstream vertex; its ``dirn``, ``ibblock``, ``ires`` and
            ``inbreaks`` are read.
        vblks: Building blocks indexed by the block ids of ``v``.
        clashd2: Passed to ``_jit_splice_metrics``.
        contactd2: Passed to ``_jit_splice_metrics``.
        rms_range: Passed to ``_jit_splice_metrics``.
        clash_contact_range: Passed to ``_jit_splice_metrics``.
        rms_cut: Passed to ``_jit_splice_metrics``.
        skip_on_fail: Passed to ``_jit_splice_metrics``.
        parallel: When truthy a ``cf.ProcessPoolExecutor`` is used,
            otherwise an ``InProcessExecutor``.
        progressbar: Wrap the completion loop in ``tqdm``.

    Returns:
        A ``_SCM_Scores`` whose ``nclash``, ``ncontact`` (int32) and
        ``rms`` (float32) arrays have one row per exit residue of ``u``
        and one column per entry residue of ``v``.  Cells are initialised
        to -1.

    Raises:
        AssertionError: If ``u.dirn[1] + v.dirn[0] != 1``.
    """
    assert (u.dirn[1] + v.dirn[0]) == 1

    # First row of u for each distinct exit key 0 .. max.
    outidx = [
        np.where(u.inout[:, 1] == i)[0][0]
        for i in range(np.max(u.inout[:, 1]) + 1)
    ]

    outblk = u.ibblock[outidx]
    outres = u.ires[outidx, 1]
    inblk = v.ibblock[v.inbreaks[:-1]]
    inres = v.ires[v.inbreaks[:-1], 0]
    # NOTE(review): the result is never used; the call is retained in case
    # contig_idx_breaks validates its input -- confirm before removing.
    inblk_breaks = contig_idx_breaks(inblk)

    # Group exit and entry residues by building block.
    outblk_res = defaultdict(list)
    for iblk, ires in zip(outblk, outres):
        outblk_res[iblk].append(ires)

    inblk_res = defaultdict(list)
    for iblk, ires in zip(inblk, inres):
        inblk_res[iblk].append(ires)

    for iblk in inblk_res:
        inblk_res[iblk] = np.array(inblk_res[iblk], 'i4')
    for iblk in outblk_res:
        outblk_res[iblk] = np.array(outblk_res[iblk], 'i4')

    # Record the swap in a flag.  Once the names are exchanged ``u`` refers
    # to the caller's ``v``, so re-testing ``u.dirn[1]`` later would consult
    # the wrong vertex and could skip the final transpose.
    swapped = u.dirn[1] == 0
    if swapped:
        u, ublks, v, vblks = v, vblks, u, ublks
        outblk_res, inblk_res = inblk_res, outblk_res
        outblk, inblk = inblk, outblk

    shape = (len(outblk), len(inblk))
    metrics = _SCM_Scores(nclash=np.full(shape, -1, dtype=np.int32),
                          ncontact=np.full(shape, -1, dtype=np.int32),
                          rms=np.full(shape, -1, dtype=np.float32))

    exe = cf.ProcessPoolExecutor if parallel else InProcessExecutor
    with exe() as pool:
        futures = list()
        offset0 = 0
        for iblk0, ires0 in outblk_res.items():
            blk0 = ublks[iblk0]
            offset1 = 0
            for iblk1, ires1 in inblk_res.items():
                blk1 = vblks[iblk1]
                future = pool.submit(_jit_splice_metrics, blk0.chains,
                                     blk1.chains, blk0.ncac, blk1.ncac,
                                     blk0.stubs, blk1.stubs, ires0, ires1,
                                     clashd2, contactd2, rms_range,
                                     clash_contact_range, rms_cut,
                                     skip_on_fail)
                # Remember which slice of the output this pair fills.
                future.stash = (iblk0, iblk1, offset0, offset1, len(ires0),
                                len(ires1))
                futures.append(future)
                offset1 += len(ires1)
            offset0 += len(ires0)

        completed = cf.as_completed(futures)
        if progressbar:
            completed = tqdm(completed, total=len(futures))
        for future in completed:
            iblk0, iblk1, offset0, offset1, nres0, nres1 = future.stash
            rms, nclash, ncontact = future.result()
            myslice = (slice(offset0,
                             offset0 + nres0), slice(offset1, offset1 + nres1))
            metrics.rms[myslice] = rms
            metrics.nclash[myslice] = nclash
            metrics.ncontact[myslice] = ncontact

    if swapped:
        # Undo the swap so rows follow the caller's u and columns its v.
        metrics = _SCM_Scores(metrics.nclash.T, metrics.ncontact.T,
                              metrics.rms.T)

    return metrics
Beispiel #6
0
      splice_rms_range,
      splice_clash_contact_range,
      splice_clash_contact_by_helix,
      splice_ncontact_no_helix_cut,
      splice_nhelix_contacted_cut,
      splice_max_chain_length,
      splice_min_dotz,
   )

   outidx = _get_outidx(u.inout[:, 1])
   outblk = u.ibblock[outidx]
   outres = u.ires[outidx, 1]

   inblk = v.ibblock[v.inbreaks[:-1]]
   inres = v.ires[v.inbreaks[:-1], 0]
   inblk_breaks = contig_idx_breaks(inblk)

   outblk_res = defaultdict(list)
   for iblk, ires in zip(outblk, outres):
      outblk_res[iblk].append(ires)
   for iblk in outblk_res.keys():
      outblk_res[iblk] = np.array(outblk_res[iblk], "i4")

   inblk_res = defaultdict(list)
   for iblk, ires in zip(inblk, inres):
      inblk_res[iblk].append(ires)
   for iblk in inblk_res.keys():
      inblk_res[iblk] = np.array(inblk_res[iblk], "i4")
      assert np.all(sorted(inblk_res[iblk]) == inblk_res[iblk])

   nout = sum(len(a) for a in outblk_res.values())