Ejemplo n.º 1
0
def _both_indexes(scdf, ocdf, how=False):

    assert (how in "containment first last outer right left".split() +
            [False, None]) or isinstance(how, int)
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        _self_indexes, _other_indexes = it.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = it.all_containments_both(
            starts, ends, indexes)
    elif how == "first":
        _self_indexes, _other_indexes = it.first_overlap_both(
            starts, ends, indexes)
    elif how == "last":
        _self_indexes, _other_indexes = it.last_overlap_both(
            starts, ends, indexes)
        six = scdf.index
        oix = ocdf.index
    elif how in ["outer", "left", "right"]:

        _self_indexes, _other_indexes = it.all_overlaps_both(
            starts, ends, indexes)

        missing_in_s = scdf.index.difference(_self_indexes)
        missing_in_o = ocdf.index.difference(_other_indexes)

        filler_s = np.ones(len(missing_in_o), dtype=int) * -1
        filler_o = np.ones(len(missing_in_s), dtype=int) * -1

        if how == "outer":
            _self_indexes = np.concatenate(
                [_self_indexes, missing_in_s, filler_s])
            _other_indexes = np.concatenate(
                [_other_indexes, filler_o, missing_in_o])
        elif how == "left":
            _self_indexes = np.concatenate([_self_indexes, missing_in_s])
            _other_indexes = np.concatenate([_other_indexes, filler_o])
        elif how == "right":
            _self_indexes = np.concatenate([_self_indexes, filler_s])
            _other_indexes = np.concatenate([_other_indexes, missing_in_o])

    return _self_indexes, _other_indexes
Ejemplo n.º 2
0
def _both_dfs(scdf, ocdf, how=False):

    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    ocdf = ocdf.reset_index(drop=True)
    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        _self_indexes, _other_indexes = it.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = it.all_containments_both(
            starts, ends, indexes)
    else:
        _self_indexes, _other_indexes = it.first_overlap_both(
            starts, ends, indexes)

    _self_indexes = _self_indexes
    _other_indexes = _other_indexes
    scdf = scdf.reindex(_self_indexes)
    ocdf = ocdf.reindex(_other_indexes)

    return scdf, ocdf
Ejemplo n.º 3
0
def _intersection(scdf, ocdf, kwargs):

    how = kwargs["how"]

    if ocdf.empty or scdf.empty:
        return None

    assert how in "containment first last".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    in_dtype = ocdf.Start.dtype

    oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how or how is None:
        _self_indexes, _other_indexes = oncls.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = oncls.all_containments_both(
            starts, ends, indexes)
    elif how == "first":
        _self_indexes, _other_indexes = oncls.first_overlap_both(
            starts, ends, indexes)
    elif how == "last":
        _self_indexes, _other_indexes = oncls.last_overlap_both(
            starts, ends, indexes)

    _self_indexes = _self_indexes
    _other_indexes = _other_indexes

    scdf, ocdf = scdf.reindex(_self_indexes), ocdf.reindex(_other_indexes)

    new_starts = pd.Series(
        np.where(scdf.Start.values > ocdf.Start.values, scdf.Start,
                 ocdf.Start),
        index=scdf.index,
        dtype=in_dtype)

    new_ends = pd.Series(
        np.where(scdf.End.values < ocdf.End.values, scdf.End, ocdf.End),
        index=scdf.index,
        dtype=in_dtype)

    pd.options.mode.chained_assignment = None  # default='warn'
    scdf.loc[:, "Start"] = new_starts
    scdf.loc[:, "End"] = new_ends
    pd.options.mode.chained_assignment = 'warn'

    if not scdf.empty:
        return scdf
    else:
        return None
Ejemplo n.º 4
0
    def test_ncls():
        """Check point and bulk overlap queries against the enclosing fixture.

        ``starts``, ``ends`` and ``ids`` are taken from the surrounding scope.
        """
        print(starts, ends, ids)

        tree = NCLS(starts, ends, ids)
        print(tree)
        print(tree.intervals())

        # A window that ends before the first expected interval finds nothing.
        no_hits = list(tree.find_overlap(0, 2))
        assert no_hits == []

        wide_hits = list(tree.find_overlap(0, 2_147_483_647))
        expected_wide = [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]
        assert wide_hits == expected_wide

        # Querying the tree with its own intervals.
        query_ids, hit_ids = tree.all_overlaps_both(starts, ends, ids)
        assert list(query_ids) == [0, 3]
        assert list(hit_ids) == [0, 3]
Ejemplo n.º 5
0
def _number_overlapping(scdf, ocdf, kwargs):

    keep_nonoverlapping = kwargs.get("keep_nonoverlapping", True)

    if scdf.empty:
        return None
    if ocdf.empty:
        if keep_nonoverlapping:
            df = scdf.copy()
            # print(df)
            df.insert(df.shape[1], "NumberOverlaps", 0)
            # print("df" * 100)
            # print(df)
            return df
        else:
            return None

    oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    _self_indexes, _other_indexes = oncls.all_overlaps_both(
        starts, ends, indexes)

    s = pd.Series(_self_indexes)
    counts_per_read = s.value_counts()[s.unique()].reset_index()
    counts_per_read.columns = ["Index", "Count"]

    df = scdf.copy()

    if keep_nonoverlapping:
        _missing_indexes = np.setdiff1d(scdf.index, _self_indexes)
        missing = pd.DataFrame(data={
            "Index": _missing_indexes,
            "Count": 0
        },
                               index=_missing_indexes)
        counts_per_read = pd.concat([counts_per_read, missing])
    else:
        df = df.loc[_self_indexes]

    counts_per_read = counts_per_read.set_index("Index")

    df.insert(df.shape[1], "NumberOverlaps", counts_per_read)

    return df
Ejemplo n.º 6
0
    def ncls_overlap(self, decimal_places = 5, start_idx = 0, end_idx = None):
        """Find overlaps between a slice of this table's intervals and all of them.

        Parameters
        ----------
        decimal_places : ``self.start``/``self.end`` are multiplied by
            ``10**decimal_places`` and cast to int before indexing, since
            NCLS works with integers.
        start_idx, end_idx : bounds of the slice of intervals used as queries;
            ``end_idx=None`` means "up to the end".

        Returns
        -------
        A two-column array made by column-stacking the pair of index arrays
        returned by ``NCLS.all_overlaps_both``.
        """
        # Default the upper bound to the full length. The original line used
        # ``==`` here, which compared and discarded the result.
        if end_idx is None:
            end_idx = len(self.start)

        # Convert the retention times to integers so they are compatible
        # with NCLS.
        scale = 10**decimal_places
        int_start = (self.start * scale).astype(int)
        int_end = (self.end * scale).astype(int)

        # Index every interval; query with the requested slice only.
        ncls = NCLS(int_start, int_end, self.idx)

        # column_stack puts the two index arrays side by side, one pair per
        # row.
        return np.column_stack(
            ncls.all_overlaps_both(int_start[start_idx:end_idx],
                                   int_end[start_idx:end_idx],
                                   self.idx[start_idx:end_idx]))
Ejemplo n.º 7
0
def test_ncls():
    """Check overlap queries on coordinates near the 64-bit signed limit.

    ``starts``, ``ends`` and ``ids`` are module-level fixtures.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    no_hits = list(tree.find_overlap(0, 2))
    assert no_hits == []

    near_limit = tree.find_overlap(9_223_372_036_854_775_805,
                                   9_223_372_036_854_775_806)
    print("aaa", list(near_limit))

    wide_hits = list(tree.find_overlap(0, 9_223_372_036_854_775_806))
    expected_wide = [
        (5, 6, 2147483647),
        (9223372036854775805, 9223372036854775807, 3),
    ]
    assert wide_hits == expected_wide

    # Querying the tree with its own intervals.
    query_ids, hit_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(query_ids) == [2147483647, 3]
    assert list(hit_ids) == [2147483647, 3]
Ejemplo n.º 8
0
    def overlap(self, im=True, decimal_places=5):
        """Return index pairs of entries that overlap each other.

        Candidates are found by retention-time overlap through an NCLS, then
        filtered by ``__vecMzImOverlap`` when ``im`` is true, or by
        ``mzTable.vec_idx_overlap`` otherwise.

        ``decimal_places`` sets how many decimals of the retention times
        survive the conversion to integers that NCLS needs.
        """
        ret_table = self.retentionTable

        # NCLS is given positional ids. When the table's own idx is not
        # already 0..n-1, positions are used instead and translated back
        # before returning.
        positions = np.arange(0, len(ret_table.start))
        use_hidden = not np.all(ret_table.idx == positions)
        hidden_idx = positions if use_hidden else ret_table.idx

        # Convert the retention times to integers so they are compatible
        # with NCLS.
        scale = 10**decimal_places
        ret_int_start = (ret_table.start * scale).astype(int)
        ret_int_end = (ret_table.end * scale).astype(int)

        ncls = NCLS(ret_int_start, ret_int_end, hidden_idx)

        # One row per overlapping pair.
        pairs = np.column_stack(
            ncls.all_overlaps_both(ret_int_start, ret_int_end, hidden_idx))

        # An entry always overlaps itself; those pairs are not of interest.
        pairs = pairs[pairs[:, 0] != pairs[:, 1]]

        if pairs.size == 0:
            # No retention-time overlap, so nothing further to check.
            rslt = np.array([])
        elif im:
            rslt = pairs[self.__vecMzImOverlap(pairs[:, 0], pairs[:, 1])]
        else:
            rslt = pairs[self.mzTable.vec_idx_overlap(pairs[:, 0],
                                                      pairs[:, 1])]

        if use_hidden:
            return Precursor.unindex(rslt, self.retentionTable.idx)
        return rslt
Ejemplo n.º 9
0
def test_ncls():
    """Check that the stack-based bulk query agrees with the default one.

    One million intervals ``[i, i + 100)`` are indexed and then used as the
    queries themselves, with their start positions as ids.
    """
    import numpy as np

    starts = pd.Series(range(0, int(1e6)))
    ends = starts + 100
    ids = starts

    print(starts, ends, ids)

    ncls = NCLS(starts.values, ends.values, ids.values)

    # The queries reuse ``ids``. The original referenced an ``indexes`` name
    # whose definition was commented out.
    print(starts, ends, ids)
    it = ncls.all_overlaps_both_stack(starts.values, ends.values,
                                      ids.values)
    it2 = ncls.all_overlaps_both(starts.values, ends.values, ids.values)

    print(it)
    print(it2)

    # Compare array by array. ``==`` on tuples of arrays raises because the
    # truth value of an element-wise comparison is ambiguous.
    assert len(it) == len(it2)
    for stacked, plain in zip(it, it2):
        assert np.array_equal(stacked, plain)
Ejemplo n.º 10
0
def _both_indexes(scdf, ocdf, how=False):

    assert (how in "containment first".split() + [False, None]) or isinstance(
        how, int)
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        _self_indexes, _other_indexes = it.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = it.all_containments_both(
            starts, ends, indexes)
    else:
        _self_indexes, _other_indexes = it.first_overlap_both(
            starts, ends, indexes)

    return _self_indexes, _other_indexes
Ejemplo n.º 11
0
    def overlap_against_window(self, exp, im=True, decimal_places=5):
        """Pair this object's entries with the ms2 windows of ``exp`` they overlap.

        Candidates are found by time overlap through an NCLS built on this
        object's retention table and queried with ``exp.ms2.timeTable``. They
        are then filtered by ``vec_mz_im_overlap`` when ``im`` is true, or by
        ``mzTable.vec_idx_overlap`` otherwise.

        ``decimal_places`` sets how many decimals of the time axis survive
        the conversion to integers that NCLS needs.
        """
        print("starting overlap against window")
        ms2 = exp.ms2
        scale = 10**decimal_places

        def to_int_axis(times):
            # NCLS works on integers: scale, then cast.
            return (times * scale).astype(int)

        ret_int_start = to_int_axis(self.retentionTable.start)
        ret_int_end = to_int_axis(self.retentionTable.end)

        ms2_time_int_start = to_int_axis(ms2.timeTable.start)
        ms2_time_int_end = to_int_axis(ms2.timeTable.end)

        print(ret_int_start)
        print(ret_int_end)

        ncls = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

        hits = ncls.all_overlaps_both(ms2_time_int_start, ms2_time_int_end,
                                      ms2.timeTable.idx)

        # One row per pair. The columns are swapped so the id from this
        # object's table comes first and the ms2 id second.
        ret_idx = np.fliplr(np.column_stack(hits))
        own_ids = ret_idx[:, 0]
        window_ids = ret_idx[:, 1]

        if im:
            keep = self.vec_mz_im_overlap(own_ids, window_ids, idx2_data=ms2)
        else:
            keep = self.mzTable.vec_idx_overlap(own_ids,
                                                window_ids,
                                                yData=ms2.mzTable)
        return ret_idx[keep]
Ejemplo n.º 12
0
def _both_dfs(scdf, ocdf, how=False, **kwargs):

    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        _self_indexes, _other_indexes = it.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = it.all_containments_both(
            starts, ends, indexes)
    else:
        _self_indexes, _other_indexes = it.first_overlap_both(
            starts, ends, indexes)

    _self_indexes = _self_indexes
    _other_indexes = _other_indexes

    return scdf.loc[_self_indexes], ocdf.loc[_other_indexes]
Ejemplo n.º 13
0
    def overlap_against_other(self, query, im=True, decimal_places=5):
        """Pair this object's entries with the entries of ``query`` they overlap.

        Candidates are found by retention-time overlap through an NCLS built
        on this object's retention table and queried with
        ``query.retentionTable``. They are then filtered by
        ``__vecMzImOverlap`` when ``im`` is true, or by
        ``mzTable.vec_idx_overlap`` otherwise.

        ``decimal_places`` sets how many decimals of the retention-time axis
        survive the conversion to integers that NCLS needs.
        """
        # Convert the retention times to integers so they are compatible
        # with NCLS.
        ret_int_start = (self.retentionTable.start *
                         (10**decimal_places)).astype(int)
        ret_int_end = (self.retentionTable.end *
                       (10**decimal_places)).astype(int)

        query_ret_int_start = (query.retentionTable.start *
                               (10**decimal_places)).astype(int)
        query_ret_int_end = (query.retentionTable.end *
                             (10**decimal_places)).astype(int)

        # Index this object's intervals.
        ncls = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)

        # One row per overlapping pair.
        ret_idx = np.column_stack(
            ncls.all_overlaps_both(query_ret_int_start, query_ret_int_end,
                                   query.retentionTable.idx))
        print(ret_idx)
        # Swap the two columns so the id from this object's table comes
        # first. ``np.fliplr`` is the NumPy function for this; ``np.flipr``
        # does not exist.
        ret_idx = np.fliplr(ret_idx)
        print(ret_idx)

        # Pairs with equal ids are NOT filtered out here: the two columns
        # index different tables.

        # With the im flag on, overlap is checked in both the mz and im
        # dimensions.
        if im:
            return ret_idx[self.__vecMzImOverlap(ret_idx[:, 0], ret_idx[:, 1])]
        else:
            return ret_idx[self.mzTable.vec_idx_overlap(
                ret_idx[:, 0], ret_idx[:, 1])]
Ejemplo n.º 14
0
values2 = np.ones(len(starts2))

# Test AIList: load the first interval set, build the index, then run an
# array query and a single-interval query.
i = AIList()

i.from_array(starts1, ends1, ids1, values1)
i.construct()

ai_res = i.intersect_from_array(starts2, ends2, ids2)

i.intersect(starts2[50], ends2[50])

# Test NCLS: same two queries against an NCLS built from the first set.
n = NCLS(starts1, ends1, ids1)

n_res = n.all_overlaps_both(starts2, ends2, ids2)

list(n.find_overlap(starts2[50], ends2[50]))

# Test pandas
p = pd.IntervalIndex.from_tuples(list(zip(starts1, ends1)))

p.overlaps(pd.Interval(starts2[50], ends2[50]))

# Test quicksect
b = quicksect.IntervalTree()
# Iterate the interval pairs directly. A loop counter named ``i`` would
# overwrite the AIList instance bound to ``i`` above.
for start1, end1 in zip(starts1, ends1):
    b.add(start1, end1)

b.search(starts2[50], ends2[50])
Ejemplo n.º 15
0
def projection(GTF_FILE, VCF_FILE, chrom_set=None):
    """
    Projects VCF file to transcript coordinates.

    Transcript rows are extracted from the GTF with a shell pipeline
    (``cat | awk | tr | awk``) into the intermediate file
    ``chrome_gene_tr.info`` in the current working directory; the file is
    removed again after it has been loaded. The process exits with status 1
    if that pipeline fails.

    Parameters
    ----------
    GTF_FILE : string containing GTF file name, assumed to be unzipped
    VCF_FILE : string containing VCF file name, can be a gzipped file
    chrom_set : optional collection of chromosomes to be sampled; when
        omitted or empty, every chromosome found in the GTF is used

    Returns
    -------
    vcf_txome : pandas dataframe with the following header
    [
        'chrom_x',
        'gene',
        'txome',
        'relative_pos',
        'transcript_length',
        'id',
        'ref',
        'alt',
        'qual',
        'filter',
        'info',
        'format',
        'samples'
    ]

    call it truncated VCF. ``None`` is returned when no VCF position falls
    inside any transcript.
    """

    from timeit import default_timer as timer

    start = timer()
    # records = read_vcf(VCF_FILE)
    records = tiny_vcf_reader(VCF_FILE)
    df = pd.DataFrame(records, columns=VCF_FIELDS)
    end = timer()

    print('parsed vcf in', (end - start), 'seconds')

    start = timer()
    # Create a minimal for GTF
    import os
    import shlex
    import sys
    import subprocess

    intermediate = 'chrome_gene_tr.info'
    cmd = [
        # The path is shell-quoted: the command runs through a shell, and an
        # unquoted name containing spaces or metacharacters would be split
        # or interpreted.
        'cat', shlex.quote(GTF_FILE),
        '| awk -F \"\t\" \'($3 == \"transcript\") {print $1,$4,$5,$9}\'',
        '| tr -d \";\\"\"',
        '| awk \'{print $1,$2,$3,$5,$9}\' > chrome_gene_tr.info'
    ]
    print('running ...')
    print(' '.join(cmd))
    retval = subprocess.call(' '.join(cmd), shell=True)
    if (retval):
        print('awk commant failed')
        sys.exit(1)

    try:
        gtf_df = pd.read_csv(intermediate,
                             sep=' ',
                             names=['chrom', 'start', 'end', 'gene', 'txome'],
                             header=None)
    finally:
        # Remove the scratch file even when parsing fails. No shell is
        # needed for a plain delete.
        try:
            os.remove(intermediate)
        except OSError:
            print('can\'t delete the intermediate file')

    if chrom_set is None or len(chrom_set) == 0:
        chrom_set = set(gtf_df.chrom.values)
    dataframes = []

    for chrom in chrom_set:
        # Filter each table once per chromosome and reuse the result.
        gtf_df_subset = gtf_df.loc[gtf_df.chrom == chrom]
        df_subset = df.loc[df.chrom == chrom]

        if (not len(df_subset)):
            continue

        start_val = gtf_df_subset.start.values
        end_val = gtf_df_subset.end.values
        indices = gtf_df_subset.index.values

        # Each VCF position is queried as the one-base interval
        # [pos, pos + 1).
        query_start_val = df_subset.pos.values
        query_end_val = query_start_val + 1
        query_indices = df_subset.index.values

        ncls = NCLS(np.array(start_val), np.array(end_val), indices)
        result = ncls.all_overlaps_both(query_start_val, query_end_val,
                                        query_indices)
        # First array holds the query (VCF) labels, the second the
        # transcript (GTF) labels.
        map_df = pd.DataFrame(list(zip(*result)),
                              columns=['vcf_index', 'gtf_index'])

        if (not len(map_df)):
            continue

        vcf_gtf_subset = pd.merge(
            gtf_df_subset.join(map_df.set_index('gtf_index')),
            df_subset,
            left_on='vcf_index',
            left_index=False,
            right_index=True,
        )

        dataframes.append(vcf_gtf_subset)
        print('chromosome ', chrom, ' done')

    if (not len(dataframes)):
        print('there is no intersection with the VCF...exiting')
        return None

    print('Merging the chromosomes...')

    vcf_gtf = pd.concat(dataframes)
    vcf_gtf['relative_pos'] = vcf_gtf['pos'] - vcf_gtf['start']
    vcf_gtf['transcript_length'] = vcf_gtf['end'] - vcf_gtf['start'] + 1

    end = timer()

    print('elapsed ', (end - start), " seconds")

    return (vcf_gtf)
Ejemplo n.º 16
0


from ncls import NCLS

import pickle
import pandas as pd
import numpy as np


# Intervals [3, 6), [5, 7), [8, 9) with ids 0, 1, 2, deliberately supplied in
# reverse order. The dtype is spelled ``np.int64`` because the ``np.int``
# alias (which was just the builtin ``int``) has been removed from NumPy.
starts = np.array(list(reversed([3, 5, 8])), dtype=np.int64)
ends = np.array(list(reversed([6, 7, 9])), dtype=np.int64)
indexes = np.array(list(reversed([0, 1, 2])), dtype=np.int64)

# Same intervals in ascending order, for comparison:
# starts = np.array([3, 5, 8], dtype=np.int64)
# ends = np.array([6, 7, 9], dtype=np.int64)
# indexes = np.array([0, 1, 2], dtype=np.int64)

ncls = NCLS(starts, ends, indexes)

# Two query intervals: one spanning everything, one narrow.
starts2 = np.array([1, 6])
ends2 = np.array([10, 7])
indexes2 = np.array([0, 1])

print(ncls.all_overlaps_both(starts2, ends2, indexes2))