Python NCLS Exemples, ncls.NCLS Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : coverage.py Projet : xma82/pyranges

def _coverage(scdf, ocdf, kwargs):
    """Add a ``FractionOverlaps`` column to a copy of ``scdf``.

    For every row of ``scdf`` the value is the length reported by
    ``NCLS.coverage`` against the intervals of ``ocdf``, divided by the row's
    own length (``End - Start``).

    Args:
        scdf: DataFrame with ``Start`` and ``End`` columns (the query side).
        ocdf: DataFrame with ``Start`` and ``End`` columns (the indexed side).
        kwargs: Accepted for signature compatibility; not read here.

    Returns:
        ``None`` when ``scdf`` is empty, otherwise a copy of ``scdf`` with
        ``FractionOverlaps`` appended as the last column (all ``0.0`` when
        ``ocdf`` is empty).
    """
    if scdf.empty:
        return None

    if ocdf.empty:
        # Nothing to overlap with: every row gets a fraction of zero.
        result = scdf.copy()
        result.insert(len(result.columns), "FractionOverlaps", 0.0)
        return result

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    query_starts = scdf.Start.values
    query_ends = scdf.End.values

    covered = tree.coverage(query_starts, query_ends, scdf.index.values)

    # nan_to_num replaces NaN/inf produced by the division (e.g. for
    # zero-length rows) with finite numbers.
    fractions = np.nan_to_num(
        (covered / (query_ends - query_starts)).astype("float64"))

    result = scdf.copy()
    result.insert(len(result.columns), "FractionOverlaps", fractions)
    return result

Exemple #2

0

Afficher le fichier

def _both_dfs(scdf, ocdf, how=False):
    """Return the rows of both frames that pair up in an interval query.

    ``ocdf`` is re-indexed to a fresh 0..n-1 index and loaded into an NCLS;
    the intervals of ``scdf`` are then queried against it.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        how: ``"containment"`` uses ``all_containments_both``, ``"first"``
            uses ``first_overlap_both``, a falsy value uses
            ``all_overlaps_both``.

    Returns:
        ``(scdf_rows, ocdf_rows)``: both frames re-indexed by the matching
        index arrays returned by the query, so row *i* of one pairs with
        row *i* of the other.
    """
    assert how in ["containment", "first", False, None]

    query_starts = scdf.Start.values
    query_ends = scdf.End.values
    query_index = scdf.index.values

    ocdf = ocdf.reset_index(drop=True)
    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        finder = tree.all_containments_both
    elif not how:
        finder = tree.all_overlaps_both
    else:
        finder = tree.first_overlap_both

    self_hits, other_hits = finder(query_starts, query_ends, query_index)

    return scdf.reindex(self_hits), ocdf.reindex(other_hits)

Exemple #3

0

Afficher le fichier

    def __init__(self, bedFile):
        """Build one NCLS interval index per chromosome from a BED file.

        The file is read line by line; the first three whitespace-separated
        columns are taken as chromosome, start and end. Consecutive lines that
        share a chromosome are collected and stored as a single NCLS object in
        the private per-chromosome dictionary, which gives a fast check of
        whether a read falls within a listed area. The hard work is done by
        NCLS (https://github.com/biocore-ntnu/ncls), which is also used by the
        pyranges module.

        Args:
            bedFile: Path of the BED file to index.

        Raises:
            ValueError: If a start or end column is not an integer.
            IndexError: If a non-blank line has fewer than three columns.
        """
        super(BedObject, self).__init__()

        self.__ncls = {}

        def store(chrom, starts, ends):
            # NCLS needs typed arrays.
            start_arr = array(starts, dtype=int64)
            # Add one to the end, to have inclusive ends.
            end_arr = array(ends, dtype=int64) + 1
            # The third argument is the ids; they can be anything numeric, so
            # the starts are reused.
            self.__ncls[chrom] = NCLS(start_arr, end_arr, start_arr)

        currChr = None
        starts = []
        ends = []

        with open(bedFile) as f:
            for line in f:
                lineArray = line.split()
                if not lineArray:
                    # Blank lines (e.g. a trailing newline) carry no interval.
                    continue

                chrom = lineArray[0]
                if currChr is not None and chrom != currChr:
                    # Chromosome changed: flush what was collected so far.
                    # NOTE: a chromosome that reappears later in the file
                    # replaces its earlier index, as before.
                    store(currChr, starts, ends)
                    starts = []
                    ends = []

                currChr = chrom
                # Always convert to int, including the first line of a new
                # chromosome (which previously was kept as a raw string).
                starts.append(int(lineArray[1]))
                ends.append(int(lineArray[2]))

        # Flush the last chromosome, if the file held any interval at all.
        if currChr is not None:
            store(currChr, starts, ends)

Exemple #4

0

Afficher le fichier

Fichier : multithreaded.py Projet : Runsheng/pyranges

def _first_df(scdf, ocdf, how=False, invert=False, n_jobs=1, **kwargs):
    """Select the rows of ``scdf`` that do (or do not) hit an interval of ``ocdf``.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        how: ``"containment"`` queries with ``has_containments``; every other
            accepted value (``False``, ``None``, ``"first"``) queries with
            ``has_overlaps``.
        invert: When true, return the rows whose index was *not* reported by
            the query.
        n_jobs: When greater than 1, ``scdf`` is deep-copied before the result
            is built.
        **kwargs: Ignored.

    Returns:
        ``scdf`` re-indexed by the reported indexes, or, with ``invert``, the
        rows of ``scdf`` whose index is not among them.
    """
    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    if n_jobs > 1:
        scdf = scdf.copy(deep=True)

    it = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        _indexes = it.has_containments(starts, ends, indexes)
    else:
        # Covers the falsy default and "first". "first" used to pass the
        # assertion and then fail with UnboundLocalError because no branch
        # assigned _indexes.
        _indexes = it.has_overlaps(starts, ends, indexes)

    if invert:
        return scdf.loc[~scdf.index.isin(_indexes)]
    return scdf.reindex(_indexes)

Exemple #5

0

Afficher le fichier

def _overlap(scdf, ocdf, kwargs):
    """Return the rows (or indexes) of ``scdf`` selected by a query against ``ocdf``.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        kwargs: Dict with required keys ``"invert"`` and ``"how"`` and the
            optional key ``"return_indexes"`` (default ``False``). ``"how"``
            is only read when neither frame is empty.

    Returns:
        ``None`` if either frame is empty. Otherwise the selected indexes when
        ``return_indexes`` is true, else ``scdf`` re-indexed by them. With
        ``invert`` the selection is the index difference
        ``scdf.index - hits``.
    """
    invert = kwargs["invert"]
    return_indexes = kwargs.get("return_indexes", False)

    if scdf.empty or ocdf.empty:
        return None

    how = kwargs["how"]
    assert how in ["containment", "first", False, None]

    query_starts = scdf.Start.values
    query_ends = scdf.End.values
    query_index = scdf.index.values

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        hits = tree.has_containment(query_starts, query_ends, query_index)
    elif how:
        hits = tree.has_overlaps(query_starts, query_ends, query_index)
    else:
        hits = tree.all_overlaps_self(query_starts, query_ends, query_index)

    if invert:
        hits = scdf.index.difference(hits)

    return hits if return_indexes else scdf.reindex(hits)

Exemple #6

0

Afficher le fichier

    def test_all_containments_both():
        """Query an NCLS with its own two intervals via ``all_containments_both``.

        Expects both returned index arrays to be ``[0, 1]``.
        """
        interval_starts = np.array([5, 10], dtype=np.int64)
        interval_ends = np.array([6, 50], dtype=np.int64)
        interval_ids = np.array([0, 1], dtype=np.int64)

        tree = NCLS(interval_starts, interval_ends, interval_ids)
        subs, covers = tree.all_containments_both(
            interval_starts, interval_ends, interval_ids)

        print(tree.intervals())

        expected = [0, 1]
        assert list(subs) == expected
        assert expected == list(covers)

Exemple #7

0

Afficher le fichier

def test_all_containments_both():
    """Query an NCLS holding coordinates above the 32-bit range with itself.

    Expects ``all_containments_both`` to return ``[0, 1]`` for both index
    arrays.
    """
    # The second interval exceeds the int32 maximum. ``dtype=int`` maps to a
    # 32-bit integer on some platforms/NumPy versions, so the width is pinned.
    starts = np.array([1291845632, 3002335232], dtype=np.int64)
    ends = np.array([1292894207, 3002597375], dtype=np.int64)
    ids = np.array([0, 1], dtype=np.int64)

    ncls = NCLS(starts, ends, ids)
    subs, covers = ncls.all_containments_both(starts, ends, ids)

    print(ncls.intervals())

    assert list(subs) == [0, 1] == list(covers)

Exemple #8

0

Afficher le fichier

Fichier : gintervals.py Projet : mmosmond/cvtk

 def __init__(self, starts=None, ends=None, indices=None, reduce=False):
     """Wrap an NCLS built from parallel start/end/index sequences.

     ``self.ncls`` stays ``None`` unless both ``starts`` and ``indices`` are
     given. A missing ``ends`` defaults to ``start + 1`` for every start.
     With ``reduce`` the triples are first passed through
     ``merge_overlaps``. All three sequences are converted to ``'i8'``
     arrays before the NCLS is built.
     """
     self.ncls = None
     if starts is None or indices is None:
         return

     if ends is None:
         ends = [begin + 1 for begin in starts]

     if reduce:
         merged = merge_overlaps(zip(starts, ends, indices))
         starts, ends, indices = list(zip(*merged))

     columns = [np.array(column, dtype='i8')
                for column in (starts, ends, indices)]
     self.ncls = NCLS(*columns)

Exemple #9

0

Afficher le fichier

Fichier : node.py Projet : YuejiangLIU/social-nce-trajectron-plus-plus

    def __init__(self, node_type, node_id, nodes_list, is_robot=False):
        """Group several nodes under one node and index their time spans.

        Every node in ``nodes_list`` gets its ``is_robot`` flag set to the
        given value. The group's ``first_timestep`` / ``_last_timestep`` are
        the minimum first and maximum last timestep of its members, and
        ``interval_tree`` is an NCLS over the members' (first, last) timestep
        pairs with their list positions as ids.
        """
        super(MultiNode, self).__init__(node_type, node_id, data=None, is_robot=is_robot)
        self.nodes_list = nodes_list
        for member in self.nodes_list:
            member.is_robot = is_robot

        first_steps = [member.first_timestep for member in self.nodes_list]
        last_steps = [member.last_timestep for member in self.nodes_list]

        self.first_timestep = min(first_steps)
        self._last_timestep = max(last_steps)

        self.interval_tree = NCLS(
            np.array(first_steps, dtype=np.int64),
            np.array(last_steps, dtype=np.int64),
            np.arange(len(self.nodes_list), dtype=np.int64),
        )

Exemple #10

0

Afficher le fichier

Fichier : intersection.py Projet : xie186/pyranges

def _intersection(scdf, ocdf, kwargs):
    """Clip the intervals of ``scdf`` to their matching intervals in ``ocdf``.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        kwargs: Dict with the required key ``"how"``: ``"containment"``,
            ``"first"`` or a falsy value (all overlaps).

    Returns:
        ``None`` if either input is empty or nothing matched. Otherwise the
        matched rows of ``scdf`` with ``Start`` replaced by the larger of the
        two starts and ``End`` by the smaller of the two ends, both cast to
        the dtype of ``ocdf.Start``.
    """
    how = kwargs["how"]

    if ocdf.empty or scdf.empty:
        return None

    assert how in "containment first".split() + [False, None]
    starts = scdf.Start.values
    ends = scdf.End.values
    indexes = scdf.index.values

    in_dtype = ocdf.Start.dtype

    oncls = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if not how:
        _self_indexes, _other_indexes = oncls.all_overlaps_both(
            starts, ends, indexes)
    elif how == "containment":
        _self_indexes, _other_indexes = oncls.all_containments_both(
            starts, ends, indexes)
    elif how == "first":
        _self_indexes, _other_indexes = oncls.first_overlap_both(
            starts, ends, indexes)

    # After re-indexing, row i of scdf pairs with row i of ocdf.
    scdf, ocdf = scdf.reindex(_self_indexes), ocdf.reindex(_other_indexes)

    new_starts = pd.Series(np.where(scdf.Start.values > ocdf.Start.values,
                                    scdf.Start, ocdf.Start),
                           index=scdf.index,
                           dtype=in_dtype)

    new_ends = pd.Series(np.where(scdf.End.values < ocdf.End.values, scdf.End,
                                  ocdf.End),
                         index=scdf.index,
                         dtype=in_dtype)

    # Silence the chained-assignment check only for these two writes. The
    # context manager restores whatever value the caller had configured, also
    # when an assignment raises, instead of forcing the option back to 'warn'.
    with pd.option_context("mode.chained_assignment", None):
        scdf.loc[:, "Start"] = new_starts
        scdf.loc[:, "End"] = new_ends

    if scdf.empty:
        return None
    return scdf

Exemple #11

0

Afficher le fichier

    def test_ncls():
        """Check ``find_overlap`` and ``all_overlaps_both`` near the int32 maximum.

        ``starts``, ``ends`` and ``ids`` are not defined in this function; they
        are taken from the enclosing scope.
        """
        print(starts, ends, ids)

        tree = NCLS(starts, ends, ids)
        print(tree)
        print(tree.intervals())

        # Nothing is expected to overlap [0, 2).
        assert list(tree.find_overlap(0, 2)) == []

        expected_hits = [(5, 6, 0), (2_147_483_645, 2_147_483_646, 3)]
        assert list(tree.find_overlap(0, 2_147_483_647)) == expected_hits

        # Querying the tree with its own intervals.
        query_ids, tree_ids = tree.all_overlaps_both(starts, ends, ids)
        assert list(query_ids) == [0, 3]
        assert list(tree_ids) == [0, 3]

Exemple #12

0

Afficher le fichier

Fichier : coverage.py Projet : xma82/pyranges

def _number_overlapping(scdf, ocdf, kwargs):
    """Add a ``NumberOverlaps`` column counting the hits of each ``scdf`` row.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        kwargs: Dict; the optional key ``"keep_nonoverlapping"`` (default
            ``True``) decides whether rows without any hit are kept with a
            count of 0.

    Returns:
        ``None`` when ``scdf`` is empty, or when ``ocdf`` is empty and
        non-overlapping rows are not kept. Otherwise a copy of ``scdf`` (or,
        without ``keep_nonoverlapping``, its rows selected by the hit indexes)
        with ``NumberOverlaps`` appended as the last column.
    """
    keep_nonoverlapping = kwargs.get("keep_nonoverlapping", True)

    if scdf.empty:
        return None

    if ocdf.empty:
        if not keep_nonoverlapping:
            return None
        result = scdf.copy()
        result.insert(result.shape[1], "NumberOverlaps", 0)
        return result

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    hit_self, _hit_other = tree.all_overlaps_both(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # One entry per hit, so counting values gives the hits per row; indexing
    # by unique() keeps the order of first appearance.
    hits = pd.Series(hit_self)
    counts = hits.value_counts()[hits.unique()].reset_index()
    counts.columns = ["Index", "Count"]

    result = scdf.copy()

    if keep_nonoverlapping:
        unmatched = np.setdiff1d(scdf.index, hit_self)
        zero_rows = pd.DataFrame(data={"Index": unmatched, "Count": 0},
                                 index=unmatched)
        counts = pd.concat([counts, zero_rows])
    else:
        result = result.loc[hit_self]

    result.insert(result.shape[1], "NumberOverlaps", counts.set_index("Index"))

    return result

Exemple #13

0

Afficher le fichier

Fichier : gintervals.py Projet : mmosmond/cvtk

class NestedContainmentList(object):
    """Thin wrapper around an NCLS that also supports an empty state.

    ``self.ncls`` is either an NCLS object or ``None``; an object with
    ``ncls is None`` is empty and never reports an overlap.
    """

    def __init__(self, starts=None, ends=None, indices=None, reduce=False):
        """Build the NCLS from parallel start/end/index sequences.

        Args:
            starts: Interval starts, or ``None`` for an empty object.
            ends: Interval ends; when ``None`` each end is ``start + 1``.
            indices: Interval ids, or ``None`` for an empty object.
            reduce: When true, the triples are first passed through
                ``merge_overlaps``.
        """
        self.ncls = None
        if starts is not None and indices is not None:
            if ends is None:
                ends = [s + 1 for s in starts]
            if reduce:
                starts, ends, indices = list(
                    zip(*merge_overlaps(zip(starts, ends, indices))))
            starts = np.array(starts, dtype='i8')
            ends = np.array(ends, dtype='i8')
            indices = np.array(indices, dtype='i8')
            self.ncls = NCLS(starts, ends, indices)

    def find_overlaps(self, start, end):
        """Return an ``Interval`` for every hit of ``NCLS.find_overlap(start, end)``.

        Returns an empty list for an empty object.
        """
        if self.ncls is None:
            # we allow for empty objects, in which case nothing overlaps
            # use case: non-matching seqids
            return []
        return [Interval(*overlap)
                for overlap in self.ncls.find_overlap(start, end)]

    @staticmethod
    def from_intervals(intervals, reduce=False):
        """Build an object from an iterable of ``(start, end, index)`` triples.

        An empty iterable yields an empty object (``ncls is None``) instead of
        failing while unpacking ``zip(*intervals)``.
        """
        intervals = list(intervals)
        if not intervals:
            return NestedContainmentList()
        starts, ends, indices = zip(*intervals)
        starts = np.array(starts, dtype='i8')
        ends = np.array(ends, dtype='i8')
        indices = np.array(indices, dtype='i8')
        obj = NestedContainmentList(starts, ends, indices, reduce=reduce)
        return obj

Exemple #14

0

Afficher le fichier

Fichier : rangeTable.py Projet : jcharkow/imMQExplorer

    def ncls_overlap(self, decimal_places = 5, start_idx = 0, end_idx = None):
        """Find overlapping range pairs with an NCLS.

        All ranges of the table are indexed; the ranges at positions
        ``start_idx:end_idx`` are then queried against that index.

        Args:
            decimal_places: Number of decimal places kept when the start/end
                values are scaled to integers (the values are multiplied by
                ``10**decimal_places`` and cast to int, because NCLS works on
                integers).
            start_idx: First position of the query slice.
            end_idx: End position (exclusive) of the query slice; ``None``
                means up to the end of the table.

        Returns:
            The two index arrays returned by ``all_overlaps_both`` stacked as
            the columns of one 2-D array (one row per overlapping pair).
        """
        # If end_idx is None, use the end of the list. This was written as a
        # comparison (`==`) before, so the default was never actually applied.
        if end_idx is None:
            end_idx = len(self.start)

        # Convert the values to integers so that they are compatible with NCLS.
        scale = 10**decimal_places
        int_start = (self.start * scale).astype(int)
        int_end = (self.end * scale).astype(int)

        # Create the NCLS object over the whole table.
        ncls = NCLS(int_start, int_end, self.idx)

        # Column-stacking the two index arrays makes each row one pair.
        return np.column_stack(
            ncls.all_overlaps_both(int_start[start_idx:end_idx],
                                   int_end[start_idx:end_idx],
                                   self.idx[start_idx:end_idx]))

Exemple #15

0

Afficher le fichier

def test_ncls():
    """Check ``find_overlap`` and ``all_overlaps_both`` near the int64 maximum.

    ``starts``, ``ends`` and ``ids`` are not defined in this function; they are
    taken from the enclosing (module) scope.
    """
    print(starts, ends, ids)

    tree = NCLS(starts, ends, ids)
    print(tree)
    print(tree.intervals())

    # Nothing is expected to overlap [0, 2).
    assert list(tree.find_overlap(0, 2)) == []

    top_hits = list(
        tree.find_overlap(9_223_372_036_854_775_805, 9_223_372_036_854_775_806))
    print("aaa", top_hits)

    expected_hits = [(5, 6, 2147483647),
                     (9223372036854775805, 9223372036854775807, 3)]
    assert list(tree.find_overlap(0, 9_223_372_036_854_775_806)) == expected_hits

    # Querying the tree with its own intervals.
    query_ids, tree_ids = tree.all_overlaps_both(starts, ends, ids)
    assert list(query_ids) == [2147483647, 3]
    assert list(tree_ids) == [2147483647, 3]

Exemple #16

0

Afficher le fichier

Fichier : utils.py Projet : olgabot/cerebra

    def __init__(self, predicate, records):
        """Index records by genome position, one NCLS per chromosome.

        Args:
            predicate: Callable mapping a record to an object with ``chrom``,
                ``start`` and ``end`` attributes, or to ``None`` to leave the
                record out.
            records: Iterable of records to index.

        After construction ``self.records`` holds the records that were kept
        and ``self.tree_map`` maps each chromosome to an NCLS whose interval
        ids are positions in ``self.records``.
        """
        self.predicate = predicate
        self.records = []

        # chrom -> (starts, ends, ids)
        working_tree_map = {}

        for record in records:
            genome_pos = predicate(record)

            if genome_pos is None:
                continue

            starts, ends, ids = working_tree_map.setdefault(
                genome_pos.chrom, ([], [], []))
            starts.append(genome_pos.start)
            ends.append(genome_pos.end)
            # The id is the record's position in self.records. Using the
            # position in the input iterable goes out of step as soon as one
            # record is skipped above.
            ids.append(len(self.records))

            self.records.append(record)

        # np.int64 instead of np.long: the latter is missing from several
        # NumPy releases and its width is platform-dependent where it exists.
        self.tree_map = {
            chrom: NCLS(np.array(starts, dtype=np.int64),
                        np.array(ends, dtype=np.int64),
                        np.array(ids, dtype=np.int64))
            for chrom, (starts, ends, ids) in working_tree_map.items()
        }

Exemple #17

0

Afficher le fichier

Fichier : annotation_table_sequence_details.py Projet : wenmm/Genome_analysis_scripts

def gmap_parse_ncls(gmapFile, cutoff):
    """Index the ``cDNA_match`` lines of a GMAP GFF3 file in an NCLS.

    Args:
        gmapFile: Path of the tab-separated GFF3 file.
        cutoff: Minimum value of column 6 (parsed as float) for a line to be
            kept.

    Returns:
        ``(ncls, gmapLoc)``: the NCLS over the kept ranges, and a dict mapping
        each range id to the contig ID of its line. Coordinates are not unique
        across contigs, so results returned by the NCLS should be filtered by
        contig ID through ``gmapLoc``.
    """
    gmapLoc = {}
    starts = []
    ends = []
    ids = []
    with open(gmapFile, 'r') as fileIn:
        for line in fileIn:
            # Skip comment lines and blank lines (a blank line used to raise
            # IndexError on the column lookup below).
            if line.startswith('#') or not line.strip():
                continue
            sl = line.split('\t')
            # Other line types are not expected in a GMAP GFF3 file produced
            # with PASA's settings, but skipping them future-proofs the parse.
            if sl[2] != 'cDNA_match':
                continue
            # Get details from the line.
            contigID = sl[0]
            contigStart = int(sl[3])
            contigStop = int(sl[4])
            identity = float(sl[5])
            # Only hold onto hits that pass the cutoff check.
            if identity < cutoff:
                continue
            hitID = len(ids)
            starts.append(contigStart)
            # NCLS indexes 0-based, so +1 to comply with GFF3's 1-based system.
            ends.append(contigStop + 1)
            ids.append(hitID)
            gmapLoc[hitID] = contigID
    # Build the NCLS object. The dtype is stated explicitly so that an empty
    # result still produces integer arrays rather than pandas' default dtype
    # for an empty Series.
    starts = pd.Series(starts, dtype="int64")
    ends = pd.Series(ends, dtype="int64")
    ids = pd.Series(ids, dtype="int64")
    ncls = NCLS(starts.values, ends.values, ids.values)
    return ncls, gmapLoc

Exemple #18

0

Afficher le fichier

Fichier : bedcov-ncls.py Projet : zachcp/cgranges

def main(argv):
	"""Count, for every interval of a streamed BED file, the overlaps in a loaded one.

	Args:
		argv: Command-line arguments; ``argv[1]`` is the BED file that is
			loaded and indexed (one NCLS per contig), ``argv[2]`` the BED file
			that is streamed and queried.

	Prints one tab-separated line per streamed interval (contig, start, end,
	overlap count) to stdout and the timings of the three phases to stderr.
	Exits with status 1 after printing a usage line when fewer than two file
	arguments are given.
	"""
	if len(argv) < 3:
		print("Usage: bedcov.py <loaded.bed> <streamed.bed>")
		sys.exit(1)

	# contig -> [starts, ends, ids, NCLS]
	bed, i = {}, 0
	start = timer()
	with open(argv[1]) as fp:
		for line in fp:
			# Strip only the line terminator; slicing off the last character
			# would eat a digit when the final line has no newline.
			t = line.rstrip("\r\n").split("\t")
			if t[0] not in bed:
				bed[t[0]] = [[], [], [], None]
			bed[t[0]][0].append(t[1])
			bed[t[0]][1].append(t[2])
			bed[t[0]][2].append(i)
			i += 1
	sys.stderr.write("Read in {} sec\n".format(timer() - start))
	start = timer()
	for ctg in bed:
		# np.int64 instead of np.long, which is missing from several NumPy
		# releases and platform-dependent where it exists.
		bed[ctg][3] = NCLS(np.array(bed[ctg][0], dtype=np.int64),
		                   np.array(bed[ctg][1], dtype=np.int64),
		                   np.array(bed[ctg][2], dtype=np.int64))
	sys.stderr.write("Index in {} sec\n".format(timer() - start))
	start = timer()
	with open(argv[2]) as fp:
		for line in fp:
			t = line.rstrip("\r\n").split("\t")
			if t[0] not in bed:
				print("{}\t{}\t{}\t0".format(t[0], t[1], t[2]))
			else:
				# int() instead of the Python 2 only long().
				it = bed[t[0]][3].find_overlap(int(t[1]), int(t[2]))
				cnt = sum(1 for _ in it)
				print("{}\t{}\t{}\t{}".format(t[0], t[1], t[2], cnt))
	sys.stderr.write("Query in {} sec\n".format(timer() - start))

Exemple #19

0

Afficher le fichier

Fichier : Misc.py Projet : SebastianHollizeck/mismatchfinder

def buildNCLSindex(sites):
    """Return an NCLS holding one length-1 interval ``[site, site + 1)`` per site.

    The interval ids are the positions of the sites in ``sites``.
    """
    positions = array(sites, dtype=int64)
    position_ends = array(positions + 1, dtype=int64)
    return NCLS(positions, position_ends, arange(len(positions)))

Exemple #20

0

Afficher le fichier

    def overlap(self, im=True, decimal_places=5):
        """Find pairs of entries that overlap in retention time and in m/z (and IM).

        Args:
            im: When true, candidate pairs are filtered with the combined
                m/z + IM check; otherwise only with the m/z table's check.
            decimal_places: Number of decimal places kept when the retention
                times are scaled to integers for NCLS.

        Returns:
            The surviving index pairs, one pair per row; an empty array when no
            two distinct entries overlap in retention time. If the retention
            table's index is not ``0..n-1``, the pairs are computed on
            positional indexes and mapped back with ``Precursor.unindex``.
        """
        table = self.retentionTable

        # If the index is not linear, work on a hidden positional index and
        # translate back before returning.
        positional = np.arange(0, len(table.start))
        use_hidden = not np.all(table.idx == positional)
        hidden_idx = positional if use_hidden else table.idx

        # Convert the retention times to integers so they are compatible with
        # NCLS.
        scale = 10**decimal_places
        ret_int_start = (table.start * scale).astype(int)
        ret_int_end = (table.end * scale).astype(int)

        ncls = NCLS(ret_int_start, ret_int_end, hidden_idx)

        # All pairwise retention-time overlaps; column_stack puts the two index
        # arrays side by side so that each row is one pair.
        ret_idx = np.column_stack(
            ncls.all_overlaps_both(ret_int_start, ret_int_end, hidden_idx))

        # An entry always overlaps itself; those pairs are of no interest.
        ret_idx = ret_idx[ret_idx[:, 0] != ret_idx[:, 1]]

        if ret_idx.size > 0:
            left, right = ret_idx[:, 0], ret_idx[:, 1]
            if im:
                keep = self.__vecMzImOverlap(left, right)
            else:
                keep = self.mzTable.vec_idx_overlap(left, right)
            rslt = ret_idx[keep]
        else:
            rslt = np.array([])

        if use_hidden:
            return Precursor.unindex(rslt, self.retentionTable.idx)
        return rslt

Exemple #21

0

Afficher le fichier

 def as_ncls_dict(self) -> Dict[Chrom, NCLS]:
     """Return one NCLS per value of the ``chrom`` column.

     Each NCLS is built from the group's ``start`` and ``end`` columns and
     its index, all cast to int64.
     """
     def as_int64(values):
         return values.astype(np.int64)

     return {
         chrom: NCLS(
             as_int64(rows.start.values),
             as_int64(rows.end.values),
             as_int64(rows.index.values),
         )
         for chrom, rows in self._obj.groupby("chrom")
     }

Exemple #22

0

Afficher le fichier

def _subtraction(scdf, ocdf, **kwargs):
    """Subtract the intervals of ``ocdf`` from those of ``scdf``.

    Args:
        scdf: DataFrame with ``Chromosome``, ``Start``, ``End`` and ``__num__``
            columns (and ``Strand`` when stranded).
        ocdf: DataFrame with ``Start``/``End`` columns.
        **kwargs: Must contain ``"strandedness"`` (only read when both frames
            are non-empty).

    Returns:
        ``scdf`` unchanged if either frame is empty. Otherwise ``scdf`` sorted
        by index, with the starts/ends reported by
        ``NCLS.set_difference_helper`` written in and the rows the helper
        flagged with a start of ``-1`` removed; ``None`` if no row remains.
    """
    if ocdf.empty or scdf.empty:
        return scdf

    strand = bool(kwargs["strandedness"])

    # These only update the local kwargs dict.
    kwargs["chromosome"] = scdf.Chromosome.head(1).iloc[0]

    if "Strand" in ocdf and strand:
        strand = scdf.Strand.head(1).iloc[0]
        kwargs["strand"] = strand

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    idx_self, new_starts, new_ends = tree.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values,
        scdf.__num__.values)

    # Rows the helper did not report at all are kept untouched.
    untouched = pd.Index(scdf.index).difference(idx_self)

    # A start of -1 marks a reported row that is to be dropped.
    keep = new_starts != -1
    idx_self = idx_self[keep]

    start_by_idx = pd.Series(new_starts[keep], index=idx_self).sort_index()
    end_by_idx = pd.Series(new_ends[keep], index=idx_self).sort_index()

    scdf = scdf.reindex(untouched.union(idx_self)).sort_index()

    if len(idx_self):
        changed = scdf.index.isin(idx_self)
        scdf.loc[changed, "Start"] = start_by_idx.values
        scdf.loc[changed, "End"] = end_by_idx.values

    return scdf if not scdf.empty else None

Exemple #23

0

Afficher le fichier

 def filter_by_human_annotations(self, article, annotations):
     """Drop the annotations that overlap one of the article's own annotations.

     Args:
         article: Mapping whose ``'annotations'`` entry holds the reference
             annotations; their intervals come from ``get_intervals``.
         annotations: Candidate annotations to filter.

     Returns:
         ``(kept, num_filtered)``: the candidates without any overlap, and
         the number of candidates that were dropped.
     """
     human_annotations = article['annotations']
     tree = NCLS(*get_intervals(human_annotations))

     kept = []
     num_filtered = 0
     for candidate in annotations:
         begin, end = get_start_end(candidate)
         hits = list(tree.find_overlap(begin, end))
         if not hits:
             kept.append(candidate)
             continue

         # Sanity check: the first reported hit must really intersect the
         # candidate (a hit's third field is the reference annotation's
         # position in the list).
         human_begin, human_end = get_start_end(human_annotations[hits[0][2]])
         assert intersect(human_begin, human_end, begin, end)
         num_filtered += 1

     assert len(kept) + num_filtered == len(annotations)
     return kept, num_filtered

Exemple #24

0

Afficher le fichier

Fichier : test_ncls.py Projet : alexreg/ncls

def test_ncls():
    """Compare ``all_overlaps_both_stack`` with ``all_overlaps_both``.

    Builds an NCLS over 1e6 intervals ``[i, i + 100)`` and queries it with the
    same intervals through both methods.
    """
    starts = pd.Series(range(0, int(1e6)))
    ends = starts + 100
    ids = starts

    print(starts, ends, ids)

    ncls = NCLS(starts.values, ends.values, ids.values)

    # The query ids were never defined (their only assignment was commented
    # out), so the test died with NameError; the intervals' own ids are used.
    indexes = ids
    print(starts, ends, indexes)
    it = ncls.all_overlaps_both_stack(starts.values, ends.values,
                                      indexes.values)
    it2 = ncls.all_overlaps_both(starts.values, ends.values, indexes.values)

    print(it)
    print(it2)
    # NOTE(review): if both calls return tuples of NumPy arrays, `==` on them
    # is ambiguous; confirm the return types and compare element-wise if so.
    assert it == it2

Exemple #25

0

Afficher le fichier

Fichier : join.py Projet : dmaloneynygc/pyranges

def _both_indexes(scdf, ocdf, how=False):
    """Return the paired index arrays of an interval query of ``scdf`` on ``ocdf``.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        how: ``"containment"`` uses ``all_containments_both``, a falsy value
            uses ``all_overlaps_both``, anything else that passes the
            assertion (``"first"`` or a non-zero int) uses
            ``first_overlap_both``.

    Returns:
        ``(self_indexes, other_indexes)`` exactly as returned by the query.
    """
    named_modes = ["containment", "first", False, None]
    assert (how in named_modes) or isinstance(how, int)

    query_starts = scdf.Start.values
    query_ends = scdf.End.values
    query_index = scdf.index.values

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        finder = tree.all_containments_both
    elif not how:
        finder = tree.all_overlaps_both
    else:
        finder = tree.first_overlap_both

    self_hits, other_hits = finder(query_starts, query_ends, query_index)
    return self_hits, other_hits

Exemple #26

0

Afficher le fichier

    def overlap_against_window(self, exp, im=True, decimal_places=5):
        """Find pairs (own entry, MS2 window of ``exp``) that overlap.

        Args:
            exp: Object whose ``ms2`` attribute provides ``timeTable`` (and
                ``mzTable`` for the non-IM check).
            im: When true, candidate pairs are filtered with
                ``vec_mz_im_overlap``; otherwise with the m/z table's
                ``vec_idx_overlap``.
            decimal_places: Number of decimal places kept when the times are
                scaled to integers for NCLS.

        Returns:
            The surviving index pairs, one pair per row, with this object's
            index in the first column.
        """
        print("starting overlap against window")
        ms2 = exp.ms2
        scale = 10**decimal_places

        def to_int(times):
            # Scale to integers so the values are compatible with NCLS.
            return (times * scale).astype(int)

        ret_int_start = to_int(self.retentionTable.start)
        ret_int_end = to_int(self.retentionTable.end)

        ms2_time_int_start = to_int(ms2.timeTable.start)
        ms2_time_int_end = to_int(ms2.timeTable.end)

        print(ret_int_start)
        print(ret_int_end)

        # Index the own retention ranges and query them with the MS2 windows.
        ncls = NCLS(ret_int_start, ret_int_end, self.retentionTable.idx)
        pairs = ncls.all_overlaps_both(ms2_time_int_start, ms2_time_int_end,
                                       ms2.timeTable.idx)

        # column_stack makes each row one pair; the query (MS2) index comes
        # first, so the columns are flipped to put the own index first.
        ret_idx = np.fliplr(np.column_stack(pairs))

        own_idx, window_idx = ret_idx[:, 0], ret_idx[:, 1]
        if im:
            keep = self.vec_mz_im_overlap(own_idx, window_idx, idx2_data=ms2)
        else:
            keep = self.mzTable.vec_idx_overlap(own_idx, window_idx,
                                                yData=ms2.mzTable)
        return ret_idx[keep]

Exemple #27

0

Afficher le fichier

Fichier : test_multiprocessing.py Projet : davemarr621/NestedContainers

def create_ncls(seed):
    """Build an NCLS over 1e7 random intervals of length 100.

    NumPy's global random generator is seeded with ``seed`` first. Interval
    starts are drawn from ``[0, 1e8)`` and also serve as the interval ids.
    """
    np.random.seed(seed)

    interval_count = 10 ** 7
    upper_bound = 10 ** 8

    starts = randint(0, upper_bound, interval_count)
    tree = NCLS(starts, starts + 100, starts)

    print("returning")
    return tree

Exemple #28

0

Afficher le fichier

Fichier : multithreaded.py Projet : Runsheng/pyranges

def _both_dfs(scdf, ocdf, how=False, **kwargs):
    """Return the rows of both frames that pair up in an interval query.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns (query side).
        ocdf: DataFrame with ``Start``/``End`` columns (indexed side).
        how: ``"containment"`` uses ``all_containments_both``, ``"first"``
            uses ``first_overlap_both``, a falsy value uses
            ``all_overlaps_both``.
        **kwargs: Ignored.

    Returns:
        ``(scdf_rows, ocdf_rows)``: both frames selected with ``.loc`` by the
        matching index arrays, so row *i* of one pairs with row *i* of the
        other.
    """
    assert how in ["containment", "first", False, None]

    query_starts = scdf.Start.values
    query_ends = scdf.End.values
    query_index = scdf.index.values

    tree = NCLS(ocdf.Start.values, ocdf.End.values, ocdf.index.values)

    if how == "containment":
        finder = tree.all_containments_both
    elif not how:
        finder = tree.all_overlaps_both
    else:
        finder = tree.first_overlap_both

    self_hits, other_hits = finder(query_starts, query_ends, query_index)

    self_rows = scdf.loc[self_hits]
    other_rows = ocdf.loc[other_hits]
    return self_rows, other_rows

Exemple #29

0

Afficher le fichier

def create_eij_ncls_dict(standard_event_dict):
    """Index the exon-intron junctions of all events, per chromosome and strand.

    Args:
        standard_event_dict: Maps an event name to a dict with the keys
            ``"strand"``, ``"chrom"``, ``"included_ei_junctions"`` and
            ``"excluded_ei_junctions"``.

    Returns:
        A 3-tuple:

        * ``ncls_by_chrom_strand[chrom][strand]`` - an NCLS over that
          strand's junction positions, shifted by -1 to make them 0-based;
        * ``eij_indexed_event_dict`` - maps ``"<chrom>_<eij>_<strand>"`` to the
          set of ``"<event>_included"`` / ``"<event>_excluded"`` labels using
          that junction;
        * ``eij_only_count_dict`` - maps the same keys to a counter
          initialised to 0.
    """
    eij_by_chrom_strand = {}
    eij_indexed_event_dict = {}
    eij_only_count_dict = {}
    ncls_by_chrom_strand = {}

    # .items() works on Python 2 and 3; .iteritems() exists on Python 2 only.
    for event, event_val in standard_event_dict.items():

        strand = event_val["strand"]
        chrom = event_val["chrom"]

        # Included and excluded junctions are handled identically apart from
        # the label attached to the event name.
        for label in ("included", "excluded"):

            for eij in event_val[label + "_ei_junctions"]:

                eij_by_chrom_strand.setdefault(chrom, {}).setdefault(
                    strand, set()).add(int(eij))

                eij_index = chrom + "_" + str(eij) + "_" + strand

                eij_indexed_event_dict.setdefault(
                    eij_index, set()).add(event + "_" + label)

                eij_only_count_dict.setdefault(eij_index, 0)

    for chrom, chrom_dict in eij_by_chrom_strand.items():

        ncls_by_chrom_strand[chrom] = {}

        for strand, strand_dict in chrom_dict.items():

            starts = np.array(list(strand_dict)) - 1  ## to make 0-based
            # NOTE(review): start == end gives zero-length intervals; confirm
            # that the queries made against these NCLS objects account for it.
            ends = starts
            ids = starts

            ncls_by_chrom_strand[chrom][strand] = NCLS(starts, ends, ids)

    return (ncls_by_chrom_strand, eij_indexed_event_dict, eij_only_count_dict)

Exemple #30

0

Afficher le fichier

Fichier : multithreaded.py Projet : Runsheng/pyranges

def _subtraction(scdf, ocdf, **kwargs):
    """Subtract the clustered intervals of ``ocdf`` from those of ``scdf``.

    Args:
        scdf: DataFrame with ``Start``/``End`` columns.
        ocdf: DataFrame that is passed through ``_cluster`` before being
            indexed.
        **kwargs: Must contain ``"key"`` (parsed with ``parse_grpby_key``) and,
            when both frames are non-empty, ``"strandedness"``.

    Returns:
        ``scdf`` unchanged if either frame is empty. Otherwise ``scdf``
        re-indexed to the rows that survive, with the starts/ends reported by
        ``NCLS.set_difference_helper`` written in; rows the helper flagged
        with a start of ``-1`` are removed.
    """
    chromosome, strand = parse_grpby_key(kwargs["key"])

    if ocdf.empty or scdf.empty:
        return scdf

    # The strand parsed from the key is replaced by a plain flag.
    strand = bool(kwargs["strandedness"])

    clustered = _cluster(ocdf, chromosome, strand)
    tree = NCLS(clustered.Start.values, clustered.End.values,
                clustered.index.values)

    idx_self, new_starts, new_ends = tree.set_difference_helper(
        scdf.Start.values, scdf.End.values, scdf.index.values)

    # Rows the helper did not report at all are kept untouched.
    untouched = pd.Index(scdf.index).difference(idx_self)

    # A start of -1 marks a reported row that is to be dropped.
    keep = new_starts != -1
    idx_self = idx_self[keep]

    start_by_idx = pd.Series(new_starts[keep], index=idx_self).sort_index()
    end_by_idx = pd.Series(new_ends[keep], index=idx_self).sort_index()
    idx_self = np.sort(idx_self)

    scdf = scdf.reindex(untouched.union(idx_self))

    if len(idx_self):
        changed = scdf.index.isin(idx_self)
        scdf.loc[changed, "Start"] = start_by_idx
        scdf.loc[changed, "End"] = end_by_idx

    return scdf