Code example #1
0
File: data_handler.py  Project: GenomeNet/GenomeNet
    def query_similar_activations(self, cells, source, activation_threshold=.3,
                                  data_transform='tanh', add_histograms=False, phrase_length=0,
                                  query_mode='fast', constrain_left=False, constrain_right=False, no_of_results=50):
        """Search for the longest sequences during which the given cells are jointly active.

        A cell counts as "active" at a time step when its (transformed) state exceeds
        ``activation_threshold``.  Candidate phrases are found by run-length encoding
        the per-step count of active pivot cells, then each candidate is re-scored
        against *all* cells and ranked by largest intersection with ``cells``,
        smallest union, and longest phrase.

        :param cells: indices of the pivot cells to query
        :param source: path in states.h5
        :param activation_threshold: threshold (in tanh space)
        :param data_transform: applied data transformation (tanh, tanhabs, raw)
        :param add_histograms: if True, add phrase-length histograms to the meta dict
        :param phrase_length: if > 0, only consider phrases of exactly this length
        :param query_mode: 'fast' limits the scan to the first 500,000 time steps;
            any other value scans all time steps (still memory-efficiently)
        :param constrain_left: require the pattern to start exactly at its position
            (cells inactive directly before the phrase)
        :param constrain_right: require the pattern to end exactly at its position
            (cells inactive directly after the phrase)
        :param no_of_results: maximal number of results to return
        :return: (list of {'pos': ..., 'factors': [...]} dicts, meta dict)
        """
        # Cached state matrix plus a flag telling whether the transform was already applied.
        cell_states, data_transformed = self.get_cached_matrix(data_transform, source)

        # print 'before cs:', '{:,}'.format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        # If the matrix still holds raw (pre-tanh) values, move the threshold into
        # raw space via arctanh instead of transforming the whole matrix.
        activation_threshold_corrected = activation_threshold
        if not data_transformed:
            activation_threshold_corrected = np.arctanh(activation_threshold)

        # Minimal run length for a phrase to be considered at all.
        cut_off = 2

        # print 'out cs 1:', '{:,}'.format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        if query_mode == "fast":
            num_of_cells_per_sum = 5  # how many cells are evaluated per batch
            maximal_length = int(5e5)  # only consider the first 500,000 time steps
            num_candidates = 1000
        else:  # all time steps but still be memory efficient
            maximal_length = cell_states.shape[0]
            # Size the cell batch so each slice stays around 5e6 elements.
            num_of_cells_per_sum = int(np.floor(5e6 / maximal_length))
            num_of_cells_per_sum = 1 if num_of_cells_per_sum == 0 else num_of_cells_per_sum
            num_candidates = 10000

        # print 'num_cells', num_of_cells_per_sum

        # Accumulate, per time step, how many of the pivot cells are active.
        # Cells are processed in batches of num_of_cells_per_sum to bound memory.
        cs_cand = None
        # start = time.time()
        no_slices = int(np.ceil(len(cells) * 1. / num_of_cells_per_sum))
        for c in range(0, no_slices):
            cell_range = cells[c * num_of_cells_per_sum:min((c + 1) * num_of_cells_per_sum, len(cells))]
            c_discrete = cell_states[:maximal_length, cell_range]
            # In-place discretization: values above the threshold become 1, others 0.
            hf.threshold_discrete(c_discrete, activation_threshold_corrected, 0, 1)

            if num_of_cells_per_sum > 1:
                c_batch = np.sum(c_discrete, axis=1)
            else:
                c_batch = c_discrete
            if cs_cand is None:
                cs_cand = c_batch
            else:
                cs_cand = cs_cand + c_batch

            del c_discrete, c_batch

        # Progressively relax the requirement from "all pivot cells active" down to
        # fewer cells until enough candidate phrases have been collected.
        test_cell_number = len(cells)
        test_discrete = np.copy(cs_cand)
        collect_all_candidates = {}
        # start = time.time()
        while test_cell_number > 0 and len(collect_all_candidates) < num_candidates:
            if test_cell_number != len(cells):
                # Clamp counts so adjacent runs with >= test_cell_number active cells merge.
                test_discrete[test_discrete > test_cell_number] = test_cell_number
            # hf.rle appears to return (run lengths, run start positions, run values).
            length, positions, value = hf.rle(test_discrete)
            # positions = np.array(positions)
            if phrase_length > 0:
                indices = np.argwhere((value == test_cell_number) & (length == phrase_length))
            else:
                indices = np.argwhere((value == test_cell_number) & (length >= cut_off))

            # The third tuple element scores how many cells switch off at the
            # constrained boundary run(s) (0 when unconstrained).
            if constrain_left and not constrain_right:

                len_pos = set(zip(length[indices].flatten().tolist(), positions[indices].flatten().tolist(),
                                  (test_cell_number - value[indices - 1]).flatten().astype(int).tolist()))
            elif not constrain_left and constrain_right:

                len_pos = set(zip(length[indices].flatten().tolist(), positions[indices].flatten().tolist(),
                                  (test_cell_number - value[indices + 1]).flatten().astype(int).tolist()))
            elif constrain_left and constrain_right:

                len_pos = set(zip(length[indices].flatten().tolist(), positions[indices].flatten().tolist(),
                                  (test_cell_number - value[indices + 1] - value[indices - 1]).flatten().astype(
                                      int).tolist()))
            else:
                len_pos = set(zip(length[indices].flatten().tolist(), positions[indices].flatten().tolist(),
                                  np.zeros(len(indices)).astype(int).tolist()))

            # De-duplicate on (length, position); the first-seen entry wins, i.e. the
            # one found at the highest test_cell_number (strictest activation level).
            for lp in len_pos:
                key = '{0}_{1}'.format(lp[0], lp[1])
                llp = collect_all_candidates.get(key, lp)
                collect_all_candidates[key] = llp

            test_cell_number -= 1

        # Rank candidates by boundary score (descending) and keep the best.
        all_candidates = list(collect_all_candidates.values())
        all_candidates.sort(key=lambda kk: kk[2], reverse=True)
        # for k, v in enumerate(all_candidates):
        #     if v[1] < 1000:
        #         print 'x', v, k
        all_candidates = all_candidates[:num_candidates]
        # print 'fff'
        # for k, v in enumerate(all_candidates):
        #     if v[1] < 1000:
        #         print 'x', v, k

        cell_count = len(cells)

        res = []

        max_pos = cell_states.shape[0]

        for cand in all_candidates:  # positions where all pivot cells start jointly
            ml = cand[0]  # maximal length of _all_ pivot cells on
            pos = cand[1]  # position of the pattern

            # Skip candidates whose +/-1 context window would fall outside the data.
            if pos < 1 or pos + ml + 1 > max_pos:
                continue
            # TODO: find a more elegant solution

            cs = np.array(cell_states[pos - 1:pos + ml + 1, :])  # cell values of _all_ cells for the range
            hf.threshold_discrete(cs, activation_threshold_corrected, -1, 1)  # discretize

            # create pattern mask of form -1 1 1..1 -1 = off on on .. on off
            mask = np.ones(ml + 2)
            mask[0] = -1 if constrain_left else 0  # ignore if not constraint
            mask[ml + 1] = -1 if constrain_right else 0  # ignore if not constraint

            # With cs in {-1, 1}, a cell matches the mask exactly when its dot product
            # equals the number of relevant mask positions (test_pattern_length).
            cs_sum = np.dot(mask, cs)
            test_pattern_length = ml  # defines the length of the relevant pattern
            test_pattern_length += 1 if constrain_left else 0
            test_pattern_length += 1 if constrain_right else 0

            all_active_cells = np.where(cs_sum == test_pattern_length)[0]  # all cells  that are active for range

            intersect = np.intersect1d(all_active_cells, cells)  # intersection with selected cells
            union = np.union1d(all_active_cells, cells)  # union with selected cells

            # factors layout: [pos, 0, length, Jaccard, #selected-but-inactive,
            #                  union size, intersection size]
            res.append({'pos': pos,
                        'factors': [pos, 0, ml,  # int(value[int(indices[ll2]) + 1])
                                    (float(len(intersect)) / float(len(union))),  # Jaccard
                                    cell_count - len(intersect), len(union),
                                    len(intersect)]})  # how many selected cells are not active

        def key(elem):
            return -elem['factors'][6], elem['factors'][5], -elem['factors'][
                2]  # largest intersection, smallest union, longest phrase

        meta = {}
        if add_histograms:
            # strict histogram: only phrases where every selected cell is active
            meta['fuzzy_length_histogram'] = np.bincount([x['factors'][2] for x in res])
            meta['strict_length_histogram'] = np.bincount([x['factors'][2] for x in res if x['factors'][4] == 0])

        if phrase_length > 1:
            res = [x for x in res if x['factors'][2] == phrase_length]

        res.sort(key=key)

        final_res = list(res[:no_of_results])

        # for elem in res_50:
        #     print elem, cell_count, -1. * (cell_count - elem[4]) / float(elem[3] + cell_count)
        # print(constrain_left, constrain_right)
        del res
        # print 'out cs 2:', '{:,}'.format(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        return final_res, meta
Code example #2
0
File: data_handler.py  Project: GenomeNet/GenomeNet
    def get_states(self, pos_array, source, left=10, right=0, cell_selection=None, raw=False, round_values=5,
                   data_transform='tanh', activation_threshold=0.3, add_active_cells=False, transpose=False, rle=0):
        """Get state sub-matrices around a set of positions.

        :param pos_array: array of positions (time steps) to query
        :param source: source path in the states.h5 file
        :param left: positions to include left of each query position
        :param right: positions to include right of each query position
        :param cell_selection: selection of cells (None or [] selects all cells)
        :param raw: deliver each states sub-matrix as a numpy array (default: False)
        :param round_values: if not raw, round values to this many digits
        :param data_transform: data transformation (default: 'tanh') -- options:
            'raw', 'tanh', 'tanh_abs' (note: the code tests the string 'tanh_abs')
        :param activation_threshold: threshold above which a cell counts as active (default: 0.3)
        :param add_active_cells: also return an active-cell count per position (default: False)
        :param transpose: transpose the states sub-matrix (default: False; the
            rle > 0 branch always produces transposed output)
        :param rle: if > 0, zero out activation runs of length <= rle
        :return: [ ...{left: window start, right: window end, pos: requested pos, data: states matrix}, ...],
            [per-position active-cell sums]
        :rtype: (list, list)
        """
        if cell_selection is None:
            cell_selection = []

        cell_states, data_transformed = self.get_cached_matrix(data_transform, source)

        res = []
        sum_active = []
        for pos in pos_array:
            left_pos = pos - min(left, pos)  # clip window at the sequence start
            right_pos = min(len(cell_states), pos + 1 + right)  # clip at the sequence end

            if len(cell_selection) == 0:
                cs = cell_states[left_pos:right_pos]
            else:
                cs = cell_states[left_pos:right_pos, cell_selection]

            # Apply the transformation in place if the cached matrix is still raw.
            # NOTE(review): if cs is a view into the cached matrix this mutates the
            # cache -- presumably get_cached_matrix returns a copy; verify.
            if not data_transformed:
                if data_transform == 'tanh':
                    np.tanh(cs, cs)
                elif data_transform == 'tanh_abs':
                    np.tanh(cs, cs)
                    np.abs(cs, cs)

            sub_res = {
                'pos': pos,
                'left': left_pos,
                'right': right_pos - 1
            }

            if rle > 0:
                # Suppress short activations: zero out every above-threshold run
                # whose length is <= rle. Output is transposed (cells x steps).
                cs_t = np.transpose(np.copy(cs))
                disc = np.copy(cs_t)
                cs_t[cs_t < activation_threshold] = 0
                hf.threshold_discrete(disc, activation_threshold, 0, 1)

                for i in range(0, len(disc)):
                    # Run-length encode the 0/1 activation pattern of cell i.
                    # (Renamed from 'pos' to avoid shadowing the outer loop variable.)
                    run_lengths, run_starts, run_values = hf.rle(disc[i])
                    offset = int(1 - run_values[0])  # index of the first run of 1s
                    active_lengths = run_lengths[offset::2]  # every other run is a 1-run
                    active_starts = run_starts[offset::2]
                    for p in np.argwhere(active_lengths <= rle):
                        cs_t[i, active_starts[p]:active_starts[p] + active_lengths[p]] = 0
                sub_res['data'] = cs_t if raw else [[round(y, round_values) for y in x] for x in cs_t.tolist()]

            else:
                if transpose:
                    sub_res['data'] = np.transpose(cs) if raw else [[round(y, round_values) for y in x] for x in
                                                                    np.transpose(cs).tolist()]
                else:
                    sub_res['data'] = cs if raw else [[round(y, round_values) for y in x] for x in cs.tolist()]

            if add_active_cells:
                # Count active cells per position. Threshold a copy so the raw
                # 'data' matrix returned above is not clobbered by the in-place
                # discretization (the original destroyed cs here).
                a = np.copy(cs)
                hf.threshold_discrete(a, activation_threshold, 0, 1)
                sum_active.append(np.sum(a, axis=1).tolist())

            del cs
            res.append(sub_res)

        return res, sum_active