def query_similar_activations(self, cells, source, activation_threshold=.3, data_transform='tanh', add_histograms=False, phrase_length=0, query_mode='fast', constrain_left=False, constrain_right=False, no_of_results=50):
    """Search for the longest sequences given the activation threshold and a set of cells.

    :param cells: the cells (indices into the state matrix columns)
    :param source: path in states.h5
    :param activation_threshold: threshold at which a cell counts as active
    :param data_transform: applied data transformation (tanh, tanhabs, raw)
    :param add_histograms: if True, add length histograms to the returned meta dict
    :param phrase_length: if > 0, only keep candidate runs of exactly this length
    :param query_mode: 'fast' scans only a prefix of the time steps with fixed
        batch size; anything else scans all time steps memory-efficiently
    :param constrain_left: require the pattern to be OFF at the position before the run
    :param constrain_right: require the pattern to be OFF at the position after the run
    :param no_of_results: maximal number of results returned
    :return: tuple (final_res, meta) where final_res is a list of dicts with
        'pos' and 'factors' [pos, 0, length, jaccard, #selected-but-inactive,
        union size, intersection size], and meta optionally carries histograms
    """
    cell_states, data_transformed = self.get_cached_matrix(data_transform, source)
    # If the cached matrix is still raw, move the threshold into raw space
    # instead of transforming the whole matrix (arctanh is the tanh inverse).
    activation_threshold_corrected = activation_threshold
    if not data_transformed:
        activation_threshold_corrected = np.arctanh(activation_threshold)
    cut_off = 2  # minimal run length considered when no phrase_length is given
    if query_mode == "fast":
        num_of_cells_per_sum = 5  # how many cells are evaluated per batch
        maximal_length = int(5e5)  # only consider the first 500,000 time steps
        num_candidates = 1000
    else:
        # all time steps but still be memory efficient:
        # size the per-batch cell count so each slice stays around 5e6 values
        maximal_length = cell_states.shape[0]
        num_of_cells_per_sum = int(np.floor(5e6 / maximal_length))
        num_of_cells_per_sum = 1 if num_of_cells_per_sum == 0 else num_of_cells_per_sum
        num_candidates = 10000
    # cs_cand accumulates, per time step, how many of the selected cells are active.
    cs_cand = None
    no_slices = int(np.ceil(len(cells) * 1. / num_of_cells_per_sum))
    for c in range(0, no_slices):
        cell_range = cells[c * num_of_cells_per_sum:min((c + 1) * num_of_cells_per_sum, len(cells))]
        c_discrete = cell_states[:maximal_length, cell_range]
        # Binarize in place: 1 where value >= threshold, else 0.
        hf.threshold_discrete(c_discrete, activation_threshold_corrected, 0, 1)
        if num_of_cells_per_sum > 1:
            c_batch = np.sum(c_discrete, axis=1)
        else:
            c_batch = c_discrete
        if cs_cand is None:
            cs_cand = c_batch
        else:
            cs_cand = cs_cand + c_batch
        del c_discrete, c_batch  # free slice memory before the next batch
    # Fuzzy search: first look for runs where ALL selected cells are active,
    # then progressively relax (all-but-one, all-but-two, ...) until enough
    # candidates are collected or the requirement drops to zero cells.
    test_cell_number = len(cells)
    test_discrete = np.copy(cs_cand)
    collect_all_candidates = {}
    while test_cell_number > 0 and len(collect_all_candidates) < num_candidates:
        if test_cell_number != len(cells):
            # Cap counts so runs exceeding the current requirement merge into one run.
            test_discrete[test_discrete > test_cell_number] = test_cell_number
        # Run-length encoding: lengths, start positions, and values of constant runs.
        length, positions, value = hf.rle(test_discrete)
        if phrase_length > 0:
            indices = np.argwhere((value == test_cell_number) & (length == phrase_length))
        else:
            indices = np.argwhere((value == test_cell_number) & (length >= cut_off))
        # Build (run length, run start, boundary penalty) triples; the third
        # element counts selected cells still active just outside the run on
        # the constrained side(s) -- 0 means a clean on/off boundary.
        if constrain_left and not constrain_right:
            len_pos = set(zip(length[indices].flatten().tolist(),
                              positions[indices].flatten().tolist(),
                              (test_cell_number - value[indices - 1]).flatten().astype(int).tolist()))
        elif not constrain_left and constrain_right:
            len_pos = set(zip(length[indices].flatten().tolist(),
                              positions[indices].flatten().tolist(),
                              (test_cell_number - value[indices + 1]).flatten().astype(int).tolist()))
        elif constrain_left and constrain_right:
            len_pos = set(zip(length[indices].flatten().tolist(),
                              positions[indices].flatten().tolist(),
                              (test_cell_number - value[indices + 1] - value[indices - 1]).flatten().astype(
                                  int).tolist()))
        else:
            len_pos = set(zip(length[indices].flatten().tolist(),
                              positions[indices].flatten().tolist(),
                              np.zeros(len(indices)).astype(int).tolist()))
        for lp in len_pos:
            # Keyed by length_position; keep the first (strictest) hit for each run.
            key = '{0}_{1}'.format(lp[0], lp[1])
            llp = collect_all_candidates.get(key, lp)
            collect_all_candidates[key] = llp
        test_cell_number -= 1  # relax the requirement by one cell
    all_candidates = list(collect_all_candidates.values())
    # Sort by boundary penalty (descending) before truncating to num_candidates.
    all_candidates.sort(key=lambda kk: kk[2], reverse=True)
    all_candidates = all_candidates[:num_candidates]
    cell_count = len(cells)
    res = []
    max_pos = cell_states.shape[0]
    for cand in all_candidates:
        # positions where all pivot cells start jointly
        ml = cand[0]  # maximal length of _all_ pivot cells on
        pos = cand[1]  # position of the pattern
        # Skip runs whose off/on boundary context would fall outside the matrix.
        if pos < 1 or pos + ml + 1 > max_pos:
            continue  # TODO: find a more elegant solution
        cs = np.array(cell_states[pos - 1:pos + ml + 1, :])  # cell values of _all_ cells for the range
        hf.threshold_discrete(cs, activation_threshold_corrected, -1, 1)  # discretize
        # create pattern mask of form -1 1 1..1 -1 = off on on .. on off
        mask = np.ones(ml + 2)
        mask[0] = -1 if constrain_left else 0  # ignore if not constraint
        mask[ml + 1] = -1 if constrain_right else 0  # ignore if not constraint
        # Dot product scores each cell; a cell matching the full pattern
        # scores exactly test_pattern_length.
        cs_sum = np.dot(mask, cs)
        test_pattern_length = ml  # defines the length of the relevant pattern
        test_pattern_length += 1 if constrain_left else 0
        test_pattern_length += 1 if constrain_right else 0
        all_active_cells = np.where(cs_sum == test_pattern_length)[0]  # all cells that are active for range
        intersect = np.intersect1d(all_active_cells, cells)  # intersection with selected cells
        union = np.union1d(all_active_cells, cells)  # union with selected cells
        res.append({'pos': pos,
                    'factors': [pos, 0, ml,
                                (float(len(intersect)) / float(len(union))),  # Jaccard
                                cell_count - len(intersect),  # how many selected cells are not active
                                len(union), len(intersect)]})

    def key(elem):
        # largest intersection, smallest union, longest phrase
        return -elem['factors'][6], elem['factors'][5], -elem['factors'][2]

    meta = {}
    if add_histograms:
        meta['fuzzy_length_histogram'] = np.bincount([x['factors'][2] for x in res])
        # strict: only candidates where every selected cell is active
        meta['strict_length_histogram'] = np.bincount([x['factors'][2] for x in res if x['factors'][4] == 0])
    if phrase_length > 1:
        res = [x for x in res if x['factors'][2] == phrase_length]
    res.sort(key=key)
    final_res = list(res[:no_of_results])
    del res  # release the full candidate list before returning
    return final_res, meta
def get_states(self, pos_array, source, left=10, right=0, cell_selection=None, raw=False, round_values=5, data_transform='tanh', activation_threshold=0.3, add_active_cells=False, transpose=False, rle=0):
    """Get information about states.

    :param pos_array: array of positions
    :param source: source path in states.h5 file
    :param left: positions to the left
    :param right: positions to the right
    :param cell_selection: selection of cells (None if all cells)
    :param raw: deliver the states submatrix as numpy array (default: False)
    :param round_values: if not raw then round to round_values digits
    :param data_transform: data transformation (default: tanh) -- options: raw, tanh, tanhabs
    :param activation_threshold: activation threshold for when a cell is
        considered to be active (default: 0.3)
    :param add_active_cells: add active cell count for each position (False)
    :param transpose: transpose states sub-matrix and active cell matrix (False);
        only applied when rle == 0 (the rle branch always delivers transposed data)
    :param rle: if > 0, zero out activation runs of length <= rle
    :return: [ ...{left: position left, right: position right, pos: requested pos,
        data: states matrix},...], [sum_active]
    :rtype: (list, list)
    """
    if cell_selection is None:
        cell_selection = []
    cell_states, data_transformed = self.get_cached_matrix(data_transform, source)
    res = []
    sum_active = []
    for pos in pos_array:
        # Clamp the requested window to the matrix bounds.
        left_pos = pos - min(left, pos)
        right_pos = min(len(cell_states), pos + 1 + right)
        if len(cell_selection) == 0:
            cs = cell_states[left_pos:right_pos]
        else:
            cs = cell_states[left_pos:right_pos, cell_selection]
        if not data_transformed:
            # Apply the transformation in place on the slice copy.
            if data_transform == 'tanh':
                np.tanh(cs, cs)
            # NOTE(review): the docstring advertises 'tanhabs' but this branch
            # matches 'tanh_abs' -- confirm which spelling callers actually pass.
            if data_transform == 'tanh_abs':
                np.tanh(cs, cs)
                np.abs(cs, cs)
        sub_res = {
            'pos': pos,
            'left': left_pos,
            'right': right_pos - 1
        }
        if rle > 0:
            # Suppress short activation bursts: any run of active values with
            # length <= rle is zeroed out per cell (row of the transposed matrix).
            cs_t = np.transpose(np.copy(cs))
            disc = np.copy(cs_t)
            cs_t[cs_t < activation_threshold] = 0
            hf.threshold_discrete(disc, activation_threshold, 0, 1)
            for i in range(0, len(disc)):
                state = disc[i]
                # Renamed from 'pos' to avoid shadowing the outer loop variable.
                lengths, run_starts, values = hf.rle(state)
                # If the sequence starts inactive (values[0] == 0), active runs
                # are at odd indices; otherwise at even indices.
                offset = int(1 - values[0])
                lengths_1 = lengths[offset::2]
                starts_1 = run_starts[offset::2]
                # flatten(): argwhere yields shape (n, 1); size-1 arrays as
                # slice bounds rely on deprecated numpy behavior.
                for p in np.argwhere(lengths_1 <= rle).flatten():
                    cs_t[i, starts_1[p]:starts_1[p] + lengths_1[p]] = 0
            sub_res['data'] = cs_t if raw else [[round(y, round_values) for y in x] for x in cs_t.tolist()]
        else:
            if transpose:
                sub_res['data'] = np.transpose(cs) if raw else [[round(y, round_values) for y in x] for x in np.transpose(cs).tolist()]
            else:
                sub_res['data'] = cs if raw else [[round(y, round_values) for y in x] for x in cs.tolist()]
        # add count of active cells -- !!! cs will be destroyed here !!!
        if add_active_cells:
            activation_threshold_corrected = activation_threshold  # already tanh applied if necessary
            a = cs
            hf.threshold_discrete(a, activation_threshold_corrected, 0, 1)
            sum_active.append(np.sum(a, axis=1).tolist())
        del cs
        res.append(sub_res)
    return res, sum_active