def correct_dot_masks(masks, gain_map, excluded_pixels=None, allow_empty=False):
    mask_shape = masks.shape
    sig_shape = gain_map.shape
    masks = masks.reshape((-1, np.prod(sig_shape)))

    if excluded_pixels is not None:
        if is_sparse(masks):
            result = sparse.DOK(masks)
        else:
            result = masks.copy()
        desc = RepairDescriptor(sig_shape, excluded_pixels=excluded_pixels,
                                allow_empty=allow_empty)
        for e, r, c in zip(desc.exclude_flat, desc.repair_flat, desc.repair_counts):
            result[:, e] = 0
            rep = masks[:, e] / c
            # We have to loop because of sparse.pydata limitations
            for m in range(result.shape[0]):
                for rr in r[:c]:
                    result[m, rr] = result[m, rr] + rep[m]
        if is_sparse(result):
            result = sparse.COO(result)
    else:
        result = masks
    result = result * gain_map.flatten()
    return result.reshape(mask_shape)
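# A minimal, self-contained sketch (toy shapes and values, not from the code
# above) of the conversion pattern correct_dot_masks relies on: COO masks are
# converted to DOK for cheap pointwise edits, then back to COO for arithmetic.
import numpy as np
import sparse

_masks = sparse.zeros((4, 16), dtype=np.float64)  # hypothetical mask stack
_dok = sparse.DOK(_masks)                         # editable dict-of-keys view
_dok[:, 7] = 0.0                                  # zero out a "dead" column
_coo = sparse.COO(_dok)                           # back to COO for fast math
assert (_coo.todense() == 0).all()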
def build_trans_tables(retrieved, key, L):
    (dt, tSim, N, S, p, num_fact, p_fact, dzeta, a_pf, eps, f_russo, cm, a, U,
     w, tau_1, tau_2, tau_3_A, tau_3_B, g_A, beta, tau, t_0, g, random_seed,
     p_0, n_p, nSnap, russo2008_mode, muted_prop) = \
        file_handling.load_parameters(key)

    n_seeds = len(retrieved)
    num_tables = [[] for order in range(L + 1)]
    proba_tables = [[] for order in range(L + 1)]

    print('Table creation')
    for order in tqdm(range(L)):
        chain_length = order + 1
        num_tables[order] = sparse.DOK(
            shape=tuple([p for ii in range(chain_length)]))

    print('Fill num_tables')
    for kick_seed in tqdm(range(n_seeds)):
        for cue_ind in range(p):
            if isinstance(retrieved[kick_seed][cue_ind], list) \
                    and len(retrieved[kick_seed][cue_ind]) >= 3:
                # print(len(retrieved[kick_seed][cue_ind]))
                sequence = retrieved[kick_seed][cue_ind][3:]
                for ind_trans in range(len(sequence) - L - 1):
                    trans_string = sequence[ind_trans:ind_trans + L + 1]
                    for order in range(L):
                        string = trans_string[:order + 1]
                        num_tables[order][tuple(string)] += 1

    print('Table conversion')
    for order in tqdm(range(L)):
        num_tables[order] = sparse.COO(num_tables[order])
        proba_tables[order] = num_tables[order] / num_tables[order].sum()

    return num_tables, proba_tables
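# Minimal sketch (invented toy transitions) of the count-then-normalize
# pattern used above: accumulate n-gram counts in a DOK tensor, convert to
# COO, and divide by the total to get empirical transition probabilities.
import sparse

p = 4                                  # hypothetical number of patterns
counts = sparse.DOK(shape=(p, p))      # bigram count table
for a, b in [(0, 1), (0, 1), (1, 2)]:  # toy retrieved-pattern transitions
    counts[a, b] += 1
probs = sparse.COO(counts) / sparse.COO(counts).sum()
assert abs(probs.sum() - 1.0) < 1e-12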
def _get_k1(self, system):
    """Calculates the first order terms.

    Returns:
        1D ndarray: flattened K1 values.
    """
    grid = self.k1["grid"]
    start = grid["min"]
    stop = grid["max"]
    n = grid["n"]
    sigma = grid["sigma"]

    # Determine the geometry function
    geom_func_name = self.k1["geometry"]["function"]

    cmbtr = MBTRWrapper(
        self.atomic_number_to_index,
        self._interaction_limit,
        np.zeros((len(system), 3), dtype=int),
    )

    k1_map = cmbtr.get_k1(
        system.get_atomic_numbers(),
        geom_func_name.encode(),
        b"unity",
        {},
        start,
        stop,
        sigma,
        n,
    )
    k1_map = self._make_new_k1map(k1_map)

    # Depending on flattening, use either a sparse matrix or a dense one.
    n_elem = self.n_elements
    if self.flatten:
        k1 = sparse.DOK((n_elem * n), dtype=np.float32)
    else:
        k1 = np.zeros((n_elem, n), dtype=np.float32)

    for key, gaussian_sum in k1_map.items():
        i = key[0]

        # Denormalize if requested
        if not self.normalize_gaussians:
            max_val = 1 / (sigma * math.sqrt(2 * math.pi))
            gaussian_sum /= max_val

        if self.flatten:
            start = i * n
            end = (i + 1) * n
            k1[start:end] = gaussian_sum
        else:
            k1[i, :] = gaussian_sum
    if self.flatten:
        k1 = k1.to_coo()

    return k1
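# Standalone sketch (toy sizes) of the flattened-output pattern above: each
# element index i owns a contiguous window [i*n, (i+1)*n) of a 1-D DOK
# vector, filled by slice assignment and finalized with to_coo().
import numpy as np
import sparse

n_elem, n = 3, 5
vec = sparse.DOK((n_elem * n, ), dtype=np.float32)
for i in range(n_elem):
    vec[i * n:(i + 1) * n] = np.float32(i + 1)  # stand-in for a Gaussian sum
vec = vec.to_coo()
assert vec.shape == (n_elem * n, ) and vec.nnz == n_elem * n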
def test_mask_patch_sparse():
    for i in range(REPEATS):
        print(f"Loop number {i}")
        num_nav_dims = np.random.choice([1, 2, 3])
        num_sig_dims = np.random.choice([2, 3])

        nav_dims = tuple(np.random.randint(low=8, high=16, size=num_nav_dims))
        sig_dims = tuple(np.random.randint(low=8, high=16, size=num_sig_dims))

        # The mask-based correction is performed as float64 since it creates
        # numerical instabilities otherwise
        data = gradient_data(nav_dims, sig_dims).astype(np.float64)

        gain_map = (np.random.random(sig_dims) + 1).astype(np.float64)
        dark_image = np.random.random(sig_dims).astype(np.float64)

        exclude = exclude_pixels(sig_dims=sig_dims, num_excluded=3)

        damaged_data = data.copy()
        damaged_data /= gain_map
        damaged_data += dark_image
        damaged_data[(Ellipsis, *exclude)] = 1e24

        print("Nav dims: ", nav_dims)
        print("Sig dims:", sig_dims)
        print("Exclude: ", exclude)

        masks = sparse.DOK(sparse.zeros((20, ) + sig_dims, dtype=np.float64))
        indices = [
            np.random.randint(low=0, high=s, size=s // 2)
            for s in (20, ) + sig_dims
        ]
        for tup in zip(*indices):
            masks[tup] = 1
        masks = masks.to_coo()

        data_flat = data.reshape((np.prod(nav_dims), np.prod(sig_dims)))
        damaged_flat = damaged_data.reshape(
            (np.prod(nav_dims), np.prod(sig_dims)))

        correct_dot = sparse.dot(data_flat,
                                 masks.reshape((-1, np.prod(sig_dims))).T)
        corrected_masks = detector.correct_dot_masks(masks, gain_map, exclude)

        assert is_sparse(corrected_masks)

        reconstructed_dot = \
            sparse.dot(damaged_flat,
                       corrected_masks.reshape((-1, np.prod(sig_dims))).T) \
            - sparse.dot(dark_image.flatten(),
                         corrected_masks.reshape((-1, np.prod(sig_dims))).T)

        _check_result(data=correct_dot, corrected=reconstructed_dot,
                      atol=1e-8, rtol=1e-5)
def matrix_load(self, path):
    '''Loads a previously saved matrix from a .npz file (containing the
    matrix) and a .nfo file (containing the matrix tags).
    '''
    # load matrix
    matrix = sparse.load_npz(os.path.splitext(path)[0] + '.npz')
    matrix = sparse.DOK(matrix)  # convert to dict-of-keys for faster indexing

    # load matrix tags
    with open(os.path.splitext(path)[0] + '.nfo', 'rb') as f:
        tags = pickle.load(f)

    return matrix, tags
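# Hedged round-trip sketch for matrix_load; the file pair is produced here by
# hand since the matching save method is not shown (names are assumptions).
import os
import pickle
import tempfile

import numpy as np
import sparse

_path = os.path.join(tempfile.mkdtemp(), 'tagmat')
sparse.save_npz(_path + '.npz', sparse.COO.from_numpy(np.eye(3)))
with open(_path + '.nfo', 'wb') as f:
    pickle.dump(['rock', 'pop', 'folk'], f)
# matrix, tags = self.matrix_load(_path)  # -> DOK matrix and the tag list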
def overwrite(mpx, out=None):
    """Overwrites the tensors of `out` with the tensors of `mpx`, keeping
    the shapes of the `out` tensors fixed.

    Parameters
    ----------
    mpx : MPX (source)
    out : MPX (target) [modified]
    """
    L = len(mpx)
    for i in range(L):
        m1 = sp.DOK.from_coo(out[i])
        m2 = sp.DOK(mpx[i])
        for coord in m2.data:
            m1[coord] = m2[coord]
        out[i] = m1.to_coo()
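# Toy usage sketch for overwrite; plain lists of COO tensors stand in for the
# MPX objects here (an assumption made purely for illustration).
import numpy as np
import sparse as sp

src = [sp.COO.from_numpy(np.array([[1, 0], [0, 2]]))]
dst = [sp.COO.from_numpy(np.array([[9, 9], [9, 9]]))]
overwrite(src, out=dst)  # nonzeros of src[0] written into dst[0]
assert (dst[0].todense() == np.array([[1, 9], [9, 2]])).all()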
def resolve_relations(db_file, rel_file, meta_file, id_file):
    """Builds (or loads cached) unique event IDs and a sparse
    (event, event, relation) tensor from the Relations table in `db_file`.
    """
    conn = open_db_connection(db_file)
    c = conn.cursor()

    # load or compute unique IDs
    if os.path.isfile(meta_file):
        meta = np.load(meta_file)
        off = meta[0]
        num_unique = meta[1]
        unique_ids = np.load(id_file)
    else:
        off = 0
        c.execute("SELECT DISTINCT event1_id FROM Relations;")
        event_ids = set(c.fetchall())
        for id2 in c.execute("SELECT event2_id FROM Relations;"):
            if id2 not in event_ids:
                event_ids.add(id2)
        unique_ids = np.char.array(list(event_ids))
        num_unique = len(event_ids)
        np.save(id_file, unique_ids)
        np.save(meta_file, np.array([off, num_unique]))

    id_lookup = dict()
    for i, id_entr in enumerate(unique_ids):
        id_lookup[id_entr[0]] = i

    # load or compute (compressed) relations
    if os.path.isfile(rel_file):
        relations = sparse.load_npz(rel_file)
    else:
        relations = sparse.DOK((num_unique, num_unique, RELATION_COUNT),
                               dtype=np.float32)
        for row in c.execute("SELECT * FROM Relations;"):
            id_out = row[1]
            id_in = row[2]
            relations[id_lookup[id_out], id_lookup[id_in], :] = row[3:]
        relations = sparse.COO(relations)
        sparse.save_npz(rel_file, relations)

    conn.close()
def bench_test_fused_pydata(tacoBench, num, pt1):
    loader = ImagePydataSparseTensorLoader()
    sparse_bin_img1 = safeCastPydataTensorToInts(loader.sparse_image(num, pt1, 1))
    sparse_bin_img2 = safeCastPydataTensorToInts(loader.sparse_image(num, pt1 + 0.05, 2))
    sparse_bin_window = loader.sparse_window(num, 3)
    bin_img1 = loader.dense_image(num, pt1, 1)
    bin_img2 = loader.dense_image(num, pt1 + 0.05, 2)
    bin_window = loader.dense_window(num)

    def sparse_bench():
        return testOp(sparse_bin_img1, sparse_bin_img2, sparse_bin_window).astype('int')

    def dense_bench():
        return testOp(bin_img1, bin_img2, bin_window).astype('int')

    ret = tacoBench(sparse_bench)
    sparse_xor_img = sparse_bench()
    xor_img = dense_bench()

    # Write result to TNS file to see what's different
    shape = xor_img.shape
    result = sparse.COO.from_numpy(xor_img, fill_value=0)
    dok = sparse.DOK(result)
    TnsFileDumper().dump_dict_to_file(
        shape, dok.data, os.path.join("temp", "numpy-result-{}.tns".format(num)))

    num_elements = float(np.prod(bin_img1.shape))
    f = sparse_xor_img.fill_value
    print("shape1", sparse_bin_img1.shape)
    print("shape2", sparse_bin_img2.shape)
    print("sparse img1 nnz =", sparse_bin_img1.nnz, " ", np.sum(bin_img1 != 0))
    print("sparse img2 nnz =", sparse_bin_img2.nnz, " ", np.sum(bin_img2 != 0))
    print("sparse win nnz =", sparse_bin_window.nnz, " ", np.sum(bin_window != 0))
    print("Total num elements", num_elements)
    print("Fill value", f)
    print("Sparse xor NNF = ", sparse_xor_img.nnz, "\t",
          "Dense xor NNF = ", np.sum(xor_img != int(f)))
    print("Dense xor NNZ = ", np.sum(xor_img != 0))
    assert sparse_xor_img.nnz == np.sum(xor_img != 1)
def correct_dot_masks(masks, gain_map, excluded_pixels=None):
    mask_shape = masks.shape
    sig_shape = gain_map.shape
    masks = masks.reshape((-1, np.prod(sig_shape)))

    if excluded_pixels is not None:
        if is_sparse(masks):
            result = sparse.DOK(masks)
        else:
            result = masks.copy()
        repairs = environments(excluded_pixels, sig_shape)
        for e, r in zip(*flatten_filter(excluded_pixels, repairs, sig_shape)):
            result[:, e] = 0
            rep = masks[:, e] / len(r)
            # We have to loop because of sparse.pydata limitations
            for m in range(result.shape[0]):
                for rr in r:
                    result[m, rr] = result[m, rr] + rep[m]
        if is_sparse(result):
            result = sparse.COO(result)
    else:
        result = masks
    result = result * gain_map.flatten()
    return result.reshape(mask_shape)
def get_rel_counts(ds_name, must_overlap=True):
    """Get counts of all of the relations. Used for modeling directly
    P(rel | o1, o2).

    :param ds_name:
    :param must_overlap:
    :return:
    """
    if ds_name.find('vg') >= 0:
        with open(cfg.DATA_DIR + '/vg/rel_annotations_train.json') as f:
            train_data = json.load(f)
    elif ds_name.find('vrd') >= 0:
        with open(cfg.DATA_DIR + '/vrd/new_annotations_train.json') as f:
            train_data = json.load(f)
    else:
        raise NotImplementedError

    sparse_fg_matrix = sparse.DOK(
        (
            cfg.MODEL.NUM_CLASSES - 1,  # does not include background
            cfg.MODEL.NUM_CLASSES - 1,  # does not include background
            cfg.MODEL.NUM_PRD_CLASSES + 1,  # includes background
        ),
        dtype=np.int64)
    sparse_bg_matrix = sparse.DOK(
        (
            cfg.MODEL.NUM_CLASSES - 1,  # does not include background
            cfg.MODEL.NUM_CLASSES - 1,  # does not include background
        ),
        dtype=np.int64)

    for _, im_rels in train_data.items():
        # get all object boxes
        gt_box_to_label = {}
        for i, rel in enumerate(im_rels):
            sbj_box = box_utils.y1y2x1x2_to_x1y1x2y2(rel['subject']['bbox'])
            obj_box = box_utils.y1y2x1x2_to_x1y1x2y2(rel['object']['bbox'])
            sbj_lbl = rel['subject']['category']  # does not include background
            obj_lbl = rel['object']['category']  # does not include background
            prd_lbl = rel['predicate']  # does not include background
            if tuple(sbj_box) not in gt_box_to_label:
                gt_box_to_label[tuple(sbj_box)] = sbj_lbl
            if tuple(obj_box) not in gt_box_to_label:
                gt_box_to_label[tuple(obj_box)] = obj_lbl
            sparse_fg_matrix[sbj_lbl, obj_lbl, prd_lbl + 1] += 1

        if cfg.MODEL.USE_OVLP_FILTER:
            if len(gt_box_to_label):
                gt_boxes = np.array(list(gt_box_to_label.keys()), dtype=np.int32)
                gt_classes = np.array(list(gt_box_to_label.values()), dtype=np.int32)
                o1o2_total = gt_classes[np.array(
                    box_filter(gt_boxes, must_overlap=must_overlap), dtype=int)]
                for (o1, o2) in o1o2_total:
                    sparse_bg_matrix[o1, o2] += 1
        else:
            # consider all pairs of boxes, overlapped or non-overlapped
            for b1, l1 in gt_box_to_label.items():
                for b2, l2 in gt_box_to_label.items():
                    if b1 == b2:
                        continue
                    sparse_bg_matrix[l1, l2] += 1

    return sparse_fg_matrix.to_coo(), sparse_bg_matrix.to_coo()
def test_empty_dok_dtype():
    d = sparse.DOK(5, dtype=np.uint8)
    s = sparse.COO(d)
    assert s.dtype == d.dtype
def _dok_like(a, drop_dims=("c", ), dtype="uint8"):
    dims = tuple(d for d in a.dims if d not in drop_dims)
    shape = tuple(a.sizes[d] for d in dims)
    return xarray.DataArray(sparse.DOK(shape=shape, dtype=dtype), dims=dims)
def create_interaction_list(interaction_df, num_individuals, fps=3, ringbuffer_size=5):
    """Extracts interaction events from a dataframe of pairwise detections,
    median-filtering over `ringbuffer_size` consecutive frames and recording
    (timestamp, bee_id_a, bee_id_b) whenever an interaction stops.
    """
    ts = interaction_df.timestamp.min()
    rbs = ringbuffer_size
    rbs_hp = (rbs // 2) + 1

    # cumulative interactions over whole period
    previous_interactions = sparse.COO([], shape=(num_individuals, num_individuals))
    # frame sliding window
    interaction_ringbuffer = [
        sparse.COO([], shape=(num_individuals, num_individuals)) for i in range(rbs)
    ]
    # cumulative interactions for all cameras within a 1/fps frame period
    # == 1 frame combined for all cameras
    current_interactions = sparse.COO([], shape=(num_individuals, num_individuals))

    interval_counter = 0
    events = []

    print("Number of events {}".format(len(interaction_df)), flush=True)
    print("Number of timestamps {}".format(
        len(interaction_df.timestamp.unique())), flush=True)

    for timestamp, group in list(
            interaction_df.sort_values("timestamp").groupby("timestamp")):
        # still within current time interval
        if (timestamp - ts) < datetime.timedelta(milliseconds=int(900 / fps)):
            pass
        # end of current time interval
        else:
            # count as interaction if more than half of rbs consecutive frames
            # had interactions == median filter over temporal dimension with
            # kernel size rbs
            if interval_counter >= rbs:
                new_interactions = sparse.stack(interaction_ringbuffer).sum(
                    axis=0) > rbs_hp
                stopped_interactions = (previous_interactions.astype(int) -
                                        new_interactions.astype(int)) == 1
                if stopped_interactions.sum() > 0:
                    for bee_id_a, bee_id_b in np.argwhere(stopped_interactions):
                        events.append((timestamp, bee_id_a, bee_id_b))
                previous_interactions = new_interactions

            interaction_ringbuffer[interval_counter % rbs] = current_interactions

            # new time interval => reset adjacency matrix and timestamp
            current_interactions = sparse.COO(
                [], shape=(num_individuals, num_individuals))
            ts = group.timestamp.min()
            interval_counter += 1

        # interaction adjacency matrix
        adj_data = {(min(k), max(k)): 1 for k in tuple(group["bee_id"].values)}
        adj = sparse.DOK(shape=(num_individuals, num_individuals), data=adj_data)

        # logical or => accumulate interactions from different cameras
        # for current time interval (~1/fps of a second)
        current_interactions += adj
        current_interactions.clip(0, 1, current_interactions)

    return events
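# Minimal sketch (toy ids) of the adjacency construction used for `adj`
# above: sparse.DOK accepts a {(row, col): value} dict directly.
import sparse

_pairs = [(3, 1), (0, 2)]                      # hypothetical id pairs
_data = {(min(k), max(k)): 1 for k in _pairs}  # canonical (low, high) keys
_adj = sparse.DOK(shape=(5, 5), data=_data).to_coo()
assert _adj.nnz == 2 and _adj.todense()[1, 3] == 1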
def _get_k2(self, system, new_system, indices):
    """Calculates the second order terms where the scalar mapping is the
    inverse distance between atoms.

    Returns:
        1D ndarray: flattened K2 values.
    """
    grid = self.k2["grid"]
    start = grid["min"]
    stop = grid["max"]
    n = grid["n"]
    sigma = grid["sigma"]

    # Determine the weighting function and possible radial cutoff
    radial_cutoff = None
    weighting = self.k2.get("weighting")
    parameters = {}
    if weighting is not None:
        weighting_function = weighting["function"]
        if weighting_function == "exponential" or weighting_function == "exp":
            scale = weighting["scale"]
            threshold = weighting["threshold"]
            if scale != 0:
                radial_cutoff = -math.log(threshold) / scale
            parameters = {
                b"scale": weighting["scale"],
                b"threshold": weighting["threshold"],
            }
    else:
        weighting_function = "unity"

    # Determine the geometry function
    geom_func_name = self.k2["geometry"]["function"]

    # Calculate extended system
    if self.periodic:
        centers = new_system.get_positions()
        ext_system, cell_indices = dscribe.utils.geometry.get_extended_system(
            system,
            radial_cutoff,
            centers,
            return_cell_indices=True,
        )
        ext_system = System.from_atoms(ext_system)
    else:
        ext_system = system
        cell_indices = np.zeros((len(system), 3), dtype=int)

    cmbtr = MBTRWrapper(self.atomic_number_to_index,
                        self._interaction_limit, cell_indices)

    # If radial cutoff is finite, use it to calculate the sparse distance
    # matrix to reduce computational complexity from O(n^2) to O(n log(n)).
    # If radial cutoff is not available, calculate full matrix.
    n_atoms_ext = len(ext_system)
    n_atoms_new = len(new_system)
    ext_pos = ext_system.get_positions()
    new_pos = new_system.get_positions()
    if radial_cutoff is not None:
        dmat = new_system.get_distance_matrix_within_radius(radial_cutoff,
                                                            pos=ext_pos)
        adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
        dmat_dense = np.full(
            (n_atoms_new, n_atoms_ext), sys.float_info.max
        )  # The non-neighbor values are treated as "infinitely far".
        dmat_dense[dmat.row, dmat.col] = dmat.data
    else:
        dmat_dense = scipy.spatial.distance.cdist(new_pos, ext_pos)
        adj_list = np.tile(np.arange(n_atoms_ext), (n_atoms_new, 1))

    # Form new indices that include the existing atoms and the newly added
    # ones
    indices = np.array(
        np.append(indices,
                  [n_atoms_ext + i for i in range(n_atoms_new - len(indices))]),
        dtype=int,
    )

    k2_list = cmbtr.get_k2_local(
        indices,
        ext_system.get_atomic_numbers(),
        dmat_dense,
        adj_list,
        geom_func_name.encode(),
        weighting_function.encode(),
        parameters,
        start,
        stop,
        sigma,
        n,
    )
    k2_list = self._make_new_klist_local(k2_list)

    # Depending on flattening, use either a sparse matrix or a dense one.
    n_elem = self.n_elements
    n_loc = len(indices)
    if self.flatten:
        k2 = sparse.DOK((n_loc, n_elem * n), dtype=np.float32)

        for i_loc, k2_map in enumerate(k2_list):
            for key, gaussian_sum in k2_map.items():
                i = key[1]
                m = i
                start = int(m * n)
                end = int((m + 1) * n)

                # Denormalize if requested
                if not self.normalize_gaussians:
                    max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                    gaussian_sum /= max_val

                k2[i_loc, start:end] = gaussian_sum
        k2 = k2.to_coo()
    else:
        k2 = np.zeros((n_loc, n_elem, n), dtype=np.float32)
        for i_loc, k2_map in enumerate(k2_list):
            for key, gaussian_sum in k2_map.items():
                i = key[1]

                # Denormalize if requested
                if not self.normalize_gaussians:
                    max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                    gaussian_sum /= max_val

                k2[i_loc, i, :] = gaussian_sum

    return k2
def _get_k3(self, system, new_system, indices):
    """Calculates the third order terms.

    Returns:
        1D ndarray: flattened K3 values.
    """
    grid = self.k3["grid"]
    start = grid["min"]
    stop = grid["max"]
    n = grid["n"]
    sigma = grid["sigma"]

    # Determine the weighting function and possible radial cutoff
    radial_cutoff = None
    weighting = self.k3.get("weighting")
    parameters = {}
    if weighting is not None:
        weighting_function = weighting["function"]
        if weighting_function == "exponential" or weighting_function == "exp":
            scale = weighting["scale"]
            threshold = weighting["threshold"]
            if scale != 0:
                radial_cutoff = -0.5 * math.log(threshold) / scale
            parameters = {b"scale": scale, b"threshold": threshold}
    else:
        weighting_function = "unity"

    # Determine the geometry function
    geom_func_name = self.k3["geometry"]["function"]

    # Calculate extended system
    if self.periodic:
        centers_new = new_system.get_positions()
        centers_existing = system.get_positions()[indices]
        centers = np.concatenate((centers_new, centers_existing), axis=0)
        ext_system, cell_indices = dscribe.utils.geometry.get_extended_system(
            system,
            radial_cutoff,
            centers,
            return_cell_indices=True,
        )
        ext_system = System.from_atoms(ext_system)
    else:
        ext_system = system
        cell_indices = np.zeros((len(system), 3), dtype=int)

    cmbtr = MBTRWrapper(self.atomic_number_to_index,
                        self._interaction_limit, cell_indices)

    # If radial cutoff is finite, use it to calculate the sparse
    # distance matrix to reduce computational complexity from O(n^2) to
    # O(n log(n))
    fin_system = ext_system + new_system
    n_atoms_ext = len(ext_system)
    n_atoms_fin = len(fin_system)
    n_atoms_new = len(new_system)
    ext_pos = ext_system.get_positions()
    new_pos = new_system.get_positions()
    if radial_cutoff is not None:
        # Calculate distances within the extended system
        dmat_ext_to_ext = ext_system.get_distance_matrix_within_radius(
            radial_cutoff, pos=ext_pos)
        col = dmat_ext_to_ext.col
        row = dmat_ext_to_ext.row
        data = dmat_ext_to_ext.data
        dmat = scipy.sparse.coo_matrix((data, (row, col)),
                                       shape=(n_atoms_fin, n_atoms_fin))

        # Calculate the distances from the new positions to atoms in the
        # extended system using the cutoff
        if len(new_pos) != 0:
            dmat_ext_to_new = ext_system.get_distance_matrix_within_radius(
                radial_cutoff, pos=new_pos)
            col = dmat_ext_to_new.col
            row = dmat_ext_to_new.row
            data = dmat_ext_to_new.data
            dmat.col = np.append(dmat.col, col + n_atoms_ext)
            dmat.row = np.append(dmat.row, row)
            dmat.data = np.append(dmat.data, data)
            dmat.col = np.append(dmat.col, row)
            dmat.row = np.append(dmat.row, col + n_atoms_ext)
            dmat.data = np.append(dmat.data, data)

        # Calculate adjacencies and transform to the dense matrix for
        # sending information to C++
        adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
        dmat_dense = np.full(
            (n_atoms_fin, n_atoms_fin), sys.float_info.max
        )  # The non-neighbor values are treated as "infinitely far".
        dmat_dense[dmat.row, dmat.col] = dmat.data
    # If no weighting is used, the full distance matrix is calculated
    else:
        dmat = scipy.sparse.lil_matrix((n_atoms_fin, n_atoms_fin))

        # Fill in block for extended system
        dmat_ext_to_ext = ext_system.get_distance_matrix()
        dmat[0:n_atoms_ext, 0:n_atoms_ext] = dmat_ext_to_ext

        # Fill in block for extended system to new system
        dmat_ext_to_new = scipy.spatial.distance.cdist(ext_pos, new_pos)
        dmat[0:n_atoms_ext, n_atoms_ext:n_atoms_ext + n_atoms_new] = dmat_ext_to_new
        dmat[n_atoms_ext:n_atoms_ext + n_atoms_new, 0:n_atoms_ext] = dmat_ext_to_new.T

        # Calculate adjacencies and the dense version
        dmat = dmat.tocoo()
        adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
        dmat_dense = np.full(
            (n_atoms_fin, n_atoms_fin), sys.float_info.max
        )  # The non-neighbor values are treated as "infinitely far".
        dmat_dense[dmat.row, dmat.col] = dmat.data

    # Form new indices that include the existing atoms and the newly added
    # ones
    indices = np.array(
        np.append(indices, [n_atoms_ext + i for i in range(n_atoms_new)]),
        dtype=int)

    k3_list = cmbtr.get_k3_local(
        indices,
        fin_system.get_atomic_numbers(),
        dmat_dense,
        adj_list,
        geom_func_name.encode(),
        weighting_function.encode(),
        parameters,
        start,
        stop,
        sigma,
        n,
    )
    k3_list = self._make_new_klist_local(k3_list)

    # Depending on flattening, use either a sparse matrix or a dense one.
    n_elem = self.n_elements
    n_loc = len(indices)
    if self.flatten:
        k3 = sparse.DOK((n_loc, int((n_elem * (3 * n_elem - 1) * n / 2))),
                        dtype=np.float32)

        for i_loc, k3_map in enumerate(k3_list):
            for key, gaussian_sum in k3_map.items():
                i = key[0]
                j = key[1]
                k = key[2]

                # This is the index of the spectrum. It is given by
                # enumerating the elements of a three-dimensional array and
                # only considering elements for which k>=i and i || j == 0.
                # The enumeration begins from [0, 0, 0], and ends at
                # [n_elem, n_elem, n_elem], looping the elements in the
                # order k, i, j.
                if j == 0:
                    m = k + i * n_elem - i * (i + 1) / 2
                else:
                    m = n_elem * (n_elem + 1) / 2 + (j - 1) * n_elem + k

                start = int(m * n)
                end = int((m + 1) * n)

                # Denormalize if requested
                if not self.normalize_gaussians:
                    max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                    gaussian_sum /= max_val

                k3[i_loc, start:end] = gaussian_sum
        k3 = k3.to_coo()
    else:
        k3 = np.zeros((n_loc, n_elem, n_elem, n_elem, n), dtype=np.float32)
        for i_loc, k3_map in enumerate(k3_list):
            for key, gaussian_sum in k3_map.items():
                i = key[0]
                j = key[1]
                k = key[2]

                # Denormalize if requested
                if not self.normalize_gaussians:
                    max_val = 1 / (sigma * math.sqrt(2 * math.pi))
                    gaussian_sum /= max_val

                k3[i_loc, i, j, k, :] = gaussian_sum

    return k3
def _get_k3(self, system):
    """Calculates the third order terms.

    Returns:
        1D ndarray: flattened K3 values.
    """
    grid = self.k3["grid"]
    start = grid["min"]
    stop = grid["max"]
    n = grid["n"]
    sigma = grid["sigma"]

    # Determine the weighting function and possible radial cutoff
    radial_cutoff = None
    weighting = self.k3.get("weighting")
    parameters = {}
    if weighting is not None:
        weighting_function = weighting["function"]
        if weighting_function == "exp" or weighting_function == "exponential":
            scale = weighting["scale"]
            threshold = weighting["threshold"]
            if scale != 0:
                radial_cutoff = -0.5 * math.log(threshold) / scale
            parameters = {b"scale": scale, b"threshold": threshold}
    else:
        weighting_function = "unity"

    # Determine the geometry function
    geom_func_name = self.k3["geometry"]["function"]

    # If needed, create the extended system
    if self.periodic:
        centers = system.get_positions()
        ext_system, cell_indices = dscribe.utils.geometry.get_extended_system(
            system, radial_cutoff, centers, return_cell_indices=True)
        ext_system = System.from_atoms(ext_system)
    else:
        ext_system = system
        cell_indices = np.zeros((len(system), 3), dtype=int)

    cmbtr = MBTRWrapper(self.atomic_number_to_index,
                        self._interaction_limit, cell_indices)

    # If radial cutoff is finite, use it to calculate the sparse
    # distance matrix to reduce computational complexity from O(n^2) to
    # O(n log(n))
    n_atoms = len(ext_system)
    if radial_cutoff is not None:
        dmat = ext_system.get_distance_matrix_within_radius(radial_cutoff)
        adj_list = dscribe.utils.geometry.get_adjacency_list(dmat)
        dmat_dense = np.full(
            (n_atoms, n_atoms), sys.float_info.max
        )  # The non-neighbor values are treated as "infinitely far".
        dmat_dense[dmat.col, dmat.row] = dmat.data
    # If no weighting is used, the full distance matrix is calculated
    else:
        dmat_dense = ext_system.get_distance_matrix()
        adj_list = np.tile(np.arange(n_atoms), (n_atoms, 1))

    k3_map = cmbtr.get_k3(
        ext_system.get_atomic_numbers(),
        dmat_dense,
        adj_list,
        geom_func_name.encode(),
        weighting_function.encode(),
        parameters,
        start,
        stop,
        sigma,
        n,
    )
    k3_map = self._make_new_kmap(k3_map)

    # Depending on flattening, use either a sparse matrix or a dense one.
    n_elem = self.n_elements
    if self.flatten:
        k3 = sparse.DOK((int(n_elem * n_elem * (n_elem + 1) / 2 * n)),
                        dtype=np.float32)
    else:
        k3 = np.zeros((n_elem, n_elem, n_elem, n), dtype=np.float32)

    for key, gaussian_sum in k3_map.items():
        i = key[0]
        j = key[1]
        k = key[2]

        # This is the index of the spectrum. It is given by enumerating the
        # elements of a three-dimensional array where for valid elements
        # k>=i. The enumeration begins from [0, 0, 0], and ends at [n_elem,
        # n_elem, n_elem], looping the elements in the order j, i, k.
        m = int(j * n_elem * (n_elem + 1) / 2 + k + i * n_elem - i * (i + 1) / 2)

        # Denormalize if requested
        if not self.normalize_gaussians:
            max_val = 1 / (sigma * math.sqrt(2 * math.pi))
            gaussian_sum /= max_val

        if self.flatten:
            start = m * n
            end = (m + 1) * n
            k3[start:end] = gaussian_sum
        else:
            k3[i, j, k, :] = gaussian_sum
    if self.flatten:
        k3 = k3.to_coo()

    return k3
def test_coo_fv_interface():
    s1 = sparse.full((5, 5), fill_value=1 + np.random.rand())
    s2 = sparse.DOK(s1)
    assert_eq(s1, s2)
    s3 = sparse.COO(s2)
    assert_eq(s1, s3)
def fit(self, training_caption_dict, image_object_dict, num_categories,
        train_markov=True, train_object_word=True):

    def create_ngram(tokens, n):
        """Enumerate all ngrams from the list of tokens, with automatic
        start and end paddings."""
        tokens_with_end = tokens + [self.end_token_index]
        return [tuple([self.start_token_index] * max(0, n - i - 1) +
                      tokens_with_end[max(0, i + 1 - n):i + 1])
                for i in range(len(tokens_with_end))]

    # captions are yet to have start/end tokens added
    # the unknown token depends on the data, so do not add it artificially
    unique_words = {Constant.start_token, Constant.end_token}
    unmatch_count = 0
    matched_count = 0
    for img_id, ngram_lists in training_caption_dict.items():
        # make sure that the training data exists in both datasets
        if img_id not in image_object_dict:
            unmatch_count += 1
            continue
        matched_count += 1
        for ngrams in ngram_lists:
            unique_words.update(ngrams)
    print(f"{matched_count} images will be used for training")
    print(f"{unmatch_count} images unmatched")
    print(len(unique_words), "unique words")

    word_encoder = LabelEncoder()
    word_encoder.fit(list(unique_words))
    self.word_encoder = word_encoder
    self.start_token_index, self.end_token_index = word_encoder.transform(
        [Constant.start_token, Constant.end_token])
    self.num_words = len(unique_words)
    self.num_obj_cats = num_categories

    if train_markov:
        # Count(w_t)
        word_count = np.zeros(self.num_words)
        # Count(w_t-2, w_t-1, w_t)
        state_transition_occurrence_matrix = sparse.DOK(
            [self.num_words] * self.ngram_n)
    if train_object_word:
        # P(obj_cat | w_t)
        # flatten grid index dimension
        object_word_occurrence = np.zeros(
            (self.num_obj_cats * self.grid_size ** 2, self.num_words))

    # MLE
    for img_id, sentence_lists in training_caption_dict.items():
        # make sure that the training data exists in both datasets
        if img_id not in image_object_dict:
            continue
        object_list = image_object_dict[img_id]
        for sentence in sentence_lists:
            encoded_sentence = word_encoder.transform(sentence).tolist()
            # add 1 start and end token per sentence for counting purposes
            for word in [self.start_token_index, self.end_token_index] + encoded_sentence:
                if train_markov:
                    # add to word prob
                    word_count[word] += 1
                if train_object_word:
                    # add to object-word prob
                    for object_id, grid_ids in object_list.items():
                        for grid_id in grid_ids:
                            object_word_occurrence[
                                self.num_obj_cats * grid_id + object_id][word] += 1
            if train_markov:
                # add to markov chain prob; create_ngram automatically pads
                # the start and end of the encoded sentence
                for ngram in create_ngram(encoded_sentence, self.ngram_n):
                    state_transition_occurrence_matrix[ngram] += 1

    if train_markov:
        # P(w_t-2, w_t-1 | w_t)
        self.state_transition_prob_matrix = \
            state_transition_occurrence_matrix / word_count  # automatically converts from DOK to COO
        # P(w_t-2 | w_t-1)
        self.denominator_conditional_prob_matrix = \
            state_transition_occurrence_matrix.to_coo().sum(-1) / word_count

        # impute the counts of the <start> and <end> tokens as the average
        # count of all other regular words; this alleviates the problem of
        # the <end> token being generated too soon
        word_count_copy = word_count.copy()
        mask = np.ones(len(word_count_copy), dtype=bool)
        mask[[self.start_token_index, self.end_token_index]] = False
        word_count_copy[[self.start_token_index, self.end_token_index]] = \
            word_count_copy[mask].mean()
        # P(w_t)
        self.word_log_prob = np.log(word_count_copy / word_count_copy.sum())
        # for debugging purposes
        self.word_count = word_count
    if train_object_word:
        self.object_word_prob = object_word_occurrence / word_count
def test_dok_dask_array_is_sparse():
    assert utils.is_dask_array_sparse(da.from_array(sparse.DOK((10, 10))))
def test_dok_indexing():
    s = sparse.DOK((3, 3))
    s[1, 2] = 0.5
    x = s.todense()
    assert_eq(x[1::-1], s[1::-1])
def matrix(self, lastfm, tags=None, dim=3, save_to=None):
    '''Computes an n-dimensional matrix where the (i_1, ..., i_n)-th entry
    contains the number of tracks having all of the i_1-th, ..., i_n-th tags
    (where the i's are the indexes in self.m_tags).

    Notes
    -----
    To optimize performance, values are computed only for one ordering of
    the indexes (which means, we only compute the number of tracks having
    tag-0 and tag-1, not vice versa). This is something to keep in mind
    when indexing the matrix.

    To optimize memory, the matrix is saved in sparse format. DOK is the
    preferred sparse format for building and indexing, while COO is the
    preferred sparse format to perform mathematical operations.

    The dimension of the matrix determines the kind of queries you will be
    able to perform. A matrix of dim=2 on tags=['rock', 'pop', 'hip-hop']
    will capture how many tracks have tags rock and pop, or pop and
    hip-hop, but not rock, pop and hip-hop at the same time. A matrix of
    dim=len(tags) will fully describe the database (or the subset of the
    database having the given tags). A matrix of dim>len(tags) will be
    rather pointless (but we won't prevent you from doing it).

    Parameters
    ----------
    lastfm: LastFm, LastFm2Pandas
        Instance of tags database. Using LastFm2Pandas is strongly
        recommended here.

    tags: list
        List of tags to use. If None, all the tags will be used.

    dim: int
        The dimension of the matrix.

    save_to: str
        Filename or full path of the .npz file to save the matrix and the
        matrix tags. Pass it to load_from in the future.
    '''
    # initialize matrix tags
    if tags is None:
        tags = lastfm.get_tags()
    else:
        # possibly purge nonexistent tags
        tags = [tag for tag in tags if tag in lastfm.get_tags()]

    # initialize matrix: a sparse dict-of-keys matrix (easy to build,
    # awful for calculations)
    matrix = sparse.DOK((len(tags), ) * dim, dtype=np.int32)

    # compute the total number of steps to completion
    # (see http://www.iosrjournals.org/iosr-jm/papers/Vol8-issue3/A0830110.pdf)
    n_steps = crazysum(n=len(tags), s=3, k=dim - 1)

    # check whether a progress bar is needed
    verbose = n_steps > 100
    if verbose:
        progbar = Progbar(n_steps)  # instantiate progress bar

    def count_intersect_tags(tags):
        tids_list = [lastfm.with_tag(tag) for tag in tags]
        tids_list.sort(key=len, reverse=True)
        # start with the shortest list of tids to improve performance;
        # convert to a set to be able to intersect
        tids = set(tids_list.pop())
        for _ in range(len(tids_list)):
            # intersections performed from the shortest list to the longest
            tids = tids.intersection(tids_list.pop())
        return len(tids)  # how many tids have all the tags

    def count_intersect_tags_recursive(tags_idxs, dim):
        # recursively iterate count_intersect_tags dim times; avoid
        # repetitions such as 'rock AND pop AND folk' vs. 'rock AND folk
        # AND pop' vs. 'folk AND pop AND rock'
        if dim >= 1:
            for i in range(tags_idxs[-1] + 1):
                count_intersect_tags_recursive(tags_idxs + (i, ), dim - 1)
        else:
            # add count to sparse matrix
            matrix[tags_idxs] = count_intersect_tags(np.take(tags, tags_idxs))
            if verbose:
                progbar.add(1)

    # instantiate recursive loop
    for i in range(len(tags)):
        count_intersect_tags_recursive((i, ), dim - 1)

    matrix = matrix.to_coo()  # convert to coordinate matrix

    if save_to is not None:
        # save matrix; default to compressed (i.e. sparse) format
        sparse.save_npz(save_to, matrix)
        # save matrix tags in serialized format
        with open(os.path.splitext(save_to)[0] + '.nfo', 'wb') as f:
            pickle.dump(tags, f)

    return matrix, tags
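# Hedged usage note for matrix(); the LastFm instance, `builder` object, and
# tag names below are assumptions. Since counts are stored for only one
# ordering of the indexes (the recursion above appends indexes that never
# exceed the previous one), query with the first index being the largest:
#
#   matrix, tags = builder.matrix(lastfm, tags=['rock', 'pop'], dim=2)
#   i, j = tags.index('rock'), tags.index('pop')
#   n_tracks_with_both = matrix.todense()[max(i, j), min(i, j)]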