def test_ball_tree_pickle():
    np.random.seed(0)
    X = np.random.random((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    # note: query() returns (dist, ind); the swapped names below are harmless
    # because the original tree and the unpickled copy are compared consistently
    ind1, dist1 = bt1.query(X)
    ind1_pyfunc, dist1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        ind2, dist2 = bt2.query(X)
        ind2_pyfunc, dist2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(ind1, ind2)
        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)
        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
def test_ball_tree_pickle():
    rng = check_random_state(0)
    X = rng.random_sample((10, 3))

    bt1 = BallTree(X, leaf_size=1)
    # Test if BallTree with callable metric is picklable
    bt1_pyfunc = BallTree(X, metric=dist_func, leaf_size=1, p=2)

    ind1, dist1 = bt1.query(X)
    ind1_pyfunc, dist1_pyfunc = bt1_pyfunc.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        s_pyfunc = pickle.dumps(bt1_pyfunc, protocol=protocol)
        bt2_pyfunc = pickle.loads(s_pyfunc)

        ind2, dist2 = bt2.query(X)
        ind2_pyfunc, dist2_pyfunc = bt2_pyfunc.query(X)

        assert_array_almost_equal(ind1, ind2)
        assert_array_almost_equal(dist1, dist2)
        assert_array_almost_equal(ind1_pyfunc, ind2_pyfunc)
        assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc)
        assert isinstance(bt2, BallTree)

    for protocol in (0, 1, 2):
        check_pickle_protocol(protocol)
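# The two pickle tests above reference a module-level ``dist_func`` that is
# not defined in this listing. A minimal sketch of such a callable metric
# (the ``p=2`` keyword given to BallTree is forwarded to the callable):
import numpy as np

def dist_func(x1, x2, p):
    """Minkowski-style distance between two 1-D vectors."""
    return np.sum((x1 - x2) ** p) ** (1.0 / p)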
def similar_products2(deep_f):
    qs = Product.objects.all()
    df = read_frame(qs)
    df['idx'] = range(1, len(df) + 1)

    feature_list = []
    asin_list = []
    for prod in qs:
        feature_list.append(prod.get_features())
        asin_list.append(prod.asin)

    nparray = np.asarray(feature_list)
    #print(nparray)
    tree = BallTree(nparray)
    dist, ind = tree.query(deep_f, k=5)
    print(ind)

    index = ind[0]
    recom = index[0:]
    recommended_asins = []
    for i in recom:
        recommended_asins.append(asin_list[i])
    recommended_prods = Product.objects.filter(asin__in=recommended_asins)
    return recommended_prods

    # image_train = graphlab.SFrame(data=df)
    # cur_prod = image_train[18:19]
    # print cur_prod
    # print image_train
    # knn_model = graphlab.nearest_neighbors.create(image_train, features = ['features'], label = 'asin', distance = 'levenshtein', method = 'ball_tree')
    # knn_model.save('my_knn')
    # #knn_model = graphlab.load_model('my_knn')
    # #print knn_model.query(cur_prod)
    # #knn_model = graphlab.nearest_neighbors.create(image_train, features = ['features'], label = 'keywords')
def similar_products(product):
    qs = Product.objects.all()
    df = read_frame(qs)
    df['idx'] = range(1, len(df) + 1)

    feature_list = []
    asin_list = []
    product_index = 0
    inn = 0
    for prod in qs:
        feature_list.append(prod.get_features())
        asin_list.append(prod.asin)
        if prod.asin == product.asin:
            product_index = inn
        inn += 1

    nparray = np.asarray(feature_list)
    #print(nparray)
    tree = BallTree(nparray)
    # query() expects a 2-D array, so pass the single row with shape (1, n)
    dist, ind = tree.query(nparray[product_index:product_index + 1], k=5)
    print(ind)

    index = ind[0]
    recom = index[1:]  # skip the product itself
    recommended_asins = []
    for i in recom:
        recommended_asins.append(asin_list[i])
    recommended_prods = Product.objects.filter(asin__in=recommended_asins)
    return recommended_prods
def check_neighbors(dualtree, breadth_first, k, metric, kwargs):
    bt = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    dist1, ind1 = bt.query(Y, k, dualtree=dualtree,
                           breadth_first=breadth_first)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # don't check indices here: if there are any duplicate distances,
    # the indices may not match. Distances should not have this problem.
    assert_array_almost_equal(dist1, dist2)
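# Several snippets in this listing compare BallTree results against a
# ``brute_force_neighbors`` helper that is not shown. A minimal sketch of
# what such a helper presumably does (the DistanceMetric import path is an
# assumption; newer scikit-learn exposes it from sklearn.metrics instead):
import numpy as np
from sklearn.neighbors import DistanceMetric

def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Exact k nearest neighbours via the full pairwise distance matrix."""
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    ind = np.argsort(D, axis=1)[:, :k]              # k smallest per query row
    dist = D[np.arange(Y.shape[0])[:, None], ind]   # matching distances
    return dist, ind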
def test_query_haversine():
    rng = check_random_state(0)
    X = 2 * np.pi * rng.random_sample((40, 2))
    bt = BallTree(X, leaf_size=1, metric='haversine')
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
def test_query_haversine():
    np.random.seed(0)
    X = 2 * np.pi * np.random.random((40, 2))
    bt = BallTree(X, leaf_size=1, metric='haversine')
    dist1, ind1 = bt.query(X, k=5)
    dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')

    assert_array_almost_equal(dist1, dist2)
    assert_array_almost_equal(ind1, ind2)
def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first,
                       n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.random_sample((n_samples, n_features))
    Y = rng.random_sample((n_samples, n_features))
    bt = BallTree(X, leaf_size=10)

    dens_true = compute_kernel_slow(Y, X, kernel, h)

    dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol,
                             kernel=kernel,
                             breadth_first=breadth_first)
    assert_allclose(dens, dens_true,
                    atol=atol, rtol=max(rtol, 1e-7))
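# ``compute_kernel_slow`` is another helper assumed by the KDE test above.
# A minimal sketch for the 'gaussian' kernel only (other kernels would need
# their own normalisation constants):
import numpy as np

def compute_kernel_slow(Y, X, kernel, h):
    """Brute-force kernel density: sum the kernel over all training points."""
    d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))  # all pairwise distances
    if kernel == 'gaussian':
        norm = (2 * np.pi * h ** 2) ** (-X.shape[1] / 2.0)
        return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
    raise NotImplementedError("only the gaussian kernel is sketched here")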
def test_gaussian_kde(n_samples=1000):
    # Compare gaussian KDE results to scipy.stats.gaussian_kde
    from scipy.stats import gaussian_kde
    rng = check_random_state(0)

    x_in = rng.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)

        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
def test_ball_tree_query_metrics(metric):
    rng = check_random_state(0)
    if metric in BOOLEAN_METRICS:
        X = rng.random_sample((40, 10)).round(0)
        Y = rng.random_sample((10, 10)).round(0)
    elif metric in DISCRETE_METRICS:
        X = (4 * rng.random_sample((40, 10))).round(0)
        Y = (4 * rng.random_sample((10, 10))).round(0)

    k = 5

    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
def test_ball_tree_query(metric, k, dualtree, breadth_first):
    rng = check_random_state(0)
    X = rng.random_sample((40, DIMENSION))
    Y = rng.random_sample((10, DIMENSION))

    kwargs = METRICS[metric]

    bt = BallTree(X, leaf_size=1, metric=metric, **kwargs)
    dist1, ind1 = bt.query(Y, k, dualtree=dualtree,
                           breadth_first=breadth_first)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)

    # don't check indices here: if there are any duplicate distances,
    # the indices may not match. Distances should not have this problem.
    assert_array_almost_equal(dist1, dist2)
def test_ball_tree_pickle():
    import pickle
    np.random.seed(0)
    X = np.random.random((10, 3))
    bt1 = BallTree(X, leaf_size=1)
    ind1, dist1 = bt1.query(X)

    def check_pickle_protocol(protocol):
        s = pickle.dumps(bt1, protocol=protocol)
        bt2 = pickle.loads(s)
        ind2, dist2 = bt2.query(X)
        assert_allclose(ind1, ind2)
        assert_allclose(dist1, dist2)

    for protocol in (0, 1, 2):
        yield check_pickle_protocol, protocol
def test_ball_tree_query_radius(n_samples=100, n_features=10):
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind = bt.query_radius(query_pt, r + eps)[0]
        i = np.where(rad <= r + eps)[0]

        ind.sort()
        i.sort()

        assert_array_almost_equal(i, ind)
def test_ball_tree_query_radius_distance(n_samples=100, n_features=10):
    np.random.seed(0)
    X = 2 * np.random.random(size=(n_samples, n_features)) - 1
    query_pt = np.zeros(n_features, dtype=float)

    eps = 1E-15  # roundoff error can cause test to fail
    bt = BallTree(X, leaf_size=5)
    rad = np.sqrt(((X - query_pt) ** 2).sum(1))

    for r in np.linspace(rad[0], rad[-1], 100):
        ind, dist = bt.query_radius(query_pt, r + eps,
                                    return_distance=True)

        ind = ind[0]
        dist = dist[0]

        d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))

        assert_array_almost_equal(d, dist)
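# Note: the two radius tests above pass a 1-D ``query_pt``, which older
# scikit-learn releases accepted. Current releases require a 2-D array of
# query points; a small usage sketch (not part of the original tests):
import numpy as np
from sklearn.neighbors import BallTree

rng = np.random.RandomState(0)
X = rng.random_sample((100, 10))
bt = BallTree(X, leaf_size=5)
ind, dist = bt.query_radius(X[:1], r=0.5, return_distance=True)
print(ind[0].shape, dist[0].shape)  # one variable-length array per query point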
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde"""
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        try:
            gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))
        except TypeError:
            raise SkipTest("Old version of scipy, doesn't accept "
                           "explicit bandwidth.")

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)
        assert_array_almost_equal(dens_bt, dens_gkde, decimal=3)
def test_gaussian_kde(n_samples=1000):
    """Compare gaussian KDE results to scipy.stats.gaussian_kde"""
    from scipy.stats import gaussian_kde
    np.random.seed(0)
    x_in = np.random.normal(0, 1, n_samples)
    x_out = np.linspace(-5, 5, 30)

    for h in [0.01, 0.1, 1]:
        bt = BallTree(x_in[:, None])
        try:
            gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))
        except TypeError:
            # older versions of scipy don't accept explicit bandwidth
            raise SkipTest

        dens_bt = bt.kernel_density(x_out[:, None], h) / n_samples
        dens_gkde = gkde.evaluate(x_out)
        assert_allclose(dens_bt, dens_gkde, rtol=1E-3, atol=1E-3)
def _nonlocalmeans_clustered(img, n_small=5, n_components=9, n_neighbors=30, h=10):
    Nw = (2 * n_small + 1) ** 2
    h2 = h * h
    n_rows, n_cols = img.shape

    # precompute the coordinate difference for the big patch
    small_rows, small_cols = np.indices(((2 * n_small + 1),
                                         (2 * n_small + 1))) - n_small

    # put all patches so we can cluster them
    n_padded = np.pad(img, n_small, mode='reflect')
    patches = np.zeros((n_rows * n_cols, Nw))
    n = 0
    for r in range(n_small, n_small + n_rows):
        for c in range(n_small, n_small + n_cols):
            window = n_padded[r + small_rows, c + small_cols].flatten()
            patches[n, :] = window
            n += 1

    transformed = PCA(n_components=n_components).fit_transform(patches)

    # index the patches into a tree
    tree = BallTree(transformed, leaf_size=2)

    print("Denoising")
    new_img = np.zeros_like(img)
    for r in range(n_rows):
        for c in range(n_cols):
            idx = r * n_cols + c
            # query() expects a 2-D array, so pass the row with shape (1, n)
            dist, ind = tree.query(transformed[idx:idx + 1], k=n_neighbors)
            ridx = np.array([(int(i / n_cols), int(i % n_cols))
                             for i in ind[0, 1:]])
            colors = img[ridx[:, 0], ridx[:, 1]]
            w = np.exp(-dist[0, 1:] / h2)
            new_img[r, c] = np.sum(w * colors) / np.sum(w)

    return new_img
def compute_distances():
    # Load IXP-GST positions
    altitude = 1150
    min_elev = 40
    orbits = 32
    sat_per_orbit = 50
    inclination = 53
    gst_file = "data/raw/ixp_geolocation.csv"
    src_file = "data/raw/WUP2018-F22-Cities_Over_300K_Annual.csv"

    # Load geo information
    sat_pos, gst_pos, src_pos = load_locations(altitude, orbits, sat_per_orbit,
                                               inclination, gst_file, src_file,
                                               time=15000)

    lon_sort_idx_src = np.argsort(src_pos[:, 1])
    src_pos = src_pos[lon_sort_idx_src]

    # Remove SRCs that are too high in latitude
    higher = np.where(src_pos[:, 0] > 56)[0]
    src_pos = np.delete(src_pos, higher, axis=0)

    lon_sort_idx_gst = np.argsort(gst_pos[:, 1])
    gst_pos = gst_pos[lon_sort_idx_gst]

    # %%
    sat_sat_dist = compute_sat_sat_distance(sat_pos, altitude, orbits,
                                            sat_per_orbit)

    # Compute the BallTree for the satellites. This gives nn to satellites.
    sat_tree = BallTree(np.deg2rad(sat_pos),
                        metric=DistanceMetric.get_metric("haversine"))

    # Get the satellites that are in reach for the ground stations
    # and their distance.
    sat_gst_ind_city, sat_gst_dist_city = compute_gst_sat_distance(
        altitude, min_elev, src_pos, sat_tree)

    src_src_satellite = gsts_optimization(sat_gst_ind_city, sat_gst_dist_city,
                                          sat_sat_dist,
                                          n_gsts=src_pos.shape[0])
    src_src_latency = src_src_satellite / LIGHT_IN_VACUUM

    # %%
    sat_gst_ind_ixp, sat_gst_dist_ixp = compute_gst_sat_distance(
        altitude, min_elev, gst_pos, sat_tree)

    gst_gst_satellite = gsts_optimization(sat_gst_ind_ixp, sat_gst_dist_ixp,
                                          sat_sat_dist,
                                          n_gsts=gst_pos.shape[0])

    src_gst_ind, src_gst_dist = src_nearest_gst_distance(src_pos, gst_pos)

    n_src = src_pos.shape[0]
    src_gst_latency = compute_src_dst_latency(n_src, [], src_gst_ind,
                                              src_gst_dist, [], [],
                                              gst_gst_satellite)

    return src_gst_latency, src_src_latency, src_pos
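# The satellite tree above uses the haversine metric, which expects
# (latitude, longitude) in radians and returns great-circle distances in
# radians. A small standalone sketch of the same pattern (toy coordinates;
# scale by the sphere's radius to get physical distances):
import numpy as np
from sklearn.neighbors import BallTree

EARTH_RADIUS_KM = 6371.0
latlon_deg = np.array([[52.52, 13.40], [48.86, 2.35], [40.71, -74.01]])
tree = BallTree(np.deg2rad(latlon_deg), metric="haversine")
dist_rad, ind = tree.query(np.deg2rad([[50.11, 8.68]]), k=2)
print(ind[0], dist_rad[0] * EARTH_RADIUS_KM)  # nearest points and km distances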
batch_size = 512
X = np.random.random(size=(n_points, d)).astype(np.float32)

res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)
index.add(X)

for bi in range(3, 10):
    for ki in range(3, 10):
        t = time.time()
        D, I = index.search(X[0:2**bi, :], 2**ki)
        print(2**bi, 2**ki, int((time.time() - t) * 1000))

t = time.time()
cpu_index = BallTree(X)
print("BallTree build time (mins)", int((time.time() - t) / 60))

#t = time.time()
#D, I = cpu_index.query(X[0:batch_size,:], k)
#print(int((time.time()-t)*1000))

for bi in range(3, 10):
    for ki in range(3, 10):
        t = time.time()
        D, I = cpu_index.query(X[0:2**bi, :], 2**ki)
        print(2**bi, 2**ki, int((time.time() - t) * 1000))
#print(WandV['pressure'])
#X = np.array((WandV.values()))
#print(len(WandV.values()))
#print(WandV.values())

import pandas as pd

df = pd.DataFrame()
for i in WandV.values():
    #print(pd.DataFrame(i))
    # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
    df = df.append(pd.Series(i), ignore_index=True)

#print("temp head", df.head())
#print("temp shape", df.shape)

# public import path; the private sklearn.neighbors.ball_tree module
# was removed in newer scikit-learn releases
from sklearn.neighbors import BallTree

print("KNN ...........")
tree = BallTree(df, leaf_size=2)
print("finding neighbor words .....")
dist, ind = tree.query(df[:1], k=3)  # doctest: +SKIP
print(ind)   # indices of 3 closest neighbors
# [0 3 1]
print(dist)  # distances to 3 closest neighbors
# [ 0.          0.19662693  0.29473397]

v1 = df.iloc[0, :]
v2 = df.iloc[363, :]
v3 = df.iloc[3774, :]
V1 = np.array(v1)
V2 = np.array(v2)
V3 = np.array(v3)
def get_ball_tree_index(X):
    return BallTree(X)
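# A short usage sketch for the wrapper above (illustrative data only):
import numpy as np
from sklearn.neighbors import BallTree

X = np.random.RandomState(0).random_sample((50, 4))
index = get_ball_tree_index(X)
dist, ind = index.query(X[:3], k=2)
print(ind[:, 0])  # each point's nearest neighbour is itself -> [0 1 2]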
def __init__(
        self,
        reciprocal_lattice: Lattice,
        original_points: np.ndarray,
        original_dim: np.ndarray,
        extra_points: np.ndarray,
        ir_to_full_idx: Optional[np.ndarray] = None,
        extra_ir_points_idx: Optional[np.ndarray] = None,
        nworkers: int = pdefaults["nworkers"],
):
    """
    Add a warning about only using the symmetry options if you are sure
    your extra k-points have been symmetrized.

    Args:
        original_points:
        nworkers:
    """
    self._nworkers = nworkers if nworkers != -1 else cpu_count()
    self._final_points = np.concatenate([original_points, extra_points])
    self._reciprocal_lattice = reciprocal_lattice

    if ir_to_full_idx is None:
        ir_to_full_idx = np.arange(len(original_points) + len(extra_points))

    if extra_ir_points_idx is None:
        extra_ir_points_idx = np.arange(len(extra_points))

    logger.debug("Initializing periodic Voronoi calculator")
    all_points = np.concatenate((original_points, extra_points))

    logger.debug("  ├── getting supercell k-points")
    supercell_points = get_supercell_points(all_points)
    supercell_idxs = np.arange(supercell_points.shape[0])

    # filter points far from the zone boundary, this will lead to errors for
    # very small meshes < 5x5x5 but we are not interested in those
    mask = ((supercell_points > -0.75) & (supercell_points < 0.75)).all(axis=1)
    supercell_points = supercell_points[mask]
    supercell_idxs = supercell_idxs[mask]

    # want points in cartesian space so we can define a regular spherical
    # cutoff even if reciprocal lattice is not cubic. If we used a
    # fractional cutoff, the cutoff regions would not be spherical
    logger.debug("  ├── getting cartesian points")
    cart_points = reciprocal_lattice.get_cartesian_coords(supercell_points)
    cart_extra_points = reciprocal_lattice.get_cartesian_coords(
        extra_points[extra_ir_points_idx])

    # small cutoff is slightly larger than the max regular grid spacing, which
    # means at least 1 neighbour point will always be included in each
    # direction; need to find the cartesian length which covers the longest
    # direction of the mesh
    spacing = 1 / original_dim
    body_diagonal = reciprocal_lattice.get_cartesian_coords(spacing)
    xy = reciprocal_lattice.get_cartesian_coords([spacing[0], spacing[1], 0])
    xz = reciprocal_lattice.get_cartesian_coords([spacing[0], 0, spacing[2]])
    yz = reciprocal_lattice.get_cartesian_coords([0, spacing[1], spacing[2]])

    len_diagonal = np.linalg.norm(body_diagonal)
    len_xy = np.linalg.norm(xy)
    len_xz = np.linalg.norm(xz)
    len_yz = np.linalg.norm(yz)

    small_cutoff = np.max([len_diagonal, len_xy, len_xz, len_yz]) * 1.6
    big_cutoff = small_cutoff * 1.77

    logger.debug("  ├── initializing ball tree")

    # use BallTree for quickly evaluating which points are within cutoffs
    tree = BallTree(cart_points)
    n_supercell_points = len(supercell_points)

    # big points are those which surround the extra points within the big
    # cutoff (including the extra points themselves)
    logger.debug("  ├── calculating points in big radius")
    big_points_idx = _query_radius_iteratively(
        tree, n_supercell_points, cart_extra_points, big_cutoff)

    # Voronoi points are those we actually include in the Voronoi diagram
    self._voronoi_points = cart_points[big_points_idx]

    # small points are the points in all_points (i.e., original + extra
    # points) for which we want to calculate the Voronoi volumes. Outside the
    # small cutoff, the weights will just be the regular grid weight.
    logger.debug("  └── calculating points in small radius")
    small_points_idx = _query_radius_iteratively(
        tree, n_supercell_points, cart_extra_points, small_cutoff)

    # get the irreducible small points
    small_points_in_all_points = supercell_idxs[small_points_idx] % len(all_points)
    mapping = ir_to_full_idx[small_points_in_all_points]
    unique_mappings, ir_idx = np.unique(mapping, return_index=True)
    small_points_idx = small_points_idx[ir_idx]

    # get a mapping to go from the ir small points to the full BZ.
    groups = groupby(np.arange(len(all_points)), ir_to_full_idx)
    grouped_ir = groups[unique_mappings]
    counts = [len(g) for g in grouped_ir]
    self._expand_ir = np.repeat(np.arange(len(ir_idx)), counts)

    # get the indices of the expanded ir_small_points in all_points
    self._volume_in_final_idx = np.concatenate(grouped_ir)

    # get the indices of ir_small_points_idx (i.e., the points for which we
    # will calculate the volume) in voronoi_points
    self._volume_points_idx = _get_loc(big_points_idx, small_points_idx)

    # Prepopulate the final volumes array. By default, each point has the
    # volume of the original mesh. Note: at this point, the extra points
    # will have zero volume. This array will be updated by compute_volumes.
    self._volume = reciprocal_lattice.volume
    self._final_volumes = np.full(len(all_points), 1 / len(original_points))
    self._final_volumes[len(original_points):] = 0
    self._final_volumes[self._volume_in_final_idx] = 0
def calculate_band_rates(self, spin: Spin, b_idx: int, nsplits: int):
    integral_conversion = (
        (2 * np.pi) ** 3
        / (self.amset_data.structure.lattice.volume * A_to_nm ** 3)
        / self.gauss_width)

    # prefactors have shape [nscatterers, ndoping, ntemp]
    elastic_prefactors = integral_conversion * np.array(
        [m.prefactor(spin, b_idx) for m in self.elastic_scatterers])
    inelastic_prefactors = integral_conversion * np.array(
        [m.prefactor(spin, b_idx) for m in self.inelastic_scatterers])

    if self.use_symmetry:
        kpoints_idx = self.amset_data.ir_kpoints_idx
    else:
        kpoints_idx = np.arange(len(self.amset_data.full_kpoints))

    nkpoints = len(kpoints_idx)

    band_energies = self.amset_data.energies[spin][b_idx, kpoints_idx]

    # filter points far from the band edge where Fermi integrals are very small
    ball_band_energies = copy.deepcopy(band_energies)
    mask = (band_energies < self.scattering_energy_cutoffs[0]) | (
        band_energies > self.scattering_energy_cutoffs[1])

    # set the energies out of range to infinite so that they will not be
    # included in the scattering rate calculations
    ball_band_energies[mask] *= float("inf")

    ball_tree = BallTree(ball_band_energies[:, None], leaf_size=100)

    g = np.ones(self.amset_data.fermi_levels.shape +
                (len(self.amset_data.energies[spin][b_idx]), )) * 1e-9

    s_g, g = create_shared_array(g, return_buffer=True)

    s_energies = create_shared_array(band_energies)
    s_kpoints = create_shared_array(self.amset_data.full_kpoints)
    s_k_norms = create_shared_array(self.amset_data.kpoint_norms)
    s_k_weights = create_shared_array(self.amset_data.kpoint_weights)
    s_a_factor = create_shared_array(
        self.amset_data.a_factor[spin][b_idx, kpoints_idx])
    s_c_factor = create_shared_array(
        self.amset_data.c_factor[spin][b_idx, kpoints_idx])

    rlat = self.amset_data.structure.lattice.reciprocal_lattice.matrix

    # spawn as many worker processes as needed, put all bands in the queue,
    # and let them work until all the required rates have been computed
    workers = []
    iqueue = Queue()
    oqueue = Queue()

    for i in range(self.nworkers):
        args = (self.scatterers, ball_tree, spin, b_idx,
                self.gauss_width * units.eV, s_g, s_energies, s_kpoints,
                s_k_norms, s_k_weights, s_a_factor, s_c_factor,
                len(band_energies), rlat, iqueue, oqueue)
        if self.use_symmetry:
            kwargs = {
                "grouped_ir_to_full": self.amset_data.grouped_ir_to_full,
                "ir_kpoints_idx": self.amset_data.ir_kpoints_idx}
            workers.append(Process(target=scattering_worker, args=args,
                                   kwargs=kwargs))
        else:
            workers.append(Process(target=scattering_worker, args=args))

    slices = list(gen_even_slices(nkpoints, nsplits))

    for w in workers:
        w.start()

    elastic_rates = None
    if self.elastic_scatterers:
        elastic_rates = self._fill_workers(
            nkpoints, slices, iqueue, oqueue, desc="elastic",
            scattering_mask=mask)
        elastic_rates *= elastic_prefactors[..., None]

    if self.inelastic_scatterers:
        # currently only supports one inelastic scattering energy difference
        # convert frequency to THz and get energy in Rydberg
        energy_diff = (self.materials_properties["pop_frequency"] * 1e12
                       * 2 * np.pi * hbar * units.eV)
        n_inelastic = len(self.inelastic_scatterers)
        shape = (n_inelastic, len(self.amset_data.doping),
                 len(self.amset_data.temperatures), nkpoints)
        in_rates = np.zeros(shape)
        out_rates = np.zeros(shape)  # in 1/s

        # force = (self.amset_data.dfdk[spin][:, :, b_idx] *
        #          default_small_e / hbar)
        force = np.zeros(self.amset_data.dfde[spin][:, :, b_idx].shape)

        # if max_iter == 1 then RTA, don't calculate in rates
        calculate_in_rate = self.max_g_iter != 1
        calculate_out_rate = True

        for _ in range(self.max_g_iter):
            # rates are formatted as s1_i, s1_i, s2_o, s2_o etc
            inelastic_rates = self._fill_workers(
                nkpoints, slices, iqueue, oqueue, energy_diff=energy_diff,
                desc="inelastic", calculate_out_rate=calculate_out_rate,
                calculate_in_rate=calculate_in_rate, scattering_mask=mask)

            if calculate_in_rate:
                # in rates always returned first
                in_rates = inelastic_rates[:n_inelastic]
                in_rates *= inelastic_prefactors[..., None]

            if calculate_out_rate:
                # out rate is independent of g so only need to calculate it once
                idx = n_inelastic if calculate_in_rate else 0
                out_rates = inelastic_rates[idx:]
                out_rates *= inelastic_prefactors[..., None]
                calculate_out_rate = False

            if self.max_g_iter != 1:
                new_g = calculate_g(out_rates, in_rates, elastic_rates, force)
                g_diff = np.abs(np.average(new_g - g))
                logger.debug("  ├── difference in g value: {:.2g}".format(g_diff))

                if g_diff < self.g_tol:
                    break

                # update the shared buffer
                g[:] = new_g[:]

        to_stack = [elastic_rates] if elastic_rates is not None else []
        if calculate_in_rate:
            to_stack.append(in_rates)
        to_stack.append(out_rates)
        all_band_rates = np.vstack(to_stack)
    else:
        all_band_rates = elastic_rates

    # The "None"s at the end of the queue signal the workers that there are
    # no more jobs left and they must therefore exit.
    for i in range(self.nworkers):
        iqueue.put(None)

    for w in workers:
        w.join()
        w.terminate()

    return all_band_rates
def preprocess_table(input_file_path, output_file_path):
    """
    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).
    """
    encoders = {}
    logger = logging.getLogger(__name__)

    df_full_train, output_filepath_df_train, output_filepath_misc_train = read_table(
        input_file_path, logger, output_file_path, suffix="Train")
    df_full_test, output_filepath_df_test, output_filepath_misc_test = read_table(
        input_file_path, logger, output_file_path, suffix="Test")
    df_full_val, output_filepath_df_val, output_filepath_misc_val = read_table(
        input_file_path, logger, output_file_path, suffix="Validation")

    # Cast categoricals to strings
    for cat in CAT_COLUMNS:
        logger.info(f"to category: {cat}")
        df_full_train[cat] = df_full_train[cat].astype(str)
        df_full_test[cat] = df_full_test[cat].astype(str)
        df_full_val[cat] = df_full_val[cat].astype(str)

    CALC_COUNT_COLUMNS = []
    df_to_fit_le = pd.concat([df_full_train, df_full_val], axis=0)[df_full_test.columns]

    # Label encode categoricals
    label_encoder = ce.CountEncoder(return_df=True, cols=CAT_COLUMNS,
                                    verbose=1, normalize=True)
    count_encoder = ce.CountEncoder(return_df=True,
                                    cols=COUNT_COLUMNS + CALC_COUNT_COLUMNS,
                                    verbose=1, normalize=True)

    # Encode train and test with LE
    label_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = label_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = label_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = label_encoder.transform(
        df_full_val[df_full_test.columns])

    # Encode train and test with CE
    count_encoder.fit(df_to_fit_le)
    df_full_train[df_full_test.columns] = count_encoder.transform(
        df_full_train[df_full_test.columns])
    df_full_test = count_encoder.transform(df_full_test)
    df_full_val[df_full_test.columns] = count_encoder.transform(
        df_full_val[df_full_test.columns])

    # Encode aggregate statistics using BallTree:
    X = pd.concat(
        [df_full_train[['lat', 'long']], df_full_val[['lat', 'long']]],
        axis=0).values

    # Build a tree:
    tree = BallTree(X)

    # Calculate aggregate statistics using tree:
    X_to_get_data = pd.concat([df_full_train, df_full_val], axis=0)
    # df_full_train = calculate_agg_statistics(tree, X_to_get_data, df_full_train)
    # df_full_val = calculate_agg_statistics(tree, X_to_get_data, df_full_val)
    # df_full_test = calculate_agg_statistics(tree, X_to_get_data, df_full_test)

    # print(df_full_train.shape)
    print(df_full_test.shape)
    print(df_full_val.shape)

    # Encode test:
    misc = {}
    misc["encoder_dict"] = encoders

    # profile = feature_df.profile_report(title=f'Pandas Profiling Report for {suffix}')
    # profile.to_file(output_file=os.path.join(project_dir, f"output_{suffix}.html"))

    df_full_train.to_pickle(output_filepath_df_train)
    df_full_test.to_pickle(output_filepath_df_test)
    df_full_val.to_pickle(output_filepath_df_val)

    with open(output_filepath_misc_train, "wb") as f:
        pickle.dump(misc, f)

    return 0
def check_neighbors(metric):
    bt = BallTree(X, leaf_size=1, metric=metric)
    dist1, ind1 = bt.query(Y, k)
    dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
    assert_array_almost_equal(dist1, dist2)
def _fit(self, X):
    self._check_algorithm_metric()
    self._check_hubness_algorithm()
    self._check_algorithm_hubness_compatibility()
    if self.metric_params is None:
        self.effective_metric_params_ = {}
    else:
        self.effective_metric_params_ = self.metric_params.copy()

    effective_p = self.effective_metric_params_.get('p', self.p)
    if self.metric in ['wminkowski', 'minkowski']:
        self.effective_metric_params_['p'] = effective_p

    self.effective_metric_ = self.metric
    # For minkowski distance, use more efficient methods where available
    if self.metric == 'minkowski':
        p = self.effective_metric_params_.pop('p', 2)
        if p <= 0:
            raise ValueError("p must be greater than one for minkowski metric, "
                             "or in ]0, 1[ for fractional norms.")
        elif p == 1:
            self.effective_metric_ = 'manhattan'
        elif p == 2:
            self.effective_metric_ = 'euclidean'
        elif p == np.inf:
            self.effective_metric_ = 'chebyshev'
        else:
            self.effective_metric_params_['p'] = p

    if isinstance(X, NeighborsBase):
        self._fit_X = X._fit_X
        self._tree = X._tree
        self._fit_method = X._fit_method
        self._index = X._index
        self._hubness_reduction = X._hubness_reduction
        return self
    elif isinstance(X, BallTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'ball_tree'
        return self
    elif isinstance(X, KDTree):
        self._fit_X = X.data
        self._tree = X
        self._fit_method = 'kd_tree'
        return self
    elif isinstance(X, ApproximateNearestNeighbor):
        self._tree = None
        if isinstance(X, PuffinnLSH):
            self._fit_X = np.array([X.index_.get(i)
                                    for i in range(X.n_indexed_)]) * X.X_indexed_norm_
            self._fit_method = 'lsh'
        elif isinstance(X, FalconnLSH):
            self._fit_X = X.X_train_
            self._fit_method = 'falconn_lsh'
        elif isinstance(X, NNG):
            self._fit_X = None
            self._fit_method = 'nng'
        elif isinstance(X, HNSW):
            self._fit_X = None
            self._fit_method = 'hnsw'
        elif isinstance(X, RandomProjectionTree):
            self._fit_X = None
            self._fit_method = 'rptree'
        self._index = X
        # TODO enable hubness reduction here.
        # We do not store X_train in all cases atm.
        # self._hubness_reduction_method = self.hubness
        # self._set_hubness_reduction(self._fit_X)
        return self

    X = check_array(X, accept_sparse='csr')

    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError(f"n_samples must be greater than 0 (but was {n_samples}).")

    if issparse(X):
        if self.algorithm not in ('auto', 'brute'):
            warnings.warn("cannot use tree with sparse input: "
                          "using brute force")
        if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \
                and not callable(self.effective_metric_):
            raise ValueError(f"Metric '{self.effective_metric_}' not valid for sparse input. "
                             f"Use sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute']) "
                             f"to get valid options. Metric can also be a callable function.")
        self._fit_X = X.copy()
        self._tree = None
        self._fit_method = 'brute'
        if self.hubness is not None:
            warnings.warn('cannot use hubness reduction with sparse data: '
                          'disabling hubness reduction.')
            self.hubness = None
        self._hubness_reduction_method = None
        self._hubness_reduction = NoHubnessReduction()
        return self

    self._fit_method = self.algorithm
    self._fit_X = X
    self._hubness_reduction_method = self.hubness

    if self._fit_method == 'auto':
        # A tree approach is better for small number of neighbors,
        # and KDTree is generally faster when available
        if ((self.n_neighbors is None or
                self.n_neighbors < self._fit_X.shape[0] // 2) and
                self.metric != 'precomputed'):
            if self.effective_metric_ in VALID_METRICS['kd_tree']:
                self._fit_method = 'kd_tree'
            elif (callable(self.effective_metric_) or
                    self.effective_metric_ in VALID_METRICS['ball_tree']):
                self._fit_method = 'ball_tree'
            else:
                self._fit_method = 'brute'
        else:
            self._fit_method = 'brute'
        self._index = None

    if self._fit_method == 'ball_tree':
        self._tree = BallTree(X, self.leaf_size,
                              metric=self.effective_metric_,
                              **self.effective_metric_params_)
        self._index = None
    elif self._fit_method == 'kd_tree':
        self._tree = KDTree(X, self.leaf_size,
                            metric=self.effective_metric_,
                            **self.effective_metric_params_)
        self._index = None
    elif self._fit_method == 'brute':
        self._tree = None
        self._index = None
    elif self._fit_method == 'lsh':
        self._index = PuffinnLSH(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'falconn_lsh':
        self._index = FalconnLSH(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'nng':
        self._index = NNG(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'hnsw':
        self._index = HNSW(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None
    elif self._fit_method == 'rptree':
        self._index = RandomProjectionTree(**self.algorithm_params)
        self._index.fit(X)
        self._tree = None  # because it's a tree, but not an sklearn tree...
    else:
        raise ValueError(f"algorithm = '{self.algorithm}' not recognized")

    # Fit hubness reduction method
    self._set_hubness_reduction(X)

    if self.n_neighbors is not None:
        if self.n_neighbors <= 0:
            raise ValueError(f"Expected n_neighbors > 0. Got {self.n_neighbors:d}")
        else:
            if not np.issubdtype(type(self.n_neighbors), np.integer):
                raise TypeError(
                    f"n_neighbors does not take {type(self.n_neighbors)} value, "
                    f"enter integer value"
                )

    return self
def __init__(self,
             reciprocal_lattice: Lattice,
             original_points: np.ndarray,
             original_dim: np.ndarray,
             extra_points: np.ndarray,
             nworkers: int = pdefaults["nworkers"]):
    """
    Args:
        original_points:
        nworkers:
    """
    self._nworkers = nworkers if nworkers != -1 else cpu_count()

    supercell_points = get_supercell_points([2, 2, 2], original_points)

    # want points in cartesian space so we can define a regular spherical
    # cutoff even if reciprocal lattice is not cubic. If we used a
    # fractional cutoff, the cutoff regions would not be spherical
    cart_points = reciprocal_lattice.get_cartesian_coords(supercell_points)
    cart_extra_points = reciprocal_lattice.get_cartesian_coords(extra_points)

    # small cutoff is slightly larger than the max regular grid spacing,
    # which means at least 1 neighbour point will always be included in
    # each direction
    dim_lengths = np.dot(1 / original_dim, reciprocal_lattice.matrix)
    small_cutoff = np.max(dim_lengths) * 1.01
    big_cutoff = small_cutoff * 2

    # use BallTree for quickly evaluating which points are within cutoffs
    tree = BallTree(cart_points)

    # big cutoff points are those which surround the extra points within
    # the big cutoff (it does not include the extra points themselves)
    big_cutoff_points_idx = np.concatenate(
        tree.query_radius(cart_extra_points, big_cutoff), axis=0)

    # Voronoi points are those we actually calculate in the Voronoi diagram
    # e.g. the big points + extra points
    voronoi_points = supercell_points[big_cutoff_points_idx]
    self._voronoi_points = np.concatenate((voronoi_points, extra_points))

    # small points are the points in original_points for which we want to
    # calculate the Voronoi volumes. Note this does not include the
    # indices of the extra points. Outside the small cutoff, the weights
    # will just be the regular grid weight.
    small_cutoff_points_idx = np.concatenate(
        tree.query_radius(cart_extra_points, small_cutoff), axis=0)

    # get the indices of small_cutoff_points in voronoi_points
    small_in_voronoi_idx = _get_loc(big_cutoff_points_idx,
                                    small_cutoff_points_idx)

    # get the indices of the small cutoff points + extra points
    # in voronoi points that we want the volumes for. The extra points
    # were just added at the end of big_cutoff_points, so getting their
    # indices is simple
    self._volume_points_idx = np.concatenate(
        (small_in_voronoi_idx,
         np.arange(len(extra_points)) + len(big_cutoff_points_idx)))

    # get the indices of the small_cutoff_points (not including the extra
    # points) in the original mesh. this works because the supercell
    # points are in the same order as the original mesh, just repeated for
    # each cell in the supercell
    small_in_original_idx = small_cutoff_points_idx % len(original_points)

    # get the indices of the small cutoff points + extra points in the
    # final volume array. Note that the final volume array has the same
    # order as original_mesh + extra_points
    self._volume_in_final_idx = np.concatenate(
        (small_in_original_idx,
         np.arange(len(extra_points)) + len(original_points)))

    # prepopulate the final volumes array. By default, each point has the
    # volume of the original mesh. Note: at this point, the extra points
    # will have zero volume. This array will be updated by compute_volumes.
    self._final_volumes = np.full(
        len(original_points) + len(extra_points), 1 / len(original_points))
    self._final_volumes[len(original_points):] = 0
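# ``_get_loc`` is used by both __init__ variants above but is not defined in
# this listing. From context it maps each element of ``y`` to its index in
# ``x``. A minimal sketch under the assumption that every element of ``y``
# also appears in ``x``:
import numpy as np

def _get_loc(x, y):
    """Return, for each element of ``y``, its position within ``x``."""
    order = np.argsort(x)                    # sort once
    positions = np.searchsorted(x[order], y) # locate each y in the sorted x
    return order[positions]                  # map back to original indices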