def match(x, y, mytab):
    """Routine that matches the truth catalog with the input table

    Args:
    ----
        x: `float` RA of the truth objects to match (in degrees)
        y: `float` dec of the truth objects to match (in degrees)
        mytab: `astropy.table.Table` table containing the L2 input catalog.

    Returns:
    -------
        ind: `int` array of indices to select the truth objects that
            match the detected objects
    """
    X = np.zeros((len(x), 2))
    X[:, 0] = x
    X[:, 1] = y
    tree = KDTree(X, leaf_size=40)
    Y = np.zeros((len(mytab), 2))
    Y[:, 0] = mytab['coord_ra'] * 180 / np.pi
    Y[:, 1] = mytab['coord_dec'] * 180 / np.pi
    dist, ind = tree.query(Y, k=1)
    print('Matches with distance > 1 px:', np.count_nonzero(dist > 1))
    return ind
def match_bright(x, y, x2, y2, mags, dist=1. / 3600.):
    """Routine that matches the truth catalog with the input table

    Args:
    ----
        x: `float` RA of the truth objects to match (in degrees)
        y: `float` dec of the truth objects to match (in degrees)
        x2: `float` RA of detected objects to match (in degrees)
        y2: `float` dec of detected objects to match (in degrees)
        mags: `float` array containing the true input magnitudes
        dist: `float` maximum distance in degrees considered to match
            the objects; the default is 1 arcsecond.

    Returns:
    -------
        brightest_ind: `int` array of indices to select the truth objects
            that match the detected objects; -1 if no match has been found
            for a particular object
    """
    X = np.zeros((len(x), 2))
    X[:, 0] = x
    X[:, 1] = y
    Y = np.zeros((len(x2), 2))
    Y[:, 0] = x2
    Y[:, 1] = y2
    tree = KDTree(X, leaf_size=40)
    ind = tree.query_radius(Y, r=dist)
    brightest_indices = np.zeros(len(ind), dtype=np.int64)
    for i, ii in enumerate(ind):
        sorted_indices = np.argsort(mags[ii])
        if len(sorted_indices) > 0:
            brightest_indices[i] = ii[sorted_indices[0]]
        else:
            brightest_indices[i] = -1
    return brightest_indices
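# A minimal usage sketch for the two matchers above, on synthetic catalogs.
# All coordinates and magnitudes below are invented for illustration; the
# only real dependencies are numpy and sklearn's KDTree, assumed imported
# where the functions are defined.
import numpy as np
from sklearn.neighbors import KDTree

rng = np.random.RandomState(0)
ra_true = rng.uniform(150.0, 150.1, 500)   # degrees
dec_true = rng.uniform(2.0, 2.1, 500)      # degrees
mags = rng.uniform(18.0, 25.0, 500)

# "detected" objects: the truth positions with ~0.2 arcsec of scatter
ra_det = ra_true + rng.normal(0, 0.2 / 3600., 500)
dec_det = dec_true + rng.normal(0, 0.2 / 3600., 500)

idx = match_bright(ra_true, dec_true, ra_det, dec_det, mags, dist=1. / 3600.)
print('detections without a truth match:', np.count_nonzero(idx == -1))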
def compute_centroids(X, C):
    """Compute the centroids for dataset X given centers C.

    Note: centers C may not belong to X.
    """
    tree = KDTree(X)
    centroids = tree.query(C, k=1, return_distance=False).squeeze()
    return centroids
def count_close(x, y, x2, y2, distances):
    """Routine that counts the number of objects that are within a certain radius

    Args:
    ----
        x: `float` position X of objects to count
        y: `float` position Y of objects to count
        x2: `float` position X of the objects that serve as the center of
            the circle where we look for neighbors
        y2: `float` position Y of the objects that serve as the center of
            the circle where we look for neighbors
        distances: `float` array of radii where to count the objects

    Returns:
    -------
        neighbors: `float` the mean number of neighbors in a circle of
            radii corresponding to each entry of distances
        err: `float` standard deviation of the number of neighbors in a
            circle of radii corresponding to each entry of distances
    """
    X = np.zeros((len(x), 2))
    X[:, 0] = x
    X[:, 1] = y
    Y = np.zeros((len(x2), 2))
    Y[:, 0] = x2
    Y[:, 1] = y2
    tree = KDTree(X, leaf_size=40)
    neighbors = np.zeros(len(distances))
    err = np.zeros(len(distances))
    for i, distance in enumerate(distances):
        # query once per radius and reuse the counts (the original queried twice)
        counts = tree.query_radius(Y, r=distance, count_only=True)
        neighbors[i], err[i] = np.nanmean(counts), np.nanstd(counts)
    return neighbors, err
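# Hypothetical check of count_close above: mean neighbor counts around 100
# centers in a uniform 2-D point field. For a uniform density n the mean
# should scale roughly as n * pi * r**2; the numbers are illustrative only.
import numpy as np

rng = np.random.RandomState(1)
x, y = rng.uniform(0, 1, 10000), rng.uniform(0, 1, 10000)
x2, y2 = rng.uniform(0.2, 0.8, 100), rng.uniform(0.2, 0.8, 100)

radii = np.array([0.01, 0.05, 0.1])
mean_counts, std_counts = count_close(x, y, x2, y2, radii)
print(mean_counts, std_counts)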
def compute_labels(X, C):
    """Compute the cluster labels for dataset X given centers C."""
    # labels = np.argmin(pairwise_distances(C, X), axis=0)  # requires too much memory for large X
    tree = KDTree(C)
    labels = tree.query(X, k=1, return_distance=False).squeeze()
    return labels
def buildDistanceMap(self, X, Y):
    classes = np.unique(Y)
    nClasses = len(classes)
    tree = KDTree(X)
    nRows = X.shape[0]
    TSOri = np.array([]).reshape(0, self.k)
    distanceMap = np.array([]).reshape(0, self.k)
    labels = np.array([]).reshape(0, self.k)
    for row in range(nRows):
        # k+1 neighbors: the first hit is the query point itself, so drop it
        distances, indicesOfNeighbors = tree.query(X[row].reshape(1, -1), k=self.k + 1)
        distances = distances[0][1:]
        indicesOfNeighbors = indicesOfNeighbors[0][1:]
        distanceMap = np.append(distanceMap, np.array(distances).reshape(1, self.k), axis=0)
        labels = np.append(labels, np.array(Y[indicesOfNeighbors]).reshape(1, self.k), axis=0)
    for c in classes:
        nTraining = np.sum(Y == c)
        labelTmp = labels[Y.ravel() == c, :]
        tmpKNNClass = labelTmp.ravel()
        TSOri = np.append(TSOri, len(tmpKNNClass[tmpKNNClass == c]) / (nTraining * float(self.k)))
    return distanceMap, labels, TSOri
def kdtree(data, lake_matrix, k_neighbors=10, leaf_size=20):
    # training
    kdtree = KDTree(data, leaf_size=leaf_size, metric='euclidean')
    # testing
    distances, indices = kdtree.query(lake_matrix, k=k_neighbors)
    return np.array(indices), distances
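# Sketch of calling the kdtree() helper above on random data: indices and
# distances of the 10 training rows nearest to each query row.
import numpy as np

rng = np.random.RandomState(2)
data = rng.rand(1000, 5)        # "training" points
lake_matrix = rng.rand(20, 5)   # query points
indices, distances = kdtree(data, lake_matrix, k_neighbors=10, leaf_size=20)
assert indices.shape == (20, 10) and distances.shape == (20, 10)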
def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
                          metric='minkowski', p=2, leaf_size=40,
                          gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    tree = KDTree(X, metric=metric, leaf_size=leaf_size)
    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
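# Illustration of the core-distance step used above, in isolation: the
# distance from every point to its min_samples-th nearest neighbor (self
# included), computed with a single dual-tree query. Random data, for
# intuition only.
import numpy as np
from sklearn.neighbors import KDTree

X = np.random.RandomState(3).rand(200, 2)
min_samples = 5
tree = KDTree(X, leaf_size=40)
core_distances = tree.query(X, k=min_samples, dualtree=True,
                            breadth_first=True)[0][:, -1]
print(core_distances.shape)  # (200,)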
def study_redmapper_lrg_3d(hemi='north'):
    # create 3d grid object
    grid = grid3d(hemi=hemi)
    # load SDSS data
    sdss = load_sdss_data_both_catalogs(hemi)
    # load redmapper catalog
    rm = load_redmapper(hemi=hemi)
    # get XYZ positions (Mpc) of both datasets
    x_sdss, y_sdss, z_sdss = grid.xyz_from_radecz(sdss['ra'], sdss['dec'], sdss['z'], applyzcut=False)
    x_rm, y_rm, z_rm = grid.xyz_from_radecz(rm['ra'], rm['dec'], rm['z_spec'], applyzcut=False)
    pos_sdss = np.vstack([x_sdss, y_sdss, z_sdss]).T
    pos_rm = np.vstack([x_rm, y_rm, z_rm]).T
    # build a couple of KDTree's, one for SDSS, one for RM.
    from sklearn.neighbors import KDTree
    tree_sdss = KDTree(pos_sdss, leaf_size=30)
    tree_rm = KDTree(pos_rm, leaf_size=30)
    lrg_counts = tree_sdss.query_radius(pos_rm, 100., count_only=True)
    pl.clf()
    pl.hist(lrg_counts, bins=50)
    ipdb.set_trace()
def match(x1, y1, x2=None, y2=None, k=5, kdt=None):
    X2 = np.vstack([x2, y2]).T
    X1 = np.vstack([x1, y1]).T
    if kdt is None:
        kdt = KDTree(X2, leaf_size=30, metric='euclidean')
    dists, inds = kdt.query(X1, k=k, return_distance=True)
    return dists, inds, kdt
def _hdbscan_large_kdtree_cdist(X, min_cluster_size=5, min_samples=None,
                                alpha=1.0, metric='minkowski', p=2,
                                gen_min_span_tree=False):
    if p is None:
        p = 2

    dim = X.shape[0]
    min_samples = min(dim - 1, min_samples)

    if metric == 'minkowski':
        tree = KDTree(X, metric=metric, p=p)
    else:
        tree = KDTree(X, metric=metric)

    core_distances = tree.query(X, k=min_samples)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, metric, p)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)
    condensed_tree = condense_tree(single_linkage_tree, min_cluster_size)
    stability_dict = compute_stability(condensed_tree)
    cluster_list = get_clusters(condensed_tree, stability_dict)

    labels = -1 * np.ones(X.shape[0], dtype=int)
    probabilities = np.zeros(X.shape[0], dtype=float)
    for index, (cluster, prob) in enumerate(cluster_list):
        labels[cluster] = index
        probabilities[cluster] = prob
    return labels, probabilities, condensed_tree, single_linkage_tree, None
def margin(indices, k, X, y):
    margins = []
    kd_tree = KDTree(X)
    for img_index in indices:
        margin = 0
        in_class = 0
        # most_frequent_class = 0
        current_class = y[img_index]
        dists, neighbour_indices = kd_tree.query(
            X[img_index].reshape((1, X[img_index].shape[0])), k)
        for index in neighbour_indices[0]:
            if y[index] == current_class:
                in_class += 1
        neighbour_dict = {}
        for index in neighbour_indices[0]:
            if y[index] in neighbour_dict:
                neighbour_dict[y[index]] += 1
            else:
                neighbour_dict[y[index]] = 1
        neighbour_dict.pop(current_class)
        if neighbour_dict:
            most_frequent = max(neighbour_dict.items(), key=lambda x: x[1])[1]
            margin = in_class - most_frequent
        margins.append(margin)
    return margins
def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
                          metric='minkowski', p=2, leaf_size=40,
                          gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)

    tree = KDTree(X, metric=metric, leaf_size=leaf_size)

    # TODO: Deal with p for minkowski appropriately
    dist_metric = DistanceMetric.get_metric(metric)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1]

    # Mutual reachability distance is implicit in mst_linkage_core_cdist
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric, alpha)

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, None
def _rsl_prims_kdtree(X, cut, k=5, alpha=1.4142135623730951, gamma=5,
                      metric='minkowski', p=2):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')
    elif p is None:
        p = 2  # Unused, but needs to be integer; assume euclidean

    dim = X.shape[0]
    k = min(dim - 1, k)

    tree = KDTree(X, metric=metric)
    dist_metric = DistanceMetric.get_metric(metric)

    core_distances = tree.query(X, k=k)[0][:, -1]
    min_spanning_tree = mst_linkage_core_cdist(X, core_distances, dist_metric)

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
def margin_new(indices, k, X, y):
    margins = []
    kd_tree = KDTree(X)
    for img_index in indices:
        margin = 0
        dist_to_class = 0
        dist_to_others = 0
        current_class = y[img_index]
        dists, neighbour_indices = kd_tree.query(
            X[img_index].reshape((1, X[img_index].shape[0])), k)
        classes = {}
        for i in range(neighbour_indices[0].shape[0]):  # range, not the Python 2 xrange
            index = neighbour_indices[0][i]
            if y[index] in classes:
                classes[y[index]] += dists[0][i]
            else:
                classes[y[index]] = dists[0][i]
        dist_to_class = classes[current_class]
        classes.pop(current_class)
        if classes:
            dist_to_others = min(classes.items(), key=lambda x: x[1])[1]
            margin = dist_to_class - dist_to_others
        margins.append(margin)
    return margins
def constructLMap(self):
    self.obstacleArray = []
    self.allPositions = []
    # build your obstacle array
    for i in range(len(self.map.grid)):
        for j in range(len(self.map.grid[0])):
            [x, y] = self.map.cell_position(i, j)
            if self.map.get_cell(x, y) == 1.0:
                self.obstacleArray.append(np.array(self.map.cell_position(i, j)))
            self.allPositions.append(np.array(self.map.cell_position(i, j)))
    # pass it into kdtree
    eExp = []
    kdt = KDTree(self.obstacleArray)
    dists = kdt.query(self.allPositions, k=1)[0][:]
    self.laserStdDev = self.config["laser_sigma_hit"]
    constant = 1.0 / (m.sqrt(2 * m.pi) * self.laserStdDev)
    eExp = np.exp(-0.5 * (dists ** 2) / (self.laserStdDev ** 2))
    probObsGivenLaser = eExp
    self.lMap.grid = probObsGivenLaser.reshape(self.lMap.grid.shape)
    self.occupancyGridMsg = self.lMap.to_message()
    self.lMapPublisher.publish(self.occupancyGridMsg)
def test_kdtree_projection(datas):
    from sklearn.neighbors import KDTree
    from sklearn import random_projection
    # datas = parse()
    Fs = fingerprints(datas)

    # The random projection
    transformer = random_projection.GaussianRandomProjection(n_components=128)
    Fs_new = transformer.fit_transform(Fs)
    print(Fs_new.shape)

    tree = KDTree(Fs_new, leaf_size=20)

    # Select a random target
    target_i = random.choice(range(len(datas)))
    target = datas[target_i]
    Tf = np.vstack([fingerprint(target)])
    Tf_new = transformer.transform(Tf)

    # Match it
    with timer(10):
        for _ in range(10):
            dist, ind = tree.query(Tf_new, k=3)
    assert datas[ind[0][0]] == datas[target_i]
def uniform_points_points_sampling(limits, points, n):
    """Select the spatial uniform points in the sample by sampling uniform
    spatial points and getting the nearest ones in the available ones.

    Parameters
    ----------
    limits: numpy.ndarray, shape (2, 2)
        the limits of the space. There is the square four limits which
        defines the whole retrievable region.
    points: numpy.ndarray
        the points in the space selected.
    n: int
        the number of samples we want.

    Returns
    -------
    indices: numpy.ndarray, shape (n)
        the indices of the samples.
    """
    ## 0. Initialize retriever
    retriever = KDTree(points)
    ## 1. Compute spatial uniform points
    points_s = uniform_points_sampling(limits, n)
    ## 2. Get the nearest points in the sample
    result = retriever.query(points_s, k=1)
    indices = result[1]
    indices = indices.astype(int)
    return indices
def concat_features_by_neighbors(df_labels, df_features,
                                 X_names=['Offense Type'],
                                 grid=["Latitude", "Longitude"],
                                 radius=1. / 500., scale=np.array([1., 1.])):
    df_labels = df_labels.dropna(subset=grid)
    df_features = df_features.dropna(subset=grid)

    X = df_features[X_names].values       # .values replaces the deprecated .as_matrix()
    xy_features = df_features[grid].values
    xy_labels = df_labels[grid].values

    tree = KDTree(xy_features * scale)
    vocabulary = set()
    features = []
    for nei in tree.query_radius(xy_labels * scale, radius):
        U, I = np.unique(X[nei], return_inverse=True)
        D = dict(zip(U, np.bincount(I)))
        vocabulary.update(D)              # a plain update; map() is lazy in Python 3
        features.append(D)
    return pd.concat([df_labels,
                      pd.DataFrame([[fi.get(v) for v in vocabulary] for fi in features],
                                   index=df_labels.index,
                                   columns=list(vocabulary)).fillna(0.)],
                     axis=1)
def match_regions(polygons, regionlocs, n_dim=2):
    """
    Parameters
    ----------
    polygons: list or array_like
        the polygons information.
    regionlocs: array_like
        the location information of the regions.
    n_dim: integer
        the number of dimensions.

    Returns
    -------
    assign_r: array_like
        the assigned regions.
    """
    n = len(polygons)
    centroids = np.zeros((n, n_dim))
    for i in range(n):  # range, not the Python 2 xrange
        centroids[i, :] = np.array(polygons[i])
    ret = KDTree(regionlocs)
    assign_r = np.zeros(n).astype(int)
    for i in range(n):
        assign_r[i] = ret.query(centroids[[i]])[1][0]
    return assign_r
def get_hip_rank(points, sub):
    sub_coords = sub[['lat', 'lng']].values
    if sub_coords.shape[0] == 0:  # `not sub_coords.shape` is never true for a 2-D array
        return []
    sub_scores = sub.checkinsCount.apply(int).values
    kdt = KDTree(sub_coords, metric='euclidean')
    d, i = kdt.query(np.array(points), k=10)
    return (sub_scores[i] / d ** 2 * 1e-11).sum(axis=1)
class KDBasedKNearestNeighbor(object):
    """KDTree-based KNN classifier with L2 distance."""

    def __init__(self, k=1):
        self.k = k

    def fit(self, X_train, y_train):
        """Build KDtree using
        http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html
        """
        self.X_train = X_train
        self.y_train = y_train
        return self

    def calc_dist(self, X_test, metric, k=None):
        if k is None:
            k = self.k
        self.kd_tree = KDTree(self.X_train, metric=metric, leaf_size=self.k)
        return self

    def get_neighbors(self, X_test, k=None):
        if k is None:
            k = self.k
        neighbors = self.kd_tree.query(X_test, k)
        return neighbors[1]

    def predict_labels(self, X_test, k=None):
        """Make prediction using kdtree.

        Return array of predicted labels.
        """
        if k is None:
            k = self.k
        neighbors = self.kd_tree.query(X_test, k)
        num_test = X_test.shape[0]
        y_pred = numpy.zeros(num_test)
        for i in range(num_test):
            closest_y = self.y_train[neighbors[1][i]]
            count = Counter(closest_y)
            y_pred[i] = count.most_common(1)[0][0]
        return y_pred
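# A minimal sketch of driving the classifier above on random data. Note the
# class expects calc_dist() to be called before predict_labels(), and it
# assumes `numpy` and `collections.Counter` are imported in its module.
import numpy

rng = numpy.random.RandomState(5)
X_train = rng.rand(100, 4)
y_train = rng.randint(0, 3, 100)
X_test = rng.rand(10, 4)

clf = KDBasedKNearestNeighbor(k=3)
clf.fit(X_train, y_train)
clf.calc_dist(X_test, metric='euclidean')
y_pred = clf.predict_labels(X_test)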
def get_median_neighbors(df, n_neighbors, adj_r):
    '''
    INPUT: Pandas dataframe, and the number of comparable neighbors of each
    listing we'll take the median price of in adding the
    median_neighbor_prices feature.
    OUTPUT: Pandas dataframe with the median prices of the n_neighbors
    closest comparables added as a feature.
    This is accomplished using a KD-Tree model to search for nearest-neighbors.
    '''
    kd_df = df[['latitude', 'longitude']]
    kdvals = kd_df.values
    kd = KDTree(kdvals, leaf_size=1000)
    pickle.dump(kd, open('../models/kd_tree.pkl', 'wb'))  # pickle replaces the Python 2 cPickle
    neighbors = kd.query(kdvals, k=100)

    median_neighbor_prices = []
    for i in range(len(df)):
        listing_neighbors = neighbors[1][i]
        # .loc replaces the deprecated .ix (assumes a default RangeIndex)
        listing_id = df.loc[i, 'id']
        n_beds = df.loc[i, 'beds']
        sale_y = df.loc[i, 'sale_y']

        sub_df = df[(df.index.isin(listing_neighbors))]
        sub_df = sub_df[(sub_df['beds'] == n_beds) & (sub_df['id'] != listing_id)]

        comp_listings = [item for item in listing_neighbors if item in sub_df.index]

        df_filtered = pd.DataFrame()
        df_filtered['last sale price'] = df['last sale price'][comp_listings][:n_neighbors]
        df_filtered['sale_y'] = df['sale_y'][comp_listings][:n_neighbors]
        df_filtered['price adjusted'] = df_filtered['last sale price'] * (1.0 + (sale_y - df_filtered['sale_y']) * adj_r)
        med_price = df_filtered['price adjusted'].median()

        if med_price > 0:
            median_neighbor_prices.append(med_price)
        else:
            df_filtered = pd.DataFrame()
            df_filtered['last sale price'] = df['last sale price'][comp_listings][:n_neighbors + 10]
            df_filtered['sale_y'] = df['sale_y'][comp_listings][:n_neighbors + 10]
            df_filtered['price adjusted'] = df_filtered['last sale price'] * (1.0 + (sale_y - df_filtered['sale_y']) * adj_r)
            med_price = df_filtered['price adjusted'].median()

            if med_price > 0:
                median_neighbor_prices.append(med_price)
            else:
                df['price adjusted'] = df['last sale price'] * (1.0 + (sale_y - df['sale_y']) * adj_r)
                med_price = df['price adjusted'][comp_listings].median()
                median_neighbor_prices.append(med_price)

    df['med_neighbor_price'] = median_neighbor_prices

    rmse = np.mean((df['med_neighbor_price'] - df['last sale price']) ** 2) ** 0.5
    print('RMSE is', rmse)
    return df
def environment(x_h, y_h, z_h, x, y, z, D3):
    DD = np.array([x, y, z])
    DD = DD.T
    tree = KDTree(DD, leaf_size=20000)
    index = np.where(x_h == x)[0]
    dist, ind = tree.query(DD[index], k=4)
    r3 = max(dist[0])
    delta3 = D3 ** 3.0 * (1.0 / (r3 ** 3.0) - 1.0 / (D3 ** 3.0))
    return delta3
def retrieve_7major_cp(locs, raw_locs, raw_cps):
    raw_cps = np.array(raw_cps).astype(int)
    ret = KDTree(raw_locs)
    new_cps = []
    for i in range(len(locs)):
        neighs = ret.query(locs[[i]], 7)[1].ravel()
        c = Counter([raw_cps[nei] for nei in neighs])
        # most common value; the original c.keys()[...] indexing only worked in Python 2
        new_cps.append(c.most_common(1)[0][0])
    return new_cps
def find_knn(pts0, eval_pts, k=15):
    '''find the points within `pts0` closest to `eval_pts`'''
    pts0range = (pts0.max(axis=0) - pts0.min(axis=0))
    neigh = KDTree(pts0 / pts0range)
    nni = neigh.query(eval_pts / pts0range, k=k, return_distance=False)
    return nni
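# Quick check of find_knn above: the 15 nearest training points for each
# evaluation point, after per-dimension range normalization. Random data.
import numpy as np

pts0 = np.random.RandomState(6).rand(500, 3)
eval_pts = np.random.RandomState(7).rand(5, 3)
nni = find_knn(pts0, eval_pts, k=15)
assert nni.shape == (5, 15)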
def main():
    digits = load_digits()
    X = digits.data
    y = digits.target
    num_classes = np.unique(y).shape[0]
    plot_digits(X)

    # TSNE
    # Barnes-Hut: O(d NlogN) where d is dim and N is the number of samples
    # Exact: O(d N^2)
    t0 = time()
    tsne = manifold.TSNE(n_components=2, init="pca", method="barnes_hut", verbose=1)
    X_tsne = tsne.fit_transform(X)
    t1 = time()
    print("t-SNE: %.2f sec" % (t1 - t0))
    tsne.get_params()

    plt.figure(2)
    for k in range(num_classes):
        plt.plot(X_tsne[y == k, 0], X_tsne[y == k, 1], "o")
    plt.title("t-SNE embedding of digits dataset")
    plt.xlabel("X1")
    plt.ylabel("X2")
    axes = plt.gca()
    axes.set_xlim([X_tsne[:, 0].min() - 1, X_tsne[:, 0].max() + 1])
    axes.set_ylim([X_tsne[:, 1].min() - 1, X_tsne[:, 1].max() + 1])
    plt.show()

    # ISOMAP
    # 1. Nearest neighbors search: O(d log k N log N)
    # 2. Shortest path graph search: O(N^2(k+log(N))
    # 3. Partial eigenvalue decomposition: O(dN^2)
    t0 = time()
    isomap = manifold.Isomap(n_neighbors=5, n_components=2)
    X_isomap = isomap.fit_transform(X)
    t1 = time()
    print("Isomap: %.2f sec" % (t1 - t0))
    isomap.get_params()

    plt.figure(3)
    for k in range(num_classes):
        plt.plot(X_isomap[y == k, 0], X_isomap[y == k, 1], "o", label=str(k), linewidth=2)
    plt.title("Isomap embedding of the digits dataset")
    plt.xlabel("X1")
    plt.ylabel("X2")
    plt.show()

    # Use KD-tree to find k-nearest neighbors to a query image
    kdt = KDTree(X_isomap)
    Q = np.array([[-160, -30], [-102, 14]])
    kdt_dist, kdt_idx = kdt.query(Q, k=20)
    plot_digits(X[kdt_idx.ravel(), :])
def two_point(data, bins):
    """Two-point correlation function, using the Landy-Szalay method.

    Parameters
    ----------
    data : array_like
        input data, shape = [n_samples, n_features] (2D ndarray)
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1 (1D ndarray)

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin.
        shape = Nbins
    """
    data = np.asarray(data)
    bins = np.asarray(bins)
    rng = check_random_state(None)

    n_samples, n_features = data.shape
    Nbins = len(bins) - 1

    # shuffle along an axis, making the background distribution
    data_R = data.copy()
    for i in range(n_features - 1):
        rng.shuffle(data_R[:, i])

    factor = len(data_R) * 1. / len(data)

    # Fast two-point correlation functions added in scikit-learn v0.14:
    # the trees embed pairwise distances, increasing look-up speed
    KDT_D = KDTree(data)    # actual distances
    KDT_R = KDTree(data_R)  # randomized background distances

    counts_DD = KDT_D.two_point_correlation(data, bins)    # number of pairs within bins[i] radius
    counts_RR = KDT_R.two_point_correlation(data_R, bins)  # same for the randomized background

    DD = np.diff(counts_DD)  # number of pairs in the annulus from bins[i-1] to bins[i]
    RR = np.diff(counts_RR)  # same for the randomized background

    # make zeros 1 for numerical stability (finite difference problems)
    RR_zero = (RR == 0)  # mask creation
    RR[RR_zero] = 1      # apply update

    counts_DR = KDT_R.two_point_correlation(data, bins)  # cross-correlation between actual and random
    DR = np.diff(counts_DR)  # binned cross-correlation

    corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR  # the Landy-Szalay formula
    corr[RR_zero] = np.nan  # back-apply the zeros found in RR

    return corr
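# Sketch of the Landy-Szalay estimator above applied to uniform random
# data, where the correlation should hover near zero at every scale. This
# assumes the imports the function relies on (KDTree and sklearn's
# check_random_state) are in scope in its module.
import numpy as np

data = np.random.RandomState(8).rand(1000, 2)
bins = np.linspace(0.01, 0.2, 11)
corr = two_point(data, bins)  # 10 bin estimates, all roughly 0 for noise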
def negativeLabels(features, positiveLabels):
    # [[]] * n would alias a single list n times; build independent lists instead
    neg_lab = [[] for _ in range(len(features))]
    for i in range(1, len(features)):
        kdt = KDTree(features[i]['RegionCenter'], metric='euclidean')
        neighb = kdt.query(features[i - 1]['RegionCenter'], k=3, return_distance=False)
        for j in range(1, len(features[i])):
            for m in range(0, neighb.shape[1]):
                neg_lab[i].append([j, neighb[j][m]])
    return neg_lab
def patch_classify():
    """Patch visualization: inspect how the training patches and the real
    patches relate in PCA space. Builds a KD-tree for the neighbor searches.
    """
    with open('training_data_full.pickle', 'rb') as f:
        # read the corresponding raw patches (pickle replaces the Python 2 cPickle)
        kk = open("raw_data_full.pickle", 'rb')
        raw_lib = pickle.load(kk)
        raw_lib = np.asarray(raw_lib, dtype='float32')

        # read the data and convert the features
        training_data = pickle.load(f)
        patch_lib, feature_lib = training_data
        feature_lib, patch_lib = (np.asarray(feature_lib, dtype='float32'),
                                  np.asarray(patch_lib, dtype='float32'))
        feature_lib = feature_lib.reshape((-1, 4 * 9 * 9))

        # build the KD-tree (leaf_size must be an int, hence //)
        tree = KDTree(feature_lib, leaf_size=len(feature_lib) // 100)

        # search the 100 nearest points in the KD-tree
        # (query expects a 2-D array, hence the [i:i+1] slices)
        dist, ind1 = tree.query(feature_lib[5678:5679], k=100)
        nn1 = feature_lib[ind1][0]
        dist, ind2 = tree.query(feature_lib[10000:10001], k=100)
        nn2 = feature_lib[ind2][0]
        dist, ind3 = tree.query(feature_lib[1233:1234], k=100)
        nn3 = feature_lib[ind3][0]

        # compute and transform into PCA space
        pca = PCA(n_components=2)
        d2_data = pca.fit_transform(feature_lib).T

        # convert the neighbors' high-dimensional coordinates into
        # low-dimensional PCA coordinates
        r1 = pca.transform(nn1).T
        r2 = pca.transform(nn2).T
        r3 = pca.transform(nn3).T

        # set up the plotting area
        ax = plt.axes([0.1, 0.1, 0.8, 0.8])
        # scatter plot of all the data
        ax.scatter(d2_data[0], d2_data[1], c='g')
        # scatter plots of the three groups
        ax.scatter(r1[0], r1[1], c='r')
        ax.scatter(r2[0], r2[1], c='b')
        ax.scatter(r3[0], r3[1], c='y')
        # patch_lib / raw_lib hold the residual patches and the raw patches
        patch_show(raw_lib[ind1][0], [0.05, 0.05, 0.4, 0.4], 'red')
        patch_show(raw_lib[ind2][0], [0.05, 0.55, 0.4, 0.4], 'blue')
        patch_show(raw_lib[ind3][0], [0.55, 0.05, 0.4, 0.4], 'yellow')
        plt.show()
def load_subsampled_clouds(self, subsampling_parameter):
    """Presubsample point clouds and load into memory
    (load KDTree for neighbors searches).
    """
    if 0 < subsampling_parameter <= 0.01:
        raise ValueError('subsampling_parameter too low (should be over 1 cm)')

    # Create path for files
    tree_path = join(self.path, 'input_{:.3f}'.format(subsampling_parameter))
    if not exists(tree_path):
        makedirs(tree_path)

    # Initiate containers
    self.input_trees = {'training': [], 'validation': []}
    self.input_colors = {'training': [], 'validation': []}
    self.input_labels = {'training': [], 'validation': []}

    for i, file_path in enumerate(self.train_files):

        # Restart timer
        t0 = time.time()

        # get cloud name and split
        cloud_name = file_path.split('/')[-1][:-4]
        if self.all_splits[i] == self.validation_split:
            cloud_split = 'validation'
        else:
            cloud_split = 'training'

        # Name of the input files
        KDTree_file = join(tree_path, '{:s}_KDTree.pkl'.format(cloud_name))
        sub_ply_file = join(tree_path, '{:s}.ply'.format(cloud_name))

        # Check if inputs have already been computed
        if isfile(KDTree_file):
            print('\nFound KDTree for cloud {:s}, subsampled at {:.3f}'.format(cloud_name, subsampling_parameter))

            # read ply with data
            data = read_ply(sub_ply_file)
            sub_colors = np.vstack((data['red'], data['green'], data['blue'])).T
            sub_labels = data['class']

            # Read pkl with search tree
            with open(KDTree_file, 'rb') as f:
                search_tree = pickle.load(f)

        else:
            print('\nPreparing KDTree for cloud {:s}, subsampled at {:.3f}'.format(cloud_name, subsampling_parameter))

            # Read ply file
            data = read_ply(file_path)
            points = np.vstack((data['x'], data['y'], data['z'])).T
            colors = np.vstack((data['red'], data['green'], data['blue'])).T
            labels = data['class']

            # Subsample cloud
            sub_points, sub_colors, sub_labels = grid_subsampling(points,
                                                                  features=colors,
                                                                  labels=labels,
                                                                  sampleDl=subsampling_parameter)

            # Rescale float color and squeeze label
            sub_colors = sub_colors / 255
            sub_labels = np.squeeze(sub_labels)

            # Get chosen neighborhoods
            search_tree = KDTree(sub_points, leaf_size=50)

            # Save KDTree
            with open(KDTree_file, 'wb') as f:
                pickle.dump(search_tree, f)

            # Save ply
            write_ply(sub_ply_file,
                      [sub_points, sub_colors, sub_labels],
                      ['x', 'y', 'z', 'red', 'green', 'blue', 'class'])

        # Fill data containers
        self.input_trees[cloud_split] += [search_tree]
        self.input_colors[cloud_split] += [sub_colors]
        self.input_labels[cloud_split] += [sub_labels]

        size = sub_colors.shape[0] * 4 * 7
        print('{:.1f} MB loaded in {:.1f}s'.format(size * 1e-6, time.time() - t0))

    print('\nPreparing reprojection indices for testing')

    # Get number of clouds
    self.num_training = len(self.input_trees['training'])
    self.num_validation = len(self.input_trees['validation'])

    # Get validation and test reprojection indices
    self.validation_proj = []
    self.validation_labels = []
    i_val = 0
    for i, file_path in enumerate(self.train_files):

        # Restart timer
        t0 = time.time()

        # Get info on this cloud
        cloud_name = file_path.split('/')[-1][:-4]

        # Validation projection and labels
        if self.all_splits[i] == self.validation_split:
            proj_file = join(tree_path, '{:s}_proj.pkl'.format(cloud_name))
            if isfile(proj_file):
                with open(proj_file, 'rb') as f:
                    proj_inds, labels = pickle.load(f)
            else:
                data = read_ply(file_path)
                points = np.vstack((data['x'], data['y'], data['z'])).T
                labels = data['class']

                # Compute projection inds
                proj_inds = np.squeeze(self.input_trees['validation'][i_val].query(points, return_distance=False))
                proj_inds = proj_inds.astype(np.int32)

                # Save
                with open(proj_file, 'wb') as f:
                    pickle.dump([proj_inds, labels], f)

            self.validation_proj += [proj_inds]
            self.validation_labels += [labels]
            i_val += 1
            print('{:s} done in {:.1f}s'.format(cloud_name, time.time() - t0))
    print()
    return
class KNN(BaseDetector):
    """kNN class for outlier detection.

    For an observation, its distance to its kth nearest neighbor could be
    viewed as the outlying score. It could be viewed as a way to measure
    the density. See the references below for details.

    Three kNN detectors are supported:
    largest: use the distance to the kth neighbor as the outlier score
    mean: use the average of all k neighbors as the outlier score
    median: use the median of the distance to k neighbors as the outlier score

    .. [1] Ramaswamy, S., Rastogi, R. and Shim, K., 2000, May. Efficient
       algorithms for mining outliers from large data sets. In ACM Sigmod
       Record (Vol. 29, No. 2, pp. 427-438). ACM.
    .. [2] Angiulli, F. and Pizzuti, C., 2002, August. Fast outlier
       detection in high dimensional spaces. In European Conference on
       Principles of Data Mining and Knowledge Discovery, pp. 15-27.

    :param contamination: the amount of contamination of the data set, i.e.
        the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.
    :type contamination: float in (0, 0.5], optional (default=0.1)

    :param n_neighbors: Number of neighbors to use by default
        for k neighbors queries.
    :type n_neighbors: int, optional (default=5)

    :param method: {'largest', 'mean', 'median'}

        - largest: use the distance to the kth neighbor as the outlier score
        - mean: use the average of all k neighbors as the outlier score
        - median: use the median of the distance to k neighbors as the
          outlier score
    :type method: str, optional (default='largest')
    """

    def __init__(self, contamination=0.1, n_neighbors=5, method='largest'):
        super(KNN, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.method = method

    def fit(self, X, y=None):
        # Validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.tree_ = KDTree(X)

        neigh = NearestNeighbors(n_neighbors=self.n_neighbors)
        neigh.fit(X)

        dist_arr, _ = neigh.kneighbors(n_neighbors=self.n_neighbors,
                                       return_distance=True)

        if self.method == 'largest':
            dist = dist_arr[:, -1]
        elif self.method == 'mean':
            dist = np.mean(dist_arr, axis=1)
        elif self.method == 'median':
            dist = np.median(dist_arr, axis=1)

        self.decision_scores_ = dist.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        check_is_fitted(self, ['tree_', 'decision_scores_',
                               'threshold_', 'labels_'])
        X = check_array(X)

        # initialize the output score
        pred_scores = np.zeros([X.shape[0], 1])

        for i in range(X.shape[0]):
            x_i = X[i, :]
            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

            # get the distance of the current point
            dist_arr, _ = self.tree_.query(x_i, k=self.n_neighbors)

            if self.method == 'largest':
                dist = dist_arr[:, -1]
            elif self.method == 'mean':
                dist = np.mean(dist_arr, axis=1)
            elif self.method == 'median':
                dist = np.median(dist_arr, axis=1)

            pred_score_i = dist[-1]

            # record the current item
            pred_scores[i, :] = pred_score_i

        return pred_scores.ravel()
def kd_nn(cities):
    points = [(city.x, city.y) for city in cities]
    tree = KDTree(points, leaf_size=10, metric='euclidean')
    results = tree.query(points, k=2, return_distance=False)
    return results
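# Hypothetical driver for kd_nn above: `cities` can be any objects with
# .x/.y attributes. Column 0 of the result is each point itself, so
# column 1 holds the index of its nearest neighbor.
from collections import namedtuple

City = namedtuple('City', ['x', 'y'])
cities = [City(0., 0.), City(1., 0.), City(0., 1.2)]
results = kd_nn(cities)
nearest = results[:, 1]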
class SurfacePointCloud:
    def __init__(self, mesh, points, normals=None, scans=None):
        self.mesh = mesh
        self.points = points
        self.normals = normals
        self.scans = scans
        self.kd_tree = KDTree(points)

    def get_random_surface_points(self, count, use_scans=True):
        if use_scans:
            indices = np.random.choice(self.points.shape[0], count)
            return self.points[indices, :]
        else:
            return self.mesh.sample(count)

    def get_sdf(self, query_points, use_depth_buffer=False, sample_count=11):
        if use_depth_buffer:
            distances, _ = self.kd_tree.query(query_points)
            distances = distances.astype(np.float32).reshape(-1) * -1
            distances[self.is_outside(query_points)] *= -1
            return distances
        else:
            distances, indices = self.kd_tree.query(query_points, k=sample_count)
            distances = distances.astype(np.float32)
            closest_points = self.points[indices]
            direction_to_surface = query_points[:, np.newaxis, :] - closest_points
            inside = np.einsum('ijk,ijk->ij', direction_to_surface, self.normals[indices]) < 0
            inside = np.sum(inside, axis=1) > sample_count * 0.5
            distances = distances[:, 0]
            distances[inside] *= -1
            return distances

    def get_sdf_in_batches(self, query_points, use_depth_buffer=False, sample_count=11, batch_size=1000000):
        if query_points.shape[0] <= batch_size:
            return self.get_sdf(query_points, use_depth_buffer=use_depth_buffer, sample_count=sample_count)

        result = np.zeros(query_points.shape[0])
        for i in range(int(math.ceil(query_points.shape[0] / batch_size))):
            start = i * batch_size
            end = min(result.shape[0], (i + 1) * batch_size)
            result[start:end] = self.get_sdf(query_points[start:end, :],
                                             use_depth_buffer=use_depth_buffer,
                                             sample_count=sample_count)
        return result

    def get_voxels(self, voxel_resolution, use_depth_buffer=False, sample_count=11, pad=False, check_result=False):
        from mesh_to_sdf.utils import get_raster_points, check_voxels

        sdf = self.get_sdf_in_batches(get_raster_points(voxel_resolution), use_depth_buffer, sample_count)
        voxels = sdf.reshape((voxel_resolution, voxel_resolution, voxel_resolution))

        if check_result and not check_voxels(voxels):
            raise BadMeshException()

        if pad:
            voxels = np.pad(voxels, 1, mode='constant', constant_values=1)

        return voxels

    def sample_sdf_near_surface(self, number_of_points=500000, use_scans=True, sign_method='normal',
                                normal_sample_count=11, min_size=0):
        query_points = []
        surface_sample_count = int(number_of_points * 47 / 50) // 2
        surface_points = self.get_random_surface_points(surface_sample_count, use_scans=use_scans)
        query_points.append(surface_points + np.random.normal(scale=0.0025, size=(surface_sample_count, 3)))
        query_points.append(surface_points + np.random.normal(scale=0.00025, size=(surface_sample_count, 3)))

        unit_sphere_sample_count = number_of_points - surface_sample_count * 2
        unit_sphere_points = np.random.uniform(-1, 1, size=(unit_sphere_sample_count * 2, 3))
        unit_sphere_points = unit_sphere_points[np.linalg.norm(unit_sphere_points, axis=1) < 1]
        query_points.append(unit_sphere_points[:unit_sphere_sample_count, :])
        query_points = np.concatenate(query_points).astype(np.float32)

        if sign_method == 'normal':
            sdf = self.get_sdf_in_batches(query_points, use_depth_buffer=False,
                                          sample_count=normal_sample_count)
        elif sign_method == 'depth':
            # the original referenced a free variable `surface_point_cloud`;
            # `self` is clearly what was intended here
            sdf = self.get_sdf_in_batches(query_points, use_depth_buffer=True)
        else:
            raise ValueError('Unknown sign determination method: {:s}'.format(sign_method))

        if min_size > 0:
            model_size = np.count_nonzero(sdf[-unit_sphere_sample_count:] < 0) / unit_sphere_sample_count
            if model_size < min_size:
                raise BadMeshException()

        return query_points, sdf

    def show(self):
        scene = pyrender.Scene()
        scene.add(pyrender.Mesh.from_points(self.points, normals=self.normals))
        pyrender.Viewer(scene, use_raymond_lighting=True, point_size=2)

    def is_outside(self, points):
        result = None
        for scan in self.scans:
            if result is None:
                result = scan.is_visible(points)
            else:
                result = np.logical_or(result, scan.is_visible(points))
        return result
    ])

    # generate threshold sum
    target_test_threshold = np.sum(test_scores_norm.clip(0), axis=1)
    test_target_list.append(target_test_threshold)
    method_list.append('threshold')

    # generate average of maximum (AOM) and maximum of average (MOA)
    target_test_aom = aom(test_scores_norm, n_buckets, n_clf)
    target_test_moa = moa(test_scores_norm, n_buckets, n_clf)
    test_target_list.extend([target_test_aom, target_test_moa])
    method_list.extend(['aom', 'moa'])

    ###################################################################
    # use mean as the pseudo target
    for k in final_k_list:
        tree = KDTree(X_train_norm)
        dist_arr, ind_arr = tree.query(X_test_norm, k=k)

        m_list = ['a_dist_d', 'a_dist_r', 'a_dist_n',
                  'a_pear_d', 'a_pear_r', 'a_pear_n']

        # initialize different buckets
        pred_scores_best = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_d = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_f5 = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_f10 = np.zeros([X_test.shape[0], len(m_list)])
        pred_scores_max_f15 = np.zeros([X_test.shape[0], len(m_list)])

        for i in range(X_test.shape[0]):  # X_test_norm.shape[0]
class KNN(BaseDetector):
    # noinspection PyPep8
    """kNN class for outlier detection.

    For an observation, its distance to its kth nearest neighbor could be
    viewed as the outlying score. It could be viewed as a way to measure
    the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for
    details.

    Three kNN detectors are supported:
    largest: use the distance to the kth neighbor as the outlier score
    mean: use the average of all k neighbors as the outlier score
    median: use the median of the distance to k neighbors as the outlier score

    Parameters
    ----------
    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the
        threshold on the decision function.

    n_neighbors : int, optional (default = 5)
        Number of neighbors to use by default for k neighbors queries.

    method : str, optional (default='largest')
        {'largest', 'mean', 'median'}

        - 'largest': use the distance to the kth neighbor as the outlier score
        - 'mean': use the average of all k neighbors as the outlier score
        - 'median': use the median of the distance to k neighbors as the
          outlier score

    radius : float, optional (default = 1.0)
        Range of parameter space to use by default for `radius_neighbors`
        queries.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use BallTree
        - 'kd_tree' will use KDTree
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    metric : string or callable, default 'minkowski'
        metric to use for distance computation. Any metric from
        scikit-learn or scipy.spatial.distance can be used.

        If metric is a callable function, it is called on each pair of
        instances (rows) and the resulting value recorded. The callable
        should take two arrays as input and return one value indicating
        the distance between them. This works for Scipy's metrics, but is
        less efficient than passing the metric name as a string. Distance
        matrices are not supported.

        Valid values for metric are:

        - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1',
          'l2', 'manhattan']
        - from scipy.spatial.distance: ['braycurtis', 'canberra',
          'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard',
          'kulsinski', 'mahalanobis', 'matching', 'minkowski',
          'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
          'sokalsneath', 'sqeuclidean', 'yule']

        See the documentation for scipy.spatial.distance for details on
        these metrics.

    p : integer, optional (default = 2)
        Parameter for the Minkowski metric from
        sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
        equivalent to using manhattan_distance (l1), and
        euclidean_distance (l2) for p = 2. For arbitrary p,
        minkowski_distance (l_p) is used. See
        http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances

    metric_params : dict, optional (default = None)
        Additional keyword arguments for the metric function.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU
        cores. Affects only kneighbors and kneighbors_graph methods.

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, contamination=0.1, n_neighbors=5, method='largest',
                 radius=1.0, algorithm='auto', leaf_size=30,
                 metric='minkowski', p=2, metric_params=None, n_jobs=1,
                 **kwargs):
        super(KNN, self).__init__(contamination=contamination)
        self.n_neighbors = n_neighbors
        self.method = method
        self.radius = radius
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.metric = metric
        self.p = p
        self.metric_params = metric_params
        self.n_jobs = n_jobs

        self.neigh_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                       radius=self.radius,
                                       algorithm=self.algorithm,
                                       leaf_size=self.leaf_size,
                                       metric=self.metric,
                                       p=self.p,
                                       metric_params=self.metric_params,
                                       n_jobs=self.n_jobs,
                                       **kwargs)

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.tree_ = KDTree(X, leaf_size=self.leaf_size, metric=self.metric)
        self.neigh_.fit(X)

        dist_arr, _ = self.neigh_.kneighbors(n_neighbors=self.n_neighbors,
                                             return_distance=True)
        dist = self._get_dist_by_method(dist_arr)

        self.decision_scores_ = dist.ravel()
        self._process_decision_scores()

        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['tree_', 'decision_scores_',
                               'threshold_', 'labels_'])

        X = check_array(X)

        # initialize the output score
        pred_scores = np.zeros([X.shape[0], 1])

        for i in range(X.shape[0]):
            x_i = X[i, :]
            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])

            # get the distance of the current point
            dist_arr, _ = self.tree_.query(x_i, k=self.n_neighbors)
            dist = self._get_dist_by_method(dist_arr)
            pred_score_i = dist[-1]

            # record the current item
            pred_scores[i, :] = pred_score_i

        return pred_scores.ravel()

    def _get_dist_by_method(self, dist_arr):
        """Internal function to decide how to process passed in distance
        array.

        Parameters
        ----------
        dist_arr : numpy array of shape (n_samples, n_neighbors)
            Distance matrix.

        Returns
        -------
        dist : numpy array of shape (n_samples,)
            The outlier scores by distance.
        """
        if self.method == 'largest':
            return dist_arr[:, -1]
        elif self.method == 'mean':
            return np.mean(dist_arr, axis=1)
        elif self.method == 'median':
            return np.median(dist_arr, axis=1)
# (a.y - b.y)**2 +
# alpha*(a.theta - b.theta)**2)

def dist_func(a, b):
    alpha = 1
    return np.sqrt((a[0] - b[0]) ** 2 +
                   (a[1] - b[1]) ** 2 +
                   alpha * (a[2] - b[2]) ** 2)

pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func)

model_states, X = make_states()
for i in range(X.shape[0]):
    print(X[i, :])

print("TREE TIME")
tree = KDTree(X, leaf_size=4, metric="euclidean")
pts = np.array([(0, 0, 0)])
dist, ind = tree.query(pts, k=1)
for i in ind:
    print(X[i])
    print(i.item())  # np.asscalar is deprecated; ndarray.item() is the replacement
    print(model_states[i.item()])

# print(dist)
# print(KDTree.valid_metrics)

# a = np.empty((5, 5, 3))
# b = np.empty((3, 5, 5))
#
# r = 0
# for i in range(a.shape[0]):
#     for j in range(a.shape[1]):
    'primary_focus_subject_Nutrition', 'primary_focus_subject_Other',
    'primary_focus_subject_Parent Involvement',
    'primary_focus_subject_Performing Arts',
    'primary_focus_subject_Social Sciences',
    'primary_focus_subject_Special Needs', 'primary_focus_subject_Team Sports',
    'primary_focus_subject_Visual Arts', 'poverty_level_high poverty',
    'poverty_level_highest poverty', 'poverty_level_low poverty',
    'poverty_level_moderate poverty', 'grade_level_Grades 3-5',
    'grade_level_Grades 6-8', 'grade_level_Grades 9-12',
    'grade_level_Grades PreK-2', 'school_metro_rural',
    'school_metro_suburban', 'school_metro_urban', 'resource_type_Books',
    'resource_type_Other', 'resource_type_Supplies',
    'resource_type_Technology', 'resource_type_Trips',
    'resource_type_Visitors'
]

tree = KDTree(lookup[X], metric="chebyshev")

# ---------- URLS AND WEB PAGES -------------#
app = flask.Flask(__name__)


@app.route("/")
def viz_page():
    with open("dc_prediction.html", 'r') as viz_file:
        return viz_file.read()


@app.route("/score", methods=["POST"])
def score():
    data = flask.request.json
    x = np.matrix(data["example"])
loc, keypoints = blob_detect(gray_crop)
# print(loc.shape)
# print(loc)
# im_with_keypoints = cv2.drawKeypoints(gray_crop, keypoints, np.array([]),
#                                       (0, 0, 255), cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
# cv2.imshow('keypoints', im_with_keypoints)

if count == 0:
    loc_0 = loc.copy()
    recent_loc = loc.copy()
elif count > 0:
    print('===========frame: {}================='.format(count))
    # print(loc_0[1,:])
    kdt = KDTree(loc, leaf_size=30, metric='euclidean')
    dist, ind = kdt.query(recent_loc, k=1)
    thd = (dist < 14) * 1
    thd_nz = np.where(thd)[0]
    # update point if close enough points are detected
    recent_loc[thd_nz] = np.reshape(loc[ind[thd_nz]], (len(thd_nz), 2))
    # visualize the displacement field
    loc_v = 2 * recent_loc - loc_0  # diff vector
    img_rgb = cv2.cvtColor(gray_crop, cv2.COLOR_GRAY2RGB)
    # draw image and save vectors
    for i in range(0, len(loc_0)):
        cv2.arrowedLine(
            img_rgb,
            (int(np.around(recent_loc[i, 0])), int(np.around(recent_loc[i, 1]))),
def get_top_n():
    ref_meta = load_csv(REF_CSV)
    query_meta = load_csv(QUERY_CSV)

    full_ref_xy = get_xy(ref_meta)
    full_query_xy = get_xy(query_meta)
    num_q = full_query_xy.shape[0]

    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    full_ref_f = np.array(load_pickle(REF_LV_PICKLE))
    full_query_f = np.array(load_pickle(QUERY_LV_PICKLE))

    full_xy_dists = pairwise_distances(full_query_xy, full_ref_xy, metric='euclidean')

    for d in DIMS:
        print(d)
        pca = PCA(whiten=True, n_components=d)
        pca = pca.fit(pca_f)
        pca_ref_f = pca.transform(full_ref_f)
        pca_query_f = pca.transform(full_query_f)

        for l in L:
            print(l)
            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            mkdir(out_folder)
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))
            if os.path.exists(out_pickle):
                print('{} already exists. Skipping.'.format(out_pickle))
                continue

            ref_idx = [0]
            for i in range(len(full_ref_xy)):
                if sum((full_ref_xy[i, :] - full_ref_xy[ref_idx[-1], :]) ** 2) >= l ** 2:
                    ref_idx.append(i)
            if len(ref_idx) < N:
                continue

            ref_f = np.array([pca_ref_f[i, :] for i in ref_idx])
            xy_dists = np.array([full_xy_dists[:, i] for i in ref_idx]).transpose()

            print('Building tree')
            ref_tree = KDTree(ref_f)

            print('Retrieving')
            top_f_dists, top_i = np.array(ref_tree.query(pca_query_f, k=N,
                                                         return_distance=True,
                                                         sort_results=True))
            top_f_dists = np.array(top_f_dists)
            top_i = np.array(top_i, dtype=int)
            top_g_dists = [[xy_dists[q, r] for r in top_i[q, :]] for q in range(num_q)]

            gt_i = np.argmin(xy_dists, axis=1)
            gt_g_dist = np.min(xy_dists, axis=1)

            # Translate to original indices
            top_i = [[ref_idx[r] for r in top_i[q, :]] for q in range(num_q)]
            gt_i = [ref_idx[r] for r in gt_i]

            save_pickle([top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx],
                        out_pickle)
xvals = np.random.uniform(xmin, xmax, num_samples)
yvals = np.random.uniform(ymin, ymax, num_samples)
zvals = np.random.uniform(zmin, zmax, num_samples)
zvalsMax = np.max(zvals)

points = np.array(list(zip(xvals, yvals, zvals)))
# points[:10]

# for point in points:
#     if not collides(polygons, point):
#         to_keep.append(point)

nodes = []
# points = np.random.random((100, 3))  # 10 points in 3 dimensions
tree = KDTree(data[:, :3])
for p in points:
    idxs = tree.query([p], k=1, return_distance=False)[0]
    print(idxs)
    if not collides(polygons[idxs[0]], p):
        nodes.append(p)

# idxs = tree.query([points[1]], k=3, return_distance=False)[0]
# print(idxs)
# idxs = tree.query([points[0]], k=3, return_distance=False)[0]

# TODO: connect nodes
# Suggested method
# 1) cast nodes into a graph called "g" using networkx
# 2) write a method "can_connect()" that:
#    casts two points as a shapely LineString() object
def createBatches(xyz, data, label, batchsize=2048):
    """Create batches from numpy arrays with a nearest-neighbor approach.
    Leftover points are discarded. All input arrays should have the same
    number of points.

    Input:
        Numpy Array xyz, used to construct the KDTree and determine nearest
            neighbors (usually X,Y,Z coordinates) (num_points, 3)
        Numpy Array data, with features (num_points, num_features)
        Numpy Array label, with labels (num_points, )
        Int batchsize, with batchsize (default: 2048)
    Return: Tuple of
        Numpy Array, with original xyz coordinates for visualization
            (num_batches, batchsize, 3)
        Numpy Array, with batches of data (num_batches, batchsize, num_features)
        Numpy Array, with labels (num_batches, batchsize, )
    """
    xyz_batches = []
    data_batches = []
    label_batches = []

    num_batches = xyz.shape[0] // batchsize  # floor division to get num batches
    # debug output
    print("Number of Batches: " + str(num_batches))

    start = time.time()
    for i in range(num_batches):
        # find points for each batch starting with the next point in the data
        tree = KDTree(xyz, leaf_size=xyz.shape[0])
        indices = tree.query(xyz[:1], k=batchsize, return_distance=False, sort_results=True)

        # append batch to stores
        # np.squeeze to get rid of single dimensional shape entries
        # before (1, batchsize, 3) -> after (batchsize, 3)
        xyz_batches.append(np.squeeze(xyz[indices]))
        data_batches.append(np.squeeze(data[indices]))
        label_batches.append(np.squeeze(label[indices]))

        # remove allocated indices to prepare next iteration
        data = np.delete(data, indices, axis=0)
        label = np.delete(label, indices, axis=0)
        xyz = np.delete(xyz, indices, axis=0)

        # to monitor progress for huge input sets
        if i % 10 == 0:
            print(str(i / num_batches))

    # convert lists to numpy arrays and return tuple
    xyz = np.asarray(xyz_batches, dtype=np.float64)
    data = np.asarray(data_batches, dtype=np.float64)
    label = np.asarray(label_batches, dtype=np.int8)

    return (xyz, data, label)
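# Illustrative call to createBatches above on a tiny random cloud; with
# 5000 points and batchsize 2048 it yields two batches and discards the
# leftover 904 points. Assumes numpy and the `time` module used by the
# function are imported in its module.
import numpy as np

rng = np.random.RandomState(10)
xyz = rng.rand(5000, 3)
data = rng.rand(5000, 4)
label = rng.randint(0, 8, 5000)
xyz_b, data_b, label_b = createBatches(xyz, data, label, batchsize=2048)
assert xyz_b.shape == (2, 2048, 3)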
    embed = embed_model.predict([x, m], batch_size=100, verbose=1)
    embed_dict[i] = embed
    del x, y, m
    np.save(filename, embed)
    del sequence_dict[i]
    print('embedded', i, rev_label_dict[i])
    # embed_dict[i] = embed_dict[i][0:1000]

del sequence_dict, model, embed_model

result = []
tree_dict = dict()
for i in range(N):
    tree_dict[i] = KDTree(embed_dict[i], leaf_size=10)
    print('tree', i)


def distance(embed, tree, embed_name, tree_name):
    path = "/mnt/data/computervision/tara/results64/"
    dist_filename = path + embed_name + "_" + tree_name + "_distances.npy"
    ind_filename = path + embed_name + "_" + tree_name + "_indices.npy"
    if os.path.exists(dist_filename):
        dists = np.load(dist_filename)
        indices = np.load(ind_filename)
    else:
        (dists, indices) = tree.query(embed, k=1)
        np.save(dist_filename, dists)
        np.save(ind_filename, indices)
    dist = np.mean(dists)
class knn():
    def fit(self, xtrain, ytrain, k, tree=True):
        self.xtrain = xtrain
        self.ytrain = ytrain
        self.k = k
        self.tree = tree
        self.correct = 0

    def dist(self, a, b):
        return np.linalg.norm(a - b)

    def closest(self, row, k):
        best_dist = self.dist(row, self.xtrain[0])
        best_indx = 0
        for i in range(self.k, len(self.xtrain)):
            dist = self.dist(row, self.xtrain[i])
            if dist < best_dist:
                best_dist = dist
                best_indx = i
            # if i % 100 == 0:
            #     print("Iteration ", i)
        # print(self.ytrain[best_indx])
        return self.ytrain[best_indx]

    def predictKD(self, xtest, k):
        self.predictions = []
        self.kdtree = KDTree(self.xtrain, leaf_size=40)
        dist, ind = self.kdtree.query(xtest, k=self.k)
        self.predictions = self.ytrain[ind[:, 0]]
        self.predictions = np.squeeze(self.predictions)
        return self.predictions

    def predict(self, xtest, k):
        self.predictions = []
        for row in xtest:
            label = self.closest(row, k)
            self.predictions.append(label)
        return self.predictions

    def accuracy_score(self, ytrue):
        self.correct = 0
        for i in range(len(ytrue)):
            if ytrue[i] == self.predictions[i]:
                self.correct += 1  # fixed typo: was `self.orrect`
        return (self.correct / float(len(ytrue))) * 100.0

    def get_results(self, ylabel):
        self.ylabel = ylabel
        size = len(self.ylabel)
        conf = confusion_matrix(self.ylabel, self.predictions)
        plt.figure(0).clf()
        plt.imshow(conf)
        print(classification_report(self.ylabel, self.predictions))
        fpr = (len(ylabel) - self.correct) / float(len(ylabel))
        tpr = self.correct / float(len(ylabel))
        plt.figure(1).clf()
        plt.scatter(fpr, tpr, marker='o', label='KNN ROC point')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('Log_ROC')
        plt.show()
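# Sketch of the KD-tree prediction path of the class above on random
# labels: fit() just stores the training set, and predictKD() queries the
# tree but uses only the first neighbor's label for each test row.
import numpy as np

rng = np.random.RandomState(13)
xtrain, ytrain = rng.rand(200, 4), rng.randint(0, 2, 200)
xtest, ytest = rng.rand(20, 4), rng.randint(0, 2, 20)

model = knn()
model.fit(xtrain, ytrain, k=3)
preds = model.predictKD(xtest, k=3)
print(model.accuracy_score(ytest))  # percentage of correct predictions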
# For each point cloud, a subsample of points will be used for the nearest
# neighbors and training
sub_npy_file = NEW_PATH / folder / (file_name + '.npy')
xyz = data[:, :3].astype(np.float32)
colors = data[:, 3:6].astype(np.uint8)

if folder != TEST_PATH or LABELS_AVAILABLE_IN_TEST_SET:
    labels = data[:, -1].astype(np.uint8)
    sub_xyz, sub_colors, sub_labels = DP.grid_sub_sampling(xyz, colors, labels, sub_grid_size)
    sub_colors = sub_colors / 255.0
    np.save(sub_npy_file, np.concatenate((sub_xyz, sub_colors, sub_labels), axis=1).T)
else:
    sub_xyz, sub_colors = DP.grid_sub_sampling(xyz, colors, None, sub_grid_size)
    sub_colors = sub_colors / 255.0
    np.save(sub_npy_file, np.concatenate((sub_xyz, sub_colors), axis=1).T)

# The search tree is the KD-tree saved for each point cloud
search_tree = KDTree(sub_xyz)
kd_tree_file = NEW_PATH / folder / (file_name + '_KDTree.pkl')
with open(kd_tree_file, 'wb') as f:
    pickle.dump(search_tree, f)

# Projection is the nearest point of the selected grid to each point of the cloud
proj_idx = np.squeeze(search_tree.query(xyz, return_distance=False))
proj_idx = proj_idx.astype(np.int32)
proj_save = NEW_PATH / folder / (file_name + '_proj.pkl')
with open(proj_save, 'wb') as f:
    pickle.dump([proj_idx, labels], f)
def ConstructMatchingModelRandom(G1, G2, Type, AddTriplet):
    KP = ComputeFeatureDistance(G1.PFeature, G2.PFeature)
    KQ = ComputeKQ(G1, G2, Type)
    NP1 = G1.NofNodes
    NP2 = G2.NofNodes
    nT = int(NP1 * NP2)  # array shapes must be ints, not np.floor floats
    t1 = np.floor(np.random.rand(3, nT) * NP1)
    while True:
        probFound = False
        for i in range(3):
            ind = (t1[i, :] == t1[(i + 1) % 3, :])
            if np.sum(ind) != 0:
                idxs = np.nonzero(ind)
                t1[i][idxs] = np.floor(np.random.rand(1, len(idxs[0])) * NP1)
                probFound = True
        if probFound == False:
            break
    t1 = t1.transpose()
    T = np.sort(t1, axis=1)
    T = T[np.lexsort(np.fliplr(T).T)]
    NRepeated = np.ones(T.shape[0], dtype=int)
    for i in range(1, T.shape[0]):
        if np.sum(np.abs(T[i] - T[i - 1])) == 0:
            NRepeated[i] = 0
    NRepTri = np.nonzero(NRepeated)
    T = T[NRepTri]
    TF1 = np.zeros([T.shape[0], 3])
    for ti in range(T.shape[0]):
        TF1[ti] = computeTripletsFeatureSinAlpha(G1.P, T[ti])
    NofT2 = G2.NofNodes * (G2.NofNodes - 1) * (G2.NofNodes - 2)
    T2 = np.zeros([NofT2, 3], dtype=int)
    TF2 = np.zeros([6 * NofT2, 3], dtype=float)
    T2Cnt = 0
    for i1 in range(G2.NofNodes):
        for i2 in range(i1 + 1, G2.NofNodes):
            for i3 in range(i2 + 1, G2.NofNodes):
                T2[T2Cnt][0] = i1
                T2[T2Cnt][1] = i2
                T2[T2Cnt][2] = i3
                T2Cnt += 1
    T2 = PermunateTriplets(T2)
    for ti in range(T2.shape[0]):
        TF2[ti] = computeTripletsFeatureSinAlpha(G2.P, T2[ti])
    kdt = KDTree(TF2, metric='euclidean')
    nNN = T.shape[0]
    [distT, indicesT] = kdt.query(TF1, k=nNN, return_distance=True)
    distT = np.exp(-(distT / np.mean(distT)))
    KP = np.exp(-KP)

    NofNodes = G1.NofNodes
    NofStates = intArray(NofNodes)
    for i in range(NofNodes):
        NofStates[i] = NofNodes
    G = CFactorGraph(NofNodes, NofStates)
    bi = doubleArray(NofNodes)
    for ni in range(NofNodes):
        for xi in range(NofNodes):
            bi[xi] = float(KP[ni][xi])
        G.AddNodeBelief(ni, bi)

    nnzEdgeIdx = VecVecInt(KQ.shape[1])
    for ni in range(G2.Edges.shape[0]):
        CurrentAssign = VecInt(2)
        CurrentAssign[0] = int(G2.Edges[ni][0])
        CurrentAssign[1] = int(G2.Edges[ni][1])
        InvCurrentAssign = VecInt(2)
        InvCurrentAssign[0] = int(G2.Edges[ni][1])
        InvCurrentAssign[1] = int(G2.Edges[ni][0])
        nnzEdgeIdx[ni] = CurrentAssign
        nnzEdgeIdx[ni + G2.Edges.shape[0]] = InvCurrentAssign

    for ei in range(KQ.shape[0]):
        CEdgeVec = VecInt(2)
        CEdgeVec[0] = int(G1.Edges[ei][0])
        CEdgeVec[1] = int(G1.Edges[ei][1])
        CurrentNNZV = doubleArray(KQ.shape[1])
        for xij in range(KQ.shape[1]):
            CurrentNNZV[xij] = KQ[ei][xij]
        G.AddGenericGenericSparseFactor(CEdgeVec, nnzEdgeIdx, CurrentNNZV)

    for ti in range(distT.shape[0]):
        CTripletsVec = VecInt(3)
        CTripletsVec[0] = int(T[ti][0])
        CTripletsVec[1] = int(T[ti][1])
        CTripletsVec[2] = int(T[ti][2])
        nnzTripIdx = VecVecInt(distT.shape[1])
        nnzTripV = doubleArray(distT.shape[1])
        for xijk in range(distT.shape[1]):
            cIdxVec = VecInt(3)
            cIdxVec[0] = int(T2[indicesT[ti][xijk]][0])
            cIdxVec[1] = int(T2[indicesT[ti][xijk]][1])
            cIdxVec[2] = int(T2[indicesT[ti][xijk]][2])
            nnzTripIdx[xijk] = cIdxVec
            nnzTripV[xijk] = 6 * distT[ti][xijk]
        G.AddGenericGenericSparseFactor(CTripletsVec, nnzTripIdx, nnzTripV)

    G.AddAuctionFactor()
    return G
def load_subsampled_clouds(self, subsampling_parameter): """ Presubsample point clouds and load into memory (Load KDTree for neighbors searches """ if 0 < subsampling_parameter <= 0.01: raise ValueError( 'subsampling_parameter too low (should be over 1 cm') # Create path for files tree_path = join(self.path, 'input_{:.3f}'.format(subsampling_parameter)) if not exists(tree_path): makedirs(tree_path) # List of training files self.train_files = np.sort([ join(self.train_path, f) for f in listdir(self.train_path) if f[-4:] == '.ply' ]) # Add test files self.test_files = np.sort([ join(self.test_path, f) for f in listdir(self.test_path) if f[-4:] == '.ply' ]) files = np.hstack((self.train_files, self.test_files)) # Initiate containers self.input_trees = {'training': [], 'validation': [], 'test': []} self.input_colors = {'training': [], 'validation': [], 'test': []} self.input_labels = {'training': [], 'validation': []} # Advanced display N = len(files) progress_n = 30 fmt_str = '[{:<' + str(progress_n) + '}] {:5.1f}%' print('\nPreparing KDTree for all scenes, subsampled at {:.3f}'.format( subsampling_parameter)) for i, file_path in enumerate(files): # Restart timer t0 = time.time() # get cloud name and split cloud_name = file_path.split('/')[-1][:-4] cloud_folder = file_path.split('/')[-2] if 'train' in cloud_folder: if self.all_splits[i] == self.validation_split: cloud_split = 'validation' else: cloud_split = 'training' else: cloud_split = 'test' if (cloud_split != 'test' and self.load_test) or (cloud_split == 'test' and not self.load_test): continue # Name of the input files KDTree_file = join(tree_path, '{:s}_KDTree.pkl'.format(cloud_name)) sub_ply_file = join(tree_path, '{:s}.ply'.format(cloud_name)) # Check if inputs have already been computed if isfile(KDTree_file): # read ply with data data = read_ply(sub_ply_file) sub_reflectance = np.expand_dims(data['reflectance'], 1) if cloud_split == 'test': sub_labels = None else: sub_labels = data['class'] # Read pkl with search tree with open(KDTree_file, 'rb') as f: search_tree = pickle.load(f) else: # Read ply file data = read_ply(file_path) points = np.vstack( (data['x'], data['y'], data['z'])).astype(np.float32).T reflectance = np.expand_dims(data['reflectance'], 1).astype(np.float32) if cloud_split == 'test': int_features = None else: int_features = data['class'] # Saturate reflectance reflectance = np.minimum(reflectance, 50.0) # Subsample cloud sub_data = grid_subsampling(points, features=reflectance, labels=int_features, sampleDl=subsampling_parameter) # Rescale and saturate float reflectance sub_reflectance = sub_data[1] / 50.0 # Get chosen neighborhoods search_tree = KDTree(sub_data[0], leaf_size=50) # Save KDTree with open(KDTree_file, 'wb') as f: pickle.dump(search_tree, f) # Save ply if cloud_split == 'test': sub_labels = None write_ply(sub_ply_file, [sub_data[0], sub_reflectance], ['x', 'y', 'z', 'reflectance']) else: sub_labels = np.squeeze(sub_data[2]) write_ply(sub_ply_file, [sub_data[0], sub_reflectance, sub_labels], ['x', 'y', 'z', 'reflectance', 'class']) # Fill data containers self.input_trees[cloud_split] += [search_tree] self.input_colors[cloud_split] += [sub_reflectance] if cloud_split in ['training', 'validation']: self.input_labels[cloud_split] += [sub_labels] print('', end='\r') print(fmt_str.format('#' * (((i + 1) * progress_n) // N), 100 * (i + 1) / N), end='', flush=True) # Get number of clouds self.num_training = len(self.input_trees['training']) self.num_validation = len(self.input_trees['validation']) 
self.num_test = len(self.input_trees['test']) # Get validation and test reprojection indices self.validation_proj = [] self.validation_labels = [] self.test_proj = [] self.test_labels = [] i_val = 0 i_test = 0 # Advanced display N = max(self.num_validation + self.num_test, 1) print('', end='\r') print(fmt_str.format('#' * progress_n, 100), flush=True) print('\nPreparing reprojection indices for validation and test') for i, file_path in enumerate(files): # get cloud name and split cloud_name = file_path.split('/')[-1][:-4] cloud_folder = file_path.split('/')[-2] # Validation projection and labels if (not self.load_test ) and 'train' in cloud_folder and self.all_splits[ i] == self.validation_split: proj_file = join(tree_path, '{:s}_proj.pkl'.format(cloud_name)) if isfile(proj_file): with open(proj_file, 'rb') as f: proj_inds, labels = pickle.load(f) else: # Get original points data = read_ply(file_path) points = np.vstack((data['x'], data['y'], data['z'])).T labels = data['class'] # Compute projection inds proj_inds = np.squeeze( self.input_trees['validation'][i_val].query( points, return_distance=False)) proj_inds = proj_inds.astype(np.int32) # Save with open(proj_file, 'wb') as f: pickle.dump([proj_inds, labels], f) self.validation_proj += [proj_inds] self.validation_labels += [labels] i_val += 1 # Test projection if self.load_test and 'test' in cloud_folder: proj_file = join(tree_path, '{:s}_proj.pkl'.format(cloud_name)) if isfile(proj_file): with open(proj_file, 'rb') as f: proj_inds = pickle.load(f) else: # Get original points data = read_ply(file_path) points = np.vstack((data['x'], data['y'], data['z'])).T # Compute projection inds proj_inds = np.squeeze( self.input_trees['test'][i_test].query( points, return_distance=False)) proj_inds = proj_inds.astype(np.int32) # Save with open(proj_file, 'wb') as f: pickle.dump(proj_inds, f) self.test_proj += [proj_inds] self.test_labels += [np.zeros(0, dtype=np.int32)] i_test += 1 print('', end='\r') print(fmt_str.format('#' * (((i_val + i_test) * progress_n) // N), 100 * (i_val + i_test) / N), end='', flush=True) print('\n') return
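The reprojection indices computed above exist so that predictions made on the subsampled clouds can be broadcast back to full resolution with a single fancy-indexing step. A minimal sketch of that idea, with made-up sizes:

import numpy as np
from sklearn.neighbors import KDTree

full_cloud = np.random.rand(100000, 3).astype(np.float32)
sub_cloud = full_cloud[::10]                # stand-in for grid subsampling

tree = KDTree(sub_cloud, leaf_size=50)
proj_idx = np.squeeze(tree.query(full_cloud, return_distance=False))

# One predicted label per subsampled point...
sub_preds = np.random.randint(0, 9, size=len(sub_cloud))
# ...broadcast back to every original point via its nearest subsampled point
full_preds = sub_preds[proj_idx]
assert full_preds.shape[0] == full_cloud.shape[0]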
def get_top_n():
    # Check whether this query has already been processed completely
    ld_checkpoints = get_checkpoints('obm')
    ld_cp_names = []
    for cp in ld_checkpoints:
        cp_name = cp.split('/')[-2]
        cp_name = ''.join(os.path.basename(cp_name).split('.'))  # removing '.'
        cp_name += '_e{}'.format(cp[-1])
        ld_cp_names.append(cp_name)
    if any(x in QUERY_LV_PICKLE for x in ld_cp_names):
        L = [0.0, 0.3, 1.0, 5.0]
        D = [64, 128, 256, 512, 1024, 2048, 4096]
    else:
        L = [0.0]
        D = [256]
    complete = True
    for l in L:
        for d in D:
            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))
            if not os.path.exists(out_pickle):
                complete = False
                break
        if not complete:
            break
    if complete:
        print('Skipping complete {}'.format(QUERY_LV_PICKLE))
        return

    ref_meta = load_csv(REF_CSV)
    query_meta = load_csv(QUERY_CSV)
    full_ref_xy = get_xy(ref_meta)
    full_query_xy = get_xy(query_meta)
    num_q = full_query_xy.shape[0]
    pca_f = np.array(load_pickle(PCA_LV_PICKLE))
    full_ref_f = np.array(load_pickle(REF_LV_PICKLE))
    full_query_f = np.array(load_pickle(QUERY_LV_PICKLE))
    full_xy_dists = pairwise_distances(full_query_xy, full_ref_xy,
                                       metric='euclidean')
    for d in D:
        print(d)
        pca = PCA(whiten=True, n_components=d)
        pca = pca.fit(pca_f)
        pca_ref_f = pca.transform(full_ref_f)
        pca_query_f = pca.transform(full_query_f)
        for l in L:
            print(l)
            out_folder = os.path.join(OUT_ROOT, 'l{}_dim{}'.format(l, d))
            mkdir(out_folder)
            name = ''.join(os.path.basename(QUERY_LV_PICKLE).split('.')[:-1])
            out_pickle = os.path.join(out_folder, '{}.pickle'.format(name))
            if os.path.exists(out_pickle):
                print('{} already exists. Skipping.'.format(out_pickle))
                continue
            # Thin out the reference trajectory: keep a frame only if it is
            # at least l metres from the previously kept frame
            ref_idx = [0]
            for i in range(len(full_ref_xy)):
                if sum((full_ref_xy[i, :] - full_ref_xy[ref_idx[-1], :])**2) >= l**2:
                    ref_idx.append(i)
            if len(ref_idx) < N:
                continue
            ref_f = np.array([pca_ref_f[i, :] for i in ref_idx])
            xy_dists = np.array([full_xy_dists[:, i] for i in ref_idx]).transpose()
            print('Building tree')
            ref_tree = KDTree(ref_f)
            print('Retrieving')
            # query() returns a (distances, indices) tuple; unpack it directly
            # instead of wrapping the tuple in np.array
            top_f_dists, top_i = ref_tree.query(pca_query_f, k=N,
                                                return_distance=True,
                                                sort_results=True)
            top_i = np.array(top_i, dtype=int)
            top_g_dists = [[xy_dists[q, r] for r in top_i[q, :]]
                           for q in range(num_q)]
            gt_i = np.argmin(xy_dists, axis=1)
            gt_g_dist = np.min(xy_dists, axis=1)
            # Translate back to indices into the full reference set
            top_i = [[ref_idx[r] for r in top_i[q, :]] for q in range(num_q)]
            gt_i = [ref_idx[r] for r in gt_i]
            save_pickle(
                [top_i, top_g_dists, top_f_dists, gt_i, gt_g_dist, ref_idx],
                out_pickle)
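The reduce-then-index pattern above matters because KD-trees degrade toward brute force as dimensionality grows; projecting descriptors down with PCA before building the tree keeps queries fast. A minimal sketch of the pipeline, with made-up shapes:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KDTree

db = np.random.rand(5000, 4096)     # high-dimensional descriptors
queries = np.random.rand(20, 4096)

pca = PCA(n_components=64, whiten=True).fit(db)
db_low = pca.transform(db)
q_low = pca.transform(queries)

tree = KDTree(db_low)
dist, ind = tree.query(q_low, k=10, sort_results=True)
# ind[q] holds the database rows of the 10 best matches for query q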
cat_neighbors = cat_neighbors_z_slice[
    abs(cat_neighbors_z_slice['RA'] - gal['RA']) < 0.7 / dis / np.pi * 180]
cat_neighbors = cat_neighbors[
    abs(cat_neighbors['DEC'] - gal['DEC']) < 0.7 / dis / np.pi * 180]

if len(cat_neighbors) == 0:  # central galaxy with no companion
    coord_random_list, radial_bkg = bkg(cat_neighbors_z_slice,
                                        coord_massive_gal, mode=mode)
    radial_dist_bkg += radial_bkg
    cat_random_copy = cut_random_cat(cat_random_copy, coord_random_list)
    continue
else:
    # Refine the box pre-selection with a true radius cut around the central
    ind = KDTree(np.array(cat_neighbors['RA', 'DEC']).tolist()).query_radius(
        [(gal['RA'], gal['DEC'])], 0.7 / dis / np.pi * 180)
    cat_neighbors = cat_neighbors[ind[0]]
    cat_neighbors = cat_neighbors[cat_neighbors['NUMBER'] != gal['NUMBER']]
    if len(cat_neighbors) == 0:  # central galaxy with no companion
        coord_random_list, radial_bkg = bkg(cat_neighbors_z_slice,
                                            coord_massive_gal, mode=mode)
        radial_dist_bkg += radial_bkg
        cat_random_copy = cut_random_cat(cat_random_copy, coord_random_list)
        continue

# isolation cut on central
if gal[mass_keyname] < max(
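One caveat with the plain-KDTree radius cut above: a Euclidean metric on raw RA/Dec degrees ignores the cos(Dec) compression and RA wrap-around. Where that matters, a BallTree with the haversine metric (available in sklearn) gives true angular neighbors. A sketch with synthetic coordinates:

import numpy as np
from sklearn.neighbors import BallTree

# RA/Dec in degrees; haversine expects (lat, lon) in radians
ra = np.random.uniform(0, 360, 1000)
dec = np.random.uniform(-60, 60, 1000)
coords = np.radians(np.c_[dec, ra])

tree = BallTree(coords, metric='haversine')
center = np.radians([[10.0, 150.0]])     # (dec, ra) of one central galaxy
r = np.radians(0.5)                      # 0.5-degree search radius
ind = tree.query_radius(center, r=r)[0]  # true on-sky neighbors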
def local_optimize_nn( data, graph, hub_info, n_components, learning_rate, a, b, gamma, negative_sample_rate, n_epochs, init, random_state, parallel=False, verbose=False, label=None, k=0, ): graph = graph.tocoo() graph.sum_duplicates() n_vertices = graph.shape[1] graph.data[ hub_info[graph.col] == 2 ] = 1.0 # current (NNs) -- other (hubs): 1.0 weight graph.data[ hub_info[graph.row] == 2 ] = 0.0 # current (hubs) -- other (hubs, nns): 0.0 weight (remove) graph.data[graph.data < (graph.data.max() / float(n_epochs))] = 0.0 graph.eliminate_zeros() init_data = np.array(init) if len(init_data.shape) == 2: if np.unique(init_data, axis=0).shape[0] < init_data.shape[0]: tree = KDTree(init_data) dist, ind = tree.query(init_data, k=2) nndist = np.mean(dist[:, 1]) embedding = init_data + random_state.normal( scale=0.001 * nndist, size=init_data.shape ).astype(np.float32) else: embedding = init_data epochs_per_sample = make_epochs_per_sample(graph.data, n_epochs) head = graph.row tail = graph.col embedding = ( 10.0 * (embedding - np.min(embedding, 0)) / (np.max(embedding, 0) - np.min(embedding, 0)) ).astype(np.float32, order="C") rng_state = random_state.randint(INT32_MIN, INT32_MAX, 3).astype(np.int64) embedding = nn_layout_optimize( embedding, embedding, head, tail, hub_info, n_epochs, n_vertices, epochs_per_sample, a, b, rng_state, gamma=gamma, learning_rate=learning_rate, negative_sample_rate=negative_sample_rate, parallel=parallel, verbose=verbose, k=k, label=label, ) return embedding
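The duplicate-handling trick above (jitter coincident init points by a fraction of the mean nearest-neighbor distance) is useful on its own whenever an optimizer cannot cope with identical points. A standalone sketch:

import numpy as np
from sklearn.neighbors import KDTree

init = np.random.rand(500, 2).astype(np.float32)
init[10] = init[42]                      # make two points coincide

if np.unique(init, axis=0).shape[0] < init.shape[0]:
    tree = KDTree(init)
    # k=2 because the closest neighbor of every point is itself (distance 0)
    dist, _ = tree.query(init, k=2)
    nndist = np.mean(dist[:, 1])         # mean distance to the nearest other point
    rng = np.random.default_rng(0)
    init = init + rng.normal(scale=0.001 * nndist,
                             size=init.shape).astype(np.float32)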
interest_indexes = np.where(hash_index == selected_id)[0] fuzzy_indexes = np.where((scalar_product <= upper_bound+bucket_residual)*\ (scalar_product >= lower_bound-bucket_residual))[0] # Restrict the points points_ = points[interest_indexes] labels_ = labels[interest_indexes] # 3.1 Voxelisation of the data N = points_.shape[0] indexes = np.arange(N) t0 = time.time() print('Building KDTree...') kd = KDTree(points_, metric='minkowski') t1 = time.time() print('KDTree built in {} sec'.format(t1 - t0)) # Single-Shot query for cubical voxels radius = 0.1 t0 = time.time() neighborhoods_inner_sphere = kd.query_radius(points_, r=radius) t1 = time.time() print('Query time for computing neighborhoods on all points: {} sec'. format(t1 - t0)) t0 = time.time() neighborhoods_outer_sphere = kd.query_radius(points_, r=radius * sqrt(3))
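When only neighbor counts are needed (a density feature rather than the neighbor lists gathered above), `query_radius(..., count_only=True)` avoids materializing one index array per point and is markedly cheaper. A small sketch:

import numpy as np
from sklearn.neighbors import KDTree

pts = np.random.rand(100000, 3)
tree = KDTree(pts, metric='minkowski')   # minkowski with p=2 is euclidean

# One call returns the neighbor count of every point within radius 0.1
counts = tree.query_radius(pts, r=0.1, count_only=True)
density = counts / (4.0 / 3.0 * np.pi * 0.1**3)   # naive local density estimate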
def get_nearest_neighbors_from_set(node_set, k):
    # Note: the query set is the build set, so the first neighbor returned
    # for each node is the node itself at distance 0 (and k must not exceed
    # len(node_set))
    tree = KDTree(node_set)
    return tree.query(node_set, k=k)
_has_mongo = False else: _has_mongo = True if __name__ == '__main__': client = MongoClient(mongoconnection.server) db = client[mongoconnection.db] if mongoconnection.passwd is not None: db.authenticate(mongoconnection.user, password=mongoconnection.passwd) col = db[mongoconnection.col] coords = np.load(wind_data_path + '/Coords.npy') memo = np.zeros(coords.shape[0]) query = {"experiment": "rnnseq2seq", "site": ""} tree = KDTree(coords, leaf_size=1) count = 0 for i in range(coords.shape[0]): # print(i, end=' ') # if i%100 == 0: # print() # dist= tree.query_radius(coords[0,:].reshape(1, -1), r=0.1, return_distance=True, sort_results=True) dist = tree.query_radius(coords[i, :].reshape(1, -1), r=0.05, count_only=False, return_distance=False)[0] # print(i, dist) try: if len(dist) > 1: tsum = 0 for j in dist:
def detect_in_scope(vector_list, label_list):
    X = vector_list
    # sklearn requires leaf_size >= 1, so clamp for tiny inputs
    tree = KDTree(X, leaf_size=max(1, min(len(X) // 2, 400)))
    return tree
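Beyond validity, `leaf_size` is a build-time versus query-time trade-off (extremes like the `leaf_size=1` used in the Mongo snippet above are rarely the fastest overall), and it is easy to measure directly:

import time
import numpy as np
from sklearn.neighbors import KDTree

coords = np.random.rand(20000, 2)
for leaf_size in (1, 40, 400):
    t0 = time.time()
    tree = KDTree(coords, leaf_size=leaf_size)
    tree.query_radius(coords, r=0.05)
    print('leaf_size={:<4d} build+query: {:.3f}s'.format(leaf_size,
                                                         time.time() - t0))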
def __getitem__(self, index): rets = {} imgs = np.zeros((self.nViews, *self.OutputSize[::-1]), dtype=np.float32) if self.rgbd: imgs_rgb = np.zeros((self.nViews, *self.OutputSize[::-1], 3), dtype=np.float32) if self.segm: segm = np.zeros((self.nViews, 1, *self.OutputSize[::-1]), dtype=np.float32) if self.dynamicWeighting: dynamicW = np.zeros((self.nViews, 1, *self.OutputSize[::-1]), dtype=np.float32) if self.normal: normal = np.zeros((self.nViews, *self.OutputSize[::-1], 3), dtype=np.float32) R = np.zeros((self.nViews, 4, 4)) Q = np.zeros((self.nViews, 7)) assert (self.nViews == 2) ct0, ct1 = self.__getpair__(index) imgsPath = [] basePath = self.base_this frameid0 = f"{ct0:06d}" frameid1 = f"{ct1:06d}" if self.fullsize_rgbdn: imgs_rgb_full = np.zeros((self.nViews, 480, 640, 3), dtype=np.float32) imgs_full = np.zeros((self.nViews, 480, 640), dtype=np.float32) imgs_full[0] = self.LoadImage( os.path.join(basePath, 'obs_depth', '{}.png'.format(frameid0))).copy() imgs_full[1] = self.LoadImage( os.path.join(basePath, 'obs_depth', '{}.png'.format(frameid1))).copy() imgs_rgb_full[0] = self.LoadImage(os.path.join( basePath, 'obs_rgb', '{}.png'.format(frameid0)), depth=False).copy() / 255. imgs_rgb_full[1] = self.LoadImage(os.path.join( basePath, 'obs_rgb', '{}.png'.format(frameid1)), depth=False).copy() / 255. rets['rgb_full'] = imgs_rgb_full[np.newaxis, :] rets['depth_full'] = imgs_full[np.newaxis, :] imgs[0] = self.LoadImage( os.path.join(basePath, 'depth', '{}.png'.format(frameid0))).copy() imgs[1] = self.LoadImage( os.path.join(basePath, 'depth', '{}.png'.format(frameid1))).copy() dataMask = np.zeros((self.nViews, 1, *self.OutputSize[::-1]), dtype=np.float32) dataMask[0, 0, :, :] = (imgs[0] != 0) dataMask[1, 0, :, :] = (imgs[1] != 0) rets['dataMask'] = dataMask[np.newaxis, :] if self.rgbd: imgs_rgb[0] = self.LoadImage(os.path.join( basePath, 'rgb', '{}.png'.format(frameid0)), depth=False).copy() / 255. imgs_rgb[1] = self.LoadImage(os.path.join( basePath, 'rgb', '{}.png'.format(frameid1)), depth=False).copy() / 255. R[0] = np.loadtxt( os.path.join(basePath, 'pose', frameid0 + '.pose.txt')) R[1] = np.loadtxt( os.path.join(basePath, 'pose', frameid1 + '.pose.txt')) Q[0, :4] = rot2Quaternion(R[0][:3, :3]) Q[0, 4:] = R[0][:3, 3] Q[1, :4] = rot2Quaternion(R[1][:3, :3]) Q[1, 4:] = R[1][:3, 3] imgsPath.append(f"{basePath}/{ct0:06d}") imgsPath.append(f"{basePath}/{ct1:06d}") if self.normal: tp = self.LoadImage(os.path.join(basePath, 'normal', '{}.png'.format(frameid0)), depth=False).copy().astype('float') mask = (tp == 0).sum(2) < 3 tp[mask] = tp[mask] / 255. * 2 - 1 normal[0] = tp tp = self.LoadImage(os.path.join(basePath, 'normal', '{}.png'.format(frameid1)), depth=False).copy().astype('float') mask = (tp == 0).sum(2) < 3 tp[mask] = tp[mask] / 255. * 2 - 1 normal[1] = tp if self.segm: tp = (self.LoadImage(os.path.join(basePath, 'semantic_idx', '{}.png'.format(frameid0)), depth=False).copy())[:, :, 1] segm[0] = tp.reshape(segm[0].shape) tp = (self.LoadImage(os.path.join(basePath, 'semantic_idx', '{}.png'.format(frameid1)), depth=False).copy())[:, :, 1] segm[1] = tp.reshape(segm[1].shape) segm_ = np.zeros((self.nViews, 1, *self.OutputSize[::-1]), dtype=np.float32) segm_[0] = segm[0] segm_[1] = segm[1] segm_ = segm_[np.newaxis, :] if self.denseCorres: # get 3d point cloud for each pano pcs, masks = self.Pano2PointCloud( imgs[0], self.representation) # be aware of the order of returned pc!!! 
pct, maskt = self.Pano2PointCloud(imgs[1], self.representation) #pct = np.matmul(R[0],np.matmul(np.linalg.inv(R[1]),np.concatenate((pct,np.ones([1,pct.shape[1]])))))[:3,:] pct = np.matmul(np.linalg.inv(R[1]), np.concatenate( (pct, np.ones([1, pct.shape[1]]))))[:3, :] pcs = np.matmul(np.linalg.inv(R[0]), np.concatenate( (pcs, np.ones([1, pcs.shape[1]]))))[:3, :] # find correspondence using kdtree tree = KDTree(pct.T) IdxQuery = np.random.choice(range(pcs.shape[1]), 5000) # sample 5000 query points pcsQuery = pcs[:, IdxQuery] nearest_dist, nearest_ind = tree.query(pcsQuery.T, k=1) hasCorres = (nearest_dist < 0.08) idxTgtNeg = [] idxSrc = self.PanoIdx(masks[IdxQuery[np.where(hasCorres)[0]]], imgs.shape[1], imgs.shape[2], self.representation) idxTgt = self.PanoIdx(maskt[nearest_ind[hasCorres]], imgs.shape[1], imgs.shape[2], self.representation) if hasCorres.sum() < 200: rets['denseCorres'] = { 'idxSrc': np.zeros([1, 500, 2]), 'idxTgt': np.zeros([1, 500, 2]), 'valid': np.array([0]), 'idxTgtNeg': idxTgtNeg } else: # only pick 2000 correspondence per pair idx500 = np.random.choice(range(idxSrc.shape[0]), 500) idxSrc = idxSrc[idx500][np.newaxis, :] idxTgt = idxTgt[idx500][np.newaxis, :] rets['denseCorres'] = { 'idxSrc': idxSrc, 'idxTgt': idxTgt, 'valid': np.array([1]), 'idxTgtNeg': idxTgtNeg } # reprojct the second image into the first image plane if self.reproj: assert (imgs.shape[1] == 160 and imgs.shape[2] == 640) h = imgs.shape[1] pct, mask = util.depth2pc( imgs[1, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44], 'scannet') # be aware of the order of returned pc!!! colorpct = imgs_rgb[1, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44, :].reshape(-1, 3)[mask] normalpct = normal[1, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44, :].reshape(-1, 3)[mask] depthpct = imgs[1, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44].reshape(-1)[mask] R_this = np.matmul(R[0], np.linalg.inv(R[1])) R_this_p = R_this.copy() dR = util.randomRotation(epsilon=0.1) dRangle = angular_distance_np(dR[np.newaxis, :], np.eye(3)[np.newaxis, :])[0] R_this_p[:3, :3] = np.matmul(dR, R_this_p[:3, :3]) R_this_p[:3, 3] += np.random.randn(3) * 0.1 t2s_dr = np.matmul(R_this, np.linalg.inv(R_this_p)) pct_reproj = np.matmul( R_this_p, np.concatenate( (pct.T, np.ones([1, pct.shape[0]]))))[:3, :] pct_reproj_org = np.matmul( R_this, np.concatenate( (pct.T, np.ones([1, pct.shape[0]]))))[:3, :] flow = pct_reproj_org - pct_reproj normalpct = np.matmul(R_this_p[:3, :3], normalpct.T).T flow = flow.T t2s_rgb = self.reproj_helper(pct_reproj_org, colorpct, imgs_rgb[0].shape, 'color') t2s_rgb_p = self.reproj_helper(pct_reproj, colorpct, imgs_rgb[0].shape, 'color') t2s_n_p = self.reproj_helper(pct_reproj, normalpct, imgs_rgb[0].shape, 'normal') t2s_d_p = self.reproj_helper(pct_reproj, depthpct, imgs_rgb[0].shape[:2], 'depth') t2s_flow_p = self.reproj_helper(pct_reproj, flow, imgs_rgb[0].shape, 'color') t2s_mask_p = (t2s_d_p != 0).astype('int') pct, mask = util.depth2pc( imgs[0, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44], 'scannet') # be aware of the order of returned pc!!! 
colorpct = imgs_rgb[0, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44, :].reshape(-1, 3)[mask] normalpct = normal[0, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44, :].reshape(-1, 3)[mask] depthpct = imgs[0, 80 - 33:80 + 33, 160 + 80 - 44:160 + 80 + 44].reshape(-1)[mask] R_this = np.matmul(R[1], np.linalg.inv(R[0])) R_this_p = R_this.copy() dR = util.randomRotation(epsilon=0.1) dRangle = angular_distance_np(dR[np.newaxis, :], np.eye(3)[np.newaxis, :])[0] R_this_p[:3, :3] = np.matmul(dR, R_this_p[:3, :3]) R_this_p[:3, 3] += np.random.randn(3) * 0.1 s2t_dr = np.matmul(R_this, np.linalg.inv(R_this_p)) pct_reproj = np.matmul( R_this_p, np.concatenate( (pct.T, np.ones([1, pct.shape[0]]))))[:3, :] pct_reproj_org = np.matmul( R_this, np.concatenate( (pct.T, np.ones([1, pct.shape[0]]))))[:3, :] flow = pct_reproj_org - pct_reproj # assume always observe the second view(right view) normalpct = np.matmul(R_this_p[:3, :3], normalpct.T).T flow = flow.T s2t_rgb = self.reproj_helper(pct_reproj_org, colorpct, imgs_rgb[0].shape, 'color') s2t_rgb_p = self.reproj_helper(pct_reproj, colorpct, imgs_rgb[0].shape, 'color') s2t_n_p = self.reproj_helper(pct_reproj, normalpct, imgs_rgb[0].shape, 'normal') s2t_d_p = self.reproj_helper(pct_reproj, depthpct, imgs_rgb[0].shape[:2], 'depth') s2t_flow_p = self.reproj_helper(pct_reproj, flow, imgs_rgb[0].shape, 'color') s2t_mask_p = (s2t_d_p != 0).astype('int') # compute an envelop box try: tp = np.where(t2s_d_p.sum(0))[0] w0, w1 = tp[0], tp[-1] tp = np.where(t2s_d_p.sum(1))[0] h0, h1 = tp[0], tp[-1] except: w0, h0 = 0, 0 w1, h1 = t2s_d_p.shape[1] - 1, t2s_d_p.shape[0] - 1 t2s_box_p = np.zeros(t2s_d_p.shape) t2s_box_p[h0:h1, w0:w1] = 1 try: tp = np.where(s2t_d_p.sum(0))[0] w0, w1 = tp[0], tp[-1] tp = np.where(s2t_d_p.sum(1))[0] h0, h1 = tp[0], tp[-1] except: w0, h0 = 0, 0 w1, h1 = s2t_d_p.shape[1] - 1, s2t_d_p.shape[0] - 1 s2t_box_p = np.zeros(s2t_d_p.shape) s2t_box_p[h0:h1, w0:w1] = 1 rets['proj_dr'] = np.stack((t2s_dr, s2t_dr), 0)[np.newaxis, :] rets['proj_flow'] = np.stack((t2s_flow_p, s2t_flow_p), 0).transpose(0, 3, 1, 2)[np.newaxis, :] rets['proj_rgb'] = np.stack((t2s_rgb, s2t_rgb), 0).transpose(0, 3, 1, 2)[np.newaxis, :] rets['proj_rgb_p'] = np.stack( (t2s_rgb_p, s2t_rgb_p), 0).transpose(0, 3, 1, 2)[np.newaxis, :] rets['proj_n_p'] = np.stack((t2s_n_p, s2t_n_p), 0).transpose(0, 3, 1, 2)[np.newaxis, :] rets['proj_d_p'] = np.stack((t2s_d_p, s2t_d_p), 0).reshape(1, 2, 1, t2s_d_p.shape[0], t2s_d_p.shape[1]) rets['proj_mask_p'] = np.stack( (t2s_mask_p, s2t_mask_p), 0).reshape(1, 2, 1, t2s_mask_p.shape[0], t2s_mask_p.shape[1]) rets['proj_box_p'] = np.stack( (t2s_box_p, s2t_box_p), 0).reshape(1, 2, 1, t2s_box_p.shape[0], t2s_box_p.shape[1]) imgs = imgs[np.newaxis, :] if self.rgbd: imgs_rgb = imgs_rgb[np.newaxis, :].transpose(0, 1, 4, 2, 3) if self.normal: normal = normal[np.newaxis, :].transpose(0, 1, 4, 2, 3) R = R[np.newaxis, :] Q = Q[np.newaxis, :] if self.segm: rets['segm'] = segm_ if self.dynamicWeighting: rets['dynamicW'] = dynamicW[np.newaxis, :] rets['interval'] = self.interval_this rets['norm'] = normal rets['rgb'] = imgs_rgb rets['depth'] = imgs rets['Q'] = Q rets['R'] = R rets['imgsPath'] = imgsPath return rets
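Stripped of the image bookkeeping, the dense-correspondence search in `__getitem__` reduces to: build a KD-tree on the target cloud, query sampled source points, and keep pairs whose nearest distance falls under a threshold. That core, as a standalone sketch:

import numpy as np
from sklearn.neighbors import KDTree

src = np.random.rand(5000, 3)                       # source point cloud
tgt = src + np.random.normal(0, 0.01, src.shape)    # roughly aligned target

tree = KDTree(tgt)
idx_query = np.random.choice(len(src), 1000)        # sample query points
dist, ind = tree.query(src[idx_query], k=1)

has_corres = dist[:, 0] < 0.08                      # keep only close pairs
pairs = np.c_[idx_query[has_corres], ind[has_corres, 0]]
# pairs[:, 0] indexes src, pairs[:, 1] the matching point in tgt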
def relief(data, tags):
    # ----------------------------- LOOPS (brute force) -----------------------------
    """
    w = np.zeros(data.shape[1])
    closest_enemy_id = -4
    closest_friend_id = -4
    for i in range(data.shape[0]):
        enemy_distance = 999
        friend_distance = 999
        for j in range(data.shape[0]):
            if i != j:
                current_distance = np.linalg.norm(data[i] - data[j])
                if tags[i] == tags[j] and current_distance < friend_distance:
                    friend_distance = current_distance
                    closest_friend_id = j
                elif tags[i] != tags[j] and current_distance < enemy_distance:
                    enemy_distance = current_distance
                    closest_enemy_id = j
        w = w + np.abs(data[i] - data[closest_enemy_id]) - np.abs(data[i] - data[closest_friend_id])
    """
    # ------------------------------------ KDTree ------------------------------------
    w = np.zeros(data.shape[1])
    closest_enemy_id = -4
    closest_friend_id = -4
    ally_found = False
    enemy_found = False
    tree = KDTree(data)
    # Neighbors of every sample sorted by distance; drop column 0 (the sample itself)
    nearest_ind = tree.query(data, k=data.shape[0], return_distance=False)[:, 1:]
    for i in range(nearest_ind.shape[0]):
        for j in range(nearest_ind.shape[1]):
            if not ally_found and tags[i] == tags[nearest_ind[i, j]]:
                ally_found = True
                closest_friend_id = nearest_ind[i, j]
            elif not enemy_found and tags[i] != tags[nearest_ind[i, j]]:
                enemy_found = True
                closest_enemy_id = nearest_ind[i, j]
            if ally_found and enemy_found:
                break
        ally_found = enemy_found = False
        w = w + np.abs(data[i] - data[closest_enemy_id]) - np.abs(
            data[i] - data[closest_friend_id])
    # ---------------------------------------------------------------------------------
    w_max = np.max(w)
    w[w < 0.0] = 0.0
    w /= w_max
    # Commented out so as not to slow down the algorithm:
    """
    for i in range(len(w)):
        plt.bar(i, w[i])
    plt.show()
    """
    return w
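Querying k = n neighbors, as above, materializes an n-by-n index matrix. A memory-friendlier variant (a sketch, not the original code; it assumes every class has at least two samples) builds one tree per class and asks each for a single neighbor:

import numpy as np
from sklearn.neighbors import KDTree

def relief_two_trees(data, tags):
    w = np.zeros(data.shape[1])
    for c in np.unique(tags):
        same, other = data[tags == c], data[tags != c]
        friend_tree, enemy_tree = KDTree(same), KDTree(other)
        for x in same:
            # k=2 on the same-class tree: index 0 is x itself
            fi = friend_tree.query(x[None], k=2, return_distance=False)[0, 1]
            ei = enemy_tree.query(x[None], k=1, return_distance=False)[0, 0]
            w += np.abs(x - other[ei]) - np.abs(x - same[fi])
    w[w < 0.0] = 0.0
    return w / np.max(w)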
if fil.crs != WGS:
    fil = fil.to_crs(WGS)
fil = fil.to_crs(UTM)
fil['area'] = fil.area
fil['centroid'] = fil['geometry'].centroid
# NB: to_crs only reprojects the active geometry column, so 'centroid'
# (and 'area') stay in the metric UTM frame even after converting back
fil = fil.to_crs(WGS)
fil = fil[['PID', 'centroid', 'area']]

#short = fil[:50000]
short = fil

area_dict = dict(zip(list(short.index), list(short['area'])))
matrix = list(zip(short.centroid.apply(lambda x: x.x),
                  short.centroid.apply(lambda x: x.y)))
KD_tree = KDTree(matrix)

###

def Main(passed_dict):
    # unpack passed dict into local variables for this thread
    short = passed_dict['df']
    thread_no = passed_dict['thread_no']
    print_thresh = passed_dict['print_thresh']
    save_thresh = passed_dict['save_thresh']

    # set up some counters / timings
    t = time.time()
    counter = 1
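Because the centroids fed to `KD_tree` are in a projected (metric) frame, radius queries can be phrased directly in metres. A sketch with synthetic coordinates:

import numpy as np
from sklearn.neighbors import KDTree

# Centroids in projected coordinates (metres), as in the UTM frame above
centroids = np.random.rand(10000, 2) * 50000
tree = KDTree(centroids)

# All parcels within 500 m of the first parcel, nearest first
ind, dist = tree.query_radius(centroids[:1], r=500,
                              return_distance=True, sort_results=True)
print(ind[0][:10], dist[0][:10])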
dataFile = open("mnist.dat", "wb")
for x in x_train:
    # The memmap below interprets the file as float32, so make sure the raw
    # MNIST images (uint8) are written with a matching dtype
    dataFile.write(x.astype(np.float32).tobytes())
dataFile.close()

y_bools = [y % 2 == 0 for y in y_train]
y_str = [str(y) for y in y_train]
df = pd.DataFrame({"y": y_train, "even": y_bools, "name": y_str})
df.index.rename('index', inplace=True)
df.to_csv('mnist.csv')

# KNN data for tests
data = np.memmap("mnist.dat", dtype=np.float32)
data = data.reshape([-1, 784])
tree = KDTree(data, leaf_size=2)
dist, ind = tree.query(data[:100], k=5)
dist, ind = tree.query(np.zeros([1, 784], dtype=np.float32), k=5)
nbrs = {
    "d0": dist[:, 0],
    "d1": dist[:, 1],
    "d2": dist[:, 2],
    "d3": dist[:, 3],
    "d4": dist[:, 4],
    "i0": ind[:, 0],
    "i1": ind[:, 1],
    "i2": ind[:, 2],
    "i3": ind[:, 3],
    "i4": ind[:, 4],
class parameter_estimation(object): def __init__(self, exclude_FDR = False, salary_growth_outlier_weight = 0.1): # Read data self.data = pd.read_csv('data_cleaned/main_data.txt', sep = '\t') self.school_clustering = pd.read_csv('data_cleaned/school_clustering.txt', sep='\t') self.major_list = pd.read_csv('data_cleaned/major_list.txt', sep='\t') self.salary_growth = pd.read_csv('data_cleaned/salary_growth_data.txt', sep = '\t') self.salary_growth_outlier_weight = salary_growth_outlier_weight # Clean columns names self.data.columns = [x.lower() for x in self.data.columns] self.salary_growth.columns = [x.lower() for x in self.salary_growth.columns] if exclude_FDR: self.data = self.data[self.data['source']!='FDR Report'].copy() # Select data that are in the clustering data self.data = self.data[self.data['school_in_clustering'] == 'Y'] self.salary_growth = self.salary_growth[self.salary_growth['school_in_clustering'] == 'Y'] # Get sigma calculated from 25 - 75 percentile or Average - Median self.data['sigma_qt'] = self.data.apply(lambda row: self._sigma_qt(row),axis=1) # This sigma value is used in calculation from median from mean medain_average_ratio_t = self.data.query(''' median_salary>0 and average_salary>0 ''')[['median_salary','average_salary']].median() self.medain_average_ratio = medain_average_ratio_t[0]/medain_average_ratio_t[1] # Get salary median estimated self.data['salary_median'] = self.data.apply (lambda row: self._median(row, self.medain_average_ratio),axis=1) # Identify the schools that have overall records only only_all_schools = set(self.data.loc[(pd.isna(self.data['majorcategoryid'])),'school_name_matched']) - set(self.data.loc[(~pd.isna(self.data['majorcategoryid'])),'school_name_matched']) self.data.loc[self.data['school_name_matched'].isin(only_all_schools), 'only_all_flag'] = 1 self.data.fillna({'only_all_flag':0}, inplace=True) # Add salary growth match flag self.school_clustering.loc[self.school_clustering['school_name'].isin(self.salary_growth['school_name_matched']),'matched_flag_growth'] = 1 self.school_clustering.fillna({'matched_flag_growth':0}, inplace=True) # Add salary growth else match flag self.school_clustering.loc[self.school_clustering['school_name'].isin(self.salary_growth.query('il_flag==0')['school_name_matched']),'matched_flag_growth_else'] = 1 self.school_clustering.fillna({'matched_flag_growth_else':0}, inplace=True) # Add salary match flag self.school_clustering.loc[self.school_clustering['school_name'].isin(self.data.query('only_all_flag==0')['school_name_matched']),'matched_flag'] = 1 self.school_clustering.fillna({'matched_flag':0}, inplace=True) # Add sigma match flag sigma_schools = list(self.data.loc[self.data['sigma_qt']>0,'school_name_matched']) self.school_clustering.loc[self.school_clustering['school_name'].isin(sigma_schools),'matched_flag_sigma'] = 1 self.school_clustering.fillna({'matched_flag_sigma':0}, inplace=True) self.sigma_data = self.data.loc[self.data['sigma_qt']>0].copy() self.data.set_index('school_name_matched', inplace=True) self.salary_growth.set_index('school_name_matched', inplace=True) self.sigma_data.set_index('school_name_matched', inplace=True) ### create title_to_category_ratio self.data['major_category_median'] = self.data.groupby(['school_name_matched','state','school_name','major_category'])['salary_median'].transform(np.mean) self.data['ratio'] = self.data['major_category_median'] / self.data['salary_median'] self.title_to_category_ratio = self.data[self.data['major_title'] != 
'all'].groupby(['major_category','major_title'])['ratio'].mean().reset_index() self.title_to_category_ratio = pd.merge(self.major_list[['major_title','major_category']], self.title_to_category_ratio, on = ['major_title','major_category'], how = 'left') self.title_to_category_ratio = self.title_to_category_ratio.fillna(1) ### create category_to_school_ratio salary_median = self.data.loc[self.data['major_category'] == 'all'].reset_index().groupby('school_name_matched')['salary_median'].mean().reset_index(name = 'school_median') category_salary = self.data.loc[self.data['major_category'] != 'all',['major_category','major_title','salary_median']].reset_index().drop_duplicates(keep='first') school_median = pd.merge(salary_median, category_salary, on = 'school_name_matched', how = 'inner') school_median = school_median[school_median['major_title'] == 'all'] school_median['ratio'] = school_median['salary_median'] / school_median['school_median'] self.category_to_school_ratio = school_median.groupby('major_category')['ratio'].mean().reset_index() self.category_to_school_ratio = pd.merge(self.major_list[['major_category']].drop_duplicates(),self.category_to_school_ratio, on = ['major_category'], how = 'left') self.category_to_school_ratio = self.category_to_school_ratio.fillna(1) self.data = self.data.drop(columns = ['major_category_median','ratio']) ### Initialize functions self.build_tree() def _sigma_qt(self, row): if row['salary_min']!= 0: sigma = (np.log(row['salary_max']) - np.log(row['salary_min']))/(norm.ppf(0.75, loc=0, scale=1) - norm.ppf(0.25, loc=0, scale=1)) elif row['average_salary'] != 0 and row['median_salary'] != 0 and row['average_salary'] > row['median_salary']: sigma = np.sqrt(2*(np.log(row['average_salary']) - np.log(row['median_salary']))) else: sigma = 0 return sigma def _median (self, row, r): if row['median_salary'] != 0 : return row['median_salary'] elif row['salary_min'] != 0: return np.exp((np.log(row['salary_min'])+np.log(row['salary_max']))/2) else: # return np.exp(np.log(row['average_salary']) - np.power(max_sigma,2)/2) return row['average_salary'] * r def build_tree(self): # features used for clustering clustering_features = ['state','level','control','long_x','lat_y','student_count','rank_num',\ 'tuition','school_city_demo','school_city_gdp','matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'] clustering_data = self.school_clustering[clustering_features] # one hot encoding for numerical data cat_vars = ['state','level','control'] for var in cat_vars: cat_list = pd.get_dummies(clustering_data[var], prefix=var) clustering_data1 =clustering_data.join(cat_list) clustering_data = clustering_data1 clustering_data_vars = clustering_data.columns.values.tolist() to_keep = [i for i in clustering_data_vars if i not in cat_vars] clustering_data = clustering_data[to_keep] # scale the features scaler = MinMaxScaler(feature_range=(0, 1)) scaler = scaler.fit(clustering_data) clustering_data_trans = pd.DataFrame(scaler.transform(clustering_data), columns = clustering_data.columns) clustering_data_trans.index = self.school_clustering['school_name'] self.data_train1 = (clustering_data_trans[clustering_data_trans['matched_flag'] == 1].drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) self.data_train2 = (clustering_data_trans[clustering_data_trans['matched_flag_growth_else'] == 1].drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) self.data_train3 = 
(clustering_data_trans[clustering_data_trans['matched_flag_growth'] == 1].drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) self.data_train4 = (clustering_data_trans[clustering_data_trans['matched_flag_sigma'] == 1].drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) # print(self.data_train.shape) self.data_test1 = (clustering_data_trans.drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) self.data_test2 = (clustering_data_trans.drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) self.data_test3 = (clustering_data_trans.drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) self.data_test4 = (clustering_data_trans.drop(columns=['matched_flag','matched_flag_growth_else','matched_flag_growth','matched_flag_sigma'])) # print(self.data_test.shape) self.kdt1 = KDTree(np.array(self.data_train1)) self.kdt2 = KDTree(np.array(self.data_train2)) self.kdt3 = KDTree(np.array(self.data_train3)) self.kdt4 = KDTree(np.array(self.data_train4)) def get_salary_neighbors(self, school_name, k = 3): a = np.expand_dims(np.array(self.data_test1.loc[school_name]), axis=0) _, ind_list = self.kdt1.query(a, k) x = self.data_train1.iloc[ind_list[0,:]].index return list(x) def get_growth_neighbors_else(self, school_name, k = 3): a = np.expand_dims(np.array(self.data_test2.loc[school_name]), axis=0) _, ind_list = self.kdt2.query(a, k) x = self.data_train2.iloc[ind_list[0,:]].index return list(x) def get_growth_neighbors(self, school_name, k = 3): a = np.expand_dims(np.array(self.data_test3.loc[school_name]), axis=0) _, ind_list = self.kdt3.query(a, k) x = self.data_train3.iloc[ind_list[0,:]].index return list(x) def get_sigma_neighbors(self, school_name, k = 3): a = np.expand_dims(np.array(self.data_test4.loc[school_name]), axis=0) _, ind_list = self.kdt4.query(a, k) x = self.data_train4.iloc[ind_list[0,:]].index return list(x) def find_one_median(self, school_name, major_title, major_category): matched_records = self.data.loc[[school_name]] ### if major_title is matched ### if major_title in list(matched_records.major_title): # print('find_major_title {}'.format(major_title)) median = matched_records.loc[matched_records['major_title'] == major_title,'salary_median'].mean() salary_similar = list(matched_records.loc[matched_records['major_title'] == major_title,'salary_median']) ### if major_category is matched ### elif major_category in np.unique(matched_records.major_category): medain_temp = matched_records.loc[matched_records['major_category']==major_category,'salary_median'].mean() # print(medain_temp) ratio = self.title_to_category_ratio.loc[self.title_to_category_ratio['major_title'] == major_title,'ratio'] median = medain_temp * float(ratio) salary_similar = list(matched_records.loc[matched_records['major_category']==major_category,'salary_median']) ### if major_category is not matched ### else: medain_temp = matched_records['salary_median'].mean() # print(medain_temp) ratio1 = self.category_to_school_ratio.loc[self.category_to_school_ratio['major_category']==major_category,'ratio'] ratio2 = self.title_to_category_ratio.loc[self.title_to_category_ratio['major_title'] == major_title,'ratio'] median = float(medain_temp) * float(ratio1) * float(ratio2) salary_similar = list(matched_records['salary_median']) return float(median), salary_similar def find_one_sigma(self, 
school_name, major_title, major_category): matched_records = self.sigma_data.loc[[school_name]] if major_title in list(matched_records.major_title): sigma = matched_records.loc[matched_records['major_title'] == major_title, 'sigma_qt'].mean() elif major_category in np.unique(matched_records.major_category): sigma = matched_records.loc[matched_records['major_category'] == major_category, 'sigma_qt'].mean() else: sigma = matched_records['sigma_qt'].mean() return sigma def get_value(self, school, m_title, k): # first get the major_category and school state from input m_category, state = self.match_input(school, m_title) # print(school) # print(m_title) # print(m_category) # if we have school's salary information: similar_schools = self.get_salary_neighbors(school, k) self.salary_similar_schools = similar_schools similar_schools_sigma = self.get_sigma_neighbors(school, k) # print(similar_schools) median_array = [] similar_median_array = [] sigma_array = [] for s1 in similar_schools: a,b = self.find_one_median(s1, m_title, m_category) median_array.append(a) similar_median_array.append(b) for s2 in similar_schools_sigma: sigma_array.append(self.find_one_sigma(s2, m_title, m_category)) # print(median_array) median_knn = np.mean(np.apply_over_axes(np.sort, np.array(median_array), axes=0)[:2]) sigma_knn = np.mean(sigma_array) if school in self.data[self.data['only_all_flag']==0].index: median_self, _ = self.find_one_median(school, m_title, m_category) median = 0.8 * median_self + 0.2 * median_knn else: median = median_knn if school in self.sigma_data.index: sigma_self = self.find_one_sigma(school, m_title, m_category) sigma = 0.8 * sigma_self + 0.2 * sigma_knn else: sigma = sigma_knn return median, sigma, similar_median_array def find_growth(self, school_name, major_title, major_category): matched_records = self.salary_growth.loc[[school_name]] if major_title in list(matched_records.major_title): growth = list(matched_records.loc[matched_records['major_title'] == major_title, ['growth_rate_2','growth_rate_3','growth_rate_4','growth_rate_5']].mean()) elif major_category in np.unique(matched_records.major_category): growth = list(matched_records.loc[matched_records['major_category'] == major_category, ['growth_rate_2','growth_rate_3','growth_rate_4','growth_rate_5']].mean()) else: growth = list(matched_records[['growth_rate_2','growth_rate_3','growth_rate_4','growth_rate_5']].mean()) return growth def get_growth(self, school, m_title): m_category, state = self.match_input(school, m_title) similar_schools = self.get_growth_neighbors(school, k = 3) similar_schools_else = self.get_growth_neighbors_else(school, k = 3) growth_array = [] growth_array_else = [] for ss in similar_schools: g1 = self.find_growth(ss, m_title, m_category) growth_array.append(g1) growth_array = np.array(growth_array) for sss in similar_schools_else: g2 = self.find_growth(sss, m_title, m_category) growth_array_else.append(g2) growth_array_else = np.array(growth_array_else) salary_growth_all = np.mean(growth_array, axis = 0) salary_growth_else = np.mean(growth_array_else, axis = 0) salary_growth = self.salary_growth_outlier_weight * salary_growth_all + (1-self.salary_growth_outlier_weight) * salary_growth_else return salary_growth def input_check(self,school): output = 1 if school not in list(self.school_clustering['school_name']): print('Error: School name \"{}\" is not recorded!'.format(school)) output = 0 return output def match_input(self, school, major): mc = self.major_list.loc[self.major_list['major_title'] == 
major, 'major_category'].values[0]
        state = np.array(self.school_clustering.loc[
            self.school_clustering['school_name'] == school, 'state'])[0]
        return mc, state

    def find_estimate(self, school, majorID, k=5):
        school = school.lower()
        check_result = self.input_check(school)
        output = {}
        if check_result == 0:
            output['Error'] = {}
            output['Error']['salary_year_1'] = -1
            output['Error']['salary_year_2'] = -1
            output['Error']['salary_year_3'] = -1
            output['Error']['salary_year_4'] = -1
            output['Error']['salary_year_5'] = -1
            output['Error']['sigma'] = -1
        else:
            if not isinstance(majorID, list):
                majorID = [majorID]
            for m in majorID:
                if int(m) < 1 or int(m) > max(self.major_list['majorID']):
                    print('Error: majorID does not exist!')
                    output['Error'] = {}
                    output['Error']['salary_year_1'] = -1
                    output['Error']['salary_year_2'] = -1
                    output['Error']['salary_year_3'] = -1
                    output['Error']['salary_year_4'] = -1
                    output['Error']['salary_year_5'] = -1
                    output['Error']['sigma'] = -1
                    output['Error']['similar_schools'] = -1
                    output['Error']['similar_salary'] = -1
                else:
                    m_title = (self.major_list.loc[
                        self.major_list['majorID'] == int(m),
                        'major_title'].values)[0]
                    output[m_title] = {}
                    median, sigma, similar_median = self.get_value(school, m_title, k)
                    salary_growth = self.get_growth(school, m_title)
                    output[m_title]['salary_year_1'] = median
                    output[m_title]['salary_year_2'] = output[m_title]['salary_year_1'] * (1 + salary_growth[0])
                    output[m_title]['salary_year_3'] = output[m_title]['salary_year_2'] * (1 + salary_growth[1])
                    output[m_title]['salary_year_4'] = output[m_title]['salary_year_3'] * (1 + salary_growth[2])
                    output[m_title]['salary_year_5'] = output[m_title]['salary_year_4'] * (1 + salary_growth[3])
                    output[m_title]['sigma'] = sigma
                    output[m_title]['similar_schools'] = self.salary_similar_schools
                    output[m_title]['similar_salary'] = similar_median
        return output
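Because KDTree distances weight every column equally, the MinMaxScaler step in `build_tree` is what keeps, say, tuition (tens of thousands of dollars) from drowning out the 0/1 one-hot state columns. The general pattern, as a small sketch:

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KDTree

# Mixed-scale features: tuition in dollars next to indicator columns
X = np.c_[np.random.uniform(5000, 60000, 300),  # tuition
          np.random.randint(0, 2, (300, 5))]    # one-hot style flags

scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
X_scaled = scaler.transform(X)

tree = KDTree(X_scaled)
# Neighbors of the first school: all features now contribute comparably
_, ind = tree.query(X_scaled[:1], k=4)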
def main(): parser = argparse.ArgumentParser() parser.add_argument('--gps', '-G', type=str, help="GPS file to run") parser.add_argument('--atm', '-A', type=str, help='ATM directory to run') parser.add_argument('--hemisphere', '-H', type=int, default=-1, help='hemisphere, must be 1 or -1') parser.add_argument('--query', '-Q', type=float, default=100, help='KD-Tree query radius') parser.add_argument('--median', '-M', default=False, action='store_true', help='Run block median') parser.add_argument('--scan', '-S', default=False, action='store_true', help='Run ATM scan fit') parser.add_argument('--verbose', '-v', default=False, action='store_true', help='verbose output of run') args = parser.parse_args() if args.hemisphere == 1: SRS_proj4 = '+proj=stere +lat_0=90 +lat_ts=70 +lon_0=-45 +k=1 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs ' elif args.hemisphere == -1: SRS_proj4 = '+proj=stere +lat_0=-90 +lat_ts=-71 +lon_0=0 +k=1 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs' # tilde expansion of file arguments GPS_file = os.path.expanduser(args.gps) fileBasename, fileExtension = os.path.splitext(GPS_file) ATM_dir = os.path.expanduser(args.atm) print("working on GPS file {0}, ATM directory {1}".format( GPS_file, ATM_dir)) if args.verbose else None # find Qfit files within ATM_dir Qfit_regex = re.compile( r"ATM1B.*_(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2}).*.h5") Qfit_files = [ os.path.join(ATM_dir, f) for f in os.listdir(ATM_dir) if Qfit_regex.search(f) ] # output directory out_dir = os.path.join(ATM_dir, 'xovers') if not os.path.isdir(out_dir): os.mkdir(out_dir) # output file out_file = 'vs_{0}.h5'.format(os.path.basename(fileBasename)) # check if output file exists if os.path.isfile(os.path.join(out_dir, out_file)): print("found: {0}".format(os.path.join( out_dir, out_file))) if args.verbose else None # read GPS HDF5 file GPS_field_dict = {None: ['latitude', 'longitude', 'z']} GPS = pc.data().from_h5(GPS_file, field_dict=GPS_field_dict).get_xy(SRS_proj4) # run block median over GPS data if args.median: GPS = blockmedian_for_gps(GPS, 5) # read all Qfit files within ATM directory Qlist = list() for f in sorted(Qfit_files): Qlist.append(pc.ATM_Qfit.data().from_h5(f)) # merge the list of ATM data and build the search tree Q_full = pc.data().from_list(Qlist).get_xy(SRS_proj4) # fit scan parameters to an ATM data structure if args.scan: Q_full = fit_ATM_data(Q_full) # run block median for qsub if args.median: Q_full = blockmedian_for_qsub(Q_full, 5) # construct search tree from ATM Qfit coords # pickle Qtree to save computational time for future runs if os.path.isfile(os.path.join(ATM_dir, 'tree.p')): Qtree = pickle.load(open(os.path.join(ATM_dir, 'tree.p'), 'rb')) else: Qtree = KDTree(np.c_[Q_full.x, Q_full.y]) pickle.dump(Qtree, open(os.path.join(ATM_dir, 'tree.p'), 'wb')) # output fields out_fields = [ 'x', 'y', 'z', 'longitude', 'latitude', 't_qfit', 'h_qfit_50m', 'sigma_qfit_50m', 'dz_50m', 'RDE_50m', 'N_50m', 'hbar_20m', 'h_qfit_10m', 'sigma_qfit_10m', 'dz_10m', 'RDE_10m', 'N_10m', 'x_10m_mean', 'y_10m_mean' ] # append scan fields to output template if args.scan: out_fields.extend(['scan_XT_50m', 'scan_XT_10m']) out_template = {f: np.NaN for f in out_fields} out = list() # query the search tree to find points within query radius Qquery = Qtree.query_radius(np.c_[GPS.x, GPS.y], args.query) # indices of GPS points within bin ind, = np.nonzero([np.any(i) for i in Qquery]) # loop over queries in the GPS data for i in ind: GPSsub = GPS.copy_subset(np.array([i])) # grab the Qfit bins around the GPS bin 
Qdata = Q_full.copy_subset(Qquery[i], by_row=True) Qdata.index( np.isfinite(Qdata.elevation) & np.isfinite(Qdata.latitude) & np.isfinite(Qdata.longitude)) # create output dictionary of GPS and plane-fit ATM comparison this_out = compare_gps_with_qfit(GPSsub, Qdata, out_template) if this_out is not None: out.append(this_out) # if there were overlapping points between the GPS and ATM data if out: D = dict() with h5py.File(os.path.join(out_dir, out_file), 'w') as h5f: for field in out[0].keys(): D[field] = np.array([ii[field] for ii in out]) print(field, D[field].dtype) if args.verbose else None h5f.create_dataset(field, data=D[field])
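`query_radius` over many centers, as used for `Qquery` above, returns an object array holding one index array per center; the `np.nonzero` filter then keeps only the centers that actually found points. The pattern in isolation:

import numpy as np
from sklearn.neighbors import KDTree

atm_xy = np.random.rand(50000, 2) * 1000.0   # stand-in for np.c_[Q_full.x, Q_full.y]
gps_xy = np.random.rand(200, 2) * 1000.0     # stand-in for np.c_[GPS.x, GPS.y]

tree = KDTree(atm_xy)
query = tree.query_radius(gps_xy, r=100.0)   # object array: one index array per center

# Indices of GPS points with at least one ATM point inside the radius
ind, = np.nonzero([len(i) > 0 for i in query])
for i in ind:
    neighbors = query[i]                     # rows of atm_xy near gps_xy[i]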