import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors


class NNScope:
    def get_minolab(self):
        tmp = pd.Series(self.y)
        tmp = tmp.value_counts()
        return min(tmp.keys(), key=lambda o: tmp[o])

    def normalization(self):
        self.X -= np.mean(self.X, axis=0)
        self.X /= np.sqrt(np.var(self.X, axis=0))

    def __init__(self, X, y, k):
        self.X = np.array(X, dtype='float64')
        self.normalization()
        self.y = y
        self.minolab = self.get_minolab()
        self.nn = NearestNeighbors(n_neighbors=k, n_jobs=-1)
        self.nn.fit(self.X)
        self.nn_maj = NearestNeighbors(n_neighbors=k, n_jobs=-1)
        self.nn_maj.fit(self.X[y != self.minolab])
        self.distr = None  # how many minority samples have a given number of minority neighbors

    def calc_ratio(self):
        dis_all, _ = self.nn.kneighbors()
        dis_all = dis_all[self.y == self.minolab]
        dis_maj, _ = self.nn_maj.kneighbors(self.X[self.y == self.minolab])
        self.WBNR = np.sqrt(np.mean(dis_all ** 2, axis=1) / np.mean(dis_maj ** 2, axis=1))

    def show_ratio_distr(self):
        plt.hist(self.WBNR, bins=20)
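# A minimal usage sketch for NNScope (my addition, not part of the original code);
# the toy imbalanced data set below is purely illustrative.
rng = np.random.RandomState(0)
X_toy = np.vstack([rng.randn(100, 2), rng.randn(10, 2) + 2.0])  # 100 majority, 10 minority points
y_toy = np.array([0] * 100 + [1] * 10)

scope = NNScope(X_toy, y_toy, k=5)
scope.calc_ratio()        # ratio of distances to the k nearest points overall vs. the k nearest majority points
scope.show_ratio_distr()  # histogram of that ratio over the minority samples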
def k_nearest_neighbors_scores(k, eng_vec_dict, fr_vec_dict):
    eng_mat, fr_mat, index_map = build_parallel_mats_from_dicts(eng_vec_dict, fr_vec_dict, translation_dict)
    # k + 1 since we discard the top neighbor, which is itself
    neighbors_en = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(eng_mat)
    dist_en, indices_en = neighbors_en.kneighbors(eng_mat)
    neighbors_fr = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(fr_mat)
    dist_fr, indices_fr = neighbors_fr.kneighbors(fr_mat)
    # Since we built the matrices in parallel, we know that indices map to each other,
    # so we simply check their overlap to calculate precision and recall.
    # Calculate the average recall for k-recall.
    avg_recall = 0.
    num_points = len(indices_en)
    knearest_map_en = dict()
    knearest_map_fr = dict()
    for i in range(num_points):
        w_en = index_map[i][0]
        w_fr = index_map[i][1]
        index_set_en = set(indices_en[i][1:])  # should be size k
        index_set_fr = set(indices_fr[i][1:])  # should be size k
        if w_en not in knearest_map_en:
            knearest_map_en[w_en] = [index_map[z] for z in index_set_en]
        if w_fr not in knearest_map_fr:
            knearest_map_fr[w_fr] = [index_map[z] for z in index_set_fr]
        recall_count = sum(1 for idx in index_set_fr if idx in index_set_en)
        # precision = recall for this task
        recall = float(recall_count) / len(index_set_en)
        avg_recall += recall
    return (avg_recall / num_points), knearest_map_en, knearest_map_fr
from sklearn.neighbors import NearestNeighbors as NN


def nearestN():
    X = [[125, 1], [200, 0], [70, 0], [240, 1], [114, 0],
         [120, 0], [264, 1], [85, 0], [150, 0], [90, 0]]
    # y = [0, 0, 0, 0, 1, 0, 0, 1, 0, 1]
    model = NN(n_neighbors=1, radius=1)
    model.fit(X)
    y = [[98., 0.]]  # the query must be 2-D: one sample with two features
    print(model.kneighbors(y))
def main():
    vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=0.0)
    nei = NearestNeighbors(algorithm='brute', metric='jaccard')
    matrix = vectorizer.fit_transform(training_set).todense()
    new_matrix = vectorizer.transform(new_comments).todense()
    nei.fit(matrix)
    path = '{0}/'.format(pathsplit(abspath(__file__))[0])
    jsonfile = open(path + '{0}-nn.json'.format(n_neighbors), 'w')
    nodes = [{'name': (training_set + new_comments)[i], 'group': (groups + new_groups)[i]}
             for i in range(len(training_set + new_comments))]
    links = []
    for i in range(len(matrix)):
        dist, idnei = nei.kneighbors(matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source": i, "target": idnei[j + 1], "value": 10 * (1 - dist[j + 1])})
    for i in range(len(new_comments)):
        dist, idnei = nei.kneighbors(new_matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source": len(matrix) + i, "target": idnei[j], "value": 10 * (1 - dist[j + 1])})
    jsondumped = json.dumps({'nodes': nodes, 'links': links}, indent=2)
    jsonfile.write(jsondumped)
def estimator_knn_cv(X, y, clf, n_neigh): neigh = NearestNeighbors(n_neigh, metric="euclidean", algorithm="brute") neigh_est = NearestNeighbors(n_neigh, metric="manhattan", algorithm="brute") acc = [] for train, test in StratifiedKFold(y, 5): X_train = X[train] y_train = y[train] X_test = X[test] y_test = y[test] clf.fit(X_train, y_train) estimators = clf.estimators_ preds_train = np.array(map(lambda e: e.predict(X_train), estimators)).T preds_test = np.array(map(lambda e: e.predict(X_test), estimators)).T preds_train_proba = np.array(map(lambda e: e.predict_proba(X_train), estimators)) preds_test_proba = np.array(map(lambda e: e.predict_proba(X_test), estimators)) p_train = preds_train_proba.swapaxes(0, 1)[:, :, 0] p_test = preds_test_proba.swapaxes(0, 1)[:, :, 0] neigh.fit(X_train) dist, knn = neigh.kneighbors(X_test) neigh_est.fit(preds_train) dist, knn_est = neigh_est.kneighbors(preds_test) # neigh_est.fit(p_train);dist, knn_est = neigh_est.kneighbors(p_test) knn_combined_uniq = np.array(map(np.unique, np.hstack((knn[:, :30], knn_est[:, :30])))) pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn_combined_uniq]) # pp_uniq = np.array([stats.mode(y_train[nn])[0][0] for nn in knn[:,:30]]) preds_test_est_knn = np.array( [[stats.mode(y_train[nn])[0][0] for nn in knn_est[:, :i]] for i in xrange(1, n_neigh, 2)] ) acc.append( [accuracy_score(y_test, pred) for pred in np.vstack((preds_test_est_knn, clf.predict(X_test), pp_uniq))] ) mean_acc = np.mean(acc, axis=0) print " ".join("{:.3f}".format(v) for v in mean_acc), " max:{:.3f}".format(mean_acc.max())
def resample(self):
    # Start with the minority class
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # Finding nns
    # Import the k-NN classifier
    from sklearn.neighbors import NearestNeighbors

    nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1)
    nearest_neighbour.fit(minx)
    nns = nearest_neighbour.kneighbors(minx, return_distance=False)[:, 1:]

    # Creating synthetic samples
    sx, sy = self.make_samples(
        minx, minx, self.minc, nns, int(self.ratio * len(miny)),
        random_state=self.rs, verbose=self.verbose
    )

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx), axis=0)
    ret_y = concatenate((self.y, sy), axis=0)

    # Find the nearest neighbour of every point
    nn = NearestNeighbors(n_neighbors=2)
    nn.fit(ret_x)
    nns = nn.kneighbors(ret_x, return_distance=False)[:, 1]

    # Send the information to the is_tomek function to get a boolean vector back
    links = self.is_tomek(ret_y, nns, self.minc, self.verbose)

    if self.verbose:
        print("Over-sampling performed: " + str(Counter(ret_y[logical_not(links)])))

    # Return the data set without majority Tomek links.
    return ret_x[logical_not(links)], ret_y[logical_not(links)]
def estimate_dimension(X, n_neighbors='auto', neighbors_estimator=None):
    """Estimate intrinsic dimensionality.

    Based on "Manifold-Adaptive Dimension Estimation"
    by Farahmand, Szepesvari and Audibert, ICML 2007.

    Parameters
    ----------
    X : nd-array, shape (n_samples, n_features)
        Input data.

    n_neighbors : int or 'auto', default='auto'
        Number of neighbors used for the estimate.
        'auto' means ``np.floor(2 * np.log(n_samples))``.

    neighbors_estimator : NearestNeighbors object or None, default=None
        A pre-fitted neighbors object to speed up calculations.
    """
    if n_neighbors == 'auto':
        n_neighbors = np.floor(2 * np.log(X.shape[0])).astype("int")
    if neighbors_estimator is None:
        neighbors_estimator = NearestNeighbors(n_neighbors=n_neighbors)
        neighbors_estimator.fit(X)
    full_dist = neighbors_estimator.kneighbors(X, n_neighbors=n_neighbors)[0][:, -1]
    half_dist = neighbors_estimator.kneighbors(X, n_neighbors=n_neighbors // 2)[0][:, -1]
    est = np.log(2) / np.log(full_dist / half_dist)
    est = np.minimum(est, X.shape[1])
    return np.round(np.mean(est))
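# A short usage sketch for estimate_dimension (my addition). The data is an illustrative
# 2-D plane embedded in 5-D space, so the estimate should come out close to 2.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
basis = rng.randn(2, 5)
X_plane = rng.randn(500, 2) @ basis
print(estimate_dimension(X_plane))  # expected to print roughly 2.0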
def resample(self): # Start with the minority class minx = self.x[self.y == self.minc] miny = self.y[self.y == self.minc] # Finding nns # Import the k-NN classifier from sklearn.neighbors import NearestNeighbors nearest_neighbour = NearestNeighbors(n_neighbors=self.k + 1) nearest_neighbour.fit(minx) nns = nearest_neighbour.kneighbors(minx, return_distance=False)[:, 1:] # Creating synthetic samples sx, sy = self.make_samples( minx, minx, self.minc, nns, int(self.ratio * len(miny)), random_state=self.rs, verbose=self.verbose ) # Concatenate the newly generated samples to the original data set ret_x = concatenate((self.x, sx), axis=0) ret_y = concatenate((self.y, sy), axis=0) # Create a k-NN to fit the whole data nn_obj = NearestNeighbors(n_neighbors=self.size_ngh) # Fit the whole dataset nn_obj.fit(ret_x) # Loop over the other classes under picking at random for key_idx, key in enumerate(self.ucd.keys()): # Get the sample of the current class sub_samples_x = ret_x[ret_y == key] sub_samples_y = ret_y[ret_y == key] # Find the NN for the current class nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False) # Get the label of the corresponding to the index nnhood_label = ret_y[nnhood_idx] == key # Check which one are the same label than the current class # Make an AND operation through the k neighbours nnhood_bool = np.all(nnhood_label, axis=1) # Get the samples which agree all together sel_x = np.squeeze(sub_samples_x[np.nonzero(nnhood_bool), :]) sel_y = sub_samples_y[np.nonzero(nnhood_bool)] if key_idx == 0: underx = sel_x[:, :] undery = sel_y[:] else: underx = concatenate((underx, sel_x), axis=0) undery = concatenate((undery, sel_y), axis=0) if self.verbose: print("Over-sampling performed: " + str(Counter(undery))) return underx, undery
import numpy as np
from sklearn.neighbors import NearestNeighbors


def eucl_distance(a, b):
    # Fit a 1-NN model on each point set, skipping empty sets.
    nbrs_a = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(a) if a.size > 0 else None
    nbrs_b = NearestNeighbors(n_neighbors=1, algorithm='kd_tree').fit(b) if b.size > 0 else None
    # Distance from every point of b to its nearest point in a, and vice versa.
    distances_a, _ = nbrs_a.kneighbors(b) if nbrs_a and b.size > 0 else ([np.inf], None)
    distances_b, _ = nbrs_b.kneighbors(a) if nbrs_b and a.size > 0 else ([np.inf], None)
    return [distances_a, distances_b]
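# A small usage sketch for eucl_distance (my addition); the two 2-D point sets are illustrative.
a = np.array([[0.0, 0.0], [1.0, 1.0]])
b = np.array([[0.1, 0.0], [2.0, 2.0]])
dist_b_to_a, dist_a_to_b = eucl_distance(a, b)
# dist_b_to_a[i] is the distance from b[i] to its nearest point in a, and vice versa.
print(dist_b_to_a.ravel(), dist_a_to_b.ravel())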
def on_pick(self, event): ind = event.ind[0] arty = event.artist for key in nld.layers.keys(): layer = nld.layers[key] for plot in layer.plots: if plot is arty: self.neighb_sec = key break nbrs = NearestNeighbors(n_neighbors=50, n_jobs=1).fit(X) distances, indices = nbrs.kneighbors(X) #nbrs.fit(X) #W = barycenter_kneighbors_graph( # nbrs, n_neighbors=50, reg=1e-3, n_jobs=1) #knn = kneighbors_graph(X, 10).to_array() try: self.scatters.remove() self.two_scatters.remove() except: pass self.points = indices[ind] neighb_layer = nld.get_layer(self.neighb_sec) self.points = X[self.points] # self.points = [neighb_layer.x_data[0][ind], neighb_layer.y_data[0][self.points], neighb_layer.z_data[0][self.points]] section_num = int(self.neighb_sec[-1]) section_ax = self.fig.get_axes()[section_num + 1] section_layer = section_ax.get_layer(section_ax.title._text + ' proj') self.scatters = nld.scatter(self.points[:, 0], self.points[:, 1], self.points[:, 2], c='yellow', s=80) two_mat = np.column_stack((section_layer.x_data[0], section_layer.y_data[0])) two_nbrs = NearestNeighbors(n_neighbors=50, n_jobs=1).fit(two_mat) two_dists, two_inds = two_nbrs.kneighbors(two_mat) two_points = two_mat[two_inds[ind]] self.two_scatters = section_ax.scatter(two_points[:, 0], two_points[:, 1], c='green', s=80)
class KDTrees: def __init__(self, nb_neighbours, leaf_size): self.nbrs = NearestNeighbors(n_neighbors=nb_neighbours, algorithm='ball_tree', metric = 'haversine', leaf_size=leaf_size) # Compute distance in time between two points on the map def mapDistance(self, x, y): if (len(x) > 2): return np.sum((x - y) ** 2) else: if(x[0] < y[0]): tmp = y y = x x = tmp pos1 = str(x[0]) + ", " + str(x[1]) pos2 = str(y[0]) + ", " + str(y[1]) timestamp = datetime.now() sec_to_add = 32 * 3600 + (timestamp - datetime(1970, 1, 1)).total_seconds() - 2*3600 - timestamp.hour * 3600 - timestamp.minute * 60 - timestamp.second traject = gmaps.directions(pos1, pos2, mode="transit", departure_time=timestamp.fromtimestamp(sec_to_add)) try: print 'ok' return (traject[0]["legs"][0]["arrival_time"]["value"] - traject[0]["legs"][0]["departure_time"]["value"]) except: print 'bug' return 1000000000 def addPoints(self, points): self.nbrs.fit(points) def getNeighbours(self, points): self.nbrs.kneighbors(points)
def adasyn_sample(X, Y, minclass, K=5, n=200):
    indices = np.nonzero(Y == minclass)
    Ymin = Y[indices]
    Xmin = X[indices]
    Cmin = len(indices[0])
    Xs = []
    if n > Cmin:
        Xs.append(Xmin)
        n -= len(Ymin)
    else:
        # simple random undersampling without replacement
        return Xmin[random.sample(range(Cmin), n)]
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(X)
    nindices = neigh.kneighbors(Xmin, K, False)
    gamma = [float(sum(Y[i] == minclass)) / K for i in nindices]
    gamma = gamma / np.linalg.norm(gamma, ord=1)
    neigh = NearestNeighbors(n_neighbors=30)
    neigh.fit(Xmin)
    N = np.round(gamma * n).astype(int)
    assert len(N) == Cmin
    for (i, nn) in enumerate(N):
        nindices = neigh.kneighbors(Xmin[i], K, False)[0]
        for j in range(nn):
            alpha = random.random()
            # the neighbour indices refer to rows of Xmin, since the model was fitted on Xmin
            Xnn = Xmin[random.choice(nindices)]
            Xs.append((1. - alpha) * Xmin[i] + alpha * Xnn)
    Xadasyn = sparse.vstack(Xs)
    return Xadasyn
def RunAllKnnScikit(q): totalTimer = Timer() # Load input dataset. # If the dataset contains two files then the second file is the query file # In this case we add this to the command line. Log.Info("Loading dataset", self.verbose) if len(self.dataset) == 2: referenceData = np.genfromtxt(self.dataset[0], delimiter=',') queryData = np.genfromtxt(self.dataset[1], delimiter=',') else: referenceData = np.genfromtxt(self.dataset, delimiter=',') with totalTimer: # Get all the parameters. k = re.search("-k (\d+)", options) leafSize = re.search("-l (\d+)", options) if not k: Log.Fatal("Required option: Number of furthest neighbors to find.") q.put(-1) return -1 else: k = int(k.group(1)) if (k < 1 or k > referenceData.shape[0]): Log.Fatal("Invalid k: " + k.group(1) + "; must be greater than 0" + " and less or equal than " + str(referenceData.shape[0])) q.put(-1) return -1 if not leafSize: l = 20 elif int(leafSize.group(1)) < 0: Log.Fatal("Invalid leaf size: " + str(leafSize.group(1)) + ". Must" + " be greater than or equal to 0.") q.put(-1) return -1 else: l = int(leafSize.group(1)) try: # Perform All K-Nearest-Neighbors. model = NearestNeighbors(n_neighbors=k, algorithm='kd_tree', leaf_size=l) model.fit(referenceData) if len(self.dataset) == 2: out = model.kneighbors(queryData, k, return_distance=True) else: # We have to increment k by one because mlpack ignores the # self-neighbor, whereas scikit-learn will happily return the # nearest neighbor of point 0 as point 0. out = model.kneighbors(referenceData, k + 1, return_distance=True) except Exception as e: q.put(-1) return -1 time = totalTimer.ElapsedTime() q.put(time) return time
def findKNN(frequencyVector, newVector):
    samples = np.array(frequencyVector)
    neigh = NearestNeighbors(n_neighbors=5, metric="euclidean")
    neigh.fit(samples)
    indexList = neigh.kneighbors(newVector, return_distance=False).tolist()
    a = neigh.kneighbors(newVector)
    print(a)
    return indexList
class KNearestNeighbours(MLClassifierBase): """k Nearest Neighbours multi-label classifier.""" BRIEFNAME = "MLkNN" def __init__(self, k = 10, s = 1.0): super(KNearestNeighbours, self).__init__(None) self.k = k # Number of neighbours self.s = s # Smooth parameter def compute_prior(self, y): prior_prob_true = [] prior_prob_false = [] for label in xrange(self.num_labels): prior_prob_true.append(float(self.s + sum(instance[label] == 1 for instance in y)) / (self.s * 2 + self.num_instances)) prior_prob_false.append(1 - prior_prob_true[-1]) return prior_prob_true, prior_prob_false def compute_cond(self, X, y): self.knn = NearestNeighbors(self.k).fit(X) c = [[0] * (self.k + 1) for label in xrange(self.num_labels)] cn = [[0] * (self.k + 1) for label in xrange(self.num_labels)] for instance in xrange(self.num_instances): neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False) for label in xrange(self.num_labels): delta = sum(y[neighbor][label] for neighbor in neighbors[0]) (c if y[instance][label] == 1 else cn)[label][delta] += 1 cond_prob_true = [[0] * (self.k + 1) for label in xrange(self.num_labels)] cond_prob_false = [[0] * (self.k + 1) for label in xrange(self.num_labels)] for label in xrange(self.num_labels): for neighbor in xrange(self.k + 1): cond_prob_true[label][neighbor] = (self.s + c[label][neighbor]) / (self.s * (self.k + 1) + sum(c[label])) cond_prob_false[label][neighbor] = (self.s + cn[label][neighbor]) / (self.s * (self.k + 1) + sum(cn[label])) return cond_prob_true, cond_prob_false def fit(self, X, y): self.predictions = y; self.num_instances = len(y) self.num_labels = len(y[0]) # Computing the prior probabilities self.prior_prob_true, self.prior_prob_false = self.compute_prior(y) # Computing the posterior probabilities self.cond_prob_true, self.cond_prob_false = self.compute_cond(X, y) return self def predict(self, X): result = np.zeros((len(X), self.num_labels), dtype='i8') for instance in xrange(len(X)): neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False) for label in xrange(self.num_labels): delta = sum(self.predictions[neighbor][label] for neighbor in neighbors[0]) p_true = self.prior_prob_true[label] * self.cond_prob_true[label][delta] p_false = self.prior_prob_false[label] * self.cond_prob_false[label][delta] prediction = (p_true >= p_false) result[instance][label] = int(prediction) return result
def sample(s): if s.data is None: raise ValueError('data not loaded.') mdl = NearestNeighbors(n_neighbors=s.k1, n_jobs=-1) mdl.fit(s.X) _, nei_table = mdl.kneighbors() # the index of those minority points with minority neighbors noise_mino_idx = filter(lambda o: sum(s.y[nei_table[o]] == s.minolab) != 0 and s.y[o] == s.minolab, range(s.X.shape[0])) minoX = s.X[s.y == s.minolab] majX = s.X[s.y == s.majlab] mdl_maj = NearestNeighbors(n_neighbors=s.k2, n_jobs=-1) mdl_maj.fit(majX) # all majority examples on the bound _, tmp = mdl_maj.kneighbors(s.X[noise_mino_idx]) # remove dumplicate examples bound_maj_idx = np.unique(np.reshape(tmp, (1, -1))[0]) mdl_mino = NearestNeighbors(n_neighbors=s.k3, n_jobs=-1) mdl_mino.fit(minoX) # find minority examples on the bound backward _, tmp = mdl_mino.kneighbors(majX[bound_maj_idx]) bound_mino_idx = np.unique(np.reshape(tmp, (1, -1))[0]) bound_maj = majX[bound_maj_idx] bound_mino = minoX[bound_mino_idx] # difference matrix, shape = (majN, minoN). # Due to broadcast(strech), diff[i][j][k] would be maj[i][k]-mino[j][k], # thus vector diff[i][j]=maj[i]-mino[j] representing the outer vector diff. diff = bound_maj[:, None, :] - bound_mino Cf = lambda o: min(s.X.shape[1] / np.linalg.norm(o, 2), s.Cfth) * 1.0 / s.Cfth CM = np.apply_along_axis(Cf, 2, diff) W = np.mean(((CM * CM).T / np.sum(CM, axis=1)).T, axis=0) # P is the normalized Weight Vector, standing for the probability chosen to synthese P = W / np.sum(W) # np.save(open('W-{0}.ndarray'.format(s.mdl_args["gamma"]), 'w'), CM) # choose N bound minority examples to synthese, selection probability accroding to their weight chosen = np.random.choice(range(len(P)), size=s.N, p=P) chosenp = bound_mino[chosen] # would not implement CLUSTERING in MWMOTE, I could see no effort of that but time-consumption. _, nei = mdl_mino.kneighbors(chosenp, s.k1) dualp = minoX[[i[int(np.random.rand() * s.k1)] for i in nei]] generated = chosenp + np.random.rand(s.N, 1) * (dualp - chosenp) ret = np.hstack((np.vstack((minoX, generated, majX)), np.array([s.minolab] * (minoX.shape[0] + s.N) + [s.majlab] * majX.shape[0])[:, None])) np.random.shuffle(ret) return ret
def get_station_nearest_neighbors_list(self, station, nps, n): """ Returns the n nearest neighbors stations to given station among the stations in passed data frame "df". Args: station <string>: The station code for which nearest neighbors are needed. nps <int>: Number of previous stations to choose stations having nps model. n <int>: Number of nearest neighbors needed. """ # Choose the stations who have the respective nps models. # If the unknown station occurs as 3rd station in the complete journey, then # the nearest known station should have a 3 previous station model and so on. stns_hvng_nps_mdls = self._pdr.get_stations_having_nps_model_list(nps) # Get the station features data frame for known stations having nps models df = self._pdr.get_known_596_stations_features_df() df = df[df.Station.isin(stns_hvng_nps_mdls)] query_stn_feature = [[self._stn_geo_crdnates[station][0], self._stn_geo_crdnates[station][1], self._stn_deg_strength[station], self._stn_tfc_strength[station]]] # First choose neighbors which are geographically closer lat_lon_df = df[["Latitude", "Longitude"]] lat_lon_query_stn_ftr = [[self._stn_geo_crdnates[station][0], self._stn_geo_crdnates[station][1]]] ll_nbrs = NN(n_neighbors=n, algorithm="auto").fit(lat_lon_df) # ll_indices are directly indexed corresponding to stns_hvng_nps_mdls ll_distances, ll_indices = ll_nbrs.kneighbors(lat_lon_query_stn_ftr) # Subselect the chosen stations features from the complete station # features df. selected_station_fts_df = self._get_selected_stations_df(ll_indices[0], df) # Then choose neighbors based on degree and traffic strength among the # above chosen geographically closer stations. deg_tfc_df = selected_station_fts_df[["Degree_Strength", "Traffic_Strength"]] deg_tfc_query_stn_ftr = [[self._stn_deg_strength[station], self._stn_tfc_strength[station]]] dt_nbrs = NN(n_neighbors=n, algorithm="auto").fit(deg_tfc_df) # dt_indices are indexed with 0, so not directly related to # stns_hvng_nps_mdls dt_distances, dt_indices = dt_nbrs.kneighbors(deg_tfc_query_stn_ftr) # Once the dt_indices are obtained where the stations are arranged as per # increasing distance of degree and traffic strength features, get the # station codes from the df at those indices (since the dt_indices are # indexed from 0 onwards with respect to the ll_indices, hence the following # code). Also the ll_indices are with respect to the df. final_nearest_neighbors_stns_list = [df.iloc[ll_indices[0][idx]].Station for idx in dt_indices[0]] return final_nearest_neighbors_stns_list
def main(): if len(sys.argv) != 3: sys.stderr.write('Error: wrong number of arguments.\n') sys.stderr.write( 'Usage: %s <corpus path> <model path>\n' % (sys.argv[0],)) return 1 logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT) text = get_soap_data(sys.argv[1]) embedding = _get_w2v_embedding(sys.argv[2]) data = pandas.DataFrame() data["Transcript"] = text[0:200000] data["Transcript"] = data["Transcript"].str.lower() data["index_value"] = data.index vals = data["Transcript"].values logger.info('Averaging') at = Timer() vector_rep = [average_vector(v, embedding) for v in vals] logger.info('Averaging took %s s', at.elapsed()) # logger.info('Reassembling') # vector_rep = reduce(lambda a, b: a + b, vector_reps) # vector_rep = [average_vector(s, embedding) for s in vals] # logger.info('Saving vector...') # quick_save("big_ver", vector_rep) logger.info('Nearest neighbors fit') nnt = Timer() neighbors = NearestNeighbors( n_neighbors=10, metric="euclidean", algorithm='ball_tree') neighbors.fit(vector_rep) logger.info('Fitting took %s s', nnt.elapsed()) threshold = .6 # Of the top N, take the longest response for i in range(5): t = Timer() embedded = average_vector( 'how many women have you slept with', embedding) distance, indices = neighbors.kneighbors([embedded]) print 'Query time: %s s' % (t.elapsed(),) while True: sentence = raw_input("Enter some text:\n") sentence = sentence.lower() embedded = average_vector(sentence, embedding) distance, indices = neighbors.kneighbors([embedded]) for best in indices[0][0:5]: # Get the correct location best_match_index = data.iloc[best].index_value print 'Best match: %s' % ( data['Transcript'][best_match_index],) print 'Response1: %s' % ( data['Transcript'][best_match_index + 1],)
def match(GA_orig, GB_orig, order=3, max_depth=10, complexity=4): if len(GA_orig) > len(GB_orig): GA, GB = GB_orig.copy(), GA_orig.copy() logging.warning('Warning: reference graph is B not A') else: GA, GB = GA_orig.copy(), GB_orig.copy() # logging.warning('Matching graph A (%d nodes) to graph B (%d nodes)' % (len(GA_orig), len(GB_orig))) GA, GB = make_same_size(GA, GB) M = vertex_vectorize([GA, GB], complexity=complexity, normalization=True, inner_normalization=True) MA, MB = M[0], M[1] nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA) d, BprefA = nnA.kneighbors(MB) nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB) d, AprefB = nnB.kneighbors(MA) # mark bfv in vec attribute GA, GB = init_vec(GA), init_vec(GB) for k in range(order): ds = d[:, 0] id_max_A = np.argsort(ds)[k] id_max_B = AprefB[id_max_A][0] GA = annotate_with_bfs(GA, id_max_A, max_depth=max_depth) GB = annotate_with_bfs(GB, id_max_B, max_depth=max_depth) # draw_graph_set([GA,GB],n_graphs_per_line=2, size=9, secondary_vertex_label='vec') # vectorize 2nd time with real values this time M = vertex_vectorize([GA, GB], complexity=complexity, discrete=False, normalization=False, inner_normalization=False) MA, MB = M[0], M[1] nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA) d, BprefA = nnA.kneighbors(MB) nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB) d, AprefB = nnB.kneighbors(MA) A = ['A%d' % (i + 1) for i in range(len(GA))] B = ['B%d' % (i + 1) for i in range(len(GB))] Arankings = dict(((A[i], j + 1), B[AprefB[i, j]]) for i, j in product(range(len(GA)), range(len(GA)))) Brankings = dict(((B[i], j + 1), A[BprefA[i, j]]) for i, j in product(range(len(GB)), range(len(GB)))) rankings = Arankings rankings.update(Brankings) pairings = stable(rankings, A, B) # remove dummy node pairings npairings = trim_pairings(pairings, GA_orig, GB_orig) orderA, orderB = list(zip(*sorted(npairings))) return orderB
class KNearestNeighbours(ClassifierMixin): '''ML-KNN''' def __init__(self, k = 10, s = 1.0): super(KNearestNeighbours, self).__init__() self.k = k self.s = s def compute_prior(self, y): prior_prob_true = [] prior_prob_false = [] for label in xrange(self.num_labels): prior_prob_true.append(float(self.s + sum(instance[label] == 1 for instance in y)) / (self.s * 2 + self.num_instances)) prior_prob_false.append(1 - prior_prob_true[-1]) return prior_prob_true, prior_prob_false def compute_cond(self, X, y): self.knn = NearestNeighbors(self.k).fit(X) c = [[0] * (self.k + 1) for label in xrange(self.num_labels)] cn = [[0] * (self.k + 1) for label in xrange(self.num_labels)] for instance in xrange(self.num_instances): neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False) for label in xrange(self.num_labels): delta = sum(y[neighbor][label] for neighbor in neighbors[0]) (c if y[instance][label] == 1 else cn)[label][delta] += 1 cond_prob_true = [[0] * (self.k + 1) for label in xrange(self.num_labels)] cond_prob_false = [[0] * (self.k + 1) for label in xrange(self.num_labels)] for label in xrange(self.num_labels): for neighbor in xrange(self.k + 1): cond_prob_true[label][neighbor] = (self.s + c[label][neighbor]) / (self.s * (self.k + 1) + sum(c[label])) cond_prob_false[label][neighbor] = (self.s + cn[label][neighbor]) / (self.s * (self.k + 1) + sum(cn[label])) return cond_prob_true, cond_prob_false def fit(self, X, y): self.predictions = y; self.num_instances = len(y) self.num_labels = len(y[0]) self.prior_prob_true, self.prior_prob_false = self.compute_prior(y) self.cond_prob_true, self.cond_prob_false = self.compute_cond(X, y) return self def predict(self, X): result = np.zeros((len(X), self.num_labels), dtype='i8') for instance in xrange(len(X)): neighbors = self.knn.kneighbors(X[instance], self.k, return_distance=False) for label in xrange(self.num_labels): delta = sum(self.predictions[neighbor][label] for neighbor in neighbors[0]) p_true = self.prior_prob_true[label] * self.cond_prob_true[label][delta] p_false = self.prior_prob_false[label] * self.cond_prob_false[label][delta] prediction = (p_true >= p_false) result[instance][label] = int(prediction) return result
def pointwise_test(data, significance=0.05, standardize=False, plot=False): if standardize: data = standardize_mvn(data) n, p = data.shape k1, k2 = get_nbh_sizes(n, p) ## Step I: finding candidate modes nn = NearestNeighbors(k1, metric='euclidean').fit(data) possible_candidates = np.ones(n, dtype=np.bool) candidates = [] while np.sum(possible_candidates) > 0: distances, indices = nn.kneighbors(data[possible_candidates]) ind_new = np.argmin(distances[:, -1]) new_candidate = np.arange(n)[possible_candidates][ind_new] candidates.append(new_candidate) possible_candidates[indices[ind_new, :k2]] = False ## Step II: Thin out candidates non_modes = [] for i in candidates: mu = data[i, :] _, ind = nn.kneighbors(mu) ind = ind.ravel() X = data[ind[:k2], :] if hotelling_pval(X, mu) < 0.01: non_modes.append(i) modes = [i for i in candidates if not i in non_modes] ## Step III: SB-plot K = len(modes) in_other_modal_region = [] for i in range(K): if i in in_other_modal_region: continue for j in range(i+1, K): if j in in_other_modal_region: continue x = data[modes[i], :] y = data[modes[j], :] alpha = np.linspace(0, 1, 200).reshape(-1, 1) x_alpha = alpha*x + (1-alpha)*y dist_k1nn, _ = nn.kneighbors(x_alpha) d_k1nn = dist_k1nn[:, -1] SB_alpha = p*(np.log(d_k1nn) - np.log(max(d_k1nn[0], d_k1nn[-1]))) if (SB_alpha >= np.sqrt(2./k1)*norm.ppf(1-significance)).any() and plot: plt.plot(alpha, SB_alpha*np.sqrt(k1*1./2)) else: in_other_modal_region.append(j) modal_regions = [mode for j, mode in enumerate(modes) if not j in in_other_modal_region] if len(modal_regions) > 1: return True return False
def fit(self, XMeta, YMeta, YCaMeta, folder = "data/dataForMeta/"): #X ... features, y... trueValue, yC ... values predicted by classifier self.nrOfClassifiers = YCaMeta.shape[1] wholeTime, timeForRegion = 0,0 start = time.time() if self.printing: print("Starting to fit MetaDes") metaFeatures = [] metaResponse = [] nearestNeigbourRegion = NearestNeighbors(n_neighbors=self.K, metric=self.metric) nearestNeigbourRegion.fit(XMeta) nearestNeigbourOutputRegion = NearestNeighbors(n_neighbors=self.Kp, metric=self.metric) nearestNeigbourOutputRegion.fit(np.round(YCaMeta)) with open(folder+"MetaFeatures_K"+str(self.K)+"_Kp"+str(self.Kp)+".csv", "w") as fMetaFeatures: #we use this, because this can be very big folder, so we have to save incrementally in file for i, x in enumerate(XMeta): # for i in range(2000): x = XMeta[i] if(i%1000 == 0): print("Training examples covered: %d/%d" %(i, len(XMeta))) doc = DOC(np.round(YCaMeta[i]), mode=1)#degree of consensus, Morda premislit, kako to drugace dolocit if(doc <= self.hC): #we let in instances, where classifiers have smaller consensus than tresshold reg, opReg = {},{} start2 = time.time() # idxsReg = findRegion(XMeta, x, self.K, method='normalRegion') idxsReg = nearestNeigbourRegion.kneighbors(x, n_neighbors=self.K+1, return_distance=False)[0,1:] timeForRegion+= time.time() - start2 reg["X"], reg["Y"] = XMeta[idxsReg], YMeta[idxsReg] start2 = time.time() #idxsOP = findRegion(np.round(YCaMeta), np.round(YCaMeta[i]), self.Kp, method='outputProfileRegion') idxsOP = nearestNeigbourOutputRegion.kneighbors(np.round(YCaMeta[i]), n_neighbors=self.Kp + 1, return_distance=False)[0,1:] timeForRegion += time.time() - start2 opReg["X"], opReg["Y"] = XMeta[idxsOP], YMeta[idxsOP] for j, cls in enumerate(YCaMeta[i]): reg["YC"] = YCaMeta[idxsReg][:,j] #vzamemo vse response j-tega classifierja v okolici x opReg["YC"] = YCaMeta[idxsOP][:,j] f = computeMetaFeatures(reg, opReg) metaFeatures.append(list(f)) res = 1 if int(np.round(cls)) == int(np.round(YMeta[i])) else 0 metaResponse.append(res) [(fMetaFeatures.write(str(feat)), fMetaFeatures.write(",") if i != len(f)-1 else None) for i, feat in enumerate(f)] fMetaFeatures.write("\n") metaResponse = np.array(metaResponse) np.savetxt(folder+"MetaResponse_K"+str(self.K)+"_Kp"+str(self.Kp)+".csv", metaResponse, delimiter="\n") metaFeatures = np.array(metaFeatures) print("Fitting meta cls...") self.fitMetaCls(metaFeatures, metaResponse) print("Done!") wholeTime = time.time()-start print("For training metaDes we needed %d time for finding region out of %d \n " "so we spent %.3f for region seeking" %(timeForRegion, wholeTime, timeForRegion/wholeTime))
def embedding_refinement(data_matrix_highdim,
                         data_matrix_lowdim,
                         n_neighbors=8,
                         emb_quality_th=1,
                         n_iter=20):
    # extract the neighbors list for the high-dimensional case
    neigh_high = NearestNeighbors(n_neighbors=n_neighbors)
    neigh_high.fit(data_matrix_highdim)
    neighbors_list_highdim = neigh_high.kneighbors(data_matrix_highdim, return_distance=0)
    n_instances = data_matrix_lowdim.shape[0]
    logger.debug('refinements max num iters: %d  k in neqs: %d  num insts: %d' %
                 (n_iter, n_neighbors, n_instances))
    for it in range(n_iter):
        average_embedding_quality_score, scores = knn_quality_score(
            data_matrix_lowdim, neighbors_list_highdim, n_neighbors)
        # select low-quality embedded instances
        ids = [i for i, s in enumerate(scores)
               if relative_quality(i, scores, neighbors_list_highdim) <= emb_quality_th]
        # find the average position of the true knns and move each point there
        new_data_matrix_lowdim = compute_average(ids, data_matrix_lowdim, neighbors_list_highdim)
        new_average_embedding_quality_score, new_scores = knn_quality_score(
            new_data_matrix_lowdim, neighbors_list_highdim, n_neighbors)
        if new_average_embedding_quality_score > average_embedding_quality_score:
            data_matrix_lowdim = new_data_matrix_lowdim
            n_refinements = len(ids)
            frac_refinements = float(n_refinements) / n_instances
            logger.debug('r %.2d  neqs: %.3f \t %.2f (%d insts)' %
                         (it + 1, new_average_embedding_quality_score,
                          frac_refinements, n_refinements))
        else:
            break
    return data_matrix_lowdim
def resample(self):
    # Start with the minority class
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # Finding nns
    from sklearn.neighbors import NearestNeighbors

    print("Finding the %i nearest neighbours..." % self.k, end="")
    NN = NearestNeighbors(n_neighbors=self.k + 1)
    NN.fit(minx)
    nns = NN.kneighbors(minx, return_distance=False)[:, 1:]
    print("done!")

    # Creating synthetic samples
    print("Creating synthetic samples...", end="")
    sx, sy = make_samples(minx, minx, self.minc, nns,
                          int(self.ratio * len(miny)), random_state=self.rs)
    print("done!")

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx), axis=0)
    ret_y = concatenate((self.y, sy), axis=0)

    return ret_x, ret_y
def find_k_neighbors(points, neighbor_number=5):
    from sklearn.neighbors import NearestNeighbors
    import numpy as np

    X = np.array(points)
    neighbors = NearestNeighbors(n_neighbors=neighbor_number + 1, algorithm='ball_tree').fit(X)
    distances, indices = neighbors.kneighbors(X)
    # skip the first neighbor of each point, which is the point itself
    return [[str(point), [str(x) for x in indices[point][1:]]]
            for point in range(len(points))]
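# A brief usage sketch for find_k_neighbors (my addition); the points are illustrative.
points = [[0, 0], [0, 1], [1, 0], [5, 5], [5, 6], [6, 5]]
for point_id, neighbor_ids in find_k_neighbors(points, neighbor_number=2):
    print(point_id, neighbor_ids)  # e.g. '0' ['1', '2']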
def resample(self):
    from sklearn.neighbors import NearestNeighbors

    # Start with the minority class
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # Find the NNs for all samples in the data set.
    print("Finding the %i nearest neighbours..." % self.m, end="")
    NN = NearestNeighbors(n_neighbors=self.m + 1)
    NN.fit(self.x)
    print("done!")

    # Boolean array with True for minority samples in danger
    index = asarray([in_danger(x, self.y, self.m, miny[0], NN) for x in minx])

    # If all minority samples are safe, return the original data set.
    if not any(index):
        print('There are no samples in danger. No borderline synthetic samples created.')
        return self.x, self.y

    # Find the NNs among the minority class
    NN.set_params(**{'n_neighbors': self.k + 1})
    NN.fit(minx)
    nns = NN.kneighbors(minx[index], return_distance=False)[:, 1:]

    # Create synthetic samples for borderline points.
    sx, sy = make_samples(minx[index], minx, miny[0], nns,
                          int(self.ratio * len(miny)), random_state=self.rs)

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx), axis=0)
    ret_y = concatenate((self.y, sy), axis=0)

    return ret_x, ret_y
class KNNmodel():
    def __init__(self):
        self.knnModel = None

    def train(self, userFeatureTable, ratingsMat):
        # ad hoc fix: rescale age so that the feature's range is similar to the others
        userFeatureTable.loc[:, "age"] = userFeatureTable.loc[:, "age"] / 10.
        self.knnModel = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(userFeatureTable)
        # ratingsMat is the rating matrix
        self.ratingsMat = ratingsMat
        self.userFeatureTable = userFeatureTable
        self.userIds = self.userFeatureTable.index  # the actual order seen by the knn model

    def predict(self, userFeature):
        distances, indices = self.knnModel.kneighbors(userFeature)
        # indices are the nearest neighbors' positions in the matrix, which differ from userId
        return self.userIds[indices[0]]

    def provideRec(self, userId):
        # data is a tuple of (user feature, item feature)
        userIds = self.predict(self.userFeatureTable.loc[userId].to_numpy().reshape(1, -1))
        # remove the user himself from the nearest neighbours
        userIds = np.array(list(set(userIds) - set([userId])))
        # for all nearest neighbours, compute the average score, sorted from large to small,
        # then report the item ids
        return self.ratingsMat[userIds - 1].mean(axis=0).argsort()[::-1] + 1
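# A minimal usage sketch for KNNmodel (my addition). The tiny user-feature table and ratings
# matrix below are hypothetical; user ids are assumed to be 1-based, matching provideRec's
# use of ratingsMat[userIds - 1].
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
userFeatureTable = pd.DataFrame(
    {"age": rng.randint(18, 60, 12).astype(float),
     "f1": rng.rand(12),
     "f2": rng.rand(12)},
    index=range(1, 13))
ratingsMat = rng.randint(0, 6, size=(12, 4))

model = KNNmodel()
model.train(userFeatureTable, ratingsMat)
print(model.provideRec(userId=1))  # item ids ranked by the neighbours' average rating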
def random_forest_single_predict(test_filename, name, feature_file, train_file, k): name_list, data = readfile_real_name(test_filename) print 'reading file...' test_data = data[name_list.index(name)] with open(train_file, 'rb') as f: clf = cPickle.load(f) print 'done' result_rate = (clf.predict_proba(test_data))[0] class_name = clf.classes_ print name num = map(get_num, result_rate) name_list, feature_list = readfile_real_name_group(feature_file, class_name, num) neigh = NearestNeighbors() neigh.fit(feature_list) kneighbors_result_list = neigh.kneighbors(test_data, k, False)[0] print kneighbors_result_list for x in kneighbors_result_list: print name_list[x] classification_result = [] average_list = [] real_name = (name.split('_'))[0] counter = Counter(kneighbors_result_list) if real_name == name_list[counter.most_common(1)[0][0]].split('_')[0]: classification_result.append(1) else: classification_result.append(0) num = 0 for i in kneighbors_result_list: if (name_list[i].split('_'))[0] == real_name: num += 1 average_list.append((float)(num) / (float)(k)) print classification_result, average_list return classification_result, average_list
def sample(s):
    if s.data is None:
        raise ValueError('data not loaded.')
    mdl = NearestNeighbors(n_neighbors=s.k, n_jobs=-1)
    minoX = s.X[s.y == s.minolab]
    majX = s.X[s.y == s.majlab]
    mdl.fit(minoX)
    _, nei_table = mdl.kneighbors()
    generated = None
    for cnt, nei_idx in enumerate(nei_table):
        x = minoX[cnt]
        if s.rate >= 0.5 * s.k:
            nei = minoX[np.random.choice(nei_idx, int(s.rate))]
            new = x + np.random.rand(int(s.rate), 1) * (nei - x)
        else:
            nei = minoX[nei_idx]
            new = x + np.random.rand(s.k, 1) * (nei - x)
            # each of the synthesised k points has N/k * 100 % probability to be chosen
            new = new[np.random.rand(s.k) > s.rate * 1.0 / s.k]
        if generated is None:
            generated = new
        else:
            generated = np.vstack((generated, new))
    # number of generated instances
    N = len(generated)
    ret = np.hstack((np.vstack((minoX, generated, majX)),
                     np.array([s.minolab] * (minoX.shape[0] + N) +
                              [s.majlab] * majX.shape[0])[:, None]))
    np.random.shuffle(ret)
    return ret
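# A usage sketch for the sampler above (my addition). It assumes `s` is any object exposing
# the attributes the function reads (data, X, y, k, rate, minolab, majlab); SimpleNamespace
# is just an illustrative stand-in, not part of the original code.
import numpy as np
from types import SimpleNamespace
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X_toy = np.vstack([rng.randn(200, 3), rng.randn(20, 3) + 2.0])
y_toy = np.array([0] * 200 + [1] * 20)
s = SimpleNamespace(data=X_toy, X=X_toy, y=y_toy, k=5, rate=3, minolab=1, majlab=0)
resampled = sample(s)    # each row is [features..., label]
print(resampled.shape)   # (20 + 3*20 + 200, 4) for these settings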
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
class KNearestDatasets(object): def __init__(self, metric='l1', random_state=None, metric_params=None): self.logger = get_logger(__name__) self.metric = metric self.model = None self.metric_params = metric_params self.metafeatures = None self.runs = None self.best_configuration_per_dataset = None self.random_state = sklearn.utils.check_random_state(random_state) self.scaler = MinMaxScaler() if self.metric_params is None: self.metric_params = {} def fit(self, metafeatures, runs): """Fit the Nearest Neighbor model. Parameters ---------- metafeatures : pandas.DataFrame A pandas dataframe. Each row represents a dataset, each column a metafeature. runs : dict Dictionary containing a list of runs for each dataset. """ assert isinstance(metafeatures, pd.DataFrame) assert metafeatures.values.dtype in (np.float32, np.float64) assert np.isfinite(metafeatures.values).all() assert isinstance(runs, pd.DataFrame) assert runs.shape[1] == metafeatures.shape[0], \ (runs.shape[1], metafeatures.shape[0]) self.metafeatures = metafeatures self.runs = runs self.num_datasets = runs.shape[1] # Fit the metafeatures for scaler self.scaler.fit(self.metafeatures) # for each dataset, sort the runs according to their result best_configuration_per_dataset = {} for dataset_name in runs: if not np.isfinite(runs[dataset_name]).any(): best_configuration_per_dataset[dataset_name] = None else: configuration_idx = runs[dataset_name].index[np.nanargmin( runs[dataset_name].values)] best_configuration_per_dataset[ dataset_name] = configuration_idx self.best_configuration_per_dataset = best_configuration_per_dataset if callable(self.metric): self._metric = self.metric self._p = 0 elif self.metric.lower() == "l1": self._metric = "minkowski" self._p = 1 elif self.metric.lower() == "l2": self._metric = "minkowski" self._p = 2 else: raise ValueError(self.metric) self._nearest_neighbors = NearestNeighbors( n_neighbors=self.num_datasets, radius=None, algorithm="brute", leaf_size=30, metric=self._metric, p=self._p, metric_params=self.metric_params) def kNearestDatasets(self, x, k=1, return_distance=False): """Return the k most similar datasets with respect to self.metric Parameters ---------- x : pandas.Series A pandas Series object with the metafeatures for one dataset k : int Number of k nearest datasets which are returned. If k == -1, return all dataset sorted by similarity. return_distance : bool, optional. Defaults to False If true, distances to the new dataset will be returned. Returns ------- list Names of the most similar datasets, sorted by similarity list Sorted distances. Only returned if return_distances is set to True. """ assert type(x) == pd.Series if k < -1 or k == 0: raise ValueError( 'Number of neighbors k cannot be zero or negative.') elif k == -1: k = self.num_datasets X_train = self.scaler.transform(self.metafeatures) x = x.values.reshape((1, -1)) x = self.scaler.transform(x) self._nearest_neighbors.fit(X_train) distances, neighbor_indices = self._nearest_neighbors.kneighbors( x, n_neighbors=k, return_distance=True) assert k == neighbor_indices.shape[1] rval = [ self.metafeatures.index[i] # Neighbor indices is 2d, each row is the indices for one # dataset in x. 
for i in neighbor_indices[0] ] if return_distance is False: return rval else: return rval, distances[0] def kBestSuggestions(self, x, k=1, exclude_double_configurations=True): assert type(x) == pd.Series if k < -1 or k == 0: raise ValueError( 'Number of neighbors k cannot be zero or negative.') nearest_datasets, distances = self.kNearestDatasets( x, -1, return_distance=True) kbest = [] added_configurations = set() for dataset_name, distance in zip(nearest_datasets, distances): best_configuration = self.best_configuration_per_dataset[ dataset_name] if best_configuration is None: self.logger.warning("Found no best configuration for instance " "%s" % dataset_name) continue if exclude_double_configurations: if best_configuration not in added_configurations: added_configurations.add(best_configuration) kbest.append((dataset_name, distance, best_configuration)) else: kbest.append((dataset_name, distance, best_configuration)) if k != -1 and len(kbest) >= k: break if k == -1: k = len(kbest) return kbest[:k]
import numpy as np
from sklearn.neighbors import NearestNeighbors
from glob import glob
import os
from skimage import io
import matplotlib.pyplot as plt

"""Use kNN as a baseline algorithm for finding nearest neighbors.
Validate results by observing classification error, the concept being that a model
with less classification error will find nearest neighbors better as well."""

imgs = np.load(r'D:\pycharm_projects\AWSgeo\data.npy')
# imgs_labels = np.load(r'D:\pycharm_projects\AWSgeo\labels.npy')
logits = np.load(r'D:\pycharm_projects\AWSgeo\Tensorboard\model_2019-12-18-08-45-59\data_logits.npy')

# ############# sklearn ##############
rand_arrange = np.random.permutation(len(logits))
ind = -1
neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(logits[rand_arrange[:-1000]])
knns = neigh.kneighbors(logits[rand_arrange[ind]].reshape(1, -1), 6, return_distance=False)
plt.figure()
plt.imshow(imgs[rand_arrange[ind]])
io.imshow_collection(imgs[rand_arrange[knns[0]]])

# ########## openCV ##################
# Load the libraries.
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load the data.
iris = datasets.load_iris()
features = iris.data

# Create a StandardScaler instance.
standardizer = StandardScaler()

# Standardize the features.
features_standardized = standardizer.fit_transform(features)

# Two nearest neighbors.
nearest_neighbors = NearestNeighbors(n_neighbors=2).fit(features_standardized)

# Create an observation.
new_observation = [1, 1, 1, 1]

# Compute the distances and find the indices of the observation's nearest neighbors.
distances, indices = nearest_neighbors.kneighbors([new_observation])

# Show the nearest neighbors.
features_standardized[indices]
TruncatedSVD_X = svd.transform(X)
max_indices = np.argmax(TruncatedSVD_X, axis=1)
id2cluster = {}
for i in range(len(sethastypeSig)):
    no = sethastypeSig[i]
    id2cluster[no] = max_indices[i]
    # print(no)

print('start to get nearest neighbors')
X_latent_neigh = np.loadtxt(latentfileName)
nbrs = NearestNeighbors(n_neighbors=10, algorithm='kd_tree').fit(X_latent_neigh)
print('end to get nearest neighbors')

for key in set_miss_rel:
    # print('set_miss_rel', key)
    distances, indices = nbrs.kneighbors(X_latent_neigh[key:key + 1])
    neighbors = indices
    indice_num = np.shape(neighbors)[1]
    neighbours = {}
    neighbours_no = 0
    for i in range(indice_num):
        ner = neighbors[0, i]
        temp = id2cluster.get(ner)
        if temp is not None:
            id2cluster[key] = temp
            break
        # if neighbours.get(temp) is not None:
        #     neighbours[temp] = neighbours[temp] + 1
        # else:
        #     neighbours[temp] = 1
        #     neighbours_no = neighbours_no + 1
def getReviewSentencesNNs(dictRestIDToFoodItemFoodItemVecSentSentVec, posNegSeeds, withinDis=.25): restIDfoodItemFoodItemVecSentSentVecs = makeDictAFiveLists( dictRestIDToFoodItemFoodItemVecSentSentVec) print " length all reviews", dictCount( dictRestIDToFoodItemFoodItemVecSentSentVec), len( restIDfoodItemFoodItemVecSentSentVecs) X = map(itemgetter(4), restIDfoodItemFoodItemVecSentSentVecs) Y = map(itemgetter(4), posNegSeeds) #posNegSeed[4] NNs = [] #X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) #Y = np.array([[0,0]]) nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(X) try: allDistances, indices = nbrs.kneighbors(Y) except: print "allDistances", allDistances for i, distancesFromASeed in enumerate(allDistances): for j, dist in enumerate( distancesFromASeed ): #0 since Y is just one seed we are looking at at a time if dist <= withinDis: try: if (dist == 0.0) and (posNegSeeds[i][1] in map( itemgetter(0), dictRestIDToFoodItemFoodItemVecSentSentVec.get( restIDfoodItemFoodItemVecSentSentVecs[indices[ i, j]][0]) )) and (posNegSeeds[i][3] in map( itemgetter(2), dictRestIDToFoodItemFoodItemVecSentSentVec.get( restIDfoodItemFoodItemVecSentSentVecs[indices[ i, j]][0]))): #and (restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][1] == posNegSeeds[i][1]) and (restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][3] == posNegSeeds[i][3]): beforeCount = dictCount( dictRestIDToFoodItemFoodItemVecSentSentVec) removeSeedsFromDict( dictRestIDToFoodItemFoodItemVecSentSentVec, [posNegSeeds[i]] ) #perhaps this happens where it didn't find it earlier, i'm not sure still figuring it out if dictCount(dictRestIDToFoodItemFoodItemVecSentSentVec ) == beforeCount: #print("try to remove a seed during getReviewSentencesNNs: no sucess") removeSeedsFromDict( dictRestIDToFoodItemFoodItemVecSentSentVec, [posNegSeeds[i]]) else: dictRestIDToFoodItemFoodItemVecSentSentVec.get( restIDfoodItemFoodItemVecSentSentVecs[indices[ i, j]][0]) #print("try to remove a seed during getReviewSentencesNNs: sucess") else: NNs.append( restIDfoodItemFoodItemVecSentSentVecs[indices[i, j]]) except TypeError as e: pass #print restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][0] #dictRestIDToFoodItemFoodItemVecSentSentVec.get(restIDfoodItemFoodItemVecSentSentVecs[indices[i,j]][0]) else: break #the rest of the distances will only get further away return NNs
np.save(GLOVE_VEC_PATH, vecs.astype(np.float32))

# Compute the real nearest neighbors for a set of test words.
if not os.path.exists(GLOVE_KNN_PATH):
    with open(GLOVE_VOC_PATH) as fp:
        words = list(map(str.strip, fp))
        word2idx = {w: i for i, w in enumerate(words)}
    vecs = np.load(GLOVE_VEC_PATH)
    knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')
    knn.fit(vecs)
    test_ii = list(map(word2idx.get, GLOVE_TEST_WORDS))
    nbrs = knn.kneighbors(vecs[test_ii], return_distance=False)
    with open(GLOVE_KNN_PATH, "w") as fp:
        for word, nbrs_ in zip(GLOVE_TEST_WORDS, nbrs):
            fp.write("%s %s\n" % (word, " ".join([words[i] for i in nbrs_])))

# Fit LSH models and compute the hash from each model on each word vector.
if not os.path.exists(LSH_HASHES_PATH):
    with open(GLOVE_VOC_PATH) as fp:
        words = list(map(str.strip, fp))
        word2idx = {w: i for i, w in enumerate(words)}
    vecs = np.load(GLOVE_VEC_PATH)
    lsh_models = [LSHModel(seed=i, H=H).fit(vecs) for i in range(L)]
genre_of_movie = cur.fetchall()
genre_of_movie = [x[0] for x in genre_of_movie]
movies_list.append((
    i[0],
    i[1],
    len(set(genre_of_movie).intersection(genre_list)),
))

movies_list = sorted(movies_list, key=cmp2, reverse=True)
movies_list = movies_list[:50]

# create a training dataset and find the k nearest neighbors.
training_data = [[item[1]] for item in movies_list]
train = np.array(training_data)
nbrs = NearestNeighbors()
nbrs.fit(train)
indices = nbrs.kneighbors([[item[1]] for item in mov_id],
                          n_neighbors=7, return_distance=False)
# print indices

# print the results
for value in indices[0]:
    temp = cur.execute('''SELECT movie FROM Movies WHERE id = (?)''',
                       (movies_list[value][0], )).fetchone()[0]
    print(temp)
conn.close()
# http://scikit-learn.org/stable/modules/neighbors.html
from sklearn.neighbors import NearestNeighbors
import numpy as np

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(X)
distances, indices = nbrs.kneighbors(X)
indices
distances
nbrs.kneighbors_graph(X).toarray()

# The same neighbor query run directly against a KDTree:
from sklearn.neighbors import KDTree

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
kdt = KDTree(X, leaf_size=30, metric='euclidean')
kdt.query(X, k=2, return_distance=False)
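# Small extension of the documentation example above (my addition): the neighborhood graph
# can also store the actual distances instead of 0/1 connectivity flags.
nbrs.kneighbors_graph(X, mode='distance').toarray()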
def n_closest( self, word, num_closest=5, metric='cosine', return_similarity=True, mode_bidirectional='mean', mode_sequence='mean', ): """ find nearest words based on a word. Parameters ---------- word: str Eg, 'najib' num_closest: int, (default=5) number of words closest to the result. metric: str, (default='cosine') vector distance algorithm. return_similarity: bool, (default=True) if True, will return between 0-1 represents the distance. Returns ------- word_list: list of nearest words """ if not isinstance(word, str): raise ValueError('input must be a string') if not isinstance(num_closest, int): raise ValueError('num_closest must be an integer') if not isinstance(metric, str): raise ValueError('metric must be a string') if not isinstance(return_similarity, bool): raise ValueError('num_closest must be a boolean') if return_similarity: nn = NearestNeighbors(num_closest + 1, metric=metric).fit(self._embed_matrix) distances, idx = nn.kneighbors( self.get_vector_by_name( word, mode_bidirectional=mode_bidirectional, mode_sequence=mode_sequence, ).reshape((1, -1))) word_list = [] for i in range(1, idx.shape[1]): word_list.append( [self._reverse_dictionary[idx[0, i]], 1 - distances[0, i]]) return word_list else: wv = self.get_vector_by_name( word, mode_bidirectional=mode_bidirectional, mode_sequence=mode_sequence, ) closest_indices = self.closest_row_indices(wv, num_closest + 1, metric) word_list = [] for i in closest_indices: word_list.append(self._reverse_dictionary[i]) if word in word_list: word_list.remove(word) return word_list
class MLkNN(MLClassifierBase):
    """kNN classification method adapted for multi-label classification

    Parameters
    ----------
    k : integer
        number of neighbours of each input instance to take into account

    s : boolean
        the smoothing parameter

    ignore_first_neighbours : integer
        ability to ignore first N neighbours, useful for comparing
        with other classification software, if you don't know what
        it does, the default is safe, see
        https://github.com/scikit-multilearn/scikit-multilearn/issues/22
    """
    BRIEFNAME = "MLkNN"

    def __init__(self, k=10, s=1.0, ignore_first_neighbours=0):
        super(MLkNN, self).__init__()
        self.k = k  # Number of neighbours
        self.s = s  # Smooth parameter
        self.ignore_first_neighbours = ignore_first_neighbours
        self.copyable_attrs = ['k', 's', 'ignore_first_neighbours']

    def compute_prior(self, y):
        prior_prob_true = np.array((self.s + y.sum(axis=0)) / (self.s * 2 + self.num_instances))[0]
        prior_prob_false = 1 - prior_prob_true
        return prior_prob_true, prior_prob_false

    def compute_cond(self, X, y):
        self.knn = NearestNeighbors(n_neighbors=self.k).fit(X)
        c = sparse.lil_matrix((self.num_labels, self.k + 1), dtype='i8')
        cn = sparse.lil_matrix((self.num_labels, self.k + 1), dtype='i8')

        label_info = get_matrix_in_format(y, 'dok')

        neighbors = [a[self.ignore_first_neighbours:]
                     for a in self.knn.kneighbors(X, self.k + self.ignore_first_neighbours,
                                                  return_distance=False)]

        for instance in range(self.num_instances):
            deltas = label_info[neighbors[instance], :].sum(axis=0)
            for label in range(self.num_labels):
                if label_info[instance, label] == 1:
                    c[label, deltas[0, label]] += 1
                else:
                    cn[label, deltas[0, label]] += 1

        c_sum = c.sum(axis=1)
        cn_sum = cn.sum(axis=1)

        cond_prob_true = sparse.lil_matrix((self.num_labels, self.k + 1), dtype='float')
        cond_prob_false = sparse.lil_matrix((self.num_labels, self.k + 1), dtype='float')
        for label in range(self.num_labels):
            for neighbor in range(self.k + 1):
                cond_prob_true[label, neighbor] = (self.s + c[label, neighbor]) / (
                    self.s * (self.k + 1) + c_sum[label, 0])
                cond_prob_false[label, neighbor] = (self.s + cn[label, neighbor]) / (
                    self.s * (self.k + 1) + cn_sum[label, 0])
        return cond_prob_true, cond_prob_false

    def fit(self, X, y):
        """Fit classifier with training data

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :param y: binary indicator matrix with label assignments
        :type y: dense or sparse matrix of {0, 1} (n_samples, n_labels)
        :returns: Fitted instance of self
        """
        self.train_labels = get_matrix_in_format(y, 'lil')
        self.num_instances = self.train_labels.shape[0]
        self.num_labels = self.train_labels.shape[1]
        # Computing the prior probabilities
        self.prior_prob_true, self.prior_prob_false = self.compute_prior(self.train_labels)
        # Computing the posterior probabilities
        self.cond_prob_true, self.cond_prob_false = self.compute_cond(X, self.train_labels)
        return self

    def predict(self, X):
        """Predict labels for X

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_features)
        :returns: binary indicator matrix with label assignments
        :rtype: sparse matrix of int (n_samples, n_labels)
        """
        result = np.zeros((X.shape[0], self.num_labels))
        neighbors = [a[self.ignore_first_neighbours:]
                     for a in self.knn.kneighbors(X, self.k + self.ignore_first_neighbours,
                                                  return_distance=False)]
        for instance in range(X.shape[0]):
            deltas = self.train_labels[neighbors[instance], ].sum(axis=0)
            for label in range(self.num_labels):
                p_true = self.prior_prob_true[label] * self.cond_prob_true[label, deltas[0, label]]
                p_false = self.prior_prob_false[label] * self.cond_prob_false[label, deltas[0, label]]
                # The original return value was int(p_true > p_false); the source code was changed
                # to return this quantity instead, to match the requirements of the paper.
                result[instance, label] = p_true / (p_false + p_true)
        return result

    def predict_proba(self, X):
        """Predict probabilities of label assignments for X

        :param X: input features
        :type X: dense or sparse matrix (n_samples, n_labels)
        :returns: matrix with label assignment probabilities
        :rtype: sparse matrix of float (n_samples, n_labels)
        """
        result = sparse.lil_matrix((X.shape[0], self.num_labels), dtype='float')
        neighbors = [a[self.ignore_first_neighbours:]
                     for a in self.knn.kneighbors(X, self.k + self.ignore_first_neighbours,
                                                  return_distance=False)]
        for instance in range(X.shape[0]):
            deltas = self.train_labels[neighbors[instance], ].sum(axis=0)
            for label in range(self.num_labels):
                p_true = self.prior_prob_true[label] * self.cond_prob_true[label, deltas[0, label]]
                p_false = self.prior_prob_false[label] * self.cond_prob_false[label, deltas[0, label]]
                result[instance, label] = p_true
        return result
test_y_all = [] nr_events_all = [] offline_time_fit = 0 current_online_event_times = [] for _, dt_test_bucket in dt_test_prefixes.groupby( dataset_manager.case_id_col): # select current test case test_y_all.extend( dataset_manager.get_label_numeric(dt_test_bucket)) nr_events_all.append(len(dt_test_bucket)) start = time.time() encoded_case = bucket_encoder.fit_transform(dt_test_bucket) _, knn_idxs = bucketer.kneighbors(encoded_case) knn_idxs = knn_idxs[0] relevant_cases_bucket = encoded_train.iloc[knn_idxs].index dt_train_bucket = dataset_manager.get_relevant_data_by_indexes( dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label_numeric(dt_train_bucket) if len(set(train_y)) < 2: preds_all.append(train_y[0]) else: feature_combiner = FeatureUnion([ (method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods ])
class KNearestNeighborsAssignement(object): """k-NN based label assignment with a maximum-distance rejection threshold.""" def __init__(self, feature_name, max_distance, data_directory="", n_neighbors=1, algorithm="ball_tree", weights="distance"): """Load any previously saved samples and prepare the NearestNeighbors model.""" self.feature_dim = None self.feature_name = feature_name self.trained = False self.n_neighbors = n_neighbors self.weights = weights self.nb_samples = 0 self.algorithm = algorithm self.max_distance = max_distance if data_directory == "": self.data_directory = "/tmp" else: self.data_directory = data_directory self.model = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.algorithm) try: data = np.load(self.data_directory + "/" + self.feature_name + "_knn_classif.npz") self.X = list(data["x"]) self.Y = list(data["y"]) self.nb_samples = len(self.X) self.feature_dim = len(self.X[0]) if self.n_neighbors is None: self.n_neighbors = int(math.sqrt(len(self.X[0]))) self.train() except Exception: self.X = [] self.Y = [] def train(self): """Fit the NearestNeighbors model on the stored samples.""" self.model.fit(np.array(self.X)) def update(self, feature, label): """Add a new (feature, label) sample to the training pool.""" if self.feature_dim is None: self.feature_dim = len(feature) self.X.append(feature) self.Y.append(label) self.nb_samples += 1 def predict(self, feature): """Return (matched, label, distance) for the nearest stored sample.""" distances, matchs = self.model.kneighbors([feature]) distance = distances[0][0] if distance > self.max_distance: return False, "unknown", 0.0 indice = matchs[0][0] label = self.Y[indice] return True, label, distance def __del__(self): """ """ pass # TODO: save data def save(self, file): file = open(file, 'wb') pickle.dump(self.model, file) file.close() def load(self, file): file = open(file, 'rb') self.model = pickle.load(file) file.close()
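# Hypothetical usage of the helper above; the feature vectors, labels and the
# "demo_feature" name are made-up stand-ins, and "/tmp" is just the default
# data directory chosen when none is given.
import numpy as np

assigner = KNearestNeighborsAssignement("demo_feature", max_distance=0.5)
for vec, label in [([0.10, 0.20, 0.30], "alice"), ([0.90, 0.80, 0.70], "bob")]:
    assigner.update(np.array(vec), label)
assigner.train()
matched, label, dist = assigner.predict(np.array([0.12, 0.19, 0.31]))
print(matched, label, dist)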
ax2.legend(targets) ax2.grid() plt.show() from sklearn.neighbors import NearestNeighbors # test_data_size = tst.shape[0] test_data = portfolio_points # test_data = tst X = cropped_components # X = data[:-test_data_size] # test_data = data[-test_data_size:] nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(X) # distances, indices = nbrs.kneighbors(X) test_distances, test_indices = nbrs.kneighbors(test_data) dic = {name: 0 for name in names} print("Calculating success rate for test using Nearest Neighbors") for name in finalDfPerIm.loc[test_indices.flatten()]['target']: dic[name] += 1 m = test_indices.size for k, v in sorted(dic.items(), key=operator.itemgetter(0), reverse=True): print(k, "\t : \t", int((v / m) * 100), "%") pass from sklearn.neighbors import NearestCentroid y = np.array(list(filter(lambda name: "Portfolio" in name, targs))) clf = NearestCentroid() clf.fit(X, y)
kmeans = KMeans(n_clusters=800) preprocessed_image = [] files = [x for x in os.listdir() if "jpg" in x] print(files) images = [cv2.imread(img) for img in files] descriptor_list = np.array([]) for image in images: image = gray(image) keypoint, descriptor = features(image, extractor) if len(descriptor_list) == 0: descriptor_list = np.array(descriptor) else: descriptor_list = np.vstack((descriptor_list, descriptor)) kmeans.fit(descriptor_list) for image in images: image = gray(image) keypoint, descriptor = features(image, extractor) if (descriptor is not None): histogram = build_histogram(descriptor, kmeans) preprocessed_image.append(histogram) data = cv2.imread("book1.jpg") data = gray(data) keypoint, descriptor = features(data, extractor) histogram = build_histogram(descriptor, kmeans) neighbor = NearestNeighbors(n_neighbors=5) neighbor.fit(preprocessed_image) dist, result = neighbor.kneighbors([histogram]) print([files[i] for i in result[0]])
instances_per_class[i,0] = np.size(elements) for ind, cnn_layer in enumerate(layer_names): # iterate through layers and count precision and recall for test data print(cnn_layer) features = Model(inputs=new_model.input, outputs=new_model.get_layer(cnn_layer).output) train_pred = features.predict(train_data) test_pred = features.predict(test_data) # find the k nearest neighbors of an image nn_model = NearestNeighbors(n_neighbors=n_nearest_imgs, metric='cosine') nn_model.fit(train_pred) for j in range(0, test_data.shape[0]): # iterate through test images an_img = test_pred[j,:].reshape(1, -1) distances, indices = nn_model.kneighbors(an_img) s = 0 for w in np.nditer(indices): # iterate through the most similar images if train_labels[w,0] == test_labels[j,0]: score[ind,0] += 1 s += 1 if train_labels[w,1] == test_labels[j,1]: score[ind,1] += 1 score[ind,2] += s/instances_per_class[int(test_labels[j,0]),0] # recall per image score[ind,2] /= test_data.shape[0] # recall of breed score[ind,0] /= max_images # precision of breed
imgs_train_reconstruct = model.decoder.predict(E_train) if modelName == "simpleAE": imgs_train_reconstruct = imgs_train_reconstruct.reshape( (-1, ) + shape_img_resize) plot_reconstructions(imgs_train, imgs_train_reconstruct, os.path.join(outDir, "{}_reconstruct.png".format(modelName)), range_imgs=[0, 255], range_imgs_reconstruct=[0, 1]) # Fit kNN model on training images print("Fitting k-nearest-neighbour model on training images...") knn = NearestNeighbors(n_neighbors=5, metric="cosine") knn.fit(E_train_flatten) # Perform image retrieval on test images print("Performing image retrieval on test images...") for i, emb_flatten in enumerate(E_test_flatten): _, indices = knn.kneighbors([emb_flatten ]) # find k nearest train neighbours img_query = imgs_test[i] # query image imgs_retrieval = [imgs_train[idx] for idx in indices.flatten()] # retrieval images outFile = os.path.join(outDir, "{}_retrieval_{}.png".format(modelName, i)) plot_query_retrieval(img_query, imgs_retrieval, outFile) # Plot t-SNE visualization print("Visualizing t-SNE on training images...") outFile = os.path.join(outDir, "{}_tsne.png".format(modelName)) plot_tsne(E_train_flatten, imgs_train, outFile)
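# Stand-alone sketch of the retrieval pattern used above: fit a cosine-metric
# kNN index on flattened training embeddings, then look up the neighbours of
# each test embedding. The random arrays are stand-ins for real embeddings.
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
E_train_demo = rng.rand(100, 64)   # flattened training embeddings
E_test_demo = rng.rand(3, 64)      # flattened test embeddings

knn_demo = NearestNeighbors(n_neighbors=5, metric="cosine")
knn_demo.fit(E_train_demo)
_, retrieval_idx = knn_demo.kneighbors(E_test_demo)
print(retrieval_idx)   # indices of the 5 closest training images per query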
def over_sampling(self): if self.k + 1 > self.n_train_less: print( 'Expected n_neighbors <= n_samples, but n_samples = {}, n_neighbors = {}, ' 'has changed the n_neighbors to {}'.format( self.n_train_less, self.k + 1, self.n_train_less)) self.k = self.n_train_less - 1 data_less_filter = [] num_maj_filter = [] length_less = len(self.train_less) num_maj = number_maj(self.train[:, 1:], self.train_less[:, 1:], self.tp_less, self.train[:, 0]) for m in range(len(num_maj)): if num_maj[m] < self.k: data_less_filter.append(self.train_less[m]) num_maj_filter.append(num_maj[m]) self.train_less = np.array(data_less_filter) distance_more, nn_array_more = NearestNeighbors( n_neighbors=self.k + 1).fit(self.train_more[:, 1:]).kneighbors( self.train_less[:, 1:], return_distance=True) distance_less, nn_array = NearestNeighbors(n_neighbors=self.k + 1).fit( self.train_less[:, 1:]).kneighbors(self.train_less[:, 1:], return_distance=True) distance_less = distance_less.sum(axis=1) distance_more = distance_more.sum(axis=1) distance = distance_less / distance_more # print(distance) density = 1 / distance # calculate density density = list( map(lambda x: min(100, x), density)) # Control the maximum density range at 100 # The density is sorted below, and the minority samples are also sorted in order of density. density_sorted = sorted(range(len(density)), key=lambda a: density[a], reverse=True) # sorted data_resorted = [] density_sorted_data = [] num_sorted = [] for i in range(len(self.train_less)): data_resorted.append(self.train_less[density_sorted[i]]) density_sorted_data.append(density[density_sorted[i]]) num_sorted.append(num_maj_filter[density_sorted[i]]) density = np.array(density_sorted_data) cluster_big_density = [] cluster_small_density = [] cluster_big_data = [] cluster_small_data = [] cluster_big_num = [] cluster_small_num = [] cluster = k_means(X=density.reshape((len(density), 1)), n_clusters=2) for i in range(cluster[1].shape[0]): if cluster[1][i] != cluster[1][i + 1]: # Partition cluster cluster_big_density = density[:i + 1] cluster_big_data = np.array(data_resorted)[:i + 1, :] cluster_big_num = num_sorted[:i + 1] cluster_small_density = density[i + 1:] cluster_small_data = np.array(data_resorted)[i + 1:, :] cluster_small_num = num_sorted[i + 1:] break # If there is only one point in a cluster, do not divide the cluster if len(cluster_big_data) < 2 or len(cluster_small_data) < 2: cluster_big_data = np.array(data_resorted) cluster_big_density = density cluster_big_num = num_sorted flag = 1 # if flag==1 only run big cluster once else: flag = 2 sum_0 = 0 sum_1 = 0 # Calculate weight for p in range(len(cluster_big_num)): sum_0 += (5 - cluster_big_num[p]) / self.k + 1 for p in range(len(cluster_small_num)): sum_0 += (5 - cluster_small_num[p]) / self.k + 1 ratio = [] # save the every cluster's totol weight ratio.append(sum_0) ratio.append(sum_1) wight = [5 / 6, 4 / 6, 3 / 6, 2 / 6, 1 / 6] kk = self.k diff = len(self.train_more ) - length_less # the number of samples need to synthesize totol_less = len(self.train_less) for i in range(flag): if i == 0: # big cluster density = cluster_big_density self.n_train_less = len(cluster_big_data) self.train_less = cluster_big_data maj_num_ab = cluster_big_num else: # small cluster density = cluster_small_density self.n_train_less = len(cluster_small_data) self.train_less = cluster_small_data maj_num_ab = cluster_small_num self.k = min( len(self.train_less) - 1, kk) # if len(self.train_less)<k,set k =len(self.train_less) # The number of sample points that need to be 
inserted at each point if flag == 1: number_synthetic = int( len(self.train_more) / self.IR - len(self.train_less)) else: if i == 0: number_synthetic = int( (len(self.train_less) / totol_less) * diff) len_big = number_synthetic else: number_synthetic = diff - len_big # Calculate how many points should be inserted for each sample N = list( map(lambda x: int((x / ratio[i]) * number_synthetic), wight)) self.reminder = number_synthetic - sum(N) self.num = 0 neighbors = NearestNeighbors(n_neighbors=self.k + 1).fit( self.train_less[:, 1:]) nn_array = neighbors.kneighbors(self.train_less[:, 1:], return_distance=False) self.synthetic = np.zeros((number_synthetic, self.n_attrs - 1)) for p in range(self.train_less.shape[0]): self._populate(p, nn_array[p][1:], number_synthetic, N, maj_num_ab) label_synthetic = np.array([self.tp_less] * number_synthetic).reshape( (number_synthetic, 1)) np.random.seed(self.random_state) synthetic_dl = self.synthetic synthetic_dl = np.hstack( (label_synthetic, synthetic_dl)) # class column data_res = synthetic_dl if i == 0: return_data = np.vstack((copy.deepcopy(self.train), data_res)) if flag == 1: return return_data self.new_index = 0 else: return_data = np.vstack((copy.deepcopy(return_data), data_res)) return return_data
def filterFunc(mrdModel, testData): qDim = mrdModel.X.mean.values.shape[1] scales1 = mrdModel.Y0.kern.input_sensitivity(summarize=False) scales2 = mrdModel.Y1.kern.input_sensitivity(summarize=False) scales1 = scales1 / scales1.max() scales2 = scales2 / scales2.max() # get the number of dimensions yThresh = 0.05 indices = np.asarray(range(qDim)) active1 = indices[scales1 >= yThresh] active2 = indices[scales2 >= yThresh] sharedDims = np.intersect1d(active2, active2) nShared = len(sharedDims) # get init latent state from optimization hybridFPS = 10000.0 deltaT = 1.0 / hybridFPS # state transition matrix def f_cv(x, dt): nShared = len(x) / 2 F = np.eye(2 * nShared) F[:nShared, nShared:] = dt * np.eye(nShared) return np.dot(F, x) def h_cv(x): nShared = len(x) / 2 return x[:nShared] # create kalman filter sigmas = filterpy.kalman.MerweScaledSigmaPoints(n=2 * nShared, alpha=0.1, beta=2.0, kappa=1.0) kf = UKF(dim_x=2 * nShared, dim_z=nShared, fx=f_cv, hx=h_cv, dt=deltaT, points=sigmas) # init state yIn = testData['Cloud'][0, :] [xPredict, infX] = mrdModel.Y0.infer_newX(yIn[None, :], optimize=True) xPredict = xPredict.mean kf.x = np.zeros((2 * nShared)) kf.x[:nShared] = xPredict[0, sharedDims] # init covariance kf.P *= 1e-4 # process and measurement noise kf.Q *= 1e-5 kf.R *= 1e-3 # model variables kKey = 'Cloud' mKey = 'TopCoord' qDim = mrdModel.X.mean.shape[1] nDimIn = testData[kKey].shape[1] nDimOut = testData[mKey].shape[1] nSamples = testData[mKey].shape[0] latentVals = np.zeros((nSamples, qDim)) predictVals = np.zeros((nSamples, nDimOut)) # obtain the training data latent positions latentPositions = mrdModel.X.mean nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(mrdModel.Y0.Y) startTime = time.time() for n in range(nSamples): yIn = testData[kKey][n, :] yTrueOut = testData[mKey][n, :] kf.predict() if n % hybridFPS == 0: [xPredict, infX] = mrdModel.Y0.infer_newX(yIn[None, :], optimize=True) xPredict = xPredict.mean kf.update(xPredict[0, sharedDims], R=1e-6 * np.eye(nShared)) else: _, indices = nn.kneighbors(np.atleast_2d(yIn)) xPredict = latentPositions[indices[0], :].mean(axis=0) kf.update(xPredict[sharedDims]) # how to apply hybrid here?? # kalman filter latentVal = np.atleast_2d(xPredict) latentVal[0, sharedDims] = kf.x[:nShared] yOut = mrdModel.predict(latentVal, Yindex=1) latentVals[n, :] = latentVal predictVals[n, :] = yOut[0] sys.stdout.write('.') sys.stdout.flush() stopTime = time.time() print '\nFinished Strategy Hybrid' nrmse = np.divide( np.sqrt( metrics.mean_squared_error(testData[mKey], predictVals, multioutput='raw_values')), testData[mKey].max(axis=0) - testData[mKey].min(axis=0)) rmse = np.sqrt( metrics.mean_squared_error(testData[mKey], predictVals, multioutput='raw_values')) corr = np.zeros((1, nDimOut)) for d in range(nDimOut): corr[0, d], _ = stats.pearsonr(testData[mKey][:, d], predictVals[:, d]) results = {} results['corr'] = corr results['rmse'] = rmse results['nrmse'] = nrmse results['pred'] = predictVals results['latent'] = latentVals results['time'] = nSamples / (stopTime - startTime) return results
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
neighbors = NearestNeighbors(n_neighbors=2).fit(X)
distances, indices = neighbors.kneighbors(X, return_distance=True)
print(distances)
print(indices)
print(neighbors.kneighbors_graph(X).toarray())
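# Complementary sketch: the same fitted estimator queried by radius instead of
# by neighbour count, via the radius_neighbors API; the 1.5 radius is arbitrary.
radius_distances, radius_indices = neighbors.radius_neighbors(X, radius=1.5)
for i, (d, j) in enumerate(zip(radius_distances, radius_indices)):
    print(i, list(j), np.round(d, 3))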
class Database(object): def __init__(self, database_vectors, targets, metric='cosine'): self.nn = NearestNeighbors(n_neighbors=database_vectors.shape[0], algorithm='brute', metric=metric) self.nn.fit(database_vectors) self.targets = np.cast[np.int](targets) bins = np.bincount(self.targets) idx = np.nonzero(bins)[0] self.instances_per_target = dict(zip(idx, bins[idx])) self.number_of_instances = float(len(targets)) self.recall_levels = np.arange(0, 1.01, 0.1) self.fine_recall_levels = np.arange(0, 1.01, 0.05) def get_binary_relevances(self, queries, targets): """ Executes the queries and returns the binary relevance vectors (one vector for each query) :param queries: the queries :param targets: the label of each query :return: """ distances, indices = self.nn.kneighbors(queries) relevant_vectors = np.zeros_like(indices) for i in range(targets.shape[0]): relevant_vectors[i, :] = self.targets[indices[i, :]] == targets[i] return relevant_vectors def get_metrics(self, relevant_vectors, targets): """ Evaluates the retrieval performance :param relevant_vectors: the relevant vectors for each query :param targets: labels of the queries :return: """ # Calculate precisions per query precision = np.cumsum(relevant_vectors, axis=1) / np.arange( 1, self.number_of_instances + 1) # Calculate interpolated precision for i in reversed(range(len(precision) - 1)): precision[:, i] = np.maximum(precision[:, i], precision[:, i + 1]) # Calculate recall per query instances_per_query = np.zeros((targets.shape[0], 1)) for i in range(targets.shape[0]): instances_per_query[i] = self.instances_per_target[targets[i]] recall = np.cumsum(relevant_vectors, axis=1) / instances_per_query # Calculate precision @ 11 recall point precision_at_recall_levels = np.zeros( (targets.shape[0], self.recall_levels.shape[0])) for i in range(len(self.recall_levels)): idx = np.argmin(np.abs(recall - self.recall_levels[i]), axis=1) precision_at_recall_levels[:, i] = precision[ np.arange(targets.shape[0]), idx] # Calculate fine-grained precision precision_at_fine_recall_levels = np.zeros( (targets.shape[0], self.fine_recall_levels.shape[0])) for i in range(len(self.fine_recall_levels)): idx = np.argmin(np.abs(recall - self.fine_recall_levels[i]), axis=1) precision_at_fine_recall_levels[:, i] = precision[ np.arange(targets.shape[0]), idx] # Calculate the means values of the metrics ap = np.mean(precision_at_recall_levels, axis=1) m_ap = np.mean(ap) interpolated_precision = np.mean(precision, axis=0) interpolated_fine_precision = np.mean(precision_at_fine_recall_levels, axis=0) return m_ap, interpolated_precision, interpolated_fine_precision, self.fine_recall_levels, def evaluate(self, queries, targets, batch_size=128): """ Evaluates the performance of the database using the following metrics: interpolated map, interpolated precision, and precision-recall curve :param queries: the queries :param targets: the labels :return: the evaluated metrics """ n_batches = len(targets) // batch_size m_ap, fine_precision, raw_precision = None, None, None for i in tqdm(range(n_batches)): cur_queries = queries[i * batch_size:(i + 1) * batch_size] cur_targets = targets[i * batch_size:(i + 1) * batch_size] relevant_vectors = self.get_binary_relevances( cur_queries, cur_targets) (c_m_ap, c_raw_precision, c_fine_precision, self.fine_recall_levels,) = \ self.get_metrics(relevant_vectors, cur_targets) if m_ap is None: m_ap = c_m_ap * batch_size fine_precision = c_fine_precision * batch_size raw_precision = c_raw_precision * batch_size else: m_ap += c_m_ap * 
batch_size fine_precision += c_fine_precision * batch_size raw_precision += c_raw_precision * batch_size if batch_size * n_batches < len(targets): cur_queries = queries[batch_size * n_batches:] cur_targets = targets[batch_size * n_batches:] relevant_vectors = self.get_binary_relevances( cur_queries, cur_targets) (c_m_ap, c_raw_precision, c_fine_precision, self.fine_recall_levels,) = \ self.get_metrics(relevant_vectors, cur_targets) m_ap += c_m_ap * len(cur_targets) fine_precision += c_fine_precision * len(cur_targets) raw_precision += c_raw_precision * len(cur_targets) m_ap = m_ap / float(len(targets)) fine_precision = fine_precision / float(len(targets)) raw_precision = raw_precision / float(len(targets)) results = { 'map': m_ap, 'precision': fine_precision, 'recall_levels': self.fine_recall_levels, 'raw_precision': raw_precision } return results
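# Illustrative driver for the Database class above: random embeddings and
# integer labels stand in for real data. It assumes tqdm is available (the
# evaluate() batch loop uses it) and a NumPy version in which the class's
# np.cast[np.int] call still works.
import numpy as np

rng = np.random.RandomState(0)
db_vectors = rng.randn(200, 32)
db_targets = rng.randint(0, 5, size=200)
db = Database(db_vectors, db_targets, metric='cosine')

query_vectors = rng.randn(64, 32)
query_targets = rng.randint(0, 5, size=64)
results = db.evaluate(query_vectors, query_targets, batch_size=16)
print(results['map'], results['recall_levels'])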
def explain(self, out_num): x = self.x y = self.y inliners = x[y == 1] outliers = x[y == -1] # Resample outlier to form cluster. n_dimens = len(outliers[0]) n_samples = 5 resampled = np.random.normal(outliers[out_num].tolist(), [0.01 for _ in range(n_dimens)], (n_samples, n_dimens)) np.append(resampled, outliers[out_num, :]) # Find context of outlier. n_neigh = 30 nbrs = NearestNeighbors(n_neighbors=n_neigh, algorithm='kd_tree', metric='euclidean').fit(inliners) distances, neighbors = nbrs.kneighbors(outliers) # Clustering outlier context. out_neigh = neighbors[out_num] context = inliners[out_neigh, :] db = DBSCAN(eps=0.1, min_samples=3).fit(context) labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) if n_clusters_ == 0: return "Could not find enough clusters in outlier context." # Mask for outlier neighbors' clusters. core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True n_features = len(x[0]) s_il = np.empty([n_clusters_, n_features]) cluster_sizes = np.empty([n_clusters_]) unique_labels = set(labels) for l in unique_labels: if l != -1: class_member_mask = (labels == l) cluster = context[class_member_mask & core_samples_mask] cluster_sizes[l] = (len(cluster)) # Compute coefficients. X = np.concatenate((cluster, resampled), axis=0) y = [1 if i < len(cluster) else 0 for i in range(len(X))] clf = LinearSVC(penalty="l1", dual=False, random_state=0, tol=1e-5) clf.fit(X, y) # Find nearest neighbors in cluster. # should nearest neighbor be form the cluster? nbrs = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', metric='euclidean').fit(cluster) cluster_nbrs = nbrs.kneighbors( cluster, return_distance=False)[:, 1] # [0] -- the same point for m in range(n_features): dist = [ abs(context[n][m] - cluster[i][m]) for i, n in enumerate(cluster_nbrs) ] gamma = sum(dist) / len(cluster) s_il[l][m] = abs(clf.coef_[0][m]) / gamma importance = np.empty([n_features]) for m in range(n_features): s_sum = 0 for l in unique_labels: if l != -1: s_sum += cluster_sizes[l] * s_il[l][m] importance[m] = s_sum / n_neigh return importance
# In[19]: ir.describe() #showing the fitted data # In[20]: # creating a test data import numpy as np test = np.array([5.4, 2, 2, 2.3]) test1 = test.reshape(1, -1) test1.shape # In[21]: nn.kneighbors(test1, 5) # In[22]: ir.iloc[[98, 93, 57, 60, 79]] # displaying specific rows using iloc() # ### KNeighborsClassifier Algorithm # In[23]: import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap from sklearn import neighbors, datasets n_neighbors = 15
class ActiveLearningWithCostEmbedding(QueryStrategy): """Active Learning with Cost Embedding (ALCE) Cost-sensitive multi-class algorithm. Assumes each class has at least one sample in the labeled pool. Parameters ---------- cost_matrix : array-like, shape=(n_classes, n_classes) The entry in the ith row and jth column represents the cost of the ground truth being the ith class while the prediction is the jth class. mds_params : dict, optional http://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html nn_params : dict, optional http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html embed_dim : int, optional (default: None) If None, embed_dim = n_classes. base_regressor : sklearn regressor random_state : {int, np.random.RandomState instance, None}, optional (default=None) If int or None, random_state is passed as parameter to generate np.random.RandomState instance. If np.random.RandomState instance, random_state is the random number generator. Attributes ---------- nn_ : sklearn.neighbors.NearestNeighbors object instance Examples -------- Here is an example of declaring an ActiveLearningWithCostEmbedding query_strategy object: .. code-block:: python import numpy as np from sklearn.svm import SVR from libact.query_strategies.multiclass import ActiveLearningWithCostEmbedding as ALCE cost_matrix = 2000. * np.random.rand(n_classes, n_classes) qs3 = ALCE(dataset, cost_matrix, SVR()) References ---------- .. [1] Kuan-Hao Huang and Hsuan-Tien Lin. "A Novel Uncertainty Sampling Algorithm for Cost-sensitive Multiclass Active Learning", In Proceedings of the IEEE International Conference on Data Mining (ICDM), 2016 """ def __init__(self, dataset, cost_matrix, base_regressor, embed_dim=None, mds_params={}, nn_params={}, random_state=None): super(ActiveLearningWithCostEmbedding, self).__init__(dataset) self.cost_matrix = cost_matrix self.base_regressor = base_regressor self.n_classes = len(cost_matrix) if embed_dim is None: self.embed_dim = self.n_classes else: self.embed_dim = embed_dim self.regressors = [ copy.deepcopy(self.base_regressor) for _ in range(self.embed_dim) ] self.random_state_ = seed_random_state(random_state) self.mds_params = { 'metric': False, 'n_components': self.embed_dim, 'n_uq': self.n_classes, 'max_iter': 300, 'eps': 1e-6, 'dissimilarity': "precomputed", 'n_init': 8, 'n_jobs': 1, 'random_state': self.random_state_ } self.mds_params.update(mds_params) self.nn_params = {} self.nn_params.update(nn_params) self.nn_ = NearestNeighbors(n_neighbors=1, **self.nn_params) dissimilarity = np.zeros((2 * self.n_classes, 2 * self.n_classes)) dissimilarity[:self.n_classes, self.n_classes:] = self.cost_matrix dissimilarity[self.n_classes:, :self.n_classes] = self.cost_matrix.T mds_ = MDSP(**self.mds_params) embedding = mds_.fit(dissimilarity).embedding_ self.class_embed = embedding[:self.n_classes, :] self.nn_.fit(embedding[self.n_classes:, :]) @inherit_docstring_from(QueryStrategy) def make_query(self): dataset = self.dataset unlabeled_entry_ids, pool_X = dataset.get_unlabeled_entries() # The input classes should be 0 .. n_classes-1 X, y = dataset.get_labeled_entries() pred_embed = np.zeros((len(pool_X), self.embed_dim)) for i in range(self.embed_dim): self.regressors[i].fit(X, self.class_embed[y, i]) pred_embed[:, i] = self.regressors[i].predict(pool_X) dist, _ = self.nn_.kneighbors(pred_embed) dist = dist[:, 0] ask_idx = self.random_state_.choice( np.where(np.isclose(dist, np.max(dist)))[0]) return unlabeled_entry_ids[ask_idx]
def k_distance(dataset, k):
    nbrs = NearestNeighbors(n_neighbors=k).fit(dataset)
    distances, indices = nbrs.kneighbors(dataset)
    return distances[:, k - 1].mean()
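# Small sketch of how a mean k-distance like the one above is often used to
# eyeball a DBSCAN eps value; make_blobs only provides synthetic data here.
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors

demo_data, _ = make_blobs(n_samples=300, centers=3, random_state=0)
for k in (3, 5, 10):
    print(k, k_distance(demo_data, k))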
%matplotlib cells2 = pd.read_csv('CRC_clusters_neighborhoods_markers.csv') tissue_col = 'spots' neigh_col = 'neighborhood10' patient_col = 'patients' group_col = 'groups' X = 'X:X' Y = 'Y:Y' # calculate neighbors for each spot for spot in cells2[tissue_col].unique(): tissue = cells2[cells2[tissue_col] == spot] fit = NearestNeighbors(n_neighbors=1).fit(tissue[[X, Y]].values) m = fit.kneighbors()[1] cells2.loc[tissue.index, 'neigh_neigh'] = tissue.iloc[m[:, 0], :][neigh_col].values cells2['neigh_neigh'] = cells2['neigh_neigh'].astype(int) #compute for each patient, in each tissue and neighborhood, the number of cells in that neighborhoood counts = cells2.groupby([group_col,patient_col,tissue_col,neigh_col]).apply(lambda x: len(x)).unstack() #compute for each patient, in each tissue and neighborhood: the count of how many of the cells in that neighborhood are next to a cell in the other neighborhood neighs = cells2.groupby([group_col,patient_col,tissue_col,neigh_col]).apply(lambda x:x['neigh_neigh'].value_counts(sort = False)).unstack() #specify which neighborhoods you want to calculate neigh1,neigh2 = 0,4 # Comment out if you wish to average each spot for each patient
def res(jobfile): Resume_Vector = [] Ordered_list_Resume = [] Ordered_list_Resume_Score = [] LIST_OF_FILES = [] LIST_OF_FILES_PDF = [] LIST_OF_FILES_DOC = [] LIST_OF_FILES_DOCX = [] Resumes = [] Temp_pdf = [] os.chdir('./Original_Resumes') for file in glob.glob('**/*.pdf', recursive=True): LIST_OF_FILES_PDF.append(file) for file in glob.glob('**/*.doc', recursive=True): LIST_OF_FILES_DOC.append(file) for file in glob.glob('**/*.docx', recursive=True): LIST_OF_FILES_DOCX.append(file) LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF # LIST_OF_FILES.remove("antiword.exe") print("This is LIST OF FILES") print(LIST_OF_FILES) # print("Total Files to Parse\t" , len(LIST_OF_PDF_FILES)) print("####### PARSING ########") for nooo, i in enumerate(LIST_OF_FILES): Ordered_list_Resume.append(i) Temp = i.split(".") if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF": try: print("This is PDF", nooo) with open(i, 'rb') as pdf_file: read_pdf = PyPDF2.PdfFileReader(pdf_file) # page = read_pdf.getPage(0) # page_content = page.extractText() # Resumes.append(Temp_pdf) number_of_pages = read_pdf.getNumPages() for page_number in range(number_of_pages): page = read_pdf.getPage(page_number) page_content = page.extractText() page_content = page_content.replace('\n', ' ') # page_content.replace("\r", "") Temp_pdf = str(Temp_pdf) + str(page_content) # Temp_pdf.append(page_content) # print(Temp_pdf) Resumes.extend([Temp_pdf]) Temp_pdf = '' # f = open(str(i)+str("+") , 'w') # f.write(page_content) # f.close() except Exception as e: print(e) if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC": print("This is DOC", i) try: a = textract.process(i) a = a.replace(b'\n', b' ') a = a.replace(b'\r', b' ') b = str(a) c = [b] Resumes.extend(c) except Exception as e: print(e) if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX": print("This is DOCX", i) try: a = textract.process(i) a = a.replace(b'\n', b' ') a = a.replace(b'\r', b' ') b = str(a) c = [b] Resumes.extend(c) except Exception as e: print(e) if Temp[1] == "ex" or Temp[1] == "Exe" or Temp[1] == "EXE": print("This is EXE", i) pass print("Done Parsing.") Job_Desc = 0 LIST_OF_TXT_FILES = [] os.chdir('../Job_Description') f = open(jobfile, 'r') text = f.read() try: tttt = str(text) tttt = summarize(tttt, word_count=100) text = [tttt] except: text = 'None' f.close() vectorizer = TfidfVectorizer(stop_words='english') # print(text) vectorizer.fit(text) vector = vectorizer.transform(text) Job_Desc = vector.toarray() # print("\n\n") # print("This is job desc : " , Job_Desc) os.chdir('../') for i in Resumes: text = i tttt = str(text) try: tttt = summarize(tttt, word_count=100) text = [tttt] vector = vectorizer.transform(text) aaa = vector.toarray() Resume_Vector.append(vector.toarray()) except: pass # print(Resume_Vector) for i in Resume_Vector: samples = i neigh = NearestNeighbors(n_neighbors=1) neigh.fit(samples) NearestNeighbors(algorithm='auto', leaf_size=30) Ordered_list_Resume_Score.extend( neigh.kneighbors(Job_Desc)[0][0].tolist()) Z = [ x for _, x in sorted(zip(Ordered_list_Resume_Score, Ordered_list_Resume)) ] print(Ordered_list_Resume) print(Ordered_list_Resume_Score) flask_return = [] # for n,i in enumerate(Z): # print("Rankkkkk\t" , n+1, ":\t" , i) for n, i in enumerate(Z): # print("Rank\t" , n+1, ":\t" , i) # flask_return.append(str("Rank\t" , n+1, ":\t" , i)) name = getfilepath(i) #name = name.split('.')[0] rank = n + 1 res = ResultElement(rank, name) flask_return.append(res) # res.printresult() 
print(f"Rank{res.rank+1} :\t {res.filename}") return flask_return
x = SimilarityVectors.cosine_vecs(utt) top_n = 5 random_idc = set() while (len(random_idc) != top_n): random_idc.add(random.randint(0, len(x) - 1)) print('------------------------------------') print(random_idc) print('------------------------------------\n') test = [x[y] for y in random_idc] for alg in ['ball_tree', 'kd_tree']: print('Used Search Algorithm :', alg) nneighbors = NearestNeighbors(n_neighbors=top_n, \ algorithm=alg).fit(x) dist, idc = nneighbors.kneighbors([x[y] for y in random_idc]) print() for i, idx in enumerate(random_idc): print('Looking for neighbors of : "', utt[idx], '"') for k, j in enumerate(idc[i]): print('Index :', j, '\nDistance :', dist[i][k], '\n' + utt[j]) print() print('Indices :', idc, '\nDistances :', dist) print('------------------------------------\n')
v = np.zeros([const.num_particles, 3]).astype(np.float32) plt.ion() fig = plt.figure() ax = fig.add_subplot(111, projection='3d') for iteration in range(100): timer[iteration][0] = time.time() plt.xlim(-10, 10) plt.ylim(-10, 10) ax.set_zlim(-10, 10) ax.scatter3D(r1.T[0], r1.T[1], r1.T[2]) plt.draw() plt.pause(0.000001) ax.cla() timer[iteration][1] = time.time() nn.fit(r1) neighbors = nn.kneighbors(r1, return_distance=False).flatten().astype(np.int32) timer[iteration][2] = time.time() r_dash = r1.flatten().astype(np.float32) v_n = v.flatten().astype(np.float32) density = np.zeros([const.num_particles, const.num_neighbhours]).flatten().astype(np.float32) force = np.zeros([const.num_particles, const.num_neighbhours, 3]).astype(np.float32) color_field_lap_val = np.zeros( [const.num_particles, const.num_neighbhours]).astype(np.float32) color_field_grad_val = np.zeros( [const.num_particles, const.num_neighbhours, 3]).astype(np.float32) timer[iteration][3] = time.time() calc_density(drv.Out(density),
def clusterHead(left_eyes, right_eyes, fullHeads=False): #We use NN to cluster head objects: eyes and nose, assuming there is at least one pair of eyes if not left_eyes or not right_eyes: heads = {} if fullHeads: for headsita in list(range(len(left_eyes))): newHead = head(left_eye=headsita) heads[headsita] = newHead for headsita in list(range(len(right_eyes))): newHead = head(right_eye=headsita) heads[headsita] = newHead elif len(left_eyes) > 1: neigh = NearestNeighbors(n_neighbors=2) neigh.fit(left_eyes) distances, from_right_to_left = neigh.kneighbors(right_eyes) index_taken = {} #[inr, distances[inr][0]] queue = list(range(len(right_eyes))) heads = {} j = -1 # we examine the terms and correct previous choices while queue: index_right_eye = queue[0] queue = queue[1:] # we grab the closest left eye to the inr index_left_eye = from_right_to_left[index_right_eye][0] if (index_left_eye) == [] and fullHeads: # if the point is asolated newHead = head(right_eye=index_right_eye) heads[j] = newHead j = j - 1 elif index_left_eye not in index_taken: #new index newHead = head(left_eye=index_left_eye, right_eye=index_right_eye, distance=distances[index_right_eye][0]) heads[index_left_eye] = newHead index_taken[index_left_eye] = [ index_right_eye, distances[index_right_eye][0] ] else: # we need to compare distances newdist = distances[index_right_eye][0] olddist = index_taken[index_left_eye][1] if olddist < newdist: # wrong left eye index_left_eye = from_right_to_left[index_right_eye][1] newdist = distances[index_right_eye][1] olddist = index_taken.get(index_left_eye, [[], None])[1] if index_left_eye not in index_taken: newHead = head(left_eye=index_left_eye, right_eye=index_right_eye, distance=distances[index_right_eye][1]) heads[index_left_eye] = newHead index_taken[index_left_eye] = [ index_right_eye, distances[index_right_eye][1] ] elif olddist < newdist and fullHeads: # olddist<newdist newHead = head(right_eye=index_right_eye) heads[j] = newHead j = j - 1 else: queue = queue + [index_taken[index_left_eye][0]] newHead = head(left_eye=index_left_eye, right_eye=index_right_eye, distance=newdist) heads[index_left_eye] = newHead index_taken[index_left_eye] = [ index_right_eye, distances[index_right_eye][1] ] else: # correct left eye already taken queue = queue + [index_taken[index_left_eye][0]] newHead = head(left_eye=index_left_eye, right_eye=index_right_eye, distance=newdist) heads[index_left_eye] = newHead index_taken[index_left_eye] = [index_right_eye, newdist] if fullHeads: missingheads = set(list(range( len(right_eyes)))).difference(index_taken) else: missingheads = [] for headsita in missingheads: newHead = head(left_eye=headsita) heads[headsita] = newHead else: neigh = NearestNeighbors(n_neighbors=1) neigh.fit(right_eyes) distances, from_right_to_left = neigh.kneighbors(left_eyes) newHead = head(left_eye=0, right_eye=from_right_to_left[0][0]) heads = {0: newHead} return heads
def findNextAstar(image, giTarg, profile, thresh): open = [] closed = [] query = getFeatures(image) start = Node(0, 0, giTarg, query, -1, 0) open.append(start) while (open != []): q = findLowestF(open) open.remove(q) nbors = NearestNeighbors(n_neighbors=5) nbors.fit(knndata[:, 0:12]) knn = nbors.kneighbors([q.index[0:12]]) knn = knn[1][0] dirs = [[0, 0], [0, 0], [0, 0], [0, 0]] for n in knn: ug = findGIList(n) for d in range(0, 4): dirs[d][0] += 1 dirs[d][1] += ug[d] successorGain = [d[1] / (d[0] - .00001) for d in dirs] print(successorGain) #exit() for i in range(4): index = random.randint(0, len(knn) - 1) sI = findFeatures(knn[random.randint(0, len(knn) - 1)]) gi = float(q.index[10]) sG = q.g + (gi / successorGain[i]) sH = ((q.g - gi) / gi) * (sum(profile) / len(profile)) giAcc = q.gi + gi sF = sG + sH sP = q.index print(sG) #print(giAcc) if (giAcc > giTarg): return [sG, giAcc] #lowest E for direction, don't return else: skip = False for c in closed: if ((c.index[0:12] == q.index[0:12]).all() and c.f <= q.f): skip = True if not skip: s = Node(sF, sG, sH, sI, sP, giAcc) open.append(s) #print([len(open),len(closed)]) closed.append(q) highest = 0 ret = None for i in closed: if (i.gi > highest): highest = i.gi ret = [i.g, i.gi] return ret