def _compute_tolerance_distance(self, sample, symbol):
    """Compute the distance tolerance.

    Computes the distance tolerance in feature-vector space below which we
    consider the symbol similar, then saves it to the proper file.

    Args:
        sample (list of lists of int): list of feature vectors the tolerance is based on.
        symbol (String): name of the symbol to compute the tolerance for.
    """
    nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')\
        .fit(sample)
    distances, _ = nbrs.kneighbors(sample)
    print(distances)
    means = []
    for distances_row in distances:
        row = np.delete(distances_row, [0])
        means.append(np.mean(row))
    means.sort()
    critical_index = math.ceil(0.8 * len(means)) - 1
    tolerance_distance = means[critical_index] * 1.3
    print("tolerance distance: %.16f" % tolerance_distance)
    tolerance_distance_path = \
        Classifier._get_file_path(
            self.files[DISTANCE_TOLERANCE_FILE], symbol)
    with open(tolerance_distance_path, 'w') as handle:
        handle.write("%.16f\n" % tolerance_distance)
    return tolerance_distance
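# A minimal, self-contained sketch of the same tolerance heuristic on toy data:
# mean distance to the two nearest neighbours per point, 80th percentile, times 1.3.
# The sample array below is illustrative only.
import math
import numpy as np
from sklearn.neighbors import NearestNeighbors

toy_sample = np.array([[0, 0], [1, 1], [2, 2], [10, 10], [11, 11]])
toy_nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(toy_sample)
toy_distances, _ = toy_nbrs.kneighbors(toy_sample)  # first column is the point itself (distance 0)
toy_means = sorted(np.mean(row[1:]) for row in toy_distances)
toy_critical_index = math.ceil(0.8 * len(toy_means)) - 1
print("tolerance distance: %.16f" % (toy_means[toy_critical_index] * 1.3))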
def embedding_refinement(data_matrix_highdim,
                         data_matrix_lowdim,
                         n_neighbors=8,
                         emb_quality_th=1,
                         n_iter=20):
    # extract neighbors list for high dimensional case
    neigh_high = NearestNeighbors(n_neighbors=n_neighbors)
    neigh_high.fit(data_matrix_highdim)
    neighbors_list_highdim = neigh_high.kneighbors(data_matrix_highdim,
                                                   return_distance=False)
    n_instances = data_matrix_lowdim.shape[0]
    logger.debug('refinements max num iters: %d k in neqs: %d num insts: %d' %
                 (n_iter, n_neighbors, n_instances))
    for it in range(n_iter):
        average_embedding_quality_score, scores = knn_quality_score(
            data_matrix_lowdim, neighbors_list_highdim, n_neighbors)
        # select low quality embedded instances
        ids = [i for i, s in enumerate(scores)
               if relative_quality(i, scores, neighbors_list_highdim) <= emb_quality_th]
        # find average position of true knns and move point there
        new_data_matrix_lowdim = compute_average(ids, data_matrix_lowdim,
                                                 neighbors_list_highdim)
        new_average_embedding_quality_score, new_scores = knn_quality_score(
            new_data_matrix_lowdim, neighbors_list_highdim, n_neighbors)
        if new_average_embedding_quality_score > average_embedding_quality_score:
            data_matrix_lowdim = new_data_matrix_lowdim
            n_refinements = len(ids)
            frac_refinements = float(n_refinements) / n_instances
            logger.debug('r %.2d neqs: %.3f \t %.2f (%d insts)' %
                         (it + 1, new_average_embedding_quality_score,
                          frac_refinements, n_refinements))
        else:
            break
    return data_matrix_lowdim
def resample(self):
    # Start with the minority class
    minx = self.x[self.y == self.minc]
    miny = self.y[self.y == self.minc]

    # Finding nns
    from sklearn.neighbors import NearestNeighbors

    print("Finding the %i nearest neighbours..." % self.k, end="")
    NN = NearestNeighbors(n_neighbors=self.k + 1)
    NN.fit(minx)
    nns = NN.kneighbors(minx, return_distance=False)[:, 1:]
    print("done!")

    # Creating synthetic samples
    print("Creating synthetic samples...", end="")
    sx, sy = make_samples(minx, minx, self.minc, nns,
                          int(self.ratio * len(miny)),
                          random_state=self.rs)
    print("done!")

    # Concatenate the newly generated samples to the original data set
    ret_x = concatenate((self.x, sx), axis=0)
    ret_y = concatenate((self.y, sy), axis=0)

    return ret_x, ret_y
class ContentBased(object):
    """
    Recommendation model for articles based on each article's most relevant
    tags. The model vectorises every article so that the similarity between
    any pair of articles can be computed.
    """

    def __init__(self, stop_words=None, token_pattern=None, metric='cosine', n_neighbors=5):
        if stop_words is None:
            stop_words = stopwords.words("english")
        if token_pattern is None:
            token_pattern = '(?u)\\b[a-zA-Z]\\w\\w+\\b'

        self.tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                                token_pattern=token_pattern)
        self.nearest_neigbors = NearestNeighbors(metric=metric,
                                                 n_neighbors=n_neighbors,
                                                 algorithm='brute')

    def fit(self, datos, columna_descripcion):
        """
        Train the model:
        1/ Vectorise every article (feature extraction and weighting)
        2/ Compute the closest articles
        """
        self.datos = datos
        datos_por_tags = self.tfidf_vectorizer.fit_transform(datos[columna_descripcion])
        self.nearest_neigbors.fit(datos_por_tags)

    def predict(self, descripcion):
        """
        Return the articles most similar to the proposed description.
        """
        descripcion_tags = self.tfidf_vectorizer.transform(descripcion)

        if descripcion_tags.sum() == 0:
            return pd.DataFrame(columns=self.datos.columns)
        else:
            _, indices = self.nearest_neigbors.kneighbors(descripcion_tags)
            return self.datos.iloc[indices[0], :]
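# Hypothetical usage of the ContentBased recommender above; the DataFrame, the
# 'tags' column name and the query string are illustrative only.
import pandas as pd

articles = pd.DataFrame({
    'title': ['A', 'B', 'C', 'D', 'E'],
    'tags': ['python machine learning', 'deep learning vision',
             'cooking recipes pasta', 'machine learning models',
             'travel guide europe'],
})
recommender = ContentBased(stop_words='english', n_neighbors=3)
recommender.fit(articles, 'tags')
print(recommender.predict(['machine learning with python']))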
def find_k_neighbors(points, neighbor_number=5):
    from sklearn.neighbors import NearestNeighbors
    import numpy as np

    X = np.array(points)
    neighbors = NearestNeighbors(n_neighbors=neighbor_number + 1,
                                 algorithm='ball_tree').fit(X)
    distances, indices = neighbors.kneighbors(X)
    # skip the first neighbour of each point, which is the point itself
    return [[str(point), [str(x) for x in indices[point][1:]]]
            for point in range(len(points))]
def sample(s):
    if s.data is None:
        raise ValueError('data not loaded.')

    mdl = NearestNeighbors(n_neighbors=s.k, n_jobs=-1)
    minoX = s.X[s.y == s.minolab]
    majX = s.X[s.y == s.majlab]
    mdl.fit(minoX)
    _, nei_table = mdl.kneighbors()

    generated = None
    for cnt, nei_idx in enumerate(nei_table):
        x = minoX[cnt]
        if s.rate >= 0.5 * s.k:
            nei = minoX[np.random.choice(nei_idx, int(s.rate))]
            new = x + np.random.rand(int(s.rate), 1) * (nei - x)
        else:
            nei = minoX[nei_idx]
            new = x + np.random.rand(s.k, 1) * (nei - x)
            # each of the k synthesized points has N/k * 100 % probability to be chosen
            new = new[np.random.rand(s.k) > s.rate * 1.0 / s.k]
        if generated is None:
            generated = new
        else:
            generated = np.vstack((generated, new))

    # number of generated instances
    N = len(generated)
    ret = np.hstack((np.vstack((minoX, generated, majX)),
                     np.array([s.minolab] * (minoX.shape[0] + N) +
                              [s.majlab] * majX.shape[0])[:, None]))
    np.random.shuffle(ret)
    return ret
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a sample from the fitted density
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))
def findKNN(frequencyVector, newVector):
    samples = np.array(frequencyVector)
    neigh = NearestNeighbors(n_neighbors=5, metric="euclidean")
    neigh.fit(samples)
    indexList = neigh.kneighbors(newVector, return_distance=False).tolist()
    return indexList
def random_forest_single_predict(test_filename, name, feature_file, train_file, k):
    name_list, data = readfile_real_name(test_filename)
    print('reading file...')
    test_data = data[name_list.index(name)]
    with open(train_file, 'rb') as f:
        clf = cPickle.load(f)
    print('done')
    result_rate = clf.predict_proba([test_data])[0]
    class_name = clf.classes_
    print(name)
    num = list(map(get_num, result_rate))
    name_list, feature_list = readfile_real_name_group(feature_file, class_name, num)

    neigh = NearestNeighbors()
    neigh.fit(feature_list)
    kneighbors_result_list = neigh.kneighbors([test_data], k, False)[0]
    print(kneighbors_result_list)
    for x in kneighbors_result_list:
        print(name_list[x])

    classification_result = []
    average_list = []
    real_name = name.split('_')[0]
    counter = Counter(kneighbors_result_list)
    if real_name == name_list[counter.most_common(1)[0][0]].split('_')[0]:
        classification_result.append(1)
    else:
        classification_result.append(0)

    num = 0
    for i in kneighbors_result_list:
        if name_list[i].split('_')[0] == real_name:
            num += 1
    average_list.append(float(num) / float(k))
    print(classification_result, average_list)
    return classification_result, average_list
def resample(self):
    """
    :return: Return the data with majority samples that form a Tomek link
        removed.
    """
    from sklearn.neighbors import NearestNeighbors

    # Find the nearest neighbour of every point
    nn = NearestNeighbors(n_neighbors=2)
    nn.fit(self.x)
    nns = nn.kneighbors(self.x, return_distance=False)[:, 1]

    # Send the information to is_tomek function to get boolean vector back
    if self.verbose:
        print("Looking for majority Tomek links...")
    links = self.is_tomek(self.y, nns, self.minc, self.verbose)

    if self.verbose:
        print("Under-sampling "
              "performed: " + str(Counter(self.y[logical_not(links)])))

    # Return data set without majority Tomek links.
    return self.x[logical_not(links)], self.y[logical_not(links)]
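# The is_tomek helper is not shown above; this is a minimal sketch of the test it
# performs: a majority/minority pair form a Tomek link when the two points are
# each other's nearest neighbour.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def tomek_link_mask(x, y, minority_label):
    """Return a boolean mask marking majority samples that form a Tomek link."""
    nns = NearestNeighbors(n_neighbors=2).fit(x).kneighbors(x, return_distance=False)[:, 1]
    links = np.zeros(len(y), dtype=bool)
    for i, j in enumerate(nns):
        if y[i] != minority_label and y[j] == minority_label and nns[j] == i:
            links[i] = True
    return links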
def SMOTE(minority_samples, N, k):
    """
    The SMOTE algorithm, please refer to:
    [JAIR'02] SMOTE - Synthetic Minority Over-sampling Technique

    minority_samples    The minority sample array
    N                   Amount of SMOTE N%
    k                   Number of nearest neighbors

    @return (N/100) * len(minority_samples) synthetic minority class samples
    """
    T = len(minority_samples)  # number of minority samples
    if N < 100:
        # only N% of the minority samples will be SMOTEd
        T = int(N * 1.0 / 100 * T)
        N = 100
    N = int(N * 1.0 / 100)

    neigh = NearestNeighbors(n_neighbors=k, radius=1.0, algorithm='auto',
                             leaf_size=30, p=2)
    neigh = neigh.fit(minority_samples)

    synthetic_samples = []
    for i in range(T):
        target_sample = minority_samples[i]
        tmp = neigh.kneighbors([target_sample], k, return_distance=False)
        nnarray = tmp[0]
        populate(minority_samples, N, k, i, nnarray, synthetic_samples)

    return np.array(synthetic_samples, float)
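# The populate() helper is not shown above; this is a self-contained sketch of the
# interpolation it performs: every synthetic sample lies on the segment between a
# minority sample and one of its k nearest minority neighbours.
import numpy as np
from sklearn.neighbors import NearestNeighbors

def smote_sketch(minority, n_new, k=5, seed=0):
    rng = np.random.RandomState(seed)
    neighbours = NearestNeighbors(n_neighbors=k + 1).fit(minority) \
        .kneighbors(minority, return_distance=False)[:, 1:]  # drop the point itself
    synthetic = []
    for _ in range(n_new):
        i = rng.randint(len(minority))     # pick a minority sample
        j = neighbours[i, rng.randint(k)]  # pick one of its k neighbours
        synthetic.append(minority[i] + rng.rand() * (minority[j] - minority[i]))
    return np.array(synthetic)

print(smote_sketch(np.random.RandomState(1).rand(20, 2), n_new=10).shape)  # (10, 2)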
def _wpca_analysis(L, C, intensities):
    """
    Determine the eccentricity of each cluster using weighted PCA
    (see Jolliffe 2002, 14.2.1).

    The smallest normalized explained variance is small for flat or filiform
    objects.

    - L is a numpy matrix (one point on each row)
    - intensities are the gray levels of each point

    No cluster assignment is used here: a ball of radius 10 around each
    center is used to find the cloud of points.
    """
    np.set_printoptions(threshold=50000)
    n_points, n_features = L.shape
    tee.log('WPCA - Fitting NearestNeighbors on', n_points, 'points')
    nbrs = NearestNeighbors(radius=10.0).fit(L)
    for i, c in enumerate(C):
        array_c = np.array([c.x, c.y, c.z])
        i_nbrs = nbrs.radius_neighbors([array_c], 10.0, return_distance=False)[0]
        points_within = L[i_nbrs]
        if len(points_within) < 64:
            # too small a set, there is no point in running PCA
            c.EVR = [0.499, 0.499, 0.002]
            c.last_variance = c.EVR[2]
        else:
            w = np.sqrt(intensities[i_nbrs] / 255.0)
            wX = np.dot(np.diag(w), points_within)
            pca = sklearn.decomposition.PCA(n_components=3)
            X_r = pca.fit(wX).transform(wX)
            c.EVR = pca.explained_variance_ratio_
            c.last_variance = c.EVR[2]
        print('WPCA done on', i, '/', len(C), 'name=', c.name, 'EVR=', c.EVR)
def k_nearest_neighbors_scores(k, eng_vec_dict, fr_vec_dict):
    eng_mat, fr_mat, index_map = build_parallel_mats_from_dicts(eng_vec_dict,
                                                                fr_vec_dict,
                                                                translation_dict)
    # k + 1 since we discard the top neighbor, which is the point itself
    neighbors_en = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(eng_mat)
    dist_en, indices_en = neighbors_en.kneighbors(eng_mat)
    neighbors_fr = NearestNeighbors(n_neighbors=k + 1, algorithm='ball_tree').fit(fr_mat)
    dist_fr, indices_fr = neighbors_fr.kneighbors(fr_mat)

    # Since we built the matrices in parallel, indices map to each other, so we
    # simply check the overlap of the index sets to calculate precision and recall.

    # calculate avg recall for k-recall
    avg_recall = 0.
    num_points = len(indices_en) + 0.
    knearest_map_en = dict()
    knearest_map_fr = dict()
    for i in range(0, int(num_points)):
        w_en = index_map[i][0]
        w_fr = index_map[i][1]
        index_set_en = set(indices_en[i][1:])  # should be size k
        index_set_fr = set(indices_fr[i][1:])  # should be size k
        if w_en not in knearest_map_en:
            knearest_map_en[w_en] = [index_map[z] for z in index_set_en]
        if w_fr not in knearest_map_fr:
            knearest_map_fr[w_fr] = [index_map[z] for z in index_set_fr]
        recall_count = sum(1 for j in index_set_fr if j in index_set_en)
        # precision = recall for this task
        recall = (recall_count + 0.) / len(index_set_en)
        avg_recall += recall
    return (avg_recall / num_points), knearest_map_en, knearest_map_fr
def knn_find(train, test, k=2):
    """Find the first k nearest neighbors of the test samples among the train samples.

    [Args]
    ----
    train: train data {array like, m x n, m samples, n features}
        list of samples, each sample is a list of features, e.g.
        [[age = 18, weight = 120, height = 167],
         [age = 45, weight = 180, height = 173],
         ..., ]

    test: test data {array like, m x n, m samples, n features}
        data format is the same as the train data

    k: number of neighbors
        how many neighbors you want to find

    [Returns]
    -------
    distances: list of distances of the knn-neighbors from the test data
        [[dist(test1, train_knn1), dist(test1, train_knn2), ...],
         [dist(test2, train_knn1), dist(test2, train_knn2), ...],
         ..., ]
    indices: list of indices of the knn-neighbors from the test data
        [[test1_train_knn1_index, test1_train_knn2_index, ...],
         [test2_train_knn1_index, test2_train_knn2_index, ...],
         ..., ]
    """
    from sklearn.neighbors import NearestNeighbors

    nbrs = NearestNeighbors(n_neighbors=k, algorithm="kd_tree").fit(train)
    return nbrs.kneighbors(test)
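# Hypothetical usage of knn_find above: the two closest training rows for each
# test row (toy age/weight/height values, purely illustrative).
import numpy as np

toy_train = np.array([[18, 120, 167], [45, 180, 173], [30, 150, 170]])
toy_test = np.array([[20, 125, 168], [40, 175, 172]])
toy_distances, toy_indices = knn_find(toy_train, toy_test, k=2)
print(toy_indices)    # indices of the two closest training rows per test row
print(toy_distances)  # the corresponding Euclidean distances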
def createSyntheticSamples(X, Y, nearestneigh, numNeighbors, majoritylabel, minoritylabel):
    (Xminority, Xmajority) = partitionSamples(X, Y)
    numFeatures = Xminority.shape[1]
    Xreduced = pca(Xminority)
    numOrigMinority = len(Xminority)
    #reducedMinoritykmeans = KMeans(init='k-means++', max_iter=500, verbose=False, tol=1e-4, k=numCentroids, n_init=5, n_neighbors=3).fit(Xreduced)
    reducedNN = NearestNeighbors(n_neighbors=nearestneigh, algorithm='auto')
    reducedNN.fit(Xreduced)
    #Xsyn = np.array([numOrigMinority, numNeighbors * numFeatures])
    trylist = []
    # loop here for EACH (minority) point...
    for i, row in enumerate(Xreduced):
        neighbor_index = reducedNN.kneighbors([row], return_distance=False)
        closestPoints = Xminority[neighbor_index]
        # randomly choose one of the k nearest neighbors
        chosenNeighborsIndex = chooseNeighbor(neighbor_index, numNeighbors, i)
        chosenNeighbor = Xminority[chosenNeighborsIndex]
        # Calculate the linear combination:
        # Take the difference between the original minority sample (Xminority[i, ]) and its selected neighbor
        diff = Xminority[i, ] - chosenNeighbor
        # Multiply this difference by a number between 0 and 1
        r = random.uniform(0, 1)
        # Add it back to the original minority vector and voila, this is the synthetic sample
        syth_sample = Xminority[i, :] + r * diff
        syth_sample2 = syth_sample.tolist()
        trylist.append(syth_sample2)
    Xsyn = np.asarray(trylist).reshape(numNeighbors * numOrigMinority, numFeatures)
    maj_col = majoritylabel * np.ones([Xmajority.shape[0], 1])
    min_col = minoritylabel * np.ones([Xsyn.shape[0], 1])
    syth_Y = np.concatenate((maj_col, min_col), axis=0)
    syth_X = np.concatenate((Xmajority, Xsyn), axis=0)
    if syth_X.shape[0] != syth_Y.shape[0]:
        raise Exception("dim mismatch between features matrix and response matrix")
    return (syth_X, syth_Y)
def construct_A(X, k, binary=False):
    nbrs = NearestNeighbors(n_neighbors=1 + k).fit(X)
    if binary:
        return nbrs.kneighbors_graph(X)
    else:
        return nbrs.kneighbors_graph(X, mode='distance')
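# Hypothetical usage of construct_A above: a sparse k-NN graph over random points;
# with binary=True the entries are connectivity flags, otherwise distances.
import numpy as np

pts = np.random.RandomState(0).rand(10, 2)
A = construct_A(pts, k=3, binary=True)
print(A.shape, A.nnz)  # (10, 10) with 10 * (3 + 1) stored entries (self-edges included)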
def shift(ep):
    ids, x1, y1, mk, mj, x2, y2 = np.genfromtxt(ep, unpack=True)
    local_mask = np.in1d(ids, bid)
    lid, lx1, ly1, lx2, ly2 = np.transpose([ids, x1, y1, x2, y2])[local_mask].T
    loc_xy = np.transpose([lx2, ly2])

    nbrs = NN(n_neighbors=vecinos, algorithm='auto').fit(loc_xy)
    coo_xy = np.transpose([x2, y2])
    dist, idx = nbrs.kneighbors(coo_xy)
    idx = idx[:, 1:]
    dist = dist[:, 1:]

    ctx = np.zeros(x1.size)
    cty = np.zeros(y1.size)

    for i in range(x1.size):
        star_locales = loc_xy[idx[i]]
        ep1_x = lx1[idx[i]]
        ep1_y = ly1[idx[i]]

        poptx, pcovx = curve_fit(linear, star_locales.T, ep1_x)
        popty, pcovy = curve_fit(linear, star_locales.T, ep1_y)

        ctx[i] += linear([x2[i], y2[i]], *poptx)
        cty[i] += linear([x2[i], y2[i]], *popty)

    shift_x = x1 - ctx
    shift_y = y1 - cty

    hdr = 'ID X Y MAG_K MAG_J PMX PMY'
    fmt = '%d %.3f %.3f %.3f %.3f %f %f'
    data = np.transpose([ids, x1, y1, mk, mj, shift_x, shift_y])
    np.savetxt('./%s/%s' % (pm_folder, ep.split('/')[-1].replace('.mfma', '.pm')),
               data, header=hdr, fmt=fmt)
def test_connectivity_propagation():
    """
    Check that connectivity in the ward tree is propagated correctly during
    merging.
    """
    from sklearn.neighbors import NearestNeighbors

    X = np.array([(0.014, 0.120), (0.014, 0.099), (0.014, 0.097),
                  (0.017, 0.153), (0.017, 0.153), (0.018, 0.153),
                  (0.018, 0.153), (0.018, 0.153), (0.018, 0.153),
                  (0.018, 0.153), (0.018, 0.153), (0.018, 0.153),
                  (0.018, 0.152), (0.018, 0.149), (0.018, 0.144)])

    nn = NearestNeighbors(n_neighbors=10).fit(X)
    connectivity = nn.kneighbors_graph(X)
    ward = Ward(n_clusters=4, connectivity=connectivity)
    # If changes are not propagated correctly, fit crashes with an
    # IndexError
    ward.fit(X)
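# The Ward estimator used above comes from an old scikit-learn release; this is a
# self-contained sketch of the equivalent call in current versions, using
# AgglomerativeClustering with ward linkage and a k-NN connectivity matrix.
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph

pts = np.random.RandomState(0).rand(15, 2)
conn = kneighbors_graph(pts, n_neighbors=10, include_self=True)
labels = AgglomerativeClustering(n_clusters=4, connectivity=conn,
                                 linkage='ward').fit_predict(pts)
print(labels)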
class KDTrees:

    def __init__(self, nb_neighbours, leaf_size):
        self.nbrs = NearestNeighbors(n_neighbors=nb_neighbours,
                                     algorithm='ball_tree',
                                     metric='haversine',
                                     leaf_size=leaf_size)

    # Compute distance in time between two points on the map
    def mapDistance(self, x, y):
        if len(x) > 2:
            return np.sum((x - y) ** 2)
        else:
            if x[0] < y[0]:
                x, y = y, x
            pos1 = str(x[0]) + ", " + str(x[1])
            pos2 = str(y[0]) + ", " + str(y[1])
            timestamp = datetime.now()
            sec_to_add = (32 * 3600
                          + (timestamp - datetime(1970, 1, 1)).total_seconds()
                          - 2 * 3600
                          - timestamp.hour * 3600
                          - timestamp.minute * 60
                          - timestamp.second)
            traject = gmaps.directions(pos1, pos2, mode="transit",
                                       departure_time=timestamp.fromtimestamp(sec_to_add))
            try:
                print('ok')
                return (traject[0]["legs"][0]["arrival_time"]["value"]
                        - traject[0]["legs"][0]["departure_time"]["value"])
            except Exception:
                print('bug')
                return 1000000000

    def addPoints(self, points):
        self.nbrs.fit(points)

    def getNeighbours(self, points):
        return self.nbrs.kneighbors(points)
def nearestN():
    X = [[125, 1], [200, 0], [70, 0], [240, 1], [114, 0],
         [120, 0], [264, 1], [85, 0], [150, 0], [90, 0]]
    # y = [0, 0, 0, 0, 1, 0, 0, 1, 0, 1]
    model = NN(n_neighbors=1, radius=1)
    model.fit(X)
    y = [[98., 0.]]  # kneighbors expects a 2-D array of query points
    print(model.kneighbors(y))
def _set_widths_nearest_neighbor(self):
    # The nearest neighbors contain the center itself, so ask for one more.
    nbrs = NearestNeighbors(n_neighbors=self.n_neighbors + 1,
                            algorithm='ball_tree').fit(self.centers)
    for i in range(len(self.centers)):
        distances, indices = nbrs.kneighbors([self.centers[i]])
        width = sum(distances[0]) / (len(distances[0]) - 1)
        self.kernels[i].set_param(self.p / width)
def resample(self): """ """ # Start with the minority class underx = self.x[self.y == self.minc] undery = self.y[self.y == self.minc] # Import the k-NN classifier from sklearn.neighbors import NearestNeighbors # Create a k-NN to fit the whole data nn_obj = NearestNeighbors(n_neighbors=self.size_ngh) # Fit the whole dataset nn_obj.fit(self.x) idx_to_exclude = [] # Loop over the other classes under picking at random for key in self.ucd.keys(): # Get the sample of the current class sub_samples_x = self.x[self.y == key] # Get the samples associated idx_sub_sample = np.nonzero(self.y == key)[0] # Find the NN for the current class nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False) # Get the label of the corresponding to the index nnhood_label = (self.y[nnhood_idx] == key) # Check which one are the same label than the current class # Make an AND operation through the three neighbours nnhood_bool = np.logical_not(np.all(nnhood_label, axis=1)) # If the minority class remove the majority samples (as in politic!!!! ;)) if key == self.minc: # Get the index to exclude idx_to_exclude += nnhood_idx[np.nonzero(nnhood_label[np.nonzero(nnhood_bool)])].tolist() else: # Get the index to exclude idx_to_exclude += idx_sub_sample[np.nonzero(nnhood_bool)].tolist() # Create a vector with the sample to select sel_idx = np.ones(self.y.shape) sel_idx[idx_to_exclude] = 0 # Get the samples from the majority classes sel_x = np.squeeze(self.x[np.nonzero(sel_idx), :]) sel_y = self.y[np.nonzero(sel_idx)] underx = concatenate((underx, sel_x), axis=0) undery = concatenate((undery, sel_y), axis=0) if self.verbose: print("Under-sampling performed: " + str(Counter(undery))) return underx, undery
def test_method(method, k=10, tests=5):
    from sklearn.neighbors import NearestNeighbors

    nn = NearestNeighbors(leaf_size=data.shape[0]).fit(data)
    score = 0.0
    t_nn = 0.0
    t_meth = 0.0
    np.random.seed(0)
    for i in range(tests):
        d = data[np.random.randint(data.shape[0])]

        t0 = time.time()
        method_res = method(d, k)
        t_meth += time.time() - t0

        t0 = time.time()
        nn_res = nn.kneighbors([d], n_neighbors=k, return_distance=False)
        t_nn += time.time() - t0

        score += np.mean(np.in1d(nn_res, method_res))
    t_nn /= tests
    t_meth /= tests
    r1 = 'NN time: %1.10f method time: %1.10f speedup: %1.10f' % (t_nn, t_meth, t_nn / t_meth)
    r2 = '%1.2f%% overlap' % ((score / tests) * 100)
    return r1 + '\n' + r2
def predict_appliance(home, appliance, feature):
    if home in all_homes[appliance]:
        home_to_pick = home
    else:
        home_to_pick = all_homes[appliance][0]
    print(home_to_pick)

    feature_dict = json.load(open("../data/output/sensitivity-numfeatures-allhomes/%s_%s_%d.json"
                                  % (appliance, feature, home_to_pick), "r"))
    f = feature_dict['f']
    k = feature_dict['k']
    clf = KNeighborsRegressor(n_neighbors=k)
    nn = NearestNeighbors(n_neighbors=k)
    df_new = df.copy()
    df_new = df_new.loc[all_homes[appliance]]
    df_new = df_new.loc[~df_new.index.isin([home])]
    #df_new = df_new.drop(home, axis=1)
    nn.fit(df_new[f].dropna())
    distances, indices = nn.kneighbors(df.loc[[home], f])
    out = []
    nghbrs_list = df_new.index[indices].values[0]
    for month in range(1, 13):
        if len(nghbrs_list) > 1:
            out.append(df_new[["%s_%d" % (appliance, month)]].loc[nghbrs_list].sum().values[0] / k)
        else:
            out.append(df_new[["%s_%d" % (appliance, month)]].loc[nghbrs_list].values[0] / k)
    return out
def knn_recommender():
    category_feature_matrix = pickle.load(open('bi_feature_matrix/category_feature_matrix', 'rb'))
    global_image_mapping = pickle.load(open('bi_feature_matrix/global_image_mapping', 'rb'))
    image_model = NearestNeighbors(n_neighbors=10, algorithm="auto").fit(category_feature_matrix)
    features_image, probs_image = generate_tags("http://images.wisegeek.com/beach.jpg")

    # features_image = [u'bridge', u'water', u'river', u'reflection', u'no person', u'sunset', u'sky', u'travel',
    #                   u'evening', u'architecture', u'dawn', u'city', u'dusk', u'light', u'suspension', u'urban',
    #                   u'landscape', u'transportation system', u'suspension bridge', u'lake']
    #
    # probs_image = [0.9987363815307617, 0.9966945648193359, 0.9950028657913208, 0.9752582311630249, 0.9750866889953613,
    #                0.9703925848007202, 0.9699936509132385, 0.9686242341995239, 0.9574745893478394, 0.949645459651947,
    #                0.9459954500198364, 0.9424264430999756, 0.9072239398956299, 0.898352324962616, 0.8937841653823853,
    #                0.8838391304016113, 0.8808287382125854, 0.8789186477661133, 0.8749411106109619, 0.8702283501625061]

    feature_image_row_vector = np.zeros((1, len(features)))
    for j in range(len(features_image)):
        feature = features_image[j]
        if feature in features:
            feature_index = features.index(feature)
            feature_image_row_vector[0, feature_index] = probs_image[j]

    distance_near_image, images_near = image_model.kneighbors(feature_image_row_vector)
    for image_near in images_near[0]:
        id_image = global_image_mapping[image_near]
        cursor = train_collection.find({"1": id_image})
        for data in cursor:
            print(data["field14"])
def sampling_vectorized_file(file_number):
    j_train = j_train_prefix + format('%02d' % file_number)
    s_train = s_train_prefix + format('%02d' % file_number)
    if os.path.isfile(s_train):
        print(s_train, 'already exists...')  # this file is already pending
        return
    touch(s_train)
    print('creating', s_train, 'from', j_train, '...')
    # load the pre-vectorized train text as multiple batches of nrows
    assert os.path.isfile(j_train)
    (X_train, y_train) = joblib.load(j_train)
    n = X_test.shape[0]                          # 35065
    m = X_train.shape[0] // size_c * neighbor_c  # 500000 / 10000
    dist = np.zeros(shape=(n, m), dtype=float)
    indx = np.zeros(shape=(n, m), dtype=int)
    for i in range(m // neighbor_c):
        print(j_train, ':', i, '/', m // neighbor_c)
        off_c = i * size_c
        X_c = X_train[off_c:off_c + size_c]
        nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute',
                                metric='cosine').fit(X_c)
        t_dist, t_indx = nbrs.kneighbors(X_test)
        dist[:, neighbor_c * i:neighbor_c * (i + 1)] = t_dist
        indx[:, neighbor_c * i:neighbor_c * (i + 1)] = t_indx + off_c + file_number * 500000
    joblib.dump((dist, indx), s_train)
    return dist, indx
def _compute_neighbors(self):
    V, dim = self.data_frame.shape
    neighbors = NearestNeighbors(n_neighbors=self.num_neighbors,
                                 algorithm='auto').fit(self.data_frame)
    _, indices = neighbors.kneighbors(self.data_frame)
    self._adjacency_graph = neighbors.kneighbors_graph(self.data_frame, mode='connectivity')
    self._knn_graph = neighbors.kneighbors_graph(self.data_frame, mode='distance')
    self._neighbors = indices
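# A small standalone illustration of the two graph modes used above, assuming a
# plain numpy array in place of self.data_frame.
import numpy as np
from sklearn.neighbors import NearestNeighbors

pts = np.random.RandomState(0).rand(6, 2)
graph_builder = NearestNeighbors(n_neighbors=3).fit(pts)
adjacency = graph_builder.kneighbors_graph(pts, mode='connectivity')  # 0/1 entries
weighted = graph_builder.kneighbors_graph(pts, mode='distance')       # distances as weights
print(adjacency.toarray())
print(weighted.toarray())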
def removeRedundantFrames(self):
    h, w, d = self.keyframes[0].shape
    n = len(self.keyframes)
    frames = np.zeros((n, 256))
    for i, kf in enumerate(self.keyframes):
        frames[i] = tools.getColorHist(kf).ravel()
    k = int(np.sqrt(n))
    kmeans = KMeans(k)
    print("Clustering frames into {0} code vectors.".format(k))
    kmeans.fit(self.frameHistFeats)
    bestFrameIndices = []
    bestFrames = []
    NN = NearestNeighbors(n_neighbors=1)
    NN.fit(frames)
    centers = kmeans.cluster_centers_
    for center in centers:
        nearest = NN.kneighbors([center], return_distance=False)
        bestFrameIndices.append(nearest[0][0])
    bestFrameIndices.sort()
    for i in bestFrameIndices:
        bestFrames.append(self.keyframes[i])
    return bestFrames
def main():
    vectorizer = CountVectorizer(ngram_range=(1, 2), max_df=1.0, min_df=0.0)
    nei = NearestNeighbors(algorithm='brute', metric='jaccard')
    matrix = vectorizer.fit_transform(training_set).todense()
    new_matrix = vectorizer.transform(new_comments).todense()
    nei.fit(matrix)
    path = '{0}/'.format(pathsplit(abspath(__file__))[0])
    jsonfile = open(path + '{0}-nn.json'.format(n_neighbors), 'w')

    nodes = [{'name': (training_set + new_comments)[i],
              'group': (groups + new_groups)[i]}
             for i in range(len(training_set + new_comments))]

    links = []
    for i in range(len(matrix)):
        dist, idnei = nei.kneighbors(matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source": i, "target": idnei[j + 1],
                          "value": 10 * (1 - dist[j + 1])})

    for i in range(len(new_comments)):
        dist, idnei = nei.kneighbors(new_matrix[i], n_neighbors=n_neighbors + 1)
        dist, idnei = dist[0], idnei[0]
        for j in range(len(idnei[1:])):
            links.append({"source": len(matrix) + i, "target": idnei[j],
                          "value": 10 * (1 - dist[j + 1])})

    jsondumped = json.dumps({'nodes': nodes, 'links': links}, indent=2)
    jsonfile.write(jsondumped)
def move(self, event):
    # add the knn scheme to decide the selected region when moving the mouse
    if SKLEARN_INSTALLED:
        if event.button == 1 and event.is_dragging:
            # TODO: support multiple datasets here
            data = get_map_data_scatter(self.active_layer_artist.layer,
                                        self.active_layer_artist.visual,
                                        self._vispy_widget)

            # calculate the threshold and call draw visual
            width = event.pos[0] - self.selection_origin[0]
            height = event.pos[1] - self.selection_origin[1]
            drag_distance = math.sqrt(width**2 + height**2)
            canvas_diag = math.sqrt(self._vispy_widget.canvas.size[0]**2 +
                                    self._vispy_widget.canvas.size[1]**2)

            mask = np.zeros(self.active_layer_artist.layer.shape)

            # neighbor num proportional to the mouse moving distance
            n_neighbors = int(drag_distance / canvas_diag *
                              self.active_layer_artist.layer.shape[0])
            if n_neighbors >= 1:
                neigh = NearestNeighbors(n_neighbors=n_neighbors)
                neigh.fit(data)
                select_index = neigh.kneighbors([self.selection_origin])[1]
                mask[select_index] = 1

            self.mark_selected(mask, self.active_layer_artist.layer)
def entropy(data=None, prob=None, method='nearest-neighbors', bins=None, errorVal=1e-5, units='bits'): ''' given a probability distribution (prob) or an interable of symbols (data) compute and return its continuous entropy. inputs: ------ data: samples by dimensions ndarray prob: iterable with probabilities method: 'nearest-neighbors', 'gaussian', or 'bin' bins: either a list of num_bins, or a list of lists containing the bin edges errorVal: if prob is given, 'entropy' checks that the sum is about 1. It raises an error if abs(sum(prob)-1) >= errorVal units: either 'bits' or 'nats' Different Methods: 'nearest-neighbors' computes the binless entropy (bits) of a random vector using average nearest neighbors distance (Kozachenko and Leonenko, 1987). For a review see Beirlant et al., 2001 or Chandler & Field, 2007. 'gaussian' computes the binless entropy based on estimating the covariance matrix and assuming the data is normally distributed. 'bin' discretizes the data and computes the discrete entropy. ''' if prob is None and data is None: raise ValueError( "%s.entropy requires either 'prob' or 'data' to be defined" % __name__) if prob is not None and data is not None: raise ValueError( "%s.entropy requires only 'prob' or 'data to be given but not both" % __name__) if prob is not None and not isinstance(prob, np.ndarray): raise TypeError("'entropy' in '%s' needs 'prob' to be an ndarray" % __name__) if prob is not None and abs(prob.sum() - 1) > errorVal: raise ValueError("parameter 'prob' in '%s.entropy' should sum to 1" % __name__) if data.any(): num_samples = data.shape[0] if len(data.shape) == 1: num_dimensions = 1 else: num_dimensions = data.shape[1] if method == 'nearest-neighbors': from sklearn.neighbors import NearestNeighbors from scipy.special import gamma if data is None: raise ValueError( 'Nearest neighbors entropy requires original data') if len(data.shape) > 1: k = num_dimensions else: k = 1 nbrs = NearestNeighbors(n_neighbors=2, algorithm='auto').fit(data) # pdb.set_trace() distances, indices = nbrs.kneighbors(data) rho = distances[:, 1] # take nearest-neighbor distance (first column is always zero) Ak = (k * np.pi**(float(k) / float(2))) / gamma(float(k) / float(2) + 1) if units is 'bits': # 0.577215... is the Euler-Mascheroni constant (np.euler_gamma) return k * np.mean(np.log2(rho)) + np.log2( num_samples * Ak / k) + np.log2(np.exp(1)) * np.euler_gamma elif units is 'nats': # 0.577215... 
is the Euler-Mascheroni constant (np.euler_gamma) return k * np.mean(np.log(rho)) + np.log( num_samples * Ak / k) + np.log(np.exp(1)) * np.euler_gamma else: print('Units not recognized: {}'.format(units)) elif method == 'gaussian': from numpy.linalg import det if data is None: raise ValueError( 'Nearest neighbors entropy requires original data') detCov = det(np.dot(data.transpose(), data) / num_samples) normalization = (2 * np.pi * np.exp(1))**num_dimensions if detCov == 0: return -np.inf else: if units is 'bits': return 0.5 * np.log2(normalization * detCov) elif units is 'nats': return 0.5 * np.log(normalization * detCov) else: print('Units not recognized: {}'.format(units)) elif method == 'bin': if prob is None and bins is None: raise ValueError('Either prob or bins must be specified.') if data is not None: prob = symbols_to_prob(data, bins=bins) if units is 'bits': # compute the log2 of the probability and change any -inf by 0s logProb = np.log2(prob) logProb[logProb == -np.inf] = 0 elif units is 'nats': # compute the log2 of the probability and change any -inf by 0s logProb = np.log(prob) logProb[logProb == -np.inf] = 0 else: print('Units not recognized: {}'.format(units)) # return sum of product of logProb and prob # (not using np.dot here because prob, logprob are nd arrays) return -float(np.sum(prob * logProb))
# Calculating the minimums
MinX = min(TopVoxels['X'])
MinY = min(TopVoxels['Y'])
MinZ = min(TopVoxels['Z'])

# Subtracting the minimum from each column (to move the cloud to 0,0,0)
TopVoxels['X'] = TopVoxels['X'].apply(lambda x: x - MinX)
TopVoxels['Y'] = TopVoxels['Y'].apply(lambda x: x - MinY)
TopVoxels['Z'] = TopVoxels['Z'].apply(lambda x: x - MinZ)

# Creating a 'deep' copy of TopVoxels to use later, because it has the same format and we want that
TopVoxelNormals = TopVoxels.copy(deep=True)

# Distance list
k_value = 10
nbrs = NearestNeighbors(n_neighbors=k_value, algorithm='ball_tree').fit(TopVoxels)
distances, indices = nbrs.kneighbors(TopVoxels)

# Adding normal columns
for index, row in TopVoxels.iterrows():
    vectorMatrix = np.zeros(shape=[3, k_value])
    sum_centroids = np.zeros(shape=[1, 3])
    centroid = np.zeros(shape=[1, 3])
    covariance_matrix = np.zeros(shape=[3, 3])
    for j in range(k_value):
        vectorMatrix[0, j] = TopVoxels['X'][indices[index][j]]
        vectorMatrix[1, j] = TopVoxels['Y'][indices[index][j]]
        vectorMatrix[2, j] = TopVoxels['Z'][indices[index][j]]
        sum_centroids = np.add(sum_centroids, vectorMatrix[:, j])
def points_match_generator(ptrs, k_ptrs, affine_level, rand_move_level, batch_size): nbors=NearestNeighbors(n_neighbors=k_ptrs+1).fit(ptrs) while 1: coordinates_ref_batch = np.zeros((batch_size,k_ptrs*3+1),dtype='float32') coordinates_tgt_batch = np.zeros((batch_size,k_ptrs*3+1),dtype='float32') matching = np.zeros((batch_size,1),dtype='int') for batch in range(batch_size): # affine transform the reference point set ptrs_affine=affine_transform(ptrs,affine_level,rand_move_level) ########################################################################## # add additional errors to target set to simulate segmentation mistakes ########################################################################## random_idx = np.arange(np.shape(ptrs)[0]) np.random.shuffle(random_idx) idx_errors = random_idx[0:k_ptrs+1] errors = (np.random.rand(k_ptrs,3)-0.5)*10 ptrs_target = np.copy(ptrs_affine) ptrs_target[idx_errors[0:k_ptrs],:] = ptrs_affine[idx_errors[0:k_ptrs],:]+errors ############################################################### # calculate (relative) coordinates of a single reference point ############################################################### distances_ref,nbors_idx=nbors.kneighbors(ptrs[idx_errors[k_ptrs:k_ptrs+1],:], return_distance=True) mean_distance = np.mean(distances_ref) coordinates_ref_relative=(ptrs[nbors_idx[0,1:k_ptrs+1],:]-ptrs[nbors_idx[0,0],:])/mean_distance coordinates_ref = np.zeros(k_ptrs*3+1) coordinates_ref[0:k_ptrs*3] = coordinates_ref_relative.reshape(k_ptrs*3) coordinates_ref[k_ptrs*3] = mean_distance nbors_target=NearestNeighbors(n_neighbors=k_ptrs+1).fit(ptrs_target) matching_flag=np.random.rand() if matching_flag>0.5: ################################################################### # calculate coordinates of corresponding point in target point set ################################################################### distances_tgt_true,nbors_idx=nbors_target.kneighbors(ptrs_target[idx_errors[k_ptrs:k_ptrs+1],:], return_distance=True) mean_distance=np.mean(distances_tgt_true) coordinates_tgt_relative=(ptrs_target[nbors_idx[0,1:k_ptrs+1],:]-ptrs_target[nbors_idx[0,0],:])/mean_distance coordinates_tgt=np.zeros(k_ptrs*3+1) coordinates_tgt[0:k_ptrs*3]=coordinates_tgt_relative.reshape(k_ptrs*3) coordinates_tgt[k_ptrs*3]=mean_distance elif matching_flag<=0.5: ######################################################################### # calculate coordinates of a non-corresponding point in target point set ######################################################################### nbors_idx=nbors_target.kneighbors(ptrs_target[idx_errors[k_ptrs:k_ptrs+1],:], return_distance=False) random_nbors_idx=np.copy(nbors_idx[0,1:k_ptrs+1]) np.random.shuffle(random_nbors_idx) distances_tgt_false,nbors_idx_false=nbors_target.kneighbors(ptrs_target[random_nbors_idx[0:1],:], return_distance=True) mean_distance=np.mean(distances_tgt_false) coordinates_tgt_relative=(ptrs_target[nbors_idx_false[0,1:k_ptrs+1],:]-ptrs_target[nbors_idx_false[0,0],:])/mean_distance coordinates_tgt=np.zeros(k_ptrs*3+1) coordinates_tgt[0:k_ptrs*3]=coordinates_tgt_relative.reshape(k_ptrs*3) coordinates_tgt[k_ptrs*3]=mean_distance else: raise NameError("matching_flag has an abnormal value") coordinates_ref_batch[batch,:]=coordinates_ref.reshape(1,k_ptrs*3+1) coordinates_tgt_batch[batch,:]=coordinates_tgt.reshape(1,k_ptrs*3+1) matching_flag=int(matching_flag>0.5) matching[batch,:]=np.array(matching_flag).reshape(1,1) yield ([coordinates_ref_batch, coordinates_tgt_batch], matching)
def get_sizes_and_z_from_cell_list(tracked_cells_df, cells, frame_num, scale_xy, scale_z, neighbors, density, scaled_vol=1): new_metric = [] new_z = [] total_metric = [] total_z = [] EXCLUDED_metric = [] EXCLUDED_z = [] ### get list of all cell locations on current frame all_x = tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].X all_y = tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].Y all_z = tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].Z all_series_num = tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].SERIES if not scaled_vol: all_coords = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].coords) else: all_vols = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.FRAME == frame_num)].vol_rescaled) all_centroids = [np.asarray(all_x) * scale_xy, np.asarray(all_y) * scale_xy, np.asarray(all_z) * scale_z] all_centroids = np.transpose(all_centroids) if density: nbrs = NearestNeighbors(n_neighbors=neighbors, algorithm='ball_tree').fit(all_centroids) distances, indices = nbrs.kneighbors(all_centroids) """ Get all distances to see distribution comparing DEPTH and density/distance """ for dist_idx, obj in enumerate(distances): cur_dist = obj[1:-1] mean = np.mean(cur_dist) #total_dists.append(cur_dist) total_metric = np.concatenate((total_metric, [mean])) """ ALSO GET LIST EXCLUDING CURRENT METRIC """ series_num = np.asarray(all_series_num)[dist_idx] if series_num not in cells: EXCLUDED_metric = np.concatenate((EXCLUDED_metric, [mean])) EXCLUDED_z = np.concatenate((EXCLUDED_z, [all_centroids[dist_idx, -1]])) total_z = np.concatenate((total_z, all_centroids[:, -1])) """ Go cell by cell through NEW cells only """ for cur_cell in cells: dist_idx = np.where(all_series_num == cur_cell) cur_dists = distances[dist_idx][0][1:-1] mean_dist = np.mean(cur_dists) new_metric = np.concatenate((new_metric, [mean_dist])) ### compare it with all the other cells at the same depth that are NOT new cur_z = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.SERIES == cur_cell)].Z)[0] * scale_z new_z = np.concatenate((new_z, [cur_z])) elif not density: """ Get all distances to see distribution comparing DEPTH and density/distance """ if not scaled_vol: for obj in all_coords: cur_vol = len(obj) total_metric = np.concatenate((total_metric, [cur_vol])) else: total_metric = np.concatenate((total_metric, all_vols)) total_z = np.concatenate((total_z, np.asarray(all_z) * scale_z)) """ Get terminated cells """ for cur_cell in cells: dist_idx = np.where(all_series_num == cur_cell) if not scaled_vol: cur_vol = len(all_coords[dist_idx][0]) new_metric = np.concatenate((new_metric, [cur_vol])) else: cur_vol = all_vols[dist_idx] new_metric = np.concatenate((new_metric, cur_vol)) ### compare it with all the other cells at the same depth that are NOT new cur_z = np.asarray(tracked_cells_df.iloc[np.where(tracked_cells_df.SERIES == cur_cell)].Z)[0] * scale_z new_z = np.concatenate((new_z, [cur_z])) return total_metric, total_z, new_metric, new_z, EXCLUDED_metric, EXCLUDED_z
def perform_KNN(train_vecs, val_vecs, n_neighbors=1):
    print("-> Preparing KNN...")
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, n_jobs=-1).fit(train_vecs)
    print("-> Finding closest neighbors...")
    distances, indices = nbrs.kneighbors(val_vecs)
    return indices
def fit(self, X, y=None): """Fit the DPA clustering on the data. Parameters ---------- X : array [n_samples, n_samples] if metric == “precomputed”, or, [n_samples, n_features] otherwise The input samples. Similarities / affinities between instances if ``affinity='precomputed'``. y : Ignored Not used, present here for API consistency by convention. Returns ------- self : object Returns self. """ # Input validation X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64, ensure_min_samples=2) allow_squared = self.affinity in [ "precomputed", "precomputed_nearest_neighbors" ] if X.shape[0] == X.shape[1] and not allow_squared: warnings.warn("The DPA clustering API has changed. ``fit``" "now constructs an affinity matrix from data. To use" " a custom affinity matrix, " "set ``affinity=precomputed``.") self.k_max_ = self.k_max self.dim_ = self.dim if not self.dim: if self.dim_algo == "auto": self.dim_ = X.shape[1] elif self.dim_algo == "twoNN": if self.block_ratio >= X.shape[0]: raise ValueError( "block_ratio is larger than the sample size, the minimum size for \ block analysis would be zero. Please set a value lower than " + str(X.shape[0])) self.dim_ = twoNearestNeighbors(blockAn=self.blockAn, block_ratio=self.block_ratio, metric=self.metric, frac=self.frac, n_jobs=self.n_jobs).fit(X).dim_ else: pass # If densities, uncertainties and k_hat are provided as input, compute only the # matrix of nearest neighbor: self.densities_ = self.densities self.err_densities_ = self.err_densities self.k_hat_ = self.k_hat if self.densities_ is not None and self.err_densities_ is not None and self.k_hat_ is not None: # If the nearest neighbors matrix is precomputed: if self.nn_distances is not None and self.nn_indices is not None: self.k_max_ = max(self.k_hat_) self.nn_distances_ = self.nn_distances self.nn_indices_ = self.nn_indices else: self.k_max_ = max(self.k_hat_) if self.metric == "precomputed": nbrs = NearestNeighbors( n_neighbors=self.k_max_ + 1, # The point i is counted in its neighborhood algorithm="brute", metric=self.metric, n_jobs=self.n_jobs).fit(X) else: nbrs = NearestNeighbors( n_neighbors=self.k_max_ + 1, # The point i is counted in its neighborhood algorithm="auto", metric=self.metric, n_jobs=self.n_jobs).fit(X) self.nn_distances_, nn_self.indices_ = nbrs.kneighbors(X) elif self.density_algo == "PAk": # If the nearest neighbors matrix is precomputed: if self.nn_distances is not None and self.nn_indices is not None: self.k_max_ = self.nn_distances.shape[1] - 1 PAk = PointAdaptive_kNN(k_max=self.k_max_, D_thr=self.D_thr, metric=self.metric, nn_distances=self.nn_distances, nn_indices=self.nn_indices, dim_algo=self.dim_algo, blockAn=self.blockAn, block_ratio=self.block_ratio, frac=self.frac, dim=self.dim_, n_jobs=self.n_jobs).fit(X) else: PAk = PointAdaptive_kNN(k_max=self.k_max_, D_thr=self.D_thr, metric=self.metric, dim_algo=self.dim_algo, blockAn=self.blockAn, block_ratio=self.block_ratio, frac=self.frac, dim=self.dim_, n_jobs=self.n_jobs).fit(X) self.nn_distances_ = PAk.distances_ self.nn_indices_ = PAk.indices_ self.densities_ = PAk.densities_ self.err_densities_ = PAk.err_densities_ self.k_hat_ = PAk.k_hat_ self.k_max_ = max(self.k_hat_) else: # TODO: implement option for kNN pass self.labels_, self.halos_, self.topography_, self.g_, self.centers_ = _DensityPeakAdvanced( self.densities_, self.err_densities_, self.k_hat_, self.nn_distances_, self.nn_indices_, self.Z) self.is_fitted_ = True return self
tf_value_of_input_data = input_tf(input_data)


def input_idf(input_data):
    idf_each_doc_vec_input_data = []
    for each_word_input_data, val in word_dict.items():
        if each_word_input_data in input_data:
            word_value_in_each_doc_input_data = countIdfforwordvalue.get(each_word_input_data)
            idf_each_doc_vec_input_data.append(mth.log(length_of_docs / word_value_in_each_doc_input_data))
        else:
            idf_each_doc_vec_input_data.append(0)
    return idf_each_doc_vec_input_data


idf_value_of_input_data = input_idf(input_data)


def computeTfIdf_input(tf_value_of_input_data, idf_value_of_input_data):
    tfidf_input_vec = [a * b for a, b in zip(tf_value_of_input_data, idf_value_of_input_data)]
    return tfidf_input_vec


TfIdf_value_of_input_data = computeTfIdf_input(tf_value_of_input_data, idf_value_of_input_data)

value_for_predict = np.array(TfIdf_value_of_input_data).reshape(1, -1)

print("prediction")
from sklearn.neighbors import NearestNeighbors

neigh = NearestNeighbors(n_neighbors=5)
neigh.fit(features)
print(neigh.kneighbors(value_for_predict))
# KNN searching with a k-d tree
import numpy as np
from sklearn.neighbors import NearestNeighbors
import timeit

# data creation
X = np.random.random((1000, 3))
print(X)

# training
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(X)

# testing
test = np.array([0.3, 0.4, 0.4])
test1 = test.reshape(1, -1)
print(nn.kneighbors(test1, 5))
def addAvgDiff(train_df, test_df, nn=20): #adding features """ ['diff_avg_price','diff_price_per_bed','diff_price_per_bath','diff_price_per_room'] """ #Note that removing outliers is moved into this function setOutlierNan(train_df) setOutlierNan(test_df) train_test = pd.concat([train_df.drop('interest_level', axis=1), test_df]).dropna() nnfinder = NearestNeighbors(n_neighbors=nn) nnfinder.fit(train_test.ix[:, ['latitude', 'longitude']]) train_df['diff_avg_price'] = np.nan train_df['diff_price_per_bed'] = np.nan train_df['diff_price_per_bath'] = np.nan train_df['diff_price_per_room'] = np.nan for i in train_df.index: if ~np.isnan(train_df.ix[i, 'latitude']) and ~np.isnan( train_df.ix[i, 'longitude']): _, idx = nnfinder.kneighbors( train_df.ix[i, ['latitude', 'longitude']].as_matrix().reshape( 1, -1)) #price temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price'] temp = temp[temp != np.inf] train_df.ix[i, 'diff_avg_price'] = train_df.ix[i, 'price'] - temp.mean() temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price_per_bed'] temp = temp[temp != np.inf] train_df.ix[i, 'diff_price_per_bed'] = train_df.ix[ i, 'price_per_bed'] - temp.mean() temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price_per_bath'] temp = temp[temp != np.inf] train_df.ix[i, 'diff_price_per_bath'] = train_df.ix[ i, 'price_per_bath'] - temp.mean() temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price_per_room'] temp = temp[temp != np.inf] train_df.ix[i, 'diff_price_per_room'] = train_df.ix[ i, 'price_per_room'] - temp.mean() test_df['diff_avg_price'] = np.nan test_df['diff_price_per_bed'] = np.nan test_df['diff_price_per_bath'] = np.nan test_df['diff_price_per_room'] = np.nan for i in test_df.index: if ~np.isnan(test_df.ix[i, 'latitude']) and ~np.isnan( test_df.ix[i, 'longitude']): _, idx = nnfinder.kneighbors( test_df.ix[i, ['latitude', 'longitude']].as_matrix().reshape( 1, -1)) #price temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price'] temp = temp[temp != np.inf] test_df.ix[i, 'diff_avg_price'] = test_df.ix[i, 'price'] - temp.mean() temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price_per_bed'] temp = temp[temp != np.inf] test_df.ix[i, 'diff_price_per_bed'] = test_df.ix[ i, 'price_per_bed'] - temp.mean() temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price_per_bath'] temp = temp[temp != np.inf] test_df.ix[i, 'diff_price_per_bath'] = test_df.ix[ i, 'price_per_bath'] - temp.mean() temp = train_test.iloc[ idx.reshape(idx.size).tolist(), :].ix[:, 'price_per_room'] temp = temp[temp != np.inf] test_df.ix[i, 'diff_price_per_room'] = test_df.ix[ i, 'price_per_room'] - temp.mean() train_df['longitude'] = train_df['longitude'].fillna(0) train_df['latitude'] = train_df['latitude'].fillna(0) test_df['longitude'] = test_df['longitude'].fillna(0) test_df['latitude'] = test_df['latitude'].fillna(0)
def perform_type_prediction(self, df, based_on_num_neigh=3): def create_binary_type_vector(t_types, a_types): vector = np.zeros(len(all_types)) i = [a_types.index(_) for _ in t_types] vector[i] = 1 return vector def create_binary_type_prediction_vector(t_types, a_types): vector = np.zeros(len(all_types)) i = [ a_types.index(_) for _ in itertools.chain.from_iterable(t_types) ] vector[i] += 1 return vector # get the types. Mapping from the index of subject to the index of object type_info = deserializer(path=self.p_folder, serialized_name='type_info') # get the index of objects / get type information =>>> s #type o all_types = sorted(set.union(*list(type_info.values()))) # Consider only points with type infos. e_w_types = df.loc[list(type_info.keys())] neigh = NearestNeighbors(n_neighbors=based_on_num_neigh, algorithm='kd_tree', metric='euclidean', n_jobs=-1).fit(e_w_types) # Get similarity results for selected entities df_most_similars = pd.DataFrame( neigh.kneighbors(e_w_types, return_distance=False)) # Reindex the target df_most_similars.index = e_w_types.index.values # As sklearn implementation of kneighbors returns the point itself as most similar point df_most_similars.drop(columns=[0], inplace=True) # Map back to the original indexes. KNN does not consider the index of Dataframe. mapper = dict(zip(list(range(len(e_w_types))), e_w_types.index.values)) # The values of most similars are mapped to original vocabulary positions df_most_similars = df_most_similars.applymap(lambda x: mapper[x]) k_values = [1, 3, 5, 10, 15, 30, 50, 100] self.logger.info('K values: {0}'.format(k_values)) for k in k_values: self.logger.info('##### {0} #####'.format(k)) similarities = list() for _, S in df_most_similars.iterrows(): true_types = type_info[_] type_predictions = [type_info[_] for _ in S.values[:k]] vector_true = create_binary_type_vector(true_types, all_types) vector_prediction = create_binary_type_prediction_vector( type_predictions, all_types) sim = cosine(vector_true, vector_prediction) similarities.append(1 - sim) report = pd.DataFrame(similarities) self.logger.info('Mean type prediction: {0}'.format( report.mean().values))
from sklearn.neighbors import NearestNeighbors
from carotid import carotid_data_util as cdu
from my_util import data_util
import matplotlib.pyplot as plt

id_all, x_data_all, y_data_all = cdu.get_ex_normal()
# id_all, x_data_all, y_data_all = cdu.get_ex_data('RCCA')
# x_data_all = data_util.scale(x_data_all)

# a minimum minPts can be derived from the number of dimensions D in the data set, as minPts ≥ D + 1
minPts = x_data_all.shape[1] + 1
# the value for ε can then be chosen from a k-distance graph, plotting the distance to the
# k = minPts - 1 nearest neighbour of each point, sorted in descending order
k = minPts - 1
nbrs = NearestNeighbors(n_neighbors=k).fit(x_data_all)
distances, indices = nbrs.kneighbors(x_data_all)
distanceDec = sorted(distances[:, k - 1], reverse=True)
plt.plot(list(range(1, x_data_all.shape[0] + 1)), distanceDec)
plt.show()
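# A hedged follow-up sketch: once an elbow value has been read off the k-distance
# plot above, it can be passed to DBSCAN together with minPts. The eps value below
# is purely illustrative.
from sklearn.cluster import DBSCAN

eps = 0.5  # read from the elbow of the k-distance plot (illustrative)
labels = DBSCAN(eps=eps, min_samples=minPts).fit_predict(x_data_all)
print("clusters found:", len(set(labels) - {-1}))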
def _label_clusters_first_pass(self, n_datasets, n_sym_ops): """First pass labelling of clusters. Labels points into clusters such that cluster contains exactly one copy of each dataset. Args: n_datasets (int): The number of datasets. n_sym_ops (int): The number of symmetry operations. Returns: cluster_labels (np.ndarray): A label for each coordinate, labelled from 0 .. n_sym_ops. """ # initialise cluster labels: -1 signifies doesn't belong to a cluster cluster_labels = np.full(self.coords.shape[0], -1, dtype=int) cluster_id = 0 while (cluster_labels == -1).sum() > 0: coord_ids = np.arange(n_datasets * n_sym_ops) dataset_ids = coord_ids % n_datasets # select only those points that don't already belong to a cluster sel = np.where(cluster_labels == -1) X = self.coords[sel] dataset_ids = dataset_ids[sel] coord_ids = coord_ids[sel] # choose a high density point as seed for cluster nbrs = NearestNeighbors(n_neighbors=min(11, len(X)), algorithm="brute", metric="cosine").fit(X) distances, indices = nbrs.kneighbors(X) average_distance = np.array( [dist[1:].mean() for dist in distances]) i = average_distance.argmin() d_id = dataset_ids[i] cluster = np.array([coord_ids[i]]) cluster_dataset_ids = np.array([d_id]) xis = np.array([X[i]]) for j in range(n_datasets - 1): # select only those rows that don't correspond to a dataset already # present in current cluster sel = np.where(dataset_ids != d_id) X = X[sel] dataset_ids = dataset_ids[sel] coord_ids = coord_ids[sel] assert len(X) > 0 # Find nearest neighbour in cosine-space to the current cluster centroid nbrs = NearestNeighbors(n_neighbors=min(1, len(X)), algorithm="brute", metric="cosine").fit(X) distances, indices = nbrs.kneighbors([xis.mean(axis=0)]) k = indices[0][0] d_id = dataset_ids[k] cluster = np.append(cluster, coord_ids[k]) cluster_dataset_ids = np.append(cluster_dataset_ids, d_id) xis = np.append(xis, [X[k]], axis=0) # label this cluster cluster_labels[cluster] = cluster_id cluster_id += 1 return cluster_labels
def run_xmap(dataset=None, n_neighbors=None, negative_sample_rate=None, seed=None, model_folder="models",
             return_step=STEP.DATA_CLEANED, learn_mode=LEARN.UNSUPERVISED, runall=False):
    global f, SEED, NNK, NS, cmap, NFEATURE
    # runall = True
    SEED = seed
    NNK = n_neighbors
    NS = negative_sample_rate
    # define the distance measure to be used in UMAP. Many options are available in UMAP
    distancem = "euclidean"
    np.random.seed(SEED)
    # define the path to store intermediate outputs
    pathname = model_folder + "/" + "xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset
    current_step = STEP.INITIALIZING
    t0 = time.time()

    """ STEP 1: Loading the dataset """
    if runall or not os.path.exists(pathname + ".cleandata"):
        f = open("outputs/xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset + ".log", 'w')
        print_output("Loading Data ...")
        print_output("\tData set: {}".format(dataset))
        data = pd.read_csv("../data/{}.csv".format(dataset))
        feature_names = [c.replace("_", "ubar").replace(".", "dot") for c in data.columns][1:]
        target_name = data.columns[0]
        nfeatures = len(feature_names)
        NFEATURE = nfeatures
        data = data.values
        Y = data[:, 0].reshape(-1, 1)
        X = data[:, 1:]
        # scale the dataset
        scaler = MinMaxScaler()
        scaler.fit(X)
        X_norm = scaler.transform(X)
        last_step = STEP.DATA_CLEANED
        pickle.dump((X_norm, Y, scaler, nfeatures, feature_names, target_name),
                    open(pathname + ".cleandata", "wb"))
    else:
        f = open("outputs/xmap_k{}_ns{}_s{}_".format(NNK, NS, SEED) + dataset + ".log", 'w+')
        print_output("\tLoad cleaned data from " + pathname + ".cleandata")
        X_norm, Y, scaler, nfeatures, feature_names, target_name = pickle.load(
            open(pathname + ".cleandata", "rb"))
    current_step = STEP.DATA_CLEANED
    if current_step == return_step:
        return X_norm, Y, scaler, nfeatures, feature_names, target_name

    """ STEP 2: Learn the latent representation of the dataset (unsupervised or supervised) """
    print_output("Learning UMAP ...")
    print(learn_mode)
    if learn_mode == LEARN.UNSUPERVISED:
        umapname = ".unsupervised_umap"
    else:
        umapname = ".supervised_umap"
    if runall or not os.path.exists(pathname + umapname):
        reducer = umap.UMAP(random_state=SEED, n_neighbors=NNK, negative_sample_rate=NS, metric=distancem)
        # reducer = PCA(n_components=2)
        if learn_mode == LEARN.UNSUPERVISED:
            reducer.fit(X_norm)
        else:
            reducer.fit(X_norm, y=Y)
        last_step = STEP.UMAP_TRAINED
        pickle.dump(reducer, open(pathname + umapname, "wb"))
    else:
        print_output("\tLoad trained umap from " + pathname + umapname)
        reducer = pickle.load(open(pathname + umapname, "rb"))
    embedding = reducer.transform(X_norm)
    current_step = STEP.UMAP_TRAINED
    if current_step == return_step:
        return embedding

    lnames = ["Negative", "Positive"]
    Y = Y.reshape(-1)
    cmap = ["blue", "red", "purple", "hotpink", "black", "green", "orange", "teal", "brown",
            "lightsteelblue", "gray", "lime", "coral", "plum", "gold", "c", "tomato",
            "blueviolet", "darkseagreen"]
    if return_step is None:
        plot_embedding_space(embedding, labels=Y, label_index=[0, 1], lnames=lnames,
                             data_name="gt_" + dataset)

    """ STEP 3: summarise latent data using ATL and cluster data using the learned graph from ATL """
    # nepoch: number of times the data is passed to ATL; age_max: maximum age of a connection. The age of a
    # connection incident to the BMU increases unless it links the BMU to the second-best match. If max_age is
    # too small, topological relationships are destroyed prematurely; if max_age is too large, useless
    # connections caused by randomness or noise may survive, so ATL needs to run longer to get accurate
    # results and more relationships will be preserved.
    # lamb: the number of steps (number of processed inputs) before ATL checks and cleans up the network.
    # Lambda has a similar effect to max_age: a small lamb leads to an unstable network (unable to establish
    # topological relationships), while a large lamb may leave redundant nodes and connections.
    print_output("Learning topological relations and Determining contexts ....")
    if learn_mode == LEARN.UNSUPERVISED:
        soinnname = ".unsupervised_soinn"
    else:
        soinnname = ".supervised_soinn"
    if runall or not os.path.exists(pathname + soinnname):
        lamb = 200
        # if data.shape[0] < lamb:
        #     lamb = data.shape[0]
        nodes, connection, classes = atl.learning(input_data=embedding, max_nepoch=5,
                                                  spread_factor=1.0, lamb=lamb)
        # reset node labels; component ids are assigned below
        classes = 0 * classes
        cmap = cmap * 10
        if return_step is None:
            plot_atl(nodes, connection)
        # create a network representation of the learned ATL graph
        G = nx.Graph()
        for i in range(0, nodes.shape[0]):
            for j in range(0, nodes.shape[0]):
                if connection[i, j] != 0:
                    G.add_edge(i, j, weight=1.0)
        # use community detection algorithms to discover the subgraph communities
        network_cd_alg = "best"
        n_components = nx.number_connected_components(G)
        max_context = int(n_components + np.sqrt(n_components))
        threshold = 0.2
        if network_cd_alg == "gn":
            from networkx.algorithms import community
            communities_generator = community.girvan_newman(G)
            while True:
                level_communities = next(communities_generator)
                size_com = [len(c) for c in level_communities if len(c) > 1]
                if min(size_com) < threshold * sum([len(c) for c in level_communities]):
                    break
            cms = sorted(map(sorted, level_communities))
        elif network_cd_alg == "best" or network_cd_alg == "dendo":
            import community
            if network_cd_alg == "best":
                cms = community.best_partition(G, resolution=0.5)
            else:
                dendrogram = community.generate_dendrogram(G)
                sized = len(dendrogram)
                cms = community.partition_at_level(dendrogram, sized - 2)
            coms = set([cms[i] for i in cms])
            cdict = {}
            for k in coms:
                cdict[k] = []
            for i in cms:
                cdict[cms[i]].append(i)
            cms = []
            for k in coms:
                cms.append(list(cdict[k]))
        else:
            # cms = list(nx.connected_components(G))
            from networkx.algorithms import community
            communities_generator = community.girvan_newman(G)
            level_communities = next(communities_generator)
            cms = sorted(map(sorted, level_communities))
        components = [c for c in cms if len(c) > 1]
        # print(components)
        count = 1
        for comp in components:
            for n in comp:
                classes[n] = count
            count += 1
        # Each component (subgraph) can be treated as a cluster, since ATL only links nodes with similar
        # patterns together. The absence of a connection between two nodes indicates that the two nodes, and
        # the data points matched to them, should not belong to the same cluster.
        nclusters = len(components)
        nbrs = NearestNeighbors(n_neighbors=1).fit(nodes)
        distances, indices = nbrs.kneighbors(embedding)
        node_indices = list(indices.reshape(-1))
        indices = np.array([classes[node_indices[i]] for i in range(len(node_indices))])
        last_step = STEP.ATL_TRAINED
        pickle.dump((nodes, connection, classes, nclusters, node_indices, indices),
                    open(pathname + soinnname, "wb"))
    else:
        print_output("\tLoad trained ATL model from " + pathname + soinnname)
        nodes, connection, classes, nclusters, node_indices, indices = pickle.load(
            open(pathname + soinnname, "rb"))
    current_step = STEP.ATL_TRAINED
    if current_step == return_step:
        return embedding, nodes, connection, classes, nclusters, node_indices, indices

    """ STEP 4: Try to explain context/cluster using Context Description Approximation (CDA).
    Can work with original features or interactive terms (representing or/and relations). """
    cid = [c for c in range(nclusters + 1)]
    cid = cid + [100]
    if learn_mode == LEARN.UNSUPERVISED:
        explainnname = ".unsupervised_explain"
    else:
        explainnname = ".supervised_explain"
    # NOTE: the leading 'True or' bypasses the cache, so the explanation step is always re-run
    if True or runall or not os.path.exists(pathname + explainnname):
        finteraction = False  # True if interactive terms are considered
        interactionAND = False  # True if AND relation is used
        n_identity_feature = 5  # number of features/variables used to describe the cluster/context
        active_threshold = 0.01  # threshold to decide if a feature value can represent a given cluster/context
        cmap = 10 * ["red", "blue", "purple", "hotpink", "black", "green", "orange", "teal", "brown",
                     "lightsteelblue", "gray", "lime", "coral", "plum", "gold", "c", "tomato",
                     "blueviolet", "darkseagreen"]
        if return_step is None:
            plot_embedding_space(embedding, labels=indices, label_index=cid,
                                 lnames=["context__# " + str(c) for c in cid],
                                 data_name="pointcontext_" + dataset)
            cluster_sizes = plot_embedding_space(embedding, labels=indices, label_index=cid,
                                                 lnames=["context__# " + str(c) for c in cid],
                                                 data_name="highlightcontext_" + dataset)
            cluster_id_ranked_by_size = (-np.array(cluster_sizes)).argsort()
        poly = PolynomialFeatures(interaction_only=True, include_bias=False)
        cluster_explainer_dict = {}
        if nclusters > 1:
            print_output("Explaining contexts ...")
            xcluster_id = np.zeros(embedding.shape[0])
            xcluster_id_details = np.zeros((embedding.shape[0], nclusters))
            outputs = np.zeros((nclusters, len(feature_names)))
            cluster_characteristic_dict = {}
            feature_names_I = None
            XX = X_norm
            feature_names = [ff.replace("ubar", "_").replace("dot", ".") for ff in feature_names]
            if finteraction:
                if not interactionAND:
                    XX = poly.fit_transform(1 - X_norm)
                    XX = 1 - XX
                else:
                    XX = poly.fit_transform(X_norm)
                feature_names_I = str(poly.get_feature_names())
                for fi in range(nfeatures):
                    feature_names_I = feature_names_I.replace("'x" + str(fi) + "'", feature_names[fi])
                    feature_names_I = feature_names_I.replace("'x" + str(fi) + " ", feature_names[fi] + " ")
                    feature_names_I = feature_names_I.replace(" x" + str(fi) + "'", " " + feature_names[fi])
                if not interactionAND:
                    feature_names_I = feature_names_I.replace("[", "").replace("]", "").replace("'", "")\
                        .replace(", ", ",").replace(" ", " or ")
                else:
                    feature_names_I = feature_names_I.replace("[", "").replace("]", "").replace("'", "")\
                        .replace(", ", ",").replace(" ", " and ")
                feature_names_I = feature_names_I.split(",")
                feature_names = feature_names_I
                outputs = np.zeros((nclusters, len(feature_names)))
            # for each cluster/context, repetitive patterns will be determined
            for i in range(nclusters):
                cluster_id = i  # cluster_id_ranked_by_size[i]
                print_output("Context #" + str(cluster_id + 1))
                Xc = XX[indices == cluster_id + 1]
                for fi in range(len(feature_names)):
                    outputs[cluster_id][fi] = min(np.sum(Xc[::, fi]) / len(Xc[::, fi]),
                                                  1 - np.sum(Xc[::, fi]) / len(Xc[::, fi]))
                true_features = []
                false_features = []
                numeric_features = []
                impure_features = []
                ranked_features = np.argsort(outputs[cluster_id])
                for fi in ranked_features:
                    if outputs[cluster_id][fi] <= active_threshold:
                        (values, counts) = np.unique(Xc[::, fi], return_counts=True)
                        ind = np.argmax(counts)
                        val = values[ind]
                        if val == 1.0:
                            true_features.append(fi)
                        elif val == 0.0:
                            false_features.append(fi)
                        else:
                            numeric_features.append(fi)
                    else:
                        impure_features.append((fi, np.min(Xc[::, fi]), np.max(Xc[::, fi]),
                                                np.average(Xc[::, fi])))
                nzeros = len(feature_names) - np.count_nonzero(outputs[cluster_id])
                mask = np.ones((embedding.shape[0], ), dtype=bool)
                countf = 0
                print_output("\tTrue Features")
                count = 0
                filter_true = []
                for fi in true_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                    fmask = XX[::, fi] == 1.0
                    mask = mask & fmask
                    filter_true.append(fi)
                if count > 0:
                    print_output("\t\t" + str(sorted([feature_names[ii] for ii in true_features[:count]])))
                print_output("\tFalse Features")
                count = 0
                filter_false = []
                for fi in false_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                    fmask = XX[::, fi] == 0.0
                    mask = mask & fmask
                    filter_false.append(fi)
                true_features, false_features = filter_true, filter_false
                cluster_explainer_dict[cluster_id] = (finteraction, true_features, false_features,
                                                      numeric_features, impure_features)
                if count > 0:
                    print_output("\t\t" + str(sorted([feature_names[ii] for ii in false_features[:count]])))
                print_output("\tNumeric Features")
                count = 0
                for fi in numeric_features:
                    if countf >= n_identity_feature:
                        break
                    countf += 1
                    count += 1
                if count > 0:
                    print_output("\t\t" + str(sorted([feature_names[ii] for ii in numeric_features[:count]])))
                xcluster_id_details[mask, cluster_id] = 1
                print_output("\t" + 20 * '-')
                print_output("\t" + 20 * '=')
                print_output("")
        last_step = STEP.CONTEXT_EXPLAINED
        pickle.dump(last_step, open(pathname + ".notes", "wb"))
        pickle.dump((cluster_explainer_dict, xcluster_id_details, feature_names),
                    open(pathname + explainnname, "wb"))
    else:
        print_output("\tLoad explainer from " + pathname + explainnname)
        cluster_explainer_dict, xcluster_id_details, feature_names = pickle.load(
            open(pathname + explainnname, "rb"))
    current_step = STEP.CONTEXT_EXPLAINED
    if current_step == return_step:
        return (embedding, nodes, connection, classes, nclusters, node_indices, indices,
                cluster_explainer_dict, xcluster_id_details, feature_names)

    run_time = time.time() - t0
    print_output('Run in %.3f s' % run_time)
    print_output("Complete!!!")
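# A minimal, standalone sketch of the 1-NN label-propagation step in STEP 3 above: each embedded point is
# matched to its closest ATL node and inherits that node's cluster/context id. The function name
# assign_contexts and the toy arrays below are illustrative assumptions, not part of the original pipeline.
import numpy as np
from sklearn.neighbors import NearestNeighbors


def assign_contexts(embedding, nodes, classes):
    # one nearest ATL prototype per embedded point
    nbrs = NearestNeighbors(n_neighbors=1).fit(nodes)
    _, nearest_node = nbrs.kneighbors(embedding)
    nearest_node = nearest_node.reshape(-1)
    # inherit the cluster/context id of the matched node
    return np.array([classes[i] for i in nearest_node])


# toy usage: three prototype nodes in two clusters; each query point takes the label of its nearest node
# print(assign_contexts(np.array([[0.1, 0.2], [4.8, 5.1]]),
#                       np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0]]),
#                       np.array([1, 1, 2])))  # -> [1 2]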
def sort_min_diff(amat):
    """Reorder the rows of amat so that rows with small Manhattan distances end up near each other."""
    # all-neighbors search: rank every row against every other row
    nbrs = NearestNeighbors(n_neighbors=len(amat), metric='manhattan').fit(amat)
    distances, neighbor_idx = nbrs.kneighbors(amat)
    # anchor on the row with the smallest total distance to all other rows
    smallest = np.argmin(distances.sum(axis=1))
    # return the rows ordered by increasing distance from that anchor row
    return amat[neighbor_idx[smallest]]
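# Usage sketch for sort_min_diff above; the toy matrix is an assumption for illustration, and the snippet
# relies on numpy and NearestNeighbors being imported at module level as in the other snippets in this file.
# Rows 0 and 2 are near-duplicates under Manhattan distance; row 2 has the smallest total distance to the
# other rows, so it becomes the anchor and the similar rows end up adjacent.
import numpy as np

amat = np.array([[0, 0, 1],
                 [5, 5, 5],
                 [0, 1, 1]])
print(sort_min_diff(amat))  # -> [[0 1 1], [0 0 1], [5 5 5]]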