from sklearn.neighbors import NearestNeighbors


def _compute_centrality(loc_proto_lat, loc_proto_lon, locs_proto, df_c, file_name_out):
    # Centrality: how many prototype locations fall within each search radius
    # (1K, 5K and 15K, matching the output column names).
    distances = [1000, 5000, 15000]
    centrality = []
    for d in distances:
        c = []
        neigh = NearestNeighbors(radius=d, metric=spherical_distance)
        neigh.fit(locs_proto)
        for i in range(len(loc_proto_lat)):
            loc_x = loc_proto_lat[i]
            loc_y = loc_proto_lon[i]
            rng = neigh.radius_neighbors([[loc_x, loc_y]])
            nei_index = list(rng[1][0])
            c.append(len(nei_index))
        centrality.append(c)
    df_c = df_c.assign(centrality1K=centrality[0], centrality5K=centrality[1],
                       centrality15K=centrality[2])
    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)
    return df_c
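# `spherical_distance` is not defined in these helpers. A plausible
# haversine-style implementation (an assumption, not the original code) that
# returns metres for (lat, lon) pairs in decimal degrees could look like this;
# any callable with this two-point signature can be passed as the
# NearestNeighbors metric.
import math

def spherical_distance(a, b, earth_radius_m=6371000.0):
    lat1, lon1 = math.radians(a[0]), math.radians(a[1])
    lat2, lon2 = math.radians(b[0]), math.radians(b[1])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    h = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    return 2 * earth_radius_m * math.asin(math.sqrt(h))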
def clusterpolate(points, values, targets, radius=1, kernel_factory=bump,
                  neighbors=None, num_jobs=None):
    """
    Clusterpolate data.

    ``points`` (array-like) are the data points and ``values`` (array-like)
    are the associated values. ``targets`` (array-like) are the points at
    which the data should be clusterpolated.

    ``radius`` (float) is the radius of each data point's kernel.

    ``kernel_factory`` is a function that takes a radius and returns a
    corresponding kernel function. The kernel function must accept an array
    of distances (>= 0) and return the corresponding kernel values. The
    kernel function must be normalized (a distance of 0 must yield a value
    of 1) and it should be zero for distances greater than ``radius``.

    Neighbor lookup is done using an instance of
    :py:class:`sklearn.neighbors.NearestNeighbors`, constructed with the
    default options. You can pass an instance that is configured to suit
    your data via the ``neighbors`` parameter.

    By default, computations are parallelized according to the number of
    available CPUs. Set ``num_jobs`` to a specific number to use more or
    fewer parallel processes.

    Returns two arrays. The first contains the predicted value for the
    corresponding target point, and the second contains the target point's
    degree of membership (a float between 0 and 1).
    """
    # Accept lists as inputs
    points = np.array(points)
    values = np.array(values)
    if points.shape[0] != values.shape[0]:
        raise ValueError('The numbers of points and values must match.')
    targets = np.array(targets)
    if neighbors is None:
        neighbors = sklearn.neighbors.NearestNeighbors(radius=radius)
    neighbors.fit(points)
    kernel = kernel_factory(radius)
    num_jobs = min(num_jobs or multiprocessing.cpu_count(), targets.shape[0])
    tasks = np.array_split(targets, num_jobs)
    values = _map(_worker, tasks, (neighbors, values, kernel))
    predictions = np.concatenate([v[0] for v in values])
    membership = np.concatenate([v[1] for v in values])
    return predictions, membership
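# A minimal usage sketch for clusterpolate(), based only on the docstring
# above. The random example data are illustrative assumptions, and the
# function's module-level helpers (bump, _map, _worker) are assumed to be
# in scope.
import numpy as np

example_rng = np.random.default_rng(0)
points = example_rng.uniform(0, 10, size=(200, 2))     # known data points
values = np.sin(points[:, 0]) + np.cos(points[:, 1])   # values at those points
targets = example_rng.uniform(0, 10, size=(400, 2))    # where to clusterpolate

predictions, membership = clusterpolate(points, values, targets, radius=2.0)
# predictions[i] is the estimated value at targets[i]; membership[i] in [0, 1]
# indicates how strongly targets[i] is covered by nearby data points.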
import gensim
import numpy as np
import sklearn.cluster
import sklearn.neighbors


def word2vec_model(unique_text: set, min_count: int = 1, window: int = 5,
                   n_neighbors: int = 2, min_samples: int = 5,
                   verbose: bool = False):
    """
    Returns a word2vec model, a DBSCAN instance, the cluster labels, the
    per-cluster word arrays, the number of clusters, and the amount of noise.
    """
    # Note: the `size` argument was renamed to `vector_size` in gensim 4.x.
    model = gensim.models.Word2Vec([unique_text], min_count=min_count,
                                   size=len(unique_text), window=window)
    vec = model.wv.vectors

    # Estimate epsilon from the distribution of nearest-neighbour distances
    neighbors = sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors)
    nbrs = neighbors.fit(vec)
    distances, _ = nbrs.kneighbors(vec)
    distances = np.sort(distances[:, 1], axis=0)
    epsilon = np.average(distances[:len(distances) // 2])

    # Run cluster detection
    db = sklearn.cluster.DBSCAN(eps=epsilon, min_samples=min_samples).fit(vec)
    clusters = db.labels_
    n_clusters = len(set(clusters)) - (-1 in clusters)
    n_noise = list(clusters).count(-1)
    word_clusters = [
        np.array(list(unique_text))[clusters == i] for i in range(n_clusters)
    ]
    if verbose:
        print(f"Clusters: {n_clusters} | Noise: {n_noise}")
    return model, db, clusters, word_clusters, n_clusters, n_noise
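# A hedged usage sketch for word2vec_model(). The token set below is an
# illustrative assumption; with such a tiny vocabulary DBSCAN may well label
# everything as noise, so this only demonstrates the call signature and the
# returned values.
tokens = {"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta"}
model, db, clusters, word_clusters, n_clusters, n_noise = word2vec_model(
    tokens, min_count=1, window=5, n_neighbors=2, min_samples=2, verbose=True)
print(n_clusters, n_noise)
print(word_clusters)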
def _compute_exclusivity(loc_proto_lat, loc_proto_lon, locs_proto, df_i, file_name_out):
    neigh = NearestNeighbors(radius=200, metric=spherical_distance)
    neigh.fit(locs_proto)
    exclusivity = []
    for i in range(len(loc_proto_lat)):
        loc_x = loc_proto_lat[i]
        loc_y = loc_proto_lon[i]
        sup_loc = df_i.iloc[i]["support"]
        rng = neigh.radius_neighbors([[loc_x, loc_y]])
        nei_index = list(rng[1][0])
        support_list = df_i.iloc[nei_index]["support"]
        tot_sup = sum(support_list)
        exclusivity.append(sup_loc / tot_sup)
    df_c = pd.DataFrame(exclusivity, columns=['exclusivity'])
    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)
    return df_c
def _compute_rev_centrality(locs_proto, df_c, file_name_out):
    neigh = NearestNeighbors(n_neighbors=25, metric=spherical_distance)
    neigh.fit(locs_proto)
    n_neigh = [1, 3, 5, 8, 10, 20]
    rev_centrality = []
    # kneighbors is queried on the fitted points themselves, so column 0 of
    # `distances` is each point's distance to itself (0); column n is the
    # distance to its n-th nearest other location.
    distances, _ = neigh.kneighbors(locs_proto)
    for n in n_neigh:
        r = []
        for d in distances:
            r.append(d[n])
        rev_centrality.append(r)
    df_c = df_c.assign(rev_centrality1=rev_centrality[0], rev_centrality3=rev_centrality[1],
                       rev_centrality5=rev_centrality[2], rev_centrality8=rev_centrality[3],
                       rev_centrality10=rev_centrality[4], rev_centrality20=rev_centrality[5])
    df_c.to_csv(file_name_out + "_coll.csv", mode="w", index=False)
    return df_c
from sklearn import tree, svm, neighbors
from sklearn.naive_bayes import GaussianNB

tree = tree.DecisionTreeClassifier()
bayes = GaussianNB()
neighbors = neighbors.KNeighborsClassifier()
supportvc = svm.SVC()

# Train your data with Decision Tree
# http://scikit-learn.org/stable/modules/tree.html
tree = tree.fit(train_X, train_Y)

# Train your data with Bayes
# http://scikit-learn.org/stable/modules/naive_bayes.html
bayes = bayes.fit(train_X, train_Y)

# Train your data with k neighbors
# http://scikit-learn.org/stable/modules/neighbors.html
neighbors = neighbors.fit(train_X, train_Y)

# Train your data with Support Vector classifier
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
supportvc = supportvc.fit(train_X, train_Y)

test_X = [[150, 40, 30], [176, 69, 43], [188, 92, 48], [184, 84, 44], [183, 83, 44],
          [166, 47, 36], [170, 60, 38], [172, 64, 39], [182, 80, 42], [180, 80, 43]]
test_Y = ['female', 'male', 'male', 'male', 'male', 'female', 'female',
          'female', 'male', 'male']

tree_prediction = tree.predict(test_X)
bayes_prediction = bayes.predict(test_X)
neighbors_prediction = neighbors.predict(test_X)
supportvc_prediction = supportvc.predict(test_X)
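# A possible follow-up (not in the original snippet): compare the four
# classifiers on the held-out test set with sklearn's accuracy_score.
from sklearn.metrics import accuracy_score

for name, prediction in [("DecisionTree", tree_prediction),
                         ("GaussianNB", bayes_prediction),
                         ("KNeighbors", neighbors_prediction),
                         ("SVC", supportvc_prediction)]:
    print("{}: {:.2f}".format(name, accuracy_score(test_Y, prediction)))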
import numpy as num
from sklearn import neighbors
from sklearn.model_selection import train_test_split

print(classLabelArray.shape)
print(dataValues.shape)
transposedataValues = dataValues.T
training_accuracy = []
testing_accuracy = []

# Splitting the dataset as 75% Training and 25% Testing
trainX, testX, trainY, testY = train_test_split(transposedataValues, classLabelArray,
                                                test_size=0.25)

# Candidate values of K for the accuracy sweep further below
neighbors_setting = range(1, 12)

# Array creation for K = 3
predictedArray = []

# Fitting the data
neighbors = neighbors.KNeighborsClassifier(n_neighbors=3, weights='uniform')
neighbors.fit(trainX, trainY)

# Predicting the data, one test sample at a time
print("KNN for K = 3")
for i in range(testX.shape[0]):
    testdatafetching = num.asarray(testX[i, :])
    testdatafetching = num.reshape(testdatafetching, (1, -1))
    pred = neighbors.predict(testdatafetching)
    predictedArray.insert(i, pred[0])
    print(pred)

print("Test Accuracy: {:.2f}".format(neighbors.score(testX, testY)))
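# A sketch (an assumption, reconstructing what training_accuracy,
# testing_accuracy and neighbors_setting above appear to be intended for):
# sweep K over neighbors_setting and record train / test accuracy.
from sklearn.neighbors import KNeighborsClassifier

for k in neighbors_setting:
    clf = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    clf.fit(trainX, trainY)
    training_accuracy.append(clf.score(trainX, trainY))
    testing_accuracy.append(clf.score(testX, testY))

print("Best K on the test split:",
      list(neighbors_setting)[int(num.argmax(testing_accuracy))])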
def geographic_charac(df_poi, df_c, file_name_out):
    loc_proto_lat = df_c["loc_proto_lat"]
    loc_proto_lon = df_c["loc_proto_lon"]
    locs_proto = [list(x) for x in zip(loc_proto_lat, loc_proto_lon)]
    poi_lat = df_poi["lat"]
    poi_lon = df_poi["lon"]
    poi_coords = [list(x) for x in zip(poi_lat, poi_lon)]

    # get category list
    categories = [
        "gas", "parking", "pier", "hotel", "food", "leisure", "shop",
        "service", "supermarket"
    ]

    print("centrality poi")
    # count the number of POIs of each category within a radius of 500
    centrality_poi = []
    neigh = NearestNeighbors(radius=500, metric=spherical_distance)
    neigh.fit(poi_coords)
    for i in range(len(loc_proto_lat)):
        loc_x = loc_proto_lat[i]
        loc_y = loc_proto_lon[i]
        rng = neigh.radius_neighbors([[loc_x, loc_y]])
        nei_index = list(rng[1][0])
        count_c = dict.fromkeys(categories, 0)
        for j in nei_index:
            p_c = df_poi.iloc[j]["category"]
            count_c[p_c] += 1
        centrality_poi.append(list(count_c.values()))
    df_g1 = pd.DataFrame(centrality_poi, columns=["n_" + c for c in categories])
    df_g1.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    print("count_nearest_neighbour")
    # count how many POIs of each category appear among the 30 nearest neighbours
    knei_poi = []
    neigh = NearestNeighbors(n_neighbors=30, metric=spherical_distance)
    neigh.fit(poi_coords)
    distances, indices = neigh.kneighbors(locs_proto)
    # for each location, `indices` holds the indices of its 30 nearest POIs
    for nei_index in indices:
        count_c = dict.fromkeys(categories, 0)
        for j in nei_index:
            p_c = df_poi.iloc[j]["category"]
            count_c[p_c] += 1
        knei_poi.append(list(count_c.values()))
    df_g2 = pd.DataFrame(knei_poi, columns=["k_" + c for c in categories])
    df_g = pd.concat([df_g1, df_g2], axis=1)
    df_g.to_csv(file_name_out + "_geo.csv", mode="w", index=False)

    print("dist_nearest_neighbour")
    # take the minimum distance per category among the 30 nearest neighbours
    dist_poi = []
    for i, d in zip(indices, distances):
        dist_c = dict.fromkeys(categories, 10000)
        for j in range(len(d)):
            p_c = df_poi.iloc[i[j]]["category"]
            if d[j] < dist_c[p_c]:
                dist_c[p_c] = d[j]
        dist_poi.append(list(dist_c.values()))
    df_g3 = pd.DataFrame(dist_poi, columns=["d_" + c for c in categories])
    df_g = pd.concat([df_g, df_g3], axis=1)
    df_g.to_csv(file_name_out + "_geo.csv", mode="w", index=False)
    return df_g
from sklearn import tree, neighbors, ensemble

# [height, weight, shoe size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 39], [171, 75, 42],
     [181, 85, 43]]

# gender
Y = ['male', 'female', 'female', 'female', 'male', 'male', 'male', 'female',
     'male', 'female', 'male']

# create variables with classifiers
tree = tree.DecisionTreeClassifier()
neighbors = neighbors.KNeighborsClassifier()
randomForest = ensemble.RandomForestClassifier()

# fit them
tree = tree.fit(X, Y)
neighbors = neighbors.fit(X, Y)
randomForest = randomForest.fit(X, Y)

# create prediction variables for results
predictionForTree = tree.predict([[167, 63, 41]])
predictionForNeighbors = neighbors.predict([[167, 63, 41]])
predictionForRandomForest = randomForest.predict([[167, 63, 41]])

# print all the results
print("DecisionTree: ", predictionForTree)
print("NearestNeighbors: ", predictionForNeighbors)
print("RandomForest: ", predictionForRandomForest)