def cameras():
    """Render ``cameras.html`` with the camera list and one sample picture per cluster.

    Reads every row of the ``cameras`` table and today's ``occurrences`` rows
    whose ``e1`` column is set, clusters those occurrences on their
    ``e1`` .. ``e128`` columns with OPTICS and, for every cluster that has at
    least four members, picks one random member and keeps its
    ``human_picture`` path with the first path component dropped and a
    leading ``/`` added.

    Returns:
        Whatever ``render_template('cameras.html', ...)`` returns; the template
        receives ``cameras`` (the camera rows) and ``today_occurrences`` (the
        list of selected picture paths, possibly empty).
    """
    # OPTICS raises ValueError when asked to fit fewer samples than
    # ``min_samples`` (5 is also the estimator's default), so clustering is
    # skipped on days with too few occurrences instead of failing the page.
    min_samples = 5

    conn, cursor = connect()
    try:
        cursor.execute("SELECT title, x, y, rstp, F, current_frame FROM cameras")
        camera_rows = cursor.fetchall()
        cursor.execute(
            "SELECT * FROM occurrences WHERE DATE(`timestamp`)=CURDATE() AND e1 IS NOT NULL"
        )
        today_occurrences = cursor.fetchall()
    finally:
        # Everything needed has been fetched; release the connection even if
        # one of the queries raised.
        conn.close()

    result_occurrences = []
    if len(today_occurrences) >= min_samples:
        arr = np.array(
            [[to["e%i" % i] for i in range(1, 129)] for to in today_occurrences]
        )
        model = OPTICS(min_samples=min_samples)
        model.fit(arr)
        indices = np.arange(len(today_occurrences))
        # Label -1 is noise, so only labels 0..max are real clusters.
        for i in range(np.max(model.labels_) + 1):
            person_indices = indices[model.labels_ == i]
            print(person_indices)
            if len(person_indices) < 4:
                continue
            index = np.random.choice(person_indices)
            picture = today_occurrences[index]['human_picture']
            result_occurrences.append('/' + '/'.join(picture.split('/')[1:]))

    print(len(result_occurrences))
    return render_template('cameras.html', cameras=camera_rows,
                           today_occurrences=result_occurrences)
def optics(self, x, threshold = 0.01, min_samples = 0.01):
    """Fit an OPTICS estimator on ``x`` and return it.

    Parameters
    ----------
    x : sized collection of samples
        Data handed to ``OPTICS.fit``; ``len(x)`` is used to scale
        ``min_samples``.
    threshold : float
        Forwarded to OPTICS as ``eps``.
        NOTE(review): scikit-learn documents ``eps`` as used only by the
        'dbscan' cluster method -- confirm it has the intended effect with
        the default cluster method.
    min_samples : float
        Fraction of ``len(x)``; the value passed to OPTICS is
        ``int(min_samples * len(x))`` but never less than 10.

    Returns
    -------
    The fitted ``OPTICS`` instance.
    """
    neighbourhood_size = max(10, int(min_samples * len(x)))
    clusterer = OPTICS(eps=threshold, min_samples=neighbourhood_size)
    clusterer.fit(x)
    return clusterer
def make_autoencoder(data, lr=0.001, enc_dim=100):
    """Build an auto-encoder plus a cluster-probability head sized by OPTICS.

    The input shape comes from the module-level ``products_shape``; ``data``
    is scaled with the module-level ``minmax`` object and clustered with a
    default OPTICS estimator. The number of distinct labels found (whatever
    ``np.unique`` reports on ``labels_``) becomes the width of the softmax
    head.

    Parameters
    ----------
    data : array-like passed to ``minmax.fit_transform``
    lr : learning rate of the Adam optimizer used for the auto-encoder
    enc_dim : width of the encoding layer; the probability branch uses
        ``enc_dim // 2`` hidden units

    Returns
    -------
    (autoencoder, encoder, probability_model, fitted_optics)
        Only the auto-encoder is compiled (loss 'mae', metric 'mse').
    """
    # Auto-encoder: input -> encoding -> reconstruction
    features_in = Input(shape=products_shape, name='FeaturesInput')
    encoding = Dense(enc_dim, activation='relu',
                     kernel_initializer=he_normal(1),
                     name='AE_feature_reduction')(features_in)
    reconstruction = Dense(products_shape[0], activation='relu',
                           name='AE_3')(encoding)

    # inspired by https://www.frontiersin.org/articles/10.3389/fgene.2018.00585/full
    # Clustering branch (driven by OPTICS): estimate the probability of one
    # product belonging to each of the clusters that were found.
    opt = OPTICS()
    opt.fit(minmax.fit_transform(data))
    clusters = len(np.unique(opt.labels_))
    print('Optimal number of cluster:', clusters)

    hidden = Dense(enc_dim // 2, activation='relu',
                   kernel_initializer=he_normal(1))(encoding)
    hidden = BatchNormalization()(hidden)
    membership = Dense(clusters, activation='softmax',
                       name='Probability_Product')(hidden)

    autoencoder_ = Model(inputs=features_in, outputs=reconstruction)
    encoder_ = Model(inputs=features_in, outputs=encoding)
    p_prob = Model(inputs=features_in, outputs=membership)

    autoencoder_.compile(optimizer=Adam(learning_rate=lr), loss='mae',
                         metrics=['mse'])
    return autoencoder_, encoder_, p_prob, opt
def optics(params):
    """Cluster a precomputed distance matrix with OPTICS and return the labels.

    ``params["distance_path"]`` must be a string naming a text file that
    ``np.loadtxt`` can read as float32; the loaded matrix is fitted with
    ``metric='precomputed'``. Progress information is printed along the way.

    Returns:
        A tuple with one cluster label per row of the matrix (-1 is excluded
        from the printed cluster count).
    """
    # String concatenation (rather than str()) keeps the original contract:
    # a non-string path raises TypeError.
    distance_path = '' + params["distance_path"]
    print(distance_path)

    distance = np.loadtxt(distance_path, dtype=np.float32)
    print(distance.shape)

    # metric is 'precomputed' because the input already holds distances
    estimator = OPTICS(eps=0.03, min_samples =10, metric='precomputed')
    print(estimator)
    estimator.fit(distance)

    labels = estimator.labels_
    print(labels,labels.shape)

    # number of clusters, not counting the -1 label
    distinct = set(labels)
    no_clusters = len(distinct) - 1 if -1 in labels else len(distinct)
    print(no_clusters,"no_clusters")

    return tuple(labels.tolist())
def setUp(self):
    """Create six seeded Gaussian blobs, fit OPTICS on them and store a ``TBH``.

    Every blob is a (250, 3) array whose first column stays zero and whose
    last two columns hold the random 2-D points. OPTICS is fitted on the
    stacked 2-D points only; the fitted estimator and the tuple of 3-column
    blobs are attached to ``self.tbhg`` as ``optics`` and ``locH``.
    """
    n_points_per_cluster = 250
    np.random.seed(0)

    # (centre, spread) per blob; the order fixes the random draw sequence
    blob_specs = (
        ([-5, -2], .8),
        ([4, -1], .1),
        ([0, -2], .2),
        ([-2, 3], .3),
        ([3, -2], 1.6),
        ([5, 6], 2),
    )
    blobs = []
    for centre, spread in blob_specs:
        blob = np.zeros((n_points_per_cluster, 3))
        blob[:, 1:3] = centre + spread * np.random.randn(n_points_per_cluster, 2)
        blobs.append(blob)

    X = np.vstack([blob[:, 1:3] for blob in blobs])

    clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    clust.fit(X)

    self.tbhg = modeling.TBH()
    self.tbhg.optics = clust
    self.tbhg.locH = tuple(blobs)
def optics_fit_predict(X, min_samples=50, cluster_method='dbscan', eps=2):
    """Fit OPTICS on ``X`` and return DBSCAN-style labels extracted at ``eps``.

    The estimator is fitted with ``min_samples`` and ``cluster_method``; the
    returned labels, however, always come from ``cluster_optics_dbscan``
    applied to the fitted reachability, core distances and ordering,
    whatever ``cluster_method`` was given.

    Parameters
    ----------
    X : array, shape (n_samples, n_features), or (n_samples, n_samples)
    min_samples : number of samples in a neighborhood for a point to be
        considered a core point.
    cluster_method : forwarded to OPTICS as ``str(cluster_method)``;
        'dbscan' by default, 'xi' is the other documented value.
    eps : maximum distance between two samples for one to be considered in
        the neighborhood of the other; used for the label extraction.

    Returns
    -------
    labels : array of cluster labels produced by the extraction
    """
    estimator = OPTICS(
        min_samples=min_samples,
        cluster_method=str(cluster_method),
    )
    estimator.fit(X)
    return cluster_optics_dbscan(
        reachability=estimator.reachability_,
        core_distances=estimator.core_distances_,
        ordering=estimator.ordering_,
        eps=eps,
    )
def optics_clustering(df, min_sample, min_cluster_size):
    """Cluster ``df`` with OPTICS and group the ``pid`` values by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Fitted as-is (every column present at call time is a feature) and
        must contain a ``pid`` column. A string column ``cluster_id`` is
        added in place.
        NOTE(review): ``pid`` is therefore part of the clustered features --
        confirm that is intended.
    min_sample : forwarded to OPTICS as ``min_samples``
    min_cluster_size : forwarded to OPTICS as ``min_cluster_size``

    Returns
    -------
    (df, cluster)
        ``df`` with the new ``cluster_id`` column, and a list holding one
        array of ``pid`` values per distinct cluster id, in order of first
        appearance ('-1' is the OPTICS noise label).
    """
    clust = OPTICS(min_samples=min_sample, xi=.05,
                   min_cluster_size=min_cluster_size)
    clust.fit(df)

    # ``labels_`` is already aligned with the input rows. Indexing it with
    # ``ordering_`` (as a reachability plot does) would hand each row the
    # label of a different sample.
    df['cluster_id'] = clust.labels_.astype(str)

    cluster = [
        np.array(df.loc[df['cluster_id'] == cluster_no, 'pid'])
        for cluster_no in df['cluster_id'].unique()
    ]
    return df, cluster
def perform_optics_clustering(data, program_options: Options) -> ClusteredData:
    """Cluster ``data`` with OPTICS and wrap the outcome in a ``ClusteredData``.

    One ``FULL_CLUSTER`` is added per label ``0..max``; its centre is the
    mean of its member rows because OPTICS does not report cluster centres.
    When the smallest label is -1, every row labelled -1 is added
    separately as an ``UNCLASSIFIED_NODE_CLUSTER`` centred on itself.

    :param data: array indexable by a boolean mask; passed to ``OPTICS.fit``
    :param program_options: supplies ``OPTICS_MIN_SAMPLES`` and is forwarded
        to every object created here
    :return: the populated ``ClusteredData``
    """
    clustered_data = ClusteredData(data, [], program_options=program_options)

    estimator = OPTICS(min_samples=program_options.OPTICS_MIN_SAMPLES, n_jobs=-1)
    estimator.fit(data)
    labels = estimator.labels_

    for label in range(labels.max() + 1):
        members = data[labels == label]
        clustered_data.add_cluster(
            Cluster(cluster_centre=members.mean(axis=0),
                    nodes=members,
                    cluster_type=ClusterType.FULL_CLUSTER,
                    program_options=program_options)
        )

    if labels.min() != -1:
        # every node was assigned to a cluster
        return clustered_data

    for node in data[labels == -1]:
        clustered_data.add_unclassified_node(
            Cluster(node, [node],
                    cluster_type=ClusterType.UNCLASSIFIED_NODE_CLUSTER,
                    program_options=program_options)
        )
    return clustered_data
def get_clustered_data(data_matrix, clustering_algorithm=model_constants.KMEANS, distance_metric='euclidean', num_clusters=3):
    """Fit the requested clustering estimator and return its labels.

    ``clustering_algorithm`` is lower-cased and compared against the
    ``model_constants`` names; anything unrecognised falls back to KMeans
    (``random_state=42``). ``distance_metric`` is forwarded only to
    AffinityPropagation, DBSCAN, OPTICS and AgglomerativeClustering;
    ``num_clusters`` only to Birch, AgglomerativeClustering and KMeans.

    Note that ``model_constants.OPTICS`` is the algorithm name, while the
    bare ``OPTICS`` is the estimator class.

    :return: ``(labels, fitted_estimator)``
    """
    requested = clustering_algorithm.lower()

    if requested == model_constants.AFFINITY_PROP:
        estimator = AffinityPropagation(affinity=distance_metric)
    elif requested == model_constants.DBSCAN:
        estimator = DBSCAN(metric=distance_metric)
    elif requested == model_constants.OPTICS:
        estimator = OPTICS(metric=distance_metric)
    elif requested == model_constants.MEANSHIFT:
        estimator = MeanShift()
    elif requested == model_constants.BIRCH:
        estimator = Birch(n_clusters=num_clusters)
    elif requested == model_constants.AGGLOMERATIVE:
        estimator = AgglomerativeClustering(n_clusters=num_clusters,
                                            affinity=distance_metric)
    else:
        estimator = KMeans(n_clusters=num_clusters, random_state=42)

    estimator.fit(data_matrix)
    return estimator.labels_, estimator
def routes_cluster(df, orig, dest, params, color, size):
    '''Groups routes using the OPTICS clustering algorithm.

    Each route is the 4-vector ``(x, y)`` of its origin row followed by
    ``(x, y)`` of its destination row. The returned frame holds the rows
    ``ordering_`` followed by ``ordering_ + n`` (n = number of routes), with
    these columns added:

    * ``reachability_order`` -- position in the OPTICS ordering
    * ``reachability`` -- reachability distance at that position
    * ``route_cluster`` -- cluster label, NaN for noise (-1)
    * ``reachability_plot`` -- ``route_cluster // size`` with gaps filled
      backwards, then forwards, separately for ``orig`` and ``dest`` rows
    * ``reachability_color`` -- ``color`` mapped over
      ``route_cluster % size``, ``'(1,1,1)'`` where no colour applies

    Parameters:
        df: frame with ``x`` and ``y`` columns.
            NOTE(review): the reordering selects labels ``0 .. 2n-1`` --
            presumably the index is a range with origins first; confirm.
        orig, dest: index labels of the origin / destination rows.
        params: keyword arguments for the ``OPTICS`` constructor.
        color: mapping (or callable) accepted by ``Series.map``.
        size: divisor used to split cluster ids into plot groups / colours.

    Returns:
        The reordered, annotated copy of ``df``.
    '''
    model = OPTICS(**params)
    X = np.hstack((df.loc[orig, ['x', 'y']].values,
                   df.loc[dest, ['x', 'y']].values))
    model.fit(X=X)

    n = len(X)
    fancy = model.ordering_.tolist()
    fancy2 = fancy + [f + n for f in fancy]

    # Explicit copy: columns are added below and must not alias the input.
    df = df.loc[fancy2, :].copy()
    df['reachability_order'] = list(range(n)) * 2
    df['reachability'] = model.reachability_[fancy].tolist() * 2
    df['route_cluster'] = model.labels_[fancy].tolist() * 2
    df['route_cluster'] = df['route_cluster'].replace(-1, np.nan)

    df['reachability_plot'] = df['route_cluster'] // size
    # ``fillna(method=...)`` is deprecated in pandas; ``bfill()``/``ffill()``
    # are the equivalent calls and exist in old releases as well.
    df.loc[orig, 'reachability_plot'] = df.loc[orig, 'reachability_plot'].bfill()
    df.loc[dest, 'reachability_plot'] = df.loc[dest, 'reachability_plot'].bfill()
    df.loc[orig, 'reachability_plot'] = df.loc[orig, 'reachability_plot'].ffill()
    df.loc[dest, 'reachability_plot'] = df.loc[dest, 'reachability_plot'].ffill()

    df['reachability_color'] = (df['route_cluster'] % size).map(color)
    df['reachability_color'] = df['reachability_color'].fillna('(1,1,1)')
    return df
def sub_cluster(wanted_gps, wanted_time, file_names, min_pic_num, show_idx=False):
    """Cluster pictures on normalised position and time; group their file names.

    Parameters
    ----------
    wanted_gps : sequence of 3-item entries passed to ``drs.lonlat2xyz``
    wanted_time : passed to ``drs.convert_datetime_seconds``
        NOTE(review): the code reads ``y[0]`` of each normalised entry, so
        the converter presumably returns one row per picture -- confirm.
    file_names : sequence aligned with ``wanted_gps``
    min_pic_num : OPTICS ``min_samples`` and also the minimum size of a
        cluster that counts as "chosen"
    show_idx : when true, open a figure titled with the threshold and the
        number of distinct labels

    Returns
    -------
    (chosen, labels, noise, unchosen)
        ``chosen`` -- array of sets of file names, one per cluster with at
        least ``min_pic_num`` pictures; ``labels`` -- label per picture as
        returned by ``find_best_thres``; ``noise`` -- list of file names
        labelled -1; ``unchosen`` -- array of the clusters that are too small.

    Raises
    ------
    IndexError
        If a label has no bucket (labels are expected to be -1, 0, 1, ...
        without gaps) or ``file_names`` is shorter than the label list.
    """
    # normalization
    wanted_xyz = [drs.lonlat2xyz(x[0], x[1], x[2]) for x in wanted_gps]
    norm_xyz = np.array((wanted_xyz - np.mean(wanted_xyz, 0)) / (np.std(wanted_xyz, 0) + np.array([1e-15, 1e-15, 1e-15])))
    wanted_secs = drs.convert_datetime_seconds(wanted_time)
    norm_secs = np.array((wanted_secs - np.mean(wanted_secs)) / (np.std(wanted_secs) + np.array([1e-15, 1e-15, 1e-15])))
    norm_info = np.array([np.array([x[0], x[1], x[2], y[0]]) for x, y in zip(norm_xyz, norm_secs)])

    # OPTICS parameters are keyword-only in current scikit-learn releases.
    clust = OPTICS(min_samples=min_pic_num)
    clust.fit(norm_info)
    img_cl_idx, res_eps, min_noise = find_best_thres(clust, len(wanted_gps))

    # plotting
    if show_idx:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        title = "threshold: %f, number of clusters: %d" % (res_eps, len(np.unique(img_cl_idx)))
        ax.set_title(title)
        plt.show()

    # find file names in each cluster: bucket 0 is noise (-1), bucket k+1 is
    # cluster k
    buckets = [[] for _ in np.unique(np.append(img_cl_idx, [-1]))]
    for idx, cl_idx in enumerate(img_cl_idx):
        try:
            buckets[cl_idx + 1].append(file_names[idx])
        except IndexError as err:
            # Previously a bare ``except`` dropped into the debugger here.
            raise IndexError(
                "cannot file picture %d under cluster label %r"
                % (idx, cl_idx)
            ) from err

    res_noise = buckets[0]

    # Clusters usually differ in size; build the object array element by
    # element because NumPy no longer converts ragged nested lists.
    res_fn = np.empty(len(buckets) - 1, dtype=object)
    for pos, names in enumerate(buckets[1:]):
        res_fn[pos] = names

    big_enough = np.array([len(x) >= min_pic_num for x in res_fn], dtype=bool)
    res_unchosen = res_fn[~big_enough]
    res_fn = res_fn[big_enough]
    return np.array([set(x) for x in res_fn]), np.array(img_cl_idx), res_noise, np.array(res_unchosen)
def test_correct_number_of_clusters():
    """OPTICS recovers the generated cluster count and exposes well-formed attributes."""
    # in 'auto' mode
    expected_clusters = 3
    X = generate_clustered_data(n_clusters=expected_clusters)

    # Parameters chosen specifically for this task.
    clust = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
    clust.fit(X)

    # number of clusters, ignoring noise if present
    found = set(clust.labels_)
    found.discard(-1)
    assert len(found) == expected_clusters

    # check attribute types and sizes
    n_samples = len(X)
    expected_kinds = (
        ('labels_', 'i'),
        ('reachability_', 'f'),
        ('core_distances_', 'f'),
        ('ordering_', 'i'),
    )
    for attribute, kind in expected_kinds:
        values = getattr(clust, attribute)
        assert values.shape == (n_samples,)
        assert values.dtype.kind == kind

    # the ordering is a permutation of all sample indices
    assert set(clust.ordering_) == set(range(n_samples))
def cluster_optics(data_mat, metric='euclidean', min_samples=5, max_eps=np.inf, n_jobs=32, save_dir=''):
    """
    Fit OPTICS on a data matrix, report the elapsed time and optionally save the labels.

    :param data_mat: (array) data matrix
    :param metric: (str) distance metric to use in clustering
    :param min_samples: (int) minimum number of neighbours for core points
    :param max_eps: (float) maximum distance for OPTICS
    :param n_jobs: (int) number of jobs to spawn
    :param save_dir: (str) directory for the labels file; nothing is written
        when it is empty. The file is named
        ``optics_labels_<metric>_ms<min_samples>_me<max_eps>.npy``
    :return: (model, array) trained OPTICS model and labels array
    """
    started = time.time()
    opt = OPTICS(min_samples=min_samples, metric=metric, n_jobs=n_jobs,
                 max_eps=max_eps)
    opt.fit(data_mat)
    elapsed = time.time() - started
    print('Clustering took: {}'.format(elapsed))

    opt_labs = opt.labels_
    if not save_dir:
        return opt, opt_labs

    f_name = 'optics_labels_{}_ms{}_me{}'.format(
        metric, min_samples, max_eps
    )
    target = os.path.join(save_dir, f_name + '.npy')
    np.save(target, opt_labs)
    return opt, opt_labs
def find_pairs(self):
    """
    Use the OPTICS algorithm to find clusters of similar securities within
    PCA component space, then build every unique pair of securities that
    share a cluster.

    Reads ``self.returns_reduced`` (only to check that the PCA step ran),
    ``self.components_`` and ``self.securities``. Stores the pairs as a
    ``pd.Series`` of tuples in ``self.pairs`` and the raw OPTICS labels in
    ``self.cluster_labels_``; prints the number of pairs found.

    Raises
    ------
    ValueError
        If ``self.returns_reduced`` is ``None``.
    """
    if self.returns_reduced is None:
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the message.
        raise ValueError(
            "returns_reduced not found: must run "
            ".reduce_PCA() before this function"
        )

    # Initialize and fit OPTICS cluster to PCA components
    clustering = OPTICS()
    clustering.fit(self.components_.T)

    # Create cluster data frame and identify trading pairs
    clusters = pd.DataFrame({
        'security': self.securities,
        'cluster': clustering.labels_
    })
    # clusters with label == -1 are 'noise'
    clusters = clusters[clusters['cluster'] != -1]

    # One combinations iterator per cluster. Iterating the groups directly
    # avoids ``groupby.apply`` guessing a result type (notably when every
    # security was labelled as noise).
    combos = [
        combinations(members, 2)
        for _, members in clusters.groupby('cluster')['security']
    ]
    pairs = list(chain.from_iterable(combos))  # Flatten list of lists

    print(f"Found {len(pairs)} potential pairs")
    self.pairs = pd.Series(pairs)
    self.cluster_labels_ = clustering.labels_
def test_min_cluster_size_invalid2():
    """A ``min_cluster_size`` above the sample count is rejected for dense and sparse input."""
    oversized = len(X) + 1
    expected_message = "must be no greater than the "

    # dense input, default metric
    dense_clust = OPTICS(min_cluster_size=oversized)
    with pytest.raises(ValueError, match=expected_message):
        dense_clust.fit(X)

    # sparse input, explicit euclidean metric
    sparse_clust = OPTICS(min_cluster_size=oversized, metric="euclidean")
    with pytest.raises(ValueError, match=expected_message):
        sparse_clust.fit(sparse.csr_matrix(X))
def test_min_cluster_size_invalid(min_cluster_size):
    """An invalid ``min_cluster_size`` is rejected for dense and sparse input."""
    expected_message = "must be a positive integer or a "

    # dense input, default metric
    dense_clust = OPTICS(min_cluster_size=min_cluster_size)
    with pytest.raises(ValueError, match=expected_message):
        dense_clust.fit(X)

    # sparse input, explicit euclidean metric
    sparse_clust = OPTICS(min_cluster_size=min_cluster_size, metric="euclidean")
    with pytest.raises(ValueError, match=expected_message):
        sparse_clust.fit(sparse.csr_matrix(X))
def cluster_embedded_maps_optics(aligned_maps):
    """Cluster maps with default OPTICS on their flattened values.

    Each item of ``aligned_maps`` is flattened into one row of a float64
    matrix (the rows are stacked, so the maps must flatten to equal length).

    Returns the OPTICS label of every map, in input order.
    """
    # embeding = embed(aligned_maps)
    rows = [xmap.flatten() for xmap in aligned_maps]
    embedding = np.vstack(rows).astype(np.float64)

    clusterer = OPTICS()
    clusterer.fit(embedding)
    return clusterer.labels_
def test_bad_reachability():
    """A ``max_eps`` too small for the data triggers the all-inf reachability warning."""
    expected_warning = "All reachability values are inf. Set a larger max_eps."

    blob_centers = [[1, 1], [-1, -1], [1, -1]]
    X, _ = make_blobs(n_samples=750, centers=blob_centers, cluster_std=0.4,
                      random_state=0)

    with pytest.warns(UserWarning, match=expected_warning):
        OPTICS(max_eps=5.0 * 0.003, min_samples=10, eps=0.015).fit(X)
def run_optics(data):
    """Fit OPTICS on ``data``, print the label count and fit time, return the labels.

    The printed count is the number of distinct values in ``labels_``.
    """
    print(">>> Running OPTICS")
    clf = OPTICS(min_samples=25, xi=.05, min_cluster_size=.02)

    started = time.time()
    clf.fit(data)
    elapsed = time.time() - started

    n_labels = len(np.unique(clf.labels_))
    print(" OPTICS found {} clusters".format(n_labels))
    print(" OPTICS took {:.2f} s".format(elapsed))
    return clf.labels_
def _do_optics(self):
    """
    Cluster the stored PCA representation with OPTICS.

    No optimal parameter was mentioned in the paper, so the estimator's
    default parameters are used. The labels are also kept on the instance.

    :return: clusterings: The cluster label of each stock
    """
    labels = OPTICS().fit(self.__pca_repr).labels_
    self.__clusterings = labels
    return labels
def OPTICS_Clustering(X):
    """Preprocess ``X``, fit OPTICS and return the estimator's own labels.

    ``X`` is first passed through ``preprocess``. The returned labels are
    ``labels_`` of an estimator built with ``min_samples=100``, ``xi=.05``
    and ``min_cluster_size=.05``.

    A DBSCAN-style extraction at ``eps=2`` used to be computed here and was
    overwritten before being returned; it has been dropped as dead work.
    NOTE(review): if the eps-based labels were the intended result, return
    ``cluster_optics_dbscan(...)`` instead -- confirm with the callers.
    """
    X = preprocess(X)
    cluster = OPTICS(min_samples=100, xi=.05, min_cluster_size=.05)
    cluster.fit(X)
    return cluster.labels_
def form_clusters():
    """Cluster the saved feature vectors with OPTICS and plot the result.

    Loads ``xception_features_with_dim_<img_width>x<img_height>.npy`` (both
    dimensions are module-level names), fits OPTICS and shows a two-row
    figure: the reachability plot on top and a scatter of the first two
    feature columns below. Clusters 0-4 get their own colour and noise (-1)
    is drawn in black; further clusters are not drawn. Returns nothing.
    """
    # (A commented-out DBSCAN variant used to sit here.)
    features = np.load(
        "xception_features_with_dim_{}x{}".format(img_width, img_height) + '.npy')

    clust = OPTICS(min_samples=5, xi=.05)
    clust.fit(features)

    ordering = clust.ordering_
    space = np.arange(len(features))
    reachability = clust.reachability_[ordering]
    ordered_labels = clust.labels_[ordering]

    plt.figure(figsize=(10, 7))
    grid = gridspec.GridSpec(2, 1)
    ax1 = plt.subplot(grid[0, :])
    ax2 = plt.subplot(grid[1, 0])

    colors = ['g.', 'r.', 'b.', 'y.', 'c.']

    # Reachability plot
    for klass, color in enumerate(colors):
        in_cluster = ordered_labels == klass
        ax1.plot(space[in_cluster], reachability[in_cluster], color, alpha=0.3)
    ordered_noise = ordered_labels == -1
    ax1.plot(space[ordered_noise], reachability[ordered_noise], 'k.', alpha=0.3)
    # horizontal reference lines at reachability 2 and 0.5
    for level, line_style in ((2., 'k-'), (0.5, 'k-.')):
        ax1.plot(space, np.full_like(space, level, dtype=float), line_style,
                 alpha=0.5)
    ax1.set_ylabel('Reachability (epsilon distance)')
    ax1.set_title('Reachability Plot')

    # OPTICS
    for klass, color in enumerate(colors):
        members = features[clust.labels_ == klass]
        ax2.plot(members[:, 0], members[:, 1], color, alpha=0.3)
    noise = clust.labels_ == -1
    ax2.plot(features[noise, 0], features[noise, 1], 'k+', alpha=0.1)
    ax2.set_title('Automatic Clustering\nOPTICS')

    plt.tight_layout()
    plt.show()
def _cluster_optics(self):
    """Fit OPTICS on ``self.data`` and return the labelling picked by ``_extract_best_optics``.

    ``self.min_clu_size`` is used for both ``min_cluster_size`` and
    ``min_samples``; ``leaf_size`` is set to the number of samples. When
    ``self.estimated_k`` is truthy, ``1 + max(pred)`` is appended to
    ``self.cand_k``.
    """
    settings = dict(
        min_cluster_size=self.min_clu_size,
        min_samples=self.min_clu_size,
        metric=self.distance_metric,
        leaf_size=len(self.data),
    )
    model = OPTICS(**settings)
    model.fit(X=self.data)

    pred = self._extract_best_optics(model)
    if not self.estimated_k:
        return pred

    # Append its k to the list of values
    self.cand_k.append(1 + max(pred))
    return pred
def clustering(the_image_autoencoded, the_image_shape, number_of_clusters, extra_parameters=""):
    """Cluster encoded pixels with OPTICS and return a per-pixel label image.

    The estimator is fitted on ``the_image_autoencoded``; labels are then
    extracted DBSCAN-style at ``eps=0.5`` and laid out row by row in an
    array of shape ``(the_image_shape[0], the_image_shape[1])``. Progress
    messages are printed along the way.

    Parameters
    ----------
    the_image_autoencoded : samples handed to ``OPTICS.fit``, one per pixel
        in row-major order
    the_image_shape : sequence whose first two items are height and width
    number_of_clusters : unused; kept for interface compatibility
    extra_parameters : unused; kept for interface compatibility

    Returns
    -------
    2-D float array of cluster labels (-1 marks noise).

    Raises
    ------
    IndexError
        If fewer labels than ``height * width`` are available.
    """
    print()
    print("*** OPTICS clustering ***")
    print("---------------------------------")
    # https://scikit-learn.org/stable/modules/clustering.html
    # https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html
    # #sphx-glr-auto-examples-cluster-plot-optics-py
    # https://scikit-learn.org/stable/modules/clustering.html#optics
    print("Image shape: ", the_image_shape)
    print("OPTICS clustering")
    clust = OPTICS(min_samples=10, xi=.0005, min_cluster_size=.005)
    print("Running fit function for OPTICS clustering")
    clust.fit(the_image_autoencoded)

    # Only the eps=0.5 extraction feeds the result; the former eps=2 and
    # eps=3 extractions were never used and are no longer computed.
    labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                       core_distances=clust.core_distances_,
                                       ordering=clust.ordering_, eps=0.5)

    print("---------------------------")
    reachability = clust.reachability_[clust.ordering_]
    print("Reachability: ", reachability)
    print("---------------------------")

    print("Creating list for clustered data")
    height, width = the_image_shape[0], the_image_shape[1]
    clustered_data = np.zeros((height, width))
    print("Clustered data shape: ", np.shape(clustered_data))

    n_pixels = height * width
    if len(labels_050) < n_pixels:
        # same exception type the element-wise fill used to raise
        raise IndexError(
            "expected at least %d labels, got %d" % (n_pixels, len(labels_050))
        )
    # Row-major fill: pixel (y, x) receives label y * width + x.
    clustered_data[:] = labels_050[:n_pixels].reshape(height, width)
    return clustered_data
def test_min_cluster_size(min_cluster_size):
    """Clusters respect ``min_cluster_size``, given as a count or as a fraction."""
    redX = X[::2]  # reduce for speed
    n_samples = redX.shape[0]

    by_count = OPTICS(min_samples=9, min_cluster_size=min_cluster_size)
    by_count.fit(redX)
    non_noise = by_count.labels_[by_count.labels_ != -1]
    cluster_sizes = np.bincount(non_noise)
    if cluster_sizes.size:
        assert min(cluster_sizes) >= min_cluster_size

    # check behaviour is the same when min_cluster_size is a fraction
    by_fraction = OPTICS(min_samples=9,
                         min_cluster_size=min_cluster_size / n_samples)
    by_fraction.fit(redX)
    assert_array_equal(by_count.labels_, by_fraction.labels_)
def fit(self, X):
    """
    Apply the ST OPTICS algorithm.

    Parameters
    ----------
    X : 2D numpy array
        The first column is the time attribute as float; the remaining
        columns are treated as spatial coordinates, i.e.
        [[time_step1, x, y], [time_step2, x, y], ...]
        For example, a 2D dataset:
        array([[0,0.45,0.43],
        [0,0.54,0.34],...])

    Returns
    -------
    self, with ``labels``, ``reachability``, ``ordering``,
    ``core_distances``, ``predecessor`` and ``cluster_hierarchy`` set from
    the fitted OPTICS estimator.

    Raises
    ------
    ValueError
        If ``eps1``, ``eps2`` or ``min_samples`` is not positive.

    Notes
    -----
    An infinite ``max_eps`` or ``eps1`` on the instance is replaced in place
    by ``10 ** n_columns - 1``.
    """
    # check if input is correct
    X = check_array(X)

    if not (self.eps1 > 0.0 and self.eps2 > 0.0 and self.min_samples > 0.0):
        raise ValueError('eps1, eps2, minPts must be positive')

    n_columns = X.shape[1]

    # Condensed pairwise distances: time column on one side, spatial
    # columns on the other.
    temporal = pdist(X[:, :1], metric=self.metric)
    spatial = pdist(X[:, 1:], metric=self.metric)

    # Pairs further apart in time than eps2 get a large sentinel distance.
    sentinel = math.pow(10, n_columns)
    combined = np.where(temporal <= self.eps2, spatial, sentinel)

    # speeds up the ST OPTICS
    finite_cap = sentinel - 1
    if np.isinf(self.max_eps):
        self.max_eps = finite_cap
    if np.isinf(self.eps1):
        self.eps1 = finite_cap

    op = OPTICS(eps=self.eps1,
                min_samples=self.min_samples,
                metric='precomputed',
                max_eps=self.max_eps,
                cluster_method=self.cluster_method,
                xi=self.xi,
                n_jobs=self.n_jobs)
    op.fit(squareform(combined))

    # expose the fitted results under names without a trailing underscore
    for public_name in ('labels', 'reachability', 'ordering',
                        'core_distances', 'predecessor',
                        'cluster_hierarchy'):
        setattr(self, public_name, getattr(op, public_name + '_'))
    return self
def train_topics(tweets, model, _min_samples=5):
    """Fit OPTICS on the tweets' topic values and pickle the estimator.

    Each tweet is quantified with ``get_topics_quantified(tweets, n_topics=1)``
    and the first entry of every result is kept as that tweet's sample.

    Parameters
    ----------
    tweets : tweet texts passed to ``get_topics_quantified``
    model : path of the pickle file to write (overwritten if it exists)
    _min_samples : OPTICS ``min_samples``

    Returns
    -------
    None
    """
    quantified = [x[0] for x in get_topics_quantified(tweets, n_topics=1)]

    optics = OPTICS(min_samples=_min_samples)
    optics.fit(quantified)

    # ``with`` closes the file even when pickling fails.
    with open(model, 'wb') as handle:
        pickle.dump(optics, handle)


### CREATE TWEET ARRAY EXAMPLES
# tweets = ["Commended for no longer saying 'China virus' Did US military bring #Covid19 to 7th Military World Games Oct18-27, 2019 Wuhan, China? Patient zero: Maatja Benassi US Athlete/Intelligence Officer? Did World's military take it back to their countries?", "BREAKING | Boris Johnson will get lung ventilation - health source sptnkne.ws/BWtv #SputnikBreaking @BorisJohnson"]
# #tweets = ["BREAKING | Boris Johnson will get lung ventilation - health source sptnkne.ws/BWtv #SputnikBreaking @BorisJohnson"]
# #tweets = ["Commended for no longer saying 'China virus' Did US military bring #Covid19 to 7th Military World Games Oct18-27, 2019 Wuhan, China? Patient zero: Maatja Benassi US Athlete/Intelligence Officer? Did World's military take it back to their countries?"]
# tweets = ["The legislative council belongs to the people of Hong Kong.Those people with ulterior motives indicated by forces hide behind the scenes laid seige to the legislative.The path of your darkness and the bright roads of the masses of the Hong Kong people will not inevitably coexist."]
### GET AND PRINT TOPICS
# topics = get_topics(tweets);
# print(topics);
### GET AND PRINT TOPIC VECTORS
# quantified = get_topics_quantified(tweets);
# print(quantified);
### CLUSTER TWEETS BASED ON TOPICS
# print(train_topics(tweets, "models/model_001.pickle", _min_samples=1));
# print(cluster_topics(tweets, "models/model_001.pickle"));
# # Read the CSV file mapping all tweet data to a motive.
# labeledDataPath = "../data/actors_and_motives.csv"; # df = pd.read_csv(labeledDataPath, usecols=["tweet_docs", "motive"], converters={"tweet_docs": lambda x: x.strip("[]").split(", ")}); # # Removes leading and ending quote characters # df["tweet_docs"] = [[x.strip('\"') for x in df["tweet_docs"][i]] for i in range(len(df["tweet_docs"]))]; # MAX_LINES = 30; # tweets = []; # # Get all file paths and their associated motives from the dataframe. # _files, _classes = [], []; # for i in range(len(df["tweet_docs"])): # for j in range(len(df["tweet_docs"][i])): # data = pd.read_csv(df["tweet_docs"][i][j], usecols=["tweet_text"], nrows=MAX_LINES); # tweets += data["tweet_text"].tolist(); # train_topics(tweets, "models/model_001.pickle"); # tweets = ["Commended for no longer saying 'China virus' Did US military bring #Covid19 to 7th Military World Games Oct18-27, 2019 Wuhan, China? Patient zero: Maatja Benassi US Athlete/Intelligence Officer? Did World's military take it back to their countries?", "BREAKING | Boris Johnson will get lung ventilation - health source sptnkne.ws/BWtv #SputnikBreaking @BorisJohnson"] # labels = cluster_topics(tweets, "models/model_001.pickle"); # connected = sorted(zip(labels, tweets)); # for label, tweet in connected: # print(str(label) + ": " + str(tweet));
def test_minimum_number_of_sample_check():
    """Fitting fewer samples than ``min_samples`` raises a ValueError."""
    # test that we check a minimum number of samples
    expected_message = "min_samples must be no greater than"

    # a single point cannot satisfy min_samples=10
    single_point = [[1, 1]]
    clust = OPTICS(max_eps=5.0 * 0.3, min_samples=10, min_cluster_size=1)

    with pytest.raises(ValueError, match=expected_message):
        clust.fit(single_point)
def doClustering(X = None, y = None, initial = False, silent = True, numClusters = 4):
    """Cluster the transformed data (optionally metric-learned) and write the result.

    The ``X`` and ``y`` arguments are overwritten with the output of
    ``rd.readTransformedData()`` and therefore have no effect. Unless
    ``initial`` is set, an ITML metric is learned from the feedback pairs
    returned by ``rd.readFeedbackData()`` and applied to the data first.

    The algorithm is chosen by two flags hard-coded at the top of the body;
    as written KMeans with ``numClusters`` clusters is always used, and the
    OPTICS and DBSCAN branches are kept as alternatives.

    Parameters
    ----------
    X, y : ignored (see above)
    initial : skip the metric-learning step when it equals ``True``
    silent : suppress the progress messages
    numClusters : number of KMeans clusters

    Returns
    -------
    int
        Always 1, after calling ``writeClusteringResult``.
    """
    use_kmeans = True
    use_optics = False

    if not silent:
        print("- doClustering")

    X, y = rd.readTransformedData()
    X2 = X.iloc[:, 0:].values

    # metric learning
    if initial == False:  # noqa: E712 -- equality, not truthiness, is the contract
        votesX, votesY = rd.readFeedbackData()
        pairs = [
            (X2[row["id_punkt1"]], X2[row["id_punkt2"]])
            for _, row in votesX.iterrows()
        ]
        itml = ITML()
        itml.fit(pairs, votesY)
        if not silent:
            print("Transform")
        X2 = itml.transform(X2)

    labels_true = y
    if use_kmeans:
        # Compute kMeans
        labels = KMeans(n_clusters=numClusters , random_state=0).fit(X2).labels_
        core_samples_mask = [0] * len(y)
    elif use_optics:
        opt = OPTICS(min_samples=30, xi=.05)
        # opt = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
        opt.fit(X2)
        labels = opt.labels_
        core_samples_mask = [0] * len(y)
    else:
        # Compute DBSCAN
        # db = DBSCAN(eps=0.1, min_samples=10).fit(X2)
        db = DBSCAN(eps=0.6, min_samples=5).fit(X2)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_

    writeClusteringResult(X2, labels, labels_true, core_samples_mask)

    if not silent:
        print("+ doClustering")
    return 1
class OPTICS_algo_wrapper:
    """Thin fit/predict wrapper around an OPTICS estimator.

    Attributes:
        wrapped: the underlying ``OPTICS`` instance.
        data: the data given to the last ``fit`` call (empty list before).
        indexes: the labels produced by the last ``fit`` call (empty list
            before).
    """

    def __init__(self):
        self.wrapped = OPTICS(min_samples=5, xi=.05, min_cluster_size=.05)
        self.data, self.indexes = [], []

    def fit(self, data):
        """Fit the wrapped estimator and remember the data and its labels."""
        fitted = self.wrapped.fit(data)
        self.data = data
        self.indexes = fitted.labels_

    def predict(self, data):
        """Return labels for ``data`` by re-fitting the wrapped estimator on it.

        ``fit_predict`` is used, so ``wrapped`` ends up fitted on ``data``;
        ``self.data`` and ``self.indexes`` are left untouched.
        """
        return self.wrapped.fit_predict(data)
# Demo script: generate six seeded 2-D Gaussian blobs, fit OPTICS on them
# and prepare the data and axes for a reachability plot.
np.random.seed(0)
n_points_per_cluster = 250

# Each blob is ``centre + spread * standard-normal noise``; the draw order
# below determines the generated points for the fixed seed.
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

# DBSCAN-style labels extracted from the same fit at two eps values
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=2)

# Reachability values and labels rearranged into the OPTICS ordering;
# ``space`` holds the positions 0..len(X)-1.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# 2x3 grid on a 10x7 figure; ax1 spans the whole first row
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])