def deserialize_dbscan_clustering(model_dict):
    """Rebuild a fitted sklearn DBSCAN estimator from its serialized dict.

    `model_dict` is expected to hold the constructor params under "params"
    plus the fitted attributes that `DBSCAN.fit()` would normally set.
    """
    restored = DBSCAN(**model_dict["params"])
    # List-serialized arrays are converted back to ndarrays.
    restored.components_ = np.array(model_dict["components_"])
    restored.labels_ = np.array(model_dict["labels_"])
    # Remaining fitted attributes are stored as-is.
    for attr in ("core_sample_indices_", "n_features_in_", "_estimator_type"):
        setattr(restored, attr, model_dict[attr])
    return restored
def deserialize_dbscan_clustering(model_dict):
    """Reconstruct a fitted DBSCAN model from a serialized dictionary.

    NOTE(review): this is a second, functionally identical definition of
    ``deserialize_dbscan_clustering``; if both live in the same module the
    later definition silently shadows the earlier one — consider removing one.
    """
    estimator = DBSCAN(**model_dict['params'])
    estimator.labels_ = np.array(model_dict['labels_'])
    estimator.components_ = np.array(model_dict['components_'])
    estimator.n_features_in_ = model_dict['n_features_in_']
    estimator.core_sample_indices_ = model_dict['core_sample_indices_']
    estimator._estimator_type = model_dict['_estimator_type']
    return estimator
def get_grouping(lab2wv_dict, eps=0.3, metric='cosine', return_mapping=False):
    """Group labels by DBSCAN-clustering their word vectors.

    :param lab2wv_dict: mapping from label to its word vector
        (as produced by get_labels2wv_dict)
    :param eps: DBSCAN neighborhood radius; reasonable values depend on the
        vector dimensionality (0.2 works well for 128D, 0.3 for 300D)
    :param metric: similarity metric: 'cosine', 'l1' or 'l2'
    :param return_mapping: also return the human-readable cluster->labels dict
    :returns: cluster assignment labels, plus the assignment dict when
        `return_mapping` is set
    """
    print('Grouping labels using DBscan....')
    label_names = np.array(list(lab2wv_dict.keys()))
    vectors = np.array(list(lab2wv_dict.values()))
    clusterer = DBSCAN(eps=eps, min_samples=1, metric=metric)
    clusterer.fit(vectors)
    # Fall-out strategy: if clustering collapsed nearly everything into a
    # handful of groups it was too aggressive, so cancel grouping entirely
    # by giving every label its own singleton cluster.
    if len(set(clusterer.labels_)) <= 5:
        print('Label grouping seems to have not work, aborting grouping.')
        clusterer.labels_ = list(range(len(clusterer.labels_)))
    print(
        f'{len(clusterer.labels_)} labels were grouped into {len(set(clusterer.labels_))} groups.'
    )
    # Build cluster-id -> [label, ...] mapping.
    assign = {}
    for cluster_id, name in zip(clusterer.labels_, label_names):
        assign.setdefault(cluster_id, []).append(name)
    for members in assign.values():
        if len(members) > 1:
            print(f'{members} are grouped together.')
    if return_mapping:
        return clusterer.labels_, assign
    return clusterer.labels_
def train_dbscan(data):
    """Fit DBSCAN over a sweep of epsilon distances and plot the resulting labels.

    :param data: feature matrix accepted by sklearn's ``DBSCAN.fit``
    Side effects: saves the figure to algorithms/plots/DensityBased.jpeg under
    CURRENT_DIR and displays it; returns nothing.
    """
    EPSILON_DISTANCES = range(1, 20)
    models = []
    for epsilon_distance in EPSILON_DISTANCES:
        logger.info(f"DBSCAN clustering with epsilon distance = {epsilon_distance} started...")
        model = DBSCAN(eps=epsilon_distance).fit(data)
        logger.info(f"DBSCAN clustering with epsilon distance = {epsilon_distance} done...")
        models.append(model)
    predictions = []
    for model in models:
        logger.info("Prediction started...")
        # BUG FIX: ``labels_`` is an ndarray attribute populated by fit(), not
        # a callable — ``model.labels_(data)`` raised TypeError. DBSCAN has no
        # predict(); the fitted labels for ``data`` are read directly.
        prediction = model.labels_
        logger.info("Prediction done...")
        predictions.append(prediction)
    fig, ax = plt.subplots()
    # One line per sample across the epsilon sweep (predictions is a list of
    # per-epsilon label arrays, all of length len(data)).
    ax.plot(EPSILON_DISTANCES, predictions)
    plt.savefig(f"{CURRENT_DIR}/algorithms/plots/DensityBased.jpeg")
    plt.show()
def main():
    """Cluster pre-computed audio MFCC feature vectors with four algorithms
    (DBSCAN, a custom K_Means, scikit-learn agglomerative, scikit-learn KMeans),
    write each algorithm's cluster membership to a text file under output/,
    and dump a label cross-comparison matrix to output.xlsx.

    NOTE(review): the original source was whitespace-collapsed; statement
    nesting below was reconstructed and should be confirmed against history.
    """
    # get rid of scientific notation in numpy arrays
    np.set_printoptions(suppress=True)
    # Number of MFCCs per file — only used by the commented-out extraction code.
    features = 40
    # all_files = glob.glob('assignment5/data/*.wav') #assignment5/data
    # print(all_files)
    # #Writing file names to csv written by Alex B.
    # with open('all_files.csv', 'w') as csvFile:
    #     wr = csv.writer(csvFile, delimiter="\n")
    #     wr.writerow(all_files)
    #     csvFile.close()
    # #Reading file names to list written by Alex B.
    # with open("all_files.csv", 'r') as csvFile:
    #     reader = csv.reader(csvFile, delimiter='\n')
    #     all_files.append(reader)
    #     csvFile.close()
    # Alex Brockman: read the list of .wav file names from CSV (one per row).
    all_files = []
    with open('all_files.csv', newline='') as csvFile:
        for row in csv.reader(csvFile):
            all_files.append(row[0])
    all_files.sort()
    data_list = []
    sampling_rate_list = []
    # Pre-computed feature matrix; rows presumably line up with the sorted
    # all_files list — TODO confirm row order matches.
    data_list = np.loadtxt('features_all.csv', delimiter=',')
    # Writing feature sets to csv written by Alex B.
    # #write feature set from every file to csv
    # with open('features.csv', 'w') as csvFile:
    #     writer = csv.writer(csvFile)
    #     for file in all_files:
    #         data, sr = librosa.load(file)
    #         mfccs = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=features)
    #         mfccsscaled = np.mean(mfccs.T,axis=0)
    #         data_list.append(mfccsscaled)
    #         writer.writerow(mfccsscaled)
    #     csvFile.close()
    # print(data_list)
    # Optimal EPS value code written by Alex B.
    # #find the optimal eps value
    # neigh = NearestNeighbors(n_neighbors=4)
    # nbrs = neigh.fit(data_list)
    # distances, indices = nbrs.kneighbors(data_list)
    # distances = np.sort(distances, axis=0)
    # distances = distances[:,1]
    # plt.plot(distances)
    # plt.show()
    # #create list from csv
    # with open('features.csv', 'r') as f:
    #     reader = csv.reader(f)
    #     feature = list(reader)
    #     data_list.append(feature)
    #     #data_list.append(list(reader))
    # print(data_list)
    # data = file('features.csv').read()
    # table = [row.split(',') for row in data.split('\n')]
    # print(table)
    # turn lists into numpy arrays
    data_kmeans = data_list
    #data_list = np.array(data_list)
    sampling_rate_list = np.array(sampling_rate_list)
    # run DBSCAN on our data (Andrew implemented run time estimations)
    start_time = time.time()
    clustering = DBSCAN(eps=68, min_samples=2).fit(data_list)
    end_time = time.time()
    dbscan_elapsed_time = end_time - start_time
    print("Time DBSCAN: " + str(dbscan_elapsed_time))
    # Highest label + 1 = number of clusters (DBSCAN noise points are -1).
    clustercount = np.max(clustering.labels_) + 1
    clusteringlist = list(clustering.labels_)
    # Setup for outputting to file (Done by Carlos)
    zipped = zip(clusteringlist, all_files)
    zipped = set(zipped)
    # Done by Alex
    num_clusters = [[] for i in range(1, clustercount + 1)]
    # Done by Alex — bucket each (label, filename) pair into its cluster list.
    for k, v in zipped:
        for i in range(len(zipped)):
            if k == i:
                num_clusters[i].append(v)
            # NOTE(review): this increment is ineffectual — the for loop
            # reassigns i each iteration; placement reconstructed as-is.
            i += 1
    # Nested for loop written by Carlos S., write to file also written by Carlos
    # have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []
    # writes DBSCAN output into output_DBSCAN.txt
    output_file = 'output/output_DBSCAN.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1
    # Outputting our own KMeans, Done by Alex Brockman, runtime by Andrew
    # Runs the custom K_Means five times; only the last run's labels are kept.
    j = 0
    start_time = time.time()
    while (j < 5):
        k = (len(list_to_output))
        km = K_Means(k)
        kmeans_labels = km.fit(data_kmeans)
        j += 1
    end_time = time.time()
    our_kmeans_elapsed_time = end_time - start_time
    clusteringlist = kmeans_labels
    print("Time Kmeans: " + str(our_kmeans_elapsed_time))
    # Setup for outputting to file
    zipped = zip(clusteringlist, all_files)
    zipped = set(zipped)
    num_clusters = [[] for i in range(1, clustercount + 1)]
    for k, v in zipped:
        for i in range(len(zipped)):
            if k == i:
                num_clusters[i].append(v)
            i += 1
    # have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []
    # writes KMEANS output into output_KMEANS.txt, logic by Carlos, implemented by Alex
    output_file = 'output/output_KMEANS.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1
    # Our own kmeans results
    # print("Our Own KMeans Results:")
    # for key, value in (clusters[1]).items():
    #     print("Cluster {} contains ".format(key + 1) + str(len(value)) + " files")
    # scikit agglomerative output
    # run times by andrew
    start_time = time.time()
    scikit_agg_labels = sci_kit_agg_clustering(data_list, len(list_to_output))
    end_time = time.time()
    scikit_agg_elapsed_time = end_time - start_time
    print("Time Agglomerative: " + str(scikit_agg_elapsed_time))
    # Setup for outputting to file
    zipped = zip(scikit_agg_labels, all_files)
    zipped = set(zipped)
    num_clusters = [[] for i in range(1, clustercount + 1)]
    for k, v in zipped:
        for i in range(len(zipped)):
            if k == i:
                num_clusters[i].append(v)
            i += 1
    # have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []
    # writes SCIKITAGG output into output_SCIKITAGG.txt
    output_file = 'output/output_SCIKITAGG.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1
    # scikit kmeans output
    start_time = time.time()
    scikit_kmeans_labels = sci_kit_KMeans(data_list, len(list_to_output))
    end_time = time.time()
    scikit_kmeans_elapsed_time = end_time - start_time
    print("Time Sci Kit Kmeans: " + str(scikit_kmeans_elapsed_time))
    # Setup for outputting to file
    # NOTE(review): this uses the *agglomerative* labels in the scikit-KMeans
    # section — scikit_kmeans_labels was probably intended; confirm before fixing.
    clusteringlist = scikit_agg_labels
    zipped = zip(clusteringlist, all_files)
    zipped = set(zipped)
    num_clusters = [[] for i in range(1, clustercount + 1)]
    for k, v in zipped:
        for i in range(len(zipped)):
            if k == i:
                num_clusters[i].append(v)
            i += 1
    # have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []
    # writes SCIKITKMEANS output into output_SCIKITKMEANS.txt
    output_file = 'output/output_SCIKITKMEANS.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1
    # excel file output written by Alex Brockman and Anish Prasanna
    count = 0
    clustering.labels_ = list(clustering.labels_)
    scikit_agg_labels = list(scikit_agg_labels)
    scikit_kmeans_labels = list(scikit_kmeans_labels)
    resultmat = [[0 for x in range(len(data_list))] for y in range(len(data_list))]
    # Pairwise agreement between the custom KMeans labels and the other
    # algorithms' labels.
    for i in range(len(kmeans_labels)):
        for j in range(len(kmeans_labels)):
            count = 0
            # NOTE(review): insert() grows each row beyond its pre-sized
            # length; resultmat[i][j] = count may have been intended — confirm.
            if kmeans_labels[i] == scikit_agg_labels[j]:
                count += 1
                resultmat[i].insert(j, count)
            if kmeans_labels[i] == scikit_kmeans_labels[j]:
                count += 1
                resultmat[i].insert(j, count)
            if kmeans_labels[i] == clustering.labels_[j]:
                count += 1
                resultmat[i].insert(j, count)
    df = pd.DataFrame.from_records(resultmat)
    df.to_excel("output.xlsx")
# Jitter P points around (x, y); x, y, P, points and dataGrid come from
# enclosing code not visible in this chunk — TODO confirm their origin.
for i in range(int(P)):
    xr = random.random()
    yr = random.random()
    points.append([x + xr, y + yr])
"""
for p in points:
    plt.plot(p[0],p[1],'o',color="black")
    if random.random() < .05:
        plt.draw()
        plt.pause(.0001)
plt.show()
"""
X = np.array(points)
clustering = DBSCAN(eps=0.16, min_samples=10).fit(X)
# NOTE(review): the DBSCAN labels are immediately overwritten by the Gaussian
# mixture assignment below, so the DBSCAN fit result itself is unused here.
clustering.labels_ = GaussianMixture(n_components=10, covariance_type="tied").fit_predict(X)
#clustering = AgglomerativeClustering(n_clusters=5,linkage="single").fit(X)
#num_clusters = len(set(clustering.labels_).difference(set([-1])))
"""
for i in set(clustering.labels_):
    if len(np.where(clustering.labels_ == i)[0]) < 20:
        clustering.labels_[clustering.labels_==i] = -1
    for j in range(len(clustering.labels_)):
        if clustering.labels_[j] > i:
            clustering.labels_[j] -= 1
"""
data_clusters = np.zeros(shape=(15, dataGrid.data_length))
# NOTE(review): this loop body appears truncated in this view — it continues
# beyond this chunk; only the first two statements are visible.
for i, p in enumerate(points):
    x = int(p[0])
    q = int(p[1])
# clean up the data and extract the x, y indices of the points that have value of 1
# write your code
[x, y] = np.nonzero(data)
arr = [[xi, yi] for xi, yi in zip(x, y)]

# clustering algorithm
# write your code
clustering = DBSCAN(eps=15, min_samples=8).fit(arr)
length = len(clustering.labels_)
# Drop DBSCAN noise points (label -1) from the coordinates and labels alike.
keep = clustering.labels_ != -1
x = x[keep]
y = y[keep]
clustering.labels_ = clustering.labels_[keep]

# plot the results
# write your code
df = pd.DataFrame({
    'x': y,  # note the axis swap: columns plotted on x, rows on y
    'y': x,
    'label': clustering.labels_
})
fg = sns.FacetGrid(data=df, hue='label')
fg.map(plt.scatter, 'x', 'y', s=6).add_legend()
plt.xlim(0, 401)
plt.ylim(100, 401)
plt.show()