Example #1
def deserialize_dbscan_clustering(model_dict):
    model = DBSCAN(**model_dict["params"])
    model.components_ = np.array(model_dict["components_"])
    model.labels_ = np.array(model_dict["labels_"])
    model.core_sample_indices_ = model_dict["core_sample_indices_"]
    model.n_features_in_ = model_dict["n_features_in_"]
    model._estimator_type = model_dict["_estimator_type"]

    return model
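The deserializer above implies a specific dict layout; as a hedged sketch, a matching serializer (hypothetical naming, not part of the original example) would emit exactly the keys the deserializer reads:

def serialize_dbscan_clustering(model):
    # hypothetical counterpart: emit exactly the keys that
    # deserialize_dbscan_clustering reads back
    return {
        "params": model.get_params(),
        "components_": model.components_.tolist(),
        "labels_": model.labels_.tolist(),
        "core_sample_indices_": model.core_sample_indices_.tolist(),
        "n_features_in_": model.n_features_in_,
        "_estimator_type": model._estimator_type,
    }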
Example #2
def deserialize_dbscan_clustering(model_dict):
    model = DBSCAN(**model_dict['params'])
    #model.eps = model_dict['params']['eps']

    model.components_ = np.array(model_dict['components_'])
    model.labels_ = np.array(model_dict['labels_'])
    model.core_sample_indices_ = model_dict['core_sample_indices_']
    model.n_features_in_ = model_dict['n_features_in_']
    model._estimator_type = model_dict['_estimator_type']

    return model
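A quick round-trip sanity check, assuming the serializer sketched under Example #1 and scikit-learn's DBSCAN:

import numpy as np
from sklearn.cluster import DBSCAN

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0]])
fitted = DBSCAN(eps=0.5, min_samples=2).fit(X)
restored = deserialize_dbscan_clustering(serialize_dbscan_clustering(fitted))
assert np.array_equal(restored.labels_, fitted.labels_)  # [0, 0, -1]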
Example #3
def get_grouping(lab2wv_dict, eps=0.3, metric='cosine', return_mapping=False):
    """
    Gets label grouping after performing DBSCAN clustering.

    param lab2wv_dict: labels to word vectors dictionary (obtained using get_labels2wv_dict)

    param eps: The maximum distance between two samples for one to be considered as in the neighborhood of the other.
    This is not a maximum bound on the distances of points within a cluster.
    (achieving reasonable clustering highly depends on dim of wvectors, 0.2 works well for 128D and 0.3 for 300D)

    param metric: metric used for similarity, options: 'cosine', 'l1', 'l2'
    param return_mapping: if set, the human readable form of grouping will also be returned

    :returns clust_assignment_labels (ndarray) and if 'return_mapping`, assignment dictionary

    """
    print('Grouping labels using DBscan....')
    keys = np.array(list(lab2wv_dict.keys()))
    vals = np.array(list(lab2wv_dict.values()))
    clust = DBSCAN(eps=eps, min_samples=1, metric=metric)
    clust.fit(vals)
    # Fallback strategy to keep the grouping from collapsing the label set:
    # if the grouping was too aggressive, cancel it completely.
    if len(set(clust.labels_)) <= 5:
        print('Label grouping seems not to have worked; aborting grouping.')
        clust.labels_ = list(range(len(clust.labels_)))

    print(
        f'{len(clust.labels_)} labels were grouped into {len(set(clust.labels_))} groups.'
    )
    assign = {}
    for i, v in enumerate(clust.labels_):
        assign.setdefault(v, []).append(keys[i])

    for k, v in assign.items():
        if len(v) > 1:
            print(f'{v} are grouped together.')

    if not return_mapping:
        return clust.labels_
    return clust.labels_, assign
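A usage sketch with a toy dictionary (hypothetical data; real vectors come from get_labels2wv_dict). Note that with this few labels the <= 5 fallback above cancels the grouping, so a meaningful run needs a larger label set:

import numpy as np

lab2wv = {
    'cat':    np.array([0.90, 0.10]),
    'kitten': np.array([0.88, 0.12]),
    'car':    np.array([0.10, 0.90]),
}
labels, mapping = get_grouping(lab2wv, eps=0.2, return_mapping=True)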
Example #4
def train_dbscan(data):
    EPSILON_DISTANCES = range(1, 20)

    models = []
    for epsilon_distance in EPSILON_DISTANCES:
        logger.info(f"DBSCAN clustering with epsilon distance = {epsilon_distance} started...")
        model = DBSCAN(eps=epsilon_distance).fit(data)
        logger.info(f"DBSCAN clustering with epsilon distance = {epsilon_distance} done...")

        models.append(model)

    predictions = []

    for model in models:
        logger.info(f"Prediction started...")
        prediction = model.labels_(data)
        logger.info(f"Prediction done...")
        predictions.append(prediction)

    fig, ax = plt.subplots()
    ax.plot(EPSILON_DISTANCES, predictions)
    plt.savefig(f"{CURRENT_DIR}/algorithms/plots/DensityBased.jpeg")
    plt.show()
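Since DBSCAN has no separate predict step, a more informative sweep summary is the number of clusters found per eps (noise label -1 excluded). A minimal sketch under that assumption:

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

def sweep_cluster_counts(data, eps_values=range(1, 20)):
    # fit one DBSCAN per eps and count clusters, excluding noise (-1)
    counts = []
    for eps in eps_values:
        labels = DBSCAN(eps=eps).fit(data).labels_
        counts.append(len(set(labels)) - (1 if -1 in labels else 0))
    return counts

# plt.plot(range(1, 20), sweep_cluster_counts(data)); plt.show()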
Example #5
def main():

    #get rid of scientific notation in numpy arrays
    np.set_printoptions(suppress=True)
    features = 40

    # all_files = glob.glob('assignment5/data/*.wav') #assignment5/data
    # print(all_files)

    # #Writing file names to csv written by Alex B.
    # with open('all_files.csv', 'w') as csvFile:
    # 	wr = csv.writer(csvFile, delimiter="\n")
    # 	wr.writerow(all_files)
    # csvFile.close()

    # #Reading file names to list written by Alex B.
    # with open("all_files.csv", 'r') as csvFile:
    # 	reader = csv.reader(csvFile, delimiter='\n')
    # 	all_files.append(reader)
    # csvFile.close()

    #Alex Brockman
    all_files = []
    with open('all_files.csv', newline='') as csvFile:
        for row in csv.reader(csvFile):
            all_files.append(row[0])

    all_files.sort()
    data_list = []
    sampling_rate_list = []

    data_list = np.loadtxt('features_all.csv', delimiter=',')

    #Writing feature sets to csv written by Alex B.

    # #write feature set from every file to csv
    # with open('features.csv', 'w') as csvFile:
    # 	writer = csv.writer(csvFile)
    # 	for file in all_files:
    # 		data, sr = librosa.load(file)
    # 		mfccs = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=features)
    # 		mfccsscaled = np.mean(mfccs.T,axis=0)
    # 		data_list.append(mfccsscaled)
    # 		writer.writerow(mfccsscaled)
    # csvFile.close()
    # print(data_list)

    #Optimal EPS value code written by Alex B.

    # #find the optimal eps value
    # neigh = NearestNeighbors(n_neighbors=4)
    # nbrs = neigh.fit(data_list)
    # distances, indices = nbrs.kneighbors(data_list)

    # distances = np.sort(distances, axis=0)
    # distances = distances[:,1]
    # plt.plot(distances)
    # plt.show()
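    # The block above is the standard k-distance heuristic for choosing eps:
    # sort each point's distance to its 4th-nearest neighbor and read eps off
    # the elbow of the resulting curve; the eps=68 used below was presumably
    # taken from that plot.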

    # #create list from csv
    # with open('features.csv', 'r') as f:
    # 	reader = csv.reader(f)
    # 	feature = list(reader)
    # 	data_list.append(feature)
    # 	#data_list.append(list(reader))
    # print(data_list)

    # data = file('features.csv').read()
    # table = [row.split(',') for row in data.split('\n')]
    # print(table)

    # data_list is already an ndarray (loaded via np.loadtxt above);
    # keep a separate reference for the k-means runs
    data_kmeans = data_list
    #data_list = np.array(data_list)
    sampling_rate_list = np.array(sampling_rate_list)

    #run DBSCAN on our data (Andrew implemented run time estimations)
    start_time = time.time()
    clustering = DBSCAN(eps=68, min_samples=2).fit(data_list)
    end_time = time.time()
    dbscan_elapsed_time = end_time - start_time
    print("Time DBSCAN: " + str(dbscan_elapsed_time))
    clustercount = np.max(clustering.labels_) + 1
    clusteringlist = list(clustering.labels_)

    #Setup for outputting to file (Done by Carlos)
    zipped = zip(clusteringlist, all_files)
    zipped = set(zipped)

    #Done by Alex
    num_clusters = [[] for _ in range(clustercount)]

    #Done by Alex
    # group file names by DBSCAN label; noise points (label -1) never match
    # a non-negative cluster index, so they are skipped
    for k, v in zipped:
        if 0 <= k < clustercount:
            num_clusters[k].append(v)

    #Nested for loop written by Carlos S., write to file also written by Carlos
    #have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []

    #writes DBSCAN output into output_DBSCAN.txt
    output_file = 'output/output_DBSCAN.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1

    #Outputting our own KMeans, Done by Alex Brockman, runtime by Andrew
    # run our K_Means five times and time the whole loop
    start_time = time.time()
    for _ in range(5):
        k = len(list_to_output)
        km = K_Means(k)
        kmeans_labels = km.fit(data_kmeans)
    end_time = time.time()
    our_kmeans_elapsed_time = end_time - start_time
    clusteringlist = kmeans_labels
    print("Time Kmeans: " + str(our_kmeans_elapsed_time))

    #Setup for outputting to file
    zipped = zip(clusteringlist, all_files)
    zipped = set(zipped)

    num_clusters = [[] for _ in range(clustercount)]

    # group file names by k-means label (same pattern as above)
    for k, v in zipped:
        if 0 <= k < clustercount:
            num_clusters[k].append(v)

    #have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []

    #writes KMEANS output into output_KMEANS.txt, logic by Carlos, implemented by Alex
    output_file = 'output/output_KMEANS.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1

    #Our own kmeans results
    # print("Our Own KMeans Results:")
    # for key, value in (clusters[1]).items():
    # 	print("Cluster {} contains ".format(key + 1) + str(len(value)) + " files")

    #scikit agglomerative output
    #run times by andrew
    start_time = time.time()
    scikit_agg_labels = sci_kit_agg_clustering(data_list, len(list_to_output))
    end_time = time.time()
    scikit_agg_elapsed_time = end_time - start_time
    print("Time Agglomerative: " + str(scikit_agg_elapsed_time))

    #Setup for outputting to file
    zipped = zip(scikit_agg_labels, all_files)
    zipped = set(zipped)

    num_clusters = [[] for _ in range(clustercount)]

    for k, v in zipped:
        if 0 <= k < clustercount:
            num_clusters[k].append(v)

    #have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []

    #writes SCIKITAGG output into output_SCIKITAGG.txt
    output_file = 'output/output_SCIKITAGG.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1

    #scikit kmeans output
    start_time = time.time()
    scikit_kmeans_labels = sci_kit_KMeans(data_list, len(list_to_output))
    end_time = time.time()
    scikit_kmeans_elapsed_time = end_time - start_time
    print("Time Sci Kit Kmeans: " + str(scikit_kmeans_elapsed_time))

    #Setup for outputting to file
    # this section reports the scikit k-means run, so use its labels
    clusteringlist = scikit_kmeans_labels

    zipped = zip(clusteringlist, all_files)
    zipped = set(zipped)

    num_clusters = [[] for _ in range(clustercount)]

    for k, v in zipped:
        if 0 <= k < clustercount:
            num_clusters[k].append(v)

    #have to filter out each name and put it into respective sublists within big list
    sub_list_to_output = []
    list_to_output = []
    for item in num_clusters:
        for subitem in item:
            subitem = subitem.replace('assignment5/data/', '')
            subitem = subitem.replace('.wav', '')
            sub_list_to_output.append(subitem)
        list_to_output.append(sub_list_to_output)
        sub_list_to_output = []

    #writes SCIKITKMEANS output into output_SCIKITKMEANS.txt
    output_file = 'output/output_SCIKITKMEANS.txt'
    with open(output_file, 'w') as fw:
        fw.write('Number of Clusters: ' + str(len(list_to_output)) + '\n')
        i = 0
        for cluster in list_to_output:
            fw.write('Cluster ' + str(i) + ' contains: ' + str(cluster) + '\n')
            i += 1

    #excel file output written by Alex Brockman and Anish Prasanna
    count = 0
    clustering.labels_ = list(clustering.labels_)
    scikit_agg_labels = list(scikit_agg_labels)
    scikit_kmeans_labels = list(scikit_kmeans_labels)
    resultmat = [[0 for x in range(len(data_list))]
                 for y in range(len(data_list))]
    for i in range(len(kmeans_labels)):
        for j in range(len(kmeans_labels)):
            # count how many of the three other clusterings agree with our
            # k-means assignment at (i, j); list.insert would shift the row
            # and grow it past len(data_list), so assign in place instead
            count = 0
            if kmeans_labels[i] == scikit_agg_labels[j]:
                count += 1
            if kmeans_labels[i] == scikit_kmeans_labels[j]:
                count += 1
            if kmeans_labels[i] == clustering.labels_[j]:
                count += 1
            resultmat[i][j] = count
    df = pd.DataFrame.from_records(resultmat)
    df.to_excel("output.xlsx")
Example #6
        for i in range(int(P)):
            xr = random.random()
            yr = random.random()
            points.append([x + xr, y + yr])
"""
for p in points:
    plt.plot(p[0],p[1],'o',color="black")
    if random.random() < .05:
        plt.draw()
        plt.pause(.0001)
plt.show()
"""

X = np.array(points)
clustering = DBSCAN(eps=0.16, min_samples=10).fit(X)
clustering.labels_ = GaussianMixture(n_components=10,
                                     covariance_type="tied").fit_predict(X)
#clustering = AgglomerativeClustering(n_clusters=5,linkage="single").fit(X)
#num_clusters = len(set(clustering.labels_).difference(set([-1])))
"""
for i in set(clustering.labels_):
    if len(np.where(clustering.labels_ == i)[0]) < 20:
        clustering.labels_[clustering.labels_==i] = -1
        for j in range(len(clustering.labels_)):
            if clustering.labels_[j] > i:
                clustering.labels_[j] -= 1
"""
data_clusters = np.zeros(shape=(15, dataGrid.data_length))

for i, p in enumerate(points):
    x = int(p[0])
    y = int(p[1])
Example #7
# clean up the data and extract the x, y indices of the points that have value of 1
# write your code
x, y = np.nonzero(data)
arr = np.column_stack((x, y))  # (row, col) pairs of the points with value 1
# clustering algorithm
# write your code
clustering = DBSCAN(eps=15, min_samples=8).fit(arr)
# drop noise points (label -1) from the result, iterating backwards so
# earlier deletions do not shift the indices still to be checked
for i in range(len(clustering.labels_) - 1, -1, -1):
    if clustering.labels_[i] == -1:
        x = np.delete(x, i)
        y = np.delete(y, i)
        clustering.labels_ = np.delete(clustering.labels_, i)
# plot the results
# write your code


df = pd.DataFrame({
    'x': y,
    'y': x,
    'label': clustering.labels_
})
fg = sns.FacetGrid(data=df, hue='label')
fg.map(plt.scatter, 'x', 'y', s=6).add_legend()

plt.xlim(0, 401)
plt.ylim(100, 401)
plt.show()
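As a minimal equivalent sketch, the backwards np.delete loop in this example can be replaced with one boolean mask over the freshly fitted labels:

mask = clustering.labels_ != -1          # keep every non-noise point
x, y = x[mask], y[mask]
clustering.labels_ = clustering.labels_[mask]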