Example #1
0
def clusterise_data(data_obj):
    """ Assigns a cluster label to each day present in the data received,
        except the last one (the current day).

        The algorithm is chosen by the module-level ALGO setting, which must
        be one of "Affinity Propagation", "KMeans" or "MeanShift". Every day
        is turned into a weighted 4-feature point and the label computed for
        it is written back into that day's dictionary under the "cluster" key.
        Per-cluster sizes and the silhouette coefficient are reported through
        debug_print.

        @param data_obj: List of dictionaries, one per day, each providing the
            numeric keys "wake_up", "sleep", "sleep_duration" and "activity".
            The dictionaries (all but the last) are modified in place.
        @raise ValueError: if ALGO does not name one of the three algorithms.
    """
    from collections import Counter

    # The last entry is the current day and is deliberately left out.
    history = [data_obj[i] for i in range(len(data_obj) - 1)]

    # Build the feature matrix; wake_up and sleep_duration are the most
    # important factors, hence their larger weights.
    points = NumpyArray([
        [
            5 * day["wake_up"],
            1 * day["sleep"],
            5 * day["sleep_duration"],
            0.5 * day["activity"],
        ]
        for day in history
    ])

    if ALGO == "Affinity Propagation":
        labels = AffinityPropagation().fit_predict(points)
    elif ALGO == "KMeans":
        labels = KMeans(init='k-means++', n_clusters=5, n_init=10).fit_predict(points)
    elif ALGO == "MeanShift":
        bandwidth = estimate_bandwidth(points, quantile=0.2, n_samples=20)
        labels = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit_predict(points)
    else:
        raise ValueError("Algorithm not defined: "+str(ALGO))

    for day, label in zip(history, labels):
        day["cluster"] = label

    # Count every label once instead of rescanning the label list for each
    # unique label.
    label_list = labels.tolist()
    cluster_sizes = Counter(label_list)
    for unique_label in remove_duplicates(labels):
        debug_print(ALGO+": Cluster "+str(unique_label)+" contains "+str(cluster_sizes[unique_label])+" data points")

    # silhouette_score only accepts between 2 and n_samples - 1 distinct
    # labels and raises ValueError otherwise; a degenerate clustering must not
    # abort the function after the labels have already been assigned.
    n_clusters = len(cluster_sizes)
    n_points = len(label_list)
    if 1 < n_clusters < n_points:
        score = metrics.silhouette_score(points, labels, metric='euclidean')
        debug_print(ALGO+": Silhouette coefficient"+ str(score*100)+"%")
    else:
        debug_print(ALGO+": Silhouette coefficient undefined for "+str(n_clusters)+" cluster(s) over "+str(n_points)+" data points")
Example #2
0
# Join the tokens of every entry in token_summary into one space-separated
# string, which is the input handed to the TF-IDF vectorizer below.
string_summary = [" ".join(s) for s in token_summary]

print("Getting TFIDF...")
vectorizer = TfidfVectorizer(norm='l2')
tfidf_matrix = vectorizer.fit_transform(string_summary)

# Pairwise cosine similarity between the TF-IDF rows; this matrix is what
# gets clustered.
sim_matrix = cosine_similarity(tfidf_matrix)

print("Start clustering...")
start = time.time()
labels = AffinityPropagation().fit_predict(sim_matrix)
elapsed = time.time() - start
print("{:.2f}s".format(elapsed))

labels = labels.tolist()

# Count how many entries were assigned to each cluster label.
cluster_sizes = {}
for label in labels:
    cluster_sizes[label] = cluster_sizes.get(label, 0) + 1

# NOTE(review): the name `dict` shadows the builtin type. It is kept because
# code outside this section may read the counts through it -- confirm and
# rename if possible.
dict = cluster_sizes

def dict2list(dic: dict):
    """Convert a dictionary into a list of (key, value) tuples.

    The pairs appear in the dictionary's own iteration order.

    @param dic: dictionary to convert
    @return: a new list holding one (key, value) tuple per entry
    """
    # dict.items() already yields the (key, value) pairs; no need to zip
    # keys() and values() by hand.
    return list(dic.items())