def Kmeans(file_name):
    """Sweep KMeans over several cluster counts and plot timing and silhouette.

    The matrix returned by ``getMatrix(file_name)`` (presumably a pandas
    DataFrame -- it is accessed through ``.shape`` and ``.iloc``) is read
    column-wise: every column is passed to scikit-learn as one sample.

    Cluster counts from 2 up to (but excluding) ``int(sqrt(n_samples / 2))``
    are tried. For each count the function records the wall-clock time of
    fitting plus silhouette scoring and the average silhouette score, then
    shows a bar chart of the timings and a line chart of the scores.

    Args:
        file_name: Passed unchanged to ``getMatrix``.

    Returns:
        The cluster labels produced for the largest cluster count tried.

    Raises:
        ValueError: If the input has too few samples for the sweep to contain
            any cluster count (the upper bound is not greater than 2).
    """
    data = getMatrix(file_name)
    # One entry per column: columns are the samples handed to scikit-learn.
    samples = [data.iloc[:, i] for i in range(data.shape[1])]
    initial_k = int(math.sqrt(data.shape[1] / 2))
    range_n_clusters = np.arange(2, initial_k, 1)
    if len(range_n_clusters) == 0:
        # Without this guard the function would plot nothing and then fail on
        # an unbound ``cluster_labels`` at the return statement.
        raise ValueError(
            'Not enough samples to sweep KMeans: upper bound for k is '
            + str(initial_k) + ', but at least 3 is required'
        )

    k = []
    silhouette = []
    time_list = []
    cluster_labels = None
    for n_clusters in range_n_clusters:
        # The measured interval covers both clustering and silhouette scoring.
        t_start = time.time()
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(samples)
        avg_score = silhouette_score(samples, cluster_labels)
        k.append(n_clusters)
        silhouette.append(avg_score)
        t_end = time.time()
        time_list.append(float(t_end - t_start))

    # Bar chart: elapsed time per cluster count.
    plt.bar(k, time_list, width=0.8, facecolor="#9999ff", edgecolor="white")
    for x, y in zip(k, time_list):
        plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
    # Keep the original 0..1 s axis, but grow it when a run took longer so
    # that tall bars and their labels are not clipped.
    plt.ylim(0.0, max(1.0, max(time_list) * 1.1))
    plt.show()

    # Line chart: average silhouette score per cluster count.
    plt.figure('k-silhouette')
    plt.xlabel('K values')
    plt.ylabel('silhouette values')
    plt.title('k-silhouette table')
    plt.plot(k, silhouette)
    plt.show()
    return cluster_labels
def getLSHashOutput(filename, hash_size, k):
    """Index every column with LSHash and query the neighbours of a random one.

    The matrix returned by ``getMatrix(filename)`` (presumably a pandas
    DataFrame -- it is accessed through ``.shape``, ``.iloc`` and
    ``.columns``) is indexed column by column, using the column label as the
    extra data stored with each point. One column is then picked at random
    and queried for ``k + 1`` results using the euclidean distance. The
    query column's label and the labels of the returned entries are printed.

    Args:
        filename: Passed unchanged to ``getMatrix``.
        hash_size: Multiplier applied to the number of columns to obtain the
            LSHash ``hash_size`` (the product is truncated to an int).
        k: Number of neighbours wanted; ``k + 1`` results are requested.

    Returns:
        A list with the positional column index of every returned entry.
    """
    matrix = getMatrix(filename)
    total_num = matrix.shape[1]  # number of columns, i.e. indexed points
    lsh = LSHash(hash_size=int(hash_size * total_num), input_dim=matrix.shape[0])
    for i in range(total_num):
        lsh.index(input_point=matrix.iloc[:, i], extra_data=matrix.columns[i])
    out_num = rand.randint(0, total_num - 1)
    # LSHash offers several distance functions; euclidean is the default.
    m = lsh.query(query_point=matrix.iloc[:, out_num], num_results=k + 1, distance_func='euclidean')
    print("输入的vipno是" + str(matrix.columns[out_num]) + "\n其桶中的vipno有:")
    bucket = []
    for result in m:
        # Each result is indexed as result[0][1] to reach the stored label.
        vipno = result[0][1]
        print(vipno)
        positions = np.argwhere(matrix.columns == vipno)
        # argwhere yields a 2-D array; take the first match explicitly rather
        # than converting the whole array to a scalar, which is deprecated in
        # recent NumPy and fails when the label is not unique.
        bucket.append(int(positions[0][0]))
    return bucket
def gmm_kmeans(file_name):
    """Cluster with a Gaussian mixture and KMeans and report label agreement.

    Every column of the matrix returned by ``getMatrix(file_name)`` is used
    as one sample. A full-covariance Gaussian mixture and a KMeans model,
    both with two components/clusters, are fitted on the same samples. The
    share of samples that receive the same label id from both models is
    printed as a percentage.

    NOTE(review): labels are compared by raw id, with no matching of cluster
    ids between the two models -- confirm this is the intended measure.

    Args:
        file_name: Passed unchanged to ``getMatrix``.

    Returns:
        The labels predicted by the Gaussian mixture.
    """
    K_BEST = 2
    data = getMatrix(file_name)
    n_samples = data.shape[1]
    samples = [data.iloc[:, col] for col in range(n_samples)]

    # Fit both models on identical input so their labels line up by position.
    mixture = GaussianMixture(n_components=K_BEST, covariance_type='full')
    mixture.fit(samples)
    gaussian_labels = mixture.predict(samples)
    kmeans_model = KMeans(n_clusters=K_BEST, random_state=10)
    kmeans_labels = kmeans_model.fit_predict(samples)

    # Count the positions where both models assigned the same label id.
    match_num = sum(
        1
        for gmm_label, km_label in zip(gaussian_labels, kmeans_labels)
        if gmm_label == km_label
    )
    ratio_kmeans = match_num / n_samples
    print('与Kmeans相比,在K取' + str(K_BEST) + ',GMM的准确率是:' +
          str(ratio_kmeans * 100) + '%')
    return gaussian_labels
def gmm_dbscan(file_name, cova_type):
    """Cluster with a Gaussian mixture and DBSCAN and report label agreement.

    Every column of the matrix returned by ``getMatrix(file_name)`` is used
    as one sample. A single-component Gaussian mixture with the requested
    covariance type and a DBSCAN model (``eps=300``, ``min_samples=10``) are
    fitted on the same samples. The share of samples that receive the same
    label id from both models is printed as a percentage.

    Args:
        file_name: Passed unchanged to ``getMatrix``.
        cova_type: Forwarded as ``covariance_type`` to ``GaussianMixture``.

    Returns:
        The labels predicted by the Gaussian mixture.
    """
    EPSK_BEST = 300
    data = getMatrix(file_name)
    n_samples = data.shape[1]
    samples = [data.iloc[:, col] for col in range(n_samples)]

    # Fit both models on identical input so their labels line up by position.
    mixture = GaussianMixture(n_components=1, covariance_type=cova_type)
    mixture.fit(samples)
    gaussian_labels = mixture.predict(samples)
    density_model = DBSCAN(eps=EPSK_BEST, min_samples=10)
    dbscan_labels = density_model.fit_predict(samples)

    # Count the positions where both models assigned the same label id.
    match_num = sum(
        1
        for db_label, gmm_label in zip(dbscan_labels, gaussian_labels)
        if db_label == gmm_label
    )
    ratio_dbscan = match_num / n_samples
    print('与DBScan相比,在eps取' + str(EPSK_BEST) + ',GMM的准确率是:' +
          str(ratio_dbscan * 100) + '%')
    return gaussian_labels
def dbscan(file_name):
    """Sweep DBSCAN over several eps values and plot timing, noise, silhouette.

    Every column of the matrix returned by ``getMatrix(file_name)`` is used
    as one sample. ``eps`` runs from 25 up to (but excluding)
    ``n_samples - 1`` in steps of 10, always with ``min_samples=10``. For
    each value the function records the fitting time, the fraction of
    samples labelled ``-1`` (noise) and the average silhouette score, then
    shows two bar charts (time, noise ratio) and a line chart (silhouette).

    Args:
        file_name: Passed unchanged to ``getMatrix``.

    Returns:
        The DBSCAN labels obtained for the largest eps value tried.

    Raises:
        ValueError: If there are too few samples for the sweep to contain any
            eps value (fewer than 27 samples).
    """
    data = getMatrix(file_name)
    samples = [data.iloc[:, i] for i in range(data.shape[1])]
    range_eps = np.arange(25, len(samples) - 1, 10)
    if len(range_eps) == 0:
        # Without this guard the function would plot nothing and then fail on
        # an unbound ``labels`` at the return statement.
        raise ValueError(
            'Not enough samples to sweep DBSCAN eps: got '
            + str(len(samples)) + ', need at least 27'
        )

    eps = []
    silhouette = []
    noise_list = []
    time_list = []
    labels = None
    for eps_value in range_eps:
        # Only the fit itself is timed; scoring happens afterwards.
        start_t = time.time()
        db = DBSCAN(eps=eps_value, min_samples=10).fit(samples)
        end_time = time.time()
        labels = db.labels_

        # silhouette_score only accepts between 2 and n_samples - 1 distinct
        # labels. DBSCAN can return a single label (all noise or one big
        # cluster); record NaN for that eps rather than aborting the sweep.
        n_labels = len(np.unique(labels))
        if 2 <= n_labels <= len(labels) - 1:
            avg_score = silhouette_score(samples, labels)
        else:
            avg_score = float('nan')

        eps.append(eps_value)
        silhouette.append(avg_score)
        noise_list.append(len(labels[labels == -1]) / len(labels))
        time_list.append(end_time - start_t)

    # Bar chart: fitting time per eps value.
    plt.title('time_DBSCAN')
    plt.bar(eps, time_list, width=2, facecolor="#9999ff", edgecolor="white")
    for x, y in zip(eps, time_list):
        plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
    # Keep the original 0..1 s axis, but grow it when a fit took longer so
    # that tall bars and their labels are not clipped.
    plt.ylim(0.0, max(1.0, max(time_list) * 1.1))
    plt.show()

    # Bar chart: fraction of samples labelled as noise per eps value.
    plt.bar(eps, noise_list, width=2, facecolor="#9999ff", edgecolor="white")
    for x, y in zip(eps, noise_list):
        plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
    plt.ylim(0.0, 1.0)
    plt.show()

    # Line chart: average silhouette score per eps value (NaN leaves a gap).
    plt.figure('eps - Silhouette')
    plt.plot(eps, silhouette)
    plt.show()
    return labels