Code Example #1
def __init__(self, tx=None, y=None, z=None, num_components=50):
    [D, self.WW, self.mu] = pca(tx, num_components)
    self.y = y
    self.tx = tx
    self.z = z
    self.projections = []  # initialize before appending per-sample projections
    for xi in tx:
        self.projections.append(
            project(self.WW, xi.reshape(1, -1), self.mu))
Code Example #2
File: Mnist.py Project: ArrogantL/ML
def mnistProcess():
    """
    分析minist-5,目标图片是数字5的手写
    将不同pcnum的结果保存到data/mnistprocess

    这里将每一行当作一个样本,即28个28维样本。
    可选的其他样本划分方式:
    1. 加入行序号:由于行号规律强,PCA通常不会将其作为主成分,所以加不加影响不大。
    2. 28*28个三维样本,(行号,列号,灰度):行列号对PCA影响不大,仅剩下一个维度无法PCA

    """
    im = readData()
    for i in range(50):
        # around i=17 and i=18 the reconstructions become very hard to tell apart by eye.
        lowD, newD, topdvects = pca(im, i)
        PSNR = analyzePSNR(im, newD)
        print(i, PSNR)
        plt.imshow(np.matrix.tolist(newD), cmap='gray')
        plt.savefig("data/mnistprocess/" + str(i) + ".png")
Code Example #3
def look_rawdata(origin_num):
    fin = open('data/microarray.original.txt', 'r')
    lines = []
    fin.readline()
    fout = open('output/pca/pca%.2f.txt' % PCA_PERCENTAGE, 'w')
    dataset_np = np.zeros([ALL_DATA, PCA[str(PCA_PERCENTAGE)]])

    # read raw data
    for i in range(22283):
        line = fin.readline()
        line = line.split('\t')
        line = line[1:]
        lines.append(list(map(float, line)))

    print("Data has been read successfully.")

    # do PCA
    data = np.array(lines).T
    print("Now reducing dimension...")
    lowDData = pca(data, PCA_PERCENTAGE)
    #print(lowDData[0][0])
    print("Finished, the new dimension is :" + str(len(lowDData[0])))

    # save pca results (.txt file and .npy)
    print("Start writing new data...")
    j = 0
    for k in origin_num:
        for num in k[1]:
            for i in range(len(lowDData[num])):
                dataset_np[j][i] = lowDData[num][i].real      # the number will be xxx+0j without .real
                fout.write(str(lowDData[num][i].real) + '\t')
            j += 1
            fout.write('\n')
    np.save('output/pca/pca%.2f.npy' % PCA_PERCENTAGE, dataset_np)
    print("Finished the whole work.")

    fin.close()
    fout.close()

    return dataset_np
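The .real calls in the write-out loop exist because this project's pca returns complex-valued output, which is what happens when the covariance eigendecomposition is done with a general eigensolver such as np.linalg.eig. A hedged sketch of a percentage-driven PCA with that behavior (illustrative names, not the project's actual implementation):

import numpy as np

def pca_by_percentage(data, percentage):
    # data: (n_samples, n_features); keep enough components to cover `percentage` of the variance.
    centered = data - data.mean(axis=0)
    cov = np.cov(centered, rowvar=False)
    eig_vals, eig_vecs = np.linalg.eig(cov)          # may come back complex (x + 0j)
    order = np.argsort(eig_vals.real)[::-1]
    eig_vals, eig_vecs = eig_vals[order], eig_vecs[:, order]
    explained = np.cumsum(eig_vals.real) / eig_vals.real.sum()
    k = int(np.searchsorted(explained, percentage)) + 1
    return centered @ eig_vecs[:, :k]                # entries may be complex, hence .real upstream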
Code Example #4
def Train():
    global MLPObj, PrepareObj, RBFObj, PCAObj, sc_x
    MLPObj = MLP()
    PrepareObj = Preparation()
    RBFObj = RBF()
    PCAObj = pca()
    x_train, y_train, x_test, y_test, Original_x_train, Original_x_test = PrepareObj.GetDataset(
        "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Training",
        "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Testing"
    )
    if NNPCAVar.get():
        PCAObj.LoadWeights()
        x_train = PCAObj.transform(Original_x_train)
        x_test = PCAObj.transform(Original_x_test)
    from sklearn.preprocessing import StandardScaler
    sc_x = StandardScaler()
    x_train = sc_x.fit_transform(x_train)
    x_test = sc_x.transform(x_test)
    if LoadTrainVar.get():
        if AlgoVar.get():
            MLPObj.TrainTheModel(Hidden_Entry.get(), epochs_Entry.get(),
                                 LearningRate_Entry.get(), Neurons_Entry.get(),
                                 Activation_Entry.get(), MSE_Entry.get(),
                                 var.get(), x_train, y_train, x_test, y_test)
        else:
            RBFObj.TrainTheModel_rbf(Neurons_Entry.get(),
                                     LearningRate_Entry.get(), MSE_Entry.get(),
                                     epochs_Entry.get(), 5, x_train, y_train,
                                     x_test, y_test)
    else:
        if AlgoVar.get():
            MLPObj.LoadWeights(Hidden_Entry.get(), epochs_Entry.get(),
                               LearningRate_Entry.get(), Neurons_Entry.get(),
                               Activation_Entry.get(), MSE_Entry.get(),
                               var.get(), x_train, y_train, x_test, y_test)
        else:
            RBFObj.LoadWeights(Neurons_Entry.get(), LearningRate_Entry.get(),
                               MSE_Entry.get(), epochs_Entry.get(), 5, x_train,
                               y_train, x_test, y_test)
Code Example #5
    print("h:%s erro:%s"%(i,NaiveBayes.classificarParzen(wpdcCopia, mp1, mp2, i, ["N","R"])))
    print(confusion_matrix(WpdcOri.classes, wpdcCopia.classes,["N","R"]))
    
print("naive com janela de parzen retangular - wbdc")
for i in h:
    wbdcCopia = Base(copy.deepcopy(wbdcOri.classes),copy.deepcopy(wbdcOri.atributos)) 
    print("h:%s erro:%s"%(i,NaiveBayes.classificarParzen(wbdcCopia, m1, m2, i, ["M","B"],"r"))) 
    print(confusion_matrix(wbdcOri.classes, wbdcCopia.classes,["M","B"]))
print("naive com janela de parzen retangular - wpdc")
for i in h:
    wpdcCopia = Base(copy.deepcopy(WpdcOri.classes),copy.deepcopy(WpdcOri.atributos))
    print("h:%s erro:%s"%(i,NaiveBayes.classificarParzen(wpdcCopia, mp1, mp2, i, ["N","R"])))
    print(confusion_matrix(WpdcOri.classes, wpdcCopia.classes,["N","R"]))
#Q6
print("---------------------Sexta Questao------------------------\n")
wbdcPCA = pca(wbdcOri, len(wbdcOri.atributos[0])-1)
m1,m2 = separarElementosPorClasse(wbdcPCA, ["M","B"])
v1 = np.var(m1)
v2 = np.var(m2)
m1 = np.mean(m1, axis=0)
m2 = np.mean(m2, axis=0)
print("erro wbdc naiveBayes univariado:%s"%NaiveBayes.classificar(m1, m2, v1, v2, wbdcPCA,["M","B"],"u"))
print(confusion_matrix(wbdcOriSort.classes, wbdcPCA.classes))

WpdcPCA = pca(WpdcOri,len(WpdcOri.atributos[0])-1)
m1,m2 = separarElementosPorClasse(WpdcPCA, ["N","R"])
v1 = np.var(m1)
v2 = np.var(m2)
m1 = np.mean(m1, axis=0)
m2 = np.mean(m2, axis=0)
print("erro wpdc naiveBayes univariado:%s"%NaiveBayes.classificar(m1, m2, v1, v2, WpdcPCA,["N","R"],"u"))
Code Example #6
import glob

import cv2
import numpy as np

from PCA import pca
TrainingDatasetPath = "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Training"
TestingDatasetPath = "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Testing"
classes = []
tmp_x_train = np.full((25,2500),0)
y_train = np.full((25,5),0)
idx=0
for filename in glob.glob(TrainingDatasetPath + '/*.jpg'): 
    img = cv2.imread(filename,0)
    GrayImage = cv2.resize(img, (50, 50)) 
    tmp_x_train[idx,:] = np.array(GrayImage).reshape((1,2500))
    image = filename[len(TrainingDatasetPath)+2:]
    if image.split("- ")[1][:-4] not in classes:
        classes.append(image.split("- ")[1][:-4])
    y_train[idx,classes.index(image.split("- ")[1][:-4])] = 1
    idx = idx + 1
   
tmp_x_test = np.full((26,2500),0)
y_test = np.full((26,5),0)
idx=0
for filename in glob.glob(TestingDatasetPath + '/*.jpg'): 
    img = cv2.imread(filename,0)
    GrayImage = cv2.resize(img, (50, 50)) 
    tmp_x_test[idx,:] = np.array(GrayImage).reshape((1,2500))
    image = filename[len(TestingDatasetPath)+2:]
    y_test[idx,classes.index(image.split("- ")[1][:-4])] = 1
    idx = idx + 1        


o = pca()
w = o.fit(23,tmp_x_train,50,0.00000000000001)
Code Example #7
File: cfiltering.py Project: Azizou/Python-ML
            cf_rating[indx] = res                                           # For every item rated by the user "user_id", we now also have the collaborative filtering rating of the neighborhood.
            
        # end for user_id
        return cf_rating

if __name__ == "__main__":
    
    os.chdir("..")
    ratings_train = pd.load('proc_data/ratings_train.pda')
   
    accuracyList = list() # a list of (embedding, deviation_from_rating, misclassification_error) tuples
    for k in range(2, 1677, 20):
        
        #Get user embedding
        
        userEmbedding = pca(ratings_train[['userid', 'itemid', 'rating']], k, 0).real
        print "Computed %d-dimensional user embedding." %(k)
        
        # Calculate collaborative filtering ratings
        
        cf = CFilter(ratings_train, userEmbedding, size = 20)
        print "Built CFilter object"
        predictedRatings = cf.get_user_cf_rating(ratings_train)
        
        # Estimate training error in terms of two metrics: average deviation from true
        # rating and average misclassification error in terms of classifying a movie
        # as "good" or "bad".
        
        predictedLabels = np.array([1 if rat > 3 else 0 for rat in predictedRatings])
        # make the "isgood" column of the data map to {0,1} instead of {-1, 1}
        # so that average squared loss works correctly
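The excerpt stops before the two training-error metrics described in the comments are actually computed. A hedged sketch of how the loop might finish, assuming hypothetical trueRatings and trueLabels arrays aligned with the predictions and labels already mapped to {0, 1}:

        # Illustrative sketch only; trueRatings and trueLabels are assumed, not from the original file.
        avg_deviation = np.mean(np.abs(predictedRatings - trueRatings))
        misclassification_error = np.mean(predictedLabels != trueLabels)
        accuracyList.append((k, avg_deviation, misclassification_error))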
Code Example #8
File: BoW_PCA.py Project: mkinoshi/IdeologyDetection
def main(lib_docs, con_docs, lib_test_docs, con_test_docs, num_evecs, num_words, cutoff=False, cutoff_rate=1.0):
	### MODEL CONSTRUCTION ###
	
	# tokenize the docs
	lib_tokenized_docs = []
	con_tokenized_docs = []
	
	for sentence in lib_docs:
		tokens = toknize_article(sentence)
		lib_tokenized_docs.append(tokens)
	
	for sentence in con_docs:
		tokens = toknize_article(sentence)
		con_tokenized_docs.append(tokens)
	
	# concat tokenized_docs lists
	all_tokenized_docs = lib_tokenized_docs + con_tokenized_docs
	
	# use all_tokenized_docs so that matrix's # of features matches
	dict =  corpora.Dictionary(all_tokenized_docs)
	
	# create matrix for each category
	lib_docs_matrix = vectorize_articles(lib_tokenized_docs, dict)
	con_docs_matrix = vectorize_articles(con_tokenized_docs, dict)
	
	# stack them and use it to do PCA on whole training data
	all_docs_matrix = np.vstack((lib_docs_matrix, con_docs_matrix))
	
	(proj_matrix, e_vecs, e_vals)=  pca(all_docs_matrix)
	
	print lib_docs_matrix.shape
	print con_docs_matrix.shape

	# project each category matrix onto the transpose of eigenvector matrix	
	
	if cutoff:
		cutoff_index = pca_cutoff(e_vals, cutoff_rate)
		lib_proj_matrix = np.dot(lib_docs_matrix, e_vecs.T)[:,:cutoff_index]
		con_proj_matrix = np.dot(con_docs_matrix, e_vecs.T)[:,:cutoff_index]
	else:
		lib_proj_matrix = np.dot(lib_docs_matrix, e_vecs.T)
		con_proj_matrix = np.dot(con_docs_matrix, e_vecs.T)
	
	print lib_proj_matrix.shape
	print con_proj_matrix.shape
	
	
	# take mean of all rows and get a vector representing the average sentence for 
	# each category
	lib_mean_vector = lib_proj_matrix.mean(axis=0)
	con_mean_vector = con_proj_matrix.mean(axis=0)
	
	print lib_mean_vector
	print lib_mean_vector.shape
	print con_mean_vector
	print con_mean_vector.shape
	
	X = np.vstack((lib_mean_vector, con_mean_vector))


	kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
	
	
	### TESTING ###
	
	# tokenize the test docs
	lib_tokenized_test_docs = []
	con_tokenized_test_docs = []
	
	for sentence in lib_test_docs:
		tokens = toknize_article(sentence)
		lib_tokenized_test_docs.append(tokens)
	
	for sentence in con_test_docs:
		tokens = toknize_article(sentence)
		con_tokenized_test_docs.append(tokens)
	
	# create matrix for each category
	lib_test_docs_matrix = vectorize_articles(lib_tokenized_test_docs, dict)
	con_test_docs_matrix = vectorize_articles(con_tokenized_test_docs, dict)
	

	# project each matrix to eigenspace
	if cutoff:
		lib_proj_test_matrix = np.dot(lib_test_docs_matrix, e_vecs.T)[:,:cutoff_index]
		con_proj_test_matrix = np.dot(con_test_docs_matrix, e_vecs.T)[:,:cutoff_index]
	else:
		lib_proj_test_matrix = np.dot(lib_test_docs_matrix, e_vecs.T)
		con_proj_test_matrix = np.dot(con_test_docs_matrix, e_vecs.T)
	
	

	lib_result = kmeans.predict(lib_proj_test_matrix)
	con_result = kmeans.predict(con_proj_test_matrix)
	
	lib_hit = float(np.count_nonzero(lib_result == 0))
	con_hit = float(np.count_nonzero(con_result == 1))
	
	lib_accuracy = lib_hit/len(lib_result)
	con_accuracy = con_hit/len(con_result)
	
	print "Liberal Accuracy: ", lib_accuracy
	print "Conservative Accuracy: ", con_accuracy
	

	# find the top n words for the top m eigenvectors
	for row in range(num_evecs):
		print find_topn_words(e_vecs[row,:], dict, num_words)

	
	# plot along eigenvectors
	lib_x = lib_proj_test_matrix[:,0]
	lib_y = lib_proj_test_matrix[:,1]
	con_x = con_proj_test_matrix[:,0]
	con_y = con_proj_test_matrix[:,1]
	lib_z = lib_proj_test_matrix[:,2]
	con_z = con_proj_test_matrix[:,2]
	
	lib_xyz = [lib_x, lib_y, lib_z]
	con_xyz = [con_x, con_y, con_z]
	
	two_dim_eigenplot(lib_xyz[:2], con_xyz[:2])
	three_dim_eigenplot(lib_xyz, con_xyz)
Code Example #9
            h, w = ggray.shape
            thresh = np.array([[255 if pixel > 0 else 0 for pixel in row]
                               for row in ggray])
            b = np.array(get_boundry_img_matrix(thresh, bval=1),
                         dtype=np.float32)
            perameter = np.sum(b) / (h * w)
            area = np.sum(
                np.sum([[1.0 for j in range(w) if ggray[i, j]]
                        for i in range(h)]))
            mean_area = area / (h * w)
            r, b, g = np.sum([
                gcolor[i, j] for j in range(gcolor.shape[1])
                for i in range(gcolor.shape[0])
            ],
                             axis=0) / (area * 256)
            _, _, eigen_value = pca(ggray)
            eccentricity = eigen_value[0] / eigen_value[1]
            l = [
                mean_area, perameter, r, b, g, eigen_value[0], eigen_value[1],
                eccentricity
            ]
            ftrain.append(np.array(l))

        for gi in range(len(xctest)):
            gcolor = xctest[gi]
            ggray = xgtest[gi]
            h, w = ggray.shape
            thresh = np.array([[255 if pixel > 0 else 0 for pixel in row]
                               for row in ggray])
            b = np.array(get_boundry_img_matrix(thresh, bval=1),
                         dtype=np.float32)
Code Example #10
#                                      [-1  3 -3  1]   [P0]
# B(t) = T(t)*CP = [t^3 t^2 t^1 t^0] * | 3 -6  3  0| * |P1|
#                                      |-3  3  0  0|   |P2|
#                                      [ 1  0  0  0]   [P3]
#
# t_j = sum(i = 1 : j){d(p_i, p_(i-1))} / sum(i = 1 : N){d(p_i, p_(i-1))}
# d(p_i, p_j) = sqrt((u_i - u_j)^2 + (v_i - v_j)^2)
# ===============================================================================================
#

N_select = 10
# calculate the principal direction; eigenvalues are sorted in ascending order
# set the z value to 0.
points = np.array(points)
points[:, 2] = 0.
eigen_values, eigen_vectors = pca(points)
principal = eigen_vectors[:, -1]
normal = eigen_vectors[:, -2]
print("principal direction is:\n", principal)
print('normal direction is\n', normal)

#show_pca(points, normal)

sampled_points = subsample_along_principal(points, principal, N_select)
show_samples(points, sampled_points)
figure_title = 'curve fitting'
# plot_scatters(points, figure_title)
# plot_1st_order_curve(points, figure_title)
# plot_2nd_order_curve(points, figure_title)
plot_3rd_order_curve(points, figure_title)
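The header comment describes the cubic Bezier form B(t) = T(t) * M * CP and the chord-length parameter values t_j. As an illustration of that math only (not code from the original script), a direct NumPy evaluation could look like this:

import numpy as np

# Cubic Bezier basis matrix from the header comment.
BEZIER_M = np.array([[-1.,  3., -3., 1.],
                     [ 3., -6.,  3., 0.],
                     [-3.,  3.,  0., 0.],
                     [ 1.,  0.,  0., 0.]])

def bezier_point(t, control_points):
    # control_points: (4, d) array holding P0..P3; returns B(t).
    T = np.array([t ** 3, t ** 2, t, 1.0])
    return T @ BEZIER_M @ np.asarray(control_points, dtype=float)

def chord_length_params(pts):
    # t_j = cumulative chord length up to p_j divided by the total length.
    d = np.linalg.norm(np.diff(pts, axis=0), axis=1)
    t = np.concatenate(([0.0], np.cumsum(d)))
    return t / t[-1]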
Code Example #11
File: PCA.py Project: CUGzca/CUGSECourses
    #         n = count
    #         break

    #print("eigValIndice=\n",eigValIndice)
    n_eigValIndice = eigValIndice[-1:-(n + 1):-1]  # [-1, -2): in effect just the first (largest) one
    #print("n_eigValIndice=\n",n_eigValIndice)

    n_eigVect = eigVects[:, n_eigValIndice]  # two-dimensional
    #print("n_eigVect=\n",n_eigVect)

    lowDataMat = newData * n_eigVect
    reconMat = (lowDataMat * n_eigVect.T) + meanVal
    return lowDataMat, reconMat


if '__main__' == __name__:
    data = scipy.io.loadmat("BU3D_feature.mat")
    dataMat = data.get("data")
    y = np.array(dataMat)[:, -1]
    dataMat = np.delete(dataMat, -1, axis=1)
    dataMat, reconMat = pca(dataMat=dataMat, n=2)

    #plotData(dataMat=dataMat,reconMat=reconMat)
    plt.scatter(dataMat[:, 0].tolist(),
                dataMat[:, 1].tolist(),
                marker='o',
                c=y)

    plt.title('PCA')
    plt.show()
Code Example #12
def call(options):
    if options == 1:
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        pca(rowdata, 1)
        openfile(2, 1)
        open_label_file(1)
        appendFile(1)
        convert_to_float(2)
        write_processed_file(1)
        openfile(3, 1)
        strip_empty_chars(2)
        convert_to_float(3)
        pca(testdata, 2)
        openfile(4, 1)
        open_label_file(2)
        appendFile(2)
        convert_to_float(4)
        write_to_new_file(1)
    elif options == 2:
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        corFil(rowdata, 1, attr)
        openfile(2, 2)
        open_label_file(1)
        appendFile(1)
        convert_to_float(2)
        write_processed_file(2)
        openfile(3, 2)
        strip_empty_chars(2)
        convert_to_float(3)
        corFil(testdata, 2, attr)
        openfile(4, 2)
        open_label_file(2)
        appendFile(2)
        convert_to_float(4)
        write_to_new_file(2)
    elif options == 3:
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        varFil(rowdata, 1, attr)
        openfile(2, 3)
        open_label_file(1)
        appendFile(1)
        convert_to_float(2)
        write_processed_file(3)
        openfile(3, 3)
        strip_empty_chars(2)
        convert_to_float(3)
        varFil(testdata, 2, attr)
        openfile(4, 3)
        open_label_file(2)
        appendFile(2)
        convert_to_float(4)
        write_to_new_file(3)
    else:
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        open_label_file(1)
        open_label_file(2)
        appendfile_woreduction(1)
        convert_to_float(1)
        write_processed_file(4)
        openfile(3, 1)
        strip_empty_chars(2)
        convert_to_float(3)
        appendfile_test(1)
        convert_to_float(3)
        write_to_new_file(4)
Code Example #13
File: cfiltering_item.py Project: tanvi92/Python-ML
import os
import pandas as pd
import pickle as pkl

if __name__ == "__main__":

    os.chdir("..")
    ratings_train = pd.load('proc_data/ratings_train.pda')

    accuracyList = list(
    )  # a list of (embedding, deviation_from_rating, misclassification_error) tuples
    for k in range(2, 941, 20):

        # Get item embedding

        itemEmbedding = pca(ratings_train[['userid', 'itemid', 'rating']], k,
                            1).real
        #print itemEmbedding
        print "Computed %d-dimensional user embedding." % (k)

        # Calculate collaborative filtering ratings

        cf = CFilter_item(ratings_train, itemEmbedding, size=20)
        print "Built CFilter object"
        predictedRatings = cf.get_item_cf_rating(ratings_train)

        # Estimate training error in terms of two metrics: average deviation from true
        # rating and average misclassification error in terms of classifying a movie
        # as "good" or "bad".

        predictedLabels = np.array(
            [1 if rat > 3 else 0 for rat in predictedRatings])
Code Example #14
    (predictions, ) = tuple(
        LSTM.predict(input_fn=tf.contrib.timeseries.
                     predict_continuation_input_fn(evaluation, steps=5)))

    observed_times = evaluation["times"][0]
    observed = evaluation["observed"][0, :, :]
    evaluated_times = evaluation["times"][0]
    evaluated = evaluation["mean"][0]
    predicted_times = predictions['times']
    predicted = predictions["mean"]

    return observed, evaluated, predicted


_ = pca()
data = _.start()
group_id = data.Group_ID.unique()
jsn_dict = {}
jsn_Dict = {}
jsn_Dict['Data'] = []

for i in group_id:
    tf.reset_default_graph()
    obsList = []
    evaList = []
    preList = []
    data_group = data[data.Group_ID == i]['Grade']

    if data_group.shape[0] > 50:
        o, e, p = model_h(data_group)
Code Example #15
        label_true = label_test[index_test]
        # compute the accuracy
        if label_vote == label_true:
            count = count + 1

        index_test = index_test + 1
    #print(count)
    accuracy = float(count) / len(dataset_test)
    return accuracy


if __name__ == '__main__':
    # load the datasets
    dataset_train, label_train = load_dataset(
        '../two datasets/sonar-train.txt', ',')
    dataset_test, label_test = load_dataset('../two datasets/sonar-test.txt',
                                            ',')
    # compute the projection matrix from the training set
    K = [10, 20, 30]
    for k in K:
        W = pca(dataset_train, k)

        # compute the dimension-reduced samples
        dataset_train_K = transform(dataset_train, W)
        dataset_test_K = transform(dataset_test, W)

        # compute the accuracy with the 1-NN method
        accuracy = oneNN(dataset_train_K, dataset_test_K, label_train,
                         label_test)
        print "k= %d, accuracy= %f" % (k, accuracy)
Code Example #16
#   ...the covariance was effectively computed twice, which amounts to extracting features
#   from the covariance of data instead of from data itself! That is why the U and V matrices
#   kept coming out wrong, both 1024×1024, when in fact U should be m×m and V should be
#   1024×1024. Their roles: the U matrix reduces rows (take its first k columns as the main
#   features) and the V matrix reduces columns (take its first k rows as the main features).
#   In this example, using the U matrix for the reduction was wrong; the V matrix should be used.
#
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as scio
from PCA import pca, normalize, display
import cap

# ************* Exercise 1: 2D to 1D *************
# Set k in the PCA algorithm to 1 and add visualization
data = scio.loadmat('ex7data1.mat')['X']
data, _ = normalize(data)  # preprocessing: feature scaling
u, v, _ = pca(data)  # run the PCA algorithm
k = 1
v_reduce = v[:k, :]
z = np.matmul(data, v_reduce.T)
data_re = np.matmul(z, v_reduce)
# visualization
plt.plot(data[:, 0], data[:, 1], '.b')  # original samples
plt.plot([v_reduce[0][0], 0], [v_reduce[0][1], 0], '--r')  # eigenvector
plt.plot(data_re[:, 0], data_re[:, 1], 'or')  # input reconstructed after compression
plt.show()

# **************** Exercise 2: image compression ****************
# The dataset has 1000 images; only the first 100 are used for training
data = scio.loadmat('ex7faces.mat')['X'][:100, :]  # load the data
data, data_mean = normalize(data)  # normalization
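The header comment of this example argues that, for an m×1024 data matrix, it is the SVD's V matrix (not U) that reduces the 1024 feature columns, since U is only m×m. A small standalone illustration of that point, not part of the original exercise code:

import numpy as np

# For an m x 1024 centered matrix, U is m x m and Vt is 1024 x 1024; the first
# k rows of Vt reduce the feature columns, as the comment above argues.
rng = np.random.default_rng(0)
X = rng.standard_normal((100, 1024))
X = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X, full_matrices=True)
k = 36
Z = X @ Vt[:k, :].T        # (100, k): column/feature reduction via V
X_rec = Z @ Vt[:k, :]      # reconstruction from the top-k components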
Code Example #17
File: ahc.py Project: ZizhouJia/data-mining-homework
        x, y = get_info(info, n)
        X = get_root(fa, x)
        Y = get_root(fa, y)
        if X != Y:
            fa[X] = Y
            cluster_num = cluster_num - 1
    for i in range(n):
        get_root(fa, i)
    return fa


if __name__ == '__main__':
    clusters = 700
    data, label_family, label_genus, label_species, label_record = data_reader.read_frog_data(
    )
    data = pca(data, 10)
    data = data / data.max(axis=0)
    res = [0 for _ in range(data.shape[0])]
    start_time = time.time()
    cluster_result = agnes(data, clusters)

    cluster_set = set(cluster_result)

    ii = 0
    for index in cluster_set:
        cluster_indexs = [
            i for i, x in enumerate(cluster_result) if x == index
        ]
        for cluster_index in cluster_indexs:
            res[cluster_index] = ii
        ii = ii + 1