Example #1
0
    def run(self):
        """Run one clustering pass for the kernel configured in self.db.

        Dispatches on db['kernel_type'] and mutates db in place:
        lazily builds the centering matrix, runs the kernel optimizer
        (or the KDAC solver for the Gaussian kernel), and increments
        db['prev_clust'].

        Raises:
            ValueError: if db['kernel_type'] is not a known kernel.
        """
        db = self.db
        N = db['N']

        # Lazily build the centering matrix H = I - (1/N) * 11^T.
        # NOTE: use an identity test, not type(...) == type(None).
        if db['H_matrix'] is None:
            db['H_matrix'] = np.eye(N) - np.ones((N, N)) / N

        if db['kernel_type'] == 'Linear Kernel':
            optimize_linear_kernel(db)
        elif db['kernel_type'] == 'Gaussian Kernel':
            output = np.empty((N, 1), dtype=np.float32)

            if db['prev_clust'] == 0:
                # First clustering view: fit on the raw data only.
                self.kdac.SetupParams(self.params)
                self.kdac.Fit(db['data'], N, db['d'])
            elif db['with_predefined_clustering']:
                # Alternative view conditioned on an existing labeling Y.
                self.kdac.SetupParams(self.params)
                self.kdac.Fit(db['data'], N, db['d'], db['Y_matrix'], N,
                              db['C_num'])
            else:
                # Subsequent views: reuse previously fitted state.
                self.kdac.SetupParams(self.params)
                self.kdac.Fit()

            self.kdac.Predict(output, N, 1)

            # astype() returns a NEW array (the original discarded it),
            # so assign the result; labels start from 1 instead of 0.
            db['allocation'] = output.T[0].astype(np.int32) + 1

            # Convert from allocation to one-hot binary_allocation.
            db['binary_allocation'] = np.zeros((N, db['C_num']))
            for m in range(db['allocation'].shape[0]):
                db['binary_allocation'][m, int(db['allocation'][m]) - 1] = 1

            # Accumulate each view's labeling as extra columns of Y.
            if db['Y_matrix'].shape[0] == 0:
                db['Y_matrix'] = db['binary_allocation']
            else:
                db['Y_matrix'] = np.append(db['Y_matrix'],
                                           db['binary_allocation'],
                                           axis=1)

            db['prev_clust'] += 1
            return

        elif db['kernel_type'] == 'Polynomial Kernel':
            optimize_polynomial_kernel(db)
        else:
            raise ValueError('Error : unknown kernel was used.')

        normalize_each_U_row(db)
        K_means(db)
        db['prev_clust'] += 1
def draw_Image(points, m, original_data, vertical_digit, horizontal_digit):
    """Cluster the image pixels for several K values and save result images.

    For each K in {2, 3, 5, 10, 20, 50}: run K_means on `points`, rebuild
    the image with every pixel painted in its cluster-centroid colour, and
    save both a centroid colour strip and the reconstructed image under
    ../Home1.2_Image/.

    Args:
        points: pixel data passed through to K_means.
        m: number of pixels to cluster (vertical_digit * horizontal_digit
           at most -- assumed, TODO confirm against caller).
        original_data: kept for interface compatibility; only used by the
           commented-out preview code in the original.
        vertical_digit, horizontal_digit: image height and width in pixels.
    """
    import os

    # Create the root output folder (use os directly, not numpy's
    # internal np.os re-export).
    if not os.path.exists("../Home1.2_Image"):
        os.mkdir("../Home1.2_Image")

    for i in [2, 3, 5, 10, 20, 50]:
        # Per-K output folder.
        filePackege = "../Home1.2_Image/K_equals" + str(i) + "/"
        if not os.path.exists(filePackege):
            os.mkdir(filePackege)
        print('Number of clusters:', i)

        # Time the clustering step (K_metroid is the alternative method).
        starttime = time.time()
        c_new, labels = K_means(points, i, m)
        # c_new, labels = K_metroid(points, i, m)
        runningTime = time.time() - starttime
        runningtimeAnounce = "for K= " + str(
            i) + " the running time is " + str(runningTime) + "s"

        labels = np.array(labels)

        # Build the image: paint pixel j with its centroid colour.
        # Vectorized replacement for the original per-pixel while loop.
        new_image = np.zeros((vertical_digit, horizontal_digit, 3))
        new_image.reshape(-1, 3)[:m] = c_new[labels[:m]]

        # Centroid strip: a single row of i colour patches.
        new_centroid = np.zeros((1, i, 3))
        new_centroid[0] = c_new[:i]

        # Save the centroid strip.
        plt.figure(figsize=(12, 1))
        plt.imshow(new_centroid.astype('uint8'))
        plt.axis('off')
        fileaddressKdemo = "../Home1.2_Image/" + str(i) + "K.jpg"
        plt.savefig(fileaddressKdemo)

        # Save the reconstructed image, annotated with the running time.
        plt.figure(figsize=(12, 9))
        plt.imshow(new_image.astype('uint8'))
        plt.axis('off')
        plt.text(12, 4, runningtimeAnounce)
        fileaddressdemo = filePackege + str(i) + "demo.jpg"
        plt.savefig(fileaddressdemo)
Example #3
0
    def run(self):
        """Center the data if needed, optimize the configured kernel,
        then normalize U's rows, run K-means, and bump the view counter.

        Mutates self.db in place.

        Raises:
            ValueError: if db['kernel_type'] is not a known kernel.
        """
        db = self.db

        # Feature matrices are centered before kernel optimization;
        # precomputed kernel matrices are used as-is.
        if db['data_type'] == 'Feature Matrix':
            db['data'] = self.center_data(db['data'])

        if db['kernel_type'] == 'Linear Kernel':
            optimize_linear_kernel(db)
        elif db['kernel_type'] == 'Gaussian Kernel':
            optimize_gaussian_kernel(db)
        elif db['kernel_type'] == 'Polynomial Kernel':
            optimize_polynomial_kernel(db)
        else:
            raise ValueError('Error : unknown kernel was used.')

        normalize_each_U_row(db)
        K_means(db)
        db['prev_clust'] += 1
Example #4
0
                                       R_Cramer_y=R_Cramer_y,
                                       R_cont_x=R_cont_x,
                                       R_Cramer_x=R_Cramer_x,
                                       dic=dic,
                                       normalize=True,
                                       verbose=True,
                                       path_rslt)

delta_time = round((datetime.now() - begin).total_seconds(), 1)
print('Données préparées en ' + str(delta_time) + 's')

#============================================================================
#               K-means
begin = datetime.now()
repartition = K_means.K_means(dfX,
                              Y,
                              nb_clusters_init=nb_clusters,
                              methode_prediction=method_prediction)
delta_time = round((datetime.now() - begin).total_seconds(), 1)
print('Clusters effectués en ' + str(delta_time) + 's')

#===========================================================================
#              IC
begin = datetime.now()
# path_rslt must be passed by keyword: a positional argument cannot
# follow keyword arguments (the original line was a SyntaxError).
cluster_stat = IntConf.IntConf(repartition, alpha=0.85, path_rslt=path_rslt)
print(cluster_stat)
delta_time = round((datetime.now() - begin).total_seconds(), 1)
print('Intervalle de confiance éffectué en ' + str(delta_time) + 's')

#===========================================================================
#              Prediction
index = ~Y.isnull()
Example #5
0
def workflow(fichier, fichier_sortie, path, path_rslt, taille_ech,
             suffix_table, begin_distributed, fich_vae, fich_deb):
    """Full pipeline: data preparation, K-means clustering, confidence
    intervals, prediction scoring, and result logging.

    Args:
        fichier, fichier_sortie: input/output identifiers forwarded to
            data_preparation.main.
        path: folder holding the Compare.xlsx summary workbook.
        path_rslt: folder where per-run result files are written.
        taille_ech: sample size; also drives the number of clusters.
        suffix_table: suffix appended to result table / file names.
        begin_distributed: datetime marking the start of the distributed
            run, used for total elapsed time.
        fich_vae, fich_deb: source-file labels written to the workbook.
    """
    #=======================================================================
    #                Statistical parameters
    nb_clusters = taille_ech // 1000
    R_cont_y = 0.25
    R_Cramer_y = 0.20
    R_cont_x = 0.85
    R_Cramer_x = 0.80
    method_disc = 'Cramer'
    method_continuous = 'regression'
    method_prediction = 'Ridge'  # alternative: 'random_forest'
    dic = {'SGMT_PF_V4': ['SGMT_PF_V4', 1],
           'SGMT_PF_AXE_FIDELITE_V4': ['SGMT_PF_AXE_FIDELITE_V4', 3],
           'REVENU_EST_M': ['REVENU_EST_M', 3]}

    #=======================================================================
    #                 Data preparation
    begin = datetime.now()
    dfX, Y, R_dico = data_preparation.main(
        fichier=fichier, fichier_sortie=fichier_sortie,
        method_disc=method_disc, method_continuous=method_continuous,
        taille_ech=taille_ech,
        R_cont_y=R_cont_y, R_Cramer_y=R_Cramer_y,
        R_cont_x=R_cont_x, R_Cramer_x=R_Cramer_x, dic=dic,
        normalize=True, verbose=True,
        path_rslt=path_rslt, suffix_table=suffix_table)

    delta_time = round((datetime.now() - begin).total_seconds(), 1)
    print('Données préparées en ' + str(delta_time) + 's')

    #=======================================================================
    #               K-means
    begin = datetime.now()
    repartition = K_means.K_means(dfX, Y, nb_clusters_init=nb_clusters,
                                  methode_prediction=method_prediction)
    delta_time = round((datetime.now() - begin).total_seconds(), 1)
    print('Clusters effectués en ' + str(delta_time) + 's')

    #=======================================================================
    #              Confidence intervals (IC)
    begin = datetime.now()
    cluster_stat = IntConf.IntConf(repartition, alpha=0.85,
                                   path_rslt=path_rslt,
                                   suffix_table=suffix_table)
    print(cluster_stat)
    delta_time = round((datetime.now() - begin).total_seconds(), 1)
    print('Intervalle de confiance éffectué en ' + str(delta_time) + 's')

    #=======================================================================
    #              Prediction
    index = ~Y.isnull()
    # Only Y_test is needed; the train-side splits were deleted
    # immediately in the original, so discard them outright.
    _, _, _, Y_test = train_test_split(dfX[index], Y[index],
                                       test_size=0.4, random_state=44)

    result, result_test = prediction.IC_reg(repartition, dfX, Y,
                                            path_rslt=path_rslt,
                                            suffix_table=suffix_table)
    result_1 = prediction.get_result(repartition, dfX, Y, Y_test,
                                     method=method_prediction)
    score_abs, score_rel = prediction.get_regression_score(
        Y_real=result_1['Y_real'], Y_pred=result_1['Y_pred'])
    print('Erreur moyenne de prevision: ' + str(score_abs) + ' (' + str(score_rel) + '%)')
    curve_Recall, AUC_ROC = prediction.classification_curve(
        Y_real=result_1['Y_real'], Y_pred=result_1['Y_pred'],
        threshold_rich=100000, col='black')
    print('AUC_ROC: ' + str(AUC_ROC))
    print('AUC Precision-Recall: ' + str(curve_Recall))

    #=======================================================================
    #              Parameters summary workbook
    compare_path = path + "Compare.xlsx"
    if not os.path.exists(compare_path):
        # First run: create the workbook with only its header row.
        # (the `encoding` argument was dropped: it is ignored for xlsx
        # and was removed from to_excel in pandas 2.0)
        S = pd.DataFrame(columns=["reference", "data_x", "data_y",
                                  "method disc", "method continuous",
                                  "method prevision", "Date", "Nb ligne",
                                  "% NaN", "k cluster", "R_cont_y",
                                  "R_Cramer_y", "R_cont_x", "R_Cramer_x",
                                  "Nb var quali", "Nb var quant",
                                  "Nb regroupement", "score",
                                  "score relative", "AUC ROC",
                                  "curve Recall", "temps de calcul"])
        S.to_excel(compare_path, index=False)

    Nb_var_quant = len(R_dico['variables continues gardees'])
    Nb_var_quali = len(R_dico['variables discretes gardees'])
    Nb_regroupement = len(R_dico['groupe variables'])
    date = "%s" % datetime.now()
    # Fraction of missing target values.
    NaN = Y.isnull().mean()
    delta_time = round((datetime.now() - begin_distributed).total_seconds(), 1)

    wb = load_workbook(compare_path)
    # wb['Sheet1'] replaces the deprecated get_sheet_by_name().
    sheet = wb['Sheet1']
    reference = sheet.max_row + 1
    columns = list("ABCDEFGHIJKLMNOPQRSTUV")
    row_values = [reference, fich_deb, fich_vae, method_disc,
                  method_continuous, method_prediction, date, taille_ech,
                  str(round(NaN, 2)), nb_clusters, R_cont_y, R_Cramer_y,
                  R_cont_x, R_Cramer_x, Nb_var_quali, Nb_var_quant,
                  Nb_regroupement, score_abs, str(score_rel) + '%',
                  str(AUC_ROC), str(curve_Recall), str(delta_time) + 's']
    # Write one summary row at the first free row of the sheet.
    for col, value in zip(columns, row_values):
        sheet[col + str(reference)].value = value
    wb.save(compare_path)

    # Per-run text report; `with` guarantees the handle is closed.
    with open(path_rslt + "fichier" + suffix_table + ".txt", "w") as mon_fichier:
        mon_fichier.write('liste variables qualitatives :')
        # DataFrame.sort() was removed from pandas; sort_values is the
        # modern equivalent of .sort(['R2'], ascending=[False]).
        a = R_dico['variables discretes gardees'].sort_values(
            'R2', ascending=False)
        for row in a.values:
            mon_fichier.write('\n        ' + str(row))

        a = R_dico['variables continues gardees'].sort_values(
            'R2', ascending=False)
        mon_fichier.write('\n \nliste variables quantitatives :')
        for row in a.values:
            mon_fichier.write('\n        ' + str(row))

        mon_fichier.write('\n \nliste groupe variables :')
        mon_fichier.write('\n          R²,    groupe variables correlees,    variable representante')
        for row in R_dico['groupe variables'].values:
            mon_fichier.write('\n        ' + str(row))

        delta_time = round(
            (datetime.now() - begin_distributed).total_seconds(), 1)
        mon_fichier.write('\n \nTemps de calcul est ' + str(delta_time) + 's')
    print('Temps total de calcul du paquet est ' + str(delta_time) + 's')