def run(self):
    db = self.db
    N = db['N']

    # Build the centering matrix H = I - (1/N) * ones if it has not been set yet.
    if db['H_matrix'] is None:
        db['H_matrix'] = np.eye(N) - np.ones((N, N)) / N

    if db['kernel_type'] == 'Linear Kernel':
        optimize_linear_kernel(db)
    elif db['kernel_type'] == 'Gaussian Kernel':
        output = np.empty((N, 1), dtype=np.float32)

        if db['prev_clust'] == 0:
            # First clustering pass: fit on the raw data.
            self.kdac.SetupParams(self.params)
            self.kdac.Fit(db['data'], N, db['d'])
        elif db['with_predefined_clustering']:
            # Fit against an existing label matrix Y.
            self.kdac.SetupParams(self.params)
            self.kdac.Fit(db['data'], N, db['d'], db['Y_matrix'], N, db['C_num'])
        else:
            # Subsequent alternative-clustering passes reuse the fitted state.
            self.kdac.SetupParams(self.params)
            self.kdac.Fit()

        self.kdac.Predict(output, N, 1)
        db['allocation'] = output.T[0]
        db['allocation'] = db['allocation'].astype(np.int32)
        db['allocation'] += 1  # cluster labels start from 1 instead of 0

        # Convert from allocation to a one-hot binary_allocation matrix.
        db['binary_allocation'] = np.zeros((N, db['C_num']))
        for m in range(db['allocation'].shape[0]):
            db['binary_allocation'][m, int(db['allocation'][m]) - 1] = 1

        if db['Y_matrix'].shape[0] == 0:
            db['Y_matrix'] = db['binary_allocation']
        else:
            db['Y_matrix'] = np.append(db['Y_matrix'], db['binary_allocation'], axis=1)

        db['prev_clust'] += 1
        return
    elif db['kernel_type'] == 'Polynomial Kernel':
        optimize_polynomial_kernel(db)
    else:
        raise ValueError('Error: an unknown kernel type was used.')

    normalize_each_U_row(db)
    K_means(db)
    db['prev_clust'] += 1
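# Standalone sketch (not part of the original class): the allocation-to-one-hot
# conversion loop above can be vectorized with numpy fancy indexing. The
# function name and variables here are illustrative only.
import numpy as np

def to_binary_allocation(allocation, C_num):
    """One-hot encode 1-based cluster labels into an (N, C_num) matrix."""
    N = allocation.shape[0]
    binary = np.zeros((N, C_num))
    binary[np.arange(N), allocation.astype(int) - 1] = 1
    return binary

# Example: labels 1..3 for five samples.
# to_binary_allocation(np.array([1, 3, 2, 1, 3]), 3)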
import os
import time

import matplotlib.pyplot as plt
import numpy as np


def draw_Image(points, m, original_data, vertical_digit, horizontal_digit):
    # Optionally preview the original image:
    # plt.figure(figsize=(12, 9))
    # plt.imshow(original_data)  # needs plt.show() to render
    # plt.axis('off')
    # plt.show()

    # Create the output folder.
    if not os.path.exists("../Home1.2_Image"):
        os.mkdir("../Home1.2_Image")

    for i in [2, 3, 5, 10, 20, 50]:
        # Create an output subfolder per cluster count.
        filePackege = "../Home1.2_Image/K_equals" + str(i) + "/"
        if not os.path.exists(filePackege):
            os.mkdir(filePackege)
        print('Number of clusters:', i)

        starttime = time.time()
        # Cluster the pixels with the K-means method.
        c_new, labels = K_means(points, i, m)
        # Alternatively, use the K-medoids method:
        # c_new, labels = K_metroid(points, i, m)
        endtime = time.time()
        runningTime = endtime - starttime
        runningtimeAnounce = ("for K= " + str(i) + " the running time is "
                              + str(runningTime) + "s")

        # Rebuild the image as a 3-D array from the cluster assignments.
        new_image = np.zeros((vertical_digit, horizontal_digit, 3))
        labels = np.array(labels)
        for j in range(m):
            row = j // horizontal_digit
            column = j % horizontal_digit
            new_image[row, column, :] = c_new[labels[j], :]

        # Reshape the centroids into a 1 x i strip and plot them.
        new_centroid = np.zeros((1, i, 3))
        for j in range(i):
            new_centroid[0, j, :] = c_new[j, :]

        plt.figure(figsize=(12, 1))
        plt.imshow(new_centroid.astype('uint8'))
        plt.axis('off')
        fileaddressKdemo = "../Home1.2_Image/" + str(i) + "K.jpg"
        plt.savefig(fileaddressKdemo)
        # plt.show()

        plt.figure(figsize=(12, 9))
        plt.imshow(new_image.astype('uint8'))
        plt.axis('off')
        plt.text(12, 4, runningtimeAnounce)
        fileaddressdemo = filePackege + str(i) + "demo.jpg"
        plt.savefig(fileaddressdemo)
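# Hypothetical usage sketch (not in the original file): draw_Image expects the
# image pixels flattened into an (m, 3) point array, and K_means (and
# optionally K_metroid) must already be defined in scope with the signature
# used above, returning (centroids, labels).
#
# image = plt.imread("sample.jpg")          # (H, W, 3) array
# H, W = image.shape[0], image.shape[1]
# points = image.reshape(H * W, 3).astype(float)
# draw_Image(points, H * W, image, H, W)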
def run(self):
    db = self.db
    N = db['N']

    if db['data_type'] == 'Feature Matrix':
        db['data'] = self.center_data(db['data'])

    if db['kernel_type'] == 'Linear Kernel':
        optimize_linear_kernel(db)
    elif db['kernel_type'] == 'Gaussian Kernel':
        optimize_gaussian_kernel(db)
    elif db['kernel_type'] == 'Polynomial Kernel':
        optimize_polynomial_kernel(db)
    else:
        raise ValueError('Error: an unknown kernel type was used.')

    normalize_each_U_row(db)
    K_means(db)
    db['prev_clust'] += 1
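# Demonstration sketch (not part of the original module): the centering matrix
# H = I - (1/N) * ones built in the first run() above is equivalent to
# subtracting each column's mean from the data, which is presumably what
# center_data does here.
import numpy as np

N, d = 6, 3
X = np.random.rand(N, d)
H = np.eye(N) - np.ones((N, N)) / N
assert np.allclose(H @ X, X - X.mean(axis=0))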
import os
from datetime import datetime

import pandas as pd
from openpyxl import load_workbook
from sklearn.model_selection import train_test_split

# Project-local modules.
import data_preparation
import K_means
import IntConf
import prediction


def workflow(fichier, fichier_sortie, path, path_rslt, taille_ech, suffix_table,
             begin_distributed, fich_vae, fich_deb):
    #=======================================================================
    # Statistical parameters
    nb_clusters = taille_ech // 1000
    R_cont_y = 0.25; R_Cramer_y = 0.20; R_cont_x = 0.85; R_Cramer_x = 0.80
    method_disc = 'Cramer'; method_continuous = 'regression'
    method_prediction = 'Ridge'  # 'random_forest'
    dic = {'SGMT_PF_V4': ['SGMT_PF_V4', 1],
           'SGMT_PF_AXE_FIDELITE_V4': ['SGMT_PF_AXE_FIDELITE_V4', 3],
           'REVENU_EST_M': ['REVENU_EST_M', 3]}

    #=======================================================================
    # Data preparation
    begin = datetime.now()
    dfX, Y, R_dico = data_preparation.main(
        fichier=fichier, fichier_sortie=fichier_sortie, method_disc=method_disc,
        method_continuous=method_continuous, taille_ech=taille_ech,
        R_cont_y=R_cont_y, R_Cramer_y=R_Cramer_y, R_cont_x=R_cont_x,
        R_Cramer_x=R_Cramer_x, dic=dic, normalize=True, verbose=True,
        path_rslt=path_rslt, suffix_table=suffix_table)
    delta_time = round((datetime.now() - begin).total_seconds(), 1)
    print('Data prepared in ' + str(delta_time) + 's')

    #=======================================================================
    # K-means
    begin = datetime.now()
    repartition = K_means.K_means(dfX, Y, nb_clusters_init=nb_clusters,
                                  methode_prediction=method_prediction)
    delta_time = round((datetime.now() - begin).total_seconds(), 1)
    print('Clusters computed in ' + str(delta_time) + 's')

    #=======================================================================
    # Confidence intervals
    begin = datetime.now()
    cluster_stat = IntConf.IntConf(repartition, alpha=0.85,
                                   path_rslt=path_rslt,
                                   suffix_table=suffix_table)
    print(cluster_stat)
    delta_time = round((datetime.now() - begin).total_seconds(), 1)
    print('Confidence interval computed in ' + str(delta_time) + 's')

    #=======================================================================
    # Prediction
    index = ~Y.isnull()
    dfX_train, dfX_test, Y_train, Y_test = train_test_split(
        dfX[index], Y[index], test_size=0.4, random_state=44)
    del dfX_train, dfX_test, Y_train
    result, result_test = prediction.IC_reg(repartition, dfX, Y,
                                            path_rslt=path_rslt,
                                            suffix_table=suffix_table)
    result_1 = prediction.get_result(repartition, dfX, Y, Y_test,
                                     method=method_prediction)
    #prediction.regression_graph(Y_real=result_1['Y_real'], Y_pred=result_1['Y_pred'], col='black')
    score_abs, score_rel = prediction.get_regression_score(
        Y_real=result_1['Y_real'], Y_pred=result_1['Y_pred'])
    print('Mean prediction error: ' + str(score_abs) + ' (' + str(score_rel) + '%)')
    curve_Recall, AUC_ROC = prediction.classification_curve(
        Y_real=result_1['Y_real'], Y_pred=result_1['Y_pred'],
        threshold_rich=100000, col='black')
    print('AUC_ROC: ' + str(AUC_ROC))
    print('AUC Precision-Recall: ' + str(curve_Recall))
    #prediction.graph_variable_influence(dfX, result_1, col='black')

    #=======================================================================
    # Parameters file
    if not os.path.exists(path + "Compare.xlsx"):
        S = pd.DataFrame(columns=["reference", "data_x", "data_y", "method disc",
                                  "method continuous", "method prevision", "Date",
                                  "Nb ligne", "% NaN", "k cluster", "R_cont_y",
                                  "R_Cramer_y", "R_cont_x", "R_Cramer_x",
                                  "Nb var quali", "Nb var quant",
                                  "Nb regroupement", "score", "score relative",
                                  "AUC ROC", "curve Recall", "temps de calcul"])
        S.to_excel(path + "Compare.xlsx", encoding="utf-8", index=False)

    Nb_var_quant = len(R_dico['variables continues gardees'])
    Nb_var_quali = len(R_dico['variables discretes gardees'])
    Nb_regroupement = len(R_dico['groupe variables'])
    date = "%s" % datetime.now()
    NaN = len(Y[Y.isnull()]) / Y.shape[0]
    delta_time = round((datetime.now() - begin_distributed).total_seconds(), 1)

    wb = load_workbook(path + "Compare.xlsx")
    sheet = wb['Sheet1']  # wb.get_sheet_by_name is deprecated in openpyxl
    reference = sheet.max_row + 1
    ABC = list("ABCDEFGHIJKLMNOPQRSTUV")
    liste = [reference, fich_deb, fich_vae, method_disc, method_continuous,
             method_prediction, date, taille_ech, str(round(NaN, 2)),
             nb_clusters, R_cont_y, R_Cramer_y, R_cont_x, R_Cramer_x,
             Nb_var_quali, Nb_var_quant, Nb_regroupement, score_abs,
             str(score_rel) + '%', str(AUC_ROC), str(curve_Recall),
             str(delta_time) + 's']
    for i in range(len(ABC)):
        sheet[ABC[i] + str(reference)].value = liste[i]
    wb.save(path + "Compare.xlsx")

    mon_fichier = open(path_rslt + "fichier" + suffix_table + ".txt", "w")
    mon_fichier.write('list of qualitative variables:')
    # DataFrame.sort was removed from pandas; use sort_values instead.
    a = R_dico['variables discretes gardees'].sort_values('R2', ascending=False)
    for i in a.values:
        mon_fichier.write('\n ' + str(i))
    a = R_dico['variables continues gardees'].sort_values('R2', ascending=False)
    mon_fichier.write('\n \nlist of quantitative variables:')
    for i in a.values:
        mon_fichier.write('\n ' + str(i))
    mon_fichier.write('\n \nlist of variable groups:')
    mon_fichier.write('\n R², group of correlated variables, representative variable')
    for i in R_dico['groupe variables'].values:
        mon_fichier.write('\n ' + str(i))
    delta_time = round((datetime.now() - begin_distributed).total_seconds(), 1)
    mon_fichier.write('\n \nComputation time: ' + str(delta_time) + 's')
    mon_fichier.close()
    print('Total computation time for this batch: ' + str(delta_time) + 's')
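# Hypothetical invocation sketch (file names and paths are illustrative only):
#
# from datetime import datetime
#
# workflow(fichier="donnees.csv",
#          fichier_sortie="sortie.csv",
#          path="./",
#          path_rslt="./resultats/",
#          taille_ech=50000,
#          suffix_table="_v1",
#          begin_distributed=datetime.now(),
#          fich_vae="vae.csv",
#          fich_deb="deb.csv")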