import random as rd
import time

import numpy as np
import pandas as pd
import tensorflow as tf
# The model_* classes, test_model, save_value and print_value come from the project's own modules.


def train_model(para, data, path_excel):
    ## data and hyperparameters
    [train_data, train_data_interaction, user_num, item_num, test_data, pre_train_feature,
     hypergraph_embeddings, graph_embeddings, propagation_embeddings, sparse_propagation_matrix, _] = data
    [_, _, MODEL, LR, LAMDA, LAYER, EMB_DIM, BATCH_SIZE, TEST_USER_BATCH, N_EPOCH, IF_PRETRAIN, _, TOP_K] = para[0:13]
    if MODEL == 'LightLCFN':
        [_, _, _, KEEP_PROB, SAMPLE_RATE, GRAPH_CONV, PREDICTION, LOSS_FUNCTION, GENERALIZATION,
         OPTIMIZATION, IF_TRANSFORMATION, ACTIVATION, POOLING] = para[13:]
    if MODEL == 'SGNN':
        [_, PROP_EMB, _] = para[13:]
    para_test = [train_data, test_data, user_num, item_num, TOP_K, TEST_USER_BATCH]

    ## Define the model
    if MODEL == 'MF':
        model = model_MF(n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA)
    if MODEL == 'NCF':
        model = model_NCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                          pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN)
    if MODEL == 'GCMC':
        model = model_GCMC(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN,
                           sparse_graph=sparse_propagation_matrix)
    if MODEL == 'NGCF':
        model = model_NGCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN,
                           sparse_graph=sparse_propagation_matrix)
    if MODEL == 'SCF':
        model = model_SCF(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                          pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN,
                          sparse_graph=sparse_propagation_matrix)
    if MODEL == 'CGMC':
        model = model_CGMC(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN,
                           sparse_graph=sparse_propagation_matrix)
    if MODEL == 'LightGCN':
        model = model_LightGCN(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                               pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN,
                               sparse_graph=sparse_propagation_matrix)
    if MODEL == 'LCFN':
        model = model_LCFN(layer=LAYER, n_users=user_num, n_items=item_num, emb_dim=EMB_DIM, lr=LR, lamda=LAMDA,
                           pre_train_latent_factor=pre_train_feature, if_pretrain=IF_PRETRAIN,
                           graph_embeddings=hypergraph_embeddings)
    if MODEL == 'LightLCFN':
        model = model_LightLCFN(n_users=user_num, n_items=item_num, lr=LR, lamda=LAMDA, emb_dim=EMB_DIM, layer=LAYER,
                                pre_train_latent_factor=pre_train_feature, graph_embeddings=graph_embeddings,
                                graph_conv=GRAPH_CONV, prediction=PREDICTION, loss_function=LOSS_FUNCTION,
                                generalization=GENERALIZATION, optimization=OPTIMIZATION, if_pretrain=IF_PRETRAIN,
                                if_transformation=IF_TRANSFORMATION, activation=ACTIVATION, pooling=POOLING)
    if MODEL == 'SGNN':
        model = model_SGNN(n_users=user_num, n_items=item_num, lr=LR, lamda=LAMDA, emb_dim=EMB_DIM, layer=LAYER,
                           pre_train_latent_factor=pre_train_feature, propagation_embeddings=propagation_embeddings,
                           if_pretrain=IF_PRETRAIN, prop_emb=PROP_EMB)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    ## Split the training samples into batches
    batches = list(range(0, len(train_data_interaction), BATCH_SIZE))
    batches.append(len(train_data_interaction))

    ## Training iteratively
    F1_max = 0
    F1_df = pd.DataFrame(columns=TOP_K)
    NDCG_df = pd.DataFrame(columns=TOP_K)
    t1 = time.clock()
    for epoch in range(N_EPOCH):
        for batch_num in range(len(batches) - 1):
            train_batch_data = []
            for sample in range(batches[batch_num], batches[batch_num + 1]):
                (user, pos_item) = train_data_interaction[sample]
                sample_num = 0
                while sample_num < (SAMPLE_RATE if MODEL == 'LightLCFN' else 1):
                    neg_item = int(rd.uniform(0, item_num))
                    if not (neg_item in train_data[user]):  # keep only items the user has not interacted with
                        sample_num += 1
                        train_batch_data.append([user, pos_item, neg_item])
            train_batch_data = np.array(train_batch_data)
            _, loss = sess.run([model.updates, model.loss],
                               feed_dict={model.users: train_batch_data[:, 0],
                                          model.pos_items: train_batch_data[:, 1],
                                          model.neg_items: train_batch_data[:, 2],
                                          model.keep_prob: KEEP_PROB if MODEL == 'LightLCFN' else 1})
        ## test the model each epoch
        F1, NDCG = test_model(sess, model, para_test)
        F1_max = max(F1_max, F1[0])
        ## print performance
        # print_value([epoch + 1, loss, F1_max, F1, NDCG])
        if epoch % 10 == 0:
            print('%.5f' % F1_max, end=' ', flush=True)
        ## save performance
        F1_df.loc[epoch + 1] = F1
        NDCG_df.loc[epoch + 1] = NDCG
        save_value([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)
        if loss > 10 ** 10:  # stop early if the loss diverges
            break
    t2 = time.clock()
    print('time cost:', (t2 - t1) / 200)  # note: the divisor 200 is hard-coded
    return F1_max
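

# A minimal, self-contained sketch of the per-interaction negative sampling used in the
# training loop above: for each observed (user, positive item) pair, draw items uniformly
# at random and keep only those the user has not interacted with. `build_bpr_batch` is a
# hypothetical helper name, not part of this repo; it assumes `train_data` maps each user
# to the collection of that user's interacted item ids, as in train_model.
def build_bpr_batch(interactions, train_data, item_num, sample_rate=1):
    batch = []
    for user, pos_item in interactions:
        drawn = 0
        while drawn < sample_rate:
            neg_item = int(rd.uniform(0, item_num))
            if neg_item not in train_data[user]:  # reject items the user already interacted with
                drawn += 1
                batch.append([user, pos_item, neg_item])
    return batch

# Example: two users, four items; user 0 has seen items {0, 1}, user 1 has seen {2}.
# build_bpr_batch([(0, 1), (1, 2)], {0: {0, 1}, 1: {2}}, item_num=4, sample_rate=2)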
for key in list_index:
    value = para_dict.get(key)
    if len(value) > 1:  # this parameter has more than one recorded value (value is a set)
        changed_para.append(key)  # so its value changed across the experiments
    temp_list = []
    temp_list.append(key)
    for v in value:
        temp_list.append(v)  # join the parameter name and its values into one list
    para_df = para_df.append(pd.DataFrame(temp_list).T)  # store the list as a row of the dataframe
para_df = para_df.set_index(0)  # use column 0 (the parameter names) as the index
changed_para_str = ', '.join(changed_para)  # join the names of the changed parameters; used in the output file name
path_write = path + '\\data_collection' + '\\' + model_dataset + '_' + changed_para_str + '_' + str(
    int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx'
save_value([[para_df, 'Parameters']], path_write, first_sheet=True)  # save every parameter name and value that appeared
if len(changed_para) == 1 or len(changed_para) == 0:  # exactly one parameter changed (no change is treated as the same special case)
    if len(changed_para) == 1:
        index_name = str(changed_para[0])
        print("(*/ω\*) ", model_dataset + ": " + changed_para[0] + " is the variable")  # report the parameter that changed
    else:
        print("o(≧口≦)o ", model_dataset + ": there are no changed parameters")
        index_name = "dataset"
    top_k = str(para_df.loc['top_k', 1])  # clean up the string representation of top_k
    top_k = top_k.strip(',')
    top_k = top_k.strip('[')
    top_k = top_k.strip(']')
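

# A minimal sketch of the rule applied in the block above: given a mapping from parameter
# name to the set of values observed across experiment files, a parameter counts as
# "changed" exactly when more than one value was recorded for it. `find_changed_parameters`
# is a hypothetical helper, not a function of this repo; it only illustrates how
# `changed_para` and the output file name are derived.
def find_changed_parameters(para_dict):
    changed = [key for key, values in para_dict.items() if len(values) > 1]
    return changed, ', '.join(changed)

# Example: only LR varies across the collected runs.
# find_changed_parameters({'LR': {0.001, 0.01}, 'EMB_DIM': {64}, 'MODEL': {'LCFN'}})
# -> (['LR'], 'LR')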
for e in wb.sheetnames:
    sheets.add(e)
sheets = list(sheets)
sheets.sort()
parameter = pd.DataFrame(pd.read_excel(path_read + '\\' + value[0], sheetname=0, header=None,
                                        index_col=0))  # load the parameters of the file as a dataframe, with column 0 as the index
parameter.index.name = 'para'  # name the rows and columns of the dataframe
parameter.columns.name = 'value'
dataset = str(parameter.loc['DATASET', 1])  # keep the dataset and model of the file for the output file name
model = str(parameter.loc['MODEL', 1])
# eta = str(parameter.loc['eta', 1])
# lambda_r = str(parameter.loc['lambda_r', 1])
path_write = path + '\\data_process\\' + dataset + '_' + model + '_' + str(int(time.time())) + str(
    int(random.uniform(100, 900))) + '.xlsx'  # path of the output file
# path_write = path + '\\data_process\\' + dataset + '_' + model + '_eta=' + eta + '_lambda=' + lambda_r + '_' + str(int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx'
save_value([[parameter, 'Parameters']], path_write, first_sheet=True)  # save the parameters into the excel file
for sheet in sheets:  # process every sheet of the workbook, including F1 and NDCG
    if operator.eq(sheet, 'Parameters') == 0 and operator.eq(sheet, 'Filename') == 0:  # only process sheets that are neither Parameters nor Filename
        df_max = pd.DataFrame()  # results for this sheet (F1 or NDCG) built from the max value of each source file
        df_top = pd.DataFrame()  # results built from the top_ave averages of each source file
        for file_p in value:  # process every file in the list stored under this key
            temp_f = load_workbook(path_read + '\\' + file_p)
            temp_sn = temp_f.sheetnames
            if sheet in temp_sn:
                metric = pd.DataFrame(pd.read_excel(path_read + '\\' + file_p, sheetname=sheet, header=0,
                                                    index_col=0))  # read one sheet of one file
                list_max = process_metric(metric, method='max', para=top_ave)  # process the metric to obtain F1_max or NDCG_max; the result is one row
                list_top = process_metric(metric, method='top', para=top_ave)  # process the metric to obtain F1_top or NDCG_top; the result is one row
                df_max = df_max.append(list_max, ignore_index=True)  # append list_max and list_top to their dataframes
                df_top = df_top.append(list_top, ignore_index=True)
        # after this loop, each row of df_top and df_max comes from one experiment result file
        df_top = df_top.append(df_top.mean(), ignore_index=True)  # finally append the average over all runs to each dataframe
        df_max = df_max.append(df_max.mean(), ignore_index=True)
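

# The repo's process_metric is not shown in this section; the sketch below is a hedged
# guess at what the two calls above compute, under the assumption that `metric` is a
# DataFrame with one row per epoch and one column per top_k cutoff. With method='max' it
# would keep the best value of each column over all epochs, and with method='top' it would
# average the `para` largest values of each column. The name `process_metric_sketch` and
# this behavior are assumptions for illustration only.
def process_metric_sketch(metric, method='max', para=10):
    if method == 'max':
        return metric.max()  # column-wise best epoch
    if method == 'top':
        return metric.apply(lambda col: col.nlargest(para).mean())  # average of the `para` best epochs per column
    raise ValueError('unknown method: %s' % method)

# Example with two epochs and two cutoffs:
# process_metric_sketch(pd.DataFrame({'@10': [0.05, 0.07], '@20': [0.08, 0.06]}), method='max')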