def pop():
    """Evaluate the popularity-only baseline on the test set.

    Every candidate API of every test mashup is scored with the value stored
    for it in ``api2pop``.  The scores are evaluated with ``evalute`` and the
    result is recorded with ``summary`` under the table name
    ``<dataset name>pop``.

    :return: None
    """
    # Only the second value (api id -> pop) is needed here.
    _, api2pop = meta_data.pd.get_api_co_vecs()
    ds = data_repository.get_ds()
    top_ks = data_repository.get_args().topKs

    candidate_ids_list = []
    all_predict_results = []
    # The two test lists are parallel: one candidate list per test mashup.
    for _, candidate_ids in zip(ds.test_mashup_id_list, ds.test_api_id_list):
        candidate_ids_list.append(candidate_ids)
        all_predict_results.append(
            [api2pop[api_id] for api_id in candidate_ids])
    print('pop test,done!')

    # Evaluate
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'), top_ks)
    csv_table_name = ds.name + 'pop' + "\n"
    # Record
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
def MF(train_datas, test_datas, mode=''):
    """Evaluate matrix-factorisation embeddings, once per selected-API count.

    For every ``slt_num`` in ``1 .. args.slt_item_num`` the entries at index
    ``slt_num - 1`` of ``train_datas`` and ``test_datas`` are used: an
    embedding object is built from the training split and every candidate API
    of every test mashup is scored with the dot product of the mashup and API
    embeddings (``0`` when either id has no embedding).  Each round is
    evaluated with ``evalute`` and recorded with ``summary``.

    :param train_datas: training splits, indexed by ``slt_num - 1``
    :param test_datas: ``(test_mashup_id_list, test_api_id_list, grounds)``
        triples, indexed by ``slt_num - 1``
    :param mode: label passed to the embedding builder; also used in the
        progress message and in the result-table name
    :return: None
    """
    args = data_repository.get_args()
    # One train/test split per number of already-selected APIs.
    for slt_num in range(1, args.slt_item_num + 1):
        test_mashup_id_list, test_api_id_list, _grounds = test_datas[slt_num - 1]

        # Interface for computing / reading the MF result.
        # NOTE(review): inside this module-level function the name ``MF``
        # resolves to the function itself unless it is rebound elsewhere; this
        # 4-argument call looks like it targets an MF class — confirm.
        UV_obj = MF(data_repository.get_ds().data_root, mode,
                    train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index

        # Scores of this round only.  The list used to be created once before
        # the slt_num loop, so later rounds were evaluated against scores that
        # still started with the earlier rounds' entries.
        all_predict_results = []
        for mashup_row, candidate_ids in zip(test_mashup_id_list,
                                             test_api_id_list):
            test_mashup_id = mashup_row[0]
            predict_results = []
            for test_api_id in candidate_ids:
                if (test_mashup_id not in m_id2index
                        or test_api_id not in a_id2index):
                    dot = 0  # no embedding for this mashup or API
                else:
                    m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test,done!'.format(mode, slt_num))

        # NOTE(review): the per-round ``grounds`` from ``test_datas`` is not
        # used; the dataset-wide ground truth is passed instead, exactly as in
        # the original — confirm this is intended.
        evaluate_result = evalute(
            test_api_id_list, all_predict_results,
            data_repository.get_ds().test_data.get('all_ground_api_ids'),
            args.topKs)
        csv_table_name = data_repository.get_ds().name + mode + str(
            slt_num) + "\n"
        summary(evaluate_path, csv_table_name, evaluate_result, args.topKs)
def bl_PasRec(): model_name = 'PasRec_2path' # 'PasRec_2path' epoch_num = 60 # 之前是40 40比20差点 neighbor_size = 15 topTopicNum = 3 args = data_repository.get_args() train_data, test_data = data_repository.get_ds( ).train_data, data_repository.get_ds().test_data HINRec_model = HINRec(args, model_name=model_name, epoch_num=epoch_num, neighbor_size=neighbor_size, topTopicNum=topTopicNum) if os.path.exists(HINRec_model.weight_path): print('have trained,return!') else: # 这里是每隔20epoch测试一下,所以train中输入test_data HINRec_model.train(test_data) HINRec_model.save_model() evalute_by_epoch( HINRec_model, HINRec_model, HINRec_model.model_name, test_data, evaluate_by_slt_apiNum=False) # ,if_save_recommend_result=True)
def analyze_result(recommend_model, topKs):
    """Score saved recommendation lists with popularity and redundancy.

    Reads ``recommend_result_new.csv`` from ``recommend_model.model_dir``,
    computes for each top-K prefix of every recommendation list the mean
    popularity of its APIs and the redundancy of their categories, averages
    both indicators over all instances and writes them with
    ``summary_others`` to ``recommend_other_indicators.csv`` in the same
    directory.

    :param recommend_model: object providing ``model_dir`` and ``simple_name``
    :param topKs: the K values to evaluate
    :return: None
    """
    recommend_result_path = os.path.join(recommend_model.model_dir,
                                         'recommend_result_new.csv')

    def str2list(str_):
        # Cells hold space-separated integer ids.
        return [int(token) for token in str_.split(' ')]

    mashup_ids, slt_api_ids, recommend_lists, grounds = [], [], [], []
    # newline='' is what the csv module expects for files handed to a reader.
    with open(recommend_result_path, newline='') as f:
        for row in csv.DictReader(f):
            mashup_ids.append(str2list(row['mashup_id']))
            slt_api_ids.append(str2list(row['slt_api_ids']))
            recommend_lists.append(str2list(row['recommend_list']))
            grounds.append(str2list(row['grounds']))
    instance_num = len(mashup_ids)

    api_id2info = meta_data.pd.get_mashup_api_id2info('api')
    _, api_id2pop = meta_data.pd.get_api_co_vecs(pop_mode='')
    category_type = data_repository.get_args().Category_type
    api_categories = [
        get_mashup_api_allCategories('api', api_id2info, api_id, category_type)
        for api_id in range(meta_data.api_num)
    ]

    def evaluate_others(recommend_list):
        """Return ``[mean pop, redundancy]`` for one recommendation list."""
        size = len(recommend_list)
        if size == 0:
            # An empty list used to raise ZeroDivisionError.
            return np.zeros(2)
        pop = sum(api_id2pop[api_id] for api_id in recommend_list) / size
        union_tags = set()
        tag_sum_num = 0
        for api_id in recommend_list:
            tags = api_categories[api_id]
            union_tags.update(tags)
            tag_sum_num += len(tags)
        # Redundancy = share of repeated tags; 0 when no API carries a tag
        # (this case used to raise ZeroDivisionError).
        redundance = 1 - len(union_tags) / tag_sum_num if tag_sum_num else 0.0
        return np.array([pop, redundance])

    indicators_name = ['pop', 'redundancy']
    indicators = np.zeros((instance_num, len(topKs), len(indicators_name)))
    for index in range(instance_num):  # one mashup instance
        for k_idx, k in enumerate(topKs):  # one top-K cut-off
            indicators[index, k_idx, :] = evaluate_others(
                recommend_lists[index][:k])
    indicators = np.average(indicators, axis=0)

    indicators_path = os.path.join(recommend_model.model_dir,
                                   'recommend_other_indicators.csv')
    summary_others(indicators_path, recommend_model.simple_name, indicators,
                   topKs)
def CI_NI_fineTuning():
    """Build the CI model and train it on the repository's train/test data.

    The training mode and the "train new" flag are taken from the global
    arguments.

    :return: None
    """
    args = data_repository.get_args()
    train_data = data_repository.get_ds().train_data
    test_data = data_repository.get_ds().test_data

    CI_recommend_model = CI_Model(args)
    untrained_model = CI_recommend_model.get_model()
    CI_model_obj = train_model(
        CI_recommend_model,
        untrained_model,
        train_data,
        test_data,
        args.train_mode,
        args.train_new,
    )
def train_monitoring_loss_acc_model(recommend_model, model, train_data):
    """Fit ``model`` and plot its loss/accuracy curves.

    The curves are meant for spotting over- and under-fitting: 10% of the
    training data is held out as validation data and the fit history is
    passed to ``plot_loss_acc``.

    :param recommend_model: provides ``get_instances`` and ``get_simple_name``
    :param model: the model to compile and fit
    :param train_data: sequence whose last element holds the labels and whose
        other elements are passed to ``recommend_model.get_instances``
    :return: the fitted ``model``
    """
    labels = train_data[-1]
    inputs = recommend_model.get_instances(*train_data[:-1])
    args = data_repository.get_args()

    model.compile(
        optimizer=Adam(lr=args.learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    # validation_split makes val_loss / val_acc available for the plot.
    fit_log = model.fit(
        list(inputs),
        np.array(labels),
        batch_size=args.small_batch_size,
        epochs=args.num_epochs,
        verbose=1,
        shuffle=True,
        validation_split=0.1,
    )
    plot_loss_acc(fit_log, recommend_model.get_simple_name())
    return model
def show_prediction_res(i):
    """Print candidates, scores and ground truth of the ``i``-th test mashup.

    Relies on names from the enclosing scope: ``test_mashup_id_list``,
    ``test_slt_ids``, ``test_api_id_list``, ``prediction`` and ``grounds``.
    Candidates are listed in ascending score order.

    :param i: index of the test instance
    """
    mashup_id = test_mashup_id_list[i][0]
    print('for mashup {}:'.format(mashup_id))
    if data_repository.get_args().need_slt_apis:
        print('slt_ids:', test_slt_ids[i])

    # Pair every score with its candidate id and sort the pairs.
    score_id_pairs = sorted(zip(prediction, test_api_id_list[i]))
    ordered_scores, ordered_ids = zip(*score_id_pairs)
    print('candidate api ids', ordered_ids)
    print('predictions', ordered_scores)
    print('grounds', grounds[i])
def hdp_pop(if_pop=True):
    """Evaluate the HDP-similarity baseline, optionally weighted by pop.

    Mashup and API feature matrices are loaded from ``mashup_HDP.txt`` and
    ``api_HDP.txt`` in the ``baselines`` directory under the dataset root.
    Each candidate API is scored with the cosine similarity between the test
    mashup's and the API's feature rows, multiplied by ``api2pop`` when
    ``if_pop`` is true.

    :param if_pop: whether to multiply the similarity by the API's pop value
    :return: None
    """
    dataset = data_repository.get_ds()
    root = os.path.join(dataset.data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)

    mashup_features = np.loadtxt(os.path.join(root, 'mashup_HDP.txt'))
    api_features = np.loadtxt(os.path.join(root, 'api_HDP.txt'))
    if if_pop:
        _, api2pop = meta_data.pd.get_api_co_vecs()

    # Test
    candidate_ids_list, all_predict_results = [], []
    for idx, mashup_row in enumerate(dataset.test_mashup_id_list):
        test_mashup_id = mashup_row[0]  # id of this test mashup
        candidate_ids = dataset.test_api_id_list[idx]
        candidate_ids_list.append(candidate_ids)

        scores = []
        for api_id in candidate_ids:
            sim_score = cos_sim(mashup_features[test_mashup_id],
                                api_features[api_id])
            scores.append(sim_score * api2pop[api_id] if if_pop else sim_score)
        all_predict_results.append(scores)
    print('hdp_pop test,done!')

    top_ks = data_repository.get_args().topKs
    # Evaluate
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.test_data.get('all_ground_api_ids'),
                              top_ks)
    if if_pop:
        name = 'hdp_pop'
    else:
        name = 'hdp'
    csv_table_name = dataset.name + name + "\n"
    # Record
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
def save_loss_acc(train_log, model_name, epoch=0, if_multi_epoch=False):
    """Append loss/accuracy values from a fit history to the loss log.

    With ``if_multi_epoch`` false, ``train_log`` holds a single epoch: one row
    labelled ``epoch`` is written, preceded by the model name and the column
    header when ``epoch`` is 0.  With ``if_multi_epoch`` true, the model name
    (suffixed ``EarlyStop``), the header and one row per recorded epoch are
    written.  Validation columns are included only when the configured
    ``validation_split`` is non-zero.

    :param train_log: object with a ``history`` dict of per-epoch value lists
    :param model_name: name written in front of the table
    :param epoch: epoch label used in single-epoch mode
    :param if_multi_epoch: whether ``train_log`` covers a whole training run
    :return: None
    :raises KeyError: if the history holds neither spelling of a metric
    """
    args = data_repository.get_args()
    history = train_log.history
    with_validation = args.validation_split != 0

    def metric(name, legacy_name):
        # The branches of this function used to disagree on the spelling
        # ("acc" vs "accuracy"), which differs between Keras versions, so one
        # of them always raised KeyError.  Accept whichever is present.
        return history[name] if name in history else history[legacy_name]

    if with_validation:
        header = 'epoch,loss,val_loss,acc,val_acc\n'
        columns = [history["loss"], history["val_loss"],
                   metric("accuracy", "acc"),
                   metric("val_accuracy", "val_acc")]
    else:
        header = 'epoch,loss,acc\n'
        columns = [history["loss"], metric("accuracy", "acc")]
    row_format = ','.join(['{}'] * (len(columns) + 1)) + '\n'

    def write_row(f, label, idx):
        f.write(row_format.format(label, *(column[idx] for column in columns)))

    if not if_multi_epoch:
        # One epoch is stored per call.
        # NOTE(review): this branch writes to the module-level ``loss_path``
        # while the other branch uses ``args.loss_path`` — confirm both point
        # to the same file.
        with open(loss_path, 'a+') as f:
            if epoch == 0:  # the first epoch also records the model name
                f.write(model_name + '\n')
                f.write(header)
            write_row(f, epoch, 0)
    else:
        with open(args.loss_path, 'a+') as f:
            f.write(model_name + 'EarlyStop' + '\n')
            f.write(header)
            for i in range(len(history["loss"])):
                write_row(f, i, i)
def binary_keyword(if_pop=False):
    """Evaluate the WVSM (Weighted Vector Space Model) baseline.

    Each candidate API is scored with the cosine similarity between the test
    mashup's and the API's rows of the binary matrices returned by
    ``get_binary_v``; with ``if_pop`` the similarity is multiplied by the
    API's ``api2pop`` value.

    :param if_pop: whether to multiply the similarity by the API's pop value
    :return: None
    """
    _, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    (mashup_binary_matrix, api_binary_matrix,
     _mashup_words_list, _api_words_list) = gd.get_binary_v()
    dataset = data_repository.get_ds()

    # Test WVSM
    candidate_ids_list, all_predict_results = [], []
    for idx, mashup_row in enumerate(dataset.test_mashup_id_list):
        test_mashup_id = mashup_row[0]  # id of this test mashup
        candidate_ids = dataset.test_api_id_list[idx]
        candidate_ids_list.append(candidate_ids)

        scores = []
        for api_id in candidate_ids:
            # Without if_pop only the feature vectors are used.
            sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                api_binary_matrix[api_id])
            if if_pop:
                sim_score = sim_score * api2pop[api_id]
            scores.append(sim_score)
        all_predict_results.append(scores)
    print('WVSM test,done!')

    top_ks = data_repository.get_args().topKs
    # Evaluate
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.test_data.get('all_ground_api_ids'),
                              top_ks)
    if if_pop:
        name = 'WVSM_pop'
    else:
        name = 'WVSM'
    csv_table_name = dataset.name + name + "\n"
    # Record
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
def get_sims_dict(process_new, remove_stopwords):
    """Create the DHSR similarity object with fixed hyper-parameters.

    :param process_new: forwarded unchanged to ``cpt_DHSR_Sim``
    :param remove_stopwords: forwarded unchanged to ``cpt_DHSR_Sim``
    :return: the constructed ``cpt_DHSR_Sim`` object
    """
    return cpt_DHSR_Sim(
        data_repository.get_args().cur_data_dir,
        'glove',                  # embedding_name
        50,                       # embedding_dim
        2,                        # tag_coefficient
        1.2,                      # k
        0.75,                     # b
        [-1, 0.15, 0.4, 0.8, 1],  # weighted_intervals
        [-1, 0.45, 0.8, 1],       # unweighted_intervals
        process_new,
        remove_stopwords,
    )
def TF_IDF(if_pop):
    """Evaluate the TF-IDF similarity baseline, optionally weighted by pop.

    Each candidate API is scored with the cosine similarity between the test
    mashup's and the API's ``TF_IDF`` features; with ``if_pop`` the
    similarity is multiplied by the API's ``api2pop`` value.  (This could live
    in the Samanta class, but was kept separate to avoid clutter.)

    :param if_pop: whether to multiply the similarity by the API's pop value
    :return: None
    """
    gd = get_default_gd()
    _, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')
    ds = data_repository.get_ds()
    top_ks = data_repository.get_args().topKs

    candidate_ids_list = []
    all_predict_results = []
    # The two test lists are parallel: one candidate list per test mashup.
    for mashup_row, candidate_ids in zip(ds.test_mashup_id_list,
                                         ds.test_api_id_list):
        test_mashup_id = mashup_row[0]  # id of this test mashup
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            # Exactly one score per candidate.  A second, unconditional append
            # used to follow the if/else, doubling the list and misaligning
            # the scores with candidate_ids.
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    # Evaluate
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'), top_ks)
    csv_table_name = ds.name + name + "\n"
    # Record
    summary(evaluate_path, csv_table_name, evaluate_result, top_ks)
def plot_loss_acc(train_log, model_name):
    """Plot training/validation loss and accuracy and save the figure.

    The figure is written to ``Loss_Accuracy_<model_name>.jpg`` in the current
    working directory.

    :param train_log: object with a ``history`` dict holding ``loss``,
        ``val_loss`` and the accuracy series (``acc``/``val_acc`` or
        ``accuracy``/``val_accuracy``)
    :param model_name: used in the output file name
    :return: None
    :raises KeyError: if a required series is missing from the history
    """
    history = train_log.history

    def series(name, alt_name):
        # The accuracy key is spelled differently across Keras versions.
        return history[name] if name in history else history[alt_name]

    curves = [
        ("train_loss", history["loss"]),
        ("val_loss", history["val_loss"]),
        ("train_acc", series("acc", "accuracy")),
        ("val_acc", series("val_acc", "val_accuracy")),
    ]

    plt.style.use("ggplot")
    fig = plt.figure()
    try:
        for label, values in curves:
            # Use the recorded length rather than the configured number of
            # epochs, so a run that stopped early can still be plotted.
            plt.plot(np.arange(0, len(values)), values, label=label)
        plt.title("Training Loss and Accuracy on the whole_model")
        plt.xlabel("Epoch #")
        plt.ylabel("Loss/Accuracy")
        plt.legend(loc="upper right")
        plt.savefig("Loss_Accuracy_{}.jpg".format(model_name))
    finally:
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)
def __init__(self, data_dir, embedding_name, embedding_dim, tag_coefficient,
             k, b, weighted_intervals, unweighted_intervals, process_new,
             remove_stopwords):
    """Store the configuration, reset all caches and initialise the sims.

    :param data_dir: stored as ``self.data_dir``
    :param embedding_name: stored as ``self.embedding_name``
    :param embedding_dim: stored as ``self.embedding_dim``
    :param tag_coefficient: tag amplification coefficient (stored as ``int``)
    :param k: smooth parameter
    :param b: smooth parameter
    :param weighted_intervals: stored as ``self.weighted_intervals``
    :param unweighted_intervals: stored as ``self.unweighted_intervals``
    :param process_new: stored as ``self.process_new``
    :param remove_stopwords: when true, the stop words from the global
        arguments are used; otherwise the stop-word set is empty
    """
    # --- configuration passed in by the caller ---
    self.data_dir = data_dir
    self.process_new = process_new
    self.embedding_name, self.embedding_dim = embedding_name, embedding_dim
    self.tag_coefficient = int(tag_coefficient)
    self.k, self.b = k, b
    self.weighted_intervals = weighted_intervals
    self.unweighted_intervals = unweighted_intervals
    if remove_stopwords:
        self.stopwords = data_repository.get_args().stop_words
    else:
        self.stopwords = set()

    # --- corpus statistics, not filled in by this constructor ---
    self.num_mashup = self.num_api = 0
    self.average_len = 0
    self.mashup_descriptions = None
    self.api_descriptions = None

    # --- lookup tables and caches ---
    # word -> index; words are mostly handled as indices in this class, which
    # is cheaper than str when used as dict keys
    self.word2inedx = dict()
    self.wordindex2IDF = dict()
    self.wordindex2embedding = dict()  # word index -> embedding
    # cos sim between word pairs, computed on demand and cached:
    # {(,): float}
    self.words_Sim = dict()
    # mashup -> api similarity, keyed by ids: {(,): float}
    self.mashup2api_Sim = dict()

    self.initialize_sims_dict()
def train_early_stop(recommend_model, model, train_data, test_data):
    """Train with early stopping on the validation loss, then test.

    Training stops once ``val_loss`` has not improved for 10 epochs.  The fit
    history is stored with ``save_loss_acc`` and the model is evaluated on
    ``test_data`` with ``evalute_by_epoch``.

    :param recommend_model: provides instances, optimizer, names and, in
        pairwise mode, the pairwise training model
    :param model: the model that is trained (non-pairwise mode) and tested
    :param train_data: sequence whose last element holds the labels and whose
        other elements are passed to ``recommend_model.get_instances``
    :param test_data: test data passed to ``evalute_by_epoch``
    :return: ``model``
    """
    args = data_repository.get_args()
    pairwise = args.pairwise

    labels = train_data[-1]
    inputs = recommend_model.get_instances(
        *train_data[:-1], pairwise_train_phase_flag=bool(pairwise))

    if pairwise:
        train_model = recommend_model.get_pairwise_model()
        # In pairwise mode the model output itself is used as the loss.
        loss_fn = lambda y_true, y_pred: y_pred
    else:
        train_model = model
        loss_fn = 'binary_crossentropy'
    train_model.compile(optimizer=recommend_model.optimizer,
                        loss=loss_fn,
                        metrics=['accuracy'])

    stopper = EarlyStopping(monitor='val_loss',
                            patience=10,
                            verbose=2,
                            mode='min')
    hist = train_model.fit(list(inputs),
                           labels,
                           epochs=args.num_epochs,
                           batch_size=args.small_batch_size,
                           callbacks=[stopper],
                           validation_split=args.validation_split,
                           shuffle=True)
    # NOTE: saving the weights under 'min_loss' is disabled in the original
    # (marked there as needing correction).

    model_name = (recommend_model.get_simple_name()
                  + recommend_model.get_name() + '_min_loss')
    save_loss_acc(hist, model_name, if_multi_epoch=True)
    evalute_by_epoch(recommend_model, model, model_name, test_data)
    return model
def get_predictions():
    """Predict the scores of all candidate APIs for every test instance.

    Relies on names from the enclosing scope (``test_instance_num``,
    ``test_api_id_list``, ``test_mashup_id_list``, ``test_slt_ids``,
    ``grounds``, ``recommend_model``, ``model``, ``record_time``,
    ``time_path`` and ``show_cases``).

    :return: one list of scores per test instance, aligned with its candidates
    """
    args = data_repository.get_args()

    def _show_case(i, scores):
        # Show the recommendation result of one mashup, ascending by score.
        print('for mashup {}:'.format(test_mashup_id_list[i][0]))
        if data_repository.get_args().need_slt_apis:
            print('slt_ids:', test_slt_ids[i])
        ordered_scores, ordered_ids = zip(
            *sorted(zip(scores, test_api_id_list[i])))
        print('candidate api ids', ordered_ids)
        print('predictions', ordered_scores)
        print('grounds', grounds[i])

    predictions = []  # one score list per test instance
    for i in range(test_instance_num):
        candidate_ids = test_api_id_list[i]
        test_batch_size = args.test_batch_size
        test_api_num = len(candidate_ids)
        # Too many candidate APIs for one call: predict in manual batches.
        batch_num, remainder = divmod(test_api_num, test_batch_size)
        if remainder:
            batch_num += 1

        prediction = []
        start_time = time.time()
        for j in range(batch_num):
            start_index = j * test_batch_size
            # The last batch may be shorter than test_batch_size.
            stop_index = min(start_index + test_batch_size, test_api_num)
            batch_size = stop_index - start_index
            batch_inputs = {
                'mashup': test_mashup_id_list[i][start_index:stop_index],
                'api': candidate_ids[start_index:stop_index],
            }
            if args.data_mode == 'newScene' and args.need_slt_apis:  # TODO
                # Within one instance every candidate shares the same
                # already-selected APIs.
                batch_inputs['slt_apis'] = [test_slt_ids[i]] * batch_size
            batch_inputs = recommend_model.get_instances(batch_inputs)
            batch_prediction = model.predict(batch_inputs)
            if len(batch_prediction.shape) == 2:
                # Two columns: keep column 1 as the score.
                batch_prediction = batch_prediction[:, 1]
            prediction.extend(list(batch_prediction))
        predictions.append(list(prediction))
        end_time = time.time()

        if record_time:
            with open(time_path, 'a+') as f1:
                if i == 0:
                    f1.write(recommend_model.get_simple_name())
                    f1.write('\n')
                f1.write('num of instances,{},cost time,{}\n'.format(
                    test_api_num, end_time - start_time))

        if i < show_cases:
            _show_case(i, prediction)
        if i % 100 == 0:
            print('has test {}/{} mashup instances'.format(
                i, test_instance_num))
    print('test,done!')
    return predictions
def evalute_by_epoch(recommend_model,
                     model,
                     model_name,
                     test_data,
                     show_cases=0,
                     record_time=False,
                     true_candidates_dict=None,
                     if_save_recommend_result=False,
                     evaluate_by_slt_apiNum=False):
    """Test a trained model; usable for full and partial cold-start scenes.

    :param recommend_model: provides ``get_instances``, ``get_simple_name``
        and ``model_dir``
    :param model: the model whose ``predict`` is called
    :param model_name: table name used when recording the results
    :param test_data: mapping with the keys ``'mashup'``, ``'api'``,
        ``'all_ground_api_ids'`` and ``'slt_apis'``
    :param show_cases: number of leading instances whose result is printed
    :param record_time: whether to log the prediction time of each instance
    :param true_candidates_dict: re-ranking in the manner of IsRec-like
        methods: mashup id -> api ids; candidates not listed for the mashup
        get the score 0
    :param if_save_recommend_result: whether to store the recommendation of
        every instance for case analysis (only in the ``newScene`` data mode)
    :param evaluate_by_slt_apiNum: whether to additionally evaluate the test
        set split by the number of already-selected APIs
    :return: the evaluation result over the whole test set
    """
    args = data_repository.get_args()

    # Test samples
    test_mashup_id_list = test_data.get('mashup')
    test_api_id_list = test_data.get('api')
    grounds = test_data.get('all_ground_api_ids')
    test_slt_ids = test_data.get('slt_apis')
    csv_table_name = model_name + '\n'
    test_instance_num = len(test_mashup_id_list)

    def show_prediction_res(i, scores):
        # Show the recommendation result of one mashup, ascending by score.
        print('for mashup {}:'.format(test_mashup_id_list[i][0]))
        if data_repository.get_args().need_slt_apis:
            print('slt_ids:', test_slt_ids[i])
        ordered_scores, ordered_ids = zip(
            *sorted(zip(scores, test_api_id_list[i])))
        print('candidate api ids', ordered_ids)
        print('predictions', ordered_scores)
        print('grounds', grounds[i])

    def get_predictions():
        # Collect the scores of all candidates for every test instance.
        collected = []
        for i in range(test_instance_num):
            candidate_ids = test_api_id_list[i]
            test_batch_size = args.test_batch_size
            test_api_num = len(candidate_ids)
            # Too many candidate APIs for one call: predict in batches.
            batch_num, remainder = divmod(test_api_num, test_batch_size)
            if remainder:
                batch_num += 1

            prediction = []
            start_time = time.time()
            for j in range(batch_num):
                start_index = j * test_batch_size
                # The last batch may be shorter than test_batch_size.
                stop_index = min(start_index + test_batch_size, test_api_num)
                batch_size = stop_index - start_index
                batch_inputs = {
                    'mashup': test_mashup_id_list[i][start_index:stop_index],
                    'api': candidate_ids[start_index:stop_index],
                }
                if args.data_mode == 'newScene' and args.need_slt_apis:  # TODO
                    # Within one instance every candidate shares the same
                    # already-selected APIs.
                    batch_inputs['slt_apis'] = [test_slt_ids[i]] * batch_size
                batch_inputs = recommend_model.get_instances(batch_inputs)
                batch_prediction = model.predict(batch_inputs)
                if len(batch_prediction.shape) == 2:
                    # Two columns: keep column 1 as the score.
                    batch_prediction = batch_prediction[:, 1]
                prediction.extend(list(batch_prediction))
            collected.append(list(prediction))
            end_time = time.time()

            if record_time:
                with open(time_path, 'a+') as f1:
                    if i == 0:
                        f1.write(recommend_model.get_simple_name())
                        f1.write('\n')
                    f1.write('num of instances,{},cost time,{}\n'.format(
                        test_api_num, end_time - start_time))

            if i < show_cases:
                show_prediction_res(i, prediction)
            if i % 100 == 0:
                print('has test {}/{} mashup instances'.format(
                    i, test_instance_num))
        print('test,done!')
        return collected

    predictions = get_predictions()

    # IsRec_best strategy: a candidate that is not listed for the mashup
    # (never called by its neighbour mashups) gets the score 0.
    if true_candidates_dict is not None:
        for i, scores in enumerate(predictions):
            true_candidates_list = true_candidates_dict[
                test_mashup_id_list[i][0]]
            candidates = test_api_id_list[i]
            assert len(candidates) == len(scores)
            for j, api_id in enumerate(candidates):
                if api_id not in true_candidates_list:
                    scores[j] = 0

    # Evaluate separately by the number of already-selected APIs.
    if evaluate_by_slt_apiNum and args.data_mode == 'newScene':

        def _filter(slt_apiNum):
            keep = [
                i for i in range(test_instance_num)
                if len(test_slt_ids[i]) == slt_apiNum
            ]
            return ([test_api_id_list[i] for i in keep],
                    [predictions[i] for i in keep],
                    [grounds[i] for i in keep])

        for slt_apiNum in (1, 2, 3):
            sub_ids, sub_predictions, sub_grounds = _filter(slt_apiNum)
            evaluate_result = evalute(sub_ids, sub_predictions, sub_grounds,
                                      args.topKs)
            summary(evaluate_path, str(slt_apiNum) + '_' + csv_table_name,
                    evaluate_result, args.topKs)

    if if_save_recommend_result and args.data_mode == 'newScene':
        # Evaluate and also store the per-instance recommendations.
        recommend_result_path = os.path.join(recommend_model.model_dir,
                                             'recommend_result.csv')
        evaluate_result = evalute(test_api_id_list, predictions, grounds,
                                  args.topKs, test_mashup_id_list,
                                  test_slt_ids, recommend_result_path)
    else:
        evaluate_result = evalute(test_api_id_list, predictions, grounds,
                                  args.topKs)
    summary(evaluate_path, csv_table_name, evaluate_result, args.topKs)
    return evaluate_result  # topKs * 5 indicators
def train_best_NDCG_model(recommend_model,
                          model,
                          train_data,
                          test_data,
                          true_candidates_dict=None,
                          CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """Train epoch by epoch, test after each one and keep the best model.

    After every epoch the model is tested with ``evalute_by_epoch``; the
    weights are saved whenever entry ``[0][3]`` of the result (treated as
    NDCG@5 by this function) is at least as good as the best value so far.
    Training stops early once no improvement was seen for
    ``earlyStop_epochs`` epochs.  Finally the best epoch and value are
    written to disk, the weight files of all other epochs are removed and the
    best weights are loaded into ``model``.

    :param recommend_model: the overall recommendation model
    :param model: model core; the object that is tested, saved and returned
    :param train_data: mapping with the training fields, including ``'label'``
    :param test_data: test data passed to ``evalute_by_epoch``
    :param true_candidates_dict: forwarded to ``evalute_by_epoch``
    :param CI_start_test_epoch: for ``CI_Model`` instances, the first epoch
        that is tested (earlier epochs are skipped to save time)
    :param earlyStop_epochs: number of epochs without improvement after which
        training stops
    :return: ``model``, with the best weights loaded when that succeeded
    """
    print('training_save_best_NDCG_model...')
    args = data_repository.get_args()
    ds = data_repository.get_ds()
    epoch_evaluate_results = []

    # Model
    train_model = recommend_model.get_pairwise_model(
    ) if args.pairwise else model

    # Data
    train_instances_dict = recommend_model.get_instances(
        train_data, pairwise_train_phase_flag=args.pairwise)
    train_labels = train_data.get('label')
    if args.final_activation == 'softmax':
        # softmax output needs one-hot labels
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(args.num_epochs):
        if epoch == 0:  # compile before the first training epoch
            # NOTE: a pairwise-specific loss is disabled in the original;
            # binary cross-entropy is always used.
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile,done!')
        print('Epoch {}'.format(epoch))
        hist = train_model.fit(train_instances_dict,
                               np.array(train_labels),
                               batch_size=args.batch_size,
                               epochs=1,
                               verbose=1,
                               shuffle=True,
                               validation_split=args.validation_split)
        print('Epoch {}, train done!'.format(epoch))

        # Dataset, architecture and training setup are named once, at epoch 0.
        record_name = recommend_model.get_name(
        ) + args.train_name if epoch == 0 else ''
        save_loss_acc(hist, record_name, epoch=epoch)  # recorded every epoch

        # The first epochs of CI are usually poor; skip testing them for speed.
        first_test_epoch = CI_start_test_epoch if isinstance(
            recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue

        # Test this epoch.
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            record_name,
            test_data,
            record_time=(epoch == 0),
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # Save the weights only when at least as good as the best so far. TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                ds.new_model_para_path.format(recommend_model.model_dir,
                                              epoch))
        elif epoch - best_epoch >= earlyStop_epochs:
            # No improvement for several epochs: stop.
            break

    # Record the best epoch and the best NDCG@5.
    with open(ds.new_best_epoch_path.format(recommend_model.model_dir),
              'w') as f:
        f.write(str(best_epoch))
    with open(ds.new_best_NDCG_path.format(recommend_model.model_dir),
              'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # Record the best indicators.  Skipped when no result exists for
    # best_epoch (no epoch ran, or none was tested); indexing used to fail or
    # pass None on in that case.
    best_result = (epoch_evaluate_results[best_epoch]
                   if best_epoch < len(epoch_evaluate_results) else None)
    if best_result is not None:
        csv_table_name = 'best_indicaters\n'
        summary(evaluate_path, csv_table_name, best_result, args.topKs)

    # Delete the stored weights of all non-best epochs and reload the best.
    # This stays best-effort, but failures are reported.  The original
    # ``finally: return model`` silently discarded every exception, including
    # KeyboardInterrupt.
    try:
        for i in range(args.num_epochs):
            temp_path = ds.new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(
            ds.new_model_para_path.format(recommend_model.model_dir,
                                          best_epoch))
    except Exception as exc:
        print('failed to clean up or reload best weights: {!r}'.format(exc))
    return model
def add_pop_predictions(recommend_model,
                        csv_table_name,
                        epoch,
                        pop_mode='sigmoid',
                        a_pop_ratio=0.0):
    """Re-evaluate stored predictions after combining them with pop values.

    Loads ``model_predictions_<epoch>.dat`` from ``args.data_dir`` and
    evaluates three ways of combining the stored scores with ``api_id2pop``:

    * product of score and pop,
    * linear blends ``(1 - r) * score + r * pop`` for r = 0.2 ... 1.0,
    * pop value for the 100 best-scored candidates, ``-1`` for all others.

    Every variant is recorded with ``summary``.

    :param recommend_model: provides ``pd.get_api_co_vecs``
    :param csv_table_name: suffix of the recorded table names
    :param epoch: selects the prediction file to load
    :param pop_mode: forwarded to ``get_api_co_vecs`` and used as table prefix
    :param a_pop_ratio: unused; kept for interface compatibility
    :return: None
    """
    args = data_repository.get_args()
    top_candidate_num = 100  # candidates kept by score before ranking by pop

    predictions_path = os.path.join(args.data_dir,
                                    'model_predictions_{}.dat'.format(epoch))
    # SECURITY: pickle can execute arbitrary code while loading; only read
    # prediction files produced by this project.
    with open(predictions_path, 'rb') as f:
        _test_mashup_id_list, test_api_id_list, predictions = pickle.load(f)

    _, api_id2pop = recommend_model.pd.get_api_co_vecs(pop_mode=pop_mode)
    # Candidate ids and scores of the same instance, side by side.
    instances = list(zip(test_api_id_list, predictions))

    def evaluate_and_record(scores, table_prefix):
        result = evalute(test_api_id_list, scores, args.grounds, args.topKs)
        summary(evaluate_path, table_prefix + csv_table_name, result,
                args.topKs)

    # Product
    predictions_pop = [[
        api_id2pop[api_id] * score for api_id, score in zip(api_ids, scores)
    ] for api_ids, scores in instances]
    evaluate_and_record(predictions_pop, pop_mode + '_pop_prod\n')

    # Linear weighted sum
    for pop_ratio in [0.2 + 0.2 * i for i in range(5)]:
        predictions_pop_linear = [[
            (1 - pop_ratio) * score + pop_ratio * api_id2pop[api_id]
            for api_id, score in zip(api_ids, scores)
        ] for api_ids, scores in instances]
        evaluate_and_record(predictions_pop_linear,
                            pop_mode + '_pop_{}\n'.format(pop_ratio))

    # Select by score first, then rank by pop only.
    predictions_pop_last = []
    for api_ids, scores in instances:
        max_k_pairs = heapq.nlargest(top_candidate_num,
                                     zip(api_ids, scores),
                                     key=lambda pair: pair[1])
        # A set gives O(1) membership tests and also copes with an instance
        # that has no candidates (zip(*[]) used to raise on unpacking).
        max_k_candidates = {api_id for api_id, _ in max_k_pairs}
        predictions_pop_last.append([
            api_id2pop[api_id] if api_id in max_k_candidates else -1
            for api_id in api_ids
        ])
    evaluate_and_record(predictions_pop_last, pop_mode + '_pop_last\n')
def Samanta(topK,
            if_pop=2,
            MF_mode='node2vec',
            pop_mode='',
            text_mode='HDP',
            LDA_topic_num=None):
    """Evaluate the Samanta baseline (text similarity x CF score [x pop]).

    A test mashup is represented by the similarity-weighted sum of the latent
    factors of its ``topK`` most similar training mashups (cosine similarity
    of the text features).  Every candidate API is scored with the product of
    the latent-factor inner product and the mashup/API text similarity,
    optionally combined with pop values.

    :param topK: number of nearest training mashups used to represent a test
        mashup; with ``if_pop == 1`` also the number of candidates kept before
        re-ranking
    :param if_pop: how pop is used: 0 not at all; 1 only to re-rank the best
        candidates; 2 multiplied into the score
    :param MF_mode: not used in the body; the table name uses
        ``args.mf_mode`` instead
    :param pop_mode: forwarded to ``get_api_co_vecs`` (per the original note:
        whether pop is squashed into the 0-1 range with a sigmoid)
    :param text_mode: text feature extraction to use, e.g. LDA or HDP
    :param LDA_topic_num: forwarded to ``model_pcs`` when features are built
    :return: None
    """
    args = data_repository.get_args()
    ds = data_repository.get_ds()

    api2pop = None
    if if_pop:
        _, api2pop = data_repository.get_md().get_api_co_vecs(pop_mode)  # TODO

    root = os.path.join(ds.data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root,
                                       'mashup_{}.txt'.format(text_mode))
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # Text features of mashups and APIs: built once, then cached on disk.
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode,
                                                       LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []
    test_data = ds.test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = ds.MF_obj.mashup_emb_df
    api_emb_df = ds.MF_obj.api_emb_df

    # Loop-invariant lookups, hoisted out of the per-mashup loop (they used to
    # be rebuilt for every test mashup).
    feat_dim = args.implict_feat_dim
    train_m_ids = mashup_emb_df.index.tolist()
    mashup_embeddings = mashup_emb_df['embedding']
    api_embeddings = api_emb_df['embedding']
    api_ids = set(api_emb_df.index.tolist())
    api_zeros = np.zeros((feat_dim))

    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # id of this test mashup
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # Represent the mashup by the weighted latent factors of its nearest
        # training mashups.
        mid2sim = {
            train_m_id: cos_sim(_mashup_features[test_m_id],
                                _mashup_features[train_m_id])
            for train_m_id in train_m_ids
        }  # TODO
        topK_ids, topK_sims = zip(*(
            sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalise the sims
        cf_feature = np.zeros((feat_dim, ))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_embeddings[topK_ids[z]]

        # Score every candidate API.
        predict_results = []
        temp_predict_results = []  # helper for the pop re-ranking (if_pop == 1)
        for api_id in candidate_ids:
            # A test API may never have appeared in the training data.
            api_i_feature = api_embeddings[
                api_id] if api_id in api_ids else api_zeros
            # Inner product of the mashup and API latent factors.
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            # Cosine similarity of the text features.
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            # Rank by the product first, then re-rank the survivors by pop.
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]
        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    # Evaluate
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'),
                              args.topKs)
    _name = '_pop_{}'.format(if_pop)
    _name += args.mf_mode
    csv_table_name = ds.name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"
    # Record
    summary(evaluate_path, csv_table_name, evaluate_result, args.topKs)

    def divide(slt_apiNum):
        # Keep the instances whose number of selected APIs equals slt_apiNum.
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]
                   ) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get(
                    'all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if args.data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_, args.topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, args.topKs)