def MF(train_datas, test_datas, mode=''):
    """Evaluate a pre-trained MF model for every "number of selected APIs" setting.

    For each slt_num it loads the trained mashup/api embeddings, scores every
    candidate api of every test mashup by the dot product of the two embeddings,
    then evaluates and records the results.

    :param train_datas: list of training sets, one per slt_num
    :param test_datas: list of test sets, one per slt_num; each item is
        (test_mashup_id_list, test_api_id_list, grounds)
    :param mode: MF variant name, used in result paths and log lines
    """
    # NOTE(review): the `MF(...)` call below shadows this very function's name;
    # it is presumably meant to hit an MF *class* imported elsewhere in the
    # file — confirm that import, otherwise this recurses with wrong arguments.
    for slt_num in range(1, data_repository.get_args().slt_item_num + 1):  # one setting per count of selected apis
        # BUGFIX: reset per slt_num. The original accumulated predictions across
        # iterations, so its length no longer matched test_api_id_list.
        all_predict_results = []  # one score list per test sample
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        UV_obj = MF(data_repository.get_ds().data_root, mode,
                    train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index
        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]  # one mashup id per sample
            predict_results = []
            for test_api_id in test_api_id_list[i]:
                if test_mashup_id not in m_id2index or test_api_id not in a_id2index:
                    dot = 0  # unseen mashup/api: no embedding, neutral score
                else:
                    m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test,done!'.format(mode, slt_num))
        evaluate_result = evalute(
            test_api_id_list, all_predict_results,
            data_repository.get_ds().test_data.get('all_ground_api_ids'),
            data_repository.get_args().topKs)  # evaluate
        csv_table_name = data_repository.get_ds().name + mode + str(slt_num) + "\n"
        summary(evaluate_path, csv_table_name, evaluate_result,
                data_repository.get_args().topKs)  # record
def get_true_candi_apis(self):
    """Restrict candidate apis to those invoked by neighbour mashups (IsRec idea).

    For each mashup, collect the top-`neighbor_size` neighbours of every path
    similarity, then take the union of the apis those neighbours invoked.
    Result is cached on disk at `true_candi_apis.txt`.

    :return: dict mashup_id -> set(candidate api ids)
    """
    # TODO: currently unused
    # BUGFIX: was initialised to None, so the .keys() check and item assignment
    # below raised AttributeError on the compute path.
    self.mid2candiAids = {}
    self.mid2candiAids_path = os.path.join(self.model_dir, 'true_candi_apis.txt')
    if not os.path.exists(self.mid2candiAids_path):
        for key, id2PathSims in self.mID2PathSims.items():
            m_id = key[0]  # key = (m_id, tuple(slt_apis_list))
            if m_id not in self.mid2candiAids.keys():
                all_neighbor_mids = set()
                for id2sim in id2PathSims:  # sims to pruned neighbours under one path
                    num = min(self.neighbor_size, len(id2sim))
                    sorted_id2sim = sorted(id2sim.items(),
                                           key=lambda x: x[1],
                                           reverse=True)[:num]  # neighbours under this path
                    if sorted_id2sim:  # guard: zip(*[]) would raise on an empty sim dict
                        sorted_ids, _ = zip(*sorted_id2sim)
                        all_neighbor_mids = all_neighbor_mids.union(set(sorted_ids))
                true_candi_apis = set()
                for neighbor_mid in all_neighbor_mids:
                    if neighbor_mid in data_repository.get_ds().train_mashup_api_dict.keys():
                        # apis invoked by this neighbour mashup
                        true_candi_apis = true_candi_apis.union(
                            set(data_repository.get_ds().train_mashup_api_dict[neighbor_mid]))
                self.mid2candiAids[m_id] = true_candi_apis
        save_dict(self.mid2candiAids_path, self.mid2candiAids)
    else:
        self.mid2candiAids = read_dict(self.mid2candiAids_path)
    return self.mid2candiAids
def bl_PasRec():
    """Train (if not already trained) and evaluate the PasRec_2path HIN baseline."""
    model_name = 'PasRec_2path'
    epoch_num = 60  # previously 40; 40 was slightly worse than 20
    neighbor_size = 15
    topTopicNum = 3
    args = data_repository.get_args()
    ds = data_repository.get_ds()
    train_data, test_data = ds.train_data, ds.test_data
    HINRec_model = HINRec(args,
                          model_name=model_name,
                          epoch_num=epoch_num,
                          neighbor_size=neighbor_size,
                          topTopicNum=topTopicNum)
    if os.path.exists(HINRec_model.weight_path):
        print('have trained,return!')
    else:
        # test_data is passed so training can evaluate every 20 epochs
        HINRec_model.train(test_data)
        HINRec_model.save_model()
    evalute_by_epoch(HINRec_model, HINRec_model, HINRec_model.model_name,
                     test_data, evaluate_by_slt_apiNum=False)  # ,if_save_recommend_result=True)
def pop():
    """Popularity baseline: score every candidate api by its popularity alone."""
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    ds = data_repository.get_ds()
    candidate_ids_list = []
    all_predict_results = []
    for sample_idx in range(len(ds.test_mashup_id_list)):
        test_mashup_id = ds.test_mashup_id_list[sample_idx][0]  # one mashup id per sample
        candidate_ids = ds.test_api_id_list[sample_idx]
        candidate_ids_list.append(candidate_ids)
        # the score of an api is simply its popularity
        all_predict_results.append([api2pop[api_id] for api_id in candidate_ids])
    print('pop test,done!')
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'),
                              data_repository.get_args().topKs)  # evaluate
    csv_table_name = ds.name + 'pop' + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
def CI_NI_fineTuning():
    """Build the CI model and train it with the configured train mode."""
    args = data_repository.get_args()
    ds = data_repository.get_ds()
    train_data, test_data = ds.train_data, ds.test_data
    CI_recommend_model = CI_Model(args)
    CI_model_obj = CI_recommend_model.get_model()
    CI_model_obj = train_model(CI_recommend_model, CI_model_obj, train_data,
                               test_data, args.train_mode, args.train_new)
def divide(slt_apiNum):
    """Split the test samples by the number of already-selected apis.

    Relies on enclosing-scope `test_mashup_num`, `candidate_ids_list` and
    `all_predict_results`; returns the candidates, predictions and ground
    truths of the samples whose selected-api count equals `slt_apiNum`.
    """
    ds = data_repository.get_ds()
    test_api_id_list_, predictions_, grounds_ = [], [], []
    for idx in range(test_mashup_num):
        if len(ds.slt_api_ids_instances[idx]) == slt_apiNum:
            test_api_id_list_.append(candidate_ids_list[idx])
            predictions_.append(all_predict_results[idx])
            grounds_.append(ds.test_data.get('all_ground_api_ids')[idx])
    return test_api_id_list_, predictions_, grounds_
def initialize():
    """Merge text and tag tokens of every mashup/api into a single document each,
    build the gensim dictionary, and encode all documents as bag-of-words."""
    # Tags are appended to the description, repeated tag_times times.
    # (Simple concatenation — a better weighting scheme may exist.)
    if tag_times > 0:
        assert len(mashup_descriptions) == len(mashup_categories)
        self.mashup_dow = [desc + cats * tag_times
                           for desc, cats in zip(mashup_descriptions, mashup_categories)]
    else:
        self.mashup_dow = mashup_descriptions
    if tag_times > 0:
        assert len(api_descriptions) == len(api_categories)
        self.api_dow = [desc + cats * tag_times
                        for desc, cats in zip(api_descriptions, api_categories)]
    else:
        self.api_dow = api_descriptions

    if self.strict_train:
        # Build the dictionary from the training mashups only.
        self.train_mashup_dow = [self.mashup_dow[m_id]
                                 for m_id in data_repository.get_ds().his_mashup_ids]
        self.dct = Dictionary(self.train_mashup_dow)
        # word id -> count for every training mashup
        self.train_mashup_dow = [self.dct.doc2bow(doc) for doc in self.train_mashup_dow]
    else:
        self.dct = Dictionary(self.mashup_dow + self.api_dow)
    # Encode every mashup/api document as (word id, count) pairs.
    self.mashup_dow = [self.dct.doc2bow(doc) for doc in self.mashup_dow]
    self.api_dow = [self.dct.doc2bow(api_info) for api_info in self.api_dow]
def load_pretrained_model(recommend_model, model):
    """Load the weights of the recorded best epoch into `model` and return it.

    :param recommend_model: wrapper providing model_dir / simple_name
    :param model: the keras model whose weights are loaded in place
    :return: the same model, with best-epoch weights loaded
    """
    ds = data_repository.get_ds()
    best_epoch_file = ds.new_best_epoch_path.format(recommend_model.model_dir)
    with open(best_epoch_file, 'r') as f:
        best_epoch = int(f.readline())
    para_path = ds.new_model_para_path.format(recommend_model.model_dir, best_epoch)
    model.load_weights(para_path)
    print('load whole_model:{},done!'.format(recommend_model.simple_name))
    return model
def get_name(self):
    """Return (and cache) the dataset+model identifier used when recording results."""
    if not self.name:
        parts = (data_repository.get_md().name,
                 data_repository.get_ds().name,
                 self.simple_name)
        self.name = '_'.join(parts)
    return self.name
def set_embedding_matrixs(self):
    """Build the api-id -> latent-factor matrix from the trained MF embeddings.

    Rows are indexed by api id (0..api_num); ids absent from the MF result
    keep their zero vectors.
    """
    self.i_factors_matrix = np.zeros(
        (data_repository.get_md().api_num + 1, self.args.implict_feat_dim))
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df
    # IMPROVED: unpack directly and avoid shadowing the builtin `id`.
    for api_id, embedding in zip(api_emb_df.index.tolist(),
                                 api_emb_df.embedding.tolist()):
        if isinstance(embedding, str):
            # NOTE(review): eval() on persisted text — acceptable for trusted
            # local files, but ast.literal_eval would be safer for list literals.
            embedding = eval(embedding)
        self.i_factors_matrix[api_id] = embedding
def binary_keyword(if_pop=False): # pop api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs() gd = get_default_gd() mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v( ) # 测试WVSM(Weighted Vector Space Model) candidate_ids_list = [] all_predict_results = [] for i in range(len(data_repository.get_ds().test_mashup_id_list)): test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][ 0] # 每个mashup id candidate_ids = data_repository.get_ds().test_api_id_list[i] candidate_ids_list.append(candidate_ids) predict_results = [] for api_id in candidate_ids: # id if if_pop: sim_score = cos_sim( mashup_binary_matrix[test_mashup_id], api_binary_matrix[api_id]) * api2pop[api_id] else: sim_score = cos_sim(mashup_binary_matrix[test_mashup_id], api_binary_matrix[api_id]) # 测试只使用特征向量的效果 predict_results.append(sim_score) all_predict_results.append(predict_results) print('WVSM test,done!') evaluate_result = evalute( candidate_ids_list, all_predict_results, data_repository.get_ds().test_data.get('all_ground_api_ids'), data_repository.get_args().topKs) # 评价 name = 'WVSM_pop' if if_pop else 'WVSM' csv_table_name = data_repository.get_ds( ).name + name + "\n" # whole_model.name summary(evaluate_path, csv_table_name, evaluate_result, data_repository.get_args().topKs) # 记录 """
def set_paths(self):
    """Create the model directory (if missing) and set the file paths the model persists."""
    self.model_dir = os.path.join(data_repository.get_ds().data_root,
                                  self.get_simple_name())  # model directory
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    self.model_name_path = os.path.join(self.model_dir, 'model_name.dat')
    # unused alternatives, kept for reference:
    # self.CI_features_path = os.path.join(self.model_dir, 'CI_features.fea')
    # self.train_slt_apis_mid_features_path = os.path.join(self.model_dir, 'train_slt_apis_mid_features.csv')
    # self.test_slt_apis_mid_features_path = os.path.join(self.model_dir, 'test_slt_apis_mid_features.csv')
    # extracted text/tag features of mashups and apis:
    self.ma_text_tag_feas_path = os.path.join(self.model_dir,
                                              'mashup_api_text_tag_feas.dat')
def TF_IDF(if_pop):
    """TF-IDF baseline: cosine similarity of TF-IDF vectors, optionally × popularity.

    (Could live inside the Samanta class, but kept separate for clarity.)

    :param if_pop: when truthy, multiply each similarity by the api popularity
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # one mashup id per sample
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            # BUGFIX: the original appended sim_score a second time
            # unconditionally, duplicating scores and breaking alignment
            # with candidate_ids.
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')
    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + name + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
def train(self, test_data):
    """BPR-style pairwise training of the meta-path weights (librec-like).

    Each sample pairs an api with one positive and one negative mashup
    (up to sample_ratio samples per api). The model is evaluated on
    test_data every 20 epochs, so no explicit training data is passed in.

    :param test_data: data used by the periodic self.test_model calls
    """
    ds = data_repository.get_ds()
    for index in range(self.epoch_num):
        loss = 0
        for sampleCount in range(len(self.his_a_ids) * self.sample_ratio):
            # Sample an (api, positive mashup, negative mashup) triple.
            while True:
                a_id = choice(self.his_a_ids)
                if len(self.train_aid2mids[a_id]) == len(ds.his_mashup_ids):
                    continue  # invoked by every mashup: no negative exists
                pos_m_ids = self.train_aid2mids[a_id]  # positives
                pos_m_id = choice(list(pos_m_ids))
                neg_m_ids = ds.his_mashup_ids_set - pos_m_ids
                neg_m_id = choice(list(neg_m_ids))
                break
            # When computing similarities, the selected services must exclude
            # the api being scored.
            posPredictRating, posPathScores = self.predict_an_instance(
                pos_m_id, a_id, ds.train_mashup_api_dict[pos_m_id] - {a_id})
            negPredictRating, negPathScores = self.predict_an_instance(
                neg_m_id, a_id, ds.train_mashup_api_dict[neg_m_id] - {a_id})
            diffValue = posPredictRating - negPredictRating
            deriValue = sigmoid(-diffValue)
            lossValue = -math.log(sigmoid(diffValue))
            loss += lossValue
            for i in range(len(self.path_weights)):
                # Gradient step on the weight of the i-th meta-path.
                temp_value = self.path_weights[i]
                self.path_weights[i] += self.learning_rate * (
                    deriValue * (posPathScores[i] - negPathScores[i])
                    - self.reg * temp_value)
                loss += self.reg * temp_value * temp_value
        print('epoch:{}, loss:{}'.format(index, loss))
        if index > 0 and index % 20 == 0:
            self.test_model(test_data)
def process(self, sim_model=None, train_data=None, test_data=None):
    """Prepare the similarity sources: either CI text/tag features or a
    pre-trained HIN similarity model (PasRec/IsRec family)."""
    ds = data_repository.get_ds()
    self.his_mashup_NI_feas = ds.MF_obj.mashup_emb_df['embedding'][ds.his_mashup_ids].tolist()  # TODO
    if isinstance(self.his_mashup_NI_feas[0], str):
        # embeddings persisted as text: parse them back into lists
        self.his_mashup_NI_feas = list(map(eval, self.his_mashup_NI_feas))
    self.his_mashup_NI_feas = np.array(self.his_mashup_NI_feas)

    if self.NI_sim_mode == 'tagSim':
        # similarities from the CI-part features (used by MISR)  TODO
        self.set_mashup_api_features(sim_model)
    else:
        self.m2neighors_path = os.path.join(sim_model.model_dir, 'm2neighors.dat')
        self.m2neighors = {}
        # meta-path weights of the pre-trained similarity model
        self.path_weights = sim_model.path_weights
        self.m2AllSimsPath = os.path.join(
            sim_model.model_dir,
            'mID2AllSims_{}.sim'.format(self.NI_sim_mode))
        self.m2ASimPath = os.path.join(
            sim_model.model_dir,
            'mID2ASim_{}_{}_{}.sim'.format(self.NI_sim_mode,
                                           self.path_topK_mode, self.topK))
        self.m2ASim, self.m2AllSims = {}, {}
        all_paths_sim_modes = ['PasRec', 'PasRec_2path', 'IsRec', 'IsRec_best']
        if self.NI_sim_mode in all_paths_sim_modes:
            # computing a mashup representation needs the selected services
            self.m2NI_feas = {}
            self.m2NI_feas_path = os.path.join(
                sim_model.model_dir,
                'NI_m_id2{}_{}_{}.feas'.format(self.NI_sim_mode,
                                               self.path_topK_mode, self.topK))
            self.get_samples_m_feas(train_data, test_data, sim_model)
def train_model(recommend_model, model, train_data, test_data, train_mode,
                retrain=True, true_candidates_dict=None):
    """Dispatch training (or load a cached result) — shared by all model variants,
    full/partial and cold-start/non-cold-start.

    :param recommend_model: wrapper object (provides model_dir etc.)
    :param model: the trainable model core
    :param train_data: training set (with or without slt_api_ids, per variant)
    :param test_data: test set
    :param train_mode: 'best_NDCG', 'min_loss' or 'monitor loss&acc'
    :param retrain: retrain even if a best-epoch record already exists
    :return: the trained (or loaded) model
    """
    # Everything model-related lives in a per-model directory under the dataset.
    model_dir = recommend_model.model_dir
    if not os.path.exists(model_dir):
        print('makedirs for:', model_dir)
        os.makedirs(model_dir)
    best_epoch_file = data_repository.get_ds().new_best_epoch_path.format(model_dir)
    if os.path.exists(best_epoch_file) and not retrain:
        # a previous run already finished: just load it
        print('preTrained whole_model, exists!')
        return load_pretrained_model(recommend_model, model)
    if train_mode == 'best_NDCG':
        model = train_best_NDCG_model(recommend_model, model, train_data, test_data,
                                      true_candidates_dict=true_candidates_dict)
    elif train_mode == 'min_loss':
        model = train_early_stop(recommend_model, model, train_data, test_data)
    elif train_mode == 'monitor loss&acc':
        train_monitoring_loss_acc_model(recommend_model, model, train_data, test_data)
    else:
        print('wrong train_mode:')
        print(train_mode)
    return model
def get_id2PathSims(self, m_id, slt_apis_list=None, if_temp_save=True, if_cutByTopics=True):
    """Return the per-meta-path similarity dicts between mashup `m_id` and the
    history mashups.

    The set of paths depends on self.simple_name (PasRec / IsRec /
    PasRec_2path / IsRec_best). Results are cached in self.mID2PathSims
    when if_temp_save is set (recomputed on demand after a reload).

    :param m_id: query mashup id
    :param slt_apis_list: already-selected api ids (part of the cache key)
    :param if_temp_save: cache the computed sims in memory
    :param if_cutByTopics: IsRec pruning — restrict neighbours to mashups
        sharing a topic/tag with the query
    :return: list of {neighbour mashup id: similarity} dicts, one per path
    """
    key = (m_id, tuple(slt_apis_list)) if slt_apis_list else m_id
    if key in self.mID2PathSims.keys():
        return self.mID2PathSims.get(key)

    his_m_ids = set(data_repository.get_ds().his_mashup_ids) - set([m_id])
    if 'IsRec' in self.simple_name and if_cutByTopics:
        # prune: keep only history mashups that share a topic with the query
        final_his_m_ids = []
        for topic in self.m_id2topic[m_id]:
            final_his_m_ids += list(filter(lambda x: x in his_m_ids,
                                           self.topic2m_ids[topic]))
        his_m_ids = final_his_m_ids

    def pair(neigh_m_id):
        # canonical (smaller id, larger id) ordering used by the pairwise sims
        return min(m_id, neigh_m_id), max(m_id, neigh_m_id)

    if self.simple_name == 'PasRec':
        id2P1Sim = {n: self.mhs.get_p1_sim(*pair(n), 'MetaPath') for n in his_m_ids}
        # text sims here use content topics as tags, hence get_p1_sim again:
        id2P2Sim = {n: self.mhs.get_p1_sim(*pair(n), 'MetaPath', self.m_id2topic)
                    for n in his_m_ids}
        id2P3Sim = {n: self.mhs.get_p3_sim(n, slt_apis_list) for n in his_m_ids}
        id2P4Sim = {n: self.mhs.get_p4_sim(n, slt_apis_list, 'MetaPath')
                    for n in his_m_ids}
        # same trick for the api side, via get_p4_sim with api topics:
        id2P5Sim = {n: self.mhs.get_p4_sim(n, slt_apis_list, 'MetaPath', self.a_id2topic)
                    for n in his_m_ids}
        id2P6Sim = {n: self.mhs.get_p6_sim(n, slt_apis_list) for n in his_m_ids}
        id2PathSims = [id2P1Sim, id2P2Sim, id2P3Sim, id2P4Sim, id2P5Sim, id2P6Sim]
    elif self.simple_name == 'IsRec':
        id2P1Sim = {n: self.mhs.get_p1_sim(*pair(n), 'MetaPath') for n in his_m_ids}
        id2P2Sim = {n: self.mhs.get_p2_sim(*pair(n), 'EmbMax') for n in his_m_ids}
        id2P3Sim = {n: self.mhs.get_p3_sim(n, slt_apis_list) for n in his_m_ids}
        id2P4Sim = {n: self.mhs.get_p4_sim(n, slt_apis_list, 'MetaPath')
                    for n in his_m_ids}
        id2P5Sim = {n: self.mhs.get_p5_sim(n, slt_apis_list, 'EmbMax')
                    for n in his_m_ids}
        id2P6Sim = {n: self.mhs.get_p6_sim(n, slt_apis_list) for n in his_m_ids}
        id2P7Sim = {n: self.mhs.get_p2_sim_sem(*pair(n), 'TF_IDF') for n in his_m_ids}
        id2PathSims = [id2P1Sim, id2P2Sim, id2P3Sim, id2P4Sim, id2P5Sim,
                       id2P6Sim, id2P7Sim]
    elif self.simple_name == 'PasRec_2path':
        id2P1Sim = {n: self.mhs.get_p1_sim(*pair(n), 'MetaPath') for n in his_m_ids}
        id2P2Sim = {n: self.mhs.get_p1_sim(*pair(n), 'MetaPath', self.m_id2topic)
                    for n in his_m_ids}
        id2PathSims = [id2P1Sim, id2P2Sim]
    elif self.simple_name == 'IsRec_best':
        id2P1Sim = {n: self.mhs.get_p1_sim(*pair(n), 'MetaPath') for n in his_m_ids}
        id2P2Sim = {n: self.mhs.get_p2_sim(*pair(n), 'EmbMax') for n in his_m_ids}
        id2P3Sim = {n: self.mhs.get_p2_sim_sem(*pair(n), 'TF_IDF') for n in his_m_ids}
        id2PathSims = [id2P1Sim, id2P2Sim, id2P3Sim]

    if if_temp_save:
        self.mID2PathSims[key] = id2PathSims
    return id2PathSims
def train_early_stop(recommend_model, model, train_data, test_data):
    """Train with early stopping on validation loss, then evaluate on the test set.

    :return: the trained model (weights also saved under the 'min_loss' tag)
    """
    args = data_repository.get_args()
    pairwise = bool(args.pairwise)
    train_labels = train_data[-1]
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], pairwise_train_phase_flag=pairwise)
    train_model = recommend_model.get_pairwise_model() if pairwise else model
    if pairwise:
        # the pairwise model's output already IS the loss value
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss=lambda y_true, y_pred: y_pred,
                            metrics=['accuracy'])
    else:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss='binary_crossentropy',
                            metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=10,
                                   verbose=2, mode='min')
    hist = train_model.fit([*train_instances_tuple], train_labels,
                           epochs=args.num_epochs,
                           batch_size=args.small_batch_size,
                           callbacks=[early_stopping],
                           validation_split=args.validation_split,
                           shuffle=True)
    model.save_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, 'min_loss'))
    model_name = recommend_model.get_simple_name() + recommend_model.get_name() + '_min_loss'
    save_loss_acc(hist, model_name, if_multi_epoch=True)
    epoch_evaluate_result = evalute_by_epoch(recommend_model, model,
                                             model_name, test_data)
    return model
def hdp_pop(if_pop=True):
    """HDP-feature baseline: cosine similarity of precomputed HDP vectors,
    optionally weighted by api popularity."""
    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_hdp_path = os.path.join(root, 'mashup_HDP.txt')
    api_hdp_path = os.path.join(root, 'api_HDP.txt')
    _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
    _api_hdp_features = np.loadtxt(api_hdp_path)
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    # testing
    ds = data_repository.get_ds()
    candidate_ids_list, all_predict_results = [], []
    for sample_idx in range(len(ds.test_mashup_id_list)):
        test_mashup_id = ds.test_mashup_id_list[sample_idx][0]  # one mashup id per sample
        candidate_ids = ds.test_api_id_list[sample_idx]
        candidate_ids_list.append(candidate_ids)
        scores = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            scores.append(sim_score)
        all_predict_results.append(scores)
    print('hdp_pop test,done!')
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              ds.test_data.get('all_ground_api_ids'),
                              data_repository.get_args().topKs)  # evaluate
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = ds.name + name + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record
def set_paths(self):
    """Resolve the model directory from the dataset's model-path template."""
    simple_name = self.get_simple_name()
    self.model_dir = data_repository.get_ds().model_path.format(simple_name)
def set_paths(self):
    """Create the model directory (if missing) and set the model-name file path."""
    root = data_repository.get_ds().data_root
    self.model_dir = os.path.join(root, self.get_simple_name())  # model directory
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    self.model_name_path = os.path.join(self.model_dir, 'model_name.dat')
def Samanta(topK, if_pop=2, MF_mode='node2vec', pop_mode='', text_mode='HDP',
            LDA_topic_num=None):
    """Samanta baseline: represent a new query by its KNN in text space, score
    candidates by MF inner product × text similarity (× popularity).

    :param topK: number of nearest training mashups used for the query's MF feature
    :param if_pop: 0 — ignore popularity; 1 — rerank the topK products by
        popularity only; 2 — rank by cf_score * text_sim * popularity
    :param MF_mode: kept for interface compatibility (node2vec is used)
    :param pop_mode: whether popularity is squashed (forwarded to get_api_co_vecs)
    :param text_mode: textual feature extractor, 'HDP' or 'LDA'
    :param LDA_topic_num: topic number when the extractor needs one
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = data_repository.get_md().get_api_co_vecs(pop_mode)  # TODO
    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))
    # compute or load the textual features
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []
    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df
    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # one mashup id per sample
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # represent the query as a similarity-weighted sum of neighbour latent factors
        mid2sim = {}
        for train_m_id in mashup_emb_df.index.tolist():
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id],
                                          _mashup_features[train_m_id])  # TODO
        topK_ids, topK_sims = zip(*(sorted(mid2sim.items(),
                                           key=lambda x: x[1],
                                           reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalise the sims
        cf_feature = np.zeros((data_repository.get_args().implict_feat_dim,))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_emb_df['embedding'][topK_ids[z]]

        # score every candidate api
        predict_results = []
        temp_predict_results = []  # helper list when reranking by popularity
        api_zeros = np.zeros((data_repository.get_args().implict_feat_dim))
        api_ids = set(api_emb_df.index.tolist())
        for api_id in candidate_ids:
            # a test api may never appear in training: fall back to a zero vector
            api_i_feature = api_emb_df['embedding'][api_id] if api_id in api_ids else api_zeros
            # inner product of mashup and api latent factors
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            # cosine similarity of the textual features
            sim_score = cos_sim(_mashup_features[test_m_id], _api_features[api_id])
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            # first rank by the product, then rerank the topK by popularity
            max_k_pairs = heapq.nlargest(topK, temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [api2pop[api_id] if api_id in max_k_candidates else -1
                               for api_id in candidate_ids]
        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name + 'Samanta_model_{}'.format(topK) + _name + "\n"
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record

    def divide(slt_apiNum):
        # split the test samples by the number of already-selected apis
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get(
                    'all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path, str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, data_repository.get_args().topKs)  #
def train_best_NDCG_model(recommend_model, model, train_data, test_data,
                          true_candidates_dict=None, CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """Train epoch by epoch, evaluate after each one, and keep the weights of
    the epoch with the best NDCG@5.

    :param recommend_model: the whole recommendation model (provides name, optimizer, dir)
    :param model: the trainable model core
    :param train_data: training set (dict-like, with a 'label' entry)
    :param test_data: test set, evaluated after every epoch
    :param true_candidates_dict: optional candidate restriction passed to evaluation
    :param CI_start_test_epoch: for CI models, skip evaluation of the first epochs
    :param earlyStop_epochs: stop when this many epochs pass without improvement
    :return: `model` with the best epoch's weights loaded
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []
    # model selection: pairwise wrapper or the plain core
    train_model = recommend_model.get_pairwise_model() \
        if data_repository.get_args().pairwise else model
    # data
    train_instances_dict = recommend_model.get_instances(
        train_data,
        pairwise_train_phase_flag=data_repository.get_args().pairwise)
    train_labels = train_data.get('label')
    if data_repository.get_args().final_activation == 'softmax':
        # a softmax head needs one-hot labels
        train_labels = utils.to_categorical(train_labels, num_classes=2)
    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(data_repository.get_args().num_epochs):
        if epoch == 0:  # compile once, before the first fit
            # loss_ = lambda y_true, y_pred: y_pred if data_repository.get_args().pairwise else 'binary_crossentropy'
            # train_model.compile(optimizer=recommend_model.optimizer, loss=loss_, metrics=['accuracy'])
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile,done!')
        print('Epoch {}'.format(epoch))
        hist = train_model.fit(
            train_instances_dict, np.array(train_labels),
            batch_size=data_repository.get_args().batch_size,
            epochs=1, verbose=1, shuffle=True,
            validation_split=data_repository.get_args().validation_split)
        print('Epoch {}, train done!'.format(epoch))
        # record dataset/architecture/training info once, on the first epoch
        record_name = recommend_model.get_name() + data_repository.get_args().train_name \
            if epoch == 0 else ''
        save_loss_acc(hist, record_name, epoch=epoch)  # per-epoch record
        # CI's first few epochs perform badly; skipping their tests saves time
        first_test_epoch = CI_start_test_epoch \
            if isinstance(recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue
        # evaluate this epoch
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model, model, record_name, test_data,
            record_time=True if epoch == 0 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)
        # only persist weights when NDCG@5 improves  TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                data_repository.get_ds().new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        else:
            if epoch - best_epoch >= earlyStop_epochs:
                # no improvement for several epochs: stop early
                break
    # persist the best epoch and the best NDCG@5
    with open(data_repository.get_ds().new_best_epoch_path.format(
            recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(data_repository.get_ds().new_best_NDCG_path.format(
            recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))
    # record the best indicators
    csv_table_name = 'best_indicaters\n'
    summary(evaluate_path, csv_table_name, epoch_evaluate_results[best_epoch],
            data_repository.get_args().topKs)
    # BUGFIX: the original wrapped everything below in try/finally with
    # `return model` inside `finally`, which (a) silently swallowed any
    # exception, including a failed load_weights, and (b) could return
    # without loading the best weights if the cleanup loop raised first.
    # Cleanup is now best-effort; load errors propagate to the caller.
    try:
        for i in range(data_repository.get_args().num_epochs):
            temp_path = data_repository.get_ds().new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
    except OSError as e:
        print('cleanup of non-best epoch weights failed:', e)
    model.load_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, best_epoch))
    return model
def __init__(self, args, model_name='PasRec', semantic_mode='HDP', LDA_topic_num='',
             epoch_num=15, neighbor_size=15, topTopicNum=3, cluster_mode='LDA',
             cluster_mode_topic_num=100):
    """HIN-based recommender (PasRec / IsRec family).

    semantic_mode / LDA_topic_num control the textual features used inside the
    HIN; cluster_mode / cluster_mode_topic_num control the LDA-style clustering
    that assigns topics to mashups and apis.
    """
    self.simple_name = model_name
    # one weight per meta-path, initialised uniformly; the variant decides
    # how many paths exist
    if self.simple_name == 'IsRec_best':
        self.p1_weight, self.p2_weight, self.p3_weight = 1 / 3, 1 / 3, 1 / 3
        self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight]
    elif self.simple_name == 'PasRec_2path':
        self.p1_weight, self.p2_weight = 1 / 2, 1 / 2
        self.path_weights = [self.p1_weight, self.p2_weight]
    elif self.simple_name == 'IsRec':
        (self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight,
         self.p5_weight, self.p6_weight, self.p7_weight) = (1 / 7,) * 7
        self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight,
                             self.p4_weight, self.p5_weight, self.p6_weight,
                             self.p7_weight]
    else:  # PasRec
        (self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight,
         self.p5_weight, self.p6_weight) = (1 / 6,) * 6
        self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight,
                             self.p4_weight, self.p5_weight, self.p6_weight]

    self.neighbor_size = neighbor_size  # neighbourhood size for nearest-mashup search
    self.epoch_num = epoch_num
    self.learning_rate = 0.001
    self.reg = 0.001
    self.sample_ratio = 50  # pairwise training pairs sampled per api
    self.model_name = '{}_{}_epoch{}_nbSize{}TopicNum{}{}{}'.format(
        self.simple_name, semantic_mode, epoch_num, neighbor_size,
        topTopicNum, cluster_mode, cluster_mode_topic_num)
    self.model_dir = data_repository.get_ds().model_path.format(self.model_name)  # model directory
    # the weights are the only data worth persisting
    self.weight_path = os.path.join(self.model_dir, 'weights.npy')

    # training set: api_id -> set(mashup_ids)
    self.train_aid2mids = {}
    for mashup_id, api_id in data_repository.get_ds().train_mashup_api_list:
        self.train_aid2mids.setdefault(api_id, set()).add(mashup_id)
    self.his_a_ids = list(self.train_aid2mids.keys())  # api ids seen in training
    # score for apis never invoked by any history mashup (1 and 0.5 were much worse)
    self.notInvokeScore = 0

    # text / HIN similarity machinery
    self.HIN_path = os.path.join(self.model_dir, 'HIN_sims')  # root for the HIN sim files
    self.semantic_mode = semantic_mode
    self.LDA_topic_num = LDA_topic_num
    # textual similarity inside the HIN is only needed by IsRec_best; PasRec and
    # IsRec use topics-as-tags or EmbMax instead
    HIN_gd = get_default_gd(tag_times=2, strict_train=False)
    embedding_matrix = get_embedding_matrix(HIN_gd.dct.token2id,
                                            args.embedding_name,
                                            dimension=args.embedding_dim)  # embedding per encoded word
    HIN_gd.model_pcs(model_name=self.semantic_mode,
                     LDA_topic_num=self.LDA_topic_num)  # IsRec_best needs TF_IDF
    HIN_gd.get_all_encoded_comments()
    self.mhs = mashup_HIN_sims(embedding_matrix, gd=HIN_gd,
                               semantic_name=self.semantic_mode,
                               HIN_path=self.HIN_path,
                               features=(HIN_gd._mashup_features,
                                         HIN_gd._api_features),
                               if_text_sem=True, if_tag_sem=False)
    # (mashup id, selected apis) -> similarities to history mashups per path
    self.mID2PathSims = {}
    self.HIN_sims_changed_flag = False

    # topTopicNum: content-similarity topics in PasRec; neighbour-search
    # classes in IsRec
    self.topTopicNum = topTopicNum
    topic_gd = get_default_gd(tag_times=0, strict_train=True)  # text only, no tags
    # HDP for clustering/features for now; switch to LDA once the topic count is fixed
    topic_gd.model_pcs(model_name=cluster_mode,
                       LDA_topic_num=cluster_mode_topic_num)
    self.m_id2topic, self.a_id2topic = topic_gd.get_topTopics(self.topTopicNum)
    # topic -> mashup ids over all mashups (i.e. classify mashups by topic)
    self.topic2m_ids = {}
    for m_id, topic_indexes in enumerate(self.m_id2topic):
        for topic_index in topic_indexes:
            self.topic2m_ids.setdefault(topic_index, []).append(m_id)
    self.read_model()  # mainly restores the path weights
def get_m2ASim(self, train_data, test_data, sim_model):
    """Build each mashup's normalised, weight-combined similarity vector to all
    history mashups (plus its topK neighbour list); results are pickled to disk
    and reloaded on subsequent calls.

    :return: dict sample-key -> np.array of sims, ordered like his_mashup_ids
    """
    if os.path.exists(self.m2neighors_path) and os.path.exists(self.m2ASimPath):
        # cached: load both the combined sims and the neighbour lists
        with open(self.m2ASimPath, 'rb') as f:
            self.m2ASim = pickle.load(f)
        with open(self.m2neighors_path, 'rb') as f:
            self.m2neighors = pickle.load(f)
    else:
        # compute everything in one pass and persist it
        print('m2ASim not exist, computing!')
        dict_ = self.get_m2AllSims(train_data, test_data, sim_model)  # per-sample path sims
        his_ids = data_repository.get_ds().his_mashup_ids
        for key, id2PathSims in dict_.items():
            m_id = key if isinstance(key, int) else key[0]  # mashup ID
            if self.path_topK_mode == 'eachPathTopK':
                # keep only the topK neighbours of every single path
                for i in range(len(id2PathSims)):
                    id2PathSim = id2PathSims[i]
                    num = min(self.topK, len(id2PathSim))
                    top_items = sorted(id2PathSim.items(),
                                       key=lambda x: x[1],
                                       reverse=True)[:num]
                    id2PathSims[i] = dict(top_items)
            # combined similarity of this sample to every history mashup
            id2score = {his_m_id: 0 for his_m_id in his_ids}
            for his_m_id in id2score.keys():
                if his_m_id != m_id:  # skip itself
                    for path_index, id2aPathSim in enumerate(id2PathSims):
                        # a history mashup may lack some kind of similarity
                        pathSim = 0 if his_m_id not in id2aPathSim.keys() \
                            else id2aPathSim[his_m_id]
                        id2score[his_m_id] += pathSim * self.path_weights[path_index]
            # store the topK neighbours after combining all paths (explicit part)
            num = min(self.topK, len(id2score))
            self.m2neighors[key], _ = zip(*(sorted(id2score.items(),
                                                   key=lambda x: x[1],
                                                   reverse=True)[:num]))
            if self.path_topK_mode == 'allPathsTopK':
                # keep only the topK of the final combined score
                num = min(self.topK, len(id2score))
                top_items = sorted(id2score.items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:num]
                id2score = dict(top_items)
            # sims ordered like his_mashup_ids: (#his_m_ids,)
            sims = np.array([id2score[his_m_id] if his_m_id in id2score.keys() else 0
                             for his_m_id in his_ids])
            sum_sim = sum(sims)
            if sum_sim == 0:
                print('sims sum=0!')
            else:
                sims = sims / sum_sim
            self.m2ASim[key] = sims
        print('m2ASim, computed!')
        with open(self.m2ASimPath, 'wb') as f:
            pickle.dump(self.m2ASim, f)
        with open(self.m2neighors_path, 'wb') as f:
            pickle.dump(self.m2neighors, f)
    return self.m2ASim