def binary_keyword_old(text_tag_recommend_model):  # legacy Para-based version, renamed so it no longer collides with binary_keyword below
    all_mashup_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        [i for i in range(all_mashup_num)],
        [i for i in range(all_api_num)], False))
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v(
        all_mashup_num, all_api_num)

    # Test WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                api_binary_matrix[api_id]) * api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    csv_table_name = Para.data_name + 'WVSM' + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record

    # Test WJaccard (Weighted Jaccard)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            mashup_set = set(mashup_words_list[test_mashup_id])
            api_set = set(api_words_list[api_id])
            sim_score = 1.0 * len(mashup_set.intersection(api_set)) / len(
                mashup_set.union(api_set)) * api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WJaccard test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    csv_table_name = Para.data_name + 'WJaccard' + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record
def binary_keyword(if_pop=False):
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v()

    # Test WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            if if_pop:
                sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                    api_binary_matrix[api_id]) * api2pop[api_id]
            else:
                sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                    api_binary_matrix[api_id])  # test the feature vectors alone
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    name = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
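# cos_sim is imported from elsewhere in the project; as a reference, a minimal
# sketch of its assumed semantics (plain cosine similarity over dense vectors).
# The helper name is hypothetical; np is already used throughout this module.
def _cos_sim_sketch(v1, v2):
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom > 0 else 0.0  # guard against zero vectors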
def TF_IDF(if_pop):
    """
    Could be folded into the Samanta class, but that would be too messy; not necessary.
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)  # a stray duplicate append here was a bug; each api is scored once
        all_predict_results.append(predict_results)
    print('TF_IDF test, done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
def hdp_pop(if_pop=True):
    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_hdp_path = os.path.join(root, 'mashup_HDP.txt')
    api_hdp_path = os.path.join(root, 'api_HDP.txt')
    _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
    _api_hdp_features = np.loadtxt(api_hdp_path)
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()

    # test
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('hdp_pop test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = dataset.crt_ds.data_name + name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
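# Note: the HDP feature files read by hdp_pop() are produced by Samanta()
# below (with text_mode='HDP'), which calls gd.model_pcs and saves the
# matrices with np.savetxt into the same 'baselines' directory; run it first
# if the files are missing.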
def pop():
    """
    Popularity-only baseline: rank the candidate apis of each test mashup by their popularity.
    """
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(dataset.crt_ds.test_mashup_id_list)):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            predict_results.append(api2pop[api_id])
        all_predict_results.append(predict_results)
    print('pop test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    csv_table_name = dataset.crt_ds.data_name + 'pop' + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record
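# Example driver, a minimal sketch: the wrapper function itself is hypothetical,
# and the calls assume the project's global dataset / meta_data / new_Para
# objects have been initialized beforehand.
def run_classic_baselines():
    binary_keyword(if_pop=False)  # WVSM
    binary_keyword(if_pop=True)   # WVSM weighted by popularity
    TF_IDF(if_pop=True)           # TF-IDF cosine * popularity
    hdp_pop(if_pop=True)          # HDP topic features * popularity
    pop()                         # popularity only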
def TF_IDF_old(text_tag_recommend_model):  # legacy Para-based version, renamed: it was defined after TF_IDF above and silently shadowed it
    """
    Could be folded into the Samanta class, but that would be too messy; not necessary.
    """
    all_mashup_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        [i for i in range(all_mashup_num)],
        [i for i in range(all_api_num)], False))
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs(
        'TF_IDF', all_mashup_num, all_api_num)

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        predict_results = []
        for api_id in candidate_ids:
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    csv_table_name = Para.data_name + 'TF_IDF' + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record
def MF(train_datas, test_datas, mode=''):
    for slt_num in range(1, new_Para.param.slt_item_num + 1):  # train/test splits for different numbers of already-selected apis
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        # interface for computing and reading the MF results
        UV_obj = get_UV(dataset.crt_ds.root_path, mode, train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index

        all_predict_results = []  # scores for each test instance (several candidate apis per mashup); reset per split, the old accumulation across splits was a bug
        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]  # each mashup id
            predict_results = []
            for test_api_id in test_api_id_list[i]:
                if test_mashup_id not in m_id2index or test_api_id not in a_id2index:
                    dot = 0  # unseen mashup or api: no latent factors available
                else:
                    m_embedding = UV_obj.m_embeddings[m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test, done!'.format(mode, slt_num))

        evaluate_result = evalute(test_api_id_list, all_predict_results,
                                  grounds, new_Para.param.topKs)  # evaluate against this split's grounds, which were unpacked but unused before
        csv_table_name = dataset.crt_ds.data_name + mode + str(slt_num) + "\n"  # model.name
        summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
                new_Para.param.topKs)  # record
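# get_UV is defined elsewhere; a minimal sketch of the interface MF() relies on.
# Attribute names are taken from the usage above; the container itself is assumed.
class _UVSketch:
    def __init__(self, m_id2index, a_id2index, m_embeddings, a_embeddings, m_ids=None):
        self.m_id2index = m_id2index      # mashup id -> row index in m_embeddings
        self.a_id2index = a_id2index      # api id -> row index in a_embeddings
        self.m_embeddings = m_embeddings  # (num_train_mashups, num_feat) latent factors
        self.a_embeddings = a_embeddings  # (num_train_apis, num_feat) latent factors
        self.m_ids = m_ids                # train mashup ids, used by Samanta() below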
def run_new_deepFM(CI_feas, NI_feas, train_data, test_data, all_api_num, epoch_num=10):
    # config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    # session = tf.Session(config=config)
    # graph = tf.get_default_graph()
    # set_session(session)

    model = simple_DeepFM(CI_feature_num=4, NI_feature_num=2, CI_feature_dim=50,
                          NI_feature_dim=25, final_feature_dim=32, task='binary',
                          use_fm=True, l2_reg_linear=0, dnn_hidden_units=[])
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
    print('build simple_DeepFM, done!')

    batch_size = 32
    len_train = len(train_data[0])
    mashup_texts_features, mashup_tag_features, api_texts_features, api_tag_features = CI_feas
    mashup_NI_features, api_NI_features = NI_feas
    features = [mashup_texts_features, mashup_tag_features, api_texts_features,
                api_tag_features, mashup_NI_features, api_NI_features]
    train_generator = data_generator(train_data, *features, bs=batch_size,
                                     all_api_num=all_api_num, mode="train")
    print('generate train_generator, done!')

    # evaluate after every training epoch
    num_test_instances = getNum_testData(test_data)
    for i in range(epoch_num):
        history = model.fit_generator(train_generator,
                                      steps_per_epoch=len_train // batch_size,
                                      epochs=1, verbose=2)
        test_generator = data_generator(test_data, *features, bs=batch_size,
                                        all_api_num=all_api_num, mode="test")
        print('generate test_generator, done!')
        predictions = model.predict_generator(
            test_generator, steps=num_test_instances // batch_size + 1)[:, 1]
        print(predictions.shape)

        # regroup the flat predictions by mashup for evaluation
        reshaped_predictions = []
        test_api_id_list, grounds = test_data[1], test_data[-1]
        index = 0
        for test_api_ids in test_api_id_list:
            size = len(test_api_ids)  # number of candidate apis for the current mashup
            reshaped_predictions.append(predictions[index:index + size])  # min(index + size, len(predictions))
            index += size
        print(index)

        evaluate_result = evalute(test_api_id_list, reshaped_predictions, grounds,
                                  new_Para.param.topKs)  # evaluate
        summary(new_Para.param.evaluate_path, 'deepFM_epoch_{}'.format(i),
                evaluate_result, new_Para.param.topKs)  # record
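# getNum_testData is defined elsewhere; judging from how it is used above
# (prediction steps = num_test_instances // batch_size + 1, then the flat
# predictions are sliced per mashup), a minimal sketch of the assumed
# behaviour, where test_data[1] holds the per-mashup candidate api id lists:
def _getNum_testData_sketch(test_data):
    return sum(len(api_ids) for api_ids in test_data[1])  # total (mashup, api) test pairs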
def Samanta_old(text_tag_recommend_model, topK, if_pop=False):  # legacy Para-based version, renamed so it no longer collides with Samanta below
    """
    :param text_tag_recommend_model: provides the underlying data for this baseline
    :param topK: number of nearest training mashups used to build the CF feature
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()
    test_mashup_num = len(Para.test_mashup_id_list)
    all_mashup_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(text_tag_recommend_model.pd.get_mashup_api_index2name('api'))

    mashup_hdp_path = os.path.join(Para.data_dir, 'mashup_hdp.txt')
    api_hdp_path = os.path.join(Para.data_dir, 'api_hdp.txt')
    # obtain mashup_hdp_features / api_hdp_features
    if not os.path.exists(api_hdp_path):
        # text/tag vectors after encoding, as arrays
        gd = gensim_data(*text_tag_recommend_model.get_instances(
            [i for i in range(all_mashup_num)],
            [i for i in range(all_api_num)], False))
        _mashup_hdp_features, _api_hdp_features = gd.model_pcs(
            'HDP', all_mashup_num, all_api_num)
        np.savetxt(mashup_hdp_path, _mashup_hdp_features)
        np.savetxt(api_hdp_path, _api_hdp_features)
    else:
        _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
        _api_hdp_features = np.loadtxt(api_hdp_path)

    candidate_ids_list = []
    all_predict_results = []
    for i in range(test_mashup_num):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        id2sim = {}
        for local_train_mashup_index in range(len(Para.feature_train_mashup_ids)):  # u_factors_matrix uses local indices
            id2sim[local_train_mashup_index] = cos_sim(
                _mashup_hdp_features[test_mashup_id],
                _mashup_hdp_features[Para.feature_train_mashup_ids[local_train_mashup_index]])
        topK_indexes, topK_sims = zip(*(sorted(id2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)
        cf_feature = np.zeros((Para.num_feat))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * Para.u_factors_matrix[topK_indexes[z]]

        predict_results = []
        temp_predict_results = []  # helper used when re-ranking by popularity
        api_zeros = np.zeros((Para.num_feat))
        for api_id in candidate_ids:
            # an api in the test set may never appear in the training set
            api_i_feature = Para.i_factors_matrix[Para.i_id2index[api_id]] \
                if api_id in Para.i_id2index.keys() else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id], _api_hdp_features[api_id])
            if if_pop:
                temp_predict_results.append((api_id, cf_score * sim_score))
            else:
                predict_results.append(cf_score * sim_score)
        if if_pop:
            max_k_pairs = heapq.nlargest(topK, temp_predict_results, key=lambda x: x[1])  # select the topK by score
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [api2pop[api_id] if api_id in max_k_candidates else -1
                               for api_id in candidate_ids]  # re-rank
        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    _name = '_pop' if if_pop else ''
    csv_table_name = Para.data_name + 'Samanta_model' + _name + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record
def Samanta(topK, if_pop=2, MF_mode='node2vec', pop_mode='', text_mode='HDP', LDA_topic_num=None):
    """
    :param topK: use the KNN of a new query mashup to build its MF feature
    :param if_pop: how popularity is used. 0: not used; 1: only for re-ranking; 2: rank by the full product
    :param text_mode: which feature extraction to use, LDA or HDP
    :param pop_mode: whether the popularity value is squashed into [0, 1] with a sigmoid
    :param MF_mode: which latent factors to use; node2vec is used here for simplicity
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs(pop_mode)

    root = os.path.join(dataset.crt_ds.root_path, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(root, 'mashup_{}.txt'.format(text_mode))
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # obtain the mashup / api text features, computing and caching them on first use
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode, LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    # Para.set_MF_mode(MF_mode)  # set the latent factors
    # new_Para.param.mf_mode = MF_mode  # mutates the parameter object; use with care

    candidate_ids_list = []
    all_predict_results = []
    test_mashup_num = len(dataset.crt_ds.test_mashup_id_list)
    for i in range(test_mashup_num):
        test_mashup_id = dataset.crt_ds.test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = dataset.crt_ds.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        # represent the query by the weighted latent factors of its nearest training mashups
        localIndex2sim = {}
        for local_index, train_m_id in enumerate(dataset.UV_obj.m_ids):  # u_factors_matrix uses local indices
            localIndex2sim[local_index] = cos_sim(_mashup_features[test_mashup_id],
                                                  _mashup_features[train_m_id])
        topK_indexes, topK_sims = zip(*(sorted(localIndex2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the similarities
        cf_feature = np.zeros((new_Para.param.num_feat,))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * dataset.UV_obj.m_embeddings[topK_indexes[z]]

        # score every candidate api
        predict_results = []
        temp_predict_results = []  # helper used when re-ranking by popularity
        api_zeros = np.zeros((new_Para.param.num_feat))
        for api_id in candidate_ids:
            a_id2index = dataset.UV_obj.a_id2index
            # an api in the test set may never appear in the training set
            api_i_feature = dataset.UV_obj.a_embeddings[a_id2index[api_id]] \
                if api_id in a_id2index.keys() else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))  # inner product of mashup and api latent factors
            sim_score = cos_sim(_mashup_features[test_mashup_id], _api_features[api_id])  # cosine similarity of the text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            max_k_pairs = heapq.nlargest(topK, temp_predict_results, key=lambda x: x[1])  # first rank once by the product
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [api2pop[api_id] if api_id in max_k_candidates else -1
                               for api_id in candidate_ids]  # re-rank
        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              dataset.crt_ds.grounds, new_Para.param.topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += new_Para.param.mf_mode
    csv_table_name = dataset.crt_ds.data_name + 'Samanta_model_{}'.format(topK) + _name + "\n"  # model.name
    summary(new_Para.param.evaluate_path, csv_table_name, evaluate_result,
            new_Para.param.topKs)  # record

    def divide(slt_apiNum):
        # keep only the test cases whose number of already-selected apis equals slt_apiNum
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(dataset.crt_ds.slt_api_ids_instances[i]) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(dataset.crt_ds.grounds[i])
        return test_api_id_list_, predictions_, grounds_

    if new_Para.param.data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_, grounds_, new_Para.param.topKs)
            summary(new_Para.param.evaluate_path, str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result, new_Para.param.topKs)  # evaluate and record per group
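# Example invocation (argument values are illustrative only): rank candidates
# by the full cf_score * text_similarity * popularity product, representing
# each query mashup with its 50 nearest training mashups.
# Samanta(50, if_pop=2, MF_mode='node2vec', text_mode='HDP')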