def initialize(self, text_tag_recommend_model, text_tag_model, train_mashup_id_list,
               train_api_id_list, test_mashup_id_list, test_api_id_list,
               feature_train_mashup_ids):
    """Build per-pair feature map and CF features for every mashup.

    Extracts text/tag features for test mashups from intermediate layers of
    ``text_tag_model``, maps every (mashup_id, api_id) pair to its extracted
    feature vector, then computes a CF feature for each mashup as the
    similarity-weighted sum of the latent factors of its topK most similar
    *train* mashups.

    :param text_tag_recommend_model: project model providing ``get_instances``
    :param text_tag_model: trained Keras model to tap intermediate layers from
    :param train_mashup_id_list: train mashup ids (1-D sequence)
    :param train_api_id_list: train api ids (1-D sequence)
    :param test_mashup_id_list: 2-D grid of test mashup ids
    :param test_api_id_list: 2-D grid of test api ids (same shape)
    :param feature_train_mashup_ids: train mashup ids aligned with
        ``self.ini_features_array`` / ``self.u_factors_matrix`` rows
    """
    prod = len(test_mashup_id_list) * len(test_mashup_id_list[0])
    # Flatten the 2-D test id grids into 1-D tuples.
    D1_test_mashup_id_list = tuple(np.array(test_mashup_id_list).reshape(prod, ))
    D1_test_api_id_list = tuple(np.array(test_api_id_list).reshape(prod, ))

    feature_test_mashup_ids = sorted(set(D1_test_mashup_id_list))  # test mashup ids, ascending
    feature_test_api_ids = [0] * len(feature_test_mashup_ids)
    # Instances carrying only mashup-side information (api ids are dummies).
    feature_instances_tuple = text_tag_recommend_model.get_instances(
        feature_test_mashup_ids, feature_test_api_ids, True)

    # Test samples: extract the mashup text and tag features from the two
    # inputs feeding 'concatenate_1'.
    text_tag_middle_model_1 = Model(
        inputs=[text_tag_model.inputs[0], text_tag_model.inputs[2]],
        outputs=[text_tag_model.get_layer('concatenate_1').input[0],
                 text_tag_model.get_layer('concatenate_1').input[2]])
    # text,tag features ordered by ascending mashup id
    text_tag_test_mashup_features = np.hstack(
        text_tag_middle_model_1.predict([*feature_instances_tuple], verbose=0))

    # Train + test samples: (mashup_id, api_id) -> x feature.
    # BUG FIX: D1_* are tuples; `list + tuple` raises TypeError, so coerce
    # both operands to list before concatenating.
    all_mashup_id_list = list(train_mashup_id_list) + list(D1_test_mashup_id_list)
    all_api_id_list = list(train_api_id_list) + list(D1_test_api_id_list)
    all_instances_tuple = text_tag_recommend_model.get_instances(
        all_mashup_id_list, all_api_id_list)
    # Outputs the combined text+tag feature of each (mashup, api) pair.
    text_tag_middle_model = Model(
        inputs=text_tag_model.inputs,
        outputs=[text_tag_model.get_layer('text_tag_feature_extracter').output])
    x_features = text_tag_middle_model.predict([*all_instances_tuple])
    self.x_feature_dim = len(x_features[0])

    # id-keyed map: (mashup_id, api_id) -> feature vector
    self._map = {(m_id, a_id): feature
                 for m_id, a_id, feature in zip(all_mashup_id_list,
                                                all_api_id_list, x_features)}

    # Train mashup ids first, then test mashup ids; rows of all_features align.
    all_feature_mashup_ids = list(feature_train_mashup_ids) + feature_test_mashup_ids
    all_features = np.vstack((self.ini_features_array, text_tag_test_mashup_features))

    # The CNN text feature and the tag embedding have different sizes, so the
    # similarities are computed separately and combined by a weighted sum.
    text_dim = self.inception_fc_unit_nums[-1]
    num_train = len(feature_train_mashup_ids)
    for i in range(len(all_feature_mashup_ids)):  # nearest neighbours for every mashup
        id2sim = {}
        for j in range(num_train):  # candidates come only from train (internal indices)
            if i != j:
                text_sim = cos_sim(all_features[i][:text_dim], all_features[j][:text_dim])
                tag_sim = cos_sim(all_features[i][text_dim:], all_features[j][text_dim:])
                id2sim[j] = self.text_weight * text_sim + (1 - self.text_weight) * tag_sim
        topK_indexes, topK_sims = zip(*(sorted(id2sim.items(), key=lambda x: x[1],
                                               reverse=True)[:self.topK]))
        # ids of the nearest train mashups for this mashup
        self.mashup_id2neighbors[all_feature_mashup_ids[i]] = [
            self.m_index2id[index] for index in topK_indexes]
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the weights
        cf_feature = np.zeros((self.num_feat))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * self.u_factors_matrix[topK_indexes[z]]
        self.mashup_id2CFfeature[all_feature_mashup_ids[i]] = cf_feature
def binary_keyword(text_tag_recommend_model):
    """Evaluate the WVSM and WJaccard keyword baselines on the test set.

    WVSM (Weighted Vector Space Model): cosine similarity of binary term
    vectors, weighted by api popularity.  WJaccard (Weighted Jaccard):
    Jaccard similarity of the word sets, weighted by api popularity.
    Results of both models are evaluated and appended to the summary csv.

    :param text_tag_recommend_model: project model providing the data access
        object ``pd`` and ``get_instances``
    """
    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        list(range(all_mashup_num)), list(range(all_api_num)), False))
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = \
        gd.get_binary_v(all_mashup_num, all_api_num)

    def wvsm_score(mashup_id, api_id):
        # cosine over binary term vectors, weighted by api popularity
        return cos_sim(mashup_binary_matrix[mashup_id],
                       api_binary_matrix[api_id]) * api2pop[api_id]

    def wjaccard_score(mashup_id, api_id):
        # |intersection| / |union| of the word sets, weighted by popularity
        mashup_set = set(mashup_words_list[mashup_id])
        api_set = set(api_words_list[api_id])
        return 1.0 * len(mashup_set.intersection(api_set)) / len(
            mashup_set.union(api_set)) * api2pop[api_id]

    # DEDUP: the two baselines shared an identical evaluation loop; run both
    # through one parameterized pass.
    for model_name, score_fn in (('WVSM', wvsm_score), ('WJaccard', wjaccard_score)):
        candidate_ids_list = []
        all_predict_results = []
        for i in range(len(Para.test_mashup_id_list)):
            test_mashup_id = Para.test_mashup_id_list[i][0]  # one mashup id per case
            candidate_ids = Para.test_api_id_list[i]
            candidate_ids_list.append(candidate_ids)
            all_predict_results.append(
                [score_fn(test_mashup_id, api_id) for api_id in candidate_ids])
        print('{} test,done!'.format(model_name))
        evaluate_result = evalute(candidate_ids_list, all_predict_results,
                                  Para.grounds, Para.topKs)  # evaluation
        csv_table_name = Para.data_name + model_name + "\n"  # model.name
        summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record
def get_mean(self, mashup_text_index, api_text_index):
    """Cosine similarity between the mean word embeddings of two texts.

    :param mashup_text_index: word indices of the mashup text
    :param api_text_index: word indices of the api text
    :return: cosine similarity of the two averaged embedding vectors
    """
    # NOTE(review): `dict.get` yields None for unknown indices — presumably
    # every index here is known; confirm against the vocabulary builder.
    lookup = self.wordindex2embedding.get
    mashup_vec = np.array([lookup(word) for word in mashup_text_index]).mean(axis=0)
    api_vec = np.array([lookup(word) for word in api_text_index]).mean(axis=0)
    return cos_sim(mashup_vec, api_vec)
def cpt_wod_cos_sim(self, id1, id2):
    """Cosine similarity between two words (by id), memoized in self.words_Sim.

    :param id1: word id
    :param id2: word id
    :return: similarity value; 1 when the ids are equal
    """
    if id1 == id2:
        return 1
    # Cache key is always (smaller, larger) so each unordered pair is stored once.
    key = (min(id1, id2), max(id1, id2))
    cached = self.words_Sim.get(key)
    if cached is not None:
        return cached
    sim = cos_sim(self.wordindex2embedding.get(key[0]),
                  self.wordindex2embedding.get(key[1]))
    self.words_Sim[key] = sim
    return sim
def TF_IDF(text_tag_recommend_model):
    """Evaluate the TF-IDF cosine-similarity baseline on the test set.

    Could live inside the Samanta class, but is kept separate to avoid clutter.

    :param text_tag_recommend_model: project model providing the base data
    :return: None (results are appended to the summary csv)
    """
    mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        [i for i in range(mashup_num)], [i for i in range(api_num)], False))
    mashup_tfidf, api_tfidf = gd.model_pcs('TF_IDF', mashup_num, api_num)

    candidate_ids_list = []
    all_predict_results = []
    # Each test case: one query mashup id plus its candidate api ids.
    for mashup_ids, candidate_ids in zip(Para.test_mashup_id_list,
                                         Para.test_api_id_list):
        query_id = mashup_ids[0]
        candidate_ids_list.append(candidate_ids)
        all_predict_results.append(
            [cos_sim(mashup_tfidf[query_id], api_tfidf[api_id])
             for api_id in candidate_ids])
    print('TF_IDF test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluation
    csv_table_name = Para.data_name + 'TF_IDF' + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record
def Samanta(text_tag_recommend_model, topK, if_pop=False):
    """Samanta baseline: HDP topic similarity combined with a CF score.

    For each test mashup, find its topK most similar train mashups by HDP
    feature cosine similarity, build a CF feature as the similarity-weighted
    sum of their latent factors, and score each candidate api by
    ``cf_score * hdp_similarity``.  With ``if_pop`` the topK-scored candidates
    are reranked by api popularity.

    :param text_tag_recommend_model: project model providing the base data
    :param topK: number of nearest train mashups (and rerank cutoff)
    :param if_pop: if True, rerank the best candidates by popularity
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()

    test_mashup_num = len(Para.test_mashup_id_list)
    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    mashup_hdp_path = os.path.join(Para.data_dir, 'mashup_hdp.txt')
    api_hdp_path = os.path.join(Para.data_dir, 'api_hdp.txt')

    # Compute HDP features of mashups/apis, or load the cached arrays.
    if not os.path.exists(api_hdp_path):
        # text,tag encoded vectors as arrays
        gd = gensim_data(*text_tag_recommend_model.get_instances(
            list(range(all_mashup_num)), list(range(all_api_num)), False))
        _mashup_hdp_features, _api_hdp_features = gd.model_pcs(
            'HDP', all_mashup_num, all_api_num)
        np.savetxt(mashup_hdp_path, _mashup_hdp_features)
        np.savetxt(api_hdp_path, _api_hdp_features)
    else:
        _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
        _api_hdp_features = np.loadtxt(api_hdp_path)

    # Hoisted out of the loop: fallback factor vector for apis unseen in train.
    api_zeros = np.zeros((Para.num_feat))
    candidate_ids_list = []
    all_predict_results = []
    for i in range(test_mashup_num):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # one mashup id per test case
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        # Similarity to every train mashup; u_factors_matrix uses local indices.
        id2sim = {}
        for local_train_mashup_index in range(len(Para.feature_train_mashup_ids)):
            id2sim[local_train_mashup_index] = cos_sim(
                _mashup_hdp_features[test_mashup_id],
                _mashup_hdp_features[
                    Para.feature_train_mashup_ids[local_train_mashup_index]])
        topK_indexes, topK_sims = zip(
            *(sorted(id2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize weights
        cf_feature = np.zeros((Para.num_feat))
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * Para.u_factors_matrix[topK_indexes[z]]

        predict_results = []
        temp_predict_results = []  # (api_id, score) pairs kept for popularity rerank
        for api_id in candidate_ids:
            # Apis from the test set may never appear in train; use zeros then.
            # IDIOM FIX: membership test on the dict itself, not `.keys()`.
            api_i_feature = (Para.i_factors_matrix[Para.i_id2index[api_id]]
                             if api_id in Para.i_id2index else api_zeros)
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                temp_predict_results.append((api_id, cf_score * sim_score))
            else:
                predict_results.append(cf_score * sim_score)
        if if_pop:
            # Keep the topK best-scored candidates, rerank them by popularity,
            # push every other candidate to the bottom with -1.
            max_k_pairs = heapq.nlargest(topK, temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]
        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluation
    _name = '_pop' if if_pop else ''
    csv_table_name = Para.data_name + 'Samanta_model' + _name + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result, Para.topKs)  # record