def process_texts(self):
    """Encode the description texts only (tags are not processed here).

    Reads the raw descriptions from ``meta_data.descriptions`` and stores
    the encoded/padded representation on ``self.encoded_texts``.
    """
    remove_punct = new_Para.param.remove_punctuation
    self.encoded_texts = encoding_padding(meta_data.descriptions, remove_punct)
def get_default_gd(default_encoding_texts=None, tag_times=2, mashup_only=False, strict_train=False):
    """Build a ``gensim_data`` object from encoded texts.

    The encoded corpus is laid out as four consecutive segments:
    mashup descriptions, api descriptions, mashup categories, api categories.
    An ``encoding_padding`` object may be passed in; otherwise a default one
    is built from ``meta_data.descriptions + meta_data.tags``.

    :param default_encoding_texts: optional pre-built encoding_padding object
    :param tag_times: how many times tags are repeated (tunes tag weight)
    :param mashup_only: forwarded to gensim_data
    :param strict_train: forwarded to gensim_data
    :return: a gensim_data instance
    """
    if default_encoding_texts is None:
        default_encoding_texts = encoding_padding(meta_data.descriptions + meta_data.tags, True)

    corpus = default_encoding_texts.texts_in_index_nopadding
    m_num = meta_data.mashup_num
    a_num = meta_data.api_num

    # Segment boundaries inside the flat corpus.
    desc_split = m_num                 # end of mashup descriptions
    cate_start = m_num + a_num         # end of api descriptions
    cate_split = 2 * m_num + a_num     # end of mashup categories

    mashup_descriptions = corpus[:desc_split]
    api_descriptions = corpus[desc_split:cate_start]
    mashup_categories = corpus[cate_start:cate_split]
    api_categories = corpus[cate_split:]

    return gensim_data(mashup_descriptions, api_descriptions,
                       mashup_categories, api_categories,
                       tag_times, mashup_only=mashup_only, strict_train=strict_train)
def process_text(self):
    """Encode mashup and api description texts only (no tags).

    The corpus order is mashup descriptions first, then api descriptions;
    the encoded result is stored on ``self.encoded_texts``.
    """
    m_descs, a_descs, _m_cates, _a_cates = self.pd.get_all_texts()
    corpus = m_descs + a_descs  # mashups first, then apis; tags excluded
    self.encoded_texts = encoding_padding(corpus, self.remove_punctuation)
def process_text(self):
    """Encode mashup and service texts together, including categories.

    The corpus order matters for downstream indexing: mashup descriptions,
    then api descriptions, then mashup categories, then api categories.
    The encoded/padded result is stored on ``self.encoded_texts``.
    """
    mashup_descriptions, api_descriptions, mashup_categories, api_categories = \
        self.pd.get_all_texts(self.Category_type)
    # mashups first, then apis, then the two category lists (order matters).
    descriptions = mashup_descriptions + api_descriptions + mashup_categories + api_categories
    # NOTE(review): removed a dead triple-quoted block that dumped all texts
    # to '../data/all_texts' — it was commented-out debug code left as a
    # no-op string-expression statement.
    self.encoded_texts = encoding_padding(descriptions, self.remove_punctuation)
def __init__(self, model_name='PasRec', semantic_mode='HDP', LDA_topic_num=None, epoch_num=15,
             neighbor_size=15, topTopicNum=3, cluster_mode='LDA', cluster_mode_topic_num=100):
    """Set up a HIN-based recommender (PasRec / IsRec family).

    :param model_name: one of 'PasRec', 'PasRec_2path', 'IsRec', 'IsRec_best';
        selects how many meta-path weights are initialized.
    :param semantic_mode: semantic feature mode for HIN text similarity
        (e.g. 'HDP'); only actually used by IsRec_best — PasRec/IsRec compute
        text similarity via topics-as-tags or EmbMax instead.
    :param LDA_topic_num: topic count for the semantic model; None means
        "use the model's default".
    :param epoch_num: number of training epochs.
    :param neighbor_size: neighborhood size when searching nearest neighbors.
    :param topTopicNum: in PasRec, used for content similarity; in IsRec,
        used to pick neighbors from the top-K topic clusters.
    :param cluster_mode: clustering/topic model used to group mashups (e.g. 'LDA').
    :param cluster_mode_topic_num: topic count for the clustering model.
    """
    self.simple_name = model_name
    self.epoch_num = epoch_num
    self.neighbor_size = neighbor_size  # size of the neighborhood when finding nearest neighbors
    self.topTopicNum = topTopicNum

    # Each variant uses a different number of meta-paths; weights start uniform.
    if self.simple_name == 'IsRec_best':
        self.p1_weight, self.p2_weight, self.p3_weight = 1 / 3, 1 / 3, 1 / 3
        self.path_weights = [self.p1_weight, self.p2_weight, self.p3_weight]
    elif self.simple_name == 'PasRec_2path':
        self.p1_weight, self.p2_weight = 1 / 2, 1 / 2
        self.path_weights = [self.p1_weight, self.p2_weight]
    elif self.simple_name == 'IsRec':
        self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight, self.p5_weight, self.p6_weight, self.p7_weight = 1 / 7, 1 / 7, 1 / 7, 1 / 7, 1 / 7, 1 / 7, 1 / 7
        self.path_weights = [
            self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight,
            self.p5_weight, self.p6_weight, self.p7_weight
        ]
    else:  # default (plain PasRec): 6 meta-paths
        self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight, self.p5_weight, self.p6_weight = 1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6, 1 / 6
        self.path_weights = [
            self.p1_weight, self.p2_weight, self.p3_weight, self.p4_weight,
            self.p5_weight, self.p6_weight
        ]

    self.learning_rate = 0.001
    self.reg = 0.001

    # NOTE(review): LDA_topic_num is normalized to '' here but is NOT part of
    # the model-name format string below — presumably dropped at some point;
    # confirm whether it should appear in the name.
    if LDA_topic_num is None:
        LDA_topic_num = ''
    self.model_name = '{}_{}_epoch{}_nbSize{}TopicNum{}{}{}NEW'.format(
        model_name, semantic_mode, epoch_num, neighbor_size, topTopicNum,
        cluster_mode, cluster_mode_topic_num)
    self.model_dir = dataset.crt_ds.model_path.format(self.model_name)  # model path
    # The learned path weights are the only state persisted for this model.
    self.weight_path = os.path.join(self.model_dir, 'weights.npy')

    # --- dataset-related state ---
    self.all_mashup_num = meta_data.mashup_num
    self.all_api_num = meta_data.api_num
    self.his_m_ids = dataset.crt_ds.his_mashup_ids
    self.his_m_ids_set = set(self.his_m_ids)
    # self.train_mashup_api_list = meta_data.mashup_api_list
    # self.train_mashup_api_dict = meta_data.pd.get_mashup_api_pair('dict')
    # Strict training set: only pairs whose mashup is in the historical set.
    self.train_mashup_api_list = [
        pair for pair in meta_data.mashup_api_list
        if pair[0] in self.his_m_ids_set
    ]
    self.train_mashup_api_dict = {
        key: value
        for key, value in meta_data.pd.get_mashup_api_pair('dict').items()
        if key in self.his_m_ids_set
    }
    print(len(self.train_mashup_api_dict))

    # Training-set inverse index: api_id -> set(mashup_ids that invoked it).
    self.train_aid2mids = {}
    for mashup_id, api_id in self.train_mashup_api_list:
        if api_id not in self.train_aid2mids.keys():
            self.train_aid2mids[api_id] = set()
        self.train_aid2mids[api_id].add(mashup_id)
    self.his_a_ids = list(
        self.train_aid2mids.keys())  # api ids that appear in the training set
    # Score for an api never invoked by any historical mashup
    # (0.5 performed badly compared with 1/0, per the original note).
    self.notInvokeScore = 0

    # --- text / HIN similarity state ---
    self.HIN_path = os.path.join(self.model_dir, 'HIN_sims')  # root dir for HIN_sim files
    self.semantic_mode = semantic_mode
    self.LDA_topic_num = LDA_topic_num
    # Text encoder over descriptions + tags.
    encoded_texts = encoding_padding(
        meta_data.descriptions + meta_data.tags,
        new_Para.param.remove_punctuation)
    # Embedding vector for every encoded word.
    embedding_matrix = get_embedding_matrix(
        encoded_texts.word2index,
        new_Para.param.embedding_name,
        dimension=new_Para.param.embedding_dim)

    # HIN text-similarity features — only IsRec_best actually uses these
    # (PasRec/IsRec use topics-as-tags or EmbMax for text similarity).
    HIN_gd = get_default_gd(encoded_texts,
                            tag_times=0,
                            mashup_only=True,
                            strict_train=True)  # gensim corpus without tags
    self._mashup_features, self._api_features = HIN_gd.model_pcs(
        self.semantic_mode, self.LDA_topic_num)  # IsRec_best needs TF-IDF here

    features = self._mashup_features, self._api_features
    # Object that computes HIN similarities from the mashup/api text features.
    self.mhs = mashup_HIN_sims(
        embedding_matrix,
        encoded_texts,
        semantic_name=self.semantic_mode,
        HIN_path=self.HIN_path,
        features=features,
        if_text_sem=True,
        if_tag_sem=False)
    # Per mashup id (with its invoked apis): path similarities to historical mashups.
    self.mID2PathSims = {}
    self.HIN_sims_changed_flag = False

    # Topic clustering: topTopicNum drives PasRec content similarity and
    # IsRec neighbor search within the top-K clusters.
    topic_gd = get_default_gd(encoded_texts,
                              tag_times=0,
                              mashup_only=True,
                              strict_train=True)  # gensim corpus without tags
    topic_gd.model_pcs(
        cluster_mode, cluster_mode_topic_num)  # HDP for now; switch to LDA once topic count is fixed
    self.m_id2topic, self.a_id2topic = topic_gd.get_topTopics(
        self.topTopicNum)
    # topic -> mashup ids: groups ALL mashups by topic (train/test not separated).
    self.topic2m_ids = {}
    for m_id, topic_indexes in enumerate(self.m_id2topic):
        for topic_index in topic_indexes:
            if topic_index not in self.topic2m_ids:
                self.topic2m_ids[topic_index] = []
            self.topic2m_ids[topic_index].append(m_id)

    self.read_model()  # mainly loads the weight parameters; the rest is unimportant
def process_tags(self):
    """Encode the tag texts and store the result on ``self.encoded_tags``."""
    remove_punct = new_Para.param.remove_punctuation
    self.encoded_tags = encoding_padding(meta_data.tags, remove_punct)
def process_texts(self):
    """Encode the description texts and store them on ``self.encoded_texts``."""
    remove_punct = new_Para.param.remove_punctuation
    self.encoded_texts = encoding_padding(meta_data.descriptions, remove_punct)