Example #1
 def get_text_embedding_layer(self):
     """"
     得到定制的word embedding层,在feature_extracter_from_texts中使用
     """
     if self.text_embedding_layer is None:
         # 得到词典中每个词对应的embedding
         num_words = min(self.args.MAX_NUM_WORDS,
                         len(data_repository.get_md().des_pd.word2index)
                         ) + 1  # 实际词典大小 +1  因为0代表0的填充向量
         self.text_embedding_matrix = get_embedding_matrix(
             data_repository.get_md().des_pd.word2index,
             self.args.embedding_name,
             dimension=self.args.embedding_dim)
         print('built embedding matrix, done!')
         self.text_embedding_layer = Embedding(
             num_words,
             self.args.embedding_dim,
             embeddings_initializer=Constant(self.text_embedding_matrix),
             embeddings_regularizer=regularizers.l2(
                 self.args.embeddings_regularizer),
             input_length=self.args.MAX_SEQUENCE_LENGTH,
             mask_zero=True,
             trainable=self.args.embedding_train,
             name='text_embedding_layer')  # note: mask_zero=True matters here
         print('built text embedding layer, done!')
     return self.text_embedding_layer
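
A minimal usage sketch of a layer built this way, with hypothetical sizes; the random matrix stands in for get_embedding_matrix, which is not shown here:

import numpy as np
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Model

MAX_SEQUENCE_LENGTH, num_words, embedding_dim = 150, 5000, 50
matrix = np.random.rand(num_words, embedding_dim)  # stand-in for get_embedding_matrix(...)

embedding_layer = Embedding(num_words,
                            embedding_dim,
                            embeddings_initializer=Constant(matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            mask_zero=True,  # index 0 is the padding vector, as above
                            trainable=False)
text_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
model = Model(text_input, embedding_layer(text_input))
print(model.output_shape)  # (None, 150, 50)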
Example #2
 def get_tag_embedding_layer(self):
     """"
     同text,处理tags,得到定制的word embedding层,在tag_feature_extractor中使用
     """
     if self.tag_embedding_layer is None:
         # 得到词典中每个词对应的embedding
         num_words = min(self.args.MAX_NUM_WORDS,
                         len(data_repository.get_md().cate_pd.word2index)
                         ) + 1  # 实际词典大小 +1  因为0代表0的填充向量
         self.tag_embedding_matrix = get_embedding_matrix(
             data_repository.get_md().cate_pd.word2index,
             self.args.embedding_name,
             dimension=self.args.embedding_dim)
         print('built tag embedding matrix, done!')
         self.tag_embedding_layer = Embedding(
             num_words,
             self.args.embedding_dim,
             embeddings_initializer=Constant(self.tag_embedding_matrix),
             embeddings_regularizer=regularizers.l2(
                 self.args.embeddings_regularizer),
             input_length=self.args.MAX_TAGS_NUM,
             mask_zero=True,
             trainable=self.args.embedding_train,
             name='tag_embedding_layer')
         print('built tag embedding layer, done!')
     return self.tag_embedding_layer
Example #3
    def __init__(self, wordindex2emb, gd, HIN_path='', features=None, semantic_name='', if_text_sem=True,
                 if_tag_sem=True, if_mashup_sem=True, if_api_sem=True):

        self.ws = word_sim(wordindex2emb)  # embedding-layer parameters
        self.num_users = data_repository.get_md().mashup_num
        self.num_items = data_repository.get_md().api_num
        self.semantic_name = semantic_name  # name of the input features; empty by default (features from the CI model); others such as HDP must pass a name
        self.gd = gd  # its encodings are used later

        # compute similarities from externally supplied content and tag features
        if features is not None:
            if len(features) == 2:
                if if_text_sem and not if_tag_sem:  # may use only the text semantic features, as in PasRec
                    self.mashup_texts_features, self.api_texts_features = features
                if if_mashup_sem and not if_api_sem:  # may use only the mashup-side semantic features, as in HIN
                    self.mashup_texts_features, self.mashup_tag_features = features
            elif len(features) == 4:
                self.mashup_texts_features, self.mashup_tag_features, self.api_texts_features, self.api_tag_features = features

        self.mashup_apis_dict = list2dict(data_repository.get_md().mashup_api_list)
        self.api_id2provider = data_repository.get_md().api_df['API_Provider']

        # self.path = os.path.join(HIN_path, self.name)  # stored under the CI folder! data_repository.get_ds().data_root  no_kcv_root_path
        self.path = HIN_path  # path where the similarities are stored: kcvIndex/CIModelPath/HIN_sims/

        if not os.path.exists(self.path):
            os.makedirs(self.path)

        self.p1_sims, self.p2_sims, self.p3_sims, self.p4_sims, self.p5_sims, self.p6_sims = None, None, None, None, None, None
        self.p1_sims_sem, self.p2_sims_sem, self.p3_sims_sem, self.p4_sims_sem = None, None, None, None

        self.flag1, self.flag2, self.flag3, self.flag4, self.flag5, self.flag6 = False, False, False, False, False, False
        self.flag1_sem, self.flag2_sem, self.flag4_sem = False, False, False
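
The constructor relies on a list2dict helper that is not shown; a plausible sketch, assuming mashup_api_list holds (mashup_id, api_id) pairs and the dict maps each mashup to the set of its apis:

def list2dict(mashup_api_list):
    # group (mashup_id, api_id) pairs into {mashup_id: set of api_ids}
    result = {}
    for m_id, a_id in mashup_api_list:
        result.setdefault(m_id, set()).add(a_id)
    return result

print(list2dict([(0, 3), (0, 5), (1, 3)]))  # {0: {3, 5}, 1: {3}}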
Example #4
def get_default_gd(tag_times=2, mashup_only=False, strict_train=False):
    # process the mashup and api texts + tags uniformly
    gd = gensim_data(get_iterable_values(data_repository.get_md().mashup_df, 'final_description'),
                     get_iterable_values(data_repository.get_md().mashup_df, 'Categories'),
                     get_iterable_values(data_repository.get_md().api_df, 'final_description'),
                     get_iterable_values(data_repository.get_md().api_df, 'Categories'),
                     tag_times=tag_times,
                     mashup_only=mashup_only,
                     strict_train=strict_train)  # controls how many times each tag is repeated
    return gd
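
Typical call sequence, mirroring how the Samanta baseline below uses it:

gd = get_default_gd()                                  # default tag_times=2
_mashup_features, _api_features = gd.model_pcs('HDP')  # see model_pcs in Example #8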
Example #5
 def set_embedding_layers(self):
     self.api_implict_emb_layer = Embedding(
         data_repository.get_md().api_num + 1,
         self.args.implict_feat_dim,
         embeddings_initializer=Constant(self.i_factors_matrix),
         mask_zero=False,
         trainable=False,
         name='api_implict_embedding_layer')
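
The frozen layer then maps padded api-id sequences to their MF factors; a self-contained sketch with hypothetical sizes, where the all-zero row with id api_num serves as padding (see Example #12):

import numpy as np
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Model

api_num, implict_feat_dim = 1000, 25
i_factors_matrix = np.zeros((api_num + 1, implict_feat_dim))  # row api_num stays all-zero

layer = Embedding(api_num + 1,
                  implict_feat_dim,
                  embeddings_initializer=Constant(i_factors_matrix),
                  mask_zero=False,
                  trainable=False)
ids = Input(shape=(3,), dtype='int32')  # e.g. up to 3 selected apis
model = Model(ids, layer(ids))
print(model.predict(np.array([[2, 7, api_num]])).shape)  # (1, 3, 25)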
Example #6
    def get_binary_v(self):
        dict_size = len(self.dct)
        mashup_binary_matrix = np.zeros((data_repository.get_md().mashup_num+1, dict_size))
        api_binary_matrix = np.zeros((data_repository.get_md().api_num+1, dict_size))
        mashup_words_list = []  # all words appearing in each mashup
        api_words_list = []
        for id in range(data_repository.get_md().mashup_num+1):
            temp_words_list, _ = zip(*self.mashup_dow[id])
            mashup_words_list.append(temp_words_list)
            for j in temp_words_list:  # indexes of the words that occur
                mashup_binary_matrix[id][j] = 1.0

        for id in range(data_repository.get_md().api_num+1):
            temp_words_list, _ = zip(*self.api_dow[id])
            api_words_list.append(temp_words_list)
            for j in temp_words_list:  # indexes of the words that occur
                api_binary_matrix[id][j] = 1.0
        return mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list
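
One common consumer of such binary bag-of-words matrices is a set-overlap similarity; a hypothetical sketch (not taken from this codebase) computing the Jaccard similarity of two rows:

import numpy as np

def jaccard_sim(vec_a, vec_b):
    # |A intersect B| / |A union B| over binary word-occurrence vectors
    inter = np.sum(np.logical_and(vec_a, vec_b))
    union = np.sum(np.logical_or(vec_a, vec_b))
    return inter / union if union else 0.0

a = np.array([1.0, 0.0, 1.0, 1.0])
b = np.array([1.0, 1.0, 0.0, 1.0])
print(jaccard_sim(a, b))  # 0.5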
Example #7
 def get_name(self):
     """
     用在记录结果部分,记录数据信息+模型信息
     :return:
     """
     if not self.name:
         self.name = data_repository.get_md(
         ).name + '_' + data_repository.get_ds(
         ).name + '_' + self.simple_name
     return self.name
Example #8
    def model_pcs(self, model_name, LDA_topic_num=None):
        # run a model over the corpus and return the mashup and api features; different models can be applied to the same corpus in turn
        # HDP output format: [(0, 0.032271167132309014), (1, 0.02362695056720504)]
        if self.mashup_only:
            if self.strict_train:
                train_corpus = self.train_mashup_dow
            else:
                train_corpus = self.mashup_dow
        else:
            if self.strict_train:
                train_corpus = self.train_mashup_dow + self.api_dow
            else:
                train_corpus = self.mashup_dow + self.api_dow

        if model_name == 'HDP':
            self.model = HdpModel(train_corpus, self.dct)
            self.num_topics = self.model.get_topics().shape[0]
            print('num_topics', self.num_topics)
        elif model_name == 'TF_IDF':
            self.model = TfidfModel(train_corpus)
            self.num_topics = len(self.dct)
        elif model_name == 'LDA':
            if LDA_topic_num is None:
                self.model = LdaModel(train_corpus)
            else:
                self.model = LdaModel(train_corpus, num_topics=LDA_topic_num)
            self.num_topics = self.model.get_topics().shape[0]
        else:
            raise ValueError('wrong gensim_model name!')

        # run the model over the texts to get sparse feature vectors, then convert them to dense np arrays (one value per topic)
        # *** mashup_dow and api_dow cover all mashup/api texts by default, so the feature lists can be indexed by global id ***
        self.mashup_features = [self.model[mashup_info] for mashup_info in self.mashup_dow]  # feature of each mashup and api
        self.api_features = [self.model[api_info] for api_info in self.api_dow]
        self.dense_mashup_features = np.zeros((data_repository.get_md().mashup_num, self.num_topics))
        self.dense_api_features = np.zeros((data_repository.get_md().api_num, self.num_topics))
        for i in range(data_repository.get_md().mashup_num):  # only some dimensions have values; convert to a regular array
            for index, value in self.mashup_features[i]:
                self.dense_mashup_features[i][index] = value
        for i in range(data_repository.get_md().api_num):
            for index, value in self.api_features[i]:
                self.dense_api_features[i][index] = value
        return self.dense_mashup_features, self.dense_api_features
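
The sparse-to-dense conversion at the end is the usual gensim pattern; a standalone illustration with toy values:

import numpy as np

num_topics = 4
sparse_feature = [(0, 0.6), (2, 0.4)]  # gensim output: (topic_index, weight) pairs
dense = np.zeros(num_topics)
for index, value in sparse_feature:
    dense[index] = value
print(dense)  # [0.6 0.  0.4 0. ]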
Example #9
 def set_embedding_matrixs(self):
     # id->embedding
     self.i_factors_matrix = np.zeros(
         (data_repository.get_md().api_num + 1, self.args.implict_feat_dim))
     api_emb_df = data_repository.get_ds().MF_obj.api_emb_df
     for api_id, embedding in zip(api_emb_df.index.tolist(),
                                  api_emb_df.embedding.tolist()):
         if isinstance(embedding, str):
             embedding = eval(embedding)  # embeddings may be serialized as strings
         self.i_factors_matrix[api_id] = embedding
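
eval on strings read back from a DataFrame works but executes arbitrary code; if the stored embeddings are plain list literals, ast.literal_eval is a safer drop-in:

import ast

embedding = '[0.12, -0.3, 0.07]'  # an embedding serialized as a string
if isinstance(embedding, str):
    embedding = ast.literal_eval(embedding)  # parses literals only, never runs code
print(embedding[1])  # -0.3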
Example #10
 def get_topTopics(self, topTopicNum=3):  # pick the topTopicNum topics with the highest probability, as [(), (), ...]
     mashup_topics = []
     api_topics = []
     for index in range(data_repository.get_md().mashup_num):
         sorted_mashup_feature = sorted(self.mashup_features[index], key=lambda x: x[1], reverse=True)
         try:
             topic_indexes, _ = zip(*sorted_mashup_feature)
         except ValueError:
             # sometimes mashup_bow is non-empty but mashup_feature is empty; fall back to random topics
             topic_indexes = random.sample(range(self.num_topics), topTopicNum)
         num = min(len(topic_indexes), topTopicNum)
         mashup_topics.append(topic_indexes[:num])
     for index in range(data_repository.get_md().api_num):
         sorted_api_feature = sorted(self.api_features[index], key=lambda x: x[1], reverse=True)
         try:
             topic_indexes, _ = zip(*sorted_api_feature)
         except ValueError:
             topic_indexes = random.sample(range(self.num_topics), topTopicNum)
         num = min(len(topic_indexes), topTopicNum)
         api_topics.append(topic_indexes[:num])
     return mashup_topics, api_topics
Example #11
    def get_mashup_api_features(self):
        """
        得到每个mashup和api经过特征提取器或者平均池化得到的特征,可以直接用id索引,供构造instance的文本部分使用
        :param text_tag_recommend_model:
        :param mashup_num:
        :param api_num:
        :return:
        """
        if os.path.exists(self.ma_text_tag_feas_path):
            with open(self.ma_text_tag_feas_path, 'rb') as f1:
                mashup_texts_features, mashup_tag_features, api_texts_features, api_tag_features = pickle.load(
                    f1)
        else:
            # the four outputs are user_text_vec, item_text_vec, user_tag_vec, item_tag_vec
            text_tag_middle_model = Model(
                inputs=[*self.model.inputs[:2]],
                outputs=[
                    self.model.get_layer('all_content_concatenate').input[0],
                    self.model.get_layer('all_content_concatenate').input[1],
                    self.model.get_layer('all_content_concatenate').input[2],
                    self.model.get_layer('all_content_concatenate').input[3]
                ])

            feature_mashup_ids = data_repository.get_md(
            ).mashup_df.index.tolist()
            feature_instances_tuple = self.get_instances(
                feature_mashup_ids, [0] * len(feature_mashup_ids))
            mashup_texts_features, _1, mashup_tag_features, _2 = text_tag_middle_model.predict(
                [*feature_instances_tuple], verbose=0)

            feature_api_ids = data_repository.get_md().api_df.index.tolist()
            feature_instances_tuple = self.get_instances(
                [0] * len(feature_api_ids), feature_api_ids)
            _1, api_texts_features, _2, api_tag_features = text_tag_middle_model.predict(
                [*feature_instances_tuple], verbose=0)

            with open(self.ma_text_tag_feas_path, 'wb') as f2:
                pickle.dump((mashup_texts_features, mashup_tag_features,
                             api_texts_features, api_tag_features), f2)
        return mashup_texts_features, mashup_tag_features, api_texts_features, api_tag_features
Example #12
 def set_mashup_api_features(self, recommend_model):
     """
     TODO
     设置mashup和api的text和tag特征,用于计算相似度,进而计算mashup的NI表示;
     在get_model()和get_instances()之前设置
     :param recommend_model: 利用CI模型获得所有特征向量
     :return:
     """
     self.mashup_texts_features, self.mashup_tag_features, self.api_texts_features, self.api_tag_features = \
         recommend_model.get_mashup_api_features(data_repository.get_md().mashup_num, data_repository.get_md().api_num)
     # apis need one extra all-zero row appended at the end, with id api_num, used to pad slt_apis
     self.api_tag_features = np.vstack(
         (self.api_tag_features, np.zeros((1, self.word_embedding_dim))))
     self.api_texts_features = np.vstack(
         (self.api_texts_features,
          np.zeros((1, self.inception_fc_unit_nums[-1]))))
     self.features = (self.mashup_texts_features, self.mashup_tag_features,
                      self.api_texts_features, self.api_tag_features)
     self.CI_path = recommend_model.model_dir
Example #13
    def set_others(self):
        # call after set_data() or read_data()
        self.his_mashup_ids = np.unique(
            self.train_df['mashup'].values)  # sorted ids of the training mashups
        self.his_mashup_ids_set = set(self.his_mashup_ids)
        print('mashup num in training set :{}'.format(len(
            self.his_mashup_ids)))
        self.train_mashup_api_list = list(
            filter(lambda x: x[0] in self.his_mashup_ids_set,
                   data_repository.get_md().mashup_api_list))
        self.train_mashup_api_dict = list2dict(self.train_mashup_api_list)

        # the model varies with the data, so it is stored under the data folder
        self.model_path = os.path.join(self.data_root,
                                       '{}')  # .format(simple_model_name); the CI path
        self.new_best_epoch_path = os.path.join(
            '{}', 'best_epoch.dat')  # model_dir,  .format(simple_model_name)
        self.new_model_para_path = os.path.join(
            '{}',
            'weights_{}.h5')  # model_dir, .format(simple_model_name, epoch)
        self.new_best_NDCG_path = os.path.join(
            '{}', 'best_NDCG.dat')  # model_dir,  .format(simple_model_name)
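
The '{}' placeholders are filled in later, as the inline comments indicate; a standalone illustration with a hypothetical data root, model name and epoch:

import os

data_root = 'data/split0'  # hypothetical
model_dir = os.path.join(data_root, '{}').format('CI')                    # data/split0/CI
best_epoch_path = os.path.join('{}', 'best_epoch.dat').format(model_dir)  # data/split0/CI/best_epoch.dat
para_path = os.path.join('{}', 'weights_{}.h5').format(model_dir, 10)     # data/split0/CI/weights_10.h5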
Example #14
    def feature_extracter_from_texts(self, mashup_api=None):
        """
        对mashup,service的description均需要提取特征,右路的文本的整个特征提取过程
        公用的话应该封装成新的model!
        :param mashup_api: 默认是None,只有'HDP'/'Bert'时为非空
        :return: 输出的是一个封装好的model,所以可以被mashup和api公用
        """
        if self.args.text_extracter_mode in fixed_vector_modes and mashup_api is not None:

            if self.args.text_extracter_mode == 'Bert':
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                bertModel = TFBertModel.from_pretrained(
                    'bert-base-uncased')  # TF model class, to match return_tensors='tf' below

                if mashup_api == 'mashup':
                    if self.mashup_text_feature_extracter is None:  # not computed yet
                        mashup_texts = get_iterable_values(
                            data_repository.get_md().mashup_df,
                            'final_description',
                            return_ele_type='str')
                        dense_mashup_features = bertModel(
                            tokenizer(mashup_texts, return_tensors='tf'))
                        self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                            'mashup', dense_mashup_features)
                    return self.mashup_text_feature_extracter
                elif mashup_api == 'api':
                    if self.api_text_feature_extracter is None:
                        api_texts = get_iterable_values(
                            data_repository.get_md().api_df,
                            'final_description',
                            return_ele_type='str')
                        dense_api_features = bertModel(
                            tokenizer(api_texts, return_tensors='tf'))
                        self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                            'api', dense_api_features)
                    return self.api_text_feature_extracter
                else:
                    raise TypeError('wrong mashup_api mode!')

            else:
                if self.gd is None:
                    self.gd = get_default_gd(
                        tag_times=0, mashup_only=False,
                        strict_train=True)  # process the texts with gensim; tags are not added to the text
                    self.gd.model_pcs(self.args.text_extracter_mode)

                if mashup_api == 'mashup':
                    if self.mashup_text_feature_extracter is None:  # not computed yet
                        self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                            'mashup', self.gd.dense_mashup_features)
                    return self.mashup_text_feature_extracter
                elif mashup_api == 'api':
                    if self.api_text_feature_extracter is None:
                        self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                            'api', self.gd.dense_api_features)
                    return self.api_text_feature_extracter
                else:
                    raise TypeError('wrong mashup_api mode!')

        elif self.text_feature_extracter is None:  # not computed yet
            if 'trainable_bert' in self.args.text_extracter_mode.lower():
                self.text_feature_extracter = TFDistilBertModel.from_pretrained(
                    "distilbert-base-uncased")  # layer
                if self.args.frozen_bert:
                    self.text_feature_extracter.trainable = False
            else:
                text_input = Input(shape=(self.args.MAX_SEQUENCE_LENGTH, ),
                                   dtype='int32')
                text_embedding_layer = self.get_text_embedding_layer(
                )  # the parameters should still be made external inputs!
                text_embedded_sequences = text_embedding_layer(
                    text_input)  # ids -> 2D embedded sequence

                if self.args.text_extracter_mode in (
                        'inception', 'textCNN'):  # 2D to 3D; the third dimension is the channel
                    # print(text_embedded_sequences.shape)
                    text_embedded_sequences = Lambda(
                        lambda x: tf.expand_dims(x, axis=3))(
                            text_embedded_sequences)  # tf and keras tensors differ!
                    print(text_embedded_sequences.shape)

                if self.args.text_extracter_mode == 'inception':
                    x = inception_layer(
                        text_embedded_sequences, self.args.embedding_dim,
                        self.args.inception_channels,
                        self.args.inception_pooling)  # inception processing
                    print('built inception layer, done!')
                elif self.args.text_extracter_mode == 'textCNN':
                    x = textCNN_feature_extracter_from_texts(
                        text_embedded_sequences, self.args)
                elif self.args.text_extracter_mode == 'LSTM':
                    x = LSTM_feature_extracter_from_texts(
                        text_embedded_sequences, self.args)
                else:
                    raise TypeError('wrong extracter!')
                print('text feature after inception/textCNN/LSTM whole_model,',
                      x)  # inspect the module's output features before the MLP transform

                for FC_unit_num in self.args.inception_fc_unit_nums:
                    x = Dense(FC_unit_num,
                              kernel_regularizer=l2(self.args.l2_reg))(
                                  x)  # , activation='relu'
                    if self.args.inception_MLP_BN:
                        x = BatchNormalization(scale=False)(x)
                    x = PReLU()(x)
                    if self.args.inception_MLP_dropout:
                        x = tf.keras.layers.Dropout(0.5)(x)
                self.text_feature_extracter = Model(
                    text_input, x, name='text_feature_extracter')
        return self.text_feature_extracter
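
Because the extractor is returned as a wrapped Model, both branches can reuse the same weights; a sketch assuming an instance rec of this class with rec.args.MAX_SEQUENCE_LENGTH set:

from tensorflow.keras.layers import Input

extracter = rec.feature_extracter_from_texts()  # one shared sub-model
mashup_text_input = Input(shape=(rec.args.MAX_SEQUENCE_LENGTH,), dtype='int32')
api_text_input = Input(shape=(rec.args.MAX_SEQUENCE_LENGTH,), dtype='int32')
mashup_text_feature = extracter(mashup_text_input)  # the same weights are applied
api_text_feature = extracter(api_text_input)        # to both branches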
Example #15
 def get_all_encoded_comments(self):
     self.unpadded_encoded_mashup_texts = self.encode(
         get_iterable_values(data_repository.get_md().mashup_df, 'final_description'))
     self.unpadded_encoded_mashup_tags = self.encode(
         get_iterable_values(data_repository.get_md().mashup_df, 'Categories'))
     self.unpadded_encoded_api_texts = self.encode(
         get_iterable_values(data_repository.get_md().api_df, 'final_description'))
     self.unpadded_encoded_api_tags = self.encode(
         get_iterable_values(data_repository.get_md().api_df, 'Categories'))
Example #16
    def set_text_tag_enconding_layers(self):
        # set the encoding layers from the text and tag encodings derived from the meta-data
        all_mashup_num = data_repository.get_md().mashup_num
        all_api_num = data_repository.get_md().api_num

        if 'bert' in self.args.text_extracter_mode.lower():
            tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-uncased')

            def encode(text):  # encode with the DistilBertTokenizer
                encoded_text = tokenizer.encode(
                    text,
                    add_special_tokens=True,
                    truncation=True,
                    padding='max_length',
                    max_length=self.args.MAX_BERT_SEQUENCE_LENGTH)
                return encoded_text

            mid2encoded_text = data_repository.get_md(
            ).mashup_df['Description'].tolist()
            mid2encoded_text[-1] = ''  # '' would otherwise be read back as nan; convert before every use! TODO
            mid2encoded_text = np.array(
                [encode(text) for text in mid2encoded_text])
            aid2encoded_text = data_repository.get_md(
            ).api_df['Description'].tolist()
            aid2encoded_text[-1] = ''
            aid2encoded_text = np.array(
                [encode(text) for text in aid2encoded_text])
        else:
            mid2encoded_text = data_repository.get_md(
            ).mashup_df['padded_description'].tolist()
            if isinstance(mid2encoded_text[0], str):  # TODO
                mid2encoded_text = list(map(eval, mid2encoded_text))
            aid2encoded_text = data_repository.get_md(
            ).api_df['padded_description'].tolist()
            if isinstance(aid2encoded_text[0], str):  # TODO
                aid2encoded_text = list(map(eval, aid2encoded_text))

        MAX_LENGTH = self.args.MAX_BERT_SEQUENCE_LENGTH if 'bert' in self.args.text_extracter_mode.lower(
        ) else self.args.MAX_SEQUENCE_LENGTH
        self.mashup_text_encoding_layer = Embedding(
            all_mashup_num + 1,
            MAX_LENGTH,
            embeddings_initializer=Constant(mid2encoded_text),
            mask_zero=True,
            input_length=1,
            trainable=False,
            name='mashup_text_encoding_layer')

        self.api_text_encoding_layer = Embedding(
            all_api_num + 1,
            MAX_LENGTH,
            embeddings_initializer=Constant(aid2encoded_text),
            mask_zero=True,
            input_length=1,
            trainable=False,
            name='api_text_encoding_layer')

        mid2encoded_tags = data_repository.get_md(
        ).mashup_df['padded_categories'].tolist()
        if isinstance(mid2encoded_tags[0], str):  # TODO
            mid2encoded_tags = list(map(eval, mid2encoded_tags))
        self.mashup_tag_encoding_layer = Embedding(
            all_mashup_num + 1,
            self.args.MAX_TAGS_NUM,
            embeddings_initializer=Constant(mid2encoded_tags),
            mask_zero=True,
            input_length=1,
            trainable=False,
            name='mashup_tag_encoding_layer')

        aid2encoded_tags = data_repository.get_md(
        ).api_df['padded_categories'].tolist()
        if isinstance(aid2encoded_tags[0], str):
            aid2encoded_tags = list(map(eval, aid2encoded_tags))
        self.api_tag_encoding_layer = Embedding(
            all_api_num + 1,
            self.args.MAX_TAGS_NUM,
            embeddings_initializer=Constant(aid2encoded_tags),
            mask_zero=True,
            input_length=1,
            trainable=False,
            name='api_tag_encoding_layer')
        return
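
These encoding layers are frozen id-to-token-sequence lookup tables: row i of the Constant matrix holds the padded token ids of entity i, so embedding an id returns its whole encoded text. A self-contained sketch of the trick with toy values:

import numpy as np
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.models import Model

MAX_LENGTH, num_entities = 6, 3
encoded = np.array([[5, 9, 2, 0, 0, 0],  # padded token ids of entity 0
                    [7, 1, 0, 0, 0, 0],
                    [4, 4, 8, 3, 0, 0]])

lookup = Embedding(num_entities,
                   MAX_LENGTH,
                   embeddings_initializer=Constant(encoded),
                   input_length=1,
                   trainable=False)
ids = Input(shape=(1,), dtype='int32')
model = Model(ids, lookup(ids))
print(model.predict(np.array([[2]])))  # [[[4. 4. 8. 3. 0. 0.]]]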
Example #17
def Samanta(topK,
            if_pop=2,
            MF_mode='node2vec',
            pop_mode='',
            text_mode='HDP',
            LDA_topic_num=None):
    """
    :param Para:
    :param if_pop 如何使用pop  0 不使用;1,只做重排序;2总乘积做排序
    :param topK: 使用KNN表示新query的mf特征
    :param text_mode: 使用哪种特征提取方式  LDA  HDP
    :param pop_mode:pop值是否使用sigmoid规约到0-1区间
    :param pop_mode:MF_mode 为了省事,直接用node2vec得了
    :return:
    """

    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = data_repository.get_md().get_api_co_vecs(
            pop_mode)  # TODO

    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(
        root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # obtain mashup_hdp_features and api_hdp_features
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode,
                                                       LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []

    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df

    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # id of each test mashup
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # represent the new query by a sim-weighted sum of the latent factors of its nearest-neighbour mashups
        mid2sim = {}
        for train_m_id in mashup_emb_df.index.tolist():
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id],
                                          _mashup_features[train_m_id])  # TODO
        topK_ids, topK_sims = zip(*(
            sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the sims
        cf_feature = np.zeros((data_repository.get_args().implict_feat_dim, ))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_emb_df['embedding'][
                topK_ids[z]]

        # compute a score against every candidate api
        predict_results = []
        temp_predict_results = []  # helper used when re-ranking by popularity
        api_zeros = np.zeros((data_repository.get_args().implict_feat_dim))
        api_ids = set(api_emb_df.index.tolist())
        for api_id in candidate_ids:
            api_i_feature = api_emb_df['embedding'][
                api_id] if api_id in api_ids else api_zeros  # a test-set api may never have appeared in training
            cf_score = np.sum(np.multiply(
                api_i_feature, cf_feature))  # inner product of the mashup and api latent factors
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])  # cosine similarity of the content features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])  # first rank once by the product
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]  # re-rank by popularity

        all_predict_results.append(predict_results)
    print('Samanta test, done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # log the results

    def divide(slt_apiNum):
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]
                   ) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get(
                    'all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result,
                    data_repository.get_args().topKs)
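
A hypothetical invocation, assuming data_repository has been initialized beforehand:

Samanta(topK=50, if_pop=2, text_mode='HDP')                    # rank by cf_score * sim_score * popularity
Samanta(topK=50, if_pop=0, text_mode='LDA', LDA_topic_num=50)  # content and CF scores only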