Beispiel #1
0
    def initialize(self, text_tag_recommend_model, text_tag_model, train_mashup_id_list, train_api_id_list,
                   test_mashup_id_list, test_api_id_list, feature_train_mashup_ids):
        """
        Pre-compute per-mashup neighbourhood and CF features from a trained text/tag model.

        Fills:
          self.x_feature_dim        -- width of the fused text+tag feature vector
          self._map                 -- (mashup_id, api_id) -> fused text+tag feature
          self.mashup_id2neighbors  -- mashup_id -> ids of its topK most similar train mashups
          self.mashup_id2CFfeature  -- mashup_id -> similarity-weighted sum of neighbour user factors

        :param text_tag_recommend_model: provides get_instances(...) to build model inputs
        :param text_tag_model: trained Keras model whose intermediate layers are sliced here
        :param train_mashup_id_list: 1-D train mashup ids, one per train sample
        :param train_api_id_list: 1-D train api ids, parallel to the above
        :param test_mashup_id_list: 2-D test mashup ids, one row per test case
        :param test_api_id_list: 2-D test api ids, parallel to the above
        :param feature_train_mashup_ids: train mashup ids in the row order of self.ini_features_array
        """
        prod = len (test_mashup_id_list) * len (test_mashup_id_list[0])
        D1_test_mashup_id_list = tuple (np.array (test_mashup_id_list).reshape (prod, ))  # flatten the 2-D test data to 1-D
        D1_test_api_id_list = tuple (np.array (test_api_id_list).reshape (prod, ))

        feature_test_mashup_ids = sorted (list (set (D1_test_mashup_id_list)))  # unique test mashup ids, ascending
        feature_test_api_ids = [0] * len (feature_test_mashup_ids)  # dummy api ids: only the mashup side is used below
        feature_instances_tuple = text_tag_recommend_model.get_instances (feature_test_mashup_ids, feature_test_api_ids,
                                                                          True)  # mashup-side information only

        # test samples: extract text and tag features by tapping the trained model just before 'concatenate_1'
        text_tag_middle_model_1 = Model (inputs=[text_tag_model.inputs[0], text_tag_model.inputs[2]],
                                         outputs=[text_tag_model.get_layer ('concatenate_1').input[0],
                                                  text_tag_model.get_layer ('concatenate_1').input[2]])
        text_tag_test_mashup_features = np.hstack (
            text_tag_middle_model_1.predict ([*feature_instances_tuple], verbose=0))  # text, tag rows in ascending mashup-id order

        # train + test samples: map every (mashup_id, api_id) pair to its fused feature x
        # NOTE(review): D1_test_*_id_list are tuples, so this '+' assumes the train lists
        # are tuples as well (list + tuple raises TypeError) — confirm with the caller.
        all_mashup_id_list = train_mashup_id_list + D1_test_mashup_id_list
        all_api_id_list = train_api_id_list + D1_test_api_id_list
        all_instances_tuple = text_tag_recommend_model.get_instances (all_mashup_id_list, all_api_id_list)
        text_tag_middle_model = Model (inputs=text_tag_model.inputs,
                                       outputs=[text_tag_model.get_layer (
                                           'text_tag_feature_extracter').output])  # outputs the fused text+tag feature of a (mashup, api) pair

        x_features = text_tag_middle_model.predict ([*all_instances_tuple])
        self.x_feature_dim = len (x_features[0])
        self._map = {}  # keyed by (mashup_id, api_id)
        for index in range (len (x_features)):
            self._map[(all_mashup_id_list[index], all_api_id_list[index])] = x_features[index]

        # feature rows: train mashup ids first, then test mashup ids (matches the vstack below)
        all_feature_mashup_ids = feature_train_mashup_ids + feature_test_mashup_ids
        all_features = np.vstack ((self.ini_features_array, text_tag_test_mashup_features))

        # The CNN text feature and the tag embedding have different sizes, so they cannot be
        # concatenated into a single sim; compute the two sims separately and weight-sum them.
        text_dim=self.inception_fc_unit_nums[-1]


        for i in range (len (all_feature_mashup_ids)):  # find the nearest train mashups for every mashup
            id2sim = {}
            for j in range (len (feature_train_mashup_ids)):  # search among train mashups only; keys are local train indices
                if i != j:
                    text_sim = cos_sim (all_features[i][:text_dim], all_features[j][:text_dim])
                    tag_sim = cos_sim (all_features[i][text_dim:], all_features[j][text_dim:])
                    id2sim[j]=  self.text_weight*text_sim+(1- self.text_weight)*tag_sim
            topK_indexes, topK_sims = zip (*(sorted (id2sim.items (), key=lambda x: x[1], reverse=True)[:self.topK]))
            self.mashup_id2neighbors[all_feature_mashup_ids[i]]=[self.m_index2id[index] for index in topK_indexes] # ids of the nearest train mashups for this mashup
            topK_sims = np.array (topK_sims) / sum (topK_sims)  # normalize similarity weights
            cf_feature = np.zeros ((self.num_feat))
            for z in range (len (topK_indexes)):
                cf_feature += topK_sims[z] * self.u_factors_matrix[topK_indexes[z]]
            self.mashup_id2CFfeature[all_feature_mashup_ids[i]] = cf_feature
Beispiel #2
0
def _score_candidates_over_testset(score_one):
    """Score every candidate api for every test mashup in Para's test set.

    Factors out the scoring loop that was duplicated verbatim for each
    keyword baseline.

    :param score_one: callable(mashup_id, api_id) -> float relevance score
    :return: (candidate_ids_list, all_predict_results) — parallel lists, one
             entry per test case, in Para.test_mashup_id_list order
    """
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # mashup id of this test case
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        all_predict_results.append(
            [score_one(test_mashup_id, api_id) for api_id in candidate_ids])
    return candidate_ids_list, all_predict_results


def _evaluate_and_record(model_name, candidate_ids_list, all_predict_results):
    """Evaluate predictions against the ground truth and append a csv summary row."""
    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    csv_table_name = Para.data_name + model_name + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result,
            Para.topKs)  # record


def binary_keyword(text_tag_recommend_model):
    """Evaluate two popularity-weighted keyword baselines.

    WVSM (Weighted Vector Space Model): cosine similarity of binary keyword
    vectors, weighted by api popularity.
    WJaccard (Weighted Jaccard): Jaccard overlap of keyword sets, weighted
    by api popularity.

    :param text_tag_recommend_model: provides the underlying data access object
    :return: None (results are written via summary())
    """
    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))
    api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()

    gd = gensim_data(*text_tag_recommend_model.get_instances(
        list(range(all_mashup_num)), list(range(all_api_num)), False))
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v(
        all_mashup_num, all_api_num)

    # WVSM: cosine on binary keyword vectors * api popularity
    def wvsm_score(mashup_id, api_id):
        return cos_sim(mashup_binary_matrix[mashup_id],
                       api_binary_matrix[api_id]) * api2pop[api_id]

    candidate_ids_list, all_predict_results = _score_candidates_over_testset(wvsm_score)
    print('WVSM test,done!')
    _evaluate_and_record('WVSM', candidate_ids_list, all_predict_results)

    # WJaccard: keyword-set Jaccard * api popularity
    def wjaccard_score(mashup_id, api_id):
        mashup_set = set(mashup_words_list[mashup_id])
        api_set = set(api_words_list[api_id])
        return 1.0 * len(mashup_set.intersection(api_set)) / len(
            mashup_set.union(api_set)) * api2pop[api_id]

    candidate_ids_list, all_predict_results = _score_candidates_over_testset(wjaccard_score)
    print('WJaccard test,done!')
    _evaluate_and_record('WJaccard', candidate_ids_list, all_predict_results)
Beispiel #3
0
 def get_mean(self, mashup_text_index, api_text_index):
     """Cosine similarity of the mean word embeddings of two texts.

     Each text is represented by averaging the embeddings of its word
     indices (looked up in self.wordindex2embedding), then the two averaged
     vectors are compared with cos_sim.

     :param mashup_text_index: word indices of the mashup text
     :param api_text_index: word indices of the api text
     :return: cosine similarity of the two mean embedding vectors
     """
     lookup = self.wordindex2embedding.get
     mashup_mean = np.mean([lookup(word) for word in mashup_text_index], axis=0)
     api_mean = np.mean([lookup(word) for word in api_text_index], axis=0)
     return cos_sim(mashup_mean, api_mean)
Beispiel #4
0
    def cpt_wod_cos_sim(self, id1, id2):
        """Memoized cosine similarity between two word ids.

        Results are cached in self.words_Sim under the canonical key
        (smaller_id, larger_id) so each unordered pair is computed once.

        :param id1: first word index
        :param id2: second word index
        :return: cosine similarity (1 when both ids are equal)
        """
        if id1 == id2:
            return 1
        key = (min(id1, id2), max(id1, id2))  # canonical small-to-large order
        cached = self.words_Sim.get(key)
        if cached is not None:
            return cached
        sim = cos_sim(self.wordindex2embedding.get(key[0]),
                      self.wordindex2embedding.get(key[1]))
        self.words_Sim[key] = sim
        return sim
Beispiel #5
0
def TF_IDF(text_tag_recommend_model):
    """TF-IDF baseline: rank candidate apis for each test mashup by the
    cosine similarity of their TF-IDF vectors, then evaluate and log.

    Could be folded into the Samanta class, but kept separate for clarity.

    :param text_tag_recommend_model: provides the underlying data access object
    :return: None (results are written via summary())
    """
    data_access = text_tag_recommend_model.pd
    all_mashup_num = len(data_access.get_mashup_api_index2name('mashup'))
    all_api_num = len(data_access.get_mashup_api_index2name('api'))

    # encode every mashup/api text+tag and project to TF-IDF features
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        list(range(all_mashup_num)), list(range(all_api_num)), False))
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs(
        'TF_IDF', all_mashup_num, all_api_num)

    candidate_ids_list = []
    all_predict_results = []
    for i, test_case in enumerate(Para.test_mashup_id_list):
        test_mashup_id = test_case[0]  # mashup id of this test case
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)
        # one similarity score per candidate api
        all_predict_results.append([
            cos_sim(_mashup_IFIDF_features[test_mashup_id],
                    _api_IFIDF_features[api_id])
            for api_id in candidate_ids
        ])
    print('TF_IDF test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    csv_table_name = Para.data_name + 'TF_IDF' + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result,
            Para.topKs)  # record
Beispiel #6
0
def _load_or_build_hdp_features(text_tag_recommend_model, all_mashup_num, all_api_num):
    """Return (mashup_hdp_features, api_hdp_features), caching them as text files.

    Rebuilds the features when EITHER cache file is missing — the original
    code only checked the api file, so a missing mashup file crashed loadtxt.
    """
    mashup_hdp_path = os.path.join(Para.data_dir, 'mashup_hdp.txt')
    api_hdp_path = os.path.join(Para.data_dir, 'api_hdp.txt')
    if os.path.exists(mashup_hdp_path) and os.path.exists(api_hdp_path):
        return np.loadtxt(mashup_hdp_path), np.loadtxt(api_hdp_path)
    # encode every mashup/api text+tag, then reduce with HDP topics
    gd = gensim_data(*text_tag_recommend_model.get_instances(
        list(range(all_mashup_num)), list(range(all_api_num)), False))
    mashup_hdp_features, api_hdp_features = gd.model_pcs(
        'HDP', all_mashup_num, all_api_num)
    np.savetxt(mashup_hdp_path, mashup_hdp_features)
    np.savetxt(api_hdp_path, api_hdp_features)
    return mashup_hdp_features, api_hdp_features


def Samanta(text_tag_recommend_model, topK, if_pop=False):
    """Samanta baseline: HDP content similarity combined with MF collaborative
    features, optionally re-ranked by api popularity.

    For each test mashup:
      1. find the topK most HDP-similar train mashups,
      2. build a similarity-weighted CF feature from their user factors,
      3. score each candidate api by (api_factor . cf_feature) * hdp_sim,
      4. with if_pop=True, keep the topK scored apis and rank them by popularity.

    :param text_tag_recommend_model: provides the underlying data access object
    :param topK: number of neighbour mashups / re-ranked candidate apis
    :param if_pop: re-rank the topK candidates by api popularity when True
    :return: None (results are written via summary())
    """
    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = text_tag_recommend_model.pd.get_api_co_vecs()

    all_mashup_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('mashup'))
    all_api_num = len(
        text_tag_recommend_model.pd.get_mashup_api_index2name('api'))

    _mashup_hdp_features, _api_hdp_features = _load_or_build_hdp_features(
        text_tag_recommend_model, all_mashup_num, all_api_num)

    candidate_ids_list = []
    all_predict_results = []
    api_zeros = np.zeros(Para.num_feat)  # hoisted fallback factor for apis unseen in training
    for i in range(len(Para.test_mashup_id_list)):
        test_mashup_id = Para.test_mashup_id_list[i][0]  # mashup id of this test case
        candidate_ids = Para.test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        # similarity of this test mashup to every train mashup;
        # keys are LOCAL train indices because u_factors_matrix uses them
        id2sim = {}
        for local_train_mashup_index in range(len(Para.feature_train_mashup_ids)):
            id2sim[local_train_mashup_index] = cos_sim(
                _mashup_hdp_features[test_mashup_id], _mashup_hdp_features[
                    Para.feature_train_mashup_ids[local_train_mashup_index]])
        topK_indexes, topK_sims = zip(
            *(sorted(id2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize similarity weights
        cf_feature = np.zeros(Para.num_feat)
        for z in range(len(topK_indexes)):
            cf_feature += topK_sims[z] * Para.u_factors_matrix[topK_indexes[z]]

        predict_results = []
        temp_predict_results = []  # (api_id, score) pairs, kept for the pop re-ranking
        for api_id in candidate_ids:
            # apis never seen in training have no latent factor -> zero vector
            api_i_feature = Para.i_factors_matrix[
                Para.i_id2index[api_id]] if api_id in Para.i_id2index else api_zeros
            cf_score = np.sum(np.multiply(api_i_feature, cf_feature))
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                temp_predict_results.append((api_id, cf_score * sim_score))
            else:
                predict_results.append(cf_score * sim_score)

        if if_pop:
            # keep the topK by combined score, then rank those by popularity; others get -1
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])
            max_k_candidates = {api_id for api_id, _ in max_k_pairs}
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]  # re-rank

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(candidate_ids_list, all_predict_results,
                              Para.grounds, Para.topKs)  # evaluate
    _name = '_pop' if if_pop else ''
    csv_table_name = Para.data_name + 'Samanta_model' + _name + "\n"  # model.name
    summary(Para.evaluate_path, csv_table_name, evaluate_result,
            Para.topKs)  # record