Example #1
def pop():
    """
    :return:
    """
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:  # id
            predict_results.append(api2pop[api_id])
        all_predict_results.append(predict_results)
    print('pop test,done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + 'pop' + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
Example #2
def MF(train_datas, test_datas, mode=''):
    all_predict_results = []  # predicted scores for every test instance (one score per candidate api)
    # train/test splits for different numbers of already-selected apis
    for slt_num in range(1, data_repository.get_args().slt_item_num + 1):
        test_mashup_id_list, test_api_id_list, grounds = test_datas[slt_num - 1]
        # interface that builds and reads the MF results
        UV_obj = MF(data_repository.get_ds().data_root, mode,
                    train_datas[slt_num - 1], slt_num)
        m_id2index, a_id2index = UV_obj.m_id2index, UV_obj.a_id2index
        for i in range(len(test_mashup_id_list)):
            test_mashup_id = test_mashup_id_list[i][0]  # each mashup id
            predict_results = []
            for test_api_id in test_api_id_list[i]:  # api id
                if test_mashup_id not in m_id2index or test_api_id not in a_id2index:
                    dot = 0
                else:
                    m_embedding = UV_obj.m_embeddings[
                        m_id2index[test_mashup_id]]
                    a_embedding = UV_obj.a_embeddings[a_id2index[test_api_id]]
                    dot = np.dot(m_embedding, a_embedding)
                predict_results.append(dot)
            all_predict_results.append(predict_results)
        print('{}_{} test,done!'.format(mode, slt_num))

        evaluate_result = evalute(
            test_api_id_list, all_predict_results,
            data_repository.get_ds().test_data.get('all_ground_api_ids'),
            data_repository.get_args().topKs)  # evaluate
        csv_table_name = data_repository.get_ds().name + mode + str(slt_num) + "\n"  # whole_model.name
        summary(evaluate_path, csv_table_name, evaluate_result,
                data_repository.get_args().topKs)  # record results
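
Two notes on the snippet above. First, the inner MF(...) call is shadowed by the baseline function's own name; presumably it is meant to be the matrix-factorization helper class imported elsewhere in the repo. Second, the scoring rule itself is just a dot product between mashup and api latent factors, with a fallback score of 0 for ids never seen in training. A tiny self-contained sketch of that rule (the embeddings and ids below are made-up toy values):

import numpy as np

m_embeddings = np.array([[0.1, 0.3], [0.4, 0.2]])
a_embeddings = np.array([[0.5, 0.5], [0.2, 0.9]])
m_id2index = {100: 0, 101: 1}   # mashup id -> embedding row (toy values)
a_id2index = {200: 0, 201: 1}   # api id -> embedding row (toy values)

def score(mashup_id, api_id):
    if mashup_id not in m_id2index or api_id not in a_id2index:
        return 0.0  # id never seen during training
    return float(np.dot(m_embeddings[m_id2index[mashup_id]],
                        a_embeddings[a_id2index[api_id]]))

print(score(100, 201))  # 0.1*0.2 + 0.3*0.9 = 0.29
print(score(999, 200))  # unseen mashup -> 0.0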
Example #3
def bl_PasRec():
    model_name = 'PasRec_2path'  # 'PasRec_2path'
    epoch_num = 60  # previously 40; 40 was slightly worse than 20
    neighbor_size = 15
    topTopicNum = 3

    args = data_repository.get_args()
    train_data, test_data = data_repository.get_ds(
    ).train_data, data_repository.get_ds().test_data

    HINRec_model = HINRec(args,
                          model_name=model_name,
                          epoch_num=epoch_num,
                          neighbor_size=neighbor_size,
                          topTopicNum=topTopicNum)
    if os.path.exists(HINRec_model.weight_path):
        print('have trained,return!')
    else:
        # the model is evaluated every 20 epochs during training, which is why test_data is passed into train()
        HINRec_model.train(test_data)
        HINRec_model.save_model()
        evalute_by_epoch(
            HINRec_model,
            HINRec_model,
            HINRec_model.model_name,
            test_data,
            evaluate_by_slt_apiNum=False)  # ,if_save_recommend_result=True)
Example #4
def analyze_result(recommend_model, topKs):
    """
    读取recommend_result_path中的评价结果,再使用其他指标(pop和冗余度)进行评价
    :param recommend_model:
    :param recommend_result_path:
    :param topKs:
    :return:
    """
    recommend_result_path = os.path.join(recommend_model.model_dir,
                                         'recommend_result_new.csv')
    mashup_ids, slt_api_ids, recommend_lists, grounds = [], [], [], []

    def str2list(str_):
        list_ = str_.split(' ')
        return [int(id) for id in list_]

    with open(recommend_result_path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            mashup_ids.append(str2list(row['mashup_id']))
            slt_api_ids.append(str2list(row['slt_api_ids']))
            recommend_lists.append(str2list(row['recommend_list']))
            grounds.append(str2list(row['grounds']))
    instance_num = len(mashup_ids)
    api_id2info = meta_data.pd.get_mashup_api_id2info('api')
    _, api_id2pop = meta_data.pd.get_api_co_vecs(pop_mode='')
    api_categories = [
        get_mashup_api_allCategories('api', api_id2info, api_id,
                                     data_repository.get_args().Category_type)
        for api_id in range(meta_data.api_num)
    ]

    def evaluate_others(recommend_list):
        size = len(recommend_list)
        pop = sum([api_id2pop[api_id] for api_id in recommend_list]) / size
        union_tags = set()
        tag_sum_num = 0
        for api_id in recommend_list:
            tags = api_categories[api_id]
            union_tags = union_tags.union(set(tags))
            tag_sum_num += len(tags)
        redundance = 1 - len(union_tags) / tag_sum_num
        return np.array([pop, redundance])  # popularity and redundancy

    def analyze():
        indicators_name = ['pop', 'redundancy']
        indicators = np.zeros((instance_num, len(topKs),
                               len(indicators_name)))  # (instance, topK, indicator)
        for index in range(instance_num):  # evaluate each mashup instance
            for k_idx, k in enumerate(topKs):  # for each topK
                indicators[index, k_idx, :] = evaluate_others(
                    recommend_lists[index][:k])  # evaluate the top-k recommendation list
        return np.average(indicators, axis=0)

    indicators = analyze()
    recommend_result_path = os.path.join(recommend_model.model_dir,
                                         'recommend_other_indicators.csv')
    summary_others(recommend_result_path, recommend_model.simple_name,
                   indicators, topKs)
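
For intuition, a small self-contained sketch of the two indicators computed in evaluate_others, applied to a hypothetical recommendation list (the ids, popularity values and tags below are made up):

import numpy as np

# Hypothetical data: popularity per api id and the tag list of each api.
api_id2pop = {1: 120.0, 2: 45.0, 3: 45.0}
api_categories = {1: ['mapping', 'geo'], 2: ['mapping'], 3: ['photos']}

recommend_list = [1, 2, 3]  # a top-3 recommendation, ids are illustrative

pop = sum(api_id2pop[a] for a in recommend_list) / len(recommend_list)  # 70.0
union_tags = set()
tag_sum_num = 0
for a in recommend_list:
    union_tags |= set(api_categories[a])
    tag_sum_num += len(api_categories[a])
redundancy = 1 - len(union_tags) / tag_sum_num  # 1 - 3/4 = 0.25

print(np.array([pop, redundancy]))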
Example #5
def CI_NI_fineTuning():
    args = data_repository.get_args()
    train_data, test_data = data_repository.get_ds(
    ).train_data, data_repository.get_ds().test_data

    CI_recommend_model = CI_Model(args)
    CI_model_obj = CI_recommend_model.get_model()
    CI_model_obj = train_model(CI_recommend_model, CI_model_obj, train_data,
                               test_data, args.train_mode, args.train_new)
Example #6
def train_monitoring_loss_acc_model(recommend_model, model, train_data):
    """
    绘制loss_acc曲线, 观察过拟合欠拟合
    """
    train_labels = train_data[-1]
    train_instances_tuple = recommend_model.get_instances(*train_data[:-1])
    model.compile(optimizer=Adam(lr=data_repository.get_args().learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    hist = model.fit([*train_instances_tuple],
                     np.array(train_labels),
                     batch_size=data_repository.get_args().small_batch_size,
                     epochs=data_repository.get_args().num_epochs,
                     verbose=1,
                     shuffle=True,
                     validation_split=0.1)  # the validation split makes over/under-fitting visible
    plot_loss_acc(hist, recommend_model.get_simple_name())
    return model
Example #7
def show_prediction_res(i):
    print('for mashup {}:'.format(test_mashup_id_list[i][0]))
    if data_repository.get_args().need_slt_apis:
        print('slt_ids:', test_slt_ids[i])
    sorted_pre2id = sorted(zip(prediction, test_api_id_list[i]))
    sorted_pres, sorted_ids = zip(*sorted_pre2id)
    print('candidate api ids', sorted_ids)
    print('predictions', sorted_pres)
    print('grounds', grounds[i])
Example #8
def hdp_pop(if_pop=True):
    # pop
    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_hdp_path = os.path.join(root, 'mashup_HDP.txt')  # ...
    api_hdp_path = os.path.join(root, 'api_HDP.txt')

    _mashup_hdp_features = np.loadtxt(mashup_hdp_path)
    _api_hdp_features = np.loadtxt(api_hdp_path)

    if if_pop:
        api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    # test
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:  # id
            sim_score = cos_sim(_mashup_hdp_features[test_mashup_id],
                                _api_hdp_features[api_id])
            if if_pop:
                sim_score *= api2pop[api_id]
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('hdp_pop test,done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    name = 'hdp_pop' if if_pop else 'hdp'
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
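
cos_sim is used here and in several later examples but is not defined in any of these snippets; a minimal sketch, assuming it is a plain cosine similarity over NumPy vectors (zero vectors scored as 0):

import numpy as np

def cos_sim(vec_a, vec_b):
    """Cosine similarity between two 1-D vectors; 0.0 if either norm is zero."""
    vec_a, vec_b = np.asarray(vec_a, dtype=float), np.asarray(vec_b, dtype=float)
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / denom) if denom != 0 else 0.0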
Example #9
def save_loss_acc(train_log, model_name, epoch=0, if_multi_epoch=False):
    # when if_multi_epoch is False, this is called once per epoch and appends that epoch's stats
    # each epoch stores loss, val_loss, acc, val_acc
    if not if_multi_epoch:
        with open(loss_path, 'a+') as f:
            if epoch == 0:  # record the model name at the first epoch
                f.write(model_name + '\n')
                if data_repository.get_args().validation_split == 0:
                    f.write('epoch,loss,acc\n')
                else:
                    f.write('epoch,loss,val_loss,acc,val_acc\n')
            if data_repository.get_args().validation_split == 0:
                f.write('{},{},{}\n'.format(epoch,
                                            train_log.history["loss"][0],
                                            train_log.history["acc"][0]))
            else:
                f.write('{},{},{},{},{}\n'.format(
                    epoch, train_log.history["loss"][0],
                    train_log.history["val_loss"][0],
                    train_log.history["accuracy"][0],
                    train_log.history["val_accuracy"][0]))
    else:
        with open(data_repository.get_args().loss_path, 'a+') as f:
            f.write(model_name + 'EarlyStop' + '\n')
            if data_repository.get_args().validation_split == 0:
                f.write('epoch,loss,acc\n')
            else:
                f.write('epoch,loss,val_loss,acc,val_acc\n')
            epoch_num = len(train_log.history["loss"])
            for i in range(epoch_num):
                if data_repository.get_args().validation_split == 0:
                    f.write('{},{},{}\n'.format(i,
                                                train_log.history["loss"][i],
                                                train_log.history["acc"][i]))
                else:
                    f.write('{},{},{},{},{}\n'.format(
                        i, train_log.history["loss"][i],
                        train_log.history["val_loss"][i],
                        train_log.history["acc"][i],
                        train_log.history["val_acc"][i]))
Example #10
def binary_keyword(if_pop=False):
    # pop
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    gd = get_default_gd()
    mashup_binary_matrix, api_binary_matrix, mashup_words_list, api_words_list = gd.get_binary_v(
    )

    # test WVSM (Weighted Vector Space Model)
    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:  # id
            if if_pop:
                sim_score = cos_sim(
                    mashup_binary_matrix[test_mashup_id],
                    api_binary_matrix[api_id]) * api2pop[api_id]
            else:
                sim_score = cos_sim(mashup_binary_matrix[test_mashup_id],
                                    api_binary_matrix[api_id])  # use only the feature vectors
            predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('WVSM test,done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    name = 'WVSM_pop' if if_pop else 'WVSM'
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
    """
Example #11
def get_sims_dict(process_new, remove_stopwords):
    # DHSR
    embedding_name = 'glove'
    embedding_dim = 50
    tag_coefficient = 2
    k = 1.2
    b = 0.75
    weighted_intervals = [-1, 0.15, 0.4, 0.8, 1]
    unweighted_intervals = [-1, 0.45, 0.8, 1]
    cs = cpt_DHSR_Sim(data_repository.get_args().cur_data_dir, embedding_name,
                      embedding_dim, tag_coefficient, k, b, weighted_intervals,
                      unweighted_intervals, process_new, remove_stopwords)
    return cs  # return the similarity object
Example #12
def TF_IDF(if_pop):
    """
    可以跟写到Samanta的类中,但太混乱,没必要
    :return:
    """
    gd = get_default_gd()
    api_co_vecs, api2pop = meta_data.pd.get_api_co_vecs()
    _mashup_IFIDF_features, _api_IFIDF_features = gd.model_pcs('TF_IDF')

    candidate_ids_list = []
    all_predict_results = []
    for i in range(len(data_repository.get_ds().test_mashup_id_list)):
        test_mashup_id = data_repository.get_ds().test_mashup_id_list[i][0]  # each mashup id
        candidate_ids = data_repository.get_ds().test_api_id_list[i]
        candidate_ids_list.append(candidate_ids)

        predict_results = []
        for api_id in candidate_ids:  # id
            sim_score = cos_sim(_mashup_IFIDF_features[test_mashup_id],
                                _api_IFIDF_features[api_id])
            if if_pop:
                predict_results.append(sim_score * api2pop[api_id])
            else:
                predict_results.append(sim_score)
        all_predict_results.append(predict_results)
    print('TF_IDF test,done!')

    name = 'TFIDF_pop' if if_pop else 'TFIDF'
    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    csv_table_name = data_repository.get_ds().name + name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results
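
gd.model_pcs('TF_IDF') is not shown in these snippets; assuming it returns TF-IDF feature vectors for mashup and api descriptions, a minimal standalone sketch of the same scoring idea with scikit-learn (the toy descriptions and names are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

mashup_descriptions = ["map photo sharing service", "weather forecast dashboard"]
api_descriptions = ["photo hosting api", "weather data api", "mapping api"]

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(mashup_descriptions + api_descriptions)
mashup_features = features[:len(mashup_descriptions)]
api_features = features[len(mashup_descriptions):]

# score every candidate api for the first mashup by cosine similarity
scores = cosine_similarity(mashup_features[0], api_features).ravel()
print(scores)  # one similarity score per candidate api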
Example #13
def plot_loss_acc(train_log, model_name):
    # plot the loss/accuracy curves from a Keras History object
    epochs = data_repository.get_args().num_epochs
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(np.arange(0, epochs),
             train_log.history["loss"],
             label="train_loss")
    plt.plot(np.arange(0, epochs),
             train_log.history["val_loss"],
             label="val_loss")
    plt.plot(np.arange(0, epochs), train_log.history["acc"], label="train_acc")
    plt.plot(np.arange(0, epochs),
             train_log.history["val_acc"],
             label="val_acc")
    plt.title("Training Loss and Accuracy on the whole_model")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="upper right")
    plt.savefig("Loss_Accuracy_{}.jpg".format(model_name))
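
plot_loss_acc sizes the x-axis with num_epochs from the args, which assumes the run was not stopped early. A minimal alternative sketch (not from the original repo) that derives the x-axis from the history itself:

import numpy as np
import matplotlib.pyplot as plt

def plot_history(history_dict, model_name):
    """Plot whatever metrics are present, sized to the epochs actually run."""
    plt.figure()
    x = np.arange(len(history_dict["loss"]))  # works even if training stopped early
    for key, values in history_dict.items():
        plt.plot(x, values, label=key)
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="upper right")
    plt.savefig("Loss_Accuracy_{}.jpg".format(model_name))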
Example #14
    def __init__(self, data_dir, embedding_name, embedding_dim,
                 tag_coefficient, k, b, weighted_intervals,
                 unweighted_intervals, process_new, remove_stopwords):
        """
        :param self:
        :param tag_coefficient: tag amplification coefficient
        :param k:smooth parameter
        :param b: smooth parameter
        :param weighted_intervals:
        :param unweighted_intervals:
        :return:
        """
        self.process_new = process_new
        self.data_dir = data_dir
        self.embedding_name = embedding_name
        self.embedding_dim = embedding_dim
        self.tag_coefficient = int(tag_coefficient)
        self.k = k
        self.b = b
        self.weighted_intervals = weighted_intervals
        self.unweighted_intervals = unweighted_intervals

        self.num_mashup = 0
        self.num_api = 0
        self.word2inedx = {}  # word-to-index mapping; words are handled mainly as indices in this class (cheaper as dict keys than str)
        self.wordindex2IDF = {}
        self.average_len = 0
        self.stopwords = data_repository.get_args(
        ).stop_words if remove_stopwords else set()

        self.mashup_descriptions = None
        self.api_descriptions = None
        self.wordindex2embedding = {}  # embedding for each word index

        self.words_Sim = {}  # cosine sim between word pairs; computed lazily on demand and cached, {(w1, w2): float}
        self.mashup2api_Sim = {}  # mashup-to-api sim keyed by ids, {(m_id, a_id): float}

        self.initialize_sims_dict()
Example #15
def train_early_stop(recommend_model, model, train_data, test_data):
    """
    训练时按照验证集的loss,early stopping得到最优的模型;最后基于该模型测试
    :return:
    """
    if_Train = True if data_repository.get_args().pairwise else False
    train_labels = train_data[-1]
    train_instances_tuple = recommend_model.get_instances(
        *train_data[:-1], pairwise_train_phase_flag=if_Train)

    train_model = recommend_model.get_pairwise_model(
    ) if data_repository.get_args().pairwise else model
    if data_repository.get_args().pairwise:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss=lambda y_true, y_pred: y_pred,
                            metrics=['accuracy'])
    else:
        train_model.compile(optimizer=recommend_model.optimizer,
                            loss='binary_crossentropy',
                            metrics=['accuracy'])

    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=10,
                                   verbose=2,
                                   mode='min')
    hist = train_model.fit(
        [*train_instances_tuple],
        train_labels,
        epochs=data_repository.get_args().num_epochs,
        batch_size=data_repository.get_args().small_batch_size,
        callbacks=[early_stopping],
        validation_split=data_repository.get_args().validation_split,
        shuffle=True)  #
    model.save_weights(data_repository.get_ds().new_model_para_path.format(
        recommend_model.model_dir, 'min_loss'))  # !!! needs fixing

    model_name = recommend_model.get_simple_name() + recommend_model.get_name(
    ) + '_min_loss'
    save_loss_acc(hist, model_name, if_multi_epoch=True)

    epoch_evaluate_result = evalute_by_epoch(recommend_model, model,
                                             model_name, test_data)
    return model
Example #16
    def get_predictions():
        predictions = []  # predictions for all test instances
        for i in range(test_instance_num):
            candidate_ids = test_api_id_list[i]
            test_batch_size = data_repository.get_args().test_batch_size
            prediction = []

            # too many candidate apis, so predict in manually built batches
            test_api_num = len(candidate_ids)
            batch_num = test_api_num // test_batch_size
            remainder = test_api_num % test_batch_size
            if remainder != 0:
                batch_num += 1
            start_time = time.time()
            for j in range(batch_num):  # each batch
                start_index = j * test_batch_size
                stop_index = test_api_num if (remainder != 0
                                              and j == batch_num -
                                              1) else (j + 1) * test_batch_size
                batch_api_ids = candidate_ids[start_index:stop_index]

                batch_instances_dict = {
                    'mashup': test_mashup_id_list[i][start_index:stop_index],
                    'api': batch_api_ids
                }
                if data_repository.get_args(
                ).data_mode == 'newScene' and data_repository.get_args(
                ).need_slt_apis:  # TODO
                    _slt_ids = []
                    # within one row, the already-selected apis are identical for every candidate api of this mashup
                    _slt_ids.append(test_slt_ids[i])
                    batch_instances_dict['slt_apis'] = _slt_ids * (stop_index -
                                                                   start_index)

                batch_instances_dict = recommend_model.get_instances(
                    batch_instances_dict)
                batch_prediction = model.predict(batch_instances_dict)

                if len(batch_prediction.shape) == 2:
                    batch_prediction = batch_prediction[:, 1]  # column 1: probability of the positive class
                batch_prediction = list(batch_prediction)
                prediction += batch_prediction  # this mashup's scores for all candidates
            predictions.append(list(prediction))

            end_time = time.time()
            if record_time:
                with open(time_path, 'a+') as f1:
                    if i == 0:
                        f1.write(recommend_model.get_simple_name())
                        f1.write('\n')
                    f1.write('num of instances,{},cost time,{}\n'.format(
                        test_api_num, end_time - start_time))

            # show the recommendation results for a few mashups
            def show_prediction_res(i):
                print('for mashup {}:'.format(test_mashup_id_list[i][0]))
                if data_repository.get_args().need_slt_apis:
                    print('slt_ids:', test_slt_ids[i])
                sorted_pre2id = sorted(zip(prediction, test_api_id_list[i]))
                sorted_pres, sorted_ids = zip(*sorted_pre2id)
                print('candidate api ids', sorted_ids)
                print('predictions', sorted_pres)
                print('grounds', grounds[i])

            if i < show_cases:
                show_prediction_res(i)
            if i % 100 == 0:
                print('has test {}/{} mashup instances'.format(
                    i, test_instance_num))
        print('test,done!')
        return predictions
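
The manual batching above splits a long candidate list into test_batch_size chunks, with the last batch absorbing the remainder; a tiny self-contained sketch of just that index arithmetic (the function name is illustrative):

def batch_bounds(total, batch_size):
    """Yield (start, stop) index pairs covering range(total) in batches."""
    batch_num = total // batch_size
    if total % batch_size != 0:
        batch_num += 1
    for j in range(batch_num):
        start = j * batch_size
        stop = min((j + 1) * batch_size, total)
        yield start, stop

# e.g. list(batch_bounds(10, 4)) -> [(0, 4), (4, 8), (8, 10)]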
Example #17
def evalute_by_epoch(recommend_model,
                     model,
                     model_name,
                     test_data,
                     show_cases=0,
                     record_time=False,
                     true_candidates_dict=None,
                     if_save_recommend_result=False,
                     evaluate_by_slt_apiNum=False):
    """
    对训练好的模型,进行测试:可以适用于完全冷启动和部分冷启动情景;
    :param show_cases: 显示几个推荐结果
    :param record_time: 记录测试集的处理时间
    :param true_candidates_dict: 重排序:是否使用IsRec等算法的处理方式:近邻没有调用过的服务评分设置为0 mashup id -> api ids list
    :param if_save_recommend_result: 是否存储每个样例的推荐结果,用于case分析
    :param evaluate_by_slt_apiNum: 是否按照已选服务的数目将测试集分开评价
    :return:
    """

    # test samples for a given number of already-selected apis
    test_mashup_id_list = test_data.get('mashup')
    test_api_id_list = test_data.get('api')
    grounds = test_data.get('all_ground_api_ids')
    test_slt_ids = test_data.get('slt_apis')

    csv_table_name = model_name + '\n'
    test_instance_num = len(test_mashup_id_list)

    # get all the predictions
    def get_predictions():
        predictions = []  # predictions for all test instances
        for i in range(test_instance_num):
            candidate_ids = test_api_id_list[i]
            test_batch_size = data_repository.get_args().test_batch_size
            prediction = []

            # too many candidate apis, so predict in manually built batches
            test_api_num = len(candidate_ids)
            batch_num = test_api_num // test_batch_size
            remainder = test_api_num % test_batch_size
            if remainder != 0:
                batch_num += 1
            start_time = time.time()
            for j in range(batch_num):  # each batch
                start_index = j * test_batch_size
                stop_index = test_api_num if (remainder != 0
                                              and j == batch_num -
                                              1) else (j + 1) * test_batch_size
                batch_api_ids = candidate_ids[start_index:stop_index]

                batch_instances_dict = {
                    'mashup': test_mashup_id_list[i][start_index:stop_index],
                    'api': batch_api_ids
                }
                if data_repository.get_args(
                ).data_mode == 'newScene' and data_repository.get_args(
                ).need_slt_apis:  # TODO
                    _slt_ids = []
                    # within one row, the already-selected apis are identical for every candidate api of this mashup
                    _slt_ids.append(test_slt_ids[i])
                    batch_instances_dict['slt_apis'] = _slt_ids * (stop_index -
                                                                   start_index)

                batch_instances_dict = recommend_model.get_instances(
                    batch_instances_dict)
                batch_prediction = model.predict(batch_instances_dict)

                if len(batch_prediction.shape) == 2:
                    batch_prediction = batch_prediction[:, 1]  # column 1: probability of the positive class
                batch_prediction = list(batch_prediction)
                prediction += batch_prediction  # this mashup's scores for all candidates
            predictions.append(list(prediction))

            end_time = time.time()
            if record_time:
                with open(time_path, 'a+') as f1:
                    if i == 0:
                        f1.write(recommend_model.get_simple_name())
                        f1.write('\n')
                    f1.write('num of instances,{},cost time,{}\n'.format(
                        test_api_num, end_time - start_time))

            # show the recommendation results for a few mashups
            def show_prediction_res(i):
                print('for mashup {}:'.format(test_mashup_id_list[i][0]))
                if data_repository.get_args().need_slt_apis:
                    print('slt_ids:', test_slt_ids[i])
                sorted_pre2id = sorted(zip(prediction, test_api_id_list[i]))
                sorted_pres, sorted_ids = zip(*sorted_pre2id)
                print('candidate api ids', sorted_ids)
                print('predictions', sorted_pres)
                print('grounds', grounds[i])

            if i < show_cases:
                show_prediction_res(i)
            if i % 100 == 0:
                print('has test {}/{} mashup instances'.format(
                    i, test_instance_num))
        print('test,done!')
        return predictions

    predictions = get_predictions()

    # IsRec_best-style post-processing of the candidate scores: apis never invoked by neighbour mashups get a score of 0
    if true_candidates_dict is not None:
        for i in range(len(predictions)):  # each mashup index
            true_candidates_list = true_candidates_dict[test_mashup_id_list[i]
                                                        [0]]
            assert len(test_api_id_list[i]) == len(predictions[i])
            _num = len(test_api_id_list[i])
            for j in range(_num):
                if test_api_id_list[i][j] not in true_candidates_list:
                    # an api never invoked by a neighbour mashup gets a score of 0
                    predictions[i][j] = 0

    # evaluate separately by the number of already-selected apis
    if evaluate_by_slt_apiNum and data_repository.get_args(
    ).data_mode == 'newScene':

        def _filter(slt_apiNum):
            test_api_id_list_, predictions_, grounds_ = [], [], []
            for i in range(test_instance_num):
                if len(test_slt_ids[i]) == slt_apiNum:
                    test_api_id_list_.append(test_api_id_list[i])
                    predictions_.append(predictions[i])
                    grounds_.append(grounds[i])
            return test_api_id_list_, predictions_, grounds_

        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = _filter(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result,
                    data_repository.get_args().topKs)  #

    if if_save_recommend_result and data_repository.get_args(
    ).data_mode == 'newScene':
        recommend_result_path = os.path.join(recommend_model.model_dir,
                                             'recommend_result.csv')
        evaluate_result = evalute(test_api_id_list, predictions, grounds,
                                  data_repository.get_args().topKs,
                                  test_mashup_id_list, test_slt_ids,
                                  recommend_result_path)  # evaluate and record the per-instance results
        summary(evaluate_path, csv_table_name, evaluate_result,
                data_repository.get_args().topKs)  #
    else:
        evaluate_result = evalute(test_api_id_list, predictions, grounds,
                                  data_repository.get_args().topKs)
        summary(evaluate_path, csv_table_name, evaluate_result,
                data_repository.get_args().topKs)  #

    return evaluate_result  # one row of 5 indicators per topK
Example #18
def train_best_NDCG_model(recommend_model,
                          model,
                          train_data,
                          test_data,
                          true_candidates_dict=None,
                          CI_start_test_epoch=0,
                          earlyStop_epochs=5):
    """
    训练多个epoch,每个之后均测试,选择并返回NDCG等最终指标最优的模型
    :param recommend_model:  整体的推荐模型
    :param model:  model_core
    :param train_data:
    :param test_data:
    :param start_epoch: 之前该模型已经训练过多个epoch,在这个基础上接着训练
    :param true_candidates_dict:
    :return:
    """
    print('training_save_best_NDCG_model...')
    epoch_evaluate_results = []

    # model
    train_model = recommend_model.get_pairwise_model(
    ) if data_repository.get_args().pairwise else model

    # data
    train_instances_dict = recommend_model.get_instances(
        train_data,
        pairwise_train_phase_flag=data_repository.get_args().pairwise)
    train_labels = train_data.get('label')
    if data_repository.get_args(
    ).final_activation == 'softmax':  # convert labels for the softmax output
        train_labels = utils.to_categorical(train_labels, num_classes=2)

    best_epoch, best_NDCG_5 = 0, 0
    for epoch in range(data_repository.get_args().num_epochs):
        if epoch == 0:  # compile before the first epoch
            # loss_ = lambda y_true, y_pred: y_pred if data_repository.get_args().pairwise else 'binary_crossentropy'
            # train_model.compile(optimizer=recommend_model.optimizer, loss=loss_,metrics=['accuracy'])
            train_model.compile(optimizer=recommend_model.optimizer,
                                loss='binary_crossentropy',
                                metrics=['accuracy'])
            print('whole_model compile,done!')
        print('Epoch {}'.format(epoch))

        hist = train_model.fit(
            train_instances_dict,
            np.array(train_labels),
            batch_size=data_repository.get_args().batch_size,
            epochs=1,
            verbose=1,
            shuffle=True,
            validation_split=data_repository.get_args().validation_split)
        print('Epoch {}, train done!'.format(epoch))

        # record: dataset setup, model architecture, training settings
        record_name = recommend_model.get_name() + data_repository.get_args(
        ).train_name if epoch == 0 else ''  # test-set results are written to evalute.csv
        save_loss_acc(hist, record_name, epoch=epoch)  # record every epoch

        # the first few CI epochs perform poorly; skip testing them to save time
        first_test_epoch = CI_start_test_epoch if isinstance(
            recommend_model, CI_Model) else 0
        if epoch < first_test_epoch:
            epoch_evaluate_results.append(None)
            continue

        # evaluate this epoch
        epoch_evaluate_result = evalute_by_epoch(
            recommend_model,
            model,
            record_name,
            test_data,
            record_time=True if epoch == 0 else False,
            true_candidates_dict=true_candidates_dict)
        epoch_evaluate_results.append(epoch_evaluate_result)

        # save the model parameters only when NDCG@5 improves on the current best TODO
        if epoch_evaluate_result[0][3] >= best_NDCG_5:
            best_NDCG_5 = epoch_evaluate_result[0][3]
            best_epoch = epoch
            model.save_weights(
                data_repository.get_ds().new_model_para_path.format(
                    recommend_model.model_dir, epoch))
        else:
            if epoch - best_epoch >= earlyStop_epochs:  # no improvement for several epochs: stop early
                break

    # record the best epoch and the best NDCG@5
    with open(
            data_repository.get_ds().new_best_epoch_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_epoch))
    with open(
            data_repository.get_ds().new_best_NDCG_path.format(
                recommend_model.model_dir), 'w') as f:
        f.write(str(best_NDCG_5))
    print('best epoch:{},best NDCG@5:{}'.format(best_epoch, best_NDCG_5))

    # record the best indicators
    csv_table_name = 'best_indicaters\n'
    summary(evaluate_path, csv_table_name, epoch_evaluate_results[best_epoch],
            data_repository.get_args().topKs)

    # check whether the word-embedding matrix changed, especially the padding zeros
    # print('some embedding parameters after {} epoch:'.format(epoch))
    # print (recommend_model.embedding_layer.get_weights ()[0][:2])

    # delete the saved parameters of all non-best epochs
    try:
        for i in range(data_repository.get_args().num_epochs):
            temp_path = data_repository.get_ds().new_model_para_path.format(
                recommend_model.model_dir, i)
            if i != best_epoch and os.path.exists(temp_path):
                os.remove(temp_path)
        model.load_weights(data_repository.get_ds().new_model_para_path.format(
            recommend_model.model_dir, best_epoch))
    finally:
        return model
Example #19
def add_pop_predictions(recommend_model,
                        csv_table_name,
                        epoch,
                        pop_mode='sigmoid',
                        a_pop_ratio=0.0):
    test_mashup_id_list, test_api_id_list, predictions = None, None, None
    with open(
            os.path.join(data_repository.get_args().data_dir,
                         'model_predictions_{}.dat'.format(epoch)), 'rb') as f:
        test_mashup_id_list, test_api_id_list, predictions = pickle.load(f)

    api_id2covec, api_id2pop = recommend_model.pd.get_api_co_vecs(
        pop_mode=pop_mode)

    # multiply the prediction by popularity
    predictions_pop = []
    for m_index in range(len(predictions)):
        a_mashup_predictions = predictions[m_index]
        temp_preditions = []
        for a_index in range(len(a_mashup_predictions)):
            a_prediction = a_mashup_predictions[a_index]
            api_id = test_api_id_list[m_index][a_index]
            temp_preditions.append(api_id2pop[api_id] * a_prediction)
        predictions_pop.append(temp_preditions)
    evaluate_result_linear_sum = evalute(
        test_api_id_list, predictions_pop,
        data_repository.get_args().grounds,
        data_repository.get_args().topKs)  # evaluate
    summary(evaluate_path, pop_mode + '_pop_prod\n' + csv_table_name,
            evaluate_result_linear_sum,
            data_repository.get_args().topKs)

    # linear weighted sum
    pop_ratios = [0.2 + 0.2 * i for i in range(5)]
    for pop_ratio in pop_ratios:
        predictions_pop_linear = []
        for m_index in range(len(predictions)):
            a_mashup_predictions = predictions[m_index]
            temp_preditions = []
            for a_index in range(len(a_mashup_predictions)):
                a_prediction = a_mashup_predictions[a_index]
                api_id = test_api_id_list[m_index][a_index]
                temp_preditions.append((1 - pop_ratio) * a_prediction +
                                       pop_ratio * api_id2pop[api_id])
            predictions_pop_linear.append(temp_preditions)

        evaluate_result_linear_sum = evalute(
            test_api_id_list, predictions_pop_linear,
            data_repository.get_args().grounds,
            data_repository.get_args().topKs)  # evaluate
        summary(evaluate_path,
                pop_mode + '_pop_{}\n'.format(pop_ratio) + csv_table_name,
                evaluate_result_linear_sum,
                data_repository.get_args().topKs)

    predictions_pop_last = []
    for m_index in range(len(predictions)):
        # first select the candidates by score
        score_mapping = [
            pair
            for pair in zip(test_api_id_list[m_index], predictions[m_index])
        ]
        max_k_pairs = heapq.nlargest(100, score_mapping,
                                     key=lambda x: x[1])  # take the top 100 by score
        max_k_candidates, _ = zip(*max_k_pairs)
        # then rank only by popularity
        temp_preditions = [
            api_id2pop[api_id] if api_id in max_k_candidates else -1
            for api_id in test_api_id_list[m_index]
        ]
        predictions_pop_last.append(temp_preditions)

    evaluate_result_linear_sum = evalute(
        test_api_id_list, predictions_pop_last,
        data_repository.get_args().grounds,
        data_repository.get_args().topKs)  # evaluate
    summary(evaluate_path, pop_mode + '_pop_last\n' + csv_table_name,
            evaluate_result_linear_sum,
            data_repository.get_args().topKs)
Example #20
def Samanta(topK,
            if_pop=2,
            MF_mode='node2vec',
            pop_mode='',
            text_mode='HDP',
            LDA_topic_num=None):
    """
    :param Para:
    :param if_pop 如何使用pop  0 不使用;1,只做重排序;2总乘积做排序
    :param topK: 使用KNN表示新query的mf特征
    :param text_mode: 使用哪种特征提取方式  LDA  HDP
    :param pop_mode:pop值是否使用sigmoid规约到0-1区间
    :param pop_mode:MF_mode 为了省事,直接用node2vec得了
    :return:
    """

    api2pop = None
    if if_pop:
        api_co_vecs, api2pop = data_repository.get_md().get_api_co_vecs(
            pop_mode)  # TODO

    root = os.path.join(data_repository.get_ds().data_root, 'baselines')
    if not os.path.exists(root):
        os.makedirs(root)
    mashup_feature_path = os.path.join(
        root, 'mashup_{}.txt'.format(text_mode))  # ...
    api_feature_path = os.path.join(root, 'api_{}.txt'.format(text_mode))

    # get mashup_hdp_features and api_hdp_features
    if not os.path.exists(api_feature_path):
        gd = get_default_gd()
        _mashup_features, _api_features = gd.model_pcs(text_mode,
                                                       LDA_topic_num)
        np.savetxt(mashup_feature_path, _mashup_features)
        np.savetxt(api_feature_path, _api_features)
    else:
        _mashup_features = np.loadtxt(mashup_feature_path)
        _api_features = np.loadtxt(api_feature_path)

    candidate_ids_list = []
    all_predict_results = []

    test_data = data_repository.get_ds().test_data
    test_mashup_num = len(test_data.get('mashup'))
    mashup_emb_df = data_repository.get_ds().MF_obj.mashup_emb_df
    api_emb_df = data_repository.get_ds().MF_obj.api_emb_df

    for i in range(test_mashup_num):
        test_m_id = test_data.get('mashup')[i][0]  # each mashup id
        candidate_ids = test_data.get('api')[i]
        candidate_ids_list.append(candidate_ids)

        # represent the new mashup by a similarity-weighted sum of its neighbours' latent factors
        mid2sim = {}
        for train_m_id in mashup_emb_df.index.tolist():
            mid2sim[train_m_id] = cos_sim(_mashup_features[test_m_id],
                                          _mashup_features[train_m_id])  # TODO
        topK_ids, topK_sims = zip(*(
            sorted(mid2sim.items(), key=lambda x: x[1], reverse=True)[:topK]))
        topK_sims = np.array(topK_sims) / sum(topK_sims)  # normalize the similarities
        cf_feature = np.zeros((data_repository.get_args().implict_feat_dim, ))
        for z in range(len(topK_ids)):
            cf_feature += topK_sims[z] * mashup_emb_df['embedding'][
                topK_ids[z]]

        # score the new mashup against every candidate api
        predict_results = []
        temp_predict_results = []  # helper list used when re-ranking by popularity
        api_zeros = np.zeros((data_repository.get_args().implict_feat_dim))
        api_ids = set(api_emb_df.index.tolist())
        for api_id in candidate_ids:  # api id
            api_i_feature = api_emb_df['embedding'][
                api_id] if api_id in api_ids else api_zeros  # an api in the test set may never appear in training
            cf_score = np.sum(np.multiply(
                api_i_feature, cf_feature))  # dot product of the mashup and api latent factors
            sim_score = cos_sim(_mashup_features[test_m_id],
                                _api_features[api_id])  # cosine similarity of the text features
            if if_pop == 1:
                temp_predict_results.append((api_id, cf_score * sim_score))
            elif if_pop == 0:
                predict_results.append(cf_score * sim_score)
            elif if_pop == 2:
                predict_results.append(cf_score * sim_score * api2pop[api_id])
        if if_pop == 1:
            max_k_pairs = heapq.nlargest(topK,
                                         temp_predict_results,
                                         key=lambda x: x[1])  # first rank once by the product
            max_k_candidates, _ = zip(*max_k_pairs)
            max_k_candidates = set(max_k_candidates)
            predict_results = [
                api2pop[api_id] if api_id in max_k_candidates else -1
                for api_id in candidate_ids
            ]  # re-rank by popularity

        all_predict_results.append(predict_results)
    print('Samanta test,done!')

    evaluate_result = evalute(
        candidate_ids_list, all_predict_results,
        data_repository.get_ds().test_data.get('all_ground_api_ids'),
        data_repository.get_args().topKs)  # evaluate
    _name = '_pop_{}'.format(if_pop)
    _name += data_repository.get_args().mf_mode
    csv_table_name = data_repository.get_ds().name + 'Samanta_model_{}'.format(
        topK) + _name + "\n"  # whole_model.name
    summary(evaluate_path, csv_table_name, evaluate_result,
            data_repository.get_args().topKs)  # record results

    def divide(slt_apiNum):
        test_api_id_list_, predictions_, grounds_ = [], [], []
        for i in range(test_mashup_num):
            if len(data_repository.get_ds().slt_api_ids_instances[i]
                   ) == slt_apiNum:
                test_api_id_list_.append(candidate_ids_list[i])
                predictions_.append(all_predict_results[i])
                grounds_.append(data_repository.get_ds().test_data.get(
                    'all_ground_api_ids')[i])
        return test_api_id_list_, predictions_, grounds_

    if data_repository.get_args().data_mode == 'newScene':
        for slt_apiNum in range(3):
            test_api_id_list_, predictions_, grounds_ = divide(slt_apiNum + 1)
            evaluate_result = evalute(test_api_id_list_, predictions_,
                                      grounds_,
                                      data_repository.get_args().topKs)
            summary(evaluate_path,
                    str(slt_apiNum + 1) + '_' + csv_table_name,
                    evaluate_result,
                    data_repository.get_args().topKs)  #
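
The core of the Samanta baseline is the KNN step above: the new mashup's latent factor is a similarity-weighted average of its top-K training neighbours' factors. A small self-contained sketch of just that step (all ids, similarities and embeddings below are toy values):

import numpy as np

# Hypothetical neighbour embeddings and text similarities to the new mashup.
neighbour_embeddings = {
    10: np.array([0.2, 0.8]),
    11: np.array([0.5, 0.1]),
    12: np.array([0.9, 0.4]),
}
sim_to_new_mashup = {10: 0.9, 11: 0.6, 12: 0.3}

topK = 2
topK_ids = sorted(sim_to_new_mashup, key=sim_to_new_mashup.get, reverse=True)[:topK]
topK_sims = np.array([sim_to_new_mashup[m] for m in topK_ids])
topK_sims = topK_sims / topK_sims.sum()          # normalize, as in the snippet above

cf_feature = np.zeros(2)
for m_id, w in zip(topK_ids, topK_sims):
    cf_feature += w * neighbour_embeddings[m_id]  # weighted sum of neighbour factors

print(cf_feature)  # latent factor then used to score candidate apis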