Example #1
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    R_train = R_train + R_valid  # fold the validation split into training for final evaluation

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = execute(R_train,
                         R_test,
                         row,
                         models[row['model']],
                         gpu_on=args.gpu)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
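
The repositories these examples come from define their own CLI entry points, which are not shown here. A minimal sketch of how this `main` might be wired up with `argparse` (the flag names and defaults below are assumptions inferred from the attributes used above):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', default='data/')          # directory holding the rating matrices (assumed)
    parser.add_argument('--problem', default='movielens/')  # sub-folder with tuning results (assumed)
    parser.add_argument('--train', default='Rtrain.npz')
    parser.add_argument('--valid', default='Rvalid.npz')
    parser.add_argument('--test', default='Rtest.npz')
    parser.add_argument('--name', default='final_result.csv')
    parser.add_argument('--gpu', action='store_true')
    main(parser.parse_args())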
Example #2
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.tuning_result_path,
                                   'NDCG')

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
    R_test = load_numpy(path=args.data_dir, name=args.test_set)

    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = general(R_train,
                         R_test,
                         row,
                         models[row['model']],
                         measure=row['similarity'],
                         gpu_on=args.gpu,
                         model_folder=args.model_folder)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.save_path)
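
`load_numpy` is a project utility whose source is not shown. Since the matrices it returns support sparse addition and `.toarray()` (see Example #4), a plausible stand-in backed by `scipy.sparse` would be:

from scipy import sparse

def load_numpy(path, name):
    # Hypothetical stand-in: load a user-item matrix saved with sparse.save_npz.
    return sparse.load_npz(path + name).tocsr()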
Example #3
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.tuning_result_path,
                                   'MAP@10')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    R_train = R_train + R_valid

#    Optional: binarize explicit ratings at a threshold of 3 before
#    evaluating (kept from the original for reference):
#    R_train[(R_train <= 3).nonzero()] = 0
#    R_test[(R_test <= 3).nonzero()] = 0
#    R_train[(R_train > 3).nonzero()] = 1
#    R_test[(R_test > 3).nonzero()] = 1

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP']
        row['topK'] = topK
        result = execute(R_train, R_test, row, models[row['model']])
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
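
The commented-out block above binarizes explicit ratings at a threshold of 3. If that preprocessing is wanted, a sketch that avoids inefficient indexed assignment by operating on the CSR data array directly (assuming `R_train` is a `scipy.sparse.csr_matrix`):

def binarize(matrix, threshold=3):
    # Map ratings above the threshold to 1 and drop the rest.
    matrix = matrix.tocsr(copy=True)
    matrix.data = (matrix.data > threshold).astype(matrix.dtype)
    matrix.eliminate_zeros()  # physically remove the entries that became 0
    return matrix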
Example #4
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name,
                                           'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    start_time = time.time()

    results = critiquing(
        matrix_Train=R_train,
        matrix_Test=R_test,
        keyphrase_freq=R_train_keyphrase,
        item_keyphrase_freq=R_train_item_keyphrase,
        num_users_sampled=args.num_users_sampled,
        num_items_sampled=args.num_items_sampled,
        max_iteration_threshold=args.max_iteration_threshold,
        dataset_name=args.dataset_name,
        model=models[args.model],
        parameters_row=parameters_row,
        critiquing_model_name=args.critiquing_model_name,
        keyphrase_selection_method=args.keyphrase_selection_method,
        topk=args.topk,
        lamb=args.lamb)

    print("Final Time Elapsed: {}".format(inhour(time.time() - start_time)))

    save_dataframe_csv(results, table_path, args.save_path)
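
`inhour` is another helper that is not shown; from its use here it evidently formats an elapsed time in seconds for logging. A stand-in under that assumption:

import time

def inhour(elapsed):
    # Assumed behavior: render elapsed seconds as H:MM:SS.
    return time.strftime('%H:%M:%S', time.gmtime(elapsed))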
Example #5
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.param, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)

    results = converge(R_train, R_valid, df, table_path, args.name,
                       epochs=500, gpu_on=args.gpu)

    show_training_progress(results, hue='model', metric='NDCG',
                           name="epoch_vs_ndcg")
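
`show_training_progress` presumably plots the chosen metric against the training epoch, one line per model. A minimal stand-in under that assumption, using seaborn:

import matplotlib.pyplot as plt
import seaborn as sns

def show_training_progress(df, hue='model', metric='NDCG', name='epoch_vs_ndcg'):
    # Assumes df carries an 'epoch' column, the metric column, and the hue column.
    ax = sns.lineplot(data=df, x='epoch', y=metric, hue=hue)
    ax.figure.savefig(name + '.png', bbox_inches='tight')
    plt.close(ax.figure)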
Example #6
def main(args):

    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    R_train = R_train + R_valid

    topK = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

    personalization(R_train,
                    R_test,
                    df,
                    topK,
                    args.problem,
                    args.model_folder,
                    gpu_on=args.gpu)
Example #7
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
    metric = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

    usercategory(R_train,
                 R_test,
                 df,
                 topK,
                 metric,
                 args.problem,
                 args.model_folder,
                 gpu_on=args.gpu)
Example #8
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    if args.explanation:
        df = find_best_hyperparameters(table_path + args.tuning_result_path,
                                       'NDCG@10')
    else:
        df = find_best_hyperparameters(table_path + args.tuning_result_path,
                                       'NDCG')

    num_users = pd.read_csv(args.data_dir + args.dataset_name + '/' +
                            args.user_col + '.csv')[args.user_col].nunique()
    num_items = pd.read_csv(args.data_dir + args.dataset_name + '/' +
                            args.item_col + '.csv')[args.item_col].nunique()

    df_train = pd.read_csv(args.data_dir + args.dataset_name + '/' +
                           args.train_set)
    df_train = df_train[df_train[args.rating_col] == 1]
    df_train[args.keyphrase_vector_col] = df_train[
        args.keyphrase_vector_col].apply(ast.literal_eval)

    df_test = pd.read_csv(args.data_dir + args.dataset_name + '/' +
                          args.test_set)

    keyphrase_names = pd.read_csv(args.data_dir + args.dataset_name + '/' +
                                  args.keyphrase_set)[
                                      args.keyphrase_col].values

    if args.explanation:
        results = explanation_converge(num_users,
                                       num_items,
                                       args.user_col,
                                       args.item_col,
                                       args.rating_col,
                                       args.keyphrase_vector_col,
                                       df_train,
                                       df_test,
                                       keyphrase_names,
                                       df,
                                       table_path,
                                       args.save_path,
                                       epoch=args.epoch)
    else:
        results = converge(num_users,
                           num_items,
                           args.user_col,
                           args.item_col,
                           args.rating_col,
                           args.keyphrase_vector_col,
                           df_train,
                           df_test,
                           keyphrase_names,
                           df,
                           table_path,
                           args.save_path,
                           epoch=args.epoch)

    show_training_progress(results,
                           hue='model',
                           metric='NDCG',
                           name="epoch_vs_ndcg")
Example #9
def general(train, test, keyphrase_train, keyphrase_test, params, save_path, final_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except FileNotFoundError:
        output_df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2',
                                          'lambda_keyphrase', 'lambda_latent',
                                          'lambda_rating', 'topK',
                                          'learning_rate', 'epoch',
                                          'corruption', 'optimizer'])

    for index, row in df.iterrows():

        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']

        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        settings = "model: {}, rank: {}, beta: {}, lambda_l2: {}, " \
                   "lambda_keyphrase: {}, lambda_latent: {}, lambda_rating: {}, " \
                   "learning_rate: {}, epoch: {}, corruption: {}, optimizer: {}"

        progress.section(settings.format(algorithm, rank, beta, lamb_l2,
                                         lamb_keyphrase, lamb_latent,
                                         lamb_rating, learning_rate, epoch,
                                         corruption, optimizer))

        progress.subsection("Training")

        model = models[algorithm](matrix_train=train,
                                  epoch=epoch,
                                  lamb_l2=lamb_l2,
                                  lamb_keyphrase=lamb_keyphrase,
                                  lamb_latent=lamb_latent,
                                  lamb_rating=lamb_rating,
                                  beta=beta,
                                  learning_rate=learning_rate,
                                  rank=rank,
                                  corruption=corruption,
                                  optimizer=optimizer,
                                  matrix_train_keyphrase=keyphrase_train)

        progress.subsection("Prediction")

        rating_score, keyphrase_score = model.predict(train.todense())

        progress.subsection("Evaluation")

        if final_explanation:
            prediction = predict_keyphrase(keyphrase_score,
                                           topK=row['topK'][-2])

            result = evaluate_explanation(prediction,
                                          keyphrase_test,
                                          row['metric'],
                                          row['topK'])
        else:
            prediction = predict(rating_score,
                                 topK=row['topK'][-1],
                                 matrix_Train=train)

            result = evaluate(prediction, test, row['metric'], row['topK'])

        result_dict = {'model': algorithm,
                       'rank': rank,
                       'beta': beta,
                       'lambda_l2': lamb_l2,
                       'lambda_keyphrase': lamb_keyphrase,
                       'lambda_latent': lamb_latent,
                       'lambda_rating': lamb_rating,
                       'learning_rate': learning_rate,
                       'epoch': epoch,
                       'corruption': corruption,
                       'optimizer': optimizer}

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4),
                                 round(result[name][1], 4)]

        output_df = output_df.append(result_dict, ignore_index=True)

        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
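
This example and Example #11 grow `output_df` with `DataFrame.append`, which was deprecated in pandas 1.4 and removed in 2.0. On current pandas the same row-wise accumulation can be written with `pd.concat`:

import pandas as pd

# Drop-in replacement for: output_df = output_df.append(result_dict, ignore_index=True)
output_df = pd.concat([output_df, pd.DataFrame([result_dict])],
                      ignore_index=True)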
Example #10
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name,
                                           'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    lambs = [
        0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500,
        1000, 10000, 100000
    ]
    topks = [10, 20, 50, 100]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    for topk in topks:
        for lamb in lambs:
            results = critiquing(
                matrix_Train=R_train,
                matrix_Test=R_test,
                keyphrase_freq=R_train_keyphrase,
                item_keyphrase_freq=R_train_item_keyphrase,
                num_users_sampled=args.num_users_sampled,
                num_items_sampled=args.num_items_sampled,
                max_iteration_threshold=args.max_iteration_threshold,
                dataset_name=args.dataset_name,
                model=models[args.model],
                parameters_row=parameters_row,
                critiquing_model_name=args.critiquing_model_name,
                keyphrase_selection_method=args.keyphrase_selection_method,
                topk=topk,
                lamb=lamb)
            topk_path = "topk_" + str(topk) + "/"
            save_name = args.save_path + topk_path + "tuning_at_lamb_" + str(
                lamb) + "_with_" + args.keyphrase_selection_method + ".csv"
            save_dataframe_csv(results, table_path, save_name)
Example #11
def general(num_users, num_items, user_col, item_col, rating_col,
            keyphrase_vector_col, df_train, df_test, keyphrase_names, params,
            save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'],
                                   'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except FileNotFoundError:
        output_df = pd.DataFrame(columns=[
            'model', 'rank', 'num_layers', 'train_batch_size',
            'predict_batch_size', 'lambda', 'topK', 'learning_rate', 'epoch',
            'negative_sampling_size'
        ])

    for index, row in df.iterrows():

        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 300  # fixed here rather than read from the tuning results
        negative_sampling_size = row['negative_sampling_size']

        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = [
            'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
        ]

        settings = "model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, " \
                   "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, " \
                   "epoch: {7}, negative_sampling_size: {8}"
        progress.section(
            settings.format(algorithm, rank, num_layers, train_batch_size,
                            predict_batch_size, lamb, learning_rate, epoch,
                            negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")

        negative_sampler = Negative_Sampler(
            df_train[[user_col, item_col, keyphrase_vector_col]],
            user_col,
            item_col,
            rating_col,
            keyphrase_vector_col,
            num_items=num_items,
            batch_size=train_batch_size,
            num_keyphrases=len(keyphrase_names),
            negative_sampling_size=negative_sampling_size)

        model = models[algorithm](num_users=num_users,
                                  num_items=num_items,
                                  text_dim=len(keyphrase_names),
                                  embed_dim=rank,
                                  num_layers=num_layers,
                                  negative_sampler=negative_sampler,
                                  lamb=lamb,
                                  learning_rate=learning_rate)

        progress.subsection("Training")

        pretrained_path = load_yaml('config/global.yml',
                                    key='path')['pretrained']
        # Optionally restore cached weights instead of retraining:
        # model.load_model(pretrained_path + params['tuning_result_path'],
        #                  row['model'])
        model.train_model(df_train,
                          user_col,
                          item_col,
                          rating_col,
                          epoch=epoch)
        # model.save_model(pretrained_path+params['tuning_result_path'], row['model'])

        progress.subsection("Prediction")

        prediction, explanation = predict_elementwise(
            model,
            df_train,
            user_col,
            item_col,
            row['topK'][-1],
            batch_size=row['predict_batch_size'],
            enable_explanation=False,
            keyphrase_names=keyphrase_names)

        R_test = to_sparse_matrix(df_test, num_users, num_items, user_col,
                                  item_col, rating_col)

        result = evaluate(prediction, R_test, row['metric'], row['topK'])

        # Not finished yet
        result_dict = {
            'model': row['model'],
            'rank': row['rank'],
            'num_layers': row['num_layers'],
            'train_batch_size': row['train_batch_size'],
            'predict_batch_size': row['predict_batch_size'],
            'lambda': row['lambda'],
            'topK': row['topK'][-1],
            'learning_rate': row['learning_rate'],
            'epoch': epoch,
            'negative_sampling_size': row['negative_sampling_size'],
        }

        for name in result.keys():
            result_dict[name] = round(result[name][0], 4)
        output_df = output_df.append(result_dict, ignore_index=True)

        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
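
`to_sparse_matrix` is also project-specific; judging by its arguments, it pivots the test interactions into a sparse user-item matrix. A sketch under that assumption:

from scipy.sparse import csr_matrix

def to_sparse_matrix(df, num_users, num_items, user_col, item_col, rating_col):
    # Hypothetical stand-in: one row per user, one column per item.
    return csr_matrix((df[rating_col], (df[user_col], df[item_col])),
                      shape=(num_users, num_items))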