Example #1
import argparse

from surprise import (SVD, NMF, BaselineOnly, CoClustering, Dataset, Reader,
                      SlopeOne, accuracy)

# output_ranking, precision_recall_at_k, f_measure, get_conversion_rate and
# get_ndcg are project-specific helpers assumed to be defined elsewhere.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_file_path",
                        default="data/train.csv",
                        help="training file path")
    parser.add_argument("--test_file_path",
                        default="data/test.csv",
                        help="testing file path")
    parser.add_argument("--approach",
                        default="SVD",
                        help="Baseline | SVD | SlopeOne | NMF | CoClustering")
    parser.add_argument("--output_ranking_file",
                        default="ranking",
                        help="output ranking for test")
    bsl_options = {'method': 'sgd', 'n_epochs': 20, 'reg_u': 100, 'reg_i': 50}
    options = {
        "Baseline": BaselineOnly(bsl_options, verbose=True),
        "SVD": SVD(verbose=True, n_factors=20, n_epochs=3),
        "SlopeOne": SlopeOne(),
        "NMF": NMF(),
        "CoClustering": CoClustering()
    }
    args = parser.parse_args()
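    # Despite the .csv defaults above, the Reader parses the files as
    # tab-separated 'user item rating timestamp' records.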
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    algo = options[args.approach]
    train_data = Dataset.load_from_file(args.train_file_path, reader=reader)
    test_data = Dataset.load_from_file(args.test_file_path, reader=reader)
    train_set = train_data.build_full_trainset()
    test_set = test_data.build_full_trainset().build_testset()
    print("training....")
    algo.fit(train_set)
    print("testing...")
    predictions = algo.test(test_set)
    accuracy.mae(predictions, verbose=True)
    accuracy.rmse(predictions, verbose=True)
    ### Extra Credit
    output_ranking(predictions,
                   args.output_ranking_file + "_" + args.approach + ".out")
    precisions, recalls = precision_recall_at_k(predictions,
                                                k=10,
                                                threshold=2.5)
    print("Precision:",
          sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))
    print("F-measure:", f_measure(precisions, recalls))
    print("conversion_rate:", get_conversion_rate(predictions, k=10))
    print("ndcg:", get_ndcg(predictions, k_highest_scores=10))
Example #2
import pandas as pd

from surprise import Dataset, Reader, model_selection
from surprise.model_selection import train_test_split

# ds (data loading), rec (top-N helpers), metrics and select_model are
# project-specific modules assumed to be importable here.


def all_models(_file_path, modelname):
    data = ds.get_data(_file_path)

    data_surprise = data[['customer_id', 'product_id', 'star_rating']].rename(
        columns={'customer_id': 'userID', 'product_id': 'itemID',
                 'star_rating': 'rating'})

    reader = Reader(rating_scale=(1.0, 5.0))
    df_loaded = Dataset.load_from_df(data_surprise, reader)

    results_list = []

    # features
    reviews = data.shape[0]
    n_users = data.customer_id.nunique()
    n_products = data.product_id.nunique()
    mean_rating = data.star_rating.mean()
    rating_std = data.star_rating.std()
    sparsity = reviews * 100 / (n_users * n_products)  # % of user-item cells rated

    # Perform 5-fold cross validation for the selected model
    results = model_selection.cross_validate(
        select_model(df_loaded, model_selection=modelname),
        df_loaded,
        measures=['RMSE', 'MAE'],
        cv=5,
        verbose=False)

    # Hold out 25% of the ratings to compute ranking metrics on a single split
    trainset, testset = train_test_split(df_loaded, test_size=.25)
    algo = select_model(df_loaded, model_selection=modelname)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Build the top-30 recommendation list for every user in the test set
    top_n = rec.get_top_n(predictions, n=30)

    print('Recommendations for the user')
    dfo = pd.DataFrame(columns=['UserId', 'ItemId'])
    for i, (uid, user_ratings) in enumerate(top_n.items()):
        dfo.loc[i] = [uid, user_ratings]

    # Precision and recall at k on the held-out split, averaged over users
    precisions, recalls = metrics.precision_recall_at_k(predictions,
                                                        k=5,
                                                        threshold=4)
    map_k = sum(precisions.values()) / len(precisions)
    mar_k = sum(recalls.values()) / len(recalls)

    # Attach product titles to the recommended users' rows
    merge = dfo.merge(data, left_on='UserId', right_on='customer_id')
    merge1 = merge[['UserId', 'product_title']]
    print(merge1)

    # Collect CV results, ranking metrics, and dataset features in one row.
    # pd.Series.append was removed in pandas 2.0, so use pd.concat instead.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp,
                     pd.Series({'map_k': map_k,
                                'mar_k': mar_k,
                                'data': str(_file_path),
                                'Algorithm': str(modelname),
                                'reviews': reviews,
                                'n_users': n_users,
                                'n_products': n_products,
                                'mean_rating': mean_rating,
                                'std_rating': rating_std,
                                'sparsity': sparsity})])

    results_list.append(tmp)
    results_df = pd.DataFrame(results_list)
    return results_df
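
Examples #2 and #3 rely on rec.get_top_n, which is not shown. A plausible
sketch, following the standard top-N recipe from the Surprise FAQ (the
project's rec module may differ):

from collections import defaultdict


def get_top_n(predictions, n=10):
    """Map each user id to their n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's predictions by estimated rating and keep the n best.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n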
Example #3
import pandas as pd
import streamlit as st

from surprise import Dataset, Reader, model_selection
from surprise.model_selection import KFold

# ds, rec, metrics and select_model are the same project-specific modules
# assumed in Example #2.


def all_models(_file_path, _save_path):
    data = ds.get_data(_file_path, 'data/data_subset.csv', 0.99)
    data_surprise = data[['customer_id', 'product_id', 'star_rating']].rename(
        columns={'customer_id': 'userID', 'product_id': 'itemID',
                 'star_rating': 'rating'})

    reader = Reader(rating_scale=(1.0, 5.0))
    df_loaded = Dataset.load_from_df(data_surprise, reader)

    results_list = []

    # features
    reviews = data.shape[0]
    n_users = data.customer_id.nunique()
    n_products = data.product_id.nunique()
    mean_rating = data.star_rating.mean()
    rating_std = data.star_rating.std()
    sparsity = reviews * 100 / (n_users * n_products)  # % of user-item cells rated

    for model in ['user_user', 'item_item', 'matrix_fact']:
        # Perform 5-fold cross validation for each candidate model
        results = model_selection.cross_validate(
            select_model(df_loaded, model_selection=model),
            df_loaded,
            measures=['RMSE', 'MAE'],
            cv=5,
            verbose=False)

        # Precision and recall at k, computed over a separate 5-fold split
        kf = KFold(n_splits=5)
        map_k, mar_k = 0, 0
        algo = select_model(df_loaded, model_selection=model)
        for trainset, testset in kf.split(df_loaded):
            algo.fit(trainset)
            predictions = algo.test(testset)
            precisions, recalls = metrics.precision_recall_at_k(predictions,
                                                                k=5,
                                                                threshold=4)

            # precision_recall_at_k returns per-user dicts, so average them
            # before accumulating across the splits
            map_k += sum(precisions.values()) / len(precisions)
            mar_k += sum(recalls.values()) / len(recalls)

        # Combine CV results, ranking metrics, and dataset features in one row.
        # pd.Series.append was removed in pandas 2.0, so use pd.concat instead.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = pd.concat([tmp,
                         pd.Series({'map_k': map_k / 5,
                                    'mar_k': mar_k / 5,
                                    'data': str(_file_path),
                                    'Algorithm': str(model),
                                    'reviews': reviews,
                                    'n_users': n_users,
                                    'n_products': n_products,
                                    'mean_rating': mean_rating,
                                    'std_rating': rating_std,
                                    'sparsity': sparsity})])

        results_list.append(tmp)
    print(results_list)
    results_df = pd.DataFrame(results_list)

    # saving the results file to folder
    if _save_path:
        results_df.to_csv(_save_path, mode='a', index=False)

    return results_df


def show_recommendations(top_n, predictions, data):
    # Helper for the Streamlit app (the name is assumed). top_n and
    # predictions are expected to come from a fitted model and a held-out
    # test set, as in Example #2.
    dfo = pd.DataFrame(columns=['UserId', 'ItemId'])
    for i, (uid, user_ratings) in enumerate(top_n.items()):
        dfo.loc[i] = [uid, user_ratings]
        print('user Id ,Item Id')
        print(uid, [iid for (iid, _) in user_ratings])

    precisions, recalls = metrics.precision_recall_at_k(predictions,
                                                        k=5,
                                                        threshold=4)

    # Show the recommended product titles in the Streamlit app
    merge = dfo.merge(data, left_on='UserId', right_on='customer_id')
    merge1 = merge[['product_title']]
    st.write(merge1)
    return precisions, recalls
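
Both all_models variants above call select_model, which is not shown. A
hypothetical sketch, assuming the three names map to standard Surprise
algorithms (the real helper may configure them differently):

from surprise import SVD, KNNBasic


def select_model(df_loaded, model_selection='matrix_fact'):
    # df_loaded is accepted to match the call sites above but is not needed
    # just to construct the algorithms.
    if model_selection == 'user_user':
        return KNNBasic(sim_options={'user_based': True})
    if model_selection == 'item_item':
        return KNNBasic(sim_options={'user_based': False})
    return SVD()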