Example #1
from sklearn.decomposition import NMF  # this example uses scikit-learn's NMF, not Surprise's

def NMF_Mat(df):
    model_nmf = NMF(n_components=10, init='random', random_state=0)
    m = model_nmf.fit_transform(df)
    h = model_nmf.components_
    nmf_mat = m @ h

    return nmf_mat
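This example uses scikit-learn's NMF (all of the later ones use Surprise's). A minimal usage sketch on a made-up non-negative matrix:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
ratings = pd.DataFrame(rng.random((12, 15)))  # hypothetical non-negative matrix
approx = NMF_Mat(ratings)                     # rank-10 reconstruction
print(approx.shape)                           # (12, 15)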
Example #2
def nmf(data, training, testing):
    '''
    Tune NMF parameters, then calculate the RMSE, coverage and running time of NMF

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of NMF with optimized parameters
        top_n: top-n predicted items for each user
    '''

    # candidate parameters
    nmf_param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}

    # optimize parameters (GridSearch is the legacy pre-1.0.5 Surprise API;
    # current versions use surprise.model_selection.GridSearchCV instead)
    grid_search = GridSearch(NMF, nmf_param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('NMF:', param)

    # fit model using the optimized parameters
    nmf = NMF(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    nmf.train(training)  # train() was renamed fit() in Surprise 1.0.5+

    # evaluate the model using test data
    predictions = nmf.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
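get_top_n is called here (and in Examples #4 and #12) but never defined; it is presumably the top-N helper from the Surprise FAQ, sketched below:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user to their n highest-estimated (item, rating) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n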
Example #3
    def recommender_nmf_baseline(self, train_file, test_file, output):

        train, test, train_dataset, test_dataset = prepare_datasets(
            train_file, test_file)
        # NMF baseline with default parameters (the user_based switch mentioned
        # in the original comment applies to the KNN algorithms, not to NMF)
        algo_nmf_baseline = NMF()

        algo_nmf_baseline.fit(train)

        #not_seen_elems = self.merge_train_set(train_dataset, test_dataset)

        #predictions_precision_svd = algo_svd.test(not_seen_elems, test, verbose=False, not_seen_flag=True)
        predictions_nmf_baseline = algo_nmf_baseline.test(test, verbose=False)

        #precisions, recalls = self.precision_recall_at_k(predictions_precision_svd, 10, threshold=0.0)
        # Precision and recall can then be averaged over all users
        #precision_avg = sum(prec for prec in precisions.values()) / len(precisions)
        #recall_avg = sum(rec for rec in recalls.values()) / len(recalls)
        #print('Precision: ' + str(precision_avg) + ' Recall: ' + str(recall_avg) + ' RMSE: ' + str(
        #    rmse(predictions_svd, verbose=False)) + ' MAE: ' + str(mae(predictions_svd, verbose=False)))
        print('NMF BASELINE: ' + ' RMSE ' +
              str(rmse(predictions_nmf_baseline, verbose=False)) + ' MAE ' +
              str(mae(predictions_nmf_baseline, verbose=False)))

        return algo_nmf_baseline
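prepare_datasets is not shown in this example. A minimal sketch of what the call site implies, assuming tab-separated rating files (the exact return values are an assumption):

from surprise import Dataset, Reader

def prepare_datasets(train_file, test_file):
    reader = Reader(line_format='user item rating', sep='\t')  # assumed format
    train_dataset = Dataset.load_from_file(train_file, reader=reader)
    test_dataset = Dataset.load_from_file(test_file, reader=reader)
    train = train_dataset.build_full_trainset()
    test = test_dataset.build_full_trainset().build_testset()  # (uid, iid, r) tuples
    return train, test, train_dataset, test_dataset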
Example #4
def predict_NMF(userid):
    df = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)
    # note: ratings in MovieLens ratings_small.csv run 0.5-5, so this scale is
    # probably a mistake; Reader(rating_scale=(0.5, 5)) would match the data
    reader = Reader(rating_scale=(1, 30))

    # read the data from the DataFrame in the reader's format
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']],
                                reader=reader)

    # split into training and test sets: 75% of the samples train, 25% test
    trainset, testset = train_test_split(data, test_size=.25)

    # fit NMF
    algo = NMF()
    algo.fit(trainset)
    pred_nmf = algo.test(testset)
    top_nmf_n = get_top_n(pred_nmf, n=5)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    movie_titles = movie_titles.drop_duplicates()  # assign: the original call discarded the result

    title_list = []  # guard: stays empty if userid is absent from top_nmf_n
    for uid, user_ratings in top_nmf_n.items():
        if uid == userid:
            #print(uid, [iid for (iid, _) in user_ratings])
            title_list = [iid for (iid, _) in user_ratings]

    titles = movie_titles[movie_titles.movieId.isin(title_list)]
    print(titles[2:])
    return titles[2:]
Example #5
def recommendation_mf(userArray, numUsers, movieIds):

	ratings_dict = {'itemID': list(df_ratings.movie_id_ml) + list(numUsers*movieIds),
					'userID': list(df_ratings.user_id) + [max(df_ratings.user_id)+1+x for x in range(numUsers) for y in range(len(userArray[0]))],
					'rating': list(df_ratings.rating) + [item for sublist in userArray for item in sublist]
				}

	df = pd.DataFrame(ratings_dict)
	reader = Reader(rating_scale=(1, 5))
	data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
	trainset = data.build_full_trainset()

	nmf = NMF()
	nmf.fit(trainset)

	userIds = [trainset.to_inner_uid(max(df_ratings.user_id)+1+x) for x in range(numUsers)]

	mat = np.dot(nmf.pu, nmf.qi.T)  # full user x item score matrix from the NMF factors

	scores = hmean(mat[userIds, :], axis=0)  # harmonic mean favors items all users rate highly
	best_movies = scores.argsort()
	best_movies = best_movies[-9:][::-1]
	scores = scores[best_movies]
	movie_ind = [trainset.to_raw_iid(x) for x in best_movies]

	recommendation = list(zip(list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].title), 
					list(df_ML_movies[df_ML_movies.movie_id_ml.isin(movie_ind)].poster_url), 
					list(scores)))

	return recommendation
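A hypothetical call, assuming df_ratings and df_ML_movies are module-level MovieLens DataFrames; each row of userArray holds one new user's ratings for the movies listed in movieIds:

user_array = [[5, 3, 4], [2, 5, 1]]  # two new users, three ratings each
movie_ids = [1, 50, 260]             # hypothetical MovieLens movie ids
for title, poster_url, score in recommendation_mf(user_array, 2, movie_ids):
    print(title, score)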
Example #6
def do_nmf(data_raw, impute_params):
    reader = surprise.Reader(rating_scale=(1, 5))
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()

    algo = NMF(n_factors=impute_params["FACTORS"],
               n_epochs=impute_params["EPOCHS"],
               verbose=True)
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    predictions = pd.DataFrame(predictions)

    predictions.rename(columns={
        "uid": "User",
        "iid": "Movie",
        "est": "Prediction"
    },
                       inplace=True)
    predictions = predictions[["User", "Movie", "Prediction"]]

    data = pd.concat([data_raw, predictions], ignore_index=True)
    data = data.pivot(index="User", columns="Movie",
                      values="Prediction").to_numpy()
    return data
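A sketch of a call, assuming data_raw is a long-format DataFrame with User, Movie and Prediction columns; the parameter values are placeholders:

impute_params = {"FACTORS": 15, "EPOCHS": 50}
filled = do_nmf(data_raw, impute_params)  # dense users x movies matrix with gaps imputed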
Example #7
def main():
    f = open("Python/user_rated_movies.tsv", "r")
    user_ratings = []
    for line in f:
        inline = line.split('\t')
        rating = inline[2]
        mytuple = inline[0], inline[1], float(rating[:-1]), None
        user_ratings.append(mytuple)
    f.close()

    # data = Dataset.load_builtin(name=u'ml-1m')
    reader = Reader(line_format='user item rating', sep='\t')
    datain = pd.read_csv("ratings.tsv", sep="\t")
    data = Dataset.load_from_df(datain, reader=reader)
    for i in user_ratings:
        data.raw_ratings.append(i)

    movies = pd.read_csv("movies.tsv", sep="\t", header=None, low_memory=False)

    algo = NMF(n_factors=4, n_epochs=100, random_state=1)
    trainSet = data.build_full_trainset()
    algo.fit(trainSet)

    predictions = []
    # predict this user's rating for every movie (row[1] holds the movie id)
    for index, row in movies.iterrows():
        pred = algo.predict(user_ratings[0][0], row[1], r_ui=4)
        predictions.append(pred)

    sortpred = sorted(predictions, key=lambda pred: pred[3])
    sortpred = sortpred[-10:]

    for i in sortpred:
        print(i[1])
Example #8
    def run_process(self, all_ips_data, ip_16_data, misclassifications, queue):
        historical_item = generate_prefix_data(all_ips_data, ip_16_data,
                                               self.reference_end_time,
                                               self.half_life_duration)
        if len(historical_item) == 0:
            return
        # with fewer than 5 prefixes there is too little data to factorize
        if len(historical_item) < 5:
            for ip, bl_name_data in historical_item.items():
                queue.put(ip + ",0")
            return

        matrix_string = "userId,itemId,rating\n"
        all_blacklists = set()
        ip_order = set()
        for ip, bl_name_data in historical_item.items():
            ip_order.add(ip)
            for bl_name, score in bl_name_data.items():
                matrix_string = matrix_string + ip + "," + bl_name + "," + str(
                    score) + "\n"
                all_blacklists.add(bl_name)
        for ip in misclassifications:
            if ip in ip_order:
                matrix_string = matrix_string + ip + "," + "misclassifications,10" + "\n"

        matrix_string = StringIO(matrix_string)
        ratings = pd.read_csv(matrix_string)

        ratings_dict = {
            'itemID': list(ratings.itemId),
            'userID': list(ratings.userId),
            'rating': list(ratings.rating)
        }

        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 10.0))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        epochs = 100
        broken_flag = False
        while True:
            algo = NMF(n_epochs=epochs, n_factors=self.n_factors)
            try:
                res = model_selection.cross_validate(algo,
                                                     data,
                                                     measures=['RMSE'])
            except Exception:
                broken_flag = True
                break
            mean_rmse = sum(res["test_rmse"]) / len(res["test_rmse"])
            if mean_rmse <= 1:
                break
            epochs = epochs + 100
            if epochs >= self.epochs:
                break
        for ip in ip_order:
            prediction = algo.predict(ip, "misclassifications").est
            queue.put(ip + "," + str(round(prediction, 2)))
        return
Example #9
def colaborative_filtering_based_model(path, config, engine, df_valid_games):
    with open(path, 'r') as f:
        raw_strings = f.readlines()

    total_count = len(raw_strings)
    current_count = 0

    user_ratings = []
    scaler = MinMaxScaler((1, 5))

    for raw_string in raw_strings:
        user_id, user_inventory = list(json.loads(raw_string).items())[0]
        if user_inventory is not None:
            app_ids = [item['appid'] for item in user_inventory]
            app_scores = [item['playtime_forever'] for item in user_inventory]
            app_scores = scaler.fit_transform(np.log1p(app_scores).reshape(-1, 1))
            
            user_ratings_temp = [[user_id, app_ids[i], app_scores[i].item()] for i in range(len(app_ids))]
            user_ratings += user_ratings_temp

        show_work_status(1, total_count, current_count)
        current_count += 1

    user_item_ratings = pd.DataFrame(user_ratings)
    user_item_ratings.columns = ['user_id', 'item_id', 'rating']

    # Prediction part
    game_ids_set = set(df_valid_games.steam_appid)
    grouped_user_item_ratings = user_item_ratings.groupby('user_id')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(user_item_ratings[['user_id', 'item_id', 'rating']], reader)

    alg = NMF(n_factors=20)
    alg.fit(data.build_full_trainset())

    total_count = len(user_item_ratings.user_id.unique())
    current_count = 0
    dict_user_recommendations = {}
    for user in user_item_ratings.user_id.unique().tolist():
        temp = grouped_user_item_ratings.get_group(user)
        not_purchased_ids = game_ids_set - set([str(x) for x in temp.item_id])
        
        user_test_temp = [[user, not_purchased_id, 0] for not_purchased_id in not_purchased_ids]
        user_test_temp = pd.DataFrame(user_test_temp)
        user_test_temp.columns = ['user_id', 'item_id', 'rating']
        
        data = Dataset.load_from_df(user_test_temp[['user_id', 'item_id', 'rating']], reader)
        user_test = data.build_full_trainset().build_testset()
        results = alg.test(user_test)
        dict_user_recommendations.update({user: pd.DataFrame(results).sort_values('est', ascending=False).iloc[:10, 1].values.tolist()})
        
        show_work_status(1, total_count, current_count)
        current_count += 1

    df_cf_based_results = pd.DataFrame(dict_user_recommendations).T
    df_cf_based_results.index.name = 'user_id'
    df_cf_based_results.reset_index(inplace=True)
    df_cf_based_results.to_sql(config.mysql_user_like_table, engine, if_exists='replace')
Example #10
def trainingRatings(movies, users, ratings):
    ratings_dict = {"movies": movies, "users": users, "ratings": ratings}
    df = pd.DataFrame(ratings_dict)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[["users", "movies", "ratings"]], reader)
    trainingSet = data.build_full_trainset()
    algo = NMF(n_factors=100, n_epochs=100, reg_pu=0.01)
    algo.fit(trainingSet)
    recommendMoviesForUsers(movies, users, algo)
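recommendMoviesForUsers is not defined in the example; a minimal sketch consistent with the call site (printing each user's five highest-predicted movies is an assumption):

def recommendMoviesForUsers(movies, users, algo):
    for user in set(users):
        preds = [(movie, algo.predict(user, movie).est) for movie in set(movies)]
        preds.sort(key=lambda p: p[1], reverse=True)
        print(user, preds[:5])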
Example #11
    def initialize(self, data_filepath):
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()

        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()

        start_new_thread(self._train, ())  # _thread.start_new_thread requires an args tuple
Example #12
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    #Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    #Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    
    #Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
Example #13
def get_u_v(data_raw, params):
    reader = surprise.Reader(rating_scale=(1, 5))
    # The columns must correspond to user id, item id and ratings (in that order).
    dataset = surprise.Dataset.load_from_df(
        data_raw[["User", "Movie", "Prediction"]], reader)
    trainset = dataset.build_full_trainset()
    algo = NMF(n_factors=params["GLOBAL_NMF_K"],
               n_epochs=params["GLOBAL_NMF_EPOCHS"],
               verbose=False)
    algo.fit(trainset)

    U_red = algo.pu
    V_red = algo.qi
    logging.info("return from get_u_v")
    return (U_red, V_red)
Example #14
def check_for_args():
    args = sys.argv
    for arg in args:
        if (arg == 'SVD'):
            alg_list.append(SVD())
        elif (arg == 'SVDpp'):
            alg_list.append(SVDpp())
        elif (arg == 'SlopeOne'):
            alg_list.append(SlopeOne())
        elif (arg == 'NMF'):
            alg_list.append(NMF())
        elif (arg == 'NormalPredictor'):
            alg_list.append(NormalPredictor())
        elif (arg == 'KNNBaseline'):
            alg_list.append(KNNBaseline())
        elif (arg == 'KNNBasic'):
            alg_list.append(KNNBasic())
        elif (arg == 'KNNWithMeans'):
            alg_list.append(KNNWithMeans())
        elif (arg == 'KNNWithZScore'):
            alg_list.append(KNNWithZScore())
        elif (arg == 'BaselineOnly'):
            alg_list.append(BaselineOnly())
        elif (arg == 'CoClustering'):
            alg_list.append(CoClustering())

    return alg_list
Example #15
def EvaluateDifferentAlgorithms():
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_6months,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)

        print(
            pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
                'test_rmse'))
Example #16
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    for alg in algos:
        if alg == "KNNBasic":
            algo = KNNBasic()
        elif alg == "KNNWithZScore":
            algo = KNNWithZScore()
        elif alg == "SVD":
            algo = SVD()
        elif alg == "NMF":
            algo = NMF()
        elif alg == "SlopeOne":
            algo = SlopeOne()
        elif alg == "CoClustering":
            algo = CoClustering()

        # the three data origins differ only in the label forwarded to testing()
        if data_origin in ('netflix', 'small', '100k'):
            nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
                algo, ratings_dict, compressed_test_ratings_dict, data_origin)

        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % accuracy)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)
Example #17
def benchmark(data):
    performance = []
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum()
    ]
    for algorithm in algorithms:
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE', 'FCP'],
                                 cv=3,
                                 verbose=False)
        output = pd.DataFrame.from_dict(results).mean(axis=0)
        output = output.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        performance.append(output)
    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
Example #18
def select_model(user_review):
    user_review = data_prep()  # note: the argument is immediately overwritten
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        user_review[['user_id', 'business_id', 'stars']], reader)
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            KNNBasic(),
            KNNBaseline(),
            KNNWithMeans(),
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF()
    ]:
        # Perform cross validation
        print(algorithm)
        print('start ......')
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 cv=3,
                                 verbose=False)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm']))
        benchmark.append(tmp)
        print(benchmark)
Example #19
def nmf():
    print('NMF algorithm...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround so the file reads without errors (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:  # Python 2 input() evaluates the entry, hence the int comparison
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = NMF()

    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
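data.split, evaluate and print_perf were removed in Surprise 1.1; on a current version the ten-fold evaluation above would read roughly:

from surprise.model_selection import cross_validate

cross_validate(NMF(), data, measures=['RMSE', 'MAE'], cv=10, verbose=True)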
Example #20
def get_model(model_name):
    algo = None
    if 'KNN' in model_name:
        model_name = model_name.split('_')
        knn_model_name = model_name[0]
        user_based = False if len(
            model_name) > 1 and model_name[1] == 'I' else True
        dis_method = 'msd' if len(model_name) < 3 else model_name[2]
        k = 20 if len(model_name) < 4 else int(model_name[3])
        sim_options = {'user_based': user_based, 'name': dis_method}
        if knn_model_name == 'KNNBasic':
            algo = KNNBasic(sim_options=sim_options, k=k)
        elif knn_model_name == 'KNNWithMeans':
            algo = KNNWithMeans(sim_options=sim_options, k=k)
        elif knn_model_name == 'KNNWithZScore':
            algo = KNNWithZScore(sim_options=sim_options, k=k)
    elif 'SVDpp' in model_name or 'SVD' in model_name or 'NMF' in model_name:
        model_name = model_name.split('_')
        n_factors = 25 if len(model_name) == 1 else int(model_name[1])
        if model_name[0] == 'SVDpp':
            algo = SVDpp(n_factors=n_factors)
        elif model_name[0] == 'SVD':
            algo = SVD(n_factors=n_factors)
        elif model_name[0] == 'NMF':
            algo = NMF(n_factors=n_factors)
    return algo
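The underscore-encoded model names parse as algorithm name followed by options, for example:

algo = get_model('KNNBasic_I_pearson_40')  # item-based KNNBasic, pearson similarity, k=40
algo = get_model('SVD_50')                 # SVD with 50 factors
algo = get_model('NMF')                    # NMF with this function's default of 25 factors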
Example #21
    def model(self, alg_key):

        reader = Reader(rating_scale = (1, 5))

        data_result = Dataset.load_from_df(self.make_df()[['user_id', 'place_id', 'score']], reader)

        # split the data into 10 folds

        data_result.split(n_folds=10)

        # evaluation

        if alg_key.lower() == "svd":
            alg = SVD()
        elif alg_key.lower() == "knn":
            alg = KNNBasic()
        elif alg_key.lower() == "nmf":
            alg = NMF()

        evaluate(alg, data_result, measures=['RMSE', 'MAE'])

        # prediction
        # user_0	smallShop_5645	2
        test_user = '******'
        test_id = 'smallShop_7089'
        real_score = 4

        trainset = data_result.build_full_trainset()

        alg.train(trainset)  # train() was renamed fit() in Surprise 1.0.5+
        print(alg.predict(test_user, test_id, real_score))
Example #22
def crossvalidate(data):
    results = []
    for algorithm in [
            NormalPredictor(),
            KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
            KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
            BaselineOnly(),
            SVD(),
            SVDpp(),
            NMF(),
            SlopeOne(),
            CoClustering()
    ]:
        result = cross_validate(algorithm,
                                data,
                                measures=['RMSE'],
                                cv=5,
                                verbose=False)
        temp = pd.DataFrame.from_dict(result).mean(axis=0)
        temp = temp.append(
            pd.Series([str(algorithm).split(' ')[0].split(".")[-1]],
                      index=['Algorithm']))
        results.append(temp)
    rmse_values = pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
    return rmse_values
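similarity_measure is not defined in this example; a minimal sketch consistent with how it is called, turning a name and a user_based flag into a Surprise sim_options dict:

def similarity_measure(name, user_based):
    return {'name': name, 'user_based': bool(user_based)}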
Example #23
def q7():
    file_path = os.path.expanduser('restaurant_ratings.txt')
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    data.split(n_folds=3)

    algo = NMF()
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
Example #24
    def generate_algorithms(self, rating_data):
        """ Build the dictionary of algorithms to evaluate.

        Untuned and tuned algorithms are kept separate because tuning can take
        a very long time; keeping them apart makes it easy to comment out the
        tuning step when needed.

        Args:
            rating_data: the main data set
        Returns:
            a dictionary of algorithms; key: name of algo, val: algo object
        """
        algo = {}
        algo.update({'SVD': SVD()})
        algo.update({'PMF': SVD(biased=False)})
        algo.update({'SVD++': SVDpp()})
        algo.update({'NMF': NMF()})
        print('Generated algo object for SVD, PMF, SVD++, and NMF.')

        # generate tuned SVD algorithm
        param_grid_svd = {
            'n_factors': [130, 200],
            'n_epochs': [50, 60],
            'lr_all': [0.0015, 0.002],
            'reg_all': [0.02, 0.03]
        }
        best_params_svd = self.tune_and_find_param('SVD', SVD, rating_data,
                                                   param_grid_svd)

        # initiate tuned MF algos with tuned hyperparameters
        SVD_tuned = SVD(n_factors=best_params_svd['n_factors'],
                        n_epochs=best_params_svd['n_epochs'],
                        lr_all=best_params_svd['lr_all'])

        # append new algos to result dict
        algo.update({'SVD_tuned': SVD_tuned})

        # code for future use: tuning SVDpp, NMF
        #
        # param_grid_svdpp = {'n_factors': [20, 30], 'n_epochs': [15, 25], 'lr_all': [0.005, 0.0085]}
        # best_params_svdpp = self.tune_and_find_param('SVD++', SVDpp, rating_data, param_grid_svdpp)
        #
        # param_grid_nmf = {'n_factors': [50, 55], 'n_epochs': [45, 50], 'lr_bu': [0.02, 0.025], 'lr_bi': [0.02, 0.025]}
        # best_params_nmf = self.tune_and_find_param('NMF', NMF, rating_data, param_grid_nmf)

        # SVDpp_tuned = SVDpp(n_factors = best_params_svdpp['n_factors'],
        #                 n_epochs = best_params_svdpp['n_epochs'],
        #                 lr_all = best_params_svdpp['lr_all'])
        #
        # NMF_tuned = NMF(n_factors = best_params_nmf['n_factors'],
        #                 n_epochs = best_params_nmf['n_epochs'],
        #                 lr_bu = best_params_nmf['lr_bu'],
        #                 lr_bi = best_params_nmf['lr_bi'])
        # algo.update({'SVD++_tuned': SVDpp_tuned})
        # algo.update({'NMF_tuned': NMF_tuned})

        return algo
Example #25
 def __init__(self):
     super().__init__("nmf",
                      NMF,
                      param_grid={
                          'n_factors': [15, 20],
                          'n_epochs': [50, 70]
                      })
     best_params = super().tune()
     print(best_params)
     self.algo = NMF(n_factors=best_params['n_factors'],
                     n_epochs=best_params['n_epochs'])
Example #26
def get_model_old(model_name):
    algo = None
    if model_name == 'KNNBasic_U':
        sim_options = {'user_based': True}
        algo = KNNBasic(sim_options=sim_options, k=20)
    elif model_name == 'KNNBasic_I':
        sim_options = {'user_based': False}
        algo = KNNBasic(sim_options=sim_options, k=20)
        # algo = KNNBasic()
    elif model_name == 'KNNWithMeans_I':
        algo = KNNWithMeans(sim_options={'user_based': False}, k=20)
    elif model_name == 'KNNWithMeans_U':
        algo = KNNWithMeans(sim_options={'user_based': True}, k=20)
    elif model_name == 'KNNWithZScore_I':
        algo = KNNWithZScore(sim_options={'user_based': False}, k=20)
    elif model_name == 'KNNWithZScore_U':
        algo = KNNWithZScore(sim_options={'user_based': True}, k=20)
    elif model_name == 'SVDpp':
        algo = SVDpp()
    elif model_name == 'SVD':
        algo = SVD()
    elif model_name == 'NMF':
        algo = NMF()
    elif 'NMF_' in model_name:
        n_factors = int(model_name.split("_")[1])
        algo = NMF(n_factors=n_factors)
    elif 'SVDpp_' in model_name:
        n_factors = int(model_name.split("_")[1])
        algo = SVDpp(n_factors=n_factors)
    elif 'SVD_' in model_name:
        n_factors = int(model_name.split("_")[1])
        algo = SVD(n_factors=n_factors)
    elif 'KNNBasic_U_' in model_name:
        k = int(model_name.split("_")[-1])
        sim_options = {'user_based': True}
        algo = KNNBasic(sim_options=sim_options, k=k)
    elif 'KNNBasic_I_' in model_name:
        k = int(model_name.split("_")[-1])
        sim_options = {'user_based': False}
        algo = KNNBasic(sim_options=sim_options, k=k)
    return algo
Example #27
    def __init__(self, algo='knn_baseline', filepath=None):
        if not os.path.exists(filepath):
            raise FileNotFoundError("{} not exist".format(filepath))
        self.filepath = filepath
        if algo == 'nmf':
            self.algo = NMF()
            self.model_name = 'nmf'
        else:
            self.algo = KNNBaseline()
            self.model_name = 'knn_baseline'

        self.convertor = DataConvertHelper()
Example #28
 def nmf(self, namefile, uid, iid, rati, value_uid, value_iid):
     test_data = pd.read_csv('./container/' + namefile)
     dt = pd.DataFrame(test_data)
     # Retrieve the trainset.
     reader = Reader(rating_scale=(0, 100))
     data = Dataset.load_from_df(dt[[uid, iid, rati]], reader)
     trainset = data.build_full_trainset()
     algo = NMF()
     algo.fit(trainset)
     pred = algo.predict(int(value_uid),
                         int(value_iid),
                         r_ui=1,
                         verbose=True)
     #var_rmse = accuracy.rmse(pred)
     #return result to json
     jsondata = {}
     jsondata["uid"] = pred.uid
     jsondata["iid"] = pred.iid
     jsondata["rati"] = round(pred.est, 2)
     return jsondata
Example #29
def train():
    data = load_dataset()
    algo_svd = SVD()
    algo_nmf = NMF()

    print("Cross Validation procedure")
    kf = KFold(n_splits=KFOLD_NUM)
    for i, (trainset_cv, testset_cv) in enumerate(kf.split(data), start=1):
        print(f"===> Fold number {i}")
        # Save the first fold
        train_helper(algo_svd, "SVD", trainset_cv, testset_cv, i == 1)
        train_helper(algo_nmf, "NMF", trainset_cv, testset_cv, i == 1)
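train_helper is not shown; a minimal sketch of what the call implies, assuming the last argument controls whether the fitted model is dumped to disk (the path is hypothetical):

from surprise import accuracy, dump

def train_helper(algo, name, trainset, testset, save_model):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
    if save_model:
        dump.dump('./model_' + name + '.pkl', algo=algo)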
Example #30
def trainFinalModels(ratingsTrainDataset, ratingsTest, bestParamsNMF,
                     bestParamsKNN):
    ratingsTrainTrainset = ratingsTrainDataset.build_full_trainset()

    modelNMF = NMF(**bestParamsNMF)
    modelNMF.fit(ratingsTrainTrainset)
    saveModel(modelNMF, 'NMF')

    predictions = modelNMF.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('NMF', rmseValue, maeValue)

    modelKNN = KNNWithMeans(**bestParamsKNN)
    modelKNN.fit(ratingsTrainTrainset)
    saveModel(modelKNN, 'KNN')

    predictions = modelKNN.test(ratingsTest)
    rmseValue = rmse(predictions)
    maeValue = mae(predictions)
    saveFinalResult('KNN', rmseValue, maeValue)
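saveModel and saveFinalResult are assumed helpers; minimal sketches using Surprise's dump module and a plain CSV (file names are hypothetical):

from surprise import dump

def saveModel(model, name):
    dump.dump('./' + name + '.model', algo=model)

def saveFinalResult(name, rmseValue, maeValue):
    with open('final_results.csv', 'a') as f:
        f.write('{},{:.4f},{:.4f}\n'.format(name, rmseValue, maeValue))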