def get_recommendations(builds, clean=cleaned, svd=None, encoder=False):
    """Return recommendations for the given builds.

    Parameters
    ----------
    builds : index labels selecting the query rows within ``clean``.
    clean : DataFrame of candidate builds. Defaults to the module-level
        ``cleaned`` frame (note: the default is bound once at import time).
    svd : optional int. When truthy, preprocess with TruncatedSVD using at
        most 40 components (and pre-scale the regression columns) instead
        of the default StandardScaler.
    encoder : currently unused; reserved for the autoencoder path below.

    Returns
    -------
    Whatever ``Recommender.recommend`` yields for the selected builds.
    """
    clean = clean.drop_duplicates()
    print(clean)  # NOTE(review): looks like leftover debug output — consider removing

    preprocessors = [StandardScaler()]
    # `if svd:` covers both None and 0/False, so the extra None check was redundant.
    if svd:
        # Cap the SVD component count at 40.
        preprocessors = [TruncatedSVD(min(40, int(svd)))]
        # assumes reg_cols / scaler_reg are module-level globals — TODO confirm
        clean[reg_cols] = scaler_reg.transform(clean[reg_cols])

    # TODO(encoder): autoencoder path — drop/scale, predict with
    # autoencoder_model, unscale, then rename and combine columns.

    recommender = Recommender(
        drop_columns=[
            'Date Published', 'price_build', 'number_ratings', 'avg_rating',
            'storage_price'
        ],
        preprocessors=preprocessors,
        # feature_weights = {'Core Clock' : 10},
    )
    recommender.fit(clean)
    return recommender.recommend(clean.loc[builds])
Ejemplo n.º 2
0
def train():
    """Train a Recommender on the MovieLens 100k files and persist it."""

    def _load(path, sep, columns):
        # All MovieLens 100k files ship in latin-1 encoding.
        return pd.read_csv(path, sep=sep, names=columns, encoding='latin-1')

    print('reading u.user')
    users = _load('/volumes/data/u.user', '|',
                  ['user_id', 'age', 'sex', 'occupation', 'zip_code'])

    print('reading u.data')
    ratings = _load('/volumes/data/u.data', '\t',
                    ['user_id', 'movie_id', 'rating', 'unix_timestamp'])

    print('reading u.item')
    item_columns = [
        'movie_id', 'movie title', 'release date', 'video release date',
        'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]
    items = _load('/volumes/data/u.item', '|', item_columns)

    # Join everything, then keep only the triplets the recommender needs.
    print('merging files')
    movies100k_df = pd.merge(pd.merge(ratings, users),
                             items)[['user_id', 'movie_id', 'rating']]

    this_reco = Recommender()
    print('training recommender')
    this_reco.fit(movies100k_df,
                  user_id='user_id',
                  item_id='movie_id',
                  ratings='rating')

    print('saving recommender model')
    joblib.dump(this_reco, "/volumes/data/recommender-model.pkl")
    print('done')
Ejemplo n.º 3
0
    def train(self):
        """Build co-occurrence training data and fit the Recommender.

        Reads four inputs (viewed-together, bought-together, the full
        product catalogue, and the price list), explodes the view/buy
        groups into attribute-pair tuples, fits the Recommender on all of
        them, and persists the result via ``self.write_data``.
        """
        # Load the four raw datasets; read_data resolves each configured path.
        viewed_together_data = self.read_data(
            self.data_paths[self.config.VIEWED_TOGETHER])
        bought_together_data = self.read_data(
            self.data_paths[self.config.BOUGHT_TOGETHER])
        all_products_data = self.read_data(
            self.data_paths[self.config.ALL_PRODUCTS])
        price_list_data = self.read_data(
            self.data_paths[self.config.PRICE_LIST])
        """getting some columns in lower case"""
        # Normalise the catalogue columns (lower-casing per the note above).
        transformed_all_products_data = uniform_data(all_products_data,
                                                     self.product_attributes)
        """explode the lists into tuples of combinations per session ID for views and per user in bought"""
        print(
            "For the view Dataframe breaking lists of brands, product categories, product_types "
            "into permutations of brands, product categories, product_types as a list of tuples"
        )
        # Views are grouped per session (SID_IDX).
        viewed_together_cols, group_by_col = [
            'SID_IDX', 'CONFIG_ID', 'PRODUCT_CATEGORY', 'PRODUCT_TYPE', 'BRAND'
        ], 'SID_IDX'
        # transform_data returns one tuple list per attribute, in this fixed
        # order: brand, product category, product type, config.
        (tuple_list_viewed_brand, tuple_list_viewed_product_category,
         tuple_list_viewed_product_type,
         tuple_list_viewed_config) = self.transform_data(
             viewed_together_data, self.product_attributes,
             viewed_together_cols, group_by_col)

        print(
            "For the bought Dataframe breaking lists of brands, product categories, product_types "
            "into permutations of brands, product categories, product_types as a list of tuples"
        )

        # Purchases are grouped per customer (CUSTOMER_IDX).
        bought_together_cols, group_by_col = [
            'CUSTOMER_IDX', 'CONFIG_ID', 'PRODUCT_CATEGORY', 'PRODUCT_TYPE',
            'BRAND'
        ], 'CUSTOMER_IDX'

        (tuple_list_bought_brand, tuple_list_bought_product_category,
         tuple_list_bought_product_type,
         tuple_list_bought_config) = self.transform_data(
             bought_together_data, self.product_attributes,
             bought_together_cols, group_by_col)

        # Fit on all viewed/bought tuple lists plus catalogue and prices.
        # NOTE(review): fit's second return value is discarded here — confirm
        # it is not needed downstream.
        recommender = Recommender()
        trained_data, _ = recommender.fit(
            tuple_list_viewed_brand, tuple_list_bought_brand,
            tuple_list_viewed_product_category,
            tuple_list_bought_product_category, tuple_list_viewed_product_type,
            tuple_list_bought_product_type, tuple_list_viewed_config,
            tuple_list_bought_config, transformed_all_products_data,
            price_list_data)
        self.write_data(trained_data)
Ejemplo n.º 4
0
def eval_model(ratings_df):
    """Split the ratings 50/50, fit the Recommender, and report RMSE / NDCG10."""
    train_df, test_df = ratings_df.randomSplit(weights=[0.5, 0.5])

    print_ratings_counts(train_df, 'Train')
    print_ratings_counts(test_df, 'Test')

    recommender = Recommender(
        useALS=True,
        useBias=True,
        lambda_1=0.5,
        lambda_2=0.5,
        lambda_3=0,
        userCol='user',
        itemCol='item',
        ratingCol='rating',
        rank=128,
        regParam=0.16,
        maxIter=10,
        nonnegative=False
    )

    rmse_evaluator = RegressionEvaluator(
        metricName="rmse", labelCol="rating", predictionCol="prediction")
    ndcg10_evaluator = NDCG10Evaluator(spark)

    overall_start = time.monotonic()
    step_start = time.monotonic()
    model = recommender.fit(train_df)
    print('Fit done in {} seconds.'.format(time.monotonic() - step_start))

    predictions_train = model.transform(train_df)

    step_start = time.monotonic()
    predictions_test = model.transform(test_df)
    print('Predictions done in {} seconds.'
        .format(time.monotonic() - step_start))
    print('All done in {} seconds.'.format(time.monotonic() - overall_start))

    # Spot-check a handful of individual test predictions.
    for row in predictions_test.head(30):
        print(row)

    print_avg_predictions(predictions_train, 'Train')
    print_avg_predictions(predictions_test, 'Test')

    rmse_train = rmse_evaluator.evaluate(predictions_train)
    rmse_test = rmse_evaluator.evaluate(predictions_test)
    print("Train RMSE: {}".format(rmse_train))
    print("Test RMSE: {}".format(rmse_test))

    ndcg10_train = ndcg10_evaluator.evaluate(predictions_train)
    ndcg10_test = ndcg10_evaluator.evaluate(predictions_test)
    print("Train NDCG10: {}".format(ndcg10_train))
    print("Test NDCG10: {}".format(ndcg10_test))
Ejemplo n.º 5
0
def _sweep_scores(param_name, param_key, values, fixed_params,
                  train_df, val_df, evaluator,
                  baseline_score_train, baseline_score_val):
    """Fit one Recommender per value of a single hyperparameter.

    For each value, ``fixed_params`` is copied and ``param_key`` is set to
    the value; the resulting estimator is fit on ``train_df`` and scored on
    both frames with ``evaluator``. Per-step progress and diffs against the
    baseline scores are printed exactly as the original inline loops did.

    Returns (train_scores, val_scores) as numpy arrays in input order.
    """
    scores_train = []
    scores_val = []

    start_time = time.monotonic()

    for value in values:
        step_start_time = time.monotonic()

        params = dict(fixed_params)
        params[param_key] = value
        model = Recommender(**params).fit(train_df)

        scores_train.append(evaluator.evaluate(model.transform(train_df)))
        scores_val.append(evaluator.evaluate(model.transform(val_df)))

        print('{}: {} train score: {} val score: {}'
            .format(
                param_name,
                value,
                scores_train[-1],
                scores_val[-1]
            )
        )

        print('{}: {} train diff: {} val diff: {} ({} seconds)\n'
            .format(
                param_name,
                value,
                scores_train[-1] - baseline_score_train,
                scores_val[-1] - baseline_score_val,
                time.monotonic() - step_start_time
            )
        )

    print('Done in {} seconds'
        .format(time.monotonic() - start_time))

    return np.array(scores_train), np.array(scores_val)


def _plot_model_panel(ax, xs, scores_train, scores_val,
                      naive_score_train, naive_score_val,
                      baseline_score_train, baseline_score_val,
                      title, xlabel, ylabel):
    """Draw one score-vs-hyperparameter panel with naive/baseline reference lines."""
    ax.axhline(y=naive_score_train, label='Train Naive',
        color='green', alpha=0.5)
    ax.axhline(y=baseline_score_train, label='Train Baseline',
        color='purple', alpha=0.5)
    ax.plot(xs, scores_train, label='Train Model', alpha=0.5)
    ax.axhline(y=naive_score_val, label='Validation Naive',
        color='black', alpha=0.5)
    ax.axhline(y=baseline_score_val, label='Validation Baseline',
        color='orange', alpha=0.5)
    ax.plot(xs, scores_val, label='Validation Model', alpha=0.5)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend()


def _plot_diff_panel(ax, xs, diff_train, diff_val, title, xlabel, ylabel):
    """Draw one (score - baseline)-vs-hyperparameter panel."""
    ax.plot(xs, diff_train, label='Train Diff', alpha=0.5)
    ax.plot(xs, diff_val, label='Validation Diff', alpha=0.5)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend()


def plot_scores(train_df):
    """Sweep rank and regParam for the Recommender and plot the scores.

    Splits ``train_df`` 50/50 into train/validation, computes naive and
    baseline reference scores, sweeps `rank` (at the best regParam so far)
    and then `regParam` (at the best rank so far), prints per-step results,
    and renders a 2x2 grid of score and score-minus-baseline plots.
    """
    best_rank_so_far = 250
    best_regParam_so_far = 0.001
    lambda_1 = 2
    lambda_2 = 2
    lambda_3 = 0.0
    nonnegative = False
    maxIter = 10
    useALS = True
    useBias = True
    implicitPrefs = False

    # eval_name = 'NDCG10'
    # evaluator = NDCG10Evaluator(spark)

    # eval_name = 'NDCG'
    # evaluator = NDCGEvaluator(spark)

    # eval_name = 'TopQuantileEvaluator'
    # evaluator = TopQuantileEvaluator(spark)

    eval_name = 'RMSE'
    evaluator = RegressionEvaluator(
        metricName="rmse", labelCol="rating", predictionCol="prediction")

    # Echo the full configuration before the (long) sweeps start.
    print()
    print('best_rank_so_far: {}'.format(best_rank_so_far))
    print('best_regParam_so_far: {}'.format(best_regParam_so_far))
    print('lambda_1: {}'.format(lambda_1))
    print('lambda_2: {}'.format(lambda_2))
    print('lambda_3: {}'.format(lambda_3))
    print('nonnegative: {}'.format(nonnegative))
    print('maxIter: {}'.format(maxIter))
    print('useALS: {}'.format(useALS))
    print('useBias: {}'.format(useBias))
    print('implicitPrefs: {}'.format(implicitPrefs))
    print('eval_name: {}'.format(eval_name))
    print()

    train_df, val_df = train_df.randomSplit(weights=[0.5, 0.5])

    print_ratings_counts(train_df, 'plot_scores Train')
    print_ratings_counts(val_df, 'plot_scores Validation')

    # First get baseline scores with ALS turned off
    (   naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val
    ) = (
        get_baseline_scores(
            train_df, val_df, evaluator, eval_name)
    )

    # Parameters shared by every estimator in both sweeps.
    fixed_params = dict(
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        lambda_3=lambda_3,
        useALS=useALS,
        useBias=useBias,
        userCol='user',
        itemCol='item',
        ratingCol='rating',
        maxIter=maxIter,
        nonnegative=nonnegative,
        implicitPrefs=implicitPrefs,
    )

    # --- Sweep rank at the best regParam found so far -------------------
    ranks = [1, 2, 5, 10, 25, 50, 100, 250]
    rank_scores_train, rank_scores_val = _sweep_scores(
        'rank', 'rank', ranks,
        dict(fixed_params, regParam=best_regParam_so_far),
        train_df, val_df, evaluator,
        baseline_score_train, baseline_score_val)

    # argmin assumes lower-is-better (RMSE); flip to argmax for NDCG-style
    # evaluators if those are re-enabled above.
    best_rank_index = np.argmin(rank_scores_val)

    print('Ranks:')
    print(ranks)
    print('Train score:')
    print(rank_scores_train)
    print('Validation score:')
    print(rank_scores_val)
    print('Train score - Baseline:')
    print(rank_scores_train - baseline_score_train)
    print('Validation score - baseline:')
    print(rank_scores_val - baseline_score_val)
    print('Best Rank: {}'.format(ranks[best_rank_index]))

    # --- Sweep regParam at the best rank found so far -------------------
    regParams = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.50, 1]
    regParam_scores_train, regParam_scores_val = _sweep_scores(
        'regParam', 'regParam', regParams,
        dict(fixed_params, rank=best_rank_so_far),
        train_df, val_df, evaluator,
        baseline_score_train, baseline_score_val)

    best_regParam_index = np.argmin(regParam_scores_val)

    print('RegParams:')
    print(regParams)
    print('Train score:')
    print(regParam_scores_train)
    print('Validation score:')
    print(regParam_scores_val)
    print('Train score - Baseline:')
    print(regParam_scores_train - baseline_score_train)
    print('Validation score - Baseline:')
    print(regParam_scores_val - baseline_score_val)
    print('Best RegParam: {}'.format(regParams[best_regParam_index]))

    # --- Plot: scores (top row) and diffs vs. baseline (bottom row) -----
    fig, axes = plt.subplots(2, 2, figsize=(15, 9))
    flat_axes = axes.flatten()

    _plot_model_panel(
        flat_axes[0], ranks, rank_scores_train, rank_scores_val,
        naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val,
        '{} vs. Rank (regParam={})'
            .format(eval_name, best_regParam_so_far),
        'Rank', eval_name)

    _plot_model_panel(
        flat_axes[1], regParams, regParam_scores_train, regParam_scores_val,
        naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val,
        '{} vs. regParam (Rank={})'
            .format(eval_name, best_rank_so_far),
        'regParam', eval_name)

    _plot_diff_panel(
        flat_axes[2], ranks,
        rank_scores_train - baseline_score_train,
        rank_scores_val - baseline_score_val,
        '{} - Baseline vs. Rank (regParam={})'
            .format(eval_name, best_regParam_so_far),
        'Rank', eval_name)

    _plot_diff_panel(
        flat_axes[3], regParams,
        regParam_scores_train - baseline_score_train,
        regParam_scores_val - baseline_score_val,
        '{} - Baseline vs. regParam (Rank={})'
            .format(eval_name, best_rank_so_far),
        'regParam', eval_name)

    plt.tight_layout()
    plt.show()
Ejemplo n.º 6
0
def get_baseline_scores(train_df, val_df, evaluator, eval_name):
    """Score two reference models so sweeps have something to beat.

    Naive model: predictions drawn from a normal distribution centered on
    the training mean rating and scaled by the training standard deviation.
    Baseline model: a bias-only Recommender (``useALS=False``).

    Returns
    -------
    (naive_score_train, naive_score_val,
     baseline_score_train, baseline_score_val)
    """
    stats_rating_df = (
        train_df
        .agg(
            F.avg('rating').alias('avg_rating'),
            F.stddev_samp('rating').alias('stddev_rating')
        )
    )

    stats_row = stats_rating_df.head()

    print('[plot_scores Train] Avg: {}'.format(stats_row[0]))
    print('[plot_scores Train] Std Dev: {}'.format(stats_row[1]))

    def _naive_predictions(ratings_df):
        # Random normal rating centered on the training average and scaled
        # with the training standard deviation (same recipe for both splits).
        return (
            ratings_df
            .crossJoin(stats_rating_df)
            .withColumn(
                'prediction',
                F.col('avg_rating') + F.randn() * F.col('stddev_rating')
            )
            .select(
                'user',
                'item',
                'rating',
                'prediction'
            )
        )

    naive_score_train = evaluator.evaluate(_naive_predictions(train_df))
    naive_score_val = evaluator.evaluate(_naive_predictions(val_df))

    print('Train Naive {} score: {}'.format(eval_name, naive_score_train))
    print('Validation Naive {} score: {}'.format(eval_name, naive_score_val))

    # Bias-only baseline: ALS disabled, no regularization.
    estimator = Recommender(
        lambda_1=0.0,
        lambda_2=0.0,
        lambda_3=0.0,
        useALS=False,
        useBias=True,
        userCol='user',
        itemCol='item',
        ratingCol='rating'
    )

    model = estimator.fit(train_df)
    baseline_score_train = evaluator.evaluate(model.transform(train_df))
    baseline_score_val = evaluator.evaluate(model.transform(val_df))

    print('Train Baseline {} score: {}'.format(eval_name, baseline_score_train))
    print('Validation Baseline {} score: {}'.format(eval_name, baseline_score_val))

    return (
        naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val
    )
Ejemplo n.º 7
0
def train_and_save_model_data(ratings_df):
    """Fit the final Recommender and persist its factors/bias tables.

    Trains on the full ``ratings_df`` with the tuned hyperparameters and
    writes item factors, rating stats, user/item biases, and residual
    stats as gzip parquet under ../data/.
    """
    lambda_1 = 0.001
    lambda_2 = 0.001
    lambda_3 = 0.0
    useALS = True
    useBias = True
    rank = 158
    regParam = 0.01
    maxIter = 10
    nonnegative = False
    implicitPrefs = False

    estimator = Recommender(
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        # NOTE(review): lambda_3 was defined but never passed before; every
        # other Recommender call site in this module passes it explicitly.
        lambda_3=lambda_3,
        useALS=useALS,
        useBias=useBias,
        userCol='user',
        itemCol='item',
        ratingCol='rating',
        rank=rank,
        regParam=regParam,
        maxIter=maxIter,
        nonnegative=nonnegative,
        implicitPrefs=implicitPrefs
    )

    model = estimator.fit(ratings_df)

    # Persist every model table the serving layer needs, one parquet each.
    for frame, path in (
        (model.itemFactors, '../data/item_factors'),
        (model.rating_stats_df, '../data/rating_stats'),
        (model.user_bias_df, '../data/user_bias'),
        (model.item_bias_df, '../data/item_bias'),
        (model.residual_stats_df, '../data/residual_stats'),
    ):
        frame.write.parquet(
            path=path,
            mode='overwrite',
            compression='gzip'
        )
    elif rec_type == 'movie':

        movie_names = rec.make_recommendations(_id, id_type)

        print('\nTop Recommendations for movie id {} are:'.format(_id))
        print(movie_names, end='\n\n')


if __name__ == '__main__':
    # instantiate recommender
    rec = Recommender()

    # fit recommender
    rec.fit(reviews_pth='data/train_data.csv',
            movies_pth='data/movies_clean.csv',
            learning_rate=.01,
            iterators=1)

    # make recommendations
    # 'user' or 'movie'

    _id = 66
    id_type = 'user'
    """
    _id = 1675434
    id_type = 'movie'"""

    run(id_type)
    ''' 
    More Examples:
    print(rec.make_recommendations(8, 'user'))  # user in the dataset