def get_recommendations(builds, clean=cleaned, svd=None, encoder=False):
    clean = clean.drop_duplicates()
    print(clean)

    preprocessors = [StandardScaler()]
    if svd is not None and svd:
        svd = int(svd)
        svd = min(40, svd)
        preprocessors = [TruncatedSVD(svd)]

    clean[reg_cols] = scaler_reg.transform(clean[reg_cols])

    # if encoder is not None and encoder:
    #     # drop
    #     # scale
    #     predict = autoencoder_model.predict(clean.loc[builds])
    #     # unscale
    #     # rename and combine columns

    recommender = Recommender(
        drop_columns=[
            'Date Published', 'price_build', 'number_ratings',
            'avg_rating', 'storage_price'
        ],
        preprocessors=preprocessors,
        # feature_weights={'Core Clock': 10},
    )
    recommender.fit(clean)
    return recommender.recommend(clean.loc[builds])
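# A minimal usage sketch for get_recommendations, assuming `cleaned` is the
# module-level DataFrame of build features (indexed by build id) that the
# default `clean` argument refers to. The build ids and the svd value below
# are purely illustrative.
if __name__ == '__main__':
    example_builds = cleaned.index[:3].tolist()  # hypothetical build ids

    # Default pipeline: StandardScaler preprocessing.
    print(get_recommendations(example_builds))

    # Optional: reduce the feature space with TruncatedSVD (capped at 40 components).
    print(get_recommendations(example_builds, svd=20))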
def train():
    print('reading u.user')
    u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
    users = pd.read_csv('/volumes/data/u.user', sep='|', names=u_cols,
                        encoding='latin-1')

    # Reading ratings file:
    print('reading u.data')
    r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
    ratings = pd.read_csv('/volumes/data/u.data', sep='\t', names=r_cols,
                          encoding='latin-1')

    # Reading items file:
    print('reading u.item')
    i_cols = [
        'movie_id', 'movie title', 'release date', 'video release date',
        'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]
    items = pd.read_csv('/volumes/data/u.item', sep='|', names=i_cols,
                        encoding='latin-1')

    print('merging files')
    movies100k_df = pd.merge(pd.merge(ratings, users), items)[
        ['user_id', 'movie_id', 'rating']]

    this_reco = Recommender()
    print('training recommender')
    this_reco.fit(movies100k_df, user_id='user_id', item_id='movie_id',
                  ratings='rating')

    print('saving recommender model')
    joblib.dump(this_reco, '/volumes/data/recommender-model.pkl')
    print('done')
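# A minimal sketch of reloading the model saved by train(), assuming the same
# /volumes/data mount is available. joblib.load simply restores the pickled
# Recommender; how recommendations are then generated depends on the
# Recommender class's own API, which is not shown here.
def load_model(path='/volumes/data/recommender-model.pkl'):
    print('loading recommender model')
    return joblib.load(path)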
def train(self):
    viewed_together_data = self.read_data(
        self.data_paths[self.config.VIEWED_TOGETHER])
    bought_together_data = self.read_data(
        self.data_paths[self.config.BOUGHT_TOGETHER])
    all_products_data = self.read_data(
        self.data_paths[self.config.ALL_PRODUCTS])
    price_list_data = self.read_data(
        self.data_paths[self.config.PRICE_LIST])

    # Get some columns in lower case.
    transformed_all_products_data = uniform_data(all_products_data,
                                                 self.product_attributes)

    # Explode the lists into tuples of combinations per session ID for views
    # and per customer for purchases.
    print(
        "For the view Dataframe breaking lists of brands, product categories, product_types "
        "into permutations of brands, product categories, product_types as a list of tuples"
    )
    viewed_together_cols, group_by_col = [
        'SID_IDX', 'CONFIG_ID', 'PRODUCT_CATEGORY', 'PRODUCT_TYPE', 'BRAND'
    ], 'SID_IDX'
    (tuple_list_viewed_brand, tuple_list_viewed_product_category,
     tuple_list_viewed_product_type,
     tuple_list_viewed_config) = self.transform_data(
         viewed_together_data, self.product_attributes, viewed_together_cols,
         group_by_col)

    print(
        "For the bought Dataframe breaking lists of brands, product categories, product_types "
        "into permutations of brands, product categories, product_types as a list of tuples"
    )
    bought_together_cols, group_by_col = [
        'CUSTOMER_IDX', 'CONFIG_ID', 'PRODUCT_CATEGORY', 'PRODUCT_TYPE', 'BRAND'
    ], 'CUSTOMER_IDX'
    (tuple_list_bought_brand, tuple_list_bought_product_category,
     tuple_list_bought_product_type,
     tuple_list_bought_config) = self.transform_data(
         bought_together_data, self.product_attributes, bought_together_cols,
         group_by_col)

    recommender = Recommender()
    trained_data, _ = recommender.fit(
        tuple_list_viewed_brand, tuple_list_bought_brand,
        tuple_list_viewed_product_category, tuple_list_bought_product_category,
        tuple_list_viewed_product_type, tuple_list_bought_product_type,
        tuple_list_viewed_config, tuple_list_bought_config,
        transformed_all_products_data, price_list_data)

    self.write_data(trained_data)
def eval_model(ratings_df):
    # Randomly split data into train and test datasets
    train_df, test_df = ratings_df.randomSplit(weights=[0.5, 0.5])
    print_ratings_counts(train_df, 'Train')
    print_ratings_counts(test_df, 'Test')

    estimator = Recommender(
        useALS=True,
        useBias=True,
        lambda_1=0.5,
        lambda_2=0.5,
        lambda_3=0,
        userCol='user',
        itemCol='item',
        ratingCol='rating',
        rank=128,
        regParam=0.16,
        maxIter=10,
        nonnegative=False
    )

    # estimator = ALS(
    #     userCol='user',
    #     itemCol='item',
    #     ratingCol='rating',
    #     rank=2,
    #     regParam=0.7,
    #     maxIter=5,
    #     nonnegative=True,
    #     coldStartStrategy='drop'
    # )

    evaluator_rmse = RegressionEvaluator(
        metricName="rmse", labelCol="rating", predictionCol="prediction")
    evaluator_ndcg10 = NDCG10Evaluator(spark)

    start_time = time.monotonic()
    step_start_time = time.monotonic()

    model = estimator.fit(train_df)
    print('Fit done in {} seconds.'
          .format(time.monotonic() - step_start_time))

    train_predictions_df = model.transform(train_df)

    step_start_time = time.monotonic()
    test_predictions_df = model.transform(test_df)
    print('Predictions done in {} seconds.'
          .format(time.monotonic() - step_start_time))
    print('All done in {} seconds.'.format(time.monotonic() - start_time))

    # print(predictions_df.printSchema())
    for row in test_predictions_df.head(30):
        print(row)

    print_avg_predictions(train_predictions_df, 'Train')
    print_avg_predictions(test_predictions_df, 'Test')

    train_rmse = evaluator_rmse.evaluate(train_predictions_df)
    test_rmse = evaluator_rmse.evaluate(test_predictions_df)
    print("Train RMSE: {}".format(train_rmse))
    print("Test RMSE: {}".format(test_rmse))

    train_ndcg10 = evaluator_ndcg10.evaluate(train_predictions_df)
    test_ndcg10 = evaluator_ndcg10.evaluate(test_predictions_df)
    print("Train NDCG10: {}".format(train_ndcg10))
    print("Test NDCG10: {}".format(test_ndcg10))
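# print_ratings_counts and print_avg_predictions are called above but not
# defined in this section. A minimal sketch of what such helpers might look
# like, assuming the standard 'user', 'item', 'rating' and 'prediction'
# columns; the actual implementations may differ.
def print_ratings_counts(ratings_df, label):
    # Number of ratings, distinct users, and distinct items in the split.
    print('[{}] ratings: {} users: {} items: {}'.format(
        label,
        ratings_df.count(),
        ratings_df.select('user').distinct().count(),
        ratings_df.select('item').distinct().count()))


def print_avg_predictions(predictions_df, label):
    # Average actual rating vs. average predicted rating for the split.
    row = predictions_df.agg(
        F.avg('rating').alias('avg_rating'),
        F.avg('prediction').alias('avg_prediction')).head()
    print('[{}] avg rating: {} avg prediction: {}'.format(
        label, row['avg_rating'], row['avg_prediction']))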
def plot_scores(train_df):
    best_rank_so_far = 250
    best_regParam_so_far = 0.001
    lambda_1 = 2
    lambda_2 = 2
    lambda_3 = 0.0
    nonnegative = False
    maxIter = 10
    useALS = True
    useBias = True
    implicitPrefs = False

    # eval_name = 'NDCG10'
    # evaluator = NDCG10Evaluator(spark)
    # eval_name = 'NDCG'
    # evaluator = NDCGEvaluator(spark)
    # eval_name = 'TopQuantileEvaluator'
    # evaluator = TopQuantileEvaluator(spark)
    eval_name = 'RMSE'
    evaluator = RegressionEvaluator(
        metricName="rmse", labelCol="rating", predictionCol="prediction")

    print()
    print('best_rank_so_far: {}'.format(best_rank_so_far))
    print('best_regParam_so_far: {}'.format(best_regParam_so_far))
    print('lambda_1: {}'.format(lambda_1))
    print('lambda_2: {}'.format(lambda_2))
    print('lambda_3: {}'.format(lambda_3))
    print('nonnegative: {}'.format(nonnegative))
    print('maxIter: {}'.format(maxIter))
    print('useALS: {}'.format(useALS))
    print('useBias: {}'.format(useBias))
    print('implicitPrefs: {}'.format(implicitPrefs))
    print('eval_name: {}'.format(eval_name))
    print()

    train_df, val_df = train_df.randomSplit(weights=[0.5, 0.5])
    print_ratings_counts(train_df, 'plot_scores Train')
    print_ratings_counts(val_df, 'plot_scores Validation')

    # First get baseline scores with ALS turned off
    (naive_score_train, naive_score_val,
     baseline_score_train, baseline_score_val) = get_baseline_scores(
        train_df, val_df, evaluator, eval_name)

    ranks = [1, 2, 5, 10, 25, 50, 100, 250]
    rank_scores_train = []
    rank_scores_val = []

    start_time = time.monotonic()
    for rank in ranks:
        step_start_time = time.monotonic()
        estimator = Recommender(
            lambda_1=lambda_1,
            lambda_2=lambda_2,
            lambda_3=lambda_3,
            useALS=useALS,
            useBias=useBias,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=rank,
            regParam=best_regParam_so_far,
            maxIter=maxIter,
            nonnegative=nonnegative,
            implicitPrefs=implicitPrefs
        )

        model = estimator.fit(train_df)
        train_predictions_df = model.transform(train_df)
        val_predictions_df = model.transform(val_df)

        # print('train_predictions_df')
        # train_predictions_df.show()
        # print('val_predictions_df')
        # val_predictions_df.show()
        # exit()

        rank_scores_train.append(evaluator.evaluate(train_predictions_df))
        rank_scores_val.append(evaluator.evaluate(val_predictions_df))

        print('rank: {} train score: {} val score: {}'
              .format(rank, rank_scores_train[-1], rank_scores_val[-1]))
        print('rank: {} train diff: {} val diff: {} ({} seconds)\n'
              .format(
                  rank,
                  rank_scores_train[-1] - baseline_score_train,
                  rank_scores_val[-1] - baseline_score_val,
                  time.monotonic() - step_start_time))

    print('Done in {} seconds'.format(time.monotonic() - start_time))

    rank_scores_train = np.array(rank_scores_train)
    rank_scores_val = np.array(rank_scores_val)
    best_rank_index = np.argmin(rank_scores_val)

    print('Ranks:')
    print(ranks)
    print('Train score:')
    print(rank_scores_train)
    print('Validation score:')
    print(rank_scores_val)
    print('Train score - Baseline:')
    print(rank_scores_train - baseline_score_train)
    print('Validation score - Baseline:')
    print(rank_scores_val - baseline_score_val)
    print('Best Rank: {}'.format(ranks[best_rank_index]))

    regParams = [0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.50, 1]
    regParam_scores_train = []
    regParam_scores_val = []

    start_time = time.monotonic()
    for regParam in regParams:
        step_start_time = time.monotonic()
        estimator = Recommender(
            lambda_1=lambda_1,
            lambda_2=lambda_2,
            lambda_3=lambda_3,
            useALS=useALS,
            useBias=useBias,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=best_rank_so_far,
            regParam=regParam,
            maxIter=maxIter,
            nonnegative=nonnegative,
            implicitPrefs=implicitPrefs
        )

        model = estimator.fit(train_df)
        train_predictions_df = model.transform(train_df)
        val_predictions_df = model.transform(val_df)

        regParam_scores_train.append(evaluator.evaluate(train_predictions_df))
        regParam_scores_val.append(evaluator.evaluate(val_predictions_df))

        print('regParam: {} train score: {} val score: {}'
              .format(regParam, regParam_scores_train[-1],
                      regParam_scores_val[-1]))
        print('regParam: {} train diff: {} val diff: {} ({} seconds)\n'
              .format(
                  regParam,
                  regParam_scores_train[-1] - baseline_score_train,
                  regParam_scores_val[-1] - baseline_score_val,
                  time.monotonic() - step_start_time))

    print('Done in {} seconds'.format(time.monotonic() - start_time))

    regParam_scores_train = np.array(regParam_scores_train)
    regParam_scores_val = np.array(regParam_scores_val)
    best_regParam_index = np.argmin(regParam_scores_val)

    print('RegParams:')
    print(regParams)
    print('Train score:')
    print(regParam_scores_train)
    print('Validation score:')
    print(regParam_scores_val)
    print('Train score - Baseline:')
    print(regParam_scores_train - baseline_score_train)
    print('Validation score - Baseline:')
    print(regParam_scores_val - baseline_score_val)
    print('Best RegParam: {}'.format(regParams[best_regParam_index]))

    fig, axes = plt.subplots(2, 2, figsize=(15, 9))
    flat_axes = axes.flatten()

    flat_axes[0].axhline(y=naive_score_train, label='Train Naive',
                         color='green', alpha=0.5)
    flat_axes[0].axhline(y=baseline_score_train, label='Train Baseline',
                         color='purple', alpha=0.5)
    flat_axes[0].plot(ranks, rank_scores_train, label='Train Model', alpha=0.5)
    flat_axes[0].axhline(y=naive_score_val, label='Validation Naive',
                         color='black', alpha=0.5)
    flat_axes[0].axhline(y=baseline_score_val, label='Validation Baseline',
                         color='orange', alpha=0.5)
    flat_axes[0].plot(ranks, rank_scores_val, label='Validation Model',
                      alpha=0.5)
    flat_axes[0].set_title('{} vs. Rank (regParam={})'
                           .format(eval_name, best_regParam_so_far))
    flat_axes[0].set_xlabel('Rank')
    flat_axes[0].set_ylabel(eval_name)
    flat_axes[0].legend()

    flat_axes[1].axhline(y=naive_score_train, label='Train Naive',
                         color='green', alpha=0.5)
    flat_axes[1].axhline(y=baseline_score_train, label='Train Baseline',
                         color='purple', alpha=0.5)
    flat_axes[1].plot(regParams, regParam_scores_train, label='Train Model',
                      alpha=0.5)
    flat_axes[1].axhline(y=naive_score_val, label='Validation Naive',
                         color='black', alpha=0.5)
    flat_axes[1].axhline(y=baseline_score_val, label='Validation Baseline',
                         color='orange', alpha=0.5)
    flat_axes[1].plot(regParams, regParam_scores_val, label='Validation Model',
                      alpha=0.5)
    flat_axes[1].set_title('{} vs. regParam (Rank={})'
                           .format(eval_name, best_rank_so_far))
    flat_axes[1].set_xlabel('regParam')
    flat_axes[1].set_ylabel(eval_name)
    flat_axes[1].legend()

    flat_axes[2].plot(ranks, rank_scores_train - baseline_score_train,
                      label='Train Diff', alpha=0.5)
    flat_axes[2].plot(ranks, rank_scores_val - baseline_score_val,
                      label='Validation Diff', alpha=0.5)
    flat_axes[2].set_title('{} - Baseline vs. Rank (regParam={})'
                           .format(eval_name, best_regParam_so_far))
    flat_axes[2].set_xlabel('Rank')
    flat_axes[2].set_ylabel(eval_name)
    flat_axes[2].legend()

    flat_axes[3].plot(regParams, regParam_scores_train - baseline_score_train,
                      label='Train Diff', alpha=0.5)
    flat_axes[3].plot(regParams, regParam_scores_val - baseline_score_val,
                      label='Validation Diff', alpha=0.5)
    flat_axes[3].set_title('{} - Baseline vs. regParam (Rank={})'
                           .format(eval_name, best_rank_so_far))
    flat_axes[3].set_xlabel('regParam')
    flat_axes[3].set_ylabel(eval_name)
    flat_axes[3].legend()

    plt.tight_layout()
    plt.show()
def get_baseline_scores(train_df, val_df, evaluator, eval_name):
    stats_rating_df = (
        train_df
        .agg(
            F.avg('rating').alias('avg_rating'),
            F.stddev_samp('rating').alias('stddev_rating')
        )
    )
    stats_row = stats_rating_df.head()
    print('[plot_scores Train] Avg: {}'.format(stats_row[0]))
    print('[plot_scores Train] Std Dev: {}'.format(stats_row[1]))

    # Naive model: random normal rating centered on average rating and scaled
    # with standard deviation of training data.
    train_predict_df = (
        train_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select('user', 'item', 'rating', 'prediction')
    )
    val_predict_df = (
        val_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select('user', 'item', 'rating', 'prediction')
    )

    naive_score_train = evaluator.evaluate(train_predict_df)
    naive_score_val = evaluator.evaluate(val_predict_df)
    print('Train Naive {} score: {}'.format(eval_name, naive_score_train))
    print('Validation Naive {} score: {}'.format(eval_name, naive_score_val))

    # Baseline model: bias terms only, with ALS turned off.
    estimator = Recommender(
        lambda_1=0.0,
        lambda_2=0.0,
        lambda_3=0.0,
        useALS=False,
        useBias=True,
        userCol='user',
        itemCol='item',
        ratingCol='rating'
    )
    model = estimator.fit(train_df)
    baseline_score_train = evaluator.evaluate(model.transform(train_df))
    baseline_score_val = evaluator.evaluate(model.transform(val_df))
    print('Train Baseline {} score: {}'
          .format(eval_name, baseline_score_train))
    print('Validation Baseline {} score: {}'
          .format(eval_name, baseline_score_val))

    return (
        naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val
    )
def train_and_save_model_data(ratings_df):
    lambda_1 = 0.001
    lambda_2 = 0.001
    lambda_3 = 0.0
    useALS = True
    useBias = True
    rank = 158
    regParam = 0.01
    maxIter = 10
    nonnegative = False
    implicitPrefs = False

    estimator = Recommender(
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        useALS=useALS,
        useBias=useBias,
        userCol='user',
        itemCol='item',
        ratingCol='rating',
        rank=rank,
        regParam=regParam,
        maxIter=maxIter,
        nonnegative=nonnegative,
        implicitPrefs=implicitPrefs
    )

    # estimator = ALS(
    #     userCol='user',
    #     itemCol='item',
    #     ratingCol='rating',
    #     rank=rank,
    #     regParam=regParam,
    #     maxIter=maxIter,
    #     nonnegative=nonnegative,
    #     implicitPrefs=implicitPrefs
    # )

    model = estimator.fit(ratings_df)

    model.itemFactors.write.parquet(
        path='../data/item_factors', mode='overwrite', compression='gzip')
    model.rating_stats_df.write.parquet(
        path='../data/rating_stats', mode='overwrite', compression='gzip')
    model.user_bias_df.write.parquet(
        path='../data/user_bias', mode='overwrite', compression='gzip')
    model.item_bias_df.write.parquet(
        path='../data/item_bias', mode='overwrite', compression='gzip')
    model.residual_stats_df.write.parquet(
        path='../data/residual_stats', mode='overwrite', compression='gzip')
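# A minimal sketch of reading the saved model data back for serving, assuming
# an active SparkSession named `spark` (as used elsewhere in these scripts)
# and the same relative ../data paths. spark.read.parquet is the standard
# PySpark reader for the files written above.
def load_model_data(spark):
    item_factors_df = spark.read.parquet('../data/item_factors')
    rating_stats_df = spark.read.parquet('../data/rating_stats')
    user_bias_df = spark.read.parquet('../data/user_bias')
    item_bias_df = spark.read.parquet('../data/item_bias')
    residual_stats_df = spark.read.parquet('../data/residual_stats')
    return (item_factors_df, rating_stats_df, user_bias_df,
            item_bias_df, residual_stats_df)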
    elif rec_type == 'movie':
        movie_names = rec.make_recommendations(_id, id_type)
        print('\nTop Recommendations for movie id {} are:'.format(_id))
        print(movie_names, end='\n\n')


if __name__ == '__main__':
    # instantiate recommender
    rec = Recommender()

    # fit recommender
    rec.fit(reviews_pth='data/train_data.csv',
            movies_pth='data/movies_clean.csv',
            learning_rate=.01, iterators=1)

    # make recommendations
    # 'user' or 'movie'
    _id = 66
    id_type = 'user'
    """
    _id = 1675434
    id_type = 'movie'
    """

    run(id_type)

    '''
    More Examples:
    print(rec.make_recommendations(8, 'user'))  # user in the dataset