def fit(self, ratings_df):
    """Fit every base model, then fit the blending regression on their output.

    Each model in ``self.models`` is fitted on ``ratings_df``. The features
    returned by ``get_xy(ratings_df)`` are passed to
    ``self.get_blend_predictions`` and ``self.regression`` is fitted on that
    result against the targets returned by ``get_xy``. The fitted
    coefficients and intercept are printed.

    :param ratings_df: ratings frame accepted by ``get_xy`` and by the
        ``fit`` method of every model in ``self.models``.
    :return: ``self``, so that ``model = model.fit(ratings_df)`` keeps a
        usable model object.
    """
    with elapsed_time('total fit'):
        for model in self.models:
            model.fit(ratings_df)

        x, y = get_xy(ratings_df)
        with elapsed_time('get blend predictions'):
            blend_predictions = self.get_blend_predictions(x)

        self.regression.fit(blend_predictions, y)
        # A parenthesised single argument prints the same text whether
        # ``print`` is the Python 2 statement or the print function.
        print('linear regression coefficients: %s, intercept: %.3f' % (
            self.regression.coef_, self.regression.intercept_))
    return self
def fit(self, ratings_df):
    """Fit the baseline model and cache a similarity for every movie pair.

    Steps, all inside the ``elapsed_time('fit')`` context:

    1. fit ``self.baseline_model`` and replace ``ratings_df`` with the result
       of its ``create_modified_ratings``;
    2. index every rating in ``self.ratings_by_movie`` and
       ``self.ratings_by_user``;
    3. store the set of raters of each movie in ``self.raters_by_movie``;
    4. store ``self.calculate_similarity(a, b)`` in ``self.movie_similarity``
       under the key ``(a, b)`` for every pair of distinct movie ids where
       ``a`` precedes ``b`` in sorted order.

    :param ratings_df: frame with ``movieId``, ``userId`` and ``rating``
        columns.
    :return: ``self``.
    """
    with elapsed_time('fit'):
        self.baseline_model.fit(ratings_df)
        ratings_df = self.baseline_model.create_modified_ratings(ratings_df)

        distinct_ids = ratings_df['movieId'].unique()
        unique_movie_ids = np.array(sorted(distinct_ids))

        # Two-way lookup tables: movie -> {user: rating}, user -> {movie: rating}.
        for _, record in ratings_df.iterrows():
            movie = record['movieId']
            user = record['userId']
            score = record['rating']
            self.ratings_by_movie[movie][user] = score
            self.ratings_by_user[user][movie] = score

        for movie in unique_movie_ids:
            self.raters_by_movie[movie] = set(self.ratings_by_movie[movie].keys())

        # Each unordered pair is visited once: the second id always comes
        # from the part of the sorted array after the first id.
        for position, first_id in enumerate(unique_movie_ids):
            for second_id in unique_movie_ids[position + 1:]:
                pair_similarity = self.calculate_similarity(first_id, second_id)
                self.movie_similarity[(first_id, second_id)] = pair_similarity
    return self
def main():
    """Load ``ml-latest-small/ratings.csv`` and score the user similarity model.

    The ratings are read with ``read_ratings_df_with_timestamp`` and handed to
    ``score_model`` inside the ``elapsed_time('build model')`` context.
    """
    # Alternative input kept from development:
    #   read_ratings_df('ml-latest-small/ratings_5_pct.csv')
    ratings_path = 'ml-latest-small/ratings.csv'
    ratings_df = read_ratings_df_with_timestamp(ratings_path)

    with elapsed_time('build model'):
        score_model(
            ratings_df,
            model_f=UserSimilarityModel,
            model_name='user similarity model',
        )
def fit(self, ratings_df):
    """Compute the global mean and the movie and user effects.

    Sets ``self.y_mean`` (mean of the targets returned by ``get_xy``),
    ``self.user_groups`` (``ratings_df`` grouped by ``userId``),
    ``self.movie_effects`` and ``self.user_effects``, all inside the
    ``elapsed_time('effects init')`` context.

    :param ratings_df: frame with ``movieId``, ``userId`` and ``rating``
        columns.
    :return: ``self``.
    """
    with elapsed_time('effects init'):
        targets = get_xy(ratings_df)[1]
        self.y_mean = targets.mean()

        ratings_per_movie = ratings_df.groupby('movieId')['rating']
        self.user_groups = ratings_df.groupby('userId')

        # Order kept as-is: the user-effect step runs after the movie effects
        # are stored (presumably it relies on them - confirm in
        # calculate_user_effects).
        self.movie_effects = self.calculate_movie_effects(ratings_per_movie)
        self.user_effects = self.calculate_user_effects(self.user_groups)
    return self
def main():
    """Record audio through an ``arecord | oggenc`` shell pipeline.

    Prints the command, runs it with ``os.system`` until it ends (the user is
    told to press CTRL+C), then prints the elapsed time reported by
    ``common.elapsed_time``, the size of the output file and an ``mplayer``
    command for playing it back.
    """
    common.verify_output_dir(TMP_DIR)
    target_path = get_output_filename()

    pipeline = "arecord -d 0 -c 2 -f S16_LE -r 44100 -t wav -D copy | oggenc -o %s -" % target_path
    print(color(pipeline, 'cyan'))
    print(color("Press CTRL+C to stop the recording process.", 'green'))

    # Timestamps bracket the blocking shell call only.
    began_at = time.time()
    os.system(pipeline)
    finished_at = time.time()

    duration_text = common.elapsed_time(finished_at, began_at)
    print(color(duration_text, 'yellow'))

    pretty_size = common.numberToPrettyString(os.path.getsize(target_path))
    print(color("Size of the output file: %s bytes." % pretty_size, 'yellow'))

    print(color("If you want to listen to the recorded file, execute the following command:", 'green'))
    print(color("mplayer %s" % target_path, 'cyan'))
def main():
    """Capture audio with ``arecord`` piped into ``oggenc`` and report the result.

    The shell command is echoed, executed through ``os.system`` and timed with
    ``time.time()``; afterwards the elapsed time, the output file size and a
    playback hint are printed in colour.
    """
    def say(text, shade):
        # Single place for the colourised console output used below.
        print(color(text, shade))

    common.verify_output_dir(TMP_DIR)
    out = get_output_filename()
    command = "arecord -d 0 -c 2 -f S16_LE -r 44100 -t wav -D copy | oggenc -o %s -" % out

    say(command, 'cyan')
    say("Press CTRL+C to stop the recording process.", 'green')

    start = time.time()
    os.system(command)
    end = time.time()

    say(common.elapsed_time(end, start), 'yellow')
    size_in_bytes = os.path.getsize(out)
    say("Size of the output file: %s bytes." % common.numberToPrettyString(size_in_bytes),
        'yellow')
    say("If you want to listen to the recorded file, execute the following command:",
        'green')
    say("mplayer %s" % out, 'cyan')
def build_model(ratings_df):
    """Fit and score ``UserSimilarityModel`` on train/test splits of ``ratings_df``.

    For each of ``n_iter`` iterations the frame is split with
    ``train_test_split``, the model is fitted on the training part, and R^2
    and RMSE are computed for both parts. The mean and standard deviation of
    every metric across iterations are printed at the end.

    :param ratings_df: ratings frame accepted by ``train_test_split`` and
        ``get_xy``.
    """
    n_iter = 1
    # Other candidates tried at this spot:
    #   BaselineTotalMeanModel()
    #   BaselineMeansModel(user_weight=0.5)
    #   BaselineEffectsModel(movie_lambda=5.0, user_lambda=20.0)
    model = UserSimilarityModel(movie_lambda=5.0, user_lambda=20.0)

    r2_train, r2_test = [], []
    rmse_train, rmse_test = [], []

    for _ in range(n_iter):
        train_part, test_part = train_test_split(ratings_df)
        model = model.fit(train_part)

        x_train, y_train = get_xy(train_part)
        x_test, y_test = get_xy(test_part)

        with elapsed_time('scoring'):
            pred_train = model.predict(x_train)
            pred_test = model.predict(x_test)

        r2_train.append(r2_score(y_train, pred_train))
        r2_test.append(r2_score(y_test, pred_test))
        rmse_train.append(root_mean_squared_error(y_train, pred_train))
        rmse_test.append(root_mean_squared_error(y_test, pred_test))

    print('mean train score: %.4f, std: %.4f' % (np.mean(r2_train), np.std(r2_train)))
    print('mean test score: %.4f, std: %.4f' % (np.mean(r2_test), np.std(r2_test)))
    # Empty string: emits one blank line under both print forms.
    print('')
    print('mean train rmse: %.4f, std: %.4f' % (np.mean(rmse_train), np.std(rmse_train)))
    print('mean test rmse: %.4f, std: %.4f' % (np.mean(rmse_test), np.std(rmse_test)))
def read_ratings_df(file_name):
    """Read the CSV at ``file_name`` with ``pd.read_csv`` and return the frame.

    The read happens inside the ``elapsed_time('loaded csv')`` context.

    :param file_name: path passed straight to ``pd.read_csv``.
    :return: the loaded ``DataFrame``.
    """
    with elapsed_time('loaded csv'):
        # Returning from inside the block still runs the context exit.
        return pd.read_csv(file_name)
def read_ratings_df_with_timestamp(file_name):
    """Read the ratings CSV and parse its ``timestamp`` column with ``date_parse``.

    The read happens inside the ``elapsed_time('loaded csv')`` context.

    :param file_name: path passed straight to ``pd.read_csv``.
    :return: the loaded ``DataFrame``.
    """
    csv_options = {
        'parse_dates': ['timestamp'],
        'date_parser': date_parse,
    }
    with elapsed_time('loaded csv'):
        return pd.read_csv(file_name, **csv_options)