def fit(self, ratings_df):
        """Fit every base model, then fit the blending regression on top.

        Each model in ``self.models`` is fitted on ``ratings_df``.  The
        features and targets returned by ``get_xy(ratings_df)`` are then used
        to obtain blend predictions via ``self.get_blend_predictions`` and to
        train ``self.regression`` on them.  The fitted coefficients and
        intercept are printed.

        :param ratings_df: ratings data accepted by the base models and by
            ``get_xy``.
        """
        with elapsed_time('total fit'):
            for model in self.models:
                model.fit(ratings_df)

            x, y = get_xy(ratings_df)

            with elapsed_time('get blend predictions'):
                blend_predictions = self.get_blend_predictions(x)

            # NOTE: a per-row user/movie "support" list used to be computed
            # here with iterrows() + groupby().get_group().  Nothing ever read
            # it, so the dead and slow scan is no longer performed.

            self.regression.fit(blend_predictions, y)

            # Single-argument print(...) behaves the same as the print
            # statement on Python 2.
            print('linear regression coefficients: %s, intercept: %.3f' % (
                self.regression.coef_, self.regression.intercept_))
Example #2
0
    def fit(self, ratings_df):
        """Fit the baseline model and precompute pairwise movie similarities.

        Steps, all timed under the 'fit' label:

        1. fit ``self.baseline_model`` and replace ``ratings_df`` with the
           result of its ``create_modified_ratings``;
        2. index every rating in ``self.ratings_by_movie`` and
           ``self.ratings_by_user``;
        3. store the set of raters of each movie in ``self.raters_by_movie``;
        4. store ``self.calculate_similarity`` for every pair of distinct
           movie ids (smaller sorted id first) in ``self.movie_similarity``.

        :param ratings_df: frame with 'movieId', 'userId' and 'rating' columns.
        :return: ``self``.
        """
        with elapsed_time('fit'):
            self.baseline_model.fit(ratings_df)

            ratings_df = self.baseline_model.create_modified_ratings(ratings_df)

            sorted_movie_ids = np.array(sorted(ratings_df['movieId'].unique()))

            # Fill both lookup directions in one pass over the ratings.
            for _, entry in ratings_df.iterrows():
                movie = entry['movieId']
                user = entry['userId']
                score = entry['rating']
                self.ratings_by_movie[movie][user] = score
                self.ratings_by_user[user][movie] = score

            for movie in sorted_movie_ids:
                self.raters_by_movie[movie] = set(self.ratings_by_movie[movie])

            # Visit each unordered pair once: pair every id with the ids that
            # follow it in sorted order.
            for position, first_id in enumerate(sorted_movie_ids):
                for second_id in sorted_movie_ids[position + 1:]:
                    self.movie_similarity[(first_id, second_id)] = \
                        self.calculate_similarity(first_id, second_id)

        return self
Example #3
0
def main():
    """Load the ratings CSV with parsed timestamps and score the user similarity model."""
    ratings_path = 'ml-latest-small/ratings.csv'
    # A smaller sample can be loaded instead with:
    #   read_ratings_df('ml-latest-small/ratings_5_pct.csv')
    ratings = read_ratings_df_with_timestamp(ratings_path)

    with elapsed_time('build model'):
        score_model(
            ratings,
            model_f=UserSimilarityModel,
            model_name='user similarity model',
        )
Example #4
0
    def fit(self, ratings_df):
        """Compute the global mean rating and the movie / user effects.

        Stores the mean of the targets returned by ``get_xy`` in
        ``self.y_mean``, keeps the per-user grouping in ``self.user_groups``,
        and fills ``self.movie_effects`` and ``self.user_effects`` through the
        corresponding ``calculate_*`` methods.  Runs under the
        'effects init' timer.

        :param ratings_df: frame with 'movieId', 'userId' and 'rating' columns.
        :return: ``self``.
        """
        with elapsed_time('effects init'):
            _features, targets = get_xy(ratings_df)
            self.y_mean = targets.mean()

            self.user_groups = ratings_df.groupby('userId')
            ratings_per_movie = ratings_df.groupby('movieId')['rating']

            # Movie effects are computed before user effects, as in the
            # original ordering.
            self.movie_effects = self.calculate_movie_effects(ratings_per_movie)
            self.user_effects = self.calculate_user_effects(self.user_groups)

        return self
Example #5
0
def main():
    """Run the arecord | oggenc shell pipeline and report on the result.

    Verifies the output directory, prints the command, runs it through
    ``os.system`` (the user stops it with CTRL+C), then prints the elapsed
    time, the size of the produced file and a command to play it back.
    """
    common.verify_output_dir(TMP_DIR)
    target = get_output_filename()
    pipeline = "arecord -d 0 -c 2 -f S16_LE -r 44100 -t wav -D copy | oggenc -o %s -" % target

    print(color(pipeline, 'cyan'))
    print(color("Press CTRL+C to stop the recording process.", 'green'))

    # Time only the recording itself.
    started_at = time.time()
    os.system(pipeline)
    finished_at = time.time()

    print(color(common.elapsed_time(finished_at, started_at), 'yellow'))

    pretty_size = common.numberToPrettyString(os.path.getsize(target))
    print(color("Size of the output file: %s bytes." % pretty_size, 'yellow'))

    print(color("If you want to listen to the recorded file, execute the following command:", 'green'))
    print(color("mplayer %s" % target, 'cyan'))
Example #6
0
def main():
    """Record through the arecord | oggenc pipeline and print a summary.

    The output directory is verified first; the shell command is echoed and
    executed with ``os.system``.  Once it returns, the elapsed time, the
    output file size and a playback command are printed in colour.
    """
    def say(text, shade):
        # Print one coloured line.
        print(color(text, shade))

    common.verify_output_dir(TMP_DIR)
    recording_path = get_output_filename()
    shell_command = (
        "arecord -d 0 -c 2 -f S16_LE -r 44100 -t wav -D copy | oggenc -o %s -"
        % recording_path
    )

    say(shell_command, 'cyan')
    say("Press CTRL+C to stop the recording process.", 'green')

    t_begin = time.time()
    os.system(shell_command)
    t_finish = time.time()

    say(common.elapsed_time(t_finish, t_begin), 'yellow')
    say("Size of the output file: %s bytes."
        % common.numberToPrettyString(os.path.getsize(recording_path)),
        'yellow')
    say("If you want to listen to the recorded file, execute the following command:",
        'green')
    say("mplayer %s" % recording_path, 'cyan')
def build_model(ratings_df):
    """Fit and score a ``UserSimilarityModel`` on train/test splits.

    For each of ``n_iter`` iterations the ratings are split with
    ``train_test_split``, the model is fitted on the training part, and R^2
    and RMSE are computed on both parts.  The mean and standard deviation of
    each metric over the iterations are printed.

    :param ratings_df: ratings data accepted by ``train_test_split`` and
        ``get_xy``.
    """
    r2_train, r2_test = [], []
    rmse_train, rmse_test = [], []
    n_iter = 1

    # Alternatives that were tried here previously:
    #   BaselineTotalMeanModel()
    #   BaselineMeansModel(user_weight=0.5)
    #   BaselineEffectsModel(movie_lambda=5.0, user_lambda=20.0)
    model = UserSimilarityModel(movie_lambda=5.0, user_lambda=20.0)

    for _ in xrange(n_iter):
        train_part, test_part = train_test_split(ratings_df)

        model = model.fit(train_part)

        x_train, y_train = get_xy(train_part)
        x_test, y_test = get_xy(test_part)

        with elapsed_time('scoring'):
            pred_train = model.predict(x_train)
            pred_test = model.predict(x_test)

            # Evaluated left to right: R^2 (train, test), then RMSE (train, test).
            iteration_metrics = (
                r2_score(y_train, pred_train),
                r2_score(y_test, pred_test),
                root_mean_squared_error(y_train, pred_train),
                root_mean_squared_error(y_test, pred_test),
            )

        r2_train.append(iteration_metrics[0])
        r2_test.append(iteration_metrics[1])

        rmse_train.append(iteration_metrics[2])
        rmse_test.append(iteration_metrics[3])

    print('mean train score: %.4f, std: %.4f' % (
        np.mean(r2_train), np.std(r2_train)))
    print('mean test score: %.4f, std: %.4f' % (
        np.mean(r2_test), np.std(r2_test)))
    # Blank separator line between the two metric families.
    print('')
    print('mean train rmse: %.4f, std: %.4f' % (
        np.mean(rmse_train), np.std(rmse_train)))
    print('mean test rmse: %.4f, std: %.4f' % (
        np.mean(rmse_test), np.std(rmse_test)))
def read_ratings_df(file_name):
    """Read the CSV at ``file_name`` into a DataFrame, timed as 'loaded csv'.

    :param file_name: path passed straight to ``pd.read_csv``.
    :return: the loaded DataFrame.
    """
    with elapsed_time('loaded csv'):
        return pd.read_csv(file_name)
def read_ratings_df_with_timestamp(file_name):
    """Read the CSV at ``file_name``, parsing the 'timestamp' column as dates.

    The 'timestamp' column is parsed with ``date_parse``; loading is timed
    under the 'loaded csv' label.

    :param file_name: path passed straight to ``pd.read_csv``.
    :return: the loaded DataFrame.
    """
    with elapsed_time('loaded csv'):
        csv_options = {
            'parse_dates': ['timestamp'],
            'date_parser': date_parse,
        }
        loaded = pd.read_csv(file_name, **csv_options)
    return loaded