Example #1
def get_dataset(dataset_id):
    BUILTIN_DATASETS = {
        'ml-100k': {
            'path': './ml-100k/u.data',
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '\t'
        },
        'ml-1m': {
            'path': './ml-1m/ratings.dat',
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '::'
        },
        'ml-10m': {
            'path': './ml-10M100K/ratings.dat',
            'line_format': 'user item rating timestamp',
            'rating_scale': (1, 5),
            'sep': '::'
        }
    }
    dataset_props = BUILTIN_DATASETS[dataset_id]
    return Dataset.load_from_file(
        dataset_props.get('path'),
        Reader(line_format=dataset_props.get('line_format'),
               rating_scale=dataset_props.get('rating_scale'),
               sep=dataset_props.get('sep')))
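
# Usage sketch (not part of the original snippet): load one of the datasets defined
# above and cross-validate an SVD model on it. This assumes the MovieLens files
# already exist at the relative paths listed in BUILTIN_DATASETS.
from surprise import SVD
from surprise.model_selection import cross_validate

data = get_dataset('ml-100k')
cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
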
def svd_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          phase='eval'):
    """
    pmf_ratings Predicates
    """
    print("SVD predicates")
    svd_model = SVD()
    reader = Reader(rating_scale=(0.2, 1))
    train_dataset = Dataset.load_from_df(df=observed_ratings_df.reset_index(
    ).loc[:, ['userId', 'movieId', 'rating']],
                                         reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index,
                               columns=['rating'])

    for row in truth_ratings_df.loc[:, ['rating']].iterrows():
        uid = row[0][0]
        iid = row[0][1]
        predictions.loc[(uid, iid), 'rating'] = svd_model.predict(uid, iid).est

    write(predictions, 'svd_rating_obs', fold, phase)
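
# Illustration (not from the original code) of the input shape svd_ratings_predicate
# expects: both dataframes are indexed by a (userId, movieId) MultiIndex, which is why
# iterrows() above yields the two ids in row[0]. The ratings below are made-up values
# on the (0.2, 1) scale used by the Reader.
example_ratings = pd.DataFrame(
    {'rating': [0.8, 0.6, 1.0]},
    index=pd.MultiIndex.from_tuples([(1, 10), (1, 20), (2, 10)],
                                    names=['userId', 'movieId']))
# svd_ratings_predicate(example_ratings, example_ratings)  # also needs the project's write()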
Example #3
    def __init__(self,
                 love_matrix: np.ndarray,
                 model: AlgoBase,
                 rating_scale: tuple = (1, 5),
                 test_size: float = 0.2):
        self.love_matrix = love_matrix
        self.model = model
        # rating_scale was referenced via self.rating_scale but never assigned in the
        # original fragment; it is taken as a parameter here ((1, 5) is an assumed default)
        self.rating_scale = rating_scale
        self.reader = Reader(rating_scale=self.rating_scale)
        self.test_size = test_size
Example #4
def evaluate_prediction(df, sample_num, dir_attack_profiles, attack):
    """

    :param df:
    :param sample_num:
    :param dir_attack_profiles:
    :param attack:
    :return: the elaborated dataframe and the sample name
    """

    print("\t\t\t\tAttack {0}".format(attack))
    target_item_id = int(attack.split('_')[2].split('.')[0])
    df_attack = pd.read_csv(os.path.join(dir_attack_profiles, attack))

    # Reduce the number of shilling profiles with respect to the maximum
    perc_of_shilling_users = round(
        cfg.attackSizePercentage / max(cfg.size_of_attacks), 2)
    shilling_users = df_attack.userId.unique()
    df_attack = df_attack[df_attack.userId.isin(
        shilling_users[:int(len(shilling_users) * perc_of_shilling_users)])]

    shilling_ids = list(df_attack['userId'].unique())

    df_attack = pd.concat([df_attack, df], ignore_index=True).reset_index()

    algo = get_algo(df_attack)

    # First train a Recommender Algorithm on the sample dataset.
    if cfg.model in [cfg.ncf]:
        algo.fit()
    else:
        reader = Reader(line_format='user item rating',
                        rating_scale=cfg.rating_scale.get(cfg.dataset))
        data = Dataset.load_from_df(df_attack[['userId', 'itemId', 'rating']],
                                    reader)
        trainset = data.build_full_trainset()
        algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # predictions = algo.test(testset)

    # We are Evaluating on a Single Item
    # rec_list, final_positions, final_scores = get_rec_list_faster(predictions, target_item_id)

    # final_positions, final_scores = get_rec_list_faster(predictions, target_item_id, shilling_ids)
    print('\t\t\t\tEvaluating post prediction')

    if cfg.model in [cfg.ncf]:
        final_positions, final_scores = algo.test([target_item_id],
                                                  shilling_ids)
    else:
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)
        final_positions, final_scores = get_rec_list_faster(
            predictions, target_item_id, shilling_ids)
    print('\t\t\t\tEnd Evaluation of post prediction')

    return final_positions, final_scores, sample_num, target_item_id
Example #5
    def load_builtin(cls, name='ml-100k', prompt=True):
        """Load a built-in dataset.

        If the dataset has not already been loaded, it will be downloaded and
        saved. You will have to split your dataset using the :meth:`split
        <DatasetAutoFolds.split>` method. See an example in the :ref:`User
        Guide <cross_validate_example>`.

        Args:
            name(:obj:`string`): The name of the built-in dataset to load.
                Accepted values are 'ml-100k', 'ml-1m', and 'jester'.
                Default is 'ml-100k'.
            prompt(:obj:`bool`): Prompt before downloading if dataset is not
                already on disk.
                Default is True.

        Returns:
            A :obj:`Dataset` object.

        Raises:
            ValueError: If the ``name`` parameter is incorrect.
        """

        try:
            dataset = BUILTIN_DATASETS[name]
        except KeyError:
            raise ValueError('unknown dataset ' + name +
                             '. Accepted values are ' +
                             ', '.join(BUILTIN_DATASETS.keys()) + '.')

        # if dataset does not exist, offer to download it
        if not os.path.isfile(dataset.path):
            answered = not prompt
            while not answered:
                print('Dataset ' + name + ' could not be found. Do you want '
                      'to download it? [Y/n] ',
                      end='')
                choice = input().lower()

                if choice in ['yes', 'y', '', 'omg this is so nice of you!!']:
                    answered = True
                elif choice in ['no', 'n', 'hell no why would i want that?!']:
                    answered = True
                    print("Ok then, I'm out!")
                    sys.exit()

            download_builtin_dataset(name)

        reader = Reader(**dataset.reader_params)

        return cls.load_from_file(file_path=dataset.path,
                                  reader=reader,
                                  item_path=dataset.item_path)
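
    # Typical call pattern for load_builtin (a sketch, not part of this file):
    #     data = Dataset.load_builtin('ml-100k')
    #     trainset = data.build_full_trainset()
    # The raw files are downloaded on first use (after the prompt above) and cached,
    # so subsequent calls skip the download.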
    def perform_operation(self):
        self.LOG_HANDLE.info(
            "Running the collaborative filtering algorithms...")
        latest_ratings_file_name = self.get_latest_output_file_name(
            configurations.RATINGS_FILE_IN_REQUIRED_FORMAT_FILE_NAME,
            next=False)[1]
        latest_ratings_file_location = os.path.join(
            configurations.OUTPUT_FILES_DIRECTORY, latest_ratings_file_name)
        self.LOG_HANDLE.info("Running recommender models on the file here: " +
                             latest_ratings_file_location)
        print("Running all recommender models")

        # Params from here: http://surprise.readthedocs.io/en/stable/reader.html
        reader = Reader(sep=constants.COMMA_STR)

        # Params from here: http://surprise.readthedocs.io/en/stable/dataset.html
        ratings_dataset = Dataset.load_from_file(latest_ratings_file_location,
                                                 reader)

        # Divide the data set into the training and test sets
        trainset, testset = train_test_split(
            ratings_dataset, test_size=model_params.test_set_size)

        # Add different algorithms here - Removed SVD PP algorithm
        collaborative_algorithms = [
            normal_algo_wrapper(),
            knn_algo_wrapper(),
            svd_algo_wrapper()
        ]

        rmse_values = {}

        for collaborative_algorithm in collaborative_algorithms:
            print("Started Algorithm: " + collaborative_algorithm.algo_name)
            rmse_values[collaborative_algorithm.algo_name] = \
                collaborative_algorithm.evaluate_on_test(trainset, testset)
            collaborative_algorithm.perform_grid_search_with_cv(
                ratings_dataset)
            print("Completed Algorithm: " + collaborative_algorithm.algo_name)

        print("All recommender models have been run...")
        plt.scatter(list(rmse_values.keys()), list(rmse_values.values()))
        plt.xlabel('Collaborative filtering algorithm')
        plt.ylabel('Root mean square error (RMSE) on test predictions')
        plt.show()
Example #7
def main():
    with open('random_recommender_config.json', 'r') as f:
        config = json.load(f)

        path = config['path']
        separator = config['separator']
        n_folds = config['n_folds']

    output_recommendation_file_path = path + '<output_recommendation_file_path>'
    input_file_path = path + '<input_file_path>'
    ratings_file_path = path + '<ratings_file_path>'
    random_path = output_recommendation_file_path + 'random/'
    reader = Reader(line_format='user item rating timestamp', sep='\t')

    recommender = RandomRecommender(ratings_file_path=ratings_file_path,
                                    separator=separator)
    recommender.recommend_rival(n_folds=n_folds,
                                train_test_file_path=input_file_path,
                                reader=reader,
                                recommendation_file_path=random_path)
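
# A sketch of the config file this main() expects, inferred from the keys read above;
# the values shown are placeholders, not taken from the original project.
import json

example_config = {
    'path': '/data/experiments/',  # placeholder
    'separator': '\t',             # placeholder
    'n_folds': 5                   # placeholder
}
with open('random_recommender_config.json', 'w') as f:
    json.dump(example_config, f, indent=2)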
Example #8
    def estimate_preference(self, user_id, item_id):

        """
        Estimate the preference value for a specific user.
        :param user_id: Id of the user to recommend to.
        :param item_id: Id of the item to recommend.
        :return: The estimated preference from the specific recommender.
        """

        # train file:
        df_ratings = self.rating_data_model.df_ratings
        # A reader is still needed, but only the rating_scale param is required.
        reader = Reader(rating_scale=(self.rating_data_model.get_min_preference(),
                                      self.rating_data_model.get_max_preference()))
        # The columns must correspond to user id, item id and ratings (in that order).
        train_data = Dataset.load_from_df(df_ratings[['user_id', 'item_id', 'rating']], reader)
        trainset = train_data.build_full_trainset()

        # Train recommendation input_model:
        self.model.fit(trainset)

        # predict() accepts raw ids and returns a Prediction whose .est field holds
        # the estimated rating
        return float(self.model.predict(uid=user_id, iid=item_id).est)
def evaluate_prediction(sample_path, sample_num):
    """
    
    :param sample_path: The Absolute Path of the sample useful to read the data samples csv file 
    :param sample_num: the number of sample under analysis
    :return: the elaborated dataframe and the sample name
    """ ""
    # Load the dataset (download it if needed).

    df = pd.read_csv(os.path.join(project_dir, sample_path))

    target_items = \
        pd.read_csv(os.path.join(project_dir, cfg.data, cfg.dataset, cfg.target_items),
                    usecols=['itemId'])['itemId'].tolist()
    try:
        algo = get_algo(df)
    except Exception as e:
        print(e)

    # First train a Recommender Algorithm on the sample dataset.
    print("\t\t\t\tFit {0}{1}".format(sample_path, sample_num))
    if cfg.model in [cfg.ncf]:
        algo.fit()
    else:
        reader = Reader(line_format='user item rating',
                        rating_scale=cfg.rating_scale.get(cfg.dataset))
        data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
        trainset = data.build_full_trainset()
        algo.fit(trainset)
    print("\t\t\t\tEND - Fit {0}{1}".format(sample_path, sample_num))
    # Then predict ratings for all pairs (u, i) that are NOT in the training set.

    print("\t\t\t\tPredict {0}{1}".format(sample_path, sample_num))
    if cfg.model in [cfg.ncf]:
        initial_positions, initial_scores = algo.test(target_items[:])
    else:
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)
        initial_positions, initial_scores = get_rec_list(
            predictions, target_items[:])
    print("\t\t\t\tEND - Predict {0}{1}".format(sample_path, sample_num))

    # rec_list, initial_positions, initial_scores = get_rec_list(predictions, target_items[:])
    print("\t\t\t\tStoring Initial Predictions {0}{1}".format(
        sample_path, sample_num))

    initial_prediction = {
        'initial_positions': initial_positions,
        'initial_scores': initial_scores
    }
    save_obj(
        initial_prediction,
        os.path.join(project_dir, cfg.model, cfg.results, cfg.dataset,
                     cfg.initial_prediction))

    # if cfg.save_full_rec_list:
    #     # Save Also FULL REC LIST
    #     save_obj(rec_list,
    #              os.path.join(project_dir, cfg.model, cfg.results, cfg.dataset,
    #                           'Full_{0}'.format(cfg.initial_prediction)))

    print("\t\t\t\tEND - Store Initial Positions {0}{1}".format(
        sample_path, sample_num))
Example #10
def user_defined_file(file: Path) -> surprise.dataset.DatasetAutoFolds:
    reader = Reader(line_format='user item rating timestamp',
                    sep=',',
                    skip_lines=1)
    data = Dataset.load_from_file(file, reader)
    return data
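
# Usage sketch for user_defined_file (not from the original snippet). The Reader
# settings above imply a comma-separated file with a header row followed by
# user,item,rating,timestamp lines, e.g.:
#     userId,movieId,rating,timestamp
#     1,31,2.5,1260759144
# from pathlib import Path
# data = user_defined_file(Path('ratings.csv'))
# trainset = data.build_full_trainset()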
Example #11
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.reader import Reader
import os
import pandas as pd
from sklearn.decomposition import PCA

if __name__ == "__main__":
    os.chdir("C:\\Users\\22560\\Documents\\iptv")
    behavior = pd.read_csv(
        "C:\\Users\\22560\\Documents\\iptv\\originalData\\behavior.csv")
    reader = Reader(rating_scale=(behavior['MEDIACOUNT'].min(),
                                  behavior['MEDIACOUNT'].max()))

    data = Dataset.load_from_df(
        behavior[['newUserID', 'TV_NAME', 'MEDIACOUNT']], reader)
    trainset, testset = train_test_split(data, test_size=.25)
    algo = SVD()
    algo.fit(trainset)

    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    userattr = algo.pu
    itemattr = algo.qi

    pca = PCA(n_components=2)
    behavior.head()
# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
ratings = all_ratings.iloc[0:10**6]
data2 = pd.DataFrame({
    'userID': ratings['user'],
    'itemID': ratings['movie'],
    'rating': ratings["rating"]
})

data2['rating'].describe()

data = Dataset.load_from_df(
    data2, Reader(line_format=u'user item rating', rating_scale=(1, 5)))
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# define a cross-validation iterator
kf = KFold(n_splits=5)
time1 = datetime.now()
for trainset, testset in kf.split(data):
    time2 = datetime.now()
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
    print("hi:{}".format(time2 - time1))
    time1 = time2
    train_directory = "./data/02_intermediate/"
    submission_directory = "./data/07_model_output/"
    model_directory = "./data/06_models/"
    file_train = f"{train_directory}opiniones_train.csv"
    file_test = f"{train_directory}opiniones_test.csv"

    train = pd.read_csv(file_train)
    test = pd.read_csv(file_test)

    genre_means, genre_book_means = calculate_means(train)
    global_mean = train.puntuacion.mean()
    users_in_train = set(train.usuario.values)

    scale = (1.0, 10.0)
    reader = Reader(rating_scale=scale)
    data_train = Dataset.load_from_df(
        train[["usuario", "libro", "puntuacion"]], reader)
    trainset = data_train.build_full_trainset()

    # testing_algorithm(train)

    # SVD
    param_grid = {
        "n_factors": [70, 80, 90, 100, 110, 120, 130, 140, 150, 160],
        "n_epochs": [100],
        "lr_all": [0.002, 0.005, 0.01, 0.05],
        "reg_all": [0.1, 0.4, 0.6],
        "random_state": [0, 5, 42],
    }
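
    # Sketch of how this param_grid would typically be consumed (the original snippet
    # stops before the search is run): grid-search SVD over data_train and report the
    # best RMSE configuration. Assumes GridSearchCV and SVD are imported from
    # surprise.model_selection and surprise respectively.
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
    gs.fit(data_train)
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])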
def main():
    # Load data
    reader = Reader(sep=',', rating_scale=(0.0, 5.0), skip_lines=1)
    allMoives = Dataset.load_from_file('ratings.csv', reader=reader)
    popMoives = Dataset.load_from_file('popular.csv', reader=reader)
    unpopMoives = Dataset.load_from_file('unpopular.csv', reader=reader)
    varMoives = Dataset.load_from_file('variance.csv', reader=reader)
    binary = []
    binary.append(Dataset.load_from_file('bin2.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin3.5.csv', reader=reader))
    binary.append(Dataset.load_from_file('bin4.csv', reader=reader))
    with open('movies.csv', 'r', encoding='utf8') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        next(reader, None)
        movies = {int(movie[0]): movie[2] for movie in reader}

    # NMFs
    ks = range(2, 52, 2)
    mae, rmse = [0] * len(ks), [0] * len(ks)

    def nmf(dataName, data, biased=True):
        print('Start building NMF with ' + dataName + '!')
        for i, k in enumerate(ks):
            nmf = NMF(n_factors=k, biased=biased)
            scores = cross_validate(nmf, data, cv=10)
            mae[i] = scores['test_mae'].mean()
            rmse[i] = scores['test_rmse'].mean()
            print('k = ' + str(k) + ' finished!')
        plt.figure()
        plt.subplot(211)
        plt.plot(ks, mae)
        plt.xlabel('k')
        plt.ylabel('mean absolute error')
        plt.title('Mean absolute error vs. k of ' + dataName)
        plt.subplot(212)
        plt.plot(ks, rmse)
        plt.xlabel('k')
        plt.ylabel('root mean squared error')
        plt.title('Root mean squared error vs. k of ' + dataName)
        print('mae:')
        print(mae)
        print('rmse:')
        print(rmse)
        print('Finish building NMF with ' + dataName + '!')

    # Q17
    nmf('all movies', allMoives)

    # Q18
    optimalK = 4
    print('The optimal number of latent factors is ' + str(optimalK))

    # Q19
    nmf('popular movies', popMoives)

    # Q20
    nmf('unpopular movies', unpopMoives)

    # Q21
    nmf('high variance movies', varMoives)

    # Draw ROC Curve
    thresholds = [2.5, 3, 3.5, 4]

    def drawRoc(model, i, k):
        print('Start drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')
        train, test = train_test_split(binary[i],
                                       train_size=0.9,
                                       test_size=0.1)
        model.fit(train)
        labels = model.test(test)
        y_true = [label.r_ui for label in labels]
        y_pred = [label.est for label in labels]
        fpr, tpr, _ = roc_curve(y_true, y_pred)
        roc_auc = auc(fpr, tpr)

        plt.figure()
        plt.plot(fpr,
                 tpr,
                 color='darkorange',
                 lw=2,
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve of NMF with optimal k = ' + str(k) +
                  ', threshold = ' + str(thresholds[i]))
        plt.legend(loc="lower right")
        print('Finish drawing ROC curve of NMF with optimal k = ' + str(k) +
              ', threshold = ' + str(thresholds[i]) + '!')

    # Q22
    # use a distinct name so the nmf() helper defined above is not shadowed
    # (it is called again for Q24 below)
    nmf_model = NMF(n_factors=optimalK)
    for i in range(len(thresholds)):
        drawRoc(nmf_model, i, optimalK)

    # Q23
    print("Start finding top K!")
    k, col = 20, 5
    nmf_model = NMF(n_factors=k)
    trainAllMovies = allMoives.build_full_trainset()
    nmf_model.fit(trainAllMovies)
    ids = [[] for _ in range(col)]
    for i in range(col):
        factors = nmf_model.qi[:, i]
        s = sorted([[i, factor] for i, factor in enumerate(factors)],
                   key=lambda x: x[1],
                   reverse=True)
        for k in range(10):
            ids[i].append(s[k][0])
    genres = [[] for _ in range(col)]
    for i in range(col):
        for j in range(10):
            genres[i].append(movies[int(trainAllMovies.to_raw_iid(ids[i][j]))])
    for i in range(col):
        print('Col ' + str(i + 1) + ':')
        for genre in genres[i]:
            print(genre, end=', ')
        print('')
    print("Finish finding top K!")

    # Q24
    nmf('all movies', allMoives, True)

    # Q25
    optimalKBiased = 2
    print('The optimal number of latent factors is ' + str(optimalKBiased))

    # Q26
    nmf('popular movies', popMoives, True)

    # Q27
    nmf('unpopular movies', unpopMoives, True)

    # Q28
    nmf('high variance movies', varMoives, True)

    # Q29
    optimalKBiased = 2
    nmfBiased = NMF(n_factors=optimalKBiased, biased=True)
    for i in range(len(thresholds)):
        drawRoc(nmfBiased, i, optimalKBiased)

    plt.show()
def prepare_boycott_task(i, experimental_iteration, args, config, ratings_df, seed_base, outname, algo_name, algo, head_items, data):
    """
    To simulate a boycott, we need to figure out which ratings are being held out
    For large datasets and large boycotts (e.g. 50% of ML-20M) this is very slow
    So we need to parallelize it

    That's the purpose of this function

    """
    if config['type'] == 'individual_users':
        row = experimental_iteration[1]
        identifier = row.user_id
        name = 'individual'
        if args.indices != 'all':
            if identifier < args.indices[0] or identifier > args.indices[1]:
                return
        boycott_uid_set = set([row.user_id])
        like_boycotters_uid_set = set([])
    
    elif config['type'] in [
        'sample_users',
        'gender', 'age', 'power', 'state', 'genre',
        'genre_strict',
        'occupation',
    ]:
        identifier = i
        name = experimental_iteration['name']

        possible_boycotters_df = experimental_iteration['df']                        
        print(name)
        print(possible_boycotters_df.head())
        if args.userfrac != 1.0:
            boycotters_df = possible_boycotters_df.sample(frac=args.userfrac, random_state=(seed_base+i)*2)
        else:
            boycotters_df = possible_boycotters_df
        boycott_uid_set = set(boycotters_df.user_id)
        like_boycotters_df = possible_boycotters_df.drop(boycotters_df.index)
        like_boycotters_uid_set = set(like_boycotters_df.user_id)

    tic = time.time()

    mask_boycott_ratings = ratings_df.user_id.isin(boycott_uid_set)
    non_boycott_user_ratings_df = ratings_df[~mask_boycott_ratings] # makes a df copy
    print('isin time: {}'.format(time.time() - tic))

    boycott_ratings_df = None
    boycott_user_lingering_ratings_df = None
    tic = time.time()

    # BAD (slow) CODE warning: this part is pretty slow when simulating large boycotts for large datasets (e.g. 90% of ML-20M)
    # room for improvement
    if args.ratingfrac == 1.0: # skip this complicated stuff!
        boycott_ratings_df = ratings_df[mask_boycott_ratings]
        # copy the df but drop all rows
        boycott_user_lingering_ratings_df = boycott_ratings_df.drop(boycott_ratings_df.index)
    else:
        for uid in boycott_uid_set:
            ratings_belonging_to_user = ratings_df[ratings_df.user_id == uid]
            if args.ratingfrac != 1.0:
                boycott_ratings_for_user = ratings_belonging_to_user.sample(frac=args.ratingfrac, random_state=(seed_base+i)*3)
            else:
                boycott_ratings_for_user = ratings_belonging_to_user
            lingering_ratings_for_user = ratings_belonging_to_user.drop(boycott_ratings_for_user.index)
            if boycott_ratings_df is None:
                boycott_ratings_df = boycott_ratings_for_user
            else:
                boycott_ratings_df = pd.concat([boycott_ratings_df, boycott_ratings_for_user])
            if boycott_user_lingering_ratings_df is None:
                boycott_user_lingering_ratings_df = lingering_ratings_for_user
            else:
                boycott_user_lingering_ratings_df = pd.concat([boycott_user_lingering_ratings_df, lingering_ratings_for_user])
    print('going through each uid time: {}'.format(time.time() - tic))
    
    print('Iteration: {}'.format(i))
    print('Boycott ratings: {}, Lingering Ratings from Boycott Users: {}'.format(
        len(boycott_ratings_df.index), len(boycott_user_lingering_ratings_df.index)
    ))
    all_non_boycott_ratings_df = pd.concat(
        [non_boycott_user_ratings_df, boycott_user_lingering_ratings_df])

    print('Created dataframes', psutil.virtual_memory().used / (1024**3))

    nonboycott = Dataset.load_from_df(
        all_non_boycott_ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    ) # makes a copy
    boycott = Dataset.load_from_df(
        boycott_ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    ) # makes a copy
    # why are the Dataset objects taking up 4GB when the dataframe is only 760 MB???
    print('nonboycott.raw_ratings size', sys.getsizeof(nonboycott.raw_ratings))
    print('Created dataset objects', psutil.virtual_memory().used / (1024**3))

    identifier = str(identifier).zfill(4)
    num_users = len(all_non_boycott_ratings_df.user_id.value_counts())
    num_movies = len(all_non_boycott_ratings_df.movie_id.value_counts())
    num_ratings =  len(all_non_boycott_ratings_df.index)

    # make sure to save the set of boycott ids and like boycott ids
    experiment_identifier_to_uid_sets = {
        identifier: {}
    }
    experiment_identifier_to_uid_sets[identifier]['boycott_uid_set'] = ';'.join(str(x) for x in boycott_uid_set)
    experiment_identifier_to_uid_sets[identifier]['like_boycotters_uid_set'] = ';'.join(str(x) for x in like_boycotters_uid_set)

    save_path = outname.replace('results/', 'predictions/boycotts/{}__'.format(identifier)).replace('.csv', '_')
    if args.save_path == 'False':
        print('Since you passed --save_path False, predictions will NOT BE SAVED')
        save_path = None
    elif args.save_path is None:
        save_path = os.getcwd() + '/' + save_path
    else:
        save_path = args.save_path + '/' + save_path

    if args.load_path == 'False':
        load_path = None
    elif args.load_path is None:
        load_path = os.getcwd() + '/predictions/standards/{}_{}_'.format(args.dataset, algo_name)
    else:
        load_path = args.load_path + '/standards/{}_{}_'.format(args.dataset, algo_name)

    load_boycotts_path = save_path
    if args.load_boycotts_path is None:
        load_boycotts_path = None
    return (
        algo_name, algo, nonboycott, boycott, boycott_uid_set, like_boycotters_uid_set, MEASURES, NUM_FOLDS,
        False, identifier,
        num_ratings,
        num_users,
        num_movies, name,
        head_items, save_path, load_path, load_boycotts_path, data
    ), experiment_identifier_to_uid_sets
# Reconstruction of the trace definition whose opening is missing here; it assumes
# ratings_per_user (hypothetical name) holds the per-user rating counts being plotted.
trace = go.Histogram(x=ratings_per_user,
                     xbins=dict(start=0,
                                end=50,
                                size=2))

layout = go.Layout(title = 'Distribution Of Number of Ratings Per User',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Find the k for the KNN algorithm
file_path = os.path.expanduser('/Users/wuyanxu/Desktop/finaldata.csv')

reader = Reader(line_format='item user rating', sep=',')

data = Dataset.load_from_file(file_path, reader=reader)

sim_options = {'name': 'cosine',
               'user_based': True 
               }

min_mean = float("inf")
optimal_k = 1

for k in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    algo = KNNBasic(sim_options=sim_options, k=k)
    x = cross_validate(algo, data, verbose=True)
    cur_mean = np.mean(x['test_rmse'])
    if cur_mean < min_mean:
        # keep the k with the lowest mean cross-validated RMSE
        min_mean = cur_mean
        optimal_k = k
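
# Report the k selected by the sweep above (illustrative line, not in the original fragment).
print('Best k for KNNBasic by mean cross-validated RMSE: {}'.format(optimal_k))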
Example #17
df_train_ratings = pd.read_csv(
    './using-data/{}/train_ratings.csv'.format(data),
    header=0,
    dtype={
        'userId': np.int32,
        'movieId': np.int32,
        'rating': np.float64
    })
df_eval_ratings = pd.read_csv('./using-data/{}/eval_ratings.csv'.format(data),
                              header=0,
                              dtype={
                                  'userId': np.int32,
                                  'movieId': np.int32,
                                  'rating': np.float64
                              })

reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(
    df_train_ratings.loc[:, ['userId', 'movieId', 'rating']], reader)
test_data = Dataset.load_from_df(
    df_eval_ratings.loc[:, ['userId', 'movieId', 'rating']], reader)

trainset = train_data.build_full_trainset()
_, testset = train_test_split(test_data, test_size=.999)

lu = [0.2, 0.02, 0.002]
lv = [0.2, 0.02, 0.002]

for u in lu:
    for v in lv:
        algo_pmf = SVD(n_factors=50,
                       lr_all=0.005,
Example #18
import os

import pandas as pd
from surprise import BaselineOnly, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split

from steven.ratings_residuals_histogram import single_histogram, double_histogram
from steven.steven_baselines import MeanOfMeans

FILE_DIRECTORY = os.path.split(os.path.realpath(__file__))[0]
DATA_DIRECTORY = os.path.join(
    os.path.split(FILE_DIRECTORY)[0], 'data', 'movies')

if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))

    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)

    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # Train ALS model
    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)

    # Get the cross-validated RMSE of our predictions
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
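
    # Illustrative report of both error estimates (not in the original snippet):
    # the single train/test split RMSE versus the mean cross-validated RMSE.
    print('Hold-out RMSE: {:.4f}, cross-validated RMSE: {:.4f}'.format(rmse, cv_rmse))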
def main(args):
    """
    Run the sandbox experiments
    """
    out_prefix = 'out/' if args.send_to_out else ""
    times = OrderedDict()
    times['start'] = time.time()
    algos = ALGOS
    if args.movie_mean:
        algos = {
            'MovieMean': MovieMean(),
            'GlobalMean': GlobalMean(),
        }
    algos_for_standards = ALGOS_FOR_STANDARDS
    dfs = get_dfs(args.dataset)
    head_items = load_head_items(args.dataset)
    times['dfs_loaded'] = time.time() - times['start']
    print('Got dataframes, took {} seconds'.format(times['dfs_loaded']))
    print('Total examples: {}'.format(len(dfs['ratings'].index)))

    ratings_df, users_df, movies_df = dfs['ratings'], dfs['users'], dfs['movies']
    if args.mode == 'info':
        print(ratings_df.memory_usage(index=True))
        print(users_df.memory_usage(index=True))
        print(movies_df.memory_usage(index=True))

        print(ratings_df.info())
        print(users_df.info())
        return
    data = Dataset.load_from_df(
        ratings_df[['user_id', 'movie_id', 'rating']],
        reader=Reader()
    )
    times['data_constructed'] = time.time() - times['dfs_loaded']

    # note to reader: why are precision, recall, and ndcg all stuffed together in one string?
    # this ensures they will be computed all at once. Evaluation code will split them up for presentation
    metric_names = []
    for measure in MEASURES:
        if '_' in measure:
            splitnames = measure.lower().split('_')
            metric_names += splitnames
            metric_names += [x + '_frac' for x in splitnames]
            metric_names += ['tail' + x for x in splitnames]
        else:
            metric_names.append(measure.lower())
    metric_names = get_metric_names()
    if args.compute_standards:
        standard_results = defaultdict(list)
        for algo_name in algos_for_standards:
            for _ in range(args.num_standards):
                filename_ratingcv_standards = out_prefix + 'standard_results/{}_ratingcv_standards_for_{}.json'.format(
                    args.dataset, algo_name)

                print('Computing standard results for {}'.format(algo_name))
                if args.save_path is False:
                    save_path = None
                elif args.save_path is None:
                    save_path = os.getcwd() + '/' + out_prefix + 'predictions/standards/{}_{}_'.format(args.dataset, algo_name)
                else:
                    save_path = args.save_path

                if 'KNN' in algo_name and args.dataset == 'ml-20m':
                    # running this in parallel runs out of memory with KNN
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data, Dataset.load_from_df(pd.DataFrame(),
                        reader=Reader()), [], [], MEASURES, NUM_FOLDS, n_jobs=1, head_items=head_items,
                        save_path=save_path)
                else:
                    results = cross_validate_custom(
                        algos_for_standards[algo_name], data, Dataset.load_from_df(pd.DataFrame(),
                        reader=Reader()), [], [], MEASURES, NUM_FOLDS, head_items=head_items,
                        save_path=save_path)
                saved_results = {}
                for metric in metric_names:
                    saved_results[metric] = np.mean(results[metric + '_all'])
                    # frac_key = metric + '_frac_all'
                    # if frac_key in results:
                    #     saved_results[frac_key] = np.mean(results[frac_key])

                with open(filename_ratingcv_standards, 'w') as f:
                    json.dump(saved_results, f)
                    
                standard_results[algo_name].append(saved_results)
            standard_results_df = pd.DataFrame(standard_results[algo_name])
            print(standard_results_df.mean())
            standard_results_df.mean().to_csv('{}'.format(
                filename_ratingcv_standards).replace('.json', '_{}.csv'.format(
                    args.num_standards)
                )
            )

    experiment_configs = []
    if args.grouping == 'individual_users':
        experiment_configs += [{'type': 'individual_users', 'size': None}]
    elif args.grouping == 'sample':
        if args.sample_sizes:
            experiment_configs += [
                {
                    'type': 'sample_users', 'size': sample_size
                } for sample_size in args.sample_sizes]
        else:
            raise ValueError(
                'When using grouping="sample", you must provide a set of sample sizes'
            )
    elif args.grouping in [
        'gender', 'age', 'power', 'state', 'genre', 'genre_strict', 'occupation', 
    ]:
        experiment_configs += [{'type': args.grouping, 'size': None}]
    else:
        experiment_configs = []


    uid_to_error = {}
    experimental_iterations = []
    seed_base = args.indices[0]
    for config in experiment_configs:
        outname = out_prefix + concat_output_filename(
            args.dataset, config['type'], args.userfrac,
            args.ratingfrac,
            config['size'], args.num_samples, args.indices
        )
        if config['type'] == 'individual_users':
            experimental_iterations = list(users_df.iterrows())
        elif config['type'] == 'sample_users':
            experimental_iterations = [{
                'df': users_df.sample(config['size'], random_state=seed_base+index), # copies user_df
                'name': '{} user sample'.format(config['size'])
            } for index in range(args.num_samples)]
        elif config['type'] == 'gender':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_gender(users_df)
        elif config['type'] == 'age':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_age(users_df)
        elif config['type'] == 'state':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_state(users_df, dataset=args.dataset)
        elif config['type'] == 'genre':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre(
                    users_df=users_df, ratings_df=ratings_df, movies_df=movies_df,
                    dataset=args.dataset)
        elif config['type'] == 'genre_strict':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_genre_strict(
                    users_df=users_df, ratings_df=ratings_df, movies_df=movies_df,
                    dataset=args.dataset)
        elif config['type'] == 'power':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_power(users_df=users_df, ratings_df=ratings_df, dataset=args.dataset)
        elif config['type'] == 'occupation':
            for _ in range(args.num_samples):
                experimental_iterations += group_by_occupation(users_df)

        experiment_identifier_to_uid_sets = {}
        for algo_name in algos:
            prep_boycott_tasks = (
                delayed(prepare_boycott_task)(
                    i, experimental_iteration, args, config,
                    ratings_df, seed_base,
                    outname, algo_name, algos[algo_name], head_items, data
                ) for i, experimental_iteration in enumerate(experimental_iterations)
            )
            simulate_boycott_tasks = []
            tic = time.time()
            out = Parallel(n_jobs=-1, verbose=5, max_nbytes=None)((x for x in prep_boycott_tasks))
            for task_args, d in out:
                simulate_boycott_tasks.append(delayed(task)(*task_args))
                experiment_identifier_to_uid_sets.update(d)
            print('parallelized prep_boycott_task took {} seconds'.format(time.time() - tic))
            print('About to run Parallel() with {} tasks'.format(len(simulate_boycott_tasks)))
            out_dicts = Parallel(n_jobs=-1, verbose=5)((x for x in simulate_boycott_tasks))
            for d in out_dicts:
                res = d['subset_results']
                algo_name = d['algo_name']
                uid = str(d['identifier']) + '_' + d['algo_name']
                uid_to_error[uid] = {
                    'num_ratings': d['num_ratings'],
                    'num_users': d['num_users'],
                    'num_movies': d['num_movies'],
                    'name': d['name'],
                    'algo_name': d['algo_name'],
                }
                for metric in metric_names + ['fit_time', 'test_times', 'num_tested']:
                    for group in ['all', 'non-boycott', 'boycott', 'like-boycott', 'all-like-boycott']:
                        key = '{}_{}'.format(metric, group)
                        # if group in ['boycott', ]:
                        #     val = np.nanmean(res[key])
                        vals = res.get(key)
                        if vals:
                            val = np.mean(res[key])
                            uid_to_error[uid].update({
                                key: val,
                            })
                        standards_key = 'standards_' + key
                        standards_vals = res.get(standards_key)
                        if standards_vals:
                            standards_val = np.mean(res[standards_key])
                            uid_to_error[uid].update({
                                standards_key: standards_val,
                            })
        err_df = pd.DataFrame.from_dict(uid_to_error, orient='index')
        uid_sets_outname = outname.replace('results/', 'uid_sets/uid_sets_')
        pd.DataFrame.from_dict(experiment_identifier_to_uid_sets, orient='index').to_csv(uid_sets_outname)
        if args.movie_mean:
            outname = outname.replace('results/', 'results/MOVIEMEAN_')
        err_df.to_csv(outname)
        print('Full runtime was: {} for {} experimental iterations'.format(time.time() - times['start'], len(experimental_iterations)))