Example #1
0
def find_top_movies_per_genre(movies_per_genre, data_path):
    """Find the top movies of each genre, by number of ratings.

    Args:
      movies_per_genre: Number of top movies per genre.
      data_path: Path to the directory holding movies.csv and ratings.csv.

    Returns:
      Numpy array of unique movieIds.
    """
    # Read through `with` blocks so both handles are closed as soon as pandas
    # has parsed them, rather than being leaked.
    with file_util.open(f'{data_path}/movies.csv', 'r') as movies_file:
        movies_df = pd.read_csv(movies_file)
    with file_util.open(f'{data_path}/ratings.csv', 'r') as ratings_file:
        ratings_df = pd.read_csv(ratings_file)

    top_movies = np.array([], dtype=int)
    for genre in GENRES:
        # NOTE(review): `str.contains` interprets `genre` as a regular
        # expression by default; confirm no entry of GENRES contains regex
        # metacharacters.
        filter_genre = movies_df['genres'].str.contains(genre)
        # `movies_in_genre` is referenced from the query string below via `@`,
        # which is why the linter believes it is unused.
        movies_in_genre = movies_df[filter_genre]['movieId'].values  # pylint: disable=unused-variable
        movies_in_genre_by_num_ratings = ratings_df.query(
            'movieId in @movies_in_genre').groupby('movieId').size()
        # Keep the `movies_per_genre` most-rated movie ids of this genre.
        top_movies_in_genre = movies_in_genre_by_num_ratings.sort_values(
            ascending=False)[:movies_per_genre].index.values
        top_movies = np.append(top_movies, top_movies_in_genre)

    # A movie can be a top movie in several genres; report each id once.
    return np.unique(top_movies)
Example #2
0
  def test_read_write_files(self):
    """Checks that text written through file_util.open can be read back."""
    hello_path = '%s/hello.txt' % self.create_tempdir().full_path

    with file_util.open(hello_path, 'w') as writer:
      writer.write('hello!')

    with file_util.open(hello_path, 'r') as reader:
      contents = reader.read()
    self.assertEqual(contents, 'hello!')
Example #3
0
def write_csv_output(dataframes, directory):
    """Write the users, movies and ratings tables as CSV files.

    Args:
      dataframes: A (movies, users, ratings) triple; each element must provide
        a pandas-style `to_csv` method (presumably pandas DataFrames).
      directory: Directory that receives users.csv, movies.csv and
        ratings.csv. It is created with `file_util.makedirs` first.

    Raises:
      KeyError: If `movies` has no 'tag_id' column.
    """
    movies, users, ratings = dataframes
    file_util.makedirs(directory)

    # The tag_id column isn't necessary. Drop it on a copy so the caller's
    # dataframe is not modified as a side effect of writing.
    movies = movies.drop(columns=['tag_id'])

    # Each file is written inside a `with` block so the handle is flushed and
    # closed before the function returns.
    with file_util.open(os.path.join(directory, 'users.csv'), 'w') as outfile:
        users.to_csv(outfile, index=False, columns=['userId'])
    with file_util.open(os.path.join(directory, 'movies.csv'),
                        'w') as outfile:
        movies.to_csv(outfile, index=False)
    with file_util.open(os.path.join(directory, 'ratings.csv'),
                        'w') as outfile:
        ratings.to_csv(outfile, index=False)
Example #4
0
 def test_plotting_works(self):
     """Tests whether the plotting feature in the evaluation function works."""
     with tempfile.TemporaryDirectory(dir=FLAGS.test_tmpdir) as tmpdirname:
         figure_file_path = os.path.join(tmpdirname, 'test_plot.png')
         with file_util.open(figure_file_path, 'wb') as figure_file_obj:
             evaluation.evaluate_agent(
                 self.agent,
                 self.env,
                 alpha=self.config.alpha,
                 num_users=10,
                 scatter_plot_trajectories=True,
                 figure_file_obj=figure_file_obj,
                 risk_score_extractor=evaluation.health_risk)
         # Read the figure back through a context manager so the handle is
         # closed before the temporary directory is cleaned up.
         with file_util.open(figure_file_path, 'rb') as written_figure:
             filecontents = written_figure.read()
         self.assertNotEmpty(filecontents)
def _setup_directories(config):
    """Creates the results directory and records the config inside it.

    Args:
      config: Mapping providing at least 'results_dir' and 'experiment_name'.
        The output of `fg_core.to_json(config)` is written to
        `<results_dir>/<experiment_name>_info.txt`.
    """
    results_dir = config['results_dir']
    file_util.makedirs(results_dir)

    info_filename = config['experiment_name'] + '_info.txt'
    info_path = os.path.join(results_dir, info_filename)
    with file_util.open(info_path, 'w') as info_file:
        info_file.write(fg_core.to_json(config))
Example #6
0
    def _read_users(self, path):
        """Returns a dict of User objects.

        Args:
          path: Path of a CSV file with a `userId` column.

        Returns:
          Dict mapping each row's userId to the object built by
          `self._user_ctor(user_id=...)`.
        """
        # Close the handle as soon as the CSV has been parsed.
        with file_util.open(path) as users_file:
            users_df = pd.read_csv(users_file)

        users = {}
        for _, row in users_df.iterrows():
            users[row.userId] = self._user_ctor(user_id=row.userId)

        return users
Example #7
0
def _setup_directories(config):
    """Creates the results and runs directories and records the config.

    Args:
      config: Mapping providing at least 'results_dir' and 'experiment_name'.
        The output of `fg_core.to_json(config)` is written to
        `<results_dir>/<experiment_name>_info.txt`.
    """
    directories = (config['results_dir'],
                   './runs/{}'.format(config['experiment_name']))
    # Guard each directory on its own: with a single try block, an already
    # existing results directory would skip creating the runs directory.
    for directory in directories:
        try:
            file_util.makedirs(directory)
        except FileExistsError:
            pass
    with file_util.open(
            os.path.join(config['results_dir'],
                         config['experiment_name'] + '_info.txt'),
            'w') as outfile:
        outfile.write(fg_core.to_json(config))
Example #8
0
def load_json_pickle(path):
  """Attempt to load an array-like object from a file path.

  The file is first parsed as json; if that fails it is re-opened and loaded
  as a pickle.

  Args:
    path: File to load.

  Returns:
    File loaded via either json or pickle.

  Raises:
    ValueError: If file could not be loaded as either json or pickle
  """
  try:
    # `with` closes the handle whether or not parsing succeeds.
    with file_util.open(path, 'rb') as infile:
      return json.load(infile)
  except ValueError:
    logging.debug(('File could not be loaded as json, falling back to pickle: '
                   '%s'), path)

  # NOTE: unpickling can execute arbitrary code; only use on trusted files.
  try:
    with file_util.open(path, 'rb') as infile:
      return pickle.load(infile)
  except (ValueError, pickle.UnpicklingError, EOFError) as err:
    # pickle reports malformed input mostly through UnpicklingError/EOFError,
    # which are not ValueErrors; translate them to the documented ValueError.
    raise ValueError(
        f'File could not be loaded as json or pickle: {path}') from err
def write_csv_output(dataframe, filename, directory):
    """Write dataframe to CSV.

    Args:
      dataframe: pandas DataFrame
      filename: name of the file (should end in ".csv")
      directory: directory to write to; created with `file_util.makedirs`.

    Raises:
      ValueError: If `filename` does not end in ".csv".
    """
    if not filename.endswith('.csv'):
        raise ValueError('Filename does not end in .csv')
    file_util.makedirs(directory)

    # Write inside a `with` block so the handle is flushed and closed before
    # returning.
    with file_util.open(os.path.join(directory, filename), 'w') as outfile:
        dataframe.to_csv(outfile, index=False)
def load_embeddings(env_config):
    """Attempts to load user and movie embeddings from a json or pickle file.

    Args:
      env_config: Config object providing `embeddings_path`,
        `embedding_movie_key` and `embedding_user_key`.

    Returns:
      A `types.SimpleNamespace` with numpy arrays `movies` and `users`.

    Raises:
      ValueError: If the path suffix is not .json, .pkl or .pickle.
    """
    path = env_config.embeddings_path
    suffix = pathlib.Path(path).suffix
    if suffix == '.json':
        loader = json
        logging.info('Reading a json file. %s', path)
    elif suffix in ('.pkl', '.pickle'):
        # NOTE: unpickling can execute arbitrary code; only point
        # `embeddings_path` at trusted files.
        loader = pickle
        logging.info('Reading a pickle file. %s', path)
    else:
        raise ValueError('Unrecognized file type! %s' % path)

    # Close the handle once the embeddings have been deserialized.
    with file_util.open(path, 'rb') as infile:
        embedding_dict = loader.load(infile)
    return types.SimpleNamespace(
        movies=np.array(embedding_dict[env_config.embedding_movie_key]),
        users=np.array(embedding_dict[env_config.embedding_user_key]))
def main(argv):
    """Combines per-seed cumulative recall files into a single plot.

    For each of the 'static' and 'dynamic' settings, reads
    `<source_path>/<seed>/cumulative_recall_<setting>.txt` for every seed in
    `range(FLAGS.num_trials)`, stacks the results and passes them to
    `lending_plots.plot_cumulative_recall_differences`.

    Args:
      argv: Command-line arguments; anything beyond the program name is
        rejected.

    Raises:
      app.UsageError: If extra command-line arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    recall_values = {}
    for setting in ['static', 'dynamic']:
        # Format only the file name. Applying `%` to the whole joined path
        # would misinterpret any '%' that appears in the source path.
        filename = 'cumulative_recall_%s.txt' % setting
        recall = []
        for seed in range(FLAGS.num_trials):
            with file_util.open(
                    os.path.join(FLAGS.source_path, str(seed), filename),
                    'r') as infile:
                recall.append(np.loadtxt(infile))
        # Stack once per setting instead of after every seed. With zero trials
        # the setting is left out, as before.
        if recall:
            recall_values[setting] = np.vstack(recall)

    lending_plots.plot_cumulative_recall_differences(
        recall_values,
        path=os.path.join(FLAGS.plotting_dir, 'combined_simpsons_paradox.pdf'))
Example #12
0
def find_top_movies_overall(num_movies, data_path):
    """Find the top movies overall, by number of ratings.

    Args:
      num_movies: Number of movies to return.
      data_path: Path to the directory holding ratings.csv.

    Returns:
      Numpy array of unique movieIds.
    """
    # Close the handle as soon as the CSV has been parsed.
    with file_util.open(f'{data_path}/ratings.csv', 'r') as ratings_file:
        ratings_df = pd.read_csv(ratings_file)

    # Count ratings per movie and keep the ids of the `num_movies` most rated.
    top_movies = ratings_df.groupby('movieId').size().sort_values(
        ascending=False)[:num_movies].index.values

    top_movies = np.unique(top_movies)

    return top_movies
    def _read_movies(self, path):
        """Returns a dict of Movie objects.

        Args:
          path: Path of a CSV file with `movieId`, `title`, `genres`
            ('|'-separated) and `violence_tag_relevance` columns.

        Returns:
          Dict mapping movieId to the object built by `self._movie_ctor`.
        """
        # Close the handle as soon as the CSV has been parsed.
        with file_util.open(path) as movies_file:
            movie_df = pd.read_csv(movies_file)

        movies = {}
        for _, row in movie_df.iterrows():
            # Genres missing from GENRE_MAP fall back to OTHER_GENRE_IDX.
            genres = [
                GENRE_MAP.get(genre, OTHER_GENRE_IDX)
                for genre in row.genres.split('|')
            ]
            assert isinstance(row.movieId, int)
            movie_id = row.movieId
            # `movie_vec` is left as None, and will be filled in later in the init
            # of this Dataset.
            movies[movie_id] = self._movie_ctor(
                movie_id,
                row.title,
                genres,
                vec=None,
                violence=row.violence_tag_relevance)
        return movies
def _write(path):
    """Write the current matplotlib figure to a path as a PNG.

    Args:
      path: Destination path. Nothing is written when it is empty or None.
    """
    if path:
        # PNG data is binary, so the file must be opened in "wb" mode; the
        # `with` block closes the handle once the figure has been saved.
        with file_util.open(path, "wb") as outfile:
            plt.savefig(outfile, format="png")
Example #15
0
    def setUp(self):
        """Builds a temp dir, pickled random embeddings and a test config."""
        super().setUp()
        self.tempdir = tempfile.mkdtemp()
        self.test_data_dir = os.path.join(
            FLAGS.test_srcdir,
            os.path.split(os.path.abspath(__file__))[0],
            '../environments/recommenders/test_data')

        num_rows = 5
        embedding_dim = 55
        user_vectors = np.zeros((num_rows, embedding_dim))
        movie_vectors = np.zeros((num_rows, embedding_dim))

        # Range of the dot product is [-2, 2]
        for vectors in (user_vectors, movie_vectors):
            vectors[:, 1] = (np.random.rand(5) - 0.5) * 4

        # Add a bias term of 3.0 as dim 0.
        user_vectors[:, 0] = 1.0
        movie_vectors[:, 0] = 3.0

        pickle_file = os.path.join(self.tempdir, 'embeddings.pkl')
        with file_util.open(pickle_file, 'wb') as pickle_out:
            pickle.dump({'users': user_vectors, 'movies': movie_vectors},
                        pickle_out)

        self.env_config = movie_lens_dynamic.EnvConfig(
            seeds=movie_lens_dynamic.Seeds(0, 0),
            data_dir=self.test_data_dir,
            train_eval_test=[0.6, 0.2, 0.2],
            embeddings_path=pickle_file)

        config = {
            'results_dir': self.tempdir,
            'max_episode_length': 2,
            'initial_lambda': 0,
            'beta': 0.5,
            'alpha': 0.95,
            'experiment_suffix': 'my_test_expriment',
            'lambda_learning_rate': 0.1,
            'var_learning_rate': 0.1,
            'learning_rate': 0.1,
            'embedding_size': 10,
            'user_embedding_size': 0,
            'hidden_size': 4,
            'num_hidden_layers': 4,
            'num_users_eval': 4,
            'num_users_eval_final': 4,
            'num_episodes_per_update': 8,
            'optimizer_name': 'Adam',
            'num_updates': 2,
            'eval_deterministic': True,
            'gamma': 0.95,
            'baseline_value': 0.1,
            'momentum': 0.1,
            'clipnorm': 0.1,
            'clipval': 0.1,
            'checkpoint_every': 1,
            'lr_scheduler': 1,
            'eval_every': 1,
            'regularization_coeff': 0.5,
            'agent_seed': 103,
            'initial_agent_model': None,
            'activity_regularization_coeff': 0.1,
            'dropout': 0.1,
            'stateful_rnn': True,
        }
        config['user_id_input'] = config['user_embedding_size'] > 0

        # The experiment name is a digest of the settings gathered so far; it
        # is computed before 'experiment_name' and 'env_config' are added.
        settings_repr = repr(sorted(config.items()))
        config['experiment_name'] = 'id_' + hashlib.sha1(
            settings_repr.encode()).hexdigest()
        config['env_config'] = self.env_config
        self.config = config
Example #16
0
def do_plotting(maximize_reward_result,
                equality_of_opportunity_result,
                static_equality_of_opportunity_result,
                plotting_dir,
                options=None):
    """Creates the requested plots and text dumps inside a directory.

    Args:
      maximize_reward_result: The results from an experiment with a max-util
        agent.
      equality_of_opportunity_result: The results from an experiment with an
        agent constrained by equality of opportunity.
      static_equality_of_opportunity_result: The results from an experiment
        with an agent constrained by equality of opportunity without
        long-term credit dynamics.
      plotting_dir: A directory to write the plots.
      options: A set of PlotType enums that indicate which plots to create.
        If None, create everything.
    """
    selected = set(PlotTypes) if options is None else options

    def _in_plot_dir(filename):
        # Every output file lives directly inside `plotting_dir`.
        return os.path.join(plotting_dir, filename)

    if PlotTypes.CREDIT_DISTRIBUTIONS in selected:
        max_util_metrics = maximize_reward_result['metric_results']
        plot_credit_distribution(
            max_util_metrics['initial_credit_distribution'],
            'Initial',
            path=_in_plot_dir('initial.pdf'))
        plot_credit_distribution(
            max_util_metrics['final_credit_distributions'],
            title=MAX_UTIL_TITLE,
            path=_in_plot_dir('max_utility.pdf'))
        eq_opp_metrics = equality_of_opportunity_result['metric_results']
        plot_credit_distribution(
            eq_opp_metrics['final_credit_distributions'],
            title=EQ_OPP_TITLE,
            path=_in_plot_dir('equalize_opportunity.pdf'))

    if PlotTypes.CUMULATIVE_LOANS in selected:
        loans_by_agent = {}
        loans_by_agent['max reward'] = (
            maximize_reward_result['metric_results']['cumulative_loans'])
        loans_by_agent['equal-opp'] = (
            equality_of_opportunity_result['metric_results']
            ['cumulative_loans'])
        plot_cumulative_loans(loans_by_agent,
                              _in_plot_dir('cumulative_loans.pdf'))

    if PlotTypes.THRESHOLD_HISTORY in selected:
        threshold_history = (
            equality_of_opportunity_result['agent']['threshold_history'])
        plot_threshold_history(threshold_history,
                               _in_plot_dir('threshold_history.pdf'))

    if PlotTypes.MEAN_CREDIT_OVER_TIME in selected:
        history_by_agent = {}
        history_by_agent['max reward'] = (
            maximize_reward_result['environment']['history'])
        history_by_agent['equal-opp'] = (
            equality_of_opportunity_result['environment']['history'])
        plot_mu(history_by_agent, _in_plot_dir('mu.pdf'))

    if PlotTypes.CUMULATIVE_RECALLS in selected:
        # Recall values are dumped as text next to the plots.
        with file_util.open(_in_plot_dir('cumulative_recall_dynamic.txt'),
                            'w') as recall_file:
            np.savetxt(
                recall_file,
                equality_of_opportunity_result['metric_results']
                ['cumulative_recall'])

        with file_util.open(_in_plot_dir('target_recall_dynamic.txt'),
                            'w') as target_file:
            tpr_targets = equality_of_opportunity_result['agent'][
                'tpr_targets']
            np.savetxt(target_file, list(tpr_targets.values()))

        plot_recall_targets(
            equality_of_opportunity_result['agent']['tpr_targets'][(0, 1)],
            _in_plot_dir('target_recall_dynamic.pdf'))

        with file_util.open(_in_plot_dir('cumulative_recall_static.txt'),
                            'w') as static_file:
            np.savetxt(
                static_file,
                static_equality_of_opportunity_result['metric_results']
                ['cumulative_recall'])
Example #17
0
def _write(path):
    """Write the current matplotlib figure to a path as a PNG.

    Args:
      path: Destination path. Nothing is written when it is empty or None.
    """
    if path:
        # PNG data is binary, so the file must be opened in 'wb' mode; the
        # `with` block closes the handle once the figure has been saved.
        with file_util.open(path, 'wb') as outfile:
            plt.savefig(outfile, format='png')
Example #18
0
    def _populate_genre_history(self,
                                data=None,
                                save_path=None,
                                min_freq=None):
        """Modifies stored users with watch history.

        Most users have watched at least one movie from each genre.  With
        that in mind, a minimum threshold is used to make this a non-trivial
        user feature.

        Because users vary in the number of movies they have watched, a genre
        is included in the genre history if and only if movies from this
        genre make up more than a certain fraction (specified by `min_freq`)
        of movies rated by a given user.

        Args:
          data: Numpy array specifying genre history.  If None, then this is
            generated from the raw data files.
          save_path: If specified (and `data` is None), then generate and
            save genre history to this path.
          min_freq: Minimum fraction of watches from a genre to qualify as
            part of history, given as a float in [0., 1.].  Required when
            `data` is None.

        Raises:
          ValueError: If both `data` and `save_path` are given, if `min_freq`
            is outside [0, 1], if some movie has no genre, or if the userIds
            in the ratings file are not exactly 0..NUM_USERS-1.
          TypeError: If `data` is given but is not a numpy array.

        Updates: For each user, populates the genre history as a multi-hot
          encoding, a numpy array of length `len(GENRES)`.
        """

        if data is not None:
            if save_path:
                raise ValueError('If data is given, save path must be None')

            if not isinstance(data, np.ndarray):
                raise TypeError('Provided data is not a numpy array')

            user_genre_history = np.copy(data)

        else:
            if min_freq < 0 or min_freq > 1:  # pytype: disable=unsupported-operands
                raise ValueError(
                    f'Expected min_freq in [0, 1], got {min_freq:.3f}')
            # Close both handles as soon as the CSVs have been parsed.
            with file_util.open(self._movie_path) as movie_file:
                movie_df = pd.read_csv(movie_file)
            with file_util.open(self._rating_path) as rating_file:
                ratings_df = pd.read_csv(rating_file)

            # Populate multi-hot encoding of genres for each movie
            genre_vecs = np.zeros((NUM_MOVIES, len(GENRES)))

            # NOTE(review): rows are written at the DataFrame index `idx` but
            # read back below by `movieId`; this assumes the two coincide.
            for idx, row in movie_df.iterrows():
                genres = [
                    GENRE_MAP.get(genre, OTHER_GENRE_IDX)
                    for genre in row.genres.split('|')
                ]

                genre_vecs[idx, genres] = 1

            if not np.all(genre_vecs.sum(axis=1) > 0):
                raise ValueError('Some movies have no genres')

            # For each user, track the total number of movies of each genre,
            # where a movie can count for multiple genres.
            user_history = np.zeros((NUM_USERS, len(GENRES)))

            for _, row in ratings_df.iterrows():
                user_history[row.userId] += genre_vecs[row.movieId]

            # Normalize by the total number of watches for each user
            num_watches = ratings_df.groupby('userId').size()
            if not np.array_equal(num_watches.index.values,
                                  np.arange(NUM_USERS)):
                raise ValueError('userId in provided ratings file has gaps')
            user_genre_per_watch = user_history / np.array(num_watches)[:,
                                                                        None]
            # A genre is part of the history only when its share of the
            # user's watches strictly exceeds `min_freq`.
            user_genre_history = np.array(user_genre_per_watch > min_freq,
                                          dtype=int)

            if save_path:
                with file_util.open(save_path, 'wb') as f:
                    pickle.dump(user_genre_history, f)

        # Each user gets independent copies so later edits to one attribute
        # do not alias the other or the shared array.
        for user_ in self.get_users():
            user_.initial_genre_history = np.copy(
                user_genre_history[user_.user_id])
            user_.genre_history = np.copy(user_genre_history[user_.user_id])