def find_top_movies_per_genre(movies_per_genre, data_path):
    """Find the top movies of each genre, by number of ratings.

    Args:
      movies_per_genre: Number of top movies per genre.
      data_path: Path to the directory holding movies.csv and ratings.csv.

    Returns:
      Numpy array of unique movieIds (sorted, as produced by `np.unique`).
    """
    # Read each CSV inside a context manager so the handle is closed as soon
    # as the frame is loaded instead of being leaked.
    with file_util.open(f'{data_path}/movies.csv', 'r') as movies_file:
        movies_df = pd.read_csv(movies_file)
    with file_util.open(f'{data_path}/ratings.csv', 'r') as ratings_file:
        ratings_df = pd.read_csv(ratings_file)

    top_movies = np.array([], dtype=int)
    for genre in GENRES:
        # A movie belongs to a genre when the genre name occurs in its
        # `genres` string.
        filter_genre = movies_df['genres'].str.contains(genre)
        # `movies_in_genre` is referenced by name inside the query string.
        movies_in_genre = movies_df[filter_genre]['movieId'].values  # pylint: disable=unused-variable
        movies_in_genre_by_num_ratings = ratings_df.query(
            'movieId in @movies_in_genre').groupby('movieId').size()
        top_movies_in_genre = movies_in_genre_by_num_ratings.sort_values(
            ascending=False)[:movies_per_genre].index.values
        top_movies = np.append(top_movies, top_movies_in_genre)
    # A movie can be a top movie in several genres; report it only once.
    return np.unique(top_movies)
def test_read_write_files(self):
    """Checks that text written via file_util.open can be read back."""
    test_root = self.create_tempdir().full_path
    hello_path = '%s/hello.txt' % test_root

    with file_util.open(hello_path, 'w') as outfile:
        outfile.write('hello!')

    with file_util.open(hello_path, 'r') as readfile:
        contents = readfile.read()
    self.assertEqual(contents, 'hello!')
def write_csv_output(dataframes, directory):
    """Write csv file outputs.

    Writes `users.csv` (only the `userId` column), `movies.csv` (without the
    `tag_id` column) and `ratings.csv` into `directory`, creating the
    directory first.

    Args:
      dataframes: A `(movies, users, ratings)` triple of pandas DataFrames.
      directory: Directory to write the three CSV files to.

    Raises:
      KeyError: If `movies` has no `tag_id` column.
    """
    movies, users, ratings = dataframes
    file_util.makedirs(directory)
    # This column isn't necessary. Drop it from a new frame rather than
    # deleting it in place, so the caller's DataFrame is left untouched and
    # the function can be called again with the same input.
    movies = movies.drop(columns=['tag_id'])

    # Each file is written inside a context manager so it is flushed and
    # closed deterministically.
    with file_util.open(os.path.join(directory, 'users.csv'), 'w') as outfile:
        users.to_csv(outfile, index=False, columns=['userId'])
    with file_util.open(os.path.join(directory, 'movies.csv'), 'w') as outfile:
        movies.to_csv(outfile, index=False)
    with file_util.open(os.path.join(directory, 'ratings.csv'), 'w') as outfile:
        ratings.to_csv(outfile, index=False)
def test_plotting_works(self):
    """Tests whether the plotting feature in the evaluation function works."""
    with tempfile.TemporaryDirectory(dir=FLAGS.test_tmpdir) as tmpdirname:
        figure_file_path = os.path.join(tmpdirname, 'test_plot.png')
        with file_util.open(figure_file_path, 'wb') as figure_file_obj:
            evaluation.evaluate_agent(
                self.agent,
                self.env,
                alpha=self.config.alpha,
                num_users=10,
                scatter_plot_trajectories=True,
                figure_file_obj=figure_file_obj,
                risk_score_extractor=evaluation.health_risk)
        # Read the figure back through a context manager so the handle is
        # closed before the temporary directory is cleaned up.
        with file_util.open(figure_file_path, 'rb') as written_figure:
            filecontents = written_figure.read()
        self.assertNotEmpty(filecontents)
def _setup_directories(config):
    """Creates the results directory and stores the config in an info file.

    The file is named `<experiment_name>_info.txt`, lives in
    `config['results_dir']`, and holds the output of `fg_core.to_json(config)`.
    """
    file_util.makedirs(config['results_dir'])
    info_filename = config['experiment_name'] + '_info.txt'
    info_path = os.path.join(config['results_dir'], info_filename)
    with file_util.open(info_path, 'w') as outfile:
        serialized_config = fg_core.to_json(config)
        outfile.write(serialized_config)
def _read_users(self, path):
    """Returns a dict of User objects.

    Args:
      path: Path to a CSV file with a `userId` column.

    Returns:
      A dict mapping each row's `userId` to the object built by
      `self._user_ctor(user_id=...)`.
    """
    # Load the whole CSV up front so the handle can be closed immediately
    # instead of being leaked.
    with file_util.open(path) as users_file:
        users_df = pd.read_csv(users_file)

    users = {}
    for _, row in users_df.iterrows():
        users[row.userId] = self._user_ctor(user_id=row.userId)
    return users
def _setup_directories(config):
    """Creates the output directories and stores the config in an info file.

    Ensures both `config['results_dir']` and `./runs/<experiment_name>` exist,
    then writes the output of `fg_core.to_json(config)` to
    `<results_dir>/<experiment_name>_info.txt`.

    Args:
      config: Mapping with at least the keys `results_dir` and
        `experiment_name`.
    """
    directories = (
        config['results_dir'],
        './runs/{}'.format(config['experiment_name']),
    )
    for directory in directories:
        # Guard each directory separately: an already-existing results
        # directory must not prevent the runs directory from being created.
        try:
            file_util.makedirs(directory)
        except FileExistsError:
            pass  # Already present, which is exactly what is wanted.

    with file_util.open(
        os.path.join(config['results_dir'],
                     config['experiment_name'] + '_info.txt'),
        'w') as outfile:
        outfile.write(fg_core.to_json(config))
def load_json_pickle(path):
    """Attempt to load an array-like object from a file path.

    The file is first parsed as json; if that fails it is parsed as a pickle.

    Args:
      path: File to load.

    Returns:
      File loaded via either json or pickle.

    Raises:
      ValueError: If file could not be loaded as either json or pickle
    """
    try:
        with file_util.open(path, 'rb') as infile:
            return json.load(infile)
    except ValueError:
        # json decoding errors (and undecodable bytes) are ValueErrors.
        logging.debug(('File could not be loaded as json, falling back to pickle: '
                       '%s'), path)

    try:
        with file_util.open(path, 'rb') as infile:
            # NOTE(review): unpickling can execute arbitrary code; this must
            # only be used on trusted files.
            return pickle.load(infile)
    except (ValueError, EOFError, pickle.UnpicklingError) as err:
        # Files that are not pickles usually fail with UnpicklingError or
        # EOFError rather than ValueError; translate them so callers see the
        # documented exception type.
        raise ValueError(
            f'File could not be loaded as json or pickle: {path}') from err
def write_csv_output(dataframe, filename, directory):
    """Write dataframe to CSV.

    Args:
      dataframe: pandas DataFrame
      filename: name of the file (should end in ".csv")
      directory: directory to write to; it is created before writing

    Raises:
      ValueError: If `filename` does not end in ".csv".
    """
    if not filename.endswith('.csv'):
        raise ValueError('Filename does not end in .csv')
    file_util.makedirs(directory)
    # Write inside a context manager so the output is flushed and the handle
    # is closed deterministically.
    with file_util.open(os.path.join(directory, filename), 'w') as outfile:
        dataframe.to_csv(outfile, index=False)
def load_embeddings(env_config):
    """Attempts to load user and movie embeddings from a json or pickle file.

    The loader is chosen from the suffix of `env_config.embeddings_path`:
    `.json` uses json, `.pkl` / `.pickle` use pickle.

    Args:
      env_config: Object with the attributes `embeddings_path`,
        `embedding_movie_key` and `embedding_user_key`.

    Returns:
      A `types.SimpleNamespace` with numpy arrays `movies` and `users`.

    Raises:
      ValueError: If the path has an unrecognized suffix.
    """
    path = env_config.embeddings_path
    suffix = pathlib.Path(path).suffix
    if suffix == '.json':
        loader = json
        logging.info('Reading a json file. %s', path)
    elif suffix in ('.pkl', '.pickle'):
        # NOTE(review): unpickling can execute arbitrary code; only point this
        # at trusted files.
        loader = pickle
        logging.info('Reading a pickle file. %s', path)
    else:
        raise ValueError('Unrecognized file type! %s' % path)

    # Load inside a context manager so the handle is closed once parsed.
    with file_util.open(path, 'rb') as infile:
        embedding_dict = loader.load(infile)
    return types.SimpleNamespace(
        movies=np.array(embedding_dict[env_config.embedding_movie_key]),
        users=np.array(embedding_dict[env_config.embedding_user_key]))
def main(argv):
    """Combines per-seed cumulative recall files into one comparison plot.

    For each of the `static` and `dynamic` settings, loads
    `<source_path>/<seed>/cumulative_recall_<setting>.txt` for every seed in
    `range(FLAGS.num_trials)`, stacks the results, and passes them to
    `lending_plots.plot_cumulative_recall_differences`.

    Args:
      argv: Command-line arguments; anything beyond the program name is
        rejected.

    Raises:
      app.UsageError: If extra command-line arguments are given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    recall_values = {}
    for setting in ['static', 'dynamic']:
        # Format only the filename, never the joined path: a '%' anywhere in
        # FLAGS.source_path must not be treated as a format directive.
        filename = 'cumulative_recall_%s.txt' % setting
        recall = []
        for seed in range(FLAGS.num_trials):
            recall_path = os.path.join(FLAGS.source_path, str(seed), filename)
            with file_util.open(recall_path, 'r') as infile:
                recall.append(np.loadtxt(infile))
        recall_values[setting] = np.vstack(recall)

    lending_plots.plot_cumulative_recall_differences(
        recall_values,
        path=os.path.join(FLAGS.plotting_dir, 'combined_simpsons_paradox.pdf'))
def find_top_movies_overall(num_movies, data_path):
    """Find the top movies overall, by number of ratings.

    Args:
      num_movies: Number of movies to return.
      data_path: Path to the directory holding ratings.csv.

    Returns:
      Numpy array of unique movieIds (sorted, as produced by `np.unique`).
    """
    # Read inside a context manager so the handle is closed once loaded.
    with file_util.open(f'{data_path}/ratings.csv', 'r') as ratings_file:
        ratings_df = pd.read_csv(ratings_file)

    top_movies = ratings_df.groupby('movieId').size().sort_values(
        ascending=False)[:num_movies].index.values
    top_movies = np.unique(top_movies)
    return top_movies
def _read_movies(self, path):
    """Returns a dict of Movie objects.

    Args:
      path: Path to a CSV file with the columns `movieId`, `title`, `genres`
        (a `|`-separated string) and `violence_tag_relevance`.

    Returns:
      A dict mapping each `movieId` to the object built by `self._movie_ctor`.
    """
    # Load the whole CSV up front so the handle can be closed immediately
    # instead of being leaked.
    with file_util.open(path) as movies_file:
        movie_df = pd.read_csv(movies_file)

    movies = {}
    for _, row in movie_df.iterrows():
        # Genre names missing from GENRE_MAP fall back to the "other" index.
        genres = [
            GENRE_MAP.get(genre, OTHER_GENRE_IDX)
            for genre in row.genres.split('|')
        ]
        assert isinstance(row.movieId, int)
        movie_id = row.movieId
        # `movie_vec` is left as None, and will be filled in later in the init
        # of this Dataset.
        movies[movie_id] = self._movie_ctor(
            movie_id,
            row.title,
            genres,
            vec=None,
            violence=row.violence_tag_relevance)
    return movies
def _write(path):
    """Write a plot to a path.

    Saves via `plt.savefig` in PNG format. Does nothing when `path` is falsy
    (e.g. None or an empty string).

    Args:
      path: Destination file path, or a falsy value to skip writing.
    """
    if path:
        # PNG is binary data, so open in binary mode; the context manager
        # makes sure the image is flushed and the handle closed.
        with file_util.open(path, "wb") as outfile:
            plt.savefig(outfile, format="png")
def setUp(self):
    """Prepares a temp dir, a pickled embeddings file and the test configs."""
    super().setUp()
    self.tempdir = tempfile.mkdtemp()
    module_dir = os.path.split(os.path.abspath(__file__))[0]
    self.test_data_dir = os.path.join(
        FLAGS.test_srcdir, module_dir,
        '../environments/recommenders/test_data')

    # Five users and five movies with 55-dimensional, mostly-zero embeddings.
    embedding_shape = (5, 55)
    user_emb = np.zeros(embedding_shape)
    movie_emb = np.zeros(embedding_shape)
    # Column 1 holds values drawn uniformly from [-2, 2).
    user_emb[:, 1] = 4 * (np.random.rand(5) - 0.5)
    movie_emb[:, 1] = 4 * (np.random.rand(5) - 0.5)
    # Column 0 contributes a constant 1.0 * 3.0 = 3.0 bias to every user/movie
    # dot product.
    user_emb[:, 0] = 1.0
    movie_emb[:, 0] = 3.0

    pickle_file = os.path.join(self.tempdir, 'embeddings.pkl')
    with file_util.open(pickle_file, 'wb') as outfile:
        pickle.dump({'users': user_emb, 'movies': movie_emb}, outfile)

    self.env_config = movie_lens_dynamic.EnvConfig(
        seeds=movie_lens_dynamic.Seeds(0, 0),
        data_dir=self.test_data_dir,
        train_eval_test=[0.6, 0.2, 0.2],
        embeddings_path=pickle_file)

    config = {
        'results_dir': self.tempdir,
        'max_episode_length': 2,
        'initial_lambda': 0,
        'beta': 0.5,
        'alpha': 0.95,
        'experiment_suffix': 'my_test_expriment',
        'lambda_learning_rate': 0.1,
        'var_learning_rate': 0.1,
        'learning_rate': 0.1,
        'embedding_size': 10,
        'user_embedding_size': 0,
        'hidden_size': 4,
        'num_hidden_layers': 4,
        'num_users_eval': 4,
        'num_users_eval_final': 4,
        'num_episodes_per_update': 8,
        'optimizer_name': 'Adam',
        'num_updates': 2,
        'eval_deterministic': True,
        'gamma': 0.95,
        'baseline_value': 0.1,
        'momentum': 0.1,
        'clipnorm': 0.1,
        'clipval': 0.1,
        'checkpoint_every': 1,
        'lr_scheduler': 1,
        'eval_every': 1,
        'regularization_coeff': 0.5,
        'agent_seed': 103,
        'initial_agent_model': None,
        'activity_regularization_coeff': 0.1,
        'dropout': 0.1,
        'stateful_rnn': True,
    }
    config['user_id_input'] = config['user_embedding_size'] > 0
    # The experiment name is a digest of every setting above; it is computed
    # before the env config is attached, so that entry is not part of it.
    settings_repr = repr(sorted(config.items()))
    config['experiment_name'] = (
        'id_' + hashlib.sha1(settings_repr.encode()).hexdigest())
    config['env_config'] = self.env_config
    self.config = config
def do_plotting(maximize_reward_result,
                equality_of_opportunity_result,
                static_equality_of_opportunity_result,
                plotting_dir,
                options=None):
    """Produces the requested plots and text dumps inside `plotting_dir`.

    Args:
      maximize_reward_result: Experiment results for the max-util agent.
      equality_of_opportunity_result: Experiment results for the agent
        constrained by equality of opportunity.
      static_equality_of_opportunity_result: Experiment results for the
        equality-of-opportunity agent without long-term credit dynamics.
      plotting_dir: Directory that receives every output file.
      options: Set of PlotTypes members selecting what to produce. None means
        every member of PlotTypes.
    """

    def _out(filename):
        """Returns the path of `filename` inside the plotting directory."""
        return os.path.join(plotting_dir, filename)

    selected = set(PlotTypes) if options is None else options

    if PlotTypes.CREDIT_DISTRIBUTIONS in selected:
        plot_credit_distribution(
            maximize_reward_result['metric_results']
            ['initial_credit_distribution'],
            'Initial',
            path=_out('initial.pdf'))
        plot_credit_distribution(
            maximize_reward_result['metric_results']
            ['final_credit_distributions'],
            title=MAX_UTIL_TITLE,
            path=_out('max_utility.pdf'))
        plot_credit_distribution(
            equality_of_opportunity_result['metric_results']
            ['final_credit_distributions'],
            title=EQ_OPP_TITLE,
            path=_out('equalize_opportunity.pdf'))

    if PlotTypes.CUMULATIVE_LOANS in selected:
        loans_by_agent = {}
        loans_by_agent['max reward'] = (
            maximize_reward_result['metric_results']['cumulative_loans'])
        loans_by_agent['equal-opp'] = (
            equality_of_opportunity_result['metric_results']
            ['cumulative_loans'])
        plot_cumulative_loans(loans_by_agent, _out('cumulative_loans.pdf'))

    if PlotTypes.THRESHOLD_HISTORY in selected:
        threshold_history = (
            equality_of_opportunity_result['agent']['threshold_history'])
        plot_threshold_history(threshold_history,
                               _out('threshold_history.pdf'))

    if PlotTypes.MEAN_CREDIT_OVER_TIME in selected:
        histories = {}
        histories['max reward'] = (
            maximize_reward_result['environment']['history'])
        histories['equal-opp'] = (
            equality_of_opportunity_result['environment']['history'])
        plot_mu(histories, _out('mu.pdf'))

    if PlotTypes.CUMULATIVE_RECALLS in selected:
        # Recall data is dumped as text so runs can be aggregated later; each
        # lookup happens only after its output file has been opened.
        with file_util.open(_out('cumulative_recall_dynamic.txt'),
                            'w') as outfile:
            dynamic_recall = (
                equality_of_opportunity_result['metric_results']
                ['cumulative_recall'])
            np.savetxt(outfile, dynamic_recall)

        with file_util.open(_out('target_recall_dynamic.txt'),
                            'w') as outfile:
            tpr_target_values = list(
                equality_of_opportunity_result['agent']
                ['tpr_targets'].values())
            np.savetxt(outfile, tpr_target_values)

        plot_recall_targets(
            equality_of_opportunity_result['agent']['tpr_targets'][(0, 1)],
            _out('target_recall_dynamic.pdf'))

        with file_util.open(_out('cumulative_recall_static.txt'),
                            'w') as outfile:
            static_recall = (
                static_equality_of_opportunity_result['metric_results']
                ['cumulative_recall'])
            np.savetxt(outfile, static_recall)
def _write(path):
    """Write a plot to a path.

    Saves via `plt.savefig` in PNG format. Does nothing when `path` is falsy
    (e.g. None or an empty string).

    Args:
      path: Destination file path, or a falsy value to skip writing.
    """
    if path:
        # PNG is binary data, so open in binary mode; the context manager
        # makes sure the image is flushed and the handle closed.
        with file_util.open(path, 'wb') as outfile:
            plt.savefig(outfile, format='png')
def _populate_genre_history(self, data=None, save_path=None, min_freq=None):
    """Modifies stored users with watch history.

    Most users have watched at least one movie from each genre. With that in
    mind, a minimum threshold is used to make this a non-trivial user feature.
    Because users vary in the number of movies they have watched, a genre is
    included in the genre history if and only if movies from this genre make
    up strictly more than a certain fraction (specified by `min_freq`) of the
    movies rated by a given user.

    Args:
      data: Numpy array specifying genre history. If None, then this is
        generated from the raw data files.
      save_path: If specified (and `data` is None), then generate and save
        genre history to this path.
      min_freq: Minimum fraction of watches from a genre to qualify as part of
        history, given as a float in [0., 1.]. Required when `data` is None.

    Updates:
      For each user, populates the genre history as a multi-hot encoding, a
      numpy array of length `NUM_GENRES`.

    Raises:
      ValueError: If both `data` and `save_path` are given, if `min_freq` is
        outside [0, 1], if some movie has no genre, or if the userIds in the
        ratings file are not exactly 0..NUM_USERS-1.
      TypeError: If `data` is given but is not a numpy array, or if `data` is
        None and `min_freq` is missing.
    """
    if data is not None:
        if save_path:
            raise ValueError('If data is given, save path must be None')
        if not isinstance(data, np.ndarray):
            raise TypeError('Provided data is not a numpy array')
        user_genre_history = np.copy(data)
    else:
        # Fail with a clear message instead of an opaque `None < 0` error.
        if min_freq is None:
            raise TypeError('min_freq must be provided when data is None')
        if min_freq < 0 or min_freq > 1:
            raise ValueError(
                f'Expected min_freq in [0, 1], got {min_freq:.3f}')

        # Load both CSVs up front so the handles are closed right away.
        with file_util.open(self._movie_path) as movie_file:
            movie_df = pd.read_csv(movie_file)
        with file_util.open(self._rating_path) as rating_file:
            ratings_df = pd.read_csv(rating_file)

        # Populate multi-hot encoding of genres for each movie
        genre_vecs = np.zeros((NUM_MOVIES, len(GENRES)))
        for idx, row in movie_df.iterrows():
            genres = [
                GENRE_MAP.get(genre, OTHER_GENRE_IDX)
                for genre in row.genres.split('|')
            ]
            genre_vecs[idx, genres] = 1
        if not np.all(genre_vecs.sum(axis=1) > 0):
            raise ValueError('Some movies have no genres')

        # For each user, track the total number of movies of each genre,
        # where a movie can count for multiple genres.
        user_history = np.zeros((NUM_USERS, len(GENRES)))
        for _, row in ratings_df.iterrows():
            user_history[row.userId] += genre_vecs[row.movieId]

        # Normalize by the total number of watches for each user
        num_watches = ratings_df.groupby('userId').size()
        if not np.array_equal(num_watches.index.values, np.arange(NUM_USERS)):
            raise ValueError('userId in provided ratings file has gaps')
        user_genre_per_watch = user_history / np.array(num_watches)[:, None]
        user_genre_history = np.array(user_genre_per_watch > min_freq,
                                      dtype=int)

        if save_path:
            with file_util.open(save_path, 'wb') as f:
                pickle.dump(user_genre_history, f)

    for user_ in self.get_users():
        # Separate copies, so changing one attribute never affects the other
        # or the shared history array.
        user_.initial_genre_history = np.copy(
            user_genre_history[user_.user_id])
        user_.genre_history = np.copy(user_genre_history[user_.user_id])