def write_csv_output(dataframes, directory):
  """Write csv file outputs."""
  movies, users, ratings = dataframes
  file_util.makedirs(directory, exist_ok=True)
  del movies["tag_id"]  # This column isn't necessary.
  users.to_csv(
      file_util.open(os.path.join(directory, "users.csv"), "w"),
      index=False,
  )
  movies.to_csv(
      file_util.open(os.path.join(directory, "movies.csv"), "w"), index=False
  )
  ratings.to_csv(
      file_util.open(os.path.join(directory, "ratings.csv"), "w"), index=False
  )
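# Usage sketch (illustrative only; assumes `movies`, `users`, and `ratings`
# are pandas DataFrames loaded elsewhere, with `movies` still carrying the
# `tag_id` column dropped above, and that the output directory is writable):
#
#   write_csv_output((movies, users, ratings), "/tmp/movie_lens_csv")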
def _read_users(self, path):
  """Returns a dict of User objects keyed by user id."""
  users = {}
  for _, row in pd.read_csv(file_util.open(path)).iterrows():
    users[row.userId] = self._user_ctor(
        user_id=row.userId,
        sex=row.sex,
        age=row.age,
        occupation=row.occupation,
        zip_code=row.zip_code,
        # The budget is initialized to the user's response count, or 0 for
        # users with no recorded responses.
        budget=self._responses.get(row.userId, 0),
    )
  return users
def load_embeddings(env_config):
  """Attempts to load user and movie embeddings from a json or pickle file."""
  path = env_config.embeddings_path
  suffix = pathlib.Path(path).suffix
  if suffix == ".json":
    loader = json
    logging.info("Reading a json file. %s", path)
  elif suffix in (".pkl", ".pickle"):
    loader = pickle
    logging.info("Reading a pickle file. %s", path)
  else:
    raise ValueError("Unrecognized file type! %s" % path)
  embedding_dict = loader.load(file_util.open(path, "rb"))
  return types.SimpleNamespace(
      movies=np.array(embedding_dict[env_config.embedding_movie_key]),
      users=np.array(embedding_dict[env_config.embedding_user_key]),
  )
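# Usage sketch (illustrative only; assumes `env_config` exposes the
# `embeddings_path`, `embedding_movie_key`, and `embedding_user_key`
# attributes read above, and that the file stores one array per key):
#
#   embeddings = load_embeddings(env_config)
#   movie_vectors = embeddings.movies
#   user_vectors = embeddings.users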
def _read_movies(self, path):
  """Returns a dict of Movie objects."""
  movies = {}
  movie_df = pd.read_csv(file_util.open(path))
  for _, row in movie_df.iterrows():
    # A movie's `genres` field is a pipe-separated string (e.g.
    # "Action|Comedy"); unrecognized genre names fall back to OTHER_GENRE_IDX.
    genres = [
        GENRE_MAP.get(genre, OTHER_GENRE_IDX)
        for genre in row.genres.split("|")
    ]
    assert isinstance(row.movieId, int)
    movie_id = row.movieId
    # `movie_vec` is left as None, and will be filled in later in the init
    # of this Dataset.
    movies[movie_id] = self._movie_ctor(
        movie_id,
        row.title,
        genres,
        vec=None,
        violence=row.violence_tag_relevance,
    )
  return movies
def _read_responses(self, path):
  """Returns a dict containing the count of Response objects per user."""
  df = pd.read_csv(file_util.open(path))
  return df.groupby(by=["userId"]).rating.count().to_dict()
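# The returned mapping is keyed by userId with that user's rating count as the
# value, e.g. {1: 53, 2: 129, ...} (counts illustrative). This per-user count
# dict is presumably what backs `self._responses` in `_read_users` above,
# where users absent from the ratings CSV default to a budget of 0.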