def parse_movielens(threshold=4, **kwargs):
    if os.path.isfile(BIN_DATA[ML_20M_ALT]):
        LOG.info("Already processed, skipping.")
        return

    source_file = os.path.join(DOWNLOAD[ML_20M_ALT], "ratings.csv")
    if not glob(source_file):
        download_movielens()

    LOG.info("Parsing movielens.")
    df = pd.read_csv(source_file)
    df.drop('timestamp', axis=1, inplace=True)
    # Turn explicit ratings into implicit feedback using the threshold.
    df["rating"] = make_feedback_implicit(df["rating"], threshold)

    # Remap raw user/movie ids to contiguous indices for the sparse matrix.
    map_user_id = {u: i for i, u in enumerate(df.userId.unique())}
    map_movie_id = {m: i for i, m in enumerate(df.movieId.unique())}

    m_sp = sp.csr_matrix(
        (df.rating,
         ([map_user_id[u] for u in df.userId],
          [map_movie_id[m] for m in df.movieId])),
        shape=(len(map_user_id), len(map_movie_id)))
    m_sp.eliminate_zeros()
    save_as_npz(m_sp, BIN_DATA[ML_20M_ALT])
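# Illustrative note (not part of the original code): make_feedback_implicit()
# is defined elsewhere in the repository and is not shown here. Given that the
# parsers store its output in a binary interaction matrix and then call
# eliminate_zeros(), a simple thresholding along the following lines is one
# plausible reading; the helper name and body below are hypothetical.
def _binarize_ratings_sketch(ratings, threshold):
    """Hypothetical sketch: 1 where rating >= threshold, 0 otherwise."""
    return [1 if float(r) >= threshold else 0 for r in ratings]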
def download_movielens():
    filepath = os.path.join(DATASETS_DIR, ML_20M_ALT + '.zip')
    if not glob(filepath):
        download_file(DOWNLOAD_URL[ML_20M], filepath)
    LOG.info("Extracting")
    extract_file(filepath, DATASETS_DIR)
def download_pinterest():
    filepath = os.path.join(DATASETS_DIR, PINTEREST + '.zip')
    if not glob(filepath):
        download_file_from_google_drive(DOWNLOAD_GOOGLE_DRIVE_ID[PINTEREST],
                                        filepath)
    LOG.info("Extracting")
    extract_file(filepath, DATASETS_DIR)
    os.rename(os.path.join(DATASETS_DIR, 'pinterest_iccv'),
              DOWNLOAD[PINTEREST])
def download_lastfm():
    filepath = os.path.join(DATASETS_DIR, 'lastfm-dataset-360K.tar.gz')
    # Check for the archive itself, consistent with the other downloaders.
    if not glob(filepath):
        download_file(DOWNLOAD_URL[LASTFM], filepath)
    LOG.info("Extracting")
    extract_file(filepath, DATASETS_DIR)
    os.rename(os.path.join(DATASETS_DIR, 'lastfm-dataset-360K'),
              os.path.join(DATASETS_DIR, 'lastfm'))
def download_netflix():
    filepath = os.path.join(DATASETS_DIR, NETFLIX + '.tar.gz')
    if not glob(filepath):
        download_file(DOWNLOAD_URL[NETFLIX], filepath)
    LOG.info("Extracting 1/2")
    extract_file(filepath, tempfile.gettempdir())
    LOG.info("Extracting 2/2")
    extract_file(
        os.path.join(tempfile.gettempdir(), 'download', 'training_set.tar'),
        DATASETS_DIR)
    os.rename(os.path.join(DATASETS_DIR, 'training_set'), DOWNLOAD[NETFLIX])
def train(
        self,
        n_epochs: int,
        train_data: sparse.csr_matrix,
        validation_data_input: sparse.csr_matrix,
        validation_data_true: sparse.csr_matrix,
        batch_size_train: int,
        batch_size_validation: int,
        metrics: dict,  # Dict[str, matrix -> matrix -> float]
        validation_step: int = 10,
):
    """
    Train the model.

    :param n_epochs: number of epochs
    :param train_data: train matrix of shape users count x items count
    :param validation_data_input: validation matrix fed to the model as input
    :param validation_data_true: held-out validation matrix used as ground truth
    :param batch_size_train: batch size used during training
    :param batch_size_validation: batch size used during validation
    :param metrics: dictionary of metric names to metric functions
    :param validation_step: if set to n, validation is run once every n epochs
    """
    self.metrics_history = defaultdict(list)
    self.time_elapsed_training_history = []
    self.time_elapsed_validation_history = []
    self.session.run(self.iter.initializer)

    for epoch in range(1, n_epochs + 1):
        self.log_which_epoch(epoch, n_epochs)

        init_time = time.time()
        for _ in range(self.n_batch_per_train):
            self.session.run(self.optimizer)
        training_duration = time.time() - init_time
        self.time_elapsed_training_history.append(training_duration)
        LOG.info("Train time:\t{}".format(training_duration))

        if epoch % validation_step == 0 or epoch == n_epochs:
            init_time = time.time()
            metrics_scores = self.test(validation_data_input,
                                       validation_data_true, metrics)
            for name, score in metrics_scores.items():
                self.metrics_history[name].append(score)
            validation_duration = time.time() - init_time
            self.time_elapsed_validation_history.append(validation_duration)
            LOG.info("Valid time:\t{}".format(validation_duration))
            self.log_metrics(epoch, metrics_scores, n_epochs)

    self.log_training_time()
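# Usage sketch (illustrative, not from the original code base): `metrics` maps
# a display name to a callable that is evaluated inside self.test(), so the
# exact callable signature is defined by test(). The names `model`, `train`,
# `valid_in`, `valid_out` and `recall_20` below are placeholders.
#
#     metrics = {"recall@20": recall_20}
#     model.train(
#         n_epochs=50,
#         train_data=train,
#         validation_data_input=valid_in,
#         validation_data_true=valid_out,
#         batch_size_train=500,
#         batch_size_validation=2000,
#         metrics=metrics,
#         validation_step=10,
#     )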
def log_training_time(self):
    LOG.info("Total elapsed train time: {}".format(
        np.sum(self.time_elapsed_training_history)))
    LOG.info("Total elapsed valid time: {}".format(
        np.sum(self.time_elapsed_validation_history)))
    LOG.info("Epoch average train time: {}".format(
        np.mean(self.time_elapsed_training_history)))
    LOG.info("Epoch average valid time: {}".format(
        np.mean(self.time_elapsed_validation_history)))
def parse_pinterest(**kwargs):
    if os.path.isfile(BIN_DATA[PINTEREST]):
        LOG.info("Already processed, skipping.")
        return

    data_file = 'subset_iccv_board_pins.bson'
    source_file = os.path.join(DOWNLOAD[PINTEREST], data_file)
    if not glob(source_file):
        raise Exception("Cannot find pinterest dataset")

    LOG.info("Parsing pinterest")
    with open(source_file, 'rb') as f:
        bsob = bson.decode_all(f.read())

    map_id_pin = dict()
    map_pin_id = dict()
    map_board_id = dict()
    map_id_board = dict()
    pins = 0
    board_pin_pairs = []
    # Assign contiguous indices to boards and pins while collecting
    # (board index, pin index) interaction pairs.
    for i, board in enumerate(bsob):
        map_id_board[i] = board
        map_board_id[board['board_id']] = i
        for pin in board['pins']:
            if pin not in map_pin_id:
                map_pin_id[pin] = pins
                map_id_pin[pins] = pin
                pins += 1
            board_pin_pairs.append((map_board_id[board['board_id']],
                                    map_pin_id[pin]))

    boards = [board for (board, pin) in board_pin_pairs]
    pins = [pin for (board, pin) in board_pin_pairs]
    m_sp = sp.csr_matrix(([1] * len(boards), (boards, pins)),
                         shape=(len(map_board_id), len(map_pin_id)))
    save_as_npz(m_sp, BIN_DATA[PINTEREST])
def parse_lastfm(**kwargs):
    if os.path.isfile(BIN_DATA[LASTFM]):
        LOG.info("Already processed, skipping.")
        return

    data_file = 'usersha1-artmbid-artname-plays.tsv'
    source_file = os.path.join(DOWNLOAD[LASTFM], data_file)
    if not glob(source_file):
        download_lastfm()

    LOG.info("Parsing lastfm")
    df = pd.read_csv(source_file, delimiter='\t',
                     names=["User", "Artist id", "Artist name", "Plays"],
                     dtype=str)
    # Identify an artist by the (artist id, artist name) pair.
    artist_column = list(zip([str(i) for i in df['Artist id']],
                             [str(i) for i in df['Artist name']]))
    user_column = df['User']
    map_artist_id = {artist: i
                     for i, artist in enumerate(sorted(set(artist_column)))}
    map_user_id = {user: i
                   for i, user in enumerate(sorted(set(user_column)))}
    user_ids = [map_user_id[user] for user in user_column]
    artist_ids = [map_artist_id[artist] for artist in artist_column]
    m_sp = sp.csr_matrix(([1] * df.shape[0], (user_ids, artist_ids)),
                         shape=(len(map_user_id), len(map_artist_id)))
    save_as_npz(m_sp, BIN_DATA[LASTFM])
def load_dataset(dataset: str, *args, **kwargs):
    """
    Generic data loader.

    :param dataset: name of dataset to be loaded
    :return: 5 csr_matrices {train, valid_in, valid_out, test_in, test_out}
    """
    assert dataset in DATASETS, "Wrong dataset name"
    if dataset == ML_20M:
        out = load_and_parse_ML_20M(DOWNLOAD[ML_20M], *args, **kwargs)
        LOG.info("Done")
        return out

    handler_map_parse = {
        NETFLIX: parse_netflix,
        ML_20M_ALT: parse_movielens,
        LASTFM: parse_lastfm,
        PINTEREST: parse_pinterest,
    }
    handler_map_parse[dataset]()
    out = load_data(BIN_DATA[dataset], *args, **kwargs)
    LOG.info("Done")
    return out
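# Usage sketch (illustrative): assuming the dataset-name constants defined in
# this module, the generic loader returns the five CSR matrices listed in its
# docstring, e.g.:
#
#     train, valid_in, valid_out, test_in, test_out = load_dataset(ML_20M)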
def parse_netflix(threshold=3, **kwargs):
    if os.path.isfile(BIN_DATA[NETFLIX]):
        LOG.info("Already processed, skipping.")
        return

    files = glob(os.path.join(DOWNLOAD[NETFLIX], '*'))
    if not files:
        download_netflix()
        files = glob(os.path.join(DOWNLOAD[NETFLIX], '*'))

    LOG.info("Parsing netflix")
    users = get_users(files)
    map_user_id = {u: i for i, u in enumerate(users)}

    csr_rows = []
    csr_columns = []
    csr_data = []
    LOG.info("Parsing netflix, step 2/2")
    for movie_id, file_path in tqdm(enumerate(files), total=len(files)):
        df = pd.read_csv(file_path, names=['User', 'Rating', 'Date'])
        # The first line of each per-movie file is the movie id header,
        # not a rating, so drop it.
        df.drop(0, inplace=True)
        df['Rating'] = make_feedback_implicit(df['Rating'], threshold)
        rows = [map_user_id[user] for user in df['User']]
        columns = [movie_id] * len(rows)
        data = list(df['Rating'])
        assert len(rows) == len(columns) == len(data)
        csr_rows += rows
        csr_columns += columns
        csr_data += data

    m_sp = sp.csr_matrix((csr_data, (csr_rows, csr_columns)),
                         shape=(len(users), len(files)))
    m_sp.eliminate_zeros()
    save_as_npz(m_sp, BIN_DATA[NETFLIX])
def log_which_epoch(self, epoch, n_epochs):
    LOG.info("Epoch: {}/{}".format(epoch, n_epochs))
def log_metrics(self, epoch, metrics_scores, n_epochs):
    for name, score in metrics_scores.items():
        LOG.info("Mean {}:\t{}".format(name, score))