# Load tokenized train/val data and the vocabulary from the on-disk cache when
# every cached artifact is present; otherwise tokenize from scratch and
# (optionally) write the cache for next time.
_cache_files = (TRAIN_TOKENS, TRAIN_TEXTS, VAL_TOKENS, VAL_TEXTS,
                TRAIN_LABELS, VAL_LABELS, IDX_TO_TOKEN)
# Fix: the original checked only three of the seven cached files, so a
# partially written cache made the np.load calls below crash. Require all.
if USE_CACHE and all(Path(p).exists() for p in _cache_files):
    token_train = np.load(TRAIN_TOKENS)
    texts_train = np.load(TRAIN_TEXTS)
    token_val = np.load(VAL_TOKENS)
    texts_val = np.load(VAL_TEXTS)
    train_labels = np.load(TRAIN_LABELS)
    val_labels = np.load(VAL_LABELS)
    # Context manager: the original leaked this file handle.
    with Path(IDX_TO_TOKEN).open('rb') as fh:
        idx_to_token = pickle.load(fh)
    vocab = Vocabulary(idx_to_token)
else:
    texts_train, token_train, train_labels = get_all_tokenized(train_data, 1)
    texts_val, token_val, val_labels = get_all_tokenized(val_data, 1)
    vocab = Vocabulary.from_text(token_train)
    if USE_CACHE:
        np.save(str(TRAIN_TOKENS), token_train)
        np.save(str(TRAIN_TEXTS), texts_train)
        np.save(str(VAL_TOKENS), token_val)
        np.save(str(VAL_TEXTS), texts_val)
        np.save(str(TRAIN_LABELS), train_labels)
        np.save(str(VAL_LABELS), val_labels)
        # `with` guarantees the pickle is flushed and closed even on error;
        # the original left the handle open, risking a truncated cache file.
        with open(IDX_TO_TOKEN, 'wb') as fh:
            pickle.dump(vocab._idx_to_token, fh)

# Load genre names: one genre per line, surrounding whitespace stripped.
with Path(GENRES_TYPES_FILE).open("r") as fh:
    GENRES = [line.strip() for line in fh]
n_genres = len(GENRES)