test_df.to_pickle(test_pre_path) else: train_df = pd.read_pickle(train_pre_path) val_df = pd.read_pickle(val_pre_path) test_df = pd.read_pickle(test_pre_path) ## get data and train columns data_column = list(set(train_df.columns) - set(args["targets"]))[0] if run_tokenization: ## do tokenization print("Tokenize") tokenizer = Tokenizer(tokenizeStr=tokenizer_model[0], fasttextFile=args["fasttext_file"], doLower=args["doLower"]) train_df[data_column] = tokenizer.fit_transform(train_df[data_column]) val_df[data_column] = tokenizer.transform(val_df[data_column]) test_df[data_column] = tokenizer.transform(test_df[data_column]) ## save the preprocessed data if not os.path.exists(os.path.join(args["data_path"], "temp")): os.makedirs(os.path.join(args["data_path"], "temp")) train_df.to_pickle(train_tok_path) val_df.to_pickle(val_tok_path) test_df.to_pickle(test_tok_path) else: train_df = pd.read_pickle(train_tok_path) val_df = pd.read_pickle(val_tok_path) test_df = pd.read_pickle(test_tok_path) ## for testing purposes
np.save(val_pre_path.format("data"), val_data, allow_pickle=True) np.save(test_pre_path.format("data"), test_data, allow_pickle=True) np.save(train_pre_path.format("target"), train_target, allow_pickle=True) np.save(val_pre_path.format("target"), val_target, allow_pickle=True) np.save(test_pre_path.format("target"), test_target, allow_pickle=True) if run_tokenization: ## do tokenization print("Tokenize") tokenizer = Tokenizer(args=tokenizer_model, fasttextFile=args["fasttext_file"], doLower=args["doLower"]) train_data = tokenizer.fit_transform(train_data) val_data = tokenizer.transform(val_data) test_data = tokenizer.transform(test_data) ## save the preprocessed data if not os.path.exists(os.path.join(args["data_path"], "temp")): os.makedirs(os.path.join(args["data_path"], "temp")) if sparse.issparse(train_data): sparse.save_npz(train_tok_path.format("data"), train_data) else: np.save(train_tok_path.format("data"), train_data) np.save(train_tok_path.format("target"), train_target) if sparse.issparse(val_data): sparse.save_npz(val_tok_path.format("data"), val_data) else: np.save(val_tok_path.format("data"), val_data) np.save(val_tok_path.format("target"), val_target)