import os
import pickle

import pandas as pd

# NOTE: IndexVectorizer, TextDataset, LMDataLoader and the configuration
# constants (MAX_VOCAB_SIZE, MIN_WORD_FREQ, STAT_END_TOK, TOKENIZE, the cache
# paths, and the sequence-length / batch-size constants) are assumed to be
# defined elsewhere in the project; this snippet only covers vectorization,
# caching, and batching.

train_file = os.path.join(data_dir, 'train.csv')
valid_file = os.path.join(data_dir, 'valid.csv')

RE_VECTORIZE = False

if RE_VECTORIZE or not os.path.isfile(data_cache):
    # Build the vocabulary and vectorize both splits from scratch.
    train = pd.read_csv(train_file)
    valid = pd.read_csv(valid_file)
    vectorizer = IndexVectorizer(max_words=MAX_VOCAB_SIZE,
                                 min_frequency=MIN_WORD_FREQ,
                                 start_end_tokens=STAT_END_TOK,
                                 tokenize=TOKENIZE)
    train_ds = TextDataset(data=train, vectorizer=vectorizer, text_col='text')
    valid_ds = TextDataset(data=valid, vectorizer=vectorizer, text_col='text')
    pickle.dump([train_ds, valid_ds], open(data_cache, 'wb'))
    pickle.dump(vectorizer, open(vectorizer_cache, 'wb'))
else:
    # Reuse the cached datasets and vectorizer.
    train_ds, valid_ds = pickle.load(open(data_cache, 'rb'))
    vectorizer = pickle.load(open(vectorizer_cache, 'rb'))

print(f'Train size: {len(train_ds)}\nValid size: {len(valid_ds)}')
print(f"Vocab size: {len(vectorizer.vocabulary)}")

# Arguments mirror the class-based LMDataLoader setup shown later in this
# section; the uppercase constants are assumed to be defined above.
train_dl = LMDataLoader(dataset=train_ds,
                        target_seq_len=TARGET_SEQ_LEN,
                        shuffle=True,
                        max_seq_len=MAX_SEQ_LEN,
                        min_seq_len=MIN_SEQ_LEN,
                        p_half_seq_len=0.05,
                        batch_size=BATCH_SIZE)
# Variant without line-by-line support: always wraps the chosen file in a
# TextDataset.
def load_and_cache_examples(args, tokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    return TextDataset(tokenizer, file_path=file_path)
# Variant that switches between a line-by-line dataset and a contiguous-block
# dataset, still selecting the train or eval file via the evaluate flag.
def load_and_cache_examples(args, tokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer, args, file_path=file_path)
    else:
        return TextDataset(tokenizer, args, file_path=file_path)
# Variant that takes a single data file from args instead of separate
# train/eval files.
def load_and_cache_examples(args, tokenizer):
    file_path = args.data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer, args, file_path=file_path)
    else:
        return TextDataset(tokenizer, args, file_path=file_path)
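# Hypothetical usage sketch (not part of the original snippets): the
# load_and_cache_examples variants above expect an argparse-style namespace.
# The attribute values, file names, and the choice of a GPT-2 tokenizer are
# illustrative assumptions only.
from argparse import Namespace

from transformers import AutoTokenizer

args = Namespace(train_data_file='train.txt',
                 eval_data_file='valid.txt',
                 line_by_line=True)
tokenizer = AutoTokenizer.from_pretrained('gpt2')

train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)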
def __init__(self, data_dir, train_file, valid_file, max_vocab_size=20000,
             batch_size=50, revectorize=False):
    self.data_dir = data_dir
    self.TOKENIZE = SpacyTokenizer().tokenize
    self.MIN_WORD_FREQ = 2
    self.MAX_VOCAB_SIZE = max_vocab_size
    self.STAT_END_TOK = True

    # Language-model training hyperparameters
    self.batch_size = batch_size
    self.target_seq_len = 65
    self.max_seq_len = 75
    self.min_seq_len = 5

    # GPU setup
    self.use_gpu = torch.cuda.is_available()
    device_num = 0
    self.device = torch.device(
        f"cuda:{device_num}" if self.use_gpu else "cpu")

    # IO setup
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    model_cache_dir = os.path.join(data_dir, 'models')
    self.data_cache = os.path.join(model_cache_dir, 'data_cache.pkl')
    self.vectorizer_cache = os.path.join(model_cache_dir, 'lm_vectorizer.pkl')
    os.makedirs(model_cache_dir, exist_ok=True)
    self.model_file_lm = os.path.join(model_cache_dir, f'LM__{today}.json')
    self.train_file = train_file
    self.valid_file = valid_file
    self.revectorize = revectorize

    if self.revectorize or not os.path.isfile(self.data_cache):
        print("Vectorizing started...")
        train = pd.read_csv(self.train_file)
        valid = pd.read_csv(self.valid_file)
        self.vectorizer = IndexVectorizer(
            max_words=self.MAX_VOCAB_SIZE,
            min_frequency=self.MIN_WORD_FREQ,
            start_end_tokens=self.STAT_END_TOK,
            tokenize=self.TOKENIZE)
        self.train_ds = TextDataset(data=train, vectorizer=self.vectorizer,
                                    text_col='text')
        self.valid_ds = TextDataset(data=valid, vectorizer=self.vectorizer,
                                    text_col='text')
        pickle.dump([self.train_ds, self.valid_ds],
                    open(self.data_cache, 'wb'))
        pickle.dump(self.vectorizer, open(self.vectorizer_cache, 'wb'))
    else:
        self.train_ds, self.valid_ds = pickle.load(
            open(self.data_cache, 'rb'))
        self.vectorizer = pickle.load(open(self.vectorizer_cache, 'rb'))

    print("Vectorizing is complete.")
    print(f'Train size: {len(self.train_ds)}\nValid size: {len(self.valid_ds)}')
    print(f"Vocab size: {len(self.vectorizer.vocabulary)}")

    self.train_dl = LMDataLoader(dataset=self.train_ds,
                                 target_seq_len=self.target_seq_len,
                                 shuffle=True,
                                 max_seq_len=self.max_seq_len,
                                 min_seq_len=self.min_seq_len,
                                 p_half_seq_len=0.05,
                                 batch_size=self.batch_size)
    self.valid_dl = LMDataLoader(dataset=self.valid_ds,
                                 target_seq_len=self.target_seq_len,
                                 shuffle=True,
                                 max_seq_len=self.max_seq_len,
                                 min_seq_len=self.min_seq_len,
                                 p_half_seq_len=0.05,
                                 batch_size=self.batch_size)
    print("Created data loaders for documents.")
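# Hypothetical usage sketch (the class name LMCorpus and the file locations
# are assumptions; the original only shows the __init__ above): the
# constructor vectorizes the CSVs on the first run, caches the datasets and
# vectorizer under <data_dir>/models, and exposes train_dl / valid_dl for
# language-model training.
corpus = LMCorpus(data_dir='data',
                  train_file='data/train.csv',
                  valid_file='data/valid.csv',
                  max_vocab_size=20000,
                  batch_size=50,
                  revectorize=False)

for batch in corpus.train_dl:
    # Each batch is whatever LMDataLoader yields (assumed here to be
    # input/target index tensors for next-token prediction).
    pass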