Example #1
import os
import pickle

import pandas as pd

# IndexVectorizer, TextDataset, LMDataLoader and the configuration constants
# (data_dir, data_cache, vectorizer_cache, MAX_VOCAB_SIZE, MIN_WORD_FREQ,
# STAT_END_TOK, TOKENIZE) are defined elsewhere in this project.
train_file = os.path.join(data_dir, 'train.csv')
valid_file = os.path.join(data_dir, 'valid.csv')

RE_VECTORIZE = False
if RE_VECTORIZE or not os.path.isfile(data_cache):
    train = pd.read_csv(train_file)
    valid = pd.read_csv(valid_file)
    vectorizer = IndexVectorizer(max_words=MAX_VOCAB_SIZE,
                                 min_frequency=MIN_WORD_FREQ,
                                 start_end_tokens=STAT_END_TOK,
                                 tokenize=TOKENIZE)
    train_ds = TextDataset(data=train, vectorizer=vectorizer, text_col='text')
    valid_ds = TextDataset(data=valid, vectorizer=vectorizer, text_col='text')
    pickle.dump([train_ds, valid_ds], open(data_cache, 'wb'))
    pickle.dump(vectorizer, open(vectorizer_cache, 'wb'))
else:
    train_ds, valid_ds = pickle.load(open(data_cache, 'rb'))
    vectorizer = pickle.load(open(vectorizer_cache, 'rb'))

print(f'Train size: {len(train_ds)}\nValid size: {len(valid_ds)}')
print(f"Vocab size: {len(vectorizer.vocabulary)}")

# Remaining keyword arguments mirror the class-based variant in Example #3.
train_dl = LMDataLoader(dataset=train_ds,
                        target_seq_len=65,
                        shuffle=True,
                        max_seq_len=75,
                        min_seq_len=5,
                        p_half_seq_len=0.05,
                        batch_size=50)
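
The snippet above follows a cache-or-rebuild pattern: vectorization runs only when RE_VECTORIZE is set or no cache file exists, otherwise the pickled objects are reloaded. A minimal, self-contained sketch of that pattern, with a stand-in build function in place of the project-specific vectorization step:

import os
import pickle

def load_or_build(cache_path, build_fn, rebuild=False):
    # Rebuild when forced or when no cache exists; otherwise reuse the pickle.
    if rebuild or not os.path.isfile(cache_path):
        obj = build_fn()
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f)
    else:
        with open(cache_path, 'rb') as f:
            obj = pickle.load(f)
    return obj

# Stand-in build function; the real code builds TextDataset objects here.
datasets = load_or_build('data_cache.pkl', lambda: {'train': [], 'valid': []})
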
Example #2
def load_and_cache_examples(args, tokenizer, evaluate=False):
    # Pick the evaluation or training corpus and wrap it in a TextDataset.
    file_path = args.eval_data_file if evaluate else args.train_data_file
    return TextDataset(tokenizer, file_path=file_path)
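
A possible call site, assuming load_and_cache_examples and TextDataset are defined as above and a tokenizer instance is already in scope; the bare namespace below is a hypothetical stand-in for parsed command-line arguments:

from types import SimpleNamespace

# Hypothetical paths; the real script receives these via argparse.
args = SimpleNamespace(train_data_file='train.txt', eval_data_file='valid.txt')

train_dataset = load_and_cache_examples(args, tokenizer)                # reads args.train_data_file
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)  # reads args.eval_data_file
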
Example #3
def load_and_cache_examples(args, tokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    # line_by_line: one example per input line; otherwise one contiguous token stream.
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer, args, file_path=file_path)
    else:
        return TextDataset(tokenizer, args, file_path=file_path)

def load_and_cache_examples(args, tokenizer):
    # Variant with a single data file instead of separate train/eval paths.
    file_path = args.data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer, args, file_path=file_path)
    else:
        return TextDataset(tokenizer, args, file_path=file_path)
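
A sketch of how the line_by_line switch might be driven, again with a bare namespace standing in for parsed arguments and assuming a tokenizer and the two dataset classes are in scope; block_size is a hypothetical extra field such dataset classes often read from args:

from types import SimpleNamespace

# Hypothetical arguments, for illustration only.
args = SimpleNamespace(data_file='corpus.txt', line_by_line=True, block_size=128)

dataset = load_and_cache_examples(args, tokenizer)
print(len(dataset))
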
    # __init__ of a language-model data class; the enclosing class definition is not shown.
    def __init__(self,
                 data_dir,
                 train_file,
                 valid_file,
                 max_vocab_size=20000,
                 batch_size=50,
                 revectorize=False):
        self.data_dir = data_dir

        # Vectorizer configuration
        self.TOKENIZE = SpacyTokenizer().tokenize
        self.MIN_WORD_FREQ = 2
        self.MAX_VOCAB_SIZE = max_vocab_size
        self.STAT_END_TOK = True

        ## Language-model training settings
        self.batch_size = batch_size
        self.target_seq_len = 65
        self.max_seq_len = 75
        self.min_seq_len = 5

        # GPU setup
        self.use_gpu = torch.cuda.is_available()
        device_num = 0
        self.device = torch.device(
            f"cuda:{device_num}" if self.use_gpu else "cpu")

        # IO setup
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        model_cache_dir = os.path.join(data_dir, 'models')
        self.data_cache = os.path.join(model_cache_dir, 'data_cache.pkl')
        self.vectorizer_cache = os.path.join(model_cache_dir,
                                             'lm_vectorizer.pkl')
        os.makedirs(model_cache_dir, exist_ok=True)
        self.model_file_lm = os.path.join(model_cache_dir, f'LM__{today}.json')

        self.train_file = train_file
        self.valid_file = valid_file

        self.revectorize = revectorize
        if self.revectorize or not os.path.isfile(self.data_cache):
            print("Vectorizing starting...")
            train = pd.read_csv(self.train_file)
            valid = pd.read_csv(self.valid_file)
            self.vectorizer = IndexVectorizer(
                max_words=self.MAX_VOCAB_SIZE,
                min_frequency=self.MIN_WORD_FREQ,
                start_end_tokens=self.STAT_END_TOK,
                tokenize=self.TOKENIZE)

            self.train_ds = TextDataset(data=train,
                                        vectorizer=self.vectorizer,
                                        text_col='text')
            self.valid_ds = TextDataset(data=valid,
                                        vectorizer=self.vectorizer,
                                        text_col='text')

            pickle.dump([self.train_ds, self.valid_ds],
                        open(self.data_cache, 'wb'))
            pickle.dump(self.vectorizer, open(self.vectorizer_cache, 'wb'))
        else:
            self.train_ds, self.valid_ds = pickle.load(
                open(self.data_cache, 'rb'))
            self.vectorizer = pickle.load(open(self.vectorizer_cache, 'rb'))

        print("Vectorizing is complete.")
        print(f'Train size: {len(self.train_ds)}\n'
              f'Valid size: {len(self.valid_ds)}')
        print(f"Vocab size: {len(self.vectorizer.vocabulary)}")

        self.train_dl = LMDataLoader(dataset=self.train_ds,
                                     target_seq_len=self.target_seq_len,
                                     shuffle=True,
                                     max_seq_len=self.max_seq_len,
                                     min_seq_len=self.min_seq_len,
                                     p_half_seq_len=0.05,
                                     batch_size=self.batch_size)

        self.valid_dl = LMDataLoader(dataset=self.valid_ds,
                                     target_seq_len=self.target_seq_len,
                                     shuffle=True,
                                     max_seq_len=self.max_seq_len,
                                     min_seq_len=self.min_seq_len,
                                     p_half_seq_len=0.05,
                                     batch_size=self.batch_size)

        print("Created Data Loaders for documents")