class MyDataset(object):
    def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True, tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(dtype=torch.float)  # LabelField takes dtype, not the old tensor_type argument
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            # note: build_vocab replaces the vocabulary on each call, so after the loop
            # the fields hold the vocabulary of the last split processed ('test')
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)
            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)
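# Usage sketch for the class above (not part of the original snippet; it only shows how the
# resulting iterators could be consumed, assuming the same torchtext imports).
mr_data = MyDataset(root_dir='data', batch_size=64, use_vector=True)
for batch in mr_data.dataloader['train']:
    text = batch.text    # LongTensor of shape (batch_size, seq_len) because batch_first=True
    label = batch.label  # FloatTensor of shape (batch_size,)
    # ... feed `text` and `label` to a model here
    break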
def load_dataset_from_csv(params, device):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about how to preprocess the data.
    fix_length : TorchText can normally leave the input variable-length and dynamically pad each
        sequence to the longest sequence in its batch. Here we use fix_length, which pads every
        sequence to a fixed length of 128.
    build_vocab : First builds a vocabulary mapping every unique word in the training data to an
        index, and then uses the GloVe word embeddings to map each index to the corresponding
        word embedding.
    vocab.vectors : Returns a torch tensor of shape (vocab_size x embedding_dim) containing the
        pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to
        minimize the amount of padding needed.
    """
    # define tokenizer
    en = English()

    def tokenize(sentence):
        return [tok.text for tok in en.tokenizer(sentence)]

    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, eos_token='<eos>',
                 batch_first=True, fix_length=128)
    LABEL = LabelField()
    fields_list = [('Unnamed: 0', None), ('text', TEXT), ('conf', None), ('label', LABEL)]

    base_path = params.DATA_PATH
    train_path = os.path.join(base_path, "filtered_train.csv")
    test_path = os.path.join(base_path, "filtered_test.csv")

    train_data = TabularDataset(path=train_path,  # path to the training csv file
                                format='csv',
                                skip_header=True,
                                fields=fields_list)
    test_data = TabularDataset(path=test_path,  # path to the test csv file
                               format='csv',
                               skip_header=True,
                               fields=fields_list)

    if params.VOCAB_USE_GLOVE:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ,
                         vectors=GloVe(name='6B', dim=300))
        logging.info("Loaded GloVe embedding, Vector size of Text Vocabulary: " +
                     str(TEXT.vocab.vectors.size()))
    else:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ)
    LABEL.build_vocab(train_data)
    word_embeddings = TEXT.vocab.vectors
    logging.info("Length of Text Vocabulary: " + str(len(TEXT.vocab)))

    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_sizes=(params.TRAIN_BATCH_SIZE, params.TRAIN_BATCH_SIZE),
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True,
        device=device)
    # Disable shuffle for the test iterator
    test_iter.shuffle = False

    return TEXT, word_embeddings, train_iter, test_iter
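# Sketch of how the returned iterators might be consumed (not from the original; `params` is the
# same config object the function expects, and the model/criterion are placeholders).
TEXT, word_embeddings, train_iter, test_iter = load_dataset_from_csv(params, device)
for batch in train_iter:
    text = batch.text     # shape (batch_size, 128) because batch_first=True and fix_length=128
    labels = batch.label  # LongTensor of class indices from LABEL's vocabulary
    # logits = model(text)
    # loss = criterion(logits, labels)
    break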
def clean_quora(path='../data/train.csv', output='list', tokenizer=nltk.word_tokenize,
                device=DEVICE, batch_size=32):
    data = pd.read_csv(path)
    questions1 = data['question1'].astype('str').tolist()
    questions2 = data['question2'].astype('str').tolist()
    is_duplicates = data['is_duplicate'].tolist()

    if output == 'list':
        return questions1, questions2, is_duplicates
    elif output == 'tokenized_list':
        return [tokenizer(q) for q in questions1], [tokenizer(q) for q in questions2], is_duplicates
    elif output == 'iterator' or output == 'iterator_from_file':
        TEXT = Field(
            sequential=True,
            tokenize=tokenizer,
            pad_first=False,
            dtype=torch.long,
            lower=True,
            batch_first=True
        )
        TARGET = LabelField(use_vocab=False)
        if output == 'iterator':
            examples = [Example.fromlist((questions1[i], questions2[i], is_duplicates[i]),
                                         [('question1', TEXT),
                                          ('question2', TEXT),
                                          ('is_duplicate', TARGET)])
                        for i in range(len(questions1))]
            dataset = Dataset(examples, {'question1': TEXT, 'question2': TEXT, 'is_duplicate': TARGET})
        if output == 'iterator_from_file':
            dataset = TabularDataset(path, 'csv',
                                     [('question1', TEXT), ('question2', TEXT), ('is_duplicate', TARGET)],
                                     skip_header=True)
        iterator = BucketIterator(
            dataset,
            batch_size=batch_size,
            sort_key=lambda x: len(x.question1) + len(x.question2),
            sort_within_batch=False,
            repeat=False,  # we pass repeat=False because we want to wrap this Iterator layer
            device=device
        )
        TEXT.build_vocab(dataset)
        TARGET.build_vocab(dataset)
        return iterator
        # dataset = TabularDataset(path, 'csv', [('review', TEXT), ('sentiment', TARGET)])
    else:
        raise ValueError('Processing type not understood')
def load_data(preprocessing=None):
    # Fields for the dataset
    # The actual review message
    # TEXT = Field(tokenize='spacy')  # -- Old way, unclear exactly what language model is used
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, preprocessing=preprocessing)
    LABEL = LabelField(dtype=torch.float)

    # Get the entire dataset that we will then split
    data = TabularDataset(path=path, format='tsv', fields=[('text', TEXT), ('label', LABEL)])

    # We should probably look at the proportion of fake to non-fake in each of these
    # sets to make sure it is fairly even, though probabilistically it should be, I suppose.
    train_data, valid_data, test_data = data.split(
        split_ratio=TRAIN_VAL_TEST_SPLIT, random_state=random.seed(SEED))
    # valid_data, test_data = test_data.split(split_ratio=VAL_TEST_SPLIT, random_state=random.seed(SEED))

    print('Size of train set: ' + str(len(train_data.examples)))
    print('Size of val / test: ' + str(len(valid_data.examples)))

    '''
    # Try loading in the IMDB dataset to label pos or negative
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

    # Get train/valid split!!
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))
    '''

    # Now we need to build the vocab for our actual data.
    # Here we will use the pre-trained word vectors from "glove.6B.100d".
    TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
    LABEL.build_vocab(train_data)

    # Print stuff for sanity checks
    print('Size of the vocab: ' + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_itr, valid_itr, test_itr = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text))

    return TEXT, train_itr, valid_itr, test_itr
def data_load_without_cv(fname, args, seed=1234, split_ratio=0.9):
    TEXT = Field(sequential=True, tokenize=str.split, batch_first=True, fix_length=56, lower=True)
    LABEL = LabelField(sequential=False, dtype=torch.float)
    FIELDS = [('label', LABEL), ('text', TEXT)]

    dataset = TabularDataset(fname, fields=FIELDS, format='csv', skip_header=True)
    train_dataset, valid_dataset = dataset.split(random_state=random.seed(seed),
                                                 split_ratio=split_ratio)

    TEXT.build_vocab(train_dataset)
    LABEL.build_vocab(train_dataset)

    train_iterator, valid_iterator = BucketIterator.splits((train_dataset, valid_dataset),
                                                           batch_size=args.batch_size,
                                                           device=args.device,
                                                           sort=False,
                                                           shuffle=True)
    return TEXT, train_iterator, valid_iterator
class ReviewsDataset():
    def __init__(self, data_path, train_path):
        ## write the tokenizer
        tokenize = lambda review: review.split()

        ## define your fields; for the ID fields you can use the RawField class
        self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
        self.LABEL = LabelField()
        self.fields = [
            ("PhraseId", None),  # we won't be needing the id, so we pass in None as the field
            ("SentenceId", None),
            ("Phrase", self.TEXT),
            ("Sentiment", self.LABEL)
        ]
        # { 'Phrase': ('r', self.review), 'Sentiment': ('s', self.sentiment) }

        ## set paths
        self.data_path = data_path
        self.train_path = train_path

    def load_data(self):
        self.train_data = TabularDataset.splits(
            path='{}'.format(self.data_path),
            train='{}'.format(self.train_path),
            format='tsv',
            fields=self.fields)[0]

        self.TEXT.build_vocab(self.train_data, max_size=10000, min_freq=1)
        self.LABEL.build_vocab(self.train_data)

        self.train_iterator, _ = BucketIterator.splits(
            (self.train_data, None),
            batch_sizes=(64, 64),
            sort_within_batch=True,
            sort_key=lambda x: len(x.Phrase))

    def __str__(self):
        # examples expose their fields under the registered names 'Phrase' and 'Sentiment'
        example = self.train_data[0]
        return 'review: {} \n sentiment: {}'.format(example.Phrase, example.Sentiment)
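# Usage sketch for ReviewsDataset (not part of the original snippet; the paths are hypothetical).
reviews = ReviewsDataset(data_path='data', train_path='train.tsv')
reviews.load_data()
for batch in reviews.train_iterator:
    phrases = batch.Phrase        # shape (seq_len, batch_size) since TEXT is not batch_first
    sentiments = batch.Sentiment  # LongTensor of label indices
    break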
def pre_process_text():
    ID = Field(sequential=False, use_vocab=False)
    # CATEGORY: a non-sequential label; use_vocab=True builds a vocabulary,
    # is_target=True marks it as the target variable
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    # NEWS: sequential text; tokenize with jieba.lcut and return the original
    # sentence lengths so the RNN can use them
    NEWS = Field(sequential=True, tokenize=jieba.lcut, include_lengths=True)
    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    # load the data
    train_data = TabularDataset(
        os.path.join('data', 'train.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'}
    )
    valid_data = TabularDataset(
        os.path.join('data', 'dev.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'}
    )
    test_data = TabularDataset(
        os.path.join('data', 'test.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'}
    )

    # build the vocabularies
    NEWS.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    CATEGORY.build_vocab(train_data)

    return CATEGORY, NEWS, train_data, valid_data, test_data
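# Possible follow-up (not in the original): wrap the returned datasets in BucketIterators,
# assuming BucketIterator and torch are imported. Because NEWS uses include_lengths=True,
# batch.news unpacks into the padded token ids plus the original lengths.
CATEGORY, NEWS, train_data, valid_data, test_data = pre_process_text()
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.news),
    sort_within_batch=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
for batch in train_iter:
    news, news_lengths = batch.news
    categories = batch.category
    break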
if __name__ == "__main__": text_field = Field(use_vocab=False, tokenize=tokenize_and_trunc, preprocessing=tokenizer.convert_tokens_to_ids, batch_first=True, init_token=init_token_idx, eos_token=eos_token_idx, pad_token=pad_token_idx, unk_token=unk_token_idx) label_field = LabelField() train_data, test_data = IMDB.splits(text_field, label_field) train_data, valid_data = train_data.split() label_field.build_vocab(train_data) n_epochs = 5 batch_size = 128 rnn_hidden_size = 256 dropout_p = 0.2 num_classes = len(label_field.vocab) device = 'cuda:0' if torch.cuda.is_available() else 'cpu' model = BertGRU(bert.config.to_dict()['dim'], rnn_hidden_size, num_classes=num_classes, dropout_p=dropout_p) for name, params in model.named_parameters(): if name.startswith('embedding_layer'): params.requires_grad = False
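# Sketch (not from the original snippet) of building iterators for the BERT-tokenized IMDB splits
# above, assuming BucketIterator is imported from torchtext.data. text_field uses use_vocab=False
# with convert_tokens_to_ids, so only label_field needs a vocabulary (built above); the text is
# already numericalized by the tokenizer.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)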
class SequenceDataLoader(CommonDataLoader):
    def __init__(self, data_config):
        super(SequenceDataLoader, self).__init__(data_config)
        self._config = data_config
        self._tool = Tool()
        self.__build_field()
        self._load_data()

    def __build_field(self):
        self.TEXT = Field(sequential=True, use_vocab=True, lower=True, tokenize=tokenizer,
                          include_lengths=True, batch_first=self._config.data.batch_first,
                          pad_token='[PAD]', unk_token='[UNK]')
        # self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True,
        #                  batch_first=self._config.data.batch_first)
        # note: LabelField internally overrides sequential to False, so the sequential/tokenize
        # arguments below are effectively ignored and the tag column is treated as a single label
        self.TAG = LabelField(
            sequential=True,
            use_vocab=True,
            tokenize=tokenizer,
            is_target=True,
        )
        self._fields = [('text', self.TEXT), ('tag', self.TAG)]

    @timeit
    def _load_data(self):
        self.train_data = EmoDataset(path=self._config.data.train_path, fields=self._fields,
                                     file='train', config=self._config)
        self.valid_data = EmoDataset(path=self._config.data.valid_path, fields=self._fields,
                                     file='valid', config=self._config)
        self.test_data = EmoDataset(path=self._config.data.test_path, fields=self._fields,
                                    file='test', config=self._config)
        self.__build_vocab(self.train_data, self.valid_data, self.test_data)
        self.__build_iterator(self.train_data, self.valid_data, self.test_data)

    def __build_vocab(self, *dataset):
        """
        :param dataset: train_data, valid_data, test_data
        :return: text_vocab, tag_vocab
        """
        if self._config.pretrained_models.is_use:
            vocabs = self._tool.get_vocab_list(self._config.data.vocab_path)
            v = Vocab(vocabs, specials=['[PAD]', '[UNK]'])
            self.TEXT.build_vocab(
                vocabs,
                max_size=30000,
                min_freq=1,
                vectors=None,  # set vectors to None to skip loading word embeddings
            )
        else:
            self.TEXT.build_vocab(*dataset)
        self.TAG.build_vocab(*dataset)
        self.word_vocab = self.TEXT.vocab
        self.tag_vocab = self.TAG.vocab

    def __build_iterator(self, *dataset):
        self._train_iter = BucketIterator(
            dataset[0],
            batch_size=self._config.data.train_batch_size,
            shuffle=True,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=self._config.device)
        self._valid_iter = BucketIterator(
            dataset[1],
            batch_size=self._config.data.train_batch_size,
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=self._config.device)
        self._test_iter = BucketIterator(
            dataset[2],
            batch_size=self._config.data.train_batch_size,
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=self._config.device)

    def load_train(self):
        return self._train_iter

    def load_test(self):
        return self._test_iter

    def load_valid(self):
        return self._valid_iter
class DataLoader(object):
    def __init__(self, data_fields, train_file, valid_file, batch_size, device, skip_header,
                 delimiter, pre_embeddings, vector_cache, min_freq=2, extend_vocab=True,
                 pre_vocab_size=200000, use_pre_embedding=False):
        self.x_field = Field(sequential=True, tokenize=self.word_tokenize, batch_first=True,
                             include_lengths=True)
        self.y_field = LabelField(batch_first=True)
        self.train_fields, self.x_var, self.y_vars = self.parse_fields(data_fields, self.x_field,
                                                                       self.y_field)

        self.train_ds = TabularDataset(train_file, fields=self.train_fields,
                                       skip_header=skip_header, format="csv",
                                       csv_reader_params={"delimiter": delimiter})
        self.valid_ds = TabularDataset(valid_file, fields=self.train_fields,
                                       skip_header=skip_header, format="csv",
                                       csv_reader_params={"delimiter": delimiter})

        self.x_field.build_vocab(self.train_ds, min_freq=min_freq)
        if use_pre_embedding:
            vectors = Vectors(pre_embeddings, vector_cache)
            if extend_vocab:
                self.extend_vocab_with_vectors(self.x_field.vocab, vectors, pre_vocab_size)
            vectors.unk_init = partial(init_unk, vocab_size=len(self.x_field.vocab))
            self.x_field.vocab.load_vectors(vectors)
        self.y_field.build_vocab(self.train_ds)

        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train_ds, self.valid_ds),
            batch_size=batch_size,
            device=device,
            sort=False,
            sort_key=lambda sample: len(getattr(sample, self.x_var)),
            sort_within_batch=False,
            shuffle=True,
            repeat=False,
        )

        self.vocab = self.x_field.vocab
        self.vocab_size = len(self.x_field.vocab)
        self.num_labels = len(self.y_vars)
        self.num_classes = len(self.y_field.vocab)
        self.classes = list(self.y_field.vocab.stoi.values())
        self.unk_token = self.x_field.unk_token
        self.pad_token = self.x_field.pad_token
        self.unk_idx = self.x_field.vocab.stoi[self.unk_token]
        self.pad_idx = self.x_field.vocab.stoi[self.pad_token]
        self.train_wrapper = BatchWrapper(self.train_iter, self.x_var, self.y_vars)
        self.valid_wrapper = BatchWrapper(self.valid_iter, self.x_var, self.y_vars)

    @staticmethod
    def word_tokenize(text):
        text = pretreatment(text)
        return jieba.lcut(text)

    @staticmethod
    def char_tokenize(text):
        text = pretreatment(text)
        return list(text)

    @staticmethod
    def parse_fields(data_fields, x_field, y_field):
        train_fields, x_var, y_vars = [], None, []
        for field_name, var_type in data_fields.items():
            if var_type == "x":
                x_var = field_name
                train_fields.append((field_name, x_field))
            elif var_type == "y":
                y_vars.append(field_name)
                train_fields.append((field_name, y_field))
            else:
                train_fields.append((field_name, None))
        return train_fields, x_var, y_vars

    @staticmethod
    def extend_vocab_with_vectors(vocab, vectors, vocab_size):
        # only append words that are not already in the vocabulary
        for word in list(vectors.stoi.keys())[:vocab_size]:
            if word not in vocab.stoi:
                vocab.itos.append(word)
                vocab.stoi[word] = len(vocab.itos) - 1
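# BatchWrapper is referenced above but not defined in this snippet; a common minimal version looks
# like the sketch below (an assumption, not necessarily the author's implementation; requires torch).
# It yields (x, y) pairs, stacking the label fields listed in y_vars into one tensor.
class BatchWrapper:
    def __init__(self, iterator, x_var, y_vars):
        self.iterator, self.x_var, self.y_vars = iterator, x_var, y_vars

    def __iter__(self):
        for batch in self.iterator:
            x = getattr(batch, self.x_var)  # (token_ids, lengths) because include_lengths=True
            y = torch.stack([getattr(batch, y_var) for y_var in self.y_vars], dim=-1)
            yield x, y

    def __len__(self):
        return len(self.iterator)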
import warnings
warnings.filterwarnings('ignore')

TEXT = Field(sequential=True, lower=True, include_lengths=True)  # text field
LABEL = LabelField(dtype=torch.float)  # label field

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train, test = datasets.IMDB.splits(TEXT, LABEL)  # load the dataset
train, valid = train.split(random_state=random.seed(SEED))  # split off a validation set

TEXT.build_vocab(train)
LABEL.build_vocab(train)

device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test),
    batch_size=64,
    sort_within_batch=True,
    device=device)


def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc
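# Sketch of an evaluation step that uses binary_accuracy (not from the original; `model` and
# `criterion` are placeholders). Since TEXT was built with include_lengths=True, batch.text is a
# (token_ids, lengths) tuple.
def evaluate_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss, epoch_acc = 0.0, 0.0
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += binary_accuracy(predictions, batch.label).item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)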
def _create_serialized_20newsgroups_iterator(args): r""" Creates a serialized 20 newsgroups dataset :param args: Test setup information """ p_cls = {cls_id for cls_grp in args.pos for cls_id in cls_grp.value} n_cls = {cls_id for cls_grp in args.neg for cls_id in cls_grp.value} complete_train = _download_20newsgroups("train", p_cls, n_cls) tokenizer = nltk.tokenize.word_tokenize # noinspection PyPep8Naming TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, fix_length=args.seq_len) # noinspection PyPep8Naming LABEL = LabelField(sequential=False) complete_ds = _bunch_to_ds(complete_train, TEXT, LABEL) cache_dir = DATA_DIR / "vector_cache" cache_dir.mkdir(parents=True, exist_ok=True) TEXT.build_vocab(complete_ds, min_freq=2, vectors=torchtext.vocab.GloVe(name="6B", dim=args.embed_dim, cache=cache_dir)) size_scalar = 1 + VALIDATION_FRAC p_bunch, u_bunch = _select_bunch_uar(int(args.size_p * size_scalar), complete_train, p_cls, remove_from_bunch=False) n_bunch, u_bunch = _select_negative_bunch(int(args.size_n * size_scalar), u_bunch, n_cls, args.bias, remove_from_bunch=False) u_bunch = _reduce_to_fixed_size(u_bunch, new_size=int(args.size_u * size_scalar)) test_bunch = _download_20newsgroups("test", p_cls, n_cls) for name, bunch in (("P", p_bunch), ("N", n_bunch), ("U", u_bunch), ("Test", test_bunch)): _log_category_frequency(args.pos, name, bunch) # Binarize the labels for bunch in (p_bunch, u_bunch, n_bunch, test_bunch): _configure_binary_labels(bunch, pos_cls=p_cls, neg_cls=n_cls) # Sanity check assert np.all(p_bunch[LABEL_COL] == POS_LABEL), "Negative example in positive (labeled) set" assert len(p_bunch[LABEL_COL]) == int(args.size_p * size_scalar), \ "Positive set has wrong number of examples" assert np.all(n_bunch[LABEL_COL] == NEG_LABEL), "Positive example in negative (labeled) set" assert len(n_bunch[LABEL_COL]) == int(args.size_n * size_scalar), \ "Negative set has wrong number of examples" assert len(u_bunch[LABEL_COL]) == int(args.size_u * size_scalar), \ "Unlabeled set has wrong number of examples" ng_data = NewsgroupsSerial(text=TEXT, label=LABEL) full_train_ds = _build_train_set(p_bunch, u_bunch, n_bunch, TEXT, LABEL) split_ratio = 1 / (1 + VALIDATION_FRAC) ng_data.train, ng_data.valid = full_train_ds.split(split_ratio, stratified=True) ng_data.unlabel = _bunch_to_ds(u_bunch, TEXT, LABEL) ng_data.test = _bunch_to_ds(test_bunch, TEXT, LABEL) tot_unlabel_size = args.size_p + args.size_n + args.size_u assert len(ng_data.train.examples ) == tot_unlabel_size, "Train dataset is wrong size" LABEL.build_vocab(ng_data.train, ng_data.test) ng_data.dump(args)
def test_multinli(self): batch_size = 4 # create fields TEXT = ParsedTextField() TREE = ShiftReduceField() GENRE = LabelField() LABEL = LabelField() # create train/val/test splits train, val, test = MultiNLI.splits(TEXT, LABEL, TREE, GENRE) # check all are MultiNLI datasets assert type(train) == type(val) == type(test) == MultiNLI # check all have correct number of fields assert len(train.fields) == len(val.fields) == len(test.fields) == 6 # check fields are the correct type assert type(train.fields['premise']) == ParsedTextField assert type(train.fields['premise_transitions']) == ShiftReduceField assert type(train.fields['hypothesis']) == ParsedTextField assert type(train.fields['hypothesis_transitions']) == ShiftReduceField assert type(train.fields['label']) == LabelField assert type(train.fields['genre']) == LabelField assert type(val.fields['premise']) == ParsedTextField assert type(val.fields['premise_transitions']) == ShiftReduceField assert type(val.fields['hypothesis']) == ParsedTextField assert type(val.fields['hypothesis_transitions']) == ShiftReduceField assert type(val.fields['label']) == LabelField assert type(val.fields['genre']) == LabelField assert type(test.fields['premise']) == ParsedTextField assert type(test.fields['premise_transitions']) == ShiftReduceField assert type(test.fields['hypothesis']) == ParsedTextField assert type(test.fields['hypothesis_transitions']) == ShiftReduceField assert type(test.fields['label']) == LabelField assert type(test.fields['genre']) == LabelField # check each is the correct length assert len(train) == 392702 assert len(val) == 9815 assert len(test) == 9832 # build vocabulary TEXT.build_vocab(train) LABEL.build_vocab(train) GENRE.build_vocab(train) # ensure vocabulary has been created assert hasattr(TEXT, 'vocab') assert hasattr(TEXT.vocab, 'itos') assert hasattr(TEXT.vocab, 'stoi') # create iterators train_iter, val_iter, test_iter = Iterator.splits( (train, val, test), batch_size=batch_size) # get a batch to test batch = next(iter(train_iter)) # split premise and hypothesis from tuples to tensors premise, premise_transitions = batch.premise hypothesis, hypothesis_transitions = batch.hypothesis label = batch.label genre = batch.genre # check each is actually a tensor assert type(premise) == torch.Tensor assert type(premise_transitions) == torch.Tensor assert type(hypothesis) == torch.Tensor assert type(hypothesis_transitions) == torch.Tensor assert type(label) == torch.Tensor assert type(genre) == torch.Tensor # check have the correct batch dimension assert premise.shape[-1] == batch_size assert premise_transitions.shape[-1] == batch_size assert hypothesis.shape[-1] == batch_size assert hypothesis_transitions.shape[-1] == batch_size assert label.shape[-1] == batch_size assert genre.shape[-1] == batch_size # repeat the same tests with iters instead of split train_iter, val_iter, test_iter = MultiNLI.iters(batch_size=batch_size, trees=True) # split premise and hypothesis from tuples to tensors premise, premise_transitions = batch.premise hypothesis, hypothesis_transitions = batch.hypothesis label = batch.label # check each is actually a tensor assert type(premise) == torch.Tensor assert type(premise_transitions) == torch.Tensor assert type(hypothesis) == torch.Tensor assert type(hypothesis_transitions) == torch.Tensor assert type(label) == torch.Tensor # check have the correct batch dimension assert premise.shape[-1] == batch_size assert premise_transitions.shape[-1] == batch_size assert hypothesis.shape[-1] == batch_size assert 
hypothesis_transitions.shape[-1] == batch_size assert label.shape[-1] == batch_size # remove downloaded multinli directory shutil.rmtree('.data/multinli')
def test_xnli(self): batch_size = 4 # create fields TEXT = Field() GENRE = LabelField() LABEL = LabelField() LANGUAGE = LabelField() # create val/test splits, XNLI does not have a test set val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE) # check both are XNLI datasets assert type(val) == type(test) == XNLI # check all have the correct number of fields assert len(val.fields) == len(test.fields) == 5 # check fields are the correct type assert type(val.fields['premise']) == Field assert type(val.fields['hypothesis']) == Field assert type(val.fields['label']) == LabelField assert type(val.fields['genre']) == LabelField assert type(val.fields['language']) == LabelField assert type(test.fields['premise']) == Field assert type(test.fields['hypothesis']) == Field assert type(test.fields['label']) == LabelField assert type(test.fields['genre']) == LabelField assert type(test.fields['language']) == LabelField # check each is the correct length assert len(val) == 37350 assert len(test) == 75150 # build vocabulary TEXT.build_vocab(val) LABEL.build_vocab(val) GENRE.build_vocab(val) LANGUAGE.build_vocab(val) # ensure vocabulary has been created assert hasattr(TEXT, 'vocab') assert hasattr(TEXT.vocab, 'itos') assert hasattr(TEXT.vocab, 'stoi') # create iterators val_iter, test_iter = Iterator.splits((val, test), batch_size=batch_size) # get a batch to test batch = next(iter(val_iter)) # split premise and hypothesis from tuples to tensors premise = batch.premise hypothesis = batch.hypothesis label = batch.label genre = batch.genre language = batch.language # check each is actually a tensor assert type(premise) == torch.Tensor assert type(hypothesis) == torch.Tensor assert type(label) == torch.Tensor assert type(genre) == torch.Tensor assert type(language) == torch.Tensor # check have the correct batch dimension assert premise.shape[-1] == batch_size assert hypothesis.shape[-1] == batch_size assert label.shape[-1] == batch_size assert genre.shape[-1] == batch_size assert language.shape[-1] == batch_size # xnli cannot use the iters method, ensure raises error with self.assertRaises(NotImplementedError): val_iter, test_iter = XNLI.iters(batch_size=batch_size) # remove downloaded xnli directory shutil.rmtree('.data/xnli')
def main(): parser = argparse.ArgumentParser() parser.add_argument( '--model', type=str, default='rnn', help= "Available models are: 'rnn', 'cnn', 'bilstm', 'fasttext', and 'distilbert'\nDefault is 'rnn'" ) parser.add_argument('--train_data_path', type=str, default="./data/train_clean.csv", help="Path to the training data") parser.add_argument('--test_data_path', type=str, default="./data/dev_clean.csv", help="Path to the test data") parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--vectors', type=str, default='fasttext.simple.300d', help=""" Pretrained vectors: Visit https://github.com/pytorch/text/blob/9ce7986ddeb5b47d9767a5299954195a1a5f9043/torchtext/vocab.py#L146 for more """) parser.add_argument('--max_vocab_size', type=int, default=750) parser.add_argument('--batch_size', type=int, default=32) parser.add_argument('--bidirectional', type=bool, default=True) parser.add_argument('--dropout', type=float, default=0.5) parser.add_argument('--hidden_dim', type=int, default=64) parser.add_argument('--output_dim', type=int, default=1) parser.add_argument('--n_layers', type=int, default=2) parser.add_argument('--lr', type=float, default=1e-3) parser.add_argument('--n_epochs', type=int, default=5) parser.add_argument('--n_filters', type=int, default=100) parser.add_argument('--filter_sizes', type=list, default=[3, 4, 5]) args = parser.parse_args() torch.manual_seed(args.seed) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') ########## BILSTM ########## if args.model == "bilstm": print('\nBiLSTM') TEXT = Field(tokenize='spacy') LABEL = LabelField(dtype=torch.float) data_fields = [("text", TEXT), ("label", LABEL)] train_data = TabularDataset(args.train_data_path, format='csv', fields=data_fields, skip_header=True, csv_reader_params={'delimiter': ","}) test_data = TabularDataset(args.test_data_path, format='csv', fields=data_fields, skip_header=True, csv_reader_params={'delimiter': ","}) train_data, val_data = train_data.split(split_ratio=0.8, random_state=random.seed( args.seed)) TEXT.build_vocab(train_data, max_size=args.max_vocab_size, vectors=args.vectors, unk_init=torch.Tensor.normal_) LABEL.build_vocab(train_data) train_iterator, valid_iterator, test_iterator = BucketIterator.splits( (train_data, val_data, test_data), batch_size=args.batch_size, sort_key=lambda x: len(x.text), device=device) input_dim = len(TEXT.vocab) embedding_dim = get_embedding_dim(args.vectors) pad_idx = TEXT.vocab.stoi[TEXT.pad_token] unk_idx = TEXT.vocab.stoi[TEXT.unk_token] model = BiLSTM(input_dim, embedding_dim, args.hidden_dim, args.output_dim, args.n_layers, args.bidirectional, args.dropout, pad_idx) pretrained_embeddings = TEXT.vocab.vectors model.embedding.weight.data.copy_(pretrained_embeddings) model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim) model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim) optimizer = optim.Adam(model.parameters(), lr=args.lr) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) best_valid_loss = float('inf') print("\nTraining...") print("===========") for epoch in range(1, args.n_epochs + 1): start_time = time.time() train_loss, train_acc = train(model, train_iterator, optimizer, criterion) valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) end_time = time.time() epoch_mins, epoch_secs = epoch_time(start_time, end_time) if valid_loss < best_valid_loss: best_valid_loss = valid_loss torch.save(model.state_dict(), './checkpoints/{}-model.pt'.format(args.model)) print( 
f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s' ) print( f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%' ) print( f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%' ) model.load_state_dict( torch.load('./checkpoints/{}-model.pt'.format(args.model))) test_loss, test_acc = evaluate(model, test_iterator, criterion) print('\nEvaluating...') print("=============") print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%' ) # Test Loss: 0.139, Test Acc: 95.27% ########## VANILLA RNN ########## else: print('\nVanilla RNN') TEXT = Field(tokenize='spacy') LABEL = LabelField(dtype=torch.float) data_fields = [("text", TEXT), ("label", LABEL)] train_data = TabularDataset(args.train_data_path, format='csv', fields=data_fields, skip_header=True, csv_reader_params={'delimiter': ","}) test_data = TabularDataset(args.test_data_path, format='csv', fields=data_fields, skip_header=True, csv_reader_params={'delimiter': ","}) train_data, val_data = train_data.split(split_ratio=0.8, random_state=random.seed( args.seed)) TEXT.build_vocab(train_data, max_size=args.max_vocab_size, vectors=args.vectors) LABEL.build_vocab(train_data) train_iterator, valid_iterator, test_iterator = BucketIterator.splits( (train_data, val_data, test_data), batch_size=args.batch_size, sort_key=lambda x: len(x.text), device=device) input_dim = len(TEXT.vocab) embedding_dim = get_embedding_dim(args.vectors) model = RNN(input_dim, embedding_dim, args.hidden_dim, args.output_dim) pretrained_embeddings = TEXT.vocab.vectors model.embedding.weight.data.copy_(pretrained_embeddings) optimizer = optim.Adam(model.parameters(), lr=args.lr) criterion = nn.BCEWithLogitsLoss() model.to(device) criterion.to(device) best_valid_loss = float('inf') print("\nTraining...") print("===========") for epoch in range(1, args.n_epochs + 1): start_time = time.time() train_loss, train_acc = train(model, train_iterator, optimizer, criterion) valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) end_time = time.time() epoch_mins, epoch_secs = epoch_time(start_time, end_time) if valid_loss < best_valid_loss: best_valid_loss = valid_loss torch.save(model.state_dict(), './checkpoints/{}-model.pt'.format(args.model)) print( f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s' ) print( f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%' ) print( f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%' ) model.load_state_dict( torch.load('./checkpoints/{}-model.pt'.format(args.model))) test_loss, test_acc = evaluate(model, test_iterator, criterion) print('\nEvaluating...') print("=============") print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%' ) # Test Loss: 0.138, Test Acc: 95.05%
("doc_text", TEXT), ("label", LABEL)] train_data = Dataset(torch_examples, fields_dataset) save_examples(train_data, "../traindata.json") exit(0) else: TEXT = Field(tokenize=tokenize_en, batch_first=True, include_lengths=True) LABEL = LabelField(dtype=torch.float, batch_first=True) fields_dataset = [("query_title", TEXT), ("query_description", TEXT), ("doc_text", TEXT), ("label", LABEL)] train_data = Dataset( load_examples("../traindata.json", fields_dataset), fields_dataset) print("build_vocabulary...") TEXT.build_vocab(train_data, min_freq=1, vectors="glove.6B.300d") LABEL.build_vocab(train_data) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("build_iterator...") train_iterator, vaild_iterator = BucketIterator.splits( (train_data, train_data), batch_size=64, sort_key=lambda x: len(x.doc_text), sort_within_batch=False, device=device) size_of_vocab = len(TEXT.vocab) embedding_dim = 300 num_hidden_nodes = 128 num_layers = 2 num_output_nodes = 1 dropout = 0.2
# load data from json train_data, val_data, test_data = TabularDataset.splits( path="/home/CE/skrjanec/data_seg_all/" + SCENARIO + "/join", train="train_line.json", validation="val_line.json", test="val_line.json", format="json", fields=fields) # started with glove.6B.50d # next fasttext.en.300d segment_text.build_vocab(train_data, max_size=100000, vectors=VECTORS, unk_init=torch.Tensor.normal_) gold_event.build_vocab(train_data) # counts and frequency of classes in the train set # gold_event.vocab.freqs is a Counter object # divide every count with the largest count to get the weight for class_i # other options for weight calculation https://discuss.pytorch.org/t/what-is-the-weight-values-mean-in-torch-nn-crossentropyloss/11455/10 print("class count in train data", gold_event.vocab.freqs) count_max = max(gold_event.vocab.freqs.values()) # the weights should be a torch tensor weights = [] weights2 = [] weights3 = [] for lbl, count in gold_event.vocab.freqs.items(): weights.append(count_max / count) weights2.append(1 / count)
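# Sketch (not from the original): turning the per-class counts into a weight tensor for a weighted
# loss. The weights must be ordered by label index (gold_event.vocab.itos), not by the Counter's
# iteration order; this assumes gold_event is a LabelField, so itos contains only real labels.
class_weights = torch.tensor(
    [count_max / max(gold_event.vocab.freqs[label], 1) for label in gold_event.vocab.itos],
    dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)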
def load_dataset(batch_size, test_sen=None): office_actions = pd.read_csv('../data/office_actions.csv', usecols=['app_id', 'ifw_number', 'rejection_102', 'rejection_103'], nrows=100000) abstractList = [] idList = [] rejectionColumn = [] for num in range(10000): app_id = str(office_actions.app_id[num]) filename = "../json_files/oa_"+app_id+".json" try: jfile = open(filename, 'r') except FileNotFoundError: print("File Not Found") continue parsed_json = json.load(jfile) jfile.close() n = int(office_actions.rejection_102[num]) o = int(office_actions.rejection_103[num]) if n == 0 and o == 0: rejType = 0 #neither elif n == 0 and o == 1: rejType = 1 #obvious elif n == 1 and o == 0: rejType = 0 #novelty elif n == 1 and o == 1: rejType = 1 #both else: print("Office action error:", sys.exc_info()[0]) raise if rejType == 1 and rand(1) < 0.758: continue try: abstractList.append(parsed_json[0]['abstract_full']) idList.append(parsed_json[0]['application_number']) except IndexError: print("WARNING: file "+filename+" is empty!\n") continue rejectionColumn.append(rejType) all_data = {'text': abstractList, 'label': rejectionColumn} df = pd.DataFrame(all_data, index = idList) tokenize = lambda x: x.split() TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200) LABEL = LabelField(sequential=False) #fields={'Abstract': ('text', TEXT), 'RejectionType': ('labels', LABEL)} fields={'text': TEXT, 'label': LABEL} ds = DataFrameDataset(df, fields) TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300)) LABEL.build_vocab(ds) train_data, test_data = ds.split() train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data word_embeddings = TEXT.vocab.vectors print ("Length of Text Vocabulary: " + str(len(TEXT.vocab))) print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size()) print ("Label Length: " + str(len(LABEL.vocab))) train_iter, valid_iter, test_iter = BucketIterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True) vocab_size = len(TEXT.vocab) return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
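# DataFrameDataset is used above but not defined in this snippet; a minimal version along these
# lines is commonly used (a sketch, not necessarily the original implementation; assumes
# `from torchtext.data import Dataset, Example`). It accepts the {'text': TEXT, 'label': LABEL}
# fields dict passed above.
class DataFrameDataset(Dataset):
    """Builds a torchtext Dataset directly from a pandas DataFrame with 'text' and 'label' columns."""

    def __init__(self, df, fields):
        if isinstance(fields, dict):
            fields = list(fields.items())
        examples = [Example.fromlist([row.text, row.label], fields)
                    for row in df.itertuples(index=False)]
        super().__init__(examples, fields)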
def wsd( model_name='bert-base-uncased', #ensemble-distil-1-albert-1 / albert-xxlarge-v2 / bert-base-uncased classifier_input='token-embedding-last-1-layers', # token-embedding-last-layer / token-embedding-last-n-layers classifier_hidden_layers=[], reduce_options=True, freeze_base_model=True, max_len=512, batch_size=32, test=False, lr=5e-5, eps=1e-8, n_epochs=50, cls_token=False, # If true, the cls token is used instead of the relevant-word token cache_embeddings=False, # If true, the embeddings from the base model are saved to disk so that they only need to be computed once save_classifier=True # If true, the classifier part of the network is saved after each epoch, and the training is automatically resumed from this saved network if it exists ): train_path = "wsd_train.txt" test_path = "wsd_test_blind.txt" n_classes = 222 device = 'cuda' import __main__ as main print("Script: " + os.path.basename(main.__file__)) print("Loading base model %s..." % model_name) if model_name.startswith('ensemble-distil-'): last_n_distil = int(model_name.replace('ensemble-distil-', "")[0]) last_n_albert = int(model_name[-1]) from transformers import AlbertTokenizer from transformers.modeling_albert import AlbertModel base_model = AlbertModel.from_pretrained('albert-xxlarge-v2', output_hidden_states=True, output_attentions=False) tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2') print( "Ensemble model with DistilBert last %d layers and Albert last %d layers" % (last_n_distil, last_n_albert)) elif model_name.startswith('distilbert'): tokenizer = DistilBertTokenizer.from_pretrained(model_name) base_model = DistilBertModel.from_pretrained(model_name, num_labels=n_classes, output_hidden_states=True, output_attentions=False) elif model_name.startswith('bert'): from transformers import BertTokenizer, BertModel tokenizer = BertTokenizer.from_pretrained(model_name) base_model = BertModel.from_pretrained(model_name, num_labels=n_classes, output_hidden_states=True, output_attentions=False) elif model_name.startswith('albert'): from transformers import AlbertTokenizer from transformers.modeling_albert import AlbertModel tokenizer = AlbertTokenizer.from_pretrained(model_name) base_model = AlbertModel.from_pretrained(model_name, output_hidden_states=True, output_attentions=False) use_n_last_layers = 1 if classifier_input == 'token-embedding-last-layer': use_n_last_layers = 1 elif classifier_input.startswith( 'token-embedding-last-') and classifier_input.endswith('-layers'): use_n_last_layers = int( classifier_input.replace('token-embedding-last-', "").replace('-layers', "")) else: raise ValueError("Invalid classifier_input argument") print("Using the last %d layers" % use_n_last_layers) def tokenize(str): return tokenizer.tokenize(str)[:max_len - 2] SENSE = LabelField(is_target=True) LEMMA = LabelField() TOKEN_POS = LabelField(use_vocab=False) TEXT = Field(tokenize=tokenize, pad_token=tokenizer.pad_token, init_token=tokenizer.cls_token, eos_token=tokenizer.sep_token) EXAMPLE_ID = LabelField(use_vocab=False) fields = [('sense', SENSE), ('lemma', LEMMA), ('token_pos', TOKEN_POS), ('text', TEXT), ('example_id', EXAMPLE_ID)] def read_data(corpus_file, fields, max_len=None): train_id_start = 0 test_id_start = 76049 # let the ids for the test examples start after the training example indices if corpus_file == "wsd_test_blind.txt": print("Loading test data...") id_start = test_id_start else: print("Loading train/val data...") id_start = train_id_start with open(corpus_file, encoding='utf-8') as f: examples = [] 
for i, line in enumerate(f): sense, lemma, word_position, text = line.split('\t') # We need to convert from the word position to the token position words = text.split() pre_word = " ".join(words[:int(word_position)]) pre_word_tokenized = tokenizer.tokenize(pre_word) token_position = len( pre_word_tokenized ) + 1 # taking into account the later addition of the start token example_id = id_start + i if max_len is None or token_position < max_len - 1: # ignore examples where the relevant token is cut off due to max_len if cls_token: token_position = 0 examples.append( Example.fromlist( [sense, lemma, token_position, text, example_id], fields)) else: print( "Example %d is skipped because the relevant token was cut off (token pos = %d)" % (example_id, token_position)) print(text) return Dataset(examples, fields) dataset = read_data(train_path, fields, max_len) random.seed(0) trn, vld = dataset.split(0.7, stratified=True, strata_field='sense') TEXT.build_vocab([]) if model_name.startswith('albert') or model_name.startswith( 'ensemble-distil-'): class Mapping: def __init__(self, fn): self.fn = fn def __getitem__(self, item): return self.fn(item) TEXT.vocab.stoi = Mapping(tokenizer.sp_model.PieceToId) TEXT.vocab.itos = Mapping(tokenizer.sp_model.IdToPiece) else: TEXT.vocab.stoi = tokenizer.vocab TEXT.vocab.itos = list(tokenizer.vocab) SENSE.build_vocab(trn) LEMMA.build_vocab(trn) trn_iter = BucketIterator(trn, device=device, batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, train=True, sort=True) vld_iter = BucketIterator(vld, device=device, batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, train=False, sort=True) if freeze_base_model: for mat in base_model.parameters(): mat.requires_grad = False # Freeze Bert model so that we only train the classifier on top if reduce_options: lemma_mask = defaultdict( lambda: torch.zeros(len(SENSE.vocab), device=device)) for example in trn: lemma = LEMMA.vocab.stoi[example.lemma] sense = SENSE.vocab.stoi[example.sense] lemma_mask[lemma][sense] = 1 lemma_mask = dict(lemma_mask) def mask( batch_logits, batch_lemmas ): # Masks out the senses that do not belong to the specified lemma for batch_i in range(len(batch_logits)): lemma = batch_lemmas[batch_i].item() batch_logits[batch_i, :] *= lemma_mask[lemma] return batch_logits else: def mask(batch_logits, batch_lemmas): return batch_logits experiment_name = model_name + " " + ( classifier_input if not model_name.startswith('ensemble-distil-') else "") + " " + str(classifier_hidden_layers) + " (" + ( " cls_token" if cls_token else "") + (" reduce_options" if reduce_options else "") + ( " freeze_base_model" if freeze_base_model else "" ) + " ) " + "max_len=" + str(max_len) + " batch_size=" + str( batch_size) + " lr=" + str(lr) + " eps=" + str(eps) + ( " cache_embeddings" if cache_embeddings else "") if model_name.startswith('ensemble-distil-'): model = WSDEnsembleModel(last_n_distil, last_n_albert, n_classes, mask, classifier_hidden_layers) else: model = WSDModel(base_model, n_classes, mask, use_n_last_layers, model_name, classifier_hidden_layers, cache_embeddings) history = None #if save_classifier: # if model.load_classifier(experiment_name): # # Existing saved model loaded # # Also load the corresponding training history # history = read_dict_file("results/"+experiment_name+".txt") model.cuda() print("Starting experiment " + experiment_name) if test: tst = read_data(test_path, fields, max_len=512) tst_iter = Iterator(tst, device=device, batch_size=batch_size, sort=False, 
sort_within_batch=False, repeat=False, train=False) batch_predictions = [] for batch in tst_iter: print('.', end='') sys.stdout.flush() text = batch.text.t() with torch.no_grad(): outputs = model(text, token_positions=batch.token_pos, lemmas=batch.lemma, example_ids=batch.example_id) scores = outputs[-1] batch_predictions.append(scores.argmax(dim=1)) batch_preds = torch.cat(batch_predictions, 0).tolist() predicted_senses = [SENSE.vocab.itos(pred) for pred in batch_preds] with open("test_predictions/" + experiment_name + ".txt", "w") as out: out.write("\n".join(predicted_senses)) else: no_decay = ['bias', 'LayerNorm.weight'] decay = 0.01 optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps) def save_results(history): with open("results/" + experiment_name + ".txt", "w") as out: out.write(str(history)) if save_classifier: if len(history['val_acc']) < 2 or history['val_acc'][-1] > max( history['val_acc'][:-1]): model.save_classifier(experiment_name, best=True) else: model.save_classifier(experiment_name, best=False) train(model, optimizer, trn_iter, vld_iter, n_epochs, save_results, history)
def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) # non-transferrable types primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}] for batch in primitive_objects: data = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert data == batch # batch is just a tensor batch = torch.rand(2, 3) batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor' # tensor list batch = [torch.rand(2, 3), torch.rand(2, 3)] batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch[0].device.index == 0 and batch[0].type() == 'torch.cuda.FloatTensor' assert batch[1].device.index == 0 and batch[1].type() == 'torch.cuda.FloatTensor' # tensor list of lists batch = [[torch.rand(2, 3), torch.rand(2, 3)]] batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor' assert batch[0][1].device.index == 0 and batch[0][1].type() == 'torch.cuda.FloatTensor' # tensor dict batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}] batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch[0]['a'].device.index == 0 and batch[0]['a'].type() == 'torch.cuda.FloatTensor' assert batch[0]['b'].device.index == 0 and batch[0]['b'].type() == 'torch.cuda.FloatTensor' # tuple of tensor list and list of tensor dict batch = ([torch.rand(2, 3) for _ in range(2)], [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)]) batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor' assert batch[1][0]['a'].device.index == 0 assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor' assert batch[1][0]['b'].device.index == 0 assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor' # namedtuple of tensor BatchType = namedtuple('BatchType', ['a', 'b']) batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)] batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch[0].a.device.index == 0 assert batch[0].a.type() == 'torch.cuda.FloatTensor' # non-Tensor that has `.to()` defined class CustomBatchType: def __init__(self): self.a = torch.rand(2, 2) def to(self, *args, **kwargs): self.a = self.a.to(*args, **kwargs) return self batch = trainer.accelerator.batch_to_device(CustomBatchType(), torch.device('cuda:0')) assert batch.a.type() == 'torch.cuda.FloatTensor' # torchtext.data.Batch samples = [{ 'text': 'PyTorch Lightning is awesome!', 'label': 0 }, { 'text': 'Please make it work with torchtext', 'label': 1 }] text_field = Field() label_field = LabelField() fields = {'text': ('text', text_field), 'label': ('label', label_field)} examples = [Example.fromdict(sample, fields) for sample in samples] dataset = Dataset(examples=examples, fields=fields.values()) # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first text_field.build_vocab(dataset) label_field.build_vocab(dataset) batch = Batch(data=examples, dataset=dataset) batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0')) assert batch.text.type() == 'torch.cuda.LongTensor' assert batch.label.type() == 'torch.cuda.LongTensor'
# torchtext datasets fields = [('text', TEXT), ('label', LABEL)] train_ds = TabularDataset(path='', format='csv', fields=fields, skip_header=False) # split train_ds into train and test train_ds, val_ds = train_ds.split(split_ratio=0.7, random_state=random.seed(1)) print(vars(train_ds.examples[0])) # build vocabulary MAX_VOCAB = 30000 TEXT.build_vocab(train_ds, max_size=MAX_VOCAB) LABEL.build_vocab(train_ds) # build iterators device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') train_iter, val_iter = BucketIterator.splits((train_ds, val_ds), batch_size=64, sort_within_batch=True, device=device) # model class BiLSTM(nn.Module): def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_lstm_layers, bidirectional, dropout, word_pad_idx): super(BiLSTM, self).__init__() self.emb = nn.Embedding(num_embeddings=input_dim,
def load_dataset(batch_size, cache_data=True, test_sen=None): if cache_data: print("Caching Data") office_actions = pd.read_csv( '../data/office_actions.csv', index_col='app_id', usecols=['app_id', 'rejection_102', 'rejection_103'], dtype={ 'app_id': int, 'rejection_102': int, 'rejection_103': int }, nrows=200000) abstractList = [] idList = [] rejectionColumn = [] obviousCount = 0 notCount = 0 path = "/scratch/dm4350/json_files/" count = 0 for filename in os.listdir(path): if count % 1000 == 0: print(count) filepath = path + filename try: jfile = open(filepath, 'r') except FileNotFoundError: print("File Not Found") continue try: parsed_json = json.load(jfile) jfile.close() except UnicodeDecodeError: print("WARNING: UnicodeDecodeError") continue except json.decoder.JSONDecodeError: print("WARNING: JSONDecodeError") continue app_id = int( filename.replace("oa_", "").replace(".json", "").replace("(1)", "")) try: row = office_actions.loc[app_id] except KeyError: print("WARNING: KeyError") continue try: n = int(row.rejection_102) o = int(row.rejection_103) except TypeError: n = int(row.rejection_102.iloc[0]) o = int(row.rejection_103.iloc[0]) if n == 0 and o == 0: rejType = 0 #neither elif n == 0 and o == 1: rejType = 1 #obvious elif n == 1 and o == 0: rejType = 0 #novelty elif n == 1 and o == 1: rejType = 1 #both else: print("Office actions dataframe error:", sys.exc_info()[0]) raise if obviousCount >= notCount and rejType == 1: continue obviousCount += o notCount += not (o) # Skip any files not in the appropriate IPC class try: found_A61 = False for s in parsed_json[0]['ipc_classes']: if (s.find("A61") != -1): found_A61 = True if not found_A61: continue except: print("WARNING: file " + filepath + " is empty!\n") continue # Read in data from json file if it exists try: a = parsed_json[0]['abstract_full'] i = parsed_json[0]['application_number'] except IndexError: print("WARNING: file " + filepath + " is empty!\n") continue except KeyError: print("WARNING: file " + filepath + " is empty!\n") continue abstractList.append(a) idList.append(i) rejectionColumn.append(rejType) count += 1 #if count > 2000: break df = pd.DataFrame({ 'text': abstractList, 'label': rejectionColumn }, index=idList) print("{} files loaded".format(count)) df.to_pickle('./data_cache/abstracts_df_A61.pkl') # with open("data_cache/TEXT.Field","wb")as f: # dill.dump(TEXT,f) # with open("data_cache/LABEL.Field","wb")as f: # dill.dump(LABEL,f) else: print('Loading Dataset from Cache') df = pd.read_pickle('./data_cache/abstracts_df_A61.pkl') # with open("data_cache/TEXT.Field","rb")as f: # TEXT=dill.load(f) # with open("data_cache/LABEL.Field","rb")as f: # LABEL=dill.load(f) tokenize = lambda x: x.split() TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200) LABEL = LabelField(sequential=False) fields = {'text': TEXT, 'label': LABEL} ds = DataFrameDataset(df, fields) TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300)) LABEL.build_vocab(ds) train_data, test_data = ds.split() train_data, valid_data = train_data.split( ) # Further splitting of training_data to create new training_data & validation_data word_embeddings = TEXT.vocab.vectors print("Length of Text Vocabulary: " + str(len(TEXT.vocab))) print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size()) print("Label Length: " + str(len(LABEL.vocab))) train_iter, valid_iter, test_iter = BucketIterator.splits( (train_data, valid_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.text), 
repeat=False, shuffle=True) vocab_size = len(TEXT.vocab) return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
def load_data(train_file, test_file, pretrain=None, save_dir=None): assert os.path.exists(train_file), f"{train_file} is not exist!" assert os.path.exists(test_file), f"{test_file} is not exist!" print("=" * 30 + "DATASET LOADER" + "=" * 30) sent_field = Field(tokenize=lambda x: x.split(), unk_token='<unk>', pad_token='<pad>', init_token=None, eos_token=None) doc_field = NestedField(sent_field, tokenize=sent_tokenize, pad_token='<pad>', init_token=None, eos_token=None, include_lengths=True) label_field = LabelField() fields = [("raw", RawField()), ("doc", doc_field), ("label", label_field)] print(f"Reading {train_file} ...") with open(train_file, "r", encoding="utf-8") as reader: lines = reader.readlines() examples = [] for line in lines: text, label = line.split('\t') examples.append( Example.fromlist([text, text.lower(), label], fields)) train_dataset = Dataset(examples, fields) reader.close() print(f"\tNum of train examples: {len(examples)}") print(f"Reading {test_file} ...") with open(test_file, "r", encoding="utf-8") as reader: lines = reader.readlines() examples = [] for line in lines: text, label = line.split('\t') examples.append( Example.fromlist([text, text.lower(), label], fields)) test_dataset = Dataset(examples, fields) reader.close() print(f"\tNum of valid examples: {len(examples)}") vectors = FastText('vi') doc_field.build_vocab(train_dataset, test_dataset, vectors=vectors) label_field.build_vocab(train_dataset, test_dataset) print(f"Building vocabulary ...") num_vocab = len(doc_field.vocab) num_classes = len(label_field.vocab) pad_idx = doc_field.vocab.stoi['<pad>'] print(f"\tNum of vocabulary: {num_vocab}") print(f"\tNum of classes: {num_classes}") if save_dir: with open(save_dir + "/vocab.json", "w", encoding="utf-8") as fv: vocabs = { "word": doc_field.vocab.stoi, "class": label_field.vocab.itos, 'pad_idx': pad_idx } json.dump(vocabs, fv) fv.close() with open(save_dir + "/fileds.json", "w", encoding="utf-8") as ff: field_vocabs = { "doc": doc_field.vocab.freqs, "label": label_field.vocab.freqs } json.dump(field_vocabs, ff) ff.close() print("=" * 73) return train_dataset, test_dataset, num_vocab, num_classes, pad_idx, vectors.vectors
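# Possible follow-up (not in the original): iterators over the returned hierarchical datasets,
# assuming BucketIterator and torch are imported and the file names below are placeholders.
# Because doc_field is a NestedField with include_lengths=True, batch.doc is a tuple containing
# the padded (batch, n_sents, n_words) tensor together with length information.
train_dataset, test_dataset, num_vocab, num_classes, pad_idx, word_vectors = load_data(
    "train.txt", "test.txt")
train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=32,
    sort_key=lambda x: len(x.doc),
    sort_within_batch=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))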
def prepare_dataset(self, name='adult'):
    # FastMNIST, FastCIFAR10, get_train_valid_indices, powerlaw and
    # split_torchtext_dataset_ratios are helpers defined elsewhere in this project.
    if name == 'adult':
        from utils.load_adult import get_train_test
        from utils.Custom_Dataset import Custom_Dataset
        import torch

        train_data, train_target, test_data, test_target = get_train_test()

        X_train = torch.tensor(train_data.values, requires_grad=False).float()
        y_train = torch.tensor(train_target.values, requires_grad=False).long()
        X_test = torch.tensor(test_data.values, requires_grad=False).float()
        y_test = torch.tensor(test_target.values, requires_grad=False).long()

        print("X train shape: ", X_train.shape)
        print("y train shape: ", y_train.shape)
        pos, neg = (y_train == 1).sum().item(), (y_train == 0).sum().item()
        print("Train set Positive counts: {}".format(pos),
              "Negative counts: {}.".format(neg),
              'Split: {:.2%} - {:.2%}'.format(1. * pos / len(X_train), 1. * neg / len(X_train)))
        print("X test shape: ", X_test.shape)
        print("y test shape: ", y_test.shape)
        pos, neg = (y_test == 1).sum().item(), (y_test == 0).sum().item()
        print("Test set Positive counts: {}".format(pos),
              "Negative counts: {}.".format(neg),
              'Split: {:.2%} - {:.2%}'.format(1. * pos / len(X_test), 1. * neg / len(X_test)))

        train_indices, valid_indices = get_train_valid_indices(
            len(X_train), self.train_val_split_ratio, self.sample_size_cap)

        train_set = Custom_Dataset(X_train[train_indices], y_train[train_indices], device=self.device)
        validation_set = Custom_Dataset(X_train[valid_indices], y_train[valid_indices], device=self.device)
        test_set = Custom_Dataset(X_test, y_test, device=self.device)

        return train_set, validation_set, test_set

    elif name == 'mnist':
        train = FastMNIST('datasets/MNIST', train=True, download=True)
        test = FastMNIST('datasets/MNIST', train=False, download=True)

        train_indices, valid_indices = get_train_valid_indices(
            len(train), self.train_val_split_ratio, self.sample_size_cap)

        from utils.Custom_Dataset import Custom_Dataset
        train_set = Custom_Dataset(train.data[train_indices], train.targets[train_indices], device=self.device)
        validation_set = Custom_Dataset(train.data[valid_indices], train.targets[valid_indices], device=self.device)
        test_set = Custom_Dataset(test.data, test.targets, device=self.device)
        del train, test

        return train_set, validation_set, test_set

    elif name == 'cifar10':
        # Unused torchvision transforms kept from the original for reference:
        # transform_train = transforms.Compose([
        #     transforms.RandomCrop(32, padding=4),
        #     transforms.RandomHorizontalFlip(),
        #     transforms.ToTensor(),
        #     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        # ])
        # transform_test = transforms.Compose([
        #     transforms.ToTensor(),
        #     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
        # ])
        train = FastCIFAR10('datasets/cifar', train=True, download=True)   # , transform=transform_train)
        test = FastCIFAR10('datasets/cifar', train=False, download=True)   # , transform=transform_test)

        train_indices, valid_indices = get_train_valid_indices(
            len(train), self.train_val_split_ratio, self.sample_size_cap)

        from utils.Custom_Dataset import Custom_Dataset
        train_set = Custom_Dataset(train.data[train_indices], train.targets[train_indices], device=self.device)
        validation_set = Custom_Dataset(train.data[valid_indices], train.targets[valid_indices], device=self.device)
        test_set = Custom_Dataset(test.data, test.targets, device=self.device)
        del train, test

        return train_set, validation_set, test_set

    elif name == "sst":
        import torchtext.data as data
        import torchtext.datasets as datasets
        from torch import long as torch_long

        text_field = data.Field(lower=True)
        label_field = LabelField(dtype=torch_long, sequential=False)

        train_data, validation_data, test_data = datasets.SST.splits(
            text_field, label_field, fine_grained=True)

        indices_list = powerlaw(list(range(len(train_data))), self.n_participants)
        ratios = [len(indices) / len(train_data) for indices in indices_list]
        train_datasets = split_torchtext_dataset_ratios(train_data, ratios)

        text_field.build_vocab(*(train_datasets + [validation_data, test_data]))
        label_field.build_vocab(*(train_datasets + [validation_data, test_data]))

        self.args.text_field = text_field
        self.args.label_field = label_field

        return train_datasets, validation_data, test_data

    elif name == 'mr':
        import torchtext.data as data
        from utils import mydatasets
        from torch import long as torch_long

        text_field = data.Field(lower=True)
        label_field = LabelField(dtype=torch_long, sequential=False)

        train_data, dev_data = mydatasets.MR.splits(text_field, label_field,
                                                    root='.data/mr', shuffle=False)
        validation_data, test_data = dev_data.split(split_ratio=0.5,
                                                    random_state=random.seed(1234))

        indices_list = powerlaw(list(range(len(train_data))), self.n_participants)
        ratios = [len(indices) / len(train_data) for indices in indices_list]
        train_datasets = split_torchtext_dataset_ratios(train_data, ratios)

        text_field.build_vocab(*(train_datasets + [validation_data, test_data]))
        label_field.build_vocab(*(train_datasets + [validation_data, test_data]))

        self.args.text_field = text_field
        self.args.label_field = label_field

        return train_datasets, validation_data, test_data

    elif name == 'imdb':
        from torch import long as torch_long
        from torch.nn.init import normal_
        from torchtext import datasets

        # Using generate_bigrams as a preprocessing step takes about 2 minutes:
        # text_field = Field(tokenize='spacy', preprocessing=generate_bigrams)
        text_field = Field(tokenize='spacy')
        label_field = LabelField(dtype=torch_long)
        dirname = '.data/imdb/aclImdb'

        train_data, test_data = datasets.IMDB.splits(text_field, label_field)  # 25000 samples each

        # keep 5000 of the 25000 test examples as the test set
        test_data, remaining = test_data.split(split_ratio=0.2, random_state=random.seed(1234))
        # keep 5000 of the remaining 20000 examples as the validation set
        valid_data, remaining = remaining.split(split_ratio=0.25, random_state=random.seed(1234))
        # train_data, valid_data = train_data.split(split_ratio=self.train_val_split_ratio,
        #                                           random_state=random.seed(1234))

        indices_list = powerlaw(list(range(len(train_data))), self.n_participants)
        ratios = [len(indices) / len(train_data) for indices in indices_list]
        train_datasets = split_torchtext_dataset_ratios(train_data, ratios)

        MAX_VOCAB_SIZE = 25_000
        text_field.build_vocab(*(train_datasets + [valid_data, test_data]),
                               max_size=MAX_VOCAB_SIZE,
                               vectors="glove.6B.100d",
                               unk_init=normal_)
        label_field.build_vocab(*(train_datasets + [valid_data, test_data]))

        PAD_IDX = text_field.vocab.stoi[text_field.pad_token]

        self.args.text_field = text_field
        self.args.label_field = label_field
        self.args.pad_idx = PAD_IDX

        return train_datasets, valid_data, test_data

    elif name == 'names':
        from utils.load_names import get_train_test
        from utils.Custom_Dataset import Custom_Dataset

        X_train, y_train, X_test, y_test, reference_dict = get_train_test()

        print("X train shape: ", X_train.shape)
        print("y train shape: ", y_train.shape)
        print("X test shape: ", X_test.shape)
        print("y test shape: ", y_test.shape)

        train_set = Custom_Dataset(X_train, y_train)
        test_set = Custom_Dataset(X_test, y_test)

        return train_set, test_set
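The SST, MR and IMDB branches above call a split_torchtext_dataset_ratios helper whose implementation is not shown. Purely as an illustration, a minimal sketch of what such a helper could look like on top of the legacy torchtext Dataset.split API; this is an assumption, not the project's actual code.

import random

def split_torchtext_dataset_ratios(dataset, ratios):
    # Hypothetical helper: carve `dataset` into len(ratios) disjoint subsets whose
    # sizes are roughly proportional to `ratios` (assumed to sum to 1).
    parts = []
    remaining = dataset
    remaining_ratio = 1.0
    for r in ratios[:-1]:
        # split off this ratio's share of the *original* dataset from what is left
        part, remaining = remaining.split(split_ratio=r / remaining_ratio,
                                          random_state=random.seed(1234))
        parts.append(part)
        remaining_ratio -= r
    parts.append(remaining)
    return parts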
import logging
import time

import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import KeyedVectors
from torchtext.data import BucketIterator, Field, LabelField, TabularDataset

# build_data, CNN1d, train, evaluate and epoch_time are defined elsewhere in this project.


def main(args):
    print('start ..!')
    BATCH_SIZE = args.batch_size
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    TEXT = Field(sequential=True,    # text: sequential data
                 tokenize=str.split,
                 batch_first=True,
                 fix_length=56,      # padding size: max length of the text data
                 lower=True)
    LABEL = LabelField(sequential=False, dtype=torch.float)

    # pre-trained word2vec embeddings (gensim KeyedVectors)
    w2v = KeyedVectors.load_word2vec_format(
        './model/GoogleNews-vectors-negative300.bin.gz', binary=True)

    data_dir = args.data_dir
    train_paths, val_paths = build_data(data_dir)

    N_EPOCHS = args.epochs
    EMBEDDING_DIM = args.embedding
    N_FILTERS = args.n_filters
    FILTER_SIZES = args.filter_sizes
    OUTPUT_DIM = 1
    DROPOUT = args.dropout

    test_acc_lists = []
    for kfold in range(10):
        # build the datasets for this fold
        train_path = train_paths[kfold]
        val_path = val_paths[kfold]
        train_data = TabularDataset(path=train_path, skip_header=True, format='csv',
                                    fields=[('label', LABEL), ('text', TEXT)])
        test_data = TabularDataset(path=val_path, skip_header=True, format='csv',
                                   fields=[('label', LABEL), ('text', TEXT)])
        TEXT.build_vocab(train_data)
        LABEL.build_vocab(train_data)

        # build the pretrained embedding matrix
        w2v_vectors = []
        for token, idx in TEXT.vocab.stoi.items():
            if idx == 1:
                # pad token -> zero vector
                w2v_vectors.append(torch.zeros(EMBEDDING_DIM))
            elif token in w2v.wv.vocab.keys():
                # word in the word2vec vocabulary -> use its pretrained vector
                w2v_vectors.append(torch.FloatTensor(w2v[token]))
            else:
                # out-of-vocabulary word -> sample uniformly from [-0.25, 0.25]
                w2v_vectors.append(
                    torch.distributions.Uniform(-0.25, +0.25).sample((EMBEDDING_DIM,)))
        TEXT.vocab.set_vectors(TEXT.vocab.stoi, w2v_vectors, EMBEDDING_DIM)
        pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)

        # build the iterators
        train_iterator, test_iterator = BucketIterator.splits(
            (train_data, test_data),
            batch_size=BATCH_SIZE,
            device=device,
            sort=False,
            shuffle=True)

        # define the model
        INPUT_DIM = len(TEXT.vocab)
        model = CNN1d(pretrained_embeddings, INPUT_DIM, EMBEDDING_DIM,
                      N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
        optimizer = optim.Adadelta(model.parameters(), rho=0.95)
        criterion = nn.BCEWithLogitsLoss()
        model = model.to(device)
        criterion = criterion.to(device)

        # train this fold, keeping the checkpoint with the best test accuracy
        best_test_acc = -float('inf')
        model_name = './model/model' + str(kfold) + '.pt'
        print('kfold', kfold)
        for epoch in range(N_EPOCHS):
            start_time = time.time()
            train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
            test_loss, test_acc = evaluate(model, test_iterator, criterion)
            end_time = time.time()
            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if test_acc > best_test_acc:
                best_test_acc = test_acc
                torch.save(model.state_dict(), model_name)

            logging.info(f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            logging.info(f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
            logging.info(f'\t\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

        # evaluate the best checkpoint of this fold
        model.load_state_dict(torch.load(model_name))
        test_loss, test_acc = evaluate(model, test_iterator, criterion)
        test_acc_lists.append(test_acc)
        logging.info(f'============== last test accuracy: {test_acc}')
        print()

    return test_acc_lists
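A small usage sketch, assuming main is driven by an argparse namespace with the attributes read above; averaging the per-fold accuracies gives the 10-fold cross-validation score. The argument names mirror the attributes the function accesses, but the default values and the data directory are placeholders.

import argparse
import statistics

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--embedding', type=int, default=300)   # must match the 300-d GoogleNews vectors
    parser.add_argument('--n_filters', type=int, default=100)
    parser.add_argument('--filter_sizes', type=int, nargs='+', default=[3, 4, 5])
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--data_dir', type=str, default='./data')
    args = parser.parse_args()

    fold_accuracies = main(args)
    print(f'10-fold CV accuracy: {statistics.mean(fold_accuracies) * 100:.2f}%')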