class MyDataset(object):

    def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(dtype=torch.float)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)

        # Build the vocabularies once, from the training split only, so the
        # dev/test splits do not leak into the vocabulary.
        if use_vector:
            self.TEXT.build_vocab(self.dataset['train'], max_size=25000, vectors=vectors)
        else:
            self.TEXT.build_vocab(self.dataset['train'], max_size=25000)
        self.LABEL.build_vocab(self.dataset['train'])
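A brief usage sketch for the class above (hypothetical: it assumes data/train.tsv, data/dev.tsv, data/test.tsv and the mr_vocab.txt vector file exist); batches expose attributes named after the fields:

# Hypothetical usage of MyDataset; paths follow the constructor defaults
dataset = MyDataset(root_dir='data', batch_size=64, use_vector=True)
for batch in dataset.dataloader['train']:
    tokens = batch.text    # (batch, seq_len) token indices, since batch_first=True
    labels = batch.label   # float labels, as defined by the LABEL field
    break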
Example 2
def load_dataset_from_csv(params, device):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about how the data should be preprocessed.
    fix_length : TorchText can leave the input variable-length and dynamically pad each sequence to the longest
                 sequence in its batch. Here we instead use fix_length, which pads every sequence to a fixed
                 length of 128.

    build_vocab : Builds a vocabulary (a dictionary mapping every unique word in the train data to an index),
                  then uses the GloVe word embeddings to map each index to its corresponding embedding vector.

    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.

    """
    # define tokenizer
    en = English()

    def tokenize(sentence):
        return [tok.text for tok in en.tokenizer(sentence)]

    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, eos_token='<eos>', batch_first=True, fix_length=128)
    LABEL = LabelField()

    fields_list = [('Unnamed: 0', None),
                   ('text', TEXT),
                   ('conf', None),
                   ('label', LABEL)]
    base_path = params.DATA_PATH
    train_path = os.path.join(base_path, "filtered_train.csv")
    test_path = os.path.join(base_path, "filtered_test.csv")
    train_data = TabularDataset(path=train_path,  # path to the training CSV file
                                format='csv',
                                skip_header=True,
                                fields=fields_list)

    test_data = TabularDataset(path=test_path,  # path to the test CSV file
                               format='csv',
                               skip_header=True,
                               fields=fields_list)

    if params.VOCAB_USE_GLOVE:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ, vectors=GloVe(name='6B', dim=300))
        logging.info("Loaded Glove embedding, Vector size of Text Vocabulary: " + str(TEXT.vocab.vectors.size()))

    else:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ)
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    logging.info("Length of Text Vocabulary: " + str(len(TEXT.vocab)))

    train_iter, test_iter = data.BucketIterator.splits((train_data, test_data),
                                                       batch_sizes=(params.TRAIN_BATCH_SIZE, params.TRAIN_BATCH_SIZE),
                                                       sort_key=lambda x: len(x.text), repeat=False, shuffle=True,
                                                       device=device)
    # Disable shuffle
    test_iter.shuffle = False
    return TEXT, word_embeddings, train_iter, test_iter
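A hypothetical follow-up to the function above, illustrating the docstring's points about vocab.vectors and the iterators (assumes params and device are already defined and params.VOCAB_USE_GLOVE is enabled, so word_embeddings is not None):

# Sketch only: feed the pretrained vectors into an embedding layer and read one batch
import torch.nn as nn

TEXT, word_embeddings, train_iter, test_iter = load_dataset_from_csv(params, device)
embedding = nn.Embedding.from_pretrained(word_embeddings,
                                         freeze=False,
                                         padding_idx=TEXT.vocab.stoi[TEXT.pad_token])
for batch in train_iter:
    tokens = batch.text              # (batch, fix_length) because batch_first=True
    embedded = embedding(tokens)     # (batch, fix_length, 300) GloVe vectors
    labels = batch.label
    break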
def clean_quora(path='../data/train.csv', output='list', tokenizer = nltk.word_tokenize, device=DEVICE, batch_size=32):
    data = pd.read_csv(path)
    questions1 = data['question1'].astype('str').tolist()
    questions2 = data['question2'].astype('str').tolist()
    is_duplicates = data['is_duplicate'].tolist()
    
    if output == 'list':
        return questions1, questions2, is_duplicates
    
    elif output == 'tokenized_list':
        return [tokenizer(q) for q in questions1], [tokenizer(q) for q in questions2], is_duplicates
    
    elif output == 'iterator' or output == 'iterator_from_file':
        TEXT = Field(
                sequential=True,
                tokenize = tokenizer,
                pad_first = False,
                dtype = torch.long,
                lower = True,
                batch_first = True
                )
        TARGET = LabelField(use_vocab = False)
        
        if output == 'iterator':
            examples = [Example.fromlist((questions1[i], questions2[i], is_duplicates[i]),
                                         [('question1', TEXT),
                                          ('question2', TEXT),
                                          ('is_duplicate', TARGET)]) for i in range(len(questions1))]
            dataset = Dataset(examples, {'question1': TEXT, 'question2': TEXT, 'is_duplicate': TARGET})
    
        if output == 'iterator_from_file':
            dataset = TabularDataset(path, 'csv', [('question1', TEXT),
                                                   ('question2', TEXT),
                                                   ('is_duplicate', TARGET)],
                                     skip_header=True)
        
        iterator = BucketIterator(
                dataset,
                batch_size=batch_size,
                sort_key=lambda x: len(x.question1) + len(x.question2),
                sort_within_batch=False,
                repeat = False,
                device = device
                # repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
        )
        
        TEXT.build_vocab(dataset)
        TARGET.build_vocab(dataset)
        
        
        return iterator
        

        #dataset = TabularDataset(path, 'csv', [('review', TEXT), ('sentiment', TARGET)])
        
    else:
        raise ValueError('Processing type not understood')
Example 4
def load_data(preprocessing=None):
    # Fields for the dataset
    # The actual review message

    #TEXT = Field(tokenize='spacy') # -- Old way, unclear exactly what language model is used
    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 preprocessing=preprocessing)
    LABEL = LabelField(dtype=torch.float)

    # Get the entire dataset that we will then split
    data = TabularDataset(path=path,
                          format='tsv',
                          fields=[('text', TEXT), ('label', LABEL)])

    # We should probably look at the proportion of fake to non-fake in each of these
    # sets to make sure it is fairly even, though probabilistically it should be.
    train_data, valid_data, test_data = data.split(
        split_ratio=TRAIN_VAL_TEST_SPLIT, random_state=random.seed(SEED))
    #valid_data, test_data = test_data.split(split_ratio=VAL_TEST_SPLIT, random_state=random.seed(SEED))

    print('Size of train set: ' + str(len(train_data.examples)))
    print('Size of val / test: ' + str(len(valid_data.examples)))
    '''
    # Try loading in the IMDB dataset to label pos or negative
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # Get train/valid split!!
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))
    '''

    # Now we need to build the vocab for our actual data
    # Here we will use the pre-trained word vectors "glove.6B.100d"
    TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
    LABEL.build_vocab(train_data)

    # Print stuff for sanity checks
    print('Size of the vocab: ' + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_itr, valid_itr, test_itr = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text))

    return TEXT, train_itr, valid_itr, test_itr
Example 5
def data_load_without_cv(fname, args, seed=1234, split_ratio=0.9):
    TEXT   = Field(sequential=True, tokenize=str.split, batch_first=True, fix_length=56, lower=True)
    LABEL  = LabelField(sequential=False, dtype=torch.float)
    FIELDS = [('label', LABEL), ('text', TEXT)]

    dataset = TabularDataset(fname, fields=FIELDS, format='csv', skip_header=True)

    train_dataset, valid_dataset  = dataset.split(random_state=random.seed(seed), split_ratio=split_ratio)

    TEXT.build_vocab(train_dataset)
    LABEL.build_vocab(train_dataset)

    train_iterator, valid_iterator = BucketIterator.splits((train_dataset, valid_dataset), batch_size=args.batch_size,
                                                           device=args.device, sort=False, shuffle=True)

    return TEXT, train_iterator, valid_iterator
Example 6
class ReviewsDataset():
    def __init__(self, data_path, train_path):

        ## write the tokenizer
        tokenize = lambda review: review.split()
        ## define your fields for ID filed you can use RAWField class
        self.TEXT = Field(sequential=True,
                          use_vocab=True,
                          tokenize=tokenize,
                          lower=True)
        self.LABEL = LabelField()

        self.fields = [
            ("PhraseId", None
             ),  # we won't be needing the id, so we pass in None as the field
            ("SentenceId", None),
            ("Phrase", self.TEXT),
            ("Sentiment", self.LABEL)
        ]  #{ 'Phrase': ('r', self.review), 'Sentiment': ('s', self.sentiment) }
        ## set paths
        self.data_path = data_path
        self.train_path = train_path

    def load_data(self):
        self.train_data = TabularDataset.splits(
            path='{}'.format(self.data_path),
            train='{}'.format(self.train_path),
            format='tsv',
            fields=self.fields)[0]

        self.TEXT.build_vocab(self.train_data, max_size=10000, min_freq=1)
        self.LABEL.build_vocab(self.train_data)
        self.train_iterator = BucketIterator(
            self.train_data,
            batch_size=64,
            sort_within_batch=True,
            sort_key=lambda x: len(x.Phrase))

    def __str__(self):
        example = self.train_data[0]
        return 'review: {} \n sentiment: {}'.format(example.Phrase, example.Sentiment)
Example 7
def pre_process_text():

    ID = Field(sequential=False, use_vocab=False)
    # CATEGORY: the label is non-sequential; use_vocab=True builds a vocabulary and is_target=True marks it as the target variable
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    # NEWS: the text is sequential and tokenized with jieba.lcut; include_lengths=True also returns each sentence's original length, which is convenient for RNNs
    NEWS = Field(sequential=True, tokenize=jieba.lcut, include_lengths=True)

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    # Load the data
    train_data = TabularDataset(
        os.path.join('data', 'train.csv'),
        format = 'csv',
        fields = fields,
        csv_reader_params={'delimiter': '\t'}
    )    
    valid_data = TabularDataset(
        os.path.join('data', 'dev.csv'),
        format = 'csv',
        fields = fields,
        csv_reader_params={'delimiter': '\t'}
    )    
    test_data = TabularDataset(
        os.path.join('data', 'test.csv'),
        format = 'csv',
        fields = fields,
        csv_reader_params={'delimiter': '\t'}
    )
    
    # Build the vocabularies
    NEWS.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    CATEGORY.build_vocab(train_data)

    return CATEGORY, NEWS, train_data, valid_data, test_data
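The function above stops after building the vocabularies; a hypothetical continuation showing how the include_lengths=True field is typically batched with a BucketIterator (the batch size and device below are assumptions):

# Sketch: build bucketed iterators for the datasets returned by pre_process_text()
CATEGORY, NEWS, train_data, valid_data, test_data = pre_process_text()
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.news),
    sort_within_batch=True,   # keeps lengths sorted, as packed RNN sequences expect
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

for batch in train_iter:
    tokens, lengths = batch.news   # include_lengths=True yields a (tensor, lengths) pair
    labels = batch.category
    break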
Example 8

if __name__ == "__main__":
    text_field = Field(use_vocab=False,
                       tokenize=tokenize_and_trunc,
                       preprocessing=tokenizer.convert_tokens_to_ids,
                       batch_first=True,
                       init_token=init_token_idx,
                       eos_token=eos_token_idx,
                       pad_token=pad_token_idx,
                       unk_token=unk_token_idx)
    label_field = LabelField()

    train_data, test_data = IMDB.splits(text_field, label_field)
    train_data, valid_data = train_data.split()
    label_field.build_vocab(train_data)

    n_epochs = 5
    batch_size = 128
    rnn_hidden_size = 256
    dropout_p = 0.2
    num_classes = len(label_field.vocab)
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = BertGRU(bert.config.to_dict()['dim'],
                    rnn_hidden_size, num_classes=num_classes,
                    dropout_p=dropout_p)

    for name, params in model.named_parameters():
        if name.startswith('embedding_layer'):
            params.requires_grad = False
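The snippet ends after freezing the embedding layer; a hypothetical continuation that builds iterators for the IMDB splits created above (the field names text and label come from IMDB.splits):

# Sketch: bucketed iterators for train/valid/test, then read one batch
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

for batch in train_iterator:
    token_ids = batch.text   # ids produced by tokenizer.convert_tokens_to_ids (use_vocab=False)
    labels = batch.label
    break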
Example 9
class SequenceDataLoader(CommonDataLoader):
    def __init__(self, data_config):
        super(SequenceDataLoader, self).__init__(data_config)
        self._config = data_config
        self._tool = Tool()
        self.__build_field()
        self._load_data()
        pass

    def __build_field(self):
        self.TEXT = Field(sequential=True,
                          use_vocab=True,
                          lower=True,
                          tokenize=tokenizer,
                          include_lengths=True,
                          batch_first=self._config.data.batch_first,
                          pad_token='[PAD]',
                          unk_token='[UNK]')
        # self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True,
        #                  batch_first=self._config.data.batch_first)
        # NOTE: LabelField internally forces sequential=False, so the
        # sequential/tokenize arguments below are ignored; for per-token tags
        # a plain Field (as in the commented-out version above) is required.
        self.TAG = LabelField(
            sequential=True,
            use_vocab=True,
            tokenize=tokenizer,
            is_target=True,
        )
        self._fields = [('text', self.TEXT), ('tag', self.TAG)]
        pass

    @timeit
    def _load_data(self):
        self.train_data = EmoDataset(path=self._config.data.train_path,
                                     fields=self._fields,
                                     file='train',
                                     config=self._config)
        self.valid_data = EmoDataset(path=self._config.data.valid_path,
                                     fields=self._fields,
                                     file='valid',
                                     config=self._config)
        self.test_data = EmoDataset(path=self._config.data.test_path,
                                    fields=self._fields,
                                    file='test',
                                    config=self._config)
        self.__build_vocab(self.train_data, self.valid_data, self.test_data)
        self.__build_iterator(self.train_data, self.valid_data, self.test_data)
        pass

    def __build_vocab(self, *dataset):
        """
        :param dataset: train_data, valid_data, test_data
        :return: text_vocab, tag_vocab
        """
        if self._config.pretrained_models.is_use:
            vocabs = self._tool.get_vocab_list(self._config.data.vocab_path)
            v = Vocab(vocabs, specials=['[PAD]', '[UNK]'])  # NOTE: built but never assigned to self.TEXT.vocab
            self.TEXT.build_vocab(
                vocabs,
                max_size=30000,
                min_freq=1,
                vectors=None,  # leave vectors as None to skip pretrained word embeddings
            )
        else:
            self.TEXT.build_vocab(*dataset)
        self.TAG.build_vocab(*dataset)
        self.word_vocab = self.TEXT.vocab
        self.tag_vocab = self.TAG.vocab
        pass

    def __build_iterator(self, *dataset):
        self._train_iter = BucketIterator(
            dataset[0],
            batch_size=self._config.data.train_batch_size,
            shuffle=True,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=self._config.device)
        self._valid_iter = BucketIterator(
            dataset[1],
            batch_size=self._config.data.train_batch_size,
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=self._config.device)
        self._test_iter = BucketIterator(
            dataset[2],
            batch_size=self._config.data.train_batch_size,
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=self._config.device)
        pass

    def load_train(self):
        return self._train_iter

    def load_test(self):
        return self._test_iter

    def load_valid(self):
        return self._valid_iter
Example 10
class DataLoader(object):
    def __init__(self,
                 data_fields,
                 train_file,
                 valid_file,
                 batch_size,
                 device,
                 skip_header,
                 delimiter,
                 pre_embeddings,
                 vector_cache,
                 min_freq=2,
                 extend_vocab=True,
                 pre_vocab_size=200000,
                 use_pre_embedding=False):
        self.x_field = Field(sequential=True,
                             tokenize=self.word_tokenize,
                             batch_first=True,
                             include_lengths=True)
        self.y_field = LabelField(batch_first=True)
        self.train_fields, self.x_var, self.y_vars = self.parse_fields(
            data_fields, self.x_field, self.y_field)

        self.train_ds = TabularDataset(
            train_file,
            fields=self.train_fields,
            skip_header=skip_header,
            format="csv",
            csv_reader_params={"delimiter": delimiter})
        self.valid_ds = TabularDataset(
            valid_file,
            fields=self.train_fields,
            skip_header=skip_header,
            format="csv",
            csv_reader_params={"delimiter": delimiter})

        self.x_field.build_vocab(self.train_ds, min_freq=min_freq)
        if use_pre_embedding:
            vectors = Vectors(pre_embeddings, vector_cache)
            if extend_vocab:
                self.extend_vocab_with_vectors(self.x_field.vocab, vectors,
                                               pre_vocab_size)
            vectors.unk_init = partial(init_unk,
                                       vocab_size=len(self.x_field.vocab))
            self.x_field.vocab.load_vectors(vectors)
        self.y_field.build_vocab(self.train_ds)

        self.train_iter, self.valid_iter = BucketIterator.splits(
            (self.train_ds, self.valid_ds),
            batch_size=batch_size,
            device=device,
            sort=False,
            sort_key=lambda sample: len(getattr(sample, self.x_var)),
            sort_within_batch=False,
            shuffle=True,
            repeat=False,
        )

        self.vocab = self.x_field.vocab
        self.vocab_size = len(self.x_field.vocab)
        self.num_labels = len(self.y_vars)
        self.num_classes = len(self.y_field.vocab)
        self.classes = list(self.y_field.vocab.stoi.values())
        self.unk_token = self.x_field.unk_token
        self.pad_token = self.x_field.pad_token
        self.unk_idx = self.x_field.vocab.stoi[self.unk_token]
        self.pad_idx = self.x_field.vocab.stoi[self.pad_token]
        self.train_wrapper = BatchWrapper(self.train_iter, self.x_var,
                                          self.y_vars)
        self.valid_wrapper = BatchWrapper(self.valid_iter, self.x_var,
                                          self.y_vars)

    @staticmethod
    def word_tokenize(text):
        text = pretreatment(text)
        return jieba.lcut(text)

    @staticmethod
    def char_tokenize(text):
        text = pretreatment(text)
        return list(text)

    @staticmethod
    def parse_fields(data_fields, x_field, y_field):
        train_fields, x_var, y_vars = [], None, []
        for field_name, var_type in data_fields.items():
            if var_type == "x":
                x_var = field_name
                train_fields.append((field_name, x_field))
            elif var_type == "y":
                y_vars.append(field_name)
                train_fields.append((field_name, y_field))
            else:
                train_fields.append((field_name, None))
        return train_fields, x_var, y_vars

    @staticmethod
    def extend_vocab_with_vectors(vocab, vectors, vocab_size):
        for word in list(vectors.stoi.keys())[:vocab_size]:
            if word not in vocab.stoi:
                vocab.itos.append(word)
                vocab.stoi[word] = len(vocab.itos) - 1
Example 11
import warnings
warnings.filterwarnings('ignore')

TEXT = Field(sequential=True, lower=True, include_lengths=True)  # text field
LABEL = LabelField(dtype=torch.float)  # label field

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

train, test = datasets.IMDB.splits(TEXT, LABEL)  # load the dataset
train, valid = train.split(random_state=random.seed(SEED))  # split into train/validation parts

TEXT.build_vocab(train)
LABEL.build_vocab(train)

device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test), 
    batch_size = 64,
    sort_within_batch = True,
    device = device)

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc
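A minimal, hypothetical evaluation loop combining the iterators and binary_accuracy above; the model(text, text_lengths) signature and the BCEWithLogitsLoss-style criterion are assumptions:

def evaluate(model, iterator, criterion):
    # Sketch: average loss/accuracy over one pass of the iterator
    model.eval()
    epoch_loss, epoch_acc = 0.0, 0.0
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text           # include_lengths=True yields (tokens, lengths)
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += binary_accuracy(predictions, batch.label).item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)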
Example 12
def _create_serialized_20newsgroups_iterator(args):
    r"""
    Creates a serialized 20 newsgroups dataset

    :param args: Test setup information
    """
    p_cls = {cls_id for cls_grp in args.pos for cls_id in cls_grp.value}
    n_cls = {cls_id for cls_grp in args.neg for cls_id in cls_grp.value}
    complete_train = _download_20newsgroups("train", p_cls, n_cls)

    tokenizer = nltk.tokenize.word_tokenize
    # noinspection PyPep8Naming
    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 include_lengths=True,
                 fix_length=args.seq_len)
    # noinspection PyPep8Naming
    LABEL = LabelField(sequential=False)
    complete_ds = _bunch_to_ds(complete_train, TEXT, LABEL)
    cache_dir = DATA_DIR / "vector_cache"
    cache_dir.mkdir(parents=True, exist_ok=True)
    TEXT.build_vocab(complete_ds,
                     min_freq=2,
                     vectors=torchtext.vocab.GloVe(name="6B",
                                                   dim=args.embed_dim,
                                                   cache=cache_dir))

    size_scalar = 1 + VALIDATION_FRAC
    p_bunch, u_bunch = _select_bunch_uar(int(args.size_p * size_scalar),
                                         complete_train,
                                         p_cls,
                                         remove_from_bunch=False)
    n_bunch, u_bunch = _select_negative_bunch(int(args.size_n * size_scalar),
                                              u_bunch,
                                              n_cls,
                                              args.bias,
                                              remove_from_bunch=False)
    u_bunch = _reduce_to_fixed_size(u_bunch,
                                    new_size=int(args.size_u * size_scalar))

    test_bunch = _download_20newsgroups("test", p_cls, n_cls)

    for name, bunch in (("P", p_bunch), ("N", n_bunch), ("U", u_bunch),
                        ("Test", test_bunch)):
        _log_category_frequency(args.pos, name, bunch)

    # Binarize the labels
    for bunch in (p_bunch, u_bunch, n_bunch, test_bunch):
        _configure_binary_labels(bunch, pos_cls=p_cls, neg_cls=n_cls)

    # Sanity check
    assert np.all(p_bunch[LABEL_COL] ==
                  POS_LABEL), "Negative example in positive (labeled) set"
    assert len(p_bunch[LABEL_COL]) == int(args.size_p * size_scalar), \
        "Positive set has wrong number of examples"
    assert np.all(n_bunch[LABEL_COL] ==
                  NEG_LABEL), "Positive example in negative (labeled) set"
    assert len(n_bunch[LABEL_COL]) == int(args.size_n * size_scalar), \
        "Negative set has wrong number of examples"
    assert len(u_bunch[LABEL_COL]) == int(args.size_u * size_scalar), \
        "Unlabeled set has wrong number of examples"

    ng_data = NewsgroupsSerial(text=TEXT, label=LABEL)
    full_train_ds = _build_train_set(p_bunch, u_bunch, n_bunch, TEXT, LABEL)
    split_ratio = 1 / (1 + VALIDATION_FRAC)
    ng_data.train, ng_data.valid = full_train_ds.split(split_ratio,
                                                       stratified=True)

    ng_data.unlabel = _bunch_to_ds(u_bunch, TEXT, LABEL)
    ng_data.test = _bunch_to_ds(test_bunch, TEXT, LABEL)

    tot_unlabel_size = args.size_p + args.size_n + args.size_u
    assert len(ng_data.train.examples
               ) == tot_unlabel_size, "Train dataset is wrong size"

    LABEL.build_vocab(ng_data.train, ng_data.test)
    ng_data.dump(args)
Example 13
    def test_multinli(self):
        batch_size = 4

        # create fields
        TEXT = ParsedTextField()
        TREE = ShiftReduceField()
        GENRE = LabelField()
        LABEL = LabelField()

        # create train/val/test splits
        train, val, test = MultiNLI.splits(TEXT, LABEL, TREE, GENRE)

        # check all are MultiNLI datasets
        assert type(train) == type(val) == type(test) == MultiNLI

        # check all have correct number of fields
        assert len(train.fields) == len(val.fields) == len(test.fields) == 6

        # check fields are the correct type
        assert type(train.fields['premise']) == ParsedTextField
        assert type(train.fields['premise_transitions']) == ShiftReduceField
        assert type(train.fields['hypothesis']) == ParsedTextField
        assert type(train.fields['hypothesis_transitions']) == ShiftReduceField
        assert type(train.fields['label']) == LabelField
        assert type(train.fields['genre']) == LabelField

        assert type(val.fields['premise']) == ParsedTextField
        assert type(val.fields['premise_transitions']) == ShiftReduceField
        assert type(val.fields['hypothesis']) == ParsedTextField
        assert type(val.fields['hypothesis_transitions']) == ShiftReduceField
        assert type(val.fields['label']) == LabelField
        assert type(val.fields['genre']) == LabelField

        assert type(test.fields['premise']) == ParsedTextField
        assert type(test.fields['premise_transitions']) == ShiftReduceField
        assert type(test.fields['hypothesis']) == ParsedTextField
        assert type(test.fields['hypothesis_transitions']) == ShiftReduceField
        assert type(test.fields['label']) == LabelField
        assert type(test.fields['genre']) == LabelField

        # check each is the correct length
        assert len(train) == 392702
        assert len(val) == 9815
        assert len(test) == 9832

        # build vocabulary
        TEXT.build_vocab(train)
        LABEL.build_vocab(train)
        GENRE.build_vocab(train)

        # ensure vocabulary has been created
        assert hasattr(TEXT, 'vocab')
        assert hasattr(TEXT.vocab, 'itos')
        assert hasattr(TEXT.vocab, 'stoi')

        # create iterators
        train_iter, val_iter, test_iter = Iterator.splits(
            (train, val, test), batch_size=batch_size)

        # get a batch to test
        batch = next(iter(train_iter))

        # split premise and hypothesis from tuples to tensors
        premise, premise_transitions = batch.premise
        hypothesis, hypothesis_transitions = batch.hypothesis
        label = batch.label
        genre = batch.genre

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(premise_transitions) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(hypothesis_transitions) == torch.Tensor
        assert type(label) == torch.Tensor
        assert type(genre) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert premise_transitions.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert hypothesis_transitions.shape[-1] == batch_size
        assert label.shape[-1] == batch_size
        assert genre.shape[-1] == batch_size

        # repeat the same tests with iters instead of split
        train_iter, val_iter, test_iter = MultiNLI.iters(batch_size=batch_size,
                                                         trees=True)

        # get a batch from the new iterators to test
        batch = next(iter(train_iter))

        # split premise and hypothesis from tuples to tensors
        premise, premise_transitions = batch.premise
        hypothesis, hypothesis_transitions = batch.hypothesis
        label = batch.label

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(premise_transitions) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(hypothesis_transitions) == torch.Tensor
        assert type(label) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert premise_transitions.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert hypothesis_transitions.shape[-1] == batch_size
        assert label.shape[-1] == batch_size

        # remove downloaded multinli directory
        shutil.rmtree('.data/multinli')
Example 14
    def test_xnli(self):
        batch_size = 4

        # create fields
        TEXT = Field()
        GENRE = LabelField()
        LABEL = LabelField()
        LANGUAGE = LabelField()

        # create val/test splits, XNLI does not have a test set
        val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE)

        # check both are XNLI datasets
        assert type(val) == type(test) == XNLI

        # check all have the correct number of fields
        assert len(val.fields) == len(test.fields) == 5

        # check fields are the correct type
        assert type(val.fields['premise']) == Field
        assert type(val.fields['hypothesis']) == Field
        assert type(val.fields['label']) == LabelField
        assert type(val.fields['genre']) == LabelField
        assert type(val.fields['language']) == LabelField

        assert type(test.fields['premise']) == Field
        assert type(test.fields['hypothesis']) == Field
        assert type(test.fields['label']) == LabelField
        assert type(test.fields['genre']) == LabelField
        assert type(test.fields['language']) == LabelField

        # check each is the correct length
        assert len(val) == 37350
        assert len(test) == 75150

        # build vocabulary
        TEXT.build_vocab(val)
        LABEL.build_vocab(val)
        GENRE.build_vocab(val)
        LANGUAGE.build_vocab(val)

        # ensure vocabulary has been created
        assert hasattr(TEXT, 'vocab')
        assert hasattr(TEXT.vocab, 'itos')
        assert hasattr(TEXT.vocab, 'stoi')

        # create iterators
        val_iter, test_iter = Iterator.splits((val, test),
                                              batch_size=batch_size)

        # get a batch to test
        batch = next(iter(val_iter))

        # split premise and hypothesis from tuples to tensors
        premise = batch.premise
        hypothesis = batch.hypothesis
        label = batch.label
        genre = batch.genre
        language = batch.language

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(label) == torch.Tensor
        assert type(genre) == torch.Tensor
        assert type(language) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert label.shape[-1] == batch_size
        assert genre.shape[-1] == batch_size
        assert language.shape[-1] == batch_size

        # xnli cannot use the iters method, ensure raises error
        with self.assertRaises(NotImplementedError):
            val_iter, test_iter = XNLI.iters(batch_size=batch_size)

        # remove downloaded xnli directory
        shutil.rmtree('.data/xnli')
Example 15
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--model',
        type=str,
        default='rnn',
        help=
        "Available models are: 'rnn', 'cnn', 'bilstm', 'fasttext', and 'distilbert'\nDefault is 'rnn'"
    )
    parser.add_argument('--train_data_path',
                        type=str,
                        default="./data/train_clean.csv",
                        help="Path to the training data")
    parser.add_argument('--test_data_path',
                        type=str,
                        default="./data/dev_clean.csv",
                        help="Path to the test data")
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--vectors',
                        type=str,
                        default='fasttext.simple.300d',
                        help="""
                                Pretrained vectors:
                                Visit 
                                https://github.com/pytorch/text/blob/9ce7986ddeb5b47d9767a5299954195a1a5f9043/torchtext/vocab.py#L146
                                for more 
                                """)
    parser.add_argument('--max_vocab_size', type=int, default=750)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--bidirectional', type=bool, default=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--output_dim', type=int, default=1)
    parser.add_argument('--n_layers', type=int, default=2)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=5)
    parser.add_argument('--n_filters', type=int, default=100)
    parser.add_argument('--filter_sizes', type=list, default=[3, 4, 5])

    args = parser.parse_args()

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ##########  BILSTM ##########

    if args.model == "bilstm":
        print('\nBiLSTM')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})

        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})

        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(
                                                    args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors,
                         unk_init=torch.Tensor.normal_)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
        unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

        model = BiLSTM(input_dim, embedding_dim, args.hidden_dim,
                       args.output_dim, args.n_layers, args.bidirectional,
                       args.dropout, pad_idx)

        pretrained_embeddings = TEXT.vocab.vectors

        model.embedding.weight.data.copy_(pretrained_embeddings)
        model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
        model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(
                f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%'
              )  # Test Loss: 0.139, Test Acc: 95.27%

    ##########  VANILLA RNN ##########

    else:
        print('\nVanilla RNN')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})

        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})

        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(
                                                    args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)

        model = RNN(input_dim, embedding_dim, args.hidden_dim, args.output_dim)

        pretrained_embeddings = TEXT.vocab.vectors

        model.embedding.weight.data.copy_(pretrained_embeddings)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(
                f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%'
              )  # Test Loss: 0.138, Test Acc: 95.05%
Example 16
                          ("doc_text", TEXT), ("label", LABEL)]
        train_data = Dataset(torch_examples, fields_dataset)
        save_examples(train_data, "../traindata.json")
        exit(0)
    else:
        TEXT = Field(tokenize=tokenize_en,
                     batch_first=True,
                     include_lengths=True)
        LABEL = LabelField(dtype=torch.float, batch_first=True)
        fields_dataset = [("query_title", TEXT), ("query_description", TEXT),
                          ("doc_text", TEXT), ("label", LABEL)]
        train_data = Dataset(
            load_examples("../traindata.json", fields_dataset), fields_dataset)
    print("build_vocabulary...")
    TEXT.build_vocab(train_data, min_freq=1, vectors="glove.6B.300d")
    LABEL.build_vocab(train_data)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("build_iterator...")
    train_iterator, valid_iterator = BucketIterator.splits(
        (train_data, train_data),
        batch_size=64,
        sort_key=lambda x: len(x.doc_text),
        sort_within_batch=False,
        device=device)

    size_of_vocab = len(TEXT.vocab)
    embedding_dim = 300
    num_hidden_nodes = 128
    num_layers = 2
    num_output_nodes = 1
    dropout = 0.2
Example 17
# load data from json
train_data, val_data, test_data = TabularDataset.splits(
    path="/home/CE/skrjanec/data_seg_all/" + SCENARIO + "/join",
    train="train_line.json",
    validation="val_line.json",
    test="val_line.json",
    format="json",
    fields=fields)

# started with glove.6B.50d
# next fasttext.en.300d
segment_text.build_vocab(train_data,
                         max_size=100000,
                         vectors=VECTORS,
                         unk_init=torch.Tensor.normal_)
gold_event.build_vocab(train_data)

# counts and frequency of classes in the train set
# gold_event.vocab.freqs is a Counter object
# divide every count with the largest count to get the weight for class_i
# other options for weight calculation https://discuss.pytorch.org/t/what-is-the-weight-values-mean-in-torch-nn-crossentropyloss/11455/10
print("class count in train data", gold_event.vocab.freqs)
count_max = max(gold_event.vocab.freqs.values())

# the weights should be a torch tensor
weights = []
weights2 = []
weights3 = []
for lbl, count in gold_event.vocab.freqs.items():
    weights.append(count_max / count)
    weights2.append(1 / count)
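As the comment above says, the weights need to be a torch tensor before they can be passed to a loss; a short sketch under that assumption, indexing the counts through the label vocab so the weight order matches the class indices:

# Sketch: class weights as a tensor ordered by gold_event's vocab, fed to CrossEntropyLoss
class_weights = torch.tensor(
    [count_max / gold_event.vocab.freqs[label] for label in gold_event.vocab.itos],
    dtype=torch.float)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)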
Example 18
def load_dataset(batch_size, test_sen=None):

    office_actions = pd.read_csv('../data/office_actions.csv', usecols=['app_id', 'ifw_number', 'rejection_102', 'rejection_103'], nrows=100000)

    abstractList = []
    idList = []
    rejectionColumn = []
    for num in range(10000):

        app_id = str(office_actions.app_id[num])
        filename = "../json_files/oa_"+app_id+".json"

        try:
            jfile = open(filename, 'r')
        except FileNotFoundError:
            print("File Not Found")
            continue

        parsed_json = json.load(jfile)
        jfile.close()

        n = int(office_actions.rejection_102[num])
        o = int(office_actions.rejection_103[num])

        if n == 0 and o == 0:
            rejType = 0 #neither
        elif n == 0 and o == 1:
            rejType = 1 #obvious
        elif n == 1 and o == 0:
            rejType = 0 #novelty
        elif n == 1 and o == 1:
            rejType = 1 #both
        else:
            print("Office action error:", sys.exc_info()[0])
            raise

        if rejType == 1 and rand(1) < 0.758:
            continue

        try:
            abstractList.append(parsed_json[0]['abstract_full'])
            idList.append(parsed_json[0]['application_number'])
        except IndexError:
            print("WARNING: file "+filename+" is empty!\n")
            continue

        rejectionColumn.append(rejType)

    all_data = {'text': abstractList, 'label': rejectionColumn}
    df = pd.DataFrame(all_data, index = idList)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    LABEL = LabelField(sequential=False)
    #fields={'Abstract': ('text', TEXT), 'RejectionType': ('labels', LABEL)}
    fields={'text': TEXT, 'label': LABEL}


    ds = DataFrameDataset(df, fields)

    TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(ds)

    train_data, test_data = ds.split()
    train_data, valid_data = train_data.split() # Further splitting of training_data to create new training_data & validation_data

    word_embeddings = TEXT.vocab.vectors
    print ("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print ("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print ("Label Length: " + str(len(LABEL.vocab)))

    train_iter, valid_iter, test_iter = BucketIterator.splits((train_data, valid_data, test_data), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False, shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
Example 19
def wsd(
    model_name='bert-base-uncased',  #ensemble-distil-1-albert-1 / albert-xxlarge-v2 / bert-base-uncased
    classifier_input='token-embedding-last-1-layers',  # token-embedding-last-layer / token-embedding-last-n-layers
    classifier_hidden_layers=[],
    reduce_options=True,
    freeze_base_model=True,
    max_len=512,
    batch_size=32,
    test=False,
    lr=5e-5,
    eps=1e-8,
    n_epochs=50,
    cls_token=False,  # If true, the cls token is used instead of the relevant-word token
    cache_embeddings=False,  # If true, the embeddings from the base model are saved to disk so that they only need to be computed once
    save_classifier=True  # If true, the classifier part of the network is saved after each epoch, and the training is automatically resumed from this saved network if it exists
):
    train_path = "wsd_train.txt"
    test_path = "wsd_test_blind.txt"
    n_classes = 222
    device = 'cuda'

    import __main__ as main
    print("Script: " + os.path.basename(main.__file__))

    print("Loading base model %s..." % model_name)
    if model_name.startswith('ensemble-distil-'):
        last_n_distil = int(model_name.replace('ensemble-distil-', "")[0])
        last_n_albert = int(model_name[-1])
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        base_model = AlbertModel.from_pretrained('albert-xxlarge-v2',
                                                 output_hidden_states=True,
                                                 output_attentions=False)
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        print(
            "Ensemble model with DistilBert last %d layers and Albert last %d layers"
            % (last_n_distil, last_n_albert))
    elif model_name.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        base_model = DistilBertModel.from_pretrained(model_name,
                                                     num_labels=n_classes,
                                                     output_hidden_states=True,
                                                     output_attentions=False)
    elif model_name.startswith('bert'):
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        base_model = BertModel.from_pretrained(model_name,
                                               num_labels=n_classes,
                                               output_hidden_states=True,
                                               output_attentions=False)
    elif model_name.startswith('albert'):
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        base_model = AlbertModel.from_pretrained(model_name,
                                                 output_hidden_states=True,
                                                 output_attentions=False)

    use_n_last_layers = 1
    if classifier_input == 'token-embedding-last-layer':
        use_n_last_layers = 1
    elif classifier_input.startswith(
            'token-embedding-last-') and classifier_input.endswith('-layers'):
        use_n_last_layers = int(
            classifier_input.replace('token-embedding-last-',
                                     "").replace('-layers', ""))
    else:
        raise ValueError("Invalid classifier_input argument")
    print("Using the last %d layers" % use_n_last_layers)

    def tokenize(text):
        return tokenizer.tokenize(text)[:max_len - 2]

    SENSE = LabelField(is_target=True)
    LEMMA = LabelField()
    TOKEN_POS = LabelField(use_vocab=False)
    TEXT = Field(tokenize=tokenize,
                 pad_token=tokenizer.pad_token,
                 init_token=tokenizer.cls_token,
                 eos_token=tokenizer.sep_token)
    EXAMPLE_ID = LabelField(use_vocab=False)
    fields = [('sense', SENSE), ('lemma', LEMMA), ('token_pos', TOKEN_POS),
              ('text', TEXT), ('example_id', EXAMPLE_ID)]

    def read_data(corpus_file, fields, max_len=None):
        train_id_start = 0
        test_id_start = 76049  # let the ids for the test examples start after the training example indices
        if corpus_file == "wsd_test_blind.txt":
            print("Loading test data...")
            id_start = test_id_start
        else:
            print("Loading train/val data...")
            id_start = train_id_start
        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for i, line in enumerate(f):
                sense, lemma, word_position, text = line.split('\t')
                # We need to convert from the word position to the token position
                words = text.split()
                pre_word = " ".join(words[:int(word_position)])
                pre_word_tokenized = tokenizer.tokenize(pre_word)
                token_position = len(
                    pre_word_tokenized
                ) + 1  # taking into account the later addition of the start token
                example_id = id_start + i
                if max_len is None or token_position < max_len - 1:  # ignore examples where the relevant token is cut off due to max_len
                    if cls_token:
                        token_position = 0
                    examples.append(
                        Example.fromlist(
                            [sense, lemma, token_position, text, example_id],
                            fields))
                else:
                    print(
                        "Example %d is skipped because the relevant token was cut off (token pos = %d)"
                        % (example_id, token_position))
                    print(text)
        return Dataset(examples, fields)

    dataset = read_data(train_path, fields, max_len)
    random.seed(0)
    trn, vld = dataset.split(0.7, stratified=True, strata_field='sense')

    TEXT.build_vocab([])
    if model_name.startswith('albert') or model_name.startswith(
            'ensemble-distil-'):

        class Mapping:
            def __init__(self, fn):
                self.fn = fn

            def __getitem__(self, item):
                return self.fn(item)

        TEXT.vocab.stoi = Mapping(tokenizer.sp_model.PieceToId)
        TEXT.vocab.itos = Mapping(tokenizer.sp_model.IdToPiece)
    else:
        TEXT.vocab.stoi = tokenizer.vocab
        TEXT.vocab.itos = list(tokenizer.vocab)
    SENSE.build_vocab(trn)
    LEMMA.build_vocab(trn)

    trn_iter = BucketIterator(trn,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=True,
                              sort=True)
    vld_iter = BucketIterator(vld,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=False,
                              sort=True)

    if freeze_base_model:
        for mat in base_model.parameters():
            mat.requires_grad = False  # Freeze Bert model so that we only train the classifier on top

    if reduce_options:
        lemma_mask = defaultdict(
            lambda: torch.zeros(len(SENSE.vocab), device=device))
        for example in trn:
            lemma = LEMMA.vocab.stoi[example.lemma]
            sense = SENSE.vocab.stoi[example.sense]
            lemma_mask[lemma][sense] = 1
        lemma_mask = dict(lemma_mask)

        def mask(
            batch_logits, batch_lemmas
        ):  # Masks out the senses that do not belong to the specified lemma
            for batch_i in range(len(batch_logits)):
                lemma = batch_lemmas[batch_i].item()
                batch_logits[batch_i, :] *= lemma_mask[lemma]
            return batch_logits
    else:

        def mask(batch_logits, batch_lemmas):
            return batch_logits

    experiment_name = model_name + " " + (
        classifier_input if not model_name.startswith('ensemble-distil-') else
        "") + " " + str(classifier_hidden_layers) + " (" + (
            " cls_token" if cls_token else
            "") + (" reduce_options" if reduce_options else "") + (
                " freeze_base_model" if freeze_base_model else ""
            ) + "  ) " + "max_len=" + str(max_len) + " batch_size=" + str(
                batch_size) + " lr=" + str(lr) + " eps=" + str(eps) + (
                    " cache_embeddings" if cache_embeddings else "")

    if model_name.startswith('ensemble-distil-'):
        model = WSDEnsembleModel(last_n_distil, last_n_albert, n_classes, mask,
                                 classifier_hidden_layers)
    else:
        model = WSDModel(base_model, n_classes, mask, use_n_last_layers,
                         model_name, classifier_hidden_layers,
                         cache_embeddings)
    history = None
    #if save_classifier:
    #    if model.load_classifier(experiment_name):
    #        # Existing saved model loaded
    #        # Also load the corresponding training history
    #        history = read_dict_file("results/"+experiment_name+".txt")

    model.cuda()

    print("Starting experiment  " + experiment_name)
    if test:
        tst = read_data(test_path, fields, max_len=512)
        tst_iter = Iterator(tst,
                            device=device,
                            batch_size=batch_size,
                            sort=False,
                            sort_within_batch=False,
                            repeat=False,
                            train=False)
        batch_predictions = []
        for batch in tst_iter:
            print('.', end='')
            sys.stdout.flush()
            text = batch.text.t()
            with torch.no_grad():
                outputs = model(text,
                                token_positions=batch.token_pos,
                                lemmas=batch.lemma,
                                example_ids=batch.example_id)
                scores = outputs[-1]
            batch_predictions.append(scores.argmax(dim=1))
        batch_preds = torch.cat(batch_predictions, 0).tolist()
        predicted_senses = [SENSE.vocab.itos[pred] for pred in batch_preds]
        with open("test_predictions/" + experiment_name + ".txt", "w") as out:
            out.write("\n".join(predicted_senses))
    else:
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': decay,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)

        def save_results(history):
            with open("results/" + experiment_name + ".txt", "w") as out:
                out.write(str(history))
            if save_classifier:
                if len(history['val_acc']) < 2 or history['val_acc'][-1] > max(
                        history['val_acc'][:-1]):
                    model.save_classifier(experiment_name, best=True)
                else:
                    model.save_classifier(experiment_name, best=False)

        train(model, optimizer, trn_iter, vld_iter, n_epochs, save_results,
              history)
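
# The reduce_options path above multiplies each row of logits by a 0/1 mask so that only
# senses seen with that lemma during training can be predicted. A standalone toy sketch of
# the same idea (names and sizes here are illustrative only, not taken from the code above):
import torch
from collections import defaultdict

n_senses = 4
lemma_mask = defaultdict(lambda: torch.zeros(n_senses))
# toy inventory: senses 0/1 belong to lemma 0, senses 2/3 to lemma 1
for lemma_id, sense_id in [(0, 0), (0, 1), (1, 2), (1, 3)]:
    lemma_mask[lemma_id][sense_id] = 1.0

batch_logits = torch.randn(2, n_senses)   # scores for a batch of 2 examples
batch_lemmas = torch.tensor([0, 1])       # lemma id of each example
for i in range(len(batch_logits)):
    batch_logits[i] *= lemma_mask[batch_lemmas[i].item()]
print(batch_logits)  # logits of out-of-inventory senses are zeroed, as in mask() above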
Example no. 20
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)

    # non-transferrable types
    primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]
    for batch in primitive_objects:
        data = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type() == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type() == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type() == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type() == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type() == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)], [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)])
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:

        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator.batch_to_device(CustomBatchType(), torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{
        'text': 'PyTorch Lightning is awesome!',
        'label': 0
    }, {
        'text': 'Please make it work with torchtext',
        'label': 1
    }]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process(), which numericalizes tokens, but that requires the vocabulary to be built first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
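
# batch_to_device handles arbitrarily nested containers. A rough, hedged sketch of the kind
# of recursive traversal the test above exercises, written in plain PyTorch (this is NOT
# Lightning's implementation, just an illustration of the covered cases):
import torch

def move_to_device(obj, device):
    """Recursively move tensors and .to()-capable objects inside nested containers."""
    if isinstance(obj, torch.Tensor) or hasattr(obj, 'to'):
        return obj.to(device)
    if isinstance(obj, tuple) and hasattr(obj, '_fields'):   # namedtuple
        return type(obj)(*(move_to_device(x, device) for x in obj))
    if isinstance(obj, (list, tuple)):
        return type(obj)(move_to_device(x, device) for x in obj)
    if isinstance(obj, dict):
        return {k: move_to_device(v, device) for k, v in obj.items()}
    return obj   # primitives (None, numbers, strings) pass through unchanged

nested = ([torch.rand(2, 3)], {'a': torch.rand(2, 3), 'b': None})
nested = move_to_device(nested, torch.device('cpu'))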
Example no. 21
# torchtext datasets
fields = [('text', TEXT), ('label', LABEL)]
train_ds = TabularDataset(path='',
                          format='csv',
                          fields=fields,
                          skip_header=False)

# split train_ds into train and validation sets
train_ds, val_ds = train_ds.split(split_ratio=0.7, random_state=random.seed(1))
print(vars(train_ds.examples[0]))

# build vocabulary
MAX_VOCAB = 30000
TEXT.build_vocab(train_ds, max_size=MAX_VOCAB)
LABEL.build_vocab(train_ds)

# build iterators
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter = BucketIterator.splits((train_ds, val_ds),
                                             batch_size=64,
                                             sort_key=lambda x: len(x.text),
                                             sort_within_batch=True,
                                             device=device)


# model
class BiLSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 num_lstm_layers, bidirectional, dropout, word_pad_idx):
        super(BiLSTM, self).__init__()
        self.emb = nn.Embedding(num_embeddings=input_dim,
                                embedding_dim=embedding_dim,
                                padding_idx=word_pad_idx)
        # ... (remainder of this example is cut off in the source)
Example no. 22
def load_dataset(batch_size, cache_data=True, test_sen=None):

    if cache_data:
        print("Caching Data")
        office_actions = pd.read_csv(
            '../data/office_actions.csv',
            index_col='app_id',
            usecols=['app_id', 'rejection_102', 'rejection_103'],
            dtype={
                'app_id': int,
                'rejection_102': int,
                'rejection_103': int
            },
            nrows=200000)

        abstractList = []
        idList = []
        rejectionColumn = []
        obviousCount = 0
        notCount = 0
        path = "/scratch/dm4350/json_files/"
        count = 0

        for filename in os.listdir(path):

            if count % 1000 == 0:
                print(count)

            filepath = path + filename
            try:
                jfile = open(filepath, 'r')
            except FileNotFoundError:
                print("File Not Found")
                continue

            try:
                parsed_json = json.load(jfile)
                jfile.close()
            except UnicodeDecodeError:
                print("WARNING: UnicodeDecodeError")
                continue
            except json.decoder.JSONDecodeError:
                print("WARNING: JSONDecodeError")
                continue

            app_id = int(
                filename.replace("oa_", "").replace(".json",
                                                    "").replace("(1)", ""))
            try:
                row = office_actions.loc[app_id]
            except KeyError:
                print("WARNING: KeyError")
                continue

            try:
                n = int(row.rejection_102)
                o = int(row.rejection_103)
            except TypeError:
                n = int(row.rejection_102.iloc[0])
                o = int(row.rejection_103.iloc[0])

            if n == 0 and o == 0:
                rejType = 0  #neither
            elif n == 0 and o == 1:
                rejType = 1  #obvious
            elif n == 1 and o == 0:
                rejType = 0  #novelty
            elif n == 1 and o == 1:
                rejType = 1  #both
            else:
                print("Office actions dataframe error:", sys.exc_info()[0])
                raise

            if obviousCount >= notCount and rejType == 1:
                continue

            obviousCount += o
            notCount += not (o)

            # Skip any files not in the appropriate IPC class
            try:
                found_A61 = False
                for s in parsed_json[0]['ipc_classes']:
                    if (s.find("A61") != -1):
                        found_A61 = True
                if not found_A61:
                    continue
            except (IndexError, KeyError, TypeError):
                print("WARNING: file " + filepath + " is empty!\n")
                continue

            # Read in data from json file if it exists
            try:
                a = parsed_json[0]['abstract_full']
                i = parsed_json[0]['application_number']
            except IndexError:
                print("WARNING: file " + filepath + " is empty!\n")
                continue
            except KeyError:
                print("WARNING: file " + filepath + " is empty!\n")
                continue

            abstractList.append(a)
            idList.append(i)
            rejectionColumn.append(rejType)

            count += 1
            #if count > 2000: break

        df = pd.DataFrame(
            {'text': abstractList, 'label': rejectionColumn},
            index=idList)
        print("{} files loaded".format(count))

        df.to_pickle('./data_cache/abstracts_df_A61.pkl')
        # with open("data_cache/TEXT.Field","wb")as f:
        #     dill.dump(TEXT,f)
        # with open("data_cache/LABEL.Field","wb")as f:
        #     dill.dump(LABEL,f)

    else:
        print('Loading Dataset from Cache')
        df = pd.read_pickle('./data_cache/abstracts_df_A61.pkl')
        # with open("data_cache/TEXT.Field","rb")as f:
        #     TEXT=dill.load(f)
        # with open("data_cache/LABEL.Field","rb")as f:
        #     LABEL=dill.load(f)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True,
                 tokenize=tokenize,
                 lower=True,
                 include_lengths=True,
                 batch_first=True,
                 fix_length=200)
    LABEL = LabelField(sequential=False)

    fields = {'text': TEXT, 'label': LABEL}
    ds = DataFrameDataset(df, fields)

    TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(ds)

    train_data, test_data = ds.split()
    # Further split the training data into new training and validation sets
    train_data, valid_data = train_data.split()
    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
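
# DataFrameDataset is a project-specific helper that is not shown in this snippet. A minimal
# sketch of what such a wrapper could look like with the legacy torchtext API, assuming a
# DataFrame whose columns match the field names (the real implementation may differ):
from torchtext.data import Dataset, Example  # torchtext.legacy.data in torchtext>=0.9

class DataFrameDataset(Dataset):
    """Hypothetical wrapper that turns a pandas DataFrame into a torchtext Dataset."""

    def __init__(self, df, fields):
        # fields: dict such as {'text': TEXT, 'label': LABEL}
        field_list = list(fields.items())
        examples = [
            Example.fromlist([row[name] for name, _ in field_list], field_list)
            for _, row in df.iterrows()
        ]
        super().__init__(examples, field_list)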
Example no. 23
def load_data(train_file, test_file, pretrain=None, save_dir=None):
    assert os.path.exists(train_file), f"{train_file} does not exist!"
    assert os.path.exists(test_file), f"{test_file} does not exist!"
    print("=" * 30 + "DATASET LOADER" + "=" * 30)
    sent_field = Field(tokenize=lambda x: x.split(),
                       unk_token='<unk>',
                       pad_token='<pad>',
                       init_token=None,
                       eos_token=None)
    doc_field = NestedField(sent_field,
                            tokenize=sent_tokenize,
                            pad_token='<pad>',
                            init_token=None,
                            eos_token=None,
                            include_lengths=True)
    label_field = LabelField()
    fields = [("raw", RawField()), ("doc", doc_field), ("label", label_field)]
    print(f"Reading {train_file} ...")
    with open(train_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
        examples = []
        for line in lines:
            text, label = line.split('\t')
            examples.append(
                Example.fromlist([text, text.lower(), label], fields))
        train_dataset = Dataset(examples, fields)
        reader.close()
    print(f"\tNum of train examples: {len(examples)}")
    print(f"Reading {test_file} ...")
    with open(test_file, "r", encoding="utf-8") as reader:
        lines = reader.readlines()
        examples = []
        for line in lines:
            text, label = line.split('\t')
            examples.append(
                Example.fromlist([text, text.lower(), label], fields))
        test_dataset = Dataset(examples, fields)
        reader.close()
    print(f"\tNum of valid examples: {len(examples)}")
    vectors = FastText('vi')
    doc_field.build_vocab(train_dataset, test_dataset, vectors=vectors)
    label_field.build_vocab(train_dataset, test_dataset)
    print(f"Building vocabulary ...")
    num_vocab = len(doc_field.vocab)
    num_classes = len(label_field.vocab)
    pad_idx = doc_field.vocab.stoi['<pad>']
    print(f"\tNum of vocabulary: {num_vocab}")
    print(f"\tNum of classes: {num_classes}")
    if save_dir:
        with open(save_dir + "/vocab.json", "w", encoding="utf-8") as fv:
            vocabs = {
                "word": doc_field.vocab.stoi,
                "class": label_field.vocab.itos,
                'pad_idx': pad_idx
            }
            json.dump(vocabs, fv)
            fv.close()
        with open(save_dir + "/fileds.json", "w", encoding="utf-8") as ff:
            field_vocabs = {
                "doc": doc_field.vocab.freqs,
                "label": label_field.vocab.freqs
            }
            json.dump(field_vocabs, ff)
            ff.close()
    print("=" * 73)
    return train_dataset, test_dataset, num_vocab, num_classes, pad_idx, vectors.vectors
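
# One possible way to consume what load_data returns, batching the nested documents with a
# BucketIterator. File names and iterator settings below are assumptions for illustration only:
import torch
from torchtext.data import BucketIterator  # torchtext.legacy.data in torchtext>=0.9

train_dataset, test_dataset, num_vocab, num_classes, pad_idx, vectors = load_data(
    "train.tsv", "test.tsv")  # hypothetical paths

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, test_iter = BucketIterator.splits(
    (train_dataset, test_dataset),
    batch_size=32,
    sort_key=lambda x: len(x.doc),
    sort_within_batch=True,
    device=device)

for batch in train_iter:
    # with include_lengths=True on the NestedField, batch.doc is a tuple containing the padded
    # word-id tensor plus length information (exact layout depends on the torchtext version)
    doc_tensor = batch.doc[0]
    labels = batch.label
    break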
Example no. 24
	def prepare_dataset(self, name='adult'):
		if name == 'adult':
			from utils.load_adult import get_train_test
			from utils.Custom_Dataset import Custom_Dataset
			import torch

			train_data, train_target, test_data, test_target = get_train_test()

			X_train = torch.tensor(train_data.values, requires_grad=False).float()
			y_train = torch.tensor(train_target.values, requires_grad=False).long()
			X_test = torch.tensor(test_data.values, requires_grad=False).float()
			y_test = torch.tensor(test_target.values, requires_grad=False).long()

			print("X train shape: ", X_train.shape)
			print("y train shape: ", y_train.shape)
			pos, neg =(y_train==1).sum().item() , (y_train==0).sum().item()
			print("Train set Positive counts: {}".format(pos),"Negative counts: {}.".format(neg), 'Split: {:.2%} - {:.2%}'.format(1. * pos/len(X_train), 1.*neg/len(X_train)))
			print("X test shape: ", X_test.shape)
			print("y test shape: ", y_test.shape)
			pos, neg =(y_test==1).sum().item() , (y_test==0).sum().item()
			print("Test set Positive counts: {}".format(pos),"Negative counts: {}.".format(neg), 'Split: {:.2%} - {:.2%}'.format(1. * pos/len(X_test), 1.*neg/len(X_test)))

			train_indices, valid_indices = get_train_valid_indices(len(X_train), self.train_val_split_ratio, self.sample_size_cap)

			train_set = Custom_Dataset(X_train[train_indices], y_train[train_indices], device=self.device)
			validation_set = Custom_Dataset(X_train[valid_indices], y_train[valid_indices], device=self.device)
			test_set = Custom_Dataset(X_test, y_test, device=self.device)

			return train_set, validation_set, test_set
		elif name == 'mnist':

			train = FastMNIST('datasets/MNIST', train=True, download=True)
			test = FastMNIST('datasets/MNIST', train=False, download=True)

			train_indices, valid_indices = get_train_valid_indices(len(train), self.train_val_split_ratio, self.sample_size_cap)
			
			from utils.Custom_Dataset import Custom_Dataset

			train_set = Custom_Dataset(train.data[train_indices], train.targets[train_indices], device=self.device)
			validation_set = Custom_Dataset(train.data[valid_indices],train.targets[valid_indices] , device=self.device)
			test_set = Custom_Dataset(test.data, test.targets, device=self.device)

			del train, test

			return train_set, validation_set, test_set

		elif name == 'cifar10':

			'''
			from torchvision import transforms			
			transform_train = transforms.Compose([
				transforms.RandomCrop(32, padding=4),
				transforms.RandomHorizontalFlip(),
				transforms.ToTensor(),
				transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
			])

			transform_test = transforms.Compose([
				transforms.ToTensor(),
				transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
			])
			'''

			train = FastCIFAR10('datasets/cifar', train=True, download=True)#, transform=transform_train)
			test = FastCIFAR10('datasets/cifar', train=False, download=True)#, transform=transform_test)

			train_indices, valid_indices = get_train_valid_indices(len(train), self.train_val_split_ratio, self.sample_size_cap)
			
			from utils.Custom_Dataset import Custom_Dataset

			train_set = Custom_Dataset(train.data[train_indices], train.targets[train_indices], device=self.device)
			validation_set = Custom_Dataset(train.data[valid_indices],train.targets[valid_indices] , device=self.device)
			test_set = Custom_Dataset(test.data, test.targets, device=self.device)
			del train, test

			return train_set, validation_set, test_set
		elif name == "sst":
			import torchtext.data as data
			text_field = data.Field(lower=True)
			from torch import long as torch_long
			label_field = LabelField(dtype = torch_long, sequential=False)


			import torchtext.datasets as datasets
			train_data, validation_data, test_data = datasets.SST.splits(text_field, label_field, fine_grained=True)

			indices_list = powerlaw(list(range(len(train_data))), self.n_participants)
			ratios = [len(indices) / len(train_data) for indices in indices_list]

			train_datasets = split_torchtext_dataset_ratios(train_data, ratios)

			text_field.build_vocab(*(train_datasets + [validation_data, test_data]))
			label_field.build_vocab(*(train_datasets + [validation_data, test_data]))

			self.args.text_field = text_field
			self.args.label_field = label_field

			return train_datasets, validation_data, test_data

		elif name == 'mr':

			import torchtext.data as data
			from utils import mydatasets

			text_field = data.Field(lower=True)
			from torch import long as torch_long
			label_field = LabelField(dtype = torch_long, sequential=False)
			# label_field = data.Field(sequential=False)

			train_data, dev_data = mydatasets.MR.splits(text_field, label_field, root='.data/mr', shuffle=False)

			validation_data, test_data = dev_data.split(split_ratio=0.5, random_state = random.seed(1234))
			
			indices_list = powerlaw(list(range(len(train_data))), self.n_participants)
			ratios = [len(indices) / len(train_data) for indices in  indices_list]

			train_datasets = split_torchtext_dataset_ratios(train_data, ratios)

			# print(train_data, dir(train_data))
			# print((train_datasets[0].examples[0].text))
			# print((train_datasets[0].examples[1].text))
			# print((train_datasets[0].examples[2].text))
			# exit()


			text_field.build_vocab( *(train_datasets + [validation_data, test_data] ))
			label_field.build_vocab( *(train_datasets + [validation_data, test_data] ))

			self.args.text_field = text_field
			self.args.label_field = label_field

			return train_datasets, validation_data, test_data

		elif name == 'imdb':

			from torch import long as torch_long
			# text_field = Field(tokenize = 'spacy', preprocessing = generate_bigrams) # generate_bigrams takes about 2 minutes
			text_field = Field(tokenize = 'spacy')
			label_field = LabelField(dtype = torch_long)

			dirname = '.data/imdb/aclImdb'

			from torch.nn.init import normal_
			from torchtext import datasets


			train_data, test_data = datasets.IMDB.splits(text_field, label_field) # 25000, 25000 samples each

			# use 5000 out of 25000 of test_data as the test_data
			test_data, remaining = test_data.split(split_ratio=0.2 ,random_state = random.seed(1234))
			
			# use 5000 out of the remaining 20000 of test_data as valid data
			valid_data, remaining = remaining.split(split_ratio=0.25 ,random_state = random.seed(1234))

			# train_data, valid_data = train_data.split(split_ratio=self.train_val_split_ratio ,random_state = random.seed(1234))

			indices_list = powerlaw(list(range(len(train_data))), self.n_participants)
			ratios = [len(indices) / len(train_data) for indices in  indices_list]

			train_datasets = split_torchtext_dataset_ratios(train_data, ratios)

			MAX_VOCAB_SIZE = 25_000

			text_field.build_vocab(*(train_datasets + [valid_data, test_data] ), max_size = MAX_VOCAB_SIZE, vectors = "glove.6B.100d",  unk_init = normal_)
			label_field.build_vocab( *(train_datasets + [valid_data, test_data] ))

			# INPUT_DIM = len(text_field.vocab)
			# OUTPUT_DIM = 1
			# EMBEDDING_DIM = 100

			PAD_IDX = text_field.vocab.stoi[text_field.pad_token]

			self.args.text_field = text_field
			self.args.label_field = label_field
			self.args.pad_idx = PAD_IDX

			return train_datasets, valid_data, test_data

		elif name == 'names':

			from utils.load_names import get_train_test
			from utils.Custom_Dataset import Custom_Dataset
			import torch
			from collections import Counter

			X_train, y_train, X_test, y_test, reference_dict = get_train_test()

			print("X train shape: ", X_train.shape)
			print("y train shape: ", y_train.shape)
			
			print("X test shape: ", X_test.shape)
			print("y test shape: ", y_test.shape)

			from utils.Custom_Dataset import Custom_Dataset
			train_set = Custom_Dataset(X_train, y_train)
			test_set = Custom_Dataset(X_test, y_test)

			return train_set, test_set
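
# powerlaw and split_torchtext_dataset_ratios are project helpers not shown here. A rough
# sketch of how a ratio-based split of a torchtext Dataset could look (an assumption, not
# the project's actual code):
from torchtext.data import Dataset  # torchtext.legacy.data in torchtext>=0.9

def split_torchtext_dataset_ratios(dataset, ratios):
	"""Split a torchtext Dataset into consecutive chunks whose sizes follow the given ratios."""
	examples = list(dataset.examples)
	sizes = [int(r * len(examples)) for r in ratios]
	chunks, start = [], 0
	for size in sizes:
		chunks.append(Dataset(examples[start:start + size], dataset.fields))
		start += size
	return chunks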
Example no. 25
def main(args):
    print('start ..!')
    BATCH_SIZE = args.batch_size
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    TEXT = Field(
        sequential=True,  # text: sequential data
        tokenize=str.split,
        batch_first=True,
        fix_length=56,  # padding size: max length of data text
        lower=True)
    LABEL = LabelField(sequential=False, dtype=torch.float)

    w2v = KeyedVectors.load_word2vec_format(
        './model/GoogleNews-vectors-negative300.bin.gz', binary=True)

    data_dir = args.data_dir

    train_paths, val_paths = build_data(data_dir)

    N_EPOCHS = args.epochs
    EMBEDDING_DIM = args.embedding
    N_FILTERS = args.n_filters
    FILTER_SIZES = args.filter_sizes
    OUTPUT_DIM = 1
    DROPOUT = args.dropout
    test_acc_lists = []

    for kfold in range(10):
        # make datasets
        train_path = train_paths[kfold]
        val_path = val_paths[kfold]
        train_data = TabularDataset(path=train_path,
                                    skip_header=True,
                                    format='csv',
                                    fields=[('label', LABEL), ('text', TEXT)])
        test_data = TabularDataset(path=val_path,
                                   skip_header=True,
                                   format='csv',
                                   fields=[('label', LABEL), ('text', TEXT)])

        TEXT.build_vocab(train_data)
        LABEL.build_vocab(train_data)

        # for pretrained embedding vectors
        w2v_vectors = []
        for token, idx in TEXT.vocab.stoi.items():
            # pad token (index 1 in a default torchtext vocab) -> zero vector
            if idx == 1:
                w2v_vectors.append(torch.zeros(EMBEDDING_DIM))
            # if word in word2vec vocab -> replace with pretrained word2vec
            elif token in w2v.wv.vocab.keys():
                w2v_vectors.append(torch.FloatTensor(w2v[token]))
            # oov -> randomly initialized uniform distribution
            else:
                w2v_vectors.append(
                    torch.distributions.Uniform(-0.25, +0.25).sample(
                        (EMBEDDING_DIM, )))

        TEXT.vocab.set_vectors(TEXT.vocab.stoi, w2v_vectors, EMBEDDING_DIM)
        pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)

        # make iterators
        train_iterator, test_iterator = BucketIterator.splits(
            (train_data, test_data),
            batch_size=BATCH_SIZE,
            device=device,
            sort=False,
            shuffle=True)

        # define a model
        INPUT_DIM = len(TEXT.vocab)

        model = CNN1d(pretrained_embeddings, INPUT_DIM, EMBEDDING_DIM,
                      N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
        optimizer = optim.Adadelta(model.parameters(), rho=0.95)
        criterion = nn.BCEWithLogitsLoss()

        model = model.to(device)
        criterion = criterion.to(device)

        # train
        best_test_acc = -float('inf')
        model_name = './model/model' + str(kfold) + '.pt'
        print('kfold', kfold)
        for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            test_loss, test_acc = evaluate(model, test_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if test_acc > best_test_acc:
                best_test_acc = test_acc
                torch.save(model.state_dict(), model_name)

            # print(f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            # print(f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
            # print(f'\t\tTest. Loss: {test_loss:.3f} |  Val. Acc: {test_acc * 100:.2f}%')
            logging.info(
                f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            logging.info(
                f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'
            )
            logging.info(
                f'\t\tTest. Loss: {test_loss:.3f} |  Val. Acc: {test_acc * 100:.2f}%'
            )

        model.load_state_dict(torch.load(model_name))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)
        test_acc_lists.append(test_acc)
        logging.info(f'============== last test accuracy: {test_acc}')
        # print(f'============== last test accuracy: {test_acc}')
        print()
    return test_acc_lists
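
# train(), evaluate() and epoch_time() are assumed helpers in this example. A minimal, hedged
# sketch of what train/evaluate might look like for this BCEWithLogitsLoss setup, assuming
# CNN1d's forward takes the token-id tensor and returns one logit per example:
import torch

def binary_accuracy(logits, y):
    preds = torch.round(torch.sigmoid(logits))
    return (preds == y).float().mean().item()

def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss, epoch_acc = 0.0, 0.0
    for batch in iterator:
        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        loss = criterion(logits, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += binary_accuracy(logits, batch.label)
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss, epoch_acc = 0.0, 0.0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            loss = criterion(logits, batch.label)
            epoch_loss += loss.item()
            epoch_acc += binary_accuracy(logits, batch.label)
    return epoch_loss / len(iterator), epoch_acc / len(iterator)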