Example #1
def load_train_data(data_path, batch_size, max_src_len, max_trg_len, use_cuda=False):
    # Note: use_vocab=False, since the inputs are already preprocessed (numericalized).
    src_field = Field(sequential=True, use_vocab=False, include_lengths=True, batch_first=True,
                      pad_token=PAD, unk_token=UNK, init_token=None, eos_token=None,)
    trg_field = Field(sequential=True, use_vocab=False, include_lengths=True, batch_first=True,
                      pad_token=PAD, unk_token=UNK, init_token=BOS, eos_token=EOS,)
    fields = (src_field, trg_field)
    device = None if use_cuda else -1

    def filter_pred(example):
        return len(example.src) <= max_src_len and len(example.trg) <= max_trg_len

    dataset = torch.load(data_path)
    train_src, train_tgt = dataset['train_src'], dataset['train_tgt']
    dev_src, dev_tgt = dataset['dev_src'], dataset['dev_tgt']

    train_data = ParallelDataset(train_src, train_tgt, fields=fields, filter_pred=filter_pred,)
    train_iter = Iterator(dataset=train_data, batch_size=batch_size, train=True, # Variable(volatile=False)
                          sort_key=lambda x: data.interleave_keys(len(x.src), len(x.trg)),
                          repeat=False, shuffle=True, device=device)
    dev_data = ParallelDataset(dev_src, dev_tgt, fields=fields,)
    dev_iter = Iterator(dataset=dev_data, batch_size=batch_size, train=False,    # Variable(volatile=True)
                        repeat=False, device=device, shuffle=False, sort=False,)

    return src_field, trg_field, train_iter, dev_iter
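The ParallelDataset used above is defined elsewhere in the project. A minimal sketch of a compatible class, assuming src/trg are already-numericalized token-id lists (hypothetical, not the project's actual code):

# Hypothetical sketch of a ParallelDataset compatible with load_train_data above.
from torchtext import data

class ParallelDataset(data.Dataset):
    def __init__(self, src_seqs, trg_seqs, fields, **kwargs):
        # `fields` is the (src_field, trg_field) tuple built in load_train_data
        field_list = [('src', fields[0]), ('trg', fields[1])]
        examples = [data.Example.fromlist([src, trg], field_list)
                    for src, trg in zip(src_seqs, trg_seqs)]
        super().__init__(examples, field_list, **kwargs)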
Example #2
def load_dataset(data_path, train_batch_size=4096, dev_batch_size=1, max_len=100):
    """
    This assumes that the data is already pre-processed using Moses Tokenizer
    Returns iterators for the training/dev dataset

    Arguments:
        data_path: path of the dataset
        train_batch_size: batch size of the training data (defined in terms of number of tokens or sentences, depending on the model_type)
        dev_batch_size: batch size of the dev data (usually one)
        max_len: max length of sequences in a batch
    """

    SRC = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)
    TRG = Field(tokenize=lambda s: s.split(), init_token="<s>",
                eos_token="</s>", batch_first=True, include_lengths=True)

    # create a TranslationDataset for both the train and dev set
    train_data = datasets.TranslationDataset(exts=("train.de", "train.en"), fields=(
        SRC, TRG), path=data_path, filter_pred=lambda x: len(vars(x)['src']) <= max_len and len(vars(x)['trg']) <= max_len)

    dev_data = datasets.TranslationDataset(
        exts=("dev.de", "dev.en"), fields=(SRC, TRG), path=data_path)

    # load in the Test Set
    test_examples = []
    with open(data_path + "test.de", "r") as f:
        for test_example in f.readlines():
            example = data.Example()
            setattr(example, "src", test_example.split())
            test_examples.append(example)

    test_data = data.Dataset(test_examples, fields=[("src", SRC)])

    # build the vocab from the training data (source and target share a joint vocabulary)
    SRC.build_vocab(train_data.src, train_data.trg)
    TRG.build_vocab(train_data.src, train_data.trg)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # use custom DataIterator in order to minimize padding in a sequence
    # and in order to pack a batch fully so as to maximize the computation
    # on a GPU
    train_iterator = DataIterator(train_data, batch_size=train_batch_size, device=device,
                                  repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                                  batch_size_fn=batch_size_fn, train=True, sort_within_batch=True, shuffle=True)

    # use a regular Iterator since we want to be able to compare
    # our translations to a gold standard file. If we use a
    # `DataIterator` then we will get our translations in shuffled/random
    # order
    dev_iterator = Iterator(dev_data, batch_size=dev_batch_size,
                            train=False, sort=False, repeat=False, device=device)

    # create Test Iterator for the test data
    test_iterator = Iterator(
        test_data, batch_size=1, train=False, sort=False, repeat=False, device=device)

    print(len(test_iterator))
    return train_iterator, dev_iterator, test_iterator, SRC, TRG
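DataIterator and batch_size_fn are defined elsewhere in this project. A common token-count sizing function in this style (a sketch following the widely used annotated-Transformer pattern, not necessarily this project's exact code):

# Sketch of a token-based batch_size_fn: track the longest source/target seen in the
# current batch and report the batch size in tokens instead of sentences.
max_src_in_batch, max_tgt_in_batch = 0, 0

def batch_size_fn(new, count, sofar):
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch, max_tgt_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <s> and </s>
    return max(count * max_src_in_batch, count * max_tgt_in_batch)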
Example #3
    def tokenize(self):
        ENGLISH = Field(sequential=True,
                        use_vocab=True,
                        tokenize=str.split,
                        lower=True,
                        init_token="<sos>",
                        eos_token="<eos>")
        FRENCH = Field(sequential=True,
                        use_vocab=True,
                        tokenize=str.split,
                        lower=True,
                        init_token="<sos>",
                        eos_token="<eos>")

        """
        in order for this to work, change
        "csv.field_size_limit(sys.maxsize)" in torchtext/utils.py to "csv.field_size_limit(maxInt)"
        """
        train, test = TabularDataset.splits(path='./data/', train='train.csv', test='test.csv',
                                            format='csv', fields=[('en',ENGLISH),('fr',FRENCH)])
        ENGLISH.build_vocab(train, test)
        FRENCH.build_vocab(train, test)
        self.en_vocab = ENGLISH
        self.fr_vocab = FRENCH
        self.en_vocabsize = len(ENGLISH.vocab)
        self.fr_vocabsize = len(FRENCH.vocab)

        batch_size = 2 if self.config.debug else self.config.batchsize
        train_loader, test_loader = Iterator.splits((train, test), batch_size=batch_size, device="cuda",
                                                    shuffle=False, sort_key=lambda x: len(x.en),
                                                    sort_within_batch=False)
        return train_loader, test_loader
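Instead of editing torchtext/utils.py as the note above suggests, the CSV field size limit can also be raised from the calling code before loading (a sketch; the loop steps the value down when sys.maxsize overflows the platform's C long):

import csv
import sys

# Raise the csv field size limit as far as the platform allows.
limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(limit)
        break
    except OverflowError:
        limit = int(limit / 10)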
Example #4
def dataset2iter(workpath=WORK_PATH,
                 train_path=FILE_TRAIN,
                 validation_path=FILE_VALID,
                 test_path=FILE_TEST):
    fields = [('sentence', SENTENCE), ('wxx', LABEL), ('char', CHAR)]

    ######## If a data line starts with ", csv.reader treats the line as quoted and may swallow delimiters; pass the extra parameters below to disable that.
    ######## torchtext docs: https://pytorch.org/text/stable/data.html#torchtext.data.TabularDataset.__init__
    ######## csv module docs: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
    csv_reader_params = {
        'doublequote': False,
        'quoting': csv.QUOTE_NONE,
    }

    data_train = TabularDataset(workpath + train_path,
                                format="tsv",
                                fields=fields,
                                skip_header=True,
                                csv_reader_params=csv_reader_params)
    data_valid = TabularDataset(workpath + validation_path,
                                format="tsv",
                                fields=fields,
                                skip_header=True,
                                csv_reader_params=csv_reader_params)
    data_test = TabularDataset(workpath + test_path,
                               format="tsv",
                               fields=fields,
                               skip_header=True,
                               csv_reader_params=csv_reader_params)

    pretrained_vectors = Vectors(name=GLOVE_PATH + TRAINED_VECTORS + '.txt',
                                 cache=GLOVE_PATH)
    SENTENCE.build_vocab(
        data_train,
        vectors=pretrained_vectors,
        unk_init=lambda x: torch.nn.init.uniform_(x, a=-0.25, b=0.25))
    LABEL.build_vocab(data_train)
    CHAR.build_vocab(data_train)
    # debug
    #return data_train,data_valid,data_test

    iter_train = Iterator(data_train,
                          batch_size=BATCH_SIZE,
                          train=True,
                          sort_key=lambda x: len(x.sentence),
                          shuffle=True,
                          device=DEVICE)
    iter_valid = Iterator(data_valid,
                          batch_size=BATCH_SIZE,
                          train=False,
                          sort=False,
                          shuffle=True,
                          device=DEVICE)
    iter_test = Iterator(data_test,
                         batch_size=BATCH_SIZE,
                         train=False,
                         sort=False,
                         shuffle=True,
                         device=DEVICE)
    return iter_train, iter_valid, iter_test
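SENTENCE, LABEL, CHAR, BATCH_SIZE, and DEVICE are module-level constants defined outside this function. Purely illustrative definitions (assumptions, not the project's actual fields) might be:

# Hypothetical module-level definitions; the real ones live elsewhere in the project.
import torch
from torchtext.data import Field

SENTENCE = Field(sequential=True, tokenize=str.split, batch_first=True, include_lengths=True)
LABEL = Field(sequential=True, tokenize=str.split, batch_first=True, unk_token=None)
CHAR = Field(sequential=True, tokenize=list, batch_first=True)
BATCH_SIZE = 32
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')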
Example #5
def data_split(text_field, label_field, dataset, mode=False):
    if mode == 'init':
        for index, c in enumerate(dataset):
            partial = NLPDataLoader(c,
                                    text_field=text_field,
                                    label_field=label_field,
                                    test=False)
            if index == 0:
                text_field.build_vocab(partial)
                label_field.build_vocab(list(range(13)))
            else:
                text_counter = text_field.vocab.freqs
                for example in partial.examples:
                    text_counter.update(example.text)
                text_field.vocab = text_field.vocab_cls(
                    text_counter, specials=['<unk>', '<pad>'])
        return
    elif mode is False:
        dataset = NLPDataLoader(dataset,
                                text_field=text_field,
                                label_field=label_field,
                                test=False)
        return Iterator.splits((dataset, ), batch_size=20)
    elif mode is True:
        dataset = NLPDataLoader(dataset,
                                text_field=text_field,
                                label_field=label_field,
                                test=True)
        return Iterator.splits((dataset, ), batch_size=20, shuffle=False)
Example #6
    def fit(self, filepath, train_dev_ratio=0.8, batch_size=64, nepoch=10):
        """ Feed training data to train the model

        Args:
            filepath: a string, the path of dataset
            train_dev_ratio: a float, the ratio to split train and dev dataset
            batch_size: an integer, the batch size used during training
            nepoch: an integer, the number of training epochs

        TODO:
            1) support early stopping
            2) support customized delimiter
            3) support callback function
            4) support fit_generator
        """
        train, dev = self._process_data(filepath, train_dev_ratio)

        self.text_field.build_vocab(train, vectors="glove.6B.50d")
        self.label_field.build_vocab(train)

        self._build_network()

        train_iter = Iterator(train, batch_size=batch_size, shuffle=True)
        dev_iter = Iterator(dev, batch_size=batch_size, shuffle=True)

        optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr)
        loss_fn = nn.BCELoss()

        best_acc = 0
        for epoch in range(1, nepoch + 1):
            self.network.train()  # evaluate() switches to eval mode, so re-enter training mode each epoch
            for i, batch in enumerate(train_iter):
                feature, target = batch.text, batch.label
                target = target.type(torch.FloatTensor)
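                # the label vocab is {unk: 0, 0: 1, 1: 2}, so shift targets down by 1 (same as in evaluate())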
                feature.data.t_(), target.data.sub_(1)
                optimizer.zero_grad()

                y_pred = self.network(feature).reshape(-1)
                loss = loss_fn(y_pred, target)
                loss.backward()
                optimizer.step()

                label_pred = (np.array(y_pred.data) > 0.5).astype(int)
                label_true = np.array(target)
                train_acc = accuracy_score(label_true, label_pred)
                output_str = '\rEpoch:{} batch:{} loss:{:.6f} acc:{:.2f}'
                sys.stdout.write(
                    output_str.format(epoch, i, loss.item(), train_acc))

            dev_acc, dev_p, dev_r, dev_f1 = self.evaluate(dev_iter)
            if dev_acc > best_acc:
                best_acc = dev_acc
                print('Saving best model:')
                output_str = '\nBest - acc:{:.2f} p:{:.2f} r:{:.2f} f1:{:.2f} \n \n'
                print(output_str.format(dev_acc, dev_p, dev_r, dev_f1))
                self._save_weights(self.network)

        return
Example #7
    def __init__(self, module_name, train_bs, eval_bs, device, log):
        self.module_name = module_name

        # split_chars = lambda x: list("".join(x.split()))
        split_chars = lambda x: list(x)  # keeps whitespace

        source = Field(tokenize=split_chars,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        target = Field(tokenize=split_chars,
                       init_token='<sos>',
                       eos_token='<eos>',
                       batch_first=True)

        log("Loading FULL datasets ...")
        folder = os.path.join(DATASET_TARGET_DIR, module_name)
        train_dataset, eval_dataset, _ = TranslationDataset.splits(
            path=folder,
            root=folder,
            exts=(INPUTS_FILE_ENDING, TARGETS_FILE_ENDING),
            fields=(source, target),
            train=TRAIN_FILE_NAME,
            validation=EVAL_FILE_NAME,
            test=EVAL_FILE_NAME)

        log("Building vocab ...")
        source.build_vocab(train_dataset)
        target.vocab = source.vocab

        log("Creating iterators ...")
        train_iterator = Iterator(dataset=train_dataset,
                                  batch_size=train_bs,
                                  train=True,
                                  repeat=True,
                                  shuffle=True,
                                  device=device)

        eval_iterator = Iterator(dataset=eval_dataset,
                                 batch_size=eval_bs,
                                 train=False,
                                 repeat=False,
                                 shuffle=False,
                                 device=device)

        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.train_iterator = train_iterator
        self.eval_iterator = eval_iterator
        self.source = source
        self.target = target
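Because the training iterator above is created with repeat=True, iterating over it never raises StopIteration. A consumption sketch (loader, steps_per_epoch, and the training step are placeholders, not part of the original class):

# Sketch only: with repeat=True the iterator cycles forever, so bound each epoch by a
# fixed number of steps instead of exhausting the iterator.
train_batches = iter(loader.train_iterator)
for step in range(steps_per_epoch):
    batch = next(train_batches)
    src, trg = batch.src, batch.trg  # batch-first tensors, per the Field definitions
    # ... forward / backward pass ...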
Example #8
def create_data_iterator(batch_size, tabular_train, tabular_valid, d):
    # Create an Iterator for each dataset (an Iterator plays the role of a DataLoader)

    train_iter = Iterator(tabular_train,
                          batch_size=batch_size,
                          device=d,
                          sort_within_batch=False,
                          repeat=False)
    valid_iter = Iterator(tabular_valid,
                          batch_size=batch_size,
                          device=d,
                          sort_within_batch=False,
                          repeat=False)

    return train_iter, valid_iter
Example #9
    def get_dataloader(self,
                       proc_id=0,
                       n_gpus=1,
                       device=torch.device('cpu'),
                       batch_size=64):
        def _distribute_dataset(dataset):
            n = len(dataset)
            part = dataset[n * proc_id // n_gpus:n * (proc_id + 1) // n_gpus]
            return torchtext.data.Dataset(part, dataset.fields)

        train_ds = _distribute_dataset(self.train_ds)
        self.verbose = self.verbose and (proc_id == 0)
        train_iter, valid_iter = BucketIterator.splits(
            (train_ds, self.valid_ds),
            batch_sizes=(batch_size, batch_size),
            sort_within_batch=True,
            sort_key=lambda x: len(x.input),
            device=device,
            repeat=False,
        )

        test_iter = Iterator(
            self.test_ds,
            batch_size=1,
            sort=False,
            sort_within_batch=False,
            device=device,
            repeat=False,
        )
        train_dl = BatchWrapper(train_iter)
        valid_dl = BatchWrapper(valid_iter)
        test_dl = BatchWrapper(test_iter)
        return train_dl, valid_dl, test_dl
Example #10
    def build_prediction_iterator(self, df):
        dataset = base.CommentsDataset(df, self.fields)
        pred_id = list(df['id'].values)
        pred_iter = Iterator(
            dataset, batch_size=self.params['batch_size'],
            repeat=False, shuffle=False, sort=False)
        return pred_id, pred_iter
Example #11
    def eval(self, tasknum, total: int, trainloss: float = 0, epoch: int = 0):
        data_iter = Iterator(self.task[tasknum].te_dataset, batch_size=self.batch_size, repeat=False,
                             sort_key=lambda x: len(x.syllable_contents), train=False,
                             device=self.device)
        tq_iter = tqdm(enumerate(data_iter), total=math.ceil(total / self.batch_size),
                       unit_scale=self.batch_size, bar_format='{r_bar}')
        pred_lst = list()
        truth_lst = list()
        acc_lst = list()
        label_lst = list()
        self.model.eval()
        for i, batch in tq_iter:
            preds = self.model(batch.syllable_contents, tasknum)
            label = torch.tensor(batch.label, dtype=torch.long, device=self.device)
            if self.isbinary:
                accs = torch.eq(preds > 0.5, batch.label > 0.5).to(torch.float)
            else:
                accs = torch.eq(torch.argmax(preds, dim=1), label).to(torch.long)
            label_lst += label.tolist()
            acc_lst += accs.tolist()
            pred_lst += preds.tolist()

        prec, recall, f1score, f1s, rocauc = self.getscore(pred_lst, label_lst)

        accuracy = sum(acc_lst) / total
        self.log_to_c3dl(json.dumps(
            {'type': 'test', 'epoch': epoch, 'accuracy': accuracy, 'precision': prec,
             'recall': recall, 'f1score': f1score, 'ROC-AUC': rocauc}))

        wandb.log({'Epoch': epoch, 'Accuracy': accuracy, 'Precision': prec, 'Recall': recall,
                   'F1Score': f1score, 'Trainloss': trainloss, 'ROC-AUC': rocauc})
        if len(f1s) != 0:
            for i in range(len(f1s)):
                wandb.log({f'Class {i} F1Score': f1s[i]})

        return acc_lst, total, prec, recall, f1score, f1s, rocauc
Example #12
def predict_and_save(dataset=None,
                     model=None,
                     dataset_path='dev.conll',
                     out_path='predict.conll',
                     **kwargs):
    """Combine original CONLL-X file with predictions.
  This is required since the iterator might have changed certain fields
  (e.g. lowercasing).
  We read the dataset_path separately and replace the fields we predicted.
  """
    device = torch.device(type='cuda') if use_cuda else torch.device(
        type='cpu')
    data_iter = Iterator(dataset,
                         1,
                         train=False,
                         sort=False,
                         shuffle=False,
                         device=device)
    start_time = time.time()

    i2pos = dataset.fields['pos'].vocab.itos

    with open(out_path, mode='w', encoding='utf-8') as f:
        with open(dataset_path, mode='r', encoding='utf-8') as data_f:
            with torch.no_grad():

                if "ud" in dataset_path:
                    original_iter = read_conllu(data_f)
                    ud = True
                    i2upos = dataset.fields['upos'].vocab.itos
                else:
                    original_iter = read_conllx(data_f)
                    ud = False

                for pred in predict(data_iter=data_iter, model=model):

                    tokens = next(original_iter)

                    pred_tags = [-1] * len(tokens)
                    pred_utags = [-1] * len(tokens)
                    write_tag, write_utag = False, False
                    if len(pred["pos"]) > 0:
                        pred_tags = pred["pos"].data.view(-1).tolist()
                        write_tag = True
                    if len(pred["upos"]) > 0:
                        pred_utags = pred["upos"].data.view(-1).tolist()
                        write_utag = True

                    for tok, pred_tag, pred_utag in \
                            zip(tokens, pred_tags, pred_utags):
                        if write_tag:
                            if ud:
                                tok.xpos = i2pos[pred_tag]
                            else:
                                tok.pos = i2pos[pred_tag]
                        if write_utag:
                            tok.upos = i2upos[pred_utag]

                        f.write(str(tok) + '\n')
                    f.write('\n')
Example #13
def get_iterator(dataset, device, batch_size, shuffle=True, repeat=False):
    train, val, test = dataset

    train_iter, val_iter = BucketIterator.splits(
        (train, val),
        batch_size=batch_size,
        device=device,
        sort_key=lambda x: len(x.comment_text),
        sort_within_batch=False,
        shuffle=shuffle,
        repeat=repeat)

    test_iter = Iterator(test,
                         batch_size=batch_size,
                         device=device,
                         sort_within_batch=False,
                         repeat=repeat,
                         sort=False)

    train_dl = BatchWrapper(train_iter, "comment_text", [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ])
    valid_dl = BatchWrapper(val_iter, "comment_text", [
        "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
    ])
    test_dl = BatchWrapper(test_iter, "comment_text", None)

    return train_dl, valid_dl, test_dl
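BatchWrapper is defined elsewhere. A minimal sketch consistent with how it is called here (a text field name plus a list of label columns, or None for unlabeled test data), following the common torchtext tutorial pattern; Example #9 above passes only the iterator, which this sketch handles by defaulting both field arguments to None:

# Assumed sketch of a BatchWrapper: yields (x, y) pairs so training code does not
# need to know the torchtext field names. Not the project's own implementation.
import torch

class BatchWrapper:
    def __init__(self, dl, x_var=None, y_vars=None):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) if self.x_var is not None else batch
            if self.y_vars is not None:
                y = torch.cat([getattr(batch, col).unsqueeze(1).float()
                               for col in self.y_vars], dim=1)
            else:
                y = None
            yield (x, y)

    def __len__(self):
        return len(self.dl)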
Example #14
def test_text_cnn(config):

    os.environ['CUDA_VISIBLE_DEVICES'] = str(config['gpu'])

    base_path = config['base_path']
    save_path = os.path.join(base_path, 'text_cnn.pkl')
    vocab_path = os.path.join(base_path, 'vocab.pkl')

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    fields = [
        ('sentence', TEXT),
        ('label', LABEL)
    ]

    test_data = TabularDataset(path=os.path.join(base_path, 'test.tsv'),
                                format='tsv', skip_header=True, fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data, batch_size=config['batch_size'], shuffle=False, device=device)

    model = torch.load(save_path)

    test_accuracy = eval_text_cnn(model, test_iter)
    print('test_accuracy: %.4f' % test_accuracy)
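eval_text_cnn is defined elsewhere in the project. A hypothetical implementation consistent with how it is called above (accuracy of the model over the iterator's sentence/label batches):

# Assumed sketch of eval_text_cnn; the real implementation lives elsewhere.
def eval_text_cnn(model, data_iter):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in data_iter:
            logit = model(batch.sentence)
            correct += (logit.argmax(dim=-1) == batch.label).sum().item()
            total += batch.label.size(0)
    return correct / total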
Example #15
    def evaluate(self, dev_data):
        """ evaluate the dev dataset
        Args:
            dev_data: torchtext.data.Iterator or torchtext.data.Dataset

        Returns:
            a tuple of (accuracy, precision, recall, f1_score)
        """
        if isinstance(dev_data, Iterator):
            dev_iter = dev_data
        else:
            dev_iter = Iterator(dev_data, batch_size=32)

        self.network.eval()
        label_pred, label_true = [], []
        for batch in dev_iter:
            feature, target = batch.text, batch.label
            target = target.type(torch.FloatTensor)
            # since the label vocab is {unk: 0, 0: 1, 1: 2}, we need to subtract 1
            feature.data.t_(), target.data.sub_(1)
            y_pred = self.network(feature)
            y_pred = y_pred.reshape(-1)
            label_pred += list((np.array(y_pred.data) > 0.5).astype(int))
            label_true += list(np.array(target))

        acc = accuracy_score(label_true, label_pred)
        p = precision_score(label_true, label_pred)
        r = recall_score(label_true, label_pred)
        f1 = f1_score(label_true, label_pred)
        output_str = '\nEval - acc:{:.2f} p:{:.2f} r:{:.2f} f1:{:.2f} \n'
        print(output_str.format(acc, p, r, f1))
        return acc, p, r, f1
Example #16
    def __init__(self, batch_size):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")
        self.batch_size = batch_size
        # Objects in which the data will be stored.
        self.text = Field(sequential=True,
                          lower=True,
                          tokenize=self.tokenizer.tokenize,
                          batch_first=True,
                          pad_token='[PAD]',
                          unk_token='[UNK]')
        self.labels = Field(sequential=False, is_target=True)

        self.train, self.dev, self.test = MultiNLI.splits(
            self.text, self.labels)

        # Builds vocabulary for the data.
        self.text.build_vocab(self.train, self.dev, self.test)
        self.labels.build_vocab(self.train)

        self.train_size = len(self.train)
        self.val_size = len(self.dev)
        self.test_size = len(self.test)
        self.name = 'mnli'
        # Standard torchtext iterators, these do not return input suitable for BERT.
        self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
            (self.train, self.dev, self.test),
            batch_size=self.batch_size,
            device=torch.device(
                'cuda:0' if torch.cuda.is_available() else 'cpu'))
Example #17
    def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(tensor_type=torch.FloatTensor)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)
Example #18
    def classify_from_strings(self, strings: Union[List[str], str]) -> list:
        """

        method that can be used for classifying one or multiple examples with a trained classifier

        :param strings: a single string or a list of strings representing the pieces of text that should be classified
        :return: list containing the predictions of the models for the inputted pieces of text
        """
        assert self.has_trained
        if isinstance(strings, str):
            strings = [strings]
        if isinstance(strings, list):
            strings = [[string] for string in strings]

        fields = [('text', self._TEXT)]

        list_of_examples = [Example.fromlist(string, fields) for string in strings]
        dataset = torchtext.data.Dataset(list_of_examples, fields)

        data = Iterator(dataset, batch_size=1, device=torch.device("cpu"), sort=False, sort_within_batch=False,
                        repeat=False, shuffle=False)

        predictions = []

        for item in data:
            x = item.text
            self.model.to(self.device)
            self.model = self.model.eval()
            outputs = self.model([x[0].to(self.device), x[1].to(self.device)])
            predictions.extend(outputs.detach().cpu().argmax(1).tolist())
        results = [self._label_names[i] for i in predictions]
        return results
Example #19
  def __init__(self, batch_size):

    self.text = Field(
        lower=True,
        tokenize=lambda x: [tok.text for tok in spacy_en.tokenizer(x)],
        batch_first=True)
    self.label = Field(sequential=False, unk_token=None, is_target=True)

    self.train, self.dev, self.test = SNLI.splits(self.text, self.label)
    self.sizes = {
        'train': len(self.train),
        'val': len(self.dev),
        'test': len(self.test)
    }
    self.text.build_vocab(self.train, self.dev)
    self.label.build_vocab(self.train)

    vector_cache_loc = '.vector_cache/snli_vectors.pt'
    if os.path.isfile(vector_cache_loc):
      self.text.vocab.vectors = torch.load(vector_cache_loc)
    else:
      self.text.vocab.load_vectors('glove.840B.300d')
      torch.save(self.text.vocab.vectors, vector_cache_loc)

    # Batching
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=batch_size,
        device='cuda:0' if torch.cuda.is_available() else 'cpu')

    self.vocab_size = len(self.text.vocab)
    self.out_dim = len(self.label.vocab)
    self.labels = self.label.vocab.stoi
Example #20
def test_language_model(config: dict) -> None:

    os.environ['CUDA_VISIBLE_DEVICES'] = str(config['gpu'])

    base_path = config['base_path']
    save_path = os.path.join(base_path, 'language_model.pkl')
    vocab_path = os.path.join(base_path, 'vocab.pkl')

    TEXT = data.Field(sequential=True,
                      lower=True,
                      batch_first=True,
                      init_token=SOS,
                      eos_token=EOS)
    fields = [('sentence', TEXT)]

    test_data = TabularDataset(path=os.path.join(base_path, 'test.tsv'),
                               format='tsv',
                               skip_header=True,
                               fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data,
                         batch_size=config['batch_size'],
                         shuffle=False,
                         device=device)

    model = torch.load(save_path)
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)

    test_loss = eval_language_model(model, test_iter, criterion)
    print('test_loss: %.4f\ttest_ppl: %.4f' % (test_loss, 2**test_loss))
Example #21
File: pre.py Project: amlox2019/AML
    def __init__(self, dataset, batch_size, do_train, seed=1):
        super(BatchIterator, self).__init__()
        self.batch_size = batch_size
        self.do_train = do_train

        random.seed(seed)

        # We need different iterators for train and eval
        if self.do_train:
            iterator = BucketIterator(dataset=dataset,
                                      batch_size=self.batch_size,
                                      train=True,
                                      sort_key=lambda x: torchtext.data.interleave_keys(
                                          len(x.src), len(x.trg)))

        else:
            iterator = Iterator(dataset=dataset,
                                batch_size=self.batch_size,
                                sort=False,
                                sort_within_batch=False,
                                repeat=False)

        self.iterator = iterator
        self.num_batches = len(iterator)

        self.iter = iter(self.iterator)
Example #22
    def __init__(self, config):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.batch_size = config['batch_size']
        self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")

        # Objects in which the data will be stored.
        self.text = Field(sequential=True,
                          lower=True,
                          tokenize=self.tokenizer.tokenize,
                          batch_first=True,
                          pad_token='[PAD]',
                          unk_token='[UNK]')
        self.labels = Field(sequential=False, is_target=True)

        self.train, self.dev, self.test = MultiNLI.splits(
            self.text, self.labels)

        # Builds vocabulary for the data.
        self.text.build_vocab(self.train, self.dev, self.test)
        self.labels.build_vocab(self.train)

        # Standard torchtext iterators, these do not return input suitable for BERT.
        self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
            (self.train, self.dev, self.test),
            batch_size=config['batch_size'],
            device=config['device'])
Example #23
    def create_iter(self, batch_size):
        """
        构建迭代器
        :param batch_size: 每批的大小
        :return: iter
        """
        # 定义torchtext中的Field
        fields = [('english', self.english), ('chinese', self.chinese)]
        examples = []
        # 构建中英文example
        for en, ch in zip(self.english_list, self.chinese_list):
            item = [en, ch]
            examples.append(data.Example().fromlist(item, fields))
        # 划分训练集,测试集
        train, test = Dataset(examples=examples,
                              fields=fields).split(split_ratio=0.8)
        self.english.build_vocab(train)
        self.chinese.build_vocab(train)
        self.english_voca_size = len(self.english.vocab)
        self.chinese_voca_size = len(self.chinese.vocab)
        train_iter, test_iter = Iterator.splits(
            (train, test),
            batch_sizes=(batch_size, len(test)),
            sort_key=lambda x: len(x.english),
            sort_within_batch=True,
            device=-1)

        return train_iter, test_iter
Example #24
def load_data():
    TEXT = Field(sequential=True, use_vocab=True)
    LABELS = Field(sequential=True, use_vocab=False,
                   preprocessing=lambda x: list(map(int, x)), pad_token=-1)
    LENGTH = Field(sequential=False, use_vocab=False)
    train_set = TabularDataset(path=DATA_DIR + 'train.tok.tsv',
                               format='TSV',
                               fields=[('text', TEXT), ('labels', LABELS),
                                       ('length', LENGTH)],
                               skip_header=True)
    val_set = TabularDataset(path=DATA_DIR + 'dev.tok.tsv',
                             format='TSV',
                             fields=[('text', TEXT), ('labels', LABELS),
                                     ('length', LENGTH)],
                             skip_header=True)
    train_loader = Iterator(
        dataset=train_set,
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        shuffle=True,
    )
    val_loader = BucketIterator(
        dataset=val_set,
        batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        shuffle=False,
    )
    TEXT.build_vocab(train_set)
    # LABELS.build_vocab(train_set)

    return (TEXT, LABELS, LENGTH), (train_set, val_set), (train_loader,
                                                          val_loader)
Example #25
def train(dataset_dir, emb_file, epoch, batch_size):
    (train_data, test_data, text_field, label_field) = dataset.load_data(dataset_dir, emb_file)

    class_size = len(label_field.vocab)

    nbow = nbow_model.NBoW(class_size, text_field.vocab.vectors)
    nbow.train()

    optimizer = torch.optim.Adam(nbow.parameters())

    train_iter = Iterator(train_data, batch_size)
    for n in range(epoch):
        for batch in train_iter:
            optimizer.zero_grad()

            logit = nbow(batch.text.t())
            loss = F.cross_entropy(logit, batch.label)
            loss.backward()

            optimizer.step()

        nbow.eval()

        (accuracy, num_correct) = compute_accuracy(nbow, test_data)
        print('Epoch: {} Accuracy: {:.2f}% ({}/{})'.format(n + 1, accuracy * 100, num_correct, len(test_data)))

        nbow.train()
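compute_accuracy is not shown; a sketch consistent with how it is called above, wrapping the test Dataset in a non-training Iterator (an assumption, not the project's actual function):

# Assumed sketch of compute_accuracy: evaluate the model over a deterministic Iterator.
def compute_accuracy(model, test_data, batch_size=64):
    test_iter = Iterator(test_data, batch_size, train=False, sort=False)
    num_correct = 0
    with torch.no_grad():
        for batch in test_iter:
            logit = model(batch.text.t())
            num_correct += (logit.argmax(dim=1) == batch.label).sum().item()
    return num_correct / len(test_data), num_correct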
Example #26
def get_data_iter(train_csv, test_csv, fix_length):
    TEXT = data.Field(sequential=True,
                      lower=True,
                      fix_length=fix_length,
                      batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    train_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    train = TabularDataset(path=train_csv,
                           format="csv",
                           fields=train_fields,
                           skip_header=True)
    train_iter = BucketIterator(train,
                                batch_size=batch_size,
                                device=-1,
                                sort_key=lambda x: len(x.text),
                                sort_within_batch=False,
                                repeat=False)
    test_fields = [("label", LABEL), ("title", None), ("text", TEXT)]
    test = TabularDataset(path=test_csv,
                          format="csv",
                          fields=test_fields,
                          skip_header=True)
    test_iter = Iterator(test,
                         batch_size=batch_size,
                         device=-1,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False)

    #    vectors = Vectors(name=word2vec_dir)
    #    TEXT.build_vocab(train, vectors=vectors)
    TEXT.build_vocab(train)
    vocab = TEXT.vocab
    return train_iter, test_iter, vocab
Example #27
def test(args):
    train_data, dev_data, test_data, sentence_vocab, pred_arg_vocab, _ = read_data(
        args.path, args.train, args.test, args.dev)

    test_iter = Iterator(test_data,
                         1,
                         sort_key=lambda x: len(x.sentence),
                         train=False,
                         repeat=False)
    model = torch.load(args.save_model)

    model.eval()

    instances_seen = 0
    labels = []
    outputs = []
    with torch.no_grad():
        for v_iteration, instance in enumerate(test_iter):
            model_outputs = model(instance)
            output = torch.sigmoid(model_outputs)
            outputs.append(output.item())
            labels.append(instance.label[0].item())

    pred = lambda x: 1 if x >= 0.15 else 0
    predicted = [pred(x) for x in outputs]
    print(f1_score(labels, predicted))
Example #28
    def predict(self, model_name, filepath=Path('../data/output/sub.csv')):
        """
        """

        preds = []
        sub_dataset = TabularDataset(filepath,
                                     format="CSV",
                                     fields=self.fields,
                                     skip_header=True)
        sub_iter = Iterator(sub_dataset,
                            batch_size=self.batch_size,
                            device=self.device,
                            train=False,
                            shuffle=False,
                            sort=False)
        self.load_checkpoint(model_name)
        self.model.eval()
        with torch.no_grad():
            for (label, text), _ in sub_iter:
                label = label.type(torch.LongTensor)
                label = label.to(self.device)
                text = text.type(torch.LongTensor)
                text = text.to(self.device)
                output = self.model(text, label)
                _, output = output
                preds.extend(torch.argmax(output, 1).tolist())

        id_list = ["twitter_" + str(n) for n in range(1, len(sub_dataset) + 1)]
        label_list = [
            "SARCASM" if pred == 1 else "NOT_SARCASM" for pred in preds
        ]
        df_sub = pd.DataFrame(list(zip(id_list, label_list)),
                              columns=['id', 'label'])
        return df_sub
Example #29
def predict_text_cnn(model_path, file_path, vocab_path, batch_size=64):

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    fields = [('sentence', TEXT)]

    test_data = TabularDataset(path=file_path,
                               format='tsv',
                               skip_header=True,
                               fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         shuffle=False,
                         device=device)
    model = torch.load(model_path)

    sentiments = []
    model.eval()
    with torch.no_grad():

        for batch in test_iter:
            sentence = batch.sentence
            logit = model(sentence)
            prob = torch.softmax(logit, dim=-1)[:, 1].tolist()
            sentiments.extend(prob)

    return sentiments
Example #30
def get_iterator(dataset, batch_size, train=True):
    return Iterator(dataset,
                    batch_size=batch_size,
                    device=device,
                    train=train,
                    shuffle=train,
                    sort=False)
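A usage sketch for this helper (train_data and valid_data are placeholder datasets, not defined here):

# Hypothetical usage of get_iterator: shuffled training batches, deterministic evaluation batches.
train_iter = get_iterator(train_data, batch_size=32, train=True)
valid_iter = get_iterator(valid_data, batch_size=32, train=False)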