Example 1
class Task(object):
    def __init__(self, task_id, task_name, train_set, dev_set, test_set):

        self.task_id = task_id
        self.task_name = task_name

        self.train_set = train_set
        self.dev_set = dev_set
        self.test_set = test_set

        self.train_data_loader = None
        self.dev_data_loader = None
        self.test_data_loader = None

    def init_data_loader(self, batch_size):

        self.train_data_loader = Batch(self.train_set,
                                       batch_size,
                                       sampler=RandomSampler())
        self.train_data_loader.init_iter()
        self.dev_data_loader = Batch(self.dev_set,
                                     batch_size,
                                     sampler=SequentialSampler())
        self.test_data_loader = Batch(self.test_set,
                                      batch_size,
                                      sampler=SequentialSampler())
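
A minimal usage sketch for the class above. The import paths assume the fastNLP 0.x layout these examples target, and the three DataSet splits are placeholders:

from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler, SequentialSampler

# train_set, dev_set and test_set are assumed to be pre-built fastNLP DataSet objects
task = Task(task_id=0, task_name="example-task", train_set=train_set,
            dev_set=dev_set, test_set=test_set)
task.init_data_loader(batch_size=32)
for batch_x, batch_y in task.train_data_loader:
    pass  # one shuffled pass over the training set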
Example 2
    def _tqdm_train(self):
        self.step = 0
        data_iterator = Batch(self.train_data,
                              batch_size=self.batch_size,
                              sampler=self.sampler,
                              as_numpy=False)
        total_steps = data_iterator.num_batches * self.n_epochs
        with tqdm(total=total_steps,
                  postfix='loss:{0:<6.5f}',
                  leave=False,
                  dynamic_ncols=True) as pbar:
            avg_loss = 0
            for epoch in range(1, self.n_epochs + 1):
                pbar.set_description_str(
                    desc="Epoch {}/{}".format(epoch, self.n_epochs))
                for batch_x, batch_y in data_iterator:
                    _move_dict_value_to_device(batch_x,
                                               batch_y,
                                               device=self._model_device)
                    prediction = self._data_forward(self.model, batch_x)
                    loss = self._compute_loss(prediction, batch_y)
                    avg_loss += loss.item()
                    self._grad_backward(loss)
                    self._update()
                    self._summary_writer.add_scalar("loss",
                                                    loss.item(),
                                                    global_step=self.step)
                    for name, param in self.model.named_parameters():
                        if param.requires_grad:
                            self._summary_writer.add_scalar(
                                name + "_mean",
                                param.mean(),
                                global_step=self.step)
                            # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
                            # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
                    if (self.step + 1) % self.print_every == 0:
                        pbar.set_postfix_str("loss:{0:<6.5f}".format(
                            avg_loss / self.print_every))
                        avg_loss = 0
                        pbar.update(self.print_every)
                    self.step += 1
                    if self.validate_every > 0 and self.step % self.validate_every == 0 \
                            and self.dev_data is not None:
                        eval_res = self._do_validation(epoch=epoch,
                                                       step=self.step)
                        eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                                   self.tester._format_eval_results(eval_res)
                        pbar.write(eval_str)
                if self.validate_every < 0 and self.dev_data:
                    eval_res = self._do_validation(epoch=epoch, step=self.step)
                    eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                               self.tester._format_eval_results(eval_res)
                    pbar.write(eval_str)
                if epoch != self.n_epochs:
                    data_iterator = Batch(self.train_data,
                                          batch_size=self.batch_size,
                                          sampler=self.sampler,
                                          as_numpy=False)
            pbar.close()
Example 3
    def test(self):
        # turn on the testing mode; clean up the history
        network = self._model
        self._mode(network, is_test=True)
        data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False)
        eval_results = {}
        try:
            with torch.no_grad():
                for batch_x, batch_y in data_iterator:
                    _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                    pred_dict = self._data_forward(self._predict_func, batch_x)
                    if not isinstance(pred_dict, dict):
                        raise TypeError(f"The return value of {get_func_signature(self._predict_func)} " 
                                                         f"must be `dict`, got {type(pred_dict)}.")
                    for metric in self.metrics:
                        metric(pred_dict, batch_y)
                for metric in self.metrics:
                    eval_result = metric.get_metric()
                    if not isinstance(eval_result, dict):
                        raise TypeError(f"The return value of {get_func_signature(metric.get_metric)} must be "
                                        f"`dict`, got {type(eval_result)}")
                    metric_name = metric.__class__.__name__
                    eval_results[metric_name] = eval_result
        except CheckError as e:
            prev_func_signature = get_func_signature(self._predict_func)
            _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature,
                                 check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
                                 dataset=self.data, check_level=0)

        if self.verbose >= 1:
            print("[tester] \n{}".format(self._format_eval_results(eval_results)))
        self._mode(network, is_test=False)
        return eval_results
Example 4
    def test(self):
        # turn on the testing mode; clean up the history
        network = self._model
        self.mode(network, is_test=True)
        self.eval_history.clear()
        output, truths = defaultdict(list), defaultdict(list)
        data_iterator = Batch(self.data,
                              self.batch_size,
                              sampler=RandomSampler(),
                              as_numpy=False)

        with torch.no_grad():
            for batch_x, batch_y in data_iterator:
                prediction = self.data_forward(network, batch_x)
                assert isinstance(prediction, dict)
                for k, v in prediction.items():
                    output[k].append(v)
                for k, v in batch_y.items():
                    truths[k].append(v)
            for k, v in output.items():
                output[k] = itertools.chain(*v)
            for k, v in truths.items():
                truths[k] = itertools.chain(*v)
            args = _build_args(self._evaluator, **output, **truths)
            eval_results = self._evaluator(**args)
        print("[tester] {}".format(self.print_eval_results(eval_results)))
        self.mode(network, is_test=False)
        return eval_results
Example 5
    def predict(self, network, data):
        """Perform inference using the trained model.

        :param network: a PyTorch model (cpu)
        :param data: a DataSet object.
        :return: list of list of strings, [num_examples, tag_seq_length]
        """
        # transform strings into DataSet object
        # data = self.prepare_input(data)

        # turn on the testing mode; clean up the history
        self.mode(network, test=True)
        batch_output = []

        data_iterator = Batch(data,
                              batch_size=self.batch_size,
                              sampler=SequentialSampler(),
                              use_cuda=False)

        for batch_x, _ in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
            batch_output.append(prediction)

        return self._post_processor(batch_output, self.label_vocab)
Example 6
    def test(self, filepath):

        tag_proc = self._dict['tag_indexer']
        cws_model = self.pipeline.pipeline[-2].model
        pipeline = self.pipeline.pipeline[:5]

        pipeline.insert(1, tag_proc)
        pp = Pipeline(pipeline)

        reader = ConlluCWSReader()

        # te_filename = '/home/hyan/ctb3/test.conllx'
        te_dataset = reader.load(filepath)
        pp(te_dataset)

        batch_size = 64
        te_batcher = Batch(te_dataset,
                           batch_size,
                           SequentialSampler(),
                           use_cuda=False)
        pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher, type='bmes')
        f1 = round(f1 * 100, 2)
        pre = round(pre * 100, 2)
        rec = round(rec * 100, 2)
        # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

        return f1, pre, rec
Example 7
    def test(self):
        data = DataSet()
        for text, label in zip(texts, labels):
            x = TextField(text, is_target=False)
            y = LabelField(label, is_target=True)
            ins = Instance(text=x, label=y)
            data.append(ins)

        # use vocabulary to index data
        data.index_field("text", vocab)

        # define naive sampler for batch class
        class SeqSampler:
            def __call__(self, dataset):
                return list(range(len(dataset)))

        # use batch to iterate dataset
        data_iterator = Batch(data, 2, SeqSampler(), False)
        total_data = 0
        for batch_x, batch_y in data_iterator:
            total_data += batch_x["text"].size(0)
            self.assertTrue(batch_x["text"].size(0) == 2
                            or total_data == len(raw_texts))
            self.assertTrue(isinstance(batch_x, dict))
            self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
            self.assertTrue(isinstance(batch_y, dict))
            self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
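
The SeqSampler above demonstrates the entire sampler contract expected by Batch: any callable that maps a dataset to a list of example indices will work. As a sketch, another conforming sampler (ReverseSampler is illustrative, not a fastNLP class):

class ReverseSampler:
    """Visit examples from last to first; same call contract as SeqSampler above."""

    def __call__(self, dataset):
        return list(range(len(dataset) - 1, -1, -1))

data_iterator = Batch(data, 2, ReverseSampler(), False)  # batch_size=2, as_numpy=False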
Example 8
    def process(self, dataset):
        self.model.eval()
        assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
        data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler())

        batch_output = defaultdict(list)
        if hasattr(self.model, "predict"):
            predict_func = self.model.predict
        else:
            predict_func = self.model.forward
        with torch.no_grad():
            for batch_x, _ in data_iterator:
                refined_batch_x = _build_args(predict_func, **batch_x)
                prediction = predict_func(**refined_batch_x)
                seq_lens = batch_x[self.seq_len_field_name].tolist()

                for key, value in prediction.items():
                    tmp_batch = []
                    value = value.cpu().numpy()
                    if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                        batch_output[key].extend(value.tolist())
                    else:
                        for idx, seq_len in enumerate(seq_lens):
                            tmp_batch.append(value[idx, :seq_len])
                        batch_output[key].extend(tmp_batch)
                if self.seq_len_field_name not in prediction:
                    batch_output[self.seq_len_field_name].extend(seq_lens)

        # TODO: with the current implementation, downstream processors need to know the keys of the model's output
        for field_name, fields in batch_output.items():
            dataset.add_field(field_name, fields, is_input=True, is_target=False)

        return dataset
Example 9
    def test(self, network, dev_data):
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
        else:
            self._model = network

        # turn on the testing mode; clean up the history
        self.mode(network, is_test=True)
        self.eval_history.clear()
        self.batch_output.clear()

        data_iterator = Batch(dev_data,
                              self.batch_size,
                              sampler=RandomSampler(),
                              use_cuda=self.use_cuda)
        step = 0

        for batch_x, batch_y in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
                eval_results = self.evaluate(prediction, batch_y)

            if self.save_output:
                self.batch_output.append(prediction)
            if self.save_loss:
                self.eval_history.append(eval_results)

            print_output = "[test step {}] {}".format(step, eval_results)
            logger.info(print_output)
            if self.print_every_step > 0 and step % self.print_every_step == 0:
                print(self.make_eval_output(prediction, eval_results))
            step += 1
Example 10
    def test_list_of_numpy_to_tensor(self):
        ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                     [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in batch_iter:
            print(x, y)
Example 11
    def test_sequential_batch(self):
        batch_size = 32
        pause_seconds = 0.01
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)

        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
        for batch_x, batch_y in batch:
            time.sleep(pause_seconds)
Example 12
    def next_batch(self):
        try:
            return next(self.train_iter)
        except StopIteration:
            # the iterator is exhausted: rebuild it and start the next pass over the data
            self.train_iter = iter(
                Batch(dataset=self.train_data,
                      batch_size=self.batch_size,
                      sampler=SequentialSampler()))
            return next(self.train_iter)
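
The try/except above turns the per-epoch Batch iterator into an endless stream of batches: when an epoch is exhausted, the iterator is rebuilt and iteration resumes from the start of the dataset. A usage sketch (loader stands for any object exposing the next_batch method above; the step count is arbitrary):

for step in range(1000):
    batch_x, batch_y = loader.next_batch()  # crosses epoch boundaries transparently
    # ... forward/backward pass goes here ...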
Example 13
    def test_numpy_padding(self):
        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
        ds.set_input("x")
        ds.set_target("y")
        batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in batch_iter:
            self.assertEqual(x["x"].shape, (4, 4))
            self.assertEqual(y["y"].shape, (4, 4))
Example 14
    def test_simple(self):
        dataset = construct_dataset(
            [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
        dataset.set_target()
        batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)

        cnt = 0
        for _, _ in batch:
            cnt += 1
        self.assertEqual(cnt, 10)
Example 15
    def test_list_of_list_to_tensor(self):
        ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                     [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in batch_iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
Example 16
    def test_dataset_batching(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        ds.set_input("x")
        ds.set_target("y")
        batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
        for x, y in batch_iter:
            self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
            self.assertEqual(len(x["x"]), 4)
            self.assertEqual(len(y["y"]), 4)
            self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
            self.assertListEqual(list(y["y"][-1]), [5, 6])
Example 17
    def test_numpy_to_tensor(self):
        ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                      "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
        ds.set_input("x")
        ds.set_target("y")
        batch_iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
        for x, y in batch_iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
Example 18
    def _train(self):
        if not self.use_tqdm:
            from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm
        else:
            inner_tqdm = tqdm
        self.step = 0
        start = time.time()
        total_steps = (len(self.train_data) // self.batch_size + int(
            len(self.train_data) % self.batch_size != 0)) * self.n_epochs
        with inner_tqdm(total=total_steps,
                        postfix='loss:{0:<6.5f}',
                        leave=False,
                        dynamic_ncols=True) as pbar:
            avg_loss = 0
            data_iterator = Batch(self.train_data,
                                  batch_size=self.batch_size,
                                  sampler=self.sampler,
                                  as_numpy=False,
                                  prefetch=self.prefetch)
            for epoch in range(1, self.n_epochs + 1):
                pbar.set_description_str(
                    desc="Epoch {}/{}".format(epoch, self.n_epochs))
                last_stage = (epoch > self.n_epochs + 1 - self.final_epochs)
                if epoch == self.n_epochs + 1 - self.final_epochs:
                    print(
                        'Entering the final stage. (Only train the selected structure)'
                    )
                # early stopping
                self.callback_manager.on_epoch_begin(epoch, self.n_epochs)

                # 1. Training the shared parameters omega of the child models
                self.train_shared(pbar)

                # 2. Training the controller parameters theta
                if not last_stage:
                    self.train_controller()

                if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
                    (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
                        and self.dev_data is not None:
                    if not last_stage:
                        self.derive()
                    eval_res = self._do_validation(epoch=epoch, step=self.step)
                    eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
                                                                                total_steps) + \
                                self.tester._format_eval_results(eval_res)
                    pbar.write(eval_str)

                # lr decay; early stopping
                self.callback_manager.on_epoch_end(epoch, self.n_epochs,
                                                   self.optimizer)
            # =============== epochs end =================== #
            pbar.close()
Example 19
    def next_batch(self):
        try:
            _next_batch = next(self.train_iter)
            # drop a trailing partial batch so every batch holds exactly batch_size examples
            if _next_batch[0]['word_seq'].shape[0] != self.batch_size:
                raise StopIteration
            return _next_batch
        except StopIteration:
            # rebuild the iterator and recurse to return the first batch of the next pass
            self.train_iter = iter(
                Batch(dataset=self.train_data,
                      batch_size=self.batch_size,
                      sampler=SequentialSampler()))
            return self.next_batch()
Example 20
    def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):

        if dataset == "yelp":
            dataset = DataSet()

            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')
                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        dataset.append(Instance(text=text, label=label))

            dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
            dataset.apply(
                lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                new_field_name='words')
            dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
            dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])),
                          new_field_name='words')
            dataset.apply(lambda x: int(x['label']),
                          new_field_name='label_seq',
                          is_target=True)

            _train_data, _test_data = dataset.split(0.3)

            _vocab = Vocabulary(min_freq=2)
            _train_data.apply(
                lambda x: [_vocab.add(word) for word in x['words']])
            _vocab.build_vocab()

            _train_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)
            _test_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)

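        # note: only the "yelp" branch above defines _train_data, _test_data and _vocab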
        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
Example 21
    def train(self, network, train_data, dev_data=None):
        """General Training Procedure

        :param network: a model
        :param train_data: a DataSet instance, the training data
        :param dev_data: a DataSet instance, the validation data (optional)
        """
        # transfer model to gpu if available
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
            # self._model is used to access model-specific loss
        else:
            self._model = network

        # define Tester over dev data
        if self.validate:
            default_valid_args = {"batch_size": self.batch_size, "pickle_path": self.pickle_path,
                                  "use_cuda": self.use_cuda, "evaluator": self._evaluator}
            validator = self._create_validator(default_valid_args)
            logger.info("validator defined as {}".format(str(validator)))

        # optimizer and loss
        self.define_optimizer()
        logger.info("optimizer defined as {}".format(str(self._optimizer)))
        self.define_loss()
        logger.info("loss function defined as {}".format(str(self._loss_func)))

        # main training procedure
        start = time.time()
        logger.info("training epochs started")
        for epoch in range(1, self.n_epochs + 1):
            logger.info("training epoch {}".format(epoch))

            # turn on network training mode
            self.mode(network, is_test=False)
            # prepare mini-batch iterator
            data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                                  use_cuda=self.use_cuda)
            logger.info("prepared data iterator")

            # one forward and backward pass
            self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)

            # validation
            if self.validate:
                if dev_data is None:
                    raise RuntimeError(
                        "self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
                logger.info("validation started")
                validator.test(network, dev_data)
Example 22
    def train(self):
        """Start Training.

        :return:
        """
        try:
            if torch.cuda.is_available() and self.use_cuda:
                self.model = self.model.cuda()

            self.mode(self.model, is_test=False)

            start = time.time()
            self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
            print("training epochs started " + self.start_time)
            if self.save_path is None:

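                # a no-op stand-in that silently absorbs every SummaryWriter call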
                class pseudoSW:
                    def __getattr__(self, item):
                        def pass_func(*args, **kwargs):
                            pass

                        return pass_func

                self._summary_writer = pseudoSW()
            else:
                path = os.path.join(
                    self.save_path,
                    'tensorboard_logs_{}'.format(self.start_time))
                self._summary_writer = SummaryWriter(path)

            epoch = 1
            while epoch <= self.n_epochs:

                data_iterator = Batch(self.train_data,
                                      batch_size=self.batch_size,
                                      sampler=RandomSampler(),
                                      as_numpy=False)

                self._train_epoch(data_iterator, self.model, epoch,
                                  self.dev_data, start)

                # validate_every override validation at end of epochs
                if self.dev_data and self.validate_every <= 0:
                    self.do_validation()
                epoch += 1
        finally:
            self._summary_writer.close()
            del self._summary_writer
Example 23
    def _print_train(self):
        epoch = 1
        start = time.time()
        while epoch <= self.n_epochs:

            data_iterator = Batch(self.train_data,
                                  batch_size=self.batch_size,
                                  sampler=self.sampler,
                                  as_numpy=False)

            for batch_x, batch_y in data_iterator:
                # TODO: this may break if the user moves the prediction to a different device inside the model
                _move_dict_value_to_device(batch_x,
                                           batch_y,
                                           device=self._model_device)
                prediction = self._data_forward(self.model, batch_x)
                loss = self._compute_loss(prediction, batch_y)
                self._grad_backward(loss)
                self._update()
                self._summary_writer.add_scalar("loss",
                                                loss.item(),
                                                global_step=self.step)
                for name, param in self.model.named_parameters():
                    if param.requires_grad:
                        self._summary_writer.add_scalar(name + "_mean",
                                                        param.mean(),
                                                        global_step=self.step)
                        # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
                        # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
                if self.print_every > 0 and self.step % self.print_every == 0:
                    end = time.time()
                    diff = timedelta(seconds=round(end - start))
                    print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time:  {}".format(
                        epoch, self.step, loss.data, diff)
                    print(print_output)

                if (self.validate_every > 0
                        and self.step % self.validate_every == 0
                        and self.dev_data is not None):
                    self._do_validation(epoch=epoch, step=self.step)

                self.step += 1

            # validate_every override validation at end of epochs
            if self.dev_data and self.validate_every <= 0:
                self._do_validation(epoch=epoch, step=self.step)
            epoch += 1
Example 24
    def test(self, model, dataset):
        self.model = model.cuda() if self.use_cuda else model
        self.model.eval()
        batchiter = Batch(dataset, self.batch_size, SequentialSampler(),
                          self.use_cuda)
        eval_res = defaultdict(list)
        i = 0
        for batch_x, batch_y in batchiter:
            with torch.no_grad():
                pred_y = self.model(**batch_x)
                eval_one = self.model.evaluate(**pred_y, **batch_y)
            i += self.batch_size
            for eval_name, tensor in eval_one.items():
                eval_res[eval_name].append(tensor)
        tmp = {}
        for eval_name, tensorlist in eval_res.items():
            tmp[eval_name] = torch.cat(tensorlist, dim=0)

        self.res = self.model.metrics(**tmp)
Example 25
    def test(self, network, dev_data):
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
        else:
            self._model = network

        # turn on the testing mode; clean up the history
        self.mode(network, is_test=True)
        self.eval_history.clear()
        output_list = []
        truth_list = []

        data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)

        for batch_x, batch_y in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
            output_list.append(prediction)
            truth_list.append(batch_y)
        eval_results = self.evaluate(output_list, truth_list)
        print("[tester] {}".format(self.print_eval_results(eval_results)))
        logger.info("[tester] {}".format(self.print_eval_results(eval_results)))
Example 26
    def predict(self, network, data):
        """Perform inference using the trained model.

        :param network: a PyTorch model (cpu)
        :param data: a DataSet object.
        :return: list of batch outputs
        """
        # turn on the testing mode; clean up the history
        self.mode(network, test=True)
        batch_output = []

        data_iterator = Batch(data,
                              batch_size=self.batch_size,
                              sampler=SequentialSampler(),
                              as_numpy=False)

        for batch_x, _ in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
            batch_output.append(prediction)

        return batch_output
Example 27
    def get_reward(self, dag, entropies, hidden, valid_idx=0):
        """Computes the perplexity of a single sampled model on a minibatch of
        validation data.
        """
        if not isinstance(entropies, np.ndarray):
            entropies = entropies.data.cpu().numpy()

        data_iterator = Batch(self.dev_data,
                              batch_size=self.batch_size,
                              sampler=self.sampler,
                              as_numpy=False,
                              prefetch=self.prefetch)

        for inputs, targets in data_iterator:
            valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag)
            valid_loss = utils.to_item(valid_loss.data)

            valid_ppl = math.exp(valid_loss)

            R = 80 / valid_ppl

            rewards = R + 1e-4 * entropies

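            # note: returning inside the loop means the reward uses only the first validation batch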
            return rewards, hidden
Example 28
    def train(self, network, train_data, dev_data=None):
        """General Training Procedure

        :param network: a model
        :param train_data: a DataSet instance, the training data
        :param dev_data: a DataSet instance, the validation data (optional)
        """
        # transfer model to gpu if available
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
            # self._model is used to access model-specific loss
        else:
            self._model = network

        # define Tester over dev data
        if self.validate:
            default_valid_args = {
                "save_output": True,
                "validate_in_training": True,
                "save_dev_input": True,
                "save_loss": True,
                "batch_size": self.batch_size,
                "pickle_path": self.pickle_path,
                "use_cuda": self.use_cuda,
                "print_every_step": 0
            }
            validator = self._create_validator(default_valid_args)
            logger.info("validator defined as {}".format(str(validator)))

        # optimizer and loss
        self.define_optimizer()
        logger.info("optimizer defined as {}".format(str(self._optimizer)))
        self.define_loss()
        logger.info("loss function defined as {}".format(str(self._loss_func)))

        # main training procedure
        start = time.time()
        logger.info("training epochs started")
        for epoch in range(1, self.n_epochs + 1):
            logger.info("training epoch {}".format(epoch))

            # turn on network training mode
            self.mode(network, is_test=False)
            # prepare mini-batch iterator
            data_iterator = Batch(train_data,
                                  batch_size=self.batch_size,
                                  sampler=RandomSampler(),
                                  use_cuda=self.use_cuda)
            logger.info("prepared data iterator")

            # one forward and backward pass
            self._train_step(data_iterator,
                             network,
                             start=start,
                             n_print=self.print_every_step,
                             epoch=epoch)

            # validation
            if self.validate:
                logger.info("validation started")
                validator.test(network, dev_data)

                if self.save_best_dev and self.best_eval_result(validator):
                    self.save_model(network, self.model_name)
                    print("Saved better model selected by validation.")
                    logger.info("Saved better model selected by validation.")

                valid_results = validator.show_metrics()
                print("[epoch {}] {}".format(epoch, valid_results))
                logger.info("[epoch {}] {}".format(epoch, valid_results))
Example 29
    def __init__(self,
                 path='.data/sst/trees',
                 data_type='sst',
                 batch_size=32,
                 split_ratio=0.1,
                 seq_len=15,
                 min_freq=2):

        data_set = DataSet()
        if data_type == 'yelp':
            path = '.data/yelp'
            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')

                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        data_set.append(Instance(text=text, label=label))

            data_set.apply(
                lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                new_field_name='words')
            data_set.drop(lambda x: len(x['words']) > seq_len + 2)

        elif data_type == 'sst':
            path = '.data/sst/trees'
            text = data.Field(init_token='<start>',
                              eos_token='<eos>',
                              lower=True,
                              tokenize='spacy',
                              fix_length=16)
            label = data.Field(sequential=False, unk_token='<unk>')
            filter = lambda ex: len(ex.text) <= seq_len and ex.label != 'neutral'
            sst_train = datasets.SST(os.path.join(path, 'train.txt'),
                                     text,
                                     label,
                                     filter_pred=filter)
            sst_dev = datasets.SST(os.path.join(path, 'dev.txt'),
                                   text,
                                   label,
                                   filter_pred=filter)
            sst_test = datasets.SST(os.path.join(path, 'test.txt'),
                                    text,
                                    label,
                                    filter_pred=filter)
            for ex in sst_train.examples + sst_dev.examples + sst_test.examples:
                data_set.append(
                    Instance(words=ex.text,
                             label={
                                 'negative': 0,
                                 'positive': 1
                             }[ex.label]))

            data_set.apply(
                lambda x: ['<start>'] + [w.lower() for w in x['words']] + ['<eos>'],
                new_field_name='words')

        elif data_type == 'test':
            with io.open('fasttrial1.pos', 'r', encoding="utf-8") as f:
                for text in f:
                    data_set.append(Instance(text=text, label=1))
            with io.open('fasttrial1.neg', 'r', encoding="utf-8") as f:
                for text in f:
                    data_set.append(Instance(text=text, label=0))

            data_set.apply(
                lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                new_field_name='words')
            data_set.drop(lambda x: len(x['words']) > seq_len + 2)

        data_set.apply(lambda x: x['words'] + ['<pad>'] * (seq_len + 2 - len(x['words'])),
                       new_field_name='words')

        _train_data, _ = data_set.split(split_ratio)

        _vocab = Vocabulary(min_freq=min_freq)
        _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
        _vocab.build_vocab()

        data_set.apply(lambda x: [_vocab.to_index(w) for w in x['words']],
                       new_field_name='word_seq',
                       is_input=True)
        data_set.apply(lambda x: x['word_seq'][1:] + [0],
                       new_field_name='dec_target',
                       is_target=True)
        data_set.apply(lambda x: int(x['label']),
                       new_field_name='label_seq',
                       is_target=True)
        _train_data, _test_data = data_set.split(split_ratio)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
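
A usage sketch for the constructor above. The enclosing class name DataLoader is assumed here purely for illustration (only __init__ is shown); the field names follow the apply calls above:

loader = DataLoader(data_type='sst', batch_size=32, seq_len=15)
batch_x, batch_y = next(loader.train_iter)  # first mini-batch
print(batch_x['word_seq'])                  # padded, indexed token ids
print(batch_y['label_seq'], batch_y['dec_target'])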