コード例 #1
0
ファイル: torch_component.py プロジェクト: zhoumo99133/HanLP
    def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None, output=False, **kwargs):
        """Evaluate this component on a test set.

        Args:
            tst_data: Test set, which is usually a file path.
            save_dir: The directory to save evaluation scores or predictions.
            logger: Logger for reporting progress; built automatically when ``None``.
            batch_size: Batch size for the test dataloader; falls back to ``config['batch_size']`` (default 32).
            output: ``True`` to save predictions into an auto-generated file; a falsy value skips saving.
            **kwargs: Forwarded to :meth:`evaluate_dataloader`.

        Returns:
            (metric, outputs) where outputs are the return values of ``evaluate_dataloader``.
        """
        if not self.model:
            raise RuntimeError('Call fit or load before evaluate.')
        if isinstance(tst_data, str):
            # Resolve local/remote identifiers to a concrete path; keep the basename for logging.
            tst_data = get_resource(tst_data)
            filename = os.path.basename(tst_data)
        else:
            filename = None
        if output is True:
            # Caller asked for "some output" without naming a file — generate a prediction filename.
            output = self.generate_prediction_filename(tst_data if isinstance(tst_data, str) else 'test.txt', save_dir)
        if logger is None:
            _logger_name = basename_no_ext(filename) if filename else None
            logger = self.build_logger(_logger_name, save_dir)
        if not batch_size:
            batch_size = self.config.get('batch_size', 32)
        data = self.build_dataloader(**merge_dict(self.config, data=tst_data, batch_size=batch_size, shuffle=False,
                                                  device=self.devices[0], logger=logger, overwrite=True))
        # Unwrap nested dataloader/dataset wrappers to reach the innermost dataset.
        dataset = data
        while dataset and hasattr(dataset, 'dataset'):
            dataset = dataset.dataset
        num_samples = len(dataset) if dataset else None
        if output and isinstance(dataset, TransformableDataset):
            # Tag each sample with its position so predictions can be matched back to inputs
            # when they are written out. Mutates the dataset (and its cache) in place.
            def add_idx(samples):
                for idx, sample in enumerate(samples):
                    if sample:
                        sample[IDX] = idx

            add_idx(dataset.data)
            if dataset.cache:
                add_idx(dataset.cache)

        criterion = self.build_criterion(**self.config)
        metric = self.build_metric(**self.config)
        start = time.time()
        outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename, output=output, input=tst_data,
                                           save_dir=save_dir,
                                           test=True,
                                           num_samples=num_samples,
                                           **merge_dict(self.config, batch_size=batch_size, metric=metric,
                                                        logger=logger, **kwargs))
        elapsed = time.time() - start
        if logger:
            # Report throughput in samples/s when the dataset size is known, else batches/s.
            if num_samples:
                logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
            else:
                logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
        return metric, outputs
コード例 #2
0
 def build(self, logger, **kwargs):
     """Build the model, optimizer, loss and metrics from ``self.config``.

     Args:
         logger: Logger passed down to metric construction.
         **kwargs: May carry ``training``, ``loss`` and ``metrics`` overrides.

     Returns:
         A 4-tuple of (model, optimizer, loss, metrics).
     """
     # Order matters: the transform's config must exist before the model is built,
     # and vocabs are frozen only after model construction.
     self.transform.build_config()
     self.model = self.build_model(**merge_dict(self.config, training=kwargs.get('training', None),
                                                loss=kwargs.get('loss', None)))
     self.transform.lock_vocabs()
     optimizer = self.build_optimizer(**self.config)
     # Guarantee build_loss always receives a `loss` kwarg, defaulting it to None.
     loss = self.build_loss(
         **self.config if 'loss' in self.config else dict(list(self.config.items()) + [('loss', None)]))
     # allow for different
     metrics = self.build_metrics(**merge_dict(self.config, metrics=kwargs.get('metrics', 'accuracy'),
                                               logger=logger, overwrite=True))
     # Normalize a single Keras metric into a list; other non-list values pass through unchanged.
     if not isinstance(metrics, list):
         if isinstance(metrics, tf.keras.metrics.Metric):
             metrics = [metrics]
     if not self.model.built:
         # Force the model to build its weights, either with a sample batch or a shape spec.
         sample_inputs = self.sample_data
         if sample_inputs is not None:
             self.model(sample_inputs)
         else:
             if len(self.transform.output_shapes[0]) == 1 and self.transform.output_shapes[0][0] is None:
                 x_shape = self.transform.output_shapes[0]
             else:
                 x_shape = list(self.transform.output_shapes[0])
                 for i, shape in enumerate(x_shape):
                     x_shape[i] = [None] + shape  # batch + X.shape
             self.model.build(input_shape=x_shape)
     self.compile_model(optimizer, loss, metrics)
     return self.model, optimizer, loss, metrics
コード例 #3
0
ファイル: torch_component.py プロジェクト: zhoumo99133/HanLP
    def load(self, save_dir: str, devices=None, verbose=HANLP_VERBOSE, **kwargs):
        """Load from a local/remote component.

        Args:
            save_dir: An identifier which can be a local path or a remote URL or a pre-defined string.
            devices: The devices this component will be moved onto; defaults to the current
                devices when a model is already loaded.
            verbose: ``True`` to log loading progress.
            **kwargs: To override some configs.
        """
        save_dir = get_resource(save_dir)
        # flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
        if devices is None and self.model:
            # Reloading over an existing model: stay on the devices already in use.
            devices = self.devices
        # Config and vocabs must be restored before the model is (re)built from them.
        self.load_config(save_dir, **kwargs)
        self.load_vocabs(save_dir)
        if verbose:
            flash('Building model [blink][yellow]...[/yellow][/blink]')
        # training=False builds the model in inference mode; kwargs override saved configs.
        self.model = self.build_model(
            **merge_dict(self.config, training=False, **kwargs, overwrite=True,
                         inplace=True))
        if verbose:
            flash('')
        self.load_weights(save_dir, **kwargs)
        self.to(devices)
        # Loaded components are for inference by default.
        self.model.eval()
コード例 #4
0
 def predict(self, data: Union[str, List[str]], batch_size: int = None, **kwargs):
     """Classify one document or a batch of documents.

     Args:
         data: A single document (``str``/``tuple``) or a list of documents.
         batch_size: Decoding batch size; ``None`` falls back to the config.
         **kwargs: Unused here.

     Returns:
         The label of a single input, or a list of labels for batched input.
     """
     if not data:
         return []
     single_input = isinstance(data, (str, tuple))
     if single_input:
         data = [data]
     text_a = self.config.text_a_key
     text_b = self.config.text_b_key
     records = []
     for position, item in enumerate(data):
         # Remember each input's position so results can be written back in order.
         record = {IDX: position}
         if text_b:
             # Sentence-pair task: the input is a (text_a, text_b) pair.
             record[text_a] = item[0]
             record[text_b] = item[1]
         else:
             record[text_a] = item
         records.append(record)
     loader = self.build_dataloader(records,
                                    sorting=False,
                                    **merge_dict(self.config,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 device=self.device,
                                                 overwrite=True)
                                    )
     results = [None] * len(data)
     label_vocab = self.vocabs.label
     for batch in loader:
         scores = self.feed_batch(batch)
         best_ids = scores.argmax(-1).tolist()
         for position, label_id in zip(batch[IDX], best_ids):
             results[position] = label_vocab.idx_to_token[label_id]
     return results[0] if single_input else results
コード例 #5
0
ファイル: ud_parser.py プロジェクト: cfy42584125/HanLP-1
 def predict(self,
             data: Union[List[str], List[List[str]]],
             batch_size: int = None,
             **kwargs):
     """Parse one sentence or a batch of sentences.

     Args:
         data: A single tokenized sentence or a list of tokenized sentences.
         batch_size: Decoding batch size; ``None`` falls back to ``config.batch_size``.
         **kwargs: Extra arguments forwarded to ``build_dataloader``.

     Returns:
         The prediction for a single input, or a list of predictions for batched input.
     """
     if not data:
         return []
     single_input = self.input_is_flat(data)
     if single_input:
         data = [data]
     batch_size = batch_size or self.config.batch_size
     loader = self.build_dataloader(self.build_samples(data),
                                    device=self.devices[0],
                                    shuffle=False,
                                    **merge_dict(self.config,
                                                 batch_size=batch_size,
                                                 overwrite=True,
                                                 **kwargs))
     results, positions = [], []
     for batch in loader:
         out, mask = self.feed_batch(batch)
         # decode_output fills ``out`` with decoded structures in place.
         self.decode_output(out, mask, batch)
         results.extend(self.prediction_to_human(out, batch))
         positions.extend(batch[IDX])
     # The dataloader may reorder samples (e.g. by length); restore input order.
     results = reorder(results, positions)
     return results[0] if single_input else results
コード例 #6
0
 def predict(self,
             data: Union[str, List[str]],
             batch_size: int = None,
             **kwargs):
     """Predict on one input or a batch of inputs.

     Args:
         data: A single input or a list of inputs.
         batch_size: Decoding batch size; ``None`` falls back to the config.
         **kwargs: Unused here.

     Returns:
         The prediction for a single input, or a list of predictions for batched input.
     """
     if not data:
         return []
     single_input = self.input_is_flat(data)
     if single_input:
         data = [data]
     loader = self.build_dataloader(self.build_samples(data),
                                    device=self.device,
                                    **merge_dict(self.config,
                                                 batch_size=batch_size,
                                                 overwrite=True))
     results, positions = [], []
     for batch in loader:
         out, mask = self.feed_batch(batch)
         decoded = self.decode_output(out, mask, batch, span_probs=None)
         results.extend(decoded)
         positions.extend(batch[IDX])
     # Restore the original input order in case the dataloader sorted samples.
     results = reorder(results, positions)
     return results[0] if single_input else results
コード例 #7
0
 def load(self, save_dir: str, logger=hanlp.utils.log_util.logger, **kwargs):
     """Load a trained component from ``save_dir``.

     Args:
         save_dir: A local path, remote URL or pre-defined identifier.
         logger: Logger passed to :meth:`build`.
         **kwargs: Config overrides forwarded to :meth:`build` and weight loading.
     """
     # Remember the original identifier before it is resolved to a local path.
     self.meta['load_path'] = save_dir
     save_dir = get_resource(save_dir)
     # Config and vocabs must be restored before the model is built from them.
     self.load_config(save_dir)
     self.load_vocabs(save_dir)
     # training=False builds the component in inference mode; kwargs override saved configs.
     self.build(**merge_dict(self.config, training=False, logger=logger, **kwargs, overwrite=True, inplace=True))
     self.load_weights(save_dir, **kwargs)
     self.load_meta(save_dir)
コード例 #8
0
ファイル: torch_component.py プロジェクト: yatwql/HanLP
    def __call__(self, *args, **kwargs):
        """Predict on data fed by user. This method calls :meth:`~hanlp.common.torch_component.predict` but decorates
        it with ``torch.no_grad``.

        Args:
            *args: Sentences or tokens.
            **kwargs: Runtime overrides used in sub-classes.
        """
        # Merge per-call overrides on top of the stored config before delegating.
        runtime_config = merge_dict(self.config, overwrite=True, **kwargs)
        return super().__call__(*args, **runtime_config)
コード例 #9
0
ファイル: torch_component.py プロジェクト: zhoumo99133/HanLP
    def __call__(self, data, batch_size=None, **kwargs):
        """Predict on data fed by user. This method calls :meth:`~hanlp.common.torch_component.predict` but decorates
        it with ``torch.no_grad``.

        Args:
            data: Sentences or tokens.
            batch_size: Decoding batch size; ``None`` falls back to ``config['batch_size']``.
            **kwargs: Used in sub-classes.
        """
        # Explicit batch_size wins; otherwise fall back to the stored config (may be None).
        effective_batch_size = batch_size or self.config.get('batch_size', None)
        runtime_config = merge_dict(self.config, overwrite=True,
                                    batch_size=effective_batch_size,
                                    **kwargs)
        return super().__call__(data, **runtime_config)
コード例 #10
0
    def execute_training_loop(self, trn: DataLoader, dev: DataLoader, epochs,
                              criterion, optimizer, metric, save_dir, logger,
                              patience, **kwargs):
        """Run the epoch loop: train, evaluate on dev, checkpoint the best model and early-stop.

        Args:
            trn: Training dataloader.
            dev: Development dataloader used for model selection.
            epochs: Maximum number of epochs.
            criterion: Loss function. NOTE(review): this argument is immediately shadowed
                by a fresh ``build_criterion()`` call below — the passed value is unused.
            optimizer: Optimizer driving the updates.
            metric: Metric object passed to ``fit_dataloader``.
            save_dir: Directory where the best weights are saved.
            logger: Logger for progress reporting.
            patience: Early-stopping patience in epochs; defaults to ``epochs`` when falsy.
            **kwargs: Unused here.
        """
        max_e, max_metric = 0, -1

        criterion = self.build_criterion()
        timer = CountdownTimer(epochs)
        # Width of the "current/total" progress ratio, for aligned logging.
        ratio_width = len(f'{len(trn)}/{len(trn)}')
        scheduler = self.build_scheduler(
            **merge_dict(self.config, optimizer=optimizer, overwrite=True))
        if not patience:
            patience = epochs
        for epoch in range(1, epochs + 1):
            logger.info(f"[yellow]Epoch {epoch} / {epochs}:[/yellow]")
            self.fit_dataloader(trn,
                                criterion,
                                optimizer,
                                metric,
                                logger,
                                ratio_width=ratio_width)
            loss, dev_metric = self.evaluate_dataloader(dev, criterion, logger)
            if scheduler:
                # ReduceLROnPlateau steps on the dev score; other schedulers step on the epoch.
                if isinstance(scheduler, ReduceLROnPlateau):
                    scheduler.step(dev_metric.score)
                else:
                    scheduler.step(epoch)
            report_patience = f'Patience: {epoch - max_e}/{patience}'
            # save the model if it is the best so far
            if dev_metric > max_metric:
                self.save_weights(save_dir)
                max_e, max_metric = epoch, dev_metric
                # On improvement the patience report is replaced by a save notice.
                report_patience = '[red]Saved[/red] '
            # Early stopping: no dev improvement for `patience` consecutive epochs.
            stop = epoch - max_e >= patience
            if stop:
                timer.stop()
            timer.log(
                f'{report_patience} lr: {optimizer.param_groups[0]["lr"]:.4f}',
                ratio_percentage=False,
                newline=True,
                ratio=False)
            if stop:
                break
        timer.stop()
        # If the last epoch was not the best one, restore the best checkpoint.
        if max_e != epoch:
            self.load_weights(save_dir)
        logger.info(
            f"Max score of dev is {max_metric.score:.2%} at epoch {max_e}")
        logger.info(
            f"{timer.elapsed_human} elapsed, average time of each epoch is {timer.elapsed_average_human}"
        )
コード例 #11
0
 def predict(self,
             data: Any,
             batch_size=None,
             batch_max_tokens=None,
             conll=True,
             **kwargs):
     """Parse dependency structures for one sentence or a batch of sentences.

     Args:
         data: A single sentence or a list of sentences, optionally with POS tags.
         batch_size: Decoding batch size; ``None`` falls back to ``config.batch_size``.
         batch_max_tokens: Token budget per batch; ``None`` falls back to the config.
         conll: ``True`` to render outputs in CoNLL format.
         **kwargs: Extra arguments forwarded to ``build_dataloader``.

     Returns:
         The parse of a single input, or a list of parses for batched input.
     """
     if not data:
         return []
     use_pos = self.use_pos
     single_input = self.input_is_flat(data, use_pos)
     if single_input:
         data = [data]
     samples = self.build_samples(data, use_pos)
     batch_max_tokens = batch_max_tokens or self.config.get('batch_max_tokens', None)
     batch_size = batch_size or self.config.batch_size
     loader = self.build_dataloader(
         samples,
         device=self.devices[0],
         shuffle=False,
         **merge_dict(self.config,
                      batch_size=batch_size,
                      batch_max_tokens=batch_max_tokens,
                      overwrite=True,
                      **kwargs))
     predictions, build_data, data, order = self.before_outputs(data)
     for batch in loader:
         arc_scores, rel_scores, mask, puncts = self.feed_batch(batch)
         # Accumulate per-batch scores into the prediction buffers in input order.
         self.collect_outputs(arc_scores, rel_scores, mask, batch,
                              predictions, order, data, use_pos,
                              build_data)
     outputs = self.post_outputs(predictions,
                                 data,
                                 order,
                                 use_pos,
                                 build_data,
                                 conll=conll)
     return outputs[0] if single_input else outputs
コード例 #12
0
ファイル: torch_component.py プロジェクト: zhoumo99133/HanLP
    def fit(self,
            trn_data,
            dev_data,
            save_dir,
            batch_size,
            epochs,
            devices=None,
            logger=None,
            seed=None,
            finetune: Union[bool, str] = False,
            eval_trn=True,
            _device_placeholder=False,
            **kwargs):
        """Fit to data, triggers the training procedure. For training set and dev set, they shall be local or remote
        files.

        Args:
            trn_data: Training set.
            dev_data: Development set.
            save_dir: The directory to save trained component.
            batch_size: The number of samples in a batch.
            epochs: Number of epochs.
            devices: Devices this component will live on.
            logger: Any :class:`logging.Logger` instance.
            seed: Random seed to reproduce this training.
            finetune: ``True`` to load from ``save_dir`` instead of creating a randomly initialized component. ``str``
                to specify a different ``save_dir`` to load from.
            eval_trn: Evaluate training set after each update. This can slow down the training but provides a quick
                diagnostic for debugging.
            _device_placeholder: ``True`` to create a placeholder tensor which triggers PyTorch to occupy devices so
                other components won't take these devices as first choices.
            **kwargs: Hyperparameters used by sub-classes.

        Returns:
            Any results sub-classes would like to return. Usually the best metrics on training set.

        """
        # Common initialization steps
        config = self._capture_config(locals())
        if not logger:
            logger = self.build_logger('train', save_dir)
        if not seed:
            # Fixed seed (233) while debugging for reproducibility; wall-clock seed otherwise.
            self.config.seed = 233 if isdebugging() else int(time.time())
        set_seed(self.config.seed)
        logger.info(self._savable_config.to_json(sort=True))
        if isinstance(devices, list) or devices is None or isinstance(devices, float):
            flash('[yellow]Querying CUDA devices [blink]...[/blink][/yellow]')
            # CPU (-1) while debugging; otherwise let cuda_devices resolve the spec.
            devices = -1 if isdebugging() else cuda_devices(devices)
            flash('')
        # flash(f'Available GPUs: {devices}')
        # Determine the primary device used for dataloaders and placeholder allocation.
        if isinstance(devices, list):
            first_device = (devices[0] if devices else -1)
        elif isinstance(devices, dict):
            first_device = next(iter(devices.values()))
        elif isinstance(devices, int):
            first_device = devices
        else:
            first_device = -1
        if _device_placeholder and first_device >= 0:
            # Occupy the GPU early so other processes don't pick it as a first choice.
            _dummy_placeholder = self._create_dummy_placeholder_on(first_device)
        if finetune:
            # Load an existing component (from `finetune` path or `save_dir`) instead of
            # building a randomly initialized one below.
            if isinstance(finetune, str):
                self.load(finetune, devices=devices)
            else:
                self.load(save_dir, devices=devices)
            logger.info(
                f'Finetune model loaded with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
        self.on_config_ready(**self.config)
        trn = self.build_dataloader(**merge_dict(config, data=trn_data, batch_size=batch_size, shuffle=True,
                                                 training=True, device=first_device, logger=logger, vocabs=self.vocabs,
                                                 overwrite=True))
        dev = self.build_dataloader(**merge_dict(config, data=dev_data, batch_size=batch_size, shuffle=False,
                                                 training=None, device=first_device, logger=logger, vocabs=self.vocabs,
                                                 overwrite=True)) if dev_data else None
        if not finetune:
            flash('[yellow]Building model [blink]...[/blink][/yellow]')
            self.model = self.build_model(**merge_dict(config, training=True))
            flash('')
            logger.info(f'Model built with {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}'
                        f'/{sum(p.numel() for p in self.model.parameters())} trainable/total parameters.')
            assert self.model, 'build_model is not properly implemented.'
        _description = repr(self.model)
        # Only print compact model descriptions; large ones would flood the log.
        if len(_description.split('\n')) < 10:
            logger.info(_description)
        self.save_config(save_dir)
        self.save_vocabs(save_dir)
        self.to(devices, logger)
        if _device_placeholder and first_device >= 0:
            # The real model now occupies the device; the placeholder is no longer needed.
            del _dummy_placeholder
        criterion = self.build_criterion(**merge_dict(config, trn=trn))
        optimizer = self.build_optimizer(**merge_dict(config, trn=trn, criterion=criterion))
        metric = self.build_metric(**self.config)
        if hasattr(trn.dataset, '__len__') and dev and hasattr(dev.dataset, '__len__'):
            logger.info(f'{len(trn.dataset)}/{len(dev.dataset)} samples in trn/dev set.')
            # Number of optimizer steps per epoch, accounting for gradient accumulation.
            trn_size = len(trn) // self.config.get('gradient_accumulation', 1)
            ratio_width = len(f'{trn_size}/{trn_size}')
        else:
            ratio_width = None
        return self.execute_training_loop(**merge_dict(config, trn=trn, dev=dev, epochs=epochs, criterion=criterion,
                                                       optimizer=optimizer, metric=metric, logger=logger,
                                                       save_dir=save_dir,
                                                       devices=devices,
                                                       ratio_width=ratio_width,
                                                       trn_data=trn_data,
                                                       dev_data=dev_data,
                                                       eval_trn=eval_trn,
                                                       overwrite=True))
コード例 #13
0
 def fit(self, trn_data, dev_data, save_dir, batch_size, epochs, run_eagerly=False, logger=None, verbose=True,
         finetune: str = None, **kwargs):
     """Train this Keras-based component on a training set with dev-set model selection.

     Args:
         trn_data: Training set.
         dev_data: Development set used for checkpoint selection.
         save_dir: Directory for config, vocabs, weights and history; a temp dir when falsy.
         batch_size: Number of samples per batch.
         epochs: Number of training epochs.
         run_eagerly: Captured into the config for sub-classes; not read directly here.
         logger: Logger to use; built automatically when ``None``.
         verbose: ``True`` for INFO-level logging, otherwise WARN.
         finetune: Path/identifier of pretrained weights to start from.
         **kwargs: Hyperparameters used by sub-classes.

     Returns:
         The Keras ``History`` of the training run (possibly recovered after Ctrl-C).
     """
     self._capture_config(locals())
     self.transform = self.build_transform(**self.config)
     if not save_dir:
         save_dir = tempdir_human()
     if not logger:
         logger = init_logger(name='train', root_dir=save_dir, level=logging.INFO if verbose else logging.WARN)
     logger.info('Hyperparameter:\n' + self.config.to_json())
     num_examples = self.build_vocab(trn_data, logger)
     # assert num_examples, 'You forgot to return the number of training examples in your build_vocab'
     logger.info('Building...')
     # Steps per epoch are only computable when build_vocab reported the dataset size.
     train_steps_per_epoch = math.ceil(num_examples / batch_size) if num_examples else None
     self.config.train_steps = train_steps_per_epoch * epochs if num_examples else None
     model, optimizer, loss, metrics = self.build(**merge_dict(self.config, logger=logger, training=True))
     logger.info('Model built:\n' + summary_of_model(self.model))
     if finetune:
         finetune = get_resource(finetune)
         if os.path.isdir(finetune):
             finetune = os.path.join(finetune, 'model.h5')
         # by_name/skip_mismatch allow partial transfer between differing architectures.
         model.load_weights(finetune, by_name=True, skip_mismatch=True)
         logger.info(f'Loaded pretrained weights from {finetune} for finetuning')
     self.save_config(save_dir)
     self.save_vocabs(save_dir)
     self.save_meta(save_dir)
     trn_data = self.build_train_dataset(trn_data, batch_size, num_examples)
     dev_data = self.build_valid_dataset(dev_data, batch_size)
     callbacks = self.build_callbacks(save_dir, **merge_dict(self.config, overwrite=True, logger=logger))
     # need to know #batches, otherwise progbar crashes
     dev_steps = math.ceil(self.num_samples_in(dev_data) / batch_size)
     checkpoint = get_callback_by_class(callbacks, tf.keras.callbacks.ModelCheckpoint)
     timer = Timer()
     try:
         history = self.train_loop(**merge_dict(self.config, trn_data=trn_data, dev_data=dev_data, epochs=epochs,
                                                num_examples=num_examples,
                                                train_steps_per_epoch=train_steps_per_epoch, dev_steps=dev_steps,
                                                callbacks=callbacks, logger=logger, model=model, optimizer=optimizer,
                                                loss=loss,
                                                metrics=metrics, overwrite=True))
     except KeyboardInterrupt:
         # Graceful abort: save current weights unless a checkpoint already holds a best model.
         print()
         if not checkpoint or checkpoint.best in (np.Inf, -np.Inf):
             self.save_weights(save_dir)
             logger.info('Aborted with model saved')
         else:
             logger.info(f'Aborted with model saved with best {checkpoint.monitor} = {checkpoint.best:.4f}')
         # noinspection PyTypeChecker
         # Recover the partial History object from the callbacks list.
         history: tf.keras.callbacks.History() = get_callback_by_class(callbacks, tf.keras.callbacks.History)
     delta_time = timer.stop()
     best_epoch_ago = 0
     if history and hasattr(history, 'epoch'):
         trained_epoch = len(history.epoch)
         logger.info('Trained {} epochs in {}, each epoch takes {}'.
                     format(trained_epoch, delta_time, delta_time / trained_epoch if trained_epoch else delta_time))
         save_json(history.history, io_util.path_join(save_dir, 'history.json'), cls=io_util.NumpyEncoder)
         monitor_history: List = history.history.get(checkpoint.monitor, None)
         if monitor_history:
             best_epoch_ago = len(monitor_history) - monitor_history.index(checkpoint.best)
         # If the final epoch is not the best one, roll back to the best checkpoint.
         if checkpoint and monitor_history and checkpoint.best != monitor_history[-1]:
             logger.info(f'Restored the best model saved with best '
                         f'{checkpoint.monitor} = {checkpoint.best:.4f} '
                         f'saved {best_epoch_ago} epochs ago')
             self.load_weights(save_dir)  # restore best model
     return history