Example #1
    def _load_checkpoint(self):
        '''Load checkpoint and state dict'''
        max_epoch = -1

        for file in os.listdir(self.checkpoint_dir):
            if not file.startswith('epoch_'):
                continue

            _epoch = file.split('_')[-1]
            if not _epoch.isdigit():
                continue

            max_epoch = max(max_epoch, int(_epoch))

        if max_epoch == -1:
            if self.local_rank == 0:
                logger.warning('PaddleHub model checkpoint not found, start from scratch...')
            return

        # load best metrics
        self._load_metrics()

        self.current_epoch = max_epoch
        metric_msg = ['{}={:.4f}'.format(metric, value) for metric, value in self.best_metrics.items()]
        metric_msg = ' '.join(metric_msg)
        if self.local_rank == 0:
            logger.info('PaddleHub model checkpoint loaded. current_epoch={} [{}]'.format(
                self.current_epoch, metric_msg))

        model_path = os.path.join(self.checkpoint_dir, 'epoch_{}'.format(self.current_epoch))
        self.load_model(model_path)
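
For orientation, the scan above assumes checkpoint_dir contains entries named epoch_<N>. A minimal standalone sketch of that lookup (the function name and layout are illustrative, not part of PaddleHub):

    import os

    def latest_epoch(checkpoint_dir: str) -> int:
        '''Return the highest N among 'epoch_N' entries, or -1 if none exist.'''
        max_epoch = -1
        for name in os.listdir(checkpoint_dir):
            if not name.startswith('epoch_'):
                continue
            suffix = name.split('_')[-1]
            if suffix.isdigit():
                max_epoch = max(max_epoch, int(suffix))
        return max_epoch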
Example #2
    def __init__(
        self,
        task=None,
        load_checkpoint=None,
        label_map=None,
    ):
        super(Bert, self).__init__()
        # TODO(zhangxuefei): add token_classification task
        if task == 'sequence_classification':
            self.model = BertForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path='bert-large-cased')
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy(name='acc_accumulation')
        elif task is None:
            self.model = BertModel.from_pretrained(
                pretrained_model_name_or_path='bert-large-cased')
        else:
            raise RuntimeError(
                "Unknown task %s, task should be sequence_classification" %
                task)

        self.task = task
        self.label_map = label_map

        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
            state_dict = paddle.load(load_checkpoint)
            self.set_state_dict(state_dict)
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))
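
A hedged usage sketch of the constructor above; the checkpoint path and label_map values are invented for illustration, and the class is assumed to be importable from its PaddleHub module:

    # Fine-tuning head for sequence classification; parameters are restored
    # only if the checkpoint file exists (see the os.path.isfile guard above).
    model = Bert(task='sequence_classification',
                 load_checkpoint='./ckpt/model.pdparams',  # hypothetical path
                 label_map={0: 'negative', 1: 'positive'})

    # Bare encoder without a task head:
    encoder = Bert(task=None)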
Example #3
    def __init__(
        self,
        task: str,
        num_class: int = None,
        label_map: Dict = None,
        load_checkpoint: str = None,
        **kwargs,
    ):
        super(PANN, self).__init__()

        if label_map:
            self.label_map = label_map
            self.num_class = len(label_map)
        else:
            self.num_class = num_class

        if task == 'sound-cls':
            self.cnn6 = CNN6(extract_embedding=True,
                             checkpoint=os.path.join(MODULE_HOME, 'panns_cnn6',
                                                     'cnn6.pdparams'))
            self.dropout = nn.Dropout(0.1)
            self.fc = nn.Linear(self.cnn6.emb_size, self.num_class)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        else:
            self.cnn6 = CNN6(extract_embedding=False,
                             checkpoint=os.path.join(MODULE_HOME, 'panns_cnn6',
                                                     'cnn6.pdparams'))

        self.task = task
        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
            state_dict = paddle.load(load_checkpoint)
            self.set_state_dict(state_dict)
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))
Example #4
 def _convert_examples_to_records(
         self, examples: List[InputExample]) -> List[dict]:
     """
     Converts all examples to records which the model needs.
     Args:
         examples (:obj:`List[InputExample]`): All data examples returned by _read_file.
     Returns:
         records(:obj:`List[dict]`): All records which the model needs.
     """
     records = []
     for example in examples:
         record = self.tokenizer.encode(text=example.text_a,
                                        text_pair=example.text_b,
                                        max_seq_len=self.max_seq_len)
         # CustomTokenizer will first tokenize the text and then look up words in the vocab.
         # If none of the words are found in the vocab, the text is dropped.
         if not record:
             logger.info(
                 "The text %s has been dropped as it has no words in the vocab after tokenization."
                 % example.text_a)
             continue
         if example.label:
             record['label'] = self.label_map[example.label]
         records.append(record)
     return records
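
For reference, each record returned above is simply the tokenizer's encoding dict, optionally extended with a 'label' key. A sketch of the typical shape (the field names follow common PaddleNLP tokenizer output and are an assumption here):

    # Illustrative record, not produced by running the code:
    record = {
        'input_ids': [1, 233, 412, 2],   # token ids for text_a (and text_b, if any)
        'token_type_ids': [0, 0, 0, 0],  # segment ids distinguishing text_a/text_b
        'label': 1,                      # added only when example.label is set
    }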
Example #5
    def __init__(
        self,
        task: str = None,
        load_checkpoint: str = None,
        label_map: Dict = None,
        num_classes: int = 2,
        **kwargs,
    ):
        super(Electra, self).__init__()
        if label_map:
            self.label_map = label_map
            self.num_classes = len(label_map)
        else:
            self.num_classes = num_classes

        if task == 'sequence_classification':
            task = 'seq-cls'
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed in the future.",
            )
        if task == 'seq-cls':
            self.model = ElectraForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path='electra-small',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task == 'token-cls':
            self.model = ElectraForTokenClassification.from_pretrained(
                pretrained_model_name_or_path='electra-small',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = ChunkEvaluator(label_list=[
                self.label_map[i] for i in sorted(self.label_map.keys())
            ])
        elif task == 'text-matching':
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path='electra-small', **kwargs)
            self.dropout = paddle.nn.Dropout(0.1)
            self.classifier = paddle.nn.Linear(
                self.model.config['hidden_size'] * 3, 2)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task is None:
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path='electra-small', **kwargs)
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task

        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
            state_dict = paddle.load(load_checkpoint)
            self.set_state_dict(state_dict)
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))
Example #6
    def _convert_examples_to_records(
            self, examples: List[InputExample]) -> List[dict]:
        """
        Converts all examples to records which the model needs.
        Args:
            examples (:obj:`List[InputExample]`): All data examples returned by _read_file.
        Returns:
            records(:obj:`List[dict]`): All records which the model needs.
        """
        records = []
        for example in examples:
            if isinstance(self.tokenizer, PretrainedTokenizer):
                record_a = self.tokenizer(text=example.text_a,
                                          max_seq_len=self.max_seq_len,
                                          pad_to_max_seq_len=True,
                                          return_length=True)
                record_b = self.tokenizer(text=example.text_b,
                                          max_seq_len=self.max_seq_len,
                                          pad_to_max_seq_len=True,
                                          return_length=True)
                record = {'text_a': record_a, 'text_b': record_b}
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer"
                    .format(type(self.tokenizer)))

            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue
            if example.label:
                record['label'] = self.label_map[example.label]
            records.append(record)
        return records
Example #7
    def _convert_examples_to_records(self, examples, phase):
        '''
        Returns a list[dict] containing all the input information the model needs.
        Args:
            examples (list): the data examples, returned by _read_file.
            phase (str): the processing phase, one of 'train', 'dev', 'test' or 'predict'.
        Returns:
            a list with all the example records.
        '''

        records = []
        with tqdm(total=len(examples)) as process_bar:
            for example in examples:
                record = self.tokenizer.encode(text=example.text_a,
                                               text_pair=example.text_b,
                                               max_seq_len=self.max_seq_len)
                # CustomTokenizer will first tokenize the text and then look up words in the vocab.
                # If none of the words are found in the vocab, the text is dropped.
                if not record:
                    logger.info(
                        'The text %s has been dropped as it has no words in the vocab after tokenization.'
                        % example.text_a)
                    continue
                if example.label:
                    record['label'] = self.label_list.index(
                        example.label) if self.label_list else float(
                            example.label)
                records.append(record)
                process_bar.update(1)
        return records
Example #8
 def delete_hook(self, hook_type: str, name: str):
     '''
     Delete the handler function of a specific event.
     Args:
         hook_type (str): the specific event name
         name (str): the handler function name
     '''
     self._hooks.delete(hook_type, name)
     logger.info('Delete hook {}:{} successfully'.format(hook_type, name))
Example #9
 def test_records(self):
     if not self._test_records:
         examples = self.test_examples
         if not self.tokenizer or not examples:
             return []
         logger.info('Processing the test set...')
         self._test_records = self._convert_examples_to_records(
             examples, phase='test')
     return self._test_records
Example #10
 def predict_records(self):
     if not self._predict_records:
         examples = self.predict_examples
         if not self.tokenizer or not examples:
             return []
         logger.info('Processing the predict set...')
         self._predict_records = self._convert_examples_to_records(
             examples, phase='predict')
     return self._predict_records
Example #11
 def dev_records(self):
     if not self._dev_records:
         examples = self.dev_examples
         if not self.tokenizer or not examples:
             return []
         logger.info('Processing the dev set...')
         self._dev_records = self._convert_examples_to_records(examples,
                                                               phase='dev')
     return self._dev_records
Example #12
 def set_speaker_embedding(self, speaker_audio: str):
     assert os.path.exists(
         speaker_audio
     ), f'Speaker audio file: {speaker_audio} does not exist.'
     mel_sequences = self.speaker_processor.extract_mel_partials(
         self.speaker_processor.preprocess_wav(speaker_audio))
     self._speaker_embedding = self.speaker_encoder.embed_utterance(
         paddle.to_tensor(mel_sequences))
     logger.info(
         f'Speaker embedding has been set from file: {speaker_audio}')
Example #13
 def modify_hook(self, hook_type: str, name: str, func: Callable):
     '''
     Modify the handler function of a specific event.
     Args:
         hook_type (str): the specific event name
         name (str): the handler function name
         func (func): the new handler function
     '''
     self._hooks.modify(hook_type, name, func)
     logger.info('Modify hook {}:{} successfully'.format(hook_type, name))
Example #14
 def read_images(self, paths=[]):
     images = []
     for img_path in paths:
         assert os.path.isfile(
             img_path), "{} is not a valid file.".format(img_path)
         img = cv2.imread(img_path)
         if img is None:
             logger.info("error in loading image:{}".format(img_path))
             continue
         img = img[:, :, ::-1]
         images.append(img)
     return images
Example #15
 def init_if_load_best_model(self):
     if not self.is_best_model_loaded:
         best_model_path = os.path.join(self.config.checkpoint_dir, "best_model")
         logger.info("Load the best model from %s" % best_model_path)
         if os.path.exists(best_model_path):
             self.load_parameters(best_model_path)
             self.is_checkpoint_loaded = False
             self.is_best_model_loaded = True
         else:
             self.init_if_necessary()
     else:
         logger.info("The best model has been loaded")
Example #16
 def add_hook(self, hook_type: str, name: str = None, func: Callable = None):
     '''
     Add the handler function for a specific event.
     Args:
         hook_type (str): the specific event name
         name (str): the handler function name, default None
         func (func): the handler function, default None
     '''
     if name is None:
         name = 'hook_{}'.format(id(func))
     self._hooks.add(hook_type, name=name, func=func)
     logger.info('Add hook {}:{} successfully'.format(hook_type, name))
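
Together with delete_hook (Example #8) and modify_hook (Example #13), this completes the hook lifecycle. A minimal usage sketch, where the trainer instance and the 'on_epoch_end' event name are assumptions:

    def log_epoch(*args, **kwargs):
        print('epoch finished')

    trainer.add_hook('on_epoch_end', name='log_epoch', func=log_epoch)   # register
    trainer.modify_hook('on_epoch_end', name='log_epoch', func=print)    # swap handler
    trainer.delete_hook('on_epoch_end', name='log_epoch')                # remove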
Example #17
 def _download_and_uncompress_dataset(self, destination: str, url: str):
     """
     Downloads dataset and uncompresses it.
     Args:
        destination (:obj:`str`): The directory where the dataset is cached.
        url (:obj:`str`): The URL from which to download the dataset.
     """
     if not os.path.exists(destination):
         dataset_package = download(url=url, path=DATA_HOME)
         if is_xarfile(dataset_package):
             unarchive(dataset_package, DATA_HOME)
     else:
         logger.info("Dataset {} already cached.".format(destination))
Example #18
    def __init__(
        self,
        task=None,
        load_checkpoint=None,
        label_map=None,
        num_classes=2,
        **kwargs,
    ):
        super(ErnieTiny, self).__init__()
        if label_map:
            self.num_classes = len(label_map)
        else:
            self.num_classes = num_classes

        if task == 'sequence_classification':
            task = 'seq-cls'
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed the future.",
            )
        if task == 'seq-cls':
            self.model = ErnieForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path='ernie-tiny',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task == 'token-cls':
            self.model = ErnieForTokenClassification.from_pretrained(
                pretrained_model_name_or_path='ernie-tiny',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task is None:
            self.model = ErnieModel.from_pretrained(
                pretrained_model_name_or_path='ernie-tiny', **kwargs)
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task
        self.label_map = label_map

        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
            state_dict = paddle.load(load_checkpoint)
            self.set_state_dict(state_dict)
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))
Example #19
    def _convert_examples_to_records(
            self, examples: List[InputExample]) -> List[dict]:
        """
        Converts all examples to records which the model needs.
        Args:
            examples (:obj:`List[InputExample]`): All data examples returned by _read_file.
        Returns:
            records(:obj:`List[dict]`): All records which the model needs.
        """
        records = []
        for example in examples:
            if isinstance(self.tokenizer, PretrainedTokenizer):
                if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
                    record = self.tokenizer.encode(
                        text=example.text_a,
                        text_pair=example.text_b,
                        max_seq_len=self.max_seq_len)
                else:
                    record = self.tokenizer(text=example.text_a,
                                            text_pair=example.text_b,
                                            max_seq_len=self.max_seq_len,
                                            pad_to_max_seq_len=True,
                                            return_length=True)
            elif isinstance(self.tokenizer, JiebaTokenizer):
                pad_token = self.tokenizer.vocab.pad_token

                ids = self.tokenizer.encode(sentence=example.text_a)
                seq_len = min(len(ids), self.max_seq_len)
                if len(ids) > self.max_seq_len:
                    ids = trunc_sequence(ids, self.max_seq_len)
                else:
                    pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                    ids = pad_sequence(ids, self.max_seq_len, pad_token_id)
                record = {'text': ids, 'seq_len': seq_len}
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of  PretrainedTokenizer or JiebaTokenizer"
                    .format(type(self.tokenizer)))

            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue
            if example.label:
                record['label'] = self.label_map[example.label]
            records.append(record)
        return records
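
The trunc_sequence and pad_sequence helpers are not shown in this example. A minimal sketch of the behavior the JiebaTokenizer branch relies on, with signatures inferred from the call sites above:

    def trunc_sequence(ids, max_seq_len):
        # Keep only the first max_seq_len ids.
        return ids[:max_seq_len]

    def pad_sequence(ids, max_seq_len, pad_token_id):
        # Right-pad ids with pad_token_id up to max_seq_len.
        return ids + [pad_token_id] * (max_seq_len - len(ids))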
Example #20
    def __init__(self, dataset: Generic, random_seed: int = None):
        self.dataset = dataset
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        np.random.seed(random_seed)

        # generate label map
        self.label_map = {}
        try:
            for index, label in enumerate(self.dataset.get_labels()):
                self.label_map[label] = index
            logger.info('Dataset label map = {}'.format(self.label_map))
        except Exception:
            # Some datasets, e.g. SQuAD, have label_list=None.
            logger.info(
                'Dataset is None or it does not have any labels, label map = {}'.
                format(self.label_map))
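
Concretely, the enumeration above maps each label to its index in get_labels() order:

    # e.g. get_labels() -> ['negative', 'positive'] yields
    # label_map == {'negative': 0, 'positive': 1}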
Example #21
 def _convert_examples_to_records(
         self, examples: List[InputExample]) -> List[dict]:
     """
     Returns a list[dict] containing all the input information the model needs.
     Args:
         examples (list): the data examples, returned by _read_file.
     Returns:
         a list with all the example records.
     """
     records = []
     for example in examples:
         tokens, labels = reseg_token_label(
             tokenizer=self.tokenizer,
             tokens=example.text_a.split(self.split_char),
             labels=example.label.split(self.split_char))
         record = self.tokenizer.encode(text=tokens,
                                        max_seq_len=self.max_seq_len)
         # CustomTokenizer will first tokenize the text and then look up words in the vocab.
         # If none of the words are found in the vocab, the text is dropped.
         if not record:
             logger.info(
                 "The text %s has been dropped as it has no words in the vocab after tokenization."
                 % example.text_a)
             continue
         if labels:
             record["label"] = []
             tokens_with_special_token = self.tokenizer.convert_ids_to_tokens(
                 record['input_ids'])
             tokens_index = 0
             for token in tokens_with_special_token:
                 if tokens_index < len(
                         tokens) and token == tokens[tokens_index]:
                     record["label"].append(
                         self.label_list.index(labels[tokens_index]))
                     tokens_index += 1
                 elif token in [self.tokenizer.pad_token]:
                     record["label"].append(
                         self.ignore_label)  # label of special token
                 else:
                     record["label"].append(
                         self.label_list.index(self.no_entity_label))
         records.append(record)
     return records
Example #22
def load_checkpoint(
        checkpoint_dir: str, exe: paddle.static.Executor,
        main_program: paddle.static.Program) -> Tuple[bool, int, int, float]:

    ckpt_meta_path = os.path.join(checkpoint_dir, CKPT_FILE_NAME)
    ckpt = checkpoint_pb2.CheckPoint()
    logger.info('Try loading checkpoint from {}'.format(ckpt_meta_path))
    if os.path.exists(ckpt_meta_path):
        with open(ckpt_meta_path, 'rb') as f:
            ckpt.ParseFromString(f.read())
    current_epoch = 1
    global_step = 0
    best_score = -999

    if ckpt.latest_model_dir:
        paddle.static.load(executor=exe,
                           model_path=ckpt.latest_model_dir,
                           program=main_program)

        # Compatible with older versions without best_score in checkpoint_pb2
        try:
            best_score = ckpt.best_score
        except Exception:
            best_score = -999

        logger.info('PaddleHub model checkpoint loaded. current_epoch={}, '
                    'global_step={}, best_score={:.5f}'.format(
                        ckpt.current_epoch, ckpt.global_step, best_score))

        return True, ckpt.current_epoch, ckpt.global_step, best_score

    logger.info('PaddleHub model checkpoint not found, start from scratch...')

    return False, current_epoch, global_step, best_score
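
A hedged usage sketch of the function above; the executor setup is standard paddle.static boilerplate, and the checkpoint directory is a placeholder:

    import paddle

    paddle.enable_static()
    exe = paddle.static.Executor(paddle.CPUPlace())
    main_program = paddle.static.default_main_program()

    loaded, epoch, step, best = load_checkpoint('./ckpt', exe, main_program)
    if not loaded:
        print('no checkpoint found; training from scratch')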
Example #23
    def predict(self, images=[], paths=[], top_k=1):
        """
        
        Args:
            images (list(numpy.ndarray)): images data, shape of each is [H, W, C]. If images not paths
            paths (list[str]): The paths of images. If paths not images
        Returns:
            res (list): The result of chinese texts and save path of images.
        """

        if images != [] and isinstance(images, list) and paths == []:
            predicted_data = images
        elif images == [] and isinstance(paths, list) and paths != []:
            predicted_data = self.read_images(paths)
        else:
            raise TypeError(
                "The input data is inconsistent with expectations.")

        assert predicted_data != [], "There is not any image to be predicted. Please check the input data."

        all_results = []
        for img in predicted_data:
            if img is None:
                logger.info("error in loading image")
                all_results.append([])
                continue

            self.args.image_file = img
            self.args.top_k = top_k

            starttime = time.time()
            classes, scores = paddle_predict.predict(self.args, self.predictor)
            elapse = time.time() - starttime

            logger.info("Predict time: {}".format(elapse))
            all_results.append([classes.tolist(), scores.tolist(), elapse])
        return all_results
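
A usage sketch of predict above; module stands for an instance of the class this method belongs to, and the image path is a placeholder:

    results = module.predict(paths=['./images/cat.jpg'], top_k=3)
    for res in results:
        if res:  # an empty list means the image failed to load
            classes, scores, elapse = res
            print(classes, scores, elapse)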
Example #24
    def __init__(self, extract_embedding: bool = True, checkpoint: str = None):

        super(CNN14, self).__init__()
        self.bn0 = nn.BatchNorm2D(64)
        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, self.emb_size)
        self.fc_audioset = nn.Linear(self.emb_size, 527)

        if checkpoint is not None and os.path.isfile(checkpoint):
            state_dict = paddle.load(checkpoint)
            self.set_state_dict(state_dict)
            logger.info(
                f'Loaded CNN14 pretrained parameters from: {checkpoint}')
        else:
            logger.error(
                'No valid checkpoints for CNN14. Start training from scratch.')

        self.extract_embedding = extract_embedding
Example #25
 def _default_predict_end_event(self, run_states: List[RunState]):
     logger.info('PaddleHub predict finished.')
Example #26
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """
        Instantiate an instance of `PretrainedModel` from a predefined
        model specified by name or path.
        Args:
            pretrained_model_name_or_path (str): A name of or a file path to a
                pretrained model.
            *args (tuple): positional arguments for `__init__`. If provided,
                use these as positional argument values for model initialization.
            **kwargs (dict): keyword arguments for `__init__`. If provided,
                use these to update pre-defined keyword argument values for
                model initialization.
        Returns:
            PretrainedModel: An instance of PretrainedModel.
        """
        pretrained_models = list(cls.pretrained_init_configuration.keys())
        resource_files = {}
        init_configuration = {}
        if pretrained_model_name_or_path in pretrained_models:
            for file_id, map_list in cls.pretrained_resource_files_map.items():
                resource_files[file_id] = map_list[
                    pretrained_model_name_or_path]
            init_configuration = copy.deepcopy(
                cls.
                pretrained_init_configuration[pretrained_model_name_or_path])
        else:
            if os.path.isdir(pretrained_model_name_or_path):
                for file_id, file_name in cls.resource_files_names.items():
                    full_file_name = os.path.join(
                        pretrained_model_name_or_path, file_name)
                    resource_files[file_id] = full_file_name
                resource_files["model_config_file"] = os.path.join(
                    pretrained_model_name_or_path, cls.model_config_file)
            else:
                raise ValueError(
                    "Calling {}.from_pretrained() with a model identifier or the "
                    "path to a directory instead. The supported model "
                    "identifiers are as follows: {}".format(
                        cls.__name__,
                        cls.pretrained_init_configuration.keys()))
        # FIXME(chenzeyu01): We should use another data path for storing model
        default_root = os.path.join(DATA_HOME, pretrained_model_name_or_path)
        resolved_resource_files = {}
        for file_id, file_path in resource_files.items():
            path = os.path.join(default_root, file_path.split('/')[-1])
            if file_path is None or os.path.isfile(file_path):
                resolved_resource_files[file_id] = file_path
            elif os.path.exists(path):
                logger.info("Already cached %s" % path)
                resolved_resource_files[file_id] = path
            else:
                logger.info("Downloading %s and saved to %s" %
                            (file_path, default_root))
                resolved_resource_files[file_id] = get_path_from_url(
                    file_path, default_root)

        # Prepare model initialization kwargs
        # Did we save some inputs and kwargs to reload?
        model_config_file = resolved_resource_files.pop(
            "model_config_file", None)
        if model_config_file is not None:
            with io.open(model_config_file, encoding="utf-8") as f:
                init_kwargs = json.load(f)
        else:
            init_kwargs = init_configuration
        # position args are stored in kwargs, maybe better not include
        init_args = init_kwargs.pop("init_args", ())
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class",
                                     cls.base_model_class.__name__)

        # Check if the loaded config matches the current model class's __init__
        # arguments. If not match, the loaded config is for the base model class.
        if init_class == cls.base_model_class.__name__:
            base_args = init_args
            base_kwargs = init_kwargs
            derived_args = ()
            derived_kwargs = {}
            base_arg_index = None
        else:  # extract config for base model
            derived_args = list(init_args)
            derived_kwargs = init_kwargs
            for i, arg in enumerate(init_args):
                if isinstance(arg, dict) and "init_class" in arg:
                    assert arg.pop(
                        "init_class") == cls.base_model_class.__name__, (
                            "pretrained base model should be {}").format(
                                cls.base_model_class.__name__)
                    base_arg_index = i
                    break
            for arg_name, arg in init_kwargs.items():
                if isinstance(arg, dict) and "init_class" in arg:
                    assert arg.pop(
                        "init_class") == cls.base_model_class.__name__, (
                            "pretrained base model should be {}").format(
                                cls.base_model_class.__name__)
                    base_arg_index = arg_name
                    break
            base_args = arg.pop("init_args", ())
            base_kwargs = arg
        if cls == cls.base_model_class:
            # Update with newly provided args and kwargs for base model
            base_args = base_args if not args else args
            base_kwargs.update(kwargs)
            model = cls(*base_args, **base_kwargs)
        else:
            # Update with newly provided args and kwargs for derived model
            base_model = cls.base_model_class(*base_args, **base_kwargs)
            if base_arg_index is not None:
                derived_args[base_arg_index] = base_model
            else:
                derived_args = (base_model, )  # assume at the first position
            derived_args = derived_args if not args else args
            derived_kwargs.update(kwargs)
            model = cls(*derived_args, **derived_kwargs)

        # Maybe need more ways to load resources.
        weight_path = list(resolved_resource_files.values())[0]
        assert weight_path.endswith(
            ".pdparams"), "suffix of weight must be .pdparams"
        state_dict = paddle.load(weight_path)

        # Make sure we are able to load base models as well as derived models
        # (with heads)
        start_prefix = ""
        model_to_load = model
        state_to_load = state_dict
        unexpected_keys = []
        missing_keys = []
        if not hasattr(model, cls.base_model_prefix) and any(
                s.startswith(cls.base_model_prefix)
                for s in state_dict.keys()):
            # base model
            state_to_load = {}
            start_prefix = cls.base_model_prefix + "."
            for k, v in state_dict.items():
                if k.startswith(cls.base_model_prefix):
                    state_to_load[k[len(start_prefix):]] = v
                else:
                    unexpected_keys.append(k)
        if hasattr(model, cls.base_model_prefix) and not any(
                s.startswith(cls.base_model_prefix)
                for s in state_dict.keys()):
            # derived model (base model with heads)
            model_to_load = getattr(model, cls.base_model_prefix)
            for k in model.state_dict().keys():
                if not k.startswith(cls.base_model_prefix):
                    missing_keys.append(k)
        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".
                format(model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        model_to_load.set_state_dict(state_to_load)
        if paddle.in_dynamic_mode():
            return model
        return model, state_to_load
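
A hedged sketch of the two ways this classmethod can be called, per its docstring; the class and directory names are illustrative:

    # 1) By predefined model name (weights are downloaded and cached):
    model = BertForSequenceClassification.from_pretrained('bert-large-cased')

    # 2) From a local directory containing the config file and .pdparams weights:
    model = BertForSequenceClassification.from_pretrained('./my_bert_dir')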
Example #27
 def _save_checkpoint(self):
     '''Save model checkpoint and state dict'''
     model_path = os.path.join(self.checkpoint_dir,
                               'epoch_{}'.format(self.current_epoch))
     logger.info('Saving model checkpoint to {}'.format(model_path))
     self.save_model(model_path)
Example #28
 def _default_eval_start_event(self):
     logger.info('Evaluation on {} dataset start'.format(self.phase))
Example #29
 def _default_predict_start_event(self):
     logger.info('PaddleHub predict start')
Example #30
    def _convert_examples_to_records(
            self, examples: List[InputExample]) -> List[dict]:
        """
        Returns a list[dict] containing all the input information the model needs.
        Args:
            examples (list): the data examples, returned by _read_file.
        Returns:
            a list with all the example records.
        """
        records = []
        for example in examples:
            tokens = example.text_a.split(self.split_char)
            labels = example.label.split(self.split_char)

            # convert tokens into record
            if isinstance(self.tokenizer, PretrainedTokenizer):
                pad_token = self.tokenizer.pad_token

                tokens, labels = reseg_token_label(tokenizer=self.tokenizer,
                                                   tokens=tokens,
                                                   labels=labels)
                record = self.tokenizer.encode(text=tokens,
                                               max_seq_len=self.max_seq_len)
            elif isinstance(self.tokenizer, JiebaTokenizer):
                pad_token = self.tokenizer.vocab.pad_token

                ids = [
                    self.tokenizer.vocab.to_indices(token) for token in tokens
                ]
                seq_len = min(len(ids), self.max_seq_len)
                if len(ids) > self.max_seq_len:
                    ids = trunc_sequence(ids, self.max_seq_len)
                else:
                    pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                    ids = pad_sequence(ids, self.max_seq_len, pad_token_id)

                record = {'text': ids, 'seq_len': seq_len}
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of  PretrainedTokenizer or JiebaTokenizer"
                    .format(type(self.tokenizer)))

            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue

            # convert labels into record
            if labels:
                record["label"] = []
                if isinstance(self.tokenizer, PretrainedTokenizer):
                    tokens_with_special_token = self.tokenizer.convert_ids_to_tokens(
                        record['input_ids'])
                elif isinstance(self.tokenizer, JiebaTokenizer):
                    tokens_with_special_token = [
                        self.tokenizer.vocab.to_tokens(id_)
                        for id_ in record['text']
                    ]
                else:
                    raise RuntimeError(
                        "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                        .format(type(self.tokenizer)))

                tokens_index = 0
                for token in tokens_with_special_token:
                    if tokens_index < len(
                            tokens) and token == tokens[tokens_index]:
                        record["label"].append(
                            self.label_list.index(labels[tokens_index]))
                        tokens_index += 1
                    elif token in [pad_token]:
                        record["label"].append(
                            self.ignore_label)  # label of special token
                    else:
                        record["label"].append(
                            self.label_list.index(self.no_entity_label))
            records.append(record)
        return records
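
To make the label-alignment loop above concrete, a small worked illustration (the tokens and labels are invented for the example):

    # Suppose, after tokenization:
    tokens = ['EU', 'rejects', 'German']      # original tokens
    labels = ['B-ORG', 'O', 'B-MISC']
    tokens_with_special_token = ['[CLS]', 'EU', 'rejects', 'German', '[SEP]', '[PAD]']

    # Walking the loop: '[CLS]' and '[SEP]' match neither the next original token
    # nor the pad token, so they receive the no-entity label; '[PAD]' receives
    # self.ignore_label; each original token consumes one entry from labels.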