Esempio n. 1
0
    def __init__(
        self,
        task: str = None,
        load_checkpoint: str = None,
        label_map: Dict = None,
        num_classes: int = 2,
        **kwargs,
    ):
        """
        Build the pretrained 'electra-small' network together with the pieces required by ``task``.

        Args:
            task: 'seq-cls', 'token-cls', 'text-matching' or None (bare ElectraModel).
                The deprecated name 'sequence_classification' is mapped to 'seq-cls' with a warning.
            load_checkpoint: Optional parameter file. It is loaded with ``paddle.load`` only when the
                path points to an existing file; otherwise it is ignored.
            label_map: Optional mapping; when non-empty its size replaces ``num_classes``. For
                'token-cls' its values, ordered by sorted key, become the ChunkEvaluator label list,
                so it is mandatory for that task.
            num_classes: Number of classes used when ``label_map`` is None or empty.
            **kwargs: Forwarded unchanged to ``from_pretrained``.

        Raises:
            ValueError: If ``task`` is 'token-cls' and ``label_map`` is None or empty.
            RuntimeError: If ``task`` is not one of the names handled here.
        """
        super(Electra, self).__init__()
        # Always define the attribute (None when no map is given) so that every task can read it;
        # previously it only existed when a non-empty map was passed.
        self.label_map = label_map
        self.num_classes = len(label_map) if label_map else num_classes

        if task == 'sequence_classification':
            task = 'seq-cls'
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed in the future.",
            )
        if task == 'seq-cls':
            self.model = ElectraForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path='electra-small',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task == 'token-cls':
            # The chunk metric below is built from the label map, so fail early with a clear message
            # instead of an attribute error after the pretrained weights have been fetched.
            if not label_map:
                raise ValueError(
                    "task 'token-cls' requires a non-empty label_map to build the evaluation label list.")
            self.model = ElectraForTokenClassification.from_pretrained(
                pretrained_model_name_or_path='electra-small',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = ChunkEvaluator(label_list=[
                self.label_map[i] for i in sorted(self.label_map.keys())
            ])
        elif task == 'text-matching':
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path='electra-small', **kwargs)
            self.dropout = paddle.nn.Dropout(0.1)
            # The classifier consumes three hidden-size vectors concatenated and emits 2 logits.
            self.classifier = paddle.nn.Linear(
                self.model.config['hidden_size'] * 3, 2)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task is None:
            self.model = ElectraModel.from_pretrained(
                pretrained_model_name_or_path='electra-small', **kwargs)
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task

        if load_checkpoint is not None and os.path.isfile(load_checkpoint):
            state_dict = paddle.load(load_checkpoint)
            self.set_state_dict(state_dict)
            logger.info('Loaded parameters from %s' %
                        os.path.abspath(load_checkpoint))
Esempio n. 2
0
def record_exception(msg: str) -> str:
    '''
    Record the current exception information into the PaddleHub log file and warn the user about it.

    The traceback of the exception currently being handled is formatted and handed to ``record``;
    a warning combining ``msg`` with the value returned by ``record`` is then logged.

    Args:
        msg: Short description placed at the start of the warning.

    Returns:
        The value returned by ``record`` (reported in the warning as the place holding the detailed
        error information; presumably the log file path - confirm against ``record``).
    '''
    tb = traceback.format_exc()
    file = record(tb)
    logger.warning(
        '{}. Detailed error information can be found in the {}.'.format(
            msg, file))
    # The signature promises a value; previously the function fell off the end and returned None.
    return file
Esempio n. 3
0
    def _load_checkpoint(self):
        '''Load checkpoint and state dict'''
        max_epoch = -1

        for file in os.listdir(self.checkpoint_dir):
            if not file.startswith('epoch_'):
                continue

            _epoch = file.split('_')[-1]
            if not _epoch.isdigit():
                continue

            max_epoch = max(max_epoch, int(_epoch))

        if max_epoch == -1:
            if self.local_rank == 0:
                logger.warning('PaddleHub model checkpoint not found, start from scratch...')
            return

        # load best metrics
        self._load_metrics()

        self.current_epoch = max_epoch
        metric_msg = ['{}={:.4f}'.format(metric, value) for metric, value in self.best_metrics.items()]
        metric_msg = ' '.join(metric_msg)
        if self.local_rank == 0:
            logger.info('PaddleHub model checkpoint loaded. current_epoch={} [{}]'.format(
                self.current_epoch, metric_msg))

        model_path = os.path.join(self.checkpoint_dir, 'epoch_{}'.format(self.current_epoch))
        self.load_model(model_path)
Esempio n. 4
0
    def __init__(self,
                 base_path,
                 train_file=None,
                 dev_file=None,
                 test_file=None,
                 predict_file=None,
                 label_file=None,
                 label_list=None,
                 train_file_with_header=False,
                 dev_file_with_header=False,
                 test_file_with_header=False,
                 predict_file_with_header=False):
        """
        Store the dataset file names and load the examples of every phase that has a file.

        Args:
            base_path: Stored as ``self.base_path``.
            train_file, dev_file, test_file, predict_file: Per-phase file names. For every truthy
                one the matching ``_load_<phase>_examples`` method is called, in the order
                train, dev, test, predict.
            label_file: When truthy and no ``label_list`` is given, labels come from
                ``self._load_label_data()``.
            label_list: Explicit labels; takes precedence over ``label_file`` (a warning is logged
                when both are given).
            *_file_with_header: Per-phase flags collected into ``self.if_file_with_header``.

        Raises:
            ValueError: If none of train_file, dev_file and test_file is given.
        """
        if not any((train_file, dev_file, test_file)):
            raise ValueError('At least one file should be assigned')

        self.base_path = base_path
        self.train_file, self.dev_file = train_file, dev_file
        self.test_file, self.predict_file = test_file, predict_file
        self.label_file = label_file
        self.label_list = label_list

        self.train_examples = []
        self.dev_examples = []
        self.test_examples = []
        self.predict_examples = []

        self.if_file_with_header = dict(
            train=train_file_with_header,
            dev=dev_file_with_header,
            test=test_file_with_header,
            predict=predict_file_with_header)

        # Loaders are looked up lazily, so only the phases that actually have a file are touched.
        phase_files = (('train', train_file), ('dev', dev_file),
                       ('test', test_file), ('predict', predict_file))
        for phase, phase_file in phase_files:
            if phase_file:
                getattr(self, '_load_{}_examples'.format(phase))()

        if self.label_file:
            if self.label_list:
                logger.warning(
                    'As label_list has been assigned, label_file is noneffective'
                )
            else:
                self.label_list = self._load_label_data()

        # label_index (label -> position) only exists when some label list is available.
        if self.label_list:
            self.label_index = {
                label: position
                for position, label in enumerate(self.label_list)
            }
Esempio n. 5
0
    def __init__(
        self,
        task=None,
        load_checkpoint=None,
        label_map=None,
        num_classes=2,
        **kwargs,
    ):
        """
        Build the pretrained 'ernie-tiny' network together with the pieces required by ``task``.

        Args:
            task: 'seq-cls', 'token-cls' or None (bare ErnieModel). The deprecated name
                'sequence_classification' is mapped to 'seq-cls' with a warning.
            load_checkpoint: Optional parameter file; loaded only when it is an existing file.
            label_map: Optional mapping stored as ``self.label_map``; when non-empty its size
                replaces ``num_classes``.
            num_classes: Number of classes used when ``label_map`` is None or empty.
            **kwargs: Forwarded unchanged to ``from_pretrained``.

        Raises:
            RuntimeError: If ``task`` is not one of the names handled here.
        """
        super(ErnieTiny, self).__init__()
        self.num_classes = len(label_map) if label_map else num_classes

        if task == 'sequence_classification':
            task = 'seq-cls'
            logger.warning(
                "current task name 'sequence_classification' was renamed to 'seq-cls', "
                "'sequence_classification' has been deprecated and will be removed the future.",
            )

        if task in ('seq-cls', 'token-cls'):
            # Both classification tasks share loss and metric; only the head class differs.
            if task == 'seq-cls':
                head_cls = ErnieForSequenceClassification
            else:
                head_cls = ErnieForTokenClassification
            self.model = head_cls.from_pretrained(
                pretrained_model_name_or_path='ernie-tiny',
                num_classes=self.num_classes,
                **kwargs)
            self.criterion = paddle.nn.loss.CrossEntropyLoss()
            self.metric = paddle.metric.Accuracy()
        elif task is None:
            self.model = ErnieModel.from_pretrained(
                pretrained_model_name_or_path='ernie-tiny', **kwargs)
        else:
            raise RuntimeError(
                "Unknown task {}, task should be one in {}".format(
                    task, self._tasks_supported))

        self.task = task
        self.label_map = label_map

        # Nothing to restore unless an existing parameter file was supplied.
        if load_checkpoint is None or not os.path.isfile(load_checkpoint):
            return

        self.set_state_dict(paddle.load(load_checkpoint))
        logger.info('Loaded parameters from %s' %
                    os.path.abspath(load_checkpoint))
Esempio n. 6
0
 def _read_file(self, input_file, phase=None):
     '''Reads a tab separated value file.'''
     has_warned = False
     with io.open(input_file, 'r', encoding='UTF-8') as file:
         reader = csv.reader(file, delimiter='\t', quotechar=None)
         examples = []
         for (i, line) in enumerate(reader):
             if i == 0:
                 ncol = len(line)
                 if self.if_file_with_header[phase]:
                     continue
             if phase != 'predict':
                 if ncol == 1:
                     raise Exception(
                         'the %s file: %s only has one column but it is not a predict file'
                         % (phase, input_file))
                 elif ncol == 2:
                     example = InputExample(guid=i,
                                            text_a=line[0],
                                            label=line[1])
                 elif ncol == 3:
                     example = InputExample(guid=i,
                                            text_a=line[0],
                                            text_b=line[1],
                                            label=line[2])
                 else:
                     raise Exception(
                         'the %s file: %s has too many columns (should <=3)'
                         % (phase, input_file))
             else:
                 if ncol == 1:
                     example = InputExample(guid=i, text_a=line[0])
                 elif ncol == 2:
                     if not has_warned:
                         logger.warning(
                             'the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b'
                             % (input_file))
                         has_warned = True
                     example = InputExample(guid=i,
                                            text_a=line[0],
                                            text_b=line[1])
                 else:
                     raise Exception(
                         'the predict file: %s has too many columns (should <=2)'
                         % (input_file))
             examples.append(example)
         return examples
Esempio n. 7
0
    def __init__(self,
                 base_path: str,
                 tokenizer: Union[BertTokenizer, CustomTokenizer],
                 max_seq_len: Optional[int] = 128,
                 mode: Optional[str] = "train",
                 data_file: Optional[str] = None,
                 label_file: Optional[str] = None,
                 label_list: Optional[List[str]] = None):
        """
        Record the dataset configuration and resolve the label list.

        Args:
            base_path (:obj:`str`): The directory to the whole dataset.
            tokenizer (:obj:`BertTokenizer` or :obj:`CustomTokenizer`):
                It tokenizes the text and encodes the data as model needed.
            max_seq_len (:obj:`int`, `optional`, defaults to :128):
                If set to a number, will limit the total sequence returned so that it has a maximum length.
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train, test or dev).
            data_file(:obj:`str`, `optional`, defaults to :obj:`None`):
                The data file name, which is relative to the base_path. It is joined to
                base_path unconditionally, so a value must be supplied despite the default.
            label_file(:obj:`str`, `optional`, defaults to :obj:`None`):
                The label file name, which is relative to the base_path.
                It is all labels of the dataset, one line one label.
                Ignored (with a warning) when label_list is also given.
            label_list(:obj:`List[str]`, `optional`, defaults to :obj:`None`):
                The list of all labels of the dataset
        """
        self.data_file = os.path.join(base_path, data_file)
        self.label_list = label_list

        self.tokenizer, self.max_seq_len, self.mode = tokenizer, max_seq_len, mode

        if label_file:
            # label_file must be set before _load_label_data() is invoked.
            self.label_file = os.path.join(base_path, label_file)
            if self.label_list:
                logger.warning(
                    "As label_list has been assigned, label_file is noneffective"
                )
            else:
                self.label_list = self._load_label_data()

        # label_map (label -> position) only exists when some label list is available.
        if self.label_list:
            self.label_map = {
                label: position
                for position, label in enumerate(self.label_list)
            }
Esempio n. 8
0
    def _default_eval_end_event(self, run_states: List[RunState]):
        '''
        PaddleHub default handler for eval_end_event: computes the metrics, writes them to the
        visualization writer, logs them and saves the model when the main metric improves.

        The first entry of the computed scores is the main metric; when there are no scores the
        negated loss is used instead. A new best model is only saved in the 'dev' and 'val' phases.

        Args:
            run_states (object): the results in eval phase
        '''
        eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)

        def write_scalar(name, value):
            # Scalars are only written while a 'train' environment exists; its current step is the x value.
            if 'train' in self._envs:
                self.vdl_writer.add_scalar(
                    tag='{}_{}'.format(name, self.phase), value=value, step=self._envs['train'].current_step)

        write_scalar('Loss', eval_loss)

        score_parts = []
        for metric, score in eval_scores.items():
            write_scalar(metric, score)
            score_parts.append('{}={:.5f} '.format(metric, score))
        log_scores = ''.join(score_parts)

        logger.eval('[{} dataset evaluation result] loss={:.5f} {}[step/sec: {:.2f}]'.format(
            self.phase, eval_loss, log_scores, run_speed))

        if eval_scores:
            # The first metric is the one used to compare models
            main_metric, main_value = next(iter(eval_scores.items()))
        else:
            logger.warning('None of metrics has been implemented, loss will be used to evaluate.')
            # Negated so that "larger is better" also holds for the loss
            main_metric, main_value = 'negative loss', -eval_loss

        is_new_best = self.phase in ['dev', 'val'] and main_value > self.best_score
        if not is_new_best:
            return

        self.best_score = main_value
        model_saved_dir = os.path.join(self.config.checkpoint_dir, 'best_model')
        logger.eval('best model saved to {} [best {}={:.5f}]'.format(model_saved_dir, main_metric, main_value))
        self.save_inference_model(dirname=model_saved_dir)
Esempio n. 9
0
    def __init__(self, speaker_audio: str = None, output_dir: str = './'):
        """
        Build the three networks of the voice cloning pipeline (speaker encoder, Tacotron2
        synthesizer, WaveFlow vocoder), restore their checkpoints from the module assets and put
        them in eval mode, then compute the initial speaker embedding.

        Args:
            speaker_audio: Audio file used for the speaker embedding. When it is None or not an
                existing file, the bundled 'voice_cloning.wav' asset is used and a warning is logged.
            output_dir: Directory stored (as an absolute path) in ``self.output_dir``; created when
                it does not exist.
        """
        super(VoiceCloner, self).__init__()

        self.sample_rate = 22050  # Hyper params for the following model ckpts.

        # Every checkpoint and the fallback waveform live under the same assets directory.
        assets_dir = os.path.join(MODULE_HOME, 'lstm_tacotron2', 'assets')

        def restore(network, checkpoint):
            # Load the stored parameters and switch the network to inference mode.
            network.set_state_dict(paddle.load(os.path.join(assets_dir, checkpoint)))
            network.eval()

        # Speaker encoder
        self.speaker_processor = SpeakerVerificationPreprocessor(
            sampling_rate=16000,
            audio_norm_target_dBFS=-30,
            vad_window_length=30,
            vad_moving_average_width=8,
            vad_max_silence_length=6,
            mel_window_length=25,
            mel_window_step=10,
            n_mels=40,
            partial_n_frames=160,
            min_pad_coverage=0.75,
            partial_overlap_ratio=0.5,
        )
        self.speaker_encoder = LSTMSpeakerEncoder(
            n_mels=40, num_layers=3, hidden_size=256, output_size=256)
        restore(self.speaker_encoder, 'ge2e_ckpt_0.3/step-3000000.pdparams')

        # Voice synthesizer
        self.synthesizer = Tacotron2(
            vocab_size=68,
            n_tones=10,
            d_mels=80,
            d_encoder=512,
            encoder_conv_layers=3,
            encoder_kernel_size=5,
            d_prenet=256,
            d_attention_rnn=1024,
            d_decoder_rnn=1024,
            attention_filters=32,
            attention_kernel_size=31,
            d_attention=128,
            d_postnet=512,
            postnet_kernel_size=5,
            postnet_conv_layers=5,
            reduction_factor=1,
            p_encoder_dropout=0.5,
            p_prenet_dropout=0.5,
            p_attention_dropout=0.1,
            p_decoder_dropout=0.1,
            p_postnet_dropout=0.5,
            d_global_condition=256,
            use_stop_token=False,
        )
        restore(self.synthesizer, 'tacotron2_aishell3_ckpt_0.3/step-450000.pdparams')

        # Vocoder
        self.vocoder = ConditionalWaveFlow(
            upsample_factors=[16, 16],
            n_flows=8,
            n_layers=8,
            n_group=16,
            channels=128,
            n_mels=80,
            kernel_size=[3, 3],
        )
        restore(self.vocoder, 'waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams')

        # Speaking embedding
        self._speaker_embedding = None
        has_speaker_audio = speaker_audio is not None and os.path.isfile(speaker_audio)
        if not has_speaker_audio:
            # Fall back to the waveform shipped with the module.
            speaker_audio = os.path.join(assets_dir, 'voice_cloning.wav')
            logger.warning(
                f'Due to no speaker audio is specified, speaker encoder will use defult '
                f'waveform({speaker_audio}) to extract speaker embedding. You can use '
                '"set_speaker_embedding()" method to reset a speaker audio for voice cloning.'
            )
        self.set_speaker_embedding(speaker_audio)

        self.output_dir = os.path.abspath(output_dir)
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
Esempio n. 10
0
    def __init__(self,
                 dataset: Iterator = None,
                 feed_list: List = None,
                 data_reader: Generic = None,
                 main_program: paddle.static.Program = None,
                 startup_program: paddle.static.Program = None,
                 config: RunConfig = None,
                 metrics_choices: List[str] = None):
        """
        Set up programs, run configuration, executor, event hooks and dataset bookkeeping.

        Args:
            dataset: Optional dataset; when truthy its ``get_labels()`` result is kept as the label list.
            feed_list: Stored as ``self._base_feed_list``.
            data_reader: Stored as ``self._base_data_reader``; a truthy value enables compatible mode.
            main_program: Program to clone as the base main program; the default main program when None.
            startup_program: Program to clone as the base startup program; the default startup
                program when None.
            config: Run configuration; a fresh ``RunConfig()`` is used when falsy.
            metrics_choices: Metric name or list of names. None or an empty value selects ``['acc']``;
                a single non-list value is wrapped into a list.
        """
        # metrics item
        self.best_score = -999
        # None and empty selections both fall back to accuracy. (A former `elif ... == None`
        # branch could never run because None is already caught here, so it was dropped.)
        if not metrics_choices:
            metrics_choices = ['acc']
        if isinstance(metrics_choices, list):
            self.metrics_choices = metrics_choices
        else:
            self.metrics_choices = [metrics_choices]

        if main_program is None:
            main_program = paddle.static.default_main_program()
        self._base_main_program = paddle_utils.clone_program(main_program, for_test=False)
        if startup_program is None:
            startup_program = paddle.static.default_startup_program()
        self._base_startup_program = paddle_utils.clone_program(startup_program, for_test=False)
        self.is_checkpoint_loaded = False
        self._base_compiled_program = None

        # run config
        self.config = config if config else RunConfig()
        self.place = self.places[0]
        self.device_count = len(self.places)

        if self.config.use_data_parallel:
            if not self.config.use_pyreader and self.config.batch_size < self.device_count:
                logger.warning(
                    'Batch size({}) is less than the count of devices({}), which is not allowed in current Paddle versions'
                    .format(self.config.batch_size, self.device_count))
                logger.warning('Batch size automatically adjusted to {}'.format(self.device_count))
                # Raise the batch size to one sample per device.
                self.config._batch_size = self.device_count

        self.exe = paddle.static.Executor(place=self.place)
        self.build_strategy = paddle.static.BuildStrategy()

        # run environment
        self._phases = []
        self._envs = {}
        self._predict_data = None
        self._vdl_writer = None

        # event hooks
        self._hooks = TaskHooks()
        for hook_type in self._hooks._registered_hooks:
            # Plain attribute lookup replaces eval(); it resolves the same `_default_<hook_type>` method.
            self._hooks.add(hook_type, 'default', getattr(self, '_default_{}'.format(hook_type)))
            # NOTE: the event function is installed on the BaseTask class itself, not on the instance.
            setattr(BaseTask, '_{}'.format(hook_type), self.create_event_function(hook_type))

        # accelerate predict
        self.is_best_model_loaded = False
        self._predictor = None

        # set default phase
        self.enter_phase('train')

        self.dataset = dataset
        self._label_list = dataset.get_labels() if dataset else None

        self._base_data_reader = data_reader
        self._base_feed_list = feed_list

        self._compatible_mode = bool(data_reader)