Example #1
def from_module_attr_to_pyobj(module_attr):
    if module_attr.type == module_desc_pb2.BOOLEAN:
        result = module_attr.b
    elif module_attr.type == module_desc_pb2.INT:
        result = module_attr.i
    elif module_attr.type == module_desc_pb2.STRING:
        result = module_attr.s
    elif module_attr.type == module_desc_pb2.FLOAT:
        result = module_attr.f
    elif module_attr.type == module_desc_pb2.LIST:
        result = []
        for index in range(len(module_attr.list.data)):
            result.append(
                from_module_attr_to_pyobj(module_attr.list.data[str(index)]))
    elif module_attr.type == module_desc_pb2.SET:
        result = set()
        for index in range(len(module_attr.set.data)):
            result.add(
                from_module_attr_to_pyobj(module_attr.set.data[str(index)]))
    elif module_attr.type == module_desc_pb2.MAP:
        result = {}
        for key, value in module_attr.map.data.items():
            key = get_pykey(key, module_attr.map.key_type[key])
            result[key] = from_module_attr_to_pyobj(value)
    elif module_attr.type == module_desc_pb2.NONE:
        result = None
    elif module_attr.type == module_desc_pb2.OBJECT:
        result = None
        logger.warning("can't tran module attr to python object")
    else:
        result = None
        logger.warning("unknown type of module attr")

    return result
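
A minimal round-trip sketch of how this decoder pairs with from_pyobj_to_module_attr from Example #13. The import path is an assumption; only module_desc_pb2.ModuleDesc and its attr field appear in the examples themselves:

# Assumed import path; the listing only shows that module_desc_pb2.ModuleDesc exists.
from paddlehub.module import module_desc_pb2

desc = module_desc_pb2.ModuleDesc()
# Encode a plain Python value into the protobuf attr message ...
from_pyobj_to_module_attr({"lr": 0.5, "tags": ["a", "b"]}, desc.attr)
# ... and decode it back; lists, sets and dicts are handled recursively.
restored = from_module_attr_to_pyobj(desc.attr)
print(restored)  # expected (key order aside): {'lr': 0.5, 'tags': ['a', 'b']}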
Example #2
    def new_eval_end_event(self, run_states):
        """
        PaddleHub default handler for eval_end_event; it completes visualization and metrics calculation.
        Args:
            run_states (object): the results of the eval phase
        """
        eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)

        if 'train' in self._envs:
            self.vdl_writer.add_scalar(tag="Loss_{}".format(self.phase),
                                       value=eval_loss,
                                       step=self._envs['train'].current_step)

        log_scores = ""

        s = []

        for metric in eval_scores:
            if 'train' in self._envs:
                self.vdl_writer.add_scalar(
                    tag="{}_{}".format(metric, self.phase),
                    value=eval_scores[metric],
                    step=self._envs['train'].current_step)
                # dev[metric].add_record(self._envs['train'].current_step, eval_scores[metric])
            log_scores += "%s=%.5f " % (metric, eval_scores[metric])
            s.append(eval_scores[metric])
            # dev[metric].add_record(self.current_step,eval_scores[metric])
        logger.eval(
            "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
            (self.phase, eval_loss, log_scores, run_speed))
        s.append(eval_loss)
        if 'train' in self._envs:
            s = [self._envs['train'].current_step] + s
            # dev_loss.add_record(self._envs['train'].current_step,eval_loss)
        s = [str(x) for x in s]
        with open('./work/log/%s_dev%s.txt' % (args.do_model, id),
                  'a',
                  encoding='utf-8') as f:
            f.write(','.join(s) + '\n')

        eval_scores_items = eval_scores.items()
        if len(eval_scores_items):
            # The first metric will be chosen for evaluation
            if (args.dev_goal == 'f1'):
                main_metric, main_value = list(eval_scores_items)[0]
            else:  #loss
                main_metric, main_value = "negative loss", -eval_loss
        else:
            logger.warning(
                "None of metrics has been implemented, loss will be used to evaluate."
            )
            # The larger, the better
            main_metric, main_value = "negative loss", -eval_loss
        if self.phase in ["dev", "val"] and main_value > self.best_score:
            self.best_score = main_value
            model_saved_dir = os.path.join(self.config.checkpoint_dir,
                                           "best_model")
            logger.eval("best model saved to %s [best %s=%.5f]" %
                        (model_saved_dir, main_metric, main_value))
            self.save_inference_model(dirname=model_saved_dir)
Example #3
    def __new__(cls,
                name=None,
                directory=None,
                module_dir=None,
                version=None,
                **kwargs):
        if cls.__name__ == "Module":
            if name:
                module = cls.init_with_name(
                    name=name, version=version, **kwargs)
            elif directory:
                module = cls.init_with_directory(directory=directory, **kwargs)
            elif module_dir:
                logger.warning(
                    "Parameter module_dir is deprecated, please use directory to specify the path"
                )
                if isinstance(module_dir, list) or isinstance(
                        module_dir, tuple):
                    directory = module_dir[0]
                    version = module_dir[1]
                else:
                    directory = module_dir
                module = cls.init_with_directory(directory=directory, **kwargs)
            CacheUpdater("update_cache", module.name, module.version).start()
        else:
            if not name and not directory:
                directory = os.path.dirname(
                    os.path.abspath(sys.modules[cls.__module__].__file__))
                module = Module.init_with_directory(
                    directory=directory, **kwargs)
            else:
                module = object.__new__(cls)

        return module
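
Since __new__ dispatches on whether name, directory, or the deprecated module_dir is supplied, a hedged usage sketch (the module name and paths are illustrative, not taken from this listing):

import paddlehub as hub

by_name = hub.Module(name="lac")                        # install/load a module by name
from_dir = hub.Module(directory="./my_module")          # load a module from a local directory
# Legacy form; triggers the deprecation warning shown above:
legacy = hub.Module(module_dir=("./my_module", "1.0.0"))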
Example #4
 def _check_paddle_version(self):
     if version_compare(self.paddle_version, paddle.__version__):
         logger.warning(
             "This Module is generated by the PaddlePaddle with version %s, and the local PaddlePaddle version is %s, which may cause serious incompatible bug. Please upgrade PaddlePaddle to the latest version."
             % (self.paddle_version, paddle.__version__))
         return False
     return True
Example #5
 def _check_module_proto_version(self):
     if self.module_proto_version != module_proto_version:
         logger.warning(
             "Module description file version cannot be aligned with PaddleHub version"
         )
         return False
     return True
Example #6
    def _postprocessing(self, run_states):
        if self._compatible_mode:
            id2label = {
                val: key
                for key, val in self._base_data_reader.label_map.items()
            }
        else:
            if self._label_list:
                id2label = {}
                for index, label in enumerate(self._label_list):
                    id2label[index] = label
            else:
                logger.warning(
                    "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
                )
                return run_states

        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                current_id += length if self.add_crf else self.max_seq_len
                results.append(seq_result)
        return results
Example #7
    def predict(
        self,
        data=None,
        label_list=None,
        load_best_model=True,
        return_result=False,
        accelerate_mode=True,
    ):
        """
        Make predictions for the input data.

        Args:
            data (list): the data to be predicted. Each element should be a record when the task is initialized without the data_reader param,
                         or a plaintext string when the task is initialized with the data_reader param (deprecated in paddlehub v1.8).
            label_list (list): the label list, used to postprocess the output.
            load_best_model (bool): whether to load the best model
            return_result (bool): return a readable result or just the raw run result. Always treated as True when the task is initialized with the dataset parameter instead of data_reader.
            accelerate_mode (bool): whether to use the high-performance predictor

        Returns:
            RunState: the running result of the predict phase
        """
        if accelerate_mode:
            if isinstance(self._base_data_reader,
                          hub.reader.LACClassifyReader):
                logger.warning(
                    "LACClassifyReader does not support predictor, the accelerate_mode is closed now."
                )
                accelerate_mode = False
            elif isinstance(self, hub.TextGenerationTask):
                logger.warning(
                    "TextGenerationTask does not support predictor, the accelerate_mode is closed now."
                )
                accelerate_mode = False
        self.accelerate_mode = accelerate_mode

        with self.phase_guard(phase="predict"):
            self._predict_data = data
            if label_list:
                self._label_list = label_list
            self._predict_start_event()

            if load_best_model:
                self.init_if_load_best_model()
            else:
                self.init_if_necessary()
            if not self.accelerate_mode:
                run_states = self._run()
            else:
                if not self._predictor:
                    self._predictor = self._create_predictor()
                run_states = self._run_with_predictor()

            self._predict_end_event(run_states)
            self._predict_data = None
            if return_result or not self._compatible_mode:
                return self._postprocessing(run_states)
        return run_states
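
A hedged usage sketch for this predict signature; the task object and its construction are assumptions (any PaddleHub fine-tune task exposing this method is expected to behave the same way):

# 'task' is assumed to be something like a hub.TextClassifierTask built elsewhere.
results = task.predict(
    data=[["this movie is great"], ["terrible plot"]],  # one record per sample
    label_list=["negative", "positive"],                 # only used to postprocess the output
    load_best_model=True,
    return_result=True,        # readable labels instead of raw RunState objects
    accelerate_mode=True)      # falls back automatically when the reader/task lacks predictor support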
Example #8
    def generate(self, texts, use_gpu=False, beam_width=5):
        """
        Get the continuation of the input poetry.

        Args:
             texts(list): the beginning part of a poem.
             use_gpu(bool): whether to use the GPU for prediction
             beam_width(int): the beam search width.

        Returns:
             results(list): the poetry continuations.
        """
        paddle.disable_static()

        if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
            predicted_data = texts
        else:
            raise ValueError(
                "The input texts should be a list with nonempty string elements."
            )

        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
            )
        paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')
        self.model.eval()
        results = []
        for text in predicted_data:
            sample_results = []
            encode_text = self.tokenizer.encode(text)
            src_ids = paddle.to_tensor(encode_text['input_ids']).unsqueeze(0)
            src_sids = paddle.to_tensor(
                encode_text['token_type_ids']).unsqueeze(0)
            output_ids = beam_search_infilling(
                self.model,
                src_ids,
                src_sids,
                eos_id=self.tokenizer.vocab['[SEP]'],
                sos_id=self.tokenizer.vocab['[CLS]'],
                attn_id=self.tokenizer.vocab['[MASK]'],
                pad_id=self.tokenizer.vocab['[PAD]'],
                unk_id=self.tokenizer.vocab['[UNK]'],
                vocab_size=len(self.tokenizer.vocab),
                max_decode_len=80,
                max_encode_len=20,
                beam_width=beam_width,
                tgt_type_id=1)
            output_str = self.rev_lookup(output_ids[0])

            for ostr in output_str.tolist():
                if '[SEP]' in ostr:
                    ostr = ostr[:ostr.index('[SEP]')]
                sample_results.append("".join(ostr))
            results.append(sample_results)
        return results
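
A hedged call sketch for this generate method; "ernie_gen_poetry" is one PaddleHub module known to expose a generate(texts, use_gpu, beam_width) interface, though the snippet above is not necessarily taken from it:

import paddlehub as hub

module = hub.Module(name="ernie_gen_poetry")
results = module.generate(texts=["白日依山尽,黄河入海流。"],
                          use_gpu=False,  # forced to False anyway when CUDA_VISIBLE_DEVICES is unset
                          beam_width=5)
print(results[0])  # up to beam_width candidate continuations for the first input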
Example #9
    def generate(self, texts, use_gpu=False, beam_width=5):
        """
        Get the predict result from the input texts.

        Args:
             texts(list): the input texts.
             use_gpu(bool): whether to use the GPU for prediction
             beam_width(int): the beam search width.

        Returns:
             results(list): the predict result.
        """
        if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
            predicted_data = texts
        else:
            raise ValueError(
                "The input texts should be a list with nonempty string elements."
            )

        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        with fluid.dygraph.guard(place):
            self.model.eval()
            results = []
            for text in predicted_data:
                sample_results = []
                ids, sids = self.tokenizer.encode(text)
                src_ids = D.to_variable(np.expand_dims(ids, 0))
                src_sids = D.to_variable(np.expand_dims(sids, 0))
                output_ids = beam_search_infilling(
                    self.model,
                    src_ids,
                    src_sids,
                    eos_id=self.tokenizer.sep_id,
                    sos_id=self.tokenizer.cls_id,
                    attn_id=self.tokenizer.vocab['[MASK]'],
                    max_decode_len=50,
                    max_encode_len=50,
                    beam_width=beam_width,
                    tgt_type_id=1)
                output_str = self.rev_lookup(output_ids[0].numpy())

                for ostr in output_str.tolist():
                    if '[SEP]' in ostr:
                        ostr = ostr[:ostr.index('[SEP]')]
                    sample_results.append("".join(ostr))
                results.append(sample_results)
        return results
Example #10
    def _read_file(self, input_file, phase=None):
        """Reads a tab separated value file."""
        has_warned = False
        with io.open(input_file, "r", encoding="UTF-8") as file:
            reader = csv.reader(file, delimiter="\t", quotechar=None)
            examples = []
            for (i, line) in enumerate(reader):

                if i == 0:
                    ncol = len(line)
                    if self.if_file_with_header[phase]:
                        continue
                if (len(line) != ncol):
                    print(line)
                if phase != "predict":
                    if ncol == 1:
                        raise Exception(
                            "the %s file: %s only has one column but it is not a predict file"
                            % (phase, input_file))
                    elif ncol == 2:
                        example = InputExample(guid=i,
                                               text_a=line[0],
                                               label=line[1])
                    elif ncol == 3:
                        example = InputExample(guid=i,
                                               text_a=line[0],
                                               text_b=line[1],
                                               label=line[2])
                    else:
                        raise Exception(
                            "the %s file: %s has too many columns (should <=3_实体识别)"
                            % (phase, input_file))
                else:
                    if ncol == 1:
                        example = InputExample(guid=i, text_a=line[0])
                    elif ncol == 2:
                        if not has_warned:
                            logger.warning(
                                "the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
                                % (input_file))
                            has_warned = True
                        example = InputExample(guid=i,
                                               text_a=line[0],
                                               text_b=line[1])
                    else:
                        raise Exception(
                            "the predict file: %s has too many columns (should <=2)"
                            % (input_file))
                examples.append(example)
                # print(example)
            return examples
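
For reference, this reader consumes tab-separated files; a hedged sketch of the accepted layouts (texts and labels are made up, <TAB> stands for a literal tab character, and the comment lines are annotations rather than file content):

# train/dev/test with 2 columns:  text_a<TAB>label
this movie is great<TAB>1
# train/dev/test with 3 columns:  text_a<TAB>text_b<TAB>label
what is the capital of France<TAB>Paris is the capital of France<TAB>1
# predict file with 1 or 2 columns (no label); a second column is treated as text_b
some text to classify
# an optional header row is skipped when the corresponding *_file_with_header flag is True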
Example #11
 def preprocess(text):
     data_dict = {self.feed_key: [text]}
     processed = self.lac.lexical_analysis(data=data_dict)
     processed = [
         self.vocab[word] for word in processed[0]['word']
         if word in self.vocab
     ]
     if len(processed) == 0:
         if six.PY2:
             text = text.encode(sys_stdout_encoding())
         logger.warning(
             "The words in text %s can't be found in the vocabulary." %
             (text))
     return processed
Example #12
    def __init__(self,
                 base_path,
                 train_file=None,
                 dev_file=None,
                 test_file=None,
                 predict_file=None,
                 label_file=None,
                 label_list=None,
                 train_file_with_header=False,
                 dev_file_with_header=False,
                 test_file_with_header=False,
                 predict_file_with_header=False):
        if not (train_file or dev_file or test_file):
            raise ValueError("At least one file should be assigned")
        self.base_path = base_path
        self.train_file = train_file
        self.dev_file = dev_file
        self.test_file = test_file
        self.predict_file = predict_file
        self.label_file = label_file
        self.label_list = label_list

        self.train_examples = []
        self.dev_examples = []
        self.test_examples = []
        self.predict_examples = []

        self.if_file_with_header = {
            "train": train_file_with_header,
            "dev": dev_file_with_header,
            "test": test_file_with_header,
            "predict": predict_file_with_header
        }

        if train_file:
            self._load_train_examples()
        if dev_file:
            self._load_dev_examples()
        if test_file:
            self._load_test_examples()
        if predict_file:
            self._load_predict_examples()
        if self.label_file:
            if not self.label_list:
                self.label_list = self._load_label_data()
            else:
                logger.warning(
                    "As label_list has been assigned, label_file is noneffective"
                )
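
A hedged construction sketch for a dataset built on this __init__; the class name and file names are placeholders (any subclass that wires up a _read_file like Example #10 works the same way):

dataset = MyTextDataset(                 # hypothetical subclass of the base dataset above
    base_path="./data",
    train_file="train.tsv",
    dev_file="dev.tsv",
    test_file="test.tsv",
    predict_file="predict.tsv",
    label_list=["0", "1"],               # label_file would be ignored when this is set
    train_file_with_header=True,
    dev_file_with_header=True,
    test_file_with_header=True,
    predict_file_with_header=True)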
Example #13
def from_pyobj_to_module_attr(pyobj, module_attr, obj_filter=None):
    if obj_filter and obj_filter(pyobj):
        return
    if isinstance(pyobj, bool):
        module_attr.type = module_desc_pb2.BOOLEAN
        module_attr.b = pyobj
    elif isinstance(pyobj, six.integer_types):
        module_attr.type = module_desc_pb2.INT
        module_attr.i = pyobj
    elif isinstance(pyobj, six.text_type):
        module_attr.type = module_desc_pb2.STRING
        module_attr.s = pyobj
    elif isinstance(pyobj, six.binary_type):
        module_attr.type = module_desc_pb2.STRING
        module_attr.s = pyobj
    elif isinstance(pyobj, float):
        module_attr.type = module_desc_pb2.FLOAT
        module_attr.f = pyobj
    elif isinstance(pyobj, list) or isinstance(pyobj, tuple):
        module_attr.type = module_desc_pb2.LIST
        for index, obj in enumerate(pyobj):
            from_pyobj_to_module_attr(obj, module_attr.list.data[str(index)],
                                      obj_filter)
    elif isinstance(pyobj, set):
        module_attr.type = module_desc_pb2.SET
        for index, obj in enumerate(list(pyobj)):
            from_pyobj_to_module_attr(obj, module_attr.set.data[str(index)],
                                      obj_filter)
    elif isinstance(pyobj, dict):
        module_attr.type = module_desc_pb2.MAP
        for key, value in pyobj.items():
            from_pyobj_to_module_attr(value, module_attr.map.data[str(key)],
                                      obj_filter)
            module_attr.map.key_type[str(key)] = get_keyed_type_of_pyobj(key)
    elif isinstance(pyobj, type(None)):
        module_attr.type = module_desc_pb2.NONE
    else:
        module_attr.type = module_desc_pb2.OBJECT
        module_attr.name = str(pyobj.__class__.__name__)
        if not hasattr(pyobj, "__dict__"):
            logger.warning("python obj %s has not __dict__ attr" %
                           module_attr.name)
            return
        for key, value in pyobj.__dict__.items():
            from_pyobj_to_module_attr(value, module_attr.object.data[str(key)],
                                      obj_filter)
            module_attr.object.key_type[str(key)] = get_keyed_type_of_pyobj(
                key)
Example #14
 def step(self):
     if self.scheduler["gradual_unfreeze"] > 0:
         self.epoch += 1
         if self.max_depth > 0 and self.epoch <= self.scheduler[
                 "gradual_unfreeze"]:
             set_gradual_unfreeze(
                 self.main_program,
                 unfreeze_depths=self.
                 sorted_depth[:self.max_depth * self.epoch //
                              self.scheduler["gradual_unfreeze"]])
         else:
             logger.warning(
                 "The max op-depth in the network is %s. That results in that can't use the gradual unfreeze finetune strategy."
                 % (self.max_depth))
     else:
         pass
Example #15
    def __init__(self):
        logger.warning(
            "ImageClassificationDataset is no longer recommended from PaddleHub v1.5.0, "
            "please use BaseCVDataset instead of ImageClassificationDataset. "
            "It's more easy-to-use with more functions and support evaluating test set "
            "in the end of finetune automatically.")
        self.base_path = None
        self.train_list_file = None
        self.test_list_file = None
        self.validate_list_file = None
        self.label_list_file = None
        self.num_labels = 0
        self.label_list = []

        self.train_examples = []
        self.dev_examples = []
        self.test_examples = []
Example #16
    def predict(self,
                data,
                load_best_model=True,
                return_result=False,
                accelerate_mode=True):
        """
        Make predictions for the input data.

        Args:
            data (list): the data to be predicted.
            load_best_model (bool): whether to load the best model
            return_result (bool): return a readable result or just the raw run result
            accelerate_mode (bool): whether to use the high-performance predictor

        Returns:
            RunState: the running result of the predict phase
        """
        if not version_compare(paddle.__version__,
                               "1.6.2") and accelerate_mode:
            logger.warning(
                "Fail to open predict accelerate mode as it does not support paddle < 1.6.2. Please update PaddlePaddle."
            )
            accelerate_mode = False
        self.accelerate_mode = accelerate_mode

        with self.phase_guard(phase="predict"):
            self._predict_data = data
            self._predict_start_event()

            if load_best_model:
                self.init_if_load_best_model()
            else:
                self.init_if_necessary()
            if not self.accelerate_mode:
                run_states = self._run()
            else:
                if not self._predictor:
                    self._predictor = self._create_predictor()
                run_states = self._run_with_predictor()

            self._predict_end_event(run_states)
            self._predict_data = None
            if return_result:
                return self._postprocessing(run_states)
        return run_states
Example #17
    def predict(self,
                data,
                load_best_model=True,
                return_result=False,
                accelerate_mode=True):
        """
        Make predictions for the input data.

        Args:
            data (list): the data to be predicted.
            load_best_model (bool): whether to load the best model
            return_result (bool): return a readable result or just the raw run result
            accelerate_mode (bool): whether to use the high-performance predictor

        Returns:
            RunState: the running result of the predict phase
        """
        if accelerate_mode and isinstance(self._base_data_reader,
                                          hub.reader.LACClassifyReader):
            logger.warning(
                "LACClassifyReader does not support predictor, the accelerate_mode is closed now."
            )
            accelerate_mode = False
        self.accelerate_mode = accelerate_mode

        with self.phase_guard(phase="predict"):
            self._predict_data = data
            self._predict_start_event()

            if load_best_model:
                self.init_if_load_best_model()
            else:
                self.init_if_necessary()
            if not self.accelerate_mode:
                run_states = self._run()
            else:
                if not self._predictor:
                    self._predictor = self._create_predictor()
                run_states = self._run_with_predictor()

            self._predict_end_event(run_states)
            self._predict_data = None
            if return_result:
                return self._postprocessing(run_states)
        return run_states
Example #18
 def check_module_valid(self, module_path):
     try:
         desc_pb_path = os.path.join(module_path, 'module_desc.pb')
         if os.path.exists(desc_pb_path) and os.path.isfile(desc_pb_path):
             info = {}
             desc = module_desc_pb2.ModuleDesc()
             with open(desc_pb_path, "rb") as fp:
                 desc.ParseFromString(fp.read())
             info['version'] = desc.attr.map.data["module_info"].map.data[
                 "version"].s
             return True, info
         else:
             logger.warning(
                 "%s does not exist, the module will be reinstalled" %
                 desc_pb_path)
     except:
         pass
     return False, None
Example #19
    def _default_eval_end_event(self, run_states):
        """
        PaddleHub default handler for eval_end_event; it completes visualization and metrics calculation.

        Args:
            run_states (object): the results of the eval phase
        """
        eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
        if 'train' in self._envs:
            self.tb_writer.add_scalar(
                tag="Loss_{}".format(self.phase),
                scalar_value=eval_loss,
                global_step=self._envs['train'].current_step)

        log_scores = ""
        for metric in eval_scores:
            if 'train' in self._envs:
                self.tb_writer.add_scalar(
                    tag="{}_{}".format(metric, self.phase),
                    scalar_value=eval_scores[metric],
                    global_step=self._envs['train'].current_step)
            log_scores += "%s=%.5f " % (metric, eval_scores[metric])
        logger.eval(
            "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
            (self.phase, eval_loss, log_scores, run_speed))

        eval_scores_items = eval_scores.items()
        if len(eval_scores_items):
            # The first metric will be chosen for evaluation
            main_metric, main_value = list(eval_scores_items)[0]
        else:
            logger.warning(
                "None of metrics has been implemented, loss will be used to evaluate."
            )
            # The larger, the better
            main_metric, main_value = "negative loss", -eval_loss
        if self.phase in ["dev", "val"] and main_value > self.best_score:
            self.best_score = main_value
            model_saved_dir = os.path.join(self.config.checkpoint_dir,
                                           "best_model")
            logger.eval("best model saved to %s [best %s=%.5f]" %
                        (model_saved_dir, main_metric, main_value))
            self.save_inference_model(dirname=model_saved_dir)
Example #20
    def __init__(self,
                 log_interval=10,
                 eval_interval=100,
                 use_pyreader=True,
                 use_data_parallel=True,
                 save_ckpt_interval=None,
                 use_cuda=True,
                 checkpoint_dir=None,
                 num_epoch=1,
                 batch_size=32,
                 enable_memory_optim=False,
                 strategy=None):
        """ Construct finetune Config """
        self._log_interval = log_interval
        self._eval_interval = eval_interval
        self._save_ckpt_interval = save_ckpt_interval
        self._use_cuda = use_cuda
        self._checkpoint_dir = checkpoint_dir
        self._num_epoch = num_epoch
        self._batch_size = batch_size
        if not use_pyreader:
            logger.warning(
                "The parameter use_pyreader has been dropped! PaddleHub over v1.8.0 will use pyreader by default."
            )
        self._use_pyreader = True

        self._use_data_parallel = use_data_parallel
        if strategy is None:
            self._strategy = DefaultStrategy()
        else:
            self._strategy = strategy
        if enable_memory_optim:
            logger.warning(
                "The memory optimization feature has been dropped! PaddleHub now doesn't optimize the memory of the program."
            )
        self._enable_memory_optim = False
        if checkpoint_dir is None:
            now = int(time.time())
            time_str = time.strftime("%Y%m%d%H%M%S", time.localtime(now))
            self._checkpoint_dir = "ckpt_" + time_str
        else:
            self._checkpoint_dir = checkpoint_dir
        logger.info("Checkpoint dir: {}".format(self._checkpoint_dir))
Example #21
    def _eval_end_event(self, run_states):
        eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
        if 'train' in self._envs:
            self.tb_writer.add_scalar(
                tag="Loss_{}".format(self.phase),
                scalar_value=eval_loss,
                global_step=self._envs['train'].current_step)

        log_scores = ""
        for metric in eval_scores:
            if 'train' in self._envs:
                self.tb_writer.add_scalar(
                    tag="{}_{}".format(metric, self.phase),
                    scalar_value=eval_scores[metric],
                    global_step=self._envs['train'].current_step)
            log_scores += "%s=%.5f " % (metric, eval_scores[metric])
        logger.info(
            "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
            (self.phase, eval_loss, log_scores, run_speed))

        eval_scores_items = eval_scores.items()
        if len(eval_scores_items):
            # The first metric will be chosen for evaluation
            main_metric, main_value = list(eval_scores_items)[0]
        else:
            logger.warning(
                "None of metrics has been implemented, loss will be used to evaluate."
            )
            # The larger, the better
            main_metric, main_value = "negative loss", -eval_loss
        if self.phase in ["dev", "val"] and main_value > self.best_score:
            self.best_score = main_value
            model_saved_dir = os.path.join(self.config.checkpoint_dir,
                                           "best_model")
            logger.info("best model saved to %s [best %s=%.5f]" %
                        (model_saved_dir, main_metric, main_value))

            save_result = fluid.io.save_persistables(
                executor=self.exe,
                dirname=model_saved_dir,
                main_program=self.main_program)
Example #22
    def __init__(self,
                 vocab_path,
                 dataset=None,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 random_seed=None,
                 use_task_id=False,
                 sp_model_path=None,
                 word_dict_path=None,
                 in_tokens=False):
        super(BaseNLPReader, self).__init__(dataset, random_seed)
        self.max_seq_len = max_seq_len
        if sp_model_path and word_dict_path:
            self.tokenizer = tokenization.WSSPTokenizer(vocab_path,
                                                        sp_model_path,
                                                        word_dict_path,
                                                        ws=True,
                                                        lower=True)
        else:
            self.tokenizer = tokenization.FullTokenizer(
                vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]
        self.in_tokens = in_tokens
        self.use_task_id = use_task_id

        if self.use_task_id:
            logger.warning(
                "use_task_id has been de discarded since PaddleHub v1.4.0, it's no necessary to feed task_ids now."
            )
            self.task_id = 0

        self.Record_With_Label_Id = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
        self.Record_Wo_Label_Id = namedtuple(
            'Record', ['token_ids', 'text_type_ids', 'position_ids'])
Example #23
 def check_module_valid(self, module_path):
     try:
         desc_pb_path = os.path.join(module_path, 'module_desc.pb')
         if os.path.exists(desc_pb_path) and os.path.isfile(desc_pb_path):
             info = {}
             desc = module_desc_pb2.ModuleDesc()
             with open(desc_pb_path, "rb") as fp:
                 desc.ParseFromString(fp.read())
             info['version'] = desc.attr.map.data["module_info"].map.data[
                 "version"].s
             info['name'] = desc.attr.map.data["module_info"].map.data[
                 "name"].s
             return True, info
         else:
             module_file = os.path.realpath(
                 os.path.join(module_path, 'module.py'))
             if os.path.exists(module_file):
                 basename = os.path.split(module_path)[-1]
                 dirname = os.path.join(
                     *list(os.path.split(module_path)[:-1]))
                 sys.path.insert(0, dirname)
                 _module = importlib.import_module(
                     "{}.module".format(basename))
                 for _item, _cls in inspect.getmembers(
                         _module, inspect.isclass):
                     _item = _module.__dict__[_item]
                     _file = os.path.realpath(
                         sys.modules[_item.__module__].__file__)
                     if issubclass(
                             _item,
                             hub.Module) and _file.startswith(module_file):
                         version = _item._version
                         break
                 sys.path.pop(0)
                 return True, {'version': version, 'name': _item._name}
             logger.warning(
                 "%s does not exist, the module will be reinstalled" %
                 desc_pb_path)
     except:
         pass
     return False, None
Example #24
    def __new__(cls, name=None, directory=None, module_dir=None, version=None):
        if cls.__name__ == "Module":
            if name:
                module = cls.init_with_name(name=name, version=version)
            elif directory:
                module = cls.init_with_directory(directory=directory)
            elif module_dir:
                logger.warning(
                    "Parameter module_dir is deprecated, please use directory to specify the path"
                )
                if isinstance(module_dir, list) or isinstance(
                        module_dir, tuple):
                    directory = module_dir[0]
                    version = module_dir[1]
                else:
                    directory = module_dir
                module = cls.init_with_directory(directory=directory)
            CacheUpdater("update_cache", module.name, module.version).start()
        else:
            module = object.__new__(cls)

        return module
Example #25
 def _postprocessing(self, run_states):
     if self._compatible_mode:
         try:
             label_list = list(self._base_data_reader.label_map.keys())
         except:
             raise Exception(
                 "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
             )
     else:
         if self._label_list:
             label_list = self._label_list
         else:
             logger.warning(
                 "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
             )
             return run_states
     results = []
     for batch_state in run_states:
         batch_result = batch_state.run_results
         batch_infer = np.argmax(batch_result[0], axis=1)
         results += [
             label_list[sample_infer] for sample_infer in batch_infer
         ]
     return results
Example #26
 def _read_file(self, input_file, phase=False):
     """
     Read a dataset in JSON format.
     """
     examples = []
     drop = 0
     with open(input_file, "r") as reader:
         input_data = json.load(reader)["data"]
     for entry in input_data:
         for paragraph in entry["paragraphs"]:
             paragraph_text = paragraph["context"]
             guid = []
             labels = [0] * len(self.label_list)
             for qa in paragraph["qas"]:
                 guid.append(qa["id"])
                 labels[self.label_list.index(qa["question"].replace(
                     '的主体是什么?', ''))] = 1
             guid = str(list(set(guid)))
             example = InputExample(guid=guid,
                                    label=labels,
                                    text_a=paragraph_text)
             examples.append(example)
     logger.warning("%i bad examples has been dropped" % drop)
     return examples
Example #27
    def _postprocessing(self, run_states):
        results = []
        if self._compatible_mode:
            label_list = list(self._base_data_reader.label_map.keys())
        else:
            if self._label_list:
                label_list = self._label_list
            else:
                logger.warning(
                    "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
                )
                return run_states

        for batch_state in run_states:
            batch_result = batch_state.run_results
            for sample_id in range(len(batch_result[0])):
                sample_result = []
                for category_id in range(len(label_list)):
                    sample_category_prob = batch_result[category_id][sample_id]
                    sample_category_value = np.argmax(sample_category_prob)
                    sample_result.append(
                        {label_list[category_id]: sample_category_value})
                results.append(sample_result)
        return results
Example #28
    def _check_module_integrity(self):
        result = True
        for file_info in self.file_infos:
            file_type = file_info.type
            file_path = file_info.file_name.replace(FILE_SEP, os.sep)
            file_path = os.path.join(self.module_path, file_path)
            if not os.path.exists(file_path):
                if file_info.is_need:
                    logger.warning(
                        "Module integrity check failed! Missing file [%s]" %
                        file_path)
                    result = False
            else:
                if file_type == check_info_pb2.FILE:
                    if not os.path.isfile(file_path):
                        logger.warning("File type check error %s" % file_path)
                        result = False

                if file_type == check_info_pb2.DIR:
                    if not os.path.isdir(file_path):
                        logger.warning("File type check error %s" % file_path)
                        result = False
        return result
Example #29
def get_predictions(all_examples, all_features, all_results, n_best_size,
                    max_answer_length, do_lower_case, version_2_with_negative,
                    null_score_diff_threshold, is_english):

    _PrelimPrediction = collections.namedtuple("PrelimPrediction", [
        "feature_index", "start_index", "end_index", "start_logit", "end_logit"
    ])
    _NbestPrediction = collections.namedtuple(
        "NbestPrediction", ["text", "start_logit", "end_logit"])
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            if feature.unique_id not in unique_id_to_result:
                logger.info(
                    "As using multidevice, the last one batch is so small that the feature %s in the last batch is discarded "
                    % feature.unique_id)
                continue
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)

            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[
                    0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]

            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(
                            start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(feature_index=min_null_feature_index,
                                  start_index=0,
                                  end_index=0,
                                  start_logit=null_start_logit,
                                  end_logit=null_end_logit))
        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x:
                                    (x.start_logit + x.end_logit),
                                    reverse=True)

        seen_predictions = {}
        nbest = []
        if not prelim_predictions:
            logger.warning(("not prelim_predictions:", example.qas_id))
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index +
                                                              1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
                                                                 1)]
                if is_english:
                    tok_text = " ".join(tok_tokens)
                else:
                    tok_text = "".join(tok_tokens)
                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                if is_english:
                    orig_text = " ".join(orig_tokens)
                else:
                    orig_text = "".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                            is_english)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(text=final_text,
                                 start_logit=pred.start_logit,
                                 end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(text="",
                                     start_logit=null_start_logit,
                                     end_logit=null_end_logit))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null
            if best_non_null_entry:
                score_diff -= best_non_null_entry.start_logit + best_non_null_entry.end_logit
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text

        all_nbest_json[example.qas_id] = nbest_json

    return all_predictions, all_nbest_json, scores_diff_json
Example #30
    def generate(self, texts, use_gpu=False, beam_width=5):
        """
        Get the continuation of the input poetry.

        Args:
             texts(list): the beginning part of a poem.
             use_gpu(bool): whether to use the GPU for prediction
             beam_width(int): the beam search width.

        Returns:
             results(list): the poetry continuations.
        """
        if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
            predicted_data = texts
        else:
            raise ValueError(
                "The input texts should be a list with nonempty string elements."
            )
        for i, text in enumerate(texts):
            if len(text) > self.line:
                logger.warning(
                    'The input text: %s, contains more than %i characters, which will be cut off'
                    % (text, self.line))
                texts[i] = text[:self.line]

            for char in text:
                if not '\u4e00' <= char <= '\u9fff':
                    logger.warning(
                        'The input text: %s, contains non-Chinese characters, which may result in magic output'
                        % text)
                    break

        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        with fluid.dygraph.guard(place):
            self.model.eval()
            results = []
            for text in predicted_data:
                sample_results = []
                ids, sids = self.tokenizer.encode(text)
                src_ids = D.to_variable(np.expand_dims(ids, 0))
                src_sids = D.to_variable(np.expand_dims(sids, 0))
                output_ids = beam_search_infilling(
                    self.model,
                    src_ids,
                    src_sids,
                    eos_id=self.tokenizer.sep_id,
                    sos_id=self.tokenizer.cls_id,
                    attn_id=self.tokenizer.vocab['[MASK]'],
                    max_decode_len=80,
                    max_encode_len=20,
                    beam_width=beam_width,
                    tgt_type_id=1)
                output_str = self.rev_lookup(output_ids[0].numpy())

                for ostr in output_str.tolist():
                    if '[SEP]' in ostr:
                        ostr = ostr[:ostr.index('[SEP]')]
                    sample_results.append("".join(ostr))
                results.append(sample_results)
        return results