def from_module_attr_to_pyobj(module_attr):
    if module_attr.type == module_desc_pb2.BOOLEAN:
        result = module_attr.b
    elif module_attr.type == module_desc_pb2.INT:
        result = module_attr.i
    elif module_attr.type == module_desc_pb2.STRING:
        result = module_attr.s
    elif module_attr.type == module_desc_pb2.FLOAT:
        result = module_attr.f
    elif module_attr.type == module_desc_pb2.LIST:
        result = []
        for index in range(len(module_attr.list.data)):
            result.append(
                from_module_attr_to_pyobj(module_attr.list.data[str(index)]))
    elif module_attr.type == module_desc_pb2.SET:
        result = set()
        for index in range(len(module_attr.set.data)):
            result.add(
                from_module_attr_to_pyobj(module_attr.set.data[str(index)]))
    elif module_attr.type == module_desc_pb2.MAP:
        result = {}
        for key, value in module_attr.map.data.items():
            key = get_pykey(key, module_attr.map.key_type[key])
            result[key] = from_module_attr_to_pyobj(value)
    elif module_attr.type == module_desc_pb2.NONE:
        result = None
    elif module_attr.type == module_desc_pb2.OBJECT:
        result = None
        logger.warning("can't convert module attr to python object")
    else:
        result = None
        logger.warning("unknown type of module attr")
    return result
def new_eval_end_event(self, run_states):
    """
    PaddleHub default handler for eval_end_event: it completes visualization
    and metrics calculation.

    Args:
        run_states (object): the results in eval phase
    """
    eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
    if 'train' in self._envs:
        self.vdl_writer.add_scalar(
            tag="Loss_{}".format(self.phase),
            value=eval_loss,
            step=self._envs['train'].current_step)
    log_scores = ""
    s = []
    for metric in eval_scores:
        if 'train' in self._envs:
            self.vdl_writer.add_scalar(
                tag="{}_{}".format(metric, self.phase),
                value=eval_scores[metric],
                step=self._envs['train'].current_step)
            # dev[metric].add_record(self._envs['train'].current_step, eval_scores[metric])
        log_scores += "%s=%.5f " % (metric, eval_scores[metric])
        s.append(eval_scores[metric])
        # dev[metric].add_record(self.current_step, eval_scores[metric])
    logger.eval(
        "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
        (self.phase, eval_loss, log_scores, run_speed))
    s.append(eval_loss)
    if 'train' in self._envs:
        s = [self._envs['train'].current_step] + s
        # dev_loss.add_record(self._envs['train'].current_step, eval_loss)
    s = [str(x) for x in s]
    with open('./work/log/%s_dev%s.txt' % (args.do_model, id), 'a',
              encoding='utf-8') as f:
        f.write(','.join(s) + '\n')

    eval_scores_items = eval_scores.items()
    if len(eval_scores_items):
        # The first metric will be chosen to evaluate.
        if args.dev_goal == 'f1':
            main_metric, main_value = list(eval_scores_items)[0]
        else:  # loss
            main_metric, main_value = "negative loss", -eval_loss
    else:
        logger.warning(
            "None of the metrics has been implemented, loss will be used to evaluate."
        )
        # The larger, the better.
        main_metric, main_value = "negative loss", -eval_loss

    if self.phase in ["dev", "val"] and main_value > self.best_score:
        self.best_score = main_value
        model_saved_dir = os.path.join(self.config.checkpoint_dir, "best_model")
        logger.eval("best model saved to %s [best %s=%.5f]" %
                    (model_saved_dir, main_metric, main_value))
        self.save_inference_model(dirname=model_saved_dir)
def __new__(cls,
            name=None,
            directory=None,
            module_dir=None,
            version=None,
            **kwargs):
    if cls.__name__ == "Module":
        if name:
            module = cls.init_with_name(name=name, version=version, **kwargs)
        elif directory:
            module = cls.init_with_directory(directory=directory, **kwargs)
        elif module_dir:
            logger.warning(
                "Parameter module_dir is deprecated, please use directory to specify the path"
            )
            if isinstance(module_dir, list) or isinstance(module_dir, tuple):
                directory = module_dir[0]
                version = module_dir[1]
            else:
                directory = module_dir
            module = cls.init_with_directory(directory=directory, **kwargs)
        CacheUpdater("update_cache", module.name, module.version).start()
    else:
        if not name and not directory:
            directory = os.path.dirname(
                os.path.abspath(sys.modules[cls.__module__].__file__))
            module = Module.init_with_directory(directory=directory, **kwargs)
        else:
            module = object.__new__(cls)
    return module
def _check_paddle_version(self):
    if version_compare(self.paddle_version, paddle.__version__):
        logger.warning(
            "This Module is generated by PaddlePaddle version %s, while the local PaddlePaddle version is %s, which may cause a serious incompatibility bug. Please upgrade PaddlePaddle to the latest version."
            % (self.paddle_version, paddle.__version__))
        return False
    return True
def _check_module_proto_version(self):
    if self.module_proto_version != module_proto_version:
        logger.warning(
            "Module description file version cannot be aligned with PaddleHub version"
        )
        return False
    return True
def _postprocessing(self, run_states):
    if self._compatible_mode:
        id2label = {
            val: key
            for key, val in self._base_data_reader.label_map.items()
        }
    else:
        if self._label_list:
            id2label = {}
            for index, label in enumerate(self._label_list):
                id2label[index] = label
        else:
            logger.warning(
                "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
            )
            return run_states

    results = []
    for batch_states in run_states:
        batch_results = batch_states.run_results
        batch_infers = batch_results[0].reshape([-1]).astype(np.int32).tolist()
        seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
        current_id = 0
        for length in seq_lens:
            seq_infers = batch_infers[current_id:current_id + length]
            seq_result = list(map(id2label.get, seq_infers[1:-1]))
            current_id += length if self.add_crf else self.max_seq_len
            results.append(seq_result)
    return results
def predict(self,
            data=None,
            label_list=None,
            load_best_model=True,
            return_result=False,
            accelerate_mode=True):
    """
    Make prediction for the input data.

    Args:
        data (list): the data to be predicted. Each element should be a record
            when the task is initialized without the data_reader parameter, or a
            plaintext string list when the task is initialized with the
            data_reader parameter (deprecated in PaddleHub v1.8).
        label_list (list): the label list, used to postprocess the output.
        load_best_model (bool): load the best model or not.
        return_result (bool): return a readable result or just the raw run result.
            Always True when the task is initialized with the dataset parameter
            instead of data_reader.
        accelerate_mode (bool): use the high-performance predictor or not.

    Returns:
        RunState: the running result of the predict phase.
    """
    if accelerate_mode:
        if isinstance(self._base_data_reader, hub.reader.LACClassifyReader):
            logger.warning(
                "LACClassifyReader does not support predictor, the accelerate_mode is closed now."
            )
            accelerate_mode = False
        elif isinstance(self, hub.TextGenerationTask):
            logger.warning(
                "TextGenerationTask does not support predictor, the accelerate_mode is closed now."
            )
            accelerate_mode = False
    self.accelerate_mode = accelerate_mode

    with self.phase_guard(phase="predict"):
        self._predict_data = data
        if label_list:
            self._label_list = label_list
        self._predict_start_event()

        if load_best_model:
            self.init_if_load_best_model()
        else:
            self.init_if_necessary()
        if not self.accelerate_mode:
            run_states = self._run()
        else:
            if not self._predictor:
                self._predictor = self._create_predictor()
            run_states = self._run_with_predictor()

        self._predict_end_event(run_states)
        self._predict_data = None
        if return_result or not self._compatible_mode:
            return self._postprocessing(run_states)
    return run_states
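# A minimal usage sketch for the predict() method above. This is illustrative only:
# `task` is assumed to be an already-constructed classification task (e.g. built from a
# module, reader/dataset and RunConfig elsewhere), and the sample records and label
# names are invented, not taken from this source.
sample_records = [["the food was delicious"], ["the waiter ignored us"]]
readable_results = task.predict(
    data=sample_records,
    label_list=["negative", "positive"],
    load_best_model=True,
    return_result=True)
print(readable_results)  # readable labels instead of raw RunState objects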
def generate(self, texts, use_gpu=False, beam_width=5):
    """
    Get the continuation of the input poetry.

    Args:
        texts (list): the front part of a poetry.
        use_gpu (bool): whether to use gpu to predict or not.
        beam_width (int): the beam search width.

    Returns:
        results (list): the poetry continuations.
    """
    paddle.disable_static()

    if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
        predicted_data = texts
    else:
        raise ValueError(
            "The input texts should be a list with nonempty string elements.")

    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES was not set while use_gpu=True"
        )
    paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu')

    self.model.eval()
    results = []
    for text in predicted_data:
        sample_results = []
        encode_text = self.tokenizer.encode(text)
        src_ids = paddle.to_tensor(encode_text['input_ids']).unsqueeze(0)
        src_sids = paddle.to_tensor(encode_text['token_type_ids']).unsqueeze(0)
        output_ids = beam_search_infilling(
            self.model,
            src_ids,
            src_sids,
            eos_id=self.tokenizer.vocab['[SEP]'],
            sos_id=self.tokenizer.vocab['[CLS]'],
            attn_id=self.tokenizer.vocab['[MASK]'],
            pad_id=self.tokenizer.vocab['[PAD]'],
            unk_id=self.tokenizer.vocab['[UNK]'],
            vocab_size=len(self.tokenizer.vocab),
            max_decode_len=80,
            max_encode_len=20,
            beam_width=beam_width,
            tgt_type_id=1)
        output_str = self.rev_lookup(output_ids[0])
        for ostr in output_str.tolist():
            if '[SEP]' in ostr:
                ostr = ostr[:ostr.index('[SEP]')]
            sample_results.append("".join(ostr))
        results.append(sample_results)
    return results
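# A usage sketch for the generate() method above. Illustrative only: the module name
# "ernie_gen_poetry" and the sample verse are assumptions made for the example and are
# not taken from this source.
import paddlehub as hub

poetry_module = hub.Module(name="ernie_gen_poetry")
continuations = poetry_module.generate(
    texts=["白日依山尽，黄河入海流。"], use_gpu=False, beam_width=5)
# continuations[0] holds beam_width candidate continuations for the first input.
print(continuations[0])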
def generate(self, texts, use_gpu=False, beam_width=5):
    """
    Get the predict result from the input texts.

    Args:
        texts (list): the input texts.
        use_gpu (bool): whether to use gpu to predict or not.
        beam_width (int): the beam search width.

    Returns:
        results (list): the predict result.
    """
    if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
        predicted_data = texts
    else:
        raise ValueError(
            "The input texts should be a list with nonempty string elements.")

    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES was not set while use_gpu=True"
        )
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with fluid.dygraph.guard(place):
        self.model.eval()
        results = []
        for text in predicted_data:
            sample_results = []
            ids, sids = self.tokenizer.encode(text)
            src_ids = D.to_variable(np.expand_dims(ids, 0))
            src_sids = D.to_variable(np.expand_dims(sids, 0))
            output_ids = beam_search_infilling(
                self.model,
                src_ids,
                src_sids,
                eos_id=self.tokenizer.sep_id,
                sos_id=self.tokenizer.cls_id,
                attn_id=self.tokenizer.vocab['[MASK]'],
                max_decode_len=50,
                max_encode_len=50,
                beam_width=beam_width,
                tgt_type_id=1)
            output_str = self.rev_lookup(output_ids[0].numpy())
            for ostr in output_str.tolist():
                if '[SEP]' in ostr:
                    ostr = ostr[:ostr.index('[SEP]')]
                sample_results.append("".join(ostr))
            results.append(sample_results)
    return results
def _read_file(self, input_file, phase=None):
    """Reads a tab-separated value file."""
    has_warned = False
    with io.open(input_file, "r", encoding="UTF-8") as file:
        reader = csv.reader(file, delimiter="\t", quotechar=None)
        examples = []
        for (i, line) in enumerate(reader):
            if i == 0:
                ncol = len(line)
                if self.if_file_with_header[phase]:
                    continue
            if len(line) != ncol:
                print(line)
            if phase != "predict":
                if ncol == 1:
                    raise Exception(
                        "the %s file: %s only has one column but it is not a predict file"
                        % (phase, input_file))
                elif ncol == 2:
                    example = InputExample(
                        guid=i, text_a=line[0], label=line[1])
                elif ncol == 3:
                    example = InputExample(
                        guid=i, text_a=line[0], text_b=line[1], label=line[2])
                else:
                    raise Exception(
                        "the %s file: %s has too many columns (should <= 3)" %
                        (phase, input_file))
            else:
                if ncol == 1:
                    example = InputExample(guid=i, text_a=line[0])
                elif ncol == 2:
                    if not has_warned:
                        logger.warning(
                            "the predict file: %s has 2 columns; as it is a predict file, the second column will be regarded as text_b"
                            % (input_file))
                        has_warned = True
                    example = InputExample(
                        guid=i, text_a=line[0], text_b=line[1])
                else:
                    raise Exception(
                        "the predict file: %s has too many columns (should <= 2)"
                        % (input_file))
            examples.append(example)
            # print(example)
        return examples
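# For illustration, a toy two-column training TSV that the ncol == 2 branch above would
# accept. The file name and contents are invented; the header row is only skipped when the
# corresponding *_file_with_header flag is set on the dataset.
sample_rows = [
    "text_a\tlabel",
    "the food was delicious\t1",
    "the waiter ignored us\t0",
]
with open("train.tsv", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_rows) + "\n")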
def preprocess(text):
    data_dict = {self.feed_key: [text]}
    processed = self.lac.lexical_analysis(data=data_dict)
    processed = [
        self.vocab[word] for word in processed[0]['word']
        if word in self.vocab
    ]
    if len(processed) == 0:
        if six.PY2:
            text = text.encode(sys_stdout_encoding())
        logger.warning(
            "The words in text %s can't be found in the vocabulary." % (text))
    return processed
def __init__(self,
             base_path,
             train_file=None,
             dev_file=None,
             test_file=None,
             predict_file=None,
             label_file=None,
             label_list=None,
             train_file_with_header=False,
             dev_file_with_header=False,
             test_file_with_header=False,
             predict_file_with_header=False):
    if not (train_file or dev_file or test_file):
        raise ValueError("At least one file should be assigned")
    self.base_path = base_path
    self.train_file = train_file
    self.dev_file = dev_file
    self.test_file = test_file
    self.predict_file = predict_file
    self.label_file = label_file
    self.label_list = label_list

    self.train_examples = []
    self.dev_examples = []
    self.test_examples = []
    self.predict_examples = []

    self.if_file_with_header = {
        "train": train_file_with_header,
        "dev": dev_file_with_header,
        "test": test_file_with_header,
        "predict": predict_file_with_header
    }

    if train_file:
        self._load_train_examples()
    if dev_file:
        self._load_dev_examples()
    if test_file:
        self._load_test_examples()
    if predict_file:
        self._load_predict_examples()
    if self.label_file:
        if not self.label_list:
            self.label_list = self._load_label_data()
        else:
            logger.warning(
                "As label_list has been assigned, label_file will be ignored")
def from_pyobj_to_module_attr(pyobj, module_attr, obj_filter=None):
    if obj_filter and obj_filter(pyobj):
        return
    if isinstance(pyobj, bool):
        module_attr.type = module_desc_pb2.BOOLEAN
        module_attr.b = pyobj
    elif isinstance(pyobj, six.integer_types):
        module_attr.type = module_desc_pb2.INT
        module_attr.i = pyobj
    elif isinstance(pyobj, six.text_type):
        module_attr.type = module_desc_pb2.STRING
        module_attr.s = pyobj
    elif isinstance(pyobj, six.binary_type):
        module_attr.type = module_desc_pb2.STRING
        module_attr.s = pyobj
    elif isinstance(pyobj, float):
        module_attr.type = module_desc_pb2.FLOAT
        module_attr.f = pyobj
    elif isinstance(pyobj, list) or isinstance(pyobj, tuple):
        module_attr.type = module_desc_pb2.LIST
        for index, obj in enumerate(pyobj):
            from_pyobj_to_module_attr(obj, module_attr.list.data[str(index)],
                                      obj_filter)
    elif isinstance(pyobj, set):
        module_attr.type = module_desc_pb2.SET
        for index, obj in enumerate(list(pyobj)):
            from_pyobj_to_module_attr(obj, module_attr.set.data[str(index)],
                                      obj_filter)
    elif isinstance(pyobj, dict):
        module_attr.type = module_desc_pb2.MAP
        for key, value in pyobj.items():
            from_pyobj_to_module_attr(value, module_attr.map.data[str(key)],
                                      obj_filter)
            module_attr.map.key_type[str(key)] = get_keyed_type_of_pyobj(key)
    elif isinstance(pyobj, type(None)):
        module_attr.type = module_desc_pb2.NONE
    else:
        module_attr.type = module_desc_pb2.OBJECT
        module_attr.name = str(pyobj.__class__.__name__)
        if not hasattr(pyobj, "__dict__"):
            logger.warning("python object %s has no __dict__ attribute" %
                           module_attr.name)
            return
        for key, value in pyobj.__dict__.items():
            from_pyobj_to_module_attr(value,
                                      module_attr.object.data[str(key)],
                                      obj_filter)
            module_attr.object.key_type[str(key)] = get_keyed_type_of_pyobj(
                key)
def step(self):
    if self.scheduler["gradual_unfreeze"] > 0:
        self.epoch += 1
        if self.max_depth > 0 and self.epoch <= self.scheduler[
                "gradual_unfreeze"]:
            set_gradual_unfreeze(
                self.main_program,
                unfreeze_depths=self.
                sorted_depth[:self.max_depth * self.epoch //
                             self.scheduler["gradual_unfreeze"]])
        else:
            logger.warning(
                "The max op-depth in the network is %s, so the gradual unfreeze finetune strategy cannot be used."
                % (self.max_depth))
    else:
        pass
def __init__(self): logger.warning( "ImageClassificationDataset is no longer recommended from PaddleHub v1.5.0, " "please use BaseCVDataset instead of ImageClassificationDataset. " "It's more easy-to-use with more functions and support evaluating test set " "in the end of finetune automatically.") self.base_path = None self.train_list_file = None self.test_list_file = None self.validate_list_file = None self.label_list_file = None self.num_labels = 0 self.label_list = [] self.train_examples = [] self.dev_examples = [] self.test_examples = []
def predict(self,
            data,
            load_best_model=True,
            return_result=False,
            accelerate_mode=True):
    """
    make prediction for the input data.

    Args:
        data (list): the data will be predicted.
        load_best_model (bool): load the best model or not
        return_result (bool): return a readable result or just the raw run result
        accelerate_mode (bool): use high-performance predictor or not

    Returns:
        RunState: the running result of predict phase
    """
    if not version_compare(paddle.__version__, "1.6.2") and accelerate_mode:
        logger.warning(
            "Fail to open predict accelerate mode as it does not support paddle < 1.6.2. Please update PaddlePaddle."
        )
        accelerate_mode = False
    self.accelerate_mode = accelerate_mode

    with self.phase_guard(phase="predict"):
        self._predict_data = data
        self._predict_start_event()

        if load_best_model:
            self.init_if_load_best_model()
        else:
            self.init_if_necessary()
        if not self.accelerate_mode:
            run_states = self._run()
        else:
            if not self._predictor:
                self._predictor = self._create_predictor()
            run_states = self._run_with_predictor()

        self._predict_end_event(run_states)
        self._predict_data = None
        if return_result:
            return self._postprocessing(run_states)
    return run_states
def predict(self,
            data,
            load_best_model=True,
            return_result=False,
            accelerate_mode=True):
    """
    make prediction for the input data.

    Args:
        data (list): the data will be predicted.
        load_best_model (bool): load the best model or not
        return_result (bool): return a readable result or just the raw run result
        accelerate_mode (bool): use high-performance predictor or not

    Returns:
        RunState: the running result of predict phase
    """
    if accelerate_mode and isinstance(self._base_data_reader,
                                      hub.reader.LACClassifyReader):
        logger.warning(
            "LACClassifyReader does not support predictor, the accelerate_mode is closed now."
        )
        accelerate_mode = False
    self.accelerate_mode = accelerate_mode

    with self.phase_guard(phase="predict"):
        self._predict_data = data
        self._predict_start_event()

        if load_best_model:
            self.init_if_load_best_model()
        else:
            self.init_if_necessary()
        if not self.accelerate_mode:
            run_states = self._run()
        else:
            if not self._predictor:
                self._predictor = self._create_predictor()
            run_states = self._run_with_predictor()

        self._predict_end_event(run_states)
        self._predict_data = None
        if return_result:
            return self._postprocessing(run_states)
    return run_states
def check_module_valid(self, module_path):
    try:
        desc_pb_path = os.path.join(module_path, 'module_desc.pb')
        if os.path.exists(desc_pb_path) and os.path.isfile(desc_pb_path):
            info = {}
            desc = module_desc_pb2.ModuleDesc()
            with open(desc_pb_path, "rb") as fp:
                desc.ParseFromString(fp.read())
            info['version'] = desc.attr.map.data["module_info"].map.data[
                "version"].s
            return True, info
        else:
            logger.warning(
                "%s does not exist, the module will be reinstalled" %
                desc_pb_path)
    except:
        pass
    return False, None
def _default_eval_end_event(self, run_states):
    """
    PaddleHub default handler for eval_end_event: it completes visualization
    and metrics calculation.

    Args:
        run_states (object): the results in eval phase
    """
    eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
    if 'train' in self._envs:
        self.tb_writer.add_scalar(
            tag="Loss_{}".format(self.phase),
            scalar_value=eval_loss,
            global_step=self._envs['train'].current_step)
    log_scores = ""
    for metric in eval_scores:
        if 'train' in self._envs:
            self.tb_writer.add_scalar(
                tag="{}_{}".format(metric, self.phase),
                scalar_value=eval_scores[metric],
                global_step=self._envs['train'].current_step)
        log_scores += "%s=%.5f " % (metric, eval_scores[metric])
    logger.eval(
        "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
        (self.phase, eval_loss, log_scores, run_speed))

    eval_scores_items = eval_scores.items()
    if len(eval_scores_items):
        # The first metric will be chosen to evaluate.
        main_metric, main_value = list(eval_scores_items)[0]
    else:
        logger.warning(
            "None of the metrics has been implemented, loss will be used to evaluate."
        )
        # The larger, the better.
        main_metric, main_value = "negative loss", -eval_loss

    if self.phase in ["dev", "val"] and main_value > self.best_score:
        self.best_score = main_value
        model_saved_dir = os.path.join(self.config.checkpoint_dir, "best_model")
        logger.eval("best model saved to %s [best %s=%.5f]" %
                    (model_saved_dir, main_metric, main_value))
        self.save_inference_model(dirname=model_saved_dir)
def __init__(self,
             log_interval=10,
             eval_interval=100,
             use_pyreader=True,
             use_data_parallel=True,
             save_ckpt_interval=None,
             use_cuda=True,
             checkpoint_dir=None,
             num_epoch=1,
             batch_size=32,
             enable_memory_optim=False,
             strategy=None):
    """Construct the finetune Config."""
    self._log_interval = log_interval
    self._eval_interval = eval_interval
    self._save_ckpt_interval = save_ckpt_interval
    self._use_cuda = use_cuda
    self._checkpoint_dir = checkpoint_dir
    self._num_epoch = num_epoch
    self._batch_size = batch_size
    if not use_pyreader:
        logger.warning(
            "The parameter use_pyreader has been dropped! PaddleHub v1.8.0 and later always use pyreader."
        )
    self._use_pyreader = True
    self._use_data_parallel = use_data_parallel
    if strategy is None:
        self._strategy = DefaultStrategy()
    else:
        self._strategy = strategy
    if enable_memory_optim:
        logger.warning(
            "The memory optimization feature has been dropped! PaddleHub no longer optimizes the memory of the program."
        )
    self._enable_memory_optim = False
    if checkpoint_dir is None:
        now = int(time.time())
        time_str = time.strftime("%Y%m%d%H%M%S", time.localtime(now))
        self._checkpoint_dir = "ckpt_" + time_str
    else:
        self._checkpoint_dir = checkpoint_dir
    logger.info("Checkpoint dir: {}".format(self._checkpoint_dir))
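# A minimal construction sketch for the finetune config above, assuming the class is
# exposed as hub.RunConfig and that hub.AdamWeightDecayStrategy is available; the
# checkpoint directory name is invented for the example.
import paddlehub as hub

config = hub.RunConfig(
    use_cuda=False,                      # fall back to CPU
    num_epoch=3,
    batch_size=32,
    checkpoint_dir="ckpt_demo",          # omit to get an auto "ckpt_<timestamp>" dir
    strategy=hub.AdamWeightDecayStrategy())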
def _eval_end_event(self, run_states):
    eval_scores, eval_loss, run_speed = self._calculate_metrics(run_states)
    if 'train' in self._envs:
        self.tb_writer.add_scalar(
            tag="Loss_{}".format(self.phase),
            scalar_value=eval_loss,
            global_step=self._envs['train'].current_step)
    log_scores = ""
    for metric in eval_scores:
        if 'train' in self._envs:
            self.tb_writer.add_scalar(
                tag="{}_{}".format(metric, self.phase),
                scalar_value=eval_scores[metric],
                global_step=self._envs['train'].current_step)
        log_scores += "%s=%.5f " % (metric, eval_scores[metric])
    logger.info(
        "[%s dataset evaluation result] loss=%.5f %s[step/sec: %.2f]" %
        (self.phase, eval_loss, log_scores, run_speed))

    eval_scores_items = eval_scores.items()
    if len(eval_scores_items):
        # The first metric will be chosen to evaluate.
        main_metric, main_value = list(eval_scores_items)[0]
    else:
        logger.warning(
            "None of the metrics has been implemented, loss will be used to evaluate."
        )
        # The larger, the better.
        main_metric, main_value = "negative loss", -eval_loss

    if self.phase in ["dev", "val"] and main_value > self.best_score:
        self.best_score = main_value
        model_saved_dir = os.path.join(self.config.checkpoint_dir, "best_model")
        logger.info("best model saved to %s [best %s=%.5f]" %
                    (model_saved_dir, main_metric, main_value))
        save_result = fluid.io.save_persistables(
            executor=self.exe,
            dirname=model_saved_dir,
            main_program=self.main_program)
def __init__(self,
             vocab_path,
             dataset=None,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             random_seed=None,
             use_task_id=False,
             sp_model_path=None,
             word_dict_path=None,
             in_tokens=False):
    super(BaseNLPReader, self).__init__(dataset, random_seed)
    self.max_seq_len = max_seq_len
    if sp_model_path and word_dict_path:
        self.tokenizer = tokenization.WSSPTokenizer(
            vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
    else:
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
    self.vocab = self.tokenizer.vocab
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]
    self.mask_id = self.vocab["[MASK]"]
    self.in_tokens = in_tokens
    self.use_task_id = use_task_id

    if self.use_task_id:
        logger.warning(
            "use_task_id has been discarded since PaddleHub v1.4.0, it is no longer necessary to feed task_ids."
        )
        self.task_id = 0

    self.Record_With_Label_Id = namedtuple(
        'Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
    self.Record_Wo_Label_Id = namedtuple(
        'Record', ['token_ids', 'text_type_ids', 'position_ids'])
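# A construction sketch for a reader built on the BaseNLPReader above. Illustrative only:
# it assumes hub.reader.ClassifyReader is a concrete subclass and that the "ernie" module
# and ChnSentiCorp dataset are available; none of these names are taken from this source.
import paddlehub as hub

module = hub.Module(name="ernie")
dataset = hub.dataset.ChnSentiCorp()
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),  # reuse the module's own vocabulary file
    max_seq_len=128)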
def check_module_valid(self, module_path):
    try:
        desc_pb_path = os.path.join(module_path, 'module_desc.pb')
        if os.path.exists(desc_pb_path) and os.path.isfile(desc_pb_path):
            info = {}
            desc = module_desc_pb2.ModuleDesc()
            with open(desc_pb_path, "rb") as fp:
                desc.ParseFromString(fp.read())
            info['version'] = desc.attr.map.data["module_info"].map.data[
                "version"].s
            info['name'] = desc.attr.map.data["module_info"].map.data[
                "name"].s
            return True, info
        else:
            module_file = os.path.realpath(
                os.path.join(module_path, 'module.py'))
            if os.path.exists(module_file):
                basename = os.path.split(module_path)[-1]
                dirname = os.path.join(*list(os.path.split(module_path)[:-1]))
                sys.path.insert(0, dirname)
                _module = importlib.import_module("{}.module".format(basename))
                for _item, _cls in inspect.getmembers(_module, inspect.isclass):
                    _item = _module.__dict__[_item]
                    _file = os.path.realpath(
                        sys.modules[_item.__module__].__file__)
                    if issubclass(
                            _item,
                            hub.Module) and _file.startswith(module_file):
                        version = _item._version
                        break
                sys.path.pop(0)
                return True, {'version': version, 'name': _item._name}
            logger.warning(
                "%s does not exist, the module will be reinstalled" %
                desc_pb_path)
    except:
        pass
    return False, None
def __new__(cls, name=None, directory=None, module_dir=None, version=None):
    if cls.__name__ == "Module":
        if name:
            module = cls.init_with_name(name=name, version=version)
        elif directory:
            module = cls.init_with_directory(directory=directory)
        elif module_dir:
            logger.warning(
                "Parameter module_dir is deprecated, please use directory to specify the path"
            )
            if isinstance(module_dir, list) or isinstance(module_dir, tuple):
                directory = module_dir[0]
                version = module_dir[1]
            else:
                directory = module_dir
            module = cls.init_with_directory(directory=directory)
        CacheUpdater("update_cache", module.name, module.version).start()
    else:
        module = object.__new__(cls)
    return module
def _postprocessing(self, run_states):
    if self._compatible_mode:
        try:
            label_list = list(self._base_data_reader.label_map.keys())
        except:
            raise Exception(
                "ImageClassificationDataset does not support postprocessing, please use BaseCVDataset instead"
            )
    else:
        if self._label_list:
            label_list = self._label_list
        else:
            logger.warning(
                "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
            )
            return run_states

    results = []
    for batch_state in run_states:
        batch_result = batch_state.run_results
        batch_infer = np.argmax(batch_result[0], axis=1)
        results += [label_list[sample_infer] for sample_infer in batch_infer]
    return results
def _read_file(self, input_file, phase=False):
    """Read a dataset in JSON format."""
    examples = []
    drop = 0
    with open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            guid = []
            labels = [0] * len(self.label_list)
            for qa in paragraph["qas"]:
                guid.append(qa["id"])
                labels[self.label_list.index(
                    qa["question"].replace('的主体是什么?', ''))] = 1
            guid = str(list(set(guid)))
            example = InputExample(
                guid=guid, label=labels, text_a=paragraph_text)
            examples.append(example)
    logger.warning("%i bad examples have been dropped" % drop)
    return examples
def _postprocessing(self, run_states):
    results = []
    if self._compatible_mode:
        label_list = list(self._base_data_reader.label_map.keys())
    else:
        if self._label_list:
            label_list = self._label_list
        else:
            logger.warning(
                "Fail to postprocess the predict output. Please set label_list parameter in predict function or initialize the task with dataset parameter."
            )
            return run_states

    for batch_state in run_states:
        batch_result = batch_state.run_results
        for sample_id in range(len(batch_result[0])):
            sample_result = []
            for category_id in range(len(label_list)):
                sample_category_prob = batch_result[category_id][sample_id]
                sample_category_value = np.argmax(sample_category_prob)
                sample_result.append(
                    {label_list[category_id]: sample_category_value})
            results.append(sample_result)
    return results
def _check_module_integrity(self):
    result = True
    for file_info in self.file_infos:
        file_type = file_info.type
        file_path = file_info.file_name.replace(FILE_SEP, os.sep)
        file_path = os.path.join(self.module_path, file_path)
        if not os.path.exists(file_path):
            if file_info.is_need:
                logger.warning(
                    "Module integrity check failed! Missing file [%s]" %
                    file_path)
                result = False
        else:
            if file_type == check_info_pb2.FILE:
                if not os.path.isfile(file_path):
                    logger.warning("File type check error %s" % file_path)
                    result = False
            if file_type == check_info_pb2.DIR:
                if not os.path.isdir(file_path):
                    logger.warning("File type check error %s" % file_path)
                    result = False
    return result
def get_predictions(all_examples, all_features, all_results, n_best_size,
                    max_answer_length, do_lower_case,
                    version_2_with_negative, null_score_diff_threshold,
                    is_english):
    _PrelimPrediction = collections.namedtuple("PrelimPrediction", [
        "feature_index", "start_index", "end_index", "start_logit", "end_logit"
    ])
    _NbestPrediction = collections.namedtuple(
        "NbestPrediction", ["text", "start_logit", "end_logit"])

    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]

        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        min_null_feature_index = 0  # the paragraph slice with min null score
        null_start_logit = 0  # the start logit at the slice with min null score
        null_end_logit = 0  # the end logit at the slice with min null score
        for (feature_index, feature) in enumerate(features):
            if feature.unique_id not in unique_id_to_result:
                logger.info(
                    "As multiple devices are used, the last batch is so small that feature %s in the last batch is discarded"
                    % feature.unique_id)
                continue
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            # if we could have irrelevant answers, get the min score of irrelevant
            if version_2_with_negative:
                feature_null_score = result.start_logits[
                    0] + result.end_logits[0]
                if feature_null_score < score_null:
                    score_null = feature_null_score
                    min_null_feature_index = feature_index
                    null_start_logit = result.start_logits[0]
                    null_end_logit = result.end_logits[0]
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(
                            start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        if version_2_with_negative:
            prelim_predictions.append(
                _PrelimPrediction(
                    feature_index=min_null_feature_index,
                    start_index=0,
                    end_index=0,
                    start_logit=null_start_logit,
                    end_logit=null_end_logit))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_logit + x.end_logit),
            reverse=True)

        seen_predictions = {}
        nbest = []
        if not prelim_predictions:
            logger.warning(("not prelim_predictions:", example.qas_id))
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            if pred.start_index > 0:  # this is a non-null prediction
                tok_tokens = feature.tokens[pred.start_index:(pred.end_index +
                                                              1)]
                orig_doc_start = feature.token_to_orig_map[pred.start_index]
                orig_doc_end = feature.token_to_orig_map[pred.end_index]
                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end +
                                                                 1)]
                if is_english:
                    tok_text = " ".join(tok_tokens)
                else:
                    tok_text = "".join(tok_tokens)

                # De-tokenize WordPieces that have been split off.
                tok_text = tok_text.replace(" ##", "")
                tok_text = tok_text.replace("##", "")

                # Clean whitespace
                tok_text = tok_text.strip()
                tok_text = " ".join(tok_text.split())
                if is_english:
                    orig_text = " ".join(orig_tokens)
                else:
                    orig_text = "".join(orig_tokens)

                final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                            is_english)
                if final_text in seen_predictions:
                    continue

                seen_predictions[final_text] = True
            else:
                final_text = ""
                seen_predictions[final_text] = True

            nbest.append(
                _NbestPrediction(
                    text=final_text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit))

        # if we didn't include the empty option in the n-best, include it
        if version_2_with_negative:
            if "" not in seen_predictions:
                nbest.append(
                    _NbestPrediction(
                        text="",
                        start_logit=null_start_logit,
                        end_logit=null_end_logit))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))

        assert len(nbest) >= 1

        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
            if not best_non_null_entry:
                if entry.text:
                    best_non_null_entry = entry

        probs = _compute_softmax(total_scores)

        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = entry.start_logit
            output["end_logit"] = entry.end_logit
            nbest_json.append(output)

        assert len(nbest_json) >= 1

        if not version_2_with_negative:
            all_predictions[example.qas_id] = nbest_json[0]["text"]
        else:
            # predict "" iff the null score - the score of best non-null > threshold
            score_diff = score_null
            if best_non_null_entry:
                score_diff -= (best_non_null_entry.start_logit +
                               best_non_null_entry.end_logit)
            scores_diff_json[example.qas_id] = score_diff
            if score_diff > null_score_diff_threshold:
                all_predictions[example.qas_id] = ""
            else:
                all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json

    return all_predictions, all_nbest_json, scores_diff_json
def generate(self, texts, use_gpu=False, beam_width=5):
    """
    Get the continuation of the input poetry.

    Args:
        texts (list): the front part of a poetry.
        use_gpu (bool): whether to use gpu to predict or not.
        beam_width (int): the beam search width.

    Returns:
        results (list): the poetry continuations.
    """
    if texts and isinstance(texts, list) and all(texts) and all(
            [isinstance(text, str) for text in texts]):
        predicted_data = texts
    else:
        raise ValueError(
            "The input texts should be a list with nonempty string elements.")
    for i, text in enumerate(texts):
        if len(text) > self.line:
            logger.warning(
                'The input text: %s contains more than %i characters and will be cut off'
                % (text, self.line))
            texts[i] = text[:self.line]

        for char in text:
            if not '\u4e00' <= char <= '\u9fff':
                logger.warning(
                    'The input text: %s contains non-Chinese characters, which may result in unexpected output'
                    % text)
                break

    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES was not set while use_gpu=True"
        )
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    with fluid.dygraph.guard(place):
        self.model.eval()
        results = []
        for text in predicted_data:
            sample_results = []
            ids, sids = self.tokenizer.encode(text)
            src_ids = D.to_variable(np.expand_dims(ids, 0))
            src_sids = D.to_variable(np.expand_dims(sids, 0))
            output_ids = beam_search_infilling(
                self.model,
                src_ids,
                src_sids,
                eos_id=self.tokenizer.sep_id,
                sos_id=self.tokenizer.cls_id,
                attn_id=self.tokenizer.vocab['[MASK]'],
                max_decode_len=80,
                max_encode_len=20,
                beam_width=beam_width,
                tgt_type_id=1)
            output_str = self.rev_lookup(output_ids[0].numpy())
            for ostr in output_str.tolist():
                if '[SEP]' in ostr:
                    ostr = ostr[:ostr.index('[SEP]')]
                sample_results.append("".join(ostr))
            results.append(sample_results)
    return results