def resume_from_ckpt(self, model_module, cfg):
    if cfg.resume_from_checkpoint is None:
        return
    meta_file = cfg.resume_from_checkpoint + ".meta.bin"
    model_file = cfg.resume_from_checkpoint + ".bin"
    if "oss://" in cfg.resume_from_checkpoint:
        # Download remote checkpoint files to local disk first.
        local_file = "easytexminer_resume_pytorch_model.meta.bin"
        io.download(meta_file, local_file)
        meta_file = local_file
        local_file = "easytexminer_resume_pytorch_model.bin"
        io.download(model_file, local_file)
        model_file = local_file
    with io.open(meta_file, "rb") as f:
        meta_data = torch.load(f, map_location="cpu")
    self._start_epoch = meta_data["epoch"]
    self._start_global_step = meta_data["global_step"] + 1
    self._optimizer.load_state_dict(meta_data["optimizer"])
    logger.info("Resume from checkpoint {}".format(cfg.resume_from_checkpoint))
    logger.info("Start epoch {}".format(self._start_epoch))
    logger.info("Start step {}".format(self._start_global_step))
    logger.info("Start learning rate {:.6f}".format(
        self._optimizer.get_current_lr()))
    with io.open(model_file, "rb") as f:
        model_module.load_state_dict(torch.load(f, map_location="cpu"))
    logger.info("Resume from checkpoint {} done".format(
        cfg.resume_from_checkpoint))
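# A hedged usage sketch: resume from a step checkpoint written by
# save_checkpoint() elsewhere in this repo. `trainer`, `model`, and `cfg`
# stand in for a trainer instance, its model module, and its config object;
# the step number is made up for illustration.
def _demo_resume(trainer, model, cfg):
    # save_checkpoint() writes <prefix>.bin / <prefix>.meta.bin pairs;
    # resume_from_checkpoint takes the shared prefix.
    cfg.resume_from_checkpoint = os.path.join(cfg.checkpoint_dir,
                                              "pytorch_model_step_1000")
    trainer.resume_from_ckpt(model, cfg)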
def main(gpu, cfg, *args, **kwargs):
    init_running_envs(gpu, cfg)

    # These JSON files store a JSON-encoded string containing a Python dict
    # literal, hence the eval() on the result of json.load().
    with io.open(os.path.join(cfg.tables, "base_tasks.json")) as f:
        base_tasks = eval(json.load(f))["data"]
    base_task_keys = set(item["taskKey"] for item in base_tasks)

    with io.open(os.path.join(cfg.tables, "meta_info.json")) as f:
        meta_info_list = eval(json.load(f))["data"]
    all_task_keys = set(t["taskKey"] for t in meta_info_list)
    other_task_keys = all_task_keys - base_task_keys

    # There are many base tasks; pick K of them for training. If base_k is
    # not set, train on all base tasks by default.
    if cfg.base_k:
        with io.open(os.path.join(cfg.tables,
                                  "base_tasks_%d.json" % cfg.base_k)) as f:
            base_task_keys = set(json.load(f))
    logger.info("Base task num: {}".format(len(base_task_keys)))

    if cfg.mode == "train":
        if cfg.train_type == "base":
            base_mtl_training(cfg, base_task_keys)
        elif cfg.train_type == "lifelong":
            # The base tasks are only used for initialization here.
            meta_lifelong_training(cfg, sorted(other_task_keys))
        else:
            raise NotImplementedError
    elif cfg.mode == "predict":
        predict_meta_lifelong(cfg)
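# A hedged sketch of the assumed on-disk task-file format: the file holds a
# JSON-encoded string whose content is a Python dict literal, which is why
# the code above calls eval() on the result of json.load().
def _demo_read_task_keys(path):
    with io.open(path) as f:
        payload = eval(json.load(f))  # json.load() yields a str under this assumption
    return [item["taskKey"] for item in payload["data"]]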
def readlines_from_file(self, data_file):
    with io.open(data_file) as f:
        if self.skip_first_line:
            f.readline()
        # str.index() raises ValueError on a miss; test membership instead.
        if "json" in data_file:
            task_rows = eval(json.load(f))["data"]

    # Load the full dataset and index it by task key.
    with open("/apsarapangu/disk3/zhangtaolin.ztl/MeLL_pytorch/data/all_data.json",
              "r") as file:
        all_data = eval(json.load(file))["data"]
    all_data_dict = {}
    for item in all_data:
        task_key = item["taskKey"]
        if task_key not in all_data_dict:
            all_data_dict[task_key] = item["dataset"]

    # Flatten each task's dataset into tab-separated "task\ttext\tlabel" rows.
    data_rows = []
    if isinstance(task_rows, dict):
        task_rows = [task_rows]
    for item in task_rows:
        task_key = item["taskKey"]
        for record in all_data_dict[task_key]:
            data_rows.append(task_key + "\t" + record["text"]
                             + "\t" + record["label"])
    return data_rows
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with io.open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab
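# A minimal usage sketch, assuming a BERT-style vocab.txt (one token per
# line, in id order); the default file name is an illustration, not a fixed path.
def _demo_load_vocab(vocab_file="vocab.txt"):
    vocab = load_vocab(vocab_file)
    ids_to_tokens = {idx: tok for tok, idx in vocab.items()}
    print(len(vocab), vocab.get("[CLS]"), ids_to_tokens.get(0))
    return vocab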
def save_vocabulary(self, vocab_path):
    """Save the tokenizer vocabulary to a directory or file."""
    index = 0
    if os.path.isdir(vocab_path):
        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
    else:
        # vocab_path is a file path; write to it directly.
        vocab_file = vocab_path
    with io.open(vocab_file, "w", encoding="utf-8") as writer:
        for token, token_index in sorted(self.vocab.items(),
                                         key=lambda kv: kv[1]):
            if index != token_index:
                logger.warning(
                    "Saving vocabulary to {}: vocabulary indices are not consecutive."
                    " Please check that the vocabulary is not corrupted!".format(vocab_file))
                index = token_index
            writer.write(token + u"\n")
            index += 1
    return vocab_file
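# A hedged round-trip sketch, assuming `tokenizer` exposes save_vocabulary()
# and a `vocab` mapping as above; with consecutive indices, saving and
# reloading should reproduce the same token-to-id mapping.
def _demo_vocab_roundtrip(tokenizer, out_dir="."):
    vocab_file = tokenizer.save_vocabulary(out_dir)
    assert dict(load_vocab(vocab_file)) == dict(tokenizer.vocab)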
def get_model_type_from_pretrained(pretrained_model_name_or_path):
    config_file = os.path.join(get_dir_name(pretrained_model_name_or_path),
                               CONFIG_NAME)
    with io.open(config_file) as f:
        config_json = json.load(f)
    if "model_type" not in config_json:
        warnings.warn("`model_type` not found in %s, set it to `bert` by default."
                      % config_file)
        model_type = "bert"
    else:
        model_type = config_json["model_type"]
    return model_type
def export_easytransfer_train_config(saved_path, vocab_dir,
                                     label_enumerate_values, sequence_length,
                                     model_name, extra_model_params):
    """Save `train_config.json` for EasyTransfer AppZoo.

    Args:
        saved_path (`str`): the path of `train_config.json`
        vocab_dir (`str`): the directory of `vocab.txt`
        label_enumerate_values (`list`): the enumerate values of the label
        sequence_length (`int`): sequence length used while pre-processing
        model_name (`str`): the model name of AppZoo, e.g. text_classify_bert
        extra_model_params (`dict`): extra key-value pairs merged into the
            model config
    """
    if isinstance(label_enumerate_values, list):
        num_label = len(label_enumerate_values)
        label_enumerate_values = ",".join(label_enumerate_values)
    else:
        num_label = None

    if "oss://" in vocab_dir:
        pretrain_model_name_or_path = vocab_dir + "/model.ckpt"
    else:
        pretrain_model_name_or_path = os.path.abspath(vocab_dir) + "/model.ckpt"

    model_config_dict = dict(extra_model_params)
    model_config_dict["pretrain_model_name_or_path"] = pretrain_model_name_or_path
    model_config_dict["model_name"] = model_name
    model_config_dict["num_labels"] = num_label
    model_config_dict["dropout_rate"] = 0.0

    train_config_dict = {
        "_config_json": {
            "model_config": model_config_dict
        },
        "model_config": model_config_dict,
        "label_enumerate_values": label_enumerate_values,
        "sequence_length": sequence_length
    }
    with io.open(saved_path, "w") as f:
        f.write(json.dumps(train_config_dict, ensure_ascii=False, indent=4))
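# A hedged usage sketch; the paths, labels, and extra parameters below are
# made up for illustration.
def _demo_export_train_config():
    export_easytransfer_train_config(
        saved_path="./train_config.json",
        vocab_dir="./checkpoint_dir",
        label_enumerate_values=["positive", "negative"],
        sequence_length=128,
        model_name="text_classify_bert",
        extra_model_params={})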
def __init__(self,
             data_file,
             meta_info_file,
             vocab_file,
             max_seq_length,
             max_label_num=10,
             **kwargs):
    super(MetaIntentDataset, self).__init__(data_file, **kwargs)
    self.tokenizer = Tokenizer(backend="bert", vocab_file=vocab_file)
    self.max_seq_length = max_seq_length
    self.max_label_num = max_label_num

    with io.open(meta_info_file) as f:
        meta_info_json = eval(json.load(f))["data"]

    self.task_to_idx = dict()
    self.task_to_label_mapping = dict()
    self.task_to_label_features = dict()
    self.label_to_memory_id = {"PAD": 0}
    for task_label_info in meta_info_json:
        # The labels contained in this task.
        labels = task_label_info["labelMap"]
        label_map = {label: idx for idx, label in enumerate(labels)}
        # task_key: the task name.
        task_key = task_label_info["taskKey"]
        self.task_to_idx[task_key] = len(self.task_to_idx)
        self.task_to_label_mapping[task_key] = label_map
        for label in labels:
            # Different tasks may reuse the same label name. Within the same
            # dataset, identical label names are assumed to carry the same
            # meaning, so they share a single memory id.
            if label not in self.label_to_memory_id:
                self.label_to_memory_id[label] = len(self.label_to_memory_id)
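# A hedged construction sketch; the file names mirror those used elsewhere
# in this repo, but the directory arguments are assumptions.
def _demo_build_dataset(tables_dir, vocab_file):
    return MetaIntentDataset(
        data_file=os.path.join(tables_dir, "lifelong_task.json"),
        meta_info_file=os.path.join(tables_dir, "meta_info.json"),
        vocab_file=vocab_file,
        max_seq_length=128,
        max_label_num=10,
        is_training=False)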
def from_pretrained(cls, pretrained_model_name_or_path, config_cls=None,
                    adapter_fn=None, *args, **kwargs):
    state_dict = kwargs.pop('state_dict', None)
    config_dict = kwargs.pop('config_dict', None)

    # Resolve the config class from the model type if not given explicitly.
    if config_cls is None:
        if config_dict:
            model_type = config_dict.get("model_type", "bert")
        else:
            model_type = get_model_type_from_pretrained(
                pretrained_model_name_or_path)
        if model_type in ["bert", "roberta"]:
            config_cls = BertConfig
        elif model_type == "albert":
            config_cls = AlbertConfig
        elif model_type == "gpt2":
            config_cls = GPT2Config
        else:
            raise NotImplementedError

    if config_dict:
        config = config_cls.from_dict(config_dict)
    else:
        config = config_cls.from_json_file(
            os.path.join(get_dir_name(pretrained_model_name_or_path),
                         CONFIG_NAME))

    # Instantiate the model.
    model = cls(config, *args, **kwargs)

    # Check whether the weights come from a TensorFlow checkpoint.
    is_tf_checkpoint = False
    if io.exists(pretrained_model_name_or_path + ".index") or \
            io.exists(pretrained_model_name_or_path + ".meta"):
        is_tf_checkpoint = True

    if is_tf_checkpoint:
        if adapter_fn:
            adapter_fn(model, pretrained_model_name_or_path)
        else:
            adapter.load_bert_tf_checkpoint_weights(
                model, pretrained_model_name_or_path)

    if state_dict is None:
        weights_path = os.path.join(pretrained_model_name_or_path,
                                    WEIGHTS_NAME)
        if not io.exists(weights_path):
            return model
        logger.info("Loading model {}".format(weights_path))
        with io.open(weights_path, "rb") as f:
            state_dict = torch.load(f, map_location='cpu')

    # Load from a PyTorch state_dict. Rename legacy TF-style parameters:
    # gamma/beta -> weight/bias.
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        if 'gamma' in key:
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    if config.model_type == "gpt2":
        state_dict = {
            "gpt2." + key.replace("transformer.", ""): val
            for key, val in state_dict.items()
        }

    missing_keys = []
    unexpected_keys = []
    error_msgs = []
    # Copy state_dict so that _load_from_state_dict can modify it.
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(
            prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata,
                                     True, missing_keys, unexpected_keys,
                                     error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    # If the model has no `bert` attribute but the weights are prefixed with
    # "bert.", strip that prefix while loading.
    start_prefix = ''
    if not hasattr(model, 'bert') and any(
            s.startswith('bert.') for s in state_dict.keys()):
        start_prefix = 'bert.'
    logger.info('Loading model...')
    load(model, prefix=start_prefix)
    logger.info('Load finished!')

    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from pretrained model: {}"
                    .format(model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}"
                    .format(model.__class__.__name__, unexpected_keys))
    if len(error_msgs) > 0:
        raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'
                           .format(model.__class__.__name__,
                                   "\n\t".join(error_msgs)))
    return model
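# A hedged usage sketch: load a fine-tuned model from a checkpoint directory.
# The directory layout (config.json plus pytorch_model.bin) is the one written
# by save_checkpoint() below; the class is one used elsewhere in this repo.
def _demo_from_pretrained(checkpoint_dir):
    model = MetaLabelEnhancedBertClassify.from_pretrained(checkpoint_dir)
    model.eval()
    return model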
def save_checkpoint(self, save_best=False):
    if not self.cfg.is_master_node:
        return

    # Save config.json
    output_config_file = os.path.join(self.cfg.checkpoint_dir, CONFIG_NAME)
    with io.open(output_config_file, "w") as f:
        f.write(self.model_module.arch)

    # Save vocab.txt
    if self.cfg.pretrain_model_name_or_path is not None:
        io.copy(
            os.path.join(get_dir_name(self.cfg.pretrain_model_name_or_path),
                         "vocab.txt"),
            os.path.join(get_dir_name(self.cfg.checkpoint_dir), "vocab.txt"))

    # Save the model weights and the resume metadata.
    model_to_save_prefix = ("pytorch_model" if save_best else
                            "pytorch_model_step_%d" % (self._global_step + 1))
    with io.open(os.path.join(self.cfg.checkpoint_dir,
                              model_to_save_prefix + ".bin"), "wb") \
            as output_model_file:
        torch.save(self.model_module.state_dict(), output_model_file)

    meta_data = {
        "epoch": self._current_epoch,
        "global_step": self._global_step,
        "optimizer": self._optimizer.state_dict()
    }
    with io.open(os.path.join(self.cfg.checkpoint_dir,
                              model_to_save_prefix + ".meta.bin"), "wb") \
            as output_model_file:
        torch.save(meta_data, output_model_file)

    if not save_best:
        return

    if hasattr(self.model_module, "model_name"):
        # If the student is a pre-defined EasyTransfer AppZoo model,
        # save train_config.json and model.ckpt.* for EasyTransfer.
        logger.info("Export tensorflow checkpoint (%s format) to %s" % (
            self.cfg.export_tf_checkpoint_type,
            os.path.join(get_dir_name(self.cfg.checkpoint_dir), "model.ckpt")))
        exporter.export_easytransfer_train_config(
            saved_path=os.path.join(self.cfg.checkpoint_dir,
                                    "train_config.json"),
            vocab_dir=get_dir_name(self.cfg.checkpoint_dir),
            label_enumerate_values=self._valid_loader.dataset.label_enumerate_values,
            sequence_length=self.cfg.sequence_length,
            model_name=self.model_module.model_name,
            extra_model_params=self.model_module.extra_model_params)

        if self.cfg.export_tf_checkpoint_type == "easytransfer":
            exporter.export_pytorch_checkpoint_to_tf(
                model=self.model_module,
                ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                bert_output_prefix="bert_pre_trained_model",
                appended_val_map=(("classifier", "app/ez_dense"),),
                appended_tensors_to_transpose=("classifier.weight",))
        elif self.cfg.export_tf_checkpoint_type == "google":
            exporter.export_pytorch_checkpoint_to_tf(
                model=self.model_module,
                ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                bert_output_prefix="",
                appended_val_map=(("classifier.weight", "output_weights"),
                                  ("classifier.bias", "output_bias")),
                appended_tensors_to_transpose=())
        else:
            raise RuntimeError("Invalid export_tf_checkpoint_type %s"
                               % self.cfg.export_tf_checkpoint_type)

    # This is a hack: restore this worker's active CUDA device after export.
    torch.cuda.set_device(self.cfg.local_rank)
def to_json_file(self, json_file_path):
    """Save this instance to a json file."""
    with io.open(json_file_path, "w", encoding='utf-8') as writer:
        writer.write(self.to_json_string())
def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with io.open(json_file, "r", encoding='utf-8') as reader:
        text = reader.read()
    return cls.from_dict(json.loads(text))
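# A hedged round-trip sketch: serialize a config instance to JSON and read
# it back; the file name is an assumption.
def _demo_config_roundtrip(config, path="config.json"):
    config.to_json_file(path)
    return BertConfig.from_json_file(path)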
def __init__(self,
             vocab_size_or_config_json_file,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             intermediate_size=3072,
             hidden_act="gelu",
             hidden_dropout_prob=0.1,
             attention_probs_dropout_prob=0.1,
             max_position_embeddings=512,
             type_vocab_size=2,
             initializer_range=0.02,
             pre_trained='',
             training=''):
    """Constructs BertConfig.

    Args:
        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in
            `BertModel`.
        hidden_size: Size of the encoder layers and the pooler layer.
        num_hidden_layers: Number of hidden layers in the Transformer encoder.
        num_attention_heads: Number of attention heads for each attention
            layer in the Transformer encoder.
        intermediate_size: The size of the "intermediate" (i.e., feed-forward)
            layer in the Transformer encoder.
        hidden_act: The non-linear activation function (function or string)
            in the encoder and pooler. If string, "gelu", "relu" and "swish"
            are supported.
        hidden_dropout_prob: The dropout probability for all fully connected
            layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob: The dropout ratio for the attention
            probabilities.
        max_position_embeddings: The maximum sequence length that this model
            might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size: The vocabulary size of the `token_type_ids` passed
            into `BertModel`.
        initializer_range: The standard deviation of the
            truncated_normal_initializer for initializing all weight matrices.
    """
    self.model_type = "bert"
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range
    self.pre_trained = pre_trained
    self.training = training
    if isinstance(vocab_size_or_config_json_file, str) or (
            sys.version_info[0] == 2
            and isinstance(vocab_size_or_config_json_file, unicode)):
        # A config file path: load every key from the JSON into this instance.
        with io.open(vocab_size_or_config_json_file, "r",
                     encoding='utf-8') as reader:
            json_config = json.loads(reader.read())
        for key, value in json_config.items():
            self.__dict__[key] = value
    elif isinstance(vocab_size_or_config_json_file, int):
        self.vocab_size = vocab_size_or_config_json_file
    else:
        raise ValueError("First argument must be either a vocabulary size (int) "
                         "or the path to a pretrained model config file (str)")
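# A hedged construction sketch: a BERT-base-sized config built from an
# explicit vocabulary size (30522 is the standard English BERT vocab size,
# used here only as an illustration).
def _demo_bert_config():
    return BertConfig(vocab_size_or_config_json_file=30522)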
def predict_meta_lifelong(cfg):
    if cfg.use_lru:
        print("use LRU model")
        model_cls = LRUMetaLabelEnhancedBertClassify
    else:
        model_cls = MetaLabelEnhancedBertClassify
    model = model_cls.from_pretrained(
        cfg.checkpoint_dir,
        max_memory_size=cfg.max_memory_size,
        max_task_num=cfg.max_task_num,
        max_label_num=cfg.max_label_num,
        freeze_encoder=True,
        is_testing=True).cuda()
    model.eval()

    test_dataset = MetaIntentDataset(
        data_file=os.path.join(cfg.tables, "text_classify_49",
                               "lifelong_task.json"),
        meta_info_file=os.path.join(cfg.tables, "meta_info.json"),
        vocab_file=os.path.join(cfg.checkpoint_dir_base, "base", "vocab.txt"),
        max_seq_length=cfg.sequence_length,
        max_label_num=cfg.max_label_num,
        is_training=False)
    testloader = DataLoader(test_dataset,
                            batch_size=cfg.eval_batch_size,
                            shuffle=False,
                            collate_fn=test_dataset.batch_fn)

    fout = io.open(cfg.outputs, "w")
    fout.write("pred_label" + "\t" + "task" + "\t" + "label" + "\n")
    print(len(testloader))
    for batch in tqdm(testloader):
        # Move tensors to GPU; keep non-tensor fields (e.g. task names) as-is.
        batch = {
            key: val.cuda() if isinstance(val, torch.Tensor) else val
            for key, val in batch.items()
        }
        with torch.no_grad():
            model_outputs = model(batch)
        logits = model_outputs["logits"]
        pred_ids = torch.argmax(logits, dim=-1).tolist()
        label_ids = batch["label_ids"].tolist()
        tasks = batch["tasks"]
        for i, task in enumerate(tasks):
            # Map predicted indices back to label strings via the task's own
            # label mapping; fall back to label 0 for out-of-range ids.
            label_mapping = test_dataset.task_to_label_mapping[task]
            idx_to_label = {idx: label for label, idx in label_mapping.items()}
            pred_label = (idx_to_label[pred_ids[i]]
                          if pred_ids[i] in idx_to_label else idx_to_label[0])
            label = idx_to_label[label_ids[i]]
            fout.write(pred_label + "\t" + task + "\t" + label + "\n")
    fout.close()
    print("Writing to %s finished. " % cfg.outputs)