def load_dataset_from_disk(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    print(list_samples)
    print('load_data in ' + dataset)
    # list files and sub-folders in the dataset directory
    stack = os.listdir(dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # descend into sub-folders
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('%s' % file_path)
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                # tokenize the content
                content = r.run(tokenizer.predict(content))
                # the directory name of file_path is the class label
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    print('')
    return list_samples
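# Hedged usage sketch (not part of the original code): one way to turn the
# {folder_name: [documents]} dict returned by load_dataset_from_disk into
# parallel text/label lists for a classifier. `name2label` stands in for
# my_map.name2label and is an assumption here.
def flatten_samples(list_samples, name2label):
    texts, labels = [], []
    for name, docs in list_samples.items():
        texts.extend(docs)
        labels.extend([name2label[name]] * len(docs))
    return texts, labels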
def download():
    try:
        args = request.args
        name = args['name']
        return get_zip_file(name)
    except Exception:
        return render_template('show_time.html',
                               name_l=get_dir_name(source_page_path))
def get_model_type_from_pretrained(pretrained_model_name_or_path):
    config_file = os.path.join(get_dir_name(pretrained_model_name_or_path),
                               CONFIG_NAME)
    with io.open(config_file) as f:
        config_json = json.load(f)
    if "model_type" not in config_json:
        warnings.warn("`model_type` not found in %s, set it to `bert` by default."
                      % config_file)
        model_type = "bert"
    else:
        model_type = config_json["model_type"]
    return model_type
def load_dataset(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print(file_path)
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    return list_samples
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print('loading data in ' + tokenized_dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    # keep only tokens tagged as nouns
                    tokens = [tag[0][i] for i in range(len(tag[0]))
                              if tag[1][i] == u'N']
                    update_count_tokens(statistic, label, tokens)
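# Hedged sketch of update_count_tokens, which count_tokens calls but which is
# not defined in this section: presumably it accumulates per-label token
# frequencies into the nested dict `statistic` initialized above. The body is
# an illustrative assumption, not the original implementation.
def update_count_tokens(statistic, label, tokens):
    for token in tokens:
        statistic[label][token] = statistic[label].get(token, 0) + 1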
def tokenizer_dataset():
    utils.mkdir(tokenized_dataset)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                output_dir = os.path.join(tokenized_dataset, dir_name)
                utils.mkdir(output_dir)
                name = os.path.basename(file_path)
                # write the tokenized document as utf-8
                with open(os.path.join(output_dir, name), 'w',
                          encoding='utf-8') as fw:
                    fw.write(content)
    print('')
def from_pretrained(cls, pretrained_model_name_or_path, config_cls=None,
                    adapter_fn=None, *args, **kwargs):
    state_dict = kwargs.pop('state_dict', None)
    config_dict = kwargs.pop('config_dict', None)
    if config_cls is None:
        if config_dict:
            model_type = config_dict.get("model_type", "bert")
        else:
            model_type = get_model_type_from_pretrained(
                pretrained_model_name_or_path)
        if model_type in ["bert", "roberta"]:
            config_cls = BertConfig
        elif model_type == "albert":
            config_cls = AlbertConfig
        elif model_type == "gpt2":
            config_cls = GPT2Config
        else:
            raise NotImplementedError
    if config_dict:
        config = config_cls.from_dict(config_dict)
    else:
        config = config_cls.from_json_file(
            os.path.join(get_dir_name(pretrained_model_name_or_path),
                         CONFIG_NAME))

    # Instantiate the model.
    model = cls(config, *args, **kwargs)

    # Check whether the weights come from a TensorFlow checkpoint.
    is_tf_checkpoint = False
    if io.exists(pretrained_model_name_or_path + ".index") or \
            io.exists(pretrained_model_name_or_path + ".meta"):
        is_tf_checkpoint = True
    if is_tf_checkpoint:
        if adapter_fn:
            adapter_fn(model, pretrained_model_name_or_path)
        else:
            adapter.load_bert_tf_checkpoint_weights(
                model, pretrained_model_name_or_path)

    if state_dict is None:
        weights_path = os.path.join(pretrained_model_name_or_path,
                                    WEIGHTS_NAME)
        if not io.exists(weights_path):
            return model
        logger.info("Loading model {}".format(weights_path))
        with io.open(weights_path, "rb") as f:
            state_dict = torch.load(f, map_location='cpu')

    # Load from a PyTorch state_dict: rename legacy gamma/beta keys.
    old_keys = []
    new_keys = []
    for key in state_dict.keys():
        new_key = None
        if 'gamma' in key:
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)

    if config.model_type == "gpt2":
        new_state_dict = {
            "gpt2." + key.replace("transformer.", ""): val
            for key, val in state_dict.items()
        }
        state_dict = new_state_dict

    missing_keys = []
    unexpected_keys = []
    error_msgs = []
    # copy state_dict so _load_from_state_dict can modify it
    metadata = getattr(state_dict, '_metadata', None)
    state_dict = state_dict.copy()
    if metadata is not None:
        state_dict._metadata = metadata

    def load(module, prefix=''):
        local_metadata = {} if metadata is None else metadata.get(
            prefix[:-1], {})
        module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                     missing_keys, unexpected_keys, error_msgs)
        for name, child in module._modules.items():
            if child is not None:
                load(child, prefix + name + '.')

    start_prefix = ''
    if not hasattr(model, 'bert') and any(
            s.startswith('bert.') for s in state_dict.keys()):
        start_prefix = 'bert.'
    logger.info('Loading model...')
    load(model, prefix=start_prefix)
    logger.info('Load finished!')

    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from pretrained model: {}".
                    format(model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, unexpected_keys))
    if len(error_msgs) > 0:
        raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
            model.__class__.__name__, "\n\t".join(error_msgs)))
    return model
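# Minimal, self-contained sketch of the gamma/beta key migration performed in
# from_pretrained above: legacy TF-style checkpoints store LayerNorm parameters
# as `gamma`/`beta`, while PyTorch modules expect `weight`/`bias`. The example
# state_dict below is a made-up placeholder, not a real checkpoint.
def _rename_legacy_keys(state_dict):
    old_keys, new_keys = [], []
    for key in state_dict.keys():
        new_key = None
        if 'gamma' in key:
            new_key = key.replace('gamma', 'weight')
        if 'beta' in key:
            new_key = key.replace('beta', 'bias')
        if new_key:
            old_keys.append(key)
            new_keys.append(new_key)
    for old_key, new_key in zip(old_keys, new_keys):
        state_dict[new_key] = state_dict.pop(old_key)
    return state_dict

# _rename_legacy_keys({'bert.encoder.LayerNorm.gamma': 1.0})
# -> {'bert.encoder.LayerNorm.weight': 1.0}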
def save_checkpoint(self, save_best=False):
    if not self.cfg.is_master_node:
        return

    # Save config.json
    output_config_file = os.path.join(self.cfg.checkpoint_dir, CONFIG_NAME)
    with io.open(output_config_file, "w") as f:
        f.write(self.model_module.arch)

    # Save vocab.txt
    if self.cfg.pretrain_model_name_or_path is not None:
        io.copy(os.path.join(get_dir_name(self.cfg.pretrain_model_name_or_path), "vocab.txt"),
                os.path.join(get_dir_name(self.cfg.checkpoint_dir), "vocab.txt"))

    # Save the model weights
    model_to_save_prefix = "pytorch_model" if save_best \
        else "pytorch_model_step_%d" % (self._global_step + 1)
    with io.open(os.path.join(self.cfg.checkpoint_dir, model_to_save_prefix + ".bin"),
                 "wb") as output_model_file:
        torch.save(self.model_module.state_dict(), output_model_file)

    # Save training metadata for resuming
    meta_data = {
        "epoch": self._current_epoch,
        "global_step": self._global_step,
        "optimizer": self._optimizer.state_dict()
    }
    with io.open(os.path.join(self.cfg.checkpoint_dir, model_to_save_prefix + ".meta.bin"),
                 "wb") as output_model_file:
        torch.save(meta_data, output_model_file)

    if not save_best:
        return

    if hasattr(self.model_module, "model_name"):
        # If the student is a pre-defined EasyTransfer AppZoo model,
        # save train_config.json and model.ckpt.* for EasyTransfer.
        logger.info("Export tensorflow checkpoint (%s format) to %s" % (
            self.cfg.export_tf_checkpoint_type,
            os.path.join(get_dir_name(self.cfg.checkpoint_dir), "model.ckpt")))
        exporter.export_easytransfer_train_config(
            saved_path=os.path.join(self.cfg.checkpoint_dir, "train_config.json"),
            vocab_dir=get_dir_name(self.cfg.checkpoint_dir),
            label_enumerate_values=self._valid_loader.dataset.label_enumerate_values,
            sequence_length=self.cfg.sequence_length,
            model_name=self.model_module.model_name,
            extra_model_params=self.model_module.extra_model_params)

        if self.cfg.export_tf_checkpoint_type == "easytransfer":
            exporter.export_pytorch_checkpoint_to_tf(
                model=self.model_module,
                ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                bert_output_prefix="bert_pre_trained_model",
                appended_val_map=(("classifier", "app/ez_dense"),),
                appended_tensors_to_transpose=("classifier.weight",))
        elif self.cfg.export_tf_checkpoint_type == "google":
            exporter.export_pytorch_checkpoint_to_tf(
                model=self.model_module,
                ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                bert_output_prefix="",
                appended_val_map=(("classifier.weight", "output_weights"),
                                  ("classifier.bias", "output_bias")),
                appended_tensors_to_transpose=())
        else:
            raise RuntimeError("Invalid export_tf_checkpoint_type %s"
                               % self.cfg.export_tf_checkpoint_type)

    # This is a hack
    torch.cuda.set_device(self.cfg.local_rank)
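# Hedged sketch of restoring the metadata written by save_checkpoint above in
# order to resume training. The prefix and key names mirror the method, but the
# helper itself is an illustration, not part of the original trainer; it relies
# on the same module-level `os`, `io`, and `torch` used elsewhere in this file.
def load_checkpoint_meta(checkpoint_dir, prefix="pytorch_model"):
    meta_path = os.path.join(checkpoint_dir, prefix + ".meta.bin")
    with io.open(meta_path, "rb") as f:
        meta = torch.load(f, map_location="cpu")
    # meta holds "epoch", "global_step", and the optimizer state_dict
    return meta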
def show_time():
    return render_template('show_time.html',
                           name_l=get_dir_name(source_page_path))
def _dir_name(self, date_str):
    return '{}/{}/{}'.format(
        self.out_dir, self.tweet_frequency.value,
        utils.get_dir_name(date_str, self.tweet_frequency))
def base_mtl_training(cfg, base_task_keys):
    if cfg.use_lru:
        print("use LRU model")
        # logger.info("use LRU model")
        model = LRUMetaLabelEnhancedBertClassify.from_pretrained(
            cfg.pretrain_model_name_or_path,
            max_memory_size=cfg.max_memory_size,
            max_task_num=cfg.max_task_num,
            max_label_num=cfg.max_label_num,
            freeze_encoder=False)
    else:
        model = MetaLabelEnhancedBertClassify.from_pretrained(
            cfg.pretrain_model_name_or_path,
            max_memory_size=cfg.max_memory_size,
            max_task_num=cfg.max_task_num,
            max_label_num=cfg.max_label_num,
            freeze_encoder=False)

    # Data is processed and recorded per task (the basic unit of meta-learning);
    # the sample representations within each task are precomputed.
    train_dataset = MetaIntentDataset(
        model_type="text_classify_bert",
        data_file=os.path.join(
            cfg.tables,
            "base_tasks.json" if cfg.base_k is None else "base_train_%d.tsv" % cfg.base_k),
        meta_info_file=os.path.join(cfg.tables, "meta_info.json"),
        vocab_file=get_dir_name(cfg.pretrain_model_name_or_path) + "/vocab.txt",
        max_seq_length=cfg.sequence_length,
        is_training=True)
    valid_dataset = MetaIntentDataset(
        model_type="text_classify_bert",
        data_file=os.path.join(
            cfg.tables,
            "base_tasks.json" if cfg.base_k is None else "base_dev_%d.tsv" % cfg.base_k),
        meta_info_file=os.path.join(cfg.tables, "meta_info.json"),
        vocab_file=get_dir_name(cfg.pretrain_model_name_or_path) + "/vocab.txt",
        max_seq_length=cfg.sequence_length,
        is_training=False)

    cfg.checkpoint_dir = os.path.join(cfg.checkpoint_dir_base, "base")
    # Initialize the whole training pipeline.
    trainer = Trainer(model=model, train_dataset=train_dataset,
                      valid_dataset=valid_dataset, cfg=cfg)

    # Prepare the tasks to be trained. Since this is the meta-learner training
    # stage, the global label embeddings are initialized here and then updated
    # during training.
    for task_key in base_task_keys:
        # *****************************
        # We do not have precomputed label embeddings here, so they are randomly
        # initialized for now; as described in the paper, they should instead be
        # computed by the model.
        # with open(os.path.join(cfg.tables, task_key, "label_embeddings.json")) as f:
        #     label_embeddings = json.load(f)
        label_embeddings = load_label_emb(task_key)
        # *****************************
        memory_id_to_label_embedding = dict()
        for label, embedding in label_embeddings.items():
            memory_id = valid_dataset.label_to_memory_id[label]
            memory_id_to_label_embedding[memory_id] = torch.tensor(embedding).cuda()
        # Note that this processes and initializes the label IDs of every task
        # in the dataset.
        trainer.model_module.update_global_memory(memory_id_to_label_embedding)

    cfg.save_checkpoint_steps = trainer._optimizer.total_training_steps // cfg.epoch_num
    print("Base training, %d tasks, Train size: %d; Dev size: %d" %
          (len(base_task_keys), len(train_dataset), len(valid_dataset)))
    trainer.train()

    print("Updating local memories...")
    for task_key in base_task_keys:
        # The dataset contains many tasks (meta-learning treats the task as the
        # basic unit); collect all label IDs belonging to this task.
        task_memory_label_ids = [
            valid_dataset.label_to_memory_id[label]
            for label in valid_dataset.task_to_label_mapping[task_key].keys()
        ]
        # Input: the task index and the label IDs belonging to that task
        # (each task has several label IDs).
        trainer.model_module.update_local_memory(
            valid_dataset.task_to_idx[task_key], task_memory_label_ids)
    trainer.save_checkpoint(save_best=True)
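# Hedged sketch of what load_label_emb(task_key) (called above but not defined
# in this section) might do under the random-initialization fallback described
# in the comments: one random vector per label name. The label list, embedding
# size, and numpy usage are illustrative assumptions only.
import numpy as np

def load_label_emb_random(labels, hidden_size=768, seed=0):
    rng = np.random.default_rng(seed)
    # In the paper's setting these vectors would instead be produced by the
    # encoder from label descriptions rather than sampled randomly.
    return {label: rng.normal(scale=0.02, size=hidden_size).tolist()
            for label in labels}

# Example: load_label_emb_random(["book_flight", "cancel_flight"])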