Example #1
def load_dataset_from_disk(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    print(list_samples)
    print('load_data in ' + dataset)

    # list the files and folders in the dataset directory
    stack = os.listdir(dataset)

    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        # if file_path is a directory, push its contents onto the stack
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('%s' % file_path)
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:

                content = unicodedata.normalize('NFKC', fp.read())

                # tokenize the content
                content = r.run(tokenizer.predict(content))
                # directory name of file_path (used as the class label)
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    print('')
    return list_samples
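
In this example, utils.get_dir_name is expected to return the name of the directory that directly contains a file, which doubles as the class label (a key of my_map.name2label). A minimal sketch of such a helper, assuming plain os.path semantics rather than the actual utils implementation:

import os

def get_dir_name(file_path):
    # Hypothetical sketch, not the real utils.get_dir_name: return the name of
    # the directory that directly contains file_path, e.g. "sports" for
    # "dataset/sports/article_01.txt".
    return os.path.basename(os.path.dirname(file_path))
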
Example #2
def download():
    try:
        args = request.args
        name = args['name']
        return get_zip_file(name)
    except Exception:
        return render_template('show_time.html',
                               name_l=get_dir_name(source_page_path))
Example #3
def get_model_type_from_pretrained(pretrained_model_name_or_path):
    config_file = os.path.join(get_dir_name(pretrained_model_name_or_path),
                               CONFIG_NAME)
    with io.open(config_file) as f:
        config_json = json.load(f)

    if "model_type" not in config_json:
        warnings.warn(
            "`model_type` not found in %s, set it to `bert` by default." %
            config_file)
        model_type = "bert"
    else:
        model_type = config_json["model_type"]
    return model_type
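
A hypothetical call, assuming pretrained_model_name_or_path points at a checkpoint file whose directory also holds a config.json with a model_type field (the path below is purely illustrative):

# Illustrative path only; any directory containing a config.json would do.
model_type = get_model_type_from_pretrained("/models/my-bert/pytorch_model.bin")
print(model_type)  # e.g. "bert"
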
Example #4
def load_dataset(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print(file_path)
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                list_samples[dir_name].append(content)
    return list_samples
Example #5
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print('loading data in ' + tokenized_dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    tokens = [
                        tag[0][i] for i in range(len(tag[0]))
                        if tag[1][i] == u'N'
                    ]
                    update_count_tokens(statistic, label, tokens)
Example #6
def tokenizer_dataset():
    utils.mkdir(tokenized_dataset)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                output_dir = os.path.join(tokenized_dataset, dir_name)
                utils.mkdir(output_dir)
                name = os.path.basename(file_path)
                with open(os.path.join(output_dir, name),
                          'w',
                          encoding='utf-8') as fw:
                    fw.write(content)
    print('')
Example #7
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        config_cls=None,
                        adapter_fn=None,
                        *args,
                        **kwargs):
        state_dict = kwargs.get('state_dict', None)
        kwargs.pop('state_dict', None)
        config_dict = kwargs.get('config_dict', None)
        kwargs.pop('config_dict', None)

        if config_cls is None:
            if config_dict:
                model_type = config_dict.get("model_type", "bert")
            else:
                model_type = get_model_type_from_pretrained(
                    pretrained_model_name_or_path)
            if model_type in ["bert", "roberta"]:
                config_cls = BertConfig
            elif model_type == "albert":
                config_cls = AlbertConfig
            elif model_type == "gpt2":
                config_cls = GPT2Config
            else:
                raise NotImplementedError
        if config_dict:
            config = config_cls.from_dict(config_dict)
        else:
            config = config_cls.from_json_file(
                os.path.join(get_dir_name(pretrained_model_name_or_path),
                             CONFIG_NAME))

        # Instantiate model.
        model = cls(config, *args, **kwargs)

        # Check if the model is from tensorflow checkpoint
        is_tf_checkpoint = False
        if io.exists(pretrained_model_name_or_path + ".index") or \
                io.exists(pretrained_model_name_or_path + ".meta"):
            is_tf_checkpoint = True

        if is_tf_checkpoint:
            if adapter_fn:
                adapter_fn(model, pretrained_model_name_or_path)
            else:
                adapter.load_bert_tf_checkpoint_weights(
                    model, pretrained_model_name_or_path)

        if state_dict is None:
            weights_path = os.path.join(pretrained_model_name_or_path,
                                        WEIGHTS_NAME)
            if not io.exists(weights_path):
                return model
            logger.info("Loading model {}".format(weights_path))
            with io.open(weights_path, "rb") as f:
                state_dict = torch.load(f, map_location='cpu')

        # Load from a PyTorch state_dict
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        if config.model_type == "gpt2":
            new_state_dict = {
                "gpt2." + key.replace("transformer.", ""): val
                for key, val in state_dict.items()
            }
            state_dict = new_state_dict

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        start_prefix = ''
        if not hasattr(model, 'bert') and any(
                s.startswith('bert.') for s in state_dict.keys()):
            start_prefix = 'bert.'

        logger.info('Loading model...')
        load(model, prefix=start_prefix)
        logger.info('Load finished!')
        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".
                format(model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError(
                'Error(s) in loading state_dict for {}:\n\t{}'.format(
                    model.__class__.__name__, "\n\t".join(error_msgs)))

        return model
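
The gamma/beta renaming above is needed because LayerNorm parameters in older TensorFlow-style BERT checkpoints are named gamma and beta, while the corresponding PyTorch parameters are weight and bias. A standalone toy illustration of that remapping, using made-up keys:

# Toy state_dict with made-up keys, only to illustrate the remapping loop above.
state_dict = {
    "encoder.layer.0.LayerNorm.gamma": 1.0,
    "encoder.layer.0.LayerNorm.beta": 0.0,
}
for key in list(state_dict.keys()):
    new_key = key.replace("gamma", "weight").replace("beta", "bias")
    if new_key != key:
        state_dict[new_key] = state_dict.pop(key)
print(state_dict)
# -> {'encoder.layer.0.LayerNorm.weight': 1.0, 'encoder.layer.0.LayerNorm.bias': 0.0}
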
Example #8
    def save_checkpoint(self, save_best=False):
        if not self.cfg.is_master_node:
            return

        # Save config.json
        output_config_file = os.path.join(self.cfg.checkpoint_dir, CONFIG_NAME)
        with io.open(output_config_file, "w") as f:
            f.write(self.model_module.arch)

        # Save vocab.txt
        if self.cfg.pretrain_model_name_or_path is not None:
            io.copy(os.path.join(get_dir_name(self.cfg.pretrain_model_name_or_path), "vocab.txt"),
                    os.path.join(get_dir_name(self.cfg.checkpoint_dir), "vocab.txt"))

        # Save the model
        model_to_save_prefix = "pytorch_model" if save_best else "pytorch_model_step_%d" % (self._global_step + 1)

        with io.open(os.path.join(self.cfg.checkpoint_dir, model_to_save_prefix + ".bin"), "wb") \
                as output_model_file:
            torch.save(self.model_module.state_dict(), output_model_file)

        meta_data = {
            "epoch": self._current_epoch,
            "global_step": self._global_step,
            "optimizer": self._optimizer.state_dict()
        }

        with io.open(os.path.join(self.cfg.checkpoint_dir, model_to_save_prefix + ".meta.bin"), "wb") \
                as output_model_file:
            torch.save(meta_data, output_model_file)

        if not save_best:
            return

        if hasattr(self.model_module, "model_name"):
            # If the student is pre-defined EasyTransfer AppZoo model
            # Save train_config.json, model.ckpt.* for EasyTransfer
            logger.info("Export tensorflow checkpoint (%s format) to %s" % (
                self.cfg.export_tf_checkpoint_type,
                os.path.join(get_dir_name(self.cfg.checkpoint_dir), "model.ckpt")))
            exporter.export_easytransfer_train_config(
                saved_path=os.path.join(self.cfg.checkpoint_dir, "train_config.json"),
                vocab_dir=get_dir_name(self.cfg.checkpoint_dir),
                label_enumerate_values=self._valid_loader.dataset.label_enumerate_values,
                sequence_length=self.cfg.sequence_length,
                model_name=self.model_module.model_name,
                extra_model_params=self.model_module.extra_model_params)

            if self.cfg.export_tf_checkpoint_type == "easytransfer":
                exporter.export_pytorch_checkpoint_to_tf(
                    model=self.model_module,
                    ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                    bert_output_prefix="bert_pre_trained_model",
                    appended_val_map=(("classifier", "app/ez_dense"),),
                    appended_tensors_to_transpose=("classifier.weight",))
            elif self.cfg.export_tf_checkpoint_type == "google":
                exporter.export_pytorch_checkpoint_to_tf(
                    model=self.model_module,
                    ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                    bert_output_prefix="",
                    appended_val_map=(("classifier.weight", "output_weights"),
                                      ("classifier.bias", "output_bias")),
                    appended_tensors_to_transpose=())
            else:
                raise RuntimeError("Invalid export_tf_checkpoint_type %s" % self.cfg.export_tf_checkpoint_type)
        # This is a hack
        torch.cuda.set_device(self.cfg.local_rank)
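
For reference, resuming from the meta file written above would mirror the save: load the dict with torch.load and restore the optimizer state. A minimal sketch using a hypothetical helper (checkpoint_dir and optimizer stand in for self.cfg.checkpoint_dir and self._optimizer from save_checkpoint):

import os
import torch

def load_training_meta(checkpoint_dir, optimizer, prefix="pytorch_model"):
    # Hypothetical helper mirroring save_checkpoint above: read <prefix>.meta.bin
    # and restore the epoch, global step, and optimizer state it contains.
    meta_path = os.path.join(checkpoint_dir, prefix + ".meta.bin")
    meta_data = torch.load(meta_path, map_location="cpu")
    optimizer.load_state_dict(meta_data["optimizer"])
    return meta_data["epoch"], meta_data["global_step"]
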
Example #9
def show_time():
    return render_template('show_time.html',
                           name_l=get_dir_name(source_page_path))
Example #10
    def _dir_name(self, date_str):
        return '{}/{}/{}'.format(
            self.out_dir, self.tweet_frequency.value,
            utils.get_dir_name(date_str, self.tweet_frequency))
Example #11
def base_mtl_training(cfg, base_task_keys):
    if cfg.use_lru:
        print("use LRU model")
        # logger.info("use LRU model")
        model = LRUMetaLabelEnhancedBertClassify.from_pretrained(
            cfg.pretrain_model_name_or_path,
            max_memory_size=cfg.max_memory_size,
            max_task_num=cfg.max_task_num,
            max_label_num=cfg.max_label_num,
            freeze_encoder=False)
    else:
        model = MetaLabelEnhancedBertClassify.from_pretrained(
            cfg.pretrain_model_name_or_path,
            max_memory_size=cfg.max_memory_size,
            max_task_num=cfg.max_task_num,
            max_label_num=cfg.max_label_num,
            freeze_encoder=False)

    # Data is processed and recorded per task (the basic unit of meta-learning); the sample representations for each task are precomputed.
    train_dataset = MetaIntentDataset(
        model_type="text_classify_bert",
        data_file=os.path.join(
            cfg.tables,
            "base_tasks.json" if cfg.base_k is None else "base_train_%d.tsv" %
            cfg.base_k),
        meta_info_file=os.path.join(cfg.tables, "meta_info.json"),
        vocab_file=get_dir_name(cfg.pretrain_model_name_or_path) +
        "/vocab.txt",
        max_seq_length=cfg.sequence_length,
        is_training=True)

    valid_dataset = MetaIntentDataset(
        model_type="text_classify_bert",
        data_file=os.path.join(
            cfg.tables,
            "base_tasks.json" if cfg.base_k is None else "base_dev_%d.tsv" %
            cfg.base_k),
        meta_info_file=os.path.join(cfg.tables, "meta_info.json"),
        vocab_file=get_dir_name(cfg.pretrain_model_name_or_path) +
        "/vocab.txt",
        max_seq_length=cfg.sequence_length,
        is_training=False)

    cfg.checkpoint_dir = os.path.join(cfg.checkpoint_dir_base, "base")

    # Initialize the whole training run.
    trainer = Trainer(model=model,
                      train_dataset=train_dataset,
                      valid_dataset=valid_dataset,
                      cfg=cfg)

    # Process the tasks that currently need training. Since this is the meta-learner training stage, the global embeddings are initialized here and then updated during training.
    for task_key in base_task_keys:
        # *****************************
        # We do not have the corresponding label embeddings here, so they are randomly initialized for now; as described in the paper, they should be computed by the model.
        # with open(os.path.join(cfg.tables, task_key, "label_embeddings.json")) as f:
        #     label_embeddings = json.load(f)
        label_embeddings = load_label_emb(task_key)
        # *****************************
        memory_id_to_label_embedding = dict()
        for label, embedding in label_embeddings.items():
            memory_id = valid_dataset.label_to_memory_id[label]
            memory_id_to_label_embedding[memory_id] = torch.tensor(
                embedding).cuda()

        # Note: this processes and initializes all label IDs of all tasks in the dataset.
        trainer.model_module.update_global_memory(memory_id_to_label_embedding)

    cfg.save_checkpoint_steps = trainer._optimizer.total_training_steps // cfg.epoch_num
    print("Base training, %d tasks, Train size: %d; Dev size: %d" %
          (len(base_task_keys), len(train_dataset), len(valid_dataset)))
    trainer.train()

    print("Updating local memories...")
    for task_key in base_task_keys:
        # The input dataset contains many tasks (meta-learning treats a task as its basic unit); find all labels belonging to this task.
        task_memory_label_ids = [
            valid_dataset.label_to_memory_id[label]
            for label in valid_dataset.task_to_label_mapping[task_key].keys()
        ]
        # Inputs: the ID of this task and the label IDs belonging to it (each task maps to multiple label IDs).
        trainer.model_module.update_local_memory(
            valid_dataset.task_to_idx[task_key], task_memory_label_ids)
    trainer.save_checkpoint(save_best=True)