Example #1
    def resume_from_ckpt(self, model_module, cfg):
        if cfg.resume_from_checkpoint is None:
            return
        meta_file = cfg.resume_from_checkpoint + ".meta.bin"
        model_file = cfg.resume_from_checkpoint + ".bin"
        if "oss::" in cfg.resume_from_checkpoint:
            local_file = "easytexminer_resume_pytorch_model.meta.bin"
            io.download(model_file, local_file)
            meta_file = local_file

            local_file = "easytexminer_resume_pytorch_model.bin"
            io.download(model_file, local_file)
            model_file = local_file

        with io.open(meta_file, "rb") as f:
            meta_data = torch.load(f, map_location='cpu')
        self._start_epoch = meta_data["epoch"]
        self._start_global_step = meta_data["global_step"] + 1
        self._optimizer.load_state_dict(meta_data['optimizer'])

        logger.info("Resume from checkpoint {}".format(cfg.resume_from_checkpoint))
        logger.info("Start epoch {}".format(self._start_epoch))
        logger.info("Start step {}".format(self._start_global_step))
        logger.info("Start learning rate {:.6f}".format(self._optimizer.get_current_lr()))
        with io.open(model_file, "rb") as f:
            model_module.load_state_dict(torch.load(f, map_location='cpu'))
        logger.info("Resume checkpoint Done".format(cfg.resume_from_checkpoint))
Example #2
def main(gpu, cfg, *args, **kwargs):
    init_running_envs(gpu, cfg)
    with io.open(os.path.join(cfg.tables, "base_tasks.json")) as f:
        base_tasks_keys = []
        base_tasks = eval(json.load(f))["data"]
        for item in base_tasks:
            base_tasks_keys.append(item['taskKey'])
        base_task_keys = set(base_tasks_keys)
    with io.open(os.path.join(cfg.tables, "meta_info.json")) as f:
        meta_info_list = eval(json.load(f))["data"]
    all_task_keys = {t["taskKey"] for t in meta_info_list}
    other_task_keys = all_task_keys - base_task_keys

    # There are many base tasks; pick base_k of them for training. If base_k is not set, all base tasks are trained by default.
    if cfg.base_k:
        with io.open(
                os.path.join(cfg.tables,
                             "base_tasks_%d.json" % cfg.base_k)) as f:
            base_task_keys = set(json.load(f))
    print("base task num: {}".format(len(base_task_keys)))
    if cfg.mode == "train":
        if cfg.train_type == "base":
            base_mtl_training(cfg, base_task_keys)
        elif cfg.train_type == "lifelong":
            # other_task_keys: the base tasks are only used for initialization
            meta_lifelong_training(cfg, sorted(other_task_keys))
        else:
            raise NotImplementedError
    elif cfg.mode == "predict":
        predict_meta_lifelong(cfg)
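
The eval-after-json.load pattern above implies the task files are double-encoded: the JSON document is a string whose content is a Python dict literal. A minimal sketch of producing a file in that assumed format (file name and task keys are illustrative):

import json

base_tasks = {"data": [{"taskKey": "task_0"}, {"taskKey": "task_1"}]}  # hypothetical tasks
with open("base_tasks.json", "w") as f:
    json.dump(str(base_tasks), f)  # store the dict literal as a JSON string

with open("base_tasks.json") as f:
    assert eval(json.load(f))["data"][0]["taskKey"] == "task_0"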
Example #3
 def readlines_from_file(self, data_file):
     with io.open(data_file) as f:
         if self.skip_first_line:
             f.readline()
         if 'json' not in data_file:
             raise ValueError("Expected a json data file, got %s" % data_file)
         # Double-encoded JSON, as in Example #2.
         task_rows = eval(json.load(f))['data']
     with open(
             '/apsarapangu/disk3/zhangtaolin.ztl/MeLL_pytorch/data/all_data.json',
             'r') as f:
         all_data = eval(json.load(f))['data']
         # Map each task to its dataset, keeping the first occurrence.
         all_data_dict = {}
         for item in all_data:
             task_key = item['taskKey']
             if task_key not in all_data_dict:
                 all_data_dict[task_key] = item['dataset']
     if isinstance(task_rows, dict):
         task_rows = [task_rows]
     data_rows = []
     for item in task_rows:
         task_key = item['taskKey']
         for sample in all_data_dict[task_key]:
             data_rows.append(
                 task_key + '\t' + sample['text'] + '\t' + sample['label'])
     return data_rows
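
Each returned row is a tab-separated "taskKey\ttext\tlabel" string. A consumption sketch (the reader instance and file name are illustrative):

for row in reader.readlines_from_file("lifelong_task.json"):  # hypothetical file
    task_key, text, label = row.split("\t")  # assumes text contains no tabs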
Example #4
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    index = 0
    with io.open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()
            vocab[token] = index
            index += 1
    return vocab
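
A usage sketch (the vocab path is illustrative):

vocab = load_vocab("vocab.txt")                           # hypothetical path
ids_to_tokens = {idx: tok for tok, idx in vocab.items()}  # inverse lookup
unk_id = vocab.get("[UNK]")                               # None if the token is absent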
Example #5
 def save_vocabulary(self, vocab_path):
     """Save the tokenizer vocabulary to a directory or file."""
     index = 0
     if os.path.isdir(vocab_path):
         vocab_file = os.path.join(vocab_path, VOCAB_NAME)
     else:
         vocab_file = vocab_path
     with io.open(vocab_file, "w", encoding="utf-8") as writer:
         for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
             if index != token_index:
                 logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
                                " Please check that the vocabulary is not corrupted!".format(vocab_file))
                 index = token_index
             writer.write(token + u'\n')
             index += 1
     return vocab_file
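
Because tokens are written one per line in index order, the saved file can be read back with load_vocab from Example #4. A round-trip sketch, assuming the vocabulary indices are consecutive (the directory is illustrative):

saved_file = tokenizer.save_vocabulary("out_dir")  # hypothetical existing directory
assert load_vocab(saved_file) == tokenizer.vocab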
Example #6
def get_model_type_from_pretrained(pretrained_model_name_or_path):
    config_file = os.path.join(get_dir_name(pretrained_model_name_or_path),
                               CONFIG_NAME)
    with io.open(config_file) as f:
        config_json = json.load(f)

    if "model_type" not in config_json:
        warnings.warn(
            "`model_type` not found in %s, set it to `bert` by default." %
            config_file)
        model_type = "bert"
    else:
        model_type = config_json["model_type"]
    return model_type
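
A sketch of the config.json lookup this performs, assuming CONFIG_NAME is "config.json" and get_dir_name returns the directory itself (directory and contents are illustrative):

import json, os

os.makedirs("ckpt_dir", exist_ok=True)  # hypothetical checkpoint directory
with open(os.path.join("ckpt_dir", "config.json"), "w") as f:
    json.dump({"model_type": "albert", "hidden_size": 768}, f)
# get_model_type_from_pretrained("ckpt_dir") would now return "albert";
# without the "model_type" key it warns and falls back to "bert".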
Example #7
def export_easytransfer_train_config(saved_path, vocab_dir,
                                     label_enumerate_values, sequence_length,
                                     model_name, extra_model_params):
    """ Save `train_config.json` for EasyTransfer AppZoo

        Args:
            saved_path (`str`) : the path of `train_config.json`
            vocab_dir (`str`) : the directory of `vocab.txt`
            label_enumerate_values (`list`) : The enumerate values of the label
            sequence_length (`int`) : Sequence Length while pre-processing
            model_name (`str`) : The model name of AppZoo, e.g. text_classify_bert
            extra_model_params (`dict`) : Extra key/value pairs merged into the model config
    """
    if isinstance(label_enumerate_values, list):
        num_label = len(label_enumerate_values)
        label_enumerate_values = ",".join(label_enumerate_values)
    else:
        num_label = None

    if "oss://" in vocab_dir:
        pretrain_model_name_or_path = vocab_dir + "/model.ckpt"
    else:
        pretrain_model_name_or_path = os.path.abspath(
            vocab_dir) + "/model.ckpt"

    model_config_dict = dict(extra_model_params)

    model_config_dict[
        "pretrain_model_name_or_path"] = pretrain_model_name_or_path
    model_config_dict["model_name"] = model_name
    model_config_dict["num_labels"] = num_label
    model_config_dict["dropout_rate"] = 0.0

    train_config_dict = {
        "_config_json": {
            "model_config": model_config_dict
        },
        "model_config": model_config_dict,
        "label_enumerate_values": label_enumerate_values,
        "sequence_length": sequence_length
    }
    with io.open(saved_path, "w") as f:
        f.write(json.dumps(train_config_dict, ensure_ascii=False, indent=4))
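
A call sketch with illustrative arguments (paths and values are hypothetical; the output directory must already exist):

export_easytransfer_train_config(
    saved_path="out_dir/train_config.json",
    vocab_dir="out_dir",
    label_enumerate_values=["0", "1"],
    sequence_length=128,
    model_name="text_classify_bert",
    extra_model_params={"hidden_size": 768})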
Example #8
    def __init__(self,
                 data_file,
                 meta_info_file,
                 vocab_file,
                 max_seq_length,
                 max_label_num=10,
                 **kwargs):
        super(MetaIntentDataset, self).__init__(data_file, **kwargs)
        self.tokenizer = Tokenizer(backend="bert", vocab_file=vocab_file)
        self.max_seq_length = max_seq_length
        self.max_label_num = max_label_num

        with io.open(meta_info_file) as f:
            # Double-encoded JSON, as in Example #2.
            meta_info_json = eval(json.load(f))['data']

        self.task_to_idx = dict()
        self.task_to_label_mapping = dict()
        self.task_to_label_features = dict()
        self.label_to_memory_id = {"PAD": 0}

        for task_label_info in meta_info_json:
            labels = task_label_info["labelMap"]

            # labels contained in this task
            label_map = {label: idx for idx, label in enumerate(labels)}

            # task_key: the task name
            task_key = task_label_info["taskKey"]

            self.task_to_idx[task_key] = len(self.task_to_idx)
            self.task_to_label_mapping[task_key] = label_map

            for label in labels:
                # Note: different tasks may reuse the same label name, but within the same dataset an identical label name is assumed to carry the same meaning.
                if label not in self.label_to_memory_id:
                    self.label_to_memory_id[label] = len(
                        self.label_to_memory_id)
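
A sketch of the meta_info.json structure this constructor assumes, double-encoded as in Example #2 (task keys and labels are illustrative):

meta_info = {
    "data": [
        {"taskKey": "task_0", "labelMap": ["greeting", "complaint"]},
        {"taskKey": "task_1", "labelMap": ["greeting", "refund"]},
    ]
}
# "greeting" appears in both tasks but is assigned a single shared memory id.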
Example #9
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        config_cls=None,
                        adapter_fn=None,
                        *args,
                        **kwargs):
        state_dict = kwargs.pop('state_dict', None)
        config_dict = kwargs.pop('config_dict', None)

        if config_cls is None:
            if config_dict:
                model_type = config_dict.get("model_type", "bert")
            else:
                model_type = get_model_type_from_pretrained(
                    pretrained_model_name_or_path)
            if model_type in ["bert", "roberta"]:
                config_cls = BertConfig
            elif model_type == "albert":
                config_cls = AlbertConfig
            elif model_type == "gpt2":
                config_cls = GPT2Config
            else:
                raise NotImplementedError
        if config_dict:
            config = config_cls.from_dict(config_dict)
        else:
            config = config_cls.from_json_file(
                os.path.join(get_dir_name(pretrained_model_name_or_path),
                             CONFIG_NAME))

        # Instantiate model.
        model = cls(config, *args, **kwargs)

        # Check if the model is from tensorflow checkpoint
        is_tf_checkpoint = False
        if io.exists(pretrained_model_name_or_path + ".index") or \
                io.exists(pretrained_model_name_or_path + ".meta"):
            is_tf_checkpoint = True

        if is_tf_checkpoint:
            if adapter_fn:
                adapter_fn(model, pretrained_model_name_or_path)
            else:
                adapter.load_bert_tf_checkpoint_weights(
                    model, pretrained_model_name_or_path)

        if state_dict is None:
            weights_path = os.path.join(pretrained_model_name_or_path,
                                        WEIGHTS_NAME)
            if not io.exists(weights_path):
                return model
            logger.info("Loading model {}".format(weights_path))
            with io.open(weights_path, "rb") as f:
                state_dict = torch.load(f, map_location='cpu')

        # Load from a PyTorch state_dict
        # Rename TF-style LayerNorm parameters: gamma -> weight, beta -> bias
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        if config.model_type == "gpt2":
            new_state_dict = {
                "gpt2." + key.replace("transformer.", ""): val
                for key, val in state_dict.items()
            }
            state_dict = new_state_dict

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        start_prefix = ''
        if not hasattr(model, 'bert') and any(
                s.startswith('bert.') for s in state_dict.keys()):
            start_prefix = 'bert.'

        logger.info('Loading model...')
        load(model, prefix=start_prefix)
        logger.info('Load finished!')
        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".
                format(model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError(
                'Error(s) in loading state_dict for {}:\n\t{}'.format(
                    model.__class__.__name__, "\n\t".join(error_msgs)))

        return model
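
A call sketch (the subclass name and checkpoint directory are illustrative):

model = MyBertClassifier.from_pretrained("./bert_ckpt_dir")  # hypothetical subclass of this base class
# The state_dict / config_dict kwargs bypass reading pytorch_model.bin / config.json from disk.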
Example #10
    def save_checkpoint(self, save_best=False):
        if not self.cfg.is_master_node:
            return

        # Save config.json
        output_config_file = os.path.join(self.cfg.checkpoint_dir, CONFIG_NAME)
        with io.open(output_config_file, "w") as f:
            f.write(self.model_module.arch)

        # Save vocab.txt
        if self.cfg.pretrain_model_name_or_path is not None:
            io.copy(os.path.join(get_dir_name(self.cfg.pretrain_model_name_or_path), "vocab.txt"),
                    os.path.join(get_dir_name(self.cfg.checkpoint_dir), "vocab.txt"))

        # Save the model
        model_to_save_prefix = "pytorch_model" if save_best else "pytorch_model_step_%d" % (self._global_step + 1)

        with io.open(os.path.join(self.cfg.checkpoint_dir, model_to_save_prefix + ".bin"), "wb") \
                as output_model_file:
            torch.save(self.model_module.state_dict(), output_model_file)

        meta_data = {
            "epoch": self._current_epoch,
            "global_step": self._global_step,
            "optimizer": self._optimizer.state_dict()
        }

        with io.open(os.path.join(self.cfg.checkpoint_dir, model_to_save_prefix + ".meta.bin"), "wb") \
                as output_model_file:
            torch.save(meta_data, output_model_file)

        if not save_best:
            return

        if hasattr(self.model_module, "model_name"):
            # If the student is pre-defined EasyTransfer AppZoo model
            # Save train_config.json, model.ckpt.* for EasyTransfer
            logger.info("Export tensorflow checkpoint (%s format) to %s" % (
                self.cfg.export_tf_checkpoint_type,
                os.path.join(get_dir_name(self.cfg.checkpoint_dir), "model.ckpt")))
            exporter.export_easytransfer_train_config(
                saved_path=os.path.join(self.cfg.checkpoint_dir, "train_config.json"),
                vocab_dir=get_dir_name(self.cfg.checkpoint_dir),
                label_enumerate_values=self._valid_loader.dataset.label_enumerate_values,
                sequence_length=self.cfg.sequence_length,
                model_name=self.model_module.model_name,
                extra_model_params=self.model_module.extra_model_params)

            if self.cfg.export_tf_checkpoint_type == "easytransfer":
                exporter.export_pytorch_checkpoint_to_tf(
                    model=self.model_module,
                    ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                    bert_output_prefix="bert_pre_trained_model",
                    appended_val_map=(("classifier", "app/ez_dense"),),
                    appended_tensors_to_transpose=("classifier.weight",))
            elif self.cfg.export_tf_checkpoint_type == "google":
                exporter.export_pytorch_checkpoint_to_tf(
                    model=self.model_module,
                    ckpt_dir=get_dir_name(self.cfg.checkpoint_dir),
                    bert_output_prefix="",
                    appended_val_map=(("classifier.weight", "output_weights"),
                                      ("classifier.bias", "output_bias")),
                    appended_tensors_to_transpose=())
            else:
                raise RuntimeError("Invalid export_tf_checkpoint_type %s" % self.cfg.export_tf_checkpoint_type)
        # This is a hack: re-pin this process to its own GPU after the export.
        torch.cuda.set_device(self.cfg.local_rank)
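
The "<prefix>.bin" / "<prefix>.meta.bin" pair written here is exactly what resume_from_ckpt in Example #1 reads back. A call sketch (the trainer instance is illustrative):

trainer.save_checkpoint(save_best=True)  # writes pytorch_model.bin and pytorch_model.meta.bin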
Example #11
 def to_json_file(self, json_file_path):
     """ Save this instance to a json file."""
     with io.open(json_file_path, "w", encoding='utf-8') as writer:
         writer.write(self.to_json_string())
Example #12
 def from_json_file(cls, json_file):
     """Constructs a `BertConfig` from a json file of parameters."""
     with io.open(json_file, "r", encoding='utf-8') as reader:
         text = reader.read()
     return cls.from_dict(json.loads(text))
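
to_json_file and from_json_file round-trip a config. A sketch using BertConfig from Example #13, assuming to_json_string serializes the instance attributes (the path is illustrative):

config = BertConfig(vocab_size_or_config_json_file=30522)
config.to_json_file("bert_config.json")
restored = BertConfig.from_json_file("bert_config.json")
assert restored.vocab_size == config.vocab_size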
Example #13
    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 pre_trained='',
                 training=''):
        """Constructs BertConfig.

        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The stddev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        self.model_type = "bert"
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.pre_trained = pre_trained
        self.training = training
        if isinstance(vocab_size_or_config_json_file,
                      str) or (sys.version_info[0] == 2 and isinstance(
                          vocab_size_or_config_json_file, unicode)):
            with io.open(vocab_size_or_config_json_file, "r",
                         encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int) "
                "or the path to a pretrained model config file (str)")
Example #14
def predict_meta_lifelong(cfg):
    if cfg.use_lru:
        print("use LRU model")
        model_cls = LRUMetaLabelEnhancedBertClassify
    else:
        model_cls = MetaLabelEnhancedBertClassify
    model = model_cls.from_pretrained(
        cfg.checkpoint_dir,
        max_memory_size=cfg.max_memory_size,
        max_task_num=cfg.max_task_num,
        max_label_num=cfg.max_label_num,
        freeze_encoder=True,
        is_testing=True).cuda()
    model.eval()

    test_dataset = MetaIntentDataset(
        data_file=os.path.join(cfg.tables, 'text_classify_49',
                               "lifelong_task.json"),
        meta_info_file=os.path.join(cfg.tables, "meta_info.json"),
        vocab_file=os.path.join(cfg.checkpoint_dir_base, "base", "vocab.txt"),
        max_seq_length=cfg.sequence_length,
        max_label_num=cfg.max_label_num,
        is_training=False)

    testloader = DataLoader(test_dataset,
                            batch_size=cfg.eval_batch_size,
                            shuffle=False,
                            collate_fn=test_dataset.batch_fn)

    fout = io.open(cfg.outputs, "w")
    fout.write('pred_label' + "\t" + 'task' + "\t" + 'label' + "\n")
    print("num eval batches: {}".format(len(testloader)))
    for batch in tqdm(testloader):
        batch = {
            key: val.cuda() if isinstance(val, torch.Tensor) else val
            for key, val in batch.items()
        }
        with torch.no_grad():
            model_outputs = model(batch)
        logits = model_outputs["logits"]
        pred_ids = torch.argmax(logits, dim=-1).tolist()
        label_ids = batch["label_ids"].tolist()

        tasks = batch["tasks"]

        for i, task in enumerate(tasks):
            label_mapping = test_dataset.task_to_label_mapping[task]
            idx_to_label = {idx: label for label, idx in label_mapping.items()}
            # Fall back to label 0 if the prediction is outside this task's label range.
            pred_label = idx_to_label.get(pred_ids[i], idx_to_label[0])
            label = idx_to_label[label_ids[i]]
            fout.write(pred_label + "\t" + task + "\t" + label + "\n")
    fout.close()
    print("Writing to %s finished. " % cfg.outputs)