Example #1
 def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
     if 'roberta' in model_type:
         tokenizer = RobertaTokenizer.from_pretrained(model_path)
         config = RobertaConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = RobertaForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra_multitask' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         tokenizer.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         config.num_regs = num_regs
         config.vocab_size = len(tokenizer)
         model = ElectraForSequenceClassificationMultiTask.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     elif 'electra' in model_type:
         tokenizer = ElectraTokenizer.from_pretrained(model_path)
         config = ElectraConfig.from_pretrained(model_path)
         config.num_labels = num_labels
         model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
         model.eval()
         model.to(self.device)
     else:
         raise NotImplementedError()
     return config, tokenizer, model
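A minimal usage sketch for this helper, assuming it is a method of a predictor class that defines self.device; the class name and the checkpoint path below are hypothetical.

import torch

class Predictor:
    def __init__(self, model_type, model_path, num_labels):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # _init_deep_model is the method shown above
        self.config, self.tokenizer, self.model = self._init_deep_model(
            model_type, model_path, num_labels)

# 'electra' selects the plain ElectraForSequenceClassification branch
predictor = Predictor("electra", "google/electra-small-discriminator", num_labels=2)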
Example #2
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):

    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]

    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra
    generator = ElectraForMaskedLM.from_pretrained(
        'google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained(
        'google/electra-base-discriminator')
    s_discriminator = (QuantizedElectraForPreTraining
                       if quantize else ElectraForPreTraining)
    s_discriminator = s_discriminator.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg)
    # model_cfg is only needed when quantize selects QuantizedElectraForPreTraining
    model = DistillElectraForPreTraining(generator, t_discriminator,
                                         s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None,
                         optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
Example #3
def main(task='mrpc',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         mode='train',
         pred_distill=True):
    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(train_cfg_dict)
    # train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task) # task dataset class according to the task
    num_labels = len(TaskDataset.labels)
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
    ]
    data_set = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(data_set, batch_size=train_cfg.batch_size, shuffle=True)

    t_discriminator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator'
    )
    s_discriminator = QuantizedElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg
    )
    model = DistillElectraForSequenceClassification(t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir) # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)

    if mode == 'train':
        trainer.train(model_file, None, data_parallel)
    elif mode == 'eval':
        input_ids, attention_mask, token_type_ids, label_ids = TokenIndexing(tokenizer.convert_tokens_to_ids,
                                                                            TaskDataset.labels,
                                                                            output_mode,
                                                                            max_len)
        _, eval_labels = get_tensor_data(output_mode, input_ids, attention_mask, token_type_ids, label_ids)
        results = trainer.eval(model_file, output_mode, eval_labels, num_labels, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
Example #4
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2

    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12

    else:
        raise Exception('Which model? small, base, large')

    generator_config = ElectraConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )

    discriminator_config = ElectraConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_attention_heads=num_attention_heads,
    )

    model = Electra(args,
                    gen_config=generator_config,
                    dis_config=discriminator_config)
    return model
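A minimal call sketch, assuming an argparse-style namespace that carries only the fields get_model reads; the values below are hypothetical.

from argparse import Namespace

# gen_ratio shrinks the generator's hidden/intermediate sizes and head count
# relative to the discriminator
args = Namespace(model_size="small", seq_length=128, vocab_size=30522, gen_ratio=4)
model = get_model(args)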
Example #5
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        attention_mask = None
        if self.use_attention_mask:
            attention_mask = random_attention_mask(
                [self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                        self.type_vocab_size)

        config = ElectraConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            embedding_size=self.embedding_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, attention_mask
Example #6
 def _load_model(self):
     config = ElectraConfig.from_pretrained(self.backbone)
     p_encoder = ElectraEncoder.from_pretrained(self.backbone,
                                                config=config).cuda()
     q_encoder = ElectraEncoder.from_pretrained(self.backbone,
                                                config=config).cuda()
     return p_encoder, q_encoder
Example #7
    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = ElectraConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            max_position_embeddings=self.max_position_embeddings,
            type_vocab_size=self.type_vocab_size,
            initializer_range=self.initializer_range,
        )

        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
Example #8
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file,
                                     pytorch_dump_path,
                                     discriminator_or_generator):
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    if discriminator_or_generator == "discriminator":
        model = ElectraForPreTraining(config)
    elif discriminator_or_generator == "generator":
        model = ElectraForMaskedLM(config)
    else:
        raise ValueError(
            "The discriminator_or_generator argument should be either 'discriminator' or 'generator'"
        )

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model,
        config,
        tf_checkpoint_path,
        discriminator_or_generator=discriminator_or_generator)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
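A usage sketch for the conversion function; the checkpoint and output paths below are hypothetical.

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="electra_small/model.ckpt-1000000",
    config_file="electra_small/discriminator_config.json",
    pytorch_dump_path="electra_small/pytorch_model.bin",
    discriminator_or_generator="discriminator",
)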
Example #9
    def __init__(self):
        self.root_path = '..'
        self.checkpoint_path = f"{self.root_path}/checkpoint"
        self.save_ckpt_path = f"{self.checkpoint_path}/koelectra-wellnesee-text-classification.pth"
        model_name_or_path = "monologg/koelectra-base-discriminator"

        # Load the answers and categories
        self.category, self.answer = load_wellness_answer()

        ctx = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(ctx)

        # Load the saved checkpoint
        checkpoint = torch.load(self.save_ckpt_path, map_location=self.device)

        # Electra Tokenizer
        self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        self.model = koElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path,
            config=electra_config,
            num_labels=359)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()
Example #10
    def load_electra_model(self):
        parser = argparse.ArgumentParser()
        args = parser.parse_args()
        args.output_encoded_layers = True
        args.output_attention_layers = True
        args.output_att_score = True
        args.output_att_sum = True
        self.args = args
        # Parse the config files; the teacher and student models share the same vocab
        # Here we use the teacher's config and the fine-tuned teacher model; they can be swapped for the student's config and the distilled student model
        # student config:  config/chinese_bert_config_L4t.json
        # distil student model:  distil_model/gs8316.pkl
        bert_config_file_S = self.model_conf
        tuned_checkpoint_S = self.model_file
        # Load the student config and verify that its max sequence length is smaller than the sequence length in our configuration
        bert_config_S = ElectraConfig.from_json_file(bert_config_file_S)
        bert_config_S.num_labels = self.num_labels

        # Load the tokenizer
        self.predict_tokenizer = BertTokenizer(vocab_file=self.vocab_file)

        # Load the model
        self.predict_model = ElectraSPC(bert_config_S)
        assert os.path.exists(tuned_checkpoint_S), "Model file does not exist, please check"
        state_dict_S = torch.load(tuned_checkpoint_S, map_location=self.device)
        self.predict_model.load_state_dict(state_dict_S)
        if self.verbose:
            print("Model loaded")
        logger.info(f"Prediction model {tuned_checkpoint_S} loaded")
Example #11
def get_model_and_tokenizer(model_name, device):
    save_ckpt_path = CHECK_POINT[model_name]

    if model_name == "koelectra":
        model_name_or_path = "monologg/koelectra-base-discriminator"

        tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)
        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        model = koElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name_or_path,
            config=electra_config,
            num_labels=359)
    elif model_name == 'kobert':
        tokenizer = get_tokenizer()
        model = KoBERTforSequenceClassfication()

    if os.path.isfile(save_ckpt_path):
        checkpoint = torch.load(save_ckpt_path, map_location=device)
        pre_epoch = checkpoint['epoch']
        # pre_loss = checkpoint['loss']
        model.load_state_dict(checkpoint['model_state_dict'])

        print(f"load pretrain from: {save_ckpt_path}, epoch={pre_epoch}")

    return model, tokenizer
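A minimal usage sketch, assuming CHECK_POINT maps "koelectra" to a saved .pth checkpoint path.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, tokenizer = get_model_and_tokenizer("koelectra", device)
model.to(device)
model.eval()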
Example #12
    def __init__(self, root_path='../ai/chatbot'):
        checkpoint_path = f"{root_path}/checkpoint"
        self.model_path = f"{checkpoint_path}/koelectra-wellness-text-classification.pth"
        model_name_or_path = "monologg/koelectra-base-discriminator"

        checkpoint = torch.load(self.model_path, map_location=device)
        electra_config = ElectraConfig.from_pretrained(model_name_or_path)
        self.model = koElectraForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=electra_config, num_labels=359)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(device)
        self.model.eval()

        self.tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

        self.category = []
        idx = -1
        with open(root_path+'/data/wellness_data_for_text_classification.txt', 'r') as f:
            while True:
                line = f.readline()
                if not line:
                    break
                datas = line.strip().split("\t")
                if datas[1] != str(idx):
                    self.category.append(datas[2])
                    idx += 1
Example #13
def predict_pair(model_args, data_args, training_args):
    # Set seed
    set_seed(training_args.seed)

    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:
        # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)

    model.to(training_args.device)

    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()
    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))
    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )

    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
            probs = torch.softmax(logits, dim=-1)
            maxp, maxi = torch.max(probs, dim=-1)
            result = [(_i, _p) for _p, _i in zip(maxp, maxi)]
            all_probs.extend(result)

    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type), 'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
Example #14
 def __init__(self, output_size=24005, device='cpu'):
     super().__init__()
     self.device = device
     config = ElectraConfig.from_pretrained(
         'google/electra-small-discriminator')
     self.electra = AutoModel.from_config(config).to(device)
     self.output = nn.Linear(self.electra.config.hidden_size,
                             output_size).to(device)
Example #15
def revise_config(config: ElectraConfig, args: argparse.Namespace):
    """
    Revise config as we want
        1. Add multiplier between generator and discriminator
        2. Degree of weight sharing
            'no' : Share nothing
            'embedding' : Share only embedding layer
            'all' : Share all layers
        3. Set configuration as electra-small
    """

    config.multiplier_generator_and_discriminator = args.multiplier_generator_and_discriminator
    config.weight_sharing_degree = args.weight_sharing_degree
    config.rtd_loss_weight = args.rtd_loss_weight
    config.generator_num_hidden_layers = args.generator_num_hidden_layers
    config.save_log_steps = args.save_log_steps

    return config
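A small sketch of how revise_config might be driven, with hypothetical argument values mirroring the fields it reads.

import argparse
from transformers import ElectraConfig

args = argparse.Namespace(
    multiplier_generator_and_discriminator=4,
    weight_sharing_degree="embedding",   # 'no' | 'embedding' | 'all'
    rtd_loss_weight=50.0,
    generator_num_hidden_layers=12,
    save_log_steps=100,
)
config = revise_config(
    ElectraConfig.from_pretrained("google/electra-small-discriminator"), args)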
Example #16
    def __init__(self):
        super(ElectraEncoder, self).__init__()
        self.config = ElectraConfig.from_pretrained(
            os.path.join('../pretrained', 'electra', 'config.json'))

        # from_tf=True lets from_pretrained load the TF 2.0 .h5 checkpoint into the PyTorch model
        self.net = ElectraModel.from_pretrained(
            os.path.join('../pretrained', 'electra', 'tf_model.h5'),
            config=self.config, from_tf=True)

        print(self.net)
Example #17
def get_electra():
    ids = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='ids')
    att = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='att')
    tok_type_ids = keras.layers.Input(shape=(None, ),
                                      dtype=tf.int32,
                                      name='tti')

    config = ElectraConfig.from_pretrained(Config.Electra.config)
    electra_model = TFElectraModel.from_pretrained(Config.Electra.model,
                                                   config=config)

    x = electra_model(ids, attention_mask=att, token_type_ids=tok_type_ids)

    x1 = keras.layers.Dropout(0.15)(x[0])
    x1 = keras.layers.Conv1D(768, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.Conv1D(64, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.Conv1D(32, 2, padding='same')(x1)
    x1 = keras.layers.Conv1D(1, 1)(x1)
    x1 = keras.layers.Flatten()(x1)
    x1 = keras.layers.Activation('softmax', dtype='float32', name='sts')(x1)

    x2 = keras.layers.Dropout(0.15)(x[0])
    x2 = keras.layers.Conv1D(768, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.Conv1D(64, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.Conv1D(32, 2, padding='same')(x2)
    x2 = keras.layers.Conv1D(1, 1)(x2)
    x2 = keras.layers.Flatten()(x2)
    x2 = keras.layers.Activation('softmax', dtype='float32', name='ets')(x2)

    model = keras.models.Model(inputs=[ids, att, tok_type_ids],
                               outputs=[x1, x2])

    optimizer = keras.optimizers.Adam(learning_rate=6e-5)
    if Config.Train.use_amp:
        optimizer = keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, 'dynamic')
    loss = keras.losses.CategoricalCrossentropy(
        label_smoothing=Config.Train.label_smoothing)
    model.compile(loss=loss, optimizer=optimizer)

    return model
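A quick smoke-test sketch; it assumes Config.Electra.config and Config.Electra.model point at valid ELECTRA checkpoint identifiers so that from_pretrained succeeds.

model = get_electra()
model.summary()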
Example #18
 def bert_config(self):
     if self.bert_model_name.startswith('bert-'):
         return BertConfig.from_pretrained(self.bert_model_name,
                                           cache_dir=self.bert_cache_dir)
     elif 'roberta' in self.bert_model_name:
         return RobertaConfig.from_pretrained(self.bert_model_name,
                                              cache_dir=self.bert_cache_dir)
     elif self.bert_model_name.startswith('xlm-roberta-'):
         return XLMRobertaConfig.from_pretrained(
             self.bert_model_name, cache_dir=self.bert_cache_dir)
     elif 'electra' in self.bert_model_name:
         return ElectraConfig.from_pretrained(self.bert_model_name,
                                              cache_dir=self.bert_cache_dir)
     else:
         raise ValueError('Unknown model: {}'.format(self.bert_model_name))
Example #19
 def get_config(self):
     return ElectraConfig(
         vocab_size=self.vocab_size,
         hidden_size=self.hidden_size,
         num_hidden_layers=self.num_hidden_layers,
         num_attention_heads=self.num_attention_heads,
         intermediate_size=self.intermediate_size,
         hidden_act=self.hidden_act,
         hidden_dropout_prob=self.hidden_dropout_prob,
         attention_probs_dropout_prob=self.attention_probs_dropout_prob,
         max_position_embeddings=self.max_position_embeddings,
         type_vocab_size=self.type_vocab_size,
         is_decoder=False,
         initializer_range=self.initializer_range,
     )
Example #20
def define_config(name):
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base", "monologg/kobert"
    ]:
        return BertConfig.from_pretrained(name)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraConfig.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaConfig.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelConfig.from_pretrained(name)
Example #21
def _get_bert(model_type, model_path_dict):
    if model_type == 'bert':
        config = BertConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = BertModel.from_pretrained(model_path_dict['model'],
                                         config=config)
    elif model_type == 'electra':
        config = ElectraConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = ElectraModel.from_pretrained(model_path_dict['model'],
                                            config=config)
    elif model_type == 'roberta':
        config = RobertaConfig.from_pretrained(model_path_dict['config'])
        config.output_hidden_states = True
        bert = RobertaModel.from_pretrained(model_path_dict['model'],
                                            config=config)
    return bert, config
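A usage sketch, assuming the 'config' and 'model' entries point at loadable checkpoints (local directories or hub ids); the ids below are hypothetical.

model_path_dict = {
    "config": "google/electra-small-discriminator",
    "model": "google/electra-small-discriminator",
}
bert, config = _get_bert("electra", model_path_dict)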
Example #22
 def __call_model_torch(self):
     if self.model_to_use.lower() == 'bert':
         self.config = BertConfig(num_labels=2)
         self.model = BertForSequenceClassification.from_pretrained(
             'bert-base-uncased', config=self.config)
     elif self.model_to_use.lower() == 'albert':
         self.config = AlbertConfig(num_labels=2)
         self.model = AlbertForSequenceClassification.from_pretrained(
             'albert-base-v1', config=self.config)
     elif self.model_to_use.lower() == 'electra':
         self.config = ElectraConfig(num_labels=2)
         self.model = ElectraForSequenceClassification.from_pretrained(
             'google/electra-small-discriminator', config=self.config)
     elif self.model_to_use.lower() == 'distilbert':
         self.config = DistilBertConfig(num_labels=2)
         self.model = DistilBertForSequenceClassification.from_pretrained(
             'distilbert-base-uncased', config=self.config)
     else:
         print('Model not available yet.')
Example #23
def load_model(dataBunch, pretrained_path, finetuned_wgts_path, device, multi_label):

    model_type = dataBunch.model_type
    model_state_dict = None

    if torch.cuda.is_available():
        map_location = lambda storage, loc: storage.cuda()
    else:
        map_location = "cpu"

    if finetuned_wgts_path:
        model_state_dict = torch.load(finetuned_wgts_path, map_location=map_location)
    else:
        model_state_dict = None

    if multi_label is True:
        config_class, model_class, _ = MODEL_CLASSES[model_type]

        config = config_class.from_pretrained(
            str(pretrained_path), num_labels=len(dataBunch.labels)
        )

        model = model_class[1].from_pretrained(
            str(pretrained_path), config=config, state_dict=model_state_dict
        )
    else:
        if model_type == "electra":
            config = ElectraConfig.from_pretrained(
                str(pretrained_path),
                model_type=model_type,
                num_labels=len(dataBunch.labels),
            )
        else:
            config = AutoConfig.from_pretrained(
                str(pretrained_path),
                model_type=model_type,
                num_labels=len(dataBunch.labels),
            )
        model = AutoModelForSequenceClassification.from_pretrained(
            str(pretrained_path), config=config, state_dict=model_state_dict
        )

    return model.to(device)
Example #24
    def __init__(self, config: dict):
        super(Model, self).__init__()
        self.electra_cfg = ElectraConfig()
        self.electra = ElectraModel.from_pretrained(config["pretrained_dir"] +
                                                    "electra_small.index",
                                                    config=self.electra_cfg,
                                                    from_tf=True)

        self.sentence_encoder = AttentionSentenceEncoder(
            self.electra_cfg.hidden_size, config["sent_head"],
            config["max_sents"] + 1)  # 多一个位置给CLS
        self.img_encoder = SimpleImageEncoder(config["img_input_size"],
                                              config["img_output_size"],
                                              config["img_num"],
                                              dropout=config["dropout"])

        self.output_layer = OutputLayer(
            config["task"],
            self.electra_cfg.hidden_size + config["img_output_size"],
            config["output_size"], config["dropout"])
Example #25
 def __init__(self, params, name="model", **kwargs):
     super(NERwithHFBERT, self).__init__(params, name=name, **kwargs)
     self._tag_string_mapper = get_sm(self._params.tags_fn_)
     self.tag_vocab_size = self._tag_string_mapper.size() + 2
     self._tracked_layers = dict()
     if self.pretrained_bert is None:
         if self._params.use_hf_electra_model_:
             self.pretrained_bert = TFElectraModel(ElectraConfig.from_pretrained(params.pretrained_hf_model_,cache_dir=params.hf_cache_dir_))
         else:
             self.pretrained_bert = TFBertModel(BertConfig.from_pretrained(params.pretrained_hf_model_,cache_dir=params.hf_cache_dir_))
     self._dropout = tf.keras.layers.Dropout(self._params.dropout_last)
     if self._params.bet_tagging_:
         # print(self.tag_vocab_size-1)
         # half of the classes are used, plus O-class, sos, eos
         self._layer_cls = tf.keras.layers.Dense(
             int(self._tag_string_mapper.size() // 2 + 3), activation=tf.keras.activations.softmax, name="layer_cls"
         )
         self._layer_start = tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid, name="layer_start")
         self._layer_end = tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid, name="layer_end")
     elif self._params.use_crf:
         self._last_layer = tf.keras.layers.Dense(self.tag_vocab_size, name="last_layer")
         self._trans_params = tf.keras.layers.Embedding(
             self.tag_vocab_size, self.tag_vocab_size, name="trans_params"
         )
         # ,embeddings_initializer=tf.keras.initializers.Constant(1))
         if self._params.crf_with_ner_rule:
             self._penalty_factor = tf.keras.layers.Embedding(1, 1, name="penalty_factor")
             # ,embeddings_initializer=tf.keras.initializers.Constant(1))
             self._penalty_absolute = tf.keras.layers.Embedding(1, 1, name="penalty_absolute")
             # ,embeddings_initializer=tf.keras.initializers.Constant(1))
         elif self._params.crf_with_ner_forb_trans:
             self._penalty_factor = tf.constant(0.0, name="penalty_factor", dtype=tf.float32)
             self._penalty_absolute = tf.constant(-100000.0, name="penalty_absolute", dtype=tf.float32)
         self.init_crf_with_ner_rule((self.tag_vocab_size - 3) // 2)
     else:
         self._last_layer = tf.keras.layers.Dense(
             self.tag_vocab_size, activation=tf.keras.activations.softmax, name="last_layer"
         )
Example #26
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected")

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected")

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help=
        "The input training file. If a data dir is specified, will look for the file there"
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help=
        "The input evaluation file. If a data dir is specified, will look for the file there"
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help=
        "If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        default=True,
        action="store_true",
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps",
                        type=int,
                        default=100,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=10000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")

    parser.add_argument(
        "--threads",
        type=int,
        default=1,
        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        logger.warning('IF args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
        logger.warning('ELSE args.n_gpu : ' + str(args.n_gpu) +
                       ' / device : ' + str(device) + '\n')

    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.warning("Model Loading ..")

    config = ElectraConfig.from_pretrained(args.model_name_or_path)
    model = ElectraForQuestionAnswering.from_pretrained(
        args.model_name_or_path, config=config)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path,
                                                 do_lower_case=False)

    logger.warning("Model Loading Completed")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
Example #27
def get_model(args, tokenizer):
    config = ElectraConfig.from_pretrained('google/electra-base-discriminator')
    config.num_labels = 4
    config.vocab_size = tokenizer.get_vocab_size() if tokenizer else args.vocab_size
    model = ElectraForSequenceClassification(config)
    return model
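A minimal call sketch without a custom tokenizer, so vocab_size falls back to the hypothetical args value.

from argparse import Namespace

args = Namespace(vocab_size=30522)
model = get_model(args, tokenizer=None)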
Example #28
    c.lr = 1e-4
    c.layer_lr_decay = 0.8
    c.max_length = 512
elif c.size == "large":
    c.lr = 5e-5
    c.layer_lr_decay = 0.9
    c.max_length = 512
else:
    raise ValueError(f"Invalid size {c.size}")
if c.pretrained_checkpoint is None:
    c.max_length = 512  # all public models are the ++ variants, which use max_length 512

# huggingface/transformers
hf_tokenizer = ElectraTokenizerFast.from_pretrained(
    f"google/electra-{c.size}-discriminator")
electra_config = ElectraConfig.from_pretrained(
    f"google/electra-{c.size}-discriminator")

# wsc
if c.wsc_trick:
    from _utils.wsc_trick import *  # importing spacy model takes time

# logging
# light logging callback here is to only log the last score and avoid exceeding the api access limit
if c.logger == "neptune":
    import neptune
    from fastai.callback.neptune import NeptuneCallback

    class LightNeptuneCallback(NeptuneCallback):
        def after_batch(self):
            pass
Example #29
import tensorflow as tf
from transformers import (
    ElectraConfig,
    ElectraTokenizer,
    TFElectraForMaskedLM,
    TFElectraForPreTraining,
)

from electra.utils import colorize_dis, colorize_gen

os.environ["CUDA_VISIBLE_DEVICES"] = ""

# TODO: Should I use bert-base-uncased?
tokenizer = ElectraTokenizer.from_pretrained("bert-base-uncased")

gen_config = ElectraConfig.from_pretrained("google/electra-small-generator")
dis_config = ElectraConfig.from_pretrained(
    "google/electra-small-discriminator")

# gen = TFElectraForMaskedLM.from_pretrained("google/electra-small-generator")
# dis = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
gen = TFElectraForMaskedLM(config=gen_config)
dis = TFElectraForPreTraining(config=dis_config)
optimizer = tf.keras.optimizers.Adam(lr=1e-4)

# Load in WikiText-2.
filename = "/fsx/wikitext/wikitext-2-raw/wiki.test.raw"
with open(filename) as infile:
    wiki_text: str = infile.read()  # length 1,288,556

# Load in text strings.
Example #30
  save_ckpt_path = f"{checkpoint_path}/koelectra-wellnesee-text-classification.pth"
  model_name_or_path = "monologg/koelectra-base-discriminator"

  # Load the answers and categories
  category, answer = load_wellness_answer()

  ctx = "cuda" if torch.cuda.is_available() else "cpu"
  device = torch.device(ctx)

  # Load the saved checkpoint
  checkpoint = torch.load(save_ckpt_path, map_location=device)

  # Electra Tokenizer
  tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

  electra_config = ElectraConfig.from_pretrained(model_name_or_path)
  model = koElectraForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                             config=electra_config,
                                                             num_labels=359)
  model.load_state_dict(checkpoint['model_state_dict'])
  model.to(device)
  model.eval()


  while 1:
    sent = input('\nQuestion: ')  # e.g. '요즘 기분이 우울한 느낌이에요' ("I've been feeling down lately")
    data = koelectra_input(tokenizer,sent, device,512)
    # print(data)

    output = model(**data)