Code Example #1
File: config.py  Project: zhangzx-uiuc/AMR-IE
 def bert_config(self):
     if self.bert_model_name.startswith('bert-'):
         return BertConfig.from_pretrained(self.bert_model_name,
                                           cache_dir=self.bert_cache_dir)
     elif self.bert_model_name.startswith('roberta-'):
         return RobertaConfig.from_pretrained(self.bert_model_name,
                                              cache_dir=self.bert_cache_dir)
     elif self.bert_model_name.startswith('xlm-roberta-'):
         return XLMRobertaConfig.from_pretrained(self.bert_model_name,
                                                 cache_dir=self.bert_cache_dir)
     else:
         raise ValueError('Unknown model: {}'.format(self.bert_model_name))
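
The prefix dispatch above selects a config class from the model-name prefix. As a minimal alternative sketch (assuming the same transformers library), AutoConfig resolves the matching config class directly from the checkpoint name, so the explicit branching is only needed when the concrete classes must be referenced:

from transformers import AutoConfig

# AutoConfig returns an XLMRobertaConfig instance for this checkpoint name
config = AutoConfig.from_pretrained('xlm-roberta-base')
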
Code Example #2
File: models.py  Project: jhashekhar/multilingual-clf
    def __init__(self):
        super(XLMRobertaLargeTC, self).__init__()
        config = XLMRobertaConfig.from_pretrained('xlm-roberta-large',
                                                  output_hidden_states=True)
        self.xlm_roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large',
                                                           config=config)

        self.fc = nn.Linear(config.hidden_size, 1)
        self.dropout = nn.Dropout(p=0.2)

        # initialize the classifier head: small random weights, zero bias
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.zeros_(self.fc.bias)
Code Example #3
from transformers import (BertConfig, ElectraConfig, FunnelConfig,
                          XLMRobertaConfig)


def define_config(name):
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base", "monologg/kobert"
    ]:
        return BertConfig.from_pretrained(name)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraConfig.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaConfig.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelConfig.from_pretrained(name)
Code Example #4
File: RelationExtractor.py  Project: amkatyshev/OE
    def load_model(self, relation_model: str):
        super().load_model(relation_model)
        if not isinstance(relation_model, str):
            raise ArgumentError("Argument 'relation_model' must be a str")
        if len(relation_model) == 0:
            raise ArgumentError(
                "Argument 'relation_model' filename must not be empty")
        if not os.path.isfile(relation_model):
            raise RelationModelError("Relation model file doesn't exist")
        print('Loading model for relation extraction...', end=' ')

        args = {
            'NUM_LABELS': len(self.data.relation_labels),
            'DROPOUT_RATE': 0.1,
            'LEARNING_RATE': 2e-5,
            'EPOCHS': 5,
            'MAX_SEQUENCE_LENGTH': 384,
            'BATCH_SIZE': 16,
            'ADAM_EPSILON': 1e-8,
            'GRADIENT_ACCUMULATION_STEPS': 1,
            'MAX_GRAD_NORM': 1.0,
            'LOGGING_STEPS': 250,
            'SAVE_STEPS': 250,
            'WEIGHT_DECAY': 0.0,
            'NUM_WARMUP_STEPS': 0,
        }
        try:
            self.tokenizer.add_special_tokens({
                "additional_special_tokens":
                ["<e1>", "</e1>", "<e2>", "</e2>"]
            })
            config = XLMRobertaConfig.from_pretrained(
                'xlm-roberta-base', num_labels=args['NUM_LABELS'])
            self.model = RBERT.from_pretrained(relation_model,
                                               config=config,
                                               args=args)
            self.model.to(self.device)
        except Exception as exc:
            raise RelationModelError(
                'Error while loading the relation model. '
                'The relation model should be a PyTorch model created for relation extraction.'
            ) from exc
        print('OK')
Code Example #5
def train():

    seed_everything(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # set up model hyperparameters
    # the config itself holds no trained weights, so from_pretrained is used to load them

    # bert_config = BertConfig.from_pretrained(MODEL_NAME)
    # bert_config.num_labels = 42
    # model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)

    # Auto
    model_config = XLMRobertaConfig.from_pretrained(args.model_name)
    model_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        args.model_name, config=model_config)

    # load model and tokenizer
    # MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    # roberta: https://huggingface.co/transformers/model_doc/xlmroberta.html
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # label = dataset['label'].values

    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=0.2,
                                                  random_state=args.seed)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer)

    tokenized_train_label = train_dataset['label'].values
    tokenized_val_label = val_dataset['label'].values

    # train_datasets = TokenDataset(train_dataset, tokenizer)
    # val_datasets = TokenDataset(val_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, tokenized_train_label)
    RE_val_dataset = RE_Dataset(tokenized_val, tokenized_val_label)

    # print(model.parameters)
    model.to(device)
    model = torch.nn.DataParallel(model)

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        pin_memory=torch.cuda.is_available(),
        shuffle=True,
    )
    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
    )

    optimizer = AdamW(model.parameters(),
                      lr=args.lr,
                      weight_decay=args.weight_decay)
    loss_fn = LabelSmoothingLoss(smoothing=0.5)
    # loss_fn = nn.CrossEntropyLoss()

    # t_total = len(train_loader) * args.epoch
    t_total = args.epoch
    warmup_step = int(t_total * args.warmup_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    log_dir = ""
    log_list = glob("./logs/*")
    if len(log_list) == 0:
        log_dir = "./logs/exp1"
    else:
        log_list = [int(log[-1]) for log in log_list]
        log_dir = "./logs/exp" + str(max(log_list) + 1)

    logger = SummaryWriter(log_dir=log_dir)

    scaler = GradScaler()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    import time

    best_acc = 0.0  # track the best validation accuracy across epochs
    for epoch in tqdm(range(args.epoch)):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0
        model.train()
        for batch_id, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)

            # loss.backward()
            # optimizer.step()

            scaler.scale(loss).backward()

            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            scaler.step(optimizer)
            scaler.update()

            train_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
            train_loss += loss

            if (batch_id + 1) % args.logging_steps == 0:
                train_loss = train_loss.data.cpu().numpy()
                print(
                    f"[Train] epoch {epoch + 1} | batch_id {batch_id + 1} | loss {(train_loss) / args.logging_steps:.4f} | train_acc {train_acc / args.logging_steps:.4f}"
                )
                logger.add_scalar("Train/loss",
                                  train_loss / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                logger.add_scalar("Train/acc", train_acc / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                train_acc = 0.0
                train_loss = 0.0

        # scheduler.step()

        print("\nStart Validation Step!")
        with torch.no_grad():
            model.eval()
            for batch_id, batch in enumerate(tqdm(val_loader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)
                val_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
                val_loss += loss

            print(
                f"[Val] epoch {epoch + 1} | val_acc {val_acc / (batch_id + 1):.4f}"
            )
            logger.add_scalar("Val/loss", val_loss / (batch_id + 1), epoch)
            logger.add_scalar("Val/acc", val_acc / (batch_id + 1), epoch)

            if val_acc >= best_acc:
                best_acc = val_acc
                # torch.save(model.state_dict(), os.path.join(args.output_dir, "saved_" + str(epoch) + ".pth"))
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "best.pth"))
                print("Saved best acc model...")

        scheduler.step()

    torch.save(model.state_dict(), os.path.join(args.output_dir, "last.pth"))
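
The comment at the top of this example notes that a config object carries architecture hyperparameters, not trained weights. A minimal sketch of that distinction, using public checkpoint names as an assumption rather than anything from the original project:

from transformers import XLMRobertaConfig, XLMRobertaForSequenceClassification

config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")
config.num_labels = 42

# architecture only: weights are randomly initialized
untrained_model = XLMRobertaForSequenceClassification(config)

# pretrained encoder weights loaded from the checkpoint; the classification head is newly initialized
pretrained_model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base", config=config)
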
Code Example #6
def train():
    # load model and tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    #dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(
        train_dataset,
        tokenizer)  # keys: input_ids, token_type_ids, attention_mask
    #tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    #RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'RUNNING ON {device}')
    # setting model hyperparameter
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.parameters
    model.to(device)
    # Besides the options used here, TrainingArguments supports many more options.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=3,  # number of total save model.
        save_steps=500,  # model saving step.
        num_train_epochs=10,  # total number of training epochs
        learning_rate=1e-5,  # learning_rate
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=32,  # batch size for evaluation
        warmup_steps=300,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        # evaluation_strategy='steps', # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        # eval_steps = 500,            # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        # eval_dataset=RE_dev_dataset,             # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
Code Example #7
        "backbone_name": "xlm-roberta-large",
        "model_name": "mlm_disentangle_default",
        "gradient_acc_size": 16,
        "batch_size": 2,
        "max_step": 3375,
        "log_step": 225,
        "num_frozen_layers": 18,
        "mlm_lr": 4e-4,
        "mlm_beta1": 0.9,
        "mlm_beta2": 0.98,
        "mlm_eps": 1e-6
    }
}""",
    cls=ExperimentConfigSerializer,
)
XLMRConfig = XLMRobertaConfig.from_pretrained("xlm-roberta-large")
setattr(XLMRConfig, "discriminators", exp_config["discriminators"])
model = MultitaskModel.create_untrained(
    backbone_name="xlm-roberta-large",
    task_dict={
        "mlm": {
            "type": XLMRobertaForMaskedLM,
            "config": XLMRobertaConfig.from_pretrained("xlm-roberta-large"),
        },
        "disentangle": {
            "type": XLMRobertaForDisentanglement,
            "config": XLMRConfig,
        },
    },
)
import torch
Code Example #8
def load_model(args):
    if 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'fasttext' in args['model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None

        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]

        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map,
                                    word_tokenizer,
                                    lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
        config.hidden_size = 300
        config.num_attention_heads = 10
        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))

    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        model_class = None
        if 'sequence_classification' == args['task']:
            model_class = AlbertForSequenceClassification if 'lite' in args[
                'model_checkpoint'] else BertForSequenceClassification
        elif 'token_classification' == args['task']:
            model_class = AlbertForWordClassification if 'lite' in args[
                'model_checkpoint'] else BertForWordClassification
        elif 'multi_label_classification' == args['task']:
            model_class = AlbertForMultiLabelClassification if 'lite' in args[
                'model_checkpoint'] else BertForMultiLabelClassification
        model = model_class.from_pretrained(args['model_checkpoint'],
                                            config=config)
    return model, tokenizer, vocab_path, config_path
Code Example #9
def load_eval_model(args):
    vocab_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/vocab.txt'
    config_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/config.json'
    model_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/best_model_0.th'

    # Load for word2vec and fasttext
    if 'word2vec' in args['model_type'] or 'fasttext' in args['model_type']:
        emb_path = args['embedding_path'][args['model_type']]
        model, tokenizer = load_word_embedding_model(
            args['model_type'],
            args['task'],
            vocab_path,
            args['word_tokenizer_class'],
            emb_path,
            args['num_labels'],
            lower=args['lower'])
        return model, tokenizer

    # Load config & tokenizer
    if 'albert' in args['model_type']:
        config = AlbertConfig.from_json_file(config_path)
        tokenizer = BertTokenizer(vocab_path)
    elif 'babert' in args['model_type']:
        config = BertConfig.from_json_file(config_path)
        tokenizer = BertTokenizer(vocab_path)
    elif 'scratch' in args['model_type']:
        config = BertConfig.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif 'bert-base-multilingual' in args['model_type']:
        config = BertConfig.from_pretrained(args['model_type'])
        tokenizer = BertTokenizer.from_pretrained(args['model_type'])
    elif 'xlm-mlm-100-1280' in args['model_type']:
        config = XLMConfig.from_pretrained(args['model_type'])
        tokenizer = XLMTokenizer.from_pretrained(args['model_type'])
    elif 'xlm-roberta' in args['model_type']:
        config = XLMRobertaConfig.from_pretrained(args['model_type'])
        tokenizer = XLMRobertaTokenizer.from_pretrained(args['model_type'])
    else:
        raise ValueError('Invalid `model_type` argument values')

    # Get model class
    base_cls, pred_cls = get_model_class(args['model_type'], args['task'])

    # Adjust config
    if type(args['num_labels']) == list:
        config.num_labels = max(args['num_labels'])
        config.num_labels_list = args['num_labels']
    else:
        config.num_labels = args['num_labels']

    # Instantiate model
    model = pred_cls(config=config)
    base_model = base_cls.from_pretrained(model_path,
                                          from_tf=False,
                                          config=config)

    # Plug pretrained base model to classification model
    if 'bert' in model.__dir__():
        model.bert = base_model
    elif 'albert' in model.__dir__():
        model.albert = base_model
    elif 'roberta' in model.__dir__():
        model.roberta = base_model
    elif 'transformer' in model.__dir__():
        model.transformer = base_model
    else:
        raise ValueError(
            'Model attribute not found, is there any change in the `transformers` library?'
        )

    return model, tokenizer
Code Example #10
def load_model(args):
    if 'albert-large-wwmlm-512' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_vocab_uncased_30000.txt"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_albert_large_config.json"
        )
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_pytorch_albert_large_512_629k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model
    elif 'albert-base-wwmlm-512' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_albert_base_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_pytorch_model_albert_base_162k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'albert-large-wwmlm-128' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_albert_large_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_pytorch_albert_large_128_500k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'babert-bpe-mlm-large-512' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_bert_large_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_pytorch_babert_uncased_large_512_dup10-5_1120k.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'albert-base-uncased-112500' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-112500/vocab.txt"
        config_path = "../embeddings/albert-base-uncased-112500/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-112500/albert_base_uncased_112500.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'albert-base-uncased-96000' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-96000/vocab.txt"
        config_path = "../embeddings/albert-base-uncased-96000/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-96000/albert_base_uncased_96000.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'albert-base-uncased-191k' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_albert_base_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_pytorch_model_albert_base_191k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'babert-opensubtitle' == args['model_checkpoint']:
        # babert-opensubtitle
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-opensubtitle/vocab.txt"
        config_path = "../embeddings/babert-opensubtitle/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-opensubtitle/model.ckpt-1000000.index",
            from_tf=True,
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-large-uncased-1100k' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_pytorch_model_babert_large_1100k.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-large-uncased-1m' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_pytorch_model_babert_large_1mil.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-base-512' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_pytorch_model_babert_base_uncased_512.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-large-uncased' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_pytorch_model_babert_large_778500.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-uncased-128-dup10-5' == args['model_checkpoint']:
        # babert_bpe_wwmlm
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/vocab.txt"
        config_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/pytorch_model.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)

    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'word2vec' in args['model_checkpoint'] or 'fasttext' in args[
            'model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None

        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]

        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map,
                                    word_tokenizer,
                                    lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        if args['model_checkpoint'] == 'word2vec-twitter':
            embeddings = gen_embeddings(vocab_list, emb_path)
            config.hidden_size = 400
            config.num_attention_heads = 8

        if args['model_checkpoint'] == 'fasttext-cc-id' or args[
                'model_checkpoint'] == 'fasttext-cc-id-300-no-oov-uncased' or args[
                    'model_checkpoint'] == 'fasttext-4B-id-300-no-oov-uncased':
            embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
            config.hidden_size = 300
            config.num_attention_heads = 10

        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))

    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)

    return model, tokenizer, vocab_path, config_path
Code Example #11
File: train.py  Project: bcaitech1/p2-klue-deokisys
def train():
    # load model and tokenizer
    # "bert-base-multilingual-cased",kykim/bert-kor-base,roberta-large-mnli
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(  # for the RoBERTa family
        MODEL_NAME
    )  # alternatively: AutoTokenizer.from_pretrained(MODEL_NAME)
    # AutoTokenizer resolves the right tokenizer class automatically.

    # split the dataset
    datas = pd.read_csv("/opt/ml/input/data/train/train.tsv",
                        delimiter='\t',
                        header=None)
    train, val = train_test_split(datas, test_size=0.2, random_state=42)
    # train = pd.DataFrame(data=train)
    train.to_csv('/opt/ml/input/data/train/train_train.tsv',
                 sep='\t',
                 header=None,
                 index=False)
    val.to_csv('/opt/ml/input/data/train/train_val.tsv',
               sep='\t',
               header=None,
               index=False)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_label = train_dataset['label'].values

    dev_dataset = load_data("/opt/ml/input/data/train/train_val.tsv")
    dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # set up model hyperparameters
    bert_config = XLMRobertaConfig.from_pretrained(
        MODEL_NAME)  # or BertConfig.from_pretrained(MODEL_NAME) for BERT checkpoints
    # print(bert_config)

    # bert_config.vocab_size = 42004  # setting vocab_size directly did not register the 4 added tokens
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(  # or AutoModelForSequenceClassification.from_pretrained
        MODEL_NAME,
        config=bert_config)
    model.resize_token_embeddings(len(tokenizer))  # resize the embedding matrix after adding new tokens
    # model = BertForSequenceClassification(bert_config)  # builds the model without loading pretrained weights
    model.parameters
    model.to(device)

    # Besides the options used here, TrainingArguments supports many more options.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=3,  # number of total save model.
        save_steps=500,  # model saving step.
        num_train_epochs=10,  # total number of training epochs
        learning_rate=1e-5,  # learning_rate
        per_device_train_batch_size=32,  # batch size per device during training
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        per_device_eval_batch_size=8,  # batch size for evaluation
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=500,  # evaluation step.
        # dataloader_num_workers=4,
        # label_smoothing_factor=0.5

        # reportedly only the highest-scoring checkpoint is kept
        # load_best_model_at_end=True,
        # metric_for_best_model="accuracy"
    )
    trainer = Trainer(
        # the instantiated 🤗 Transformers model to be trained
        model=model,
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_dev_dataset,  # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
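
Example #11 resizes the embedding matrix, but the snippet does not show where the extra tokens come from. A minimal sketch of the usual pairing, with illustrative marker tokens borrowed from Code Example #4 rather than taken from this project:

from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-large")

# register the extra marker tokens, then grow the embedding matrix to match the tokenizer
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["<e1>", "</e1>", "<e2>", "</e2>"]})
model.resize_token_embeddings(len(tokenizer))
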
Code Example #12
def train(args):
    # load model and tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # split dataset
    dataset = pd.read_csv('/opt/ml/input/data/train/train.tsv',
                          delimiter='\t',
                          header=None)
    train, dev = train_test_split(dataset, test_size=0.2, random_state=42)
    train.to_csv('/opt/ml/input/data/train/train_train.tsv',
                 sep='\t',
                 header=None,
                 index=False)
    dev.to_csv('/opt/ml/input/data/train/train_dev.tsv',
               sep='\t',
               header=None,
               index=False)

    # load dataset
    train_dataset = load_data('/opt/ml/input/data/train/train_train.tsv',
                              args.root)
    dev_dataset = load_data('/opt/ml/input/data/train/train_dev.tsv',
                            args.root)

    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.to(device)

    # Besides the options used here, TrainingArguments supports many more options.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(output_dir='./results/' + str(args.id),
                                      save_total_limit=3,
                                      save_steps=100,
                                      num_train_epochs=10,
                                      learning_rate=1e-5,
                                      per_device_train_batch_size=32,
                                      per_device_eval_batch_size=32,
                                      warmup_steps=300,
                                      weight_decay=0.01,
                                      logging_dir='./logs/' + str(args.id),
                                      logging_steps=100,
                                      evaluation_strategy='steps',
                                      eval_steps=100,
                                      dataloader_num_workers=4,
                                      label_smoothing_factor=0.5)
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=RE_train_dataset,
                      eval_dataset=RE_dev_dataset,
                      compute_metrics=compute_metrics)

    # train model
    trainer.train()
Code Example #13
File: train.py  Project: bcaitech1/p2-klue-shlee4290
def train():
    # load model and tokenizer
    #MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = 'xlm-roberta-large'
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." +
                                       tokenizer.sep_token + "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    #dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values
    # train_dataset, dev_dataset = load_fold(6)
    # train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    #tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    #RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    train_dataset, dev_dataset = torch.utils.data.random_split(
        RE_train_dataset, [8000, 1001])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    #model.parameters
    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=3,  # number of total save model.
        save_steps=300,  # model saving step.
        load_best_model_at_end=True,
        num_train_epochs=10,  # total number of training epochs
        learning_rate=1e-5,  # learning_rate
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        warmup_steps=300,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=300,  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=dev_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )

    # train model
    trainer.train()
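
The `random_split` call above hardcodes an 8000/1001 split, which fails if the TSV row count changes. A sketch that derives the split sizes from the dataset length and pins the generator seed for reproducibility (the 0.1 hold-out ratio is an assumption):

import torch

val_ratio = 0.1  # assumed hold-out ratio
n_total = len(RE_train_dataset)
n_val = int(n_total * val_ratio)
train_dataset, dev_dataset = torch.utils.data.random_split(
    RE_train_dataset, [n_total - n_val, n_val],
    generator=torch.Generator().manual_seed(42))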
コード例 #14
0
ファイル: train.py プロジェクト: bcaitech1/p2-klue-cha-no
def train(model_dir, args):
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print("This notebook use [%s]." % (device))

    s_dir = (f'{args.model}{args.num_hidden_layers}-{args.preprocess}'
             f'-epoch{args.epochs}-{args.scheduler}-{args.tokenize}'
             f'-{args.max_len}-{args.seed}')

    save_dir = increment_path(os.path.join(model_dir, s_dir))
    log_dir = increment_path(os.path.join('logs', s_dir))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # set neptune
    set_neptune(save_dir, args)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameter
    if MODEL_NAME.startswith('xlm'):
        bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    else:
        bert_config = BertConfig.from_pretrained(MODEL_NAME)

    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers

    if MODEL_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    else:
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)

    if args.drop >= 0:
        model.dropout = nn.Dropout(p=args.drop)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # make dataset for pytorch.
    # train, val split

    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=args.val_ratio,
                                                  random_state=args.seed)

    tok_module = getattr(import_module("load_data"), args.tokenize)

    train_tokenized = tok_module(train_dataset,
                                 tokenizer,
                                 max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        seed=args.seed,
        output_dir=save_dir,  # output directory
        save_total_limit=2,  # number of total save model.
        save_steps=args.save_steps,  # model saving step.
        num_train_epochs=args.epochs,  # total number of training epochs
        learning_rate=args.lr,  # learning_rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        lr_scheduler_type=args.scheduler,
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=log_dir,  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=100,  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=args.smoothing_factor,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy')

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
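
`seed_everything` and `increment_path` are project utilities not included in this listing. A typical seeding helper consistent with how it is called above; the body is an assumption:

import os
import random

import numpy as np
import torch


def seed_everything(seed: int):
    """Fix Python, NumPy, and PyTorch RNGs so repeated runs are comparable."""
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False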
コード例 #15
0
ファイル: train.py プロジェクト: bcaitech1/p2-klue-JAEWOOSUN
def train(args):

    # device setting
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load model and tokenizer
    if args['model_name'] == "xlm-roberta-large":
        MODEL_NAME = "xlm-roberta-large"
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
        config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
        config.num_labels = args['num_labels']

        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=config)

    elif args['model_name'] == "roberta-base":
        MODEL_NAME = "roberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        config = RobertaConfig.from_pretrained(MODEL_NAME,
                                               output_hidden_states=True)
        config.num_labels = args['num_labels']

        model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME,
                                                                 config=config)

    elif args['model_name'] == "bert-base-multilingual-cased":
        MODEL_NAME = "bert-base-multilingual-cased"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        config = BertConfig.from_pretrained(MODEL_NAME)
        config.num_labels = args['num_labels']

        model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                              config=config)

    else:
        MODEL_NAME = args['model_name']
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        model = AutoModel.from_pretrained(MODEL_NAME)

    # if you use entity_token
    if args['entity_token']:
        special_tokens_dict = {
            'additional_special_tokens': ["#", "@", '₩', '^']
        }
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        model.resize_token_embeddings(len(tokenizer))

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_dataset, valid_dataset = train_test_split(
        dataset, test_size=0.1, random_state=args['random_seed'])
    train_label = train_dataset['label'].values
    valid_label = valid_dataset['label'].values

    # pororo ner
    ner = Pororo(task="ner", lang="ko")

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer, ner, args)
    tokenized_valid = tokenized_dataset(valid_dataset, tokenizer, ner, args)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_valid_dataset = RE_Dataset(tokenized_valid, valid_label)

    # update model setting

    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.

    print("use_trainer : ", args['use_trainer'])

    if args['use_trainer']:
        training_args = TrainingArguments(
            output_dir='./results',  # output directory
            save_total_limit=5,  # number of total save model.
            save_steps=500,  # model saving step.
            num_train_epochs=args['epochs'],  # total number of training epochs
            learning_rate=args['lr'],  # learning_rate
            per_device_train_batch_size=args['train_batch_size'],  # batch size per device during training
            per_device_eval_batch_size=args['eval_batch_size'],  # batch size for evaluation
            warmup_steps=args['warmup_steps'],  # number of warmup steps for learning rate scheduler
            weight_decay=args['weight_decay'],  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_steps=args['logging_steps'],  # log saving step.
            label_smoothing_factor=args['label_smoothing_factor'],
            evaluation_strategy='steps',  # evaluation strategy to adopt during training
            # `no`: No evaluation during training.
            # `steps`: Evaluate every `eval_steps`.
            # `epoch`: Evaluate every end of epoch.
            eval_steps=100,  # evaluation step.
        )
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_valid_dataset,  # evaluation dataset
            compute_metrics=compute_metrics  # define metrics function
        )

        # train model
        trainer.train()

    else:
        custom_trainer(model, device, RE_train_dataset, RE_valid_dataset, args)
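
The `entity_token` branch above registers "#", "@", '₩', and '^' as special tokens, which suggests the two entity spans are wrapped with markers before tokenization. A hypothetical helper illustrating that pattern (function name, marker assignment, and offsets are assumptions, not this project's code):

def mark_entities(sentence, e1_start, e1_end, e2_start, e2_end):
    """Wrap the two entity spans with marker tokens (hypothetical helper)."""
    # Insert markers starting from the rightmost span so earlier offsets stay valid.
    spans = sorted([(e1_start, e1_end, '@'), (e2_start, e2_end, '#')],
                   key=lambda span: span[0], reverse=True)
    for start, end, marker in spans:
        sentence = (sentence[:start] + f' {marker} ' + sentence[start:end + 1] +
                    f' {marker} ' + sentence[end + 1:])
    return sentence


# '이순신' spans characters 0-2 and '무신' spans 12-13 in the example sentence.
print(mark_entities('이순신은 조선 중기의 무신이다.', 0, 2, 12, 13))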
コード例 #16
0
    def __init__(
        self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True
    ):
        super().__init__(config=config)

        if pre_load_model:
            self.xlm_roberta = XLMRobertaModel.from_pretrained(
                self.config.model_name, output_hidden_states=True
            )
        else:
            xlm_roberta_config = XLMRobertaConfig.from_pretrained(
                self.config.model_name, output_hidden_states=True
            )
            self.xlm_roberta = XLMRobertaModel(xlm_roberta_config)

        self.vocabs = {
            const.TARGET: vocabs[const.TARGET],
            const.SOURCE: vocabs[const.SOURCE],
        }

        self.mlp = None

        if self.config.use_mlp:
            self.mlp = nn.Sequential(
                nn.Linear(self.xlm_roberta.config.hidden_size, self.config.hidden_size),
                nn.Tanh(),
            )
            output_size = self.config.hidden_size
        else:
            output_size = self.xlm_roberta.config.hidden_size

        sentence_size = output_size
        if config.pooling == 'mixed':
            sentence_size *= 2

        self.scalar_mix = ScalarMixWithDropout(
            mixture_size=self.xlm_roberta.config.num_hidden_layers
            + 1,  # +1 for embeddings
            do_layer_norm=self.config.scalar_mix_layer_norm,
            dropout=self.config.scalar_mix_dropout,
        )

        self._sizes = {
            const.TARGET: output_size,
            const.TARGET_LOGITS: output_size,
            const.TARGET_SENTENCE: sentence_size,
            const.SOURCE: output_size,
        }

        self.output_embeddings = self.xlm_roberta.embeddings.word_embeddings

        self._training_steps_ran = 0
        self._is_frozen = False
        if self.config.freeze:
            logger.info(
                'Freezing XLMRoberta encoder weights; training will not update them'
            )
            for param in self.xlm_roberta.parameters():
                param.requires_grad = False
            self._is_frozen = True
        if self.config.freeze_for_number_of_steps > 0:
            # Done inside `forward()` to guarantee we can unfreeze later (if the
            # optimizer is built after this point, we cannot unfreeze without also
            # calling `optimizer.add_param_group({'params': self.xlm_roberta.parameters()})`).
            pass
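
The final comment defers step-based unfreezing to `forward()`. A standalone sketch of that check, assuming a counter incremented once per training step (not the project's actual code):

import torch.nn as nn


def unfreeze_after_steps(encoder: nn.Module, steps_ran: int, freeze_for: int) -> bool:
    """Re-enable encoder gradients once `steps_ran` reaches `freeze_for`; returns True if unfrozen."""
    if freeze_for > 0 and steps_ran >= freeze_for:
        for param in encoder.parameters():
            param.requires_grad = True
        return True
    return False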
コード例 #17
0
ファイル: train.py プロジェクト: bcaitech1/p2-klue-shlee4290
def train():
  # load model and tokenizer
  #MODEL_NAME = "bert-base-multilingual-cased"
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
  MODEL_NAME = 'xlm-roberta-large'
  tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
  print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
  print(tokenizer.tokenize("아버지가방에들어가신다."))
  tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." + tokenizer.sep_token + "아버지가방에들어가신다.")
  print(tokenized_str)

  # load dataset
  dataset = load_data("/opt/ml/input/data/train/train.tsv")
  label = dataset['label'].values

  bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
  bert_config.num_labels = 42

  cv = StratifiedShuffleSplit(n_splits=5, test_size=0.8, train_size=0.2)
  for idx, (train_idx, val_idx) in enumerate(cv.split(dataset, label)):
    train_dataset = dataset.iloc[train_idx]
    val_dataset = dataset.iloc[val_idx]

    # tokenizing dataset
    train_dataset = tokenized_dataset(train_dataset, tokenizer)
    val_dataset = tokenized_dataset(val_dataset, tokenizer)

    train_y = label[train_idx]
    val_y = label[val_idx]

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(train_dataset, train_y)
    RE_valid_dataset = RE_Dataset(val_dataset, val_y)

    # setting model hyperparameter
    model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    model.to(device)
    
    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
      output_dir='./results',          # output directory
      save_total_limit=2,              # number of total save model.
      save_steps=400,                 # model saving step.
      num_train_epochs=10,              # total number of training epochs
      learning_rate=1e-5,               # learning_rate
      per_device_train_batch_size=16,  # batch size per device during training
      #per_device_eval_batch_size=8,   # batch size for evaluation
      warmup_steps=300,                # number of warmup steps for learning rate scheduler
      weight_decay=0.01,               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=100,              # log saving step.
      evaluation_strategy='steps', # evaluation strategy to adopt during training
                                  # `no`: No evaluation during training.
                                  # `steps`: Evaluate every `eval_steps`.
                                  # `epoch`: Evaluate every end of epoch.
      eval_steps = 400,            # evaluation step.
      dataloader_num_workers=4,
      metric_for_best_model="accuracy",
      greater_is_better = True,
      label_smoothing_factor=0.5
    )
    trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=RE_train_dataset,         # training dataset
      eval_dataset=RE_valid_dataset,             # evaluation dataset
      compute_metrics=compute_metrics,         # define metrics function
    )

    # train model
    trainer.train()
コード例 #18
0
def train(args):
    # load model and tokenizer
    MODEL_NAME = args.model_name
    if args.model_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    # root = '/opt/ml'
    root = args.root
    train_dataset = load_data(root + "/input/data/train/train.tsv", root)
    # train_dataset = load_data(root+"/input/data/train/ner_train_ver2.tsv", root)
    #dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    #tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    #RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    # Define a BalanceClassSampler; here the 'upsampling' option is used.
    # sampler = BalanceClassSampler(RE_train_dataset.get_classes(), 'upsampling')
    # RE_train_loader = DataLoader(RE_train_dataset, batch_size=16, sampler=sampler)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    if args.model_type == 'bert':
        bert_config = BertConfig.from_pretrained(MODEL_NAME)
        bert_config.num_labels = 42
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    elif args.model_type == 'electra':
        electra_config = ElectraConfig.from_pretrained(MODEL_NAME)
        electra_config.num_labels = 42
        model = ElectraForSequenceClassification.from_pretrained(
            MODEL_NAME, config=electra_config)
    elif args.model_type == 'roberta':
        roberta_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
        roberta_config.num_labels = 42
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=roberta_config)
    model.resize_token_embeddings(len(tokenizer))
    model.parameters
    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.

    training_args = TrainingArguments(
        output_dir='./results/' + str(args.id),  # output directory
        save_total_limit=args.save_total_limit,  # number of total save model.
        save_steps=args.save_steps,  # model saving step.
        num_train_epochs=args.num_train_epochs,  # total number of training epochs
        learning_rate=args.learning_rate,  # learning_rate
        per_device_train_batch_size=args.per_device_train_batch_size,  # batch size per device during training
        #per_device_eval_batch_size=16,   # batch size for evaluation
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir='./logs/' + str(args.id),  # directory for storing logs
        logging_steps=args.logging_steps,  # log saving step.
        #evaluation_strategy='steps', # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        #eval_steps = 500,            # evaluation step.
        # save_strategy='epoch',
        label_smoothing_factor=0.5)

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        #eval_dataset=RE_dev_dataset,             # evaluation dataset
        #compute_metrics=compute_metrics         # define metrics function
    )

    # train model
    print('Training start')
    trainer.train()
    print('Training finished!')
コード例 #19
0
}

PUBLIC_MODEL = {
    'mbert': {
        'name': 'bert-base-multilingual-cased',
        'tokenizer': BertTokenizerFast.from_pretrained('bert-base-multilingual-cased'),
        'config': BertConfig.from_pretrained('bert-base-multilingual-cased'),
    },
    'xlmr': {
        'name': 'xlm-roberta-base',
        'tokenizer': XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-base'),
    },
    'xlmr-large': {
        'name': 'xlm-roberta-large',
        'tokenizer': XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-large'),
    },
}

TOKENIZER_CLS = {
    'spm_camembert': CamembertTokenizer,
    'spm': ThaiRobertaTokenizer,
    'newmm': ThaiWordsNewmmTokenizer,
    'syllable': ThaiWordsSyllableTokenizer,
    'sefr_cut': FakeSefrCutTokenizer,
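
The `PUBLIC_MODEL` registry above bundles each checkpoint name with its fast tokenizer and config. A hedged usage sketch, assuming the registry is importable from the surrounding module:

from transformers import XLMRobertaModel

entry = PUBLIC_MODEL['xlmr']
model = XLMRobertaModel.from_pretrained(entry['name'], config=entry['config'])
batch = entry['tokenizer'](['ตัวอย่างประโยค'], return_tensors='pt', padding=True)
outputs = model(**batch)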
コード例 #20
0
    def create(cls,
               model_type='camem',
               model_name="camembert-base",
               embedding_size=768,
               hidden_dim=512,
               rnn_layers=1,
               lstm_dropout=0.5,
               device="cuda",
               mode="weighted",
               key_dim=64,
               val_dim=64,
               num_heads=3,
               attn_dropout=0.3,
               self_attention=False,
               is_require_grad=False):
        configuration = {
            'model_type': model_type,
            "model_name": model_name,
            "device": device,
            "mode": mode,
            "self_attention": self_attention,
            "is_freeze": is_require_grad
        }

        if 'camem' in model_type:
            config_bert = CamembertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = CamembertModel.from_pretrained(model_name,
                                                   config=config_bert)
            model.to(device)
        elif 'flaubert' in model_type:
            config_bert = FlaubertConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = FlaubertModel.from_pretrained(model_name,
                                                  config=config_bert)
            model.to(device)
        elif 'XLMRoberta' in model_type:
            config_bert = XLMRobertaConfig.from_pretrained(
                model_name, output_hidden_states=True)
            model = XLMRobertaModel.from_pretrained(model_name,
                                                    config=config_bert)
            model.to(device)
        elif 'M-Bert' in model_type:
            config_bert = BertConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, config=config_bert)
            model.to(device)

        lstm = BiLSTM.create(embedding_size=embedding_size,
                             hidden_dim=hidden_dim,
                             rnn_layers=rnn_layers,
                             dropout=lstm_dropout)

        attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                                  attn_dropout)
        model.train()
        self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
        # if is_freeze:
        self.freeze()

        return self
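
One practical note on `create()`: the BiLSTM is built from `embedding_size`, so that value presumably has to match the hidden size of the chosen encoder (an assumption based on the code above, not a documented requirement). A quick check for the large XLM-R checkpoint:

from transformers import XLMRobertaConfig

config = XLMRobertaConfig.from_pretrained('xlm-roberta-large')
print(config.hidden_size)  # 1024, so create(..., embedding_size=1024) would be needed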