def define_model(name, config=None, location=None):
    # If config is given, this is the initial training run; otherwise load from the checkpoint at `location`
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base", "monologg/kobert"
    ]:
        return BertForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else BertForSequenceClassification.from_pretrained(
            location)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else ElectraForSequenceClassification.from_pretrained(
            location)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else XLMRobertaForSequenceClassification.from_pretrained(
            location)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else FunnelForSequenceClassification.from_pretrained(
            location)
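For orientation, a hedged usage sketch of define_model above. It assumes the relevant *ForSequenceClassification classes from transformers are imported as in the later examples; note the function returns None for model names outside the listed families. Paths and values are illustrative only.

from transformers import XLMRobertaConfig

# First-time training: pass a task config so num_labels matches the dataset.
config = XLMRobertaConfig.from_pretrained("xlm-roberta-large")
config.num_labels = 42  # illustrative value, as used in the KLUE-style examples below
model = define_model("xlm-roberta-large", config=config)

# Resuming: pass only the checkpoint directory (illustrative path) saved by a previous run.
model = define_model("xlm-roberta-large", location="./results/checkpoint-500")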
Example #2
    def __init__(self, config, model_argobj=None):
        Cos_NLL.__init__(self, model_argobj)
        XLMRobertaForSequenceClassification.__init__(self, config)
        self.embeddingHead = nn.Linear(config.hidden_size, 100)

        # learned loss parameters
        self.w = torch.nn.Parameter(torch.ones(1)*10.0)
        self.b = torch.nn.Parameter(torch.zeros(1))

        self.apply(self._init_weights)
Example #3
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

    special_tokens_dict = {'additional_special_tokens': ["#", "@", '₩', '^']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/submission.csv', index=False)
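These inference scripts rely on project helpers (load_test_dataset, RE_Dataset, inference) that are defined elsewhere and not shown. Purely for orientation, a hedged sketch of what the inference helper might look like, assuming dataset items are dicts with input_ids and attention_mask as in the training example further down; the real implementation may differ.

import torch
from torch.utils.data import DataLoader

def inference(model, dataset, device, batch_size=32):
    # Hypothetical re-implementation: batch the tokenized dataset, run the
    # classifier, and collect argmax label ids.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in loader:
            outputs = model(input_ids=batch["input_ids"].to(device),
                            attention_mask=batch["attention_mask"].to(device))
            preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())
    return preds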
Example #4
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    #TOK_NAME = "bert-base-multilingual-cased"
    #tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # Please keep the directory and column format below.

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/roberta-submission13.csv', index=False)
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    p = Path('.').resolve()  # /opt/ml
    model_dir = p / args.model_dir
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # make csv file with predicted answer
    # Please keep the directory and column format below.

    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(
        './prediction/submission.csv',
        index=False,
    )
Example #6
def main(json_path, model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()

    with open(json_path) as json_file:
        data = json.load(json_file)

    predictions = []
    labels = []

    for pair in data:
        sentence = pair['text']
        label = pair['sentiment']

        inputs = tokenizer.encode(sentence,
                                  padding=False,
                                  truncation=True,
                                  return_tensors='pt').to(DEVICE)

        with torch.no_grad():
            output = model(inputs).logits
            prediction = torch.argmax(output, dim=-1)[0].item()

        predictions.append(prediction)
        labels.append(label)

    print(metrics.classification_report(labels, predictions, digits=6))
def main(
        output_dir,
        logging_dir,
        logging_steps,
        large,
        batch_size,
        gradient_accumulation_steps,
        learning_rate,
        num_train_epochs,
        warmup_ratio):
    sst_train_dataset = load_dataset('glue', 'sst2', split='train')
    sst_validation_dataset = load_dataset('glue', 'sst2', split='validation')

    if large:
        model_name = 'xlm-roberta-large'
    else:
        model_name = 'xlm-roberta-base'

    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples['sentence'], padding=False, truncation=True)

    sst_train_dataset = sst_train_dataset.map(preprocess_function, batched=True)
    sst_validation_dataset = sst_validation_dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sst_train_dataset,
        eval_dataset=sst_validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
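The Trainer setups in these examples pass a compute_metrics callback that is not shown. A minimal sketch of a Trainer-compatible accuracy metric, assuming the usual (logits, labels) EvalPrediction interface:

import numpy as np

def compute_metrics(eval_pred):
    # eval_pred unpacks into (predictions, label_ids).
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"accuracy": float((preds == labels).mean())}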
Example #8
    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
        #download the model or load the model path
        model_path = download_model('xlmr.ned',
                                    cache_dir,
                                    process_func=_unzip_process_func,
                                    verbose=verbose)
        self.classes = ['0', '1']

        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            model_path, num_labels=len(self.classes))

        self.max_length = self.model.roberta.embeddings.position_embeddings.num_embeddings - 2
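A hedged sketch of how this wrapper might classify a sentence, reusing the tokenizer, model, classes, and max_length prepared in __init__ above; predict_class is a hypothetical helper, not part of the original class.

import torch

def predict_class(clf, sentence):
    # clf is an instance of the wrapper above (hypothetical usage).
    inputs = clf.tokenizer(sentence, truncation=True,
                           max_length=clf.max_length, return_tensors="pt")
    with torch.no_grad():
        logits = clf.model(**inputs).logits
    return clf.classes[int(torch.argmax(logits, dim=-1))]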
Example #9
    def __init__(self, config):
        model_name = config.get("model_name", None)
        model_path = config.get("model_path", None)
        device = config.get("device", 0)  # default on gpu 0
        self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
        # the default entailment id is 2 (contradiction is 0, neutral is 1)
        self.contradiction_id = 0
        self.entailment_id = 2
        self.model = XLMRobertaForSequenceClassification.from_pretrained(
            model_path)
        self.model.eval()
        self.model.half()
        self.device = torch.device(
            "cpu" if device < 0 else "cuda:{}".format(device))
        if self.device.type == "cuda":
            self.model = self.model.to(self.device)
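Example #9 records the NLI label ids (entailment=2, contradiction=0), which suggests entailment-style scoring. A hedged sketch of such a scorer; entailment_score is a hypothetical helper built only on the attributes set in __init__ above.

import torch

def entailment_score(clf, premise, hypothesis):
    # Probability that the premise entails the hypothesis, per the ids above.
    inputs = clf.tokenizer(premise, hypothesis, return_tensors="pt",
                           truncation=True).to(clf.device)
    with torch.no_grad():
        probs = torch.softmax(clf.model(**inputs).logits, dim=-1)[0]
    return probs[clf.entailment_id].item()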
Example #10
def main(args):
    """
    Inference code that works as long as the data is in the same format as the provided dataset tsv file.
    """
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = os.path.join(args.model_dir, args.model)  # model dir.
    if TOK_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    else:
        model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

    # load test datset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model,
                                                 tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)

    model.to(device)

    # predict answer
    batch_size = args.batch_size
    logits, pred_answer = inference(model, test_dataset, device, batch_size)
    # make csv file with predicted answer
    # Please keep the directory and column format below.

    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)
    np.save(os.path.join(save_dir, r'logits.npy'), logits)
Example #11
def main(args):
    """
    주어진 dataset tsv 파일과 같은 형태일 경우 inference 가능한 코드입니다.
  """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # load tokenizer
    TOK_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    model_dir = f'./results/{args.id}/checkpoint-{args.checkpoint}'
    if args.model_type == 'bert':
        model = BertForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'electra':
        model = ElectraForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'roberta':
        model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.to(device)

    # load test datset
    # root = "/opt/ml"
    # root = "/content/drive/MyDrive/Boostcamp/Stage2_KLUE"
    root = args.root
    test_dataset, test_label = load_test_dataset(root, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # logits, predictions = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # Please keep the directory and column format below.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    # output = pd.DataFrame(predictions, columns=['pred'])
    output.to_csv(f'./results/{args.id}/submission{args.id}.csv', index=False)
    # np.save(f'./results/{args.id}/logits{args.id}.npy', logits)
    print('File saved')
def main(train_json_path, val_json_path, model_name_or_dir, output_dir,
         logging_dir, logging_steps, batch_size, gradient_accumulation_steps,
         learning_rate, num_train_epochs, warmup_ratio):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name_or_dir)

    sh_sentiment_train_dataset, sh_sentiment_val_dataset = create_sh_sentiment_dataset(
        train_json_path, val_json_path, tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sh_sentiment_train_dataset,
        eval_dataset=sh_sentiment_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
Example #13
def main(
        input_dir_path,
        output_dir_path,
        model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name_or_dir).to(DEVICE)
    model.eval()

    os.makedirs(output_dir_path, exist_ok=True)

    for file_name in os.listdir(input_dir_path):
        if file_name.endswith('.json'):
            count = 0
            input_file_path = os.path.join(input_dir_path, file_name)
            with open(input_file_path) as json_file:
                data = json.load(json_file)

                for session in data['sessions']:
                    for speech in session['speeches']:
                        content = []
                        for text in speech['content']:
                            inputs = tokenizer.encode(
                                text, padding=False, truncation=True, return_tensors='pt').to(DEVICE)

                            with torch.no_grad():
                                outputs = model(inputs).logits
                                predictions = torch.softmax(outputs, dim=-1)[0, 1].item()

                            content.append({ 'text': text, 'sentiment': round(predictions, 6) })
                            count += 1
                        speech['content'] = content

            output_file_path = os.path.join(output_dir_path, file_name)
            with open(output_file_path, 'w') as json_file:
                json.dump(data, json_file)

            print("File: {}, Count: {}".format(file_name, count))
def load_model(pretrained_name, model_loc=None, load_tuned=True, num_labels=2):
    assert pretrained_name is not None
    if load_tuned:
        # load previously tuned model from disk
        if model_loc is None:
            model_dump_loc, model_state_dic_loc = generate_disk_location()
        else:
            model_dump_loc = model_loc
        model = torch.load(model_dump_loc)
        logger.info("loading model from {}".format(model_dump_loc))
    else:
        # load pretrained name from hugging face
        model_name = config["model_name"]
        if model_name == "bert":
            model = BertForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "roberta":
            model = RobertaForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "distillbert":
            model = DistilBertForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "xlmroberta":
            model = XLMRobertaForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "xlnet":
            model = XLNetForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        else:
            logger.error("unsupported model: {}".format(model_name))
            raise ValueError("unsupported model: {}".format(model_name))
        logger.info("loading pretrained model")

    tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
    logger.info("model config: {}".format(model.config))

    return model, tokenizer
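load_model above reads a module-level config dict to choose the architecture family. A hedged sketch of the assumed shape and a call that loads a fresh XLM-R classifier (values are illustrative):

# Hypothetical module-level config assumed by load_model above.
config = {"model_name": "xlmroberta"}

model, tokenizer = load_model("xlm-roberta-base", load_tuned=False, num_labels=2)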
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
from textattack.models.wrappers.huggingface_model_wrapper import HuggingFaceModelWrapper

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
raw_model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=10)
raw_model.load_state_dict(torch.load('xlm_roberta_en/state_dict.pt', map_location='cuda:0'))
model = HuggingFaceModelWrapper(raw_model, tokenizer)
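A brief, hedged usage note: TextAttack model wrappers such as HuggingFaceModelWrapper are callable on a list of raw strings and return the underlying model's output logits, which is what attack recipes consume. A sketch under that assumption:

# Hedged usage sketch for the wrapper built above.
sample_logits = model(["This movie was surprisingly good.", "Terrible service."])
print(sample_logits.shape)  # expected (2, 10) for the 10-label head loaded above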
Example #16
def train(model_dir, args):
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    print("This notebook use [%s]." % (device))

    s_dir = args.model + str(
        args.num_hidden_layers) + '-' + args.preprocess + '-epoch' + str(
            args.epochs
        ) + '-' + args.scheduler + '-' + args.tokenize + '-' + str(
            args.max_len) + '-' + str(args.seed)

    save_dir = increment_path(os.path.join(model_dir, s_dir))
    log_dir = increment_path(os.path.join('logs', s_dir))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # set neptune
    set_neptune(save_dir, args)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameter
    if MODEL_NAME.startswith('xlm'):
        bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    else:
        bert_config = BertConfig.from_pretrained(MODEL_NAME)

    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers

    if MODEL_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    else:
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)

    if args.drop >= 0:
        model.dropout = nn.Dropout(p=args.drop)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # make dataset for pytorch.
    # train, val split

    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=args.val_ratio,
                                                  random_state=args.seed)

    tok_module = getattr(import_module("load_data"), args.tokenize)

    train_tokenized = tok_module(train_dataset,
                                 tokenizer,
                                 max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        seed=args.seed,
        output_dir=save_dir,  # output directory
        save_total_limit=2,  # total number of checkpoints to keep
        save_steps=args.save_steps,  # model saving step
        num_train_epochs=args.epochs,  # total number of training epochs
        learning_rate=args.lr,  # learning rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        lr_scheduler_type=args.scheduler,
        warmup_steps=args.warmup_steps,  # number of warmup steps for the learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=log_dir,  # directory for storing logs
        logging_steps=100,  # log saving step
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: no evaluation during training
        # `steps`: evaluate every `eval_steps`
        # `epoch`: evaluate at the end of each epoch
        eval_steps=100,  # evaluation step
        dataloader_num_workers=4,
        label_smoothing_factor=args.smoothing_factor,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy')

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
def train(args):
    # load model and tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # split dataset
    dataset = pd.read_csv('/opt/ml/input/data/train/train.tsv',
                          delimiter='\t',
                          header=None)
    train, dev = train_test_split(dataset, test_size=0.2, random_state=42)
    train.to_csv('/opt/ml/input/data/train/train_train.tsv',
                 sep='\t',
                 header=None,
                 index=False)
    dev.to_csv('/opt/ml/input/data/train/train_dev.tsv',
               sep='\t',
               header=None,
               index=False)

    # load dataset
    train_dataset = load_data('/opt/ml/input/data/train/train_train.tsv',
                              args.root)
    dev_dataset = load_data('/opt/ml/input/data/train/train_dev.tsv',
                            args.root)

    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(output_dir='./results/' + str(args.id),
                                      save_total_limit=3,
                                      save_steps=100,
                                      num_train_epochs=10,
                                      learning_rate=1e-5,
                                      per_device_train_batch_size=32,
                                      per_device_eval_batch_size=32,
                                      warmup_steps=300,
                                      weight_decay=0.01,
                                      logging_dir='./logs/' + str(args.id),
                                      logging_steps=100,
                                      evaluation_strategy='steps',
                                      eval_steps=100,
                                      dataloader_num_workers=4,
                                      label_smoothing_factor=0.5)
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=RE_train_dataset,
                      eval_dataset=RE_dev_dataset,
                      compute_metrics=compute_metrics)

    # train model
    trainer.train()
Example #18
def train():

    seed_everything(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # setting model hyperparameter
    # The config itself carries no trained weights, so use from_pretrained to load them

    # bert_config = BertConfig.from_pretrained(MODEL_NAME)
    # bert_config.num_labels = 42
    # model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)

    # Auto
    model_config = XLMRobertaConfig.from_pretrained(args.model_name)
    model_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        args.model_name, config=model_config)

    # load model and tokenizer
    # MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    # roberta: https://huggingface.co/transformers/model_doc/xlmroberta.html
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # label = dataset['label'].values

    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=0.2,
                                                  random_state=args.seed)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer)

    tokenized_train_label = train_dataset['label'].values
    tokenized_val_label = val_dataset['label'].values

    # train_datasets = TokenDataset(train_dataset, tokenizer)
    # val_datasets = TokenDataset(val_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, tokenized_train_label)
    RE_val_dataset = RE_Dataset(tokenized_val, tokenized_val_label)

    # print(model.parameters)
    model.to(device)
    model = torch.nn.DataParallel(model)

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        pin_memory=torch.cuda.is_available(),
        shuffle=True,
    )
    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
    )

    optimizer = AdamW(model.parameters(),
                      lr=args.lr,
                      weight_decay=args.weight_decay)
    loss_fn = LabelSmoothingLoss(smoothing=0.5)
    # loss_fn = nn.CrossEntropyLoss()

    # t_total = len(train_loader) * args.epoch
    t_total = args.epoch
    warmup_step = int(t_total * args.warmup_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    log_dir = ""
    log_list = glob("./logs/*")
    if len(log_list) == 0:
        log_dir = "./logs/exp1"
    else:
        log_list = [int(log[-1]) for log in log_list]
        log_dir = "./logs/exp" + str(max(log_list) + 1)

    logger = SummaryWriter(log_dir=log_dir)

    scaler = GradScaler()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    best_acc = 0.0

    for epoch in tqdm(range(args.epoch)):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0
        model.train()
        for batch_id, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)

            # loss.backward()
            # optimizer.step()

            scaler.scale(loss).backward()

            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            scaler.step(optimizer)
            scaler.update()

            train_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
            train_loss += loss

            if (batch_id + 1) % args.logging_steps == 0:
                train_loss = train_loss.data.cpu().numpy()
                print(
                    f"[Train] epoch {epoch + 1} | batch_id {batch_id + 1} | loss {(train_loss) / args.logging_steps:.4f} | train_acc {train_acc / args.logging_steps:.4f}"
                )
                logger.add_scalar("Train/loss",
                                  train_loss / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                logger.add_scalar("Train/acc", train_acc / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                train_acc = 0.0
                train_loss = 0.0

        # scheduler.step()

        print("\nStart Validation Step!")
        with torch.no_grad():
            model.eval()
            for batch_id, batch in enumerate(tqdm(val_loader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)
                val_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
                val_loss += loss

            print(
                f"[Val] epoch {epoch + 1} | val_acc {val_acc / (batch_id + 1):.4f}"
            )
            logger.add_scalar("Val/loss", val_loss / (batch_id + 1), epoch)
            logger.add_scalar("Val/acc", val_acc / (batch_id + 1), epoch)

            if val_acc >= best_acc:
                best_acc = val_acc
                # torch.save(model.state_dict(), os.path.join(args.output_dir, "saved_" + str(epoch) + ".pth"))
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "best.pth"))
                print("Saved best acc model...")

        scheduler.step()

    torch.save(model.state_dict(), os.path.join(args.output_dir, "last.pth"))
Example #19
def model_init():
    """Returns an initialized model for use in a Hugging Face Trainer."""
    model = XLMRobertaForSequenceClassification.from_pretrained(
        "xlm-roberta-base")
    return model
def model_init():
    model = XLMRobertaForSequenceClassification.from_pretrained(args.model_dir)
    return model
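model_init functions like the two above are passed to the Trainer in place of a fixed model so that a fresh model can be instantiated per run or per hyperparameter trial. A hedged sketch, assuming training_args, datasets, and compute_metrics are defined as in the other examples:

trainer = Trainer(
    model_init=model_init,        # Trainer calls this to build a fresh model
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
# Optional, with an HPO backend such as optuna installed:
# best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")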
import itertools
for model, wbool in list(itertools.product(models, wbools)):
    loss_weighted = wbool
    
    if model == 'MURIL':
        # Using Huggingface MURIL version
        from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
        tokenizer = AutoTokenizer.from_pretrained("simran-kh/muril-cased-temp")
        model = AutoModelForSequenceClassification.from_pretrained("simran-kh/muril-cased-temp", num_labels=6)
        model_name = 'MURIL_cased_temp_tamil'
    if model == 'XLMR':
        # Using XLM-RoBERTa-Large pretrained model
        from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
        model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large', num_labels=6)
        model_name = 'XLMroberta_large_tamil'
    if model == 'XLMR_base':
        # Using XLM-Roberta-Base pretrained model
        from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=6)
        model_name = 'XLMroberta_base_tamil'
    if model == 'mbertlarge':
        # Using DistilBERT multilingual cased (distilbert-base-multilingual-cased) pretrained model
        from transformers import BertTokenizer, BertForSequenceClassification
        tokenizer = BertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
        model = BertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=6)
        model_name = 'Distilbert_m_base_cased_tamil'
    if model == 'XLMR_custom':
        # Using XLMRoberta finetuning Custom Pretrained model, Vocab same => Tokenizer base
Example #22
def load_model(args):
    if 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'fasttext' in args['model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None

        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]

        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map,
                                    word_tokenizer,
                                    lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
        config.hidden_size = 300
        config.num_attention_heads = 10
        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))

    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        model_class = None
        if 'sequence_classification' == args['task']:
            model_class = AlbertForSequenceClassification if 'lite' in args[
                'model_checkpoint'] else BertForSequenceClassification
        elif 'token_classification' == args['task']:
            model_class = AlbertForWordClassification if 'lite' in args[
                'model_checkpoint'] else BertForWordClassification
        elif 'multi_label_classification' == args['task']:
            model_class = AlbertForMultiLabelClassification if 'lite' in args[
                'model_checkpoint'] else BertForMultiLabelClassification
        model = model_class.from_pretrained(args['model_checkpoint'],
                                            config=config)
    return model, tokenizer, vocab_path, config_path
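For orientation, a hedged sketch of calling the load_model above for an XLM-R sequence classifier; the args keys mirror the ones the function reads, and the values are illustrative:

args = {
    'model_checkpoint': 'xlm-roberta-base',
    'num_labels': 3,
    'task': 'sequence_classification',
}
model, tokenizer, vocab_path, config_path = load_model(args)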
Example #23
def run_xlm(dataset, model, header):
    test_set = pd.read_csv(dataset, encoding='utf-8')
    X = test_set['text']
    y = test_set[header]
    input_ids = []
    attention_masks = []

    tokenizer = XLMRobertaTokenizer.from_pretrained(model, do_lower_case=True)
    device = torch.device("cuda")

    labels = []
    for i in range(len(X)):
        if not pd.isnull(X[i]):
            encoded_dict = tokenizer.encode_plus(
                X[i],  # text.
                add_special_tokens=True,  # add [CLS] and [SEP] special tokens
                max_length=64,
                pad_to_max_length=True,  # Pad missing tokens with 0s
                return_attention_mask=True,
                return_tensors='pt',  # pytorch tensors.
            )
            labels.append(y[i])
            input_ids.append(encoded_dict['input_ids'])

            # differentiates padding from non-padding
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    batch_size = 1
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=prediction_sampler,
                                       batch_size=batch_size)

    model = XLMRobertaForSequenceClassification.from_pretrained(model)

    model.to(device)
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Get predictions
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Retrieve data from GPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Save predictions
        predictions.append(logits)
        true_labels.append(label_ids)

    # Combine the results across all batches.
    flat_predictions = np.concatenate(predictions, axis=0)

    # For each sample, pick the label (0 or 1) with the higher score.
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

    # Combine the correct labels for each batch into a single list.
    flat_true_labels = np.concatenate(true_labels, axis=0)

    # Check if predictions are correct
    acc = np.sum(flat_predictions == flat_true_labels) / len(flat_predictions)
    # print(flat_predictions)
    # print(flat_true_labels)
    # print(acc)

    print("Accuracy: ", accuracy_score(flat_true_labels, flat_predictions))
    print(
        "Precision: ",
        precision_score(flat_true_labels, flat_predictions,
                        average='weighted'))
    print("Recall: ",
          recall_score(flat_true_labels, flat_predictions, average='weighted'))
    print("F1-score: ",
          f1_score(flat_true_labels, flat_predictions, average='weighted'))
Example #24
def load_model(args):
    if 'albert-large-wwmlm-512' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_vocab_uncased_30000.txt"
        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_albert_large_config.json"
        )
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_pytorch_albert_large_512_629k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model
    elif 'albert-base-wwmlm-512' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_albert_base_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_pytorch_model_albert_base_162k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'albert-large-wwmlm-128' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_albert_large_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_pytorch_albert_large_128_500k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'babert-bpe-mlm-large-512' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_bert_large_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_pytorch_babert_uncased_large_512_dup10-5_1120k.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'albert-base-uncased-112500' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-112500/vocab.txt"
        config_path = "../embeddings/albert-base-uncased-112500/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-112500/albert_base_uncased_112500.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'albert-base-uncased-96000' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-96000/vocab.txt"
        config_path = "../embeddings/albert-base-uncased-96000/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-96000/albert_base_uncased_96000.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'albert-base-uncased-191k' == args['model_checkpoint']:
        vocab_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_vocab_uncased_30000.txt"
        config_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_albert_base_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = AlbertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = AlbertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = AlbertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = AlbertForMultiLabelClassification(config)

        # Plug pretrained bert model
        albert_model = AlbertModel.from_pretrained(
            "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_pytorch_model_albert_base_191k.bin",
            from_tf=False,
            config=config)
        model.albert = albert_model

    elif 'babert-opensubtitle' == args['model_checkpoint']:
        # babert-opensubtitle
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-opensubtitle/vocab.txt"
        config_path = "../embeddings/babert-opensubtitle/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-opensubtitle/model.ckpt-1000000.index",
            from_tf=True,
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-large-uncased-1100k' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_pytorch_model_babert_large_1100k.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-large-uncased-1m' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_pytorch_model_babert_large_1mil.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-base-512' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_pytorch_model_babert_base_uncased_512.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-large-uncased' == args['model_checkpoint']:
        # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_pytorch_model_babert_large_778500.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'babert-bpe-mlm-uncased-128-dup10-5' == args['model_checkpoint']:
        # babert_bpe_wwmlm
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/vocab.txt"
        config_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/bert_config.json"

        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/pytorch_model.bin",
            config=config)
        model.bert = bert_model.bert

    elif 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)

    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'word2vec' in args['model_checkpoint'] or 'fasttext' in args[
            'model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None

        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]

        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map,
                                    word_tokenizer,
                                    lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        if args['model_checkpoint'] == 'word2vec-twitter':
            embeddings = gen_embeddings(vocab_list, emb_path)
            config.hidden_size = 400
            config.num_attention_heads = 8

        if args['model_checkpoint'] == 'fasttext-cc-id' or args[
                'model_checkpoint'] == 'fasttext-cc-id-300-no-oov-uncased' or args[
                    'model_checkpoint'] == 'fasttext-4B-id-300-no-oov-uncased':
            embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
            config.hidden_size = 300
            config.num_attention_heads = 10

        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))

    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)

    return model, tokenizer, vocab_path, config_path
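A minimal usage sketch for the factory above. The wrapper name get_model and the exact args layout are assumptions for illustration (only keys that the branches above actually read are shown); neither is confirmed by the source.

# Hypothetical driver; `get_model` is an assumed name for the factory above.
args = {
    'model_checkpoint': 'xlm-roberta-base',
    'task': 'sequence_classification',
    'num_labels': 5,
}
model, tokenizer, vocab_path, config_path = get_model(args)
print(type(model).__name__)     # -> XLMRobertaForSequenceClassification
print(model.config.num_labels)  # -> 5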
Exemple #25
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load XLMRobertaForSequenceClassification, the pretrained XLMRoberta model with a single
# linear classification layer on top.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base", # Use the 12-layer XLMRoberta model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()


# Note: AdamW here is the class from the Hugging Face transformers library
# (as opposed to torch.optim); the 'W' stands for the decoupled weight-decay fix.
optimizer = AdamW(model.parameters(),
                  lr = 1e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
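A hedged companion sketch for the optimizer above: newer transformers releases deprecate the bundled AdamW in favour of torch.optim.AdamW, and a warmup/decay schedule is usually attached with get_linear_schedule_with_warmup. It reuses model and train_dataloader from the snippet above; the epochs value is an assumption for illustration.

import torch
from transformers import get_linear_schedule_with_warmup

# torch.optim.AdamW implements the same decoupled weight-decay ("W") behaviour.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)

epochs = 4  # assumed value, purely for illustration
total_steps = len(train_dataloader) * epochs

# Linear warmup followed by a linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)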
Exemple #26
def train():
    # load model and tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    #dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(
        train_dataset,
        tokenizer)  # keys: input_ids, token_type_ids, attention_mask
    #tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    #RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'RUNNING ON {device}')
    # setting model hyperparameter
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.to(device)
    # There are many other options besides the ones used here.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=3,  # number of total save model.
        save_steps=500,  # model saving step.
        num_train_epochs=10,  # total number of training epochs
        learning_rate=1e-5,  # learning_rate
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=32,  # batch size for evaluation
        warmup_steps=300,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        # evaluation_strategy='steps', # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate at the end of each epoch.
        # eval_steps = 500,            # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        # eval_dataset=RE_dev_dataset,             # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
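    # Follow-up not in the original snippet: persist the fine-tuned weights and
    # tokenizer so a separate inference script can reload them with
    # XLMRobertaForSequenceClassification.from_pretrained(...).
    # The output path below is an assumption for illustration.
    trainer.save_model('./results/best')
    tokenizer.save_pretrained('./results/best')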
        # differentiates padding from non-padding
        attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

batch_size = 1
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data,
                                   sampler=prediction_sampler,
                                   batch_size=batch_size)

model = XLMRobertaForSequenceClassification.from_pretrained(
    './models/XLM/XLMRoBERTa-Multi')

model.to(device)
model.eval()

# Tracking variables
predictions, true_labels = [], []
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        # Get predictions (logits only); no labels are passed, so no loss is computed
        outputs = model(b_input_ids, attention_mask=b_input_mask)

    # Store logits and labels on the CPU for later evaluation
    predictions.append(outputs[0].detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())
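A small, hedged post-processing sketch for the lists filled in the loop above; it assumes numpy is available and that flat accuracy is a sensible summary for this task.

import numpy as np

# Concatenate the per-batch logits and take the argmax as the predicted class.
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1)
flat_true_labels = np.concatenate(true_labels, axis=0)

accuracy = (flat_predictions == flat_true_labels).mean()
print(f"Accuracy: {accuracy:.4f}")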
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32

train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset,
                                   sampler=SequentialSampler(val_dataset),
                                   batch_size=batch_size)

# Load XLMRobertaForSequenceClassification, the pretrained XLM-RoBERTa model with
# a single linear classification layer on top.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "./drive/MyDrive/XLM/XLMRoBERTa-B/",
    #'xlm-roberta-base',
    num_labels=2,  # binary classification
    output_attentions=False,
    output_hidden_states=False)

# Freezing all layers except the classification head
# (for XLM-R the encoder is exposed as `model.roberta`, not `model.bert`;
#  a working sketch appears after this example)
# for param in model.roberta.parameters():
#     param.requires_grad = False

# Use GPU
model.cuda()

# if layers frozen
#optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr = 5e-5, eps = 1e-8)
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

epochs = 8
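If the freezing path mentioned above were enabled, this is roughly what it would look like for the XLM-R classifier; a hedged sketch, assuming only the classification head should stay trainable.

# Freeze the XLM-R encoder so that only the classification head is updated.
for param in model.roberta.parameters():
    param.requires_grad = False

# Sanity check: report how many parameters are still trainable.
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters after freezing: {trainable:,}")

# As noted above, the optimizer should then only receive trainable parameters:
# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5, eps=1e-8)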
Exemple #29
                                           collate_fn=train_dataset.spam_collate_func,
                                           shuffle=False)

import random

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)



model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False)

model.cuda()

optimizer = AdamW(model.parameters(),
                  lr = 1e-5, 
                  eps = 1e-8, 
                  weight_decay = 0.01
                )

"""
optimizer = Adafactor( model.parameters(),
                      lr=1e-5,
                      eps=(1e-30, 1e-3),
                      clip_threshold=1.0,
Exemple #30
def train():
    # load model and tokenizer
    #MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = 'xlm-roberta-large'
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." +
                                       tokenizer.sep_token + "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    #dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values
    # train_dataset, dev_dataset = load_fold(6)
    # train_label = train_dataset['label'].values
    #dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    #tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    #RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    train_dataset, dev_dataset = torch.utils.data.random_split(
        RE_train_dataset, [8000, 1001])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    #model.parameters
    model.to(device)

    # There are many other options besides the ones used here.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments for details.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=3,  # number of total save model.
        save_steps=300,  # model saving step.
        load_best_model_at_end=True,
        num_train_epochs=10,  # total number of training epochs
        learning_rate=1e-5,  # learning_rate
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        warmup_steps=300,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate at the end of each epoch.
        eval_steps=300,  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=dev_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )

    # train model
    trainer.train()
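    # Follow-up not in the original snippet: with periodic evaluation and
    # load_best_model_at_end=True, the best checkpoint is restored after training,
    # so a final evaluation pass on dev_dataset summarizes the run.
    metrics = trainer.evaluate()
    print(metrics)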