def bert_config(self):
    if self.bert_model_name.startswith('bert-'):
        return BertConfig.from_pretrained(self.bert_model_name,
                                          cache_dir=self.bert_cache_dir)
    elif self.bert_model_name.startswith('roberta-'):
        return RobertaConfig.from_pretrained(self.bert_model_name,
                                             cache_dir=self.bert_cache_dir)
    elif self.bert_model_name.startswith('xlm-roberta-'):
        return XLMRobertaConfig.from_pretrained(self.bert_model_name,
                                                cache_dir=self.bert_cache_dir)
    else:
        raise ValueError('Unknown model: {}'.format(self.bert_model_name))
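# The prefix checks above hand-roll what AutoConfig already does by reading the
# model type from the checkpoint name. A minimal sketch of the equivalent call
# (added for illustration, not part of the original snippet):
from transformers import AutoConfig

config = AutoConfig.from_pretrained('xlm-roberta-base')  # cache_dir= is also accepted
print(type(config).__name__)  # -> XLMRobertaConfig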
def __init__(self):
    super(XLMRobertaLargeTC, self).__init__()
    config = XLMRobertaConfig.from_pretrained('xlm-roberta-large',
                                              output_hidden_states=True)
    self.xlm_roberta = XLMRobertaModel.from_pretrained('xlm-roberta-large',
                                                       config=config)
    self.fc = nn.Linear(config.hidden_size, 1)
    self.dropout = nn.Dropout(p=0.2)

    # initialize weight
    nn.init.normal_(self.fc.weight, std=0.02)
    nn.init.normal_(self.fc.bias, 0)
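# A hedged sketch of a forward pass consistent with the layers defined above;
# the original forward method is not shown, and pooling the <s> token of the
# last hidden state is an assumption.
def forward(self, input_ids, attention_mask=None):
    # last hidden state: [batch_size, seq_len, hidden_size]
    last_hidden = self.xlm_roberta(input_ids, attention_mask=attention_mask)[0]
    # first (<s>) token as the sentence representation
    return self.fc(self.dropout(last_hidden[:, 0]))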
def define_config(name):
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base",
            "monologg/kobert"
    ]:
        return BertConfig.from_pretrained(name)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraConfig.from_pretrained(name)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaConfig.from_pretrained(name)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelConfig.from_pretrained(name)
    else:
        # fail loudly instead of silently returning None for unknown names
        raise ValueError(f"Unknown model name: {name}")
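# A hypothetical usage of define_config; AutoTokenizer is assumed to resolve a
# matching tokenizer for the listed checkpoints (may not hold for every entry):
from transformers import AutoTokenizer

name = "xlm-roberta-large"
config = define_config(name)
tokenizer = AutoTokenizer.from_pretrained(name)
print(type(config).__name__, type(tokenizer).__name__)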
def load_model(self, relation_model: str):
    super().load_model(relation_model)
    if not isinstance(relation_model, str):
        raise ArgumentError("Argument 'relation_model' must be a str")
    if len(relation_model) == 0:
        raise ArgumentError(
            "Argument 'relation_model' filename must not be empty")
    if not os.path.isfile(relation_model):
        raise RelationModelError("Relation model file doesn't exist")

    print('Loading model for relation extraction...', end=' ')
    args = {
        'NUM_LABELS': len(self.data.relation_labels),
        'DROPOUT_RATE': 0.1,
        'LEARNING_RATE': 2e-5,
        'EPOCHS': 5,
        'MAX_SEQUENCE_LENGTH': 384,
        'BATCH_SIZE': 16,
        'ADAM_EPSILON': 1e-8,
        'GRADIENT_ACCUMULATION_STEPS': 1,
        'MAX_GRAD_NORM': 1.0,
        'LOGGING_STEPS': 250,
        'SAVE_STEPS': 250,
        'WEIGHT_DECAY': 0.0,
        'NUM_WARMUP_STEPS': 0,
    }
    try:
        self.tokenizer.add_special_tokens({
            "additional_special_tokens": ["<e1>", "</e1>", "<e2>", "</e2>"]
        })
        config = XLMRobertaConfig.from_pretrained(
            'xlm-roberta-base', num_labels=args['NUM_LABELS'])
        self.model = RBERT.from_pretrained(relation_model,
                                           config=config,
                                           args=args)
        self.model.to(self.device)
    except Exception:  # wrap any loading failure in a RelationModelError
        raise RelationModelError(
            'Error with loading relation model. '
            'Relation model should be a PyTorch model created for relation extraction.'
        )
    print('OK')
def train():
    seed_everything(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # setting model hyperparameters
    # The config itself carries no trained weights, so from_pretrained is used to load them.
    # bert_config = BertConfig.from_pretrained(MODEL_NAME)
    # bert_config.num_labels = 42
    # model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    # Auto
    model_config = XLMRobertaConfig.from_pretrained(args.model_name)
    model_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        args.model_name, config=model_config)

    # load model and tokenizer
    # MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    # roberta: https://huggingface.co/transformers/model_doc/xlmroberta.html
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # label = dataset['label'].values
    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=0.2,
                                                  random_state=args.seed)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer)
    tokenized_train_label = train_dataset['label'].values
    tokenized_val_label = val_dataset['label'].values

    # train_datasets = TokenDataset(train_dataset, tokenizer)
    # val_datasets = TokenDataset(val_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, tokenized_train_label)
    RE_val_dataset = RE_Dataset(tokenized_val, tokenized_val_label)

    # print(model.parameters)
    model.to(device)
    model = torch.nn.DataParallel(model)

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        pin_memory=torch.cuda.is_available(),
        shuffle=True,
    )
    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
    )

    optimizer = AdamW(model.parameters(),
                      lr=args.lr,
                      weight_decay=args.weight_decay)
    loss_fn = LabelSmoothingLoss(smoothing=0.5)
    # loss_fn = nn.CrossEntropyLoss()

    # t_total = len(train_loader) * args.epoch
    t_total = args.epoch
    warmup_step = int(t_total * args.warmup_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    log_dir = ""
    log_list = glob("./logs/*")
    if len(log_list) == 0:
        log_dir = "./logs/exp1"
    else:
        log_list = [int(log[-1]) for log in log_list]
        log_dir = "./logs/exp" + str(max(log_list) + 1)
    logger = SummaryWriter(log_dir=log_dir)

    scaler = GradScaler()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    import time

    best_acc = 0.0  # best validation accuracy, tracked across epochs
    for epoch in tqdm(range(args.epoch)):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0

        model.train()
        for batch_id, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            optimizer.zero_grad()

            with autocast():
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)

            # loss.backward()
            # optimizer.step()
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()

            train_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
            train_loss += loss

            if (batch_id + 1) % args.logging_steps == 0:
                train_loss = train_loss.data.cpu().numpy()
                print(
                    f"[Train] epoch {epoch + 1} | batch_id {batch_id + 1} | loss {(train_loss) / args.logging_steps:.4f} | train_acc {train_acc / args.logging_steps:.4f}"
                )
                logger.add_scalar("Train/loss",
                                  train_loss / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                logger.add_scalar("Train/acc",
                                  train_acc / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                train_acc = 0.0
                train_loss = 0.0
        # scheduler.step()

        print("\nStart Validation Step!")
        with torch.no_grad():
            model.eval()
            for batch_id, batch in enumerate(tqdm(val_loader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = loss_fn(outputs.logits, labels)

                val_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
                val_loss += loss

            print(
                f"[Val] epoch {epoch + 1} | val_acc {val_acc / (batch_id + 1):.4f}"
            )
            logger.add_scalar("Val/loss", val_loss / (batch_id + 1), epoch)
            logger.add_scalar("Val/acc", val_acc / (batch_id + 1), epoch)

            if val_acc >= best_acc:
                best_acc = val_acc
                # torch.save(model.state_dict(), os.path.join(args.output_dir, "saved_" + str(epoch) + ".pth"))
                torch.save(model.state_dict(),
                           os.path.join(args.output_dir, "best.pth"))
                print("Saved best acc model...")

        scheduler.step()

    torch.save(model.state_dict(), os.path.join(args.output_dir, "last.pth"))
def train(): # load model and tokenizer MODEL_NAME = "xlm-roberta-large" tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME) # load dataset train_dataset = load_data("/opt/ml/input/data/train/train.tsv") #dev_dataset = load_data("./dataset/train/dev.tsv") train_label = train_dataset['label'].values #dev_label = dev_dataset['label'].values # tokenizing dataset tokenized_train = tokenized_dataset( train_dataset, tokenizer) # keys: input_ids, token_type_ids, attention_mask #tokenized_dev = tokenized_dataset(dev_dataset, tokenizer) # make dataset for pytorch. RE_train_dataset = RE_Dataset(tokenized_train, train_label) #RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') print(f'RUNNING ON {device}') # setting model hyperparameter bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME) bert_config.num_labels = 42 model = XLMRobertaForSequenceClassification.from_pretrained( MODEL_NAME, config=bert_config) model.parameters model.to(device) # 사용한 option 외에도 다양한 option들이 있습니다. # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. training_args = TrainingArguments( output_dir='./results', # output directory save_total_limit=3, # number of total save model. save_steps=500, # model saving step. num_train_epochs=10, # total number of training epochs learning_rate=1e-5, # learning_rate per_device_train_batch_size=32, # batch size per device during training per_device_eval_batch_size=32, # batch size for evaluation warmup_steps=300, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=100, # log saving step. # evaluation_strategy='steps', # evaluation strategy to adopt during training # `no`: No evaluation during training. # `steps`: Evaluate every `eval_steps`. # `epoch`: Evaluate every end of epoch. # eval_steps = 500, # evaluation step. dataloader_num_workers=4, label_smoothing_factor=0.5) trainer = Trainer( model=model, # the instantiated 🤗 Transformers model to be trained args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset # eval_dataset=RE_dev_dataset, # evaluation dataset compute_metrics=compute_metrics # define metrics function ) # train model trainer.train()
"backbone_name": "xlm-roberta-large", "model_name": "mlm_disentangle_default", "gradient_acc_size": 16, "batch_size": 2, "max_step": 3375, "log_step": 225, "num_frozen_layers": 18, "mlm_lr": 4e-4, "mlm_beta1": 0.9, "mlm_beta2": 0.98, "mlm_eps": 1e-6 } }""", cls=ExperimentConfigSerializer, ) XLMRConfig = XLMRobertaConfig.from_pretrained("xlm-roberta-large") setattr(XLMRConfig, "discriminators", exp_config["discriminators"]) model = MultitaskModel.create_untrained( backbone_name="xlm-roberta-large", task_dict={ "mlm": { "type": XLMRobertaForMaskedLM, "config": XLMRobertaConfig.from_pretrained("xlm-roberta-large"), }, "disentangle": { "type": XLMRobertaForDisentanglement, "config": XLMRConfig, }, }, ) import torch
def load_model(args): if 'bert-base-multilingual' in args['model_checkpoint']: # bert-base-multilingual-uncased or bert-base-multilingual-cased # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint']) config = BertConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = BertForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'xlm-mlm' in args['model_checkpoint']: # xlm-mlm-100-1280 # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint']) config = XLMConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = XLMForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = XLMForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = XLMForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'xlm-roberta' in args['model_checkpoint']: # xlm-roberta-base or xlm-roberta-large # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = XLMRobertaTokenizer.from_pretrained( args['model_checkpoint']) config = XLMRobertaConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = XLMRobertaForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = XLMRobertaForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = XLMRobertaForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'fasttext' in args['model_checkpoint']: # Prepare config & tokenizer vocab_path = args['vocab_path'] config_path = None word_tokenizer = args['word_tokenizer_class']() emb_path = args['embedding_path'][args['model_checkpoint']] _, vocab_map = load_vocab(vocab_path) tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=args["lower"]) vocab_list = list(tokenizer.vocab.keys()) config = BertConfig.from_pretrained('bert-base-uncased') if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] config.num_hidden_layers = args["num_layers"] embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300) config.hidden_size = 300 config.num_attention_heads = 10 
config.vocab_size = len(embeddings) # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) model.bert.embeddings.word_embeddings.weight.data.copy_( torch.FloatTensor(embeddings)) elif 'token_classification' == args['task']: model = BertForWordClassification(config) model.bert.embeddings.word_embeddings.weight.data.copy_( torch.FloatTensor(embeddings)) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) model.bert.embeddings.word_embeddings.weight.data.copy_( torch.FloatTensor(embeddings)) elif 'scratch' in args['model_checkpoint']: vocab_path, config_path = None, None tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") config = BertConfig.from_pretrained("bert-base-uncased") if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] config.num_hidden_layers = args["num_layers"] config.hidden_size = 300 config.num_attention_heads = 10 if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config=config) elif 'token_classification' == args['task']: model = BertForWordClassification(config=config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config=config) elif 'indobenchmark' in args['model_checkpoint']: # indobenchmark models # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint']) config = BertConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model model_class = None if 'sequence_classification' == args['task']: model_class = AlbertForSequenceClassification if 'lite' in args[ 'model_checkpoint'] else BertForSequenceClassification elif 'token_classification' == args['task']: model_class = AlbertForWordClassification if 'lite' in args[ 'model_checkpoint'] else BertForWordClassification elif 'multi_label_classification' == args['task']: model_class = AlbertForMultiLabelClassification if 'lite' in args[ 'model_checkpoint'] else BertForMultiLabelClassification model = model_class.from_pretrained(args['model_checkpoint'], config=config) return model, tokenizer, vocab_path, config_path
def load_eval_model(args):
    vocab_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/vocab.txt'
    config_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/config.json'
    model_path = f'./{args["model_dir"]}/{args["dataset"]}/{args["experiment_name"]}/best_model_0.th'

    # Load for word2vec and fasttext
    if 'word2vec' in args['model_type'] or 'fasttext' in args['model_type']:
        emb_path = args['embedding_path'][args['model_type']]
        model, tokenizer = load_word_embedding_model(
            args['model_type'],
            args['task'],
            vocab_path,
            args['word_tokenizer_class'],
            emb_path,
            args['num_labels'],
            lower=args['lower'])
        return model, tokenizer

    # Load config & tokenizer
    if 'albert' in args['model_type']:
        config = AlbertConfig.from_json_file(config_path)
        tokenizer = BertTokenizer(vocab_path)
    elif 'babert' in args['model_type']:
        config = BertConfig.from_json_file(config_path)
        tokenizer = BertTokenizer(vocab_path)
    elif 'scratch' in args['model_type']:
        config = BertConfig.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif 'bert-base-multilingual' in args['model_type']:
        config = BertConfig.from_pretrained(args['model_type'])
        tokenizer = BertTokenizer.from_pretrained(args['model_type'])
    elif 'xlm-mlm-100-1280' in args['model_type']:
        config = XLMConfig.from_pretrained(args['model_type'])
        tokenizer = XLMTokenizer.from_pretrained(args['model_type'])
    elif 'xlm-roberta' in args['model_type']:
        config = XLMRobertaConfig.from_pretrained(args['model_type'])
        tokenizer = XLMRobertaTokenizer.from_pretrained(args['model_type'])
    else:
        raise ValueError('Invalid `model_type` argument values')

    # Get model class
    base_cls, pred_cls = get_model_class(args['model_type'], args['task'])

    # Adjust config
    if type(args['num_labels']) == list:
        config.num_labels = max(args['num_labels'])
        config.num_labels_list = args['num_labels']
    else:
        config.num_labels = args['num_labels']

    # Instantiate model
    model = pred_cls(config=config)
    base_model = base_cls.from_pretrained(model_path,
                                          from_tf=False,
                                          config=config)

    # Plug pretrained base model into the classification model
    if 'bert' in model.__dir__():
        model.bert = base_model
    elif 'albert' in model.__dir__():
        model.albert = base_model
    elif 'roberta' in model.__dir__():
        model.roberta = base_model
    elif 'transformer' in model.__dir__():
        model.transformer = base_model
    else:
        raise ValueError(
            'Model attribute not found, is there any change in the `transformers` library?'
        )

    return model, tokenizer
def load_model(args): if 'albert-large-wwmlm-512' == args['model_checkpoint']: vocab_path = "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_vocab_uncased_30000.txt" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file( "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_albert_large_config.json" ) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_pytorch_albert_large_512_629k.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-base-wwmlm-512' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_vocab_uncased_30000.txt" config_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_albert_base_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_pytorch_model_albert_base_162k.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-large-wwmlm-128' == args['model_checkpoint']: vocab_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_vocab_uncased_30000.txt" config_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_albert_large_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_pytorch_albert_large_128_500k.bin", from_tf=False, config=config) model.albert = albert_model elif 'babert-bpe-mlm-large-512' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_vocab_uncased_30522.txt" config_path = 
"../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_bert_large_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_pytorch_babert_uncased_large_512_dup10-5_1120k.bin", config=config) model.bert = bert_model.bert elif 'albert-base-uncased-112500' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-uncased-112500/vocab.txt" config_path = "../embeddings/albert-base-uncased-112500/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-uncased-112500/albert_base_uncased_112500.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-base-uncased-96000' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-uncased-96000/vocab.txt" config_path = "../embeddings/albert-base-uncased-96000/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-uncased-96000/albert_base_uncased_96000.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-base-uncased-191k' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_vocab_uncased_30000.txt" config_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_albert_base_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 
'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_pytorch_model_albert_base_191k.bin", from_tf=False, config=config) model.albert = albert_model elif 'babert-opensubtitle' == args['model_checkpoint']: # babert-opensubtitle # Prepare config & tokenizer vocab_path = "../embeddings/babert-opensubtitle/vocab.txt" config_path = "../embeddings/babert-opensubtitle/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-opensubtitle/model.ckpt-1000000.index", from_tf=True, config=config) model.bert = bert_model.bert elif 'babert-bpe-mlm-large-uncased-1100k' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_vocab_uncased_30522.txt" config_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_pytorch_model_babert_large_1100k.bin", config=config) model.bert = bert_model.bert elif 'babert-bpe-mlm-large-uncased-1m' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_vocab_uncased_30522.txt" config_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = 
BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_pytorch_model_babert_large_1mil.bin", config=config) model.bert = bert_model.bert elif 'babert-base-512' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_vocab_uncased_30522.txt" config_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-base-512/pytorch_models_babert_base_512_pytorch_model_babert_base_uncased_512.bin", config=config) model.bert = bert_model.bert elif 'babert-bpe-mlm-large-uncased' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_vocab_uncased_30522.txt" config_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_pytorch_model_babert_large_778500.bin", config=config) model.bert = bert_model.bert elif 'babert-bpe-mlm-uncased-128-dup10-5' == args['model_checkpoint']: # babert_bpe_wwmlm # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/vocab.txt" config_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/pytorch_model.bin", config=config) model.bert = bert_model.bert elif 'bert-base-multilingual' in args['model_checkpoint']: # bert-base-multilingual-uncased or bert-base-multilingual-cased 
# Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint']) config = BertConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = BertForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'xlm-mlm' in args['model_checkpoint']: # xlm-mlm-100-1280 # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint']) config = XLMConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = XLMForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = XLMForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = XLMForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'xlm-roberta' in args['model_checkpoint']: # xlm-roberta-base or xlm-roberta-large # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = XLMRobertaTokenizer.from_pretrained( args['model_checkpoint']) config = XLMRobertaConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = XLMRobertaForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = XLMRobertaForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = XLMRobertaForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'word2vec' in args['model_checkpoint'] or 'fasttext' in args[ 'model_checkpoint']: # Prepare config & tokenizer vocab_path = args['vocab_path'] config_path = None word_tokenizer = args['word_tokenizer_class']() emb_path = args['embedding_path'][args['model_checkpoint']] _, vocab_map = load_vocab(vocab_path) tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=args["lower"]) vocab_list = list(tokenizer.vocab.keys()) config = BertConfig.from_pretrained('bert-base-uncased') if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] config.num_hidden_layers = args["num_layers"] if args['model_checkpoint'] == 'word2vec-twitter': embeddings = gen_embeddings(vocab_list, emb_path) config.hidden_size = 400 config.num_attention_heads = 8 if args['model_checkpoint'] == 'fasttext-cc-id' or args[ 
'model_checkpoint'] == 'fasttext-cc-id-300-no-oov-uncased' or args[ 'model_checkpoint'] == 'fasttext-4B-id-300-no-oov-uncased': embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300) config.hidden_size = 300 config.num_attention_heads = 10 config.vocab_size = len(embeddings) # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) model.bert.embeddings.word_embeddings.weight.data.copy_( torch.FloatTensor(embeddings)) elif 'token_classification' == args['task']: model = BertForWordClassification(config) model.bert.embeddings.word_embeddings.weight.data.copy_( torch.FloatTensor(embeddings)) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) model.bert.embeddings.word_embeddings.weight.data.copy_( torch.FloatTensor(embeddings)) elif 'scratch' in args['model_checkpoint']: vocab_path, config_path = None, None tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") config = BertConfig.from_pretrained("bert-base-uncased") if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] config.num_hidden_layers = args["num_layers"] config.hidden_size = 300 config.num_attention_heads = 10 if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config=config) elif 'token_classification' == args['task']: model = BertForWordClassification(config=config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config=config) elif 'indobenchmark' in args['model_checkpoint']: # indobenchmark models # Prepare config & tokenizer vocab_path, config_path = None, None tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint']) config = BertConfig.from_pretrained(args['model_checkpoint']) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'token_classification' == args['task']: model = BertForWordClassification.from_pretrained( args['model_checkpoint'], config=config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification.from_pretrained( args['model_checkpoint'], config=config) return model, tokenizer, vocab_path, config_path
def train(): # load model and tokenizer # "bert-base-multilingual-cased",kykim/bert-kor-base,roberta-large-mnli MODEL_NAME = "xlm-roberta-large" tokenizer = XLMRobertaTokenizer.from_pretrained( # roberta기준 MODEL_NAME ) # XLMRobertaTokenizer AutoTokenizer.from_pretrained(MODEL_NAME) # autoTokenizer로 자동으로 가져온다. # 데이터셋 나누기 datas = pd.read_csv("/opt/ml/input/data/train/train.tsv", delimiter='\t', header=None) train, val = train_test_split(datas, test_size=0.2, random_state=42) # train = pd.DataFrame(data=train) train.to_csv('/opt/ml/input/data/train/train_train.tsv', sep='\t', header=None, index=False) val.to_csv('/opt/ml/input/data/train/train_val.tsv', sep='\t', header=None, index=False) # load dataset train_dataset = load_data("/opt/ml/input/data/train/train.tsv") train_label = train_dataset['label'].values dev_dataset = load_data("/opt/ml/input/data/train/train_val.tsv") dev_label = dev_dataset['label'].values # tokenizing dataset tokenized_train = tokenized_dataset(train_dataset, tokenizer) tokenized_dev = tokenized_dataset(dev_dataset, tokenizer) # make dataset for pytorch. RE_train_dataset = RE_Dataset(tokenized_train, train_label) RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # setting model hyperparameter bert_config = XLMRobertaConfig.from_pretrained( MODEL_NAME) # XLMRobertaConfig,BertConfig.from_pretrained(MODEL_NAME) # print(bert_config) # bert_config.vocab_size = 42004 # 4개 추가 안됨 bert_config.num_labels = 42 model = XLMRobertaForSequenceClassification.from_pretrained( # XLMRobertaForSequenceClassification AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, config=bert_config) model.resize_token_embeddings(len(tokenizer)) # 새로운 토큰 추가로 임베딩 크기 조정 # model = BertForSequenceClassification(bert_config) # from_pretrained가져오기 model.parameters model.to(device) # 사용한 option 외에도 다양한 option들이 있습니다. # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. training_args = TrainingArguments( output_dir='./results', # output directory save_total_limit=3, # number of total save model. save_steps=500, # model saving step. num_train_epochs=10, # total number of training epochs learning_rate=1e-5, # learning_rate per_device_train_batch_size=32, # batch size per device during training warmup_steps=500, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir='./logs', # directory for storing logs logging_steps=100, # log saving step. per_device_eval_batch_size=8, # batch size for evaluation evaluation_strategy= 'steps', # evaluation strategy to adopt during training # `no`: No evaluation during training. # `steps`: Evaluate every `eval_steps`. # `epoch`: Evaluate every end of epoch. eval_steps=500, # evaluation step. # dataloader_num_workers=4, # label_smoothing_factor=0.5 # 제일 높은것만 저장한다고 한다. # load_best_model_at_end=True, # metric_for_best_model="accuracy" ) trainer = Trainer( # the instantiated 🤗 Transformers model to be trained model=model, args=training_args, # training arguments, defined above train_dataset=RE_train_dataset, # training dataset eval_dataset=RE_dev_dataset, # evaluation dataset compute_metrics=compute_metrics # define metrics function ) # train model trainer.train()
def train(args): # load model and tokenizer MODEL_NAME = "xlm-roberta-large" tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME) # split dataset dataset = pd.read_csv('/opt/ml/input/data/train/train.tsv', delimiter='\t', header=None) train, dev = train_test_split(dataset, test_size=0.2, random_state=42) train.to_csv('/opt/ml/input/data/train/train_train.tsv', sep='\t', header=None, index=False) dev.to_csv('/opt/ml/input/data/train/train_dev.tsv', sep='\t', header=None, index=False) # load dataset train_dataset = load_data('/opt/ml/input/data/train/train_train.tsv', args.root) dev_dataset = load_data('/opt/ml/input/data/train/train_dev.tsv', args.root) train_label = train_dataset['label'].values dev_label = dev_dataset['label'].values # tokenizing dataset tokenized_train = tokenized_dataset(train_dataset, tokenizer) tokenized_dev = tokenized_dataset(dev_dataset, tokenizer) # make dataset for pytorch. RE_train_dataset = RE_Dataset(tokenized_train, train_label) RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') # setting model hyperparameter bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME) bert_config.num_labels = 42 model = XLMRobertaForSequenceClassification.from_pretrained( MODEL_NAME, config=bert_config) model.to(device) # 사용한 option 외에도 다양한 option들이 있습니다. # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments 참고해주세요. training_args = TrainingArguments(output_dir='./results/' + str(args.id), save_total_limit=3, save_steps=100, num_train_epochs=10, learning_rate=1e-5, per_device_train_batch_size=32, per_device_eval_batch_size=32, warmup_steps=300, weight_decay=0.01, logging_dir='./logs/' + str(args.id), logging_steps=100, evaluation_strategy='steps', eval_steps=100, dataloader_num_workers=4, label_smoothing_factor=0.5) trainer = Trainer(model=model, args=training_args, train_dataset=RE_train_dataset, eval_dataset=RE_dev_dataset, compute_metrics=compute_metrics) # train model trainer.train()
def train():
    # load model and tokenizer
    # MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = 'xlm-roberta-large'
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." +
                                       tokenizer.sep_token + "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # train_dataset, dev_dataset = load_fold(6)
    # train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    # tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    # RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    train_dataset, dev_dataset = torch.utils.data.random_split(
        RE_train_dataset, [8000, 1001])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameters
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    # model.parameters
    model.to(device)

    # Besides the options used here, TrainingArguments supports many more;
    # see https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments.
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        save_total_limit=3,  # number of saved checkpoints to keep
        save_steps=300,  # model saving step.
        load_best_model_at_end=True,
        num_train_epochs=10,  # total number of training epochs
        learning_rate=1e-5,  # learning_rate
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        warmup_steps=300,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=300,  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=dev_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # define metrics function
    )

    # train model
    trainer.train()
def train(model_dir, args):
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("This notebook uses [%s]." % (device))

    s_dir = (args.model + str(args.num_hidden_layers) + '-' + args.preprocess +
             '-epoch' + str(args.epochs) + '-' + args.scheduler + '-' +
             args.tokenize + '-' + str(args.max_len) + '-' + str(args.seed))
    save_dir = increment_path(os.path.join(model_dir, s_dir))
    log_dir = increment_path(os.path.join('logs', s_dir))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # set neptune
    set_neptune(save_dir, args)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameters
    if MODEL_NAME.startswith('xlm'):
        bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    else:
        bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    if MODEL_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    else:
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    if args.drop >= 0:
        model.dropout = nn.Dropout(p=args.drop)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # train, val split
    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=args.val_ratio,
                                                  random_state=args.seed)
    tok_module = getattr(import_module("load_data"), args.tokenize)
    train_tokenized = tok_module(train_dataset, tokenizer, max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(
        train_tokenized, train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    model.to(device)

    # Besides the options used here, TrainingArguments supports many more;
    # see https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments.
    training_args = TrainingArguments(
        seed=args.seed,
        output_dir=save_dir,  # output directory
        save_total_limit=2,  # number of saved checkpoints to keep
        save_steps=args.save_steps,  # model saving step.
        num_train_epochs=args.epochs,  # total number of training epochs
        learning_rate=args.lr,  # learning_rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=16,  # batch size for evaluation
        lr_scheduler_type=args.scheduler,
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=log_dir,  # directory for storing logs
        logging_steps=100,  # log saving step.
        evaluation_strategy='steps',  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=100,  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=args.smoothing_factor,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy')

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
def train(args):
    # device setting
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load model and tokenizer
    if args['model_name'] == "xlm-roberta-large":
        MODEL_NAME = "xlm-roberta-large"
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
        config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
        config.num_labels = args['num_labels']
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=config)
    elif args['model_name'] == "roberta-base":
        MODEL_NAME = "roberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        config = RobertaConfig.from_pretrained(MODEL_NAME,
                                               output_hidden_states=True)
        config.num_labels = args['num_labels']
        model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME,
                                                                 config=config)
    elif args['model_name'] == "bert-base-multilingual-cased":
        MODEL_NAME = "bert-base-multilingual-cased"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        config = BertConfig.from_pretrained(MODEL_NAME)
        config.num_labels = args['num_labels']
        model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                              config=config)
    else:
        MODEL_NAME = args['model_name']
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)

    # if you use entity_token
    if args['entity_token']:
        special_tokens_dict = {
            'additional_special_tokens': ["#", "@", '₩', '^']
        }
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        model.resize_token_embeddings(len(tokenizer))

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_dataset, valid_dataset = train_test_split(
        dataset, test_size=0.1, random_state=args['random_seed'])
    train_label = train_dataset['label'].values
    valid_label = valid_dataset['label'].values

    # pororo ner
    ner = Pororo(task="ner", lang="ko")

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer, ner, args)
    tokenized_valid = tokenized_dataset(valid_dataset, tokenizer, ner, args)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_valid_dataset = RE_Dataset(tokenized_valid, valid_label)

    # update model setting
    model.to(device)

    # Besides the options used here, TrainingArguments supports many more;
    # see https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments.
    print("use_trainer : ", args['use_trainer'])
    if args['use_trainer']:
        training_args = TrainingArguments(
            output_dir='./results',  # output directory
            save_total_limit=5,  # number of saved checkpoints to keep
            save_steps=500,  # model saving step.
            num_train_epochs=args['epochs'],  # total number of training epochs
            learning_rate=args['lr'],  # learning_rate
            per_device_train_batch_size=args['train_batch_size'],  # batch size per device during training
            per_device_eval_batch_size=args['eval_batch_size'],  # batch size for evaluation
            warmup_steps=args['warmup_steps'],  # number of warmup steps for learning rate scheduler
            weight_decay=args['weight_decay'],  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_steps=args['logging_steps'],  # log saving step.
            label_smoothing_factor=args['label_smoothing_factor'],
            evaluation_strategy='steps',  # evaluation strategy to adopt during training
            # `no`: No evaluation during training.
            # `steps`: Evaluate every `eval_steps`.
            # `epoch`: Evaluate every end of epoch.
            eval_steps=100,  # evaluation step.
        )
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_valid_dataset,  # evaluation dataset
            compute_metrics=compute_metrics  # define metrics function
        )

        # train model
        trainer.train()
    else:
        custom_trainer(model, device, RE_train_dataset, RE_valid_dataset, args)
def __init__(
    self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True
):
    super().__init__(config=config)

    if pre_load_model:
        self.xlm_roberta = XLMRobertaModel.from_pretrained(
            self.config.model_name, output_hidden_states=True
        )
    else:
        xlm_roberta_config = XLMRobertaConfig.from_pretrained(
            self.config.model_name, output_hidden_states=True
        )
        self.xlm_roberta = XLMRobertaModel(xlm_roberta_config)

    self.vocabs = {
        const.TARGET: vocabs[const.TARGET],
        const.SOURCE: vocabs[const.SOURCE],
    }

    self.mlp = None
    if self.config.use_mlp:
        self.mlp = nn.Sequential(
            nn.Linear(self.xlm_roberta.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
        )
        output_size = self.config.hidden_size
    else:
        output_size = self.xlm_roberta.config.hidden_size

    sentence_size = output_size
    if config.pooling == 'mixed':
        sentence_size *= 2

    self.scalar_mix = ScalarMixWithDropout(
        mixture_size=self.xlm_roberta.config.num_hidden_layers + 1,  # +1 for embeddings
        do_layer_norm=self.config.scalar_mix_layer_norm,
        dropout=self.config.scalar_mix_dropout,
    )

    self._sizes = {
        const.TARGET: output_size,
        const.TARGET_LOGITS: output_size,
        const.TARGET_SENTENCE: sentence_size,
        const.SOURCE: output_size,
    }

    self.output_embeddings = self.xlm_roberta.embeddings.word_embeddings

    self._training_steps_ran = 0
    self._is_frozen = False
    if self.config.freeze:
        logger.info(
            'Freezing XLMRoberta encoder weights; training will not update them'
        )
        for param in self.xlm_roberta.parameters():
            param.requires_grad = False
        self._is_frozen = True

    if self.config.freeze_for_number_of_steps > 0:
        # Done inside `forward()` to guarantee we can unfreeze (if optimizer is
        # built after this, we cannot unfreeze without calling
        # `optimizer.add_param_group({'params': self.xlm.parameters()})`
        pass
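# For context, a scalar mix combines all encoder layers with learned softmax
# weights. A minimal sketch of the idea (not the project's ScalarMixWithDropout,
# which additionally supports layer norm and layer dropout):
import torch
import torch.nn as nn

class SimpleScalarMix(nn.Module):
    def __init__(self, mixture_size):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(mixture_size))
        self.gamma = nn.Parameter(torch.ones(1))

    def forward(self, layers):
        # layers: sequence of [batch, seq_len, hidden] tensors, one per layer
        norm_weights = torch.softmax(self.weights, dim=0)
        return self.gamma * sum(w * h for w, h in zip(norm_weights, layers))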
def train():
    # load model and tokenizer
    # MODEL_NAME = "bert-base-multilingual-cased"
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    MODEL_NAME = 'xlm-roberta-large'
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." +
                                       tokenizer.sep_token + "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    label = dataset['label'].values

    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42

    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.8, train_size=0.2)
    for idx, (train_idx, val_idx) in enumerate(cv.split(dataset, label)):
        train_dataset = dataset.iloc[train_idx]
        val_dataset = dataset.iloc[val_idx]

        # tokenizing dataset
        train_dataset = tokenized_dataset(train_dataset, tokenizer)
        val_dataset = tokenized_dataset(val_dataset, tokenizer)
        train_y = label[train_idx]
        val_y = label[val_idx]

        # make dataset for pytorch.
        RE_train_dataset = RE_Dataset(train_dataset, train_y)
        RE_valid_dataset = RE_Dataset(val_dataset, val_y)

        # setting model hyperparameters
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
        model.to(device)

        # Besides the options used here, TrainingArguments supports many more;
        # see https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments.
        training_args = TrainingArguments(
            output_dir='./results',  # output directory
            save_total_limit=2,  # number of saved checkpoints to keep
            save_steps=400,  # model saving step.
            num_train_epochs=10,  # total number of training epochs
            learning_rate=1e-5,  # learning_rate
            per_device_train_batch_size=16,  # batch size per device during training
            # per_device_eval_batch_size=8,  # batch size for evaluation
            warmup_steps=300,  # number of warmup steps for learning rate scheduler
            weight_decay=0.01,  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_steps=100,  # log saving step.
            evaluation_strategy='steps',  # evaluation strategy to adopt during training
            # `no`: No evaluation during training.
            # `steps`: Evaluate every `eval_steps`.
            # `epoch`: Evaluate every end of epoch.
            eval_steps=400,  # evaluation step.
            dataloader_num_workers=4,
            metric_for_best_model="accuracy",
            greater_is_better=True,
            label_smoothing_factor=0.5
        )
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_valid_dataset,  # evaluation dataset
            compute_metrics=compute_metrics,  # define metrics function
        )

        # train model
        trainer.train()
def train(args):
    # load model and tokenizer
    MODEL_NAME = args.model_name
    if args.model_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    # root = '/opt/ml'
    root = args.root
    train_dataset = load_data(root + "/input/data/train/train.tsv", root)
    # train_dataset = load_data(root + "/input/data/train/ner_train_ver2.tsv", root)
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    # tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    # RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    # Define a BalanceClassSampler; here the 'upsampling' option is used.
    # sampler = BalanceClassSampler(RE_train_dataset.get_classes(), 'upsampling')
    # RE_train_loader = DataLoader(RE_train_dataset, batch_size=16, sampler=sampler)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameter
    if args.model_type == 'bert':
        bert_config = BertConfig.from_pretrained(MODEL_NAME)
        bert_config.num_labels = 42
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    elif args.model_type == 'electra':
        electra_config = ElectraConfig.from_pretrained(MODEL_NAME)
        electra_config.num_labels = 42
        model = ElectraForSequenceClassification.from_pretrained(
            MODEL_NAME, config=electra_config)
    elif args.model_type == 'roberta':
        roberta_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
        roberta_config.num_labels = 42
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=roberta_config)
    else:
        raise ValueError('Unknown model_type: {}'.format(args.model_type))

    model.resize_token_embeddings(len(tokenizer))
    # model.parameters
    model.to(device)

    # There are many more options besides the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        output_dir='./results/' + str(args.id),  # output directory
        save_total_limit=args.save_total_limit,  # limit on the total number of saved checkpoints.
        save_steps=args.save_steps,              # model saving step.
        num_train_epochs=args.num_train_epochs,  # total number of training epochs
        learning_rate=args.learning_rate,        # learning_rate
        per_device_train_batch_size=args.per_device_train_batch_size,  # batch size per device during training
        # per_device_eval_batch_size=16,         # batch size for evaluation
        warmup_steps=args.warmup_steps,          # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,          # strength of weight decay
        logging_dir='./logs/' + str(args.id),    # directory for storing logs
        logging_steps=args.logging_steps,        # log saving step.
        # evaluation_strategy='steps',           # evaluation strategy to adopt during training
        #                                        # `no`: No evaluation during training.
        #                                        # `steps`: Evaluate every `eval_steps`.
        #                                        # `epoch`: Evaluate at the end of every epoch.
        # eval_steps=500,                        # evaluation step.
        # save_strategy='epoch',
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,                        # the instantiated 🤗 Transformers model to be trained
        args=training_args,                 # training arguments, defined above
        train_dataset=RE_train_dataset,     # training dataset
        # eval_dataset=RE_dev_dataset,      # evaluation dataset
        # compute_metrics=compute_metrics   # define metrics function
    )

    # train model
    print('Training start')
    trainer.train()
    print('Training finished!')
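The call to resize_token_embeddings above only matters if the tokenizer's vocabulary has grown, for example after adding entity marker tokens. A short sketch of that pattern, with illustrative marker tokens and model name (not taken from this snippet):

from transformers import AutoTokenizer, XLMRobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
# Add hypothetical entity markers; this enlarges the vocabulary.
tokenizer.add_special_tokens({'additional_special_tokens': ['<e1>', '</e1>', '<e2>', '</e2>']})

model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=42)
# Grow the embedding matrix so new token ids have rows to look up.
model.resize_token_embeddings(len(tokenizer))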
}

PUBLIC_MODEL = {
    'mbert': {
        'name': 'bert-base-multilingual-cased',
        'tokenizer': BertTokenizerFast.from_pretrained('bert-base-multilingual-cased'),
        'config': BertConfig.from_pretrained('bert-base-multilingual-cased'),
    },
    'xlmr': {
        'name': 'xlm-roberta-base',
        'tokenizer': XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-base'),
    },
    'xlmr-large': {
        'name': 'xlm-roberta-large',
        'tokenizer': XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-large'),
        'config': XLMRobertaConfig.from_pretrained('xlm-roberta-large'),
    },
}

TOKENIZER_CLS = {
    'spm_camembert': CamembertTokenizer,
    'spm': ThaiRobertaTokenizer,
    'newmm': ThaiWordsNewmmTokenizer,
    'syllable': ThaiWordsSyllableTokenizer,
    'sefr_cut': FakeSefrCutTokenizer,
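A hedged sketch of how the PUBLIC_MODEL registry above might be consumed; the helper name is hypothetical and not part of the original code. Weights are loaded from the 'name' field and shaped by the stored config:

from transformers import AutoModel

def load_public_model(key):
    # Hypothetical helper: look up a registry entry and build its encoder.
    spec = PUBLIC_MODEL[key]
    model = AutoModel.from_pretrained(spec['name'], config=spec['config'])
    return spec['tokenizer'], model

tokenizer, model = load_public_model('xlmr')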
def create(cls,
           model_type='camem',
           model_name="camembert-base",
           embedding_size=768,
           hidden_dim=512,
           rnn_layers=1,
           lstm_dropout=0.5,
           device="cuda",
           mode="weighted",
           key_dim=64,
           val_dim=64,
           num_heads=3,
           attn_dropout=0.3,
           self_attention=False,
           is_require_grad=False):
    configuration = {
        'model_type': model_type,
        "model_name": model_name,
        "device": device,
        "mode": mode,
        "self_attention": self_attention,
        "is_freeze": is_require_grad
    }
    if 'camem' in model_type:
        config_bert = CamembertConfig.from_pretrained(model_name,
                                                      output_hidden_states=True)
        model = CamembertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'flaubert' in model_type:
        config_bert = FlaubertConfig.from_pretrained(model_name,
                                                     output_hidden_states=True)
        model = FlaubertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'XLMRoberta' in model_type:
        config_bert = XLMRobertaConfig.from_pretrained(model_name,
                                                       output_hidden_states=True)
        model = XLMRobertaModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'M-Bert' in model_type:
        config_bert = BertConfig.from_pretrained(model_name,
                                                 output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    else:
        # Guard against falling through with `model` undefined.
        raise ValueError('Unsupported model_type: {}'.format(model_type))

    lstm = BiLSTM.create(embedding_size=embedding_size,
                         hidden_dim=hidden_dim,
                         rnn_layers=rnn_layers,
                         dropout=lstm_dropout)
    attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                              attn_dropout)
    model.train()
    self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
    # if is_freeze: self.freeze()
    return self