def define_model(name, config=None, location=None):
    # If `config` is given, we are training from scratch; otherwise load a checkpoint from `location`.
    if name in [
            "bert-base-multilingual-cased",
            "sangrimlee/bert-base-multilingual-cased-korquad",
            "kykim/bert-kor-base", "monologg/kobert"
    ]:
        return BertForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else BertForSequenceClassification.from_pretrained(location)
    elif name in [
            "monologg/koelectra-base-v3-discriminator",
            "kykim/electra-kor-base"
    ]:
        return ElectraForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else ElectraForSequenceClassification.from_pretrained(location)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else XLMRobertaForSequenceClassification.from_pretrained(location)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else FunnelForSequenceClassification.from_pretrained(location)
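A minimal usage sketch of the two paths; the config class, `num_labels` value, and checkpoint path are illustrative placeholders, not from the original:

# Fresh training: pass a config so the classification head is sized correctly.
config = XLMRobertaConfig.from_pretrained("xlm-roberta-large", num_labels=42)
model = define_model("xlm-roberta-large", config=config)

# Resuming: pass only a checkpoint directory saved earlier with save_pretrained().
model = define_model("xlm-roberta-large", location="./results/checkpoint-500")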
def main(args):
    """Runs inference on any file that follows the same format as the provided dataset tsv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)
    special_tokens_dict = {'additional_special_tokens': ["#", "@", '₩', '^']}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/submission.csv', index=False)
def main(args):
    """Runs inference on any file that follows the same format as the provided dataset tsv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    # TOK_NAME = "bert-base-multilingual-cased"
    # tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = XLMRobertaForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # Please keep the directory and column format below.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv('./prediction/roberta-submission13.csv', index=False)
def main(args):
    """Runs inference on any file that follows the same format as the provided dataset tsv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    p = Path('.').resolve()  # /opt/ml
    model_dir = p / args.model_dir
    model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # Please keep the directory and column format below.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    output.to_csv(
        './prediction/submission.csv',
        index=False,
    )
def main(json_path, model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()

    with open(json_path) as json_file:
        data = json.load(json_file)

    predictions = []
    labels = []
    for pair in data:
        sentence = pair['text']
        label = pair['sentiment']
        inputs = tokenizer.encode(sentence,
                                  padding=False,
                                  truncation=True,
                                  return_tensors='pt').to(DEVICE)
        with torch.no_grad():
            output = model(inputs).logits
        prediction = torch.argmax(output, dim=-1)[0].item()
        predictions.append(prediction)
        labels.append(label)

    print(metrics.classification_report(labels, predictions, digits=6))
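Scoring one sentence at a time keeps the code simple but underuses the GPU. A minimal batched sketch, reusing the `DEVICE` constant from the snippet above (the function name and batch size are assumptions):

def predict_batched(model, tokenizer, sentences, batch_size=32):
    """Tokenize and classify sentences in batches instead of one at a time."""
    predictions = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True,
                           return_tensors='pt').to(DEVICE)
        with torch.no_grad():
            logits = model(**inputs).logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
    return predictions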
def main(output_dir, logging_dir, logging_steps, large, batch_size,
         gradient_accumulation_steps, learning_rate, num_train_epochs,
         warmup_ratio):
    sst_train_dataset = load_dataset('glue', 'sst2', split='train')
    sst_validation_dataset = load_dataset('glue', 'sst2', split='validation')

    if large:
        model_name = 'xlm-roberta-large'
    else:
        model_name = 'xlm-roberta-base'
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name)

    def preprocess_function(examples):
        return tokenizer(examples['sentence'], padding=False, truncation=True)

    sst_train_dataset = sst_train_dataset.map(preprocess_function, batched=True)
    sst_validation_dataset = sst_validation_dataset.map(preprocess_function,
                                                        batched=True)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sst_train_dataset,
        eval_dataset=sst_validation_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()
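`compute_metrics` is referenced but not defined in this snippet. A minimal sketch for SST-2 accuracy, assuming the Trainer's default `EvalPrediction` layout (logits in `predictions`, gold labels in `label_ids`):

import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    # eval_pred.predictions holds raw logits; take the argmax for class ids.
    preds = np.argmax(eval_pred.predictions, axis=-1)
    return {'accuracy': accuracy_score(eval_pred.label_ids, preds)}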
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

    # download the model or load the model path
    model_path = download_model('xlmr.ned',
                                cache_dir,
                                process_func=_unzip_process_func,
                                verbose=verbose)
    self.classes = ['0', '1']

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
    self.model = XLMRobertaForSequenceClassification.from_pretrained(
        model_path, num_labels=len(self.classes))

    # RoBERTa position embeddings reserve two slots for the padding offset, hence the -2.
    self.max_length = self.model.roberta.embeddings.position_embeddings.num_embeddings - 2
def __init__(self, config):
    model_name = config.get("model_name", None)
    model_path = config.get("model_path", None)
    device = config.get("device", 0)  # default on gpu 0

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
    # the default entailment id is 2 (contradiction is 0, neutral is 1)
    self.contradiction_id = 0
    self.entailment_id = 2
    self.model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
    self.model.eval()
    self.model.half()

    self.device = torch.device(
        "cpu" if device < 0 else "cuda:{}".format(device))
    if self.device.type == "cuda":
        self.model = self.model.to(self.device)
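With an NLI head like this, entailment scoring is typically done by encoding a premise/hypothesis pair and reading off the entailment logit. A hedged sketch of such a method (the method name is an assumption; the label index follows the ids set above, and `.float()` guards against the fp16 weights from `half()`):

def entailment_prob(self, premise, hypothesis):
    """Return P(entailment) for a premise/hypothesis pair."""
    inputs = self.tokenizer(premise, hypothesis, truncation=True,
                            return_tensors="pt").to(self.device)
    with torch.no_grad():
        logits = self.model(**inputs).logits
    probs = torch.softmax(logits.float(), dim=-1)
    return probs[0, self.entailment_id].item()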
def main(args):
    """Runs inference on any file that follows the same format as the provided dataset tsv."""
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = os.path.join(args.model_dir, args.model)  # model dir.
    if TOK_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME)
    else:
        model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model, tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)
    model.to(device)

    # predict answer
    batch_size = args.batch_size
    logits, pred_answer = inference(model, test_dataset, device, batch_size)

    # make csv file with predicted answer
    # Please keep the directory and column format below.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)
    np.save(os.path.join(save_dir, 'logits.npy'), logits)
def main(args):
    """Runs inference on any file that follows the same format as the provided dataset tsv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    model_dir = f'./results/{args.id}/checkpoint-{args.checkpoint}'
    if args.model_type == 'bert':
        model = BertForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'electra':
        model = ElectraForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'roberta':
        model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.to(device)

    # load test dataset
    # root = "/opt/ml"
    # root = "/content/drive/MyDrive/Boostcamp/Stage2_KLUE"
    root = args.root
    test_dataset, test_label = load_test_dataset(root, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answer
    pred_answer = inference(model, test_dataset, device)
    # logits, predictions = inference(model, test_dataset, device)

    # make csv file with predicted answer
    # Please keep the directory and column format below.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    # output = pd.DataFrame(predictions, columns=['pred'])
    output.to_csv(f'./results/{args.id}/submission{args.id}.csv', index=False)
    # np.save(f'./results/{args.id}/logits{args.id}.npy', logits)
    print('File saved')
def main(train_json_path, val_json_path, model_name_or_dir, output_dir,
         logging_dir, logging_steps, batch_size, gradient_accumulation_steps,
         learning_rate, num_train_epochs, warmup_ratio):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name_or_dir)

    sh_sentiment_train_dataset, sh_sentiment_val_dataset = create_sh_sentiment_dataset(
        train_json_path, val_json_path, tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sh_sentiment_train_dataset,
        eval_dataset=sh_sentiment_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()
def main(input_dir_path, output_dir_path, model_name_or_dir):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        model_name_or_dir).to(DEVICE)
    model.eval()

    os.makedirs(output_dir_path, exist_ok=True)
    for file_name in os.listdir(input_dir_path):
        if file_name.endswith('.json'):
            count = 0
            input_file_path = os.path.join(input_dir_path, file_name)
            with open(input_file_path) as json_file:
                data = json.load(json_file)
            for session in data['sessions']:
                for speech in session['speeches']:
                    content = []
                    for text in speech['content']:
                        inputs = tokenizer.encode(text,
                                                  padding=False,
                                                  truncation=True,
                                                  return_tensors='pt').to(DEVICE)
                        with torch.no_grad():
                            outputs = model(inputs).logits
                        # softmax probability of the positive class (index 1)
                        predictions = torch.softmax(outputs, dim=-1)[0, 1].item()
                        content.append({
                            'text': text,
                            'sentiment': round(predictions, 6)
                        })
                        count += 1
                    speech['content'] = content
            output_file_path = os.path.join(output_dir_path, file_name)
            with open(output_file_path, 'w') as json_file:
                json.dump(data, json_file)
            print("File: {}, Count: {}".format(file_name, count))
def load_model(pretrained_name, model_loc=None, load_tuned=True, num_labels=2):
    assert pretrained_name is not None
    if load_tuned:
        # load previously tuned model from disk
        if model_loc is None:
            model_dump_loc, model_state_dic_loc = generate_disk_location()
        else:
            model_dump_loc = model_loc
        model = torch.load(model_dump_loc)
        logger.info("loading model from {}".format(model_dump_loc))
    else:
        # load pretrained model from hugging face
        model_name = config["model_name"]
        if model_name == "bert":
            model = BertForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "roberta":
            model = RobertaForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "distillbert":
            model = DistilBertForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "xlmroberta":
            model = XLMRobertaForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        elif model_name == "xlnet":
            model = XLNetForSequenceClassification.from_pretrained(
                pretrained_name, num_labels=num_labels)
        else:
            logger.error("unsupported model: {}".format(model_name))
        logger.info("loading pretrained model")

    tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
    logger.info("model config: {}".format(model.config))
    return model, tokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
import torch
from textattack.models.wrappers.huggingface_model_wrapper import HuggingFaceModelWrapper

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
raw_model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base', num_labels=10)
raw_model.load_state_dict(
    torch.load('xlm_roberta_en/state_dict.pt', map_location='cuda:0'))
model = HuggingFaceModelWrapper(raw_model, tokenizer)
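The wrapper exists so TextAttack recipes can query the classifier. A short hedged example of running an attack with it (the recipe choice and the one-item dataset are illustrative, not from the original):

from textattack import Attacker, AttackArgs
from textattack.attack_recipes import TextFoolerJin2019
from textattack.datasets import Dataset

# Build a standard attack recipe around the wrapped classifier.
attack = TextFoolerJin2019.build(model)
dataset = Dataset([("an example sentence to perturb", 0)])  # (text, label) pairs
attacker = Attacker(attack, dataset, AttackArgs(num_examples=1))
attacker.attack_dataset()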
def train(args):
    # load model and tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # split dataset
    dataset = pd.read_csv('/opt/ml/input/data/train/train.tsv', delimiter='\t', header=None)
    train, dev = train_test_split(dataset, test_size=0.2, random_state=42)
    train.to_csv('/opt/ml/input/data/train/train_train.tsv', sep='\t', header=None, index=False)
    dev.to_csv('/opt/ml/input/data/train/train_dev.tsv', sep='\t', header=None, index=False)

    # load dataset
    train_dataset = load_data('/opt/ml/input/data/train/train_train.tsv', args.root)
    dev_dataset = load_data('/opt/ml/input/data/train/train_dev.tsv', args.root)
    train_label = train_dataset['label'].values
    dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameters
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.to(device)

    # Many more options are available besides the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(output_dir='./results/' + str(args.id),
                                      save_total_limit=3,
                                      save_steps=100,
                                      num_train_epochs=10,
                                      learning_rate=1e-5,
                                      per_device_train_batch_size=32,
                                      per_device_eval_batch_size=32,
                                      warmup_steps=300,
                                      weight_decay=0.01,
                                      logging_dir='./logs/' + str(args.id),
                                      logging_steps=100,
                                      evaluation_strategy='steps',
                                      eval_steps=100,
                                      dataloader_num_workers=4,
                                      label_smoothing_factor=0.5)
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=RE_train_dataset,
                      eval_dataset=RE_dev_dataset,
                      compute_metrics=compute_metrics)

    # train model
    trainer.train()
def model_init():
    """Returns an initialized model for use in a Hugging Face Trainer."""
    model = XLMRobertaForSequenceClassification.from_pretrained(
        "xlm-roberta-base")
    return model
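Passing `model_init` instead of `model` lets the Trainer re-instantiate fresh weights for each run or trial, which is what hyperparameter search needs. A hedged sketch of how it is typically wired up (`training_args` and the datasets are assumed to exist; the search backend requires optuna or ray[tune]):

trainer = Trainer(
    model_init=model_init,  # re-created at the start of every trial/run
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
best_run = trainer.hyperparameter_search(direction="maximize", n_trials=10)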
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

# Load XLMRobertaForSequenceClassification, the pretrained XLMRoberta model with a single
# linear classification layer on top.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",          # the 12-layer XLM-R base model
    num_labels=2,                # number of output labels: 2 for binary classification
                                 # (increase this for multi-class tasks)
    output_attentions=False,     # whether the model returns attention weights
    output_hidden_states=False,  # whether the model returns all hidden states
)

# Tell pytorch to run this model on the GPU.
model.cuda()

# Note: AdamW is a class from the huggingface library (as opposed to pytorch);
# the 'W' stands for 'weight decay fix'.
optimizer = AdamW(model.parameters(),
                  lr=1e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8)  # args.adam_epsilon - default is 1e-8
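This optimizer is usually paired with a linear warmup/decay schedule that is stepped once per batch. A minimal sketch (`epochs` is an assumed variable from the surrounding script):

from transformers import get_linear_schedule_with_warmup

total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)
# Inside the training loop, call scheduler.step() after optimizer.step().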
def load_model(args):
    if 'bert-base-multilingual' in args['model_checkpoint']:
        # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-mlm' in args['model_checkpoint']:
        # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:
        # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'fasttext' in args['model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None
        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]
        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
        config.hidden_size = 300
        config.num_attention_heads = 10
        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:
        # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        model_class = None
        if 'sequence_classification' == args['task']:
            model_class = AlbertForSequenceClassification if 'lite' in args[
                'model_checkpoint'] else BertForSequenceClassification
        elif 'token_classification' == args['task']:
            model_class = AlbertForWordClassification if 'lite' in args[
                'model_checkpoint'] else BertForWordClassification
        elif 'multi_label_classification' == args['task']:
            model_class = AlbertForMultiLabelClassification if 'lite' in args[
                'model_checkpoint'] else BertForMultiLabelClassification
        model = model_class.from_pretrained(args['model_checkpoint'], config=config)
    return model, tokenizer, vocab_path, config_path
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

batch_size = 32
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset,
                                   sampler=SequentialSampler(val_dataset),
                                   batch_size=batch_size)

# Load XLMRobertaForSequenceClassification, the pretrained XLM-R model with a single
# linear classification layer on top.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "./drive/MyDrive/XLM/XLMRoBERTa-B/",  # 'xlm-roberta-base'
    num_labels=2,  # binary classification
    output_attentions=False,
    output_hidden_states=False)

# Freezing layers except the classifying layer
# (note: XLM-R exposes the encoder as model.roberta, not model.bert)
# for param in model.roberta.parameters():
#     param.requires_grad = False

# Use GPU
model.cuda()

# if layers frozen:
# optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5, eps=1e-8)
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

epochs = 8
def model_init():
    model = XLMRobertaForSequenceClassification.from_pretrained(args.model_dir)
    return model
                         collate_fn=train_dataset.spam_collate_func,
                         shuffle=False)

import random

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-large",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model.cuda()

optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8,
                  weight_decay=0.01)
"""
optimizer = Adafactor(
    model.parameters(),
    lr=1e-5,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
"""
import itertools

for model, wbool in list(itertools.product(models, wbools)):
    loss_weighted = wbool
    # note: the loop variable `model` (a name string) is reassigned below to
    # hold the instantiated model object
    if model == 'MURIL':
        # Using the Huggingface MURIL version
        from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
        tokenizer = AutoTokenizer.from_pretrained("simran-kh/muril-cased-temp")
        model = AutoModelForSequenceClassification.from_pretrained(
            "simran-kh/muril-cased-temp", num_labels=6)
        model_name = 'MURIL_cased_temp_tamil'
    if model == 'XLMR':
        # Using the XLM-RoBERTa-Large pretrained model
        from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
        model = XLMRobertaForSequenceClassification.from_pretrained(
            'xlm-roberta-large', num_labels=6)
        model_name = 'XLMroberta_large_tamil'
    if model == 'XLMR_base':
        # Using the XLM-RoBERTa-Base pretrained model
        from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
        model = XLMRobertaForSequenceClassification.from_pretrained(
            'xlm-roberta-base', num_labels=6)
        model_name = 'XLMroberta_base_tamil'
    if model == 'mbertlarge':
        # Using DistilBERT, distilbert-base-multilingual-cased pretrained
        from transformers import BertTokenizer, BertForSequenceClassification
        tokenizer = BertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
        model = BertForSequenceClassification.from_pretrained(
            'distilbert-base-multilingual-cased', num_labels=6)
        model_name = 'Distilbert_m_base_cased_tamil'
    if model == 'XLMR_custom':
        # Using XLMRoberta finetuning a custom pretrained model; vocab unchanged => base tokenizer
        ...
def train():
    seed_everything(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # setting model hyperparameters
    # The config itself carries no trained weights, so from_pretrained is used to fetch them.
    # bert_config = BertConfig.from_pretrained(MODEL_NAME)
    # bert_config.num_labels = 42
    # model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    model_config = XLMRobertaConfig.from_pretrained(args.model_name)
    model_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        args.model_name, config=model_config)

    # load model and tokenizer
    # MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    # roberta: https://huggingface.co/transformers/model_doc/xlmroberta.html
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # label = dataset['label'].values
    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=0.2,
                                                  random_state=args.seed)
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    tokenized_val = tokenized_dataset(val_dataset, tokenizer)
    tokenized_train_label = train_dataset['label'].values
    tokenized_val_label = val_dataset['label'].values

    # train_datasets = TokenDataset(train_dataset, tokenizer)
    # val_datasets = TokenDataset(val_dataset, tokenizer)
    RE_train_dataset = RE_Dataset(tokenized_train, tokenized_train_label)
    RE_val_dataset = RE_Dataset(tokenized_val, tokenized_val_label)

    model.to(device)
    model = torch.nn.DataParallel(model)

    train_loader = DataLoader(
        RE_train_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        pin_memory=torch.cuda.is_available(),
        shuffle=True,
    )
    val_loader = DataLoader(
        RE_val_dataset,
        batch_size=args.batch_size,
        # num_workers=8,
        shuffle=False,
        pin_memory=torch.cuda.is_available(),
    )

    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    loss_fn = LabelSmoothingLoss(smoothing=0.5)
    # loss_fn = nn.CrossEntropyLoss()

    # t_total = len(train_loader) * args.epoch
    t_total = args.epoch
    warmup_step = int(t_total * args.warmup_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    log_dir = ""
    log_list = glob("./logs/*")
    if len(log_list) == 0:
        log_dir = "./logs/exp1"
    else:
        log_list = [int(log[-1]) for log in log_list]
        log_dir = "./logs/exp" + str(max(log_list) + 1)
    logger = SummaryWriter(log_dir=log_dir)

    scaler = GradScaler()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    best_acc = 0.0  # track best validation accuracy across epochs
    for epoch in tqdm(range(args.epoch)):
        train_acc = 0.0
        train_loss = 0.0
        val_acc = 0.0
        val_loss = 0.0
        model.train()
        for batch_id, batch in enumerate(tqdm(train_loader)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            optimizer.zero_grad()
            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = loss_fn(outputs.logits, labels)
            # loss.backward()
            # optimizer.step()
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()

            train_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
            train_loss += loss
            if (batch_id + 1) % args.logging_steps == 0:
                train_loss = train_loss.data.cpu().numpy()
                print(
                    f"[Train] epoch {epoch + 1} | batch_id {batch_id + 1} | loss {train_loss / args.logging_steps:.4f} | train_acc {train_acc / args.logging_steps:.4f}"
                )
                logger.add_scalar("Train/loss", train_loss / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                logger.add_scalar("Train/acc", train_acc / args.logging_steps,
                                  epoch * len(train_loader) + batch_id)
                train_acc = 0.0
                train_loss = 0.0
        # scheduler.step()

        print("\nStart Validation Step!")
        with torch.no_grad():
            model.eval()
            for batch_id, batch in enumerate(tqdm(val_loader)):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = loss_fn(outputs.logits, labels)
                val_acc += compute_acc(outputs.logits.cpu(), labels.cpu())
                val_loss += loss
            print(f"[Val] epoch {epoch + 1} | val_acc {val_acc / (batch_id + 1):.4f}")
            logger.add_scalar("Val/loss", val_loss / (batch_id + 1), epoch)
            logger.add_scalar("Val/acc", val_acc / (batch_id + 1), epoch)
            if val_acc >= best_acc:
                best_acc = val_acc
                # torch.save(model.state_dict(), os.path.join(args.output_dir, "saved_" + str(epoch) + ".pth"))
                torch.save(model.state_dict(), os.path.join(args.output_dir, "best.pth"))
                print("Saved best acc model...")
        scheduler.step()

    torch.save(model.state_dict(), os.path.join(args.output_dir, "last.pth"))
def train():
    # load model and tokenizer
    # MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = 'xlm-roberta-large'
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." + tokenizer.sep_token +
                                       "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # train_dataset, dev_dataset = load_fold(6)
    # train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    # tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    # RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)
    train_dataset, dev_dataset = torch.utils.data.random_split(
        RE_train_dataset, [8000, 1001])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameters
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.to(device)

    # Many more options are available besides the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        save_total_limit=3,              # number of total save model.
        save_steps=300,                  # model saving step.
        load_best_model_at_end=True,
        num_train_epochs=10,             # total number of training epochs
        learning_rate=1e-5,              # learning_rate
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        warmup_steps=300,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=100,               # log saving step.
        evaluation_strategy='steps',     # evaluation strategy to adopt during training
                                         # `no`: No evaluation during training.
                                         # `steps`: Evaluate every `eval_steps`.
                                         # `epoch`: Evaluate every end of epoch.
        eval_steps=300,                  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,                     # the instantiated 🤗 Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataset,     # training dataset
        eval_dataset=dev_dataset,        # evaluation dataset
        compute_metrics=compute_metrics, # define metrics function
    )

    # train model
    trainer.train()
model_checkpoint = "xlm-roberta-base" metric_name = "matthews_correlation" batch_size = 16 dataset = load_dataset("glue", task) tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") def preprocess_function(examples): return tokenizer(examples["sentence"], truncation=True) encoded_dataset = dataset.map(preprocess_function, batched=True) model = XLMRobertaForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2) args = TrainingArguments( "cola_checkpoints", evaluation_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, num_train_epochs=5, weight_decay=0.01, load_best_model_at_end=True, ) def compute_metrics(eval_pred): labels = eval_pred.label_ids
def train():
    # load model and tokenizer
    MODEL_NAME = "xlm-roberta-large"
    tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(
        train_dataset, tokenizer)  # keys: input_ids, token_type_ids, attention_mask
    # tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    # RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'RUNNING ON {device}')

    # setting model hyperparameters
    bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = XLMRobertaForSequenceClassification.from_pretrained(
        MODEL_NAME, config=bert_config)
    model.to(device)

    # Many more options are available besides the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        save_total_limit=3,              # number of total save model.
        save_steps=500,                  # model saving step.
        num_train_epochs=10,             # total number of training epochs
        learning_rate=1e-5,              # learning_rate
        per_device_train_batch_size=32,  # batch size per device during training
        per_device_eval_batch_size=32,   # batch size for evaluation
        warmup_steps=300,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=100,               # log saving step.
        # evaluation_strategy='steps',   # evaluation strategy to adopt during training
                                         # `no`: No evaluation during training.
                                         # `steps`: Evaluate every `eval_steps`.
                                         # `epoch`: Evaluate every end of epoch.
        # eval_steps=500,                # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=0.5)
    trainer = Trainer(
        model=model,                     # the instantiated 🤗 Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        # eval_dataset=RE_dev_dataset,   # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
        # differentiates padding from non-padding
        attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

batch_size = 1
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data,
                                   sampler=prediction_sampler,
                                   batch_size=batch_size)

model = XLMRobertaForSequenceClassification.from_pretrained(
    './models/XLM/XLMRoBERTa-Multi')
model.to(device)
model.eval()

# Tracking variables
predictions, true_labels = [], []
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Get predictions (the original cut off mid-call; completed here to
        # match the identical loop in run_xlm below)
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
def train(model_dir, args):
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("This notebook uses [%s]." % (device))

    s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + \
        '-epoch' + str(args.epochs) + '-' + args.scheduler + '-' + \
        args.tokenize + '-' + str(args.max_len) + '-' + str(args.seed)
    save_dir = increment_path(os.path.join(model_dir, s_dir))
    log_dir = increment_path(os.path.join('logs', s_dir))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # set neptune
    set_neptune(save_dir, args)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # setting model hyperparameters
    if MODEL_NAME.startswith('xlm'):
        bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    else:
        bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    if MODEL_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    else:
        model = BertForSequenceClassification.from_pretrained(
            MODEL_NAME, config=bert_config)
    if args.drop >= 0:
        model.dropout = nn.Dropout(p=args.drop)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # make dataset for pytorch.
    # train, val split
    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=args.val_ratio,
                                                  random_state=args.seed)
    tok_module = getattr(import_module("load_data"), args.tokenize)
    train_tokenized = tok_module(train_dataset, tokenizer, max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    RE_train_dataset = RE_Dataset(train_tokenized,
                                  train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    model.to(device)

    # Many more options are available besides the ones used here; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        seed=args.seed,
        output_dir=save_dir,             # output directory
        save_total_limit=2,              # number of total save model.
        save_steps=args.save_steps,      # model saving step.
        num_train_epochs=args.epochs,    # total number of training epochs
        learning_rate=args.lr,           # learning_rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=16,   # batch size for evaluation
        lr_scheduler_type=args.scheduler,
        warmup_steps=args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=log_dir,             # directory for storing logs
        logging_steps=100,               # log saving step.
        evaluation_strategy='steps',     # evaluation strategy to adopt during training
                                         # `no`: No evaluation during training.
                                         # `steps`: Evaluate every `eval_steps`.
                                         # `epoch`: Evaluate every end of epoch.
        eval_steps=100,                  # evaluation step.
        dataloader_num_workers=4,
        label_smoothing_factor=args.smoothing_factor,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy')
    trainer = Trainer(
        model=model,                     # the instantiated 🤗 Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_val_dataset,     # evaluation dataset
        compute_metrics=compute_metrics  # define metrics function
    )

    # train model
    trainer.train()
def load_model(args): if 'albert-large-wwmlm-512' == args['model_checkpoint']: vocab_path = "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_vocab_uncased_30000.txt" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file( "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_albert_large_config.json" ) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-large-wwmlm-512/albert_large_model_bpe_wwmlm_512_pytorch_albert_large_512_629k.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-base-wwmlm-512' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_vocab_uncased_30000.txt" config_path = "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_albert_base_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-wwmlm-512/albert_base_model_bpe_wwmlm_512_pytorch_model_albert_base_162k.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-large-wwmlm-128' == args['model_checkpoint']: vocab_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_vocab_uncased_30000.txt" config_path = "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_albert_large_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-large-wwmlm-128/albert_large_model_bpe_wwmlm_128_pytorch_albert_large_128_500k.bin", from_tf=False, config=config) model.albert = albert_model elif 'babert-bpe-mlm-large-512' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_vocab_uncased_30522.txt" config_path = 
"../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_bert_large_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-large-512/babert_model_bpe_mlm_uncased_large_512_dup10-5_pytorch_babert_uncased_large_512_dup10-5_1120k.bin", config=config) model.bert = bert_model.bert elif 'albert-base-uncased-112500' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-uncased-112500/vocab.txt" config_path = "../embeddings/albert-base-uncased-112500/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-uncased-112500/albert_base_uncased_112500.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-base-uncased-96000' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-uncased-96000/vocab.txt" config_path = "../embeddings/albert-base-uncased-96000/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-uncased-96000/albert_base_uncased_96000.bin", from_tf=False, config=config) model.albert = albert_model elif 'albert-base-uncased-191k' == args['model_checkpoint']: vocab_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_vocab_uncased_30000.txt" config_path = "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_albert_base_config.json" tokenizer = BertTokenizer(vocab_path) config = AlbertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = AlbertForSequenceClassification(config) elif 
'token_classification' == args['task']: model = AlbertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = AlbertForMultiLabelClassification(config) # Plug pretrained bert model albert_model = AlbertModel.from_pretrained( "../embeddings/albert-base-uncased-191k/pytorch_models_albert_base_uncased_191500_pytorch_model_albert_base_191k.bin", from_tf=False, config=config) model.albert = albert_model elif 'babert-opensubtitle' == args['model_checkpoint']: # babert-opensubtitle # Prepare config & tokenizer vocab_path = "../embeddings/babert-opensubtitle/vocab.txt" config_path = "../embeddings/babert-opensubtitle/bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-opensubtitle/model.ckpt-1000000.index", from_tf=True, config=config) model.bert = bert_model.bert elif 'babert-bpe-mlm-large-uncased-1100k' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_vocab_uncased_30522.txt" config_path = "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = BertForPreTraining.from_pretrained( "../embeddings/babert-bpe-mlm-large-uncased-1100k/pytorch_models_babert_uncased_large_1100k_pytorch_model_babert_large_1100k.bin", config=config) model.bert = bert_model.bert elif 'babert-bpe-mlm-large-uncased-1m' == args['model_checkpoint']: # babert_bpe # Prepare config & tokenizer vocab_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_vocab_uncased_30522.txt" config_path = "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_bert_config.json" tokenizer = BertTokenizer(vocab_path) config = BertConfig.from_json_file(config_path) if type(args['num_labels']) == list: config.num_labels = max(args['num_labels']) config.num_labels_list = args['num_labels'] else: config.num_labels = args['num_labels'] # Instantiate model if 'sequence_classification' == args['task']: model = BertForSequenceClassification(config) elif 'token_classification' == args['task']: model = BertForWordClassification(config) elif 'multi_label_classification' == args['task']: model = BertForMultiLabelClassification(config) # Plug pretrained bert model bert_model = 
BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased-1m/pytorch_models_babert_uncased_large_1mil_pytorch_model_babert_large_1mil.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-base-512' == args['model_checkpoint']:  # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-base-512/pytorch_models_babert_base_512_bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-base-512/pytorch_models_babert_base_512_pytorch_model_babert_base_uncased_512.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-bpe-mlm-large-uncased' == args['model_checkpoint']:  # babert_bpe
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_vocab_uncased_30522.txt"
        config_path = "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-large-uncased/pytorch_models_babert_uncased_large_pytorch_model_babert_large_778500.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'babert-bpe-mlm-uncased-128-dup10-5' == args['model_checkpoint']:  # babert_bpe_wwmlm
        # Prepare config & tokenizer
        vocab_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/vocab.txt"
        config_path = "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/bert_config.json"
        tokenizer = BertTokenizer(vocab_path)
        config = BertConfig.from_json_file(config_path)
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)

        # Plug pretrained bert model
        bert_model = BertForPreTraining.from_pretrained(
            "../embeddings/babert-bpe-mlm-uncased-128-dup10-5/pytorch_model.bin",
            config=config)
        model.bert = bert_model.bert
    elif 'bert-base-multilingual' in args['model_checkpoint']:  # bert-base-multilingual-uncased or bert-base-multilingual-cased
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-mlm' in args['model_checkpoint']:  # xlm-mlm-100-1280
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMTokenizer.from_pretrained(args['model_checkpoint'])
        config = XLMConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'xlm-roberta' in args['model_checkpoint']:  # xlm-roberta-base or xlm-roberta-large
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = XLMRobertaTokenizer.from_pretrained(
            args['model_checkpoint'])
        config = XLMRobertaConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = XLMRobertaForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = XLMRobertaForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = XLMRobertaForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    elif 'word2vec' in args['model_checkpoint'] or 'fasttext' in args['model_checkpoint']:
        # Prepare config & tokenizer
        vocab_path = args['vocab_path']
        config_path = None
        word_tokenizer = args['word_tokenizer_class']()
        emb_path = args['embedding_path'][args['model_checkpoint']]
        _, vocab_map = load_vocab(vocab_path)
        tokenizer = SimpleTokenizer(vocab_map, word_tokenizer, lower=args["lower"])
        vocab_list = list(tokenizer.vocab.keys())

        config = BertConfig.from_pretrained('bert-base-uncased')
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]

        if args['model_checkpoint'] == 'word2vec-twitter':
            embeddings = gen_embeddings(vocab_list, emb_path)
            config.hidden_size = 400
            config.num_attention_heads = 8
        if args['model_checkpoint'] in ('fasttext-cc-id',
                                        'fasttext-cc-id-300-no-oov-uncased',
                                        'fasttext-4B-id-300-no-oov-uncased'):
            embeddings = gen_embeddings(vocab_list, emb_path, emb_dim=300)
            config.hidden_size = 300
            config.num_attention_heads = 10
        config.vocab_size = len(embeddings)

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config)
            model.bert.embeddings.word_embeddings.weight.data.copy_(
                torch.FloatTensor(embeddings))
    elif 'scratch' in args['model_checkpoint']:
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        config = BertConfig.from_pretrained("bert-base-uncased")
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']
        config.num_hidden_layers = args["num_layers"]
        config.hidden_size = 300
        config.num_attention_heads = 10

        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification(config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification(config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification(config=config)
    elif 'indobenchmark' in args['model_checkpoint']:  # indobenchmark models
        # Prepare config & tokenizer
        vocab_path, config_path = None, None
        tokenizer = BertTokenizer.from_pretrained(args['model_checkpoint'])
        config = BertConfig.from_pretrained(args['model_checkpoint'])
        if type(args['num_labels']) == list:
            config.num_labels = max(args['num_labels'])
            config.num_labels_list = args['num_labels']
        else:
            config.num_labels = args['num_labels']

        # Instantiate model
        if 'sequence_classification' == args['task']:
            model = BertForSequenceClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'token_classification' == args['task']:
            model = BertForWordClassification.from_pretrained(
                args['model_checkpoint'], config=config)
        elif 'multi_label_classification' == args['task']:
            model = BertForMultiLabelClassification.from_pretrained(
                args['model_checkpoint'], config=config)
    return model, tokenizer, vocab_path, config_path
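
# The word2vec/fasttext branch above swaps BERT's learned wordpiece embeddings
# for static vectors: it resizes the config to the static vocabulary,
# instantiates an untrained BERT, and copies the matrix into the embedding
# layer. Below is a minimal, self-contained sketch of that technique; the
# helper name plug_static_embeddings and all sizes are illustrative
# assumptions, and random vectors stand in for the real gen_embeddings()
# output.
import numpy as np
import torch
from transformers import BertConfig, BertForSequenceClassification

def plug_static_embeddings(embeddings, num_labels=2, num_layers=2):
    """Build a small BERT classifier whose word-embedding layer is
    initialized from a prebuilt (vocab_size x emb_dim) matrix."""
    config = BertConfig.from_pretrained('bert-base-uncased')
    config.num_labels = num_labels
    config.num_hidden_layers = num_layers
    config.hidden_size = embeddings.shape[1]  # e.g. 300 for fastText vectors
    config.num_attention_heads = 10           # must divide hidden_size evenly
    config.vocab_size = embeddings.shape[0]
    # Randomly initialized model; only the word embeddings get preset values.
    model = BertForSequenceClassification(config)
    model.bert.embeddings.word_embeddings.weight.data.copy_(
        torch.FloatTensor(embeddings))
    return model

# e.g. a toy 1000-word vocabulary with 300-dimensional vectors:
# model = plug_static_embeddings(np.random.randn(1000, 300).astype('float32'))
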
def run_xlm(dataset, model, header):
    test_set = pd.read_csv(dataset, encoding='utf-8')
    X = test_set['text']
    y = test_set[header]
    input_ids = []
    attention_masks = []
    tokenizer = XLMRobertaTokenizer.from_pretrained(model, do_lower_case=True)
    device = torch.device("cuda")
    labels = []
    for i in range(len(X)):
        if not pd.isnull(X[i]):
            encoded_dict = tokenizer.encode_plus(
                X[i],                      # text to encode
                add_special_tokens=True,   # add [CLS] and [SEP] tokens
                max_length=64,
                pad_to_max_length=True,    # pad missing tokens with 0s
                return_attention_mask=True,
                return_tensors='pt',       # return PyTorch tensors
            )
            labels.append(y[i])
            input_ids.append(encoded_dict['input_ids'])
            # differentiates padding from non-padding
            attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    batch_size = 1
    prediction_data = TensorDataset(input_ids, attention_masks, labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=prediction_sampler,
                                       batch_size=batch_size)

    model = XLMRobertaForSequenceClassification.from_pretrained(model)
    model.to(device)
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Get predictions
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Retrieve data from GPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Save predictions
        predictions.append(logits)
        true_labels.append(label_ids)

    # Combine the results across all batches.
    flat_predictions = np.concatenate(predictions, axis=0)
    # For each sample, pick the label (0 or 1) with the higher score.
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    # Combine the correct labels for each batch into a single list.
    flat_true_labels = np.concatenate(true_labels, axis=0)
    # Check if predictions are correct
    acc = np.sum(flat_predictions == flat_true_labels) / len(flat_predictions)
    # print(flat_predictions)
    # print(flat_true_labels)
    # print(acc)

    print("Accuracy: ", accuracy_score(flat_true_labels, flat_predictions))
    print("Precision: ",
          precision_score(flat_true_labels, flat_predictions,
                          average='weighted'))
    print("Recall: ",
          recall_score(flat_true_labels, flat_predictions,
                       average='weighted'))
    print("F1-score: ",
          f1_score(flat_true_labels, flat_predictions, average='weighted'))
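
# Example invocation of run_xlm. The file path, checkpoint, and label-column
# name below are hypothetical: the CSV is assumed to have a 'text' column plus
# the label column passed as `header`, and the second argument serves as both
# the tokenizer and classifier checkpoint (a hub name or a fine-tuned local
# directory). Note that batch_size is hard-coded to 1 inside run_xlm; because
# SequentialSampler preserves order and metrics are computed over the
# concatenated batches, a larger batch size would only improve throughput,
# not change the reported scores.
if __name__ == '__main__':
    run_xlm('data/test.csv', 'xlm-roberta-base', 'label')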