def define_model(name, config=None, location=None):
    # If `config` is given this is a fresh training run; otherwise load from the checkpoint at `location`.
    if name in [
        "bert-base-multilingual-cased",
        "sangrimlee/bert-base-multilingual-cased-korquad",
        "kykim/bert-kor-base",
        "monologg/kobert",
    ]:
        return BertForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else BertForSequenceClassification.from_pretrained(location)
    elif name in [
        "monologg/koelectra-base-v3-discriminator",
        "kykim/electra-kor-base",
    ]:
        return ElectraForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else ElectraForSequenceClassification.from_pretrained(location)
    elif name in ["xlm-roberta-large"]:
        return XLMRobertaForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else XLMRobertaForSequenceClassification.from_pretrained(location)
    elif name in ["kykim/funnel-kor-base"]:
        return FunnelForSequenceClassification.from_pretrained(
            name, config=config
        ) if config else FunnelForSequenceClassification.from_pretrained(location)
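# Usage sketch for define_model above (hedged: the config values and the checkpoint
# directory "./ckpt/best" are illustrative assumptions, not taken from the original project).
from transformers import ElectraConfig

config = ElectraConfig.from_pretrained("monologg/koelectra-base-v3-discriminator", num_labels=2)
model_for_training = define_model("monologg/koelectra-base-v3-discriminator", config=config)        # first training run
model_from_ckpt = define_model("monologg/koelectra-base-v3-discriminator", location="./ckpt/best")   # resume from a checkpoint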
def load_model(device):
    checkpoint = os.path.dirname(os.path.realpath(__file__)) + '/ckpt/koelectra-base-v3-ckpt1/s2_checkpoint-7300'
    model = ElectraForSequenceClassification.from_pretrained(checkpoint)
    model.to(device)
    print('model loaded')
    return model
def get_text_reader(reader_name, task, num_labels):
    # The AILAW corpus is a Korean dataset, so the text reader is restricted to
    # Korean-capable models such as multilingual BERT, KoBERT, and KoELECTRA.
    if reader_name == "bert":
        model_name = "bert-base-multilingual-cased"
        if task == "classification":
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else:  # ner
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    elif reader_name == "kobert":
        model_name = "monologg/kobert"
        if task == "classification":
            text_reader = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else:  # ner
            text_reader = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    elif reader_name == "koelectra":
        model_name = "monologg/koelectra-base-discriminator"
        if task == "classification":
            text_reader = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        else:  # ner
            text_reader = ElectraForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        raise KeyError(reader_name)

    return text_reader
def _init_deep_model(self, model_type, model_path, num_labels, num_regs=None):
    if 'roberta' in model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_path)
        config = RobertaConfig.from_pretrained(model_path)
        config.num_labels = num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_path, config=config)
        model.eval()
        model.to(self.device)
    elif 'electra_multitask' in model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_path)
        tokenizer.add_special_tokens({'additional_special_tokens': ['[VALUES]']})
        config = ElectraConfig.from_pretrained(model_path)
        config.num_labels = num_labels
        config.num_regs = num_regs
        config.vocab_size = len(tokenizer)
        model = ElectraForSequenceClassificationMultiTask.from_pretrained(model_path, config=config)
        model.eval()
        model.to(self.device)
    elif 'electra' in model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_path)
        config = ElectraConfig.from_pretrained(model_path)
        config.num_labels = num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_path, config=config)
        model.eval()
        model.to(self.device)
    else:
        raise NotImplementedError()

    return config, tokenizer, model
def main(args):
    """Runs inference on any file that has the same format as the given dataset tsv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = "monologg/koelectra-base-v3-discriminator"
    # tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)
    tokenizer = ElectraTokenizer.from_pretrained(TOK_NAME)

    # load my model
    MODEL_NAME = args.model_dir  # model dir.
    model = ElectraForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answers
    logits, predictions = inference(model, test_dataset, device)

    # make a csv file with the predicted answers
    # Keep the output directory and the column layout below unchanged.
    output = pd.DataFrame(predictions, columns=['pred'])
    output.to_csv('./prediction/koelectra-submission6.csv', index=False)
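# Minimal sketch of what the inference() helper used in main() above typically does
# (hedged: the project's real helper is defined elsewhere; the batch size and return types are assumptions).
from torch.utils.data import DataLoader

def inference_sketch(model, dataset, device, batch_size=32):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    all_logits, all_preds = [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**batch)[0]
            all_logits.append(logits.cpu())
            all_preds.append(logits.argmax(dim=-1).cpu())
    return torch.cat(all_logits).numpy(), torch.cat(all_preds).numpy()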
def predict_pair(model_args, data_args, training_args):
    # Set seed
    set_seed(training_args.seed)

    if 'roberta' in model_args.model_type:
        tokenizer = RobertaTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = RobertaConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = RobertaForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    elif 'electra' in model_args.model_type:
        tokenizer = ElectraTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = ElectraConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = ElectraForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)
    else:  # default -> bert
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
        config = BertConfig.from_pretrained(model_args.model_name_or_path)
        config.num_labels = data_args.num_labels
        model = BertForSequenceClassification.from_pretrained(model_args.model_name_or_path, config=config)

    model.to(training_args.device)

    test_df = pickle.load(open(data_args.test_data_file, 'rb'))
    test_dataset = get_dataset(data_args, tokenizer, test_df, model_args.model_type)
    data_collator = MyDataCollator()

    if training_args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
        model = torch.nn.DataParallel(model)
    else:
        n_gpu = torch.cuda.device_count()
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        sampler = SequentialSampler(test_dataset)
    print(len(test_dataset))

    dataloader = DataLoader(
        test_dataset,
        sampler=sampler,
        batch_size=training_args.eval_batch_size,
        collate_fn=data_collator,
    )

    model.eval()
    all_probs = []
    for inputs in tqdm(dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(training_args.device)
        inputs.pop('labels')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[0]
            probs = torch.softmax(logits, dim=-1)
            maxp, maxi = torch.max(probs, dim=-1)
            result = [(_i, _p) for _p, _i in zip(maxp, maxi)]
            all_probs.extend(result)

    with open('./{}_{}.answer_classify.result'.format(data_args.data_type, model_args.model_type), 'w', encoding='utf-8') as fout:
        for i in range(len(test_df)):
            fout.write('{} | {} | {} | {} | {}\n'.format(
                test_df[i][0], test_df[i][1], test_df[i][2], all_probs[i][0], all_probs[i][1]))
def main(task='mrpc',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         mode='train',
         pred_distill=True):
    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(train_cfg_dict)
    # train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task)  # task dataset class according to the task
    num_labels = len(TaskDataset.labels)
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
    ]
    data_set = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(data_set, batch_size=train_cfg.batch_size, shuffle=True)

    t_discriminator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator'
    )
    s_discriminator = QuantizedElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg
    )
    model = DistillElectraForSequenceClassification(t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)

    if mode == 'train':
        trainer.train(model_file, None, data_parallel)
    elif mode == 'eval':
        input_ids, attention_mask, token_type_ids, label_ids = TokenIndexing(
            tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
        _, eval_labels = get_tensor_data(output_mode, input_ids, attention_mask, token_type_ids, label_ids)
        results = trainer.eval(model_file, output_mode, eval_labels, num_labels, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
def index():
    if request.values.get("txt"):
        from transformers import AutoTokenizer, AutoModel, ElectraForSequenceClassification
        tokenizer = AutoTokenizer.from_pretrained("/srv/electra-ka-fake-news-tagging/")
        model = ElectraForSequenceClassification.from_pretrained("/srv/electra-ka-fake-news-tagging/")
        inputs = tokenizer(request.values.get("txt"), return_tensors="pt")
        return str(model(**inputs)[0].tolist())
    return 'no text was sent'
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import ElectraTokenizer, ElectraForSequenceClassification

    # download the model or load the model path
    model_path = download_model('electra.offensive', cache_dir,
                                process_func=_unzip_process_func, verbose=verbose)

    self.classes = ['NOT', 'OFF']

    self.tokenizer = ElectraTokenizer.from_pretrained(model_path)
    self.model = ElectraForSequenceClassification.from_pretrained(
        model_path, num_labels=len(self.classes))

    self.max_length = self.model.electra.embeddings.position_embeddings.num_embeddings
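# Hedged sketch of how the classifier above could score a sentence once initialised;
# `predict_offensive` is illustrative and not necessarily the library's actual method name.
import torch

def predict_offensive(clf, sentence):
    inputs = clf.tokenizer(sentence, truncation=True, max_length=clf.max_length, return_tensors="pt")
    with torch.no_grad():
        logits = clf.model(**inputs)[0]
    return clf.classes[int(logits.argmax(dim=-1))]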
def __init__(self, batch_size, output_size, hidden_size):
    super(KEA_ELECTRA, self).__init__()

    options_name = "google/electra-base-discriminator"
    self.encoder = ElectraForSequenceClassification.from_pretrained(options_name, num_labels=output_size)

    self.batch_size = batch_size
    self.output_size = output_size
    self.hidden_size = hidden_size

    self.a = nn.Linear(512, hidden_size)  # 512 is the size of lexicon_vec
    self.v = nn.Linear(512, hidden_size)  # 512 is the size of lexicon_vec
    self.d = nn.Linear(512, hidden_size)  # 512 is the size of lexicon_vec

    self.dropout = nn.Dropout(0.1)
    self.fc1 = nn.Linear(hidden_size, 384)
    self.label = nn.Linear(384, output_size)
def __init__(self, batch_size, output_size, hidden_size):
    super(KEA_Electra_Word_level, self).__init__()

    options_name = "google/electra-base-discriminator"
    self.encoder = ElectraForSequenceClassification.from_pretrained(options_name, num_labels=output_size)

    self.batch_size = batch_size
    self.output_size = output_size
    self.hidden_size = hidden_size

    self.bilstm = nn.LSTM(hidden_size + 3, int(hidden_size / 2), dropout=0.2, bidirectional=True)
    self.dropout = nn.Dropout(0.1)
    self.fc1 = nn.Linear(hidden_size, 384)
    self.label = nn.Linear(384, output_size)
def __init__(self, input_ids_batch, attention_mask, hidden_size=768, num_classes=6, dr_rate=None, params=None):
    super(ElectraClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    self.electramodel = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator")
    self.attention_mask = attention_mask
    self.classifier = nn.Linear(hidden_size, num_classes)
    if dr_rate:
        self.dropout = nn.Dropout(p=dr_rate)
def __call_model_torch(self):
    if self.model_to_use.lower() == 'bert':
        self.config = BertConfig(num_labels=2)
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=self.config)
    elif self.model_to_use.lower() == 'albert':
        self.config = AlbertConfig(num_labels=2)
        self.model = AlbertForSequenceClassification.from_pretrained('albert-base-v1', config=self.config)
    elif self.model_to_use.lower() == 'electra':
        self.config = ElectraConfig(num_labels=2)
        self.model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', config=self.config)
    elif self.model_to_use.lower() == 'distilbert':
        self.config = DistilBertConfig(num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=self.config)
    else:
        print('Model not available yet.')
def __init__(self, config: ElectraConfig, embeddings, discriminator=None, embed_layer=None):
    super().__init__()
    self.embed_layer = nn.Embedding(num_embeddings=config.vocab_size,
                                    embedding_dim=config.embedding_size,
                                    padding_idx=config.vocab_size - 1)
    if embed_layer:
        self.embed_layer.load_state_dict(torch.load(embed_layer))
    else:
        self.embed_layer.weight = nn.Parameter(embeddings)
    if discriminator:
        self.discriminator = ElectraForSequenceClassification.from_pretrained(discriminator, config=config)
    else:
        self.discriminator = ElectraForSequenceClassification(config)
    self.softmax = nn.Softmax(1)
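# Hedged forward-pass sketch for the wrapper above, assuming the discriminator consumes the
# pre-computed embeddings via `inputs_embeds` (supported by ElectraForSequenceClassification).
def forward_sketch(module, input_ids, attention_mask=None):
    embeds = module.embed_layer(input_ids)  # (batch, seq_len, embedding_size)
    logits = module.discriminator(inputs_embeds=embeds, attention_mask=attention_mask)[0]
    return module.softmax(logits)           # class probabilities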
def __init__(
    self,
    model_dir,
    vocab_dir="skplanet/dialog-koelectra-small-discriminator",
    label_list=None,
    cuda=False,
):
    if cuda:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = "cpu"
    self.device = torch.device(device)

    self.model = ElectraForSequenceClassification.from_pretrained(model_dir)
    self.model.to(self.device)
    self.model.eval()

    self.tokenizer = ElectraTokenizer.from_pretrained(vocab_dir, do_lower_case=False)

    self.label_list = None
    if label_list:
        self.label_list = label_list
def call(self):
    if self.model_to_use.lower() == 'bert':
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
        print('BERT loaded.')
        print(self.model)
    elif self.model_to_use.lower() == 'albert':
        self.model = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v1',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
    elif self.model_to_use.lower() == 'electra':
        self.model = ElectraForSequenceClassification.from_pretrained(
            'google/electra-small-discriminator',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
    elif self.model_to_use.lower() == 'distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
    else:
        print('Model not available right now.')

    self.model.to(self.device)
    self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=self.epsilon)
    self.total_steps = len(self.train_dataloader) * self.epochs
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer, num_warmup_steps=0, num_training_steps=self.total_steps)
def main(args):
    """Runs inference on any file that has the same format as the given dataset tsv."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load tokenizer
    TOK_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    model_dir = f'./results/{args.id}/checkpoint-{args.checkpoint}'
    if args.model_type == 'bert':
        model = BertForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'electra':
        model = ElectraForSequenceClassification.from_pretrained(model_dir)
    elif args.model_type == 'roberta':
        model = XLMRobertaForSequenceClassification.from_pretrained(model_dir)
    model.to(device)

    # load test dataset
    # root = "/opt/ml"
    # root = "/content/drive/MyDrive/Boostcamp/Stage2_KLUE"
    root = args.root
    test_dataset, test_label = load_test_dataset(root, tokenizer)
    test_dataset = RE_Dataset(test_dataset, test_label)

    # predict answers
    pred_answer = inference(model, test_dataset, device)
    # logits, predictions = inference(model, test_dataset, device)

    # make a csv file with the predicted answers
    # Keep the output directory and the column layout below unchanged.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    # output = pd.DataFrame(predictions, columns=['pred'])
    output.to_csv(f'./results/{args.id}/submission{args.id}.csv', index=False)
    # np.save(f'./results/{args.id}/logits{args.id}.npy', logits)

    print('File saved')
def pretrained_tokenizer_and_model(self):
    print(f'Model Class : {self.model_type}')
    if self.model_type == 'bert':
        pretrained_model = 'bert-base-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.model = BertForSequenceClassification.from_pretrained(
            pretrained_model, num_labels=self.labels_count)
    elif self.model_type == 'roberta':
        pretrained_model = 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model)
        self.model = RobertaForSequenceClassification.from_pretrained(
            pretrained_model, num_labels=self.labels_count)
    elif self.model_type == 'distilbert':
        pretrained_model = 'distilbert-base-uncased'
        self.tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            pretrained_model, num_labels=self.labels_count)
    elif self.model_type == 'electra':
        pretrained_model = 'google/electra-small-discriminator'
        self.tokenizer = ElectraTokenizer.from_pretrained(pretrained_model)
        self.model = ElectraForSequenceClassification.from_pretrained(
            pretrained_model, num_labels=self.labels_count)

    if self.device.type == 'cuda':
        self.model.cuda()
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
import random

model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator')
tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=256,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json',
    data_files={'train': 'dataset_full_question/quanta_train.json'},
    field='questions')['train']
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
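# Hedged continuation sketch: how the tokenised dataset above would typically be handed to the
# HF Trainer (the hyperparameters and output_dir are assumptions, not the project's settings).
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

training_args = TrainingArguments(
    output_dir='./electra_difficulty',   # assumed output directory
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_steps=100,
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()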
def main(args):
    nsmc = h5py.File(f'{args.data_root}/nsmc.h5', 'r')
    train_dataset = nsmc['train']
    test_dataset = nsmc['test']

    print('\n====================== Dataset Summary ======================\n')
    print(f"Train Label : {train_dataset['label']}")
    print(f"Train Input Ids : {train_dataset['input_ids']}")
    print(f"Train Attention Mask : {train_dataset['attention_mask']}")
    print(f"Test Label : {test_dataset['label']}")
    print(f"Test Input Ids : {test_dataset['input_ids']}")
    print(f"Test Attention Mask : {test_dataset['attention_mask']}")
    print('\n=============================================================\n')

    train_label = np.array(train_dataset['label'])
    train_input_ids = np.array(train_dataset['input_ids'])
    train_attention_mask = np.array(train_dataset['attention_mask'])
    test_label = np.array(test_dataset['label'])
    test_input_ids = np.array(test_dataset['input_ids'])
    test_attention_mask = np.array(test_dataset['attention_mask'])
    nsmc.close()

    train_dataset = NSMCDataset(train_label, train_input_ids, train_attention_mask)
    test_dataset = NSMCDataset(test_label, test_input_ids, test_attention_mask)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.n_workers)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.n_workers)

    if torch.cuda.is_available() and args.cuda:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator")
    model = nn.parallel.DataParallel(model)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=args.lr)

    # Log loss and metrics to TensorBoard
    experiment_dir = 'logs/{}@{}'.format('NSMC', datetime.now().strftime("%d.%m.%Y-%H:%M:%S"))
    os.makedirs(f"{experiment_dir}/checkpoints", exist_ok=True)
    writer = SummaryWriter(os.path.join(experiment_dir, "tb"))

    metric_dict = defaultdict(list)
    metric_dict_epoch_train = defaultdict(list)
    metric_dict_epoch_test = defaultdict(list)

    ##########################################
    ################ Training ################
    ##########################################
    n_iters_total = 0
    for epoch in range(args.n_epochs):
        total_loss_train = 0.0
        correct = 0
        total = 0

        model.train()
        for idx, (label, input_ids, attention_masks) in tqdm(enumerate(train_loader), total=len(train_loader)):
            optimizer.zero_grad()

            label = label.to(device)
            input_ids = input_ids.to(device)
            attention_masks = attention_masks.to(device)

            output = model(input_ids, attention_masks)[0]  # (batch_size, 2)
            _, pred = torch.max(output, 1)  # (batch_size)

            loss = F.cross_entropy(output, label)
            loss.backward()
            optimizer.step()

            total_loss_train += loss.item()
            correct += (pred == label).sum()
            total += len(label)
            train_accuracy = correct.float() / total

            if n_iters_total % 300 == 0:
                print(f"Batch Loss : {loss} / Accuracy : {train_accuracy}")

            metric_dict['train_loss'].append(loss.item())
            metric_dict['train_accuracy'].append(train_accuracy.item())

            n_iters_total += 1
            for title, value in metric_dict.items():
                writer.add_scalar('train/{}'.format(title), value[-1], n_iters_total)

        train_accuracy = correct.float() / total
        metric_dict_epoch_train['train_total_loss_epoch'].append(total_loss_train)
        metric_dict_epoch_train['train_accuracy_epoch'].append(train_accuracy)
        for title, value in metric_dict_epoch_train.items():
            writer.add_scalar('train/{}'.format(title), value[-1], epoch)

        print(f"Epoch : {epoch} / Train Loss : {total_loss_train} / Accuracy : {train_accuracy}")

        ##########################################
        ################## Test ##################
        ##########################################
        test_correct = 0
        test_total = 0
        total_loss_test = 0.0

        model.eval()
        with torch.no_grad():
            for idx, (label, input_ids, attention_masks) in tqdm(enumerate(test_loader), total=len(test_loader)):
                label = label.to(device)
                input_ids = input_ids.to(device)
                attention_masks = attention_masks.to(device)

                output = model(input_ids, attention_masks)[0]
                _, pred = torch.max(output, 1)  # values, indices

                loss = F.cross_entropy(output, label)
                total_loss_test += loss.item()  # accumulate a float, not a graph-holding tensor
                test_correct += (pred == label).sum()
                test_total += len(label)

        test_accuracy = test_correct.float() / test_total
        metric_dict_epoch_test['test_total_loss_epoch'].append(total_loss_test)
        metric_dict_epoch_test['test_accuracy_epoch'].append(test_accuracy)
        for title, value in metric_dict_epoch_test.items():
            writer.add_scalar('test/{}'.format(title), value[-1], epoch)

        print(f"Test Accuracy : {test_accuracy}")

        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.module.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_accuracy': train_accuracy,
                'test_accuracy': test_accuracy,
            },
            os.path.join(experiment_dir, "checkpoints", str(epoch)))
def model_setting():
    model = ElectraForSequenceClassification.from_pretrained(MODEL_NAME).to(device)
    # model.load_state_dict(torch.load(pre_MODEL_NAME))
    model.load_state_dict(torch.load(pre_MODEL_NAME, map_location=device))
    return model
def main():
    nsmc = h5py.File('../data/nsmc.h5', 'r')
    train_dataset = nsmc['train']
    test_dataset = nsmc['test']

    print('\n====================== Dataset Summary ======================\n')
    print(f"Train Label : {train_dataset['label']}")
    print(f"Train Input Ids : {train_dataset['input_ids']}")
    print(f"Train Attention Mask : {train_dataset['attention_mask']}")
    print(f"Test Label : {test_dataset['label']}")
    print(f"Test Input Ids : {test_dataset['input_ids']}")
    print(f"Test Attention Mask : {test_dataset['attention_mask']}")
    print('\n=============================================================\n')

    train_label = np.array(train_dataset['label'])
    train_input_ids = np.array(train_dataset['input_ids'])
    train_attention_mask = np.array(train_dataset['attention_mask'])
    test_label = np.array(test_dataset['label'])
    test_input_ids = np.array(test_dataset['input_ids'])
    test_attention_mask = np.array(test_dataset['attention_mask'])
    nsmc.close()

    train_dataset = NSMCDataset(train_label, train_input_ids, train_attention_mask)
    test_dataset = NSMCDataset(test_label, test_input_ids, test_attention_mask)

    train_loader = DataLoader(train_dataset, batch_size=55, shuffle=True, num_workers=8)
    test_loader = DataLoader(test_dataset, batch_size=55, shuffle=False, num_workers=8)

    model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator")
    parameters = filter(lambda p: p.requires_grad, model.parameters())

    args = add_argument()
    model_engine, _, _, _ = deepspeed.initialize(args=args, model=model, model_parameters=parameters)

    losses = []
    accuracies = []

    for epoch in range(args.epochs):
        total_loss = 0.0
        correct = 0
        total = 0
        batches = 0

        for idx, (label, input_ids, attention_masks) in tqdm(enumerate(train_loader), total=len(train_loader)):
            label = label.to(model_engine.local_rank)
            input_ids = input_ids.to(model_engine.local_rank)
            attention_masks = attention_masks.to(model_engine.local_rank)

            # Model inference
            output = model_engine(input_ids, attention_masks)[0]
            _, pred = torch.max(output, 1)

            loss = F.cross_entropy(output, label)
            model_engine.backward(loss)
            model_engine.step()

            total_loss += loss.item()
            correct += (pred == label).sum()
            total += len(label)
            batches += 1

            if batches % 100 == 0:
                print(f"Batch Loss : {total_loss} / Accuracy : {correct.float() / total}")

        losses.append(total_loss)
        accuracies.append(correct.float() / total)
        print(f"Epoch : {epoch} / Train Loss : {total_loss} / Accuracy : {correct.float() / total}")

        test_correct = 0
        test_total = 0
        with torch.no_grad():
            for idx, (label, input_ids, attention_masks) in tqdm(enumerate(test_loader), total=len(test_loader)):
                label = label.to(model_engine.local_rank)
                input_ids = input_ids.to(model_engine.local_rank)
                attention_masks = attention_masks.to(model_engine.local_rank)

                # Model inference
                output = model_engine(input_ids, attention_masks)[0]
                _, pred = torch.max(output, 1)

                test_correct += (pred == label).sum()
                test_total += len(label)

        print(f"Test Accuracy : {test_correct.float() / test_total}")
        model_engine.save_checkpoint('../weights', f"KoELECTRA_{epoch}")
def main():
    args = parse_args()
    os.makedirs(args.output_dir, exist_ok=True)
    set_seed(args.seed)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s :: %(levelname)s :: %(message)s')

    if args.numnet_model is not None:
        config = BertConfig.from_pretrained(args.model_name, num_labels=1)  # 1 label for regression
        # if args.contrastive:
        #     model = ContrastiveElectra.from_pretrained(args.model_name, config=config)
        # else:
        model = BertForSequenceClassification.from_pretrained(args.model_name, config=config)
        state_dicts = torch.load(args.numnet_model)
        if "model" in state_dicts:
            logging.info("Loading in mutual electra format state_dicts.")
            model.load_state_dict(state_dicts["model"], strict=False)
        else:
            logging.info("Loading model weights only.")
            model.load_state_dict(state_dicts, strict=False)
    else:
        config = ElectraConfig.from_pretrained(args.model_name, num_labels=1)  # 1 label for regression
        model = ElectraForSequenceClassification.from_pretrained(args.model_name, config=config)
        if args.local_model_path is not None:
            state_dicts = torch.load(args.local_model_path)
            model.load_state_dict(state_dicts["model"])

    tokenizer = ElectraTokenizer.from_pretrained(args.model_name, do_lower_case=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)  # TODO enable multi-gpu training if necessary

    pretrain_train_dataset = DapoDataset(args.data_dir, "train", tokenizer) if args.pretrain else None
    pretrain_dev_dataset = DapoDataset(args.data_dir, "dev", tokenizer) if args.pretrain else None

    if args.train:
        if args.contrastive:
            train_dataset = ContrastiveDataset(args.data_dir, "train", tokenizer)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=False,
                                          num_workers=8,
                                          collate_fn=mutual_contrast_collate)
            dev_dataset = ContrastiveDataset(args.data_dir, "dev", tokenizer) if args.eval or args.test else None
            dev_dataloader = DataLoader(dev_dataset,
                                        batch_size=args.train_batch_size,
                                        shuffle=False,
                                        num_workers=8,
                                        collate_fn=mutual_contrast_collate) if dev_dataset is not None else None
        else:
            train_dataset = MutualDataset(args.data_dir, "train", tokenizer)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=True,
                                          num_workers=8,
                                          collate_fn=mutual_collate)
            dev_dataset = MutualDataset(args.data_dir, "dev", tokenizer) if args.eval or args.test else None
            dev_dataloader = DataLoader(dev_dataset,
                                        batch_size=args.train_batch_size,
                                        shuffle=False,
                                        num_workers=8,
                                        collate_fn=mutual_collate) if dev_dataset is not None else None
    else:
        train_dataset, train_dataloader = None, None

    # TODO: add test_dataset if we want to submit to leaderboard
    pretrain_train_dataloader = DataLoader(pretrain_train_dataset,
                                           batch_size=args.train_batch_size,
                                           shuffle=True,
                                           num_workers=8,
                                           collate_fn=dapo_collate) if pretrain_train_dataset is not None else None
    pretrain_dev_dataloader = DataLoader(pretrain_dev_dataset,
                                         batch_size=args.train_batch_size,
                                         shuffle=False,
                                         num_workers=8,
                                         collate_fn=dapo_collate) if pretrain_dev_dataset is not None else None

    # currently eval_batch_size = train_batch_size
    if args.pretrain:
        logging.info("Start pretraining...")
        args.eval = True
        trainer = Trainer(args, model, device, pretrain_train_dataloader, pretrain_dev_dataloader)
        trainer.train()
        return  # fine-tuning should be done separately

    if args.train:
        logging.info("Start training...")
        trainer = Trainer(args, model, device, train_dataloader, dev_dataloader)
        trainer.train()

    # TODO: currently testing is on the dev set
    if args.test:
        logging.info("Start testing...")
        tester = Tester(args, model, device, dev_dataset, dev_dataloader)
        tester.test()
def main(args):
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/train.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Set the seed value all over the place to make this reproducible.
    seed_val = args.seed
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Choose device
    device = get_default_device()

    prompts_train_idxs = np.loadtxt(args.train_prompts_idxs_path, dtype=np.int64)
    topics_dist = np.loadtxt(args.unique_prompts_distribution_path, dtype=np.int32)
    # Normalise
    topics_dist = topics_dist / np.linalg.norm(topics_dist, 1)

    # Load the ELECTRA tokenizer.
    print('Loading ELECTRA tokenizer...')
    tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator', do_lower_case=True)

    with open(args.unique_prompts_path) as f:
        topics = f.readlines()
    # Remove whitespace and convert to lowercase
    topics = [x.strip().lower() for x in topics]

    with open(args.train_resps_path) as f:
        responses = f.readlines()
    # Remove whitespace and convert to lowercase
    responses = [x.strip().lower() for x in responses]

    # Tokenize all the prompts and the responses and map the tokens to their word IDs
    topic_ids = []
    for sent in topics:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        topic_ids.append(encoded_sent)
    resp_ids = []
    for sent in responses:
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)
        resp_ids.append(encoded_sent)

    MAX_LEN_topic = max([len(sen) for sen in topic_ids])
    MAX_LEN_resp = max([len(sen) for sen in resp_ids])

    print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

    # Pad our input tokens with value 0.
    # "post" indicates that we want to pad and truncate at the end of the sequence,
    # as opposed to the beginning.
    topic_ids = pad_sequences(topic_ids, maxlen=MAX_LEN_topic, dtype="long",
                              value=0, truncating="post", padding="post")
    resp_ids = pad_sequences(resp_ids, maxlen=MAX_LEN_resp, dtype="long",
                             value=0, truncating="post", padding="post")

    # The attention mask makes it explicit which tokens are actual words and which are padding:
    # - If a token ID is 0, it is padding, so set the mask to 0.
    # - If a token ID is > 0, it is a real token, so set the mask to 1.
    attention_masks_topic = []
    for sent in topic_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks_topic.append(att_mask)

    attention_masks_resp = []
    for sent in resp_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        attention_masks_resp.append(att_mask)

    # Convert to torch tensors
    prompts_train_idxs = torch.from_numpy(prompts_train_idxs)
    prompts_train_idxs = prompts_train_idxs.long()
    topic_ids = torch.tensor(topic_ids)
    topic_ids = topic_ids.long()
    topic_ids = topic_ids.to(device)
    attention_masks_topic = torch.tensor(attention_masks_topic)
    attention_masks_topic = attention_masks_topic.long()
    attention_masks_topic = attention_masks_topic.to(device)
    resp_ids = torch.tensor(resp_ids)
    resp_ids = resp_ids.long()
    resp_ids = resp_ids.to(device)
    attention_masks_resp = torch.tensor(attention_masks_resp)
    attention_masks_resp = attention_masks_resp.long()
    attention_masks_resp = attention_masks_resp.to(device)

    # Create the DataLoader for our training set.
    print(prompts_train_idxs.size(0))
    print(resp_ids.size(0))
    print(attention_masks_resp.size(0))
    train_data = TensorDataset(prompts_train_idxs, resp_ids, attention_masks_resp)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    # Load ElectraForSequenceClassification: the pretrained ELECTRA discriminator with a single
    # linear classification layer on top.
    model = ElectraForSequenceClassification.from_pretrained(
        "google/electra-base-discriminator",
        num_labels=2,               # The number of output labels - 2 for binary classification.
        output_attentions=False,    # Whether the model returns attention weights.
        output_hidden_states=False, # Whether the model returns all hidden states.
    )
    model.to(device)

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch);
    # the 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)

    loss_values = []

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.n_epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value in run_glue.py
                                                num_training_steps=total_steps)

    for epoch in range(args.n_epochs):
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, args.n_epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes and report progress.
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            p_id = batch[0].to(device)
            r = batch[1].to(device)
            r_msk = batch[2].to(device)

            # Perform dynamic shuffling
            p_id, r, r_msk, y_true, batch_size = _shuffle(p_id, r, r_msk, topics_dist, args.num_topics, device)

            # Get the prompts from the topics
            p, p_msk = _get_prompts(p_id, topic_ids, attention_masks_topic)
            p, p_msk = p.to(device), p_msk.to(device)

            # Concatenate prompts and responses
            pr_resp, pr_resp_msk = _join_pr_resp(p, p_msk, r, r_msk, args.reverse)
            pr_resp, pr_resp_msk = pr_resp.to(device), pr_resp_msk.to(device)

            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # This returns the loss (rather than the model output) because `labels` are provided.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(pr_resp, token_type_ids=None, attention_mask=pr_resp_msk, labels=y_true)

            # The call to `model` always returns a tuple, so we need to pull the
            # loss value out of the tuple.
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; `.item()` returns the Python value from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0 to help prevent exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule" - how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # Validation is still to be added - see the rest of the tutorial at
    # https://medium.com/@aniruddha.choudhury94/part-2-bert-fine-tuning-tutorial-with-pytorch-for-text-classification-on-the-corpus-of-linguistic-18057ce330e1

    # Save the model to a file
    file_path = args.save_path + 'electra_classifier_seed' + str(args.seed) + '.pt'
    torch.save(model, file_path)
import os
import torch
import numpy as np
from flask import Flask, request, jsonify
from transformers import (ElectraTokenizer, ElectraForSequenceClassification)

app = Flask(__name__)

device = "cuda" if torch.cuda.is_available() else "cpu"
max_seq_length = int(os.getenv("PODOLI_MAX_LENGTH", 128))

model = ElectraForSequenceClassification.from_pretrained('model')
model.to(device)

tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-v3-discriminator", do_lower_case=False
)


def featurize(comments):
    tokens_a = tokenizer.tokenize(comments)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
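# Hedged sketch of a scoring endpoint built on the objects above; the route name and the
# direct use of the tokenizer (rather than the featurize() helper) are assumptions.
@app.route("/score", methods=["POST"])
def score():
    text = request.json.get("text", "")
    inputs = tokenizer(text, truncation=True, max_length=max_seq_length, return_tensors="pt").to(device)
    with torch.no_grad():
        probs = torch.softmax(model(**inputs)[0], dim=-1)[0]
    return jsonify({"probabilities": probs.tolist()})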
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]
    return input_ids, attention_mask, y


# Only the train split is used for training; using the test split would be penalized.
train_dataset = NSMC_Dataset("ratings_train.txt", "train")
test_dataset = NSMC_Dataset("ratings_test.txt", "train")
sample_dataset = NSMC_Dataset("ko_data.csv", "sample")

tmpstr = '훌륭하다. 초한지 얼른 읽어보고 다시 봐야겠다. 연출 훌륭하다 껄껄 한신의 토사구팽은 슬펐다'
print(train_dataset.clean_text(txt=tmpstr))

"""# Create Model"""

model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator").to(device)
model.cuda()

# Try a single forward pass:
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

try:
    model.load_state_dict(torch.load("model.pt"))
except:
    print("error - model.load_state_dict(torch.load('model.pt'))")
else:
    print("success - model.load_state_dict(torch.load('model.pt'))")

# Inspect the model layers
model
def train():
    # load model and tokenizer
    # MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
    tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)
    print(tokenizer.tokenize("이순신은 조선 중기의 무신이다."))
    print(tokenizer.tokenize("아버지가방에들어가신다."))
    tokenized_str = tokenizer.tokenize("이순신은 조선 중기의 무신이다." + tokenizer.sep_token + "아버지가방에들어가신다.")
    print(tokenized_str)

    # load dataset
    train_dataset = load_data("/opt/ml/input/data/train/train.tsv")
    train_label = train_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)

    # make dataset for pytorch
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    train_dataset, dev_dataset = torch.utils.data.random_split(RE_train_dataset, [7000, 2001])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameters
    bert_config = ElectraConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = ElectraForSequenceClassification.from_pretrained(MODEL_NAME, config=bert_config)
    model.to(device)

    # Besides the options used here, many other options are available; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        save_total_limit=4,              # limit on the number of saved checkpoints
        load_best_model_at_end=True,
        save_steps=100,                  # model saving step
        num_train_epochs=10,             # total number of training epochs
        learning_rate=5e-5,              # learning rate
        per_device_train_batch_size=8,   # batch size per device during training
        per_device_eval_batch_size=8,    # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for the learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=100,               # log saving step
        evaluation_strategy='steps',     # evaluation strategy to adopt during training
                                         # `no`: no evaluation during training
                                         # `steps`: evaluate every `eval_steps`
                                         # `epoch`: evaluate at the end of each epoch
        eval_steps=100,                  # evaluation step
        dataloader_num_workers=3,
        label_smoothing_factor=0.5)

    trainer = Trainer(
        model=model,                     # the instantiated 🤗 Transformers model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataset,     # training dataset
        eval_dataset=dev_dataset,        # evaluation dataset
        compute_metrics=compute_metrics, # metrics function
    )

    # train model
    trainer.train()
from transformers import ElectraTokenizer, ElectraForSequenceClassification, pipeline
from pprint import pprint

tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-finetuned-nsmc")
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-finetuned-nsmc")

nsmc = pipeline("sentiment-analysis", tokenizer=tokenizer, model=model)

texts = [
    "이 영화는 미쳤다. 넷플릭스가 일상화된 시대에 극장이 존재해야하는 이유를 증명해준다.",
    "촬영감독의 영혼까지 갈아넣은 마스터피스",
    "보면서 화가날수있습니다.",
    "아니 그래서 무슨말이 하고싶은거야 ㅋㅋㅋ",
]

pprint(nsmc(texts))
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model = ElectraForSequenceClassification.from_pretrained('models/ELECTRA_last_line')
tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=128,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


test_dataset = load_dataset(
    'json',
    data_files={'test': 'dataset_last_line/quanta_test.json'},
    field='questions')['test']
test_dataset = test_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))
test_dataset.set_format('torch',
                        columns=['input_ids', 'attention_mask', 'label'])  # column list assumed; the original snippet is truncated here
# tmpstr = "Come on. Hello? I'm sorry you have the wrong number. Okay, I'll call you later dad. I love you."
# print(train_dataset.clean_text(txt=tmpstr))

# test_emotion = 'joy'
# if test_emotion in train_dataset.emotion_dic.keys():
#     print(train_dataset.emotion_dic[test_emotion])
# else:
#     print(0)

# print(train_dataset.__getitem__(11790))
# print(sample_dataset.__getitem__(10))

"""# Create Model"""

model = ElectraForSequenceClassification.from_pretrained("google/electra-base-discriminator", num_labels=8).to(device)
# model.cuda()

# Try a single forward pass:
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

try:
    model.load_state_dict(torch.load("model.pt"))
    # model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model_daniel021_friends_electra_base_epoch4.pt"))
    # model.load_state_dict(torch.load("/content/drive/MyDrive/Colab Notebooks/model_daniel021_friends_electra_large_epoch8.pt"))
except:
    print("error - model.load_state_dict(torch.load(...))")
else:
    print("success - model.load_state_dict(torch.load(...))")