def __init__(self, batch_size, epoch_num, model_name, is_test):
    self.BATCH_SIZE = batch_size
    self.EPOCHS = epoch_num
    self.NUM_LABELS = 4
    self.model_name = model_name
    if self.model_name == "bert":
        self.model_version = 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained(self.model_version)
        if is_test:
            self.model = BertForSequenceClassification.from_pretrained(
                model_name + "_model", num_labels=self.NUM_LABELS)
        else:
            self.model = BertForSequenceClassification.from_pretrained(
                self.model_version, num_labels=self.NUM_LABELS)
    elif self.model_name == "robert":
        self.model_version = 'roberta-base'
        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_version)
        if is_test:
            self.model = RobertaForSequenceClassification.from_pretrained(
                model_name + "_model", num_labels=self.NUM_LABELS)
        else:
            self.model = RobertaForSequenceClassification.from_pretrained(
                self.model_version, num_labels=self.NUM_LABELS)
    elif self.model_name == "albert":
        self.model_version = 'albert-base-v2'
        self.tokenizer = AlbertTokenizer.from_pretrained(self.model_version)
        if is_test:
            self.model = AlbertForSequenceClassification.from_pretrained(
                model_name + "_model", num_labels=self.NUM_LABELS)
        else:
            self.model = AlbertForSequenceClassification.from_pretrained(
                self.model_version, num_labels=self.NUM_LABELS)

    if is_test:
        self.testset = FakeNewsDataset("test", tokenizer=self.tokenizer)
        self.testloader = DataLoader(self.testset,
                                     batch_size=self.BATCH_SIZE,
                                     collate_fn=create_mini_batch)
    else:
        self.trainset = FakeNewsDataset("train", tokenizer=self.tokenizer)
        self.trainloader = DataLoader(self.trainset,
                                      batch_size=self.BATCH_SIZE,
                                      collate_fn=create_mini_batch)
        self.model.train()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-5)

    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
def Get_Model(modelName):
    model = ''
    if modelName == 'XLNet':
        # XLNet model; two output labels for binary classification.
        model = XLNetForSequenceClassification.from_pretrained(
            pretrained_model_path, num_labels=2)
    elif modelName == 'BERT':
        # 12-layer BERT model with an uncased vocab; two output labels for binary classification.
        model = BertForSequenceClassification.from_pretrained(
            pretrained_model_path, num_labels=2)
    elif modelName == 'RoBerta':
        # RoBERTa model; two output labels for binary classification.
        model = RobertaForSequenceClassification.from_pretrained(
            pretrained_model_path, num_labels=2)
    elif modelName == 'Albert':
        # ALBERT model; two output labels for binary classification.
        model = AlbertForSequenceClassification.from_pretrained(
            pretrained_model_path, num_labels=2)
    return model
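# A hedged usage sketch for Get_Model: `pretrained_model_path` is assumed to be
# defined at module level and to point at a local checkpoint or a hub model name.
model = Get_Model('Albert')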
def main():
    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained(
        'albert-base-v2', config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
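# The per-parameter size products above can be written more idiomatically with
# Tensor.numel(). A minimal sketch; count_trainable_params is a hypothetical
# helper, not part of the original code:
def count_trainable_params(model):
    # Sum of element counts over all trainable parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)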
def __init__(self):
    super(AlbertModel, self).__init__()
    self.albert = AlbertForSequenceClassification.from_pretrained(
        "voidful/albert_chinese_base", num_labels=2)  # /bert_pretrain/
    self.device = torch.device("cuda")
    for param in self.albert.parameters():
        param.requires_grad = True  # every parameter requires a gradient
def __init__(self, model_name, model_type):
    """
    Hyper-parameters found with the validation set:
        xlnet-large-cased  : epochs = 4, learning_rate = 1e-5, batch_size = 16, epsilon = 1e-6
        bert-large-uncased : epochs = 4, learning_rate = 3e-5, batch_size = 16, epsilon = 1e-8
        albert-xxlarge-v2  : epochs = 3, learning_rate = 5e-5, batch_size = 8,  epsilon = 1e-6
    To be improved...
    """
    self.model_name = model_name
    self.model_type = model_type

    # Per the transformers library, a batch size of 16 or 32 is advised for training.
    # For memory reasons we take 16. Gradient accumulation did not lead to a great
    # improvement and therefore is not used here.
    if model_type == 'albert':
        self.batch_size = 8
    else:
        self.batch_size = 16

    available_model_name = ["xlnet-large-cased", "bert-large-uncased", "albert-xxlarge-v2"]
    available_model_type = ["bert", "xlnet", "albert"]
    if self.model_name not in available_model_name:
        raise Exception("Error: model_name should be in", available_model_name)
    if self.model_type not in available_model_type:
        raise Exception("Error: model_type should be in", available_model_type)

    # Load the pretrained model with a single linear regression layer on top of the
    # pooled output. To load a fine-tuned model instead, use e.g.
    # BertForSequenceClassification.from_pretrained('./my_saved_model_directory/').
    if self.model_type == 'bert':
        self.config = BertConfig.from_pretrained(self.model_name, num_labels=1)  # num_labels=1 for a regression task
        self.model = BertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'xlnet':
        self.config = XLNetConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = XLNetForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    elif self.model_type == 'albert':
        self.config = AlbertConfig.from_pretrained(self.model_name, num_labels=1)
        self.model = AlbertForSequenceClassification.from_pretrained(self.model_name, config=self.config)
    self.model.cuda()

    if self.model_name == 'xlnet-large-cased':
        self.epochs = 4
        self.lr = 1e-5
        self.eps = 1e-6
    elif self.model_name == 'bert-large-uncased':
        self.epochs = 4
        self.lr = 3e-5
        self.eps = 1e-8
    elif self.model_name == 'albert-xxlarge-v2':
        self.epochs = 3
        self.lr = 5e-5
        self.eps = 1e-6

    # Gradient threshold: gradient norms that exceed this value are scaled down to match it.
    self.max_grad_norm = 1.0
    self.optimizer = AdamW(self.model.parameters(), lr=self.lr, eps=self.eps)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.n_gpu = torch.cuda.device_count()
    torch.cuda.get_device_name(0)
def create_and_check_albert_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask,
        sequence_labels, token_labels, choice_labels):
    config.num_labels = self.num_labels
    model = AlbertForSequenceClassification(config)
    model.eval()
    loss, logits = model(input_ids,
                         attention_mask=input_mask,
                         token_type_ids=token_type_ids,
                         labels=sequence_labels)
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(list(result["logits"].size()),
                                [self.batch_size, self.num_labels])
    self.check_loss_output(result)
def __init__(self, path='model', model_type='albert-base-v2'):
    self.path = path
    self.model_type = model_type
    self.tokenizer = AlbertTokenizer.from_pretrained(self.model_type, do_lower_case=True)
    self.model = AlbertForSequenceClassification.from_pretrained(self.path)
    self.device = "cpu"
    self.model.to(self.device)
    self.model.eval()
def albert_trainer():
    # Load the dataset and metric.
    dataset = load_dataset("glue", 'mnli')
    metric = load_metric('glue', 'mnli')

    # Load the tokenizer.
    tokenizer = AutoTokenizer.from_pretrained('albert-base-v2', use_fast=True)

    # Define a preprocessing method.
    def preprocess_function(examples):
        return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)

    # Preprocess the data.
    encoded_dataset = dataset.map(preprocess_function, batched=True)

    # Load the model.
    model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=3)

    # Set all the training parameters.
    batch_size = 16
    args = TrainingArguments(
        "test-glue",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
    )

    # Define a metric function.
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metric.compute(predictions=predictions, references=labels)

    # Initialize the trainer.
    trainer = Trainer(model,
                      args,
                      train_dataset=encoded_dataset["train"],
                      eval_dataset=encoded_dataset['validation_matched'],
                      tokenizer=tokenizer,
                      compute_metrics=compute_metrics)

    # Train, evaluate, and print the result.
    trainer.train()
    result = trainer.evaluate()
    print(result)
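# A hedged follow-up sketch, if appended at the end of albert_trainer(): persist
# the best checkpoint after training. "test-glue-albert" is a placeholder output
# directory, not from the original code.
trainer.save_model("test-glue-albert")
tokenizer.save_pretrained("test-glue-albert")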
def predict(load_path, file_path='./data/Task2.predict.csv',
            save_path='./data/Task2.predict.result.csv', **kwargs):
    with open(os.path.join(load_path, 'config.pkl'), 'rb') as f:
        config = pickle.load(f)
    config.update(**kwargs)
    config.load_path = load_path
    if config.device == 'cuda' and torch.cuda.is_available():
        torch.cuda.set_device(config.gpu)
    else:
        config.device = 'cpu'

    data_texts = read_prediction_data(file_path)
    if len(data_texts) == 0:
        save_prediction_result([], file_path, save_path)
        return

    # Load the BERT tokenizer.
    if config.bert_model == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif config.bert_model == 'albert':
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
    else:
        raise Exception('Error Bert model.')
    data_encodings = tokenizer(data_texts, truncation=True, padding=True)
    data_dataset = LicenseDataset(data_encodings)
    data_loader = DataLoader(data_dataset, batch_size=config.batch_size, shuffle=False)

    if config.bert_model == 'bert':
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    elif config.bert_model == 'albert':
        model = AlbertForSequenceClassification.from_pretrained('albert-base-v1')
    else:
        raise Exception('Error Bert model.')
    if config.load_path:
        model = load_model(model, path=config.load_path, name=config.ckpt_name)
    model.to(config.device)
    model.eval()

    pred = []
    for i, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(config.device)
        attention_mask = batch['attention_mask'].to(config.device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs[0]
        pred.extend(torch.argmax(logits, dim=1).tolist())
    save_prediction_result(pred, file_path, save_path)
def __init__(self, path='output', model_type='albert-base-v2'):
    self.path = path
    self.model_type = model_type
    self.tokenizer = AlbertTokenizer.from_pretrained(self.model_type, do_lower_case=True)
    self.model = AlbertForSequenceClassification.from_pretrained(self.path)
    self.device = "cuda" if torch.cuda.is_available() else "cpu"
    self.model.to(self.device)
    self.model.eval()
def __init__(self, requires_grad=True):
    super(AlbertModel, self).__init__()
    self.albert = AlbertForSequenceClassification.from_pretrained(
        'albert-xxlarge-v2', num_labels=2)
    self.tokenizer = AutoTokenizer.from_pretrained('albert-xxlarge-v2', do_lower_case=True)
    self.requires_grad = requires_grad
    self.device = torch.device("cuda")
    for param in self.albert.parameters():
        param.requires_grad = requires_grad  # honor the flag instead of hard-coding True
def apply_rules(dataset_path, model_path):
    data = np.load(dataset_path, allow_pickle=True)
    test_data = [separate_answers(x[0]) for x in data if int(x[1]) == 0]
    top_rules = np.load("final_rules.npy", allow_pickle=True)
    tr2 = replace_rules.TextToReplaceRules(nlp, [x[1] for x in test_data], [],
                                           min_freq=0.005, min_flip=0.005, ngram_size=2)
    # Own model
    model = AlbertForSequenceClassification.from_pretrained(pretrained_weights, num_labels=3)
    model.load_state_dict(torch.load(model_path))
    model.cuda()
    model.eval()

    tokenized_stud_ans = tokenizer.tokenize([x[1] for x in test_data])
    model_preds = {}
    rule_flip_amount = {}
    data_id_flipped = {}
    a = time.time()
    for rule in top_rules:
        idxs = list(tr2.get_rule_idxs(rule))
        to_apply = [tokenized_stud_ans[x] for x in idxs]
        applies, nt = rule.apply_to_texts(to_apply, fix_apostrophe=False)
        # Find indices where the rule has been applied.
        applies = [idxs[x] for x in applies]
        to_compute = [x for x in zip(applies, nt) if x[1] not in model_preds]
        if to_compute:
            # New predictions
            new_labels = []
            for compute in to_compute:
                j, new_stud = compute
                # Get the reference answer for sequence classification.
                orig_instance = test_data[j]
                logits = predict(model, orig_instance[0], new_stud, 0)
                new_label = int(np.argmax(logits))
                new_labels.append(new_label)
            for x, y in zip(to_compute, new_labels):
                model_preds[x[1]] = y
        new_labels = np.array([model_preds[x] for x in nt])
        where_flipped = np.where(new_labels == 2)[0]
        flips = sorted([applies[x] for x in where_flipped])
        rule_flip_amount[rule.hash()] = len(flips)
        data_id_flipped[rule.hash()] = list(where_flipped)
        # print("Done with " + rule.hash())

    # Top 10 rules
    top_10 = [x.replace("text_", "").replace("pos_", "") for x in
              list({k: v for k, v in sorted(rule_flip_amount.items(),
                                            key=lambda item: item[1],
                                            reverse=True)})[:10]]
    np.save(model_path[:model_path.rfind("/") + 1] + "top_10.npy", top_10)
    print("Time used for applying rules: ", time.time() - a)
    print("Total amount of adversaries:", sum(list(rule_flip_amount.values())))
    print("Total amount of afflicted data instances:",
          len(set(np.concatenate(list(data_id_flipped.values())).ravel().tolist())))
def initialize(self, ctx):
    self.manifest = ctx.manifest
    properties = ctx.system_properties
    model_dir = properties.get("model_dir")
    serialized_file = self.manifest['model']['serializedFile']
    model_pt_path = os.path.join(model_dir, serialized_file)
    self.device = torch.device("cuda:" + str(properties.get("gpu_id"))
                               if torch.cuda.is_available() else "cpu")

    # Read configs for the mode, model_name, etc. from setup_config.json.
    setup_config_path = os.path.join(model_dir, "setup_config.json")
    if os.path.isfile(setup_config_path):
        with open(setup_config_path) as setup_config_file:
            self.setup_config = json.load(setup_config_file)
    else:
        logger.warning('Missing the setup_config.json file.')

    # Load the model and tokenizer from checkpoint and config files based on the
    # user's choice of mode; further setup config can be added.
    if self.setup_config["save_mode"] == "torchscript":
        self.model = torch.jit.load(model_pt_path)
    elif self.setup_config["save_mode"] == "pretrained":
        if self.setup_config["mode"] == "sequence_classification":
            self.model = AlbertForSequenceClassification.from_pretrained(model_dir)
        # elif self.setup_config["mode"] == "question_answering":
        #     self.model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
        # elif self.setup_config["mode"] == "token_classification":
        #     self.model = AutoModelForTokenClassification.from_pretrained(model_dir)
        else:
            logger.warning('Missing the operation mode.')
    else:
        logger.warning('Missing the checkpoint or state_dict.')

    # if not os.path.isfile(os.path.join(model_dir, "vocab.*")):
    #     self.tokenizer = BertTokenizer.from_pretrained(self.setup_config["model_name"],
    #                                                    do_lower_case=self.setup_config["do_lower_case"])
    # else:
    self.tokenizer = BertTokenizer.from_pretrained(model_dir,
                                                   do_lower_case=self.setup_config["do_lower_case"])

    self.model.to(self.device)
    self.model.eval()
    logger.debug('Transformer model from path {0} loaded successfully'.format(model_dir))

    # Read the mapping file, index to object name.
    mapping_file_path = os.path.join(model_dir, "index_to_name.json")
    # Question answering does not need the index_to_name.json file.
    if not self.setup_config["mode"] == "question_answering":
        if os.path.isfile(mapping_file_path):
            with open(mapping_file_path) as f:
                self.mapping = json.load(f)
        else:
            logger.warning('Missing the index_to_name.json file.')

    self.initialized = True
def __init__(self, requires_grad=True, num_labels=2):
    super(AlbertModel, self).__init__()
    self.num_labels = num_labels
    self.albert = AlbertForSequenceClassification.from_pretrained(
        'voidful/albert_chinese_base', num_labels=self.num_labels)
    self.tokenizer = BertTokenizer.from_pretrained(
        'voidful/albert_chinese_base', do_lower_case=True)
    # self.albert = AlbertForSequenceClassification.from_pretrained('albert-xxlarge-v2', num_labels=self.num_labels)
    # self.tokenizer = AutoTokenizer.from_pretrained('albert-xxlarge-v2', do_lower_case=True)
    self.requires_grad = requires_grad
    self.device = torch.device("cuda")
    for param in self.albert.parameters():
        param.requires_grad = True  # every parameter requires a gradient
def pick_model(model_name, num_labels):
    """
    Return the specified model.

    Available model names: ['albert-base-v2', 'bert-base-uncased', 'bert-large-uncased',
                            'roberta-base', 'roberta-large', 'roberta-large-mnli',
                            'xlnet-base-cased']
    """
    if model_name == 'albert-base-v2':
        model = AlbertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=False,  # Whether the model returns attention weights.
            output_hidden_states=False,  # Whether the model returns all hidden states.
        )
    if model_name in ('bert-base-uncased', 'bert-large-uncased'):
        model = BertForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    if model_name in ('roberta-base', "roberta-large", "roberta-large-mnli"):
        model = RobertaForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    if model_name == 'xlnet-base-cased':
        model = XLNetForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    print(f'Loaded {model_name} model.')
    if torch.cuda.is_available():
        model.cuda()
    return model
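# A minimal usage sketch for pick_model; num_labels=2 is an illustrative choice,
# and any of the names in the docstring would work.
model = pick_model('albert-base-v2', num_labels=2)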
def create_and_check_for_sequence_classification(
        self, config, input_ids, token_type_ids, input_mask,
        sequence_labels, token_labels, choice_labels):
    config.num_labels = self.num_labels
    model = AlbertForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids,
                   attention_mask=input_mask,
                   token_type_ids=token_type_ids,
                   labels=sequence_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def define_model_albert(cuda, epochs_steps, learn, length):
    from transformers import AlbertForSequenceClassification
    model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
    if cuda == 1:
        model.cuda()
    optimizer = AdamW(model.parameters(), lr=learn, eps=1e-8)

    from transformers import get_linear_schedule_with_warmup
    # Number of training epochs (the authors recommend between 2 and 4).
    # Total number of training steps is number of batches * number of epochs.
    total_steps = length * epochs_steps
    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    return model, scheduler, optimizer
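# A hedged usage sketch for define_model_albert; `train_dataloader` is assumed
# to exist, so `length` is the number of batches per epoch.
model, scheduler, optimizer = define_model_albert(
    cuda=1, epochs_steps=4, learn=2e-5, length=len(train_dataloader))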
def model_setting(model_name):
    if model_name == 'bert':
        from transformers import AutoTokenizer, BertForSequenceClassification, BertConfig
        config = BertConfig.from_pretrained("bert-base-uncased", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
        return config, tokenizer, model
    elif model_name == 'albert':
        from transformers import AutoTokenizer, AlbertForSequenceClassification, AlbertConfig
        config = AlbertConfig.from_pretrained("albert-base-v2", num_labels=2)
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        model = AlbertForSequenceClassification.from_pretrained("albert-base-v2")
        return config, tokenizer, model
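# A minimal usage sketch for model_setting; the sample sentence is illustrative only.
config, tokenizer, model = model_setting('albert')
inputs = tokenizer("A short example sentence.", return_tensors="pt")
logits = model(**inputs)[0]  # scores over the 2 configured labels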
def main(model_name_or_path, args):
    config_class, model_class, tokenizer_class = (AlbertConfig, AlbertModel, AlbertTokenizer)
    config = load_from_file_or_base(config_class, model_name_or_path)
    model = load_from_file_or_base(model_class, model_name_or_path)
    tokenizer = load_from_file_or_base(tokenizer_class, model_name_or_path)

    if args.model_embedding_visual:
        wordembeddings = model.embeddings.word_embeddings
        # wordembeddings = model.albert.get_input_embeddings()
        logger.info(wordembeddings)
        vocab_word2id = tokenizer.get_vocab()
        word_lookup = {id: key for key, id in vocab_word2id.items()}
        n_samples = 1000
        sampled_ids = torch.randint(high=len(vocab_word2id.values()),
                                    size=(1, n_samples),
                                    dtype=torch.long).flatten()
        logger.info("Sampled ids shape: %s", sampled_ids.shape)
        embedded_vectors = wordembeddings(sampled_ids)

        from sklearn.manifold import TSNE
        embedded_reduced = TSNE(n_components=2).fit_transform(embedded_vectors.detach().numpy())
        logger.info(embedded_reduced)
        names = []
        for sid in sampled_ids:
            names.append(word_lookup[int(sid.int())])
        plot_reduced_space(embedded_reduced, names=names)
    elif args.model_sentence_order:
        text_a = "It is absolutely necessary that all experiments should be recorded in detail during, " \
                 "or immediately after, their performance ..."
        text_b = "The more scientific the record is, the better."
        # Sadly, the SOP task is not implemented in Huggingface Transformers at the moment:
        # https://github.com/huggingface/transformers/issues/2671
        # But there is a workaround using BERT's NSP, which in the wild will mostly be used
        # for sentence similarity queries. The following issue nicely illustrates how to do that:
        # https://github.com/huggingface/transformers/issues/876
        # If we had fine-tuned this model, one could query it with the following code,
        # but without fine-tuning you'll always get random results:
        model = AlbertForSequenceClassification.from_pretrained(model_name_or_path)
        tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path)
        inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, return_tensors='pt')
        pred = model(inputs['input_ids'], token_type_ids=inputs['token_type_ids'])
        print(pred)
def model_fn(model_path='model_path'):
    dout = 0.1
    model = AlbertForSequenceClassification.from_pretrained(
        model_type,
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
        attention_probs_dropout_prob=dout,
        hidden_dropout_prob=dout,
    )
    model.to(DEVICE)
    if DEVICE == 'cuda':
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    # model.eval()
    return model
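# A hedged inference sketch around model_fn; `model_type` and `DEVICE` are
# module-level globals in the original, and the tokenizer line is an assumption
# added to make the sketch self-contained.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = model_fn('model_path')
model.eval()
with torch.no_grad():
    enc = tokenizer("An example input.", return_tensors="pt").to(DEVICE)
    logits = model(**enc)[0]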
def main():
    # Initialize a pre-trained ALBERT-base style tokenizer.
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

    # Initialize data iterators.
    train_generator = SingleSentenceClassificationProcessor()
    train_generator.add_examples_from_csv(file_name='data/train.tsv', column_label=1, column_text=0)
    train_dataset = train_generator.get_features(tokenizer=tokenizer)  # , return_tensors='pt')

    eval_generator = SingleSentenceClassificationProcessor()
    eval_generator.add_examples_from_csv(file_name='data/dev.tsv', column_label=1, column_text=0)
    eval_dataset = eval_generator.get_features(tokenizer=tokenizer)  # was train_generator; each split uses its own processor

    test_generator = SingleSentenceClassificationProcessor()
    test_generator.add_examples_from_csv(file_name='data/test.tsv', column_label=1, column_text=0)
    test_dataset = test_generator.get_features(tokenizer=tokenizer)

    model = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()

    test_batch = next(iter(test_dataset))
    print(f'Test batch is {test_batch}')
    pred = model(
        torch.tensor(test_batch.input_ids).unsqueeze(0).cuda(),
        torch.tensor(test_batch.label).unsqueeze(0).cuda())
    print(f'Prediction: {pred}')
def sentiment_analysis(model_type, data_path):
    if model_type == 'albert':
        model = AlbertForSequenceClassification.from_pretrained("textattack/albert-base-v2-SST-2")
        tokenizer = AlbertTokenizer.from_pretrained("textattack/albert-base-v2-SST-2")
    elif model_type == 'bert':
        model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-SST-2")
        tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-SST-2")
    elif model_type == 'distil':
        model = DistilBertForSequenceClassification.from_pretrained("textattack/distilbert-base-cased-SST-2")
        tokenizer = DistilBertTokenizer.from_pretrained("textattack/distilbert-base-cased-SST-2")
    elif model_type == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained("textattack/roberta-base-SST-2")
        tokenizer = RobertaTokenizer.from_pretrained("textattack/roberta-base-SST-2")

    nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
    tl = TextLoader(data_path)
    ground_truth = list()
    review_predictions = list()
    label_dict = {'LABEL_0': 0, 'LABEL_1': 1}
    for data in tqdm(tl):
        text = data['text']
        score = data['score']
        score = score >= 2.5
        result = nlp(text, truncation=True)
        prediction = label_dict[result[0]['label']]
        ground_truth.append(score.cpu().numpy())
        review_predictions.append(prediction)
    accuracy = accuracy_score(ground_truth, review_predictions)
    print('ACCURACY: ', accuracy)
    return accuracy
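# A hedged usage sketch for sentiment_analysis; './data/reviews' is a placeholder
# path, not from the original code.
accuracy = sentiment_analysis('albert', './data/reviews')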
def __init__(self, hyperparams):
    """
    :param hyperparams: dict of hyper-parameters
    :type hyperparams: dict

    pretrained_weights in ['albert-base-v1', 'albert-base-v2'];
    more at https://huggingface.co/transformers/pretrained_models.html
    """
    set_seed(hyperparams["random_state"], hyperparams["n_gpu"])
    pretrained_weights = hyperparams['pretrained_weights']
    self.tokenizer = AlbertTokenizer.from_pretrained(pretrained_weights)
    hyperparams["tokenizer"] = self.tokenizer
    self.hyperparams = hyperparams
    self.model = AlbertForSequenceClassification.from_pretrained(
        pretrained_weights, num_labels=3)
    self.processor = NLIProcessor(hyperparams)
def __call_model_torch(self):
    if self.model_to_use.lower() == 'bert':
        self.config = BertConfig(num_labels=2)
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=self.config)
    elif self.model_to_use.lower() == 'albert':
        self.config = AlbertConfig(num_labels=2)
        self.model = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v1', config=self.config)
    elif self.model_to_use.lower() == 'electra':
        self.config = ElectraConfig(num_labels=2)
        self.model = ElectraForSequenceClassification.from_pretrained(
            'google/electra-small-discriminator', config=self.config)
    elif self.model_to_use.lower() == 'distilbert':
        self.config = DistilBertConfig(num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', config=self.config)
    else:
        print('Model not available yet.')
def __init__(self, num_classes, max_seq_length, batch_size, model_name, model_path):
    self.num_classes = num_classes
    self.classification_model_dir = model_path
    self.max_seq_length = max_seq_length
    self.predict_batch_size = batch_size
    self.model_name = model_name
    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        self.device = torch.device("cpu")

    if self.model_name == 'bert':
        self.model = BertForSequenceClassification.from_pretrained(
            self.classification_model_dir, num_labels=self.num_classes)
        self.tokenizer = BertTokenizer.from_pretrained(self.classification_model_dir)
    if self.model_name == 'albert':
        self.model = AlbertForSequenceClassification.from_pretrained(
            self.classification_model_dir, num_labels=self.num_classes)
        self.tokenizer = AlbertTokenizer.from_pretrained(self.classification_model_dir)
    if self.model_name == 'distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained(
            self.classification_model_dir, num_labels=self.num_classes)
        self.tokenizer = DistilBertTokenizer.from_pretrained(self.classification_model_dir)
    if self.model_name == 'roberta':
        self.model = RobertaForSequenceClassification.from_pretrained(
            self.classification_model_dir, num_labels=self.num_classes)
        self.tokenizer = RobertaTokenizer.from_pretrained(self.classification_model_dir)

    if torch.cuda.is_available():
        self.model.cuda()
def main():
    # blockPrint()
    # setting device
    device = torch.device('cuda')

    FullData = MR_Data.load_data('dataset/test.tsv', is_train_data=False)
    FullDataset = makeTorchDataSet(FullData, is_train_data=False)
    TestDataLoader = makeTorchDataLoader(FullDataset, batch_size=16)
    model_config = AlbertConfig.from_json_file('model/albert-large-config.json')
    trained_model_file = '12-11-2019_09-17-05_ALSS_e5_a69.24226892192033'
    model = AlbertForSequenceClassification.from_pretrained(
        'train_models/' + trained_model_file + '/pytorch_model.bin', config=model_config)
    model.to(device)
    model.eval()

    f = open('submission.csv', 'w', encoding='utf-8')
    f.write('PhraseId,Sentiment\n')
    log("please wait for the prediction ...")
    for batch_index, batch_dict in enumerate(TestDataLoader):
        batch_dict = tuple(t.to(device) for t in batch_dict)
        input_ids, phrase_ids = batch_dict
        outputs = model(input_ids)
        outputs = outputs[0].cpu()
        outputs = outputs.detach().numpy()
        # log(outputs)
        for i in range(len(outputs)):
            p_id = phrase_ids[i].item()
            s_level = np.argmax(outputs[i])
            # log("phrase_id", p_id, "segment_level", s_level)
            f.write(str(p_id) + ',' + str(s_level) + '\n')
    f.close()
def call(self):
    if self.model_to_use.lower() == 'bert':
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
        print('Bert loaded.')
        print(self.model)
    elif self.model_to_use.lower() == 'albert':
        self.model = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v1',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
    elif self.model_to_use.lower() == 'electra':
        self.model = ElectraForSequenceClassification.from_pretrained(
            'google/electra-small-discriminator',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
    elif self.model_to_use.lower() == 'distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=2,
            output_attentions=False,
            output_hidden_states=False)
    else:
        print('Model not available right now.')

    self.model.to(self.device)
    self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=self.epsilon)
    self.total_steps = len(self.train_dataloader) * self.epochs
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=0,
        num_training_steps=self.total_steps)
def evaluate(loader, model_dir, ckpt, num_labels):
    loss = 0.0
    nb_eval_steps = 0
    y_pred = None
    model = AlbertForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels)
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(os.path.join(model_dir, ckpt)))
    model = model.cuda()
    for batch in tqdm(loader, desc="Evaluating"):
        model.eval()
        inp_ids, seg_ids, inp_masks, labels = batch
        inp_ids = inp_ids.cuda()
        seg_ids = seg_ids.cuda()
        inp_masks = inp_masks.cuda()
        labels = labels.cuda()
        with torch.no_grad():
            # Pass masks and segment ids by keyword: positionally, the second
            # argument is attention_mask, not token_type_ids.
            tmp_loss, logits = model(inp_ids, attention_mask=inp_masks,
                                     token_type_ids=seg_ids, labels=labels)
        loss += tmp_loss.mean().item()
        nb_eval_steps += 1
        if y_pred is None:
            y_pred = np.argmax(logits.detach().cpu().numpy(), axis=1)
            y_true = labels.detach().cpu().numpy()
        else:
            y_pred = np.append(y_pred, np.argmax(logits.detach().cpu().numpy(), axis=1))
            y_true = np.append(y_true, labels.detach().cpu().numpy())
    loss = loss / nb_eval_steps
    acc = precision_score(y_true, y_pred, average='weighted')  # weighted precision, reported as "test_acc"
    print(f"test_acc: {acc}\ttest_loss: {loss}")
    return acc, loss
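# A hedged usage sketch for evaluate(); the paths and loader are placeholders,
# and `test_loader` must yield (input_ids, segment_ids, input_masks, labels) batches.
acc, loss = evaluate(test_loader, model_dir='./checkpoints', ckpt='model.pt', num_labels=2)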
def get_sim_model(config_file, pre_train_model, label_num=2):
    # Use AlbertConfig here: AlbertForSequenceClassification expects ALBERT-specific
    # fields (e.g. embedding_size) that BertConfig does not define.
    albert_config = AlbertConfig.from_pretrained(config_file)
    albert_config.num_labels = label_num
    model = AlbertForSequenceClassification(albert_config)
    model.load_state_dict(torch.load(pre_train_model))
    return model
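# A minimal usage sketch for get_sim_model; both file paths are placeholders.
model = get_sim_model('albert_config.json', 'sim_model.bin', label_num=2)
model.eval()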
def main(args):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    assert args.model in ["albert-classifier", "attn-lstm", "cond-attn-lstm"]

    if args.model == "albert-classifier":
        train_dataloader, validation_dataloader = get_dataloader_ALBERT(
            tokenizer, args.data_file, args.batch, args.max_len)
        classifier = AlbertForSequenceClassification.from_pretrained('albert-base-v2')
        classifier.config.classifier_dropout_prob = 0.1

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            # Parameters whose names *don't* include 'bias' or 'LayerNorm.weight' get weight decay.
            {
                'params': [p for n, p in classifier.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': args.wd
            },
            # Parameters whose names *do* include them get none.
            {
                'params': [p for n, p in classifier.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            }
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=args.adam_eps)
        # optimizer = AdamW(classifier.parameters(), lr=LR, eps=EPS)
        total_steps = len(train_dataloader) * args.epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,  # Default value in run_glue.py
            num_training_steps=total_steps)
        # Weight decay and clip_grad_norm; 10K data, LR = 1e-5, WD = 1e-4, BATCH = 32
        # by setting accumulate = 2.
        classifier, history = AlbertTrainer(classifier, optimizer, scheduler,
                                            args.epochs, args.early_stop,
                                            train_dataloader, validation_dataloader,
                                            accumulation_steps=args.accumulate)
    else:
        if args.model == "attn-lstm":
            train_dataloader, validation_dataloader = get_dataloader_LSTM(
                tokenizer, args.data_file, args.batch, args.max_len)
            classifier = PairAttnLSTM(embedding_dim=768,
                                      hidden_dim=args.d_hid,
                                      num_layers=args.n_layer,
                                      label_size=args.n_label)
            optimizer_grouped_parameters = [{
                # was classifier.parameters(), which does not yield (name, param) pairs
                'params': [p for n, p in classifier.named_parameters()],
                'weight_decay_rate': args.wd
            }]
            optimizer = AdamW(classifier.parameters(), lr=args.lr, eps=args.adam_eps)
        else:
            pass
        total_steps = len(train_dataloader) * args.epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,  # Default value in run_glue.py
            num_training_steps=total_steps)

        print("Start training ...")
        print("Max epochs", args.epochs)
        print("Early Stop", args.early_stop)
        print("Batch Size", args.batch)
        print("Accumulate", args.accumulate)
        print("Learning Rate", args.lr)
        print("Weight Decay", args.wd)
        print("Max Sequence Length", args.max_len)
        print("LSTM Hidden Size", args.d_hid)
        print("LSTM Layers", args.n_layer)
        print()
        classifier, history = LSTMTrainer(classifier, optimizer, scheduler,
                                          args.epochs, args.early_stop,
                                          train_dataloader, validation_dataloader,
                                          accumulation_steps=args.accumulate)