from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, TextClassificationPipeline
from flask import Flask, request
import json

from config import SentimentClassificationConfig

config = SentimentClassificationConfig.from_json("config.json")

app = Flask(__name__)

tokenizer = DistilBertTokenizer.from_pretrained(config.model_path)
model = DistilBertForSequenceClassification.from_pretrained(config.model_path)
if config.use_cuda:
    model = model.cuda()


@app.route('/api/rest/classify_sentiment', methods=["POST"])
def classify_sentiment():
    rest_request = json.loads(request.data.decode('utf-8'))
    sentence = str(rest_request["sentence"])
    sentiment_classifier = TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=0 if config.use_cuda else -1)
    result = sentiment_classifier(sentence)
    return str(result)


if __name__ == '__main__':
    app.run(host=config.host, port=config.port, debug=True)

# curl --header "Content-Type: application/json" --request POST \
#      --data '{"sentence":"You are so cute!"}' \
#      http://localhost:5555/api/rest/classify_sentiment
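# A minimal Python client sketch for the endpoint above, assuming the service
# is running on localhost:5555 as in the curl example (host/port actually come
# from config.json):
import requests

resp = requests.post(
    "http://localhost:5555/api/rest/classify_sentiment",
    json={"sentence": "You are so cute!"})
print(resp.text)  # stringified pipeline output, e.g. "[{'label': ..., 'score': ...}]"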
device = torch.device("cuda:0")

# model configuration
batch_size = args.batch_size
lr = args.lr
weight_decay = args.weight_decay
n_epochs = args.n_epochs

if args.full_bert:
    bert_model = 'bert-base-uncased'
    bert_config = BertConfig.from_pretrained(bert_model, num_labels=2)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
else:
    bert_model = 'distilbert-base-uncased'
    bert_config = DistilBertConfig.from_pretrained(bert_model, num_labels=2)
    tokenizer = DistilBertTokenizer.from_pretrained(bert_model)

# wandb initialization (`seed` is assumed to be defined earlier in the script)
wandb.init(project="domain-adaptation-twitter-emnlp",
           name=args.run_name,
           config={
               "epochs": n_epochs,
               "learning_rate": lr,
               "warmup": args.warmup_steps,
               "weight_decay": weight_decay,
               "batch_size": batch_size,
               "train_split_percentage": args.train_pct,
               "bert_model": bert_model,
               "seed": seed,
               "pretrained_model": args.pretrained_model,
               "tags": ",".join(args.tags),
           })
class BERTQA:
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                    return_token_type_ids=True)
    model = DistilBertForQuestionAnswering.from_pretrained(
        'distilbert-base-uncased-distilled-squad')
    MAX_TOKENS = 512
    MAX_TOKENS_QUESTION = 30
    MAX_TOKENS_DOCUMENT = MAX_TOKENS - MAX_TOKENS_QUESTION - 2  # [SEP] and [CLS]

    def __init__(self):
        pass

    def get_token_length(self, string):
        tokens = self.tokenizer.encode(string)
        return len(tokens)

    def chunk_document(self, document, re_consolidate=True):
        '''Chunks up a long document into optimally large pieces so that they
        can be passed to BERT. Activating `re_consolidate` will put the chunks
        back together to make them as large as possible for improved
        performance.
        '''
        document_length = self.get_token_length(document)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            approved_chunks = []
            paragraphs = document.split('\n')
            paragraphs = [par for par in paragraphs if par]
            for paragraph in paragraphs:
                paragraph_length = self.get_token_length(paragraph)
                if paragraph_length > self.MAX_TOKENS_DOCUMENT:
                    sentences = paragraph.split('.')
                    sentences = [sen for sen in sentences if sen]
                    for sentence in sentences:
                        sentence_length = self.get_token_length(sentence)
                        if sentence_length > self.MAX_TOKENS_DOCUMENT:
                            print("Ignoring overlong sentence.")
                        else:
                            approved_chunks.append(sentence)
                else:
                    approved_chunks.append(paragraph)
            if re_consolidate:
                lengths = [
                    self.get_token_length(chunk) for chunk in approved_chunks
                ]
                consolidated_chunks = []
                running_length = 0
                current_chunk = ''
                for chunk, length in zip(approved_chunks, lengths):
                    if (running_length + length) < self.MAX_TOKENS_DOCUMENT:
                        current_chunk += chunk
                        running_length += length
                    else:
                        consolidated_chunks.append(current_chunk)
                        current_chunk = chunk
                        running_length = length
                if current_chunk:  # don't drop the final partial chunk
                    consolidated_chunks.append(current_chunk)
                return consolidated_chunks
            else:
                return approved_chunks
        else:
            return [document]

    def answer_question(self, question, document):
        '''Takes a `question` string and a `document` string (which contains
        the answer), and identifies the words within the `document` that are
        the answer.
        '''
        question_length = self.get_token_length(question)
        document_length = self.get_token_length(document)
        if question_length > self.MAX_TOKENS_QUESTION:
            msg = f'Question exceeds max token length ({question_length}).'
            raise ValueError(msg)
        if document_length > self.MAX_TOKENS_DOCUMENT:
            msg = f'Document exceeds max token length ({document_length}).'
            raise ValueError(msg)
        encoding = self.tokenizer.encode_plus(question, document)
        input_ids, attention_mask = encoding["input_ids"], encoding[
            "attention_mask"]
        start_scores, end_scores = self.model(torch.tensor([input_ids]),
                                              attention_mask=torch.tensor(
                                                  [attention_mask]))
        confidence = float(max(torch.max(start_scores),
                               torch.max(end_scores)))
        start_token = torch.argmax(start_scores)
        end_token = torch.argmax(end_scores)
        ans_tokens = input_ids[start_token:end_token + 1]
        answer_tokens = self.tokenizer.convert_ids_to_tokens(
            ans_tokens, skip_special_tokens=True)
        if not answer_tokens:
            # TODO Understand this bug
            return '<NO ANSWER>', -10
        else:
            answer = answer_tokens[0]
            for token in answer_tokens[1:]:
                if token[0:2] == '##':
                    answer += token[2:]  # merge WordPiece continuations
                else:
                    answer += ' ' + token
            return answer, confidence

    def answer_question_chunked(self, question, document, re_consolidate=True):
        chunks = self.chunk_document(document, re_consolidate=re_consolidate)
        responses = []
        for chunk in tqdm(chunks):
            answer, confidence = self.answer_question(question, chunk)
            response = {
                'answer': answer,
                'confidence': confidence,
                'chunk': chunk
            }
            responses.append(response)
        responses.sort(key=lambda x: -x['confidence'])
        return responses
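# A usage sketch for the class above (`long_document` is a hypothetical string
# longer than the 480-token document budget; torch/tqdm imports as in the snippet):
qa = BERTQA()
responses = qa.answer_question_chunked("Who wrote the report?", long_document)
best = responses[0]  # responses are sorted by descending confidence
print(best['answer'], best['confidence'])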
if build_local_vectors:
    if local_vec_generator == "w2v":
        corpus2wordVSM(all_data_file, w2v_file, txt_file_name,
                       local_vector_size, 5, 5, 4)
    elif local_vec_generator == "bert":
        pass  # TODO train model here instead of externally
        # generate_local_bert_embeddings()

if local_vec_generator == "w2v":
    local_model = models.Word2Vec.load(w2v_file)  # CBOW
    local_tokenizer = None
elif local_vec_generator == "bert":
    local_model = DistilBertModel.from_pretrained(local_bert_model_location)
    local_tokenizer = DistilBertTokenizer.from_pretrained(
        local_bert_model_location)

util = Utility.Utility()
preprocessor = Preprocess.Preprocess()

data_dir = os.path.abspath(
    f"data/multichannel_{global_vec_generator}_global_{local_vec_generator}_local/"
)

if global_vec_generator == "glove":
    global_dim = 50
else:
    global_dim = 768

if not os.path.exists(data_dir):
def main():
    parser = argparse.ArgumentParser(
        description='argument parsing for testing')
    parser.add_argument('--data_dir',
                        default='data',
                        type=str,
                        help='path to data directory - default: \'data\'')
    parser.add_argument('--review_file',
                        default='yelp_reviews_test1000.csv',
                        type=str,
                        help='file name containing reviews')
    parser.add_argument('--batch_size',
                        default=32,
                        type=int,
                        help='batch size - default: 32')
    parser.add_argument('--model_save',
                        default='./model_save',
                        type=str,
                        help='directory to pull model')
    parser.add_argument('--nolog', action='store_true', help='disable logging')

    # parse input arguments
    clargs = parser.parse_args()

    # log to file and stdout
    if clargs.nolog:
        print("Not logging")
    else:
        sys.stdout = Logger('test')

    print("")
    print("==========================================")
    print("-------------Confirm Arguments------------")
    print("==========================================")
    print("Data directory for test data: {0:s}".format(clargs.data_dir))
    print("Test reviews file: {0:s}".format(clargs.review_file))
    print("Batch size of {0:d}".format(clargs.batch_size))
    print("Loading model from: {0:s}".format(clargs.model_save))

    print("")
    print("==========================================")
    print("---------------Generate Data--------------")
    print("==========================================")
    path = clargs.data_dir
    fn = clargs.review_file
    filename = path + "/" + fn

    t0 = time.perf_counter()
    print("Reading in test data from {0:s}".format(clargs.review_file))
    reviews_df = pd.read_csv(filename)
    reviews_df = reviews_df[['text', 'stars']]
    TEST_SIZE = len(reviews_df.index)
    elapsed = time.perf_counter() - t0
    print("Finished reading {0:d} entries | Took {1:0.2f} seconds".format(
        TEST_SIZE, elapsed))

    # load the model from save
    print("")
    print("==========================================")
    print("----------------Load Model----------------")
    print("==========================================")
    print("Loading model and tokenizer from directory")
    model_path = clargs.model_save
    json_infile = model_path + '/' + 'hyperparams.json'
    with open(json_infile, 'r') as infile:
        hyper_json = json.load(infile)
    if 'model' not in hyper_json or hyper_json['model'] == 'bert':
        print("Loading normal bert model")
        tokenizer = BertTokenizer.from_pretrained(model_path)
        model = BertForSequenceClassification.from_pretrained(model_path)
    else:
        print("Loading distilbert Model")
        tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        model = DistilBertForSequenceClassification.from_pretrained(model_path)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    print("Tokenizing the data to be tested")
    dataset = extract_features(reviews_df, tokenizer)
    test_dataloader = DataLoader(dataset,
                                 sampler=SequentialSampler(dataset),
                                 batch_size=clargs.batch_size,
                                 drop_last=False)

    # test the model against some test data
    print("")
    print("==========================================")
    print("----------------Test Model----------------")
    print("==========================================")
    print("Testing - Split {0:d} examples into {1:d} batches".format(
        TEST_SIZE, len(test_dataloader)))
    test_loss, test_acc, pred_labels, actual_labels = evaluate(
        model, device, test_dataloader, TEST_SIZE)
    mae = mean_abs_error(pred_labels, actual_labels)
    mse = mean_square_error(pred_labels, actual_labels)
    conf_matrix = confusion_matrix(pred_labels, actual_labels)

    print("")
    print("==========================================")
    print("---------------TEST RESULTS---------------")
    print("==========================================")
    print("")
    print("-----------TRAINING HYPERPARAMS-----------")
    # use plain "{0}" so both string and numeric JSON values format cleanly
    print("Data directory: {0}".format(hyper_json['dataDirectory']))
    print("Reviews file: {0}".format(hyper_json['dataFile']))
    print("Batch size of {0}".format(hyper_json['batchSize']))
    print("Train ratio of {0}".format(hyper_json['trainRatio']))
    print("Train for {0} epochs".format(hyper_json['numEpochs']))
    print("")
    print("Testing accuracy: ", test_acc)
    print("Mean absolute error: ", mae)
    print("Mean square error: ", mse)
    print("")
    print("-------------CONFUSION MATRIX-------------")
    print("")
    print(conf_matrix)
    print("")
    target_names = ['1 star', '2 star', '3 star', '4 star', '5 star']
    print(
        metrics.classification_report(actual_labels,
                                      pred_labels,
                                      digits=3,
                                      target_names=target_names))
def __init__(self, data_path, seq_length):
    self.data = pd.read_csv(data_path).astype('object')
    self.seq_length = seq_length
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased')
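# A hypothetical companion `__getitem__` for the same dataset class — not the
# author's implementation, just a sketch of how `seq_length` and the stored
# tokenizer would typically be used together (the 'text' column name is an
# assumption):
def __getitem__(self, idx):
    row = self.data.iloc[idx]
    # pad/truncate every example to the fixed sequence length
    encoding = self.tokenizer(str(row['text']),
                              padding='max_length',
                              truncation=True,
                              max_length=self.seq_length,
                              return_tensors='pt')
    return (encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0))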
def wsd(
        model_name='distilbert-base-uncased',  # ensemble-distil-1-albert-1 / albert-xxlarge-v2 / bert-base-uncased
        classifier_input='token-embedding-last-1-layers',  # token-embedding-last-layer / token-embedding-last-n-layers
        classifier_hidden_layers=[],
        reduce_options=True,
        freeze_base_model=True,
        max_len=512,
        batch_size=32,
        test=False,
        lr=5e-5,
        eps=1e-8,
        n_epochs=50,
        cls_token=False,  # If true, the cls token is used instead of the relevant-word token
        cache_embeddings=True,  # If true, the embeddings from the base model are saved to disk so that they only need to be computed once
        save_classifier=True  # If true, the classifier part of the network is saved after each epoch, and the training is automatically resumed from this saved network if it exists
):
    train_path = "wsd_train.txt"
    test_path = "wsd_test_blind.txt"
    n_classes = 222
    device = 'cuda'

    import __main__ as main
    print("Script: " + os.path.basename(main.__file__))

    print("Loading base model %s..." % model_name)
    if model_name.startswith('ensemble-distil-'):
        last_n_distil = int(model_name.replace('ensemble-distil-', "")[0])
        last_n_albert = int(model_name[-1])
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        base_model = AlbertModel.from_pretrained('albert-xxlarge-v2',
                                                 output_hidden_states=True,
                                                 output_attentions=False)
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        print(
            "Ensemble model with DistilBert last %d layers and Albert last %d layers"
            % (last_n_distil, last_n_albert))
    elif model_name.startswith('distilbert'):
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        base_model = DistilBertModel.from_pretrained(model_name,
                                                     num_labels=n_classes,
                                                     output_hidden_states=True,
                                                     output_attentions=False)
    elif model_name.startswith('bert'):
        from transformers import BertTokenizer, BertModel
        tokenizer = BertTokenizer.from_pretrained(model_name)
        base_model = BertModel.from_pretrained(model_name,
                                               num_labels=n_classes,
                                               output_hidden_states=True,
                                               output_attentions=False)
    elif model_name.startswith('albert'):
        from transformers import AlbertTokenizer
        from transformers.modeling_albert import AlbertModel
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        base_model = AlbertModel.from_pretrained(model_name,
                                                 output_hidden_states=True,
                                                 output_attentions=False)

    use_n_last_layers = 1
    if classifier_input == 'token-embedding-last-layer':
        use_n_last_layers = 1
    elif classifier_input.startswith(
            'token-embedding-last-') and classifier_input.endswith('-layers'):
        use_n_last_layers = int(
            classifier_input.replace('token-embedding-last-',
                                     "").replace('-layers', ""))
    else:
        raise ValueError("Invalid classifier_input argument")
    print("Using the last %d layers" % use_n_last_layers)

    def tokenize(text):
        return tokenizer.tokenize(text)[:max_len - 2]

    SENSE = LabelField(is_target=True)
    LEMMA = LabelField()
    TOKEN_POS = LabelField(use_vocab=False)
    TEXT = Field(tokenize=tokenize,
                 pad_token=tokenizer.pad_token,
                 init_token=tokenizer.cls_token,
                 eos_token=tokenizer.sep_token)
    EXAMPLE_ID = LabelField(use_vocab=False)
    fields = [('sense', SENSE), ('lemma', LEMMA), ('token_pos', TOKEN_POS),
              ('text', TEXT), ('example_id', EXAMPLE_ID)]

    def read_data(corpus_file, fields, max_len=None):
        train_id_start = 0
        test_id_start = 76049  # let the ids for the test examples start after the training example indices
        if corpus_file == "wsd_test_blind.txt":
            print("Loading test data...")
            id_start = test_id_start
        else:
            print("Loading train/val data...")
            id_start = train_id_start

        with open(corpus_file, encoding='utf-8') as f:
            examples = []
            for i, line in enumerate(f):
                sense, lemma, word_position, text = line.split('\t')
                # We need to convert from the word position to the token position
                words = text.split()
                pre_word = " ".join(words[:int(word_position)])
                pre_word_tokenized = tokenizer.tokenize(pre_word)
                token_position = len(
                    pre_word_tokenized
                ) + 1  # taking into account the later addition of the start token
                example_id = id_start + i
                if max_len is None or token_position < max_len - 1:
                    # ignore examples where the relevant token is cut off due to max_len
                    if cls_token:
                        token_position = 0
                    examples.append(
                        Example.fromlist(
                            [sense, lemma, token_position, text, example_id],
                            fields))
                else:
                    print(
                        "Example %d is skipped because the relevant token was cut off (token pos = %d)"
                        % (example_id, token_position))
                    print(text)
        return Dataset(examples, fields)

    dataset = read_data(train_path, fields, max_len)

    random.seed(0)
    trn, vld = dataset.split(0.7, stratified=True, strata_field='sense')

    TEXT.build_vocab([])
    if model_name.startswith('albert') or model_name.startswith(
            'ensemble-distil-'):

        class Mapping:
            def __init__(self, fn):
                self.fn = fn

            def __getitem__(self, item):
                return self.fn(item)

        TEXT.vocab.stoi = Mapping(tokenizer.sp_model.PieceToId)
        TEXT.vocab.itos = Mapping(tokenizer.sp_model.IdToPiece)
    else:
        TEXT.vocab.stoi = tokenizer.vocab
        TEXT.vocab.itos = list(tokenizer.vocab)
    SENSE.build_vocab(trn)
    LEMMA.build_vocab(trn)

    trn_iter = BucketIterator(trn,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=True,
                              sort=True)
    vld_iter = BucketIterator(vld,
                              device=device,
                              batch_size=batch_size,
                              sort_key=lambda x: len(x.text),
                              repeat=False,
                              train=False,
                              sort=True)

    if freeze_base_model:
        for mat in base_model.parameters():
            mat.requires_grad = False  # Freeze Bert model so that we only train the classifier on top

    if reduce_options:
        lemma_mask = defaultdict(
            lambda: torch.zeros(len(SENSE.vocab), device=device))
        for example in trn:
            lemma = LEMMA.vocab.stoi[example.lemma]
            sense = SENSE.vocab.stoi[example.sense]
            lemma_mask[lemma][sense] = 1
        lemma_mask = dict(lemma_mask)

        def mask(batch_logits, batch_lemmas):
            # Masks out the senses that do not belong to the specified lemma
            for batch_i in range(len(batch_logits)):
                lemma = batch_lemmas[batch_i].item()
                batch_logits[batch_i, :] *= lemma_mask[lemma]
            return batch_logits
    else:

        def mask(batch_logits, batch_lemmas):
            return batch_logits

    experiment_name = model_name + " " + (
        classifier_input if not model_name.startswith('ensemble-distil-')
        else "") + " " + str(classifier_hidden_layers) + " (" + (
            " cls_token" if cls_token else "") + (
                " reduce_options" if reduce_options else "") + (
                    " freeze_base_model" if freeze_base_model else ""
                ) + " ) " + "max_len=" + str(max_len) + " batch_size=" + str(
                    batch_size) + " lr=" + str(lr) + " eps=" + str(eps) + (
                        " cache_embeddings" if cache_embeddings else "")

    if model_name.startswith('ensemble-distil-'):
        model = WSDEnsembleModel(last_n_distil, last_n_albert, n_classes,
                                 mask, classifier_hidden_layers)
    else:
        model = WSDModel(base_model, n_classes, mask, use_n_last_layers,
                         model_name, classifier_hidden_layers,
                         cache_embeddings)
    history = None
    #if save_classifier:
    #    if model.load_classifier(experiment_name):
    #        # Existing saved model loaded
    #        # Also load the corresponding training history
    #        history = read_dict_file("results/"+experiment_name+".txt")
    model.cuda()

    print("Starting experiment " + experiment_name)

    if test:
        tst = read_data(test_path, fields, max_len=512)
        tst_iter = Iterator(tst,
                            device=device,
                            batch_size=batch_size,
                            sort=False,
                            sort_within_batch=False,
                            repeat=False,
                            train=False)
        batch_predictions = []
        for batch in tst_iter:
            print('.', end='')
            sys.stdout.flush()
            text = batch.text.t()
            with torch.no_grad():
                outputs = model(text,
                                token_positions=batch.token_pos,
                                lemmas=batch.lemma,
                                example_ids=batch.example_id)
                scores = outputs[-1]
            batch_predictions.append(scores.argmax(dim=1))
        batch_preds = torch.cat(batch_predictions, 0).tolist()
        # itos is a list, so index it rather than calling it
        predicted_senses = [SENSE.vocab.itos[pred] for pred in batch_preds]
        with open("test_predictions/" + experiment_name + ".txt", "w") as out:
            out.write("\n".join(predicted_senses))
    else:
        no_decay = ['bias', 'LayerNorm.weight']
        decay = 0.01
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=eps)

        def save_results(history):
            with open("results/" + experiment_name + ".txt", "w") as out:
                out.write(str(history))
            if save_classifier:
                if len(history['val_acc']) < 2 or history['val_acc'][-1] > max(
                        history['val_acc'][:-1]):
                    model.save_classifier(experiment_name, best=True)
                else:
                    model.save_classifier(experiment_name, best=False)

        train(model, optimizer, trn_iter, vld_iter, n_epochs, save_results,
              history)
def main():
    ntasks = len(tasks)
    data_args = list()
    configuration = list()
    sub_models = list()
    datasets = list()
    # train_iter = list()
    # dev_iter = list()
    # test_iter = list()
    sub_optimizer = list()
    metrics = list()

    tokenizer = DistilBertTokenizer.from_pretrained(bert_path,
                                                    cache_dir=cache_dir)
    for i in range(ntasks):
        logger.info("Tasks:" + tasks[i])
        data_args.append(GlueDataArgs(task_name=tasks[i]))
        configuration.append(
            DistilBertConfig.from_pretrained(
                bert_path,
                num_labels=glue_tasks_num_labels[tasks[i].lower()],
                finetuning_task=data_args[i].task_name,
                cache_dir=cache_dir))
        if use_gpu:
            sub_models.append(SequenceClassification(configuration[i]).cuda())
        else:
            sub_models.append(SequenceClassification(configuration[i]))
        datasets.append(
            GlueDataSets(data_args[i], tokenizer=tokenizer,
                         cache_dir=cache_dir))
        sub_optimizer.append(
            torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate_0))
        metrics.append(ComputeMetrics(data_args[i]))
        logger.info("*** DataSet Ready ***")

    if use_gpu:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True).cuda()
    else:
        Bert_model = DistilBertModel.from_pretrained(bert_path,
                                                     return_dict=True)
    bert_optimizer = torch.optim.AdamW(Bert_model.parameters(),
                                       lr=learning_rate_0)

    # balanced dataset
    train_num = list()
    for i in range(ntasks):
        train_num.append(datasets[i].length("train"))
    #train_nummax =
    #train_num = [x/train_nummax for x in train_num]
    print(train_num)

    iterations = (epochs * max(train_num) // bs) + 1
    #print(iterations)

    sub_scheduler = list()
    for i in range(ntasks):
        sub_scheduler.append(
            torch.optim.lr_scheduler.LambdaLR(
                sub_optimizer[i], lambda step: (1.0 - step / iterations))
        )  #if step <= frozen else learning_rate_1)
    Bert_scheduler = torch.optim.lr_scheduler.LambdaLR(
        bert_optimizer, lambda step: (1.0 - step / iterations)
    )  # if step <= frozen else learning_rate_1

    # datasets[i].dataloader("train", batch_size_train[i])
    train_iter = list()
    for i in range(ntasks):
        train_iter.append(
            GlueIterator(datasets[i].dataloader("train",
                                                batch_size_train[i])))

    for i in range(1, iterations + 1):
        if i > frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
        elif i == frozen:
            for p in Bert_model.parameters():
                p.requires_grad = True
            Bert_model.train()
            logging.info("#####################################")
            logging.info("Release the training of the main model.")
            logging.info("#####################################")
        else:
            for p in Bert_model.parameters():
                p.requires_grad = False
            Bert_model.eval()

        losses = list()
        loss_rates = list()
        for j in range(ntasks):
            sub_models[j].train()
            data = train_iter[j].next()
            if use_gpu:
                input_ids = data['input_ids'].cuda()
                attention_mask = data['attention_mask'].cuda()
                #token_type_ids=data['token_type_ids'].cuda()
                label = data['labels'].cuda()
            else:
                input_ids = data['input_ids']
                attention_mask = data['attention_mask']
                #token_type_ids=data['token_type_ids']
                label = data['labels']
            output_inter = Bert_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict=True)  # token_type_ids=token_type_ids,
            losses.append(sub_models[j](input=output_inter,
                                        labels=label)[0])

        losssum = sum(losses).item()
        for j in range(ntasks):
            loss_rates.append(losses[j].item() / losssum)

        loss = 0
        printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(
            i, iterations, Bert_scheduler.get_lr())
        for j in range(ntasks):
            loss += losses[j] * batch_size_train[j] * loss_rates[j]
            printInfo += ', loss{}-{:.6f}'.format(j, losses[j])
            sub_optimizer[j].zero_grad()
        logging.info(printInfo)

        if i > frozen:
            bert_optimizer.zero_grad()
        loss.backward()
        if i > frozen:
            bert_optimizer.step()
        for j in range(ntasks):
            sub_optimizer[j].step()
            #sub_scheduler[j].step()
        #Bert_scheduler.step()

        if (i % eval_interval == 0):
            evaluate(Bert_model, sub_models, datasets, batch_size_val,
                     metrics, ntasks)
            save_models(Bert_model, sub_models, ntasks, i)

    evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics,
             ntasks)
    save_models(Bert_model, sub_models, ntasks, iterations)
def main():
    """ Main function """

    # Parse cmd line arguments
    args = nlp_parser.parse_arguments()

    source = ""
    subject = ""
    context = ""
    question = ""
    answer = ""
    squadid = ""

    if args:
        if "text" in args:
            if args["text"]:
                source = args["text"]
        if "subject" in args:
            if args["subject"]:
                subject = args["subject"]
        if "context" in args:
            if args["context"]:
                context = args["context"]
        if "question" in args:
            if args["question"]:
                question = args["question"]
                clean_question = nlp.clean(question)
        if "answer" in args:
            if args["answer"]:
                answer = args["answer"]
        if "squadid" in args:
            if args["squadid"]:
                squadid = args["squadid"]
    else:
        sys.exit("Parser didn't return args correctly")

    # Setup the question, either from a specified SQuAD record
    # or from cmd line arguments.
    # If no question details are provided, a random
    # SQuAD example will be chosen.
    if question:
        if source:
            with open(source, "r") as text_file_handle:
                context = text_file_handle.read()
        else:
            print("No text provided, searching SQuAD dev-2.0 dataset")
            squad_data = nlp.import_squad_data()
            squad_records = squad_data.loc[squad_data["clean_question"] ==
                                           clean_question]
            if squad_records.empty:
                sys.exit(
                    "Question not found in SQuAD data, please provide context using `--text`."
                )
            subject = squad_records["subject"].iloc[0]
            context = squad_records["context"].iloc[0]
            question = squad_records["question"].iloc[0]
            answer = squad_records["answer"]
    else:
        squad_data = nlp.import_squad_data()

        if squadid:
            source = args["squadid"]
            squad_records = squad_data.loc[squad_data["id"] == source]
            i_record = 0
        else:
            if subject:
                print(
                    "Picking a question at random on the subject: ",
                    subject,
                )
                squad_records = squad_data.loc[squad_data["subject"] ==
                                               subject]
            else:
                print(
                    "No SQuAD ID or question provided, picking one at random!")
                squad_records = squad_data

            n_records = len(squad_records.index)
            i_record = random.randint(0, max(0, n_records - 1))

        if squad_records.empty:
            sys.exit(
                "No questions found in SQuAD data, please provide valid ID or subject."
            )

        n_records = len(squad_records.index)
        i_record = random.randint(0, n_records - 1)
        source = squad_records["id"].iloc[i_record]
        subject = squad_records["subject"].iloc[i_record]
        context = squad_records["context"].iloc[i_record]
        question = squad_records["question"].iloc[i_record]
        answer = squad_records["answer"].iloc[i_record]

    # DistilBERT question answering using pre-trained model.
    token = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                                return_token_type_ids=True)
    model = TFDistilBertForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased-distilled-squad")

    encoding = token.encode_plus(question,
                                 context,
                                 max_length=512,
                                 truncation=True)
    input_ids, attention_mask = (
        encoding["input_ids"],
        encoding["attention_mask"],
    )
    model_output = model(np.array([input_ids]),
                         attention_mask=np.array([attention_mask]))
    start_scores = model_output.start_logits
    end_scores = model_output.end_logits
    answer_ids = input_ids[np.argmax(start_scores):np.argmax(end_scores) + 1]
    answer_tokens = token.convert_ids_to_tokens(answer_ids,
                                                skip_special_tokens=True)
    answer_tokens_to_string = token.convert_tokens_to_string(answer_tokens)

    # Display results
    print("\nDistilBERT question answering example.")
    print("======================================")
    print("Reading from: ", subject, source)
    print("\nContext: ", context)
    print("--")
    print("Question: ", question)
    print("Answer: ", answer_tokens_to_string)
    print("Reference Answers: ", answer)
if __name__ == '__main__':
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"Using {device}.")

    print(f"Reading {sys.argv[1]}...")
    df = extract_data(sys.argv[1], contain_answers=False).set_index(['id'])
    print("DataFrame created.")

    print("Tokenizing the DataFrame...")
    model = DistilBertKnowledge(alpha=0.5)
    DistilBertTokenizer.from_pretrained(
        model.info.pretrained_model).save_pretrained('slow_tokenizer/')
    tokenizer = BertWordPieceTokenizer('slow_tokenizer/vocab.txt',
                                       lowercase=True)
    df = process_dataframe(df, tokenizer, contain_answers=False)
    print("Tokenization complete.")

    dataset = SquadDataset(df, model.info, contain_answers=False)
    loader = DataLoader(dataset,
                        batch_size=16,
                        num_workers=4,
                        pin_memory=True)

    print("Loading model weights...")
    model.load_state_dict(torch.load('model.pt'))
    model = model.to(device)
    print("Model loaded.")

    model.eval()
    print("Starting evaluation...")
def main():
    args = parse_arguments(sys.argv[1:])
    set_seed(args['random_seed'])

    df = get_train_data()
    test_df = get_test_data()
    NUM_CLASSES = df['label'].nunique()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['sentence'],
        df['label_int'],
        random_state=args['random_seed'],
        test_size=.2)
    print(train_texts.shape, val_texts.shape, train_labels.shape,
          val_labels.shape)

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    train_encodings = tokenizer(train_texts.to_list(),
                                truncation=True,
                                padding=True)
    val_encodings = tokenizer(val_texts.to_list(),
                              truncation=True,
                              padding=True)
    test_encodings = tokenizer(test_df['sentence'].to_list(),
                               truncation=True,
                               padding=True)

    train_dataset = HINTDataset(train_encodings, train_labels.values)
    val_dataset = HINTDataset(val_encodings, val_labels.values)
    test_dataset = HINTDataset(test_encodings, test_df['label_int'].values)

    model = HINTModel(num_classes=NUM_CLASSES)
    device = torch.device('cuda') if torch.cuda.is_available() \
        else torch.device('cpu')
    model.to(device)
    model.ffn.train()

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

    optim = AdamW(model.parameters(), lr=args['learning_rate'])
    loss_fn = nn.CrossEntropyLoss()

    step = 0
    best_acc = 0
    Path(args['model_dir']).mkdir(parents=True, exist_ok=True)
    for epoch in range(args['epochs']):
        train_loss, train_acc, train_f1 = train_fn(model, train_loader,
                                                   loss_fn, optim, device)
        val_loss, val_acc, val_f1 = val_fn(model, val_loader, loss_fn, device)
        print(
            f"{epoch+1}: train: [{train_loss:.3f}, {train_acc:.3f}, {train_f1:.3f}], "
            f"val: [{val_loss:.3f}, {val_acc:.3f}, {val_f1:.3f}]")
        if val_acc > best_acc:
            best_acc = val_acc
            step = 0
            torch.save(model.state_dict(),
                       f"{args['model_dir']}/{args['model_path']}")
        else:
            step += 1
        if step >= args['max_steps']:
            break

    model.load_state_dict(
        torch.load(f"{args['model_dir']}/{args['model_path']}",
                   map_location=device))
    print("model successfully loaded!")

    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    preds, probs = inference_fn(model, test_loader, device)
    test_df['preds'] = preds
    test_df['probs'] = probs
    test_df['label_int'] = test_df['label_int'].fillna(NUM_CLASSES + 1)
    test_df['updated_preds'] = test_df['preds']
    test_df.loc[test_df['probs'] <= args['min_prob'],
                'updated_preds'] = NUM_CLASSES + 1

    Path(args['output_dir']).mkdir(parents=True, exist_ok=True)
    test_df.to_csv(f"{args['output_dir']}/{args['test_file_name']}",
                   index=False)

    acc1 = accuracy_score(test_df['label_int'], test_df['preds'])
    acc2 = accuracy_score(test_df['label_int'], test_df['updated_preds'])
    f11 = f1_score(test_df['label_int'], test_df['preds'], average='weighted')
    f12 = f1_score(test_df['label_int'], test_df['updated_preds'],
                   average='weighted')
    print(f"Default: acc: {acc1}, f1_score: {f11}")
    print(f"Updated with Min Prob: acc: {acc2}, f1_score: {f12}")
def load(cls,
         pretrained_model_name_or_path,
         revision=None,
         tokenizer_class=None,
         use_fast=True,
         **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface.
    Either infer the class from model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
    :type revision: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
    :type tokenizer_class: str
    :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or use the Python one (False). Only DistilBERT, BERT and Electra fast tokenizers are supported.
    :type use_fast: bool
    :param kwargs:
    :return: Tokenizer
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    kwargs["revision"] = revision

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(
            pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
    # return appropriate tokenizer object
    ret = None
    if "AlbertTokenizer" in tokenizer_class:
        if use_fast:
            ret = AlbertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "XLMRobertaTokenizer" in tokenizer_class:
        if use_fast:
            ret = XLMRobertaTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "RobertaTokenizer" in tokenizer_class:
        if use_fast:
            ret = RobertaTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "DistilBertTokenizer" in tokenizer_class:
        if use_fast:
            ret = DistilBertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "BertTokenizer" in tokenizer_class:
        if use_fast:
            ret = BertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = BertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "XLNetTokenizer" in tokenizer_class:
        if use_fast:
            ret = XLNetTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        else:
            ret = XLNetTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
    elif "ElectraTokenizer" in tokenizer_class:
        if use_fast:
            ret = ElectraTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = ElectraTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error(
                'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
            )
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = EmbeddingTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "CamembertTokenizer" in tokenizer_class:
        if use_fast:
            ret = CamembertTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = CamembertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
        if use_fast:
            ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRQuestionEncoderTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "DPRContextEncoderTokenizer" in tokenizer_class:
        if use_fast:
            ret = DPRContextEncoderTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = DPRContextEncoderTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    elif "BigBirdTokenizer" in tokenizer_class:
        if use_fast:
            ret = BigBirdTokenizerFast.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        else:
            ret = BigBirdTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    else:
        return ret
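# Usage sketch, assuming `load` above is a classmethod on FARM's `Tokenizer`
# class; the tokenizer class is inferred from the model config unless
# `tokenizer_class` is passed explicitly:
tokenizer = Tokenizer.load("distilbert-base-uncased")       # fast tokenizer by default
slow_tokenizer = Tokenizer.load("distilbert-base-uncased",
                                use_fast=False)             # plain Python tokenizer
bert_tok = Tokenizer.load("my/local/model",
                          tokenizer_class="BertTokenizer")  # hypothetical local path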
def main():
    parser = setup_parser()
    args = parser.parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model == 'biobert' or args.bert_model == 'clinical_bert':
        args.bert_model = args.model_loc

    print(args.bert_model)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "mednli": MedNLIProcessor,
        "goc": GOCProcessor
    }

    num_labels_task = {"cola": 2, "mnli": 3, "mrpc": 2, "mednli": 3, "goc": 2}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    #if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    #if not os.path.exists(args.output_dir):
    #    os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = DistilBertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print([(train[i].text_a, train[i].text_b, train[i].label)
           for i in range(3)])
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print([(dev[i].text_a, dev[i].text_b, dev[i].label) for i in range(3)])
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print([(test[i].text_a, test[i].text_b, test[i].label) for i in range(3)])

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(
            args.local_rank))
    model = DocDistilBertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.freeze_bert:
        print("FREEZING BERT")
        for param in model.distilbert.parameters():
            param.requires_grad = False
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=float(num_train_optimization_steps) *
        args.warmup_proportion,
        num_training_steps=num_train_optimization_steps)
    #optimizer = BertAdam(optimizer_grouped_parameters,
    #                     lr=args.learning_rate,
    #                     warmup=args.warmup_proportion,
    #                     t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_documents_to_features(train_examples,
                                                       label_list,
                                                       args.max_seq_length,
                                                       tokenizer)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        print(len(train_features[0].input_ids))
        print(len(train_features[0].input_ids[0]))
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, logits, other = model(input_ids=input_ids,
                                            attention_mask=input_mask,
                                            labels=label_ids)
                #print(loss[0].shape)
                #print(loss[1].shape)
                #print(loss[2].shape)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Saving checkpoint
            save_checkpoint(model, args.output_dir,
                            "epoch_%d_checkpoint.pth" % epoch_num)

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        #config = DistilBertConfig(output_config_file)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.output_dir)  #, num_labels=num_labels)
        #model.load_state_dict(torch.load(output_model_file))
    else:
        model = DistilBertForSequenceClassification.from_pretrained(
            args.bert_model)  #, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_documents_to_features(eval_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits, other = model(input_ids=input_ids,
                                                     attention_mask=input_mask,
                                                     labels=label_ids)
                # logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_documents_to_features(test_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running testing *****")
        logger.info(" Num examples = %d", len(test_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features],
                                     dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                test_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                #tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
                tmp_test_loss, logits, other = model(input_ids=input_ids,
                                                     attention_mask=input_mask,
                                                     labels=label_ids)
                #logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_test_accuracy = accuracy(logits, label_ids)

            test_loss += tmp_test_loss.mean().item()
            test_accuracy += tmp_test_accuracy

            nb_test_examples += input_ids.size(0)
            nb_test_steps += 1

        test_loss = test_loss / nb_test_steps
        test_accuracy = test_accuracy / nb_test_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'global_step': global_step,
            'loss': loss
        }

        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# Load MRPC data
data = tensorflow_datasets.load('glue/mrpc')

# Pick GPU device (only pick 1 GPU)
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

# Load tokenizer, model from pretrained model/vocabulary
bert_tokenizer = BertTokenizer.from_pretrained('mrpc/1')
bert_model = TFBertForSequenceClassification.from_pretrained('mrpc/1')

valid_dataset = glue_convert_examples_to_features(data['validation'],
                                                  bert_tokenizer,
                                                  max_length=128,
                                                  task='mrpc')
valid_dataset = valid_dataset.batch(64)

# Evaluate time for bert_model (bigger model); keep its timing in its own
# variable so it isn't overwritten by the DistilBERT run below
start_time = time.time()
results = bert_model.predict(valid_dataset)
bert_execution_time = time.time() - start_time

# Load tokenizer, model from pretrained model/vocabulary
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('mrpc/2')
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('mrpc/2')

valid_dataset = glue_convert_examples_to_features(data['validation'],
                                                  distilbert_tokenizer,
                                                  max_length=128,
                                                  task='mrpc')
valid_dataset = valid_dataset.batch(64)

# Evaluate time for distilbert_model (smaller model)
start_time = time.time()
results = distilbert_model.predict(valid_dataset)
distilbert_execution_time = time.time() - start_time
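# A small follow-up sketch comparing the two timings captured above (uses the
# per-model timer variables introduced in this snippet):
print("BERT prediction time:       %.2fs" % bert_execution_time)
print("DistilBERT prediction time: %.2fs" % distilbert_execution_time)
print("Speed-up: %.2fx" % (bert_execution_time / distilbert_execution_time))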
# handling model output
import torch

# the place where we have our DistilBERT model
output_dir = "./model_save"

# maximum length of the comment/tweet; median lies below 64.
MAX_LEN = 64

# telling pytorch to use CPU for predicting outputs
device = torch.device("cpu")

# Load a trained model and vocabulary that you have fine-tuned
model = DistilBertForSequenceClassification.from_pretrained(output_dir)
tokenizer = DistilBertTokenizer.from_pretrained(output_dir)

# Move the model to the selected device (CPU here).
model.to(device)

# softmax layer for converting predicted logits into probability
# (dim=1 is the class dimension for a batch of logits)
soft = torch.nn.Softmax(dim=1)


def predict_sentiment(sentences):
    """Produces sentiment analysis on a list of sentences

    Args:
        sentences: Takes in a list of sentences

    Returns:
def fit(self, series: pd.Series):
    if self.tokenize_str == "bert":
        if self.doLower:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        else:
            tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

        def generate_BERT_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True)
            return (toks["input_ids"], toks["attention_mask"])

        self.tokenizer = generate_BERT_vectors
    elif self.tokenize_str == "distilbert":
        if self.doLower:
            tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-uncased')
        else:
            tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-cased')

        def generate_DistilBERT_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True)
            return (toks["input_ids"], toks["attention_mask"])

        self.tokenizer = generate_DistilBERT_vectors
    elif self.tokenize_str == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

        def generate_RoBERTa_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True)
            return (toks["input_ids"], toks["attention_mask"])

        self.tokenizer = generate_RoBERTa_vectors
    elif self.tokenize_str == "fasttext":
        embeddingModel = fasttext.load_model(self.fasttextFile)

        def generate_fasttext_vectors(s):
            words = word_tokenize(s)
            words_embed = [
                embeddingModel.get_word_vector(w) for w in words
                if w.isalpha()
            ]
            return words_embed

        self.tokenizer = generate_fasttext_vectors
    elif self.tokenize_str == "bow":
        vectorizer = CountVectorizer()
        vectorizer.fit(series)
        self.tokenizer = vectorizer.transform
    elif self.tokenize_str == "tfidf":
        vectorizer = TfidfVectorizer()
        vectorizer.fit(series)
        self.tokenizer = vectorizer.transform
def __init__(self,
             binaryClassification: bool,
             model_str: str,
             doLower: bool,
             train_batchSize: int,
             testval_batchSize: int,
             learningRate: float,
             doLearningRateScheduler: bool,
             labelSentences: dict = None,
             max_label_len=None,
             model=None,
             optimizer=None,
             device="cpu"):
    self.binaryClassification = binaryClassification
    self.labelSentences = labelSentences
    self.model_str = model_str
    self.tokenizer = None
    self.device = device
    self.train_batchSize = train_batchSize
    self.testval_batchSize = testval_batchSize
    self.learningRate = learningRate
    self.optimizer = optimizer
    self.doLearningRateScheduler = doLearningRateScheduler
    self.learningRateScheduler = None
    self.max_label_len = max_label_len

    if self.binaryClassification:
        self.num_labels = 1
    else:
        self.num_labels = len(self.labelSentences.keys())

    if self.model_str == "distilbert":
        if doLower:
            self.model = DistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-uncased',
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False)
            self.tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-uncased')
        else:
            self.model = DistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-cased',
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False)
            self.tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-cased')
    elif self.model_str == "bert":
        if doLower:
            self.model = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased',
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False)
            self.tokenizer = BertTokenizer.from_pretrained(
                'bert-base-uncased')
        else:
            self.model = BertForSequenceClassification.from_pretrained(
                'bert-base-cased',
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    elif self.model_str == "roberta":
        self.model = RobertaForSequenceClassification.from_pretrained(
            'distilroberta-base',
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            'distilroberta-base')
    else:
        if model:
            if binaryClassification:
                self.model = dict()
                for key in self.labelSentences.keys():
                    self.model[key] = model
            else:
                self.model = model
        else:
            logging.error(
                "If model_str is not predefined, a model needs to be given.")
            sys.exit(
                "If model_str is not predefined, a model needs to be given.")
def load_distillbert() -> Tuple[DistilBertTokenizer, DistilBertModel]:
    model = DistilBertModel.from_pretrained(PRETRAINED_WEIGHTS)
    tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_WEIGHTS)
    model.eval()
    return tokenizer, model
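# Usage sketch for the loader above, assuming PRETRAINED_WEIGHTS names a
# DistilBERT checkpoint such as 'distilbert-base-uncased':
import torch

tokenizer, model = load_distillbert()
inputs = tokenizer("An example sentence.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# last_hidden_state has shape (batch, seq_len, hidden_size); on older
# transformers versions the model returns a tuple, so use outputs[0] instead
print(outputs.last_hidden_state.shape)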
import numpy as np
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertTokenizer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# load huggingface
distilbert = tf.keras.models.load_model('model\\transformer')
model_name = 'distilbert-base-uncased'
huggingface_tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# load keras
bilstm = tf.keras.models.load_model('model\\bilstm')
with open('tokenizer.pickle', 'rb') as handle:
    keras_tokenizer = pickle.load(handle)


def huggingface_classify(input_text, tokenizer, model, max_len=120):
    clean = re.sub(r"[-()\"#/@;:<>{}=~|.?,]", "", str(input_text))
    if 'user' in clean:
        # str.strip removes leading/trailing 'u'/'s'/'e'/'r' characters;
        # the result must be assigned back for the cleanup to take effect
        clean = clean.strip('user')
    tokens = [
        tokenizer.encode_plus(t,
                              max_length=max_len,
                              pad_to_max_length=True,
                              add_special_tokens=True) for t in [clean]
    ]
    tensor = np.array([a['input_ids'] for a in tokens])
    results = model.predict(tensor)
    results = np.argmax(results, axis=1)
    return results
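# Usage sketch for the helper above, with the DistilBERT Keras model and
# tokenizer loaded at the top of this snippet:
label = huggingface_classify("this movie was great!",
                             huggingface_tokenizer,
                             distilbert)
print(label)  # array holding the argmax class index for the single input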
tags_vals = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
tags_vals.append("PAD")
tag2idx = {t: i for i, t in enumerate(tags_vals)}
print("tags_vals: ", tags_vals)
print("tag2idx: ", tag2idx)

MAX_LEN = 512
bs = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# print("GPU name: ", torch.cuda.get_device_name(0))

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                do_lower_case=True)


def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels
print(f"Validation Accuracy Epoch: {epoch_accu}") return epoch_loss, epoch_accu, y_test_actual, y_test_predicted, y_test_predicted_prob_list ################################################################## # Prepare the data df_path = config.df_path # This will give reduced sentiment [FYI: Its excepting preprocessed dataframe] df_new_reduced, sentiment_map, sentiment_demap = utility.data_process( dataset_path=df_path) # Initiate the tokenizer bert_tokenizer = DistilBertTokenizer.from_pretrained( config.PRE_TRAINED_MODEL_NAME) # Creating instance of Preprocess # This Preprocess internally Triage class # This will split data and encode using passing tokenizer # Creating instance of the class Preprocess = prepare_data.Preprocess(dataframe=df_new_reduced, tokenizer=bert_tokenizer, max_len=config.MAX_LEN, train_batch_size=config.TRAIN_BATCH_SIZE, valid_batch_size=config.VALID_BATCH_SIZE, test_batch_size=config.TEST_BATCH_SIZE) # Accessing the process_data_for_model method of Preprocess class training_loader, valid_loader, testing_loader = Preprocess.process_data_for_model( )
print(f"URL:{s3_url}") os.makedirs(os.path.join(path), exist_ok=True) filename = Path(path_to_model) r = requests.get(s3_url) filename.write_bytes(r.content) return path_to_model, path pretrained_weights = 'distilbert-base-cased' path_to_model, path = download_model( "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-pytorch_model.bin", "pytorch_model.bin") tokenizer = DistilBertTokenizer.from_pretrained(path) bert_model = DistilBertModel.from_pretrained(path) labels_list = [ "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate" ] class linear_model(nn.Module): def __init__(self, bert_model, num_labels): super().__init__() embed_size = bert_model.config.hidden_size if pretrained_weights == 'distilbert-base-cased': dropout_prob = bert_model.config.dropout else:
def fit(self, series: pd.Series):
    if self.args["tokenizer"] == "bert":
        if self.doLower:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        else:
            tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

        def generate_BERT_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True,
                             max_length=self.max_length)
            return (toks["input_ids"], toks["attention_mask"])

        def tokenizer_fun(series):
            return pd.Series(series).progress_apply(
                generate_BERT_vectors).values

        self.tokenizer = tokenizer_fun
    elif self.args["tokenizer"] == "distilbert":
        if self.doLower:
            # a German uncased DistilBERT should be used, but no pretrained model exists
            tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-uncased')
        else:
            tokenizer = DistilBertTokenizer.from_pretrained(
                'distilbert-base-cased')

        def generate_DistilBERT_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True,
                             max_length=self.max_length)
            return (toks["input_ids"], toks["attention_mask"])

        def tokenizer_fun(series):
            return pd.Series(series).progress_apply(
                generate_DistilBERT_vectors).values

        self.tokenizer = tokenizer_fun
    elif self.args["tokenizer"] == "xlnet":
        # an uncased XLNet should be used, but no pretrained model exists,
        # so both the doLower and cased paths fall back to 'xlnet-base-cased'
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

        def generate_XLM_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding=True,
                             truncation=True,
                             max_length=self.max_length)
            return (toks["input_ids"], toks["attention_mask"])

        # The original assigned self.tokenizer = generate_XLM_vectors here,
        # which was immediately overwritten below; that dead store is dropped.
        def tokenizer_fun(series):
            return pd.Series(series).progress_apply(
                generate_XLM_vectors).values

        self.tokenizer = tokenizer_fun
    elif self.args["tokenizer"] == "roberta":
        # an uncased RoBERTa should be used, but no pretrained model exists,
        # so both paths fall back to 'roberta-base'
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def generate_Roberta_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True,
                             max_length=self.max_length)
            return (toks["input_ids"], toks["attention_mask"])

        def tokenizer_fun(series):
            return pd.Series(series).progress_apply(
                generate_Roberta_vectors).values

        self.tokenizer = tokenizer_fun
    elif self.args["tokenizer"] == "distilroberta":
        # an uncased DistilRoBERTa should be used, but no pretrained model exists
        tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')

        def generate_DistilRoberta_vectors(s):
            toks = tokenizer(s,
                             return_attention_mask=True,
                             padding="max_length",
                             truncation=True,
                             max_length=self.max_length)
            return (toks["input_ids"], toks["attention_mask"])

        def tokenizer_fun(series):
            return pd.Series(series).progress_apply(
                generate_DistilRoberta_vectors).values

        self.tokenizer = tokenizer_fun
    elif "fasttext" in self.args["tokenizer"]:
        embeddingModel = fasttext.load_model(self.fasttextFile)

        def generate_fasttext_vectors(s):
            words = word_tokenize(s)
            if "mean" in self.args["tokenizer"]:
                words_embed = [
                    embeddingModel.get_word_vector(w) for w in words
                    if w.isalpha()
                ]
                words_embed = np.column_stack(words_embed).mean(axis=1)
            elif "max" in self.args["tokenizer"]:
                words_embed = [
                    embeddingModel.get_word_vector(w) for w in words
                    if w.isalpha()
                ]
                words_embed = np.column_stack(words_embed).max(axis=1)
            else:
                words = words[:self.max_length]
                words_embed = [
                    embeddingModel.get_word_vector(w) for w in words
                    if w.isalpha()
                ]
            return words_embed

        def tokenizer_fun(series):
            if ("mean" in self.args["tokenizer"]
                    or "max" in self.args["tokenizer"]):
                return np.row_stack(
                    pd.Series(series).progress_apply(
                        generate_fasttext_vectors).values)
            else:
                return pd.Series(series).progress_apply(
                    generate_fasttext_vectors).values

        self.tokenizer = tokenizer_fun
    elif self.args["tokenizer"] == "bow":
        vectorizer = CountVectorizer(ngram_range=(1, self.args["ngram"]),
                                     lowercase=self.doLower)
        vectorizer.fit(series)

        def tokenizer_fun(series):
            return vectorizer.transform(series)

        self.tokenizer = tokenizer_fun
    elif self.args["tokenizer"] == "tfidf":
        vectorizer = TfidfVectorizer(ngram_range=(1, self.args["ngram"]),
                                     lowercase=self.doLower)
        vectorizer.fit(series)

        def tokenizer_fun(series):
            return vectorizer.transform(series)

        self.tokenizer = tokenizer_fun
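A hedged usage sketch; the enclosing class is not shown, so `featurizer` stands in for an instance configured with `args`, `doLower`, `max_length`, and (for fasttext) `fasttextFile`:

featurizer.fit(train_df["text"])                   # selects and fits the vectorizer
X_train = featurizer.tokenizer(train_df["text"])   # afterwards, encode any series
X_test = featurizer.tokenizer(test_df["text"])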
def main():
    n_colors = 2
    opt = parser.parse_args()
    data_path = os.path.join(os.getcwd(), 'gp_debias', 'wordlist', opt.lang,
                             'occupation_stereotype_list.tsv')
    if opt.model == 'y':
        device = torch.device('cpu')
        n_model = SequenceClassifier(model_name='distilbert-base-uncased',
                                     num_labels=3,
                                     cache_dir='./cache')
        state_dict = torch.load("trained_1575511705.pth", map_location=device)

        # Create a new OrderedDict without the `module.` prefix that
        # DataParallel adds to parameter names.
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v

        # load params (the original loaded the unstripped state_dict here,
        # which defeats the renaming above)
        n_model.model.load_state_dict(new_state_dict)
        model = n_model.model.distilbert
        print('loaded model')
    else:
        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model.eval()
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    stereo_list = get_stereotype_words(data_path)

    if opt.load == 'n':
        basis = create_subspace(opt.lang, model, tokenizer)
    else:
        basis = load_subspace()

    X_vecs, sentences = proj_gen_space(tokenizer, model, stereo_list, basis,
                                       opt.lang)
    norms = norm(X_vecs, axis=2)

    import pprint
    pp = pprint.PrettyPrinter(indent=2)
    for i in range(len(sentences)):
        pp.pprint(
            sorted(list(zip(norms[i], sentences[i][0])), key=lambda x: x[0]))

    stereo_vecs = np.zeros((X_vecs.shape[0], X_vecs.shape[2]))
    for s in range(len(sentences)):
        stereo_vecs[s] = X_vecs[s, sentences[s][1], :]

    sent2bert, labeled_words, vecs_labels = train_kmeans(
        stereo_vecs, stereo_list, n_colors)
    # if opt.load == 'n':
    train_svm(vecs_labels)
    print(sorted(labeled_words, key=lambda x: x[1]))
    pca_viz(stereo_vecs, labeled_words, n_colors)

    scores = score_vectors(tokenizer, model, stereo_list, basis, opt.lang)
    stereo_scores = list(reversed(sorted(scores, key=lambda x: x[1])))
    gen_df(labeled_words, stereo_scores)
    print('Done')
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',
                                                return_token_type_ids=True)
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad')

data = pd.read_csv('examples.csv')
for idx, row in data.iterrows():
    context = row['context']
    question = row['question']
    encoding = tokenizer.encode_plus(question, context)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    # Score every position as a potential answer start/end.
    with torch.no_grad():
        start_scores, end_scores = model(
            torch.tensor([input_ids]),
            attention_mask=torch.tensor([attention_mask])).values()

    # The answer span runs from the best start to the best end (inclusive).
    ans_tokens = input_ids[torch.argmax(start_scores):torch.argmax(end_scores) + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens,
                                                    skip_special_tokens=True)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    print(answer_tokens_to_string)
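The loop assumes examples.csv provides 'context' and 'question' columns, for instance:

context,question
"The Eiffel Tower is located in Paris.","Where is the Eiffel Tower?"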
                    default=None,
                    help='output model path and name')
parser.add_argument('--benchmark',
                    action='store_true',
                    default=False,
                    help='Get benchmark performance of quantized model.')
parser.add_argument('--benchmark_nums',
                    type=int,
                    default=1000,
                    help="Number of samples to benchmark")
parser.add_argument('--accuracy_only',
                    action='store_true',
                    default=False,
                    help="Mode of benchmark")
args = parser.parse_args()

tokenizer = DistilBertTokenizer.from_pretrained(args.input_dir,
                                                do_lower_case=True)
eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (
    args.task_name, )
eval_dataset = load_and_cache_examples(args,
                                       args.task_name,
                                       tokenizer,
                                       evaluate=True)

# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset,
                             sampler=eval_sampler,
                             batch_size=args.eval_batch_size)


def eval_func(model):
    return evaluate_onnxrt(args, model, tokenizer, eval_dataloader)
from tqdm import tqdm
from joblib import dump, load
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import torch
from transformers import DistilBertTokenizer, DistilBertModel

PREPROCESS = False
MICRO_MULT = 10

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
transformer = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                              output_hidden_states=True)


def embed(x):
    x = tokenizer.encode(x, add_special_tokens=True)
    x = torch.tensor([x])
    with torch.no_grad():
        # last layer hidden state, shape (bs, seq_len, hid_dim)
        last_hs = transformer(x)[0]
    # Keep the hidden state of the final token as the sequence embedding.
    final_hs = torch.squeeze(last_hs, 0)[-1, :]
    emb_X = final_hs.detach().numpy()
    return emb_X
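A short sketch of feeding these embeddings to one of the imported classifiers; `texts` and `y` are assumed to be a list of strings and matching labels:

import numpy as np

X = np.stack([embed(t) for t in texts])  # (n_samples, 768)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2)
clf = SVC().fit(X_tr, y_tr)
print(acc(y_te, clf.predict(X_te)))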
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--debug",
                        default=False,
                        action='store_true',
                        help="Whether to run in debug mode.")
    parser.add_argument("--data_dir",
                        default='data/semeval_14',
                        type=str,
                        help="SemEval data dir")
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SemEval xml for training")
    parser.add_argument("--predict_file",
                        default=None,
                        type=str,
                        help="SemEval csv for prediction")
    parser.add_argument("--extraction_file",
                        default=None,
                        type=str,
                        help="pkl file for extraction")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length",
        default=96,
        type=int,
        help="The maximum total input sequence length after WordPiece "
        "tokenization. Sequences longer than this will be truncated, and "
        "sequences shorter than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_pipeline",
                        default=False,
                        action='store_true',
                        help="Whether to run pipeline on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=8,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup "
        "for. E.g., 0.1 = 10% of training.")
    parser.add_argument(
        "--save_proportion",
        default=0.5,
        type=float,
        help="Proportion of steps to save models for. E.g., 0.5 = 50% "
        "of training.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help="If true, all of the warnings related to data processing will be "
        "printed. A number of warnings are expected for a normal SQuAD "
        "evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a "
        "backward/update pass.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help="Whether to perform optimization and keep the optimizer averages "
        "on CPU")
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    args = parser.parse_args()

    if not args.do_train and not args.do_predict and not args.do_pipeline:
        raise ValueError(
            "At least one of `do_train`, `do_predict` or `do_pipeline` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training")
            # (see https://github.com/pytorch/pytorch/pull/13496)
            args.fp16 = False
    logger.info(
        "torch_version: {} device: {} n_gpu: {}, distributed training: {}, "
        "16-bits training: {}".format(torch.__version__, device, n_gpu,
                                      bool(args.local_rank != -1), args.fp16))

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint_ml_cls.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance_ml_cls.txt')
    network_path = os.path.join(args.output_dir, 'network_ml_cls.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter_ml_cls.txt')
    predictions_path = os.path.join(args.output_dir, 'predictions_ml_cls.txt')

    logger.info("***** Preparing model *****")
    model = DistillBertForMultilabelClassification()
    model.to(device)

    # Resume from an existing checkpoint if one is present. The original
    # tested `not os.path.isfile(save_path)` before loading save_path and
    # built Adam with no parameters, both of which would crash on this path.
    if args.init_checkpoint is not None and os.path.isfile(save_path):
        checkpoint = torch.load(save_path, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        optimizer = torch.optim.Adam(params=model.parameters(),
                                     lr=args.learning_rate)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        logger.info(
            "Loading model from pretrained checkpoint: {}".format(save_path))
    else:
        optimizer = torch.optim.Adam(params=model.parameters(),
                                     lr=args.learning_rate)

    logger.info("***** Preparing data *****")
    training_loader, testing_loader = read_train_data(args, tokenizer, logger)

    if args.do_train:
        logger.info("***** Preparing training *****")

        def loss_fn(outputs, targets):
            return torch.nn.BCEWithLogitsLoss()(outputs, targets)

        def calcuate_accu(big_idx, targets):
            n_correct = (big_idx == targets).sum().item()
            return n_correct

        def train(epoch):
            tr_loss = 0
            n_correct = 0
            nb_tr_steps = 0
            nb_tr_examples = 0
            model.train()
            for _, data in enumerate(training_loader, 0):
                ids = data['ids'].to(device, dtype=torch.long)
                mask = data['mask'].to(device, dtype=torch.long)
                # token_type_ids are only needed for the full BERT variant
                # token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)

                # outputs = model(ids, mask, token_type_ids)
                outputs = model(ids, mask)
                loss = loss_fn(outputs, targets)
                tr_loss += loss.item()
                big_val, big_idx = torch.max(outputs.data, dim=1)
                # n_correct += calcuate_accu(big_idx, targets)
                nb_tr_steps += 1
                nb_tr_examples += targets.size(0)

                if _ % 5000 == 0:
                    loss_step = tr_loss / nb_tr_steps
                    accu_step = (n_correct * 100) / nb_tr_examples
                    print(f"Training Loss per 5000 steps: {loss_step}")
                    print(f"Training Accuracy per 5000 steps: {accu_step}")

                # The original called optimizer.zero_grad() twice per step;
                # once before backward() is enough.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print(f'The Total Accuracy for Epoch {epoch}: '
                  f'{(n_correct * 100) / nb_tr_examples}')
            epoch_loss = tr_loss / nb_tr_steps
            epoch_accu = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss Epoch: {epoch_loss}")
            print(f"Training Accuracy Epoch: {epoch_accu}")
            return

        for epoch in range(args.num_train_epochs):
            train(epoch)
            torch.save(
                {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'step': 0
                }, save_path)

        logger.info("***** Running validation *****")
        f = open(log_path, "a")
        for epoch in range(3):
            outputs, targets = validation(args, model, device, testing_loader)
            outputs = np.array(outputs) >= 0.5
            from sklearn import metrics
            accuracy = metrics.accuracy_score(targets, outputs)
            recall = metrics.recall_score(targets, outputs, average='samples')
            f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
            f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
            f1 = metrics.f1_score(targets, outputs, average='samples')
            # the first figure is accuracy, not precision as the old label said
            print("epoch: {}, Acc: {:.4f}, R: {:.4f}, F1: {:.4f}".format(
                epoch + 1, accuracy, recall, f1),
                  file=f)
            print(" ", file=f)
        f.close()
        print("epoch: {}, Acc: {:.4f}, R: {:.4f}, F1: {:.4f}".format(
            epoch + 1, accuracy, recall, f1))

    if args.do_predict:
        logger.info("***** Running prediction *****")
        # restore from best checkpoint
        if save_path and os.path.isfile(save_path) and args.do_train:
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            logger.info(
                "Loading model from finetuned checkpoint: '{}'".format(save_path))
        model.eval()
        results = validation(args,
                             model,
                             device,
                             testing_loader,
                             write_pred=True,
                             predictions_path=predictions_path)
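The `validation` helper called above is defined elsewhere; a minimal compatible sketch, where the sigmoid output and the prediction file format are assumptions:

def validation(args, model, device, loader, write_pred=False, predictions_path=None):
    model.eval()
    all_outputs, all_targets = [], []
    with torch.no_grad():
        for data in loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = torch.sigmoid(model(ids, mask))  # per-label probabilities
            all_outputs.extend(outputs.cpu().numpy().tolist())
            all_targets.extend(targets.cpu().numpy().tolist())
    if write_pred and predictions_path:
        with open(predictions_path, 'w') as f:
            for row in all_outputs:
                f.write(str(row) + '\n')
    return all_outputs, all_targets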
        if not discard:
            train = list(train) + [x for x in test if x in no_test]
            test = [x for x in test if x not in no_test]
        yield (train, test)


from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf
import tensorflow_addons as tfa

tf.config.threading.set_intra_op_parallelism_threads(8)
tf.config.threading.set_inter_op_parallelism_threads(8)

MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)


def create_train_val(x, y, train, val):
    train_encodings = tokenizer(list(x[train].values),
                                truncation=True,
                                padding=True)
    val_encodings = tokenizer(list(x[val].values), truncation=True, padding=True)
    train_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(train_encodings), list(y[train].values)))
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (dict(val_encodings), list(y[val].values)))
    return train_dataset, val_dataset
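A hedged sketch of fine-tuning on one split using the datasets built above; `x`, `y`, `train_idx`, and `val_idx` are assumed to come from the surrounding cross-validation loop:

train_ds, val_ds = create_train_val(x, y, train_idx, val_idx)
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                              num_labels=2)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
model.fit(train_ds.shuffle(1000).batch(16),
          validation_data=val_ds.batch(16),
          epochs=2)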
def __init__(self, config):
    # self.name, self.num_classes, epochs, batchs
    self.Configs = config
    self.num_classes = len(config.label_list)
    self.train_logits = []
    self.validation_logits = []
    self.test_logits = []
    self.train_texts = []
    self.train_labels = []
    self.validation_texts = []
    self.validation_labels = []
    self.test_texts = []
    self.test_labels = []

    train = pd.read_csv(os.path.join(self.Configs.data_dir, 'train.csv'))
    try:
        dev = pd.read_csv(os.path.join(self.Configs.data_dir, 'dev.csv'))
    except FileNotFoundError:
        print('Validation disabled.')
    test = pd.read_csv(os.path.join(self.Configs.data_dir, 'test.csv'))

    self.train_texts = train['text'].tolist()
    self.train_labels = train['label'].tolist()
    try:
        self.validation_texts = dev['text'].tolist()
        self.validation_labels = dev['label'].tolist()
    except NameError:  # no dev.csv was loaded above
        pass
    self.test_texts = test['text'].tolist()
    # Test labels are unknown; fill with zeros as placeholders.
    for i in range(len(self.test_texts)):
        self.test_labels.append(0)

    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        self.device = torch.device("cpu")

    if self.Configs.model_name == 'albert':
        self.model = AlbertForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = AlbertTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)
    if self.Configs.model_name == 'distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)
    if self.Configs.model_name == 'roberta':
        self.model = RobertaForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)

    if torch.cuda.is_available():
        self.model.cuda()
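Based only on the attributes read above, the expected config object looks roughly like this; the class below is a hypothetical stand-in, not the project's real config:

class Configs:  # hypothetical stand-in for the expected config object
    label_list = ['negative', 'positive']          # one entry per class
    data_dir = './data'                            # holds train/dev/test.csv
    model_name = 'distilbert'                      # 'albert', 'distilbert', or 'roberta'
    pretrained_model_dir = 'distilbert-base-uncased'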