def __init__(self, max_seq_len=MAX_LEN, batch_size=BATCH_SIZE,
             n_epochs=N_EPOCHS, val_size=0.1,
             learning_rate=LEARNING_RATE, load_local_pretrained=False):
    self.max_seq_len = max_seq_len
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.val_size = val_size
    self.learning_rate = learning_rate

    # Load tokenizer and model from pretrained model/vocabulary
    self.tokenizer = DistilBertTokenizerFast.from_pretrained(
        BERTMODEL, do_lower_case=False)
    if load_local_pretrained:
        self.model = TFDistilBertForSequenceClassification.from_pretrained(
            MODEL_PATH)
    else:
        config = DistilBertConfig.from_pretrained(BERTMODEL, num_labels=2)
        self.model = TFDistilBertForSequenceClassification.from_pretrained(
            BERTMODEL, config=config)
    # Freeze the DistilBERT encoder so only the classification head trains
    self.model.distilbert.trainable = False
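# A minimal sketch (not from the original source) of how a classifier built as
# above could be compiled and fine-tuned with Keras, assuming a TF/transformers
# combination like the one used in the GLUE snippet at the end of this section.
# `clf` is an instance of the class above; train_texts/train_labels are
# hypothetical inputs.
import tensorflow as tf

def compile_and_fit(clf, train_texts, train_labels):
    # Tokenize to fixed-length tensors expected by TFDistilBertForSequenceClassification
    enc = clf.tokenizer(list(train_texts), truncation=True, padding="max_length",
                        max_length=clf.max_seq_len, return_tensors="tf")
    clf.model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=clf.learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )
    clf.model.fit(dict(enc), tf.constant(train_labels),
                  batch_size=clf.batch_size,
                  epochs=clf.n_epochs,
                  validation_split=clf.val_size)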
def __load(self):
    dbertConf = DistilBertConfig.from_pretrained(self.path + '/config.json')
    self.model = TFDistilBertForSequenceClassification.from_pretrained(
        self.path + '/tf_model.h5',
        config=dbertConf,
    )
def load_model(self, model_name: str = "bert_ner_test"):
    # TODO: load the model from MLflow
    # Load model, config and tokenizer.
    config = DistilBertConfig.from_pretrained(model_name)
    model = DistilBertForTokenClassification.from_pretrained(
        model_name, config=config)
    tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
    return model, config, tokenizer
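# Hypothetical usage of load_model() above: run token classification on one
# sentence and map the predicted ids back to labels. `ner` (an instance of the
# owning class) and the id2label mapping stored in the fine-tuned config are
# assumptions; attribute access on the output assumes a transformers version
# that returns ModelOutput objects.
import torch

model, config, tokenizer = ner.load_model("bert_ner_test")
model.eval()
enc = tokenizer("Angela Merkel visited Paris", return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits            # (1, seq_len, num_labels)
pred_ids = logits.argmax(dim=-1)[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0])
print(list(zip(tokens, [config.id2label[i] for i in pred_ids])))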
def model_load(self, path: str):
    config = DistilBertConfig.from_pretrained(path + "/config.json")
    tokenizer = DistilBertTokenizer.from_pretrained(
        path, do_lower_case=self.do_lower_case)
    model = DistilBertForQuestionAnswering.from_pretrained(
        path, from_tf=False, config=config)
    return model, tokenizer
def model_load(self, path):
    s3_model_url = 'https://distilbert-finetuned-model.s3.eu-west-2.amazonaws.com/pytorch_model.bin'
    path_to_model = download_model(s3_model_url,
                                   model_name="pytorch_model.bin")
    config = DistilBertConfig.from_pretrained(path + "/config.json")
    tokenizer = DistilBertTokenizer.from_pretrained(
        path, do_lower_case=self.do_lower_case)
    model = DistilBertForQuestionAnswering.from_pretrained(
        path_to_model, from_tf=False, config=config)
    return model, tokenizer
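# The snippet above assumes a download_model() helper that is not shown. A
# minimal sketch of such a helper (the cache directory and behaviour are
# assumptions, not the original implementation):
import os
import requests

def download_model(url, model_name="pytorch_model.bin", cache_dir="./model_cache"):
    """Download a model binary once and return its local path."""
    os.makedirs(cache_dir, exist_ok=True)
    local_path = os.path.join(cache_dir, model_name)
    if not os.path.exists(local_path):
        resp = requests.get(url, stream=True)
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    return local_path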
def download_distilbert_base():
    file = '../input/distilbert-base-uncased'
    config = DistilBertConfig.from_pretrained('distilbert-base-uncased')
    config.save_pretrained(file)
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model.save_pretrained(file)
    tkn = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    tkn.save_pretrained(file)
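# Hypothetical follow-up to download_distilbert_base(): once config, weights
# and vocabulary are saved under the local directory, the same directory can
# be used fully offline.
local_dir = '../input/distilbert-base-uncased'
config = DistilBertConfig.from_pretrained(local_dir)
tokenizer = DistilBertTokenizer.from_pretrained(local_dir)
model = DistilBertModel.from_pretrained(local_dir, config=config)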
def build_model(args):
    if args.clf_model.lower() == "cnn":
        # easy for text tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)
    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)
        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the weight for transformers
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the weight for transformers
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False
    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        model.expand_class_head(args.multi_head)
    model = model.to(args.device)
    return tokenizer, model
def __init__(self, path=None, model_name=None):
    if path:
        self.model = DistilBertForSequenceClassification.from_pretrained(path)
        tokenizer_path = os.path.join(path, "model/")
        if os.path.exists(tokenizer_path):
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                tokenizer_path)
        else:
            self.tokenizer = DistilBertTokenizerFast.from_pretrained(
                "distilbert-base-uncased")
    elif model_name:
        config = DistilBertConfig.from_pretrained(model_name,
                                                  return_dict=True,
                                                  num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            model_name, config=config)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
    bert_config.output_hidden_states = output_hidden_states
    return bert_config
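# Hypothetical usage of get_bert_config(): BERT_CONFIG_FILE is assumed to be a
# dict mapping model names to local config paths (it is not shown here).
cfg = get_bert_config('distilbert-base-uncased', output_hidden_states=True)
print(cfg.dim, cfg.n_layers, cfg.output_hidden_states)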
def __init__(self, DatasetClass,
             weights_path='./.trained_models/frankie_encoder/ec.ckpt'):
    # transformer/model parameters are hardcoded due to usage of
    # pre-trained weights
    self.max_seq_length = 64
    self.model_name = 'distilbert-base-uncased'
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        self.model_name,
        do_lower_case=True,
        add_special_tokens=True,
        max_length=self.max_seq_length,
        pad_to_max_length=True)
    self.dataset = DatasetClass(self.tokenizer, self.max_seq_length)
    self.model_config = DistilBertConfig.from_pretrained(self.model_name)
    self.model_config.output_hidden_states = False
    self.model = self._create_sentence_transformer(
        input_shape=(self.max_seq_length,))
    self.model.load_weights(weights_path)
    print("Initialized Encoder Model")
def returnRelevant(researchPaper, query, numSnippets=15):
    # Make sure these are downloaded before using
    config = DistilBertConfig.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                            config=config)
    relevantSnippets = []
    with open(researchPaper, encoding='utf8') as researchPaperCSV, \
            torch.no_grad():
        researchPaperReader = csv.reader(researchPaperCSV)
        score_max_heap = []
        input_ids = torch.tensor([
            tokenizer.encode(query, add_special_tokens=True, max_length=512)
        ])
        output_tuple = model(input_ids)
        last_hidden_states = output_tuple[0]
        queryObj = last_hidden_states.mean(1)
        for snippet in researchPaperReader:
            if '<EOS>' not in snippet:
                snippetStr = ' '.join([str(elem) for elem in snippet])
                # This implementation will reject snippets of longer than
                # 512 tokens
                input_ids = torch.tensor([
                    tokenizer.encode(snippetStr,
                                     add_special_tokens=True,
                                     max_length=512)
                ])
                output_tuple = model(input_ids)
                last_hidden_states = output_tuple[0]
                snippetObj = last_hidden_states.mean(1)
                qs = QuerySnippet(query, snippet,
                                  similarity(queryObj, snippetObj))
                if (len(score_max_heap) < numSnippets
                        or qs.similarity > score_max_heap[0].similarity):
                    if len(score_max_heap) == numSnippets:
                        heapq.heappop(score_max_heap)
                    heapq.heappush(score_max_heap, qs)
    for qs in score_max_heap:
        relevantSnippets.append(qs.snippet)
    return relevantSnippets
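# The snippet above relies on a QuerySnippet container and a similarity()
# function that are not shown. A plausible sketch (assumptions, not the
# original code): cosine similarity between the two mean-pooled embeddings,
# and a QuerySnippet that orders by similarity so heapq keeps the lowest
# score at score_max_heap[0].
import torch
from dataclasses import dataclass

def similarity(a, b):
    # a, b: tensors of shape (1, hidden_dim)
    return torch.nn.functional.cosine_similarity(a, b, dim=1).item()

@dataclass
class QuerySnippet:
    query: str
    snippet: list
    similarity: float

    def __lt__(self, other):
        # heapq keeps the snippet with the smallest similarity on top
        return self.similarity < other.similarity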
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH,
                                    f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH,
                                   f'pytorch_model_{bert_type}.bin')

    config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                              output_hidden_states=True)
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased', do_lower_case=do_lower_case)

    # distil_model_bert = DistilBertForSequenceClassification.from_pretrained(
    #     'distilbert-base-uncased', output_hidden_states=True)
    # if no_pretraining:
    #     pass
    # else:
    #     distil_model_bert.load_state_dict(
    #         torch.load(init_checkpoint, map_location='cpu'))
    #     print("Load pre-trained parameters.")

    model.to(device)
    return model, tokenizer, config
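# Hypothetical usage of get_bert(): because the config requests
# output_hidden_states=True, the per-layer hidden states come back with the
# model output. BERT_PT_PATH, the bert_type value and `device` are assumed to
# be defined by the surrounding script; attribute access on the output assumes
# a transformers version that returns ModelOutput objects.
model, tokenizer, config = get_bert(BERT_PT_PATH, 'uncased_L-12_H-768_A-12',
                                    do_lower_case=True, no_pretraining=False)
enc = tokenizer("show me all flights", return_tensors="pt").to(device)
with torch.no_grad():
    out = model(**enc)
print(len(out.hidden_states))  # embedding layer + one tensor per transformer layer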
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, #64, 256 type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval or not.") parser.add_argument("--eval_on", default="dev", help="Whether to run eval on the dev set or test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = DistilBertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = DistilBertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name) # print(config) model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 
0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # Load a trained model and vocabulary that you have fine-tuned model = Ner.from_pretrained(args.output_dir) tokenizer = DistilBertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): if args.eval_on == "dev": eval_examples = processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = 
label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map): y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) report = classification_report(y_true, y_pred, digits=4) accuracy = accuracy_score(y_true, y_pred) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report) with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") logger.info("\n%s", accuracy) writer.write(str(accuracy))
    class_loss = nn.CrossEntropyLoss()(class_preds, class_labels)
    return start_loss + end_loss + class_loss


# ['LONG', 'NO', 'SHORT', 'UNKNOWN', 'YES']
def loss_fn_classifier(preds, labels):
    _, _, class_preds = preds
    _, _, class_labels = labels
    class_weights = [1.0, 1.0, 1.0, 0.6, 1.0]
    class_weights = torch.FloatTensor(class_weights).cuda()
    class_loss = nn.CrossEntropyLoss(class_weights)(class_preds, class_labels)
    return class_loss


# RekhaDist
config = DistilBertConfig.from_pretrained(
    'distilbert-base-uncased-distilled-squad')
config.num_labels = 5
model = DistilBertForQuestionAnswering.from_pretrained(
    'distilbert-base-uncased-distilled-squad', config=config)
model = model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

num_train_optimization_steps = int(
    n_epochs * train_size / batch_size / accumulation_steps)
print('num_train_optimization_steps=', num_train_optimization_steps)
num_warmup_steps = int(num_train_optimization_steps * warmup)
print('num_warmup_steps', num_warmup_steps)
batch_size = 32
learning_rate = 1e-06
max_epochs = 100
alpha = 0.1  # smoothing parameter for the true label
# /PARAMETERS

# create log file
data_folder = '../../data/from-figure-eight/balanced-test-data/tobert/'
res_path = '../../res/'
res_path += logfile_name
with open(res_path, 'w') as f:
    c = ('epoch, iter, loss_train, loss_val, pre_val, rec_val, '
         'f01_val, f1_val, f10_val, ece_val')
    f.write(c + '\n')

# configure DistilBERT model
config = DistilBertConfig.from_pretrained('distilbert-base-cased')
config.num_labels = num_labels
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForSequenceClassification(config)

# load model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()

# load datasets
train_dataset = pd.read_csv(data_folder + train_file)
val_dataset = pd.read_csv(data_folder + val_file)
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))
training_set = DataLoaderSmoothing(train_dataset, alpha)
validating_set = DataLoaderHard(val_dataset)
    '-m',
    type=int,
    default=512,
    help='maximum length handled by the model')
args = parser.parse_args()

usecfg = False
if usecfg:
    from transformers import (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    )
    config = DistilBertConfig.from_pretrained(args.model_name,
                                              finetuning_task='sentiment3',
                                              num_labels=3)
    model = DistilBertForSequenceClassification.from_pretrained(
        args.model_name, config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
else:
    from transformers import (AutoTokenizer,
                              AutoModelForSequenceClassification)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

model.to("cpu")
model.eval()
classes = ["0", "1", "2"]
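# A minimal sketch (not from the original script) of running the three-class
# sentiment model prepared above on CPU; `text` is a hypothetical input, and
# attribute access on the output assumes a transformers version that returns
# ModelOutput objects.
import torch

text = "The battery life is surprisingly good."
enc = tokenizer(text, truncation=True, max_length=args.m, return_tensors="pt")
with torch.no_grad():
    logits = model(**enc).logits
print(classes[int(logits.argmax(dim=-1))])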
def main(): ntasks = len(tasks) data_args = list() configuration = list() sub_models = list() train_iter = list() dev_iter = list() test_iter = list() sub_optimizer = list() metrics = list() tokenizer = DistilBertTokenizer.from_pretrained(bert_path, cache_dir=cache_dir) for i in range(ntasks): logger.info("Tasks:" + tasks[i]) data_args.append(GlueDataArgs(task_name=tasks[i])) configuration.append(DistilBertConfig.from_pretrained(bert_path, num_labels=glue_tasks_num_labels[data_args[i].task_name], finetuning_task=data_args[i].task_name, cache_dir = cache_dir)) if use_gpu: sub_models.append(SequenceClassification(configuration[i]).cuda()) else: sub_models.append(SequenceClassification(configuration[i])) train_iter.append(DataIterator(data_args[i], tokenizer=tokenizer, mode="train", cache_dir=cache_dir, batch_size=batch_size[i])) dev_iter.append(DataIterator(data_args[i], tokenizer=tokenizer, mode="dev", cache_dir=cache_dir, batch_size=batch_size_val[i])) sub_optimizer.append(torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate)) metrics.append(ComputeMetrics(data_args[i])) logger.info("*** DataSet Ready ***") if use_gpu: Bert_model = DistilBertModel.from_pretrained(bert_path, return_dict=True).cuda() else: Bert_model = DistilBertModel.from_pretrained(bert_path, return_dict=True) bert_optimizer = torch.optim.AdamW(Bert_model.parameters(), lr=learning_rate) # balaned dataset train_num = list() for i in range(ntasks): train_num.append(len(train_iter[i])) #train_nummax = #train_num = [x/train_nummax for x in train_num] #print(train_num) iterations = (epochs * max(train_num) // bs) + 1 #print(iterations) sub_scheduler = list() for i in range(ntasks): sub_scheduler.append(torch.optim.lr_scheduler.LambdaLR(sub_optimizer[i], lambda step: (1.0-step/iterations))) Bert_scheduler = torch.optim.lr_scheduler.LambdaLR(bert_optimizer, lambda step: (1.0-step/iterations)) for i in range(1, iterations+1): if iterations > frozen: for p in Bert_model.parameters(): p.requires_grad = True Bert_model.train() else: for p in Bert_model.parameters(): p.requires_grad = False Bert_model.eval() losses=list() for j in range(ntasks): sub_models[j].train() data = train_iter[j].next() if use_gpu: input_ids=data['input_ids'].cuda() attention_mask=data['attention_mask'].cuda() #token_type_ids=data['token_type_ids'].cuda() label=data['labels'].cuda() else: input_ids=data['input_ids'] attention_mask=data['attention_mask'] #token_type_ids=data['token_type_ids'] label=data['labels'] output_inter = Bert_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True) # token_type_ids=token_type_ids, losses.append(sub_models[j](input=output_inter, labels=label)[0]) loss = 0 printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(i, iterations, Bert_scheduler.get_lr()) for j in range(ntasks): loss += losses[j] * batch_size[j] printInfo += ', loss{}-{:.6f}'.format(j,losses[j]) sub_optimizer[j].zero_grad() logging.info(printInfo) if iterations > frozen: bert_optimizer.zero_grad() loss.backward() if iterations > frozen: bert_optimizer.step() for j in range(ntasks): sub_optimizer[j].step() sub_scheduler[j].step() if iterations > frozen: Bert_scheduler.step() if (i % eval_interval == 0): for j in range(ntasks): evaluate(Bert_model, sub_models[j], dev_iter[j], batch_size_val[j], metrics[j]) sub_models[j].save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format(tasks[j], i))) Bert_model.save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format("main", i))) for i in range(ntasks): 
evaluate(Bert_model, sub_models[i], dev_iter[i], batch_size_val[i], metrics[i]) sub_models[i].save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format(tasks[j], iterations))) Bert_model.save_pretrained(os.path.join(model_save_dir, "{}-checkpoint-{:06}.pth.tar".format("main", iterations)))
def main(): """ main function for conducting Subtask C. Parameters are parsed with argparse. Language model should be suitable for German e.g.: 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-german-cased', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased'. """ ############################ variable settings ################################# parser = argparse.ArgumentParser( description= 'Run Subtask C of GermEval 2017 Using Pre-Trained Language Model.') parser.add_argument('--seed', type=int, default=42, help='Random seed.') parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.') parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.') parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.') parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.') parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.') parser.add_argument('--train_data', type=str, default='train_df_cat.tsv', help='The filename of the input train data.') parser.add_argument('--dev_data', type=str, default='dev_df_cat.tsv', help='The filename of the input development data.') parser.add_argument( '--test_data1', type=str, default='test_syn_df_cat.tsv', help='The filename of the first input test data (synchronic).') parser.add_argument( '--test_data2', type=str, default='test_dia_df_cat.tsv', help='The filename of the second input test data (diachronic).') parser.add_argument( '--output_path', type=str, default='./output/subtaskC/', help='The output directory of the model and predictions.') parser.add_argument("--train", default=True, action="store_true", help="Flag for training.") parser.add_argument("--save_prediction", default=False, action="store_true", help="Flag for saving predictions.") parser.add_argument("--save_cr", default=False, action="store_true", help="Flag for saving confusion matrix.") parser.add_argument("--exclude_general", default=False, action="store_true", help="Flag for excluding category Allgemein.") parser.add_argument("--exclude_neutral", default=False, action="store_true", help="Flag for excluding neutral polarity.") parser.add_argument("--exclude_general_neutral", default=False, action="store_true", help="Flag for excluding category Allgemein:neutral.") args = parser.parse_args() ################################################################################ set_all_seeds(args.seed) device, n_gpu = initialize_device_settings(use_cuda=True) # Load data train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t') dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t') test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t') test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t') # Create a tokenizer lower_case = False if args.lang_model[-7:] == "uncased": lower_case = True if args.lang_model[:4] == "bert": model_class = "BERT" tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len) if args.lang_model[:10] == "distilbert": model_class = "DistilBERT" tokenizer = DistilBertTokenizer.from_pretrained( args.lang_model, do_lower_case=lower_case, max_length=args.max_len) # get 
training features cats = train_df.columns[5:] end = "full" # exclude categories if required if (args.exclude_general): cats = [i for i in list(cats) if "Allgemein" not in i] end = "excl_gen" if (args.exclude_neutral): cats = [i for i in list(cats) if "neutral" not in i] end = "excl_neu" if (args.exclude_general_neutral): cats = [i for i in list(cats) if "Allgemein:neutral" not in i] end = "excl_genneu" num_labels = len(list(cats)) # create one hot labels train_df['one_hot_labels'] = list(train_df[list(cats)].values) dev_df['one_hot_labels'] = list(dev_df[list(cats)].values) test_syn_df['one_hot_labels'] = list(test_syn_df[list(cats)].values) test_dia_df['one_hot_labels'] = list(test_dia_df[list(cats)].values) # retrieve sentences and labels df = pd.concat([train_df, dev_df]) sentences = df.text.values labels = list(df.one_hot_labels.values) sentences_syn = test_syn_df.text.values labels_syn = list(test_syn_df.one_hot_labels.values) sentences_dia = test_dia_df.text.values labels_dia = list(test_dia_df.one_hot_labels.values) print("number of categories:", len(list(cats))) # Tokenize all of the sentences and map the tokens to their word IDs. input_ids = [ tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=args.max_len) for sent in sentences ] input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") # Create attention masks attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids] # synchronic test data input_ids_syn = [ tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_syn ] input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn] # diachronic test data input_ids_dia = [ tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_dia ] input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia] # split train, dev train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev( train_df, dev_df, attention_masks, input_ids, labels) # transform to torch tensor train_inputs = torch.tensor(train_inputs) dev_inputs = torch.tensor(dev_inputs) train_labels = torch.tensor(train_labels) dev_labels = torch.tensor(dev_labels) train_masks = torch.tensor(train_masks) dev_masks = torch.tensor(dev_masks) test_syn_inputs = torch.tensor(input_ids_syn) test_syn_masks = torch.tensor(attention_masks_syn) test_syn_labels = torch.tensor(labels_syn) test_dia_inputs = torch.tensor(input_ids_dia) test_dia_masks = torch.tensor(attention_masks_dia) test_dia_labels = torch.tensor(labels_dia) # Create the DataLoader train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train=True) dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train=False) test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train=False) test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train=False) # Create model if args.train: if model_class == "BERT": config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels) config.hidden_dropout_prob 
= 0.1 model = BertForSequenceClassification.from_pretrained( args.lang_model, num_labels=num_labels, output_attentions=False, output_hidden_states=False) if model_class == "DistilBERT": config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels) config.hidden_dropout_prob = 0.1 model = DistilBertForSequenceClassification.from_pretrained( args.lang_model, num_labels=num_labels, output_attentions=False, output_hidden_states=False) model.cuda() # Create an optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8) # Total number of training steps = number of batches * number of epochs total_steps = len(train_dataloader) * args.epochs # Create the learning rate scheduler scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # train model # Main Loop print("=================== Train ================") print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr) print() track_time = time.time() # trange is a tqdm wrapper around the normal python range for epoch in trange(args.epochs, desc="Epoch"): print("Epoch: %4i" % epoch, dt.datetime.now()) model, optimizer, scheduler, tr_loss = train_multilabel( train_dataloader=train_dataloader, model=model, device=device, optimizer=optimizer, scheduler=scheduler, num_labels=num_labels) # EVALUATION: TRAIN SET pred_bools_train, true_bools_train, f1_train = eval_multilabel( train_dataloader, model=model, device=device) print("TRAIN: micro F1 %.3f" % (f1_train)) # EVALUATION: DEV SET pred_bools_dev, true_bools_dev, f1_dev = eval_multilabel( dev_dataloader, model=model, device=device) print("EVAL: micro F1 %.3f" % (f1_dev)) print(" Training and validation took in total: {:}".format( format_time(time.time() - track_time))) # EVALUATION: TEST SYN SET pred_bools_syn, true_bools_syn, f1_test_syn = eval_multilabel( test_syn_dataloader, model=model, device=device) print("TEST SYN: micro F1 %.4f" % (f1_test_syn)) # classification report clf_report_syn = classification_report(true_bools_syn, pred_bools_syn, target_names=cats, digits=3) print(clf_report_syn) # EVALUATION: TEST DIA SET pred_bools_dia, true_bools_dia, f1_test_dia = eval_multilabel( test_dia_dataloader, model=model, device=device) print("TEST DIA: micro F1 %.4f" % (f1_test_dia)) # classification report clf_report_dia = classification_report(true_bools_dia, pred_bools_dia, target_names=cats, digits=3) print(clf_report_dia) if args.save_cr: pickle.dump( clf_report_syn, open( args.output_path + 'clf_report_' + args.lang_model + '_test_syn_' + str(num_labels) + end + '.txt', 'wb')) pickle.dump( clf_report_dia, open( args.output_path + 'clf_report_' + args.lang_model + '_test_dia_' + str(num_labels) + end + '.txt', 'wb')) if args.save_prediction: test_syn_df["category_pred"] = pred_bools_syn test_dia_df["category_pred"] = pred_bools_dia test_syn_df.category_pred.to_csv(args.output_path + args.lang_model + '_test_syn_' + str(num_labels) + end + ".tsv", sep="\t", index=False, header=True, encoding="utf-8-sig") test_dia_df.category_pred.to_csv(args.output_path + args.lang_model + '_test_dia_' + str(num_labels) + end + ".tsv", sep="\t", index=False, header=True, 
encoding="utf-8-sig")
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
granola_ids = tokenizer.encode('granola bars')

# Print the IDs
print('granola_ids', granola_ids)
print('type of granola_ids', type(granola_ids))
print('granola_tokens', tokenizer.convert_ids_to_tokens(granola_ids))

# Convert the list of IDs to a tensor of IDs
granola_ids = torch.LongTensor(granola_ids)
# Print the IDs
print('granola_ids', granola_ids)
print('type of granola_ids', type(granola_ids))

config = DistilBertConfig.from_pretrained(model_name,
                                          output_hidden_states=True)
model = DistilBertModel.from_pretrained(model_name, config=config)

# Set the device to GPU (cuda) if available, otherwise stick with CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
granola_ids = granola_ids.to(device)

model.eval()
print(granola_ids.size())
# unsqueeze IDs to get batch size of 1 as added dimension
granola_ids = granola_ids.unsqueeze(0)
print(granola_ids.size())
print(type(granola_ids))
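# A hedged continuation of the snippet above: run the forward pass and inspect
# the hidden states enabled by output_hidden_states=True. Depending on the
# transformers version they are outputs.hidden_states or the last element of
# the returned tuple; the indexing below assumes the latter also works.
with torch.no_grad():
    outputs = model(granola_ids)
hidden_states = outputs[-1]
print(len(hidden_states))        # 7 for DistilBERT: embedding layer + 6 transformer layers
print(hidden_states[-1].size())  # (1, seq_len, 768) for the last layer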
def main(): ntasks = len(tasks) data_args = list() configuration = list() sub_models = list() datasets = list() # train_iter = list() # dev_iter = list() # test_iter = list() sub_optimizer = list() metrics = list() tokenizer = DistilBertTokenizer.from_pretrained(bert_path, cache_dir=cache_dir) for i in range(ntasks): logger.info("Tasks:" + tasks[i]) data_args.append(GlueDataArgs(task_name=tasks[i])) configuration.append( DistilBertConfig.from_pretrained( bert_path, num_labels=glue_tasks_num_labels[tasks[i].lower()], finetuning_task=data_args[i].task_name, cache_dir=cache_dir)) if use_gpu: sub_models.append(SequenceClassification(configuration[i]).cuda()) else: sub_models.append(SequenceClassification(configuration[i])) datasets.append( GlueDataSets(data_args[i], tokenizer=tokenizer, cache_dir=cache_dir)) sub_optimizer.append( torch.optim.AdamW(sub_models[i].parameters(), lr=learning_rate_0)) metrics.append(ComputeMetrics(data_args[i])) logger.info("*** DataSet Ready ***") if use_gpu: Bert_model = DistilBertModel.from_pretrained(bert_path, return_dict=True).cuda() else: Bert_model = DistilBertModel.from_pretrained(bert_path, return_dict=True) bert_optimizer = torch.optim.AdamW(Bert_model.parameters(), lr=learning_rate_0) # balaned dataset train_num = list() for i in range(ntasks): train_num.append(datasets[i].length("train")) #train_nummax = #train_num = [x/train_nummax for x in train_num] print(train_num) iterations = (epochs * max(train_num) // bs) + 1 #print(iterations) sub_scheduler = list() for i in range(ntasks): sub_scheduler.append( torch.optim.lr_scheduler.LambdaLR( sub_optimizer[i], lambda step: (1.0 - step / iterations)) ) #if step <= frozen else learning_rate_1) Bert_scheduler = torch.optim.lr_scheduler.LambdaLR( bert_optimizer, lambda step: (1.0 - step / iterations)) # if step <= frozen else learning_rate_1 # datasets[i].dataloader("train", batch_size_train[i]) train_iter = list() for i in range(ntasks): train_iter.append( GlueIterator(datasets[i].dataloader("train", batch_size_train[i]))) for i in range(1, iterations + 1): if i > frozen: for p in Bert_model.parameters(): p.requires_grad = True Bert_model.train() elif i == frozen: for p in Bert_model.parameters(): p.requires_grad = True Bert_model.train() logging.info("#####################################") logging.info("Release the Traing of the Main Model.") logging.info("#####################################") else: for p in Bert_model.parameters(): p.requires_grad = False Bert_model.eval() losses = list() loss_rates = list() for j in range(ntasks): sub_models[j].train() data = train_iter[j].next() if use_gpu: input_ids = data['input_ids'].cuda() attention_mask = data['attention_mask'].cuda() #token_type_ids=data['token_type_ids'].cuda() label = data['labels'].cuda() else: input_ids = data['input_ids'] attention_mask = data['attention_mask'] #token_type_ids=data['token_type_ids'] label = data['labels'] output_inter = Bert_model( input_ids=input_ids, attention_mask=attention_mask, return_dict=True) # token_type_ids=token_type_ids, losses.append(sub_models[j](input=output_inter, labels=label)[0]) losssum = sum(losses).item() for j in range(ntasks): loss_rates.append(losses[j].item() / losssum) loss = 0 printInfo = 'TOTAL/Train {}/{}, lr:{}'.format(i, iterations, Bert_scheduler.get_lr()) for j in range(ntasks): loss += losses[j] * batch_size_train[j] # * loss_rates[j] printInfo += ', loss{}-{:.6f}'.format(j, losses[j]) sub_optimizer[j].zero_grad() logging.info(printInfo) if i > frozen: bert_optimizer.zero_grad() 
loss.backward() if i > frozen: bert_optimizer.step() for j in range(ntasks): sub_optimizer[j].step() # sub_scheduler[j].step() # Bert_scheduler.step() if (i % eval_interval == 0): evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics, ntasks) save_models(Bert_model, sub_models, ntasks, i) evaluate(Bert_model, sub_models, datasets, batch_size_val, metrics, ntasks) save_models(Bert_model, sub_models, ntasks, iterations)
def training_model(args): args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.n_gpu = torch.cuda.device_count() config = DistilBertConfig.from_pretrained(TRANSFORMER_MODEL, num_labels=97 + 1) tokenizer = DistilBertTokenizer.from_pretrained(TRANSFORMER_MODEL) model = DistilBertForSequenceClassification.from_pretrained(TRANSFORMER_MODEL, config=config) model.to(args.device) etl = ETL(env.DB_FILE, env.SCHEMA_FILE) complaints_users = etl.load_query(SQL_QUERY_STRING) features = convert_examples_to_features( complaints_users[[COMPLAINT_TEXT, LABEL]].to_dict(orient='records'), max_length=128, tokenizer=tokenizer) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_labels = torch.tensor([f.label for f in features], dtype=torch.long) train_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = len(train_dataloader) // args.num_train_epochs optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch") set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]} outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) loss.backward() tr_loss += loss.item() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 logs = {} if args.evaluate_during_training: results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = 'eval_{}'.format(key) logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] logs['learning_rate'] = learning_rate_scalar logs['loss'] = loss_scalar logging_loss = tr_loss model.eval() # Creating the trace dummy_all_input_ids = torch.tensor([f.input_ids for f in features[0:1]], dtype=torch.long).to(args.device) dummy_all_attention_mask = torch.tensor([f.attention_mask for f in features[0:1]], dtype=torch.long).to(args.device) traced_model = torch.jit.trace(model, [dummy_all_input_ids, dummy_all_attention_mask]) torch.jit.save(traced_model, "traced_bert.pt") tokenizer.save_pretrained('tokenizer')
def main(): """ main function for conducting Subtask A. Parameters are parsed with argparse. Language model should be suitable for German e.g.: 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-german-cased', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased'. """ ############################ variable settings ################################# parser = argparse.ArgumentParser(description='Run Subtask A or B of GermEval 2017 Using Pre-Trained Language Model.') parser.add_argument('--task', type=str, default='A', help="The task you want to conduct ('A' or 'B').") parser.add_argument('--seed', type=int, default=42, help='Random seed.') parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.') parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.') parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.') parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.') parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.') parser.add_argument('--train_data', type=str, default='train_df.tsv', help='The filename of the input train data.') parser.add_argument('--dev_data', type=str, default='dev_df.tsv', help='The filename of the input development data.') parser.add_argument('--test_data1', type=str, default='test_syn_df.tsv', help='The filename of the first input test data (synchronic).') parser.add_argument('--test_data2', type=str, default='test_dia_df.tsv', help='The filename of the second input test data (diachronic).') parser.add_argument('--output_path', type=str, default='./output/subtaskA/', help='The output directory of the model and predictions.') parser.add_argument("--train", default=True, action="store_true", help="Flag for training.") parser.add_argument("--save_prediction", default=True, action="store_true", help="Flag for saving predictions.") args = parser.parse_args() ################################################################################ set_all_seeds(args.seed) device, n_gpu = initialize_device_settings(use_cuda=True) # Load data train_df = pd.read_csv(args.df_path + args.train_data, delimiter = '\t') dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter = '\t') test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter = '\t') test_syn_df = test_syn_df.dropna(subset = ["text"]) test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter = '\t') # Create a tokenizer lower_case = False if args.lang_model[-7:] == "uncased": lower_case = True if args.lang_model[:4] == "bert": model_class = "BERT" tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len) if args.lang_model[:10] == "distilbert": model_class = "DistilBERT" tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len) # get training features df = pd.concat([train_df, dev_df]) sentences = df.text.values sentences_syn = test_syn_df.text.values sentences_dia = test_dia_df.text.values if args.task == 'A': class_list = [False, True] df['relevance_label'] = df.apply(lambda x: class_list.index(x['relevance']), axis = 1) labels = df.relevance_label.values 
test_syn_df['relevance_label'] = test_syn_df.apply(lambda x: class_list.index(x['relevance']), axis = 1) labels_syn = test_syn_df.relevance_label.values test_dia_df['relevance_label'] = test_dia_df.apply(lambda x: class_list.index(x['relevance']), axis = 1) labels_dia = test_dia_df.relevance_label.values if args.task == 'B': class_list = ["negative", "neutral", "positive"] df['sentiment_label'] = df.apply(lambda x: class_list.index(x['sentiment']), axis = 1) labels = df.sentiment_label.values test_syn_df['sentiment_label'] = test_syn_df.apply(lambda x: class_list.index(x['sentiment']), axis = 1) labels_syn = test_syn_df.sentiment_label.values test_dia_df['sentiment_label'] = test_dia_df.apply(lambda x: class_list.index(x['sentiment']), axis = 1) labels_dia = test_dia_df.sentiment_label.values num_labels = len(set(labels)) # Tokenize all of the sentences and map the tokens to their word IDs. input_ids = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=args.max_len) for sent in sentences] input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") # Create attention masks attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids] # synchronic test data input_ids_syn = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_syn] input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn] # diachronic test data input_ids_dia = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_dia] input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia] # split train, dev train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev( train_df, dev_df, attention_masks, input_ids, labels) # transform to torch tensor train_inputs = torch.tensor(train_inputs) dev_inputs = torch.tensor(dev_inputs) train_labels = torch.tensor(train_labels) dev_labels = torch.tensor(dev_labels) train_masks = torch.tensor(train_masks) dev_masks = torch.tensor(dev_masks) test_syn_inputs = torch.tensor(input_ids_syn) test_syn_labels = torch.tensor(labels_syn) test_syn_masks = torch.tensor(attention_masks_syn) test_dia_inputs = torch.tensor(input_ids_dia) test_dia_labels = torch.tensor(labels_dia) test_dia_masks = torch.tensor(attention_masks_dia) # Create the DataLoader train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train=True) dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train=False) test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train=False) test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train=False) # Create model if args.train: if model_class == "BERT": config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels) config.hidden_dropout_prob = 0.1 model = BertForSequenceClassification.from_pretrained( args.lang_model, num_labels = num_labels, output_attentions = False, output_hidden_states = False ) if model_class == "DistilBERT": config = DistilBertConfig.from_pretrained(args.lang_model, 
num_labels=num_labels) config.hidden_dropout_prob = 0.1 model = DistilBertForSequenceClassification.from_pretrained( args.lang_model, num_labels = num_labels, output_attentions = False, output_hidden_states = False ) model.cuda() # Create an optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = AdamW( optimizer_grouped_parameters, lr=args.lr, eps=1e-8 ) # Total number of training steps = number of batches * number of epochs total_steps = len(train_dataloader) * args.epochs # Create the learning rate scheduler scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps ) # train model # Main Loop print("=================== Train ================") print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr) print() track_time = time.time() # trange is a tqdm wrapper around the normal python range for epoch in trange(args.epochs, desc="Epoch"): print("Epoch: %4i"%epoch, dt.datetime.now()) model, optimizer, scheduler, tr_loss = train( train_dataloader, model=model, device=device, optimizer=optimizer, scheduler=scheduler ) # EVALUATION: TRAIN SET true_bools_train, pred_bools_train, f1_train = eval( train_dataloader, model=model, device=device) print("TRAIN: micro F1 %.4f"%(f1_train)) # here: same as accuracy print(confusion_matrix(true_bools_train,pred_bools_train)) # EVALUATION: DEV SET true_bools_dev, pred_bools_dev, f1_dev = eval( dev_dataloader, model=model, device=device) print("EVAL: micro F1 %.4f"%(f1_dev)) print(confusion_matrix(true_bools_dev,pred_bools_dev)) print(" Training and validation took in total: {:}".format(format_time(time.time()-track_time))) # EVALUATION: TEST SYN SET true_bools_syn, pred_bools_syn, f1_test_syn = eval( test_syn_dataloader, model=model, device=device) print("TEST SYN: micro F1 %.4f"%(f1_test_syn)) print(confusion_matrix(true_bools_syn,pred_bools_syn)) # EVALUATION: TEST DIA SET true_bools_dia, pred_bools_dia, f1_test_dia = eval( test_dia_dataloader, model=model, device=device) print("TEST DIA: micro F1 %.4f"%(f1_test_dia)) print(confusion_matrix(true_bools_dia, pred_bools_dia)) if args.save_prediction: if args.task == 'A': test_syn_df["relevance_pred"] = pred_bools_syn test_dia_df["relevance_pred"] = pred_bools_dia if args.task == 'B': test_syn_df["sentiment_pred"] = pred_bools_syn test_dia_df["sentiment_pred"] = pred_bools_dia test_syn_df.to_csv(args.output_path+args.lang_model+"_eval_test_syn.tsv", sep="\t", index = False, header = True, encoding = "utf-8-sig") test_dia_df.to_csv(args.output_path+args.lang_model+"_eval_test_dia.tsv", sep="\t", index = False, header = True, encoding = "utf-8-sig")
fig, ax = plt.subplots(1,2, figsize=(8,4)) ax = ax.flatten() _ = plot_pr(y_test, y_pred, ax=ax[0],label="Naive Bayes") _ = plot_roc(y_test, y_pred, ax=ax[1],label="Naive Bayes") # # Model 9 - BERT # In[14]: from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig torch.backends.cudnn.benchmark = True tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") config = DistilBertConfig.from_pretrained("distilbert-base-uncased") bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased") # First, let's check the maximum possible sequence length # In[ ]: def len_tokens(s): return len(s.split()) df = df_reviews_train.copy() df["len"] = np.vectorize(len_tokens)(df["review_norm"]) print("max tokenized len is: {}".format(max(list(map(len, tokenizer.batch_encode_plus(df.sort_values(by="len", ascending=False)[:1]["review_norm"].to_list())["input_ids"]))))) sns.displot(df["len"])
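# Whitespace splitting only approximates the wordpiece count; a quick cross-check of the real
# tokenized lengths over all reviews (a sketch, reusing the tokenizer and df defined above):
token_lens = [len(ids) for ids in
              tokenizer.batch_encode_plus(df["review_norm"].to_list())["input_ids"]]
print("max wordpiece length: {}".format(max(token_lens)))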
def train(argv=None): """ A function that re-trains BERT for sentiment analysis. """ _set_config() num_labels = len(glue_processors[FLAGS.task]().get_labels()) tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=num_labels) model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config) # Load dataset via TensorFlow Datasets data, info = tensorflow_datasets.load(f'glue/{_get_tfds_task(FLAGS.task)}', with_info=True) train_examples = info.splits['train'].num_examples # MNLI expects either validation_matched or validation_mismatched valid_examples = info.splits['validation'].num_examples # Prepare dataset for GLUE as a tf.data.Dataset instance train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, FLAGS.max_length, FLAGS.task) # MNLI expects either validation_matched or validation_mismatched valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, FLAGS.max_length, FLAGS.task) train_dataset = train_dataset.shuffle(FLAGS.buffer_size).batch(FLAGS.batch_size).repeat(-1) valid_dataset = valid_dataset.batch(FLAGS.batch_size * 2) # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule opt = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate, epsilon=FLAGS.epsilon) if FLAGS.use_amp: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') if num_labels == 1: loss = tf.keras.losses.MeanSquaredError() else: loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') model_path = f'./{_get_tfds_task(FLAGS.task)}/' model.compile(optimizer=opt, loss=loss, metrics=[metric]) if FLAGS.evaluate: print('Model summary:') print(model.summary()) print('Evaluating on the training dataset...') model.evaluate(train_dataset, verbose=2, steps=int(_get_train_length(FLAGS.task) / FLAGS.batch_size)) print('Evaluating on the validation dataset...') model.evaluate(valid_dataset, verbose=2) return if os.path.exists(model_path + 'tf_model.h5') and not FLAGS.force_train: print(f'Model in {model_path} already exists. Skipping training. ' + \ 'If you would like to force a re-train, set the force_train flag.') local_vars = locals() for variable in local_vars: if not variable.startswith('-'): print(f'{variable}:\t{local_vars[variable]}') return # Train and evaluate using tf.keras.Model.fit() train_steps = train_examples // FLAGS.batch_size valid_steps = valid_examples // (FLAGS.batch_size * 2) _ = model.fit(train_dataset, epochs=FLAGS.epochs, steps_per_epoch=train_steps, validation_data=valid_dataset, validation_steps=valid_steps) # Save TF2 model os.makedirs(model_path, exist_ok=True) model.save_pretrained(model_path)
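# A sketch of reloading the weights saved above for a quick sanity-check prediction, assuming
# model_path, tokenizer and tf from the training function are still in scope (the example
# sentence is illustrative, not taken from the dataset):
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(model_path)
encoded = tokenizer.encode_plus('a thoroughly enjoyable film', return_tensors='tf')
logits = loaded_model(dict(encoded))[0]
predicted_label = int(tf.argmax(logits, axis=-1)[0])
print('predicted class id:', predicted_label)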
from sklearn.model_selection import train_test_split from sklearn.utils import resample tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 2, 3, 4, 5] config = DistilBertConfig.from_pretrained( "distilbert-base-uncased", num_labels=len(CLASSES), id2label={ 0: 1, 1: 2, 2: 3, 3: 4, 4: 5 }, label2id={ 1: 0, 2: 1, 3: 2, 4: 3, 5: 4 }, ) def list_arg(raw_value): """argparse type for a list of strings""" return str(raw_value).split(",")
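# How the label maps above are intended to be used: star ratings (1-5) map to contiguous
# class ids (0-4) for training and back again for reporting (a sketch; the example ratings
# are made up):
star_ratings = [5, 3, 1]
label_ids = [config.label2id[r] for r in star_ratings]   # -> [4, 2, 0]
recovered = [config.id2label[i] for i in label_ids]      # -> [5, 3, 1]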
def __init__(self): super(DistilBertModelTest, self).__init__() config = DistilBertConfig.from_pretrained('models/config.json') self.distilbert = DistilBertForSequenceClassification( config) # /bert_pretrain/ self.device = torch.device("cuda")
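# DistilBertModelTest only builds the classifier from a local config; a sketch of the kind of
# predict method it presumably pairs with (the tokenizer checkpoint and max length here are
# assumptions, not taken from the original code):
def predict(self, texts):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    enc = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
    self.distilbert.to(self.device)
    self.distilbert.eval()
    with torch.no_grad():
        logits = self.distilbert(input_ids=enc['input_ids'].to(self.device),
                                 attention_mask=enc['attention_mask'].to(self.device))[0]
    return logits.argmax(dim=-1).cpu().tolist()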
from sklearn.metrics import classification_report from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split from sklearn.utils import resample tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") CLASSES = [1, 0, -1] config = DistilBertConfig.from_pretrained( "distilbert-base-uncased", num_labels=len(CLASSES), id2label={ 0: 1, 1: 0, 2: -1 }, label2id={ 1: 0, 0: 1, -1: 2 }, ) def to_sentiment(star_rating): if star_rating in {1, 2}: # negative return -1 if star_rating == 3: # neutral return 0 if star_rating in {4, 5}: # positive return 1
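# End-to-end label preparation: raw star ratings -> {-1, 0, 1} sentiment via to_sentiment,
# then -> contiguous class ids via the label2id map above (a sketch with made-up ratings):
ratings = [5, 3, 1]
sentiments = [to_sentiment(r) for r in ratings]        # -> [1, 0, -1]
label_ids = [config.label2id[s] for s in sentiments]   # -> [0, 1, 2]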
def main(): """ Main function for conducting Subtask D. Parameters are parsed with argparse. The language model should be suitable for German, e.g.: 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-german-cased', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased'. """ parser = argparse.ArgumentParser(description='Run Subtask D of GermEval 2017 Using Pre-Trained Language Model.') parser.add_argument('--seed', type=int, default=42, help='Random seed.') parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.') parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.') parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.') parser.add_argument('--batch_size', type=int, default=32, help='The training batch size.') parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.') parser.add_argument('--train_data', type=str, default='train_df_opinion.tsv', help='The filename of the input train data.') parser.add_argument('--dev_data', type=str, default='dev_df_opinion.tsv', help='The filename of the input development data.') parser.add_argument('--test_data1', type=str, default='test_syn_df_opinion.tsv', help='The filename of the first input test data (synchronic).') parser.add_argument('--test_data2', type=str, default='test_dia_df_opinion.tsv', help='The filename of the second input test data (diachronic).') parser.add_argument('--output_path', type=str, default='./output/subtaskD/', help='The output directory of the model and predictions.') parser.add_argument("--train", default=True, action="store_true", help="Flag for training.") parser.add_argument("--use_crf", default=False, action="store_true", help="Flag for CRF usage.") parser.add_argument("--save_cr", default=False, action="store_true", help="Flag for saving classification report.") args = parser.parse_args() ############################################################################# # Settings set_all_seeds(args.seed) device, n_gpu = initialize_device_settings(use_cuda=True) lm = args.lang_model if args.use_crf: lm = args.lang_model+"_crf" ############################################################################# # Load and prepare data by adding BIO tags train_df = bio_tagging_df(pd.read_csv(args.df_path + args.train_data, delimiter = '\t')) dev_df = bio_tagging_df(pd.read_csv(args.df_path + args.dev_data, delimiter = '\t')) test_syn_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data1, delimiter = '\t')) test_dia_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data2, delimiter = '\t')) # 1. 
Create a tokenizer lower_case = False if args.lang_model[-7:] == "uncased": lower_case = True if args.lang_model[:4] == "bert": model_class = "BERT" tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case = lower_case, max_length=args.max_len) if args.lang_model[:10] == "distilbert": model_class = "DistilBERT" tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case = lower_case, max_length=args.max_len) # get training features df = pd.concat([train_df, dev_df]) sentences = df.text.values labels = df.bio_tags.values tokenized_texts, labels = get_sentences_biotags(tokenizer, sentences, labels, args.max_len) sentences_syn = test_syn_df.text.values labels_syn = test_syn_df.bio_tags tokenized_texts_syn, labels_syn = get_sentences_biotags(tokenizer, sentences_syn, labels_syn, args.max_len) sentences_dia = test_dia_df.text.values labels_dia = test_dia_df.bio_tags tokenized_texts_dia, labels_dia = get_sentences_biotags(tokenizer, sentences_dia, labels_dia, args.max_len) # get tag values and dictionary tag_values, tag2idx, entities = get_tags_list(args.df_path) # pad input_ids and tags input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen = args.max_len, value=0.0, padding="post", dtype="long", truncating="post") tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], maxlen=args.max_len, value=tag2idx["PAD"], padding="post", dtype="long", truncating="post") input_ids_syn = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_syn], maxlen = args.max_len, value=0.0, padding="post", dtype="long", truncating="post") tags_syn = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_syn], maxlen=args.max_len, value=tag2idx["PAD"], padding="post", dtype="long", truncating="post") input_ids_dia = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_dia], maxlen = args.max_len, value=0.0, padding="post", dtype="long", truncating="post") tags_dia = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_dia], maxlen=args.max_len, value=tag2idx["PAD"], padding="post", dtype="long", truncating="post") # create attention masks attention_masks= [[int(token_id > 0) for token_id in sent] for sent in input_ids] attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn] attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia] # split train, dev train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev( train_df, dev_df, attention_masks, input_ids, tags) # transform to torch tensor train_inputs = torch.tensor(train_inputs, dtype = torch.long) dev_inputs = torch.tensor(dev_inputs, dtype = torch.long) train_labels = torch.tensor(train_labels, dtype = torch.long) dev_labels = torch.tensor(dev_labels, dtype = torch.long) train_masks = torch.tensor(train_masks, dtype = torch.uint8) dev_masks = torch.tensor(dev_masks, dtype = torch.uint8) test_syn_inputs = torch.tensor(input_ids_syn, dtype = torch.long) test_syn_labels = torch.tensor(tags_syn, dtype = torch.long) test_syn_masks = torch.tensor(attention_masks_syn, dtype = torch.uint8) test_dia_inputs = torch.tensor(input_ids_dia, dtype = torch.long) test_dia_labels = torch.tensor(tags_dia, dtype = torch.long) test_dia_masks = torch.tensor(attention_masks_dia, dtype = torch.uint8) # create DataLoader train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train = True) 
dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train = False) test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train = False) test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train = False) ############################################################################# # Training if args.train: # Load Config if model_class=="BERT": config = BertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx)) config.hidden_dropout_prob = 0.1 # dropout probability for all fully connected layers # in the embeddings, encoder, and pooler; default = 0.1 model = TokenBERT( model_name=args.lang_model, num_labels=len(tag2idx), use_crf=args.use_crf) if model_class=="DistilBERT": config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx)) config.hidden_dropout_prob = 0.1 model = TokenDistilBERT( model_name=args.lang_model, num_labels=len(tag2idx), use_crf=args.use_crf) model.cuda() # Create an optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] optimizer = AdamW( optimizer_grouped_parameters, lr=args.lr, eps=1e-8 ) # Total number of training steps = number of batches * number of epochs total_steps = len(train_dataloader) * args.epochs # Create the learning rate scheduler scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps ) # Main Loop print("=================== Train ================") print("##### Language Model:", args.lang_model, ",", "use CRF:", args.use_crf, ",", "learning rate:", args.lr, ",", "DROPOUT:", config.hidden_dropout_prob) print() track_time = time.time() for epoch in trange(args.epochs, desc="Epoch"): print("Epoch: %4i"%epoch, dt.datetime.now()) # TRAINING model, optimizer, scheduler, tr_loss = training( train_dataloader, model=model, device=device, optimizer=optimizer, scheduler=scheduler ) # EVALUATION: TRAIN SET y_true_train, y_pred_train, f1s_train, f1s_overlap_train = evaluation( train_dataloader, model=model, device=device, tag_values=tag_values) print("TRAIN: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_train, f1s_overlap_train)) # EVALUATION: DEV SET y_true_dev, y_pred_dev, f1s_dev, f1s_overlap_dev = evaluation( dev_dataloader, model=model, device=device, tag_values=tag_values) print("EVAL: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_dev, f1s_overlap_dev)) print(" Training and validation took in total: {:}".format(format_time(time.time()-track_time))) # EVALUATION: TEST SYN SET y_true_test_syn, y_pred_test_syn, f1s_test_syn, f1s_overlap_test_syn = evaluation( test_syn_dataloader, model=model, device=device, tag_values=tag_values) print("TEST SYN: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_test_syn, f1s_overlap_test_syn)) # EVALUATION: TEST DIA SET y_true_test_dia, y_pred_test_dia, f1s_test_dia, f1s_overlap_test_dia = evaluation( test_dia_dataloader, model=model, device=device, tag_values=tag_values) print("TEST DIA: F1 Exact %.3f | F1 Overlap %.3f"%(f1s_test_dia, f1s_overlap_test_dia)) # Print classification report cr_report_syn = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits = 4) cr_report_dia = 
seq_classification_report(y_true_test_dia, y_pred_test_dia, digits = 4) cr_report_syn_overlap = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits = 4, overlap = True) cr_report_dia_overlap = seq_classification_report(y_true_test_dia, y_pred_test_dia, digits = 4, overlap = True) print("Classification report for TEST SYN (Exact):", cr_report_syn) print("Classification report for TEST SYN (Overlap):", cr_report_syn_overlap) print("Classification report for TEST DIA (Exact):", cr_report_dia) print("Classification report for TEST DIA (Overlap):", cr_report_dia_overlap) if args.save_cr: pickle.dump(cr_report_syn, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_syn_exact.txt','wb')) pickle.dump(cr_report_dia, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_dia_exact.txt','wb')) pickle.dump(cr_report_syn_overlap, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_syn_overlap.txt','wb')) pickle.dump(cr_report_dia_overlap, open(args.output_path+'classification_report_'+lm+str(args.batch_size)+'_test_dia_overlap.txt','wb'))
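# The saved reports are pickled strings; they can be reloaded and printed later like this
# (a sketch, reusing the same path convention as above):
with open(args.output_path + 'classification_report_' + lm + str(args.batch_size) + '_test_syn_exact.txt', 'rb') as f:
    print(pickle.load(f))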
torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # See if CUDA available device = torch.device("cpu") if args.n_gpu > 0 and torch.cuda.is_available(): print("Training on GPU") device = torch.device("cuda:0") # model configuration bert_model = 'distilbert-base-uncased' n_epochs = args.n_epochs bert_config = DistilBertConfig.from_pretrained(bert_model, num_labels=2, output_hidden_states=True) # wandb initialization wandb.init(project="domain-adaptation-twitter-emnlp", name=args.run_name, config={ "epochs": n_epochs, "train_split_percentage": args.train_pct, "bert_model": bert_model, "seed": seed, "tags": ",".join(args.tags) }) #wandb.watch(model) #Create save directory for model if not os.path.exists(f"{args.model_dir}/{Path(wandb.run.dir).name}"):
# print(all_predicate_ids[0]) print("Graph loaded") # model init import torch from transformers import DistilBertTokenizer, DistilBertConfig from MPBert_sampler_model import MessagePassingHDTBert DEVICE = 'cuda' # model configuration model_name = 'distilbert-base-uncased' tokenizer = DistilBertTokenizer.from_pretrained(model_name) config = DistilBertConfig.from_pretrained(model_name, num_labels=1) E_BEAM = 10 P_BEAM = 100 model = MessagePassingHDTBert(config, topk_entities=E_BEAM, topk_predicates=P_BEAM) for param in model.bert.parameters(): param.requires_grad = False if DEVICE == 'cuda': device = torch.device("cuda") # run model on the GPU model.cuda() else: