def preprocess_text_input(
        context='Danielle is a girl who really loves her cat, Steve.',
        question='What cat does Danielle love?',
        vocab_file='DeepLearningExamples/PyTorch/LanguageModeling/BERT/vocab/vocab',
        max_seq_length=384,
        max_query_length=64,
        n_best_size=1,
        max_answer_length=30,
        null_score_diff_threshold=-11.0):
    tokenizer = BertTokenizer(vocab_file, do_lower_case=True, max_len=512)
    doc_tokens = context.split()
    query_tokens = tokenizer.tokenize(question)
    feature = preprocess_tokenized_text(doc_tokens,
                                        query_tokens,
                                        tokenizer,
                                        max_seq_length=max_seq_length,
                                        max_query_length=max_query_length)
    tensors_for_inference, tokens_for_postprocessing = feature
    # Add a batch dimension so the tensors can be fed to the model directly.
    input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long).unsqueeze(0)
    segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long).unsqueeze(0)
    input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long).unsqueeze(0)
    return (input_ids, segment_ids, input_mask)
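# A rough, self-contained sketch of the same question/context preprocessing using the
# Hugging Face `transformers` tokenizer instead of the local helpers above. This is an
# illustrative assumption (checkpoint name and lengths are placeholders), not part of
# the original pipeline.
from transformers import BertTokenizer as HFBertTokenizer
import torch

def preprocess_with_transformers(question, context, max_seq_length=384):
    tokenizer = HFBertTokenizer.from_pretrained('bert-base-uncased')
    enc = tokenizer(question,
                    context,
                    max_length=max_seq_length,
                    truncation='only_second',  # truncate the context, never the question
                    padding='max_length',
                    return_tensors='pt')
    # Mirrors the (input_ids, segment_ids, input_mask) triple returned above.
    return enc['input_ids'], enc['token_type_ids'], enc['attention_mask']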
def predict(cls,
            context,
            question,
            vocab_file,
            bing_key=None,
            max_seq_length=384,
            max_query_length=64,
            n_best_size=3,
            do_lower_case=True,
            can_give_negative_answer=True,
            max_answer_length=30,
            null_score_diff_threshold=-11.0):
    """Run question answering on a single (context, question) pair.

    Returns the predictions produced by `get_predictions`.
    """
    predictor_model = cls.get_predictor_model()
    device = next(predictor_model.parameters()).device
    doc_tokens = context.split()
    tokenizer = BertTokenizer(vocab_file, do_lower_case=do_lower_case, max_len=max_seq_length)
    query_tokens = tokenizer.tokenize(question)
    feature = preprocess_tokenized_text(doc_tokens,
                                        query_tokens,
                                        tokenizer,
                                        max_seq_length=max_seq_length,
                                        max_query_length=max_query_length)
    tensors_for_inference, tokens_for_postprocessing = feature
    input_ids = torch.tensor(tensors_for_inference.input_ids, dtype=torch.long, device=device).unsqueeze(0)
    segment_ids = torch.tensor(tensors_for_inference.segment_ids, dtype=torch.long, device=device).unsqueeze(0)
    input_mask = torch.tensor(tensors_for_inference.input_mask, dtype=torch.long, device=device).unsqueeze(0)
    # run prediction
    with torch.no_grad():
        start_logits, end_logits = predictor_model(input_ids, segment_ids, input_mask)
    # post-processing
    start_logits = start_logits[0].detach().cpu().tolist()
    end_logits = end_logits[0].detach().cpu().tolist()
    prediction = get_predictions(doc_tokens, tokens_for_postprocessing,
                                 start_logits, end_logits, n_best_size,
                                 max_answer_length, do_lower_case,
                                 can_give_negative_answer,
                                 null_score_diff_threshold)
    return prediction
def __init__(self, args):
    self.args = args
    self.processor = MrpcProcessor()
    self.label_list = self.processor.get_labels()
    self.tokenizer = BertTokenizer(args.vocab_file,
                                   do_lower_case=args.do_lower_case)
def test(args, new_dirs=None, dev_as_test=None):
    # Load a trained model that you have fine-tuned (evaluation runs on GPU).
    processor = data_utils.AscProcessor()
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(
        modelconfig.MODEL_ARCHIVE_MAP[args.bert_model])
    if dev_as_test:
        data_dir = os.path.join(args.data_dir, 'dev_as_test')
    else:
        data_dir = args.data_dir
    eval_examples = processor.get_test_examples(data_dir)
    eval_features = data_utils.convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, "asc")

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    model = torch.load(os.path.join(new_dirs, "model.pt"))
    model.cuda()
    model.eval()

    full_logits = []
    full_label_ids = []
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.cuda() for t in batch)
        input_ids, segment_ids, input_mask, label_ids = batch

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.cpu().numpy()
        full_logits.extend(logits.tolist())
        full_label_ids.extend(label_ids.tolist())

    output_eval_json = os.path.join(new_dirs, "predictions.json")
    with open(output_eval_json, "w") as fw:
        json.dump({"logits": full_logits, "label_ids": full_label_ids}, fw)
def evaluate(model, result_file):
    with open(args.predict_example_files, 'rb') as f:
        eval_examples = pickle.load(f)

    with torch.no_grad():
        tokenizer = BertTokenizer.from_pretrained('../roberta_wwm_ext', do_lower_case=True)
        model.eval()
        pred_answers, ref_answers = [], []
        for step, example in enumerate(tqdm(eval_examples)):
            start_probs, end_probs = [], []
            question_text = example['question_text']
            for p_num, doc_tokens in enumerate(example['doc_tokens'][:args.max_para_num]):
                (input_ids, input_ids_q, input_mask,
                 input_mask_q, segment_ids) = predict_data.predict_data(
                    question_text, doc_tokens['doc_tokens'], tokenizer,
                    args.max_seq_length, args.max_query_length)
                # input_ids = torch.tensor(input_ids).unsqueeze(0)
                # input_mask = torch.tensor(input_mask).unsqueeze(0)
                # segment_ids = torch.tensor(segment_ids).unsqueeze(0)
                start_prob, end_prob, can_logit = model(
                    input_ids, input_ids_q,
                    token_type_ids=segment_ids,
                    attention_mask=input_mask,
                    attention_mask_q=input_mask_q)
                start_probs.append(start_prob.squeeze(0))
                end_probs.append(end_prob.squeeze(0))
            best_answer, docs_index = find_best_answer(example, start_probs, end_probs)
            print(best_answer)
            pred_answers.append({
                'question_id': example['id'],
                'question': example['question_text'],
                # 'question_type': example['question_type'],
                'answers': [best_answer],
                'entity_answers': [[]],
                'yesno_answers': []
            })
            if 'answers' in example:
                ref_answers.append({
                    'question_id': example['id'],
                    # 'question_type': example['question_type'],
                    'answers': example['answers'],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })

    with open(result_file, 'w', encoding='utf-8') as fout:
        for pred_answer in pred_answers:
            fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
    with open("../metric/ref_dev.json", 'w', encoding='utf-8') as fout:
        for ref_answer in ref_answers:
            fout.write(json.dumps(ref_answer, ensure_ascii=False) + '\n')
def __init__(
    self,
    eval_script: str = "data/squad/v1.1/evaluate-v1.1.py",
    predict_file: str = "",
    output_dir: str = "./",
    n_best_size: int = 20,
    max_answer_length: int = 30,
    version_2_with_negative: bool = False,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    vocab_file: str = "",
    do_lower_case: bool = True,
    max_len: int = 512,
):
    tokenizer = BertTokenizer(vocab_file, do_lower_case=do_lower_case,
                              max_len=max_len)  # for bert large
    self.eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)
    self.eval_features = convert_examples_to_features(
        examples=self.eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )
    self.output_dir = output_dir
    self.eval_script = eval_script
    self.predict_file = predict_file
    args = Namespace(
        version_2_with_negative=version_2_with_negative,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        verbose_logging=False,
        do_lower_case=do_lower_case,
    )
    self.args = args
    self.all_results: List[RawResult] = []
def get_dataloader(args):
    '''Return a DataLoader for inference.'''
    # Preprocess input data
    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case,
                              max_len=512)  # for bert large

    cached_features_file = args.predict_file + '_{}_{}.bin'.format(
        args.max_seq_length, args.doc_stride)
    try:
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:
        # No (valid) cache yet: build the features from scratch and cache them.
        eval_examples = read_squad_examples(
            input_file=args.predict_file,
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)

    data = []
    for feature in eval_features:
        input_ids = torch.tensor(feature.input_ids, dtype=torch.int64)
        input_mask = torch.tensor(feature.input_mask, dtype=torch.int64)
        segment_ids = torch.tensor(feature.segment_ids, dtype=torch.int64)
        inp = (input_ids, segment_ids, input_mask)
        data.append(inp)

    if args.nbatches > 0:
        data = data[:args.nbatches * args.batch_size]

    test_loader = torch.utils.data.DataLoader(
        data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=1,
        pin_memory=True)
    return test_loader
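# Hedged usage sketch: driving get_dataloader() with an already-loaded QA model.
# `model`, `args`, and the device handling here are assumptions about the surrounding
# script, not part of the original code.
def run_inference(model, args, device='cuda'):
    model.eval()
    results = []
    for input_ids, segment_ids, input_mask in get_dataloader(args):
        with torch.no_grad():
            start_logits, end_logits = model(input_ids.to(device),
                                             segment_ids.to(device),
                                             input_mask.to(device))
        results.append((start_logits.cpu(), end_logits.cpu()))
    return results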
def predict_single_sentence(model_weight_path, text):
    # initialize basic options
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    output_mode = 'classification'
    processor = PuriProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    bert_model = 'bert-base-multilingual-uncased'
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

    # convert text to feature
    example = processor.create_single_example(text)
    feature = convert_single_example_to_feature(example, 128, tokenizer, output_mode)

    # create model
    model = BertForSequenceClassification.from_pretrained(
        model_weight_path, num_labels=num_labels)
    model.eval()

    preds = []
    input_ids = torch.tensor(feature.input_ids, dtype=torch.long).unsqueeze(0)
    input_mask = torch.tensor(feature.input_mask, dtype=torch.long).unsqueeze(0)
    segment_ids = torch.tensor(feature.segment_ids, dtype=torch.long).unsqueeze(0)
    # label_ids = feature.label_ids

    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)

    if len(preds) == 0:
        preds.append(logits.detach().cpu().numpy())
    else:
        preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

    preds = preds[0]
    result = np.argmax(preds, axis=1)
    return preds, result
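# Hedged usage sketch: classify one sentence and report class probabilities.
# The checkpoint directory passed by the caller is a placeholder assumption.
def classify_sentence(model_weight_path, text):
    logits, predicted = predict_single_sentence(model_weight_path, text)
    probabilities = torch.softmax(torch.from_numpy(logits), dim=-1)
    return int(predicted[0]), probabilities.squeeze(0).tolist()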
def infer(input, target_start_id, target_end_id, args):
    sent = input.split(" ")
    assert 0 <= target_start_id < target_end_id <= len(sent)
    target = " ".join(sent[target_start_id:target_end_id])

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    label_list = ["0", "1"]
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, num_labels=num_labels)
    model.to(device)

    print(f"input: {input}\ntarget: {target}")
    examples = construct_context_gloss_pairs_through_nltk(
        input, target_start_id, target_end_id)
    eval_features, candidate_results = convert_to_features(examples, tokenizer)

    input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

    model.eval()
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids,
                       token_type_ids=segment_ids,
                       attention_mask=input_mask,
                       labels=None)
    logits_ = F.softmax(logits, dim=-1)
    logits_ = logits_.detach().cpu().numpy()
    # Pick the candidate gloss whose positive-class ("1") score is highest.
    output = np.argmax(logits_, axis=0)[1]
    print(f"results:\ngloss: {candidate_results[output][1]}")
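# Hedged usage sketch for infer(): the sentence, target span, and args fields below
# are illustrative assumptions; args.bert_model would normally point at a fine-tuned
# gloss-classification checkpoint rather than the base model.
from argparse import Namespace

def demo_infer():
    example_args = Namespace(bert_model='bert-base-uncased', no_cuda=True)
    # Disambiguate the word "bank" (tokens 4..5) in the example sentence.
    infer("He sat by the bank of the river", 4, 5, example_args)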
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
def main(): print(torch.cuda.is_available()) parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--input_dir", default=None, type=str, required=True, help= "The input train corpus. can be directory with .txt files or a path to a single file" ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output file where the model checkpoints will be written.") ## Other parameters # bool parser.add_argument( "--mode", type=str, ) # str parser.add_argument( "--bert_model", default="bert-large-uncased", type=str, required=False, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default="", type=str, required=False) parser.add_argument("--local_rank", default=0, type=int) # int parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--dupe_factor", default=1, type=int, help= "Number of times to duplicate the input data (with different masks).") parser.add_argument("--max_predictions_per_seq", default=20, type=int, help="Maximum sequence length.") parser.add_argument("--sentence_batch_size", default=32, type=int) parser.add_argument("--top_sen_rate", default=0.8, type=float) parser.add_argument("--threshold", default=0.2, type=float) # floats parser.add_argument("--masked_lm_prob", default=0.15, type=float, help="Masked LM probability.") parser.add_argument( "--short_seq_prob", default=0.1, type=float, help= "Probability to create a sequence shorter than maximum sequence length" ) parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." 
) parser.add_argument('--random_seed', type=int, default=12345, help="random seed for initialization") parser.add_argument('--part', type=int, default=0) parser.add_argument('--max_proc', type=int, default=1) parser.add_argument('--with_rand', action='store_true') parser.add_argument('--split_part', type=int) args = parser.parse_args() print(args) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) logger = logging.getLogger(__name__) logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) rng = random.Random(args.random_seed) np.random.seed(args.random_seed) torch.manual_seed(args.random_seed) print("creating instance from {}".format(args.input_dir)) processor = processors[args.task_name]() eval_examples = processor.get_pretrain_examples(args.input_dir, args.part, args.max_proc) if args.task_name == "absa" or args.task_name == "absa_term": data = eval_examples all_labels = None else: data = [example.text_a for example in eval_examples] all_labels = [example.label for example in eval_examples] del eval_examples label_list = processor.get_labels() logger.info("Bert Model: {}".format(args.bert_model)) if args.mode == "rand": print("Mode: rand") generator = RandMask(args.masked_lm_prob, args.bert_model, args.do_lower_case, args.max_seq_length) elif args.mode == "rule": print("Mode: rule") if args.task_name == "absa" or args.task_name == "absa_term": generator = ASC(args.masked_lm_prob, args.top_sen_rate, args.threshold, args.bert_model, args.do_lower_case, args.max_seq_length, label_list, args.sentence_batch_size) else: generator = SC(args.masked_lm_prob, args.top_sen_rate, args.threshold, args.bert_model, args.do_lower_case, args.max_seq_length, label_list, args.sentence_batch_size) else: print("Mode: model") generator = ModelGen(args.masked_lm_prob, args.bert_model, args.do_lower_case, args.max_seq_length, args.sentence_batch_size, with_rand=args.with_rand) if args.with_rand: instances, rand_instances, labeled_data = create_training_instances( data, all_labels, args.task_name, generator, args.max_seq_length, args.dupe_factor, args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq, rng, with_rand=args.with_rand) else: instances, labeled_data = create_training_instances( data, all_labels, args.task_name, generator, args.max_seq_length, args.dupe_factor, args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq, rng, with_rand=args.with_rand) if args.part >= 0: output_file = os.path.join(args.output_dir, "model", "{}.hdf5".format(args.part)) if args.with_rand: rand_output_file = os.path.join(args.output_dir, "rand", "{}.hdf5".format(args.part)) labeled_output_file = os.path.join(args.output_dir, "{}.pkl".format(args.part)) else: output_file = os.path.join(args.output_dir, "model", "0.hdf5") if args.with_rand: rand_output_file = os.path.join(args.output_dir, "rand", "0.hdf5") labeled_output_file = os.path.join(args.output_dir, "0.pkl") if args.mode == "rule": print("Writing labeled data(.pkl) for rule mode") write_labeled_data(labeled_data, labeled_output_file) else: print("Writing masked data(.hdf5) for model mode") if args.with_rand: print("Num instances: {}. 
Num rand instance: {}".format( len(instances), len(rand_instances))) write_instance_to_example_file(instances, tokenizer, args.max_seq_length, args.max_predictions_per_seq, output_file) write_instance_to_example_file(rand_instances, tokenizer, args.max_seq_length, args.max_predictions_per_seq, rand_output_file) else: print("Num instances: {}.".format(len(instances))) write_instance_to_example_file(instances, tokenizer, args.max_seq_length, args.max_predictions_per_seq, labeled_output_file)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--sample", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--word_embedding_file", default='./emb/wiki-news-300d-1M.vec', type=str, help="The input directory of word embeddings.") parser.add_argument("--index_path", default='./emb/p_index.bin', type=str, help="The input directory of word embedding index.") parser.add_argument("--word_embedding_info", default='./emb/vocab_info.txt', type=str, help="The input directory of word embedding info.") parser.add_argument("--data_file", default='', type=str, help="The input directory of input data file.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--max_ngram_length", default=16, type=int, help="The maximum total ngram sequence") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--embedding_size", default=300, type=int, help="Total batch size for embeddings.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--num_eval_epochs", default=3.0, type=float, help="Total number of eval epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--single', action='store_true', help="Whether only evaluate a single epoch") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=args.do_lower_case) # Parameters directly assignment in the code #DATA_DIR = 'data' DATA_DIR = './data/sst-2/train.tsv' # For debugger IN_TEXT = 'cleaned_haiku.data' IN_W2V = 'w2v_haiku.model' # to-do: think of an open vocabulary system WORD_LIMIT = 9999 # remaining 1 for <PAD> (this is inclusive of UNK) #task_name = "" TARGET_PAD_IDX = -1 INPUT_PAD_IDX = 0 keyboard_mappings = None text = encoder = None print("args.num_train_epochs :", args.num_train_epochs) def _create_examples(lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): #print("line :", line) #print("line[0]: ", line[0]) #print("line[1]", line[1]) flaw_labels = None if i == 0: continue guid = "%s-%s" % (set_type, i) text_a = line[0] label = line[1] if len(line) > 2: flaw_labels = line[2] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label, flaw_labels=flaw_labels)) return examples def _create_examples_clean(lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): flaw_labels = None if i == 0: continue guid = "%s-%s" % (set_type, i) text_raw = line[0] text_a = clean_text(text_raw) label = line[1] if len(line) > 2: flaw_labels = line[2] examples.append( InputExample(guid=guid, text_a=text_a, text_b=None, label=label, flaw_labels=flaw_labels)) return examples def _read_tsv(input_file, quotechar=None): """Reads a tab separated value file.""" with open(input_file, "r") as f: reader = csv.reader(f, delimiter="\t", quotechar=quotechar) lines = [] for line in reader: if sys.version_info[0] == 2: line = list(unicode(cell, 'utf-8') for cell in line) lines.append(line) return lines def get_train_examples(DATA_DIR): """See base class.""" if 'tsv' in DATA_DIR: return _create_examples(_read_tsv(DATA_DIR), "train") else: return _create_examples( _read_tsv(os.path.join(DATA_DIR, "train.tsv")), "train") def get_train_examples_clean(DATA_DIR): """See base class.""" if 'tsv' in DATA_DIR: return _create_examples_clean(_read_tsv(DATA_DIR), "train") else: return _create_examples_clean( _read_tsv(os.path.join(DATA_DIR, "train.tsv")), "train") def get_text_from_train_examples(train_examples): all_text_train_examples = [] for example in train_examples: all_text_train_examples.append(example.text_a) return all_text_train_examples def get_text_and_labels_train_examples(train_examples): all_text_train_examples = [] all_labels_train_examples = [] label_map = {label: i for i, label in enumerate(label_list)} for 
example in train_examples: all_text_train_examples.append(example.text_a) all_labels_train_examples.append(label_map[example.label]) return all_text_train_examples, all_labels_train_examples text = None encoder = None def get_data(): #train_examples = get_train_examples(args.data_dir) train_examples = get_train_examples_clean(args.data_dir) text, labels = get_text_and_labels_train_examples(train_examples) logger.info("Loading word embeddings ...in Gensim format ") text_new = [txt.split() for txt in text] encoder = FastText(text_new, min_count=1, size=128) return text_new, text, encoder, labels def get_lines(start, end, text, encoder): text = text encoder = encoder seq_lens = [] sentences = [] longest = 0 text_batch = [] for i in range((end - start)): text_batch.append(text[i]) for l in text_batch: seq_lens.append(len(l)) longest = len(l) if len(l) > longest else longest sentence = [] for txt in l: sentence.append(torch.tensor(encoder.wv[txt])) sentences.append(torch.stack(sentence).unsqueeze(0)) # Pad input d_size = sentences[0].size(2) print("sentences: ", type(sentences)) print("sentences len: ", len(sentences)) for i in range(len(sentences)): sl = sentences[i].size(1) if sl < longest: sentences[i] = torch.cat( [sentences[i], torch.zeros(1, longest - sl, d_size)], dim=1) # Need to squish sentences into [0,1] domain seq = torch.cat(sentences, dim=0) # seq = torch.sigmoid(seq) start_words = seq[:, 0:1, :] packer = pack_padded_sequence(seq, seq_lens, batch_first=True, enforce_sorted=False) return packer, start_words """ word representation from bag of chars """ def get_boc_word_representation(word): return zero_vector() + bag_of_chars(word) + zero_vector() def one_hot(char): return [1.0 if ch == char else 0.0 for ch in CHAR_VOCAB] def bag_of_chars(chars): return [float(chars.count(ch)) for ch in CHAR_VOCAB] def zero_vector(): return [0.0 for _ in CHAR_VOCAB] """ word representation from individual chars one hot (first char) + bag of chars (middle chars) + one hot (last char) """ def get_swap_word_representation(word): # dirty case if len(word) == 1 or len(word) == 2: rep = one_hot(word[0]) + zero_vector() + one_hot( word[-1]) # Return value used return rep, word rep = one_hot(word[0]) + bag_of_chars(word[1:-1]) + one_hot( word[-1]) # Return value used if len(word) > 3: idx = random.randint(1, len(word) - 3) word = word[:idx] + word[idx + 1] + word[idx] + word[ idx + 2:] # return value not used return rep, word def get_closest(sentences): scores = [] wv = encoder.wv for s in sentences.detach().numpy(): st = [ wv[wv.most_similar([s[i]], topn=1)[0][0]] for i in range(s.shape[0]) ] scores.append(torch.tensor(st)) return torch.stack(scores, dim=0) torch.set_num_threads(16) device = torch.device("cpu") sample_task = args.sample.lower() def adversarial_attacks(start, end, encoder): train_examples_batch = processor.get_train_examples_for_attacks( args.data_dir, start, end) features_for_attacks, w2i_disp, i2w_disp, vocab_size = convert_examples_to_features_gan2vec( train_examples_batch, label_list, tokenizer=None, max_seq_length=6) all_tokens = torch.tensor([f.token_ids for f in features_for_attacks], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in features_for_attacks], dtype=torch.long) data_for_attacks = TensorDataset(all_tokens, all_label_id) sampler_for_attacks = SequentialSampler(data_for_attacks) dataloader_for_attack = DataLoader(data_for_attacks, sampler=sampler_for_attacks) all_batch_flaw_tokens = [] all_batch_flaw_labels = [] all_batch_flaw_labels_truth = [] 
flaw_labels_lst = [] for step, batch in enumerate( tqdm(dataloader_for_attack, desc="Iteration")): batch = tuple(t.to(device) for t in batch) tokens, _ = batch # , label_id, ngram_ids, ngram_labels, ngram_masks tokens = tokens.to('cpu').numpy() features_with_flaws, all_flaw_tokens, all_token_idx, all_truth_tokens, all_flaw_labels_truth = convert_examples_to_features_flaw_attacks_gr( tokens, args.max_seq_length, args.max_ngram_length, i2w, tokenizer, embeddings=None, emb_index=None, words=None) all_token_idx = ",".join( [str(id) for tok in all_token_idx for id in tok]) all_truth_tokens_flat = ' '.join( [str(id) for tok in all_truth_tokens for id in tok]) flaw_ids = torch.tensor([f.flaw_ids for f in features_with_flaws]) flaw_labels = torch.tensor( [f.flaw_labels for f in features_with_flaws]) print("all_flaw_tokens : before before ", all_flaw_tokens) all_flaw_tokens = ' '.join( [str(y) for x in all_flaw_tokens for y in x]) flaw_ids_ar = flaw_ids.detach().cpu().numpy() flaw_ids_lst = flaw_ids.tolist() flaw_labels_ar = flaw_labels.detach().cpu().numpy() flaw_labels_lst = flaw_labels.tolist() all_flaw_tokens = all_flaw_tokens.strip("''").strip("``") all_truth_tokens_flat = all_truth_tokens_flat.strip("''").strip( "``") all_batch_flaw_tokens.append(all_flaw_tokens) all_batch_flaw_labels.append(flaw_labels_lst) all_batch_flaw_labels_truth.append(all_flaw_labels_truth) batch_tx = [] BATCH_SEQ_LEN = [] Xtype = torch.FloatTensor for line in all_batch_flaw_tokens: SEQ_LEN = len(line.split()) line = line.lower() X = get_target_representation(line, encoder) batch_tx.append(X) BATCH_SEQ_LEN.append(SEQ_LEN) X_t = torch.tensor(batch_tx, dtype=torch.float) real_adv = pack_padded_sequence(X_t, BATCH_SEQ_LEN, batch_first=True) all_batch_flaw_labels_truth_t = torch.tensor( all_batch_flaw_labels_truth, dtype=torch.long) all_batch_flaw_labels_truth_t_s = torch.squeeze( all_batch_flaw_labels_truth_t) return X_t, all_batch_flaw_labels_truth_t_s def get_loss(): return loss_nll, loss_nll def train(epochs, batch_size=256, latent_size=256, K=1): text, text_orig, encoder, labels = get_data() num_samples = len(text) create_vocab(args.data_dir, text_orig) G = Generator(128, 128) D = Discriminator(128, len(CHAR_VOCAB), encoder) l2 = nn.MSELoss() loss = get_loss() opt_d = Adam(D.parameters(), lr=0.002, betas=(0.5, 0.999)) opt_g = Adam(G.parameters(), lr=0.002, betas=(0.5, 0.999)) max_seq_len = args.max_seq_length for e in range(epochs): i = 0 while batch_size * i < num_samples: stime = time.time() start = batch_size * i end = min(batch_size * (i + 1), num_samples) bs = end - start # Fixed labels test_seq_length = 6 zeros = torch.zeros(bs, test_seq_length, dtype=torch.long) ones = torch.ones(bs, test_seq_length, dtype=torch.long) tl = torch.full((bs, 1), 0.9) fl = torch.full((bs, 1), 0.1) real, greal = get_lines(start, end, text, encoder) # Train Generator as per RobGAN for _ in range(K): opt_g.zero_grad() # GAN fooling ability fake = G(greal) d_fake_bin, d_fake_multi = D(fake) g_loss = loss_nll(d_fake_bin, tl, d_fake_multi, zeros, lam=0.5) g_loss.backward() opt_g.step() g_loss = g_loss.item() # Train descriminator opt_d.zero_grad() fake = G(greal) d_fake_bin_d, d_fake_multi_d = D(fake) d_f_loss = loss_nll(d_fake_bin_d, fl, d_fake_multi_d, ones, lam=0.5) real_adv, flaw_labels = adversarial_attacks( start, end, encoder) d_adv_bin, d_adv_multi = D(real_adv) d_adv_loss = loss_nll(d_adv_bin, tl, d_adv_multi, flaw_labels, lam=0.5) d_loss_total = d_adv_loss + d_f_loss d_loss_total.backward() opt_d.step() i += 1 torch.save(D, 
'Discriminator.model') torch.save(G, 'generator.model') if sample_task == 'developing': train(10, batch_size=256)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--output_dir", default='output', type=str, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument("--checkpoint", default='pretrain_ckpt/bert_small_ckpt.bin', type=str, help="checkpoint") parser.add_argument("--model_config", default='data/bert_small.json', type=str) # Other parameters parser.add_argument("--train_file", default='data/KorQuAD_v1.0_train.json', type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=96, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=8.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--adam_epsilon", default=1e-6, type=float, help="Epsilon for Adam optimizer.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O2', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." 
) args = parser.parse_args() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {} n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer('data/ko_vocab_32k.txt', max_len=args.max_seq_length, do_basic_tokenize=True) # Prepare model config = Config.from_json_file(args.model_config) model = QuestionAnswering(config) model.bert.load_state_dict(torch.load(args.checkpoint)) num_params = count_parameters(model) logger.info("Total Parameter: %d" % num_params) model.to(device) model = torch.nn.DataParallel(model) cached_train_features_file = args.train_file + '_{0}_{1}_{2}'.format( str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_examples = read_squad_examples(input_file=args.train_file, is_training=True, version_2_with_negative=False) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) num_train_optimization_steps = int( len(train_features) / args.train_batch_size) * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule( optimizer, warmup_steps=num_train_optimization_steps * 0.1, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) num_train_step = num_train_optimization_steps input_ids = np.load('input_ids2.npy') input_mask = np.load('input_mask.npy') input_segments = np.load('input_segments.npy') start_prob = np.load('start_prob.npy') end_prob = np.load('end_prob.npy') start_label = np.load('input_start.npy') stop_label = np.load('input_stop.npy') """ for i in range(1000): print(input_ids[i]) print(max(start_prob[i])) print(sum(start_prob[i])) input() """ paragraph = torch.tensor(input_ids.astype( np.int64)).type(dtype=torch.long).cuda() paragraph_mask = torch.tensor(input_mask.astype( np.int64)).type(dtype=torch.long).cuda() paragraph_segments = torch.tensor(input_segments.astype( np.int64)).type(dtype=torch.long).cuda() start_prob = torch.tensor(start_prob.astype( np.float32)).type(dtype=torch.float32).cuda() end_prob = torch.tensor(end_prob.astype( np.float32)).type(dtype=torch.float32).cuda() start_label = torch.tensor(start_label.astype( np.int64)).type(dtype=torch.long).cuda() stop_label = torch.tensor(stop_label.astype( np.int64)).type(dtype=torch.long).cuda() train_data = TensorDataset(paragraph, paragraph_mask, paragraph_segments, start_label, stop_label, start_prob, end_prob) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() global_step = 0 epoch = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): iter_bar = tqdm( train_dataloader, desc="Train(XX Epoch) Step(XX/XX) (Mean loss=X.X) (loss=X.X)") tr_step, total_loss, mean_loss = 0, 0., 0. for step, batch in enumerate(iter_bar): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions, start_probs, end_probs = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions, start_probs, end_probs) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 tr_step += 1 total_loss += loss mean_loss = total_loss / tr_step iter_bar.set_description( "Train Step(%d / %d) (Mean loss=%5.5f) (loss=%5.5f)" % (global_step, num_train_step, mean_loss, loss.item())) logger.info("** ** * Saving file * ** **") model_checkpoint = "korquad_%d.bin" % (epoch) logger.info(model_checkpoint) output_model_file = os.path.join(args.output_dir, model_checkpoint) if n_gpu > 1: torch.save(model.module.state_dict(), output_model_file) else: torch.save(model.state_dict(), output_model_file) epoch += 1
def main(): global logger args, task_config = parse_input_parameter() random.seed(args.seed) os.environ['PYTHONHASHSEED'] = str(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # if you are using multi-GPU. torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True device, n_gpu = init_device(args) data_name = args.data_name.lower() if data_name in DATASET_DICT: args.train_file = DATASET_DICT[data_name]["train_file"] if args.do_eval: args.valid_file = DATASET_DICT[data_name]["valid_file"] if args.do_test: args.test_file = DATASET_DICT[data_name]["test_file"] else: assert args.train_file is not None if args.do_eval: assert args.valid_file is not None if args.do_test: assert args.test_file is not None task_name = args.task_name.lower() if task_name not in DATALOADER_DICT: raise ValueError("Task not found: %s" % (task_name)) if n_gpu > 1 and (args.use_ghl): logger.warning("Multi-GPU make the results not reproduce.") tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Generate label list from training dataset file_path = os.path.join(args.data_dir, args.train_file) train_dataloader, train_examples, label_tp_list = DATALOADER_DICT[ task_name]["train"](args, tokenizer, file_path) logging.info("AT Labels are = %s:", "[" + ", ".join(label_tp_list[0]) + "]") logging.info("AS Labels are = %s:", "[" + ", ".join(label_tp_list[1]) + "]") at_num_labels = len(label_tp_list[0]) as_num_labels = len(label_tp_list[1]) num_tp_labels = (at_num_labels, as_num_labels) task_config["at_labels"] = label_tp_list[0] model = init_model(args, num_tp_labels, task_config, device, n_gpu) # Generate test dataset logger.info("***** Running test *****") if data_name.startswith('joint'): # May need to perform testing on more than one domain test_dataloaders_list = [] test_examples_list = [] for td, test_file_name in enumerate(args.test_file): file_path = os.path.join(args.data_dir, test_file_name) test_dataloader, test_examples = DATALOADER_DICT[task_name][ "eval"](args, tokenizer, file_path, label_tp_list=label_tp_list, set_type="test") test_dataloaders_list.append(test_dataloader) test_examples_list.append(test_examples) logger.info(" Domain %d", td) logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) else: file_path = os.path.join(args.data_dir, args.test_file) test_dataloader, test_examples = DATALOADER_DICT[task_name]["eval"]( args, tokenizer, file_path, label_tp_list=label_tp_list, set_type="test") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) if args.do_train: num_train_optimization_steps = ( int(len(train_dataloader) + args.gradient_accumulation_steps - 1) / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) optimizer = prep_optimizer(args, model, num_train_optimization_steps) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.do_eval: logger.info("***** Running evaluation *****") if data_name.startswith('joint'): # May need to perform evaluation on more than one domain eval_dataloaders_list = [] eval_examples_list = [] for ed, eval_file_name in 
enumerate(args.valid_file): file_path = os.path.join(args.data_dir, eval_file_name) eval_dataloader, eval_examples = DATALOADER_DICT[ task_name]["eval"](args, tokenizer, file_path, label_tp_list=label_tp_list, set_type="val") eval_dataloaders_list.append(eval_dataloader) eval_examples_list.append(eval_examples) logger.info(" Domain %d", ed) logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) else: file_path = os.path.join(args.data_dir, args.valid_file) eval_dataloader, eval_examples = DATALOADER_DICT[task_name][ "eval"](args, tokenizer, file_path, label_tp_list=label_tp_list, set_type="val") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) global_step = 0 for epoch in range(args.num_train_epochs): tr_loss, global_step = train_epoch(epoch, args, model, train_dataloader, device, n_gpu, tokenizer, optimizer, global_step, num_train_optimization_steps) logger.info("Epoch %d/%s Finished, Train Loss: %f", epoch + 1, args.num_train_epochs, tr_loss) save_model(epoch, args, model) if args.do_eval: if data_name.startswith('joint'): for ed, eval_dataloader in enumerate( eval_dataloaders_list): logger.info(" Domain %d", ed) eval_epoch(model, eval_dataloader, label_tp_list, device) else: eval_epoch(model, eval_dataloader, label_tp_list, device) if args.do_test: logger.info("***Results on test***") if data_name.startswith('joint'): for td, test_dataloader in enumerate(test_dataloaders_list): logger.info(" Domain %d", td) eval_epoch(model, test_dataloader, label_tp_list, device) else: eval_epoch(model, test_dataloader, label_tp_list, device) elif args.do_test: if args.init_model: if data_name.startswith('joint'): for td, test_dataloader in enumerate(test_dataloaders_list): logger.info(" Domain %d", td) eval_epoch(model, test_dataloader, label_tp_list, device) else: eval_epoch(model, test_dataloader, label_tp_list, device) else: for epoch in range(args.num_train_epochs): # Load a trained model that you have fine-tuned model = load_model(epoch, args, num_tp_labels, task_config, device) if not model: break if data_name.startswith('joint'): for td, test_dataloader in enumerate( test_dataloaders_list): logger.info(" Domain %d", td) eval_epoch(model, test_dataloader, label_tp_list, device) else: eval_epoch(model, test_dataloader, label_tp_list, device)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--load_finetuned_model', action='store_true', default=False,
                        help="Load finetuned model.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "compq": COMPQProcessor,
    }

    output_modes = {
        "compq": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    if args.load_finetuned_model:
        print("Loading finetuned model....")
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                                 t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        softmax_preds = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                softmax_preds.append(Softmax(1)(logits).detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                softmax_preds[0] = np.append(softmax_preds[0],
                                             Softmax(1)(logits).detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        softmax_preds = softmax_preds[0]

        output_prediction_file = os.path.join(args.output_dir, "predictions.txt")
        with open(output_prediction_file, 'w') as writer:
            for i, pred in enumerate(softmax_preds):
                writer.write(str(pred[0]) + '\t' + str(pred[1]) + '\t' + eval_examples[i].text_a + '\n')

        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss / nb_tr_steps if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
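# Usage sketch. Each listing in this document appears to come from a separate
# script, so this entry-point guard and the example command are illustrative
# additions (the file name run_compq.py is hypothetical; the flags come from
# the parser above):
#
#     python run_compq.py --data_dir data/compq --bert_model bert-base-uncased \
#         --task_name compq --output_dir out/compq \
#         --do_train --do_eval --do_lower_case
#
if __name__ == "__main__":
    main()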
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default="bert-base-cased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="/Users/lifuh/Documents/Research/squad2.0/output/", type=str,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default="/Users/lifuh/Documents/Research/squad2.0/train-v2.0.json",
                        type=str, help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default="/Users/lifuh/Documents/Research/squad2.0/dev-v2.0.json",
                        type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=True, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    # initialize a tokenizer from a pretrained model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file, is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used,
    # and would otherwise produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info(" Num orig examples = %d", len(train_examples))
        logger.info(" Num split examples = %d", len(train_features))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        all_is_impossibles = torch.tensor([int(f.is_impossible) for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions, all_is_impossibles)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, _ = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    # (warmup_linear is assumed to be imported alongside BertAdam)
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total,
                                                                      args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = BertForQuestionAnswering.from_pretrained(args.bert_model, state_dict=model_state_dict)
        model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info(" Num orig examples = %d", len(eval_examples))
        logger.info(" Num split examples = %d", len(eval_features))
        logger.info(" Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, True, args.null_score_diff_threshold)
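# Usage sketch for the SQuAD 2.0 runner above (the script name run_squad2.py
# is hypothetical and the entry-point guard is an illustrative addition; all
# flags come from the parser above):
#
#     python run_squad2.py --bert_model bert-base-cased \
#         --train_file train-v2.0.json --predict_file dev-v2.0.json \
#         --output_dir out/squad2 --do_train \
#         --null_score_diff_threshold 0.0
#
if __name__ == "__main__":
    main()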
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint", default=None, type=str, required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1.0, type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
        num_choices=4)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used,
    # and would otherwise produce None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Terminate early for benchmarking
                if args.max_steps > 0 and global_step > args.max_steps:
                    break

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForMultipleChoice(config, num_choices=4)
        # noinspection PyUnresolvedReferences
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training=True)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        # noinspection PyUnboundLocalVariable
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': tr_loss / nb_tr_steps}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
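# The accuracy() helper called in the eval loop above is not shown in this
# listing. A minimal sketch consistent with how it is used (it must return a
# raw match count, since the caller divides by nb_eval_examples; the name
# below is hypothetical):
def accuracy_sketch(out, labels):
    # argmax over the num_choices axis, then count exact matches
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)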
def main(args):
    args.data_dir = os.path.join(args.data_dir, args.task_name)
    args.output_dir = os.path.join(args.output_dir, args.task_name)
    logger.info("args = %s", args)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
        "emo": EmoProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
        "emo": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    # device = torch.device('cpu')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        logger.info("Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        try:
            os.makedirs(args.output_dir)
        except OSError:
            logger.info("failed to create output directory %s", args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    # tokenizer = BertTokenizer.from_pretrained(args.vocab_file, do_lower_case=args.do_lower_case)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))

    # use bert to aug train_examples
    ori_train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_dev_examples(args.data_dir)
    test_examples = processor.get_test_examples(args.data_dir)

    num_train_optimization_steps = int(
        len(ori_train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    if args.use_saved == 1:
        bert_saved_dir = args.ckpt
        model = BertForNSPAug.from_pretrained(bert_saved_dir,
                                              cache_dir=args.ckpt_cache_dir,
                                              num_labels=num_labels,
                                              args=args)
    else:
        model = BertForNSPAug.from_pretrained(args.bert_model,
                                              cache_dir=cache_dir,
                                              num_labels=num_labels,
                                              args=args)
    model.cuda()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        global_step = 0
        best_val_acc = 0.0
        first_time = time.time()

        logger.info("*********************************** Running training ***********************************")
        logger.info(" Num original examples = %d", len(ori_train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        model.train()
        aug_ratio = 0.0
        # aug_ratio = 0.2
        aug_seed = np.random.randint(0, 1000)
        for epoch in range(int(args.num_train_epochs)):
            logger.info("epoch=%d, aug_ratio = %f, aug_seed=%d", epoch, aug_ratio, aug_seed)
            train_examples = Aug_each_ckpt(ori_train_examples, label_list, model, tokenizer,
                                           args=args, num_show=args.num_show,
                                           output_mode=output_mode, seed=aug_seed,
                                           aug_ratio=aug_ratio, use_bert=False)
            if aug_ratio + args.aug_ratio_each < 1.0:
                aug_ratio += args.aug_ratio_each
            aug_seed += 1

            train_features = convert_examples_to_features(train_examples, label_list,
                                                          args.max_seq_length, tokenizer,
                                                          num_show=args.num_show,
                                                          output_mode=output_mode, args=args)
            logger.info("*********************************** Done convert features ***********************************")

            all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
            if output_mode == "classification":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
            elif output_mode == "regression":
                all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
            token_real_label = torch.tensor([f.token_real_label for f in train_features], dtype=torch.long)

            train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                       all_label_ids, token_real_label)
            if args.local_rank == -1:
                train_sampler = RandomSampler(train_data)
            else:
                train_sampler = DistributedSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            logger.info("*********************************** begin training ***********************************")
            tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0
            nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0
            preds = []
            all_labels = []
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.cuda() for t in batch)
                input_ids, input_mask, segment_ids, label_ids, token_real_label = batch
                seq_logits, aug_logits, aug_loss = model(input_ids, segment_ids, input_mask,
                                                         labels=None,
                                                         token_real_label=token_real_label)
                if output_mode == "classification":
                    # if task_name == "emo":
                    #     loss_fct = ...
                    # else:
                    loss_fct = CrossEntropyLoss()
                    seq_loss = loss_fct(seq_logits.view(-1, num_labels), label_ids.view(-1))
                    # print("[classification]label_ids: {}, size: {}".format(label_ids.view(-1), label_ids.view(-1).size()))
                    # print("[classification]seq_logits size: {}".format(seq_logits.view(-1, num_labels).size()))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    seq_loss = loss_fct(seq_logits.view(-1), label_ids.view(-1))

                token_real_label = token_real_label.detach().cpu().numpy()

                w = args.aug_loss_weight
                loss = (1 - w) * seq_loss + w * aug_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 10000.0)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                batch_loss = seq_loss.mean().item()
                tr_seq_loss += seq_loss.mean().item()

                seq_logits = seq_logits.detach().cpu().numpy()
                label_ids = label_ids.detach().cpu().numpy()
                if len(preds) == 0:
                    preds.append(seq_logits)
                    all_labels.append(label_ids)
                else:
                    preds[0] = np.append(preds[0], seq_logits, axis=0)
                    all_labels[0] = np.append(all_labels[0], label_ids, axis=0)

                aug_logits = aug_logits.detach().cpu().numpy()
                tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits, token_real_label, type="aug")
                train_aug_accuracy += tmp_train_aug_accuracy
                nb_tr_tokens += tmp_tokens
                tr_aug_loss += aug_loss.mean().item()

                if global_step % 20 == 0:
                    loss = tr_loss / nb_tr_steps
                    seq_loss = tr_seq_loss / nb_tr_steps
                    aug_loss = tr_aug_loss / nb_tr_steps
                    tmp_pred = preds[0]
                    tmp_labels = all_labels[0]
                    if output_mode == "classification":
                        tmp_pred = np.argmax(tmp_pred, axis=1)
                    elif output_mode == "regression":
                        tmp_pred = np.squeeze(tmp_pred)
                    res = accuracy(tmp_pred, tmp_labels, task_name=task_name)

                    if nb_tr_tokens != 0:
                        aug_avg = train_aug_accuracy / nb_tr_tokens
                    else:
                        aug_avg = 0.0

                    log_string = ""
                    log_string += "epoch={:<5d}".format(epoch)
                    log_string += " step={:<9d}".format(global_step)
                    log_string += " total_loss={:<9.7f}".format(loss)
                    log_string += " seq_loss={:<9.7f}".format(seq_loss)
                    log_string += " aug_loss={:<9.7f}".format(aug_loss)
                    log_string += " batch_loss={:<9.7f}".format(batch_loss)
                    log_string += " lr={:<9.7f}".format(optimizer.get_lr()[0])
                    log_string += " |g|={:<9.7f}".format(total_norm)
                    # log_string += " tr_seq_acc={:<9.7f}".format(seq_avg)
                    log_string += " tr_aug_acc={:<9.7f}".format(aug_avg)
                    log_string += " mins={:<9.2f}".format(float(time.time() - first_time) / 60)
                    for key in sorted(res.keys()):
                        log_string += " " + key + "= " + str(res[key])
                    logger.info(log_string)

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            train_loss = tr_loss / nb_tr_steps
            logger.info("*********************************** training epoch done ***********************************")

            if args.do_eval and (args.local_rank == -1
                                 or torch.distributed.get_rank() == 0) and epoch % 1 == 0:
                tot_time = float(time.time() - first_time) / 60
                eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch,
                                output_mode, num_labels, task_name, eval_examples, type="dev")
                eval_res["tot_time"] = tot_time

                if "acc" in eval_res:
                    tmp_acc = eval_res["acc"]
                elif "mcc" in eval_res:
                    tmp_acc = eval_res["mcc"]
                else:
                    tmp_acc = eval_res["corr"]

                result = {'eval_total_loss': eval_loss,
                          'eval_seq_loss': eval_seq_loss,
                          'eval_aug_loss': eval_aug_loss,
                          'eval_aug_accuracy': eval_aug_accuracy,
                          'global_step': global_step,
                          'train_loss': train_loss,
                          'train_batch_size': args.train_batch_size,
                          'args': args}

                if tmp_acc >= best_val_acc:
                    best_val_acc = tmp_acc
                    dev_test = "dev"
                    result.update({'best_epoch': epoch})

                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                    output_model_dir = os.path.join(args.output_dir, "dev_" + str(tmp_acc))
                    if not os.path.exists(output_model_dir):
                        os.makedirs(output_model_dir)
                    output_model_file = os.path.join(output_model_dir, WEIGHTS_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    output_config_file = os.path.join(output_model_dir, CONFIG_NAME)
                    with open(output_config_file, 'w') as f:
                        f.write(model_to_save.config.to_json_string())

                    result.update(eval_res)
                    result.update(res_parts)
                    # output_eval_file = os.path.join(args.output_dir,
                    #                                 dev_test + "_results_" + str(tmp_acc) + ".txt")
                    # with open(output_eval_file, "w") as writer:
                    logger.info("****************************** eval results ***********************************")
                    for key in sorted(result.keys()):
                        logger.info(" %s = %s", key, str(result[key]))
                        # writer.write("%s = %s\n" % (key, str(result[key])))
                else:
                    result = {'eval_total_loss': eval_loss,
                              'eval_seq_loss': eval_seq_loss,
                              'eval_aug_loss': eval_aug_loss,
                              'eval_aug_accuracy': eval_aug_accuracy,
                              'global_step': global_step,
                              'train_loss': train_loss,
                              'train_batch_size': args.train_batch_size,
                              'args': args}
                    result.update(eval_res)
                    result.update(res_parts)
                    logger.info("****************************** eval results ***********************************")
                    for key in sorted(result.keys()):
                        logger.info(" %s = %s", key, str(result[key]))

            # write test results
            if args.do_test:
                # res_file = os.path.join(args.output_dir,
                #                         "test_" + str(tmp_acc) + ".tsv")
                # idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model)
                # dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds})
                # dataframe.to_csv(res_file, index=False, sep='\t')
                # logger.info(" Num test length = %d", idx)
                logger.info("*********************************** Running test ***********************************")
                logger.info(" Num examples = %d", len(test_examples))
                logger.info(" Batch size = %d", args.eval_batch_size)
                test_loss, test_seq_loss, test_aug_loss, test_res, test_aug_accuracy, res_parts = \
                    do_evaluate(args, processor, label_list, tokenizer, model, epoch,
                                output_mode, num_labels, task_name, test_examples, type="test")
                result = {'test_total_loss': test_loss,
                          'test_seq_loss': test_seq_loss,
                          'test_aug_loss': test_aug_loss,
                          'test_aug_accuracy': test_aug_accuracy,
                          'global_step': global_step,
                          'args': args}
                result.update(test_res)
                result.update(res_parts)
                logger.info("****************************** test results ***********************************")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                logger.info("*********************************** test done ***********************************")
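# The training loop above grows aug_ratio by aug_ratio_each per epoch, but
# only while the bumped value would stay below 1.0. A standalone sketch of
# the same schedule (the concrete values aug_ratio_each=0.2 and 6 epochs are
# illustrative assumptions, not from the original):
#
#     ratios = []
#     aug_ratio = 0.0
#     for _ in range(6):
#         ratios.append(aug_ratio)
#         if aug_ratio + 0.2 < 1.0:
#             aug_ratio += 0.2
#     # ratios == [0.0, 0.2, 0.4, 0.6, 0.8, 0.8] -- the ratio saturates
#     # once another increment would reach 1.0.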
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--word_embedding_file", default='emb/crawl-300d-2M.vec', type=str, help="The input directory of word embeddings.") parser.add_argument("--index_path", default='emb/p_index.bin', type=str, help="The input directory of word embedding index.") parser.add_argument("--word_embedding_info", default='emb/vocab_info.txt', type=str, help="The input directory of word embedding info.") parser.add_argument("--data_file", default='', type=str, help="The input directory of input data file.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--max_ngram_length", default=16, type=int, help="The maximum total ngram sequence") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--embedding_size", default=300, type=int, help="Total batch size for embeddings.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--num_eval_epochs', type=int, default=0, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--single', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) logger.info("loading embeddings ... 
") if args.do_train: emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors( args.word_embedding_file) write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list) if args.do_eval: emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info) #emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(args.word_embedding_file) #write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list) logger.info("loading p index ...") if not os.path.exists(args.index_path): p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec, args.index_path) else: p = load_embedding_index(args.index_path, emb_vocab_size, num_dim=args.embedding_size) train_examples = None num_train_optimization_steps = None w2i, i2w, vocab_size = {}, {}, 1 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) train_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_train(\ train_examples, label_list, args.max_seq_length, args.max_ngram_length, tokenizer, emb_dict) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Num token vocab = %d", vocab_size) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_ngram_ids = torch.tensor([f.ngram_ids for f in train_features], dtype=torch.long) all_ngram_labels = torch.tensor( [f.ngram_labels for f in train_features], dtype=torch.long) all_ngram_masks = torch.tensor([f.ngram_masks for f in train_features], dtype=torch.long) all_ngram_embeddings = torch.tensor( [f.ngram_embeddings for f in train_features], dtype=torch.float) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForNgramClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels, embedding_size=args.embedding_size, max_seq_length=args.max_seq_length, max_ngram_length=args.max_ngram_length) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: # guard restored: the training tensors above only exist under --do_train train_data = TensorDataset(all_ngram_ids, all_ngram_labels, all_ngram_masks, all_ngram_embeddings) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ind in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) ngram_ids, ngram_labels, ngram_masks, ngram_embeddings = batch loss = model(ngram_ids, ngram_masks, ngram_embeddings) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'loss': loss, } output_eval_file = os.path.join(args.output_dir, "train_results.txt") with open(output_eval_file, "a") as writer: #logger.info("***** Training results *****") writer.write("epoch" + str(ind) + '\n') for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('\n') model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "epoch" + str(ind) + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_gnrt_dev_examples(args.data_file) eval_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_eval( eval_examples, label_list, args.max_seq_length, args.max_ngram_length, tokenizer, w2i, i2w, vocab_size) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Num token vocab = %d", vocab_size) logger.info(" Batch size = %d", args.eval_batch_size) all_token_ids = torch.tensor([f.token_ids for f in eval_features], dtype=torch.long) # all_flaw_labels: indexes of wrong words predicted by disc all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features], dtype=torch.long) all_ngram_ids = torch.tensor([f.ngram_ids for f in eval_features], dtype=torch.long) all_ngram_mask = torch.tensor([f.ngram_mask for f in eval_features], dtype=torch.long) all_ngram_labels = torch.tensor( [f.ngram_labels for f in eval_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in eval_features], 
dtype=torch.long) eval_data = TensorDataset(all_token_ids, all_ngram_ids, all_ngram_mask, all_ngram_labels, all_label_id, all_flaw_labels) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if args.single: eval_range = trange(int(args.num_eval_epochs), int(args.num_eval_epochs + 1), desc="Epoch") else: eval_range = trange(int(args.num_eval_epochs), desc="Epoch") for epoch in eval_range: output_file = os.path.join( args.data_dir, "epoch" + str(epoch) + "gnrt_outputs.tsv") with open(output_file, "w") as csv_file: writer = csv.writer(csv_file, delimiter='\t') writer.writerow(["sentence", "label"]) output_model_file = os.path.join( args.output_dir, "epoch" + str(epoch) + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForNgramClassification( config, num_labels=num_labels, embedding_size=args.embedding_size, max_seq_length=args.max_seq_length, max_ngram_length=args.max_ngram_length) model.load_state_dict(torch.load(output_model_file)) model.to(device) model.eval() for token_ids, ngram_ids, ngram_mask, ngram_labels, label_id, flaw_labels in tqdm( eval_dataloader, desc="Evaluating"): ngram_ids = ngram_ids.to(device) ngram_mask = ngram_mask.to(device) with torch.no_grad(): logits = model(ngram_ids, ngram_mask) logits = logits.detach().cpu().numpy() flaw_labels = flaw_labels.to('cpu').numpy() label_id = label_id.to('cpu').numpy() token_ids = token_ids.to('cpu').numpy() masks = ngram_mask.to('cpu').numpy() with open(output_file, "a") as csv_file: for i in range(len(label_id)): correct_tokens = look_up_words(logits[i], masks[i], vocab_list, p) token_new = replace_token(token_ids[i], flaw_labels[i], correct_tokens, i2w) token_new = ' '.join(token_new) label = str(label_id[i]) writer = csv.writer(csv_file, delimiter='\t') writer.writerow([token_new, label])
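# A minimal standalone sketch (toy names, not this script's objects) of the
# gradient-accumulation pattern used in the training loops above: each
# micro-batch loss is divided by the accumulation count, and the optimizer
# only steps every N batches, so the summed gradient matches one large batch.
import torch

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fct = torch.nn.CrossEntropyLoss()
loader = [(torch.randn(4, 8), torch.randint(0, 2, (4,))) for _ in range(8)]
accum = 4  # stands in for args.gradient_accumulation_steps
optimizer.zero_grad()
for step, (x, y) in enumerate(loader):
    loss = loss_fct(model(x), y) / accum  # scale so accumulated grads match the large batch
    loss.backward()                       # gradients accumulate until the step below
    if (step + 1) % accum == 0:
        optimizer.step()
        optimizer.zero_grad()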
# NOTE: bert_model, max_seq_length and num_qkv are assumed to be defined earlier in this script. seed = 123 beam_size = 5 length_penalty = 0 forbid_ignore_word = None max_tgt_length = 40 batch_size = 50 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if n_gpu > 0: torch.cuda.manual_seed_all(seed) tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=False) data_tokenizer = WhitespaceTokenizer() tokenizer.max_len = max_seq_length pair_num_relation = 0 bi_uni_pipeline = [] bi_uni_pipeline.append(seq2seq_loader.Preprocess4Seq2seqDecoder(list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, max_seq_length, max_tgt_length=max_tgt_length, mode="s2s", num_qkv=num_qkv, s2s_special_token=False, s2s_add_segment=False, s2s_share_segment=False, pos_shift=False)) amp_handle = None from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") cls_num_labels = 2 type_vocab_size = 6 mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[S2S_SOS]"])
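# A minimal sketch of how a length penalty rescales beam-search hypothesis
# scores. The GNMT-style formula below is an assumption, not necessarily this
# repo's exact one; with length_penalty = 0, as set above, the divisor is 1
# and scores are left unchanged.
def rescore(logprob_sum, length, alpha):
    # larger alpha favors longer hypotheses; alpha = 0 is a no-op
    return logprob_sum / (((5.0 + length) / 6.0) ** alpha)

assert rescore(-3.0, 10, 0.0) == -3.0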
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--init_checkpoint", default=None, type=str, required=True, help="The checkpoint file from pretraining") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--google_pretrained", action='store_true', help="Whether the init checkpoint is in the original Google BERT format") parser.add_argument("--max_steps", default=-1.0, type=float, help="Total number of training steps to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=1, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of update steps to accumulate before performing a backward/update pass." ) parser.add_argument('--fp16', default=False, action='store_true', help="Mixed precision training") parser.add_argument('--amp', default=False, action='store_true', help="Mixed precision training") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument("--old", action='store_true', help="use old fp16 optimizer") parser.add_argument( '--vocab_file', type=str, default=None, required=True, help="Vocabulary mapping/file BERT was pretrained on") parser.add_argument("--config_file", default=None, type=str, required=True, help="The BERT model config") args = parser.parse_args() args.fp16 = args.fp16 or args.amp if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: print( "WARNING: Output directory ({}) already exists and is not empty.". 
format(args.output_dir)) if not os.path.exists(args.output_dir) and is_main_process(): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = modeling.BertForSequenceClassification(config, num_labels=num_labels) print("USING CHECKPOINT from", args.init_checkpoint) model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False) print("USED CHECKPOINT from", args.init_checkpoint) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: print("using fp16") try: from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False) if args.loss_scale == 0: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic") else: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) scheduler = LinearWarmUpScheduler( optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps) else: print("using fp32") optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: print("data prep") cached_train_features_file = args.data_dir + '_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.do_lower_case)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up for BERT which FusedAdam doesn't do scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 preds = None out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = label_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=1) loss = tr_loss / nb_tr_steps if args.do_train else None results = { 'eval_loss': eval_loss, 'global_step': global_step, 'loss': loss } result = compute_metrics(task_name, preds, out_label_ids) results.update(result) print(results) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) writer.write("%s = %s\n" % (key, str(results[key])))
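# A minimal standalone sketch of the warmup schedule applied above. Both
# LinearWarmUpScheduler and the classic warmup_linear helper that BertAdam
# uses ramp the learning rate linearly over the warmup fraction of training,
# then decay it linearly toward zero. This is a toy re-implementation, not
# the library class itself:
def warmup_linear_lr(step, total_steps, warmup=0.1, base_lr=5e-5):
    x = step / total_steps
    if x < warmup:
        return base_lr * x / warmup      # linear ramp-up over the warmup fraction
    return base_lr * max(0.0, 1.0 - x)   # then linear decay to zero

for s in (0, 500, 5000, 9999):
    print(s, warmup_linear_lr(s, 10000))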
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, choices=["WSD"], help="The name of the task to train.") parser.add_argument("--train_data_dir", default=None, type=str, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--eval_data_dir", default=None, type=str, help="The evaluation data dir.") parser.add_argument("--label_data_dir", default=None, type=str, required=True, help="The label data dir. (./wordnet) Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") parser.add_argument("--bert_model", default=None, type=str, required=True, help='''a path or url to a pretrained model archive containing: 'bert_config.json' a configuration file for the model 'pytorch_model.bin' a PyTorch dump of a BertForPreTraining instance''') ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_test: raise ValueError("At least one of `do_train` or `do_test` must be True.") if args.do_train: assert args.train_data_dir is not None, "train_data_dir cannot be None" if args.do_eval: assert args.eval_data_dir is not None, "eval_data_dir cannot be None" if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) # prepare dataloaders processors = { "WSD": WSDProcessor } output_modes = { "WSD": "classification" } processor = processors[args.task_name]() output_mode = output_modes[args.task_name] label_list = processor.get_labels(args.label_data_dir) num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # training set train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.train_data_dir, args.label_data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in 
no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # load data if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long) all_index_start = torch.tensor([f.index_start for f in train_features], dtype=torch.long) all_index_end = torch.tensor([f.index_end for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_target_mask, all_index_start, all_index_end) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_eval: eval_examples = processor.get_dev_examples(args.eval_data_dir, args.label_data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in eval_features], dtype=torch.float) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_target_mask, all_label_mask) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False) # train global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: model.train() epoch = 
0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, target_mask, index_start, index_end = batch all_label_mask = [] for i in range(len(index_start)): label_mask = [float("-inf")] * len(label_list) for j in range(index_start[i][0].item(), index_end[i][0].item()): label_mask[j] = 0 all_label_mask.append(label_mask) all_label_mask = torch.tensor(all_label_mask, dtype=torch.float).to(device) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None, target_mask=target_mask) logits = logits + all_label_mask # no softmax here: CrossEntropyLoss applies log-softmax internally, so the masked logits feed the loss directly (as in the eval loops below) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself # If we save using the predefined names, we can load using `from_pretrained` model_output_dir = os.path.join(args.output_dir, str(epoch)) if not os.path.exists(model_output_dir): os.makedirs(model_output_dir) output_model_file = os.path.join(model_output_dir, WEIGHTS_NAME) output_config_file = os.path.join(model_output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(model_output_dir) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_"+str(epoch)+".txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, target_mask, label_mask in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) target_mask = target_mask.to(device) label_mask = label_mask.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None, target_mask=target_mask) logits = logits + label_mask logits_ = F.softmax(logits, dim=-1) logits_ = logits_.detach().cpu().numpy() label_ids_ = label_ids.to('cpu').numpy() outputs = np.argmax(logits_, axis=1) for output_i in range(len(outputs)): f.write(str(outputs[output_i])) f.write("\n") tmp_eval_accuracy = np.sum(outputs == label_ids_) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss 
= loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss/nb_tr_steps if args.do_train else None result = OrderedDict() result['eval_loss'] = eval_loss result['eval_accuracy'] = eval_accuracy result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: writer.write("epoch=%s\n"%str(epoch)) logger.info("***** Eval results *****") for key in result.keys(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.eval_data_dir, args.label_data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in eval_features], dtype=torch.float) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_target_mask, all_label_mask) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results.txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, target_mask, label_mask in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) target_mask = target_mask.to(device) label_mask = label_mask.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None, target_mask=target_mask) logits = logits + label_mask logits_ = F.softmax(logits, dim=-1) logits_ = logits_.detach().cpu().numpy() label_ids_ = label_ids.to('cpu').numpy() outputs = np.argmax(logits_, axis=1) for output_i in range(len(outputs)): f.write(str(outputs[output_i])) f.write("\n") tmp_eval_accuracy = np.sum(outputs == label_ids_) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() 
eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss/nb_tr_steps if args.do_train else None result = OrderedDict() result['eval_loss'] = eval_loss result['eval_accuracy'] = eval_accuracy result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in result.keys(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
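# A minimal standalone sketch (toy tensors) of the candidate-restricted
# softmax used in the WSD loops above: adding a -inf mask outside the target
# word's [index_start, index_end) range zeroes the probability of every other
# sense, so argmax and the loss only ever consider valid candidates.
import torch
import torch.nn.functional as F

logits = torch.randn(1, 10)                # scores over 10 senses
mask = torch.full((1, 10), float("-inf"))
mask[0, 3:7] = 0.0                         # candidate senses for this target word
probs = F.softmax(logits + mask, dim=-1)
assert torch.all(probs[0, :3] == 0) and torch.all(probs[0, 7:] == 0)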
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--input_file", default=None, type=str, required=True) parser.add_argument("--output_file", default=None, type=str, required=True) parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") ## Other parameters parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1))) layer_indexes = [int(x) for x in args.layers.split(",")] tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) examples = read_examples(args.input_file) features = convert_examples_to_features( examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) unique_id_to_feature = {} for feature in features: unique_id_to_feature[feature.unique_id] = feature model = BertModel.from_pretrained(args.bert_model) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) model.eval() with open(args.output_file, "w", encoding='utf-8') as writer: for input_ids, input_mask, example_indices in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) for b, example_index in enumerate(example_indices): feature = features[example_index.item()] unique_id = int(feature.unique_id) # feature = unique_id_to_feature[unique_id] output_json = collections.OrderedDict() output_json["linex_index"] = unique_id all_out_features = [] for (i, token) in enumerate(feature.tokens): 
all_layers = [] for (j, layer_index) in enumerate(layer_indexes): layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() layer_output = layer_output[b] layers = collections.OrderedDict() layers["index"] = layer_index layers["values"] = [ round(x.item(), 6) for x in layer_output[i] ] all_layers.append(layers) out_features = collections.OrderedDict() out_features["token"] = token out_features["layers"] = all_layers all_out_features.append(out_features) output_json["features"] = all_out_features writer.write(json.dumps(output_json) + "\n")
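# A minimal sketch of consuming the file written above: the extraction script
# emits one JSON object per input line, and each token carries one entry per
# requested layer. The path here is a placeholder for --output_file.
import json

with open("features.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        for tok in record["features"]:
            vec = tok["layers"][0]["values"]  # first requested layer, e.g. -1
            print(record["linex_index"], tok["token"], len(vec))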
args = parser.parse_args() # TRITON client setup protocol = ProtocolType.from_str(args.protocol) model_version = -1 infer_ctx = InferContext(args.url, protocol, args.model_name, model_version, http_headers=args.http_headers, verbose=args.verbose) # Preprocess input data tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large cached_features_file = args.predict_file + '_{}_{}.bin'.format( args.max_seq_length, args.doc_stride) eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) try: with open(cached_features_file, "rb") as reader: eval_features = pickle.load(reader) except Exception: # cache missing or unreadable; recompute the features eval_features = convert_examples_to_features( examples=eval_examples,
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary the BERT model will train on.") parser.add_argument( "--input_file", default=None, type=str, required=True, help= "The input train corpus. Can be a directory with .txt files or a path to a single file" ) parser.add_argument( "--output_file", default=None, type=str, required=True, help="The output file(s) where the training instances will be written (comma-separated).") ## Other parameters # int parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--dupe_factor", default=10, type=int, help= "Number of times to duplicate the input data (with different masks).") parser.add_argument("--max_predictions_per_seq", default=20, type=int, help="Maximum number of masked LM predictions per sequence.") # floats parser.add_argument("--masked_lm_prob", default=0.15, type=float, help="Masked LM probability.") parser.add_argument( "--short_seq_prob", default=0.1, type=float, help= "Probability to create a sequence shorter than maximum sequence length" ) parser.add_argument( "--do_lower_case", action='store_true', default=True, help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument('--random_seed', type=int, default=12345, help="random seed for initialization") args = parser.parse_args() tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case) input_files = [] if os.path.isfile(args.input_file): input_files.append(args.input_file) elif os.path.isdir(args.input_file): input_files = [ os.path.join(args.input_file, f) for f in os.listdir(args.input_file) if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt')) ] else: raise ValueError("{} is not a valid path".format(args.input_file)) rng = random.Random(args.random_seed) instances = create_training_instances(input_files, tokenizer, args.max_seq_length, args.dupe_factor, args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq, rng) output_files = args.output_file.split(",") print("*** Writing to output files ***") for output_file in output_files: print(output_file) write_instance_to_example_files(instances, tokenizer, args.max_seq_length, args.max_predictions_per_seq, output_files)
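# A minimal toy sketch (not the repo's create_training_instances) of what
# --dupe_factor does: each pass over the corpus re-samples which tokens are
# masked, so duplicated sequences carry different [MASK] patterns.
import random

tokens = "the cat sat on the mat".split()
rng = random.Random(12345)
for dupe in range(3):  # stands in for args.dupe_factor
    masked = [t if rng.random() > 0.15 else "[MASK]" for t in tokens]
    print(dupe, masked)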
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "commonsenseqa": CommonsenseQaProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, "commonsenseqa": 4, } # if args.local_rank == -1 or args.no_cuda: # device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # n_gpu = torch.cuda.device_count() # else: # torch.cuda.set_device(args.local_rank) # device = torch.device("cuda", args.local_rank) # n_gpu = 1 # # Initializes the distributed backend which will take care of synchronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') device = "cuda:2" # NOTE: hard-coded to a specific GPU; the commented-out block above is the usual local_rank setup n_gpu = 1 logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) print("current task is " + str(task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format( # args.local_rank), # num_labels=num_labels) model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'. 
format(args.local_rank), num_choices=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 nb_tr_steps = 0 tr_loss = 0 best_eval_accuracy = 0.0 if args.do_train: train_features = convert_examples_to_features_mc( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, logits = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features_mc( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) # logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_accuracy = eval_accuracy / nb_eval_examples print("the current eval accuracy is: " + str(eval_accuracy)) if eval_accuracy > best_eval_accuracy: best_eval_accuracy = eval_accuracy if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) model.train() # # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForMultipleChoice.from_pretrained(args.bert_model, state_dict=model_state_dict, num_choices=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features_mc( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], 
dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 all_pred_labels = [] all_anno_labels = [] all_logits = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) # logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() output_labels = np.argmax(logits, axis=1) all_pred_labels.extend(output_labels.tolist()) all_logits.extend(list(logits)) all_anno_labels.extend(list(label_ids)) tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'best_eval_accuracy': best_eval_accuracy, 'global_step': global_step, 'loss': loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) for i in range(len(all_pred_labels)): writer.write(str(i) + "\t" + str(all_anno_labels[i]) + "\t" + str(all_pred_labels[i]) + "\t" + str(all_logits[i]) + "\n")
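The training and evaluation code above calls two helpers, warmup_linear and accuracy, that are defined elsewhere in these scripts. A minimal sketch of their conventional behavior, assuming the usual linear warmup-then-decay schedule and argmax accuracy used in pytorch-pretrained-bert style fine-tuning:

import numpy as np

def warmup_linear(x, warmup=0.002):
    # x = global_step / t_total, the fraction of training completed.
    if x < warmup:
        return x / warmup        # ramp the learning rate up linearly during warmup
    return max(1.0 - x, 0.0)     # then decay it linearly back towards zero

def accuracy(logits, label_ids):
    # Number of correct argmax predictions in the batch; the caller accumulates
    # these counts and divides by nb_eval_examples to get the final accuracy.
    preds = np.argmax(logits, axis=1)
    return int((preds == label_ids).sum())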
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument("--train_vae", action='store_true', help="Whether to train vae.") parser.add_argument('--bleu', type=float, default=0.2, help="Set Bleu ") # decoding parameters parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument("--input_file", type=str, help="Input file") parser.add_argument('--subset', type=int, default=0, help="Decode a subset of the input dataset.") parser.add_argument("--output_file", type=str, help="output file") parser.add_argument("--split", type=str, default="", help="Data split (train/val/test).") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--batch_size', type=int, default=4, help="Batch size for decoding.") parser.add_argument('--beam_size', type=int, default=1, help="Beam size for searching") parser.add_argument('--length_penalty', type=float, default=0, help="Length penalty for beam search") parser.add_argument('--forbid_duplicate_ngrams', action='store_true') parser.add_argument('--forbid_ignore_word', type=str, default=None, help="Ignore the word during forbid_duplicate_ngrams") parser.add_argument("--min_len", default=1, type=int) parser.add_argument('--need_score_traces', action='store_true') parser.add_argument('--ngram_size', type=int, default=1) parser.add_argument('--mode', default="s2s", choices=["s2s", "l2r", "both"]) parser.add_argument('--max_tgt_length', type=int, default=128, help="maximum length of target sequence") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." 
) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") parser.add_argument('--not_predict_token', type=str, default=None, help="Do not predict the tokens during decoding.") args = parser.parse_args() if args.need_score_traces and args.beam_size <= 1: raise ValueError( "Score trace is only available for beam search with beam size > 1." ) if args.max_tgt_length >= args.max_seq_length - 2: raise ValueError("Maximum tgt length exceeds max seq length - 2.") device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer.max_len = args.max_seq_length pair_num_relation = 0 bi_uni_pipeline = [] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seqDecoder( list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, max_tgt_length=args.max_tgt_length, new_segment_ids=args.new_segment_ids, mode="s2s", num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift)) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 mask_word_id, eos_word_ids, sos_word_id = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[S2S_SOS]"]) def _get_token_id_set(s): r = None if s: w_list = [] for w in s.split('|'): if w.startswith('[') and w.endswith(']'): w_list.append(w.upper()) else: w_list.append(w) r = set(tokenizer.convert_tokens_to_ids(w_list)) return r forbid_ignore_set = _get_token_id_set(args.forbid_ignore_word) not_predict_set = _get_token_id_set(args.not_predict_token) print(args.model_recover_path) for model_recover_path in glob.glob(args.model_recover_path.strip()): logger.info("***** Recover model: %s *****", model_recover_path) model_recover = torch.load(model_recover_path) model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=pair_num_relation, type_vocab_size=type_vocab_size, task_idx=3, mask_word_id=mask_word_id, search_beam_size=args.beam_size, length_penalty=args.length_penalty, eos_id=eos_word_ids, sos_id=sos_word_id, forbid_duplicate_ngrams=args.forbid_duplicate_ngrams, forbid_ignore_set=forbid_ignore_set, not_predict_set=not_predict_set, ngram_size=args.ngram_size, min_len=args.min_len, mode=args.mode, max_position_embeddings=args.max_seq_length, ffn_type=args.ffn_type, num_qkv=args.num_qkv, seg_emb=args.seg_emb, pos_shift=args.pos_shift) del model_recover if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) torch.cuda.empty_cache() model.eval() next_i = 0 max_src_length = args.max_seq_length - 2 - args.max_tgt_length with open(args.input_file, encoding="utf-8") as fin: input_lines = [x.strip() for x in fin.readlines()] if args.subset > 0: logger.info("Decoding subset: %d", args.subset) input_lines = input_lines[:args.subset] data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer input_lines = [ data_tokenizer.tokenize(x)[:max_src_length] for x in input_lines ] input_lines = 
sorted(list(enumerate(input_lines)), key=lambda x: -len(x[1])) output_lines = [""] * len(input_lines) score_trace_list = [None] * len(input_lines) total_batch = math.ceil(len(input_lines) / args.batch_size) with tqdm(total=total_batch) as pbar: while next_i < len(input_lines): _chunk = input_lines[next_i:next_i + args.batch_size] buf_id = [x[0] for x in _chunk] buf = [x[1] for x in _chunk] next_i += args.batch_size max_a_len = max([len(x) for x in buf]) instances = [] for instance in [(x, max_a_len) for x in buf]: for proc in bi_uni_pipeline: instances.append(proc(instance)) with torch.no_grad(): batch = seq2seq_loader.batch_list_to_batch_tensors( instances) batch = [ t.to(device) if t is not None else None for t in batch ] input_ids, token_type_ids, position_ids, input_mask, mask_qkv, task_idx = batch traces = model(input_ids, token_type_ids, position_ids, input_mask, task_idx=task_idx, mask_qkv=mask_qkv, bleu=args.bleu) if args.beam_size > 1: traces = {k: v.tolist() for k, v in traces.items()} output_ids = traces['pred_seq'] else: output_ids = traces.tolist() for i in range(len(buf)): w_ids = output_ids[i] output_buf = tokenizer.convert_ids_to_tokens(w_ids) output_tokens = [] for t in output_buf: if t in ("[SEP]", "[PAD]"): break output_tokens.append(t) output_sequence = ' '.join(detokenize(output_tokens)) output_lines[buf_id[i]] = output_sequence if args.need_score_traces: score_trace_list[buf_id[i]] = { 'scores': traces['scores'][i], 'wids': traces['wids'][i], 'ptrs': traces['ptrs'][i] } pbar.update(1) if args.output_file: fn_out = args.output_file else: fn_out = model_recover_path + '.' + args.split with open(fn_out, "w", encoding="utf-8") as fout: for l in output_lines: fout.write(l) fout.write("\n") if args.need_score_traces: with open(fn_out + ".trace.pickle", "wb") as fout_trace: pickle.dump({ "version": 0.0, "num_samples": len(input_lines) }, fout_trace) for x in score_trace_list: pickle.dump(x, fout_trace)
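The decoding loop above truncates the generated ids at [SEP]/[PAD] and passes the remaining WordPiece tokens through detokenize, which is defined elsewhere in the repository. A minimal stand-in, assuming detokenize simply merges "##" continuation pieces back onto the preceding token before the tokens are joined with spaces:

def detokenize(tk_list):
    # Merge WordPiece continuation pieces ("##xyz") onto the previous token so
    # that ' '.join(...) produces readable text.
    r_list = []
    for tk in tk_list:
        if tk.startswith('##') and r_list:
            r_list[-1] = r_list[-1] + tk[2:]
        else:
            r_list.append(tk)
    return r_list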
"input_mask": input_mask, "segment_ids": segment_ids, "start_position": start_position, "end_position": end_position }) with open(filepath, 'w', encoding="utf-8") as fout: for feature in features: fout.write(json.dumps(feature, ensure_ascii=False) + '\n') print("len(features):", len(features)) return features if __name__ == "__main__": tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True) # 生成训练数据, train.data examples = read_squad_examples(zhidao_input_file=args.zhidao_input_file, search_input_file=args.search_input_file) features = convert_examples_to_features( filepath=TRAINSET_PATH, examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, max_query_length=args.max_query_length) # 生成验证数据, dev.data。记得注释掉生成训练数据的代码,并在196行将train.data改为dev.data examples = read_squad_examples( zhidao_input_file=args.dev_zhidao_input_file, search_input_file=args.dev_search_input_file) features = convert_examples_to_features(
def main(): # args = parse_arguments() # del args.local_rank # print(args) # args_to_yaml(args, 'config_finetune_train_glue_mrpc.yaml') # exit(0) config_yaml, local_rank = parse_my_arguments() args = args_from_yaml(config_yaml) args.local_rank = local_rank """ Experiment Setup """ if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: print( "WARNING: Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) """ Prepare Model """ # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) state_dict = torch.load(args.init_checkpoint, map_location='cpu') state_dict = state_dict.get( 'model', state_dict ) # in a full checkpoint weights are saved in state_dict['model'] model.load_state_dict(state_dict, strict=False) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) plain_model = getattr(model, 'module', model) with open(args.sparsity_config, 'r') as f: raw_dict = yaml.load(f, Loader=yaml.SafeLoader) masks = dict.fromkeys(raw_dict['prune_ratios'].keys()) for param_name in list(masks.keys()): if get_parameter_by_name(plain_model, param_name) is None: print(f'[WARNING] Cannot find {param_name}') del masks[param_name] for param_name in masks: param = get_parameter_by_name(plain_model, param_name) non_zero_mask = torch.ne(param, 0).to(param.dtype) masks[param_name] = non_zero_mask """ Prepare Optimizer""" # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.fp16_utils.fp16_optimizer import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: """ Prepare Dataset """ train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) """ Training Loop """ model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 plain_model = getattr(model, 'module', model) for param_name, mask in masks.items(): get_parameter_by_name(plain_model, param_name).data *= mask """ Load Model for Evaluation """ if args.do_train: # Save a trained model and the associated configuration output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if is_main_process( ): # only the main process should save the trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) state_dict = torch.load(args.init_checkpoint, map_location='cpu') state_dict = state_dict.get('model', state_dict) model.load_state_dict(state_dict, strict=False) model.to(device) """ Run Evaluation """ if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += 
tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
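The sparsity-aware training loop above reapplies the binary masks after every optimizer step through get_parameter_by_name, which is not shown in this file. A minimal stand-in that resolves a fully qualified parameter name against the model, matching how it is used both for the existence check and for the in-place masking:

def get_parameter_by_name(model, name):
    # Return the parameter whose dotted name (e.g. "bert.encoder.layer.0.attention.self.query.weight")
    # matches `name`, or None if the model has no such parameter.
    for param_name, param in model.named_parameters():
        if param_name == name:
            return param
    return None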
import sys import random import re from metrics import bleu_metric import numpy as np import nltk from tokenization import BertTokenizer tokenizer = BertTokenizer.from_pretrained("unilm_v2_bert_pretrain", do_lower_case=True) from nltk.corpus import stopwords stop_words = set(stopwords.words('english')) def move_stop_words(text): item = " ".join([w for w in text.split() if not w.lower() in stop_words]) return item re_art = re.compile(r'\b(a|an|the)\b') re_punc = re.compile(r'[!"#$%&()*+,-./:;<=>?@\[\]\\^`{|}~_\']') def normalize_answer(s): """Lower text and remove punctuation, articles and extra whitespace.""" def remove_articles(text): return re_art.sub(' ', text) def white_space_fix(text): return ' '.join(text.split())