def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    """Return a TensorDataset of multiple-choice features for `task`.

    Converted features are cached on disk under ``args.data_dir``.  In
    distributed training only rank 0 builds the cache; every other rank
    blocks on a barrier and then loads the finished file.
    """
    if args.local_rank not in [-1, 0]:
        # Non-primary ranks wait here until rank 0 has materialised the cache.
        torch.distributed.barrier()

    processor = processors[task]()

    # Pick the split this call is materialising.
    if evaluate:
        mode = "dev"
    elif test:
        mode = "test"
    else:
        mode = "train"
    assert not (evaluate and test)

    # Last path component of the model name keeps cache files per-model.
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cache_path = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}".format(mode, model_tag, str(args.max_seq_length), str(task)),
    )

    if os.path.exists(cache_path) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cache_path)
        feats = torch.load(cache_path)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        elif test:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        logger.info("Training number: %s", str(len(examples)))
        feats = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token_segment_id=tokenizer.pad_token_type_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cache_path)
            torch.save(feats, cache_path)

    if args.local_rank == 0:
        # Release the ranks waiting on the first barrier.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset.
    input_ids = torch.tensor(select_field(feats, "input_ids"), dtype=torch.long)
    input_mask = torch.tensor(select_field(feats, "input_mask"), dtype=torch.long)
    segment_ids = torch.tensor(select_field(feats, "segment_ids"), dtype=torch.long)
    label_ids = torch.tensor([f.label for f in feats], dtype=torch.long)
    return TensorDataset(input_ids, input_mask, segment_ids, label_ids)
def load_and_cache_examples(data_dir, task, tokenizer, evaluate=False, test=False):
    """Return a TensorDataset of features for `task`, cached under `data_dir`.

    NOTE(review): `output_dir` and `max_seq_length` are read from module
    globals rather than passed as parameters — confirm they are defined
    before this function is called.
    """
    processor = processors[task]()
    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = 'dev'
    elif test:
        cached_mode = 'test'
    else:
        cached_mode = 'train'
    # `evaluate` and `test` are mutually exclusive
    # (fixed: was `assert (evaluate == True and test == True) == False`).
    assert not (evaluate and test)
    cached_features_file = os.path.join(
        data_dir,
        'cached_{}_{}_{}_{}'.format(
            cached_mode,
            list(filter(None, output_dir.split('/'))).pop(),
            str(max_seq_length), str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(data_dir)
        elif test:
            examples = processor.get_test_examples(data_dir)
        else:
            examples = processor.get_train_examples(data_dir)
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(
            examples,
            label_list,
            max_seq_length,
            tokenizer,
            pad_on_left=True,  # pad on the left for xlnet
            pad_token_segment_id=4)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def forward(self, input_example):
    """Score one multiple-choice example.

    `input_example` is a JSON-style dict with keys 'question', 'context',
    'options' and 'label'; one example is processed per call.  Returns the
    per-option scores (passed through `self.m` — presumably a softmax,
    TODO confirm) as a numpy array.
    """
    question = input_example['question']
    article = input_example['context']
    options = input_example['options']
    label = str(input_example['label'])

    examples = []
    examples.append(
        InputExample(
            example_id=None,
            question=question,
            # Repeat the same passage once per answer option.  Not memory
            # efficient but matches the multiple-choice feature converter.
            # Generalised: previously hard-coded to exactly 4 options.
            contexts=[article] * len(options),
            endings=list(options),
            label=label,
        )
    )
    features = convert_examples_to_features(
        examples, label_list, max_seq_length, self.tokenizer,
        pad_on_left=False,
        pad_token_segment_id=0)

    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    with torch.no_grad():
        inputs = {
            "input_ids": all_input_ids.cuda(),
            "attention_mask": all_input_mask.cuda(),
            "labels": all_label_ids.cuda()
        }
        outputs = self.model(**inputs)
        # outputs[0] is the loss (labels were supplied), outputs[1] the logits.
        tmp_eval_loss, logits = outputs[:2]
        logits = self.m(logits)
    return logits.cpu().numpy()
def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    """Return a TensorDataset for `task`, with model-type-aware tokenisation.

    In distributed training only rank 0 converts the raw examples; other
    ranks wait on a barrier and then read the cached file.

    NOTE(review): unlike the sibling loader, this variant never honours an
    `overwrite_cache` flag — a stale cache is always reused; verify whether
    `args.overwrite_cache` should be checked here too.
    """
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()
    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = 'dev'
    elif test:
        cached_mode = 'test'
    else:
        cached_mode = 'train'
    # `evaluate` and `test` are mutually exclusive
    # (fixed: was `assert (evaluate == True and test == True) == False`).
    assert not (evaluate and test)
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            cached_mode,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        elif test:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ['roberta']),
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def derive_features_for_model(examples, text_field, processor, args, evaluate=False, test=False, type='ar'):
    """Convert `examples` to LSTM features and cache them on disk.

    Returns a TensorDataset of (input_ids, lengths, label_ids).  In
    distributed training only rank 0 builds the cache; other ranks wait
    on a barrier.  `type` tags the cache file name per sub-dataset
    (parameter name kept for caller compatibility although it shadows the
    builtin).
    """
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = "dev" + "_%s" % type
    elif test:
        # Fixed inconsistency: previously "test" + type (no underscore),
        # unlike the dev/train names.  Old "test<type>" caches will simply
        # be rebuilt under the consistent name.
        cached_mode = "test" + "_%s" % type
    else:
        cached_mode = "train" + "_%s" % type
    assert not (evaluate and test)
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_lstm".format(
            cached_mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(examples, label_list, args.max_seq_length, text_field)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset.  Each feature stores tensors in
    # `choices_features`; they are round-tripped through numpy here.
    all_input_ids = torch.tensor(
        [fea.choices_features['input_ids'].numpy() for fea in features],
        dtype=torch.long)
    all_input_length = torch.tensor(
        [fea.choices_features['lengths'].numpy() for fea in features],
        dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_length, all_label_ids)
    return dataset
def load_and_cache_examples(task, tokenizer, evaluate=False, test=False,
                            data_dir="/Users/yezhuoyang/Desktop/LogicalReasoning(git)/reclor/reclor_data",
                            max_seq_length=256,
                            model_name_or_path="bert-base-uncased"):
    """Return a TensorDataset of ReClor features, cached under `data_dir`.

    Generalised: the data directory, max sequence length, and model tag were
    previously hard-coded locals; they are now keyword parameters whose
    defaults preserve the old behaviour, so existing callers are unaffected.
    """
    processor = processors['reclor']()
    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = "dev"
    elif test:
        cached_mode = "test"
    else:
        cached_mode = "train"
    assert not (evaluate and test)
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}_{}".format(
            cached_mode,
            list(filter(None, model_name_or_path.split("/"))).pop(),
            str(max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(data_dir)
        elif test:
            examples = processor.get_test_examples(data_dir)
        else:
            examples = processor.get_train_examples(data_dir)
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(
            examples,
            label_list,
            max_seq_length,
            tokenizer,
            pad_on_left=False,  # pad on the left for xlnet
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def predict_example(args, model, example, processor, tokenizer):
    """Run inference on a single example dict and return (labels, probs).

    `example` must provide 'question', 'document', and 'options' keys (plus
    an optional 'example_id').  Returns the argmax label predictions and the
    row-wise softmax probabilities over the logits.
    """
    input_example = InputExample(
        example_id=example.get('example_id', 'demo'),
        question=example['question'],
        # Repeat the document once per label so it aligns with each option.
        contexts=[example['document']] * len(processor.get_labels()),
        endings=example['options'],
        label='0',  # dummy label; only logits are used downstream
    )
    label_list = processor.get_labels()
    features = convert_examples_to_features(
        [input_example],
        label_list,
        args.max_seq_length,
        tokenizer,
    )
    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
    eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2]
                if args.model_type in ["bert", "xlnet"] else None,  # XLM don't use segment_ids
                "labels": batch[3],
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
        if preds is None:
            preds = logits.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)

    label_preds = np.argmax(preds, axis=1)
    # BUG FIX: softmax must normalise per row.  The previous
    # `np.exp(preds) / np.sum(np.exp(preds))` divided by the sum over the
    # whole array, which is only correct when there is exactly one row.
    prob_preds = np.exp(preds) / np.sum(np.exp(preds), axis=1, keepdims=True)
    return label_preds, prob_preds
def main():
    """Evaluate a fine-tuned XLNet multiple-choice model on the DREAM test set.

    Loads the checkpoint at `output_model_file`, runs a full pass over the
    test split, and appends eval loss/accuracy to results.txt in `output_dir`.
    Reads `random_seed`, `output_dir`, `output_model_file`, `max_seq_length`,
    `eval_batch_size`, and `n_class` from module globals.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print('gpu count:', n_gpu)

    # Seed everything for reproducibility.
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    os.makedirs(output_dir, exist_ok=True)

    model_state_dict = torch.load(output_model_file, map_location=device)
    model = XLNetForMultipleChoice.from_pretrained('xlnet-large-cased',
                                                   state_dict=model_state_dict)
    logger.info("Trained model: {} loaded.".format(output_model_file))
    model.to(device)

    # NOTE(review): removed the unused `no_decay` / `optimizer_grouped_parameters`
    # locals — this is an evaluation-only script and no optimizer is created.

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    processor = processors['dream']()
    label_list = processor.get_labels()

    eval_examples = processor.get_test_examples('')
    eval_features = convert_examples_to_features(
        eval_examples,
        label_list,
        max_seq_length,
        tokenizer,
        pad_on_left=True,  # pad on the left for xlnet
        pad_token_segment_id=4
    )
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", eval_batch_size)

    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    logits_all = []
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            eval_output = model(input_ids=input_ids,
                                token_type_ids=segment_ids,
                                attention_mask=input_mask,
                                labels=label_ids,
                                n_class=n_class)
            tmp_eval_loss = eval_output.loss
            logits = eval_output.logits

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        for i in range(len(logits)):
            logits_all += [logits[i]]

        tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    # Loss is averaged over batches, accuracy over examples.
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
    output_eval_file = os.path.join(output_dir, "results.txt")
    with open(output_eval_file, "a+") as writer:
        for key in sorted(result.keys()):
            writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune XLNet on DREAM, evaluating and checkpointing every epoch.

    Reads hyperparameters (`random_seed`, `output_dir`, `learning_rate`,
    `num_warmup_steps`, `gradient_accumulation_steps`, `num_train_epochs`,
    `max_seq_length`, `train_batch_size`, `eval_batch_size`, `n_class`)
    from module globals.  Appends per-epoch results to results.txt and
    saves a checkpoint per epoch in `output_dir`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print('gpu count:', n_gpu)

    # Seed everything for reproducibility.
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(random_seed)

    os.makedirs(output_dir, exist_ok=True)

    model = XLNetForMultipleChoice.from_pretrained('xlnet-large-cased')
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']  ## note: no weight decay according to XLNet paper
    # Both groups use weight_decay 0.0 on purpose (see note above); the split
    # is kept so a non-zero decay can be reintroduced for the first group.
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]

    tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
    processor = processors['dream']()
    label_list = processor.get_labels()

    train_examples = processor.get_train_examples('')
    num_train_steps = int(
        len(train_examples) // gradient_accumulation_steps * num_train_epochs)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_steps)

    train_features = convert_examples_to_features(
        train_examples,
        label_list,
        max_seq_length,
        tokenizer,
        pad_on_left=True,  # pad on the left for xlnet
        pad_token_segment_id=4)
    all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    global_step = 0
    model.train()
    for ep in range(int(num_train_epochs)):
        # NOTE(review): removed the unused `max_score` local that was reset
        # each epoch and never read.
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            output = model(input_ids=input_ids,
                           token_type_ids=segment_ids,
                           attention_mask=input_mask,
                           labels=label_ids,
                           n_class=n_class)
            loss = output.loss
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps
            loss.backward()
            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()  # We have accumulated enought gradients
                scheduler.step()
                model.zero_grad()
                global_step += 1
            if step % 800 == 0:
                logger.info("Training loss: {}, global step: {}".format(
                    tr_loss / nb_tr_steps, global_step))

        # ---- per-epoch dev evaluation ----
        eval_examples = processor.get_dev_examples('')
        eval_features = convert_examples_to_features(
            eval_examples,
            label_list,
            max_seq_length,
            tokenizer,
            pad_on_left=True,  # pad on the left for xlnet
            pad_token_segment_id=4)

        logger.info("***** Running Dev Evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)

        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logits_all = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                eval_output = model(input_ids=input_ids,
                                    token_type_ids=segment_ids,
                                    attention_mask=input_mask,
                                    labels=label_ids,
                                    n_class=n_class)
                tmp_eval_loss = eval_output.loss
                logits = eval_output.logits

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            for i in range(len(logits)):
                logits_all += [logits[i]]

            tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss is averaged over batches, accuracy over examples.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': tr_loss / nb_tr_steps
        }

        logger.info(" Epoch: %d", (ep + 1))
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
        output_eval_file = os.path.join(output_dir, "results.txt")
        with open(output_eval_file, "a+") as writer:
            # BUG FIX: the epoch header previously lacked a trailing newline,
            # so it ran into the first result line in results.txt.
            writer.write(" Epoch: " + str(ep + 1) + "\n")
            for key in sorted(result.keys()):
                writer.write("%s = %s\n" % (key, str(result[key])))

        # Save a checkpoint for this epoch (unwrap DataParallel if present).
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(
            output_dir, "pytorch_model_{}epoch.bin".format(ep + 1))
        torch.save(model_to_save.state_dict(), output_model_file)