        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
    )

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    # dataset = TensorDataset(all_input_ids[0:data_size], all_attention_mask[0:data_size], all_token_type_ids[0:data_size], all_labels[0:data_size])
    return dataset, all_labels


def main():
    args = parse_args()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )

    # multi-gpu training (should be after apex fp16 initialization)
def load_and_cache_examples(args, task, tokenizer, evaluate=False): if args.local_rank not in [-1, 0] and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file cached_features_file = os.path.join( args.data_dir, "cached_{}_{}_{}_{}".format( "dev" if evaluate else "train", list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length), str(task), ), ) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta", "xlmroberta"]: # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] examples = ( processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) ) if args.model_type == 'gpt2': #setting pad token for GPT-2 tokenizer.pad_token = '[PAD]' if args.sst5: label_list = ['0','1','2','3','4'] features = convert_examples_to_features( examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) if args.local_rank == 0 and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) if output_mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) return dataset
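# --- Illustrative usage sketch (not part of the original script) ----------------
# The TensorDataset returned by load_and_cache_examples above is typically fed to
# a DataLoader. This is a minimal, hedged sketch: the sampler choice mirrors the
# single-process vs. distributed split used in the function, and names such as
# args.task_name and args.train_batch_size are assumptions for illustration.
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler

train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

for batch in train_dataloader:
    # Batches unpack in the order the tensors were passed to TensorDataset:
    # (input_ids, attention_mask, token_type_ids, labels)
    input_ids, attention_mask, token_type_ids, labels = batch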
def load_and_cache_examples(args, task, tokenizer, desc_tokenizer, evaluate=False): if args.local_rank not in [-1, 0] and not evaluate: torch.distributed.barrier( ) # Make sure only the first process in distributed training process the dataset, and the others will use the cache processor = processors[task]() output_mode = output_modes[task] # Load data features from cache or dataset file cached_features_file = os.path.join( args.data_dir, 'cached_{}_{}_{}_{}'.format( 'dev' if evaluate else 'train', list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task))) if os.path.exists(cached_features_file) and not args.overwrite_cache: logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] examples = processor.get_dev_examples( args.data_dir) if evaluate else processor.get_train_examples( args.data_dir) features = convert_examples_to_features( examples, tokenizer, label_list=label_list, max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=bool( args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) # Drug Description desc_max_seq_length = args.desc_max_seq_length desc_processor = processors['desc']() output_mode = output_modes[task] # For Drug1 # Load data features from cache or dataset file all_desc_features = [] for drug_indx in (1, 2): cached_desc_features_file = os.path.join( args.data_dir, 'cached_desc{}_{}_{}_{}_{}'.format( drug_indx, 'dev' if evaluate else 'train', list(filter(None, args.model_name_or_path.split('/'))).pop(), str(desc_max_seq_length), str(task))) if os.path.exists( cached_desc_features_file) and not args.overwrite_cache: logger.info( "Loading description of drug%s features from cached file %s", drug_indx, cached_desc_features_file) desc_features = torch.load(cached_desc_features_file) else: logger.info( "Creating description of drug%s features from dataset file at %s", drug_indx, args.data_dir) label_list = desc_processor.get_labels() if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: # HACK(label indices are swapped in RoBERTa pretrained model) label_list[1], label_list[2] = label_list[2], label_list[1] desc_examples = desc_processor.get_dev_examples( args.data_dir, drug_indx) if evaluate else desc_processor.get_train_examples( args.data_dir, drug_indx) desc_features = convert_examples_to_features( desc_examples, desc_tokenizer, label_list=label_list, max_length=desc_max_seq_length, output_mode=output_mode, pad_on_left=bool( args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids( [tokenizer.pad_token])[0], pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0, ) if args.local_rank in [-1, 0]: logger.info( "Saving description of drug%s features into cached file %s", drug_indx, cached_desc_features_file) torch.save(desc_features, cached_desc_features_file) all_desc_features.append(desc_features) if args.local_rank 
== 0 and not evaluate: torch.distributed.barrier( ) # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Get Position index drug_id = tokenizer.vocab['drug'] one_id = tokenizer.vocab['##1'] two_id = tokenizer.vocab['##2'] all_input_ids = [f.input_ids for f in features] all_entity1_pos = [] all_entity2_pos = [] for input_ids in all_input_ids: entity1_pos = args.max_seq_length - 1 entity2_pos = args.max_seq_length - 1 for i in range(args.max_seq_length): if input_ids[i] == drug_id and input_ids[i + 1] == one_id: entity1_pos = i if input_ids[i] == drug_id and input_ids[i + 1] == two_id: entity2_pos = i all_entity1_pos.append(entity1_pos) all_entity2_pos.append(entity2_pos) assert len(all_input_ids) == len(all_entity1_pos) == len(all_entity2_pos) range_list = list(range(args.max_seq_length, 2 * args.max_seq_length)) all_relative_dist1 = torch.tensor([[x - e1 for x in range_list] for e1 in all_entity1_pos], dtype=torch.long) all_relative_dist2 = torch.tensor([[x - e2 for x in range_list] for e2 in all_entity2_pos], dtype=torch.long) # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) if output_mode == "classification": all_labels = torch.tensor([f.label for f in features], dtype=torch.long) elif output_mode == "regression": all_labels = torch.tensor([f.label for f in features], dtype=torch.float) all_desc1_ii = torch.tensor([f.input_ids for f in all_desc_features[0]], dtype=torch.long) all_desc1_am = torch.tensor( [f.attention_mask for f in all_desc_features[0]], dtype=torch.long) all_desc1_tti = torch.tensor( [f.token_type_ids for f in all_desc_features[0]], dtype=torch.long) all_desc2_ii = torch.tensor([f.input_ids for f in all_desc_features[1]], dtype=torch.long) all_desc2_am = torch.tensor( [f.attention_mask for f in all_desc_features[1]], dtype=torch.long) all_desc2_tti = torch.tensor( [f.token_type_ids for f in all_desc_features[1]], dtype=torch.long) # Fingerprint fingerprint_indices = torch.tensor(list(range(len(features))), dtype=torch.long) #dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels) #dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_relative_dist1, all_relative_dist2, all_labels) dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_relative_dist1, all_relative_dist2, all_desc1_ii, all_desc1_am, all_desc1_tti, all_desc2_ii, all_desc2_am, all_desc2_tti, fingerprint_indices, all_labels) return dataset
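# --- Illustrative sketch of the entity-position / relative-distance encoding ----
# Toy, self-contained version of the logic above, assuming a WordPiece tokenizer
# that splits "drug1"/"drug2" into ["drug", "##1"/"##2"]. All values are made up
# for illustration; the loop stops one short so i + 1 stays in range.
import torch

max_seq_length = 8
drug_id, one_id, two_id = 100, 101, 102
input_ids = [5, 100, 101, 7, 100, 102, 9, 0]   # "drug ##1 ... drug ##2 ..."

entity1_pos = entity2_pos = max_seq_length - 1
for i in range(max_seq_length - 1):
    if input_ids[i] == drug_id and input_ids[i + 1] == one_id:
        entity1_pos = i
    if input_ids[i] == drug_id and input_ids[i + 1] == two_id:
        entity2_pos = i

# Offsetting by max_seq_length keeps every relative distance positive.
range_list = list(range(max_seq_length, 2 * max_seq_length))
relative_dist1 = torch.tensor([x - entity1_pos for x in range_list], dtype=torch.long)
relative_dist2 = torch.tensor([x - entity2_pos for x in range_list], dtype=torch.long)
print(relative_dist1)  # tensor([ 7,  8,  9, 10, 11, 12, 13, 14])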
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run training.") parser.add_argument("--predict_eval", action='store_true', help="Whether to predict eval set.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." 
) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") parser.add_argument("--freeze", default=0, type=int, required=False, help="freeze bert.") parser.add_argument("--not_do_eval_steps", default=0.35, type=float, help="not_do_eval_steps.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model model = BertForSequenceClassification_last2embedding_cls.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if 
args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps // args.gradient_accumulation_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # 先做一个eval for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = 
SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) model.train() for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) nb_tr_examples += input_ids.size(0) del input_ids, input_mask, segment_ids, label_ids if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and ( step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with 
open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if args.do_test: del model gc.collect() args.do_train = False model = BertForSequenceClassification_last2embedding_cls.from_pretrained( os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False) if args.predict_eval: del model gc.collect() args.do_train = False model = BertForSequenceClassification_last2embedding_cls.from_pretrained( os.path.join(args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() 
model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
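# --- Distilled sketch of the gradient-accumulation pattern used above -----------
# A minimal, self-contained version of the accumulate-then-step logic from the
# training loop (toy model and data; the real script uses BERT and AdamW).
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

toy_data = TensorDataset(torch.randn(64, 10), torch.randint(0, 3, (64,)))
toy_loader = DataLoader(toy_data, batch_size=8, shuffle=True)
toy_model = nn.Linear(10, 3)
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1)
accumulation_steps = 4

toy_optimizer.zero_grad()
for step, (x, y) in enumerate(toy_loader):
    loss = nn.functional.cross_entropy(toy_model(x), y)
    (loss / accumulation_steps).backward()   # scale so gradients average over the virtual batch
    if (step + 1) % accumulation_steps == 0:
        toy_optimizer.step()                 # one optimizer update per accumulated batch
        toy_optimizer.zero_grad()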
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--dataset_name", default="top300_kl", type=str, required=True, help="The name of dataset to inference (without extention ex) top300_kl)") parser.add_argument("--model_type", default="baseline_tfidf", type=str, required=True, help="baseline, baseline_tfidf, ir-v0, ir-v1") parser.add_argument("--model_path", default=None, type=str, required=True, help="path to model dir") parser.add_argument("--output_dir", default=None, type=str, required=True, help="save_path") ## Other parameters parser.add_argument("--bert_model", default="bert-base-multilingual-cased", type=str, help="Default: bert-base-multilingual-cased" "Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--model_file", default="pytorch_model.bin", type=str, help="The file of model (.bin), default is pytorhc_model.bin,\n" "특정 파일이 필요시 이름 설정 필요") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) processor = IRProcessor() label_list = processor.get_labels() num_labels = len(label_list) print("model:", args.model_type) if args.model_type == "baseline": # load model (finetuned baseline on IR) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False) config = BertConfig(os.path.join(args.model_path + "bert_config.json")) model = BertForPreTraining(config) model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) elif args.model_type == "baseline_tfidf": # load model (baseline_tfidf) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) TFIDFconfig = modeling.BertConfig(os.path.join(args.model_path + "bert_config.json")) model = modeling.BertTFIDFForPreTraining(TFIDFconfig) 
model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) elif args.model_type == "ir-v0": # load model (*-head) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) head_config = modeling_ir.BertForIRConfig(os.path.join(args.model_path + "bert_config.json")) model = modeling_ir.BertForIRForPreTraining(head_config) model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) elif args.model_type == "ir-v1": # load model (*-head) tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False, do_tf_idf=True) head_config = modeling_ir_2.BertForIRConfig(os.path.join(args.model_path + "bert_config.json")) model = modeling_ir_2.BertForIRForPreTraining(head_config) model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file))) if args.fp16: model.half() model.to(device) tfidf_dict = pickle_load(os.path.join(args.data_dir, args.dataset_name + '_tfidf.pkl')) results_logit = dict() results_softmax = dict() eval_set, documents, queries = processor.make_eval_set(args.data_dir, args.dataset_name) logger.info("***** Running evaluation *****") logger.info(" Batch size = %d", args.eval_batch_size) for q_num, query in tqdm(enumerate(queries), total=len(queries), desc="Evaluating"): # for query in queries[0:1]: # for testing logger.info(f"Current Query Num : {q_num}") eval_examples = processor._create_examples(eval_set, query, documents) # logger.info(" Num examples = %d", len(eval_examples)) if args.model_type == "baseline": # baseline or baseline_finetuned eval_features = convert_examples_to_features_for_vanilla( eval_examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Query"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): _, logits = model(input_ids, segment_ids, input_mask) # loss_fct = CrossEntropyLoss() # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) # eval_loss += tmp_eval_loss.mean().item() # nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) else: # baseline_tfidf or *-head model eval_data = LazyDatasetClassifier(eval_examples, label_list, args.max_seq_length, tokenizer, tfidf_dict) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for batch in tqdm(eval_dataloader, desc="Query"): batch = tuple(t.to(device) for t in batch) input_ids, input_weights, input_mask, segment_ids, label_ids = batch with torch.no_grad(): _, logits = model(input_ids, input_weights, segment_ids, input_mask) # 
loss_fct = CrossEntropyLoss() # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) # eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) # eval_loss = eval_loss / nb_eval_steps preds = preds[0] results_softmax[query] = [] for i, pred in enumerate(softmax(preds)): # using softmax pair = dict() pair["score"] = pred[1] pair["doc_id"] = list(documents.keys())[i] results_softmax[query].append(pair) results_softmax[query].sort(reverse=True, key=lambda x: x["score"]) ranked_doc_list = [] for doc in results_logit[query]: ranked_doc_list.append(doc["doc_id"]) results_logit[query] = ranked_doc_list ranked_doc_list = [] for doc in results_softmax[query]: ranked_doc_list.append(doc["doc_id"]) results_softmax[query] = ranked_doc_list save_name2 = args.model_path.split('/')[0] + '_' + args.model_file.split('.')[0] \ + '_' + args.dataset_name + '_output.json' path2 = os.path.join(args.output_dir, save_name2) with open(path2, 'w', encoding="utf8") as f: json.dump(results_softmax, f, indent=4, sort_keys=True, ensure_ascii=False)
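# --- Assumed helper: row-wise softmax over the ranking logits --------------------
# softmax() is called above but not defined in this excerpt; a numerically stable
# NumPy version consistent with that usage (pred[1] taken as the relevance score)
# would look like this. This is a sketch, not the original implementation.
import numpy as np

def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)   # subtract max for numerical stability
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)

scores = softmax(np.array([[2.0, 0.5], [0.1, 1.3]]))
print(scores[:, 1])   # -> [0.18242552, 0.76852478]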
def plot_examples(examples, name):
    clipped = torch.clamp(examples.detach(), 0, 1)
    image = make_grid(clipped)
    fig = Figure()
    canvas = backend.FigureCanvasAgg(fig)
    ax = fig.subplots()
    ax.set_title(name)
    ax.imshow(image.permute(1, 2, 0).numpy())
    canvas.print_figure(name)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_file = 'data/train.pt'
dataset = TensorDataset(torch.load(train_file))
loader = DataLoader(dataset, batch_size=16, shuffle=True)
writer = SummaryWriter()

decoder = 'sbd'
model = VAE(im_size=64, decoder=decoder)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

steps = 0
log = '[{:d}/{:d}] MSE: {:.6f} KL: {:.6f} Total: {:.6f}'
for epoch in range(100):
    print('Epoch {:d}'.format(epoch + 1))
    train_loss = 0
    train_mse = 0
    train_kl = 0
if args.deterministic:
    logging.info("Running with deterministic sequence. Performance will be slower")
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False

#################################
####### Read data and precompute ######
img = np.load(gdict['ip_fname'], mmap_mode='r')[:gdict['num_imgs']].transpose(0, 1, 2, 3).copy()
t_img = torch.from_numpy(img)
logging.info("%s, %s" % (img.shape, t_img.shape))

dataset = TensorDataset(t_img)
dataloader = DataLoader(dataset, batch_size=gdict['batchsize'], shuffle=True, num_workers=0, drop_last=True)

# Precompute metrics with validation data for computing losses
with torch.no_grad():
    val_img = np.load(gdict['ip_fname'])[-3000:].transpose(0, 1, 2, 3).copy()
    t_val_img = torch.from_numpy(val_img).to(gdict['device'])

    # Precompute radial coordinates
    r, ind = f_get_rad(img)
    r = r.to(gdict['device'])
class processer(): def __init__(self): pass def get_labels(self): return ['0', '1'] def read_txt(self, filename): with open(filename, 'r') as rf: lines = rf.readlines() return lines def create_examples(self, data, type): examples = [] for i, line in enumerate(data): guid = f'{i}-{line}' text_a = line.split('\t')[1] text_b = None label = line.split('\t')[3].replace('\n', '') if type != 'test' else '0' example = InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label) examples.append(example) return examples def convert_examples_to_features(self, examples, tokenizer, max_length=512, label_list=None, output_mode=None, pad_on_left=False, pad_token=0, pad_token_segment_id=0, mask_padding_with_zero=True, split_num=4): """ Loads a data file into a list of ``InputFeatures`` Args: examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. tokenizer: Instance of a tokenizer that will tokenize the examples max_length: Maximum example length task: CLUE task label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method output_mode: String indicating the output mode. Either ``regression`` or ``classification`` pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) pad_token: Padding token pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for actual values) Returns: If the input is a list of ``InputExamples``, will return a list of task-specific ``InputFeatures`` which can be fed to the model. """ label_map = {label: i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): if ex_index % 10000 == 0: logger.info("Writing example %d" % (ex_index)) #***对长文本进行切分,将切分后的每一个子句作为一个单独完整的句子,计算feature*** split_text_length = int(len(example.text_a) / split_num) split_features = [] for i in range(split_num): split_text = example.text_a[split_text_length * i:split_text_length * (i + 1)] inputs = tokenizer.encode_plus(split_text, example.text_b, add_special_tokens=True, max_length=max_length) input_ids, token_type_ids = inputs["input_ids"], inputs[ "token_type_ids"] # The mask has 1 for real tokens and 0 for padding tokens. Only real # tokens are attended to. attention_mask = [1 if mask_padding_with_zero else 0 ] * len(input_ids) input_len = len(input_ids) # Zero-pad up to the sequence length. 
padding_length = max_length - len(input_ids) if pad_on_left: input_ids = ([pad_token] * padding_length) + input_ids attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids else: input_ids = input_ids + ([pad_token] * padding_length) attention_mask = attention_mask + ( [0 if mask_padding_with_zero else 1] * padding_length) token_type_ids = token_type_ids + ( [pad_token_segment_id] * padding_length) assert len( input_ids ) == max_length, "Error with input length {} vs {}".format( len(input_ids), max_length) assert len( attention_mask ) == max_length, "Error with input length {} vs {}".format( len(attention_mask), max_length) assert len( token_type_ids ) == max_length, "Error with input length {} vs {}".format( len(token_type_ids), max_length) if output_mode == "classification": label = label_map[example.label] elif output_mode == "regression": label = float(example.label) else: raise KeyError(output_mode) if ex_index < 5: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) logger.info("label: %s (id = %s)" % (example.label, label)) logger.info("input length: %d" % (input_len)) split_features.append( InputFeatures(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label, input_len=input_len) ) #split_features中包含的就是split_num个子句的InputFeatures对象 features.append(split_features) return features def create_dataset(self, features): features_input_ids, features_attention_mask,features_token_type_ids,features_input_len,features_label= [],[],[],[],[] for split_features in features: split_features_input_ids, split_features_attention_mask,split_features_token_type_ids,split_features_input_len,split_features_label= [],[],[],[],[] split_features_input_ids.append(split_features_input_ids) split_features_attention_mask.append(split_features_attention_mask) split_features_token_type_ids.append(split_features_token_type_ids) split_features_input_len.append(split_features_input_len) split_features_label.append(split_features_label) features_input_ids.extend(split_features_input_ids) features_attention_mask.extend(split_features_attention_mask) features_token_type_ids.extend(split_features_token_type_ids) features_input_len.extend(split_features_input_len) features_attention_mask.extend(split_features_attention_mask) features_input_ids = torch.tensor(features_input_ids) features_attention_mask = torch.tensor(features_attention_mask) features_token_type_ids = torch.tensor(features_token_type_ids) features_input_len = torch.tensor(features_input_len) features_attention_mask = torch.tensor(features_attention_mask) print(all_input_ids.shape) print(all_attention_mask.shape) print(all_token_type_ids.shape) print(all_lens.shape) print(all_labels.shape) dataset = TensorDataset(features_input_ids, features_attention_mask, features_token_type_ids, features_input_len, features_attention_mask) return dataset
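# --- Illustrative sketch: stacking the nested split features into tensors -------
# features (as built above) is a list of length num_examples whose elements are
# lists of split_num InputFeatures. A straightforward way to tensorize it, keeping
# one (split_num, max_length) block per example, is sketched here as an assumption
# about the intended create_dataset behaviour, not the original implementation.
import torch
from torch.utils.data import TensorDataset

def build_split_dataset(features):
    input_ids = torch.tensor([[f.input_ids for f in split] for split in features], dtype=torch.long)
    attention_mask = torch.tensor([[f.attention_mask for f in split] for split in features], dtype=torch.long)
    token_type_ids = torch.tensor([[f.token_type_ids for f in split] for split in features], dtype=torch.long)
    input_lens = torch.tensor([[f.input_len for f in split] for split in features], dtype=torch.long)
    labels = torch.tensor([split[0].label for split in features], dtype=torch.long)  # one label per example
    # Shapes: (num_examples, split_num, max_length) for the first three,
    # (num_examples, split_num) for input_lens, (num_examples,) for labels.
    return TensorDataset(input_ids, attention_mask, token_type_ids, input_lens, labels)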
def startAutoEncoderGrey(dataset): global myDataYTrain global myDataXTrain global myDataYTest global myDataXTest if (dataset == 'SmallDataset_Q*Bert_Mixed_Greyscale'): split = False datasetTrain = load_dataset( '/home/annika/BA-Datensaetze/SmallDataset_Q*Bert_Mixed_Greyscale/SmallDatasetTraining_Q*Bert_Mixed_Greyscale.npy' ) datasetTest = load_dataset( '/home/annika/BA-Datensaetze/SmallDataset_Q*Bert_Mixed_Greyscale/SmallDatasetTest_Q*Bert_Mixed_Greyscale.npy' ) title = 'Auto-Encoder mit SmallDatasetTest_Q*Bert_Mixed_Greyscale' elif (dataset == 'SmallDataset_SpaceInvaders_Greyscale'): split = True datasetTrain = load_dataset( '/home/annika/BA-Datensaetze/SmallDataset_SpaceInvaders_Greyscale/smallDatasetTraining1_SpaceInvaders_Greyscale.npy' ) datasetTrain2 = load_dataset( '/home/annika/BA-Datensaetze/SmallDataset_SpaceInvaders_Greyscale/smallDatasetTraining2_SpaceInvaders_Greyscale.npy' ) datasetTest = load_dataset( '/home/annika/BA-Datensaetze/SmallDataset_SpaceInvaders_Greyscale/smallDatasetTest_SpaceInvaders_Greyscale.npy' ) title = 'Auto-Encoder mit SmallDataset_SpaceInvaders_Greyscale' TrainingImage = Image.fromarray(datasetTrain[10]) # firstImage.show() plt.imshow(TrainingImage, cmap=plt.get_cmap('gray')) plt.show() TestImage = Image.fromarray(datasetTest[10]) # firstImage.show() plt.imshow(TestImage, cmap=plt.get_cmap('gray')) plt.show() ae = AutoEncoderWMGrey() #ae = AutoEncoderMFGrey() #ae = AutoEncoderGrey() #ae = AutoEncoderVAEGrey() ae.to(torch.device("cuda:0")) print(ae) # define our optimizer and loss function loss_func = nn.MSELoss() optimizer = torch.optim.Adamax(ae.parameters(), lr=4e-4) # losses = [] global plotter plotter = VisdomLinePlotter(env_name='Tutorial Plots') if (split): iterationsTrain = ((len(datasetTrain) + len(datasetTrain2)) // 1000) firstIterationTrain = (len(datasetTrain) // 1000) else: iterationsTrain = (len(datasetTrain) // 1000) firstIterationTrain = iterationsTrain rest = False if (len(datasetTrain) % 1000 != 0): iterationsTrain += 1 rest = True predictions = [] epochs = 4 for e in range(epochs): for i in range(iterationsTrain): train_snippet = i + (e * iterationsTrain) losses = [] startTrain = i * 1000 stopTrain = ((i + 1) * 1000) - 1 if (split): if (i + 1 < firstIterationTrain): trainSetSnippet = datasetTrain[startTrain:stopTrain, :, :] else: if (i + 1 == firstIterationTrain): trainSetSnippet = datasetTrain[startTrain:, :, :] else: startTrain = (i - firstIterationTrain) * 1000 stopTrain = ((i - firstIterationTrain + 1) * 1000) - 1 if (i + 1 == iterationsTrain): trainSetSnippet = datasetTrain2[startTrain:, :, :] else: trainSetSnippet = datasetTrain2[ startTrain:stopTrain, :, :] else: if (i + 1 < firstIterationTrain): trainSetSnippet = datasetTrain[startTrain:stopTrain, :, :] else: trainSetSnippet = datasetTrain[startTrain:, :, :] trainSetSnippet = trainSetSnippet.reshape(len(trainSetSnippet), 210, 160, 1) #print(trainSetSnippet.shape) trainSetSnippet = normalize(trainSetSnippet) trn_torch = torch.from_numpy(trainSetSnippet).type( torch.cuda.FloatTensor) trn_torch = trn_torch.permute(0, 3, 1, 2) trn_torch = trn_torch[:, :, :, :] trn = TensorDataset(trn_torch, trn_torch) trn_dataloader = torch.utils.data.DataLoader(trn, batch_size=1, shuffle=False, num_workers=0) startTest = i * 430 stopTest = ((i + 1) * 430) - 1 if (i + 1 == iterationsTrain): testSetSnippet = datasetTest[startTest:, :, :] else: testSetSnippet = datasetTest[startTest:stopTest, :, :] testSetSnippet = testSetSnippet.reshape(len(testSetSnippet), 210, 160, 1) #print(testSetSnippet.shape) 
testSetSnippet = normalize(testSetSnippet) test_torch = torch.from_numpy(testSetSnippet).type( torch.cuda.FloatTensor) test_torch = test_torch.permute(0, 3, 1, 2) test_torch = test_torch[:, :, :, :] test = TensorDataset(test_torch, test_torch) test_dataloader = torch.utils.data.DataLoader(test, batch_size=20, shuffle=False, num_workers=0) # last_loss = 1 for batch_idx, (data, target) in enumerate(trn_dataloader): data = torch.autograd.Variable(data) optimizer.zero_grad() pred = ae(data) loss = loss_func(pred, data) losses.append(loss.cpu().data.item()) # Backpropagation loss.backward() optimizer.step() # Display if batch_idx % 25 == 1: number = (((i + 1) * 1000)) if (i + 1 == iterationsTrain): number = len(datasetTrain) numberAll = number * (e + 1) print( '\r Images trained: {}/{} epochs: {}/{} [{}/{} ({:.0f}%)]\tLoss: {:.6f}' .format(i, iterationsTrain, e + 1, epochs, batch_idx * len(data), len(trn_dataloader.dataset), 100. * batch_idx / len(trn_dataloader), loss.cpu().data.item()), end='') # if(loss.cpu().data.item() <= last_loss): # last_loss = loss.cpu().data.item() median_loss_train = statistics.median(losses) plotter.plot('loss', 'train', title, train_snippet + 1, median_loss_train) if (i == 0 and e == 0): myDataXTrain.append(0) myDataYTrain.append(losses[0]) myDataXTrain.append(train_snippet + 1) myDataYTrain.append(median_loss_train) ae.eval() loss_func_val = nn.MSELoss() losses_val = [] for batch_idx, (data, target) in enumerate(test_dataloader): data = torch.autograd.Variable(data) pred = ae(data) for prediction in pred: predictions.append(prediction) loss_val = loss_func_val(pred, data) losses_val.append(loss_val.cpu().data.item()) print('\ntestLossSum = {}'.format(loss_val.cpu().data.item())) median_loss_test = statistics.median(losses_val) plotter.plot('loss', 'validation', title, train_snippet + 1, median_loss_test) if (i == 0 and e == 0): myDataXTest.append(0) myDataYTest.append(losses_val[0]) myDataXTest.append(train_snippet + 1) myDataYTest.append(median_loss_test) if ((i == (iterationsTrain - 1)) and (e == 0 or e == (epochs - 1))): test_torch = test_torch.permute(0, 2, 3, 1) yay = torch.tensor([255], dtype=torch.int, device=torch.device("cuda:0")) testImg = test_torch[2] * yay show_torch_image_Grey(testImg.reshape(210, 160)) # * torch.tensor([255,255,255]) predImg = predictions[2].permute( 1, 2, 0).detach() * torch.tensor( [255], dtype=torch.int, device=torch.device("cuda:0")) show_torch_image_Grey(predImg.reshape(210, 160)) test_torch = test_torch.permute(0, 3, 1, 2) predictions = [] global episode global evaluationsfolder pathEvaluation = evaluationsfolder + "/" + 'Episode{}/ae.pt'.format( episode) torch.save(ae.state_dict(), pathEvaluation)
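# --- Assumed helper: normalize() used above is not shown in this excerpt --------
# A plausible minimal version for 8-bit grayscale frames, scaling pixel values to
# [0, 1]. This is an assumption for illustration, not the original implementation.
import numpy as np

def normalize(frames):
    return frames.astype(np.float32) / 255.0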
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--input_file", default=None, type=str, required=True) parser.add_argument("--output_file", default=None, type=str, required=True) parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") ## Other parameters parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") parser.add_argument("--local_rank", type=int, default=-1, help = "local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1))) layer_indexes = [int(x) for x in args.layers.split(",")] tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) examples = read_examples(args.input_file) features = convert_examples_to_features( examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) unique_id_to_feature = {} for feature in features: unique_id_to_feature[feature.unique_id] = feature model = BertModel.from_pretrained(args.bert_model) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) model.eval() with open(args.output_file, "w", encoding='utf-8') as writer: for input_ids, input_mask, example_indices in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) all_encoder_layers = all_encoder_layers for b, example_index in enumerate(example_indices): feature = features[example_index.item()] unique_id = int(feature.unique_id) # feature = unique_id_to_feature[unique_id] output_json = collections.OrderedDict() output_json["linex_index"] = unique_id all_out_features = [] for (i, token) in enumerate(feature.tokens): 
all_layers = [] for (j, layer_index) in enumerate(layer_indexes): layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() layer_output = layer_output[b] layers = collections.OrderedDict() layers["index"] = layer_index layers["values"] = [ round(x.item(), 6) for x in layer_output[i] ] all_layers.append(layers) out_features = collections.OrderedDict() out_features["token"] = token out_features["layers"] = all_layers all_out_features.append(out_features) output_json["features"] = all_out_features writer.write(json.dumps(output_json) + "\n")
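The extraction loop above keeps an index tensor inside the TensorDataset so that batched hidden states can be written back against the original features. A small self-contained sketch of that bookkeeping pattern, using toy tensors instead of real BERT inputs:

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

input_ids = torch.randint(0, 1000, (8, 16))       # 8 toy examples, sequence length 16
input_mask = torch.ones_like(input_ids)
example_index = torch.arange(input_ids.size(0))   # position of each example

data = TensorDataset(input_ids, input_mask, example_index)
loader = DataLoader(data, sampler=SequentialSampler(data), batch_size=3)

order = []
for ids, mask, indices in loader:
    # `indices` records which original rows this batch came from, so per-example
    # outputs can be written back in input order even if a sampler reordered them.
    for b, idx in enumerate(indices):
        order.append(idx.item())
print(order)  # [0, 1, 2, 3, 4, 5, 6, 7] with SequentialSampler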
print(x_validate0.shape) f = np.load('test_origin.npz') x_test0 = f['a'] if testSizeEffect: sz = int(x_test0.shape[0] / 2) x_test0 = x_test0[:sz] print(x_test0.shape) return (x_train0, x_train, y_train, x_validate0, x_validate, y_validate, x_test0, x_test, y_test) (x_train0, x_train, y_train, x_validate0, x_validate, y_validate, x_test0, x_test, y_test) = loadData() dataset_train = TensorDataset(Tensor(x_train), Tensor(y_train)) train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=seqLen, shuffle=False) dataset_validate = TensorDataset(Tensor(x_validate), Tensor(y_validate)) validate_loader = torch.utils.data.DataLoader(dataset_validate, batch_size=seqLen, shuffle=False) dataset_test = TensorDataset(Tensor(x_test), Tensor(y_test)) test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=seqLen, shuffle=False) net = Net()
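One detail worth noting in snippets like the one above: torch.Tensor(...) always produces float32, so integer labels are silently converted, whereas torch.from_numpy keeps the NumPy dtype. A tiny sketch with synthetic arrays illustrating the difference:

import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

x = np.random.randn(12, 5).astype(np.float32)
y = np.random.randint(0, 2, size=(12,)).astype(np.int64)

# torch.Tensor converts everything to float32; torch.from_numpy preserves dtype.
ds_float = TensorDataset(torch.Tensor(x), torch.Tensor(y))              # labels become float32
ds_typed = TensorDataset(torch.from_numpy(x), torch.from_numpy(y))      # labels stay int64

loader = DataLoader(ds_typed, batch_size=4, shuffle=False)
for xb, yb in loader:
    print(xb.dtype, yb.dtype)  # torch.float32 torch.int64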
#get some random data around value def get_data(value, shape): data = torch.ones(shape) * value #add some noise data += torch.randn(shape)**2 return data #dataset #cat some data with different values data = torch.cat( (get_data(0, (100, 1, 14, 14)), get_data(0.5, (100, 1, 14, 14))), 0) #labels labels = torch.cat((torch.zeros(100), torch.ones(100)), 0) #generator gen = DataLoader(TensorDataset(data, labels), batch_size=25, shuffle=True) #network m = M() #loss and optim loss = nn.NLLLoss() optimizer = torch.optim.Adam(params=m.parameters()) #settings for train and log num_epochs = 20 embedding_log = 5 writer = SummaryWriter(comment='mnist_embedding_training') #TRAIN for epoch in range(num_epochs): for j, sample in enumerate(gen): n_iter = (epoch * len(gen)) + j #reset grad
pad_to_max_length = True, return_attention_mask = True, # Construct attn. masks. return_tensors = 'pt', # Return pytorch tensors. ) input_ids.append(encoded_dict['input_ids']) attention_masks.append(encoded_dict['attention_mask']) input_ids, attention_masks, labels = torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), torch.tensor(labels) print('Original: ', sentences[0]) print('Token IDs:', input_ids[0]) from torch.utils.data import TensorDataset, random_split dataset = TensorDataset(input_ids, attention_masks, labels) train_size = int(0.8 * len(dataset)) val_size = len(dataset) - train_size train_dataset, val_dataset = random_split(dataset, [train_size, val_size]) print('%d training samples'%(train_size)) print('%d validation samples'%(val_size)) from torch.utils.data import DataLoader, RandomSampler, SequentialSampler batch_size = 32 train_dataloader = DataLoader( train_dataset,
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--vocab_file", default='bert-base-uncased-vocab.txt', type=str, required=True) parser.add_argument("--model_file", default='bert-base-uncased.tar.gz', type=str, required=True) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument( "--predict_dir", default=None, type=str, required=True, help="The output directory where the predictions will be written.") parser.add_argument('--predict_output_file', type=str, default='predictions.json') parser.add_argument('--label_output_file', type=str, default='evidence_predictions.json') # Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=2.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") # Base setting parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--max_ctx', type=int, default=2) parser.add_argument('--task_name', type=str, default='coqa_yesno') parser.add_argument('--bert_name', type=str, default='baseline') parser.add_argument('--reader_name', type=str, default='coqa') # model parameters parser.add_argument('--evidence_lambda', type=float, default=0.8) parser.add_argument('--negative_lambda', type=float, default=1.0) parser.add_argument('--add_entropy', default=False, action='store_true') parser.add_argument('--split_num', type=int, default=3) parser.add_argument('--split_index', type=int, default=0) # Parameters for running labeling model parser.add_argument('--do_label', default=False, action='store_true') parser.add_argument('--sentence_id_file', type=str, default=None) parser.add_argument('--weight_threshold', type=float, default=0.0) parser.add_argument('--label_threshold', type=float, default=0.0) # negative sample parameters parser.add_argument('--do_negative_sampling', default=False, action='store_true') parser.add_argument('--read_extra_self', default=False, action='store_true') parser.add_argument('--sample_ratio', type=float, default=0.5) parser.add_argument('--extra_sen_file', type=str, default=None) parser.add_argument('--multi_inputs', default=False, action='store_true') args = parser.parse_args() logger = setting_logger(args.output_dir) logger.info('================== Program start. ========================') # model parameters model_params = prepare_model_params(args) # read parameters read_params = prepare_read_params(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if args.do_train: if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) if args.do_predict: os.makedirs(args.predict_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.vocab_file) data_reader = initialize_reader(args.reader_name) num_train_steps = None if args.do_train or args.do_label: train_examples = data_reader.read(input_file=args.train_file, **read_params) cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), str(args.max_ctx), str(args.task_name)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except FileNotFoundError: train_features = data_reader.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) num_train_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Just for test no_evidence = 0 for feature in train_features: if feature.sentence_id == -1: no_evidence += 1 logger.info( f'No evidence ratio: {no_evidence} / {len(train_features)} = {no_evidence * 1.0 / len(train_features)}' ) # Prepare model if args.pretrain is not None: logger.info('Load pretrained model from {}'.format(args.pretrain)) model_state_dict = torch.load(args.pretrain, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) else: model = initialize_model(args.bert_name, args.model_file, **model_params) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) # Prepare data if 'read_state' in read_params: read_params['read_state'] = ReadState.NoNegative eval_examples = data_reader.read(input_file=args.predict_file, **read_params) eval_features = data_reader.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) eval_tensors = data_reader.data_to_tensors(eval_features) eval_data = TensorDataset(*eval_tensors) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) if args.do_train: if args.sentence_id_file is not None: logger.info('Training with evidence self-labeled data.') data_reader.generate_features_sentence_ids(train_features, args.sentence_id_file) else: logger.info('No sentence id file found. Train in traditional way.') logger.info("Start training") train_loss = AverageMeter() best_acc = 0.0 summary_writer = SummaryWriter(log_dir=args.output_dir) global_step = 0 eval_loss = AverageMeter() train_tensors = data_reader.data_to_tensors(train_features) train_data = TensorDataset(*train_tensors) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) for epoch in trange(int(args.num_train_epochs), desc="Epoch"): # Train model.train() for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( batch, train_features, model_state=ModelState.Train) output_dict = model(**inputs) loss = output_dict['loss'] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 train_loss.update(loss.item(), args.train_batch_size) summary_writer.add_scalar('train_loss', train_loss.avg, global_step) # Evaluation model.eval() all_results = [] logger.info("Start evaluating") for eval_step, batch in enumerate( tqdm(eval_dataloader, desc="Evaluating")): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( batch, eval_features, model_state=ModelState.Evaluate) with torch.no_grad(): output_dict = model(**inputs) loss, batch_choice_logits = output_dict[ 'loss'], output_dict['yesno_logits'] eval_loss.update(loss.item(), args.predict_batch_size) summary_writer.add_scalar( 'eval_loss', eval_loss.avg, epoch * len(eval_dataloader) + eval_step) example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu( ).tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) data_reader.write_predictions(eval_examples, eval_features, all_results, None, null_score_diff_threshold=0.0) yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no') no_metric = data_reader.yesno_cate.f1_measure('no', 'yes') current_acc = yes_metric['accuracy'] summary_writer.add_scalar('eval_yes_f1', yes_metric['f1'], epoch) summary_writer.add_scalar('eval_yes_recall', yes_metric['recall'], epoch) summary_writer.add_scalar('eval_yes_precision', yes_metric['precision'], epoch) summary_writer.add_scalar('eval_no_f1', no_metric['f1'], epoch) summary_writer.add_scalar('eval_no_recall', no_metric['recall'], epoch) summary_writer.add_scalar('eval_no_precision', no_metric['precision'], epoch) summary_writer.add_scalar('eval_yesno_acc', current_acc, epoch) torch.cuda.empty_cache() if current_acc > best_acc: best_acc = current_acc model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) logger.info('Epoch: %d, Accuracy: %f (Best Accuracy: %f)' % (epoch, current_acc, best_acc)) data_reader.yesno_cate.reset() summary_writer.close() # Loading trained model. 
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") model_state_dict = torch.load(output_model_file, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) model.to(device) # Write Yes/No predictions if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = eval_examples test_features = eval_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start predicting yes/no on Dev set.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs(batch, test_features, model_state=ModelState.Test) with torch.no_grad(): output_dict = model(**inputs) batch_choice_logits = output_dict['yesno_logits'] example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu().tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) output_prediction_file = os.path.join(args.predict_dir, 'predictions.json') data_reader.write_predictions(eval_examples, eval_features, all_results, output_prediction_file, null_score_diff_threshold=0.0) yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no') no_metric = data_reader.yesno_cate.f1_measure('no', 'yes') logger.info('Yes Metrics: %s' % json.dumps(yes_metric, indent=2)) logger.info('No Metrics: %s' % json.dumps(no_metric, indent=2)) # Labeling sentence id. 
if args.do_label and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = train_examples test_features = train_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running labeling *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start labeling.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device(batch, device) inputs = data_reader.generate_inputs(batch, test_features, model_state=ModelState.Test) with torch.no_grad(): output_dict = model(**inputs) batch_choice_logits = output_dict['yesno_logits'] batch_max_weight_indexes = output_dict['max_weight_index'] batch_max_weights = output_dict['max_weight'] example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu().tolist() max_weight_index = batch_max_weight_indexes[i].detach().cpu( ).tolist() max_weight = batch_max_weights[i].detach().cpu().tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( FullResult(unique_id=unique_id, choice_logits=choice_logits, max_weight_index=max_weight_index, max_weight=max_weight)) output_prediction_file = os.path.join(args.predict_dir, args.label_output_file) data_reader.write_sentence_predictions( test_examples, test_features, all_results, output_prediction_file, weight_threshold=args.weight_threshold, label_threshold=args.label_threshold)
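The pipeline above builds its datasets by star-unpacking the tuple returned by data_reader.data_to_tensors into TensorDataset. The sketch below shows that pattern in isolation; features_to_tensors is a hypothetical stand-in for the reader's real conversion method:

import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def features_to_tensors(features):
    # Hypothetical stand-in for data_reader.data_to_tensors: turn a list of
    # feature records (plain dicts here) into a tuple of aligned tensors.
    input_ids = torch.tensor([f["input_ids"] for f in features], dtype=torch.long)
    input_mask = torch.tensor([f["input_mask"] for f in features], dtype=torch.long)
    example_index = torch.arange(len(features), dtype=torch.long)
    return input_ids, input_mask, example_index

features = [{"input_ids": [1, 2, 3], "input_mask": [1, 1, 1]} for _ in range(4)]
tensors = features_to_tensors(features)

# TensorDataset accepts any number of equally sized tensors, so the tuple can
# be star-unpacked directly, mirroring TensorDataset(*eval_tensors) above.
dataset = TensorDataset(*tensors)
loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=2)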
def _get_data(self, flag): """Function that creats a dataloader basd on flag. Args: flag: Flag indicating if we should return training/validation/testing dataloader Returns: data_loader: Dataloader for the required dataset. """ # Here we initialize matrix that will store last past error # and selection probabilty for each expert by the gate if flag == "test": shuffle_flag = False drop_last = True batch_size = self.args.batch_size data_set = TensorDataset( torch.Tensor(self.data.test_x), torch.Tensor(self.data.test_index), torch.Tensor(self.data.test_y)) self.past_test_error = torch.zeros( (len(self.data.test_x), self.num_experts), requires_grad=False).to(self.device) self.gate_weights_test = torch.ones( (len(self.data.test_x), self.num_experts), requires_grad=False).to( self.device) * 1 / self.num_experts elif flag == "pred": shuffle_flag = False drop_last = False # To take advantage of past error we process test dataset one by one # during final prediction. batch_size = 1 data_set = TensorDataset( torch.Tensor(self.data.test_x), torch.Tensor(self.data.test_index), torch.Tensor(self.data.test_y)) self.past_test_error = torch.zeros( (len(self.data.test_x), self.num_experts), requires_grad=False).to(self.device) elif flag == "val": shuffle_flag = False drop_last = False batch_size = self.args.batch_size data_set = TensorDataset( torch.Tensor(self.data.valid_x), torch.Tensor(self.data.valid_index), torch.Tensor(self.data.valid_y)) self.past_val_error = torch.zeros( (len(self.data.valid_x), self.num_experts), requires_grad=False).to(self.device) self.gate_weights_val = torch.ones( (len(self.data.valid_x), self.num_experts), requires_grad=False).to( self.device) * 1 / self.num_experts else: shuffle_flag = False drop_last = True batch_size = self.args.batch_size data_set = TensorDataset( torch.Tensor(self.data.train_x), torch.Tensor(self.data.train_index), torch.Tensor(self.data.train_y)) self.past_train_error = torch.zeros( (len(self.data.train_x), self.num_experts), requires_grad=False).to(self.device) self.gate_weights_train = torch.ones( (len(self.data.train_x), self.num_experts), requires_grad=False).to( self.device) * 1 / self.num_experts # Fitting past error matrix self.error_scaler.fit( self.past_train_error.detach().cpu().numpy().flatten().reshape(-1, 1)) print("Data for", flag, "dataset size", len(data_set)) data_loader = DataLoader( data_set, batch_size=batch_size, shuffle=shuffle_flag, num_workers=self.args.num_workers, drop_last=drop_last) if flag == "train": data_loader_shuffled = DataLoader( data_set, batch_size=batch_size, shuffle=True, num_workers=self.args.num_workers, drop_last=drop_last) return data_loader, data_loader_shuffled else: return data_loader
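_get_data above hands back both an ordered loader and a shuffled loader over the same training TensorDataset, because the per-sample error and gate-weight matrices are indexed by dataset position. A minimal sketch of that dual-loader pattern with synthetic tensors:

import torch
from torch.utils.data import TensorDataset, DataLoader

x = torch.randn(10, 3)
idx = torch.arange(10, dtype=torch.float32)
y = torch.randn(10, 1)

train_set = TensorDataset(x, idx, y)

# The ordered loader keeps dataset position stable (useful when per-sample
# state such as a past-error matrix is indexed by position); the shuffled
# loader is the one actually used for gradient updates.
ordered_loader = DataLoader(train_set, batch_size=4, shuffle=False, drop_last=True)
shuffled_loader = DataLoader(train_set, batch_size=4, shuffle=True, drop_last=True)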
def run(config): seed = config['seed'] random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed) exp_dir = get_experiment_dir(config) run_dir = os.path.join(exp_dir, 'seed_{}'.format(config['seed'])) # tensorboard logger writer = SummaryWriter(run_dir) # get data loaders and metrics function if config['dataset'] == 'openmic': (train_loader, val_loader, test_loader), (full_dataset, train_inds) = get_openmic_loaders(config) n_classes = 20 metric_fn = evaluate.metrics.metric_fn_openmic elif config['dataset'] == 'sonyc': (train_loader, val_loader, test_loader), train_dataset = get_sonyc_loaders(config) if config['coarse']: n_classes = 8 else: n_classes = 23 metric_fn = evaluate.metrics.metric_fn_sonycust # Randomly remove labels if 'label_drop_rate' in config: label_drop_rate = config['label_drop_rate'] drop_mask = np.random.rand(*train_dataset.Y_mask.shape) drop_mask = train_dataset.Y_mask + drop_mask train_dataset.Y_mask = drop_mask > (1 + label_drop_rate) # hyper params hparams = config['hparams'] lr = hparams['lr'] wd = hparams['wd'] model_params = { 'n_features': hparams['n_features'], 'drop_rate': hparams['dropout'], 'n_classes': n_classes, 'n_layers': hparams['n_layers'] } num_epochs = hparams['num_epochs'] prune_thres = hparams['prune_thres'] batch_size = hparams['batch_size'] # initialize models model = create_model(model_params) # initialize criterion and optimizer criterion = nn.BCELoss() optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd) # initialize best metric variables best_models = [None, None] best_val_loss = 100000.0 best_f1_macro = -1.0 # teacher training loop for epoch in tqdm(range(num_epochs)): # drop learning rate every 30 epochs if (epoch > 0) and (epoch % 30 == 0): for param_group in optimizer.param_groups: param_group['lr'] = lr * 0.5 lr = lr * 0.5 # first train treating all missing labels as negatives train_loss = trainer_baseline(model, train_loader, optimizer, criterion, baseline_type=0) print('#### Training ####') print('Loss: {}'.format(train_loss)) val_loss, metrics = eval_baseline(model, val_loader, criterion, n_classes, metric_fn, baseline_type=1) val_metric = 'F1_macro' if config[ 'dataset'] == 'openmic' else 'auprc_macro' avg_val_metric = np.mean(metrics[val_metric]) print('#### Validation ####') print('Loss: {}\t Macro F1 score: {}'.format(val_loss, avg_val_metric)) # log to tensorboard writer.add_scalar("train/loss", train_loss, epoch) writer.add_scalar("val/loss_loss", val_loss, epoch) writer.add_scalar(f"val/{val_metric}", avg_val_metric, epoch) #Save best models if val_loss < best_val_loss: best_val_loss = val_loss best_models[0] = deepcopy(model) if avg_val_metric > best_f1_macro: best_f1_macro = avg_val_metric best_models[1] = deepcopy(model) # Perform label pruning if config['dataset'] == 'openmic': X = full_dataset.X[train_inds] Y_mask = full_dataset.Y_mask[train_inds] X_dataset = TensorDataset( torch.tensor(X, requires_grad=False, dtype=torch.float32)) loader = DataLoader(X_dataset, batch_size) all_predictions = forward(best_models[0], loader, n_classes) new_mask = get_enhanced_labels(Y_mask, all_predictions, prune_thres) full_dataset.Y_mask[train_inds] = new_mask if config['dataset'] == 'sonyc': X = train_dataset.X Y_mask = train_dataset.Y_mask X_dataset = TensorDataset( torch.tensor(X, requires_grad=False, dtype=torch.float32)) loader = DataLoader(X_dataset, batch_size) all_predictions = forward(best_models[0], loader, n_classes) new_mask = get_enhanced_labels(Y_mask, all_predictions, 
prune_thres) train_dataset.Y_mask = new_mask # Retrain with pruned labels # initialize models model = create_model(model_params) # initialize optimizer optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd) # initialize best metric variables best_models = [None, None] best_val_loss = 100000.0 best_f1_macro = -1.0 for epoch in tqdm(range(num_epochs)): # drop learning rate every 30 epochs if (epoch > 0) and (epoch % 30 == 0): for param_group in optimizer.param_groups: param_group['lr'] = lr * 0.5 lr = lr * 0.5 # train with new mask train_loss = trainer_baseline(model, train_loader, optimizer, criterion, baseline_type=1) print('#### Training ####') print('Loss: {}'.format(train_loss)) val_loss, metrics = eval_baseline(model, val_loader, criterion, n_classes, metric_fn, baseline_type=1) val_metric = 'F1_macro' if config[ 'dataset'] == 'openmic' else 'auprc_macro' avg_val_metric = np.mean(metrics[val_metric]) print('#### Validation ####') print('Loss: {}\t Macro F1 score: {}'.format(val_loss, avg_val_metric)) # log to tensorboard writer.add_scalar("train/loss", train_loss, epoch) writer.add_scalar("val/loss_loss", val_loss, epoch) writer.add_scalar(f"val/{val_metric}", avg_val_metric, epoch) #Save best models if val_loss < best_val_loss: best_val_loss = val_loss best_models[0] = deepcopy(model) if avg_val_metric > best_f1_macro: best_f1_macro = avg_val_metric best_models[1] = deepcopy(model) # Test best models for i, model in enumerate(best_models): test_loss, metrics = eval_baseline(model, test_loader, criterion, n_classes, metric_fn, baseline_type=1) print('#### Testing ####') print('Test Loss: ', test_loss) for key, val in metrics.items(): print(f'Test {key}: {np.mean(val)}') # save metrics and model torch.save(model.state_dict(), os.path.join(run_dir, f'model_{i}.pth')) np.save(os.path.join(run_dir, f'metrics_{i}'), metrics) # jsonify metrics and write to json as well for manual inspection js = {} for key, val in metrics.items(): if not np.ndim(val) == 0: js[key] = val.tolist() else: js[key] = val json.dump(js, open(os.path.join(run_dir, f'metrics_{i}.json'), 'w')) json.dump(config, open(os.path.join(run_dir, f'config.json'), 'w'))
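The pruning step above runs the best model over a features-only TensorDataset (no labels) to collect predictions. A short sketch of that inference pattern with a stand-in model and random features; note the one-element tuple unpacking that a single-tensor TensorDataset requires:

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

# A features-only TensorDataset: each batch is a 1-tuple, unpacked as (xb,).
X = torch.randn(16, 128)
loader = DataLoader(TensorDataset(X), batch_size=4)

model = nn.Sequential(nn.Linear(128, 20), nn.Sigmoid())  # toy multi-label head
model.eval()

preds = []
with torch.no_grad():
    for (xb,) in loader:
        preds.append(model(xb))
all_predictions = torch.cat(preds).numpy()
print(all_predictions.shape)  # (16, 20)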
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

class SNN(nn.Module):
    def __init__(self, in_d, out_d):
        super().__init__()
        self.fc = nn.Linear(in_d, out_d, bias=False)
        nn.init.normal_(self.fc.weight, 0.0, 1.0)

    def forward(self, x):
        return self.fc(x)

model = SNN(300, 4)
dataset = TensorDataset(X_train, y_train)   # X_train: float features, y_train: int64 class ids
dataloader = DataLoader(dataset, shuffle=True)  # batch_size defaults to 1
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

for epoch in range(10):
    for X, y in dataloader:
        optimizer.zero_grad()
        outputs = model(X)
        loss = loss_fun(outputs, y)
        loss.backward()
        optimizer.step()
    return x

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
NN_predictions = np.zeros((test_X.shape[0], ))
oof_preds = np.zeros((train_X.shape[0], ))

x_test = np.array(test_X)
x_test = torch.tensor(x_test, dtype=torch.float)
if torch.cuda.is_available():
    x_test = x_test.cuda()
test = TensorDataset(x_test)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

avg_losses_f = []
avg_val_losses_f = []

for fold_, (trn_, val_) in enumerate(folds.split(train_X)):
    print("fold {}".format(fold_ + 1))
    x_train = Variable(torch.Tensor(train_X[trn_.astype(int)]))
    y_train = Variable(torch.Tensor(train_y[trn_.astype(int), np.newaxis]))
    x_valid = Variable(torch.Tensor(train_X[val_.astype(int)]))
    y_valid = Variable(torch.Tensor(train_y[val_.astype(int), np.newaxis]))
    model = MLP(x_train.shape[1], 512, classes, dropout=0.3)
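The fold loop above slices the training arrays by index before turning each fold into tensors. Below is a compact sketch of one way to wrap each fold in a TensorDataset/DataLoader pair instead of raw tensors, using synthetic arrays; it is an illustration of the pattern, not the snippet's exact approach (the model and training loop are omitted):

import numpy as np
import torch
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader

X = np.random.randn(100, 10).astype(np.float32)
y = np.random.randint(0, 2, size=100).astype(np.float32)

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X)):
    train_set = TensorDataset(torch.tensor(X[trn_idx]), torch.tensor(y[trn_idx]).unsqueeze(1))
    valid_set = TensorDataset(torch.tensor(X[val_idx]), torch.tensor(y[val_idx]).unsqueeze(1))
    train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
    valid_loader = DataLoader(valid_set, batch_size=32, shuffle=False)
    # ... fit a model on train_loader, collect out-of-fold predictions from valid_loader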
"""import data""" # input_ids_train,attention_masks_train,role_type_ids_train,entity_type_ids_train,labels_train = prepare_input('ACE05_events_three_level_train_with_sent_id.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len) # input_ids_dev,attention_masks_dev,role_type_ids_dev,entity_type_ids_dev,labels_dev = prepare_input_emma('test_temp_three_level.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len) # input_ids_dev,attention_masks_dev,role_type_ids_dev,entity_type_ids_dev,labels_dev = prepare_input('ACE05_events_three_level_dev_with_sent_id.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len) input_ids, attention_masks, role_type_ids, entity_type_ids, labels = prepare_input_withIBO_multi_pair( event_type_dict, entity_type_dict, role_type_dict, tokenizer, tokenizer_max_len) # input_ids_dev,attention_masks_dev,role_type_ids_dev,entity_type_ids_dev,labels_dev =prepare_input_withIBO_multi_pair('ACE05_events_three_level_dev_with_sent_id.json',event_type_dict,entity_type_dict,role_type_dict,tokenizer,tokenizer_max_len) from torch.utils.data import TensorDataset, random_split # Combine the training inputs into a TensorDataset. dataset = TensorDataset(input_ids, attention_masks, role_type_ids, entity_type_ids, labels) # Create a 90-10 train-validation split. # Calculate the number of samples to include in each set. train_size = int(0.9 * len(dataset)) val_size = len(dataset) - train_size ## train dev separate version # # """split train and val dataset""" # # from torch.utils.data import TensorDataset, random_split # # # Combine the training inputs into a TensorDataset. # # dataset_train = TensorDataset(input_ids_train, attention_masks_train,role_type_ids_train,entity_type_ids_train, labels_train) # # dataset_dev = TensorDataset(input_ids_dev, attention_masks_dev,role_type_ids_dev,entity_type_ids_dev, labels_dev) # # Create a 90-10 train-validation split.
def main(): start_time = time.time() args = parse_args() make_directories(args.output_dir) # Start Tensorboard and log hyperparams. tb_writer = SummaryWriter(args.output_dir) tb_writer.add_hparams(vars(args), {}) file_log_handler = logging.FileHandler( os.path.join(args.output_dir, 'log.txt')) logger.addHandler(file_log_handler) # Get list of text and list of label (integers) from disk. train_text, train_label_id_list, eval_text, eval_label_id_list = \ get_examples_and_labels(args.dataset) # Augment training data. if (args.augmentation_recipe is not None) and len( args.augmentation_recipe): import pandas as pd if args.augmentation_recipe == 'textfooler': aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590551967800.csv' elif args.augmentation_recipe == 'tf-adjusted': aug_csv = '/p/qdata/jm8wx/research/text_attacks/textattack/outputs/attack-1590564015768.csv' else: raise ValueError( f'Unknown augmentation recipe {args.augmentation_recipe}') aug_df = pd.read_csv(aug_csv) # filter skipped outputs aug_df = aug_df[aug_df['original_text'] != aug_df['perturbed_text']] print( f'Augmentation recipe {args.augmentation_recipe} / augmentation num. examples {args.augmentation_num}/ len {len(aug_df)}' ) original_text = aug_df['original_text'] perturbed_text = aug_df['perturbed_text'] # convert `train_text` and `train_label_id_list` to an np array so things are faster train_text = np.array(train_text) train_label_id_list = np.array(train_label_id_list) x_adv_list = [] x_adv_id_list = [] for (x, x_adv) in zip(original_text, perturbed_text): x = x.replace('[[', '').replace(']]', '') x_adv = x_adv.replace('[[', '').replace(']]', '') x_idx = (train_text == x).nonzero()[0][0] x_adv_label = train_label_id_list[x_idx] x_adv_id_list.append(x_adv_label) x_adv_list.append(x_adv) # truncate to `args.augmentation_num` examples if (args.augmentation_num >= 0): perm = list(range(len(x_adv_list))) random.shuffle(perm) perm = perm[:args.augmentation_num] x_adv_list = [x_adv_list[i] for i in perm] x_adv_id_list = [x_adv_id_list[i] for i in perm] train_text = train_text.tolist() + x_adv_list train_label_id_list = train_label_id_list.tolist() + x_adv_id_list print( f'Augmentation added {len(x_adv_list)} examples, for a total of {len(train_text)}' ) label_id_len = len(train_label_id_list) num_labels = len(set(train_label_id_list)) logger.info('num_labels: %s', num_labels) train_examples_len = len(train_text) if len(train_label_id_list) != train_examples_len: raise ValueError( f'Number of train examples ({train_examples_len}) does not match number of labels ({len(train_label_id_list)})' ) if len(eval_label_id_list) != len(eval_text): raise ValueError( f'Number of teste xamples ({len(eval_text)}) does not match number of labels ({len(eval_label_id_list)})' ) print_cuda_memory(args) # old INFO:__main__:Loaded data and tokenized in 189.66675066947937s # @TODO support other vocabularies, or at least, support case tokenizer = BertWordPieceTokenizer('bert-base-uncased-vocab.txt', lowercase=True) tokenizer.enable_padding(max_length=args.max_seq_len) tokenizer.enable_truncation(max_length=args.max_seq_len) logger.info(f'Tokenizing training data. 
(len: {train_examples_len})') train_text_ids = [ encoding.ids for encoding in tokenizer.encode_batch(train_text) ] logger.info(f'Tokenizing test data (len: {len(eval_label_id_list)})') eval_text_ids = [ encoding.ids for encoding in tokenizer.encode_batch(eval_text) ] load_time = time.time() logger.info(f'Loaded data and tokenized in {load_time-start_time}s') print_cuda_memory(args) # Load pre-trained model tokenizer (vocabulary) logger.info('Loading model: %s', args.model_dir) # Load pre-trained model (weights) logger.info(f'Model class: (vanilla) BertForSequenceClassification.') model = BertForSequenceClassification.from_pretrained( args.model_dir, num_labels=num_labels) if torch.cuda.is_available(): torch.cuda.empty_cache() model.to(device) # print(model) # multi-gpu training if args.num_gpus > 1: model = torch.nn.DataParallel(model) logger.info(f'Training model across {args.num_gpus} GPUs') num_train_optimization_steps = int( train_examples_len / args.batch_size / args.grad_accum_steps) * args.num_train_epochs param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_proportion, num_training_steps=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", train_examples_len) logger.info(" Batch size = %d", args.batch_size) logger.info(" Max sequence length = %d", args.max_seq_len) logger.info(" Num steps = %d", num_train_optimization_steps) wandb.log({'train_examples_len': train_examples_len}) train_input_ids = torch.tensor(train_text_ids, dtype=torch.long) train_label_ids = torch.tensor(train_label_id_list, dtype=torch.long) train_data = TensorDataset(train_input_ids, train_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size) eval_input_ids = torch.tensor(eval_text_ids, dtype=torch.long) eval_label_ids = torch.tensor(eval_label_id_list, dtype=torch.long) eval_data = TensorDataset(eval_input_ids, eval_label_ids) eval_sampler = RandomSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) def get_eval_acc(): correct = 0 total = 0 for input_ids, label_ids in tqdm.tqdm(eval_dataloader, desc="Evaluating accuracy"): input_ids = input_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids)[0] correct += (logits.argmax(dim=1) == label_ids).sum() total += len(label_ids) return float(correct) / total def save_model(): model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, args.weights_name) output_config_file = os.path.join(args.output_dir, args.config_name) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) logger.info( f'Best acc found. Saved tokenizer, model config, and model to {args.output_dir}.' 
) global_step = 0 def save_model_checkpoint(checkpoint_name=None): # Save model checkpoint checkpoint_name = checkpoint_name or 'checkpoint-{}'.format( global_step) output_dir = os.path.join(args.output_dir, checkpoint_name) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info('Checkpoint saved to %s.', output_dir) print_cuda_memory(args) model.train() best_eval_acc = 0 steps_since_best_eval_acc = 0 def loss_backward(loss): if args.num_gpus > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.grad_accum_steps > 1: loss = loss / args.grad_accum_steps loss.backward() for epoch in tqdm.trange(int(args.num_train_epochs), desc="Epoch"): prog_bar = tqdm.tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(prog_bar): print_cuda_memory(args) batch = tuple(t.to(device) for t in batch) input_ids, labels = batch logits = model(input_ids)[0] loss_fct = torch.nn.CrossEntropyLoss() loss = torch.nn.CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1)) if global_step % args.tb_writer_step == 0: tb_writer.add_scalar('loss', loss, global_step) tb_writer.add_scalar('lr', loss, global_step) loss_backward(loss) prog_bar.set_description(f"Loss {loss.item()}") if (step + 1) % args.grad_accum_steps == 0: optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 # Save model checkpoint to file. if global_step % args.checkpoint_steps == 0: save_model_checkpoint() model.zero_grad() # Inc step counter. global_step += 1 # Check accuracy after each epoch. eval_acc = get_eval_acc() tb_writer.add_scalar('epoch_eval_acc', eval_acc, global_step) wandb.log({'epoch_eval_acc': eval_acc, 'epoch': epoch}) if args.checkpoint_every_epoch: save_model_checkpoint(f'epoch-{epoch}') logger.info(f'Eval acc: {eval_acc*100}%') if eval_acc > best_eval_acc: best_eval_acc = eval_acc steps_since_best_eval_acc = 0 save_model() else: steps_since_best_eval_acc += 1 if (args.early_stopping_epochs > 0) and ( steps_since_best_eval_acc > args.early_stopping_epochs): logger.info( f'Stopping early since it\'s been {args.early_stopping_epochs} steps since validation acc increased' ) break
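The training loop above divides the loss by args.grad_accum_steps and only calls optimizer.step() every few batches. A minimal sketch of that gradient-accumulation pattern on its own, with a toy model and dataset:

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

model = nn.Linear(8, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fct = nn.CrossEntropyLoss()

data = TensorDataset(torch.randn(32, 8), torch.randint(0, 2, (32,)))
loader = DataLoader(data, batch_size=4, shuffle=True)

grad_accum_steps = 4  # effective batch size becomes 4 * 4 = 16
optimizer.zero_grad()
for step, (xb, yb) in enumerate(loader):
    loss = loss_fct(model(xb), yb)
    # Scale so the accumulated gradient matches one large-batch update.
    (loss / grad_accum_steps).backward()
    if (step + 1) % grad_accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()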
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." 
) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForQuestionAnswering.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, 
segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForQuestionAnswering.from_pretrained( args.bert_model, state_dict=model_state_dict) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
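Both SQuAD-style loops in this collection adjust the learning rate by hand with warmup_linear(global_step / t_total, warmup_proportion). The helper below is a hedged reconstruction of that schedule as it is commonly defined in pytorch-pretrained-bert (ramp up over the warmup fraction, then decay linearly to zero); treat it as an assumption, not the library's exact code:

def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed (global_step / t_total).
    # Ramp from 0 to 1 over the first `warmup` fraction, then decay linearly.
    if x < warmup:
        return x / warmup
    return 1.0 - x

learning_rate = 5e-5
t_total = 1000
for global_step in (0, 10, 100, 500, 999):
    lr_this_step = learning_rate * warmup_linear(global_step / t_total, warmup=0.1)
    print(global_step, lr_this_step)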
def fit_neural(x, y): # convert to tensors X = torch.Tensor(x) X = X.view(len(x), 1) Y = torch.Tensor(y) Y = Y.view(len(x), 1) my_dataset = TensorDataset(X, Y) # create your datset batchSize = 512 train_loader = DataLoader(dataset=my_dataset, batch_size=batchSize, num_workers=2, shuffle=True) #hyperparameters inputSize = 1 hidden_size1 = 40 hidden_size2 = 40 outputSize = 1 learning_rate = 0.001 #Design model class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size1, hidden_size2, output_size): super(NeuralNet, self).__init__() self.input_size = input_size self.l1 = nn.Linear(input_size, hidden_size1) self.leaky_relu_1 = nn.LeakyReLU(negative_slope=0.3) self.l2 = nn.Linear(hidden_size1, hidden_size2) self.leaky_relu_2 = nn.LeakyReLU(negative_slope=0.3) self.l3 = nn.Linear(hidden_size2, outputSize) def forward(self, x): out = self.l1(x) out = self.leaky_relu_1(out) out = self.l2(out) out = self.leaky_relu_2(out) out = self.l3(out) return out global count10 if count10 == 0: model = NeuralNet(inputSize, hidden_size1, hidden_size2, outputSize) count10 += 1 else: model = NeuralNet(inputSize, hidden_size1, hidden_size2, outputSize) model.load_state_dict( torch.load( "/home/ppl/Documents/Universitet/KUKandidat/Speciale/DeepPricing/python/deepStopping/saveModel/ModelAM1.pth" )) #model = nn.Linear(inputSize, outputSize) #loss and optimizer criterion = nn.MSELoss() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Train the model num_epochs = 1 n_total_steps = len(train_loader) #enumereate epoch es = earlyStop.EarlyStopping(patience=1) for epoch in range(num_epochs): total_loss = 0 n_samples = 0 for i, (X, y) in enumerate(train_loader): #one batch of samples optimizer.zero_grad() # zero the gradient buffer #forward pass and loss y_predicted = model(X) loss = criterion(y_predicted, y) # Backward and optimize loss.backward() optimizer.step() #does weight update #epoch_loss += loss # accumulate loss total_loss += loss.item() * X.shape[0] n_samples += X.shape[0] total_loss /= n_samples #if (epoch+1) % 10 == 0: #print (f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}') if es.step(total_loss): break # early stop criterion is met, we can stop now #enumereate epoch torch.save( model.state_dict(), "/home/ppl/Documents/Universitet/KUKandidat/Speciale/DeepPricing/python/deepStopping/saveModel/ModelAM1.pth" ) return model
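fit_neural relies on an earlyStop.EarlyStopping object whose step(loss) returns True once the loss has stopped improving. The class below is a hypothetical minimal version consistent with that usage; the real earlyStop module may behave differently:

class EarlyStopping:
    # Minimal patience-based stopper matching the `es.step(loss) -> bool` usage above.
    def __init__(self, patience=1, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("inf")
        self.bad_epochs = 0

    def step(self, metric):
        if metric < self.best - self.min_delta:
            self.best = metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs > self.patience

es = EarlyStopping(patience=1)
for loss in [0.9, 0.8, 0.85, 0.86, 0.87]:
    if es.step(loss):
        print("stopping early at loss", loss)
        break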
def get_bert_out(output_path, local_rank, no_cuda, batch_size): startt = timeit.default_timer() if local_rank == -1 or no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {} distributed training: {}".format( device, n_gpu, bool(local_rank != -1))) model = BertModel.from_pretrained(args.bert_dir) model.to(device) # model.to(0) if local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) model.eval() sent_bert = numpy.load(output_path + "sen_bert.npy") sent_mask_bert = numpy.load(output_path + "sen_mask_bert.npy") f = open(output_path + "sent_output_bert.npy", 'ab') num = 0 all_input_ids = torch.tensor(sent_bert, dtype=torch.int64).to(device) all_input_mask = torch.tensor(sent_mask_bert, dtype=torch.int64).to(device) all_example_index = torch.tensor(list(range(len(sent_bert))), dtype=torch.int64).to(device) eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size) for input_ids, input_mask, example_indices in eval_dataloader: all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) all_encoder_layers = all_encoder_layers num += len(sent_bert) outs = [] for b, example_index in enumerate(example_indices): layer_output = all_encoder_layers[-1].detach().cpu().numpy( ) # last layer layer_output = layer_output[b][:, :512] # sent b # out = [round(x.item(), 6) for x in layer_output[0]] # [CLS] # outs.append(out) outs.append(layer_output) # all tokens----------------- outs = numpy.array(outs) numpy.save(f, outs) endt = timeit.default_timer() print(file=sys.stderr) print("Total use %.3f seconds for BERT Data Generating" % (endt - startt), file=sys.stderr)
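# Note on the file written above: numpy.save is called repeatedly on the same open file
# handle, so "sent_output_bert.npy" contains one array per saved batch rather than a single
# array. A hedged sketch (assumed helper, not part of the original code) for reading such a
# file back:
import numpy as np

def load_appended_npy(path):
    """Yield every array appended to `path` by successive numpy.save calls."""
    with open(path, "rb") as f:
        while True:
            try:
                yield np.load(f)
            except (EOFError, ValueError, OSError):
                break

# Usage sketch:
# batches = list(load_appended_npy(output_path + "sent_output_bert.npy"))
# all_outputs = np.concatenate(batches, axis=0)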
def main(): parser = argparse.ArgumentParser( description='Test code - measure the detection peformance') parser.add_argument('--eva_iter', default=1, type=int, help='number of passes for mc-dropout when evaluation') parser.add_argument( '--model', type=str, choices=['base', 'manifold-smoothing', 'mc-dropout', 'temperature'], default='base') parser.add_argument('--seed', type=int, default=0, help='random seed for test') parser.add_argument("--epochs", default=10, type=int, help="Number of epochs for training.") parser.add_argument('--index', type=int, default=0, help='random seed you used during training') parser.add_argument('--in_dataset', required=True, help='target dataset: 20news') parser.add_argument('--out_dataset', required=True, help='out-of-dist dataset') parser.add_argument('--eval_batch_size', type=int, default=32) parser.add_argument('--saved_dataset', type=str, default='n') parser.add_argument( '--eps_out', default=0.001, type=float, help="Perturbation size of out-of-domain adversarial training") parser.add_argument("--eps_y", default=0.1, type=float, help="Perturbation size of label") parser.add_argument( '--eps_in', default=0.0001, type=float, help="Perturbation size of in-domain adversarial training") args = parser.parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') args.device = device set_seed(args) outf = 'test/' + args.model + '-' + str(args.index) if not os.path.isdir(outf): os.makedirs(outf) if args.model == 'base': dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index) pretrained_dir = './model_save/{}'.format(dirname) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(pretrained_dir) model.to(args.device) print('Load Tekenizer') elif args.model == 'mc-dropout': dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index) pretrained_dir = './model_save/{}'.format(dirname) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(pretrained_dir) model.to(args.device) elif args.model == 'temperature': dirname = '{}/BERT-base-{}'.format(args.in_dataset, args.index) pretrained_dir = './model_save/{}'.format(dirname) orig_model = BertForSequenceClassification.from_pretrained( pretrained_dir) orig_model.to(args.device) model = ModelWithTemperature(orig_model) model.to(args.device) elif args.model == 'manifold-smoothing': dirname = '{}/BERT-mf-{}-{}-{}-{}'.format(args.in_dataset, args.index, args.eps_in, args.eps_y, args.eps_out) print(dirname) pretrained_dir = './model_save/{}'.format(dirname) model = BertForSequenceClassification.from_pretrained(pretrained_dir) model.to(args.device) if args.saved_dataset == 'n': tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) train_sentences, val_sentences, test_sentences, train_labels, val_labels, test_labels = load_dataset( args.in_dataset) _, _, nt_test_sentences, _, _, nt_test_labels = load_dataset( args.out_dataset) val_input_ids = [] test_input_ids = [] nt_test_input_ids = [] if args.in_dataset == '20news' or args.in_dataset == '20news-15': MAX_LEN = 150 else: MAX_LEN = 256 for sent in val_sentences: encoded_sent = tokenizer.encode( sent, # Sentence to encode. add_special_tokens=True, # Add '[CLS]' and '[SEP]' truncation=True, max_length=MAX_LEN, # Truncate all sentences. #return_tensors = 'pt', # Return pytorch tensors. ) # Add the encoded sentence to the list. 
val_input_ids.append(encoded_sent) for sent in test_sentences: encoded_sent = tokenizer.encode( sent, # Sentence to encode. add_special_tokens=True, # Add '[CLS]' and '[SEP]' truncation=True, max_length=MAX_LEN, # Truncate all sentences. #return_tensors = 'pt', # Return pytorch tensors. ) # Add the encoded sentence to the list. test_input_ids.append(encoded_sent) for sent in nt_test_sentences: encoded_sent = tokenizer.encode( sent, add_special_tokens=True, truncation=True, max_length=MAX_LEN, ) nt_test_input_ids.append(encoded_sent) # Pad our input tokens val_input_ids = pad_sequences(val_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") nt_test_input_ids = pad_sequences(nt_test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") val_attention_masks = [] test_attention_masks = [] nt_test_attention_masks = [] for seq in val_input_ids: seq_mask = [float(i > 0) for i in seq] val_attention_masks.append(seq_mask) for seq in test_input_ids: seq_mask = [float(i > 0) for i in seq] test_attention_masks.append(seq_mask) for seq in nt_test_input_ids: seq_mask = [float(i > 0) for i in seq] nt_test_attention_masks.append(seq_mask) val_inputs = torch.tensor(val_input_ids) val_labels = torch.tensor(val_labels) val_masks = torch.tensor(val_attention_masks) test_inputs = torch.tensor(test_input_ids) test_labels = torch.tensor(test_labels) test_masks = torch.tensor(test_attention_masks) nt_test_inputs = torch.tensor(nt_test_input_ids) nt_test_labels = torch.tensor(nt_test_labels) nt_test_masks = torch.tensor(nt_test_attention_masks) val_data = TensorDataset(val_inputs, val_masks, val_labels) test_data = TensorDataset(test_inputs, test_masks, test_labels) nt_test_data = TensorDataset(nt_test_inputs, nt_test_masks, nt_test_labels) dataset_dir = 'dataset/test' if not os.path.exists(dataset_dir): os.makedirs(dataset_dir) torch.save( val_data, dataset_dir + '/{}_val_in_domain.pt'.format(args.in_dataset)) torch.save( test_data, dataset_dir + '/{}_test_in_domain.pt'.format(args.in_dataset)) torch.save( nt_test_data, dataset_dir + '/{}_test_out_of_domain.pt'.format(args.out_dataset)) else: dataset_dir = 'dataset/test' val_data = torch.load(dataset_dir + '/{}_val_in_domain.pt'.format(args.in_dataset)) test_data = torch.load(dataset_dir + '/{}_test_in_domain.pt'.format(args.in_dataset)) nt_test_data = torch.load( dataset_dir + '/{}_test_out_of_domain.pt'.format(args.out_dataset)) ######## saved dataset test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) nt_test_sampler = SequentialSampler(nt_test_data) nt_test_dataloader = DataLoader(nt_test_data, sampler=nt_test_sampler, batch_size=args.eval_batch_size) val_sampler = SequentialSampler(val_data) val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=args.eval_batch_size) if args.model == 'temperature': model.set_temperature(val_dataloader, args) model.eval() if args.model == 'mc-dropout': model.apply(apply_dropout) correct = 0 total = 0 output_list = [] labels_list = [] ##### validation dat with torch.no_grad(): for step, batch in enumerate(val_dataloader): batch = tuple(t.to(args.device) for t in batch) b_input_ids, b_input_mask, b_labels = batch total += b_labels.shape[0] batch_output = 0 for j in range(args.eva_iter): if args.model == 'temperature': current_batch = model(input_ids=b_input_ids, 
token_type_ids=None, attention_mask=b_input_mask) #logits else: current_batch = model( input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] #logits batch_output = batch_output + F.softmax(current_batch, dim=1) batch_output = batch_output / args.eva_iter output_list.append(batch_output) labels_list.append(b_labels) score, predicted = batch_output.max(1) correct += predicted.eq(b_labels).sum().item() ###calculate accuracy and ECE val_eval_accuracy = correct / total print("Val Accuracy: {}".format(val_eval_accuracy)) ece_criterion = ECE_v2().to(args.device) softmaxes_ece = torch.cat(output_list) labels_ece = torch.cat(labels_list) val_ece = ece_criterion(softmaxes_ece, labels_ece).item() print('ECE on Val data: {}'.format(val_ece)) #### Test data correct = 0 total = 0 output_list = [] labels_list = [] predict_list = [] true_list = [] true_list_ood = [] predict_mis = [] predict_in = [] score_list = [] correct_index_all = [] ## test on in-distribution test set with torch.no_grad(): for step, batch in enumerate(test_dataloader): batch = tuple(t.to(args.device) for t in batch) b_input_ids, b_input_mask, b_labels = batch total += b_labels.shape[0] batch_output = 0 for j in range(args.eva_iter): if args.model == 'temperature': current_batch = model(input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask) #logits else: current_batch = model( input_ids=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] #logits batch_output = batch_output + F.softmax(current_batch, dim=1) batch_output = batch_output / args.eva_iter output_list.append(batch_output) labels_list.append(b_labels) score, predicted = batch_output.max(1) correct += predicted.eq(b_labels).sum().item() correct_index = (predicted == b_labels) correct_index_all.append(correct_index) score_list.append(score) ###calcutae accuracy eval_accuracy = correct / total print("Test Accuracy: {}".format(eval_accuracy)) ##calculate ece ece_criterion = ECE_v2().to(args.device) softmaxes_ece = torch.cat(output_list) labels_ece = torch.cat(labels_list) ece = ece_criterion(softmaxes_ece, labels_ece).item() print('ECE on Test data: {}'.format(ece)) #confidence for in-distribution data score_in_array = torch.cat(score_list) #indices of data that are classified correctly correct_array = torch.cat(correct_index_all) label_array = torch.cat(labels_list) ### test on out-of-distribution data predict_ood = [] score_ood_list = [] true_list_ood = [] with torch.no_grad(): for step, batch in enumerate(nt_test_dataloader): batch = tuple(t.to(args.device) for t in batch) b_input_ids, b_input_mask, b_labels = batch batch_output = 0 for j in range(args.eva_iter): if args.model == 'temperature': current_batch = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) else: current_batch = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0] batch_output = batch_output + F.softmax(current_batch, dim=1) batch_output = batch_output / args.eva_iter score_out, _ = batch_output.max(1) score_ood_list.append(score_out) score_ood_array = torch.cat(score_ood_list) label_array = label_array.cpu().numpy() score_ood_array = score_ood_array.cpu().numpy() score_in_array = score_in_array.cpu().numpy() correct_array = correct_array.cpu().numpy() ####### calculate NBAUCC for detection task predict_o = np.zeros(len(score_in_array) + len(score_ood_array)) true_o = np.ones(len(score_in_array) + len(score_ood_array)) true_o[:len(score_in_array )] = 0 ## in-distribution data as false, ood data as positive true_mis = 
np.ones(len(score_in_array))
    true_mis[correct_array] = 0  # correctly classified instances as negatives, misclassified instances as positives
    predict_mis = np.zeros(len(score_in_array))

    ood_sum = 0
    mis_sum = 0
    ood_sum_list = []
    mis_sum_list = []

    #### upper bounds of the threshold tau for NBAUCC
    stop_points = [0.50, 1.]

    for threshold in np.arange(0., 1.01, 0.02):
        predict_ood_index1 = (score_in_array < threshold)
        predict_ood_index2 = (score_ood_array < threshold)
        predict_ood_index = np.concatenate(
            (predict_ood_index1, predict_ood_index2), axis=0)
        predict_o[predict_ood_index] = 1
        predict_mis[score_in_array < threshold] = 1
        ood = f1_score(true_o, predict_o,
                       average='binary')  # detection F1 score at this threshold
        mis = f1_score(true_mis, predict_mis, average='binary')
        ood_sum += ood * 0.02  # rectangle-rule integration over threshold steps of 0.02
        mis_sum += mis * 0.02
        if threshold in stop_points:
            ood_sum_list.append(ood_sum)
            mis_sum_list.append(mis_sum)

    for i in range(len(stop_points)):
        print('OOD detection, NBAUCC {}: {}'.format(
            stop_points[i], ood_sum_list[i] / stop_points[i]))
        print('misclassification detection, NBAUCC {}: {}'.format(
            stop_points[i], mis_sum_list[i] / stop_points[i]))
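# Hedged sketch (assumption): ECE_v2, used earlier in this script, is imported from elsewhere
# and not shown. A minimal expected-calibration-error module with the same call signature
# (softmax probabilities, labels) bins predictions by confidence and averages the
# |accuracy - confidence| gap, weighted by the fraction of samples in each bin.
import torch
import torch.nn as nn

class ECESketch(nn.Module):
    def __init__(self, n_bins=15):
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1).tolist()
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, softmaxes, labels):
        confidences, predictions = softmaxes.max(dim=1)
        accuracies = predictions.eq(labels).float()
        ece = torch.zeros(1, device=softmaxes.device)
        for lower, upper in zip(self.bin_lowers, self.bin_uppers):
            in_bin = (confidences > lower) & (confidences <= upper)
            prop_in_bin = in_bin.float().mean()
            if prop_in_bin.item() > 0:
                gap = (accuracies[in_bin].mean() - confidences[in_bin].mean()).abs()
                ece += gap * prop_in_bin
        return ece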
# Tokenize, convert to ids, and pad inputs and label sequences to MAX_LEN
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
# Attention mask: 1.0 for real tokens, 0.0 for padding
attention_masks = [[float(i > 0) for i in ii] for ii in input_ids]

test_inputs = torch.tensor(input_ids)
test_masks = torch.tensor(attention_masks)
test_tags = torch.tensor(tags)

test_data = TensorDataset(test_inputs, test_masks, test_tags)
test_sampler = RandomSampler(test_data)  # a SequentialSampler would give a deterministic evaluation order
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
model = RobertaForTokenClassification.from_pretrained(output_dir)
model.to(device)  # move to the selected device instead of assuming CUDA is available
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval or not.") parser.add_argument("--eval_on", default="dev", help="Whether to run eval on the dev set or test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner": NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = BertConfig.from_pretrained(args.bert_model, num_labels=num_labels, finetuning_task=args.task_name) model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = 
int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in train_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # def inplace_relu(m): # classname = m.__class__.__name__ # if classname.find('ReLU') != -1: # m.inplace = True # # model.apply(inplace_relu) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch loss = model(input_ids, segment_ids, input_mask, label_ids, valid_ids, l_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # Load a trained model and vocabulary that you have fine-tuned model = Ner.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): if args.eval_on == "dev": eval_examples = processor.get_dev_examples(args.data_dir) elif args.eval_on == "test": eval_examples = processor.get_test_examples(args.data_dir) else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_valid_ids = torch.tensor([f.valid_ids for f in eval_features], dtype=torch.long) all_lmask_ids = torch.tensor([f.label_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_valid_ids, all_lmask_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i: label for i, label in enumerate(label_list, 1)} for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) valid_ids = valid_ids.to(device) label_ids = label_ids.to(device) l_mask = l_mask.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, valid_ids=valid_ids, attention_mask_label=l_mask) logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2) logits = logits.detach().cpu().numpy() label_ids = 
label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i, label in enumerate(label_ids): temp_1 = [] temp_2 = [] for j, m in enumerate(label): if j == 0: continue elif label_ids[i][j] == len(label_map): y_true.append(temp_1) y_pred.append(temp_2) break else: temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) report = classification_report(y_true, y_pred, digits=4) logger.info("\n%s", report) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report)
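# Note (assumption): y_true / y_pred above are lists of tag-string sequences, so the
# classification_report used here is presumably seqeval's entity-level report rather than
# scikit-learn's token-level one. A minimal sanity check of that API:
from seqeval.metrics import classification_report as seqeval_report

example_true = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
example_pred = [["B-PER", "I-PER", "O"], ["B-ORG", "O"]]
print(seqeval_report(example_true, example_pred, digits=4))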
def main(): parser = argparse.ArgumentParser() parser.add_argument("--gpu_ids", default='0', type=str) parser.add_argument("--bert_config_file", default='check_points/pretrain_models/bert_wwm_ext_base/bert_config.json', type=str, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument("--vocab_file", default='check_points/pretrain_models/bert_wwm_ext_base/vocab.txt', type=str, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--init_restore_dir", required=True, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--input_dir", required=True, default='dataset/CHID') parser.add_argument("--output_dir", required=True, type=str, help="The output directory where the model checkpoints and predictions will be written.") parser.add_argument("--predict_file", required=True, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument('--output_file', type=str, default='predictions_test.json') ## Other parameters parser.add_argument("--max_seq_length", default=64, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--max_num_choices", default=10, type=int, help="The maximum number of cadicate answer, shorter than this will be padded.") parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.") parser.add_argument("--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument('--fp16', default=True, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") args = parser.parse_args() print(args) os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") print("device: {}, distributed training: {}, 16-bits training: {}".format(device, bool(args.local_rank != -1), args.fp16)) tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) test_example_file = os.path.join(args.input_dir, 'test_examples_{}.pkl'.format(str(args.max_seq_length))) test_feature_file = os.path.join(args.input_dir, 'test_features_{}.pkl'.format(str(args.max_seq_length))) eval_features = generate_input(args.predict_file, None, test_example_file, test_feature_file, tokenizer, max_seq_length=args.max_seq_length, max_num_choices=args.max_num_choices, is_training=False) # Prepare model if 'albert' in args.bert_config_file: if 'google' in args.bert_config_file: bert_config = AlbertConfig.from_json_file(args.bert_config_file) model = AlbertForMultipleChoice(bert_config, num_choices=args.max_num_choices) else: bert_config = ALBertConfig.from_json_file(args.bert_config_file) model = ALBertForMultipleChoice(bert_config, num_choices=args.max_num_choices) else: bert_config = BertConfig.from_json_file(args.bert_config_file) model = BertForMultipleChoice(bert_config, num_choices=args.max_num_choices) model = model.to(device) if args.init_restore_dir.endswith('.pth') or \ args.init_restore_dir.endswith('.pt') or \ args.init_restore_dir.endswith('.bin'): pass else: args.init_restore_dir = glob(args.init_restore_dir + '*.pth') assert len(args.init_restore_dir) == 1 args.init_restore_dir = args.init_restore_dir[0] 
torch_init_model(model, args.init_restore_dir) if args.fp16: model = model.half() print("***** Running predictions *****") print("Num split examples = %d", len(eval_features)) print("Batch size = %d", args.predict_batch_size) all_example_ids = [f.example_id for f in eval_features] all_tags = [f.tag for f in eval_features] all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_masks = torch.tensor([f.input_masks for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_choice_masks = torch.tensor([f.choice_masks for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_choice_masks, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] print("Start evaluating") for input_ids, input_masks, segment_ids, choice_masks, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=None): if len(all_results) == 0: print('shape of input_ids: {}'.format(input_ids.shape)) input_ids = input_ids.to(device) input_masks = input_masks.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_masks, labels=None) for i, example_index in enumerate(example_indices): logits = batch_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, example_id=all_example_ids[unique_id], tag=all_tags[unique_id], logit=logits)) else: print("prediction is over") print('decoder raw results') tmp_predict_file = os.path.join(args.output_dir, "test_raw_predictions.pkl") output_prediction_file = os.path.join(args.output_dir, args.output_file) results = get_final_predictions(all_results, tmp_predict_file, g=True) write_predictions(results, output_prediction_file) print('predictions saved to {}'.format(output_prediction_file))
def get_data_loaders(args, tokenizer): """ Prepare the dataset for training and evaluation """ personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache) logger.info("Build inputs and labels") datasets = {"train": defaultdict(list), "valid": defaultdict(list)} for dataset_name, dataset in personachat.items(): num_candidates = len(dataset[0]["utterances"][0]["candidates"]) if args.num_candidates > 0 and dataset_name == 'train': num_candidates = min(args.num_candidates, num_candidates) for dialog in dataset: persona = dialog["personality"].copy() for _ in range(args.personality_permutations): for utterance in dialog["utterances"]: history = utterance["history"][-(2 * args.max_history + 1):] for j, candidate in enumerate( utterance["candidates"][-num_candidates:]): lm_labels = bool(j == num_candidates - 1) instance, _ = build_input_from_segments( persona, history, candidate, tokenizer, lm_labels) for input_name, input_array in instance.items(): datasets[dataset_name][input_name].append( input_array) datasets[dataset_name]["mc_labels"].append(num_candidates - 1) datasets[dataset_name]["n_candidates"] = num_candidates persona = [persona[-1] ] + persona[:-1] # permuted personalities logger.info("Pad inputs and convert to Tensor") tensor_datasets = {"train": [], "valid": []} for dataset_name, dataset in datasets.items(): dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids( SPECIAL_TOKENS[-1])) for input_name in MODEL_INPUTS: tensor = torch.tensor(dataset[input_name]) if input_name != "mc_labels": tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:]) tensor_datasets[dataset_name].append(tensor) logger.info("Build train and validation dataloaders") train_dataset, valid_dataset = TensorDataset( *tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"]) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) if args.distributed else None valid_sampler = torch.utils.data.distributed.DistributedSampler( valid_dataset) if args.distributed else None train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, shuffle=(not args.distributed)) valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False) logger.info("Train dataset (Batch, Candidates, Seq length): {}".format( train_dataset.tensors[0].shape)) logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format( valid_dataset.tensors[0].shape)) return train_loader, valid_loader, train_sampler, valid_sampler
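# Hedged sketch (assumption): pad_dataset used in get_data_loaders above is defined elsewhere
# in the project. Conceptually it right-pads every sequence-valued input to the longest
# input_ids length so that torch.tensor() can later build rectangular tensors; the key names
# and the LM-label ignore index below are illustrative assumptions.
def pad_dataset_sketch(dataset, padding=0, lm_ignore_index=-100,
                       padded_inputs=("input_ids", "token_type_ids", "lm_labels")):
    max_len = max(len(x) for x in dataset["input_ids"])
    for name in padded_inputs:
        pad_value = lm_ignore_index if name == "lm_labels" else padding
        dataset[name] = [seq + [pad_value] * (max_len - len(seq)) for seq in dataset[name]]
    return dataset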
# %%
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

# %%
# Build datasets with batch size `batch_num`.
# Only input ids and attention masks are passed to the model; segment ids are not used.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# drop_last=True avoids a smaller, noisier final batch during training
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num, drop_last=True)
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

# %%
model_file_address = 'bert-base-cased'
model = BertForTokenClassification.from_pretrained(
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "fever": FeverProcessor, } output_modes = { "fever": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) # train_examples=train_examples[0:50] #debugging num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from 
https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # else: #testing # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) # elif output_mode == "regression": # all_label_ids = torch.tensor([f.athene_label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 print('printing loss') print('training_loss~=', tr_loss / nb_tr_steps) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) #testing model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) #testing tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) #testing model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) # eval_examples=eval_examples[0:100] #debugging eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, 'no label') logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) # add guid to batch guid1 = list() guid2 = list() guid_list = ([f.guid for f in eval_examples]) for g in range(len(eval_examples)): pair_id = guid_list[g].find('-', 0) evidence_id = guid_list[g].find('_', 0) guid1.append(int(guid_list[g][pair_id + 1:evidence_id])) guid2.append(int(guid_list[g][evidence_id + 1:])) Guid1 = torch.tensor(guid1, dtype=torch.long) Guid2 = torch.tensor(guid2, dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] store_output = list() for input_ids, input_mask, segment_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = 
segment_ids.to(device)
            # label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            store_output.extend(logits.cpu().numpy())
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)

        print("Storing dev scores")
        output_eval_file = os.path.join(args.output_dir, "test_logits.p")  # testing
        with open(output_eval_file, 'wb') as pickle_out:  # open for writing
            pickle.dump(store_output, pickle_out)
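# Hedged usage sketch (not part of the original script): the logits pickled above can be
# read back and turned into hard label predictions like this. The file name matches the
# one written above; everything else here is illustrative.
import os
import pickle

import numpy as np

def load_dev_predictions(output_dir):
    """Load the stored dev logits and return argmax label ids."""
    with open(os.path.join(output_dir, "test_logits.p"), "rb") as f:
        logits = np.asarray(pickle.load(f))
    return np.argmax(logits, axis=1)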