def create_batch_iter(mode):
    """Build a batch iterator for the requested data split.

    Args:
        mode: "train" or "valid".

    Returns:
        For "train", a tuple ``(iterator, num_train_steps)``; for "valid",
        the iterator alone.

    Raises:
        ValueError: if ``mode`` is neither "train" nor "valid".
    """
    processor, tokenizer = init_params()

    training = mode == "train"
    if training:
        examples = processor.get_train_examples()
        # Optimizer updates per epoch, scaled by the number of epochs.
        updates_per_epoch = (
            len(examples) / args.train_batch_size / args.gradient_accumulation_steps
        )
        num_train_steps = int(updates_per_epoch * args.num_train_epochs)
        batch_size = args.train_batch_size
        logger.info(" Num steps = %d", num_train_steps)
    elif mode == "valid":
        examples = processor.get_valid_examples()
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Features
    label_list = processor.get_labels()
    features = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer
    )
    logger.info(" Num examples = %d", len(examples))
    logger.info(" Batch size = %d", batch_size)

    def as_long_tensor(field):
        # Stack one attribute of every feature into a single long tensor.
        return torch.tensor([getattr(feat, field) for feat in features], dtype=torch.long)

    # Dataset: column order is input_ids, input_mask, segment_ids, label_id, output_mask.
    field_order = ("input_ids", "input_mask", "segment_ids", "label_id", "output_mask")
    data = TensorDataset(*[as_long_tensor(field) for field in field_order])

    # Shuffle only while training; validation keeps the original order.
    sampler = RandomSampler(data) if training else SequentialSampler(data)

    # Iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)
    if training:
        return iterator, num_train_steps
    return iterator
def create_inference_iter():
    """Build a sequential batch iterator over the first 100 validation examples.

    Each dataset item carries the joined ``text_a`` of the example in front of
    the usual feature tensors.

    Returns:
        A DataLoader over a ``MyTensorDataset`` using
        ``args.inference_batch_size`` as the batch size.
    """
    processor, tokenizer = init_params()
    examples = processor.get_valid_examples()[:100]
    batch_size = args.inference_batch_size

    # Features
    label_list = processor.get_labels()
    features = convert_examples_to_features(
        examples, label_list, args.max_seq_length, tokenizer
    )
    logger.info(" Num examples = %d", len(examples))
    logger.info(" Batch size = %d", batch_size)

    tensors = []
    for field in ("input_ids", "input_mask", "segment_ids", "label_id", "output_mask"):
        column = [getattr(feat, field) for feat in features]
        tensors.append(torch.tensor(column, dtype=torch.long))

    all_text = []
    for example in examples:
        all_text.append(''.join(example.text_a))
    print(all_text)

    # Dataset: text first, then the tensors in feature order.
    data = MyTensorDataset(all_text, *tensors)
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)
def load_and_cache_examples(args, task, tokenizer, evaluate=False, mode="train"):
    """Load (or build and cache) the features of one split as a TensorDataset.

    Args:
        args: namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``
            and ``model_type``.
        task: key into the module-level ``processors`` / ``output_modes``.
        tokenizer: tokenizer handed to ``convert_examples_to_features``.
        evaluate: when True the distributed barriers are skipped.
        mode: which split to load: "train", "dev" or "test".

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels).

    Raises:
        ValueError: if ``mode`` is not a supported split, or the task's output
            mode is neither "classification" nor "regression".
    """
    # Fail fast: previously an unknown mode left `examples` unbound and only
    # surfaced as a NameError after the processor had been created.
    if mode not in ("train", "dev", "test"):
        raise ValueError("Invalid mode %s" % mode)

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            mode,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles the file; only point data_dir at trusted caches.
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]

        if mode == "train":
            examples = processor.get_train_examples(args.data_dir)
        elif mode == "dev":
            examples = processor.get_dev_examples(args.data_dir)
        else:  # mode == "test", guaranteed by the validation above
            examples = processor.get_test_examples(args.data_dir)

        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Release the other processes now that the cache has been written.
        torch.distributed.barrier()

    # Labels are class indices for classification and floats for regression.
    if output_mode == "classification":
        label_dtype = torch.long
    elif output_mode == "regression":
        label_dtype = torch.float
    else:
        # Previously this fell through and failed on an unbound `all_labels`.
        raise ValueError("Unsupported output mode %s" % output_mode)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=label_dtype)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset
def create_batch_iter(mode, X, y, batch_size=1):
    """Build a batch iterator over the examples created from ``X`` (and ``y``).

    Args:
        mode: 'train', 'dev' or 'predict'.
        X: inputs forwarded to the processor.
        y: targets forwarded to the processor; not used in 'predict' mode.
        batch_size: batch size of the returned DataLoader.

    Returns:
        DataLoader yielding (input_ids, input_mask, label_ids, output_mask);
        shuffled for 'train', in original order otherwise.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    processor, tokenizer = init_params()

    # Pick both the examples and the sampling strategy from the mode in one place.
    if mode == 'train':
        examples = processor.get_train_examples(X=X, y=y)
        sampler_cls = RandomSampler
    elif mode == 'dev':
        examples = processor.get_dev_examples(X=X, y=y)
        sampler_cls = SequentialSampler
    elif mode == 'predict':
        examples = processor.get_examples(X=X)
        sampler_cls = SequentialSampler
    else:
        raise ValueError("Invalid mode %s" % mode)

    if args.use_calculate_max_seq_length:
        # Option 1: length derived from the data, capped at the configured maximum.
        max_seq_length = min(
            processor._calculate_max_seq_length(X=X), args.max_seq_length
        )
    else:
        # Option 2: fixed length.
        max_seq_length = args.max_seq_length

    # Features
    features = convert_examples_to_features(
        examples=examples, max_seq_length=max_seq_length, tokenizer=tokenizer
    )

    # Dataset: column order is input_ids, input_mask, label_id, output_mask.
    columns = [
        torch.LongTensor([getattr(feat, field) for feat in features])
        for field in ("input_ids", "input_mask", "label_id", "output_mask")
    ]
    data = TensorDataset(*columns)

    # Iterator
    return DataLoader(data, sampler=sampler_cls(data), batch_size=batch_size)
def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Return the train or dev features as a TensorDataset, using a file cache.

    Args:
        args: namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length`` and ``overwrite_cache``.
        tokenizer: tokenizer handed to ``convert_examples_to_features``.
        evaluate: load the dev split when True, otherwise the train split;
            the distributed barriers are only used when this is False.

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels),
        all as long tensors.
    """
    is_first_process = args.local_rank in [-1, 0]
    if not is_first_process and not evaluate:
        # Only the first process builds the dataset; the rest wait here and
        # read the cache afterwards.
        torch.distributed.barrier()

    processor = Processor()

    # Cache file name: cached_<split>_<last model path component>_<max length>
    split_name = "dev" if evaluate else "train"
    model_tag = [part for part in args.model_name_or_path.split("/") if part].pop()
    cache_name = "cached_{}_{}_{}".format(split_name, model_tag, str(args.max_seq_length))
    cached_features_file = os.path.join(args.data_dir, cache_name)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating output_mode from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)

        # Padding is always on the right with segment id 0.
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        if is_first_process:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # The cache is in place; let the waiting processes continue.
        torch.distributed.barrier()

    # Convert to tensors and build the dataset.
    columns = []
    for field in ("input_ids", "attention_mask", "token_type_ids", "label"):
        values = [getattr(feat, field) for feat in features]
        columns.append(torch.tensor(values, dtype=torch.long))
    return TensorDataset(*columns)
def predict(model, path, label_list, tokenizer, test_filename='test.csv',
            max_seq_length=512, batch_size=2):
    """Run the model over a test file and return per-label sigmoid scores.

    Relies on a module-level ``device`` for tensor placement.

    Args:
        model: model called as ``model(input_ids, segment_ids, input_mask)``
            and returning logits.
        path: directory handed to ``MultiLabelTextProcessor``.
        label_list: label names; used for feature conversion and as the
            score column names of the result.
        tokenizer: tokenizer handed to ``convert_examples_to_features``.
        test_filename: name of the test file inside ``path``.
        max_seq_length: sequence length used for feature conversion.
        batch_size: number of examples per forward pass.

    Returns:
        pandas.DataFrame with a 'filename' column (the example guid) joined
        by row index with one score column per label.
    """
    predict_processor = MultiLabelTextProcessor(path)
    # size=-1 presumably means "no limit" — confirm against the processor.
    test_examples = predict_processor.get_test_examples(path, test_filename, size=-1)

    # Hold input data for returning it
    input_data = [{'filename': input_example.guid} for input_example in test_examples]

    test_features = convert_examples_to_features(
        test_examples, label_list, max_seq_length, tokenizer)

    logger.info("***** Running prediction *****")
    logger.info(" Num examples = %d", len(test_examples))
    logger.info(" Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

    # Run prediction for full data, in file order so rows line up with input_data.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # Collect per-batch scores and concatenate once at the end; concatenating
    # inside the loop re-copies everything accumulated so far on every batch.
    score_chunks = []
    model.eval()
    with torch.no_grad():
        for input_ids, input_mask, segment_ids in test_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)

            logits = model(input_ids, segment_ids, input_mask)
            score_chunks.append(logits.sigmoid().detach().cpu().numpy())

    # With no examples keep the previous behaviour of passing None as the data.
    all_logits = np.concatenate(score_chunks, axis=0) if score_chunks else None

    return pd.merge(pd.DataFrame(input_data),
                    pd.DataFrame(all_logits, columns=label_list),
                    left_index=True, right_index=True)
optimizer_grouped_parameters, lr=args['learning_rate'], correct_bias=False ) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args['warmup_proportion'], num_training_steps=t_total) # PyTorch scheduler scheduler = CyclicLR(optimizer, base_lr=2e-5, max_lr=5e-5, step_size=2500, last_batch_iteration=0) # Prepare training feature train_features = convert_examples_to_features(train_examples, label_list, args['max_seq_length'], tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args['train_batch_size']) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
print("Configuration: ", bert_config.to_json_string()) # Prepare data and data processors dataset_processor = processors[task_name.lower()] tokenizer = tokenization.FullTokenizer(vocab_file=bert_model_type + '/vocab.txt', do_lower_case=do_lower_case) # Get labels labels = dataset_processor.get_labels(data_dir) # Training data training_examples = dataset_processor.get_train_examples(data_dir) train_data_file = os.path.join(datasets_dir, "train.tf_record") data_processor.convert_examples_to_features(training_examples, labels, max_seq_length, tokenizer, train_data_file) train_input_fn = utils.input_fn_builder(train_data_file, max_seq_length, is_training=True, drop_remainder=True) # Evaluation data evaluation_examples = dataset_processor.get_eval_examples(data_dir) eval_data_file = os.path.join(datasets_dir, "eval.tf_record") data_processor.convert_examples_to_features(evaluation_examples, labels, max_seq_length, tokenizer, eval_data_file) eval_input_fn = utils.input_fn_builder(input_file=eval_data_file, seq_length=max_seq_length, is_training=False,
def _build_arg_parser():
    """Create the command-line parser with every option understood by ``main``."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--model_type", default='bert', type=str,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default='bert-base-uncased', type=str,
                        help="Path to pre-trained model or shortcut name selected in the list: "
                             + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default='exp', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=32, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    # NOTE(review): store_true combined with default=True means this flag is
    # always on; kept as-is so existing command lines behave the same.
    parser.add_argument("--evaluate_during_training", action='store_true', default=True,
                        help="Rul evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=16, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=16, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=2,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=12.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps', type=int, default=100,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=300,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument('--tpu', action='store_true',
                        help="Whether to run on the TPU defined in the environment variables")
    parser.add_argument('--tpu_ip_address', type=str, default='',
                        help="TPU IP address if none are set in the environment variables")
    parser.add_argument('--tpu_name', type=str, default='',
                        help="TPU name if none are set in the environment variables")
    parser.add_argument('--xrt_tpu_config', type=str, default='',
                        help="XRT TPU config if none are set in the environment variables")

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="For distant debugging.")
    return parser


def main():
    """Train on ./dataset, evaluate on its test split and write predictions.

    Parses the command line, selects the device (single process or
    distributed via ``--local_rank``), seeds the run, builds the train/dev/test
    datasets, trains the model and writes one ``index,prediction`` row per
    test example to ``./result/result.csv`` (index starting at 1).
    """
    import os  # local import: only needed for creating the result directory

    args = _build_arg_parser().parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Set seed
    set_seed(args)

    # All three splits are read from the same directory.
    dataset_dir = './dataset'
    d = DataProcessor()
    dev_eg = d.get_dev_examples(dataset_dir)
    train_eg = d.get_train_examples(dataset_dir)
    test_eg = d.get_test_examples(dataset_dir)

    # NOTE(review): the tokenizer name and the sequence length of 30 are fixed
    # here and do not follow --tokenizer_name / --model_name_or_path /
    # --max_seq_length (default 32). Left unchanged to keep results identical;
    # confirm whether that is intended.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def build_dataset(examples):
        # Convert raw examples to features, then to a tensor dataset.
        return convert_features_to_dataset(
            convert_examples_to_features(examples=examples,
                                         label2id=LABEL2ID,
                                         max_seq_length=30,
                                         tokenizer=tokenizer))

    train_dataset = build_dataset(train_eg)
    dev_dataset = build_dataset(dev_eg)
    test_dataset = build_dataset(test_eg)

    # Only the config class is taken from the registry; the model is always BertForCS.
    config_class, _, _ = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.model_name_or_path, num_labels=2)
    model = BertForCS.from_pretrained(args.model_name_or_path, config=config, num_labels=2)
    model.to(args.device)

    train(args, train_dataset, model, dev_dataset)
    _, pred_to_write = evaluate(args, test_dataset, model)

    # Create the output directory first so a missing ./result does not throw
    # away the predictions after the whole training run.
    result_dir = './result'
    os.makedirs(result_dir, exist_ok=True)
    with open('./result/result.csv', 'w') as f:
        for i, r in enumerate(pred_to_write, 1):
            f.write('%d,%d\n' % (i, int(r)))