def predict_callback(model, step):
    labels = get_labels(args.labels)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index
    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
        evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
    model.train()
def load_dataset(self, mode, batch_size):
    labels = get_labels(self.hparams.labels)
    self.pad_token_label_id = CrossEntropyLoss().ignore_index
    dataset = self.load_and_cache_examples(labels, self.pad_token_label_id, mode)
    if mode == "train":
        if self.hparams.n_gpu > 1:
            sampler = DistributedSampler(dataset)
        else:
            sampler = RandomSampler(dataset)
    else:
        sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataloader
def init_model(self):
    print("***** Loading the model *****")
    # Parameters
    self.model_type = 'roberta'
    label_path = 'model/labels.txt'
    model_name_or_path = 'model'
    self.per_gpu_eval_batch_size = 8
    self.max_seq_length = 512
    self.local_rank = -1
    seed = 42
    # Setup CUDA, GPU & distributed training
    if self.local_rank == -1:
        self.device = torch.device("cpu")
        self.n_gpu = 0
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(self.local_rank)
        self.device = torch.device("cuda", self.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        self.n_gpu = 1
    # Set random seed
    self.set_seed(seed, self.n_gpu)
    # Prepare CoNLL-2003 task
    self.labels = get_labels(label_path)
    num_labels = len(self.labels)
    print(" The number of labels in this model is: %d" % num_labels)
    config_class, model_class, tokenizer_class = MODEL_CLASSES[self.model_type]
    config = config_class.from_pretrained(model_name_or_path, num_labels=num_labels, cache_dir=None)
    self.tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True, cache_dir=None)
    self.model = model_class.from_pretrained(
        model_name_or_path,
        from_tf=bool(".ckpt" in model_name_or_path),
        config=config,
        cache_dir=None,
    )
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    self.pad_token_label_id = CrossEntropyLoss().ignore_index
    print("***** Building model *****")
    print(" This is a %s NER model for citation strings" % self.model_type)
    print(" Model is loaded from: %s" % model_name_or_path)
    self.model.to(self.device)
    print(" Finished preparing the model!")
def __init__(self, model_name_or_path, labels_file='labels.txt'):
    self.LABELS = get_labels(labels_file)
    num_labels = len(self.LABELS)
    self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
    model_config = transformers.AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        output_hidden_states=True,
        output_attentions=True,
    )
    # This is just a regular PyTorch model.
    self.model = _from_pretrained(
        transformers.AutoModelForTokenClassification,
        model_name_or_path,
        config=model_config)
    self.model.load_state_dict(
        torch.load(os.path.join(model_name_or_path, 'pytorch_model.bin'),
                   map_location='cpu'))
    self.model.eval()
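# Illustrative sketch only (not part of the wrapper above): one way an instance of
# this class might be queried for token-level labels. The `wrapper` argument and the
# `predict_token_labels` helper are hypothetical names, not from the original code.
import torch

def predict_token_labels(wrapper, text):
    # Tokenize, run the frozen model, and map argmax logits back to label strings.
    encoded = wrapper.tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = wrapper.model(**encoded)[0]  # shape: (1, seq_len, num_labels)
    pred_ids = logits.argmax(dim=-1)[0].tolist()
    tokens = wrapper.tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())
    return list(zip(tokens, [wrapper.LABELS[i] for i in pred_ids]))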
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--language", default=None, type=str, required=True, help= "Evaluation language. Also train language if `train_language` is set to None.", ) parser.add_argument( "--train_language", default=None, type=str, help="Train language if is different of the evaluation language.") parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument("--gpu_id", default="", type=str, help="GPU id") parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. 
Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents.") parser.add_argument( "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents.") parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--logging_each_epoch", action="store_true", help="Logging every epoch") parser.add_argument( "--task_name", default="ner", type=str, required=True, help="The name of the task to train", ) parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, id2label={str(i): label for i, label in enumerate(labels)}, label2id={label: i for i, label in enumerate(labels)}, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer_args = { k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS } logger.info("Tokenizer arguments: %s", tokenizer_args) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, **tokenizer_args, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if 
args.do_train: # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train", lang=args.train_language) global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Prepare CONLL-2003 task labels = get_labels(data_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = ( NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None ) eval_dataset = ( NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None ) def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]: preds = np.argmax(predictions, axis=2) batch_size, seq_len = preds.shape out_label_list = [[] for _ in range(batch_size)] preds_list = [[] for _ in range(batch_size)] for i in range(batch_size): for j in range(seq_len): if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) return preds_list, out_label_list
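# A minimal sketch (an assumption, not shown in the snippet above) of how
# align_predictions is typically consumed: a compute_metrics callback handed to
# transformers.Trainer, scoring entities with seqeval. The names align_predictions,
# model, training_args, train_dataset and eval_dataset are the objects created in
# main() above; this block is meant to continue there.
from typing import Dict

from seqeval.metrics import f1_score, precision_score, recall_score
from transformers import EvalPrediction, Trainer

def compute_metrics(p: EvalPrediction) -> Dict:
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)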
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--train_data_subset", type=int, default=-1, help= "If > 0: limit the training data to a subset of train_data_subset instances." ) parser.add_argument( "--eval_data_subset", type=int, default=-1, help= "If > 0: limit the evaluation data to a subset of eval_data_subset instances." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) with open(os.path.join(args.output_dir, 'run_args.txt'), 'w') as f: f.write(json.dumps(args.__dict__, indent=2)) f.close() device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger.warning("Device: %s, n_gpu: %s", device, args.n_gpu) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) logger.info("Training/evaluation parameters %s", args) train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") if args.train_data_subset > 0: train_dataset = Subset( train_dataset, list(range(min(args.train_data_subset, len(train_dataset))))) run_feature_extractor(args, train_dataset, model)
@dataclass
class PredArguments:
    model_path: str = field(default="./model", metadata={"help": "Model saved by run_ner.py"})
    labels: str = field(default="./data/labels.txt", metadata={"help": "Path to a file containing all labels."})
    input: str = field(
        default="在福特的帮助下,阿瑟·登特在地球被毁灭前的最后一刻搭上了一艘路过地球的外星人的太空船,远离这个即将毁灭的伤心地,开始了一段充满惊奇的星河探险",
        metadata={"help": "Input data"})


if __name__ == "__main__":
    parser = HfArgumentParser(PredArguments)
    args = parser.parse_args_into_dataclasses()[0]

    model = AutoModelForTokenClassification.from_pretrained(args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    labels = get_labels(args.labels)

    inputs = tokenizer.encode(args.input, return_tensors="pt")
    outputs = model(inputs)[0]
    prediction = torch.argmax(outputs, dim=2)[0]
    seq = [labels[it] for it in prediction.tolist()]

    # Bit of a hack to get the tokens with the special tokens
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(args.input)))
    entities = get_entities(seq)

    print(f"Input: {args.input}")
    for entity in entities:
        print(f"{entity}: {''.join(tokens[entity[1]:entity[2] + 1])}")
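# Illustrative follow-up (not in the script above): seqeval's get_entities() returns
# (label, start_token, end_token) tuples with an inclusive end index; a small helper
# can turn them into structured records. `entities` and `tokens` are the names
# computed in the __main__ block above; `entities_to_dicts` is a hypothetical helper.
def entities_to_dicts(entities, tokens):
    return [
        {"type": tag, "start": start, "end": end, "text": "".join(tokens[start:end + 1])}
        for tag, start, end in entities
    ]

# Example: entities_to_dicts(entities, tokens)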
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--labels", default="", type=str, help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." ) parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." ) parser.add_argument( "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if 
args.cache_dir else None, ) # tokenizer = tokenizer_class.from_pretrained( # args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, # do_lower_case=args.do_lower_case, # cache_dir=args.cache_dir if args.cache_dir else None, # ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Evaluation parameters %s", args) # Evaluation if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) import time start_time = time.time() result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") print("--- %s seconds ---" % (time.time() - start_time)) # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w") as writer: with open(os.path.join(args.data_dir, "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
def run(article): original_article = article start_end = char_indexing(article) article = article_to_txt(article) # Prepare CONLL-2003 task labels = get_labels(labels_path) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type] config = config_class.from_pretrained( config_name if config_name else model_name_or_path, num_labels=num_labels) # Predict tokenizer = tokenizer_class.from_pretrained(output_dir, do_lower_case=do_lower_case) model = model_class.from_pretrained(output_dir) model.to(device) result, predictions = evaluate(model, tokenizer, labels, pad_token_label_id, mode="test") # # Save results # print_result_precision(result) # Save predictions example_id = 0 prediction_lines = [] container = [] # for i, line in enumerate(article): # if line.startswith("-DOCSTART-") or line == "" or line == "\n": # if not predictions[example_id]: # example_id += 1 # elif predictions[example_id]: # word = line.split()[0] # pred = predictions[example_id].pop(0) # print(start_end[i], word, pred) # output_line = str(example_id) + " " + word + " " + pred + "\n" # prediction_lines.append((word, pred)) # else: # logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) for i, line in enumerate(article): if line.startswith( "-DOCSTART-" ) or line == "" or line == "\n": #or line.endswith('.') or line.endswith(',') or line.endswith(';') : if not predictions[example_id]: example_id += 1 container.append([example_id, line, 'NONE']) elif predictions[example_id]: word = line.split()[0] pred = predictions[example_id].pop(0) container.append([example_id, word, pred]) prediction_lines.append((word, pred)) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) result = categorizing(prediction_lines) print(len(container)) print(len(start_end)) labels = get_label_docanno(start_end, container) docanno = to_docanno(original_article, labels) print('PREDICTION RESULT:') for k, v in result.items(): print(k, ':', v) print("###") # returns docanno.json print(docanno) json_filename = 'docanno.json' with open(json_filename, 'w') as f: json.dump(docanno, f) return docanno
    # Tail of an evaluate() helper; the opening of the function is not shown in this snippet.
    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list


# Prepare CoNLL-2003 task
labels = get_labels('')
nlabels = len(labels)
# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
pad_token_label_id = nn.CrossEntropyLoss().ignore_index

# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
    # Make sure only the first process in distributed training will download model & vocab
    torch.distributed.barrier()

# Build Bert tokenizer:
tokenizer_class = BertTokenizer
tokenizer = tokenizer_class.from_pretrained("bert-base-cased", cache_dir=args.data_dir)
dictionary = tokenizer.vocab
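# Illustrative sketch (an assumption, not part of the fragment above): how
# pad_token_label_id is typically applied when featurizing, so that only the first
# sub-token of each word carries a real label and the remaining sub-tokens are
# ignored by the loss. `label_map` is a hypothetical {label: id} dict built from `labels`.
def align_word_labels(words, word_labels, tokenizer, label_map, pad_token_label_id):
    tokens, label_ids = [], []
    for word, label in zip(words, word_labels):
        word_tokens = tokenizer.tokenize(word)
        if not word_tokens:  # e.g. stray whitespace yields no sub-tokens
            continue
        tokens.extend(word_tokens)
        # Real label id on the first sub-token, ignore index on the rest.
        label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
    return tokens, label_ids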
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default="unilm", type=str, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--disable_tqdm', action='store_true', help='Disable the tqdm bar. ') ## Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents.") parser.add_argument( "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents.") parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--warmup_ratio", default=0.1, type=float, help="Linear warmup over warmup_ratio.") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--metric_for_choose_best_checkpoint', type=str, default=None, help="Set the metric to choose the best checkpoint") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, id2label={str(i): label for i, label in enumerate(labels)}, label2id={label: i for i, label in enumerate(labels)}, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer_args = { k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS } logger.info("Tokenizer arguments: %s", tokenizer_args) tokenizer_name = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, **tokenizer_args, ) if not hasattr(config, 'need_pooler') or config.need_pooler is not True: setattr(config, 'need_pooler', True) model = model_class.from_pretrained( args.model_name_or_path, config=config, cache_dir=args.cache_dir if args.cache_dir else None) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) tokenizer.save_pretrained(args.output_dir) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output 
directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args) checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) logging.getLogger("transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) metric_for_best = args.metric_for_choose_best_checkpoint best_performance = None best_epoch = None for checkpoint in checkpoints: prefix = checkpoint.split( '/')[-1] if checkpoint.find('checkpoint') != -1 else "" checkpoint_config = config_class.from_pretrained(checkpoint) model = model_class.from_pretrained(checkpoint, config=checkpoint_config) model.to(args.device) result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) if metric_for_best is None: metric_for_best = list(result.keys())[-1] if best_epoch is None: best_epoch = checkpoint best_performance = result else: if best_performance[metric_for_best] < result[metric_for_best]: best_performance = result best_epoch = checkpoint if best_epoch is not None: logger.info( " ***************** Best checkpoint: {}, choosed by {} *****************" .format(best_epoch, metric_for_best)) logger.info("Best performance = %s" % json.dumps(best_performance)) save_best_result(best_epoch, best_performance, args.output_dir) checkpoint = best_epoch checkpoint_config = config_class.from_pretrained(checkpoint) model = model_class.from_pretrained(checkpoint, config=checkpoint_config) model.to(args.device) result, _ = test(args, model, tokenizer, labels, pad_token_label_id, mode="test", prefix=global_step)
def predict_entities(input_file,output_prediction_file): args = parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device args.do_eval = True args.do_predict = True # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) # logger.warning( # "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", # args.local_rank, # device, # args.n_gpu, # bool(args.local_rank != -1), # args.fp16, # ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) - 4 # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, id2label={str(i): label for i, label in enumerate(labels)}, label2id={label: i for i, label in enumerate(labels)}, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS} # logger.info("Tokenizer arguments: %s", tokenizer_args) # tokenizer = AutoTokenizer.from_pretrained( # args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, # cache_dir=args.cache_dir if args.cache_dir else None, # **tokenizer_args, # ) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,cache_dir=args.cache_dir if args.cache_dir else None,**tokenizer_args) model = AutoModelForTokenClassification_Soft_NER.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Parameters %s", args) # Evaluation # Save results on test tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args) model = AutoModelForTokenClassification_Soft_NER.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="", path=input_file ) # print(len(predictions[0])) # Save results on test output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = output_prediction_file with open(output_test_predictions_file, "w") as writer: with open(input_file, "r") as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or 
line == "" or line == "\n": writer.write(line) # print(example_id) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: # logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) continue print("\n\n") print("-------------------------------------------------------------------------------------------------") print("***** Perdictions on sentences is stored at ", output_prediction_file,"*****" ) print("-------------------------------------------------------------------------------------------------") print("\n\n")
def main(args): # Prepare CONLL-2003 task labels = get_labels(args.labels) # langs = [l for l in args.src_langs] + [args.tgt_lang] langs = args.src_langs num_langs = len(langs) pad_token_label_id = CrossEntropyLoss().ignore_index # -100 here # load target model (pretrained BERT) and tokenizer tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path) # Training if args.do_train: logger.info("********** scheme: train domain learner **********") # prepare plain text datasets & compute sentence embeddings( the order of the args.src_langs matters!!! ) f_datasets = [] domain_embeds = [] cnt_datasets = [] for i, lang in enumerate(langs): pt_dts = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, lang, mode="train", plain_text=True) st_ebd = get_init_domain_embed(args, pt_dts, lang) # dataset_size x hidden_size domain_embeds.append(torch.mean(st_ebd, dim=0)) lang_id = torch.tensor([i] * st_ebd.size(0), dtype=torch.long).to( args.device) # dataset_size f_datasets.append(TensorDataset(st_ebd, lang_id)) cnt_datasets.append(st_ebd.size(0)) f_datasets = torch.utils.data.ConcatDataset(f_datasets) domain_embeds = torch.stack( domain_embeds) # (n_langs + 1) x hidden_size, device class_weight = None if args.balance_classes: class_weight = torch.from_numpy( 1.0 / np.array(cnt_datasets, dtype=np.float32)) class_weight = class_weight / torch.sum(class_weight) class_weight.requires_grad = False class_weight = class_weight.to(args.device) domain_model = DomainLearner(num_langs, config.hidden_size, args.low_rank_size, weights_init=domain_embeds, gamma=args.gamma_R, class_weight=class_weight, domain_orthogonal=args.domain_orthogonal) domain_model.to(args.device) # Train! global_step, loss = train(args, domain_model, f_datasets) logger.info(" global_step = %s, loss = %s", global_step, loss) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(domain_model.state_dict(), os.path.join(args.output_dir, "domain_model.bin")) torch.save(args, os.path.join(args.output_dir, "training_args.bin")) if args.do_predict: # save domain similarity logger.info( "********** scheme: prediction - compute domain similarity **********" ) sims_dir = os.path.join( args.output_dir, "{}{}{}-rank_{}-gamma_{}".format( args.tgt_lang, '' if not args.balance_classes else '-balanced', '' if not args.domain_orthogonal else '-domain_orth', args.low_rank_size, args.gamma_R)) dm_namespace = torch.load(os.path.join(sims_dir, "training_args.bin")) # langs = [l for l in dm_namespace.src_langs] + [dm_namespace.tgt_lang] langs = dm_namespace.src_langs src_idxs = [langs.index(l) for l in args.src_langs] pt_dts = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="train", plain_text=True) st_ebd = get_init_domain_embed( args, pt_dts, args.tgt_lang) # dataset_size x hidden_size dataset_st_ebd = TensorDataset(st_ebd) domain_model = DomainLearner(len(langs), config.hidden_size, args.low_rank_size) domain_model.load_state_dict( torch.load(os.path.join(sims_dir, "domain_model.bin"), map_location=args.device)) domain_model.to(args.device) st_sims, dm_sims = evaluate(args, domain_model, dataset_st_ebd, src_idxs) torch.save( st_sims, os.path.join( sims_dir, "sims-{}-{}-{}-{}.bin".format(args.tau_metric, "_".join(args.src_langs), args.tgt_lang, "train")))
def main(_): logging.set_verbosity(logging.INFO) args = flags.FLAGS.flag_values_dict() if args["fp16"]: tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": True}) if args["tpu"]: resolver = tf.distribute.cluster_resolver.TPUClusterResolver( tpu=args["tpu"]) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.experimental.TPUStrategy(resolver) args["n_device"] = args["num_tpu_cores"] elif len(args["gpus"].split(",")) > 1: args["n_device"] = len( [f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) strategy = tf.distribute.MirroredStrategy( devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) elif args["no_cuda"]: args["n_device"] = 1 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: args["n_device"] = len(args["gpus"].split(",")) strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0]) logging.warning( "n_device: %s, distributed training: %s, 16-bits training: %s", args["n_device"], bool(args["n_device"] > 1), args["fp16"], ) labels = get_labels(args["labels"]) pad_token_label_id = -1 logging.info("predict parameters %s", args) tokenizer = AutoTokenizer.from_pretrained( args["output_dir"], do_lower_case=args["do_lower_case"]) model = TFAutoModelForTokenClassification.from_pretrained( args["output_dir"]) while True: print('Input chinese sentence:') line = str(input()) if line == 'quit': break if len(line) < 1: print( 'Please input a chinese sentence or "quit" to break this loop:' ) continue examples = read_examples_from_line(line) features = convert_examples_to_features( examples, labels, args["max_seq_length"], tokenizer, cls_token_at_end=bool(args["model_type"] in ["xlnet"]), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0, sep_token=tokenizer.sep_token, sep_token_extra=bool(args["model_type"] in ["roberta"]), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(args["model_type"] in ["xlnet"]), # pad on the left for xlnet pad_token=tokenizer.pad_token_id, pad_token_segment_id=tokenizer.pad_token_type_id, pad_token_label_id=pad_token_label_id, ) feature = features[0] X = collections.OrderedDict() X["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List( value=list(feature.input_ids))) X["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List( value=list(feature.input_mask))) X["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List( value=list(feature.segment_ids))) X["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List( value=list(feature.label_ids))) tf_example = tf.train.Example(features=tf.train.Features(feature=X)) tf_example = tf_example.SerializeToString() max_seq_length = args["max_seq_length"] name_to_features = { "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64), "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64), } def _decode_record(record): example = tf.io.parse_single_example(record, name_to_features) features = {} features["input_ids"] = example["input_ids"] features["input_mask"] = example["input_mask"] features["segment_ids"] = example["segment_ids"] return features, example["label_ids"] dataset = [] dataset.append(tf_example) dataset = tf.data.Dataset.from_tensor_slices(dataset) dataset = dataset.map(_decode_record) batch_size = 1 dataset = dataset.batch(batch_size) eval_features, eval_labels = iter(dataset).next() inputs = { "attention_mask": eval_features["input_mask"], "training": False } if args["model_type"] != "distilbert": inputs["token_type_ids"] = (eval_features["segment_ids"] if args["model_type"] in ["bert", "xlnet"] else None) with strategy.scope(): logits = model(eval_features["input_ids"], **inputs)[0] active_loss = tf.reshape(eval_labels, (-1, )) != pad_token_label_id preds = logits.numpy() label_ids = eval_labels.numpy() preds = np.argmax(preds, axis=2) y_pred = [[] for _ in range(label_ids.shape[0])] for i in range(label_ids.shape[0]): for j in range(label_ids.shape[1]): if label_ids[i, j] != pad_token_label_id: y_pred[i].append(labels[preds[i, j]]) tokens = tokenizer.tokenize(line) print('## tokens = %s' % tokens) print('## y_pred = %s' % y_pred) print('## %s = %s' % (len(tokens), len(y_pred[0]))) word_group = [] subword = {} def _add_word(subword): word_group.append(subword['token'] + '/' + subword['flag']) subword.clear() for i, token in enumerate(tokens): flag = y_pred[0][i] print('## %s = %s' % (token, flag)) if flag.startswith('B'): if len(subword) > 0: _add_word(subword) subword['token'] = token subword['flag'] = flag elif flag.startswith('I'): if (len(subword) > 0 and (y_pred[0][i - 1].startswith('I') or y_pred[0][i - 1].startswith('B')) and (y_pred[0][i - 1][1:] == flag[1:])): subword['token'] = subword['token'] + token continue elif len(subword) > 0: _add_word(subword) subword['token'] = token subword['flag'] = flag else: if len(subword) > 0: _add_word(subword) subword['token'] = token subword['flag'] = flag _add_word(subword) if len(subword) > 0: _add_word(subword) print('## word_group = %s' % word_group)
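# Aside (a sketch, not part of the loop above): the manual B-/I- grouping can also be
# expressed with seqeval's get_entities, which yields (type, start, end) spans over a
# flat tag sequence. `tokens` and `y_pred` are the names computed above; `group_words`
# is a hypothetical helper.
from seqeval.metrics.sequence_labeling import get_entities

def group_words(tokens, tags):
    # Join the sub-tokens of each predicted entity span into "word/TYPE" strings.
    return ["".join(tokens[start:end + 1]) + "/" + tag for tag, start, end in get_entities(tags)]

# Example: group_words(tokens, y_pred[0])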
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.") parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " ) parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--labels", default="", type=str, help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.") parser.add_argument("--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument("--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument("--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.") parser.add_argument("--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=12, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=48, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=30.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=5000, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=5000, help="Save checkpoint every X updates steps.") parser.add_argument("--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument("--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument("--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") parser.add_argument("--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") parser.add_argument("--info", default="question", type=str, help="Information to be passed.") # parser.add_argument("--dataset_to_eval", default=None, type=str, # help="Dataset which is being tested.") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels) tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) if global_step: result = {"{}_{}".format(global_step, k): v for k, v in result.items()} results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) test_data_dirs = ['AnatEM', 'BC2GM', 'BC4CHEMD', 'BC5CDR', 'BIONLP09', 'BIONLP11EPI', 'BIONLP11ID', 'BIONLP13CG', 'BIONLP13GE', 'BIONLP13PC', 'CRAFT', 'ExPTM', 'JNLPBA', 'Linnaeus', 'NCBIDisease', '2012TemporalRelations', '2010RelationsChallege', '2011CoreferenceChallenge'] # test_data_dirs = [args.dataset_to_eval] if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) if args.n_gpu > 1: model = torch.nn.DataParallel(model) base_data_dir = args.data_dir for data_dir in tqdm(test_data_dirs,desc="Data_Dir:"): # for data_dir in [""]: dataset_test = data_dir print("Current Data Dir:"+data_dir) data_dir=os.path.join(base_data_dir,data_dir) print("Updated Data Dir:"+data_dir) args.data_dir=data_dir result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results # output_test_results_file = os.path.join(args.data_dir, args.info+"_test_results.txt") output_test_results_file = os.path.join(args.output_dir, dataset_test+"_"+args.info+"_test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions # output_test_predictions_file = os.path.join(args.data_dir, args.info+"_test_predictions.jsonl") output_test_predictions_file = os.path.join(args.output_dir, dataset_test+"_"+args.info+"_test_predictions.jsonl") with jsonlines.open(output_test_predictions_file, "w") as writer: examples = read_examples_from_file_qa(args.data_dir,"test",args.info) for idx,example in enumerate(examples): outrow = example.orig_row outrow["predicted_label"] = predictions[idx] outrow["input_words"] = example.words outrow["labels"] = example.labels writer.write(outrow) # print(example.words,predictions[example.guid],example.labels) # 
writer.write(example.guid+"\n") # count=0 # for word,pred,label in zip(example.words,predictions[idx],example.labels): # writer.write("%s\t%s\t%s\n"%(word,pred,label)) # if pred=="B": # count+=1 # true_counts=example.counts # writer.write("PredCounts:%d, TrueCounts:%d \n"%(count,true_counts)) # with open(os.path.join(args.data_dir, "test.txt"), "r") as f: # example_id = 0 # for line in f: # if line.startswith("-DOCSTART-") or line == "" or line == "\n": # writer.write(line) # if not predictions[example_id]: # example_id += 1 # elif predictions[example_id]: # output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n" # writer.write(output_line) # else: # logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) return results
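The per-dataset output above follows one pattern: a sorted "key = value" metrics file plus a JSON-lines file with one record per example. A minimal sketch of that pattern using the standard-library json module rather than the jsonlines package; `dump_results` and its arguments are illustrative names:

import json
import os

def dump_results(output_dir, dataset_name, info, metrics, rows):
    """Write one metrics file and one JSON-lines predictions file per dataset."""
    results_path = os.path.join(output_dir, f"{dataset_name}_{info}_test_results.txt")
    with open(results_path, "w") as writer:
        for key in sorted(metrics):
            writer.write(f"{key} = {metrics[key]}\n")

    preds_path = os.path.join(output_dir, f"{dataset_name}_{info}_test_predictions.jsonl")
    with open(preds_path, "w") as writer:
        for row in rows:                        # one dict per input example
            writer.write(json.dumps(row, ensure_ascii=False) + "\n")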
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Task embeddings parser.add_argument( "--num_softmax_classifiers", default=1, type=int, help="Number of softmax classifiers on top of Bert's output.") parser.add_argument("--pow", type=float, default=2.0, help="Return features to the power pow.") parser.add_argument( "--feature_type", default='grads', type=str, help="The type of the features selected in ['grads', 'weights']") parser.add_argument("--batch_size", default=32, type=int, help="Batch size.") parser.add_argument( "--retain_gradients", default=True, type=eval, help= "Whether to retain gradients at each layer output of the feature extractor." ) parser.add_argument("--do_pooling", default=True, type=eval, help="Whether to pool the feature extractor.") parser.add_argument( "--use_labels", default=True, type=eval, help= "Whether to use training labels or sample from the model's predictive distribution \n" "pθ(y|xn), e.g., to compute the theoretical Fisher information.") parser.add_argument( "--num_trials_for_FIM", type=int, default=100, help= "Number of trials to sample from the model's predictive distribution pθ(y|xn)." ) parser.add_argument( "--FIM_scale", type=float, default=0.25, help= "Standard deviation of the distribution used to compute the theoretical FIM." ) parser.add_argument("--finetune_classifier", default=False, type=eval, help="Whether to fine-tune the final classifier.") parser.add_argument("--finetune_feature_extractor", default=False, type=eval, help="Whether to fine-tune the feature extractor.") ## Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( '--save', type=str, default='all', help="Select load mode from ['all', '0', '1', '2', '3', ...]") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) with open(os.path.join(args.output_dir, 'run_args.txt'), 'w') as f: f.write(json.dumps(args.__dict__, indent=2)) f.close() # Setup CUDA, GPU training device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() args.device = device if args.n_gpu > 1: raise ValueError("This code only supports a single GPU.") # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger.warning("Device: %s, n_gpu: %s", device, args.n_gpu) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) args.num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=args.num_labels, num_softmax_classifiers=args.num_softmax_classifiers, retain_gradients=args.retain_gradients, do_pooling=args.do_pooling, cache_dir=args.cache_dir if args.cache_dir else None) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None) model.to(args.device) logger.info("List of model named parameters:") for n, p in list(model.named_parameters()): logger.info("%s", n) logger.info("Training/evaluation parameters %s", args) # Good practice: save your training arguments 
together with the trained model torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) tokenizer.save_pretrained(args.output_dir) train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") if args.data_subset > 0: train_dataset = Subset( train_dataset, list(range(min(args.data_subset, len(train_dataset))))) compute_taskemb(args, train_dataset, model)
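The `--data_subset` truncation above is simply `torch.utils.data.Subset` over the first N indices of the cached training set. A tiny sketch with dummy tensors standing in for the real dataset:

import torch
from torch.utils.data import Subset, TensorDataset

full = TensorDataset(torch.arange(100).unsqueeze(1))          # stand-in for the cached examples
subset = Subset(full, list(range(min(10, len(full)))))        # keep at most the first 10 examples
print(len(subset))                                            # -> 10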
"bert": (BertConfig, BertForTokenClassification, BertTokenizer), } config_class, model_class, tokenizer_class = MODEL_CLASSES['bert'] # location of the model of choice model_dir = 'model/' model = model_class.from_pretrained(model_dir) case = False tokenizer = tokenizer_class.from_pretrained(model_dir, do_lower_case=case) device = 'cpu' model.to(device) # setting the folder for possible whole directory tagging files_dir = 'data/' # getting labels labels = get_labels('labels.txt') pad_token_label_id = CrossEntropyLoss().ignore_index # reading examples examples = read_examples_from_file('.', 'test') features = convert_examples_to_features( examples, labels, 256, tokenizer, cls_token_at_end=False, cls_token=tokenizer.cls_token, cls_token_segment_id=0, sep_token=tokenizer.sep_token, sep_token_extra=False, pad_on_left=False,
def main(args): # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # -100 here tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels) src_datasets = [] for src in args.src_langs: src_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, src, mode="train") src_datasets.append(src_dataset) # Training if args.do_train: logger.info("********** scheme: training with KD **********") # compute sentence embeddings of target training examples dataset_pt = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="train") src_probs, src_predictions = get_src_weighted_probs(args, dataset_pt, config, mode="train", src_datasets=src_datasets) # load target model (pretrained BERT) and tokenizer model = BertForTokenClassification_.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) model.to(args.device) task_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="train") if args.train_hard_label: # update the src_probs as hard labels voting_labels = torch.argmax(src_probs, dim=-1).cpu() voting_labels[task_dataset.tensors[3] == pad_token_label_id] = pad_token_label_id task_dataset = TensorDataset(task_dataset.tensors[0], task_dataset.tensors[1], task_dataset.tensors[2], voting_labels) src_probs = None # Train! global_step, loss_KD, loss = train(args, model, task_dataset, src_probs, pad_token_label_id, src_predictions) logger.info(" global_step = %s, average task KD loss = %s, average task loss = %s", global_step, loss_KD, loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) if args.do_voting: logger.info("********** scheme: Voting with %s **********", "averaging" if args.sim_dir == "" else "{}/rank_{}-gamma_{}".format(args.sim_dir, args.low_rank_size, args.gamma_R)) test_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="test") src_probs, _ = get_src_weighted_probs(args, test_dataset, config, mode="test", src_datasets=src_datasets) result, predictions = weighted_voting(args, test_dataset, src_probs, labels, pad_token_label_id) # Save results output_test_results_file = os.path.join(args.output_dir, "test_results-{}-{}.txt".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), os.path.basename(args.data_dir))) with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions-{}-{}.txt".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), os.path.basename(args.data_dir))) with open(output_test_predictions_file, "w") as writer: with open(os.path.join(args.data_dir, args.tgt_lang, "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split()[0] + " " + line.split()[-1].replace("\n", "") + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) if args.do_predict: logger.info("********** scheme: prediction **********") model = BertForTokenClassification_.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results-{}-{}.txt".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), args.tgt_lang)) with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions-{}-{}.txt".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), args.tgt_lang)) with open(output_test_predictions_file, "w", encoding='utf-8') as writer: with open(os.path.join(args.data_dir, args.tgt_lang, "test.txt"), "r", encoding='utf-8') as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split()[0] + " " + line.split()[-1].replace("\n", "") + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
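The `--train_hard_label` branch above collapses the source models' soft probabilities into hard labels by argmax while preserving the padding ignore index. A minimal sketch of that step; `hard_labels_from_probs` is an illustrative name:

import torch

def hard_labels_from_probs(src_probs, gold_label_ids, pad_token_label_id=-100):
    """Turn (batch, seq_len, num_labels) soft probabilities into hard label ids,
    keeping the ignore index wherever the gold labels mark padding."""
    voted = torch.argmax(src_probs, dim=-1)
    voted[gold_label_ids == pad_token_label_id] = pad_token_label_id
    return voted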
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents.") parser.add_argument( "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents.") parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument( "--has_new_labels", action="store_true", help= "Tells the trainer that more labels are present than in the pretrained model." ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer_args = { k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS } logger.info("Tokenizer arguments: %s", tokenizer_args) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, **tokenizer_args, ) if args.has_new_labels and os.path.exists(args.model_name_or_path): old_config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, ) old_model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=old_config, cache_dir=args.cache_dir if args.cache_dir else None, ) old_num_labels = old_config.num_labels new_labels = [ label for label in labels if label not in old_config.label2id.keys() ] id2label = { **old_config.id2label, **{ i + old_num_labels: label for i, label in enumerate(new_labels) } } label2id = { **old_config.label2id, **{ label: i + old_num_labels for i, label in enumerate(new_labels) } } config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, id2label=id2label, label2id=label2id, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class(config) params_old = old_model.named_parameters() params_new = model.named_parameters() dict_params_old = dict(params_old) dict_params_new = dict(params_new) old_labels = list(old_config.label2id.keys()) new_labels = list(config.label2id.keys()) logger.info("Old labels %s", old_labels) logger.info("New labels %s", new_labels) sort_keys = np.argsort(new_labels) for 
param_name, param_old in dict_params_old.items(): if param_name in dict_params_new: if param_name.startswith('classifier'): # not only do the dimensions differ when adding a new label # but something resorts the labels (apparently alphabetically) # and uses that resorted list to determine the matrix structure # so we should apply that logic here explicitly for target_idx, current_idx in enumerate(sort_keys): target_label = new_labels[current_idx] if target_label in old_labels: idx_in_old_weights = old_labels.index(target_label) dict_params_new[param_name].data[target_idx].copy_( param_old.data[idx_in_old_weights]) else: dict_params_new[param_name].data.copy_(param_old.data) model.load_state_dict(dict_params_new) elif args.has_new_labels: raise Exception( "You have specified you're adding new labels but not provided a base model" ) else: config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, id2label={str(i): label for i, label in enumerate(labels)}, label2id={label: i for i, label in enumerate(labels)}, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w", encoding="utf-8") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, **tokenizer_args) model = model_class.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w", encoding="utf-8") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w", encoding="utf-8") as writer: with open(os.path.join(args.data_dir, "test.txt"), "r", encoding="utf-8") as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split( )[0] + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) return results
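The `--has_new_labels` path above re-orders and copies classifier rows by label name so that previously learned labels keep their weights and only genuinely new labels start from random initialization. A minimal sketch of that idea on a bare nn.Linear classification head; `expand_classifier` is an illustrative helper, not the original code:

import torch
import torch.nn as nn

def expand_classifier(old_linear: nn.Linear, old_label2id: dict, new_label2id: dict) -> nn.Linear:
    """Build a larger classifier and copy over the rows of labels that already existed."""
    hidden = old_linear.in_features
    new_linear = nn.Linear(hidden, len(new_label2id))
    with torch.no_grad():
        for label, new_idx in new_label2id.items():
            if label in old_label2id:
                old_idx = old_label2id[label]
                new_linear.weight[new_idx].copy_(old_linear.weight[old_idx])
                new_linear.bias[new_idx] = old_linear.bias[old_idx]
    return new_linear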
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--train_data_subset", type=int, default=-1, help= "If > 0: limit the training data to a subset of train_data_subset instances." ) parser.add_argument( "--eval_data_subset", type=int, default=-1, help= "If > 0: limit the evaluation data to a subset of eval_data_subset instances." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--finetune_feature_extractor", default=True, type=eval, help="Whether to fine-tune the feature extractor.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs." ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( '--model_load_mode', type=str, default='all', help="Select load mode from ['base_model_only', 'all']") parser.add_argument( '--save', type=str, default='all', help="Select load mode from ['all', '0', '1', '2', '3', ...]") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Create output directory if needed if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) with open(os.path.join(args.output_dir, 'run_args.txt'), 'w') as f: f.write(json.dumps(args.__dict__, indent=2)) f.close() # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab if args.model_load_mode not in ['base_model_only', 'all']: raise ValueError("Model load mode not found: %s" % (args.model_load_mode)) state_dict_with_prefix = None if args.model_load_mode == 'base_model_only': archive_file = os.path.join(args.model_name_or_path, WEIGHTS_NAME) model_state_dict = torch.load(archive_file) state_dict_with_prefix = {} for key, value in model_state_dict.items(): if key.startswith(args.model_type): state_dict_with_prefix[key] = value args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, state_dict=state_dict_with_prefix) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) tokenizer.save_pretrained(args.output_dir) # Training if args.do_train: 
train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") if args.train_data_subset > 0: train_dataset = Subset( train_dataset, list(range(min(args.train_data_subset, len(train_dataset))))) global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: eval_list = [] if args.save == 'all': eval_list = range(int(args.num_train_epochs + 1)) elif str(args.save).isdigit(): eval_list = [int(args.save)] if len(eval_list) > 0: for epoch in eval_list: checkpoints = [ os.path.join(args.output_dir, 'checkpoint-{}'.format(epoch)) ] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger( "pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( '-')[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split( '/')[-1] if checkpoint.find('checkpoint') != -1 else "" model = model_class.from_pretrained(checkpoint) tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(args.device) result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", checkpoint_id=epoch, prefix=prefix) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format( key, str(results[key]))) return results
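The `--save` handling above decides which epoch checkpoints get evaluated: 'all' walks checkpoint-0 through checkpoint-N, while a digit selects a single epoch. A small sketch of that selection; `checkpoints_to_eval` is an illustrative name:

import os

def checkpoints_to_eval(output_dir, save, num_train_epochs):
    """Return the checkpoint directories selected by the --save option."""
    if save == "all":
        epochs = range(int(num_train_epochs) + 1)
    elif str(save).isdigit():
        epochs = [int(save)]
    else:
        epochs = []
    return [os.path.join(output_dir, f"checkpoint-{e}") for e in epochs]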
def __init__(self, hparams):
    self.labels = get_labels(hparams.labels)
    num_labels = len(self.labels)
    super(NERTransformer, self).__init__(hparams, num_labels)
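All of these scripts lean on the same `get_labels()` helper: read one label per line from a file, or fall back to the CoNLL-2003 tag set when no path is given. A minimal sketch of that behaviour (the exact fallback list and the "ensure O is present" detail are assumptions):

def get_labels(path=None):
    """Read labels from a file (one per line) or fall back to CoNLL-2003 tags."""
    if path:
        with open(path, "r", encoding="utf-8") as f:
            labels = [line.strip() for line in f if line.strip()]
        # make sure the outside tag is present so padding/defaults stay meaningful
        return labels if "O" in labels else ["O"] + labels
    return ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]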
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info( "n_replicas: %s, distributed training: %s, 16-bits training: %s", training_args.n_replicas, bool(training_args.n_replicas > 1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Prepare Token Classification task labels = get_labels(data_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) with training_args.strategy.scope(): model = TFAutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_pt=bool(".bin" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = ( TFNerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None ) eval_dataset = ( TFNerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None ) def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]: preds = np.argmax(predictions, axis=2) batch_size, seq_len = preds.shape out_label_list = [[] for _ in range(batch_size)] preds_list = [[] for _ in range(batch_size)] for i in range(batch_size): for j in range(seq_len): if label_ids[i, j] != -1: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) return preds_list, out_label_list def compute_metrics(p: EvalPrediction) -> Dict: preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) return { "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), } # Initialize our Trainer trainer = TFTrainer( model=model, args=training_args, train_dataset=train_dataset.get_dataset() if train_dataset else None, 
eval_dataset=eval_dataset.get_dataset() if eval_dataset else None, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train() trainer.save_model() tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) # Predict if training_args.do_predict: test_dataset = TFNerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.test, ) predictions, label_ids, metrics = trainer.predict(test_dataset.get_dataset()) preds_list, labels_list = align_predictions(predictions, label_ids) report = classification_report(labels_list, preds_list) logger.info("\n%s", report) output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: writer.write("%s\n" % report) # Save predictions output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w") as writer: with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not preds_list[example_id]: example_id += 1 elif preds_list[example_id]: output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) return results
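`compute_metrics` above relies on seqeval's entity-level scoring of the aligned label lists, so a prediction only counts when the whole span and its type match. A small usage sketch (the toy sequences are illustrative):

from seqeval.metrics import f1_score, precision_score, recall_score

y_true = [["B-LOC", "I-LOC", "O", "B-PER"]]
y_pred = [["B-LOC", "I-LOC", "O", "O"]]

# the LOC span matches exactly, the PER span is missed entirely
print(precision_score(y_true, y_pred))  # 1.0
print(recall_score(y_true, y_pred))     # 0.5
print(f1_score(y_true, y_pred))         # ~0.67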
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Prepare CONLL-2003 task labels = get_labels(data_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) tui_ids = None if data_args.umls: tui_ids = create_cui_dict(voc_updated=data_args.med_document, tokenizer=tokenizer) # Get datasets train_dataset = ( NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, tui_ids=tui_ids, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None ) eval_dataset = ( NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, tui_ids=tui_ids, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None ) def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]: preds = np.argmax(predictions, axis=2) batch_size, seq_len = preds.shape out_label_list = [[] for _ in range(batch_size)] preds_list = [[] for _ in range(batch_size)] for i in range(batch_size): for j in range(seq_len): if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) return preds_list, out_label_list def compute_metrics(p: EvalPrediction) -> Dict: preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) return { "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), } # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None ) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) # Predict if training_args.do_predict: test_dataset = NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, tui_ids=tui_ids, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.test, ) predictions, label_ids, metrics = trainer.predict(test_dataset) preds_list, _ = align_predictions(predictions, label_ids) 
output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") if trainer.is_world_master(): with open(output_test_results_file, "w") as writer: for key, value in metrics.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) # Save predictions output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") if trainer.is_world_master(): with open(output_test_predictions_file, "w") as writer: with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f: example_id = 0 for line in f: try: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not preds_list[example_id]: example_id += 1 elif preds_list[example_id]: output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0] ) except IndexError: break return results
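# --- Illustration (added for clarity, not part of the scripts above) ---
# The PyTorch variants reuse nn.CrossEntropyLoss().ignore_index (-100 by default)
# as the padding label id: any target equal to ignore_index is excluded from the
# loss, so sub-word pieces and special tokens never contribute to training.
import torch
from torch import nn

loss_fct = nn.CrossEntropyLoss()
print(loss_fct.ignore_index)                    # -100

logits = torch.randn(4, 3)                      # 4 tokens, 3 labels
targets = torch.tensor([0, 2, -100, 1])         # third token is padding
print(loss_fct(logits, targets).item())         # averaged over the 3 real tokens only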
def __init__(self, hparams): self.labels = get_labels(hparams.labels) num_labels = len(self.labels) self.pad_token_label_id = CrossEntropyLoss().ignore_index super(NERTransformer, self).__init__(hparams, num_labels, self.mode)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=10000000, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") parser.add_argument( "--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the ud2: ", ) parser.add_argument( "--base_data_dir", default=None, type=str, required=False, help= "The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--data_size", default=None, type=int, help= "Data_size for Shapley Training (None for full data), e.g., 100 for debug ", ) parser.add_argument( "--indices_to_delete_file_path", default=None, type=str, help="File path where the ids are to delete", ) parser.add_argument( "--domain_to_delete_file_path", default=None, type=str, help="File path where the ids are to delete", ) parser.add_argument( "--is_baseline_run", action="store_true", help="Is the baseline run to get result, data_size, random/init score", ) parser.add_argument( "--LOO", action="store_true", help="Whether to calculate LOO or not?", ) parser.add_argument( "--is_few_shot", action="store_true", help="Whether to calculate LOO or not?", ) parser.add_argument( "--num_bags", default=20, type=int, help="How many bags to approximate the mean performance ", ) args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) args.base_data_dir = args.data_dir args.task_name = args.task_name # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: # old code # train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") # global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # new code ALL_BINARY_TASKS = [ 'UD_ARABIC', 'UD_BASQUE', 'UD_BULGARIAN', 'UD_CATALAN', 'UD_CHINESE', 'UD_CROATIAN', 'UD_CZECH', 'UD_DANISH', 'UD_DUTCH', 'UD_ENGLISH', 'UD_FINNISH', 'UD_FRENCH', 'UD_GERMAN', 'UD_HEBREW', 'UD_HINDI', 'UD_INDONESIAN', 'UD_ITALIAN', 'UD_JAPANESE', 'UD_KOREAN', 'UD_NORWEGIAN', 'UD_PERSIAN', 'UD_POLISH', 'UD_PORTUGUESE', 'UD_ROMANIAN', 'UD_RUSSIAN', 'UD_SERBIAN', 'UD_SLOVAK', 'UD_SLOVENIAN', 'UD_SPANISH', 'UD_SWEDISH', 'UD_TURKISH' ] ALL_BINARY_TASKS.remove(args.task_name) logger.info( " ALL_POS_TASKS = %s, task = %s 
args.indices_to_delete_file_path = %s", ALL_BINARY_TASKS, args.task_name, args.indices_to_delete_file_path) logger.info( " not evaluate = %s args.indices_to_delete_file_path and not evaluate=%s", not evaluate, args.indices_to_delete_file_path and not evaluate) if args.indices_to_delete_file_path: with open(args.indices_to_delete_file_path, "r") as reader: print("***** reading ids to remove *****", flush=True) data = reader.read().replace('\n', '').strip().split() ids = np.array([int(i) for i in data]) logger.info(" Data = %s, ids = %s", data, str(ids)) ALL_BINARY_TASKS = np.delete(np.array(ALL_BINARY_TASKS), ids, axis=0) logger.info(" After delete ALL_BINARY_TASKS = %s", ALL_BINARY_TASKS) ALL_BINARY_TASKS = np.random.permutation(ALL_BINARY_TASKS) logger.info(" After Permutation ALL_BINARY_TASKS = %s", ALL_BINARY_TASKS) if len(ALL_BINARY_TASKS) > 0: train_dataset, random_init_result, n_train_points = \ load_and_cache_examples(args, ALL_BINARY_TASKS[0], tokenizer, labels, pad_token_label_id, mode="train") for task in ALL_BINARY_TASKS[1:]: train_dataset2, random_init_result2, n_train_points2 = \ load_and_cache_examples(args, task, tokenizer, labels, pad_token_label_id, mode="train") train_dataset += train_dataset2 n_train_points += n_train_points2 if args.is_few_shot: train_dataset2, random_init_result2, n_train_points2 = \ load_and_cache_examples(args, args.task_name, tokenizer, labels, pad_token_label_id, mode="dev") train_dataset += train_dataset2 n_train_points += n_train_points2 output_eval_file = os.path.join(args.output_dir, '', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Creating Empty Eval results file *****") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) output_eval_file = os.path.join(args.output_dir, "training_results" + ".txt") with open(output_eval_file, "w") as writer: logger.info("***** Writig Training dataset size {} *****") logger.info("%s = %s\n" % ('n_points', n_train_points)) writer.write("%s = %s\n" % ('n_points', n_train_points)) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): if results[key]: writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) results, (predictions, gold_labels) = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(results.keys()): if results[key]: writer.write("{} = {}\n".format(key, str(results[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") output_test_gold_file = os.path.join(args.output_dir, "test_gold.txt") pred_count = 0 with open(output_test_predictions_file, "w") as writer: with open(output_test_gold_file, "w") as f: for gold_line, pred_line in zip(gold_labels, predictions): for gold, pred in zip(gold_line, pred_line): f.write(gold + "\n") writer.write(pred + "\n") pred_count += 1 logger.info( "Written %s sentences to both gold and %s to pred file. Total %d word", len(gold_labels), len(predictions), pred_count) return results
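# --- Illustration (added for clarity, not part of the scripts above) ---
# A toy sketch of the --eval_all_checkpoints discovery used above: every directory
# under output_dir that contains a weights file is treated as a checkpoint.
# "output" is a placeholder path; WEIGHTS_NAME mirrors the transformers constant.
import glob
import os

WEIGHTS_NAME = "pytorch_model.bin"
output_dir = "output"                           # hypothetical output directory

checkpoints = list(
    os.path.dirname(c)
    for c in sorted(glob.glob(output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
)
print(checkpoints)                              # e.g. ['output/checkpoint-1000', 'output/checkpoint-500']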
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default="../mid_out1/", type=str, required=False, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default="bert", type=str, required=False, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default="bert-base-uncased", # default="ner_out/run3/", type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default="./ner_out/bert_02", type=str, required=False, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters # 1:bio,0:01,2:BIOE parser.add_argument( "--labels", default=2, type=int, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", # default="bert-base-uncased", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", # default="bert-base-uncased", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="./cach/", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=80, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", default=True, action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", default=False, action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", default=True, action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", default=True, action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=10000, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", # default=True, action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument( "--no_cuda", # default=True, action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", default=True, action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", default=True, action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=37, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset, train_examples = load_and_cache_examples( args, tokenizer, labels, pad_token_label_id, mode="train") data_stitic(train_examples, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _, dev_examples = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) data_stitic(dev_examples, "dev") if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) result, predictions, examples = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev") data_stitic(examples, "test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions spans = get_propgranda_span(predictions, examples) output_test_predictions_file = os.path.join(args.output_dir, "BIOE_predictions.txt") with open(output_test_predictions_file, "w") as writer: for span in spans: out_str = span[0] + "\t" + str(span[1]) + "\t" + str( span[2]) + "\n" writer.write(out_str) return results
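# --- Hypothetical sketch (added for clarity; get_propgranda_span is defined elsewhere) ---
# One plausible way a BIOE tag sequence could be collapsed into token-level spans,
# matching the "# 1:bio,0:01,2:BIOE" labelling scheme selected above. The real
# implementation likely also maps these token indices back to article offsets.
def bioe_to_spans(tags):
    spans, start = [], None
    for i, tag in enumerate(tags):
        if tag == "B":
            start = i
        elif tag == "E" and start is not None:
            spans.append((start, i))
            start = None
        elif tag == "O":
            start = None
    return spans

print(bioe_to_spans(["O", "B", "I", "E", "O", "B", "E"]))   # [(1, 3), (5, 6)]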
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Prepare CONLL-2003 task labels = get_labels(data_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) model = AutoModelForTokenMultiLabelClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (NerDataset( data_dir=data_args.data_dir, data_format=data_args.data_format, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, multilabeling=True, ) if training_args.do_train else None) eval_dataset = (NerDataset( data_dir=data_args.data_dir, data_format=data_args.data_format, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, multilabeling=True, ) if training_args.do_eval else None) def get_label_preds_refs( predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]: """ Returns a list of labels for each token in each sequence in the dataset. """ logit_threshold = 0.0 # Corresponds to a probability of 0.5 if fed through a sigmoid. 
preds = predictions > logit_threshold batch_size, seq_len, _ = preds.shape refs_list = [[] for _ in range(batch_size)] preds_list = [[] for _ in range(batch_size)] for i in range(batch_size): for j in range(seq_len): preds_list[i].append( [label_map[x] for x in np.where(preds[i][j] == 1)[0]]) refs_list[i].append( [label_map[x] for x in np.where(label_ids[i][j] == 1)[0]]) return preds_list, refs_list def align_predictions( predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]: logit_threshold = 0.0 # Corresponds to a probability of 0.5 if fed through a sigmoid. preds = predictions > logit_threshold batch_size, seq_len, _ = preds.shape # is_tagged indicates for each token whether it has an associated tag (i.e. a # label, including the O label) and should be assessed, otherwise it's # a padding or special token. is_tagged = label_ids.sum(axis=2) > 0 out_label_list = [[] for _ in range(batch_size)] preds_list = [[] for _ in range(batch_size)] for i in range(batch_size): for j in range(seq_len): if is_tagged[i, j]: #out_label_list[i].append(label_map[label_ids[i][j]]) out_label_list[i].append([ label_map[x] for x in np.where(label_ids[i][j] == 1)[0] ]) #preds_list[i].append(label_map[preds[i][j]]) preds_list[i].append( [label_map[x] for x in np.where(preds[i][j] == 1)[0]]) return preds_list, out_label_list def compute_metrics(p: EvalPrediction) -> Dict: preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) (chunk_prec, chunk_rec, chunk_f1, tok_prec, tok_rec, tok_f1) = fsn4nlp.utils.conlleval.evaluate_multilabel( out_label_list, preds_list) return { "chunk_precision": chunk_prec, "chunk_recall": chunk_rec, "chunk_f1": chunk_f1, "tok_precision": tok_prec, "tok_recall": tok_rec, "tok_f1": tok_f1, } # Initialize our Trainer trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, labels=labels) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) # Predict if training_args.do_predict: test_dataset = NerDataset( data_dir=data_args.data_dir, data_format=data_args.data_format, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.test, multilabeling=True, ) predictions, label_ids, metrics = trainer.predict(test_dataset) preds_list, refs_list = get_label_preds_refs(predictions, label_ids) output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") if trainer.is_world_master(): with open(output_test_results_file, "w") as writer: for key, value in metrics.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) # Save predictions output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") if trainer.is_world_master(): with open(output_test_predictions_file, "w") as writer: for i, example in enumerate(test_dataset): for tok_id in example.input_ids: tok = tokenizer.convert_ids_to_tokens(tok_id) if refs_list[i][0] == []: output_line = f"{tok}\n" refs_list[i].pop(0) else: output_line = f"{tok} {refs_list[i].pop(0)} {preds_list[i].pop(0)}\n" writer.write(output_line) return results
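# --- Illustration (added for clarity, not part of the scripts above) ---
# The multilabel variant thresholds raw logits at 0.0, which is equivalent to a
# 0.5 cut on sigmoid probabilities; several labels can be active for one token.
# label_map and the logits below are toy values.
import numpy as np

label_map = {0: "O", 1: "B-LOC", 2: "B-ORG"}    # hypothetical label set
logits = np.array([[-1.2, 0.8, 0.3]])           # one token, three label logits

probs = 1.0 / (1.0 + np.exp(-logits))
print(probs.round(2))                           # [[0.23 0.69 0.57]]
print([label_map[i] for i in np.where(logits[0] > 0.0)[0]])   # ['B-LOC', 'B-ORG']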
def main(_): logging.set_verbosity(logging.INFO) args = flags.FLAGS.flag_values_dict() if (os.path.exists(args["output_dir"]) and os.listdir(args["output_dir"]) and args["do_train"] and not args["overwrite_output_dir"]): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args["output_dir"])) if args["fp16"]: tf.config.optimizer.set_experimental_options( {"auto_mixed_precision": True}) if args["tpu"]: resolver = tf.distribute.cluster_resolver.TPUClusterResolver( tpu=args["tpu"]) tf.config.experimental_connect_to_cluster(resolver) tf.tpu.experimental.initialize_tpu_system(resolver) strategy = tf.distribute.experimental.TPUStrategy(resolver) args["n_device"] = args["num_tpu_cores"] elif len(args["gpus"].split(",")) > 1: args["n_device"] = len( [f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) strategy = tf.distribute.MirroredStrategy( devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")]) elif args["no_cuda"]: args["n_device"] = 1 strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") else: args["n_device"] = len(args["gpus"].split(",")) strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + args["gpus"].split(",")[0]) logging.warning( "n_device: %s, distributed training: %s, 16-bits training: %s", args["n_device"], bool(args["n_device"] > 1), args["fp16"], ) labels = get_labels(args["labels"]) num_labels = len(labels) pad_token_label_id = -1 # IBO print(args["config_name"] if args["config_name"] else args["model_name_or_path"]) config = AutoConfig.from_pretrained( args["config_name"] if args["config_name"] else args["model_name_or_path"], num_labels=num_labels, cache_dir=args["cache_dir"], ) logging.info("Training/evaluation parameters %s", args) args["model_type"] = config.model_type # Training if args["do_train"]: tokenizer = AutoTokenizer.from_pretrained( args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"], do_lower_case=args["do_lower_case"], cache_dir=args["cache_dir"], ) with strategy.scope(): model = TFAutoModelForTokenClassification.from_pretrained( args["model_name_or_path"], from_pt=bool(".bin" in args["model_name_or_path"]), config=config, cache_dir=args["cache_dir"], ) train_batch_size = args["per_device_train_batch_size"] * args[ "n_device"] train_dataset, num_train_examples = load_and_cache_examples( args, tokenizer, labels, pad_token_label_id, train_batch_size, mode="train") train_dataset = strategy.experimental_distribute_dataset(train_dataset) train( args, strategy, train_dataset, tokenizer, model, num_train_examples, labels, train_batch_size, pad_token_label_id, ) os.makedirs(args["output_dir"], exist_ok=True) logging.info("Saving model to %s", args["output_dir"]) model.save_pretrained(args["output_dir"]) tokenizer.save_pretrained(args["output_dir"]) # Evaluation if args["do_eval"]: tokenizer = AutoTokenizer.from_pretrained( args["output_dir"], do_lower_case=args["do_lower_case"]) checkpoints = [] results = [] if args["eval_all_checkpoints"]: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True), key=lambda f: int("".join(filter(str.isdigit, f)) or -1), )) logging.info("Evaluate the following checkpoints: %s", checkpoints) if len(checkpoints) == 0: checkpoints.append(args["output_dir"]) for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if re.match( ".*checkpoint-[0-9]", checkpoint) else "final" with strategy.scope(): model = 
TFAutoModelForTokenClassification.from_pretrained( checkpoint) y_true, y_pred, eval_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="dev") report = metrics.classification_report(y_true, y_pred, digits=4) if global_step: results.append({ global_step + "_report": report, global_step + "_loss": eval_loss }) output_eval_file = os.path.join(args["output_dir"], "eval_results.txt") with tf.io.gfile.GFile(output_eval_file, "w") as writer: for res in results: for key, val in res.items(): if "loss" in key: logging.info(key + " = " + str(val)) writer.write(key + " = " + str(val)) writer.write("\n") else: logging.info(key) logging.info("\n" + report) writer.write(key + "\n") writer.write(report) writer.write("\n") if args["do_predict"]: tokenizer = AutoTokenizer.from_pretrained( args["output_dir"], do_lower_case=args["do_lower_case"]) model = TFAutoModelForTokenClassification.from_pretrained( args["output_dir"]) eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"] predict_dataset, _ = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, eval_batch_size, mode="test") y_true, y_pred, pred_loss = evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode="test") output_test_results_file = os.path.join(args["output_dir"], "test_results.txt") output_test_predictions_file = os.path.join(args["output_dir"], "test_predictions.txt") report = metrics.classification_report(y_true, y_pred, digits=4) with tf.io.gfile.GFile(output_test_results_file, "w") as writer: report = metrics.classification_report(y_true, y_pred, digits=4) logging.info("\n" + report) writer.write(report) writer.write("\n\nloss = " + str(pred_loss)) with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer: with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not y_pred[example_id]: example_id += 1 elif y_pred[example_id]: output_line = line.split( )[0] + " " + y_pred[example_id].pop(0) + "\n" writer.write(output_line) else: logging.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
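# --- Illustration (added for clarity, not part of the scripts above) ---
# A minimal sketch of the tf.distribute pattern the TF2 script relies on: pick a
# strategy, create the model inside strategy.scope(), and distribute the tf.data
# pipeline. A OneDeviceStrategy and a toy dataset stand in for the real setup.
import tensorflow as tf

strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")

with strategy.scope():
    # model construction (e.g. TFAutoModelForTokenClassification.from_pretrained)
    # must happen inside the scope so variables are placed/replicated correctly
    pass

dataset = tf.data.Dataset.from_tensor_slices(tf.range(8)).batch(4)
dist_dataset = strategy.experimental_distribute_dataset(dataset)
for batch in dist_dataset:
    print(strategy.run(lambda x: x * 2, args=(batch,)))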
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default="./data/ner/conll/debug", type=str, # required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task." ) parser.add_argument( "--output_dir", default="conll-model/test", type=str, # required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--src_model_dir", default="conll-model-22/", type=str, help="path to load teacher models") parser.add_argument( "--src_model_dir_prefix", default="mono-src-", type=str, help="prefix of the teacher model dir (to indicate the model type)") parser.add_argument("--src_langs", type=str, nargs="+", default="en", help="source languages used for multi-teacher models") parser.add_argument("--unlabeled_data_ratio", type=float, default=1.0, help="Ratio of the training data to use.") ## Other parameters parser.add_argument( "--model_type", default='bert', type=str, # required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default='bert-base-multilingual-cased', type=str, # required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--labels", default="./data/ner/conll/labels.txt", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used." ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_KD", action="store_true", help="Whether to train with knowledge distillation.") parser.add_argument("--hard_label", action="store_true") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=32, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=32, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--freeze_bottom_layer", default=3, type=int, help="Freeze the bottom n layers of the model during fine-tuning.") parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--warmup_ratio", default=0.1, type=float, help="Linear warmup over warmup_ratio.") parser.add_argument("--logging_steps", type=int, default=20, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=20000, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=667, help="random seed for initialization") parser.add_argument("--gpu_ids", type=int, nargs="+", default=0, help="ids of the gpus to use") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() # Check output_dir if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train and os.path.exists( os.path.join(args.output_dir, "pytorch_model.bin")) and os.path.basename( args.output_dir) != "test": raise ValueError( "Train: Output directory already exists and is not empty.") if os.path.exists(args.output_dir) and args.do_predict: is_done = False for name in os.listdir( args.output_dir): # result file: "test_results-TIME-LANGUAGE" if "test_results" in name and (os.path.basename(args.data_dir) + ".txt") in name: is_done = True break if is_done: raise ValueError( "Predict: Output directory ({}) already exists and is not empty." 
.format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: args.n_gpu = len(args.gpu_ids) # torch.cuda.device_count() os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_ids[0]) # os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3,4,5,6" device = torch.device("cuda") # device = torch.device("cpu") if (args.n_gpu == 0 or args.no_cuda) else torch.device( # "cuda:{}".format(args.gpu_ids[0])) else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) args.log_dir = os.path.join(args.output_dir, "logs") if not os.path.exists(args.log_dir): os.makedirs(args.log_dir) formatter = logging.Formatter('%(asctime)s %(levelname)s: - %(message)s', datefmt='%Y-%m-%d %H:%M:%S') log_name = "log-{}".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())) if args.do_train: log_name += "-train" if args.do_predict: log_name += "-predict" log_name += "-{}".format("_".join(args.src_langs)) log_name += "-{}.txt".format(os.path.basename(args.data_dir)) fh = logging.FileHandler(os.path.join(args.log_dir, log_name)) fh.setLevel(logging.INFO if args.local_rank in [-1, 0] else logging.WARN) fh.setFormatter(formatter) ch = logging.StreamHandler() ch.setLevel(logging.INFO if args.local_rank in [-1, 0] else logging.WARN) ch.setFormatter(formatter) logger.addHandler(fh) logger.addHandler(ch) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # -100 here # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() logger.info("Training/evaluation parameters %s", args) config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] # Training if args.do_train: # load target model config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) # prepare target training 
plain text train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") if args.do_KD: logger.info("********** scheme: training with KD **********") # compute probs from source models w = 1.0 / len(args.src_langs) weight_probs = {l: w for l in args.src_langs} src_probs = None for lang in args.src_langs: if src_probs is None: src_probs = weight_probs[lang] * get_src_probs( args, train_dataset, model_class, src_lang=lang) else: src_probs += weight_probs[lang] * get_src_probs( args, train_dataset, model_class, src_lang=lang) # Train! if args.hard_label: hard_labels = torch.argmax(src_probs, dim=-1, keepdim=False) train_dataset.tensors += (hard_labels, ) global_step, tr_loss = train(args, model, train_dataset, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) else: global_step, tr_loss_KD, tr_loss = train_KD( args, model, train_dataset, src_probs, tokenizer, labels, pad_token_label_id) logger.info( " global_step = %s, average KD loss = %s, average loss = %s", global_step, tr_loss_KD, tr_loss) else: logger.info("********** scheme: training without KD **********") global_step, tr_loss = train(args, model, train_dataset, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) if args.do_predict and args.local_rank in [-1, 0]: logger.info("********** scheme: prediction **********") tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join( args.output_dir, "test_results-{}-{}.txt".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), os.path.basename(args.data_dir))) with open(output_test_results_file, "w", encoding='utf-8') as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join( args.output_dir, "test_predictions-{}-{}.txt".format( time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), os.path.basename(args.data_dir))) with open(output_test_predictions_file, "w", encoding='utf-8') as writer: with open(os.path.join(args.data_dir, "test.txt"), "r", encoding='utf-8') as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split()[0] + " " + line.split( )[-1].replace( "\n", "") + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
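# --- Hypothetical sketch (added for clarity; train_KD is defined elsewhere) ---
# A minimal soft-label distillation objective of the kind train_KD presumably
# optimizes: KL divergence between the averaged teacher probabilities (src_probs)
# and the student's log-probabilities, masked so that positions labelled with
# pad_token_label_id do not contribute. Shapes and tensors below are toy values.
import torch
import torch.nn.functional as F

def soft_kd_loss(student_logits, teacher_probs, labels, pad_token_label_id=-100):
    # student_logits, teacher_probs: (batch, seq_len, num_labels); labels: (batch, seq_len)
    mask = labels.ne(pad_token_label_id).unsqueeze(-1).float()
    log_probs = F.log_softmax(student_logits, dim=-1)
    per_token = F.kl_div(log_probs, teacher_probs, reduction="none").sum(-1, keepdim=True)
    return (per_token * mask).sum() / mask.sum().clamp(min=1.0)

student_logits = torch.randn(2, 4, 3)
teacher_probs = torch.softmax(torch.randn(2, 4, 3), dim=-1)
labels = torch.tensor([[0, 1, -100, -100], [2, 0, 1, -100]])
print(soft_kd_loss(student_logits, teacher_probs, labels))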