Example no. 1
 def predict_callback(model, step):
     labels = get_labels(args.labels)
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
     pad_token_label_id = CrossEntropyLoss().ignore_index
     if args.do_predict and args.local_rank in [-1, 0]:
         tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case)
         evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
     model.train()
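
All of these snippets lean on the same padding trick: CrossEntropyLoss ignores any target equal to its ignore_index (-100 by default), so label positions padded with that value drop out of the loss. A minimal, self-contained sketch (not taken from any of the examples) makes this concrete:

import torch
from torch.nn import CrossEntropyLoss

loss_fct = CrossEntropyLoss()
pad_token_label_id = loss_fct.ignore_index  # -100 by default

logits = torch.randn(5, 3)  # 5 token positions, 3 possible labels
labels = torch.tensor([0, 2, pad_token_label_id, 1, pad_token_label_id])

# Padded positions contribute nothing: the loss equals the loss over the
# three real labels alone.
real = labels != pad_token_label_id
assert torch.allclose(loss_fct(logits, labels),
                      CrossEntropyLoss()(logits[real], labels[real]))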
Example no. 2
 def load_dataset(self, mode, batch_size):
     labels = get_labels(self.hparams.labels)
     self.pad_token_label_id = CrossEntropyLoss().ignore_index
     dataset = self.load_and_cache_examples(labels, self.pad_token_label_id,
                                            mode)
     if mode == "train":
         if self.hparams.n_gpu > 1:
             sampler = DistributedSampler(dataset)
         else:
             sampler = RandomSampler(dataset)
     else:
         sampler = SequentialSampler(dataset)
     dataloader = DataLoader(dataset,
                             sampler=sampler,
                             batch_size=batch_size)
     return dataloader
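
The sampler choice above follows the usual pattern: DistributedSampler shards the training data across processes when several GPUs are in play, RandomSampler shuffles for single-process training, and SequentialSampler keeps evaluation order deterministic. A toy single-process sketch (DistributedSampler is omitted because it needs an initialized process group):

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset

dataset = TensorDataset(torch.arange(8))
train_loader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=4)  # shuffled
eval_loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=4)  # in order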
Example no. 3
  def init_model(self):
    print("***** Loading the model *****")
    # Parameters
    self.model_type = 'roberta'
    label_path = 'model/labels.txt'
    model_name_or_path = 'model'
    self.per_gpu_eval_batch_size = 8
    self.max_seq_length = 512
    self.local_rank = -1
    seed = 42

    # Setup CUDA, GPU & distributed training
    if self.local_rank == -1:
        self.device = torch.device("cpu")
        self.n_gpu = 0
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(self.local_rank)
        self.device = torch.device("cuda", self.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        self.n_gpu = 1

    # Set random seed
    self.set_seed(seed, self.n_gpu)

    # Prepare CONLL-2003 task
    self.labels = get_labels(label_path)
    num_labels = len(self.labels)
    print("  The number of labels in this model is: %d" % num_labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    self.pad_token_label_id = CrossEntropyLoss().ignore_index
    config_class, model_class, tokenizer_class = MODEL_CLASSES[self.model_type]
    config = config_class.from_pretrained(model_name_or_path,
                                          num_labels=num_labels,
                                          cache_dir=None)
    self.tokenizer = tokenizer_class.from_pretrained(model_name_or_path,
                                                do_lower_case=True,
                                                cache_dir=None)
    self.model = model_class.from_pretrained(model_name_or_path,
                                        from_tf=bool(".ckpt" in model_name_or_path),
                                        config=config,
                                        cache_dir=None)
    print("***** Building model *****")
    print("  This is a %s NER model for citation strings" % self.model_type)
    print("  Model is loaded from: %s" % model_name_or_path)
    self.model.to(self.device)
    print("  Finished preparing the model!")
Example no. 4
 def __init__(self, model_name_or_path, labels_file='labels.txt'):
     self.LABELS = get_labels(labels_file)
     num_labels = len(self.LABELS)
     self.tokenizer = transformers.AutoTokenizer.from_pretrained(
         model_name_or_path)
     model_config = transformers.AutoConfig.from_pretrained(
         model_name_or_path,
         num_labels=num_labels,
         output_hidden_states=True,
         output_attentions=True,
     )
     # This is just a regular PyTorch model.
     self.model = _from_pretrained(
         transformers.AutoModelForTokenClassification,
         model_name_or_path,
         config=model_config)
     self.model.load_state_dict(
         torch.load(os.path.join(model_name_or_path, 'pytorch_model.bin'),
                    map_location='cpu'))
     self.model.eval()
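
_from_pretrained is a helper defined elsewhere in that snippet's project; the state dict is then reloaded explicitly with map_location='cpu' so a GPU-trained checkpoint also loads on a CPU-only host, and eval() disables dropout for inference. Hypothetical usage of the wrapper (the class name, checkpoint path, and example sentence are assumptions):

# wrapper = NerWrapper("path/to/checkpoint", labels_file="labels.txt")
# enc = wrapper.tokenizer("John lives in Berlin", return_tensors="pt")
# logits = wrapper.model(**enc)[0]  # (1, seq_len, num_labels)
# tags = [wrapper.LABELS[i] for i in logits.argmax(-1)[0].tolist()]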
Example no. 5
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters

    parser.add_argument(
        "--language",
        default=None,
        type=str,
        required=True,
        help=
        "Evaluation language. Also train language if `train_language` is set to None.",
    )
    parser.add_argument(
        "--train_language",
        default=None,
        type=str,
        help="Train language if it is different from the evaluation language.")

    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument("--gpu_id", default="", type=str, help="GPU id")

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--logging_each_epoch",
                        action="store_true",
                        help="Logging every epoch")

    parser.add_argument(
        "--task_name",
        default="ner",
        type=str,
        required=True,
        help="The name of the task to train",
    )

    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")

    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label
                  for i, label in enumerate(labels)},
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train",
                                                lang=args.train_language)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
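
The id2label and label2id dictionaries passed to the config above are just the two directions of an enumeration over the label list. A tiny illustration with assumed CoNLL-style labels:

labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]
id2label = {str(i): label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}
assert id2label["1"] == "B-PER" and label2id["B-PER"] == 1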
Example no. 6
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list
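
A quick check of align_predictions with toy inputs (the two-label label_map is an assumption): argmax picks the highest-scoring label per position, and positions whose gold label is -100, the CrossEntropyLoss ignore index, are skipped. The same logic, self-contained:

import numpy as np

label_map = {0: "O", 1: "B-PER"}
predictions = np.array([[[2.0, 0.1], [0.1, 2.0], [2.0, 0.1]]])  # (batch=1, seq=3, labels=2)
label_ids = np.array([[0, 1, -100]])  # the last position is padding

preds = np.argmax(predictions, axis=2)
preds_list = [[label_map[p] for p, l in zip(pr, la) if l != -100]
              for pr, la in zip(preds, label_ids)]
# preds_list == [["O", "B-PER"]]: the padded position is dropped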
Example no. 7
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--train_data_subset",
        type=int,
        default=-1,
        help=
        "If > 0: limit the training data to a subset of train_data_subset instances."
    )
    parser.add_argument(
        "--eval_data_subset",
        type=int,
        default=-1,
        help=
        "If > 0: limit the evaluation data to a subset of eval_data_subset instances."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")

    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Create output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    with open(os.path.join(args.output_dir, 'run_args.txt'), 'w') as f:
        f.write(json.dumps(args.__dict__, indent=2))

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("Device: %s, n_gpu: %s", device, args.n_gpu)

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    train_dataset = load_and_cache_examples(args,
                                            tokenizer,
                                            labels,
                                            pad_token_label_id,
                                            mode="train")
    if args.train_data_subset > 0:
        train_dataset = Subset(
            train_dataset,
            list(range(min(args.train_data_subset, len(train_dataset)))))
    run_feature_extractor(args, train_dataset, model)
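
The Subset wrapper used above caps the training data at train_data_subset examples without copying anything; it just re-indexes the underlying dataset. A toy illustration:

import torch
from torch.utils.data import Subset, TensorDataset

dataset = TensorDataset(torch.arange(10))
subset = Subset(dataset, list(range(min(3, len(dataset)))))
assert len(subset) == 3 and subset[0][0].item() == 0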
Example no. 8
@dataclass
class PredArguments:
    model_path: str = field(default="./model",
                            metadata={"help": "Model saved by run_ner.py"})
                            metadata={"help": "Model saved by run_ner.py"})
    labels: str = field(
        default="./data/labels.txt",
        metadata={"help": "Path to a file containing all labels."})
    input: str = field(
        default=
        "在福特的帮助下,阿瑟·登特在地球被毁灭前的最后一刻搭上了一艘路过地球的外星人的太空船,远离这个即将毁灭的伤心地,开始了一段充满惊奇的星河探险",
        metadata={"help": "Input data"})


if __name__ == "__main__":
    parser = HfArgumentParser(PredArguments)
    args = parser.parse_args_into_dataclasses()[0]

    model = AutoModelForTokenClassification.from_pretrained(args.model_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    labels = get_labels(args.labels)

    inputs = tokenizer.encode(args.input, return_tensors="pt")
    outputs = model(inputs)[0]
    prediction = torch.argmax(outputs, dim=2)[0]

    seq = [labels[it] for it in prediction.tolist()]
    # Bit of a hack to get the tokens with the special tokens
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(args.input)))
    entities = get_entities(seq)
    print(f"Input: {args.input}")
    for entity in entities:
        print(f"{entity}: {''.join(tokens[entity[1]:entity[2] + 1])}")
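
get_entities here presumably behaves like seqeval's helper of the same name: it collapses a BIO tag sequence into (type, start, end) spans with inclusive token indices, which is why the span above is printed as tokens[entity[1]:entity[2] + 1]. Assuming the seqeval version:

from seqeval.metrics.sequence_labeling import get_entities

seq = ["O", "B-PER", "I-PER", "O", "B-LOC"]
print(get_entities(seq))  # [('PER', 1, 2), ('LOC', 4, 4)]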
Example no. 9
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args()

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    # tokenizer = tokenizer_class.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #     do_lower_case=args.do_lower_case,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    # )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Evaluation parameters %s", args)

    # Evaluation
    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)

        import time
        start_time = time.time()
        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
        print("--- %s seconds ---" % (time.time() - start_time))
        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
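
The prediction-writing loop above assumes CoNLL-style input: one token per line (only the first whitespace-separated column is used), with -DOCSTART- markers and blank sentence separators copied through unchanged. An assumed fragment of such a test.txt:

-DOCSTART- O

John B-PER
lives O
in O
Berlin B-LOC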
Example no. 10
def run(article):
    original_article = article
    start_end = char_indexing(article)

    article = article_to_txt(article)

    # Prepare CONLL-2003 task
    labels = get_labels(labels_path)

    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    config = config_class.from_pretrained(
        config_name if config_name else model_name_or_path,
        num_labels=num_labels)

    # Predict
    tokenizer = tokenizer_class.from_pretrained(output_dir,
                                                do_lower_case=do_lower_case)
    model = model_class.from_pretrained(output_dir)
    model.to(device)
    result, predictions = evaluate(model,
                                   tokenizer,
                                   labels,
                                   pad_token_label_id,
                                   mode="test")

    # # Save results
    # print_result_precision(result)

    # Save predictions
    example_id = 0
    prediction_lines = []
    container = []

    for i, line in enumerate(article):
        if line.startswith("-DOCSTART-") or line == "" or line == "\n":  # or line.endswith('.') or line.endswith(',') or line.endswith(';')
            if not predictions[example_id]:
                example_id += 1
                container.append([example_id, line, 'NONE'])
        elif predictions[example_id]:
            word = line.split()[0]
            pred = predictions[example_id].pop(0)

            container.append([example_id, word, pred])
            prediction_lines.append((word, pred))
        else:
            logger.warning(
                "Maximum sequence length exceeded: No prediction for '%s'.",
                line.split()[0])
    result = categorizing(prediction_lines)

    print(len(container))
    print(len(start_end))
    labels = get_label_docanno(start_end, container)
    docanno = to_docanno(original_article, labels)

    print('PREDICTION RESULT:')
    for k, v in result.items():
        print(k, ':', v)
        print("###")

    # returns docanno.json
    print(docanno)
    json_filename = 'docanno.json'
    with open(json_filename, 'w') as f:
        json.dump(docanno, f)

    return docanno
Example no. 11
    results = {
        "loss": eval_loss,
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list),
    }

    logger.info("***** Eval results %s *****", prefix)
    for key in sorted(results.keys()):
        logger.info("  %s = %s", key, str(results[key]))

    return results, preds_list


# Prepare CONLL-2003 task
labels = get_labels('')
nlabels = len(labels)
# Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
pad_token_label_id = nn.CrossEntropyLoss().ignore_index

# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
    torch.distributed.barrier(
    )  # Make sure only the first process in distributed training will download model & vocab

# Build Bert Tokenizer:
tokenizer_class = BertTokenizer
tokenizer = tokenizer_class.from_pretrained("bert-base-cased",
                                            cache_dir=args.data_dir)
dictionary = tokenizer.vocab
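
BertTokenizer.vocab is a dict mapping wordpiece strings to ids, so the dictionary built above can be used for lookups directly. A quick probe (the exact ids and size depend on the bert-base-cased vocabulary):

from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-cased")
print(tok.vocab["[PAD]"])  # 0
print(len(tok.vocab))      # 28996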
Example no. 12
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default="unilm",
                        type=str,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--disable_tqdm',
                        action='store_true',
                        help='Disable the tqdm bar.')

    ## Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_ratio",
                        default=0.1,
                        type=float,
                        help="Linear warmup over warmup_ratio.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--metric_for_choose_best_checkpoint',
                        type=str,
                        default=None,
                        help="Set the metric to choose the best checkpoint")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label
                  for i, label in enumerate(labels)},
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer_name = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
    tokenizer = tokenizer_class.from_pretrained(
        tokenizer_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    if not hasattr(config, 'need_pooler') or config.need_pooler is not True:
        setattr(config, 'need_pooler', True)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        tokenizer.save_pretrained(args.output_dir)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)

        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                          recursive=True)))
        logging.getLogger("transformers.modeling_utils").setLevel(
            logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        metric_for_best = args.metric_for_choose_best_checkpoint
        best_performance = None
        best_epoch = None

        for checkpoint in checkpoints:
            prefix = checkpoint.split('/')[-1] if 'checkpoint' in checkpoint else ""
            checkpoint_config = config_class.from_pretrained(checkpoint)
            model = model_class.from_pretrained(checkpoint,
                                                config=checkpoint_config)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=prefix)

            if metric_for_best is None:
                metric_for_best = list(result.keys())[-1]
            if best_epoch is None:
                best_epoch = checkpoint
                best_performance = result
            else:
                if best_performance[metric_for_best] < result[metric_for_best]:
                    best_performance = result
                    best_epoch = checkpoint

        if best_epoch is not None:
            logger.info(
                " ***************** Best checkpoint: {}, chosen by {} *****************"
                .format(best_epoch, metric_for_best))
            logger.info("Best performance = %s", json.dumps(best_performance))

            save_best_result(best_epoch, best_performance, args.output_dir)
            checkpoint = best_epoch
            checkpoint_config = config_class.from_pretrained(checkpoint)
            model = model_class.from_pretrained(checkpoint,
                                                config=checkpoint_config)
            model.to(args.device)
            prefix = checkpoint.split('/')[-1] if 'checkpoint' in checkpoint else ""
            result, _ = test(args,
                             model,
                             tokenizer,
                             labels,
                             pad_token_label_id,
                             mode="test",
                             prefix=prefix)
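
Note on the loop above: the best checkpoint is chosen greedily by a single dev-set metric, falling back to the last key in the results dict when --metric_for_choose_best_checkpoint is not given. A minimal, self-contained sketch of that selection rule (pick_best is a hypothetical helper, not part of the original script):

def pick_best(results_by_checkpoint, metric=None):
    best_ckpt, best_metrics = None, None
    for ckpt, metrics in results_by_checkpoint.items():
        if metric is None:
            metric = list(metrics.keys())[-1]  # same fallback as in the loop above
        if best_metrics is None or metrics[metric] > best_metrics[metric]:
            best_ckpt, best_metrics = ckpt, metrics
    return best_ckpt, best_metrics

# pick_best({"out/checkpoint-500": {"f1": 0.81},
#            "out/checkpoint-1000": {"f1": 0.85}}, metric="f1")
# -> ("out/checkpoint-1000", {"f1": 0.85})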

def predict_entities(input_file, output_prediction_file):
    args = parse_args()
    
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    
    args.do_eval = True
    args.do_predict = True

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels) - 4
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS}
    # logger.info("Tokenizer arguments: %s", tokenizer_args)
    # tokenizer = AutoTokenizer.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #     cache_dir=args.cache_dir if args.cache_dir else None,
    #     **tokenizer_args,
    # )
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,cache_dir=args.cache_dir if args.cache_dir else None,**tokenizer_args)
    model = AutoModelForTokenClassification_Soft_NER.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Parameters %s", args)

    

    # Evaluation

    tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args)
    model = AutoModelForTokenClassification_Soft_NER.from_pretrained(args.output_dir)
    model.to(args.device)


    result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="", path=input_file)
    # Save results on test
    output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
    with open(output_test_results_file, "w") as writer:
        for key in sorted(result.keys()):
            writer.write("{} = {}\n".format(key, str(result[key])))
    # Save predictions
    output_test_predictions_file = output_prediction_file
    with open(output_test_predictions_file, "w") as writer:
        with open(input_file, "r") as f:
            example_id = 0
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    writer.write(line)
                    # print(example_id)
                    if not predictions[example_id]:
                        example_id += 1
                elif predictions[example_id]:
                    output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
                    writer.write(output_line)
                else:
                    # logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
                    continue

    print("\n\n")
    print("-------------------------------------------------------------------------------------------------")
    print("***** Perdictions on sentences is stored at ", output_prediction_file,"*****" )
    print("-------------------------------------------------------------------------------------------------")
    print("\n\n")
Example n. 14
0
def main(args):
    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)

    # langs = [l for l in args.src_langs] + [args.tgt_lang]
    langs = args.src_langs
    num_langs = len(langs)

    pad_token_label_id = CrossEntropyLoss().ignore_index  # -100 here

    # load target model (pretrained BERT) and tokenizer
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.model_name_or_path)

    # Training
    if args.do_train:
        logger.info("********** scheme: train domain learner **********")
        # prepare plain-text datasets & compute sentence embeddings (the order of args.src_langs matters!)
        f_datasets = []
        domain_embeds = []
        cnt_datasets = []
        for i, lang in enumerate(langs):
            pt_dts = load_and_cache_examples(args,
                                             tokenizer,
                                             labels,
                                             pad_token_label_id,
                                             lang,
                                             mode="train",
                                             plain_text=True)
            st_ebd = get_init_domain_embed(args, pt_dts,
                                           lang)  # dataset_size x hidden_size
            domain_embeds.append(torch.mean(st_ebd, dim=0))
            lang_id = torch.tensor([i] * st_ebd.size(0), dtype=torch.long).to(
                args.device)  # dataset_size
            f_datasets.append(TensorDataset(st_ebd, lang_id))
            cnt_datasets.append(st_ebd.size(0))

        f_datasets = torch.utils.data.ConcatDataset(f_datasets)
        domain_embeds = torch.stack(domain_embeds)  # n_langs x hidden_size, on args.device

        class_weight = None
        if args.balance_classes:
            class_weight = torch.from_numpy(
                1.0 / np.array(cnt_datasets, dtype=np.float32))
            class_weight = class_weight / torch.sum(class_weight)
            class_weight.requires_grad = False
            class_weight = class_weight.to(args.device)

        domain_model = DomainLearner(num_langs,
                                     config.hidden_size,
                                     args.low_rank_size,
                                     weights_init=domain_embeds,
                                     gamma=args.gamma_R,
                                     class_weight=class_weight,
                                     domain_orthogonal=args.domain_orthogonal)
        domain_model.to(args.device)

        # Train!
        global_step, loss = train(args, domain_model, f_datasets)
        logger.info(" global_step = %s, loss = %s", global_step, loss)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(domain_model.state_dict(),
                   os.path.join(args.output_dir, "domain_model.bin"))
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    if args.do_predict:
        # save domain similarity
        logger.info(
            "********** scheme: prediction - compute domain similarity **********"
        )
        sims_dir = os.path.join(
            args.output_dir, "{}{}{}-rank_{}-gamma_{}".format(
                args.tgt_lang, '' if not args.balance_classes else '-balanced',
                '' if not args.domain_orthogonal else '-domain_orth',
                args.low_rank_size, args.gamma_R))

        dm_namespace = torch.load(os.path.join(sims_dir, "training_args.bin"))
        # langs = [l for l in dm_namespace.src_langs] + [dm_namespace.tgt_lang]
        langs = dm_namespace.src_langs
        src_idxs = [langs.index(l) for l in args.src_langs]

        pt_dts = load_and_cache_examples(args,
                                         tokenizer,
                                         labels,
                                         pad_token_label_id,
                                         args.tgt_lang,
                                         mode="train",
                                         plain_text=True)
        st_ebd = get_init_domain_embed(
            args, pt_dts, args.tgt_lang)  # dataset_size x hidden_size
        dataset_st_ebd = TensorDataset(st_ebd)

        domain_model = DomainLearner(len(langs), config.hidden_size,
                                     args.low_rank_size)
        domain_model.load_state_dict(
            torch.load(os.path.join(sims_dir, "domain_model.bin"),
                       map_location=args.device))
        domain_model.to(args.device)

        st_sims, dm_sims = evaluate(args, domain_model, dataset_st_ebd,
                                    src_idxs)

        torch.save(
            st_sims,
            os.path.join(
                sims_dir,
                "sims-{}-{}-{}-{}.bin".format(args.tau_metric,
                                              "_".join(args.src_langs),
                                              args.tgt_lang, "train")))
Example n. 15
0
def main(_):
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if args["fp16"]:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    if args["tpu"]:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=args["tpu"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args["n_device"] = args["num_tpu_cores"]
    elif len(args["gpus"].split(",")) > 1:
        args["n_device"] = len(
            [f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
        strategy = tf.distribute.MirroredStrategy(
            devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
    elif args["no_cuda"]:
        args["n_device"] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
        args["n_device"] = len(args["gpus"].split(","))
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" +
                                                   args["gpus"].split(",")[0])

    logging.warning(
        "n_device: %s, distributed training: %s, 16-bits training: %s",
        args["n_device"],
        bool(args["n_device"] > 1),
        args["fp16"],
    )

    labels = get_labels(args["labels"])
    pad_token_label_id = -1

    logging.info("predict parameters %s", args)
    tokenizer = AutoTokenizer.from_pretrained(
        args["output_dir"], do_lower_case=args["do_lower_case"])
    model = TFAutoModelForTokenClassification.from_pretrained(
        args["output_dir"])

    while True:
        print('Input Chinese sentence:')
        line = str(input())
        if line == 'quit':
            break
        if len(line) < 1:
            print('Please input a Chinese sentence or "quit" to break this loop:')
            continue

        examples = read_examples_from_line(line)
        features = convert_examples_to_features(
            examples,
            labels,
            args["max_seq_length"],
            tokenizer,
            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args["model_type"] in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args["model_type"] in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=pad_token_label_id,
        )

        feature = features[0]
        X = collections.OrderedDict()

        X["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.input_ids)))
        X["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.input_mask)))
        X["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.segment_ids)))
        X["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.label_ids)))
        tf_example = tf.train.Example(features=tf.train.Features(feature=X))
        tf_example = tf_example.SerializeToString()

        max_seq_length = args["max_seq_length"]
        name_to_features = {
            "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        }

        def _decode_record(record):
            example = tf.io.parse_single_example(record, name_to_features)
            features = {}
            features["input_ids"] = example["input_ids"]
            features["input_mask"] = example["input_mask"]
            features["segment_ids"] = example["segment_ids"]
            return features, example["label_ids"]

        dataset = []
        dataset.append(tf_example)

        dataset = tf.data.Dataset.from_tensor_slices(dataset)
        dataset = dataset.map(_decode_record)

        batch_size = 1
        dataset = dataset.batch(batch_size)

        eval_features, eval_labels = next(iter(dataset))

        inputs = {
            "attention_mask": eval_features["input_mask"],
            "training": False
        }

        if args["model_type"] != "distilbert":
            inputs["token_type_ids"] = (eval_features["segment_ids"]
                                        if args["model_type"]
                                        in ["bert", "xlnet"] else None)

        with strategy.scope():
            logits = model(eval_features["input_ids"], **inputs)[0]
            active_loss = tf.reshape(eval_labels, (-1, )) != pad_token_label_id

        preds = logits.numpy()
        label_ids = eval_labels.numpy()

        preds = np.argmax(preds, axis=2)
        y_pred = [[] for _ in range(label_ids.shape[0])]

        for i in range(label_ids.shape[0]):
            for j in range(label_ids.shape[1]):
                if label_ids[i, j] != pad_token_label_id:
                    y_pred[i].append(labels[preds[i, j]])

        tokens = tokenizer.tokenize(line)
        print('## tokens = %s' % tokens)
        print('## y_pred = %s' % y_pred)
        print('## %s = %s' % (len(tokens), len(y_pred[0])))
        word_group = []
        subword = {}

        def _add_word(subword):
            word_group.append(subword['token'] + '/' + subword['flag'])
            subword.clear()

        for i, token in enumerate(tokens):
            flag = y_pred[0][i]
            print('## %s = %s' % (token, flag))
            if flag.startswith('B'):
                if len(subword) > 0:
                    _add_word(subword)
                subword['token'] = token
                subword['flag'] = flag
            elif flag.startswith('I'):
                if (len(subword) > 0 and (y_pred[0][i - 1].startswith('I')
                                          or y_pred[0][i - 1].startswith('B'))
                        and (y_pred[0][i - 1][1:] == flag[1:])):
                    subword['token'] = subword['token'] + token
                    continue
                elif len(subword) > 0:
                    _add_word(subword)
                subword['token'] = token
                subword['flag'] = flag
            else:
                if len(subword) > 0:
                    _add_word(subword)
                subword['token'] = token
                subword['flag'] = flag
                _add_word(subword)

        if len(subword) > 0:
            _add_word(subword)
        print('## word_group = %s' % word_group)
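
The token-grouping loop above merges consecutive B-/I- tags of the same entity type into one "token/flag" entry. A compact standalone sketch of the same idea (not the original function):

def group_bio(tokens, tags):
    words, cur_tok, cur_tag = [], None, None
    for token, tag in zip(tokens, tags):
        if tag.startswith('I') and cur_tag and cur_tag[1:] == tag[1:]:
            cur_tok += token  # extend the current entity
            continue
        if cur_tok is not None:
            words.append(cur_tok + '/' + cur_tag)
        cur_tok, cur_tag = token, tag  # start a new group
    if cur_tok is not None:
        words.append(cur_tok + '/' + cur_tag)
    return words

# group_bio(['北', '京', '人'], ['B-LOC', 'I-LOC', 'O']) -> ['北京/B-LOC', '人/O']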
Example n. 16
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " )
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--labels", default="", type=str,
                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument("--evaluate_during_training", action="store_true",
                        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action="store_true",
                        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size", default=12, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=48, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=30.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=5000,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=5000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action="store_true",
                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir", action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument("--overwrite_cache", action="store_true",
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")

    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", type=str, default="O1",
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    
    parser.add_argument("--info", default="question", type=str,
                        help="Information to be passed.")
    
#     parser.add_argument("--dataset_to_eval", default=None, type=str,
#                         help="Dataset which is being tested.")
    
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
                        datefmt="%m/%d/%Y %H:%M:%S",
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                          num_labels=num_labels)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path),
                                        config=config)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
            if global_step:
                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    test_data_dirs = ['AnatEM', 'BC2GM', 'BC4CHEMD', 'BC5CDR', 'BIONLP09', 'BIONLP11EPI', 'BIONLP11ID', 'BIONLP13CG', 'BIONLP13GE', 'BIONLP13PC', 'CRAFT', 'ExPTM', 'JNLPBA', 'Linnaeus', 'NCBIDisease', '2012TemporalRelations', '2010RelationsChallege', '2011CoreferenceChallenge']
#     test_data_dirs = [args.dataset_to_eval]
    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        base_data_dir = args.data_dir
        for data_dir in tqdm(test_data_dirs, desc="Data_Dir:"):
            dataset_test = data_dir
            print("Current Data Dir: " + data_dir)
            data_dir = os.path.join(base_data_dir, data_dir)
            print("Updated Data Dir: " + data_dir)
            args.data_dir = data_dir
            result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
            # Save results
            output_test_results_file = os.path.join(args.output_dir, dataset_test + "_" + args.info + "_test_results.txt")
            with open(output_test_results_file, "w") as writer:
                for key in sorted(result.keys()):
                    writer.write("{} = {}\n".format(key, str(result[key])))
            # Save predictions
            output_test_predictions_file = os.path.join(args.output_dir, dataset_test + "_" + args.info + "_test_predictions.jsonl")
            with jsonlines.open(output_test_predictions_file, "w") as writer:
                examples = read_examples_from_file_qa(args.data_dir, "test", args.info)
                for idx, example in enumerate(examples):
                    outrow = example.orig_row
                    outrow["predicted_label"] = predictions[idx]
                    outrow["input_words"] = example.words
                    outrow["labels"] = example.labels
                    writer.write(outrow)

    return results
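
The paired torch.distributed.barrier() calls in the example above implement the "first process downloads, the rest wait" pattern: non-master ranks block before the download, and rank 0 blocks after it, so the pretrained weights are fetched once and read from cache everywhere else. A sketch of the pattern in isolation (load_once is a hypothetical wrapper, not part of the original script):

import torch

def load_once(args, load_fn):
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # non-master ranks wait here
    obj = load_fn()                  # rank 0 (or a single process) downloads
    if args.local_rank == 0:
        torch.distributed.barrier()  # release the waiting ranks
    return obj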
Example n. 17
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Task embeddings
    parser.add_argument(
        "--num_softmax_classifiers",
        default=1,
        type=int,
        help="Number of softmax classifiers on top of Bert's output.")
    parser.add_argument("--pow",
                        type=float,
                        default=2.0,
                        help="Return features to the power pow.")
    parser.add_argument(
        "--feature_type",
        default='grads',
        type=str,
        help="The type of the features selected in ['grads', 'weights']")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size.")
    parser.add_argument(
        "--retain_gradients",
        default=True,
        type=eval,
        help=
        "Whether to retain gradients at each layer output of the feature extractor."
    )
    parser.add_argument("--do_pooling",
                        default=True,
                        type=eval,
                        help="Whether to pool the feature extractor.")
    parser.add_argument(
        "--use_labels",
        default=True,
        type=eval,
        help=
        "Whether to use training labels or sample from the model's predictive distribution \n"
        "pθ(y|xn), e.g., to compute the theoretical Fisher information.")
    parser.add_argument(
        "--num_trials_for_FIM",
        type=int,
        default=100,
        help=
        "Number of trials to sample from the model's predictive distribution pθ(y|xn)."
    )
    parser.add_argument(
        "--FIM_scale",
        type=float,
        default=0.25,
        help=
        "Standard deviation of the distribution used to compute the theoretical FIM."
    )
    parser.add_argument("--finetune_classifier",
                        default=False,
                        type=eval,
                        help="Whether to fine-tune the final classifier.")
    parser.add_argument("--finetune_feature_extractor",
                        default=False,
                        type=eval,
                        help="Whether to fine-tune the feature extractor.")

    ## Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--data_subset",
        type=int,
        default=-1,
        help="If > 0: limit the data to a subset of data_subset instances.")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--save',
        type=str,
        default='all',
        help="Select load mode from ['all', '0', '1', '2', '3', ...]")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Create output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    with open(os.path.join(args.output_dir, 'run_args.txt'), 'w') as f:
        f.write(json.dumps(args.__dict__, indent=2))

    # Setup CUDA, GPU training
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    if args.n_gpu > 1:
        raise ValueError("This code only supports a single GPU.")

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.warning("Device: %s, n_gpu: %s", device, args.n_gpu)

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    args.num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=args.num_labels,
        num_softmax_classifiers=args.num_softmax_classifiers,
        retain_gradients=args.retain_gradients,
        do_pooling=args.do_pooling,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    model.to(args.device)

    logger.info("List of model named parameters:")
    for n, p in list(model.named_parameters()):
        logger.info("%s", n)
    logger.info("Training/evaluation parameters %s", args)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
    tokenizer.save_pretrained(args.output_dir)

    train_dataset = load_and_cache_examples(args,
                                            tokenizer,
                                            labels,
                                            pad_token_label_id,
                                            mode="train")
    if args.data_subset > 0:
        train_dataset = Subset(
            train_dataset,
            list(range(min(args.data_subset, len(train_dataset)))))
    compute_taskemb(args, train_dataset, model)
Example n. 18
0
    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
}
config_class, model_class, tokenizer_class = MODEL_CLASSES['bert']
# location of the model of choice
model_dir = 'model/'
model = model_class.from_pretrained(model_dir)
case = False
tokenizer = tokenizer_class.from_pretrained(model_dir, do_lower_case=case)
device = 'cpu'
model.to(device)

# setting the folder for possible whole directory tagging
files_dir = 'data/'

# getting labels
labels = get_labels('labels.txt')
pad_token_label_id = CrossEntropyLoss().ignore_index

# reading examples
examples = read_examples_from_file('.', 'test')
features = convert_examples_to_features(
    examples,
    labels,
    256,
    tokenizer,
    cls_token_at_end=False,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,
    sep_token=tokenizer.sep_token,
    sep_token_extra=False,
    pad_on_left=False,
    # remaining padding arguments, restored from the identical call in Example n. 15
    pad_token=tokenizer.pad_token_id,
    pad_token_segment_id=tokenizer.pad_token_type_id,
    pad_token_label_id=pad_token_label_id,
)
Example n. 19
0
def main(args):
    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)

    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index  # -100 here

    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,do_lower_case=args.do_lower_case)
    config = BertConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels)

    src_datasets = []
    for src in args.src_langs:
        src_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, src, mode="train")
        src_datasets.append(src_dataset)

    # Training
    if args.do_train:
        logger.info("********** scheme: training with KD **********")

        # compute sentence embeddings of target training examples
        dataset_pt = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="train")

        src_probs, src_predictions = get_src_weighted_probs(args, dataset_pt, config, mode="train", src_datasets=src_datasets)

        # load target model (pretrained BERT) and tokenizer
        model = BertForTokenClassification_.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config)
        model.to(args.device)

        task_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="train")

        if args.train_hard_label:
            # update the src_probs as hard labels
            voting_labels = torch.argmax(src_probs, dim=-1).cpu()
            voting_labels[task_dataset.tensors[3] == pad_token_label_id] = pad_token_label_id
            task_dataset = TensorDataset(task_dataset.tensors[0], task_dataset.tensors[1], task_dataset.tensors[2], voting_labels)
            src_probs = None

        # Train!
        global_step, loss_KD, loss = train(args, model, task_dataset, src_probs, pad_token_label_id, src_predictions)
        logger.info(" global_step = %s, average task KD loss = %s, average task loss = %s", global_step, loss_KD, loss)

        # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    if args.do_voting:
        logger.info("********** scheme: Voting with %s **********", "averaging" if args.sim_dir == "" else "{}/rank_{}-gamma_{}".format(args.sim_dir, args.low_rank_size, args.gamma_R))
        test_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, args.tgt_lang, mode="test")

        src_probs, _ = get_src_weighted_probs(args, test_dataset, config, mode="test", src_datasets=src_datasets)

        result, predictions = weighted_voting(args, test_dataset, src_probs, labels, pad_token_label_id)
        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results-{}-{}.txt".format(
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()),
            os.path.basename(args.data_dir)))
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions-{}-{}.txt".format(
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()),
            os.path.basename(args.data_dir)))
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, args.tgt_lang, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split()[0] + " " + line.split()[-1].replace("\n", "") + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])

    if args.do_predict:
        logger.info("********** scheme: prediction **********")
        model = BertForTokenClassification_.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir, "test_results-{}-{}.txt".format(
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), args.tgt_lang))
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions-{}-{}.txt".format(
            time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()), args.tgt_lang))
        with open(output_test_predictions_file, "w", encoding='utf-8') as writer:
            with open(os.path.join(args.data_dir, args.tgt_lang, "test.txt"), "r", encoding='utf-8') as f:
                example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split()[0] + " " + line.split()[-1].replace("\n", "") + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
Example n. 20
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument(
        "--has_new_labels",
        action="store_true",
        help=
        "Tells the trainer that more labels are present than in the pretrained model."
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index
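    # (PyTorch's CrossEntropyLoss defaults to ignore_index = -100, so padded
    # sub-tokens labeled -100 simply drop out of the loss.)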

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )

    if args.has_new_labels and os.path.exists(args.model_name_or_path):
        old_config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        old_model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=old_config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        old_num_labels = old_config.num_labels
        new_labels = [
            label for label in labels
            if label not in old_config.label2id.keys()
        ]
        id2label = {
            **old_config.id2label,
            **{
                i + old_num_labels: label
                for i, label in enumerate(new_labels)
            }
        }
        label2id = {
            **old_config.label2id,
            **{
                label: i + old_num_labels
                for i, label in enumerate(new_labels)
            }
        }
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )

        model = model_class(config)

        params_old = old_model.named_parameters()
        params_new = model.named_parameters()
        dict_params_old = dict(params_old)
        dict_params_new = dict(params_new)
        old_labels = list(old_config.label2id.keys())
        new_labels = list(config.label2id.keys())
        logger.info("Old labels %s", old_labels)
        logger.info("New labels %s", new_labels)
        sort_keys = np.argsort(new_labels)
        for param_name, param_old in dict_params_old.items():
            if param_name in dict_params_new:
                if param_name.startswith('classifier'):
                    # not only do the dimensions differ when adding a new label
                    # but something resorts the labels (apparently alphabetically)
                    # and uses that resorted list to determine the matrix structure
                    # so we should apply that logic here explicitly
                    for target_idx, current_idx in enumerate(sort_keys):
                        target_label = new_labels[current_idx]
                        if target_label in old_labels:
                            idx_in_old_weights = old_labels.index(target_label)
                            dict_params_new[param_name].data[target_idx].copy_(
                                param_old.data[idx_in_old_weights])
                else:
                    dict_params_new[param_name].data.copy_(param_old.data)

        model.load_state_dict(dict_params_new)

    elif args.has_new_labels:
        raise Exception(
            "You have specified you're adding new labels but not provided a base model"
        )
    else:
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            id2label={str(i): label
                      for i, label in enumerate(labels)},
            label2id={label: i
                      for i, label in enumerate(labels)},
            cache_dir=args.cache_dir if args.cache_dir else None,
        )

        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w", encoding="utf-8") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       labels,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w", encoding="utf-8") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w",
                  encoding="utf-8") as writer:
            with open(os.path.join(args.data_dir, "test.txt"),
                      "r",
                      encoding="utf-8") as f:
                example_id = 0
                for line in f:
                    if line.startswith(
                            "-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split(
                        )[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])

    return results
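For reference, the --has_new_labels branch above reduces to copying rows of the
old classifier matrix into a new, alphabetically resorted one. A minimal
standalone sketch of just that remapping follows; the label lists and
hidden_size are made-up illustrations, not values taken from the example.

import numpy as np
import torch

old_labels = ["B-LOC", "B-PER", "O"]            # hypothetical old label set
new_labels = ["O", "B-LOC", "B-PER", "B-MISC"]  # hypothetical superset

hidden_size = 4
old_weight = torch.randn(len(old_labels), hidden_size)
new_weight = torch.zeros(len(new_labels), hidden_size)  # new rows stay untrained

# As in the example: row order in the new matrix follows the alphabetically
# sorted label list, and only labels that already existed get weights copied.
sort_keys = np.argsort(new_labels)
for target_idx, current_idx in enumerate(sort_keys):
    label = new_labels[current_idx]
    if label in old_labels:
        new_weight[target_idx].copy_(old_weight[old_labels.index(label)])
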
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--train_data_subset",
        type=int,
        default=-1,
        help=
        "If > 0: limit the training data to a subset of train_data_subset instances."
    )
    parser.add_argument(
        "--eval_data_subset",
        type=int,
        default=-1,
        help=
        "If > 0: limit the evaluation data to a subset of eval_data_subset instances."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--finetune_feature_extractor",
                        default=True,
                        type=eval,
                        help="Whether to fine-tune the feature extractor.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--model_load_mode',
        type=str,
        default='all',
        help="Select load mode from ['base_model_only', 'all']")
    parser.add_argument(
        '--save',
        type=str,
        default='all',
        help="Select load mode from ['all', '0', '1', '2', '3', ...]")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Create output directory if needed
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    with open(os.path.join(args.output_dir, 'run_args.txt'), 'w') as f:
        f.write(json.dumps(args.__dict__, indent=2))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    if args.model_load_mode not in ['base_model_only', 'all']:
        raise ValueError("Model load mode not found: %s" %
                         (args.model_load_mode))

    state_dict_with_prefix = None
    if args.model_load_mode == 'base_model_only':
        archive_file = os.path.join(args.model_name_or_path, WEIGHTS_NAME)
        model_state_dict = torch.load(archive_file)
        state_dict_with_prefix = {}
        for key, value in model_state_dict.items():
            if key.startswith(args.model_type):
                state_dict_with_prefix[key] = value

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
        state_dict=state_dict_with_prefix)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)
    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
    tokenizer.save_pretrained(args.output_dir)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        if args.train_data_subset > 0:
            train_dataset = Subset(
                train_dataset,
                list(range(min(args.train_data_subset, len(train_dataset)))))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        eval_list = []
        if args.save == 'all':
            eval_list = range(int(args.num_train_epochs + 1))
        elif str(args.save).isdigit():
            eval_list = [int(args.save)]
        if len(eval_list) > 0:
            for epoch in eval_list:
                checkpoints = [
                    os.path.join(args.output_dir,
                                 'checkpoint-{}'.format(epoch))
                ]
                if args.eval_all_checkpoints:
                    checkpoints = list(
                        os.path.dirname(c) for c in sorted(
                            glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                                      recursive=True)))
                    logging.getLogger(
                        "pytorch_transformers.modeling_utils").setLevel(
                            logging.WARN)  # Reduce logging
                logger.info("Evaluate the following checkpoints: %s",
                            checkpoints)
                for checkpoint in checkpoints:
                    global_step = checkpoint.split(
                        '-')[-1] if len(checkpoints) > 1 else ""
                    prefix = checkpoint.split(
                        '/')[-1] if checkpoint.find('checkpoint') != -1 else ""
                    model = model_class.from_pretrained(checkpoint)
                    tokenizer = tokenizer_class.from_pretrained(
                        args.output_dir, do_lower_case=args.do_lower_case)

                    model.to(args.device)
                    result, _ = evaluate(args,
                                         model,
                                         tokenizer,
                                         labels,
                                         pad_token_label_id,
                                         mode="dev",
                                         checkpoint_id=epoch,
                                         prefix=prefix)
                    result = dict((k + '_{}'.format(global_step), v)
                                  for k, v in result.items())
                    results.update(result)
                output_eval_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "w") as writer:
                    for key in sorted(results.keys()):
                        writer.write("{} = {}\n".format(
                            key, str(results[key])))

    return results
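The 'base_model_only' load mode above keeps only the weights whose keys start
with the model type, so the token-classification head is re-initialized. A
hedged standalone sketch of the same state-dict filtering (the file name and
the "bert" prefix are illustrative assumptions, not taken from the example):

import torch

state_dict = torch.load("pytorch_model.bin", map_location="cpu")
base_only = {k: v for k, v in state_dict.items() if k.startswith("bert")}
# Passing state_dict=base_only to model_class.from_pretrained(...) loads the
# encoder while leaving the classifier layer freshly initialized.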
Example no. 22
0
 def __init__(self, hparams):
     self.labels = get_labels(hparams.labels)
     num_labels = len(self.labels)
     super(NERTransformer, self).__init__(hparams, num_labels)
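As an aside, the prediction writers in these examples consume CoNLL-style data:
one token per line with its tag in the last column, blank lines between
sentences, and optional -DOCSTART- markers. A hypothetical fragment of a
test.txt file in that layout:

-DOCSTART- O

John B-PER
lives O
in O
Paris B-LOC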
Example no. 23
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Token Classification task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForTokenClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    train_dataset = (
        TFNerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        TFNerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
        preds = np.argmax(predictions, axis=2)
        batch_size, seq_len = preds.shape
        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != -1:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)

        return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset.get_dataset() if train_dataset else None,
        eval_dataset=eval_dataset.get_dataset() if eval_dataset else None,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")

            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = TFNerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset.get_dataset())
        preds_list, labels_list = align_predictions(predictions, label_ids)
        report = classification_report(labels_list, preds_list)

        logger.info("\n%s", report)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")

        with open(output_test_results_file, "w") as writer:
            writer.write("%s\n" % report)

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")

        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
                example_id = 0

                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)

                        if not preds_list[example_id]:
                            example_id += 1
                    elif preds_list[example_id]:
                        output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"

                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])

    return results
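A worked toy run of align_predictions above (all numbers made up): two
sentences, three positions, three labels, with -1 marking padded positions as
in this TF example.

import numpy as np

label_map = {0: "O", 1: "B-PER", 2: "I-PER"}      # hypothetical label map
predictions = np.array([[[2.0, 0.1, 0.0],         # argmax 0 -> "O"
                         [0.0, 3.0, 0.1],         # argmax 1 -> "B-PER"
                         [0.0, 0.1, 2.5]],        # argmax 2 -> "I-PER"
                        [[1.0, 0.2, 0.1],
                         [0.5, 0.1, 0.0],
                         [0.0, 0.0, 0.0]]])
label_ids = np.array([[0, 1, 2],
                      [0, 0, -1]])                # last position is padding

preds = np.argmax(predictions, axis=2)
preds_list = [[label_map[p] for p, l in zip(ps, ls) if l != -1]
              for ps, ls in zip(preds, label_ids)]
# preds_list == [["O", "B-PER", "I-PER"], ["O", "O"]]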
Example no. 24
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
            os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    tui_ids = None
    if data_args.umls:
        tui_ids = create_cui_dict(voc_updated=data_args.med_document, tokenizer=tokenizer)

    # Get datasets
    train_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
        return {
            "precision": precision_score(out_label_list, preds_list),
            "recall": recall_score(out_label_list, preds_list),
            "f1": f1_score(out_label_list, preds_list),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            labels=labels,
            tui_ids=tui_ids,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, _ = align_predictions(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
                    example_id = 0
                    for line in f:
                        try:
                            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                                writer.write(line)
                                if not preds_list[example_id]:
                                    example_id += 1
                            elif preds_list[example_id]:
                                output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                                writer.write(output_line)
                            else:
                                logger.warning(
                                    "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]
                                )
                        except IndexError:  # no more predictions left for this file
                            break

    return results
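The precision/recall/F1 used in compute_metrics appear to be seqeval's
entity-level metrics (as in the upstream run_ner scripts): a span only counts
if it matches the gold span exactly. A small hedged illustration:

from seqeval.metrics import f1_score, precision_score, recall_score

y_true = [["B-PER", "I-PER", "O"]]
y_pred = [["B-PER", "O", "O"]]
# The predicted PER span covers one token, the gold span two, so nothing
# matches exactly: precision, recall, and F1 are all 0.0 here.
print(precision_score(y_true, y_pred), recall_score(y_true, y_pred), f1_score(y_true, y_pred))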
Example no. 25
0
 def __init__(self, hparams):
     self.labels = get_labels(hparams.labels)
     num_labels = len(self.labels)
     self.pad_token_label_id = CrossEntropyLoss().ignore_index
     super(NERTransformer, self).__init__(hparams, num_labels, self.mode)
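A quick check of the pad_token_label_id convention used throughout these
examples: positions labeled with CrossEntropyLoss().ignore_index (-100)
contribute nothing, and the loss is averaged over the remaining tokens only.

import torch
from torch.nn import CrossEntropyLoss

loss_fct = CrossEntropyLoss()
logits = torch.tensor([[2.0, 0.1], [0.3, 1.5], [0.0, 0.0]])
labels = torch.tensor([0, 1, loss_fct.ignore_index])  # third token is padding
loss = loss_fct(logits, labels)                       # averaged over two tokens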
Example no. 26
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=10000000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")

    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the ud2: ",
    )
    parser.add_argument(
        "--base_data_dir",
        default=None,
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--data_size",
        default=None,
        type=int,
        help=
        "Data_size for Shapley Training (None for full data), e.g., 100 for debug ",
    )
    parser.add_argument(
        "--indices_to_delete_file_path",
        default=None,
        type=str,
        help="File path where the ids are to delete",
    )
    parser.add_argument(
        "--domain_to_delete_file_path",
        default=None,
        type=str,
        help="File path where the ids are to delete",
    )
    parser.add_argument(
        "--is_baseline_run",
        action="store_true",
        help="Is the baseline run to get result, data_size, random/init score",
    )
    parser.add_argument(
        "--LOO",
        action="store_true",
        help="Whether to calculate LOO or not?",
    )

    parser.add_argument(
        "--is_few_shot",
        action="store_true",
        help="Whether to calculate LOO or not?",
    )

    parser.add_argument(
        "--num_bags",
        default=20,
        type=int,
        help="How many bags to approximate the mean performance ",
    )

    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    args.base_data_dir = args.data_dir

    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:

        # old code
        # train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
        # global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
        # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # new code
        ALL_BINARY_TASKS = [
            'UD_ARABIC', 'UD_BASQUE', 'UD_BULGARIAN', 'UD_CATALAN',
            'UD_CHINESE', 'UD_CROATIAN', 'UD_CZECH', 'UD_DANISH', 'UD_DUTCH',
            'UD_ENGLISH', 'UD_FINNISH', 'UD_FRENCH', 'UD_GERMAN', 'UD_HEBREW',
            'UD_HINDI', 'UD_INDONESIAN', 'UD_ITALIAN', 'UD_JAPANESE',
            'UD_KOREAN', 'UD_NORWEGIAN', 'UD_PERSIAN', 'UD_POLISH',
            'UD_PORTUGUESE', 'UD_ROMANIAN', 'UD_RUSSIAN', 'UD_SERBIAN',
            'UD_SLOVAK', 'UD_SLOVENIAN', 'UD_SPANISH', 'UD_SWEDISH',
            'UD_TURKISH'
        ]

        ALL_BINARY_TASKS.remove(args.task_name)

        logger.info(
            " ALL_BINARY_TASKS = %s, task = %s, args.indices_to_delete_file_path = %s",
            ALL_BINARY_TASKS, args.task_name, args.indices_to_delete_file_path)
        if args.indices_to_delete_file_path:
            with open(args.indices_to_delete_file_path, "r") as reader:
                print("***** reading ids to remove *****", flush=True)
                data = reader.read().split()  # split on any whitespace; replacing '\n' with '' could merge ids across lines
                ids = np.array([int(i) for i in data])
                logger.info(" Data = %s, ids = %s", data, str(ids))
                ALL_BINARY_TASKS = np.delete(np.array(ALL_BINARY_TASKS),
                                             ids,
                                             axis=0)
                logger.info(" After delete ALL_BINARY_TASKS = %s",
                            ALL_BINARY_TASKS)

        ALL_BINARY_TASKS = np.random.permutation(ALL_BINARY_TASKS)
        logger.info(" After Permutation ALL_BINARY_TASKS = %s",
                    ALL_BINARY_TASKS)
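        # Pool the training sets of all remaining tasks: the first task initializes
        # train_dataset and each later task is concatenated onto it (this assumes at
        # least one task survives the filtering above).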
        if len(ALL_BINARY_TASKS) > 0:
            train_dataset, random_init_result, n_train_points = \
                load_and_cache_examples(args, ALL_BINARY_TASKS[0], tokenizer, labels, pad_token_label_id, mode="train")

        for task in ALL_BINARY_TASKS[1:]:
            train_dataset2, random_init_result2, n_train_points2 = \
                load_and_cache_examples(args, task, tokenizer, labels, pad_token_label_id, mode="train")
            train_dataset += train_dataset2
            n_train_points += n_train_points2

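        # Few-shot setting: additionally fold the target task's dev split into training.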
        if args.is_few_shot:
            train_dataset2, random_init_result2, n_train_points2 = \
                load_and_cache_examples(args, args.task_name, tokenizer, labels, pad_token_label_id, mode="dev")
            train_dataset += train_dataset2
            n_train_points += n_train_points2

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Creating Empty Eval results file *****")

        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)

        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

        output_eval_file = os.path.join(args.output_dir,
                                        "training_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing training dataset size *****")
            logger.info("%s = %s", 'n_points', n_train_points)
            writer.write("%s = %s\n" % ('n_points', n_train_points))

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                if results[key]:
                    writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        results, (predictions, gold_labels) = evaluate(args,
                                                       model,
                                                       tokenizer,
                                                       labels,
                                                       pad_token_label_id,
                                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(results.keys()):
                if results[key]:
                    writer.write("{} = {}\n".format(key, str(results[key])))
        # Save predictions

        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        output_test_gold_file = os.path.join(args.output_dir, "test_gold.txt")
        pred_count = 0
        with open(output_test_predictions_file, "w") as writer:
            with open(output_test_gold_file, "w") as f:
                for gold_line, pred_line in zip(gold_labels, predictions):
                    for gold, pred in zip(gold_line, pred_line):
                        f.write(gold + "\n")
                        writer.write(pred + "\n")
                        pred_count += 1
        logger.info(
            "Wrote %s sentences to the gold file and %s to the prediction file (%d words in total).",
            len(gold_labels), len(predictions), pred_count)

    return results
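
A minimal, self-contained sketch of the task-filtering step above, useful for testing it in isolation (the function name and file path are illustrative, not part of the original script):

import numpy as np

def remaining_tasks(all_tasks, indices_path=None, seed=42):
    """Drop the tasks whose positions are listed in indices_path, then permute the rest."""
    tasks = np.array(all_tasks)
    if indices_path:
        with open(indices_path) as reader:
            # One id per whitespace-separated token, as in the training branch above.
            ids = np.array([int(i) for i in reader.read().split()])
        tasks = np.delete(tasks, ids, axis=0)
    np.random.seed(seed)
    return np.random.permutation(tasks)

# e.g. remaining_tasks(['UD_ARABIC', 'UD_BASQUE', 'UD_CATALAN'], 'ids_to_delete.txt')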
Esempio n. 27
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default="../mid_out1/",
        type=str,
        required=False,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default="bert",
        type=str,
        required=False,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default="bert-base-uncased",
        # default="ner_out/run3/",
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default="./ner_out/bert_02",
        type=str,
        required=False,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    # Label scheme selector: 0 = 0/1 tagging, 1 = BIO, 2 = BIOE
    parser.add_argument(
        "--labels",
        default=2,
        type=int,
        help="Label scheme selector: 0 for 0/1 tagging, 1 for BIO, 2 for BIOE.",
    )
    parser.add_argument(
        "--config_name",
        # default="bert-base-uncased",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        # default="bert-base-uncased",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="./cach/",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=80,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
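    # NOTE: default=True combined with action="store_true" makes the flags below
    # effectively always on; they can only be disabled by editing these defaults.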
    parser.add_argument("--do_train",
                        default=True,
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        default=True,
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=32,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=64,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=5.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=10000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        # default=True,
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument(
        "--no_cuda",
        # default=True,
        action="store_true",
        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        default=True,
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        default=True,
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=37,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset, train_examples = load_and_cache_examples(
            args, tokenizer, labels, pad_token_label_id, mode="train")
        data_stitic(train_examples, mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _, dev_examples = evaluate(args,
                                               model,
                                               tokenizer,
                                               labels,
                                               pad_token_label_id,
                                               mode="dev",
                                               prefix=global_step)
            data_stitic(dev_examples, "dev")
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
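        # NOTE: this predict branch evaluates mode="dev", even though its outputs
        # are written to test_results.txt below.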
        result, predictions, examples = evaluate(args,
                                                 model,
                                                 tokenizer,
                                                 labels,
                                                 pad_token_label_id,
                                                 mode="dev")
        data_stitic(examples, "test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # Save predictions
        spans = get_propgranda_span(predictions, examples)
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "BIOE_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            for span in spans:
                out_str = span[0] + "\t" + str(span[1]) + "\t" + str(
                    span[2]) + "\n"
                writer.write(out_str)

    return results
Esempio n. 28
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Prepare CONLL-2003 task
    labels = get_labels(data_args.labels)
    label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)}
    num_labels = len(labels)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        id2label=label_map,
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )
    model = AutoModelForTokenMultiLabelClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
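    # NerDataset (with multilabeling=True) and AutoModelForTokenMultiLabelClassification
    # appear to be project-specific multi-label extensions of the stock
    # token-classification dataset and model classes.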

    # Get datasets
    train_dataset = (NerDataset(
        data_dir=data_args.data_dir,
        data_format=data_args.data_format,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        multilabeling=True,
    ) if training_args.do_train else None)
    eval_dataset = (NerDataset(
        data_dir=data_args.data_dir,
        data_format=data_args.data_format,
        tokenizer=tokenizer,
        labels=labels,
        model_type=config.model_type,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
        multilabeling=True,
    ) if training_args.do_eval else None)

    def get_label_preds_refs(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        """ Returns a list of labels for each token in each sequence in the dataset. """
        logit_threshold = 0.0  # Corresponds to a probability of 0.5 if fed through a sigmoid.
        preds = predictions > logit_threshold

        batch_size, seq_len, _ = preds.shape

        refs_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                preds_list[i].append(
                    [label_map[x] for x in np.where(preds[i][j] == 1)[0]])
                refs_list[i].append(
                    [label_map[x] for x in np.where(label_ids[i][j] == 1)[0]])

        return preds_list, refs_list

    def align_predictions(
            predictions: np.ndarray,
            label_ids: np.ndarray) -> Tuple[List[List[str]], List[List[str]]]:
        logit_threshold = 0.0  # Corresponds to a probability of 0.5 if fed through a sigmoid.
        preds = predictions > logit_threshold
        batch_size, seq_len, _ = preds.shape

        # is_tagged indicates for each token whether it has an associated tag (i.e. a
        # label, including the O label) and should be assessed, otherwise it's
        # a padding or special token.
        is_tagged = label_ids.sum(axis=2) > 0

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]

        for i in range(batch_size):
            for j in range(seq_len):
                if is_tagged[i, j]:
                    out_label_list[i].append([
                        label_map[x] for x in np.where(label_ids[i][j] == 1)[0]
                    ])
                    preds_list[i].append(
                        [label_map[x] for x in np.where(preds[i][j] == 1)[0]])

        return preds_list, out_label_list

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds_list, out_label_list = align_predictions(p.predictions,
                                                       p.label_ids)
        (chunk_prec, chunk_rec, chunk_f1, tok_prec, tok_rec,
         tok_f1) = fsn4nlp.utils.conlleval.evaluate_multilabel(
             out_label_list, preds_list)
        return {
            "chunk_precision": chunk_prec,
            "chunk_recall": chunk_rec,
            "chunk_f1": chunk_f1,
            "tok_precision": tok_prec,
            "tok_recall": tok_rec,
            "tok_f1": tok_f1,
        }

    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      labels=labels)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

            results.update(result)

    # Predict
    if training_args.do_predict:
        test_dataset = NerDataset(
            data_dir=data_args.data_dir,
            data_format=data_args.data_format,
            tokenizer=tokenizer,
            labels=labels,
            model_type=config.model_type,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.test,
            multilabeling=True,
        )

        predictions, label_ids, metrics = trainer.predict(test_dataset)
        preds_list, refs_list = get_label_preds_refs(predictions, label_ids)

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_master():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_master():
            with open(output_test_predictions_file, "w") as writer:
                for i, example in enumerate(test_dataset):
                    for tok_id in example.input_ids:
                        tok = tokenizer.convert_ids_to_tokens(tok_id)
                        if refs_list[i][0] == []:
                            output_line = f"{tok}\n"
                            refs_list[i].pop(0)
                        else:
                            output_line = f"{tok} {refs_list[i].pop(0)} {preds_list[i].pop(0)}\n"
                        writer.write(output_line)

    return results
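
A quick, self-contained sanity check of the logit-thresholding logic shared by get_label_preds_refs and align_predictions above; the shapes and label names are toy values, not taken from the task:

import numpy as np

label_map = {0: "O", 1: "B-PER", 2: "I-PER"}

# One sequence, three tokens, three labels: raw logits per label.
predictions = np.array([[[1.2, -0.3, 0.4],
                         [-2.0, 3.1, -0.5],
                         [0.0, 0.0, 0.0]]])
# Multi-hot references; the all-zero row marks a padding/special token.
label_ids = np.array([[[1, 0, 0],
                       [0, 1, 0],
                       [0, 0, 0]]])

preds = predictions > 0.0  # logit 0.0 corresponds to probability 0.5 under a sigmoid
is_tagged = label_ids.sum(axis=2) > 0

for i in range(preds.shape[0]):
    for j in range(preds.shape[1]):
        if is_tagged[i, j]:
            pred_tags = [label_map[x] for x in np.where(preds[i][j])[0]]
            ref_tags = [label_map[x] for x in np.where(label_ids[i][j] == 1)[0]]
            print(pred_tags, ref_tags)
# -> ['O', 'I-PER'] ['O']
#    ['B-PER'] ['B-PER']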
Esempio n. 29
def main(_):
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if (os.path.exists(args["output_dir"]) and os.listdir(args["output_dir"])
            and args["do_train"] and not args["overwrite_output_dir"]):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args["output_dir"]))

    if args["fp16"]:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    if args["tpu"]:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=args["tpu"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args["n_device"] = args["num_tpu_cores"]
    elif len(args["gpus"].split(",")) > 1:
        args["n_device"] = len(
            [f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
        strategy = tf.distribute.MirroredStrategy(
            devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
    elif args["no_cuda"]:
        args["n_device"] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
        args["n_device"] = len(args["gpus"].split(","))
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" +
                                                   args["gpus"].split(",")[0])

    logging.warning(
        "n_device: %s, distributed training: %s, 16-bits training: %s",
        args["n_device"],
        bool(args["n_device"] > 1),
        args["fp16"],
    )

    labels = get_labels(args["labels"])
    num_labels = len(labels)
    pad_token_label_id = -1  # the TF examples mask padding with -1 rather than CrossEntropyLoss().ignore_index
    # IOB tagging scheme
    print(args["config_name"]
          if args["config_name"] else args["model_name_or_path"])
    config = AutoConfig.from_pretrained(
        args["config_name"]
        if args["config_name"] else args["model_name_or_path"],
        num_labels=num_labels,
        cache_dir=args["cache_dir"],
    )

    logging.info("Training/evaluation parameters %s", args)
    args["model_type"] = config.model_type

    # Training
    if args["do_train"]:
        tokenizer = AutoTokenizer.from_pretrained(
            args["tokenizer_name"]
            if args["tokenizer_name"] else args["model_name_or_path"],
            do_lower_case=args["do_lower_case"],
            cache_dir=args["cache_dir"],
        )

        with strategy.scope():
            model = TFAutoModelForTokenClassification.from_pretrained(
                args["model_name_or_path"],
                from_pt=bool(".bin" in args["model_name_or_path"]),
                config=config,
                cache_dir=args["cache_dir"],
            )

        train_batch_size = args["per_device_train_batch_size"] * args[
            "n_device"]
        train_dataset, num_train_examples = load_and_cache_examples(
            args,
            tokenizer,
            labels,
            pad_token_label_id,
            train_batch_size,
            mode="train")
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        train(
            args,
            strategy,
            train_dataset,
            tokenizer,
            model,
            num_train_examples,
            labels,
            train_batch_size,
            pad_token_label_id,
        )

        os.makedirs(args["output_dir"], exist_ok=True)

        logging.info("Saving model to %s", args["output_dir"])

        model.save_pretrained(args["output_dir"])
        tokenizer.save_pretrained(args["output_dir"])

    # Evaluation
    if args["do_eval"]:
        tokenizer = AutoTokenizer.from_pretrained(
            args["output_dir"], do_lower_case=args["do_lower_case"])
        checkpoints = []
        results = []

        if args["eval_all_checkpoints"]:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args["output_dir"] + "/**/" + TF2_WEIGHTS_NAME,
                              recursive=True),
                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
                ))
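            # Checkpoint dirs are ordered by the step number embedded in their names;
            # names without digits sort first (key -1).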

        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
            checkpoints.append(args["output_dir"])

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(
                ".*checkpoint-[0-9]", checkpoint) else "final"

            with strategy.scope():
                model = TFAutoModelForTokenClassification.from_pretrained(
                    checkpoint)

            y_true, y_pred, eval_loss = evaluate(args,
                                                 strategy,
                                                 model,
                                                 tokenizer,
                                                 labels,
                                                 pad_token_label_id,
                                                 mode="dev")
            report = metrics.classification_report(y_true, y_pred, digits=4)

            if global_step:
                results.append({
                    global_step + "_report": report,
                    global_step + "_loss": eval_loss
                })

        output_eval_file = os.path.join(args["output_dir"], "eval_results.txt")

        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            for res in results:
                for key, val in res.items():
                    if "loss" in key:
                        logging.info(key + " = " + str(val))
                        writer.write(key + " = " + str(val))
                        writer.write("\n")
                    else:
                        logging.info(key)
                        # Use this entry's report (val), not the report left over
                        # from the last checkpoint evaluated.
                        logging.info("\n" + val)
                        writer.write(key + "\n")
                        writer.write(val)
                        writer.write("\n")

    if args["do_predict"]:
        tokenizer = AutoTokenizer.from_pretrained(
            args["output_dir"], do_lower_case=args["do_lower_case"])
        model = TFAutoModelForTokenClassification.from_pretrained(
            args["output_dir"])
        eval_batch_size = args["per_device_eval_batch_size"] * args["n_device"]
        predict_dataset, _ = load_and_cache_examples(args,
                                                     tokenizer,
                                                     labels,
                                                     pad_token_label_id,
                                                     eval_batch_size,
                                                     mode="test")
        y_true, y_pred, pred_loss = evaluate(args,
                                             strategy,
                                             model,
                                             tokenizer,
                                             labels,
                                             pad_token_label_id,
                                             mode="test")
        output_test_results_file = os.path.join(args["output_dir"],
                                                "test_results.txt")
        output_test_predictions_file = os.path.join(args["output_dir"],
                                                    "test_predictions.txt")
        report = metrics.classification_report(y_true, y_pred, digits=4)

        with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
            report = metrics.classification_report(y_true, y_pred, digits=4)

            logging.info("\n" + report)

            writer.write(report)
            writer.write("\n\nloss = " + str(pred_loss))

        with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
            with tf.io.gfile.GFile(os.path.join(args["data_dir"], "test.txt"),
                                   "r") as f:
                example_id = 0

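                # Walk the original test file in parallel with the predictions: blank
                # lines and -DOCSTART- delimit sentences, and each remaining token gets
                # the next predicted tag popped from y_pred[example_id].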
                for line in f:
                    if line.startswith(
                            "-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)

                        if not y_pred[example_id]:
                            example_id += 1
                    elif y_pred[example_id]:
                        output_line = line.split(
                        )[0] + " " + y_pred[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logging.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])
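
The digit-extraction sort key used with --eval_all_checkpoints above is easy to sanity-check on its own (the directory names are illustrative):

paths = ["out/checkpoint-500", "out/checkpoint-1500", "out/final"]
step = lambda f: int("".join(filter(str.isdigit, f)) or -1)
print(sorted(paths, key=step))
# -> ['out/final', 'out/checkpoint-500', 'out/checkpoint-1500']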
Esempio n. 30
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default="./data/ner/conll/debug",
        type=str,  # required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task."
    )
    parser.add_argument(
        "--output_dir",
        default="conll-model/test",
        type=str,  # required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    parser.add_argument("--src_model_dir",
                        default="conll-model-22/",
                        type=str,
                        help="path to load teacher models")
    parser.add_argument(
        "--src_model_dir_prefix",
        default="mono-src-",
        type=str,
        help="prefix of the teacher model dir (to indicate the model type)")
    parser.add_argument("--src_langs",
                        type=str,
                        nargs="+",
                        default="en",
                        help="source languages used for multi-teacher models")

    parser.add_argument("--unlabeled_data_ratio",
                        type=float,
                        default=1.0,
                        help="Ratio of the training data to use.")

    ## Other parameters
    parser.add_argument(
        "--model_type",
        default='bert',
        type=str,  # required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default='bert-base-multilingual-cased',
        type=str,  # required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--labels",
        default="./data/ner/conll/labels.txt",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")

    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_KD",
                        action="store_true",
                        help="Whether to train with knowledge distillation.")
    parser.add_argument("--hard_label", action="store_true")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=32,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=32,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--freeze_bottom_layer",
        default=3,
        type=int,
        help="Freeze the bottom n layers of the model during fine-tuning.")
    parser.add_argument("--num_train_epochs",
                        default=3,
                        type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_ratio",
                        default=0.1,
                        type=float,
                        help="Linear warmup over warmup_ratio.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=20,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=20000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=667,
                        help="random seed for initialization")
    parser.add_argument("--gpu_ids",
                        type=int,
                        nargs="+",
                        default=[0],  # a list, so len(args.gpu_ids) works when the flag is omitted
                        help="ids of the gpus to use")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    # Check output_dir
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and os.path.exists(
                os.path.join(args.output_dir,
                             "pytorch_model.bin")) and os.path.basename(
                                 args.output_dir) != "test":
        raise ValueError(
            "Train: Output directory already exists and is not empty.")

    if os.path.exists(args.output_dir) and args.do_predict:
        is_done = False
        for name in os.listdir(
                args.output_dir):  # result file: "test_results-TIME-LANGUAGE"
            if "test_results" in name and (os.path.basename(args.data_dir) +
                                           ".txt") in name:
                is_done = True
                break
        if is_done:
            raise ValueError(
                "Predict: Output directory ({}) already exists and is not empty."
                .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        args.n_gpu = len(args.gpu_ids)
        # NOTE: only the first requested GPU is exposed here, even though n_gpu
        # counts all of them.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_ids[0])
        device = torch.device("cuda")
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
    args.log_dir = os.path.join(args.output_dir, "logs")
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    formatter = logging.Formatter('%(asctime)s %(levelname)s: - %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    log_name = "log-{}".format(
        time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()))
    if args.do_train:
        log_name += "-train"
    if args.do_predict:
        log_name += "-predict"
    log_name += "-{}".format("_".join(args.src_langs))
    log_name += "-{}.txt".format(os.path.basename(args.data_dir))
    fh = logging.FileHandler(os.path.join(args.log_dir, log_name))
    fh.setLevel(logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    fh.setFormatter(formatter)
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index  # -100 here

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()

    logger.info("Training/evaluation parameters %s", args)

    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Training
    if args.do_train:
        # Load the config, tokenizer, and target model
        config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels)
        tokenizer = tokenizer_class.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config)

        if args.local_rank == 0:
            # Rank 0 reaches the barrier last, releasing the waiting processes
            # now that the model & vocab are cached
            torch.distributed.barrier()

        model.to(args.device)

        # Prepare the target-language training data
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")

        if args.do_KD:
            logger.info("********** scheme: training with KD **********")

            # Compute token-level probabilities from the source-language (teacher) models
            w = 1.0 / len(args.src_langs)
            weight_probs = {l: w for l in args.src_langs}
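            # Uniform weighting: with args.src_langs = ["en", "de", "es"]
            # (illustrative), each teacher contributes w = 1/3.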

            src_probs = None
            for lang in args.src_langs:
                if src_probs is None:
                    src_probs = weight_probs[lang] * get_src_probs(
                        args, train_dataset, model_class, src_lang=lang)
                else:
                    src_probs += weight_probs[lang] * get_src_probs(
                        args, train_dataset, model_class, src_lang=lang)
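            # src_probs now holds the weighted average of the teachers' output
            # distributions, presumably one probability vector per token
            # position (see get_src_probs).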

            # Train!
            if args.hard_label:
                hard_labels = torch.argmax(src_probs, dim=-1, keepdim=False)
                train_dataset.tensors += (hard_labels, )
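                # argmax converts the averaged soft labels into hard
                # pseudo-labels; TensorDataset.tensors is a plain tuple
                # attribute, so += rebinds it with the extra tensor appended.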

                global_step, tr_loss = train(args, model, train_dataset,
                                             tokenizer, labels,
                                             pad_token_label_id)
                logger.info(" global_step = %s, average loss = %s",
                            global_step, tr_loss)
            else:
                global_step, tr_loss_KD, tr_loss = train_KD(
                    args, model, train_dataset, src_probs, tokenizer, labels,
                    pad_token_label_id)
                logger.info(
                    " global_step = %s, average KD loss = %s, average loss = %s",
                    global_step, tr_loss_KD, tr_loss)
        else:
            logger.info("********** scheme: training without KD **********")
            global_step, tr_loss = train(args, model, train_dataset, tokenizer,
                                         labels, pad_token_label_id)
            logger.info(" global_step = %s, average loss = %s", global_step,
                        tr_loss)

    # Saving best practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
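        # A minimal sketch of reloading the checkpoint later, using the
        # standard transformers API:
        #   model = model_class.from_pretrained(args.output_dir)
        #   tokenizer = tokenizer_class.from_pretrained(args.output_dir)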

    if args.do_predict and args.local_rank in [-1, 0]:
        logger.info("********** scheme: prediction **********")
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       labels,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(
            args.output_dir, "test_results-{}-{}.txt".format(
                time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()),
                os.path.basename(args.data_dir)))
        with open(output_test_results_file, "w", encoding='utf-8') as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(
            args.output_dir, "test_predictions-{}-{}.txt".format(
                time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime()),
                os.path.basename(args.data_dir)))
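        # Each emitted line follows the CoNLL convention
        # "token gold_label predicted_label", e.g. "Paris B-LOC B-LOC"
        # (labels illustrative).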
        with open(output_test_predictions_file, "w",
                  encoding='utf-8') as writer:
            with open(os.path.join(args.data_dir, "test.txt"),
                      "r",
                      encoding='utf-8') as f:
                example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line in ("", "\n"):
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        token = line.split()[0]
                        gold_label = line.split()[-1].rstrip("\n")
                        output_line = "{} {} {}\n".format(
                            token, gold_label, predictions[example_id].pop(0))
                        writer.write(output_line)
                    else:
                        logger.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])