def load_and_cache_examples(args, task, tokenizer):
    # similar to that in main.py
    processor = ABSAProcessor()
    # Load test-split features from cache, or build and cache them from the dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task)))
    if os.path.exists(cached_features_file):
        print("cached_features_file:", cached_features_file)
        features = torch.load(cached_features_file)
        examples = processor.get_test_examples(args.data_dir,
                                               args.tagging_schema)
    else:
        #logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels(args.tagging_schema)
        examples = processor.get_test_examples(args.data_dir,
                                               args.tagging_schema)
        features = convert_examples_to_seq_features(
            examples=examples,
            label_list=label_list,
            tokenizer=tokenizer,
            cls_token_at_end=bool(args.model_type in ['xlnet']),
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        torch.save(features, cached_features_file)
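    # Keep the raw whitespace-tokenized words of every example; they are
    # returned to the caller together with the tensorized dataset.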
    total_words = []
    for input_example in examples:
        text = input_example.text_a
        total_words.append(text.split(' '))

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)

    all_label_ids = torch.tensor([f.label_ids for f in features],
                                 dtype=torch.long)
    # used in evaluation
    all_evaluate_label_ids = [f.evaluate_label_ids for f in features]
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset, all_evaluate_label_ids, total_words
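A minimal usage sketch (not part of the original file): it wraps the returned TensorDataset in a sequential DataLoader for evaluation. The helper name build_test_loader and the eval_batch_size parameter are assumptions, not names taken from the repo.

from torch.utils.data import DataLoader, SequentialSampler

def build_test_loader(args, task, tokenizer, eval_batch_size=32):
    # eval_batch_size is a hypothetical argument, not an original args field
    dataset, evaluate_label_ids, total_words = load_and_cache_examples(args, task, tokenizer)
    sampler = SequentialSampler(dataset)  # keep example order stable for evaluation
    loader = DataLoader(dataset, sampler=sampler, batch_size=eval_batch_size)
    return loader, evaluate_label_ids, total_words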
Example #2
def convert_to_dataset(args, examples, tokenizer):
    processor = ABSAProcessor()
    label_list = processor.get_labels(args.tagging_schema)
    normal_labels = processor.get_normal_labels(args.tagging_schema)
    features, imp_words = convert_examples_to_seq_features(
        examples=examples,
        label_list=(label_list, normal_labels),
        tokenizer=tokenizer,
        cls_token_at_end=False,
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=0,
        pad_on_left=False,
        pad_token_segment_id=0)
    idxs = torch.arange(len(features))
    dataset = ABSADataset(features, idxs)
    return dataset
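For reference, main() below uses this helper to serialize adversarially perturbed examples. A condensed, hedged sketch of that call pattern follows; adv_examples stands in for the list produced by Adversary.generate_adv_examples, and args/tokenizer are assumed to be in scope.

import os
import torch

adv_dataset = convert_to_dataset(args, adv_examples, tokenizer)  # adv_examples: placeholder list
output_dir = f'{args.task_name}_adv'        # same naming convention as in main()
os.makedirs(output_dir, exist_ok=True)
torch.save(adv_dataset, f'{output_dir}/train.pth')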
Example #3
def main():
    args = init_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        os.environ['MASTER_ADDR'] = args.MASTER_ADDR
        os.environ['MASTER_PORT'] = args.MASTER_PORT
        torch.distributed.init_process_group(backend='nccl',
                                             rank=args.local_rank,
                                             world_size=1)
        args.n_gpu = 1

    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # not using 16-bits training
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: False",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    # Set seed
    set_seed(args)

    # Prepare task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % args.task_name)
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels(args.tagging_schema)
    num_labels = len(label_list)
    normal_labels = processor.get_normal_labels(args.tagging_schema)
    num_normal_labels = len(normal_labels)
    sent_labels = ABSAProcessor.get_sentiment_labels()
    num_sent_labels = len(sent_labels)

    # initialize the pre-trained model
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                              cache_dir='./cache')

    config.absa_type = args.absa_type
    config.tfm_mode = args.tfm_mode
    config.fix_tfm = args.fix_tfm
    config.num_normal_labels = num_normal_labels
    config.num_sent_labels = num_sent_labels
    config.ts_vocab = {label: i for i, label in enumerate(label_list)}
    config.ote_vocab = {label: i for i, label in enumerate(normal_labels)}
    config.sent_vocab = {label: i for i, label in enumerate(sent_labels)}
    config.device = ("cuda" if torch.cuda.is_available() and not args.no_cuda
                     else "cpu")
    config.output_hidden_states = True
    config.model_name_or_path = args.model_name_or_path

    if args.gen_adv_from_path:
        # Generate adversarial examples
        modes = ['train', 'dev', 'test']
        for mode in modes:
            model = model_class.from_pretrained(args.gen_adv_from_path).to(
                args.device)
            train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
                args, args.task_name, tokenizer, mode=mode, model=model)
            adversary = Adversary(args, model)
            adv_examples = []
            sz = 64
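            # Attack the examples in fixed-size chunks so at most `sz` of them
            # are perturbed in memory at any one time.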
            for _ in trange(len(examples) // sz + 1):
                if len(examples) == 0:
                    continue
                adv_examples.extend(
                    adversary.generate_adv_examples(examples[:sz],
                                                    imp_words[:sz], tokenizer))
                examples = examples[sz:]
                imp_words = imp_words[sz:]
            adv_dataset = convert_to_dataset(args, adv_examples, tokenizer)
            output_dir = f'{args.task_name}_adv'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            torch.save(adv_dataset, f'{output_dir}/{mode}.pth')
            torch.save(adv_examples, f'{output_dir}/{mode}-examples.pth')
        exit(0)

    if args.load_model:
        print('Loading model from:', args.load_model)
        model = model_class.from_pretrained(args.load_model, config=config)
    else:
        model = model_class.from_pretrained(args.model_name_or_path,
                                            config=config,
                                            cache_dir='./cache')
        print('Loading model from:', args.model_name_or_path)

    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Training
    if args.do_train:
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.mkdir(args.output_dir)

        # Store a copy of the model implementation alongside the results
        shutil.copyfile('absa_layer.py', args.output_dir + '/absa_layer.py')
        # Store training configuration
        shutil.copyfile('train.sh', args.output_dir + '/train.sh')
        if args.do_adv:
            # Store the adversarial-training entry point as well
            shutil.copyfile('main.py', args.output_dir + '/main.py')

        train_dataset, train_evaluate_label_ids, examples, imp_words = load_and_cache_examples(
            args, args.task_name, tokenizer, mode='train', model=model)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        # save the model configuration
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Validation
    results = {}
    best_f1 = -999999.0
    best_checkpoint = None
    checkpoints = []
    if args.eval_all_checkpoints:
        checkpoints = os.listdir(args.output_dir)
        checkpoints.sort()
    logger.info("Perform validation on the following checkpoints: %s",
                checkpoints)
    test_results = {}
    steps = []
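    # Checkpoint directories are expected to be named 'checkpoint-<global_step>';
    # anything else found in the output directory is skipped.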
    for checkpoint in checkpoints:
        global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
        if checkpoint.split('-')[0] != 'checkpoint':
            continue
        if args.pred_checkpoint and args.pred_checkpoint != global_step:
            continue
        steps.append(global_step)
        set_seed(args)
        model = model_class.from_pretrained(f'{args.output_dir}/{checkpoint}')
        model.to(args.device)
        dev_result = evaluate(args,
                              model,
                              tokenizer,
                              mode='dev',
                              prefix=global_step)

        # use micro-F1 as the criterion for model selection; checkpoints at or before step 1000 are never selected
        if int(global_step) > 1000 and dev_result['micro-f1'] > best_f1:
            best_f1 = dev_result['micro-f1']
            best_checkpoint = checkpoint
        dev_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in dev_result.items())
        results.update(dev_result)

        test_result = evaluate(args,
                               model,
                               tokenizer,
                               mode='test',
                               prefix=global_step)
        test_result = dict(
            (k + '_{}'.format(global_step), v) for k, v in test_result.items())
        test_results.update(test_result)

    best_ckpt_string = "\nThe best checkpoint is %s" % best_checkpoint
    logger.info(best_ckpt_string)
    dev_f1_values, dev_loss_values = [], []
    for k in results:
        v = results[k]
        if 'micro-f1' in k:
            dev_f1_values.append((k, v))
        if 'eval_loss' in k:
            dev_loss_values.append((k, v))
    test_f1_values, test_loss_values = [], []
    for k in test_results:
        v = test_results[k]
        if 'micro-f1' in k:
            test_f1_values.append((k, v))
        if 'eval_loss' in k:
            test_loss_values.append((k, v))
    log_file_path = '%s/log.txt' % args.output_dir
    log_file = open(log_file_path, 'a')
    log_file.write("\tValidation:\n")
    for (test_f1_k,
         test_f1_v), (test_loss_k,
                      test_loss_v), (dev_f1_k,
                                     dev_f1_v), (dev_loss_k,
                                                 dev_loss_v) in zip(
                                                     test_f1_values,
                                                     test_loss_values,
                                                     dev_f1_values,
                                                     dev_loss_values):
        global_step = int(test_f1_k.split('_')[-1])
        if not args.overfit and global_step <= 1000:
            continue
        print('test-%s: %.5lf, test-%s: %.5lf, dev-%s: %.5lf, dev-%s: %.5lf' %
              (test_f1_k, test_f1_v, test_loss_k, test_loss_v, dev_f1_k,
               dev_f1_v, dev_loss_k, dev_loss_v))
        validation_string = '\t\tdev-%s: %.5lf, dev-%s: %.5lf' % (
            dev_f1_k, dev_f1_v, dev_loss_k, dev_loss_v)
        log_file.write(validation_string + '\n')

    n_times = args.max_steps // args.save_steps + 1
    for step in steps:
        log_file.write('\tStep %s:\n' % step)
        precision = test_results['precision_%s' % step]
        recall = test_results['recall_%s' % step]
        micro_f1 = test_results['micro-f1_%s' % step]
        macro_f1 = test_results['macro-f1_%s' % step]
        log_file.write(
            '\t\tprecision: %.4lf, recall: %.4lf, micro-f1: %.4lf, macro-f1: %.4lf\n'
            % (precision, recall, micro_f1, macro_f1))
    log_file.write("\tBest checkpoint: %s\n" % best_checkpoint)
    log_file.write('******************************************\n')
    log_file.close()