Example #1
0
def load_and_cache_examples(args, tokenizer, set_type):
    """Build (or load from cache) the TensorDataset for one dataset split.

    Args:
        args: parsed CLI namespace; reads ``data_dir``, ``model_name_or_path``,
            ``max_seq_length`` and ``overwrite_cache``.
        tokenizer: pretrained tokenizer supplying the cls/sep/pad tokens.
        set_type: split name used by the processor (e.g. 'train', 'dev', 'test').

    Returns:
        torch.utils.data.TensorDataset holding, per example: input ids, input
        mask, segment ids, label id, the category-classifier inputs (ids, mask,
        segment ids), category id, and the hand-crafted features.
    """
    processor = QPMProcessor()
    # Cache key encodes split, model name and max sequence length so different
    # configurations never collide.
    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_customized'.format(
        set_type,
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length)
    ))
    # Fix: honor --overwrite_cache; previously the flag was declared but never
    # consulted, so a stale cache could never be regenerated from the CLI.
    if os.path.exists(cached_features_file) and not getattr(args, 'overwrite_cache', False):
        logger.info('Loading features from cache file %s', cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info('Creating features from dataset file at %s', args.data_dir)
        label_list = processor.get_labels()
        category_list = processor.get_categories()
        examples = processor.get_examples(args.data_dir, set_type)
        # BERT-style settings: cls first, segment ids 0, right padding.
        features = convert_examples_to_features(examples, label_list, category_list, args.max_seq_length, tokenizer,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    all_ct_clf_input_ids = torch.tensor([f.category_clf_input_ids for f in features], dtype=torch.long)
    all_ct_clf_input_mask = torch.tensor([f.category_clf_input_mask for f in features], dtype=torch.long)
    all_ct_clf_segment_ids = torch.tensor([f.category_clf_segment_ids for f in features], dtype=torch.long)
    all_category_ids = torch.tensor([f.category_id for f in features], dtype=torch.long)
    all_hand_features = torch.tensor([f.hand_features for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids,
                            all_ct_clf_input_ids, all_ct_clf_input_mask, all_ct_clf_segment_ids, all_category_ids,
                            all_hand_features)
    return dataset
Example #2
0
def main():
    """Entry point: parse CLI args, then run training / evaluation / prediction.

    Fixes applied in this revision:
      * ``--max_seq_length`` default was the string ``'128'``; now the int 128.
      * The "output dir exists" ValueError had an unfilled ``{}`` placeholder;
        the path is now interpolated.
      * ``torch.save(args, 'training_args.bin')`` in the checkpoint-selection
        loops wrote to the CWD; it now writes inside the checkpoint directory,
        consistent with the save performed right after training.
      * ``all`` (which shadowed the builtin) renamed to ``combined``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument('--data_dir', default=None, type=str, required=True,
                        help='The input data dir. Should contain the .csv files for the task.')
    parser.add_argument('--model_name_or_path', default=None, type=str, required=True,
                        help='Path to pretrained model or shortcut name selected in the list.')
    parser.add_argument('--output_dir', default=None, type=str, required=True,
                        help='The output directory where the model predictions and checkpoints will be written.')

    ## Other parameters
    parser.add_argument('--config_name', default='', type=str,
                        help='Pretrained config name or path if not the same as model_name.')
    parser.add_argument('--tokenizer_name', default='', type=str,
                        help='Pretrained tokenizer name or path if not the same as model_name.')
    # Fix: default must be an int, not the string '128'.
    parser.add_argument('--max_seq_length', default=128, type=int,
                        help='The maximum total input sequence length after tokenization. Sequences longer than this '
                             'will be truncated, sequences shorter will be padded.')
    parser.add_argument('--do_train', action='store_true',
                        help='Whether to run training.')
    parser.add_argument('--do_eval', action='store_true',
                        help='Whether to run eval on the dev set.')
    parser.add_argument('--do_predict', action='store_true',
                        help='Whether to run test on the test set.')
    parser.add_argument('--evaluate_during_training', action='store_true',
                        help='Rul evaluation during training at each logging step.')
    parser.add_argument('--do_lower_case', action='store_true',
                        help='Set this flag if you are using an uncased model.')

    parser.add_argument('--per_gpu_train_batch_size', default=1, type=int,
                        help='Batch size per GPU/CPU for training.')
    parser.add_argument('--per_gpu_eval_batch_size', default=8, type=int,
                        help='Batch size per GPU/CPU for evaluation.')
    parser.add_argument('--per_gpu_test_batch_size', default=8, type=int,
                        help='Batch size per GPU/CPU for prediction.')
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help='Number of updates steps to accumulate before performing a backward/update pass.')
    parser.add_argument('--learning_rate', default=5e-5, type=float,
                        help='The initial learning rate for Adam.')
    parser.add_argument('--weight_decay', default=0.0, type=float,
                        help='Weight decay if we apply some.')
    parser.add_argument('--adam_epsilon', default=1e-8, type=float,
                        help='Epsilon for Adam optimizer.')
    parser.add_argument('--max_grad_norm', default=1.0, type=float,
                        help='Max gradient norm.')
    parser.add_argument('--num_train_epochs', default=4.0, type=float,
                        help='Total number of training epochs to perform.')
    parser.add_argument('--max_steps', default=-1, type=int,
                        help='If > 0: set total number of training steps to perform. Override num_train_epochs.')
    parser.add_argument('--warmup_steps', default=0, type=int,
                        help='Linear warmup over warmup_steps.')

    parser.add_argument('--logging_steps', type=int, default=50,
                        help='Log every X updates steps.')
    parser.add_argument('--save_steps', type=int, default=100,
                        help='Save checkpoint every X updates steps.')
    parser.add_argument('--eval_all_checkpoints', action='store_true',
                        help='Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number.')
    parser.add_argument('--no_cuda', action='store_true',
                        help='Avoid using CUDA when available.')
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help='Overwrite the content of the output directory.')
    parser.add_argument('--overwrite_cache', action='store_true',
                        help='Overwrite the cached training and evaluation sets.')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed for initialization')

    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    args = parser.parse_args()

    # Setup CUDA, GPU
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        # Fix: the placeholder was never filled in; interpolate the path.
        raise ValueError('Output directory ({}) already exists and is not empty. '
                         'Use --overwrite_output_dir to overcome.'.format(args.output_dir))

    device = torch.device('cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)
    logger.warning('Process device: %s, n_gpu: %s, 16-bits training: %s',
                   device, args.n_gpu, args.fp16)

    # Set seed
    set_seed(args)
    # Prepare QPM task
    processor = QPMProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    config_class, model_class, tokenizer_class = BertConfig, FeatureBert, BertTokenizer
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels)
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
    model.to(args.device)

    logger.info('Training/evaluation parameters %s', args)
    # Remember the originals: the eval loop below rewrites args.data_dir /
    # args.output_dir per fold.
    parent_data_dir = args.data_dir
    parent_output_dir = args.output_dir

    # Training
    results_tmp = {}
    if args.do_train:
        # Reload the pretrained model so training always starts from the
        # original pretrained weights.
        model = model_class.from_pretrained(args.model_name_or_path,
                                            from_tf=bool('.ckpt' in args.model_name_or_path),
                                            config=config)
        model.to(args.device)

        train_dataset = load_and_cache_examples(args, tokenizer, set_type='train')
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

        # Create output directory if needed
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

        # To reduce disk usage, evaluate and keep only the best checkpoint.
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        best_f1 = 0.0
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=global_step)
            if result['f1'] > best_f1:
                best_f1 = result['f1']
                # Save the best model checkpoint
                output_dir = os.path.join(args.output_dir, 'best_checkpoint_fold')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(output_dir)
                # Fix: save alongside the checkpoint, not into the CWD.
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info('Saving model checkpoint to %s', output_dir)

            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results_tmp.update(result)
        # Delete every non-best intermediate checkpoint to free disk space.
        checkpoints.remove(args.output_dir)
        for checkpoint in checkpoints:
            shutil.rmtree(checkpoint)

    # Evaluation (10-fold: data/output dirs are suffixed with the fold index)
    results = {}
    if args.do_eval:
        for i in range(10):
            args.data_dir = parent_data_dir + str(i)
            args.output_dir = parent_output_dir + str(i)
            tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
            checkpoints = [args.output_dir]
            if args.eval_all_checkpoints:
                checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
                logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
            logger.info("Evaluate the following checkpoints: %s", checkpoints)
            best_f1 = 0.0
            for checkpoint in checkpoints:
                global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
                model = model_class.from_pretrained(checkpoint)
                model.to(args.device)
                result = evaluate(args, model, tokenizer, prefix=global_step)
                if result['f1'] > best_f1:
                    best_f1 = result['f1']
                    # Save the best model checkpoint
                    output_dir = os.path.join(args.output_dir, 'best_checkpoint_fold' + str(i))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    # Fix: save alongside the checkpoint, not into the CWD.
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info('Saving model checkpoint to %s', output_dir)

                result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
                results.update(result)

    # Prediction
    if args.do_predict:
        args.output_dir = parent_output_dir
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        checkpoint = args.output_dir + '/best_checkpoint_fold'
        model = model_class.from_pretrained(checkpoint)
        model.to(args.device)
        predict(args, model, tokenizer)

        # For bagging: sum the per-fold labels, then vote.
        # Renamed from `all` to avoid shadowing the builtin.
        combined = pd.read_csv('./data/sample_submission.csv')
        for i in range(10):
            df = pd.read_csv(args.data_dir + str(i) + '/result.csv')
            combined['label'] += df['label']
        # NOTE(review): dividing by 6 with 10 folds looks like a majority-vote
        # threshold inherited from an earlier fold count — confirm this is
        # intentional before relying on the bagged labels.
        combined['label'] = combined['label'] // 6
        combined.to_csv('./data/result.csv', index=False)