Example 1
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False, val_or_test="val"):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."

    logger.info("Creating features from dataset file at %s", input_dir)

    if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

        if args.version_2_with_negative:
            logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
            examples = processor.get_eval_examples(args.data_dir, filename=filename)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

    print("Starting squad_convert_examples_to_features")
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )
    print("Complete squad_convert_examples_to_features")


    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
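
A minimal usage sketch (not from the original script): the function only reads a handful of attributes from args, so an argparse.Namespace with those fields is enough to call it. The tokenizer, paths, and values below are illustrative assumptions.

import argparse

from transformers import AutoTokenizer

# Illustrative tokenizer; any transformers tokenizer works here
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Only the attributes that load_and_cache_examples actually reads
args = argparse.Namespace(
    local_rank=-1,                  # no distributed training
    data_dir="data",                # hypothetical directory with SQuAD-style JSON
    train_file="train.json",        # hypothetical file names
    predict_file="dev.json",
    version_2_with_negative=False,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    threads=1,
)

train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)
eval_dataset, eval_examples, eval_features = load_and_cache_examples(
    args, tokenizer, evaluate=True, output_examples=True
)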
Example 2
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False,
                            val_or_test="val"):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If no data_dir is specified, tensorflow_datasets needs to be installed."
                )

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            # Default path: pick the processor for the SQuAD version in use
            processor = (SquadV2Processor()
                         if args.version_2_with_negative else SquadV1Processor())
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir,
                                                       filename=filename)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")

        # if args.local_rank in [-1, 0]:
        #    logger.info("Saving features into cached file %s", cached_features_file)
        #    torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
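
The commented-out block near the end of this example alludes to the cache write that the load branch at the top reads back. A minimal sketch of that round-trip, assuming the same dict layout the load branch expects:

import torch

# Save: bundle everything needed for evaluation into one dict
# (normally guarded so only the first process writes in distributed training)
torch.save(
    {"features": features, "dataset": dataset, "examples": examples},
    cached_features_file,
)

# A later run can then skip feature conversion entirely
features_and_dataset = torch.load(cached_features_file)
features = features_and_dataset["features"]
dataset = features_and_dataset["dataset"]
examples = features_and_dataset["examples"]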
Example 3
def load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, val_or_test="val", is_pretrain=False, qa_style=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    cached_features_file = "cache_{}".format("dev" if evaluate else "train")
    # Decide whether QA mixing should be applied
    do_mix = (args.mix_qa and not evaluate) and (is_pretrain and val_or_test == "val")
    # Load from the cache if possible
    if val_or_test == "val" and args.load_cache:
        cached_session = args.cached_session_dev if evaluate else args.cached_session_train
        if is_pretrain:
            cached_session = args.cached_session_pretrain
            if qa_style:
                cached_session = args.cached_session_pretrain_qa
        logger.info("Loading features from cached file %s in %s", cached_features_file, cached_session)

        features_and_datasets = {}

        def load_data(dir_name):
            tmp = torch.load(os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
            logger.info("Cache keys: %s", list(tmp.keys()))
            nsml.copy(tmp, features_and_datasets)

        nsml.bind(load=load_data)
        nsml.load(checkpoint=cached_features_file, session=cached_session)
        bind_nsml(model, tokenizer, args)
        logger.info("Loaded keys: %s", list(features_and_datasets.keys()))
        features, dataset, examples = (
            features_and_datasets["features"],
            features_and_datasets["dataset"],
            features_and_datasets["examples"],
        )

    else:
        logger.info("Creating features from dataset file at %s", cached_features_file)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename)
            else:
                if is_pretrain:
                    examples = processor.get_pretrain_examples(args.data_dir, filename=args.train_file, qa_style=qa_style)
                else:
                    examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
    # Apply QA mixing: swap contexts between randomly paired examples
    if do_mix:
        num_qa = len(examples)
        mix_batch_size = int(args.mix_portion * num_qa)
        if mix_batch_size % 2 == 1:
            mix_batch_size -= 1  # an even count is needed to form pairs
        mix_batch = np.array(random.sample(range(num_qa), mix_batch_size)).reshape(-1, 2)
        for k, v in mix_batch:
            example_k, example_v = examples[k], examples[v]
            ans_k, ans_v = example_k.answer_text, example_v.answer_text
            # Swap the contexts of the paired examples
            example_k.context_text, example_v.context_text = example_v.context_text, example_k.context_text
            assert not (example_k.is_impossible or example_v.is_impossible)
            if ans_k != ans_v:
                # After the swap, neither context contains its question's answer,
                # so both examples become unanswerable
                example_k.is_impossible, example_v.is_impossible = True, True
                example_k.start_position_character, example_v.start_position_character = None, None
            else:
                # Same answer text: exchange span positions across the swapped pair
                example_k.start_position, example_v.end_position = example_v.start_position, example_k.end_position
    if do_mix or not (val_or_test == "val" and args.load_cache):
        logger.info("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")

    # Cache the data in the NSML session if requested
    if val_or_test == "val" and args.save_cache:
        features_and_datasets = {"dataset": dataset, "examples": examples, "features": features}

        def save_data(dir_name):
            os.makedirs(dir_name, exist_ok=True)
            torch.save(features_and_datasets, os.path.join(dir_name, '{}.pt'.format(cached_features_file)))
            logger.info("Save data at {}".format(dir_name))

        nsml.bind(save=save_data)
        nsml.save(cached_features_file)
        bind_nsml(model, tokenizer, args)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
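
The do_mix branch above implements a simple augmentation: contexts of randomly paired examples are swapped, and pairs whose answer texts differ become unanswerable. A self-contained sketch of that pairing logic on plain dicts, with hypothetical data and a hard-coded mix_portion; the keys mirror the SquadExample attributes used above.

import random

import numpy as np

# Toy stand-ins for SquadExample objects (hypothetical data)
examples = [
    {"answer_text": "Seoul", "context_text": "ctx0", "is_impossible": False, "start_position_character": 0},
    {"answer_text": "Busan", "context_text": "ctx1", "is_impossible": False, "start_position_character": 5},
    {"answer_text": "Seoul", "context_text": "ctx2", "is_impossible": False, "start_position_character": 9},
    {"answer_text": "Jeju", "context_text": "ctx3", "is_impossible": False, "start_position_character": 2},
]

mix_portion = 0.5  # assumed value; the script takes this from args.mix_portion
num_qa = len(examples)
mix_batch_size = int(mix_portion * num_qa)
if mix_batch_size % 2 == 1:
    mix_batch_size -= 1  # an even count is needed to form pairs

# Pair up randomly sampled indices, then swap contexts within each pair
mix_batch = np.array(random.sample(range(num_qa), mix_batch_size)).reshape(-1, 2)
for k, v in mix_batch:
    ex_k, ex_v = examples[k], examples[v]
    ex_k["context_text"], ex_v["context_text"] = ex_v["context_text"], ex_k["context_text"]
    if ex_k["answer_text"] != ex_v["answer_text"]:
        # The swapped context no longer contains the original answer
        ex_k["is_impossible"] = ex_v["is_impossible"] = True
        ex_k["start_position_character"] = ex_v["start_position_character"] = None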
Example 4
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename, example_style=args.example_style)
            else:
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file, example_style=args.example_style)

        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,