import logging
import os
import random

import numpy as np
import torch

import nsml  # NAVER NSML platform client, used by the session-cached variant below

# Assumed module-level setup for the variants below. SquadV1Processor,
# SquadV2Processor, and squad_convert_examples_to_features are project-local
# forks of the transformers versions (they expose get_eval_examples and
# get_pretrain_examples, which the stock processors do not), and bind_nsml is a
# project-local helper that registers the model's save/load hooks with NSML.
logger = logging.getLogger(__name__)


def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False, val_or_test="val"):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    # Load data features from the dataset file (this variant does no caching)
    input_dir = args.data_dir if args.data_dir else "."
    logger.info("Creating features from dataset file at %s", input_dir)

    if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")

        if args.version_2_with_negative:
            logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
            examples = processor.get_eval_examples(args.data_dir, filename=filename)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

    print("Starting squad_convert_examples_to_features")
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )
    print("Complete squad_convert_examples_to_features")

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
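# Usage sketch for the variant above; illustrative only. The attribute names on
# `args` mirror what the function reads, but every value shown here (paths,
# lengths, model name) is a placeholder assumption, not the project's config.
from argparse import Namespace

from transformers import AutoTokenizer

example_args = Namespace(
    local_rank=-1, data_dir="data", train_file="train.json", predict_file="dev.json",
    version_2_with_negative=False, max_seq_length=384, doc_stride=128,
    max_query_length=64, threads=4,
)
example_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
train_dataset = load_and_cache_examples(example_args, example_tokenizer, evaluate=False)
eval_dataset, eval_examples, eval_features = load_and_cache_examples(
    example_args, example_tokenizer, evaluate=True, output_examples=True, val_or_test="val",
)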
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False, val_or_test="val"):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            # Default
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename)
            else:
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")

        # if args.local_rank in [-1, 0]:
        #     logger.info("Saving features into cached file %s", cached_features_file)
        #     torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
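# Minimal sketch of the cache round-trip this variant performs (the save side is
# commented out above): features, dataset, and examples are bundled into one
# dict and serialized with torch.save, then restored with torch.load. The
# helper names are hypothetical, not part of the project.
import torch

def save_feature_cache(path, features, dataset, examples):
    # Bundle everything needed to skip preprocessing on the next run.
    torch.save({"features": features, "dataset": dataset, "examples": examples}, path)

def load_feature_cache(path):
    bundle = torch.load(path)
    return bundle["features"], bundle["dataset"], bundle["examples"]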
def load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False,
                            val_or_test="val", is_pretrain=False, qa_style=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    cached_features_file = "cache_{}".format("dev" if evaluate else "train")

    # Determine whether QA mixing should be applied
    do_mix = (args.mix_qa and not evaluate) and (is_pretrain and val_or_test == "val")

    # Load from an NSML session cache if possible
    if val_or_test == "val" and args.load_cache:
        cached_session = args.cached_session_dev if evaluate else args.cached_session_train
        if is_pretrain:
            cached_session = args.cached_session_pretrain
            if qa_style:
                cached_session = args.cached_session_pretrain_qa
        logger.info("Loading features from cached file %s in %s", cached_features_file, cached_session)

        features_and_datasets = {}

        def load_data(dir_name):
            tmp = torch.load(os.path.join(dir_name, "{}.pt".format(cached_features_file)))
            print(tmp.keys())
            nsml.copy(tmp, features_and_datasets)

        nsml.bind(load=load_data)
        nsml.load(checkpoint=cached_features_file, session=cached_session)
        bind_nsml(model, tokenizer, args)

        print(features_and_datasets.keys())
        features, dataset, examples = (
            features_and_datasets["features"],
            features_and_datasets["dataset"],
            features_and_datasets["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", cached_features_file)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If no data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json"
                examples = processor.get_eval_examples(args.data_dir, filename=filename)
            else:
                if is_pretrain:
                    examples = processor.get_pretrain_examples(args.data_dir, filename=args.train_file, qa_style=qa_style)
                else:
                    examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

    # Apply mixing: swap contexts between randomly paired examples
    if do_mix:
        num_qa = len(examples)
        mix_batch_size = int(args.mix_portion * num_qa)
        if mix_batch_size % 2 == 1:
            mix_batch_size -= 1  # an even count is needed to form pairs
        mix_batch = np.array(random.sample(range(num_qa), mix_batch_size)).reshape(-1, 2)
        for k, v in mix_batch:
            example_k, example_v = examples[k], examples[v]
            ans_k, ans_v = example_k.answer_text, example_v.answer_text
            example_k.context_text, example_v.context_text = example_v.context_text, example_k.context_text
            assert not (example_k.is_impossible or example_v.is_impossible)
            if ans_k != ans_v:
                # The swapped contexts no longer contain the original answers,
                # so both examples become unanswerable.
                example_k.is_impossible, example_v.is_impossible = True, True
                example_k.start_position_character, example_v.start_position_character = None, None
            else:
                # Same answer string: the span travels with the swapped context,
                # so exchange both boundaries.
                example_k.start_position, example_v.start_position = example_v.start_position, example_k.start_position
                example_k.end_position, example_v.end_position = example_v.end_position, example_k.end_position

    # Re-featurize if mixing changed the examples, or if nothing was loaded from cache
    if do_mix or not (val_or_test == "val" and args.load_cache):
        print("Starting squad_convert_examples_to_features")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        print("Complete squad_convert_examples_to_features")

        # Cache into the NSML session if required
        if val_or_test == "val" and args.save_cache:
            features_and_datasets = {"dataset": dataset, "examples": examples, "features": features}

            def save_data(dir_name):
                os.makedirs(dir_name, exist_ok=True)
                torch.save(features_and_datasets, os.path.join(dir_name, "{}.pt".format(cached_features_file)))
                logger.info("Save data at {}".format(dir_name))

            nsml.bind(save=save_data)
            nsml.save(cached_features_file)
            bind_nsml(model, tokenizer, args)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
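# Standalone sketch of the QA-mixing step used above, on plain dicts instead of
# SquadExample objects so it runs in isolation. A random, even-sized subset of
# examples is paired off and each pair swaps contexts: if the two answers
# differ, the swapped contexts no longer contain them and both examples become
# unanswerable; if the answers match, the answer spans travel with the contexts.
import random

import numpy as np

def mix_qa_pairs(examples, mix_portion, seed=0):
    random.seed(seed)
    mix_batch_size = int(mix_portion * len(examples))
    if mix_batch_size % 2 == 1:
        mix_batch_size -= 1  # an even count is needed to form pairs
    pairs = np.array(random.sample(range(len(examples)), mix_batch_size)).reshape(-1, 2)
    for k, v in pairs:
        ex_k, ex_v = examples[k], examples[v]
        ex_k["context"], ex_v["context"] = ex_v["context"], ex_k["context"]
        if ex_k["answer"] != ex_v["answer"]:
            ex_k["is_impossible"] = ex_v["is_impossible"] = True
            ex_k["answer_start"] = ex_v["answer_start"] = None
        else:
            ex_k["answer_start"], ex_v["answer_start"] = ex_v["answer_start"], ex_k["answer_start"]
    return examples

toy = [
    {"context": "Seoul is the capital of Korea.", "answer": "Seoul", "answer_start": 0, "is_impossible": False},
    {"context": "Paris is the capital of France.", "answer": "Paris", "answer_start": 0, "is_impossible": False},
]
mixed = mix_qa_pairs(toy, mix_portion=1.0)
# With different answers, both mixed examples become unanswerable.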
features_and_dataset["examples"], ) else: logger.info("Creating features from dataset file at %s", input_dir) if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)): try: import tensorflow_datasets as tfds except ImportError: raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") if args.version_2_with_negative: logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") tfds_examples = tfds.load("squad") examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) else: processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() if evaluate: filename = args.predict_file if val_or_test == "val" else "test_data/korquad_open_test.json" examples = processor.get_eval_examples(args.data_dir, filename=filename, example_style=args.example_style) else: examples = processor.get_train_examples(args.data_dir, filename=args.train_file, example_style=args.example_style) print("Starting squad_convert_examples_to_features") features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length,