def load_saved_examples(args, evaluate=False):
    """Load SQuAD examples together with previously pickled model outputs.

    The examples are read either through ``tensorflow_datasets`` (when no
    ``data_dir`` and no matching train/predict file is configured) or through
    a ``SquadV1Processor``/``SquadV2Processor``.  The pickled payload is read
    from ``args.saved_processed_data_dir`` and must unpack into
    ``(features, all_results, tokenizer)``.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``,
            ``predict_file``, ``train_file``, ``version_2_with_negative`` and
            ``saved_processed_data_dir``.
        evaluate: Load the dev split when true, the train split otherwise.

    Returns:
        Tuple ``(examples, features, all_results, tokenizer)``.

    Raises:
        ImportError: ``tensorflow_datasets`` is required but not installed.
        AssertionError: One of the sanity checks (first question text,
            example count, missing ``saved_processed_data_dir``) failed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache
        torch.distributed.barrier()

    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        if args.version_2_with_negative:
            # ``Logger.warn`` is a deprecated alias; ``warning`` is the
            # supported spelling.
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD.")
            logger.warning("Something went wrong!")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir,
                                                  filename=args.predict_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'In what country is Normandy located?', 'Invalid dev file!'
        else:
            # Normal get train examples
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'When did Beyonce start becoming popular?', 'Invalid train file!'

    assert args.saved_processed_data_dir, 'args.saved_processed_data_dir not defined!'
    ensemble_dir = args.saved_processed_data_dir
    print(args.saved_processed_data_dir)

    # Both splits live side by side and differ only by their suffix.
    split = 'dev' if evaluate else 'train'
    saved_path = os.path.join(ensemble_dir, 'saved_data_{}.pkl'.format(split))
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # ``saved_processed_data_dir`` at files you produced yourself.
    with open(saved_path, 'rb') as f:
        saved_data = pickle.load(f)

    # saved_data: [features, all_results, tokenizer]
    features, all_results, tokenizer = saved_data

    # Sanity check example length with the expected numbers
    if evaluate:
        assert len(examples) == 6078
    else:
        assert len(examples) == 130319

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache
        torch.distributed.barrier()

    return examples, features, all_results, tokenizer
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD dataset, building and caching its features if needed.

    Features are loaded from ``cached_<split>_<model>_<max_seq_length>`` in
    ``args.data_dir`` (or ``"."``) when that file exists, ``overwrite_cache``
    is off and the caller did not ask for examples.  Otherwise the examples
    are read and converted with ``squad_convert_examples_to_features_cg`` and
    the result is saved by rank -1/0.

    Args:
        args: Namespace with the run configuration (``local_rank``,
            ``data_dir``, ``model_name_or_path``, ``max_seq_length``, ...).
        tokenizer: Tokenizer handed to the feature conversion.
        evaluate: Use the dev split when true, the train split otherwise.
        output_examples: Also return the examples and features.  This forces
            the features to be rebuilt, because the cache stores no examples.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: ``tensorflow_datasets`` is required but not installed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

            if args.version_2_with_negative:
                # ``Logger.warn`` is a deprecated alias of ``warning``.
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features_cg(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset='pt',
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def __init__(
    self,
    model_name_or_path="ktrapeznikov/albert-xlarge-v2-squad-v2",
    n_best_size=1,
    max_answer_length=30,
    do_lower_case=True,
    null_score_diff_threshold=0.0,
):
    """Load an ALBERT question-answering model, its tokenizer and config.

    Args:
        model_name_or_path: Identifier passed to every ``from_pretrained``
            call.
        n_best_size: Stored on the instance as ``n_best_size``.
        max_answer_length: Stored on the instance as ``max_answer_length``.
        do_lower_case: Stored on the instance and forwarded to the tokenizer.
        null_score_diff_threshold: Stored on the instance as
            ``null_score_diff_threshold``.
    """
    # Plain settings kept for later use by the instance.
    self.n_best_size = n_best_size
    self.max_answer_length = max_answer_length
    self.do_lower_case = do_lower_case
    self.null_score_diff_threshold = null_score_diff_threshold

    # The concrete transformer classes are exposed as attributes as well.
    config_cls = AlbertConfig
    model_cls = AlbertForQuestionAnswering
    tokenizer_cls = AlbertTokenizer
    self.config_class = config_cls
    self.model_class = model_cls
    self.tokenizer_class = tokenizer_cls

    # Instantiate config -> tokenizer -> model, in that order.
    self.config = config_cls.from_pretrained(model_name_or_path)
    self.tokenizer = tokenizer_cls.from_pretrained(
        model_name_or_path,
        do_lower_case=do_lower_case,
    )
    self.model = model_cls.from_pretrained(model_name_or_path, config=self.config)

    # Run on CUDA when it is available, on the CPU otherwise.
    if torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)
    self.model.to(self.device)

    self.processor = SquadV2Processor()
def load_squad_for_bert(self):
    """Build (or reload) the BERT-ready SQuAD train/dev datasets.

    The SQuAD version is selected by the module-level ``args['datasetsize']``
    (``'1.1'`` or ``'2.0'``).  When the dump file at ``self.data_dump_path``
    does not exist, the train and dev splits are read with the matching
    processor, converted with ``squad_convert_examples_to_features`` and
    stored through ``self.saveDataset``; otherwise the dump is read back with
    ``self.loadDataset``.

    Returns:
        The datasets object.  When freshly built it is a dict with the keys
        ``'train'``, ``'dev'`` and ``'test'``; the first two hold
        ``'dataset'``, ``'features'`` and ``'examples'`` and ``'test'`` is
        left empty.
    """
    # Default to the v1.1 file names so that an unrecognised ``datasetsize``
    # keeps behaving exactly as before.
    train_filename, dev_filename = 'train-v1.1.json', 'dev-v1.1.json'

    if args['datasetsize'] == '1.1':
        self.basedir = '../MR/SQUAD/'
        self.corpus_file_train = self.basedir + train_filename
        self.corpus_file_dev = self.basedir + dev_filename
        self.data_dump_path = args['rootDir'] + '/SQUAD_1_bert.pkl'
        self.processor = SquadV1Processor()
    elif args['datasetsize'] == '2.0':
        # The v2.0 branch used to fall through to the v1.1 file names below,
        # feeding v1.1 data to the v2 processor.
        train_filename, dev_filename = 'train-v2.0.json', 'dev-v2.0.json'
        self.basedir = '../MR/SQUAD/'
        self.corpus_file_train = self.basedir + train_filename
        # ``corpus_file_test`` is kept for existing callers; ``corpus_file_dev``
        # is set too so both versions expose the same attribute.
        self.corpus_file_test = self.basedir + dev_filename
        self.corpus_file_dev = self.corpus_file_test
        self.data_dump_path = args['rootDir'] + '/SQUAD_2_bert.pkl'
        self.processor = SquadV2Processor()

    if os.path.isfile(self.data_dump_path):
        datasets = self.loadDataset(self.data_dump_path, dataonly=True)
        print('loaded')
        return datasets

    datasets = {'train': {}, 'dev': {}, 'test': {}}
    splits = (
        ('train', self.processor.get_train_examples, train_filename, True),
        ('dev', self.processor.get_dev_examples, dev_filename, False),
    )
    for split, read_examples, filename, is_training in splits:
        examples = read_examples(self.basedir, filename=filename)
        features, data = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=is_training,
            return_dataset="pt",
            threads=1,
        )
        datasets[split]['dataset'] = data
        datasets[split]['features'] = features
        datasets[split]['examples'] = examples

    print('Saving dataset...')
    self.saveDataset(self.data_dump_path, datasets, dataonly=True)
    return datasets
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Fetch the SQuAD dataset for distillation, using an on-disk cache.

    The cache file sits next to the input file (``args.predict_file`` for
    evaluation, ``args.train_file`` otherwise) and stores the features, the
    dataset and the examples.

    Args:
        args: Namespace carrying the run configuration.
        tokenizer: Tokenizer forwarded to the feature conversion.
        evaluate: Select the dev split instead of the train split.
        output_examples: Return examples and features next to the dataset.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        DeprecationWarning: The cache file lacks one of the expected keys.
    """
    is_worker_rank = args.local_rank not in [-1, 0]
    if is_worker_rank and not evaluate:
        # Only the first process in distributed training builds the dataset;
        # the other ranks wait here and read the cache afterwards.
        torch.distributed.barrier()

    # Work out where the cache for this split/model/sequence length lives.
    if evaluate:
        input_file, split = args.predict_file, "dev"
    else:
        input_file, split = args.train_file, "train"
    model_tag = [part for part in args.model_name_or_path.split("/") if part][-1]
    cache_name = "cached_distillation_{}_{}_{}".format(split, model_tag, str(args.max_seq_length))
    cached_features_file = os.path.join(os.path.dirname(input_file), cache_name)

    cache_usable = os.path.exists(cached_features_file) and not args.overwrite_cache
    if cache_usable:
        logger.info("Loading features from cached file %s", cached_features_file)
        payload = torch.load(cached_features_file)
        try:
            features = payload["features"]
            dataset = payload["dataset"]
            examples = payload["examples"]
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script please delete the "
                "file %s in order for it to be created again" % cached_features_file
            )
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        if args.version_2_with_negative:
            processor = SquadV2Processor()
        else:
            processor = SquadV1Processor()

        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
        else:
            examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        conversion_kwargs = dict(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        features, dataset = squad_convert_examples_to_features(**conversion_kwargs)

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            to_cache = {"features": features, "dataset": dataset, "examples": examples}
            torch.save(to_cache, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Release the ranks that were parked at the first barrier.
        torch.distributed.barrier()

    return (dataset, examples, features) if output_examples else dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD v2 dataset, reading or writing a feature cache.

    Args:
        args: Namespace with ``data_dir``, ``model_name_or_path``,
            ``max_seq_length``, ``overwrite_cache``, ``doc_stride``,
            ``max_query_length`` and ``threads``.
        tokenizer: Tokenizer forwarded to the feature conversion.
        evaluate: Select the dev split instead of the train split.
        output_examples: Return examples and features next to the dataset.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    input_dir = args.data_dir

    # Cache file name: cached_<split>_<last model path component>_<seq length>
    split = "dev" if evaluate else "train"
    model_tag = [part for part in args.model_name_or_path.split("/") if part][-1]
    cached_features_file = os.path.join(
        input_dir, "cached_{}_{}_{}".format(split, model_tag, str(args.max_seq_length)))

    reuse_cache = os.path.exists(cached_features_file) and not args.overwrite_cache
    if reuse_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        # With ``filename=None`` the processor picks its own default file.
        processor = SquadV2Processor()
        read_examples = processor.get_dev_examples if evaluate else processor.get_train_examples
        examples = read_examples(args.data_dir, filename=None)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        bundle = {"features": features, "dataset": dataset, "examples": examples}
        torch.save(bundle, cached_features_file)

    if not output_examples:
        return dataset
    return dataset, examples, features
def __init__(self, data_path, h5_path, pretrained_tokenizer="bert-large-uncased", verbose=False):
    """ProcessSquad class initialization routine.

    Args:
        data_path (str): OS path location of the SQuAD v2 dev and train files.
        h5_path (str): OS path location of the output folder where the h5
            processed SQuAD data should be stored.
        pretrained_tokenizer (str): name of the pretrained tokenizer to use
            during processing (ref:
            https://huggingface.co/transformers/main_classes/tokenizer.html).
        verbose (bool, optional): Indicates whether the routine should provide
            verbose feedback to caller. Defaults to False.

    Raises:
        RuntimeError: if a required argument is empty.
        RuntimeError: if the SQuAD v2 dev or training files do not exist in
            the data_path.
        RuntimeError: if the pretrained tokenizer cannot be loaded (the
            original error is attached as ``__cause__``).
    """
    # validate that the constructor parameters were provided by caller
    if not data_path or not h5_path or not pretrained_tokenizer:
        raise RuntimeError(
            'SQuAD v2 data path, output h5 path, and pretrained_tokenizer must be specified.'
        )

    # clean and validate the path strings
    data_path = self.__clean_path(data_path)
    h5_path = self.__clean_path(h5_path)

    # validate existence of the expected SQuAD v2 files in the data_path
    # provided by caller
    expected_files = (
        (SQUAD_DEV_FILE, "SQuAD v2 Dev File"),
        (SQUAD_TRAIN_FILE, "SQuAD v2 Train File"),
    )
    for filename, label in expected_files:
        candidate = os.path.join(data_path, filename)
        if not os.path.isfile(candidate):
            raise RuntimeError(f"{label} file specified [{candidate}] does not exist.")

    # Both attributes hold the data directory, not the individual file paths.
    self.__dev_squad = data_path
    self.__train_squad = data_path

    # set the class variable for the h5 output files
    self.__dev_h5 = os.path.join(h5_path, OUTPUT_DEV_FILE)
    self.__train_h5 = os.path.join(h5_path, OUTPUT_TRAIN_FILE)

    # load the pre-trained tokenizer
    pretrained_tokenizer = pretrained_tokenizer.strip().lower()
    try:
        self.__tokenizer = BertTokenizer.from_pretrained(pretrained_tokenizer)
    except Exception as err:
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real cause visible.
        raise RuntimeError(
            f"Failed to load pretrained tokenizer '{pretrained_tokenizer}'."
        ) from err

    # Load the processor
    self.__processor = SquadV2Processor()

    if verbose:
        print("All input file locations validated.")
def load_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Read SQuAD examples and convert them to features without caching.

    Examples come from ``tensorflow_datasets`` when neither ``data_dir`` nor
    the matching train/predict file is configured, otherwise from the SQuAD
    processor reading ``<data_dir>/<task>``.

    Args:
        args: Namespace carrying the run configuration.
        tokenizer: Tokenizer forwarded to the feature conversion.
        evaluate: Select the dev split instead of the train split.
        output_examples: Return examples and features next to the dataset.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: ``tensorflow_datasets`` is required but not installed.
    """
    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            )

        if args.version_2_with_negative:
            # ``Logger.warn`` is a deprecated alias of ``warning``.
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        task_dir = os.path.join(args.data_dir, args.task)
        if evaluate:
            examples = processor.get_dev_examples(task_dir, filename=args.predict_file)
        else:
            examples = processor.get_train_examples(task_dir, filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    # The former ``torch.save({...})`` call was dropped: it passed no
    # destination and therefore always raised TypeError, and this function
    # defines no cache path to write to.

    if output_examples:
        return dataset, examples, features
    return dataset
def __init__(self):
    """Set up the Wikipedia searcher, the SQuAD processor and the DilBert reader."""
    # Retrieval side: searcher over the index at PATH_TO_WIKI_INDEX,
    # with BM25 switched on and RM3 switched off.
    searcher = SimpleSearcher(PATH_TO_WIKI_INDEX)
    self.searcher = searcher
    searcher.set_bm25()
    searcher.unset_rm3()

    self.processor = SquadV2Processor()

    # Fixed settings stored on the instance.
    # NOTE(review): the meaning of ``k`` and ``mu`` is not visible here -
    # confirm against the code that reads them.
    self.k = 29
    self.mu = 0.5
    self.use_ir_score = True

    # Reader side: tokenizer and model both come from PATH_TO_DILBERT.
    self.tokenizer = BertTokenizer.from_pretrained(
        PATH_TO_DILBERT,
        do_lower_case=True,
    )
    reader = DilBert.from_pretrained(PATH_TO_DILBERT)
    self.model = reader

    self.device = DEVICE_COMP
    target_device = torch.device(self.device)
    reader.to(target_device)
def __init__(self, args, dictionary):
    """Initialise the task with its dictionary, BPE, tokenizer and processor.

    Args:
        args: Namespace; ``seed``, ``bpe_vocab_file`` and ``do_evaluate`` are
            read here, and the whole object is passed to the parent
            constructor and to ``encoders.build_bpe``.
        dictionary: Stored on the instance and handed to ``SQuADTokenizer``.

    Raises:
        ImportError: ``transformers`` is not installed.
    """
    super().__init__(args)

    self.dictionary = dictionary
    self.seed = args.seed

    # Text encoding helpers.
    self.bpe = encoders.build_bpe(args)
    self.tokenizer = SQuADTokenizer(args.bpe_vocab_file, dictionary)

    self.do_evaluate = args.do_evaluate

    # ``transformers`` is imported lazily so the dependency is only needed
    # when this task is actually constructed.
    try:
        from transformers.data.processors.squad import SquadV2Processor
        self.processor = SquadV2Processor()
    except ImportError:
        raise ImportError('Please install transformers with: pip install transformers')
def load_and_cache_examples(tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD v2 dataset from ``deeplearning_needed/SQUAD_data``.

    The feature cache lives in the same directory; its name is derived from
    the split, the module-level ``model_type`` and the fixed sequence length
    of 384.

    Args:
        tokenizer: Tokenizer forwarded to the feature conversion.
        evaluate: Select the dev split instead of the train split.
        output_examples: Return examples and features next to the dataset.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    input_dir = "deeplearning_needed/SQUAD_data"

    split = "dev" if evaluate else "train"
    model_tag = [part for part in model_type.split("/") if part][-1]
    cache_name = "cached_{}_{}_{}".format(split, model_tag, str(384))
    cached_features_file = os.path.join(input_dir, cache_name)

    if not os.path.exists(cached_features_file):
        # Cache miss: read the raw json, convert it and store the result.
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(input_dir, filename='dev-v2.0.json')
        else:
            examples = processor.get_train_examples(input_dir, filename='train-v2.0.json')

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=not evaluate,
            return_dataset="pt",
            threads=1,
        )

        bundle = {"features": features, "dataset": dataset, "examples": examples}
        torch.save(bundle, cached_features_file)
    else:
        # Cache hit: everything needed is stored in the file.
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]

    return (dataset, examples, features) if output_examples else dataset
def __init__(
    self,
    named_model='bert-large-uncased',
    max_query_length=64,
    max_seq_length=386,
    doc_stride=128,
    processor=None,
):
    """Store the conversion settings and load the BERT tokenizer.

    Args:
        named_model: Identifier passed to ``BertTokenizer.from_pretrained``.
        max_query_length: Stored on the instance as ``max_query_length``.
        max_seq_length: Stored on the instance as ``max_seq_length``.
        doc_stride: Stored on the instance as ``doc_stride``.
        processor: SQuAD processor to use.  When omitted (``None``) a fresh
            ``SquadV2Processor`` is created for this instance.
    """
    self.named_model = named_model
    self.tokenizer = BertTokenizer.from_pretrained(named_model)
    self.max_query_length = max_query_length
    self.max_seq_length = max_seq_length
    self.doc_stride = doc_stride
    # The default used to be ``SquadV2Processor()`` in the signature, which is
    # evaluated once at definition time and shared by every instance.
    self.processor = SquadV2Processor() if processor is None else processor
def load_and_cache_examples(data_dir: str, tokenizer, task, max_seq_length, doc_stride, max_query_length, evaluate=False):
    """Download one SQuAD split and convert it to features.

    The json file for the requested split is downloaded into ``data_dir`` on
    every call (overwriting any previous copy), read with the matching SQuAD
    processor and converted with ``squad_convert_examples_to_features``.

    Args:
        data_dir: Directory receiving the downloaded json file; it is created
            when missing.
        tokenizer: Tokenizer forwarded to the feature conversion.
        task: ``"SQuAD1.1"`` or ``"SQuAD2.0"``.
        max_seq_length: Forwarded to the feature conversion.
        doc_stride: Forwarded to the feature conversion.
        max_query_length: Forwarded to the feature conversion.
        evaluate: Use the dev split when true, the train split otherwise.

    Returns:
        Tuple ``(dataset, examples, features)``.

    Raises:
        NameError: ``task`` is not one of the supported names.
    """
    if task == "SQuAD1.1":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif task == "SQuAD2.0":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    os.makedirs(data_dir, exist_ok=True)

    if evaluate:
        source_url, filename = validation_url, validation_file
        read_examples = processor.get_dev_examples
    else:
        source_url, filename = train_url, train_file
        read_examples = processor.get_train_examples

    # Write the payload as raw bytes: decoding it and re-encoding through a
    # text-mode file used the platform's default encoding and newline
    # translation, which can fail or alter the file on non-UTF-8 systems.
    with urllib.request.urlopen(source_url) as response:
        with open(os.path.join(data_dir, filename), 'wb') as f:
            f.write(response.read())

    examples = read_examples(data_dir, filename=filename)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
    )
    return dataset, examples, features
def evaluate(model, tokenizer, device, maxSequenceLength, maxQueryLength, documentStride):
    """Run the model over the SQuAD v2 dev set in ``"."`` and score it.

    Args:
        model: Question-answering model called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        tokenizer: Tokenizer forwarded to the feature conversion.
        device: Device every batch tensor is moved to.
        maxSequenceLength: Forwarded as ``max_seq_length``.
        maxQueryLength: Forwarded as ``max_query_length``.
        documentStride: Forwarded as ``doc_stride``.

    Returns:
        The value returned by ``squad_evaluate``.
    """
    processor = SquadV2Processor()
    devData = processor.get_dev_examples(".")
    features, devDataset = transformers.squad_convert_examples_to_features(
        examples=devData,
        tokenizer=tokenizer,
        max_seq_length=maxSequenceLength,
        max_query_length=maxQueryLength,
        doc_stride=documentStride,
        return_dataset="pt",
        threads=1,
        is_training=False)

    batchSize = 2
    sampler = torch.utils.data.SequentialSampler(devDataset)
    dataLoader = torch.utils.data.DataLoader(devDataset,
                                             sampler=sampler,
                                             batch_size=batchSize)

    results = []
    model.eval()
    # Iterate the DataLoader (the loop used to walk the raw dataset, so the
    # batching configured above was never applied).
    for batch in tqdm.tqdm(dataLoader):
        batch = tuple(bat.to(device) for bat in batch)
        with torch.no_grad():
            # The per-item loop below used to enumerate the undefined name
            # ``example``; ``batch[3]`` is the tensor it was meant to walk.
            # NOTE(review): assumes batch[3] holds the index into
            # ``features`` for each item - confirm against the dataset layout.
            featureIndices = batch[3]
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2])

        for i, index in enumerate(featureIndices):
            feature = features[index.item()]
            uniqueId = int(feature.unique_id)
            output = [out[i].detach().cpu().tolist() for out in outputs]
            # NOTE(review): ``output[2]`` is kept as written; for a model
            # returning (start_logits, end_logits) the end logits would be
            # ``output[1]`` - confirm what this model returns.
            results.append(
                transformers.data.processors.squad.SquadResult(
                    uniqueId, output[0], output[2]))

    # NOTE(review): the library's ``compute_predictions_logits`` appears to
    # require further positional arguments (n-best size, answer length,
    # output files, ...) - confirm against the installed transformers version.
    predictions = transformers.data.metrics.squad_metrics.compute_predictions_logits(
        devData, features, results)
    results = transformers.data.metrics.squad_metrics.squad_evaluate(
        devData, predictions)
    return results
def load_squad_examples(self, mode="train"):
    """Read one SQuAD split and convert it into a PyTorch dataset.

    ``"train"`` and ``"dev"`` are both read with ``get_train_examples`` (so
    start/end positions are available); ``"test"`` reads ``dev.json`` with
    ``get_dev_examples``.

    Args:
        mode: ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        ``dataset`` for ``"train"``/``"dev"``; ``(dataset, examples,
        features)`` for ``"test"``.

    Raises:
        KeyError: ``mode`` is not one of the supported values.
    """
    # NOTE(review): when ``self.data_dir`` is falsy nothing is loaded and the
    # slice below fails - confirm whether that case is expected.
    if self.data_dir:
        if self.version_2_with_negative:
            processor = SquadV2Processor()
        else:
            processor = SquadV1Processor()

        if mode in ("train", "dev"):
            # the train reader is used for dev too, for obtaining start
            # positions and end positions
            split_file = "train.json" if mode == "train" else "dev.json"
            examples = processor.get_train_examples(self.data_dir, filename=split_file)
        elif mode == "test":
            examples = processor.get_dev_examples(self.data_dir, filename="dev.json")
        else:
            raise KeyError(mode)

    # Debugging aid left enabled: only the first few examples are kept.
    debug_limit = 10
    examples = examples[:debug_limit]

    # Everything except "test" is converted in training mode.
    is_training = mode != "test"
    conversion_kwargs = dict(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.max_seq_length,
        doc_stride=self.doc_stride,
        max_query_length=self.max_query_length,
        is_training=is_training,
        return_dataset="pt",  # return a PyTorch dataset
        threads=2,
    )
    features, dataset = squad_convert_examples_to_features(**conversion_kwargs)

    if is_training:
        return dataset
    return dataset, examples, features
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD dataset, using a feature cache and sanity checks.

    The cache file is ``cached_<split>_<model>_<len>`` (or
    ``cached_output_...`` with ``/cur_best`` stripped from the model path
    when ``args.do_output`` is set) inside ``args.data_dir`` or ``"."``.
    ``args.cached_features_file`` overrides that path when it is not
    ``None``.  With ``args.do_output`` on the train split, the example index
    of each feature is appended to the dataset as an extra tensor.

    Args:
        args: Namespace carrying the run configuration.
        tokenizer: Tokenizer forwarded to the feature conversion.
        evaluate: Select the dev split instead of the train split.
        output_examples: Return examples and features next to the dataset.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: ``tensorflow_datasets`` is required but not installed.
        AssertionError: One of the sanity checks (first question text,
            example count, added tensor) failed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    split = "dev" if evaluate else "train"
    if not args.do_output:
        cache_prefix = "cached"
        model_path = args.model_name_or_path
    else:
        cache_prefix = "cached_output"
        model_path = args.model_name_or_path.replace('/cur_best', '')
    cached_features_file = os.path.join(
        input_dir,
        "{}_{}_{}_{}".format(
            cache_prefix,
            split,
            list(filter(None, model_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Overwrite cached_features_file if args.cached_features_file is not None
    if args.cached_features_file is not None:
        cached_features_file = args.cached_features_file

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                )

            if args.version_2_with_negative:
                # ``Logger.warn`` is a deprecated alias of ``warning``.
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
                # Sanity check for loading the correct example
                assert examples[
                    0].question_text == 'In what country is Normandy located?', 'Invalid dev file!'
            else:
                # Normal get train examples
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)
                # Sanity check for loading the correct example
                assert examples[
                    0].question_text == 'When did Beyonce start becoming popular?', 'Invalid train file!'

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.do_output and not evaluate:
        # Append each feature's example index as one more dataset tensor.
        example_indices = torch.tensor([f.example_index for f in features])
        og_tensors = dataset.tensors
        dataset = TensorDataset(*og_tensors, example_indices)
        assert len(og_tensors) + 1 == len(
            dataset.tensors), 'Failed to add example_indices to Dataset!'

    print(len(examples))
    print(len(dataset))

    # Sanity check example length with the correct numbers
    if evaluate:
        assert len(examples) == 6078
    else:
        assert len(examples) == 130319

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the
        # dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
print("gpunum :", gpunum)

# Fix every random number generator in use so training runs are repeatable.
seed = 142857
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Pretrained checkpoint that config, tokenizer and model are all loaded from.
model_type = "bert"
shortcut = "bert-base-uncased"
config = transformers.AutoConfig.from_pretrained(shortcut)
tokenizer = transformers.AutoTokenizer.from_pretrained(shortcut)
model = transformers.AutoModelForQuestionAnswering.from_pretrained(
    shortcut,
    config=config,
)
model.to(device)

# Training data: SQuAD v2 train examples read from the current directory.
processor = SquadV2Processor()
trainData = processor.get_train_examples(".")
features, trainDataset = transformers.squad_convert_examples_to_features(
    examples=trainData,
    tokenizer=tokenizer,
    max_seq_length=maxSequenceLength,
    doc_stride=documentStride,
    max_query_length=maxQueryLength,
    is_training=True,
    return_dataset="pt",
)

# Randomly sampled mini-batches for the training loop.
batchSize = 12
trainSampler = torch.utils.data.RandomSampler(trainDataset)
trainDataloader = torch.utils.data.DataLoader(
    trainDataset,
    batch_size=batchSize,
    sampler=trainSampler,
)
def run_squad_and_get_results(
    model: tf.keras.Model,  # Must be QuestionAnswering model, not PreTraining
    tokenizer: PreTrainedTokenizer,
    run_name: str,
    filesystem_prefix: str,
    per_gpu_batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    evaluate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
) -> Dict:
    """Fine-tune ``model`` on SQuAD and return the last evaluation metrics.

    Training runs for ``total_steps`` steps.  Rank 0 additionally validates,
    evaluates, checkpoints and writes TensorBoard summaries at the given
    frequencies and always on the final step.

    Args:
        model: Question-answering model to train.
        tokenizer: Tokenizer handed to the dataset and evaluation helpers.
        run_name: Name used in checkpoint and log paths.
        filesystem_prefix: Root directory for data, checkpoints and logs; a
            value starting with ``/opt/ml`` disables the progress bar.
        per_gpu_batch_size: Batch size of the train and validation datasets.
        checkpoint_frequency: Steps between checkpoints; falsy means 1000000.
        validate_frequency: Steps between validations; falsy means 1000000.
        evaluate_frequency: Steps between evaluations; falsy means 1000000.
        learning_rate: Peak learning rate of the schedule.
        warmup_steps: Warm-up steps of the schedule.
        total_steps: Number of training steps.
        dataset: ``"squadv1"``, ``"squadv2"`` or ``"debug"``.
        dummy_eval: Use fixed placeholder metrics instead of evaluating.

    Returns:
        The metrics dict of the last evaluation on rank 0.  Other ranks fall
        off the end of the function and return ``None``.

    Raises:
        AssertionError: ``dataset`` is not one of the supported names.
    """
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000
    evaluate_frequency = evaluate_frequency or 1000000
    is_sagemaker = filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        # Raised explicitly so the check survives ``python -O``.
        raise AssertionError("--dataset must be one of ['squadv1', 'squadv2', 'debug']")

    data_dir = os.path.join(filesystem_prefix, "squad_data")

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        logger.info(f"Starting finetuning on {dataset}")
        # ``total=`` is required: a positional int is taken as the iterable
        # and leaves the progress bar without a total.
        pbar = tqdm.tqdm(total=total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            per_gpu_batch_size=per_gpu_batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    global train_step
    train_step = rewrap_tf_function(train_step)

    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = ((step > 0) and step % checkpoint_frequency == 0) or is_final_step
            do_validate = ((step > 0) and step % validate_frequency == 0) or is_final_step
            do_evaluate = ((step > 0) and step % evaluate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                logger.info("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                logger.info(description)

            if do_evaluate:
                logger.info("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model,
                        tokenizer=tokenizer,
                        data_dir=data_dir,
                        filename=val_filename,
                        per_gpu_batch_size=32,
                    )
                print_eval_metrics(results=results, step=step, dataset=dataset)

            if do_checkpoint:
                # TODO: Abstract out to specify any checkpoint path
                checkpoint_path = os.path.join(
                    filesystem_prefix, f"checkpoints/squad/{run_name}-step{step}.ckpt"
                )
                logger.info(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                # TODO: Abstract out to specify any logs path
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(filesystem_prefix, f"logs/squad/{run_name}")
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                if do_evaluate:
                    # And the eval metrics - only on steps that produced fresh
                    # ``results``, so stale or unbound metrics are never logged.
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step, dataset=dataset
                    )

        if is_final_step:
            break

    del train_dataset

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished finetuning, job name {run_name}")
        return results
def predict(model_prefix, probes_dir, preds_dir, data_dir, data_file, layers,
            batch_size, hidden_dim, max_seq_length, device):
    """Run one probe per layer over the dev set and write a CSV per layer.

    For every dev feature the model is run once, then each layer's probe
    predicts an answer span from that layer's hidden states.  When a question
    is split over several consecutive features, only the highest-scoring
    answer is kept.  Results are written to ``<preds_dir>/layer_<k>.csv``
    (columns ``Id`` and ``Predicted``) for ``k = 1..layers``.

    Args:
        model_prefix: Name or path given to the ``from_pretrained`` calls of
            the tokenizer, config and model.
        probes_dir: Directory passed to ``Probe.load``.
        preds_dir: Output directory for the CSV files; created if missing.
        data_dir: Directory passed to ``SquadV2Processor.get_dev_examples``.
        data_file: File name passed to ``get_dev_examples``.
        layers: Number of probes to load and evaluate.
        batch_size: Batch size of the evaluation ``DataLoader``.
        hidden_dim: Constructor argument of ``Probe``.
        max_seq_length: Maximum sequence length used for feature conversion
            and for locating the context inside each feature.
        device: Device the model and batches are moved to; also forwarded to
            ``Probe.load`` and ``Probe.predict``.
    """
    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    dev_examples = processor.get_dev_examples(data_dir=data_dir, filename=data_file)

    # Extract dev features
    print("Loading dev features")
    dev_features, dev_dataset = squad_convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1)

    # Initialize config and model
    config = AutoConfig.from_pretrained(model_prefix, output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix, config=config)

    # DataParallel expects the wrapped module to already live on its first
    # device, so move the model before wrapping it (multi-gpu evaluate).
    model.to(device)
    model = torch.nn.DataParallel(model)
    # Inference only: switch to eval mode once instead of once per batch.
    model.eval()

    # Load probe for each layer (probe files are numbered from 1)
    print("Loading probes")
    probes = []
    for layer in range(layers):
        probe = Probe(hidden_dim)
        probe.load(probes_dir, layer + 1, device)
        probes.append(probe)

    # Extract IDs
    print("Extracting dev IDs")
    n_examples = len(dev_examples)
    q_ids = [example.qas_id for example in dev_examples]

    # Initialize dev data loader
    eval_sampler = SequentialSampler(dev_dataset)
    eval_dataloader = DataLoader(dev_dataset, sampler=eval_sampler, batch_size=batch_size)

    # Initialize predictions: one frame per layer, one row per dev example.
    predictions = []
    for _ in range(layers):
        pred = pd.DataFrame()
        pred['Id'] = q_ids
        pred['Predicted'] = [""] * n_examples
        pred['Question'] = [""] * n_examples
        # Float from the start: scores assigned later should not have to
        # upcast an integer column.
        pred['Score'] = [0.0] * n_examples
        predictions.append(pred)

    # Per layer, the row index of the next unseen question.  Questions with
    # contexts longer than max seq len get split into multiple features based
    # on doc_stride, so consecutive features may belong to the same row.
    # A good alternative we may implement later is recording for all features,
    # then simplifying with groupby and max, e.g. something like
    # df.sort_values('Score', ascending=False).drop_duplicates(['Question'])
    question_ids = [0] * layers

    # Evaluation batches
    print("Predicting on dev set")
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            # Distil does not use token type ids.  (The original tested an
            # undefined name `model_dir`; the model identifier available in
            # this function is `model_prefix`.)
            if "distil" in model_prefix:
                inputs.pop('token_type_ids')

            # ALBERT/BERT/Distilibert forward pass
            idx = batch[3]
            outputs = model(**inputs)
            # NOTE(review): outputs[2] is presumably the tuple of hidden
            # states requested via output_hidden_states=True, with the first
            # entry skipped so index i lines up with probe i - confirm.
            attention_hidden_states = outputs[2][1:]

            # Compute prediction on eval indices
            for j, _ in enumerate(idx):
                # Extract tokens for the current batch
                tokens = tokenizer.convert_ids_to_tokens(batch[0][j])

                # Find where context starts and ends, since we want to predict in context
                context_start = int(max_seq_length - torch.argmax(
                    torch.flip(batch[2][j], [0])).item()) - 1
                context_end = int(torch.argmax(batch[2][j]).item())

                # Find the question, starting right after [CLS] and subtracting 1 to chop off the [SEP] token
                question_start = 1
                question_end = context_start
                question = tokenizer.convert_tokens_to_string(
                    tokens[question_start:question_end - 1])

                # For each layer ...
                for i, probe in enumerate(probes):
                    # Extract predicted indicies
                    score, start_idx, end_idx = probe.predict(
                        attention_hidden_states[i][j].unsqueeze(0),
                        device,
                        threshold=0,
                        context_start=context_start,
                        context_end=context_end)
                    start_idx = int(start_idx[0])
                    end_idx = int(end_idx[0])

                    # Extract predicted answer, converting start tokens to empty strings (no answer)
                    answer = tokenizer.convert_tokens_to_string(
                        tokens[start_idx:end_idx + 1])
                    if answer == '[CLS]':
                        answer = ''

                    frame = predictions[i]
                    row = question_ids[i]
                    if row > 0 and frame.loc[row - 1, 'Question'] == question:
                        # Same question as the previous feature: step back to
                        # its row and keep whichever answer scored higher.
                        row -= 1
                        if score > frame.loc[row, 'Score']:
                            frame.loc[row, 'Predicted'] = answer
                            frame.loc[row, 'Score'] = score
                    else:
                        # First feature of a new question (or the very first row).
                        frame.loc[row, 'Question'] = question
                        frame.loc[row, 'Predicted'] = answer
                        frame.loc[row, 'Score'] = score

                    # Move on to the next row (for a repeated question this
                    # restores the index we had before stepping back).
                    question_ids[i] = row + 1

    # Save predictions for each layer
    print("Saving predictions")
    os.makedirs(preds_dir, exist_ok=True)
    for i, pred in enumerate(predictions):
        pred[['Id', 'Predicted']].to_csv(
            os.path.join(preds_dir, "layer_" + str(i + 1) + ".csv"), index=False)
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD feature dataset, building and caching it on first use.

    The cache file lives in ``args.cache_dir`` (or the current directory when
    that is unset) and is named after the split, the last path component of
    ``args.model_name_or_path`` and ``args.max_seq_length``.  An existing
    cache file is always reused; this variant has no overwrite switch.

    Args:
        args: Namespace read for ``cache_dir``, ``model_name_or_path``,
            ``max_seq_length``, ``data_dir``, ``task``, ``predict_file``,
            ``train_file``, ``version_2_with_negative``, ``doc_stride``,
            ``max_query_length`` and ``threads``.
        tokenizer: Forwarded to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split when true, otherwise the train split.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If the tensorflow_datasets fallback is needed but the
            package is not installed.
    """
    # Load data features from cache or dataset file
    input_dir = args.cache_dir if args.cache_dir else "."
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(split_name, model_tag, str(args.max_seq_length)),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features = features_and_dataset["features"]
        dataset = features_and_dataset["dataset"]
        examples = features_and_dataset["examples"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        needed_file = args.predict_file if evaluate else args.train_file
        if not args.data_dir and not needed_file:
            # No local data at all: fall back to the TFDS copy of SQuAD.
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            # Data files are looked up in the task sub-directory of data_dir.
            task_dir = os.path.join(args.data_dir, args.task)
            if evaluate:
                examples = processor.get_dev_examples(
                    task_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    task_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features, augmenting TFDS training contexts with COMET.

    A COMET (ATOMIC) model and a spaCy pipeline are set up first.  When the
    examples come from tensorflow_datasets, every training context gets one
    generated sentence appended per COMET output key for each of the
    categories xNeed, xIntent, xWant and xReact.  Examples read from local
    files are not augmented.  Features are cached in ``args.data_dir`` (or
    the current directory) and reused unless ``args.overwrite_cache`` is set.

    Args:
        args: Namespace read for ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
            ``predict_file``, ``train_file``, ``version_2_with_negative``,
            ``doc_stride``, ``max_query_length`` and ``threads``.
        tokenizer: Forwarded to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split when true, otherwise the train split.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If the tensorflow_datasets fallback is needed but the
            package is not installed.
    """
    # COMET model setting up
    # NOTE(review): this runs on every call, including cache hits where the
    # COMET model is never used - consider deferring it to the TFDS branch.
    device = "0"
    comet_model = "pretrained_models/atomic_pretrained_model.pickle"
    sampling_algo = "beam-2"

    opt, state_dict = interactive.load_model_file(comet_model)
    data_loader, text_encoder = interactive.load_data("atomic", opt)

    n_ctx = data_loader.max_event + data_loader.max_effect
    n_vocab = len(text_encoder.encoder) + n_ctx
    model = interactive.make_model(opt, n_vocab, n_ctx, state_dict)

    nlp = spacy.load("en_core_web_sm")

    if device != "cpu":
        cfg.device = int(device)
        cfg.do_gpu = True
        torch.cuda.set_device(cfg.device)
        model.cuda(cfg.device)
    else:
        cfg.device = "cpu"

    sampling_algorithm = sampling_algo
    sampler = interactive.set_sampler(opt, sampling_algorithm, data_loader)

    category_list = ["xNeed", "xIntent", "xWant", "xReact"]
    replacement_list = ["PersonX", "PersonY", "PersonZ"]
    # Sentence openers for "o*" keys (about other people) ...
    other_prefixes = {
        "oEffect": " Everyone else ",
        "oReact": "They are ",
        "oWant": "They want ",
    }
    # ... and verb phrases appended to the subject for the remaining keys.
    subject_suffixes = {
        "xAttr": " is ",
        "xEffect": " ",
        "xIntent": " intends ",
        "xReact": " is ",
        "xNeed": " needs ",
        "xWant": " wants ",
    }

    def augment(article):
        """Append COMET inferences about the context to the ``article`` tensor."""
        context = article.numpy().decode('UTF-8')

        # Mask up to three PERSON/NORP entities with the placeholders COMET
        # expects.  This does not depend on the category, so it is done once.
        # The original never recorded the masked names in `replaced`, which
        # left the name substitution below unreachable; they are recorded now.
        input_event = context
        replaced = []
        for entity in nlp(context).ents:
            if entity.label_ in ('PERSON', 'NORP'):
                input_event = input_event.replace(
                    entity.text, replacement_list[len(replaced)])
                replaced.append(entity.text)
                if len(replaced) == 3:
                    break

        for category in category_list:
            outputs = interactive.get_atomic_sequence(input_event, model,
                                                      sampler, data_loader,
                                                      text_encoder, category)
            for key in outputs:
                if key[0] == "o":
                    prefix = other_prefixes.get(key, "")
                else:
                    subject = replaced[0] if replaced else "Person"
                    prefix = subject + subject_suffixes.get(key, "")

                # Use the first informative beam among the top five; slicing
                # tolerates samplers that return fewer than five beams.
                for beam in outputs[key]["beams"][:5]:
                    if beam != 'none':
                        comet_inf = beam
                        if len(replaced) > 0:
                            comet_inf = comet_inf.replace(
                                "personx", replaced[0])
                        if len(replaced) > 1:
                            comet_inf = comet_inf.replace(
                                "persony", replaced[1])
                        article += prefix + comet_inf + ". "
                        break
        return article

    def process_example(example):
        """Replace the example's context with its COMET-augmented version."""
        example['context'] = tf.py_function(func=augment,
                                            inp=[example['context']],
                                            Tout=tf.string)
        return example

    ## End

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(split_name, model_tag, str(args.max_seq_length)),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features = features_and_dataset["features"]
        dataset = features_and_dataset["dataset"]
        examples = features_and_dataset["examples"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        needed_file = args.predict_file if evaluate else args.train_file
        if not args.data_dir and not needed_file:
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            # Only the training split is augmented.
            tfds_examples["train"] = tfds_examples["train"].map(process_example)
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
import json
import torch
from bert_squad import BERT_SQUAD
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadResult, SquadV2Processor
from transformers import BertModel, BertConfig, BertTokenizer

# ---------------------------------------------------------------------------
# Script-level setup for fine-tuning BERT_SQUAD on SQuAD v2 training data.
# ---------------------------------------------------------------------------

# Training hyper-parameter defined here for use further down the script.
num_epochs = 2

# Everything below runs on the GPU; there is no CPU fallback.
device = torch.device('cuda')

# TensorBoard writer (named `logger`, but it is not a `logging` logger).
logger = SummaryWriter('logs/bert_model')

# Tokenise the training examples found under ../data into model features.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
feature_processor = SquadV2Processor()
examples = feature_processor.get_train_examples('../data')
features, dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=512,
    doc_stride=128,
    max_query_length=128,
    is_training=True,
    return_dataset="pt",
    threads=1,
)

# NOTE(review): both loaders wrap the same training dataset; `dev_loader`
# does not iterate a held-out split - confirm this is intended.
train_loader = DataLoader(dataset, batch_size=6, shuffle=True)
dev_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# Model on the GPU, optimised with Adam.
dbs = BERT_SQUAD().to(device)
optimizer = torch.optim.Adam(dbs.parameters(), lr=3e-5)
def main():
    """Fine-tune (and optionally prepare evaluation for) a TF question-answering model.

    Parses ``ModelArguments``, ``DataTrainingArguments`` and
    ``TFTrainingArguments`` from the command line, loads the config,
    tokenizer and model, builds the train/eval ``tf.data`` datasets that
    were requested via ``do_train`` / ``do_eval`` and, when training is
    requested, trains and saves the model and tokenizer.

    Raises:
        ValueError: If the output directory already exists, is non-empty,
            training is requested and ``overwrite_output_dir`` is not set.
        ImportError: If ``use_tfds`` is set but tensorflow_datasets is not
            installed.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically"
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError as err:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            ) from err

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor(
        ) if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    # Each dataset is built only for the phase that was requested.  The
    # cardinality assertion has to sit inside the same guard: applying it to
    # a missing dataset (None) would crash runs that only train or only eval.
    # NOTE(review): the asserted cardinality is the number of examples; long
    # contexts may be split into several features per example - confirm the
    # two counts match for the configured max_seq_length.
    train_dataset = None
    if training_args.do_train:
        train_dataset = squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=True,
            return_dataset="tf",
        )
        train_dataset = train_dataset.apply(
            tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = None
    if training_args.do_eval:
        eval_dataset = squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=False,
            return_dataset="tf",
        )
        eval_dataset = eval_dataset.apply(
            tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Build the feature dataset from every JSON file in a directory.

    All files whose name contains ``.json`` in ``args.predict_dir`` (when
    ``evaluate``) or ``args.train_dir`` (otherwise) are read and their
    examples concatenated, then converted to features.  Nothing is cached on
    disk in this variant; features are rebuilt on every call.

    Args:
        args: Namespace read for ``local_rank``, ``dataset_type``,
            ``threads``, ``max_paragraph_length``, ``max_answer_text_length``,
            ``version_2_with_negative``, ``predict_dir``, ``train_dir``,
            ``max_seq_length``, ``doc_stride`` and ``max_query_length``.
        tokenizer: Forwarded to the feature conversion function.
        evaluate: Read the dev files when true, otherwise the train files.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    is_korquad = args.dataset_type in ['korquad2']

    if is_korquad:
        if args.max_answer_text_length is not None:
            processor = KorquadV2Processor(args.threads,
                                           args.max_paragraph_length,
                                           args.max_answer_text_length)
        else:
            processor = KorquadV2Processor(args.threads,
                                           args.max_paragraph_length)
    else:
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()

    if evaluate:
        source_dir = args.predict_dir
        read_examples = processor.get_dev_examples
    else:
        source_dir = args.train_dir
        read_examples = processor.get_train_examples

    ## Find json file names.  os.listdir returns entries in arbitrary order,
    ## so sort them to keep the example order the same on every run and rank.
    json_files = sorted(
        file_name for file_name in os.listdir(source_dir)
        if '.json' in file_name)

    ## Load json files
    examples = []
    for json_file in json_files:
        file_examples = read_examples(source_dir, filename=json_file)
        if file_examples:
            examples.extend(file_examples)

    # Both converters are called with the same keyword arguments.
    convert_examples = (korquad_convert_examples_to_features
                        if is_korquad else squad_convert_examples_to_features)
    features, dataset = convert_examples(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD feature dataset, with newlines stripped for MeCab.

    The cache file lives in ``args.data_dir`` (or the current directory) and
    is named after the split, ``args.tokenizer`` and ``args.max_seq_length``.
    An existing cache file is always reused.  On a cache miss the examples
    are loaded, every ``"\\n"`` is removed from their question, context,
    answer and title texts, and the features are built and cached.

    Args:
        args: Namespace read for ``data_dir``, ``tokenizer``,
            ``max_seq_length``, ``predict_file``, ``train_file``,
            ``version_2_with_negative``, ``doc_stride``,
            ``max_query_length`` and ``threads``.
        tokenizer: Forwarded to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split when true, otherwise the train split.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If the tensorflow_datasets fallback is needed but the
            package is not installed.
    """
    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        f"cached_{'dev' if evaluate else 'train'}_{args.tokenizer}_{args.max_seq_length}"
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info(
            f"Loading features from cached file {cached_features_file}")
        features_and_dataset = torch.load(cached_features_file)
        features = features_and_dataset["features"]
        dataset = features_and_dataset["dataset"]
        examples = features_and_dataset["examples"]
    else:
        logger.info(f"Creating features from dataset file at {input_dir}")

        needed_file = args.predict_file if evaluate else args.train_file
        if not args.data_dir and not needed_file:
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        # For MeCab tokenizer, we remove '\n' in all texts in all examples
        for example in examples:
            example.question_text = example.question_text.replace("\n", "")
            example.context_text = example.context_text.replace("\n", "")
            if example.answer_text is not None:
                example.answer_text = example.answer_text.replace("\n", "")
            example.title = example.title.replace("\n", "")

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        logger.info(f"Saving features into cached file {cached_features_file}")
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
def get_evaluation_metrics(
    model,
    tokenizer,
    data_dir: str,
    filename: str,
    per_gpu_batch_size: int = 32,
    num_batches: Optional[int] = None,
    disable_tqdm: bool = False,
    prediction_file_prefix: Optional[str] = None,
) -> Dict[str, "Number"]:
    """Score ``model`` on a SQuAD v2 dev file and return the metrics.

    Args:
        model: Model forwarded to ``get_squad_results``.
        tokenizer: Tokenizer used for featurisation and answer decoding.
        data_dir: Directory containing the dev file.
        filename: Name of the dev file inside ``data_dir``.
        per_gpu_batch_size: Batch size forwarded to ``get_dataset`` and
            ``get_squad_results``.
        num_batches: Forwarded to ``get_squad_results``.
        disable_tqdm: Forwarded to ``get_squad_results``.
        prediction_file_prefix: When given, predictions, n-best predictions
            and null log-odds are written to
            ``<prefix>_predictions.json``, ``<prefix>_nbest_predictions.json``
            and ``<prefix>_null_odds.json``.  When ``None`` (the default)
            no files are written, as before.

    Returns:
        An OrderedDict in the format:
        {
            'exact': 0.8169797018445212,
            'f1': 4.4469722448269335,
            'total': 11873,
            'HasAns_exact': 0.15182186234817813,
            'HasAns_f1': 7.422216845956518,
            'HasAns_total': 5928,
            'NoAns_exact': 1.4802354920100924,
            'NoAns_f1': 1.4802354920100924,
            'NoAns_total': 5945,
            'best_exact': 50.07159100480081,
            'best_exact_thresh': 0.0,
            'best_f1': 50.0772059855695,
            'best_f1_thresh': 0.0
        }
    """
    # These are not used in inference, only for scoring in `compute_predictions_logits()`.
    processor = SquadV2Processor()
    examples: List[SquadExample] = processor.get_dev_examples(
        data_dir, filename=filename)

    # The two get_dataset() calls differ only in `return_raw_features`.
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
    )
    features: List[SquadFeatures] = get_dataset(return_raw_features=True,
                                                **dataset_kwargs)
    # Here we get the dataset instead of just the features, with return_raw_features=False.
    dataset: tf.data.Dataset = get_dataset(return_raw_features=False,
                                           **dataset_kwargs)

    squad_results: List[SquadResult] = get_squad_results(
        model=model,
        dataset=dataset,
        features=features,
        per_gpu_batch_size=per_gpu_batch_size,
        num_batches=num_batches,
        disable_tqdm=disable_tqdm,
    )

    # The original kept a hard-coded, always-false switch here whose enabled
    # branch read an undefined `args`; the caller now opts in with a prefix.
    if prediction_file_prefix is not None:
        output_predictions_file = f"{prediction_file_prefix}_predictions.json"
        output_nbest_file = f"{prediction_file_prefix}_nbest_predictions.json"
        output_null_log_odds_file = f"{prediction_file_prefix}_null_odds.json"
    else:
        output_predictions_file = None
        output_nbest_file = None
        output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=squad_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=output_predictions_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )

    metrics: collections.OrderedDict = squad_evaluate(examples, predictions)
    return metrics
def load_and_cache_examples(data_dir: Path, tokenizer, task, max_seq_length, doc_stride, max_query_length, evaluate=False, model_name=None):
    """Download a SQuAD split, convert it to features and cache the result.

    The raw JSON file for the requested split is downloaded into
    ``data_dir`` on every call.  Features are cached in
    ``<data_dir>/cache_<dev|train>_<model_name>`` and reused when that file
    exists.

    Args:
        data_dir: Directory for the downloaded file and the feature cache;
            created if missing.
        tokenizer: Forwarded to ``squad_convert_examples_to_features``.
        task: ``"SQuAD1.1"`` or ``"SQuAD2.0"``.
        max_seq_length: Forwarded to the feature conversion.
        doc_stride: Forwarded to the feature conversion.
        max_query_length: Forwarded to the feature conversion.
        evaluate: Use the validation split when true, otherwise the train split.
        model_name: Used only as part of the cache file name.

    Returns:
        The tuple ``(dataset, examples, features)``.

    Raises:
        NameError: If ``task`` is not one of the supported names.
    """
    if task == "SQuAD1.1":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif task == "SQuAD2.0":
        train_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/train-v2.0-short.json"
        validation_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/dev-v2.0-short.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    # exist_ok avoids the check-then-create race when several workers start together.
    data_dir.mkdir(parents=True, exist_ok=True)

    # TODO: Cache instead of always downloading
    if evaluate:
        source_url, split_file = validation_url, validation_file
    else:
        source_url, split_file = train_url, train_file
    with urllib.request.urlopen(source_url) as response:
        # Store the payload byte-for-byte instead of decoding it and writing
        # it back in the platform's default text encoding.
        (data_dir / split_file).write_bytes(response.read())

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        str(data_dir.absolute()),
        "cache_{}_{}".format(
            "dev" if evaluate else "train",
            model_name,
        ),
    )

    # Init features and dataset from cache if it exists
    overwrite_cache = False  # Set to True to do a cache wipe (TODO: Make cache wipe configurable)
    if os.path.exists(cached_features_file) and not overwrite_cache:
        # print() does not interpolate extra arguments, so format explicitly.
        print("Loading features from cached file %s" % cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features = features_and_dataset["features"]
        dataset = features_and_dataset["dataset"]
        examples = features_and_dataset["examples"]
    else:
        if evaluate:
            examples = processor.get_dev_examples(data_dir, filename=validation_file)
        else:
            examples = processor.get_train_examples(data_dir, filename=train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
        )

        print("Saving features into cached file %s" % cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    return dataset, examples, features
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return the SQuAD-format feature dataset, optionally filtered by language.

    Features are cached in ``args.data_dir`` (or the current directory) and
    reused unless ``args.overwrite_cache`` is set.  When training examples
    are built from a local file and ``args.leave_out_languages`` and/or
    ``args.train_on_languages`` is set, a language-filtered copy of
    ``args.train_file`` is written next to it and ``args.train_file`` is
    re-pointed at that copy.  The language of an entry is the part of its
    first question id before the first ``-``.  If both options are set, both
    files are written from the unfiltered input and the ``train_on_languages``
    file is the one used.

    NOTE(review): the cache file name does not encode the language filter, so
    an existing cache is reused even if the filter options changed.

    Args:
        args: Namespace read for ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
            ``predict_file``, ``train_file``, ``version_2_with_negative``,
            ``leave_out_languages``, ``train_on_languages``, ``doc_stride``,
            ``max_query_length`` and ``threads``.  ``train_file`` may be
            reassigned as described above.
        tokenizer: Forwarded to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split when true, otherwise the train split.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If the tensorflow_datasets fallback is needed but the
            package is not installed.
    """

    def _write_language_subset(languages, keep):
        """Write a filtered copy of the train file; return (filename, kept, left_out)."""
        with open(os.path.join(args.data_dir, args.train_file), "r", encoding="utf-8") as reader:
            input_data = json.load(reader)

        # Same layout as the input: 'data' first, then every other top-level key.
        subset = {'data': []}
        for field, value in input_data.items():
            if field != 'data':
                subset[field] = value

        kept_count = 0
        left_out_count = 0
        for entry in input_data['data']:
            paragraph = entry["paragraphs"][0]  # only one paragraph per entry
            qa = paragraph["qas"][0]  # single question is sufficient to determine the language
            lang = qa['id'].split('-')[0]
            if (lang in languages) == keep:
                subset['data'].append(entry)
                kept_count += 1
            else:
                left_out_count += 1

        # "<train file without its 5-char '.json' suffix>-<lang>..." or "...-keep-<lang>..."
        marker = '-keep-' if keep else '-'
        filename = args.train_file[:-5]
        filename += ''.join(marker + lang for lang in languages)
        filename += '.json'
        with open(os.path.join(args.data_dir, filename), 'w', encoding='utf-8') as writer:
            json.dump(subset, writer)
        return filename, kept_count, left_out_count

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(split_name, model_tag, str(args.max_seq_length)),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features = features_and_dataset["features"]
        dataset = features_and_dataset["dataset"]
        examples = features_and_dataset["examples"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        needed_file = args.predict_file if evaluate else args.train_file
        if not args.data_dir and not needed_file:
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()

            if evaluate:
                # The language filters only concern the training file, so
                # evaluation no longer reads or rewrites it.
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                # Tydi specific
                tmp_filename = None
                if args.leave_out_languages is not None:
                    logger.info("Creating temporary trainig file at %s", args.data_dir)
                    leave_languages = args.leave_out_languages.split(',')
                    tmp_filename, _, left_out_count = _write_language_subset(
                        leave_languages, keep=False)
                    logger.info("No. of training examples left out %d", left_out_count)

                if args.train_on_languages is not None:
                    logger.info("Creating temporary training file at %s", args.data_dir)
                    keep_languages = args.train_on_languages.split(',')
                    tmp_filename, keep_count, left_out_count = _write_language_subset(
                        keep_languages, keep=True)
                    logger.info("No. of training examples left out %d", left_out_count)
                    logger.info("No. of training examples kept %d", keep_count)

                # Re-point the train file only after both filters have read
                # the unfiltered input.
                if tmp_filename is not None:
                    args.train_file = tmp_filename
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    """Finetune a question-answering model on SQuAD and return the last evaluation metrics.

    Training runs for ``total_steps`` steps on every Horovod rank. Progress
    reporting, validation, evaluation, checkpointing and TensorBoard logging
    happen on rank 0 only.

    Args:
        run_name: Name embedded in the checkpoint and TensorBoard log paths.
        fsx_prefix: Root directory. Data is read from ``{fsx_prefix}/squad_data``;
            checkpoints go to ``{fsx_prefix}/checkpoints/albert-squad`` and logs
            to ``{fsx_prefix}/logs/albert-squad``.
        pre_layer_norm: Accepted for interface compatibility; not read here.
        model_size: ALBERT size, used only to build the ``albert-{model_size}-v2``
            name when ``load_from == "huggingface"``.
        load_from: A ``TFPreTrainedModel`` instance to build the QA model from,
            or one of the strings ``"scratch"`` / ``"huggingface"``.
        load_step: Accepted for interface compatibility; not read here.
        batch_size: Batch size for the training and validation datasets.
        checkpoint_frequency: Save weights every this many steps. A falsy value
            (``None`` or ``0``) is replaced by 1000000. Step 0 and the final
            step always checkpoint.
        validate_frequency: Validate/evaluate every this many steps, with the
            same falsy-value and step-0/final-step behavior as above.
        learning_rate: Peak learning rate of the warmup/decay schedule.
        warmup_steps: Number of warmup steps of the schedule.
        total_steps: Number of training steps to run.
        dataset: One of ``"squadv1"``, ``"squadv2"`` or ``"debug"``.
        dummy_eval: If true, skip the real evaluation and use a fixed
            placeholder metrics dict instead.
        config: Model config; overwritten by ``load_from.config`` when
            ``load_from`` is a Keras model. Must not end up ``None``.

    Returns:
        On Horovod rank 0, the metrics dict of the most recent evaluation
        (the final step always evaluates). Other ranks fall off the end of
        the function and return ``None``.

    Raises:
        ValueError: If ``load_from`` is not a supported value.
        AssertionError: If no config is available or ``dataset`` is unknown.
    """
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000
    is_root = hvd.rank() == 0

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        # Debug mode trains on the (smaller) dev file.
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        # Raised explicitly (rather than `assert False`) so the check still
        # fires when Python runs with -O; the exception type is unchanged.
        raise AssertionError("--dataset must be one of ['squadv1', 'squadv2', 'debug']")

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if is_root:
        print("Starting finetuning")
        # `total` must be passed by keyword: the first positional parameter of
        # tqdm is the iterable, so a bare int would leave the bar without a total.
        pbar = tqdm.tqdm(total=total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        # Evaluated only for logging; kept in its own local so the
        # `learning_rate` argument is not shadowed.
        current_lr = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if is_root:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    # Fixed placeholder metrics used instead of a real evaluation.
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results = get_evaluation_metrics(
                        model=model,
                        data_dir=data_dir,
                        filename=val_filename,
                        batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}"
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", current_lr, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step
                    )

        # Every rank must leave the loop, not only rank 0.
        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if is_root:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
def run_benchmark(tokenizer,
                  model,
                  small_portion: bool,
                  device: str = 'cuda',
                  k: int = 10,
                  mu: float = None,
                  use_ir_score: bool = False):
    """Run the retrieval + reading benchmark on the open SQuAD questions.

    For every selected question the function retrieves ``k`` passages with a
    BM25 searcher, records whether the reference answer string occurs in any
    retrieved passage, runs the reader over the passages, and keeps the
    highest-scoring candidate answer whose text is not ``"empty"``. After all
    questions are processed it computes exact-match and F1 with ``evaluate``
    and reports them, together with the retrieval hit rate, on stdout and via
    ``write_in_result_file``.

    Args:
        tokenizer: Tokenizer handed to feature conversion and to
            ``process_one_question``.
        model: Reader model; moved to ``device`` before the loop.
        small_portion: If true, evaluate on 100 randomly chosen questions.
            NOTE: this reseeds NumPy's global RNG with 42.
        device: Torch device string the model is moved to.
        k: Number of passages retrieved per question.
        mu: Forwarded to ``process_one_question`` when ``use_ir_score`` is
            true (presumably a reader/retriever score mixing weight — confirm
            against that helper).
        use_ir_score: If true, the retrieval scores and ``mu`` are forwarded
            to ``process_one_question``.

    Returns:
        None. Results are printed and written to the result file.
    """
    # initializing pyserini's searcher
    searcher = SimpleSearcher(
        'formatted_open_squad/indexes/paragraphs_indexing')
    searcher.set_bm25()
    searcher.unset_rm3()

    # loading squad
    processor = SquadV2Processor()

    counter = 0  # questions whose answer string appears in a retrieved passage
    model.to(torch.device(device))

    # Context manager closes the handle; JSON is read explicitly as UTF-8
    # instead of relying on the platform default encoding.
    with open("SQuAD_1_1/dev-v1.1.json", 'r', encoding='utf-8') as squad_file:
        squad_dataset = json.load(squad_file)['data']
    # NOTE(review): pickle must only be used on trusted, locally produced files.
    with open('formatted_open_squad/open_squad.pkl', 'rb') as f1:
        squad1_for_orqa = pickle.load(f1)

    ans_predictions = {}

    n_questions = len(squad1_for_orqa['questions'])
    if small_portion:
        np.random.seed(42)
        id_examples = np.random.permutation(n_questions)[:100]
    else:
        id_examples = np.arange(n_questions)

    # Main loop : evaluation IR and ODQA
    for i in id_examples:
        print(i)
        curr_question = squad1_for_orqa['questions'][i]
        curr_answer = squad1_for_orqa['answers'][i]
        print('Question : ', curr_question)
        print('Answer : ', curr_answer)

        hits = searcher.search(curr_question, k=k)
        ir_scores = []
        paragraphs = []
        is_in = False
        for hit in hits:
            passage = hit.raw
            ir_scores.append(hit.score)
            is_in = is_in or (curr_answer in passage)
            paragraphs.append(passage)

        if is_in:
            counter += 1

        input_ = build_squad_input(curr_question, paragraphs)
        examples = processor._create_examples(input_["data"], "dev")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )

        if use_ir_score:
            _, predictions = process_one_question(
                features, dataset, model, tokenizer, examples, device, True,
                mu, ir_scores)
        else:
            _, predictions = process_one_question(
                features, dataset, model, tokenizer, examples, device)

        candidates = predictions['0']
        scores = np.array([(p['start_logit'] + p['end_logit'])
                           for p in candidates])
        texts = [p['text'] for p in candidates]

        # Candidate indices from best to worst score. A single argsort is
        # required here: applying argsort twice yields each candidate's rank,
        # which is not a valid visiting order when used as an index.
        ranked_indexes = scores.argsort()[::-1]

        # Take the best candidate that is not the "empty" placeholder; if all
        # are "empty", the last one examined is kept.
        predicted_p_index = 0
        for candidate_index in ranked_indexes:
            predicted_p_index = candidate_index
            if texts[candidate_index] != "empty":
                break

        predicted_answer = texts[predicted_p_index]
        ans_predictions[squad1_for_orqa['ids'][i]] = predicted_answer
        print('Predicted Answer : ', predicted_answer)

    evaluation = evaluate(squad_dataset,
                          ans_predictions,
                          ignore_missing_qids=True)
    em = evaluation['exact_match']
    f1 = evaluation['f1']
    write_in_result_file(
        f"Running evaluation on {len(ans_predictions)} predictions")
    write_in_result_file(f"exact_match: {em}, f1: {f1}")

    # Fraction of questions for which retrieval surfaced the answer string.
    ir_hit_rate = counter / len(id_examples)
    print("IR : ", ir_hit_rate)
    write_in_result_file(f"IR : {ir_hit_rate}")
    print(f"exact_match: {em}, f1: {f1}")