def get_data_bert(max_seq_length, batch_sizes):
    """
    Args:
        max_seq_length: (int) Max sequence length of the sentences
        batch_sizes: (sequence of int) Batch sizes for the train, val and test DataLoaders
    Output:
        train_dataloader, val_dataloader, test_dataloader
    """
    # Load data
    train, val, test = load_data()

    # Clean data
    X_train, y_train = clean_data(train)
    X_val, y_val = clean_data(val)
    X_test, y_test = clean_data(test)

    # Convert to features
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    train_examples = convert_examples_to_features(X_train, y_train, max_seq_length, tokenizer)
    val_examples = convert_examples_to_features(X_val, y_val, max_seq_length, tokenizer)
    test_examples = convert_examples_to_features(X_test, y_test, max_seq_length, tokenizer)

    # Data loaders
    train_dataloader = get_dataloader(train_examples, batch_sizes[0])
    val_dataloader = get_dataloader(val_examples, batch_sizes[1])
    test_dataloader = get_dataloader(test_examples, batch_sizes[2])

    return train_dataloader, val_dataloader, test_dataloader
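# A minimal usage sketch for get_data_bert above; the sequence length and
# batch sizes are illustrative, and load_data/clean_data/get_dataloader are
# assumed to be defined elsewhere in the module.
train_dl, val_dl, test_dl = get_data_bert(max_seq_length=128, batch_sizes=(32, 64, 64))
for batch in train_dl:
    pass  # training step goes here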
def prepare_eval_examples(self):
    self.eval_examples = self.processor.get_dev_examples(
        self.args.data_dir, self.args.dev_file)
    input_length_arr = []
    if self.processor.is_pair():
        truncate_seq_pair = lambda tokens_a, tokens_b, max_length: \
            self.processor.truncate_seq_pair(tokens_a, tokens_b, max_length)
        self.eval_features = convert_examples_to_features(
            self.eval_examples, self.label_list, self.args.max_seq_length,
            self.tokenizer, self.output_mode, self.logger, input_length_arr,
            truncate_seq_pair=truncate_seq_pair)
    else:
        self.eval_features = convert_examples_to_features(
            self.eval_examples, self.label_list, self.args.max_seq_length,
            self.tokenizer, self.output_mode, self.logger, input_length_arr)

    all_input_ids = torch.tensor([f.input_ids for f in self.eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in self.eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in self.eval_features],
                                   dtype=torch.long)

    input_length_arr = np.array(input_length_arr)
    print("Eval input_length_arr: max={}, min={}, avg={}".format(
        np.max(input_length_arr), np.min(input_length_arr),
        np.mean(input_length_arr)))

    if self.output_mode == "classification":
        self.eval_all_label_ids = torch.tensor(
            [f.label_id for f in self.eval_features], dtype=torch.long)
    elif self.output_mode == "regression":
        self.eval_all_label_ids = torch.tensor(
            [f.label_id for f in self.eval_features], dtype=torch.float)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              self.eval_all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    self.eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                      batch_size=self.args.eval_batch_size)
def load_examples(tokenizer, mode="train"):
    sst2 = Sst2Processor()

    # examples: list of InputExample objects
    if mode == "train":
        examples = sst2.get_train_examples()
    elif mode == "dev":
        examples = sst2.get_dev_examples()
    else:
        examples = None

    # features: list of InputFeatures
    features = convert_examples_to_features(examples, tokenizer,
                                            sst2.get_labels(),
                                            max_length=args.max_seq_length)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.int64)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.int64)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.int64)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.int64)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
def prepare_data(self):
    "Called to initialize data. Use the call to construct features"
    args = self.hparams
    processor = ChnSentiCorpProcessor()
    self.labels = processor.get_labels()

    for mode in ["train", "dev"]:
        cached_features_file = self._feature_file(mode)
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            features = torch.load(cached_features_file)
        else:
            logger.info("Creating features from dataset file at %s", args.data_dir)
            examples = (
                processor.get_dev_examples(args.data_dir)
                if mode == "dev"
                else processor.get_train_examples(args.data_dir)
            )
            features = convert_examples_to_features(
                examples,
                self.tokenizer,
                processor,
                max_length=args.max_seq_length,
                label_list=self.labels,
                output_mode=args.output_mode,
            )
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
def load_and_cache_bert_example(args, tokenizer, type='train'):
    '''
    Load or cache the InputExamples and return a TensorDataset.
    :param args:
    :param tokenizer:
    :param type: one of 'train', 'dev' or 'test'
    :return: TensorDataset
    '''
    task_class = processors[args.task_name]()
    # file: e.g. normal_bert_cached_train_256_imdb
    file = 'normal_bert_cached_{}_{}_{}'.format(type, args.max_seq_length, args.task_name)
    cached_features_file = os.path.join(args.cache_dir, file)
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = task_class.get_labels()
        if type == 'train':
            examples = task_class.get_train_examples(args.data_dir)
        elif type == 'dev':
            examples = task_class.get_dev_examples(args.data_dir)
        else:
            examples = task_class.get_test_examples(args.data_dir)
        features = utils.convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,  # cls segment id
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    all_lengths = torch.tensor([f.length for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_lengths, all_label_ids)
    return dataset
def load_and_cache_examples(task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = args['output_mode']
    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        args['data_dir'],
        f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")

    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        print("label list", label_list)
        examples = (processor.get_dev_examples(args['data_dir'])
                    if evaluate else processor.get_train_examples(args['data_dir']))
        features = convert_examples_to_features(
            examples,
            label_list,
            args['max_seq_length'],
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            pad_on_left=bool(args['model_type'] in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def load_and_cache_examples(config, task, tokenizer, evaluate=False):
    if config.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    processor = data_processors[config.task_name]()
    # Feature caching is currently disabled:
    # cached_features_file = os.path.join(config.data_dir, 'cached_{}_{}_{}_{}'.format(
    #     'dev' if evaluate else 'train',
    #     list(filter(None, 'bert-large-uncased'.split('/'))).pop(),
    #     str(config.max_seq_len),
    #     str(task)))
    # if os.path.exists(cached_features_file):
    #     logger.info("Loading features from cached file %s", cached_features_file)
    #     features = torch.load(cached_features_file)
    # else:
    logger.info("Creating features from dataset file at %s", config.data_dir)
    label_list = processor.get_labels()
    examples = (processor.get_dev_examples(config.data_dir)
                if evaluate else processor.get_train_examples(config.data_dir))
    features = convert_examples_to_features(
        examples,
        label_list,
        config.max_seq_len,
        tokenizer,
        "classification",
        use_entity_indicator=config.use_entity_indicator)
    # if config.local_rank in [-1, 0]:
    #     logger.info("Saving features into cached file %s", cached_features_file)
    #     torch.save(features, cached_features_file)

    if config.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    output_mode = "classification"
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long)  # add e1 mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long)  # add e2 mask
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids, all_e1_mask, all_e2_mask)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = XNLIProcessor()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.cache_dir,
        "cached_{}_beto_{}_{}_es".format(
            os.path.basename(args.eval_file_path if evaluate else args.train_file_path),
            str(args.max_seq_length),
            str(task),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info(
            "Creating features from dataset file at %s",
            args.eval_file_path if evaluate else args.train_file_path,
        )
        label_list = processor.get_labels()
        examples = (processor.get_eval_examples(args.eval_file_path)
                    if evaluate else processor.get_train_examples(args.train_file_path))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            processor,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
        raise ValueError("No other `output_mode` for XNLI.")

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
    elif output_mode == 'multi_label':
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
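# Sketch of how the TensorDataset returned above is typically consumed for
# training; RandomSampler and args.train_batch_size are illustrative here,
# not part of the function itself.
from torch.utils.data import DataLoader, RandomSampler

train_dataset = load_and_cache_examples(args, task, tokenizer, evaluate=False)
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=args.train_batch_size)
for input_ids, input_mask, segment_ids, label_ids in train_dataloader:
    pass  # forward/backward pass goes here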
def test_beforeafter_mask():
    exs = get_beforeafter_examples("beforeafter/examples/", num_examples=1000)
    fes = convert_examples_to_features(
        examples=exs,
        tokenizer=tokenizer,
        max_seq_length=100,
        doc_stride=128,
        mask='beforeafter')
    return exs, fes
def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    processor = myprocessors[task](args.data_dir)
    output_mode = output_modes[task]

    # Feature caching is currently disabled:
    # cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
    #     'dev' if evaluate else 'train',
    #     list(filter(None, args.model_name_or_path.split('/'))).pop(),
    #     str(args.max_seq_length),
    #     str(task)))
    # if os.path.exists(cached_features_file):
    #     logger.info("Loading features from cached file %s", cached_features_file)
    #     features = torch.load(cached_features_file)
    # else:
    logger.info("Creating features from dataset file at %s", args.data_dir)
    label_list = processor.get_labels()
    if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
        # HACK: label indices are swapped in the RoBERTa pretrained model
        label_list[1], label_list[2] = label_list[2], label_list[1]
    if test:
        examples = processor.get_test_examples(args.data_dir)
    else:
        examples = (processor.get_dev_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))

    pad_on_left = bool(args.model_type in ['xlnet'])  # pad on the left for xlnet
    pad_token = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    pad_token_segment_id = 4 if args.model_type in ['xlnet'] else 0
    cls_token = tokenizer.convert_tokens_to_ids([tokenizer.cls_token])[0]
    sep_token = tokenizer.convert_tokens_to_ids([tokenizer.sep_token])[0]
    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=label_list,
        max_length=args.max_seq_length,
        output_mode=output_mode,
        pad_on_left=pad_on_left,
        pad_token=pad_token,
        pad_token_segment_id=pad_token_segment_id,
        max_path=args.max_concepts,
        max_path_len=args.max_concept_len,
        cls_token=cls_token,
        sep_token=sep_token)
    # if args.local_rank in [-1, 0]:
    #     logger.info("Saving features into cached file %s", cached_features_file)
    #     torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    return MyDataset(features, pad_on_left, pad_token, pad_token_segment_id,
                     cls_token, sep_token, len(label_list))
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
        raw_texts = []
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features, raw_texts = convert_examples_to_features(
            examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ['roberta']),
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_ext_mask = torch.tensor([f.extraction_mask for f in features], dtype=torch.float)
    all_ext_start_ids = torch.tensor([f.extraction_start_ids for f in features], dtype=torch.long)
    all_ext_end_ids = torch.tensor([f.extraction_end_ids for f in features], dtype=torch.long)
    all_aug_mask = torch.tensor([f.augmentation_mask for f in features], dtype=torch.float)
    all_aug_start_ids = torch.tensor([f.augmentation_start_ids for f in features], dtype=torch.long)
    all_aug_end_ids = torch.tensor([f.augmentation_end_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids, all_ext_mask, all_ext_start_ids,
                            all_ext_end_ids, all_aug_mask, all_aug_start_ids,
                            all_aug_end_ids)
    return dataset, raw_texts
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    features = convert_examples_to_features(
        task,
        tokenizer,
        max_length=args.max_seq_length,
        pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
    )
    return features  # the original snippet dropped the result; returning it here
def to_bert_input(self, dataset_pd):
    x_input = dataset_pd.apply(
        lambda x: InputExample(
            guid=None,
            text_a=x[self.DATA_COLUMN],
            text_b=x[self.DATA2_COLUMN] if self.DATA2_COLUMN else None,
            label=x[self.LABEL_COLUMN]),
        axis=1)
    x_features = convert_examples_to_features(x_input, self.label_list,
                                              self.max_seq_length, self.tokenizer)
    x_input_ids, x_input_masks, x_segment_ids, x_labels = self.get_features(x_features)
    return (x_input_ids, x_input_masks, x_segment_ids), x_labels
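# Hypothetical call to to_bert_input, assuming an instance (here named
# `converter`) whose DATA_COLUMN and LABEL_COLUMN match the DataFrame below
# and whose DATA2_COLUMN is unset.
import pandas as pd

df = pd.DataFrame({"text": ["great movie", "terrible plot"], "label": [1, 0]})
(x_ids, x_masks, x_segments), y = converter.to_bert_input(df)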
def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    processor = processors[task]()

    # Load data features from cache or dataset file
    if evaluate:
        cached_mode = 'dev'
    elif test:
        cached_mode = 'test'
    else:
        cached_mode = 'train'
    assert not (evaluate and test)
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            cached_mode,
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir)
        elif test:
            examples = processor.get_test_examples(args.data_dir)
        else:
            examples = processor.get_train_examples(args.data_dir)
        logger.info("Training number: %s", str(len(examples)))
        features = convert_examples_to_features(
            examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def get_features(examples):
    features = convert_examples_to_features(examples, args.tokenizer, args, stage="test")
    all_source_ids = torch.tensor(
        [f.source_ids[:args.max_source_length] for f in features], dtype=torch.long)
    all_source_mask = torch.tensor(
        [f.source_mask[:args.max_source_length] for f in features], dtype=torch.long)
    return TensorDataset(all_source_ids, all_source_mask)
def tokenize(sentence):
    # NOTE: assumes module-level model_type, max_seq_len, output_mode, tokenizer
    # and eval_batch_size; the original compared the string literal 'model_type'
    # against model names (always False), which is fixed here.
    sentence = sentence.replace('\n', '')
    test_examples = [InputExample(0, sentence, None, '0')]
    label_list = ["0", "1"]
    test_features = convert_examples_to_features(
        test_examples,
        label_list,
        max_seq_len,
        tokenizer,
        output_mode,
        cls_token_at_end=bool(model_type == 'xlnet'),  # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if model_type == 'xlnet' else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(model_type == 'roberta'),
        pad_on_left=bool(model_type == 'xlnet'),  # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if model_type == 'xlnet' else 0)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    test_sampler = SequentialSampler(test_data)
    eval_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=eval_batch_size)

    for batch in eval_dataloader:
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2],
            'labels': batch[3]
        }
        return inputs  # single example, so the first batch holds everything
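# Sketch of feeding the dict built by tokenize() to a sequence classifier;
# `model` is assumed to be a loaded classification model (e.g.
# BertForSequenceClassification) that returns logits as its first output
# when no labels are passed.
inputs = tokenize("This movie was surprisingly good.")
inputs.pop('labels')  # the dummy label is not needed for inference
with torch.no_grad():
    logits = model(**inputs)[0]
pred = torch.argmax(logits, dim=-1)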
def load_and_cached_examples(args, tokenizer_a, tokenizer_b=None, evaluate=False,
                             output_examples=False, is_double=True):
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_type.split('/'))).pop(),
            str(args.max_seq_length)))
    if (os.path.exists(cached_features_file) and not args.overwrite_cache
            and not output_examples):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_cross_examples(input_file=input_file)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer_a=tokenizer_a,
            tokenizer_b=tokenizer_b,
            max_seq_length=args.max_seq_length,
            is_double=is_double)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_a_ids = torch.tensor([f.input_a_id for f in features], dtype=torch.long)
    all_input_b_ids = torch.tensor([f.input_b_id for f in features], dtype=torch.long)
    all_input_a_mask = torch.tensor([f.input_a_mask for f in features], dtype=torch.long)
    all_input_b_mask = torch.tensor([f.input_b_mask for f in features], dtype=torch.long)
    all_input_a_length = torch.tensor([f.input_a_length for f in features], dtype=torch.long)
    all_input_b_length = torch.tensor([f.input_b_length for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_a_ids, all_input_b_ids, all_input_a_mask,
                            all_input_b_mask, all_input_a_length,
                            all_input_b_length, all_labels)
    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    # The original guards referenced an undefined `evaluate`; they are expressed
    # in terms of the `mode` parameter here.
    if args.local_rank not in [-1, 0] and mode == "train":
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator between pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args.model_type in ["roberta"]),
            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == "train":
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def prepare_run_examples(self):
    self.run_examples = self.processor.get_dev_examples(
        self.args.data_dir, self.args.dev_file)
    input_length_arr = []
    if self.processor.is_pair():
        truncate_seq_pair = lambda tokens_a, tokens_b, max_length: \
            self.processor.truncate_seq_pair(tokens_a, tokens_b, max_length)
        self.run_features = convert_examples_to_features(
            self.run_examples, self.label_list, self.args.max_seq_length,
            self.tokenizer, self.output_mode, self.logger, input_length_arr,
            truncate_seq_pair=truncate_seq_pair)
    else:
        self.run_features = convert_examples_to_features(
            self.run_examples, self.label_list, self.args.max_seq_length,
            self.tokenizer, self.output_mode, self.logger, input_length_arr)

    all_input_ids = torch.tensor([f.input_ids for f in self.run_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in self.run_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in self.run_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    self.run_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=self.args.eval_batch_size)
def load_and_cache_examples(self, task, tokenizer, sentence, evaluate=False):
    task = self.args['task_name']
    processor = processors[task]()
    output_mode = self.args['output_mode']

    # Wrap the single sentence in an InputExample with a dummy label
    label = '1'
    set_type = "dev"
    guid = "%s-%s" % (set_type, 0)
    examples = [InputExample(guid=guid, text_a=sentence, text_b=None, label=label)]

    label_list = processor.get_labels()
    features = convert_examples_to_features(
        examples,
        label_list,
        self.args['max_seq_length'],
        tokenizer,
        output_mode,
        cls_token_at_end=bool(self.args['model_type'] in ['xlnet']),  # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        sep_token=tokenizer.sep_token,
        cls_token_segment_id=2 if self.args['model_type'] in ['xlnet'] else 0,
        pad_on_left=bool(self.args['model_type'] in ['xlnet']),  # pad on the left for xlnet
        pad_token_segment_id=4 if self.args['model_type'] in ['xlnet'] else 0)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def matres_train_examples(tokenizer, lm='roberta', mask_events=False, mask_context=False):
    train_examples, _ = matres_examples()
    train_examples, train_features = convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context)
    train_data = make_tensor_dataset(train_features, model=lm)
    return train_examples, train_data, train_features
def load_and_cache_examples(args, tokenizer, processor, mode):
    # The original guards referenced an undefined `evaluate`; they are expressed
    # in terms of the `mode` parameter here.
    if args.local_rank not in [-1, 0] and mode == 'train':
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = (processor.get_train_examples(args.data_dir) if mode == 'train'
                    else processor.get_dev_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            processor,
            args.max_seq_length,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == 'train':
        # Make sure only the first process in distributed training processes
        # the dataset; the others will use the cache
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def matres_dev_examples(tokenizer, lm='roberta', mask_events=False, mask_context=False):
    _, dev_examples = matres_examples()
    dev_examples, dev_features = convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="md")
    dev_data = make_tensor_dataset(dev_features, model=lm)
    return dev_examples, dev_data
def udst(tokenizer, lm='roberta', split="train", example_dir="udst/all_annotations/",
         mask_events=False, mask_context=False):
    exs = parse_udst.get_examples(example_dir=example_dir, split=split)
    exs, feats = convert_examples_to_features(
        examples=exs,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=False,
        mask_events=mask_events,
        mask_context=mask_context)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data, feats
def load_and_cache_examples(task, tokenizer, evaluate=False, input_file="train"):
    processor = processors[task]()
    output_mode = args['output_mode']
    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        args['data_dir'],
        f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")

    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args['data_dir'], input_file)
                    if evaluate else processor.get_train_examples(args['data_dir'], input_file))
        features = convert_examples_to_features(
            examples, label_list, args['max_seq_length'], tokenizer, output_mode,
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator between pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args['model_type'] in ['roberta']),
            pad_on_left=bool(args['model_type'] in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
def distant_test_examples(tokenizer, lm='roberta', train=False, mask=False, mask_events=False):
    with open('timex/orig/test_exs.pkl', 'rb') as f:
        exs = pickle.load(f)
    if mask:
        mask = 'distant'
    exs, feats = convert_examples_to_features(
        examples=exs,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=mask,
        mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
def udst_majority(tokenizer, lm='roberta', example_dir="udst/all_annotations/",
                  split="dev", mask_events=False, ties=True):
    exs = parse_udst.get_majority_examples(example_dir=example_dir,
                                           split=split, ties=ties)
    exs, feats = convert_examples_to_features(
        examples=exs,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask=False,
        mask_events=mask_events)
    data = make_tensor_dataset(feats, model=lm)
    return exs, data
def matres_test_examples(tokenizer, lm='roberta', mask_events=False, mask_context=False):
    loader = MatresLoader()
    examples = loader.read_test_examples(doc_dir="timebank/te3-platinum/",
                                         rel_dir="timebank/MATRES/")
    examples, features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        doc_stride=DOC_STRIDE,
        mask_events=mask_events,
        mask_context=mask_context,
        id_prefix="mt")
    data = make_tensor_dataset(features, model=lm)
    return examples, data
def load_and_cache_test_samples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_{}_{}_{}'.format(
            'test',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = processor.get_test_examples(args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            predict=True)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset (no labels at prediction time)
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    return dataset
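# Sketch of running prediction over the unlabeled test dataset built above;
# SequentialSampler keeps example order so predictions line up with the
# original rows. `model` and args.eval_batch_size are assumed to exist.
from torch.utils.data import DataLoader, SequentialSampler

test_dataset = load_and_cache_test_samples(args, task, tokenizer)
test_dataloader = DataLoader(test_dataset,
                             sampler=SequentialSampler(test_dataset),
                             batch_size=args.eval_batch_size)
preds = []
for input_ids, attention_mask, token_type_ids in test_dataloader:
    with torch.no_grad():
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)[0]
    preds.extend(torch.argmax(logits, dim=-1).tolist())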