def _convert(_examples):
    """Convert a list of InputExample objects to features and stacked tensors.

    Uses enclosing-scope variables: args, tokenizer, output_mode,
    process_count, multi_label, evaluate.

    Returns:
        (_features, (all_input_ids, all_input_mask, all_segment_ids,
        all_label_ids)) — features list plus the four long/float tensors.

    Raises:
        NotImplementedError: sliding-window evaluation is unsupported.
        ValueError: if output_mode is neither "classification" nor
            "regression" (the original would hit a NameError instead).
    """
    # Fail fast: the original raised this only AFTER the expensive
    # conversion below, wasting all that work.
    if args["sliding_window"] and evaluate:
        raise NotImplementedError

    _features = convert_examples_to_features(
        _examples,
        args["max_seq_length"],
        tokenizer,
        output_mode,
        # XLNet has a CLS token at the end
        cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
        sep_token=tokenizer.sep_token,
        # RoBERTa uses an extra separator b/w pairs of sentences,
        # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        sep_token_extra=bool(args["model_type"] in ["roberta"]),
        # PAD on the left for XLNet
        pad_on_left=bool(args["model_type"] in ["xlnet"]),
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
        process_count=process_count,
        multi_label=multi_label,
        silent=args["silent"],
        use_multiprocessing=args["use_multiprocessing"],
        sliding_window=args["sliding_window"],
        # During training, sliding-window features are flattened.
        flatten=not evaluate,
        stride=args["stride"],
    )

    all_input_ids = torch.tensor(
        [f.input_ids for f in _features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in _features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in _features], dtype=torch.long)

    # Labels: long for classification, float for regression. Raise on any
    # other mode instead of leaving all_label_ids unbound.
    if output_mode == "classification":
        all_label_ids = torch.tensor(
            [f.label_id for f in _features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor(
            [f.label_id for f in _features], dtype=torch.float)
    else:
        raise ValueError(f"Unknown output_mode: {output_mode}")

    return _features, (all_input_ids, all_input_mask, all_segment_ids,
                       all_label_ids)
def load_and_cache_examples(self, examples, evaluate=False, no_cache=False,
                            multi_label=False):
    """
    Converts a list of InputExample objects to a TensorDataset containing
    InputFeatures. Caches the InputFeatures.

    Utility function for train() and eval() methods. Not intended to be used
    directly.

    Args:
        examples: list of InputExample objects to featurize.
        evaluate: True for the "dev" (evaluation) split, False for "train".
        no_cache: if True, neither read nor write the feature cache.
        multi_label: forwarded to convert_examples_to_features; also forces
            classification output even when args["regression"] is set.

    Returns:
        TensorDataset, or (TensorDataset, window_counts) when sliding-window
        evaluation is enabled.
    """
    process_count = self.args["process_count"]
    tokenizer = self.tokenizer
    args = self.args

    if not multi_label and args["regression"]:
        output_mode = "regression"
    else:
        output_mode = "classification"

    # BUG FIX: os.mkdir failed when cache_dir had missing parent directories
    # and raced with the isdir() check; makedirs(exist_ok=True) handles both.
    os.makedirs(self.args["cache_dir"], exist_ok=True)

    mode = "dev" if evaluate else "train"
    cached_features_file = os.path.join(
        args["cache_dir"],
        "cached_{}_{}_{}_{}_{}".format(
            mode,
            args["model_type"],
            args["max_seq_length"],
            self.num_labels,
            len(examples),
        ),
    )

    # Reuse the cache unless reprocessing was requested; for the dev split,
    # use_cached_eval_features can force cache reuse regardless.
    if os.path.exists(cached_features_file) and (
            (not args["reprocess_input_data"] and not no_cache)
            or (mode == "dev" and args["use_cached_eval_features"])):
        features = torch.load(cached_features_file)
        print(f"Features loaded from cache at {cached_features_file}")
    else:
        # No placeholders here, so no f-string needed (original used one).
        print("Converting to features started. Cache is not used.")
        features = convert_examples_to_features(
            examples,
            args["max_seq_length"],
            tokenizer,
            output_mode,
            # XLNet has a CLS token at the end
            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # RoBERTa uses an extra separator b/w pairs of sentences,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args["model_type"] in ["roberta"]),
            # PAD on the left for XLNet
            pad_on_left=bool(args["model_type"] in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids(
                [tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
            process_count=process_count,
            multi_label=multi_label,
            silent=args["silent"],
            use_multiprocessing=args["use_multiprocessing"],
            sliding_window=args["sliding_window"],
            flatten=not evaluate,
            stride=args["stride"],
        )

        if not no_cache:
            torch.save(features, cached_features_file)

    if args["sliding_window"] and evaluate:
        # Remember how many windows each sample produced, then flatten the
        # nested feature lists so they stack into tensors.
        window_counts = [len(sample) for sample in features]
        features = [
            feature for feature_set in features for feature in feature_set
        ]

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)

    # Labels: long for classification, float for regression. Raise on any
    # other mode instead of leaving all_label_ids unbound (NameError).
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    else:
        raise ValueError(f"Unknown output_mode: {output_mode}")

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)

    if args["sliding_window"] and evaluate:
        return dataset, window_counts
    return dataset
def load_and_cache_examples(self, examples, evaluate=False, no_cache=False,
                            multi_label=False, verbose=True, silent=False):
    """
    Converts a list of InputExample objects to a TensorDataset containing
    InputFeatures. Caches the InputFeatures.

    Utility function for train() and eval() methods. Not intended to be used
    directly.

    Args:
        examples: list of InputExample objects to featurize.
        evaluate: True for the "dev" (evaluation) split, False for "train".
        no_cache: if True (or args.no_cache), skip the feature cache.
        multi_label: forwarded to convert_examples_to_features; also forces
            classification output even when args.regression is set.
        verbose: emit informational log messages.
        silent: suppress conversion progress output.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids).
    """
    process_count = self.args.process_count
    tokenizer = self.tokenizer
    args = self.args

    if not no_cache:
        no_cache = args.no_cache

    if not multi_label and args.regression:
        output_mode = "regression"
    else:
        output_mode = "classification"

    makedirs(self.args.cache_dir, exist_ok=True)

    mode = "dev" if evaluate else "train"
    cached_features_file = join(
        args.cache_dir,
        "cached_{}_{}_{}_{}_{}".format(
            mode,
            args.model_type,
            args.max_seq_length,
            self.num_labels,
            len(examples),
        ),
    )

    if exists(cached_features_file) and (
            (not args.reprocess_input_data and not no_cache)
            or (mode == "dev" and args.use_cached_eval_features
                and not no_cache)):
        features = torch.load(cached_features_file)
        if verbose:
            logger.info(
                f" Features loaded from cache at {cached_features_file}")
    else:
        if verbose:
            logger.info(
                " Converting to features started. Cache is not used.")
            if args.sliding_window:
                logger.info(" Sliding window enabled")

        # If labels_map is defined, then labels need to be replaced with ints
        if self.args.labels_map:
            for example in examples:
                if multi_label:
                    example.label = [
                        self.args.labels_map[label]
                        for label in example.label
                    ]
                else:
                    example.label = self.args.labels_map[example.label]

        features = convert_examples_to_features(
            examples,
            args.max_seq_length,
            tokenizer,
            output_mode,
            # XLNet has a CLS token at the end
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            # RoBERTa uses an extra separator b/w pairs of sentences,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(
                args.model_type in
                ["roberta", "camembert", "xlmroberta", "longformer"]),
            # PAD on the left for XLNet
            pad_on_left=bool(args.model_type in ["xlnet"]),
            pad_token=tokenizer.convert_tokens_to_ids(
                [tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            process_count=process_count,
            multi_label=multi_label,
            silent=args.silent or silent,
            use_multiprocessing=args.use_multiprocessing,
            sliding_window=args.sliding_window,
            flatten=not evaluate,
            stride=args.stride,
            # BPE tokenizers need a leading space to tokenize consistently.
            add_prefix_space=bool(
                args.model_type in
                ["roberta", "camembert", "xlmroberta", "longformer"]),
            args=args,
        )
        if verbose and args.sliding_window:
            logger.info(
                f" {len(features)} features created from {len(examples)} samples."
            )

        if not no_cache:
            torch.save(features, cached_features_file)

    if args.sliding_window and evaluate:
        # Normalize to lists-of-features per sample, then flatten.
        # NOTE(review): window_counts is computed but never returned here;
        # kept for parity with the original — verify against callers.
        features = [[feature_set] if not isinstance(feature_set, list)
                    else feature_set for feature_set in features]
        window_counts = [len(sample) for sample in features]
        features = [
            feature for feature_set in features for feature in feature_set
        ]

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)

    # BUG FIX: labels were previously cast to torch.long unconditionally,
    # truncating regression targets. Use float for regression, matching the
    # other load_and_cache_examples variants in this file.
    if output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)
    else:
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)

    return dataset
def _load_and_cache_examples(self, examples, evaluate=False, no_cache=False):
    """
    Converts a list of InputExample objects to a TensorDataset containing
    InputFeatures. Caches the InputFeatures.

    Utility function for train() and eval() methods. Not intended to be used
    directly.

    Args:
        examples: list of InputExample objects to featurize.
        evaluate: True for the "dev" (evaluation) split, False for "train".
        no_cache: if True, neither read nor write the feature cache.

    Returns:
        TensorDataset of (input_ids, input_mask, segment_ids, label_ids).
    """
    process_count = self.args['process_count']
    tokenizer = self.tokenizer
    # This binary variant always runs in classification mode.
    output_mode = 'classification'
    args = self.args

    # BUG FIX: os.mkdir failed when cache_dir had missing parent directories
    # and raced with the isdir() check; makedirs(exist_ok=True) handles both.
    os.makedirs(self.args['cache_dir'], exist_ok=True)

    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        args['cache_dir'],
        f"cached_{mode}_{args['model_type']}_{args['max_seq_length']}_binary"
    )

    if os.path.exists(cached_features_file) and not args[
            'reprocess_input_data'] and not no_cache:
        features = torch.load(cached_features_file)
    else:
        features = convert_examples_to_features(
            examples,
            args['max_seq_length'],
            tokenizer,
            output_mode,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2
            if self.args['model_type'] in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args['model_type'] in ['roberta']),
            # pad on the left for xlnet
            pad_on_left=bool(args['model_type'] in ['xlnet']),
            pad_token=tokenizer.convert_tokens_to_ids(
                [tokenizer.pad_token])[0],
            pad_token_segment_id=4
            if self.args['model_type'] in ['xlnet'] else 0,
            process_count=process_count,
            silent=True)

        if not no_cache:
            torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)

    # output_mode is hard-coded to 'classification' above, so only the long
    # branch is reachable; the regression branch is kept for structural
    # parity with the other variants in this file.
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)

    return dataset