# Tail of the MODEL_CLASSES mapping: model_type -> (config, model, tokenizer) classes.
# NOTE(review): the opening `MODEL_CLASSES = {` of this dict literal is above this excerpt.
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer) }

# Resolve the (config, model, tokenizer) classes for the configured model type.
config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
# num_labels is hard-coded to 3 here — presumably a 3-way classification task; confirm
# it matches len(processor.get_labels()) computed below.
config = config_class.from_pretrained(args['model_name'], num_labels=3, finetuning_task=args['task_name'])
tokenizer = tokenizer_class.from_pretrained(args['model_name'])
# Model weights are loaded from args['output_dir'] (a previously saved fine-tuned
# checkpoint), not from the hub name used for config/tokenizer above.
model = model_class.from_pretrained(args['output_dir'])

# Validate the task is registered before instantiating its data processor.
task = args['task_name']
if task in processors.keys() and task in output_modes.keys():
    processor = processors[task]()
    label_list = processor.get_labels()
    num_labels = len(label_list)
else:
    raise KeyError(
        f'{task} not found in processors or in output_modes. Please check utils.py.'
    )

# In[8]:

# NOTE(review): this definition looks truncated/superseded — a fuller
# load_and_cache_examples is defined immediately below and shadows it.
def load_and_cache_examples(task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = args['output_mode']
def load_and_cache_examples(self, args, tokenizer, evaluate=False):
    """Build (or load from an on-disk cache) a TensorDataset for the task.

    Converts the raw examples of ``args['task_name']`` into model-ready
    feature tensors and caches them under ``args['data_dir']`` so repeated
    runs skip the slow tokenization step.

    Args:
        self: Unused; kept so the existing call sites are unaffected.
        args: Dict of run settings. Reads 'task_name', 'output_mode',
            'data_dir', 'model_name', 'model_type', 'max_seq_length' and
            'reprocess_input_data'.
        tokenizer: Pretrained tokenizer matching args['model_type'].
        evaluate: If True, load the 'dev' split; otherwise the 'train' split.

    Returns:
        torch.utils.data.TensorDataset of
        (input_ids, input_mask, segment_ids, label_ids).

    Raises:
        KeyError: If the task is not registered in processors/output_modes.
        ValueError: If args['output_mode'] is neither 'classification' nor
            'regression' (previously this surfaced as an opaque NameError).
    """
    task = args['task_name']
    if task in processors and task in output_modes:
        processor = processors[task]()
    else:
        raise KeyError(
            f'{task} not found in processors or in output_modes. Please check utils.py.'
        )

    output_mode = args['output_mode']
    if output_mode not in ('classification', 'regression'):
        # Fail early with a clear message; the original fell through and hit a
        # NameError on all_label_ids at the TensorDataset construction below.
        raise ValueError(
            f"output_mode must be 'classification' or 'regression', got {output_mode!r}"
        )

    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        args['data_dir'],
        f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}"
    )

    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples(args['data_dir'])
                    if evaluate else processor.get_train_examples(args['data_dir']))
        features = convert_examples_to_features(
            examples,
            label_list,
            args['max_seq_length'],
            tokenizer,
            output_mode,
            # xlnet puts the [CLS] token at the end and pads on the left.
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences, cf.
            # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args['model_type'] in ['roberta']),
            pad_on_left=bool(args['model_type'] in ['xlnet']),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    # Labels are integer class ids for classification, floats for regression.
    label_dtype = torch.long if output_mode == "classification" else torch.float
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=label_dtype)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)