Example #1
# The dict is truncated in this excerpt; it maps each supported model_type to
# its (config, model, tokenizer) classes. Imports assume the transformers
# library (these classes also existed in its pytorch_transformers predecessor).
from transformers import (RobertaConfig, RobertaForSequenceClassification,
                          RobertaTokenizer)

MODEL_CLASSES = {
    'roberta':
    (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

config = config_class.from_pretrained(args['model_name'],
                                      num_labels=3,
                                      finetuning_task=args['task_name'])
tokenizer = tokenizer_class.from_pretrained(args['model_name'])

# Load the model weights from output_dir, i.e. the fine-tuned checkpoint
model = model_class.from_pretrained(args['output_dir'])

task = args['task_name']

if task in processors.keys() and task in output_modes.keys():
    processor = processors[task]()
    label_list = processor.get_labels()
    num_labels = len(label_list)
else:
    raise KeyError(
        f'{task} not found in processors or in output_modes. Please check utils.py.'
    )
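
The processors and output_modes lookups live in utils.py, which this listing does not show. A minimal sketch of the interface the code above expects, assuming a hypothetical three-class processor to match num_labels=3:

# Hypothetical utils.py contents; only the interface used above is sketched.
class ThreeClassProcessor:
    def get_labels(self):
        return ['0', '1', '2']

    def get_train_examples(self, data_dir):
        ...  # load training examples from data_dir

    def get_dev_examples(self, data_dir):
        ...  # load evaluation examples from data_dir

processors = {'multiclass': ThreeClassProcessor}
output_modes = {'multiclass': 'classification'}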

def load_and_cache_examples(task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = args['output_mode']
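
Example #2 below shows load_and_cache_examples in full. To round out the setup above, here is a minimal inference sketch, assuming a fine-tuned checkpoint already sits in output_dir; the args values and the sample sentence are illustrative, not from the original:

import torch

# Hypothetical args for illustration; adapt to your own run.
args = {
    'model_type': 'roberta',
    'model_name': 'roberta-base',
    'output_dir': 'outputs/',
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]
tokenizer = tokenizer_class.from_pretrained(args['model_name'])
model = model_class.from_pretrained(args['output_dir'])
model.eval()

# Tokenize one sentence and predict its class without tracking gradients.
inputs = tokenizer('An example sentence to classify.', return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs)[0]
predicted_label = logits.argmax(dim=-1).item()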
Example #2
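This method sits inside a model-wrapper class (hence self). It assumes module-level imports of os and torch, TensorDataset from torch.utils.data, a configured logger, and the processors, output_modes, and convert_examples_to_features helpers from utils.py.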
    def load_and_cache_examples(self, args, tokenizer, evaluate=False):
        task = args['task_name']
        if task in processors.keys() and task in output_modes.keys():
            processor = processors[task]()
            label_list = processor.get_labels()
        else:
            raise KeyError(
                f'{task} not found in processors or in output_modes. Please check utils.py.'
            )
        output_mode = args['output_mode']

        mode = 'dev' if evaluate else 'train'
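        # The cache file name encodes the split, model, sequence length, and
        # task, so changing any of them triggers re-featurization.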
        cached_features_file = os.path.join(
            args['data_dir'],
            f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}"
        )

        if os.path.exists(
                cached_features_file) and not args['reprocess_input_data']:
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            features = torch.load(cached_features_file)

        else:
            logger.info("Creating features from dataset file at %s",
                        args['data_dir'])
            examples = (processor.get_dev_examples(args['data_dir'])
                        if evaluate else
                        processor.get_train_examples(args['data_dir']))
            features = convert_examples_to_features(
                examples,
                label_list,
                args['max_seq_length'],
                tokenizer,
                output_mode,
                # XLNet puts the [CLS] token at the end of the sequence
                cls_token_at_end=bool(args['model_type'] in ['xlnet']),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
                sep_token=tokenizer.sep_token,
                # RoBERTa uses an extra separator between sentence pairs, cf.
                # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(args['model_type'] in ['roberta']),
                # XLNet is padded on the left
                pad_on_left=bool(args['model_type'] in ['xlnet']),
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.float)
        else:
            # Without this, an unknown mode would hit a NameError below.
            raise ValueError(f"Unsupported output_mode: {output_mode}")

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
        return dataset
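
A short usage sketch of the returned TensorDataset, which is typically wrapped in a DataLoader for batched evaluation; the instance name model and the batch size are assumptions, not from the original:

from torch.utils.data import DataLoader, SequentialSampler

# Hypothetical usage; 'model' is an instance of the wrapper class above.
eval_dataset = model.load_and_cache_examples(args, tokenizer, evaluate=True)
eval_dataloader = DataLoader(eval_dataset,
                             sampler=SequentialSampler(eval_dataset),
                             batch_size=32)

for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
    pass  # feed each batch to the model here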