import logging
import os

import torch
from torch.utils.data import TensorDataset

# `processors`, `output_modes`, `convert_examples_to_features`, and
# `InputExample` are assumed to come from the surrounding project (custom
# GLUE-style processors whose features also carry an `align_mask`).
logger = logging.getLogger(__name__)


def load_and_cache_examples(args, task, tokenizer, evaluate=False):

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_roberta_{}_{}_mytask_sfu'.format(
            'dev' if evaluate else 'train', str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples3(args.data_dir) if evaluate
                    else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,  # set True for left-padding models such as XLNet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_align_mask = torch.tensor([f.align_mask for f in features],
                                  dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)
    else:
        raise ValueError("unsupported output_mode: {}".format(output_mode))

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_align_mask,
                            all_token_type_ids, all_labels)
    return dataset
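
A minimal usage sketch (the names `args`, `task`, and `tokenizer` are assumed to come from the surrounding training script, and `args.train_batch_size` is illustrative):

from torch.utils.data import DataLoader, RandomSampler

# Build (or load from cache) the dataset and iterate over it in shuffled
# batches; the tensor order matches the TensorDataset built above.
train_dataset = load_and_cache_examples(args, task, tokenizer, evaluate=False)
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=args.train_batch_size)
for input_ids, attention_mask, align_mask, token_type_ids, labels in train_dataloader:
    pass  # forward/backward pass goes here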
Example 2
# `samples` (a list of sentence pairs) and `table` (an xlrd worksheet whose
# rows align with `samples` and hold a gold score in column 3) are assumed
# to be loaded earlier in the script.
examples = []
for i in range(len(samples)):
    guid = "%s" % (i)
    text_a = samples[i][0].lower()
    text_b = samples[i][1].lower()
    # binarize the row's score in column 3 at a 0.5 threshold
    label = str(int(table.row_values(i)[3] >= 0.5))
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
features = convert_examples_to_features(
    examples,
    tokenizer,
    label_list=["0", "1"],
    max_length=64,
    output_mode="classification",
    pad_on_left=False,  # set True for left-padding models such as XLNet
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=0,
)
# Per-feature tensor conversion, kept here for reference; append `.cuda()`
# to each tensor to move it to the GPU:
# for f in features:
#     input_ids = torch.tensor(f.input_ids, dtype=torch.long).unsqueeze(0)
#     attention_mask = torch.tensor(f.attention_mask, dtype=torch.long).unsqueeze(0)
#     align_mask = torch.tensor(f.align_mask, dtype=torch.long).unsqueeze(0)
#     label = torch.tensor(f.label, dtype=torch.long).unsqueeze(0)
#     print(label.shape)
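
A sketch of one way to batch these features for inference, reusing the TensorDataset pattern from the first example; `model` is a placeholder for the classifier built elsewhere in the script, not something defined in this snippet:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                  dtype=torch.long)
all_align_mask = torch.tensor([f.align_mask for f in features], dtype=torch.long)
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

dataset = TensorDataset(all_input_ids, all_attention_mask, all_align_mask,
                        all_labels)
dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset),
                        batch_size=32)

with torch.no_grad():
    for input_ids, attention_mask, align_mask, labels in dataloader:
        # `align_mask` is batched alongside, but whether and how it is passed
        # to the model depends on the custom architecture; HF-style models
        # return a tuple whose first element is the logits.
        logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
        preds = logits.argmax(dim=-1)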