def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        'cached_{}_roberta_{}_{}_mytask_sfu'.format(
            'dev' if evaluate else 'train',
            str(args.max_seq_length),
            str(task)))
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_dev_examples3(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
        )
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_align_mask = torch.tensor([f.align_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_align_mask,
                            all_token_type_ids, all_labels)
    return dataset
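
# Usage sketch: a minimal, hedged example of how load_and_cache_examples()
# could be driven during evaluation. It assumes an `args` namespace carrying
# data_dir, max_seq_length, and eval_batch_size, a `task` key registered in
# processors/output_modes, and a pretrained `tokenizer`; none of these are
# defined in this excerpt, and the helper name below is hypothetical.
from torch.utils.data import DataLoader, SequentialSampler


def build_eval_dataloader(args, task, tokenizer):
    """Wrap the cached dev dataset in a sequential DataLoader (sketch only)."""
    eval_dataset = load_and_cache_examples(args, task, tokenizer, evaluate=True)
    eval_sampler = SequentialSampler(eval_dataset)
    # Batches unpack in the order the TensorDataset was built:
    # input_ids, attention_mask, align_mask, token_type_ids, labels.
    return DataLoader(eval_dataset,
                      sampler=eval_sampler,
                      batch_size=args.eval_batch_size)
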
for i in range(len(samples)):
    guid = "%s" % (i)
    text_a = samples[i][0].lower()
    text_b = samples[i][1].lower()
    # Threshold the score in column 3 of the spreadsheet row into a binary
    # label; this assumes samples[i] corresponds to row i of `table`.
    label = str(int(table.row_values(i)[3] >= 0.5))
    examples.append(
        InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

features = convert_examples_to_features(
    examples,
    tokenizer,
    label_list=["0", "1"],
    max_length=64,
    output_mode="classification",
    pad_on_left=False,  # pad on the left for xlnet
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=0,
)

# Commented-out per-example tensor conversion (CPU and CUDA variants):
# for f in features:
#     input_ids = torch.tensor(f.input_ids, dtype=torch.long).unsqueeze(0)
#     attention_mask = torch.tensor(f.attention_mask, dtype=torch.long).unsqueeze(0)
#     align_mask = torch.tensor(f.align_mask, dtype=torch.long).unsqueeze(0)
#     label = torch.tensor(f.label, dtype=torch.long).unsqueeze(0)
#     input_ids = torch.tensor(f.input_ids, dtype=torch.long).unsqueeze(0).cuda()
#     attention_mask = torch.tensor(f.attention_mask, dtype=torch.long).unsqueeze(0).cuda()
#     align_mask = torch.tensor(f.align_mask, dtype=torch.long).unsqueeze(0).cuda()
#     label = torch.tensor(f.label, dtype=torch.long).unsqueeze(0).cuda()
#     print(label.shape)
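
# Inference sketch: a hedged example of batching the features built above and
# running a single forward pass. `model` is assumed to be this project's
# sequence-classification model; the align_mask keyword argument and the
# logits-first output layout are assumptions about its forward() signature,
# and the helper name is hypothetical.
def predict_from_features(model, features, device="cpu"):
    """Run one batched, no-grad forward pass over the features (sketch only)."""
    input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)
    attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long).to(device)
    align_mask = torch.tensor([f.align_mask for f in features], dtype=torch.long).to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        align_mask=align_mask)
    # For a classification head, the first output is typically the logits;
    # argmax over the label dimension gives the predicted class per example.
    return outputs[0].argmax(dim=-1)
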