Example #1
def bert_features(model, tokenizer, data, batch_size=1):
    in_features = convert_examples_to_features(data,
                                               seq_length=50,
                                               tokenizer=tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in in_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in in_features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    model.eval()  # disable dropout so the extracted features are deterministic

    bert = []
    for input_ids, input_mask, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        all_encoder_layers, _ = model(input_ids,
                                      token_type_ids=None,
                                      attention_mask=input_mask)
        # Keep only the last encoder layer as the per-token feature matrix
        bert.append(all_encoder_layers[-1].detach().cpu().numpy())

    return np.concatenate(bert, axis=0)
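For reference, a minimal usage sketch for this helper, assuming the pytorch_pretrained_bert-style BertModel/BertTokenizer whose forward pass returns all encoder layers; the model name and input sentences are placeholders, and the exact shape of `data` depends on the convert_examples_to_features variant in use:

import numpy as np
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer

# Placeholder setup; assumes the pytorch_pretrained_bert API, where the
# model's forward pass returns (all_encoder_layers, pooled_output).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device)

sentences = ["BERT features are extracted per sentence.",
             "Each output row holds one token's hidden states."]
features = bert_features(model, tokenizer, sentences, batch_size=2)
print(features.shape)  # (num_sentences, 50, hidden_size); seq_length=50 above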
Example #2
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    if args.local_rank not in [-1, 0] and mode == "train":
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode, 'bert', str(args.max_seq_length)
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == "train":
        torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

    # Convert to Tensors and build dataset
    # In `features`, label_ids marks every extra subword position (created when
    # a word is tokenized into several pieces) and every padding position with
    # pad_token_label_id = -100; these positions must be filtered out later
    # during evaluation (see the sketch after this function).
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
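Downstream, those -100 positions must be masked out before computing metrics. A minimal sketch of that filtering step (the `label_ids` and `predictions` tensors here are hypothetical illustration data):

import torch

pad_token_label_id = -100  # as assigned in load_and_cache_examples above

# Hypothetical per-token labels and argmax'd model predictions for one batch
label_ids = torch.tensor([[1, -100, 2, -100, -100]])
predictions = torch.tensor([[1, 0, 2, 0, 0]])

mask = label_ids != pad_token_label_id  # keep only real (first-subword) tokens
true_labels = label_ids[mask]       # tensor([1, 2])
pred_labels = predictions[mask]     # tensor([1, 2])
accuracy = (true_labels == pred_labels).float().mean().item()  # 1.0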
Example #3

# In[6]:

tokenizer = bert_utils.create_tokenizer_from_hub_module(BERT_PATH, sess)

# ### Preprocess Data

# In[7]:

train_text, train_label, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=False)

train_label = np.asarray(train_label)
train_examples = bert_utils.convert_text_to_examples(train_text, train_label)
feat = bert_utils.convert_examples_to_features(tokenizer,
                                               train_examples,
                                               max_seq_length=MAX_SEQ_LEN,
                                               verbose=True)

(train_input_ids, train_input_masks, train_segment_ids, train_labels) = feat

train_input_ids, train_input_masks, train_segment_ids, train_labels = shuffle(
    train_input_ids, train_input_masks, train_segment_ids, train_labels)
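# Note: shuffle here is presumably sklearn.utils.shuffle, which permutes all
# four arrays with one shared random order, keeping input ids, masks,
# segment ids and labels aligned.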

# In[8]:

examples, labels, num_classes = utils.load_ag_news_dataset(
    max_seq_len=MAX_SEQ_LEN, test=True)
labels = np.asarray(labels)
test_examples = bert_utils.convert_text_to_examples(examples, labels)
feat = bert_utils.convert_examples_to_features(tokenizer,
                                               test_examples,
                                               max_seq_length=MAX_SEQ_LEN,
                                               verbose=True)
Example #4
bert_samples = []

for i, test_sample in enumerate(test_samples):
    bert_sample = bert_utils.InputExample(
        guid="train-%d" % i,
        text_a=test_sample["text"],
        text_b=None,
        label=test_sample["state_label"],
        entity=test_sample["participant"],
        sequence_id=test_sample["entity_tags"].astype(int).tolist())

    bert_samples.append(bert_sample)

test_features = bert_utils.convert_examples_to_features(
    bert_samples,
    label_list=["none", "create", "destroy", "move"],
    max_seq_length=70,
    tokenizer=tokenizer,
    output_mode="classification")
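# (output_mode="classification" presumably maps each example's string label
#  to its index in label_list, e.g. "destroy" -> 2.)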

with torch.no_grad():
    correct_state_label = 0
    total_state_label = 0

    sum_loss = 0.0

    state_label_cm = [[0] * 4 for _ in range(4)]

    # fpDev = open("test_preds_epoch%d.txt" % (epoch), "w")

    for i, test_feature in enumerate(test_features):
        gpu_input_ids = test_feature.input_ids.cuda()
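The snippet is cut off here, but the 4x4 `state_label_cm` above hints at the per-class metrics derived once the loop fills it. A self-contained sketch, with made-up counts and assuming rows index the true class:

labels = ["none", "create", "destroy", "move"]
cm = [[50, 2, 1, 0],   # made-up counts: rows = true class, cols = predicted
      [3, 20, 0, 1],
      [0, 1, 15, 2],
      [1, 0, 2, 18]]

total = sum(sum(row) for row in cm)
correct = sum(cm[i][i] for i in range(4))
print("accuracy: %.3f" % (correct / total))
for i, name in enumerate(labels):
    support = sum(cm[i])
    print("%s recall: %.3f" % (name, (cm[i][i] / support if support else 0.0)))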
Example #5
tokenizer = BertTokenizer.from_pretrained(bert_model,
                                          do_lower_case=do_lower_case)
num_train_optimization_steps = int(
    len(train_InputExamples) / batch_size /
    gradient_accumulation_steps) * num_epochs
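# For example (hypothetical numbers): 10,000 training examples with
# batch_size=32 and gradient_accumulation_steps=2 give
# int(10000 / 32 / 2) = 156 optimizer steps per epoch, so
# num_train_optimization_steps = 156 * num_epochs.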

model_qa = BertQA.from_pretrained(
    bert_model,
    cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                           'distributed_{}'.format(args.local_rank)))

if args.local_rank == 0:
    torch.distributed.barrier()

model_qa.to(device)

train_features = bert_utils.convert_examples_to_features(
    train_InputExamples, MAX_SEQ_LENGTH, tokenizer)
all_input_ids = torch.tensor([f.input_ids for f in train_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                               dtype=torch.long)
all_start_positions = torch.tensor([f.start_label_ids for f in train_features],
                                   dtype=torch.long)
all_end_positions = torch.tensor([f.end_label_ids for f in train_features],
                                 dtype=torch.long)
train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                           all_start_positions, all_end_positions)

train_sampler = SequentialSampler(
    train_data) if args.local_rank == -1 else DistributedSampler(train_data)
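The snippet ends here; a typical continuation (a sketch under that assumption, not the original code) wraps the dataset and sampler in a DataLoader:

from torch.utils.data import DataLoader

# Plausible continuation: batch the dataset with the sampler chosen above.
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size)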