Example #1
def main():
    NUM_TRAIN_DATA = 150000
    NUM_TEST_DATA = 5000
    MODEL_DIR = './albert_base'
    MAX_LEN = 512
    BATCH_SIZE = 16 * 2  # per-GPU batch size * number of GPUs
    LR = 1e-5
    NUM_LABELS = 33
    EPOCHS = 4

    # read data
    content, target = read_data('../../corpus/ettoday_2017.json')
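    # split: the first NUM_TRAIN_DATA articles are used for training, the next NUM_TEST_DATA for evaluation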

    # train dataloader
    examples = DataProcessor().get_train_examples(content[:NUM_TRAIN_DATA], target[:NUM_TRAIN_DATA])
    train_dataset = convert_examples_to_features(examples, max_length=MAX_LEN, tokenizer=BertTokenizerFast.from_pretrained(MODEL_DIR))
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

    # test dataloader
    examples = DataProcessor().get_test_examples(content[NUM_TRAIN_DATA:NUM_TEST_DATA+NUM_TRAIN_DATA], target[NUM_TRAIN_DATA:NUM_TEST_DATA+NUM_TRAIN_DATA])
    test_dataset = convert_examples_to_features(examples, max_length=MAX_LEN, tokenizer=BertTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

    # start training and callback for eval
    # train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, epochs=EPOCHS, eval_callback=evaluate, test_loader=train_loader)
    train(train_loader, MODEL_DIR, num_labels=NUM_LABELS, lr=LR, epochs=EPOCHS, eval_callback=evaluate, test_loader=test_loader)
Example #2
def build_features(input_path,
                   tokenizer,
                   poss,
                   labels,
                   config,
                   mode='train',
                   w_tokenizer=None,
                   glabels={}):

    logger.info("[Creating features from file] %s", input_path)
    examples = read_examples_from_file(config, input_path, mode=mode)
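    # convert_examples_to_features builds the model inputs; the special-token and padding
    # settings below are taken from the tokenizer and the config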
    features = convert_examples_to_features(
        config,
        examples,
        poss,
        labels,
        config['n_ctx'],
        tokenizer,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(config['emb_class'] in ['roberta']),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_token=tokenizer.pad_token,
        pad_token_id=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_pos_id=config['pad_pos_id'],
        pad_token_label_id=config['pad_label_id'],
        pad_token_segment_id=0,
        sequence_a_segment_id=0,
        glabel_map=glabels,
        w_tokenizer=w_tokenizer)
    return features
Example #3
def main():
    NUM_TRAIN_DATA = 60000
    MODEL_DIR = './rbtl3'
    MAX_LEN = 512
    BATCH_SIZE = 12
    EPOCHS = 4

    # read data
    content, target = read_data()
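    # split: the first NUM_TRAIN_DATA items are used for training, the remainder for evaluation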

    # train dataloader
    examples = DataProcessor().get_train_examples(content[:NUM_TRAIN_DATA],
                                                  target[:NUM_TRAIN_DATA])
    train_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=BertTokenizer.from_pretrained(MODEL_DIR))
    train_loader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=BATCH_SIZE)

    # test dataloader
    examples = DataProcessor().get_test_examples(content[NUM_TRAIN_DATA:],
                                                 target[NUM_TRAIN_DATA:])
    test_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=BertTokenizer.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset,
                             shuffle=False,
                             batch_size=BATCH_SIZE)

    # start training and callback for eval
    # train(train_loader, MODEL_DIR, num_labels=18, epochs=EPOCHS, eval_callback=evaluate, test_loader=train_loader)
    train(train_loader,
          MODEL_DIR,
          num_labels=18,
          epochs=EPOCHS,
          eval_callback=evaluate,
          test_loader=test_loader)
Example #4
def map_eval(eval_file, token_length, tokenizer, device, model, label_list):
    model.eval()
    datasets, labels = get_datasets(eval_file)
    total_batches = 0
    total_avp = 0.0
    total_mrr = 0.0
    # scores, labels = [], []
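    # one AP/RR value is computed per dataset, then averaged over all datasets at the end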
    for k, dataset in tqdm(datasets.items(), desc="Eval datasets"):
        examples = []
        for i, data in enumerate(dataset):
            examples.append(InputExample(i, data[0], data[1], '0'))
        eval_features = convert_examples_to_features(examples, label_list,
                                                     token_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long).to(device)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long).to(device)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long).to(device)
        # all_label_ids = torch.tensor(
        #   [f.label_id for f in eval_features], dtype=torch.long).to(device)
        x_input_ids = torch.tensor([f.input_ids_x for f in eval_features],
                                   dtype=torch.long).to(device)
        x_input_mask = torch.tensor([f.input_mask_x for f in eval_features],
                                    dtype=torch.long).to(device)
        x_segment_ids = torch.tensor([f.segment_ids_x for f in eval_features],
                                     dtype=torch.long).to(device)
        y_input_ids = torch.tensor([f.input_ids_y for f in eval_features],
                                   dtype=torch.long).to(device)
        y_input_mask = torch.tensor([f.input_mask_y for f in eval_features],
                                    dtype=torch.long).to(device)
        y_segment_ids = torch.tensor([f.segment_ids_y for f in eval_features],
                                     dtype=torch.long).to(device)
        with torch.no_grad():
            logits = model(x_input_ids, x_input_mask, x_segment_ids,
                           y_input_ids, y_input_mask, y_segment_ids,
                           all_input_ids, all_segment_ids, all_input_mask)
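        # the positive-class probability serves as the ranking score for each candidate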
        score = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
        label = np.array(list(map(int, labels[k])))
        # print(score, label)
        # scores.append(score)
        # labels.append(label)
        total_avp += mean_average_precision(label, score)
        total_mrr += mean_reciprocal_rank(label, score)
        total_batches += 1
    mAP = total_avp / total_batches
    mRR = total_mrr / total_batches
    logger.info("map is : {}, mrr is : {}".format(mAP, mRR))
    data = {'map': mAP, 'mrr': mRR}
    with open('./result.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)
Example #5
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, data_file, is_test=False, is_ens=False):
	if args.local_rank not in [-1, 0] and not is_test:
		torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

	# Load data features from cache or dataset file
	cached_features_file = os.path.join(
		args.data_dir,
		"cached_{}_{}_{}".format(
			data_file, list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)
		),
	)
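	# the cache file name encodes the data file, the model name and the max sequence length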
	if os.path.exists(cached_features_file) and not args.overwrite_cache:
		logger.info("Loading features from cached file %s", cached_features_file)
		features = torch.load(cached_features_file)
	else:
		logger.info("Creating features from dataset file at %s", args.data_dir)
		examples = read_examples_from_file(args.data_dir, data_file, is_test=is_test, is_ens=is_ens)
		features = convert_examples_to_features(
			examples,
			labels,
			args.max_seq_length,
			tokenizer,
			cls_token_at_end=bool(args.model_type in ["xlnet"]),
			# xlnet has a cls token at the end
			cls_token=tokenizer.cls_token,
			cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
			sep_token=tokenizer.sep_token,
			sep_token_extra=bool("roberta" in args.model_type),
			# roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
			pad_on_left=bool(args.model_type in ["xlnet"]),
			# pad on the left for xlnet
			pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
			pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
			pad_token_label_id=pad_token_label_id,
		)
		if args.local_rank in [-1, 0]:
			logger.info("Saving features into cached file %s", cached_features_file)
			torch.save(features, cached_features_file)

	if args.local_rank == 0 and not is_test:
		torch.distributed.barrier()  # Make sure only the first process in distributed training processes the dataset; the others will use the cache

	# Convert to Tensors and build dataset
	all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
	all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
	all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
	all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

	dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
	return dataset
Example #6
def get_dataloader(processor, args, tokenizer, mode='test'):
    eval_examples = processor.get_test_examples() if mode=='test' \
        else processor.get_dev_examples()
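    # only the first 1000 examples are evaluated, presumably to keep the run short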
    eval_examples = eval_examples[:1000]
    label_list = processor.get_labels()
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    x_input_ids = torch.tensor([f.input_ids_x for f in eval_features],
                               dtype=torch.long)
    x_input_mask = torch.tensor([f.input_mask_x for f in eval_features],
                                dtype=torch.long)
    x_segment_ids = torch.tensor([f.segment_ids_x for f in eval_features],
                                 dtype=torch.long)
    y_input_ids = torch.tensor([f.input_ids_y for f in eval_features],
                               dtype=torch.long)
    y_input_mask = torch.tensor([f.input_mask_y for f in eval_features],
                                dtype=torch.long)
    y_segment_ids = torch.tensor([f.segment_ids_y for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids, x_input_ids, x_input_mask,
                              x_segment_ids, y_input_ids, y_input_mask,
                              y_segment_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)
    return eval_dataloader
Example #7
def read_data(file):
    df = pd.read_json(file)
    df = shuffle(df)
    content = (df['title'] + ' ' + df['content']).to_list()
    target = df['category'].to_list()
    return content, target


if __name__ == '__main__':
    import pandas as pd

    NUM_TEST_DATA = 50016
    MODEL_DIR = './electra_chinese_base'
    MAX_LEN = 512
    BATCH_SIZE = 16 * 2  # 16 per GPU * 2 GPUs
    NUM_LABELS = 33

    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

    content, target = read_data('../../corpus/ettoday_2017.json')
    examples = DataProcessor().get_test_examples(content[:NUM_TEST_DATA],
                                                 target[:NUM_TEST_DATA])
    test_dataset = convert_examples_to_features(
        examples,
        max_length=MAX_LEN,
        tokenizer=ElectraTokenizerFast.from_pretrained(MODEL_DIR))
    test_loader = DataLoader(test_dataset,
                             shuffle=False,
                             batch_size=BATCH_SIZE)
    evaluate(test_loader, MODEL_DIR, 'step_18749.ckpt', NUM_LABELS)
Example #8
def train(model,
          processor,
          task_name,
          optimizer,
          train_examples,
          label_list,
          args,
          tokenizer,
          device,
          n_gpu,
          num_train_optimization_steps,
          valid=False):
    # model.train()
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    # train_features = convert_examples_to_features(
    #    train_examples, label_list, args.max_seq_length, tokenizer)
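    # load features from a local cache when present; otherwise build them and write the cache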
    if os.path.exists('./cache_cmed/train_features.pkl'):
        with open('./cache_cmed/train_features.pkl', 'rb') as f:
            train_features = pickle.load(f)[:50000]
    else:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        with open('./cache_cmed/train_features.pkl', 'wb') as f:
            pickle.dump(train_features, f)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features],
                                 dtype=torch.long)
    x_input_ids = torch.tensor([f.input_ids_x for f in train_features],
                               dtype=torch.long)
    x_input_mask = torch.tensor([f.input_mask_x for f in train_features],
                                dtype=torch.long)
    x_segment_ids = torch.tensor([f.segment_ids_x for f in train_features],
                                 dtype=torch.long)
    y_input_ids = torch.tensor([f.input_ids_y for f in train_features],
                               dtype=torch.long)
    y_input_mask = torch.tensor([f.input_mask_y for f in train_features],
                                dtype=torch.long)
    y_segment_ids = torch.tensor([f.segment_ids_y for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_label_ids, x_input_ids, x_input_mask,
                               x_segment_ids, y_input_ids, y_input_mask,
                               y_segment_ids)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    for _ in trange(int(args.num_train_epochs), desc="Epoch"):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
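            # unpack in the same order the TensorDataset was built above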
            input_ids, input_mask, segment_ids, label_ids, x_input_ids, x_input_mask, x_segment_ids, y_input_ids, y_input_mask, y_segment_ids = batch
            loss = model(x_input_ids, x_input_mask, x_segment_ids, y_input_ids,
                         y_input_mask, y_segment_ids, input_ids, segment_ids,
                         input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            logger.info(loss.item())

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up BERT uses
                    # if args.fp16 is False, BertAdam is used that handles this automatically
                    lr_this_step = args.learning_rate * \
                        warmup_linear(
                            global_step/num_train_optimization_steps, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1  # advance the counter used by the fp16 warmup schedule above
        if valid:
            logging.info('Start eval the dev set')
            if task_name in ['lcqmc', 'mrpc', 'qqp', "cmedqa"]:
                eval_dataloader = get_dataloader(processor,
                                                 args,
                                                 tokenizer,
                                                 mode='dev')
                eval(model, eval_dataloader, device)
            else:
                dev_file = os.path.join(args.data_dir, 'dev.tsv')
                map_eval(dev_file, args.max_seq_length, tokenizer, device,
                         model, label_list)