import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
# BertTokenizer follows the pytorch-pretrained-bert era API; with the newer
# `transformers` package the equivalent import is
# `from transformers import BertTokenizer`.
from pytorch_pretrained_bert.tokenization import BertTokenizer
# DBQAProcessor, the task processors used in Example 2, and
# convert_examples_to_features are project-local helpers (not shown here).


# Example 1
def load_data(config):
    # select the data processor for the requested task
    if config.data_sign == "nlpcc-dbqa":
        data_processor = DBQAProcessor()
    else:
        raise ValueError(f"unsupported data_sign: {config.data_sign}")

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=True)
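    # do_lower_case=True lowercases text during wordpiece tokenization; for
    # the Chinese BERT checkpoints typically used with these tasks it mainly
    # affects embedded Latin-script tokens.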

    # load data examples
    train_examples = data_processor.get_train_examples(config.data_dir)
    dev_examples = data_processor.get_dev_examples(config.data_dir)
    test_examples = data_processor.get_test_examples(config.data_dir)

    # convert data examples into features
    train_features = convert_examples_to_features(train_examples, label_list, config.max_seq_length, tokenizer,
                                                  task_sign=config.task_name)
    train_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    train_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    train_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    train_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_input_mask, train_segment_ids, train_label_ids)
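    # each item of train_data is an (input_ids, input_mask, segment_ids,
    # label_id) tuple of fixed-length tensors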
    # train_sampler = DistributedSampler(train_data)
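    # RandomSampler shuffles within a single process; the DistributedSampler
    # above would instead partition batches across processes for multi-GPU
    # training.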
    train_sampler = RandomSampler(train_data)

    dev_features = convert_examples_to_features(dev_examples, label_list, config.max_seq_length, tokenizer,
                                                task_sign=config.task_name)
    dev_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    dev_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)
    dev_data = TensorDataset(dev_input_ids, dev_input_mask, dev_segment_ids, dev_label_ids)

    dev_sampler = RandomSampler(dev_data)
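    # note: shuffling evaluation data is unusual; a SequentialSampler keeps
    # predictions aligned with the example order when results are written out
    # batch by batch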

    test_features = convert_examples_to_features(test_examples, label_list, config.max_seq_length, tokenizer,
                                                 task_sign=config.task_name)
    test_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    test_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    test_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    test_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
    test_data = TensorDataset(test_input_ids, test_input_mask, test_segment_ids, test_label_ids)
    # test_sampler = DistributedSampler(test_data)
    test_sampler = RandomSampler(test_data)

    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=config.train_batch_size,
                                  num_workers=config.nworkers)

    dev_dataloader = DataLoader(dev_data, sampler=dev_sampler,
                                batch_size=config.dev_batch_size,
                                num_workers=config.nworkers)

    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=config.test_batch_size,
                                 num_workers=config.nworkers)

    # total number of optimizer updates; the epoch count (5) is hard-coded here
    num_train_steps = int(len(train_examples) / config.train_batch_size * 5)
    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
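

# A minimal usage sketch for the DBQA loader above. The attribute names on
# `config` mirror the accesses inside load_data; the concrete values (paths,
# batch sizes, the task_name tag) are illustrative assumptions only.
from argparse import Namespace

dbqa_config = Namespace(
    data_sign="nlpcc-dbqa",
    task_name="dbqa",               # assumed tag, forwarded as task_sign
    bert_model="bert-base-chinese",
    data_dir="./data/nlpcc-dbqa",   # hypothetical path
    max_seq_length=128,
    train_batch_size=32,
    dev_batch_size=32,
    test_batch_size=32,
    nworkers=4,
)
train_loader, dev_loader, test_loader, num_steps, labels = load_data(dbqa_config)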


# Example 2
def load_data(config):
    # select the data processor for the requested task
    if config.data_sign == "msra_ner":
        data_processor = MsraNERProcessor()
    elif config.data_sign == "resume_ner":
        data_processor = ResumeNERProcessor()
    elif config.data_sign == "ontonotes_ner":
        data_processor = OntoNotesNERProcessor()
    elif config.data_sign == "ctb5_pos":
        data_processor = Ctb5POSProcessor()
    elif config.data_sign == "ctb6_pos":
        data_processor = Ctb6POSProcessor()
    elif config.data_sign == "ud1_pos":
        data_processor = Ud1POSProcessor()
    elif config.data_sign == "ctb6_cws":
        data_processor = Ctb6CWSProcessor()
    elif config.data_sign == "pku_cws":
        data_processor = PkuCWSProcessor()
    elif config.data_sign == "mrs_cws":
        data_processor = MsrCWSProcessor()
    else:
        raise ValueError(f"unsupported data_sign: {config.data_sign}")

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(config.bert_model,
                                              do_lower_case=True)

    # load data examples
    train_examples = data_processor.get_train_examples(config.data_dir)
    dev_examples = data_processor.get_dev_examples(config.data_dir)
    test_examples = data_processor.get_test_examples(config.data_dir)

    # convert data examples into features
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  config.max_seq_length,
                                                  tokenizer)
    train_input_ids = torch.tensor([f.input_ids for f in train_features],
                                   dtype=torch.long)
    train_input_mask = torch.tensor([f.input_mask for f in train_features],
                                    dtype=torch.long)
    train_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                     dtype=torch.long)
    train_label_ids = torch.tensor([f.label_id for f in train_features],
                                   dtype=torch.long)
    train_data = TensorDataset(train_input_ids, train_input_mask,
                               train_segment_ids, train_label_ids)
    # train_sampler = DistributedSampler(train_data)
    train_sampler = RandomSampler(train_data)

    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                config.max_seq_length,
                                                tokenizer)
    dev_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                 dtype=torch.long)
    dev_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                  dtype=torch.long)
    dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features],
                                   dtype=torch.long)
    dev_label_ids = torch.tensor([f.label_id for f in dev_features],
                                 dtype=torch.long)
    dev_data = TensorDataset(dev_input_ids, dev_input_mask, dev_segment_ids,
                             dev_label_ids)

    dev_sampler = RandomSampler(dev_data)

    test_features = convert_examples_to_features(test_examples, label_list,
                                                 config.max_seq_length,
                                                 tokenizer)
    test_input_ids = torch.tensor([f.input_ids for f in test_features],
                                  dtype=torch.long)
    test_input_mask = torch.tensor([f.input_mask for f in test_features],
                                   dtype=torch.long)
    test_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                    dtype=torch.long)
    test_label_ids = torch.tensor([f.label_id for f in test_features],
                                  dtype=torch.long)
    test_data = TensorDataset(test_input_ids, test_input_mask,
                              test_segment_ids, test_label_ids)
    # test_sampler = DistributedSampler(test_data)
    test_sampler = RandomSampler(test_data)

    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=config.train_batch_size)

    dev_dataloader = DataLoader(dev_data, sampler=dev_sampler,
                                batch_size=config.dev_batch_size)

    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=config.test_batch_size)

    num_train_steps = int(
        len(train_examples) / config.train_batch_size *
        config.num_train_epochs)
    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
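

# `num_train_steps` counts the total optimizer updates and is typically fed
# to a warmup-aware optimizer. A hedged sketch against the
# pytorch-pretrained-bert BertAdam API; the 0.1 warmup ratio and the
# `config.learning_rate` field are illustrative assumptions, not values taken
# from this file.
from pytorch_pretrained_bert.optimization import BertAdam


def build_optimizer(model, config, num_train_steps):
    # BertAdam warms the learning rate up linearly over the first
    # warmup * t_total steps, then decays it linearly to zero
    return BertAdam(model.parameters(),
                    lr=config.learning_rate,
                    warmup=0.1,
                    t_total=num_train_steps)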