def load_data(config):
    """Build train/dev/test dataloaders for the DBQA task.

    Args:
        config: namespace with data_sign, data_dir, bert_model, max_seq_length,
            task_name, train/dev/test_batch_size, nworkers (and optionally
            num_train_epochs).

    Returns:
        (train_dataloader, dev_dataloader, test_dataloader,
         num_train_steps, label_list)

    Raises:
        ValueError: if config.data_sign names an unsupported dataset.
    """
    # Select the data processor for the configured dataset.
    if config.data_sign == "nlpcc-dbqa":
        data_processor = DBQAProcessor()
    else:
        raise ValueError(
            "unsupported data_sign for DBQA loader: {}".format(config.data_sign))

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=True)

    # Load raw examples for each split.
    train_examples = data_processor.get_train_examples(config.data_dir)
    dev_examples = data_processor.get_dev_examples(config.data_dir)
    test_examples = data_processor.get_test_examples(config.data_dir)

    def _to_dataset(examples):
        # Convert examples into padded feature tensors wrapped as a TensorDataset.
        features = convert_examples_to_features(
            examples, label_list, config.max_seq_length, tokenizer,
            task_sign=config.task_name)
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        return TensorDataset(input_ids, input_mask, segment_ids, label_ids)

    train_data = _to_dataset(train_examples)
    dev_data = _to_dataset(dev_examples)
    test_data = _to_dataset(test_examples)

    # NOTE(review): RandomSampler on dev/test shuffles evaluation order; metrics
    # are order-independent here presumably, but SequentialSampler is the usual
    # choice for eval — confirm with callers. (DistributedSampler was commented
    # out in the original.)
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                  batch_size=config.train_batch_size,
                                  num_workers=config.nworkers)
    dev_dataloader = DataLoader(dev_data, sampler=RandomSampler(dev_data),
                                batch_size=config.dev_batch_size,
                                num_workers=config.nworkers)
    test_dataloader = DataLoader(test_data, sampler=RandomSampler(test_data),
                                 batch_size=config.test_batch_size,
                                 num_workers=config.nworkers)

    # Total optimizer steps. The original hard-coded 5 epochs; keep 5 as the
    # fallback so behavior is unchanged when config lacks num_train_epochs.
    num_epochs = getattr(config, "num_train_epochs", 5)
    num_train_steps = int(len(train_examples) / config.train_batch_size * num_epochs)

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list
def load_data(config):
    """Build train/dev/test dataloaders for the sequence-labeling tasks
    (NER / POS / CWS).

    Args:
        config: namespace with data_sign, data_dir, bert_model, max_seq_length,
            train/dev/test_batch_size, num_train_epochs.

    Returns:
        (train_dataloader, dev_dataloader, test_dataloader,
         num_train_steps, label_list)

    Raises:
        ValueError: if config.data_sign names an unsupported dataset.
    """
    # Dispatch table is easier to extend than a long if/elif chain.
    # NOTE(review): key "mrs_cws" pairs with MsrCWSProcessor in the original —
    # looks like a transposed "msr"; key kept byte-identical for compatibility.
    processors = {
        "msra_ner": MsraNERProcessor,
        "resume_ner": ResumeNERProcessor,
        "ontonotes_ner": OntoNotesNERProcessor,
        "ctb5_pos": Ctb5POSProcessor,
        "ctb6_pos": Ctb6POSProcessor,
        "ud1_pos": Ud1POSProcessor,
        "ctb6_cws": Ctb6CWSProcessor,
        "pku_cws": PkuCWSProcessor,
        "mrs_cws": MsrCWSProcessor,
    }
    try:
        data_processor = processors[config.data_sign]()
    except KeyError:
        raise ValueError(
            "unsupported data_sign: {}".format(config.data_sign))

    label_list = data_processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=True)

    # Load raw examples for each split.
    train_examples = data_processor.get_train_examples(config.data_dir)
    dev_examples = data_processor.get_dev_examples(config.data_dir)
    test_examples = data_processor.get_test_examples(config.data_dir)

    def _to_dataset(examples):
        # Convert examples into padded feature tensors wrapped as a TensorDataset.
        features = convert_examples_to_features(
            examples, label_list, config.max_seq_length, tokenizer)
        input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        return TensorDataset(input_ids, input_mask, segment_ids, label_ids)

    train_data = _to_dataset(train_examples)
    dev_data = _to_dataset(dev_examples)
    test_data = _to_dataset(test_examples)

    # NOTE(review): RandomSampler on dev/test shuffles evaluation order;
    # SequentialSampler is the usual choice for eval — confirm with callers.
    # (DistributedSampler was commented out in the original.)
    train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data),
                                  batch_size=config.train_batch_size)
    dev_dataloader = DataLoader(dev_data, sampler=RandomSampler(dev_data),
                                batch_size=config.dev_batch_size)
    test_dataloader = DataLoader(test_data, sampler=RandomSampler(test_data),
                                 batch_size=config.test_batch_size)

    # Total optimizer steps over all epochs.
    num_train_steps = int(
        len(train_examples) / config.train_batch_size * config.num_train_epochs)

    return train_dataloader, dev_dataloader, test_dataloader, num_train_steps, label_list