def roberta_pair_task(config):
    """Train and evaluate a sentence-pair model according to *config*.

    Builds the tokenizer, data processor and model from the config,
    runs ``cross_validation`` in the configured pattern, and saves the
    trained model when the pattern is ``'full_train'``.

    Returns the dev-set evaluation result produced by ``cross_validation``.
    """
    tokenizer = BertTokenizer.from_pretrained(
        config.tokenizer_file, do_lower_case=config.do_lower_case)

    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()
    augment_examples = processor.read_data_augment(config.data_augment_method)

    # Instantiate the model class selected by config.use_model.
    model = MODEL_CLASSES[config.use_model](config)
    logging.info("self config %s", config_to_json_string(config))

    # Training enhancement data is only passed through when augmentation
    # is enabled in the config.
    enhancement = augment_examples if config.data_augment else None
    trained_model, dev_evaluate, _ = cross_validation(
        config=config,
        model=model,
        tokenizer=tokenizer,
        train_examples=train_examples,
        dev_examples=dev_examples,
        pattern=config.pattern,
        train_enhancement=enhancement,
        test_examples=None)
    logging.info("dev_evaluate: {}".format(dev_evaluate))

    # Persist the model only for a full (non-CV) training run.
    if config.pattern == 'full_train':
        model_save(config, trained_model)
    return dev_evaluate
def chip2019_extract(config):
    """Pseudo-label the original CHIP-2019 examples and extract a subset.

    Predicts label probabilities for the original CHIP-2019 data with a
    saved model (training one first if no checkpoint exists or
    ``config.retrain_model`` is set), optionally averaging predictions
    over sentence-order-swapped copies, then writes the examples whose
    probability falls in ``config.prob_range`` to
    ``config.chip2019_augment_save_file`` via ``examples_extract``.
    """
    config.stop_word_valid = False
    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    original_chip2019_examples = processor.get_original_chip2019_examples()
    if config.reverse_tag:
        # Swap: also predict on sentence-order-reversed copies and
        # average the two passes for a more robust probability estimate.
        reverse_test_examples = sentence_reverse(original_chip2019_examples)
        all_test_examples = [original_chip2019_examples, reverse_test_examples]
    else:
        all_test_examples = [original_chip2019_examples]

    cur_model = MODEL_CLASSES[config.use_model]
    tokenizer = BertTokenizer.from_pretrained(
        config.tokenizer_file, do_lower_case=config.do_lower_case)
    model = cur_model(config)

    # Fixed misspelled local name (was "modle_file").
    model_file = os.path.join(config.save_path[0], config.save_file[0] + '.pkl')
    if not os.path.isfile(model_file) or config.retrain_model:
        # Model checkpoint is missing (or retraining was forced):
        # train a model first, then save it.
        # Fixed message typo (was "not exit.").
        print("{} does not exist.".format(model_file))
        # Read the training data.
        config.batch_size = 16
        train_examples = processor.get_train_examples()
        dev_examples = processor.get_dev_examples()
        if config.data_augment:
            augment_examples = processor.read_data_augment(
                config.data_augment_method)
        else:
            augment_examples = None
        model_example, dev_evaluate, predict_label = cross_validation(
            config=config,
            model=model,
            tokenizer=tokenizer,
            train_examples=train_examples,
            dev_examples=dev_examples,
            pattern=config.pattern,
            train_enhancement=augment_examples,
            test_examples=None)
        model_save(config, model_example)

    model_load(config, model, device='cpu')
    model.to(config.device)
    # Large batch is fine for inference-only passes.
    config.batch_size = 512

    single_model_predict = []
    for test_examples in all_test_examples:
        _, _, predict_label = cross_validation(config=config,
                                               model=model,
                                               tokenizer=tokenizer,
                                               train_examples=None,
                                               dev_examples=None,
                                               pattern='predict',
                                               train_enhancement=None,
                                               test_examples=test_examples)
        single_model_predict.append(predict_label)
    predict_prob = combined_result(single_model_predict, pattern='average')

    save_file = os.path.join(config.other_data_dir,
                             config.chip2019_augment_save_file)
    print('save_file{}'.format(save_file))
    examples_extract(original_chip2019_examples, predict_prob, save_file,
                     sel_prob=config.prob_range, random_state=config.seed)
def test_task(config):
    """Run ensemble inference on the test set and write a submission CSV.

    Loads ``config.model_num`` saved models, predicts on the test examples
    (optionally also on sentence-order-swapped copies when
    ``config.reverse_tag`` is set), averages all probabilities, thresholds
    at ``config.prob_threshold``, and writes ``id,label`` rows to
    ``config.save_data_path``.
    """
    # Fixed message typo (was 'cude: {}').
    print('cuda: {}'.format(torch.cuda.is_available()))
    print('cur device {}'.format(config.device.type))
    start_time = time.time()

    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)
    test_examples = processor.get_test_examples(config.test_data_dir)

    # Swap: optionally also predict on sentence-order-reversed copies
    # and average the two passes.
    if config.reverse_tag:
        reverse_test_examples = sentence_reverse(test_examples)
        all_examples = [test_examples, reverse_test_examples]
    else:
        all_examples = [test_examples]

    cur_model = MODEL_CLASSES[config.use_model]
    print('loading data time: {:.6f}s'.format(time.time() - start_time))

    all_predict = []
    for i in range(config.model_num):
        model_time_s = time.time()
        print('the model of {} starting...'.format(config.models_name[i]))
        tokenizer = BertTokenizer.from_pretrained(
            config.tokenizer_file[i], do_lower_case=config.do_lower_case)
        model = cur_model(config, num=i)
        # Load to CPU first, then move to the configured device.
        model_load(config, model, num=i, device='cpu')
        model.to(config.device)
        print("\tloading pre-train model, cost time {:.6f}s".format(
            time.time() - model_time_s))

        single_model_predict = []
        for e_idx, t_examples in enumerate(all_examples):
            example_time = time.time()
            _, _, predict_label = cross_validation(config=config,
                                                   model=model,
                                                   tokenizer=tokenizer,
                                                   train_examples=None,
                                                   dev_examples=None,
                                                   pattern='predict',
                                                   train_enhancement=None,
                                                   test_examples=t_examples)
            single_model_predict.append(predict_label)
            print("\ttest dataset:{}, cost time {:.6f}s, total time {:.6f}s".
                  format(e_idx + 1, time.time() - example_time,
                         time.time() - start_time))
        print("# time {:.6f}s, total time {:.6f}s".format(
            time.time() - model_time_s, time.time() - start_time))
        # Average this model's predictions over the (possibly swapped) passes.
        predict_prob = combined_result(single_model_predict, pattern='average')
        all_predict.append(predict_prob)

    final_predict_label = combined_result(all_predict, pattern='average')
    # np.int was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # int is the documented equivalent and yields the same dtype here.
    final_predict_label = np.asarray(
        final_predict_label >= config.prob_threshold, dtype=int)

    index = list(
        pd.read_csv(os.path.join(config.test_data_dir, 'test.csv'),
                    encoding='utf-8')['id'])
    df_upload = pd.DataFrame({'id': index, 'label': final_predict_label})
    df_upload.to_csv(config.save_data_path, index=False)
    print('\ntotal time {:.6f}s'.format(time.time() - start_time))