Example #1
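# The functions below assume the following imports. Standard-library and
# third-party modules are listed explicitly; DataProcessor, MODEL_CLASSES,
# cross_validation, model_save, model_load, sentence_reverse, combined_result,
# examples_extract and config_to_json_string are assumed to come from the
# project's own modules (exact import paths not shown here).
import logging
import os
import time

import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer  # or the older pytorch_transformers package
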
def roberta_pair_task(config):
    """Train a sentence-pair model via cross_validation (optionally with
    augmented training data), save it when pattern is 'full_train', and
    return the dev evaluation result."""

    tokenizer = BertTokenizer.from_pretrained(config.tokenizer_file,
                                              do_lower_case=config.do_lower_case)
    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()
    augment_examples = processor.read_data_augment(config.data_augment_method)

    cur_model = MODEL_CLASSES[config.use_model]
    model = cur_model(config)

    logging.info("self config %s", config_to_json_string(config))

    model_example, dev_evaluate, predict_label = cross_validation(
        config=config,
        model=model,
        tokenizer=tokenizer,
        train_examples=train_examples,
        dev_examples=dev_examples,
        pattern=config.pattern,
        train_enhancement=augment_examples if config.data_augment else None,
        test_examples=None)
    logging.info("dev_evaluate: {}".format(dev_evaluate))

    if config.pattern == 'full_train':
        model_save(config, model_example)

    return dev_evaluate
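
# A minimal sketch (illustrative only) of the config attributes that
# roberta_pair_task() reads directly. Field names are taken from the function
# above; the SimpleNamespace container and the values are assumptions, not
# the project's actual config class.
from types import SimpleNamespace

_example_pair_config = SimpleNamespace(
    tokenizer_file='path/to/roberta/vocab',  # assumed tokenizer location
    do_lower_case=True,
    use_model='roberta',            # key into MODEL_CLASSES
    pattern='full_train',           # 'full_train' also triggers model_save()
    data_augment=False,
    data_augment_method=[],         # consumed by read_data_augment()
)
# roberta_pair_task(_example_pair_config)  # remaining DataProcessor/training
#                                          # fields are omitted in this sketch
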
def chip2019_extract(config):
    """Predict on the original CHIP2019 pairs with a saved model (training one
    first if no checkpoint exists), average the probabilities over both
    sentence orders when config.reverse_tag is set, and extract examples by
    predicted probability (config.prob_range) into
    config.chip2019_augment_save_file."""

    config.stop_word_valid = False
    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    original_chip2019_examples = processor.get_original_chip2019_examples()
    if config.reverse_tag:  # swap the sentence pair
        reverse_test_examples = sentence_reverse(original_chip2019_examples)
        all_test_examples = [original_chip2019_examples, reverse_test_examples]
    else:
        all_test_examples = [original_chip2019_examples]

    cur_model = MODEL_CLASSES[config.use_model]

    tokenizer = BertTokenizer.from_pretrained(
        config.tokenizer_file, do_lower_case=config.do_lower_case)
    model = cur_model(config)
    model_file = os.path.join(config.save_path[0],
                              config.save_file[0] + '.pkl')

    if not os.path.isfile(model_file) or config.retrain_model:
        print("{} does not exist.".format(model_file))
        # no saved model file: read the training data and train one
        config.batch_size = 16
        train_examples = processor.get_train_examples()
        dev_examples = processor.get_dev_examples()
        if config.data_augment:
            augment_examples = processor.read_data_augment(
                config.data_augment_method)
        else:
            augment_examples = None

        model_example, dev_evaluate, predict_label = cross_validation(
            config=config,
            model=model,
            tokenizer=tokenizer,
            train_examples=train_examples,
            dev_examples=dev_examples,
            pattern=config.pattern,
            train_enhancement=augment_examples,
            test_examples=None)
        model_save(config, model_example)

    model_load(config, model, device='cpu')
    model.to(config.device)
    config.batch_size = 512
    single_model_predict = []
    for test_examples in all_test_examples:
        _, _, predict_label = cross_validation(config=config,
                                               model=model,
                                               tokenizer=tokenizer,
                                               train_examples=None,
                                               dev_examples=None,
                                               pattern='predict',
                                               train_enhancement=None,
                                               test_examples=test_examples)
        single_model_predict.append(predict_label)
    predict_prob = combined_result(single_model_predict, pattern='average')
    save_file = os.path.join(config.other_data_dir,
                             config.chip2019_augment_save_file)
    print('save_file: {}'.format(save_file))
    examples_extract(original_chip2019_examples,
                     predict_prob,
                     save_file,
                     sel_prob=config.prob_range,
                     random_state=config.seed)
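
# examples_extract() above is called with sel_prob=config.prob_range. One
# plausible reading (an assumption, not the project's actual implementation)
# is that it keeps pairs whose averaged positive-class probability falls
# inside the selected range and pseudo-labels them for later augmentation.
# A minimal self-contained sketch of such a filter:
import numpy as np

def _select_by_probability(examples, probs, prob_range=(0.0, 1.0)):
    """Return (example, pseudo_label) pairs whose averaged probability lies
    inside prob_range; labels are thresholded at 0.5."""
    probs = np.asarray(probs, dtype=float)
    low, high = prob_range
    keep = (probs >= low) & (probs <= high)
    labels = (probs >= 0.5).astype(int)
    return [(example, int(label))
            for example, label, k in zip(examples, labels, keep) if k]
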
def test_task(config):
    """Run the saved models over the test set (optionally over both sentence
    orders), average the predicted probabilities, threshold them with
    config.prob_threshold, and write the submission CSV to
    config.save_data_path."""

    print('cuda: {}'.format(torch.cuda.is_available()))
    print('cur device {}'.format(config.device.type))
    start_time = time.time()

    processor = DataProcessor(config)
    config.class_list = processor.get_labels()
    config.num_labels = len(config.class_list)

    test_examples = processor.get_test_examples(config.test_data_dir)
    # swap the sentence pair
    if config.reverse_tag:
        reverse_test_examples = sentence_reverse(test_examples)
        all_examples = [test_examples, reverse_test_examples]
    else:
        all_examples = [test_examples]

    cur_model = MODEL_CLASSES[config.use_model]
    print('loading data time: {:.6f}s'.format(time.time() - start_time))

    all_predict = []
    for i in range(config.model_num):
        model_time_s = time.time()
        print('the model of {} starting...'.format(config.models_name[i]))
        tokenizer = BertTokenizer.from_pretrained(
            config.tokenizer_file[i], do_lower_case=config.do_lower_case)
        model = cur_model(config, num=i)
        model_load(config, model, num=i, device='cpu')
        model.to(config.device)
        print("\tloading pre-train model, cost time {:.6f}s".format(
            time.time() - model_time_s))

        single_model_predict = []
        for e_idx, t_examples in enumerate(all_examples):
            example_time = time.time()
            _, _, predict_label = cross_validation(config=config,
                                                   model=model,
                                                   tokenizer=tokenizer,
                                                   train_examples=None,
                                                   dev_examples=None,
                                                   pattern='predict',
                                                   train_enhancement=None,
                                                   test_examples=t_examples)
            single_model_predict.append(predict_label)
            print("\ttest dataset:{}, cost time {:.6f}s, total time {:.6f}s".
                  format(e_idx + 1,
                         time.time() - example_time,
                         time.time() - start_time))

        print("# time {:.6f}s, total time {:.6f}s".format(
            time.time() - model_time_s,
            time.time() - start_time))
        predict_prob = combined_result(single_model_predict, pattern='average')
        all_predict.append(predict_prob)

    final_predict_label = combined_result(all_predict, pattern='average')
    final_predict_label = np.asarray(
        final_predict_label >= config.prob_threshold, dtype=int)

    index = list(
        pd.read_csv(os.path.join(config.test_data_dir, 'test.csv'),
                    encoding='utf-8')['id'])
    df_upload = pd.DataFrame({'id': index, 'label': final_predict_label})
    df_upload.to_csv(config.save_data_path, index=False)
    print('\ntotal time {:.6f}s'.format(time.time() - start_time))
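
# sentence_reverse() is used above for test-time augmentation of a symmetric
# sentence-pair task: (A, B) and (B, A) should receive the same label, so the
# model is run on both orders and the probabilities are averaged. A minimal
# sketch of the swap, assuming each example exposes text_a/text_b fields
# (an assumption about the project's InputExample-style objects):
import copy

def _sentence_reverse_sketch(examples):
    """Return copies of `examples` with text_a and text_b swapped."""
    reversed_examples = []
    for example in examples:
        swapped = copy.copy(example)
        swapped.text_a, swapped.text_b = example.text_b, example.text_a
        reversed_examples.append(swapped)
    return reversed_examples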