コード例 #1
0
def training(opt):
    """Train a single model on the train and dev examples combined.

    The processor is built from ``opt.raw_data_dir`` and yields four
    collections (train, dev, test, pseudo). The pseudo examples are not
    merged into the training set in this variant. ``train_base`` receives
    the combined train+dev collection, the dev collection and the test
    collection.

    Args:
        opt: Options object; ``raw_data_dir`` is read here and the whole
            object is forwarded to ``train_base``.
    """
    data_processor = NERProcessor(opt.raw_data_dir)
    train_set, dev_set, test_set, _pseudo_set = data_processor.get_data_examples()

    # The dev examples are appended to the training examples; the dev set is
    # still passed separately to train_base.
    merged_train_set = train_set + dev_set
    train_base(opt, merged_train_set, dev_set, test_set)
コード例 #2
0
def training(opt):
    """Run one training pass on train + pseudo + dev examples.

    A timestamped message is printed before and after the run. The
    processor is built from ``opt.raw_data_dir``; of the five collections it
    returns, the second test collection is not used here.

    Args:
        opt: Options object; ``raw_data_dir`` is read here and the whole
            object is forwarded to ``train_base``.
    """
    stamp_format = "%Y-%m-%d-%H:%M:%S"
    data_processor = NERProcessor(opt.raw_data_dir)
    print("开始单折训练和预测:{}".format(time.strftime(stamp_format, time.localtime())))

    (train_set, dev_set, fu_test_set,
     pseudo_set, _chu_test_set) = data_processor.get_data_examples()

    # Training data = original train examples, then pseudo examples, then dev.
    merged_train_set = train_set + pseudo_set + dev_set

    train_base(opt, merged_train_set, dev_set, fu_test_set)

    print("结束单折训练和预测:{}".format(time.strftime(stamp_format, time.localtime())))
コード例 #3
0
def stacking(opt):
    """Train one model per fold of a 5-fold split over ``stack.json``.

    For every fold, the training part is extended with all examples read
    from ``pseudo.json``; both parts are converted with
    ``processor.get_examples`` and handed to ``train_base``. Each fold
    writes to its own sub-directory ``v{i}`` of the original
    ``opt.output_dir``.

    Args:
        opt: Options object. ``task_type``, ``max_seq_len``,
            ``raw_data_dir`` and ``output_dir`` are read; ``output_dir`` is
            overwritten on every fold and is left pointing at the last
            fold's directory when the function returns.
    """
    logger.info('Start to KFold stack attribution model')

    # Use the ``opt`` argument rather than a module-level ``args`` global so
    # the function honours the options it was actually called with and does
    # not raise NameError when imported and called from another module.
    if opt.task_type == 'mrc':
        # 62 is subtracted for the mrc query (per the original note)
        processor = NERProcessor(opt.max_seq_len - 62)
    else:
        processor = NERProcessor(opt.max_seq_len)

    kf = KFold(5, shuffle=True, random_state=42)

    stack_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'stack.json'))

    pseudo_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'pseudo.json'))

    # Remember the root directory: opt.output_dir is rewritten for each fold.
    base_output_dir = opt.output_dir

    for i, (train_ids, dev_ids) in enumerate(kf.split(stack_raw_examples)):
        logger.info(f'Start to train the {i} fold')
        train_raw_examples = [stack_raw_examples[_idx] for _idx in train_ids]

        # add pseudo data to train data
        train_raw_examples = train_raw_examples + pseudo_raw_examples
        train_examples = processor.get_examples(train_raw_examples, 'train')

        dev_raw_examples = [stack_raw_examples[_idx] for _idx in dev_ids]
        dev_info = processor.get_examples(dev_raw_examples, 'dev')

        opt.output_dir = os.path.join(base_output_dir, f'v{i}')

        train_base(opt, train_examples, dev_info)
コード例 #4
0
def training(opt):
    """Train a single model on ``train.json``, optionally evaluating on ``dev.json``.

    Args:
        opt: Options object. ``task_type``, ``max_seq_len``,
            ``raw_data_dir`` and ``eval_model`` are read here; the whole
            object is forwarded to ``train_base``. When ``eval_model`` is
            falsy, ``train_base`` receives ``None`` as the dev examples.
    """
    # Use the ``opt`` argument rather than a module-level ``args`` global so
    # the function honours the options it was actually called with and does
    # not raise NameError when imported and called from another module.
    if opt.task_type == "mrc":
        # 62 is subtracted for the mrc query (per the original note)
        processor = NERProcessor(opt.max_seq_len - 62)
    else:
        processor = NERProcessor(opt.max_seq_len)

    # TODO: decide whether stack.json should be used instead of train.json,
    # and whether pseudo.json examples should be appended to the training
    # data. Both are currently disabled, so only train.json is read.
    train_raw_examples = processor.read_json(
        os.path.join(opt.raw_data_dir, "train.json"))

    train_examples = processor.get_examples(train_raw_examples, "train")

    dev_examples = None
    if opt.eval_model:
        dev_raw_examples = processor.read_json(
            os.path.join(opt.raw_data_dir, "dev.json"))
        dev_examples = processor.get_examples(dev_raw_examples, "dev")

    train_base(opt, train_examples, dev_examples)
コード例 #5
0
def ready_pretrain_data():
    """Collect the text of every example as a list of stripped strings.

    Examples are taken, in order, from the train, dev and both test
    collections returned by a ``NERProcessor`` built on ``./data/raw_data``;
    the pseudo collection is skipped. For each example, ``sample.text`` is
    joined into one string and stripped of surrounding whitespace.

    Returns:
        list: One string per example.
    """
    # Semi-final round data. External data (./data/raw_data/addr_sample)
    # is not loaded in this version.
    loader = NERProcessor('./data/raw_data')
    train_set, dev_set, fu_test_set, _, chu_test_set = loader.get_data_examples()

    splits = (train_set, dev_set, fu_test_set, chu_test_set)
    return [
        ''.join(example.text).strip()
        for split in splits
        for example in split
    ]
コード例 #6
0
def stacking(opt):
    """Run 5-fold cross-validated training, then generate pseudo data.

    Train and dev examples are pooled and split into five folds. For each
    fold, the pseudo examples are appended to the fold's training part,
    ``opt.output_dir`` is pointed at a per-fold ``v{i}`` sub-directory,
    ``opt.cv_num`` is set to the fold index and ``train_base`` is called.
    After all folds, ``generate_pseudos(opt)`` is called in both modes.

    Args:
        opt: Options object. ``max_seq_len``, ``cv_infer`` and
            ``output_dir`` are read; ``output_dir`` and ``cv_num`` are
            overwritten on every fold. When ``cv_infer`` is truthy only the
            first test collection is passed to ``train_base``; otherwise
            both test collections are concatenated.
    """
    def _now():
        # Timestamp used in the start/end progress messages.
        return time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())

    # NOTE(review): this passes max_seq_len to NERProcessor before calling
    # get_data_examples(); confirm that is the intended constructor argument.
    processor = NERProcessor(opt.max_seq_len)
    (train_feature, dev_feature, fu_test_feature,
     pseudo_feature, chu_test_feature) = processor.get_data_examples()
    pool = train_feature + dev_feature

    if not opt.cv_infer:
        print("开始cv生成pseudo数据:{}".format(_now()))
        test_feature = fu_test_feature + chu_test_feature
    else:
        print("开始进行cv训练和推理:{}".format(_now()))
        test_feature = fu_test_feature

    splitter = KFold(5, shuffle=True, random_state=42)

    # opt.output_dir is rewritten per fold, so keep the root directory.
    root_output_dir = opt.output_dir
    # Trained models are collected here but not used further in this function.
    models = []
    for fold, (train_ids, dev_ids) in enumerate(splitter.split(pool)):
        logger.info(f'Start to train the {fold} fold')

        # Fold training data = selected pool examples plus all pseudo examples.
        fold_train = [pool[k] for k in train_ids] + pseudo_feature
        fold_dev = [pool[k] for k in dev_ids]

        opt.output_dir = os.path.join(root_output_dir, f'v{fold}')
        opt.cv_num = fold

        model, _device = train_base(opt, fold_train, fold_dev, test_feature)
        models.append(model)

    # Generate the pseudo dataset; only the closing message depends on mode.
    closing_message = "结束cv训练和推理:{}" if opt.cv_infer else "结束cv生成pseudo数据:{}"
    generate_pseudos(opt)
    print(closing_message.format(_now()))
コード例 #7
0
def training(opt):
    """Train a single model on ``train.json`` plus ``pseudo.json``.

    Optionally loads ``dev.json`` for evaluation.

    Args:
        opt: Options object. ``task_type``, ``max_seq_len``,
            ``raw_data_dir`` and ``eval_model`` are read here; the whole
            object is forwarded to ``train_base``. When ``eval_model`` is
            falsy, ``train_base`` receives ``None`` as the dev examples.
    """
    # Use the ``opt`` argument rather than a module-level ``args`` global so
    # the function honours the options it was actually called with and does
    # not raise NameError when imported and called from another module.
    if opt.task_type == 'mrc':
        # 62 is subtracted for the mrc query (per the original note)
        processor = NERProcessor(opt.max_seq_len - 62)
    else:
        processor = NERProcessor(opt.max_seq_len)

    train_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'train.json'))

    # add pseudo data to train data
    pseudo_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'pseudo.json'))
    train_raw_examples = train_raw_examples + pseudo_raw_examples

    train_examples = processor.get_examples(train_raw_examples, 'train')

    dev_examples = None
    if opt.eval_model:
        dev_raw_examples = processor.read_json(os.path.join(opt.raw_data_dir, 'dev.json'))
        dev_examples = processor.get_examples(dev_raw_examples, 'dev')

    train_base(opt, train_examples, dev_examples)