Example #1
def train_albert(hyper_parameters=None, rate=1.0):
    # path_ner_people_1998_train = "D:/soft_install/dataset/corpus/ner/china-people-daily-ner-corpus/example.train"
    # path_ner_people_1998_valid = "D:/soft_install/dataset/corpus/ner/china-people-daily-ner-corpus/example.dev"
    if not hyper_parameters:
        hyper_parameters = {
            'len_max': 128,  # max sentence length; 20-50 is the usual range; longer is slower for bert and uses more memory (20 is enough on a local win10-4G box; too large risks OOM)
            'embed_size': 768,  # char/word embedding dimension; 768 for bert, 300 for word, char can be smaller
            'vocab_size': 20000,  # placeholder; overwritten by the code
            'trainable': True,  # whether the embedding is static or trainable, i.e. whether it can be fine-tuned
            'level_type': 'char',  # granularity, the smallest unit: 'char' or 'word'; note: in word2vec mode the training corpus must be tokenized first
            'embedding_type': 'albert',  # embedding type; also accepts 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
            'gpu_memory_fraction': 0.76,  # GPU memory fraction
            'model': {
                'label': 575,  # number of classes
                'batch_size': 2,  # batch size; in principle larger is better, especially with imbalanced samples; this setting has a big impact
                'dropout': 0.5,  # dropout probability
                'decay_step': 100,  # learning-rate decay step; decay once every N steps
                'decay_rate': 0.9,  # learning-rate decay factor (multiplicative)
                'epochs': 132,  # max training epochs
                'patience': 18,  # early-stopping patience; 2-3 is usually enough
                'lr': 5e-5,  # learning rate; 5e-5 for bert, 1e-3 otherwise; if accuracy is low or stuck, tune this first, it strongly affects training
                'l2': 1e-9,  # L2 regularization
                'activate_classify': 'softmax',  # activation of the final (classification) layer
                'loss': 'sparse_categorical_crossentropy',  # loss function: mse, categorical_crossentropy, sparse_categorical_crossentropy, binary_crossentropy, etc.
                'metrics': 'accuracy',  # metric for deciding when to save a better model: accuracy, binary_accuracy, categorical_accuracy, sparse_categorical_accuracy, sparse_top_k_categorical_accuracy
                'optimizer_name': 'ADAM',  # one of 'ADAM', 'RADAM', 'RADAM,LOOKAHEAD'
                'is_training': True,  # training or testing mode
                'path_model_dir': os.path.join(path_model_dir, "bilstm"),
                'model_path': os.path.join(path_model_dir, "bilstm_crf.model"),  # model save path; written when loss improves (save_best_only=True, save_weights_only=True)
                'path_hyper_parameters': os.path.join(path_model_dir, "hyper_parameters.json"),  # path of the hyperparameters (including the embedding config)
                'path_fineture': os.path.join(path_model_dir, "embedding.model"),  # path of the trainable (fine-tuned) embedding, e.g. char, word or bert vectors
                'path_l2i_i2l': os.path.join(path_model_dir, "l2i_i2l.json"),
                'num_rnn_layers': 1,  # number of RNN layers
                'rnn_type': 'GRU',  # RNN type: "LSTM", "GRU", "CuDNNLSTM" or "CuDNNGRU"
                'rnn_units': 128,  # RNN hidden units
                'crf_mode': 'other',  # CRF mode: 'other', 'reg' or 'pad' ('pad' uses the actual sentence length)
            },
            'embedding': {
                'layer_indexes': [-1, -2, -3, -4],  # which BERT layers to take
                # 'corpus_path': '',  # pretrained embedding path; if unset, the default from conf is used. keras-bert can load Google BERT, Baidu ERNIE (needs conversion, https://github.com/ArthurRizar/tensorflow_ernie) and HIT BERT-wwm (TF, https://github.com/ymcui/Chinese-BERT-wwm)
            },
            'data': {
                'train_data': path_seg_pku_1998_train,  # path_ner_people_1998_train,  # training data
                'val_data': path_seg_pku_1998_train,  # path_ner_people_1998_valid  # validation data
            },
        }

    # delete any previously saved model, fine-tuned embedding, etc.
    delete_file(path_model_dir)
    time_start = time.time()
    if not os.path.exists(hyper_parameters['model']['path_model_dir']):
        os.mkdir(hyper_parameters['model']['path_model_dir'])
    # initialize data preprocessing
    from macropodus.network.preprocess.preprocess_generator import PreprocessGenerator
    pg = PreprocessGenerator(os.path.join(path_model_dir, "l2i_i2l.json"))
    label_sets, _ = pg.preprocess_label2set(
        hyper_parameters['data']['train_data'],
        hyper_parameters['embedding_type'])
    # number of sequence label classes in the training set
    hyper_parameters['model']['label'] = len(label_sets)
    # initialize the graph
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    ra_ed = graph.word_embedding

    # data preprocessing, fit
    _, len_train = pg.preprocess_label2set(
        hyper_parameters['data']['train_data'])
    x_train, y_train = pg.preprocess_label_question_to_idx_fit(
        embedding_type=hyper_parameters['embedding_type'],
        path=hyper_parameters['data']['train_data'],
        embed=ra_ed,
        rate=rate)

    x_val, y_val = pg.preprocess_label_question_to_idx_fit(
        embedding_type=hyper_parameters['embedding_type'],
        path=hyper_parameters['data']['val_data'],
        embed=ra_ed,
        rate=rate)
    # train
    graph.fit(x_train, y_train, x_val, y_val)
    print("耗时:" + str(time.time() - time_start))
Example #2
def train_w2v(hyper_parameters=None,
              rate=1.0,
              path_train=None,
              path_val=None,
              path_train_dir=None):
    # timing
    time_start = time.time()
    # default hyperparameters
    if not hyper_parameters:
        hyper_parameters = {
            'len_max': 128,  # max sentence length; 20-50 is the usual range; longer is slower for bert and uses more memory (20 is enough on a local win10-4G box; too large risks OOM)
            'embed_size': 256,  # char/word embedding dimension; 768 for bert, 300 for word, char can be smaller
            'vocab_size': 20000,  # placeholder; overwritten by the code
            'trainable': True,  # whether the embedding is static or trainable, i.e. whether it can be fine-tuned
            'level_type': 'char',  # granularity, the smallest unit: 'char' or 'word'; note: in word2vec mode the training corpus must be tokenized first
            'embedding_type': 'random',  # embedding type; also accepts 'ngram', 'random', 'bert', 'albert' or 'word2vec'
            'gpu_memory_fraction': 0.76,  # GPU memory fraction
            'model': {
                'label': 99,  # number of classes
                'batch_size': 32,  # batch size; in principle larger is better, especially with imbalanced samples; this setting has a big impact
                'dropout': 0.8,  # dropout probability
                'decay_step': 3000,  # learning-rate decay step; decay once every N steps
                'decay_rate': 0.999,  # learning-rate decay factor (multiplicative)
                'epochs': 16,  # max training epochs
                'patience': 3,  # early-stopping patience; 2-3 is usually enough
                'lr': 1e-3,  # learning rate; 5e-5 for bert, 1e-3 otherwise; if accuracy is low or stuck, tune this first, it strongly affects training
                'l2': 0.5,  # L2 regularization
                'filters': [2, 3, 4],
                'filters_num': 256,
                'activate_rnn': 'tanh',  # activation inside the RNN layers: 'tanh', 'relu' or 'sigmoid'
                'activate_classify': 'softmax',  # activation of the final (classification) layer: 'softmax' or 'sigmoid'
                'loss': 'categorical_crossentropy',  # loss function: mse, categorical_crossentropy, sparse_categorical_crossentropy, binary_crossentropy, etc.
                'metrics': 'accuracy',  # metric for deciding when to save a better model: accuracy, binary_accuracy, categorical_accuracy, sparse_categorical_accuracy, sparse_top_k_categorical_accuracy
                'optimizer_name': 'ADAM',  # one of 'ADAM', 'RADAM', 'RADAM,LOOKAHEAD'
                'is_training': True,  # training or testing mode: True when training, False when testing
                'path_model_dir': os.path.join(path_model_dir, "crf"),  # root directory for the model being trained
                'model_path': os.path.join(path_model_dir, "bilstm_crf.model"),  # model save path; written when loss improves (save_best_only=True, save_weights_only=True)
                'path_hyper_parameters': os.path.join(path_model_dir, "hyper_parameters.json"),  # path of the hyperparameters (including the embedding config)
                'path_fineture': os.path.join(path_model_dir, "embedding.model"),  # path of the trainable (fine-tuned) embedding, e.g. char, word or bert vectors
                'path_l2i_i2l': os.path.join(path_model_dir, "l2i_i2l.json"),  # label<->index mapping dict
                'num_rnn_layers': 1,  # number of RNN layers: 1, 2, 3, etc.
                'rnn_type': 'LSTM',  # RNN type: "LSTM", "GRU", "CuDNNLSTM" or "CuDNNGRU"
                'rnn_units': 256,  # RNN hidden units: 128, 256, 512, 768, 1024, etc.
                'crf_mode': 'reg',  # CRF mode: 'other', 'reg' or 'pad' ('pad' uses the actual sentence length)
            },
            'embedding': {
                'layer_indexes': [1, 2, 3, 4],  # [-1, -2, -3, -4],  # which BERT layers to take
                'corpus_path': path_seg_pku_1998_train,  # pretrained embedding path; if unset, the default from conf is used. keras-bert can load Google BERT, Baidu ERNIE (needs conversion, https://github.com/ArthurRizar/tensorflow_ernie) and HIT BERT-wwm (TF, https://github.com/ymcui/Chinese-BERT-wwm)
            },
            'data': {
                'train_data': path_tag_people_1998_train,  # path_tag_people_2014_train, # path_seg_pku_1998_train, # path_ner_people_1998_train,  # training data
                'val_data': path_tag_people_1998_train  # path_tag_people_2014_valid, # path_seg_pku_1998_train # path_ner_people_1998_valid  # validation data
            },
        }
    # externally supplied arguments: training/validation corpora and model save directory
    if path_train:
        hyper_parameters["data"]["train_data"] = path_train
        hyper_parameters["data"]["val_data"] = path_train
    if path_val:
        hyper_parameters["data"]["val_data"] = path_val
    if path_train_dir:
        hyper_parameters["model"]["path_model_dir"] = path_train_dir

    # create the model save directory
    if not os.path.exists(hyper_parameters['model']['path_model_dir']):
        os.mkdir(hyper_parameters['model']['path_model_dir'])
    # delete any previously saved model, fine-tuned embedding, etc.
    delete_file(hyper_parameters['model']['path_model_dir'])
    # initialize data preprocessing
    from macropodus.network.preprocess.preprocess_generator import PreprocessGenerator
    pg = PreprocessGenerator(
        os.path.join(hyper_parameters['model']['path_model_dir'],
                     "l2i_i2l.json"))
    label_sets, _ = pg.preprocess_label2set(
        hyper_parameters['data']['train_data'],
        hyper_parameters['embedding_type'])
    # number of sequence label classes in the training set
    hyper_parameters['model']['label'] = len(label_sets)
    # initialize the graph
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    ra_ed = graph.word_embedding
    # # data preprocessing, fit (eager alternative to fit_generator below)
    # x_train, y_train = pg.preprocess_label_question_to_idx_fit(embedding_type=hyper_parameters['embedding_type'],
    #                                                            path=hyper_parameters['data']['train_data'],
    #                                                            embed=ra_ed,
    #                                                            rate=rate,
    #                                                            crf_mode=hyper_parameters['model']['crf_mode'])
    #
    # x_val, y_val = pg.preprocess_label_question_to_idx_fit(embedding_type=hyper_parameters['embedding_type'],
    #                                                        path=hyper_parameters['data']['val_data'],
    #                                                        embed=ra_ed,
    #                                                        rate=rate,
    #                                                        crf_mode=hyper_parameters['model']['crf_mode'])
    # # train
    # graph.fit(x_train, y_train, x_val, y_val)
    # fit_generator: feed batches through a generator instead of materializing the full arrays
    graph.fit_generator(ra_ed, rate)

    print("耗时:" + str(time.time() - time_start))