Example #1
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

def run():
    batch_size = 63
    epochs = 5000
    
    data_process = DataProcess(use_word2cut=False)

    model = build_model()
  
    documents_length = data_process.get_documents_size(data_process.enc_ids_file, data_process.dec_ids_file)
    
    if batch_size > documents_length:
        print("ERROR--->" + u"语料数据量过少,请再添加一些")
        return None
    # Adaptive learning rate
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=20, min_lr=1e-6, mode='min')
    '''monitor: the quantity to monitor, e.g. val_loss or val_acc
    patience: number of epochs with no improvement (e.g. the monitored loss did not decrease compared with the previous epoch) after which the callback fires
    verbose: verbosity mode
    mode: one of 'auto', 'min', 'max'; in 'min' mode training stops when the monitored value stops decreasing, in 'max' mode when it stops increasing.'''
    early_stopping = EarlyStopping(monitor='val_loss', patience=50, verbose=2)
    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=int(documents_length / batch_size) + 5,
                        validation_data=generate_batch(batch_size=batch_size),
                        validation_steps=int(documents_length / batch_size) + 5,
                        epochs=epochs, verbose=1, workers=2, use_multiprocessing=True,
                        callbacks=[reduce_lr, early_stopping])

    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
Example #2
def run():
    batch_size = 63
    epochs = 5000

    data_process = DataProcess(use_word2cut=False)

    model = build_model()

    documents_length = data_process.get_documents_size(
        data_process.enc_ids_file, data_process.dec_ids_file)

    if batch_size > documents_length:
        print("ERROR--->" + u"语料数据量过少,请再添加一些")
        return None

    model.fit_generator(generator=generate_batch(batch_size=batch_size),
                        steps_per_epoch=int(documents_length / batch_size),
                        epochs=epochs, verbose=1, workers=1)

    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
Example #3
import gensim
from seq2seq.models import AttentionSeq2Seq

def run():

    enc_vec_model = gensim.models.Word2Vec.load(r'model/encoder_vector.m')
    dec_vec_model = gensim.models.Word2Vec.load(r'model/decoder_vector.m')

    batch_size = 9
    epochs = 30
    data_process = DataProcess(use_word2cut=False)
    documents_length = data_process.get_documents_size(
        data_process.enc_ids_file, data_process.dec_ids_file)
    input_length = data_process.enc_input_length
    output_length = data_process.dec_output_length
    enc_embedding_length = data_process.enc_embedding_length
    dec_embedding_length = data_process.dec_embedding_length

    if batch_size > documents_length:
        print("ERROR--->" + u"The corpus is too small, please add more data")
        return None

    if data_process.hidden_dim < data_process.enc_input_length:
        print("ERROR--->" + u"Too few hidden-layer neurons, please add more")
        return None

    model = AttentionSeq2Seq(output_dim=dec_embedding_length,
                             hidden_dim=data_process.hidden_dim,
                             output_length=output_length,
                             input_shape=(input_length, enc_embedding_length),
                             batch_size=batch_size,
                             depth=data_process.layer_shape)
    # keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
    model.compile(loss='mse', optimizer='rmsprop')
    model.fit_generator(generator=generate_batch(batch_size=batch_size,
                                                 encoder_word2vec_model=enc_vec_model,
                                                 decoder_word2vec_model=dec_vec_model,
                                                 encoder_file_path=data_process.enc_ids_padding_file,
                                                 decoder_file_path=data_process.dec_ids_padding_file,
                                                 embedding_shape=(enc_embedding_length, dec_embedding_length)),
                        steps_per_epoch=int(documents_length / batch_size),
                        epochs=epochs, verbose=1, workers=1)

    model.save_weights("model/seq2seq_model_weights.h5", overwrite=True)
Example #4
import os
import pickle

import gensim

def run():

    if not os.path.exists("data"):

        os.makedirs("data")

    if not os.path.exists("model"):

        os.makedirs("model")

    print("step-1--->" + u"加载词向量模型" + "--->START")

    embedding_model = gensim.models.Word2Vec.load(
        r'model/model_vector_people.m')

    word_dict = create_useful_words(embedding_model)

    embedding_size = embedding_model.vector_size

    print("step-2--->" + u"语料格式转换,加标注生成标准文件" + "--->START")

    raw_train_file = [corpus_path + os.sep + main_path + os.sep + sub_path \
                      for main_path in os.listdir(corpus_path) \
                      for sub_path in os.listdir(corpus_path + os.sep + main_path)]

    create_label_data(word_dict, raw_train_file)

    print("step-3--->" + u"按标点符号或是空格存储文件" + "--->START")

    documents_length = create_documents()

    print("step-4--->" + u"对语料中的词统计排序生成索引" + "--->START")

    lexicon, lexicon_reverse = create_lexicon(word_dict)

    print("step-5--->" + u"对所有的词创建词向量" + "--->START")

    useful_word_length, embedding_weights = create_embedding(
        embedding_model, embedding_size, lexicon_reverse)

    print("step-6--->" + u"生成标注以及索引" + "--->START")

    label_2_index = create_label_index()

    label_2_index_length = len(label_2_index)

    print("step-7--->" + u"将语料中每一句和label进行索引编码" + "--->START")

    create_matrix(lexicon, label_2_index)

    print("step-8--->" + u"将语料中每一句和label以最大长度统一长度,不足补零" + "--->START")

    max_len = maxlen_2d_list()

    padding_sentences(max_len)

    print("step-9--->" + u"模型创建" + "--->START")

    model = bilstm_cnn_crf(max_len, useful_word_length + 2,
                           label_2_index_length, embedding_size,
                           embedding_weights)

    print("step-10--->" + u"模型训练" + "--->START")

    if batch_size > documents_length:
        print("ERROR--->" + u"The corpus is too small, please add more data")
        return None

    _ = model.fit_generator(generator=generate_batch(batch_size=batch_size, label_class=label_2_index_length),
                            steps_per_epoch=int(documents_length / batch_size),
                            epochs=epochs, verbose=1, workers=1)

    print("step-11--->" + u"模型和字典保存" + "--->START")

    model.save_weights('model/train_model.hdf5')

    index_2_label = create_index_label()

    pickle.dump([lexicon, index_2_label], open('model/lexicon.pkl', 'wb'))

    pickle.dump([
        max_len, embedding_size, useful_word_length + 2, label_2_index_length
    ], open('model/model_params.pkl', 'wb'))

    print("step-12--->" + u"打印恢复模型的重要参数" + "--->START")

    print("sequence_max_length: " + str(max_len))

    print("embedding size: " + str(embedding_size))

    print("useful_word_length: " + str(useful_word_length + 2))

    print("label_2_index_length: " + str(label_2_index_length))

    print(u"训练完成" + "--->OK")
Example #5
from keras.callbacks import TensorBoard

# TensorBoard logging callback for the fit_generator call below; the opening of
# this call is missing from the snippet, so the assignment here is an assumption.
tensorBoard = TensorBoard(
    histogram_freq=0,
    batch_size=args.batch_size,
    write_graph=True,
    write_grads=False,
    write_images=False,
    embeddings_freq=0,
    embeddings_layer_names=None,
    embeddings_metadata=None,
    embeddings_data=None,
    update_freq="epoch",
)
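# NOTE: `checkpoint` and `earlyStopping` are referenced below but not constructed
# in this snippet; a plausible setup with keras.callbacks (the monitor and
# patience values here are assumptions):
from keras.callbacks import EarlyStopping, ModelCheckpoint

checkpoint = ModelCheckpoint(trainPath.weights_path, monitor="loss",
                             save_best_only=True, save_weights_only=True)
earlyStopping = EarlyStopping(monitor="loss", patience=10, verbose=1)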

_ = model.fit_generator(
    generator=generate_batch(
        trainPath=trainPath,
        batch_size=args.batch_size,
        label_class=args.label_2_index_length,
    ),
    steps_per_epoch=int(args.documents_length / args.batch_size),
    epochs=args.epochs,
    verbose=1,
    workers=1,
    callbacks=[checkpoint, tensorBoard, earlyStopping],
)

logger.info("step-11--->" + u"模型和字典保存" + "--->START")

model.save_weights(trainPath.weights_path)

index_2_label = dataPreprocess.create_index_label()