コード例 #1
0
def train(logpath, modeldir, batch_size=256, epochs=100):
    """Fit the auto-encoder and the cluster model on the data in *logpath*.

    The function validates the paths, loads and pre-processes the training
    data, fits an ``AutoEncoder``, encodes the training data with it, and
    clusters the encoded vectors with ``Cluster``.

    Args:
        logpath: Path of the training log; passed to ``check_validity``,
            ``check_file`` and ``get_train_dataset``.
        modeldir: Directory for the model artifacts. ``model.h5`` and
            ``word_dict.json`` are resolved inside it; a trailing path
            separator is optional.
        batch_size: Forwarded to ``AutoEncoder.fit``.
        epochs: Forwarded to ``AutoEncoder.fit``.

    Returns:
        A ``(cluster_number, topn_sql)`` tuple, where ``cluster_number`` is
        the second value returned by ``Cluster.classify`` and ``topn_sql`` is
        the last column of the training rows selected by ``get_topn_sql``.
    """
    import os  # local import, same style as the lazy AutoEncoder import below

    # os.path.join yields the same paths as plain concatenation when modeldir
    # already ends with a separator (or is empty), and additionally handles a
    # modeldir given without one instead of producing e.g. "dirmodel.h5".
    modelpath = os.path.join(modeldir, 'model.h5')
    dictpath = os.path.join(modeldir, 'word_dict.json')
    for filepath in (logpath, modelpath, dictpath):
        check_validity(filepath)
    check_file(logpath)

    # load data
    train_data = get_train_dataset(logpath)

    # pre-process
    from autoencoder import AutoEncoder  # lazy load

    pre_processor = Preprocessor(filepath=dictpath)
    train_sr, time_sr = pre_processor.pre_process(train_data)
    # The auto-encoder input shape is taken from axes 1 and 2 of train_sr.
    autoencoder = AutoEncoder(shape=(train_sr.shape[1], train_sr.shape[2]),
                              filepath=modelpath)
    # NOTE(review): Cluster receives modeldir unchanged; confirm that it
    # copes with a directory given without a trailing separator.
    cluster_model = Cluster(dirpath=modeldir)

    # train
    autoencoder.fit(train_sr, batch_size=batch_size, epochs=epochs)
    train_vector = autoencoder.transfer(train_sr)
    predict_result, cluster_number, dist_tbl = cluster_model.classify(
        train_vector)
    top_index = get_topn_sql(dist_tbl, topn=1)
    # typical SQL template for each cluster: last column of the selected rows
    topn_sql = train_data[top_index][:, -1]
    cluster_model.get_cluster_info(predict_result, time_sr, cluster_number)
    print("Train complete!")
    return cluster_number, topn_sql