def train(logpath, modeldir, batch_size=256, epochs=100):
    modelpath = modeldir + 'model.h5'
    dictpath = modeldir + 'word_dict.json'
    for filepath in [logpath, modelpath, dictpath]:
        check_validity(filepath)
    check_file(logpath)

    # load data
    train_data = get_train_dataset(logpath)

    # pre-process
    from autoencoder import AutoEncoder  # lazy load
    pre_processor = Preprocessor(filepath=dictpath)
    train_sr, time_sr = pre_processor.pre_process(train_data)
    autoencoder = AutoEncoder(shape=(train_sr.shape[1], train_sr.shape[2]),
                              filepath=modelpath)
    cluster_model = Cluster(dirpath=modeldir)

    # train
    autoencoder.fit(train_sr, batch_size=batch_size, epochs=epochs)
    train_vector = autoencoder.transfer(train_sr)
    predict_result, cluster_number, dist_tbl = cluster_model.classify(train_vector)
    top_index = get_topn_sql(dist_tbl, topn=1)
    topn_sql = train_data[top_index][:, -1]  # typical SQL template for each cluster
    cluster_model.get_cluster_info(predict_result, time_sr, cluster_number)
    print("Train complete!")
    return cluster_number, topn_sql
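
# A minimal invocation sketch (an assumption, not part of the original source):
# the log path and model directory below are hypothetical placeholders, and the
# model directory is written with a trailing slash because modelpath/dictpath
# above are built by plain string concatenation rather than os.path.join.
if __name__ == '__main__':
    cluster_number, topn_sql = train('data/sql_log.csv', 'model/',
                                     batch_size=256, epochs=100)
    print('number of clusters:', cluster_number)
    print('representative SQL template per cluster:', topn_sql)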