def main():
    """Fit a PCA + K-Means pipeline on the loaded samples and persist it.

    Reads a 3-D array from ``pretreatment.load_data()``, flattens every sample
    into one row and divides by 255.0, projects the rows with
    ``PCA(n_components=0.99)``, then fits a 2000-cluster ``KMeans``. The
    resulting labels are written to ``labels.npy`` and the fitted estimator to
    ``k-means.pkl``. Each stage prints a marker with ``time.process_time()``.
    """
    # Load the training data.
    print('Start: read data', time.process_time())
    samples = pretreatment.load_data()
    count, height, width = samples.shape
    # One flat row per sample, scaled by 1/255.
    samples = samples.reshape((count, height * width)) / 255.0
    print('Samples', len(samples), 'Feature', len(samples[0]))

    # PCA
    print('Start: PCA', time.process_time())
    reducer = PCA(n_components=0.99)
    reducer.fit(samples)
    samples = reducer.transform(samples)
    print('Samples', len(samples), 'Feature', len(samples[0]))
    sys.stdout.flush()

    # Training
    print('Start: train', time.process_time())
    n_clusters = 2000  # number of cluster centres
    estimator = KMeans(
        n_clusters,
        n_init=1,
        max_iter=20,
        verbose=True,
    )
    estimator.fit(samples)
    print('Clusters', estimator.n_clusters, 'Iter', estimator.n_iter_)

    # Persist the per-sample cluster assignment.
    print('Start: classify', time.process_time())
    np.save('labels.npy', estimator.labels_)

    # Persist the fitted model itself.
    print('Start: save model', time.process_time())
    joblib.dump(estimator, 'k-means.pkl')
def main():
    """Run ``ocr`` over every item from ``load_data()`` and log the results.

    Each result is printed together with its index, both to ``texts.log``
    (opened for writing as UTF-8) and to stdout. If processing an item raises,
    the exception is printed to stderr and the loop continues with the next
    item.
    """
    import sys

    from pretreatment import load_data

    texts = load_data()
    # The context manager closes (and therefore flushes) the log file on every
    # exit path; the previous bare open() left the handle open.
    with open('texts.log', 'w', encoding='utf-8') as fp:
        for idx, text in enumerate(texts):
            try:
                text = ocr(text)
                print(idx, text, file=fp)
                print(idx, text)
            except Exception as e:
                # Deliberate best effort: report the failure and keep going so
                # one bad item does not abort the whole run.
                print(e, file=sys.stderr)
def learn():
    """Collect, for every distinct image value, a histogram of predicted labels.

    ``load_data()`` supplies ``texts`` and ``imgs``. ``mlearn.predict(texts)``
    is reduced with ``argmax(axis=1)`` to one label per entry. ``imgs`` is
    reinterpreted as ``uint64`` and arranged into rows of 8 values; for every
    distinct value, the labels of the rows containing it are counted into a
    histogram with at least 80 bins. The distinct values and their histograms
    are saved to ``images.npz`` under the keys ``images`` and ``labels``.

    NOTE(review): this assumes row ``i`` of the reshaped ``imgs`` corresponds
    to ``texts[i]`` -- confirm against ``load_data``.
    """
    texts, imgs = load_data()
    labels = mlearn.predict(texts).argmax(axis=1)

    # Take a uint64 view instead of assigning to ``imgs.dtype`` /
    # ``imgs.shape``: the attribute setters are discouraged by NumPy and they
    # mutate the array object handed back by load_data() in place.
    imgs = imgs.view(np.uint64).reshape(-1, 8)

    # No ``axis`` argument: uniqueness is over individual uint64 values, not
    # over whole rows.
    unique_imgs = np.unique(imgs)
    print(unique_imgs.shape)

    imgs_labels = []
    for img in unique_imgs:
        # Row indices of every position where this value occurs.
        idxs = np.where(imgs == img)[0]
        counts = np.bincount(labels[idxs], minlength=80)
        imgs_labels.append(counts)

    np.savez('images.npz', images=unique_imgs, labels=imgs_labels)
def main():
    """Run ``ocr`` over the first sequence from ``load_data()`` and log results.

    ``load_data()`` returns a pair; only its first element is used. Each result
    is printed together with its index, both to ``texts.log`` (opened for
    writing as UTF-8) and to stdout. If processing an item raises, the
    exception is printed to stderr and the loop continues with the next item.
    """
    import sys

    from pretreatment import load_data

    texts, _ = load_data()
    # The context manager closes (and therefore flushes) the log file on every
    # exit path; the previous bare open() left the handle open.
    with open('texts.log', 'w', encoding='utf-8') as fp:
        for idx, text in enumerate(texts):
            try:
                # Debugging aid kept from the original (disabled):
                # cv2.imshow('lena', text)
                # cv2.waitKey(1000)
                text = ocr(text)
                print(idx, text, file=fp)
                print(idx, text)
            except Exception as e:
                # Deliberate best effort: report the failure and keep going so
                # one bad item does not abort the whole run.
                print(e, file=sys.stderr)
# coding: utf-8
import sys

import cv2
import numpy as np

import pretreatment
import utils

# Usage: <script> RESULT_NPY OUTPUT_DIR
result_fn = sys.argv[1]
classify_fn = sys.argv[2]
utils.mkdir(classify_fn)

# Report how many distinct cluster ids actually occur in the saved result.
result = np.load(result_fn)
distinct_ids = np.unique(result)
print(distinct_ids.shape)

# Write every sample into the output directory, naming each file after its
# cluster id followed by the sample index in parentheses.
imgs = pretreatment.load_data()
for (idx, img), classify in zip(enumerate(imgs), result):
    dst = f'{classify_fn}/{classify}({idx}).jpg'
    cv2.imwrite(dst, img)