Beispiel #1
0
def main():
    """Cluster the pretreated samples with PCA followed by k-means.

    Loads a 3-D array from ``pretreatment.load_data()``, flattens each
    sample to a vector, scales it by 1/255, reduces dimensionality with PCA
    (``n_components=0.99``) and fits a 2000-cluster k-means model.

    Side effects: prints progress together with ``time.process_time()``,
    writes the per-sample cluster labels to ``labels.npy`` and dumps the
    fitted estimator to ``k-means.pkl``.
    """
    # Load the training data.
    print('Start: read data', time.process_time())
    X = pretreatment.load_data()    # NOQA
    n_samples, height, width = X.shape
    # One flat feature vector per sample, divided by 255.0
    # (presumably 8-bit pixel data - TODO confirm against pretreatment).
    X = X.reshape((n_samples, height * width)) / 255.0    # NOQA
    print('Samples', X.shape[0], 'Feature', X.shape[1])

    # Dimensionality reduction.
    print('Start: PCA', time.process_time())
    X = PCA(n_components=0.99).fit(X).transform(X)    # NOQA
    print('Samples', X.shape[0], 'Feature', X.shape[1])
    # Push the progress output out before the long-running training step.
    sys.stdout.flush()

    # Training.
    print('Start: train', time.process_time())
    n_clusters = 2000    # number of cluster centres
    estimator = KMeans(
        n_clusters=n_clusters,
        n_init=1,
        max_iter=20,
        verbose=True,
    )
    estimator.fit(X)
    print('Clusters', estimator.n_clusters, 'Iter', estimator.n_iter_)

    # Persist the cluster assignment of every sample, then the model itself.
    print('Start: classify', time.process_time())
    np.save('labels.npy', estimator.labels_)
    print('Start: save model', time.process_time())
    joblib.dump(estimator, 'k-means.pkl')
Beispiel #2
0
def main():
    """Run OCR over every loaded text image and log the results.

    Each item returned by ``pretreatment.load_data()`` is passed to ``ocr``;
    the index and recognised text are written to ``texts.log`` (UTF-8) and
    echoed to stdout. A failure on one item is reported on stderr and does
    not stop the remaining items from being processed.
    """
    import sys
    from pretreatment import load_data
    texts = load_data()
    # The context manager guarantees the log is flushed and closed, even if
    # something unexpected escapes the loop.
    with open('texts.log', 'w', encoding='utf-8') as fp:
        for idx, text in enumerate(texts):
            try:
                text = ocr(text)
                print(idx, text, file=fp)
                print(idx, text)
            except Exception as e:
                # Deliberate best effort: report the failure and carry on
                # with the next item.
                print(e, file=sys.stderr)
def learn():
    """Tally predicted text labels for every distinct image value.

    Loads ``(texts, imgs)`` via ``load_data()``, predicts a label index for
    each text with ``mlearn``, and for every unique uint64 value found in
    ``imgs`` counts the labels of the rows that contain it. The unique
    values and their count vectors are saved to ``images.npz`` under the
    keys ``images`` and ``labels``.

    Note: ``imgs`` is reinterpreted in place (dtype and shape are
    reassigned), so the array returned by ``load_data()`` is modified.
    """
    texts, imgs = load_data()
    # Keep only the index of the highest-scoring entry along axis 1
    # (presumably one score per class - verify against mlearn).
    labels = mlearn.predict(texts).argmax(axis=1)

    # View the same buffer as uint64 values arranged eight per row.
    # Presumably each value is one 8-byte image hash - TODO confirm.
    imgs.dtype = np.uint64
    imgs.shape = (-1, 8)

    # No axis is given, so uniqueness is over all values, not over rows.
    unique_imgs = np.unique(imgs)
    print(unique_imgs.shape)

    # For each unique value: take the row index of every matching element,
    # then histogram the labels of those rows into at least 80 bins.
    imgs_labels = [
        np.bincount(labels[np.where(imgs == value)[0]], minlength=80)
        for value in unique_imgs
    ]
    np.savez('images.npz', images=unique_imgs, labels=imgs_labels)
Beispiel #4
0
def main():
    """Run OCR over every loaded text image and log the results.

    Takes the first element of the pair returned by
    ``pretreatment.load_data()`` and passes each item to ``ocr``; the index
    and recognised text are written to ``texts.log`` (UTF-8) and echoed to
    stdout. A failure on one item is reported on stderr and does not stop
    the remaining items from being processed.
    """
    import sys
    from pretreatment import load_data
    texts, _ = load_data()
    # The context manager guarantees the log is flushed and closed, even if
    # something unexpected escapes the loop.
    with open('texts.log', 'w', encoding='utf-8') as fp:
        for idx, text in enumerate(texts):
            try:
                # Debug aid: cv2.imshow('lena', text); cv2.waitKey(1000)
                text = ocr(text)
                print(idx, text, file=fp)
                print(idx, text)
            except Exception as e:
                # Deliberate best effort: report the failure and carry on
                # with the next item.
                print(e, file=sys.stderr)
Beispiel #5
0
# coding: utf-8
import sys

import cv2
import numpy as np

import pretreatment
import utils

# Command line: <script> RESULT_FN CLASSIFY_FN
#   RESULT_FN   - array file with one clustering result per sample (np.load)
#   CLASSIFY_FN - output directory that receives the renamed images
result_fn = sys.argv[1]
classify_fn = sys.argv[2]

# Presumably creates the output directory - see utils.mkdir.
utils.mkdir(classify_fn)

# Report how many cluster centres actually have samples assigned to them.
result = np.load(result_fn)
print(np.unique(result).shape)

# Write out every sample, named after its clustering result and its index.
imgs = pretreatment.load_data()
for idx, (img, classify) in enumerate(zip(imgs, result)):
    dst = f'{classify_fn}/{classify}({idx}).jpg'
    # cv2.imwrite can signal failure through its boolean return value
    # instead of raising, so check it; report and keep going rather than
    # silently dropping the sample.
    if not cv2.imwrite(dst, img):
        print(f'Failed to write {dst}', file=sys.stderr)