# Example #1
import os

from gala import agglo, classify, imio

# `fman` is the gala feature manager used throughout this log (e.g. a
# features.base.Composite); it is assumed to be defined earlier in the session.

def train(index):
    out_fn = 'training-data-%i.h5' % index
    if os.path.exists(out_fn):
        # reuse cached training data if it has already been computed
        data, labels = classify.load_training_data_from_disk(
            out_fn, names=['data', 'labels'])
    else:
        ws_tr = imio.read_image_stack('watershed-%i.lzf.h5' % index)
        pr_tr = imio.read_image_stack('probabilities-%i.lzf.h5' % index) / 255
        gt_tr = imio.read_image_stack('ground-truth-%i.lzf.h5' % index)
        g = agglo.Rag(ws_tr, pr_tr, feature_manager=fman)
        data, labels = g.learn_agglomerate(gt_tr, fman, min_num_epochs=4)[0][:2]
        classify.save_training_data_to_disk([data, labels], fn=out_fn,
                                            names=['data', 'labels'])
    print('total training data:', data.shape)
    print('size in MB:', data.size * data.itemsize / 1e6)
    rf = classify.DefaultRandomForest()
    rf.fit(data, labels[:, 0])
    policy = agglo.classifier_probability(fman, rf)
    return policy
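# A sketch of how the returned policy is typically used (following the gala
# README): build a RAG over a test volume with the learned policy as the
# merge-priority function, agglomerate to a threshold, and read out the
# segmentation. The function name and file names below are placeholders.
def segment(index, policy, threshold=0.5):
    ws_ts = imio.read_image_stack('watershed-%i.lzf.h5' % index)
    pr_ts = imio.read_image_stack('probabilities-%i.lzf.h5' % index) / 255
    g_ts = agglo.Rag(ws_ts, pr_ts, policy, feature_manager=fman)
    g_ts.agglomerate(threshold)
    return g_ts.get_segmentation()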
# IPython log file


import numpy as np

from gala import classify

datas = []
labels = []
for i in range(4):
    data, label = classify.load_training_data_from_disk('training-data-%i.h5' % i,
                                                        names=['data', 'labels'])
    datas.append(data)
    labels.append(label[:, 0])
    
X0 = np.concatenate(datas, axis=0)
y0 = np.concatenate(labels)
# runtime was 5 min for 3,000 samples; scaling linearly, expect ~2 h for 72,000
# and ~8 h for 280,000 (it actually took 10 h)
idx = np.random.choice(len(y0), size=280000, replace=False)
X, y = X0[idx], y0[idx]
param_grid = {'n_estimators': [20, 100, 200, 500],
              'max_depth': [3, 5, 20, None],
              'max_features': ['sqrt', 5, 10, 20],  # 'auto' (== 'sqrt') in older sklearn
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}
from time import time

from sklearn import ensemble
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search in old releases

rf = ensemble.RandomForestClassifier()
search = GridSearchCV(rf, param_grid=param_grid, refit=False,
                      verbose=2, n_jobs=12)
start = time(); search.fit(X, y); stop = time()
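# One way to inspect the finished search (a sketch; with refit=False there is
# no best_estimator_, but the scores and best parameters are still recorded):
print('search took %.1f hours' % ((stop - start) / 3600))
print('best CV accuracy:', search.best_score_)
print('best parameters:', search.best_params_)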
# IPython log file
# Run this in the NewEM data folder


import numpy as np

from gala import classify

X, y = classify.load_training_data_from_disk(
    'training-data-0.h5', names=['data', 'labels'])
y = y[:, 0]
train_idxs = np.random.randint(0, X.shape[0], size=10_000)
Xtr, ytr = X[train_idxs], y[train_idxs]
test_idxs = np.random.randint(0, X.shape[0], size=1000)
test_idxs = np.setdiff1d(test_idxs, train_idxs)
Xts, yts = X[test_idxs], y[test_idxs]
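# Note: np.random.randint samples with replacement, so train_idxs can contain
# duplicates and the setdiff1d above shrinks the test set below 1000 rows.
# A sketch of a disjoint, duplicate-free split from a single permutation
# (alt_* names are hypothetical; the log keeps the sampling above):
perm = np.random.permutation(X.shape[0])
alt_train_idxs, alt_test_idxs = perm[:10_000], perm[10_000:11_000]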
rf = classify.DefaultRandomForest()
rf.fit(Xtr, ytr)   # timed interactively: %timeit -n 1 -r 1 rf.fit(Xtr, ytr)
lg = classify.get_classifier('logist')
lg.fit(Xtr, ytr)   # timed interactively: %timeit -n 1 -r 1 lg.fit(Xtr, ytr)
# logistic regression trains ~20x faster than the random forest
lgacc = 1 - np.sum(lg.predict(Xts) != yts) / len(yts)
# 73%
rfacc = 1 - np.sum(rf.predict(Xts) != yts) / len(yts)
# 79.2%
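# The two accuracy computations above are equivalent to sklearn's helper;
# a quick cross-check (sketch):
from sklearn.metrics import accuracy_score
assert np.isclose(lgacc, accuracy_score(yts, lg.predict(Xts)))
assert np.isclose(rfacc, accuracy_score(yts, rf.predict(Xts)))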
# get_ipython().magic('timeit -r 1 -n 1 lg.predict(Xts)')
# get_ipython().magic('timeit -r 1 -n 1 rf.predict(Xts)')
# 20x faster prediction
# get_ipython().magic('timeit rf.predict(Xts[0:1])')
# get_ipython().magic('timeit lg.predict(Xts[0:1])')
# ~30x faster prediction on a single sample
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
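# The log ends here; presumably the scaler is meant to feed standardized
# features to the logistic-regression classifier. A minimal sketch of that
# combination with a scikit-learn Pipeline (assumed intent, not from the log):
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
scaled_lg = make_pipeline(StandardScaler(), LogisticRegression())
scaled_lg.fit(Xtr, ytr)
scaled_lg_acc = np.mean(scaled_lg.predict(Xts) == yts)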