def sklearn_digits( classifier=None ):
    """Fit a hyperopt estimator on the data returned by load_digits().

    A permutation of the sample indices is drawn after seeding NumPy's
    global generator with 0; the last 50 permuted samples are held out and
    the rest are used for fitting.  The predictions, the held-out labels,
    the result of score(pred, y_test) and the best model are printed.

    classifier -- search space handed to hyperopt_estimator; when None,
                  any_classifier('any') is used instead.
    """
    # -- alternative construction left by the original author:
    #    hyperopt_estimator( classifier=any_classifier('hai'), algo=tpe.suggest )
    if classifier is None:
        classifier = any_classifier('any')
    estim = hyperopt_estimator( classifier=classifier )

    digits = load_digits()
    data, labels = digits.data, digits.target

    n_holdout = 50
    np.random.seed(0)
    order = np.random.permutation(len(data))
    train_idx = order[:-n_holdout]
    test_idx = order[-n_holdout:]

    train_data, train_labels = data[train_idx], labels[train_idx]
    test_data, test_labels = data[test_idx], labels[test_idx]

    estim.fit( train_data, train_labels )
    predicted = estim.predict( test_data )

    print( predicted )
    print( test_labels )
    print( score( predicted, test_labels ) )
    print( estim.best_model() )
def test_fit_biginc(self):
    """fit() must stop at max_evals even when fit_increment is larger."""
    expected_trials = 5
    estimator = hyperopt_estimator(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        max_evals=expected_trials,
        trial_timeout=5.0,
        fit_increment=20,
    )
    estimator.fit(self.X, self.Y)

    # -- fit_increment (20) exceeds max_evals (5): still only 5 trials
    recorded = estimator.trials.trials
    assert len(recorded) == expected_trials
def test_fit(self):
    """fit() with max_evals=5 must record exactly five trials."""
    expected_trials = 5
    estimator = hyperopt_estimator(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        max_evals=expected_trials,
        trial_timeout=5.0,
    )
    estimator.fit(self.X, self.Y)

    recorded = estimator.trials.trials
    assert len(recorded) == expected_trials
def test_fit_iter_basic(self):
    """fit_iter() must yield the estimator's own trials object each step.

    At step number k (counting from 0) the yielded object is expected to
    hold exactly k trials; the loop is abandoned once step 10 was checked.
    """
    estimator = hyperopt_estimator(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        trial_timeout=5.0,
    )
    final_step = 10
    for step, yielded in enumerate(estimator.fit_iter(self.X, self.Y)):
        assert yielded is estimator.trials
        assert len(yielded.trials) == step
        if step == final_step:
            break
def setUp(self):
    """Store on self.algo a SklearnClassifier wrapping a hyperopt_estimator
    factory (TPE search, 100 evaluations, small-image pre-processing)."""
    estimator_factory = partial(
        hyperopt_estimator,
        preprocessing=simple_small_image_preprocessing('pp'),
        classifier=hpc.any_classifier('classif'),
        max_evals=100,
        verbose=1,
        algo=tpe.suggest,
        fit_timeout=5.0,  # -- seconds
    )
    self.algo = SklearnClassifier(estimator_factory)
def test_warm_start(self):
    """A warm-started fit() must append its trials to the earlier ones.

    Also checks that get_params() reports rand.suggest as the algo when
    none was passed, and that set_params() changes are visible through
    get_params().
    """
    estimator = hyperopt_estimator(
        classifier=components.any_classifier('classifier'),
        verbose=1,
        max_evals=5,
        trial_timeout=5.0,
    )

    # -- initial configuration
    settings = estimator.get_params()
    assert settings['algo'] == rand.suggest
    assert settings['max_evals'] == 5

    # -- cold fit: 5 trials
    estimator.fit(self.X, self.Y, warm_start=False)
    assert len(estimator.trials.trials) == 5

    # -- reconfigure the search
    estimator.set_params(algo=tpe.suggest, max_evals=10)
    settings = estimator.get_params()
    assert settings['algo'] == tpe.suggest
    assert settings['max_evals'] == 10

    # -- warm fit: the 10 new trials are added to the 5 old ones
    estimator.fit(self.X, self.Y, warm_start=True)
    assert len(estimator.trials.trials) == 15  # 5 + 10 = 15.
def test_preproc(self):
    """
    User story: a domain expert trusts one particular pre-processing to
    expose the important patterns in the data, and wants to know how good
    a classifier can get when it is built on top of that pre-processing.
    """
    # -- for the purpose of this test the RBM plays the role of the
    #    "domain-specific pre-processing"

    def cluster_count(label):
        # -- integer-valued draw from quniform(1, 5, q=1)
        return scope.int(hp.quniform(label, 1, 5, q=1))

    # -- VQ (alone)
    vq_only = [
        hpc.colkmeans('vq0', n_init=1),
    ]
    # -- VQ -> RBM
    vq_then_rbm = [
        hpc.colkmeans('vq1',
                      n_clusters=cluster_count('vq1.n_clusters'),
                      n_init=1),
        hpc.rbm(name='rbm:alone', verbose=0),
    ]
    # -- VQ -> RBM -> PCA
    vq_rbm_pca = [
        hpc.colkmeans('vq2',
                      n_clusters=cluster_count('vq2.n_clusters'),
                      n_init=1),
        hpc.rbm(name='rbm:pre-pca', verbose=0),
        hpc.pca('pca'),
    ]

    estimator_factory = partial(
        hyperopt_estimator,
        preprocessing=hp.choice('pp', [vq_only, vq_then_rbm, vq_rbm_pca]),
        classifier=hpc.any_classifier('classif'),
        algo=tpe.suggest,
        max_evals=10,
    )
    algo = SklearnClassifier(estimator_factory)

    mean_test_error = self.view.protocol(algo)
    print('mean test error:', mean_test_error)
def create_random_pipeline():
    """Draw one random Pipeline from the classifier/pre-processing spaces.

    A single sample is taken from a space made of any_classifier('my_clf')
    and any_preprocessing('my_prep').  When the sampled pre-processing has
    a first element, the result is a two-step Pipeline
    ('preprocessing', 'classifier'); otherwise the Pipeline contains the
    classifier only.

    Returns:
        The Pipeline built from the sample.
    """
    pipeline_space = {'clf': any_classifier('my_clf'),
                      'preprocessor': any_preprocessing('my_prep')}
    sample = hyperopt.pyll.stochastic.sample(pipeline_space)
    classifier = sample['clf']

    try:
        preprocessor = sample['preprocessor'][0]
        return Pipeline([('preprocessing', preprocessor),
                         ('classifier', classifier)])
    except Exception:
        # Best-effort fallback, kept from the original: any failure while
        # picking the pre-processor or assembling the two-step pipeline
        # (e.g. an empty pre-processing sample) yields a classifier-only
        # pipeline.  `Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return Pipeline([('classifier', classifier)])
def test_search_all(self):
    """
    User story: an ML researcher wants a quick, implicit way of doing model
    selection so as to obtain a baseline accuracy score on a new data set.
    """
    estimator_factory = partial(
        hyperopt_estimator,
        classifier=hpc.any_classifier('classifier'),
        # trial_timeout=15.0,  # seconds
        verbose=1,
        max_evals=10,
    )
    algo = LearningAlgo(estimator_factory)
    mean_test_error = self.view.protocol(algo)

    # -- the whole report goes to stderr
    report_to = sys.stderr
    print('\n====Iris: any preprocessing + any classifier====',
          file=report_to)
    print('mean test error:', mean_test_error, file=report_to)
    print('====End optimization====', file=report_to)
def mnist_digits():
    """Fit a hyperopt estimator on the 'MNIST original' data set.

    The data is obtained through fetch_mldata().  A permutation of the
    sample indices is drawn after seeding NumPy's global generator with 0;
    the last 20% of the permuted samples are held out and the rest are used
    for fitting.  The predictions, the held-out labels, the result of
    score(pred, y_test) and the best model are printed.
    """
    estim = hyperopt_estimator( classifier=any_classifier('hai') )

    digits = fetch_mldata('MNIST original')
    data, labels = digits.data, digits.target

    n_holdout = int( 0.2 * len( labels ) )
    np.random.seed(0)
    order = np.random.permutation(len(data))
    train_idx = order[:-n_holdout]
    test_idx = order[-n_holdout:]

    train_data, train_labels = data[train_idx], labels[train_idx]
    test_data, test_labels = data[test_idx], labels[test_idx]

    estim.fit( train_data, train_labels )
    predicted = estim.predict( test_data )

    print( predicted )
    print( test_labels )
    print( score( predicted, test_labels ) )
    print( estim.best_model() )
def test_preproc(self):
    """
    As a domain expert, I have a particular pre-processing that I believe
    reveals important patterns in my data.  I would like to know how good
    a classifier can be built on top of my preprocessing algorithm.

    The search chooses between three pre-processing chains (VQ alone,
    VQ -> RBM, VQ -> RBM -> PCA) followed by any classifier, runs 10 TPE
    evaluations and reports the mean test error on stderr.
    """
    # -- for testing purpose, suppose that the RBM is our "domain-specific
    #    pre-processing"

    def n_clusters(label):
        # -- integer-valued draw from quniform(1.5, 5.5, q=1)
        return scope.int(hp.quniform(label, 1.5, 5.5, q=1))

    def n_components(label):
        # -- integer-valued draw from qloguniform(log 4.5, log 20.5, q=1)
        return scope.int(
            hp.qloguniform(label, np.log(4.5), np.log(20.5), 1))

    # -- VQ (alone)
    vq_only = [
        hpc.colkmeans('vq0',
                      n_clusters=n_clusters('vq0.n_clusters'),
                      n_init=1,
                      max_iter=100),
    ]
    # -- VQ -> RBM
    vq_then_rbm = [
        hpc.colkmeans('vq1',
                      n_clusters=n_clusters('vq1.n_clusters'),
                      n_init=1,
                      max_iter=100),
        hpc.rbm(name='rbm:alone',
                n_components=n_components('rbm1.n_components'),
                n_iter=100,
                verbose=0),
    ]
    # -- VQ -> RBM -> PCA
    vq_rbm_pca = [
        hpc.colkmeans('vq2',
                      n_clusters=n_clusters('vq2.n_clusters'),
                      n_init=1,
                      max_iter=100),
        hpc.rbm(name='rbm:pre-pca',
                n_components=n_components('rbm2.n_components'),
                n_iter=100,
                verbose=0),
        hpc.pca('pca'),
    ]

    algo = LearningAlgo(
        partial(
            hyperopt_estimator,
            preprocessing=hp.choice('pp',
                                    [vq_only, vq_then_rbm, vq_rbm_pca]),
            classifier=hpc.any_classifier('classif'),
            algo=tpe.suggest,
            # trial_timeout=5.0,  # seconds
            verbose=1,
            max_evals=10,
        ))
    mean_test_error = self.view.protocol(algo)

    print('\n====Iris: VQ + RBM + PCA + any classifier====',
          file=sys.stderr)
    print('mean test error:', mean_test_error, file=sys.stderr)
    # -- closing banner goes to stderr like the rest of the report (it
    #    used to go to stdout, splitting the report over two streams)
    print('====End optimization====', file=sys.stderr)
'loss': -1 * np.mean(scores), 'status': STATUS_OK, 'training_time': training_time, 'total_time': total_time } except: total_time = time.time() - start_time return { 'loss': np.inf, 'status': STATUS_OK, 'training_time': 0, 'total_time': total_time } pipeline_space = { 'clf': any_classifier('my_clf'), 'preprocessor': any_preprocessing('my_prep') } print(pipeline_space) trials = Trials() best = fmin(objective, space=pipeline_space, algo=tpe.suggest, max_evals=200, trials=trials) print(trials.best_trial) pickle.dump(trials, open("/tmp/trials.p", "wb"))
def main():
    """Search for a classifier on the Digits problem and evaluate the winner.

    Runs 20 TPE evaluations of dig.hyperopt_sklearn_wrapper over the
    any_classifier('hai') space, prints the best point found together with
    dig.best_score, then calls dig.test_model() with that point.
    """
    dig = Digits()
    # dig = Digits( use_mnist=True )
    # dig.run( test_size=20 )

    # NOTE: earlier experiments with hand-written search spaces for
    # dig.hyperopt_wrapper (LinearSVC C/loss, SVC C/kernel, svc_linear) were
    # kept here as string-quoted dead code; they have been removed.

    from hpsklearn.components import svc_linear, sklearn_SVC, any_classifier

    # Single-argument print(...) behaves the same as the Python 2 print
    # statement and is also valid Python 3.
    print(sklearn_SVC())

    best = fmin( fn=dig.hyperopt_sklearn_wrapper,
                 space=any_classifier('hai'),
                 algo=tpe.suggest,
                 max_evals=20 )

    print(best)
    print("Precision Score: %s" % dig.best_score)

    dig.test_model( classifier="SVC", **best )
def main(data='newsgroups', algo='tpe', seed=1, evals=100, clf='any',
         loss=None, pre='any', text=''):
    """Run one hyperopt-sklearn experiment selected by name.

    Parameters
    ----------
    data : 'newsgroups', 'convex', 'mnist' or 'digits'; picks the runner.
    algo : 'tpe', 'anneal', 'rand', 'tree' or 'gp_tree'; picks the suggest
        function.
    seed : passed to the runner and embedded in the file name.
    evals : passed to the runner as max_evals and embedded in the file name.
    clf : 'any', or the name of a factory in hpsklearn.components.
    loss : None, or the name of an attribute of `metrics`.
    pre : 'any', 'none', or the name of a factory in hpsklearn.components.
    text : prefix of the file name handed to the runner.

    Returns
    -------
    1 when an argument is not recognised or the convex data set is not
    available; None after the runner has been called.
    """
    filename = text + '_'.join(
        [algo, clf, pre, str(seed), str(evals), data])

    # -- loss metric, looked up by name
    if loss is not None:
        if not hasattr(metrics, loss):
            print('Unknown loss metric specified')
            return 1
        loss = getattr(metrics, loss)

    # -- search algorithm
    if algo == 'tpe':
        algorithm = tpe.suggest
    elif algo == 'anneal':
        algorithm = anneal.suggest
    elif algo == 'rand':
        algorithm = rand.suggest
    elif algo == 'tree':
        algorithm = hypertree.tree.suggest
    elif algo == 'gp_tree':
        algorithm = hypertree.gp_tree.suggest
    else:
        print('Unknown algorithm specified')
        return 1

    # -- the newsgroups data gets the sparse / text variants of the spaces
    sparse = data in ['newsgroups']

    # -- classifier search space
    # TODO: impose restrictions on classifiers that do not work on sparse data
    if clf == 'any':
        if sparse:
            classifier = any_sparse_classifier('clf')
        else:
            classifier = any_classifier('clf')
    elif clf == 'knn':
        if sparse:
            classifier = knn('clf', sparse_data=True)
        else:
            classifier = knn('clf')
    elif clf == 'nearest_centroid':
        if sparse:
            classifier = nearest_centroid('clf', sparse_data=True)
        else:
            classifier = nearest_centroid('clf')
    elif hasattr(hpsklearn.components, clf):
        classifier = getattr(hpsklearn.components, clf)('clf')
    else:
        print('Unknown classifier specified')
        return 1

    # -- pre-processing search space
    if pre == 'any':
        if sparse:
            preproc = any_text_preprocessing('pre')
        else:
            preproc = any_preprocessing('pre')
    elif pre == 'none':
        preproc = []
    elif hasattr(hpsklearn.components, pre):
        preproc = [getattr(hpsklearn.components, pre)('pre')]
    else:
        print('Unknown preprocessing specified')
        return 1

    # -- data set runner
    if data == 'newsgroups':
        runner = sklearn_newsgroups
    elif data == 'convex':
        if not CONVEX_EXISTS:
            print("Convex dataset not detected on your system, install from MLPython")
            return 1
        runner = sklearn_convex
    elif data == 'mnist':
        runner = sklearn_mnist
    elif data == 'digits':
        runner = sklearn_digits
    else:
        print("Unknown dataset specified")
        # -- report failure like every other unrecognised argument does
        return 1

    runner(classifier=classifier, algorithm=algorithm, max_evals=evals,
           seed=seed, filename=filename, preproc=preproc, loss=loss)
def main( data='newsgroups', algo='tpe', seed=1, evals=100, clf='any',
          loss=None, pre='any', text='' ):
    """Run one hyperopt-sklearn experiment selected by name.

    Parameters
    ----------
    data : 'newsgroups', 'convex', 'mnist' or 'digits'; picks the runner.
    algo : 'tpe', 'anneal', 'rand', 'tree' or 'gp_tree'; picks the suggest
        function.
    seed : passed to the runner and embedded in the file name.
    evals : passed to the runner as max_evals and embedded in the file name.
    clf : 'any', or the name of a factory in hpsklearn.components.
    loss : None, or the name of an attribute of `metrics`.
    pre : 'any', 'none', or the name of a factory in hpsklearn.components.
    text : prefix of the file name handed to the runner.

    Returns
    -------
    1 when an argument is not recognised or the convex data set is not
    available; None after the runner has been called.
    """
    filename = text + '_'.join(
        [algo, clf, pre, str(seed), str(evals), data] )

    # -- loss metric, looked up by name
    if loss is not None:
        if not hasattr( metrics, loss ):
            print( 'Unknown loss metric specified' )
            return 1
        loss = getattr( metrics, loss )

    # -- search algorithm
    if algo == 'tpe':
        algorithm = tpe.suggest
    elif algo == 'anneal':
        algorithm = anneal.suggest
    elif algo == 'rand':
        algorithm = rand.suggest
    elif algo == 'tree':
        algorithm = hypertree.tree.suggest
    elif algo == 'gp_tree':
        algorithm = hypertree.gp_tree.suggest
    else:
        print( 'Unknown algorithm specified' )
        return 1

    # -- the newsgroups data gets the sparse / text variants of the spaces
    sparse = data in ['newsgroups']

    # -- classifier search space
    # TODO: impose restrictions on classifiers that do not work on sparse data
    if clf == 'any':
        if sparse:
            classifier = any_sparse_classifier('clf')
        else:
            classifier = any_classifier('clf')
    elif clf == 'knn':
        if sparse:
            classifier = knn('clf', sparse_data=True)
        else:
            classifier = knn('clf')
    elif clf == 'nearest_centroid':
        if sparse:
            classifier = nearest_centroid('clf', sparse_data=True)
        else:
            classifier = nearest_centroid('clf')
    elif hasattr( hpsklearn.components, clf ):
        classifier = getattr( hpsklearn.components, clf )( 'clf' )
    else:
        print( 'Unknown classifier specified' )
        return 1

    # -- pre-processing search space
    if pre == 'any':
        if sparse:
            preproc = any_text_preprocessing('pre')
        else:
            preproc = any_preprocessing('pre')
    elif pre == 'none':
        preproc = []
    elif hasattr( hpsklearn.components, pre ):
        preproc = [getattr( hpsklearn.components, pre )( 'pre' )]
    else:
        print( 'Unknown preprocessing specified' )
        return 1

    # -- data set runner
    if data == 'newsgroups':
        runner = sklearn_newsgroups
    elif data == 'convex':
        if not CONVEX_EXISTS:
            print("Convex dataset not detected on your system, install from MLPython")
            return 1
        runner = sklearn_convex
    elif data == 'mnist':
        runner = sklearn_mnist
    elif data == 'digits':
        runner = sklearn_digits
    else:
        print( "Unknown dataset specified" )
        # -- report failure like every other unrecognised argument does
        return 1

    runner( classifier=classifier, algorithm=algorithm, max_evals=evals,
            seed=seed, filename=filename, preproc=preproc, loss=loss )