def testFunction(data):
    """Re-run an OpenML flow on task 55 and print its averaged accuracy.

    Prints the flow id of run 1836360, converts flow 8900 into a
    scikit-learn estimator, places it behind a ``SimpleImputer`` in a
    ``Pipeline``, converts that pipeline back into a flow, runs it on
    task 55 and prints the summed ``predictive_accuracy`` values of the
    first entry of ``run.fold_evaluations`` divided by 10.

    ``data`` is not used by the active code; it is only referenced by a
    disabled ``data.get_data(...)`` experiment.
    """
    # Disabled experiments kept for reference:
    #   clf = sklearn.ensemble.forest.RandomForestClassifier(...) with
    #       estimators:10, criterion:"gini", state:6826
    #   X, y, features = data.get_data(target=data.default_target_attribute,
    #                                  return_attribute_names=True)
    reference_run = oml.runs.get_run(1836360)
    print(reference_run.flow_id)

    # Other flow ids tried previously: 4834, 8426, 7650.
    estimator = oml.flows.flow_to_sklearn(oml.flows.get_flow(8900))

    pipeline_steps = [
        ('imputer', impute.SimpleImputer()),
        ('estimator', estimator),
    ]
    clf = pipeline.Pipeline(steps=pipeline_steps)
    flow = flows.sklearn_to_flow(clf)
    print(flow.model)

    task = tasks.get_task(55)
    task_run = runs.run_flow_on_task(task, flow, avoid_duplicate_runs=True)

    # NOTE(review): index 0 is presumably the first repeat -- confirm.
    fold_accuracies = dict(task_run.fold_evaluations['predictive_accuracy'][0])
    # NOTE(review): the divisor 10 is hard-coded; presumably the task uses
    # 10 folds -- confirm before reusing this with another task.
    print(sum(fold_accuracies.values()) / 10)
def challenge():
    """Download task 14951, run a learner on it and publish the results.

    The learner is a 5-nearest-neighbours classifier using brute-force
    search with the cosine metric. When publishing returns code 200, the
    id of the uploaded run is printed; otherwise nothing is printed.

    Original author's note: use dev openml to run.
    """
    task = tasks.get_task(14951)

    # Classifiers tried previously (kept for reference):
    #   BaggingClassifier(SVC(), n_estimators=128)
    #   RandomForestClassifier(n_estimators=128,
    #                          class_weight='balanced_subsample')
    #   BaggingClassifier(ExtraTreeClassifier(), n_estimators=20)
    #   GridSearchCV(RandomForestClassifier(n_estimators=20)) over
    #       max_depth and min_samples_split = np.linspace(1, 15, num=15),
    #       class_weight in ['balanced', 'balanced_subsample', None],
    #       criterion in ['gini', 'entropy'];
    #       scoring='roc_auc', cv=10, pre_dispatch='2*n_jobs', n_jobs=4
    #   GridSearchCV(SVC(probability=True)) over
    #       gamma = np.logspace(-5, 5, num=grid_den, base=2.0) (C disabled);
    #       scoring='roc_auc', cv=10, pre_dispatch='2*n_jobs', n_jobs=4
    #       (author's note: grid_den = 20, time needed = 13.36s)
    clf = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')

    run = runs.run_task(task, clf)
    return_code, response = run.publish()
    if return_code != 200:
        return

    # get the run id for reference
    upload_info = xmltodict.parse(response)['oml:upload_run']
    run_id = upload_info['oml:run_id']
    print("Uploaded run with id %s. Check it at www.openml.org/r/%s" % (run_id, run_id))
def challenge():
    """Download task 14951, run a learner on it and publish the results.

    The learner is a 5-nearest-neighbours classifier using brute-force
    search with the cosine metric. When publishing returns code 200, the
    id of the uploaded run is printed; otherwise nothing is printed.

    Original author's note: use dev openml to run.
    """
    task = tasks.get_task(14951)

    # Classifiers tried previously (kept for reference):
    #   BaggingClassifier(SVC(), n_estimators=128)
    #   RandomForestClassifier(n_estimators=128,
    #                          class_weight='balanced_subsample')
    #   BaggingClassifier(ExtraTreeClassifier(), n_estimators=20)
    #   GridSearchCV(RandomForestClassifier(n_estimators=20)) over
    #       max_depth and min_samples_split = np.linspace(1, 15, num=15),
    #       class_weight in ['balanced', 'balanced_subsample', None],
    #       criterion in ['gini', 'entropy'];
    #       scoring='roc_auc', cv=10, pre_dispatch='2*n_jobs', n_jobs=4
    #   GridSearchCV(SVC(probability=True)) over
    #       gamma = np.logspace(-5, 5, num=grid_den, base=2.0) (C disabled);
    #       scoring='roc_auc', cv=10, pre_dispatch='2*n_jobs', n_jobs=4
    #       (author's note: grid_den = 20, time needed = 13.36s)
    knn_settings = {'n_neighbors': 5, 'algorithm': 'brute', 'metric': 'cosine'}
    clf = KNeighborsClassifier(**knn_settings)

    run = runs.run_task(task, clf)
    return_code, response = run.publish()
    if return_code != 200:
        return

    # get the run id for reference
    parsed_response = xmltodict.parse(response)
    run_id = parsed_response['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s. Check it at www.openml.org/r/%s" % (run_id, run_id))
def get_task(self):
    """Return the result of ``tasks.get_task`` for this object's ``tid``."""
    return tasks.get_task(self.tid)
def test_class_labels(self):
    """Check that the task's class labels are the two expected names."""
    expected_labels = ['tested_negative', 'tested_positive']
    fetched_task = get_task(self.task_id)
    self.assertEqual(fetched_task.class_labels, expected_labels)
from openml import tasks, runs
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xmltodict
from sklearn import ensemble

# Script: download task 14951, run a random forest on it and publish the run.
task = tasks.get_task(14951)

# Classifiers tried previously (kept for reference):
#   ensemble.RandomForestClassifier()
#   AdaBoostClassifier(algorithm="SAMME.R", n_estimators=700)
#   AdaBoostClassifier(algorithm="SAMME", n_estimators=5000)
forest_params = dict(
    warm_start=True,
    n_estimators=128,
    criterion="entropy",
    min_samples_split=20,
    bootstrap=True,
    random_state=123,
)
clf = RandomForestClassifier(**forest_params)

run = runs.run_task(task, clf)
return_code, response = run.publish()

# get the run id for reference; nothing is printed for any other return code
if return_code == 200:
    response_dict = xmltodict.parse(response)
    run_id = response_dict["oml:upload_run"]["oml:run_id"]
    print("Uploaded run with id %s. Check it at www.openml.org/r/%s" % (run_id, run_id))
def test_download_task(self):
    """Fetch the task for ``self.task_id`` via ``get_task`` and return it."""
    downloaded_task = get_task(self.task_id)
    return downloaded_task
if(mean_mutual_information == 0): features["NoiseToSignalRatio"] = 0 features["NoiseToSignalRatio"] = (mean_feature_entropy - mean_mutual_information) / mean_mutual_information features["InformationFeatureTime"] = sw.duration return features if __name__ == "__main__": utils.log("Running tests - Importing...") from openml import datasets, tasks # Take 59 is for dataset 61, the iris dataset, which is good for numerical tests, # Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information. task = tasks.get_task(60) data = task.get_dataset() X, y, categorical = data.get_data(target = data.default_target_attribute, return_categorical_indicator = True) # We want to do cross-validation for some landmarkers, so we take a cv-10 fold. # We need to unroll the generator into a list because it is iterated over multiple times. folds = list(next(task.iterate_repeats())) simple = simple_metafeatures(X, y, categorical) stats = statistical_metafeatures(X, y, categorical) info = information_theoretic_metafeatures(X, y, categorical) landmarkers = landmarker_metafeatures(X, y, categorical, folds) for key, val in simple.items(): print("{}: {}".format(key, val))
def test_get_X_and_Y(self) -> Tuple[np.ndarray, np.ndarray]:
    """Fetch the task for ``self.task_id`` and return its ``(X, Y)`` pair.

    The pair comes from the task's ``get_X_and_y()`` and is unpacked into
    exactly two values before being returned.
    """
    features, targets = get_task(self.task_id).get_X_and_y()
    return features, targets
features["NoiseToSignalRatio"] = ( mean_feature_entropy - mean_mutual_information) / mean_mutual_information features["InformationFeatureTime"] = sw.duration return features if __name__ == "__main__": utils.log("Running tests - Importing...") from openml import datasets, tasks # Take 59 is for dataset 61, the iris dataset, which is good for numerical tests, # Task 60 is for dataset 62, a zoo dataset, which contains a lot of categorical information. task = tasks.get_task(60) data = task.get_dataset() X, y, categorical = data.get_data(target=data.default_target_attribute, return_categorical_indicator=True) # We want to do cross-validation for some landmarkers, so we take a cv-10 fold. # We need to unroll the generator into a list because it is iterated over multiple times. folds = list(next(task.iterate_repeats())) simple = simple_metafeatures(X, y, categorical) stats = statistical_metafeatures(X, y, categorical) info = information_theoretic_metafeatures(X, y, categorical) landmarkers = landmarker_metafeatures(X, y, categorical, folds) for key, val in simple.items(): print("{}: {}".format(key, val))
def retrieveTaskId(task):
    """Map ``task`` to an id with ``getTaskId`` and fetch it via ``tasks.get_task``.

    Despite the name, the value returned is whatever ``tasks.get_task``
    returns for that id, not the id itself.
    """
    task_id = getTaskId(task)
    return tasks.get_task(task_id)
information_names = ",".join(mf.information_theoretic_metafeature_names()) landmarking_names = ",".join(landmarking_metafeature_names()) subsample_names = ",".join(subsample_metafeature_names()) learner_names = ",".join([baselearner.__name__ for baselearner in config.base_learners]) log(learner_names) column_names = "{},{},{},{},{},{},{}\n".format("did,subsize", simple_names, statistical_names, information_names, landmarking_names, subsample_names, learner_names) fh.write(column_names) # Then for each dataset (and every desired subset of it), perform landmarking, # and record execution time. for task_id in config.test_task_ids: if task_id in config.excluded_tasks.keys(): continue log("Getting task {}".format(task_id)) task = tasks.get_task(task_id) did = task.dataset_id log("Loading dataset {}".format(did)) try: dataset = task.get_dataset() # Impute the values - While values would be imputed when calculating some meta-features anyway, this gives more control. X, y, categorical = dataset.get_data(target = task.target_feature, return_categorical_indicator = True) #X, categorical = remove_zero_columns(impute_values(X, categorical), categorical) # Subsample landmarker need folds, the train+test set of subsample landmarkers should be 500 instances, # since that is the size of our smallest dataset. # We first create a fold for 500 stratified samples, and then again divide that selection to 10 folds. max_size = 500 number_of_classes = len(np.unique(y))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 9 14:06:01 2017

@author: joost
"""
# Imports are grouped at the top (standard library first, then third-party)
# so that no executable statement runs between them.
import itertools

import openml as oml
import xgboost as xgb
import xmltodict
from openml import tasks, runs
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import GenericUnivariateSelect
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
# NOTE(review): `Imputer` is presumably unavailable in newer scikit-learn
# releases (replaced by sklearn.impute.SimpleImputer) -- confirm against the
# scikit-learn version this script targets.
from sklearn.preprocessing import StandardScaler, Imputer, PolynomialFeatures, MinMaxScaler
from sklearn.svm import SVC

# Fetch task 145677 and unpack what its get_X_and_y() returns into X and y.
task = tasks.get_task(145677)
X, y = task.get_X_and_y()