def exercise_2():
    """Plot the cross-validated error of a random forest against its size.

    Downloads OpenML dataset 44 and, for every forest size in
    1, 2, 4, ..., 128, fits a ``RandomForestClassifier`` on each of the ten
    training folds, scores the matching held-out fold with ``zero_one_loss``
    and records the mean loss over the folds. The mean losses are then
    plotted against the number of trees and shown with ``plt.show()``.

    Returns nothing.
    """
    # connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    # The attribute names are returned as well but are not needed here.
    X, y, _ = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.KFold(len(X), n_folds=10, shuffle=False,
                                random_state=0)
    lst = [2 ** i for i in range(8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto",
                                 random_state=0)

    error = []
    for n_trees in lst:
        # The forest size is the same for every fold, so set it once per
        # size instead of once per fold.
        clf.set_params(n_estimators=n_trees)
        fold_losses = []
        for train_index, test_index in kf:
            clf.fit(X[train_index], y[train_index])
            fold_losses.append(
                zero_one_loss(y[test_index], clf.predict(X[test_index])))
        error.append(np.mean(fold_losses))

    # plot
    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.show()
def exercise_3():
    """Compare the ROC AUC of a random forest and a single decision tree.

    Downloads OpenML dataset 44, draws ten random 90%/10% train/test splits
    and fits a 100-tree ``RandomForestClassifier`` and a
    ``DecisionTreeClassifier`` on each training part. Both models are scored
    on the test part with ``roc_auc_score`` and the mean score over the ten
    splits is printed for each model.

    Returns nothing.
    """
    # connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, _ = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    clf = RandomForestClassifier(n_estimators=100, oob_score=True,
                                 max_features="auto", random_state=0)
    clf_cart = DecisionTreeClassifier()

    auc_forest = []
    auc_cart = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)
        # roc_auc_score ranks samples by a continuous score; feeding it the
        # thresholded output of predict() collapses the ROC curve to a
        # single operating point. Use the probability of the second class.
        # NOTE(review): assumes a binary target, so that column 1 of
        # predict_proba is the positive class - confirm for dataset 44.
        auc_forest.append(
            roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
        auc_cart.append(
            roc_auc_score(y_test, clf_cart.predict_proba(X_test)[:, 1]))

    # The reported quantity is an AUC (higher is better), not an error, so
    # the labels say so. Single-argument print works on Python 2 and 3.
    print('AUC RandomForest: %s' % np.mean(auc_forest))
    print('AUC CART: %s' % np.mean(auc_cart))
def test_get_cached_datasets(self):
    """The ``files`` directory next to this module caches two datasets.

    Builds a connector on that directory and checks that
    ``get_cached_datasets`` returns a dict with exactly two entries whose
    first value is an ``OpenMLDataset``.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    connector = APIConnector(cache_directory=os.path.join(here, "files"))

    cached = connector.get_cached_datasets()

    self.assertIsInstance(cached, dict)
    self.assertEqual(len(cached), 2)
    first_dataset = list(cached.values())[0]
    self.assertIsInstance(first_dataset, OpenMLDataset)
def exercise():
    """Grid-search the ``gamma`` of an RBF-kernel SVM on OpenML dataset 59.

    Ten ``gamma`` values, log-spaced between 2**-15 and 2**15, are evaluated
    with 10-fold cross-validation using the ``roc_auc`` scorer. The best
    score and parameters are printed, followed by the mean validation score
    per grid point, the grid itself, the grid positions and the grid values
    as a plain list.

    Returns nothing.
    """
    apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
    connector = APIConnector(apikey=apikey)

    # loading data
    dataset = connector.download_dataset(59)
    X, y, _ = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    clf = svm.SVC(kernel='rbf')

    r = np.logspace(-15, 15, 10, base=2)
    param_dist = {'gamma': r}
    rand = GridSearchCV(clf, param_dist, cv=10, scoring="roc_auc")
    rand.fit(X, y)
    rand_mean_scores = [result.mean_validation_score
                        for result in rand.grid_scores_]

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(rand.best_score_)
    print(rand.best_params_)

    plt.style.use('ggplot')

    # One position and one label per grid point. The previous loop ran over
    # range(11) and read r[i - 1], which produced eleven labels for ten grid
    # points and put the largest gamma first (r[-1]).
    x_labels = list(range(len(r)))
    gammapar1 = list(r)

    print(rand_mean_scores)
    print(r)
    print(x_labels)
    print(gammapar1)
def load_data(dataset_id):
    """Download an OpenML dataset and return its features, target and names.

    The API key is the first line of ``~/openml/apikey.txt`` and downloads
    are cached under ``~/openml/cache``. The number of samples is printed
    before returning.

    Parameters
    ----------
    dataset_id
        Identifier passed to ``APIConnector.download_dataset``.

    Returns
    -------
    tuple
        ``(X, y, attribute_names)`` as returned by ``dataset.get_dataset``
        with the dataset's default target attribute.
    """
    # openml connection
    openml_dir = os.path.join(os.path.expanduser("~"), "openml")
    key_path = os.path.join(openml_dir, "apikey.txt")
    with open(key_path, 'r') as key_file:
        key = key_file.readline().rstrip('\n')
    openml = APIConnector(cache_directory=os.path.join(openml_dir, "cache"),
                          apikey=key)
    dataset = openml.download_dataset(dataset_id)

    # split the dataset into features, target and attribute names
    target_name = dataset.default_target_attribute
    features, target, names = dataset.get_dataset(
        target=target_name, return_attribute_names=True)
    print("no. of samples :" + str(len(features)))
    return (features, target, names)
def test_get_cached_dataset(self):
    """Dataset 2 can be loaded from the ``files`` cache next to this module.

    ``APIConnector._perform_api_call`` is patched for the duration of the
    test so that no request leaves the machine; the patched call returns
    status 400 together with a canned authentication response.
    """
    workdir = os.path.dirname(os.path.abspath(__file__))
    workdir = os.path.join(workdir, "files")

    with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
        api_mock.return_value = 400, \
            """<oml:authenticate xmlns:oml = "http://openml.org/openml">
            <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
            <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
            <oml:timezone>Europe/Berlin</oml:timezone>
            </oml:authenticate>"""

        connector = APIConnector(cache_directory=workdir)
        dataset = connector.get_cached_dataset(2)
        self.assertIsInstance(dataset, OpenMLDataset)
        # The previous check called ``is_called_once()``, which is not part
        # of the Mock API: the attribute is created on the fly and returns a
        # truthy child mock, so the assertion could never fail.
        # NOTE(review): whether the cached path performs exactly one API
        # call cannot be told from this file, so only the upper bound is
        # enforced; tighten to assertEqual(..., 1) once confirmed.
        self.assertLessEqual(api_mock.call_count, 1)
def setUp(self):
    """Create a fresh ``tmp`` working directory and a connector using it.

    The directory lives next to this module; any leftover from a previous
    run is removed first, and the process changes into it. The API key is
    read from the ``OPENMLAPIKEY`` environment variable and may be absent
    locally.

    Raises
    ------
    Exception
        If the ``TRAVIS`` environment variable is set but ``OPENMLAPIKEY``
        is not.
    """
    self.cwd = os.getcwd()
    workdir = os.path.dirname(os.path.abspath(__file__))
    self.workdir = os.path.join(workdir, "tmp")
    # Best effort: the directory usually does not exist yet.
    shutil.rmtree(self.workdir, ignore_errors=True)

    os.mkdir(self.workdir)
    os.chdir(self.workdir)

    self.cached = True
    apikey = os.environ.get('OPENMLAPIKEY')

    # This check used to sit inside a try block whose bare ``except: pass``
    # also swallowed the exception raised here, so a missing key on
    # travis-ci was never reported.
    if 'TRAVIS' in os.environ and apikey is None:
        raise Exception('Running on travis-ci, but no environment '
                        'variable OPENMLAPIKEY found.')

    self.connector = APIConnector(cache_directory=self.workdir,
                                  apikey=apikey)
def get_dataset(did):
    """Download an OpenML dataset and return its data and target metadata.

    The API key is the first line of ``~/.openml/apikey.txt`` and downloads
    are cached under ``~/.openml/cache``.

    Parameters
    ----------
    did
        Identifier passed to ``APIConnector.download_dataset``.

    Returns
    -------
    tuple
        ``(X, y, attribute_names, target_attribute_names)``. The first three
        come from ``dataset.get_dataset`` with the default target attribute;
        the last is element 1 of the ARFF metadata entry of that attribute.
    """
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, ".openml")
    cache_dir = os.path.join(openml_dir, "cache")

    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().rstrip('\n')

    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(did)

    # Only the metadata of the ARFF file is used; the parsed rows are not.
    _, meta = loadarff(dataset.data_file)

    target_attribute = dataset.default_target_attribute
    target_attribute_names = meta[target_attribute][1]
    X, y, attribute_names = dataset.get_dataset(
        target=target_attribute, return_attribute_names=True)

    return X, y, attribute_names, target_attribute_names
def variance_exercise3():
    """Plot the variance of random-forest predictions against forest size.

    For every forest size in 1, 2, 4, ..., 128 the classifier is fitted on
    ten random 90%/10% splits of OpenML dataset 44. The test predictions of
    all ten splits are stacked into one array and its overall variance is
    recorded. The variances are plotted against the number of trees and
    shown with ``plt.show()``.

    Returns nothing.
    """
    connector = APIConnector(apikey='ca2397ea8a2cdd9707ef39d76576e786')
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    forest_sizes = [2 ** exponent for exponent in range(8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto",
                                 random_state=0)

    total_variance = []
    for n_trees in forest_sizes:
        clf.set_params(n_estimators=n_trees)
        predictions_per_split = []
        for train_index, test_index in kf:
            clf.fit(X[train_index], y[train_index])
            predictions_per_split.append(clf.predict(X[test_index]))
        # Variance over every prediction of every split for this size.
        total_variance.append(np.array(predictions_per_split).var())

    plt.style.use('ggplot')
    plt.plot(forest_sizes, total_variance, '#009999', marker='o')
    plt.xticks(forest_sizes)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Variance')
    plt.show()
def bias_exercise3():
    """Plot the mean squared prediction error of a forest against its size.

    For every forest size in 1, 2, 4, ..., 128 the classifier is fitted on
    ten random 90%/10% splits of OpenML dataset 44. The squared differences
    between true and predicted test labels are pooled over the ten splits
    and averaged. The averages are plotted against the number of trees under
    the axis label 'Bias Squared' and shown with ``plt.show()``.

    Returns nothing.
    """
    # connect to openml api
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, _ = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    lst = [2 ** i for i in range(8)]
    clf = RandomForestClassifier(oob_score=True, max_features="auto",
                                 random_state=0)

    error = []
    for n_trees in lst:
        clf.set_params(n_estimators=n_trees)
        squared_errors = []
        for train_index, test_index in kf:
            clf.fit(X[train_index], y[train_index])
            # One array expression per split replaces the element-wise loop,
            # which also reused the outer loop variable as its index.
            residuals = y[test_index] - clf.predict(X[test_index])
            squared_errors.extend(residuals * residuals)
        # Mean over the pooled per-sample squared errors of all splits.
        error.append(np.array(squared_errors).mean())

    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Bias Squared')
    plt.show()
def exercise_1():
    """Scatter-plot the out-of-bag error of a random forest against its size.

    Fits a ``RandomForestClassifier`` on all of OpenML dataset 44 once per
    forest size in 1, 2, 4, ..., 128 and records ``1 - oob_score_`` each
    time. The result is shown with ``plt.show()``.

    Returns nothing.
    """
    # connect to openml api
    connector = APIConnector(apikey='ca2397ea8a2cdd9707ef39d76576e786')
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    forest_sizes = [2 ** exponent for exponent in range(8)]

    # one classifier, re-fitted for every forest size
    clf = RandomForestClassifier(oob_score=True, max_features="auto",
                                 random_state=0)

    def oob_error(n_trees):
        # Refit with the requested number of trees and report the OOB error.
        clf.set_params(n_estimators=n_trees)
        clf.fit(X, y)
        return 1 - clf.oob_score_

    oob_errors = [oob_error(n_trees) for n_trees in forest_sizes]

    # plot
    plt.style.use('ggplot')
    plt.scatter(forest_sizes, oob_errors)
    plt.xticks(forest_sizes)
    plt.show()
def test_get_chached_dataset_description(self):
    """The cached description of dataset 2 is returned as a dict.

    Uses the ``files`` directory next to this module as the cache.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    cache_dir = os.path.join(module_dir, "files")
    connector = APIConnector(cache_directory=cache_dir)

    self.assertIsInstance(connector._get_cached_dataset_description(2), dict)
tree.export_graphviz(clf, out_file=f,feature_names=feature_names, class_names=class_names, filled=True, rounded=True, special_characters=True) command = ["dot", "-Tpng", "dt.dot", "-o", figure_name+".png"] try: subprocess.check_call(command) except: exit("Could not run dot, ie graphviz, to " "produce visualization") #openml connection home_dir = os.path.expanduser("~") openml_dir = os.path.join(home_dir, "openml") cache_dir = os.path.join(openml_dir, "cache") with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh: key = fh.readline().rstrip('\n') openml = APIConnector(cache_directory=cache_dir, apikey=key) dataset = openml.download_dataset(10) dataset = openml.download_dataset(10) # load data into panda dataframe X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute, return_attribute_names=True) lymph = pd.DataFrame(X, columns=attribute_names) lymph['class'] = y print(len(lymph)) # histogram of class variable n, bins, patches = plt.hist(lymph['class'], facecolor='green') plt.xlabel('class')
def load(dataset_id):
    """Download an OpenML dataset and return it split by its default target.

    A progress line is printed first. The connector is built with the
    module-level ``apikey``.

    Parameters
    ----------
    dataset_id
        Identifier passed to ``APIConnector.download_dataset``; it is
        formatted with ``%d`` in the progress line.

    Returns
    -------
    Whatever ``dataset.get_dataset`` returns when called with the default
    target attribute and ``return_attribute_names=True``.
    """
    print('Loadding data_id %d' % (dataset_id))
    dataset = APIConnector(apikey=apikey).download_dataset(dataset_id)
    target_name = dataset.default_target_attribute
    return dataset.get_dataset(target=target_name,
                               return_attribute_names=True)
class TestAPIConnector(unittest.TestCase):
    """Test the APIConnector

    Note
    ----
    The connector used by the remote tests is created with the API key found
    in the ``OPENMLAPIKEY`` environment variable. The key may be missing
    locally, but ``setUp`` refuses to run without it when the ``TRAVIS``
    environment variable is set.
    """

    def setUp(self):
        """Create a fresh ``tmp`` working directory and a connector on it."""
        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        self.workdir = os.path.join(workdir, "tmp")
        # Best effort: the directory usually does not exist yet.
        shutil.rmtree(self.workdir, ignore_errors=True)

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        apikey = os.environ.get('OPENMLAPIKEY')

        # This check used to sit inside a try block whose bare
        # ``except: pass`` also swallowed the exception raised here, so a
        # missing key on travis-ci was never reported.
        if 'TRAVIS' in os.environ and apikey is None:
            raise Exception('Running on travis-ci, but no environment '
                            'variable OPENMLAPIKEY found.')

        self.connector = APIConnector(cache_directory=self.workdir,
                                      apikey=apikey)

    def tearDown(self):
        """Return to the original directory and delete the ``tmp`` one."""
        os.chdir(self.cwd)
        shutil.rmtree(self.workdir)

    ############################################################################
    # Test administrative stuff
    @unittest.skip("Not implemented yet.")
    def test_parse_config(self):
        raise Exception()

    ############################################################################
    # Test all local stuff
    def test_get_cached_datasets(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        datasets = connector.get_cached_datasets()
        self.assertIsInstance(datasets, dict)
        self.assertEqual(len(datasets), 2)
        self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)

    def test_get_cached_dataset(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")

        # Patch the API call so that no request leaves the machine.
        with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
            api_mock.return_value = 400, \
                """<oml:authenticate xmlns:oml = "http://openml.org/openml">
                <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
                <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
                <oml:timezone>Europe/Berlin</oml:timezone>
                </oml:authenticate>"""

            connector = APIConnector(cache_directory=workdir)
            dataset = connector.get_cached_dataset(2)
            self.assertIsInstance(dataset, OpenMLDataset)
            # The previous check called ``is_called_once()``, which is not
            # part of the Mock API: the attribute is created on the fly and
            # returns a truthy child mock, so the assertion could never
            # fail.
            # NOTE(review): whether the cached path performs exactly one API
            # call cannot be told from this file, so only the upper bound is
            # enforced; tighten to assertEqual(..., 1) once confirmed.
            self.assertLessEqual(api_mock.call_count, 1)

    def test_get_chached_dataset_description(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        description = connector._get_cached_dataset_description(2)
        self.assertIsInstance(description, dict)

    @unittest.skip("Not implemented yet.")
    def test_get_cached_tasks(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_task(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_splits(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_split(self):
        raise Exception()

    ############################################################################
    # Test all remote stuff

    ############################################################################
    # Datasets
    def test_get_dataset_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        datasets = self.connector.get_dataset_list()
        # 1087 as the number of datasets on openml.org
        self.assertGreaterEqual(len(datasets), 1087)
        for dataset in datasets:
            self.assertEqual(type(dataset), dict)
            self.assertGreaterEqual(len(dataset), 2)
            self.assertIn('did', dataset)
            self.assertIsInstance(dataset['did'], int)
            self.assertIn('status', dataset)
            self.assertTrue(is_string(dataset['status']))
            self.assertIn(dataset['status'],
                          ['in_preparation', 'active', 'deactivated'])

    @unittest.skip("Not implemented yet.")
    def test_datasets_active(self):
        raise NotImplementedError()

    def test_download_datasets(self):
        dids = [1, 2]
        datasets = self.connector.download_datasets(dids)
        self.assertEqual(len(datasets), 2)
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "dataset.arff")))

    def test_download_dataset(self):
        dataset = self.connector.download_dataset(1)
        self.assertEqual(type(dataset), OpenMLDataset)
        self.assertEqual(dataset.name, 'anneal')
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))

    def test_download_rowid(self):
        # Smoke test which checks that the dataset has the row-id set correctly
        did = 164
        dataset = self.connector.download_dataset(did)
        self.assertEqual(dataset.row_id_attribute, 'instance')

    def test_download_dataset_description(self):
        # Only a smoke test, I don't know exactly how to test the URL
        # retrieval and "caching"
        description = self.connector.download_dataset_description(2)
        self.assertIsInstance(description, dict)

    def test_download_dataset_features(self):
        # Only a smoke check
        features = self.connector.download_dataset_features(2)
        self.assertIsInstance(features, dict)

    def test_download_dataset_qualities(self):
        # Only a smoke check
        qualities = self.connector.download_dataset_qualities(2)
        self.assertIsInstance(qualities, dict)

    ############################################################################
    # Tasks
    def test_get_task_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        def check_task(task):
            self.assertEqual(type(task), dict)
            self.assertGreaterEqual(len(task), 2)
            self.assertIn('did', task)
            self.assertIsInstance(task['did'], int)
            self.assertIn('status', task)
            self.assertTrue(is_string(task['status']))
            self.assertIn(task['status'],
                          ['in_preparation', 'active', 'deactivated'])

        tasks = self.connector.get_task_list(task_type_id=1)
        # 1759 as the number of supervised classification tasks retrieved
        # openml.org from this call; don't trust the number on openml.org as
        # it also counts private datasets
        self.assertGreaterEqual(len(tasks), 1759)
        for task in tasks:
            check_task(task)

        tasks = self.connector.get_task_list(task_type_id=2)
        self.assertGreaterEqual(len(tasks), 735)
        for task in tasks:
            check_task(task)

    def test_download_task(self):
        # Only the files left behind by the download are checked.
        self.connector.download_task(1)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "task.xml")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "datasets", "1", "dataset.arff")))

    def test_download_split(self):
        task = self.connector.download_task(1)
        split = self.connector.download_split(task)
        self.assertEqual(type(split), OpenMLSplit)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))

    ############################################################################
    # Runs
    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run_list(self):
        def check_run(run):
            self.assertIsInstance(run, dict)
            self.assertEqual(len(run), 6)

        runs = self.connector.get_runs_list(task_id=1)
        self.assertGreaterEqual(len(runs), 800)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(flow_id=1)
        self.assertGreaterEqual(len(runs), 1)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(setup_id=1)
        self.assertGreaterEqual(len(runs), 260)
        for run in runs:
            check_run(run)

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run(self):
        run = self.connector.download_run(473350)
        self.assertGreaterEqual(len(run.tags), 2)
        self.assertEqual(len(run.datasets), 1)
        self.assertGreaterEqual(len(run.files), 2)
        self.assertGreaterEqual(len(run.evaluations), 18)
        self.assertEqual(len(run.evaluations['f_measure']), 2)

    # ###########################################################################
    # Flows
    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_flow_list(self):
        def check_flow(flow):
            self.assertIsInstance(flow, dict)
            self.assertEqual(len(flow), 6)

        flows = self.connector.get_flow_list()
        self.assertGreaterEqual(len(flows), 1448)
        for flow in flows:
            check_flow(flow)

    def test_upload_dataset(self):
        # Download first so that the ARFF file to upload is in the cache.
        self.connector.download_dataset(3)
        file_path = os.path.join(self.connector.dataset_cache_dir, "3",
                                 "dataset.arff")

        description = """
        <oml:data_set_description xmlns:oml="http://openml.org/openml">
        <oml:name>anneal</oml:name>
        <oml:version>1</oml:version>
        <oml:description>test</oml:description>
        <oml:format>ARFF</oml:format>
        <oml:licence>Public</oml:licence>
        <oml:default_target_attribute>class</oml:default_target_attribute>
        <oml:md5_checksum></oml:md5_checksum>
        </oml:data_set_description>
        """
        return_code, dataset_xml = self.connector.upload_dataset(description,
                                                                 file_path)
        self.assertEqual(return_code, 200)

    def test_upload_dataset_with_url(self):
        description = """
        <oml:data_set_description xmlns:oml="http://openml.org/openml">
        <oml:name>UploadTestWithURL</oml:name>
        <oml:version>1</oml:version>
        <oml:description>test</oml:description>
        <oml:format>ARFF</oml:format>
        <oml:url>http://expdb.cs.kuleuven.be/expdb/data/uci/nominal/iris.arff</oml:url>
        </oml:data_set_description>
        """
        return_code, dataset_xml = self.connector.upload_dataset(description)
        self.assertEqual(return_code, 200)

    def test_upload_flow(self):
        file_path = os.path.join(self.connector.dataset_cache_dir,
                                 "uploadflow.txt")
        # A with-block closes the file even if the write fails and avoids
        # shadowing the ``file`` builtin.
        with open(file_path, "w") as flow_file:
            flow_file.write("Testing upload flow")
        description = '''<oml:flow xmlns:oml="http://openml.org/openml"><oml:name>Test</oml:name><oml:description>description</oml:description> </oml:flow>'''
        return_code, dataset_xml = self.connector.upload_flow(description,
                                                              file_path)
        self.assertEqual(return_code, 200)

    def test_upload_run(self):
        response = urlopen("http://www.openml.org/data/download/224/weka_generated_predictions1977525485999711307.arff")
        # try/finally rather than ``with`` because the Python 2 response
        # object is not a context manager.
        try:
            file_text = response.read()
        finally:
            response.close()
        prediction_file_path = os.path.join(
            self.connector.dataset_cache_dir,
            "weka_generated_predictions1977525485999711307.arff")
        with open(prediction_file_path, "wb") as prediction_file:
            prediction_file.write(file_text)

        description_text = '''<oml:run xmlns:oml="http://openml.org/openml"><oml:task_id>59</oml:task_id><oml:flow_id>67</oml:flow_id></oml:run>'''
        description_path = os.path.join(self.connector.dataset_cache_dir,
                                        "description.xml")
        with open(description_path, "w") as description_file:
            description_file.write(description_text)

        return_code, dataset_xml = self.connector.upload_run(
            prediction_file_path, description_path)
        self.assertEqual(return_code, 200)
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from openml.apiconnector import APIConnector
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
import math

# NOTE(review): the API key is hard-coded and the download below runs as soon
# as this module is executed or imported.
apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
connector = APIConnector(apikey=apikey)

# loading data
dataset = connector.download_dataset(59)


# Utility function to move the midpoint of a colormap to be around
# the values of interest.
class MidpointNormalize(Normalize):
    """``Normalize`` variant that maps a chosen ``midpoint`` to 0.5.

    Values are mapped piecewise-linearly so that ``vmin`` becomes 0,
    ``midpoint`` becomes 0.5 and ``vmax`` becomes 1.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        # Remember the midpoint; vmin, vmax and clip are handled by the
        # base class.
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return ``value`` interpolated onto [0, 1] as a masked array.

        The ``clip`` argument is accepted but not used by this
        implementation.
        """
        # Interpolation nodes (x) and the normalized values they map to (y).
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
from openml.apiconnector import APIConnector
from openml.autorun import openml_run
from sklearn import ensemble
import xmltodict
import os

"""
An example of an automated machine learning experiment using openml_run
"""

# The API key is the first line of ``apikey.txt`` in the working directory.
key_file_path = "apikey.txt"
with open(key_file_path, 'r') as fh:
    # Strip the line terminator: readline() keeps it, and it would
    # otherwise be sent as part of the key.
    key = fh.readline().strip()

task_id = 59
clf = ensemble.RandomForestClassifier()
connector = APIConnector(apikey=key)
task = connector.download_task(task_id)

# openml_run returns the paths of the prediction and description files;
# they are made absolute before being handed to upload_run.
prediction_path, description_path = openml_run(task, clf)

prediction_abspath = os.path.abspath(prediction_path)
description_abspath = os.path.abspath(description_path)

return_code, response = connector.upload_run(prediction_abspath,
                                             description_abspath)

# Only a 200 response is parsed for the id of the new run.
if return_code == 200:
    response_dict = xmltodict.parse(response.content)
    run_id = response_dict['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s" % (run_id))