def exercise_2():
    """Plot the 10-fold cross-validated zero-one loss of a random forest.

    Downloads OpenML dataset 44, fits forests with 1, 2, 4, ..., 128 trees
    on every fold of an unshuffled 10-fold split and plots the mean test
    zero-one loss per forest size.
    """
    # NOTE(review): the OpenML API key is hard-coded in source.
    connector = APIConnector(apikey='ca2397ea8a2cdd9707ef39d76576e786')
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    folds = cross_validation.KFold(len(X), n_folds=10, shuffle=False,
                                   random_state=0)
    tree_counts = [int(math.pow(2, exponent)) for exponent in range(0, 8)]
    forest = RandomForestClassifier(oob_score=True,
                                    max_features="auto",
                                    random_state=0)

    mean_losses = []
    for n_trees in tree_counts:
        # The forest size is the same for every fold of this round.
        forest.set_params(n_estimators=n_trees)
        fold_losses = []
        for train_index, test_index in folds:
            forest.fit(X[train_index], y[train_index])
            predictions = forest.predict(X[test_index])
            fold_losses.append(zero_one_loss(y[test_index], predictions))
        mean_losses.append(np.array(fold_losses).mean())

    # Mean loss against the number of trees.
    plt.style.use('ggplot')
    plt.plot(tree_counts, mean_losses, '#009999', marker='o')
    plt.xticks(tree_counts)
    plt.show()
def exercise_3():
    """Compare the ROC AUC of a 100-tree random forest and a decision tree.

    Downloads OpenML dataset 44, evaluates both classifiers on 10 random
    90/10 train/test splits and prints the mean ROC AUC of each model.

    The AUC is computed from the predicted probability of the second class
    (``classes_[1]``) instead of hard ``predict`` labels: thresholded labels
    reduce the ROC curve to a single operating point.  This assumes a
    binary target, which ``roc_auc_score`` on 1-d scores requires anyway.

    The printed labels still say "Error" although the values are mean AUC
    scores; they are kept unchanged so the output format stays the same.
    """
    # NOTE(review): the OpenML API key is hard-coded in source.
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)

    clf = RandomForestClassifier(n_estimators=100, oob_score=True,
                                 max_features="auto",
                                 random_state=0)
    clf_cart = DecisionTreeClassifier()

    auc_forest = []
    auc_cart = []
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)

        # Column 1 holds the probability of the larger class label, which is
        # the class roc_auc_score treats as positive.
        auc_forest.append(
            roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
        auc_cart.append(
            roc_auc_score(y_test, clf_cart.predict_proba(X_test)[:, 1]))

    # Single-element lists, as before, so the printed form is unchanged.
    error = [np.array(auc_forest).mean()]
    error_cart = [np.array(auc_cart).mean()]

    # Single-argument print works under Python 2 and 3; the two spaces
    # reproduce the separator the old ``print a, b`` statement emitted.
    print('Error RandomForest:  %s' % (error,))
    print('Error CART:  %s' % (error_cart,))
 def test_get_cached_datasets(self):
     """Check the datasets cached in the ``files`` directory next to this file."""
     here = os.path.dirname(os.path.abspath(__file__))
     connector = APIConnector(cache_directory=os.path.join(here, "files"))
     cached = connector.get_cached_datasets()
     # Two cached datasets are expected, returned as a dict.
     self.assertIsInstance(cached, dict)
     self.assertEqual(len(cached), 2)
     first_dataset = list(cached.values())[0]
     self.assertIsInstance(first_dataset, OpenMLDataset)
def exercise():
    """Grid-search the ``gamma`` of an RBF SVM on OpenML dataset 59.

    Ten ``gamma`` values, log-spaced in base 2 from 2**-15 to 2**15, are
    scored with 10-fold cross-validated ROC AUC.  The best score, the best
    parameters, the mean score per candidate, the candidate values, their
    positional indices and the tick labels are printed; nothing is plotted.
    """
    # NOTE(review): the OpenML API key is hard-coded in source.
    apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
    connector = APIConnector(apikey=apikey)
    # loading data
    dataset = connector.download_dataset(59)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    clf = svm.SVC(kernel='rbf')
    r = np.logspace(-15, 15, 10, base=2)
    param_dist = {'gamma': r}
    rand = GridSearchCV(clf, param_dist, cv=10, scoring="roc_auc")

    rand.fit(X, y)
    rand_mean_scores = [result.mean_validation_score
                        for result in rand.grid_scores_]
    print(rand.best_score_)
    print(rand.best_params_)

    plt.style.use('ggplot')

    # One index and one label per candidate gamma.  The previous loop ran
    # over range(11) and read r[i-1], which produced 11 labels beginning
    # with the *last* gamma (r[-1]) and so did not line up with x_labels.
    x_labels = list(range(len(r)))
    gammapar1 = list(r)

    print(rand_mean_scores)
    print(r)
    print(x_labels)
    print(gammapar1)
def load_data(dataset_id):
    """Download an OpenML dataset and return ``(X, y, attribute_names)``.

    The API key is the first line of ``~/openml/apikey.txt`` and downloads
    are cached under ``~/openml/cache``.  The number of samples is printed
    before returning.
    """
    openml_root = os.path.join(os.path.expanduser("~"), "openml")

    # The key is the first line of the key file, without its newline.
    with open(os.path.join(openml_root, "apikey.txt"), 'r') as key_file:
        api_key = key_file.readline().rstrip('\n')

    connector = APIConnector(
        cache_directory=os.path.join(openml_root, "cache"),
        apikey=api_key)
    dataset = connector.download_dataset(dataset_id)

    features, labels, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    print("no. of samples :" + str(len(features)))
    return features, labels, attribute_names
    def test_get_cached_dataset(self):
        """Load cached dataset 2 from ``files`` with the API call mocked out."""
        files_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "files")
        auth_response = \
            """<oml:authenticate xmlns:oml = "http://openml.org/openml">
                <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
                <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
                <oml:timezone>Europe/Berlin</oml:timezone>
                </oml:authenticate>"""

        with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
            # Every API call made inside this block returns the canned pair.
            api_mock.return_value = (400, auth_response)

            connector = APIConnector(cache_directory=files_dir)
            cached = connector.get_cached_dataset(2)
            self.assertIsInstance(cached, OpenMLDataset)
            # NOTE(review): ``is_called_once`` is not part of the Mock API;
            # the attribute is auto-created and its return value is truthy,
            # so this assertion cannot fail.  Presumably a call-count check
            # was intended - confirm the expected number of API calls.
            self.assertTrue(connector._perform_api_call.is_called_once())
    def setUp(self):
        """Create a fresh ``tmp`` working directory and build the connector.

        The API key is read from the ``OPENMLAPIKEY`` environment variable
        (``None`` when unset).  The current directory is remembered in
        ``self.cwd`` before changing into the new working directory.

        Raises
        ------
        Exception
            If the ``TRAVIS`` environment variable is set but
            ``OPENMLAPIKEY`` is not.
        """
        apikey = os.environ.get('OPENMLAPIKEY')

        # This check used to sit inside ``try/except: pass``, which swallowed
        # the exception it raised.  It now runs before any directory is
        # created or entered, so a failure leaves nothing to clean up.
        if 'TRAVIS' in os.environ and apikey is None:
            raise Exception('Running on travis-ci, but no environment '
                            'variable OPENMLAPIKEY found.')

        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        self.workdir = os.path.join(workdir, "tmp")
        # Remove leftovers of a previous run; a missing directory is fine.
        shutil.rmtree(self.workdir, ignore_errors=True)

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True

        self.connector = APIConnector(cache_directory=self.workdir,
                                      apikey=apikey)
Example #8
0
def get_dataset(did):
    """Download OpenML dataset ``did`` and return its data and target labels.

    The API key is the first line of ``~/.openml/apikey.txt`` and downloads
    are cached under ``~/.openml/cache``.

    Returns
    -------
    tuple
        ``(X, y, attribute_names, target_attribute_names)`` where the last
        item is element ``[1]`` of the ARFF metadata entry for the default
        target attribute.
    """
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, ".openml")
    cache_dir = os.path.join(openml_dir, "cache")

    # ``with`` already closes the file, so no explicit close is needed.
    # strip() also drops a trailing '\r' from key files with CRLF endings.
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().strip()

    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(did)

    # Only the metadata of the raw ARFF file is used here.
    _, meta = loadarff(dataset.data_file)
    target_attribute = dataset.default_target_attribute
    target_attribute_names = meta[target_attribute][1]
    X, y, attribute_names = dataset.get_dataset(
        target=target_attribute, return_attribute_names=True)

    return X, y, attribute_names, target_attribute_names
def variance_exercise3():
    """Plot the variance of random-forest predictions against forest size.

    Downloads OpenML dataset 44.  For forests of 1, 2, 4, ..., 128 trees the
    test-set predictions of 10 random 90/10 splits are collected and the
    variance over all collected predictions is plotted per forest size.
    """
    # NOTE(review): the OpenML API key is hard-coded in source.
    connector = APIConnector(apikey='ca2397ea8a2cdd9707ef39d76576e786')
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    splits = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                           train_size=0.9, random_state=0)
    tree_counts = [int(math.pow(2, exponent)) for exponent in range(0, 8)]
    forest = RandomForestClassifier(oob_score=True,
                                    max_features="auto",
                                    random_state=0)

    variances = []
    for n_trees in tree_counts:
        forest.set_params(n_estimators=n_trees)
        split_predictions = []
        for train_index, test_index in splits:
            forest.fit(X[train_index], y[train_index])
            split_predictions.append(forest.predict(X[test_index]))
        # Variance over the predictions of all splits taken together.
        variances.append(np.array(split_predictions).var())

    plt.style.use('ggplot')
    plt.plot(tree_counts, variances, '#009999', marker='o')
    plt.xticks(tree_counts)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Variance')
    plt.show()
def bias_exercise3():
    """Plot the mean squared prediction error of a random forest.

    Downloads OpenML dataset 44.  For forests of 1, 2, 4, ..., 128 trees the
    squared difference between true and predicted test labels is averaged
    over all samples of 10 random 90/10 splits and plotted per forest size
    (y-axis label "Bias Squared").
    """
    # connect to openml api
    # NOTE(review): the OpenML API key is hard-coded in source.
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    kf = cross_validation.ShuffleSplit(len(X), n_iter=10, test_size=0.1,
                                       train_size=0.9, random_state=0)
    lst = [int(math.pow(2, exponent)) for exponent in range(0, 8)]

    clf = RandomForestClassifier(oob_score=True,
                                 max_features="auto",
                                 random_state=0)
    error = []
    for n_trees in lst:
        squared_errors = []
        clf.set_params(n_estimators=n_trees)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)

            # Element-wise squared residuals.  This replaces an index loop
            # whose variable shadowed the outer loop variable.
            residuals = y_test - clf.predict(X_test)
            squared_errors.extend(residuals * residuals)
        error.append(np.array(squared_errors).mean())

    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Bias Squared')
    plt.show()
def exercise_1():
    """Scatter-plot the out-of-bag error of a random forest against its size.

    Downloads OpenML dataset 44, fits forests with 1, 2, 4, ..., 128 trees
    on the full data and plots ``1 - oob_score_`` for each forest size.
    """
    # NOTE(review): the OpenML API key is hard-coded in source.
    connector = APIConnector(apikey='ca2397ea8a2cdd9707ef39d76576e786')
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)

    tree_counts = [int(math.pow(2, exponent)) for exponent in range(0, 8)]
    forest = RandomForestClassifier(oob_score=True,
                                    max_features="auto",
                                    random_state=0)

    # Refit the same estimator once per forest size.
    oob_errors = []
    for n_trees in tree_counts:
        forest.set_params(n_estimators=n_trees)
        forest.fit(X, y)
        oob_errors.append(1 - forest.oob_score_)

    plt.style.use('ggplot')
    plt.scatter(tree_counts, oob_errors)
    plt.xticks(tree_counts)
    plt.show()
Example #12
0
 def test_get_chached_dataset_description(self):
     """Check that the cached description of dataset 2 is a dict."""
     here = os.path.dirname(os.path.abspath(__file__))
     connector = APIConnector(cache_directory=os.path.join(here, "files"))
     cached_description = connector._get_cached_dataset_description(2)
     self.assertIsInstance(cached_description, dict)
Example #13
0
        tree.export_graphviz(clf, out_file=f,feature_names=feature_names, class_names=class_names, filled=True, rounded=True,  special_characters=True)
    command = ["dot", "-Tpng", "dt.dot", "-o", figure_name+".png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")


# OpenML connection: the API key is the first line of ~/openml/apikey.txt
# and downloads are cached under ~/openml/cache.
home_dir = os.path.expanduser("~")
openml_dir = os.path.join(home_dir, "openml")
cache_dir = os.path.join(openml_dir, "cache")
with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
    key = fh.readline().rstrip('\n')
openml = APIConnector(cache_directory=cache_dir, apikey=key)
# The dataset used to be requested twice in a row; once is enough.
dataset = openml.download_dataset(10)


# Load the data into a pandas DataFrame with the target in a 'class' column.
X, y, attribute_names = dataset.get_dataset(target=dataset.default_target_attribute, return_attribute_names=True)
lymph = pd.DataFrame(X, columns=attribute_names)
lymph['class'] = y
print(len(lymph))



# histogram of class variable
n, bins, patches = plt.hist(lymph['class'], facecolor='green')
plt.xlabel('class')
Example #14
0
def load(dataset_id):
    """Download an OpenML dataset and return the result of ``get_dataset``.

    Uses the module-level ``apikey`` and prints a progress line first.
    """
    # Single-argument form prints the same text under Python 2 and 3.
    print('Loadding data_id %d' % dataset_id)
    dataset = APIConnector(apikey=apikey).download_dataset(dataset_id)
    return dataset.get_dataset(
        target=dataset.default_target_attribute,
        return_attribute_names=True)
Example #15
0
class TestAPIConnector(unittest.TestCase):
    """Test the APIConnector

    Note
    ----
    The connector used by the remote tests is built with the API key read
    from the ``OPENMLAPIKEY`` environment variable (``None`` when unset).
    Several tests query live data from openml.org.
    """

    def setUp(self):
        """Create a fresh ``tmp`` working directory and build the connector.

        Raises
        ------
        Exception
            If the ``TRAVIS`` environment variable is set but
            ``OPENMLAPIKEY`` is not.
        """
        apikey = os.environ.get('OPENMLAPIKEY')

        # This check used to sit inside ``try/except: pass``, which swallowed
        # the exception it raised.  It runs before any directory is created
        # or entered because tearDown is not called when setUp fails.
        if 'TRAVIS' in os.environ and apikey is None:
            raise Exception('Running on travis-ci, but no environment '
                            'variable OPENMLAPIKEY found.')

        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        self.workdir = os.path.join(workdir, "tmp")
        # Remove leftovers of a previous run; a missing directory is fine.
        shutil.rmtree(self.workdir, ignore_errors=True)

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True

        self.connector = APIConnector(cache_directory=self.workdir,
                                      apikey=apikey)

    def tearDown(self):
        """Return to the original directory and delete the working directory."""
        os.chdir(self.cwd)
        shutil.rmtree(self.workdir)

    ############################################################################
    # Test administrative stuff
    @unittest.skip("Not implemented yet.")
    def test_parse_config(self):
        raise Exception()

    ############################################################################
    # Test all local stuff
    def test_get_cached_datasets(self):
        """The ``files`` cache directory holds two datasets."""
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        datasets = connector.get_cached_datasets()
        self.assertIsInstance(datasets, dict)
        self.assertEqual(len(datasets), 2)
        self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)

    def test_get_cached_dataset(self):
        """Cached dataset 2 loads with the API call mocked out."""
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")

        with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
            api_mock.return_value = 400, \
                """<oml:authenticate xmlns:oml = "http://openml.org/openml">
                <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
                <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
                <oml:timezone>Europe/Berlin</oml:timezone>
                </oml:authenticate>"""

            connector = APIConnector(cache_directory=workdir)
            dataset = connector.get_cached_dataset(2)
            self.assertIsInstance(dataset, OpenMLDataset)
            # NOTE(review): ``is_called_once`` is not part of the Mock API;
            # the attribute is auto-created and its return value is truthy,
            # so this assertion cannot fail.  Presumably a call-count check
            # was intended - confirm the expected number of API calls.
            self.assertTrue(connector._perform_api_call.is_called_once())

    def test_get_chached_dataset_description(self):
        """The cached description of dataset 2 is a dict."""
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        description = connector._get_cached_dataset_description(2)
        self.assertIsInstance(description, dict)

    @unittest.skip("Not implemented yet.")
    def test_get_cached_tasks(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_task(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_splits(self):
        raise Exception()

    @unittest.skip("Not implemented yet.")
    def test_get_cached_split(self):
        raise Exception()

    ############################################################################
    # Test all remote stuff

    ############################################################################
    # Datasets
    def test_get_dataset_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        datasets = self.connector.get_dataset_list()
        # 1087 as the number of datasets on openml.org
        self.assertGreaterEqual(len(datasets), 1087)
        for dataset in datasets:
            self.assertEqual(type(dataset), dict)
            self.assertGreaterEqual(len(dataset), 2)
            self.assertIn('did', dataset)
            self.assertIsInstance(dataset['did'], int)
            self.assertIn('status', dataset)
            self.assertTrue(is_string(dataset['status']))
            self.assertIn(dataset['status'], ['in_preparation', 'active',
                                              'deactivated'])

    @unittest.skip("Not implemented yet.")
    def test_datasets_active(self):
        raise NotImplementedError()

    def test_download_datasets(self):
        """Downloading datasets 1 and 2 caches description and ARFF files."""
        dids = [1, 2]
        datasets = self.connector.download_datasets(dids)
        self.assertEqual(len(datasets), 2)
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "2", "dataset.arff")))

    def test_download_dataset(self):
        """Dataset 1 downloads as an ``OpenMLDataset`` named 'anneal'."""
        dataset = self.connector.download_dataset(1)
        self.assertEqual(type(dataset), OpenMLDataset)
        self.assertEqual(dataset.name, 'anneal')
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "description.xml")))
        self.assertTrue(os.path.exists(os.path.join(
            self.connector.dataset_cache_dir, "1", "dataset.arff")))

    def test_download_rowid(self):
        # Smoke test which checks that the dataset has the row-id set correctly
        did = 164
        dataset = self.connector.download_dataset(did)
        self.assertEqual(dataset.row_id_attribute, 'instance')

    def test_download_dataset_description(self):
        # Only a smoke test, I don't know exactly how to test the URL
        # retrieval and "caching"
        description = self.connector.download_dataset_description(2)
        self.assertIsInstance(description, dict)

    def test_download_dataset_features(self):
        # Only a smoke check
        features = self.connector.download_dataset_features(2)
        self.assertIsInstance(features, dict)

    def test_download_dataset_qualities(self):
        # Only a smoke check
        qualities = self.connector.download_dataset_qualities(2)
        self.assertIsInstance(qualities, dict)

    ############################################################################
    # Tasks
    def test_get_task_list(self):
        # We can only perform a smoke test here because we test on dynamic
        # data from the internet...
        def check_task(task):
            self.assertEqual(type(task), dict)
            self.assertGreaterEqual(len(task), 2)
            self.assertIn('did', task)
            self.assertIsInstance(task['did'], int)
            self.assertIn('status', task)
            self.assertTrue(is_string(task['status']))
            self.assertIn(task['status'],
                          ['in_preparation', 'active', 'deactivated'])

        tasks = self.connector.get_task_list(task_type_id=1)
        # 1759 as the number of supervised classification tasks retrieved
        # openml.org from this call; don't trust the number on openml.org as
        # it also counts private datasets
        self.assertGreaterEqual(len(tasks), 1759)
        for task in tasks:
            check_task(task)

        tasks = self.connector.get_task_list(task_type_id=2)
        self.assertGreaterEqual(len(tasks), 735)
        for task in tasks:
            check_task(task)

    def test_download_task(self):
        """Downloading task 1 writes task, split and dataset files below cwd."""
        # Called for its side effect of writing the files checked below.
        self.connector.download_task(1)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "task.xml")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "datasets", "1", "dataset.arff")))

    def test_download_split(self):
        """The split of task 1 is an ``OpenMLSplit`` backed by datasplits.arff."""
        task = self.connector.download_task(1)
        split = self.connector.download_split(task)
        self.assertEqual(type(split), OpenMLSplit)
        self.assertTrue(os.path.exists(
            os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))

    ############################################################################
    # Runs
    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run_list(self):
        def check_run(run):
            self.assertIsInstance(run, dict)
            self.assertEqual(len(run), 6)

        runs = self.connector.get_runs_list(task_id=1)
        self.assertGreaterEqual(len(runs), 800)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(flow_id=1)
        self.assertGreaterEqual(len(runs), 1)
        for run in runs:
            check_run(run)

        runs = self.connector.get_runs_list(setup_id=1)
        self.assertGreaterEqual(len(runs), 260)
        for run in runs:
            check_run(run)

    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_run(self):
        run = self.connector.download_run(473350)
        self.assertGreaterEqual(len(run.tags), 2)
        self.assertEqual(len(run.datasets), 1)
        self.assertGreaterEqual(len(run.files), 2)
        self.assertGreaterEqual(len(run.evaluations), 18)
        self.assertEqual(len(run.evaluations['f_measure']), 2)

    # ###########################################################################
    # Flows
    @unittest.skip('The method which is tested by this function doesnt exist')
    def test_download_flow_list(self):
        def check_flow(flow):
            self.assertIsInstance(flow, dict)
            self.assertEqual(len(flow), 6)

        flows = self.connector.get_flow_list()
        self.assertGreaterEqual(len(flows), 1448)
        for flow in flows:
            check_flow(flow)

    def test_upload_dataset(self):
        """Uploading the cached ARFF file of dataset 3 returns code 200."""
        # Downloaded first so that the ARFF file uploaded below is in the cache.
        self.connector.download_dataset(3)
        file_path = os.path.join(self.connector.dataset_cache_dir, "3", "dataset.arff")

        description = """ <oml:data_set_description xmlns:oml="http://openml.org/openml">
                        <oml:name>anneal</oml:name>
                        <oml:version>1</oml:version>
                        <oml:description>test</oml:description>
                        <oml:format>ARFF</oml:format>
                        <oml:licence>Public</oml:licence>
                        <oml:default_target_attribute>class</oml:default_target_attribute>
                        <oml:md5_checksum></oml:md5_checksum>
                        </oml:data_set_description>
                         """
        return_code, _ = self.connector.upload_dataset(description, file_path)
        self.assertEqual(return_code, 200)

    def test_upload_dataset_with_url(self):
        """Uploading a dataset description that points to a URL returns code 200."""
        description = """ <oml:data_set_description xmlns:oml="http://openml.org/openml">
                        <oml:name>UploadTestWithURL</oml:name>
                        <oml:version>1</oml:version>
                        <oml:description>test</oml:description>
                        <oml:format>ARFF</oml:format>
                        <oml:url>http://expdb.cs.kuleuven.be/expdb/data/uci/nominal/iris.arff</oml:url>
                        </oml:data_set_description>
                         """
        return_code, _ = self.connector.upload_dataset(description)
        self.assertEqual(return_code, 200)

    def test_upload_flow(self):
        """Uploading a small text file as a flow returns code 200."""
        file_path = os.path.join(self.connector.dataset_cache_dir, "uploadflow.txt")
        # ``with`` closes the file even if the write fails, and avoids
        # shadowing the Python 2 builtin ``file``.
        with open(file_path, "w") as flow_file:
            flow_file.write("Testing upload flow")
        description = '''<oml:flow xmlns:oml="http://openml.org/openml"><oml:name>Test</oml:name><oml:description>description</oml:description> </oml:flow>'''
        return_code, _ = self.connector.upload_flow(description, file_path)
        self.assertEqual(return_code, 200)

    def test_upload_run(self):
        """Uploading a downloaded prediction file with a run description returns 200."""
        response = urlopen("http://www.openml.org/data/download/224/weka_generated_predictions1977525485999711307.arff")
        # Close the HTTP response explicitly; it was previously left open.
        try:
            file_text = response.read()
        finally:
            response.close()
        prediction_file_path = os.path.join(self.connector.dataset_cache_dir, "weka_generated_predictions1977525485999711307.arff")
        with open(prediction_file_path, "wb") as prediction_file:
            prediction_file.write(file_text)

        description_text = '''<oml:run xmlns:oml="http://openml.org/openml"><oml:task_id>59</oml:task_id><oml:flow_id>67</oml:flow_id></oml:run>'''
        description_path = os.path.join(self.connector.dataset_cache_dir, "description.xml")
        with open(description_path, "w") as description_file:
            description_file.write(description_text)

        return_code, _ = self.connector.upload_run(prediction_file_path, description_path)
        self.assertEqual(return_code, 200)
print(__doc__)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from openml.apiconnector import APIConnector
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
import math
# NOTE(review): the OpenML API key is hard-coded in source.
apikey = 'fbc6d4b7868ce52640f6ec74cf076f48'
connector = APIConnector(apikey=apikey)
# Download OpenML dataset 59 when this script runs.
dataset = connector.download_dataset(59)
# Utility class to move the midpoint of a colormap to be around
# the values of interest.

class MidpointNormalize(Normalize):
    """``Normalize`` variant that maps a chosen midpoint to 0.5.

    Values are interpolated piecewise-linearly so that ``vmin`` maps to 0,
    ``midpoint`` to 0.5 and ``vmax`` to 1.
    """

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        """Store ``midpoint`` and pass the remaining arguments to ``Normalize``."""
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        """Return ``value`` interpolated onto [0, 1] as a masked array.

        ``clip`` is accepted but not used by this implementation.
        """
        anchors = [self.vmin, self.midpoint, self.vmax]
        targets = [0, 0.5, 1]
        interpolated = np.interp(value, anchors, targets)
        return np.ma.masked_array(interpolated)
from openml.apiconnector import APIConnector
from openml.autorun import openml_run
from sklearn import ensemble
import xmltodict
import os
"""
An example of an automated machine learning experiment using openml_run
"""

# The API key is the first line of apikey.txt in the current directory.
key_file_path = "apikey.txt"
with open(key_file_path, 'r') as fh:
    # readline() keeps the line terminator; strip it so the key passed to
    # the connector does not end in a newline.
    key = fh.readline().strip()

task_id = 59

clf = ensemble.RandomForestClassifier()
connector = APIConnector(apikey=key)
task = connector.download_task(task_id)

# openml_run returns the paths of the prediction and description files.
prediction_path, description_path = openml_run(task, clf)

prediction_abspath = os.path.abspath(prediction_path)
description_abspath = os.path.abspath(description_path)

return_code, response = connector.upload_run(prediction_abspath, description_abspath)

if return_code == 200:
    response_dict = xmltodict.parse(response.content)
    run_id = response_dict['oml:upload_run']['oml:run_id']
    print("Uploaded run with id %s" % (run_id))
else:
    # Report a failed upload instead of ending silently.
    print("Upload failed with return code %s" % (return_code))