import numpy as np
import matplotlib.pyplot as plt
from openml.apiconnector import APIConnector
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import zero_one_loss


def exercise_1():
    # connect to the OpenML API and download dataset 44
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute, return_attribute_names=True)

    error = []
    # numbers of trees to evaluate: 1, 2, 4, ..., 128
    lst = [2 ** i for i in range(8)]
    # train one classifier, varying only n_estimators
    clf = RandomForestClassifier(oob_score=True,
                                 max_features="auto",
                                 random_state=0)
    # loop over n_estimators and record the out-of-bag error
    for i in lst:
        clf.set_params(n_estimators=i)
        clf.fit(X, y)
        # OOB error = 1 - out-of-bag accuracy estimate
        error.append(1 - clf.oob_score_)
    # plot OOB error against the number of trees
    plt.style.use('ggplot')
    plt.scatter(lst, error)
    plt.xticks(lst)
    plt.xlabel('number of trees')
    plt.ylabel('OOB error')
    plt.show()


def exercise_2():
    # connect to the OpenML API and download dataset 44
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute, return_attribute_names=True)

    # 10-fold cross-validation; random_state would only matter with
    # shuffle=True, so it is omitted here
    kf = cross_validation.KFold(len(X), n_folds=10, shuffle=False)
    error = []
    lst = [2 ** i for i in range(8)]
    clf = RandomForestClassifier(oob_score=True,
                                 max_features="auto",
                                 random_state=0)
    for i in lst:
        error_mean = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            error_mean.append(zero_one_loss(y_test, clf.predict(X_test)))
        # average misclassification error across the folds
        error.append(np.array(error_mean).mean())
    # plot cross-validated error against the number of trees
    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.xlabel('number of trees')
    plt.ylabel('cross-validated error')
    plt.show()
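
Both exercises can be run directly; a minimal, hypothetical driver (the __main__ guard is not part of the original snippet):

if __name__ == '__main__':
    exercise_1()
    exercise_2()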
Example #3
    def setUp(self):
        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        self.workdir = os.path.join(workdir, "tmp")
        try:
            shutil.rmtree(self.workdir)
        except OSError:
            pass

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        apikey = os.environ.get('OPENMLAPIKEY')

        # fail fast on travis-ci if no API key is configured
        if 'TRAVIS' in os.environ and apikey is None:
            raise Exception('Running on travis-ci, but no environment '
                            'variable OPENMLAPIKEY found.')

        self.connector = APIConnector(cache_directory=self.workdir,
                                      apikey=apikey)
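
A matching tearDown is not shown in this snippet; a plausible sketch, mirroring what setUp changes (an assumption, not taken from the source):

    # hypothetical counterpart to setUp: restore the working directory and
    # remove the temporary cache directory
    def tearDown(self):
        os.chdir(self.cwd)
        shutil.rmtree(self.workdir, ignore_errors=True)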
Example #4
    def test_get_cached_datasets(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        datasets = connector.get_cached_datasets()
        self.assertIsInstance(datasets, dict)
        self.assertEqual(len(datasets), 2)
        self.assertIsInstance(list(datasets.values())[0], OpenMLDataset)
Example #5
    def setUp(self, api_connector_mock):
        __file__ = inspect.getfile(OpenMLTaskTest)
        self.directory = os.path.dirname(__file__)
        self.split_filename = os.path.join(self.directory, "..", "files",
                                           "tasks", "datasplits.arff")

        api_connector_mock.return_value = None
        self.api_connector = APIConnector()
        self.task = OpenMLTask(1, "supervised classification", 1, "class",
                               "crossvalidation with holdout", None, None,
                               None, None, self.api_connector)
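
The extra setUp argument implies the test patches the APIConnector constructor; a hypothetical sketch of that wiring (the decorator and patch target are assumptions, not taken from the source):

    # hypothetical wiring for the setUp above; the patch target is assumed
    @mock.patch.object(APIConnector, '__init__')
    def setUp(self, api_connector_mock):
        ...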
Example #6
    def test_get_cached_dataset(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")

        with mock.patch.object(APIConnector, "_perform_api_call") as api_mock:
            api_mock.return_value = 400, \
                """<oml:authenticate xmlns:oml = "http://openml.org/openml">
                <oml:session_hash>G9MPPN114ZCZNWW2VN3JE9VF1FMV8Y5FXHUDUL4P</oml:session_hash>
                <oml:valid_until>2014-08-13 20:01:29</oml:valid_until>
                <oml:timezone>Europe/Berlin</oml:timezone>
                </oml:authenticate>"""

            connector = APIConnector(cache_directory=workdir)
            dataset = connector.get_cached_dataset(2)
            self.assertIsInstance(dataset, OpenMLDataset)
            # a Mock returns a truthy object even for a misspelled assertion
            # like is_called_once(), so check the call count explicitly
            self.assertEqual(api_mock.call_count, 1)
Example #7
import os

import numpy as np
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from openml.apiconnector import APIConnector
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score


def get_dataset(did):
    home_dir = os.path.expanduser("~")
    openml_dir = os.path.join(home_dir, ".openml")
    cache_dir = os.path.join(openml_dir, "cache")
    
    # read the API key from ~/.openml/apikey.txt; the with-block closes the
    # file, so no explicit close() is needed
    with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
        key = fh.readline().rstrip('\n')
    
    openml = APIConnector(cache_directory=cache_dir, apikey=key)
    dataset = openml.download_dataset(did)
    # print('Data-set name: %s'%dataset.name)
    # print(dataset.description)
    data, meta = loadarff(dataset.data_file)
    target_attribute = dataset.default_target_attribute
    target_attribute_names = meta[target_attribute][1]
    X, y, attribute_names = dataset.get_dataset(target=target_attribute, return_attribute_names=True)
    
    return X, y, attribute_names, target_attribute_names
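
A minimal usage sketch for get_dataset above; the dataset id (44, as in the other examples here) and the print format are illustrative:

X, y, attribute_names, target_attribute_names = get_dataset(44)
print('%d instances, %d attributes, target values: %s'
      % (X.shape[0], X.shape[1], target_attribute_names))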


def variance_exercise3():
    # connect to the OpenML API and download dataset 44
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute, return_attribute_names=True)

    # ten random 90/10 train/test splits
    kf = cross_validation.ShuffleSplit(len(X),
                                       n_iter=10,
                                       test_size=0.1,
                                       train_size=0.9,
                                       random_state=0)
    total_variance = []
    lst = [2 ** i for i in range(8)]

    clf = RandomForestClassifier(oob_score=True,
                                 max_features="auto",
                                 random_state=0)

    for i in lst:
        variance_fold = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf.fit(X_train, y_train)
            predicted_elements = clf.predict(X_test)
            variance_fold.append(predicted_elements)
        # variance of all predicted labels across the ten splits
        total_variance.append(np.array(variance_fold).var())

    plt.style.use('ggplot')
    plt.plot(lst, total_variance, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Variance')
    plt.show()


def bias_exercise3():
    # connect to the OpenML API and download dataset 44
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute, return_attribute_names=True)

    # ten random 90/10 train/test splits
    kf = cross_validation.ShuffleSplit(len(X),
                                       n_iter=10,
                                       test_size=0.1,
                                       train_size=0.9,
                                       random_state=0)
    error = []
    lst = [2 ** i for i in range(8)]

    clf = RandomForestClassifier(oob_score=True,
                                 max_features="auto",
                                 random_state=0)
    for i in lst:
        error_mean = []
        clf.set_params(n_estimators=i)
        for train_index, test_index in kf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            clf.fit(X_train, y_train)
            predicted_elements = clf.predict(X_test)

            # squared error per test instance; use j so the outer loop
            # variable i (the number of trees) is not shadowed
            for j in range(len(y_test)):
                error_mean.append((y_test[j] - predicted_elements[j]) *
                                  (y_test[j] - predicted_elements[j]))
        # mean squared error across splits, plotted as "Bias Squared" below
        error.append(np.array(error_mean).mean())

    plt.style.use('ggplot')
    plt.plot(lst, error, '#009999', marker='o')
    plt.xticks(lst)
    plt.margins(0.02)
    plt.xlabel('number of trees')
    plt.ylabel('Bias Squared')
    plt.show()


def exercise_3():
    # connect to the OpenML API and download dataset 44
    apikey = 'ca2397ea8a2cdd9707ef39d76576e786'
    connector = APIConnector(apikey=apikey)
    dataset = connector.download_dataset(44)
    X, y, attribute_names = dataset.get_dataset(
        target=dataset.default_target_attribute, return_attribute_names=True)

    # ten random 90/10 train/test splits
    kf = cross_validation.ShuffleSplit(len(X),
                                       n_iter=10,
                                       test_size=0.1,
                                       train_size=0.9,
                                       random_state=0)
    error = []
    error_cart = []
    error_mean = []
    error_mean_cart = []

    clf = RandomForestClassifier(n_estimators=100,
                                 oob_score=True,
                                 max_features="auto",
                                 random_state=0)
    clf_cart = DecisionTreeClassifier()
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        clf_cart.fit(X_train, y_train)

        # roc_auc_score returns a score (higher is better), despite the
        # variable names suggesting an error
        error_mean.append(roc_auc_score(y_test, clf.predict(X_test)))
        error_mean_cart.append(roc_auc_score(y_test, clf_cart.predict(X_test)))

    error.append(np.array(error_mean).mean())
    error_cart.append(np.array(error_mean_cart).mean())

    print('Mean AUC RandomForest: %s' % error)
    print('Mean AUC CART: %s' % error_cart)
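
Note that roc_auc_score is applied above to hard class predictions; AUC is normally computed from class probabilities. A sketch of that variant, assuming the binary target of dataset 44:

    # inside the fold loop: AUC from the predicted probability of the
    # positive class rather than from hard predictions
    error_mean.append(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    error_mean_cart.append(
        roc_auc_score(y_test, clf_cart.predict_proba(X_test)[:, 1]))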
Example #11
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers.core import Dense, Activation
from openml.apiconnector import APIConnector
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

apikey = 'b6da739f426042fa9785167b29887d1a'
connector = APIConnector(apikey=apikey)
print(0)
dataset = connector.download_dataset(554)
# note: these two assignments look like misplaced Theano config flags; as
# plain module-level variables they have no effect
optimizer = None
exception_verbosity = 'high'

print(1)
columns_names = ['feature_' + str(x) for x in range(784)]
columns_names.append('target')
print(2)
train = dataset.get_dataset()
train = pd.DataFrame(train, columns=columns_names)
y = train['target']
X = train.iloc[:, :-1]

X_train = X.iloc[0:60000].values
Y_train = y.iloc[0:60000].values
X_test = X.iloc[60000:].values
Y_test = y.iloc[60000:].values

from keras.utils import np_utils, generic_utils
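
The snippet ends before the model is assembled; a minimal sketch of how the imported pieces would typically fit together, in the same keras 0.x/1.x-era API. Layer sizes, learning rate, and epoch count are assumptions, not taken from the source:

# one-hot encode the targets for a softmax output
Y_train_onehot = np_utils.to_categorical(Y_train, 10)
Y_test_onehot = np_utils.to_categorical(Y_test, 10)

# a small MLP; the architecture is an assumption
model = Sequential()
model.add(Dense(512, input_dim=784))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01))
model.fit(X_train, Y_train_onehot, nb_epoch=10, batch_size=128,
          validation_data=(X_test, Y_test_onehot))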
Example #12
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.arff import loadarff
from openml.apiconnector import APIConnector
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import fbeta_score, confusion_matrix, roc_curve, get_scorer
from subprocess import check_output

home_dir = os.path.expanduser("~")
openml_dir = os.path.join(home_dir, ".openml")
cache_dir = os.path.join(openml_dir, "cache")

# read the API key from ~/.openml/apikey.txt; the with-block closes the file
with open(os.path.join(openml_dir, "apikey.txt"), 'r') as fh:
    key = fh.readline().rstrip('\n')

# connect to OpenML, using the local cache directory
openml = APIConnector(cache_directory=cache_dir, apikey=key)
# datasets = openml.get_dataset_list()
# data = pd.DataFrame(datasets)

dataset = openml.download_dataset(10)
# print('Data-set name: %s'%dataset.name)
# print(dataset.description)
data, meta = loadarff(dataset.data_file)
target_attribute = dataset.default_target_attribute
target_attribute_names = meta[target_attribute][1]
X, y, attribute_names = dataset.get_dataset(target=target_attribute, return_attribute_names=True)
y_values = np.unique(y)
print('y_values: %s' % y_values)
fig, axes_bar = plt.subplots(1, 1)
# plot the distribution of the target attribute
y_values_counts, bin_edges = np.histogram(y, y_values.size, density=False)
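
The example stops right after the histogram is computed; a plausible completion of the bar plot, using the figure already created (the labeling choices are assumptions):

axes_bar.bar(np.arange(y_values.size), y_values_counts)
axes_bar.set_xticks(np.arange(y_values.size))
axes_bar.set_xticklabels(target_attribute_names)
axes_bar.set_xlabel(target_attribute)
axes_bar.set_ylabel('count')
plt.show()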
Example #13
    def test_get_cached_dataset_description(self):
        workdir = os.path.dirname(os.path.abspath(__file__))
        workdir = os.path.join(workdir, "files")
        connector = APIConnector(cache_directory=workdir)
        description = connector._get_cached_dataset_description(2)
        self.assertIsInstance(description, dict)