Example #1
0
    def test_from_array_without_label(self):
        """Records built without labels must yield None for every label."""
        ds = Dataset.from_array(
            [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
            None,  # labels
            ['k1', 'k2', 'k3'],  # feature_names
            ['pos', 'neg'],  # label_names
        )

        # Collect (label, k1-value) pairs in iteration order.
        observed = [(label, dict(d.num_values)['k1']) for _, (label, d) in ds]

        self.assertEqual([None, None, None], [label for label, _ in observed])
        self.assertEqual([10, 20, 40], [k1 for _, k1 in observed])
Example #2
0
  def test_from_array(self):
    """Integer labels must be translated through label_names on iteration."""
    ds = Dataset.from_array(
        [ [10,20,30], [20,10,50], [40,10,30] ], # data
        [ 0,          1,          0          ], # labels
        ['k1', 'k2', 'k3'],                     # feature_names
        ['pos', 'neg'],                         # label_names
    )

    # Collect (label, k1-value) pairs in iteration order.
    rows = [(label, dict(d.num_values)['k1']) for _, (label, d) in ds]

    self.assertEqual(['pos', 'neg', 'pos'], [lbl for lbl, _ in rows])
    self.assertEqual([10, 20, 40], [k1 for _, k1 in rows])
from jubakit.classifier import Classifier, Dataset, Config

# Switch StratifiedKFold API depending on the installed scikit-learn.
# BUG FIX: the original parsed only the SECOND version component
# (int(sklearn.__version__.split('.')[1])), which breaks on scikit-learn
# >= 1.0 (e.g. '1.3.0' -> 3 < 18 -> imports the removed legacy module).
# Treat any 1.x+ release as having the new (>= 0.18) model_selection API.
_sklearn_ver = [int(x) for x in sklearn.__version__.split('.')[:2]]
sklearn_version = _sklearn_ver[1] if _sklearn_ver[0] == 0 else 18
if sklearn_version < 18:
    from sklearn.cross_validation import StratifiedKFold
else:
    from sklearn.model_selection import StratifiedKFold

# Load built-in `iris` dataset from scikit-learn.
iris = sklearn.datasets.load_iris()

# Convert it into jubakit Dataset.
#dataset = Dataset.from_array(iris.data, iris.target)
# ... or, optionally you can assign feature/label names to improve human-readability.
dataset = Dataset.from_array(iris.data, iris.target, iris.feature_names,
                             iris.target_names)

# Shuffle the dataset, as the dataset is sorted by label.
dataset = dataset.shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
# NOTE(review): the example is truncated below; the bodies of this branch
# are not visible in this excerpt.
if sklearn_version < 18:
Example #4
0
# Switch StratifiedKFold API depending on the installed scikit-learn.
# BUG FIX: the original parsed only the SECOND version component
# (int(sklearn.__version__.split('.')[1])), which breaks on scikit-learn
# >= 1.0 (e.g. '1.3.0' -> 3 < 18 -> imports the removed legacy module).
# Treat any 1.x+ release as having the new (>= 0.18) model_selection API.
_sklearn_ver = [int(x) for x in sklearn.__version__.split('.')[:2]]
sklearn_version = _sklearn_ver[1] if _sklearn_ver[0] == 0 else 18
if sklearn_version < 18:
    from sklearn.cross_validation import StratifiedKFold
else:
    from sklearn.model_selection import StratifiedKFold


# Load built-in `iris` dataset from scikit-learn.
iris = sklearn.datasets.load_iris()

# Convert it into jubakit Dataset.
#dataset = Dataset.from_array(iris.data, iris.target)
# ... or, optionally you can assign feature/label names to improve human-readability.
dataset = Dataset.from_array(iris.data, iris.target, iris.feature_names, iris.target_names)

# Shuffle the dataset, as the dataset is sorted by label.
dataset = dataset.shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
# NOTE(review): the example is truncated below; the bodies of this branch
# are not visible in this excerpt.
if sklearn_version < 18:
Example #5
0
===================================================

In this example we show classification using the Digits dataset.
"""

import sklearn.datasets
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load the digits dataset.
digits = sklearn.datasets.load_digits()

# Create a Dataset.
dataset = Dataset.from_array(digits.data, digits.target)
n_samples = len(dataset)
# Train on the first half of the records (truncating division).
n_train_samples = int(n_samples / 2)

# Create a Classifier Service (AROW algorithm with a fixed
# regularization weight).
cfg = Config(method='AROW', parameter={'regularization_weight': 0.1})
classifier = Classifier.run(cfg)

print("Started Service: {0}".format(classifier))

# Train the classifier using the first half of the dataset.
train_ds = dataset[:n_train_samples]
print("Training...: {0}".format(train_ds))
# `train` is consumed lazily; drain the iterator (results discarded)
# so every record is actually fed to the service.
for _ in classifier.train(train_ds):
    pass
  # NOTE(review): these are the trailing keyword arguments of a data-generation
  # call (producing X, y) that begins above this excerpt.
  n_redundant=2,
  n_repeated=0,
  n_classes=2,
  n_clusters_per_class=2,
  weights=None,
  flip_y=0.01,
  class_sep=1.0,
  hypercube=True,
  shift=0.0,
  scale=1.0,
  shuffle=True,
  random_state=0,  # fixed seed
)

# Convert arrays into jubakit Dataset.
dataset = Dataset.from_array(X, y)

# Try finding the best classifier parameter: grid search over
# (method, regularization_weight) pairs.
param2metrics = {}
for method in ['AROW', 'NHERD', 'CW']:
  for rw in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
    print('Running ({0} / regularization_weight = {1})...'.format(method, rw))

    # Create a config data structure.
    jubatus_config = Config(method=method, parameter={'regularization_weight': rw})

    # It is equivalent to:
    #jubatus_config = Config.default()
    #jubatus_config['method'] = method
    #jubatus_config['parameter']['regularization_weight'] = rw
Example #7
0
===================================================

In this example we show classification using the Digits dataset.
"""

import sklearn.datasets
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load the digits dataset.
digits = sklearn.datasets.load_digits()

# Create a Dataset.
dataset = Dataset.from_array(digits.data, digits.target)
n_samples = len(dataset)
# Train on the first half of the records (truncating division).
n_train_samples = int(n_samples / 2)

# Create a Classifier Service (AROW algorithm with a fixed
# regularization weight).
cfg = Config(method='AROW', parameter={'regularization_weight': 0.1})
classifier = Classifier.run(cfg)

print("Started Service: {0}".format(classifier))

# Train the classifier using the first half of the dataset.
train_ds = dataset[:n_train_samples]
print("Training...: {0}".format(train_ds))
# `train` is consumed lazily; drain the iterator (results discarded)
# so every record is actually fed to the service.
for _ in classifier.train(train_ds): pass

# Test the classifier using the last half of the dataset.
Example #8
0
    # NOTE(review): these are the trailing keyword arguments of a
    # data-generation call (producing X, y) that begins above this excerpt.
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=0,  # fixed seed
)

# Convert arrays into jubakit Dataset.
dataset = Dataset.from_array(X, y)

# Try finding the best classifier parameter: grid search over
# (method, regularization_weight) pairs.
param2metrics = {}
for method in ['AROW', 'NHERD', 'CW']:
    for rw in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
        print('Running ({0} / regularization_weight = {1})...'.format(
            method, rw))

        # Create a config data structure.
        jubatus_config = Config(method=method,
                                parameter={'regularization_weight': rw})

        # It is equivalent to:
        #jubatus_config = Config.default()
        #jubatus_config['method'] = method
# NOTE(review): `le`, `labels`, `y`, `X`, `meshsize`, `method`,
# `regularization_weight` and `port` are defined above this excerpt.
le.fit(labels)
# NOTE(review): `c` is never read in this excerpt — confirm against the
# surrounding code whether it is used later or dead.
c = le.transform(y)

# scale dataset with (mean, variance) = (0, 1)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# calculate the domain (per-feature min/max of the scaled data)
X_min = X.min(axis=0)
#X_min = np.ones(X.shape[1])
X_max = X.max(axis=0)
# meshsize x meshsize grid spanning the first two feature dimensions
X0, X1 = np.meshgrid(np.linspace(X_min[0], X_max[0], meshsize),
                     np.linspace(X_min[1], X_max[1], meshsize))

# make training dataset
dataset = Dataset.from_array(X, y)
# make mesh dataset to plot decision surface
contourf_dataset = Dataset.from_array(np.c_[X0.ravel(), X1.ravel()])

# setup and run jubatus
config = Config(method=method,
                parameter={'regularization_weight': regularization_weight})
classifier = Classifier.run(config, port=port)

# construct classifier prediction models and dump model weights:
# snapshot the model after every training step, named by step index
for i, _ in enumerate(classifier.train(dataset)):
    model_name = 'decision_surface_{}'.format(i)
    classifier.save(name=model_name)

# prepare figure
fig, ax = plt.subplots()