def test_invalid_get_labels(self):
  loader = StubLoader()
  schema = Schema({'v': Schema.LABEL})
  ds = Dataset(loader, schema, static=False)
  # get_labels returns a generator; as the generator is only evaluated
  # when actually iterated over, pass it to list() to trigger the error.
  self.assertRaises(RuntimeError, list, ds.get_labels())
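# These tests rely on a StubLoader fixture and a unicode_t alias defined
# elsewhere in the suite. A minimal sketch consistent with the assertions in
# these tests (three records {'v': 1}..{'v': 3}, hence labels '1', '2', '3');
# the real definitions may differ:
from jubakit.base import BaseLoader

unicode_t = str  # presumably aliased for Python 2/3 compatibility

class StubLoader(BaseLoader):
  def rows(self):
    for v in [1, 2, 3]:
      yield {'v': v}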
def test_simple(self):
  loader = StubLoader()
  schema = Schema({'v': Schema.LABEL})
  ds = Dataset(loader, schema)
  for (idx, (label, d)) in ds:
    self.assertEqual(unicode_t(idx + 1), label)
    self.assertEqual(0, len(d.string_values))
    self.assertEqual(0, len(d.num_values))
    self.assertEqual(0, len(d.binary_values))
  self.assertEqual(['1', '2', '3'], list(ds.get_labels()))
def test_from_data(self):
  # load from array format
  ds = Dataset.from_data(
    [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
    [0, 1, 0],           # labels
    ['k1', 'k2', 'k3'],  # feature_names
    ['pos', 'neg'],      # label_names
  )
  expected_labels = ['pos', 'neg', 'pos']
  expected_k1s = [10, 20, 40]
  actual_labels = []
  actual_k1s = []
  for (idx, (label, d)) in ds:
    actual_labels.append(label)
    actual_k1s.append(dict(d.num_values)['k1'])
  self.assertEqual(expected_labels, actual_labels)
  self.assertEqual(expected_k1s, actual_k1s)

  # load from scipy.sparse format
  ds = Dataset.from_data(
    self._create_matrix(),  # data
    [0, 1, 0],              # labels
    ['k1', 'k2', 'k3'],     # feature_names
    ['pos', 'neg'],         # label_names
  )
  expected_labels = ['pos', 'neg', 'pos']
  expected_k1s = [1, None, 4]
  expected_k3s = [2, 3, 6]
  actual_labels = []
  actual_k1s = []
  actual_k3s = []
  for (idx, (label, d)) in ds:
    actual_labels.append(label)
    actual_k1s.append(dict(d.num_values).get('k1', None))
    actual_k3s.append(dict(d.num_values).get('k3', None))
  self.assertEqual(expected_labels, actual_labels)
  self.assertEqual(expected_k1s, actual_k1s)
  self.assertEqual(expected_k3s, actual_k3s)
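# The matrix tests call a self._create_matrix() helper that is not shown in
# this excerpt. A minimal sketch consistent with the expected values
# (k1 = [1, None, 4], k3 = [2, 3, 6]); the helper in the original suite may
# be written differently:
import numpy as np
from scipy.sparse import csr_matrix

def _create_matrix(self):
  """Create a sparse matrix:

  [[1, 0, 2],
   [0, 0, 3],
   [4, 5, 6]]
  """
  row = np.array([0, 0, 1, 2, 2, 2])
  col = np.array([0, 2, 2, 0, 1, 2])
  data = np.array([1, 2, 3, 4, 5, 6])
  return csr_matrix((data, (row, col)), shape=(3, 3))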
def test_from_array(self):
  ds = Dataset.from_array(
    [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
    [0, 1, 0],           # labels
    ['k1', 'k2', 'k3'],  # feature_names
    ['pos', 'neg'],      # label_names
  )
  expected_labels = ['pos', 'neg', 'pos']
  expected_k1s = [10, 20, 40]
  actual_labels = []
  actual_k1s = []
  for (idx, (label, d)) in ds:
    actual_labels.append(label)
    actual_k1s.append(dict(d.num_values)['k1'])
  self.assertEqual(expected_labels, actual_labels)
  self.assertEqual(expected_k1s, actual_k1s)
def test_from_array_without_label(self):
  ds = Dataset.from_array(
    [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
    None,                # labels
    ['k1', 'k2', 'k3'],  # feature_names
    ['pos', 'neg'],      # label_names
  )
  expected_labels = [None, None, None]
  expected_k1s = [10, 20, 40]
  actual_labels = []
  actual_k1s = []
  for (idx, (label, d)) in ds:
    actual_labels.append(label)
    actual_k1s.append(dict(d.num_values)['k1'])
  self.assertEqual(expected_labels, actual_labels)
  self.assertEqual(expected_k1s, actual_k1s)
def test_from_matrix(self):
  ds = Dataset.from_matrix(
    self._create_matrix(),  # data
    [0, 1, 0],              # labels
    ['k1', 'k2', 'k3'],     # feature_names
    ['pos', 'neg'],         # label_names
  )
  expected_labels = ['pos', 'neg', 'pos']
  expected_k1s = [1, None, 4]
  expected_k3s = [2, 3, 6]
  actual_labels = []
  actual_k1s = []
  actual_k3s = []
  for (idx, (label, d)) in ds:
    actual_labels.append(label)
    actual_k1s.append(dict(d.num_values).get('k1', None))
    actual_k3s.append(dict(d.num_values).get('k3', None))
  self.assertEqual(expected_labels, actual_labels)
  self.assertEqual(expected_k1s, actual_k1s)
  self.assertEqual(expected_k3s, actual_k3s)
le.fit(labels)
c = le.transform(y)

# scale dataset with (mean, variance) = (0, 1)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# calculate the domain
X_min = X.min(axis=0)
#X_min = np.ones(X.shape[1])
X_max = X.max(axis=0)
X0, X1 = np.meshgrid(np.linspace(X_min[0], X_max[0], meshsize),
                     np.linspace(X_min[1], X_max[1], meshsize))

# make training dataset
dataset = Dataset.from_array(X, y)
# make mesh dataset to plot decision surface
contourf_dataset = Dataset.from_array(np.c_[X0.ravel(), X1.ravel()])

# setup and run jubatus
config = Config(method=method,
                parameter={'regularization_weight': regularization_weight})
classifier = Classifier.run(config, port=port)

# construct classifier prediction models and dump model weights
for i, _ in enumerate(classifier.train(dataset)):
  model_name = 'decision_surface_{}'.format(i)
  classifier.save(name=model_name)

# prepare figure
fig, ax = plt.subplots()
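# The excerpt ends before the surface is drawn. A minimal sketch of the
# remaining plotting step, assuming the mesh points are scored with the
# classifier and the top-ranked (numeric) label is used as the surface
# value; `Z` and the ax.contourf styling are illustrative, not from the
# original:
Z = np.zeros(X0.size)
for (idx, _, result) in classifier.classify(contourf_dataset):
  # result is a list of (label, score) pairs sorted by score
  Z[idx] = float(result[0][0])
ax.contourf(X0, X1, Z.reshape(X0.shape), alpha=0.5)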
def test_get_labels(self):
  loader = StubLoader()
  schema = Schema({'v': Schema.LABEL})
  ds = Dataset(loader, schema)
  self.assertEqual(['1', '2', '3'], list(ds.get_labels()))
def test_predict(self):
  loader = StubLoader()
  dataset = Dataset(loader)
  # the Datum for the first record should contain the loaded number value
  self.assertEqual(['v', 1.0], dataset[0][1].num_values[0])
from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load the shogun dataset.
train_loader = CSVLoader('shogun.train.csv')
test_loader = CSVLoader('shogun.test.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
  'family_name': Schema.LABEL,
  'first_name': Schema.STRING,
})

# Create a Dataset.
train_dataset = Dataset(train_loader, schema).shuffle()
test_dataset = Dataset(test_loader, schema)

# Create a Classifier Service.
cfg = Config(
  method='PA',
  converter={
    'string_rules': [{
      'key': 'first_name',
      'type': 'unigram',
      'sample_weight': 'bin',
      'global_weight': 'bin',
    }]
  }
)
classifier = Classifier.run(cfg)

# Train the classifier.
for _ in classifier.train(train_dataset):
  pass

# Classify using the classifier.
for (idx, label, result) in classifier.classify(test_dataset):
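  # The excerpt ends at the loop header above. A minimal sketch of a loop
  # body, assuming `result` is a list of (label, score) pairs sorted by
  # score in descending order (variable names here are illustrative):
  true_label = label
  predicted_label = result[0][0]
  print('{0}: true = {1}, predicted = {2}'.format(idx, true_label, predicted_label))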
from sklearn.datasets import load_svmlight_files
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load LIBSVM files.
# Note that these example files are not included in this repository.
# You can fetch them from:
#   https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20
print("Loading LIBSVM files...")
(train_X, train_y, test_X, test_y) = load_svmlight_files(['news20', 'news20.t'])

# Create a Train Dataset.
print("Creating train dataset...")
train_ds = Dataset.from_matrix(train_X, train_y)

# Create a Test Dataset.
print("Creating test dataset...")
test_ds = Dataset.from_matrix(test_X, test_y)

# Create a Classifier Service.
classifier = Classifier.run(Config())

# Train the classifier.
print("Training...")
for (idx, _) in classifier.train(train_ds):
  if idx % 1000 == 0:
    print("Training... ({0} %)".format(100 * idx / len(train_ds)))

# Test the classifier.
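# The excerpt ends at the comment above. A minimal sketch of the test step,
# assuming classifier.classify yields (idx, label, result) with result
# sorted by score, as in other jubakit examples (the metric choice is
# illustrative):
print("Testing...")
(y_true, y_pred) = ([], [])
for (idx, label, result) in classifier.classify(test_ds):
  y_true.append(label)
  y_pred.append(result[0][0])
print("Accuracy: {0}".format(sklearn.metrics.accuracy_score(y_true, y_pred)))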
===================================================

In this example we show classification using Digits dataset.
"""

import sklearn.datasets
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load the digits dataset.
digits = sklearn.datasets.load_digits()

# Create a Dataset.
dataset = Dataset.from_array(digits.data, digits.target)
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier Service.
cfg = Config(method='AROW', parameter={'regularization_weight': 0.1})
classifier = Classifier.run(cfg)
print("Started Service: {0}".format(classifier))

# Train the classifier using the first half of the dataset.
train_ds = dataset[:n_train_samples]
print("Training...: {0}".format(train_ds))
for _ in classifier.train(train_ds):
  pass

# Test the classifier using the last half of the dataset.
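# The excerpt ends at the comment above. A minimal sketch of the test loop,
# assuming classifier.classify yields (idx, label, result) with result
# sorted by score (names below are illustrative):
test_ds = dataset[n_train_samples:]
print("Testing...: {0}".format(test_ds))
(y_true, y_pred) = ([], [])
for (idx, label, result) in classifier.classify(test_ds):
  y_true.append(label)
  y_pred.append(result[0][0])
print(sklearn.metrics.classification_report(y_true, y_pred))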
# what's going on in jubakit.
jubakit.logger.setup_logger(jubakit.logger.INFO)

# Load a CSV file.
loader = CSVLoader('iris.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
  'Species': Schema.LABEL,
}, Schema.NUMBER)

# Display the Schema.
print('Schema: {0}'.format(schema))

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier configuration.
cfg = Config()

# Bulk train-test the classifier.
result = Classifier.train_and_classify(
  cfg,
  dataset[:n_train_samples],
  dataset[n_train_samples:],
  sklearn.metrics.classification_report
)

print('---- Classification Report -----------------------------------')
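# Presumably the report computed by train_and_classify is printed next; a
# one-line sketch, assuming `result` holds the string returned by
# sklearn.metrics.classification_report:
print(result)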
  n_redundant=2,
  n_repeated=0,
  n_classes=2,
  n_clusters_per_class=2,
  weights=None,
  flip_y=0.01,
  class_sep=1.0,
  hypercube=True,
  shift=0.0,
  scale=1.0,
  shuffle=True,
  random_state=0,  # fixed seed
)

# Convert arrays into jubakit Dataset.
dataset = Dataset.from_array(X, y)

# Try finding the best classifier parameter.
param2metrics = {}
for method in ['AROW', 'NHERD', 'CW']:
  for rw in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
    print('Running ({0} / regularization_weight = {1})...'.format(method, rw))

    # Create a config data structure.
    jubatus_config = Config(method=method, parameter={'regularization_weight': rw})

    # It is equivalent to:
    #jubatus_config = Config.default()
    #jubatus_config['method'] = method
    #jubatus_config['parameter']['regularization_weight'] = rw
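    # The excerpt stops after building the config. A sketch of how each
    # configuration might be trained and scored into param2metrics; the
    # half/half train-test split and the accuracy metric are assumptions,
    # not necessarily the original's evaluation scheme:
    classifier = Classifier.run(jubatus_config)
    n_train = int(len(dataset) / 2)
    for _ in classifier.train(dataset[:n_train]):
      pass
    (y_true, y_pred) = ([], [])
    for (_, label, result) in classifier.classify(dataset[n_train:]):
      y_true.append(label)
      y_pred.append(result[0][0])
    classifier.stop()
    n_correct = sum(1 for (t, p) in zip(y_true, y_pred) if t == p)
    param2metrics[(method, rw)] = float(n_correct) / len(y_true)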
from jubakit.classifier import Classifier, Dataset, Config

# switch StratifiedKFold API
sklearn_version = int(sklearn.__version__.split('.')[1])
if sklearn_version < 18:
  from sklearn.cross_validation import StratifiedKFold
else:
  from sklearn.model_selection import StratifiedKFold

# Load built-in `iris` dataset from scikit-learn.
iris = sklearn.datasets.load_iris()

# Convert it into jubakit Dataset.
#dataset = Dataset.from_array(iris.data, iris.target)

# ... or, optionally you can assign feature/label names to improve human-readability.
dataset = Dataset.from_array(iris.data, iris.target, iris.feature_names, iris.target_names)

# Shuffle the dataset, as the dataset is sorted by label.
dataset = dataset.shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
if sklearn_version < 18:
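  # The excerpt ends at the version branch above. A sketch of how the two
  # StratifiedKFold APIs might be bridged (the fold count is illustrative;
  # the old API takes labels directly, the new one splits via .split()):
  train_test_indices = StratifiedKFold(labels, n_folds=10)
else:
  skf = StratifiedKFold(n_splits=10)
  train_test_indices = skf.split(labels, labels)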
  '.user.lang': Schema.STRING,
  '.user.description': Schema.STRING,
}, Schema.IGNORE)

# Create a Classifier Service.
classifier = Classifier.run(Config())

# Number of tweets used for training.
n_train = 1000

print('---- Train: {0} tweets -------------------------------------'.format(n_train))

# Train the classifier using tweets from Twitter stream.
trained_labels = set()
dataset = Dataset(get_loader(), schema)
for (idx, label) in classifier.train(dataset):
  if idx == n_train:
    break
  trained_labels.add(label)
  text_summary = dataset.get(idx)['.text'].replace('\n', '')
  print('Train[{0}]: language {1} >> {2}'.format(idx, label, text_summary))

print('Languages Trained: {0}'.format(str(trained_labels)))

print('---- Prediction (Ctrl-C to stop) -------------------------------------')

try:
  # Classify tweets using the classifier.
  (y_true, y_pred) = ([], [])
  dataset = Dataset(get_loader(), schema)
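  # The excerpt ends inside the try block above. A minimal sketch of how the
  # prediction loop might continue, mirroring the training loop; the
  # KeyboardInterrupt handler is an assumption implied by the
  # "Ctrl-C to stop" banner:
  for (idx, label, result) in classifier.classify(dataset):
    y_true.append(label)
    y_pred.append(result[0][0])
    text_summary = dataset.get(idx)['.text'].replace('\n', '')
    print('Classify[{0}]: true = {1}, predicted = {2} >> {3}'.format(
        idx, label, result[0][0], text_summary))
except KeyboardInterrupt:
  pass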