def test_simple(self):
    """Schema.transform should split a row into (label, datum)."""
    schema = Schema({
        'k1': Schema.STRING,
        'k2': Schema.LABEL,
    })

    # Labeled record: the LABEL column becomes the label and is removed
    # from the datum; STRING columns land in string_values.
    (label, d) = schema.transform({'k1': 'abc', 'k2': 'def'})
    self.assertEqual('def', label)
    self.assertEqual({'k1': 'abc'}, dict(d.string_values))

    # Unlabeled record: a None label passes through as None.
    (label, d) = schema.transform({'k1': 'foo', 'k2': None})
    self.assertIsNone(label)
    self.assertEqual({'k1': 'foo'}, dict(d.string_values))
def test_invalid_get_labels(self):
    """get_labels must raise RuntimeError for a non-static dataset."""
    loader = StubLoader()
    ds = Dataset(loader, Schema({'v': Schema.LABEL}), static=False)
    # get_labels returns a lazy generator, which only raises once it is
    # iterated — so hand it to list() inside assertRaises to force
    # evaluation and surface the expected RuntimeError.
    self.assertRaises(RuntimeError, list, ds.get_labels())
def test_simple(self):
    """Iterating a Dataset yields (index, (label, datum)) tuples."""
    schema = Schema({'v': Schema.LABEL})
    ds = Dataset(StubLoader(), schema)
    for idx, (label, d) in ds:
        # StubLoader emits rows whose 'v' is the 1-based row number;
        # that column is consumed as the label, so the label is the
        # stringified index and every datum is left empty.
        self.assertEqual(unicode_t(idx + 1), label)
        for values in (d.string_values, d.num_values, d.binary_values):
            self.assertEqual(0, len(values))
    self.assertEqual(['1', '2', '3'], list(ds.get_labels()))
import sklearn.metrics from jubakit.classifier import Classifier, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader import jubakit.logger # In this example, we enable logging mechanism to show you # what's going on in jubakit. jubakit.logger.setup_logger(jubakit.logger.INFO) # Load a CSV file. loader = CSVLoader('iris.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'Species': Schema.LABEL, }, Schema.NUMBER) # Display Schema print('Schema: {0}'.format(schema)) # Create a Dataset. dataset = Dataset(loader, schema).shuffle() n_samples = len(dataset) n_train_samples = int(n_samples / 2) # Create a Classifier configuration. cfg = Config() # Bulk train-test the classifier. result = Classifier.train_and_classify(
* How to load CSV files and convert it into Jubakit dataset. * Training the classifier using the dataset. * Getting classification result. """ from jubakit.classifier import Classifier, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load a CSV file. loader = CSVLoader('iris.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'Species': Schema.LABEL, 'Sepal.Length': Schema.NUMBER, 'Sepal.Width': Schema.NUMBER, 'Petal.Length': Schema.NUMBER, 'Petal.Width': Schema.NUMBER, }) # Create a Dataset, which is an abstract representation of a set of data # that can be fed to Services like Classifier. `shuffle()` returns a new # Dataset whose order of data is shuffled. Note that datasets are immutable # objects. dataset = Dataset(loader, schema).shuffle() # Create a Classifier Service. # Classifier process starts using a default configuration. cfg = Config.default() classifier = Classifier.run(cfg)
======================================== This is a famous `shogun` classifier example that predicts family name of Shogun from his first name. """ from jubakit.classifier import Classifier, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load the shogun dataset. train_loader = CSVLoader('shogun.train.csv') test_loader = CSVLoader('shogun.test.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'family_name': Schema.LABEL, 'first_name': Schema.STRING, }) # Create a Dataset. train_dataset = Dataset(train_loader, schema).shuffle() test_dataset = Dataset(test_loader, schema) # Create a Classifier Service. cfg = Config(method='PA', converter={ 'string_rules': [{ 'key': 'first_name', 'type': 'unigram', 'sample_weight': 'bin', 'global_weight': 'bin' }]
def test_get_labels(self):
    """get_labels on a (default, static) dataset returns every label in order."""
    ds = Dataset(StubLoader(), Schema({'v': Schema.LABEL}))
    labels = list(ds.get_labels())
    self.assertEqual(['1', '2', '3'], labels)
def test_without_label(self):
    """Constructing a Schema with no LABEL column must not raise."""
    Schema({'k1': Schema.STRING})
# Creates a Twitter stream loader. # Fill in your keys here; you can get keys at: https://apps.twitter.com/ return TwitterStreamLoader( TwitterOAuthHandler( consumer_key='XXXXXXXXXXXXXXXXXXXX', consumer_secret='XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', access_token='XXXXXXXX-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', access_secret='XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', )) # Define a Schema. schema = Schema( { '.lang': Schema.LABEL, '.text': Schema.STRING, '.user.lang': Schema.STRING, '.user.description': Schema.STRING, }, Schema.IGNORE) # Create a Classifier Service. classifier = Classifier.run(Config()) # Number of tweets used for training. n_train = 1000 print('---- Train: {0} tweets -------------------------------------'.format( n_train)) # Train the classifier using tweets from Twitter stream. trained_labels = set()
def test_predict(self):
    """Schema.predict must raise RuntimeError for a label-bearing schema."""
    columns = {
        'k1': Schema.STRING,
        'k2': Schema.LABEL,
    }
    schema = Schema(columns)
    with self.assertRaises(RuntimeError):
        schema.predict({}, True)