def test_simplekeras(self, capsys):
    # End-to-end check: train a SimpleKerasModel on the batched dataset
    # and evaluate it on the held-out batched test dataset.
    model = SimpleKerasModel('TestModel', [OneHotFeature()], TestOfferSample)
    train = DataSet('batching_train', maxRAM=13302 * 1000)
    with capsys.disabled():
        model.train(train, batch_size=1000)
        test = DataSet('batching_test', maxRAM=13302 * 1000)
        print len(test)
        print model.evaluate(test)
def test_kerasgenerator(self, capsys):
    # KerasGenerator should assemble batches of feature tensors and
    # one-hot labels from arbitrary sample indices of the dataset.
    with capsys.disabled():
        ds = DataSet('batching_train', maxRAM=13302 * 1000)
        print ds.meta
        gen = KerasGenerator(ds, 10, None, True, 1)
        x, y = gen._get_batches_of_transformed_samples([0, 3, 14])
        assert x.shape == (3, ) + OneHotFeature().output_shape
        assert y.shape == (3, 2)
        ds2 = DataSet('batching_test', maxRAM=13302 * 1000)
        assert len(ds) + len(ds2) >= 10000
        assert len(ds.metaparts) >= 4
def test_training(self, capsys):
    # Train a ScikitLearnModel on the bootstrapped dataset and check that
    # evaluation reports the standard classification metrics.
    model = ScikitLearnModel('TestModel', [OneHotFeature()], TestOfferSample)
    train = DataSet('bootstrapped_training')
    test = DataSet('bootstrapped_test')
    print 'Training'
    model.train(train)
    with capsys.disabled():
        metrics, _, _ = model.evaluate_and_print(test)
        assert 'f1_score' in metrics
        assert 'precision_score' in metrics
        assert 'recall_score' in metrics
        assert metrics['f1_score'] > 0.1
def setup_class(cls):
    # Bootstrap train and test datasets with one-hot labels for the
    # Keras batching tests, alternating the label class per sample.
    model = SimpleKerasModel('TestModel', [OneHotFeature()], TestOfferSample)
    train_samples = [
        TestOfferSample.fromjson({
            'entityid': str(i),
            'TestModelLabel': [0, 1] if i % 2 == 0 else [1, 0]
        }) for i in xrange(0, 10000)
    ]
    test_samples = [
        TestOfferSample.fromjson({
            'entityid': str(i),
            'TestModelLabel': [0, 1] if i % 2 == 0 else [1, 0]
        }) for i in xrange(10000, 12000)
    ]
    DataSet.remove('batching_train')
    DataSet.bootstrap('batching_train', model, train_samples, part_size=2000)
    DataSet.remove('batching_test')
    DataSet.bootstrap('batching_test', model, test_samples, part_size=2000)
def setup_class(cls):
    # Bootstrap train and test datasets with integer labels for the
    # scikit-learn training test, alternating the label class per sample.
    model = ScikitLearnModel('TestModel', [OneHotFeature()], TestOfferSample)
    train_samples = [
        TestOfferSample.fromjson({
            'entityid': str(i),
            'TestModelLabel': i % 2
        }) for i in xrange(0, 10000)
    ]
    test_samples = [
        TestOfferSample.fromjson({
            'entityid': str(i),
            'TestModelLabel': i % 2
        }) for i in xrange(10000, 12000)
    ]
    DataSet.remove('bootstrapped_training')
    DataSet.bootstrap('bootstrapped_training', model, train_samples,
                      part_size=2000, numclasses=2)
    DataSet.remove('bootstrapped_test')
    DataSet.bootstrap('bootstrapped_test', model, test_samples,
                      part_size=2000, numclasses=2)
def testgenerator(self):
    # Generate a dataset through the generator model and verify indexing,
    # caching and the bulk accessors get_all_samples / get_samples.
    DataSet.generate('testdataset', self.generatormodel, maxRAM=288,
                     customclient=self.mongoclient, query='query',
                     numclasses=2, filter={})
    ds = DataSet('testdataset')
    print len(ds)
    assert len(ds) == 1000

    # Single-sample indexing returns (features, label) pairs.
    x, y = ds[0]
    assert y == 0
    assert x.shape == (1, )
    x, y = ds[10]
    xx, yy = ds[9]
    assert x[0] - xx[0] == 1.0
    x, y = ds[150]
    assert y == 150 % 2
    assert len(ds.cache) == 1
    for i in xrange(0, 1000):
        assert ds[i][1] == i % 2

    # Bulk access over the whole dataset.
    X, y_true = ds.get_all_samples()
    assert X.shape == (1000, 1)
    assert np.sum(X) == np.sum(xrange(0, 1000))
    assert y_true.shape == (1000, )
    assert np.sum(y_true) == 0.5 * len(ds)

    # Bulk access over a subset of indices.
    X, y_true = ds.get_samples([14, 15])
    assert X.shape == (2, 1)
    assert X[1][0] - X[0][0] == 1
    assert y_true.shape == (2, )
    assert y_true[0] + y_true[1] == 1
# -*- coding: utf-8 -*-
import sys
import logging

from iwlearn.training import DataSet
from tutorial.common.models import RelocationModelPro

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train-pro')
    test = DataSet('test-pro')
    print 'Samples in train %d, in test %d' % (len(train), len(test))

    model = RelocationModelPro()
    model.train(train)
    model.evaluate_and_print(test)

    # Score the features via cross-validated feature selection and keep
    # the best-ranked ones (score <= 1).
    scored_features = model.feature_selection(test, step=1, n_splits=4)
    selected_features = []
    for i, (feature, score) in enumerate(zip(model.features, scored_features)):
        logging.info('%s %s %s' % (i, feature.name, score))
        if score <= 1:
            selected_features.append(feature)
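    # Not in the original file: log a one-line summary of the selection
    # outcome so the reduced feature set is visible in the output.
    logging.info('Selected %d of %d features'
                 % (len(selected_features), len(model.features)))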
# -*- coding: utf-8 -*-
import sys
import logging

import numpy as np

from iwlearn.training import DataSet
from tutorial.common.models import RelocationModelHyper

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train-hyper')
    test = DataSet('test-hyper')
    print 'Samples in train %d, in test %d' % (len(train), len(test))

    model = RelocationModelHyper()

    # Train the model in a simple way to provide a baseline of the model
    # performance
    model.train(train)
    model.evaluate_and_print(test)
    print
    print

    # Now perform training with the hyperparameter optimization
    n_estimators_range = np.linspace(start=100, stop=600, num=5, dtype=int)
    max_features_range = np.linspace(start=5, stop=10, num=3, dtype=int)
    min_samples_leaf_range = np.linspace(start=1, stop=4, num=2, dtype=int)
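    # The file stops after defining the ranges; the actual search call is not
    # shown. Below is a minimal illustrative sketch, assuming a plain
    # scikit-learn GridSearchCV over a RandomForestClassifier (the real
    # RelocationModelHyper training API may differ). It also assumes the
    # feature matrix has at least 10 columns, so max_features up to 10 is valid.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    param_grid = {
        'n_estimators': n_estimators_range,
        'max_features': max_features_range,
        'min_samples_leaf': min_samples_leaf_range,
    }
    X, y = train.get_all_samples()  # numpy feature matrix and label vector
    search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, n_jobs=-1)
    search.fit(X, y)
    logging.info('Best parameters found: %s' % search.best_params_)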
# -*- coding: utf-8 -*-
import sys
import logging

import iwlearn.mongo as mongo
from iwlearn.training import DataSet
from tutorial.common.samples import RelocationUserSample
from tutorial.common.rules import RelocationRule

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    mongo.setmongouri('mongodb://localhost:27017/')

    # Split by the last digit of the entity id: ids ending in 0-7 go to the
    # training set, ids ending in 8-9 to the test set (roughly 80/20).
    DataSet.remove('train')
    DataSet.generate('train', RelocationRule(), numclasses=2,
                     filter={'entityid': {'$regex': r'^user[0-9]*?[0-7]$'}})

    DataSet.remove('test')
    DataSet.generate('test', RelocationRule(), numclasses=2,
                     filter={'entityid': {'$regex': r'^user[0-9]*?[8-9]$'}})
# -*- coding: utf-8 -*-
import sys
import logging

import iwlearn.mongo as mongo
from iwlearn.training import DataSet
from tutorial.common.models import RelocationModelHyper

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    mongo.setmongouri('mongodb://localhost:27017/')

    DataSet.remove('train-hyper')
    DataSet.generate('train-hyper', RelocationModelHyper(), numclasses=2,
                     filter={'entityid': {'$regex': r'^user[0-9]*?[0-7]$'}})

    DataSet.remove('test-hyper')
    DataSet.generate('test-hyper', RelocationModelHyper(), numclasses=2,
                     filter={'entityid': {'$regex': r'^user[0-9]*?[8-9]$'}})
# -*- coding: utf-8 -*-
import logging
import sys

from iwlearn.training import DataSet
from tutorial.common.rules import RelocationRule

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    test = DataSet('test')
    rule = RelocationRule()
    rule.evaluate_and_print(test)
# -*- coding: utf-8 -*-
import logging
import sys

from iwlearn.training import DataSet

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train')
    print 'Number of samples %d' % len(train)
    train.plot_data(bins=20)
# -*- coding: utf-8 -*-
import sys
import logging

from iwlearn.training import DataSet
from tutorial.common.models import RelocationModel

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train')
    test = DataSet('test')
    print 'Samples in train %d, in test %d' % (len(train), len(test))

    model = RelocationModel()
    model.train(train)
    model.evaluate_and_print(test)
def test_expansion(self, capsys, caplog, monkeypatch):
    with capsys.disabled():
        import shutil
        try:
            shutil.rmtree('input/v2_test')
        except OSError:
            pass

        samples = []
        samples_2 = []
        for x in xrange(0, 100):
            samples.append(
                TestOfferSample.fromjson({
                    'entityid': uuid.uuid4(),
                    'value_1': x,
                    'value_2': x % 17,
                    'TestModelLabel': 1 if x % 17 == 0 else 0
                }))
            samples_2.append(
                TestOfferSample.fromjson({
                    'entityid': uuid.uuid4(),
                    'value_1': 2 * x,
                    'value_2': 2 * x % 17,
                    'TestModelLabel': 1 if x % 17 == 0 else 0
                }))

        model_1 = ScikitLearnModel('TestModel',
                                   [SimpleSampleFeature('value_1')],
                                   TestOfferSample)
        DataSet.bootstrap('v2_test', model_1, samples, part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(ds) == 100
        assert tuple(ds.meta['model_input_shape']) == (1, )
        assert [x['name'] for x in ds.meta['features']
                ] == ['SimpleSampleFeature_value_1']
        assert len(ds.meta['features']) == 1
        assert len(ds.metaparts) == 10
        for k, p in ds.metaparts.iteritems():
            assert p['unordered_features'] == ['SimpleSampleFeature_value_1']

        # Expand vertically: add more samples with the same feature set.
        DataSet.bootstrap('v2_test', model_1, samples_2, part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(ds) == 200
        assert tuple(ds.meta['model_input_shape']) == (1, )
        assert [x['name'] for x in ds.meta['features']
                ] == ['SimpleSampleFeature_value_1']
        assert len(ds.meta['features']) == 1
        assert len(ds.metaparts) == 20
        for k, p in ds.metaparts.iteritems():
            assert p['unordered_features'] == ['SimpleSampleFeature_value_1']

        # Expand horizontally: bootstrap again with an additional feature.
        model_2 = ScikitLearnModel(
            'TestModel',
            [SimpleSampleFeature('value_1'), SimpleSampleFeature('value_2')],
            TestOfferSample)
        caplog.clear()
        monkeypatch.setattr(
            '__builtin__.input',
            lambda x: 'n')  # do not remove parts with the missing feature
        DataSet.bootstrap('v2_test', model_2, samples, part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(caplog.records) == 10
        for tpl in caplog.records:
            assert "does not contain following features: set(['SimpleSampleFeature_value_2'])" in tpl.msg
        assert len(ds) == 200
        assert tuple(ds.meta['model_input_shape']) == (2, )
        assert [x['name'] for x in ds.meta['features']] == [
            'SimpleSampleFeature_value_1', 'SimpleSampleFeature_value_2'
        ]
        assert len(ds.metaparts) == 20
        for k, p in ds.metaparts.iteritems():
            assert 'SimpleSampleFeature_value_1' in p['unordered_features']
            assert len(p['unordered_features']) in (1, 2)

        # The dataset must not crash for parts that do not contain the second
        # feature; their values are filled with MISSING_VALUE instead.
        X, y_true = ds.get_all_samples()
        assert X.shape == (200, 2)
        import numpy as np
        assert np.mean(X[:, 0]) != BaseFeature.MISSING_VALUE
        assert np.mean(X[:, 1]) != BaseFeature.MISSING_VALUE
        assert sum(X[:, 1] == BaseFeature.MISSING_VALUE) == 100

        # Bootstrap again with only the first feature; the second feature
        # is removed from the dataset metadata.
        caplog.clear()
        DataSet.bootstrap('v2_test', model_1, samples, part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(caplog.records) == 1
        assert "Following features removed from dataset_V5.json: set(['SimpleSampleFeature_value_2'])" in \
            caplog.text