Esempio n. 1
0
 def test_simplekeras(self, capsys):
     model = SimpleKerasModel('TestModel', [OneHotFeature()],
                              TestOfferSample)
     train = DataSet('batching_train', maxRAM=13302 * 1000)
     with capsys.disabled():
         model.train(train, batch_size=1000)
         test = DataSet('batching_test', maxRAM=13302 * 1000)
         print len(test)
         print model.evaluate(test)
Esempio n. 2
0
 def test_kerasgenerator(self, capsys):
     with capsys.disabled():
         ds = DataSet('batching_train', maxRAM=13302 * 1000)
         print ds.meta
         gen = KerasGenerator(ds, 10, None, True, 1)
         x, y = gen._get_batches_of_transformed_samples([0, 3, 14])
         assert x.shape == (3, ) + OneHotFeature().output_shape
         assert y.shape == (3, 2)
         ds2 = DataSet('batching_test', maxRAM=13302 * 1000)
         assert len(ds) + len(ds2) >= 10000
         assert len(ds.metaparts) >= 4
Esempio n. 3
0
    def test_training(self, capsys):
        model = ScikitLearnModel('TestModel', [OneHotFeature()],
                                 TestOfferSample)

        train = DataSet('bootstrapped_training')
        test = DataSet('bootstrapped_test')
        print 'Training'
        model.train(train)
        with capsys.disabled():
            metrics, _, _ = model.evaluate_and_print(test)
        assert 'f1_score' in metrics
        assert 'precision_score' in metrics
        assert 'recall_score' in metrics
        assert metrics['f1_score'] > 0.1
Esempio n. 4
0
    def setup_class(cls):
        """Bootstrap the 'batching_train'/'batching_test' datasets with
        12000 synthetic samples, one-hot labeled by the parity of the id."""
        model = SimpleKerasModel('TestModel', [OneHotFeature()],
                                 TestOfferSample)

        def _make_sample(i):
            # Even ids -> [0, 1], odd ids -> [1, 0].
            return TestOfferSample.fromjson({
                'entityid': str(i),
                'TestModelLabel': [0, 1] if i % 2 == 0 else [1, 0]
            })

        train_samples = [_make_sample(i) for i in xrange(0, 10000)]
        test_samples = [_make_sample(i) for i in xrange(10000, 12000)]

        # Always start from a clean dataset before bootstrapping.
        DataSet.remove('batching_train')
        DataSet.bootstrap('batching_train', model, train_samples,
                          part_size=2000)
        DataSet.remove('batching_test')
        DataSet.bootstrap('batching_test', model, test_samples,
                          part_size=2000)
Esempio n. 5
0
    def setup_class(cls):
        """Bootstrap the 'bootstrapped_training'/'bootstrapped_test'
        datasets with 12000 synthetic samples labeled by id parity."""
        model = ScikitLearnModel('TestModel', [OneHotFeature()],
                                 TestOfferSample)

        def _make_sample(i):
            # Label is simply the parity of the numeric id.
            return TestOfferSample.fromjson({
                'entityid': str(i),
                'TestModelLabel': i % 2
            })

        train_samples = [_make_sample(i) for i in xrange(0, 10000)]
        test_samples = [_make_sample(i) for i in xrange(10000, 12000)]

        # Always start from a clean dataset before bootstrapping.
        DataSet.remove('bootstrapped_training')
        DataSet.bootstrap('bootstrapped_training', model, train_samples,
                          part_size=2000, numclasses=2)
        DataSet.remove('bootstrapped_test')
        DataSet.bootstrap('bootstrapped_test', model, test_samples,
                          part_size=2000, numclasses=2)
Esempio n. 6
0
    def testgenerator(self):
        DataSet.generate('testdataset',
                         self.generatormodel,
                         maxRAM=288,
                         customclient=self.mongoclient,
                         query='query',
                         numclasses=2,
                         filter={})
        ds = DataSet('testdataset')
        print len(ds)
        assert len(ds) == 1000
        x, y = ds[0]
        assert y == 0
        assert x.shape == (1, )

        x, y = ds[10]
        xx, yy = ds[9]
        assert x[0] - xx[0] == 1.0

        x, y = ds[150]
        assert y == 150 % 2

        assert len(ds.cache) == 1
        for i in xrange(0, 1000):
            assert ds[i][1] == i % 2

        X, y_true = ds.get_all_samples()
        assert X.shape == (1000, 1)
        assert np.sum(X) == np.sum(xrange(0, 1000))
        assert y_true.shape == (1000, )
        assert np.sum(y_true) == 0.5 * len(ds)

        X, y_true = ds.get_samples([14, 15])
        assert X.shape == (2, 1)
        assert X[1][0] - X[0][0] == 1
        assert y_true.shape == (2, )
        assert y_true[0] + y_true[1] == 1
Esempio n. 7
0
# -*- coding: utf-8 -*-
import sys
import logging
import datetime as dt

from iwlearn.training import DataSet

from tutorial.common.models import RelocationModelPro

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train-pro')
    test = DataSet('test-pro')
    print 'Samples in train %d, in test %d' % (len(train), len(test))

    model = RelocationModelPro()
    model.train(train)
    model.evaluate_and_print(test)

    scored_features = model.feature_selection(test, step=1, n_splits=4)

    selected_features = []
    for i, (feature, score) in enumerate(zip(model.features, scored_features)):
        logging.info('%s %s %s ' % (i, feature.name, score))
        if score <= 1:
            selected_features.append(feature)
Esempio n. 8
0
# -*- coding: utf-8 -*-
import sys
import logging
import datetime as dt

import numpy as np

from iwlearn.training import DataSet

from tutorial.common.models import RelocationModelHyper

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Load the previously generated hyperparameter-tuning datasets.
    train = DataSet('train-hyper')
    test = DataSet('test-hyper')
    print 'Samples in train %d, in test %d' % (len(train), len(test))

    model = RelocationModelHyper()

    # Train the model in a simple way to provide a baseline of the model performance
    model.train(train)
    model.evaluate_and_print(test)

    # Two blank lines on stdout to separate the baseline report from the
    # hyperparameter-search output below.
    print
    print

    # Now perform training with the hyperparameter optimization
    # Candidate parameter grids; presumably consumed by a parameter search
    # further down in the script (not visible in this excerpt) — TODO confirm.
    n_estimators_range = np.linspace(start=100, stop=600, num=5, dtype=int)
    max_features_range = np.linspace(start=5, stop=10, num=3, dtype=int)
    min_samples_leaf_range = np.linspace(start=1, stop=4, num=2, dtype=int)
Esempio n. 9
0
# -*- coding: utf-8 -*-
import sys
import logging

import iwlearn.mongo as mongo
from iwlearn.training import DataSet

from tutorial.common.samples import RelocationUserSample
from tutorial.common.rules import RelocationRule

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    mongo.setmongouri('mongodb://localhost:27017/')

    # Split users by the last digit of their entityid:
    # 0-7 -> training set, 8-9 -> test set.
    for dataset_name, id_pattern in (('train', r'^user[0-9]*?[0-7]$'),
                                     ('test', r'^user[0-9]*?[8-9]$')):
        DataSet.remove(dataset_name)
        DataSet.generate(dataset_name,
                         RelocationRule(),
                         numclasses=2,
                         filter={'entityid': {'$regex': id_pattern}})
Esempio n. 10
0
# -*- coding: utf-8 -*-
import sys
import logging

import iwlearn.mongo as mongo
from iwlearn.training import DataSet

from tutorial.common.models import RelocationModelHyper

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    mongo.setmongouri('mongodb://localhost:27017/')

    # Split users by the last digit of their entityid:
    # 0-7 -> training set, 8-9 -> test set.
    for dataset_name, id_pattern in (('train-hyper', r'^user[0-9]*?[0-7]$'),
                                     ('test-hyper', r'^user[0-9]*?[8-9]$')):
        DataSet.remove(dataset_name)
        DataSet.generate(dataset_name,
                         RelocationModelHyper(),
                         numclasses=2,
                         filter={'entityid': {'$regex': id_pattern}})
Esempio n. 11
0
# -*- coding: utf-8 -*-
import logging
import sys

from iwlearn.training import DataSet
from tutorial.common.rules import RelocationRule

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Print the evaluation of the hand-written rule on the test dataset.
    rule = RelocationRule()
    rule.evaluate_and_print(DataSet('test'))
Esempio n. 12
0
# -*- coding: utf-8 -*-
import logging
import sys

from iwlearn.training import DataSet

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train')
    print 'Number of samples %d' % len(train)

    train.plot_data(bins=20)
Esempio n. 13
0
# -*- coding: utf-8 -*-
import sys
import logging

from iwlearn.training import DataSet

from tutorial.common.models import RelocationModel

if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    train = DataSet('train')
    test = DataSet('test')
    print 'Samples in train %d, in test %d' % (len(train), len(test))

    model = RelocationModel()
    model.train(train)
    model.evaluate_and_print(test)
Esempio n. 14
0
    def test_expansion(self, capsys, caplog, monkeypatch):
        """Verify dataset expansion on repeated bootstraps.

        Covers three scenarios on the 'v2_test' dataset:
        1. vertical expansion: bootstrapping more samples with the same
           feature set adds parts without changing the meta;
        2. horizontal expansion: bootstrapping with an additional feature
           widens the input shape while old parts keep the missing feature
           marked as MISSING_VALUE;
        3. feature removal: bootstrapping with fewer features logs the
           removal from the dataset meta.
        """
        with capsys.disabled():
            import shutil
            try:
                shutil.rmtree('input/v2_test')
            except OSError:
                # Best-effort cleanup: the directory may not exist on a
                # fresh run. Narrowed from a bare `except:` which would
                # also swallow KeyboardInterrupt/SystemExit.
                pass

        samples = []
        samples_2 = []
        for x in xrange(0, 100):
            samples.append(
                TestOfferSample.fromjson({
                    'entityid':
                    uuid.uuid4(),
                    'value_1':
                    x,
                    'value_2':
                    x % 17,
                    'TestModelLabel': (1 if x % 17 == 0 else 0)
                }))
            samples_2.append(
                TestOfferSample.fromjson({
                    'entityid':
                    uuid.uuid4(),
                    'value_1':
                    2 * x,
                    'value_2':
                    2 * x % 17,
                    'TestModelLabel':
                    1 if x % 17 == 0 else 0
                }))

        model_1 = ScikitLearnModel('TestModel',
                                   [SimpleSampleFeature('value_1')],
                                   TestOfferSample)
        DataSet.bootstrap('v2_test',
                          model_1,
                          samples,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')

        # Initial bootstrap: 100 samples in 10 parts, single feature.
        assert len(ds) == 100
        assert tuple(ds.meta['model_input_shape']) == (1, )
        assert [x['name'] for x in ds.meta['features']
                ] == ['SimpleSampleFeature_value_1']
        assert len(ds.meta['features']) == 1
        assert len(ds.metaparts) == 10
        for k, p in ds.metaparts.iteritems():
            assert p['unordered_features'] == ['SimpleSampleFeature_value_1']

        # Expand vertically
        DataSet.bootstrap('v2_test',
                          model_1,
                          samples_2,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')

        # Same feature set, twice the samples and parts.
        assert len(ds) == 200
        assert tuple(ds.meta['model_input_shape']) == (1, )
        assert [x['name'] for x in ds.meta['features']
                ] == ['SimpleSampleFeature_value_1']
        assert len(ds.meta['features']) == 1
        assert len(ds.metaparts) == 20
        for k, p in ds.metaparts.iteritems():
            assert p['unordered_features'] == ['SimpleSampleFeature_value_1']

        # Expand horizontally
        model_2 = ScikitLearnModel(
            'TestModel',
            [SimpleSampleFeature('value_1'),
             SimpleSampleFeature('value_2')], TestOfferSample)
        caplog.clear()
        monkeypatch.setattr(
            '__builtin__.input',
            lambda x: 'n')  # do not remove parts with missing feature
        DataSet.bootstrap('v2_test',
                          model_2,
                          samples,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        # One warning per part that lacks the newly added feature.
        assert len(caplog.records) == 10
        for tpl in caplog.records:
            assert "does not contain following features: set(['SimpleSampleFeature_value_2'])" in tpl.msg
        assert len(ds) == 200
        assert tuple(ds.meta['model_input_shape']) == (2, )
        assert [x['name'] for x in ds.meta['features']] == [
            'SimpleSampleFeature_value_1', 'SimpleSampleFeature_value_2'
        ]
        assert len(ds.metaparts) == 20
        for k, p in ds.metaparts.iteritems():
            assert 'SimpleSampleFeature_value_1' in p['unordered_features']
            assert len(p['unordered_features']) == 1 or len(
                p['unordered_features']) == 2

        # Check dataset would not crash for parts do not containing the second feature
        X, y_true = ds.get_all_samples()
        assert X.shape == (200, 2)
        import numpy as np
        assert np.mean(X[:, 0]) != BaseFeature.MISSING_VALUE
        assert np.mean(X[:, 1]) != BaseFeature.MISSING_VALUE
        # 100 samples from the vertical expansion never had value_2.
        assert sum(X[:, 1] == BaseFeature.MISSING_VALUE) == 100

        # Remove first feature
        caplog.clear()
        DataSet.bootstrap('v2_test',
                          model_1,
                          samples,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(caplog.records) == 1
        assert "Following features removed from dataset_V5.json: set(['SimpleSampleFeature_value_2'])" in \
               caplog.text