Ejemplo n.º 1
0
    def testgenerator(self):
        """Generate 'testdataset' and check indexing, caching and bulk reads.

        The expected values asserted below (1000 samples, labels alternating
        0/1, consecutive feature values differing by 1) presumably follow from
        ``self.generatormodel`` -- confirm against the fixture that builds it.
        """
        name = 'testdataset'
        DataSet.generate(
            name,
            self.generatormodel,
            maxRAM=288,
            customclient=self.mongoclient,
            query='query',
            numclasses=2,
            filter={},
        )
        dataset = DataSet(name)
        # Parenthesised so the line is valid as a print statement and as a
        # print function call; the output is the same either way.
        print(len(dataset))
        assert len(dataset) == 1000

        # Single-item access: the first sample.
        first_features, first_label = dataset[0]
        assert first_label == 0
        assert first_features.shape == (1, )

        # Neighbouring samples: the feature value grows by exactly one.
        tenth_features, _ = dataset[10]
        ninth_features, _ = dataset[9]
        assert tenth_features[0] - ninth_features[0] == 1.0

        _, label_150 = dataset[150]
        assert label_150 == 150 % 2

        # After the item reads above, exactly one entry is expected in the cache.
        assert len(dataset.cache) == 1
        for index in xrange(0, 1000):
            assert dataset[index][1] == index % 2

        # Bulk access: every sample at once.
        all_features, all_labels = dataset.get_all_samples()
        assert all_features.shape == (1000, 1)
        assert np.sum(all_features) == np.sum(xrange(0, 1000))
        assert all_labels.shape == (1000, )
        assert np.sum(all_labels) == 0.5 * len(dataset)

        # Bulk access: an explicit pair of adjacent indices.
        pair_features, pair_labels = dataset.get_samples([14, 15])
        assert pair_features.shape == (2, 1)
        assert pair_features[1][0] - pair_features[0][0] == 1
        assert pair_labels.shape == (2, )
        assert pair_labels[0] + pair_labels[1] == 1
Ejemplo n.º 2
0
    def test_expansion(self, capsys, caplog, monkeypatch):
        """Bootstrap the 'v2_test' dataset repeatedly and check its metadata.

        Steps, in order:
          1. bootstrap 100 samples with a one-feature model;
          2. bootstrap 100 further samples with the same model (more rows);
          3. bootstrap with a two-feature model while answering 'n' to the
             interactive prompt, so parts lacking the new feature are kept;
          4. bootstrap again with the one-feature model and check that the
             feature-removal message is logged.

        Args:
            capsys: pytest fixture; capturing is disabled during the cleanup.
            caplog: pytest fixture used to inspect emitted log records.
            monkeypatch: pytest fixture used to stub the builtin ``input``.
        """
        with capsys.disabled():
            import shutil
            # Best-effort removal of data left by a previous run.
            # ignore_errors=True suppresses removal failures (e.g. the
            # directory does not exist) without also swallowing
            # KeyboardInterrupt/SystemExit the way a bare ``except`` would.
            shutil.rmtree('input/v2_test', ignore_errors=True)

        samples = []
        samples_2 = []
        for x in xrange(0, 100):
            samples.append(
                TestOfferSample.fromjson({
                    'entityid': uuid.uuid4(),
                    'value_1': x,
                    'value_2': x % 17,
                    'TestModelLabel': (1 if x % 17 == 0 else 0)
                }))
            samples_2.append(
                TestOfferSample.fromjson({
                    'entityid': uuid.uuid4(),
                    'value_1': 2 * x,
                    'value_2': 2 * x % 17,
                    'TestModelLabel': 1 if x % 17 == 0 else 0
                }))

        model_1 = ScikitLearnModel('TestModel',
                                   [SimpleSampleFeature('value_1')],
                                   TestOfferSample)
        DataSet.bootstrap('v2_test',
                          model_1,
                          samples,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')

        assert len(ds) == 100
        assert tuple(ds.meta['model_input_shape']) == (1, )
        assert [x['name'] for x in ds.meta['features']
                ] == ['SimpleSampleFeature_value_1']
        assert len(ds.meta['features']) == 1
        assert len(ds.metaparts) == 10
        for k, p in ds.metaparts.iteritems():
            assert p['unordered_features'] == ['SimpleSampleFeature_value_1']

        # Expand vertically: add a second batch of samples (more rows).
        DataSet.bootstrap('v2_test',
                          model_1,
                          samples_2,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')

        assert len(ds) == 200
        assert tuple(ds.meta['model_input_shape']) == (1, )
        assert [x['name'] for x in ds.meta['features']
                ] == ['SimpleSampleFeature_value_1']
        assert len(ds.meta['features']) == 1
        assert len(ds.metaparts) == 20
        for k, p in ds.metaparts.iteritems():
            assert p['unordered_features'] == ['SimpleSampleFeature_value_1']

        # Expand horizontally: add a second feature (one more column).
        model_2 = ScikitLearnModel(
            'TestModel',
            [SimpleSampleFeature('value_1'),
             SimpleSampleFeature('value_2')], TestOfferSample)
        caplog.clear()
        monkeypatch.setattr(
            '__builtin__.input',
            lambda x: 'n')  # do not remove parts with missing feature
        DataSet.bootstrap('v2_test',
                          model_2,
                          samples,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(caplog.records) == 10
        for tpl in caplog.records:
            assert "does not contain following features: set(['SimpleSampleFeature_value_2'])" in tpl.msg
        assert len(ds) == 200
        assert tuple(ds.meta['model_input_shape']) == (2, )
        assert [x['name'] for x in ds.meta['features']] == [
            'SimpleSampleFeature_value_1', 'SimpleSampleFeature_value_2'
        ]
        assert len(ds.metaparts) == 20
        for k, p in ds.metaparts.iteritems():
            # Every part has the first feature; the second one is optional.
            assert 'SimpleSampleFeature_value_1' in p['unordered_features']
            assert len(p['unordered_features']) == 1 or len(
                p['unordered_features']) == 2

        # Check the dataset does not crash on parts that do not contain the
        # second feature: those rows are expected to hold MISSING_VALUE.
        X, y_true = ds.get_all_samples()
        assert X.shape == (200, 2)
        import numpy as np
        assert np.mean(X[:, 0]) != BaseFeature.MISSING_VALUE
        assert np.mean(X[:, 1]) != BaseFeature.MISSING_VALUE
        assert sum(X[:, 1] == BaseFeature.MISSING_VALUE) == 100

        # Go back to the one-feature model: the second feature is removed.
        caplog.clear()
        DataSet.bootstrap('v2_test',
                          model_1,
                          samples,
                          part_size=10,
                          numclasses=2)
        ds = DataSet('v2_test')
        assert len(caplog.records) == 1
        assert "Following features removed from dataset_V5.json: set(['SimpleSampleFeature_value_2'])" in \
               caplog.text