Example #1
    def test_predict_start(self):
        # Setup variables
        primitives = [
            'sklearn.preprocessing.StandardScaler',
            'sklearn.linear_model.LogisticRegression'
        ]
        pipeline = MLPipeline(primitives)
        pipeline.fit(self.X_train, self.y_train)

        # Mock the first block
        block_mock = Mock()
        pipeline.blocks['sklearn.preprocessing.StandardScaler#1'] = block_mock

        # Run predict starting from the second block
        context = {
            'X': self.X_train,
        }
        int_start = 1
        str_start = 'sklearn.linear_model.LogisticRegression#1'

        pipeline.predict(start_=int_start, **context)
        pipeline.predict(start_=str_start, **context)

        # Assert that mock has not been called
        block_mock.predict.assert_not_called()
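The start_ argument exercised above lets predict() resume mid-pipeline: every block before the given index or name is skipped, so the caller must supply whatever context the starting block expects. A minimal sketch of that flow, with illustrative data (it also assumes predict() accepts the same output_ specifications that fit() does in the later examples):

import numpy as np
from mlblocks import MLPipeline

pipeline = MLPipeline([
    'sklearn.preprocessing.StandardScaler',
    'sklearn.linear_model.LogisticRegression'
])
X = np.random.rand(20, 4)          # illustrative data
y = np.array([0, 1] * 10)
pipeline.fit(X, y)                 # fit the full pipeline first

# Grab the scaler's output, then resume prediction at the classifier.
# With start_=1 the StandardScaler block is skipped entirely, so the
# context we pass in must already contain its output.
context = pipeline.predict(X=X, output_=0)     # context after block 0
y_pred = pipeline.predict(start_=1, **context)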
Example #2
    def test_fit_output(self):

        # Setup variables
        primitives = [
            'sklearn.preprocessing.StandardScaler',
            'sklearn.linear_model.LogisticRegression'
        ]
        pipeline = MLPipeline(primitives)

        int_block = 0
        invalid_int = 10
        str_block = 'sklearn.preprocessing.StandardScaler#1'
        invalid_block = 'InvalidBlockName'
        str_block_variable = 'sklearn.preprocessing.StandardScaler#1.y'
        invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'

        # Run
        int_out = pipeline.fit(self.X_train[0:5],
                               self.y_train[0:5],
                               output_=int_block)
        str_out = pipeline.fit(self.X_train[0:5],
                               self.y_train[0:5],
                               output_=str_block)
        str_out_variable = pipeline.fit(self.X_train[0:5],
                                        self.y_train[0:5],
                                        output_=str_block_variable)
        no_output = pipeline.fit(self.X_train, self.y_train)

        # Assert successful calls
        X = np.array([[0.71269665, -1.45152899, 0.55344946, 0.31740553],
                      [0.26726124, 1.23648766, -1.1557327, -1.0932857],
                      [-1.95991577, 0.967686, -1.1557327, -1.0932857],
                      [0.71269665, -0.645124, 0.39067021, 0.31740553],
                      [0.26726124, -0.10752067, 1.36734573, 1.55176035]])
        y = np.array([1, 0, 0, 1, 2])
        context = {'X': X, 'y': y}
        almost_equal(context, int_out)
        almost_equal(context, str_out)

        almost_equal(y, str_out_variable)

        assert no_output is None

        # Run asserting exceptions
        with self.assertRaises(IndexError):
            pipeline.fit(self.X_train[0:5],
                         self.y_train[0:5],
                         output_=invalid_int)

        with self.assertRaises(ValueError):
            pipeline.fit(self.X_train[0:5],
                         self.y_train[0:5],
                         output_=invalid_block)

        with self.assertRaises(ValueError):
            pipeline.fit(self.X_train[0:5],
                         self.y_train[0:5],
                         output_=invalid_variable)
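Condensed from the assertions above, a hedged summary of the output_ forms that fit() accepts (illustrative data):

import numpy as np
from mlblocks import MLPipeline

pipeline = MLPipeline([
    'sklearn.preprocessing.StandardScaler',
    'sklearn.linear_model.LogisticRegression'
])
X = np.random.rand(10, 4)          # illustrative data
y = np.array([0, 1, 2, 0, 1, 2, 0, 1, 2, 0])

# An int or a 'block#N' name returns the full context after that block.
context = pipeline.fit(X, y, output_=0)
context = pipeline.fit(X, y, output_='sklearn.preprocessing.StandardScaler#1')

# A 'block#N.variable' spec returns that single variable.
y_out = pipeline.fit(X, y, output_='sklearn.preprocessing.StandardScaler#1.y')

# Without output_, fit() returns None; an out-of-range index raises
# IndexError, and an unknown block or variable name raises ValueError.
assert pipeline.fit(X, y) is None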
Example #3
import featuretools as ft
import pandas as pd
from sklearn.metrics import f1_score

# MLPipeline and MLHyperparam come from mlblocks; the exact import path
# depends on the (older) mlblocks release this example targets, and
# make_entity_set is a local helper defined elsewhere in this project.


def run():

    print("============================================")
    print("Testing Multi Table Pipeline")
    print("============================================")

    orders = pd.read_csv("data/Retail/orders.csv")
    order_products = pd.read_csv("data/Retail/order_products.csv")
    label_times = pd.read_csv("data/Retail/label_times.csv")

    X_train = label_times.sample(frac=0.8)
    X_test = label_times.drop(X_train.index)
    y_train = X_train["label"]
    y_test = X_test["label"]

    entity_set = make_entity_set(orders, order_products)

    multitable = MLPipeline(['dfs', 'random_forest_classifier'])

    updated_hyperparam = MLHyperparam('max_depth', 'int', [1, 10])
    updated_hyperparam.block_name = 'dfs'
    # multitable.update_tunable_hyperparams([updated_hyperparam])

    # Check that the hyperparameters are correct.
    for hyperparam in multitable.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the blocks are correct.
    expected_blocks = {'dfs', 'rf_classifier'}
    blocks = set(multitable.blocks.keys())
    assert expected_blocks == blocks

    # Check that we can score properly.
    produce_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'cutoff_time_in_index'): True
    }
    print("\nFitting pipeline...")
    fit_params = {
        ('dfs', 'entityset'): entity_set,
        ('dfs', 'target_entity'): "users",
        ('dfs', 'training_window'): ft.Timedelta("60 days")
    }
    multitable.fit(X_train,
                   y_train,
                   fit_params=fit_params,
                   produce_params=produce_params)
    print("\nFit pipeline.")

    print("\nScoring pipeline...")
    predicted_y_val = multitable.predict(X_test, predict_params=produce_params)
    score = f1_score(y_test, predicted_y_val, average='micro')  # (y_true, y_pred)
    print("\nf1 micro score: %f" % score)

    return score
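This example targets an older mlblocks API: tunable hyperparameters are MLHyperparam objects, and the fit_params/produce_params dictionaries route values to individual blocks through (block_name, argument_name) keys. A sketch of that routing, reusing the variables from the example above (the exact method signatures in that old release are an assumption):

# Each (block_name, argument_name) key delivers one value to one block.
fit_params = {
    ('dfs', 'target_entity'): 'users',                    # -> dfs fit step
    ('dfs', 'training_window'): ft.Timedelta('60 days'),  # -> dfs fit step
}
produce_params = {
    ('dfs', 'entityset'): entity_set,                     # -> dfs produce step
    ('dfs', 'cutoff_time_in_index'): True,
}
multitable.fit(X_train, y_train,
               fit_params=fit_params, produce_params=produce_params)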
Example #4
import glob
import os

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# MLPipeline comes from mlblocks (exact import path depends on the release);
# load_and_segment is a local helper defined elsewhere in this project.


def run(train_size=160, test_size=40):

    print("============================================")
    print("Testing Audio Pipeline")
    print("============================================")

    # Data loading.
    classes = [
        'street_music', 'siren', 'jackhammer', 'gun_shot', 'engine_idling',
        'drilling', 'dog_bark', 'children_playing', 'car_horn',
        'air_conditioner'
    ]

    labels = []
    all_filepaths = []
    for label_class in classes:
        for filepath in glob.glob(
                os.path.join('data/UrbanSound/data', label_class, '*.wav')):
            all_filepaths.append(filepath)
            labels.append(label_class)

    filepaths, filepaths_test, y, y_test = train_test_split(
        all_filepaths, labels, train_size=train_size, test_size=test_size)

    audio_pipeline = MLPipeline([
        'audio_featurizer', 'audio_padder', 'pca', 'random_forest_classifier'
    ])

    # Check that the hyperparameters are correct.
    for hyperparam in audio_pipeline.get_tunable_hyperparams():
        print(hyperparam)

    # Check that the blocks are correct.
    expected_blocks = {
        'audio_featurizer', 'audio_padder', 'pca', 'rf_classifier'
    }
    blocks = set(audio_pipeline.blocks.keys())
    assert expected_blocks == blocks

    # Check that we can score properly.
    print("\nFitting pipeline...")
    X, sample_freqs = load_and_segment(filepaths)
    produce_params = {('audio_featurizer', 'sample_freqs'): sample_freqs}
    audio_pipeline.fit(X, y, produce_params=produce_params)
    print("\nFit pipeline.")

    print("\nScoring pipeline...")
    X_test, sample_freqs_test = load_and_segment(filepaths_test)
    predict_params = {('audio_featurizer', 'sample_freqs'): sample_freqs_test}
    predicted_y_val = audio_pipeline.predict(X_test, predict_params=predict_params)
    score = f1_score(y_test, predicted_y_val, average='micro')  # (y_true, y_pred)
    print("\nf1 micro score: %f" % score)

    return score
Example #5
    def test_fit_produce_debug_str(self):
        outputs = {
            'default': [
                {
                    'name': 'a_name',
                    'variable': 'a_primitive#1.a_variable',
                    'type': 'a_type',
                }
            ]
        }
        mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
        mlpipeline.blocks['a_primitive#1'].fit_args = [
            {
                'name': 'fit_input',
                'type': 'whatever'
            }
        ]

        mlpipeline.blocks['a_primitive#1'].produce_args = [
            {
                'name': 'input',
                'type': 'whatever'
            }
        ]

        mlpipeline.blocks['a_primitive#1'].produce_output = [
            {
                'name': 'a_name',
                'type': 'a_type'
            }
        ]

        expected_return = dict()
        expected_return['debug'] = 'tm'
        expected_return['fit'] = {
            'a_primitive#1': {
                'time': 0,
                'memory': 0,
            }
        }
        expected_return['produce'] = {
            'a_primitive#1': {
                'time': 0,
                'memory': 0,
            }
        }

        returned, debug_returned = mlpipeline.fit(output_='default', debug='tm')

        assert len([returned]) == len(outputs['default'])  # single output comes back unwrapped
        assert isinstance(debug_returned, dict)
        assert set(debug_returned.keys()) == set(expected_return.keys())  # fit / produce
        assert set(debug_returned['fit'].keys()) == set(expected_return['fit'].keys())
        assert set(debug_returned['produce'].keys()) == set(expected_return['produce'].keys())

        for block_name, dictionary in expected_return['fit'].items():
            assert set(debug_returned['fit'][block_name].keys()) == set(dictionary.keys())

        for block_name, dictionary in expected_return['produce'].items():
            assert set(debug_returned['produce'][block_name].keys()) == set(dictionary.keys())
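Per the expected dictionary above, debug='tm' asks fit() to profile time ('t') and memory ('m') for each block; combined with output_ it returns an (output, debug_info) tuple, while without output_ it returns the debug dictionary alone (compare the next example). A minimal sketch with the sklearn pipeline from the earlier tests:

import numpy as np
from mlblocks import MLPipeline

pipeline = MLPipeline([
    'sklearn.preprocessing.StandardScaler',
    'sklearn.linear_model.LogisticRegression'
])
X = np.random.rand(10, 4)          # illustrative data
y = np.array([0, 1] * 5)

debug_info = pipeline.fit(X, y, debug='tm')
for block, stats in debug_info['fit'].items():
    print(block, stats['time'], stats['memory'])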
Example #6
    def test_fit_debug_str(self):
        mlpipeline = MLPipeline(['a_primitive'])
        mlpipeline.blocks['a_primitive#1'].fit_args = [
            {
                'name': 'fit_input',
                'type': 'whatever'
            }
        ]

        expected_return = dict()
        expected_return['debug'] = 'tm'
        expected_return['fit'] = {
            'a_primitive#1': {
                'time': 0,
                'memory': 0,
            }
        }

        returned = mlpipeline.fit(debug='tm')

        assert isinstance(returned, dict)
        assert set(returned.keys()) == set(expected_return.keys())  # fit / produce
        assert set(returned['fit'].keys()) == set(expected_return['fit'].keys())  # block name

        for block_name, dictionary in expected_return['fit'].items():
            assert set(returned['fit'][block_name].keys()) == set(dictionary.keys())
Example #7
    def test_fit_debug(self):
        mlpipeline = MLPipeline(['a_primitive'])
        mlpipeline.blocks['a_primitive#1'].fit_args = [{
            'name': 'fit_input',
            'type': 'whatever'
        }]

        expected_return = dict()
        expected_return["fit"] = {
            "a_primitive#1": {
                "elapsed": 0,
                "input": {"whatever"}
            }
        }

        returned = mlpipeline.fit(debug=True)

        print(returned)
        assert isinstance(returned, dict)
        assert set(returned.keys()) == set(
            expected_return.keys())  # fit / produce
        assert set(returned["fit"].keys()) == set(
            expected_return["fit"].keys())  # block name

        for block_name, dictionary in expected_return["fit"].items():
            assert set(returned["fit"][block_name].keys()) == set(
                dictionary.keys())
Example #8
    def test_fit_produce_debug(self):
        outputs = {
            'default': [{
                'name': 'a_name',
                'variable': 'a_primitive#1.a_variable',
                'type': 'a_type',
            }]
        }
        mlpipeline = MLPipeline(['a_primitive'], outputs=outputs)
        mlpipeline.blocks['a_primitive#1'].fit_args = [{
            'name': 'fit_input',
            'type': 'whatever'
        }]

        mlpipeline.blocks['a_primitive#1'].produce_args = [{
            'name': 'input',
            'type': 'whatever'
        }]

        mlpipeline.blocks['a_primitive#1'].produce_output = [{
            'name': 'a_name',
            'type': 'a_type'
        }]

        expected_return = dict()
        expected_return["fit"] = {
            "a_primitive#1": {
                "elapsed": 0,
                "input": {"whatever"}
            }
        }
        expected_return["produce"] = {
            "a_primitive#1": {
                "elapsed": 0,
                "input": {"whatever"},
                "output": {"whatever"}
            }
        }

        returned, debug_returned = mlpipeline.fit(output_='default',
                                                  debug=True)

        assert len([returned]) == len(outputs["default"])
        assert isinstance(debug_returned, dict)
        assert set(debug_returned.keys()) == set(
            expected_return.keys())  # fit / produce
        assert set(debug_returned["fit"].keys()) == set(
            expected_return["fit"].keys())
        assert set(debug_returned["produce"].keys()) == set(
            expected_return["produce"].keys())

        for block_name, dictionary in expected_return["fit"].items():
            assert set(debug_returned["fit"][block_name].keys()) == set(
                dictionary.keys())

        for block_name, dictionary in expected_return["produce"].items():
            assert set(debug_returned["produce"][block_name].keys()) == set(
                dictionary.keys())
Example #9
    def test_fit_no_debug(self):
        mlpipeline = MLPipeline(['a_primitive'])
        mlpipeline.blocks['a_primitive#1'].fit_args = [{
            'name': 'fit_input',
            'type': 'whatever'
        }]

        returned = mlpipeline.fit(debug=False)

        assert returned is None
Example #10
    def test_fit_pending_one_primitive(self):
        block_1 = get_mlblock_mock()
        block_2 = get_mlblock_mock()
        blocks = OrderedDict((
            ('a.primitive.Name#1', block_1),
            ('a.primitive.Name#2', block_2),
        ))

        self_ = MagicMock(autospec=MLPipeline)
        self_.blocks = blocks
        self_._last_fit_block = 'a.primitive.Name#1'

        MLPipeline.fit(self_)

        expected = [
            call('a.primitive.Name#1'),
        ]
        assert self_._fit_block.call_args_list == expected

        assert not self_._produce_block.called

    def test_fit_output(self):

        # Setup variables
        primitives = [
            'sklearn.preprocessing.StandardScaler',
            'sklearn.linear_model.LogisticRegression'
        ]
        pipeline = MLPipeline(primitives)

        named = 'default'
        list_ = ['default', 0]
        int_block = 0
        invalid_int = 10
        str_block = 'sklearn.preprocessing.StandardScaler#1'
        invalid_block = 'InvalidBlockName'
        str_block_variable = 'sklearn.preprocessing.StandardScaler#1.X'
        invalid_variable = 'sklearn.preprocessing.StandardScaler#1.invalid'

        # Run
        named_out = pipeline.fit(self.X, self.y, output_=named)
        list_out = pipeline.fit(self.X, self.y, output_=list_)
        int_out = pipeline.fit(self.X, self.y, output_=int_block)
        str_out = pipeline.fit(self.X, self.y, output_=str_block)
        str_out_variable = pipeline.fit(self.X,
                                        self.y,
                                        output_=str_block_variable)
        no_output = pipeline.fit(self.X, self.y)

        # Assert successful calls
        X = np.array([
            [2., -0.5, -0.5, -0.5, -0.5],
            [-0.5, 2., -0.5, -0.5, -0.5],
            [-0.5, -0.5, 2., -0.5, -0.5],
            [-0.5, -0.5, -0.5, 2., -0.5],
            [-0.5, -0.5, -0.5, -0.5, 2.],
        ])
        y = np.array([0, 0, 0, 0, 1])
        context = {'X': X, 'y': y}

        almost_equal(named_out, y)
        assert len(list_out) == 2
        almost_equal(list_out[0], y)
        almost_equal(list_out[1], context)
        almost_equal(context, int_out)
        almost_equal(context, str_out)
        almost_equal(X, str_out_variable)
        assert no_output is None

        # Run asserting exceptions
        with self.assertRaises(IndexError):
            pipeline.fit(self.X, self.y, output_=invalid_int)

        with self.assertRaises(ValueError):
            pipeline.fit(self.X, self.y, output_=invalid_block)

        with self.assertRaises(ValueError):
            pipeline.fit(self.X, self.y, output_=invalid_variable)
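Finally, a condensed sketch of the two extra output_ forms this second test covers: a named output from the pipeline's outputs specification ('default' here), and a list mixing any accepted specs, which returns one result per entry in order (illustrative data):

import numpy as np
from mlblocks import MLPipeline

pipeline = MLPipeline([
    'sklearn.preprocessing.StandardScaler',
    'sklearn.linear_model.LogisticRegression'
])
X = np.random.rand(10, 4)
y = np.array([0, 1] * 5)

y_default = pipeline.fit(X, y, output_='default')            # named output
y_out, context = pipeline.fit(X, y, output_=['default', 0])  # list -> list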