def test_run2():
    """Dry-run a DAG holding a single CsvReader node and check the log records."""
    reader_node = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }
    dict_config = {
        "implementation_config": {
            "reader_config": {"csv_reader": reader_node}
        }
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    with LogCapture() as captured:
        dag_runner.run(dry_run=True)

    # every expected record is emitted on the root logger at INFO level
    expected_messages = [
        "Taking nodes to run from default",
        "DRY RUN 0: would run node csv_reader of type reader_config and class CsvReader",
        "All done. Bye bye!",
    ]
    captured.check(*[("root", "INFO", text) for text in expected_messages])
def test_run_bad2():
    """An exception raised inside a node's ``run`` must surface from ``DagRunner.run``."""

    class TestWriterTmp(AbstractWriter):
        """Writer stub whose ``run`` always raises."""

        @staticmethod
        def necessary_config(node_config):
            return set()

        def run(self, data_object):
            raise Exception("Deliberate error")

    NodeFactory().register("TestWriterTmp", TestWriterTmp)

    config = {
        "implementation_config": {
            "writer_config": {
                "mywriter": {
                    "class": "TestWriterTmp",
                    "destinations": []
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    with pytest.raises(Exception) as e:
        runner.run()
    # ``e`` is pytest's ExceptionInfo wrapper; the raised exception (and its
    # message) lives on ``e.value``, as the other tests in this file use it.
    assert "Deliberate error" in str(e.value)
def test_filter_sequence4():
    """With ``section_run`` listing only writer_config, filtering keeps just the writer node."""
    reader_section = {
        "read_data": {
            "class": "CsvReader",
            "filename": "test/tennis.csv",
            "destinations": ["recipe_csv_writer"],
        }
    }
    writer_section = {
        "recipe_csv_writer": {
            "class": "CsvWriter",
            "key": "test_data",
            "dir": "cache",
            "filename": "unittest_similar_recipes.csv",
        }
    }
    dict_config = {
        "metadata": {"section_run": ["writer_config"]},
        "implementation_config": {
            "reader_config": reader_section,
            "writer_config": writer_section,
        },
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    filtered = dag_runner.filter_sequence(["read_data", "recipe_csv_writer"])

    assert filtered == ["recipe_csv_writer"]
def test_run_notification_error():
    """A failing run must raise and post exactly one error notification.

    The config enables ``notify_on_error`` and the reader points at
    "bad/path.csv", so ``runner.run()`` is expected to raise. The notification
    client factory is replaced with a mock so that no real client is built.
    """
    config = {
        "metadata": {
            "section_registry": ["phase1"],
            "notify_on_error": {
                "client": "SlackClient",
                "channel": "some-channel",
                "token": "slack-api-token",
                "member_id": "optional-key",
            },
        },
        "implementation_config": {
            "phase1": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "bad/path.csv"
                }
            }
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)
    runner = DagRunner(configuration)

    # Factory mock that hands out a client we can inspect afterwards.
    mock_client = mock.Mock()
    mock_get_notification_client = mock.Mock(return_value=mock_client)

    path = "primrose.notification_utils.get_notification_client"
    # Pass the prepared mock as the replacement object. Binding the patch with
    # ``as`` would create a brand-new MagicMock and silently discard the
    # factory configured above.
    # NOTE(review): if DagRunner imports the factory with
    # ``from ... import get_notification_client``, the patch target has to be
    # the name inside the dag_runner module instead -- confirm.
    with mock.patch(path, mock_get_notification_client):
        with pytest.raises(Exception):
            runner.run()

    # Asserted outside ``pytest.raises``: any statement placed after the
    # raising call inside that block is never executed.
    assert mock_client.post_message.call_count == 1
def test_run_bad():
    """Running a DAG whose node class was unregistered must fail with an instantiation error."""

    class TestWriterTmp(AbstractWriter):
        """Minimal writer stub; only needed so the configuration validates."""

        @staticmethod
        def necessary_config(node_config):
            return set()

        def run(self, data_object):
            return data_object, False

    NodeFactory().register("TestWriterTmp", TestWriterTmp)

    config = {
        "implementation_config": {
            "writer_config": {
                "mywriter": {
                    "class": "TestWriterTmp",
                    "destinations": []
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    # unregister this class after the runner was built, so the failure
    # happens inside run() rather than while constructing the configuration
    del NodeFactory().name_dict["TestWriterTmp"]

    with pytest.raises(Exception) as e:
        runner.run()
    # ``e`` is pytest's ExceptionInfo wrapper; the message belongs to e.value.
    assert "Issue instantiating mywriter and class TestWriterTmp" in str(
        e.value)
def test_run4():
    """A node returning a true terminate flag ends the run early; checked through the log."""

    class TestWriter(AbstractFileWriter):
        """Writer stub that skips base-class initialisation and asks to terminate."""

        def __init__(self, configuration, instance_name):
            pass

        @staticmethod
        def necessary_config(node_config):
            return set()

        def run(self, data_object):
            # second element is the terminate signal
            return data_object, True

    NodeFactory().register("TestWriter", TestWriter)

    reader_section = {
        "csv_reader": {
            "class": "CsvReader",
            "filename": "test/minimal.csv",
            "destinations": ["csv_writer"],
        }
    }
    writer_section = {"csv_writer": {"class": "TestWriter"}}
    dict_config = {
        "implementation_config": {
            "reader_config": reader_section,
            "writer_config": writer_section,
        }
    }

    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    with LogCapture() as captured:
        dag_runner.run(dry_run=False)

    # every expected record is emitted on the root logger at INFO level
    expected_messages = [
        "Taking nodes to run from default",
        "received node csv_reader of type reader_config and class CsvReader",
        "Reading test/minimal.csv from CSV",
        "received node csv_writer of type writer_config and class TestWriter",
        "Terminating early due to signal from csv_writer",
        "All done. Bye bye!",
    ]
    captured.check(*[("root", "INFO", text) for text in expected_messages])
def test_run6():
    """Dry-run a two-section DAG (reader -> notification) taken from ``section_registry``."""
    metadata = {
        "section_registry": ["phase1", "cleanup_config"],
        "notify_on_error": {
            "client": "SlackClient",
            "channel": "some-channel",
            "token": "slack-api-token",
            "member_id": "optional-key",
        },
    }
    phase1_section = {
        "csv_reader": {
            "class": "CsvReader",
            "filename": "test/minimal.csv",
            "destinations": ["notification"],
        }
    }
    cleanup_section = {
        "notification": {
            "class": "ClientNotification",
            "client": "SlackClient",
            "channel": "some-channel",
            "token": "slack-api-token",
            "member_id": "optional-key",
            "message": "Yay! Sucess",
        }
    }
    dict_config = {
        "metadata": metadata,
        "implementation_config": {
            "phase1": phase1_section,
            "cleanup_config": cleanup_section,
        },
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    with LogCapture() as captured:
        dag_runner.run(dry_run=True)

    # every expected record is emitted on the root logger at INFO level
    expected_messages = [
        "Taking nodes to run from section_registry",
        "DRY RUN 0: would run node csv_reader of type phase1 and class CsvReader",
        "DRY RUN 1: would run node notification of type cleanup_config and class ClientNotification",
        "All done. Bye bye!",
    ]
    captured.check(*[("root", "INFO", text) for text in expected_messages])
def test_create_data_object():
    """``DagRunner.create_data_object`` restores a DataObject from the configured cache file."""
    filename = "dag_runner_create_data_object.pkl"
    # hack part 1: make sure this filename exists so that checks in
    # Configuration pass. The context manager closes the handle right away;
    # a bare open() would leak it and can block the os.remove() below on
    # platforms that refuse to delete open files.
    with open(filename, "w+"):
        pass

    try:
        config = {
            "metadata": {
                "data_object": {
                    "read_from_cache": True,
                    "read_filename": "dag_runner_create_data_object.pkl",
                }
            },
            "implementation_config": {
                "reader_config": {
                    "csv_reader": {
                        "class": "CsvReader",
                        "filename": "test/minimal.csv",
                        "destinations": [],
                    }
                }
            },
        }
        configuration = Configuration(None,
                                      is_dict_config=True,
                                      dict_config=config)

        # hack part 2: now get rid of the placeholder file
        if os.path.exists(filename):
            os.remove(filename)

        # now write the actual object to restore from
        data_object = DataObject(configuration)
        reader = CsvReader(configuration, "csv_reader")
        data_object.add(reader, "some_data")
        data_object.write_to_cache(filename)
        assert os.path.exists(filename)

        # now we get to the code to test
        runner = DagRunner(configuration)
        restored_data_object = runner.create_data_object()

        # run some checks
        assert isinstance(restored_data_object, DataObject)
        assert (restored_data_object.get(
            "csv_reader",
            rtype=DataObjectResponseType.VALUE.value) == "some_data")
    finally:
        # cleanup runs even when an assertion above fails, so no stray
        # pickle file is left in the working directory
        if os.path.exists(filename):
            os.remove(filename)
Beispiel #9
0
def run(config, dry_run=False):
    """Run a primrose job.

    ``config`` is passed to ``Configuration`` as ``config_location`` and
    ``dry_run`` is forwarded unchanged to ``DagRunner.run``.
    """
    # imports are kept local to the function
    from primrose.configuration.configuration import Configuration
    from primrose.dag_runner import DagRunner

    job_configuration = Configuration(config_location=config)
    dag_runner = DagRunner(job_configuration)
    dag_runner.run(dry_run=dry_run)
Beispiel #10
0
def test_init_traverser_from_config():
    """The traverser named in ``metadata.traverser`` becomes ``runner.dag_traverser``."""

    class TestTraverser(DagTraverser):
        """Trivial traverser registered only for this test."""

        def traversal_list(self):
            return []

        def run_section_by_section(self):
            return False

    TraverserFactory().register("TestTraverser", TestTraverser)

    reader_node = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }
    dict_config = {
        "metadata": {"traverser": "TestTraverser"},
        "implementation_config": {
            "reader_config": {"csv_reader": reader_node}
        },
    }

    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    assert isinstance(dag_runner.dag_traverser, TestTraverser)
Beispiel #11
0
def test_filter_sequence6():
    """Dependencies inside one section: a valid order passes, an upstream violation raises."""

    def tennis_reader(destination):
        # all three readers differ only in where they send their output
        return {
            "class": "CsvReader",
            "filename": "test/tennis.csv",
            "destinations": [destination],
        }

    dict_config = {
        "metadata": {"traverser": "DepthFirstTraverser"},
        "implementation_config": {
            "reader_config": {
                "read_data1": tennis_reader("csv_writer"),
                "read_data2": tennis_reader("csv_writer"),
                "read_data3": tennis_reader("read_data2"),
            },
            "writer_config": {
                "csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                }
            },
        },
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    # read_data3 feeds read_data2, so it has to come first
    valid_order = ["read_data3", "read_data2", "read_data1", "csv_writer"]
    assert dag_runner.filter_sequence(valid_order) == valid_order

    invalid_order = ["read_data1", "csv_writer", "read_data3", "read_data2"]
    with pytest.raises(Exception) as excinfo:
        dag_runner.filter_sequence(invalid_order)
    expected = "Upstream path found, from read_data3 to csv_writer"
    assert expected in str(excinfo.value)
Beispiel #12
0
def test_run():
    """Smoke test: a single-reader DAG runs without raising."""
    reader_node = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }
    dict_config = {
        "implementation_config": {
            "reader_config": {"csv_reader": reader_node}
        }
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))
    dag_runner.run()
Beispiel #13
0
def test_run5():
    """Dry-run a reader -> writer DAG whose sections come from ``section_registry``."""
    phase1_section = {
        "csv_reader": {
            "class": "CsvReader",
            "filename": "test/minimal.csv",
            "destinations": ["csv_writer"],
        }
    }
    phase2_section = {
        "csv_writer": {
            "class": "CsvWriter",
            "key": "test_data",
            "dir": "cache",
            "filename": "test/unittest_similar_recipes.csv",
        }
    }
    dict_config = {
        "metadata": {"section_registry": ["phase1", "phase2"]},
        "implementation_config": {
            "phase1": phase1_section,
            "phase2": phase2_section,
        },
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    with LogCapture() as captured:
        dag_runner.run(dry_run=True)

    # every expected record is emitted on the root logger at INFO level
    expected_messages = [
        "Taking nodes to run from section_registry",
        "DRY RUN 0: would run node csv_reader of type phase1 and class CsvReader",
        "DRY RUN 1: would run node csv_writer of type phase2 and class CsvWriter",
        "All done. Bye bye!",
    ]
    captured.check(*[("root", "INFO", text) for text in expected_messages])
Beispiel #14
0
def test_filter_sequence5():
    """A sequence that interleaves sections must be rejected as mismatched with the section."""
    config = {
        "metadata": {
            "section_run": ["writer_config", "reader_config"]
        },
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations":
                    ["recipe_csv_writer", "recipe_csv_writer2"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
                "recipe_csv_writer2": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
            },
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)
    with pytest.raises(Exception) as e:
        runner.filter_sequence(
            ["recipe_csv_writer", "read_data", "recipe_csv_writer2"])
    # Match against the exception itself (e.value), not the ExceptionInfo
    # wrapper, whose string form is not guaranteed to be the message.
    assert (
        "Traverser is mismatched with section writer_config. Expecting set ['recipe_csv_writer', 'recipe_csv_writer2']"
        in str(e.value))
Beispiel #15
0
def test_filter_sequence1():
    """A traversal sequence containing the same node twice must be rejected."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    with pytest.raises(Exception) as e:
        runner.filter_sequence(["csv_reader", "csv_reader"])
    # Match against the exception itself (e.value), not the ExceptionInfo
    # wrapper, whose string form is not guaranteed to be the message.
    assert "You have duplicate nodes from traverser!" in str(e.value)
Beispiel #16
0
def test_filter_sequence2():
    """A traversal sequence naming a node that is not in the config must be rejected."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    with pytest.raises(Exception) as e:
        runner.filter_sequence(["csv_reader", "junk!"])
    # Match against the exception itself (e.value), not the ExceptionInfo
    # wrapper, whose string form is not guaranteed to be the message.
    assert "Unknown key junk!" in str(e.value)
Beispiel #17
0
def test_cache_data_object():
    """``DagRunner.cache_data_object`` writes the DataObject to the configured cache file."""
    config = {
        "metadata": {
            "data_object": {
                "write_to_cache": True,
                "write_filename": "dag_runner_test_cache_data_object.pkl",
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "csv_reader")
    data_object.add(reader, "some_data")

    runner = DagRunner(configuration)

    filename = "dag_runner_test_cache_data_object.pkl"
    # start from a clean slate so the existence check below proves that
    # cache_data_object created the file
    if os.path.exists(filename):
        os.remove(filename)

    try:
        cached = runner.cache_data_object(data_object)

        assert cached
        assert os.path.exists(filename)
    finally:
        # remove the artifact even when an assertion above fails
        if os.path.exists(filename):
            os.remove(filename)
def test_run():
    """Train a linear regression on the iris dataset end to end and check its explained variance."""
    reader_section = {
        "read_data": {
            "class": "SklearnDatasetReader",
            "dataset": "iris",
            "destinations": ["train_test_split"],
        }
    }
    pipeline_section = {
        "train_test_split": {
            "class": "TrainTestSplit",
            "features": [
                "sepal length (cm)",
                "petal length (cm)",
                "petal width (cm)",
            ],
            "target_variable": "sepal width (cm)",
            "training_fraction": 0.65,
            "is_training": True,
            "seed": 42,
            "destinations": ["regression_model"],
        }
    }
    model_section = {
        "regression_model": {
            "class": "SklearnRegressionModel",
            "mode": "train",
            "model": {"class": "linear_model.LinearRegression"},
            "destinations": [],
        }
    }
    dict_config = {
        "implementation_config": {
            "reader_config": reader_section,
            "pipeline_config": pipeline_section,
            "model_config": model_section,
        }
    }
    job_configuration = Configuration(config_location=None,
                                      is_dict_config=True,
                                      dict_config=dict_config)

    result = DagRunner(job_configuration).run()
    scores = result.data_dict["regression_model"]["scores"]
    print(scores)

    explained_variance = scores["Explained variance"]
    assert math.isclose(explained_variance, 0.531103247696713, abs_tol=0.00001)
Beispiel #19
0
def test_filter_sequence3():
    """An empty traversal sequence must be rejected because the section's nodes are missing."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    with pytest.raises(Exception) as e:
        runner.filter_sequence([])
    # Match against the exception itself (e.value), not the ExceptionInfo
    # wrapper, whose string form is not guaranteed to be the message.
    assert "Ran out of nodes for section reader_config. Only received []" in str(
        e.value)
Beispiel #20
0
def test_check_upstream2():
    """``check_for_upstream`` raises when a destination is listed before its source node."""

    def recipe_writer():
        # both writer nodes share the exact same settings
        return {
            "class": "CsvWriter",
            "key": "test_data",
            "dir": "cache",
            "filename": "unittest_similar_recipes.csv",
        }

    dict_config = {
        "metadata": {"section_run": ["writer_config", "reader_config"]},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations":
                    ["recipe_csv_writer", "recipe_csv_writer2"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": recipe_writer(),
                "recipe_csv_writer2": recipe_writer(),
            },
        },
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    bad_order = ["recipe_csv_writer2", "read_data", "recipe_csv_writer"]
    with pytest.raises(Exception) as excinfo:
        dag_runner.check_for_upstream(bad_order)

    expected = "Upstream path found, from read_data to recipe_csv_writer2"
    assert expected in str(excinfo.value)
Beispiel #21
0
def test_check_upstream():
    """``check_for_upstream`` returns False for a sequence that lists the source before its destinations."""
    config = {
        "metadata": {
            "section_run": ["reader_config", "writer_config"]
        },
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations":
                    ["recipe_csv_writer", "recipe_csv_writer2"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
                "recipe_csv_writer2": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
            },
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)
    runner = DagRunner(configuration)
    # Identity check instead of ``== False``: pins the result to the actual
    # boolean rather than anything that merely compares equal to it (e.g. 0).
    assert runner.check_for_upstream(
        ["read_data", "recipe_csv_writer", "recipe_csv_writer2"]) is False
Beispiel #22
0
def main():
    """
    Command-line entry point: parse the arguments, set up INFO-level logging
    and run the configuration file through the DagRunner.
    """
    parsed_args, _unused = parse_arguments()

    log_format = "%(asctime)s %(levelname)s %(filename)s %(funcName)s: %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    job_configuration = Configuration(config_location=parsed_args.config_loc)
    dag_runner = DagRunner(job_configuration)
    dag_runner.run(dry_run=parsed_args.is_dry_run)
def test_run():
    """Run a read -> scale -> KMeans -> plot DAG and check that the plot file is written."""
    config = {
        "metadata": {
            "section_registry": [
                "reader_config",
                "pipeline_config",
                "model_config",
                "dataviz_config",
            ]
        },
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/unclustered.csv",
                    "destinations": ["normalize_data"],
                }
            },
            "pipeline_config": {
                "normalize_data": {
                    "class":
                    "SklearnPreprocessingPipeline",
                    "operations": [
                        {
                            "class": "preprocessing.StandardScaler",
                            "columns": ["x", "y"],
                            "args": {
                                "with_mean": True,
                                "with_std": True
                            },
                        },
                    ],
                    "is_training":
                    True,
                    "training_fraction":
                    0.65,
                    "seed":
                    42,
                    "destinations": ["cluster_model"],
                }
            },
            "model_config": {
                "cluster_model": {
                    "class": "SklearnClusterModel",
                    "mode": "train",
                    "features": ["x", "y"],
                    "model": {
                        "class": "cluster.KMeans",
                        "args": {
                            "n_clusters": 6,
                            "random_state": 42
                        },
                    },
                    "destinations": ["cluster_plotter"],
                }
            },
            "dataviz_config": {
                "cluster_plotter": {
                    "class": "ClusterPlotter",
                    "id_col": "predictions",
                    "filename": "clusters.png",
                    "title": "Results of KMeans(k=6)",
                    "destinations": [],
                }
            },
        },
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    fname = "clusters.png"

    # start from a clean slate so the existence check below proves that this
    # run produced the file
    if os.path.exists(fname):
        os.remove(fname)

    try:
        DagRunner(configuration).run()

        assert os.path.exists(fname)
    finally:
        # remove the plot even when the run or the assertion fails
        if os.path.exists(fname):
            os.remove(fname)
Beispiel #24
0
def test_run_pruned():
    """With the switch set to "left", the right branch is logged as pruned and only "left" runs."""
    reader_section = {
        "read_data": {
            "class": "CsvReader",
            "filename": "data/tennis.csv",
            "destinations": ["conditional_node"],
        },
        "conditional_node": {
            "class": "SimpleSwitch",
            "path_to_travel": "left",
            "destinations": ["left", "right"],
        },
    }
    writer_section = {
        "left": {
            "class": "LoggingSuccess",
            "msg": "left node!",
            "level": "INFO",
        },
        "right": {
            "class": "LoggingSuccess",
            "msg": "right node!",
            "level": "INFO",
            "destinations": ["right2"],
        },
        "right2": {
            "class": "LoggingSuccess",
            "msg": "right node2!",
            "level": "INFO",
        },
    }
    dict_config = {
        "implementation_config": {
            "reader_config": reader_section,
            "writer_config": writer_section,
        }
    }
    dag_runner = DagRunner(
        Configuration(None, is_dict_config=True, dict_config=dict_config))

    with LogCapture() as captured:
        dag_runner.run()

    # every expected record is emitted on the root logger at INFO level
    expected_messages = [
        "Taking nodes to run from default",
        "received node read_data of type reader_config and class CsvReader",
        "Reading data/tennis.csv from CSV",
        "received node conditional_node of type reader_config and class SimpleSwitch",
        "Skipping pruned node right",
        "Skipping pruned node right2",
        "received node left of type writer_config and class LoggingSuccess",
        "left node!",
        "All done. Bye bye!",
    ]
    captured.check(*[("root", "INFO", text) for text in expected_messages])