def test_sections_in_order3():
    config = {
        "metadata": {"section_run": ["writer_config"]},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["recipe_csv_writer"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                }
            },
        },
    }
    config = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    sections, source = config.sections_in_order()
    assert sections == ["writer_config"]
    assert source == "section_run"
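# Hedged usage sketch (not from the source): together with
# test_sections_in_order2 at the end of this section, this pins down the
# contract of sections_in_order() -- it returns (ordered_section_names,
# source), where source is "section_run" when metadata lists the sections
# explicitly (as above) and "default" otherwise.
def sections_in_order_sketch(config):
    sections, source = config.sections_in_order()
    for section in sections:
        print(f"would run section {section} (ordering source: {source})")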
def test_init_ok(config):
    corpus = pd.read_csv("test/minimal.csv")

    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    writer = CsvWriter(configuration, "recipe_csv_writer")

    data_object = DataObject(configuration)
    requestor = CsvReader(configuration, "csv_reader")
    data_object.add(requestor, key="test_data", data=corpus)

    c = configuration.config_for_instance(
        "recipe_csv_writer"
    )  # i.e. the writer_config["recipe_csv_writer"] block
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    writer.run(data_object)

    assert os.path.exists(filename)

    df = pd.read_csv(filename)

    assert corpus.equals(df)
def test_init_other_ok(config):
    config["implementation_config"]["writer_config"]["recipe_file_writer"][
        "filename"] = "unittest_file_writer.other"
    config["implementation_config"]["writer_config"]["recipe_file_writer"][
        "serializer"] = "other"

    test_data_string = "some test data"

    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)

    requestor = CsvReader(configuration, "csv_reader")

    data_object.add(requestor, test_data_string, "test_data")

    writer = Serializer(configuration, "recipe_file_writer")

    c = configuration.config_for_instance("recipe_file_writer")
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    with pytest.raises(Exception, match=r"Unsupported"):
        writer.run(data_object)
def test_init_ok(config):

    test_data_string = "some test data"

    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)

    requestor = CsvReader(configuration, "csv_reader")

    data_object.add(requestor, test_data_string, "test_data")

    writer = FileWriter(configuration, "recipe_file_writer")

    c = configuration.config_for_instance("recipe_file_writer")
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    data_object, terminate = writer.run(data_object)

    assert not terminate

    assert os.path.exists(filename)

    with open(filename) as f:
        read_data = f.read()

    assert test_data_string == read_data
def test_perform_any_config_fragment_substitution_bad():
    config_str = """
    {
        {% include "does/not/exist" %}
        "implementation_config": {
        }
    }
    """
    with pytest.raises(Exception) as e:
        Configuration.perform_any_config_fragment_substitution(config_str)
    assert "Substitution files do not exist: does/not/exist" in str(e)
def test_yaml_perform_any_config_fragment_substitution_env_var(monkeypatch):
    monkeypatch.setenv("TEST","foo")
    config_str = """
{% include "test/metadata_fragment.yml" %}
implementation_config:
{% include "test/read_write_fragment.yml" %}
    """
    final_str = Configuration.perform_any_config_fragment_substitution(config_str)
    expected = """
metadata:
  test: foo
implementation_config:
  reader_config:
    read_data:
      class: CsvReader
      destinations:
      - write_output
      filename: data/tennis.csv
  writer_config:
    write_output:
      class: CsvWriter
      dir: cache
      filename: tennis_output.csv
      key: data
    """
    assert final_str == expected
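# Hedged reconstruction of the included fragments (the files are not shown
# in the source; contents, including the env-var placeholder syntax, are
# inferred from `expected`): test/metadata_fragment.yml presumably holds
#
#   metadata:
#     test: {{ TEST }}
#
# with {{ TEST }} filled from the environment, and
# test/read_write_fragment.yml holds the reader_config/writer_config block
# seen in `expected`.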
def test_init_ok_pickle():
    config = {
        "implementation_config": {
            "reader_config": {
                "pickle_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

    reader = Deserializer(configuration, "pickle_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    data = data_object.get("pickle_reader",
                           rtype=DataObjectResponseType.VALUE.value)

    assert data is not None
    assert set(data.keys()) == {"test", "model"}

    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
def test_class_package(mock_env):
    config_path = {
        "metadata": {"class_package": "test"},
        "implementation_config": {
            "reader_config": {"read_data": {"class": "TestExtNode", "destinations": []}}
        },
    }
    config_full_path = {
        "metadata": {"class_package": "test/ext_node_example.py"},
        "implementation_config": {
            "reader_config": {"read_data": {"class": "TestExtNode", "destinations": []}}
        },
    }
    config_full_dot = {
        "metadata": {"class_package": "test"},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "TestExtNode",
                    "class_prefix": "ext_node_example",
                    "destinations": [],
                }
            }
        },
    }
    for config in [config_full_path, config_path, config_full_dot]:
        config = Configuration(
            config_location=None, is_dict_config=True, dict_config=config
        )
        assert config.config_string
        assert config.config_hash
        NodeFactory().unregister("TestExtNode")
def test_init_traverser_from_config():
    class TestTraverser(DagTraverser):
        def traversal_list(self):
            return []

        def run_section_by_section(self):
            return False

    TraverserFactory().register("TestTraverser", TestTraverser)

    config = {
        "metadata": {
            "traverser": "TestTraverser"
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    assert isinstance(runner.dag_traverser, TestTraverser)
def test_transform():
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader_left": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                },
                "myreader_right": {
                    "class": "CsvReader",
                    "filename": "test/merge_right3.csv",
                    "destinations": ["mypipeline"],
                },
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader_left",
                    "is_training": True,
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    data_object = DataObject(configuration)

    left_df = pd.read_csv("test/minimal.csv")
    reader_left = CsvReader(configuration, "myreader_left")
    data_object.add(reader_left, left_df)

    right_df = pd.read_csv("test/merge_right3.csv")
    reader_right = CsvReader(configuration, "myreader_right")
    data_object.add(reader_right, right_df)

    pipeline = DataFrameJoiner(configuration, "mypipeline")

    data_object, terminate = pipeline.run(data_object)

    assert not terminate

    joined_data = data_object.get(
        "mypipeline", pop_data=True, rtype=DataObjectResponseType.VALUE.value
    )
    assert joined_data.shape[0] == 2

    assert list(joined_data.T.to_dict().values())[0] == {
        "first": "joe",
        "last": "doe",
        "age": 47,
    }
    assert list(joined_data.T.to_dict().values())[1] == {
        "first": "mary",
        "last": "poppins",
        "age": 42,
    }
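# Illustrative plain-pandas equivalent of the join asserted above (a hedged
# sketch, not the DataFrameJoiner code path; right-hand columns inferred
# from the expected rows):
import pandas as pd

left = pd.read_csv("test/minimal.csv")        # columns: first, last
right = pd.read_csv("test/merge_right3.csv")  # columns: first, age (inferred)
assert left.merge(right, on="first").shape[0] == 2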
@pytest.fixture
def config():
    config = {
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "data/tennis.csv",
                    "destinations": ["transformers"],
                }
            },
            "pipeline_config": {
                "transformers": {
                    "class": "TransformerPipeline",
                    "transformer_sequence": [
                        {
                            "class": "primrose.transformers.strings.StringTransformer",
                            "method": "replace",
                            "columns": "outlook",
                            "pat": "sunny",
                            "repl": "rainy",
                        }
                    ],
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    return configuration
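# Hedged note: the transformer entry above appears to map method/columns/
# pat/repl onto pandas' string accessor, i.e. roughly
#
#     df["outlook"] = df["outlook"].str.replace("sunny", "rainy")
#
# (an illustration of the configured substitution, not necessarily the
# library's actual code path).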
def test_init_ok():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

    reader = CsvReader(configuration, "csv_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("csv_reader",
                         rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (2, 2)

    node_config = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }

    assert isinstance(CsvReader.necessary_config(node_config), set)
    assert len(CsvReader.necessary_config(node_config)) > 0
def run(config, dry_run=False):
    """Run a primrose job"""
    from primrose.configuration.configuration import Configuration
    from primrose.dag_runner import DagRunner

    configuration = Configuration(config_location=config)
    DagRunner(configuration).run(dry_run=dry_run)
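# Hypothetical invocation sketch (the path is illustrative): per
# test_init_error8 below, the config file must end in .json or .yml --
# e.g. run("config/my_dag.yml", dry_run=True)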
def test_run():
    config = {
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "RReader",
                    "dataset": "iris",
                    "destinations": []
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

    reader = RReader(configuration, "read_data")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("read_data", rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (150, 6)
    assert list(df.columns) == [
        "Sepal.Length",
        "Sepal.Width",
        "Petal.Length",
        "Petal.Width",
        "Species",
        "row_names",
    ]
def test_init_error8():
    with pytest.raises(ValueError) as e:
        Configuration("test/tennis.csv")
    assert (
        "config file at: test/tennis.csv has improper extension type - please use a .json or .yml file"
        in str(e)
    )
def test_run_bad():
    class TestWriterTmp(AbstractWriter):
        @staticmethod
        def necessary_config(node_config):
            return set([])

        def run(self, data_object):
            return data_object, False

    NodeFactory().register("TestWriterTmp", TestWriterTmp)

    config = {
        "implementation_config": {
            "writer_config": {
                "mywriter": {
                    "class": "TestWriterTmp",
                    "destinations": []
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    # unregister this class
    del NodeFactory().name_dict["TestWriterTmp"]

    with pytest.raises(Exception) as e:
        runner.run()
    assert "Issue instantiating mywriter and class TestWriterTmp" in str(e)
def test_run_bad2():
    class TestWriterTmp(AbstractWriter):
        @staticmethod
        def necessary_config(node_config):
            return set([])

        def run(self, data_object):
            raise Exception("Deliberate error")
            # return data_object, False

    NodeFactory().register("TestWriterTmp", TestWriterTmp)

    config = {
        "implementation_config": {
            "writer_config": {
                "mywriter": {
                    "class": "TestWriterTmp",
                    "destinations": []
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    with pytest.raises(Exception) as e:
        runner.run()
    assert "Deliberate error" in str(e)
def test_destinations_to_prune():
    config = {
        "implementation_config": {
            "reader_config": {
                "conditional_node": {
                    "class": "SimpleSwitch",
                    "path_to_travel": "left",
                    "destinations": ["left", "right"],
                }
            },
            "writer_config": {
                "left": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
                "right": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                },
            },
        }
    }

    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)
    node = SimpleSwitch(configuration, "conditional_node")
    to_prune = node.destinations_to_prune()
    assert to_prune == ["right"]
def test_filter_sequence4():
    config = {
        "metadata": {
            "section_run": ["writer_config"]
        },
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["recipe_csv_writer"],
                }
            },
            "writer_config": {
                "recipe_csv_writer": {
                    "class": "CsvWriter",
                    "key": "test_data",
                    "dir": "cache",
                    "filename": "unittest_similar_recipes.csv",
                }
            },
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    runner = DagRunner(configuration)

    sequence = runner.filter_sequence(["read_data", "recipe_csv_writer"])
    assert sequence == ["recipe_csv_writer"]
def test_run2():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)
    runner = DagRunner(configuration)
    with LogCapture() as log:
        runner.run(dry_run=True)
    log.check(
        ("root", "INFO", "Taking nodes to run from default"),
        (
            "root",
            "INFO",
            "DRY RUN 0: would run node csv_reader of type reader_config and class CsvReader",
        ),
        ("root", "INFO", "All done. Bye bye!"),
    )
def test_run_notification_error():
    config = {
        "metadata": {
            "section_registry": ["phase1"],
            "notify_on_error": {
                "client": "SlackClient",
                "channel": "some-channel",
                "token": "slack-api-token",
                "member_id": "optional-key",
            },
        },
        "implementation_config": {
            "phase1": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "bad/path.csv"
                }
            }
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)
    runner = DagRunner(configuration)

    mock_client = mock.Mock()
    mock_client.post_message = mock.Mock()

    path = "primrose.notification_utils.get_notification_client"
    with mock.patch(path, return_value=mock_client):
        with pytest.raises(Exception):
            runner.run()
        # assert outside the raises block, otherwise it never executes
        assert mock_client.post_message.call_count == 1
def test_init_pipeline():
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                }
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader",
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    pipeline = DataFrameJoiner(configuration, "mypipeline")
    ts = pipeline.init_pipeline()
    assert isinstance(ts, TransformerSequence)
def test_kwargs():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "kwargs": {
                        "header": None,
                        "sep": ":"
                    },
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)
    data_object = DataObject(configuration)

    reader = CsvReader(configuration, "csv_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("csv_reader",
                         rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (3, 1)
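# Hedged note: the "kwargs" block appears to be forwarded to pandas.read_csv.
# That explains the (3, 1) shape above -- with header=None the header row
# counts as data, and with sep=":" the comma-separated file parses as a
# single column:
import pandas as pd

assert pd.read_csv("test/minimal.csv", header=None, sep=":").shape == (3, 1)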
# Method lifted from a test class elsewhere in the suite: `self` and
# `config_dict_node_message` come from that class/module context.
def test_run_node(self):

    path = "primrose.notifications.success_notification.get_notification_client"
    with mock.patch(path) as get_client_mock:
        get_client_mock.return_value = mock.Mock()

        NodeFactory().register("SlackDataMock", SlackDataMock)

        config = Configuration(None,
                               is_dict_config=True,
                               dict_config=config_dict_node_message)
        data_object = DataObject(config)

        reader = SlackDataMock(config, "test_node")
        data_object = reader.run(data_object)

        success_instance = ClientNotification(
            configuration=config,
            instance_name="node_notification",
        )
        success_instance.client = get_client_mock.return_value

        success_instance.run(data_object)

        success_instance.client.post_message.assert_called_once_with(
            message="Node Success!")
def test_comments_in_json():
    # should not raise exception even though there are comments in the JSON
    config = Configuration(config_location="test/config_with_comments.json")
    assert list(config.config.keys()) == ["reader_config", "writer_config"]
    assert config.config["reader_config"]["read_data"]["class"] == "CsvReader"
    assert list(config.config["reader_config"]["read_data"]["destinations"]) == [
        "write_output"
    ]
def test_init_error7():
    config = {"junk": {}}
    with pytest.raises(Exception) as e:
        Configuration(config_location=None, is_dict_config=True, dict_config=config)
    assert (
        "Unsupported top-level key: junk. Supported keys are ['metadata', 'implementation_config']"
        in str(e)
    )
def test_init_error1():
    with pytest.raises(Exception) as e:
        Configuration(None)
    if sys.version_info[:2] == (3, 5):
        assert "stat: can't specify None for path argument" in str(e)
    else:
        assert (
            "stat: path should be string, bytes, os.PathLike or integer, not NoneType"
            in str(e)
        )
def test_run4():
    class TestWriter(AbstractFileWriter):
        def __init__(self, configuration, instance_name):
            pass

        @staticmethod
        def necessary_config(node_config):
            return set([])

        def run(self, data_object):
            terminate = True
            return data_object, terminate

    NodeFactory().register("TestWriter", TestWriter)

    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["csv_writer"],
                }
            },
            "writer_config": {
                "csv_writer": {
                    "class": "TestWriter"
                }
            },
        }
    }

    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)
    runner = DagRunner(configuration)

    with LogCapture() as log:
        runner.run(dry_run=False)
    log.check(
        ("root", "INFO", "Taking nodes to run from default"),
        (
            "root",
            "INFO",
            "received node csv_reader of type reader_config and class CsvReader",
        ),
        ("root", "INFO", "Reading test/minimal.csv from CSV"),
        (
            "root",
            "INFO",
            "received node csv_writer of type writer_config and class TestWriter",
        ),
        ("root", "INFO", "Terminating early due to signal from csv_writer"),
        ("root", "INFO", "All done. Bye bye!"),
    )
def test_nodes_of_type():
    config = {
        "metadata": {},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["decision_tree_model"],
                },
                "read_data2": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": ["decision_tree_model"],
                },
            },
            "model_config": {
                "decision_tree_model": {
                    "class": "SklearnClassifierModel",
                    "mode": "predict",
                    "sklearn_classifier_name": "tree.DecisionTreeClassifier",
                    "grid_search_scoring": "roc_auc",
                    "cv_folds": 3,
                    "model_parameters": {},
                    "destinations": ["write_output"],
                }
            },
            "writer_config": {
                "write_output": {
                    "class": "CsvWriter",
                    "key": "predictions",
                    "dir": "cache",
                    "filename": "hello_world_predictions.csv",
                }
            },
        },
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    nodes = configuration.dag.nodes_of_type(OperationType.reader)
    assert nodes == set(["read_data", "read_data2"])

    nodes = configuration.dag.nodes_of_type(OperationType.pipeline)
    assert nodes == set([])

    nodes = configuration.dag.upstream_nodes_of_type("write_output",
                                                     OperationType.reader)
    assert nodes == set(["read_data", "read_data2"])

    nodes = configuration.dag.upstream_nodes_of_type("write_output",
                                                     OperationType.cleanup)
    assert nodes == set([])
def test_sections_in_order2():
    config = {
        "metadata": {},
        "implementation_config": {
            "reader_config": {
                "read_data": {
                    "class": "CsvReader",
                    "filename": "test/tennis.csv",
                    "destinations": [],
                }
            },
            "writer_config": {},
        },
    }
    config = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    sections, source = config.sections_in_order()
    assert sections == ["reader_config", "writer_config"]
    assert source == "default"