Example #1
0
def test_init_ok_pickle():
    """Deserializer configured for pickle should load the tiny test model."""
    node_name = "pickle_reader"
    config = {
        "implementation_config": {
            "reader_config": {
                node_name: {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    reader = Deserializer(configuration, node_name)
    data_object, terminate = reader.run(DataObject(configuration))
    assert not terminate

    data = data_object.get(node_name, rtype=DataObjectResponseType.VALUE.value)

    # the pickled payload holds a plain list plus a fitted sklearn model
    assert data is not None
    assert set(data.keys()) == {"test", "model"}
    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
def data_obj(config):
    """Return a data object seeded with the tennis CSV and run through encode/split."""
    frame = pd.read_csv("test/tennis.csv")
    obj = DataObject(config)
    obj.add(CsvReader(config, "read_data"), frame)
    encoder = EncodeTrainTestSplit(config, "encode_and_split")
    obj, _terminate = encoder.run(obj)
    return obj
Example #3
0
def test_run(monkeypatch):
    """PostgresReader.run should store one DataFrame per configured query.

    psycopg2's connect and pandas.read_sql are both stubbed so no real
    database is needed.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "mynode": {
                    "class": "PostgresReader",
                    "query_json": [{
                        "query": "test/test_mysql.sql"
                    }],
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)

    keys = [
        "POSTGRES_HOST",
        "POSTGRES_PORT",
        "POSTGRES_DB",
        "POSTGRES_USER",
        "POSTGRES_PASS",
    ]
    # Use monkeypatch.setenv (not direct os.environ assignment) so the
    # variables are automatically restored after this test instead of
    # leaking into the rest of the test session.
    for i, k in enumerate(keys):
        monkeypatch.setenv(k, str(i))

    reader = PostgresReader(configuration, "mynode")

    with patch("psycopg2.connect"):

        def fake_df(query, con):
            # Stand-in for pd.read_sql: a tiny deterministic frame.
            return pd.DataFrame({
                "Name": ["Tom", "nick", "krish", "jack"],
                "Age": [20, 21, 19, 18]
            })

        monkeypatch.setattr(pd, "read_sql", fake_df)

        data_object, terminate = reader.run(data_object)

        assert not terminate

        dd = data_object.get("mynode",
                             rtype=DataObjectResponseType.KEY_VALUE.value)
        # results are keyed "query_<index>" in config order
        assert "query_0" in dd
        df = dd["query_0"]
        assert list(df.T.to_dict().values())[0] == {"Name": "Tom", "Age": 20}
    def test_run_node(self):
        """A node-level Slack notification should post exactly one success message."""
        path = "primrose.notifications.success_notification.get_notification_client"
        with mock.patch(path) as get_client_mock:
            get_client_mock.return_value = mock.Mock()

            NodeFactory().register("SlackDataMock", SlackDataMock)

            config = Configuration(
                None, is_dict_config=True, dict_config=config_dict_node_message
            )
            data_object = DataObject(config)
            data_object = SlackDataMock(config, "test_node").run(data_object)

            notifier = ClientNotification(
                configuration=config,
                instance_name="node_notification",
            )
            # swap in the mocked Slack client so no network call happens
            notifier.client = get_client_mock.return_value
            notifier.run(data_object)

            notifier.client.post_message.assert_called_once_with(
                message="Node Success!")
Example #5
0
def test_run_pickle(monkeypatch):
    """GcsDeserializer should unpickle both downloaded blobs into reader_data.

    download_blobs_as_strings is stubbed to return two genuine pickled
    payloads, so no GCS access is needed.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "GcsDeserializer",
                    "bucket_name": "test1",
                    "blob_name": "test2",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    reader = GcsDeserializer(configuration, "myreader")

    data_object = DataObject(configuration)

    blob_paths = ["test/pickle_reader_blob1.pkl", "test/pickle_reader_blob2.pkl"]

    def fake_blobs():
        # Round-trip two payloads through real pickle files so the reader
        # receives genuine pickled bytes. Context managers close every
        # handle (the original left the two read handles open).
        for path, payload in zip(blob_paths, ("some_data", "some_other_data")):
            with open(path, "wb") as pickle_file:
                pickle.dump(payload, pickle_file)
        blobs = []
        for path in blob_paths:
            with open(path, "rb") as handle:
                blobs.append(handle.read())
        return blobs

    monkeypatch.setattr(reader, "download_blobs_as_strings", fake_blobs)

    reader_object, terminate = reader.run(data_object)

    assert not terminate

    assert "myreader" in reader_object.data_dict

    dat = reader_object.data_dict["myreader"]

    assert "reader_data" in dat
    datlist = dat["reader_data"]

    # multiple blobs are stored as a list of deserialized objects
    assert len(datlist) == 2
    assert "some_data" in datlist
    assert "some_other_data" in datlist

    # cleanup the temporary pickle files
    for f in blob_paths:
        if os.path.exists(f):
            os.remove(f)
def test_cache_data_object():
    """DagRunner.cache_data_object should pickle the data object to disk."""
    cache_filename = "dag_runner_test_cache_data_object.pkl"
    config = {
        "metadata": {
            "data_object": {
                "write_to_cache": True,
                "write_filename": cache_filename,
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "csv_reader")
    data_object.add(reader, "some_data")

    runner = DagRunner(configuration)

    # start from a clean slate
    if os.path.exists(cache_filename):
        os.remove(cache_filename)

    assert runner.cache_data_object(data_object)
    assert os.path.exists(cache_filename)

    # cleanup
    if os.path.exists(cache_filename):
        os.remove(cache_filename)
Example #7
0
    def create_data_object(self):
        """Build a fresh DataObject, or restore one from cache when configured.

        Returns:
            data_object (DataObject)

        """
        metadata = self.configuration.config_metadata
        if metadata and "data_object" in metadata:
            cfg = metadata["data_object"]

            if cfg.get("read_from_cache"):
                # we can assume that read_filename exists due to configuration checks
                filename = cfg["read_filename"]
                assert os.path.exists(filename)

                logging.info("Reading DataObject from cache " + filename)
                return DataObject.read_from_cache(filename)

        return DataObject(self.configuration)
Example #8
0
def test_init_ok():
    """DillReader should load the tiny dill model and declare its config needs."""
    config = {
        "implementation_config": {
            "reader_config": {
                "dill_reader": {
                    "class": "DillReader",
                    "filename": "test/tinymodel.dill",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    reader = DillReader(configuration, "dill_reader")
    data_object, terminate = reader.run(DataObject(configuration))
    assert not terminate

    data = data_object.get("dill_reader", rtype=DataObjectResponseType.VALUE.value)
    assert data is not None
    assert set(data.keys()) == {"test", "model"}

    # the node class must advertise a non-empty set of required config keys
    node_config = {
        "class": "DillReader",
        "filename": "test/tinymodel.dill",
        "destinations": [],
    }
    required = DillReader.necessary_config(node_config)
    assert isinstance(required, set)
    assert len(required) > 0

    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
Example #9
0
def test_run_dill_2(monkeypatch):
    """GcsDeserializer with a single dill blob should store the bare object.

    With exactly one blob the deserialized value is stored directly under
    "reader_data" rather than wrapped in a list.
    """
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "GcsDillReader",
                    "bucket_name": "test1",
                    "blob_name": "test2",
                    "deserializer": "dill",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None,
                                  is_dict_config=True,
                                  dict_config=config)

    reader = GcsDeserializer(configuration, "myreader")

    data_object = DataObject(configuration)

    blob_path = "test/dill_reader_blob1.pkl"

    def fake_blobs():
        # Write one dill payload and return its raw bytes. Context managers
        # close both handles (the original left the read handle open).
        with open(blob_path, "wb") as dill_file:
            dill.dump("some_data", dill_file)
        with open(blob_path, "rb") as handle:
            return [handle.read()]

    monkeypatch.setattr(reader, "download_blobs_as_strings", fake_blobs)

    reader_object, terminate = reader.run(data_object)

    assert not terminate

    assert "myreader" in reader_object.data_dict

    dat = reader_object.data_dict["myreader"]

    assert "reader_data" in dat
    assert dat["reader_data"] == "some_data"

    # cleanup the temporary dill file
    if os.path.exists(blob_path):
        os.remove(blob_path)
def test_create_data_object():
    """DagRunner.create_data_object should restore a DataObject from cache.

    Configuration validates that read_filename exists, so a placeholder file
    is created first, then replaced by a real cached object.
    """
    filename = "dag_runner_create_data_object.pkl"
    # hack part 1: make sure this filename exists so that checks in
    # Configuration pass. Context manager closes the handle immediately
    # (the original leaked an open file handle here).
    with open(filename, "w+"):
        pass

    config = {
        "metadata": {
            "data_object": {
                "read_from_cache": True,
                "read_filename": "dag_runner_create_data_object.pkl",
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None,
                                  is_dict_config=True,
                                  dict_config=config)

    # hack part 2: now get rid of it
    if os.path.exists(filename):
        os.remove(filename)

    # now write the actual object to restore from
    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "csv_reader")
    data_object.add(reader, "some_data")
    data_object.write_to_cache(filename)
    assert os.path.exists(filename)

    # now we get to the code to test
    runner = DagRunner(configuration)
    restored_data_object = runner.create_data_object()

    # run some checks
    assert isinstance(restored_data_object, DataObject)
    assert (restored_data_object.get(
        "csv_reader", rtype=DataObjectResponseType.VALUE.value) == "some_data")

    # cleanup
    if os.path.exists(filename):
        os.remove(filename)
def test_get_upstream_data4():
    """get_upstream_data should merge payloads from both upstream readers."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader1": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
                "csv_reader2": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }

    configuration = Configuration(None, is_dict_config=True, dict_config=config)

    data_object = DataObject(configuration)

    for node_name, payload in (("csv_reader1", "data1"), ("csv_reader2", "data2")):
        data_object.add(CsvReader(configuration, node_name), payload)

    # fetch twice: the second call must return the same data (not consumed)
    for _ in range(2):
        response = data_object.get_upstream_data("recipe_s3_writer")
        assert isinstance(response, dict)
        assert "csv_reader1" in response
        assert "csv_reader2" in response
        assert response["csv_reader1"][DataObject.DATA_KEY] == "data1"
        assert response["csv_reader2"][DataObject.DATA_KEY] == "data2"
Example #12
0
def test_init_ok_unsupported():
    """An unknown deserializer name should make Deserializer.run raise."""
    config = {
        "implementation_config": {
            "reader_config": {
                "other_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "other",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    reader = Deserializer(configuration, "other_reader")
    with pytest.raises(Exception, match=r"Unsupported"):
        reader.run(data_object)
def setup_vars():
    """Return a (configuration, data_object) pair wired reader -> S3 writer."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                }
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    return configuration, DataObject(configuration)
Example #14
0
def test_run_other(monkeypatch):
    """An unsupported GCS deserializer name should make run raise."""
    # monkeypatch is unused here but kept: it is a pytest fixture parameter.
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "GcsDillReader",
                    "bucket_name": "test1",
                    "blob_name": "test2",
                    "deserializer": "other",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    reader = GcsDeserializer(configuration, "myreader")

    with pytest.raises(Exception, match=r"Unsupported"):
        reader.run(DataObject(configuration))
def test_get_filtered_multiple_upstream_data():
    """Filtering upstream data by key should return one entry per upstream node."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader1": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
                "csv_reader2": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }

    configuration = Configuration(None, is_dict_config=True, dict_config=config)

    data_object = DataObject(configuration)

    for node_name in ("csv_reader1", "csv_reader2"):
        data_object.add(CsvReader(configuration, node_name), "some_data_to_save")

    # a matching filter key yields one dict per upstream reader
    data = data_object.get_filtered_upstream_data("recipe_s3_writer", "data")
    assert isinstance(data, list)
    assert data == [{"data": "some_data_to_save"}, {"data": "some_data_to_save"}]

    # a non-matching filter key yields nothing
    data = data_object.get_filtered_upstream_data("recipe_s3_writer", "JUNK")
    assert not data
def test_caching():
    """A DataObject written to cache should round-trip through read_from_cache."""
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)

    reader = CsvReader(configuration, "csv_reader")
    data_object.add(reader, "some_data")

    cache_file = "test_data_object_cache.pkl"
    # start from a clean slate
    if os.path.exists(cache_file):
        os.remove(cache_file)

    data_object.write_to_cache(cache_file)
    assert os.path.exists(cache_file)

    restored = DataObject.read_from_cache(cache_file)
    assert isinstance(restored, DataObject)
    assert (
        restored.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
        == "some_data"
    )

    # cleanup
    if os.path.exists(cache_file):
        os.remove(cache_file)