def test_init_ok_pickle():
    config = {
        "implementation_config": {
            "reader_config": {
                "pickle_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader = Deserializer(configuration, "pickle_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate

    data = data_object.get("pickle_reader", rtype=DataObjectResponseType.VALUE.value)
    assert data is not None
    assert set(data.keys()) == {"test", "model"}
    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
def data_obj(config):
    df = pd.read_csv("test/tennis.csv")
    data_object = DataObject(config)
    reader = CsvReader(config, "read_data")
    data_object.add(reader, df)
    encoder = EncodeTrainTestSplit(config, "encode_and_split")
    data_object, terminate = encoder.run(data_object)
    return data_object
def test_run(monkeypatch):
    config = {
        "implementation_config": {
            "reader_config": {
                "mynode": {
                    "class": "PostgresReader",
                    "query_json": [{"query": "test/test_mysql.sql"}],
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)

    keys = [
        "POSTGRES_HOST",
        "POSTGRES_PORT",
        "POSTGRES_DB",
        "POSTGRES_USER",
        "POSTGRES_PASS",
    ]
    for i, k in enumerate(keys):
        os.environ[k] = str(i)

    reader = PostgresReader(configuration, "mynode")

    with patch("psycopg2.connect") as mock_connect:

        def fake_df(query, con):
            return pd.DataFrame({
                "Name": ["Tom", "nick", "krish", "jack"],
                "Age": [20, 21, 19, 18],
            })

        monkeypatch.setattr(pd, "read_sql", fake_df)

        data_object, terminate = reader.run(data_object)
        assert not terminate

        dd = data_object.get("mynode", rtype=DataObjectResponseType.KEY_VALUE.value)
        assert "query_0" in dd
        df = dd["query_0"]
        assert list(df.T.to_dict().values())[0] == {"Name": "Tom", "Age": 20}
def test_run_node(self):
    path = "primrose.notifications.success_notification.get_notification_client"
    with mock.patch(path) as get_client_mock:
        get_client_mock.return_value = mock.Mock()
        NodeFactory().register("SlackDataMock", SlackDataMock)

        config = Configuration(None, is_dict_config=True, dict_config=config_dict_node_message)
        data_object = DataObject(config)
        reader = SlackDataMock(config, "test_node")
        data_object = reader.run(data_object)

        success_instance = ClientNotification(
            configuration=config,
            instance_name="node_notification",
        )
        success_instance.client = get_client_mock.return_value
        success_instance.run(data_object)
        success_instance.client.post_message.assert_called_once_with(message="Node Success!")
def test_run_pickle(monkeypatch):
    # returns 2 objects from the pickle reader
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "GcsDeserializer",
                    "bucket_name": "test1",
                    "blob_name": "test2",
                    "deserializer": "pickle",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None, is_dict_config=True, dict_config=config)
    reader = GcsDeserializer(configuration, "myreader")
    data_object = DataObject(configuration)

    def fake_blobs():
        with open("test/pickle_reader_blob1.pkl", "wb") as pickle_file:
            pickle.dump("some_data", pickle_file)
        with open("test/pickle_reader_blob2.pkl", "wb") as pickle_file:
            pickle.dump("some_other_data", pickle_file)
        with open("test/pickle_reader_blob1.pkl", "rb") as f:
            dat1 = f.read()
        with open("test/pickle_reader_blob2.pkl", "rb") as f:
            dat2 = f.read()
        return [dat1, dat2]

    monkeypatch.setattr(reader, "download_blobs_as_strings", fake_blobs)

    reader_object, terminate = reader.run(data_object)
    assert not terminate

    assert "myreader" in reader_object.data_dict
    dat = reader_object.data_dict["myreader"]
    assert "reader_data" in dat
    datlist = dat["reader_data"]
    assert len(datlist) == 2
    assert "some_data" in datlist
    assert "some_other_data" in datlist

    files = ["test/pickle_reader_blob1.pkl", "test/pickle_reader_blob2.pkl"]
    for f in files:
        if os.path.exists(f):
            os.remove(f)
def test_cache_data_object():
    config = {
        "metadata": {
            "data_object": {
                "write_to_cache": True,
                "write_filename": "dag_runner_test_cache_data_object.pkl",
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    writer = CsvReader(configuration, "csv_reader")
    data_object.add(writer, "some_data")

    runner = DagRunner(configuration)

    filename = "dag_runner_test_cache_data_object.pkl"
    if os.path.exists(filename):
        os.remove(filename)

    cached = runner.cache_data_object(data_object)
    assert cached
    assert os.path.exists(filename)

    if os.path.exists(filename):
        os.remove(filename)
def create_data_object(self):
    """Restore the DataObject from cache if configured to do so; otherwise create a new one.

    Returns:
        data_object (DataObject)
    """
    if self.configuration.config_metadata and "data_object" in self.configuration.config_metadata:
        cfg = self.configuration.config_metadata["data_object"]
        if "read_from_cache" in cfg and cfg["read_from_cache"]:
            # we can assume that read_filename exists due to the Configuration checks
            filename = cfg["read_filename"]
            assert os.path.exists(filename)
            logging.info("Reading DataObject from cache " + filename)
            return DataObject.read_from_cache(filename)
    data_object = DataObject(self.configuration)
    return data_object
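# A minimal sketch (not part of the library) of the metadata block that drives the
# cache-restore branch above. The key names mirror those used in the tests in this
# file; the surrounding pipeline wiring and reader section are assumed for illustration.
example_cache_metadata = {
    "metadata": {
        "data_object": {
            "read_from_cache": True,  # tells create_data_object() to restore from disk
            "read_filename": "cached_data_object.pkl",  # must exist; enforced by Configuration checks
        }
    },
    "implementation_config": {
        "reader_config": {}  # placeholder; a real config would declare reader nodes here
    },
}
# With this metadata, create_data_object() would return
# DataObject.read_from_cache("cached_data_object.pkl") instead of a fresh DataObject.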
def test_init_ok():
    config = {
        "implementation_config": {
            "reader_config": {
                "dill_reader": {
                    "class": "DillReader",
                    "filename": "test/tinymodel.dill",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader = DillReader(configuration, "dill_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate

    data = data_object.get("dill_reader", rtype=DataObjectResponseType.VALUE.value)
    assert data is not None
    assert set(data.keys()) == {"test", "model"}

    node_config = {
        "class": "DillReader",
        "filename": "test/tinymodel.dill",
        "destinations": [],
    }
    assert isinstance(DillReader.necessary_config(node_config), set)
    assert len(DillReader.necessary_config(node_config)) > 0

    assert data["test"] == [1, 2, 3]
    assert isinstance(data["model"], DecisionTreeClassifier)
def test_run_dill_2(monkeypatch):
    # returns 1 object from the dill reader
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "GcsDillReader",
                    "bucket_name": "test1",
                    "blob_name": "test2",
                    "deserializer": "dill",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None, is_dict_config=True, dict_config=config)
    reader = GcsDeserializer(configuration, "myreader")
    data_object = DataObject(configuration)

    def fake_blobs():
        with open("test/dill_reader_blob1.pkl", "wb") as dill_file:
            dill.dump("some_data", dill_file)
        with open("test/dill_reader_blob1.pkl", "rb") as f:
            dat1 = f.read()
        return [dat1]

    monkeypatch.setattr(reader, "download_blobs_as_strings", fake_blobs)

    reader_object, terminate = reader.run(data_object)
    assert not terminate

    assert "myreader" in reader_object.data_dict
    dat = reader_object.data_dict["myreader"]
    assert "reader_data" in dat
    data = dat["reader_data"]
    assert data == "some_data"

    files = ["test/dill_reader_blob1.pkl"]
    for f in files:
        if os.path.exists(f):
            os.remove(f)
def test_create_data_object():
    filename = "dag_runner_create_data_object.pkl"

    # hack part 1: make sure this filename exists so that the checks in Configuration pass
    open(filename, "w+").close()

    config = {
        "metadata": {
            "data_object": {
                "read_from_cache": True,
                "read_filename": "dag_runner_create_data_object.pkl",
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)

    # hack part 2: now get rid of the placeholder file
    if os.path.exists(filename):
        os.remove(filename)

    # now write the actual object to restore from
    data_object = DataObject(configuration)
    writer = CsvReader(configuration, "csv_reader")
    data_object.add(writer, "some_data")
    data_object.write_to_cache(filename)
    assert os.path.exists(filename)

    # now we get to the code under test
    runner = DagRunner(configuration)
    restored_data_object = runner.create_data_object()

    # run some checks
    assert isinstance(restored_data_object, DataObject)
    assert (
        restored_data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
        == "some_data"
    )

    # cleanup
    if os.path.exists(filename):
        os.remove(filename)
def test_get_upstream_data4():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader1": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
                "csv_reader2": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader1 = CsvReader(configuration, "csv_reader1")
    reader2 = CsvReader(configuration, "csv_reader2")
    data_object.add(reader1, "data1")
    data_object.add(reader2, "data2")

    response = data_object.get_upstream_data("recipe_s3_writer")
    assert isinstance(response, dict)
    assert "csv_reader1" in response
    assert "csv_reader2" in response
    assert response["csv_reader1"][DataObject.DATA_KEY] == "data1"
    assert response["csv_reader2"][DataObject.DATA_KEY] == "data2"

    # a second call should return the same upstream data
    response = data_object.get_upstream_data("recipe_s3_writer")
    assert isinstance(response, dict)
    assert response["csv_reader1"][DataObject.DATA_KEY] == "data1"
    assert response["csv_reader2"][DataObject.DATA_KEY] == "data2"
def test_init_ok_unsupported():
    config = {
        "implementation_config": {
            "reader_config": {
                "other_reader": {
                    "class": "Deserializer",
                    "filename": "test/tinymodel.pickle",
                    "deserializer": "other",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader = Deserializer(configuration, "other_reader")
    with pytest.raises(Exception, match=r"Unsupported"):
        reader.run(data_object)
def setup_vars():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                }
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    return configuration, data_object
def test_run_other(monkeypatch):
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "GcsDillReader",
                    "bucket_name": "test1",
                    "blob_name": "test2",
                    "deserializer": "other",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(config_location=None, is_dict_config=True, dict_config=config)
    reader = GcsDeserializer(configuration, "myreader")
    data_object = DataObject(configuration)
    with pytest.raises(Exception, match=r"Unsupported"):
        reader.run(data_object)
def test_get_filtered_multiple_upstream_data():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader1": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
                "csv_reader2": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader1 = CsvReader(configuration, "csv_reader1")
    reader2 = CsvReader(configuration, "csv_reader2")
    data_object.add(reader1, "some_data_to_save")
    data_object.add(reader2, "some_data_to_save")

    data = data_object.get_filtered_upstream_data("recipe_s3_writer", "data")
    assert data == [{"data": "some_data_to_save"}, {"data": "some_data_to_save"}]
    assert isinstance(data, list)

    data = data_object.get_filtered_upstream_data("recipe_s3_writer", "JUNK")
    assert not data
def test_caching():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    writer = CsvReader(configuration, "csv_reader")
    data_object.add(writer, "some_data")

    filename = "test_data_object_cache.pkl"
    if os.path.exists(filename):
        os.remove(filename)

    data_object.write_to_cache(filename)
    assert os.path.exists(filename)

    restored_data_object = DataObject.read_from_cache(filename)
    assert isinstance(restored_data_object, DataObject)
    assert (
        restored_data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
        == "some_data"
    )

    if os.path.exists(filename):
        os.remove(filename)