def test_kwargs():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "kwargs": {"header": None, "sep": ":"},
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "csv_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (3, 1)
def test_init_ok():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "csv_reader")
    data_object, terminate = reader.run(data_object)
    assert not terminate
    df = data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert df is not None
    assert df.shape == (2, 2)

    node_config = {
        "class": "CsvReader",
        "filename": "test/minimal.csv",
        "destinations": [],
    }
    assert isinstance(CsvReader.necessary_config(node_config), set)
    assert len(CsvReader.necessary_config(node_config)) > 0
def test_transform():
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader_left": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                },
                "myreader_right": {
                    "class": "CsvReader",
                    "filename": "test/merge_right3.csv",
                    "destinations": ["mypipeline"],
                },
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "myreader_left",
                    "is_training": True,
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    left_df = pd.read_csv("test/minimal.csv")
    reader_left = CsvReader(configuration, "myreader_left")
    data_object.add(reader_left, left_df)

    right_df = pd.read_csv("test/merge_right3.csv")
    reader_right = CsvReader(configuration, "myreader_right")
    data_object.add(reader_right, right_df)

    pipeline = DataFrameJoiner(configuration, "mypipeline")
    data_object, terminate = pipeline.run(data_object)
    assert not terminate

    joined_data = data_object.get(
        "mypipeline", pop_data=True, rtype=DataObjectResponseType.VALUE.value
    )
    assert joined_data.shape[0] == 2
    assert list(joined_data.T.to_dict().values())[0] == {
        "first": "joe",
        "last": "doe",
        "age": 47,
    }
    assert list(joined_data.T.to_dict().values())[1] == {
        "first": "mary",
        "last": "poppins",
        "age": 42,
    }
def test_get_upstream_data4():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader1": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
                "csv_reader2": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader1 = CsvReader(configuration, "csv_reader1")
    reader2 = CsvReader(configuration, "csv_reader2")
    data_object.add(reader1, "data1")
    data_object.add(reader2, "data2")

    response = data_object.get_upstream_data("recipe_s3_writer")
    assert isinstance(response, dict)
    assert "csv_reader1" in response
    assert "csv_reader2" in response
    assert response["csv_reader1"][DataObject.DATA_KEY] == "data1"
    assert response["csv_reader2"][DataObject.DATA_KEY] == "data2"

    response = data_object.get_upstream_data("recipe_s3_writer")
    assert isinstance(response, dict)
    assert response["csv_reader1"][DataObject.DATA_KEY] == "data1"
    assert response["csv_reader2"][DataObject.DATA_KEY] == "data2"
def test_init_other_ok(config):
    config["implementation_config"]["writer_config"]["recipe_file_writer"][
        "filename"
    ] = "unittest_file_writer.other"
    config["implementation_config"]["writer_config"]["recipe_file_writer"][
        "serializer"
    ] = "other"
    test_data_string = "some test data"
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    requestor = CsvReader(configuration, "csv_reader")
    data_object.add(requestor, test_data_string, "test_data")
    writer = Serializer(configuration, "recipe_file_writer")

    c = configuration.config_for_instance("recipe_file_writer")
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    with pytest.raises(Exception, match=r"Unsupported"):
        writer.run(data_object)
def test_init_ok(config):
    test_data_string = "some test data"
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    requestor = CsvReader(configuration, "csv_reader")
    data_object.add(requestor, test_data_string, "test_data")
    writer = FileWriter(configuration, "recipe_file_writer")

    c = configuration.config_for_instance("recipe_file_writer")
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    data_object, terminate = writer.run(data_object)
    assert not terminate
    assert os.path.exists(filename)

    read_data = open(filename).read()
    assert test_data_string == read_data
def test_init_ok(config):
    corpus = pd.read_csv("test/minimal.csv")
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    writer = CsvWriter(configuration, "recipe_csv_writer")
    data_object = DataObject(configuration)
    requestor = CsvReader(configuration, "csv_reader")
    data_object.add(requestor, key="test_data", data=corpus)

    c = configuration.config_for_instance("recipe_csv_writer")
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    writer.run(data_object)
    assert os.path.exists(filename)

    df = pd.read_csv(filename)
    assert corpus.equals(df)
def data_obj(config):
    df = pd.read_csv("test/tennis.csv")
    data_object = DataObject(config)
    reader = CsvReader(config, "read_data")
    data_object.add(reader, df)
    encoder = EncodeTrainTestSplit(config, "encode_and_split")
    data_object, terminate = encoder.run(data_object)
    return data_object
def test_transform2():
    config = {
        "implementation_config": {
            "reader_config": {
                "myreader_left": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                },
                "myreader_right": {
                    "class": "CsvReader",
                    "filename": "test/merge_right3.csv",
                    "destinations": ["mypipeline"],
                },
            },
            "pipeline_config": {
                "mypipeline": {
                    "class": "DataFrameJoiner",
                    "join_key": ["first"],
                    "start_table": "JUNK",
                    "is_training": True,
                }
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)

    left_df = pd.read_csv("test/minimal.csv")
    reader_left = CsvReader(configuration, "myreader_left")

    right_df = pd.read_csv("test/merge_right3.csv")
    reader_right = CsvReader(configuration, "myreader_right")

    # note: deliberately swapping the order so that right is added first
    data_object.add(reader_right, right_df)
    data_object.add(reader_left, left_df)

    pipeline = DataFrameJoiner(configuration, "mypipeline")
    with pytest.raises(Exception) as e:
        pipeline.run(data_object)
    assert "Could not find start_table in upstream keys: JUNK" in str(e)
def test_get_upstream_data3(setup_vars):
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    response = data_object.get_upstream_data("recipe_s3_writer")
    assert isinstance(response, dict)
    assert DataObject.DATA_KEY in response
    assert response[DataObject.DATA_KEY] == data_to_save
def test_get(setup_vars):
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    with pytest.raises(Exception) as e:
        data_object.get("reader", rtype="junk")
    assert "Unrecognized rtype: junk" in str(e)
def test_add(setup_vars):
    """test value"""
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    response = data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
    assert response == data_to_save
def test_repr(setup_vars):
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    assert (
        str(data_object)
        == "DataObject:defaultdict(<class 'dict'>, {'csv_reader': {'data': 'TESTING'}})"
    )
def test_get4(setup_vars):
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_object.add(reader, key="k1", data="v1")
    data_object.add(reader, key="k2", data="v2")
    response = data_object.get(
        reader.instance_name, rtype=DataObjectResponseType.VALUE.value
    )
    assert isinstance(response, dict)
    assert len(response.keys()) == 2
def data_object_factory():
    df = pd.read_csv("test/tennis.csv")
    data_object = DataObject(configuration)
    csv_reader = CsvReader(configuration, "read_data")
    data_object.add(csv_reader, df)
    return data_object
def test_get_filtered_multiple_upstream_data():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader1": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
                "csv_reader2": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["recipe_s3_writer"],
                },
            },
            "writer_config": {
                "recipe_s3_writer": {
                    "class": "S3Writer",
                    "dir": "cache",
                    "key": "data",
                    "bucket_name": "does_not_exist_bucket_name",
                    "bucket_filename": "does_not_exist.csv",
                }
            },
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    reader1 = CsvReader(configuration, "csv_reader1")
    reader2 = CsvReader(configuration, "csv_reader2")
    data_object.add(reader1, "some_data_to_save")
    data_object.add(reader2, "some_data_to_save")

    data = data_object.get_filtered_upstream_data("recipe_s3_writer", "data")
    assert data == [{"data": "some_data_to_save"}, {"data": "some_data_to_save"}]
    assert isinstance(data, list)

    data = data_object.get_filtered_upstream_data("recipe_s3_writer", "JUNK")
    assert not data
def test_add4(setup_vars):
    """test add 2 items"""
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    data_object.add(reader, data_to_save, overwrite=True)
    with pytest.raises(Exception) as e:
        data_object.add(reader, data_to_save, overwrite=False)
    assert "Key already exists for csv_reader" in str(e)
def test_add2(setup_vars):
    """test key value"""
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    response = data_object.get(
        "csv_reader", rtype=DataObjectResponseType.KEY_VALUE.value
    )
    assert isinstance(response, dict)
    assert DataObject.DATA_KEY in response
    assert response[DataObject.DATA_KEY] == data_to_save
def test_get3(setup_vars):
    configuration, data_object = setup_vars
    assert len(data_object.data_dict.keys()) == 0

    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    assert len(data_object.data_dict.keys()) == 1

    data_object.get("csv_reader", pop_data=False)
    assert len(data_object.data_dict.keys()) == 1

    data_object.get("csv_reader", pop_data=True)
    assert len(data_object.data_dict.keys()) == 0
def test_create_data_object():
    filename = "dag_runner_create_data_object.pkl"

    # hack part 1: make sure this filename exists so that checks in Configuration pass
    open(filename, "w+")

    config = {
        "metadata": {
            "data_object": {
                "read_from_cache": True,
                "read_filename": "dag_runner_create_data_object.pkl",
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)

    # hack part 2: now get rid of it
    if os.path.exists(filename):
        os.remove(filename)

    # now write the actual object to restore from
    data_object = DataObject(configuration)
    writer = CsvReader(configuration, "csv_reader")
    data_object.add(writer, "some_data")
    data_object.write_to_cache(filename)
    assert os.path.exists(filename)

    # now we get to the code to test
    runner = DagRunner(configuration)
    restored_data_object = runner.create_data_object()

    # run some checks
    assert isinstance(restored_data_object, DataObject)
    assert (
        restored_data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
        == "some_data"
    )

    # cleanup
    if os.path.exists(filename):
        os.remove(filename)
def test_add3(setup_vars):
    """test instance key value"""
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)
    response = data_object.get(
        "csv_reader", rtype=DataObjectResponseType.INSTANCE_KEY_VALUE.value
    )
    assert isinstance(response, dict)
    assert "csv_reader" in response
    assert 1 == len(list(response.keys()))
    assert response["csv_reader"][DataObject.DATA_KEY] == data_to_save
def test_concatenate_data(pipeline_obj, configuration):
    df1 = pd.read_csv("test/tennis.csv")
    df2 = pd.read_csv("test/tennis.csv")
    data_object = DataObject(configuration)
    csv_reader = CsvReader(configuration, "read_data")
    data_object.add(csv_reader, df1, "query1")
    data_object.add(csv_reader, df2, "query2")
    data_object, terminate = pipeline_obj.run(data_object)
    encoded_data = data_object.get("encode_and_split")["data_train"]
    assert len(encoded_data) == 18
def test_upstream_keys(setup_vars):
    configuration, data_object = setup_vars
    reader = CsvReader(configuration, "csv_reader")
    data_to_save = "TESTING"
    data_object.add(reader, data_to_save)

    keys = data_object.upstream_keys("recipe_s3_writer")
    assert keys == ["csv_reader"]

    keys = data_object.upstream_keys(
        "recipe_s3_writer", operation_type_filter=OperationType.reader
    )
    assert keys == ["csv_reader"]

    keys = data_object.upstream_keys(
        "recipe_s3_writer", operation_type_filter=OperationType.writer
    )
    assert keys == []
def test_cache_data_object():
    config = {
        "metadata": {
            "data_object": {
                "write_to_cache": True,
                "write_filename": "dag_runner_test_cache_data_object.pkl",
            }
        },
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        },
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    writer = CsvReader(configuration, "csv_reader")
    data_object.add(writer, "some_data")

    runner = DagRunner(configuration)

    filename = "dag_runner_test_cache_data_object.pkl"
    if os.path.exists(filename):
        os.remove(filename)

    cached = runner.cache_data_object(data_object)
    assert cached
    assert os.path.exists(filename)

    if os.path.exists(filename):
        os.remove(filename)
def test_caching():
    config = {
        "implementation_config": {
            "reader_config": {
                "csv_reader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": [],
                }
            }
        }
    }
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    writer = CsvReader(configuration, "csv_reader")
    data_object.add(writer, "some_data")

    filename = "test_data_object_cache.pkl"
    if os.path.exists(filename):
        os.remove(filename)

    data_object.write_to_cache(filename)
    assert os.path.exists(filename)

    restored_data_object = DataObject.read_from_cache(filename)
    assert isinstance(restored_data_object, DataObject)
    assert (
        restored_data_object.get("csv_reader", rtype=DataObjectResponseType.VALUE.value)
        == "some_data"
    )

    if os.path.exists(filename):
        os.remove(filename)
def test_init_pickle_ok(config):
    config["implementation_config"]["writer_config"]["recipe_file_writer"][
        "filename"
    ] = "unittest_file_writer.pickle"
    config["implementation_config"]["writer_config"]["recipe_file_writer"][
        "serializer"
    ] = "pickle"
    test_data_string = "some test data"
    configuration = Configuration(None, is_dict_config=True, dict_config=config)
    data_object = DataObject(configuration)
    requestor = CsvReader(configuration, "csv_reader")
    data_object.add(requestor, test_data_string, "test_data")
    writer = Serializer(configuration, "recipe_file_writer")

    c = configuration.config_for_instance("recipe_file_writer")
    filename = c["dir"] + os.path.sep + c["filename"]

    # clean out test file location
    if os.path.exists(filename):
        os.remove(filename)

    data_object, terminate = writer.run(data_object)
    assert not terminate
    assert os.path.exists(filename)

    read_data = pickle.load(open(filename, "rb"))
    assert test_data_string == read_data
def test_run():
    class TestPipeline(AbstractPipeline):
        def transform(self, data_object):
            logging.info("TRANSFORM CALLED")
            return data_object

        def fit_transform(self, data_object):
            logging.info("FIT_TRANSFORM CALLED")
            return self.transform(data_object)

        @staticmethod
        def necessary_config(node_config):
            return set(["is_training"])

    NodeFactory().register("TestPipeline", TestPipeline)

    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                }
            },
            "pipeline_config": {
                "mypipeline": {"class": "TestPipeline", "is_training": True}
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    reference_file_path = "test/minimal.csv"
    corpus = pd.read_csv(reference_file_path)
    reader = CsvReader(configuration, "myreader")
    data_object = DataObject(configuration)
    data_object.add(reader, corpus)

    pipeline = TestPipeline(configuration, "mypipeline")
    with LogCapture() as l:
        pipeline.run(data_object)
    l.check(
        (
            "root",
            "INFO",
            "No upstream TransformerSequence found. Creating new TransformerSequence...",
        ),
        ("root", "INFO", "FIT_TRANSFORM CALLED"),
        ("root", "INFO", "TRANSFORM CALLED"),
    )

    data_object.add(reader, TransformerSequence(), "tsequence")
    with LogCapture() as l:
        pipeline.run(data_object)
    l.check(
        (
            "root",
            "INFO",
            "Upstream TransformerSequence found, initializing pipeline...",
        ),
        ("root", "INFO", "FIT_TRANSFORM CALLED"),
        ("root", "INFO", "TRANSFORM CALLED"),
    )

    config["implementation_config"]["pipeline_config"]["mypipeline"][
        "is_training"
    ] = False
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    reader = CsvReader(configuration, "myreader")
    data_object = DataObject(configuration)
    data_object.add(reader, corpus)
    pipeline = TestPipeline(configuration, "mypipeline")
    with LogCapture() as l:
        pipeline.run(data_object)
    l.check(
        (
            "root",
            "INFO",
            "No upstream TransformerSequence found. Creating new TransformerSequence...",
        ),
        ("root", "INFO", "TRANSFORM CALLED"),
    )
def test_run():
    class TestModel(AbstractModel):
        @staticmethod
        def necessary_config(node_config):
            return set(["mode"])

        def train_model(self, data_object):
            logging.info("TRAIN called")
            return data_object

        def eval_model(self, data_object):
            logging.info("EVAL called")
            return data_object

        def predict(self, data_object):
            logging.info("PREDICT called")
            return data_object

    NodeFactory().register("TestModel", TestModel)

    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mymodel"],
                }
            },
            "model_config": {"mymodel": {"class": "TestModel", "mode": "train"}},
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "myreader")
    df = pd.read_csv("test/minimal.csv")
    data_object.add(reader, df)

    model = TestModel(configuration, "mymodel")
    with LogCapture() as l:
        model.run(data_object)
    l.check(
        ("root", "INFO", "TRAIN called"),
        ("root", "INFO", "EVAL called"),
        ("root", "INFO", "PREDICT called"),
    )

    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mymodel"],
                }
            },
            "model_config": {"mymodel": {"class": "TestModel", "mode": "eval"}},
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )
    data_object = DataObject(configuration)
    reader = CsvReader(configuration, "myreader")
    data_object.add(reader, df)

    model = TestModel(configuration, "mymodel")
    with LogCapture() as l:
        model.run(data_object)
    l.check(("root", "INFO", "EVAL called"), ("root", "INFO", "PREDICT called"))
def data_object(config):
    data_object = DataObject(config)
    csv_reader = CsvReader(config, "read_data")
    data_object, _ = csv_reader.run(data_object)
    return data_object
def test_execute_pipeline():
    class TestTransformer(AbstractTransformer):
        def fit(self, data):
            logging.info("Transfer FIT CALLED")

        def transform(self, data):
            logging.info("Transfer TRANSFORM CALLED")
            return data

        def fit_transform(self, data):
            logging.info("Transfer FIT_TRANSFORM CALLED")
            self.fit(data)
            return self.transform(data)

    class TestPipeline2(AbstractPipeline):
        def transform(self, data_object):
            return data_object

        @staticmethod
        def necessary_config(node_config):
            return set(["is_training"])

    NodeFactory().register("TestPipeline2", TestPipeline2)

    config = {
        "implementation_config": {
            "reader_config": {
                "myreader": {
                    "class": "CsvReader",
                    "filename": "test/minimal.csv",
                    "destinations": ["mypipeline"],
                }
            },
            "pipeline_config": {
                "mypipeline": {"class": "TestPipeline", "is_training": True}
            },
        }
    }
    configuration = Configuration(
        config_location=None, is_dict_config=True, dict_config=config
    )

    reference_file_path = "test/minimal.csv"
    corpus = pd.read_csv(reference_file_path)
    reader = CsvReader(configuration, "myreader")
    data_object = DataObject(configuration)
    data_object.add(reader, corpus)

    sequence = TransformerSequence()
    sequence.add(TestTransformer())
    data_object.add(reader, sequence, "tsequence")

    pipeline = TestPipeline2(configuration, "mypipeline")

    with pytest.raises(Exception) as e:
        pipeline.execute_pipeline(corpus, PipelineModeType.FIT)
    assert "run() must be called to extract/create a TransformerSequence" in str(e)

    pipeline.run(data_object)

    with pytest.raises(Exception) as e:
        pipeline.execute_pipeline(corpus, "JUNK")
    assert "mode must be of type PipelineModeType Enum object." in str(e)

    with LogCapture() as l:
        pipeline.execute_pipeline(corpus, PipelineModeType.FIT)
    l.check(("root", "INFO", "Transfer FIT CALLED"))

    with LogCapture() as l:
        pipeline.execute_pipeline(corpus, PipelineModeType.FIT_TRANSFORM)
    l.check(
        ("root", "INFO", "Transfer FIT_TRANSFORM CALLED"),
        ("root", "INFO", "Transfer FIT CALLED"),
        ("root", "INFO", "Transfer TRANSFORM CALLED"),
    )