def test_resolve_missing_condition_definition(self):
    exported = DataFlow.auto(add).export(linked=True)
    del exported["definitions"]["is_add"]
    with self.assertRaisesRegex(
        DefinitionMissing, "add.conditions.*is_add"
    ):
        DataFlow._fromdict(**exported)
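# Round-trip sketch for the linked export used above: with the
# "definitions" table left intact, _fromdict re-links conditions and
# loads cleanly. The operation key below assumes the same
# tests.test_df:add operation exercised by the surrounding tests.
def test_linked_export_round_trip(self):
    exported = DataFlow.auto(add).export(linked=True)
    dataflow = DataFlow._fromdict(**exported)
    self.assertIn("tests.test_df:add", dataflow.operations)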
async def test_run(self):
    stdout = io.BytesIO()
    with unittest.mock.patch("sys.stdout.buffer.write", new=stdout.write):
        await Export(export="tests.test_df:DATAFLOW", not_linked=False).run()
    exported = json.loads(stdout.getvalue())
    DataFlow._fromdict(**exported)
async def test_dataflow_usage_example(self):
    # Write out shouldi dataflow
    orig = self.mktempfile() + ".json"
    pathlib.Path(orig).write_text(json.dumps(self.DATAFLOW.export()))
    # Import from feature/git
    transform_to_repo = Operation.load("dffml.mapping.create")
    lines_of_code_by_language, lines_of_code_to_comments = list(
        load(
            "dffml_feature_git.feature.operations:lines_of_code_by_language",
            "dffml_feature_git.feature.operations:lines_of_code_to_comments",
            relative=relative_path("..", "..", "feature", "git"),
        )
    )
    # Create new dataflow
    override = DataFlow.auto(
        transform_to_repo,
        lines_of_code_by_language,
        lines_of_code_to_comments,
    )
    # TODO Modify and compare against yaml in docs example
    # Write out override dataflow
    created = self.mktempfile() + ".json"
    pathlib.Path(created).write_text(json.dumps(override.export()))
    # Merge the two
    with contextlib.redirect_stdout(self.stdout):
        await CLI.cli("dataflow", "merge", orig, created)
    DataFlow._fromdict(**json.loads(self.stdout.getvalue()))
async def setUp(self):
    super().setUp()
    self.stdout = io.StringIO()
    InputDataflow = DataFlow(
        operations={
            "AcceptUserInput": AcceptUserInput.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[
            Input(
                value=[AcceptUserInput.op.outputs["InputData"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
        implementations={AcceptUserInput.op.name: AcceptUserInput},
    )
    OutputDataflow = DataFlow(
        operations={
            "print_output": print_output.op,
            "get_single": GetSingle.imp.op,
        },
        implementations={print_output.op.name: print_output.imp},
    )
    self.InputDataflow = InputDataflow
    self.OutputDataflow = OutputDataflow
async def test_export(self):
    self.required_plugins("shouldi")
    stdout = io.StringIO()
    # Use shouldi's dataflow for tests
    with relative_chdir("..", "..", "examples", "shouldi"):
        with unittest.mock.patch("sys.stdout.buffer.write") as write:
            await Develop.cli("export", "shouldi.cli:DATAFLOW")
        DataFlow._fromdict(**json.loads(write.call_args[0][0]))
async def test_pos_tagger(self):
    input_sentence = (
        "The end is the beginning , and the beginning is the end"
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(pos_tagger, GetSingle),
        [
            Input(
                value=[pos_tagger.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=pos_tagger.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=pos_tagger.op.inputs["spacy_model"],
            ),
        ],
    ):
        pos_tags = results[pos_tagger.op.outputs["result"].name]
        words = input_sentence.split()
        for i, _ in enumerate(words):
            self.assertEqual(pos_tags[i][0], words[i])
            self.assertIn(pos_tags[i][1], ["DT", "NN", "VBZ", "CC", ","])
async def test_get_similarity(self):
    input_sentence1 = (
        "The end is the beginning , and the beginning is the end"
    )
    input_sentence2 = (
        "The end was the beginning , and the beginning was the end"
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_similarity, GetSingle),
        [
            Input(
                value=[get_similarity.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence1,
                definition=get_similarity.op.inputs["text_1"],
            ),
            Input(
                value=input_sentence2,
                definition=get_similarity.op.inputs["text_2"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_similarity.op.inputs["spacy_model"],
            ),
        ],
    ):
        similarity_score = results[get_similarity.op.outputs["result"].name]
        self.assertGreater(similarity_score, 0.9)
async def test_associatedefinition(self):
    feed_def = Definition(name="feed", primitive="string")
    dead_def = Definition(name="dead", primitive="string")
    output = Definition(name="output", primitive="string")
    feed_input = Input(value="my favorite value", definition=feed_def)
    face_input = Input(
        value="face", definition=output, parents=[feed_input]
    )
    dead_input = Input(
        value="my second favorite value", definition=dead_def
    )
    beef_input = Input(
        value="beef", definition=output, parents=[dead_input]
    )
    test_result = {"feed": "face", "dead": "beef"}
    for test_value in test_result.keys():
        async for ctx, results in MemoryOrchestrator.run(
            DataFlow.auto(AssociateDefinition),
            [
                feed_input,
                face_input,
                dead_input,
                beef_input,
                Input(
                    value={test_value: "output"},
                    definition=AssociateDefinition.op.inputs["spec"],
                ),
            ],
        ):
            self.assertEqual(
                results, {test_value: test_result[test_value]}
            )
async def setUp(self):
    self.dataflow = DataFlow.auto(*OPIMPS)
    self.dataflow.seed.append(
        Input(
            value=[
                restart_running_containers.op.outputs["containers"].name
            ],
            definition=GetSingle.op.inputs["spec"],
        )
    )
    self.test_inputs = {
        "TestRun": [
            Input(
                value={
                    "ref": "refs/master",
                    "repository": {
                        "clone_url": f"https://github.com/{USER}/{REPO}.git",
                        "default_branch": "master",
                        "html_url": f"https://github.com/{USER}/{REPO}",
                    },
                },
                definition=get_url_from_payload.op.inputs["payload"],
            )
        ]
    }
    self.containers_to_remove = []
async def test_get_embedding(self):
    input_sentence = (
        "The end is the beginning , and the beginning is the end"
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_embedding, GetSingle),
        [
            Input(
                value=[get_embedding.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=get_embedding.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_embedding.op.inputs["spacy_model"],
            ),
        ],
    ):
        embeddings = results[get_embedding.op.outputs["result"].name]
        # One embedding per token in the sentence
        self.assertEqual(len(input_sentence.split()), len(embeddings))
        # Randomly sampled embeddings should share the same vector shape
        self.assertEqual(
            embeddings[randint(0, len(input_sentence.split()) - 1)].shape,
            embeddings[randint(0, len(input_sentence.split()) - 1)].shape,
        )
async def test_calcHist(self):
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(calcHist, GetSingle),
        [
            Input(
                value=[calcHist.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=self.INPUT_ARRAY,
                definition=calcHist.op.inputs["images"],
            ),
            Input(value=None, definition=calcHist.op.inputs["mask"]),
            Input(
                value=[0, 1],
                definition=calcHist.op.inputs["channels"],
            ),
            Input(
                value=[32, 32],
                definition=calcHist.op.inputs["histSize"],
            ),
            Input(
                value=[0, 256, 0, 256],
                definition=calcHist.op.inputs["ranges"],
            ),
        ],
    ):
        self.assertEqual(
            results[calcHist.op.outputs["result"].name].shape, (32, 32)
        )
async def test_convert_color(self):
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(convert_color, GetSingle),
        [
            Input(
                value=[convert_color.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=self.INPUT_ARRAY,
                definition=convert_color.op.inputs["src"],
            ),
            Input(
                value="BGR2RGB",
                definition=convert_color.op.inputs["code"],
            ),
        ],
    ):
        # Converting back should recover the original array
        self.assertEqual(
            cv2.cvtColor(
                results[convert_color.op.outputs["result"].name],
                cv2.COLOR_RGB2BGR,
            )
            .flatten()
            .tolist(),
            self.INPUT_ARRAY.flatten().tolist(),
        )
async def test_run(self):
    dataflow = DataFlow.auto(convert_to_gif, GetSingle)
    dataflow.seed.append(
        Input(
            value=[convert_to_gif.op.outputs["output_file"].name],
            definition=GetSingle.op.inputs["spec"],
        )
    )
    input_file_path = self.parent_path / "input.mp4"
    with open(input_file_path, "rb") as f:
        input_file = f.read(-1)
    test_inputs = {
        "Test": [
            Input(
                value=input_file,
                definition=convert_to_gif.op.inputs["input_file"],
            ),
            Input(
                value=240,
                definition=convert_to_gif.op.inputs["resolution"],
            ),
        ]
    }
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(dataflow) as octx:
            async for ctx, results in octx.run(test_inputs):
                self.assertIn("output_file", results)
                output = results["output_file"]
                self.assertGreater(len(output), 100000)
async def test_validation_by_op(self):
    test_dataflow = DataFlow(
        operations={
            "validate_shout_instance": validate_shouts.op,
            "echo_shout": echo_shout.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[
            Input(
                value=[echo_shout.op.outputs["shout_out"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
        implementations={
            validate_shouts.op.name: validate_shouts.imp,
            echo_shout.op.name: echo_shout.imp,
        },
    )
    test_inputs = {
        "TestShoutOut": [
            Input(value="validation_status:", definition=SHOUTIN)
        ]
    }
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(test_dataflow) as octx:
            async for ctx_str, results in octx.run(test_inputs):
                self.assertIn("shout_out", results)
                self.assertEqual(
                    results["shout_out"], "validation_status:_validated"
                )
async def setUp(self):
    self.dataflow = DataFlow.auto(*OPIMPS)
    self.dataflow.seed += [
        Input(
            value=[
                restart_running_containers.op.outputs["containers"].name
            ],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(value=True, definition=clone_git_repo.op.conditions[0]),
    ]
    test_data = {
        "ref": "refs/main",
        "repository": {
            "clone_url": f"https://github.com/{USER}/{REPO}.git",
            "default_branch": "main",
            "html_url": f"https://github.com/{USER}/{REPO}",
        },
    }
    self.test_inputs = {
        "TestRun": [
            Input(
                value=test_data,
                definition=check_secret_match.op.outputs["git_payload"],
            )
        ]
    }
    self.containers_to_remove = []
async def test_simple_imputer(self):
    input_data = [[np.nan, 2], [6, np.nan], [7, 6]]
    output_data = [[6.5, 2], [6, 4], [7, 6]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(simple_imputer, GetSingle),
        [
            Input(
                value=[simple_imputer.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=simple_imputer.op.inputs["data"],
            ),
            Input(
                value=np.nan,
                definition=simple_imputer.op.inputs["missing_values"],
            ),
            Input(
                value="mean",
                definition=simple_imputer.op.inputs["strategy"],
            ),
        ],
    ):
        self.assertTrue(
            (
                results[simple_imputer.op.outputs["result"].name]
                == output_data
            ).all()
        )
def test_export(self):
    exported = DataFlow.auto(add).export(linked=True)
    # Operations
    self.assertIn("operations", exported)
    self.assertIn("tests.test_df:add", exported["operations"])
    self.assertIn("inputs", exported["operations"]["tests.test_df:add"])
    self.assertIn("outputs", exported["operations"]["tests.test_df:add"])
    self.assertIn(
        "conditions", exported["operations"]["tests.test_df:add"]
    )
    self.assertIn(
        "is_add",
        exported["operations"]["tests.test_df:add"]["conditions"],
    )
    self.assertIn(
        "numbers", exported["operations"]["tests.test_df:add"]["inputs"]
    )
    self.assertEqual(
        "numbers",
        exported["operations"]["tests.test_df:add"]["inputs"]["numbers"],
    )
    self.assertIn(
        "sum", exported["operations"]["tests.test_df:add"]["outputs"]
    )
    self.assertEqual(
        "result",
        exported["operations"]["tests.test_df:add"]["outputs"]["sum"],
    )
    # Definitions
    self.assertIn("definitions", exported)
    self.assertIn("numbers", exported["definitions"])
    self.assertIn("primitive", exported["definitions"]["numbers"])
    self.assertEqual(
        "List[int]", exported["definitions"]["numbers"]["primitive"]
    )
    self.assertIn("result", exported["definitions"])
    self.assertIn("primitive", exported["definitions"]["result"])
    self.assertEqual("int", exported["definitions"]["result"]["primitive"])
async def test_principal_component_analysis(self):
    input_data, _ = make_classification(
        n_samples=10,
        n_features=10,
        n_informative=8,
        n_redundant=2,
        random_state=7,
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(principal_component_analysis, GetSingle),
        [
            Input(
                value=[
                    principal_component_analysis.op.outputs["result"].name
                ],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=principal_component_analysis.op.inputs["data"],
            ),
            Input(
                value=8,
                definition=principal_component_analysis.op.inputs[
                    "n_components"
                ],
            ),
        ],
    ):
        self.assertTrue(
            (10, 8)
            == results[
                principal_component_analysis.op.outputs["result"].name
            ].shape
        )
async def test_run(self):
    self.required_plugins("dffml-config-yaml", "dffml-model-scratch")
    # Load get_single and model_predict
    get_single = Operation.load("get_single")
    model_predict = list(load("dffml.operation.model:model_predict"))[0]
    # Create new dataflow from operations
    dataflow = DataFlow.auto(get_single, model_predict)
    # Add the seed inputs
    dataflow.seed.append(
        Input(
            value=[
                definition.name
                for definition in model_predict.op.outputs.values()
            ],
            definition=get_single.inputs["spec"],
        )
    )
    # Write out the dataflow
    dataflow_yaml = pathlib.Path(self.mktempfile() + ".yaml")
    async with BaseConfigLoader.load("yaml").withconfig({}) as configloader:
        async with configloader() as loader:
            dataflow_yaml.write_bytes(
                await loader.dumpb(dataflow.export(linked=True))
            )
    # TODO Figure out how nested model config options will work
    # print(dataflow_yaml.read_text())
    return
async def operation_db():
    """
    Create the database and table (myTable) for the db operations
    """
    sdb = SqliteDatabase(SqliteDatabaseConfig(filename="examples.db"))
    dataflow = DataFlow(
        operations={"db_query_create": db_query_create_table.op},
        configs={"db_query_create": DatabaseQueryConfig(database=sdb)},
        seed=[],
    )
    inputs = [
        Input(
            value="myTable",
            definition=db_query_create_table.op.inputs["table_name"],
        ),
        Input(
            value={
                "key": "INTEGER NOT NULL PRIMARY KEY",
                "firstName": "text",
                "lastName": "text",
                "age": "int",
            },
            definition=db_query_create_table.op.inputs["cols"],
        ),
    ]
    async for ctx, result in MemoryOrchestrator.run(dataflow, inputs):
        pass
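# A hedged follow-up sketch: inserting a row into the table created by
# operation_db above. Assumes dffml's db_query_insert operation with
# "table_name" and "data" inputs (mirroring db_query_create_table);
# adjust names if the installed version differs.
async def operation_db_insert_example():
    sdb = SqliteDatabase(SqliteDatabaseConfig(filename="examples.db"))
    dataflow = DataFlow(
        operations={"db_query_insert": db_query_insert.op},
        configs={"db_query_insert": DatabaseQueryConfig(database=sdb)},
        seed=[],
    )
    inputs = [
        Input(
            value="myTable",
            definition=db_query_insert.op.inputs["table_name"],
        ),
        Input(
            # Column values matching the schema created above
            value={"key": 10, "firstName": "John", "lastName": "Doe", "age": 16},
            definition=db_query_insert.op.inputs["data"],
        ),
    ]
    async for ctx, result in MemoryOrchestrator.run(dataflow, inputs):
        pass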
async def test_run(self):
    dataflow = DataFlow.auto(*OPIMPS)
    passwords = [str(random.random()) for _ in range(0, 20)]
    # Orchestrate the running of these operations
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)
        passwords = [
            Input(
                value=password,
                definition=definitions["UnhashedPassword"],
                parents=None,
            )
            for password in passwords
        ]
        output_spec = Input(
            value=["ScryptPassword"],
            definition=definitions["get_single_spec"],
            parents=None,
        )
        async with orchestrator(dataflow) as octx:
            try:
                async for _ctx, results in octx.run(
                    {
                        password.value: [password, output_spec]
                        for password in passwords
                    }
                ):
                    self.assertTrue(results)
            except AttributeError as error:
                raise
def _create_dataflow_with_op(self, query_op, seed=None):
    # Use None instead of a mutable default: DataFlow seeds get appended
    # to elsewhere, which would mutate a shared default list
    if seed is None:
        seed = []
    return DataFlow(
        operations={
            "db_query": query_op.op,
            "get_single": GetSingle.imp.op,
        },
        configs={"db_query": DatabaseQueryConfig(database=self.sdb)},
        seed=seed,
        implementations={query_op.op.name: query_op.imp},
    )
def create_dataflow(operation, seed):
    dataflow = DataFlow(
        operations={operation.op.name: operation},
        seed={
            Input(value=val, definition=operation.op.inputs[input_name])
            for input_name, val in seed.items()
        },
        implementations={operation.op.name: operation.imp},
    )
    return dataflow
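# A minimal usage sketch for create_dataflow above. The echo_string
# operation is hypothetical, defined inline so the example is
# self-contained; assumes `from dffml import op, Definition, MemoryOrchestrator`.
@op(
    inputs={"text": Definition(name="echo_text", primitive="string")},
    outputs={"result": Definition(name="echo_result", primitive="string")},
)
async def echo_string(text: str) -> dict:
    # Echo the input back so the dataflow has something to run
    return {"result": text}


async def run_create_dataflow_example():
    # The seed dict maps input names to values; create_dataflow turns
    # each pair into an Input against the operation's definitions
    dataflow = create_dataflow(echo_string, {"text": "hello"})
    async for ctx, results in MemoryOrchestrator.run(dataflow, []):
        pass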
async def test_condition_does_not_run_auto_start(self):
    ran = []

    @op(conditions=[CONDITION])
    async def condition_test():
        ran.append(True)  # pragma: no cover

    async with MemoryOrchestrator() as orchestrator:
        async with orchestrator(DataFlow(condition_test)) as octx:
            async for _ in octx.run([]):
                pass

    self.assertFalse(ran)
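# Counterpart sketch, an assumption mirroring the pattern used elsewhere
# in this section (Input(value=True, definition=clone_git_repo.op.conditions[0])):
# supplying a truthy Input for CONDITION should let the auto-start
# operation run.
async def test_condition_runs_when_satisfied(self):
    ran = []

    @op(conditions=[CONDITION])
    async def condition_test():
        ran.append(True)

    async with MemoryOrchestrator() as orchestrator:
        async with orchestrator(DataFlow(condition_test)) as octx:
            async for _ in octx.run(
                [Input(value=True, definition=CONDITION)]
            ):
                pass

    self.assertTrue(ran)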
async def test_dataflow_run_cli_example(self):
    # Write out override dataflow
    created = self.mktempfile() + ".yaml"
    with open(created, "w") as fileobj:
        with contextlib.redirect_stdout(fileobj):
            await CLI.cli(
                "dataflow",
                "create",
                "dffml.mapping.create",
                "print_output",
                "-configloader",
                "yaml",
            )
    # Load the generated dataflow
    async with ConfigLoaders() as cfgl:
        _, exported = await cfgl.load_file(created)
        dataflow = DataFlow._fromdict(**exported)
    # Modify the dataflow
    dataflow.flow["print_output"].inputs["data"] = [
        {"dffml.mapping.create": "mapping"}
    ]
    # Write back modified dataflow
    async with BaseConfigLoader.load("yaml").withconfig({}) as configloader:
        async with configloader() as loader:
            with open(created, "wb") as fileobj:
                fileobj.write(await loader.dumpb(dataflow.export(linked=True)))
    # Run the dataflow
    with contextlib.redirect_stdout(self.stdout):
        await CLI.cli(
            "dataflow",
            "run",
            "records",
            "all",
            "-no-echo",
            "-record-def",
            "value",
            "-inputs",
            "hello=key",
            "-dataflow",
            created,
            "-sources",
            "m=memory",
            "-source-records",
            "world",
            "user",
        )
    self.assertEqual(
        self.stdout.getvalue(), "{'hello': 'world'}\n{'hello': 'user'}\n"
    )
async def setUp(self):
    self.dataflow = DataFlow(
        operations={
            "get_circle": get_circle.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[
            Input(
                value=[get_circle.op.outputs["shape"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
        implementations={"get_circle": get_circle.imp},
    )
async def setUp(self):
    self.dataflow = DataFlow(
        operations={
            "announce": announce.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[
            Input(
                value=[announce.op.outputs["string_out"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
        implementations={announce.op.name: announce.imp},
    )
async def __aenter__(self) -> "DataFlowSourceContext":
    self.sctx = await self.parent.source().__aenter__()

    # If the dataflow was given as a path, pick the config loader from
    # the file suffix and load the dataflow from disk
    if isinstance(self.parent.config.dataflow, str):
        dataflow_path = pathlib.Path(self.parent.config.dataflow)
        config_type = dataflow_path.suffix.replace(".", "")
        config_cls = BaseConfigLoader.load(config_type)
        async with config_cls.withconfig({}) as configloader:
            async with configloader() as loader:
                exported = await loader.loadb(dataflow_path.read_bytes())
                self.parent.config.dataflow = DataFlow._fromdict(**exported)

    self.octx = await self.parent.orchestrator(
        self.parent.config.dataflow
    ).__aenter__()

    return self
async def test_gen_with_input(self):
    test_dataflow = DataFlow.auto(GetMulti, counter, echo_num)
    test_dataflow.seed.append(
        Input(
            value=[echo_num.op.outputs["number_out"].name],
            definition=GetMulti.op.inputs["spec"],
        )
    )
    test_dataflow.implementations[counter.op.name] = counter.imp
    test_dataflow.implementations[echo_num.op.name] = echo_num.imp
    test_inputs = {"TestCount": [Input(value=1, definition=CountStart)]}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(test_dataflow) as octx:
            async for ctx_str, results in octx.run(test_inputs):
                self.assertIn("number", results)
                self.assertEqual(set([1, 2, 3, 4, 5]), set(results["number"]))
async def test_standard_scaler(self):
    input_data = [[0, 0], [0, 0], [1, 1], [1, 1]]
    output_data = [[-1, -1], [-1, -1], [1, 1], [1, 1]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(standard_scaler, GetSingle),
        [
            Input(
                value=[standard_scaler.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=standard_scaler.op.inputs["data"],
            ),
        ],
    ):
        self.assertTrue(
            results[standard_scaler.op.outputs["result"].name]
            == output_data
        )