async def test_run(self):
    """
    Feed ``input.mp4`` through ``convert_to_gif`` and check that an
    ``output_file`` result with a length above 100000 comes back.
    """
    dataflow = DataFlow.auto(convert_to_gif, GetSingle)
    # Seed GetSingle with the name of the definition we want returned
    output_spec = Input(
        value=[convert_to_gif.op.outputs["output_file"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    dataflow.seed.append(output_spec)

    # Read the whole video into memory, it is passed to the operation as bytes
    video_path = self.parent_path / "input.mp4"
    with open(video_path, "rb") as video:
        video_bytes = video.read()

    conversion_inputs = [
        Input(
            value=video_bytes,
            definition=convert_to_gif.op.inputs["input_file"],
        ),
        Input(value=240, definition=convert_to_gif.op.inputs["resolution"]),
    ]

    async with MemoryOrchestrator.withconfig({}) as orch, orch(dataflow) as octx:
        async for _ctx, results in octx.run({"Test": conversion_inputs}):
            self.assertIn("output_file", results)
            self.assertGreater(len(results["output_file"]), 100000)
async def setUp(self):
    """
    Prepare the dataflow built from OPIMPS, its seed inputs, and the git
    payload input that each test runs under the ``TestRun`` context.
    """
    self.dataflow = DataFlow.auto(*OPIMPS)

    # GetSingle should return the containers output
    containers_spec = Input(
        value=[restart_running_containers.op.outputs["containers"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    # Satisfy the first condition of clone_git_repo
    clone_condition = Input(
        value=True, definition=clone_git_repo.op.conditions[0]
    )
    self.dataflow.seed += [containers_spec, clone_condition]

    repository = {
        "clone_url": f"https://github.com/{USER}/{REPO}.git",
        "default_branch": "main",
        "html_url": f"https://github.com/{USER}/{REPO}",
    }
    payload = Input(
        value={"ref": "refs/main", "repository": repository},
        definition=check_secret_match.op.outputs["git_payload"],
    )
    self.test_inputs = {"TestRun": [payload]}
    self.containers_to_remove = []
async def test_2_lookup(self):
    """
    Run ``db_query_lookup`` against ``self.table_name`` with no column or
    condition filters and compare the rows with ``self.data_dicts``.
    """
    lookup_spec = Input(
        value=[db_query_lookup.op.outputs["lookups"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    df = self._create_dataflow_with_op(db_query_lookup, seed=[lookup_spec])

    # Maps each input name of the operation to the value passed for it
    lookup_args = {
        "table_name": self.table_name,
        "cols": [],
        "conditions": [],
    }

    async with MemoryOrchestrator.withconfig({}) as orch, orch(df) as octx:
        lookup_inputs = [
            Input(value=val, definition=db_query_lookup.op.inputs[key])
            for key, val in lookup_args.items()
        ]
        async for _ctx, results in octx.run({"lookup": lookup_inputs}):
            self.assertIn("query_lookups", results)
            self.assertEqual(self.data_dicts, results["query_lookups"])
async def test_vaildation_by_op(self):
    """
    Send ``"validation_status:"`` in as SHOUTIN and expect the ``shout_out``
    result to be ``"validation_status:_validated"``.
    """
    shout_out_spec = Input(
        value=[echo_shout.op.outputs["shout_out"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    operations = {
        "validate_shout_instance": validate_shouts.op,
        "echo_shout": echo_shout.op,
        "get_single": GetSingle.imp.op,
    }
    implementations = {
        validate_shouts.op.name: validate_shouts.imp,
        echo_shout.op.name: echo_shout.imp,
    }
    flow = DataFlow(
        operations=operations,
        seed=[shout_out_spec],
        implementations=implementations,
    )

    shout = Input(value="validation_status:", definition=SHOUTIN)

    async with MemoryOrchestrator.withconfig({}) as orch, orch(flow) as octx:
        async for _ctx, results in octx.run({"TestShoutOut": [shout]}):
            self.assertIn("shout_out", results)
            self.assertEqual(
                results["shout_out"], "validation_status:_validated"
            )
async def test_pos_tagger(self):
    """
    Tag a sentence with ``pos_tagger`` and check that every entry pairs the
    original word with one of the expected tags.
    """
    input_sentence = "The end is the beginning , and the beginning is the end"
    result_name = pos_tagger.op.outputs["result"].name
    expected_tags = ["DT", "NN", "VBZ", "CC", ","]

    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=input_sentence, definition=pos_tagger.op.inputs["text"]),
        Input(
            value="en_core_web_sm",
            definition=pos_tagger.op.inputs["spacy_model"],
        ),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(pos_tagger, GetSingle), flow_inputs
    ):
        pos_tags = results[result_name]
        # Entry i is indexed as (word, tag) for the i-th word of the sentence
        for index, word in enumerate(input_sentence.split()):
            self.assertEqual(pos_tags[index][0], word)
            self.assertIn(pos_tags[index][1], expected_tags)
async def test_simple_imputer(self):
    """
    Replace the NaN entries of a small matrix using the ``mean`` strategy
    and compare the result element-wise with the expected matrix.
    """
    with_gaps = [[np.nan, 2], [6, np.nan], [7, 6]]
    expected = [[6.5, 2], [6, 4], [7, 6]]
    result_name = simple_imputer.op.outputs["result"].name

    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=with_gaps, definition=simple_imputer.op.inputs["data"]),
        Input(
            value=np.nan,
            definition=simple_imputer.op.inputs["missing_values"],
        ),
        Input(value="mean", definition=simple_imputer.op.inputs["strategy"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(simple_imputer, GetSingle), flow_inputs
    ):
        imputed = results[result_name]
        # The comparison yields an element-wise result, hence the .all()
        self.assertTrue((imputed == expected).all())
async def test_associatedefinition(self):
    """
    For each parent definition name ("feed", "dead"), ask
    AssociateDefinition for the associated "output" value and check it.
    """
    feed_def = Definition(name="feed", primitive="string")
    dead_def = Definition(name="dead", primitive="string")
    output = Definition(name="output", primitive="string")

    # Each "output" input is linked to its source through ``parents``
    feed_input = Input(value="my favorite value", definition=feed_def)
    face_input = Input(value="face", definition=output, parents=[feed_input])
    dead_input = Input(value="my second favorite value", definition=dead_def)
    beef_input = Input(value="beef", definition=output, parents=[dead_input])

    expected_by_name = {"feed": "face", "dead": "beef"}
    for parent_name, expected_value in expected_by_name.items():
        spec = Input(
            value={parent_name: "output"},
            definition=AssociateDefinition.op.inputs["spec"],
        )
        async for _ctx, results in MemoryOrchestrator.run(
            DataFlow.auto(AssociateDefinition),
            [feed_input, face_input, dead_input, beef_input, spec],
        ):
            self.assertEqual(results, {parent_name: expected_value})
async def input_set(self, record: Record) -> List[Input]:
    """
    Build the list of inputs for ``record``.

    The list holds, in this order: one input per configured feature, the
    configured static inputs, an input carrying ``await self.sctx.length()``
    when ``config.length`` is set, and an input carrying ``record.key`` when
    ``config.record_def`` is set.
    """
    config = self.parent.config

    # One input per feature, defined by the feature's name and dtype
    collected = [
        Input(
            value=record.feature(feature.name),
            definition=Definition(
                name=feature.name, primitive=str(feature.dtype())
            ),
        )
        for feature in config.features
    ]

    # Static inputs are (value, definition name) pairs from the config
    collected.extend(
        Input(value=value, definition=config.dataflow.definitions[name])
        for value, name in config.inputs
    )

    if config.length:
        collected.append(
            Input(
                value=await self.sctx.length(),
                definition=Definition(name=config.length, primitive="int"),
            )
        )

    if config.record_def:
        collected.append(
            Input(
                value=record.key,
                definition=Definition(
                    name=config.record_def, primitive="string"
                ),
            )
        )

    return collected
async def test_get_embedding(self):
    """
    Check that ``get_embedding`` returns one embedding per word and that two
    randomly chosen embeddings have the same shape.
    """
    input_sentence = "The end is the beginning , and the beginning is the end"
    word_count = len(input_sentence.split())
    result_name = get_embedding.op.outputs["result"].name

    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=input_sentence, definition=get_embedding.op.inputs["text"]),
        Input(
            value="en_core_web_sm",
            definition=get_embedding.op.inputs["spacy_model"],
        ),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_embedding, GetSingle), flow_inputs
    ):
        embeddings = results[result_name]
        self.assertEqual(word_count, len(embeddings))
        # Two independent random picks, both within the valid index range
        first_pick = embeddings[randint(0, word_count - 1)]
        second_pick = embeddings[randint(0, word_count - 1)]
        self.assertEqual(first_pick.shape, second_pick.shape)
async def setUp(self):
    """
    Prepare the dataflow built from OPIMPS, seed GetSingle with the
    containers output, and create the payload input for ``TestRun``.
    """
    self.dataflow = DataFlow.auto(*OPIMPS)
    containers_spec = Input(
        value=[restart_running_containers.op.outputs["containers"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    self.dataflow.seed.append(containers_spec)

    repository = {
        "clone_url": f"https://github.com/{USER}/{REPO}.git",
        "default_branch": "master",
        "html_url": f"https://github.com/{USER}/{REPO}",
    }
    payload = Input(
        value={"ref": "refs/master", "repository": repository},
        definition=get_url_from_payload.op.inputs["payload"],
    )
    self.test_inputs = {"TestRun": [payload]}
    self.containers_to_remove = []
async def test_convert_color(self):
    """
    Convert ``self.INPUT_ARRAY`` with the ``BGR2RGB`` code, convert the
    result back with ``cv2.COLOR_RGB2BGR``, and expect the original values.
    """
    result_name = convert_color.op.outputs["result"].name
    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=self.INPUT_ARRAY, definition=convert_color.op.inputs["src"]),
        Input(value="BGR2RGB", definition=convert_color.op.inputs["code"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(convert_color, GetSingle), flow_inputs
    ):
        # Undo the conversion so the result can be compared with the input
        round_trip = cv2.cvtColor(results[result_name], cv2.COLOR_RGB2BGR)
        self.assertEqual(
            round_trip.flatten().tolist(),
            self.INPUT_ARRAY.flatten().tolist(),
        )
async def test_run(self):
    """
    Add one context per calculation string and check the ``sum`` output
    reported for each context against the expected number.
    """
    # Calculation string (also used as the context name) -> expected result
    expected_by_line = {"add 40 and 2": 42, "multiply 42 and 10": 420}
    sum_name = calc_add.op.outputs["sum"].name

    async with MemoryOrchestrator.basic_config(*OPIMPS) as orch, orch() as octx:
        for line in expected_by_line:
            await octx.ictx.sadd(
                line,
                Input(value=line, definition=calc_parse_line.op.inputs["line"]),
                Input(value=[sum_name], definition=GetSingle.op.inputs["spec"]),
            )

        async for ctx, results in octx.run_operations():
            # The context handle string is the calculation string
            line = (await ctx.handle()).as_string()
            expected = expected_by_line[line]
            self.assertEqual(expected, results[GetSingle.op.name][sum_name])
async def test_principal_component_analysis(self):
    """
    Reduce a 10 sample, 10 feature dataset to 8 components and check that
    the result has shape ``(10, 8)``.
    """
    input_data, _ = make_classification(
        n_samples=10,
        n_features=10,
        n_informative=8,
        n_redundant=2,
        random_state=7,
    )
    result_name = principal_component_analysis.op.outputs["result"].name

    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(principal_component_analysis, GetSingle),
        [
            Input(
                value=[result_name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=principal_component_analysis.op.inputs["data"],
            ),
            Input(
                value=8,
                definition=principal_component_analysis.op.inputs[
                    "n_components"
                ],
            ),
        ],
    ):
        # assertEqual rather than assertTrue(a == b) so that a failure
        # reports the shape that was actually produced
        self.assertEqual((10, 8), results[result_name].shape)
async def test_run(self):
    """
    Run the operations over 20 random password strings, one context per
    password, and check that every context produces truthy results.

    An AttributeError reporting that ``hashlib`` has no ``scrypt`` ends the
    test without failure; any other AttributeError is re-raised.
    """
    raw_passwords = [str(random.random()) for _ in range(20)]

    # Orchestrate the running of these operations
    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)
        password_inputs = [
            Input(
                value=raw,
                definition=definitions["UnhashedPassword"],
                parents=None,
            )
            for raw in raw_passwords
        ]
        output_spec = Input(
            value=["ScryptPassword"],
            definition=definitions["get_single_spec"],
            parents=None,
        )

        async with orchestrator() as octx:
            # Add one input set per password, the context being the
            # password value itself
            for password in password_inputs:
                input_set_config = MemoryInputSetConfig(
                    ctx=StringInputSetContext(password.value),
                    inputs=[password, output_spec],
                )
                await octx.ictx.add(MemoryInputSet(input_set_config))

            try:
                async for _ctx, results in octx.run_operations(strict=True):
                    self.assertTrue(results)
            except AttributeError as error:
                # Only a missing hashlib.scrypt is tolerated
                if "module 'hashlib' has no attribute 'scrypt'" not in str(
                    error
                ):
                    raise
async def test_get_similarity(self):
    """
    Compare two nearly identical sentences with ``get_similarity`` and
    expect a score above 0.9.
    """
    first_sentence = "The end is the beginning , and the beginning is the end"
    second_sentence = (
        "The end was the beginning , and the beginning was the end"
    )
    result_name = get_similarity.op.outputs["result"].name

    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(
            value=first_sentence,
            definition=get_similarity.op.inputs["text_1"],
        ),
        Input(
            value=second_sentence,
            definition=get_similarity.op.inputs["text_2"],
        ),
        Input(
            value="en_core_web_sm",
            definition=get_similarity.op.inputs["spacy_model"],
        ),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_similarity, GetSingle), flow_inputs
    ):
        self.assertGreater(results[result_name], 0.9)
async def operation_db():
    """
    Create the ``examples.db`` SQLite database and its ``myTable`` table,
    which the db operations work against.
    """
    database = SqliteDatabase(SqliteDatabaseConfig(filename="examples.db"))
    create_table_flow = DataFlow(
        operations={"db_query_create": db_query_create_table.op},
        configs={"db_query_create": DatabaseQueryConfig(database=database)},
        seed=[],
    )

    # Column name -> SQL type declaration
    columns = {
        "key": "INTEGER NOT NULL PRIMARY KEY",
        "firstName": "text",
        "lastName": "text",
        "age": "int",
    }
    table_inputs = [
        Input(
            value="myTable",
            definition=db_query_create_table.op.inputs["table_name"],
        ),
        Input(
            value=columns,
            definition=db_query_create_table.op.inputs["cols"],
        ),
    ]

    # Results are not needed, the flow only has to run to completion
    async for _ctx, _result in MemoryOrchestrator.run(
        create_table_flow, table_inputs
    ):
        pass
async def test_run(self):
    """
    Run the dataflow over 20 random password strings, one context per
    password, and check that every context produces truthy results.
    """
    dataflow = DataFlow.auto(*OPIMPS)
    passwords = [str(random.random()) for _ in range(0, 20)]

    # Orchestrate the running of these operations
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)

        passwords = [
            Input(
                value=password,
                definition=definitions["UnhashedPassword"],
                parents=None,
            )
            for password in passwords
        ]

        output_spec = Input(
            value=["ScryptPassword"],
            definition=definitions["get_single_spec"],
            parents=None,
        )

        async with orchestrator(dataflow) as octx:
            # Each password runs under a context named after its value.
            # The former ``except AttributeError as error: raise`` wrapper
            # only re-raised, so errors propagate exactly as before.
            async for _ctx, results in octx.run(
                {
                    password.value: [password, output_spec]
                    for password in passwords
                }
            ):
                self.assertTrue(results)
async def test_validation_error(self):
    """
    Creating an Input with value ``4`` for the ``Pie`` definition must raise
    InputValidationError.
    """
    # These are created outside of the assertRaises block so that an
    # unexpected validation failure on either of them fails the test
    # instead of satisfying it.
    Input(value="unitcircle", definition=ShapeName)
    Input(value=1, definition=Radius)

    with self.assertRaises(InputValidationError):
        # This should raise a validation error
        Input(value=4, definition=Pie)
def _create_dataflow(self, input_, output):
    """
    Return the dataflow produced by ``create_archive_dataflow`` when seeded
    with ``input_`` (origin ``input_path``) and ``output`` (origin
    ``output_path``).
    """
    source = Input(
        value=input_,
        definition=Definition("test_inp", primitive="str"),
        origin="input_path",
    )
    destination = Input(
        value=output,
        definition=Definition("test_out", primitive="str"),
        origin="output_path",
    )
    # The seed is handed over as a set, not a list
    return create_archive_dataflow({source, destination})
async def run(self):
    """
    Evaluate every package in ``self.packages`` and print, for each one,
    whether it is okay to install.

    A package is reported as not okay when the safety ``issues`` output is
    greater than zero or when the bandit ``report`` output has more than 5
    entries under ``CONFIDENCE.HIGH_AND_SEVERITY.HIGH``.
    """
    # Names of the definitions we ask GetSingle for. The results are read
    # back by these names rather than by position, so the verdict does not
    # depend on the order of the result dictionary.
    issues_name = safety_check.op.outputs["issues"].name
    report_name = run_bandit.op.outputs["report"].name

    # Create an Orchestrator which will manage the running of our operations
    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        # Create a orchestrator context, everything in DFFML follows this
        # one-two context entry pattern
        async with orchestrator() as octx:
            for package_name in self.packages:
                # For each package add a new input set to the network of
                # inputs (ictx). Operations run under a context, the context
                # here is the package_name to evaluate (the first argument).
                # The next arguments are all the inputs we're seeding the
                # network with for that context. We give the package name
                # because pypi_latest_package_version needs it to find the
                # version, which safety will then use. We also give an input
                # to the output operation GetSingle, which takes a list of
                # data type definitions we want to select as our results.
                await octx.ictx.sadd(
                    package_name,
                    Input(
                        value=package_name,
                        definition=pypi_package_json.op.inputs["package"],
                    ),
                    Input(
                        value=[issues_name, report_name],
                        definition=GetSingle.op.inputs["spec"],
                    ),
                )

            # Run all the operations, Each iteration of this loop happens
            # when all inputs are exhausted for a context, the output
            # operations are then run and their results are yielded
            async for ctx, results in octx.run_operations():
                # The context for this data flow was the package name
                package_name = (await ctx.handle()).as_string()
                # Get the results of the GetSingle output operation
                results = results[GetSingle.op.name]
                # Flag the package if safety found any issues, or if bandit
                # reported more than 5 high confidence, high severity
                # findings. The report is only consulted when safety found
                # nothing.
                if (
                    results[issues_name] > 0
                    or results[report_name][
                        "CONFIDENCE.HIGH_AND_SEVERITY.HIGH"
                    ]
                    > 5
                ):
                    print(f"Do not install {package_name}! {results!r}")
                else:
                    print(f"{package_name} is okay to install")
async def test_1_insert(self):
    """
    Insert every dict of ``self.data_dicts`` through ``db_query_insert`` and
    then read the table back to check that all rows are present.
    """
    df = self._create_dataflow_with_op(db_query_insert)

    for row in self.data_dicts:
        # Maps each input name of the operation to the value passed for it
        insert_args = {"table_name": self.table_name, "data": row}
        async with MemoryOrchestrator.withconfig({}) as orch, orch(df) as octx:
            insert_inputs = [
                Input(value=val, definition=db_query_insert.op.inputs[key])
                for key, val in insert_args.items()
            ]
            # Only the side effect matters, drain the results
            async for _ctx, _results in octx.run({"insert": insert_inputs}):
                pass

    async with self.sdb as db:
        async with db() as db_ctx:
            cursor = db_ctx.parent.cursor
            cursor.execute(f"SELECT * FROM {self.table_name}")
            stored = [dict(row) for row in cursor.fetchall()]
            self.assertEqual(self.data_dicts, stored)
async def test_0_create(self):
    """
    Create ``self.table_name`` through ``db_query_create_table`` and check
    in ``sqlite_master`` that exactly one table of that name exists.
    """
    df = self._create_dataflow_with_op(db_query_create_table)
    # Maps each input name of the operation to the value passed for it
    create_args = {"table_name": self.table_name, "cols": self.cols}

    async with MemoryOrchestrator.withconfig({}) as orch, orch(df) as octx:
        create_inputs = [
            Input(value=val, definition=db_query_create_table.op.inputs[key])
            for key, val in create_args.items()
        ]
        # Only the side effect matters, drain the results
        async for _ctx, _results in octx.run({"create": create_inputs}):
            pass

    async with self.sdb as db:
        async with db() as db_ctx:
            cursor = db_ctx.parent.cursor
            query = (
                "SELECT count(name) FROM sqlite_master "
                + f" WHERE type='table' and name='{self.table_name}' "
            )
            cursor.execute(query)
            row = cursor.fetchone()
            self.assertEqual(row["count(name)"], 1)
async def test_run(self):
    """
    Build a dataflow from ``get_single`` and ``model_predict`` and write its
    linked export to a temporary YAML file.

    Nothing is asserted yet, see the TODO at the end.
    """
    self.required_plugins("dffml-config-yaml", "dffml-model-scratch")

    # Load get_single and model_predict
    get_single = Operation.load("get_single")
    model_predict = list(load("dffml.operation.model:model_predict"))[0]

    # Create new dataflow from operations
    dataflow = DataFlow.auto(get_single, model_predict)

    # Add the seed inputs: GetSingle returns every output of model_predict
    output_names = [
        definition.name for definition in model_predict.op.outputs.values()
    ]
    dataflow.seed.append(
        Input(value=output_names, definition=get_single.inputs["spec"])
    )

    # Write out the dataflow
    dataflow_yaml = pathlib.Path(self.mktempfile() + ".yaml")
    yaml_loader_cls = BaseConfigLoader.load("yaml")
    async with yaml_loader_cls.withconfig({}) as configloader:
        async with configloader() as loader:
            dumped = await loader.dumpb(dataflow.export(linked=True))
            dataflow_yaml.write_bytes(dumped)

    # TODO Figure out how nested model config options will work
    # print(dataflow_yaml.read_text())
    return
async def run_custom(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run the configured dataflow as a subflow using the operation's inputs.

    The value given for the operation's first input names the subflow
    context. Every entry of ``inputs``, that first one included, is passed
    into the subflow under that context.

    Returns the first result the subflow yields, or ``None`` implicitly if
    it yields nothing.

    Raises InvalidCustomRunDataFlowContext when the first input's definition
    is not of primitive ``string``, and InvalidCustomRunDataFlowOutputs when
    the result keys do not match the operation's output names.
    """
    # TODO Move string primitive validation into init of
    # an OperationImplementation (and then keep this as the context).
    op = self.parent.op
    ctx_input_name, ctx_definition = list(op.inputs.items())[0]
    if ctx_definition.primitive != "string":
        raise InvalidCustomRunDataFlowContext(ctx_definition.export())

    # The value of the first input is the context the subflow runs under
    subflow_ctx = inputs[ctx_input_name]
    ctx_inputs = []
    subflow_inputs = {subflow_ctx: ctx_inputs}
    ctx_inputs.extend(
        Input(value=value, definition=op.inputs[input_name])
        for input_name, value in inputs.items()
    )

    expected_outputs = sorted(op.outputs.keys())

    async with self.subflow(self.config.dataflow) as octx:
        async for _ctx, result in octx.run(subflow_inputs):
            if sorted(result.keys()) != expected_outputs:
                raise InvalidCustomRunDataFlowOutputs(ctx_definition.export())
            # Only the first yielded result is used
            return result
async def setUp(self):
    """
    Create the ``stdout`` buffer and the two dataflows used by the tests:
    ``InputDataflow`` (AcceptUserInput + GetSingle) and ``OutputDataflow``
    (print_output + GetSingle).
    """
    # NOTE(review): the return value of the parent's setUp is not awaited;
    # confirm the parent implementation is synchronous.
    super().setUp()
    self.stdout = io.StringIO()

    # GetSingle should return what AcceptUserInput produced
    input_data_spec = Input(
        value=[AcceptUserInput.op.outputs["InputData"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    self.InputDataflow = DataFlow(
        operations={
            "AcceptUserInput": AcceptUserInput.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[input_data_spec],
        implementations={AcceptUserInput.op.name: AcceptUserInput},
    )
    self.OutputDataflow = DataFlow(
        operations={
            "print_output": print_output.op,
            "get_single": GetSingle.imp.op,
        },
        implementations={print_output.op.name: print_output.imp},
    )
async def test_validate(self):
    """
    Run ``self.dataflow`` with valid shape inputs and check the resulting
    ``mapping`` for the expected name, area and radius.
    """
    shape_inputs = [
        Input(value="unitcircle", definition=ShapeName),
        Input(value=1, definition=Radius),
        Input(value=3.14, definition=Pie),
    ]

    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(self.dataflow) as octx:
            async for _ctx, results in octx.run({"area": shape_inputs}):
                self.assertIn("mapping", results)
                mapping = results["mapping"]
                self.assertEqual(mapping["name"], "UNITCIRCLE")
                self.assertEqual(mapping["area"], 3.14)
                self.assertEqual(mapping["radius"], 1)
async def test_gen_with_input(self):
    """
    Start ``counter`` at 1 and check that the ``number`` result holds the
    values 1 through 5, in any order.
    """
    flow = DataFlow.auto(GetMulti, counter, echo_num)
    number_out_spec = Input(
        value=[echo_num.op.outputs["number_out"].name],
        definition=GetMulti.op.inputs["spec"],
    )
    flow.seed.append(number_out_spec)
    flow.implementations[counter.op.name] = counter.imp
    flow.implementations[echo_num.op.name] = echo_num.imp

    count_start = Input(value=1, definition=CountStart)

    async with MemoryOrchestrator.withconfig({}) as orch, orch(flow) as octx:
        async for _ctx, results in octx.run({"TestCount": [count_start]}):
            self.assertIn("number", results)
            # Compared as sets, so order and duplicates are not checked
            self.assertEqual({1, 2, 3, 4, 5}, set(results["number"]))
async def test_standard_scaler(self):
    """
    Scale a small two column dataset with ``standard_scaler`` and compare
    the result with the expected values.
    """
    unscaled = [[0, 0], [0, 0], [1, 1], [1, 1]]
    expected = [[-1, -1], [-1, -1], [1, 1], [1, 1]]
    result_name = standard_scaler.op.outputs["result"].name

    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=unscaled, definition=standard_scaler.op.inputs["data"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(standard_scaler, GetSingle), flow_inputs
    ):
        # NOTE(review): the comparison result is passed to assertTrue
        # directly, which presumes the operation returns a plain list
        # rather than an array - confirm.
        scaled = results[result_name]
        self.assertTrue(scaled == expected)
async def test_HuMoments(self):
    """
    Run ``HuMoments`` on ``self.INPUT_ARRAY`` and check that the result has
    shape ``(7,)``.
    """
    result_name = HuMoments.op.outputs["result"].name
    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=self.INPUT_ARRAY, definition=HuMoments.op.inputs["m"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(HuMoments, GetSingle), flow_inputs
    ):
        moments = results[result_name]
        self.assertEqual(moments.shape, (7,))
async def test_remove_whitespaces(self):
    """
    Run ``remove_whitespaces`` over strings with leading and trailing spaces
    and compare the result element-wise with the stripped strings.
    """
    padded = [[" ABC ", "XYD "], [" ABC", " XYD "]]
    expected = [["ABC", "XYD"], ["ABC", "XYD"]]
    result_name = remove_whitespaces.op.outputs["result"].name

    flow_inputs = [
        Input(value=[result_name], definition=GetSingle.op.inputs["spec"]),
        Input(value=padded, definition=remove_whitespaces.op.inputs["data"]),
    ]

    async for _ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(remove_whitespaces, GetSingle), flow_inputs
    ):
        stripped = results[result_name]
        # The comparison yields an element-wise result, hence the .all()
        self.assertTrue((stripped == expected).all())