async def multicomm_dataflow(self, config, request):
    # Seed the network with inputs given by caller
    # TODO(p0,security) allowlist of valid definitions to seed (set
    # Input.origin to something other than seed)
    inputs = []
    # If data was sent add those inputs
    if request.method == "POST":
        # Accept a list of input data
        # TODO validate that input data is dict of list of inputs each item
        # has definition and value properties
        for ctx, client_inputs in (await request.json()).items():
            for input_data in client_inputs:
                if input_data["definition"] not in config.dataflow.definitions:
                    return web.json_response(
                        {
                            "error": f"Missing definition for {input_data['definition']} in dataflow"
                        },
                        status=HTTPStatus.NOT_FOUND,
                    )
            inputs.append(
                MemoryInputSet(
                    MemoryInputSetConfig(
                        ctx=StringInputSetContext(ctx),
                        inputs=[
                            Input(
                                value=input_data["value"],
                                definition=config.dataflow.definitions[
                                    input_data["definition"]
                                ],
                            )
                            for input_data in client_inputs
                        ],
                    )
                )
            )
    # Run the operation in an orchestrator
    # TODO(dfass) Create the orchestrator on startup of the HTTP API itself
    async with MemoryOrchestrator.basic_config() as orchestrator:
        # TODO(dfass) Create octx on dataflow registration
        async with orchestrator(config.dataflow) as octx:
            results = {
                str(ctx): result async for ctx, result in octx.run(*inputs)
            }
            # TODO Implement input and presentation stages?
            """
            if config.presentation == "blob":
                return web.Response(body=results)
            elif config.presentation == "text":
                return web.Response(text=results)
            else:
            """
            return web.json_response(results)
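# A minimal sketch (not from the source) of the JSON body the handler above
# expects: a mapping of context string to a list of items, each carrying the
# name of a definition present in config.dataflow.definitions and the value
# to seed into that context. The path, context name, and definition name
# below are hypothetical.
#
#     POST /mydataflow
#     {
#         "my_context": [
#             {"definition": "URL", "value": "https://example.com"}
#         ]
#     }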
async def test_get_noun_chunks(self):
    input_sentence = (
        "The end is the beginning , and the beginning is the end"
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_noun_chunks, GetSingle),
        [
            Input(
                value=[get_noun_chunks.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=get_noun_chunks.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_noun_chunks.op.inputs["spacy_model"],
            ),
        ],
    ):
        noun_chunks = results[get_noun_chunks.op.outputs["result"].name]
        self.assertEqual(len(noun_chunks), 4)
async def test_run(self):
    calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        async with orchestrator() as octx:
            for to_calc in calc_strings_check.keys():
                await octx.ictx.sadd(
                    to_calc,
                    Input(
                        value=to_calc,
                        definition=parse_line.op.inputs["line"],
                    ),
                    Input(
                        value=[add.op.outputs["sum"].name],
                        definition=GetSingle.op.inputs["spec"],
                    ),
                )
            async for ctx, results in octx.run_operations():
                ctx_str = (await ctx.handle()).as_string()
                results = results[GetSingle.op.name]
                self.assertEqual(
                    calc_strings_check[ctx_str],
                    results[add.op.outputs["sum"].name],
                )
async def setUp(self):
    self.dataflow = DataFlow(
        operations={
            "get_circle": get_circle.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[
            Input(
                value=[get_circle.op.outputs["shape"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
        implementations={"get_circle": get_circle.imp},
    )
async def test_run(self):
    dataflow = DataFlow.auto(*OPIMPS)
    calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(dataflow) as octx:
            async for ctx, results in octx.run(
                {
                    to_calc: [
                        Input(
                            value=to_calc,
                            definition=calc_parse_line.op.inputs["line"],
                        ),
                        Input(
                            value=[calc_add.op.outputs["sum"].name],
                            definition=GetSingle.op.inputs["spec"],
                        ),
                    ]
                    for to_calc in calc_strings_check.keys()
                }
            ):
                ctx_str = (await ctx.handle()).as_string()
                self.assertEqual(
                    calc_strings_check[ctx_str],
                    results[calc_add.op.outputs["sum"].name],
                )
async def test_print_output(self):
    test_inputs = [
        Input(
            value="Testing print_output",
            definition=self.OutputDataflow.definitions["DataToPrint"],
            parents=None,
        )
    ]
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(self.OutputDataflow) as octx:
            with contextlib.redirect_stdout(self.stdout):
                async for ctx_str, _ in octx.run(test_inputs):
                    results = self.stdout.getvalue()
                    self.assertIn("Testing print_output", results)
async def test_one_hot_encoder(self):
    categories = [["Male", "Female"], [1, 2, 3]]
    input_data = [["Female", 1], ["Male", 3]]
    output_data = [[0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(one_hot_encoder, GetSingle),
        [
            Input(
                value=[one_hot_encoder.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=one_hot_encoder.op.inputs["data"],
            ),
            Input(
                value=categories,
                definition=one_hot_encoder.op.inputs["categories"],
            ),
        ],
    ):
        self.assertTrue(
            (
                results[one_hot_encoder.op.outputs["result"].name]
                == output_data
            ).all()
        )
async def test_lemmatizer(self):
    input_sentence = (
        "The end is the beginning , and the beginning is the end"
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(lemmatizer, GetSingle),
        [
            Input(
                value=[lemmatizer.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=lemmatizer.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=lemmatizer.op.inputs["spacy_model"],
            ),
        ],
    ):
        lemma_list = results[lemmatizer.op.outputs["result"].name]
        self.assertEqual(len(input_sentence.split()), len(lemma_list))
async def test_run(self):
    packages = {
        "http://pkg.freebsd.org/FreeBSD:13:amd64/latest/All/ImageMagick7-7.0.8.48.txz": {},
        "https://download.clearlinux.org/releases/10540/clear/x86_64/os/Packages/sudo-setuid-1.8.17p1-34.x86_64.rpm": {
            "./usr/bin/sudo": True
        },
        "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/20/Everything/x86_64/os/Packages/c/curl-7.32.0-3.fc20.x86_64.rpm": {
            "./usr/bin/curl": False
        },
    }
    found = dict(zip(packages.keys(), [False] * len(packages)))
    async for ctx, results in MemoryOrchestrator.run(
        dataflow,
        {
            URL: [
                Input(value=URL, definition=URLToURLBytes.op.inputs["URL"]),
                Input(
                    value=["rpm_filename", "binary_is_PIE"],
                    definition=Associate.op.inputs["spec"],
                ),
            ]
            for URL in packages
        },
        strict=True,
    ):
        package_url = (await ctx.handle()).as_string()
        with self.subTest(package_url=package_url):
            self.assertIn("binary_is_PIE", results)
            self.assertDictEqual(
                results["binary_is_PIE"], packages[package_url]
            )
        found[package_url] = True
    self.assertTrue(
        all(found.values()), f"Not all packages were analyzed: {found}"
    )
async def test_tfidf_vectorizer(self):
    input_sentence = [
        "The end is the beginning. The beginning is the end."
    ]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(tfidf_vectorizer, GetSingle),
        [
            Input(
                value=[tfidf_vectorizer.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=tfidf_vectorizer.op.inputs["text"],
            ),
            Input(
                value=[1, 1],
                definition=count_vectorizer.op.inputs["ngram_range"],
            ),
            Input(
                value=True,
                definition=tfidf_vectorizer.op.inputs["get_feature_names"],
            ),
        ],
    ):
        vectors = results[tfidf_vectorizer.op.outputs["result"].name][0]
        features = results[tfidf_vectorizer.op.outputs["result"].name][1]
        self.assertTrue(isinstance(features, list))
        self.assertTrue(isinstance(vectors, np.ndarray))
        unique_tokens = list(
            set(input_sentence[0].lower().replace(".", "").split())
        )
        self.assertEqual(len(vectors[0]), len(unique_tokens))
        self.assertEqual(
            set(features).intersection(set(unique_tokens)), set(features)
        )
async def test_resize(self):
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(resize, GetSingle),
        [
            Input(
                value=[resize.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=self.INPUT_ARRAY,
                definition=resize.op.inputs["src"],
            ),
            Input(
                value=[50, 50, 3],
                definition=resize.op.inputs["dsize"],
            ),
        ],
    ):
        self.assertEqual(
            results[resize.op.outputs["result"].name].shape, (50, 50, 3)
        )
async def test_get_embedding(self):
    input_sentence = (
        "The end is the beginning , and the beginning is the end"
    )
    max_sentence_len = 15
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_embedding, GetSingle),
        [
            Input(
                value=[get_embedding.op.outputs["embedding"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=get_embedding.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_embedding.op.inputs["spacy_model"],
            ),
            Input(
                value=max_sentence_len,
                definition=get_embedding.op.inputs["max_len"],
            ),
            Input(
                value="<PAD>",
                definition=get_embedding.op.inputs["pad_token"],
            ),
        ],
    ):
        embeddings = results[get_embedding.op.outputs["embedding"].name]
        self.assertEqual(max_sentence_len, len(embeddings))
        # Two randomly chosen embeddings should have the same shape
        self.assertEqual(
            embeddings[randint(0, max_sentence_len - 1)].shape,
            embeddings[randint(0, max_sentence_len - 1)].shape,
        )
async def records(self) -> AsyncIterator[Record]:
    async for record in self.sctx.records():
        async for ctx, result in MemoryOrchestrator.run(
            self.parent.config.dataflow,
            [
                Input(
                    value=record.feature(feature.name),
                    definition=Definition(
                        name=feature.name, primitive=str(feature.dtype())
                    ),
                )
                for feature in self.parent.config.features
            ],
        ):
            if result:
                record.evaluated(result)
        yield record
async def test_run(self):
    repos = [
        "http://pkg.freebsd.org/FreeBSD:13:amd64/latest/All/ImageMagick7-7.0.8.48.txz",
        "https://download.clearlinux.org/releases/10540/clear/x86_64/os/Packages/sudo-setuid-1.8.17p1-34.x86_64.rpm",
        "https://rpmfind.net/linux/fedora/linux/updates/29/Everything/x86_64/Packages/g/gzip-1.9-9.fc29.x86_64.rpm",
        "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/20/Everything/x86_64/os/Packages/c/curl-7.32.0-3.fc20.x86_64.rpm",
    ]
    dataflow = DataFlow.auto(
        URLToURLBytes,
        files_in_rpm,
        urlbytes_to_rpmfile,
        urlbytes_to_tarfile,
        is_binary_pie,
        Associate,
        cleanup_rpm,
    )
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)
        async with orchestrator(dataflow) as octx:
            async for ctx, results in octx.run(
                {
                    URL: [
                        Input(value=URL, definition=definitions["URL"]),
                        Input(
                            value=["rpm_filename", "binary_is_PIE"],
                            definition=definitions["associate_spec"],
                        ),
                    ]
                    for URL in repos
                },
                strict=True,
            ):
                self.assertTrue(results)
async def setUp(self):
    dataflow = DataFlow(
        operations={
            "announce": announce.op,
            "get_single": GetSingle.imp.op,
        },
        seed=[
            Input(
                value=[announce.op.outputs["string_out"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
        implementations={announce.op.name: announce.imp},
    )
    self.dataflow = dataflow
async def test_condition_does_not_run(self):
    ran = []

    @op(conditions=[CONDITION])
    async def condition_test(hi: str):
        ran.append(True)

    async with MemoryOrchestrator() as orchestrator:
        async with orchestrator(DataFlow(condition_test)) as octx:
            async for _ in octx.run(
                [
                    Input(
                        value=True,
                        definition=condition_test.op.inputs["hi"],
                    )
                ]
            ):
                pass

    self.assertFalse(ran)
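# CONDITION is assumed (it is not shown in this snippet) to be a boolean
# Definition along the lines of the sketch below. Because the test only
# seeds the "hi" input and never an input matching CONDITION, the
# condition is never satisfied, condition_test never runs, and ran stays
# empty.
#
#     CONDITION = Definition(name="condition", primitive="boolean")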
async def run_dataflow(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    inputs_created = {}
    definitions = self.config.dataflow.definitions
    for ctx_str, val_defs in inputs.items():
        inputs_created[ctx_str] = [
            Input(
                value=val_def["value"],
                definition=definitions[val_def["definition"]],
            )
            for val_def in val_defs
        ]
    async with self.octx.parent(self.config.dataflow) as octx:
        results = [
            {(await ctx.handle()).as_string(): result}
            async for ctx, result in octx.run(inputs_created)
        ]
    return {"results": results}
async def run(self):
    # Create an Orchestrator which will manage the running of our operations
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        # Create an orchestrator context; everything in DFFML follows this
        # one-two context entry pattern
        async with orchestrator(DATAFLOW) as octx:
            # Run all the operations. Each iteration of this loop happens
            # when all inputs are exhausted for a context; the output
            # operations are then run and their results are yielded
            async for package_name, results in octx.run(
                {
                    # For each package add a new input set to the input
                    # network. The context operations execute under is the
                    # package name to evaluate. Contexts ensure that data
                    # pertaining to package A doesn't mingle with data
                    # pertaining to package B
                    package_name: [
                        # The only input to the operations is the package
                        # name.
                        Input(
                            value=package_name,
                            definition=pypi_package_json.op.inputs[
                                "package"
                            ],
                        )
                    ]
                    for package_name in self.packages
                }
            ):
                # Grab the number of safety issues and the bandit report
                # from the results dict
                safety_issues = results[
                    safety_check.op.outputs["issues"].name
                ]
                bandit_report = results[
                    run_bandit.op.outputs["report"].name
                ]
                # Decide if those numbers mean we should stop ship or not
                if (
                    safety_issues > 0
                    or bandit_report["CONFIDENCE.HIGH_AND_SEVERITY.HIGH"]
                    > 5
                ):
                    print(f"Do not install {package_name}!")
                    for definition_name, result in results.items():
                        print(f"    {definition_name}: {result}")
                else:
                    print(f"{package_name} is okay to install")
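# Hedged usage sketch (not from the source): if the Install command above is
# wired in as the "install" subcommand of a CLI named, say, shouldi, an
# invocation might look like:
#
#     $ shouldi install insecure-package
#
# which, per the branch at the end of run(), prints either
# "Do not install insecure-package!" followed by the per-definition results,
# or "insecure-package is okay to install".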
async def run_dataflow(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ Starts a subflow `self.config.dataflow` and runs `inputs` in it. Parameters: inputs: Dict[str,Any] -> eg: { "ctx_str" : [ { "value":val1, "defintion":defintion1 }, { "value":val2, "defintion":defintion2 } ] } Returns: Dict[str,Any] -> maps context strings in inputs to output after running through dataflow """ inputs_created = {} definitions = self.config.dataflow.definitions for ctx_str, val_defs in inputs.items(): inputs_created[ctx_str] = [ Input( value=val_def["value"], definition=definitions[val_def["definition"]], ) for val_def in val_defs ] async with self.subflow(self.config.dataflow) as octx: results = [{ (await ctx.handle()).as_string(): result } async for ctx, result in octx.run(inputs_created)] return {"results": results}
async def run_default(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """
    The default implementation for the dataflow.run operation is the uctx
    mode. This mode is when we map unique strings to a list of inputs to be
    given to the respective string's context.
    """
    inputs_created = {}
    definitions = self.config.dataflow.definitions
    for ctx_str, val_defs in inputs.items():
        inputs_created[ctx_str] = [
            Input(
                value=val_def["value"],
                definition=definitions[val_def["definition"]],
            )
            for val_def in val_defs
        ]
    async with self.subflow(self.config.dataflow) as octx:
        results = [
            {(await ctx.handle()).as_string(): result}
            async for ctx, result in octx.run(inputs_created)
        ]
    return {"results": results}
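# A minimal sketch (assumed, not from the source) of the ``inputs`` mapping
# run_default above accepts in uctx mode. Each unique context string maps to
# the value/definition pairs seeded into that context; "URL" is a
# hypothetical definition name that would need to exist in
# self.config.dataflow.definitions:
#
#     {
#         "context_a": [{"value": "https://example.com", "definition": "URL"}],
#         "context_b": [{"value": "https://example.org", "definition": "URL"}],
#     }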
    pypi_package_url,
    pypi_package_contents,
    cleanup_pypi_package,
    safety_check,
    run_bandit,
    GetSingle,
)

# Seed inputs are added to each executing context. The following Input tells
# the GetSingle output operation that we want the output of the network to
# include data matching the "issues" output of the safety_check operation,
# and the "report" output of the run_bandit operation, for each context.
DATAFLOW.seed.append(
    Input(
        value=[
            safety_check.op.outputs["issues"].name,
            run_bandit.op.outputs["report"].name,
        ],
        definition=GetSingle.op.inputs["spec"],
    )
)


class Install(CMD):

    arg_packages = Arg(
        "packages", nargs="+", help="Package to check if we should install"
    )

    async def run(self):
        # Create an Orchestrator which will manage the running of our
        # operations
        async with MemoryOrchestrator.withconfig({}) as orchestrator:
            # Create an orchestrator context, everything in DFFML follows this
async def test_run(self): calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420} # TODO(p0) Implement and test asyncgenerator callstyles_no_expand = [ "asyncgenerator", "dict", "dict_custom_input_set_context", ] callstyles = { "dict": { to_calc: [ Input(value=to_calc, definition=parse_line.op.inputs["line"]), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ] for to_calc in calc_strings_check.keys() }, "dict_custom_input_set_context": { CustomInputSetContext(to_calc): [ Input(value=to_calc, definition=parse_line.op.inputs["line"]), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ] for to_calc in calc_strings_check.keys() }, "list_input_sets": [ MemoryInputSet( MemoryInputSetConfig( ctx=StringInputSetContext(to_calc), inputs=[ Input( value=to_calc, definition=parse_line.op.inputs["line"], ), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ], )) for to_calc in calc_strings_check.keys() ], "uctx": [[ Input(value=to_calc, definition=parse_line.op.inputs["line"]), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ] for to_calc in calc_strings_check.keys()], } async with self.create_octx() as octx: for callstyle, inputs in callstyles.items(): with self.subTest(callstyle=callstyle): if callstyle in callstyles_no_expand: run_coro = self.run_dataflow(octx, inputs) else: run_coro = self.run_dataflow(octx, *inputs) async for ctx, results in run_coro: ctx_str = (await ctx.handle()).as_string() if callstyle == "uctx": self.assertIn( results[add.op.outputs["sum"].name], dict( zip( calc_strings_check.values(), calc_strings_check.keys(), )), ) else: if callstyle == "dict_custom_input_set_context": self.assertTrue( isinstance(ctx, CustomInputSetContext)) self.assertEqual( calc_strings_check[ctx_str], results[add.op.outputs["sum"].name], )
async def test_run(self): calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420} dataflow = DataFlow.auto(*OPIMPS) # TODO(p0) Implement and test asyncgenerator callstyles_no_expand = ["asyncgenerator", "dict"] callstyles = { "dict": { to_calc: [ Input(value=to_calc, definition=parse_line.op.inputs["line"]), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ] for to_calc in calc_strings_check.keys() }, "list_input_sets": [ MemoryInputSet( MemoryInputSetConfig( ctx=StringInputSetContext(to_calc), inputs=[ Input( value=to_calc, definition=parse_line.op.inputs["line"], ), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ], )) for to_calc in calc_strings_check.keys() ], "uctx": [[ Input(value=to_calc, definition=parse_line.op.inputs["line"]), Input( value=[add.op.outputs["sum"].name], definition=GetSingle.op.inputs["spec"], ), ] for to_calc in calc_strings_check.keys()], } async with MemoryOrchestrator.withconfig({}) as orchestrator: async with orchestrator(dataflow) as octx: for callstyle, inputs in callstyles.items(): with self.subTest(callstyle=callstyle): if callstyle in callstyles_no_expand: run_coro = octx.run(inputs) else: run_coro = octx.run(*inputs) async for ctx, results in run_coro: ctx_str = (await ctx.handle()).as_string() if callstyle == "uctx": self.assertIn( results[add.op.outputs["sum"].name], dict( zip( calc_strings_check.values(), calc_strings_check.keys(), )), ) else: self.assertEqual( calc_strings_check[ctx_str], results[add.op.outputs["sum"].name], )
async def _multicomm_dataflow(self, config, request):
    # Seed the network with inputs given by caller
    # TODO(p0,security) allowlist of valid definitions to seed (set
    # Input.origin to something other than seed)
    inputs = []
    # If data was sent add those inputs
    if request.method == "POST":
        # Accept a list of input data according to config.input_mode
        if config.input_mode == "default":
            # TODO validate that input data is dict of list of inputs each
            # item has definition and value properties
            for ctx, client_inputs in (await request.json()).items():
                for input_data in client_inputs:
                    if (
                        input_data["definition"]
                        not in config.dataflow.definitions
                    ):
                        return web.json_response(
                            {
                                "error": f"Missing definition for {input_data['definition']} in dataflow"
                            },
                            status=HTTPStatus.NOT_FOUND,
                        )
                inputs.append(
                    MemoryInputSet(
                        MemoryInputSetConfig(
                            ctx=StringInputSetContext(ctx),
                            inputs=[
                                Input(
                                    value=input_data["value"],
                                    definition=config.dataflow.definitions[
                                        input_data["definition"]
                                    ],
                                )
                                for input_data in client_inputs
                            ]
                            + (
                                [
                                    Input(
                                        value=request.headers,
                                        definition=config.dataflow.definitions[
                                            config.forward_headers
                                        ],
                                    )
                                ]
                                if config.forward_headers
                                else []
                            ),
                        )
                    )
                )
        elif ":" in config.input_mode:
            preprocess_mode, *input_def = config.input_mode.split(":")
            input_def = ":".join(input_def)
            if input_def not in config.dataflow.definitions:
                return web.json_response(
                    {
                        "error": f"Missing definition for {input_def} in dataflow"
                    },
                    status=HTTPStatus.NOT_FOUND,
                )
            if preprocess_mode == "json":
                value = await request.json()
            elif preprocess_mode == "text":
                value = await request.text()
            elif preprocess_mode == "bytes":
                value = await request.read()
            elif preprocess_mode == "stream":
                value = request.content
            else:
                return web.json_response(
                    {
                        "error": f"preprocess tag must be one of {self.IO_MODES}, got {preprocess_mode}"
                    },
                    status=HTTPStatus.NOT_FOUND,
                )
            inputs.append(
                MemoryInputSet(
                    MemoryInputSetConfig(
                        ctx=StringInputSetContext("post_input"),
                        inputs=[
                            Input(
                                value=value,
                                definition=config.dataflow.definitions[
                                    input_def
                                ],
                            )
                        ]
                        + (
                            [
                                Input(
                                    value=request.headers,
                                    definition=config.dataflow.definitions[
                                        config.forward_headers
                                    ],
                                )
                            ]
                            if config.forward_headers
                            else []
                        ),
                    )
                )
            )
        else:
            raise NotImplementedError(
                "Input modes other than default,preprocess:definition_name"
                " not yet implemented"
            )
    # Run the operation in an orchestrator
    # TODO(dfass) Create the orchestrator on startup of the HTTP API itself
    async with MemoryOrchestrator() as orchestrator:
        # TODO(dfass) Create octx on dataflow registration
        async with orchestrator(config.dataflow) as octx:
            results = {
                str(ctx): result async for ctx, result in octx.run(*inputs)
            }
            if config.output_mode == "json":
                return web.json_response(results)
            # content_info is [content_type, output_keys] for the bytes
            # mode; for stream and text it holds other mode-specific info
            postprocess_mode, *content_info = config.output_mode.split(":")
            if postprocess_mode == "stream":
                # stream:text/plain:get_single.beef
                raise NotImplementedError("output mode not yet implemented")
            elif postprocess_mode == "bytes":
                content_type, output_keys = content_info
                output_data = traverse_get(results, output_keys)
                return web.Response(body=output_data)
            elif postprocess_mode == "text":
                output_data = traverse_get(results, content_info[0])
                return web.Response(text=output_data)
            else:
                return web.json_response(
                    {"error": "output mode not valid"},
                    status=HTTPStatus.NOT_FOUND,
                )
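# Hedged summary, derived from the branches in the handler above, of the
# mode strings it understands ("my_definition" is a hypothetical definition
# name that must exist in config.dataflow.definitions):
#
#     input_mode = "default"               # JSON body of {ctx: [{definition, value}]}
#     input_mode = "json:my_definition"    # whole body via request.json() as one Input
#     input_mode = "text:my_definition"    # whole body via request.text()
#     input_mode = "bytes:my_definition"   # whole body via request.read()
#     input_mode = "stream:my_definition"  # request.content passed through as-is
#
#     output_mode = "json"                     # results as web.json_response
#     output_mode = "bytes:content_type:keys"  # traverse_get(results, keys) as body
#     output_mode = "text:keys"                # traverse_get(results, keys) as text
#     output_mode = "stream:..."               # not yet implemented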
HELLO_BLANK_DATAFLOW = DataFlow(
    operations={
        "hello_blank": formatter.op,
        "remap_to_response": remap.op,
    },
    configs={
        "hello_blank": {"formatting": "Hello {}"},
        "remap_to_response": {
            "dataflow": DataFlow(
                operations={"get_formatted_message": GetSingle.op},
                seed=[
                    Input(
                        value=[formatter.op.outputs["string"].name],
                        definition=GetSingle.op.inputs["spec"],
                    )
                ],
            )
        },
    },
    seed=[
        Input(
            value={"response": [formatter.op.outputs["string"].name]},
            definition=remap.op.inputs["spec"],
        )
    ],
)

HELLO_WORLD_DATAFLOW = copy.deepcopy(HELLO_BLANK_DATAFLOW)
HELLO_WORLD_DATAFLOW.seed.append(
            }],
        }),
        "predict_using_model": InputFlow(
            inputs={"features": [{"create_feature_map": "mapping"}]}
        ),
        "print_predictions": InputFlow(
            inputs={"data": [{"predict_using_model": "prediction"}]}
        ),
    },
)
dataflow.seed.append(
    Input(
        value="Years",
        definition=create_mapping.op.inputs["key"],
        origin="seed.Years",
    )
)


async def main():
    # train the model
    await train(
        slr_model,
        {"Years": 0, "Salary": 10},
        {"Years": 1, "Salary": 20
"edit_feature": InputFlow( inputs={ "features": [ {"seed": ["Years", "Expertise", "Trust", "Salary"]} ] }, ), "associate_definition": InputFlow(inputs={"spec": ["seed"]}), }, ) TEST_DATAFLOW1.seed = [ # I don't think we need this as we are providing the flow Input( value={ feature.name: edit_feature.op.outputs["updated_features"].name for feature in TEST_FEATURE }, definition=AssociateDefinition.op.inputs["spec"], ) ] class TestDataFlowSource(AsyncTestCase): @classmethod def setUpClass(self): self.records = [ Record( str(i), data={ "features": { "Years": A[i],
async def test_run(self):
    linker = Linker()
    exported = linker.export(*OPERATIONS)
    definitions, operations, _outputs = linker.resolve(exported)

    # Instantiate inputs
    repos = glob.glob(
        os.path.join(
            os.path.expanduser("~"),
            "Documents",
            "python",
            "testrepos",
            "*",
        )
    )
    if not repos:
        repos = glob.glob(
            os.path.join(
                os.path.expanduser("~"), "Documents", "python", "dffml"
            )
        )
    if not repos:
        repos = [
            "https://github.com/intel/dffml",
            "https://github.com/pdxjohnny/dffml",
        ]
    repos = repos[:1]
    urls = [
        Input(value=URL, definition=definitions["URL"], parents=None)
        for URL in repos
    ]
    no_git_branch_given = Input(
        value=True,
        definition=definitions["no_git_branch_given"],
        parents=None,
    )
    date_spec = Input(
        value=datetime.now().strftime(TIME_FORMAT_MINTUE_RESOLUTION),
        definition=definitions["quarter_start_date"],
        parents=None,
    )
    quarters = [
        Input(value=i, definition=definitions["quarter"], parents=None)
        for i in range(0, 10)
    ]
    group_by_spec = Input(
        value={
            "cloc": {
                "group": "quarter",
                "by": "language_to_comment_ratio",
                "fill": 0,
            },
            "authors": {
                "group": "quarter",
                "by": "author_count",
                "fill": 0,
            },
            "work": {"group": "quarter", "by": "work_spread", "fill": 0},
            "release": {
                "group": "quarter",
                "by": "release_within_period",
                "fill": False,
            },
            "commits": {
                "group": "quarter",
                "by": "commit_count",
                "fill": 0,
            },
        },
        definition=definitions["group_by_spec"],
        parents=None,
    )

    # Orchestrate the running of these operations
    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        async with orchestrator() as octx:
            # Add our inputs to the input network with the context being
            # the URL
            for url in urls:
                await octx.ictx.sadd(
                    url.value,
                    url,
                    no_git_branch_given,
                    date_spec,
                    group_by_spec,
                    *quarters,
                )
            async for ctx, results in octx.run_operations():
                self.assertTrue(results)
async def run_dataflow(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ Starts a subflow ``self.config.dataflow`` and adds ``inputs`` in it. Parameters ---------- inputs : dict The inputs to add to the subflow. These should be a key value mapping of the context string to the inputs which should be seeded for that context string. Returns ------- dict Maps context strings in inputs to output after running through dataflow. Examples -------- >>> URL = Definition(name="URL", primitive="string") >>> >>> subflow = DataFlow.auto(GetSingle) >>> subflow.definitions[URL.name] = URL >>> subflow.seed.append( ... Input( ... value=[URL.name], ... definition=GetSingle.op.inputs["spec"] ... ) ... ) >>> >>> dataflow = DataFlow.auto(run_dataflow, GetSingle) >>> dataflow.configs[run_dataflow.imp.op.name] = RunDataFlowConfig(subflow) >>> dataflow.seed.append( ... Input( ... value=[run_dataflow.imp.op.outputs["results"].name], ... definition=GetSingle.op.inputs["spec"] ... ) ... ) >>> >>> async def main(): ... async for ctx, results in MemoryOrchestrator.run(dataflow, { ... "run_subflow": [ ... Input( ... value={ ... "dffml": [ ... { ... "value": "https://github.com/intel/dffml", ... "definition": URL.name ... } ... ] ... }, ... definition=run_dataflow.imp.op.inputs["inputs"] ... ) ... ] ... }): ... print(results) >>> >>> asyncio.run(main()) {'flow_results': {'dffml': {'URL': 'https://github.com/intel/dffml'}}} """ inputs_created = {} definitions = self.config.dataflow.definitions for ctx_str, val_defs in inputs.items(): inputs_created[ctx_str] = [ Input( value=val_def["value"], definition=definitions[val_def["definition"]], ) for val_def in val_defs ] async with self.subflow(self.config.dataflow) as octx: results = [{ (await ctx.handle()).as_string(): result } async for ctx, result in octx.run(inputs_created)] return {"results": results}
async def test_run(self):
    test_dataflow = DataFlow(
        operations={
            "run_dataflow": run_dataflow.op,
            "get_single": GetSingle.imp.op,
        },
        configs={"run_dataflow": RunDataFlowConfig(dataflow=DATAFLOW)},
        seed=[
            Input(
                value=[run_dataflow.op.outputs["results"].name],
                definition=GetSingle.op.inputs["spec"],
            )
        ],
    )
    test_inputs = [
        {
            "add_op": [
                {
                    "value": "add 40 and 2",
                    "definition": parse_line.op.inputs["line"].name,
                },
                {
                    "value": [add.op.outputs["sum"].name],
                    "definition": GetSingle.op.inputs["spec"].name,
                },
            ]
        },
        {
            "mult_op": [
                {
                    "value": "multiply 42 and 10",
                    "definition": parse_line.op.inputs["line"].name,
                },
                {
                    "value": [mult.op.outputs["product"].name],
                    "definition": GetSingle.op.inputs["spec"].name,
                },
            ]
        },
    ]
    test_outputs = {"add_op": 42, "mult_op": 420}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(test_dataflow) as octx:
            async for _ctx, results in octx.run(
                {
                    list(test_input.keys())[0]: [
                        Input(
                            value=test_input,
                            definition=run_dataflow.op.inputs["inputs"],
                        )
                    ]
                    for test_input in test_inputs
                }
            ):
                ctx_str = (await _ctx.handle()).as_string()
                self.assertIn("flow_results", results)
                results = results["flow_results"]
                self.assertIn(ctx_str, map(str, results.keys()))
                self.assertIn(ctx_str, test_outputs)
                results = results[list(results.keys())[0]]
                self.assertIn("result", results)
                results = results["result"]
                expected_results = test_outputs[ctx_str]
                self.assertEqual(expected_results, results)