def test_resolve_missing_condition_definition(self):
    """Deserializing a linked export must fail loudly when an operation's
    condition references a definition absent from the definitions table."""
    serialized = DataFlow.auto(add).export(linked=True)
    # Remove the condition definition that the add operation points at.
    del serialized["definitions"]["is_add"]
    with self.assertRaisesRegex(
        DefinitionMissing, "add.conditions.*is_add"
    ):
        DataFlow._fromdict(**serialized)
async def setUp(self):
    """Build the dataflow under test and a fake webhook payload."""
    self.dataflow = DataFlow.auto(*OPIMPS)
    # Seed: request the restarted-containers output and satisfy the
    # clone condition up front.
    self.dataflow.seed.extend(
        [
            Input(
                value=[
                    restart_running_containers.op.outputs["containers"].name
                ],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(value=True, definition=clone_git_repo.op.conditions[0]),
        ]
    )
    # Fake push-event payload pointing at the test repository.
    payload = {
        "ref": "refs/main",
        "repository": {
            "clone_url": f"https://github.com/{USER}/{REPO}.git",
            "default_branch": "main",
            "html_url": f"https://github.com/{USER}/{REPO}",
        },
    }
    self.test_inputs = {
        "TestRun": [
            Input(
                value=payload,
                definition=check_secret_match.op.outputs["git_payload"],
            )
        ]
    }
    self.containers_to_remove = []
async def test_associatedefinition(self):
    """Values are selected by the definition name of their parent input."""
    feed_def = Definition(name="feed", primitive="string")
    dead_def = Definition(name="dead", primitive="string")
    output = Definition(name="output", primitive="string")
    # Two chains: feed -> face and dead -> beef, both ending in "output".
    feed_input = Input(value="my favorite value", definition=feed_def)
    face_input = Input(
        value="face", definition=output, parents=[feed_input]
    )
    dead_input = Input(
        value="my second favorite value", definition=dead_def
    )
    beef_input = Input(
        value="beef", definition=output, parents=[dead_input]
    )
    expected = {"feed": "face", "dead": "beef"}
    for source_name, expected_value in expected.items():
        spec_input = Input(
            value={source_name: "output"},
            definition=AssociateDefinition.op.inputs["spec"],
        )
        async for ctx, results in MemoryOrchestrator.run(
            DataFlow.auto(AssociateDefinition),
            [feed_input, face_input, dead_input, beef_input, spec_input],
        ):
            self.assertEqual(results, {source_name: expected_value})
async def test_simple_imputer(self):
    """Mean-strategy imputation fills each NaN with its column mean."""
    data_with_gaps = [[np.nan, 2], [6, np.nan], [7, 6]]
    expected = [[6.5, 2], [6, 4], [7, 6]]
    flow_inputs = [
        Input(
            value=[simple_imputer.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(
            value=data_with_gaps,
            definition=simple_imputer.op.inputs["data"],
        ),
        Input(
            value=np.nan,
            definition=simple_imputer.op.inputs["missing_values"],
        ),
        Input(
            value="mean",
            definition=simple_imputer.op.inputs["strategy"],
        ),
    ]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(simple_imputer, GetSingle), flow_inputs
    ):
        imputed = results[simple_imputer.op.outputs["result"].name]
        self.assertTrue((imputed == expected).all())
async def test_get_embedding(self):
    """The embedding op yields one vector per whitespace-delimited token."""
    text = "The end is the beginning , and the beginning is the end"
    token_count = len(text.split())
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_embedding, GetSingle),
        [
            Input(
                value=[get_embedding.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=text,
                definition=get_embedding.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_embedding.op.inputs["spacy_model"],
            ),
        ],
    ):
        embeddings = results[get_embedding.op.outputs["result"].name]
        self.assertEqual(token_count, len(embeddings))
        # Any two randomly sampled vectors should share the same shape.
        self.assertEqual(
            embeddings[randint(0, token_count - 1)].shape,
            embeddings[randint(0, token_count - 1)].shape,
        )
async def setUp(self):
    """Build the dataflow under test and a fake push-event payload."""
    self.dataflow = DataFlow.auto(*OPIMPS)
    # Ask GetSingle for the names of the restarted containers.
    self.dataflow.seed += [
        Input(
            value=[
                restart_running_containers.op.outputs["containers"].name
            ],
            definition=GetSingle.op.inputs["spec"],
        )
    ]
    # Fake payload for the test repository's master branch.
    payload = {
        "ref": "refs/master",
        "repository": {
            "clone_url": f"https://github.com/{USER}/{REPO}.git",
            "default_branch": "master",
            "html_url": f"https://github.com/{USER}/{REPO}",
        },
    }
    self.test_inputs = {
        "TestRun": [
            Input(
                value=payload,
                definition=get_url_from_payload.op.inputs["payload"],
            )
        ]
    }
    self.containers_to_remove = []
def test_export(self):
    """A linked export contains the add operation plus its definitions."""
    exported = DataFlow.auto(add).export(linked=True)
    # Operations section
    self.assertIn("operations", exported)
    self.assertIn("tests.test_df:add", exported["operations"])
    add_op = exported["operations"]["tests.test_df:add"]
    self.assertIn("inputs", add_op)
    self.assertIn("outputs", add_op)
    self.assertIn("conditions", add_op)
    self.assertIn("is_add", add_op["conditions"])
    self.assertIn("numbers", add_op["inputs"])
    self.assertEqual("numbers", add_op["inputs"]["numbers"])
    self.assertIn("sum", add_op["outputs"])
    self.assertEqual("result", add_op["outputs"]["sum"])
    # Definitions section
    self.assertIn("definitions", exported)
    definitions = exported["definitions"]
    self.assertIn("numbers", definitions)
    self.assertIn("primitive", definitions["numbers"])
    self.assertEqual("List[int]", definitions["numbers"]["primitive"])
    self.assertIn("result", definitions)
    self.assertIn("primitive", definitions["result"])
    self.assertEqual("int", definitions["result"]["primitive"])
async def test_calcHist(self):
    """A 2-channel histogram with 32 bins per channel is 32x32."""
    flow_inputs = [
        Input(
            value=[calcHist.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(
            value=self.INPUT_ARRAY,
            definition=calcHist.op.inputs["images"],
        ),
        Input(value=None, definition=calcHist.op.inputs["mask"]),
        Input(value=[0, 1], definition=calcHist.op.inputs["channels"]),
        Input(value=[32, 32], definition=calcHist.op.inputs["histSize"]),
        Input(
            value=[0, 256, 0, 256],
            definition=calcHist.op.inputs["ranges"],
        ),
    ]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(calcHist, GetSingle), flow_inputs
    ):
        histogram = results[calcHist.op.outputs["result"].name]
        self.assertEqual(histogram.shape, (32, 32))
async def test_convert_color(self):
    """BGR->RGB via the op, then RGB->BGR via cv2, round-trips pixels."""
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(convert_color, GetSingle),
        [
            Input(
                value=[convert_color.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=self.INPUT_ARRAY,
                definition=convert_color.op.inputs["src"],
            ),
            Input(
                value="BGR2RGB",
                definition=convert_color.op.inputs["code"],
            ),
        ],
    ):
        converted = results[convert_color.op.outputs["result"].name]
        round_tripped = cv2.cvtColor(converted, cv2.COLOR_RGB2BGR)
        self.assertEqual(
            round_tripped.flatten().tolist(),
            self.INPUT_ARRAY.flatten().tolist(),
        )
async def test_principal_component_analysis(self):
    """PCA reduces 10 features down to the requested 8 components."""
    samples, _ = make_classification(
        n_samples=10,
        n_features=10,
        n_informative=8,
        n_redundant=2,
        random_state=7,
    )
    result_name = principal_component_analysis.op.outputs["result"].name
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(principal_component_analysis, GetSingle),
        [
            Input(
                value=[result_name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=samples,
                definition=principal_component_analysis.op.inputs["data"],
            ),
            Input(
                value=8,
                definition=principal_component_analysis.op.inputs[
                    "n_components"
                ],
            ),
        ],
    ):
        self.assertEqual((10, 8), results[result_name].shape)
async def test_get_similarity(self):
    """Two near-identical sentences score above 0.9 similarity."""
    sentence_a = "The end is the beginning , and the beginning is the end"
    sentence_b = "The end was the beginning , and the beginning was the end"
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_similarity, GetSingle),
        [
            Input(
                value=[get_similarity.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=sentence_a,
                definition=get_similarity.op.inputs["text_1"],
            ),
            Input(
                value=sentence_b,
                definition=get_similarity.op.inputs["text_2"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_similarity.op.inputs["spacy_model"],
            ),
        ],
    ):
        score = results[get_similarity.op.outputs["result"].name]
        self.assertGreater(score, 0.9)
async def test_dataflow_usage_example(self):
    """Export two dataflows to JSON, merge them via the CLI, and verify
    the merged output deserializes cleanly.

    Mirrors the documented shouldi + feature/git dataflow merge example.
    """
    # Write out shouldi dataflow to a temp JSON file
    orig = self.mktempfile() + ".json"
    pathlib.Path(orig).write_text(json.dumps(self.DATAFLOW.export()))
    # Import operations from feature/git (loaded relative to this repo
    # layout rather than from an installed package)
    transform_to_repo = Operation.load("dffml.mapping.create")
    lines_of_code_by_language, lines_of_code_to_comments = list(
        load(
            "dffml_feature_git.feature.operations:lines_of_code_by_language",
            "dffml_feature_git.feature.operations:lines_of_code_to_comments",
            relative=relative_path("..", "..", "feature", "git"),
        )
    )
    # Create new dataflow from the freshly loaded operations
    override = DataFlow.auto(
        transform_to_repo,
        lines_of_code_by_language,
        lines_of_code_to_comments,
    )
    # TODO Modify and compare against yaml in docs example
    # Write out override dataflow
    created = self.mktempfile() + ".json"
    pathlib.Path(created).write_text(json.dumps(override.export()))
    # Merge the two via the CLI, capturing stdout to read the result
    with contextlib.redirect_stdout(self.stdout):
        await CLI.cli("dataflow", "merge", orig, created)
    # Round-trip: the merged JSON must deserialize without error
    DataFlow._fromdict(**json.loads(self.stdout.getvalue()))
async def test_run(self):
    """Hash a batch of random passwords through the dataflow, one input
    context per password, and check every context produced results.

    Fixes over the previous version: the raw password list is no longer
    shadowed by the list of Input wrappers, and the no-op
    ``try/except AttributeError: raise`` wrapper (which only re-raised)
    has been removed.
    """
    dataflow = DataFlow.auto(*OPIMPS)
    passwords = [str(random.random()) for _ in range(0, 20)]
    # Orchestrate the running of these operations
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)
        # Wrap each password in an Input carrying the UnhashedPassword
        # definition (kept in a separate variable from the raw strings).
        password_inputs = [
            Input(
                value=password,
                definition=definitions["UnhashedPassword"],
                parents=None,
            )
            for password in passwords
        ]
        # Ask get_single for the hashed (scrypt) password output.
        output_spec = Input(
            value=["ScryptPassword"],
            definition=definitions["get_single_spec"],
            parents=None,
        )
        async with orchestrator(dataflow) as octx:
            # One input context per password, keyed by the password text.
            async for _ctx, results in octx.run(
                {
                    password.value: [password, output_spec]
                    for password in password_inputs
                }
            ):
                self.assertTrue(results)
async def test_run(self):
    """Export a get_single + model_predict dataflow to a YAML file."""
    self.required_plugins("dffml-config-yaml", "dffml-model-scratch")
    # Load the get_single and model_predict operations
    get_single = Operation.load("get_single")
    model_predict = list(load("dffml.operation.model:model_predict"))[0]
    # Build the dataflow and seed it so get_single extracts every
    # model_predict output definition.
    dataflow = DataFlow.auto(get_single, model_predict)
    wanted_outputs = [
        definition.name
        for definition in model_predict.op.outputs.values()
    ]
    dataflow.seed.append(
        Input(value=wanted_outputs, definition=get_single.inputs["spec"])
    )
    # Serialize the dataflow to a temporary YAML file
    dataflow_yaml = pathlib.Path(self.mktempfile() + ".yaml")
    async with BaseConfigLoader.load("yaml").withconfig(
        {}) as configloader:
        async with configloader() as loader:
            dataflow_yaml.write_bytes(
                await loader.dumpb(dataflow.export(linked=True))
            )
    # TODO Figure out how nested model config options will work
    # print(dataflow_yaml.read_text())
    return
async def test_run(self):
    """mp4 -> gif conversion yields an output over 100 KB."""
    dataflow = DataFlow.auto(convert_to_gif, GetSingle)
    dataflow.seed.append(
        Input(
            value=[convert_to_gif.op.outputs["output_file"].name],
            definition=GetSingle.op.inputs["spec"],
        )
    )
    # Read the fixture video as raw bytes.
    video_bytes = (self.parent_path / "input.mp4").read_bytes()
    test_inputs = {
        "Test": [
            Input(
                value=video_bytes,
                definition=convert_to_gif.op.inputs["input_file"],
            ),
            Input(
                value=240,
                definition=convert_to_gif.op.inputs["resolution"],
            ),
        ]
    }
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(dataflow) as octx:
            async for ctx, results in octx.run(test_inputs):
                self.assertIn("output_file", results)
                self.assertGreater(len(results["output_file"]), 100000)
async def test_pos_tagger(self):
    """Each token maps to a (word, tag) pair from the expected tag set."""
    text = "The end is the beginning , and the beginning is the end"
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(pos_tagger, GetSingle),
        [
            Input(
                value=[pos_tagger.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=text,
                definition=pos_tagger.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=pos_tagger.op.inputs["spacy_model"],
            ),
        ],
    ):
        pos_tags = results[pos_tagger.op.outputs["result"].name]
        for index, word in enumerate(text.split()):
            self.assertEqual(pos_tags[index][0], word)
            self.assertIn(
                pos_tags[index][1], ["DT", "NN", "VBZ", "CC", ","]
            )
async def test_gen_with_input(self):
    """counter feeds echo_num; all five generated numbers are collected."""
    flow = DataFlow.auto(GetMulti, counter, echo_num)
    flow.seed.append(
        Input(
            value=[echo_num.op.outputs["number_out"].name],
            definition=GetMulti.op.inputs["spec"],
        )
    )
    # Wire the generator-style operations to their implementations.
    flow.implementations[counter.op.name] = counter.imp
    flow.implementations[echo_num.op.name] = echo_num.imp
    test_inputs = {"TestCount": [Input(value=1, definition=CountStart)]}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(flow) as octx:
            async for ctx_str, results in octx.run(test_inputs):
                self.assertIn("number", results)
                self.assertEqual({1, 2, 3, 4, 5}, set(results["number"]))
async def test_standard_scaler(self):
    """Standard scaling maps each column onto +/-1 for this input."""
    raw = [[0, 0], [0, 0], [1, 1], [1, 1]]
    scaled = [[-1, -1], [-1, -1], [1, 1], [1, 1]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(standard_scaler, GetSingle),
        [
            Input(
                value=[standard_scaler.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=raw,
                definition=standard_scaler.op.inputs["data"],
            ),
        ],
    ):
        self.assertTrue(
            (results[standard_scaler.op.outputs["result"].name] == scaled)
        )
async def test_HuMoments(self):
    """Hu moments of the fixture image form a 7-element vector."""
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(HuMoments, GetSingle),
        [
            Input(
                value=[HuMoments.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=self.INPUT_ARRAY,
                definition=HuMoments.op.inputs["m"],
            ),
        ],
    ):
        moments = results[HuMoments.op.outputs["result"].name]
        self.assertEqual(moments.shape, (7,))
async def test_remove_whitespaces(self):
    """Leading/trailing whitespace is stripped from every cell."""
    dirty = [[" ABC ", "XYD "], [" ABC", " XYD "]]
    clean = [["ABC", "XYD"], ["ABC", "XYD"]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(remove_whitespaces, GetSingle),
        [
            Input(
                value=[remove_whitespaces.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=dirty,
                definition=remove_whitespaces.op.inputs["data"],
            ),
        ],
    ):
        cleaned = results[remove_whitespaces.op.outputs["result"].name]
        self.assertTrue((cleaned == clean).all())
async def test_normalize(self):
    """Normalization preserves the shape of the input array."""
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(normalize, GetSingle),
        [
            Input(
                value=[normalize.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=self.INPUT_ARRAY,
                definition=normalize.op.inputs["src"],
            ),
        ],
    ):
        normalized = results[normalize.op.outputs["result"].name]
        self.assertEqual(normalized.shape, self.INPUT_ARRAY.shape)
async def test_get_sentences(self):
    """A two-sentence input is split into exactly two sentences."""
    text = "The end is the beginning. The beginning is the end."
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_sentences, GetSingle),
        [
            Input(
                value=[get_sentences.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=text,
                definition=get_sentences.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_sentences.op.inputs["spacy_model"],
            ),
        ],
    ):
        sentences = results[get_sentences.op.outputs["result"].name]
        self.assertEqual(len(sentences), 2)
async def test_remove_stopwords(self):
    """Stopwords are dropped while remaining tokens keep their order."""
    text = (
        "The end is the beginning, and the beginning is the end")
    expected = "end beginning , beginning end"
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(remove_stopwords, GetSingle),
        [
            Input(
                value=[remove_stopwords.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=text,
                definition=remove_stopwords.op.inputs["text"],
            ),
        ],
    ):
        self.assertEqual(
            results[remove_stopwords.op.outputs["result"].name],
            expected,
        )
async def test_singular_value_decomposition(self):
    """Truncated SVD reduces 10 features to 8 components."""
    samples, _ = make_classification(
        n_samples=10,
        n_features=10,
        n_informative=8,
        n_redundant=2,
        random_state=7,
    )
    svd_op = singular_value_decomposition.op
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(singular_value_decomposition, GetSingle),
        [
            Input(
                value=[svd_op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(value=samples, definition=svd_op.inputs["data"]),
            Input(value=8, definition=svd_op.inputs["n_components"]),
            Input(value=1, definition=svd_op.inputs["n_iter"]),
            Input(value=7, definition=svd_op.inputs["random_state"]),
        ],
    ):
        self.assertEqual(
            (10, 8), results[svd_op.outputs["result"].name].shape
        )
async def test_ordinal_encoder(self):
    """Categorical columns are expanded into indicator columns."""
    raw = [["x", "a"], ["x", "b"], ["y", "a"]]
    encoded = [
        [1.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0, 0.0],
    ]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(ordinal_encoder, GetSingle),
        [
            Input(
                value=[ordinal_encoder.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=raw,
                definition=ordinal_encoder.op.inputs["data"],
            ),
        ],
    ):
        result = results[ordinal_encoder.op.outputs["result"].name]
        self.assertTrue((result == encoded).all())
async def test_flatten(self):
    """A (100, 100, 3) zero image flattens to 30000 zeros."""
    image = np.zeros((100, 100, 3), dtype=np.uint8)
    expected = [0] * (100 * 100 * 3)
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(flatten, GetSingle),
        [
            Input(
                value=[flatten.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=image,
                definition=flatten.op.inputs["array"],
            ),
        ],
    ):
        flattened = results[flatten.op.outputs["result"].name]
        self.assertEqual(flattened.tolist(), expected)
async def test_run(self):
    """Each calc string runs in its own context and yields its answer."""
    dataflow = DataFlow.auto(*OPIMPS)
    calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(dataflow) as octx:
            # One input context per string to evaluate, keyed by the
            # string itself so results can be matched back up.
            contexts = {
                to_calc: [
                    Input(
                        value=to_calc,
                        definition=calc_parse_line.op.inputs["line"],
                    ),
                    Input(
                        value=[calc_add.op.outputs["sum"].name],
                        definition=GetSingle.op.inputs["spec"],
                    ),
                ]
                for to_calc in calc_strings_check
            }
            async for ctx, results in octx.run(contexts):
                ctx_str = (await ctx.handle()).as_string()
                self.assertEqual(
                    calc_strings_check[ctx_str],
                    results[calc_add.op.outputs["sum"].name],
                )
async def test_lemmatizer(self):
    """Lemmatization yields one lemma per whitespace token."""
    text = "The end is the beginning , and the beginning is the end"
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(lemmatizer, GetSingle),
        [
            Input(
                value=[lemmatizer.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=text,
                definition=lemmatizer.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=lemmatizer.op.inputs["spacy_model"],
            ),
        ],
    ):
        lemmas = results[lemmatizer.op.outputs["result"].name]
        self.assertEqual(len(text.split()), len(lemmas))
async def test_one_hot_encoder(self):
    """One-hot encoding respects the explicit category ordering."""
    categories = [["Male", "Female"], [1, 2, 3]]
    raw = [["Female", 1], ["Male", 3]]
    encoded = [[0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(one_hot_encoder, GetSingle),
        [
            Input(
                value=[one_hot_encoder.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=raw,
                definition=one_hot_encoder.op.inputs["data"],
            ),
            Input(
                value=categories,
                definition=one_hot_encoder.op.inputs["categories"],
            ),
        ],
    ):
        result = results[one_hot_encoder.op.outputs["result"].name]
        self.assertTrue((result == encoded).all())
async def test_run(self):
    """Download several packages and associate per-file PIE results."""
    package_urls = [
        "http://pkg.freebsd.org/FreeBSD:13:amd64/latest/All/ImageMagick7-7.0.8.48.txz",
        "https://download.clearlinux.org/releases/10540/clear/x86_64/os/Packages/sudo-setuid-1.8.17p1-34.x86_64.rpm",
        "https://rpmfind.net/linux/fedora/linux/updates/29/Everything/x86_64/Packages/g/gzip-1.9-9.fc29.x86_64.rpm",
        "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/20/Everything/x86_64/os/Packages/c/curl-7.32.0-3.fc20.x86_64.rpm",
    ]
    dataflow = DataFlow.auto(
        URLToURLBytes,
        files_in_rpm,
        urlbytes_to_rpmfile,
        urlbytes_to_tarfile,
        is_binary_pie,
        Associate,
        cleanup_rpm,
    )
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)
        # One input context per package URL.
        contexts = {
            URL: [
                Input(value=URL, definition=definitions["URL"]),
                Input(
                    value=["rpm_filename", "binary_is_PIE"],
                    definition=definitions["associate_spec"],
                ),
            ]
            for URL in package_urls
        }
        async with orchestrator(dataflow) as octx:
            async for ctx, results in octx.run(contexts, strict=True):
                self.assertTrue(results)