async def test_predict(self):
    """Train, assess, and predict with the scikit LinearRegressionModel.

    The dffml high level API functions (``train``, ``accuracy``,
    ``predict``) are coroutines / async generators — see the sibling
    async ``test_predict`` in this code base — so they must be awaited.
    The original body called them synchronously, which never actually
    ran the model.
    """
    self.required_plugins("dffml-model-scikit")
    # Import SciKit modules only after confirming the plugin is installed
    dffml_model_scikit = importlib.import_module("dffml_model_scikit")
    # Instantiate the model with throwaway on-disk storage
    model = dffml_model_scikit.LinearRegressionModel(
        directory=self.mktempdir(),
        predict=Feature("Salary", int, 1),
        features=Features(
            Feature("Years", int, 1),
            Feature("Expertise", int, 1),
            Feature("Trust", float, 1),
        ),
    )
    training_data = CSVSource(filename=self.train_filename)
    test_data = CSVSource(filename=self.test_filename)
    predict_data = CSVSource(filename=self.predict_filename)
    # Train the model
    await train(model, training_data)
    # Assess accuracy
    await accuracy(model, test_data)
    # Make prediction; predict() is an async generator
    predictions = [
        prediction async for prediction in predict(model, predict_data)
    ]
    self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
    self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)
async def test_csv_tag(self):
    """Verify that tagged and untagged records coexist in one CSV file.

    Flow: copy pre-populated JSON data into a CSV source, merge that
    CSV onto itself under the tag ``sometag``, then read the file back
    once untagged and once with ``tag="sometag"`` and check both views
    see the full record set.

    NOTE(review): relies on ``Merge.cli`` positional order (dest spec
    before src spec) — confirm against the Merge command's docs.
    """
    with non_existant_tempfile() as csv_tempfile:
        # Move the pre-populated json data to a csv source
        with self.subTest(json_to_csv=True):
            await Merge.cli(
                "dest=csv",
                "src=json",
                "-source-dest-filename",
                csv_tempfile,
                "-source-src-filename",
                self.temp_filename,
                "-source-src-allowempty",
                "-source-dest-allowempty",
                "-source-src-readwrite",
                "-source-dest-readwrite",
            )
        # Merge one tag to another within the same file
        with self.subTest(merge_same_file=True):
            await Merge.cli(
                "dest=csv",
                "src=csv",
                "-source-dest-filename",
                csv_tempfile,
                "-source-dest-tag",
                "sometag",
                "-source-src-filename",
                csv_tempfile,
                "-source-src-allowempty",
                "-source-dest-allowempty",
                "-source-src-readwrite",
                "-source-dest-readwrite",
            )
            # Both tag markers must now appear in the raw file contents
            contents = Path(csv_tempfile).read_text()
            self.assertIn("untagged", contents)
            self.assertIn("sometag", contents)
        # Check the untagged source
        with self.subTest(tagged=None):
            async with CSVSource(
                CSVSourceConfig(filename=csv_tempfile)
            ) as source:
                async with source() as sctx:
                    repos = [repo async for repo in sctx.repos()]
                    self.assertEqual(len(repos), len(self.repos))
            # Reading through one tag view must not drop the other's rows
            contents = Path(csv_tempfile).read_text()
            self.assertIn("sometag", contents)
            self.assertIn("untagged", contents)
        # Check the tagged source
        with self.subTest(tagged="sometag"):
            async with CSVSource(
                CSVSourceConfig(filename=csv_tempfile, tag="sometag")
            ) as source:
                async with source() as sctx:
                    repos = [repo async for repo in sctx.repos()]
                    self.assertEqual(len(repos), len(self.repos))
            contents = Path(csv_tempfile).read_text()
            self.assertIn("sometag", contents)
            self.assertIn("untagged", contents)
def test_config_readonly_default(self):
    """Parsing only the filename flag yields the documented defaults.

    Checks the label-era config fields: label falls back to
    "unlabeled", no key column is set, and the source is writable.
    """
    config = CSVSource.config(
        parse_unknown("--source-csv-filename", "feedface")
    )
    self.assertEqual(config.filename, "feedface")
    self.assertEqual(config.label, "unlabeled")
    # assertIsNone is the idiomatic check and gives a clearer failure
    # message than assertEqual(config.key, None)
    self.assertIsNone(config.key)
    self.assertFalse(config.readonly)
def test_config_default(self):
    """Only the filename flag given: all other options take defaults."""
    cmdline = parse_unknown("--source-csv-filename", "feedface")
    config = CSVSource.config(cmdline)
    # String-valued defaults, checked in one data-driven pass
    for attr, expected in (
        ("filename", "feedface"),
        ("tag", "untagged"),
        ("tagcol", "tag"),
        ("key", "key"),
    ):
        self.assertEqual(getattr(config, attr), expected)
    # Boolean flags default to off
    self.assertFalse(config.readwrite)
    self.assertFalse(config.allowempty)
def test_config_default(self):
    """Defaults for the label-based CSVSource config fields."""
    parsed = parse_unknown("--source-csv-filename", "feedface")
    config = CSVSource.config(parsed)
    # Each (actual, expected) pair covers one string-valued default
    checks = (
        (config.filename, "feedface"),
        (config.label, "unlabeled"),
        (config.labelcol, "label"),
        (config.key, "src_url"),
    )
    for actual, expected in checks:
        self.assertEqual(actual, expected)
    # Write access and empty-file creation are opt-in
    self.assertFalse(config.readwrite)
    self.assertFalse(config.allowempty)
async def test_save_and_load(self):
    """Round-trip records through a CSV file via save() and load().

    NOTE(review): the expected prediction values come back as the
    strings "1"/"2" even though they were saved as ints 1/2 — CSV
    storage is untyped, so the string form appears intentional;
    confirm against CSVSource's serialization behavior.
    """
    # Writable source that may start from a non-existent file
    source = CSVSource(
        filename=self.save_and_load, allowempty=True, readwrite=True
    )
    # Persist two records, each with features and a prediction column
    await save(
        source,
        Record(
            "1",
            data={
                "features": {"A": 0, "B": 1},
                "prediction": {"C": {"value": 1, "confidence": 1.0}},
            },
        ),
        Record(
            "2",
            data={
                "features": {"A": 3, "B": 4},
                "prediction": {"C": {"value": 2, "confidence": 1.0}},
            },
        ),
    )
    # All records in source
    results = [record.export() async for record in load(source)]
    self.assertEqual(
        results,
        [
            {
                "key": "1",
                "features": {"A": 0, "B": 1},
                "prediction": {"C": {"confidence": 1.0, "value": "1"}},
                "extra": {},
            },
            {
                "key": "2",
                "features": {"A": 3, "B": 4},
                "prediction": {"C": {"confidence": 1.0, "value": "2"}},
                "extra": {},
            },
        ],
    )
    # For specific records in a source
    results = [record.export() async for record in load(source, "1")]
    self.assertEqual(
        results,
        [
            {
                "key": "1",
                "features": {"A": 0, "B": 1},
                "prediction": {"C": {"confidence": 1.0, "value": "1"}},
                "extra": {},
            }
        ],
    )
async def test_predict(self):
    """End-to-end scikit linear regression: train, score, and predict.

    Exercises the high level API twice: first with CSV backed sources,
    then with the same data passed directly as record lists.
    """
    self.required_plugins("dffml-model-scikit")
    # Defer the plugin import until we know it is installed
    dffml_model_scikit = importlib.import_module("dffml_model_scikit")
    # Model with three input features predicting the Salary column
    model = dffml_model_scikit.LinearRegressionModel(
        location=self.mktempdir(),
        predict=Feature("Salary", int, 1),
        features=Features(
            Feature("Years", int, 1),
            Feature("Expertise", int, 1),
            Feature("Trust", float, 1),
        ),
    )
    training_data = CSVSource(filename=self.train_filename)
    test_data = CSVSource(filename=self.test_filename)
    predict_data = CSVSource(filename=self.predict_filename)
    scorer = MeanSquaredErrorAccuracy()

    def check(predictions):
        # Both rows should predict the known salaries (70 and 80)
        self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
        self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)

    # CSV sources as input
    await train(model, training_data)
    await score(model, scorer, Feature("Salary", int, 1), test_data)
    predictions = []
    async for prediction in predict(model, predict_data):
        predictions.append(prediction)
    check(predictions)
    # Test input data as list
    await train(model, *self.train_data)
    await score(model, scorer, Feature("Salary", int, 1), *self.test_data)
    predictions = []
    async for prediction in predict(model, *self.predict_data):
        predictions.append(prediction)
    check(predictions)
async def test_key(self):
    """The column named by ``key`` addresses records on load."""
    fixture = b"KeyHeader,ValueColumn\na,42\nb,420\n"
    with tempfile.NamedTemporaryFile() as fileobj:
        # Write the fixture CSV, then rewind so the source reads from the top
        fileobj.write(fixture)
        fileobj.seek(0)
        config = CSVSourceConfig(filename=fileobj.name, key="KeyHeader")
        async with CSVSource(config) as source:
            async with source() as sctx:
                # Records are looked up by their KeyHeader values
                record_a = await sctx.record("a")
                record_b = await sctx.record("b")
                self.assertEqual(record_a.feature("ValueColumn"), 42)
                self.assertEqual(record_b.feature("ValueColumn"), 420)
def test_config_readonly_set(self):
    """Explicit flags override the label/key/readonly defaults."""
    cmdline = [
        "--source-csv-filename", "feedface",
        "--source-csv-label", "default-label",
        "--source-csv-key", "SourceURLColumn",
        "--source-csv-readonly",
    ]
    config = CSVSource.config(parse_unknown(*cmdline))
    # Every supplied value should land on the parsed config verbatim
    self.assertEqual(config.filename, "feedface")
    self.assertEqual(config.label, "default-label")
    self.assertEqual(config.key, "SourceURLColumn")
    self.assertTrue(config.readonly)
async def test_key(self):
    """The column named by ``key`` addresses records on load."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Random name avoids collisions inside the temp directory
        csv_path = pathlib.Path(tmpdir, str(random.random()))
        # Same contents inspect.cleandoc would produce: no trailing newline
        csv_path.write_text(
            "\n".join(["KeyHeader,ValueColumn", "a,42", "b,420"])
        )
        config = CSVSourceConfig(filename=str(csv_path), key="KeyHeader")
        async with CSVSource(config) as source:
            async with source() as sctx:
                # Records are looked up by their KeyHeader values
                record_a = await sctx.record("a")
                record_b = await sctx.record("b")
                self.assertEqual(record_a.feature("ValueColumn"), 42)
                self.assertEqual(record_b.feature("ValueColumn"), 420)
async def my_training_dataset(
    url: str = "http://download.example.com/data/my_training.csv",
    expected_sha384_hash: str = "db9ec70abdc8b74bcf91a7399144dd15fc01e3dad91bbbe3c41fbbe33065b98a3e06e8e0ba053d850d7dc19e6837310e",
    cache_dir: pathlib.Path = (pathlib.Path("~", ".cache", "dffml", "datasets", "my").expanduser().resolve()),
):
    """Yield a CSVSource backed by a cached, hash-verified download.

    The file at ``url`` is downloaded into ``cache_dir/training.csv``
    and its contents checked against ``expected_sha384_hash`` before a
    source wrapping the local copy is yielded.
    """
    # Plain http is allowed here in addition to the default protocols
    target = cache_dir / "training.csv"
    downloaded = await cached_download(
        url,
        target,
        expected_sha384_hash,
        protocol_allowlist=["http://"] + DEFAULT_PROTOCOL_ALLOWLIST,
    )
    # Hand the caller a source over the verified local file
    yield CSVSource(filename=str(downloaded))
def test_config_set(self):
    """Every CSVSource flag is honored when given explicitly."""
    config = CSVSource.config(
        parse_unknown(
            "--source-csv-filename", "feedface",
            "--source-csv-tag", "default-tag",
            "--source-csv-tagcol", "dffml_tag",
            "--source-csv-key", "SourceURLColumn",
            "--source-csv-readwrite",
            "--source-csv-allowempty",
        )
    )
    # String-valued options land verbatim on the parsed config
    for attr, expected in (
        ("filename", "feedface"),
        ("tag", "default-tag"),
        ("tagcol", "dffml_tag"),
        ("key", "SourceURLColumn"),
    ):
        self.assertEqual(getattr(config, attr), expected)
    # Presence-only flags flip their booleans on
    self.assertTrue(config.readwrite)
    self.assertTrue(config.allowempty)
async def setUpSource(self):
    """Build a writable CSV source over the test file for this case."""
    config = CSVSourceConfig(
        filename=self.testfile, allowempty=True, readwrite=True
    )
    return CSVSource(config)
async def setUpSource(self):
    """Build a CSV source over the test file using default options."""
    config = CSVSourceConfig(filename=self.testfile)
    return CSVSource(config)
# Example: train, assess, and predict with the TensorFlow DNN regression
# model using CSV files for the training, test, and prediction data.
from dffml.cli.ml import Train, Accuracy, PredictAll
from dffml.feature.feature import Features, DefFeature
from dffml.source.csv import CSVSource, CSVSourceConfig
from dffml_model_tensorflow.dnnr import (
    DNNRegressionModel,
    DNNRegressionModelConfig,
)

# Sources are opened read-only; this workflow never writes records back
training_data = CSVSource(
    CSVSourceConfig(filename="training.csv", readonly=True)
)
test_data = CSVSource(CSVSourceConfig(filename="test.csv", readonly=True))
predict_data = CSVSource(CSVSourceConfig(filename="predict.csv", readonly=True))

# Three input features predicting the "Salary" column
model = DNNRegressionModel(
    DNNRegressionModelConfig(
        features=Features(
            DefFeature("Years", int, 1),
            DefFeature("Expertise", int, 1),
            DefFeature("Trust", float, 1),
        ),
        predict="Salary",
    )
)

# Train on the training data, then measure accuracy on the test data.
# The CLI command objects are instantiated and then called to run.
Train(model=model, sources=[training_data])()
accuracy = Accuracy(model=model, sources=[test_data])()
# Predict over the prediction source; exactly two rows are expected
row0, row1 = PredictAll(model=model, sources=[predict_data])()
print("Accuracy", accuracy)
# Example: train, assess, and predict with the TensorFlow DNN regression
# model, declaring the prediction target as a full feature definition.
from dffml.cli.ml import Train, Accuracy, PredictAll
from dffml.feature.feature import Features, DefFeature
from dffml.source.csv import CSVSource, CSVSourceConfig
from dffml_model_tensorflow.dnnr import (
    DNNRegressionModel,
    DNNRegressionModelConfig,
)

# readwrite=False keeps each source read-only; nothing is written back
training_data = CSVSource(
    CSVSourceConfig(filename="training.csv", readwrite=False)
)
test_data = CSVSource(CSVSourceConfig(filename="test.csv", readwrite=False))
predict_data = CSVSource(
    CSVSourceConfig(filename="predict.csv", readwrite=False)
)

# Three input features predicting Salary, itself declared as a feature
model = DNNRegressionModel(
    DNNRegressionModelConfig(
        features=Features(
            DefFeature("Years", int, 1),
            DefFeature("Expertise", int, 1),
            DefFeature("Trust", float, 1),
        ),
        predict=DefFeature("Salary", float, 1),
    )
)

# Train on the training data, then measure accuracy on the test data.
# The CLI command objects are instantiated and then called to run.
Train(model=model, sources=[training_data])()
accuracy = Accuracy(model=model, sources=[test_data])()
# Predict over the prediction source; exactly two rows are expected
row0, row1 = PredictAll(model=model, sources=[predict_data])()
print("Accuracy", accuracy)