Example #1
    def test_predict(self):
        self.required_plugins("dffml-model-scikit")
        # Import SciKit modules
        dffml_model_scikit = importlib.import_module("dffml_model_scikit")
        # Instantiate the model
        model = dffml_model_scikit.LinearRegressionModel(
            directory=self.mktempdir(),
            predict=Feature("Salary", int, 1),
            features=Features(
                Feature("Years", int, 1),
                Feature("Expertise", int, 1),
                Feature("Trust", float, 1),
            ),
        )

        training_data = CSVSource(filename=self.train_filename)
        test_data = CSVSource(filename=self.test_filename)
        predict_data = CSVSource(filename=self.predict_filename)

        # Train the model
        train(model, training_data)
        # Assess accuracy
        accuracy(model, test_data)
        # Make prediction
        predictions = [
            prediction for prediction in predict(model, predict_data)
        ]
        self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
        self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)
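A minimal set of imports that would make the synchronous variant above self-contained (a sketch, not taken from the original example; it assumes DFFML's blocking wrappers live in dffml.noasync and that Feature, Features, and CSVSource are importable from the top-level dffml package):

# Hypothetical imports for the blocking variant above (assumption:
# dffml.noasync exposes blocking train/accuracy/predict wrappers).
import importlib

from dffml import CSVSource, Feature, Features
from dffml.noasync import accuracy, predict, train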
Example #2
 async def test_csv_tag(self):
     with non_existant_tempfile() as csv_tempfile:
         # Move the pre-populated json data to a csv source
         with self.subTest(json_to_csv=True):
             await Merge.cli(
                 "dest=csv",
                 "src=json",
                 "-source-dest-filename",
                 csv_tempfile,
                 "-source-src-filename",
                 self.temp_filename,
                 "-source-src-allowempty",
                 "-source-dest-allowempty",
                 "-source-src-readwrite",
                 "-source-dest-readwrite",
             )
         # Merge one tag to another within the same file
         with self.subTest(merge_same_file=True):
             await Merge.cli(
                 "dest=csv",
                 "src=csv",
                 "-source-dest-filename",
                 csv_tempfile,
                 "-source-dest-tag",
                 "sometag",
                 "-source-src-filename",
                 csv_tempfile,
                 "-source-src-allowempty",
                 "-source-dest-allowempty",
                 "-source-src-readwrite",
                 "-source-dest-readwrite",
             )
         contents = Path(csv_tempfile).read_text()
         self.assertIn("untagged", contents)
         self.assertIn("sometag", contents)
         # Check the untagged source
         with self.subTest(tagged=None):
             async with CSVSource(
                 CSVSourceConfig(filename=csv_tempfile)
             ) as source:
                 async with source() as sctx:
                     repos = [repo async for repo in sctx.repos()]
                     self.assertEqual(len(repos), len(self.repos))
         contents = Path(csv_tempfile).read_text()
         self.assertIn("sometag", contents)
         self.assertIn("untagged", contents)
         # Check the tagged source
         with self.subTest(tagged="sometag"):
             async with CSVSource(
                 CSVSourceConfig(filename=csv_tempfile, tag="sometag")
             ) as source:
                 async with source() as sctx:
                     repos = [repo async for repo in sctx.repos()]
                     self.assertEqual(len(repos), len(self.repos))
         contents = Path(csv_tempfile).read_text()
         self.assertIn("sometag", contents)
         self.assertIn("untagged", contents)
Example #3
 def test_config_readonly_default(self):
     config = CSVSource.config(
         parse_unknown("--source-csv-filename", "feedface"))
     self.assertEqual(config.filename, "feedface")
     self.assertEqual(config.label, "unlabeled")
     self.assertEqual(config.key, None)
     self.assertFalse(config.readonly)
Example #4
 def test_config_default(self):
     config = CSVSource.config(
         parse_unknown("--source-csv-filename", "feedface"))
     self.assertEqual(config.filename, "feedface")
     self.assertEqual(config.tag, "untagged")
     self.assertEqual(config.tagcol, "tag")
     self.assertEqual(config.key, "key")
     self.assertFalse(config.readwrite)
     self.assertFalse(config.allowempty)
Example #5
 def test_config_default(self):
     config = CSVSource.config(
         parse_unknown("--source-csv-filename", "feedface"))
     self.assertEqual(config.filename, "feedface")
     self.assertEqual(config.label, "unlabeled")
     self.assertEqual(config.labelcol, "label")
     self.assertEqual(config.key, "src_url")
     self.assertFalse(config.readwrite)
     self.assertFalse(config.allowempty)
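For comparison, the same defaults can be exercised by constructing the config directly instead of parsing CLI-style arguments (a sketch; the import path matches Example #15, and whether the defaults use tag/tagcol/key or label/labelcol/src_url depends on the DFFML version, as Examples #3 through #5 show):

# Build the config in code rather than via parse_unknown (illustrative only).
from dffml.source.csv import CSVSource, CSVSourceConfig

config = CSVSourceConfig(filename="feedface")
source = CSVSource(config)
# Unset fields fall back to their defaults, e.g. the key and tag/label
# column settings asserted in the tests above.
print(config.filename)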
Example #6
    async def test_save_and_load(self):
        source = CSVSource(
            filename=self.save_and_load, allowempty=True, readwrite=True
        )
        await save(
            source,
            Record(
                "1",
                data={
                    "features": {"A": 0, "B": 1},
                    "prediction": {"C": {"value": 1, "confidence": 1.0}},
                },
            ),
            Record(
                "2",
                data={
                    "features": {"A": 3, "B": 4},
                    "prediction": {"C": {"value": 2, "confidence": 1.0}},
                },
            ),
        )
        # All records in source
        results = [record.export() async for record in load(source)]
        self.assertEqual(
            results,
            [
                {
                    "key": "1",
                    "features": {"A": 0, "B": 1},
                    "prediction": {"C": {"confidence": 1.0, "value": "1"}},
                    "extra": {},
                },
                {
                    "key": "2",
                    "features": {"A": 3, "B": 4},
                    "prediction": {"C": {"confidence": 1.0, "value": "2"}},
                    "extra": {},
                },
            ],
        )

        # For specific records in a source
        results = [record.export() async for record in load(source, "1")]
        self.assertEqual(
            results,
            [
                {
                    "key": "1",
                    "features": {"A": 0, "B": 1},
                    "prediction": {"C": {"confidence": 1.0, "value": "1"}},
                    "extra": {},
                }
            ],
        )
Example #7
    async def test_predict(self):
        self.required_plugins("dffml-model-scikit")
        # Import SciKit modules
        dffml_model_scikit = importlib.import_module("dffml_model_scikit")
        # Instantiate the model
        model = dffml_model_scikit.LinearRegressionModel(
            location=self.mktempdir(),
            predict=Feature("Salary", int, 1),
            features=Features(
                Feature("Years", int, 1),
                Feature("Expertise", int, 1),
                Feature("Trust", float, 1),
            ),
        )

        training_data = CSVSource(filename=self.train_filename)
        test_data = CSVSource(filename=self.test_filename)
        predict_data = CSVSource(filename=self.predict_filename)

        # Train the model
        await train(model, training_data)
        # Assess accuracy
        scorer = MeanSquaredErrorAccuracy()
        await score(model, scorer, Feature("Salary", int, 1), test_data)
        # Make prediction
        predictions = [
            prediction async for prediction in predict(model, predict_data)
        ]
        self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
        self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)

        # Test input data as list
        await train(model, *self.train_data)
        await score(model, scorer, Feature("Salary", int, 1), *self.test_data)
        predictions = [
            prediction
            async for prediction in predict(model, *self.predict_data)
        ]
        self.assertEqual(round(predictions[0][2]["Salary"]["value"]), 70)
        self.assertEqual(round(predictions[1][2]["Salary"]["value"]), 80)
Example #8
 async def test_key(self):
     with tempfile.NamedTemporaryFile() as fileobj:
         fileobj.write(b"KeyHeader,ValueColumn\n")
         fileobj.write(b"a,42\n")
         fileobj.write(b"b,420\n")
         fileobj.seek(0)
         async with CSVSource(
                 CSVSourceConfig(filename=fileobj.name,
                                 key="KeyHeader")) as source:
             async with source() as sctx:
                 record_a = await sctx.record("a")
                 record_b = await sctx.record("b")
                 self.assertEqual(record_a.feature("ValueColumn"), 42)
                 self.assertEqual(record_b.feature("ValueColumn"), 420)
Example #9
 def test_config_readonly_set(self):
     config = CSVSource.config(
         parse_unknown(
             "--source-csv-filename",
             "feedface",
             "--source-csv-label",
             "default-label",
             "--source-csv-key",
             "SourceURLColumn",
             "--source-csv-readonly",
         ))
     self.assertEqual(config.filename, "feedface")
     self.assertEqual(config.label, "default-label")
     self.assertEqual(config.key, "SourceURLColumn")
     self.assertTrue(config.readonly)
Example #10
 async def test_key(self):
     with tempfile.TemporaryDirectory() as testdir:
         testfile = os.path.join(testdir, str(random.random()))
         pathlib.Path(testfile).write_text(
             inspect.cleandoc("""
                 KeyHeader,ValueColumn
                 a,42
                 b,420
                 """))
         async with CSVSource(
                 CSVSourceConfig(filename=testfile,
                                 key="KeyHeader")) as source:
             async with source() as sctx:
                 record_a = await sctx.record("a")
                 record_b = await sctx.record("b")
                 self.assertEqual(record_a.feature("ValueColumn"), 42)
                 self.assertEqual(record_b.feature("ValueColumn"), 420)
Example #11
async def my_training_dataset(
    url: str = "http://download.example.com/data/my_training.csv",
    expected_sha384_hash:
    str = "db9ec70abdc8b74bcf91a7399144dd15fc01e3dad91bbbe3c41fbbe33065b98a3e06e8e0ba053d850d7dc19e6837310e",
    cache_dir: pathlib.Path = (pathlib.Path("~", ".cache", "dffml", "datasets",
                                            "my").expanduser().resolve()),
):
    # Download the file from the given URL and place the downloaded file at
    # ~/.cache/dffml/datasets/my/training.csv. Ensure the SHA 384 hash
    # of the download's contents is equal to the expected value.
    filepath = await cached_download(
        url,
        cache_dir / "training.csv",
        expected_sha384_hash,
        protocol_allowlist=["http://"] + DEFAULT_PROTOCOL_ALLOWLIST,
    )
    # Create a source using downloaded file
    yield CSVSource(filename=str(filepath))
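A minimal consumption sketch for the dataset source above (the main() wrapper and direct iteration over the generator are assumptions for illustration; load is the DFFML high-level helper also used in the save/load example):

# Iterate the async generator to obtain the verified, cached CSVSource,
# then read its records (illustrative usage, not part of the original).
import asyncio

from dffml import load


async def main():
    async for source in my_training_dataset():
        async for record in load(source):
            print(record.export())


asyncio.run(main())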
Example #12
 def test_config_set(self):
     config = CSVSource.config(
         parse_unknown(
             "--source-csv-filename",
             "feedface",
             "--source-csv-tag",
             "default-tag",
             "--source-csv-tagcol",
             "dffml_tag",
             "--source-csv-key",
             "SourceURLColumn",
             "--source-csv-readwrite",
             "--source-csv-allowempty",
         ))
     self.assertEqual(config.filename, "feedface")
     self.assertEqual(config.tag, "default-tag")
     self.assertEqual(config.tagcol, "dffml_tag")
     self.assertEqual(config.key, "SourceURLColumn")
     self.assertTrue(config.readwrite)
     self.assertTrue(config.allowempty)
Example #13
 async def setUpSource(self):
     return CSVSource(
         CSVSourceConfig(filename=self.testfile,
                         allowempty=True,
                         readwrite=True))
Example #14
 async def setUpSource(self):
     return CSVSource(CSVSourceConfig(filename=self.testfile))
Example #15
from dffml.cli.ml import Train, Accuracy, PredictAll
from dffml.feature.feature import Features, DefFeature
from dffml.source.csv import CSVSource, CSVSourceConfig
from dffml_model_tensorflow.dnnr import (
    DNNRegressionModel,
    DNNRegressionModelConfig,
)

training_data = CSVSource(
    CSVSourceConfig(filename="training.csv", readonly=True))
test_data = CSVSource(CSVSourceConfig(filename="test.csv", readonly=True))
predict_data = CSVSource(
    CSVSourceConfig(filename="predict.csv", readonly=True))

model = DNNRegressionModel(
    DNNRegressionModelConfig(
        features=Features(
            DefFeature("Years", int, 1),
            DefFeature("Expertise", int, 1),
            DefFeature("Trust", float, 1),
        ),
        predict="Salary",
    ))

Train(model=model, sources=[training_data])()

accuracy = Accuracy(model=model, sources=[test_data])()

row0, row1 = PredictAll(model=model, sources=[predict_data])()

print("Accuracy", accuracy)
Example #16
from dffml.cli.ml import Train, Accuracy, PredictAll
from dffml.feature.feature import Features, DefFeature
from dffml.source.csv import CSVSource, CSVSourceConfig
from dffml_model_tensorflow.dnnr import (
    DNNRegressionModel,
    DNNRegressionModelConfig,
)

training_data = CSVSource(
    CSVSourceConfig(filename="training.csv", readwrite=False))
test_data = CSVSource(CSVSourceConfig(filename="test.csv", readwrite=False))
predict_data = CSVSource(
    CSVSourceConfig(filename="predict.csv", readwrite=False))

model = DNNRegressionModel(
    DNNRegressionModelConfig(
        features=Features(
            DefFeature("Years", int, 1),
            DefFeature("Expertise", int, 1),
            DefFeature("Trust", float, 1),
        ),
        predict=DefFeature("Salary", float, 1),
    ))

Train(model=model, sources=[training_data])()

accuracy = Accuracy(model=model, sources=[test_data])()

row0, row1 = PredictAll(model=model, sources=[predict_data])()

print("Accuracy", accuracy)