def setUpClass(cls):
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()
    if cls.MODEL_TYPE == "CLASSIFICATION":
        cls.features.append(DefFeature("A", float, 1))
        cls.features.append(DefFeature("B", float, 1))
        cls.features.append(DefFeature("C", float, 1))
        cls.features.append(DefFeature("D", float, 1))
        cls.features.append(DefFeature("E", float, 1))
        cls.features.append(DefFeature("F", float, 1))
        cls.features.append(DefFeature("G", float, 1))
        cls.features.append(DefFeature("H", float, 1))
        cls.features.append(DefFeature("I", float, 1))
        A, B, C, D, E, F, G, H, I, X = list(
            zip(*FEATURE_DATA_CLASSIFICATION)
        )
        cls.repos = [
            Repo(
                str(i),
                data={
                    "features": {
                        "A": A[i],
                        "B": B[i],
                        "C": C[i],
                        "D": D[i],
                        "E": E[i],
                        "F": F[i],
                        "G": G[i],
                        "H": H[i],
                        "I": I[i],
                        "X": X[i],
                    }
                },
            )
            for i in range(0, len(A))
        ]
    elif cls.MODEL_TYPE == "REGRESSION":
        cls.features.append(DefFeature("A", float, 1))
        cls.features.append(DefFeature("B", float, 1))
        cls.features.append(DefFeature("C", float, 1))
        A, B, C, X = list(zip(*FEATURE_DATA_REGRESSION))
        cls.repos = [
            Repo(
                str(i),
                data={
                    "features": {
                        "A": A[i],
                        "B": B[i],
                        "C": C[i],
                        "X": X[i],
                    }
                },
            )
            for i in range(0, len(A))
        ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
    cls.model = cls.MODEL(
        cls.MODEL_CONFIG(
            directory=cls.model_dir.name,
            predict="X",
            features=cls.features,
        )
    )
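For orientation, the zip(*FEATURE_DATA_CLASSIFICATION) unpacking above assumes each row of the fixture holds the nine feature values followed by the X target. A minimal sketch with made-up numbers (the real fixture lives elsewhere in the test module):

# Hypothetical two-row stand-in for FEATURE_DATA_CLASSIFICATION.
ROWS = [
    # A    B    C    D    E    F    G    H    I    X
    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0],
]
A, B, C, D, E, F, G, H, I, X = zip(*ROWS)
assert A == (0.1, 0.9) and X == (1, 0)  # columns recovered from rows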
async def update(self, repo: Repo):
    db = self.parent.db
    # Store feature data
    feature_cols = self.parent.FEATURE_COLS
    feature_data = OrderedDict.fromkeys(feature_cols)
    feature_data.update(repo.features(feature_cols))
    await db.execute(
        "INSERT OR REPLACE INTO features (src_url, "
        + ", ".join(feature_cols)
        + ") VALUES(?, "
        + ", ".join("?" * len(feature_cols))
        + ")",
        [repo.src_url] + list(feature_data.values()),
    )
    # Store prediction
    prediction = repo.prediction()
    if prediction:
        prediction_cols = self.parent.PREDICTION_COLS
        prediction_data = OrderedDict.fromkeys(prediction_cols)
        prediction_data.update(prediction.dict())
        await db.execute(
            "INSERT OR REPLACE INTO prediction (src_url, "
            + ", ".join(prediction_cols)
            + ") VALUES(?, "
            + ", ".join("?" * len(prediction_cols))
            + ")",
            [repo.src_url] + list(prediction_data.values()),
        )
    # Store classification
    if repo.classified():
        await db.execute(
            "INSERT OR REPLACE INTO classification "
            "(src_url, classification) VALUES(?, ?)",
            [repo.src_url, repo.classification()],
        )
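The INSERT OR REPLACE statements above presuppose three tables keyed by src_url. A minimal sqlite3 sketch of that schema, with placeholder columns standing in for FEATURE_COLS and PREDICTION_COLS (the real schema isn't shown here):

import sqlite3

# Hypothetical schema; real column names come from FEATURE_COLS and
# PREDICTION_COLS, so "A"/"B" and "classification"/"confidence" are stand-ins.
conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE features (src_url TEXT PRIMARY KEY, A REAL, B REAL);
    CREATE TABLE prediction (
        src_url TEXT PRIMARY KEY, classification TEXT, confidence REAL
    );
    CREATE TABLE classification (
        src_url TEXT PRIMARY KEY, classification TEXT
    );
    """
)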
def setUpClass(cls):
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = Misc(ModelConfig(directory=cls.model_dir.name))
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.classifications = ['a', 'not a']
    cls.repos = [
        Repo(
            'a' + str(random.random()),
            data={
                'features': {cls.feature.NAME: 1},
                'classification': 'a',
            },
        )
        for _ in range(0, 1000)
    ]
    cls.repos += [
        Repo(
            'b' + str(random.random()),
            data={
                'features': {cls.feature.NAME: 0},
                'classification': 'not a',
            },
        )
        for _ in range(0, 1000)
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
async def test_update(self):
    key = "1"
    new_repo = Repo(key, data={"features": {"by_ten": 10}})
    async with self.post(
        f"/source/{self.slabel}/update/{key}", json=new_repo.export()
    ) as r:
        self.assertEqual(await r.json(), OK)
    self.assertEqual((await self.sctx.repo(key)).feature("by_ten"), 10)
async def repo(self, key: str):
    query = self.parent.config.repo_query
    repo = Repo(key)
    db = self.conn
    await db.execute(query, (key,))
    row = await db.fetchone()
    if row is not None:
        features = {}
        predictions = {}
        # Rename loop variable so it doesn't shadow the key parameter
        for col, value in row.items():
            if col.startswith("feature_"):
                features[col.replace("feature_", "")] = value
            elif "_value" in col:
                target = col.replace("_value", "")
                predictions[target] = {
                    "value": row[target + "_value"],
                    "confidence": row[target + "_confidence"],
                }
        repo.merge(
            Repo(
                row["key"],
                data={"features": features, "prediction": predictions},
            )
        )
    return repo
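The column-naming convention that loop decodes can be shown on a plain dict. Here is a made-up row, with "target" standing in for whatever the model predicts:

row = {
    "key": "0",
    "feature_PetalLength": 3.9,
    "target_value": "1",
    "target_confidence": 0.42,
}
features = {
    col.replace("feature_", ""): value
    for col, value in row.items()
    if col.startswith("feature_")
}
predictions = {
    col.replace("_value", ""): {
        "value": value,
        "confidence": row[col.replace("_value", "_confidence")],
    }
    for col, value in row.items()
    if col.endswith("_value")
}
assert features == {"PetalLength": 3.9}
assert predictions == {"target": {"value": "1", "confidence": 0.42}}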
def setUpClass(cls):
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = MiscModel(
        MiscModelConfig(
            directory=cls.model_dir.name,
            classifications=["not a", "a"],
            features=cls.features,
        )
    )
    cls.repos = [
        Repo(
            "a" + str(random.random()),
            data={"features": {cls.feature.NAME: 1, "string": "a"}},
        )
        for _ in range(0, 1000)
    ]
    cls.repos += [
        Repo(
            "b" + str(random.random()),
            data={"features": {cls.feature.NAME: 0, "string": "not a"}},
        )
        for _ in range(0, 1000)
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
def setUpClass(cls):
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = DNNClassifierModel(
        DNNClassifierModelConfig(
            directory=cls.model_dir.name,
            steps=1000,
            epochs=30,
            hidden=[10, 20, 10],
            classification="string",
            classifications=["a", "not a"],
            clstype=str,
        )
    )
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.repos = [
        Repo(
            "a" + str(random.random()),
            data={"features": {cls.feature.NAME: 1, "string": "a"}},
        )
        for _ in range(0, 1000)
    ]
    cls.repos += [
        Repo(
            "b" + str(random.random()),
            data={"features": {cls.feature.NAME: 0, "string": "not a"}},
        )
        for _ in range(0, 1000)
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
async def update(self, repo: Repo):
    db = self.parent.db
    # Store feature data
    feature_cols = self.parent.FEATURE_COLS
    feature_data = OrderedDict.fromkeys(feature_cols)
    feature_data.update(repo.features(feature_cols))
    await db.execute(
        "INSERT OR REPLACE INTO features (key, "
        + ", ".join(feature_cols)
        + ") VALUES(?, "
        + ", ".join("?" * len(feature_cols))
        + ")",
        [repo.key] + list(feature_data.values()),
    )
    # Store prediction
    try:
        prediction = repo.prediction("target_name")
        prediction_cols = self.parent.PREDICTION_COLS
        prediction_data = OrderedDict.fromkeys(prediction_cols)
        prediction_data.update(prediction.dict())
        await db.execute(
            "INSERT OR REPLACE INTO prediction (key, "
            + ", ".join(prediction_cols)
            + ") VALUES(?, "
            + ", ".join("?" * len(prediction_cols))
            + ")",
            [repo.key] + list(prediction_data.values()),
        )
    except KeyError:
        # Repo has no prediction for this target yet; nothing to store
        pass
async def test_label(self):
    with tempfile.TemporaryDirectory() as testdir:
        self.testfile = os.path.join(testdir, str(random.random()))
        unlabeled = await self.setUpSource()
        labeled = await self.setUpSource()
        labeled.config = labeled.config._replace(label="somelabel")
        async with unlabeled, labeled:
            async with unlabeled() as uctx, labeled() as lctx:
                await uctx.update(
                    Repo("0", data={"features": {"feed": 1}})
                )
                await lctx.update(
                    Repo("0", data={"features": {"face": 2}})
                )
            # async with unlabeled, labeled:
            async with unlabeled() as uctx, labeled() as lctx:
                repo = await uctx.repo("0")
                self.assertIn("feed", repo.features())
                repo = await lctx.repo("0")
                self.assertIn("face", repo.features())
        with open(self.testfile, "r") as fd:
            dict_reader = csv.DictReader(fd, dialect="strip")
            rows = {
                row["label"]: {row["src_url"]: row}
                for row in dict_reader
            }
            self.assertIn("unlabeled", rows)
            self.assertIn("somelabel", rows)
            self.assertIn("0", rows["unlabeled"])
            self.assertIn("0", rows["somelabel"])
            self.assertIn("feed", rows["unlabeled"]["0"])
            self.assertIn("face", rows["somelabel"]["0"])
            self.assertEqual("1", rows["unlabeled"]["0"]["feed"])
            self.assertEqual("2", rows["somelabel"]["0"]["face"])
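For reference, the assertions above imply a CSV along these lines, with one row per (src_url, label) pair and features flattened into columns. The layout is approximate, reconstructed only from what the test checks:

# Approximate file contents the DictReader walks; column order may differ.
EXPECTED_SHAPE = """src_url,label,feed,face
0,unlabeled,1,
0,somelabel,,2
"""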
def setUp(self):
    self.null = Repo("null")
    self.full = Repo(
        "full",
        data=dict(
            features=dict(dead="beef"),
            extra=dict(extra="read all about it"),
        ),
        extra=dict(half=True),
    )
async def test_02_predict(self):
    a = Repo("a", data={"features": {self.feature.NAME: 1}})
    b = Repo("not a", data={"features": {self.feature.NAME: 0}})
    async with Sources(
        MemorySource(MemorySourceConfig(repos=[a, b]))
    ) as sources, self.model as model:
        async with sources() as sctx, model() as mctx:
            num = 0
            async for repo, prediction, confidence in mctx.predict(
                sctx.repos()
            ):
                with self.subTest(repo=repo):
                    self.assertEqual(prediction, repo.key)
                num += 1
            self.assertEqual(num, 2)
async def repo(self, src_url: str):
    db = self.parent.db
    repo = Repo(src_url)
    # Get features
    features = await db.execute(
        "SELECT " + ", ".join(self.parent.FEATURE_COLS) + " "
        "FROM features WHERE src_url=?",
        (repo.src_url,),
    )
    features = await features.fetchone()
    if features is not None:
        repo.evaluated(features)
    # Get prediction
    prediction = await db.execute(
        "SELECT * FROM prediction WHERE src_url=?", (repo.src_url,)
    )
    prediction = await prediction.fetchone()
    if prediction is not None:
        repo.predicted(
            prediction["classification"], prediction["confidence"]
        )
    # Get classification
    classification = await db.execute(
        "SELECT * FROM classification WHERE src_url=?", (repo.src_url,)
    )
    classification = await classification.fetchone()
    if classification is not None:
        repo.classify(classification["classification"])
    return repo
async def model_predict(self, request, mctx):
    # TODO Provide an iterkey method for model prediction
    chunk_size = int(request.match_info["chunk_size"])
    if chunk_size != 0:
        return web.json_response(
            {"error": "Multiple request iteration not yet supported"},
            status=HTTPStatus.BAD_REQUEST,
        )
    # Get the repos
    repos: Dict[str, Repo] = {
        src_url: Repo(src_url, data=repo_data)
        for src_url, repo_data in (await request.json()).items()
    }

    # Create an async generator to feed repos
    async def repo_gen():
        for repo in repos.values():
            yield repo

    # Feed them through prediction
    return web.json_response(
        {
            "iterkey": None,
            "repos": {
                repo.src_url: repo.export()
                async for repo in mctx.predict(repo_gen())
            },
        }
    )
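A sketch of the JSON body this handler expects: a mapping of src_url to exported repo data, posted with chunk_size 0 since iteration is not yet supported. The values here are invented:

# Hypothetical payload for POST .../predict/0; each value mirrors the
# dict shape Repo.export() produces for a repo with only features set.
payload = {
    "0": {"features": {"by_ten": 10}},
    "1": {"features": {"by_ten": 20}},
}
# The response mirrors it back with predictions filled in:
# {"iterkey": None, "repos": {"0": {...}, "1": {...}}}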
async def setUp(self):
    await super().setUp()
    self.repos = [Repo(str(random.random())) for _ in range(0, 10)]
    self.temp_filename = self.mktempfile()
    self.sconfig = FileSourceConfig(filename=self.temp_filename)
    async with JSONSource(self.sconfig) as source:
        async with source() as sctx:
            for repo in self.repos:
                await sctx.update(repo)
    contents = json.loads(Path(self.sconfig.filename).read_text())
    # Ensure there are repos in the file
    self.assertEqual(
        len(contents.get(self.sconfig.label)),
        len(self.repos),
        "ReposTestCase JSON file erroneously initialized as empty",
    )
    # TODO(p3) For some reason patching Model.load doesn't work
    # self._stack.enter_context(patch("dffml.model.model.Model.load",
    #     new=model_load))
    self._stack.enter_context(
        patch.object(
            ModelCMD,
            "arg_model",
            new=ModelCMD.arg_model.modify(type=model_load),
        )
    )
    self._stack.enter_context(
        patch("dffml.feature.feature.Feature.load", new=feature_load)
    )
    self._stack.enter_context(
        patch("dffml.df.base.OperationImplementation.load", new=opimp_load)
    )
    self._stack.enter_context(
        patch("dffml.df.types.Operation.load", new=op_load)
    )
def setUpClass(cls):
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature1 = Feature_1()
    cls.feature2 = Feature_2()
    cls.features = Features(cls.feature1, cls.feature2)
    cls.model = DNNRegressionModel(
        DNNRegressionModelConfig(
            directory=cls.model_dir.name,
            steps=1000,
            epochs=40,
            hidden=[50, 20, 10],
            predict=DefFeature("TARGET", float, 1),
            features=cls.features,
        )
    )
    # Generating data f(x1, x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    cls.repos = [
        Repo(
            "x" + str(random.random()),
            data={
                "features": {
                    cls.feature1.NAME: float(_temp_data[0][i]),
                    cls.feature2.NAME: float(_temp_data[1][i]),
                    "TARGET": 2 * _temp_data[0][i] + 3 * _temp_data[1][i],
                }
            },
        )
        for i in range(0, _n_data)
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
def setUpClass(cls):
    cls.features = Features()
    cls.features.append(DefFeature("A", str, 1))
    A, X = list(zip(*DATA))
    cls.repos = [
        Repo(str(i), data={"features": {"A": A[i], "X": X[i]}})
        for i in range(0, len(X))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = TextClassificationModel(
        TextClassifierConfig(
            directory=cls.model_dir.name,
            classifications=[0, 1],
            features=cls.features,
            predict=DefFeature("X", int, 1),
            add_layers=True,
            layers=[
                "Dense(units = 120, activation='relu')",
                "Dense(units = 64, activation='relu')",
                "Dense(units = 2, activation='softmax')",
            ],
            model_path="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
            epochs=30,
        )
    )
async def test_02_predict(self):
    test_feature_val = [
        0,
        1.5,
        2,
    ]  # inserting zero so that it's 1-indexable
    # Should be the same function used in TestDNN.setUpClass:
    # f(1.5, 2) = 2 * 1.5 + 3 * 2 = 9.0
    test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2]
    a = Repo(
        "a",
        data={
            "features": {
                self.feature1.NAME: test_feature_val[1],
                self.feature2.NAME: test_feature_val[2],
            }
        },
    )
    async with Sources(
        MemorySource(MemorySourceConfig(repos=[a]))
    ) as sources, self.model as model:
        target_name = model.config.predict.NAME
        async with sources() as sctx, model() as mctx:
            res = [repo async for repo in mctx.predict(sctx.repos())]
            self.assertEqual(len(res), 1)
            self.assertEqual(res[0].key, a.key)
            # Relative error; the 1e-6 in the denominator guards
            # against division by zero
            test_error_norm = abs(
                (test_target - res[0].prediction(target_name).value)
                / (test_target + 1e-6)
            )
            error_threshold = 0.3
            self.assertLess(test_error_norm, error_threshold)
async def test_predict(self):
    repos: Dict[str, Repo] = {
        repo.key: repo.export() async for repo in self.sctx.repos()
    }
    async with self.post(
        f"/model/{self.mlabel}/predict/0", json=repos
    ) as r:
        i: int = 0
        response = await r.json()
        for key, repo_data in response["repos"].items():
            repo = Repo(key, data=repo_data)
            self.assertEqual(int(repo.key), i)
            self.assertEqual(
                repo.feature("by_ten"), repo.prediction().value / 10
            )
            self.assertEqual(float(repo.key), repo.prediction().confidence)
            i += 1
        self.assertEqual(i, self.num_repos)
async def test_02_predict(self):
    a = Repo("a", data={"features": {self.feature.NAME: 1}})
    async with Sources(
        MemorySource(MemorySourceConfig(repos=[a]))
    ) as sources, self.model as model:
        async with sources() as sctx, model() as mctx:
            res = [repo async for repo in mctx.predict(sctx.repos())]
            self.assertEqual(len(res), 1)
            self.assertEqual(res[0].src_url, a.src_url)
            self.assertTrue(res[0].prediction().value)
async def test_update(self):
    full_src_url = '0'
    empty_src_url = '1'
    full_repo = Repo(
        full_src_url,
        data={
            "classification": "1",
            "features": {
                "PetalLength": 3.9,
                "PetalWidth": 1.2,
                "SepalLength": 5.8,
                "SepalWidth": 2.7,
            },
            "prediction": {
                "classification": "feedface",
                "confidence": 0.42,
            },
        },
    )
    empty_repo = Repo(
        empty_src_url,
        data={
            "classification": "1",
            "features": {
                "PetalLength": 3.9,
                "PetalWidth": 1.2,
                "SepalLength": 5.8,
                "SepalWidth": 2.7,
            },
        },
    )
    with tempfile.NamedTemporaryFile() as csvfile:
        csvSource = CSVSource(csvfile.name)
        # Open, update, and close
        async with csvSource as source:
            await source.update(full_repo)
            await source.update(empty_repo)
        # Open and confirm we saved and loaded correctly
        async with csvSource as source:
            with self.subTest(src_url=full_src_url):
                repo = await source.repo(full_src_url)
                self.assertEqual(
                    repo.data.prediction.classification, "feedface"
                )
                self.assertEqual(repo.data.prediction.confidence, 0.42)
            with self.subTest(src_url=empty_src_url):
                repo = await source.repo(empty_src_url)
                self.assertFalse(repo.data.prediction.classification)
                self.assertFalse(repo.data.prediction.confidence)
async def setUp(self):
    super().setUp()
    self.repos = [Repo(str(random.random())) for _ in range(0, 10)]
    self.__temp_filename = non_existant_tempfile()
    self.temp_filename = self.__temp_filename.__enter__()
    self.sconfig = FileSourceConfig(filename=self.temp_filename)
    async with JSONSource(self.sconfig) as source:
        async with source() as sctx:
            for repo in self.repos:
                await sctx.update(repo)
async def _add_memory_source(self):
    async with MemorySource(
        MemorySourceConfig(
            repos=[
                Repo(str(i), data={"features": {"by_ten": i * 10}})
                for i in range(0, self.num_repos)
            ]
        )
    ) as source:
        self.source = self.cli.app["sources"][self.slabel] = source
        async with source() as sctx:
            self.sctx = self.cli.app["source_contexts"][self.slabel] = sctx
            yield
async def test_02_predict(self):
    a = Repo('a', data={'features': {self.feature.NAME: 1}})
    sources = Sources(RepoSource(a))
    async with sources as sources, self.features as features:
        res = [
            repo
            async for repo in self.model.predict(
                sources.repos(), features, self.classifications
            )
        ]
        self.assertEqual(len(res), 1)
        self.assertEqual(res[0][0].src_url, a.src_url)
        self.assertTrue(res[0][1])
async def test_02_predict(self):
    a = Repo('a', data={'features': {self.feature.NAME: 1}})
    async with Sources(
        MemorySource(MemorySourceConfig(repos=[a]))
    ) as sources, self.features as features, self.model as model:
        async with sources() as sctx, model() as mctx:
            res = [
                repo
                async for repo in mctx.predict(
                    sctx.repos(), features, self.classifications
                )
            ]
            self.assertEqual(len(res), 1)
            self.assertEqual(res[0][0].src_url, a.src_url)
            self.assertTrue(res[0][1])
async def update(self, repo: Repo):
    db = self.conn
    # Just dump the whole repo as JSON. (If you want to set up queries
    # easily, you'll need to massage the columns in this table to your
    # liking, and perhaps add more tables.)
    marshall = json.dumps(repo.dict())
    await db.execute(
        "INSERT INTO ml_data (src_url, json) VALUES(%s, %s) "
        "ON DUPLICATE KEY UPDATE json = %s",
        (repo.src_url, marshall, marshall),
    )
    self.logger.debug("updated: %s", marshall)
    self.logger.debug("update: %s", await self.repo(repo.src_url))
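That upsert presumes roughly this table. A guessed-at DDL, since the real schema isn't shown here:

# Hypothetical MySQL DDL behind the ml_data upsert above; the PRIMARY KEY on
# src_url is what makes ON DUPLICATE KEY UPDATE fire for existing repos.
ML_DATA_DDL = """
CREATE TABLE IF NOT EXISTS ml_data (
    src_url VARCHAR(255) PRIMARY KEY,
    json TEXT
)
"""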
def setUpClass(cls):
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = SLR(SLRConfig(directory=cls.model_dir.name, predict="Y"))
    cls.feature = DefFeature("X", float, 1)
    cls.features = Features(cls.feature)
    X, Y = list(zip(*FEATURE_DATA))
    cls.repos = [
        Repo(str(i), data={"features": {"X": X[i], "Y": Y[i]}})
        for i in range(0, len(Y))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(repos=cls.repos))
    )
async def repo(self, key: str):
    query = self.parent.config.repo_query
    repo = Repo(key)
    db = self.conn
    await db.execute(query, (key,))
    row = await db.fetchone()
    if row is not None:
        repo.merge(
            Repo(
                row["key"],
                data={
                    "features": {
                        col.replace("feature_", ""): value
                        for col, value in row.items()
                        if col.startswith("feature_")
                    },
                    "prediction": {
                        col.replace("prediction_", ""): value
                        for col, value in row.items()
                        if col.startswith("prediction_")
                    },
                },
            )
        )
    return repo
def convert_to_repo(self, result):
    modified_repo = {"key": "", "data": {"features": {}, "prediction": {}}}
    for key, value in result.items():
        if key.startswith("feature_"):
            modified_repo["data"]["features"][
                key.replace("feature_", "")
            ] = value
        elif ("_value" in key) or ("_confidence" in key):
            target = key.replace("_value", "").replace("_confidence", "")
            modified_repo["data"]["prediction"][target] = {
                "value": result[target + "_value"],
                "confidence": result[target + "_confidence"],
            }
        else:
            modified_repo[key] = value
    return Repo(modified_repo["key"], data=modified_repo["data"])
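A worked example of that mapping on an invented row, showing the flat columns folding back into nested repo data:

# Invented flat row as a DB query might return it; "target" is a stand-in
# for whatever the model predicts.
result = {
    "key": "42",
    "feature_SepalWidth": 2.7,
    "target_value": "1",
    "target_confidence": 0.9,
}
# convert_to_repo should produce a Repo("42") whose data is equivalent to:
expected_data = {
    "features": {"SepalWidth": 2.7},
    "prediction": {"target": {"value": "1", "confidence": 0.9}},
}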
async def repo(self, key: str):
    repo = Repo(key)
    db = self.conn
    # Get features
    await db.execute("SELECT json FROM ml_data WHERE key=%s", (key,))
    dump = await db.fetchone()
    if dump is not None and dump[0] is not None:
        repo.merge(Repo(key, data=json.loads(dump[0])))
    await db.execute(
        "SELECT maintained FROM `status` WHERE key=%s", (key,)
    )
    maintained = await db.fetchone()
    if maintained is not None and maintained[0] is not None:
        repo.evaluated({"maintained": str(maintained[0])})
    return repo
async def repo(self, src_url: str):
    repo = Repo(src_url)
    db = self.conn
    # Get features
    await db.execute(
        "SELECT json FROM ml_data WHERE src_url=%s", (src_url,)
    )
    dump = await db.fetchone()
    if dump is not None and dump[0] is not None:
        repo.merge(Repo(src_url, data=json.loads(dump[0])))
    await db.execute(
        "SELECT maintained FROM `status` WHERE src_url=%s", (src_url,)
    )
    classification = await db.fetchone()
    if classification is not None and classification[0] is not None:
        repo.classify(str(classification[0]))
    return repo