async def test_tag(self):
    """Check that two sources with different tags can share one file.

    The same key ("0") is written through an untagged source and through a
    source whose tag is "sometag"; each source must read back its own
    features, and the file on disk must hold one row per tag.
    """
    with tempfile.TemporaryDirectory() as testdir:
        self.testfile = os.path.join(testdir, str(random.random()))
        untagged = await self.setUpSource()
        tagged = await self.setUpSource()
        tagged.config = tagged.config._replace(tag="sometag")
        async with untagged, tagged:
            # Write record "0" once through each source
            async with untagged() as uctx, tagged() as lctx:
                await uctx.update(Record("0", data={"features": {"feed": 1}}))
                await lctx.update(Record("0", data={"features": {"face": 2}}))
            # Fresh contexts: each source must only see its own features
            async with untagged() as uctx, tagged() as lctx:
                from_untagged = await uctx.record("0")
                self.assertIn("feed", from_untagged.features())
                from_tagged = await lctx.record("0")
                self.assertIn("face", from_tagged.features())
        # Inspect the raw file contents, grouped by tag and then by key
        with open(self.testfile, "r") as fd:
            rows = {}
            for row in csv.DictReader(fd, dialect="strip"):
                rows[row["tag"]] = {row["key"]: row}
            self.assertIn("untagged", rows)
            self.assertIn("sometag", rows)
            untagged_rows = rows["untagged"]
            tagged_rows = rows["sometag"]
            self.assertIn("0", untagged_rows)
            self.assertIn("0", tagged_rows)
            self.assertIn("feed", untagged_rows["0"])
            self.assertIn("face", tagged_rows["0"])
            # Values read through csv.DictReader come back as strings
            self.assertEqual("1", untagged_rows["0"]["feed"])
            self.assertEqual("2", tagged_rows["0"]["face"])
async def record(self, key: str):
    """Look up a single record by key in the configured database table.

    Columns named ``feature_<name>`` become features. Columns named
    ``<target>_value`` become predictions, paired with the matching
    ``<target>_confidence`` column.

    Args:
        key: Value matched against the ``key`` column.

    Returns:
        A ``Record`` for ``key``; it is blank when no row matches.

    Raises:
        KeyError: if a ``<target>_value`` column has no matching
            ``<target>_confidence`` column in the row.
    """
    feature_prefix = "feature_"
    value_suffix = "_value"
    record = Record(key)
    async with self.parent.db() as db_ctx:
        try:
            row = await db_ctx.lookup(
                self.parent.config.table_name,
                cols=None,  # None turns into *. We want all columns
                conditions=[[Condition("key", "=", key)]],
            ).__anext__()
        except StopAsyncIteration:
            # No matching row: the async generator ended without yielding
            return record
        if row is not None:
            features = {}
            predictions = {}
            # ``column`` rather than ``key`` so the parameter is not shadowed
            for column, value in row.items():
                if column.startswith(feature_prefix):
                    # Strip only the leading prefix; str.replace would also
                    # remove later occurrences inside the feature name
                    features[column[len(feature_prefix):]] = value
                elif column.endswith(value_suffix):
                    # Strip only the trailing suffix, for the same reason
                    target = column[: -len(value_suffix)]
                    predictions[target] = {
                        "value": value,
                        "confidence": row[target + "_confidence"],
                    }
            record.merge(
                Record(
                    row["key"],
                    data={"features": features, "prediction": predictions},
                )
            )
    return record
async def test_ini(self):
    """Save two records to an INISource and verify what is loaded back."""
    with TemporaryDirectory() as testdir:
        self.testfile = os.path.join(testdir, "testfile.ini")
        # Source backed by a file inside the temporary directory
        source = INISource(
            filename=self.testfile, allowempty=True, readwrite=True
        )
        # Store two records, each with two features
        first = Record("section1", data={"features": {"A": 1, "B": 2}})
        second = Record("section2", data={"features": {"C": 3, "D": 4}})
        await save(source, first, second)
        # Read every record back out of the source
        records = []
        async for loaded in load(source):
            records.append(loaded)
        self.assertIsInstance(records, list)
        self.assertEqual(len(records), 2)
        # The feature names are expected to come back lower-cased
        self.assertDictEqual(records[0].features(), {"a": 1, "b": 2})
        self.assertDictEqual(records[1].features(), {"c": 3, "d": 4})
async def record(self, key: str):
    """Fetch a single record by running the configured record query.

    Columns named ``feature_<name>`` become features. Columns named
    ``<target>_value`` become predictions, paired with the matching
    ``<target>_confidence`` column.

    Args:
        key: Value bound as the only parameter of ``record_query``.

    Returns:
        A ``Record`` for ``key``; it is blank when the query returns no row.

    Raises:
        KeyError: if a ``<target>_value`` column has no matching
            ``<target>_confidence`` column in the row.
    """
    feature_prefix = "feature_"
    value_suffix = "_value"
    query = self.parent.config.record_query
    record = Record(key)
    db = self.conn
    await db.execute(query, (key,))
    row = await db.fetchone()
    if row is not None:
        features = {}
        predictions = {}
        # ``column`` rather than ``key`` so the parameter is not shadowed
        for column, value in row.items():
            if column.startswith(feature_prefix):
                # Strip only the leading prefix; str.replace would also
                # remove later occurrences inside the feature name
                features[column[len(feature_prefix):]] = value
            elif column.endswith(value_suffix):
                # Strip only the trailing suffix, for the same reason
                target = column[: -len(value_suffix)]
                predictions[target] = {
                    "value": value,
                    "confidence": row[target + "_confidence"],
                }
        record.merge(
            Record(
                row["key"],
                data={"features": features, "prediction": predictions},
            )
        )
    return record
def setUpClass(cls):
    """Create the MiscModel under test plus 2000 labelled records.

    Half of the records have keys starting with "a" (feature value 1, string
    "a"); the other half have keys starting with "b" (feature value 0, string
    "not a").
    """
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.model_dir = tempfile.TemporaryDirectory()
    config = MiscModelConfig(
        directory=cls.model_dir.name,
        classifications=["not a", "a"],
        features=cls.features,
    )
    cls.model = MiscModel(config)
    # Random suffixes keep the record keys unique
    starts_with_a = [
        Record(
            "a" + str(random.random()),
            data={"features": {cls.feature.NAME: 1, "string": "a"}},
        )
        for _ in range(1000)
    ]
    starts_with_b = [
        Record(
            "b" + str(random.random()),
            data={"features": {cls.feature.NAME: 0, "string": "not a"}},
        )
        for _ in range(1000)
    ]
    cls.records = starts_with_a + starts_with_b
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
def setUpClass(cls):
    """Create 2000 labelled records and the DNN classifier under test.

    Records whose key starts with "a" carry feature value 1 and the string
    "a"; records whose key starts with "b" carry 0 and "not a".
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature = Feature("starts_with_a", int, 1)
    cls.features = Features(cls.feature)
    # Random suffixes keep the record keys unique
    starts_with_a = [
        Record(
            "a" + str(random.random()),
            data={"features": {cls.feature.name: 1, "string": "a"}},
        )
        for _ in range(1000)
    ]
    starts_with_b = [
        Record(
            "b" + str(random.random()),
            data={"features": {cls.feature.name: 0, "string": "not a"}},
        )
        for _ in range(1000)
    ]
    cls.records = starts_with_a + starts_with_b
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    config = DNNClassifierModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("string", str, 1),
        classifications=["a", "not a"],
        clstype=str,
        features=cls.features,
    )
    cls.model = DNNClassifierModel(config)
def setUpClass(cls):
    """Build train/test records from TRAIN_DATA / TEST_DATA and the QA model.

    Both datasets are transposed into six columns (title, context, question,
    answer text, answer start position, is-impossible flag). Each row becomes
    a record keyed by its index, with an extra empty "answers" list.
    """
    # Transpose rows into columns; unpacking fails unless there are exactly six
    (
        A_train,
        B_train,
        C_train,
        X_train,
        D_train,
        E_train,
    ) = list(zip(*TRAIN_DATA))
    A_test, B_test, C_test, X_test, D_test, E_test = list(zip(*TEST_DATA))

    cls.train_records = [
        Record(
            str(index),
            data={
                "features": {
                    "title": title,
                    "context": context,
                    "question": question,
                    "answer_text": answer_text,
                    "start_pos_char": start_pos_char,
                    "is_impossible": is_impossible,
                    "answers": [],
                }
            },
        )
        for index, (
            title,
            context,
            question,
            answer_text,
            start_pos_char,
            is_impossible,
        ) in enumerate(
            zip(A_train, B_train, C_train, X_train, D_train, E_train)
        )
    ]
    cls.test_records = [
        Record(
            str(index),
            data={
                "features": {
                    "title": title,
                    "context": context,
                    "question": question,
                    "answer_text": answer_text,
                    "start_pos_char": start_pos_char,
                    "is_impossible": is_impossible,
                    "answers": [],
                }
            },
        )
        for index, (
            title,
            context,
            question,
            answer_text,
            start_pos_char,
            is_impossible,
        ) in enumerate(zip(A_test, B_test, C_test, X_test, D_test, E_test))
    ]
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.test_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.test_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    # Model output and logs both go to the temporary directory
    config = QAModelConfig(
        model_name_or_path="bert-base-cased",
        cache_dir=CACHE_DIR,
        directory=cls.model_dir.name,
        log_dir=cls.model_dir.name,
        model_type="bert",
        no_cuda=True,
    )
    cls.model = QAModel(config)
def setUp(self):
    """Create one bare record and one record populated with data and extra."""
    self.null = Record("null")
    full_data = {
        "features": {"dead": "beef"},
        "extra": {"extra": "read all about it"},
    }
    self.full = Record("full", data=full_data, extra={"half": True})
async def test_save_and_load(self):
    """Save two records to a CSVSource, then load all of them and just one."""
    source = CSVSource(
        filename=self.save_and_load, allowempty=True, readwrite=True
    )
    first = Record(
        "1",
        data={
            "features": {"A": 0, "B": 1},
            "prediction": {"C": {"value": 1, "confidence": 1.0}},
        },
    )
    second = Record(
        "2",
        data={
            "features": {"A": 3, "B": 4},
            "prediction": {"C": {"value": 2, "confidence": 1.0}},
        },
    )
    await save(source, first, second)

    # Prediction values are expected back as strings although ints were saved
    expected_first = {
        "key": "1",
        "features": {"A": 0, "B": 1},
        "prediction": {"C": {"confidence": 1.0, "value": "1"}},
        "extra": {},
    }
    expected_second = {
        "key": "2",
        "features": {"A": 3, "B": 4},
        "prediction": {"C": {"confidence": 1.0, "value": "2"}},
        "extra": {},
    }

    # All records in source
    everything = []
    async for record in load(source):
        everything.append(record.export())
    self.assertEqual(everything, [expected_first, expected_second])

    # For specific records in a source
    only_first = []
    async for record in load(source, "1"):
        only_first.append(record.export())
    self.assertEqual(only_first, [expected_first])
async def record(self, key: str):
    """Build a record from the ``ml_data`` and ``status`` tables.

    The JSON stored in ``ml_data.json`` (if any) is merged into the record,
    and a non-NULL ``status.maintained`` value is recorded through
    ``record.evaluated`` as a string.

    Args:
        key: Value matched against the ``key`` column of both tables.

    Returns:
        The ``Record`` for ``key``; it is blank when neither table has data.
    """
    # NOTE(review): `status` is back-quoted but the `key` column is not;
    # confirm the target database accepts an unquoted `key` identifier.
    record = Record(key)
    cursor = self.conn
    # Get features
    await cursor.execute("SELECT json FROM ml_data WHERE key=%s", (key,))
    json_row = await cursor.fetchone()
    serialized = None if json_row is None else json_row[0]
    if serialized is not None:
        record.merge(Record(key, data=json.loads(serialized)))
    # Get the maintained status
    await cursor.execute(
        "SELECT maintained FROM `status` WHERE key=%s", (key,)
    )
    status_row = await cursor.fetchone()
    maintained = None if status_row is None else status_row[0]
    if maintained is not None:
        record.evaluated({"maintained": str(maintained)})
    return record
def setUpClass(cls):
    """Build training and prediction records and the NER model under test.

    TRAIN_DATA is transposed into (sentence id, words, NER tag) columns and
    PREDICT_DATA into (sentence id, words); each row becomes a record keyed
    by its index.
    """
    # Transpose rows into columns; unpacking enforces the column counts
    A_train, B_train, X = list(zip(*TRAIN_DATA))
    A_predict, B_predict = list(zip(*PREDICT_DATA))

    cls.train_records = [
        Record(
            str(index),
            data={
                "features": {
                    "sentence_id": sentence_id,
                    "words": words,
                    "ner_tag": ner_tag,
                }
            },
        )
        for index, (sentence_id, words, ner_tag) in enumerate(
            zip(A_train, B_train, X)
        )
    ]
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    # Prediction records carry no "ner_tag" feature
    cls.predict_records = [
        Record(
            str(index),
            data={"features": {"sentence_id": sentence_id, "words": words}},
        )
        for index, (sentence_id, words) in enumerate(
            zip(A_predict, B_predict)
        )
    ]
    cls.predict_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.predict_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    config = NERModelConfig(
        sid=Feature("sentence_id", int, 1),
        words=Feature("words", str, 1),
        predict=Feature("ner_tag", str, 1),
        output_dir=cls.model_dir.name,
        model_architecture_type="bert",
        model_name_or_path="bert-base-cased",
        no_cuda=True,
    )
    cls.model = NERModel(config)
async def test_02_predict(self):
    """Predict on two records whose keys are the expected classifications."""
    # Each key doubles as the prediction expected for that record
    positive = Record("a", data={"features": {self.feature.NAME: 1}})
    negative = Record("not a", data={"features": {self.feature.NAME: 0}})
    memory = MemorySource(MemorySourceConfig(records=[positive, negative]))
    async with Sources(memory) as sources, self.model as model:
        async with sources() as sctx, model() as mctx:
            seen = 0
            async for record, prediction, _confidence in mctx.predict(
                sctx.records()
            ):
                with self.subTest(record=record):
                    self.assertEqual(prediction, record.key)
                seen += 1
            # Both records must have been yielded by predict()
            self.assertEqual(seen, 2)
def setUpClass(cls):
    """Create the AnomalyModel and 1800 random records split 1400/400.

    Features "A" and "B" are drawn from a normal distribution (mean 2,
    standard deviation 1); "Y" is 1 when A > 1 - B and 0 otherwise.
    """
    # Temporary directory that will hold the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.model = AnomalyModel(
        features=Features(Feature("A", int, 1), Feature("B", int, 2)),
        predict=Feature("Y", int, 1),
        directory=cls.model_dir.name,
    )
    # Generate the data: one row of samples per feature
    _n_data = 1800
    _temp_data = np.random.normal(2, 1, size=(2, _n_data))
    a_samples, b_samples = _temp_data[0], _temp_data[1]
    cls.records = [
        Record(
            "x" + str(random.random()),
            data={
                "features": {
                    "A": float(a_samples[i]),
                    "B": float(b_samples[i]),
                    "Y": (a_samples[i] > 1 - b_samples[i]).astype(int),
                }
            },
        )
        for i in range(_n_data)
    ]
    # First 1400 records are used for training, the rest for testing
    training, testing = cls.records[:1400], cls.records[1400:]
    cls.trainingsource = Sources(
        MemorySource(MemorySourceConfig(records=training))
    )
    cls.testsource = Sources(
        MemorySource(MemorySourceConfig(records=testing))
    )
async def setUp(self):
    """Write ten random records to a temporary JSON source and patch loaders.

    After the records are written, the file is read back to make sure it is
    not empty. The ``load`` methods of Model, OperationImplementation and
    Operation are then patched for the duration of the test.
    """
    await super().setUp()
    self.records = [Record(str(random.random())) for _ in range(10)]
    self.temp_filename = self.mktempfile()
    self.sconfig = FileSourceConfig(
        filename=self.temp_filename, readwrite=True, allowempty=True
    )
    async with JSONSource(self.sconfig) as source:
        async with source() as sctx:
            for record in self.records:
                await sctx.update(record)
    # Ensure there are records in the file
    raw_text = Path(self.sconfig.filename).read_text()
    contents = json.loads(raw_text)
    self.assertEqual(
        len(contents.get(self.sconfig.tag)),
        len(self.records),
        "RecordsTestCase JSON file erroneously initialized as empty",
    )
    # TODO(p3) For some reason patching Model.load doesn't work
    replacements = (
        ("dffml.model.model.Model.load", model_load),
        ("dffml.df.base.OperationImplementation.load", opimp_load),
        ("dffml.df.types.Operation.load", op_load),
    )
    for target, replacement in replacements:
        self._stack.enter_context(patch(target, new=replacement))
def setUpClass(cls):
    """Create the DNN regression model and 2000 records of synthetic data.

    The target follows f(x1, x2) = 2*x1 + 3*x2, with both inputs drawn
    uniformly from [0, 1).
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature1 = Feature("feature_1", float, 1)
    cls.feature2 = Feature("feature_2", float, 1)
    cls.features = Features(cls.feature1, cls.feature2)
    config = DNNRegressionModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("TARGET", float, 1),
        features=cls.features,
    )
    cls.model = DNNRegressionModel(config)
    # Generating data f(x1,x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    x1_samples, x2_samples = _temp_data[0], _temp_data[1]
    cls.records = [
        Record(
            "x" + str(random.random()),
            data={
                "features": {
                    cls.feature1.name: float(x1_samples[i]),
                    cls.feature2.name: float(x2_samples[i]),
                    "TARGET": 2 * x1_samples[i] + 3 * x2_samples[i],
                }
            },
        )
        for i in range(_n_data)
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
async def model_predict(self, request, mctx):
    """Run prediction on the records supplied in the request body.

    The JSON body is read as a mapping of record key to record data. The
    records are placed in an in-memory source whose context is handed to
    ``mctx.predict``.

    Args:
        request: Incoming request; ``match_info["chunk_size"]`` must be 0.
        mctx: Model context used for prediction.

    Returns:
        A JSON response with ``iterkey`` (always None) and ``records``
        (exported records keyed by record key), or a 400 response when
        ``chunk_size`` is not 0.
    """
    # TODO Provide an iterkey method for model prediction
    chunk_size = int(request.match_info["chunk_size"])
    if chunk_size != 0:
        return web.json_response(
            {"error": "Multiple request iteration not yet supported"},
            status=HTTPStatus.BAD_REQUEST,
        )
    # Create a source which will provide the records
    async with Sources(
        MemorySource(
            records=[
                Record(key, data=record_data)
                for key, record_data in (await request.json()).items()
            ]
        )
    ) as source:
        async with source() as sctx:
            # Feed them through prediction
            return web.json_response(
                {
                    "iterkey": None,
                    "records": {
                        record.key: record.export()
                        async for record in mctx.predict(sctx)
                    },
                }
            )
async def test_02_predict(self):
    """Predict one record and require a relative error below 0.3.

    The expected target uses the same function as the training data,
    f(x1, x2) = 2*x1 + 3*x2.
    """
    x1, x2 = 1.5, 2
    # Should be the same function used in TestDNN.setUpClass
    expected = 2 * x1 + 3 * x2
    a = Record(
        "a",
        data={"features": {self.feature1.name: x1, self.feature2.name: x2}},
    )
    memory = MemorySource(MemorySourceConfig(records=[a]))
    async with Sources(memory) as sources, self.model as model:
        target_name = model.config.predict.name
        async with sources() as sctx, model() as mctx:
            res = []
            async for record in mctx.predict(sctx.records()):
                res.append(record)
            self.assertEqual(len(res), 1)
            self.assertEqual(res[0].key, a.key)
            predicted = res[0].prediction(target_name).value
            test_error_norm = abs((expected - predicted) / expected + 1e-6)
            error_threshold = 0.3
            self.assertLess(test_error_norm, error_threshold)
async def test_update(self):
    """POST an updated record and verify the source now holds its feature."""
    key = "1"
    new_record = Record(key, data={"features": {"by_ten": 10}})
    url = f"/source/{self.slabel}/update/{key}"
    async with self.post(url, json=new_record.export()) as r:
        self.assertEqual(await r.json(), OK)
        # The update must be visible through the source context
        stored = await self.sctx.record(key)
        self.assertEqual(stored.feature("by_ten"), 10)
async def model_predict(self, request, mctx):
    """Run prediction on the records supplied in the request body.

    The JSON body is read as a mapping of record key to record data, and the
    resulting records are fed to ``mctx.predict`` through an async generator.

    Returns a JSON response with ``iterkey`` (always None) and ``records``
    (exported records keyed by record key), or a 400 response when the
    ``chunk_size`` path parameter is not 0.
    """
    # TODO Provide an iterkey method for model prediction
    chunk_size = int(request.match_info["chunk_size"])
    if chunk_size != 0:
        return web.json_response(
            {"error": "Multiple request iteration not yet supported"},
            status=HTTPStatus.BAD_REQUEST,
        )

    # Get the records
    payload = await request.json()
    records: Dict[str, Record] = {}
    for key, record_data in payload.items():
        records[key] = Record(key, data=record_data)

    # Create an async generator to feed records
    async def record_gen():
        for record in records.values():
            yield record

    # Feed them through prediction
    exported = {}
    async for predicted in mctx.predict(record_gen()):
        exported[predicted.key] = predicted.export()
    return web.json_response({"iterkey": None, "records": exported})
def setUpClass(cls):
    """Create the XGBRegressorModel and 2000 records split 1800/200.

    The target follows f(x1, x2) = 2*x1 + 3*x2, with both inputs drawn
    uniformly from [0, 1).
    """
    # Temporary directory that will hold the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    config = XGBRegressorModelConfig(
        features=Features(Feature("Feature1", float, 1), Feature("Feature2")),
        predict=Feature("Target", float, 1),
        directory=cls.model_dir.name,
    )
    cls.model = XGBRegressorModel(config)
    # Generating data f(x1,x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    x1_samples, x2_samples = _temp_data[0], _temp_data[1]
    cls.records = [
        Record(
            "x" + str(random.random()),
            data={
                "features": {
                    "Feature1": float(x1_samples[i]),
                    "Feature2": float(x2_samples[i]),
                    "Target": 2 * x1_samples[i] + 3 * x2_samples[i],
                }
            },
        )
        for i in range(_n_data)
    ]
    # First 1800 records are used for training, the rest for testing
    training, testing = cls.records[:1800], cls.records[1800:]
    cls.trainingsource = Sources(
        MemorySource(MemorySourceConfig(records=training))
    )
    cls.testsource = Sources(
        MemorySource(MemorySourceConfig(records=testing))
    )
def setUpClass(cls):
    """Build records from DATA and the text classification model under test.

    DATA is transposed into a text column ("A") and a label column ("X");
    each row becomes a record keyed by its index.
    """
    cls.features = Features()
    cls.features.append(Feature("A", str, 1))
    # Transpose rows into columns; unpacking enforces exactly two columns
    A, X = list(zip(*DATA))
    cls.records = [
        Record(str(index), data={"features": {"A": text, "X": label}})
        for index, (text, label) in enumerate(zip(A, X))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    # Layer definitions are passed to the model as strings
    extra_layers = [
        "Dense(units = 120, activation='relu')",
        "Dense(units = 64, activation=relu)",
        "Dense(units = 2, activation='softmax')",
    ]
    config = TextClassifierConfig(
        directory=cls.model_dir.name,
        classifications=[0, 1],
        features=cls.features,
        predict=Feature("X", int, 1),
        add_layers=True,
        layers=extra_layers,
        model_path="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
        epochs=30,
    )
    cls.model = TextClassificationModel(config)
def setUpClass(cls):
    """Build records from DATA, the VWModel under test, and its scorer.

    DATA is transposed into nine columns: features A-H and the target X.
    Each row becomes a record keyed by its index.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()
    # A-F are float features, G and H are int features
    feature_types = (
        ("A", float),
        ("B", float),
        ("C", float),
        ("D", float),
        ("E", float),
        ("F", float),
        ("G", int),
        ("H", int),
    )
    for feature_name, feature_type in feature_types:
        cls.features.append(Feature(feature_name, feature_type, 1))
    # Transpose rows into columns; unpacking enforces exactly nine columns
    A, B, C, D, E, F, G, H, X = list(zip(*DATA))
    cls.records = [
        Record(str(index), data={"features": dict(zip("ABCDEFGHX", row))})
        for index, row in enumerate(zip(A, B, C, D, E, F, G, H, X))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    config = VWConfig(
        location=cls.model_dir.name,
        features=cls.features,
        predict=Feature("X", float, 1),
        # A and B will be in namespace n1
        # A and C will be in namespace n2
        namespace=["n1_A_B", "n2_A_C"],
        importance=Feature("H", int, 1),
        tag=Feature("G", int, 1),
        task="regression",
        vwcmd=["l2", "0.1", "loss_function", "squared", "passes", "10"],
    )
    cls.model = VWModel(config)
    cls.scorer = MeanSquaredErrorAccuracy()
async def test_02_predict(self):
    """Predict a single record and require a truthy predicted value."""
    a = Record("a", data={"features": {self.feature.NAME: 1}})
    memory = MemorySource(MemorySourceConfig(records=[a]))
    async with Sources(memory) as sources, self.model as model:
        target_name = model.config.predict.NAME
        async with sources() as sctx, model() as mctx:
            res = []
            async for record in mctx.predict(sctx.records()):
                res.append(record)
            # Exactly the record that was fed in must come back
            self.assertEqual(len(res), 1)
            self.assertEqual(res[0].key, a.key)
            self.assertTrue(res[0].prediction(target_name).value)
def setUpClass(cls):
    """Build train/test records, the spaCy NER model, and its scorer.

    TRAIN_DATA and TEST_DATA are each transposed into a sentence column and
    an entities column; each row becomes a record keyed by its index.
    """
    # Transpose rows into columns; unpacking enforces exactly two columns
    A_train, X_train = list(zip(*TRAIN_DATA))
    A_test, X_test = list(zip(*TEST_DATA))

    cls.train_records = [
        Record(
            str(index),
            data={"features": {"sentence": sentence, "entities": entities}},
        )
        for index, (sentence, entities) in enumerate(zip(A_train, X_train))
    ]
    cls.test_records = [
        Record(
            str(index),
            data={"features": {"sentence": sentence, "entities": entities}},
        )
        for index, (sentence, entities) in enumerate(zip(A_test, X_test))
    ]
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.test_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.test_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    config = SpacyNERModelConfig(
        model_name="en_core_web_sm",
        location=cls.model_dir.name,
        n_iter=10,
        dropout=0.4,
    )
    cls.model = SpacyNERModel(config)
    cls.scorer = SpacyNerAccuracy()
async def record(self, key: str):
    """Return the record stored under ``key``, or a blank one if absent.

    Runs the configured ``record`` query with ``key`` as its only parameter
    and merges the converted row, when there is one, into a blank record.
    """
    # Start from a blank record so a missing row still yields a result
    record = Record(key)
    connection = self.conn
    # Query for the single row that matches the key
    await connection.execute(self.parent.config.record, (key,))
    row = await connection.fetchone()
    # Populate the blank record when the row exists
    if row is not None:
        found = self.row_to_record(row)
        record.merge(found)
    self.logger.debug("Got: %s: %r", record.key, record.export())
    return record
def setUpClass(self):
    """Create four records from the A, B, C and D sequences and a source.

    Record ``i`` takes its "Years", "Expertise", "Trust" and "Salary"
    features from index ``i`` of A, B, C and D respectively.
    """
    records = []
    for i in range(4):
        features = {
            "Years": A[i],
            "Expertise": B[i],
            "Trust": C[i],
            "Salary": D[i],
        }
        records.append(Record(str(i), data={"features": features}))
    self.records = records
    self.source = Sources(
        MemorySource(MemorySourceConfig(records=self.records))
    )
async def test_model(self):
    """Train, score and predict with the regression model.

    Training is retried with a fresh model directory while accuracy is at or
    below 0.8 (for the first five attempts). Once an attempt is accepted, a
    single record is predicted and its relative error must be below 0.3.
    The expected target uses f(x1, x2) = 2*x1 + 3*x2.
    """
    test_feature_val = [
        0,
        1.5,
        2,
    ]  # inserting zero so that it is 1-indexable
    # Should be the same function used in TestDNN.setUpClass
    test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2]
    a = Record(
        "a",
        data={
            "features": {
                self.feature1.name: test_feature_val[1],
                self.feature2.name: test_feature_val[2],
            }
        },
    )
    target_name = self.model.config.predict.name
    for i in range(0, 7):
        await train(self.model, self.sources)
        res = await accuracy(self.model, self.sources)
        # Retry because of tensorflow intermittent low accuracy
        if res <= 0.8 and i < 5:
            print("Retry i:", i, "accuracy:", res)
            # Start over with an empty model directory
            self.model_dir.cleanup()
            self.model_dir = tempfile.TemporaryDirectory()
            self.model.config = self.model.config._replace(
                directory=self.model_dir.name
            )
            continue
        self.assertGreater(res, 0.8)
        # Stop once an attempt is accepted; without this the loop would
        # keep retraining and re-asserting for every remaining iteration
        break
    res = [
        record async for record in predict(self.model, a, keep_record=True)
    ]
    self.assertEqual(len(res), 1)
    self.assertEqual(res[0].key, a.key)
    test_error_norm = abs(
        (test_target - res[0].prediction(target_name).value) / test_target
        + 1e-6
    )
    error_threshold = 0.3
    self.assertLess(test_error_norm, error_threshold)
def setUpClass(cls):
    """Build records from FEATURE_DATA and the SLR model under test.

    FEATURE_DATA is transposed into an X column and a Y column; each row
    becomes a record keyed by its index.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature = DefFeature("X", float, 1)
    cls.features = Features(cls.feature)
    # Transpose rows into columns; unpacking enforces exactly two columns
    X, Y = list(zip(*FEATURE_DATA))
    cls.records = [
        Record(str(index), data={"features": {"X": x_value, "Y": y_value}})
        for index, (x_value, y_value) in enumerate(zip(X, Y))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    config = SLRConfig(
        directory=cls.model_dir.name,
        predict=DefFeature("Y", float, 1),
        features=cls.features,
    )
    cls.model = SLR(config)
async def setUp(self):
    """Create train/test/predict datasets, records and CSV files.

    For each of "train", "test" and "predict" this sets ``<use>_data``,
    ``<use>_records`` and ``<use>_filename``, and populates a CSVSource at
    that filename with the records.
    """
    await super().setUp()
    self.train_data = [
        [0, 1, 0.2, 10],
        [1, 3, 0.4, 20],
        [2, 5, 0.6, 30],
        [3, 7, 0.8, 40],
    ]
    self.test_data = [[4, 9, 1.0, 50], [5, 11, 1.2, 60]]
    # Prediction rows have one value fewer than train/test rows
    self.predict_data = [[6, 13, 1.4], [7, 15, 1.6]]
    datasets = (
        ("train", self.train_data),
        ("test", self.test_data),
        ("predict", self.predict_data),
    )
    for use, rows in datasets:
        records = []
        # Records are keyed by their integer row index
        for index, values in enumerate(rows):
            named = dict(zip(FEATURE_NAMES, values))
            records.append(Record(index, data={"features": named}))
        setattr(self, f"{use}_records", records)
        filename = self.mktempfile() + ".csv"
        setattr(self, f"{use}_filename", filename)
        await self.populate_source(CSVSource, *records, filename=filename)
def row_to_record(self, row):
    """Convert a database row into a Record using the configured mappings.

    ``config.features`` maps feature name to column name. ``config.predictions``
    maps feature name to a (value column, confidence column) pair. The record
    key is read from the column named by ``config.key``.
    """
    config = self.parent.config
    # Features
    features = {
        feature_name: row[column_name]
        for feature_name, column_name in config.features.items()
    }
    # Predictions
    predictions = {
        feature_name: {
            "value": row[value_column],
            # Set confidence to Not A Number if not given
            "confidence": row.get(confidence_column, float("nan")),
        }
        for feature_name, (
            value_column,
            confidence_column,
        ) in config.predictions.items()
    }
    return Record(
        row[config.key],
        data={"features": features, "prediction": predictions},
    )