async def test_tag(self):
    # Two sources created by setUpSource() are given different tags; a record
    # with the same key is written through each, and both the sources and the
    # resulting CSV file must keep the two records apart.
    with tempfile.TemporaryDirectory() as tmpdir:
        self.testfile = os.path.join(tmpdir, str(random.random()))
        plain_source = await self.setUpSource()
        tagged_source = await self.setUpSource()
        tagged_source.config = tagged_source.config._replace(tag="sometag")
        async with plain_source, tagged_source:
            # Write key "0" through each source with a different feature
            async with plain_source() as plain_ctx, tagged_source() as tagged_ctx:
                await plain_ctx.update(
                    Record("0", data={"features": {"feed": 1}})
                )
                await tagged_ctx.update(
                    Record("0", data={"features": {"face": 2}})
                )
            # Fresh contexts on the still-open sources must see their own data
            async with plain_source() as plain_ctx, tagged_source() as tagged_ctx:
                fetched = await plain_ctx.record("0")
                self.assertIn("feed", fetched.features())
                fetched = await tagged_ctx.record("0")
                self.assertIn("face", fetched.features())
        # Inspect the file directly (presumably written when the sources
        # were closed above -- confirm against the source implementation)
        with open(self.testfile, "r") as fd:
            rows = {}
            for row in csv.DictReader(fd, dialect="strip"):
                # Last row wins for a given tag, as with a dict comprehension
                rows[row["tag"]] = {row["key"]: row}
            expectations = (
                ("untagged", "feed", "1"),
                ("sometag", "face", "2"),
            )
            for tag, _, _ in expectations:
                self.assertIn(tag, rows)
            for tag, _, _ in expectations:
                self.assertIn("0", rows[tag])
            for tag, column, _ in expectations:
                self.assertIn(column, rows[tag]["0"])
            for tag, column, expected in expectations:
                self.assertEqual(expected, rows[tag]["0"][column])
async def record(self, key: str):
    """
    Fetch the row whose ``key`` column equals ``key`` and return it as a Record.

    A blank ``Record(key)`` is returned when the lookup yields no row.
    Columns starting with ``feature_`` become features (prefix removed).
    Columns ending with ``_value`` become predictions, paired with the
    matching ``<target>_confidence`` column.

    Raises:
        KeyError: a ``<target>_value`` column has no ``<target>_confidence``
            sibling in the row.
    """
    feature_prefix = "feature_"
    value_suffix = "_value"
    record = Record(key)
    async with self.parent.db() as db_ctx:
        try:
            row = await db_ctx.lookup(
                self.parent.config.table_name,
                cols=None,  # None turns into *. We want all columns
                conditions=[[Condition("key", "=", key)]],
            ).__anext__()
        except StopAsyncIteration:
            # No matching row: the async generator ended without yielding
            return record
        if row is not None:
            features = {}
            predictions = {}
            # ``column`` (not ``key``) so the method parameter is not shadowed
            for column, value in row.items():
                if column.startswith(feature_prefix):
                    # Strip only the leading prefix; str.replace would also
                    # remove "feature_" from the middle of a feature name
                    features[column[len(feature_prefix):]] = value
                elif column.endswith(value_suffix):
                    # Strip only the trailing suffix, for the same reason
                    target = column[: -len(value_suffix)]
                    predictions[target] = {
                        "value": value,
                        "confidence": row[target + "_confidence"],
                    }
            record.merge(
                Record(
                    row["key"],
                    data={"features": features, "prediction": predictions},
                )
            )
    return record
def setUpClass(cls):
    """
    Build the shared fixture: a MiscModel in a temporary directory and 2000
    in-memory records (1000 keyed "a...", then 1000 keyed "b...").
    """
    cls.feature = StartsWithA()
    cls.features = Features(cls.feature)
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = MiscModelConfig(
        directory=cls.model_dir.name,
        classifications=["not a", "a"],
        features=cls.features,
    )
    cls.model = MiscModel(model_config)
    cls.records = []
    # (key prefix, feature value, "string" label); "a" records come first
    for prefix, flag, label in (("a", 1, "a"), ("b", 0, "not a")):
        for _ in range(1000):
            cls.records.append(
                Record(
                    prefix + str(random.random()),
                    data={
                        "features": {cls.feature.NAME: flag, "string": label}
                    },
                )
            )
    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)
def setUpClass(cls):
    """
    Build the shared fixture: 2000 in-memory records (1000 keyed "a...", then
    1000 keyed "b...") and a DNNClassifierModel that predicts the "string"
    feature, stored in a temporary directory.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature = Feature("starts_with_a", int, 1)
    cls.features = Features(cls.feature)
    cls.records = []
    # (key prefix, feature value, "string" label); "a" records come first
    for prefix, flag, label in (("a", 1, "a"), ("b", 0, "not a")):
        for _ in range(1000):
            cls.records.append(
                Record(
                    prefix + str(random.random()),
                    data={
                        "features": {cls.feature.name: flag, "string": label}
                    },
                )
            )
    memory_source = MemorySource(MemorySourceConfig(records=cls.records))
    cls.sources = Sources(memory_source)
    model_config = DNNClassifierModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("string", str, 1),
        classifications=["a", "not a"],
        clstype=str,
        features=cls.features,
    )
    cls.model = DNNClassifierModel(model_config)
async def record(self, key: str):
    """
    Run the configured ``record_query`` for ``key`` and return the row as a
    Record.

    A blank ``Record(key)`` is returned when the query yields no row.
    Columns starting with ``feature_`` become features (prefix removed).
    Columns ending with ``_value`` become predictions, paired with the
    matching ``<target>_confidence`` column.

    Raises:
        KeyError: a ``<target>_value`` column has no ``<target>_confidence``
            sibling in the row.
    """
    feature_prefix = "feature_"
    value_suffix = "_value"
    query = self.parent.config.record_query
    record = Record(key)
    db = self.conn
    await db.execute(query, (key,))
    row = await db.fetchone()
    if row is not None:
        features = {}
        predictions = {}
        # ``column`` (not ``key``) so the method parameter is not shadowed
        for column, value in row.items():
            if column.startswith(feature_prefix):
                # Strip only the leading prefix; str.replace would also
                # remove "feature_" from the middle of a feature name
                features[column[len(feature_prefix):]] = value
            elif column.endswith(value_suffix):
                # Strip only the trailing suffix, for the same reason
                target = column[: -len(value_suffix)]
                predictions[target] = {
                    "value": value,
                    "confidence": row[target + "_confidence"],
                }
        record.merge(
            Record(
                row["key"],
                data={"features": features, "prediction": predictions},
            )
        )
    return record
async def update(self, record: Record):
    """
    Write ``record`` using the configured update query.

    A column -> value mapping is built with the key column first, then the
    configured feature columns, then the prediction value/confidence columns.
    Missing features and predictions are bound to None.
    """
    config = self.parent.config
    # Key column goes first; everything after it may be repeated below
    bindings = {config.key: record.key}
    # Features
    feature_values = record.features(config.features.keys())
    for feature_name, column_name in config.features.items():
        bindings[column_name] = feature_values.get(feature_name, None)
    # Predictions
    stored = record.predictions(config.predictions.keys())
    for target, (value_column, confidence_column) in config.predictions.items():
        has_prediction = target in stored
        bindings[value_column] = (
            stored[target]["value"] if has_prediction else None
        )
        if confidence_column is not None:
            bindings[confidence_column] = (
                stored[target]["confidence"] if has_prediction else None
            )
    # For anything other than REPLACE INTO the non-key values are passed a
    # second time (for the UPDATE part of the query); the key is not repeated
    values = list(bindings.values())
    if "REPLACE" not in config.update.upper():
        values.extend(values[1:])
    # Execute the update query
    await self.conn.execute(config.update, values)
    self.logger.debug("Updated: %s: %r", record.key, bindings)
async def update(self, record: Record):
    """
    Store the record's features, and its "target_name" prediction if present.

    Feature columns absent from the record are written as None. The
    prediction row is skipped when ``record.prediction`` raises KeyError
    (presumably how a missing prediction is signalled -- confirm against
    Record). A KeyError raised while serialising or writing the prediction is
    no longer swallowed.
    """
    db = self.parent.db
    # Store feature data
    feature_cols = self.parent.FEATURE_COLS
    feature_data = OrderedDict.fromkeys(feature_cols)
    feature_data.update(record.features(feature_cols))
    await db.execute(
        "INSERT OR REPLACE INTO features (key, "
        + ", ".join(feature_cols)
        + ") "
        "VALUES(?, " + ", ".join("?" * len(feature_cols)) + ")",
        [record.key] + list(feature_data.values()),
    )
    # Store prediction. Only the lookup is guarded, so failures in the
    # database write below surface instead of being silently dropped.
    try:
        prediction = record.prediction("target_name")
    except KeyError:
        return
    prediction_cols = self.parent.PREDICTION_COLS
    prediction_data = OrderedDict.fromkeys(prediction_cols)
    prediction_data.update(prediction.dict())
    await db.execute(
        "INSERT OR REPLACE INTO prediction (key, "
        + ", ".join(prediction_cols)
        + ") "
        "VALUES(?, " + ", ".join("?" * len(prediction_cols)) + ")",
        [record.key] + list(prediction_data.values()),
    )
async def test_update(self):
    # POST a record to the source's update endpoint, then read it back
    # through the source context to confirm the feature was stored.
    key = "1"
    payload = Record(key, data={"features": {"by_ten": 10}}).export()
    url = f"/source/{self.slabel}/update/{key}"
    async with self.post(url, json=payload) as r:
        self.assertEqual(await r.json(), OK)
    stored = await self.sctx.record(key)
    self.assertEqual(stored.feature("by_ten"), 10)
async def test_ini(self):
    # Save two records to an INI file and load them back. The expected
    # feature names are lower case even though they were saved upper case.
    with TemporaryDirectory() as tmpdir:
        self.testfile = os.path.join(tmpdir, "testfile.ini")
        # Create a source
        source = INISource(
            filename=self.testfile, allowempty=True, readwrite=True
        )
        # Save some data in the source
        first = Record("section1", data={"features": {"A": 1, "B": 2}})
        second = Record("section2", data={"features": {"C": 3, "D": 4}})
        await save(source, first, second)
        # Load all the records
        records = []
        async for loaded in load(source):
            records.append(loaded)
        self.assertIsInstance(records, list)
        self.assertEqual(len(records), 2)
        self.assertDictEqual(records[0].features(), {"a": 1, "b": 2})
        self.assertDictEqual(records[1].features(), {"c": 3, "d": 4})
def setUpClass(cls):
    """
    Build train/test records from TRAIN_DATA and TEST_DATA, wrap them in
    memory sources, and create a CPU-only "bert-base-cased" QAModel that uses
    a temporary directory for both output and logs.
    """

    def split_columns(rows):
        # Transpose rows into exactly six columns (ValueError otherwise)
        title, context, question, answer_text, start, impossible = list(
            zip(*rows)
        )
        return title, context, question, answer_text, start, impossible

    def build_records(columns):
        # One Record per row, keyed by its position in the data set
        return [
            Record(
                str(index),
                data={
                    "features": {
                        "title": title,
                        "context": context,
                        "question": question,
                        "answer_text": answer_text,
                        "start_pos_char": start,
                        "is_impossible": impossible,
                        "answers": [],
                    }
                },
            )
            for index, (
                title,
                context,
                question,
                answer_text,
                start,
                impossible,
            ) in enumerate(zip(*columns))
        ]

    train_columns = split_columns(TRAIN_DATA)
    test_columns = split_columns(TEST_DATA)
    cls.train_records = build_records(train_columns)
    cls.test_records = build_records(test_columns)
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.test_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.test_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = QAModelConfig(
        model_name_or_path="bert-base-cased",
        cache_dir=CACHE_DIR,
        directory=cls.model_dir.name,
        log_dir=cls.model_dir.name,
        model_type="bert",
        no_cuda=True,
    )
    cls.model = QAModel(model_config)
def setUp(self):
    """
    Create two fixture records: ``null`` with only a key, and ``full`` with
    feature data, an ``extra`` entry inside ``data``, and an ``extra`` keyword.
    """
    self.null = Record("null")
    full_data = {
        "features": {"dead": "beef"},
        "extra": {"extra": "read all about it"},
    }
    self.full = Record("full", data=full_data, extra={"half": True})
async def test_save_and_load(self):
    # Round-trip two records through a CSV source. After loading, the
    # prediction value is expected as a string while features stay ints.
    source = CSVSource(
        filename=self.save_and_load, allowempty=True, readwrite=True
    )

    def expected_export(key, a, b, value):
        # Exported form expected for one loaded record
        return {
            "key": key,
            "features": {"A": a, "B": b},
            "prediction": {"C": {"confidence": 1.0, "value": value}},
            "extra": {},
        }

    first = Record(
        "1",
        data={
            "features": {"A": 0, "B": 1},
            "prediction": {"C": {"value": 1, "confidence": 1.0}},
        },
    )
    second = Record(
        "2",
        data={
            "features": {"A": 3, "B": 4},
            "prediction": {"C": {"value": 2, "confidence": 1.0}},
        },
    )
    await save(source, first, second)
    # All records in source
    everything = []
    async for record in load(source):
        everything.append(record.export())
    self.assertEqual(
        everything,
        [expected_export("1", 0, 1, "1"), expected_export("2", 3, 4, "2")],
    )
    # For specific records in a source
    only_first = []
    async for record in load(source, "1"):
        only_first.append(record.export())
    self.assertEqual(only_first, [expected_export("1", 0, 1, "1")])
async def record(self, key: str):
    """
    Return the record stored under ``key``.

    The configured ``record`` query is executed with ``key`` as its only
    parameter. If a row comes back it is converted with ``row_to_record`` and
    merged into a blank record; otherwise the blank record is returned.
    """
    # Start blank so a missing key still yields a Record
    result = Record(key)
    # Query for the single row matching the key
    await self.conn.execute(self.parent.config.record, (key,))
    row = await self.conn.fetchone()
    # Populate the blank record only when a row was found
    if row is not None:
        converted = self.row_to_record(row)
        result.merge(converted)
    self.logger.debug("Got: %s: %r", result.key, result.export())
    return result
def setUpClass(cls):
    """
    Build training records (sentence id, words, NER tag) from TRAIN_DATA and
    prediction records (sentence id, words) from PREDICT_DATA, wrap each set
    in a memory source, and create a CPU-only "bert-base-cased" NERModel
    writing to a temporary directory.
    """
    sentence_ids, words, tags = list(zip(*TRAIN_DATA))
    predict_sentence_ids, predict_words = list(zip(*PREDICT_DATA))
    cls.train_records = [
        Record(
            str(index),
            data={
                "features": {
                    "sentence_id": sentence_id,
                    "words": word,
                    "ner_tag": tag,
                }
            },
        )
        for index, (sentence_id, word, tag) in enumerate(
            zip(sentence_ids, words, tags)
        )
    ]
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.predict_records = [
        Record(
            str(index),
            data={"features": {"sentence_id": sentence_id, "words": word}},
        )
        for index, (sentence_id, word) in enumerate(
            zip(predict_sentence_ids, predict_words)
        )
    ]
    cls.predict_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.predict_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = NERModelConfig(
        sid=Feature("sentence_id", int, 1),
        words=Feature("words", str, 1),
        predict=Feature("ner_tag", str, 1),
        output_dir=cls.model_dir.name,
        model_architecture_type="bert",
        model_name_or_path="bert-base-cased",
        no_cuda=True,
    )
    cls.model = NERModel(model_config)
async def test_02_predict(self):
    # Each record's key is the classification it should receive; exactly two
    # predictions must come back.
    starts_with_a = Record("a", data={"features": {self.feature.NAME: 1}})
    other = Record("not a", data={"features": {self.feature.NAME: 0}})
    memory_source = MemorySource(
        MemorySourceConfig(records=[starts_with_a, other])
    )
    async with Sources(memory_source) as sources, self.model as model:
        async with sources() as sctx, model() as mctx:
            seen = 0
            async for record, prediction, _confidence in mctx.predict(
                sctx.records()
            ):
                with self.subTest(record=record):
                    self.assertEqual(prediction, record.key)
                seen += 1
            self.assertEqual(seen, 2)
def setUpClass(cls):
    """
    Create an AnomalyModel in a temporary directory plus 1800 random records,
    split into 1400 for training and 400 for testing.

    Features A and B are drawn from a normal distribution (mean 2, std 1);
    the label Y is 1 when ``A > 1 - B`` and 0 otherwise.
    """
    # Temporary directory that holds the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    # NOTE(review): A/B are declared as int (B with length 2) while the
    # generated values are scalar floats -- confirm this is intended.
    cls.model = AnomalyModel(
        features=Features(Feature("A", int, 1), Feature("B", int, 2)),
        predict=Feature("Y", int, 1),
        directory=cls.model_dir.name,
    )
    # Generate the data: row 0 holds A values, row 1 holds B values
    sample_count = 1800
    samples = np.random.normal(2, 1, size=(2, sample_count))
    cls.records = []
    for a_value, b_value in zip(samples[0], samples[1]):
        cls.records.append(
            Record(
                "x" + str(random.random()),
                data={
                    "features": {
                        "A": float(a_value),
                        "B": float(b_value),
                        "Y": (a_value > 1 - b_value).astype(int),
                    }
                },
            )
        )
    train_split = 1400
    cls.trainingsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[:train_split]))
    )
    cls.testsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[train_split:]))
    )
def setUpClass(cls):
    """
    Build records from DATA (text feature "A", label "X"), wrap them in a
    memory source, and create a TextClassificationModel with three extra
    layers on top of the configured TF-Hub embedding.
    """
    cls.features = Features()
    cls.features.append(Feature("A", str, 1))
    texts, labels = list(zip(*DATA))
    cls.records = [
        Record(str(index), data={"features": {"A": text, "X": label}})
        for index, (text, label) in enumerate(zip(texts, labels))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    # NOTE(review): the second layer passes ``relu`` unquoted, unlike the
    # other two -- confirm the layer-string parser resolves that name.
    extra_layers = [
        "Dense(units = 120, activation='relu')",
        "Dense(units = 64, activation=relu)",
        "Dense(units = 2, activation='softmax')",
    ]
    model_config = TextClassifierConfig(
        directory=cls.model_dir.name,
        classifications=[0, 1],
        features=cls.features,
        predict=Feature("X", int, 1),
        add_layers=True,
        layers=extra_layers,
        model_path="https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1",
        epochs=30,
    )
    cls.model = TextClassificationModel(model_config)
async def input_set(self, record: Record) -> List[Input]:
    """
    Build the list of Inputs for ``record``.

    In order: one Input per configured feature (value taken from the record),
    one per statically configured ``(value, name)`` pair, then optionally the
    source length (when ``config.length`` is set) and the record key (when
    ``config.record_def`` is set).
    """
    config = self.parent.config
    inputs = []
    # Feature values read from the record itself
    for feature in config.features:
        inputs.append(
            Input(
                value=record.feature(feature.name),
                definition=Definition(
                    name=feature.name,
                    primitive=str(feature.dtype()),
                ),
            )
        )
    # Static inputs, typed by the dataflow's own definitions
    for value, name in config.inputs:
        inputs.append(
            Input(
                value=value,
                definition=config.dataflow.definitions[name],
            )
        )
    # Optional: length reported by the source context
    if config.length:
        inputs.append(
            Input(
                value=await self.sctx.length(),
                definition=Definition(name=config.length, primitive="int"),
            )
        )
    # Optional: the record's key
    if config.record_def:
        inputs.append(
            Input(
                value=record.key,
                definition=Definition(
                    name=config.record_def, primitive="string"
                ),
            )
        )
    return inputs
def setUpClass(cls):
    """
    Create a DNNRegressionModel in a temporary directory and 2000 random
    records whose TARGET is ``2 * feature_1 + 3 * feature_2``.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.feature1 = Feature("feature_1", float, 1)
    cls.feature2 = Feature("feature_2", float, 1)
    cls.features = Features(cls.feature1, cls.feature2)
    model_config = DNNRegressionModelConfig(
        directory=cls.model_dir.name,
        steps=1000,
        epochs=40,
        hidden=[50, 20, 10],
        predict=Feature("TARGET", float, 1),
        features=cls.features,
    )
    cls.model = DNNRegressionModel(model_config)
    # Generating data f(x1,x2) = 2*x1 + 3*x2
    sample_count = 2000
    samples = np.random.rand(2, sample_count)
    cls.records = []
    for x1, x2 in zip(samples[0], samples[1]):
        cls.records.append(
            Record(
                "x" + str(random.random()),
                data={
                    "features": {
                        cls.feature1.name: float(x1),
                        cls.feature2.name: float(x2),
                        "TARGET": 2 * x1 + 3 * x2,
                    }
                },
            )
        )
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
async def setUp(self):
    """
    Write ten randomly keyed records to a temporary JSON source, check the
    file really contains them, then patch the ``load`` entry points of Model,
    OperationImplementation and Operation for the duration of the test.
    """
    await super().setUp()
    self.records = [Record(str(random.random())) for _ in range(10)]
    self.temp_filename = self.mktempfile()
    self.sconfig = FileSourceConfig(
        filename=self.temp_filename, readwrite=True, allowempty=True
    )
    async with JSONSource(self.sconfig) as source:
        async with source() as sctx:
            for record in self.records:
                await sctx.update(record)
    raw_text = Path(self.sconfig.filename).read_text()
    contents = json.loads(raw_text)
    # Ensure there are records in the file
    self.assertEqual(
        len(contents.get(self.sconfig.tag)),
        len(self.records),
        "RecordsTestCase JSON file erroneously initialized as empty",
    )
    # TODO(p3) For some reason patching Model.load doesn't work
    replacements = (
        ("dffml.model.model.Model.load", model_load),
        ("dffml.df.base.OperationImplementation.load", opimp_load),
        ("dffml.df.types.Operation.load", op_load),
    )
    for target, replacement in replacements:
        self._stack.enter_context(patch(target, new=replacement))
async def test_02_predict(self):
    # Predict one record and require the relative error against the known
    # target function f(x1, x2) = 2*x1 + 3*x2 to stay below 30%.
    test_feature_val = [
        0,
        1.5,
        2,
    ]  # inserting zero so that its 1-indexable
    # should be same function used in TestDNN.setupclass
    test_target = 2 * test_feature_val[1] + 3 * test_feature_val[2]
    a = Record(
        "a",
        data={
            "features": {
                self.feature1.name: test_feature_val[1],
                self.feature2.name: test_feature_val[2],
            }
        },
    )
    async with Sources(
        MemorySource(MemorySourceConfig(records=[a]))
    ) as sources, self.model as model:
        target_name = model.config.predict.name
        async with sources() as sctx, model() as mctx:
            res = [record async for record in mctx.predict(sctx.records())]
            self.assertEqual(len(res), 1)
        self.assertEqual(res[0].key, a.key)
        predicted = res[0].prediction(target_name).value
        # The epsilon guards the denominator. It used to be added to the
        # ratio itself, which biased the error and could shrink it when the
        # model over-predicted.
        test_error_norm = abs((test_target - predicted) / (test_target + 1e-6))
        error_threshold = 0.3
        self.assertLess(test_error_norm, error_threshold)
async def model_predict(self, request, mctx):
    """
    Run prediction on the records in the JSON request body.

    The body is expected to map record keys to record data. Only a
    ``chunk_size`` of 0 is supported; any other value returns a 400 response.
    The response maps each predicted record's key to its exported form, with
    ``iterkey`` always None.
    """
    # TODO Provide an iterkey method for model prediction
    chunk_size = int(request.match_info["chunk_size"])
    if chunk_size != 0:
        return web.json_response(
            {"error": "Multiple request iteration not yet supported"},
            status=HTTPStatus.BAD_REQUEST,
        )
    # Create a source which will provide the records
    async with Sources(
        MemorySource(
            records=[
                Record(key, data=record_data)
                for key, record_data in (await request.json()).items()
            ]
        )
    ) as source:
        async with source() as sctx:
            # Feed them through prediction
            return web.json_response(
                {
                    "iterkey": None,
                    "records": {
                        record.key: record.export()
                        async for record in mctx.predict(sctx)
                    },
                }
            )
def setUpClass(cls):
    """
    Create an XGBRegressorModel in a temporary directory plus 2000 random
    records whose Target is ``2 * Feature1 + 3 * Feature2``, split into 1800
    for training and 200 for testing.
    """
    # Create a temporary directory to store the trained model
    cls.model_dir = tempfile.TemporaryDirectory()
    # Create an instance of the model. Feature2 is declared explicitly as a
    # single float, matching Feature1 and the generated data, rather than
    # relying on Feature's defaults.
    cls.model = XGBRegressorModel(
        XGBRegressorModelConfig(
            features=Features(
                Feature("Feature1", float, 1),
                Feature("Feature2", float, 1),
            ),
            predict=Feature("Target", float, 1),
            directory=cls.model_dir.name,
        )
    )
    # Generating data f(x1,x2) = 2*x1 + 3*x2
    _n_data = 2000
    _temp_data = np.random.rand(2, _n_data)
    cls.records = [
        Record(
            "x" + str(random.random()),
            data={
                "features": {
                    "Feature1": float(_temp_data[0][i]),
                    "Feature2": float(_temp_data[1][i]),
                    "Target": 2 * _temp_data[0][i] + 3 * _temp_data[1][i],
                }
            },
        )
        for i in range(_n_data)
    ]
    cls.trainingsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[:1800]))
    )
    cls.testsource = Sources(
        MemorySource(MemorySourceConfig(records=cls.records[1800:]))
    )
async def model_predict(self, request, mctx):
    """
    Run prediction on the records in the JSON request body.

    The body is expected to map record keys to record data. Only a
    ``chunk_size`` of 0 is supported; any other value returns a 400 response.
    The response maps each predicted record's key to its exported form, with
    ``iterkey`` always None.
    """
    # TODO Provide an iterkey method for model prediction
    chunk_size = int(request.match_info["chunk_size"])
    if chunk_size != 0:
        return web.json_response(
            {"error": "Multiple request iteration not yet supported"},
            status=HTTPStatus.BAD_REQUEST,
        )
    # Get the records
    payload = await request.json()
    records: Dict[str, Record] = {}
    for key, record_data in payload.items():
        records[key] = Record(key, data=record_data)

    # Create an async generator to feed records
    async def record_gen():
        for pending in records.values():
            yield pending

    # Feed them through prediction
    predicted = {}
    async for record in mctx.predict(record_gen()):
        predicted[record.key] = record.export()
    return web.json_response({"iterkey": None, "records": predicted})
def setUpClass(cls):
    """
    Build records from the nine-column DATA (features A-H plus target X),
    wrap them in a memory source, and create a VWModel for regression along
    with a mean-squared-error scorer.
    """
    cls.model_dir = tempfile.TemporaryDirectory()
    cls.features = Features()
    # A-F are floats, G and H are ints; all have length 1
    feature_specs = (
        ("A", float),
        ("B", float),
        ("C", float),
        ("D", float),
        ("E", float),
        ("F", float),
        ("G", int),
        ("H", int),
    )
    for feature_name, dtype in feature_specs:
        cls.features.append(Feature(feature_name, dtype, 1))
    # Transpose DATA into exactly nine columns (ValueError otherwise)
    A, B, C, D, E, F, G, H, X = list(zip(*DATA))
    column_names = ("A", "B", "C", "D", "E", "F", "G", "H", "X")
    cls.records = [
        Record(str(index), data={"features": dict(zip(column_names, row))})
        for index, row in enumerate(zip(A, B, C, D, E, F, G, H, X))
    ]
    cls.sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.records))
    )
    model_config = VWConfig(
        location=cls.model_dir.name,
        features=cls.features,
        predict=Feature("X", float, 1),
        # A and B will be namespace n1
        # A and C will be in namespace n2
        namespace=["n1_A_B", "n2_A_C"],
        importance=Feature("H", int, 1),
        tag=Feature("G", int, 1),
        task="regression",
        vwcmd=[
            "l2",
            "0.1",
            "loss_function",
            "squared",
            "passes",
            "10",
        ],
    )
    cls.model = VWModel(model_config)
    cls.scorer = MeanSquaredErrorAccuracy()
async def test_02_predict(self):
    # Predict a single record; exactly one result with the same key and a
    # truthy predicted value must come back.
    sample = Record("a", data={"features": {self.feature.NAME: 1}})
    memory_source = MemorySource(MemorySourceConfig(records=[sample]))
    async with Sources(memory_source) as sources, self.model as model:
        target_name = model.config.predict.NAME
        async with sources() as sctx, model() as mctx:
            res = []
            async for record in mctx.predict(sctx.records()):
                res.append(record)
            self.assertEqual(len(res), 1)
        predicted = res[0]
        self.assertEqual(predicted.key, sample.key)
        self.assertTrue(predicted.prediction(target_name).value)
def setUpClass(cls):
    """
    Build train/test records (sentence plus entities) from TRAIN_DATA and
    TEST_DATA, wrap them in memory sources, and create a SpacyNERModel based
    on "en_core_web_sm" together with its accuracy scorer.
    """
    train_sentences, train_entities = list(zip(*TRAIN_DATA))
    test_sentences, test_entities = list(zip(*TEST_DATA))

    def build_records(sentences, entities):
        # One Record per (sentence, entities) pair, keyed by position
        return [
            Record(
                str(index),
                data={"features": {"sentence": sentence, "entities": found}},
            )
            for index, (sentence, found) in enumerate(
                zip(sentences, entities)
            )
        ]

    cls.train_records = build_records(train_sentences, train_entities)
    cls.test_records = build_records(test_sentences, test_entities)
    cls.train_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.train_records))
    )
    cls.test_sources = Sources(
        MemorySource(MemorySourceConfig(records=cls.test_records))
    )
    cls.model_dir = tempfile.TemporaryDirectory()
    model_config = SpacyNERModelConfig(
        model_name="en_core_web_sm",
        location=cls.model_dir.name,
        n_iter=10,
        dropout=0.4,
    )
    cls.model = SpacyNERModel(model_config)
    cls.scorer = SpacyNerAccuracy()
async def test_predict(self):
    # Send every record of the source to the predict endpoint. Results must
    # come back in key order, with Salary equal to ten times "by_ten" and a
    # confidence equal to the numeric key.
    payload: Dict[str, dict] = {}
    async for record in self.sctx.records():
        payload[record.key] = record.export()
    url = f"/model/{self.mlabel}/predict/0"
    async with self.post(url, json=payload) as r:
        response = await r.json()
        seen = 0
        for position, (key, record_data) in enumerate(
            response["records"].items()
        ):
            predicted = Record(key, data=record_data)
            self.assertEqual(int(predicted.key), position)
            self.assertEqual(
                predicted.feature("by_ten"),
                predicted.prediction("Salary").value / 10,
            )
            self.assertEqual(
                float(predicted.key),
                predicted.prediction("Salary").confidence,
            )
            seen = position + 1
        self.assertEqual(seen, self.num_records)
async def update(self, record: Record):
    """
    Insert ``record`` into ``ml_data`` as a JSON document, or overwrite the
    stored JSON when a row with the same key already exists.
    """
    import logging

    db = self.conn
    # Just dump it. If you want to set up the queries easily, you need to
    # massage the columns in this table to your liking, and perhaps add more
    # tables.
    marshall = json.dumps(record.dict())
    # `key` is a reserved word in MySQL, so it must be quoted with backticks
    # when used as a column name.
    await db.execute(
        "INSERT INTO ml_data (`key`, json) VALUES(%s, %s) "
        "ON DUPLICATE KEY UPDATE json = %s",
        (record.key, marshall, marshall),
    )
    self.logger.debug("updated: %s", marshall)
    # Reading the record back costs another query; only pay for it when the
    # result will actually be logged.
    if self.logger.isEnabledFor(logging.DEBUG):
        self.logger.debug("update: %s", await self.record(record.key))
def setUpClass(self):
    """
    Build four records from the first four entries of A, B, C and D (Years,
    Expertise, Trust, Salary) and wrap them in a memory source.
    """
    built = []
    for index in range(4):
        feature_values = {
            "Years": A[index],
            "Expertise": B[index],
            "Trust": C[index],
            "Salary": D[index],
        }
        built.append(Record(str(index), data={"features": feature_values}))
    self.records = built
    self.source = Sources(
        MemorySource(MemorySourceConfig(records=self.records))
    )