async def producer(queue, message):
    """Process messages and schedule worker work items."""
    logger.info(
        "Scheduling [%s] work item for client [%s] message [%s]",
        message.message_type.name,
        message.client_id,
        message.message_id,
    )
    if message.message_type == MessageType.NEW and message.client_id:
        if message.data.get("use_model"):
            message.message_type = MessageType.MODEL
            asyncio.create_task(queue.put(message))
        elif message.data.get("filepath"):
            message.message_type = MessageType.SAMPLE
            asyncio.create_task(queue.put(message))
        elif message.data.get("labeled_pairs"):
            message.message_type = MessageType.LABEL
            asyncio.create_task(queue.put(message))
        elif message.data.get("labeling_complete"):
            message.message_type = MessageType.TRAIN
            asyncio.create_task(queue.put(message))
        elif message.data.get("training_complete"):
            message.message_type = MessageType.DEDUPE
            asyncio.create_task(queue.put(message))
        else:
            logger.error(
                "Unable to schedule message (%s). Unknown message data %s",
                message.message_id,
                message.data,
            )
    else:
        logger.error("Unable to schedule invalid message %s", message)
async def consumer(queue):
    """Consume scheduled work items."""
    while True:
        message = await queue.get()
        datastore = ds.get_datastore(message.client_id)
        logger.info(
            "Processing [%s] work item for client_id [%s] message_id [%s]",
            message.message_type.name,
            message.client_id,
            message.message_id,
        )
        if message.message_type == MessageType.MODEL:
            await datastore.model(message.data.get("filepath"), message.client_id)
            await datastore.dedupe()
            queue.task_done()
        elif message.message_type == MessageType.SAMPLE:
            await datastore.sample(message.data.get("filepath"))
            queue.task_done()
        elif message.message_type == MessageType.LABEL:
            await datastore.pairs(message.data.get("labeled_pairs"))
            queue.task_done()
        elif message.message_type == MessageType.TRAIN:
            await datastore.train(message.client_id)
            queue.task_done()
        elif message.message_type == MessageType.DEDUPE:
            await datastore.dedupe()
            queue.task_done()
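# Usage sketch (illustrative only, not part of the service): wires producer()
# and consumer() together on a shared asyncio.Queue. SimpleNamespace stands in
# for the real message class, which these functions only touch by attribute.
async def _example_pipeline():
    from types import SimpleNamespace

    queue = asyncio.Queue()
    worker = asyncio.create_task(consumer(queue))
    message = SimpleNamespace(
        client_id="client-1",
        message_id="msg-1",
        message_type=MessageType.NEW,
        data={"filepath": "people.csv"},
    )
    await producer(queue, message)  # rewritten to SAMPLE and enqueued
    await queue.join()  # returns once the consumer calls task_done()
    worker.cancel()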
def clean_df(df):
    """Normalise a dataframe to lowercase ASCII and strip placeholder values."""
    logger.info("Cleaning dataframe")
    df = df.astype(str)
    df = df.applymap(unidecode)  # transliterate accented characters to ASCII
    df = df.applymap(str.lower)
    df = df.replace({"nan": "", "none": "", "nat": ""})  # stringified pandas nulls
    for col in df.columns:
        # keep only alphanumerics, slashes and hyphens
        df[col] = df[col].str.replace(r"[^a-zA-Z0-9\/-]", "", regex=True)
    return df
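# Quick check of clean_df (illustrative): accents are transliterated, case is
# folded, punctuation is stripped, and stringified nulls become empty strings.
def _example_clean():
    raw = pd.DataFrame({"name": ["Renée!", None]})
    print(clean_df(raw)["name"].tolist())  # -> ['renee', '']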
def select_fields(fields):
    """Map column names (or (name, type) pairs) to dedupe field definitions."""
    logger.info("Generating data field mappings")

    def gen_field(field):
        if isinstance(field, str):
            return {"field": field, "type": "String"}
        if len(field) == 2:
            return {"field": field[0], "type": field[1]}
        raise ValueError(f"Unsupported field specification: {field!r}")

    return [gen_field(field) for field in fields]
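# Example of the mapping (illustrative): bare strings default to dedupe's
# "String" type, while (name, type) pairs pass the variable type through.
#
#   select_fields(["name", ("dob", "DateTime")])
#   # -> [{"field": "name", "type": "String"},
#   #     {"field": "dob", "type": "DateTime"}]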
def route_cors(host, app):
    """Add CORS to all routes for the supplied host."""
    logger.info("Configuring CORS")
    cors = aiohttp_cors.setup(
        app,
        defaults={
            host: aiohttp_cors.ResourceOptions(
                allow_credentials=True,
                expose_headers="*",
                allow_headers="*",
            )
        },
    )
    for route in list(app.router.routes()):
        cors.add(route)
def data_cluster(deduper, data_dict, threshold):
    logger.debug("Clustering data")
    duplicates = deduper.match(data_dict, threshold)
    logger.info("Duplicate records found: %d", len(duplicates))
    df_data = [
        {"id": record_id, "cluster_id": cluster_id, "confidence": score}
        for cluster_id, records in enumerate(duplicates)
        for record_id, score in zip(*records)
    ]
    clustered_df = pd.DataFrame(df_data)
    clustered_df = clustered_df.set_index("id")
    return clustered_df
def deduplicate(
    df,
    recall_weight=1,
    sample_size=0.3,
    settings_file="training-data/dedupe_learned_settings",
    training_file="training-data/dedupe_training.json",
):
    """Deduplicate a dataframe, training a dedupe model if no settings exist."""
    fields = df.columns
    df, data_d = data_prep(df)

    if os.path.exists(settings_file):
        logger.info("Existing settings found. Loading from: %s", settings_file)
        with open(settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        fields = select_fields(fields)
        deduper = dedupe.Dedupe(fields)

        sample_num = math.floor(len(data_d) * sample_size)
        logger.info("Extracting data sample of %s records", sample_num)
        deduper.sample(data_d, sample_num)

        if os.path.exists(training_file):
            logger.info("Reading training examples from: %s", training_file)
            with open(training_file, "rb") as f:
                deduper.readTraining(f)

        logger.info("Starting active labeling")
        dedupe.consoleLabel(deduper)
        deduper.train()

        with open(training_file, "w") as tf:
            deduper.writeTraining(tf)
        with open(settings_file, "wb") as sf:
            deduper.writeSettings(sf)

    threshold = deduper.threshold(data_d, recall_weight=recall_weight)
    clustered_df = data_cluster(deduper, data_d, threshold)
    results = df.join(clustered_df, how="left")
    results.drop(["dictionary"], axis=1, inplace=True)
    return results
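# End-to-end sketch (illustrative; "people.csv" is a hypothetical input): the
# first run starts dedupe's console labeling session and persists the learned
# settings, so later runs load them and skip labeling entirely.
def _example_dedupe():
    frame = pd.read_csv("people.csv")
    results = deduplicate(frame, recall_weight=1.5)  # weight recall over precision
    print(results[["cluster_id", "confidence"]].head())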
async def on_shutdown(app, signal=None):
    """Cleanup tasks tied to the service's shutdown."""
    if signal:
        logger.info("Received exit signal, shutting down.")
    tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
    logger.info("Cancelling %d outstanding tasks", len(tasks))
    for task in tasks:
        task.cancel()
    await asyncio.gather(*tasks, return_exceptions=True)
    logger.info("Stopping")
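# Registration sketch (assumption: create_app() builds the aiohttp app before
# web.run_app is called). aiohttp invokes each on_shutdown callback with the
# app as its only argument, so the signal parameter stays at its None default:
#
#   app.on_shutdown.append(on_shutdown)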
def run():
    """Duple service runner."""
    logger.info("Starting duple.")
    web.run_app(create_app())
    logger.info("Duple successfully shut down.")
async def training_post(request):
    """Training endpoint for submitting labelled data.

    ---
    description: Accepts labeled data for training the data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "201":
            description: Successful operation. Return further training records.
        "400":
            description: Unsuccessful operation. Labelled data not supplied.
    parameters:
    - in: header
      name: clientId
      schema:
        type: string
        format: uuid
      required: true
    - in: body
      name: body
      description: Labelled training data
      required: true
      schema:
        type: object
        properties:
            match:
                type: array
                items:
                    type: array
                    items:
                    - $ref: '#/definitions/Person'
                    - $ref: '#/definitions/Person'
            distinct:
                type: array
                items:
                    type: array
                    items:
                    - $ref: '#/definitions/Person'
                    - $ref: '#/definitions/Person'
    """
    if request.body_exists and request.can_read_body:
        logger.debug("Labelled training data received.")
        client_id = request.headers.get("clientId")
        datastore = get_datastore(client_id)
        if datastore.training_rounds < 4:
            logger.info("Updating training pairs for labeling")
            labeled_pairs = await request.json()
            message = message_wrapper(client_id, {"labeled_pairs": labeled_pairs})
            await request.app["message_queue"].put(message)
        else:
            message = message_wrapper(client_id, {"labeling_complete": True})
            await request.app["message_queue"].put(message)
        return web.Response(status=201)
    return web.Response(status=400)
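# Example request payload (shape taken from the schema above; the Person
# fields shown are illustrative, not the actual definition):
#
#   {
#     "match": [
#       [{"name": "jane doe", "dob": "1984-02-01"},
#        {"name": "jane  doe", "dob": "1984-02-01"}]
#     ],
#     "distinct": [
#       [{"name": "jane doe", "dob": "1984-02-01"},
#        {"name": "john doe", "dob": "1979-11-23"}]
#     ]
#   }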
def console_deduplicate(filename):
    """Run the dedupe pipeline over a CSV file from the console."""
    logger.info("Starting console deduplicator")
    df = pd.read_csv(filename)
    result = deduplicate(df)
    logger.info("Writing results file to relateddata.csv")
    result.to_csv("relateddata.csv", index=False)