async def stats(request):
    """Return the classification statistics held for the calling client.
    ---
    description: Supplies information reagarding the records processed.
    tags:
    - Results
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return duple statistics.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
    """
    # The client is identified by the "clientId" request header.
    datastore = get_datastore(request.headers.get("clientId"))
    # Without a result there is nothing to report, so answer with an empty
    # JSON object instead of the stats.
    payload = datastore.stats if datastore.has_result else {}
    return web.json_response(payload)
async def training_get(request):
    """Return a batch of unlabeled record pairs for the client to label.
    ---
    description: Supplies data used to train the duple data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return unlabeled training records.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
    """
    logger.debug("Training data request recieved")
    datastore = get_datastore(request.headers.get("clientId"))
    # Pairs are only handed out while the round counter is at most 4;
    # after that the client receives an empty list.
    if datastore.training_rounds <= 4:
        return web.json_response(await datastore.get_pairs())
    return web.json_response([])
async def consumer(queue):
    """Consume scheduled work items.

    Runs forever: takes one message at a time from ``queue``, looks up the
    datastore for ``message.client_id`` and dispatches on
    ``message.message_type``:

    * ``MODEL``  -> ``datastore.model(filepath, client_id)`` then
      ``datastore.dedupe()``
    * ``SAMPLE`` -> ``datastore.sample(filepath)``
    * ``LABEL``  -> ``datastore.pairs(labeled_pairs)``
    * ``TRAIN``  -> ``datastore.train(client_id)``
    * ``DEDUPE`` -> ``datastore.dedupe()``

    ``queue.task_done()`` is called exactly once for every message taken
    from the queue, including messages of a type not listed above and
    messages whose handler raises. Otherwise such an item would never be
    acknowledged and ``queue.join()`` could block forever. Exceptions raised
    while processing a message still propagate to the caller.

    Args:
        queue: queue object providing awaitable ``get()`` and ``task_done()``.
    """
    while True:
        message = await queue.get()
        try:
            datastore = ds.get_datastore(message.client_id)
            logger.info(
                "Processing [%s] work item for client_id [%s] message_id [%s]",
                message.message_type.name,
                message.client_id,
                message.message_id,
            )
            message_type = message.message_type
            if message_type == MessageType.MODEL:
                await datastore.model(
                    message.data.get("filepath"), message.client_id)
                await datastore.dedupe()
            elif message_type == MessageType.SAMPLE:
                await datastore.sample(message.data.get("filepath"))
            elif message_type == MessageType.LABEL:
                await datastore.pairs(message.data.get("labeled_pairs"))
            elif message_type == MessageType.TRAIN:
                await datastore.train(message.client_id)
            elif message_type == MessageType.DEDUPE:
                await datastore.dedupe()
            else:
                logger.warning(
                    "No handler for message type [%s]; message_id [%s] skipped",
                    message_type,
                    message.message_id,
                )
        finally:
            # Acknowledge the item no matter how processing ended.
            queue.task_done()
def _clustered_results_response(datastore):
    """Serialise the clustered rows of ``datastore.result`` as a JSON response.

    Keeps only rows whose ``cluster_id`` is greater than zero, sorts them by
    ``cluster_id`` and serialises with ``to_json(orient="table")``.
    ``datastore.result`` is presumably a pandas DataFrame -- confirm against
    the datastore implementation.
    """
    result = datastore.result
    clustered = result[result.cluster_id > 0].sort_values("cluster_id")
    return web.Response(
        body=clustered.to_json(orient="table"),
        content_type="application/json",
    )


async def results(request):
    """Results endpoint for retrieving classified data.
    ---
    description: Supplies clustered data containing duplicates.
    tags:
    - Results
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return labeled results.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
    """
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)

    # Fast path: a non-empty result is already available.
    if datastore.has_result and datastore.result.size > 0:
        return _clustered_results_response(datastore)

    # No usable result yet. If the "training" status is set, queue a
    # training_complete message for the background consumer.
    if await datastore.get_status("training"):
        message = message_wrapper(client_id, {"training_complete": True})
        await app["message_queue"].put(message)

    # The "dedupe" status may be set and rows may have arrived meanwhile.
    if await datastore.get_status("dedupe") and datastore.result.size > 0:
        return _clustered_results_response(datastore)

    # Always answer with a response object. The earlier code re-checked the
    # datastore state here and could fall through and return None if that
    # state had changed during the awaits above.
    return web.json_response({})
async def results_file(request):
    """Results endpoint for retrieving classified data results file.
    ---
    description: Supplies a file containing duplicates found.
    tags:
    - Results
    produces:
    - text/csv
    responses:
        "200":
            description: successful operation. Return labeled results file.
        "503":
            description: no results are available for this client yet.
    parameters:
      - in: query
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
    """
    # Unlike the other endpoints, the client id is read from the query
    # string here, not from a header; the parameter spec above reflects that.
    client_id = request.rel_url.query.get("clientId")
    datastore = get_datastore(client_id)
    if not datastore.has_result:
        return web.Response(status=503)

    result = datastore.result
    clustered = result[result.cluster_id > 0].sort_values("cluster_id")
    # No target path is given, so to_csv returns the CSV text. A file-open
    # mode has no meaning without a file target and is therefore not passed.
    csv_text = clustered.to_csv(index=False)
    return web.Response(
        headers=MultiDict({
            "Content-Disposition": 'attachment; filename="relateddata.csv"'
        }),
        body=csv_text,
        # Match the media type advertised under "produces".
        content_type="text/csv",
    )
def test_delete_empty_datastores(
    modify_repository, existing_datastore, new_datastore
):
    """After deleting "test234", looking it up equals the new_datastore fixture."""
    client_id = "test234"
    delete_datastore(client_id)
    fetched = get_datastore(client_id)
    assert fetched == new_datastore
def test_get_existing_datastore(
    modify_repository, existing_datastore, new_datastore
):
    """Looking up "test234" yields the existing datastore, distinct from others."""
    client_id = "test234"

    # The lookup must not produce something equal to the new_datastore fixture.
    fetched = get_datastore(client_id)
    assert fetched != new_datastore

    # It must equal the existing_datastore fixture.
    fetched = get_datastore(client_id)
    assert fetched == existing_datastore

    # A different client id must not compare equal to it.
    other = get_datastore("test567")
    fetched = get_datastore(client_id)
    assert other != fetched
def test_get_new_datastore(new_datastore):
    """Looking up "test123" equals the new_datastore fixture."""
    fetched = get_datastore("test123")
    assert fetched == new_datastore
async def training_post(request):
    """Training endpoint for submitting labelled data.
    ---
    description: Accepts labeled data for training the data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "201":
            description: Successful operation. Labelled data accepted for processing.
        "400":
            description: Unsuccessful operation. Labelled data not supplied or not valid JSON.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
      - in: body
        name: body
        description: Labelled training data
        required: true
        schema:
          type: object
          properties:
            match:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'
            distinct:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'
    """
    # A request without a readable body cannot carry labelled data.
    if not (request.body_exists and request.can_read_body):
        return web.Response(status=400)

    logger.debug("Labelled training data recieved.")
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)
    if datastore.training_rounds < 4:
        logger.info("Updating traing pairs for labeling")
        try:
            labeled_pairs = await request.json()
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError: a body that is not valid JSON is a client error,
            # so answer 400 instead of letting it surface as a server error.
            logger.warning(
                "Rejecting labelled data for client_id [%s]: invalid JSON",
                client_id,
            )
            return web.Response(status=400)
        message = message_wrapper(client_id, {"labeled_pairs": labeled_pairs})
    else:
        # Enough rounds have been completed: signal that labelling is done
        # instead of forwarding the submitted pairs.
        message = message_wrapper(client_id, {"labeling_complete": True})
    await app["message_queue"].put(message)
    return web.Response(status=201)