def dedupe_pairs(self, deduper, pairs=6):
    """Training pairs for labelling.

    Selects a number of pairs for active labelling in the training phase.

    Arguments:
        deduper {Dedupe} -- Dedupe instance prepared with fields.

    Keyword Arguments:
        pairs {number} -- Number of labelling pairs to return (default: {6})

    Returns:
        list -- List of pairs to be matched.
    """
    logger.debug("Retrieving pairs for active labelling")
    try:
        training_pairs = [{
            "pair_id": i,
            "records": {
                "record": m1,
                "match": m2
            }
        } for i in range(pairs) for m1, m2 in deduper.uncertainPairs()]
        return training_pairs
    except IndexError as e:
        logger.error(
            "Unable to retrieve training pairs with the following error: %s",
            e)
        return []
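
# Each entry pairs one candidate record with its most uncertain potential
# match. Illustrative shape of the returned list (field names and values
# here are examples, not the project's schema):
#
#   [{"pair_id": 0,
#     "records": {"record": {"name": "j smith", "city": "london"},
#                 "match": {"name": "john smith", "city": "london"}}},
#    ...]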
async def training_get(request):
    """Training endpoint for retrieving sample data.

    ---
    description: Supplies data used to train the duple data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "200":
            description: Successful operation. Return unlabelled training records.
    parameters:
    - in: header
      name: clientId
      schema:
        type: string
        format: uuid
      required: true
    """
    logger.debug("Training data request received")
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)
    if datastore.training_rounds <= 4:
        training_data = await datastore.get_pairs()
        return web.json_response(training_data)
    else:
        return web.json_response([])
def data_prep(df):
    """Prepare data and dictionary for deduplication.

    Cleans the input DataFrame and builds the record dictionary that
    dedupe expects, keyed by DataFrame index.

    Arguments:
        df {DataFrame} -- Raw input data.

    Returns:
        DataFrame, dict -- The cleaned DataFrame and the record dictionary.
    """
    logger.debug("Preparing data and dictionary for deduplication")
    df = clean_df(df)
    df["dictionary"] = df.to_dict("records")
    data_d = dict(zip(df.index, df.dictionary))
    return df, data_d
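
# `clean_df` is defined elsewhere in the project. A minimal sketch of the
# kind of normalisation dedupe benefits from is shown below; the name and
# exact rules are assumptions, not the project's implementation.
def _clean_df_sketch(df):
    df = df.copy()
    for col in df.select_dtypes(include="object").columns:
        # Normalise string columns so trivially different values compare equal.
        df[col] = df[col].astype(str).str.strip().str.lower()
        # Map empties (and stringified NaN) to None so dedupe treats them
        # as missing rather than as matchable values.
        df[col] = df[col].replace({"": None, "nan": None})
    return df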
def dedupe_deduplicate(self, deduper, df, recall_weight=1):
    """Process DataFrame with clustered duplicates.

    Classifies the input DataFrame and assigns clusters to expected
    duplicate records along with confidence intervals.

    Arguments:
        deduper {Dedupe} -- Trained Dedupe instance
        df {DataFrame} -- DataFrame to deduplicate

    Keyword Arguments:
        recall_weight {number} -- Weighting for precision vs. recall (default: {1})

    Returns:
        DataFrame, number -- The results DataFrame and the number of duplicates
    """
    logger.debug("Preparing deduper results")
    df, data_d = data_prep(df)
    try:
        threshold = deduper.threshold(data_d, recall_weight=recall_weight)
        clustered_df, duplicates = data_cluster(deduper, data_d, threshold)
        results = df.join(clustered_df, how="left")
        results.drop(["dictionary"], axis=1, inplace=True)
        return results, duplicates
    except Exception as e:
        logger.error(
            "Unable to deduplicate data with the following error: %s", e)
        return pd.DataFrame(), 0
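
# recall_weight steers the threshold dedupe picks: per dedupe's docs, a
# value of 2 means you care twice as much about recall as precision (more
# duplicates found, more false positives risked); values below 1 favour
# precision. Illustrative call (`matcher` is a hypothetical instance of
# the class these methods belong to):
#
#   results, n_dupes = matcher.dedupe_deduplicate(deduper, df, recall_weight=2)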
def dedupe_train(self, deduper, client_id):
    """Perform training on Dedupe with labelled examples.

    Finalises the training phase after providing labelled examples.

    Arguments:
        deduper {Dedupe} -- Dedupe instance trained with labelled examples
        client_id {str} -- Unique client identifier
    """
    logger.debug("Finalising deduper training")
    deduper.train()
    logger.debug("Writing training files to disk")
    training_file = os.path.join("training-data", client_id,
                                 "dedupe_training.json")
    os.makedirs(os.path.dirname(training_file), exist_ok=True)
    settings_file = os.path.join("training-data", client_id,
                                 "dedupe_learned_settings")
    os.makedirs(os.path.dirname(settings_file), exist_ok=True)
    with open(training_file, "w") as tf:
        deduper.writeTraining(tf)
    with open(settings_file, "wb") as sf:
        deduper.writeSettings(sf)
def dedupe_mark(self, deduper, labelled_pairs):
    """Mark labelled pairs.

    Marks labelled pairs with the Dedupe instance for reinforcement
    training.

    Arguments:
        deduper {Dedupe} -- Dedupe instance
        labelled_pairs {list} -- List of pairs labelled as distinct or a match
    """
    logger.debug("Marking labelled training pairs")
    deduper.markPairs(labelled_pairs)
def data_cluster(deduper, data_dict, threshold):
    """Cluster potential duplicates and flatten the result.

    Arguments:
        deduper {Dedupe} -- Trained Dedupe instance
        data_dict {dict} -- Record dictionary produced by data_prep
        threshold {float} -- Match score threshold

    Returns:
        DataFrame, number -- One row per clustered record, and the number
        of duplicate clusters found.
    """
    logger.debug("Clustering data")
    duplicates = deduper.match(data_dict, threshold)
    logger.info("Duplicate records found: %d", len(duplicates))
    df_data = [
        {"id": record_id, "cluster_id": cluster_id, "confidence": score}
        for cluster_id, records in enumerate(duplicates)
        for record_id, score in zip(*records)
    ]
    clustered_df = pd.DataFrame(df_data)
    clustered_df = clustered_df.set_index("id")
    # Also return the cluster count: the caller in dedupe_deduplicate
    # unpacks two values.
    return clustered_df, len(duplicates)
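
# deduper.match returns one entry per cluster: a tuple of record ids and a
# parallel tuple of confidence scores, e.g. (illustrative values):
#
#   [((0, 3), (0.96, 0.96)),          # records 0 and 3 form cluster 0
#    ((1, 4, 7), (0.88, 0.91, 0.87))] # records 1, 4 and 7 form cluster 1
#
# data_cluster flattens this into one DataFrame row per record, indexed by
# record id, so it can be joined back onto the original DataFrame.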
def dedupe_prep(self, fields):
    """Prepare a Dedupe instance.

    Prepares a dedupe instance based on the input data fields.

    Arguments:
        fields {list} -- List of fields representing the input data columns.

    Returns:
        Dedupe -- New Deduper for further training
    """
    logger.debug("Preparing deduper training phase %s", fields)
    fields = select_fields(fields)
    return dedupe.Dedupe(fields)
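
# `select_fields` is defined elsewhere in the project. A plausible sketch,
# assuming every column is treated as an optional String variable in
# dedupe's field-definition format (the real mapping may assign richer
# types per field):
def _select_fields_sketch(field_names):
    return [{"field": name, "type": "String", "has missing": True}
            for name in field_names]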
async def existing(request):
    """File upload endpoint using existing model.

    ---
    description: Receives data for deduplication.
    tags:
    - Upload
    produces:
    - application/json
    responses:
        "200":
            description: Successful operation. Return confirmation response.
    parameters:
    - in: header
      name: clientId
      schema:
        type: string
        format: uuid
      required: true
    """
    client_id = request.headers.get("clientId")
    logger.debug("Receiving data for classification")
    reader = await request.multipart()
    field = await reader.next()
    filepath = os.path.join("profile-data/", client_id, field.filename)
    size = 0
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "wb") as f:
        while True:
            chunk = await field.read_chunk()
            if not chunk:
                break
            size += len(chunk)
            f.write(chunk)
    message = message_wrapper(client_id, {
        "use_model": True,
        "filepath": filepath
    })
    await app["message_queue"].put(message)
    return web.json_response({"received": field.filename, "size": size})
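
# Client-side sketch of uploading a file to this endpoint. The mount path
# "/upload/existing" and the host/port are assumptions; routes are
# registered elsewhere in the project:
#
#   import aiohttp
#
#   async def upload(path, client_id):
#       data = aiohttp.FormData()
#       data.add_field("file", open(path, "rb"),
#                      filename=os.path.basename(path))
#       async with aiohttp.ClientSession() as session:
#           async with session.post("http://localhost:8080/upload/existing",
#                                   data=data,
#                                   headers={"clientId": client_id}) as resp:
#               return await resp.json()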
def dedupe_sample(self, deduper, df, sample_size=0.3):
    """Sample supplied DataFrame for training.

    Selects a representative sample from the input DataFrame to be used
    in the training phase.

    Arguments:
        deduper {Dedupe} -- Active Deduper instance
        df {DataFrame} -- DataFrame containing training + classification data

    Keyword Arguments:
        sample_size {number} -- Sample size used for training (default: {0.3})
    """
    df, data_dict = data_prep(df)
    logger.debug("Getting candidate training matches")
    if len(data_dict) < 500:
        # Small dataset: sample every record.
        sample_num = len(data_dict)
    elif len(data_dict) > 5000:
        sample_num = math.floor(len(data_dict) * sample_size)
    else:
        logger.debug("Mid-sized dataset, decreasing sample fraction")
        sample_num = math.floor(len(data_dict) * 0.2)
    logger.debug("Requesting data sample of %s records", sample_num)
    deduper.sample(data_dict, sample_num)
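
# End to end, the active-learning loop these methods support looks roughly
# like the sketch below. The receiver name `matcher` and the single-pass
# flow are illustrative; in duple the labelling round-trips happen over the
# training endpoints rather than in one function:
#
#   deduper = matcher.dedupe_prep(fields)
#   matcher.dedupe_sample(deduper, df)
#   pairs = matcher.dedupe_pairs(deduper)          # sent to the client to label
#   matcher.dedupe_mark(deduper, labelled_pairs)   # labels returned by the client
#   matcher.dedupe_train(deduper, client_id)       # finalise and persist
#   results, n_dupes = matcher.dedupe_deduplicate(deduper, df)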
async def training_post(request):
    """Training endpoint for submitting labelled data.

    ---
    description: Accepts labelled data for training the data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "201":
            description: Successful operation. Return further training records.
        "400":
            description: Unsuccessful operation. Labelled data not supplied.
    parameters:
    - in: header
      name: clientId
      schema:
        type: string
        format: uuid
      required: true
    - in: body
      name: body
      description: Labelled training data
      required: true
      schema:
        type: object
        properties:
          match:
            type: array
            items:
              type: array
              items:
              - $ref: '#/definitions/Person'
              - $ref: '#/definitions/Person'
          distinct:
            type: array
            items:
              type: array
              items:
              - $ref: '#/definitions/Person'
              - $ref: '#/definitions/Person'
    """
    if request.body_exists and request.can_read_body:
        logger.debug("Labelled training data received.")
        client_id = request.headers.get("clientId")
        datastore = get_datastore(client_id)
        if datastore.training_rounds < 4:
            logger.info("Updating training pairs for labelling")
            labelled_pairs = await request.json()
            message = message_wrapper(client_id,
                                      {"labeled_pairs": labelled_pairs})
            await app["message_queue"].put(message)
        else:
            message = message_wrapper(client_id, {"labeling_complete": True})
            await app["message_queue"].put(message)
        return web.Response(status=201)
    return web.Response(status=400)
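
# The labelled payload mirrors dedupe's training format: "match" and
# "distinct" each hold a list of record pairs (illustrative field values):
#
#   {"match":    [[{"name": "j smith"}, {"name": "john smith"}]],
#    "distinct": [[{"name": "j smith"}, {"name": "jane smyth"}]]}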