Beispiel #1
0
    def dedupe_pairs(self, deduper, pairs=6):
        """Training pairs for labelling.

        Selects a number of pairs for active labelling in the training phase.

        Arguments:
            deduper {Dedupe} -- Dedupe instance prepared with fields.

        Keyword Arguments:
            pairs {number} -- Number of labeling pairs to return (default: {6})

        Returns:
            list -- List of pairs to be matched; empty list when no more
                uncertain pairs are available.

        """
        logger.debug("Retrieving pairs for active labeling")

        try:
            # uncertainPairs() may yield more than one candidate pair per
            # call; flatten all calls into one sequence and number that
            # sequence, so every entry gets a unique pair_id. (The original
            # numbered by call index, which produced duplicate pair_ids
            # whenever a single call returned multiple pairs.)
            candidates = (pair
                          for _ in range(pairs)
                          for pair in deduper.uncertainPairs())
            training_pairs = [{
                "pair_id": i,
                "records": {
                    "record": m1,
                    "match": m2
                }
            } for i, (m1, m2) in enumerate(candidates)]
            return training_pairs
        except IndexError as e:
            # dedupe raises IndexError when it runs out of uncertain pairs.
            logger.error(
                "Unable to retrieve training pairs with the following error: %s",
                e)
            return []
Beispiel #2
0
async def training_get(request):
    """Training endpoint for retrieving sample data.

    ---
    description: Supplies data used to train the duple data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return unlabeled training records.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true

    """
    logger.debug("Training data request received")
    client_id = request.headers.get("clientId")
    datastore = get_datastore(client_id)

    # Serve unlabeled pairs only while the client is still within its
    # labeling budget; afterwards return an empty list so the client
    # knows there is nothing left to label.
    # NOTE(review): training_post uses `< 4` while this uses `<= 4` —
    # presumably intentional (one final read after labeling closes), but
    # worth confirming.
    if datastore.training_rounds <= 4:
        training_data = await datastore.get_pairs()
        return web.json_response(training_data)
    else:
        return web.json_response([])
Beispiel #3
0
def data_prep(df):
    """Clean the input DataFrame and derive a record dictionary.

    Cleans *df* via ``clean_df``, stores each row's record dict in a
    ``dictionary`` column, and builds a mapping from DataFrame index to
    that record dict for use by dedupe.

    Returns the cleaned DataFrame and the index-keyed record mapping.
    """
    logger.debug("Preparing data and dictionary for deduplication")
    cleaned = clean_df(df)
    cleaned["dictionary"] = cleaned.to_dict("records")
    data_d = {idx: record for idx, record in zip(cleaned.index,
                                                 cleaned.dictionary)}
    return cleaned, data_d
Beispiel #4
0
    def dedupe_deduplicate(self, deduper, df, recall_weight=1):
        """Process DataFrame with clutered duplicates.

        Classifies input dataframe and assigns clusters to expected duplicate
        recrods along with confidence intervals.

        Arguments:
            deduper {Dedupe} -- Trained Dedupe instance used for matching.
            df {DataFrame} -- Input records to be deduplicated.

        Keyword Arguments:
            recall_weight {number} -- Weighting for precision vs. recall (default: {1})

        Returns:
            DataFrame, number -- The results DataFrame and the number of duplicates.
                On any failure, an empty DataFrame and 0.

        """
        logger.debug("Preparing deduper results")
        # data_prep both cleans df and builds the dict dedupe consumes;
        # the cleaned df (with its helper "dictionary" column) is kept so
        # cluster results can be joined back onto it below.
        df, data_d = data_prep(df)
        try:
            # Let dedupe pick a score threshold tuned by recall_weight.
            threshold = deduper.threshold(data_d, recall_weight=recall_weight)
            clustered_df, duplicates = data_cluster(deduper, data_d, threshold)
            # Left join keeps every input record, clustered or not.
            results = df.join(clustered_df, how="left")
            # Drop the helper column added by data_prep before returning.
            results.drop(["dictionary"], axis=1, inplace=True)

            return results, duplicates
        except Exception as e:
            # Deliberate best-effort: any failure is logged and an empty
            # result is returned rather than propagating to the caller.
            logger.error(
                "Unable to deduplicate data with the following error: %s", e)
            return pd.DataFrame(), 0
Beispiel #5
0
    def dedupe_train(self, deduper, client_id):
        """Perform training on Dedupe with labeled examples.

        Finalises the training phase after providing labeled examples, then
        persists the labelled pairs and the learned settings to disk so the
        model can be reused for this client.

        Arguments:
            deduper {Dedupe} -- Dedupe instance trained with labelled examples
            client_id {str} -- Unique client identifier

        """
        logger.debug("Finalising deduper training")
        deduper.train()

        logger.debug("Writing training files to disk")
        # Both artefacts live in the same per-client directory, so create
        # it once (the original called os.makedirs twice on the same path).
        output_dir = os.path.join("training-data", client_id)
        os.makedirs(output_dir, exist_ok=True)
        training_file = os.path.join(output_dir, "dedupe_training.json")
        settings_file = os.path.join(output_dir, "dedupe_learned_settings")

        # Training pairs are JSON (text); learned settings are binary.
        with open(training_file, "w") as tf:
            deduper.writeTraining(tf)
        with open(settings_file, "wb") as sf:
            deduper.writeSettings(sf)
Beispiel #6
0
    def dedupe_mark(self, deduper, labelled_pairs):
        """Mark labeled pairs.

        Marks labelled pairs with the Dedupe instance for reinforcement
        training. Thin wrapper around ``Dedupe.markPairs``.

        Arguments:
            deduper {Dedupe} -- Dedupe instance
            labelled_pairs {list} -- List of pairs labelled as distinct or a match

        """
        logger.debug("Marking labelled training pairs")
        deduper.markPairs(labelled_pairs)
Beispiel #7
0
def data_cluster(deduper, data_dict, threshold):
    """Cluster matched records above the given score threshold.

    Arguments:
        deduper {Dedupe} -- Trained Dedupe instance.
        data_dict {dict} -- Record dictionary produced by data_prep.
        threshold {float} -- Score threshold for accepting matches.

    Returns:
        DataFrame, int -- Per-record cluster assignments (indexed by record
            id, with cluster_id and confidence columns) and the number of
            duplicate clusters found.

    """
    logger.debug("Clustering data")
    duplicates = deduper.match(data_dict, threshold)
    logger.info("Duplicate records found: %d", len(duplicates))

    df_data = [
        {"id": record_id, "cluster_id": cluster_id, "confidence": score}
        for cluster_id, records in enumerate(duplicates)
        for record_id, score in zip(*records)
    ]

    # With no matches, set_index("id") would raise KeyError on an empty
    # frame; return an explicitly empty result instead.
    if not df_data:
        return pd.DataFrame(), 0

    clustered_df = pd.DataFrame(df_data)
    clustered_df = clustered_df.set_index("id")

    # Callers (see dedupe_deduplicate) unpack two values: the cluster
    # assignments and the duplicate count. The original returned only the
    # DataFrame, which broke that unpacking.
    return clustered_df, len(duplicates)
Beispiel #8
0
    def dedupe_prep(self, fields):
        """Prepare a Dedupe instance.

        Builds a fresh Dedupe instance from the supplied data fields,
        ready for the sampling/training phase.

        Arguments:
            fields {list} -- List of fields representing the input data columns.

        Returns:
            Dedupe -- New Deduper for further training

        """
        logger.debug("Preparing deduper training phase %s", fields)
        return dedupe.Dedupe(select_fields(fields))
Beispiel #9
0
async def existing(request):
    """File upload endpoint using existing model.

    ---
    description: Receives data for deduplication.
    tags:
    - Upload
    produces:
    - application/json
    responses:
        "200":
            description: successful operation. Return confirmation response.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true

    """
    client_id = request.headers.get("clientId")
    logger.debug("Receiving data for classification")
    reader = await request.multipart()
    field = await reader.next()
    filepath = os.path.join("profile-data/", client_id, field.filename)
    size = 0

    # Stream the upload to disk chunk by chunk to avoid buffering the
    # whole file in memory.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "wb") as f:
        while True:
            chunk = await field.read_chunk()
            if not chunk:
                break
            size += len(chunk)
            f.write(chunk)

    # Queue the file for classification with the already-trained model.
    message = message_wrapper(client_id, {
        "use_model": True,
        "filepath": filepath
    })
    await app["message_queue"].put(message)

    # NOTE(review): "recieved" is misspelled but is part of the response
    # wire format clients may depend on — left unchanged deliberately.
    return web.json_response({"recieved": field.filename, "size": size})
Beispiel #10
0
    def dedupe_sample(self, deduper, df, sample_size=0.3):
        """Sample supplied DataFrame for training.

        Selects a representative sample from the input DataFrame to be used in
        the training phase. Sample size is banded by dataset size: under 500
        records the whole dataset is used, over 5000 the ``sample_size``
        fraction is used, and in between a reduced fixed fraction (0.2) is
        used.

        Arguments:
            deduper {Dedupe} -- Active Deduper instance
            df {DataFrame} -- DataFrame containing training + classification data

        Keyword Arguments:
            sample_size {number} -- Sample fraction used for datasets larger
                than 5000 records (default: {0.3})

        """
        df, data_dict = data_prep(df)

        logger.debug("Getting candidate training matches")
        if len(data_dict) < 500:
            # Tiny dataset: sample everything.
            sample_num = len(data_dict)
        elif len(data_dict) > 5000:
            sample_num = math.floor(len(data_dict) * sample_size)
        else:
            # Mid-sized dataset (500-5000): use a smaller fixed fraction
            # than sample_size.
            logger.debug("Dataset too small, decreasing sample size")
            sample_num = math.floor(len(data_dict) * 0.2)

        logger.debug("Requesting data sample of %s records", sample_num)
        deduper.sample(data_dict, sample_num)
Beispiel #11
0
async def training_post(request):
    """Training endpoint for submitting labelled data.

    ---
    description: Accepts labeled data for training the data matching model.
    tags:
    - Training
    produces:
    - application/json
    responses:
        "200":
            description: Successful operation. Return further training records.
        "400":
            description: Unsuccessful operation. Labelled data not supplied.
    parameters:
      - in: header
        name: clientId
        schema:
          type: string
          format: uuid
        required: true
      - in: body
        name: body
        description: Labelled training data
        required: true
        schema:
          type: object
          properties:
            match:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'
            distinct:
              type: array
              items:
                type: array
                items:
                  - $ref: '#/definitions/Person'
                  - $ref: '#/definitions/Person'

    """
    if request.body_exists and request.can_read_body:
        logger.debug("Labelled training data received.")
        client_id = request.headers.get("clientId")
        datastore = get_datastore(client_id)

        # While within the labeling budget, forward the labeled pairs to
        # the worker; once exhausted, signal that labeling is complete.
        if datastore.training_rounds < 4:
            logger.info("Updating training pairs for labeling")
            labeled_pairs = await request.json()
            message = message_wrapper(client_id,
                                      {"labeled_pairs": labeled_pairs})
            await app["message_queue"].put(message)
        else:
            message = message_wrapper(client_id, {"labeling_complete": True})
            await app["message_queue"].put(message)
        return web.Response(status=201)

    # No readable body supplied.
    return web.Response(status=400)