Code example #1
def train_gazetteer(messy, canonical, model_settings=None, should_index=False):
    """
    Train and return a dedupe.Gazetteer using the specified messy and canonical
    dictionaries. The messy and canonical objects should have the same
    structure:
      - The key is a unique ID
      - The value is another dictionary of field:value pairs. This dictionary
        must contain at least 'country', 'name', and 'address' keys.

    Reads a training.json file containing positive and negative matches.
    """
    if model_settings:
        gazetteer = dedupe.StaticGazetteer(model_settings)
    else:
        fields = [
            {
                'field': 'country',
                'type': 'Exact'
            },
            {
                'field': 'name',
                'type': 'String'
            },
            {
                'field': 'address',
                'type': 'String'
            },
        ]

        gazetteer = dedupe.Gazetteer(fields)
        gazetteer.sample(messy, canonical, 15000)
        training_file = os.path.join(settings.BASE_DIR, 'api', 'data',
                                     'training.json')
        with open(training_file) as tf:
            gazetteer.readTraining(tf)
        gazetteer.train()
        gazetteer.cleanupTraining()

    if should_index:
        index_start = datetime.now()
        logger.info('Indexing started')
        gazetteer.index(canonical)
        index_duration = datetime.now() - index_start
        logger.info('Indexing finished ({})'.format(index_duration))
        logger.info('Cleanup training')

    return gazetteer
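
A minimal, hypothetical usage sketch for the function above. The record IDs and field values are invented, and the surrounding module is assumed to provide the dedupe, os, settings and logger objects the snippet relies on:

messy = {
    'm1': {'country': 'US', 'name': 'Acme Corp', 'address': '123 Main St'},
    'm2': {'country': 'US', 'name': 'acme corporation', 'address': '123 Main Street'},
}
canonical = {
    'c1': {'country': 'US', 'name': 'ACME Corporation', 'address': '123 Main Street'},
}

# Trains from api/data/training.json and indexes the canonical records.
gazetteer = train_gazetteer(messy, canonical, should_index=True)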
Code example #2
def gazetteer_dataframes(messy_df,
                         canonical_df,
                         field_properties,
                         recall_weight,
                         n_matches,
                         config_name="gazetteer_dataframes"):

    config_name = config_name.replace(" ", "_")

    settings_file = config_name + '_learned_settings'
    training_file = config_name + '_training.json'

    print('importing data ...')

    messy_df = clean_punctuation(messy_df)
    specify_type(messy_df, field_properties)

    messy_df['index_field'] = messy_df.index
    messy_df['index_field'] = messy_df['index_field'].apply(
        lambda x: "messy_df" + str(x))
    messy_df.set_index(['index_field'], inplace=True)

    data_1 = messy_df.to_dict(orient='index')

    canonical_df = clean_punctuation(canonical_df)
    specify_type(canonical_df, field_properties)

    canonical_df['index_field'] = canonical_df.index
    canonical_df['index_field'] = canonical_df['index_field'].apply(
        lambda x: "canonical_df" + str(x))
    canonical_df.set_index(['index_field'], inplace=True)

    data_2 = canonical_df.to_dict(orient='index')

    # ---------------------------------------------------------------------------------

    # ## Training

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)

    else:
        # Define the fields the linker will pay attention to,
        # built from the supplied field_properties.

        fields = []
        select_fields(fields, field_properties)

        # Create a new gazetteer object and pass our data model to it.
        gazetteer = dedupe.Gazetteer(fields)

        # To train the gazetteer, we feed it a sample of records.
        gazetteer.sample(data_1, data_2, 15000)

        # If we have training data saved from a previous run of the linker,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print('reading labeled examples from ', training_file)
            with open(training_file) as tf:
                gazetteer.readTraining(tf)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as matches
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print('starting active labeling...')

        dedupe.consoleLabel(gazetteer)

        gazetteer.train()

        # When finished, save our training data to disk

        with open(training_file, 'w') as tf:
            gazetteer.writeTraining(tf)

        # Make the canonical set
        gazetteer.index(data_2)

        # Save our weights and predicates to disk. If the settings file exists,
        # we will skip all training and learning next time we run this file.
        with open(settings_file, 'wb') as sf:
            gazetteer.writeSettings(sf, index=True)

        gazetteer.cleanupTraining()

    gazetteer.index(data_2)
    #Calc threshold
    print('start calculating threshold')
    threshold = gazetteer.threshold(data_1, recall_weight)
    print('Threshold: {}'.format(threshold))

    results = gazetteer.match(data_1, threshold=threshold, n_matches=n_matches)

    results_df = pd.DataFrame(results)

    results_df['messy_df_link'] = results_df[0].apply(lambda x: x[0][0])
    results_df['messy_df_link'] = results_df['messy_df_link'].str.strip(
        'messy_df')
    results_df['messy_df_link'] = results_df['messy_df_link'].astype(int)

    results_df['canonical_df_link'] = results_df[0].apply(lambda x: x[0][1])
    results_df['canonical_df_link'] = results_df[
        'canonical_df_link'].str.strip('canonical_df')
    results_df['canonical_df_link'] = results_df['canonical_df_link'].astype(
        int)

    results_df['confidence'] = results_df[0].apply(lambda x: x[1])
    results_df['cluster id'] = results_df.index

    results_df = results_df.rename(columns={0: 'results'})
    results_df['results'] = results_df['results'].astype(str)

    # For both messy_df & canonical_df, add cluster id & confidence score from results_df
    results_df_copy = results_df.copy()
    messy_df.index.rename('messy_df_link', inplace=True)
    messy_df = messy_df.rename(columns={'unique_id': 'messy_unique_id'})
    messy_df = messy_df.merge(results_df_copy, on='messy_df_link', how='left')

    canonical_df.index.rename('canonical_df_link', inplace=True)
    canonical_df = canonical_df.rename(
        columns={'unique_id': 'canonical_unique_id'})
    canonical_df = canonical_df.merge(results_df_copy,
                                      on='canonical_df_link',
                                      how='left')

    #Merge messy_df & canonical_df together
    final_df = messy_df.merge(canonical_df, on='results')

    return final_df
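
A hedged example of calling gazetteer_dataframes on two small pandas DataFrames. The helpers it uses (clean_punctuation, specify_type, select_fields) come from the same module, and the field_properties format shown here is an assumption about what select_fields expects:

import pandas as pd

messy_df = pd.DataFrame({'name': ['acme corp'], 'address': ['123 main st']})
canonical_df = pd.DataFrame({'name': ['ACME Corporation'],
                             'address': ['123 Main Street']})

linked = gazetteer_dataframes(
    messy_df,
    canonical_df,
    field_properties=['name', 'address'],  # assumed shape; see select_fields
    recall_weight=1,
    n_matches=1,
)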
Code example #3
    print('importing data ...')
    messy = readData(messy_file)
    print('N data 1 records: {}'.format(len(messy)))

    canonical = readData(canon_file)
    print('N data 2 records: {}'.format(len(canonical)))

    def descriptions():
        for dataset in (messy, canonical):
            for record in dataset.values():
                yield record['description']

    if os.path.exists(settings_file):
        print('reading from', settings_file)
        with open(settings_file, 'rb') as sf:
            gazetteer = dedupe.StaticGazetteer(sf)

    else:
        # Define the fields the gazetteer will pay attention to
        #
        # Notice how we are telling the gazetteer to use a custom
        # field comparator for the 'price' field.
        fields = [{
            'field': 'title',
            'type': 'String'
        }, {
            'field': 'title',
            'type': 'Text',
            'corpus': descriptions()
        }, {
            'field': 'description',
Code example #4
def match():
    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': ''' 
                The content of your request should be a 
                string encoded JSON object.
            ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp
    r, status_code, sess = validate_post(post)
    if r['status'] != 'error':
        api_key = post['api_key']
        session_id = post['session_id']
        n_matches = post.get('num_matches', 5)
        obj = post['object']
        field_defs = json.loads(sess.field_defs)
        model_fields = sorted(list(set([f['field'] for f in field_defs])))
        fields = ', '.join(['r.{0}'.format(f) for f in model_fields])
        engine = db_session.bind
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata, 
            autoload=True, autoload_with=engine, keep_existing=True)
        try:
            hash_me = []
            for field in model_fields:
                if obj[field]:
                    hash_me.append(unicode(obj[field]))
                else:
                    hash_me.append('')
            hash_me = ';'.join(hash_me)
        except KeyError, e:
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do not match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            resp = make_response(json.dumps(r), 400)
            resp.headers['Content-Type'] = 'application/json'
            return resp
        if set(obj.keys()).isdisjoint(set(model_fields)):
            r['status'] = 'error'
            r['message'] = 'Sent fields "{0}" do not match model fields "{1}"'\
                .format(','.join(obj.keys()), ','.join(model_fields))
            resp = make_response(json.dumps(r), 400)
            resp.headers['Content-Type'] = 'application/json'
            return resp
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        exact_match = db_session.query(entity_table)\
            .filter(entity_table.c.source_hash == md5_hash).first()
        match_list = []
        if exact_match:
            sel = text(''' 
                  SELECT {0} 
                  FROM "raw_{1}" AS r
                  JOIN "entity_{1}" AS e
                    ON r.record_id = e.record_id
                  WHERE e.entity_id = :entity_id
                  LIMIT :limit
                '''.format(fields, session_id))
            rows = []
            with engine.begin() as conn:
                rows = list(conn.execute(sel, 
                    entity_id=exact_match.entity_id, limit=n_matches))
            for row in rows:
                d = {f: getattr(row, f) for f in model_fields}
                d['entity_id'] = exact_match.entity_id
                d['match_confidence'] = '1.0'
                match_list.append(d)
        else:
            deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
            for k,v in obj.items():
                obj[k] = preProcess(unicode(v))
            block_keys = tuple([b[0] for b in list(deduper.blocker([('blob', obj)]))])
            
            # Sometimes the blocker does not find blocks. In this case we can't match
            if block_keys:
                sel = text('''
                      SELECT r.record_id, {1}
                      FROM "processed_{0}" as r
                      JOIN (
                        SELECT record_id
                        FROM "match_blocks_{0}"
                        WHERE block_key IN :block_keys
                      ) AS s
                      ON r.record_id = s.record_id
                    '''.format(session_id, fields))
                with engine.begin() as conn:
                    data_d = {int(i[0]): dict(zip(model_fields, i[1:])) \
                        for i in list(conn.execute(sel, block_keys=block_keys))}
                if data_d:
                    deduper.index(data_d)
                    linked = deduper.match({'blob': obj}, threshold=0, n_matches=n_matches)
                    if linked:
                        ids = []
                        confs = {}
                        for l in linked[0]:
                            id_set, confidence = l
                            ids.extend([i for i in id_set if i != 'blob'])
                            confs[id_set[1]] = confidence
                        ids = tuple(set(ids))
                        sel = text(''' 
                              SELECT {0}, r.record_id, e.entity_id
                              FROM "raw_{1}" as r
                              JOIN "entity_{1}" as e
                                ON r.record_id = e.record_id
                              WHERE r.record_id IN :ids
                            '''.format(fields, session_id))
                        matches = []
                        with engine.begin() as conn:
                            matches = list(conn.execute(sel, ids=ids))
                        for match in matches:
                            m = {f: getattr(match, f) for f in model_fields}
                            m['record_id'] = getattr(match, 'record_id')
                            m['entity_id'] = getattr(match, 'entity_id')
                            # m['match_confidence'] = float(confs[str(m['entity_id'])])
                            match_list.append(m)
            else:
                if sentry:
                    sentry.captureMessage('Unable to block record', extra=post)
        r['matches'] = match_list
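
Stripped of the Flask routing and SQL, the core gazetteer flow in the view above is: load a StaticGazetteer from saved settings, block the incoming record, index only the candidate records that share a block key, and ask for the best matches. A rough sketch with made-up data and a hypothetical settings path:

import dedupe

with open('gaz_learned_settings', 'rb') as sf:  # hypothetical path
    gazetteer = dedupe.StaticGazetteer(sf)

incoming = {'name': 'acme corp', 'address': '123 main st'}  # made-up record
block_keys = [key for key, _ in gazetteer.blocker([('blob', incoming)])]

# In the view the block keys drive a SQL query; here the candidates are inlined.
candidates = {1: {'name': 'ACME Corporation', 'address': '123 Main Street'}}
gazetteer.index(candidates)

matches = gazetteer.match({'blob': incoming}, threshold=0, n_matches=5)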
Code example #5
def add_entity():
    ''' 
    Add an entry to the entity map. 
    POST data should be a string encoded JSON object which looks like:
    
    {
        "object": {
            "city":"Macon",
            "cont_name":"Kinght & Fisher, LLP",
            "zip":"31201",
            "firstname":null,
            "employer":null,
            "address":"350 Second St",
            "record_id":3,
            "type":"Monetary",
            "occupation":null
        },
        "api_key":"6bf73c41-404e-47ae-bc2d-051e935c298e",
        "match_id": 100,
    }

    The object key should contain a mapping of fields that are in the data
    model. If the record_id field is present, an attempt will be made to look
    up the record in the raw / processed table before making the entry. If
    match_id is present, the record will be added as a member of the entity
    referenced by the id.
    '''
    r = {
        'status': 'ok',
        'message': ""
    }
    status_code = 200
    session_id = flask_session['session_id']

    try:
        post = json.loads(request.data)
    except ValueError:
        r = {
            'status': 'error',
            'message': ''' 
                The content of your request should be a 
                string encoded JSON object.
            ''',
            'object': request.data,
        }
        resp = make_response(json.dumps(r), 400)
        resp.headers['Content-Type'] = 'application/json'
        return resp
    obj = post['object']
    record_id = obj.get('record_id')
    if record_id:
        del obj['record_id']
    match_id = json.loads(request.data).get('match_id')
    sess = db_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    if not set(fds.keys()) == set(obj.keys()):
        r['status'] = 'error'
        r['message'] = "The fields in the object do not match the fields in the model"
        status_code = 400
    else:
        engine = db_session.bind
        proc_table = Table('processed_{0}'.format(session_id), Base.metadata, 
            autoload=True, autoload_with=engine, keep_existing=True)
        row = db_session.query(proc_table)\
            .filter(proc_table.c.record_id == record_id)\
            .first()
        if not row: # pragma: no cover
            raw_table = Table('raw_{0}'.format(session_id), Base.metadata, 
                autoload=True, autoload_with=engine, keep_existing=True)
            proc_ins = 'INSERT INTO "processed_{0}" (SELECT record_id, '\
                .format(session_id)
            for idx, field in enumerate(fds.keys()):
                try:
                    field_types = fds[field]
                except KeyError:
                    field_types = ['String']
                # TODO: Need to figure out how to parse a LatLong field type
                if 'Price' in field_types:
                    col_def = 'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'.format(field)
                else:
                    col_def = 'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'.format(field)
                if idx < len(fds.keys()) - 1:
                    proc_ins += '{0}, '.format(col_def)
                else:
                    proc_ins += '{0} '.format(col_def)
            else:
                proc_ins += 'FROM "raw_{0}" WHERE record_id = :record_id)'\
                    .format(session_id)

            with engine.begin() as conn:
                record_id = conn.execute(raw_table.insert()\
                    .returning(raw_table.c.record_id) , **obj)
                conn.execute(text(proc_ins), record_id=record_id)
        hash_me = ';'.join([preProcess(unicode(obj[i])) for i in fds.keys()])
        md5_hash = md5(unidecode(hash_me)).hexdigest()
        entity = {
            'entity_id': unicode(uuid4()),
            'record_id': record_id,
            'source_hash': md5_hash,
            'clustered': True,
            'checked_out': False,
        }
        entity_table = Table('entity_{0}'.format(session_id), Base.metadata, 
            autoload=True, autoload_with=engine, keep_existing=True)
        if match_id:
            entity['target_record_id'] = match_id
            entity_id = db_session.query(entity_table.c.entity_id)\
                .filter(entity_table.c.record_id == match_id)\
                .first()
            entity['entity_id'] = entity_id.entity_id
        with engine.begin() as conn:
            conn.execute(entity_table.insert(), **entity)
        deduper = dedupe.StaticGazetteer(StringIO(sess.gaz_settings_file))
        for k,v in obj.items():
            obj[k] = preProcess(unicode(v))
        block_keys = [{'record_id': b[1], 'block_key': b[0]} \
                for b in list(deduper.blocker([(record_id, obj)]))]
        with engine.begin() as conn:
            conn.execute(text(''' 
                INSERT INTO "match_blocks_{0}" (
                    block_key,
                    record_id
                ) VALUES (:block_key, :record_id)
            '''.format(sess.id)), *block_keys)
    if sess.review_count:
        sess.review_count = sess.review_count - 1
        db_session.add(sess)
        db_session.commit()
    resp = make_response(json.dumps(r), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
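
A hypothetical client call against this endpoint. The URL is an assumption, and the payload mirrors the example in the docstring above:

import requests

payload = {
    'object': {
        'city': 'Macon',
        'cont_name': 'Kinght & Fisher, LLP',
        'zip': '31201',
        'firstname': None,
        'employer': None,
        'address': '350 Second St',
        'record_id': 3,
        'type': 'Monetary',
        'occupation': None,
    },
    'api_key': '6bf73c41-404e-47ae-bc2d-051e935c298e',
    'match_id': 100,
}
resp = requests.post('http://localhost:5000/add-entity', json=payload)  # URL assumed
print(resp.json())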
Code example #6
def _train(settings_file, training_file, clean_data, messy_data,
           field_properties, sample_size, update_model, n_cores):
    """Internal method that trains the deduper model from scratch or update
        an existing dedupe model.
        Parameters
        ----------
        settings_file : str
            A path to a settings file that will be loaded if it exists.
        training_file : str
            A path to a training file that will be loaded to keep training
            from.
        clean_data : dict
            The dictionary form of the gazette that gazetteer_dedupe requires.
        messy_data : dict
            The dictionary form of the messy data that needs to be deduplicated 
            (and canonicalized)
        field_properties : dict
            The mapping of fields to their respective data types. Please
            see the dedupe documentation for further details.
        sample_size : float, default 0.3
            Specify the sample size used for training as a float from 0 to 1.
            By default it is 30% (0.3) of our data.
        update_model : bool, default False
            If True, it allows user to update existing model by uploading
            training file.
        n_cores : int, default None
            Specify the number of cores to use during clustering.
            By default n_cores is equal to None (i.e. use multiprocessing equal to the CPU count).
        Returns
        -------
        dedupe.Gazetteer
            A gazetteer model instance.
    """
    # Define the fields dedupe will pay attention to
    fields = []
    select_fields(fields, [field_properties])

    if not update_model:

        # If a settings file already exists, we'll just load that and skip training
        if os.path.exists(settings_file):
            print('Reading from', settings_file)
            with open(settings_file, 'rb') as f:
                deduper = dedupe.StaticGazetteer(f, num_cores=n_cores)

        #Create a new deduper object and pass our data model to it.
        else:
            # Initialise dedupe
            deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

            # Launch active learning
            deduper = _active_learning(clean_data, messy_data, sample_size,
                                       deduper, training_file, settings_file)

    else:
        # ## Training
        # Initialise dedupe
        deduper = dedupe.Gazetteer(fields, num_cores=n_cores)

        # Import existing model
        print('Reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(clean_data, messy_data, training_file=f)

        # Launch active learning
        deduper = _active_learning(clean_data, messy_data, sample_size,
                                   deduper, training_file, settings_file)

    return deduper
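
A hedged sketch of invoking _train. The file names are placeholders, clean_data / messy_data are the usual {record_id: {field: value}} dictionaries, and the field_properties shape is assumed from the docstring's description:

deduper = _train(
    settings_file='gazetteer_learned_settings',
    training_file='gazetteer_training.json',
    clean_data=clean_data,
    messy_data=messy_data,
    field_properties={'name': 'String', 'address': 'String'},  # assumed shape
    sample_size=0.3,
    update_model=False,
    n_cores=None,
)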
Code example #7
def link_games(
    gazetteer,
    paths,
    id_prefixes=None,
    id_fields=None,
    training_file=None,
    manual_labelling=False,
    threshold=None,
    recall_weight=1,
    output=None,
    pretty_print=True,
):
    """ find links for games """

    paths = tuple(arg_to_iter(paths))
    if len(paths) < 2:
        raise ValueError(
            f"need at least 2 files to link games, but received {paths}")

    id_prefixes = tuple(arg_to_iter(id_prefixes))
    id_prefixes = id_prefixes + tuple(map(_filename, paths[len(id_prefixes):]))
    id_fields = tuple(arg_to_iter(id_fields))
    id_fields = id_fields + tuple(f"{prefix}_id"
                                  for prefix in id_prefixes[len(id_fields):])

    games_canonical = _load_games(paths[0])
    data_canonical = _make_data(games_canonical, id_fields[0], id_prefixes[0])
    del games_canonical

    LOGGER.info("loaded %d games in the canonical dataset <%s>",
                len(data_canonical), paths[0])

    data_link = {}
    for path, id_field, id_prefix in zip(paths[1:], id_fields[1:],
                                         id_prefixes[1:]):
        games = _load_games(path)
        data_link.update(_make_data(games, id_field, id_prefix))
        del games

    LOGGER.info("loaded %d games to link in the datasets %s", len(data_link),
                paths[1:])

    if training_file:
        gazetteer_trained = _train_gazetteer(
            data_canonical,
            data_link,
            training_file=training_file,
            manual_labelling=manual_labelling,
            pretty_print=pretty_print,
        )

        if isinstance(gazetteer, str):
            LOGGER.info("saving gazetteer model to <%s>", gazetteer)
            with smart_open(gazetteer, "wb") as file_obj:
                gazetteer_trained.writeSettings(file_obj)

        gazetteer = gazetteer_trained
        del gazetteer_trained

    elif isinstance(gazetteer, str):
        LOGGER.info("reading gazetteer model from <%s>", gazetteer)
        with smart_open(gazetteer, "rb") as file_obj:
            gazetteer = dedupe.StaticGazetteer(file_obj)

    gazetteer.index(data_canonical)

    LOGGER.info("using gazetteer model %r", gazetteer)

    threshold = threshold or gazetteer.threshold(data_link,
                                                 recall_weight=recall_weight)

    LOGGER.info("using threshold %.3f", threshold)

    clusters = gazetteer.match(messy_data=data_link,
                               threshold=threshold,
                               n_matches=None,
                               generator=True)
    links = defaultdict(set)
    del gazetteer

    for cluster in clusters:
        for (id_link, id_canonical), _ in cluster:
            links[id_canonical].add(id_link)
    del clusters

    LOGGER.info("found links for %d items", len(links))

    if output == "-":
        for id_canonical, linked in links.items():
            LOGGER.info("%s <-> %s", id_canonical, linked)

    elif output:
        LOGGER.info("saving clusters as JSON to <%s>", output)
        links_sorted = {
            key: sorted(value)
            for key, value in links.items() if key and value
        }
        json_formats = {"sort_keys": True, "indent": 4} if pretty_print else {}
        with smart_open(output, "w") as file_obj:
            serialize_json(obj=links_sorted, file=file_obj, **json_formats)
        del links_sorted

    return links
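
A hypothetical invocation of link_games; the file names are placeholders, and the first path is treated as the canonical dataset:

links = link_games(
    gazetteer='gazetteer.pickle',        # model is saved here after training
    paths=('canonical.jl', 'secondary.jl'),
    training_file='training.json',       # presence of this triggers training
    manual_labelling=False,
    recall_weight=1,
    output='links.json',
)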
Code example #8
output_file = 'address_matching_output.csv'
settings_file = 'address_matching_learned_settings'
training_file = 'address_matching_training.json'
canonical_file = 'data/chicago_addresses.csv'
messy_file = 'data/messy_addresses.csv'

print 'importing data ...'
messy_addresses = readData(messy_file)

canonical_addresses = readData(canonical_file)

# ## Training
if os.path.exists(settings_file):
    print 'reading from', settings_file
    with open(settings_file) as sf:
        linker = dedupe.StaticGazetteer(sf, num_cores=2)

else:
    # Define the fields dedupe will pay attention to
    fields = [{'field': 'Address', 'type': 'Address'}]

    # Create a new linker object and pass our data model to it.
    linker = dedupe.Gazetteer(fields, num_cores=2)
    # To train dedupe, we feed it a random sample of records.
    linker.sample(messy_addresses, canonical_addresses, 30000)

    if os.path.exists(training_file):
        print 'reading labeled examples from ', training_file
        with open(training_file) as tf:
            linker.readTraining(tf)
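
    # The snippet is cut off above. Not part of the original file: in the
    # standard dedupe 1.x gazetteer example this branch typically continues
    # roughly as follows (console labelling, training, then saving the
    # training data and learned settings for later runs).
    dedupe.consoleLabel(linker)
    linker.train()

    with open(training_file, 'w') as tf:
        linker.writeTraining(tf)
    with open(settings_file, 'wb') as sf:
        linker.writeSettings(sf)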