Example #1
def clusterDedupe(session_id, canonical=False, threshold=0.75):
    dd_session = worker_session.query(DedupeSession)\
        .get(session_id)
    deduper = dedupe.StaticDedupe(StringIO(dd_session.settings_file))
    engine = worker_session.bind
    metadata = MetaData()
    sc_format = 'small_cov_{0}'
    proc_format = 'processed_{0}'
    if canonical:
        sc_format = 'small_cov_{0}_cr'
        proc_format = 'cr_{0}'
    small_cov = Table(sc_format.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    proc = Table(proc_format.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    trained_fields = list(set([f['field'] for f in json.loads(dd_session.field_defs)]))
    proc_cols = [getattr(proc.c, f) for f in trained_fields]
    cols = [c for c in small_cov.columns] + proc_cols
    rows = worker_session.query(*cols)\
        .join(proc, small_cov.c.record_id == proc.c.record_id)
    fields = [c.name for c in cols]
    clustered_dupes = []
    while not clustered_dupes:
        clustered_dupes = deduper.matchBlocks(
            clusterGen(windowed_query(rows, small_cov.c.block_id, 50000), fields), 
            threshold=threshold
        )
        threshold = threshold - 0.1
    return clustered_dupes
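The while loop above keeps lowering the threshold by 0.1 until matchBlocks returns at least one cluster. A minimal standalone sketch of that fallback pattern, with a hypothetical find_clusters callable standing in for deduper.matchBlocks and an explicit floor added so the retry cannot run forever (the original has no such cutoff):

def cluster_with_fallback(find_clusters, threshold=0.75, step=0.1, floor=0.0):
    # Retry with a progressively looser threshold until something matches,
    # giving up once the threshold would drop to the floor.
    clusters = []
    while not clusters and threshold > floor:
        clusters = find_clusters(threshold=threshold)
        threshold -= step
    return clusters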
Example #2
def reDedupeCanon(session_id, threshold=0.25):
    upd = text(''' 
        UPDATE "entity_{0}" SET
            entity_id = subq.old_entity_id,
            last_update = :last_update
        FROM (
            SELECT 
               c.record_id AS old_entity_id,
               e.entity_id AS new_entity_id
            FROM "entity_{0}_cr" AS c
            JOIN "entity_{0}" AS e
                ON c.target_record_id = e.entity_id
            WHERE c.clustered = TRUE
            ) AS subq
        WHERE "entity_{0}".entity_id = subq.new_entity_id
    '''.format(session_id))
    engine = worker_session.bind
    last_update = datetime.now().replace(tzinfo=TIME_ZONE)
    with engine.begin() as c:
        c.execute(upd, last_update=last_update)
    dedupeCanon(session_id, threshold=threshold)
    sess = worker_session.query(DedupeSession).get(session_id)
    sess.status = 'canon clustered'
    worker_session.add(sess)
    worker_session.commit()
    return 'ok'
Example #3
def test_exception(self):
    key = error.delay()
    processMessage()
    time.sleep(1)
    work = worker_session.query(WorkTable).get(key)
    worker_session.refresh(work)
    assert work.value == 'Test Exception'
Example #4
def dedupeCanon(session_id, threshold=0.25):
    dd = worker_session.query(DedupeSession).get(session_id)
    engine = worker_session.bind
    metadata = MetaData()
    writeCanonRep(session_id)
    writeProcessedTable(session_id, 
                        proc_table_format='processed_{0}_cr', 
                        raw_table_format='cr_{0}')
    entity_table_name = 'entity_{0}_cr'.format(session_id)
    entity_table = entity_map(entity_table_name, metadata, record_id_type=String)
    entity_table.drop(bind=engine, checkfirst=True)
    entity_table.create(bind=engine)
    block_gen = blockDedupe(session_id, 
        table_name='processed_{0}_cr'.format(session_id), 
        entity_table_name='entity_{0}_cr'.format(session_id), 
        canonical=True)
    writeBlockingMap(session_id, block_gen, canonical=True)
    clustered_dupes = clusterDedupe(session_id, canonical=True, threshold=threshold)
    if clustered_dupes:
        fname = '/tmp/clusters_{0}.csv'.format(session_id)
        with open(fname, 'wb') as f:
            writer = UnicodeCSVWriter(f)
            for ids, scores in clustered_dupes:
                new_ent = unicode(uuid4())
                writer.writerow([
                    new_ent,
                    ids[0],
                    scores[0],
                    None,
                    False,
                    False,
                ])
                for id, score in zip(ids[1:], scores):
                    writer.writerow([
                        new_ent,
                        id,
                        score,
                        ids[0],
                        False,
                        False,
                    ])
        with open(fname, 'rb') as f:
            conn = engine.raw_connection()
            cur = conn.cursor()
            try:
                cur.copy_expert(''' 
                    COPY "entity_{0}_cr" (
                        entity_id,
                        record_id,
                        confidence,
                        target_record_id,
                        clustered,
                        checked_out
                    ) 
                    FROM STDIN CSV'''.format(session_id), f)
                conn.commit()
                os.remove(fname)
            except Exception, e: # pragma: no cover
                conn.rollback()
                raise e
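Each cluster is flattened into one CSV row per record: the first record acts as the cluster head (its target_record_id stays empty) and every other record points back at it. A small sketch of that layout, mirroring the column list in the COPY statement above (the helper name is illustrative):

def cluster_rows(entity_id, ids, scores):
    # Columns: entity_id, record_id, confidence, target_record_id,
    #          clustered, checked_out
    yield [entity_id, ids[0], scores[0], None, False, False]
    for record_id, score in zip(ids[1:], scores):
        yield [entity_id, record_id, score, ids[0], False, False]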
Example #5
def initializeModel(session_id):
    sess = worker_session.query(DedupeSession).get(session_id)
    while True:
        worker_session.refresh(sess, ['field_defs', 'sample'])
        if not sess.field_defs: # pragma: no cover
            time.sleep(3)
        else:
            field_defs = json.loads(sess.field_defs)
            fields = list(set([f['field'] for f in field_defs]))
            writeProcessedTable(session_id)
            updated_fds = []
            for field in field_defs:
                if field['type'] == 'Categorical':
                    categories = getDistinct(field['field'], session_id)
                    if len(categories) <= 6:
                        field.update({'categories': categories})
                    else:
                        field['type'] = 'Exact'
                updated_fds.append(field)
            sess.field_defs = json.dumps(updated_fds)
            worker_session.add(sess)
            worker_session.commit()
            initializeEntityMap(session_id, fields)
            drawSample(session_id)
            print 'got sample'
            break
    return 'woo'
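The loop above keeps a Categorical field only when it has at most six distinct values; otherwise it downgrades the field to an Exact comparison. The same rule as an isolated sketch (the function name and max_categories parameter are just for illustration):

def resolve_categorical(field, distinct_values, max_categories=6):
    # Categorical comparisons only make sense for a small, enumerable set
    # of values; larger sets fall back to exact comparison.
    if field['type'] == 'Categorical':
        if len(distinct_values) <= max_categories:
            field['categories'] = distinct_values
        else:
            field['type'] = 'Exact'
    return field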
Example #6
def initializeSession(session_id):
    sess = worker_session.query(DedupeSession).get(session_id)
    file_path = '/tmp/{0}_raw.csv'.format(session_id)
    kwargs = {
        'session_id':session_id,
        'file_path':file_path
    }
    writeRawTable(**kwargs)
    engine = worker_session.bind
    metadata = MetaData()
    raw_table = Table('raw_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    sess.record_count = worker_session.query(raw_table).count()
    worker_session.add(sess)
    worker_session.commit()
    print 'session initialized'
Example #7
def reDedupeRaw(session_id, threshold=0.75):
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fields = list(set([f['field'] for f in field_defs]))
    initializeEntityMap(session_id, fields)
    dedupeRaw(session_id, threshold=threshold)
    sess.status = 'entity map updated'
    worker_session.add(sess)
    worker_session.commit()
    return 'ok'
Example #8
def drawSample(session_id):
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fields = list(set([f['field'] for f in field_defs]))
    d = dedupe.Dedupe(field_defs)
    data_d = makeSampleDict(sess.id, fields=fields)
    if len(data_d) < 50001:
        sample_size = 5000
    else: # pragma: no cover
        sample_size = round(int(len(data_d) * 0.01), -3)
    d.sample(data_d, sample_size=sample_size, blocked_proportion=1)
    sess.sample = cPickle.dumps(d.data_sample)
    worker_session.add(sess)
    worker_session.commit()
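drawSample uses a flat 5,000-record sample for data sets of up to 50,000 records, and roughly 1% of the data set, rounded to the nearest thousand, above that. The branch as a standalone sketch (names are illustrative):

def pick_sample_size(record_count, flat_size=5000, cutoff=50001, proportion=0.01):
    # Small data sets get a fixed-size sample; larger ones scale with size.
    if record_count < cutoff:
        return flat_size
    return int(round(record_count * proportion, -3))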
Example #9
def writeCanonRep(session_id, name_pattern='cr_{0}'):
    engine = worker_session.bind
    metadata = MetaData()
    entity = Table('entity_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    proc_table = Table('processed_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)

    cr_cols = [Column('record_id', String, primary_key=True)]
    for col in proc_table.columns:
        if col.name != 'record_id':
            cr_cols.append(Column(col.name, col.type))
    cr = Table(name_pattern.format(session_id), metadata, *cr_cols)
    cr.drop(bind=engine, checkfirst=True)
    cr.create(bind=engine)

    cols = [entity.c.entity_id]
    col_names = [c for c in proc_table.columns.keys() if c != 'record_id']
    for name in col_names:
        cols.append(label(name, func.array_agg(getattr(proc_table.c, name))))
    rows = worker_session.query(*cols)\
        .filter(entity.c.record_id == proc_table.c.record_id)\
        .group_by(entity.c.entity_id)
    names = cr.columns.keys()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'wb') as f:
        writer = UnicodeCSVWriter(f)
        writer.writerow(names)
        for row in rows:
            r = [row.entity_id]
            dicts = [dict(**{n:None for n in col_names}) for i in range(len(row[1]))]
            for idx, dct in enumerate(dicts):
                for name in col_names:
                    dicts[idx][name] = unicode(getattr(row, name)[idx])
            canon_form = dedupe.canonicalize(dicts)
            r.extend([canon_form[k] for k in names if canon_form.get(k) is not None])
            writer.writerow(r)
    canon_table_name = name_pattern.format(session_id)
    copy_st = 'COPY "{0}" ('.format(canon_table_name)
    for idx, name in enumerate(names):
        if idx < len(names) - 1:
            copy_st += '"{0}", '.format(name)
        else:
            copy_st += '"{0}")'.format(name)
    copy_st += "FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',', NULL ' ')"
    conn = engine.raw_connection()
    cur = conn.cursor()
    with open('/tmp/{0}.csv'.format(name_pattern.format(session_id)), 'rb') as f:
        cur.copy_expert(copy_st, f)
    conn.commit()
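Each row returned by the array_agg query carries one entity_id followed by a parallel array per column, so the inner loops rebuild one dict per member record before calling dedupe.canonicalize. A small sketch of that unpacking step (the helper is hypothetical; arrays is the list of per-column arrays from a single row):

def unzip_entity_row(col_names, arrays):
    # arrays[i][j] holds column col_names[i] for the j-th member record.
    members = []
    for j in range(len(arrays[0])):
        members.append(dict((name, arrays[i][j])
                            for i, name in enumerate(col_names)))
    return members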
Example #10
def blockDedupe(session_id, 
                table_name=None, 
                entity_table_name=None, 
                canonical=False):

    if not table_name:
        table_name = 'processed_{0}'.format(session_id)
    if not entity_table_name:
        entity_table_name = 'entity_{0}'.format(session_id)
    dd_session = worker_session.query(DedupeSession)\
        .get(session_id)
    deduper = dedupe.StaticDedupe(StringIO(dd_session.settings_file))
    engine = worker_session.bind
    metadata = MetaData()
    proc_table = Table(table_name, metadata,
        autoload=True, autoload_with=engine)
    entity_table = Table(entity_table_name, metadata,
        autoload=True, autoload_with=engine)
    for field in deduper.blocker.tfidf_fields:
        with engine.begin() as conn:
            fd = conn.execute('select record_id, {0} from "{1}"'.format(field, table_name))
            deduper.blocker.tfIdfBlock(fd, field)
    """ 
    SELECT p.* <-- need the fields that we trained on at least
        FROM processed as p
        LEFT OUTER JOIN entity_map as e
           ON p.record_id = e.record_id
        WHERE e.target_record_id IS NULL
    """
    proc_records = worker_session.query(proc_table)\
        .outerjoin(entity_table, proc_table.c.record_id == entity_table.c.record_id)\
        .filter(entity_table.c.target_record_id == None)
    fields = proc_table.columns.keys()
    full_data = ((getattr(row, 'record_id'), dict(zip(fields, row))) \
        for row in proc_records.yield_per(50000))
    return deduper.blocker(full_data)
Example #11
def trainDedupe(session_id):
    dd_session = worker_session.query(DedupeSession)\
        .get(session_id)
    data_sample = cPickle.loads(dd_session.sample)
    deduper = dedupe.Dedupe(json.loads(dd_session.field_defs), 
        data_sample=data_sample)
    training_data = StringIO(dd_session.training_data)
    deduper.readTraining(training_data)
    deduper.train()
    settings_file_obj = StringIO()
    deduper.writeSettings(settings_file_obj)
    dd_session.settings_file = settings_file_obj.getvalue()
    worker_session.add(dd_session)
    worker_session.commit()
    deduper.cleanupTraining()
Example #12
def dedupeRaw(session_id, threshold=0.75):
    trainDedupe(session_id)
    block_gen = blockDedupe(session_id)
    writeBlockingMap(session_id, block_gen, canonical=False)
    clustered_dupes = clusterDedupe(session_id)
    updateEntityMap(clustered_dupes, session_id)
    engine = worker_session.bind
    metadata = MetaData()
    entity_table = Table('entity_{0}'.format(session_id), metadata,
        autoload=True, autoload_with=engine, keep_existing=True)
    entity_count = worker_session.query(entity_table.c.entity_id.distinct())\
        .count()
    review_count = worker_session.query(entity_table.c.entity_id.distinct())\
        .filter(entity_table.c.clustered == False)\
        .count()
    sel = ''' 
        SELECT 
            entity_id, 
            MAX(confidence)::DOUBLE PRECISION,
            COUNT(*)
        FROM "entity_{0}"
        WHERE clustered = FALSE
        GROUP BY entity_id
    '''.format(session_id)
    clusters = list(engine.execute(sel))
    examples = {c[0]:{'attributes':c[1:], 'label': None, 'score': 1.0} \
        for c in clusters}
    machine = ReviewMachine(examples)
    dd = worker_session.query(DedupeSession).get(session_id)
    dd.review_machine = cPickle.dumps(machine)
    dd.entity_count = entity_count
    dd.review_count = review_count
    dd.status = 'entity map updated'
    worker_session.add(dd)
    worker_session.commit()
    return 'ok'
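The examples dict handed to ReviewMachine keys each still-unclustered entity_id to the attributes pulled from the GROUP BY query (maximum confidence and member count). Roughly, with placeholder ids and numbers:

examples = {
    u'3f2a...': {'attributes': (0.87, 3), 'label': None, 'score': 1.0},
    u'9c41...': {'attributes': (0.91, 2), 'label': None, 'score': 1.0},
}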
Example #13
def bulkMarkCanonClusters(session_id, user=None):
    sess = worker_session.query(DedupeSession).get(session_id)
    engine = worker_session.bind
    upd_vals = {
        'user_name': user, 
        'clustered': True,
        'match_type': 'bulk accepted - canon',
        'last_update': datetime.now().replace(tzinfo=TIME_ZONE)
    }
    upd = text(''' 
        UPDATE "entity_{0}" SET 
            entity_id=subq.entity_id,
            clustered= :clustered,
            reviewer = :user_name,
            match_type = :match_type,
            last_update = :last_update
        FROM (
            SELECT 
                c.record_id as canon_record_id,
                c.entity_id, 
                e.record_id 
            FROM "entity_{0}" as e
            JOIN "entity_{0}_cr" as c 
                ON e.entity_id = c.record_id 
            LEFT JOIN (
                SELECT record_id, target_record_id FROM "entity_{0}"
                ) AS s 
                ON e.record_id = s.target_record_id
            ) as subq 
        WHERE "entity_{0}".record_id=subq.record_id
        RETURNING "entity_{0}".entity_id, subq.canon_record_id
        '''.format(session_id))
    with engine.begin() as c:
        updated = c.execute(upd,**upd_vals)
        for row in updated:
            c.execute(text(''' 
                    UPDATE "entity_{0}_cr" SET
                        target_record_id = :target,
                        clustered = TRUE
                    WHERE record_id = :record_id
                '''.format(session_id)),
                target=row[0], record_id=row[1])
    getMatchingReady(session_id)
Example #14
def writeProcessedTable(session_id, 
                        raw_table_format='raw_{0}', 
                        proc_table_format='processed_{0}'):
    dd = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(dd.field_defs)
    fds = {}
    for fd in field_defs:
        try:
            fds[fd['field']].append(fd['type'])
        except KeyError:
            fds[fd['field']] = [fd['type']]
    engine = worker_session.bind
    metadata = MetaData()
    proc_table_name = proc_table_format.format(session_id)
    raw_table_name = raw_table_format.format(session_id)
    raw_table = Table(raw_table_name, metadata, 
        autoload=True, autoload_with=engine)
    raw_fields = [f for f in raw_table.columns.keys() if f != 'record_id']
    create = 'CREATE TABLE "{0}" AS (SELECT record_id, '.format(proc_table_name)
    for idx, field in enumerate(raw_fields):
        try:
            field_types = fds[field]
        except KeyError:
            field_types = ['String']
        # TODO: Need to figure out how to parse a LatLong field type
        if 'Price' in field_types:
            col_def = 'COALESCE(CAST("{0}" AS DOUBLE PRECISION), 0.0) AS {0}'.format(field)
        else:
            col_def = 'CAST(TRIM(COALESCE(LOWER("{0}"), \'\')) AS VARCHAR) AS {0}'.format(field)
        if idx < len(raw_fields) - 1:
            create += '{0}, '.format(col_def)
        else:
            create += '{0} '.format(col_def)
    create += 'FROM "{0}")'.format(raw_table_name)
    create_stmt = text(create)
    with engine.begin() as c:
        c.execute('DROP TABLE IF EXISTS "{0}"'.format(proc_table_name))
    with engine.begin() as c:
        c.execute(create_stmt)
    with engine.begin() as c:
        c.execute('ALTER TABLE "{0}" ADD PRIMARY KEY (record_id)'.format(proc_table_name))
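To make the string building above concrete: for a hypothetical session '123' whose raw table has the columns name (trained as String) and price (trained as Price), the assembled statement would come out as:

create = ('CREATE TABLE "processed_123" AS (SELECT record_id, '
          'CAST(TRIM(COALESCE(LOWER("name"), \'\')) AS VARCHAR) AS name, '
          'COALESCE(CAST("price" AS DOUBLE PRECISION), 0.0) AS price '
          'FROM "raw_123")')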
Example #15
def addRowHash(session_id):
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)
    fields = sorted(list(set([f['field'] for f in field_defs])))
    engine = worker_session.bind
    fields = ["COALESCE(r.{0}, '')".format(f) for f in fields]
    fields = " || ';' || ".join(fields)
    upd = ''' 
      UPDATE "entity_{0}" SET
        source_hash=s.source_hash 
        FROM (
          SELECT 
            MD5({1}) as source_hash,
            r.record_id
          FROM "entity_{0}" as e
          JOIN "raw_{0}" as r
            ON e.record_id = r.record_id
        ) AS s
        WHERE "entity_{0}".record_id = s.record_id
    '''.format(session_id, fields)
    with engine.begin() as conn:
        conn.execute(upd)
Example #16
def test_queuefunc(self):
    key = add.delay(1, 3)
    processMessage()
    time.sleep(1)
    work = worker_session.query(WorkTable).get(key)
    assert work is None
Example #17
                     confidence,
                     target_record_id,
                     clustered,
                     checked_out
                 ) 
                 FROM STDIN CSV'''.format(session_id), f)
             conn.commit()
             os.remove(fname)
         except Exception, e: # pragma: no cover
             conn.rollback()
             raise e
 else: # pragma: no cover
     print 'did not find clusters'
     getMatchingReady(session_id)
 review_count = worker_session.query(entity_table.c.entity_id.distinct())\
     .filter(entity_table.c.clustered == False)\
     .count()
 sel = ''' 
     SELECT 
         entity_id, 
         MAX(confidence)::DOUBLE PRECISION,
         COUNT(*)
     FROM "entity_{0}_cr"
     WHERE clustered = FALSE
     GROUP BY entity_id
 '''.format(session_id)
 clusters = list(engine.execute(sel))
 examples = {c[0]:{'attributes':c[1:], 'label': None, 'score': 1.0} \
     for c in clusters}
 machine = ReviewMachine(examples)
 dd.review_machine = cPickle.dumps(machine)
Example #18
def initializeEntityMap(session_id, fields):
    engine = worker_session.bind
    metadata = MetaData()
    create = '''
        CREATE TABLE "exact_match_{0}" AS (
          SELECT 
            s.record_id,
            UNNEST(s.members) as match
          FROM (
            SELECT 
              MIN(record_id) AS record_id, 
              (array_agg(record_id ORDER BY record_id))
                [2:array_upper(array_agg(record_id), 1)] AS members
            FROM "processed_{0}" 
            GROUP BY {1} 
            HAVING (array_length(array_agg(record_id), 1) > 1)
          ) AS s
        )
        '''.format(session_id, ', '.join(fields))
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "exact_match_{0}"'.format(session_id))
        conn.execute(create)
    exact_table = Table('exact_match_{0}'.format(session_id), metadata,
                  autoload=True, autoload_with=engine, keep_existing=True)
    rows = worker_session.query(exact_table)
    entity_table = entity_map('entity_%s' % session_id, metadata)
    entity_table.drop(engine, checkfirst=True)
    entity_table.create(engine)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    now = datetime.now().replace(tzinfo=TIME_ZONE).isoformat()
    rows = sorted(rows, key=itemgetter(0))
    grouped = {}
    for k, g in groupby(rows, key=itemgetter(0)):
        rs = [r[1] for r in g]
        grouped[k] = rs
    for king, serfs in grouped.items():
        entity_id = unicode(uuid4())
        writer.writerow([
            king, 
            None, 
            entity_id, 
            1.0,
            'raw_{0}'.format(session_id),
            'TRUE',
            'FALSE',
            'exact',
            now,
        ])
        for serf in serfs:
            writer.writerow([
                serf,
                king,
                entity_id,
                1.0,
                'raw_{0}'.format(session_id),
                'TRUE',
                'FALSE',
                'exact',
                now,
            ])
    s.seek(0)
    conn = engine.raw_connection()
    cur = conn.cursor()
    cur.copy_expert('''
        COPY "entity_{0}" (
            record_id, 
            target_record_id, 
            entity_id, 
            confidence,
            source,
            clustered,
            checked_out,
            match_type,
            last_update
        ) 
        FROM STDIN CSV'''.format(session_id), s)
    conn.commit()
Example #19
def getMatchingReady(session_id):
    addRowHash(session_id)
    cleanupTables(session_id)
    engine = worker_session.bind
    with engine.begin() as conn:
        conn.execute('DROP TABLE IF EXISTS "match_blocks_{0}"'\
            .format(session_id))
        conn.execute(''' 
            CREATE TABLE "match_blocks_{0}" (
                block_key VARCHAR, 
                record_id BIGINT
            )
            '''.format(session_id))
    sess = worker_session.query(DedupeSession).get(session_id)
    field_defs = json.loads(sess.field_defs)

    # Save Gazetteer settings
    d = dedupe.Gazetteer(field_defs)

    # Disabling canopy based predicates for now
    for definition in d.data_model.primary_fields:
        # Iterate over a copy so removing a predicate doesn't skip the next one.
        for predicate in list(definition.predicates):
            if predicate.type == 'TfidfPredicate':
                definition.predicates.remove(predicate)

    d.readTraining(StringIO(sess.training_data))
    d.train()
    g_settings = StringIO()
    d.writeSettings(g_settings)
    g_settings.seek(0)
    sess.gaz_settings_file = g_settings.getvalue()
    worker_session.add(sess)
    worker_session.commit()

    # Write match_block table
    model_fields = list(set([f['field'] for f in field_defs]))
    fields = ', '.join(['p.{0}'.format(f) for f in model_fields])
    sel = ''' 
        SELECT 
          p.record_id, 
          {0}
        FROM "processed_{1}" AS p 
        LEFT JOIN "exact_match_{1}" AS e 
          ON p.record_id = e.match 
        WHERE e.record_id IS NULL;
        '''.format(fields, session_id)
    conn = engine.connect()
    rows = conn.execute(sel)
    data = ((getattr(row, 'record_id'), dict(zip(model_fields, row[1:]))) \
        for row in rows)
    block_gen = d.blocker(data)
    s = StringIO()
    writer = UnicodeCSVWriter(s)
    writer.writerows(block_gen)
    conn.close()
    s.seek(0)
    conn = engine.raw_connection()
    curs = conn.cursor()
    try:
        curs.copy_expert('COPY "match_blocks_{0}" FROM STDIN CSV'\
            .format(session_id), s)
        conn.commit()
    except Exception, e: # pragma: no cover
        conn.rollback()
        raise e
Example #20
def bulkMarkClusters(session_id, user=None):
    dd = worker_session.query(DedupeSession).get(session_id)
    engine = worker_session.bind
    now = datetime.now().replace(tzinfo=TIME_ZONE)
    upd_vals = {
        'user_name': user, 
        'clustered': True,
        'match_type': 'bulk accepted',
        'last_update': now,
    }
    upd = text(''' 
        UPDATE "entity_{0}" SET 
            entity_id=subq.entity_id,
            clustered= :clustered,
            reviewer = :user_name,
            match_type = :match_type,
            last_update = :last_update
        FROM (
                SELECT 
                    s.entity_id AS entity_id,
                    e.record_id 
                FROM "entity_{0}" AS e
                JOIN (
                    SELECT 
                        record_id, 
                        entity_id
                    FROM "entity_{0}"
                ) AS s
                    ON e.target_record_id = s.record_id
            ) as subq 
        WHERE "entity_{0}".record_id=subq.record_id 
            AND ( "entity_{0}".clustered=FALSE 
                  OR "entity_{0}".match_type != 'clerical review' )
        RETURNING "entity_{0}".entity_id
        '''.format(session_id))
    with engine.begin() as c:
        child_entities = c.execute(upd, **upd_vals)
    upd = text(''' 
        UPDATE "entity_{0}" SET
            clustered = :clustered,
            reviewer = :user_name,
            last_update = :last_update,
            match_type = :match_type
        WHERE target_record_id IS NULL
            AND clustered=FALSE
        RETURNING entity_id;
    '''.format(session_id))
    with engine.begin() as c:
        parent_entities = c.execute(upd, **upd_vals)
    child_entities = set([c.entity_id for c in child_entities])
    parent_entities = set([p.entity_id for p in parent_entities])
    count = len(child_entities.union(parent_entities))
    with engine.begin() as conn:
        conn.execute(text('''
          UPDATE dedupe_session SET 
            review_count = 0,
            entity_count = :entity_count
          WHERE id = :id
          '''), entity_count=count, id=session_id)
    dedupeCanon(session_id)
    return None
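Taken together, the examples form one worker pipeline. A rough sketch of the call order for a single session, reconstructed from the functions above (queueing, review steps, and error handling omitted; the session is assumed to already have field_defs and training_data by the time dedupeRaw runs):

def run_session(session_id):
    initializeSession(session_id)       # load the raw CSV and count records
    initializeModel(session_id)         # processed table, exact-match entity map, sample
    dedupeRaw(session_id)               # trainDedupe -> blockDedupe -> clusterDedupe
    bulkMarkClusters(session_id)        # accept raw clusters, then calls dedupeCanon
    bulkMarkCanonClusters(session_id)   # accept canonical clusters, then getMatchingReady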