"""Rebuild the ``buildings.complex_components`` table.

Pairs of building-component ids that share the same ``orig_bldg_`` value are
read from the database as edges, grouped with ``dedupe.get_components``
(presumably connected components of the edge graph -- confirm against
``drain.dedupe``), and written back as a join table. Ids that appear in no
pair are then added through ``dedupe.insert_singletons``.
"""
import pandas as pd
import numpy as np
from drain import util, dedupe

# Each building component (id1) is mapped to the orig_bldg_ of its original
# building (id2 = gid); the self-join with b1.id < b2.id emits every unordered
# pair of distinct component ids that share an orig_bldg_ exactly once.
EDGE_QUERY = (
    " with buildings as ("
    " select bc.id1 id, oa.orig_bldg_"
    " from buildings.building_components bc"
    " join buildings.original_buildings oa on bc.id2 = oa.gid"
    " )"
    " select b1.id id1, b2.id id2"
    " from buildings b1 join buildings b2 using (orig_bldg_)"
    " where b1.id < b2.id; "
)

engine = util.create_engine()

edges = pd.read_sql(EDGE_QUERY, engine)

# Group the pairwise edges, then flatten the groups into a DataFrame.
components = dedupe.get_components(edges)
deduped = dedupe.components_to_df(components)

# Replace any previous version of the output table.
deduped.to_sql(
    'complex_components',
    con=engine,
    schema='buildings',
    if_exists='replace',
    index=False,
)

# Add the rows that took part in no edge, keyed by original_buildings.gid.
dedupe.insert_singletons(
    'buildings.original_buildings',
    'buildings.complex_components',
    'gid',
    engine,
)
"""Rebuild the ``dedupe.exact_matches`` table.

Infants whose first name, last name and date of birth are identical are
linked to the smallest ``kid_id`` of their group; those links are grouped with
``dedupe.get_components`` and the result is written to the database.
"""
import pandas as pd
from drain import util, dedupe

# kid_ids: every infant with kid_id = canon_id from entity_map when present,
#          otherwise its own id.
# t:       one row per (first_name, last_name, date_of_birth) that occurs more
#          than once, carrying the minimum kid_id of the group as id2.
# The outer query links each kid_id in such a group to that minimum
# (k1.kid_id > k2.kid_id drops the self-link); "group by 1,2" removes
# duplicate pairs.
EXACT_MATCH_QUERY = (
    " with kid_ids as ("
    " SELECT id, first_name, last_name, date_of_birth,"
    " coalesce(canon_id, id) kid_id"
    " FROM dedupe.infants LEFT JOIN entity_map using (id)"
    " )"
    " SELECT k1.kid_id id1, k2.kid_id id2"
    " from kid_ids k1"
    " JOIN (SELECT min(kid_id) id2, first_name, last_name, date_of_birth"
    " from kid_ids group by 2,3,4 having count(*) > 1) t"
    " using (first_name, last_name, date_of_birth)"
    " JOIN kid_ids k2 on id2 = k2.kid_id"
    " where k1.kid_id > k2.kid_id"
    " group by 1,2 "
)

engine = util.create_engine()

edges = pd.read_sql(EXACT_MATCH_QUERY, engine)

# Group the links, then flatten the groups into a DataFrame.
component_groups = dedupe.get_components(edges)
components = dedupe.components_dict_to_df(component_groups)

# Replace any previous version of dedupe.exact_matches.
util.PgSQLDatabase(engine).to_sql(
    frame=components,
    name='exact_matches',
    schema='dedupe',
    if_exists='replace',
    index=False,
)
"""Rebuild the ``dedupe.exact_matches`` table.

Infants whose first name, last name and date of birth are identical are
linked to the smallest ``kid_id`` of their group; those links are grouped with
``dedupe.get_components`` and the result is written to the database.
"""
import pandas as pd
from drain import util, dedupe

# kid_ids: every infant with kid_id = canon_id from entity_map when present,
#          otherwise its own id.
# t:       one row per (first_name, last_name, date_of_birth) that occurs more
#          than once, carrying the minimum kid_id of the group as id2.
# The outer query links each kid_id in such a group to that minimum
# (k1.kid_id > k2.kid_id drops the self-link); "group by 1,2" removes
# duplicate pairs.
EXACT_MATCH_QUERY = (
    " with kid_ids as ("
    " SELECT id, first_name, last_name, date_of_birth,"
    " coalesce(canon_id, id) kid_id"
    " FROM dedupe.infants LEFT JOIN entity_map using (id)"
    " )"
    " SELECT k1.kid_id id1, k2.kid_id id2"
    " from kid_ids k1"
    " JOIN (SELECT min(kid_id) id2, first_name, last_name, date_of_birth"
    " from kid_ids group by 2,3,4 having count(*) > 1) t"
    " using (first_name, last_name, date_of_birth)"
    " JOIN kid_ids k2 on id2 = k2.kid_id"
    " where k1.kid_id > k2.kid_id"
    " group by 1,2 "
)

engine = util.create_engine()

edges = pd.read_sql(EXACT_MATCH_QUERY, engine)

# Group the links, then flatten the groups into a DataFrame.
component_groups = dedupe.get_components(edges)
components = dedupe.components_dict_to_df(component_groups)

# Replace any previous version of dedupe.exact_matches.
util.PgSQLDatabase(engine).to_sql(
    frame=components,
    name='exact_matches',
    schema='dedupe',
    if_exists='replace',
    index=False,
)
#! /usr/bin/python
"""Rebuild the ``buildings.building_components`` table.

Original address rows (``ogc_fid``) that resolve to the same
``buildings.addresses.id`` are read from the database as edges, grouped with
``dedupe.get_components``, and written back as a join table. Ids that appear
in no pair are then added through ``dedupe.insert_singletons``.
"""
import pandas as pd
import numpy as np
from drain import util, dedupe

# addresses: each original address row (ogc_fid) with the id of the
#            buildings.addresses row that has the same address value.
# The self-join with a1.ogc_fid < a2.ogc_fid emits every unordered pair of
# distinct ogc_fids that share an address id exactly once.
EDGE_QUERY = (
    " with addresses as ("
    " select a.id, oa.ogc_fid"
    " from buildings.addresses a"
    " join buildings.original_addresses oa using (address)"
    " )"
    " select a1.ogc_fid id1, a2.ogc_fid id2"
    " from addresses a1 join addresses a2 using (id)"
    " where a1.ogc_fid < a2.ogc_fid; "
)

engine = util.create_engine()

edges = pd.read_sql(EDGE_QUERY, engine)

# Group the pairwise edges, then flatten the groups into a DataFrame.
components = dedupe.get_components(edges)
deduped = dedupe.components_dict_to_df(components)

# Replace any previous version of the output table.
deduped.to_sql(
    'building_components',
    con=engine,
    schema='buildings',
    if_exists='replace',
    index=False,
)

# NOTE(review): the edges above use ogc_fid from buildings.original_addresses,
# while singletons are taken from buildings.original_buildings -- confirm that
# this is the intended source table for the 'ogc_fid' column.
dedupe.insert_singletons(
    'buildings.original_buildings',
    'buildings.building_components',
    'ogc_fid',
    engine,
)