import pandas as pd
import numpy as np
from drain import util, dedupe

# Database handle used for both the read and the writes below.
engine = util.create_engine()

# Edge list: one row per pair (id1 < id2) of building-component ids whose
# matching rows in buildings.original_buildings share the same orig_bldg_.
edges = pd.read_sql(
    sql="""
    with buildings as (
        select bc.id1 id, oa.orig_bldg_ from buildings.building_components bc join buildings.original_buildings oa on bc.id2 = oa.gid
    )

    select b1.id id1, b2.id id2 from buildings b1 join buildings b2 using (orig_bldg_) where b1.id < b2.id;
    """,
    con=engine,
)

# Turn the pairwise edges into components, then into a DataFrame.
# (Exact output layout is defined by drain.dedupe — not visible here.)
components = dedupe.get_components(edges)
deduped = dedupe.components_to_df(components)

# Replace buildings.complex_components with the freshly computed frame.
deduped.to_sql(
    name='complex_components',
    schema='buildings',
    con=engine,
    index=False,
    if_exists='replace',
)

# presumably adds rows for original buildings (keyed by gid) that ended up
# in no component — behavior lives in drain.dedupe; verify there.
dedupe.insert_singletons(
    'buildings.original_buildings',
    'buildings.complex_components',
    'gid',
    engine,
)
# Example #2
0
import pandas as pd
from drain import util, dedupe

# Database handle shared by the query and the final write.
engine = util.create_engine()

# kid_ids maps every infant to kid_id = coalesce(canon_id, id).  For each
# (first_name, last_name, date_of_birth) that occurs more than once, the
# query links every larger kid_id to the smallest kid_id in that group;
# the trailing "group by 1,2" collapses duplicate pairs.
exact_match_query = """
with kid_ids as (
    SELECT id, first_name, last_name, date_of_birth, coalesce(canon_id, id) kid_id 
    FROM dedupe.infants LEFT JOIN entity_map using (id)
)

SELECT k1.kid_id id1, k2.kid_id id2 from
kid_ids k1 JOIN
(SELECT min(kid_id) id2, first_name, last_name, date_of_birth from kid_ids group by 2,3,4 having count(*) > 1) t
using (first_name, last_name, date_of_birth)
JOIN kid_ids k2 on id2 = k2.kid_id
where k1.kid_id > k2.kid_id
group by 1,2
"""
edges = pd.read_sql(exact_match_query, engine)

# Two steps: edges -> components, then components -> DataFrame.
# (Return shapes are defined by drain.dedupe — not visible here.)
raw_components = dedupe.get_components(edges)
components = dedupe.components_dict_to_df(raw_components)

# Write the result to dedupe.exact_matches through drain's database wrapper.
db = util.PgSQLDatabase(engine)
db.to_sql(
    frame=components,
    name='exact_matches',
    schema='dedupe',
    index=False,
    if_exists='replace',
)
import pandas as pd
from drain import util, dedupe

# Single engine reused for reading the edges and writing the matches.
engine = util.create_engine()

# Exact-match edges between infants: rows sharing first name, last name and
# date of birth are each paired with the minimum kid_id of their group
# (kid_id is canon_id when entity_map has one, otherwise the infant's id).
edges = pd.read_sql(
    sql="""
with kid_ids as (
    SELECT id, first_name, last_name, date_of_birth, coalesce(canon_id, id) kid_id 
    FROM dedupe.infants LEFT JOIN entity_map using (id)
)

SELECT k1.kid_id id1, k2.kid_id id2 from
kid_ids k1 JOIN
(SELECT min(kid_id) id2, first_name, last_name, date_of_birth from kid_ids group by 2,3,4 having count(*) > 1) t
using (first_name, last_name, date_of_birth)
JOIN kid_ids k2 on id2 = k2.kid_id
where k1.kid_id > k2.kid_id
group by 1,2
""",
    con=engine,
)

# Build components from the edge list and flatten them into a DataFrame
# (structure of both results is defined in drain.dedupe — not shown here).
components = dedupe.components_dict_to_df(
    dedupe.get_components(edges)
)

db = util.PgSQLDatabase(engine)

# Store the frame as dedupe.exact_matches.
db.to_sql(
    name='exact_matches',
    schema='dedupe',
    frame=components,
    index=False,
    if_exists='replace',
)
#! /usr/bin/python

import pandas as pd
import numpy as np
from drain import util, dedupe

# Database handle used for the read and both writes.
engine = util.create_engine()

# Edge list: pairs (id1 < id2) of original-address ogc_fid values whose
# address text maps to the same buildings.addresses id.
edges = pd.read_sql(
    sql="""
    with addresses as (
        select a.id, oa.ogc_fid from buildings.addresses a join buildings.original_addresses oa using (address)
    ) 

    select a1.ogc_fid id1, a2.ogc_fid id2 from addresses a1 join addresses a2 using (id) where a1.ogc_fid < a2.ogc_fid;
    """,
    con=engine,
)

# Edges -> components -> DataFrame (layout defined by drain.dedupe).
components = dedupe.get_components(edges)
deduped = dedupe.components_dict_to_df(components)

# Replace buildings.building_components with the new frame.
deduped.to_sql(
    name='building_components',
    schema='buildings',
    con=engine,
    index=False,
    if_exists='replace',
)

# NOTE(review): the edges above take ogc_fid from buildings.original_addresses,
# yet singletons are drawn from buildings.original_buildings — confirm that
# this source table (and its ogc_fid column) is the intended one.
dedupe.insert_singletons(
    'buildings.original_buildings',
    'buildings.building_components',
    'ogc_fid',
    engine,
)