import petl


def join(data, strategy, source_left, source_right, destination,
         key_left, key_right, prefix_left, prefix_right,
         presorted, buffersize, tempdir, cache, missing):
    """Perform a join on two data tables."""
    source_left = data.get(source_left)
    source_right = data.get(source_right)

    kwargs = {}
    if key_left == key_right:
        kwargs['key'] = key_left
    else:
        kwargs['lkey'] = key_left
        kwargs['rkey'] = key_right
    if presorted is True:
        kwargs['presorted'] = presorted
    if buffersize is not None:
        kwargs['buffersize'] = buffersize
    if tempdir:
        kwargs['tempdir'] = tempdir
    if 'anti' not in strategy:
        if prefix_left is not None:
            kwargs['lprefix'] = prefix_left
        if prefix_right is not None:
            kwargs['rprefix'] = prefix_right
    if strategy not in ['join', 'antijoin', 'hashjoin', 'hashantijoin']:
        kwargs['missing'] = missing

    if strategy == 'join':
        o = petl.join(source_left, source_right, **kwargs)
    elif strategy == 'leftjoin':
        o = petl.leftjoin(source_left, source_right, **kwargs)
    elif strategy == 'lookupjoin':
        o = petl.lookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'rightjoin':
        o = petl.rightjoin(source_left, source_right, **kwargs)
    elif strategy == 'outerjoin':
        o = petl.outerjoin(source_left, source_right, **kwargs)
    elif strategy == 'antijoin':
        o = petl.antijoin(source_left, source_right, **kwargs)
    elif strategy == 'hashjoin':
        o = petl.hashjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashleftjoin':
        o = petl.hashleftjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashlookupjoin':
        o = petl.hashlookupjoin(source_left, source_right, **kwargs)
    elif strategy == 'hashrightjoin':
        o = petl.hashrightjoin(source_left, source_right, **kwargs)

    data.set(destination, o)
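The dispatcher only requires that `data` expose `get()` and `set()` for named tables. A minimal usage sketch, assuming a simple dict-backed container; the `TableStore` class and the sample tables below are invented for illustration and are not part of the original code:

import petl


class TableStore:
    """Hypothetical dict-backed container exposing the get/set interface join() expects."""

    def __init__(self):
        self._tables = {}

    def get(self, name):
        return self._tables[name]

    def set(self, name, table):
        self._tables[name] = table


store = TableStore()
store.set('colours', [['id', 'colour'], [1, 'blue'], [2, 'red']])
store.set('shapes', [['id', 'shape'], [1, 'circle'], [3, 'square']])

# Equal left/right keys collapse into a single 'key' kwarg inside join();
# 'leftjoin' is not in the no-missing list, so missing=None is passed through.
join(store, 'leftjoin', 'colours', 'shapes', 'result',
     key_left='id', key_right='id', prefix_left=None, prefix_right=None,
     presorted=False, buffersize=None, tempdir=None, cache=True, missing=None)

print(petl.look(store.get('result')))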
# antijoin
table1 = [['id', 'colour'],
          [0, 'black'],
          [1, 'blue'],
          [2, 'red'],
          [4, 'yellow'],
          [5, 'white']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square']]

from petl import antijoin, look
look(table1)
look(table2)
table3 = antijoin(table1, table2, key='id')
look(table3)


# rangefacet
table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4],
          ['d', 3]]

from petl import rangefacet, look
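The rangefacet snippet stops short of the call itself. A minimal sketch of the usual call pattern on the 'foo'/'bar' table above; the bucket width of 2 is an arbitrary choice for illustration. rangefacet() buckets rows by numeric ranges of the given field and returns a mapping from each (lower, upper) range to a table view:

rf = rangefacet(table1, 'bar', 2)   # bucket rows by 'bar' in ranges of width 2
list(rf.keys())                     # e.g. [(1, 3), (3, 5), (5, 7), (7, 9)]
look(rf[(1, 3)])                    # rows whose 'bar' value falls in the first range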
# crossjoin()
#############
import petl as etl
table1 = [['id', 'colour'],
          [1, 'blue'],
          [2, 'red']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square']]
table3 = etl.crossjoin(table1, table2)
table3


# antijoin()
############
import petl as etl
table1 = [['id', 'colour'],
          [0, 'black'],
          [1, 'blue'],
          [2, 'red'],
          [4, 'yellow'],
          [5, 'white']]
table2 = [['id', 'shape'],
          [1, 'circle'],
          [3, 'square']]
table3 = etl.antijoin(table1, table2, key='id')
table3


# lookupjoin()
##############
import petl as etl
table1 = [['id', 'color', 'cost'],
          [1, 'blue', 12],
          [2, 'red', 8],
          [3, 'purple', 4]]
table2 = [['id', 'shape', 'size'],
          [1, 'circle', 'big'],
          [1, 'circle', 'small'],
          [2, 'square', 'tiny'],
          [2, 'square', 'big'],
          [3, 'ellipse', 'small'],
          [3, 'ellipse', 'tiny']]
table3 = etl.lookupjoin(table1, table2, key='id')
table3


# unjoin()
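The unjoin() example is truncated above. As a placeholder, a minimal sketch of the usual call pattern with made-up data in the same style: unjoin() splits a denormalized table into two, returning the original columns minus the duplicated value plus a separate key-to-value lookup table.

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 'apple'],
          ['B', 1, 'apple'],
          ['C', 2, 'orange']]
table2, table3 = etl.unjoin(table1, 'baz', key='bar')
table2  # ('foo', 'bar') rows with the 'baz' value removed
table3  # distinct ('bar', 'baz') lookup rows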
# Get duplicate parcel_ids:
non_unique_parcel_id_rows = engine_dor_parcel_rows.duplicates(key='parcel_id')
unique_parcel_id_rows = etl.complement(engine_dor_parcel_rows, non_unique_parcel_id_rows)

# Get address comps for condos by joining to dor_parcel with unique parcel_id on parcel_id:
print("Relating condos to parcels...")
joined = etl.join(source_dor_condo_rows, unique_parcel_id_rows, key='parcel_id') \
    .convert('street_address', lambda a, row: row.street_address + ' # ' + row.unit_num, pass_row=True)
print("joined rowcount: ", etl.nrows(joined))
if DEV:
    print(etl.look(joined))

# Calculate errors
print("Calculating errors...")
unjoined = etl.antijoin(source_dor_condo_rows, joined, key='source_object_id')
print("unjoined rowcount: ", etl.nrows(unjoined))
dor_condos_unjoined_unmatched = etl.antijoin(unjoined, non_unique_parcel_id_rows, key='parcel_id') \
    .addfield('reason', 'non-active/remainder mapreg')
print("non-active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_unmatched))
if DEV:
    print(etl.look(dor_condos_unjoined_unmatched))
dor_condos_unjoined_duplicates = etl.antijoin(unjoined, dor_condos_unjoined_unmatched, key='source_object_id') \
    .addfield('reason', 'non-unique active/remainder mapreg')
print("non-unique active/remainder mapreg error rowcount: ", etl.nrows(dor_condos_unjoined_duplicates))
if DEV:
    print(etl.look(dor_condos_unjoined_duplicates))
error_table = etl.cat(dor_condos_unjoined_unmatched, dor_condos_unjoined_duplicates)
if DEV:
    print(etl.look(error_table))

# Write to engine db
if not DEV:
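The unique-key filter at the top of this snippet relies on a duplicates()/complement() pair: duplicates() pulls out every row whose key value occurs more than once, and complement() then removes those rows from the original, leaving only rows with a unique key. A toy sketch of that pattern with invented parcel data, not taken from the pipeline above:

import petl as etl

parcels = [['parcel_id', 'address'],
           ['P1', '100 Main St'],
           ['P2', '200 Main St'],
           ['P2', '200 Main St Rear'],   # duplicate parcel_id
           ['P3', '300 Main St']]

dupes = etl.duplicates(parcels, key='parcel_id')      # both 'P2' rows
unique_only = etl.complement(parcels, dupes)          # only the 'P1' and 'P3' rows
print(etl.look(unique_only))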
from uuid import UUID

from pandas import DataFrame
from petl import fromdataframe, frompickle, antijoin, join

# Project-specific helpers (SQLReader, CloudSQLConnector, ValkfleetConnector,
# WarehouseConnector, write_to_log, standardize_missing_values) and constants
# (OFFLINE, *_FILEPATH) are assumed to be defined elsewhere in this module.


def extract_backend(offline=OFFLINE):
    # Done in 4 steps: (1) grab the driver table from the CloudSQL,
    # (2) use the user uuids to query for users one by one through
    # the API, (3) get the fleet table from CloudSQL and (4) join
    # everything together.

    def extract_drivers():
        query = SQLReader('sql.drivers_from_cloudsql')
        drivers_df = sql.execute(query.statements[0])
        drivers_tb = fromdataframe(drivers_df)
        mappings = {
            'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
            'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
            'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
            'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
        }
        drivers_tb = drivers_tb.fieldmap(mappings)
        drivers_tb = drivers_tb.suffixheader('_in_backend')
        return drivers_tb

    def extract_users():
        users_records = [api.get_record('users', driver.user_uuid_in_backend)
                         for driver in drivers.namedtuples()]
        users_df = DataFrame().from_records(users_records)
        users_tb = fromdataframe(users_df)
        mappings = {
            'driver_uuid': 'driver',
            'user_uuid': 'uuid',
            'backend_username': '******'
        }
        users_tb = users_tb.fieldmap(mappings)
        users_tb = users_tb.suffixheader('_in_backend')
        return users_tb

    def extract_fleets_from_dwh():
        query = SQLReader('sql.fleets_from_tableau')
        fleets_df = dwh.execute(query.statements[0])
        fleets_tb = fromdataframe(fleets_df)
        mappings = {
            'fleet_uuid': 'uuid',
            'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
            'country_code': 'country_code',
        }
        fleets_tb = fleets_tb.cutout('country_code')
        fleets_tb = fleets_tb.fieldmap(mappings)
        fleets_tb = fleets_tb.suffixheader('_in_backend')
        return fleets_tb

    if not offline:
        sql = CloudSQLConnector()
        api = ValkfleetConnector()
        dwh = WarehouseConnector()

        drivers = extract_drivers()
        fleets = extract_fleets_from_dwh()
        users = extract_users()

        drivers.topickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets.topickle(FLEETS_IN_BACKEND_FILEPATH)
        users.topickle(USERS_IN_BACKEND_FILEPATH)

    else:
        drivers = frompickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets = frompickle(FLEETS_IN_BACKEND_FILEPATH)
        users = frompickle(USERS_IN_BACKEND_FILEPATH)

    write_to_log(drivers, 'drivers', 'backend')
    write_to_log(fleets, 'fleets', 'backend')
    write_to_log(users, 'users', 'backend')

    drivers_without_fleet = antijoin(drivers, fleets, key='fleet_uuid_in_backend')
    drivers_without_user = antijoin(drivers, users, key='user_uuid_in_backend')
    write_to_log(drivers_without_fleet, 'drivers without fleet', 'backend')
    write_to_log(drivers_without_user, 'drivers without user', 'backend')

    drivers_n_fleets = join(drivers, fleets, key='fleet_uuid_in_backend').cutout('fleet_uuid_in_backend')
    backend_drivers = join(drivers_n_fleets, users, key='user_uuid_in_backend')
    backend_drivers = backend_drivers.addfield('backend_username',
                                               lambda rec: rec['backend_username_in_backend'])
    backend_drivers = backend_drivers.cutout('driver_uuid_in_backend')
    backend_drivers = standardize_missing_values(backend_drivers)

    write_to_log(backend_drivers, 'drivers', 'backend')
    return backend_drivers
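Each extract_* helper above follows the same petl idiom: fieldmap() rebuilds the table with renamed or derived columns, and suffixheader() tags every column with '_in_backend' so later joins against other sources do not collide on field names. A small standalone sketch of that idiom, with column names and rows invented for illustration:

import petl as etl

# Toy driver rows standing in for the CloudSQL dataframe (values invented).
raw = [['first_name', 'last_name', 'licence'],
       [' Ana ', ' Silva ', 'B'],
       ['Tom', 'Jones ', 'C']]

mappings = {
    'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
    'licence': 'licence',   # plain string value copies the source field through
}
drivers_tb = etl.fieldmap(raw, mappings)                  # derive/rename columns
drivers_tb = etl.suffixheader(drivers_tb, '_in_backend')  # namespace columns before joining
print(etl.look(drivers_tb))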