def produce_delivery_count_table():
    log.addFilter(MultilineFilter())  # useful for tables
    log.info('Starting to generate the monthly german payroll table')

    # ------------------------------
    # Extract driver names from Odoo
    # ------------------------------
    log.info('Extracting driver names from Odoo')
    odoo = OdooConnector()
    filters = [('supplier', '=', True),
               ('active', '=', True),
               ('company_id', '=', 5)]  # 5 is germany
    df = odoo.extract('res.partner', filters)
    odoo_drivers = fromdataframe(df)
    mappings = {
        'driver_app_username': '******',
        'planday_salary_id_in_odoo': 'x_salary_id',
        'odoo_id': 'id',
        'fullname_in_odoo': 'display_name'
    }
    odoo_drivers = odoo_drivers.fieldmap(mappings)
    # cache the results
    odoo_drivers.toxlsx(odoo_cache_file)
    log.info('%s drivers found in Odoo', odoo_drivers.nrows())
    log.debug(odoo_drivers.look())

    # ------------------------------------------
    # Extract delivery counts from the warehouse
    # ------------------------------------------
    log.info('Extracting delivery counts from the DWH')
    dwh = WarehouseConnector()
    query = SQLReader('sql.german_drivers_delivery_counts').statements[0]
    log.debug(query)
    df = dwh.execute(query)
    driver_counts = fromdataframe(df)
    # cache the results
    driver_counts.toxlsx(dwh_cache_file)
    log.info('%s drivers found in the DWH', driver_counts.nrows())
    log.info('Deliveries per driver %s', driver_counts.stats('number_of_deliveries'))
    log.debug(driver_counts.look())

    # ----------------------------
    # Join the two tables together
    # ----------------------------
    payroll = leftjoin(driver_counts, odoo_drivers, key='driver_app_username')
    # Some usernames appear multiple times in Odoo
    payroll = payroll.distinct('driver_app_username')
    log.debug(payroll.look())

    payroll.toxlsx(output_file)
    log.info('Payroll table saved to %s', output_file)
    log.removeFilter(MultilineFilter())
def transform_to_petl(data):
    isodate = etl.dateparser("%Y-%m-%d")
    data = etl.fromdataframe(data)
    data = etl.rename(data, {"index": "Date", "VALUE": "Value"})
    data = etl.convert(data, {"Date": lambda d: d[:10]})
    data = etl.convert(data, {"Date": lambda d: isodate(d)})
    return data
def extract_odoo(offline=OFFLINE):
    if not offline:
        api = OdooConnector()
        filters = [('supplier', '=', True),
                   ('active', '=', True),
                   ('company_id', '=', 3)]
        dataframe = api.extract('res.partner', filters)
        drivers = fromdataframe(dataframe)
        mappings = {
            'backend_username': '******',
            'backend_uuid': 'x_backend_uuid',
            'salary_id': 'x_salary_id',
            'odoo_id': 'id',
            'fleetname': lambda rec: rec['x_fleet'][1].replace('_', ' '),
            'fullname': lambda rec: rec['display_name'].strip()
        }
        drivers = drivers.fieldmap(mappings)
        drivers = drivers.suffixheader('_in_odoo')
        drivers.topickle(DRIVERS_IN_ODOO_FILEPATH)
    else:
        drivers = frompickle(DRIVERS_IN_ODOO_FILEPATH)

    drivers = drivers.addfield('backend_username', lambda rec: rec['backend_username_in_odoo'])
    drivers = drivers.addfield('salary_id', lambda rec: rec['salary_id_in_odoo'])
    drivers = standardize_missing_values(drivers)
    write_to_log(drivers, 'drivers', 'odoo')
    return drivers
def test_integration():
    tbl = [('foo', 'bar', 'baz'),
           ('apples', 1, 2.5),
           ('oranges', 3, 4.4),
           ('pears', 7, .1)]
    df = etl.wrap(tbl).todataframe()
    tbl2 = etl.fromdataframe(df)
    ieq(tbl, tbl2)
    ieq(tbl, tbl2)
def extract_planday():

    def get_fleet_columns(df):
        for column in df.columns:
            label = column.lower()
            if 'employee group' in label and 'driver' in label and 'test' not in label:
                yield column

    def reduce_fleets_to_single_column(df):
        fleets = list(get_fleet_columns(df))
        fleets_dict = {}
        for employee_index, employee_row in df[fleets].iterrows():
            employee_fleets = employee_row.dropna()
            fleets = []
            for fleet, _ in employee_fleets.iteritems():
                # The label is not exactly the fleet
                # name: it needs a little grooming.
                fleet = fleet.split(': ')[1]
                fleets.append(fleet)
            fleets = ', '.join(fleets)
            fleets_dict.update({employee_index: fleets})
        return Series(fleets_dict)

    def process_fleetname(text):
        valk_driver_re = compile(r'valk(.?)driver(s?)\s', IGNORECASE)
        match = valk_driver_re.search(text)
        if match:
            return text.replace(match.group(0), '').lower()
        else:
            return text

    planday = read_excel(DRIVERS_IN_PLANDAY_FILEPATH)
    planday = planday[planday['Employee group: Managers'] != 0]

    fullname = planday['Last name'].map(strip) + ', ' + planday['First name'].map(strip)
    salary_id = planday['Salary identifier'].map(lambda x: x if x else None)
    fleetname = reduce_fleets_to_single_column(planday).map(process_fleetname).replace('', None)
    username = planday['Username']

    username.name = 'planday_username_in_planday'
    fullname.name = 'fullname_in_planday'
    salary_id.name = 'salary_id_in_planday'
    fleetname.name = 'fleetname_in_planday'

    planday = concat([fullname, salary_id, fleetname, username], axis=1)
    planday = fromdataframe(planday)
    planday = standardize_missing_values(planday)
    planday = planday.addfield('salary_id', lambda rec: rec['salary_id_in_planday'])
    write_to_log(planday, 'drivers', 'planday')
    return planday
def from_dataframe(cls, dataframe, include_index=False):
    """
    Create a ``parsons table`` from a Pandas dataframe.

    `Args:`
        dataframe: dataframe
            A valid Pandas dataframe object
        include_index: boolean
            Include index column
    """
    return cls(petl.fromdataframe(dataframe, include_index=include_index))
def extract_fleets_from_dwh():
    query = SQLReader('sql.fleets_from_tableau')
    fleets_df = dwh.execute(query.statements[0])
    fleets_tb = fromdataframe(fleets_df)
    mappings = {
        'fleet_uuid': 'uuid',
        'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
        'country_code': 'country_code',
    }
    fleets_tb = fleets_tb.cutout('country_code')
    fleets_tb = fleets_tb.fieldmap(mappings)
    fleets_tb = fleets_tb.suffixheader('_in_backend')
    return fleets_tb
def extract_users():
    users_records = [api.get_record('users', driver.user_uuid_in_backend)
                     for driver in drivers.namedtuples()]
    users_df = DataFrame().from_records(users_records)
    users_tb = fromdataframe(users_df)
    mappings = {
        'driver_uuid': 'driver',
        'user_uuid': 'uuid',
        'backend_username': '******'
    }
    users_tb = users_tb.fieldmap(mappings)
    users_tb = users_tb.suffixheader('_in_backend')
    return users_tb
def extract_drivers():
    query = SQLReader('sql.drivers_from_cloudsql')
    drivers_df = sql.execute(query.statements[0])
    drivers_tb = fromdataframe(drivers_df)
    mappings = {
        'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
        'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
        'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
        'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
    }
    drivers_tb = drivers_tb.fieldmap(mappings)
    drivers_tb = drivers_tb.suffixheader('_in_backend')
    return drivers_tb
def _format_as_geojson(results, geodata_model):
    """Joins the results to the corresponding geojson via the Django model.

    :param results: [description]
    :type results: [type]
    :param geodata_model: [description]
    :type geodata_model: [type]
    :return: [description]
    :rtype: [type]
    """
    # read the geojson into a PETL table object
    # features_table = etl.fromdicts(fc.features).convert('id', str)
    df = geodata_model.as_dataframe_using_drf_serializer(geodata_model.objects.all())
    features_table = etl.fromdataframe(df)

    # join the results to the geojson features, then shoehorn the results
    # into the properties object of each feature;
    # put the ID into the id property of the feature
    features = etl\
        .fromdicts(results)\
        .leftjoin(features_table, 'id')\
        .sort(('ts', 'id'))\
        .aggregate(key=('id', 'type', 'geometry'), aggregation=list, value=['src', 'val', 'ts'])\
        .fieldmap(
            {
                'id': 'id',
                'type': 'type',
                'geometry': 'geometry',
                'properties': lambda rec: (
                    dict(
                        data=[dict(src=r[0], val=r[1], ts=r[2]) for r in rec.value],
                        total=sum([r[1] for r in rec.value if r[1]])
                    )
                )
            },
            failonerror=True
        ).dicts()

    return geojson.FeatureCollection(features=list(features))
from __future__ import division, print_function, absolute_import


# todataframe()
###############

import petl as etl
table = [("foo", "bar", "baz"),
         ("apples", 1, 2.5),
         ("oranges", 3, 4.4),
         ("pears", 7, 0.1)]
df = etl.todataframe(table)
df


# fromdataframe()
#################

import petl as etl
import pandas as pd
records = [("apples", 1, 2.5), ("oranges", 3, 4.4), ("pears", 7, 0.1)]
df = pd.DataFrame.from_records(records, columns=("foo", "bar", "baz"))
table = etl.fromdataframe(df)
table
# ('apples', 1, 2.5),
# ('oranges', 3, 4.4),
# ('pears', 7, .1)]
# a = etl.toarray(t, dtype='U4, i2, f4')
# print(a)
#
# array1 = np.array([('apples', 1, 2.5),
#                    ('oranges', 3, 4.4),
#                    ('pears', 7, .1)], dtype='U4, i2, f4')
# tb7 = etl.fromarray(array1)
# print(tb7)

# dataframe (pandas)
import pandas as pd
records = [('apple', 1, 2.5), ('orange', 3, 4.5), ('pears', 5, 6.5)]
df = pd.DataFrame.from_records(records, columns=('foo', 'bar', 'baz'))
tb8 = etl.fromdataframe(df)
print(tb8)

# load data from given table into dataframe
table = [('foo', 'bar', 'baz'),
         ('apple', 1, 2.5),
         ('orange', 3, 4.5),
         ('pears', 5, 6.5)]
df = etl.todataframe(table)
print(df)

# excel xls/xlsx
# HDFS
# oracle
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Korea, South', 'South Korea')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma', 'Myanmar')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'US', 'USA')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Taiwan*', 'Taiwan')

# Then group and accumulate the results by country
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the Country field again
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Then melt the date columns into rows and rename the new columns
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Then add the continent for grouping
t_confirmed = etl.addfield(t_confirmed, 'Continent', lambda rec: get_continent_code(rec['Country']))

# And once again make sure the values have the data type they should have.
t_confirmed = etl.convert(t_confirmed, 'Cases', int)
#!/usr/bin/env python3

import petl as etl
import pandas as pd
import psycopg2 as pg

NASA_SOURCE = "../kepler_accepted_results.csv"

k_conn = pg.connect(user='******', password='******', host='localhost', database='kepler')

# Not using petl to pull the data in from CSV because its CSV methods
# are a bit lacking. Pandas does a better job here.
df = pd.read_csv(NASA_SOURCE, skiprows=31)
# df = df.set_index('kepid')
stbl = etl.fromdataframe(df, include_index=False)

# Not doing any transformation at all yet
etl.todb(stbl, k_conn, 'nasa_result', create=True, drop=False, sample=0)

k_conn.close()
cs = "dbname=%s user=%s password=%s host=%s port=%s" % (dn, du, dp, dh, dbp)
connection = psycopg2.connect(cs)

# a = np.random.rand(100000, 200).tolist()
b = np.random.rand(100000, 10).tolist()
etl.fromcolumns()

# pandas -> table -> db
import pandas as pd
df = pd.DataFrame(columns=['id', 'features', 'model_id', 'regdate'])
for i in range(1, 10000):
    df.loc[i] = [i, np.random.rand(10).tolist(), 'test1', datetime.now()]
pddf = etl.fromdataframe(df)
etl.todb(pddf, connection, 'defect_features', 'public')

# select query
mkcursor = lambda: connection.cursor(name='arbitrary')
table = etl.fromdb(mkcursor, 'select * from public.defect_features')
table

# primitive -> db
import numpy as np
from datetime import datetime
datatable = [['id', 'features', 'model_id', 'regdate']]
for i in range(1, 10):
    datatable.append([i, np.random.rand(10).tolist(), 'test1', datetime.now()])
etl.todb(datatable, connection, 'defect_features', 'public')
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)
    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)
    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)
    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER) = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)
        print('knmi')
        print(table)
        table.log_progress()

        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)
        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)
        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']
        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)
        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)
        final_knmi_table = etl.melt(final_knmi_table, key=[
            'weather_station_id', 'timestamp', 'id', 'latitude', 'longitude',
            'name', 'station_number', 'data_source_id', 'altitude'
        ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable', 'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table, table_com, key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2, table_time, key='timestamp')
        print('dos')
        print(final_knmi_table2)

        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append', index=False, method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table, {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table, table_com, key='component')
        # print(final_lmn_table2)
        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)

        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)
        final_lmn_table2 = etl.join(final_lmn_table2, table_time, key='timestamp')
        print(final_lmn_table2)

        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)
        fact_source.to_sql('fact_source', engine, if_exists='append', index=False, method='multi')
from __future__ import division, print_function, absolute_import


# todataframe()
###############

import petl as etl
table = [('foo', 'bar', 'baz'),
         ('apples', 1, 2.5),
         ('oranges', 3, 4.4),
         ('pears', 7, .1)]
df = etl.todataframe(table)
df


# fromdataframe()
#################

import petl as etl
import pandas as pd
records = [('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)]
df = pd.DataFrame.from_records(records, columns=('foo', 'bar', 'baz'))
table = etl.fromdataframe(df)
table
def procesar_fuente(path, nombre):
    try:
        # Process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types.
        # From column 5 onwards the data type is integer (the number of people/cases).
        # We also take the chance to change the header date format from 1/23/20 to 2020-01-23.
        headers = etl.fieldnames(tabla)
        i = 0
        for header in headers:
            if i >= 4:
                tabla = etl.convert(tabla, header, int)  # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')  # parse the date into the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))
            i = i + 1

        # Remove the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then melt the date columns into rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent for grouping
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And once again make sure the values have the data type they should have.
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d'))

        # Finally, load the table into the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error has occurred! ', sys.exc_info()[0])
        raise