Example #1
def produce_delivery_count_table():
    multiline_filter = MultilineFilter()  # useful for tables
    log.addFilter(multiline_filter)
    log.info('Starting to generate the monthly German payroll table')

    # ------------------------------
    # Extract driver names from Odoo
    # ------------------------------

    log.info('Extracting driver names from Odoo')
    odoo = OdooConnector()

    filters = [('supplier', '=', True),
               ('active', '=', True),
               ('company_id', '=', 5)]  # 5 is Germany
    df = odoo.extract('res.partner', filters)
    odoo_drivers = fromdataframe(df)

    mappings = {
        'driver_app_username': '******',
        'planday_salary_id_in_odoo': 'x_salary_id',
        'odoo_id': 'id',
        'fullname_in_odoo': 'display_name'
    }
    odoo_drivers = odoo_drivers.fieldmap(mappings)

    # cache the results
    odoo_drivers.toxlsx(odoo_cache_file)
    log.info('%s drivers found in Odoo', odoo_drivers.nrows())
    log.debug(odoo_drivers.look())

    # ------------------------------------------
    # Extract delivery counts from the warehouse
    # ------------------------------------------

    log.info('Extracting delivery counts from the DWH')
    dwh = WarehouseConnector()

    query = SQLReader('sql.german_drivers_delivery_counts').statements[0]
    log.debug(query)
    df = dwh.execute(query)
    driver_counts = fromdataframe(df)

    # cache the results
    driver_counts.toxlsx(dwh_cache_file)
    log.info('%s drivers found in the DWH', driver_counts.nrows())
    log.info('Deliveries per driver %s', driver_counts.stats('number_of_deliveries'))
    log.debug(driver_counts.look())

    # ----------------------------
    # Join the two tables together
    # ----------------------------

    payroll = leftjoin(driver_counts, odoo_drivers, key='driver_app_username')
    # Some usernames appear multiple times in Odoo
    payroll = payroll.distinct('driver_app_username')
    log.debug(payroll.look())

    payroll.toxlsx(output_file)
    log.info('Payroll table saved to %s', output_file)
    log.removeFilter(multiline_filter)
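
# MultilineFilter is project-specific and not shown in this example; a hypothetical
# sketch of what such a logging filter might look like (an assumption, not the
# author's code): indent continuation lines so multi-line messages such as petl's
# look() tables stay aligned in the log output.
import logging

class MultilineFilter(logging.Filter):
    def filter(self, record):
        message = record.getMessage()
        if '\n' in message:
            record.msg = message.replace('\n', '\n    ')
            record.args = None  # the message is already fully formatted
        return True  # never drop the record, only reshape it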
Example #2
def transform_to_petl(data):
    isodate = etl.dateparser("%Y-%m-%d")
    data = etl.fromdataframe(data)
    data = etl.rename(data, {"index": "Date", "VALUE": "Value"})
    data = etl.convert(data, {"Date": lambda d: d[:10]})
    data = etl.convert(data, {"Date": lambda d: isodate(d)})
    return data
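
# A minimal usage sketch for transform_to_petl (assumed input: a frame with an
# 'index' column holding ISO timestamps and a 'VALUE' column, e.g. the result of
# DataFrame.reset_index() on a time series):
#
#   frame = pd.DataFrame({'index': ['2020-01-23T00:00:00'], 'VALUE': [1.5]})
#   transform_to_petl(frame)
#   # -> header ('Date', 'Value'), with Date parsed to datetime.date(2020, 1, 23)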
def extract_odoo(offline=OFFLINE):
    if not offline:
        api = OdooConnector()

        filters = [('supplier', '=', True),
                   ('active', '=', True),
                   ('company_id', '=', 3)]

        dataframe = api.extract('res.partner', filters)
        drivers = fromdataframe(dataframe)

        mappings = {
            'backend_username': '******',
            'backend_uuid': 'x_backend_uuid',
            'salary_id': 'x_salary_id',
            'odoo_id': 'id',
            'fleetname': lambda rec: rec['x_fleet'][1].replace('_', ' '),
            'fullname': lambda rec: rec['display_name'].strip()
        }

        drivers = drivers.fieldmap(mappings)
        drivers = drivers.suffixheader('_in_odoo')
        drivers.topickle(DRIVERS_IN_ODOO_FILEPATH)

    else:
        drivers = frompickle(DRIVERS_IN_ODOO_FILEPATH)

    drivers = drivers.addfield('backend_username', lambda rec: rec['backend_username_in_odoo'])
    drivers = drivers.addfield('salary_id', lambda rec: rec['salary_id_in_odoo'])

    drivers = standardize_missing_values(drivers)
    write_to_log(drivers, 'drivers', 'odoo')

    return drivers
Example #4
def test_integration():
    tbl = [('foo', 'bar', 'baz'),
           ('apples', 1, 2.5),
           ('oranges', 3, 4.4),
           ('pears', 7, .1)]
    df = etl.wrap(tbl).todataframe()
    tbl2 = etl.fromdataframe(df)
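    # comparing twice checks that the round-tripped table can be iterated more than once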
    ieq(tbl, tbl2)
    ieq(tbl, tbl2)
def extract_planday():
    def get_fleet_columns(df):
        for column in df.columns:
            label = column.lower()
            if 'employee group' in label and 'driver' in label and 'test' not in label:
                yield column

    def reduce_fleets_to_single_column(df):
        fleets = list(get_fleet_columns(df))
        fleets_dict = {}

        for employee_index, employee_row in df[fleets].iterrows():
            employee_fleets = employee_row.dropna()

            employee_fleet_names = []
            for fleet, _ in employee_fleets.items():
                # The column label is not exactly the fleet
                # name: it needs a little grooming.
                fleet = fleet.split(': ')[1]
                employee_fleet_names.append(fleet)

            fleets_dict.update({employee_index: ', '.join(employee_fleet_names)})

        return Series(fleets_dict)

    def process_fleetname(text):
        valk_driver_re = compile(r'valk(.?)driver(s?)\s', IGNORECASE)
        match = valk_driver_re.search(text)
        if match:
            return text.replace(match.group(0), '').lower()
        else:
            return text

    planday = read_excel(DRIVERS_IN_PLANDAY_FILEPATH)
    planday = planday[planday['Employee group: Managers'] != 0]

    fullname = planday['Last name'].map(strip) + ', ' + planday['First name'].map(strip)
    salary_id = planday['Salary identifier'].map(lambda x: x if x else None)
    fleetname = reduce_fleets_to_single_column(planday).map(process_fleetname).replace('', None)
    username = planday['Username']

    username.name = 'planday_username_in_planday'
    fullname.name = 'fullname_in_planday'
    salary_id.name = 'salary_id_in_planday'
    fleetname.name = 'fleetname_in_planday'

    planday = concat([fullname, salary_id, fleetname, username], axis=1)
    planday = fromdataframe(planday)
    planday = standardize_missing_values(planday)
    planday = planday.addfield('salary_id', lambda rec: rec['salary_id_in_planday'])

    write_to_log(planday, 'drivers', 'planday')

    return planday
Example #6
    def from_dataframe(cls, dataframe, include_index=False):
        """
        Create a ``parsons table`` from a Pandas dataframe.

        `Args:`
            dataframe: dataframe
                A valid Pandas dataframe object
            include_index: boolean
                Include index column
        """

        return cls(petl.fromdataframe(dataframe, include_index=include_index))
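
    # A minimal usage sketch (assumes pandas is installed; ``Table`` stands for the
    # class this classmethod is defined on):
    #
    #   import pandas as pd
    #   df = pd.DataFrame({'name': ['Ada', 'Grace'], 'id': [1, 2]})
    #   tbl = Table.from_dataframe(df)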
    def extract_fleets_from_dwh():
        query = SQLReader('sql.fleets_from_tableau')
        fleets_df = dwh.execute(query.statements[0])
        fleets_tb = fromdataframe(fleets_df)

        mappings = {
            'fleet_uuid': 'uuid',
            'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
            'country_code': 'country_code',
        }

        fleets_tb = fleets_tb.cutout('country_code')
        fleets_tb = fleets_tb.fieldmap(mappings)
        fleets_tb = fleets_tb.suffixheader('_in_backend')

        return fleets_tb
    def extract_users():
        users_records = [api.get_record('users', driver.user_uuid_in_backend)
                         for driver in drivers.namedtuples()]
        users_df = DataFrame().from_records(users_records)
        users_tb = fromdataframe(users_df)

        mappings = {
            'driver_uuid': 'driver',
            'user_uuid': 'uuid',
            'backend_username': '******'
        }

        users_tb = users_tb.fieldmap(mappings)
        users_tb = users_tb.suffixheader('_in_backend')

        return users_tb
    def extract_drivers():
        query = SQLReader('sql.drivers_from_cloudsql')
        drivers_df = sql.execute(query.statements[0])
        drivers_tb = fromdataframe(drivers_df)

        mappings = {
            'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
            'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
            'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
            'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
        }

        drivers_tb = drivers_tb.fieldmap(mappings)
        drivers_tb = drivers_tb.suffixheader('_in_backend')

        return drivers_tb
Example #10
def _format_as_geojson(results, geodata_model):
    """joins the results to the corresponding geojson via the Django model.

    :param results: [description]
    :type results: [type]
    :param geodata_model: [description]
    :type geodata_model: [type]
    :return: [description]
    :rtype: [type]
    """

    # read the features into a PETL table object via the model's DRF serializer
    # (previously: features_table = etl.fromdicts(fc.features).convert('id', str))
    df = geodata_model.as_dataframe_using_drf_serializer(geodata_model.objects.all())
    features_table = etl.fromdataframe(df)

    # join the results to the geojson features, then shoehorn the results into the properties object of each feature
    # put the ID into the id property of the feature
    features = etl\
        .fromdicts(results)\
        .leftjoin(features_table, 'id')\
        .sort(('ts', 'id'))\
        .aggregate(key=('id', 'type', 'geometry'), aggregation=list, value=['src', 'val', 'ts'])\
        .fieldmap(
            {
                'id':'id',
                'type':'type',
                'geometry':'geometry',
                'properties': lambda rec: (
                    dict(
                        data=[dict(src=r[0],val=r[1],ts=r[2]) for r in rec.value],
                        total=sum([r[1] for r in rec.value if r[1]])
                    )
                )
            },
            failonerror=True
        ).dicts()

    return geojson.FeatureCollection(features=list(features))
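
# Roughly, each feature dict produced by the fieldmap above looks like this
# (illustrative values only):
#
#   {'id': '42', 'type': 'Feature', 'geometry': {...},
#    'properties': {'data': [{'src': 'sensor', 'val': 0.4, 'ts': '2020-01-01T00:00:00Z'}],
#                   'total': 0.4}}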
Example #11
from __future__ import division, print_function, absolute_import


# todataframe()
###############

import petl as etl

table = [("foo", "bar", "baz"), ("apples", 1, 2.5), ("oranges", 3, 4.4), ("pears", 7, 0.1)]
df = etl.todataframe(table)
df


# fromdataframe()
#################

import petl as etl
import pandas as pd

records = [("apples", 1, 2.5), ("oranges", 3, 4.4), ("pears", 7, 0.1)]
df = pd.DataFrame.from_records(records, columns=("foo", "bar", "baz"))
table = etl.fromdataframe(df)
table
Example #12
import petl as etl

# t = [('foo', 'bar', 'baz'),
#      ('apples', 1, 2.5),
#      ('oranges', 3, 4.4),
#      ('pears', 7, .1)]
# a = etl.toarray(t, dtype='U4, i2, f4')
# print(a)
#
# array1 = np.array([('apples', 1, 2.5),
#                    ('oranges', 3, 4.4),
#                    ('pears', 7, .1)], dtype='U4, i2, f4')
# tb7 = etl.fromarray(array1)
# print(tb7)

# dataframe(pandas)
import pandas as pd

records = [('apple', 1, 2.5), ('orange', 3, 4.5), ('pears', 5, 6.5)]
df = pd.DataFrame.from_records(records, columns=('foo', 'bar', 'baz'))
tb8 = etl.fromdataframe(df)
print(tb8)

# load data from given table into dataframe
table = [('foo', 'bar', 'baz'), ('apple', 1, 2.5), ('orange', 3, 4.5),
         ('pears', 5, 6.5)]

df = etl.todataframe(table)
print(df)

# excel xls/xlsx
# HDFS
# oracle
Example #13
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Cote d\'Ivoire',
                          'Ivory Coast')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Korea, South',
                          'South Korea')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace',
                          'West Bank and Gaza', 'Palestine')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Burma',
                          'Myanmar')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'US', 'USA')
t_confirmed = etl.convert(t_confirmed, 'Country', 'replace', 'Taiwan*',
                          'Taiwan')

# Then group and accumulate the results by country
df_confirmed = etl.todataframe(t_confirmed)
df = df_confirmed.groupby(['Country']).sum()
t_confirmed = etl.fromdataframe(df, include_index=True)

# Rename the Country field back again
t_confirmed = etl.rename(t_confirmed, {'index': 'Country'})

# Then melt the date columns into rows and rename the new columns
t_confirmed = etl.melt(t_confirmed, 'Country')
t_confirmed = etl.rename(t_confirmed, {'variable': 'Date'})
t_confirmed = etl.rename(t_confirmed, {'value': 'Cases'})

# Then add the continent so we can group by it
t_confirmed = etl.addfield(t_confirmed, 'Continent',
                           lambda rec: get_continent_code(rec['Country']))

# And once again make sure the values have the data type they should
t_confirmed = etl.convert(t_confirmed, 'Cases', int)
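
# For reference, a tiny illustration of what etl.melt does with the date columns
# (table and values below are made up for the example):
wide = [('Country', '2020-01-22', '2020-01-23'),
        ('Spain', 0, 1)]
print(etl.melt(wide, 'Country'))
# header becomes ('Country', 'variable', 'value'), one row per (country, date) pair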
Example #14
#!/usr/bin/env python3

import petl as etl
import pandas as pd
import psycopg2 as pg

NASA_SOURCE = "../kepler_accepted_results.csv"

k_conn = pg.connect(user='******', password='******',
                    host='localhost', database='kepler')

# Not using petl to pull the data in from CSV because its CSV methods
# are a bit lacking.  Pandas does a better job here.
df = pd.read_csv(NASA_SOURCE, skiprows=31)
# df = df.set_index('kepid')

stbl = etl.fromdataframe(df, include_index=False)

# Not doing any transformation at all yet

etl.todb(stbl, k_conn, 'nasa_result', create=True, drop=False, sample=0)
k_conn.close()
Example #15
cs = "dbname=%s user=%s password=%s host=%s port=%s" % (dn, du, dp, dh, dbp)
connection = psycopg2.connect(cs)

#
a = np.random.rand(100000, 200).tolist()
b = np.random.rand(100000, 10).tolist()
etl.fromcolumns()

#pandas -> table -> db
import pandas as pd
df = pd.DataFrame(columns=['id', 'features', 'model_id', 'regdate'])
for i in range(1, 10000):
    df.loc[i] = [i, np.random.rand(10).tolist(), 'test1', datetime.now()]

pddf = etl.fromdataframe(df)
etl.todb(pddf, connection, 'defect_features', 'public')

#select query
mkcursor = lambda: connection.cursor(name='arbitrary')
table = etl.fromdb(mkcursor, 'select * from public.defect_features')
table

#primitive -> db
datatable = [['id', 'features', 'model_id', 'regdate']]
for i in range(1, 10):
    datatable.append([i, np.random.rand(10).tolist(), 'test1', datetime.now()])
etl.todb(datatable, connection, 'defect_features', 'public')
Example #16
def dimension_values():
    connection = psycopg2.connect(dbname='voyager',
                                  user='******',
                                  password='******',
                                  host='172.16.0.45')
    engine = create_engine('postgresql://*****:*****@172.16.0.45:5432/voyager')

    com = 'select id as id_component, name as component from dim_com'
    table_com = etl.fromdb(connection, com)

    loc = 'select id as id_location, name as name from dim_loc'
    table_loc = etl.fromdb(connection, loc)

    tim = 'select id as id_time, time as timestamp from dim_time'
    table_time = etl.fromdb(connection, tim)

    print(table_com)
    print(table_loc)
    print(table_time)

    for ran in range(0, 65424, 1000):
        sql = "select * from KNMI_station_data kk " \
              "RIGHT JOIN weatherstations w ON " \
              " CAST (kk.weather_station_id AS INTEGER)  = CAST (w.station_number AS INTEGER) " \
              "WHERE w.station_number NOT LIKE \'NL%%\' AND date > 20190901 LIMIT 1000 OFFSET %s" % ran
        print(sql)
        table = etl.fromdb(connection, sql)

        print('knmi')
        print(table)
        table = table.log_progress()
        table = etl.convert(table, 'date', str)
        table = etl.convert(table, 'hour', str)

        table = etl.convert(table, 'temperature', int)
        table = etl.convert(table, 'temperature_dew', int)
        table = etl.convert(table, 'temperature_min', int)
        table = etl.convert(table, 'wind_speed_avg', int)
        table = etl.convert(table, 'wind_speed', int)
        table = etl.convert(table, 'wind_speed_max', int)

        table = etl.convert(table, 'temperature', lambda v: v / 10)
        table = etl.convert(table, 'temperature_dew', lambda v: v / 10)
        table = etl.convert(table, 'temperature_min', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_avg', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed', lambda v: v / 10)
        table = etl.convert(table, 'wind_speed_max', lambda v: v / 10)

        df = pd.DataFrame(table)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['date'] + df['hour']

        df['weather_station_id'] = df['weather_station_id'].astype(str)
        df['timestamp'] = df['timestamp'].apply(custom_to_datetime)
        df['timestamp'] = df['timestamp'].astype(str)

        df = df.drop(columns=['date', 'hour'], axis=1)

        final_knmi_table = etl.fromdataframe(df)

        final_knmi_table = etl.melt(final_knmi_table,
                                    key=[
                                        'weather_station_id', 'timestamp',
                                        'id', 'latitude', 'longitude', 'name',
                                        'station_number', 'data_source_id',
                                        'altitude'
                                    ])
        final_knmi_table = etl.rename(final_knmi_table, 'variable',
                                      'component')
        print(final_knmi_table)

        final_knmi_table2 = etl.join(final_knmi_table,
                                     table_com,
                                     key='component')
        final_knmi_table2 = etl.join(final_knmi_table2, table_loc, key='name')
        final_knmi_table2 = etl.join(final_knmi_table2,
                                     table_time,
                                     key='timestamp')
        print('dos')

        print(final_knmi_table2)
        df = pd.DataFrame(final_knmi_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        fact_source = df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]

        print(fact_source)
        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')

    for rn in range(0, 1148, 1000):
        print('lmn')
        final_lmn_table = etl.fromdb(
            connection,
            "select ld.id, ld.station_number, ld.value, ld.timestamp, ls.name as component, "
            "ws.id as lid, ws.latitude, ws.longitude, ws.data_source_id, ws.altitude, ws.name as name"
            " from luchtmeetnet_data ld "
            "right join luchtmeetnet_sensors ls on ld.formula = ls.formula "
            " join weatherstations ws on ld.station_number = ws.station_number "
            "where ws.station_number like \'NL%%\' AND timestamp > '2019-09-01' "
            "LIMIT 1000 OFFSET %s" % rn)
        final_lmn_table = etl.rename(final_lmn_table,
                                     {'station_number': 'weather_station_id'})
        final_lmn_table = etl.movefield(final_lmn_table, 'timestamp', 1)
        # print(final_lmn_table)
        # print(final_lmn_table)

        # print(table_com)
        final_lmn_table2 = etl.join(final_lmn_table,
                                    table_com,
                                    key='component')
        # print(final_lmn_table2)

        final_lmn_table2 = etl.join(final_lmn_table2, table_loc, key='name')
        # print(final_lmn_table2)
        df = pd.DataFrame(final_lmn_table2)
        df.columns = df.iloc[0]
        df = df.drop(0)
        df['timestamp'] = df['timestamp'].str[:-6]
        # print(df)

        final_lmn_table2 = etl.fromdataframe(df)

        final_lmn_table2 = etl.join(final_lmn_table2,
                                    table_time,
                                    key='timestamp')
        # print(final_lmn_table2)

        print(final_lmn_table2)
        final_lmn_df = pd.DataFrame(final_lmn_table2)
        final_lmn_df.columns = final_lmn_df.iloc[0]
        final_lmn_df = final_lmn_df.drop(0)
        fact_source = final_lmn_df[[
            'id_component', 'id_location', 'id_time', 'value',
            'data_source_id', 'weather_station_id'
        ]]
        print(fact_source)

        fact_source.to_sql('fact_source',
                           engine,
                           if_exists='append',
                           index=False,
                           method='multi')
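
# Note: the "df.columns = df.iloc[0]; df = df.drop(0)" pattern used above rebuilds a
# frame from a petl table by hand; etl.todataframe(table) should be the direct
# equivalent, e.g.:
#
#   df = etl.todataframe(final_knmi_table)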
Example #17
from __future__ import division, print_function, absolute_import

# todataframe()
###############

import petl as etl
table = [('foo', 'bar', 'baz'), ('apples', 1, 2.5), ('oranges', 3, 4.4),
         ('pears', 7, .1)]
df = etl.todataframe(table)
df

# fromdataframe()
#################

import petl as etl
import pandas as pd
records = [('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)]
df = pd.DataFrame.from_records(records, columns=('foo', 'bar', 'baz'))
table = etl.fromdataframe(df)
table
Example #18
def procesar_fuente(path, nombre):
    try: 
        # Process the confirmed cases first
        tabla = etl.fromcsv(path)

        # Rename the headers
        tabla = etl.rename(tabla, {'Country/Region': 'Country'})

        # Adjust the data types.
        # From the fifth column onwards the data type is integer: the number of people/cases.
        # We also take the chance to change the header date format from 1/23/20 to 2020-01-23.
        headers = etl.fieldnames(tabla)
        for i, header in enumerate(headers):
            if i >= 4:
                tabla = etl.convert(tabla, header, int)        # fix the data type
                fecha = datetime.datetime.strptime(header, '%m/%d/%y')    # parse the date in the correct format
                tabla = etl.rename(tabla, header, fecha.strftime('%Y-%m-%d'))

        # Drop the Province/State, Lat and Lon columns, which we will not use
        tabla = etl.cutout(tabla, 0, 2, 3)

        # Adjust some country names so we can later assign them a region/continent
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Brazzaville)', 'Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Congo (Kinshasa)', 'Democratic Republic of the Congo')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Cote d\'Ivoire', 'Ivory Coast')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Korea, South', 'South Korea')
        tabla = etl.convert(tabla, 'Country', 'replace', 'West Bank and Gaza', 'Palestine')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Burma', 'Myanmar')
        tabla = etl.convert(tabla, 'Country', 'replace', 'US', 'USA')
        tabla = etl.convert(tabla, 'Country', 'replace', 'Taiwan*', 'Taiwan')

        # Then group and accumulate the results by country
        df_confirmed = etl.todataframe(tabla)
        df = df_confirmed.groupby(['Country']).sum()
        tabla = etl.fromdataframe(df, include_index=True)

        # Rename the Country field back again
        tabla = etl.rename(tabla, {'index': 'Country'})

        # Then melt the date columns into rows and rename the new columns
        tabla = etl.melt(tabla, 'Country')
        tabla = etl.rename(tabla, {'variable': 'Date'})
        tabla = etl.rename(tabla, {'value': 'Cases'})

        # Then add the continent so we can group by it
        tabla = etl.addfield(tabla, 'Continent', lambda rec: get_continent_code(rec['Country']))

        # And once again make sure the values have the data type they should
        tabla = etl.convert(tabla, 'Cases', int)
        tabla = etl.convert(tabla, 'Date', lambda v: datetime.datetime.strptime(v, '%Y-%m-%d') )

        # Finally, load the table into the data repository
        conn = pymysql.connect(password='******', database='covid', user='******')
        conn.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
        etl.todb(tabla, conn, nombre, create=True, drop=True)
        conn.close()
    except:
        print('An error occurred!', sys.exc_info()[0])
        raise
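
# get_continent_code is defined elsewhere in the project; a hypothetical sketch of
# one way it could work, using pycountry_convert (an assumption, not the author's code):
def get_continent_code(country_name):
    from pycountry_convert import (country_alpha2_to_continent_code,
                                   country_name_to_country_alpha2)

    try:
        alpha2 = country_name_to_country_alpha2(country_name)
        return country_alpha2_to_continent_code(alpha2)
    except KeyError:
        return None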