Example 1
import petl

# Note: column_map_shared, column_map_separations and column_map_roster are
# rename mappings defined elsewhere in the original script (not shown here).

def transform_xls(hires_and_promotions_excel, separations_excel,
                  exempt_roster_excel, output_file):
    hires_and_promotions = petl.io.xls \
                            .fromxls(hires_and_promotions_excel, sheet='Data') \
                            .rename(column_map_shared)

    separations = petl.io.xls \
                    .fromxls(separations_excel, sheet='Data') \
                    .rename({**column_map_shared, **column_map_separations})

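    # Keep only the most recent separation record for each payroll number.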
    def dedup_separations(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['termination_date'])
        return rows_sorted[-1]

    separations_deduped = petl.rowreduce(separations, 'payroll_number',
                                         dedup_separations)

    exempt_roster = petl.io.xls \
                        .fromxls(exempt_roster_excel, sheet='Data') \
                        .rename(column_map_roster)

    merged = petl.mergesort(hires_and_promotions,
                            separations_deduped,
                            exempt_roster,
                            key='payroll_number')

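    # Collapse rows that share a payroll_number after the merge: prefer non-blank
    # values and, where both rows have a value, keep the one from the row with
    # the later start date.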
    def dedup_merged(payroll_number, rows):
        rows_sorted = sorted(rows, key=lambda x: x['latest_start_date'])

        if len(rows_sorted) == 1:
            return rows_sorted[-1]

        def blank(value):
            return value is None or value == ''

        merged_row = []
        for i in range(0, len(rows_sorted[0]) - 1):
            earlier, later = rows_sorted[0][i], rows_sorted[1][i]
            if blank(earlier) and not blank(later):
                merged_row.append(later)
            elif blank(later) and not blank(earlier):
                merged_row.append(earlier)
            elif earlier == later:
                merged_row.append(earlier)
            else:
                merged_row.append(later)  # take latest value by start date

        return merged_row

    merged_deduped = petl.rowreduce(merged, 'payroll_number', dedup_merged)

    petl.tocsv(merged_deduped, source=output_file)
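
# A hypothetical invocation of the function above; the file names below are
# illustrative placeholders, not paths from the original project:
#
# transform_xls('hires_and_promotions.xls', 'separations.xls',
#               'exempt_roster.xls', 'merged_employees.csv')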
Example 2
# rowreduce

table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4]]

from petl import rowreduce, look    
look(table1)
def sumbar(key, rows):
    return [key, sum(row[1] for row in rows)]

table2 = rowreduce(table1, key='foo', reducer=sumbar, fields=['foo', 'barsum'])
look(table2)


# recordreduce

table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2],
          ['b', 1],
          ['b', 9],
          ['c', 4]]

from petl import recordreduce, look    
look(table1)
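
# The snippet above sets up table1 and imports recordreduce but stops before
# calling it. A minimal completion is sketched below, assuming petl < 1.0
# (recordreduce was removed in petl 1.0, where rowreduce also takes header=
# rather than the fields= keyword used above). The reducer receives dict-like
# records for each key value:
def sumbar_records(key, records):
    return [key, sum(rec['bar'] for rec in records)]

table3 = recordreduce(table1, key='foo', reducer=sumbar_records,
                      fields=['foo', 'barsum'])
look(table3)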
Example 3
    def reduce_rows(self,
                    columns,
                    reduce_func,
                    headers,
                    presorted=False,
                    **kwargs):
        """
        Group rows by a column or columns, then reduce each group to a single row.

        Based on the `rowreduce petl function <https://petl.readthedocs.io/en/stable/transform.html#petl.transform.reductions.rowreduce>`_.

        For example, the output from the query to get a table's definition is
        returned as one component per row. The `reduce_rows` method can be used
        to reduce all of those rows to a single row containing the entire query.

        .. code-block:: python

            >>> ddl = rs.query(sql_to_get_table_ddl)
            >>> ddl.table

            +--------------+--------------+----------------------------------------------------+
            | schemaname   | tablename    | ddl                                                |
            +==============+==============+====================================================+
            | 'db_scratch' | 'state_fips' | '--DROP TABLE db_scratch.state_fips;'              |
            +--------------+--------------+----------------------------------------------------+
            | 'db_scratch' | 'state_fips' | 'CREATE TABLE IF NOT EXISTS db_scratch.state_fips' |
            +--------------+--------------+----------------------------------------------------+
            | 'db_scratch' | 'state_fips' | '('                                                |
            +--------------+--------------+----------------------------------------------------+
            | 'db_scratch' | 'state_fips' | '\\tstate VARCHAR(1024)   ENCODE RAW'               |
            +--------------+--------------+----------------------------------------------------+
            | 'db_scratch' | 'state_fips' | '\\t,stusab VARCHAR(1024)   ENCODE RAW'             |
            +--------------+--------------+----------------------------------------------------+

            >>> reducer_fn = lambda columns, rows: [
            ...     f"{columns[0]}.{columns[1]}",
            ...     '\\n'.join([row[2] for row in rows])]
            >>> ddl.reduce_rows(
            ...     ['schemaname', 'tablename'],
            ...     reducer_fn,
            ...     ['tablename', 'ddl'],
            ...     presorted=True)
            >>> ddl.table

            +-------------------------+-----------------------------------------------------------------------+
            | tablename               | ddl                                                                   |
            +=========================+=======================================================================+
            | 'db_scratch.state_fips' | '--DROP TABLE db_scratch.state_fips;\\nCREATE TABLE IF NOT EXISTS      |
            |                         | db_scratch.state_fips\\n(\\n\\tstate VARCHAR(1024)   ENCODE RAW          |
            |                         | \\n\\t,stusab VARCHAR(1024)   ENCODE RAW\\n\\t,state_name                 |
            |                         | VARCHAR(1024)   ENCODE RAW\\n\\t,statens VARCHAR(1024)   ENCODE         |
            |                         | RAW\\n)\\nDISTSTYLE EVEN\\n;'                                            |
            +-------------------------+-----------------------------------------------------------------------+

        `Args:`
            columns: list
                The column(s) by which to group the rows.
            reduce_func: function
                The function by which to reduce the rows. It should take two
                arguments, the columns list and the rows list, and return a list:
                `reducer(columns: list, rows: list) -> list`
            headers: list
                The list of headers for the modified table. The length of
                `headers` should match the length of the list returned by the
                reduce function.
            presorted: bool
                If False, the rows will be sorted before reducing.
        `Returns:`
            `Parsons Table` and also updates self

        """ # noqa: E501,E261

        self.table = petl.rowreduce(self.table,
                                    columns,
                                    reduce_func,
                                    header=headers,
                                    presorted=presorted,
                                    **kwargs)

        return self
Example 4
import petl as etl  # the etl alias used below is assumed to refer to petl

def reducer_execute(c, **kwargs):
    r = c()
    if 'addfields' in kwargs:
        r = etl.addfields(r, kwargs['addfields'])
    kwargs = filter_keys(kwargs, ("key", "reducer", "header"))
    return etl.rowreduce(r, **kwargs)
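
# filter_keys is not defined in this snippet. A hypothetical stand-in with the
# behaviour the call above implies (keep only the listed keys so that rowreduce
# receives kwargs it understands) might look like this:
def filter_keys(d, keys):
    return {k: v for k, v in d.items() if k in keys}

# Illustrative call, assuming c is a zero-argument callable returning a petl
# table; the table data and field names here are made up for the example:
table = [['foo', 'bar'], ['a', 3], ['a', 7], ['b', 2]]
result = reducer_execute(lambda: table,
                         key='foo',
                         reducer=lambda k, rows: [k, len(list(rows))],
                         header=['foo', 'n'])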