def _test_rightjoin_multiple(rightjoin_impl):

    table1 = (('id', 'color', 'cost'),
              (1, 'blue', 12),
              (1, 'red', 8),
              (2, 'yellow', 15),
              (2, 'orange', 5),
              (3, 'purple', 4),
              (4, 'chartreuse', 42))

    table2 = (('id', 'shape', 'size'),
              (1, 'circle', 'big'),
              (2, 'square', 'tiny'),
              (2, 'square', 'big'),
              (3, 'ellipse', 'small'),
              (3, 'ellipse', 'tiny'),
              (5, 'didodecahedron', 3.14159265))

    actual = rightjoin_impl(table1, table2, key='id')
    expect = (('id', 'color', 'cost', 'shape', 'size'),
              (1, 'blue', 12, 'circle', 'big'),
              (1, 'red', 8, 'circle', 'big'),
              (2, 'yellow', 15, 'square', 'tiny'),
              (2, 'yellow', 15, 'square', 'big'),
              (2, 'orange', 5, 'square', 'tiny'),
              (2, 'orange', 5, 'square', 'big'),
              (3, 'purple', 4, 'ellipse', 'small'),
              (3, 'purple', 4, 'ellipse', 'tiny'),
              (5, None, None, 'didodecahedron', 3.14159265))

    # N.B., need to sort because hash and sort implementations will return
    # rows in a different order
    ieq(sort(expect), sort(actual))
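
# A hedged usage sketch, not part of the original test module: petl exposes both a
# sort-based and a hash-based right join, so the same expectations can be checked
# against either implementation (ieq is the row-equality helper from petl's tests).
# imports the helper above relies on (sort/ieq) plus the two join implementations
from petl import sort, rightjoin, hashrightjoin
from petl.test.helpers import ieq

_test_rightjoin_multiple(rightjoin)      # sort-based right join
_test_rightjoin_multiple(hashrightjoin)  # hash-based right join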
Example #2
    def etl_from_table(self, table, force_reload=False):
        """Extract, translate, load exclusions (and not reinstatements) from
        a petl TABLE.

        Set FORCE_RELOAD to True to turn off the protections against
        reading the same csv file twice.  There is no harm in redoing
        a csv file, since the csv contents replaces the db table
        entirely.  We avoid reloading because it is faster and because
        it prevents the db from having an empty table for a moment
        between blowing away and refilling it.

        """
        if not force_reload:
            # If UPDATED.csv has the same number of rows and the same most
            # recent date as our db, we've already snarfed this csv file and
            # can skip it.
            db_latest = self.conn.get_latest_exclusion_date().replace('-', '')
            db_num_rows = self.conn.count_exclusions()
            updated_latest = etl.cut(etl.sort(table, 'EXCLDATE'),
                                     'EXCLDATE')[len(table) - 1][0]
            updated_num_rows = len(table) - 1
            if (db_num_rows == updated_num_rows
                    and db_latest == updated_latest):
                return

        # Massage data
        individual, business = clean_and_separate(table)

        # Save to db, BLOWING AWAY data in the existing tables.  If
        # tables don't exist, will create them, but without any
        # constraints.
        info("Replacing individual_exclusion and business_exclusion tables.")
        etl.todb(individual, self.conn.conn, 'individual_exclusion')
        etl.todb(business, self.conn.conn, 'business_exclusion')
Example #3
def lookup_and_transform(ts_kv_table):
    """The table has the following structure:
    +---------------------------------+---------------+---------------+--------+
    | entity_id                       | key           | ts            | value  |
    +=================================+===============+===============+========+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 |  -1.8  |
    +---------------------------------+---------------+---------------+--------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 |   227  |
    +---------------------------------+---------------+---------------+--------+
    
    The output is a dictionary {device_id: table} of tables like this:
    +--------------+--------------+---------------+
    | ts           | Temperature  | WindDirection |
    +--------------+--------------+---------------+
    |1583010011665 | -1.8         |  230          |
    +--------------+--------------+---------------+
    |1583010000692 |   -2.5       | 227           |
    +--------------+--------------+---------------+
    """

    lkp = petl.lookup(ts_kv_table, 'entity_id', value=('key', 'ts', 'value'))
    for entity_id in lkp:
        tbl = [('key', 'ts', 'value')] + lkp[entity_id]
        tbl = petl.recast(tbl, variablefield='key', valuefield='value')
        cut_keys = KEYS_TO_REMOVE & set(petl.fieldnames(tbl))
        tbl = petl.cutout(tbl, *cut_keys)
        tbl = petl.transform.headers.sortheader(tbl)
        tbl = petl.transform.basics.movefield(tbl, 'ts', 0)
        lkp[entity_id] = petl.sort(tbl, 'ts')
    return lkp
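
# A hedged usage sketch, not from the original module: KEYS_TO_REMOVE is a stand-in
# for the constant the function expects, and the rows below mimic the docstring.
import petl
KEYS_TO_REMOVE = set()  # assumption: the real module lists the keys to drop here
ts_kv = [('entity_id', 'key', 'ts', 'value'),
         ('dev-1', 'Temperature', 1583010011665, -1.8),
         ('dev-1', 'WindDirection', 1583010011665, 230)]
for device_id, tbl in lookup_and_transform(ts_kv).items():
    print(device_id)
    print(petl.look(tbl))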
Example #4
def sort_execute(c, **kwargs):
    r = c()
    if 'addfields' in kwargs:
        r = etl.addfields(r, kwargs['addfields'])
    kwargs = filter_keys(kwargs, ("key", "reverse"))
    r = etl.sort(r, **kwargs)
    return r
    def get_table(self):
        table = self.get_body()
        if (table is not None) and (len(table) > 0):
            if (self.header is not None):
                table = etl.headers.pushheader(table, self.header)
            table = etl.sort(table, 0)

        self.rows = table
        return self.rows
Example #6
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match(r'OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:

        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted,
                                     xref_table_sorted,
                                     presorted=True,
                                     lkey="MMS ID",
                                     rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table,
                      xls_outfile,
                      'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
Example #7
def dataPreProcessing(fileName):
    inputData = fromcsv(fileName)
    table1 = cutout(inputData, 'member_id', 'grade', 'sub_grade', 'emp_title',
                    'url', 'desc', 'title', 'accept_d', 'exp_d', 'list_d',
                    'issue_d', 'purpose', 'addr_city', 'addr_state',
                    'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                    'last_credit_pull_d')
    table2 = select(
        table1,
        lambda i: i['term'] == ' 36 months' and i['loan_status'] != "")
    labelMapping = OrderedDict()
    labelMapping['loan_status'] = 'loan_status'
    labelMapping['id'] = 'id'
    table6 = fieldmap(table2, labelMapping)
    table8 = sort(table6, 'id')
    table10 = cutout(table8, 'id')
    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['home_ownership'] = 'ownership', {
        'MORTGAGE': '-1',
        'RENT': '0',
        'OWN': '1'
    }
    mappings['emp_length'] = 'empLength', {'n/a': 0}
    mappings['is_inc_v'] = 'verificationStatus', {
        'Source Verified': 1,
        'Verified': 0,
        'Not Verified': -1
    }
    mappings['pymnt_plan'] = 'paymentPlan', {'n': 0, 'y': 1}
    mappings['initial_list_status'] = 'listStatus', {'f': 0, 'w': 1}
    table3 = fieldmap(table2, mappings)
    table4 = cutout(table2, 'home_ownership', 'is_inc_v', 'pymnt_plan',
                    'initial_list_status', 'term', 'loan_status')
    table5 = merge(table3, table4, key='id')
    table7 = sort(table5, 'id')
    table9 = cutout(table7, 'id')
    featureFileCsv = tocsv(table9, 'featureFileCsv.csv')
    labelsFileCsv = tocsv(table10, 'labelsFileCsv.csv')
    return featureFileCsv, labelsFileCsv
def print_table(ctx):
    """Output a list of pipelines as table."""

    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)

    subset = [
        'id', 'pipeline_status', 'validation_status', 'nb_validation_errors',
        'scraper_required', 'resource_type', 'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
Example #9
    def sort(self, columns=None, reverse=False):
        """
        Sort the rows of a table.

        `Args:`
            columns: list or str
                Sort by a single column or a list of columns. If ``None`` then
                will sort columns from left to right.
            reverse: boolean
                Sort rows in reverse order.
        `Returns:`
            `Parsons Table` and also updates self
        """

        self.table = petl.sort(self.table, key=columns, reverse=reverse)

        return self
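
# A hedged usage sketch (assumes the parsons package; the rows are made up):
from parsons import Table
tbl = Table([{'id': 3, 'name': 'c'}, {'id': 1, 'name': 'a'}, {'id': 2, 'name': 'b'}])
tbl.sort('id')                 # ascending by a single column; updates tbl in place
tbl.sort('id', reverse=True)   # same column, descending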
Example #10
def process_animal_extended(shelter_id, session, input_directory):
    table = petl.fromxls(os.path.join(input_directory,
                                      'AnimalIntakeExtended.xls'),
                         sheet='AnimalIntakeExtended')

    ## Because an animal can appear in the intake report more than once,
    ## we must sort the table in order to upsert the latest value
    table_sorted = petl.sort(table, key='Intake Date/Time')

    for row in petl.dicts(table_sorted):
        id = row['Animal ID']

        set_values = {
            'arn': normalize_string(row['ARN']),
            'name': normalize_string(row['Animal Name']),
            'species': normalize_string(row['Species']),
            'primary_breed': normalize_string(row['Primary Breed']),
            'secondary_bred': normalize_string(row['Secondary Breed']),
            'gender': normalize_string(row['Gender']),
            'pre_altered': to_bool(row['Pre Altered']),
            'altered': to_bool(row['Altered']),
            'primary_color': normalize_string(row['Primary Colour']),
            'secondary_color': normalize_string(row['Secondary Colour']),
            'third_color': normalize_string(row['Third Colour']),
            'color_pattern': normalize_string(row['Colour Pattern']),
            'second_color_pattern':
            normalize_string(row['Second Colour Pattern']),
            'size': normalize_string(row['Size'])
        }

        insert_stmt = insert(Animal)\
            .values(
                id=id,
                shelter_id=shelter_id, ## TODO: add to index for constraint? make composite pk?
                **set_values)\
            .on_conflict_do_update(
                constraint='animals_pkey',
                set_={
                    'shelter_id': shelter_id,
                    **set_values,
                    'updated_at': func.now()
                })

        session.execute(insert_stmt)
        session.commit()
Example #12
    def etl(self, *args, **kw):
        table = petl.fromxlsx(self._src_path)

        model = DEPTH_TO_WATER
        self._update_model(model, self._vocab)

        # group table by sys_loc_code
        header = petl.header(table)
        for name, records in petl.rowgroupby(petl.sort(table, 'sys_loc_code'),
                                             'sys_loc_code'):
            records = [dict(zip(header, record)) for record in records]
            record = records[0]
            location_id = self._post_location(record, model)
            thing_id = self._post_thing(record, model, location_id)

            print('---------------')
            print(f'len records {len(records)}')
            # self.add_package(record)
            self.observation.set_records(records)
            self.observation.etl(tids=self._make_tids(thing_id, record),
                                 models=(model, ))
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')

        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1', 'Trial Started': '2', 'Subscription Started': '3', 'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')

    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
Example #14
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')

    key_dep_dict = {}

    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')

    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
Example #15
print('RENAMING mine_acc_no to mine_incident_no')
table = etl.rename(table, 'mine_acc_no', 'proponent_incident_no')

print('CREATING create_user = MMS_DO_IMPORT')
table = etl.addfield(table, 'create_user', 'MMS_DO_IMPORT')
table = etl.addfield(table, 'update_user', 'MMS_DO_IMPORT')

#RENAME SOURCE COLUMNS WE WANT TO PRESERVE
print("RENAME insp_cd to mms_insp_cd")
table = etl.rename(table, 'insp_cd', 'mms_insp_cd')
print("RENAME min_acc_no to mine_incident_no")
table = etl.rename(table, 'min_acc_no', 'mine_incident_no')

#force id column SQL will reset the sequence
table = etl.addrownumbers(table, field='mine_incident_id')
table = etl.sort(table, 'incident_timestamp', reverse=True)

print('UNJOIN Recommendations into separate table')
table, recommendation_table = etl.unjoin(table,
                                         'recommendation',
                                         key='mine_incident_id')
recommendation_table = etl.select(recommendation_table, 'recommendation',
                                  lambda x: x is not None and not x.isspace())
recommendation_table = etl.addfield(recommendation_table, 'create_user',
                                    'MMS_DO_IMPORT')
recommendation_table = etl.addfield(recommendation_table, 'update_user',
                                    'MMS_DO_IMPORT')

print("TRUNCATE public.mine_incident_recommendation")
connection.cursor().execute(
    'TRUNCATE TABLE public.mine_incident_recommendation;')
Example #16
from __future__ import absolute_import, print_function, division

# sort()
########

import petl as etl
table1 = [['foo', 'bar'], ['C', 2], ['A', 9], ['A', 6], ['F', 1], ['D', 10]]
table2 = etl.sort(table1, 'foo')
table2
# sorting by compound key is supported
table3 = etl.sort(table1, key=['foo', 'bar'])
table3
# if no key is specified, the default is a lexical sort
table4 = etl.sort(table1)
table4

# mergesort()
#############

import petl as etl
table1 = [['foo', 'bar'], ['A', 9], ['C', 2], ['D', 10], ['A', 6], ['F', 1]]
table2 = [['foo', 'bar'], ['B', 3], ['D', 10], ['A', 10], ['F', 4]]
table3 = etl.mergesort(table1, table2, key='foo')
table3.lookall()

# issorted()
############

import petl as etl
table1 = [['foo', 'bar', 'baz'], ['a', 1, True], ['b', 3, True], ['b', 2]]
etl.issorted(table1, key='foo')
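# -> True: the 'foo' values run 'a', 'b', 'b', which is ascending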
Example #17
import petl as etl

table = (
    etl
    .fromcsv('Credit_Card_Transactions_for_FY_2017_by_Department.csv')
    .convert('DEPT_NAME', 'lower')
    .addfield('APPROX_AMT_K', lambda row: int(round(float(row.MERCHANDISE_AMT[1:]) / 1000, 0)))
    .addfield('APPROX_AMT', lambda row: '$' + str(row.APPROX_AMT_K) + 'K')
)

new_table = table.cut('DEPT_NAME','APPROX_AMT')

sort_table = etl.sort(new_table, key=['DEPT_NAME'])

print('\nDepartment wise credit transaction for year 2017')
print(sort_table)


##Sample Output:
##
##Department wise credit transaction for year 2017
##+--------------------------------+------------+
##| DEPT_NAME                      | APPROX_AMT |
##+================================+============+
##| academia antonia alonso        | $18K       |
##+--------------------------------+------------+
##| advisory counc exceptnl citizn | $2K        |
##+--------------------------------+------------+
##| appoquinimink school district  | $774K      |
##+--------------------------------+------------+
##| auditor of accounts            | $78K       |
Example #20
def sales_summary(start_dt=None, end_dt=None):
    """tally up gross (sale over list) profits
    TODO: tally up net profits (gross profit vs inventory purchase total)

    TODO: Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """

    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records,
                          products_records,
                          lkey='product_id',
                          rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data,
                           'date',
                           operator.add,
                           'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)
    # print(chart_count)
    # etl.lookall(chart_count)

    chart_gross = etl.fold(sales_data,
                           'date',
                           operator.add,
                           'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)
    # print(chart_gross)
    # etl.lookall(chart_gross)

    chart_profit = etl.fold(sales_data,
                            'date',
                            operator.add,
                            'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    # for i in etl.dicts(chart_count):
    #     print(i)
    # for i in etl.dicts(chart_gross):
    #     print(i)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date':
        list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
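
# A hedged usage sketch; it assumes an application context in which db.engine can
# reach the product and sale tables queried above.
summary = sales_summary()
print(summary['gross_sales'], summary['profits'])
print(len(summary['chart_count']), 'daily sales-count data points')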
Example #21
# the cut function keeps only the columns listed below
# cut is not strictly necessary for table1, because these are already all of the fields present in table1
data = etl.cut(table1, 'iso_code', 'location', 'date', 'total_cases',
               'new_cases', 'total_deaths', 'new_deaths',
               'total_cases_per_million', 'new_cases_per_million',
               'total_deaths_per_million', 'new_deaths_per_million',
               'total_tests', 'new_tests', 'total_tests_per_thousand',
               'new_tests_per_thousand', 'tests_units')

# select the data from the table on the basis of the current date
# the variable num holds only the 2020-04-30 data for each country, so only the latest data is retained
num = etl.select(data, 'date', lambda r: r == '2020-04-30')

# the sort function sorts the data by iso_code
# this makes it easier to join the data in later steps
table1_sort = etl.sort(num, key='iso_code')

# counter variable declared to count the number of countries
count = 0

# the values function is used to read the data from the table
for i in etl.values(table1_sort, 'iso_code', 'location', 'date', 'total_cases',
                    'new_cases', 'total_deaths', 'new_deaths',
                    'total_cases_per_million', 'new_cases_per_million',
                    'total_deaths_per_million', 'new_deaths_per_million',
                    'total_tests', 'new_tests', 'total_tests_per_thousand',
                    'new_tests_per_thousand', 'tests_units'):

    # condition to take 15 countries from the table for integration
    # 16 is used because when count is 0 the row contains unnecessary data
    if count == 16:
Example #22
"""
DB-related tests, separated from main unit tests because they need local database
setup prior to running.

"""

import sys
sys.path.insert(0, './src')
from petl import dummytable, sort, nrows
import logging
logging.basicConfig(level=logging.DEBUG)

t = (('foo', 'bar'), ('C', 2), ('A', 9), ('B', 6), ('E', 1), ('D', 10))
u = sort(t, buffersize=3)

print('buffer up the data')
print(nrows(u))

print('create iterators')
it1 = iter(u)
it2 = iter(u)

print('iterate')
print(1, next(it1))
print(1, next(it1))
print(1, next(it1))
print(2, next(it2))
print(2, next(it2))
print(1, next(it1))
print(1, next(it1))
Example #23
            transformed_outreaches.table, key="created_date", aggregation=len
        )
    )
    leaderboard = petl.aggregate(
        transformed_outreaches.table, key="name", aggregation=len
    )

    calls_per_office = get_calls_per_office(transformed_outreaches)

    # rename columns for spreadsheet
    calls_per_day.rename_column('value', 'num_calls')
    calls_per_day=calls_per_day.rename_column('created_date', 'day')

    calls_per_office=calls_per_office.rename_column('name', 'office')
    # Sort leaderboard by num calls per person
    leaderboard_ranked = Table(petl.sort(leaderboard, 'value', reverse=True))
    leaderboard_ranked=leaderboard_ranked.rename_column('value', 'num_calls')
    # Get set up spreadsheet and errors spreadsheet
    spreadsheet_id = "1fPlKWVtpDWid06R8oi0bHgch1ShYovYyks2aSZKY6nY"
    # Push to Google Sheets
    parsons_sheets.overwrite_sheet(
        spreadsheet_id,
        calls_per_day,
        worksheet="calls per day",
        user_entered_value=False,
    )
    parsons_sheets.overwrite_sheet(
        spreadsheet_id,
        leaderboard_ranked,
        worksheet="leaderboard",
        user_entered_value=False,
Example #24
#gets the information from ppms and creates a csv file
r = requests.post(url, data=payload, headers=headers)
f = open('todays_bookings.csv', 'w')
f.write(r.text)
f.close()

#Load the table
table1 = petl.fromcsv('todays_bookings.csv')
# Alter the columns
table2 = petl.cut(table1, ' Object', ' User', ' Start time', ' End time',
                  ' Training', ' Assisted')
# Reorder the user names
table3 = petl.convert(table2, ' User',
                      lambda row: " ".join(re.findall("\S+", row)[::-1]))
# Reorder the rows
table4 = petl.sort(table3, key=[' Object', ' Start time'])
# Save to new file
petl.tocsv(table4, 'new.csv')

#Reopens the CSV file (stupid, I know) and removes unnecessary characters
csvfile = ""
ppmscal = csv.reader(open('new.csv'), delimiter=',')
for row in ppmscal:
    csvfile += str(row) + '\n'
csvtxt = csvfile.replace("(", "").replace(")", "").replace("'", "").replace(
    "[", "").replace("]", "")
csvtxt = csvtxt[:-1]


# The CSV to HTML code comes from https://www.rosettacode.org/wiki/CSV_to_HTML_translation#Python
# It creates an html file of the CSV so I can load it, and auto-refresh it, in a browser
Example #26
table2 = tail(table1, 4)
look(table2)    


# sort

table1 = [['foo', 'bar'],
          ['C', 2],
          ['A', 9],
          ['A', 6],
          ['F', 1],
          ['D', 10]]

from petl import sort, look
look(table1)
table2 = sort(table1, 'foo')
look(table2)
# sorting by compound key is supported
table3 = sort(table1, key=['foo', 'bar'])
look(table3)
# if no key is specified, the default is a lexical sort
table4 = sort(table1)
look(table4)


# melt

table1 = [['id', 'gender', 'age'],
          [1, 'F', 12],
          [2, 'M', 17],
          [3, 'M', 16]]
international_code = "(+61)"

with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile:
    csv_reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(csv_reader, None)  #skipping header row
    writer.writerow(headers)
    for row in csv_reader:
        number_column = row[5]
        state_column = row[3]
        clean_num = re.sub(r"\D", "", row[5])[-8:]
        formatted_num = international_code + " " + regional_code[
            state_column] + " " + clean_num
        row[5] = formatted_num
        writer.writerow(row)

services = petl.fromcsv(SERVICES_FILE)
offices = petl.fromcsv(OUT_FILE)
offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"})
offices = petl.cutout(offices,"State","Postcode")

locations = petl.fromcsv(LOC_FILE)
locations = locations.rename({"officeID": "OfficeID"})
office_service = petl.join(services, offices, key='OfficeID')

office_service_locations = petl.join(
    office_service, locations, key='OfficeID')

office_service_locations = petl.convert(office_service_locations,'OfficeServiceID',int)
office_service_locations = petl.sort(office_service_locations,'OfficeServiceID')
petl.tocsv(office_service_locations, 'office_service_locations.csv')
Example #29
]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])
#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))

gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"])

#print(etl.head(gwe_sorted))
print(etl.lookall(etl.tail(gwe_sorted, 500)))
Example #30
from __future__ import absolute_import, print_function, division


# sort()
########

import petl as etl
table1 = [['foo', 'bar'],
          ['C', 2],
          ['A', 9],
          ['A', 6],
          ['F', 1],
          ['D', 10]]
table2 = etl.sort(table1, 'foo')
table2
# sorting by compound key is supported
table3 = etl.sort(table1, key=['foo', 'bar'])
table3
# if no key is specified, the default is a lexical sort
table4 = etl.sort(table1)
table4


# mergesort()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 9],
          ['C', 2],
          ['D', 10],