def test_toxlsx_with_non_str_header(xlsx_table_with_non_str_header):
    """Non-string header values must survive a toxlsx/fromxlsx round trip."""
    tmp = NamedTemporaryFile(delete=True, suffix='.xlsx')
    tmp.close()  # only the generated name is needed; toxlsx recreates the file
    toxlsx(xlsx_table_with_non_str_header, tmp.name, 'Sheet1')
    round_tripped = etl.fromxlsx(tmp.name, 'Sheet1')
    ieq(xlsx_table_with_non_str_header, round_tripped)
def test_integration():
    """Round-trip a small in-memory table through the fluent toxlsx/fromxlsx API.

    Covers str, int, non-ASCII text and datetime cell values.
    """
    import os

    tbl = (('foo', 'bar'),
           ('A', 1),
           ('B', 2),
           ('C', 2),
           (u'é', datetime(2012, 1, 1)))
    # delete=False so the file survives close(); the original leaked the temp
    # file — remove it explicitly. The .xlsx suffix matches the sibling tests.
    f = NamedTemporaryFile(delete=False, suffix='.xlsx')
    f.close()
    try:
        etl.wrap(tbl).toxlsx(f.name, 'Sheet1')
        actual = etl.fromxlsx(f.name, 'Sheet1')
        ieq(tbl, actual)
    finally:
        os.remove(f.name)
def test_integration(xlsx_test_table):
    """Exercise toxlsx, fromxlsx and appendxlsx through the fluent wrapper."""
    tmp = NamedTemporaryFile(delete=True, suffix='.xlsx')
    tmp.close()
    wrapped = etl.wrap(xlsx_test_table)
    wrapped.toxlsx(tmp.name, 'Sheet1')
    actual = etl.fromxlsx(tmp.name, 'Sheet1')
    ieq(wrapped, actual)
    # Append the same rows again; `actual` is lazy, so iterating it below
    # re-reads the updated workbook.
    wrapped.appendxlsx(tmp.name, 'Sheet1')
    expect = wrapped.cat(wrapped)
    ieq(expect, actual)
def test_integration():
    """Write, read back and append an in-memory table via the fluent API."""
    source = (('foo', 'bar'),
              ('A', 1),
              ('B', 2),
              ('C', 2),
              (u'é', datetime(2012, 1, 1)))
    tmp = NamedTemporaryFile(delete=True, suffix='.xlsx')
    tmp.close()
    tbl = etl.wrap(source)
    tbl.toxlsx(tmp.name, 'Sheet1')
    actual = etl.fromxlsx(tmp.name, 'Sheet1')
    ieq(tbl, actual)
    # appendxlsx duplicates the data rows; `actual` is lazy and re-reads
    # the workbook on the next iteration.
    tbl.appendxlsx(tmp.name, 'Sheet1')
    ieq(tbl.cat(tbl), actual)
def main():
    """Extract the sample-data spreadsheet and split it into one table per account."""
    # Maps the 'Account' value to the transform that normalises that bank's rows.
    bank_lookup = {
        'Spending': bank_main,
        'Income': bank_main,
        'Saving': bank_savings,
        'Credit card': bank_credit,
    }
    raw = petl.fromxlsx('sample-data.xlsx', sheet='Data')
    first_five_cols = raw.cut(*range(5))
    # Keep January/February rows only; the 'Date' cells are datetime-like,
    # so the predicate receives the cell value itself.
    early = first_five_cols.select('Date', lambda d: d.month <= 2)
    for account, table in split_table(early, 'Account'):
        transformed = bank_lookup[account](table)
        # transformed.tocsv(table['Account'][0]+'.csv')
        print(transformed)
def etl(self, *args, **kw):
    """Load the source workbook and post a location, thing and observations
    for each distinct sys_loc_code group."""
    table = petl.fromxlsx(self._src_path)
    model = DEPTH_TO_WATER
    self._update_model(model, self._vocab)

    header = petl.header(table)
    # rowgroupby requires rows ordered by the grouping key, hence the sort.
    grouped = petl.rowgroupby(petl.sort(table, 'sys_loc_code'), 'sys_loc_code')
    for name, rows in grouped:
        records = [dict(zip(header, row)) for row in rows]
        # The first record of the group supplies the location/thing metadata.
        first = records[0]
        location_id = self._post_location(first, model)
        thing_id = self._post_thing(first, model, location_id)

        print('---------------')
        print(f'len records {len(records)}')

        # self.add_package(first)
        self.observation.set_records(records)
        self.observation.etl(tids=self._make_tids(thing_id, first),
                             models=(model,))
def kcmo_convert(filepath, xtrapath):
    """Convert a Kansas City proper CSV (joined with its XLSX extra data)
    into the universal pole format and write data/kcmo_clean.csv.

    :param filepath: path to the KCMO-format CSV
    :param xtrapath: path to the supplemental XLSX keyed by IDNumber
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    table = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    # (field name, value) pairs, in order; callables are evaluated per row,
    # plain values become constant columns. Order matters: 'LightAttributes'
    # must exist before 'AttachedTech' and 'FiberWiFiEnable' reference it.
    field_specs = (
        ('PoleID', lambda x: x['POLEID']),
        ('Longitude', lambda x: geom_to_tuple(x['the_geom'])[0]),
        ('Latitude', lambda x: geom_to_tuple(x['the_geom'])[1]),
        ('LightbulbType', lambda x: x['LUMINAIRE TYPE']),
        ('Wattage', lambda x: x['WATTS']),
        ('Lumens', None),
        ('LightAttributes', lambda x: make_a_list(
            x['ATTACHMENT 10'], x['ATTACHMENT 9'], x['ATTACHMENT 8'],
            x['ATTACHMENT 7'], x['ATTACHMENT 6'], x['ATTACHMENT 5'],
            x['ATTACHMENT 4'], x['ATTACHMENT 3'], x['ATTACHMENT 2'],
            x['ATTACHMENT 1'], x['SPECIAL_N2'], x['SPECIAL_NO'])),
        ('AttachedTech', lambda x: bool(x['LightAttributes'])),
        ('FiberWiFiEnable', lambda x: find_wifi(
            *x['LightAttributes'], x['SPECIAL_N2'], x['SPECIAL_NO'])),
        ('PoleType', lambda x: x['POLE TYPE']),
        ('PoleOwner', lambda x: x['POLE OWNER']),
        ('DataSource', 'Kansas City'),
    )
    for field_name, value in field_specs:
        table = etl.addfield(table, field_name, value)

    table = etl.cut(table,
                    'PoleID', 'Longitude', 'Latitude', 'LightbulbType',
                    'Wattage', 'Lumens', 'AttachedTech', 'LightAttributes',
                    'FiberWiFiEnable', 'PoleType', 'PoleOwner', 'DataSource')
    etl.tocsv(table, 'data/kcmo_clean.csv')
# To run (ideally using screen):
# python $HOME/src/github/malariagen/methods-dev/pf3k_techbm/scripts/gatk_pipeline/run_all_pipelines.py >> $HOME/src/github/malariagen/methods-dev/pf3k_techbm/log/gatk_pipeline_20150917.log

import os
import subprocess

import petl as etl

_BASE = '%s/src/github/malariagen/methods-dev/pf3k_techbm/scripts/gatk_pipeline' % os.environ['HOME']
PIPELINES_FN = '%s/pipeline_parameters.xlsx' % _BASE
SCRIPT_FN = '%s/gatk_pipeline_20150917.sh' % _BASE

tbl_pipelines = etl.fromxlsx(PIPELINES_FN)
var_names = tbl_pipelines.header()

# For each parameter row: export every column as an environment variable,
# then run the pipeline shell script once with that environment.
for rec in tbl_pipelines.data():
    print(rec)
    for i, var_name in enumerate(var_names):
        print(var_name, rec[i])
        os.environ[var_name] = str(rec[i])
    subprocess.call(SCRIPT_FN)