def _test_rightjoin_multiple(rightjoin_impl):
    table1 = (('id', 'color', 'cost'),
              (1, 'blue', 12),
              (1, 'red', 8),
              (2, 'yellow', 15),
              (2, 'orange', 5),
              (3, 'purple', 4),
              (4, 'chartreuse', 42))
    table2 = (('id', 'shape', 'size'),
              (1, 'circle', 'big'),
              (2, 'square', 'tiny'),
              (2, 'square', 'big'),
              (3, 'ellipse', 'small'),
              (3, 'ellipse', 'tiny'),
              (5, 'didodecahedron', 3.14159265))
    actual = rightjoin_impl(table1, table2, key='id')
    expect = (('id', 'color', 'cost', 'shape', 'size'),
              (1, 'blue', 12, 'circle', 'big'),
              (1, 'red', 8, 'circle', 'big'),
              (2, 'yellow', 15, 'square', 'tiny'),
              (2, 'yellow', 15, 'square', 'big'),
              (2, 'orange', 5, 'square', 'tiny'),
              (2, 'orange', 5, 'square', 'big'),
              (3, 'purple', 4, 'ellipse', 'small'),
              (3, 'purple', 4, 'ellipse', 'tiny'),
              (5, None, None, 'didodecahedron', 3.14159265))
    # N.B., need to sort because hash and sort implementations will return
    # rows in a different order
    ieq(sort(expect), sort(actual))
def etl_from_table(self, table, force_reload=False):
    """Extract, translate, load exclusions (and not reinstatements) from
    a petl TABLE.

    Set FORCE_RELOAD to True to turn off the protections against
    reading the same csv file twice.  There is no harm in redoing a csv
    file, since the csv contents replaces the db table entirely.  We
    avoid reloading because it is faster and because it prevents the db
    from having an empty table for a moment between blowing away and
    refilling it.
    """
    if not force_reload:
        # If UPDATED.csv has the same number of rows and the same most
        # recent date as our db, we've already snarfed this csv file and
        # can skip it.
        db_latest = self.conn.get_latest_exclusion_date().replace('-', '')
        db_num_rows = self.conn.count_exclusions()
        updated_latest = etl.cut(etl.sort(table, 'EXCLDATE'), 'EXCLDATE')[len(table) - 1][0]
        updated_num_rows = len(table) - 1
        if (db_num_rows == updated_num_rows
                and db_latest == updated_latest):
            return

    # Massage data
    individual, business = clean_and_separate(table)

    # Save to db, BLOWING AWAY data in the existing tables.  If
    # tables don't exist, will create them, but without any
    # constraints.
    info("Replacing individual_exclusion and business_exclusion tables.")
    etl.todb(individual, self.conn.conn, 'individual_exclusion')
    etl.todb(business, self.conn.conn, 'business_exclusion')
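# A minimal sketch (not from the original project) of the same "is this csv
# already loaded?" check, using petl.values and petl.nrows instead of sorting
# the whole table just to read the most recent date.  The `conn` helper methods
# mirror the snippet above; the function name is invented for illustration.
import petl as etl

def already_loaded(conn, table):
    # latest exclusion date in the incoming csv (EXCLDATE values are yyyymmdd strings,
    # so lexical max() equals chronological max())
    updated_latest = max(etl.values(table, 'EXCLDATE'))
    updated_num_rows = etl.nrows(table)
    db_latest = conn.get_latest_exclusion_date().replace('-', '')
    db_num_rows = conn.count_exclusions()
    return db_num_rows == updated_num_rows and db_latest == updated_latest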
def lookup_and_transform(ts_kv_table):
    """The table has the following structure:

    +---------------------------------+---------------+---------------+--------+
    | entity_id                       | key           | ts            | value  |
    +=================================+===============+===============+========+
    | 1ea47494dc14d40bd76a73c738b665f | Temperature   | 1583010011665 | -1.8   |
    +---------------------------------+---------------+---------------+--------+
    | 1ea47494dc14d40bd76a73c738b665f | WindDirection | 1583010000692 | 227    |
    +---------------------------------+---------------+---------------+--------+

    The output is a dictionary {device_id: table} of tables like that:

    +---------------+-------------+---------------+
    | ts            | Temperature | WindDirection |
    +---------------+-------------+---------------+
    | 1583010011665 | -1.8        | 230           |
    +---------------+-------------+---------------+
    | 1583010000692 | -2.5        | 227           |
    +---------------+-------------+---------------+
    """
    lkp = petl.lookup(ts_kv_table, 'entity_id', value=('key', 'ts', 'value'))
    for entity_id in lkp:
        tbl = [('key', 'ts', 'value')] + lkp[entity_id]
        tbl = petl.recast(tbl, variablefield='key', valuefield='value')
        cut_keys = KEYS_TO_REMOVE & set(petl.fieldnames(tbl))
        tbl = petl.cutout(tbl, *cut_keys)
        tbl = petl.transform.headers.sortheader(tbl)
        tbl = petl.transform.basics.movefield(tbl, 'ts', 0)
        lkp[entity_id] = petl.sort(tbl, 'ts')
    return lkp
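# A small, self-contained sketch of the long-to-wide pivot that petl.recast does
# inside lookup_and_transform above; the data values here are invented.
import petl

long_tbl = [('key', 'ts', 'value'),
            ('Temperature', 1583010011665, -1.8),
            ('WindDirection', 1583010011665, 230),
            ('Temperature', 1583010000692, -2.5),
            ('WindDirection', 1583010000692, 227)]

# 'ts' (every field other than variablefield/valuefield) becomes the row key,
# and each distinct 'key' value becomes its own column
wide_tbl = petl.recast(long_tbl, variablefield='key', valuefield='value')
print(petl.lookall(petl.sort(wide_tbl, 'ts')))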
def sort_execute(c, **kwargs):
    r = c()
    if 'addfields' in kwargs:
        r = etl.addfields(r, kwargs['addfields'])
    kwargs = filter_keys(kwargs, ("key", "reverse"))
    r = etl.sort(r, **kwargs)
    return r
def get_table(self):
    table = self.get_body()
    if (table is not None) and (len(table) > 0):
        if (self.header is not None):
            table = etl.headers.pushheader(table, self.header)
        table = etl.sort(table, 0)
        self.rows = table
    return self.rows
def xref_symbol_reports():
    symbol_reports = [
        f for f in os.listdir()
        if re.match('OCLC Datasync Unresolved.*\.csv', f)
    ]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split('^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted,
                                     xref_table_sorted,
                                     presorted=True,
                                     lkey="MMS ID",
                                     rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)
def dataPreProcessing(fileName):
    inputData = fromcsv(fileName)
    table1 = cutout(inputData, 'member_id', 'grade', 'sub_grade', 'emp_title',
                    'url', 'desc', 'title', 'accept_d', 'exp_d', 'list_d',
                    'issue_d', 'purpose', 'addr_city', 'addr_state',
                    'earliest_cr_line', 'last_pymnt_d', 'next_pymnt_d',
                    'last_credit_pull_d')
    table2 = select(
        table1,
        lambda i: i['term'] == ' 36 months' and i['loan_status'] != "")
    labelMapping = OrderedDict()
    labelMapping['loan_status'] = 'loan_status'
    labelMapping['id'] = 'id'
    table6 = fieldmap(table2, labelMapping)
    table8 = sort(table6, 'id')
    table10 = cutout(table8, 'id')
    mappings = OrderedDict()
    mappings['id'] = 'id'
    mappings['home_ownership'] = 'ownership', {
        'MORTGAGE': '-1',
        'RENT': '0',
        'OWN': '1'
    }
    mappings['emp_length'] = 'empLength', {'n/a': 0}
    mappings['is_inc_v'] = 'verificationStatus', {
        'Source Verified': 1,
        'Verified': 0,
        'Not Verified': -1
    }
    mappings['pymnt_plan'] = 'paymentPlan', {'n': 0, 'y': 1}
    mappings['initial_list_status'] = 'listStatus', {'f': 0, 'w': 1}
    table3 = fieldmap(table2, mappings)
    table4 = cutout(table2, 'home_ownership', 'is_inc_v', 'pymnt_plan',
                    'initial_list_status', 'term', 'loan_status')
    table5 = merge(table3, table4, key='id')
    table7 = sort(table5, 'id')
    table9 = cutout(table7, 'id')
    featureFileCsv = tocsv(table9, 'featureFileCsv.csv')
    labelsFileCsv = tocsv(table10, 'labelsFileCsv.csv')
    return featureFileCsv, labelsFileCsv
def print_table(ctx):
    """Output a list of pipelines as table."""

    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)

    subset = [
        'id', 'pipeline_status', 'validation_status', 'nb_validation_errors',
        'scraper_required', 'resource_type', 'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))
def sort(self, columns=None, reverse=False):
    """
    Sort the rows of a table.

    `Args:`
        columns: list or str
            Sort by a single column or a list of columns. If ``None``
            then will sort columns from left to right.
        reverse: boolean
            Sort rows in reverse order.

    `Returns:`
        `Parsons Table` and also updates self
    """

    self.table = petl.sort(self.table, key=columns, reverse=reverse)

    return self
def process_animal_extended(shelter_id, session, input_directory):
    table = petl.fromxls(os.path.join(input_directory, 'AnimalIntakeExtended.xls'),
                         sheet='AnimalIntakeExtended')

    ## Because an animal can appear in the intake report more than once,
    ## we must sort the table in order to upsert the latest value
    table_sorted = petl.sort(table, key='Intake Date/Time')

    for row in petl.dicts(table_sorted):
        id = row['Animal ID']

        set_values = {
            'arn': normalize_string(row['ARN']),
            'name': normalize_string(row['Animal Name']),
            'species': normalize_string(row['Species']),
            'primary_breed': normalize_string(row['Primary Breed']),
            'secondary_bred': normalize_string(row['Secondary Breed']),
            'gender': normalize_string(row['Gender']),
            'pre_altered': to_bool(row['Pre Altered']),
            'altered': to_bool(row['Altered']),
            'primary_color': normalize_string(row['Primary Colour']),
            'secondary_color': normalize_string(row['Secondary Colour']),
            'third_color': normalize_string(row['Third Colour']),
            'color_pattern': normalize_string(row['Colour Pattern']),
            'second_color_pattern': normalize_string(row['Second Colour Pattern']),
            'size': normalize_string(row['Size'])
        }

        insert_stmt = insert(Animal)\
            .values(
                id=id,
                shelter_id=shelter_id,  ## TODO: add to index for constraint? make composite pk?
                **set_values)\
            .on_conflict_do_update(
                constraint='animals_pkey',
                set_={
                    'shelter_id': shelter_id,
                    **set_values,
                    'updated_at': func.now()
                })

        session.execute(insert_stmt)

    session.commit()
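# A minimal illustration of why the intake table is sorted before upserting in
# process_animal_extended: after petl.sort on the timestamp, iterating with
# petl.dicts and keying by 'Animal ID' leaves the most recent row for each id
# ("last write wins").  The rows below are invented.
import petl

intake = [('Animal ID', 'Intake Date/Time', 'Animal Name'),
          (1, '2020-01-02', 'Rex'),
          (2, '2020-01-01', 'Mia'),
          (1, '2020-03-05', 'Rex II')]

latest_by_id = {}
for row in petl.dicts(petl.sort(intake, key='Intake Date/Time')):
    latest_by_id[row['Animal ID']] = row   # later (newer) rows overwrite earlier ones

assert latest_by_id[1]['Animal Name'] == 'Rex II'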
def etl(self, *args, **kw):
    table = petl.fromxlsx(self._src_path)

    model = DEPTH_TO_WATER
    self._update_model(model, self._vocab)

    # group table by sys_loc_code
    header = petl.header(table)
    for name, records in petl.rowgroupby(petl.sort(table, 'sys_loc_code'),
                                         'sys_loc_code'):
        records = [dict(zip(header, record)) for record in records]
        record = records[0]

        location_id = self._post_location(record, model)
        thing_id = self._post_thing(record, model, location_id)

        print('---------------')
        print(f'len records {len(records)}')

        # self.add_package(record)
        self.observation.set_records(records)
        self.observation.etl(tids=self._make_tids(thing_id, record),
                             models=(model, ))
def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium', 'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')
        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium', 'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui, key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                         'Trial Started': '2',
                                         'Subscription Started': '3',
                                         'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')

    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
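# A short sketch of the petl.fieldmap convention used in createFacts above: a
# plain string value copies a source field unchanged, while a (source_field, dict)
# pair copies it through a lookup table.  The sample rows are invented.
from collections import OrderedDict
import petl as etl

events_sample = [('type', 'email'),
                 ('Signup Completed', 'a@example.com'),
                 ('Trial Started', 'b@example.com')]

demo_mappings = OrderedDict()
demo_mappings['subscription'] = 'type'                          # straight copy
demo_mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                      'Trial Started': '2'}     # copy via lookup
print(etl.lookall(etl.fieldmap(events_sample, demo_mappings)))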
def order_by_constraint(base_path, table, schema, self_dep_set):
    file_name = base_path + "/content/data/" + table + ".tsv"
    tempfile = NamedTemporaryFile(mode='w',
                                  dir=base_path + "/content/data/",
                                  delete=False)
    table = etl.fromcsv(file_name,
                        delimiter='\t',
                        skipinitialspace=True,
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        escapechar='')
    key_dep_dict = {}
    # print(file_name)
    for constraint in self_dep_set:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(table, 'pwb_index',
                         lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')
    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
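# For context, a tiny sketch of what toposort_flatten (from the third-party
# `toposort` package) expects and returns in order_by_constraint above: a mapping
# of item -> set of items it depends on, flattened into a list in which every
# parent appears before its children.  The node names are invented.
from toposort import toposort_flatten

deps = {'child_a': {'parent'},      # child_a depends on parent
        'child_b': {'child_a'}}     # child_b depends on child_a
print(toposort_flatten(deps))       # -> ['parent', 'child_a', 'child_b']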
print('RENAMING mine_acc_no to proponent_incident_no')
table = etl.rename(table, 'mine_acc_no', 'proponent_incident_no')

print('CREATING create_user = MMS_DO_IMPORT')
table = etl.addfield(table, 'create_user', 'MMS_DO_IMPORT')
table = etl.addfield(table, 'update_user', 'MMS_DO_IMPORT')

# RENAME SOURCE COLUMNS WE WANT TO PRESERVE
print("RENAME insp_cd to mms_insp_cd")
table = etl.rename(table, 'insp_cd', 'mms_insp_cd')

print("RENAME min_acc_no to mine_incident_no")
table = etl.rename(table, 'min_acc_no', 'mine_incident_no')

# force id column, SQL will reset the sequence
table = etl.addrownumbers(table, field='mine_incident_id')

table = etl.sort(table, 'incident_timestamp', reverse=True)

print('UNJOIN Recommendations into separate table')
table, recommendation_table = etl.unjoin(table, 'recommendation', key='mine_incident_id')
recommendation_table = etl.select(recommendation_table, 'recommendation',
                                  lambda x: x is not None and not x.isspace())
recommendation_table = etl.addfield(recommendation_table, 'create_user', 'MMS_DO_IMPORT')
recommendation_table = etl.addfield(recommendation_table, 'update_user', 'MMS_DO_IMPORT')

print("TRUNCATE public.mine_incident_recommendation")
connection.cursor().execute(
    'TRUNCATE TABLE public.mine_incident_recommendation;')
from __future__ import absolute_import, print_function, division


# sort()
########

import petl as etl
table1 = [['foo', 'bar'],
          ['C', 2],
          ['A', 9],
          ['A', 6],
          ['F', 1],
          ['D', 10]]
table2 = etl.sort(table1, 'foo')
table2
# sorting by compound key is supported
table3 = etl.sort(table1, key=['foo', 'bar'])
table3
# if no key is specified, the default is a lexical sort
table4 = etl.sort(table1)
table4


# mergesort()
#############

import petl as etl
table1 = [['foo', 'bar'],
          ['A', 9],
          ['C', 2],
          ['D', 10],
          ['A', 6],
          ['F', 1]]
table2 = [['foo', 'bar'],
          ['B', 3],
          ['D', 10],
          ['A', 10],
          ['F', 4]]
table3 = etl.mergesort(table1, table2, key='foo')
table3.lookall()


# issorted()
############

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['a', 1, True],
          ['b', 3, True],
          ['b', 2]]
etl.issorted(table1, key='foo')
import petl as etl

table = (
    etl
    .fromcsv('Credit_Card_Transactions_for_FY_2017_by_Department.csv')
    .convert('DEPT_NAME', 'lower')
    .addfield('APPROX_AMT_K', lambda row: int(round(float(row.MERCHANDISE_AMT[1:]) / 1000, 0)))
    .addfield('APPROX_AMT', lambda row: '$' + str(row.APPROX_AMT_K) + 'K')
)

new_table = table.cut('DEPT_NAME', 'APPROX_AMT')
sort_table = etl.sort(new_table, key=['DEPT_NAME'])

print '\nDepartment wise credit transaction for year 2017'
print sort_table

##Sample Output:
##
##Department wise credit transaction for year 2017
##+--------------------------------+------------+
##| DEPT_NAME                      | APPROX_AMT |
##+================================+============+
##| academia antonia alonso        | $18K       |
##+--------------------------------+------------+
##| advisory counc exceptnl citizn | $2K        |
##+--------------------------------+------------+
##| appoquinimink school district  | $774K      |
##+--------------------------------+------------+
##| auditor of accounts            | $78K       |
                        escapechar='')
    key_dep_dict = {}
    print(file_name)
    for constraint in value:
        child_dep, parent_dep = constraint.split(':')
        data = etl.values(table, child_dep, parent_dep)
        for d in data:
            key_dep_set = {d[1]}
            key_dep_dict.update({d[0]: key_dep_set})

    key_dep_list = toposort_flatten(key_dep_dict)
    table = etl.addfield(
        table, 'pwb_index',
        lambda rec: int(key_dep_list.index(rec[child_dep])))
    table = etl.sort(table, 'pwb_index')
    table = etl.cutout(table, 'pwb_index')

    writer = csv.writer(tempfile,
                        delimiter='\t',
                        quoting=csv.QUOTE_NONE,
                        quotechar='',
                        lineterminator='\n',
                        escapechar='')
    writer.writerows(table)
    shutil.move(tempfile.name, file_name)
    open(tsv_done_file, 'a').close()

ddl = []
    mappings['utm_medium'] = 'utm_medium'
    mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none', 'social': 'none'}
    mappings['utm_campaign_type'] = 'utm_campaign'
    mappings['email'] = 'email'
    mappings['subscription'] = 'type'
    mappings['sub_order'] = 'type', {
        'Signup Completed': '1',
        'Trial Started': '2',
        'Subscription Started': '3',
        'Subscription Ended': '4'
    }
    mappings['created_at'] = 'created_at'

    # Mapping
    stage_mapping = etl.fieldmap(stage_m_s, mappings)

    # Sort
    stage_mapping_ordered = etl.sort(stage_mapping, key=['created_at', 'email', 'sub_order'])

    # Datetime split
    t1 = etl.split(stage_mapping_ordered, 'created_at', 'T', ['date', 'time'],
                   include_original=True)
    t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
    stage_ready = etl.split(t2, 'time', ':', ['hour', 'minute', 'second'])

    # Export as csv to load folder
    etl.tocsv(stage_ready, 'load/facts.csv')
def sales_summary(start_dt=None, end_dt=None):
    """tally up gross (sale over list) profits
    TODO: tally up net profits (gross profit vs inventory purchase total)

    TODO: Keyword Arguments:
        start_dt {[type]} -- datetime for start of query (default: {None})
        end_dt {[type]} -- datetime for end of query (default: {None})

    Returns:
        [dict] -- various types of sales information, stored in a dictionary.
    """

    # products = db.session.query(Product).all()
    # sales = db.session.query(Sale).all()

    # retrieve existing tables
    products_records = etl.fromdb(db.engine, 'SELECT * FROM product')
    sales_records = etl.fromdb(db.engine, 'SELECT * FROM sale')

    # join product info to sales data
    sales_data = etl.join(sales_records,
                          products_records,
                          lkey='product_id',
                          rkey='id')

    # prep joined sales data for tabulation
    sales_data = etl.convert(sales_data, 'date', lambda dt: format_date(dt))
    sales_data = etl.sort(sales_data, 'date')
    sales_data = etl.convert(sales_data, 'quantity',
                             lambda q: handle_none(q, replace_with=1))
    sales_data = etl.addfield(sales_data, 'profit',
                              lambda rec: calculate_profit(rec))
    sales_data = etl.addfield(sales_data, 'gross_sales',
                              lambda rec: calculate_gross_sales(rec))

    # summarize data into charting-friendly data structures
    chart_count = etl.fold(sales_data, 'date', operator.add, 'quantity',
                           presorted=True)
    chart_count = etl.rename(chart_count, {'key': 'x', 'value': 'y'})
    chart_count, chart_count_missing_date = etl.biselect(
        chart_count, lambda rec: rec.x is not None)
    # print(chart_count)
    # etl.lookall(chart_count)

    chart_gross = etl.fold(sales_data, 'date', operator.add, 'gross_sales',
                           presorted=True)
    chart_gross = etl.rename(chart_gross, {'key': 'x', 'value': 'y'})
    chart_gross, chart_gross_missing_date = etl.biselect(
        chart_gross, lambda rec: rec.x is not None)
    # print(chart_gross)
    # etl.lookall(chart_gross)

    chart_profit = etl.fold(sales_data, 'date', operator.add, 'profit',
                            presorted=True)
    chart_profit = etl.rename(chart_profit, {'key': 'x', 'value': 'y'})
    chart_profit, chart_profit_missing_date = etl.biselect(
        chart_profit, lambda rec: rec.x is not None)

    # tabulate some figures
    gross_sales = 0
    profits = 0
    for sale in etl.dicts(sales_data):
        profits += calculate_profit(sale)
        gross_sales += calculate_gross_sales(sale)

    # for i in etl.dicts(chart_count):
    #     print(i)
    # for i in etl.dicts(chart_gross):
    #     print(i)

    return {
        'gross_sales': gross_sales,
        'profits': profits,
        'chart_gross': list(etl.dicts(chart_gross)),
        'chart_gross_missing_date': list(etl.dicts(chart_gross_missing_date)),
        'chart_profit': list(etl.dicts(chart_profit)),
        'chart_profit_missing_date': list(etl.dicts(chart_profit_missing_date)),
        'chart_count': list(etl.dicts(chart_count)),
        'chart_count_missing_date': list(etl.dicts(chart_count_missing_date))
    }
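# A compact sketch of the petl.fold call used in sales_summary above: within each
# 'date' group it reduces the 'quantity' values with operator.add, yielding a
# two-column ('key', 'value') table, which is why the code then renames 'key' and
# 'value' to 'x' and 'y'.  The dates and quantities here are invented.
import operator
import petl as etl

sales = [('date', 'quantity'),
         ('2021-01-01', 2),
         ('2021-01-01', 3),
         ('2021-01-02', 1)]

daily = etl.fold(etl.sort(sales, 'date'), 'date', operator.add, 'quantity',
                 presorted=True)
print(etl.lookall(daily))   # ('2021-01-01', 5), ('2021-01-02', 1)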
# cut is used to keep only the columns listed below from the table;
# it is not strictly necessary for table1 because the fields listed are all of
# the fields present in table1
data = etl.cut(table1, 'iso_code', 'location', 'date', 'total_cases', 'new_cases',
               'total_deaths', 'new_deaths', 'total_cases_per_million',
               'new_cases_per_million', 'total_deaths_per_million',
               'new_deaths_per_million', 'total_tests', 'new_tests',
               'total_tests_per_thousand', 'new_tests_per_thousand', 'tests_units')

# select rows on the basis of the current date:
# num contains only the data for 2020-04-30 from each country, i.e. the latest data
num = etl.select(data, 'date', lambda r: r == '2020-04-30')

# sort the data on the basis of iso_code;
# this makes it easier to join the data in further steps
table1_sort = etl.sort(num, key='iso_code')

# counter variable to count the number of countries
count = 0

# values is used to read the data from the table
for i in etl.values(table1_sort, 'iso_code', 'location', 'date', 'total_cases',
                    'new_cases', 'total_deaths', 'new_deaths',
                    'total_cases_per_million', 'new_cases_per_million',
                    'total_deaths_per_million', 'new_deaths_per_million',
                    'total_tests', 'new_tests', 'total_tests_per_thousand',
                    'new_tests_per_thousand', 'tests_units'):
    # take 15 countries from the table for integration;
    # 16 is used because the row seen when count is 0 contains unneeded data
    if count == 16:
""" DB-related tests, separated from main unit tests because they need local database setup prior to running. """ import sys sys.path.insert(0, './src') from petl import dummytable, sort, nrows import logging logging.basicConfig(level=logging.DEBUG) t = (('foo', 'bar'), ('C', 2), ('A', 9), ('B', 6), ('E', 1), ('D', 10)) u = sort(t, buffersize=3) print 'buffer up the data' print nrows(u) print 'create iterators' it1 = iter(u) it2 = iter(u) print 'iterate' print 1, it1.next() print 1, it1.next() print 1, it1.next() print 2, it2.next() print 2, it2.next() print 1, it1.next() print 1, it1.next()
        transformed_outreaches.table, key="created_date", aggregation=len
    )
)
leaderboard = petl.aggregate(
    transformed_outreaches.table, key="name", aggregation=len
)
calls_per_office = get_calls_per_office(transformed_outreaches)

# rename columns for spreadsheet
calls_per_day.rename_column('value', 'num_calls')
calls_per_day = calls_per_day.rename_column('created_date', 'day')
calls_per_office = calls_per_office.rename_column('name', 'office')

# Sort leaderboard by num calls per person
leaderboard_ranked = Table(petl.sort(leaderboard, 'value', reverse=True))
leaderboard_ranked = leaderboard_ranked.rename_column('value', 'num_calls')

# Get set up spreadsheet and errors spreadsheet
spreadsheet_id = "1fPlKWVtpDWid06R8oi0bHgch1ShYovYyks2aSZKY6nY"

# Push to Google Sheets
parsons_sheets.overwrite_sheet(
    spreadsheet_id,
    calls_per_day,
    worksheet="calls per day",
    user_entered_value=False,
)
parsons_sheets.overwrite_sheet(
    spreadsheet_id,
    leaderboard_ranked,
    worksheet="leaderboard",
    user_entered_value=False,
# gets the information from ppms and creates a csv file
r = requests.post(url, data=payload, headers=headers)
f = open('todays_bookings.csv', 'wb')
f.write(r.text)
f.close()

# Load the table
table1 = petl.fromcsv('todays_bookings.csv')
# Alter the columns
table2 = petl.cut(table1, ' Object', ' User', ' Start time', ' End time',
                  ' Training', ' Assisted')
# Reorder the user names
table3 = petl.convert(table2, ' User',
                      lambda row: " ".join(re.findall("\S+", row)[::-1]))
# Reorder the rows
table4 = petl.sort(table3, key=[' Object', ' Start time'])
# Save to new file
petl.tocsv(table4, 'new.csv')

# Reopens the CSV file (stupid, I know) and removes unnecessary characters
csvfile = ""
ppmscal = csv.reader(open('new.csv'), delimiter=',')
for row in ppmscal:
    csvfile += str(row) + '\n'
csvtxt = csvfile.replace("(", "").replace(")", "").replace("'", "").replace(
    "[", "").replace("]", "")
csvtxt = csvtxt[:-1]

# The CSV to HTML code has come from https://www.rosettacode.org/wiki/CSV_to_HTML_translation#Python
# It creates an html file of the CSV so I can load, and auto-refresh it in browser
""" import sys sys.path.insert(0, './src') from petl import dummytable, sort, nrows import logging logging.basicConfig(level=logging.DEBUG) t = (('foo', 'bar'), ('C', 2), ('A', 9), ('B', 6), ('E', 1), ('D', 10)) u = sort(t, buffersize=3) print 'buffer up the data' print nrows(u) print 'create iterators' it1 = iter(u) it2 = iter(u) print 'iterate' print 1, it1.next() print 1, it1.next() print 1, it1.next() print 2, it2.next() print 2, it2.next() print 1, it1.next()
table2 = tail(table1, 4)
look(table2)


# sort

table1 = [['foo', 'bar'],
          ['C', 2],
          ['A', 9],
          ['A', 6],
          ['F', 1],
          ['D', 10]]

from petl import sort, look
look(table1)
table2 = sort(table1, 'foo')
look(table2)
# sorting by compound key is supported
table3 = sort(table1, key=['foo', 'bar'])
look(table3)
# if no key is specified, the default is a lexical sort
table4 = sort(table1)
look(table4)


# melt

table1 = [['id', 'gender', 'age'],
          [1, 'F', 12],
          [2, 'M', 17],
          [3, 'M', 16]]
international_code = "(+61)" with open(IN_FILE, 'r') as infile, open(OUT_FILE, "w") as outfile: csv_reader = csv.reader(infile) writer = csv.writer(outfile) headers = next(csv_reader, None) #skipping header row writer.writerow(headers) for row in csv_reader: number_column = row[5] state_column = row[3] clean_num = re.sub("\D", "", row[5])[-8:] formatted_num = international_code + " " + regional_code[ state_column] + " " + clean_num row[5] = formatted_num writer.writerow(row) services = petl.fromcsv(SERVICES_FILE) offices = petl.fromcsv(OUT_FILE) offices = offices.rename({"Contact Name": "Office", "Phone Number": "Phone"}) offices = petl.cutout(offices,"State","Postcode") locations = petl.fromcsv(LOC_FILE) locations = locations.rename({"officeID": "OfficeID"}) office_service = petl.join(services, offices, key='OfficeID') office_service_locations = petl.join( office_service, locations, key='OfficeID') office_service_locations = petl.convert(office_service_locations,'OfficeServiceID',int) office_service_locations = petl.sort(office_service_locations,'OfficeServiceID') petl.tocsv(office_service_locations, 'office_service_locations.csv')
]

table1 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-red.csv'), table_header)),
    "Type", "Red")
table2 = etl.addfield(
    etl.convertnumbers(
        etl.setheader(etl.fromcsv('winequality-white.csv'), table_header)),
    "Type", "White")

#print(etl.head(table1))
#print(etl.head(table2))

table1_filtered = etl.select(table1, "Quality", lambda v: v > 6)
table2_filtered = etl.select(table2, "Quality", lambda v: v > 4)

good_wines = etl.cat(table1_filtered, table2_filtered)

good_wines_enhanced = etl.addfields(
    good_wines,
    [("Max Acidity",
      lambda rec: rec["Fixed Acidity"] + rec["Volatile Acidity"]),
     ("Locked SO2", lambda rec: rec["Total SO2"] - rec["Free SO2"])])
#print(etl.head(good_wines_enhanced))
#print(etl.tail(good_wines_enhanced))

gwe_sorted = etl.sort(good_wines_enhanced, key=["Quality", "Sugar"])
#print(etl.head(gwe_sorted))

print(etl.lookall(etl.tail(gwe_sorted, 500)))