def _test_dbo(write_dbo, read_dbo=None):
    if read_dbo is None:
        read_dbo = write_dbo

    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2))
    expect_appended = (('foo', 'bar'),
                       ('a', 1),
                       ('b', 2),
                       ('a', 1),
                       ('b', 2))
    actual = etl.fromdb(read_dbo, 'SELECT * FROM test')

    debug('verify empty to start with...')
    debug(etl.look(actual))
    ieq(expect_empty, actual)

    debug('write some data and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('append some data and verify...')
    etl.appenddb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('overwrite and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, overwrite and verify')
    etl.todb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, append and verify')
    etl.appenddb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('try a single row')
    etl.todb(etl.head(expect, 1), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(etl.head(expect, 1), actual)
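# A minimal sketch (not from the original) of driving _test_dbo against an
# in-memory SQLite database. It assumes the empty 'test' table is created up
# front, as the helper expects, and that the module's debug/ieq helpers are in
# scope alongside petl (imported as etl).
import sqlite3

connection = sqlite3.connect(':memory:')
connection.execute('CREATE TABLE test (foo TEXT, bar INTEGER)')
_test_dbo(connection)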
def __bool__(self):
    # Try to get a single row from our table
    head_one = petl.head(self.table, 1)
    # See if our single row is empty
    return petl.nrows(head_one) > 0
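# A minimal sketch (assumed context, not from the original) of a wrapper class
# using the __bool__ above, so a petl table can be tested for emptiness with a
# plain truth check without counting every row.
import petl


class TableWrapper:
    def __init__(self, table):
        self.table = table

    def __bool__(self):
        # Only the first row is pulled, so large tables stay cheap to test.
        return petl.nrows(petl.head(self.table, 1)) > 0


empty = TableWrapper([('foo', 'bar')])          # header only, no data rows
populated = TableWrapper([('foo', 'bar'), ('a', 1)])
assert not empty
assert populated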
def drained_entries(ctx: typer.Context, issues, entries, project):
    config = ctx.meta['config']
    empty_entries, unset_entries = petl.biselect(
        entries, lambda row: row['issue_id'] is None)
    drain_issues = list(
        petl.dicts(
            transform.select_drain_issues(
                issues,
                assignee_id=ctx.meta['rdm_user']['id'],
                drain_cf_id=get_proj_attr(config, project, 'rdm_drain_cf_id'))))

    if not len(drain_issues):
        log.error('No drain issues found')
        return petl.head(unset_entries, 0), entries

    if len(drain_issues) > 1:
        log.warning(
            f'Found {len(drain_issues)} drain issues. Will use only the first one')

    drain_issue = drain_issues[0]
    drained = petl.addfield(
        petl.cutout(empty_entries, 'issue_id'), 'issue_id', drain_issue['id'])
    return drained, unset_entries
def createDimCampaign():
    try:
        tbl_campaign = [['campaign_name', 'campaign_started', 'campaign_ended'],
                        ['none', '2014-04-28T00:00:00', '2018-09-30T00:00:00']]
        dim_campaign = etl.head(tbl_campaign, 1)
        # Export as csv to load folder
        etl.tocsv(dim_campaign, 'load/dim_campaign.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))
def load(request):
    filename = request.GET.get('name', '')
    fullpath = settings.DATA_DIR + filename
    """
    detector = UniversalDetector()
    file_open = open(fullpath)
    for line in file_open.readlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    file_open.close()
    """
    encoding = 'ascii'  # detector.result['encoding']
    response_data = {}
    a = tryExtractors(fullpath, encoding)
    response_data['result'] = [row for row in etl.head(a)]
    response_data['headers'] = etl.header(a)
    typeInference(a)
    return HttpResponse(json.dumps(response_data), content_type="application/json")
def get_context_data(self, **kwargs):
    context = super().get_context_data(**kwargs)
    context["filename"] = self.object.downloaded_file.name.split(os.path.sep)[-1]
    context["count_query_kwarg"] = self.count_query_kwarg
    table = petl.fromcsv(self.object.downloaded_file)
    context["header"] = petl.header(table)
    try:
        record_count_to_show = int(self.request.GET.get(self.count_query_kwarg))
    except (TypeError, ValueError):
        record_count_to_show = self.count_increment
    # Potentially expensive, cache / save in database for dataset
    if petl.nrows(table) > record_count_to_show:
        context["load_more_url"] = (
            f"{self.request.path}?{self.count_query_kwarg}"
            f"={record_count_to_show + self.count_increment}"
        )
    context["rows"] = petl.records(petl.head(table, record_count_to_show))
    return context
def create_table(table, dbo, tablename, schema=None, commit=True,
                 constraints=True, metadata=None, dialect=None, sample=1000):
    """
    Create a database table based on a sample of data in the given table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to load
    dbo : database object
        DB-API 2.0 connection, callable returning a DB-API 2.0 cursor, or
        SQLAlchemy connection, engine or session
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    commit : bool
        If True commit the changes
    constraints : bool
        If True use length and nullable constraints (only relevant if
        create=True)
    metadata : sqlalchemy.MetaData
        Custom table metadata (only relevant if create=True)
    dialect : string
        One of {'access', 'sybase', 'sqlite', 'informix', 'firebird', 'mysql',
        'oracle', 'maxdb', 'postgresql', 'mssql'} (only relevant if
        create=True)
    sample : int
        Number of rows to sample when inferring types etc., set to 0 to use
        the whole table (only relevant if create=True)

    """

    if sample > 0:
        table = head(table, sample)
    sql = make_create_table_statement(table, tablename, schema=schema,
                                      constraints=constraints,
                                      metadata=metadata, dialect=dialect)
    _execute(sql, dbo, commit=commit)
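# A hypothetical usage sketch (not from the original): infer a CREATE TABLE
# statement from a small sample of rows and execute it against an in-memory
# SQLite connection. Assumes create_table above is in scope and that
# SQLAlchemy is installed, since the column types are inferred through it.
import sqlite3

data = [['foo', 'bar'],
        ['a', 1],
        ['b', 2]]
connection = sqlite3.connect(':memory:')
create_table(data, connection, 'example', dialect='sqlite', sample=100)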
def split_dataset(dataset, p_train_data, split_mode):
    fields = list(fieldnames(dataset))
    size_dataset = len(values(dataset, fields[0]))
    size_train_data = int(round(size_dataset * p_train_data))
    size_test_data = abs(size_train_data - size_dataset)

    if split_mode == 'normal':
        train_data = head(dataset, size_train_data - 1)
        if size_test_data == 0:
            test_data = []
        else:
            test_data = tail(dataset, size_test_data - 1)

    # TODO: shuffle mode still needs to be added (see the sketch below)
    return train_data, test_data
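# A possible sketch (an assumption, not the original author's code) for the
# missing shuffle mode: materialise the data rows, shuffle them, then split
# according to the same train proportion. The name shuffled_split and the
# seed parameter are hypothetical.
import random

import petl as etl


def shuffled_split(dataset, p_train_data, seed=None):
    header = etl.header(dataset)
    rows = list(etl.data(dataset))
    random.Random(seed).shuffle(rows)
    n_train = int(round(len(rows) * p_train_data))
    # Re-attach the header to each half so both remain valid petl tables.
    train_data = etl.wrap([header] + rows[:n_train])
    test_data = etl.wrap([header] + rows[n_train:])
    return train_data, test_data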
                    'new_tests_per_thousand', 'tests_units'):
    # take only 15 countries from the table for integration; 16 is used
    # because the row at count 0 contains unnecessary data
    if count == 16:
        break
    # data1 was already declared with a header in the code above; the
    # remaining rows are appended here
    data1.append(i)
    count = count + 1

# remove the unnecessary row from the list, which was at data1[1]
data1.pop(1)

# convert the list into a table
table_old = etl.head(data1, 15)

# import the latest covid-19 data and convert the fields that will be needed
table2 = (etl.fromcsv('current_covid.csv')
          .convert('median_age', float)
          .convert('aged_65_older', float)
          .convert('aged_70_older', float))

# as with the table above, a list is declared with a header
table2_header = [['iso_code', 'median_age', 'aged_65_older', 'aged_70_older']]

table2_data = etl.cut(table2, 'iso_code', 'date', 'median_age',
                      'aged_65_older', 'aged_70_older')
table2_dated = etl.select(table2_data, 'date', lambda v: v == '2020-04-30')
table2_sort = etl.sort(table2_dated, key='iso_code')

count = 0
for j in etl.values(table2_sort, 'iso_code', 'median_age',
                    'aged_65_older', 'aged_70_older'):
table4


# head()
########

import petl as etl

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 5],
          ['d', 7],
          ['f', 42],
          ['f', 3],
          ['h', 90]]
table2 = etl.head(table1, 4)
table2


# tail()
########

import petl as etl

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 5],
          ['d', 7],
          ['f', 42],
          ['f', 3],
          ['h', 90],
stores = etl.fromcsv('stores.csv')

# Open XML document
locations = etl.fromxml('locations.xml', 'store',
                        {'Name': 'Name', 'Lat': 'Lat', 'Lon': 'Lon'})
print(locations)

# Set up the output table
output_table = [["ID", "Name", "Suburb", "State", "Postcode"]]
store_id = 1

# Read through stores.csv to generate output_table
store = etl.cut(stores, 'Name', 'Suburb', 'State', 'Postcode').distinct()
print(store)
for s in etl.values(store, 'Name', 'Suburb', 'State', 'Postcode'):
    # unpack the row tuple so each field lands in its own column
    output_table.append([store_id, *s])
    store_id += 1
print(output_table)

# Merge and join XML and CSV together
merge_output = etl.join(stores, locations, key="Name")
print(merge_output)
store_table = etl.cut(merge_output, 'ID', 'Name', 'Suburb', 'State',
                      'Postcode', 'Lat', 'Lon')
print(etl.head(store_table, 5))

# Export to CSV file
etl.tocsv(merge_output, 'store_locations.csv')
# Another table, with the campaign name and start and end dates, will be
# connected with the facts table
dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
dim_campaigntype_rename = etl.rename(dim_campaigntype_cut,
                                     {'utm_campaign': 'campaign_type'})
dim_campaigntype = etl.distinct(dim_campaigntype_rename)

# Export as csv to load folder
etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')


# Dim Campaign
# Note:
# Slowly changing dimension.
# No data for now, meaning that until this is defined we will fill everything
# with "none".
# Let's define that this will be solved by the end of September, and that the
# start date was on the 28th of April 2018.
tbl_campaign = [['campaign_name', 'campaign_started', 'campaign_ended'],
                ['none', '2014-04-28T00:00:00', '2018-09-30T00:00:00']]
dim_campaign = etl.head(tbl_campaign, 1)

# Export as csv to load folder
etl.tocsv(dim_campaign, 'load/dim_campaign.csv')


# Dim Time
# TODO:
# Load a full year (2018) with the most simple datetime breakdown:
# year, month, day, hour, minute, second.
# For the full loading process, use the reference in references.txt.
# This should be a procedure with all the validation logic in it, so it can
# create the next X months when it is called.
# (a possible sketch follows after this block)


# Facts
# This facts table will be the staging area with all the info needed to
# quickly update with the dimension keys and load into the facts table.
# The facts table will have columns to match each column on the Dim Time
# table, to make it easier to get the reference key.
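# A possible sketch (an assumption, not the original author's procedure) for
# the Dim Time TODO above: generate one row per hour of 2018 with the simple
# year/month/day/hour/minute/second breakdown. The names build_dim_time,
# time_key and the output path are hypothetical.
import datetime

import petl as etl


def build_dim_time(year=2018):
    rows = [['time_key', 'year', 'month', 'day', 'hour', 'minute', 'second']]
    current = datetime.datetime(year, 1, 1)
    end = datetime.datetime(year + 1, 1, 1)
    while current < end:
        rows.append([current.isoformat(), current.year, current.month,
                     current.day, current.hour, current.minute, current.second])
        current += datetime.timedelta(hours=1)
    return etl.wrap(rows)


etl.tocsv(build_dim_time(), 'load/dim_time.csv')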
# head
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 5],
          ['d', 7],
          ['f', 42],
          ['f', 3],
          ['h', 90]]

from petl import head, look
look(table1)
table2 = head(table1, 4)
look(table2)


# tail
table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 5],
          ['d', 7],
          ['f', 42],
          ['f', 3],
          ['h', 90],
          ['k', 12],
          ['l', 77],
# coding:utf8
import petl as etl

table1 = [('foo', 'bar', 'baz'),
          ('apple', 1, 2.5),
          ('orange', 3, 4.5),
          ('pears', 5, 6.5),
          ('bananer', 7, 8.5),
          ('cat', 9, 10.5)]

# head: first 4 rows
table_head = etl.head(table1, 4)
print(table_head)

# tail: last 4 rows
table_tail = etl.tail(table1, 4)
print(table_tail)

# rowslice
rowsliceTb = etl.rowslice(table1, 2)
print(rowsliceTb)

rowsliceTb_2_4 = etl.rowslice(table1, 2, 4)
print(rowsliceTb_2_4)

# start at row index 1, stop before 5, step 2
rowsliceTb_1_2_5 = etl.rowslice(table1, 1, 5, 2)
print(rowsliceTb_1_2_5)

# cut
cutTb = etl.cut(table1, 'foo', 'bar')
print(cutTb)

# index starts from 0
cutTb_0_2 = etl.cut(table1, 0, 2)