def _test_dbo(write_dbo, read_dbo=None):
    if read_dbo is None:
        read_dbo = write_dbo

    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2))
    expect_appended = (('foo', 'bar'),
                       ('a', 1),
                       ('b', 2),
                       ('a', 1),
                       ('b', 2))
    actual = etl.fromdb(read_dbo, 'SELECT * FROM test')

    debug('verify empty to start with...')
    debug(etl.look(actual))
    ieq(expect_empty, actual)

    debug('write some data and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('append some data and verify...')
    etl.appenddb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('overwrite and verify...')
    etl.todb(expect, write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, overwrite and verify')
    etl.todb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect, actual)

    debug('cut, append and verify')
    etl.appenddb(etl.cut(expect, 'bar', 'foo'), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(expect_appended, actual)

    debug('try a single row')
    etl.todb(etl.head(expect, 1), write_dbo, 'test')
    debug(etl.look(actual))
    ieq(etl.head(expect, 1), actual)

def transform(data, data_set):
    data = data['observations']
    data = etl.fromdicts(data, header=['value', 'realtime_start',
                                       'realtime_end', 'date'])
    data = etl.cut(data, 'date', 'value')
    data = etl.rename(data, {'date': 'date', 'value': data_set.lower()})
    data = etl.convert(data, data_set.lower(),
                       lambda val: 0 if val == '.' else val)
    return data

def test_dictlookupone():

    t1 = (('foo', 'bar'),
          ('a', 1),
          ('b', 2),
          ('b', 3))

    try:
        dictlookupone(t1, 'foo', strict=True)
    except DuplicateKeyError:
        pass  # expected
    else:
        assert False, 'expected error'

    # relax
    actual = dictlookupone(t1, 'foo', strict=False)
    # first wins
    expect = {'a': {'foo': 'a', 'bar': 1},
              'b': {'foo': 'b', 'bar': 2}}
    eq_(expect, actual)

    # key only
    actual = dictlookupone(cut(t1, 'foo'), 'foo')
    expect = {'a': {'foo': 'a'},
              'b': {'foo': 'b'}}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test compound key
    actual = dictlookupone(t2, ('foo', 'bar'), strict=False)
    expect = {('a', 1): {'foo': 'a', 'bar': 1, 'baz': True},
              ('b', 2): {'foo': 'b', 'bar': 2, 'baz': False},
              ('b', 3): {'foo': 'b', 'bar': 3, 'baz': True}}  # first wins
    eq_(expect, actual)

def test_dictlookup():

    t1 = (('foo', 'bar'),
          ('a', 1),
          ('b', 2),
          ('b', 3))

    actual = dictlookup(t1, 'foo')
    expect = {'a': [{'foo': 'a', 'bar': 1}],
              'b': [{'foo': 'b', 'bar': 2}, {'foo': 'b', 'bar': 3}]}
    eq_(expect, actual)

    # key only
    actual = dictlookup(cut(t1, 'foo'), 'foo')
    expect = {'a': [{'foo': 'a'}],
              'b': [{'foo': 'b'}, {'foo': 'b'}]}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test compound key
    actual = dictlookup(t2, ('foo', 'bar'))
    expect = {('a', 1): [{'foo': 'a', 'bar': 1, 'baz': True}],
              ('b', 2): [{'foo': 'b', 'bar': 2, 'baz': False}],
              ('b', 3): [{'foo': 'b', 'bar': 3, 'baz': True},
                         {'foo': 'b', 'bar': 3, 'baz': False}]}
    eq_(expect, actual)

def test_lookup():

    t1 = (('foo', 'bar'),
          ('a', 1),
          ('b', 2),
          ('b', 3))

    # lookup one column on another
    actual = lookup(t1, 'foo', 'bar')
    expect = {'a': [1], 'b': [2, 3]}
    eq_(expect, actual)

    # test default value - tuple of whole row
    actual = lookup(t1, 'foo')  # no value selector
    expect = {'a': [('a', 1)], 'b': [('b', 2), ('b', 3)]}
    eq_(expect, actual)

    # test default value - key only
    actual = lookup(cut(t1, 'foo'), 'foo')
    expect = {'a': [('a',)], 'b': [('b',), ('b',)]}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test value selection
    actual = lookup(t2, 'foo', ('bar', 'baz'))
    expect = {'a': [(1, True)],
              'b': [(2, False), (3, True), (3, False)]}
    eq_(expect, actual)

    # test compound key
    actual = lookup(t2, ('foo', 'bar'), 'baz')
    expect = {('a', 1): [True],
              ('b', 2): [False],
              ('b', 3): [True, False]}
    eq_(expect, actual)

def extract_column(table, column):
    '''
    Return a list of all values in a given column, minus the header row.

    :param table: A petl data table
    :param column: The name of the column to extract
    '''
    a = petl.cut(table, column)
    return [i[0] for i in a][1:]

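# A minimal usage sketch for extract_column() above. The sample table is
# hypothetical; assumes petl is installed and the function above is in scope.
import petl

sample = [['name', 'age'], ['alice', 30], ['bob', 42]]
print(extract_column(sample, 'age'))  # -> [30, 42]
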
def test_outerjoin_novaluefield():

    table1 = (('id', 'colour'),
              (0, 'black'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'),
              (5, 'yellow'),
              (7, 'white'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'),
              (0, 'black', None),
              (1, 'blue', 'circle'),
              (2, 'red', None),
              (3, 'purple', 'square'),
              (4, None, 'ellipse'),
              (5, 'yellow', None),
              (7, 'white', None))

    actual = outerjoin(table1, table2, key='id')
    ieq(expect, actual)
    actual = outerjoin(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = outerjoin(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = outerjoin(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)

def _test_rightjoin_novaluefield(rightjoin_impl):

    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'))
    table2 = (('id', 'shape'),
              (0, 'triangle'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'),
              (5, 'pentagon'))
    expect = (('id', 'colour', 'shape'),
              (0, None, 'triangle'),
              (1, 'blue', 'circle'),
              (3, 'purple', 'square'),
              (4, None, 'ellipse'),
              (5, None, 'pentagon'))

    actual = rightjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = rightjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = rightjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = rightjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)

def _test_leftjoin_novaluefield(leftjoin_impl):

    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'),
              (5, 'yellow'),
              (7, 'orange'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'),
              (1, 'blue', 'circle'),
              (2, 'red', None),
              (3, 'purple', 'square'),
              (5, 'yellow', None),
              (7, 'orange', None))

    actual = leftjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = leftjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = leftjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = leftjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)

def createDimMedium(events):
    try:
        dim_medium_cut = etl.cut(events, 'utm_medium')
        dim_medium_rename = etl.rename(dim_medium_cut,
                                       {'utm_medium': 'medium'})
        dim_medium = etl.distinct(dim_medium_rename)
        # Export as csv to load folder
        etl.tocsv(dim_medium, 'load/dim_medium.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))

def createDimSubscriptions(events):
    try:
        dim_subscriptions_cut = etl.cut(events, 'type')
        dim_subscriptions_rename = etl.rename(
            dim_subscriptions_cut, {'type': 'subscription_name'})
        dim_subscriptions = etl.distinct(dim_subscriptions_rename)
        # Export as csv to load folder
        etl.tocsv(dim_subscriptions, 'load/dim_subscriptions.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))

def createDimCampaignType(events):
    try:
        dim_campaigntype_cut = etl.cut(events, 'utm_campaign')
        dim_campaigntype_rename = etl.rename(
            dim_campaigntype_cut, {'utm_campaign': 'campaign_type'})
        dim_campaigntype = etl.distinct(dim_campaigntype_rename)
        # export as csv to load folder
        etl.tocsv(dim_campaigntype, 'load/dim_campaigntype.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))

def transform_data(data):
    tbl_data = petl.fromdicts(data)
    tbl_data = petl.convert(
        tbl_data, {k: v['value'] for k, v in fields_to_transform.items()})
    tbl_data = petl.rename(
        tbl_data, {k: v['key'] for k, v in fields_to_transform.items()})
    tbl_data_allowed = petl.cut(tbl_data, *allowed_fields)
    return tbl_data_allowed

def anyServices():
    # reading the csv file
    csv = pt.fromcsv('services.csv')
    # json content type declaration
    response.headers['Content-type'] = 'application/json'
    response.headers['Access-Control-Allow-Origin'] = '*'
    # cutting out the required column names
    jsonData = pt.cut(csv, 'ServiceID', 'Service')
    # convert the dictionary data into json data
    jsonData = json.JSONEncoder().encode(list(pt.dicts(jsonData)))
    # returning the json data
    return jsonData

def get_relationships(self):
    "Parses a list of `Relationship` objects."
    core_file = _find_loinc_table_core_file(self.uri.path)
    core = etl.fromcsv(core_file, delimiter=',')
    core = etl.cut(core, ['LOINC_NUM', 'LONG_COMMON_NAME'])

    hierarchy_file = _find_multi_axial_hierarchy_file(self.uri.path)
    hierarchy = etl.fromcsv(hierarchy_file, delimiter=',')
    hierarchy = etl.leftjoin(hierarchy, core, lkey='CODE', rkey='LOINC_NUM')
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE',
                                    'CODE_TEXT', 'LONG_COMMON_NAME'])
    hierarchy = etl.fillright(hierarchy)
    hierarchy = etl.cut(hierarchy, ['IMMEDIATE_PARENT', 'CODE',
                                    'LONG_COMMON_NAME'])
    hierarchy = etl.rename(hierarchy, 'LONG_COMMON_NAME', 'CODE_TEXT')

    parents = etl.cut(hierarchy, ['CODE', 'CODE_TEXT'])
    hierarchy = etl.selectne(hierarchy, 'IMMEDIATE_PARENT', '')
    hierarchy = etl.leftjoin(hierarchy, parents, lkey='IMMEDIATE_PARENT',
                             rkey='CODE', lprefix='source.',
                             rprefix='target.')
    hierarchy = etl.distinct(hierarchy)

    if self.versioned:
        version = _parse_version(hierarchy_file)
        hierarchy = etl.addfield(hierarchy, 'version', version)

    hierarchy = etl.rowmapmany(hierarchy, _to_json, ['relationship'])
    return hierarchy

def exercise_ss_cursor(setup_dbo, ss_dbo):
    print '=' * len(repr(ss_dbo))
    print 'EXERCISE WITH SERVER-SIDE CURSOR'
    print repr(ss_dbo)
    print '=' * len(repr(ss_dbo))
    print

    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 1))
    expect_appended = (('foo', 'bar'),
                       ('a', 1),
                       ('b', 1),
                       ('a', 1),
                       ('b', 1))
    actual = fromdb(ss_dbo, 'SELECT * FROM test')

    print 'verify empty to start with...'
    ieq(expect_empty, actual)
    print look(actual)

    print 'write some data and verify...'
    todb(expect, setup_dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'append some data and verify...'
    appenddb(expect, setup_dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)

    print 'overwrite and verify...'
    todb(expect, setup_dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, overwrite and verify'
    todb(cut(expect, 'bar', 'foo'), setup_dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, append and verify'
    appenddb(cut(expect, 'bar', 'foo'), setup_dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)

def test_issue_231():

    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')

    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)

    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)

    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)

def exercise(dbo):
    print '=' * len(repr(dbo))
    print repr(dbo)
    print '=' * len(repr(dbo))
    print

    expect_empty = (('foo', 'bar'),)
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 1))
    expect_appended = (('foo', 'bar'),
                       ('a', 1),
                       ('b', 1),
                       ('a', 1),
                       ('b', 1))
    actual = fromdb(dbo, 'SELECT * FROM test')

    print 'verify empty to start with...'
    ieq(expect_empty, actual)
    print look(actual)

    print 'write some data and verify...'
    todb(expect, dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'append some data and verify...'
    appenddb(expect, dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)

    print 'overwrite and verify...'
    todb(expect, dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, overwrite and verify'
    todb(cut(expect, 'bar', 'foo'), dbo, 'test')
    ieq(expect, actual)
    print look(actual)

    print 'cut, append and verify'
    appenddb(cut(expect, 'bar', 'foo'), dbo, 'test')
    ieq(expect_appended, actual)
    print look(actual)

def print_table(ctx):
    """Output a list of pipelines as a table."""
    rows = [dict(source.state) for source in ctx.obj['sources']]
    message = '\nNumber of pipelines = {}\n'
    secho(message.format(len(rows)), **SUCCESS)
    subset = [
        'id',
        'pipeline_status',
        'validation_status',
        'nb_validation_errors',
        'scraper_required',
        'resource_type',
        'extension'
    ]
    sorted_rows = sort(cut(fromdicts(rows), *subset), key='id')
    echo(look(sorted_rows, limit=None))

def test_recordlookup():

    t1 = (('foo', 'bar'),
          ('a', 1),
          ('b', 2),
          ('b', 3))

    lkp = recordlookup(t1, 'foo')
    eq_(['a'], [r.foo for r in lkp['a']])
    eq_(['b', 'b'], [r.foo for r in lkp['b']])
    eq_([1], [r.bar for r in lkp['a']])
    eq_([2, 3], [r.bar for r in lkp['b']])

    # key only
    lkp = recordlookup(cut(t1, 'foo'), 'foo')
    eq_(['a'], [r.foo for r in lkp['a']])
    eq_(['b', 'b'], [r.foo for r in lkp['b']])

def cut(self, *columns):
    """
    Return a table with a selection of columns.

    `Args:`
        \*columns: str
            Columns in the parsons table

    `Returns:`
        A new parsons table containing the selected columns
    """  # noqa: W605

    from parsons.etl.table import Table

    return Table(petl.cut(self.table, *columns))

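# A hedged usage sketch for the cut() method above, assuming the parsons
# package is installed and that Table accepts a petl-style list of lists;
# the sample data is hypothetical.
from parsons.etl.table import Table

tbl = Table([['first', 'last'], ['Ada', 'Lovelace'], ['Grace', 'Hopper']])
lasts = tbl.cut('last')  # a new parsons Table with only the 'last' column
print(lasts)
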
def test_crossjoin_novaluefield():

    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'))
    expect = (('id', 'colour', 'id', 'shape'),
              (1, 'blue', 1, 'circle'),
              (1, 'blue', 3, 'square'),
              (2, 'red', 1, 'circle'),
              (2, 'red', 3, 'square'))

    actual = crossjoin(table1, table2, key='id')
    ieq(expect, actual)
    actual = crossjoin(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 0, 2, 'shape'), actual)
    actual = crossjoin(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 0, 'colour', 2), actual)
    actual = crossjoin(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 0, 2), actual)

def test_basics():

    t1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
    w1 = FluentWrapper(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    iassertequal((('A', 1), ('B', 2)), w1.data())
    iassertequal(petl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    iassertequal(expect2, w2)
    iassertequal(petl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    iassertequal(t1, w3)

def test_basics():

    t1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
    w1 = etl.wrap(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(etl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(etl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    ieq(expect2, w2)
    ieq(etl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)

def _test_join_novaluefield(join_impl):

    table1 = (('id', 'colour'),
              (1, 'blue'),
              (2, 'red'),
              (3, 'purple'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'),
              (4, 'ellipse'))
    expect = (('id', 'colour', 'shape'),
              (1, 'blue', 'circle'),
              (3, 'purple', 'square'))

    actual = join_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = join_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape'), actual)
    actual = join_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'colour'), actual)
    actual = join_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)

def _test_lookupjoin_novaluefield(lookupjoin_impl):

    table1 = (('id', 'color', 'cost'),
              (1, 'blue', 12),
              (2, 'red', 8),
              (3, 'purple', 4))
    table2 = (('id', 'shape', 'size'),
              (1, 'circle', 'big'),
              (2, 'square', 'tiny'),
              (3, 'ellipse', 'small'))
    expect = (('id', 'color', 'cost', 'shape', 'size'),
              (1, 'blue', 12, 'circle', 'big'),
              (2, 'red', 8, 'square', 'tiny'),
              (3, 'purple', 4, 'ellipse', 'small'))

    actual = lookupjoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = lookupjoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id', 'shape', 'size'), actual)
    actual = lookupjoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id', 'color', 'cost'), actual)
    actual = lookupjoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)

def _test_antijoin_novaluefield(antijoin_impl):

    table1 = (('id', 'colour'),
              (0, 'black'),
              (1, 'blue'),
              (2, 'red'),
              (4, 'yellow'),
              (5, 'white'))
    table2 = (('id', 'shape'),
              (1, 'circle'),
              (3, 'square'))
    expect = (('id', 'colour'),
              (0, 'black'),
              (2, 'red'),
              (4, 'yellow'),
              (5, 'white'))

    actual = antijoin_impl(table1, table2, key='id')
    ieq(expect, actual)
    actual = antijoin_impl(cut(table1, 'id'), table2, key='id')
    ieq(cut(expect, 'id'), actual)
    actual = antijoin_impl(table1, cut(table2, 'id'), key='id')
    ieq(expect, actual)
    actual = antijoin_impl(cut(table1, 'id'), cut(table2, 'id'), key='id')
    ieq(cut(expect, 'id'), actual)

def xref_symbol_reports():
    symbol_reports = [f for f in os.listdir()
                      if re.match(r'OCLC Datasync Unresolved.*\.csv', f)]

    today = str(date.today())

    for report in symbol_reports:
        symbol_split = re.split(r'^.*processing.(M[A-Z]{2}).*$', report)
        symbol = symbol_split[1]
        xlsx_outfile = symbol + '_datasync_unresolved_' + today + '.xlsx'
        xls_outfile = symbol + '_datasync_unresolved_' + today + '.xls'
        txt_outfile = symbol + '_staging_OCNs_' + today + '.txt'

        symbol_table_raw = etl.fromcsv(report, encoding='utf-8')
        symbol_table = etl.rename(symbol_table_raw, '\ufeffMMS Id', 'MMS ID')
        symbol_table2 = etl.select(symbol_table, "{MMS ID} is not None")
        symbol_table_sorted = etl.sort(symbol_table2, 'MMS ID')

        xref_table = etl.fromcsv('unresxref.csv')
        xref_table2 = etl.select(xref_table, "{MMS ID} is not None")
        xref_table_sorted = etl.sort(xref_table2, 'MMS ID')

        symbol_xref_table = etl.join(symbol_table_sorted, xref_table_sorted,
                                     presorted=True, lkey="MMS ID",
                                     rkey="MMS ID")

        try:
            etl.toxlsx(symbol_xref_table, xlsx_outfile, encoding='utf-8')
        except TypeError:
            etl.toxls(symbol_xref_table, xls_outfile, 'Sheet1',
                      encoding='utf-8')

        staging_ocns_table = etl.cut(symbol_xref_table, 'Staging OCN')
        template = '{Staging OCN}\n'
        etl.totext(staging_ocns_table, txt_outfile, template=template)

def test_lookupone():

    t1 = (('foo', 'bar'),
          ('a', 1),
          ('b', 2),
          ('b', 3))

    # lookup one column on another under strict mode
    try:
        lookupone(t1, 'foo', 'bar', strict=True)
    except DuplicateKeyError:
        pass  # expected
    else:
        assert False, 'expected error'

    # lookup one column on another, not strict
    actual = lookupone(t1, 'foo', 'bar', strict=False)
    expect = {'a': 1, 'b': 2}  # first value wins
    eq_(expect, actual)

    # test default value - tuple of whole row
    actual = lookupone(t1, 'foo', strict=False)  # no value selector
    expect = {'a': ('a', 1), 'b': ('b', 2)}  # first wins
    eq_(expect, actual)

    # test default value - key only
    actual = lookupone(cut(t1, 'foo'), 'foo')
    expect = {'a': ('a',), 'b': ('b',)}
    eq_(expect, actual)

    t2 = (('foo', 'bar', 'baz'),
          ('a', 1, True),
          ('b', 2, False),
          ('b', 3, True),
          ('b', 3, False))

    # test value selection
    actual = lookupone(t2, 'foo', ('bar', 'baz'), strict=False)
    expect = {'a': (1, True), 'b': (2, False)}
    eq_(expect, actual)

    # test compound key
    actual = lookupone(t2, ('foo', 'bar'), 'baz', strict=False)
    expect = {('a', 1): True, ('b', 2): False, ('b', 3): True}  # first wins
    eq_(expect, actual)

def collection_count(request, pk):
    collection = Collection.objects.get(pk=pk)
    table = collection.data

    if request.method == "GET":
        fields = []
        button_states = {}
        header, table = table[0], table[1:]
    else:
        fields = [f for f in FIELDS if f in request.POST]
        if fields:
            table = petl.cut(table, *fields)
            header = list(table[0]) + ["Count"]
            counts = petl.valuecounter(table, *fields)
            table = []
            for k, v in counts.items():
                if isinstance(k, str):
                    table.append([k] + [v])
                else:
                    table.append(list(k) + [v])
        else:
            header, table = table[0], table[1:]
        button_states = {f"{f}_checked": True
                         for f in FIELDS if f in request.POST}

    return render(
        request,
        "api/collection_detail.html",
        {
            "header": header,
            "table": table,
            "collection_id": collection.id,
            "show_load_more": False,
            **button_states,
        },
    )

def test_basics():

    t1 = (('foo', 'bar'),
          ('A', 1),
          ('B', 2))
    w1 = etl(t1)

    eq_(('foo', 'bar'), w1.header())
    eq_(petl.header(w1), w1.header())
    ieq((('A', 1), ('B', 2)), w1.data())
    ieq(petl.data(w1), w1.data())

    w2 = w1.cut('bar', 'foo')
    expect2 = (('bar', 'foo'),
               (1, 'A'),
               (2, 'B'))
    ieq(expect2, w2)
    ieq(petl.cut(w1, 'bar', 'foo'), w2)

    w3 = w1.cut('bar', 'foo').cut('foo', 'bar')
    ieq(t1, w3)

def kcmo_convert(filepath, xtrapath):
    """
    Takes the file path to a csv in the format used by Kansas City proper,
    converts it to the universal format, and outputs a csv.
    """
    kcmo = etl.fromcsv(filepath)
    kcx = etl.fromxlsx(xtrapath)
    kcjoin = etl.join(kcmo, kcx, lkey='POLEID', rkey='IDNumber')
    del kcmo
    del kcx

    kcjoin = etl.addfield(kcjoin, 'PoleID', lambda x: x['POLEID'])
    kcjoin = etl.addfield(kcjoin, 'Longitude',
                          lambda x: geom_to_tuple(x['the_geom'])[0])
    kcjoin = etl.addfield(kcjoin, 'Latitude',
                          lambda x: geom_to_tuple(x['the_geom'])[1])
    kcjoin = etl.addfield(kcjoin, 'LightbulbType',
                          lambda x: x['LUMINAIRE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'Wattage', lambda x: x['WATTS'])
    kcjoin = etl.addfield(kcjoin, 'Lumens', None)
    kcjoin = etl.addfield(
        kcjoin, 'LightAttributes',
        lambda x: make_a_list(
            x['ATTACHMENT 10'], x['ATTACHMENT 9'], x['ATTACHMENT 8'],
            x['ATTACHMENT 7'], x['ATTACHMENT 6'], x['ATTACHMENT 5'],
            x['ATTACHMENT 4'], x['ATTACHMENT 3'], x['ATTACHMENT 2'],
            x['ATTACHMENT 1'], x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'AttachedTech',
                          lambda x: bool(x['LightAttributes']))
    kcjoin = etl.addfield(
        kcjoin, 'FiberWiFiEnable',
        lambda x: find_wifi(*x['LightAttributes'],
                            x['SPECIAL_N2'], x['SPECIAL_NO']))
    kcjoin = etl.addfield(kcjoin, 'PoleType', lambda x: x['POLE TYPE'])
    kcjoin = etl.addfield(kcjoin, 'PoleOwner', lambda x: x['POLE OWNER'])
    kcjoin = etl.addfield(kcjoin, 'DataSource', 'Kansas City')

    kcjoin = etl.cut(kcjoin, 'PoleID', 'Longitude', 'Latitude',
                     'LightbulbType', 'Wattage', 'Lumens', 'AttachedTech',
                     'LightAttributes', 'FiberWiFiEnable', 'PoleType',
                     'PoleOwner', 'DataSource')
    etl.tocsv(kcjoin, 'data/kcmo_clean.csv')

def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the mod info in peptide
    ssd = etl.transform.regex.sub(ssd, 'Peptide',
                                  r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))

    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)

    pepsummary = etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)

def test_recordlookupone():

    t1 = (('foo', 'bar'),
          ('a', 1),
          ('b', 2),
          ('b', 3))

    try:
        recordlookupone(t1, 'foo', strict=True)
    except DuplicateKeyError:
        pass  # expected
    else:
        assert False, 'expected error'

    # relax
    lkp = recordlookupone(t1, 'foo', strict=False)
    eq_('a', lkp['a'].foo)
    eq_('b', lkp['b'].foo)
    eq_(1, lkp['a'].bar)
    eq_(2, lkp['b'].bar)  # first wins

    # key only
    lkp = recordlookupone(cut(t1, 'foo'), 'foo', strict=False)
    eq_('a', lkp['a'].foo)
    eq_('b', lkp['b'].foo)

def createFacts(events, users):
    try:
        events_uid = etl.cutout(events, 'tracking_id', 'utm_medium',
                                'utm_campaign')
        events_tui = etl.cutout(events, 'user_id')

        stage_uid = etl.join(users, events_uid, key='user_id')
        stage_tui = etl.join(users, events_tui, key='tracking_id')
        stage_utm = etl.cut(stage_tui, 'user_id', 'utm_medium',
                            'utm_campaign')
        stage_uid_utm = etl.join(stage_uid, stage_utm, key='user_id')
        stage_m_s = etl.mergesort(stage_uid_utm, stage_tui,
                                  key=['created_at', 'email'])

        mappings = OrderedDict()
        mappings['tid'] = 'tracking_id'
        mappings['uid'] = 'user_id'
        mappings['utm_medium'] = 'utm_medium'
        mappings['utm_campaign'] = 'utm_campaign', {'audio': 'none',
                                                    'social': 'none'}
        mappings['utm_campaigntype'] = 'utm_campaign'
        mappings['email'] = 'email'
        mappings['subscription'] = 'type'
        mappings['sub_order'] = 'type', {'Signup Completed': '1',
                                         'Trial Started': '2',
                                         'Subscription Started': '3',
                                         'Subscription Ended': '4'}
        mappings['created_at'] = 'created_at'

        # Mapping
        stage_mapping = etl.fieldmap(stage_m_s, mappings)

        # Sort
        stage_mapping_ordered = etl.sort(
            stage_mapping, key=['created_at', 'email', 'sub_order'])

        # Datetime split
        t1 = etl.split(stage_mapping_ordered, 'created_at', 'T',
                       ['date', 'time'], include_original=True)
        t2 = etl.split(t1, 'date', '-', ['year', 'month', 'day'])
        stage_ready = etl.split(t2, 'time', ':',
                                ['hour', 'minute', 'second'])

        # Export as csv to load folder
        etl.tocsv(stage_ready, 'load/facts.csv')
    except Exception as e:
        print("Something went wrong. Error {0}".format(e))

def import_process(self):
    self.data = self.hourly_data
    # Annotate each observation dict in place with the location metadata
    # carried on its key (the comprehension is used only for its side
    # effects).
    [list(map(lambda value: value.update({
        'location': key.PositionName,
        'position': (key.Latitude, key.Longitude, key.Elevation)
    }), values)) for key, values in self.data.items()]
    weather_data = [value for key, values in self.data.items()
                    for value in values]
    self.data = petl.fromdicts(weather_data)
    self.data = petl.cut(self.data, 'location', 'position', 'time',
                         'summary', 'temperature', 'pressure', 'humidity')
    return True

def add_bbreflink(rec):
    bid = rec['bbrefID']
    initial = bid[0]
    return ("http://www.baseball-reference.com/players/" + initial + "/" +
            bid + ".shtml")


# Load Master.csv from the Lahman database.
table = etl.fromcsv(sys.argv[1])
# Use US births only
table2 = etl.select(table, lambda rec: rec.birthCountry == 'USA')
# Only use these fields
table3 = etl.cut(table2, 'nameFirst', 'nameLast', 'debut', 'bbrefID',
                 'weight', 'height', 'finalGame', 'birthCity', 'birthState',
                 'birthYear')
# Remove null birth city and birth year
table4 = etl.select(table3,
                    lambda rec: rec.birthCity != "" and rec.birthYear != "")
# Add Baseball Reference URL
table5 = etl.addfield(table4, 'baseball_ref_url', add_bbreflink)
# Remove unnecessary bbrefid
table6 = etl.cutout(table5, "bbrefID")
# Load city,state lat long table.
city = etl.fromcsv(sys.argv[2])
# Only use these fields
city2 = etl.cut(city, "city", "state", "lat", "long")
# Join tables by two keys

table4['age'] = 'age_years'
look(table4)


# cut
table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]

from petl import look, cut
look(table1)
table2 = cut(table1, 'foo', 'baz')
look(table2)

# fields can also be specified by index, starting from zero
table3 = cut(table1, 0, 2)
look(table3)

# field names and indices can be mixed
table4 = cut(table1, 'bar', 0)
look(table4)

# select a range of fields
table5 = cut(table1, *range(0, 2))
look(table5)


# cutout
table1 = [['foo', 'bar', 'baz'],

def attendance_file2table(filename, output_csv_filebase, add_extra_fields):

    global full_name2sk_indiv_id

    print "*** Parsing file: " + filename
    print

    attendance_dicts = []

    # CCB's Worship Service event IDs...
    event_ids = {}
    event_ids["8"] = 6
    event_ids["9"] = 7
    event_ids["10"] = 8
    event_ids["11:15"] = 9
    event_ids["Christmas"] = 13

    # The following are used to create CSV output filenames and to emit
    # human-readable event name if add_extra_fields flag is on
    event_names = {}
    event_names[6] = "08am"
    event_names[7] = "09am"
    event_names[8] = "10am"
    event_names[9] = "11_15am"
    event_names[13] = "Christmas Eve"

    # Time of event in Excel-parseable format
    event_times = {}
    event_times[6] = "08:00 AM"
    event_times[7] = "09:00 AM"
    event_times[8] = "10:00 AM"
    event_times[9] = "11:15 AM"
    event_times[13] = "04:00 PM"

    # Starting state...
    prior_line = None
    matched_month_year = None
    matched_service_time = None
    month = None
    year = None
    service_time = None
    line_number = 1
    totals_attendance_dict = None
    event_id = None
    accumulated_row_totals_dict = {"week1": 0, "week2": 0, "week3": 0,
                                   "week4": 0, "week5": 0, "week6": 0,
                                   "total": 0}
    full_name = None
    phone = None
    num_processed_lines = 0

    for line in open(filename):

        # First pick off line at front of file indicating month and year
        # that this attendance file is for...
        if not matched_month_year:
            matched_month_year = re.search(
                "For the month of ([A-Z][a-z]+), ([0-9]{4})", line)
            if matched_month_year:
                month = string2monthnum(matched_month_year.group(1))
                year = string2yearnum(matched_month_year.group(2))
                if not (month and year):
                    print >> sys.stderr, "*** Filename: " + filename + \
                        ", line number: " + str(line_number)
                    print >> sys.stderr, "*** ERROR! Invalid month or year found"
                    print >> sys.stderr, line
                    print >> sys.stderr
                    sys.exit(1)
                first_day_in_month, num_days_in_month = \
                    calendar.monthrange(year, month)

                # Create list of 6 date objects, month_sundays, representing
                # week1, week2, ... week6 Sunday dates. If a week has no
                # Sunday, it is None
                day_countup = 1
                day_countup += 6 - first_day_in_month
                month_sundays = []
                if first_day_in_month != 6:
                    month_sundays.append(None)
                while day_countup <= num_days_in_month:
                    month_sundays.append(
                        datetime.date(year, month, day_countup))
                    day_countup += 7
                while len(month_sundays) < 6:
                    month_sundays.append(None)
                christmas_eve_date = datetime.date(year, month, 24)

        # Second pick off line at front of file indicating worship service
        # time that this attendance file is for...
        elif not matched_service_time:
            matched_service_time = re.search(
                "Worship Service - (Sunday |Summer )?([^ ]*)", line)
            if matched_service_time:
                service_time = matched_service_time.group(2)
                if service_time in event_ids:
                    event_id = event_ids[service_time]
                    event_name = event_names[event_id]
                else:
                    print >> sys.stderr, "*** Filename: " + filename + \
                        ", line number: " + str(line_number)
                    print >> sys.stderr, '*** ERROR! Unrecognized ' + \
                        'service_time: "' + service_time + '"'
                    print >> sys.stderr
                    sys.exit(1)

        # ...then match attendance (row per person with weeks they attended)
        # and total (summary at bottom) rows
        else:
            # Once we found row with totals...we're done, that's last line
            # in attendance file we need to parse
            matched_total_line = re.search(
                "^ {18}Total: {13}(?P<attendance>( +[0-9]+)+)\r?$", line)
            if matched_total_line:
                totals_attendance_dict = attendance_str2dict(
                    matched_total_line.group("attendance"),
                    [-3, -9, -15, -20, -24, -29, -35], 3)
                break

            matched_attendance_line = re.search(
                "^ {6}" +
                "(?P<full_name>(?P<last_name>[A-Za-z]+([ \-'][A-Za-z]+)*), " +
                "(?P<first_name>([A-Za-z]+\.?)+([\-' ][A-Za-z]+)*)" +
                "( \((?P<nick_name>[A-Za-z]+)\))?\.?)?\r?" +
                "(?P<phone>( +)?([0-9]{3}-[0-9]{3}-[0-9]{4}|Unlisted))?" +
                "(?P<attendance> +(1 +)+[1-6])?\r?$",
                line)
            if matched_attendance_line:
                if matched_attendance_line.group("full_name"):
                    full_name = matched_attendance_line.group(
                        "full_name").strip()
                if matched_attendance_line.group("phone"):
                    phone = matched_attendance_line.group("phone").strip()
                if matched_attendance_line.group("attendance"):
                    if full_name:
                        attendance = matched_attendance_line.group(
                            "attendance").strip()
                        row_dict = attendance_str2dict(
                            attendance, [-1, -7, -13, -18, -22, -27, -33], 1)
                        row_dict["full_name"] = full_name
                        if phone:
                            row_dict["phone"] = phone
                        else:
                            row_dict["phone"] = ""
                        num_processed_lines += 1
                        full_name = None
                        phone = None
                        if row_dict["total"] != (row_dict["week1"] +
                                                 row_dict["week2"] +
                                                 row_dict["week3"] +
                                                 row_dict["week4"] +
                                                 row_dict["week5"] +
                                                 row_dict["week6"]):
                            print >> sys.stderr, "*** Filename: " + \
                                filename + ", line number: " + \
                                str(line_number)
                            print >> sys.stderr, "*** ERROR! Bad row " + \
                                "total, doesn't match sum of weeks 1-6"
                            print >> sys.stderr, row_dict
                            print >> sys.stderr
                            break
                        for key in accumulated_row_totals_dict:
                            accumulated_row_totals_dict[key] += row_dict[key]
                        attendance_dicts.append(row_dict)

        # Buffer the current line for line folding if needed
        # (see 'line folding' above)
        prior_line = line
        line_number += 1

    print "*** Number of attendance lines processed: " + \
        str(num_processed_lines)
    print "*** Number of attendees: " + \
        str(accumulated_row_totals_dict["total"])
    print

    if output_csv_filebase and event_id:
        output_csv_filename = output_csv_filebase + "/" + str(year) + \
            format(month, "02d") + "_" + str(event_names[event_id]) + ".csv"
        all_columns_table = petl.fromdicts(attendance_dicts)
        petl.tocsv(all_columns_table, output_csv_filename)

    # Build 2nd list of dicts, where each list item is dict of individual
    # date/event attendance. I.e. a row per worship service date vs original
    # attendance dicts format of a row per attendee across all weeks in
    # month. This is the actual one returned and eventually emitted into
    # output file
    attendance_dicts2 = []
    for attendance_dict in attendance_dicts:
        for key in attendance_dict:
            if key[:4] == "week" and attendance_dict[key] != 0:
                week_index = int(key[4:5]) - 1
                if month_sundays[week_index] is not None:
                    attendance_dict2 = {}
                    full_name = attendance_dict["full_name"]
                    if full_name in full_name2sk_indiv_id:
                        attendance_dict2["Individual ID"] = \
                            full_name2sk_indiv_id[full_name]
                        if event_name == "Christmas Eve":
                            attendance_dict2["Date"] = christmas_eve_date
                        else:
                            attendance_dict2["Date"] = \
                                month_sundays[week_index]
                        attendance_dict2["Event ID"] = event_id
                        if add_extra_fields:
                            attendance_dict2["Time"] = event_times[event_id]
                            attendance_dict2["Full Name"] = full_name
                            attendance_dict2["Event Name"] = event_name
                            attendance_dict2["Week Num"] = week_index + 1
                        attendance_dicts2.append(attendance_dict2)
                    else:
                        print >> sys.stderr, '*** WARNING! Cannot find "' + \
                            full_name + '" in map'
                        print >> sys.stderr
                else:
                    print >> sys.stderr, '*** WARNING! Cannot find ' + \
                        'Sunday date for week index "' + str(week_index) + '"'
                    print >> sys.stderr

    # Check if numbers on Servant Keeper's reported Total: line match the
    # totals we've been accumulating per attendance row entry. If they don't
    # match, show WARNING (not ERROR, since via manual checks, it appears
    # that Servant Keeper totals are buggy)
    if totals_attendance_dict:
        for key in accumulated_row_totals_dict:
            if accumulated_row_totals_dict[key] != totals_attendance_dict[key]:
                pp = pprint.PrettyPrinter(stream=sys.stderr)
                print >> sys.stderr, "*** WARNING! Servant Keeper " + \
                    "reported totals do not match data totals"
                print >> sys.stderr, "Servant Keeper Totals:"
                pp.pprint(totals_attendance_dict)
                print >> sys.stderr, "Data Totals:"
                pp.pprint(accumulated_row_totals_dict)
                print >> sys.stderr
                break

    return_table = petl.fromdicts(attendance_dicts2)
    header = petl.header(return_table)
    if "Event Name" in header:
        return_table = petl.cut(return_table, "Full Name", "Event Name",
                                "Time", "Week Num", "Date", "Event ID",
                                "Individual ID")
    else:
        return_table = petl.cut(return_table, "Date", "Event ID",
                                "Individual ID")

    return return_table

def typeInference(table):
    for h in etl.header(table):
        col = etl.cut(table, h)
        print etl.nrows(col)

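# A hedged companion sketch: typeInference() above only prints the row count
# of each single-column cut. If per-column type information is the goal,
# petl's typecounter() reports the Python types present in each field
# (Python 3 print syntax assumed, unlike the py2 snippet above).
import petl as etl

def type_counts(table):
    for h in etl.header(table):
        print(h, dict(etl.typecounter(table, h)))
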
from __future__ import division, print_function, absolute_import


# cut()
#######

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['A', 1, 2.7],
          ['B', 2, 3.4],
          ['B', 3, 7.8],
          ['D', 42, 9.0],
          ['E', 12]]
table2 = etl.cut(table1, 'foo', 'baz')
table2
# fields can also be specified by index, starting from zero
table3 = etl.cut(table1, 0, 2)
table3
# field names and indices can be mixed
table4 = etl.cut(table1, 'bar', 0)
table4
# select a range of fields
table5 = etl.cut(table1, *range(0, 2))
table5


# cutout()
##########

import petl as etl
table1 = [['foo', 'bar', 'baz'],