def build(self): orig_bundle = self.library.require('test') # original data dim_bundle = self.library.require('dim') # dimension table from orig sink = self.database.path tags = (petl.fromsqlite3(dim_bundle.database.path, "SELECT * FROM tags")) flags = (petl.fromsqlite3(dim_bundle.database.path, "SELECT * FROM flags")) orig = (petl.fromsqlite3(orig_bundle.database.path, "SELECT * FROM example")) print petl.look(tags) print petl.look(flags) fact = (orig .hashjoin(tags,key='tag').cutout('tag') .hashjoin(flags,key='flags').cutout('flags')) print petl.look(fact) fact.tosqlite3(sink, 'example', create=False) self.log("Wrote fact table to: "+self.database.path) return True
def _do_query_extract(self, extract_data): """Extract a CSV file and upload it to CKAN""" import tempfile import uuid import os p = extract_data['_partition'] # Set in _make_partition_dict file_name = extract_data.get('name', None) if file_name: file_ = self.bundle.filesystem.path('extracts', file_name) else: file_ = os.path.join(tempfile.gettempdir(), str(uuid.uuid4()) ) self.bundle.log("Extracting {} to {}".format(extract_data['title'],file_)) petlf.fromsqlite3(p.database.path, extract_data['query'] ).tocsv(file_) #@UndefinedVariable return file_
def build(self): test_bundle = self.library.require('test') source_db_path = test_bundle.database.path sink_db_path = self.database.path self.log("Processing original: "+source_db_path) table = (petl.fromsqlite3(source_db_path, "SELECT * FROM example") .convertnumbers()) tags = (table.valuecounts('tag') .cut('value') .addrownumbers(10) .rename('value','tag') .rename('row','tag_id')) tags.tosqlite3(sink_db_path, 'tags') # Save again for the parition. part = self.partitions.find(table='tags') tags.tosqlite3(part.database.path, 'tags') flags = (table.valuecounts('flags') .cut('value') .addrownumbers(10) .rename('value','flags') .rename('row','flags_id')) # Testing new petl code. #dummy, flags = table.unjoin('flags',autoincrement=(10, 1)) #print petl.look(flags) flags.tosqlite3(sink_db_path, 'flags') # Save again for the parition. part = self.partitions.find(table='flags') tags.tosqlite3(part.database.path, 'flags') self.log('Wrote dimension tables to: '+sink_db_path) return True