def test_teepickle():
    t1 = (('foo', 'bar'),
          ('a', 2),
          ('b', 1),
          ('c', 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl.wrap(t1)
        .teepickle(f1.name)
        .selectgt('bar', 1)
        .topickle(f2.name))
    ieq(t1, etl.frompickle(f1.name))
    ieq(etl.wrap(t1).selectgt('bar', 1), etl.frompickle(f2.name))
def etl_(query):
    source_db = get_source_db(query)
    extract_query = get_extract_query(query)
    with source_db() as source:
        etl.fromdb(source, extract_query) \
           .topickle(f'temp/{query.target_table}.p')
    with GISLNIDB.GISLNIDB() as target:
        etl.frompickle(f'temp/{query.target_table}.p') \
           .todb(get_cursor(target), query.target_table.upper())
def etl_(query, logger):
    source_db = get_source_db(query)
    extract_query = get_extract_query(query)
    logger.info(f'{query.target_table} - extracting data into pickle file...')
    with source_db() as source:
        etl.fromdb(source, extract_query).topickle(f'temp/{query.target_table}.p')
    logger.info(f'{query.target_table} - loading data from pickle file...')
    with PERMITP.PERMITP() as target:
        etl.frompickle(f'temp/{query.target_table}.p').todb(
            get_cursor(target), query.target_table.upper())
def test_frompickle_cachetag_strict():
    """Test the cachetag method on tables returned by frompickle."""

    # initial data
    f = NamedTemporaryFile(delete=False)
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        pickle.dump(row, f)
    f.close()

    # cachetag with initial data
    tbl = frompickle(FileSource(f.name, checksumfun=crc32sum))
    tag1 = tbl.cachetag()

    # make a change, preserving file size
    with open(f.name, 'wb') as o:
        rows = (('foo', 'bar'),
                ('d', 3),
                ('e', 5),
                ('f', 4))
        for row in rows:
            pickle.dump(row, o)

    # check cachetag has changed
    tag2 = tbl.cachetag()
    assert tag2 != tag1, (tag2, tag1)
def materialize_to_file(self, file_path=None):
    """
    "Materializes" a Table, meaning all pending transformations are applied.

    Unlike the original materialize function, this method does not bring the data
    into memory, but instead loads the data into a local temp file.

    This method updates the current table in place.

    `Args:`
        file_path: str
            The path to the file to materialize the table to; if not specified,
            a temp file will be created.

    `Returns:`
        str
            Path to the temp file that now contains the table
    """

    # Load the data in batches, and "pickle" the rows to a temp file.
    # (We pickle rather than writing to, say, a CSV, so that we maintain
    # all the type information for each field.)
    file_path = file_path or files.create_temp_file()
    with open(file_path, 'wb') as handle:
        for row in self.table:
            pickle.dump(list(row), handle)

    # Load a Table from the file
    self.table = petl.frompickle(file_path)

    return file_path
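For context, a minimal sketch of the round trip this method relies on (illustrative names only, not the Parsons API): petl pipelines are lazy, so dumping each row with pickle.dump and re-reading the file with petl.frompickle "freezes" all pending transformations without holding the rows in memory.

import pickle
import tempfile

import petl

# a lazy pipeline: nothing is computed until iteration
tbl = petl.wrap([['foo', 'bar'], ['a', 1], ['b', 2]]).convert('bar', lambda v: v * 10)

tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.p')
with tmp as handle:
    for row in tbl:                    # header first, then transformed rows
        pickle.dump(list(row), handle)

frozen = petl.frompickle(tmp.name)     # lazy, constant-memory view of the result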
def test_frompickle_cachetag():
    """Test the cachetag method on tables returned by frompickle."""

    # initial data
    f = NamedTemporaryFile(delete=False)
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        pickle.dump(row, f)
    f.close()

    # cachetag with initial data
    tbl = frompickle(f.name)
    tag1 = tbl.cachetag()

    # make a change
    with open(f.name, 'wb') as o:
        rows = (('foo', 'bar'),
                ('d', 3),
                # ('e', 5),
                ('f', 4))
        for row in rows:
            pickle.dump(row, o)

    # check cachetag has changed
    tag2 = tbl.cachetag()
    assert tag2 != tag1, (tag2, tag1)
def extract_odoo(offline=OFFLINE):
    if not offline:
        api = OdooConnector()
        filters = [('supplier', '=', True),
                   ('active', '=', True),
                   ('company_id', '=', 3)]
        dataframe = api.extract('res.partner', filters)
        drivers = fromdataframe(dataframe)

        mappings = {
            'backend_username': '******',
            'backend_uuid': 'x_backend_uuid',
            'salary_id': 'x_salary_id',
            'odoo_id': 'id',
            'fleetname': lambda rec: rec['x_fleet'][1].replace('_', ' '),
            'fullname': lambda rec: rec['display_name'].strip()
        }
        drivers = drivers.fieldmap(mappings)
        drivers = drivers.suffixheader('_in_odoo')
        drivers.topickle(DRIVERS_IN_ODOO_FILEPATH)
    else:
        drivers = frompickle(DRIVERS_IN_ODOO_FILEPATH)

    drivers = drivers.addfield('backend_username',
                               lambda rec: rec['backend_username_in_odoo'])
    drivers = drivers.addfield('salary_id',
                               lambda rec: rec['salary_id_in_odoo'])
    drivers = standardize_missing_values(drivers)

    write_to_log(drivers, 'drivers', 'odoo')
    return drivers
def test_teehtml():
    t1 = (('foo', 'bar'),
          ('a', 2),
          ('b', 1),
          ('c', 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl.wrap(t1)
        .teehtml(f1.name)
        .selectgt('bar', 1)
        .topickle(f2.name))
    ieq(t1,
        etl.fromxml(f1.name, './/tr', ('th', 'td')).convertnumbers())
    ieq(etl.wrap(t1).selectgt('bar', 1), etl.frompickle(f2.name))
def test_frompickle():
    """Test the frompickle function."""

    f = NamedTemporaryFile(delete=False)
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        pickle.dump(row, f)
    f.close()

    actual = frompickle(f.name)
    ieq(table, actual)
    ieq(table, actual)  # verify can iterate twice
def test_issue_231():
    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')

    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)

    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)

    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)
def test_teehtml_unicode():
    t1 = ((u'foo', u'bar'),
          (u'Արամ Խաչատրյան', 2),
          (u'Johann Strauß', 1),
          (u'Вагиф Сәмәдоғлу', 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    (etl.wrap(t1)
        .teehtml(f1.name, encoding='utf-8')
        .selectgt('bar', 1)
        .topickle(f2.name))
    ieq(t1,
        (etl.fromxml(f1.name, './/tr', ('th', 'td'), encoding='utf-8')
            .convertnumbers()))
    ieq(etl.wrap(t1).selectgt('bar', 1), etl.frompickle(f2.name))
def test_teetext():
    t1 = (('foo', 'bar'),
          ('a', 2),
          ('b', 1),
          ('c', 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    prologue = 'foo,bar\n'
    template = '{foo},{bar}\n'
    epilogue = 'd,4'
    (etl.wrap(t1)
        .teetext(f1.name, template=template, prologue=prologue,
                 epilogue=epilogue)
        .selectgt('bar', 1)
        .topickle(f2.name))
    ieq(t1 + (('d', 4),), etl.fromcsv(f1.name).convertnumbers())
    ieq(etl.wrap(t1).selectgt('bar', 1), etl.frompickle(f2.name))
def query(self, sql):
    """
    Run a BigQuery query and return the results as a Parsons table.

    `Args:`
        sql: str
            A valid BigQuery statement

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    # Run the query
    query_job = self.client.query(sql)

    # We will use a temp file to cache the results so that they are not all living
    # in memory. We'll use pickle to serialize the results to file in order to
    # maintain the proper data types (e.g. integer).
    temp_filename = create_temp_file()

    wrote_header = False
    with open(temp_filename, 'wb') as temp_file:
        results = query_job.result()

        # If there are no results, just return None
        if results.total_rows == 0:
            return None

        for row in results:
            # Make sure we write out the header once and only once
            if not wrote_header:
                wrote_header = True
                header = list(row.keys())
                pickle.dump(header, temp_file)

            row_data = list(row.values())
            pickle.dump(row_data, temp_file)

    ptable = petl.frompickle(temp_filename)
    final_table = Table(ptable)

    return final_table
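A possible usage sketch for the method above; the `from parsons import GoogleBigQuery` import path, the zero-argument constructor, and the table name are assumptions, not confirmed by the snippet:

from parsons import GoogleBigQuery  # assumed import path

bq = GoogleBigQuery()  # assumes credentials are configured in the environment
tbl = bq.query('SELECT name FROM `my-project.my_dataset.my_table` LIMIT 5')
if tbl is not None:  # query() returns None when the result set is empty
    print(tbl.num_rows)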
def test_teetext_unicode():
    t1 = ((u'foo', u'bar'),
          (u'Արամ Խաչատրյան', 2),
          (u'Johann Strauß', 1),
          (u'Вагиф Сәмәдоғлу', 3))
    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)
    prologue = u'foo,bar\n'
    template = u'{foo},{bar}\n'
    epilogue = u'章子怡,4'
    (etl.wrap(t1)
        .teetext(f1.name, template=template, prologue=prologue,
                 epilogue=epilogue, encoding='utf-8')
        .selectgt('bar', 1)
        .topickle(f2.name))
    ieq(t1 + ((u'章子怡', 4),),
        etl.fromcsv(f1.name, encoding='utf-8').convertnumbers())
    ieq(etl.wrap(t1).selectgt('bar', 1), etl.frompickle(f2.name))
# frompickle()
##############

import petl as etl
import pickle

# set up a file to demonstrate with
with open('example.p', 'wb') as f:
    pickle.dump(['foo', 'bar'], f)
    pickle.dump(['a', 1], f)
    pickle.dump(['b', 2], f)
    pickle.dump(['c', 2.5], f)

# demonstrate the use of frompickle()
table1 = etl.frompickle('example.p')
table1


# topickle()
############

import petl as etl

table1 = [['foo', 'bar'],
          ['a', 1],
          ['b', 2],
          ['c', 2]]
etl.topickle(table1, 'example.p')
# look what it did
table2 = etl.frompickle('example.p')
table2
# topickle
table = [['foo', 'bar'],
         ['a', 1],
         ['b', 2],
         ['c', 2]]
from petl import topickle, look
look(table)
topickle(table, 'test.dat')
# look what it did
from petl import frompickle
look(frompickle('test.dat'))


# appendpickle
table = [['foo', 'bar'],
         ['d', 7],
         ['e', 42],
         ['f', 12]]
from petl import look, frompickle
# inspect an existing pickle file
testdat = frompickle('test.dat')
look(testdat)
# append some data
from petl import appendpickle
appendpickle(table, 'test.dat')  # appends data rows only; the header is not repeated
# inspect the result
look(frompickle('test.dat'))
def query(self, sql, parameters=None):
    """
    Run a BigQuery query and return the results as a Parsons table.

    To include python variables in your query, it is recommended to pass them as
    parameters; the underlying DB-API layer accepts pyformat placeholders
    (``%s``, ``%(name)s``) and converts them to BigQuery's ``@``-prefixed query
    parameters. Using the ``parameters`` argument ensures that values are escaped
    properly, and avoids SQL injection attacks.

    **Parameter Examples**

    .. code-block:: python

        name = "Beatrice O'Brady"
        sql = 'SELECT * FROM my_table WHERE name = %s'
        bq.query(sql, parameters=[name])

    .. code-block:: python

        name = "Beatrice O'Brady"
        sql = "SELECT * FROM my_table WHERE name = %(name)s"
        bq.query(sql, parameters={'name': name})

    `Args:`
        sql: str
            A valid BigQuery statement
        parameters: dict
            A dictionary of query parameters for BigQuery.

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    # get our connection and cursor
    cursor = self._dbapi.connect(self.client).cursor()

    # Run the query
    cursor.execute(sql, parameters)

    # We will use a temp file to cache the results so that they are not all living
    # in memory. We'll use pickle to serialize the results to file in order to
    # maintain the proper data types (e.g. integer).
    temp_filename = create_temp_file()

    wrote_header = False
    with open(temp_filename, 'wb') as temp_file:
        # Track whether we got data, since if we don't get any results we need to
        # return None
        got_results = False

        while True:
            batch = cursor.fetchmany(QUERY_BATCH_SIZE)
            if len(batch) == 0:
                break

            got_results = True

            for row in batch:
                # Make sure we write out the header once and only once
                if not wrote_header:
                    wrote_header = True
                    header = list(row.keys())
                    pickle.dump(header, temp_file)

                row_data = list(row.values())
                pickle.dump(row_data, temp_file)

    if not got_results:
        return None

    ptable = petl.frompickle(temp_filename)
    final_table = Table(ptable)

    return final_table
def query_with_connection(self, sql, connection, parameters=None, commit=True):
    """
    Execute a query against the Redshift database, with an existing connection.
    Useful for batching queries together. Will return ``None`` if the query
    returns zero rows.

    `Args:`
        sql: str
            A valid SQL statement
        connection: obj
            A connection object obtained from ``redshift.connection()``
        parameters: list
            A list of python variables to be converted into SQL values in your query
        commit: boolean
            Whether to commit the transaction immediately. If ``False`` the
            transaction will be committed when the connection goes out of scope
            and is closed (or you can commit manually with ``connection.commit()``).

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    # To Do: Have it return an ordered dict to return the rows in the correct order

    with self.cursor(connection) as cursor:

        if 'credentials' not in sql:
            logger.debug(f'SQL Query: {sql}')
        cursor.execute(sql, parameters)

        if commit:
            connection.commit()

        # If the cursor is empty, don't cause an error
        if not cursor.description:
            logger.debug('Query returned 0 rows')
            return None

        else:
            # Fetch the data in batches, and "pickle" the rows to a temp file.
            # (We pickle rather than writing to, say, a CSV, so that we maintain
            # all the type information for each field.)
            temp_file = files.create_temp_file()
            with open(temp_file, 'wb') as f:
                # Grab the header
                header = [i[0] for i in cursor.description]
                pickle.dump(header, f)

                while True:
                    batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                    if not batch:
                        break

                    logger.debug(f'Fetched {len(batch)} rows.')
                    for row in batch:
                        pickle.dump(list(row), f)

            # Load a Table from the file
            final_tbl = Table(petl.frompickle(temp_file))

            logger.debug(f'Query returned {final_tbl.num_rows} rows.')
            return final_tbl
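The cursor-to-pickle pattern shared by these query methods can be distilled into a standalone sketch; `stream_cursor_to_petl` and its parameters are illustrative names, not part of Parsons:

import pickle
import tempfile

import petl

def stream_cursor_to_petl(cursor, batch_size=1000):
    # one pickle per object, header first, as petl.frompickle expects
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.p')
    with tmp as f:
        pickle.dump([col[0] for col in cursor.description], f)  # header row
        while True:
            batch = cursor.fetchmany(batch_size)
            if not batch:
                break
            for row in batch:
                pickle.dump(list(row), f)  # rows keep their Python types
    return petl.frompickle(tmp.name)  # lazy, constant-memory view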
def query_with_connection(self, sql, connection, parameters=None, commit=True):
    """
    Execute a query against the database, with an existing connection. Useful
    for batching queries together. Will return ``None`` if the query returns
    zero rows.

    `Args:`
        sql: str
            A valid SQL statement
        connection: obj
            A connection object obtained from ``mysql.connection()``
        parameters: list
            A list of python variables to be converted into SQL values in your query
        commit: boolean
            Whether to commit the transaction immediately. If ``False`` the
            transaction will be committed when the connection goes out of scope
            and is closed (or you can commit manually with ``connection.commit()``).

    `Returns:`
        Parsons Table
            See :ref:`parsons-table` for output options.
    """

    with self.cursor(connection) as cursor:

        # The python connector can only execute a single sql statement, so we will
        # break up each statement and execute them separately.
        for s in sql.strip().split(';'):
            if len(s) != 0:
                logger.debug(f'SQL Query: {s}')
                cursor.execute(s, parameters)

        if commit:
            connection.commit()

        # If the SQL query provides no response, then return None
        if not cursor.description:
            logger.debug('Query returned 0 rows')
            return None

        else:
            # Fetch the data in batches, and "pickle" the rows to a temp file.
            # (We pickle rather than writing to, say, a CSV, so that we maintain
            # all the type information for each field.)
            temp_file = files.create_temp_file()
            with open(temp_file, 'wb') as f:
                # Grab the header
                pickle.dump(cursor.column_names, f)

                while True:
                    batch = cursor.fetchmany(QUERY_BATCH_SIZE)
                    if len(batch) == 0:
                        break

                    logger.debug(f'Fetched {len(batch)} rows.')
                    for row in batch:
                        pickle.dump(row, f)

            # Load a Table from the file
            final_tbl = Table(petl.frompickle(temp_file))

            logger.debug(f'Query returned {final_tbl.num_rows} rows.')
            return final_tbl
def extract_backend(offline=OFFLINE):
    # Done in 4 steps: (1) grab the driver table from the CloudSQL,
    # (2) use the user uuids to query for users one by one through
    # the API, (3) get the fleet table from CloudSQL and (4) join
    # everything together.

    def extract_drivers():
        query = SQLReader('sql.drivers_from_cloudsql')
        drivers_df = sql.execute(query.statements[0])
        drivers_tb = fromdataframe(drivers_df)

        mappings = {
            'driver_uuid': lambda rec: str(UUID(bytes=rec['uuid'], version=4)),
            'fleet_uuid': lambda rec: str(UUID(bytes=rec['fleet_uuid'], version=4)),
            'user_uuid': lambda rec: str(UUID(bytes=rec['user_ds_uuid'], version=4)),
            'fullname': lambda rec: rec['last_name'].strip() + ', ' + rec['first_name'].strip(),
        }
        drivers_tb = drivers_tb.fieldmap(mappings)
        drivers_tb = drivers_tb.suffixheader('_in_backend')
        return drivers_tb

    def extract_users():
        users_records = [api.get_record('users', driver.user_uuid_in_backend)
                         for driver in drivers.namedtuples()]
        users_df = DataFrame().from_records(users_records)
        users_tb = fromdataframe(users_df)

        mappings = {
            'driver_uuid': 'driver',
            'user_uuid': 'uuid',
            'backend_username': '******'
        }
        users_tb = users_tb.fieldmap(mappings)
        users_tb = users_tb.suffixheader('_in_backend')
        return users_tb

    def extract_fleets_from_dwh():
        query = SQLReader('sql.fleets_from_tableau')
        fleets_df = dwh.execute(query.statements[0])
        fleets_tb = fromdataframe(fleets_df)

        # country_code is cut out below, so it is not mapped through
        mappings = {
            'fleet_uuid': 'uuid',
            'fleetname': lambda rec: rec['backend_name'].replace('_', ' '),
        }
        fleets_tb = fleets_tb.cutout('country_code')
        fleets_tb = fleets_tb.fieldmap(mappings)
        fleets_tb = fleets_tb.suffixheader('_in_backend')
        return fleets_tb

    if not offline:
        sql = CloudSQLConnector()
        api = ValkfleetConnector()
        dwh = WarehouseConnector()

        drivers = extract_drivers()
        fleets = extract_fleets_from_dwh()
        users = extract_users()

        drivers.topickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets.topickle(FLEETS_IN_BACKEND_FILEPATH)
        users.topickle(USERS_IN_BACKEND_FILEPATH)
    else:
        drivers = frompickle(DRIVERS_IN_BACKEND_FILEPATH)
        fleets = frompickle(FLEETS_IN_BACKEND_FILEPATH)
        users = frompickle(USERS_IN_BACKEND_FILEPATH)

    write_to_log(drivers, 'drivers', 'backend')
    write_to_log(fleets, 'fleets', 'backend')
    write_to_log(users, 'users', 'backend')

    drivers_without_fleet = antijoin(drivers, fleets, key='fleet_uuid_in_backend')
    drivers_without_user = antijoin(drivers, users, key='user_uuid_in_backend')
    write_to_log(drivers_without_fleet, 'drivers without fleet', 'backend')
    write_to_log(drivers_without_user, 'drivers without user', 'backend')

    drivers_n_fleets = join(drivers, fleets,
                            key='fleet_uuid_in_backend').cutout('fleet_uuid_in_backend')
    backend_drivers = join(drivers_n_fleets, users, key='user_uuid_in_backend')
    backend_drivers = backend_drivers.addfield('backend_username',
                                               lambda rec: rec['backend_username_in_backend'])
    backend_drivers = backend_drivers.cutout('driver_uuid_in_backend')
    backend_drivers = standardize_missing_values(backend_drivers)

    write_to_log(backend_drivers, 'drivers', 'backend')
    return backend_drivers
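For reference, a toy illustration of the join/antijoin semantics used above (made-up data, not part of the pipeline):

import petl as etl

left = [['id', 'name'], [1, 'a'], [2, 'b'], [3, 'c']]
right = [['id', 'flag'], [1, True], [3, False]]

# antijoin keeps left rows with NO match in right -> the row with id 2
print(etl.antijoin(left, right, key='id'))

# join keeps only rows with a match on the key -> ids 1 and 3
print(etl.join(left, right, key='id'))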
# Creating a pickle file
a = ['test value', 'test value 2', 'test value 3']

# open the file for writing
fileObject = open('pickle_file.p', 'wb')

# this writes the object a to the file
pickle.dump(a, fileObject)

# here we close the fileObject
fileObject.close()

table3 = etl.frompickle('pickle_file.p')
print('Pickle')
print(table3)

# ################## Reading Text Files #################################
text = 'a,1\nb,2\nc,2\n'
with open('example.txt', 'w') as f:
    f.write(text)

table4 = etl.fromtext('example.txt')
print(table4)

# ################ Reading XML files ##################################
table5 = etl.fromxml('data.xml', 'tr', 'td')
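The fromxml() call above assumes a data.xml laid out as rows of tr/td elements; the file itself isn't shown in the original, so this minimal layout is an assumption:

import petl as etl

# a minimal 'data.xml' compatible with etl.fromxml('data.xml', 'tr', 'td')
xml_text = (
    "<table>"
    "<tr><td>foo</td><td>bar</td></tr>"
    "<tr><td>a</td><td>1</td></tr>"
    "<tr><td>b</td><td>2</td></tr>"
    "</table>"
)
with open('data.xml', 'w') as f:
    f.write(xml_text)

table5 = etl.fromxml('data.xml', 'tr', 'td')  # the first matched row becomes the header
print(table5)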