def anonymize_table(table_id):
  """Copy one table from the source dataset into the destination dataset,
  anonymizing its rows along the way.

  Reads the schema straight from the BigQuery API, streams the rows through
  anonymize_rows, and truncate-writes the result to the destination table of
  the same name.

  Args:
    table_id: (string) name of the table to anonymize, same in source and destination.
  """
  task = project.task['bigquery']
  source, destination = task['from'], task['to']

  if project.verbose:
    print('ANONYMIZE TABLE', destination['dataset'], table_id)

  # pull the field list for the source table directly from the API
  schema = API_BigQuery(project.task['auth']).tables().get(
      projectId=source['project'],
      datasetId=source['dataset'],
      tableId=table_id
  ).execute()['schema']['fields']

  anonymized = anonymize_rows(
      table_to_rows(
          project.task['auth'],
          source['project'],
          source['dataset'],
          table_id
      ),
      schema
  )

  # destination is always replaced wholesale
  rows_to_table(
      project.task['auth'],
      destination['project'],
      destination['dataset'],
      table_id,
      anonymized,
      schema,
      skip_rows=0,
      disposition='WRITE_TRUNCATE'
  )
def bigquery():
  """Verify a BigQuery table or query result against expected values.

  Optionally compares the table schema against task['bigquery']['schema'],
  then compares either query results (if 'query' given) or table contents
  (if 'values' given) against task['bigquery']['values'].
  """
  task = project.task['bigquery']

  # check schema if given ( check independent of values )
  if 'schema' in task:
    schema = table_to_schema(
        project.task['auth'],
        project.id,
        task['dataset'],
        task['table']
    )
    object_compare(schema['fields'], task['schema'])

  # if query given check it
  if 'query' in task:
    rows = query_to_rows(
        project.task['auth'],
        project.id,
        task['dataset'],
        task['query']
    )
    object_compare(sorted(rows), sorted(task['values']))

  # simple table check ( unless query given )
  elif 'values' in task:
    rows = table_to_rows(
        project.task['auth'],
        project.id,
        task['dataset'],
        task['table']
    )
    object_compare(sorted(rows), sorted(task['values']))
def anonymize_table(table_id):
  """Copy one table from the source dataset to the destination dataset with
  selected columns anonymized.

  Rows are read as objects, passed through anonymize_rows with the column
  list from task['bigquery']['to']['columns'] (empty list if absent), and
  truncate-written to the destination table of the same name.

  Args:
    table_id: (string) name of the table to anonymize, same in source and destination.
  """
  task = project.task['bigquery']
  source, destination = task['from'], task['to']

  if project.verbose:
    print('ANONYMIZE TABLE', destination['dataset'], table_id)

  schema = table_to_schema(
      project.task['auth'],
      source['project'],
      source['dataset'],
      table_id
  )

  # rows come back as dicts so nested fields survive the round trip
  anonymized = anonymize_rows(
      table_to_rows(
          project.task['auth'],
          source['project'],
          source['dataset'],
          table_id,
          as_object=True
      ),
      destination.get('columns', [])
  )

  json_to_table(
      project.task['auth'],
      destination['project'],
      destination['dataset'],
      table_id,
      anonymized,
      schema,
      disposition='WRITE_TRUNCATE'
  )
def bigquery():
  """Test handler: verify a BigQuery schema and/or values against expectations.

  If 'schema' is given, diff it against the live table schema and report a
  pass/fail banner. Then verify either query results (if 'query' given,
  with optional parameters and legacy SQL flag) or raw table contents
  (if 'values' given) against task['bigquery']['values'].
  """
  print('TEST: bigquery')

  task = project.task['bigquery']

  # check schema if given ( check independent of values )
  if 'schema' in task:
    schema = table_to_schema(
        project.task['auth'],
        project.id,
        task['dataset'],
        task['table']
    )
    deltas = schema_compare(task['schema'], schema, path='')

    if not deltas:
      test_passed()
    else:
      print(
          '\nFAILED *******************************************************\n'
      )
      for delta in deltas.values():
        print('%(path)s: %(error)s ( %(expected)s - %(actual)s)' % delta)
      print(
          '\n**************************************************************\n'
      )
      test_failed()

  # if query given check it
  if 'query' in task:
    query = query_parameters(task['query'], task.get('parameters'))
    rows = query_to_rows(
        project.task['auth'],
        project.id,
        task['dataset'],
        query,
        legacy=task.get('legacy', True)
    )
    object_compare(sorted(rows), sorted(task['values']))

  # simple table check ( unless query given )
  elif 'values' in task:
    rows = table_to_rows(
        project.task['auth'],
        project.id,
        task['dataset'],
        task['table']
    )
    object_compare(sorted(rows), sorted(task['values']))
def anonymize_table(table_id):
  """Copy one table from the source dataset to the destination dataset,
  anonymizing its rows along the way.

  Tables containing any RECORD (nested) field are read and written as JSON
  objects via json_to_table; flat tables use the plain rows_to_table path.
  The destination table is always truncated.

  Args:
    table_id: (string) name of the table to anonymize, same in source and destination.
  """
  task = project.task['bigquery']
  source, destination = task['from'], task['to']

  if project.verbose:
    print('ANONYMIZE TABLE', destination['dataset'], table_id)

  # fetch the field list straight from the BigQuery API
  schema = API_BigQuery(project.task['auth']).tables().get(
      projectId=source['project'],
      datasetId=source['dataset'],
      tableId=table_id
  ).execute()['schema']['fields']

  # nested schemas must round-trip through dicts, flat ones through lists
  is_object = any(field['type'] == 'RECORD' for field in schema)

  anonymized = anonymize_rows(
      table_to_rows(
          project.task['auth'],
          source['project'],
          source['dataset'],
          table_id,
          as_object=is_object
      ),
      schema,
      is_object
  )

  if is_object:
    json_to_table(
        project.task['auth'],
        destination['project'],
        destination['dataset'],
        table_id,
        anonymized,
        schema,
        disposition='WRITE_TRUNCATE'
    )
  else:
    rows_to_table(
        project.task['auth'],
        destination['project'],
        destination['dataset'],
        table_id,
        anonymized,
        schema,
        skip_rows=0,
        disposition='WRITE_TRUNCATE'
    )
def get_rows(auth, source):
  """Processes a standard read JSON block for dynamic loading of data.

  Allows us to quickly pull a column or columns of data from a source and
  use it as an input into a script. For example pull a list of ids from
  bigquery and act on each one.

  - When pulling a single column specify single_cell = True. Returns list AKA values.
  - When pulling multiple columns specify single_cell = False. Returns list of lists AKA rows.
  - Values are always given as a list ( single_cell will trigger necessary wrapping ).
  - Values, bigquery, sheet are optional, if multiple given result is one continuous iterator.
  - Extensible, add a handler to define a new source ( be kind update the documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.

    from utils.data import get_rows

    var_json = {
      "in":{
        "single_cell":[ boolean ],
        "values": [ integer list ],
        "bigquery":{
          # give either "query" ( with optional "legacy" ) or "table"
          "dataset": [ string ],
          "query": [ string ],
          "table": [ string ],
          "legacy": [ boolean ]
        },
        "sheet":{
          "sheet":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
        }
      }
    }

    values = get_rows('user', var_json)

  Or you can use it directly with project singleton.

    from util.project import project
    from utils.data import get_rows

    @project.from_parameters
    def something():
      values = get_rows(project.task['auth'], project.task['in'])

    if __name__ == "__main__":
      something()

  Args:
    auth: (string) The type of authentication to use, user or service.
    source: (json) A json block resembling var_json described above.

  Returns:
    A generator. If single_cell is False: yields row values [[v1], [v2], ... ]
    If single_cell is True: yields values [v1, v2, ...]
  """

  # if handler points to list, concatenate all the values from various sources into one list
  if isinstance(source, list):
    for s in source:
      for r in get_rows(auth, s):
        yield r

  # if handler is an endpoint, fetch data
  else:

    if 'values' in source:
      # a scalar is wrapped as a single yielded value
      if isinstance(source['values'], list):
        for value in source['values']:
          yield value
      else:
        yield source['values']

    if 'sheet' in source:
      # FIX: previously hard-coded project.task['auth'], ignoring both the
      # auth argument and a per-source override; now consistent with the
      # bigquery branch below.
      rows = sheets_read(
          source['sheet'].get('auth', auth),
          source['sheet']['sheet'],
          source['sheet']['tab'],
          source['sheet']['range'],
      )

      for row in rows:
        yield row[0] if source.get('single_cell', False) else row

    if 'bigquery' in source:

      if 'table' in source['bigquery']:
        rows = table_to_rows(
            source['bigquery'].get('auth', auth),
            project.id,
            source['bigquery']['dataset'],
            source['bigquery']['table'],
            as_object=source['bigquery'].get('as_object', False))

      else:
        rows = query_to_rows(
            source['bigquery'].get('auth', auth),
            project.id,
            source['bigquery']['dataset'],
            query_parameters(source['bigquery']['query'],
                             source['bigquery'].get('parameters', {})),
            legacy=source['bigquery'].get('legacy', False),
            as_object=source['bigquery'].get('as_object', False))

      for row in rows:
        yield row[0] if source.get('single_cell', False) else row