def bigquery():
  """Test task: verify a BigQuery table's schema and/or its contents.

  Reads project.task['bigquery'] and compares, via object_compare:
    - 'schema' (optional): expected schema vs the live table schema.
    - 'query' (optional): expected 'values' vs rows returned by the query.
    - 'values' (used only when no 'query' given): expected rows vs the table.
  """
  task = project.task['bigquery']
  auth = project.task['auth']

  # Schema check is independent of any value checks below.
  if 'schema' in task:
    actual_schema = table_to_schema(auth, project.id, task['dataset'],
                                    task['table'])
    object_compare(actual_schema, task['schema'])

  # A query, when present, takes precedence over a direct table read.
  if 'query' in task:
    fetched = query_to_rows(
        auth,
        project.id,
        task['dataset'],
        query_parameters(task['query'], task.get('parameters')),
    )
    # Sort both sides so the comparison is order-independent.
    object_compare(sorted(fetched), sorted(task['values']))

  # Simple table check ( only when no query given ).
  elif 'values' in task:
    fetched = table_to_rows(auth, project.id, task['dataset'], task['table'])
    object_compare(sorted(fetched), sorted(task['values']))
def bigquery_run():
  """Execute a query without expected return results.

  See: scripts/bigquery_run_query.json
  """
  run = project.task['run']
  if project.verbose:
    print('RUN QUERY', run['query'])
  run_query(
      project.task['auth'],
      project.id,
      query_parameters(run['query'], run.get('parameters')),
      run.get('legacy', True))
def bigquery_run(config, task):
  """Execute a query without expected return results.

  See: scripts/bigquery_run_query.json
  """
  run = task['run']
  if config.verbose:
    print('RUN QUERY', run['query'])
  bq.run_query(
      config,
      task['auth'],
      config.project,
      bq.query_parameters(run['query'], run.get('parameters')),
      run.get('legacy', True))
def bigquery():
  """Test task: compare a BigQuery table's schema and values to expectations.

  Schema mismatches are reported delta-by-delta and recorded via
  test_failed / test_passed; value mismatches go through object_compare.
  """
  print('TEST: bigquery')
  task = project.task['bigquery']
  auth = project.task['auth']

  # Schema comparison ( independent of the value checks below ).
  if 'schema' in task:
    actual = table_to_schema(auth, project.id, task['dataset'], task['table'])
    deltas = schema_compare(task['schema'], actual, path='')
    if not deltas:
      test_passed()
    else:
      print(
          '\nFAILED *******************************************************\n'
      )
      for delta in deltas.values():
        print('%(path)s: %(error)s ( %(expected)s - %(actual)s)' % delta)
      print(
          '\n**************************************************************\n'
      )
      test_failed()

  # A query, when present, takes precedence over a direct table read.
  if 'query' in task:
    fetched = query_to_rows(
        auth,
        project.id,
        task['dataset'],
        query_parameters(task['query'], task.get('parameters')),
        legacy=task.get('legacy', True))
    # Sort both sides so the comparison is order-independent.
    object_compare(sorted(fetched), sorted(task['values']))

  # Simple table check ( only when no query given ).
  elif 'values' in task:
    fetched = table_to_rows(auth, project.id, task['dataset'], task['table'])
    object_compare(sorted(fetched), sorted(task['values']))
def bigquery():
  """Dispatch a BigQuery task based on the shape of project.task.

  Branches, in priority order:
    - 'function': run a built-in SQL generator (pearson_significance_test).
    - 'run' + 'run.query': execute a query with no result handling.
    - 'from.values': load literal rows into a table.
    - 'from.query': write query results to a table, sheet, sftp, or view.
    - otherwise: load a storage object ( bucket:path ) into a table.
  """
  # Built-in function branch: only pearson_significance_test is recognized;
  # any other 'function' value leaves query as None and is a silent no-op.
  if 'function' in project.task:
    query = None
    if project.task['function'] == 'pearson_significance_test':
      query = pearson_significance_test()
    if query:
      run_query(project.task['auth'], project.id, query, False,
                project.task['to']['dataset'])

  # Fire-and-forget query execution.
  elif 'run' in project.task and 'query' in project.task.get('run', {}):
    if project.verbose:
      print('QUERY', project.task['run']['query'])
    run_query(
        project.task['auth'],
        project.id,
        query_parameters(project.task['run']['query'],
                         project.task['run'].get('parameters')),
        project.task['run'].get('legacy', True),
    )

  # Literal values: stream them into the destination table.
  elif 'values' in project.task['from']:
    rows = get_rows(project.task['auth'], project.task['from'])
    rows_to_table(project.task['to'].get('auth', project.task['auth']),
                  project.id, project.task['to']['dataset'],
                  project.task['to']['table'], rows,
                  project.task.get('schema', []), 0)

  # Query source: destination decides table / sheet / sftp / view.
  elif 'query' in project.task['from']:
    if 'table' in project.task['to']:
      if project.verbose:
        print('QUERY TO TABLE', project.task['to']['table'])
      query_to_table(
          project.task['auth'],
          project.id,
          project.task['to']['dataset'],
          project.task['to']['table'],
          query_parameters(project.task['from']['query'],
                           project.task['from'].get('parameters')),
          disposition=project.task['write_disposition']
          if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
          legacy=project.task['from'].get(
              'legacy', project.task['from'].get(
                  'useLegacySql', True)),  # DEPRECATED: useLegacySql,
          target_project_id=project.task['to'].get(
              'project_id', project.id))
    elif 'sheet' in project.task['to']:
      if project.verbose:
        print('QUERY TO SHEET', project.task['to']['sheet'])
      rows = query_to_rows(
          project.task['auth'],
          project.id,
          project.task['from']['dataset'],
          query_parameters(project.task['from']['query'],
                           project.task['from'].get('parameters')),
          legacy=project.task['from'].get('legacy', True))
      # makes sure types are correct in sheet
      rows = rows_to_type(rows)
      # Clear the target range before writing so stale rows do not linger.
      sheets_clear(project.task['to'].get('auth', project.task['auth']),
                   project.task['to']['sheet'],
                   project.task['to']['tab'],
                   project.task['to'].get('range', 'A2'))
      sheets_write(project.task['to'].get('auth', project.task['auth']),
                   project.task['to']['sheet'],
                   project.task['to']['tab'],
                   project.task['to'].get('range', 'A2'), rows)
    elif 'sftp' in project.task['to']:
      # NOTE(review): this branch reads 'use_legacy_sql' while the others
      # read 'legacy'/'useLegacySql' — presumably intentional, confirm.
      rows = query_to_rows(
          project.task['auth'],
          project.id,
          project.task['from']['dataset'],
          query_parameters(project.task['from']['query'],
                           project.task['from'].get('parameters')),
          legacy=project.task['from'].get('use_legacy_sql', True))
      # Only push when the query produced rows.
      if rows:
        if project.verbose:
          print('QUERY TO SFTP')
        put_rows(project.task['auth'], project.task['to'], rows)
    else:
      # Default destination for a query source is a view.
      if project.verbose:
        print('QUERY TO VIEW', project.task['to']['view'])
      query_to_view(
          project.task['auth'],
          project.id,
          project.task['to']['dataset'],
          project.task['to']['view'],
          query_parameters(project.task['from']['query'],
                           project.task['from'].get('parameters')),
          project.task['from'].get(
              'legacy', project.task['from'].get(
                  'useLegacySql', True)),  # DEPRECATED: useLegacySql
          project.task['to'].get('replace', False))

  # Fallback: load a Cloud Storage object ( bucket:path ) into a table.
  else:
    if project.verbose:
      print('STORAGE TO TABLE', project.task['to']['table'])
    storage_to_table(
        project.task['auth'], project.id, project.task['to']['dataset'],
        project.task['to']['table'],
        project.task['from']['bucket'] + ':' + project.task['from']['path'],
        project.task.get('schema', []), project.task.get('skip_rows', 1),
        project.task.get('structure', 'CSV'),
        project.task.get('disposition', 'WRITE_TRUNCATE'))
def get_rows(auth, source):
  """Processes standard read JSON block for dynamic loading of data.

  Allows us to quickly pull a column or columns of data from and use it as
  an input into a script. For example pull a list of ids from bigquery and
  act on each one.

  - When pulling a single column specify single_cell = True. Returns list
    AKA values.
  - When pulling multiple columns specify single_cell = False. Returns list
    of lists AKA rows.
  - Values are always given as a list ( single_cell will trigger necessary
    wrapping ).
  - Values, bigquery, sheet are optional; if a list of sources is given the
    result is one continuous iterator.
  - Extensible, add a handler to define a new source ( be kind, update the
    documentation json ).

  Example source blocks ( 'bigquery' takes either a query or a table ):

    { "single_cell": [ boolean ],
      "values": [ integer list ],
      "bigquery": { "dataset": [ string ], "query": [ string ],
                    "legacy": [ boolean ] } }

    { "bigquery": { "dataset": [ string ], "table": [ string ] } }

    { "sheet": { "sheet": [ string - full URL, suggest using share link ],
                 "tab": [ string ],
                 "range": [ string - A1:A notation ] } }

  Args:
    auth: (string) The type of authentication to use, user or service.
      Individual sources may override it with their own 'auth' key.
    source: (json) A json block resembling the examples above, or a list of
      such blocks to be chained.

  Returns:
    If single_cell is False: Returns a list of row values [[v1], [v2], ... ]
    If single_cell is True: Returns a list of values [v1, v2, ...]
  """

  # if handler points to list, concatenate all the values from various
  # sources into one iterator
  if isinstance(source, list):
    for s in source:
      for r in get_rows(auth, s):
        yield r

  # if handler is an endpoint, fetch data
  else:
    if 'values' in source:
      if isinstance(source['values'], list):
        for value in source['values']:
          yield value
      else:
        # a scalar is yielded as-is ( caller gets a single value )
        yield source['values']

    if 'sheet' in source:
      # BUG FIX: previously hardcoded project.task['auth'], ignoring both
      # the auth argument and any per-source 'auth' override; now consistent
      # with the bigquery branch below.
      rows = sheets_read(
          source['sheet'].get('auth', auth),
          source['sheet']['sheet'],
          source['sheet']['tab'],
          source['sheet']['range'],
      )

      for row in rows:
        yield row[0] if source.get('single_cell', False) else row

    if 'bigquery' in source:
      rows = []

      # 'table' reads the whole table; otherwise a query is executed
      if 'table' in source['bigquery']:
        rows = table_to_rows(
            source['bigquery'].get('auth', auth),
            project.id,
            source['bigquery']['dataset'],
            source['bigquery']['table'],
            as_object=source['bigquery'].get('as_object', False))

      else:
        rows = query_to_rows(
            source['bigquery'].get('auth', auth),
            project.id,
            source['bigquery']['dataset'],
            query_parameters(source['bigquery']['query'],
                             source['bigquery'].get('parameters', {})),
            legacy=source['bigquery'].get('legacy', False),
            as_object=source['bigquery'].get('as_object', False))

      for row in rows:
        yield row[0] if source.get('single_cell', False) else row
def bigquery_query():
  """Execute a query and write results to table.

  TODO: Replace with get_rows and put_rows combination.

  See: scripts/bigquery_query.json
       scripts/bigquery_storage.json
       scripts/bigquery_to_sheet.json
       scripts/bigquery_view.json
  """

  # Destination decides the branch: table, sheet, sftp, or ( default ) view.
  if 'table' in project.task['to']:
    if project.verbose:
      print('QUERY TO TABLE', project.task['to']['table'])

    query_to_table(
        project.task['auth'],
        project.id,
        project.task['to']['dataset'],
        project.task['to']['table'],
        query_parameters(project.task['from']['query'],
                         project.task['from'].get('parameters')),
        disposition=project.task['write_disposition']
        if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
        legacy=project.task['from'].get(
            'legacy',
            project.task['from'].get('useLegacySql',
                                     True)),  # DEPRECATED: useLegacySql,
        target_project_id=project.task['to'].get('project_id', project.id))

  elif 'sheet' in project.task['to']:
    if project.verbose:
      print('QUERY TO SHEET', project.task['to']['sheet'])

    rows = query_to_rows(project.task['auth'],
                         project.id,
                         project.task['from']['dataset'],
                         query_parameters(
                             project.task['from']['query'],
                             project.task['from'].get('parameters')),
                         legacy=project.task['from'].get('legacy', True))

    # makes sure types are correct in sheet
    rows = rows_to_type(rows)

    # Clear the target range first so stale rows do not linger.
    sheets_clear(project.task['to'].get('auth', project.task['auth']),
                 project.task['to']['sheet'], project.task['to']['tab'],
                 project.task['to'].get('range', 'A2'))
    sheets_write(project.task['to'].get('auth', project.task['auth']),
                 project.task['to']['sheet'], project.task['to']['tab'],
                 project.task['to'].get('range', 'A2'), rows)

  elif 'sftp' in project.task['to']:
    if project.verbose:
      print('QUERY TO SFTP')

    # NOTE(review): this branch reads 'use_legacy_sql' while the others read
    # 'legacy'/'useLegacySql' — presumably intentional, confirm.
    rows = query_to_rows(
        project.task['auth'], project.id, project.task['from']['dataset'],
        query_parameters(project.task['from']['query'],
                         project.task['from'].get('parameters')),
        legacy=project.task['from'].get('use_legacy_sql', True))

    # Only push when the query produced rows.
    if rows:
      put_rows(project.task['auth'], project.task['to'], rows)

  else:
    if project.verbose:
      print('QUERY TO VIEW', project.task['to']['view'])

    query_to_view(
        project.task['auth'], project.id, project.task['to']['dataset'],
        project.task['to']['view'],
        query_parameters(project.task['from']['query'],
                         project.task['from'].get('parameters')),
        project.task['from'].get(
            'legacy',
            project.task['from'].get('useLegacySql',
                                     True)),  # DEPRECATED: useLegacySql
        project.task['to'].get('replace', False))
def bigquery():
  """Dispatch a BigQuery task based on the shape of project.task.

  Branches, in priority order:
    - 'run' + 'run.query': execute a query with no result handling.
    - 'from.values': load literal rows into a table.
    - 'from.query': write query results to a table, storage object, sheet,
      sftp, or ( default ) view; a table destination may run an optional
      'pre_process_query' first.
    - otherwise: load a storage object ( bucket:path ) into a table.
  """
  # Fire-and-forget query execution.
  if 'run' in project.task and 'query' in project.task.get('run', {}):
    if project.verbose:
      print("QUERY", project.task['run']['query'])
    run_query(
        project.task['auth'],
        project.id,
        query_parameters(project.task['run']['query'],
                         project.task['run'].get('parameters')),
        project.task['run'].get('legacy', True),
    )

  # Literal values: stream them into the destination table.
  elif 'values' in project.task['from']:
    rows = get_rows(project.task['auth'], project.task['from'])
    rows_to_table(project.task['to'].get('auth', project.task['auth']),
                  project.id, project.task['to']['dataset'],
                  project.task['to']['table'], rows,
                  project.task.get('schema', []), 0)

  # Query source: destination decides the branch.
  elif 'query' in project.task['from']:

    if 'table' in project.task['to']:
      if project.verbose:
        print("QUERY TO TABLE", project.task['to']['table'])

      # Optional statement executed before the main query lands in the table.
      if 'pre_process_query' in project.task['to']:
        execute_statement(
            project.task['auth'],
            project.id,
            project.task['to']['dataset'],
            project.task['to']['pre_process_query'],
            use_legacy_sql=project.task['from'].get(
                'legacy', project.task['from'].get('useLegacySql', True)))

      query_to_table(
          project.task['auth'],
          project.id,
          project.task['to']['dataset'],
          project.task['to']['table'],
          query_parameters(project.task['from']['query'],
                           project.task['from'].get('parameters')),
          disposition=project.task['write_disposition']
          if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
          legacy=project.task['from'].get(
              'legacy', project.task['from'].get(
                  'useLegacySql', True)),  # DEPRECATED: useLegacySql,
          target_project_id=project.task['to'].get(
              'project_id', project.id))

    # Kept despite an earlier removal attempt; see historical note:
    # "Mauriciod: Yes, it is used, look at project/mauriciod/target_winrate.json"
    elif 'storage' in project.task['to']:
      if project.verbose:
        print("QUERY TO STORAGE", project.task['to']['storage'])

      local_file_name = '/tmp/%s' % str(uuid.uuid1())
      rows = query_to_rows(project.task['auth'], project.id,
                           project.task['from']['dataset'],
                           project.task['from']['query'])

      # BUG FIX: csv.writer requires a text-mode file in Python 3; this was
      # previously opened 'wb' which raises TypeError on the first row.
      # Context managers also guarantee the handles close on error.
      with open(local_file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(rows)

      with open(local_file_name, 'rb') as f:
        object_put(project.task['auth'], project.task['to']['storage'], f)

      os.remove(local_file_name)

    elif 'sheet' in project.task['to']:
      if project.verbose:
        print("QUERY TO SHEET", project.task['to']['sheet'])

      rows = query_to_rows(project.task['auth'], project.id,
                           project.task['from']['dataset'],
                           project.task['from']['query'],
                           legacy=project.task['from'].get('legacy', True))

      # makes sure types are correct in sheet
      rows = rows_to_type(rows)

      # Clear the target range first so stale rows do not linger.
      sheets_clear(project.task['auth'], project.task['to']['sheet'],
                   project.task['to']['tab'],
                   project.task['to'].get('range', 'A2'))
      sheets_write(project.task['auth'], project.task['to']['sheet'],
                   project.task['to']['tab'],
                   project.task['to'].get('range', 'A2'), rows)

    elif 'sftp' in project.task['to']:
      # NOTE(review): this branch reads 'use_legacy_sql' while the others
      # read 'legacy'/'useLegacySql' — presumably intentional, confirm.
      rows = query_to_rows(project.task['auth'], project.id,
                           project.task['from']['dataset'],
                           project.task['from']['query'],
                           legacy=project.task['from'].get(
                               'use_legacy_sql', True))

      # Only push when the query produced rows.
      if rows:
        if project.verbose:
          print("QUERY TO SFTP")
        put_rows(project.task['auth'], project.task['to'], rows)

    else:
      # Default destination for a query source is a view.
      if project.verbose:
        print("QUERY TO VIEW", project.task['to']['view'])
      query_to_view(
          project.task['auth'],
          project.id,
          project.task['to']['dataset'],
          project.task['to']['view'],
          query_parameters(project.task['from']['query'],
                           project.task['from'].get('parameters')),
          project.task['from'].get(
              'legacy', project.task['from'].get(
                  'useLegacySql', True)),  # DEPRECATED: useLegacySql
          project.task['to'].get('replace', False))

  # Fallback: load a Cloud Storage object ( bucket:path ) into a table.
  else:
    if project.verbose:
      print("STORAGE TO TABLE", project.task['to']['table'])
    storage_to_table(
        project.task['auth'], project.id, project.task['to']['dataset'],
        project.task['to']['table'],
        project.task['from']['bucket'] + ':' + project.task['from']['path'],
        project.task.get('schema', []), project.task.get('skip_rows', 1),
        project.task.get('structure', 'CSV'),
        project.task.get('disposition', 'WRITE_TRUNCATE'))