def clear(self):
    """Clears the store in the Bulkdozer feed.

    When a sheet id (``self.trix_id``) is set, range ``A1:Z1`` of the
    'Store' tab is cleared. The in-memory store and id map are always
    reset to empty dictionaries.
    """
    # NOTE(review): the in-memory reset is deliberately unconditional so a
    # store without a sheet attached is emptied too -- confirm no caller
    # relies on the dictionaries surviving clear() when trix_id is unset.
    if self.trix_id:
        sheets_clear(self.auth, self.trix_id, 'Store', 'A1:Z1')

    self._store = {}
    self._id_map = {}
def bigquery():
    """Runs the BigQuery task described by ``project.task``.

    When ``project.task['from']`` contains a ``query``, the result is routed
    by the first matching key in ``project.task['to']``:

    - ``table``: runs the optional ``pre_process_query`` statement, then
      writes the query result to the table.
    - ``storage``: writes the rows to a temporary CSV file and uploads it.
    - ``trix``: clears and then writes the sheet range given as
      ``<tab>!<range>``.
    - ``sftp``: passes non-empty results to ``put_rows``.
    - anything else: creates a view from the query.

    Without a ``query``, ``bucket:path`` from ``project.task['from']`` is
    loaded into the destination table.
    """
    task = project.task
    source = task['from']
    target = task['to']

    if 'query' not in source:
        if project.verbose:
            print('STORAGE TO TABLE %s' % target['table'])
        storage_to_table(
            task['auth'],
            project.id,
            target['dataset'],
            target['table'],
            source['bucket'] + ':' + source['path'],
            task.get('schema', []),
            task.get('skip_rows', 1),
            task.get('structure', 'CSV'),
            task.get('disposition', 'WRITE_TRUNCATE'))
        return

    if 'table' in target:
        if project.verbose:
            print('QUERY TO TABLE %s' % target['table'])

        # DEPRECATED: useLegacySql, 'legacy' is the preferred key.
        legacy = source.get('legacy', source.get('useLegacySql', True))

        if 'pre_process_query' in target:
            print('executing statement')
            execute_statement(
                task['auth'],
                project.id,
                target['dataset'],
                target['pre_process_query'],
                use_legacy_sql=legacy)

        query_to_table(
            task['auth'],
            project.id,
            target['dataset'],
            target['table'],
            query_parameters(source['query'], source.get('parameters')),
            disposition=task.get('write_disposition', 'WRITE_TRUNCATE'),
            legacy=legacy,
            target_project_id=target.get('project_id', project.id))

    # NOTE: this branch was once marked for removal but is still in use,
    # see project/mauriciod/target_winrate.json.
    elif 'storage' in target:
        if project.verbose:
            print('QUERY TO STORAGE %s' % target['storage'])

        local_file_name = '/tmp/%s' % str(uuid.uuid1())
        rows = query_to_rows(task['auth'], project.id, source['dataset'],
                             source['query'])

        # The context managers close the handles and the finally clause
        # removes the temporary file even if the write or upload fails.
        try:
            # Binary mode is what the Python 2 csv module expects.
            with open(local_file_name, 'wb') as csv_file:
                csv.writer(csv_file).writerows(rows)

            with open(local_file_name, 'rb') as csv_file:
                object_put(task['auth'], target['storage'], csv_file)
        finally:
            if os.path.exists(local_file_name):
                os.remove(local_file_name)

    elif 'trix' in target:
        if project.verbose:
            print('QUERY TO SHEET %s' % target['trix'])

        # Only the 'legacy' key is consulted on this path.
        rows = query_to_rows(task['auth'], project.id, source['dataset'],
                             source['query'],
                             legacy=source.get('legacy', True))

        # makes sure types are correct in sheet
        rows = rows_to_type(rows)

        # The range is given as '<tab>!<range>'.
        range_parts = target['range'].split('!')
        tab, cells = range_parts[0], range_parts[1]

        sheets_clear(task['auth'], target['trix'], tab, cells)
        sheets_write(task['auth'], target['trix'], tab, cells, rows)

    elif 'sftp' in target:
        # Only the 'use_legacy_sql' key is consulted on this path.
        rows = query_to_rows(task['auth'], project.id, source['dataset'],
                             source['query'],
                             legacy=source.get('use_legacy_sql', True))

        if rows:
            if project.verbose:
                print('QUERY TO SFTP')
            put_rows(task['auth'], target, '', rows)

    else:
        if project.verbose:
            print('QUERY TO VIEW %s' % target['view'])

        query_to_view(
            task['auth'],
            project.id,
            target['dataset'],
            target['view'],
            query_parameters(source['query'], source.get('parameters')),
            # DEPRECATED: useLegacySql
            source.get('legacy', source.get('useLegacySql', True)))
def clear(self):
    """Clears the log tab in the Bulkdozer feed.

    Useful when a new execution is starting: range ``A1:B`` of the 'Log'
    tab is cleared and the row counter is set back to 1.
    """
    log_tab = 'Log'
    log_range = 'A1:B'

    sheets_clear(self.auth, self.trix_id, log_tab, log_range)

    # Reset the row counter to its starting value.
    self._row = 1
def sheets():
    """Runs the sheets task described by ``project.task``.

    Steps, in order:

    1. Clear the range when ``clear`` is set.
    2. Delete the tab when ``delete`` is set (after the clear, to prevent
       errors in case both are given).
    3. Copy the tab from ``template`` when one is given, otherwise create
       the tab.
    4. Write the rows described by ``write`` into the range.
    5. When ``out`` is given, read the range back and load it into the
       BigQuery table named in ``out.bigquery``, using the schema from the
       JSON or, if none is defined, one derived from the rows.
    """
    if project.verbose:
        print('SHEETS')

    task = project.task
    auth = task['auth']
    sheet = task['sheet']
    tab = task['tab']

    # clear if specified
    if task.get('clear', False):
        sheets_clear(auth, sheet, tab, task['range'])

    # delete if specified ( after clear to prevent errors in case both are given )
    if task.get('delete', False):
        sheets_tab_delete(auth, sheet, tab)

    # copy from the template if one is specified, otherwise create the tab
    if 'template' in task:
        template = task['template']
        sheets_tab_copy(auth, template['sheet'], template['tab'], sheet, tab)
    else:
        sheets_tab_create(auth, sheet, tab)

    # write data if specified
    if 'write' in task:
        sheets_write(auth, sheet, tab, task['range'],
                     get_rows(auth, task['write']))

    # move if specified
    if 'out' not in task:
        return

    rows = sheets_read(auth, sheet, tab, task['range'])

    if not rows:
        print('SHEET EMPTY')
        return

    out_bigquery = task['out']['bigquery']
    has_header = task.get('header', False)

    schema = out_bigquery.get('schema')

    if schema:
        # RECOMMENDED: define schema in json
        if project.verbose:
            print('SHEETS SCHEMA DEFINED')
    else:
        # NOT RECOMMENDED: determine schema if missing
        if project.verbose:
            print('SHEETS SCHEMA DETECT ( Note Recommended - Define Schema In JSON )')

        # cast rows to types ( for schema detection )
        rows = rows_to_type(rows)
        rows, schema = get_schema(rows, has_header,
                                  infer_type=task.get('infer_type', True))

    # write to table ( not using put because no use cases for other destinations )
    rows_to_table(
        auth=auth,
        project_id=project.id,
        dataset_id=out_bigquery['dataset'],
        table_id=out_bigquery['table'],
        rows=rows,
        schema=schema,
        skip_rows=1 if has_header else 0,
        disposition=out_bigquery.get('disposition', 'WRITE_TRUNCATE'))
def _put_rows_sftp(config, rows):
    """Uploads rows as a timestamped CSV file over SFTP, reporting (not raising) failures."""
    saved_stderr = sys.stderr
    connection = None
    tmp_file_name = None

    try:
        # NOTE(review): stderr is swallowed while connecting, presumably to
        # hide pysftp's host key warning -- confirm.
        sys.stderr = StringIO()

        cnopts = pysftp.CnOpts()
        # SECURITY: host key verification is disabled, so the server identity
        # is not checked. Left as is; callers may depend on it.
        cnopts.hostkeys = None

        # Read the prefix without deleting it, so the caller's dictionary is
        # left untouched and a second call sees the same configuration.
        file_prefix = config.get('file_prefix', 'report')

        connection = pysftp.Connection(
            host=config['host'],
            username=config['username'],
            password=config['password'],
            port=config.get('port', 22),
            cnopts=cnopts)

        if 'directory' in config:
            connection.cwd(config['directory'])

        tmp_file_name = '/tmp/%s_%s.csv' % (
            file_prefix,
            datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S'))

        with open(tmp_file_name, 'wb') as tmp_file:
            tmp_file.write(rows_to_csv(rows).read())

        connection.put(tmp_file_name)

    except Exception as e:
        # Best effort: report the failure and carry on. stderr is restored
        # first so the traceback is actually visible.
        sys.stderr = saved_stderr
        print(e)
        traceback.print_exc()

    finally:
        sys.stderr = saved_stderr
        if connection is not None:
            connection.close()
        if tmp_file_name and os.path.exists(tmp_file_name):
            os.remove(tmp_file_name)


def put_rows(auth, destination, filename, rows, variant=''):
    """Processes standard write JSON block for dynamic export of data.

    Allows us to quickly write the results of a script to a destination. For
    example write the results of a DCM report into BigQuery.

    - Will write to multiple destinations if specified.
    - Extensible, add a handler to define a new destination ( be kind update
      the documentation json ).

    Include the following JSON in a recipe, then in the run.py handler when
    encountering that block pass it to this function.

      from utils.data import put_rows

      var_json = {
        "out":{
          "bigquery":{
            "auth": [ string - optional, overrides the auth argument ],
            "project_id": [ string - optional, defaults to project.id ],
            "dataset": [ string ],
            "table": [ string ],
            "format": [ string - CSV ( default ) or JSON ],
            "is_incremental_load": [ boolean - append incrementally ],
            "schema": [ json - standard bigquery schema json ],
            "skip_rows": [ integer - for removing header, default 1 ],
            "disposition": [ string - same as BigQuery documentation ]
          },
          "sheets":{
            "sheet":[ string - passed to the sheets helpers as the sheet ],
            "tab":[ string ],
            "range":[ string - A1:A notation ],
            "delete": [ boolean - if sheet range should be cleared before writing ]
          },
          "storage":{
            "bucket": [ string ],
            "path": [ string ]
          },
          "directory":[ string - full path to place to write file ],
          "sftp":{
            "host": [ string ],
            "username": [ string ],
            "password": [ string ],
            "port": [ integer - optional, defaults to 22 ],
            "directory": [ string - optional remote directory ],
            "file_prefix": [ string - optional, defaults to report ]
          }
        }
      }

      put_rows('user', var_json['out'], 'file.csv', rows)

    Or you can use it directly with project singleton.

      from util.project import project
      from utils.data import put_rows

      def something():
        put_rows(project.task['auth'], project.task['out'], 'file.csv', rows)

      if __name__ == "__main__":
        project.load('something')
        something()

    Args:
      auth: (string) The type of authentication to use, user or service.
      destination: (json) A json block resembling var_json['out'] described
        above.
      filename: (string) A unique filename if writing to medium requiring
        one, usually generated by script.
      rows: (list) The data being written as a list object.
      variant: (string) Appends this to the destination name to create a
        variant ( for example when downloading multiple tabs in a sheet ).

    Returns:
      None. Data is written to every destination present in the block. SFTP
      failures are printed rather than raised.
    """
    if 'bigquery' in destination:
        bigquery = destination['bigquery']
        bigquery_auth = bigquery.get('auth', auth)
        bigquery_project = bigquery.get('project_id', project.id)
        bigquery_table = bigquery['table'] + variant
        bigquery_schema = bigquery.get('schema', [])

        if bigquery.get('format', 'CSV') == 'JSON':
            json_to_table(
                bigquery_auth,
                bigquery_project,
                bigquery['dataset'],
                bigquery_table,
                rows,
                bigquery_schema,
                bigquery.get('disposition', 'WRITE_TRUNCATE'),
            )

        # Kept as an equality test so truthy non-boolean values ( for example
        # strings ) do not switch on incremental loading.
        elif bigquery.get('is_incremental_load', False) == True:
            incremental_rows_to_table(
                bigquery_auth,
                bigquery_project,
                bigquery['dataset'],
                bigquery_table,
                rows,
                bigquery_schema,
                bigquery.get('skip_rows', 1),
                bigquery.get('disposition', 'WRITE_APPEND'),
                billing_project_id=project.id
            )

        else:
            rows_to_table(
                bigquery_auth,
                bigquery_project,
                bigquery['dataset'],
                bigquery_table,
                rows,
                bigquery_schema,
                bigquery.get('skip_rows', 1),
                bigquery.get('disposition', 'WRITE_TRUNCATE'),
            )

    if 'sheets' in destination:
        sheets = destination['sheets']
        tab = sheets['tab'] + variant

        if sheets.get('delete', False):
            sheets_clear(auth, sheets['sheet'], tab, sheets['range'])

        sheets_write(auth, sheets['sheet'], tab, sheets['range'], rows)

    if 'directory' in destination:
        file_out = destination['directory'] + variant + filename
        if project.verbose:
            print('SAVING %s' % file_out)

        makedirs_safe(parse_path(file_out))
        with open(file_out, 'wb') as save_file:
            save_file.write(rows_to_csv(rows).read())

    if ('storage' in destination
            and destination['storage'].get('bucket')
            and destination['storage'].get('path')):
        storage = destination['storage']

        # create the bucket
        bucket_create(auth, project.id, storage['bucket'])

        # put the file
        file_out = storage['bucket'] + ':' + storage['path'] + variant + filename
        if project.verbose:
            print('SAVING %s' % file_out)
        object_put(auth, file_out, rows_to_csv(rows))

    # deprecated do not use
    if 'trix' in destination:
        trix = destination['trix']
        trix_update(auth, trix['sheet_id'], trix['sheet_range'],
                    rows_to_csv(rows), trix['clear'])

    # The email destination is accepted but nothing is written for it.
    if 'email' in destination:
        pass

    if 'sftp' in destination:
        _put_rows_sftp(destination['sftp'], rows)