def write_report(report, dataset, table):
  # turn report file into rows
  rows = report_to_rows(report)
  rows = report_clean(rows)

  if rows:
    if project.verbose:
      print("DYNAMIC COSTS WRITTEN:", table)

    # pull DCM schema automatically
    try:
      schema = report_schema(next(rows))
    except StopIteration:  # report is empty
      raise ValueError("REPORT DID NOT RUN")

    # write report to bigquery
    rows_to_table(
      project.task['out']["auth"],
      project.id,
      project.task['out']["dataset"],
      table,
      rows,
      schema,
      0
    )

  else:
    if project.verbose:
      print("DYNAMIC COSTS REPORT NOT READY:", table)
def create_cm_site_segmentation(config, task): # Read sheet to bq table sheet_rows = sheets_read(config, task['auth_sheets'], task['sheet'], 'CM_Site_Segments', 'A:C', retries=10) if not sheet_rows: sheet_rows = [] schema = [ { "type": "STRING", "name": "Site_Dcm", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" }, { "type": "STRING", "name": "Site_Type", "mode": "NULLABLE" } ] rows_to_table( config, auth=task['auth_bq'], project_id=config.project, dataset_id=task['dataset'], table_id=CM_SITE_SEGMENTATION_SHEET_TABLE, rows=sheet_rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE' ) # Get Site_Type from the sheet run_query_from_file(config, task, Queries.cm_site_segmentation.replace('{{dataset}}', task['dataset']), CM_SITE_SEGMENTATION_TABLE) # Move Table back to sheets query = 'SELECT * from `' + config.project + '.' + task['dataset'] + '.' + CM_SITE_SEGMENTATION_TABLE + '`' rows = query_to_rows(config, task['auth_bq'], config.project, task['dataset'], query, legacy=False) # makes sure types are correct in sheet rows = rows_to_type(rows) sheets_clear(config, task['auth_sheets'], task['sheet'], 'CM_Site_Segments', 'A2:C') sheets_write(config, task['auth_sheets'], task['sheet'], 'CM_Site_Segments', 'A2:C', rows)
def anonymize_table(table_id):
  if project.verbose:
    print('ANONYMIZE TABLE', project.task['bigquery']['to']['dataset'], table_id)

  schema = API_BigQuery(project.task['auth']).tables().get(
    projectId=project.task['bigquery']['from']['project'],
    datasetId=project.task['bigquery']['from']['dataset'],
    tableId=table_id
  ).execute()['schema']['fields']

  rows = table_to_rows(
    project.task['auth'],
    project.task['bigquery']['from']['project'],
    project.task['bigquery']['from']['dataset'],
    table_id
  )

  rows = anonymize_rows(rows, schema)

  rows_to_table(
    project.task['auth'],
    project.task['bigquery']['to']['project'],
    project.task['bigquery']['to']['dataset'],
    table_id,
    rows,
    schema,
    skip_rows=0,
    disposition='WRITE_TRUNCATE'
  )
def dcm_log(config, task): if config.verbose: print('DCM LOG') accounts = list(get_rows(config, 'user', task['accounts'])) # determine start log date if table_exists(config, task['out']['auth'], task['out']['project'], task['out']['dataset'], CHANGELOGS_TABLE): start = next( query_to_rows( config, task['out']['auth'], task['out']['project'], task['out']['dataset'], 'SELECT FORMAT_TIMESTAMP("%%Y-%%m-%%dT%%H:%%M:%%S-00:00", MAX(changeTime), "UTC") FROM `%s`' % CHANGELOGS_TABLE, 1, False))[0] disposition = 'WRITE_APPEND' else: start = (datetime.utcnow() - timedelta(days=int(task['days'])) ).strftime('%Y-%m-%dT%H:%M:%S-00:00') disposition = 'WRITE_TRUNCATE' # load new logs rows = get_changelogs(config, task, accounts, start) if rows: rows_to_table(config, task['out']['auth'], task['out']['project'], task['out']['dataset'], CHANGELOGS_TABLE, rows, CHANGELOGS_SCHEMA, 0, disposition)
def sdf_to_bigquery(config, auth, sdf_zip_file, project_id, dataset, time_partitioned_table, create_single_day_table, table_suffix=''): with zipfile.ZipFile(sdf_zip_file, 'r', zipfile.ZIP_DEFLATED) as d: file_names = d.namelist() for file_name in file_names: if config.verbose: print('SDF: Loading: ' + file_name) with d.open(file_name) as sdf_file: rows = csv_to_rows(sdf_file.read().decode('utf-8')) if not rows: if config.verbose: print('SDF: Empty file ' + file_name) continue table_name = file_name.split('.')[0].replace( '-', '_') + table_suffix schema = sdf_schema(next(rows)) # Check if each SDF should have a dated table if create_single_day_table: table_name_dated = table_name + date.today().strftime( '%Y_%m_%d') # Create table and upload data table_create(auth, project_id, dataset, table_name_dated) rows_to_table(config, auth, project_id, dataset, table_name_dated, rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Create end result table if it doesn't already exist if not table_exists(config, auth, project_id, dataset, table_name): table_create(config, auth, project_id, dataset, table_name, is_time_partition=time_partitioned_table) rows_to_table(config, auth, project_id, dataset, table_name, rows, schema=schema, skip_rows=1, disposition='WRITE_APPEND' if time_partitioned_table else 'WRITE_TRUNCATE')
def sov_dbm():
  # 1 - create one report for the client and one report for the peers
  client_name, peer_name = sov_create_reports()

  # 2 - download the reports and aggregate into one table
  sov_rows = sov_process_client(client_name)
  sov_rows.extend(sov_process_peer(peer_name))

  # 3 - save the report to the specified table
  rows_to_table(
    project.task['auth'],
    project.id,
    project.task['dataset'],
    project.task['table'],
    sov_rows,
    SCHEMA,
    0
  )
def create_cm_site_segmentation(project): # Read sheet to bq table sheet_rows = sheets_read('user', project.task['sheet'], 'CM_Site_Segments', 'A:C', retries=10) if not sheet_rows: sheet_rows = [] schema = [{ "type": "STRING", "name": "Site_Dcm", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" }, { "type": "STRING", "name": "Site_Type", "mode": "NULLABLE" }] rows_to_table(auth='service', project_id=project.id, dataset_id=project.task['dataset'], table_id=CM_SITE_SEGMENTATION_SHEET_TABLE, rows=sheet_rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Get Site_Type from the sheet run_query_from_file( os.path.join(os.path.dirname(__file__), SQL_DIRECTORY + CM_SITE_SEGMENTATION_FILENAME), project.id, project.task['dataset'], CM_SITE_SEGMENTATION_TABLE) # Move Table back to sheets query = 'SELECT * from `' + project.id + '.' + project.task[ 'dataset'] + '.' + CM_SITE_SEGMENTATION_TABLE + '`' rows = query_to_rows('service', project.id, project.task['dataset'], query, legacy=False) # makes sure types are correct in sheet rows = rows_to_type(rows) sheets_clear('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C') sheets_write('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C', rows)
def create_dv360_segments(config, task): a1_notation = 'A:N' schema = [ { "type": "STRING", "name": "Advertiser", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Advertiser_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Campaign", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Campaign_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Insertion_Order", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Insertion_Order_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Line_Item", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Line_Item_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Line_Item_Type", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" }, { "type": "STRING", "name": "SegmentAutoGen", "mode": "NULLABLE" }, { "type": "STRING", "name": "Segment1", "mode": "NULLABLE" }, { "type": "STRING", "name": "Segment2", "mode": "NULLABLE" }, { "type": "STRING", "name": "Segment3", "mode": "NULLABLE" } ] sheet_rows = sheets_read(config, task['auth_sheets'], task['sheet'], 'DV3 Segments', a1_notation, retries=10) if not sheet_rows: sheet_rows = [] print('DV360 SEGMENT SHEET TABLE WRITE') rows_to_table( config, auth=task['auth_bq'], project_id=config.project, dataset_id=task['dataset'], table_id=DV360_CUSTOM_SEGMENTS_SHEET_TABLE, rows=sheet_rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE' ) # Run Query if config.verbose: print('RUN DV360 Custom Segments Query') run_query_from_file(config, task, Queries.dv360_custom_segments.replace('{{dataset}}', task['dataset']), DV360_CUSTOM_SEGMENTS_TABLE) # Move Table back to sheets query = 'SELECT * from `' + config.project + '.' + task['dataset'] + '.' + DV360_CUSTOM_SEGMENTS_TABLE + '`' rows = query_to_rows(config, task['auth_bq'], config.project, task['dataset'], query, legacy=False) # makes sure types are correct in sheet a1_notation = a1_notation[:1] + '2' + a1_notation[1:] rows = rows_to_type(rows) sheets_clear(config, task['auth_sheets'], task['sheet'], 'DV3 Segments', a1_notation) sheets_write(config, task['auth_sheets'], task['sheet'], 'DV3 Segments', a1_notation, rows)
def bigquery_values(config, task):
  """Write explicit values to a table.

  TODO: Replace with get_rows.

  See: scripts/bigquery_run_query.json
  """

  if config.verbose:
    print('VALUES', task['from']['values'])

  rows = data.get_rows(config, task['auth'], task['from'])

  bq.rows_to_table(
    config,
    task['to'].get('auth', task['auth']),
    config.project,
    task['to']['dataset'],
    task['to']['table'],
    rows,
    task.get('schema', []),
    0
  )
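# A minimal usage sketch for bigquery_values() above, assuming the Configuration
# class from the command line helper later in this collection; the import path,
# project, dataset, table, and values are illustrative, not taken from a recipe.
from starthinker.util.configuration import Configuration  # assumed module path

config = Configuration(project='my-gcp-project', service='service.json', verbose=True)

task = {
  'auth': 'service',
  'from': {
    'values': [('apple', 10), ('banana', 20)]   # explicit rows to write
  },
  'to': {
    'dataset': 'Example_Dataset',
    'table': 'Example_Table'
  },
  'schema': [
    {'name': 'Name', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'Count', 'type': 'INTEGER', 'mode': 'NULLABLE'}
  ]
}

bigquery_values(config, task)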
def bigquery_values():
  """Write explicit values to a table.

  TODO: Replace with get_rows.

  See: scripts/bigquery_run_query.json
  """

  if project.verbose:
    print('VALUES', project.task['from']['values'])

  rows = get_rows(project.task['auth'], project.task['from'])

  rows_to_table(
    project.task['to'].get('auth', project.task['auth']),
    project.id,
    project.task['to']['dataset'],
    project.task['to']['table'],
    rows,
    project.task.get('schema', []),
    0
  )
def create_cm_site_segmentation(project): # Read sheet to bq table sheet_rows = sheets_read( 'user', project.task['sheet'], 'CM_Site_Segments', 'A:C', retries=10) if not sheet_rows: sheet_rows = [] schema = [{ 'type': 'STRING', 'name': 'Site_Dcm', 'mode': 'NULLABLE' }, { 'type': 'INTEGER', 'name': 'Impressions', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Site_Type', 'mode': 'NULLABLE' }] rows_to_table( auth='service', project_id=project.id, dataset_id=project.task['dataset'], table_id=CM_SITE_SEGMENTATION_SHEET_TABLE, rows=sheet_rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Get Site_Type from the sheet run_query_from_file(Queries.cm_site_segmentation, CM_SITE_SEGMENTATION_TABLE) # Move Table back to sheets query = 'SELECT * from `' + project.id + '.' + project.task[ 'dataset'] + '.' + CM_SITE_SEGMENTATION_TABLE + '`' rows = query_to_rows( 'service', project.id, project.task['dataset'], query, legacy=False) # makes sure types are correct in sheet rows = rows_to_type(rows) sheets_clear('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C') sheets_write('user', project.task['sheet'], 'CM_Site_Segments', 'A2:C', rows)
def monthly_budget_mover(): if project.verbose: print('MONTHLY BUDGET MOVER') report = report_to_list(project.task['auth'], project.task['spend_report_id']) categories = remove_excluded_ios_from_categories(project.task['budget_categories'], project.task['excluded_ios']) categories_spend = aggregate_io_spend_to_categories(report, categories) sdf = list(sdf_read(project.task['auth'], project.task['sdf']['file_types'].split(','), project.task['sdf']['filter_type'], project.task['sdf']['filter_ids'])) categories_budget = aggregate_io_budget_to_categories(sdf, categories) category_budget_deltas = calc_budget_spend_deltas(categories_budget, categories_spend, categories) new_sdf,changes = apply_category_budgets(sdf, category_budget_deltas, categories) # Set Schemas for BQ tables schema = make_schema(new_sdf[0]) schema_changes = make_schema(['Io Id','Category','Old Value','Delta','New Value']) # Write old sdf to table rows_to_table(project.task['auth'], project.id, project.task['out']['dataset'], project.task['out']['old_sdf_table_name'], sdf, schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Write new sdf to table rows_to_table(project.task['auth'], project.id, project.task['out']['dataset'], project.task['out']['new_sdf_table_name'], new_sdf, schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Write log file to table rows_to_table(project.task['auth'], project.id, project.task['out']['dataset'], project.task['out']['changes_table_name'], iter(changes), schema_changes, skip_rows=0, disposition='WRITE_TRUNCATE')
def write_report(config, task, report, dataset, table):
  # turn report file into rows
  rows = report_to_rows(report)
  rows = report_clean(rows)

  if rows:
    if config.verbose:
      print('DYNAMIC COSTS WRITTEN:', table)

    # pull DCM schema automatically
    try:
      schema = report_schema(next(rows))
    except StopIteration:  # report is empty
      raise ValueError('REPORT DID NOT RUN')

    # write report to bigquery
    rows_to_table(
      config,
      task['out']['auth'],
      config.project,
      task['out']['dataset'],
      table,
      rows,
      schema,
      0
    )

  else:
    if config.verbose:
      print('DYNAMIC COSTS REPORT NOT READY:', table)
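# Sketch of the task block write_report() reads above; only task['out'] is used
# for the BigQuery write. The dataset and table names are illustrative, and the
# report argument is whatever payload the upstream CM report download returns.
task = {
  'out': {
    'auth': 'service',          # credentials used for the BigQuery write
    'dataset': 'Dynamic_Costs'  # illustrative destination dataset
  }
}

# report = <file-like or string report payload from the report fetch step>
# write_report(config, task, report, 'Dynamic_Costs', 'Dynamic_Costs_Analysis')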
def anonymize_table(table_id): if project.verbose: print('ANONYMIZE TABLE', project.task['bigquery']['to']['dataset'], table_id) schema = API_BigQuery(project.task['auth']).tables().get( projectId=project.task['bigquery']['from']['project'], datasetId=project.task['bigquery']['from']['dataset'], tableId=table_id).execute()['schema']['fields'] is_object = any([s['type'] == 'RECORD' for s in schema]) rows = table_to_rows(project.task['auth'], project.task['bigquery']['from']['project'], project.task['bigquery']['from']['dataset'], table_id, as_object=is_object) rows = anonymize_rows(rows, schema, is_object) if is_object: json_to_table(project.task['auth'], project.task['bigquery']['to']['project'], project.task['bigquery']['to']['dataset'], table_id, rows, schema, disposition='WRITE_TRUNCATE') else: rows_to_table(project.task['auth'], project.task['bigquery']['to']['project'], project.task['bigquery']['to']['dataset'], table_id, rows, schema, skip_rows=0, disposition='WRITE_TRUNCATE')
def create_dv360_segments(project): a1_notation = 'A:M' schema = [{ 'type': 'STRING', 'name': 'Advertiser', 'mode': 'NULLABLE' }, { 'type': 'INTEGER', 'name': 'Advertiser_Id', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Campaign', 'mode': 'NULLABLE' }, { 'type': 'INTEGER', 'name': 'Campaign_Id', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Insertion_Order', 'mode': 'NULLABLE' }, { 'type': 'INTEGER', 'name': 'Insertion_Order_Id', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Line_Item', 'mode': 'NULLABLE' }, { 'type': 'INTEGER', 'name': 'Line_Item_Id', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Line_Item_Type', 'mode': 'NULLABLE' }, { 'type': 'INTEGER', 'name': 'Impressions', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Segment1', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Segment2', 'mode': 'NULLABLE' }, { 'type': 'STRING', 'name': 'Segment3', 'mode': 'NULLABLE' }] sheet_rows = sheets_read( 'user', project.task['sheet'], 'DV3 Segments', a1_notation, retries=10) if not sheet_rows: sheet_rows = [] print('DV360 SEGMENT SHEET TABLE WRITE') rows_to_table( auth='service', project_id=project.id, dataset_id=project.task['dataset'], table_id=DV360_CUSTOM_SEGMENTS_SHEET_TABLE, rows=sheet_rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Run Query if project.verbose: print('RUN DV360 Custom Segments Query') run_query_from_file(Queries.dv360_custom_segments, DV360_CUSTOM_SEGMENTS_TABLE) # Move Table back to sheets query = 'SELECT * from `' + project.id + '.' + project.task[ 'dataset'] + '.' + DV360_CUSTOM_SEGMENTS_TABLE + '`' rows = query_to_rows( 'service', project.id, project.task['dataset'], query, legacy=False) # makes sure types are correct in sheet a1_notation = a1_notation[:1] + '2' + a1_notation[1:] rows = rows_to_type(rows) sheets_clear('user', project.task['sheet'], 'DV3 Segments', a1_notation) sheets_write('user', project.task['sheet'], 'DV3 Segments', a1_notation, rows)
def bigquery(): if 'function' in project.task: query = None if project.task['function'] == 'pearson_significance_test': query = pearson_significance_test() if query: run_query(project.task['auth'], project.id, query, False, project.task['to']['dataset']) elif 'run' in project.task and 'query' in project.task.get('run', {}): if project.verbose: print('QUERY', project.task['run']['query']) run_query( project.task['auth'], project.id, query_parameters(project.task['run']['query'], project.task['run'].get('parameters')), project.task['run'].get('legacy', True), ) elif 'values' in project.task['from']: rows = get_rows(project.task['auth'], project.task['from']) rows_to_table(project.task['to'].get('auth', project.task['auth']), project.id, project.task['to']['dataset'], project.task['to']['table'], rows, project.task.get('schema', []), 0) elif 'query' in project.task['from']: if 'table' in project.task['to']: if project.verbose: print('QUERY TO TABLE', project.task['to']['table']) query_to_table( project.task['auth'], project.id, project.task['to']['dataset'], project.task['to']['table'], query_parameters(project.task['from']['query'], project.task['from'].get('parameters')), disposition=project.task['write_disposition'] if 'write_disposition' in project.task else 'WRITE_TRUNCATE', legacy=project.task['from'].get( 'legacy', project.task['from'].get( 'useLegacySql', True)), # DEPRECATED: useLegacySql, target_project_id=project.task['to'].get( 'project_id', project.id)) elif 'sheet' in project.task['to']: if project.verbose: print('QUERY TO SHEET', project.task['to']['sheet']) rows = query_to_rows( project.task['auth'], project.id, project.task['from']['dataset'], query_parameters(project.task['from']['query'], project.task['from'].get('parameters')), legacy=project.task['from'].get('legacy', True)) # makes sure types are correct in sheet rows = rows_to_type(rows) sheets_clear(project.task['to'].get('auth', project.task['auth']), project.task['to']['sheet'], project.task['to']['tab'], project.task['to'].get('range', 'A2')) sheets_write(project.task['to'].get('auth', project.task['auth']), project.task['to']['sheet'], project.task['to']['tab'], project.task['to'].get('range', 'A2'), rows) elif 'sftp' in project.task['to']: rows = query_to_rows( project.task['auth'], project.id, project.task['from']['dataset'], query_parameters(project.task['from']['query'], project.task['from'].get('parameters')), legacy=project.task['from'].get('use_legacy_sql', True)) if rows: if project.verbose: print('QUERY TO SFTP') put_rows(project.task['auth'], project.task['to'], rows) else: if project.verbose: print('QUERY TO VIEW', project.task['to']['view']) query_to_view( project.task['auth'], project.id, project.task['to']['dataset'], project.task['to']['view'], query_parameters(project.task['from']['query'], project.task['from'].get('parameters')), project.task['from'].get( 'legacy', project.task['from'].get( 'useLegacySql', True)), # DEPRECATED: useLegacySql project.task['to'].get('replace', False)) else: if project.verbose: print('STORAGE TO TABLE', project.task['to']['table']) storage_to_table( project.task['auth'], project.id, project.task['to']['dataset'], project.task['to']['table'], project.task['from']['bucket'] + ':' + project.task['from']['path'], project.task.get('schema', []), project.task.get('skip_rows', 1), project.task.get('structure', 'CSV'), project.task.get('disposition', 'WRITE_TRUNCATE'))
def bigquery():
  if 'run' in project.task and 'query' in project.task.get('run', {}):
    if project.verbose:
      print("QUERY", project.task['run']['query'])
    run_query(
      project.task['auth'],
      project.id,
      project.task['run']['query'],
      project.task['run'].get('legacy', True),
      #project.task['run'].get('billing_project_id', None)
    )

  elif 'values' in project.task['from']:
    rows = get_rows(project.task['auth'], project.task['from'])
    rows_to_table(
      project.task['to'].get('auth', project.task['auth']),
      project.id,
      project.task['to']['dataset'],
      project.task['to']['table'],
      rows,
      project.task.get('schema', []),
      0
    )

  elif 'query' in project.task['from']:

    if 'table' in project.task['to']:
      if project.verbose:
        print("QUERY TO TABLE", project.task['to']['table'])

      if 'pre_process_query' in project.task['to']:
        print('executing statement')
        execute_statement(
          project.task['auth'],
          project.id,
          project.task['to']['dataset'],
          project.task['to']['pre_process_query'],
          use_legacy_sql=project.task['from'].get(
            'legacy', project.task['from'].get('useLegacySql', True))
        )

      query_to_table(
        project.task['auth'],
        project.id,
        project.task['to']['dataset'],
        project.task['to']['table'],
        query_parameters(project.task['from']['query'],
                         project.task['from'].get('parameters')),
        disposition=project.task['write_disposition']
          if 'write_disposition' in project.task else 'WRITE_TRUNCATE',
        legacy=project.task['from'].get(
          'legacy', project.task['from'].get('useLegacySql', True)),  # DEPRECATED: useLegacySql
        target_project_id=project.task['to'].get('project_id', project.id)
      )

    # NOT USED SO RIPPING IT OUT
    # Mauriciod: Yes, it is used, look at project/mauriciod/target_winrate.json
    elif 'storage' in project.task['to']:
      if project.verbose:
        print("QUERY TO STORAGE", project.task['to']['storage'])
      local_file_name = '/tmp/%s' % str(uuid.uuid1())
      rows = query_to_rows(project.task['auth'], project.id,
                           project.task['from']['dataset'],
                           project.task['from']['query'])

      f = open(local_file_name, 'wb')
      writer = csv.writer(f)
      writer.writerows(rows)
      f.close()

      f = open(local_file_name, 'rb')
      object_put(project.task['auth'], project.task['to']['storage'], f)
      f.close()

      os.remove(local_file_name)

    elif 'sheet' in project.task['to']:
      if project.verbose:
        print("QUERY TO SHEET", project.task['to']['sheet'])
      rows = query_to_rows(project.task['auth'], project.id,
                           project.task['from']['dataset'],
                           project.task['from']['query'],
                           legacy=project.task['from'].get('legacy', True))

      # makes sure types are correct in sheet
      rows = rows_to_type(rows)

      sheets_clear(project.task['auth'], project.task['to']['sheet'],
                   project.task['to']['tab'],
                   project.task['to'].get('range', 'A2'))
      sheets_write(project.task['auth'], project.task['to']['sheet'],
                   project.task['to']['tab'],
                   project.task['to'].get('range', 'A2'), rows)

    elif 'sftp' in project.task['to']:
      rows = query_to_rows(project.task['auth'], project.id,
                           project.task['from']['dataset'],
                           project.task['from']['query'],
                           legacy=project.task['from'].get('use_legacy_sql', True))

      if rows:
        if project.verbose:
          print("QUERY TO SFTP")
        put_rows(project.task['auth'], project.task['to'], rows)

    else:
      if project.verbose:
        print("QUERY TO VIEW", project.task['to']['view'])
      query_to_view(
        project.task['auth'],
        project.id,
        project.task['to']['dataset'],
        project.task['to']['view'],
        query_parameters(project.task['from']['query'],
                         project.task['from'].get('parameters')),
        project.task['from'].get(
          'legacy', project.task['from'].get('useLegacySql', True)),  # DEPRECATED: useLegacySql
        project.task['to'].get('replace', False)
      )

  else:
    if project.verbose:
      print("STORAGE TO TABLE", project.task['to']['table'])
    storage_to_table(
      project.task['auth'],
      project.id,
      project.task['to']['dataset'],
      project.task['to']['table'],
      project.task['from']['bucket'] + ':' + project.task['from']['path'],
      project.task.get('schema', []),
      project.task.get('skip_rows', 1),
      project.task.get('structure', 'CSV'),
      project.task.get('disposition', 'WRITE_TRUNCATE')
    )
def put_rows(auth, destination, rows, variant=''): """Processes standard write JSON block for dynamic export of data. Allows us to quickly write the results of a script to a destination. For example write the results of a DCM report into BigQuery. - Will write to multiple destinations if specified. - Extensible, add a handler to define a new destination ( be kind update the documentation json ). Include the following JSON in a recipe, then in the run.py handler when encountering that block pass it to this function and use the returned results. from utils.data import put_rows var_json = { "out":{ "bigquery":{ "dataset": [ string ], "table": [ string ] "schema": [ json - standard bigquery schema json ], "skip_rows": [ integer - for removing header ] "disposition": [ string - same as BigQuery documentation ] }, "sheets":{ "sheet":[ string - full URL, suggest using share link ], "tab":[ string ], "range":[ string - A1:A notation ] "delete": [ boolean - if sheet range should be cleared before writing ] }, "storage":{ "bucket": [ string ], "path": [ string ] }, "file":[ string - full path to place to write file ] } } values = put_rows('user', var_json) Or you can use it directly with project singleton. from util.project import project from utils.data import put_rows @project.from_parameters def something(): values = get_rows(project.task['auth'], project.task['out']) if __name__ == "__main__": something() Args: auth: (string) The type of authentication to use, user or service. destination: (json) A json block resembling var_json described above. rows ( list ) The data being written as a list object. variant (string) Appended to destination to differentieate multiple objects Returns: If single_cell is False: Returns a list of row values [[v1], [v2], ... ] If single_cell is True: Returns a list of values [v1, v2, ...] 
""" if 'bigquery' in destination: if destination['bigquery'].get('format', 'CSV') == 'JSON': json_to_table( destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', project.id), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, destination['bigquery'].get('schema', []), destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'), ) elif destination['bigquery'].get('is_incremental_load', False) == True: incremental_rows_to_table( destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', project.id), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, destination['bigquery'].get('schema', []), destination['bigquery'].get( 'skip_rows', 1), #0 if 'schema' in destination['bigquery'] else 1), destination['bigquery'].get('disposition', 'WRITE_APPEND'), billing_project_id=project.id) else: rows_to_table( destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', project.id), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, destination['bigquery'].get('schema', []), destination['bigquery'].get( 'skip_rows', 1), #0 if 'schema' in destination['bigquery'] else 1), destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'), ) if 'sheets' in destination: if destination['sheets'].get('delete', False): sheets_clear( auth, destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'], ) sheets_write(auth, destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'], rows) if 'file' in destination: path_out, file_ext = destination['file'].rsplit('.', 1) file_out = path_out + variant + '.' + file_ext if project.verbose: print('SAVING', file_out) makedirs_safe(parse_path(file_out)) with open(file_out, 'w') as save_file: save_file.write(rows_to_csv(rows).read()) if 'storage' in destination and destination['storage'].get( 'bucket') and destination['storage'].get('path'): # create the bucket bucket_create(auth, project.id, destination['storage']['bucket']) # put the file file_out = destination['storage']['bucket'] + ':' + destination[ 'storage']['path'] + variant if project.verbose: print('SAVING', file_out) object_put(auth, file_out, rows_to_csv(rows)) if 'sftp' in destination: try: cnopts = pysftp.CnOpts() cnopts.hostkeys = None path_out, file_out = destination['sftp']['file'].rsplit('.', 1) file_out = path_out + variant + file_out sftp = pysftp.Connection(host=destination['sftp']['host'], username=destination['sftp']['username'], password=destination['sftp']['password'], port=destination['sftp']['port'], cnopts=cnopts) if '/' in file_out: dir_out, file_out = file_out.rsplit('/', 1) sftp.cwd(dir_out) sftp.putfo(rows_to_csv(rows), file_out) except e: print(str(e)) traceback.print_exc()
def handle(self, *args, **kwargs): impact = [ ] #{ 'day': DATE, 'deployment':INT, 'account': INT, 'product': STRING, 'recipe': STRING, 'user': STRING } missing = {} id_max = 0 project.initialize(_service=settings.UI_SERVICE, _verbose=True) if table_exists('service', 'google.com:starthinker', 'dashboard', 'ST_Scripts'): id_max = next( query_to_rows('service', 'google.com:starthinker', 'dashboard', 'SELECT MAX(Deployment) FROM ST_Scripts', legacy=False))[0] for recipe in Recipe.objects.filter( id__gt=id_max).order_by('id')[:kwargs['recipes']]: project.initialize(_user=recipe.account.get_credentials_path(), _service=settings.UI_SERVICE, _verbose=True) values = recipe.get_values() for v in values: if v['tag'] in ('dcm_to_bigquery', 'dcm_to_sheets', 'dcm_to_storage', 'dcm_run', 'conversion_upload_from_bigquery', 'conversion_upload_from_sheets'): impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': v['values'].get('account'), 'script': v['tag'], 'product': 'dcm', 'user': recipe.account.email.replace('@google.com', '') }) elif v['tag'] in ('dbm_to_bigquery', 'dbm_to_sheets', 'dbm_to_storage'): for partner in account_from_dbm_report( v['values'].get('dbm_report_id'), v['values'].get('dbm_report_name')): impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': partner, 'script': v['tag'], 'product': 'dbm', 'user': recipe.account.email.replace('@google.com', '') }) elif v['tag'] in ('dt', ): impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': account_from_dt(v['values']), 'script': v['tag'], 'product': 'dcm', 'user': recipe.account.email.replace('@google.com', '') }) elif v['tag'] == 'barnacle': for account in v['values']['accounts']: impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': account, 'script': v['tag'], 'product': 'dcm', 'user': recipe.account.email.replace('@google.com', '') }) elif v['tag'] in ('entity', ): for partner in v['values']['partners']: impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': partner, 'script': v['tag'], 'product': 'dbm', 'user': recipe.account.email.replace('@google.com', '') }) elif v['tag'] == 'itp': impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': v['values']['dcm_account'], 'script': v['tag'], 'product': 'dcm', 'user': recipe.account.email.replace('@google.com', '') }) impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': v['values']['dbm_partner'], 'script': v['tag'], 'product': 'dbm', 'user': recipe.account.email.replace('@google.com', '') }) elif v['tag'] == 'itp_audit': impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': v['values']['cm_account_id'], 'script': v['tag'], 'product': 'dcm', 'user': recipe.account.email.replace('@google.com', '') }) for partner in account_from_dbm_report( None, v['values'].get('dv360_report_name')): impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': partner, 'script': v['tag'], 'product': 'dbm', 'user': recipe.account.email.replace('@google.com', '') }) else: impact.append({ 'day': recipe.birthday, 'deployment': recipe.id, 'account': None, 'script': v['tag'], 'product': None, 'user': recipe.account.email.replace('@google.com', '') }) missing.setdefault(v['tag'], 0) missing[v['tag']] += 1 if impact: if kwargs['test']: print(impact) else: print('WRITING TO ST_Scripts') rows_to_table('service', 'google.com:starthinker', 'dashboard', 'ST_Scripts', [(i['day'], i['deployment'], i['user'], i['product'], i['script'], 
i['account']) for i in impact], schema=[ { 'mode': 'REQUIRED', 'name': 'Day', 'type': 'Date' }, { 'mode': 'REQUIRED', 'name': 'Deployment', 'type': 'INTEGER' }, { 'mode': 'REQUIRED', 'name': 'User', 'type': 'STRING' }, { 'mode': 'NULLABLE', 'name': 'Product', 'type': 'STRING' }, { 'mode': 'NULLABLE', 'name': 'Recipe', 'type': 'STRING' }, { 'mode': 'NULLABLE', 'name': 'Account', 'type': 'INTEGER' }, ], skip_rows=0, disposition='WRITE_TRUNCATE' if id_max == 0 else 'WRITE_APPEND', wait=True) print('MISSING', missing) print('Coverage:', (len(impact) * 100) / (len(missing) + len(impact))) else: print('No recipes newer than:', id_max)
def put_rows(auth, destination, filename, rows, variant=''): """Processes standard write JSON block for dynamic export of data. Allows us to quickly write the results of a script to a destination. For example write the results of a DCM report into BigQuery. - Will write to multiple destinations if specified. - Extensible, add a handler to define a new destination ( be kind update the documentation json ). Include the following JSON in a recipe, then in the run.py handler when encountering that block pass it to this function and use the returned results. from utils.data import put_rows var_json = { "out":{ "bigquery":{ "dataset": [ string ], "table": [ string ] "schema": [ json - standard bigquery schema json ], "skip_rows": [ integer - for removing header ] "disposition": [ string - same as BigQuery documentation ] }, "sheets":{ "url":[ string - full URL, suggest using share link ], "tab":[ string ], "range":[ string - A1:A notation ] "delete": [ boolean - if sheet range should be cleared before writing ] }, "storage":{ "bucket": [ string ], "path": [ string ] }, "directory":[ string - full path to place to write file ] } } values = put_rows('user', var_json) Or you can use it directly with project singleton. from util.project import project from utils.data import put_rows @project.from_parameters def something(): values = get_rows(project.task['auth'], project.task['out']) if __name__ == "__main__": something() Args: auth: (string) The type of authentication to use, user or service. destination: (json) A json block resembling var_json described above. filename: (string) A unique filename if writing to medium requiring one, Usually gnerated by script. rows ( list ) The data being written as a list object. variant ( string ) Appends this to the destination name to create a variant ( for example when downloading multiple tabs in a sheet ). Returns: If single_cell is False: Returns a list of row values [[v1], [v2], ... ] If single_cell is True: Returns a list of values [v1, v2, ...] 
""" if 'bigquery' in destination: if destination['bigquery'].get('format', 'CSV') == 'JSON': json_to_table( destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', project.id), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, destination['bigquery'].get('schema', []), destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'), ) elif destination['bigquery'].get('is_incremental_load', False) == True: incremental_rows_to_table( destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', project.id), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, destination['bigquery'].get('schema', []), destination['bigquery'].get( 'skip_rows', 1), #0 if 'schema' in destination['bigquery'] else 1), destination['bigquery'].get('disposition', 'WRITE_APPEND'), billing_project_id=project.id) else: rows_to_table( destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', project.id), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, destination['bigquery'].get('schema', []), destination['bigquery'].get( 'skip_rows', 1), #0 if 'schema' in destination['bigquery'] else 1), destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'), ) if 'sheets' in destination: if destination['sheets'].get('delete', False): sheets_clear(auth, destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range']) sheets_write(auth, destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'], rows) if 'directory' in destination: file_out = destination['directory'] + variant + filename if project.verbose: print 'SAVING', file_out makedirs_safe(parse_path(file_out)) with open(file_out, 'wb') as save_file: save_file.write(rows_to_csv(rows).read()) if 'storage' in destination and destination['storage'].get( 'bucket') and destination['storage'].get('path'): # create the bucket bucket_create(auth, project.id, destination['storage']['bucket']) # put the file file_out = destination['storage']['bucket'] + ':' + destination[ 'storage']['path'] + variant + filename if project.verbose: print 'SAVING', file_out object_put(auth, file_out, rows_to_csv(rows)) # deprecated do not use if 'trix' in destination: trix_update(auth, destination['trix']['sheet_id'], destination['trix']['sheet_range'], rows_to_csv(rows), destination['trix']['clear']) if 'email' in destination: pass if 'sftp' in destination: try: sys.stderr = StringIO() cnopts = pysftp.CnOpts() cnopts.hostkeys = None file_prefix = 'report' if 'file_prefix' in destination['sftp']: file_prefix = destination['sftp'].get('file_prefix') del destination['sftp']['file_prefix'] #sftp_configs = destination['sftp'] #sftp_configs['cnopts'] = cnopts #sftp = pysftp.Connection(**sftp_configs) sftp = pysftp.Connection(host=destination['sftp']['host'], username=destination['sftp']['username'], password=destination['sftp']['password'], port=destination['sftp']['port'], cnopts=cnopts) if 'directory' in destination['sftp']: sftp.cwd(destination['sftp']['directory']) tmp_file_name = '/tmp/%s_%s.csv' % ( file_prefix, datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')) tmp_file = open(tmp_file_name, 'wb') tmp_file.write(rows_to_csv(rows).read()) tmp_file.close() sftp.put(tmp_file_name) os.remove(tmp_file_name) sys.stderr = sys.__stderr__ except e: print e traceback.print_exc()
def put_rows(config, auth, destination, rows, schema=None, variant=''): """Processes standard write JSON block for dynamic export of data. Allows us to quickly write the results of a script to a destination. For example write the results of a DCM report into BigQuery. - Will write to multiple destinations if specified. - Extensible, add a handler to define a new destination ( be kind update the documentation json ). Include the following JSON in a recipe, then in the run.py handler when encountering that block pass it to this function and use the returned results. from utils.data import put_rows var_json = { "out":{ "bigquery":{ "auth":"[ user or service ]", "dataset": [ string ], "table": [ string ] "schema": [ json - standard bigquery schema json ], "header": [ boolean - true if header exists in rows ] "disposition": [ string - same as BigQuery documentation ] }, "sheets":{ "auth":"[ user or service ]", "sheet":[ string - full URL, suggest using share link ], "tab":[ string ], "range":[ string - A1:A notation ] "append": [ boolean - if sheet range should be appended to ] "delete": [ boolean - if sheet range should be cleared before writing ] ] }, "storage":{ "auth":"[ user or service ]", "bucket": [ string ], "path": [ string ] }, "file":[ string - full path to place to write file ] } } values = put_rows('user', var_json) Args: auth: (string) The type of authentication to use, user or service. rows: ( iterator ) The list of rows to be written, if NULL no action is performed. schema: (json) A bigquery schema definition. destination: (json) A json block resembling var_json described above. rows ( list ) The data being written as a list object. variant (string) Appended to destination to differentieate multiple objects Returns: If unnest is False: Returns a list of row values [[v1], [v2], ... ] If unnest is True: Returns a list of values [v1, v2, ...] 
""" if rows is None: if config.verbose: print('PUT ROWS: Rows is None, ignoring write.') return if 'bigquery' in destination: if not schema: schema = destination['bigquery'].get('schema') skip_rows = 1 if destination['bigquery'].get('header') and schema else 0 if destination['bigquery'].get('format', 'CSV') == 'JSON': json_to_table( config, destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', config.project), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, schema, destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'), ) elif destination['bigquery'].get('is_incremental_load', False) == True: incremental_rows_to_table( config, destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', config.project), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, schema, destination['bigquery'].get('skip_rows', skip_rows), destination['bigquery'].get('disposition', 'WRITE_APPEND'), billing_project_id=config.project) else: rows_to_table( config, destination['bigquery'].get('auth', auth), destination['bigquery'].get('project_id', config.project), destination['bigquery']['dataset'], destination['bigquery']['table'] + variant, rows, schema, destination['bigquery'].get('skip_rows', skip_rows), destination['bigquery'].get('disposition', 'WRITE_TRUNCATE'), ) if 'sheets' in destination: if destination['sheets'].get('delete', False): sheets_clear( config, destination['sheets'].get('auth', auth), destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'], ) sheets_write( config, destination['sheets'].get('auth', auth), destination['sheets']['sheet'], destination['sheets']['tab'] + variant, destination['sheets']['range'], rows_to_type(rows), destination['sheets'].get('append', False), ) if 'file' in destination: path_out, file_ext = destination['file'].rsplit('.', 1) file_out = path_out + variant + '.' + file_ext if config.verbose: print('SAVING', file_out) makedirs_safe(parse_path(file_out)) with open(file_out, 'w') as save_file: save_file.write(rows_to_csv(rows).read()) if 'storage' in destination and destination['storage'].get( 'bucket') and destination['storage'].get('path'): bucket_create( config, destination['storage'].get('auth', auth), config.project, destination['storage']['bucket'] ) # put the file file_out = destination['storage']['bucket'] + ':' + destination['storage'][ 'path'] + variant if config.verbose: print('SAVING', file_out) object_put(config, auth, file_out, rows_to_csv(rows)) if 'sftp' in destination: try: cnopts = pysftp.CnOpts() cnopts.hostkeys = None path_out, file_out = destination['sftp']['file'].rsplit('.', 1) file_out = path_out + variant + file_out sftp = pysftp.Connection( host=destination['sftp']['host'], username=destination['sftp']['username'], password=destination['sftp']['password'], port=destination['sftp']['port'], cnopts=cnopts) if '/' in file_out: dir_out, file_out = file_out.rsplit('/', 1) sftp.cwd(dir_out) sftp.putfo(rows_to_csv(rows), file_out) except e: print(str(e)) traceback.print_exc()
def create_dv360_segments(project): a1_notation = 'A:M' schema = [{ "type": "STRING", "name": "Advertiser", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Advertiser_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Campaign", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Campaign_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Insertion_Order", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Insertion_Order_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Line_Item", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Line_Item_Id", "mode": "NULLABLE" }, { "type": "STRING", "name": "Line_Item_Type", "mode": "NULLABLE" }, { "type": "INTEGER", "name": "Impressions", "mode": "NULLABLE" }, { "type": "STRING", "name": "Segment1", "mode": "NULLABLE" }, { "type": "STRING", "name": "Segment2", "mode": "NULLABLE" }, { "type": "STRING", "name": "Segment3", "mode": "NULLABLE" }] sheet_rows = sheets_read('user', project.task['sheet'], 'DV3 Segments', a1_notation, retries=10) if not sheet_rows: sheet_rows = [] print('DV360 SEGMENT SHEET TABLE WRITE') rows_to_table(auth='service', project_id=project.id, dataset_id=project.task['dataset'], table_id=DV360_CUSTOM_SEGMENTS_SHEET_TABLE, rows=sheet_rows, schema=schema, skip_rows=1, disposition='WRITE_TRUNCATE') # Run Query path = os.path.join(os.path.dirname(__file__), SQL_DIRECTORY + DV360_CUSTOM_SEGMENTS_FILENAME) query = '' with open(path, 'r') as file: data = file.read().replace('\n', ' ') query = data.replace("{{project_id}}", project.id).replace("{{dataset}}", project.task['dataset']) print('DV360 CUSTOM SEGMENT TABLE') query_to_table('service', project.id, project.task['dataset'], DV360_CUSTOM_SEGMENTS_TABLE, query, legacy=False) # Move Table back to sheets query = 'SELECT * from `' + project.id + '.' + project.task[ 'dataset'] + '.' + DV360_CUSTOM_SEGMENTS_TABLE + '`' rows = query_to_rows('service', project.id, project.task['dataset'], query, legacy=False) # makes sure types are correct in sheet a1_notation = a1_notation[:1] + '2' + a1_notation[1:] rows = rows_to_type(rows) sheets_clear('user', project.task['sheet'], 'DV3 Segments', a1_notation) sheets_write('user', project.task['sheet'], 'DV3 Segments', a1_notation, rows)
def sheets(): if project.verbose: print('SHEETS') # if sheet or tab is missing, don't do anything if not project.task.get('sheet') or not project.task.get('tab'): if project.verbose: print('Missing Sheet and/or Tab, skipping task.') return # delete if specified, will delete sheet if no more tabs remain if project.task.get('delete', False): sheets_tab_delete(project.task['auth'], project.task['sheet'], project.task['tab']) # create a sheet and tab if specified, if template if 'template' in project.task: sheets_create( project.task['auth'], project.task['sheet'], project.task['tab'], project.task['template'].get('sheet'), project.task['template'].get('tab'), ) # copy template if specified ( clear in this context means overwrite ) #if project.task.get('template', {}).get('sheet'): # sheets_tab_copy( # project.task['auth'], # project.task['template']['sheet'], # project.task['template']['tab'], # project.task['sheet'], # project.task['tab'], # project.task.get('clear', False) # ) # if no template at least create tab #else: # sheets_tab_create( # project.task['auth'], # project.task['sheet'], # project.task['tab'] # ) # clear if specified if project.task.get('clear', False): sheets_clear(project.task['auth'], project.task['sheet'], project.task['tab'], project.task.get('range', 'A1')) # write data if specified if 'write' in project.task: rows = get_rows(project.task['auth'], project.task['write']) sheets_write(project.task['auth'], project.task['sheet'], project.task['tab'], project.task['range'], rows, append=False) # append data if specified if 'append' in project.task: rows = get_rows(project.task['auth'], project.task['append']) sheets_write(project.task['auth'], project.task['sheet'], project.task['tab'], project.task['range'], rows, append=True) # move data if specified # move data if specified if 'out' in project.task: rows = sheets_read(project.task['auth'], project.task['sheet'], project.task['tab'], project.task.get('range', 'A1')) if rows: schema = None # RECOMMENDED: define schema in json if project.task['out']['bigquery'].get('schema'): if project.verbose: print('SHEETS SCHEMA DEFINED') schema = project.task['out']['bigquery']['schema'] # NOT RECOMMENDED: determine schema if missing else: if project.verbose: print( 'SHEETS SCHEMA DETECT ( Note Recommended - Define Schema In JSON )' ) # cast rows to types ( for schema detection ) rows = rows_to_type(rows) rows, schema = get_schema(rows, project.task.get('header', False), infer_type=project.task.get( 'infer_type', True)) # write to table ( not using put because no use cases for other destinations ) rows_to_table( auth=project.task['out'].get('auth', project.task['auth']), project_id=project.id, dataset_id=project.task['out']['bigquery']['dataset'], table_id=project.task['out']['bigquery']['table'], rows=rows, schema=schema, skip_rows=1 if project.task.get('header', False) else 0, disposition=project.task['out']['bigquery'].get( 'disposition', 'WRITE_TRUNCATE')) else: print('SHEET EMPTY')
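# Hypothetical project.task block for the sheets() handler above, covering the
# "move to BigQuery" branch; the sheet URL, dataset, and table are illustrative.
# An empty schema list is falsy, so the handler falls through to schema detection.
task = {
  'auth': 'user',
  'sheet': 'https://docs.google.com/spreadsheets/d/EXAMPLE_SHEET_ID/edit',
  'tab': 'Sheet1',
  'range': 'A1',
  'header': True,
  'out': {
    'auth': 'service',
    'bigquery': {
      'dataset': 'Example_Dataset',
      'table': 'Sheet1_Import',
      'schema': []
    }
  }
}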
def main():
  # get parameters
  parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent("""\
      Command line to get table schema from BigQuery.

      Helps developers upload data to BigQuery and pull schemas. These are the
      most common BigQuery tasks when developing solutions.

      Examples:
        Display table schema: `python helper.py --project [id] --dataset [name] --table [name] -s [credentials]`
        Upload csv table: `python helper.py --project [id] --dataset [name] --table [name] --csv [file] --schema [file] -s [credentials]`
        Upload excel sheet: `python helper.py --project [id] --dataset [name] --table [name] --excel_workbook [file] --excel_sheet [name] --schema [file] -s [credentials]`
    """))

  parser.add_argument('--dataset', help='name of BigQuery dataset', default=None)
  parser.add_argument('--table', help='name of BigQuery table', default=None)
  parser.add_argument('--csv', help='CSV file path', default=None)
  parser.add_argument('--schema', help='SCHEMA file path', default=None)
  parser.add_argument('--excel_workbook', help='Excel file path', default=None)
  parser.add_argument('--excel_sheet', help='Excel sheet name', default=None)

  # initialize project
  parser = commandline_parser(parser, arguments=('-u', '-c', '-s', '-v', '-p'))
  args = parser.parse_args()
  config = Configuration(
    user=args.user,
    client=args.client,
    service=args.service,
    verbose=args.verbose,
    project=args.project
  )

  auth = 'service' if args.service else 'user'

  schema = json.loads(args.schema) if args.schema else None

  if args.csv:
    with open(args.csv, 'r') as csv_file:
      rows = csv_to_rows(csv_file.read())

      if not schema:
        rows, schema = get_schema(rows)
        print('DETECTED SCHEMA', json.dumps(schema))
        print('Please run again with the above schema provided.')
        exit()

      rows_to_table(config, auth, config.project, args.dataset, args.table, rows, schema)

  elif args.excel_workbook and args.excel_sheet:
    with open(args.excel_workbook, 'r') as excel_file:
      rows = excel_to_rows(excel_file, args.excel_sheet)

      if not schema:
        rows, schema = get_schema(rows)
        print('DETECTED SCHEMA', json.dumps(schema))
        print('Please run again with the above schema provided.')
        exit()

      rows_to_table(config, auth, config.project, args.dataset, args.table, rows, schema)

  else:
    # print schema
    print(json.dumps(
      table_to_schema(config, auth, config.project, args.dataset, args.table),
      indent=2
    ))
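# The --schema flag above is parsed with json.loads, so the schema is supplied
# as an inline JSON string; a sketch of building that string, with illustrative
# field names, follows.
import json

schema = [
  {'name': 'Name', 'type': 'STRING', 'mode': 'NULLABLE'},
  {'name': 'Count', 'type': 'INTEGER', 'mode': 'NULLABLE'}
]

print(json.dumps(schema))  # paste this output after --schema on the command line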
def sheets(): if project.verbose: print 'SHEETS' # clear if specified if project.task.get('clear', False): sheets_clear(project.task['auth'], project.task['sheet'], project.task['tab'], project.task['range']) # delete if specified ( after clear to prevent errors in case both are given ) if project.task.get('delete', False): sheets_tab_delete(project.task['auth'], project.task['sheet'], project.task['tab']) # create or copy if specified if 'template' in project.task: sheets_tab_copy(project.task['auth'], project.task['template']['sheet'], project.task['template']['tab'], project.task['sheet'], project.task['tab']) else: sheets_tab_create(project.task['auth'], project.task['sheet'], project.task['tab']) # write data if specified if 'write' in project.task: rows = get_rows(project.task['auth'], project.task['write']) sheets_write(project.task['auth'], project.task['sheet'], project.task['tab'], project.task['range'], rows) # move if specified if 'out' in project.task: rows = sheets_read(project.task['auth'], project.task['sheet'], project.task['tab'], project.task['range']) if rows: schema = None # RECOMMENDED: define schema in json if project.task['out']['bigquery'].get('schema'): if project.verbose: print 'SHEETS SCHEMA DEFINED' schema = project.task['out']['bigquery']['schema'] # NOT RECOMMENDED: determine schema if missing else: if project.verbose: print 'SHEETS SCHEMA DETECT ( Note Recommended - Define Schema In JSON )' # cast rows to types ( for schema detection ) rows = rows_to_type(rows) rows, schema = get_schema(rows, project.task.get('header', False), infer_type=project.task.get( 'infer_type', True)) # write to table ( not using put because no use cases for other destinations ) rows_to_table( auth=project.task['out'].get('auth', project.task['auth']), project_id=project.id, dataset_id=project.task['out']['bigquery']['dataset'], table_id=project.task['out']['bigquery']['table'], rows=rows, schema=schema, skip_rows=1 if project.task.get('header', False) else 0, disposition=project.task['out']['bigquery'].get( 'disposition', 'WRITE_TRUNCATE')) else: print 'SHEET EMPTY'