def get_solutions():
  if project.verbose: print 'GETTING SCORES'

  for solution in project.task['solutions']:
    scores = []

    if 'sheet' in solution:
      scores = sheets_read(
        project.task['auth'],
        solution['sheet']['url'],
        solution['sheet']['tab'],
        solution['sheet']['range']
      )
    elif 'bigquery' in solution:
      scores = query_to_rows(
        project.task['auth'],
        project.id,
        solution['bigquery']['dataset'],
        solution['bigquery']['query']
      )

    # for easy lookup use dictionary
    solution['scores'] = {}
    for score in scores:
      solution['scores'].setdefault(str(score[0]), [])
      solution['scores'][str(score[0])].append({
        'variant_id': str(score[1]),
        'variant': score[2],
        'score': float(score[3])
      })

    if project.verbose: print 'GOT SCORES:', len(solution['scores'])

  return project.task['solutions']
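
# A minimal sketch of one entry in project.task['solutions'], assuming the
# hypothetical sheet below; each score row must be laid out as
# ( solution key, variant id, variant, score ) to match the indexes above.
example_solution = {
  'sheet': {
    'url': 'https://docs.google.com/spreadsheets/d/...',  # hypothetical share link
    'tab': 'Scores',                                      # hypothetical tab name
    'range': 'A2:D'
  }
}
# After get_solutions() runs, example_solution['scores'] maps each solution key
# to a list of { 'variant_id', 'variant', 'score' } dictionaries.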
def bigquery():

  # check schema if given ( check independent of values )
  if 'schema' in project.task['bigquery']:
    schema = table_to_schema(
      project.task['auth'],
      project.id,
      project.task['bigquery']['dataset'],
      project.task['bigquery']['table']
    )
    object_compare(schema['fields'], project.task['bigquery']['schema'])

  # if query given check it
  if 'query' in project.task['bigquery']:
    rows = query_to_rows(
      project.task['auth'],
      project.id,
      project.task['bigquery']['dataset'],
      project.task['bigquery']['query']
    )
    object_compare(sorted(rows), sorted(project.task['bigquery']['values']))

  # simple table check ( unless query given )
  elif 'values' in project.task['bigquery']:
    rows = table_to_rows(
      project.task['auth'],
      project.id,
      project.task['bigquery']['dataset'],
      project.task['bigquery']['table']
    )
    object_compare(sorted(rows), sorted(project.task['bigquery']['values']))
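
# A minimal sketch of the task block this test handler reads, assuming
# hypothetical dataset/table/values; the keys mirror the lookups above, and
# 'schema' is compared against table_to_schema()['fields'].
example_bigquery_test_task = {
  'auth': 'user',
  'bigquery': {
    'dataset': 'Example_Dataset',  # hypothetical
    'table': 'Example_Table',      # hypothetical
    'schema': [ { 'name': 'Column_1', 'type': 'STRING' } ],  # hypothetical
    'values': [ [ 'value_1' ] ]    # compared against table or query rows
  }
}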
def get_impacts():
  if project.verbose: print 'GETTING IMPACTS'

  impacts = []

  if 'sheet' in project.task['impacts']:
    impacts = sheets_read(
      project.task['auth'],
      project.task['impacts']['sheet']['url'],
      project.task['impacts']['sheet']['tab'],
      project.task['impacts']['sheet']['range']
    )
  elif 'bigquery' in project.task['impacts']:
    impacts = query_to_rows(
      project.task['auth'],
      project.id,
      project.task['impacts']['bigquery']['dataset'],
      project.task['impacts']['bigquery']['query']
    )

  # for easy lookup use dictionary
  impacts = dict([(str(i[0]), float(i[1])) for i in impacts])

  if project.verbose: print 'GOT IMPACTS:', len(impacts)

  return impacts
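
# A minimal sketch of project.task['impacts'], assuming a hypothetical BigQuery
# source; the query must return ( key, impact ) pairs, since get_impacts()
# keeps only the first two columns as { str(key): float(impact) }.
example_impacts = {
  'bigquery': {
    'dataset': 'Example_Dataset',                    # hypothetical
    'query': 'SELECT Solution, Impact FROM Impacts'  # hypothetical
  }
}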
def conversions_download():
  if project.verbose: print 'CONVERSION DOWNLOAD'

  # pull from bigquery if specified
  if 'bigquery' in project.task:
    if project.verbose: print 'READING BIGQUERY'
    rows = query_to_rows(
      project.task['auth'],
      project.id,
      project.task['bigquery']['dataset'],
      'SELECT * FROM %s' % project.task['bigquery']['table'],
      legacy=project.task['bigquery'].get('legacy', True)
    )
    for row in rows:
      yield row

  # pull from sheets if specified
  if 'sheets' in project.task:
    if project.verbose: print 'READING SHEET'
    rows = sheets_read(
      project.task['auth'],
      project.task['sheets']['url'],
      project.task['sheets']['tab'],
      project.task['sheets']['range']
    )
    for row in rows:
      yield row

  # pull from csv if specified
  if 'csv' in project.task:
    if project.verbose: print 'READING CSV FILE'
    with io.open(project.task['csv']['file']) as f:
      for row in csv_to_rows(f):
        if row[0] not in CSV_HEADERS:
          yield row
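
# A minimal sketch of the task block conversions_download() reads, assuming a
# hypothetical CSV source; 'bigquery' and 'sheets' blocks are alternatives, and
# any combination present is yielded in sequence. Rows whose first cell matches
# an entry in CSV_HEADERS are skipped as header rows.
example_download_task = {
  'auth': 'user',
  'csv': {
    'file': '/tmp/conversions.csv'  # hypothetical path
  }
}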
def get_owners():
  if project.verbose: print 'GETTING OWNERS'

  owners = []

  if 'sheet' in project.task['owners']:
    owners = sheets_read(
      project.task['auth'],
      project.task['owners']['sheet']['url'],
      project.task['owners']['sheet']['tab'],
      project.task['owners']['sheet']['range']
    )
  elif 'bigquery' in project.task['owners']:
    owners = query_to_rows(
      project.task['auth'],
      project.id,
      project.task['owners']['bigquery']['dataset'],
      project.task['owners']['bigquery']['query']
    )

  # group account owners by email, create easy lookup sets for ids
  owners_grouped = {}
  for owner in owners:
    try:
      owners_grouped.setdefault(owner[2], {
        'Account Name': owner[0],
        'Account Owner': owner[1],
        'Account Email': owner[2],
        'DCM Network ID': [],
        'DBM Partner ID': [],
        'DS Account ID': [],
        'Studio Account ID': [],
      })

      if len(owner) > 3 and owner[3]:
        owners_grouped[owner[2]]['DCM Network ID'].append(str(owner[3]))
      if len(owner) > 4 and owner[4]:
        owners_grouped[owner[2]]['DBM Partner ID'].append(str(owner[4]))
      if len(owner) > 5 and owner[5]:
        owners_grouped[owner[2]]['DS Account ID'].append(str(owner[5]))
      if len(owner) > 6 and owner[6]:
        owners_grouped[owner[2]]['Studio Account ID'].append(str(owner[6]))

    except IndexError:
      print 'ERROR:', owner

  if project.verbose: print 'GOT OWNERS:', len(owners)

  return owners_grouped.values()
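
# A minimal sketch of one owner row, assuming hypothetical values; columns are
# ( name, owner, email, dcm network, dbm partner, ds account, studio account )
# per the indexes above, and trailing ids may be blank or omitted entirely.
example_owner_row = [
  'Example Account',   # hypothetical
  'Jane Doe',          # hypothetical
  'jane@example.com',  # hypothetical, rows sharing this email are merged
  '1234',              # DCM Network ID
  '',                  # DBM Partner ID ( blank ids are skipped )
  '5678'               # DS Account ID; Studio Account ID omitted
]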
def _process_filters(partners, advertisers, filters, project_id, dataset_id, auth='user'):
  structures = []

  for p in (partners or []):
    structures.append({
      'type': 'FILTER_PARTNER',
      'value': int(p)
    })

  for a in (advertisers or []):
    structures.append({
      'type': 'FILTER_ADVERTISER',
      'value': int(a)
    })

  for f in (filters or []):
    if isinstance(f['value'], basestring) and f['value'].startswith('SELECT '):

      items = query_to_rows(auth, project_id, dataset_id, f['value'])

      filtered = False
      for item in items:
        if item and len(item) != 0:
          filtered = True
          structures.append({
            'type': f['type'],
            'value': item[0]
          })
        else:
          break

      if not filtered:
        raise Exception('Select filter did not return any values: %s' % f['value'])

    else:
      structures.append({
        'type': f['type'],
        'value': f['value']
      })

  return structures
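
# A minimal sketch of calling _process_filters(), assuming hypothetical ids,
# filter type, and dataset; a filter whose value starts with 'SELECT ' is
# expanded by running it as a query and taking the first column of each row.
example_structures = _process_filters(
  partners=[1234],       # hypothetical
  advertisers=[5678],    # hypothetical
  filters=[{
    'type': 'FILTER_MEDIA_PLAN',  # hypothetical filter type
    'value': 'SELECT media_plan_id FROM Example_Dataset.Example_Table'  # hypothetical
  }],
  project_id='example-project',  # hypothetical
  dataset_id='Example_Dataset'   # hypothetical
)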
def bigquery():

  if 'query' in project.task['from']:

    if 'table' in project.task['to']:
      if project.verbose: print 'QUERY TO TABLE', project.task['to']['table']

      if 'pre_process_query' in project.task['to']:
        print 'executing statement'
        execute_statement(
          project.task['auth'],
          project.id,
          project.task['to']['dataset'],
          project.task['to']['pre_process_query'],
          use_legacy_sql=project.task['from'].get('legacy', project.task['from'].get('useLegacySql', True))
        )

      query_to_table(
        project.task['auth'],
        project.id,
        project.task['to']['dataset'],
        project.task['to']['table'],
        query_parameters(project.task['from']['query'], project.task['from'].get('parameters')),
        disposition=project.task.get('write_disposition', 'WRITE_TRUNCATE'),
        legacy=project.task['from'].get('legacy', project.task['from'].get('useLegacySql', True)),  # DEPRECATED: useLegacySql
        target_project_id=project.task['to'].get('project_id', project.id)
      )

    # used by recipes such as project/mauriciod/target_winrate.json
    elif 'storage' in project.task['to']:
      if project.verbose: print 'QUERY TO STORAGE', project.task['to']['storage']

      local_file_name = '/tmp/%s' % str(uuid.uuid1())
      rows = query_to_rows(
        project.task['auth'],
        project.id,
        project.task['from']['dataset'],
        project.task['from']['query']
      )

      f = open(local_file_name, 'wb')
      writer = csv.writer(f)
      writer.writerows(rows)
      f.close()

      f = open(local_file_name, 'rb')
      object_put(project.task['auth'], project.task['to']['storage'], f)
      f.close()

      os.remove(local_file_name)

    elif 'trix' in project.task['to']:
      if project.verbose: print 'QUERY TO SHEET', project.task['to']['trix']

      rows = query_to_rows(
        project.task['auth'],
        project.id,
        project.task['from']['dataset'],
        project.task['from']['query'],
        legacy=project.task['from'].get('legacy', True)
      )

      # makes sure types are correct in sheet
      rows = rows_to_type(rows)

      tab, cell_range = project.task['to']['range'].split('!')
      sheets_clear(project.task['auth'], project.task['to']['trix'], tab, cell_range)
      sheets_write(project.task['auth'], project.task['to']['trix'], tab, cell_range, rows)

    elif 'sftp' in project.task['to']:
      rows = query_to_rows(
        project.task['auth'],
        project.id,
        project.task['from']['dataset'],
        project.task['from']['query'],
        legacy=project.task['from'].get('use_legacy_sql', True)
      )

      if rows:
        if project.verbose: print 'QUERY TO SFTP'
        put_rows(project.task['auth'], project.task['to'], '', rows)

    else:
      if project.verbose: print 'QUERY TO VIEW', project.task['to']['view']

      query_to_view(
        project.task['auth'],
        project.id,
        project.task['to']['dataset'],
        project.task['to']['view'],
        query_parameters(project.task['from']['query'], project.task['from'].get('parameters')),
        project.task['from'].get('legacy', project.task['from'].get('useLegacySql', True))  # DEPRECATED: useLegacySql
      )

  else:
    if project.verbose: print 'STORAGE TO TABLE', project.task['to']['table']

    storage_to_table(
      project.task['auth'],
      project.id,
      project.task['to']['dataset'],
      project.task['to']['table'],
      project.task['from']['bucket'] + ':' + project.task['from']['path'],
      project.task.get('schema', []),
      project.task.get('skip_rows', 1),
      project.task.get('structure', 'CSV'),
      project.task.get('disposition', 'WRITE_TRUNCATE')
    )
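
# A minimal sketch of a query-to-table task for the handler above, assuming
# hypothetical dataset and table names; swapping the 'table' key for 'storage',
# 'trix', 'sftp', or 'view' selects one of the other destinations.
example_bigquery_task = {
  'auth': 'user',
  'from': {
    'query': 'SELECT * FROM Example_Dataset.Example_Source',  # hypothetical
    'legacy': False,
    'parameters': []
  },
  'to': {
    'dataset': 'Example_Dataset',  # hypothetical
    'table': 'Example_Target'      # hypothetical
  },
  'write_disposition': 'WRITE_TRUNCATE'
}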
def get_rows(auth, source):
  """Processes a standard read JSON block for dynamic loading of data.

  Allows us to quickly pull a column or columns of data from a source and use
  it as an input into a script. For example, pull a list of ids from bigquery
  and act on each one.

  - When pulling a single column specify single_cell = True. Returns list AKA values.
  - When pulling multiple columns specify single_cell = False. Returns list of lists AKA rows.
  - Values are always given as a list ( single_cell will trigger necessary wrapping ).
  - Values, bigquery, sheet are optional; if multiple are given the result is one continuous iterator.
  - Extensible, add a handler to define a new source ( be kind, update the documentation json ).

  Include the following JSON in a recipe, then in the run.py handler when
  encountering that block pass it to this function and use the returned results.

    from utils.data import get_rows

    var_json = {
      "in":{
        "single_cell":[ boolean ],
        "values": [ integer list ],
        "bigquery":{
          "dataset": [ string ],
          "table": [ string ],
          "columns":[ integer list ],
          "legacy":[ boolean ]
        },
        "sheet":{
          "url":[ string - full URL, suggest using share link ],
          "tab":[ string ],
          "range":[ string - A1:A notation ]
        }
      }
    }

    values = get_rows('user', var_json)

  Or you can use it directly with project singleton.

    from util.project import project
    from utils.data import get_rows

    def something():
      values = get_rows(project.task['auth'], project.task['in'])

    if __name__ == "__main__":
      project.load('something')
      something()

  Args:
    auth: (string) The type of authentication to use, user or service.
    source: (json) A json block resembling var_json described above.

  Returns:
    If single_cell is False: Returns a list of row values [[v1], [v2], ... ]
    If single_cell is True: Returns a list of values [v1, v2, ...]
  """

  # if handler points to list, concatenate all the values from various sources into one list
  if isinstance(source, list):
    for s in source:
      for r in get_rows(auth, s):
        yield r

  # if handler is an endpoint, fetch data
  else:
    if 'values' in source:
      for value in source['values']:
        yield value if source.get('single_cell', False) else [value]

    if 'sheet' in source:
      rows = sheets_read(
        auth,
        source['sheet']['url'],
        source['sheet']['tab'],
        source['sheet']['range']
      )
      for row in rows:
        yield row[0] if source.get('single_cell', False) else row

    if 'bigquery' in source:
      rows = query_to_rows(
        source['bigquery'].get('auth', auth),
        project.id,
        source['bigquery']['dataset'],
        source['bigquery']['query'],
        legacy=source['bigquery'].get('legacy', False)
      )
      for row in rows:
        yield row[0] if source.get('single_cell', False) else row
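
# The docstring covers single sources; a list of sources also works via the
# isinstance(source, list) branch above. A minimal sketch, assuming the
# hypothetical values and sheet below, concatenating two sources into one iterator:
example_values = get_rows('user', [
  { 'single_cell': True, 'values': [1, 2, 3] },  # hypothetical
  { 'single_cell': True, 'sheet': {
      'url': 'https://docs.google.com/spreadsheets/d/...',  # hypothetical share link
      'tab': 'Sheet1',
      'range': 'A2:A'
  } }
])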