def runQuery(client, qString, GeneList, SampleList, PatientList, dryRun=False):
    """
    **`runQuery`**: a relatively generic BigQuery query-execution wrapper
    function which can be used to run a query in "dry-run" mode or not: the
    call to the `query()` function itself is inside a `try/except` block; if
    it fails we return `None`. Otherwise a "dry" run returns an empty
    dataframe, and a "live" run returns the query results as a dataframe.
    This function was modified from previous notebooks to handle the
    user-defined parameters required by this notebook.
    """
    print("\n in runQuery ... ")
    if dryRun:
        print(" dry-run only ")

    ## set up QueryJobConfig object
    job_config = bigquery.QueryJobConfig()
    query_params = [
        bigquery.ArrayQueryParameter('GENELIST', 'STRING', GeneList),
        bigquery.ArrayQueryParameter('SAMPLELIST', 'STRING', SampleList),
        bigquery.ArrayQueryParameter('PATIENTLIST', 'STRING', PatientList),
    ]
    job_config.query_parameters = query_params
    job_config.dry_run = dryRun
    job_config.use_query_cache = True
    job_config.use_legacy_sql = False

    ## run the query
    try:
        query_job = client.query(qString, job_config=job_config)
        ## print ( " query job state: ", query_job.state )
    except Exception:
        print(" FATAL ERROR: query execution failed ")
        return None

    ## return results as a dataframe (or an empty dataframe for a dry-run)
    if not dryRun:
        try:
            df = query_job.to_dataframe()
            if query_job.total_bytes_processed == 0:
                print(" the results for this query were previously cached ")
            else:
                print(" this query processed {} bytes ".format(
                    query_job.total_bytes_processed))
            jobTimeLine = query_job.timeline
            elapsed = 0
            for entry in jobTimeLine:
                elapsed = int(entry.elapsed_ms)
                # print(entry.elapsed_ms)
            print(" Approx. elapsed time : {} milliseconds ".format(elapsed))
            if len(df) < 1:
                print(" WARNING: this query returned NO results ")
            return df
        except Exception:
            print(" FATAL ERROR: query execution failed ")
            return None
    else:
        print(" if not cached, this query will process {} bytes ".format(
            query_job.total_bytes_processed))
        ## return an empty dataframe
        return pd.DataFrame()
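# --- Usage sketch (not from the original notebooks): a dry run to estimate
# --- cost, then a live run. The project, table, and column names here are
# --- hypothetical; the query string only needs to reference the @GENELIST,
# --- @SAMPLELIST, and @PATIENTLIST array parameters bound by `runQuery`.
from google.cloud import bigquery
import pandas as pd

client = bigquery.Client()
qString = """
    SELECT gene_symbol, sample_barcode, case_barcode
    FROM `my-project.my_dataset.expression`  -- hypothetical table
    WHERE gene_symbol IN UNNEST(@GENELIST)
      AND sample_barcode IN UNNEST(@SAMPLELIST)
      AND case_barcode IN UNNEST(@PATIENTLIST)
"""
runQuery(client, qString, ["TP53"], ["S-0001"], ["P-0001"], dryRun=True)
df = runQuery(client, qString, ["TP53"], ["S-0001"], ["P-0001"], dryRun=False)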
def generate_recovery_users_query_with_params(events, dateIntervals):
    print(
        "Generate recovery users query and params for events: {events} and date intervals: {dateIntervals}"
        .format(events=events, dateIntervals=dateIntervals))

    requiredKeysForQuery = extract_required_keys_for_generate_recovery_query(
        events)
    stepBeforeDropOff = requiredKeysForQuery["stepBeforeDropOff"]
    nextStepList = requiredKeysForQuery["nextStepList"]
    recoveryStep = requiredKeysForQuery["recoveryStep"]

    dateOfYesterday = calculate_date_of_yesterday_at_utc()
    beginningOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_START_OF_DAY)
    endOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_END_OF_DAY)

    recoveryQuery = ("""
        select distinct(`user_id`)
        from `{full_table_url}`
        where `event_type` in UNNEST(@recoveryStep)
        and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
        and `user_id` in
        (
            select `user_id`
            from `{full_table_url}`
            where `user_id` not in
            (
                select `user_id`
                from `{full_table_url}`
                where `event_type` in UNNEST(@nextStepList)
                and `time_transaction_occurred` <= @endOfYesterdayInMilliseconds
            )
            and `event_type` = @stepBeforeDropOff
            and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
        )
    """.format(full_table_url=FULL_TABLE_URL))

    recoveryParams = [
        bigquery.ArrayQueryParameter("recoveryStep", "STRING", recoveryStep),
        bigquery.ScalarQueryParameter("stepBeforeDropOff", "STRING",
                                      stepBeforeDropOff),
        bigquery.ArrayQueryParameter("nextStepList", "STRING", nextStepList),
        bigquery.ScalarQueryParameter("beginningOfYesterdayInMilliseconds",
                                      "INT64",
                                      beginningOfYesterdayInMilliseconds),
        bigquery.ScalarQueryParameter("endOfYesterdayInMilliseconds", "INT64",
                                      endOfYesterdayInMilliseconds),
    ]

    print("""
        Successfully generated recovery users query and params for events: {events}
        and date intervals: {dateIntervals}.
        Recovery query: {recoveryQuery}. Recovery params: {recoveryParams}
    """.format(events=events,
               dateIntervals=dateIntervals,
               recoveryQuery=recoveryQuery,
               recoveryParams=recoveryParams))

    return {"recoveryQuery": recoveryQuery, "recoveryParams": recoveryParams}
def test_array_to_query_parameter_honors_given_type(alias, type_):
    from google.cloud import bigquery

    assert _helpers.array_to_query_parameter([1.23], None, alias) == (
        bigquery.ArrayQueryParameter(None, type_, [1.23]))
    assert _helpers.array_to_query_parameter((), "foo", alias) == (
        bigquery.ArrayQueryParameter("foo", type_, ()))
def get_weekly_clientlibs_downloads(clientlibs_table_name, date_str):
    """Use a SQL query to collect the weekly download data of the client
    libraries.

    Args:
        clientlibs_table_name (str): Table name, which is the key in the
            CLIENTLIBS dict.
        date_str (str): A date string in "YYYYMMDD" format.

    Returns:
        list: rows of the query result.
    """
    client_libs = CLIENTLIBS[clientlibs_table_name]
    date_time = datetime.datetime.strptime(date_str, DATETIME_FORMAT)
    week_dates = [
        (date_time + datetime.timedelta(days=-i)).strftime(DATETIME_FORMAT)
        for i in range(7)
    ]
    query = """
        SELECT
            file.project as client_library_name,
            COUNT(*) as download_count
        FROM `the-psf.pypi.downloads*`
        WHERE
            file.project IN UNNEST(@client_libs)
            AND _TABLE_SUFFIX IN UNNEST(@week_dates)
        GROUP BY client_library_name
    """
    client = bigquery.Client()
    query_job = client.run_async_query(
        str(uuid.uuid4()),
        query,
        query_parameters=(
            bigquery.ArrayQueryParameter('client_libs', 'STRING',
                                         client_libs),
            bigquery.ArrayQueryParameter('week_dates', 'STRING',
                                         week_dates)))
    query_job.use_legacy_sql = False

    # Start the query job and wait for it to complete
    query_job.begin()
    wait_for_job(query_job)

    # Fetch the results
    result = query_job.query_results().fetch_data()
    result_list = [item for item in result]

    # In case the result_list contains metadata such as total_rows, the
    # actual rows will be the first element of the result_list.
    if len(result_list) > 0 and isinstance(result_list[0], list):
        result_list = result_list[0]

    rows = [(date_time,) + row for row in result_list]
    print(rows)
    return rows
def GeneSymbol_standardization(Gene_list, project_id):
    client = bigquery.Client(project_id)

    # query = '''
    # SELECT *
    # FROM `syntheticlethality.gene_information.gene_info_human`
    # where Gene in UNNEST(@input_gene_list)
    # '''
    query = '''
    SELECT *
    FROM `isb-cgc-bq.synthetic_lethality.gene_info_human_HGNC_NCBI_2020_07`
    where Gene in UNNEST(@input_gene_list)
    '''
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("input_gene_list", "STRING", Gene_list)
    ])
    id_map = client.query(query,
                          job_config=job_config).result().to_dataframe()

    Gene_list_all = list(
        set(list(id_map['Alias'].values) + list(id_map['Gene'].values)))

    # query1 = '''
    # select Hugo_Symbol
    # from `syntheticlethality.DepMap_public_20Q3.CCLE_mutation`
    # where Hugo_Symbol in UNNEST(@input_gene_list_new)
    # '''
    query1 = '''
    select Hugo_Symbol
    from `isb-cgc-bq.DEPMAP.CCLE_mutation_DepMapPublic_current`
    where Hugo_Symbol in UNNEST(@input_gene_list_new)
    '''
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("input_gene_list_new", "STRING",
                                     Gene_list_all)
    ])
    Mut_mat = client.query(query1,
                           job_config=job_config).result().to_dataframe()
    set_gene_CCLE = set(Mut_mat['Hugo_Symbol'])

    dic_gene_to_alias = {}
    output_gene_list = []
    for Gene in Gene_list:
        dic_gene_to_alias[Gene] = set(
            id_map.loc[id_map['Gene'] == Gene]['Alias']).intersection(
                set_gene_CCLE)
        if Gene in set(id_map['Gene']) and Gene in dic_gene_to_alias[Gene]:
            output_gene_list.append(Gene)
        else:
            print(Gene + ":" + ','.join(list(dic_gene_to_alias[Gene])))
            for value in dic_gene_to_alias[Gene]:
                if value in set(id_map['Alias']):
                    output_gene_list.append(value)

    return (dic_gene_to_alias, output_gene_list)
def main():
    """ Get the configuration file """
    logger = logging.getLogger(__name__)

    # Set up ENV
    CONFIG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'env.json')
    try:
        with open(CONFIG_PATH) as env_file:
            ENV = json.load(env_file)
    except FileNotFoundError:
        logger.error(
            f'Configuration file could not be found; please add file "{CONFIG_PATH}".'
        )
        ENV = dict()

    DATA_WAREHOUSE_COURSE_IDS = ENV["DATA_WAREHOUSE_COURSE_IDS"]
    DATA_WAREHOUSE_SHORT_COURSE_IDS = ENV["DATA_WAREHOUSE_SHORT_COURSE_IDS"]
    CANVAS_DATA_ID_INCREMENT = ENV["CANVAS_DATA_ID_INCREMENT"]
    TIME_LIMIT = ENV["TIME_LIMIT"]

    query_params = [
        bigquery.ArrayQueryParameter('course_ids', 'STRING',
                                     DATA_WAREHOUSE_COURSE_IDS),
        bigquery.ArrayQueryParameter('course_ids_short', 'STRING',
                                     DATA_WAREHOUSE_SHORT_COURSE_IDS),
        bigquery.ScalarQueryParameter('canvas_data_id_increment', 'INT64',
                                      CANVAS_DATA_ID_INCREMENT),
        bigquery.ScalarQueryParameter('time_limit', 'STRING', TIME_LIMIT)
    ]

    # from the configuration variable, load the queries based on UDP expanded vs events table
    # run queries and compare the returned dataframes
    expanded_vs_events_queries_json = ENV["EXPANDED_VS_EVENTS_QUERIES"]
    for event_type in expanded_vs_events_queries_json:
        print(event_type)
        expanded_query_array = []
        events_query_array = []
        for attribute, value in expanded_vs_events_queries_json[
                event_type].items():
            if attribute == "expanded_query_array":
                expanded_query_array = value
            elif attribute == "events_query_array":
                events_query_array = value
        compare_expanded_vs_events_df(expanded_query_array,
                                      events_query_array, query_params)
def set_prefixes_to_status(prefixes: List[str], status: str,
                           services: Services,
                           options: STSJobManagerOptions):
    """
    Sets a list of prefixes to a given status in the database.
    """
    logger.info(f'Updating {len(prefixes)} prefixes to `{status}` status')

    table = get_table_identifier(services, options.bigquery_options,
                                 options.bigquery_options.table_name['job'])

    # The API does not support table names as parameters in parameterized
    # queries, so the table identifier is interpolated directly:
    # https://cloud.google.com/bigquery/docs/parameterized-queries
    # We can't UPDATE jobs that are currently in a stream, so defer those for
    # later:
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-manipulation-language#limitations
    query = f"""
    UPDATE `{table}`
    SET status = @status, last_updated = CURRENT_TIMESTAMP()
    WHERE prefix IN UNNEST(@prefixes)
    AND last_updated < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 MINUTE)
    """

    params = [
        bigquery.ScalarQueryParameter("status", "STRING", status),
        bigquery.ArrayQueryParameter("prefixes", "STRING", prefixes)
    ]

    run_query(query, params, services, options).result()
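# --- A minimal direct-client sketch of the same pattern (all names here are
# --- assumptions, not the project's own helpers): the table identifier cannot
# --- be a query parameter, so it is interpolated into the statement, while
# --- the status and prefix list are bound safely as query parameters.
from google.cloud import bigquery

client = bigquery.Client()
table = "my-project.sts_job_manager.job"  # hypothetical table identifier
query = f"""
UPDATE `{table}`
SET status = @status, last_updated = CURRENT_TIMESTAMP()
WHERE prefix IN UNNEST(@prefixes)
"""
job_config = bigquery.QueryJobConfig(query_parameters=[
    bigquery.ScalarQueryParameter("status", "STRING", "pending"),
    bigquery.ArrayQueryParameter("prefixes", "STRING", ["logs/a/", "logs/b/"]),
])
client.query(query, job_config=job_config).result()  # block until the DML finishes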
def client_query_w_array_params():
    # [START bigquery_query_params_arrays]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    query = """
        SELECT name, sum(number) as count
        FROM `bigquery-public-data.usa_names.usa_1910_2013`
        WHERE gender = @gender
        AND state IN UNNEST(@states)
        GROUP BY name
        ORDER BY count DESC
        LIMIT 10;
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("gender", "STRING", "M"),
            bigquery.ArrayQueryParameter("states", "STRING",
                                         ["WA", "WI", "WV", "WY"]),
        ]
    )
    query_job = client.query(query, job_config=job_config)  # Make an API request.

    for row in query_job:
        print("{}: \t{}".format(row.name, row.count))
def __init__(self, blocklisted_document_types, date_from, date_until):
    self.blocklisted_document_types = blocklisted_document_types
    self.date_from = date_from
    self.date_until = date_until
    logger = logging.getLogger(
        'make_functional_edges_and_weights.EdgeWeightExtractor')
    credentials, project_id = google.auth.default()
    logger.info(f'creating bigquery client for project {project_id}')
    self.client = bigquery.Client(credentials=credentials,
                                  project=project_id)
    logger.info(
        'reading query from src/data_preprocessing/query_content_id_edge_weights.sql'
    )
    self.query_edge_list = read_file_as_string(
        "src/data_preprocessing/query_content_id_edge_weights.sql")
    self.query_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ScalarQueryParameter("three_weeks_ago", "STRING",
                                      self.date_from),
        bigquery.ScalarQueryParameter("yesterday", "STRING",
                                      self.date_until),
        bigquery.ArrayQueryParameter("excluded_document_types", "STRING",
                                     self.blocklisted_document_types)
    ])
    self.df = self.client.query(self.query_edge_list,
                                job_config=self.query_config).to_dataframe()
def _test_query_from_bq_table(self):
    '''Skipping this for now; need to create a permanent test table to query from.'''
    query = '''
    SELECT stride_treatment_team_id, pat_enc_csn_id, treatment_team
    FROM stride_2008_2017.stride_treatment_team
    WHERE stride_treatment_team_id IN UNNEST(@tt_ids);
    '''
    headers = ['stride_treatment_team_id', 'pat_enc_csn_id',
               'treatment_team']
    treatment_team_ids = list(range(2141155, 2141205))
    query_params = [
        bigquery.ArrayQueryParameter('tt_ids', 'INT64', treatment_team_ids)
    ]
    query_job = self.bqClient.queryBQ(str(query),
                                      query_params=query_params,
                                      location='US',
                                      batch_mode=False,
                                      verbose=True)

    actual_data = []
    with open(self.tmp_csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in query_job:
            writer.writerow(row.values())
            actual_data.append(row.values())

    # TODO fill expected data
    expected_data = []
    self.assertEqualTable(expected_data, actual_data)
def find_files_in_load_history(
        bq_project: str, dataset: str,
        areas: dict[str, set[PathWithCrc]]) -> dict[str, set[PathWithCrc]]:
    client = bigquery.Client(project=bq_project)

    loaded_paths = {}
    for area, paths_with_crc in areas.items():
        logging.debug(f"\tPulling loaded files for area {area}...")
        target_paths = [
            path_with_crc.path for path_with_crc in paths_with_crc
        ]
        query = f"""
            SELECT target_path, checksum_crc32c
            FROM `datarepo_{dataset}.datarepo_load_history` dlh
            WHERE state = 'succeeded'
            AND target_path IN UNNEST(@paths)
        """
        job_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ArrayQueryParameter("paths", "STRING", target_paths),
        ])
        query_job = client.query(query, job_config=job_config)
        loaded_paths[area] = {
            PathWithCrc(row["target_path"], row["checksum_crc32c"])
            for row in query_job
        }

    return loaded_paths
def get_log_probs(names_list, timeout_seconds=30):
    bigquery_client = bigquery.Client()
    query = """
    WITH test_names AS
        (SELECT * FROM UNNEST(@names) AS test_name CROSS JOIN {})
    SELECT test_name name, test_year year, IFNULL(logp, min_logp - {}) logp
    FROM `{}`
    RIGHT OUTER JOIN test_names
    ON LOWER(name)=LOWER(test_name) AND year=test_year
    ORDER BY test_year ASC, test_name DESC
    """.format(min_logp_table, unknown_penalty, logp_table)
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter('names', 'STRING', names_list)
    ])
    job_config.use_query_cache = True
    query_job = bigquery_client.query(query, job_config=job_config)
    results = query_job.result(timeout=timeout_seconds)

    d = defaultdict(float)
    for row in results:
        d[row["year"]] += row["logp"]
    items = sorted(d.items())
    years = np.array([i[0] for i in items])
    logps = np.array([i[1] for i in items])
    return years, logps
def GeneSymbol_standardization_output(Gene_list, project_id):
    Gene_list = list(set(Gene_list))
    client = bigquery.Client(project_id)

    # query = '''
    # SELECT *
    # FROM `syntheticlethality.gene_information.gene_info_human`
    # where Gene in UNNEST(@input_gene_list)
    # '''
    query = '''
    SELECT *
    FROM `isb-cgc-bq.synthetic_lethality.gene_info_human_HGNC_NCBI_2020_07`
    where Gene in UNNEST(@input_gene_list)
    '''
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("input_gene_list", "STRING", Gene_list)
    ])
    id_map = client.query(query,
                          job_config=job_config).result().to_dataframe()

    alias_list = list(id_map['Alias'].values)
    gene_list = list(id_map['Gene'].values)

    dic_alias_to_gene = {}
    for i in range(0, len(alias_list)):
        dic_alias_to_gene[alias_list[i]] = gene_list[i]

    return dic_alias_to_gene
def query_array_params(gender, states):
    client = bigquery.Client()
    query = """
        SELECT name, sum(number) as count
        FROM `bigquery-public-data.usa_names.usa_1910_2013`
        WHERE gender = @gender
        AND state IN UNNEST(@states)
        GROUP BY name
        ORDER BY count DESC
        LIMIT 10;
        """
    query_job = client.run_async_query(
        str(uuid.uuid4()),
        query,
        query_parameters=(
            bigquery.ScalarQueryParameter('gender', 'STRING', gender),
            bigquery.ArrayQueryParameter('states', 'STRING', states)))
    query_job.use_legacy_sql = False

    query_job.begin()
    query_job.result()  # Wait for job to complete

    # Print the results.
    destination_table = query_job.destination
    destination_table.reload()
    for row in destination_table.fetch_data():
        print(row)
def make_bq_array_parameters(self, parameters: list) -> list:
    array_params = [
        # TODO: Improve this method
        bigquery.ArrayQueryParameter(key, get_bq_data_type(value[0]), value)
        for key, value in parameters
    ]
    return array_params
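# --- Hypothetical usage of `make_bq_array_parameters`: `parameters` is an
# --- iterable of (name, values) pairs, and `get_bq_data_type` is assumed to
# --- map a sample Python value to a BigQuery type name such as 'INT64'.
# array_params = builder.make_bq_array_parameters(
#     [("ids", [1, 2, 3]), ("names", ["alice", "bob"])])
# -> [ArrayQueryParameter('ids', 'INT64', [1, 2, 3]),
#     ArrayQueryParameter('names', 'STRING', ['alice', 'bob'])]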
def array_to_query_parameter(value, name=None):
    """Convert an array-like value into a query parameter.

    Args:
        value (Sequence[Any]): The elements of the array (should not be a
            string-like Sequence).
        name (Optional[str]): Name of the query parameter.

    Returns:
        A query parameter corresponding with the type and value of the plain
        Python object.

    Raises:
        google.cloud.bigquery.dbapi.exceptions.ProgrammingError:
            if the type of array elements cannot be determined.
    """
    if not array_like(value):
        raise exceptions.ProgrammingError(
            "The value of parameter {} must be a sequence that is "
            "not string-like.".format(name))

    if not value:
        raise exceptions.ProgrammingError(
            "Encountered an empty array-like value of parameter {}, cannot "
            "determine array elements type.".format(name))

    # Assume that all elements are of the same type, and let the backend
    # handle any type incompatibilities among the array elements
    array_type = bigquery_scalar_type(value[0])
    if array_type is None:
        raise exceptions.ProgrammingError(
            "Encountered unexpected first array element of parameter {}, "
            "cannot determine array elements type.".format(name))

    return bigquery.ArrayQueryParameter(name, array_type, value)
def _build_query_parameters(query: BigQuery, parameters: dict) -> list:
    """
    Builds the parameter list for a BigQuery job from a supplied list of
    parameter values.

    Args:
        query (BigQuery): A query with parameter specification.
        parameters (dict): Corresponding dict of parameters and supplied
            values.

    Returns:
        (list) A list of BigQuery parameters.
    """
    # Build query parameters
    query_params = []
    for spec in query.parameter_spec:
        pname = spec["name"]
        ptype = spec["type"]
        if pname not in parameters:
            raise RuntimeError(
                f"Parameter '{pname}' unspecified in `run_query`")
        if spec["array_type"] is False:
            bqparam = bigquery.ScalarQueryParameter(pname, ptype,
                                                    parameters[pname])
        else:
            if not isinstance(parameters[pname], list):
                raise RuntimeError(
                    f"Query '{query.name}' expects parameter '{pname}' to be a list"
                )
            bqparam = bigquery.ArrayQueryParameter(pname, ptype,
                                                   parameters[pname])
        query_params.append(bqparam)
    return query_params
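# --- Illustrative call for `_build_query_parameters` (the spec and names are
# --- made up for this sketch): any object exposing `name` and
# --- `parameter_spec` in the shape consumed above will do for a quick check.
class _StubQuery:
    name = "daily_report"
    parameter_spec = [
        {"name": "run_date", "type": "STRING", "array_type": False},
        {"name": "regions", "type": "STRING", "array_type": True},
    ]

params = _build_query_parameters(
    _StubQuery(), {"run_date": "2024-01-01", "regions": ["us", "eu"]})
# params[0] is a ScalarQueryParameter; params[1] is an ArrayQueryParameter.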
def query_array_params(gender, states):
    client = bigquery.Client()
    query = """
        SELECT name, sum(number) as count
        FROM `bigquery-public-data.usa_names.usa_1910_2013`
        WHERE gender = @gender
        AND state IN UNNEST(@states)
        GROUP BY name
        ORDER BY count DESC
        LIMIT 10;
        """
    query_params = [
        bigquery.ScalarQueryParameter('gender', 'STRING', gender),
        bigquery.ArrayQueryParameter('states', 'STRING', states)
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params

    query_job = client.query(query, job_config=job_config)
    query_job.result()  # Wait for job to complete

    # Print the results.
    destination_table_ref = query_job.destination
    table = client.get_table(destination_table_ref)
    for row in client.list_rows(table):
        print(row)
def __init__(self,
             query_path,
             blocklisted_document_types,
             date_from=None,
             date_until=None,
             weight_threshold=None):
    self.logger = logging.getLogger(
        'make_functional_edges_and_weights.EdgeWeightExtractor')
    self.blocklisted_document_types = blocklisted_document_types
    self.weight_threshold = weight_threshold
    self.date_from = date_from
    self.date_until = date_until
    self.query_path = query_path
    self.df = None

    credentials, project_id = google.auth.default()
    self.logger.info(f'creating bigquery client for project {project_id}')

    # TODO use the big query client util in future
    self.client = bigquery.Client(credentials=credentials,
                                  project=project_id)

    self.logger.info(f'reading query from {query_path}')
    self.query_edge_list = parse_sql_script(self.query_path)
    self.query_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ScalarQueryParameter("from_date", "STRING", self.date_from),
        bigquery.ScalarQueryParameter("to_date", "STRING", self.date_until),
        bigquery.ScalarQueryParameter("weight_threshold", "INT64",
                                      self.weight_threshold),
        bigquery.ArrayQueryParameter("excluded_document_types", "STRING",
                                     self.blocklisted_document_types)
    ])
def print_query_results(client, ids, min_duration=0, max_duration=84000):
    query = """
    SELECT start_station_id
         , COUNT(*) as num_trips
    FROM `bigquery-public-data`.london_bicycles.cycle_hire
    WHERE start_station_id IN UNNEST(@STATIONS)
    AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION
    GROUP BY start_station_id
    """
    query_params = [
        bigquery.ArrayQueryParameter('STATIONS', "INT64", ids),
        bigquery.ScalarQueryParameter('MIN_DURATION', "FLOAT64",
                                      min_duration),
        bigquery.ScalarQueryParameter('MAX_DURATION', "FLOAT64",
                                      max_duration),
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params
    query_job = client.query(
        query,
        location="EU",
        job_config=job_config,
    )
    for row in query_job:
        print("{}: \t{}".format(row.start_station_id, row.num_trips))
    print("______________________")
def array_to_query_parameter(value, name=None, query_parameter_type=None):
    """Convert an array-like value into a query parameter.

    Args:
        value (Sequence[Any]): The elements of the array (should not be a
            string-like Sequence).
        name (Optional[str]): Name of the query parameter.
        query_parameter_type (Optional[str]): Given type for the parameter.

    Returns:
        A query parameter corresponding with the type and value of the plain
        Python object.

    Raises:
        google.cloud.bigquery.dbapi.exceptions.ProgrammingError:
            if the type of array elements cannot be determined.
    """
    if not array_like(value):
        raise exceptions.ProgrammingError(
            "The value of parameter {} must be a sequence that is "
            "not string-like.".format(name))

    if query_parameter_type or value:
        array_type = _parameter_type(
            name,
            value[0] if value else None,
            query_parameter_type,
            value_doc="array element ",
        )
    else:
        raise exceptions.ProgrammingError(
            "Encountered an empty array-like value of parameter {}, cannot "
            "determine array elements type.".format(name))

    return bigquery.ArrayQueryParameter(name, array_type, value)
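# --- Behavior sketch for the DB-API helper above: with an explicit type the
# --- elements are not inspected, so an empty array is acceptable; without a
# --- given type, the type is inferred from the first element and an empty
# --- array raises.
ids = array_to_query_parameter([1, 2, 3], "ids")          # type inferred from elements
names = array_to_query_parameter([], "names", "STRING")   # empty is fine with a given type
# array_to_query_parameter([], "names") would raise ProgrammingError.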
def test_generate_recovery_users_query_with_params():
    dateOfYesterday = calculate_date_of_yesterday_at_utc()
    beginningOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_START_OF_DAY)
    endOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_END_OF_DAY)

    expectedRecoveryQuery = ("""
        select distinct(`user_id`)
        from `{full_table_url}`
        where `event_type` in UNNEST(@recoveryStep)
        and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
        and `user_id` in
        (
            select `user_id`
            from `{full_table_url}`
            where `user_id` not in
            (
                select `user_id`
                from `{full_table_url}`
                where `event_type` in UNNEST(@nextStepList)
                and `time_transaction_occurred` <= @endOfYesterdayInMilliseconds
            )
            and `event_type` = @stepBeforeDropOff
            and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
        )
    """.format(full_table_url=FULL_TABLE_URL))

    expectedRecoveryParams = [
        bigquery.ArrayQueryParameter("recoveryStep", "STRING", recoveryStep),
        bigquery.ScalarQueryParameter("stepBeforeDropOff", "STRING",
                                      stepBeforeDropOff),
        bigquery.ArrayQueryParameter("nextStepList", "STRING", nextStepList),
        bigquery.ScalarQueryParameter("beginningOfYesterdayInMilliseconds",
                                      "INT64",
                                      beginningOfYesterdayInMilliseconds),
        bigquery.ScalarQueryParameter("endOfYesterdayInMilliseconds", "INT64",
                                      endOfYesterdayInMilliseconds),
    ]

    expectedRecoveryQueryAndParams = {
        "recoveryQuery": expectedRecoveryQuery,
        "recoveryParams": expectedRecoveryParams
    }

    assert generate_recovery_users_query_with_params(
        sampleEvents, sampleDateIntervals) == expectedRecoveryQueryAndParams
def get_weekly_clientlibs_downloads(clientlibs_table_name, date_str):
    """Use a SQL query to collect the weekly download data of the client
    libraries.

    Args:
        clientlibs_table_name (str): Table name, which is the key in the
            CLIENTLIBS dict.
        date_str (str): A date string in "YYYYMMDD" format.

    Returns:
        list: rows of the query result.
    """
    client_libs = CLIENTLIBS[clientlibs_table_name]
    date_time = datetime.datetime.strptime(date_str, DATETIME_FORMAT)
    week_dates = [
        (date_time + datetime.timedelta(days=-i)).strftime(DATETIME_FORMAT)
        for i in range(7)
    ]
    query = """
        SELECT
            file.project as client_library_name,
            COUNT(*) as download_count
        FROM `the-psf.pypi.downloads*`
        WHERE
            file.project IN UNNEST(@client_libs)
            AND _TABLE_SUFFIX IN UNNEST(@week_dates)
        GROUP BY client_library_name
    """
    client = bigquery.Client()
    query_parameters = [
        bigquery.ArrayQueryParameter('client_libs', 'STRING', client_libs),
        bigquery.ArrayQueryParameter('week_dates', 'STRING', week_dates)
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_parameters
    query_job = client.query(query, job_config=job_config)

    # Wait for the job to complete and get the results
    results = [row.values() for row in query_job.result()]
    rows = [(date_time,) + row for row in results]
    return rows
def bq_param_array(param, value):
    param_type = param.type()
    assert isinstance(param_type, dt.Array), str(param_type)

    try:
        bigquery_type = _IBIS_TYPE_TO_DTYPE[str(param_type.value_type)]
    except KeyError:
        raise com.UnsupportedBackendType(param_type)
    else:
        return bq.ArrayQueryParameter(param._name, bigquery_type, value)
def bigquery_match(self, bq_client, bq_query, event_field_name, values):
    """Run a BigQuery query for rows with matching event_field_name values.

    Returns:
        BigQuery query job.
    """
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter(event_field_name, "STRING", values),
    ])
    return bq_client.query(bq_query, job_config=job_config)
def test_to_query_parameters_list_w_types():
    from google.cloud import bigquery

    assert _helpers.to_query_parameters(
        [1, 1.2, None, []], [None, "numeric", "string", "float64"]) == [
            bigquery.ScalarQueryParameter(None, "INT64", 1),
            bigquery.ScalarQueryParameter(None, "NUMERIC", 1.2),
            bigquery.ScalarQueryParameter(None, "STRING", None),
            bigquery.ArrayQueryParameter(None, "FLOAT64", []),
        ]
def get_all_tx_from_address_v1(list_addresses):
    # get transaction overview from list of wallets
    query = """
    WITH all_transactions AS (
        -- inputs
        SELECT
            transaction_hash
            , block_timestamp as timestamp
            , array_to_string(addresses, ",") as address
            , value
            , 'sent' as type
        FROM `bigquery-public-data.crypto_bitcoin.inputs`

        UNION ALL

        -- outputs
        SELECT
            transaction_hash
            , block_timestamp as timestamp
            , array_to_string(addresses, ",") as address
            , value
            , 'received' as type
        FROM `bigquery-public-data.crypto_bitcoin.outputs`
    )
    SELECT
        address
        , type
        , sum(value) as sum
        , avg(value) as avg
        , min(value) as min
        , max(value) as max
        , count(transaction_hash) as number_transactions
        , min(timestamp) as first_transaction
        , max(timestamp) as last_transaction
    FROM all_transactions
    WHERE address in UNNEST(@address)
    GROUP BY type, address
    """
    query_params = [
        bigquery.ArrayQueryParameter("address", "STRING", list_addresses),
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params

    query_job = client.query(
        query,
        job_config=job_config,
    )
    result = query_job.result()
    wallet_info = result.to_dataframe()
    return wallet_info
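# --- Example call (the address below is the well-known Bitcoin genesis-block
# --- address); assumes a module-level `client = bigquery.Client()`, which the
# --- function above also relies on.
# wallet_info = get_all_tx_from_address_v1(
#     ["1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"])
# print(wallet_info[["address", "type", "number_transactions"]])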
def get_compatibility_combinations(
        self, packages: List[package.Package]
) -> Mapping[FrozenSet[package.Package], List[CompatibilityResult]]:
    """Returns a mapping between package pairs and CompatibilityResults.

    Args:
        packages: The packages to check compatibility for.

    Returns:
        A mapping between every combination of input packages and their
        CompatibilityResults. For example:
        get_compatibility_combinations(packages=[p1, p2, p3]) => {
            frozenset([p1, p2]): [CompatibilityResult...],
            frozenset([p1, p3]): [CompatibilityResult...],
            frozenset([p2, p3]): [CompatibilityResult...],
        }.
    """
    install_name_to_package = {p.install_name: p for p in packages}

    packages_to_results = {}
    for p1, p2 in itertools.combinations(packages, r=2):
        packages_to_results[frozenset([p1, p2])] = []

    query_params = [
        bigquery.ArrayQueryParameter('install_names', 'STRING',
                                     [p.install_name for p in packages]),
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params

    query = ('SELECT * '
             'FROM {} s1 '
             'WHERE s1.install_name_lower IN UNNEST(@install_names) '
             '  AND s1.install_name_higher IN UNNEST(@install_names) '
             '  AND timestamp = ( '
             '      SELECT MAX(timestamp) '
             '      FROM {} s2 '
             '      WHERE s1.install_name_lower = s2.install_name_lower '
             '        AND s1.install_name_higher = s2.install_name_higher '
             '        AND s1.py_version = s2.py_version)'.format(
                 self._pairwise_table_id, self._pairwise_table_id))

    query_job = self._client.query(query, job_config=job_config)

    for row in query_job:
        p_lower = install_name_to_package[row.install_name_lower]
        p_higher = install_name_to_package[row.install_name_higher]
        packages_to_results[frozenset([p_lower, p_higher])].append(
            self._row_to_compatibility_status([p_lower, p_higher], row))

    return {
        p: self._filter_older_versions(crs)
        for (p, crs) in packages_to_results.items()
    }
def get_latest_timestamps_from_bq(bq_client, rally_items):
    query = f'''
        SELECT rally_id, max(timestamp) AS timestamp
        FROM {SCHEDULE_EVENTS_TABLE}
        WHERE rally_id IN UNNEST(@CANDIDATES)
        GROUP BY rally_id
    '''
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = [
        bigquery.ArrayQueryParameter('CANDIDATES', 'STRING',
                                     [x[0] for x in rally_items])
    ]
    return dict([(x.rally_id, x.timestamp)
                 for x in bq_client.query(query, job_config=job_config)])
def test_to_query_parameters_dict_w_types():
    from google.cloud import bigquery

    assert sorted(
        _helpers.to_query_parameters(
            dict(i=1, x=1.2, y=None, z=[]),
            dict(x="numeric", y="string", z="float64")),
        key=lambda p: p.name,
    ) == [
        bigquery.ScalarQueryParameter("i", "INT64", 1),
        bigquery.ScalarQueryParameter("x", "NUMERIC", 1.2),
        bigquery.ScalarQueryParameter("y", "STRING", None),
        bigquery.ArrayQueryParameter("z", "FLOAT64", []),
    ]