import pandas as pd
from google.cloud import bigquery


def runQuery(client, qString, GeneList, SampleList, PatientList, dryRun=False):
    """
    **`runQuery`**: a relatively generic BigQuery query-execution wrapper that can
    run a query in "dry-run" mode or not. The call to the `query()` function itself
    is inside a `try/except` block; if it fails we return `None`. Otherwise a dry
    run returns an empty dataframe, and a live run returns the query results as a
    dataframe. This function was modified from previous notebooks to handle the
    user-defined parameters needed by this notebook.
    """

    print("\n in runQuery ... ")
    if (dryRun):
        print("    dry-run only ")

    ## set up QueryJobConfig object
    job_config = bigquery.QueryJobConfig()

    query_params = [
        bigquery.ArrayQueryParameter('GENELIST', 'STRING', GeneList),
        bigquery.ArrayQueryParameter('SAMPLELIST', 'STRING', SampleList),
        bigquery.ArrayQueryParameter('PATIENTLIST', 'STRING', PatientList),
    ]
    job_config.query_parameters = query_params

    job_config.dry_run = dryRun
    job_config.use_query_cache = True
    job_config.use_legacy_sql = False

    ## run the query
    try:
        query_job = client.query(qString, job_config=job_config)
        ## print ( "    query job state: ", query_job.state )
    except Exception:
        print("  FATAL ERROR: query execution failed ")
        return (None)

    ## return results as a dataframe (or an empty dataframe for a dry-run)
    if (not dryRun):
        try:
            df = query_job.to_dataframe()
            if (query_job.total_bytes_processed == 0):
                print("    the results for this query were previously cached ")
            else:
                print("    this query processed {} bytes ".format(
                    query_job.total_bytes_processed))
                jobTimeLine = query_job.timeline
                elapsed = 0
                for entry in jobTimeLine:
                    # the final timeline entry carries the total elapsed time
                    elapsed = int(entry.elapsed_ms)
                print("    Approx. elapsed time : {} milliseconds ".format(
                    elapsed))

            if (len(df) < 1):
                print("  WARNING: this query returned NO results ")
            return (df)
        except Exception:
            print("  FATAL ERROR: fetching query results failed ")
            return (None)

    else:
        print("    if not cached, this query will process {} bytes ".format(
            query_job.total_bytes_processed))
        ## return an empty dataframe
        return (pd.DataFrame())
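For illustration, a call to runQuery might look like the following; the table name and the gene/sample/patient lists are placeholders, not part of the original notebook.

# Illustrative usage only; the table and lists below are placeholders.
client = bigquery.Client()  # assumes application-default credentials

qString = """
    SELECT sample_barcode, gene_name, expression
    FROM `my-project.my_dataset.expression`   -- hypothetical table
    WHERE gene_name IN UNNEST(@GENELIST)
      AND sample_barcode IN UNNEST(@SAMPLELIST)
      AND case_barcode IN UNNEST(@PATIENTLIST)
"""

# Dry-run first to estimate bytes processed, then run for real.
runQuery(client, qString, ["TP53", "BRCA1"], ["S-01", "S-02"], ["P-01"], dryRun=True)
df = runQuery(client, qString, ["TP53", "BRCA1"], ["S-01", "S-02"], ["P-01"])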
Example 2
def generate_recovery_users_query_with_params(events, dateIntervals):
    print(
        "Generate recovery users query and params for events: {events} and date intervals: {dateIntervals}"
        .format(events=events, dateIntervals=dateIntervals))

    requiredKeysForQuery = extract_required_keys_for_generate_recovery_query(
        events)

    stepBeforeDropOff = requiredKeysForQuery["stepBeforeDropOff"]
    nextStepList = requiredKeysForQuery["nextStepList"]
    recoveryStep = requiredKeysForQuery["recoveryStep"]

    dateOfYesterday = calculate_date_of_yesterday_at_utc()
    beginningOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_START_OF_DAY)
    endOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_END_OF_DAY)

    recoveryQuery = ("""
            select distinct(`user_id`)
            from `{full_table_url}`
            where `event_type` in UNNEST(@recoveryStep)
            and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
            and `user_id` in
            (
                select `user_id`
                from `{full_table_url}`
                where `user_id` not in
                (
                    select `user_id`
                    from `{full_table_url}`
                    where `event_type` in UNNEST(@nextStepList)
                    and `time_transaction_occurred` <= @endOfYesterdayInMilliseconds
                )
                and `event_type` = @stepBeforeDropOff
                and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
            )
        """.format(full_table_url=FULL_TABLE_URL))

    recoveryParams = [
        bigquery.ArrayQueryParameter("recoveryStep", "STRING", recoveryStep),
        bigquery.ScalarQueryParameter("stepBeforeDropOff", "STRING",
                                      stepBeforeDropOff),
        bigquery.ArrayQueryParameter("nextStepList", "STRING", nextStepList),
        bigquery.ScalarQueryParameter("beginningOfYesterdayInMilliseconds",
                                      "INT64",
                                      beginningOfYesterdayInMilliseconds),
        bigquery.ScalarQueryParameter("endOfYesterdayInMilliseconds", "INT64",
                                      endOfYesterdayInMilliseconds),
    ]

    print("""
        Successfully generated recovery users query and params for events: {events} and date intervals: {dateIntervals}.
        Recovery query: {recoveryQuery}. Recovery params: {recoveryParams}
        """.format(events=events,
                   dateIntervals=dateIntervals,
                   recoveryQuery=recoveryQuery,
                   recoveryParams=recoveryParams))

    return {"recoveryQuery": recoveryQuery, "recoveryParams": recoveryParams}
def test_array_to_query_parameter_honors_given_type(alias, type_):
    from google.cloud import bigquery

    assert _helpers.array_to_query_parameter(
        [1.23], None,
        alias) == (bigquery.ArrayQueryParameter(None, type_, [1.23]))
    assert _helpers.array_to_query_parameter(
        (), "foo", alias) == (bigquery.ArrayQueryParameter("foo", type_, ()))
def get_weekly_clientlibs_downloads(clientlibs_table_name, date_str):
    """Use a SQL query to collect the weekly download data of the client
    libraries.

    Args:
        clientlibs_table_name (str): Table name, which is the key in the
                                     CLIENTLIBS dict.
        date_str (str): A date string in "YYYYMMDD" format.

    Returns:
         list: rows of the query result.
    """
    client_libs = CLIENTLIBS[clientlibs_table_name]
    date_time = datetime.datetime.strptime(date_str, DATETIME_FORMAT)
    week_dates = [
        (date_time + datetime.timedelta(days=-i)).strftime(DATETIME_FORMAT)
        for i in range(7)
    ]
    query = """
            SELECT
                file.project as client_library_name,
                COUNT(*) as download_count
            FROM
                `the-psf.pypi.downloads*`
            WHERE
                file.project IN UNNEST(@client_libs)
                AND
                _TABLE_SUFFIX IN UNNEST(@week_dates)
            GROUP BY client_library_name
        """
    client = bigquery.Client()
    query_job = client.run_async_query(
        str(uuid.uuid4()),
        query,
        query_parameters=(bigquery.ArrayQueryParameter('client_libs', 'STRING',
                                                       client_libs),
                          bigquery.ArrayQueryParameter('week_dates', 'STRING',
                                                       week_dates)))
    query_job.use_legacy_sql = False

    # Start the query job and wait for it to complete
    query_job.begin()
    wait_for_job(query_job)

    # Fetch the results
    result = query_job.query_results().fetch_data()
    result_list = [item for item in result]

    # In case the result_list contains the metadata like total_rows, the
    # actual rows will be the first element of the result_list.
    if len(result_list) > 0 and isinstance(result_list[0], list):
        result_list = result_list[0]

    rows = [(date_time, ) + row for row in result_list]
    print(rows)

    return rows
Example 5
def GeneSymbol_standardization(Gene_list, project_id):
    client = bigquery.Client(project_id)
    #query='''
    #    SELECT *
    #    FROM  `syntheticlethality.gene_information.gene_info_human`
    #    where Gene in UNNEST(@input_gene_list)
    #    '''
    query = '''
        SELECT *
        FROM  `isb-cgc-bq.synthetic_lethality.gene_info_human_HGNC_NCBI_2020_07`
        where Gene in UNNEST(@input_gene_list)
        '''
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("input_gene_list", "STRING", Gene_list)
    ])

    id_map = client.query(query, job_config=job_config).result().to_dataframe()

    Gene_list_all = list(
        set(list(id_map['Alias'].values) + list(id_map['Gene'].values)))

    #query1 = '''
    #            select Hugo_Symbol
    #            from `syntheticlethality.DepMap_public_20Q3.CCLE_mutation`
    #            where Hugo_Symbol in UNNEST(@input_gene_list_new)
    #            '''

    query1 = ''' 
                select Hugo_Symbol
                from `isb-cgc-bq.DEPMAP.CCLE_mutation_DepMapPublic_current`
                where Hugo_Symbol in UNNEST(@input_gene_list_new)
                '''
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("input_gene_list_new", "STRING",
                                     Gene_list_all)
    ])

    Mut_mat = client.query(query1,
                           job_config=job_config).result().to_dataframe()
    set_gene_CCLE = set(Mut_mat['Hugo_Symbol'])

    dic_gene_to_alias = {}
    output_gene_list = []
    for Gene in Gene_list:
        dic_gene_to_alias[Gene] = set(id_map.loc[id_map['Gene'] == Gene]
                                      ['Alias']).intersection(set_gene_CCLE)
        if Gene in set(id_map['Gene']) and Gene in dic_gene_to_alias[Gene]:
            output_gene_list.append(Gene)
        else:
            print(Gene + ":" + ','.join(list(dic_gene_to_alias[Gene])))
            for value in dic_gene_to_alias[Gene]:
                if value in set(id_map['Alias']):
                    output_gene_list.append(value)

    return (dic_gene_to_alias, output_gene_list)
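A short illustrative call (assuming access to the public isb-cgc-bq datasets and a billing project of your own; the project ID and gene symbols are placeholders):

alias_map, standardized_genes = GeneSymbol_standardization(
    ["TP53", "PARP1", "BRCA1"], "my-billing-project")
print(standardized_genes)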
def main():
    """
    Get the configuration file
    """
    logger = logging.getLogger(__name__)

    # Set up ENV
    CONFIG_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'env.json')
    try:
        with open(CONFIG_PATH) as env_file:
            ENV = json.load(env_file)
    except FileNotFoundError:
        logger.error(
            f'Configuration file could not be found; please add file "{CONFIG_PATH}".'
        )
        ENV = dict()

    DATA_WAREHOUSE_COURSE_IDS = ENV["DATA_WAREHOUSE_COURSE_IDS"]
    DATA_WAREHOUSE_SHORT_COURSE_IDS = ENV["DATA_WAREHOUSE_SHORT_COURSE_IDS"]
    CANVAS_DATA_ID_INCREMENT = ENV["CANVAS_DATA_ID_INCREMENT"]
    TIME_LIMIT = ENV["TIME_LIMIT"]

    query_params = [
        bigquery.ArrayQueryParameter('course_ids', 'STRING',
                                     DATA_WAREHOUSE_COURSE_IDS),
        bigquery.ArrayQueryParameter('course_ids_short', 'STRING',
                                     DATA_WAREHOUSE_SHORT_COURSE_IDS),
        bigquery.ScalarQueryParameter('canvas_data_id_increment', 'INT64',
                                      CANVAS_DATA_ID_INCREMENT),
        bigquery.ScalarQueryParameter('time_limit', 'STRING', TIME_LIMIT)
    ]

    # from the configuration variable, load the queries based on UDP expanded vs events table
    # run queries and compare the returned dataframes
    expanded_vs_events_queries_json = ENV["EXPANDED_VS_EVENTS_QUERIES"]
    for event_type in expanded_vs_events_queries_json:
        print(event_type)
        expanded_query_array = []
        events_query_array = []
        for attribute, value in expanded_vs_events_queries_json[
                event_type].items():
            if attribute == "expanded_query_array":
                expanded_query_array = value
            elif attribute == "events_query_array":
                events_query_array = value
        compare_expanded_vs_events_df(expanded_query_array, events_query_array,
                                      query_params)
def set_prefixes_to_status(prefixes: List[str], status: str,
                           services: Services, options: STSJobManagerOptions):
    """
    Sets a list of prefixes to a given status in the database.
    """
    logger.info(f'Updating {len(prefixes)} prefixes to `{status}` status')

    table = get_table_identifier(services, options.bigquery_options,
                                 options.bigquery_options.table_name['job'])

    # The BigQuery API does not support parameterizing table names
    # https://cloud.google.com/bigquery/docs/parameterized-queries
    # We can't UPDATE jobs that are currently in a stream, so defer for later
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-manipulation-language#limitations
    query = f"""
    UPDATE `{table}`
    SET status = @status, last_updated = CURRENT_TIMESTAMP()
    WHERE prefix IN UNNEST(@prefixes)
    AND last_updated < TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 90 MINUTE)
    """

    params = [
        bigquery.ScalarQueryParameter("status", "STRING", status),
        bigquery.ArrayQueryParameter("prefixes", "STRING", prefixes)
    ]

    run_query(query, params, services, options).result()
Example 8
def client_query_w_array_params():

    # [START bigquery_query_params_arrays]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    query = """
        SELECT name, sum(number) as count
        FROM `bigquery-public-data.usa_names.usa_1910_2013`
        WHERE gender = @gender
        AND state IN UNNEST(@states)
        GROUP BY name
        ORDER BY count DESC
        LIMIT 10;
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("gender", "STRING", "M"),
            bigquery.ArrayQueryParameter("states", "STRING", ["WA", "WI", "WV", "WY"]),
        ]
    )
    query_job = client.query(query, job_config=job_config)  # Make an API request.

    for row in query_job:
        print("{}: \t{}".format(row.name, row.count))
    def __init__(self, blocklisted_document_types, date_from, date_until):
        self.blocklisted_document_types = blocklisted_document_types
        self.date_from = date_from
        self.date_until = date_until

        logger = logging.getLogger(
            'make_functional_edges_and_weights.EdgeWeightExtractor')
        credentials, project_id = google.auth.default()
        logger.info(f'creating BigQuery client for project {project_id}')
        self.client = bigquery.Client(credentials=credentials,
                                      project=project_id)
        logger.info(
            'reading query from src/data_preprocessing/query_content_id_edge_weights.sql'
        )
        self.query_edge_list = read_file_as_string(
            "src/data_preprocessing/query_content_id_edge_weights.sql")

        self.query_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ScalarQueryParameter("three_weeks_ago", "STRING",
                                          self.date_from),
            bigquery.ScalarQueryParameter("yesterday", "STRING",
                                          self.date_until),
            bigquery.ArrayQueryParameter("excluded_document_types", "STRING",
                                         self.blocklisted_document_types)
        ])

        self.df = self.client.query(
            self.query_edge_list, job_config=self.query_config).to_dataframe()
Example 10
    def _test_query_from_bq_table(self):
        '''skipping this for now, need to create a permanent test table to query from'''

        query = '''
                SELECT stride_treatment_team_id, pat_enc_csn_id, treatment_team
                FROM stride_2008_2017.stride_treatment_team
                WHERE stride_treatment_team_id IN UNNEST(@tt_ids);
                '''

        headers = ['stride_treatment_team_id', 'pat_enc_csn_id', 'treatment_team']
        treatment_team_ids = list(range(2141155, 2141205))
        query_params = [
            bigquery.ArrayQueryParameter('tt_ids', 'INT64', treatment_team_ids)
        ]

        query_job = self.bqClient.queryBQ(str(query), query_params=query_params, location='US', batch_mode=False,
                                          verbose=True)

        actual_data = []
        # csv.writer needs a text-mode file in Python 3
        with open(self.tmp_csv_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in query_job:
                writer.writerow(row.values())
                actual_data.append(row.values())

        # TODO fill expected data
        expected_data = []
        self.assertEqualTable(expected_data, actual_data)
Example 11
def find_files_in_load_history(
        bq_project: str, dataset: str,
        areas: dict[str, set[PathWithCrc]]) -> dict[str, set[PathWithCrc]]:
    client = bigquery.Client(project=bq_project)
    loaded_paths = {}

    for area, paths_with_crc in areas.items():
        logging.debug(f"\tPulling loaded files for area {area}...")
        target_paths = [path_with_crc.path for path_with_crc in paths_with_crc]
        query = f"""
            SELECT target_path, checksum_crc32c
            FROM `datarepo_{dataset}.datarepo_load_history` dlh
            WHERE  state = 'succeeded'
            AND target_path IN UNNEST(@paths)
        """

        job_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ArrayQueryParameter("paths", "STRING", target_paths),
        ])
        query_job = client.query(query, job_config=job_config)
        loaded_paths[area] = {
            PathWithCrc(row["target_path"], row["checksum_crc32c"])
            for row in query_job
        }

    return loaded_paths
Example 12
def get_log_probs(names_list, timeout_seconds=30):
    bigquery_client = bigquery.Client()
    query = """
        WITH test_names AS
        (SELECT * FROM UNNEST(@names) AS test_name CROSS JOIN {})
        SELECT test_name name, test_year year, IFNULL(logp, min_logp - {}) logp
        FROM `{}`
        RIGHT OUTER JOIN test_names
        ON LOWER(name)=LOWER(test_name) AND year=test_year
        ORDER BY test_year ASC, test_name DESC
    """.format(min_logp_table, unknown_penalty, logp_table)
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter('names', 'STRING', names_list)
    ])
    job_config.use_query_cache = True
    query_job = bigquery_client.query(query, job_config=job_config)
    results = query_job.result(timeout=timeout_seconds)
    d = defaultdict(float)
    for row in results:
        d[row["year"]] += row["logp"]

    items = sorted(d.items())
    years = np.array([i[0] for i in items])
    logps = np.array([i[1] for i in items])
    return years, logps
Example 13
def GeneSymbol_standardization_output(Gene_list, project_id):
    Gene_list = list(set(Gene_list))
    client = bigquery.Client(project_id)
    #query='''
    #    SELECT *
    #    FROM  `syntheticlethality.gene_information.gene_info_human`
    #    where Gene in UNNEST(@input_gene_list)
    #    '''

    query = '''
        SELECT *
        FROM  `isb-cgc-bq.synthetic_lethality.gene_info_human_HGNC_NCBI_2020_07`
        where Gene in UNNEST(@input_gene_list)
        '''
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ArrayQueryParameter("input_gene_list", "STRING", Gene_list)
    ])

    id_map = client.query(query, job_config=job_config).result().to_dataframe()
    alias_list = list(id_map['Alias'].values)
    gene_list = list(id_map['Gene'].values)

    dic_alias_to_gene = {}

    for i in range(0, len(alias_list)):
        dic_alias_to_gene[alias_list[i]] = gene_list[i]

    return (dic_alias_to_gene)
def query_array_params(gender, states):
    client = bigquery.Client()
    query = """
        SELECT name, sum(number) as count
        FROM `bigquery-public-data.usa_names.usa_1910_2013`
        WHERE gender = @gender
        AND state IN UNNEST(@states)
        GROUP BY name
        ORDER BY count DESC
        LIMIT 10;
        """
    query_job = client.run_async_query(
        str(uuid.uuid4()),
        query,
        query_parameters=(bigquery.ScalarQueryParameter(
            'gender', 'STRING',
            gender), bigquery.ArrayQueryParameter('states', 'STRING', states)))
    query_job.use_legacy_sql = False

    query_job.begin()
    query_job.result()  # Wait for job to complete

    # Print the results.
    destination_table = query_job.destination
    destination_table.reload()
    for row in destination_table.fetch_data():
        print(row)
Example 15
    def make_bq_array_parameters(self, parameters: list) -> list:
        array_params = [
            # TODO: Improve this method
            bigquery.ArrayQueryParameter(key, get_bq_data_type(value[0]), value)
            for key, value in parameters
        ]
        return array_params
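The get_bq_data_type helper used above is project-specific and not shown here; a minimal sketch of what it plausibly does, mapping the first array element's Python type to a BigQuery standard SQL type name:

# Hypothetical sketch of get_bq_data_type; not the project's actual helper.
def get_bq_data_type(value) -> str:
    if isinstance(value, bool):  # check bool before int (bool is a subclass of int)
        return "BOOL"
    if isinstance(value, int):
        return "INT64"
    if isinstance(value, float):
        return "FLOAT64"
    return "STRING"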
Example 16
def array_to_query_parameter(value, name=None):
    """Convert an array-like value into a query parameter.

    Args:
        value (Sequence[Any]): The elements of the array (should not be a
            string-like Sequence).
        name (Optional[str]): Name of the query parameter.

    Returns:
        A query parameter corresponding with the type and value of the plain
        Python object.

    Raises:
        google.cloud.bigquery.dbapi.exceptions.ProgrammingError:
            if the type of array elements cannot be determined.
    """
    if not array_like(value):
        raise exceptions.ProgrammingError(
            "The value of parameter {} must be a sequence that is "
            "not string-like.".format(name))

    if not value:
        raise exceptions.ProgrammingError(
            "Encountered an empty array-like value of parameter {}, cannot "
            "determine array elements type.".format(name))

    # Assume that all elements are of the same type, and let the backend handle
    # any type incompatibilities among the array elements
    array_type = bigquery_scalar_type(value[0])
    if array_type is None:
        raise exceptions.ProgrammingError(
            "Encountered unexpected first array element of parameter {}, "
            "cannot determine array elements type.".format(name))

    return bigquery.ArrayQueryParameter(name, array_type, value)
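For illustration, the type inference above behaves roughly as follows (the module path is assumed from the dbapi helper tests elsewhere on this page):

from google.cloud.bigquery.dbapi import _helpers

param = _helpers.array_to_query_parameter(["WA", "WI"], name="states")
# -> ArrayQueryParameter("states", "STRING", ["WA", "WI"])

_helpers.array_to_query_parameter([], name="empty")
# -> raises ProgrammingError: array element type cannot be determined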
Example 17
def _build_query_parameters(query: BigQuery, parameters: dict) -> list:
    """ Builds the parameter list for a BigQuery job from a supplied
        list of parameter values.

        Args:
            query (BigQuery): A query with parameter specification.
            parameters (dict): Corresponding dict of parameters and supplied values.

        Returns:
            (list) A list of BigQuery parameters.
    """
    # Build query parameters
    query_params = []
    for spec in query.parameter_spec:
        pname = spec["name"]
        ptype = spec["type"]
        if pname not in parameters:
            raise RuntimeError(
                f"Parameter '{pname}' unspecified in `run_query`")

        if spec["array_type"] is False:
            bqparam = bigquery.ScalarQueryParameter(pname, ptype,
                                                    parameters[pname])
        else:
            if type(parameters[pname]) != list:
                raise RuntimeError(
                    f"Query '{query.name}' expects parameter '{pname}' to be a list"
                )
            bqparam = bigquery.ArrayQueryParameter(pname, ptype,
                                                   parameters[pname])
        query_params.append(bqparam)
    return query_params
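A hypothetical usage of _build_query_parameters; the spec object below is a stand-in for the project's own BigQuery query class, which carries a name and a parameter_spec list:

from types import SimpleNamespace

fake_query = SimpleNamespace(
    name="trips_by_station",
    parameter_spec=[
        {"name": "stations", "type": "INT64", "array_type": True},
        {"name": "min_duration", "type": "FLOAT64", "array_type": False},
    ],
)
params = _build_query_parameters(
    fake_query, {"stations": [10, 11], "min_duration": 300})
# -> [ArrayQueryParameter("stations", ...), ScalarQueryParameter("min_duration", ...)]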
Example 18
def query_array_params(gender, states):
    client = bigquery.Client()
    query = """
        SELECT name, sum(number) as count
        FROM `bigquery-public-data.usa_names.usa_1910_2013`
        WHERE gender = @gender
        AND state IN UNNEST(@states)
        GROUP BY name
        ORDER BY count DESC
        LIMIT 10;
        """
    query_params = [
        bigquery.ScalarQueryParameter('gender', 'STRING', gender),
        bigquery.ArrayQueryParameter('states', 'STRING', states)
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params
    query_job = client.query(query, job_config=job_config)

    query_job.result()  # Wait for job to complete

    # Print the results.
    destination_table_ref = query_job.destination
    table = client.get_table(destination_table_ref)
    for row in client.list_rows(table):
        print(row)
    def __init__(self,
                 query_path,
                 blocklisted_document_types,
                 date_from=None,
                 date_until=None,
                 weight_threshold=None):
        self.logger = logging.getLogger(
            'make_functional_edges_and_weights.EdgeWeightExtractor')
        self.blocklisted_document_types = blocklisted_document_types
        self.weight_threshold = weight_threshold
        self.date_from = date_from
        self.date_until = date_until
        self.query_path = query_path
        self.df = None
        credentials, project_id = google.auth.default()
        self.logger.info(f'creating BigQuery client for project {project_id}')
        # TODO use the big query client util in future
        self.client = bigquery.Client(credentials=credentials,
                                      project=project_id)
        self.logger.info(f'reading query from {query_path}')
        self.query_edge_list = parse_sql_script(self.query_path)

        self.query_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ScalarQueryParameter("from_date", "STRING",
                                          self.date_from),
            bigquery.ScalarQueryParameter("to_date", "STRING",
                                          self.date_until),
            bigquery.ScalarQueryParameter("weight_threshold", "INT64",
                                          self.weight_threshold),
            bigquery.ArrayQueryParameter("excluded_document_types", "STRING",
                                         self.blocklisted_document_types)
        ])
Example 20
def print_query_results(client, ids, min_duration=0, max_duration=84000):
    query = """
        SELECT 
          start_station_id
          , COUNT(*) as num_trips
        FROM 
          `bigquery-public-data`.london_bicycles.cycle_hire
        WHERE 
          start_station_id IN UNNEST(@STATIONS)
          AND duration BETWEEN @MIN_DURATION AND @MAX_DURATION
        GROUP BY start_station_id
    """
    query_params = [
        bigquery.ArrayQueryParameter('STATIONS', "INT64", ids),
        bigquery.ScalarQueryParameter('MIN_DURATION', "FLOAT64", min_duration),
        bigquery.ScalarQueryParameter('MAX_DURATION', "FLOAT64", max_duration),
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params
    query_job = client.query(
        query,
        location="EU",
        job_config=job_config,
    )
    for row in query_job:
        print("{}: \t{}".format(row.start_station_id, row.num_trips))
    print("______________________")
Example 21
def array_to_query_parameter(value, name=None, query_parameter_type=None):
    """Convert an array-like value into a query parameter.

    Args:
        value (Sequence[Any]): The elements of the array (should not be a
            string-like Sequence).
        name (Optional[str]): Name of the query parameter.
        query_parameter_type (Optional[str]): Given type for the parameter.

    Returns:
        A query parameter corresponding with the type and value of the plain
        Python object.

    Raises:
        google.cloud.bigquery.dbapi.exceptions.ProgrammingError:
            if the type of array elements cannot be determined.
    """
    if not array_like(value):
        raise exceptions.ProgrammingError(
            "The value of parameter {} must be a sequence that is "
            "not string-like.".format(name))

    if query_parameter_type or value:
        array_type = _parameter_type(
            name,
            value[0] if value else None,
            query_parameter_type,
            value_doc="array element ",
        )
    else:
        raise exceptions.ProgrammingError(
            "Encountered an empty array-like value of parameter {}, cannot "
            "determine array elements type.".format(name))

    return bigquery.ArrayQueryParameter(name, array_type, value)
Example 22
def test_generate_recovery_users_query_with_params():
    dateOfYesterday = calculate_date_of_yesterday_at_utc()
    beginningOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_START_OF_DAY)
    endOfYesterdayInMilliseconds = convert_date_string_to_millisecond_int(
        dateOfYesterday, HOUR_MARKING_END_OF_DAY)

    expectedRecoveryQuery = ("""
            select distinct(`user_id`)
            from `{full_table_url}`
            where `event_type` in UNNEST(@recoveryStep)
            and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
            and `user_id` in
            (
                select `user_id`
                from `{full_table_url}`
                where `user_id` not in
                (
                    select `user_id`
                    from `{full_table_url}`
                    where `event_type` in UNNEST(@nextStepList)
                    and `time_transaction_occurred` <= @endOfYesterdayInMilliseconds
                )
                and `event_type` = @stepBeforeDropOff
                and `time_transaction_occurred` between @beginningOfYesterdayInMilliseconds and @endOfYesterdayInMilliseconds
            )
        """.format(full_table_url=FULL_TABLE_URL))

    expectedRecoveryParams = [
        bigquery.ArrayQueryParameter("recoveryStep", "STRING", recoveryStep),
        bigquery.ScalarQueryParameter("stepBeforeDropOff", "STRING",
                                      stepBeforeDropOff),
        bigquery.ArrayQueryParameter("nextStepList", "STRING", nextStepList),
        bigquery.ScalarQueryParameter("beginningOfYesterdayInMilliseconds",
                                      "INT64",
                                      beginningOfYesterdayInMilliseconds),
        bigquery.ScalarQueryParameter("endOfYesterdayInMilliseconds", "INT64",
                                      endOfYesterdayInMilliseconds),
    ]

    expectedRecoveryQueryAndParams = {
        "recoveryQuery": expectedRecoveryQuery,
        "recoveryParams": expectedRecoveryParams
    }

    assert generate_recovery_users_query_with_params(
        sampleEvents, sampleDateIntervals) == expectedRecoveryQueryAndParams
def get_weekly_clientlibs_downloads(clientlibs_table_name, date_str):
    """Use a SQL query to collect the weekly download data of the client
    libraries.

    Args:
        clientlibs_table_name (str): Table name, which is the key in the
                                     CLIENTLIBS dict.
        date_str (str): A date string in "YYYYMMDD" format.

    Returns:
         list: rows of the query result.
    """
    client_libs = CLIENTLIBS[clientlibs_table_name]
    date_time = datetime.datetime.strptime(date_str, DATETIME_FORMAT)
    week_dates = [
        (date_time + datetime.timedelta(days=-i)).strftime(DATETIME_FORMAT)
        for i in range(7)
    ]
    query = """
            SELECT
                file.project as client_library_name,
                COUNT(*) as download_count
            FROM
                `the-psf.pypi.downloads*`
            WHERE
                file.project IN UNNEST(@client_libs)
                AND
                _TABLE_SUFFIX IN UNNEST(@week_dates)
            GROUP BY client_library_name
        """
    client = bigquery.Client()
    query_parameters = [
        bigquery.ArrayQueryParameter('client_libs', 'STRING', client_libs),
        bigquery.ArrayQueryParameter('week_dates', 'STRING', week_dates)
    ]
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_parameters
    query_job = client.query(query, job_config=job_config)

    # Wait for the job to complete and get the results
    results = [row.values() for row in query_job.result()]

    rows = [(date_time, ) + row for row in results]

    return rows
Example 24
File: client.py Project: shshe/ibis
def bq_param_array(param, value):
    param_type = param.type()
    assert isinstance(param_type, dt.Array), str(param_type)

    try:
        bigquery_type = _IBIS_TYPE_TO_DTYPE[str(param_type.value_type)]
    except KeyError:
        raise com.UnsupportedBackendType(param_type)
    else:
        return bq.ArrayQueryParameter(param._name, bigquery_type, value)
Example 25
    def bigquery_match(self, bq_client, bq_query, event_field_name, values):
        """Run a BigQuery query for rows with matching event_field_name values.

        Returns:
            BigQuery query job.
        """
        job_config = bigquery.QueryJobConfig(query_parameters=[
            bigquery.ArrayQueryParameter(event_field_name, "STRING", values),
        ])
        return bq_client.query(bq_query, job_config=job_config)
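An illustrative call to bigquery_match; the instance, query text, and values below are placeholders.

# Illustrative only; `matcher` stands for an instance of the class defining bigquery_match.
bq_client = bigquery.Client()
bq_query = """
    SELECT * FROM `my-project.logs.events`   -- hypothetical table
    WHERE event_name IN UNNEST(@event_name)
"""
job = matcher.bigquery_match(bq_client, bq_query, "event_name", ["login", "logout"])
for row in job:
    print(dict(row))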
Example 26
def test_to_query_parameters_list_w_types():
    from google.cloud import bigquery

    assert _helpers.to_query_parameters(
        [1, 1.2, None, []], [None, "numeric", "string", "float64"]) == [
            bigquery.ScalarQueryParameter(None, "INT64", 1),
            bigquery.ScalarQueryParameter(None, "NUMERIC", 1.2),
            bigquery.ScalarQueryParameter(None, "STRING", None),
            bigquery.ArrayQueryParameter(None, "FLOAT64", []),
        ]
Example 27
def get_all_tx_from_address_v1(list_addresses):
    # Get a transaction overview for a list of wallet addresses

    query = """
    WITH all_transactions AS (
    -- inputs
    SELECT 
        transaction_hash
       , block_timestamp as timestamp
       , array_to_string(addresses, ",") as address
       , value
       , 'sent' as type
    FROM `bigquery-public-data.crypto_bitcoin.inputs`
    
    UNION ALL
    
    -- outputs
    SELECT 
        transaction_hash
       , block_timestamp as timestamp
       , array_to_string(addresses, ",") as address
       , value
       , 'received' as type
    FROM `bigquery-public-data.crypto_bitcoin.outputs`
    )
    
    SELECT
       address
       , type
       , sum(value) as sum
       , avg(value) as avg
       , min(value) as min
       , max(value) as max
       , count(transaction_hash) as number_transactions
       , min(timestamp) as first_transaction
       , max(timestamp) as last_transaction
    FROM all_transactions
    WHERE address in UNNEST(@address)
    GROUP BY type, address
    """

    query_params = [
        bigquery.ArrayQueryParameter("address", "STRING", list_addresses),
    ]

    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = query_params
    query_job = client.query(
        query,
        job_config=job_config,
    )
    result = query_job.result()
    wallet_info = result.to_dataframe()

    return wallet_info
    def get_compatibility_combinations(self,
                                       packages: List[package.Package]) -> \
            Mapping[FrozenSet[package.Package], List[CompatibilityResult]]:
        """Returns a mapping between package pairs and CompatibilityResults.

        Args:
            packages: The packages to check compatibility for.

        Returns:
            A mapping between every combination of input packages and their
            CompatibilityResults. For example:
            get_compatibility_combinations(packages = [p1, p2, p3]) =>
            {
               frozenset([p1, p2]): [CompatibilityResult...],
               frozenset([p1, p3]): [CompatibilityResult...],
               frozenset([p2, p3]): [CompatibilityResult...],
            }.
        """
        install_name_to_package = {p.install_name: p for p in packages}

        packages_to_results = {}
        for p1, p2 in itertools.combinations(packages, r=2):
            packages_to_results[frozenset([p1, p2])] = []

        query_params = [
            bigquery.ArrayQueryParameter('install_names', 'STRING',
                                         [p.install_name for p in packages]),
        ]
        job_config = bigquery.QueryJobConfig()
        job_config.query_parameters = query_params

        query = ('SELECT * '
                 'FROM {} s1 '
                 'WHERE s1.install_name_lower IN UNNEST(@install_names) '
                 '  AND s1.install_name_higher IN UNNEST(@install_names) '
                 '  AND timestamp = ( '
                 '     SELECT MAX(timestamp) '
                 '     FROM {} s2 '
                 '     WHERE s1.install_name_lower = s2.install_name_lower '
                 '       AND s1.install_name_higher = s2.install_name_higher '
                 '       AND s1.py_version = s2.py_version)'.format(
                     self._pairwise_table_id, self._pairwise_table_id))

        query_job = self._client.query(query, job_config=job_config)

        for row in query_job:
            p_lower = install_name_to_package[row.install_name_lower]
            p_higher = install_name_to_package[row.install_name_higher]
            packages_to_results[frozenset([p_lower, p_higher])].append(
                self._row_to_compatibility_status([p_lower, p_higher], row))
        return {
            p: self._filter_older_versions(crs)
            for (p, crs) in packages_to_results.items()
        }
Example 29
def get_latest_timestamps_from_bq(bq_client, rally_items):
    query = f'''
            SELECT rally_id, max(timestamp) AS timestamp FROM {SCHEDULE_EVENTS_TABLE}
            WHERE rally_id IN UNNEST(@CANDIDATES)
            GROUP BY rally_id
        '''
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = [
        bigquery.ArrayQueryParameter('CANDIDATES', 'STRING',
                                     [x[0] for x in rally_items])
    ]
    return dict([(x.rally_id, x.timestamp)
                 for x in bq_client.query(query, job_config=job_config)])
Example 30
def test_to_query_parameters_dict_w_types():
    from google.cloud import bigquery

    assert sorted(
        _helpers.to_query_parameters(
            dict(i=1, x=1.2, y=None, z=[]),
            dict(x="numeric", y="string", z="float64")),
        key=lambda p: p.name,
    ) == [
        bigquery.ScalarQueryParameter("i", "INT64", 1),
        bigquery.ScalarQueryParameter("x", "NUMERIC", 1.2),
        bigquery.ScalarQueryParameter("y", "STRING", None),
        bigquery.ArrayQueryParameter("z", "FLOAT64", []),
    ]