def _make_bqstorage_client(use_bqstorage_api, credentials, client_options):
    if not use_bqstorage_api:
        return None

    try:
        from google.cloud import bigquery_storage
    except ImportError as err:
        customized_error = ImportError(
            "The default BigQuery Storage API client cannot be used, install "
            "the missing google-cloud-bigquery-storage and pyarrow packages "
            "to use it. Alternatively, use the classic REST API by specifying "
            "the --use_rest_api magic option."
        )
        six.raise_from(customized_error, err)

    try:
        from google.api_core.gapic_v1 import client_info as gapic_client_info
    except ImportError as err:
        customized_error = ImportError(
            "Install the grpcio package to use the BigQuery Storage API."
        )
        six.raise_from(customized_error, err)

    return bigquery_storage.BigQueryReadClient(
        credentials=credentials,
        client_info=gapic_client_info.ClientInfo(user_agent=IPYTHON_USER_AGENT),
        client_options=client_options,
    )
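# Hypothetical call site for the helper above (a sketch, not part of the
# original module; assumes application-default credentials are available and
# that the module defines six and IPYTHON_USER_AGENT, as the helper expects):
import google.auth

_credentials, _ = google.auth.default()
_bqstorage_client = _make_bqstorage_client(
    use_bqstorage_api=True, credentials=_credentials, client_options=None)
# _bqstorage_client is None when use_bqstorage_api is False; a descriptive
# ImportError is raised when an optional dependency is missing.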
def compute_bal_for_gas(start_block_timestamp, end_block_timestamp,
                        gas_whitelist, plot=True, verbose=True):
    sql = ''
    with open('src/bal4gas_V1.sql', 'r') as file:
        sql = file.read().format(start_block_timestamp, end_block_timestamp,
                                 '\',\''.join(gas_whitelist))
    if verbose:
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
              ' - Querying BigQuery for eligible V1 swaps and reimbursement values ...')
    client = bigquery.Client()
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    reimbursements = (client.query(sql).result()
                      .to_dataframe(bqstorage_client=bqstorageclient))
    if verbose:
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' - Done!')
    if plot:
        reimbursements.groupby('datetime').mean()['block_median_gas_price'].plot(
            title='Median gas price')
        plt.show()
    if verbose:
        print(f'ETH reimbursements for the week (V1): '
              f'{sum(reimbursements.eth_reimbursement)}')

    # get BAL:ETH price feed from Coingecko
    bal_eth_coingecko = (
        'https://api.coingecko.com/api/v3/coins/ethereum/contract/'
        '0xba100000625a3754423978a60c9317c58a424e3d/market_chart/range'
        '?vs_currency=eth&from={0}&to={1}'
    ).format(start_block_timestamp - 7200, end_block_timestamp + 7200)
    baleth_feed = pd.read_json(bal_eth_coingecko)['prices']
    baleth_feed = pd.DataFrame(baleth_feed.tolist(), index=baleth_feed.index,
                               columns=['timestamp', 'price'])
    baleth_feed['datetime'] = pd.to_datetime(baleth_feed['timestamp'] / 1000,
                                             unit='s', utc=True)
    if plot:
        baleth_feed.plot(x='datetime', y='price', title='BAL:ETH')
        plt.show()
    merge = pd.merge_asof(reimbursements.sort_values(by='datetime'),
                          baleth_feed.sort_values(by='datetime'),
                          on='datetime', direction='nearest')
    merge['bal_reimbursement'] = merge['eth_reimbursement'] / merge['price']
    if verbose:
        print(f'BAL reimbursements for the week (V1): '
              f'{sum(merge.bal_reimbursement)}')
    merge['address'] = merge['address'].apply(Web3.toChecksumAddress)
    return merge
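# A sketch of how compute_bal_for_gas might be driven (hypothetical values;
# the epoch bounds and whitelist entry are illustrative, not from the original):
week_start = 1622678400  # 2021-06-03 00:00:00 UTC
week_end = week_start + 7 * 24 * 3600
gas_tokens = ['0x6b175474e89094c44da98b954eedeac495271d0f']  # e.g. DAI
reimbursements_df = compute_bal_for_gas(week_start, week_end, gas_tokens,
                                        plot=False, verbose=True)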
def fetch_BQ(self):
    credentials = service_account.Credentials.from_service_account_file(
        '../API/BQ_api.json')
    bqclient = bigquery.Client(credentials=credentials,
                               project=credentials.project_id)
    bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)
    # Backticks around the table path because the project id contains a hyphen.
    query_string = f"""
        SELECT Date, Traffic_source, Data_Source_type, Cost, Clicks, Impressions
        FROM `funnel-248216.marketing_spend.all_funnel_data_view`
        WHERE Date >= DATE({self.start_date.year}, {self.start_date.month}, {self.start_date.day})
          AND Date <= DATE({self.end_date.year}, {self.end_date.month}, {self.end_date.day})
          AND (Campaign_name__TikTok NOT LIKE '%no%' OR Campaign_name__TikTok IS NULL)
    """
    self.funnel_df = bqclient.query(query_string).result().to_dataframe(
        bqstorage_client=bqstorageclient)
    print('Read', len(self.funnel_df), 'datapoints from BigQuery Funnel')
def get_bq_storage_client():
    """Build a BigQuery Storage Python client.

    Returns
    -------
    bigquery_storage.BigQueryReadClient instance
    """
    credentials, project_id = google.auth.default(scopes=SCOPES)
    bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)
    return bqstorageclient
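# Example use of the factory above (a sketch; assumes SCOPES is defined in the
# module and that a bigquery.Client named bq_client exists elsewhere):
bqs = get_bq_storage_client()
# Passing the storage client to to_dataframe() makes result downloads use the
# faster BigQuery Storage read API:
# df = bq_client.query(sql).result().to_dataframe(bqstorage_client=bqs)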
def get_reddit_comments_table():
    """Get data from the BigQuery Reddit comments dataset."""
    # https://www.reddit.com/r/bigquery/comments/3cej2b/17_billion_reddit_comments_loaded_on_bigquery/
    client = bigquery.Client()
    storage_client = bigquery_storage.BigQueryReadClient()
    # 2005 & 2006 have no samples
    table_names = [
        *list(map(str, range(2007, 2015))),
        *[
            f"{year}_{month:02d}"
            for year in range(2015, 2020)
            for month in range(1, 13)
        ],
    ]
    for table_name in tqdm(table_names):
        # tqdm.write(f"{table_name}")
        df = (client.query(QUERY_STRING.format(table_name=table_name))
              .result()
              .to_dataframe(bqstorage_client=storage_client))
        df.to_parquet(DATA_PATH / f"{table_name}.parquet")
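# The function above relies on module-level QUERY_STRING and DATA_PATH; a
# minimal sketch of plausible definitions (illustrative, not the originals):
from pathlib import Path

DATA_PATH = Path("data/reddit_comments")
QUERY_STRING = """
    SELECT body, subreddit, score
    FROM `fh-bigquery.reddit_comments.{table_name}`
"""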
def __init__(self):
    try:
        self.bigquery = bigquery
        self.params = get_db_config(section='gcp')
        credentials = service_account.Credentials.from_service_account_file(
            self.params['credentials_file_path'],
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
        self.client = bigquery.Client(
            credentials=credentials,
            project=credentials.project_id,
        )
        self.storage = bigquery_storage.BigQueryReadClient(
            credentials=credentials)
    except Exception as error:
        raise error
def get_survey_responses(surveyid, client=None):
    """Get data from survey."""
    google.cloud.bigquery.magics.context.use_bqstorage_api = True
    project_id = os.environ.get('PROJECT_ID')
    table_id = os.environ.get('TABLE_ID')
    if client is None:
        client = bigquery.Client(project=project_id)
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    query = f"""
        SELECT CreatedAt, Segmentation, Response
        FROM `{table_id}`
        WHERE ID = @survey_id
    """
    job_config = bigquery.QueryJobConfig(query_parameters=[
        bigquery.ScalarQueryParameter('survey_id', 'STRING', surveyid),
    ])
    query_job = client.query(query, job_config=job_config)
    df = query_job.result().to_dataframe(bqstorage_client=bqstorageclient)
    return df
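# A sketch of a caller (the survey id is illustrative; PROJECT_ID and TABLE_ID
# must be set in the environment):
responses = get_survey_responses('survey-2021-q1')
print(responses.groupby('Segmentation').size())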
def main():
    args = parse_args()
    bq_client = bigquery.Client()
    bqs_client = bigquery_storage.BigQueryReadClient()
    TableInfo.client = bq_client
    dataset = get_dataset(args.project_id, args.dataset_id)
    table_refs = get_table_refs(bq_client, bqs_client, dataset)
    tables_info = get_tables_info(table_refs)
    tables_info = filter_latest_tables_info(tables_info)
    for info in tables_info:
        info.create_dir()
        with info.path.open('w') as f:
            print(f'write {info.clear_name}.view.lkml')
            write_look_ml(f, info)
def test_constructor_w_client_info():
    from google.cloud import bigquery_storage

    class MyTransport:
        def __init__(self, *args, **kwargs):
            self.args = args
            self.kwargs = kwargs

    transport_class_patcher = mock.patch.object(
        bigquery_storage.BigQueryReadClient,
        "get_transport_class",
        return_value=MyTransport,
    )
    with transport_class_patcher:
        client_under_test = bigquery_storage.BigQueryReadClient(
            client_info=client_info.ClientInfo(
                client_library_version="test-client-version"),
        )
    transport_client_info = client_under_test._transport.kwargs["client_info"]
    user_agent = transport_client_info.to_user_agent()
    assert "test-client-version" in user_agent
def update_blocks(db_engine, config: Config) -> None:
    latest_timestamp = get_latest_timestamp(db_engine, config)
    print(f'Latest Timestamp: {latest_timestamp}')
    print('Connecting to Google BigQuery...')
    # This needs the env variable GOOGLE_APPLICATION_CREDENTIALS filled
    # with the path to your credentials file
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"])
    bqclient = bigquery.Client(
        credentials=credentials,
        project=your_project_id,
    )
    bqstorageclient = bigquery_storage.BigQueryReadClient(
        credentials=credentials)
    print('Successful')
    query_string = f"""
        SELECT timestamp, number
        FROM `bigquery-public-data.crypto_ethereum.blocks`
        WHERE timestamp > '{latest_timestamp}'
        ;
    """
    print("Querying Block Information")
    query_result = (bqclient.query(query_string).result()
                    .to_dataframe(bqstorage_client=bqstorageclient))
    print(f"Successful, {len(query_result)} new Blocks found!")
    print("Writing Result to Database")
    query_result.to_sql(config.BLOCKS_TABLE, db_engine,
                        if_exists='append', index=False)
    print("Update completed successfully!")
def load_data_from_bq(bq_uri: str) -> pd.DataFrame:
    '''
    Loads data from a BigQuery (BQ) table into a dataframe.

    Parameters:
        bq_uri (str): BQ table uri, e.g. bq://example_project.example_dataset.example_table

    Returns:
        pandas.DataFrame: a dataframe with the data loaded from GCP
    '''
    if not bq_uri.startswith('bq://'):
        raise Exception(
            "uri is not a BQ uri. It should be bq://project_id.dataset.table")
    logging.info("reading bq data: {}".format(bq_uri))
    project, dataset, table = bq_uri.split(".")
    bqclient = bigquery.Client(project=project[5:])  # strip the bq:// prefix
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    query_string = """
    SELECT * FROM {ds}.{tbl}
    """.format(ds=dataset, tbl=table)
    return (bqclient.query(query_string).result()
            .to_dataframe(bqstorage_client=bqstorageclient))
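# Example invocation (a sketch; the uri is illustrative):
df = load_data_from_bq('bq://example_project.example_dataset.example_table')
logging.info("loaded %d rows", len(df))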
def clients():
    # [START bigquerystorage_pandas_tutorial_all]
    # [START bigquerystorage_pandas_tutorial_create_client]
    import google.auth
    from google.cloud import bigquery
    from google.cloud import bigquery_storage

    # Explicitly create a credentials object. This allows you to use the same
    # credentials for both the BigQuery and BigQuery Storage clients, avoiding
    # unnecessary API calls to fetch duplicate authentication tokens.
    credentials, your_project_id = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"])

    # Make clients.
    bqclient = bigquery.Client(
        credentials=credentials,
        project=your_project_id,
    )
    bqstorageclient = bigquery_storage.BigQueryReadClient(
        credentials=credentials)
    # [END bigquerystorage_pandas_tutorial_create_client]
    # [END bigquerystorage_pandas_tutorial_all]
    return bqclient, bqstorageclient
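# How the two clients are typically combined (a sketch continuing the sample
# above; the public-dataset query is illustrative):
bqclient, bqstorageclient = clients()
query_string = """
    SELECT name, SUM(number) AS total
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    GROUP BY name
    ORDER BY total DESC
    LIMIT 10
"""
dataframe = (bqclient.query(query_string).result()
             .to_dataframe(bqstorage_client=bqstorageclient))
print(dataframe)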
def query_gbq(_network, _week_number, _pool_list, _excluded_lps_list=[]):
    LOGGER.debug('query_gbq')
    _excluded_lps_list = list(set(_excluded_lps_list + BASE_LP_EXCLUSION_LIST))
    with open(SQL_FILE_PATH, 'r') as file:
        sql = file.read()
    _days_in_week = '3'
    sql = sql.format(
        week_number=_week_number,
        pool_addresses="','".join(_pool_list),
        blocks_table=TABLES_CONFIGS[_network]['blocks'],
        lm_transfers_table=TABLES_CONFIGS[_network]['lm_transfers'],
        lm_state_table=TABLES_CONFIGS[_network]['lm_state'],
        excluded_lps="','".join(_excluded_lps_list),
        days_in_week=_days_in_week)
    client = bigquery.Client()
    bqstorageclient = bigquery_storage.BigQueryReadClient()
    df = (client.query(sql).result()
          .to_dataframe(bqstorage_client=bqstorageclient))
    df = df.groupby(['pool_address', 'lp_address', 'block_timestamp']).sum()
    return df
def client_under_test(mock_transport):
    from google.cloud import bigquery_storage

    return bigquery_storage.BigQueryReadClient(transport=mock_transport)
def bqstorage_client(bigquery_client):
    from google.cloud import bigquery_storage

    return bigquery_storage.BigQueryReadClient(
        credentials=bigquery_client._credentials)
import os

import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

# Use a raw string so the backslashes in the Windows path are not treated as
# escape sequences.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r'D:\medium\example-apis\key\key_bqsa.json'

# Create credentials object for both the BigQuery and BigQuery Storage clients
credentials, project_id = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"])

# Init clients.
bqclient = bigquery.Client(
    credentials=credentials,
    project=project_id,
)
bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)

# Write a query.
query_string = """
    SELECT title, SUM(views) AS views
    FROM `bigquery-public-data.wikipedia.pageviews_2020`
    WHERE DATE(datehour) = '2020-12-25'
      AND wiki = 'es'
      AND title NOT IN ('Wikipedia:Portada', 'Especial:Buscar')
    GROUP BY title
    ORDER BY views DESC
    LIMIT 20
"""

# Get dataframe.
dataframe = (bqclient.query(query_string).result()
             .to_dataframe(bqstorage_client=bqstorageclient))
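# Quick inspection of the result (a sketch):
print(dataframe.head(10))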
def download(limit: int = 1000, lead=0, within: Polygon = None,
             min_samples: int = 4, mmsi: list = None, min_knots: int = None,
             project_id: str = "master-thesis-305112", credentials=None,
             shuffle: bool = False, crs="epsg:3857") -> gpd.GeoDataFrame:
    """Creates a query job in BigQuery and downloads the result into a GeoPandas dataframe.

    Keyword Arguments:
        limit -- number of results to include. None returns all results.
        within -- coordinate filter; only points within this polygon are included. None returns all results.
        mmsi -- list-like containing mmsi values to include in the result. None returns all.
        credentials -- google cloud credentials object. None uses google.auth.default.
        project_id -- google cloud project id to use for billing.
    """
    if credentials is None:
        credentials, _ = google.auth.default(
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )

    # Make clients.
    bq = bigquery.Client(credentials=credentials, project=project_id)
    bqstorage = bigquery_storage.BigQueryReadClient(credentials=credentials)

    lead += 1
    leads = [f"LEAD(sample, {l}) OVER w AS sample_{l}" for l in range(lead)]
    query = f"""
    WITH with_lead AS (
        SELECT mmsi, {", ".join(leads)}
        FROM `master-thesis-305112.ais.samples`
        WINDOW w AS (PARTITION BY mmsi ORDER BY sample.timestamp)
    )
    """
    samples = ", ".join([f"sample_{l}.*" for l in range(lead)])

    # Select samples
    query += f"SELECT mmsi, {samples} FROM with_lead "

    # Filter out bad samples
    query += "WHERE TRUE"
    for l in range(lead):
        query += f" AND sample_{l}.timestamp IS NOT NULL "
        if min_knots is not None:
            query += f" AND sample_{l}.sog >= {min_knots} "
        if 0 < l < min_samples:
            query += (f"AND TIMESTAMP_DIFF(sample_{l}.timestamp, "
                      f"sample_{l-1}.timestamp, MINUTE) < 15 ")

    within = (f"AND ST_WITHIN(sample_0.position, ST_GEOGFROMTEXT('{str(within)}'))"
              if within is not None else "")
    mmsi = "'" + "','".join(mmsi) + "'" if mmsi is not None else None
    mmsi = f"AND CAST(mmsi AS STRING) IN ({mmsi})" if mmsi is not None else ""

    # Additional filters
    query += f"""
    {within}
    {mmsi}
    """

    # NOTE: the window used by the LEAD() calls is declared inside the WITH
    # clause above; an outer WINDOW clause referencing sample.timestamp would
    # fail because that column does not exist in with_lead's output.
    if shuffle:
        query += "ORDER BY RAND()"
    else:
        query += "ORDER BY mmsi, sample_0.timestamp "
    if limit is not None:
        query += "LIMIT " + str(limit)

    df = bq.query(query).result().to_dataframe(bqstorage_client=bqstorage)

    # Convert timestamps and positions to correct dtypes
    df.position = gpd.GeoSeries.from_wkt(df.position, crs="wgs84").to_crs(crs)
    df.timestamp = pd.to_datetime(df.timestamp)
    for l in range(1, lead):
        df[f"position_{l}"] = gpd.GeoSeries.from_wkt(df[f"position_{l}"],
                                                     crs="wgs84").to_crs(crs)
        df[f"timestamp_{l}"] = pd.to_datetime(df[f"timestamp_{l}"])
    df = gpd.GeoDataFrame(df, geometry="position")
    return df
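# Example call (a sketch; the bounding polygon and parameters are illustrative):
from shapely.geometry import Polygon

oslofjord = Polygon([(10.2, 59.0), (10.9, 59.0), (10.9, 59.6), (10.2, 59.6)])
tracks = download(limit=5000, lead=2, within=oslofjord, min_knots=2)
print(tracks.head())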
import os

import numpy as np
import pandas as pd
from google.cloud import bigquery
from google.cloud import bigquery_storage
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

_PROJECT_ID = os.getenv('GOOGLE_CLOUD_PROJECT')
_DATASET_ID_EQUITY = 'daily_market_data_equity'
_TABLE_ID_DAILY = 'daily'
_FULL_TABLE_ID = '{p}.{d}.{t}'.format(p=_PROJECT_ID, d=_DATASET_ID_EQUITY,
                                      t=_TABLE_ID_DAILY)
_WRITE_QUEUE_SIZE_THRESHOLD = 4000
_POLYGON_API_KEY = os.environ['API_KEY_POLYGON']
_FINNHUB_API_KEY = os.environ['API_KEY_FINNHUB']

_bigquery_client = bigquery.Client(project=os.getenv('GOOGLE_CLOUD_PROJECT'))
_bqstorage_client = bigquery_storage.BigQueryReadClient()

from polygon import RESTClient

_polygon_client = RESTClient(_POLYGON_API_KEY)

_QUERY = """
SELECT *
FROM `trading-290017.daily_market_data_equity.daily_snp500`
WHERE TRUE
  AND date >= DATE_SUB(CURRENT_DATE(), INTERVAL 100 DAY)
ORDER BY date ASC, symbol
"""

_QUERY_SIMFIN = """
SELECT date, ticker as symbol, close
FROM `trading-290017.daily_market_data_equity.daily_simfin`
def client(credentials):
    return bigquery_storage.BigQueryReadClient(credentials=credentials)