def setUpModule():
    Config.CLIENT = bigquery.Client()
from absl import app, flags  # absl instead of argparse for flag parsing
from google.cloud import bigquery
from peloton_client import peloton_client

FLAGS = flags.FLAGS
flags.DEFINE_string('peloton_username', None, 'Your Peloton username.')
flags.DEFINE_string('peloton_password', None, 'Your Peloton password.')
flags.DEFINE_string('table_id', None, 'The BigQuery table name to store data.')
flags.mark_flag_as_required('peloton_username')
flags.mark_flag_as_required('peloton_password')
flags.mark_flag_as_required('table_id')

DATASET = 'peloton'
BQ_CLIENT = bigquery.Client()  # auth via application default credentials
JOB_CONFIG = bigquery.LoadJobConfig(write_disposition="WRITE_APPEND")


def extract_data(input_data):
    output_dict = {}
    for x in input_data:
        output_dict[x.get('slug')] = x.get('value')
    return output_dict


def get_workout_data(workouts):
    output_workouts = []
    for workout in workouts:
        workout_metrics = PT_CLIENT.fetch_workout_metrics(workout.get('id'))
        workout_core_stats = extract_data(workout_metrics.get('summaries'))
import os, sys
from datetime import datetime

from matplotlib.patches import Patch
from matplotlib.lines import Line2D

from google.cloud import bigquery

import bigquery_functions as bqf
import graph_functions as grf

# Global Directories
from global_config import config

data_dir = config.get_property('data_dir')
analysis_dir = config.get_property('analysis_dir')

# Starts the BigQuery Client
client = bigquery.Client(location="US")

# Constants
# -------------
date_format = "%Y-%m-%d"
eps = 0.001
total_places = 10
percentage = 0.1  # Percentage of top places
round_number = 4  # 15m
days_go_back = 14  # Two weeks
div = 1200  # in meters for the Soft Plus
MAX_RESOLUTION = 5
MAX_DISTANCE = 3
MAX_TIME = 2
def setUpClass(cls):
    cls._bq_cli = bq.Client()
import datetime

from google.cloud import bigquery
from google.oauth2 import service_account

import get_data_verifly as gdv
import metrics
import gen_data
from verifly_metrics import merge_data

'''
The main script function, called daily via a cron job on a Google VM.
It runs the extract and transform scripts, joins the result with the most
recent version from GBQ, and re-uploads the result.
'''

# construct the client and log in to the Google API
credentials = service_account.Credentials.from_service_account_file(
    '/home/nick/adjust/keys/vfly/verifly-237116-82a9642924f2.json')
project_id = 'verifly-237116'
client = bigquery.Client(credentials=credentials, project=project_id)

# create two date strings: one for yesterday, one for 30 days before that
yesterday = datetime.date.today() - datetime.timedelta(days=1)
last30 = yesterday - datetime.timedelta(days=30)
yesterday_str = str(yesterday)
last30_str = str(last30)

# naming tables
dataset_id = 'adjust_deliverables'
csv_name = 'vfly_deliverables_metrics.csv'  # + today
table_name_bigquery = "vfly_deliverables_metrics"
local_path = "/home/nick/adjust/data/verifly/deliverables/" + csv_name
print("Local path: " + local_path)

# download old data as a dataframe from Google BigQuery
def start_bundle(self):
    if not self.bigquery_client:
        self.bigquery_client = bigquery.Client()
    self.dataset_location = self.get_dataset_location()
    self.load_jobs = {}
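# Context note (an assumption, not from the original snippet): a Beam DoFn is
# pickled and shipped to workers, and a google-cloud Client generally cannot be
# pickled, which is why start_bundle above creates it lazily. A minimal sketch
# of the constructor such a DoFn typically pairs with; the class name is
# hypothetical.
import apache_beam as beam


class LoadToBigQueryFn(beam.DoFn):  # hypothetical enclosing DoFn
    def __init__(self):
        self.bigquery_client = None  # created per bundle in start_bundle()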
import os

from google.cloud import bigquery
from google.cloud import firestore
from google.cloud import pubsub_v1
from google.cloud import storage
import pytz

PROJECT_ID = os.getenv('GCP_PROJECT')
BQ_DATASET = 'mydataset'
BQ_TABLE = 'mytable'
ERROR_TOPIC = 'projects/%s/topics/%s' % (PROJECT_ID, 'streaming_error_topic')
SUCCESS_TOPIC = 'projects/%s/topics/%s' % (PROJECT_ID, 'streaming_success_topic')
DB = firestore.Client()
CS = storage.Client()
PS = pubsub_v1.PublisherClient()
BQ = bigquery.Client()


def streaming(data, context):
    '''This function is executed whenever a file is added to Cloud Storage'''
    bucket_name = data['bucket']
    file_name = data['name']
    db_ref = DB.document(u'streaming_files/%s' % file_name)
    if _was_already_ingested(db_ref):
        _handle_duplication(db_ref)
    else:
        try:
            _insert_into_bigquery(bucket_name, file_name)
            _handle_success(db_ref)
        except Exception:
            _handle_error(db_ref)
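# The helpers _was_already_ingested, _handle_*, and _insert_into_bigquery are
# referenced above but not shown. A minimal sketch of what _insert_into_bigquery
# could look like, assuming the uploaded files are newline-delimited JSON and
# reusing the module-level BQ, PROJECT_ID, BQ_DATASET, and BQ_TABLE constants;
# this is an illustration, not the original implementation.
def _insert_into_bigquery(bucket_name, file_name):
    uri = 'gs://%s/%s' % (bucket_name, file_name)
    table_id = '%s.%s.%s' % (PROJECT_ID, BQ_DATASET, BQ_TABLE)
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
        autodetect=True,
    )
    load_job = BQ.load_table_from_uri(uri, table_id, job_config=job_config)
    load_job.result()  # raises on failure so the caller can _handle_error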
def load_new_data(dataset_id, table_name, next_load_date, end_load_date):
    '''
    Queries different snippet related GA properties and loads results to a
    permanent table in bigquery

    :param dataset_id: Name of dataset to be loaded into
    :param table_name: Name of table to be loaded into
    :param next_load_date: Earliest date to be loaded into table_name
    :param end_load_date: Latest date to be loaded into table_name
    :return:
    '''
    while next_load_date < end_load_date:
        # Set dates required for loading new data
        next_load_date = datetime.strftime(next_load_date, '%Y%m%d')
        logging.info(f'Starting load for next load date: {next_load_date}')
        client = bigquery.Client(project='ga-mozilla-org-prod-001')
        load_dataset_id = dataset_id
        load_table_name = table_name
        load_table_suffix = next_load_date
        load_table_id = f'{load_table_name.lower()}_{load_table_suffix}'

        # Configure load job
        dataset_ref = client.dataset(load_dataset_id)
        table_ref = dataset_ref.table(load_table_id)
        load_job_config = bigquery.QueryJobConfig()  # load job call

        # Define schema
        load_job_config.schema = [
            bigquery.SchemaField('date', 'DATE'),
            bigquery.SchemaField('country', 'STRING'),
            bigquery.SchemaField('source', 'STRING'),
            bigquery.SchemaField('medium', 'STRING'),
            bigquery.SchemaField('campaign', 'STRING'),
            bigquery.SchemaField('content', 'STRING'),
            bigquery.SchemaField('sessions', 'INTEGER'),
            bigquery.SchemaField('nonFXSessions', 'INTEGER'),
            bigquery.SchemaField('downloads', 'INTEGER'),
            bigquery.SchemaField('nonFXDownloads', 'INTEGER'),
            bigquery.SchemaField('countryCleaned', 'STRING')
        ]
        load_job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field='date',
        )
        # Options are WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
        load_job_config.write_disposition = 'WRITE_APPEND'
        load_job_config.destination = table_ref

        sql = f"""
        WITH parameters as(
            SELECT '{load_table_suffix}' as startDate, '{load_table_suffix}' as endDate
        ),

        sessionsTable as (
            SELECT
                PARSE_DATE('%Y%m%d', date) as date,
                CASE WHEN country IS NULL THEN '' ELSE country END as country,
                CASE WHEN source IS NULL THEN '' ELSE source END as source,
                CASE WHEN medium IS NULL THEN '' ELSE medium END as medium,
                CASE WHEN campaign IS NULL THEN '' ELSE campaign END as campaign,
                CASE WHEN content IS NULL THEN '' ELSE content END as content,
                SUM(sessions) as sessions,
                SUM(CASE WHEN browser != 'Firefox' THEN sessions ELSE 0 END) as nonFXSessions
            FROM(
                SELECT
                    date, country, source, medium, campaign, content, browser,
                    COUNT(DISTINCT visitIdentifier) as sessions
                FROM(
                    SELECT
                        date AS date,
                        CONCAT(CAST(fullVisitorId AS string), CAST(visitId AS string)) as visitIdentifier,
                        geoNetwork.country as country,
                        trafficSource.source as source,
                        trafficSource.medium as medium,
                        trafficSource.campaign as campaign,
                        trafficSource.adcontent as content,
                        device.browser as browser
                    FROM `ga-mozilla-org-prod-001.65789850.ga_sessions_*`, UNNEST (hits) AS hits
                    WHERE _TABLE_SUFFIX >= (SELECT parameters.startDate from parameters)
                        AND _TABLE_SUFFIX <= (SELECT parameters.startDate from parameters)
                        AND totals.visits = 1
                    GROUP BY date, visitIdentifier, country, source, medium, campaign, content, browser)
                GROUP BY date, country, source, medium, campaign, content, browser)
            GROUP BY date, country, source, medium, campaign, content),

        downloadsTable as(
            SELECT
                PARSE_DATE('%Y%m%d', date) as date,
                CASE WHEN country IS NULL THEN '' ELSE country END as country,
                CASE WHEN source IS NULL THEN '' ELSE source END as source,
                CASE WHEN medium IS NULL THEN '' ELSE medium END as medium,
                CASE WHEN campaign IS NULL THEN '' ELSE campaign END as campaign,
                CASE WHEN content IS NULL THEN '' ELSE content END as content,
                SUM(IF(downloads > 0,1,0)) as downloads,
                SUM(IF(downloads > 0 AND browser != 'Firefox',1,0)) as nonFXDownloads
            FROM (
                SELECT
                    date AS date,
                    fullVisitorId as visitorId,
                    visitNumber as visitNumber,
                    geoNetwork.country as country,
                    trafficSource.source as source,
                    trafficSource.medium as medium,
                    trafficSource.campaign as campaign,
                    trafficSource.adcontent as content,
                    device.browser as browser,
                    SUM(IF (hits.eventInfo.eventAction = "Firefox Download",1,0)) as downloads
                FROM `ga-mozilla-org-prod-001.65789850.ga_sessions_*`, UNNEST (hits) AS hits
                WHERE _TABLE_SUFFIX >= (SELECT parameters.startDate from parameters)
                    AND _TABLE_SUFFIX <= (SELECT parameters.startDate from parameters)
                    AND hits.type = 'EVENT'
                    AND hits.eventInfo.eventCategory IS NOT NULL
                    AND hits.eventInfo.eventLabel LIKE "Firefox for Desktop%"
                GROUP BY date, visitorID, visitNumber, country, source, medium, campaign, content, browser)
            GROUP BY date, country, source, medium, campaign, content
        ),

        siteData as (
            SELECT
                sessionsTable.date,
                sessionsTable.country,
                sessionsTable.source,
                sessionsTable.medium,
                sessionsTable.campaign,
                sessionsTable.content,
                SUM(sessionsTable.sessions) as sessions,
                SUM(sessionsTable.nonFXSessions) as nonFXSessions,
                SUM(downloadsTable.downloads) as downloads,
                SUM(downloadsTable.nonFXDownloads) as nonFXDownloads
            FROM sessionsTable
            LEFT JOIN downloadsTable
                ON sessionsTable.date = downloadsTable.date
                AND sessionsTable.country = downloadsTable.country
                AND sessionsTable.source = downloadsTable.source
                AND sessionsTable.medium = downloadsTable.medium
                AND sessionsTable.campaign = downloadsTable.campaign
                AND sessionsTable.content = downloadsTable.content
            GROUP BY date, country, source, medium, campaign, content
        ),

        cleanedSiteData as (
            SELECT
                siteData.*,
                standardizedCountryList.standardizedCountry as countryCleaned
            FROM siteData
            LEFT JOIN `lookupTables.standardizedCountryList` as standardizedCountryList
                ON siteData.country = standardizedCountryList.rawCountry
        )

        SELECT * from cleanedSiteData
        """

        # Run Load Job
        query_job = client.query(
            sql,
            # Location must match that of the dataset(s) referenced in the query
            # and of the destination table.
            location='US',
            job_config=load_job_config)  # API request - starts the query

        query_job.result()  # Waits for the query to finish
        logging.info(f'Query results loaded to table {table_ref.path}')

        # Set next_load_date
        next_load_date = datetime.strptime(next_load_date, '%Y%m%d') + timedelta(1)
    return
def __init__(self, project, sql, *args, **kwargs):
    super().__init__(project, sql)
    self.client = bigquery.Client(project=project)
def bigquery_client():
    yield bigquery.Client()
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="asset_a",  # this can be anything that identifies this data
    runtime_parameters={"query": "SELECT * from demo.taxi_data LIMIT 10"},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)
context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator: Validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
assert validator

# What is the name of the temp_table that was created as part of `get_validator`?
temp_table_name: str = validator.active_batch.data.selectable.description
client: bigquery.Client = bigquery.Client()
project: str = client.project
dataset_ref: bigquery.DatasetReference = bigquery.DatasetReference(
    project, bigquery_dataset
)
table_ref: bigquery.TableReference = dataset_ref.table(temp_table_name)
table: bigquery.Table = client.get_table(table_ref)

# what is its expiration
expected_expiration: datetime.datetime = datetime.datetime.now(
    datetime.timezone.utc
) + datetime.timedelta(days=1)
assert table.expires <= expected_expiration

# Ensure that passing in `bigquery_temp_table` will fire off a DeprecationWarning
batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
def client():
    return bigquery.Client()
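# The fixtures above simply hand back a bigquery.Client(). A hypothetical
# pytest test consuming such a fixture might look like this; the query is
# illustrative only and needs no dataset.
def test_simple_query(client):
    rows = list(client.query("SELECT 1 AS x").result())
    assert rows[0].x == 1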
def subscriber():
    while True:
        counter = 10
        temp1 = []
        while (counter != -1):
            pubsub_client = pubsub.Client()
            topic_name = 'my-new-topic'
            topic = pubsub_client.topic(topic_name)
            subscription_name = 'sub1'
            subscription = topic.subscription(subscription_name)
            # subscription.create()
            # for subscription in topic.list_subscriptions():
            #     print(subscription.name)
            # logging.debug('Subscription {} created on topic {}.'.format(
            #     subscription.name, topic.name))
            try:
                results = subscription.pull(return_immediately=True)
                if counter != 0:
                    print('Received {} messages.'.format(len(results)))
                    for ack_id, message in results:
                        # print('* {}: {}, {}'.format(
                        #     message.message_id, message.data, message.attributes))
                        temp1.append(str(message.data))
                else:
                    i = 0
                    while (i != 10):
                        try:
                            b = json.loads(temp1[i])
                            # print('json load: {}'.format(b))
                        except Exception:
                            print('Could not load message {}'.format(i))
                        try:
                            # define dataset variables
                            dataset_name = 'searce_poc_vuukle'
                            table_name = 'page_impressions'
                            today = b["timestamp"]
                            table_name = "%s$%s" % (table_name, today)
                            # put data into BigQuery
                            bigquery_client = bigquery.Client()
                            dataset = bigquery_client.dataset(dataset_name)
                            table = dataset.table(table_name)
                            data = b
                            # time_stamp1 = time.strftime("%c")
                            # data['timestamp'] = time_stamp1
                            # Reload the table to get the schema.
                            table.reload()
                            # collect the values in schema order
                            temp = list()
                            for key in data:
                                temp.append(data[key])
                            errors = table.insert_data([temp])
                            if not errors:
                                logging.debug('Loaded 1 row into {}:{}'.format(
                                    dataset_name, table_name))
                            else:
                                logging.error(errors)
                        except Exception:
                            print("cannot load")
                        i = i + 1
                # Acknowledge received messages. If you do not acknowledge,
                # Pub/Sub will redeliver the message.
                if results:
                    subscription.acknowledge(
                        [ack_id for ack_id, message in results])
            except Exception:
                print("next")
            counter = counter - 1
def setUpClass(cls):
    cls._bq_cli = bigquery.Client()
def __init__(self,
             bq_schema_filename=None,
             input_bq_table=None,
             hist_bq_table=None,
             p_null=0.1,
             n_keys=sys.maxsize,  # sys.maxsize replaces Python 2's sys.maxint
             min_date='2000-01-01',
             max_date=datetime.date.today().strftime('%Y-%m-%d'),
             only_pos=True,
             max_int=10**11,
             max_float=float(10**11),
             float_precision=2,
             write_disp='WRITE_APPEND',
             key_skew='None',
             primary_key_cols=None,
             dest_joining_key_col=None):
    """
    Args:
        bq_schema_filename (str): A path to a local or gcs file containing a
            BigQuery schema in a json file.
        p_null (float): The desired sparsity of the generated data.
        n_keys (int): The cardinality of foreign key columns (for generating
            joinable schemas).
        min_date (datetime.date): The earliest date to generate.
        max_date (datetime.date): The latest date to generate.
        only_pos (bool): Specifies whether to allow negative numbers to be
            generated.
        max_int (int): The upper bound for the range of integers to generate.
        max_float (float): The upper bound for the range of floats to generate.
        float_precision (int): The desired display precision for generated
            floats. (Note that BigQuery will cast all floats with double
            precision on the backend).
        primary_key_cols (str): The primary key for the generated data.
        dest_joining_key_col (str): The name of the key column in the table we
            are generating that joins to source_joining_key_col.
    """
    bq_cli = bq.Client()
    if bq_schema_filename is not None:
        try:
            # Handles json from google cloud storage or local.
            if bq_schema_filename.find('gs://') == 0:
                bkt, path = bq_schema_filename.replace('gs://', '').split('/', 1)
                client = gcs.Client()
                bucket = client.get_bucket(bkt)
                blob = bucket.get_blob(path)
                self.schema = json.loads(blob.download_as_string())
            else:
                with open(bq_schema_filename, 'r') as json_file:
                    self.schema = json.load(json_file)
        except ValueError:
            logging.error("Not a valid json file! \n %s", str(ValueError))
        except AttributeError:
            logging.error("Could not find gcs file %s", str(bq_schema_filename))
    elif input_bq_table:
        dataset_name, table_name = input_bq_table.split('.')
        bq_dataset = bq_cli.dataset(dataset_name)
        # This forms a TableReference object.
        bq_table_ref = bq_dataset.table(table_name)
        # Errors out if table doesn't exist.
        bq_table = bq_cli.get_table(bq_table_ref)
        # Quickly parse TableSchema object to list of dictionaries.
        self.schema = [{
            u'name': field.name,
            u'type': field.field_type,
            u'mode': field.mode
        } for field in bq_table.schema]

    if hist_bq_table:
        dataset_name, table_name = hist_bq_table.split('.')
        bq_dataset = bq_cli.dataset(dataset_name)
        # This forms a TableReference object.
        bq_table_ref = bq_dataset.table(table_name)
        # Errors out if table doesn't exist.
        bq_table = bq_cli.get_table(bq_table_ref)
        self.hist_bq_table = hist_bq_table
    else:
        self.hist_bq_table = None

    self.null_prob = float(p_null)
    self.n_keys = int(n_keys)
    self.min_date = datetime.datetime.strptime(min_date, "%Y-%m-%d").date()
    self.max_date = datetime.datetime.strptime(max_date, "%Y-%m-%d").date()
    self.only_pos = bool(only_pos)
    self.max_int = int(max_int)
    self.min_int = 0 if self.only_pos else -1 * self.max_int
    self.max_float = float(max_float)
    self.min_float = 0.0 if self.only_pos else -1.0 * self.max_float
    self.float_precision = int(float_precision)
    self.key_skew = key_skew
    self.dest_joining_key_col = dest_joining_key_col

    # Map the passed string representation of the desired disposition.
    # This will force early error if invalid write disposition.
    write_disp_map = {
        'WRITE_APPEND': beam.io.BigQueryDisposition.WRITE_APPEND,
        'WRITE_EMPTY': beam.io.BigQueryDisposition.WRITE_EMPTY,
        'WRITE_TRUNCATE': beam.io.BigQueryDisposition.WRITE_TRUNCATE
    }
    self.write_disp = write_disp_map[write_disp]
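# The mapped beam.io.BigQueryDisposition value is the form Beam's BigQuery sink
# expects. An illustrative, standalone sketch of how such a disposition is
# typically passed to WriteToBigQuery; the pipeline, table, and schema below
# are assumptions, not part of the original class.
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | 'Create' >> beam.Create([{'name': 'a', 'value': 1}])
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         'my-project:my_dataset.my_table',
         schema='name:STRING,value:INTEGER',
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))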
    Returns:
        [type]: [description]
    """
    original_string = os.environ.get(param, None)
    if original_string is not None:
        original_string = original_string.replace("'", "")
    return original_string


src_bucket = get_enviroment_variable('SOURCE_BUCKET_ID')
dataset_id = get_enviroment_variable('BQ_SETTINGS_DATASET_ID')
table_id = 'project_billing_info'

storage_client = storage.Client()
bucket = storage_client.get_bucket(src_bucket)
client = bq.Client()
dataset = client.dataset(dataset_id)
client.delete_table('cudinator_stg.project_billing_info', not_found_ok=True)

for file in storage_client.list_blobs(bucket):
    if ('/mapping.json' in file.name):
        blob = bucket.get_blob(file.name)
        data = blob.download_as_string()
        dict_str = data.decode("UTF-8")
        data = eval(dict_str)
        project_details = []
        billing_accounts = data.get("billing_accounts")
        # ETL WORK TO ADD BILLING NUMBER TO EACH PROJECT
        for billingAccounts in billing_accounts:
            billing_Accounts_Projects = billingAccounts.get("projects")
            name = billingAccounts.get("name")
# Standalone script to depict BigQuery features
from __future__ import absolute_import

from google.cloud import bigquery

# Constants
PROJECT = "famous-store-237108"
DATASET_UDM = "UDM"
DATASET_CONS = "Consumption"
TABLE = "country_dim"

client = bigquery.Client()  # Instantiate BigQuery client

query = ("SELECT column_name, data_type, is_nullable FROM `" + PROJECT + "." +
         DATASET_CONS + ".INFORMATION_SCHEMA.COLUMNS` WHERE table_name = '" +
         TABLE + "' limit 1000 ")
query_job = client.query(query)
rows = query_job.result()

schema = {}
col_list = []
for row in rows:
    col_info = {}
    col_info['name'] = row.column_name
    col_info['type'] = row.data_type
    # INFORMATION_SCHEMA reports is_nullable as 'YES'/'NO'; nullable columns
    # map to mode NULLABLE and the rest to REQUIRED.
    if row.is_nullable == 'YES':
        col_info['mode'] = 'NULLABLE'
    else:
        col_info['mode'] = 'REQUIRED'
    col_list.append(col_info)
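# The script stops after filling col_list. A hypothetical continuation (not
# from the original) showing how that list could be turned into SchemaField
# objects, e.g. to recreate the table elsewhere:
schema_fields = [
    bigquery.SchemaField(c['name'], c['type'], mode=c['mode'])
    for c in col_list
]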
def __init__(self, project_name=None, credentials=None):
    if credentials is None:
        self.bq_client = bigquery.Client()
def runtest(self):
    """Run."""
    query_name = self.fspath.dirpath().basename
    query = read(f"{self.fspath.dirname.replace('tests', 'sql')}/query.sql")
    expect = load(self.fspath.strpath, "expect")

    tables: Dict[str, Table] = {}
    views: Dict[str, str] = {}

    # generate tables for files with a supported table extension
    for resource in next(os.walk(self.fspath))[2]:
        if "." not in resource:
            continue  # tables require an extension
        table_name, extension = resource.rsplit(".", 1)
        if table_name.endswith(".schema") or table_name in (
            "expect",
            "query_params",
        ):
            continue  # not a table
        if extension in TABLE_EXTENSIONS or extension in ("yaml", "json"):
            if extension in TABLE_EXTENSIONS:
                source_format = TABLE_EXTENSIONS[extension]
                source_path = os.path.join(self.fspath.strpath, resource)
            else:
                source_format = TABLE_EXTENSIONS["ndjson"]
                source_path = (self.fspath.strpath, table_name)
            if "." in table_name:
                # combine project and dataset name with table name
                original, table_name = (
                    table_name,
                    table_name.replace(".", "_").replace("-", "_"),
                )
                query = query.replace(original, table_name)
            tables[table_name] = Table(table_name, source_format, source_path)
        elif extension == "sql":
            if "." in table_name:
                # combine project and dataset name with table name
                original, table_name = (
                    table_name,
                    table_name.replace(".", "_").replace("-", "_"),
                )
                query = query.replace(original, table_name)
            views[table_name] = read(self.fspath.strpath, resource)

    # rewrite all udfs as temporary
    temp_udfs = parse_udf.sub_persisent_udfs_as_temp(query)
    if temp_udfs != query:
        query = temp_udfs
        # prepend udf definitions
        query = parse_udf.prepend_udf_usage_definitions(query)

    dataset_id = "_".join(self.fspath.strpath.split(os.path.sep)[-3:])
    if "CIRCLE_BUILD_NUM" in os.environ:
        dataset_id += f"_{os.environ['CIRCLE_BUILD_NUM']}"

    bq = bigquery.Client()
    with dataset(bq, dataset_id) as default_dataset:
        load_tables(bq, default_dataset, tables.values())
        load_views(bq, default_dataset, views)

        # configure job
        res_table = bigquery.TableReference(default_dataset, query_name)
        job_config = bigquery.QueryJobConfig(
            default_dataset=default_dataset,
            destination=res_table,
            query_parameters=get_query_params(self.fspath.strpath),
            use_legacy_sql=False,
            write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        )

        # run query
        job = bq.query(query, job_config=job_config)
        result = list(coerce_result(*job.result()))
        result.sort(key=lambda row: json.dumps(row, sort_keys=True))
        expect.sort(key=lambda row: json.dumps(row, sort_keys=True))

        print_and_test(expect, result)
def setUpClass(cls):
    cls.GCP_BIGQUERY_CLIENT = bigquery.Client(GCP_PROJECT_ID, location="EU")
    cls.DATASET = cls.__dataset()
    cls.__create_dataset_if_not_exists(cls.DATASET)
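# __create_dataset_if_not_exists is called above but not shown. A minimal
# sketch of what such a helper could do, assuming google-cloud-bigquery's
# create_dataset(..., exists_ok=True) is available; this is an illustration,
# not the original implementation.
@classmethod
def __create_dataset_if_not_exists(cls, dataset):
    # Create the dataset, ignoring "already exists" errors.
    cls.GCP_BIGQUERY_CLIENT.create_dataset(dataset, exists_ok=True)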
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]):
    """Overrides the tfx_pusher_executor.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - model_export: exported model from trainer.
        - model_blessing: model blessing path from model_validator.
      output_dict: Output dict from key to a list of artifacts, including:
        - model_push: A list of 'ModelPushPath' artifact of size one. It will
          include the model in this push execution if the model was pushed.
      exec_properties: Mostly a passthrough input dict for
        tfx.components.Pusher.executor. custom_config.bigquery_serving_args is
        consumed by this class. For the full set of parameters supported by
        Big Query ML, refer to https://cloud.google.com/bigquery-ml/

    Returns:
      None

    Raises:
      ValueError: If bigquery_serving_args is not in
        exec_properties.custom_config. If pipeline_root is not 'gs://...'
      RuntimeError: if the Big Query job failed.
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    model_push = artifact_utils.get_single_instance(output_dict['model_push'])
    if not self.CheckBlessing(input_dict):
        model_push.set_int_custom_property('pushed', 0)
        return

    model_export = artifact_utils.get_single_instance(
        input_dict['model_export'])
    model_export_uri = model_export.uri

    custom_config = exec_properties.get('custom_config', {})
    bigquery_serving_args = custom_config.get('bigquery_serving_args', None)
    # if configuration is missing error out
    if bigquery_serving_args is None:
        raise ValueError('Big Query ML configuration was not provided')

    bq_model_uri = '`{}`.`{}`.`{}`'.format(
        bigquery_serving_args['project_id'],
        bigquery_serving_args['bq_dataset_id'],
        bigquery_serving_args['model_name'])

    # Deploy the model.
    model_path = path_utils.serving_model_path(model_export_uri)
    if not model_path.startswith('gs://'):
        raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.')
    absl.logging.info(
        'Deploying the model to BigQuery ML for serving: {} from {}'.format(
            bigquery_serving_args, model_path))

    query = ("""
      CREATE OR REPLACE MODEL {}
      OPTIONS (model_type='tensorflow',
               model_path='{}')""".format(bq_model_uri,
                                          os.path.join(model_path, '*')))

    # TODO(zhitaoli): Refactor the executor_class_path creation into a common
    # utility function.
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    with telemetry_utils.scoped_labels(
            {telemetry_utils.TFX_EXECUTOR: executor_class_path}):
        default_query_job_config = bigquery.job.QueryJobConfig(
            labels=telemetry_utils.get_labels_dict())
    client = bigquery.Client(default_query_job_config=default_query_job_config)

    try:
        query_job = client.query(query)
        query_job.result()  # Waits for the query to finish
    except Exception as e:
        raise RuntimeError('BigQuery ML Push failed: {}'.format(e))

    absl.logging.info('Successfully deployed model {} serving from {}'.format(
        bq_model_uri, model_path))

    # Setting the push_destination to bigquery uri
    model_push.set_int_custom_property('pushed', 1)
    model_push.set_string_custom_property('pushed_model', bq_model_uri)
def enrich_task(ds, **kwargs):
    template_context = kwargs.copy()
    template_context['ds'] = ds
    template_context['params'] = environment

    client = bigquery.Client()

    # Need to use a temporary table because bq query sets field modes to
    # NULLABLE and descriptions to null when writeDisposition is WRITE_TRUNCATE

    # Create a temporary table
    temp_table_name = '{task}_{milliseconds}'.format(
        task=task, milliseconds=int(round(time.time() * 1000)))
    temp_table_ref = client.dataset(dataset_name_temp).table(temp_table_name)

    schema_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/schemas/{task}.json'.format(task=task))
    schema = read_bigquery_schema_from_file(schema_path)
    table = bigquery.Table(temp_table_ref, schema=schema)

    description_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/descriptions/{task}.txt'.format(task=task))
    table.description = read_file(description_path)
    if time_partitioning_field is not None:
        table.time_partitioning = TimePartitioning(
            field=time_partitioning_field)
    logging.info('Creating table: ' + json.dumps(table.to_api_repr()))
    table = client.create_table(table)
    assert table.table_id == temp_table_name

    # Query from raw to temporary table
    query_job_config = bigquery.QueryJobConfig()
    # Finishes faster, query limit for concurrent interactive queries is 50
    query_job_config.priority = bigquery.QueryPriority.INTERACTIVE
    query_job_config.destination = temp_table_ref

    sql_path = os.path.join(
        dags_folder,
        'resources/stages/enrich/sqls/{task}.sql'.format(task=task))
    sql_template = read_file(sql_path)
    sql = kwargs['task'].render_template('', sql_template, template_context)
    print('Enrichment sql:')
    print(sql)

    query_job = client.query(sql, location='US', job_config=query_job_config)
    submit_bigquery_job(query_job, query_job_config)
    assert query_job.state == 'DONE'

    all_destination_projects = [(destination_dataset_project_id, dataset_name)]
    if copy_dataset_project_id is not None and len(copy_dataset_project_id) > 0 \
            and copy_dataset_name is not None and len(copy_dataset_name) > 0:
        all_destination_projects.append(
            (copy_dataset_project_id, copy_dataset_name))

    if load_all_partitions:
        for dest_project, dest_dataset_name in all_destination_projects:
            # Copy temporary table to destination
            copy_job_config = bigquery.CopyJobConfig()
            copy_job_config.write_disposition = 'WRITE_TRUNCATE'

            dest_table_name = '{task}'.format(task=task)
            dest_table_ref = client.dataset(
                dest_dataset_name, project=dest_project).table(dest_table_name)
            copy_job = client.copy_table(temp_table_ref,
                                         dest_table_ref,
                                         location='US',
                                         job_config=copy_job_config)
            submit_bigquery_job(copy_job, copy_job_config)
            assert copy_job.state == 'DONE'
    else:
        for dest_project, dest_dataset_name in all_destination_projects:
            # Merge
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
            merge_job_config = bigquery.QueryJobConfig()
            # Finishes faster, query limit for concurrent interactive queries is 50
            merge_job_config.priority = bigquery.QueryPriority.INTERACTIVE

            merge_sql_path = os.path.join(
                dags_folder,
                'resources/stages/enrich/sqls/merge/merge_{task}.sql'.format(
                    task=task))
            merge_sql_template = read_file(merge_sql_path)

            merge_template_context = template_context.copy()
            merge_template_context['params']['source_table'] = temp_table_name
            merge_template_context['params'][
                'destination_dataset_project_id'] = dest_project
            merge_template_context['params'][
                'destination_dataset_name'] = dest_dataset_name
            merge_sql = kwargs['task'].render_template(
                '', merge_sql_template, merge_template_context)
            print('Merge sql:')
            print(merge_sql)
            merge_job = client.query(merge_sql,
                                     location='US',
                                     job_config=merge_job_config)
            submit_bigquery_job(merge_job, merge_job_config)
            assert merge_job.state == 'DONE'

    # Delete temp table
    client.delete_table(temp_table_ref)
from google.cloud import bigquery
import pandas as pd
import Queue
import threading
import datetime
import logging

# Set project variables
projectid = "pythonproject-190701"
dataset_id = 'cmcdataset'

bqc = bigquery.Client(project=projectid)
job_config = bigquery.QueryJobConfig()
job_config.use_legacy_sql = False
dataset_ref = bqc.dataset(dataset_id)


def get_time_str():
    timestring = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    return timestring


def query(query, bqc, job_config):
    query_job = bqc.query(query, job_config=job_config)
    return query_job


def query_job_2df(query_job):
    # Convert the query result to a list of rows
    data = list(x for x in query_job.result())
    # Convert the list of rows to a dataframe with columns based on the
    # fields listed in the first row.
    df = pd.DataFrame.from_records(
        data[:],
        columns=sorted(data[0]._xxx_field_to_index,
                       key=data[0]._xxx_field_to_index.__getitem__))
    return df
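# A short, hypothetical usage of the helpers above; the SQL and table name are
# assumptions and not part of the original module.
if __name__ == '__main__':
    sql = "SELECT name, symbol FROM `pythonproject-190701.cmcdataset.coins` LIMIT 10"
    job = query(sql, bqc, job_config)
    df = query_job_2df(job)
    print(get_time_str(), len(df), "rows")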
def get_gbq_data(self, sim_config, local_cache_file):
    if not self.gbq_table:
        raise ValueError(
            f"gbq_table={self.gbq_table} is unset. "
            + "Set env variable for specific source, e.g. FLATFILES_GBQ_TABLE"
        )
    if not sim_config["identifier"]:
        raise ValueError(
            f"Invalid sim_config: sim_config[identifier]={sim_config['identifier']}"
        )

    # first get the table schema from GBQ and take the intersection of our
    # data spec columns to see which columns we can get from GBQ
    client = bigquery.Client(project=self.gcp_project)
    table = client.get_table(self.gbq_table)
    schema_columns = [_s.name for _s in table.schema]

    # take intersection preserving order in data spec
    gbq_columns = [
        _col for _col in self.data_spec.full.columns if _col in schema_columns
    ]

    # query will get data for entire data set per ID so that the cache can
    # be built up correctly.
    columns_str = ", ".join(gbq_columns)
    query_str = f"SELECT {columns_str}\n"
    query_str += f"FROM `{self.gbq_table}`\n"
    query_str += f"WHERE Identifier = '{sim_config['identifier']}'\n"

    # we will use pandas-gbq to read data to df
    # https://pandas-gbq.readthedocs.io/en/latest/
    # use_bqstorage_api=False
    #   BigQuery Storage API allows downloading large (>125 MB) query results
    #   more quickly at an increased cost. We are querying once per ID, which
    #   should be no more than 50 MB.
    # max_results=1,000,000
    #   8760 * 12 = 105,120 records per thermostat-year
    #   use max_results as a guard against a query accidentally pulling
    #   multiple datasets; there should never be 1,000,000 results for a
    #   single identifier
    _df = pandas_gbq.read_gbq(
        query_or_table=query_str,
        project_id=self.gcp_project,
        credentials=self.gbq_token,
        col_order=gbq_columns,
        reauth=True,
        dialect="standard",
        max_results=1000000,
        use_bqstorage_api=False,
        dtypes={
            k: v["dtype"]
            for k, v in self.data_spec.full.spec.items()
            if k in gbq_columns
        },
    )

    if _df.empty:
        logger.error(
            f"Identifier: {sim_config['identifier']}"
            f" not found in GBQ table: {self.gbq_table}"
        )

    if local_cache_file:
        if os.path.isdir(os.path.dirname(local_cache_file)):
            # store as gzip compressed parquet file
            _df.to_parquet(local_cache_file, compression="gzip", index=False)
    else:
        logger.info(
            "GCSDataSource received no local_cache. Proceeding without caching."
        )
    return _df
def main():
    client = bigquery.Client()

    query = """
        SELECT country_name
        FROM bigquery-public-data.covid19_open_data.covid19_open_data
        GROUP BY country_name
        ORDER BY country_name
    """
    country = []
    query_job = client.query(query)
    for row in query_job:
        country.append(row["country_name"])

    query = """
        SELECT population,
            AVG((population - cumulative_confirmed - cumulative_recovered - cumulative_deceased)) as S,
            AVG(cumulative_confirmed) as I,
            AVG(cumulative_recovered) as R
        FROM `bigdata-301014.Domographics.population`
        INNER JOIN `bigquery-public-data.covid19_open_data.covid19_open_data`
            ON location_key = key
        WHERE LENGTH(location_key) = 2 AND country_name = '{country}'
        GROUP BY population
        LIMIT 10
    """.format(country=country[1])
    query_job = client.query(query)

    n = 0
    S0 = 0
    I0 = 0
    R0 = 0
    for row in query_job:
        n = row["population"]
        S0 = row["S"]
        I0 = 1
        R0 = 0

    def sir_model(y, x, beta, gamma):
        sus = -beta * y[0] * y[1] / n
        rec = gamma * y[1]
        inf = -(sus + rec)
        return sus, inf, rec

    def fit_odeint(x, beta, gamma):
        print(" beta: ", beta, " gamma: ", gamma)
        return odeint(sir_model, (S0, I0, R0), x, args=(beta, gamma))[:, 1]

    query = """
        SELECT date, IFNULL(cumulative_confirmed, 0) as I
        FROM bigquery-public-data.covid19_open_data.covid19_open_data
        WHERE LENGTH(location_key) = 2 AND country_name = '{country}'
    """.format(country=country[1])
    query_job = client.query(query)

    xdata = []
    ydata = []
    i = 1
    for row in query_job:
        ydata.append(float(row["I"]))
        # xdata.append(row["date"])
        xdata.append(i)
        i = i + 1

    xdata = np.array(xdata)
    ydata = np.array(ydata)
    print(xdata)
    print(ydata)

    popt, pcov = curve_fit(fit_odeint, xdata, ydata, maxfev=1000000000)
    print(np.round(np.sqrt(pcov[0][0]), 3))
    print(np.round(np.sqrt(pcov[1][1]), 3))
parser.add_argument(
    '--speedFactor',
    help='Example: 60 implies 1 hour of data sent to Cloud Pub/Sub in 1 minute',
    required=True,
    type=float)
parser.add_argument(
    '--jitter',
    help='type of jitter to add: None, uniform, exp are the three options',
    default='None')

# set up BigQuery bqclient
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
args = parser.parse_args()
bqclient = bq.Client(args.project)
dataset = bqclient.get_dataset(
    bqclient.dataset('flights'))  # throws exception on failure

# jitter?
if args.jitter == 'exp':
    jitter = 'CAST (-LN(RAND()*0.99 + 0.01)*30 + 90.5 AS INT64)'
elif args.jitter == 'uniform':
    jitter = 'CAST(90.5 + RAND()*30 AS INT64)'
else:
    jitter = '0'

# run the query to pull simulated events
querystr = """\
SELECT
  EVENT,
def download_file(self, key, stored_etag=None, manifest_id=None, start_date=None):
    """
    Download a file from GCP storage bucket.

    If we have a stored etag and it matches the current GCP blob, we can
    safely skip download since the blob/file content must not have changed.

    Args:
        key (str): name of the blob in the GCP storage bucket
        stored_etag (str): optional etag stored in our DB for comparison

    Returns:
        tuple(str, str) with the local filesystem path to file and GCP's etag.
    """
    try:
        if start_date:
            invoice_month = start_date.strftime("%Y%m")
            query = f"""
                SELECT {",".join(self.gcp_big_query_columns)}
                FROM {self.table_name}
                WHERE DATE(_PARTITIONTIME) >= '{self.query_date}'
                AND invoice.month = '{invoice_month}'
            """
            LOG.info(f"Querying for invoice_month ({invoice_month})")
        else:
            query = f"""
                SELECT {",".join(self.gcp_big_query_columns)}
                FROM {self.table_name}
                WHERE DATE(_PARTITIONTIME) >= '{self.query_date}'
            """
        client = bigquery.Client()
        query_job = client.query(query)
    except GoogleCloudError as err:
        err_msg = (
            "Could not query table for billing information."
            f"\n Provider: {self._provider_uuid}"
            f"\n Customer: {self.customer_name}"
            f"\n Response: {err.message}"
        )
        LOG.warning(err_msg)
        raise GCPReportDownloaderError(err_msg)

    directory_path = self._get_local_directory_path()
    full_local_path = self._get_local_file_path(directory_path, key)
    os.makedirs(directory_path, exist_ok=True)
    msg = f"Downloading {key} to {full_local_path}"
    LOG.info(log_json(self.request_id, msg, self.context))

    try:
        with open(full_local_path, "w") as f:
            writer = csv.writer(f)
            writer.writerow(self.gcp_big_query_columns)
            for row in query_job:
                writer.writerow(row)
    except (OSError, IOError) as exc:
        err_msg = (
            "Could not create GCP billing data csv file."
            f"\n Provider: {self._provider_uuid}"
            f"\n Customer: {self.customer_name}"
            f"\n Response: {exc}"
        )
        raise GCPReportDownloaderError(err_msg)

    msg = f"Returning full_file_path: {full_local_path}"
    LOG.info(log_json(self.request_id, msg, self.context))
    dh = DateHelper()
    return full_local_path, self.etag, dh.today
from google.cloud import storage

# Basic data about your project
project_id = "your-project-id"
bucket_name = "your-bucket-name"
bucket_folder = 'your-bucket-folder-name'

# Endpoint of the ML Model deployed on AI Platform
endpoint = 'your-ml-endpoint'
client_options = ClientOptions(api_endpoint=endpoint)
ml = discovery.build('ml-name', 'ml-version', client_options=client_options)
model_name = 'projects/{}/models/{}/versions/{}'.format(project, model, version)

# BigQuery
bq_client = bigquery.Client(project=project_id)
table_id = "project-id.dataset-name.table-name"

# Cloud Storage
storage_client = storage.Client()


# Configuration for Audio Data
class conf:
    sr = 16000
    duration = 3
    hop_length = 340 * duration
    fmin = 20
    fmax = sr // 2
    n_mels = 128
    n_fft = n_mels * 20
import flask
from datetime import datetime, date, timedelta
from google.cloud import bigquery
import concurrent.futures

try:
    import googleclouddebugger
    googleclouddebugger.enable()
except ImportError:
    pass

app = flask.Flask(__name__)
app.debug = True
bigquery_client = bigquery.Client()
app.config.update(SECRET_KEY='x123124asdflsadlf1!!',
                  SESSION_COOKIE_NAME='finedatalab_flask_session',
                  PERMANENT_SESSION_LIFETIME=timedelta(31))


@app.after_request
def set_response_headers(r):
    r.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
    r.headers['Pragma'] = 'no-cache'
    r.headers['Expires'] = '0'
    return r


@app.route("/")
def main():
    return flask.render_template("index.html",
def Ejecutar():
    reload(sys)
    sys.setdefaultencoding('utf8')

    storage_client = storage.Client()
    bucket = storage_client.get_bucket('ct-telefonia')
    gcs_path = 'gs://ct-telefonia'
    sub_path = KEY_REPORT + '/'
    output = gcs_path + "/" + sub_path + fecha + ext
    blob = bucket.blob(sub_path + fecha + ext)

    dateini = request.args.get('dateini')
    dateend = request.args.get('dateend')

    if dateini is None:
        dateini = GetDate1
    else:
        dateini = dateini + hour1

    if dateend is None:
        dateend = GetDate2
    else:
        dateend = dateend + hour2

    client = bigquery.Client()
    QUERY = (
        'SELECT servidor, operacion, token, ipdial_code, id_cliente, cartera FROM telefonia.parametros_ipdial where Estado = "Activado" order by ipdial_code asc'
    )  # WHERE ipdial_code = "intcob-unisabaneta"
    query_job = client.query(QUERY)
    rows = query_job.result()
    data = ""

    try:
        os.remove(ruta_completa)  # Delete from aries
    except:
        print("Deleted from aries")

    try:
        blob.delete()  # Delete from storage
    except:
        print("Deleted from storage")

    try:
        QUERY2 = (
            'delete FROM `contento-bi.telefonia.agent_status` where date = '
            + '"' + dateini[0:8] + '"')
        query_job = client.query(QUERY2)
        rows2 = query_job.result()
    except:
        print("Deleted from bigquery")

    file = open(ruta_completa, "a")

    for row in rows:
        url = ('http://' + str(row.servidor) +
               '/ipdialbox/api_reports.php?token=' + row.token +
               '&report=' + str(CODE_REPORT) +
               '&date_ini=' + dateini + '&date_end=' + dateend)
        datos = requests.get(url).content
        # print(url)

        if len(requests.get(url).content) < 50:
            continue
        else:
            i = json.loads(datos)
            for rown in i:
                file.write(
                    str(rown["operation"]).encode('utf-8') + "|" +
                    str(rown["date"]) + "|" +
                    str(rown["hour"]) + "|" +
                    str(rown["id_agent"]) + "|" +
                    str(rown["agent_identification"]) + "|" +
                    str(rown["agent_name"]).encode('utf-8') + "|" +
                    str(rown["CALLS"]) + "|" +
                    str(rown["CALLS INBOUND"]) + "|" +
                    str(rown["CALLS OUTBOUND"]) + "|" +
                    str(rown["CALLS INTERNAL"]) + "|" +
                    str(rown["READY TIME"]) + "|" +
                    str(rown["INBOUND TIME"]) + "|" +
                    str(rown["OUTBOUND TIME"]) + "|" +
                    str(rown["NOT-READY TIME"]) + "|" +
                    str(rown["RING TIME"]) + "|" +
                    str(rown["LOGIN TIME"]) + "|" +
                    str(rown["AHT"]) + "|" +
                    str(rown["OCUPANCY"]).encode('utf-8') + "|" +
                    str(rown["AUX TIME"]) + "|" +
                    str(row.id_cliente) + "|" +
                    str(row.cartera).encode('utf-8') + "\n")

    file.close()
    blob.upload_from_filename(ruta_completa)
    time.sleep(10)
    ejecutar = agent_status_beam.run(output, KEY_REPORT)
    time.sleep(60)

    return ("The " + KEY_REPORT + " process has just been executed, "
            "updating from: " + dateini + " to " + dateend)