Example no. 1
def add_bigquery(temp_data, table_name, table_path, dataset_name, schema):
    """
    Given a dataset name and a table_name (in bigquery), create
    a table in bigquery with schema given in 'schema' and 
    the data stored in Google Storage path 'table_path'.
    
    It deduces the file format (NJSON or CSV) from temp_data.
    """

    if local:  # 'local' is a flag defined elsewhere in the original script
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/tmp/key.json'
    bq = bigquery.Client(project='gabinete-compartilhado')

    ds = bq.dataset(dataset_name)

    # Create dataset if non-existent:
    try:
        bq.create_dataset(ds)
    except Exception:
        pass  # dataset already exists

    # Create a local object (bigQuery table):
    table_ref = ds.table(table_name)
    table = bigquery.Table(table_ref)

    # Configure the bigquery table:
    if os.path.exists(temp_data):
        external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    elif os.path.exists(temp_data.replace('.json', '.csv')):
        external_config = bigquery.ExternalConfig('CSV')
    else:
        raise Exception('unknown temp_data file extension')

    external_config.schema = schema
    # external_config.autodetect = True
    external_config.ignore_unknown_values = True
    external_config.max_bad_records = 100
    source_uris = [table_path]
    external_config.source_uris = source_uris
    table.external_data_configuration = external_config

    # create table (first delete existent table):
    try:
        bq.delete_table(table)
    except Exception as e:
        print(e)

    bq.create_table(table)
    print('Table created.')
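A minimal usage sketch for the function above; the project, bucket, schema, and file names here are illustrative and not part of the original code:

# Hypothetical call to add_bigquery(); assumes the GCS data and the local
# sample file already exist and credentials are configured.
from google.cloud import bigquery

example_schema = [
    bigquery.SchemaField('id', 'STRING'),
    bigquery.SchemaField('value', 'FLOAT'),
]
add_bigquery(
    temp_data='/tmp/records.json',                    # local NJSON sample used to infer the format
    table_name='records_external',
    table_path='gs://example-bucket/records/*.json',  # GCS URI backing the external table
    dataset_name='example_dataset',
    schema=example_schema,
)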
Example no. 2
    def create_external_tables(self, bucket_name, prefix, date, tables,
                               project, dataset, table_prefix, version):
        if table_prefix:
            table_prefix += "_"
        else:
            table_prefix = ""

        gcs_loc = f"gs://{bucket_name}/{prefix}/v{version}/{date}"

        client = bigquery.Client(project=project)

        dataset_ref = client.dataset(dataset)

        for leanplum_name in tables:
            table_name = f"{table_prefix}{leanplum_name}_v{version}_{date}"
            logging.info(f"Creating table {table_name}")

            table_ref = bigquery.TableReference(dataset_ref, table_name)
            table = bigquery.Table(table_ref)

            client.delete_table(table, not_found_ok=True)

            external_config = bigquery.ExternalConfig('CSV')
            external_config.source_uris = [f"{gcs_loc}/{leanplum_name}/*"]
            external_config.autodetect = True

            table.external_data_configuration = external_config

            client.create_table(table)
Example no. 3
    def create_external_table_hive_partitioning(
            bq_client: bigquery.Client, dataset: bigquery.Dataset,
            table_id: str, gcs_directory_path: str) -> bigquery.Table:
        """
            Creates an external table with AUTO hive partitioning in GCS
        :param bq_client: Client object to bigquery
        :param dataset: dataset object. Check 'get_or_create_dataset' method
        :param table_id: Table to be created
        :param gcs_directory_path: Directory of GCS with the data. For example:
            If you have a structure like this:
            "gs://bucket/images_metadata/source_id=abc/date=2018-02-20"
            You should pass:
            "******"
        :return: The created bigquery.Table object
        """
        table = bigquery.Table(dataset.table(table_id))

        external_config = bigquery.ExternalConfig(
            bigquery.SourceFormat.PARQUET)
        external_config.source_uris = [f"{gcs_directory_path}/*"]
        hive_part_opt = HivePartitioningOptions()
        hive_part_opt.mode = "AUTO"
        hive_part_opt.source_uri_prefix = gcs_directory_path

        external_config.hive_partitioning = hive_part_opt
        table.external_data_configuration = external_config
        table = bq_client.create_table(table, exists_ok=True)

        return table
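A short usage sketch for the helper above, with an illustrative project, dataset, and GCS prefix (the real values depend on your environment):

# Hypothetical call to create_external_table_hive_partitioning().
from google.cloud import bigquery

bq_client = bigquery.Client(project="example-project")
dataset = bq_client.get_dataset("example-project.example_dataset")
table = create_external_table_hive_partitioning(
    bq_client,
    dataset,
    "images_metadata",
    "gs://bucket/images_metadata",  # prefix above the source_id=.../date=... folders
)
print(f"Created {table.full_table_id}")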
Example no. 4
    def external_config(self):

        if self.source_format == "csv":

            _external_config = bigquery.ExternalConfig("CSV")
            _external_config.options.skip_leading_rows = 1
            _external_config.options.allow_quoted_newlines = True
            _external_config.options.allow_jagged_rows = True
            _external_config.autodetect = False
            _external_config.schema = self.table_obj._load_schema(self.mode)

        # You can add new formats here

        else:

            raise NotImplementedError(
                "Base dos Dados only supports comma-separated CSV files")

        # source_uris expects a list of GCS URIs
        _external_config.source_uris = [
            f"gs://{self.table_obj.bucket_name}/staging/{self.table_obj.dataset_id}/{self.table_obj.table_id}/*"
        ]

        if self.partitioned:

            _external_config.hive_partitioning = self.partition()

        return _external_config
Example no. 5
def query_external_gcs_temporary_table():

    # [START bigquery_query_external_gcs_temp]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    ]
    external_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    external_config.options.skip_leading_rows = 1
    table_id = "us_states"
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    # Example query to find states starting with 'W'.
    sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)

    query_job = client.query(sql,
                             job_config=job_config)  # Make an API request.

    w_states = list(query_job)  # Wait for the job to complete.
    print("There are {} states with names starting with W.".format(
        len(w_states)))
Example no. 6
def createTempTable(gscfiles):
    gcsbucketname = os.environ['gcsproject'].lower().strip()
    gcsbucketname += '.appspot.com'
    gcsprefix = ''
    if (os.environ['gcspath']):
        gcsprefix += os.environ['gcspath'].strip() + '/'
    tables = []
    for gcf in gscfiles:
        gscfile = gcf.strip()
        gcsfullurl = "gs://{}/{}{}.csv".format(gcsbucketname,gcsprefix,gscfile)
        gcsduration = int(os.environ['bigquery_temptable_duration'].strip())

        # set expiration of the table
        gcsexpiration = datetime.utcnow() + timedelta(seconds=gcsduration)
        bgqclient = bigquery.Client()
        bgqdataset = os.environ['bigquery_dataset'].strip()
        bgqdatasetid = bgqclient.dataset(bgqdataset)
        brazefields = [bf.lower().strip() for bf in os.environ['brazesegmentfields'].split(',')]
        brazetype = [bt.upper().strip() for bt in os.environ['brazesegmenttype'].split(',')]

        bgqschema = []
        # Generate schema based on field type
        for bf, bt in zip(brazefields, brazetype):
            bgqschema.append(bigquery.SchemaField(bf, bt, mode="NULLABLE"))

        table = bigquery.Table(bgqdatasetid.table(gscfile), schema=bgqschema)
        table.expires = gcsexpiration
        external_config = bigquery.ExternalConfig("CSV")
        external_config.options.skip_leading_rows = 1
        external_config.source_uris = [gcsfullurl]
        table.external_data_configuration = external_config
        table = bgqclient.create_table(table)
        tables.append(table)
    return tables
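A hedged sketch of the environment createTempTable() expects; every value below is illustrative and should be replaced with real deployment configuration:

# Hypothetical environment variables read by createTempTable() above.
import os

os.environ['gcsproject'] = 'example-project'        # bucket becomes example-project.appspot.com
os.environ['gcspath'] = 'braze/exports'             # optional prefix inside the bucket
os.environ['bigquery_temptable_duration'] = '3600'  # table expiry in seconds
os.environ['bigquery_dataset'] = 'example_dataset'
os.environ['brazesegmentfields'] = 'external_id,email'
os.environ['brazesegmenttype'] = 'STRING,STRING'

tables = createTempTable(['segment_a', 'segment_b'])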
Example no. 7
def main ():

    log_uri = "gs://andy-00000002-bucket/api/api-test.log"
    #log_uri = "gs://andy-00000002-bucket/envoy/api/api-0kg1-application-http-egress-2020-07-27-06-25-11.log"
    client = bigquery.Client()
    dataset_ref = client.dataset("service_log_parser")
    print(dataset_ref.table('envoy_access_log'))
    dataset_id = "service_log_parser"
    project_id = "green-reporter-266619"
    table_name = table_name_generator()
    table_id = project_id + '.' + dataset_id + '.' + table_name
    #dest_table_id = project_id + '.' + dataset_id + '.' + 'envoy_access_log'
    dest_table_id = '{}.{}'.format(dataset_id, 'envoy_access_log')
    insert_sql = insert_table_query_generator(table_name, dest_table_id)
    #print(insert_sql)

    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        log_uri,
    ]
    external_config.schema = [
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
    ]
    job_config = bigquery.QueryJobConfig(table_definitions={table_name: external_config})
    query_job = client.query(insert_sql, job_config=job_config) 
    if len(list(query_job)) == 0: # Waits for query to finish
        print("Load data completed!")
Example no. 8
    def add_external_gcs_source(
        self,
        gcs_url: str,
        dataset_name: str,
        table_name: str,
        skip_rows: int = 0,
        delimiter: str = ",",
        quote: str = '"',
        source_format: str = "CSV",
        project_id: str = None,
    ):
        dataset_ref = self._dataset_ref(dataset_name=dataset_name,
                                        project_id=project_id)
        table = dataset_ref.table(table_name)

        external_config = bigquery.ExternalConfig(source_format=source_format)
        external_config.autodetect = True
        external_config.source_uris = [gcs_url]
        external_config.options.skip_leading_rows = skip_rows
        external_config.options.field_delimiter = delimiter
        external_config.options.quote = quote

        table.external_data_configuration = external_config

        return self.client.create_table(table)
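A usage sketch for the method above; `bq_helper` stands in for an instance of whatever class defines add_external_gcs_source() and _dataset_ref(), and the paths and names are illustrative:

# Hypothetical call; bq_helper is an instance of the wrapper class above.
table = bq_helper.add_external_gcs_source(
    gcs_url="gs://example-bucket/exports/users/*.csv",
    dataset_name="example_dataset",
    table_name="users_external",
    skip_rows=1,        # skip the CSV header row
    delimiter=",",
)
print(f"Created external table {table.full_table_id}")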
Example no. 9
    def run_federated_query(self, query_type, query):
        """Runs native queries on EXTERNAL files

        Args:
            query_type(str): Code for the category of the query to
                run (SIMPLE_SELECT_*, SELECT_ONE_STRING, SELECT_50_PERCENT).
            query(str): The query to run.
        """
        file_formats = file_constants.FILE_CONSTANTS['sourceFormats']
        source_format = file_formats[self.file_type]
        external_config = bigquery.ExternalConfig(source_format=source_format)
        external_config.source_uris = [self.file_uri + '/*']
        if source_format != 'AVRO' and source_format != 'PARQUET':
            main_table_util = table_util.TableUtil(self.native_table_id,
                                                   self.dataset_id)
            external_config.schema = main_table_util.table.schema

        if source_format == 'CSV':
            external_config.options.skip_leading_rows = 1

        external_config.compression = self.compression.upper()
        table_id = self.native_table_id + '_external'
        results_destination = '{0:s}.{1:s}.{2:s}_query_results'.format(
            self.bq_project, self.dataset_id, table_id)
        logging.info(
            'Storing query results in {0:s}'.format(results_destination))
        job_config = bigquery.QueryJobConfig(
            table_definitions={table_id: external_config},
            use_legacy_sql=False,
            allow_large_results=True,
            destination=results_destination)
        bql = query.format(table_id)
        print(bql)
        query_job = self.bq_client.query(bql, job_config=job_config)
        logging.info("Running external {0:s} query.".format(query_type))
        query_job.result()
        query_result = benchmark_result_util.QueryBenchmarkResultUtil(
            job=query_job,
            job_type=self.job_type,
            benchmark_name=self.benchmark_name,
            project_id=self.bq_project,
            results_table_name=self.results_table_name,
            results_dataset_id=self.results_table_dataset_id,
            bq_logs_dataset=self.bq_logs_dataset_id,
            bql=bql,
            query_category=query_type,
            main_table_name=self.native_table_id,
            table_dataset_id=self.dataset_id,
            table_type=EXTERNAL_TYPE_ID,
            file_uri=self.file_uri,
        )
        query_result.insert_results_row()
        self.bq_client.delete_table(results_destination)
        logging.info('Deleting results destination table {0:s}'.format(
            results_destination))
Example no. 10
def query_external_sheets_permanent_table(dataset_id):

    # [START bigquery_query_external_sheets_perm]
    from google.cloud import bigquery
    import google.auth

    # Create credentials with Drive & BigQuery API scopes.
    # Both APIs must be enabled for your project before running this code.
    credentials, project = google.auth.default(scopes=[
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ])

    # Construct a BigQuery client object.
    client = bigquery.Client(credentials=credentials, project=project)

    # TODO(developer): Set dataset_id to the ID of the dataset to fetch.
    # dataset_id = "your-project.your_dataset"

    # Configure the external data source.
    dataset = client.get_dataset(dataset_id)
    table_id = "us_states"
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    table = bigquery.Table(dataset.table(table_id), schema=schema)
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public).
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing")
    external_config.source_uris = [sheet_url]
    external_config.options.skip_leading_rows = 1  # Optionally skip header row.
    external_config.options.range = (
        "us-states!A20:B49"  # Optionally set range of the sheet to query from.
    )
    table.external_data_configuration = external_config

    # Create a permanent table linked to the Sheets file.
    table = client.create_table(table)  # Make an API request.

    # Example query to find states starting with "W".
    sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
        dataset_id, table_id)

    query_job = client.query(sql)  # Make an API request.

    # Wait for the query to complete.
    w_states = list(query_job)
    print(
        "There are {} states with names starting with W in the selected range."
        .format(len(w_states)))
Example no. 11
def ingest_file_external(data, context):
    '''
        This function is triggered by Cloud Storage bucket.

        This function is obsoleted because BQ external data source can lead to
        a potential high data query cost.

        This code will be removed once confirming the decision.
    '''
    bucket_name = data['bucket']
    file_name = data['name']
    file_path = 'gs://{}/{}'.format(bucket_name, file_name)
    logging.info('Triggered by file {}'.format(file_path))

    db_ref = DB.document(u'streaming_files/%s' % file_name)

    dataset_ref = BQ.dataset(BQ_DATASET)
    schema = [
        bigquery.SchemaField('hash', 'STRING'),
        bigquery.SchemaField('size', 'STRING'),
        bigquery.SchemaField('stripped_size', 'STRING'),
        bigquery.SchemaField('weight', 'STRING'),
        bigquery.SchemaField('number', 'STRING'),
        bigquery.SchemaField('version', 'STRING'),
        bigquery.SchemaField('merkle_root', 'STRING'),
        bigquery.SchemaField('timestamp', 'STRING'),
        bigquery.SchemaField('timestamp_month', 'STRING'),
        bigquery.SchemaField('nonce', 'STRING'),
        bigquery.SchemaField('bits', 'STRING'),
        bigquery.SchemaField('coinbase_param', 'STRING'),
        bigquery.SchemaField('transaction_count', 'STRING')
    ]
    table_suffix = _table_suffix()
    table_id = '{}_{}'.format(BQ_TABLE, table_suffix)
    logging.info('Start table creation {}'.format(table_id))

    table = bigquery.Table(dataset_ref.table(table_id), schema=schema)

    external_config = bigquery.ExternalConfig('CSV')
    external_config.source_uris = [file_path]
    external_config.options.skip_leading_rows = 1

    table.external_data_configuration = external_config

    if _was_already_ingested(db_ref):
        _handle_duplication(db_ref)
    else:
        try:
            BQ.create_table(table)
            _handle_success(db_ref)
            logging.info('table creation success {}'.format(table_id))
        except Exception:
            _handle_error(db_ref)
            logging.info('table creation fails {}'.format(table_id))
Example no. 12
def apply_gcs_changes(data_type, **kwargs):
    '''
    Sets up a file in Google Cloud Storage as a temporary table, and merges it with an existing table in BigQuery.
    The file's location in GCS is retrieved through XCOM, and the schema for the temporary table is loaded from GCS.
    Data types should be: 'accounts', 'offers', or 'trustlines'.

    Parameters:
        data_type - type of the data being uploaded; should be string
    Returns:
        N/A
    '''

    key_path = Variable.get('api_key_path')
    credentials = service_account.Credentials.from_service_account_file(
        key_path)
    client = bigquery.Client(credentials=credentials,
                             project=credentials.project_id)

    gcs_bucket_name = Variable.get('gcs_exported_data_bucket_name')
    gcs_filepath = kwargs['task_instance'].xcom_pull(
        task_ids=f'load_{data_type}_to_gcs')
    schema = read_local_schema(data_type)

    external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    external_config.source_uris = [f'gs://{gcs_bucket_name}/{gcs_filepath}']
    external_config.schema = [
        bigquery.SchemaField(field['name'], field['type'], mode=field['mode'])
        for field in schema
    ]

    # The temporary table name is equal to the name of the file used to create it.
    # Table ids cannot contain '-', so it is replaced with '_'.
    table_id = f'{splitext(basename(gcs_filepath))[0]}'
    table_id = table_id.replace('-', '_')

    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    sql_query = create_merge_query(table_id, data_type, schema)
    logging.info(f'Merge query is: {sql_query}')
    logging.info(f'Running BigQuery job with config: {job_config._properties}')

    query_job = client.query(sql_query, job_config=job_config)
    result_rows = query_job.result()

    if query_job.error_result:
        logging.info(f'Query errors: {query_job.errors}')
        raise AirflowException(f'Query job failed: {query_job.error_result}')

    logging.info(
        f'Job timeline: {[t._properties for t in query_job.timeline]}')
    logging.info(
        f'{query_job.total_bytes_billed} bytes billed at billing tier {query_job.billing_tier}'
    )
    logging.info(f'Total rows affected: {query_job.num_dml_affected_rows}')
Example no. 13
def createTable(bqDataset, bqExternalTable, fileLocation):
    bigqueryClient = bigquery.Client()
    extConfig = bigquery.ExternalConfig("CSV")
    extConfig.source_uris = [fileLocation]
    extConfig.options.skip_leading_rows = 1

    bqSchema = getSchema()
    tableRef = bigqueryClient.dataset(bqDataset).table(bqExternalTable)
    table = bigquery.Table(tableRef, schema=bqSchema)
    table.external_data_configuration = extConfig
    table = bigqueryClient.create_table(table, exists_ok=True)
    return(0)
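createTable() relies on a getSchema() helper defined elsewhere in the original module; a minimal stand-in for experimentation could look like this (schema, dataset, and paths are hypothetical):

# Hypothetical getSchema() stand-in and call; the real schema lives in the
# original project.
from google.cloud import bigquery

def getSchema():
    return [
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("value", "FLOAT"),
    ]

createTable("example_dataset", "example_external_table",
            "gs://example-bucket/data/example.csv")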
Example no. 14
def query_external_sheets_temporary_table():

    # [START bigquery_query_external_sheets_temp]
    # [START bigquery_auth_drive_scope]
    from google.cloud import bigquery
    import google.auth

    # Create credentials with Drive & BigQuery API scopes.
    # Both APIs must be enabled for your project before running this code.
    credentials, project = google.auth.default(scopes=[
        "https://www.googleapis.com/auth/drive",
        "https://www.googleapis.com/auth/bigquery",
    ])

    # Construct a BigQuery client object.
    client = bigquery.Client(credentials=credentials, project=project)
    # [END bigquery_auth_drive_scope]

    # Configure the external data source and query job.
    external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

    # Use a shareable link or grant viewing access to the email address you
    # used to authenticate with BigQuery (this example Sheet is public).
    sheet_url = (
        "https://docs.google.com/spreadsheets"
        "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing")
    external_config.source_uris = [sheet_url]
    external_config.schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    external_config.options.skip_leading_rows = 1  # Optionally skip header row.
    external_config.options.range = (
        "us-states!A20:B49"  # Optionally set range of the sheet to query from.
    )
    table_id = "us_states"
    job_config = bigquery.QueryJobConfig(
        table_definitions={table_id: external_config})

    # Example query to find states starting with "W".
    sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(table_id)

    query_job = client.query(sql,
                             job_config=job_config)  # Make an API request.

    # Wait for the query to complete.
    w_states = list(query_job)
    print(
        "There are {} states with names starting with W in the selected range."
        .format(len(w_states)))
Example no. 15
def create_external_table(dest_dataset, dest_table, gs_path):

    dataset_ref = client.dataset(dest_dataset)
    table_ref = bigquery.TableReference(dataset_ref, dest_table)
    table = bigquery.Table(table_ref)

    external_config = bigquery.ExternalConfig('CSV')
    external_config.autodetect = True
    external_config.max_bad_records = 100000000

    source_uris = [gs_path]
    external_config.source_uris = source_uris
    table.external_data_configuration = external_config

    client.create_table(table)
    print('Table {}.{} created.'.format(dest_dataset, dest_table))
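create_external_table() above uses a module-level `client`; a minimal setup and call might look like this (project, dataset, and path are illustrative):

# Hypothetical module-level client and call for create_external_table().
from google.cloud import bigquery

client = bigquery.Client(project="example-project")
create_external_table("example_dataset", "events_external",
                      "gs://example-bucket/events/*.csv")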
Example no. 16
def test_query_external_gcs_permanent_table(client, to_delete):
    dataset_id = "query_external_gcs_{}".format(_millis())
    project = client.project
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    dataset = bigquery.Dataset(dataset_ref)
    client.create_dataset(dataset)
    to_delete.append(dataset)

    # [START bigquery_query_external_gcs_perm]
    # from google.cloud import bigquery
    # client = bigquery.Client()
    # dataset_id = 'my_dataset'

    # Configure the external data source
    dataset_ref = bigquery.DatasetReference(project, dataset_id)
    table_id = "us_states"
    schema = [
        bigquery.SchemaField("name", "STRING"),
        bigquery.SchemaField("post_abbr", "STRING"),
    ]
    table = bigquery.Table(dataset_ref.table(table_id), schema=schema)
    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    ]
    external_config.options.skip_leading_rows = 1  # optionally skip header row
    table.external_data_configuration = external_config

    # Create a permanent table linked to the GCS file
    table = client.create_table(table)  # API request

    # Example query to find states starting with 'W'
    sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
        dataset_id, table_id)

    query_job = client.query(sql)  # API request

    w_states = list(query_job)  # Waits for query to finish
    print("There are {} states with names starting with W.".format(
        len(w_states)))
    # [END bigquery_query_external_gcs_perm]
    assert len(w_states) == 4
Example no. 17
def load_gcs_to_bq(log_uri):
    dataset_id = get_env_var('DATASET_ID')
    target_table_name = get_env_var('TABLE_NAME')
    client = bigquery.Client()

    temp_table_name = table_name_generator()
    target_table_id = '{}.{}'.format(dataset_id, target_table_name)
    insert_sql = insert_table_query_generator(temp_table_name, target_table_id)

    external_config = bigquery.ExternalConfig("CSV")
    external_config.source_uris = [
        log_uri,
    ]
    external_config.schema = [
        bigquery.SchemaField("content", "STRING", mode="REQUIRED"),
    ]
    job_config = bigquery.QueryJobConfig(
        table_definitions={temp_table_name: external_config})
    query_job = client.query(insert_sql, job_config=job_config)
    if len(list(query_job)) == 0:
        print("Load {} to {} completed!".format(log_uri, target_table_id))
Example no. 18
def create_external_table(project_id: str) -> None:
    client = bigquery.Client()
    dataset_id = "social_dataset"
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)

    table_id = "ja_kakei_chousa_income_divide_over_two_member"
    table = bigquery.Table(dataset_ref.table(table_id))

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [
        "gs://ja-kakei-chousa-income-divide-over-two-member/*"
    ]
    external_config.autodetect = True
    hive_partitioning = bigquery.external_config.HivePartitioningOptions()
    hive_partitioning.mode = "AUTO"
    hive_partitioning.require_partition_filter = False
    hive_partitioning.source_uri_prefix = (
        "gs://ja-kakei-chousa-income-divide-over-two-member")
    external_config.hive_partitioning = hive_partitioning
    table.external_data_configuration = external_config

    table = client.create_table(table, exists_ok=True)
Example no. 19
def create_external_table(project_id: str) -> None:
    client = bigquery.Client()
    dataset_id = "social_dataset"
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)

    table_id = "jasso_gakuseiseikatsu_stats_annual_income_divide_university"
    table = bigquery.Table(dataset_ref.table(table_id))

    external_config = bigquery.ExternalConfig("PARQUET")
    external_config.source_uris = [
        "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university/*"
    ]
    external_config.autodetect = True
    hive_partitioning = bigquery.external_config.HivePartitioningOptions()
    hive_partitioning.mode = "AUTO"
    hive_partitioning.require_partition_filter = False
    hive_partitioning.source_uri_prefix = (
        "gs://jasso-gakuseiseikatsu-stats-annual-income-divide-university")
    external_config.hive_partitioning = hive_partitioning
    table.external_data_configuration = external_config

    table = client.create_table(table, exists_ok=True)
Example no. 20
def create_bq_table(project,
                    gcs_uri,
                    dataset,
                    table,
                    require_hive_partition_filter=True):
    bq_client = bigquery.Client(project=project)
    table_ref = bq_client.dataset(dataset).table(table)
    table = bigquery.Table(table_ref)

    hive_partition_options = HivePartitioningOptions()
    hive_partition_options.mode = "AUTO"
    hive_partition_options.source_uri_prefix = gcs_uri

    # To prevent accidentally scanning the whole table, require a
    # partition filter.
    #
    # table.require_partition_filter = True is not supported by the class yet.
    # hive_partition_options.require_partition_filter = True is not
    # supported by the class yet.
    # So the option is injected through the private _properties dict:
    hive_partition_options._properties[
        "require_partition_filter"] = require_hive_partition_filter

    extconfig = bigquery.ExternalConfig('CSV')
    extconfig.schema = [bigquery.SchemaField('line', 'STRING')]
    extconfig.options.field_delimiter = u'\u00ff'
    extconfig.options.quote_character = ''
    #   extconfig.compression = 'GZIP'
    extconfig.options.allow_jagged_rows = False
    extconfig.options.allow_quoted_newlines = False
    extconfig.max_bad_records = 10000000
    extconfig.source_uris = [os.path.join(gcs_uri, "*")]
    extconfig.hive_partitioning = hive_partition_options

    table.external_data_configuration = extconfig

    bq_client.create_table(table)
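The _properties workaround above was needed on older client versions; where HivePartitioningOptions already exposes the flag (as Examples no. 18 and no. 19 above do), the same intent can be written directly, e.g.:

# Direct property access, assuming a google-cloud-bigquery version that
# supports it; the GCS prefix is illustrative.
from google.cloud.bigquery.external_config import HivePartitioningOptions

hive_partition_options = HivePartitioningOptions()
hive_partition_options.mode = "AUTO"
hive_partition_options.source_uri_prefix = "gs://example-bucket/data"
hive_partition_options.require_partition_filter = True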
Example no. 21
    def test_query_sheet_linked_bq_table(self):
        dataset = bq.create_dataset(self.project_id, self.dataset_id,
                                    self.description, self.label_or_tag)
        # add Google Drive scope
        external_data_scopes = [
            "https://www.googleapis.com/auth/drive",
            "https://www.googleapis.com/auth/cloud-platform"
        ]
        client = bq.get_client(self.project_id, external_data_scopes)

        # Configure the external data source and query job.
        external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")

        # Grant viewing access to the test sheet to BigQuery test accounts
        sheet_url = (
            "https://docs.google.com/spreadsheets"
            "/d/1JI-KyigmwZU9I2J6TZqVTPNoEAWVqiFeF8Y549-dvzM/edit#gid=0")
        schema = [
            bigquery.SchemaField("site_name", "STRING"),
            bigquery.SchemaField("hpo_id", "STRING"),
            bigquery.SchemaField("site_point_of_contact", "STRING"),
        ]
        external_config.source_uris = [sheet_url]
        external_config.schema = schema
        external_config.options.range = (
            "contacts!A1:C5"  # limit scope so that other items can be added to sheet
        )
        external_config.options.skip_leading_rows = 1  # Optionally skip header row.

        table_id = consts.HPO_ID_CONTACT_LIST_TABLE_ID
        table = bigquery.Table(dataset.table(table_id), schema=schema)
        table.external_data_configuration = external_config

        table = client.create_table(table)
        table_content_query = f'SELECT * FROM `{dataset.dataset_id}.{table.table_id}`'
        actual_df = bq.query_sheet_linked_bq_table(self.project_id,
                                                   table_content_query,
                                                   external_data_scopes)
        expected_dict = [{
            'site_name':
                'Fake Site Name 1',
            'hpo_id':
                'fake_1',
            'site_point_of_contact':
                'fake.email.1@site_1.fakedomain; fake.email.2@site_1.fakedomain'
        }, {
            'site_name': 'Fake Site Name 2',
            'hpo_id': 'fake_2',
            'site_point_of_contact': 'no data steward'
        }, {
            'site_name':
                'Fake Site Name 3',
            'hpo_id':
                'fake_3',
            'site_point_of_contact':
                'Fake.Email.1@site_3.fake_domain; Fake.Email.2@site_3.fake_domain'
        }, {
            'site_name':
                'Fake Site Name 4',
            'hpo_id':
                'fake_4',
            'site_point_of_contact':
                '[email protected]; [email protected]'
        }]
        expected_df = pd.DataFrame(
            expected_dict,
            columns=["site_name", "hpo_id", "site_point_of_contact"])
        pd.testing.assert_frame_equal(actual_df, expected_df)
Example no. 22
def apply_gcs_changes(data_type, **kwargs):
    '''
    Sets up a file in Google Cloud Storage as a temporary table, and merges it with an existing table in BigQuery.
    The file's location in GCS is retrieved through XCOM, and the schema for the temporary table is loaded from GCS.
    Data types should be: 'accounts', 'offers', 'trustlines', 'dimAccounts', 'dimOffers', or 'dimMarkets'.

    Parameters:
        data_type - type of the data being uploaded; should be string
    Returns:
        N/A
    '''

    key_path = Variable.get('api_key_path')
    credentials = service_account.Credentials.from_service_account_file(key_path)
    client = bigquery.Client(credentials=credentials, project=credentials.project_id)

    gcs_bucket_name = Variable.get('gcs_exported_data_bucket_name')
    gcs_filepath = kwargs['task_instance'].xcom_pull(task_ids=f'load_{data_type}_to_gcs')
    schema_dict = read_local_schema(data_type)
    bq_schema_list = [bigquery.SchemaField(field['name'], field['type'], mode=field['mode']) for field in schema_dict] 

    external_config = bigquery.ExternalConfig('NEWLINE_DELIMITED_JSON')
    external_config.source_uris = [f'gs://{gcs_bucket_name}/{gcs_filepath}']
    external_config.schema = bq_schema_list

    # The temporary table name is equal to the name of the file used to create it.
    # Table ids cannot contain '-', so it is replaced with '_'.
    table_id = f'{splitext(basename(gcs_filepath))[0]}'
    table_id = table_id.replace('-', '_')

    job_config = bigquery.QueryJobConfig(table_definitions={table_id: external_config})

    #check if the table already exists; if it does not then we need to create it using the schema that we have already read in
    true_table_id = f'{Variable.get("bq_project")}.{Variable.get("bq_dataset")}.{Variable.get("table_ids", deserialize_json=True)[data_type]}'
    try:
        _ = client.get_table(true_table_id)
    except NotFound:
        logging.info(f'Bigquery table not found; creating table with id {true_table_id}')
        table = bigquery.Table(true_table_id, schema=bq_schema_list)
        client.create_table(table)

    if data_type in ['accounts', 'offers', 'trustlines']:
        logging.info('Using merge query...')
        sql_query = create_merge_query(table_id, data_type, schema_dict)
    elif data_type in ['dimAccounts', 'dimOffers', 'dimMarkets']:
        logging.info('Using insert unique query...')
        sql_query = create_insert_unique_query(table_id, data_type, schema_dict)
    else:
        raise AirflowException(f'Data type {data_type} has no corresponding query')

    logging.info(f'Query is: {sql_query}')
    logging.info(f'Running BigQuery job with config: {job_config._properties}')
    query_job = client.query(sql_query, job_config=job_config)
    result_rows = query_job.result()

    if query_job.error_result:
        logging.info(f'Query errors: {query_job.errors}')
        raise AirflowException(f'Query job failed: {query_job.error_result}')

    logging.info(f'Job timeline: {[t._properties for t in query_job.timeline]}')
    logging.info(f'{query_job.total_bytes_billed} bytes billed at billing tier {query_job.billing_tier}')
    logging.info(f'Total rows affected: {query_job.num_dml_affected_rows}')
Example no. 23
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud import bigquery
SCHEMA = [bigquery.SchemaField('line', 'STRING')]
bq_client = bigquery.Client(project='bigquery-public-data-staging')
table_ref = bq_client.dataset('wikipedia_pipeline').table('view_gcs')
table = bigquery.Table(table_ref, schema=SCHEMA)
extconfig = bigquery.ExternalConfig('CSV')
extconfig.schema = SCHEMA
extconfig.options.field_delimiter = u'\u00ff'
extconfig.options.quote_character = ''
extconfig.compression = 'GZIP'
extconfig.options.allow_jagged_rows = False
extconfig.options.allow_quoted_newlines = False
extconfig.max_bad_records = 10000000
extconfig.source_uris = [
    "gs://wiki-staging/dumps.wikimedia.org/other/pageviews/*"
]
table.external_data_configuration = extconfig
bq_client.create_table(table)
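The u'\u00ff' delimiter and single 'line' column above make BigQuery treat each raw file row as one STRING; a hypothetical follow-up query might split those lines in SQL:

# Hypothetical query against the external table created above; the column
# names produced by SPLIT() are illustrative.
query = """
SELECT SPLIT(line, ' ')[SAFE_OFFSET(0)] AS wiki,
       SPLIT(line, ' ')[SAFE_OFFSET(1)] AS title
FROM `wikipedia_pipeline.view_gcs`
LIMIT 10
"""
for row in bq_client.query(query).result():
    print(row.wiki, row.title)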
Example no. 24
    def create(
        self,
        path=None,
        job_config_params=None,
        partitioned=False,
        force_dataset=True,
        if_table_exists="raise",
        if_storage_data_exists="raise",
        if_table_config_exists="raise",
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in the storage,
        creates a datasets folder and BigQuery location, besides creating the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:
            - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with
            job_config_params (dict): Optional.
                Job configuration params from bigquery
            partitioned (bool): Optional.
                Whether data is partitioned
            if_table_exists (str): Optional
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.
            if_table_config_exists (str): Optional.
                 What to do if config files already exist

                 * 'raise': Raises FileExistsError
                 * 'replace': Replace with blank template
                 * 'pass': Do nothing
            if_storage_data_exists (str): Optional.
                What to do if data already exists on your bucket:

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing

        Todo:

            * Implement if_table_exists=raise
            * Implement if_table_exists=pass
        """

        if path is None:

            # Look if table data already exists at Storage
            data = self.client["storage_staging"].list_blobs(
                self.bucket_name,
                prefix=f"staging/{self.dataset_id}/{self.table_id}")

            # Raise: Cannot create table without external data
            if not data:
                raise BaseDosDadosException(
                    "You must provide a path for uploading data")

        # Add data to storage
        if isinstance(
                path,
            (
                str,
                Path,
            ),
        ):

            Storage(self.dataset_id, self.table_id,
                    **self.main_vars).upload(path,
                                             mode="staging",
                                             if_exists=if_storage_data_exists)

        # Create Dataset if it doesn't exist
        if force_dataset:

            dataset_obj = Dataset(self.dataset_id, **self.main_vars)

            try:
                dataset_obj.init()
            except FileExistsError:
                pass

            dataset_obj.create(if_exists="pass")

        self.init(
            data_sample_path=path,
            if_folder_exists="replace",
            if_table_config_exists=if_table_config_exists,
        )

        external_config = bigquery.ExternalConfig("CSV")
        external_config.options.skip_leading_rows = 1
        external_config.options.allow_quoted_newlines = True
        external_config.options.allow_jagged_rows = True
        external_config.autodetect = False
        external_config.schema = self._load_schema("staging")

        # source_uris expects a list of GCS URIs
        external_config.source_uris = [
            f"gs://{self.bucket_name}/staging/{self.dataset_id}/{self.table_id}/*"
        ]

        if partitioned:

            hive_partitioning = bigquery.external_config.HivePartitioningOptions(
            )
            hive_partitioning.mode = "AUTO"
            hive_partitioning.source_uri_prefix = self.uri.format(
                dataset=self.dataset_id, table=self.table_id).replace("*", "")
            external_config.hive_partitioning = hive_partitioning

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = external_config
        # Look up whether the table already exists
        table_ref = None
        try:
            table_ref = self.client["bigquery_staging"].get_table(
                self.table_full_name["staging"])

        except google.api_core.exceptions.NotFound:
            pass

        if isinstance(table_ref, google.cloud.bigquery.table.Table):

            if if_table_exists == "pass":

                return None

            elif if_table_exists == "raise":

                raise FileExistsError(
                    "Table already exists, choose replace if you want to overwrite it"
                )

        if if_table_exists == "replace":

            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)
Example no. 25
#!/usr/bin/env python
"""Import VAT rates from Google Sheets."""

import google.auth
from google.cloud import bigquery

# Use job config with external table definition
external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
external_config.source_uris = [
    "https://docs.google.com/spreadsheets/d/1-eUbaYIuppfwoCyawKEfNVHRx_HJfn_VBOc7c2YYS48"
]
external_config.schema = [
    bigquery.SchemaField("country_code", "STRING"),
    bigquery.SchemaField("country", "STRING"),
    bigquery.SchemaField("vat", "STRING"),
    bigquery.SchemaField("effective_date", "DATE"),
]
external_config.options.skip_leading_rows = 2
job_config = bigquery.QueryJobConfig(
    table_definitions={"vat_rates_sheet": external_config})

# Use credentials that include a google drive scope
credentials, project = google.auth.default(scopes=[
    "https://www.googleapis.com/auth/drive",
    "https://www.googleapis.com/auth/bigquery",
])
client = bigquery.Client(credentials=credentials, project=project)

query = client.query(
    """CREATE OR REPLACE TABLE
  `moz-fx-data-shared-prod`.mozilla_vpn_derived.vat_rates_v1
Example no. 26
def update_from_cloud_storage(args):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = args.google_key_path
    client = bigquery.Client()

    bucket_name = "igenie-tweets"
    blob_name = "historical/{}.json".format("tweets-raw")

    GS_URL = 'gs://{}/{}'.format(bucket_name, blob_name)
    external_config = bigquery.ExternalConfig("NEWLINE_DELIMITED_JSON")
    external_config.autodetect = True
    external_config.source_uris = [GS_URL]
    job_config = bigquery.QueryJobConfig()
    job_config.table_definitions = {"temp": external_config}

    file_name = "tweets-enriched.json"

    QUERY = ('SELECT id,'
             'id_str,'
             'constituent,'
             'text,'
             'coordinates,'
             'created_at,'
             'favorited,'
             'place,'
             'lang,'
             'metadata,'
             'retweeted,'
             'entities.hashtags,'
             'entities.symbols,'
             'source,'
             'user.time_zone,'
             'user.location,'
             'user.friends_count,'
             'user.followers_count,'
             'favorite_count,'
             'retweet_count,'
             'geo ,'
             'search_term '
             'FROM `temp`')

    TIMEOUT = 100  # in seconds

    query_job = client.query(
        QUERY, job_config=job_config)  # API request - starts the query
    assert query_job.state == 'RUNNING'

    # Waits for the query to finish
    iterator = query_job.result(timeout=TIMEOUT)

    with open(file_name, "a") as f:
        for row in iterator:
            # Included attributes
            result = {}
            result["id"] = row.id
            result['id_str'] = row.id_str
            result['text'] = row.text
            result['coordinates'] = row.coordinates
            result['favorited'] = row.favorited
            result['place'] = row.place
            result['lang'] = row.lang
            result['metadata'] = row.metadata
            result['retweeted'] = row.retweeted
            result['entities_hashtags'] = row["entities.hashtags"]
            result['entities_symbols'] = row["entities.symbols"]
            result['source'] = row.source
            result['user_time_zone'] = row["user.time_zone"]
            result['user_location'] = row["user.location"]
            result['user_friends_count'] = row["user.friends_count"]
            result['user_followers_count'] = row["user.followers_count"]
            result['favorite_count'] = row.favorite_count
            result['retweet_count'] = row.retweet_count
            result['geo'] = row.geo
            result['search_term'] = row.search_term

            # Extra attributes
            # constituent_id, constituent_name
            constituent_id, constituent_name = get_constituent_id_name(
                row.constituent)
            result['constituent_id'] = constituent_id
            result['constituent_name'] = constituent_name
            # created at - date
            result['date'] = datetime.strptime(row.created_at,
                                               '%a %b %d %H:%M:%S %z %Y')

            if not row.relevance:
                result["relevance"] = -1
            else:
                result["relevance"] = row.relevance

            # sentiment score
            result["sentiment_score"] = get_nltk_sentiment(row.text)

            update_tags(result)

            f.write(json.dumps(result, cls=MongoEncoder) + '\n')
Example no. 27
File: table.py Project: jvfe/mais
    def create(
        self,
        path=None,
        job_config_params=None,
        partitioned=False,
        if_exists="raise",
        force_dataset=True,
    ):
        """Creates BigQuery table at staging dataset.

        If you add a path, it automatically saves the data in the storage,
        creates a datasets folder and BigQuery location, besides creating the
        table and its configuration files.

        The new table should be located at `<dataset_id>_staging.<table_id>` in BigQuery.

        It looks for data saved in Storage at `<bucket_name>/staging/<dataset_id>/<table_id>/*`
        and builds the table.

        It currently supports the types:
            - Comma Delimited CSV

        Data can also be partitioned following the hive partitioning scheme
        `<key1>=<value1>/<key2>=<value2>`, for instance, `year=2012/country=BR`

        Args:
            path (str or pathlib.PosixPath): Where to find the file that you want to upload to create a table with
            job_config_params (dict): Optional.
                Job configuration params from bigquery
            partitioned (bool): Optional.
                Whether data is partitioned
            if_exists (str): Optional
                What to do if table exists

                * 'raise' : Raises Conflict exception
                * 'replace' : Replace table
                * 'pass' : Do nothing
            force_dataset (bool): Creates `<dataset_id>` folder and BigQuery Dataset if it doesn't exist.

        Todo:

            * Implement if_exists=raise
            * Implement if_exists=pass
        """

        # Add data to storage
        if isinstance(
            path,
            (
                str,
                PosixPath,
            ),
        ):

            Storage(self.dataset_id, self.table_id, **self.main_vars).upload(
                path, mode="staging", if_exists="replace"
            )

            # Create Dataset if it doesn't exist
            if force_dataset:

                dataset_obj = Dataset(self.dataset_id, **self.main_vars)

                try:
                    dataset_obj.init()
                except FileExistsError:
                    pass

                dataset_obj.create(if_exists="pass")

            self.init(data_sample_path=path, if_exists="replace")

        external_config = bigquery.ExternalConfig("CSV")
        external_config.options.skip_leading_rows = 1
        external_config.options.allow_quoted_newlines = True
        external_config.options.allow_jagged_rows = True
        external_config.autodetect = False
        external_config.schema = self._load_schema("staging")

        # source_uris expects a list of GCS URIs
        external_config.source_uris = [
            f"gs://basedosdados/staging/{self.dataset_id}/{self.table_id}/*"
        ]

        if partitioned:

            hive_partitioning = bigquery.external_config.HivePartitioningOptions()
            hive_partitioning.mode = "AUTO"
            hive_partitioning.source_uri_prefix = self.uri.format(
                dataset=self.dataset_id, table=self.table_id
            ).replace("*", "")
            external_config.hive_partitioning = hive_partitioning

        table = bigquery.Table(self.table_full_name["staging"])

        table.external_data_configuration = external_config

        if if_exists == "replace":
            self.delete(mode="staging")

        self.client["bigquery_staging"].create_table(table)

        table = bigquery.Table(self.table_full_name["staging"])
Example no. 28
    def create_external_table(self,
                              table_name,
                              source_uris: list = [],
                              schema=None,
                              source_format: str = 'NEWLINE_DELIMITED_JSON',
                              delete_if_exists=True,
                              **kwargs):
        """
        Create a BigQuery external table using source_uris as data files.

        The default source format is 'NEWLINE_DELIMITED_JSON'. For list of all available table source formats, run:
        > bq help mkdef

        If schema is omitted then table is used with autodetect schema. This only works with source formats which
        contain their own schema such as json or avro files.

        :param table_name: full table name including dataset name
        :param source_uris: list of google storage URI data file paths (ie: gs://bucket_name/myfiles/*)
        :param schema: table schema. If not defined then schema autodetect is used
        :param source_format: default is 'NEWLINE_DELIMITED_JSON'. For the full list run > bq help mkdef
        :param delete_if_exists: delete table if already exists
        :return: None
        """
        bq = self.client

        # delete existing table
        if delete_if_exists:
            self.delete_table(table_name)

        try:
            # create external table
            full_table_name = f"{bq.project}.{table_name}"
            table = bigquery.Table(full_table_name)
            # external table configurations
            external_table_config = bigquery.ExternalConfig(
                source_format=source_format)
            if schema:
                external_table_config.schema = schema
            else:
                external_table_config.autodetect = True
            external_table_config.ignore_unknown_values = True
            # table_config.max_bad_records = 10
            for k, v in kwargs.items():
                if hasattr(external_table_config, k):
                    setattr(external_table_config, k, v)
                elif hasattr(external_table_config.options, k):
                    setattr(external_table_config.options, k, v)
            external_table_config.source_uris = source_uris
            table.external_data_configuration = external_table_config

            logger.info(
                f"creating bigquery external table: {table.dataset_id}.{table.table_id} \t "
                f"format:'{external_table_config.source_format}' "
                f"source_uris: {external_table_config.source_uris}")

            # create the external table
            table = bq.create_table(table)

            logger.info(
                f"created bigquery external table: {table.dataset_id}.{table.table_id} "
                f"timestamp: {table.created}, location: {table.location}")
            return table
        except google_exceptions.BadRequest as err:
            logger.fatal(
                f"critical error while creating bigquery external table.")
            logger.fatal(str(err))
            raise err
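A usage sketch for the method above; `bq_wrapper` stands in for an instance of the class that defines create_external_table() and delete_table(), and the dataset, table, and URI are illustrative:

# Hypothetical call to the wrapper's create_external_table().
table = bq_wrapper.create_external_table(
    table_name="example_dataset.events_external",
    source_uris=["gs://example-bucket/events/*"],
    source_format="NEWLINE_DELIMITED_JSON",  # schema omitted, so autodetect is used
)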
Example no. 29
def _bq_client_create_external_table(
    table_name,
    schema_fields,
    source_objects,
    source_format,
    geojson=False,
    hive_options=None,
    bucket=None,
    post_hook=None,
):
    # TODO: must be fully qualified table name
    ext = bigquery.ExternalConfig(source_format)
    ext.source_uris = source_objects
    ext.autodetect = schema_fields is None
    ext.ignore_unknown_values = True

    if geojson:
        ext.json_extension = "GEOJSON"

    if hive_options:
        assert (len(source_objects) == 1
                ), "cannot use hive partitioning with more than one URI"
        opt = bigquery.external_config.HivePartitioningOptions()
        # _Strongly_ recommend using CUSTOM mode and explicitly-defined
        # key schema for more than a trivial number of files
        opt.mode = hive_options.get("mode", "AUTO")
        opt.require_partition_filter = hive_options.get(
            "require_partition_filter", False)
        # TODO: this is very fragile, we should probably be calculating it from
        #       the source_objects and validating the format (prefix, trailing slashes)
        prefix = hive_options["source_uri_prefix"]

        if prefix and bucket:
            opt.source_uri_prefix = bucket + "/" + prefix
        else:
            opt.source_uri_prefix = prefix
        ext.hive_partitioning = opt

    client = bigquery.Client(project=get_project_id(),
                             location=CALITP_BQ_LOCATION)

    dataset_name, _ = table_name.split(".")
    full_dataset_name = ".".join((get_project_id(), dataset_name))

    try:
        client.get_dataset(full_dataset_name)
    except NotFound:
        print(f"Dataset {full_dataset_name} not found, creating.")
        dataset = bigquery.Dataset(full_dataset_name)
        dataset.location = "us-west2"
        client.create_dataset(dataset, timeout=30)

    # for some reason, you can set the project name in the bigquery client, and
    # it doesn't need to be in the SQL code, but this bigquery API still requires
    # the fully qualified table name when initializing a Table.
    full_table_name = f"{get_project_id()}.{table_name}"

    # First delete table if it exists
    print(f"Deleting external table if exists: {full_table_name}")
    client.delete_table(full_table_name, not_found_ok=True)

    # (re)create table
    tbl = bigquery.Table(full_table_name, schema_fields)
    tbl.external_data_configuration = ext

    print(
        f"Creating external table: {full_table_name} {tbl} {source_objects} {hive_options}"
    )
    created_table = client.create_table(tbl, timeout=300, exists_ok=True)

    if post_hook:
        client.query(post_hook).result()
        print(f"Successfully ran {post_hook}")

    return created_table
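A hedged invocation sketch for the helper above; get_project_id(), CALITP_BQ_LOCATION, and the real GCS layout come from the original project, while the literal values here are illustrative:

# Hypothetical call to _bq_client_create_external_table().
created = _bq_client_create_external_table(
    table_name="example_dataset.gtfs_schedule_external",
    schema_fields=None,                                  # None -> autodetect
    source_objects=["gs://example-bucket/gtfs/*.jsonl"],
    source_format="NEWLINE_DELIMITED_JSON",
    hive_options={"mode": "AUTO", "source_uri_prefix": "gtfs/"},
    bucket="gs://example-bucket",
)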
Example no. 30
    def execute(self, data):

        #setup
        client = self.client
        bigquery = self.bigquery
        datetime = self.datetime
        pytz = self.pytz
        time = self.time
        name = data.get("titleName")
        emails = data.get("emails")
        query = data.get("query")
        table = ""
        #

        # create a dataset first if needed
        dataset_main = self.make_dataset()
        table_id = "{}.{}".format(dataset_main, name)
        #

        # create external table
        if (self.env.get("create_external_table")):
            try:
                # Configure the external data source
                dataset_id = dataset_main
                table_id = "{}.{}".format(dataset_main, query)
                schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                table = bigquery.Table(table_id, schema=schema)
                external_config = bigquery.ExternalConfig("CSV")
                external_config.source_uris = [
                    "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
                ]
                external_config.options.skip_leading_rows = 1  # optionally skip header row
                table.external_data_configuration = external_config

                # Create a permanent table linked to the GCS file
                table = client.create_table(table)  # API request

                # Example query to find states starting with 'W'
                sql = 'SELECT * FROM `{}` WHERE name LIKE "W%"'.format(
                    table_id)

                query_job = client.query(sql)  # API request

                w_states = list(query_job)  # Waits for query to finish
                return "There are {} states with names starting with W. we pulled the data from us-states.csv in cloud storage".format(
                    len(w_states))
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # create temp external table
        elif (self.env.get("create_temp_external_table")):
            try:
                schema = ["filename", "name"]
                # Configure the external data source and query job.
                external_config = bigquery.ExternalConfig("CSV")
                external_config.source_uris = [
                    "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
                ]
                external_config.schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                external_config.options.skip_leading_rows = 1
                table_id = "usa_states"
                job_config = bigquery.QueryJobConfig(
                    table_definitions={table_id: external_config})

                # Example query to find states starting with 'W'.
                sql = """
                SELECT _FILE_NAME AS {},{} FROM `{}` WHERE name LIKE "W%"
                
                """.format(schema[0], schema[1], table_id)
                query_job = client.query(
                    sql, job_config=job_config)  # Make an API request.
                query_job.result()
                return json.dumps({
                    "schema": [{
                        "field": x
                    } for x in schema],
                    "data": [
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]: row[schema[0]],
                            schema[1]: row[schema[1]]
                        } for row in query_job
                    ]
                })
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # drive create external table
        elif (self.env.get("drive_create_external_table")):
            try:
                dataset_id = dataset_main

                # Configure the external data source.
                dataset = client.get_dataset(dataset_id)
                table_id = query
                schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                table = bigquery.Table(dataset.table(table_id), schema=schema)
                external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
                # Use a shareable link or grant viewing access to the email address you
                # used to authenticate with BigQuery (this example Sheet is public).
                sheet_url = (
                    "https://docs.google.com/spreadsheets/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
                )
                external_config.source_uris = [sheet_url]
                external_config.options.skip_leading_rows = 1  # Optionally skip header row.
                external_config.options.range = (
                    "us-states!A20:B49"  # Optionally set range of the sheet to query from.
                )
                table.external_data_configuration = external_config

                # Create a permanent table linked to the Sheets file.
                table = client.create_table(table)  # Make an API request.

                # Example query to find states starting with "W".
                sql = 'SELECT * FROM `{}.{}` WHERE name LIKE "W%"'.format(
                    dataset_id, table_id)

                query_job = client.query(sql)  # Make an API request.

                # Wait for the query to complete.
                w_states = list(query_job)
                return "There are {} states with names starting with W in the selected range. this data came from google drive".format(
                    len(w_states))

            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        # drive create temp external table
        elif (self.env.get("drive_create_temp_external_table")):
            try:
                schema = ["name", "post_abbr"]
                # Configure the external data source and query job.
                external_config = bigquery.ExternalConfig("GOOGLE_SHEETS")
                sheet_url = (
                    "https://docs.google.com/spreadsheets"
                    "/d/1i_QCL-7HcSyUZmIbP9E6lO_T5u3HnpLe7dnpHaijg_E/edit?usp=sharing"
                )
                external_config.source_uris = [sheet_url]
                external_config.schema = [
                    bigquery.SchemaField("name", "STRING"),
                    bigquery.SchemaField("post_abbr", "STRING"),
                ]
                external_config.options.skip_leading_rows = 1  # Optionally skip header row.
                external_config.options.range = (
                    "us-states!A20:B49"  # Optionally set range of the sheet to query from.
                )
                table_id = "usa_states"
                job_config = bigquery.QueryJobConfig(
                    table_definitions={table_id: external_config})

                # Example query to find states starting with 'W'.
                sql = """
                SELECT * FROM `{}` WHERE name LIKE "W%"
                """.format(table_id)
                query_job = client.query(
                    sql, job_config=job_config)  # Make an API request.
                query_job.result()
                [print(row) for row in query_job]
                return json.dumps({
                    "schema": [{
                        "field": x
                    } for x in schema],
                    "data": [
                        # Row values can be accessed by field name or index.
                        {
                            schema[0]: row[schema[0]],
                            schema[1]: row[schema[1]]
                        } for row in query_job
                    ]
                })
            except BaseException as e:
                print('my custom error\n')
                print(e.__class__.__name__)
                print('\n')
                print(e)
                return 'an error occurred; check the output from the backend'
        #

        return "Check the backend env dictionary you did set it so the backend didnt do anything"