def execute(self, context):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s ; Max Results: %s',
                      self.dataset_id, self.table_id, self.max_results)

        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to)

        conn = hook.get_conn()
        cursor = conn.cursor()
        response = cursor.get_tabledata(dataset_id=self.dataset_id,
                                        table_id=self.table_id,
                                        max_results=self.max_results,
                                        selected_fields=self.selected_fields)

        self.log.info('Total Extracted rows: %s', response['totalRows'])
        rows = response['rows']

        table_data = []
        for dict_row in rows:
            single_row = []
            for fields in dict_row['f']:
                single_row.append(fields['v'])
            table_data.append(single_row)

        return table_data
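This execute appears to match the contrib BigQueryGetDataOperator; a minimal usage sketch under that assumption (the DAG id, dataset, table, and field list are placeholders):

from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.bigquery_get_data import BigQueryGetDataOperator

dag = DAG('example_bq_get_data', start_date=datetime(2019, 1, 1),
          schedule_interval='@daily')

get_data = BigQueryGetDataOperator(
    task_id='get_data',
    dataset_id='my_dataset',            # placeholder dataset
    table_id='my_table',                # placeholder table
    max_results='100',                  # passed through to get_tabledata
    selected_fields='id,name',          # comma-separated column list
    bigquery_conn_id='bigquery_default',
    dag=dag,
)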
 def execute(self, context):
     logging.info('Deleting: %s', self.deletion_dataset_table)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_table_delete(self.deletion_dataset_table, self.ignore_if_missing)
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs,
            labels=self.labels
        )
    def execute(self, context):
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        schema_fields = self.schema_fields if self.schema_fields else json.loads(
            gcs_hook.download(self.bucket, self.schema_object))
        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            logging.info('Loaded BQ data with max %s.%s=%s',
                         self.destination_project_dataset_table,
                         self.max_id_key, max_id)
            return max_id
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.sql)
         hook = BigQueryHook(
             bigquery_conn_id=self.bigquery_conn_id,
             use_legacy_sql=self.use_legacy_sql,
             delegate_to=self.delegate_to,
             location=self.location,
         )
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(
         sql=self.sql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         allow_large_results=self.allow_large_results,
         flatten_results=self.flatten_results,
         udf_config=self.udf_config,
         maximum_billing_tier=self.maximum_billing_tier,
         maximum_bytes_billed=self.maximum_bytes_billed,
         create_disposition=self.create_disposition,
         query_params=self.query_params,
         labels=self.labels,
         schema_update_options=self.schema_update_options,
         priority=self.priority,
         time_partitioning=self.time_partitioning,
         api_resource_configs=self.api_resource_configs,
         cluster_fields=self.cluster_fields,
     )
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.gcs_schema_object:

            gcs_bucket, gcs_object = _parse_gcs_url(self.gcs_schema_object)

            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                gcs_bucket,
                gcs_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_table(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            schema_fields=schema_fields,
            time_partitioning=self.time_partitioning,
            labels=self.labels
        )
 def poke(self, context):
     table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
     self.log.info('Sensor checks existence of table: %s', table_uri)
     hook = BigQueryHook(
         bigquery_conn_id=self.bigquery_conn_id,
         delegate_to=self.delegate_to)
     return hook.table_exists(self.project_id, self.dataset_id, self.table_id)
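This poke matches the contrib BigQueryTableSensor; a minimal usage sketch under that assumption (DAG id, project, dataset, and table names are placeholders):

from datetime import datetime

from airflow import DAG
from airflow.contrib.sensors.bigquery_sensor import BigQueryTableSensor

dag = DAG('example_bq_table_sensor', start_date=datetime(2019, 1, 1),
          schedule_interval='@daily')

wait_for_table = BigQueryTableSensor(
    task_id='wait_for_table',
    project_id='my-project',            # placeholder project
    dataset_id='my_dataset',            # placeholder dataset
    table_id='my_table',                # placeholder table
    bigquery_conn_id='bigquery_default',
    poke_interval=300,                  # re-check every 5 minutes
    dag=dag,
)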
Example #8
 def execute(self, context):
     logging.info('Executing: %s', self.bql)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition,
                      self.allow_large_results, self.udf_config, self.use_legacy_sql)
 def execute(self, context):
     logging.info('Executing copy of %s into: %s',
                  self.source_project_dataset_tables,
                  self.destination_project_dataset_table)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_copy(
         self.source_project_dataset_tables,
         self.destination_project_dataset_table,
         self.write_disposition,
         self.create_disposition)
Example #10
 def execute(self, context):
     logging.info('Executing extract of %s into: %s',
                  self.source_dataset_table,
                  self.destination_cloud_storage_uris)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
     hook.run_extract(
         self.source_dataset_table,
         self.destination_cloud_storage_uris,
         self.compression,
         self.export_format,
         self.field_delimiter,
         self.print_header)
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.bql)
         hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(self.bql, self.destination_dataset_table, self.write_disposition,
                      self.allow_large_results, self.udf_config,
                      self.use_legacy_sql, self.maximum_billing_tier,
                      self.create_disposition, self.query_params)
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_empty_dataset(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            dataset_reference=self.dataset_reference)
 def execute(self, context):
     self.log.info('Executing extract of %s into: %s',
                   self.source_project_dataset_table,
                   self.destination_cloud_storage_uris)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_extract(
         self.source_project_dataset_table,
         self.destination_cloud_storage_uris,
         self.compression,
         self.export_format,
         self.field_delimiter,
         self.print_header)
Example #14
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        datasets = cursor.get_datasets_list(project_id=self.project_id)
        dataset_ids = list(d['datasetReference']['datasetId']
                           for d in datasets)

        if self.dataset_id not in dataset_ids:
            cursor.create_empty_dataset(
                project_id=self.project_id,
                dataset_id=self.dataset_id,
                dataset_reference=self.dataset_reference)
    def init(self):
        self.log.info(f"init() is started")

        # bucket connection
        self.gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id, delegate_to=None)

        # bigquery connection
        self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcs_conn_id, use_legacy_sql=False)
        bq_conn = self.bq_hook.get_conn()
        self.bq_cursor = bq_conn.cursor()

        # geotab connection
        self.geotab_hook = GeotabHook(geotab_conn_id=self.geotab_conn_id)

        params = self.geotab_hook.get_connection(self.geotab_conn_id)
        self.log.info("login: %s, password: %s, schema: %s",
                      params.login, params.password, params.schema)
 def execute(self, context):
     self.log.info('Fetching last partition from tables: {}'.format(
         str(self.table_lst)))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     return_dict = {}
     for each_table in self.table_lst:
         project = each_table.split(':')[0]
         dataset = each_table.split(':')[1].split('.')[0]
         table_name = each_table.split(':')[1].split('.')[1]
         lp = sorted(hook.table_list_partition(project, dataset,
                                               table_name))[-1]
         return_dict[each_table] = lp
         self.log.info("Table {} has latest partition: {}".format(
             each_table, lp))
     return return_dict
Example #17
 def execute(self, context):
     self.log.info(
         'Executing copy of %s into: %s',
         self.source_project_dataset_tables, self.destination_project_dataset_table
     )
     hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_copy(
         source_project_dataset_tables=self.source_project_dataset_tables,
         destination_project_dataset_table=self.destination_project_dataset_table,
         write_disposition=self.write_disposition,
         create_disposition=self.create_disposition,
         labels=self.labels,
         encryption_configuration=self.encryption_configuration)
Example #18
 def execute(self, context):
     self.log.info('Executing extract of %s into: %s',
                   self.source_project_dataset_table,
                   self.destination_cloud_storage_uris)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_extract(
         source_project_dataset_table=self.source_project_dataset_table,
         destination_cloud_storage_uris=self.destination_cloud_storage_uris,
         compression=self.compression,
         export_format=self.export_format,
         field_delimiter=self.field_delimiter,
         print_header=self.print_header,
         labels=self.labels)
Example #19
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                                  and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(
                gcs_hook.download(self.bucket,
                                  self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info('Loaded BQ data with max %s.%s=%s',
                          self.destination_project_dataset_table,
                          self.max_id_key, max_id)
            return max_id
def save_hash_reference(input_rows,
                        project_id=None,
                        dataset=None,
                        table=None,
                        schema=None):

    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default',
                           use_legacy_sql=False)

    gcp_credentials = bq_hook._get_credentials()

    bq_client = bigquery.Client(credentials=gcp_credentials,
                                project=project_id)

    target_dataset_ref = bigquery.DatasetReference(project=project_id,
                                                   dataset_id=dataset)

    try:
        target_dataset = bq_client.get_dataset(dataset_ref=target_dataset_ref)
        print("Dataset found: ", target_dataset)
    except NotFound as ex:
        # LOGGER.info(f"Dataset '{target_dataset_ref}' not found, attempting to create.")
        print("Dataset not found")
        target_dataset = bq_client.create_dataset(dataset=target_dataset_ref)
        print("Dataset created: ", target_dataset)

    target_table_ref = bigquery.TableReference(dataset_ref=target_dataset,
                                               table_id=table)

    try:
        target_table = bq_client.get_table(table=target_table_ref)
        print("Table found: ", target_table)
    except NotFound as ex:
        print("Table not found")
        t = bigquery.Table(table_ref=target_table_ref, schema=schema)
        target_table = bq_client.create_table(table=t)
        print("Table created: ", target_table)

    insert_rows = {
        "timestamp": datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%f'),
        "s3_data": input_rows
    }
    print("Rows to insert: ", input_rows)
    print("Target Table: ", target_table)
    error = bq_client.insert_rows_json(table=target_table,
                                       json_rows=[insert_rows])
    print(error)
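A hedged usage sketch for save_hash_reference: the schema below mirrors the two keys inserted above, and every identifier is an illustrative placeholder.

from google.cloud import bigquery

# Schema matching the "timestamp" and "s3_data" keys written by save_hash_reference.
reference_schema = [
    bigquery.SchemaField('timestamp', 'TIMESTAMP'),
    bigquery.SchemaField('s3_data', 'STRING'),
]

save_hash_reference(
    input_rows='example-s3-object-hash',   # placeholder payload
    project_id='my-project',
    dataset='my_reference_dataset',
    table='my_reference_table',
    schema=reference_schema,
)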
Example #21
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                                  and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket,
                self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                       for source_object in self.source_objects]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key,
                self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table, self.max_id_key, max_id
            )
            return max_id
Example #22
class GoogleDisplayVideo360ERFToBigQueryOperator(BaseOperator):
    """Upload Multiple Entity Read Files to specified big query dataset.
    """
    def __init__(self,
                 gcp_conn_id='google_cloud_default',
                 report_body=None,
                 yesterday=False,
                 entity_type=None,
                 file_creation_date=None,
                 cloud_project_id=None,
                 bq_table=None,
                 schema=None,
                 gcs_bucket=None,
                 erf_bucket=None,
                 partner_ids=None,
                 write_disposition='WRITE_TRUNCATE',
                 *args,
                 **kwargs):
        super(GoogleDisplayVideo360ERFToBigQueryOperator,
              self).__init__(*args, **kwargs)
        self.gcp_conn_id = gcp_conn_id
        self.service = None
        self.bq_hook = None
        self.gcs_hook = None
        self.report_body = report_body
        self.erf_bucket = erf_bucket
        self.yesterday = yesterday
        self.cloud_project_id = cloud_project_id
        self.bq_table = bq_table
        self.gcs_bucket = gcs_bucket
        self.schema = schema
        self.entity_type = entity_type
        self.erf_object = 'entity/%s.0.%s.json' % (file_creation_date,
                                                   entity_type)
        self.partner_ids = partner_ids or []  # avoid a shared mutable default
        self.write_disposition = write_disposition
        self.file_creation_date = file_creation_date

    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)

        for i, partner_id in enumerate(self.partner_ids):
            filename = erf_utils.download_and_transform_erf(self, partner_id)
            entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
            if i > 0:
                self.write_disposition = 'WRITE_APPEND'

            bq_base_cursor = self.bq_hook.get_conn().cursor()
            bq_base_cursor.run_load(
                destination_project_dataset_table=self.bq_table,
                schema_fields=self.schema,
                source_uris=[entity_read_file_ndj],
                source_format='NEWLINE_DELIMITED_JSON',
                write_disposition=self.write_disposition)
            self.gcs_hook.delete(self.gcs_bucket, filename)
Example #23
def execute_big_queries(bigquery_conn_id,
                        multi_sqls,
                        sql_separator=";",
                        use_legacy_sql=False,
                        **kwargs):

    hook = BigQueryHook(bigquery_conn_id=bigquery_conn_id)
    conn = hook.get_conn()
    logging.info("Execute : " + multi_sqls)
    for sql in multi_sqls.split(sql_separator):

        cursor = conn.cursor()
        cursor.run_query(bql=sql,
                         destination_dataset_table=None,
                         allow_large_results=True,
                         use_legacy_sql=False)
    logging.info("Execute : Done ")
    def execute(self, context):
        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            location=self.location,
        )

        records = self.run_query(project=hook._get_field("project"),
                                 credentials=hook._get_credentials())

        if not records:
            raise AirflowException("Query returned no results.")
        elif not all([bool(record) for record in records]):
            raise AirflowException(
                f"Test failed\nQuery: {self.sql}\nRecords: {records}")

        self.log.info(f"Test passed\nQuery: {self.sql}\nRecords: {records}")
    def execute(self, context):
        self.log.info('Fetching Data from:')
        self.log.info('Query: %s', self.sql)

        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to)

        conn = hook.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        response = cursor.fetchmany(self.max_rows)

        self.log.info('Total Extracted rows: %s', len(response))

        self.log.info('Response: %s', response)

        return response
Example #26
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.bql)
         hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                             use_legacy_sql=self.use_legacy_sql,
                             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(
         self.bql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         allow_large_results=self.allow_large_results,
         udf_config=self.udf_config,
         maximum_billing_tier=self.maximum_billing_tier,
         create_disposition=self.create_disposition,
         query_params=self.query_params,
         schema_update_options=self.schema_update_options)
Example #27
    def execute(self, context):
        # TODO: Fetch the schema from the hub if no schema is passed as an argument

        sql = self.sql.format(table=self.destination_project_dataset_table_id,
                              staging=self.staging_project_dataset_table_id,
                              cols=','.join(self.schema),
                              hash=self.schema[0])

        self.log.info('Executing SQL:\n%s', sql)

        self.hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                 use_legacy_sql=False)

        self.conn = self.hook.get_conn()

        self.cursor = self.conn.cursor()

        self.cursor.run_query(sql)
Example #28
def bq_to_pubsub_query_executor(**kwargs):
    """Executes a custom detector query in BigQuery and passes the results to the next task"""

    query = kwargs['templates_dict']['query']
    logging.info(query)
    bigquery_hook = BigQueryHook(use_legacy_sql=False)
    df = bigquery_hook.get_pandas_df(sql=query)

    messages = [{
        'data': b64e(row.to_json().encode()).decode()
    } for index, row in df.iterrows()]
    """splitting the array to 1000 size chunks (PubSub limit)"""
    messages_chunks = chunks(messages, 1000)
    pubsub_hoook = PubSubHook()
    for chunk in messages_chunks:
        pubsub_hoook.publish(project=gcp_project,
                             topic=pubsub_topic,
                             messages=chunk)
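The callable above reads its query from templates_dict and relies on module-level gcp_project and pubsub_topic being defined; a hedged Airflow 1.x wiring sketch (DAG id and query are placeholders):

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG('example_detector', start_date=datetime(2019, 1, 1),
          schedule_interval='@hourly')

run_detector = PythonOperator(
    task_id='run_detector',
    python_callable=bq_to_pubsub_query_executor,
    templates_dict={
        'query': 'SELECT * FROM `my-project.my_dataset.events` '
                 'WHERE event_date = "{{ ds }}"',   # placeholder query
    },
    provide_context=True,   # Airflow 1.x: needed so templates_dict reaches **kwargs
    dag=dag,
)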
    def execute(self, context):
        """ 
            This method check the quality of the data given the input test cases indicated in the sql_test_cases
            dictionary.
        """
        bigquery = BigQueryHook(bigquery_conn_id=self.conn_id)
        found_errors = []
        for query, expected_result in self.sql_test_cases.items():
            records = bigquery.run_query(sql=query)
            if len(records) < 1 or records[0][0] != expected_result:
                found_errors.append(query)

        if len(found_errors) > 0:
            raise ValueError(
                f"The following query test cases were not successful {found_errors}"
            )

        self.log.info('DataQualityOperator has been executed')
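A hedged usage sketch: the class name and constructor arguments (conn_id, sql_test_cases) are inferred from the execute() above and may differ in the original project.

data_quality_checks = DataQualityOperator(   # assumed class name, defined with the execute() above
    task_id='data_quality_checks',
    conn_id='bigquery_default',
    sql_test_cases={
        # query text -> expected value of the first column of the first row
        'SELECT COUNT(*) FROM `my_dataset.my_table` WHERE id IS NULL': 0,
    },
    dag=dag,   # assumes a DAG object named dag is in scope
)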
Example #30
    def execute(self, context):
        dst_table_name = format_table_name(self.dst_table_name,
                                           is_staging=True)

        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        bucket = get_bucket()
        src_uris = f"{bucket}/{self.src_uris}"

        cursor.run_load(
            dst_table_name,
            source_uris=src_uris,
            schema_fields=self.schema_fields,
            autodetect=self.autodetect,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
        )
 def execute(self, context):
     if self.bq_cursor is None:
         self.log.info('Executing: %s', self.bql)
         hook = BigQueryHook(
             bigquery_conn_id=self.bigquery_conn_id,
             use_legacy_sql=self.use_legacy_sql,
             delegate_to=self.delegate_to)
         conn = hook.get_conn()
         self.bq_cursor = conn.cursor()
     self.bq_cursor.run_query(
         self.bql,
         destination_dataset_table=self.destination_dataset_table,
         write_disposition=self.write_disposition,
         allow_large_results=self.allow_large_results,
         udf_config=self.udf_config,
         maximum_billing_tier=self.maximum_billing_tier,
         create_disposition=self.create_disposition,
         query_params=self.query_params,
         schema_update_options=self.schema_update_options)
Example #32
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object:
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(
                gcs_hook.download(self.bucket,
                                  self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            schema_update_options=self.schema_update_options)

        if self.max_id_key:
            cursor.execute('SELECT MAX({}) FROM {}'.format(
                self.max_id_key, self.destination_project_dataset_table))
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            logging.info('Loaded BQ data with max {}.{}={}'.format(
                self.destination_project_dataset_table, self.max_id_key,
                max_id))
            return max_id
Example #33
    def bq_get_last_modified(self):
        logging.info("Connecting to Big Query")
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
        bq_conn = bq_hook.get_connection(self.bigquery_conn_id)
        bq_conn_extra_json = bq_conn.extra
        bq_conn_extra = json.loads(bq_conn_extra_json)
        service_dict = bq_conn_extra['extra__google_cloud_platform__keyfile_dict']

        sql = """
            #standardSQL
            SELECT last_modified_time AS TS
            FROM `{0}.{1}.__TABLES__`
            WHERE table_id = '{2}'
            """.format(self.project_id, self.dataset, self.table_name)

        logging.info("Getting table last_modified_time from BQ with SQL:/n{0}".format(sql))
        df = read_gbq(sql, dialect='standard', project_id=self.project_id, private_key = service_dict)
        logging.info("Got table!")

        ts = str(df['TS'][0])
        return ts
Example #34
    def execute(self, context):
        if self.gcs_hook is None:
            self.gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.gcp_conn_id)
        if self.bq_hook is None:
            self.bq_hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id)

        for i, partner_id in enumerate(self.partner_ids):
            filename = erf_utils.download_and_transform_erf(self, partner_id)
            entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
            if i > 0:
                self.write_disposition = 'WRITE_APPEND'

            bq_base_cursor = self.bq_hook.get_conn().cursor()
            bq_base_cursor.run_load(
                destination_project_dataset_table=self.bq_table,
                schema_fields=self.schema,
                source_uris=[entity_read_file_ndj],
                source_format='NEWLINE_DELIMITED_JSON',
                write_disposition=self.write_disposition)
            self.gcs_hook.delete(self.gcs_bucket, filename)
Example #35
    def bq_cursor(self):
        remote_conn_id = conf.get('core', 'REMOTE_LOG_CONN_ID')
        try:
            from airflow.contrib.hooks.bigquery_hook import BigQueryHook

            return BigQueryHook(bigquery_conn_id=remote_conn_id,
                                use_legacy_sql=False).get_conn().cursor()
        except Exception as e:
            self.log.error(
                'Could not create a BigQueryHook with connection id '
                '"%s". %s\n\nPlease make sure that the BigQuery '
                'connection exists.', remote_conn_id, str(e))
Example #36
 def execute(self, context):
     for i in range(len(self.source_project_dataset_table)):
         try:
             self.log.info('Executing %d/%d extracts', i+1, len(self.source_project_dataset_table))
             self.log.info('Executing extract of %s into: %s',
                           self.source_project_dataset_table[i],
                           self.destination_cloud_storage_uris[i])
             hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                 delegate_to=self.delegate_to)
             conn = hook.get_conn()
             cursor = conn.cursor()
             cursor.run_extract(
                 self.source_project_dataset_table[i],
                 self.destination_cloud_storage_uris[i],
                 self.compression,
                 self.export_format,
                 self.field_delimiter,
                 self.print_header,
                 self.labels)
         except Exception as e:
             self.log.error('Exception: %s', e)
             self.log.info('Wait %d seconds retry', self.lazy_retry_wait)
             time.sleep(self.lazy_retry_wait)
             hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                 delegate_to=self.delegate_to)
             conn = hook.get_conn()
             cursor = conn.cursor()
             cursor.run_extract(
                 self.source_project_dataset_table[i],
                 self.destination_cloud_storage_uris[i],
                 self.compression,
                 self.export_format,
                 self.field_delimiter,
                 self.print_header,
                 self.labels)
Example #37
 def get_hook(self):
     try:
         if self.conn_type == 'mysql':
             from airflow.hooks.mysql_hook import MySqlHook
             return MySqlHook(mysql_conn_id=self.conn_id)
         elif self.conn_type == 'google_cloud_platform':
             from airflow.contrib.hooks.bigquery_hook import BigQueryHook
             return BigQueryHook(bigquery_conn_id=self.conn_id)
         elif self.conn_type == 'postgres':
             from airflow.hooks.postgres_hook import PostgresHook
             return PostgresHook(postgres_conn_id=self.conn_id)
         elif self.conn_type == 'hive_cli':
             from airflow.hooks.hive_hooks import HiveCliHook
             return HiveCliHook(hive_cli_conn_id=self.conn_id)
         elif self.conn_type == 'presto':
             from airflow.hooks.presto_hook import PrestoHook
             return PrestoHook(presto_conn_id=self.conn_id)
         elif self.conn_type == 'hiveserver2':
             from airflow.hooks.hive_hooks import HiveServer2Hook
             return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
         elif self.conn_type == 'sqlite':
             from airflow.hooks.sqlite_hook import SqliteHook
             return SqliteHook(sqlite_conn_id=self.conn_id)
         elif self.conn_type == 'jdbc':
             from airflow.hooks.jdbc_hook import JdbcHook
             return JdbcHook(jdbc_conn_id=self.conn_id)
         elif self.conn_type == 'mssql':
             from airflow.hooks.mssql_hook import MsSqlHook
             return MsSqlHook(mssql_conn_id=self.conn_id)
         elif self.conn_type == 'oracle':
             from airflow.hooks.oracle_hook import OracleHook
             return OracleHook(oracle_conn_id=self.conn_id)
         elif self.conn_type == 'vertica':
             from airflow.contrib.hooks.vertica_hook import VerticaHook
             return VerticaHook(vertica_conn_id=self.conn_id)
         elif self.conn_type == 'cloudant':
             from airflow.contrib.hooks.cloudant_hook import CloudantHook
             return CloudantHook(cloudant_conn_id=self.conn_id)
         elif self.conn_type == 'jira':
             from airflow.contrib.hooks.jira_hook import JiraHook
             return JiraHook(jira_conn_id=self.conn_id)
         elif self.conn_type == 'redis':
             from airflow.contrib.hooks.redis_hook import RedisHook
             return RedisHook(redis_conn_id=self.conn_id)
         elif self.conn_type == 'wasb':
             from airflow.contrib.hooks.wasb_hook import WasbHook
             return WasbHook(wasb_conn_id=self.conn_id)
         elif self.conn_type == 'docker':
             from airflow.hooks.docker_hook import DockerHook
             return DockerHook(docker_conn_id=self.conn_id)
     except:
         pass
Example #38
    def execute(self, context):
        dest = self.destination_file
        sql = self.sql
        logging.info("Connecting to Big Query")
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
        bq_conn = bq_hook.get_connection(self.bigquery_conn_id)
        bq_conn_extra_json = bq_conn.extra
        bq_conn_extra = json.loads(bq_conn_extra_json)
        service_dict = bq_conn_extra['extra__google_cloud_platform__keyfile_dict']

        logging.info("Getting table from BQ with SQL:/n{0}".format(sql))
        df = read_gbq(sql, dialect='standard', private_key = service_dict)
        logging.info("Got table!")

        #logging.info('\tSaving to... {}'.format(save_dir))
        #if not os.path.isdir(save_dir):
        #    os.mkdir(save_dir)
        logging.info("Writing table to disk in feather format")
        feather.write_dataframe(df, dest)

        logging.info("Table written to {0}".format(dest))
        return df.info()
Example #39
  def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id=self.conn_id)
    partner_ids = models.Variable.get('partner_ids').split(',')
    for i, partner_id in enumerate(partner_ids):
      filename = download_and_transform_erf(self, partner_id)
      entity_read_file_ndj = 'gs://%s/%s' % (self.gcs_bucket, filename)
      hook = BigQueryHook(bigquery_conn_id=self.conn_id)
      self.service = hook.get_service()
      if i == 0:
        write_disposition = 'WRITE_TRUNCATE'
      else:
        write_disposition = 'WRITE_APPEND'

      bq_base_cursor = BigQueryBaseCursor(self.service, self.cloud_project_id)
      bq_base_cursor.run_load(
          self.bq_table,
          [entity_read_file_ndj],
          schema_fields=self.schema,
          source_format='NEWLINE_DELIMITED_JSON',
          write_disposition=write_disposition,
          ignore_unknown_values=True)
      gcs_hook.delete(self.gcs_bucket, filename)
 def execute(self, context=None):
     self.log.info('Executing SQL check: %s', self.sql)
     hook = BigQueryHook(
         bigquery_conn_id=self.bigquery_conn_id,
         use_legacy_sql=self.use_legacy_sql,
         delegate_to=self.delegate_to)
     records = hook.get_first(self.sql)
     self.log.info('Record: %s', records)
     branch_to_follow = self.pass_task
     if not records:
         self.log.info('The query returned None')
         branch_to_follow = self.fail_task
     elif not all([bool(r) for r in records]):
         exceptstr = 'Test failed.\nQuery:\n{q}\nResults:\n{r!s}'
         self.log.info(exceptstr.format(q=self.sql, r=records))
         branch_to_follow = self.fail_task
     downstream_tasks = context['task'].downstream_list
     self.log.info('Following branch %s', branch_to_follow)
     self.log.info('Downstream task_ids %s', downstream_tasks)
     skip_tasks = [t for t in downstream_tasks if t.task_id != branch_to_follow]
     if downstream_tasks:
         self.skip(context['dag_run'], context['ti'].execution_date, skip_tasks)
    def execute(self, context):
        if self.bq_cursor is None:
            self.log.info('Executing: %s', self.sql)
            hook = BigQueryHook(
                bigquery_conn_id=self.bigquery_conn_id,
                use_legacy_sql=self.use_legacy_sql,
                delegate_to=self.delegate_to,
                location=self.location,
            )
            conn = hook.get_conn()
            self.bq_cursor = conn.cursor()
        job_id = self.bq_cursor.run_query(
            sql=self.sql,
            destination_dataset_table=self.destination_dataset_table,
            write_disposition=self.write_disposition,
            allow_large_results=self.allow_large_results,
            flatten_results=self.flatten_results,
            udf_config=self.udf_config,
            maximum_billing_tier=self.maximum_billing_tier,
            maximum_bytes_billed=self.maximum_bytes_billed,
            create_disposition=self.create_disposition,
            query_params=self.query_params,
            labels=self.labels,
            schema_update_options=self.schema_update_options,
            priority=self.priority,
            time_partitioning=self.time_partitioning,
            api_resource_configs=self.api_resource_configs,
            cluster_fields=self.cluster_fields,
        )
        context['task_instance'].xcom_push(key='job_id', value=job_id)

        df = hook.get_pandas_df(self.sql)

        if self.sort_by is not None:
            df = df.sort_values(self.sort_by)

        list_to_return = df.astype(str).to_dict('index')
        print(list_to_return)
        return list_to_return
Example #42
def create_big_query_table():

    bq_hook = BigQueryHook(bigquery_conn_id='bigquery_default',
                           use_legacy_sql=False)

    gcp_credentials = bq_hook._get_credentials()

    bq_client = bigquery.Client(credentials=gcp_credentials,
                                project=bigquery_project)

    target_dataset_ref = bigquery.DatasetReference(
        project=bigquery_project, dataset_id=reference_dataset)

    try:
        target_dataset = bq_client.get_dataset(dataset_ref=target_dataset_ref)
    except NotFound as ex:
        # LOGGER.info(f"Dataset '{target_dataset_ref}' not found, attempting to create.")
        target_dataset = bq_client.create_dataset(dataset=target_dataset_ref)
    target_table_ref = bigquery.TableReference(dataset_ref=target_dataset,
                                               table_id=reference_table)

    bq_client.delete_table(table=target_table_ref)
Example #43
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        if not self.schema_fields and self.schema_object \
                and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(
                gcs_hook.download(self.bucket,
                                  self.schema_object).decode("utf-8"))
        else:
            schema_fields = self.schema_fields

        source_uris = [
            'gs://{}/{}'.format(self.bucket, source_object)
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs,
            labels=self.labels,
            encryption_configuration=self.encryption_configuration)
Example #44
    def _bq_get_data(self):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s', self.dataset_id,
                      self.table_id)

        hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                            delegate_to=self.delegate_to,
                            location=self.location)

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        while True:
            response = cursor.get_tabledata(
                dataset_id=self.dataset_id,
                table_id=self.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size)

            if 'rows' in response:
                rows = response['rows']
            else:
                self.log.info('Job Finished')
                return

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            table_data = []
            for dict_row in rows:
                single_row = []
                for fields in dict_row['f']:
                    single_row.append(fields['v'])
                table_data.append(single_row)

            yield table_data
            i += 1
    def execute(self, context):
        bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                               delegate_to=self.delegate_to)

        logging.info('start_date_str = %s', self.start_date_str)
        logging.info('end_date_str = %s', self.end_date_str)
        logging.info('Date conversion starts')
        start = str2date(self.start_date_str)
        end = str2date(self.end_date_str)
        logging.info('Date conversion ends')
        logging.info('time_partitioning = %s', self.time_partitioning)

        for i in daterange(start, end):
            date_no_dash = i.strftime("%Y%m%d")
            partitioned_table_id = self.table_id + date_no_dash
            logging.info("Partitioned table {0}".format(partitioned_table_id))

            logging.info('Hooks to check if table exists <%s:%s.%s>',
                         self.project_id, self.dataset_id,
                         partitioned_table_id)
            table_exists = bq_hook.table_exists(self.project_id,
                                                self.dataset_id,
                                                partitioned_table_id)
            if not table_exists:
                logging.info('Table <%s> does not exist',
                             partitioned_table_id)
                logging.info('Connects to BigQuery')
                cursor = BigQueryHelperCursor(bq_hook.get_service(),
                                              self.project_id)

                logging.info('Creates the empty table %s with the schema %s',
                             partitioned_table_id, self.schema_fields)
                cursor.create_empty_table(
                    project_id=self.project_id,
                    dataset_id=self.dataset_id,
                    table_id=partitioned_table_id,
                    schema_fields=self.schema_fields,
                    time_partitioning=self.time_partitioning)
 def execute(self, context):
     logging.info('Executing: %s', str(self.bql))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
     hook.run(self.bql, self.destination_dataset_table)
Example #47
 def execute(self, context):
     logging.info('Executing: %s', str(self.bql))
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to)
     hook.run(self.bql, self.destination_dataset_table, self.write_disposition)