import json

from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook


def bqcreate():
    # `tableresource` is assumed to be a JSON string describing the table, defined elsewhere.
    BigQueryHook().create_empty_table(
        project_id="project-staging",
        dataset_id="test",
        table_id="partitiontable",
        table_resource=json.loads(tableresource),
    )
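
A minimal sketch of the JSON string that `tableresource` is assumed to carry, reusing the `json` import above. The keys follow the BigQuery `Table` REST resource (tableReference, schema, timePartitioning); the concrete schema and partition column are illustrative assumptions, not taken from the snippet.

# Illustrative only: a day-partitioned table resource; the schema fields are assumptions.
tableresource = json.dumps({
    "tableReference": {
        "projectId": "project-staging",
        "datasetId": "test",
        "tableId": "partitiontable",
    },
    "schema": {
        "fields": [
            {"name": "event_date", "type": "DATE", "mode": "REQUIRED"},
            {"name": "payload", "type": "STRING", "mode": "NULLABLE"},
        ]
    },
    "timePartitioning": {"type": "DAY", "field": "event_date"},
})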
 def tearDown(self):
     self.delete_gcs_bucket(BUCKET)
     with provide_gcp_context(GCP_BIGQUERY_KEY, scopes=SCOPES):
         hook = BigQueryHook()
         hook.delete_dataset(dataset_id='airflow_test',
                             delete_contents=True)
         super().tearDown()
Example #3
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        table_ref = TableReference.from_string(self.source_project_dataset_table, hook.project_id)

        configuration: Dict[str, Any] = {
            'extract': {
                'sourceTable': table_ref.to_api_repr(),
                'compression': self.compression,
                'destinationUris': self.destination_cloud_storage_uris,
                'destinationFormat': self.export_format,
            }
        }

        if self.labels:
            configuration['labels'] = self.labels

        if self.export_format == 'CSV':
            # Only set fieldDelimiter and printHeader fields if using CSV.
            # Google does not like it if you set these fields for other export
            # formats.
            configuration['extract']['fieldDelimiter'] = self.field_delimiter
            configuration['extract']['printHeader'] = self.print_header

        hook.insert_job(configuration=configuration)
Example #4
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        job_id = hook.run_extract(
            source_project_dataset_table=self.source_project_dataset_table,
            destination_cloud_storage_uris=self.destination_cloud_storage_uris,
            compression=self.compression,
            export_format=self.export_format,
            field_delimiter=self.field_delimiter,
            print_header=self.print_header,
            labels=self.labels,
        )

        job = hook.get_job(job_id=job_id).to_api_repr()
        conf = job["configuration"]["extract"]["sourceTable"]
        dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
        BigQueryTableLink.persist(
            context=context,
            task_instance=self,
            dataset_id=dataset_id,
            project_id=project_id,
            table_id=table_id,
        )
Example #5
    def execute(self, context: 'Context') -> None:
        self.log.info(
            'Executing copy of %s into: %s',
            self.source_project_dataset_tables,
            self.destination_project_dataset_table,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            job_id = hook.run_copy(
                source_project_dataset_tables=self.source_project_dataset_tables,
                destination_project_dataset_table=self.destination_project_dataset_table,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                labels=self.labels,
                encryption_configuration=self.encryption_configuration,
            )

            job = hook.get_job(job_id=job_id).to_api_repr()
            conf = job["configuration"]["copy"]["destinationTable"]
            BigQueryTableLink.persist(
                context=context,
                task_instance=self,
                dataset_id=conf["datasetId"],
                project_id=conf["projectId"],
                table_id=conf["tableId"],
            )
def func_appy_bq(**kwargs):
    ti = kwargs['ti']
    ts = ti.xcom_pull(task_ids='gen_table_resource')
    BigQueryHook().create_empty_table(project_id="mygcp-project",
                                      dataset_id='test',
                                      table_id="partition_demo",
                                      table_resource=json.loads(ts))
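
A minimal sketch of how func_appy_bq could be wired into a DAG, assuming the function and its imports (json, BigQueryHook) live in the same file. The DAG id, schedule, and the body of the upstream gen_table_resource callable are illustrative assumptions; only the task_id 'gen_table_resource' and the XCom hand-off are taken from the snippet above.

import json
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator


def gen_table_resource(**kwargs):
    # The returned value is pushed to XCom and pulled by func_appy_bq via ti.xcom_pull.
    # The partitioning spec here is an illustrative assumption.
    return json.dumps({
        "tableReference": {"projectId": "mygcp-project", "datasetId": "test", "tableId": "partition_demo"},
        "timePartitioning": {"type": "DAY"},
    })


with DAG("bq_partition_demo", start_date=datetime(2022, 1, 1), schedule_interval=None, catchup=False) as dag:
    gen = PythonOperator(task_id="gen_table_resource", python_callable=gen_table_resource)
    create = PythonOperator(task_id="create_partition_table", python_callable=func_appy_bq)
    gen >> create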
Example #7
 def execute(self, context: 'Context') -> None:
     big_query_hook = BigQueryHook(
         gcp_conn_id=self.gcp_conn_id,
         delegate_to=self.delegate_to,
         location=self.location,
         impersonation_chain=self.impersonation_chain,
     )
     project_id, dataset_id, table_id = self.source_project_dataset_table.split('.')
     BigQueryTableLink.persist(
         context=context,
         task_instance=self,
         dataset_id=dataset_id,
         project_id=project_id,
         table_id=table_id,
     )
     mssql_hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id,
                            schema=self.database)
     for rows in bigquery_get_data(
             self.log,
             self.dataset_id,
             self.table_id,
             big_query_hook,
             self.batch_size,
             self.selected_fields,
     ):
         mssql_hook.insert_rows(
             table=self.mssql_table,
             rows=rows,
             target_fields=self.selected_fields,
             replace=self.replace,
         )
    def execute(self, context: 'Context') -> None:
        self.log.info(
            'Executing copy of %s into: %s',
            self.source_project_dataset_tables,
            self.destination_project_dataset_table,
        )
        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DeprecationWarning)
            hook.run_copy(
                source_project_dataset_tables=self.source_project_dataset_tables,
                destination_project_dataset_table=self.destination_project_dataset_table,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                labels=self.labels,
                encryption_configuration=self.encryption_configuration,
            )
Example #9
 def poke(self, context):
     table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id,
                                      self.table_id)
     self.log.info('Sensor checks existence of table: %s', table_uri)
     hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                         delegate_to=self.delegate_to)
     return hook.table_exists(self.project_id, self.dataset_id,
                              self.table_id)
Example #10
    def extract_google_analytics_data(gcp_conn_id: str, start_date: str,
                                      end_date: str, locations: list):
        bq_hook = BigQueryHook(gcp_conn_id=gcp_conn_id)

        cities = ','.join(['"{}"'.format(c["name"]) for c in locations])
        countries = ','.join(['"{}"'.format(c["country"]) for c in locations])
        states = ','.join(['"{}"'.format(c["state"]) for c in locations])

        query = """SELECT 
                        fullVisitorId,
                        date,
                        geoNetwork.city as city,
                        geoNetwork.country as country,
                        geoNetwork.region as region,
                        product.v2ProductCategory as productCategory,
                        product.v2ProductName as productName,
                        hits.eCommerceAction.action_type as action_type,
                        hits.eCommerceAction.step as action_step,
                        product.productQuantity as quantity,
                        product.productPrice as price,
                        product.productRevenue as revenue,
                        product.isImpression as isImpression,
                        hits.transaction.transactionId as transactionId,
                        hits.transaction.transactionRevenue as transactionRevenue,
                        hits.transaction.transactionTax as transactionTax,
                        hits.transaction.transactionShipping as transactionShipping
                    FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
                        UNNEST(hits) as hits,
                        UNNEST(hits.product) as product
                    WHERE 
                        _TABLE_SUFFIX BETWEEN '{start_date}' AND '{end_date}'
                    AND
                        geoNetwork.country IN ({countries})
                    AND
                        geoNetwork.city IN ({cities})
                    AND 
                        geoNetwork.region IN ({states})
                    ORDER BY 
                        date ASC""".format(start_date=start_date,
                                           end_date=end_date,
                                           countries=countries,
                                           cities=cities,
                                           states=states)

        df = bq_hook.get_pandas_df(sql=query, dialect="standard")

        df["location_name"] = df.apply(lambda row: get_location_string(
            row['country'], row['city'], row['region']),
                                       axis=1)
        df["date"] = pd.to_datetime(df["date"])
        df["price"] = df["price"] / (10**6)
        df["revenue"] = df["revenue"] / (10**6)
        df["transactionRevenue"] = df["transactionRevenue"] / (10**6)
        df["transactionTax"] = df["transactionTax"] / (10**6)
        df["transactionShipping"] = df["transactionShipping"] / (10**6)

        df = df.set_index(["date", "location_name"])
        return df.to_csv(date_format="%Y-%m-%d %H:%M:%S")
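
The `locations` argument above is iterated as a list of dicts with "name", "country", and "state" keys. A hypothetical call might look like the following; the values are made up, and the function is invoked directly here only for illustration (in the original it is presumably registered as an Airflow task).

locations = [
    {"name": "Seattle", "country": "United States", "state": "Washington"},
    {"name": "Portland", "country": "United States", "state": "Oregon"},
]
csv_data = extract_google_analytics_data(
    gcp_conn_id="google_cloud_default",
    start_date="20170101",  # ga_sessions_* _TABLE_SUFFIX values are YYYYMMDD strings
    end_date="20170131",
    locations=locations,
)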
 def poke(self, context: dict) -> bool:
     table_uri = f'{self.project_id}:{self.dataset_id}.{self.table_id}'
     self.log.info('Sensor checks existence of table: %s', table_uri)
     hook = BigQueryHook(
         bigquery_conn_id=self.bigquery_conn_id,
         delegate_to=self.delegate_to,
         impersonation_chain=self.impersonation_chain,
     )
     return hook.table_exists(project_id=self.project_id,
                              dataset_id=self.dataset_id,
                              table_id=self.table_id)
Example #12
 def poke(self, context: 'Context') -> bool:
     table_uri = f'{self.project_id}:{self.dataset_id}.{self.table_id}'
     self.log.info('Sensor checks existence of partition: "%s" in table: %s', self.partition_id, table_uri)
     hook = BigQueryHook(
         gcp_conn_id=self.gcp_conn_id,
         delegate_to=self.delegate_to,
         impersonation_chain=self.impersonation_chain,
     )
     return hook.table_partition_exists(
         project_id=self.project_id,
         dataset_id=self.dataset_id,
         table_id=self.table_id,
         partition_id=self.partition_id,
     )
Example #13
 def poke(self, context: dict) -> bool:
     table_uri = '{0}:{1}.{2}'.format(self.project_id, self.dataset_id, self.table_id)
     self.log.info('Sensor checks existence of partition: "%s" in table: %s', self.partition_id, table_uri)
     hook = BigQueryHook(
         bigquery_conn_id=self.bigquery_conn_id,
         delegate_to=self.delegate_to,
         impersonation_chain=self.impersonation_chain,
     )
     return hook.table_partition_exists(
         project_id=self.project_id,
         dataset_id=self.dataset_id,
         table_id=self.table_id,
         partition_id=self.partition_id,
     )
 def execute(self, context):
     self.log.info('Executing extract of %s into: %s',
                   self.source_project_dataset_table,
                   self.destination_cloud_storage_uris)
     hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                         delegate_to=self.delegate_to,
                         location=self.location)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_extract(
         source_project_dataset_table=self.source_project_dataset_table,
         destination_cloud_storage_uris=self.destination_cloud_storage_uris,
         compression=self.compression,
         export_format=self.export_format,
         field_delimiter=self.field_delimiter,
         print_header=self.print_header,
         labels=self.labels)
Example #15
 def execute(self, context):
     self.log.info('Executing copy of %s into: %s',
                   self.source_project_dataset_tables,
                   self.destination_project_dataset_table)
     hook = BigQueryHook(bigquery_conn_id=self.gcp_conn_id,
                         delegate_to=self.delegate_to,
                         location=self.location)
     conn = hook.get_conn()
     cursor = conn.cursor()
     cursor.run_copy(
         source_project_dataset_tables=self.source_project_dataset_tables,
          destination_project_dataset_table=self.destination_project_dataset_table,
         write_disposition=self.write_disposition,
         create_disposition=self.create_disposition,
         labels=self.labels,
         encryption_configuration=self.encryption_configuration)
Example #16
    def _bq_get_data(self):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s', self.dataset_id,
                      self.table_id)

        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        while True:
            response = cursor.get_tabledata(
                dataset_id=self.dataset_id,
                table_id=self.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )

            if 'rows' in response:
                rows = response['rows']
            else:
                self.log.info('Job Finished')
                return

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            table_data = []
            for dict_row in rows:
                single_row = []
                for fields in dict_row['f']:
                    single_row.append(fields['v'])
                table_data.append(single_row)

            yield table_data
            i += 1
    def _bq_get_data(self):

        hook = BigQueryHook(
            bigquery_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        table_ref = TableReference.from_string(
            self.source_project_dataset_table)
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s, Table: %s', table_ref.dataset_id,
                      table_ref.table_id)

        conn = hook.get_conn()
        cursor = conn.cursor()
        i = 0
        while True:
            response = cursor.get_tabledata(
                dataset_id=table_ref.dataset_id,
                table_id=table_ref.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )

            if 'rows' not in response:
                self.log.info('Job Finished')
                return

            rows = response['rows']

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            table_data = [[fields['v'] for fields in dict_row['f']] for dict_row in rows]

            yield table_data
            i += 1
Example #18
    def execute(self, context):
        bq = BigQueryHook(gcp_conn_id=self.gcp_conn_id, use_legacy_sql=False)

        total_rows = []

        # self.log.info("{credentials}")
        for table in self.table_list:
            q = self.sql.format(table)
            response = bq.get_pandas_df(q)
            if response.empty:
                response_value = 0
            else:
                response_value = response['f0_'].iloc[0]
            total_rows.append(response_value)

        if sum(total_rows) != self.pass_value:
            logging.info(sum(total_rows))
            logging.info('Tests failed')
            logging.info(total_rows)
            logging.info(self.pass_value)
            raise ValueError('Data quality check failed')
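
A hypothetical instantiation of the row-count check above. The class name BigQueryRowCountCheckOperator and the concrete tables and pass value are assumptions; the parameter names mirror the attributes used in execute() (sql is formatted once per entry in table_list, and the unaliased COUNT(*) comes back as column f0_).

check_counts = BigQueryRowCountCheckOperator(
    task_id="check_row_counts",
    gcp_conn_id="google_cloud_default",
    sql="SELECT COUNT(*) FROM `{}`",  # `{}` is filled with each table name via self.sql.format(table)
    table_list=["project.dataset.table_a", "project.dataset.table_b"],
    pass_value=42,  # expected total row count across all listed tables
)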
Example #19
 def execute(self, context: 'Context'):
     self.log.info(
         'Executing extract of %s into: %s',
         self.source_project_dataset_table,
         self.destination_cloud_storage_uris,
     )
     hook = BigQueryHook(
         gcp_conn_id=self.gcp_conn_id,
         delegate_to=self.delegate_to,
         location=self.location,
         impersonation_chain=self.impersonation_chain,
     )
     hook.run_extract(
         source_project_dataset_table=self.source_project_dataset_table,
         destination_cloud_storage_uris=self.destination_cloud_storage_uris,
         compression=self.compression,
         export_format=self.export_format,
         field_delimiter=self.field_delimiter,
         print_header=self.print_header,
         labels=self.labels,
     )
Example #20
 def load(gcp_conn_id: str, combined_data: str, gcs_bucket: str,
          gcs_object: str):
     gcs_hook = GCSHook(gcp_conn_id=gcp_conn_id)
     gcs_hook.upload(bucket_name=gcs_bucket,
                     data=combined_data,
                     object_name=gcs_object)
     bq_hook = BigQueryHook(bigquery_conn_id=gcp_conn_id)
     bq_hook.run_load(
          destination_project_dataset_table="augmented-works-297410.demo_dataset.sales_interactions2",
          source_uris=["gs://{}/{}".format(gcs_bucket, gcs_object)],
         write_disposition="WRITE_APPEND",
         source_format="CSV",
         skip_leading_rows=1,
         autodetect=False,
         schema_fields=[
             bigquery.SchemaField("date", "DATETIME").to_api_repr(),
             bigquery.SchemaField("location_name", "STRING").to_api_repr(),
             bigquery.SchemaField("average_temp", "FLOAT").to_api_repr(),
             bigquery.SchemaField("fullVisitorId", "STRING").to_api_repr(),
             bigquery.SchemaField("city", "STRING").to_api_repr(),
             bigquery.SchemaField("country", "STRING").to_api_repr(),
             bigquery.SchemaField("region", "STRING").to_api_repr(),
             bigquery.SchemaField("productCategory",
                                  "STRING").to_api_repr(),
             bigquery.SchemaField("productName", "STRING").to_api_repr(),
             bigquery.SchemaField("action_type", "INTEGER").to_api_repr(),
             bigquery.SchemaField("action_step", "INTEGER").to_api_repr(),
             bigquery.SchemaField("quantity", "FLOAT").to_api_repr(),
             bigquery.SchemaField("price", "FLOAT").to_api_repr(),
             bigquery.SchemaField("revenue", "FLOAT").to_api_repr(),
             bigquery.SchemaField("isImpression", "BOOL").to_api_repr(),
             bigquery.SchemaField("transactionId", "STRING").to_api_repr(),
             bigquery.SchemaField("transactionRevenue",
                                  "FLOAT").to_api_repr(),
             bigquery.SchemaField("transactionTax", "FLOAT").to_api_repr(),
             bigquery.SchemaField("transactionShipping",
                                  "FLOAT").to_api_repr(),
         ])
 def execute(self, context: 'Context') -> None:
     big_query_hook = BigQueryHook(
         gcp_conn_id=self.gcp_conn_id,
         delegate_to=self.delegate_to,
         location=self.location,
         impersonation_chain=self.impersonation_chain,
     )
     mysql_hook = MySqlHook(schema=self.database, mysql_conn_id=self.mysql_conn_id)
     for rows in bigquery_get_data(
         self.log,
         self.dataset_id,
         self.table_id,
         big_query_hook,
         self.batch_size,
         self.selected_fields,
     ):
         mysql_hook.insert_rows(
             table=self.mysql_table,
             rows=rows,
             target_fields=self.selected_fields,
             replace=self.replace,
         )
Example #22
    def _bq_get_data(self):
        self.log.info('Fetching Data from:')
        self.log.info('Dataset: %s ; Table: %s', self.dataset_id,
                      self.table_id)

        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        i = 0
        while True:
            response = hook.list_rows(
                dataset_id=self.dataset_id,
                table_id=self.table_id,
                max_results=self.batch_size,
                selected_fields=self.selected_fields,
                start_index=i * self.batch_size,
            )
            rows = [dict(r) for r in response]
            if len(rows) == 0:
                self.log.info('Job Finished')
                return

            self.log.info('Total Extracted rows: %s',
                          len(rows) + i * self.batch_size)

            # hook.list_rows returns Row objects keyed by column name, so take the
            # values of each dict directly (there is no legacy 'f'/'v' layout here).
            table_data = [list(dict_row.values()) for dict_row in rows]

            yield table_data
            i += 1
    def execute(self, context):
        bq_hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GCSHook(
                    gcp_conn_id=self.google_cloud_storage_conn_id,
                    delegate_to=self.delegate_to,
                    impersonation_chain=self.impersonation_chain,
                )
                blob = gcs_hook.download(
                    bucket_name=self.bucket,
                    object_name=self.schema_object,
                )
                schema_fields = json.loads(blob.decode("utf-8"))
            elif self.schema_object is None and self.autodetect is False:
                raise AirflowException(
                    'At least one of `schema_fields`, `schema_object`, or `autodetect` must be passed.'
                )
            else:
                schema_fields = None

        else:
            schema_fields = self.schema_fields

        source_uris = [
            f'gs://{self.bucket}/{source_object}'
            for source_object in self.source_objects
        ]
        conn = bq_hook.get_conn()
        cursor = conn.cursor()

        if self.external_table:
            cursor.create_external_table(
                external_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                compression=self.compression,
                skip_leading_rows=self.skip_leading_rows,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                src_fmt_configs=self.src_fmt_configs,
                encryption_configuration=self.encryption_configuration,
                labels=self.labels,
                description=self.description,
            )
        else:
            cursor.run_load(
                destination_project_dataset_table=self.destination_project_dataset_table,
                schema_fields=schema_fields,
                source_uris=source_uris,
                source_format=self.source_format,
                autodetect=self.autodetect,
                create_disposition=self.create_disposition,
                skip_leading_rows=self.skip_leading_rows,
                write_disposition=self.write_disposition,
                field_delimiter=self.field_delimiter,
                max_bad_records=self.max_bad_records,
                quote_character=self.quote_character,
                ignore_unknown_values=self.ignore_unknown_values,
                allow_quoted_newlines=self.allow_quoted_newlines,
                allow_jagged_rows=self.allow_jagged_rows,
                encoding=self.encoding,
                schema_update_options=self.schema_update_options,
                src_fmt_configs=self.src_fmt_configs,
                time_partitioning=self.time_partitioning,
                cluster_fields=self.cluster_fields,
                encryption_configuration=self.encryption_configuration,
                labels=self.labels,
                description=self.description,
            )

        if cursor.use_legacy_sql:
            escaped_table_name = f'[{self.destination_project_dataset_table}]'
        else:
            escaped_table_name = f'`{self.destination_project_dataset_table}`'

        if self.max_id_key:
            cursor.execute(
                f'SELECT MAX({self.max_id_key}) FROM {escaped_table_name}')
            row = cursor.fetchone()
            max_id = row[0] if row[0] else 0
            self.log.info(
                'Loaded BQ data with max %s.%s=%s',
                self.destination_project_dataset_table,
                self.max_id_key,
                max_id,
            )
Example #24
    def execute(self, context: 'Context'):
        bq_hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )

        if not self.schema_fields:
            if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
                gcs_hook = GCSHook(
                    gcp_conn_id=self.gcp_conn_id,
                    delegate_to=self.delegate_to,
                    impersonation_chain=self.impersonation_chain,
                )
                blob = gcs_hook.download(
                    bucket_name=self.bucket,
                    object_name=self.schema_object,
                )
                schema_fields = json.loads(blob.decode("utf-8"))
            else:
                schema_fields = None
        else:
            schema_fields = self.schema_fields

        self.source_objects = (
            self.source_objects if isinstance(self.source_objects, list) else [self.source_objects]
        )
        source_uris = [
            f'gs://{self.bucket}/{source_object}'
            for source_object in self.source_objects
        ]

        if self.external_table:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                bq_hook.create_external_table(
                    external_project_dataset_table=self.destination_project_dataset_table,
                    schema_fields=schema_fields,
                    source_uris=source_uris,
                    source_format=self.source_format,
                    autodetect=self.autodetect,
                    compression=self.compression,
                    skip_leading_rows=self.skip_leading_rows,
                    field_delimiter=self.field_delimiter,
                    max_bad_records=self.max_bad_records,
                    quote_character=self.quote_character,
                    ignore_unknown_values=self.ignore_unknown_values,
                    allow_quoted_newlines=self.allow_quoted_newlines,
                    allow_jagged_rows=self.allow_jagged_rows,
                    encoding=self.encoding,
                    src_fmt_configs=self.src_fmt_configs,
                    encryption_configuration=self.encryption_configuration,
                    labels=self.labels,
                    description=self.description,
                )
        else:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                bq_hook.run_load(
                    destination_project_dataset_table=self.destination_project_dataset_table,
                    schema_fields=schema_fields,
                    source_uris=source_uris,
                    source_format=self.source_format,
                    autodetect=self.autodetect,
                    create_disposition=self.create_disposition,
                    skip_leading_rows=self.skip_leading_rows,
                    write_disposition=self.write_disposition,
                    field_delimiter=self.field_delimiter,
                    max_bad_records=self.max_bad_records,
                    quote_character=self.quote_character,
                    ignore_unknown_values=self.ignore_unknown_values,
                    allow_quoted_newlines=self.allow_quoted_newlines,
                    allow_jagged_rows=self.allow_jagged_rows,
                    encoding=self.encoding,
                    schema_update_options=self.schema_update_options,
                    src_fmt_configs=self.src_fmt_configs,
                    time_partitioning=self.time_partitioning,
                    cluster_fields=self.cluster_fields,
                    encryption_configuration=self.encryption_configuration,
                    labels=self.labels,
                    description=self.description,
                )

        if self.max_id_key:
            select_command = f'SELECT MAX({self.max_id_key}) FROM `{self.destination_project_dataset_table}`'
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", DeprecationWarning)
                job_id = bq_hook.run_query(
                    sql=select_command,
                    use_legacy_sql=False,
                )
            row = list(bq_hook.get_job(job_id).result())
            if row:
                max_id = row[0] if row[0] else 0
                self.log.info(
                    'Loaded BQ data with max %s.%s=%s',
                    self.destination_project_dataset_table,
                    self.max_id_key,
                    max_id,
                )
            else:
                raise RuntimeError(f"The {select_command} returned no rows!")
Example #25
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.providers.mysql.hooks.mysql import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.providers.google.cloud.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.providers.postgres.hooks.postgres import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.providers.apache.pig.hooks.pig import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.providers.apache.hive.hooks.hive import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.providers.presto.hooks.presto import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.providers.sqlite.hooks.sqlite import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.providers.jdbc.hooks.jdbc import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.providers.microsoft.mssql.hooks.mssql import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'odbc':
         from airflow.providers.odbc.hooks.odbc import OdbcHook
         return OdbcHook(odbc_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.providers.oracle.hooks.oracle import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.providers.vertica.hooks.vertica import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.providers.cloudant.hooks.cloudant import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.providers.jira.hooks.jira import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.providers.redis.hooks.redis import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.providers.docker.hooks.docker import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.providers.microsoft.azure.hooks.azure_data_lake import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.providers.microsoft.azure.hooks.azure_cosmos import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.providers.apache.cassandra.hooks.cassandra import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.providers.mongo.hooks.mongo import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.providers.google.cloud.hooks.cloud_sql import CloudSQLDatabaseHook
         return CloudSQLDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.providers.grpc.hooks.grpc import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
Example #26
    def execute(self, context: 'Context'):
        self.log.info(
            'Executing extract of %s into: %s',
            self.source_project_dataset_table,
            self.destination_cloud_storage_uris,
        )
        hook = BigQueryHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            location=self.location,
            impersonation_chain=self.impersonation_chain,
        )
        self.hook = hook

        configuration = self._prepare_configuration()
        job_id = hook.generate_job_id(
            job_id=self.job_id,
            dag_id=self.dag_id,
            task_id=self.task_id,
            logical_date=context["logical_date"],
            configuration=configuration,
            force_rerun=self.force_rerun,
        )

        try:
            self.log.info("Executing: %s", configuration)
            job: ExtractJob = hook.insert_job(
                job_id=job_id,
                configuration=configuration,
                project_id=self.project_id,
                location=self.location,
                timeout=self.result_timeout,
                retry=self.result_retry,
            )
            self._handle_job_error(job)
        except Conflict:
            # If the job already exists retrieve it
            job = hook.get_job(
                project_id=self.project_id,
                location=self.location,
                job_id=job_id,
            )
            if job.state in self.reattach_states:
                # We are reattaching to a job
                job.result(timeout=self.result_timeout,
                           retry=self.result_retry)
                self._handle_job_error(job)
            else:
                # Same job configuration so we need force_rerun
                raise AirflowException(
                    f"Job with id: {job_id} already exists and is in {job.state} state. If you "
                    f"want to force rerun it consider setting `force_rerun=True`."
                    f"Or, if you want to reattach in this scenario add {job.state} to `reattach_states`"
                )

        conf = job.to_api_repr()["configuration"]["extract"]["sourceTable"]
        dataset_id, project_id, table_id = conf["datasetId"], conf["projectId"], conf["tableId"]
        BigQueryTableLink.persist(
            context=context,
            task_instance=self,
            dataset_id=dataset_id,
            project_id=project_id,
            table_id=table_id,
        )