Example #1
    def test_execute_bad_type(self, mock_hook):
        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql=1,
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            bigquery_conn_id='google_cloud_default',
            udf_config=None,
            use_legacy_sql=True,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
        )

        with self.assertRaises(AirflowException):
            operator.execute(MagicMock())
Example #2
    def test_bigquery_operator_defaults(self, mock_hook):
        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
        )

        operator.execute(None)
        mock_hook.return_value \
            .get_conn() \
            .cursor() \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
            )
Example #3
    def test_bigquery_operator_extra_link(self, mock_hook):
        bigquery_task = BigQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
            dag=self.dag,
        )
        self.dag.clear()

        ti = TaskInstance(
            task=bigquery_task,
            execution_date=DEFAULT_DATE,
        )

        job_id = '12345'
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j={job_id}'.format(job_id=job_id),
            bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
        )

        self.assertEqual(
            '',
            bigquery_task.get_extra_links(datetime(2019, 1, 1), BigQueryConsoleLink.name),
        )
Example #4
    def test_bigquery_operator_extra_link(self, mock_hook):
        bigquery_task = BigQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
            dag=self.dag,
        )
        self.dag.clear()

        ti = TaskInstance(
            task=bigquery_task,
            execution_date=DEFAULT_DATE,
        )

        job_id = '12345'
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j={job_id}'.format(
                job_id=job_id),
            bigquery_task.get_extra_links(DEFAULT_DATE,
                                          BigQueryConsoleLink.name),
        )

        self.assertEqual(
            '',
            bigquery_task.get_extra_links(datetime(2019, 1, 1),
                                          BigQueryConsoleLink.name),
        )
Example #5
    def test_bigquery_operator_extra_link_when_multiple_query(self, mock_hook, session):
        bigquery_task = BigQueryOperator(
            task_id=TASK_ID,
            sql=['SELECT * FROM test_table', 'SELECT * FROM test_table2'],
            dag=self.dag,
        )
        self.dag.clear()
        session.query(XCom).delete()

        ti = TaskInstance(
            task=bigquery_task,
            execution_date=DEFAULT_DATE,
        )

        job_id = ['123', '45']
        ti.xcom_push(key='job_id', value=job_id)

        self.assertEqual(
            {'BigQuery Console #1', 'BigQuery Console #2'},
            bigquery_task.operator_extra_link_dict.keys()
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=123',
            bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #1'),
        )

        self.assertEqual(
            'https://console.cloud.google.com/bigquery?j=45',
            bigquery_task.get_extra_links(DEFAULT_DATE, 'BigQuery Console #2'),
        )
Example #6
    def test_bigquery_operator_defaults(self, mock_hook):
        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
        )

        operator.execute(None)
        mock_hook.return_value \
            .get_conn() \
            .cursor() \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
            )
Example #7
    def test_bigquery_operator_defaults(self, mock_hook):

        operator = BigQueryOperator(task_id=TASK_ID,
                                    sql='Select * from test_table',
                                    dag=self.dag,
                                    default_args=self.args)

        operator.execute(None)
        mock_hook.return_value \
            .get_conn() \
            .cursor() \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
            )

        self.assertTrue(isinstance(operator.sql, six.string_types))
        ti = TaskInstance(task=operator, execution_date=DEFAULT_DATE)
        ti.render_templates()
        self.assertTrue(isinstance(ti.task.sql, six.string_types))
Example #8
    def execute(self, context):
        # TODO: check whether 'hasattr' covers this case
        try:
            self.sql = self.SQL_TEMPLATE.format(**self.sql_template_params)
        except AttributeError:
            self.sql = self.sql.format(**self.sql_template_params)

        BigQueryOperator.execute(self, context)
Example #9
    def test_bigquery_operator_extra_link_when_missing_job_id(self, mock_hook, session):
        bigquery_task = BigQueryOperator(
            task_id=TASK_ID,
            sql='SELECT * FROM test_table',
            dag=self.dag,
        )
        self.dag.clear()
        session.query(XCom).delete()

        self.assertEqual(
            '',
            bigquery_task.get_extra_links(DEFAULT_DATE, BigQueryConsoleLink.name),
        )
Example #10
    def test_execute(self, mock_hook):
        encryption_configuration = {'key': 'kk'}

        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
            destination_dataset_table=None,
            write_disposition='WRITE_EMPTY',
            allow_large_results=False,
            flatten_results=None,
            gcp_conn_id='google_cloud_default',
            udf_config=None,
            use_legacy_sql=True,
            maximum_billing_tier=None,
            maximum_bytes_billed=None,
            create_disposition='CREATE_IF_NEEDED',
            schema_update_options=(),
            query_params=None,
            labels=None,
            priority='INTERACTIVE',
            time_partitioning=None,
            api_resource_configs=None,
            cluster_fields=None,
            encryption_configuration=encryption_configuration
        )

        operator.execute(MagicMock())
        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
                encryption_configuration=encryption_configuration
            )
Example #11
def execute_sql(task_id: str, sql_file_path: str) -> BigQueryOperator:
    return BigQueryOperator(task_id=task_id,
                            sql=sql_file_path,
                            bigquery_conn_id=BIG_QUERY_CONN_ID,
                            write_disposition='WRITE_APPEND',
                            use_legacy_sql=False,
                            location='US')
Example #12
def gc_tasks(name, schema, next_task=DummyOperator(task_id="Done")):
    bq_staging = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_source }}}}.{name}"
    bq_warehouse = f"{{{{ var.value.gc_project_id }}}}.{{{{ var.value.bq_dataset_target }}}}.{name}"

    t1 = GoogleCloudStorageToBigQueryOperator(
        task_id=f"staging_{name}",
        bucket="{{var.value.gcs_bucket}}",
        source_objects=[f"{name}*"],
        destination_project_dataset_table=bq_staging,
        write_disposition="WRITE_TRUNCATE",
        schema_fields=schema,
        skip_leading_rows=1,
    )

    t2 = BigQueryOperator(
        task_id=f"merge_{name}_into_warehouse",
        sql=_create_merge_sql(bq_staging, bq_warehouse, schema),
        use_legacy_sql=False,
    )

    t3 = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id=f"move_{name}_to_processed",
        source_bucket="{{var.value.gcs_bucket}}",
        source_object=f"{name}*",
        destination_bucket="{{var.value.gcs_bucket}}",
        destination_object=f"processed/{name}",
        move_object=True,
    )

    t1 >> t2 >> t3 >> next_task

    return t1
Example #13
def Call_BQ_Load_Proc(proj, dset, tgt_tab):

    return BigQueryOperator(task_id='load_' + tgt_tab + '_via_sproc',
                            sql='CALL `' + proj + '.' + dset + '.load_' +
                            tgt_tab + '`()',
                            use_legacy_sql=False,
                            trigger_rule='none_failed')
Example #14
def Truncate_BQ_Table(proj, dset, tgt_tab):

    return BigQueryOperator(task_id='truncate_' + tgt_tab,
                            sql='TRUNCATE TABLE `' + proj + '.' + dset + '.' +
                            tgt_tab + '`',
                            use_legacy_sql=False,
                            trigger_rule='none_failed')
Example #15
def createTaskHelper(table):
    return BigQueryOperator(
        task_id='materialize__{0}'.format(table),
        bql='{0}.sql'.format(table),
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table='{0}.{1}'.format(BQ_DATASET_NAME, table),
        dag=dag_daily)
Example #16
def createTaskHelper(table):
    return BigQueryOperator(
        task_id='materialize_{0}'.format(table),
        bql='{0}.sql'.format(table),
        params={"partition_date": "{0}".format(job_run_date)},
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table='{0}.{1}${2}'.format(BQ_DATASET_NAME, table, job_run_date.replace('-', '')),
        dag=dag_daily)
Example #17
    def test_bql_deprecation_warning(self):
        with warnings.catch_warnings(record=True) as w:
            BigQueryOperator(
                task_id='test_deprecation_warning_for_bql',
                bql='select * from test_table'
            )
        self.assertIn(
            'Deprecated parameter `bql`',
            w[0].message.args[0])
Example #18
def deleteStagingTablesTask(table):
    return BigQueryOperator(
        task_id='delete_staging_{0}'.format(table),
        bql='''
            DROP TABLE IF EXISTS {{params.table}}
        ''',
        params={"table": "{0}.{1}".format(BQ_STAGING_DATASET_NAME, table)},
        use_legacy_sql=False,
        dag=dag_daily)
Example #19
def insert_overwrite(task_id: str, sql_file_path: str,
                     destination_table: str) -> BigQueryOperator:
    return BigQueryOperator(task_id=task_id,
                            sql=sql_file_path,
                            bigquery_conn_id=BIG_QUERY_CONN_ID,
                            write_disposition='WRITE_TRUNCATE',
                            destination_dataset_table=destination_table,
                            use_legacy_sql=False,
                            location='US')
Example #20
def Load_Within_BQ(mode, proj, dset, tgt_tab, src_tab, src_cols='*'):

    t = 'load_' + tgt_tab + '_from_' + src_tab
    #   t = 'load_' + tgt_tab + '_via_' + ('custom_op' + str(mode) if mode in (1,2,3) else 'subdag' if mode == 4 else 'std_op')

    d = proj + '.' + dset + '.' + tgt_tab
    s = 'SELECT ' + src_cols + ' FROM `' + proj + '.' + dset + '.' + src_tab + '`'

    c = 'CREATE_NEVER'
    w = 'WRITE_TRUNCATE' if mode in (1, 2, 3) else 'WRITE_EMPTY'
    l = False
    r = 'none_failed'

    if mode == 1:
        return CustomBigQueryOperator(task_id=t,
                                      sql=s,
                                      destination_dataset_table=d,
                                      create_disposition=c,
                                      write_disposition=w,
                                      use_legacy_sql=l,
                                      trigger_rule=r)

    elif mode == 2:
        return AnotherCustomBigQueryOperator(task_id=t,
                                             sql=s,
                                             destination_dataset_table=d,
                                             create_disposition=c,
                                             write_disposition=w,
                                             use_legacy_sql=l,
                                             trigger_rule=r)

    elif mode == 3:
        return DodgyCustomBigQueryOperator(task_id=t,
                                           sql=s,
                                           destination_dataset_table=d,
                                           create_disposition=c,
                                           write_disposition=w,
                                           use_legacy_sql=l,
                                           trigger_rule=r)

    elif mode == 4:
        return SubDagOperator(subdag=Load_Subdag(tgt_tab, t, s, d, c, w, l, r,
                                                 dag.default_args),
                              task_id=t,
                              dag=dag)

    else:
        return BigQueryOperator(task_id=t,
                                sql=s,
                                destination_dataset_table=d,
                                create_disposition=c,
                                write_disposition=w,
                                use_legacy_sql=l,
                                trigger_rule=r)
Example #21
    def _get_bigquery_task():
        dag = DAG(dag_id='TestBigQueryExtractorE2E')
        task = BigQueryOperator(
            sql='select first_name, last_name from customers;',
            task_id="task_id",
            project_id="project_id",
            dag_id="dag_id",
            dag=dag,
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0))

        return task
Example #22
    def __init__(self,
                 project,
                 table,
                 sql_template_params,
                 task_id=None,
                 sql=None,
                 *args,
                 **kwargs):
        self.project = project
        self.table = table
        self.sql_template_params = sql_template_params

        BigQueryOperator.__init__(
            self,
            task_id=task_id if task_id else '{}-table-{}'.format(
                self.operation, self.table),
            sql=sql if sql else 'SELECT 1',
            allow_large_results=True,
            use_legacy_sql=False,
            *args,
            **kwargs)
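
Example #8 above appears to be the execute() override that pairs with this __init__: the operator formats either SQL_TEMPLATE or the supplied sql with sql_template_params before delegating to BigQueryOperator.execute(). A usage sketch only, with an invented subclass name and invented values:

# Hypothetical instantiation of the subclass whose __init__ and execute()
# appear in Examples #22 and #8; the class name and all values are invented.
snapshot_events = TemplatedBigQueryOperator(
    task_id='snapshot-table-events',
    project='my-project',
    table='events',
    sql='SELECT * FROM `{project}.{dataset}.{table}` WHERE ds = "{run_date}"',
    sql_template_params={'project': 'my-project',
                         'dataset': 'analytics',
                         'table': 'events',
                         'run_date': '2019-01-01'},
    dag=dag)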
Example #23
    def test_extract_error(self, mock_client, mock_hook):
        bq_job_id = "foo.bq.job_id"

        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_query.return_value = bq_job_id

        mock_client.return_value \
            .get_job.side_effect = [Exception("bq error")]

        # To make sure hasattr "sees" close and calls it
        mock_client.return_value.close.return_value

        mock.seal(mock_hook)
        mock.seal(mock_client)

        dag = DAG(dag_id='TestBigQueryExtractorE2E')
        task = BigQueryOperator(
            sql='select first_name, last_name from customers;',
            task_id="task_id",
            project_id="project_id",
            dag_id="dag_id",
            dag=dag,
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0)
        )

        task_instance = TaskInstance(
            task=task,
            execution_date=datetime.utcnow().replace(tzinfo=pytz.utc))

        bq_extractor = BigQueryExtractor(task)

        steps_meta_extract = bq_extractor.extract()
        assert steps_meta_extract is None

        task_instance.run()

        step_meta = bq_extractor.extract_on_complete(task_instance)
        assert step_meta.context['bigquery.extractor.error'] is not None
        mock_client.return_value \
            .get_job.assert_called_once_with(job_id=bq_job_id)

        assert step_meta.inputs is not None
        assert len(step_meta.inputs) == 0
        assert step_meta.outputs is not None
        assert len(step_meta.outputs) == 0

        assert step_meta.context['sql'] == task.sql

        mock_client.return_value.close.assert_called()
Example #24
def Load_Subdag(tgt_tab, t, s, d, c, w, l, r, args):

    subdag = models.DAG(dag_id='Skating_ELT.' + t,
                        default_args=args,
                        schedule_interval="@daily")

    s01 = BigQueryOperator(task_id='truncate_' + tgt_tab,
                           sql='TRUNCATE TABLE `' + d + '`',
                           use_legacy_sql=l,
                           trigger_rule=r,
                           dag=subdag)

    s02 = BigQueryOperator(task_id='load_' + tgt_tab,
                           sql=s,
                           destination_dataset_table=d,
                           create_disposition=c,
                           write_disposition=w,
                           use_legacy_sql=l,
                           trigger_rule=r,
                           dag=subdag)

    s01 >> s02
    return subdag
Example #25
    def test_bigquery_operator_defaults(self, mock_hook):

        operator = BigQueryOperator(
            task_id=TASK_ID,
            sql='Select * from test_table',
            dag=self.dag, default_args=self.args
        )

        operator.execute(None)
        mock_hook.return_value \
            .get_conn() \
            .cursor() \
            .run_query \
            .assert_called_once_with(
                sql='Select * from test_table',
                destination_dataset_table=None,
                write_disposition='WRITE_EMPTY',
                allow_large_results=False,
                flatten_results=None,
                udf_config=None,
                maximum_billing_tier=None,
                maximum_bytes_billed=None,
                create_disposition='CREATE_IF_NEEDED',
                schema_update_options=(),
                query_params=None,
                labels=None,
                priority='INTERACTIVE',
                time_partitioning=None,
                api_resource_configs=None,
                cluster_fields=None,
            )

        self.assertTrue(isinstance(operator.sql, six.string_types))
        ti = TaskInstance(task=operator, execution_date=DEFAULT_DATE)
        ti.render_templates()
        self.assertTrue(isinstance(ti.task.sql, six.string_types))
Example #26
def view_redefinition_task_factory(table_config, **kwargs):
    # load the values if needed in the command you plan to execute
    dataset = table_config['dataset']
    table_name = table_config['table_name']
    table_suffix = table_config['table_suffix']
    return BigQueryOperator(
        task_id=f'view_redeploy_{table_name}',
        sql=f'create or replace view `{dataset}.{table_name}` as select * from `{dataset}.{table_name + table_suffix}`',
        # destination_dataset_table=False,
        bigquery_conn_id='bigquery_default',  # <-- need both of these
        google_cloud_storage_conn_id='bigquery_default',  # <-- because of inheritance
        use_legacy_sql=False,
        dag=dag)
Example #27
def add_verify_tasks(task, dependencies=None):
    # The queries in verify/sqls will fail when the condition is not met
    # Have to use this trick since the Python 2 version of BigQueryCheckOperator doesn't support standard SQL
    # and legacy SQL can't be used to query partitioned tables.
    sql_path = os.path.join(
        dags_folder,
        'resources/stages/verify/sqls/{task}.sql'.format(task=task))
    sql = read_file(sql_path)
    verify_task = BigQueryOperator(task_id='verify_{task}'.format(task=task),
                                   bql=sql,
                                   use_legacy_sql=False,
                                   dag=dag)
    if dependencies is not None and len(dependencies) > 0:
        for dependency in dependencies:
            dependency >> verify_task
    return verify_task
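
The SQL files under resources/stages/verify/sqls are not reproduced here. As a rough illustration only (the file, table, and column names below are invented, not taken from the repository above), a standard-SQL check of this kind can be written so the query raises a runtime error, and therefore fails the verify task, whenever the condition does not hold:

# Hypothetical contents of resources/stages/verify/sqls/blocks_have_rows.sql,
# invented for illustration. ERROR() aborts the query when the partition is
# empty, which makes the corresponding verify task fail.
VERIFY_BLOCKS_HAVE_ROWS_SQL = """
SELECT IF(COUNT(*) > 0, 1, ERROR('no rows loaded for {{ ds }}'))
FROM `my_dataset.blocks`
WHERE DATE(block_timestamp) = '{{ ds }}'
"""

# Assuming that SQL is saved to the file above, it would be wired in through
# the factory as: verify_blocks = add_verify_tasks('blocks_have_rows')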
Example #28
def get_bq_to_bq_operator(
        sql_or_filename,
        dst_table_name,
        dag=None,
        params={},
        table_expiration_seconds=None,
        partition_expiration_seconds=None):
    """Get templated BigQueryOperator.

    Args:
        sql_or_filename (string): Valid SQL statement or a path to a sql file.
            It can be templated using Jinja in either case.
        dag (airflow.models.DAG): DAG used by the context manager, e.g.
            `with get_dag() as dag: get_bq_to_bq_operator(..., dag=dag)`.
            Defaults to None.

    Returns:
        airflow.contrib.operators.bigquery_operator.BigQueryOperator

    """
    dag = dag or models._CONTEXT_MANAGER_DAG
    if dag is None:
        logger.warning('No DAG context was found. The operator may not be associated with any DAG and may not appear in the Web UI.')

    dst_table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=dst_table_name,
            date_descriptor='{{ ds_nodash }}')

    dataset_name = '{experiment_name}_database'.format(
        experiment_name=get_config('experiment_name'))

    return BigQueryOperator(
        dag=dag,
        task_id='{experiment_name}.{table_name}.bq_to_bq'
        .format(
            experiment_name=get_config('experiment_name'),
            table_name=dst_table_name),
        sql=sql_or_filename,
        use_legacy_sql=False,
        write_disposition="WRITE_TRUNCATE",
        destination_dataset_table="{gcp_project_name}:{dataset_name}.{table_name}"
        .format(
            gcp_project_name=get_config('gcp_project_name'),
            dataset_name=dataset_name,
            table_name=dst_table_name_with_date_descriptor),

        params=params)
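
Following the docstring's own example, a minimal usage sketch (assuming a get_dag() helper and a Jinja-templated sql/daily_summary.sql file, neither of which is defined above) could look like this:

# Rough usage sketch only; get_dag() and the SQL file path are assumptions
# borrowed from the docstring, not definitions shown above.
with get_dag() as dag:
    daily_summary = get_bq_to_bq_operator(
        'sql/daily_summary.sql',   # sql_or_filename: templated SQL file
        'daily_summary',           # dst_table_name: '{{ ds_nodash }}' gets appended
        dag=dag,
        params={'lookback_days': 7})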
Example #29
    def test_extract_cached(self, mock_client, mock_hook):
        bq_job_id = "foo.bq.job_id"

        mock_hook.return_value \
            .get_conn.return_value \
            .cursor.return_value \
            .run_query.return_value = bq_job_id

        job_details = self.read_file_json(
            "tests/extractors/cached_job_details.json"
        )

        mock_client.return_value.get_job.return_value._properties = job_details
        # To make sure hasattr "sees" close and calls it
        mock_client.return_value.close.return_value

        mock.seal(mock_hook)
        mock.seal(mock_client)

        dag = DAG(dag_id='TestBigQueryExtractorE2E')
        task = BigQueryOperator(
            sql='select first_name, last_name from customers;',
            task_id="task_id",
            project_id="project_id",
            dag_id="dag_id",
            dag=dag,
            start_date=timezone.datetime(2016, 2, 1, 0, 0, 0)
        )

        task_instance = TaskInstance(
            task=task,
            execution_date=datetime.utcnow().replace(tzinfo=pytz.utc))

        bq_extractor = BigQueryExtractor(task)
        steps_meta_extract = bq_extractor.extract()
        assert steps_meta_extract is None

        task_instance.run()

        step_meta = bq_extractor.extract_on_complete(task_instance)
        assert step_meta.inputs is not None
        assert step_meta.outputs is not None

        assert len(step_meta.run_facets) == 1
        assert step_meta.run_facets['bigQuery_statistics'] \
               == BigQueryStaticticsRunFacet(cached=True)
Example #30
def insert_overwrite(date):

    str_date = re.sub("-", '', date)
    print('str_date : %s' % str_date)

    obj = BigQueryOperator(
        task_id='insertOverwrite_{}'.format(date),
        write_disposition='WRITE_TRUNCATE',  # WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY
        create_disposition='CREATE_IF_NEEDED',
        # priority="BATCH",
        allow_large_results=True,
        use_legacy_sql=False,
        location=bq_location,
        sql=""" 
            SELECT 
                CAST (cyymmdd AS DATE ) AS cyymmdd,
                un,
                rgn_cd,
                cnty_cd,
                tcom_cd,
                dvc_gp_id,
                dvc_modl_id ,
                fw_ver,
                cp_ver,
                hw_ver,
                os_ver
            FROM `sa-bigdata-dev.hive_test.device_origin`
            WHERE cyymmdd = "{}" """.format(date),
        destination_dataset_table=pj_bigquery + '.' + ds_demo + '.' + tb_profile + '$' + str_date,
        # maximum_billing_tier=1,
        #trigger_rule=TriggerRule.ALL_SUCCESS,
        retries=5,
        retry_delay=timedelta(seconds=5),
        dag=dag)
    return obj
Example #31
t3 = BigQueryOperator(
    task_id='bq_write_to_github_daily_metrics',
    use_legacy_sql=False,
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bql='''
    #standardSQL
    SELECT
      date,
      repo,
      SUM(IF(type='WatchEvent', 1, NULL)) AS stars,
      SUM(IF(type='ForkEvent',  1, NULL)) AS forks
    FROM (
      SELECT
        FORMAT_TIMESTAMP("%Y%m%d", created_at) AS date,
        actor.id as actor_id,
        repo.name as repo,
        type
      FROM
        `githubarchive.day.{{ yesterday_ds_nodash }}`
      WHERE type IN ('WatchEvent','ForkEvent')
    )
    GROUP BY
      date,
      repo
    ''',
    destination_dataset_table='my-project.github_trends.github_daily_metrics${{ yesterday_ds_nodash }}',
    dag=dag)
Example #32
            {
                'name': 'timestamp',
                'type': 'integer',
                'mode': 'nullable'
            },
            {
                'name': 'window_start',
                'type': 'string',
                'mode': 'nullable'
            },
        ],
        write_disposition='WRITE_TRUNCATE')

    # Run example query (http://shortn/_BdF1UTEYOb) and save result to the
    # destination table.
    t3 = BigQueryOperator(
        task_id='bq_example_query',
        bql="""
        SELECT
          name, team, total_score
        FROM
          [bq_example.foobar]
        WHERE total_score > 15
        LIMIT 100;
      """,
        destination_dataset_table='{0}.gcp_example_query_result'.format(
            BQ_DATASET_NAME),
        write_disposition='WRITE_TRUNCATE')

    t1 >> t2 >> t3
Example #33
                        source_objects = ['data/{}.csv'.format(table_name)],
                        destination_project_dataset_table = '{}:{}.{}'.format(params['GCP_PROJECT_ID'],params['BQ_DATASET_ID'],table_name),
                        schema_fields =  params_bq_schema[table_name],
                        write_disposition = 'WRITE_TRUNCATE',
                        dag = dag
                )
    list_gcs_to_bq.append(gcs_to_bq)



# Read the query from the Airflow Variables to obtain the final aggregated dataset and store it in BigQuery
execute_bq_sql = BigQueryOperator(
                        task_id='execute_bq_sql',
                        sql= query_sql,
                        use_legacy_sql=False,
                        destination_dataset_table=bq_recent_questions_table_id,
                        create_disposition='CREATE_IF_NEEDED',
                        write_disposition='WRITE_TRUNCATE',
                        dag = dag
                )

# Export the result of the temporary table to GCS

export_data_groupby = BigQueryToCloudStorageOperator(
                        task_id='export_table_temp_to_gcs',
                        source_project_dataset_table= bq_recent_questions_table_id,
                        destination_cloud_storage_uris='gs://{}/data/archivo_final_agrupado.csv'.format(params['BUCKET_ID']),
                        export_format='CSV',
                        dag = dag
                )
Example #34
        dataset_s3_bucket="telemetry-parquet",
        aws_conn_id="aws_dev_iam_s3",
        dataset="clients_daily",
        dataset_version="v6",
        gke_cluster_name="bq-load-gke-1",
        reprocess=True,
        ),
    task_id="clients_daily_v6_bigquery_load",
    dag=dag)

clients_last_seen = BigQueryOperator(
    task_id='clients_last_seen',
    bql='sql/clients_last_seen_v1.sql',
    destination_dataset_table='telemetry.clients_last_seen_v1${{ds_nodash}}',
    write_disposition='WRITE_TRUNCATE',
    use_legacy_sql=False,
    bigquery_conn_id="google_cloud_derived_datasets",
    depends_on_past=True,
    start_date=datetime(2019, 4, 15),
    dag=dag,
)

clients_last_seen_export = SubDagOperator(
    subdag=export_to_parquet(
        table="clients_last_seen_v1",
        arguments=["--submission-date={{ds}}"],
        parent_dag_name=dag.dag_id,
        dag_name="clients_last_seen_export",
        default_args=default_args,
        num_preemptible_workers=10),
    task_id="clients_last_seen_export",