Example #1
def table_custom_check(jinja_query, days_to_retry):
    return BigQueryCheckOperator(
        task_id='custom_check',
        sql=jinja_query,
        use_legacy_sql=False,
        # Retry every 30 minutes (2 per hour * 24 hours) for days_to_retry days.
        retries=2 * 24 * days_to_retry,
        # Time out after days_to_retry days.
        execution_timeout=timedelta(days=days_to_retry),
        # Wait 30 minutes between retries.
        retry_delay=timedelta(minutes=30),
        # Cap the retry delay at 30 minutes.
        max_retry_delay=timedelta(minutes=30),
        on_failure_callback=config_tools.failure_callback_gfw)
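A minimal usage sketch for the helper above, assuming it can be imported from the project; the import path, DAG settings, table names, and the INFORMATION_SCHEMA query are illustrative placeholders, not from the original source.

from datetime import datetime

from airflow import DAG

# Hypothetical import path for the helper defined above.
from my_dags.checks import table_custom_check

with DAG('partition_check_example',
         start_date=datetime(2021, 1, 1),
         schedule_interval='@daily') as dag:
    # Standard-SQL query that returns 0 (so the check fails and retries)
    # until yesterday's partition of the assumed table appears.
    check = table_custom_check(
        jinja_query=(
            'SELECT COUNT(*) '
            'FROM `my_project.my_dataset.INFORMATION_SCHEMA.PARTITIONS` '
            'WHERE table_name = "my_table" '
            'AND partition_id = "{{ yesterday_ds_nodash }}"'),
        days_to_retry=3)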
Example #2
    def table_check(self, task_id, project, dataset, table, date,
                    **retries_config):
        """
        Returns a BigQueryCheckOperator that checks the existence of a
        partitioned table for a specific date.
        By default it retries for 3 days at 30-minute intervals.

        :param task_id: The task identification.
        :type task_id: str
        :param project: The project to which the dataset and table belong.
        :type project: str
        :param dataset: The dataset that contains the table.
        :type dataset: str
        :param table: The partitioned table.
        :type table: str
        :param date: The date of the partition to be checked.
        :type date: str
        :param retries_config: The retries configuration to adapt to your
            needs. You can customize parameters such as retries,
            execution_timeout, retry_delay and max_retry_delay.
        :type retries_config: dict
        """
        retries = retries_config.get('retries')
        execution_timeout = retries_config.get('execution_timeout')
        retry_delay = retries_config.get('retry_delay')
        max_retry_delay = retries_config.get('max_retry_delay')
        return BigQueryCheckOperator(
            task_id=task_id,
            sql='SELECT COUNT(*) FROM [{}.{}${}]'.format(dataset, table, date),
            # Default: retry every 30 minutes for 3 days.
            retries=retries if retries else 2 * 24 * 3,
            # Default: time out after 3 days.
            execution_timeout=execution_timeout if execution_timeout else timedelta(days=3),
            # Default: wait 30 minutes between retries.
            retry_delay=retry_delay if retry_delay else timedelta(minutes=30),
            # Default: cap the retry delay at 30 minutes.
            max_retry_delay=max_retry_delay if max_retry_delay else timedelta(minutes=30),
            on_failure_callback=config_tools.failure_callback_gfw)
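A hedged sketch of how this method might be invoked with a partial override of retries_config; the class, import path, and argument values below are placeholders rather than code from the source.

from datetime import timedelta

# Hypothetical: TableChecks is assumed to be the class that defines the
# table_check method above.
from my_dags.checks import TableChecks

check_task = TableChecks().table_check(
    task_id='check_daily_partition',
    project='my-project',
    dataset='my_dataset',
    table='events',
    date='{{ ds_nodash }}',
    # Only the keys passed here override the 3-day / 30-minute defaults.
    retries=12,
    retry_delay=timedelta(hours=1))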
Example #3
    'start_date': datetime(2020, 9, 23),
    'end_date': datetime(2020, 9, 30),
    'depends_on_past': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=10)
}

dag = DAG(dag_id="my-bq-task",
          default_args=default_args,
          schedule_interval='@daily')
filepath = "gs://backup-data-2020/" + 'out_' + datetime.now().strftime(
    "%d_%b_%Y:%H:%M:%S.%f") + '.csv'

t1 = BigQueryCheckOperator(
    bigquery_conn_id="my-bq-conn",
    task_id="check_for_table",
    dag=dag,
    use_legacy_sql=False,
    sql="""select count(*) from `tech-289406.sample.covid`""")

t2 = BigQueryToCloudStorageOperator(
    task_id="dataset_to_gcs",
    source_project_dataset_table="sample.covid",
    destination_cloud_storage_uris=[filepath],
    export_format="CSV",
    field_delimiter=',',
    bigquery_conn_id="my-bq-conn",
    dag=dag,
)

t1 >> t2
Example #4
t1 = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_eu',
    bucket='some-bucket',
    schema_fields=schema_fields,
    skip_leading_rows=1,
    source_objects=['eu-referendum-result-data.csv'],
    destination_project_dataset_table='airflow_referendum.result_data',
    dag=dag)

# Task 2: Check admin areas table exists
# airflow test -sd=airflow_dags demo bq_dest_table_lookup_admin 2017-03-06

t2 = BigQueryCheckOperator(
    task_id='bq_dest_table_lookup_admin',
    sql='SELECT table_id FROM [some-project:airflow_referendum.__TABLES__] '
        'WHERE table_id = "admin_areas"',
    dag=dag)

# Task 3: Join DCM table with lookup files
# airflow test -sd=airflow_dags demo bq_dest_populate 2017-03-06

t3 = BigQueryOperator(
    task_id='bq_dest_populate',
    bql='demo/bigquery_sql/geo.sql',
    destination_dataset_table='airflow_referendum.geo_results{}'.format(
        current_date),
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

t2.set_upstream(t1)
Example #5
def preprocess_tasks(model, dag, PROJECT_ID, BUCKET, DATA_DIR):
  # Constants
  # Specify your source BigQuery project, dataset, and table names
  SOURCE_BQ_PROJECT = "nyc-tlc"
  SOURCE_DATASET_TABLE_NAMES = "yellow.trips,green.trips_2014,green.trips_2015".split(",")

  # Specify your destination BigQuery dataset
  DESTINATION_DATASET = "taxifare"
  
  # BigQuery data query
  bql="""
  SELECT
    (tolls_amount + fare_amount) AS fare_amount,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) * 1.0 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) * 1.0 AS hourofday,
    pickup_longitude AS pickuplon,
    pickup_latitude AS pickuplat,
    dropoff_longitude AS dropofflon,
    dropoff_latitude AS dropofflat,
    passenger_count*1.0 AS passengers,
    CONCAT(CAST(pickup_datetime AS STRING), CAST(pickup_longitude AS STRING), CAST(pickup_latitude AS STRING), CAST(dropoff_latitude AS STRING), CAST(dropoff_longitude AS STRING)) AS key
  FROM
    `{0}.{1}`
  WHERE
    trip_distance > 0
    AND fare_amount >= 2.5
    AND pickup_longitude > -78
    AND pickup_longitude < -70
    AND dropoff_longitude > -78
    AND dropoff_longitude < -70
    AND pickup_latitude > 37
    AND pickup_latitude < 45
    AND dropoff_latitude > 37
    AND dropoff_latitude < 45
    AND passenger_count > 0
    AND rand() < 0.00001
  """

  bql = bql.format(SOURCE_BQ_PROJECT, model)

  bql_train = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) < 4".format(bql)
  bql_eval = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) = 4".format(bql)

  # Complete the BigQueryOperator task to truncate the table if it already exists before writing
  # Reference: https://airflow.apache.org/integration.html#bigqueryoperator
  bq_train_data_op = BigQueryOperator(
      task_id="bq_train_data_{}_task".format(model.replace(".","_")),
      bql=bql_train,
      destination_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".","_")),
      write_disposition="WRITE_TRUNCATE", # specify to truncate on writes
      use_legacy_sql=False,
      dag=dag
  )

  bq_eval_data_op = BigQueryOperator(
      task_id="bq_eval_data_{}_task".format(model.replace(".","_")),
      bql=bql_eval,
      destination_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".","_")),
      write_disposition="WRITE_TRUNCATE", # specify to truncate on writes
      use_legacy_sql=False,
      dag=dag
  )

  sql = """
  SELECT
    COUNT(*)
  FROM
    [{0}:{1}.{2}]
  """

  # Check to make sure that the data tables won't be empty
  bq_check_train_data_op = BigQueryCheckOperator(
      task_id="bq_check_train_data_{}_task".format(model.replace(".","_")),
      sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".","_") + "_train_data"),
      dag=dag
  )

  bq_check_eval_data_op = BigQueryCheckOperator(
      task_id="bq_check_eval_data_{}_task".format(model.replace(".","_")),
      sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".","_") + "_eval_data"),
      dag=dag
  )

  # BigQuery training data export to GCS
  bash_remove_old_data_op = BashOperator(
      task_id="bash_remove_old_data_{}_task".format(model.replace(".","_")),
      bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi".format(BUCKET, model.replace(".","_")),
      dag=dag
  )

  # Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
  bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
      task_id="bq_export_gcs_train_csv_{}_task".format(model.replace(".","_")),
      source_project_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".","_")),
      destination_cloud_storage_uris=[DATA_DIR + "{}/train-*.csv".format(model.replace(".","_"))],
      export_format="CSV",
      print_header=False,
      dag=dag
  )

  bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
      task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(".","_")),
      source_project_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".","_")),
      destination_cloud_storage_uris=[DATA_DIR + "{}/eval-*.csv".format(model.replace(".","_"))],
      export_format="CSV",
      print_header=False,
      dag=dag
  )
  
  return (bq_train_data_op,
          bq_eval_data_op,
          bq_check_train_data_op,
          bq_check_eval_data_op,
          bash_remove_old_data_op,
          bq_export_gcs_train_csv_op,
          bq_export_gcs_eval_csv_op)
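The helper above only constructs the operators and sets no dependencies; one plausible way a caller could wire them up (the ordering is an assumption, not taken from the original DAG):

# Hypothetical call site: assumes `model`, `dag`, PROJECT_ID, BUCKET and
# DATA_DIR are already defined in the caller's DAG file.
(bq_train_data_op,
 bq_eval_data_op,
 bq_check_train_data_op,
 bq_check_eval_data_op,
 bash_remove_old_data_op,
 bq_export_gcs_train_csv_op,
 bq_export_gcs_eval_csv_op) = preprocess_tasks(model, dag, PROJECT_ID,
                                               BUCKET, DATA_DIR)

# Build each table, verify it is non-empty, clear old CSVs, then export.
bq_train_data_op >> bq_check_train_data_op >> bash_remove_old_data_op
bq_eval_data_op >> bq_check_eval_data_op >> bash_remove_old_data_op
bash_remove_old_data_op >> bq_export_gcs_train_csv_op
bash_remove_old_data_op >> bq_export_gcs_eval_csv_op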
Example #6
# Define DAG: Set ID and assign default args and schedule interval
dag = DAG('bigquery_github_trends',
          default_args=default_args,
          schedule_interval=schedule_interval)

## Task 1: check that the github archive data has a dated table created for that date
# To test this task, run this command:
# docker-compose -f docker-compose-gcloud.yml run --rm webserver airflow test bigquery_github_trends bq_check_githubarchive_day 2018-12-01
t1 = BigQueryCheckOperator(task_id='bq_check_githubarchive_day',
                           sql='''
        #standardSQL
        SELECT
          table_id
        FROM
          `githubarchive.day.__TABLES_SUMMARY__`
        WHERE
          table_id = "{{ yesterday_ds_nodash }}"
        ''',
                           use_legacy_sql=False,
                           bigquery_conn_id=BQ_CONN_ID,
                           dag=dag)

## Task 2: check that the hacker news table contains data for that date.
t2 = BigQueryCheckOperator(task_id='bq_check_hackernews_full',
                           sql='''
        #standardSQL
        SELECT
          FORMAT_TIMESTAMP("%Y%m%d", timestamp ) AS date
        FROM
          `bigquery-public-data.hacker_news.full`
Example #7

##########

# Authentication
client = bigquery.Client()
#client = bigquery.Client.from_service_account_json('/Users/jasmeetbhatia/Dropbox/Google/service_keys/jsb-demos-8fb1eb4ebaf6.json')




t3 = BigQueryCheckOperator(
    task_id='bq_check_table',
    sql='''
    #legacySql
    SELECT COUNT(pageviews)
    FROM [jsb-demos.ga_360_merch_store.merch_store_ga360_sessions] LIMIT 1
    ''',
    dag=dag,
    depends_on_past=True,
    ui_color='#bed578')


t4 = BigQueryCheckOperator(
    task_id='bq_data_validation',
    sql='''
    #legacySql
    SELECT
    STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
    FROM
      [bigquery-public-data:hacker_news.full]
    WHERE
Example #8
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=3)
}

dag = DAG(dag_id='bq_data_v1',
          start_date=datetime(2020, 5, 31),
          default_args=dag_args,
          end_date=None,
          schedule_interval='0 9 * * *')

t1 = BigQueryCheckOperator(task_id='check_public_data_exists',
                           sql="""
                            select count(*) > 0
                            from `bigquery-public-data.covid19_italy.data_by_region`
                            where DATE(_PARTITIONTIME) = DATE_SUB('{{ ds }}', INTERVAL 1 DAY)
                        """,
                           use_legacy_sql=False,
                           dag=dag)

t2 = BigQueryOperator(
    task_id='load_public_data',
    use_legacy_sql=False,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    time_partitioning={"type": 'DAY'},
    sql='''
    #standardSQL
            select date, 
Example #9
    # Check that the source data was ingested within the past 90 days.
    check_sql = """
                SELECT
                    COUNT(*)
                FROM
                    `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                WHERE
                    trip_start_timestamp >=
                        TIMESTAMP('{{ macros.ds_add(ds, -90) }}')
                """

    # BigQueryCheckOperator will fail if the result of the query is 0.
    # I.e. if there is no fresh data.
    bq_check_data_op = BigQueryCheckOperator(
        task_id="bq_check_data_task",
        use_legacy_sql=False,
        sql=check_sql,
    )

    CHECK_ERROR_MESSAGE = b64e(b'Error. Did not retrain on stale data.')

    # Task will only trigger if all upstream tasks fail. In which case a
    # message will be sent to the Pub/Sub topic specified above.

    publish_if_failed_check_op = PubSubPublishOperator(
        task_id="publish_on_failed_check_task",
        project=PROJECT_ID,
        topic=TOPIC,
        messages=[{'data': CHECK_ERROR_MESSAGE.decode()}],
        trigger_rule=TriggerRule.ALL_FAILED
    )
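A plausible wiring for the two tasks above so that the Pub/Sub message is published only when the freshness check fails; this dependency is assumed, not shown in the snippet.

    # Assumed dependency: with trigger_rule=ALL_FAILED above, the publish
    # task fires only when the data-freshness check fails.
    bq_check_data_op >> publish_if_failed_check_op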
Example #10
    'start_date': datetime(2019, 12, 17),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(seconds=7)
}

dag = DAG(
    dag_id='BQ_TableToPartTable',
    default_args=default_args,
    schedule_interval='35 6 * * *',
    description='Overwrites and appends data in the _PARTITIONTIME-partitioned table',
    catchup=False
)

t2 = BigQueryCheckOperator(
    task_id='data_validation',
    sql="""SELECT count(*) FROM `konic-progress-196909.Mig_Data.Orders` where date = '{{ ds }}'""",
    use_legacy_sql=False,
    dag=dag)


t3 = BigQueryOperator(
    task_id='delete_22_day_ago',
    use_legacy_sql=False,
    allow_large_results=True,
    sql="""delete FROM Mig_Data.PartOrders where  DATE(_PARTITIONTIME) >= '{{ macros.ds_add(ds, -22) }}' and DATE(_PARTITIONTIME) <= '{{ ds }}'""",  
    dag=dag)

t2 >> t3


for i in range(0, 23):
    k = '{{{{ (execution_date - macros.timedelta(days={})).strftime("%Y-%m-%d") }}}}'.format(i)
    k_ = '{{{{ (execution_date - macros.timedelta(days={})).strftime("%Y%m%d") }}}}'.format(i)
Example #11

def py_call(name):
    print("Hello {0}. Congrats on Printing this.".format(name))


t2 = PythonOperator(task_id='Py_Test',
                    python_callable=py_call,
                    op_kwargs={'name': 'Ravi Kishore'},
                    dag=dag)

# Check to see if the Source Table exists
t3 = BigQueryCheckOperator(
    task_id='Check_Chicago_Crime_Table',
    sql='''select count(*) FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
           where date(trip_start_timestamp) = "{{ yesterday_ds }}"''',
    bigquery_conn_id='my_gcp_conn',
    use_legacy_sql=False,
    dag=dag)

# Load data to Project Dataset
t4 = BigQueryOperator(
    task_id='BQLoadDestinationTable',
    sql=
    '''select trip_start_timestamp, trip_end_timestamp, trip_seconds, trip_miles,
        fare, tips, tolls, extras, trip_total FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips` 
        where date(trip_start_timestamp)= "{{ yesterday_ds }}"''',
    bigquery_conn_id='my_gcp_conn',
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    destination_dataset_table=
Example #12
    create_disposition='CREATE_IF_NEEDED',
    use_legacy_sql=False,
    task_id='analytics_award_saga',
    dag=dag)

drop_awards = BigQueryTableDeleteOperator(deletion_dataset_table=award_table,
                                          ignore_if_missing=True,
                                          task_id='deletion_dataset_table',
                                          dag=dag)

###########################
# Validation tasks
###########################
validate_non_empty_movie = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_movie',
    sql=SqlQueries.validate_non_empty_movie,
    use_legacy_sql=False)

validate_non_empty_person = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_person',
    sql=SqlQueries.validate_non_empty_person,
    use_legacy_sql=False)

validate_non_empty_movie_person = BigQueryCheckOperator(
    dag=dag,
    task_id='validate_non_empty_movie_person',
    sql=SqlQueries.validate_non_empty_movie_person,
    use_legacy_sql=False)
Example #13
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)

dag = DAG('bigquery_github_trends_v1', default_args=default_args, schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_full',
    bigquery_conn_id='bigquery',
    sql='''
    #legacySql
    SELECT
    STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
    FROM
      [bigquery-public-data:hacker_news.full]
    WHERE
    type = 'story'
    AND STRFTIME_UTC_USEC(timestamp, "%Y%m%d") = "{{ yesterday_ds_nodash }}"
    LIMIT 1
    ''',
    dag=dag)

t1.set_upstream(t0)
Example #14
        session.commit()
    else:
        msg = '\n\tA connection with `conn_id`={conn_id} already exists\n'
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)

dag = DAG('bigquery_github_trends_v1', default_args=default_args, schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(
    task_id='bq_check_hackernews_github_agg',
    bigquery_conn_id='bigquery',
    sql='''
    #legacySql
    SELECT
    partition_id
    FROM
    [github_trends.hackernews_github_agg$__PARTITIONS_SUMMARY__]
    WHERE partition_id = "{{ yesterday_ds_nodash }}"
    ''',
    dag=dag)

t1.set_upstream(t0)
Example #15
    'start_date': start_date,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

# write dag
with DAG(dag_id='blog',
         default_args=default_args,
         schedule_interval=schedule_interval,
         catchup=False) as dag:

    t1 = BigQueryCheckOperator(task_id='check_bq_data_exists',
                               sql=queries.check_bq_data_exists,
                               use_legacy_sql=False)

    t2 = GoogleCloudStorageObjectSensor(task_id='check_gcs_file_exists',
                                        bucket=cfg.BUCKET,
                                        object=cfg.SOURCE_OBJECT)

    t3 = BigQueryOperator(task_id='write_weight_data_to_bq',
                          sql=queries.write_weight_data_to_bq,
                          destination_dataset_table=cfg.BQ_TABLE_WEIGHT,
                          create_disposition='CREATE_IF_NEEDED',
                          write_disposition='WRITE_TRUNCATE',
                          use_legacy_sql=False)

    t4 = GoogleCloudStorageToBigQueryOperator(
        task_id='write_route_data_to_bq',
Example #16
dag = DAG(
    'bigquery_check',
    default_args=default_args,
    schedule_interval=schedule_interval
)
## For testing individual tasks run the below code in your terminal 
## airflow test bigquery_github_trends bq_check_githubarchive_day 2020-12-01
## Task 1: check that the github archive data has a dated table created for that date
t1 = BigQueryCheckOperator(
        task_id='bq_check_githubarchive_day',
        sql='''
        #standardSQL
        SELECT
          table_id
        FROM
          `githubarchive.day.__TABLES_SUMMARY__`
        WHERE
          table_id = "{{ yesterday_ds_nodash }}"
        ''',
        use_legacy_sql=False,
        bigquery_conn_id=BQ_CONN_ID,
        dag=dag
    )

## Task 2: check that the hacker news table contains data for that date.
t2 = BigQueryCheckOperator(
        task_id='bq_check_hackernews_full',
        sql='''
        #standardSQL
        SELECT
          FORMAT_TIMESTAMP("%Y%m%d", timestamp ) AS date
Example #17
BQ_CONN = gcp_config["BQ_CONN"]
GCP_PROJECT = gcp_config["GCP_PROJECT"]
PROJECT_DATASET = gcp_config["PROJECT_DATASET"]

schedule_interval = "0 0 * * *"

dag = DAG('bigquery_github_trends',
          catchup=True,
          default_args=default_args,
          schedule_interval=schedule_interval)

# Task 1: Check if githubarchive contains data for the date.
# To test: docker-compose run --rm webserver airflow test bigquery_github_trends github_table_check 2020-01-01
t1 = BigQueryCheckOperator(
    task_id = "github_table_check",
    sql = """
        SELECT * FROM
        `githubarchive.day.__TABLES_SUMMARY__`
        WHERE table_id = '{{ yesterday_ds_nodash }}'
    """,
    bigquery_conn_id = BQ_CONN,
    use_legacy_sql = False,
    dag = dag
)

# Task 2: Check if hackernews has data for the date.
# To test: docker-compose run --rm webserver airflow test bigquery_github_trends hackernews_full_table_check 2020-01-01
t2 = BigQueryCheckOperator(
    task_id = "hackernews_full_table_check",
    sql = """
        SELECT timestamp FROM
        `bigquery-public-data.hacker_news.full`
        WHERE EXTRACT(date from timestamp) = '{{ yesterday_ds }}'
            AND type = 'story'
Example #18
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=3)
}

dag = DAG(dag_id='covid_italy_daily',
          start_date=datetime(2020, 5, 31),
          default_args=dag_args,
          end_date=None,
          schedule_interval='0 9 * * *')

t1 = BigQueryCheckOperator(task_id='check_public_data_exists',
                           sql="""
                            select count(*) > 0
                            from `bigquery-public-data.covid19_italy.data_by_region`
                            where DATE(date) = '{{ ds }}'
                        """,
                           use_legacy_sql=False,
                           dag=dag)

t2 = BigQueryOperator(
    task_id='load_public_data',
    use_legacy_sql=False,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    time_partitioning={"type": 'DAY'},
    sql='''
    #standardSQL
            select date, 
Example #19
load_immigration_data = GoogleCloudStorageToBigQueryOperator(
    task_id='load_immigration_data',
    bucket=gs_bucket,
    source_objects=['immigration_data/*.parquet'],
    destination_project_dataset_table=
    f'{project_id}:{staging_dataset}.immigration_data',
    source_format='parquet',
    write_disposition='WRITE_TRUNCATE',
    skip_leading_rows=1,
    autodetect=True)

# Check loaded data not null
check_us_cities_demo = BigQueryCheckOperator(
    task_id='check_us_cities_demo',
    use_legacy_sql=False,
    sql=f'SELECT count(*) FROM `{project_id}.{staging_dataset}.us_cities_demo`'
)

check_airports = BigQueryCheckOperator(
    task_id='check_airports',
    use_legacy_sql=False,
    sql=f'SELECT count(*) FROM `{project_id}.{staging_dataset}.airport_codes`')

check_weather = BigQueryCheckOperator(
    task_id='check_weather',
    use_legacy_sql=False,
    sql=
    f'SELECT count(*) FROM `{project_id}.{staging_dataset}.temperature_by_city`'
)
Example #20
        session.commit()
    else:
        msg = '\n\tA connection with `conn_id`={conn_id} already exists\n'
        msg = msg.format(conn_id=new_conn.conn_id)
        print(msg)


dag = DAG('bigquery_github_trends_v1',
          default_args=default_args,
          schedule_interval="@once")

# Task to add a connection
t0 = PythonOperator(
    dag=dag,
    task_id='add_gcp_connection_python',
    python_callable=add_gcp_connection,
    provide_context=True,
)

t1 = BigQueryCheckOperator(task_id='bq_check_githubarchive_day',
                           bigquery_conn_id='bigquery',
                           sql='''
    #legacySql
    SELECT table_id
    FROM [githubarchive:day.__TABLES__]
    WHERE table_id = "{{ yesterday_ds_nodash }}"
    ''',
                           dag=dag)

t1.set_upstream(t0)
Example #21
    gcs_schema_object='gs://testcovidlinh/googletrend_schema.json',
    # schema_fields=googletrend_schema,
    bigquery_conn_id=gcp_conn_id,
    google_cloud_storage_conn_id=gcp_conn_id)

# Loading data from GCS to BigQuery
gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id='GCS_to_BigQuery',
    dag=dag,
    bucket='testcovidlinh',
    source_objects=source_objects,
    # schema_object   = "/tmp/covidStatSchema.json",
    schema_object='googletrend_schema.json',
    # schema_fields   = googletrend_schema,
    source_format='CSV',
    destination_project_dataset_table=destination_project_dataset_table,
    write_disposition='WRITE_TRUNCATE',
    # autodetect = False,
    bigquery_conn_id=gcp_conn_id,
    google_cloud_storage_conn_id=gcp_conn_id)

# Check Count
check_count = BigQueryCheckOperator(task_id="check_count",
                                    bigquery_conn_id=gcp_conn_id,
                                    sql="SELECT COUNT(*) FROM " +
                                    destination_project_dataset_table,
                                    use_legacy_sql=False
                                    # location=location
                                    )

start_operator >> googletrend >> CreateTable >> gcs_to_bigquery >> check_count
Example #22
    'email_on_retry': False
}

# Step 3: Initiate DAG
dag = DAG('Airflow_20201025',
          default_args=default_args,
          schedule_interval='0 0 * * *')

# Step 4: Define Tasks
t1 = BashOperator(task_id='Bash_Test', bash_command='date', dag=dag)


def py_call(name):
    print("Hello {0}. Congrats on Printing this.".format(name))


t2 = PythonOperator(task_id='Py_Test',
                    python_callable=py_call,
                    op_kwargs={'name': 'Ravi Kishore'},
                    dag=dag)

t3 = BigQueryCheckOperator(
    task_id='Check_Chicago_Crime_Table',
    sql='''select * from `bigquery-public-data.chicago_crime.crime` where date(date) = "{{ yesterday_ds }}"''',
    bigquery_conn_id='google_cloud_default',
    use_legacy_sql=False,
    dag=dag)
# Step 5: Define Dependencies

t1 >> t2 >> t3
Example #23
query = '''
SELECT
  CONCAT(
    'https://stackoverflow.com/questions/',
    CAST(id as STRING)) as url,
  view_count
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE tags like '%google-bigquery%'
ORDER BY view_count DESC
LIMIT 10
'''

t1 = BigQueryCheckOperator(
    task_id='bq_check_githubarchive_day',
    # use_legacy_sql=False,
    sql=query,
    project_id='tt-cust-analytics',
    dag=dag)

# t2 = BigQueryCheckOperator(
#     task_id='bq_check_hackernews_full',
#     sql='''
#     #legacySql
#     SELECT
#     STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
#     FROM
#       [bigquery-public-data:hacker_news.full]
#     WHERE
#     type = 'story'
#     AND STRFTIME_UTC_USEC(timestamp, "%Y%m%d") = "{{ yesterday_ds_nodash }}"
#     LIMIT 1
Example #24
            DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag)

    sql = """
  SELECT
    COUNT(*)
  FROM
    [{0}:{1}.{2}]
  """

    # Check to make sure that the data tables won't be empty
    bq_check_train_data_op = BigQueryCheckOperator(
        task_id="bq_check_train_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET,
                       model.replace(".", "_") + "_train_data"),
        dag=dag)

    bq_check_eval_data_op = BigQueryCheckOperator(
        task_id="bq_check_eval_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET,
                       model.replace(".", "_") + "_eval_data"),
        dag=dag)

    # BigQuery training data export to GCS
    bash_remove_old_data_op = BashOperator(
        task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
        bash_command=
        "if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi"
        .format(BUCKET, model.replace(".", "_")),
Example #25
          schedule_interval=schedule_interval)

# Create new tables in cli before running DAG

# bq mk --time_partitioning_type=DAY my-project:github_trends.github_daily_metrics
# bq mk --time_partitioning_type=DAY my-project:github_trends.github_agg
# bq mk --time_partitioning_type=DAY my-project:github_trends.hackernews_agg
# bq mk --time_partitioning_type=DAY my-project:github_trends.hackernews_github_agg

## Task 1
# airflow test bigquery_github_trends_v1 bq_check_githubarchive_day 2017-06-02

t1 = BigQueryCheckOperator(task_id='bq_check_githubarchive_day',
                           sql='''
    #legacySql
    SELECT table_id 
    FROM [githubarchive:day.__TABLES__] 
    WHERE table_id = "{{ yesterday_ds_nodash }}"
    ''',
                           dag=dag)

## Task 2
# airflow test bigquery_github_trends_v1 bq_check_hackernews_full 2017-06-02

t2 = BigQueryCheckOperator(task_id='bq_check_hackernews_full',
                           sql='''
    #legacySql
    SELECT
    STRFTIME_UTC_USEC(timestamp, "%Y%m%d") as date
    FROM
      [bigquery-public-data:hacker_news.full]
    WHERE 
Example #26
import os
import datetime
from airflow import DAG
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator

args = {
    'owner': 'matti',
    'start_date': datetime.datetime(day=20, month=11, year=2018),
}

template_path = '{}/{}'.format(os.environ['AIRFLOW_HOME'], 'dags/sql')

with DAG(dag_id='github_dag', default_args=args, template_searchpath=template_path) as dag:
    github_commits = BigQueryOperator(task_id='github_commits',
                                      sql='github_users_aggregate.sql',
                                      use_legacy_sql=False,
                                      write_disposition='WRITE_TRUNCATE',
                                      destination_dataset_table='github.commits${{ ds_nodash }}')
    test__check_commit_sums_match_with_raw_data = BigQueryCheckOperator(task_id='TEST_commit_sums_match',
                                                                        sql='github_users_check_commit_sums.sql',
                                                                        use_legacy_sql=False)
    github_commits >> test__check_commit_sums_match_with_raw_data
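The check query itself lives in github_users_check_commit_sums.sql and is not shown here; as a rough illustration only, a query of the following shape would work with BigQueryCheckOperator, which fails the task when no row is returned or the first row contains a falsy value. The table and column names are assumptions, not the file's actual contents.

# Hypothetical shape of github_users_check_commit_sums.sql: a single
# boolean row that is true only when the aggregated commit total matches
# the raw data for the executed day.
check_commit_sums_sql = """
SELECT
  aggregated.total = raw.total
FROM
  (SELECT SUM(commits) AS total
   FROM `github.commits`
   WHERE day = '{{ ds }}') AS aggregated,
  (SELECT COUNT(*) AS total
   FROM `my_project.github_raw.commit_events`
   WHERE DATE(created_at) = '{{ ds }}') AS raw
"""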