def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields and self.schema_object \
            and self.source_format != 'DATASTORE_BACKUP':
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)
        schema_fields = json.loads(gcs_hook.download(
            self.bucket, self.schema_object).decode("utf-8"))
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    if self.external_table:
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            src_fmt_configs=self.src_fmt_configs)
    else:
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, self.destination_project_dataset_table))
        row = cursor.fetchone()
        max_id = row[0] or 0
        self.log.info('Loaded BQ data with max %s.%s=%s',
                      self.destination_project_dataset_table,
                      self.max_id_key, max_id)
        return max_id
def get_db_hook(self):
    return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        use_legacy_sql=self.use_legacy_sql)
def get_db_hook(self):
    return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
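# A minimal sketch (not from the source) of how a get_db_hook() like the two
# above is typically consumed: a check-style task builds the hook and runs a
# sanity query through it. The class name, SQL, and connection id here are
# illustrative assumptions; get_first() comes from Airflow's DbApiHook, which
# BigQueryHook extends.
class ExampleRowCountCheck(object):
    def __init__(self, bigquery_conn_id='bigquery_default', use_legacy_sql=False):
        self.bigquery_conn_id = bigquery_conn_id
        self.use_legacy_sql = use_legacy_sql

    def get_db_hook(self):
        return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            use_legacy_sql=self.use_legacy_sql)

    def execute(self, context=None):
        records = self.get_db_hook().get_first(
            'SELECT COUNT(*) FROM `my_project.my_dataset.my_table`')
        if not records or not records[0]:
            raise ValueError('Check query returned no rows')
        return records[0]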
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.contrib.hooks.bigquery_hook import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.contrib.hooks.gcp_sql_hook import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))
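# A minimal usage sketch, assuming get_hook() above lives on Airflow's
# Connection model (as it does in Airflow 1.10.x). The connection id
# 'bigquery_default' is an illustrative assumption.
from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('bigquery_default')  # conn_type == 'google_cloud_platform'
hook = conn.get_hook()                               # dispatched to BigQueryHook above
print(type(hook).__name__)                           # -> 'BigQueryHook'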
def init(self):
    self.bq_hook = BigQueryHook(bigquery_conn_id=self.bq_conn_id,
                                use_legacy_sql=False)
    bq_conn = self.bq_hook.get_conn()
    self.bq_cursor = bq_conn.cursor()
def create_dag(pipeline):
    # Request Configuration
    DAG_NAME = pipeline['dag']['dag_name']
    ARGS = pipeline['args']
    START_DATE = pipeline['start_date']
    SCHEDULE_INTERVAL = pipeline['dag']['schedule_interval']
    BQ_CONN_ID = pipeline['connection']['bq_conn_id']
    BQ_PROJECT_DESTINATION = pipeline['connection']['bq_project_destination']
    BQ_DATASET_DESTINATION = pipeline['connection']['bq_dataset_destination']
    BQ_TABLE_DESTINATION = pipeline['connection']['bq_table_destination']
    BUCKET_DESTINATION = pipeline['connection']['bucket_destination']
    FOLDER_IN_BUCKET = pipeline['connection']['folder_in_bucket']
    IS_USING_ML = pipeline['model']['is_using_ml']
    TARGET_FORECAST = pipeline['model']['target_forecast']
    CATEGORY = pipeline['model']['category']
    YLABEL = pipeline['viz']['ylabel']
    QUERY = pipeline['query']
    START_DATETIME_QUERY = pipeline['start_datetime_query']

    if IS_USING_ML:
        ATTACHED_FILE = ['forecast_result.pdf']
    else:
        ATTACHED_FILE = None

    # Connection hook and clients built from the Airflow connection.
    bq_hook = BigQueryHook(bigquery_conn_id=BQ_CONN_ID, use_legacy_sql=False)
    storage_client = storage.Client(project=bq_hook._get_field("project"),
                                    credentials=bq_hook._get_credentials())
    bq_client = bigquery.Client(project=bq_hook._get_field("project"),
                                credentials=bq_hook._get_credentials())

    dag = DAG(
        dag_id=DAG_NAME,
        default_args=ARGS,
        schedule_interval=timedelta(days=SCHEDULE_INTERVAL),
        start_date=datetime.strptime(START_DATE, '%Y-%m-%d'),
        dagrun_timeout=timedelta(minutes=60),
        max_active_runs=1,
        catchup=False
    )

    def if_table_exists(**kwargs):
        # Check whether the sharded destination table already exists.
        bq_table = bq_client.dataset(BQ_DATASET_DESTINATION).table(
            BQ_TABLE_DESTINATION + '_' + '*')
        try:
            bq_client.get_table(bq_table)
            kwargs['ti'].xcom_push(key='created_flag', value=True)
        except Exception:
            kwargs['ti'].xcom_push(key='created_flag', value=False)

    def branch_table(**kwargs):
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag',
                                                 task_ids='chck_table')
        print('is_table_exists ' + str(is_table_exists))
        if not IS_USING_ML:
            return 'mail'
        elif IS_USING_ML and is_table_exists:
            return 'predict'
        else:
            return 'train'

    def to_table(**kwargs):
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag',
                                                 task_ids='chck_table')
        # table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(
        #     BQ_TABLE_DESTINATION + '_' + kwargs['execution_date'].date().strftime('%Y%m%d'))
        if is_table_exists:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(
                BQ_TABLE_DESTINATION + '_' + '20180914')
        else:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(
                BQ_TABLE_DESTINATION + '_' + '20180831')

        job_config = bigquery.QueryJobConfig()
        job_config.create_disposition = 'CREATE_IF_NEEDED'
        job_config.write_disposition = 'WRITE_TRUNCATE'
        job_config.destination = table_ref
        # Uncomment to create a partitioned table:
        # job_config.time_partitioning = bigquery.table.TimePartitioning()

        if is_table_exists:
            # sql = QUERY.format(
            #     str(datetime.fromtimestamp(kwargs['execution_date'].timestamp()) - timedelta(SCHEDULE_INTERVAL - 1)),
            #     str(datetime.fromtimestamp(kwargs['execution_date'].timestamp())))
            sql = QUERY.format('2018-09-01 00:00:00', '2018-09-14 23:59:59')
            kwargs['ti'].xcom_push(key='csv_file', value='20180914')
        else:
            # sql = QUERY.format(START_DATETIME_QUERY,
            #                    str(datetime.fromtimestamp(kwargs['execution_date'].timestamp())))
            sql = QUERY.format('2018-01-01 00:00:00', '2018-08-31 23:59:59')
            kwargs['ti'].xcom_push(key='csv_file', value='20180831')

        query_job = bq_client.query(sql, location='US', job_config=job_config)
        result = query_job.result()
        kwargs['ti'].xcom_push(key='row_num', value=result.total_rows)

    def query_to_csv(**kwargs):
        # A table_ref / URI based on kwargs['execution_date'] could replace the
        # hard-coded dates below.
        is_table_exists = kwargs['ti'].xcom_pull(key='created_flag',
                                                 task_ids='chck_table')
        if is_table_exists:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(
                BQ_TABLE_DESTINATION + '_' + '20180914')
            uri = ('gs://' + BUCKET_DESTINATION + '/' + FOLDER_IN_BUCKET +
                   BQ_TABLE_DESTINATION + '_' + '20180914.csv')
        else:
            table_ref = bq_client.dataset(BQ_DATASET_DESTINATION).table(
                BQ_TABLE_DESTINATION + '_' + '20180831')
            uri = ('gs://' + BUCKET_DESTINATION + '/' + FOLDER_IN_BUCKET +
                   BQ_TABLE_DESTINATION + '_' + '20180831.csv')

        csv_job = bq_client.extract_table(
            source=table_ref,
            # Change the destination URI if needed.
            destination_uris=uri,
            location='US',
        )
        csv_job.result()

    def gcs_csv_to_df(bq_hook, execution_date):
        # TODO: fill the blob folder from request.json
        blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(
            FOLDER_IN_BUCKET + BQ_TABLE_DESTINATION + '_' + execution_date + '.csv')
        byte_stream = io.BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)
        df = pd.read_csv(byte_stream)
        return df

    def save_to_gcs(bq_hook, stream, path_file):
        blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(
            FOLDER_IN_BUCKET + path_file)
        byte_stream = io.BytesIO()
        pickle.dump(stream, byte_stream, pickle.HIGHEST_PROTOCOL)
        byte_stream.seek(0)
        blob.upload_from_file(byte_stream)

    def train_mdl(**kwargs):
        # df = gcs_csv_to_df(bq_hook, kwargs['execution_date'].date())
        df = gcs_csv_to_df(bq_hook, '20180831')
        # rename() returns a copy, so the result must be assigned back
        df = df.rename(columns={TARGET_FORECAST: 'y'})
        model = prophet.train(df, category_cols=CATEGORY)
        save_to_gcs(bq_hook, model, 'model/model.pickle')
        # prediction = predict_mdl(kwargs['execution_date'].date())
        prediction = predict_mdl(datetime(2018, 8, 31).date())
        return prediction

    def predict_mdl(execution_date):
        blob = storage_client.get_bucket(BUCKET_DESTINATION).get_blob(
            FOLDER_IN_BUCKET + 'model/model.pickle')
        byte_stream = io.BytesIO()
        blob.download_to_file(byte_stream)
        byte_stream.seek(0)
        models = pickle.load(byte_stream)
        prediction = prophet.predict(execution_date, models=models,
                                     schedule_interval=SCHEDULE_INTERVAL,
                                     category_cols=CATEGORY)
        plot.plotvis(prediction, YLABEL, SCHEDULE_INTERVAL, category_cols=CATEGORY)
        prediction.to_csv('prediction.csv', index=False)
        blob = storage_client.get_bucket(BUCKET_DESTINATION).blob(
            FOLDER_IN_BUCKET + 'prediction/prediction_' +
            execution_date.strftime('%Y%m%d') + '.csv')
        blob.upload_from_filename('prediction.csv')
        return prediction

    def predict(**kwargs):
        # prediction = predict_mdl(kwargs['execution_date'].date())
        prediction = predict_mdl(datetime(2018, 9, 14).date())
        return prediction

    with dag:
        chck_table = PythonOperator(
            task_id='chck_table',
            dag=dag,
            python_callable=if_table_exists,
        )
        chck_table_branch = BranchPythonOperator(
            task_id='chck_table_branch',
            dag=dag,
            python_callable=branch_table,
        )
        crt_table = PythonOperator(
            task_id='crt_table',
            dag=dag,
            python_callable=to_table
        )
        save_to_csv = PythonOperator(
            task_id='save_to_csv',
            dag=dag,
            python_callable=query_to_csv
        )
        train = PythonOperator(
            task_id='train',
            dag=dag,
            python_callable=train_mdl,
        )
        # Change the hard-coded date to {{ ds_nodash }} once the
        # execution-date logic above is restored.
        mail = EmailOperator(
            task_id='mail',
            dag=dag,
            trigger_rule='none_failed',
            to='*****@*****.**',
            subject='Reporting: {{ params.dag_name }} {{ ds }}',
            params={
                'dag_name': DAG_NAME,
                'table': BQ_TABLE_DESTINATION,
                'dataset': BQ_DATASET_DESTINATION,
                'project': BQ_PROJECT_DESTINATION,
                'bucket': BUCKET_DESTINATION,
                'using_ml': IS_USING_ML
            },
            html_content='''
                DAG name : {{ params.dag_name }} <br>
                Table : {{ params.table }} <br>
                Dataset : {{ params.dataset }} <br>
                Project : {{ params.project }} <br>
                CSV link in GCS : https://storage.cloud.google.com/{{ params.bucket }}/{{ params.dataset }}/{{ params.table }}_{{ task_instance.xcom_pull(task_ids='crt_table', key='csv_file') }}.csv <br>
                Number of recorded rows : {{ task_instance.xcom_pull(task_ids='crt_table', key='row_num') }}
            ''',
            files=ATTACHED_FILE,
            cc=['*****@*****.**', '*****@*****.**'],
        )
        # Note: this rebinds the name `predict` from the callable to the task;
        # the callable is captured before the assignment, so it still works.
        predict = PythonOperator(
            task_id='predict',
            dag=dag,
            python_callable=predict,
        )

        chck_table >> crt_table >> save_to_csv >> chck_table_branch >> [train, predict, mail]
        train >> mail
        predict >> mail

    return dag
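# A hedged sketch of the kind of `pipeline` dict create_dag() above expects,
# inferred from the keys it reads; every value below is an illustrative
# placeholder, not taken from the source.
example_pipeline = {
    'dag': {'dag_name': 'example_forecast_dag', 'schedule_interval': 14},
    'args': {'owner': 'airflow', 'retries': 1},
    'start_date': '2018-01-01',
    'start_datetime_query': '2018-01-01 00:00:00',
    'query': ("SELECT * FROM `my-project.my_dataset.source` "
              "WHERE ts BETWEEN '{}' AND '{}'"),
    'connection': {
        'bq_conn_id': 'bigquery_default',
        'bq_project_destination': 'my-project',
        'bq_dataset_destination': 'my_dataset',
        'bq_table_destination': 'my_table',
        'bucket_destination': 'my-bucket',
        'folder_in_bucket': 'exports/',
    },
    'model': {'is_using_ml': True, 'target_forecast': 'revenue', 'category': ['store']},
    'viz': {'ylabel': 'Revenue'},
}

# dag = create_dag(example_pipeline)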
from __future__ import print_function

import json

from airflow import AirflowException
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

# Global variables for the library
g_source_composer_bucket_path = '/home/airflow/gcs/dags/'
g_json_composer_bucket_path = g_source_composer_bucket_path + 'json/'
g_sql_composer_bucket_path = g_source_composer_bucket_path + 'sql/'
g_conn_id = 'bigquery_default'
g_bq_hook = BigQueryHook(bigquery_conn_id=g_conn_id)
g_config_file = "/home/airflow/gcs/dags/json/DPLF_DAG_Generator_Config.json"


# Access configuration
def DPLF_Access_ReadConfig(
        p_config_file="/home/airflow/gcs/dags/json/DPLF_DAG_Generator_Config.json"):
    v_json_data = open(p_config_file).read()
    v_config = json.loads(v_json_data)
    return v_config


# Extractor of files and tasks from the JSON configuration
def DPLF_GetValueByKey(p_list, p_key):
    if p_key in p_list:
        return p_list[p_key]
    else:
        return ""
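# Example usage (a sketch): read the generator config and pull a single key.
# The 'dag_name' key is an illustrative assumption about the config layout.
v_config = DPLF_Access_ReadConfig(g_config_file)
v_dag_name = DPLF_GetValueByKey(v_config, 'dag_name')  # returns "" when the key is absent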
def patch_bq_cols(self, bq_table, sf_cols):
    """
    Used to decide whether we need to run an ALTER or CREATE table command.
    Leverages alter_tbl_ddl() and create_tbl_ddl() to create the DDL that
    will be run.
    """
    bq_service = BigQueryHook(bigquery_conn_id=self.bq_conn_id).get_service()
    bq_conn = BigQueryBaseCursor(bq_service, self.bq_project)
    missing_cols = []
    try:
        bq_schema = bq_conn.get_schema(self.bq_dataset, bq_table)
        print(bq_schema)
        bq_cols = [col for col in bq_schema['fields']]
        # Compare column names (not field dicts) so missing columns are
        # actually detected.
        existing_names = [col['name'] for col in bq_cols]
        missing_cols = [x for x in sf_cols
                        if x['name'].lower() not in existing_names]
    except Exception:
        # Table does not exist yet: build the full schema and create the table.
        bq_cols = []
        for col in sf_cols:
            bq_cols.append({
                "type": col['type'],
                "name": col["name"].lower(),
                "mode": "NULLABLE"
            })
        self.create_tbl_ddl(bq_table, bq_cols)

    if missing_cols:
        # Rebuild the full schema (plus the partition column) and patch the
        # existing table with it.
        bq_cols = []
        for col in sf_cols:
            bq_cols.append({
                "type": col['type'],
                "name": col["name"].lower(),
                "mode": "NULLABLE"
            })
        bq_cols.append({
            "name": "partition_date",
            "type": "DATE",
            "mode": "NULLABLE"
        })
        print('new schema is ' + str(bq_cols))
        table_resource = {'schema': {'fields': bq_cols}}
        try:
            bq_service.tables().patch(projectId=self.bq_project,
                                      datasetId=self.bq_dataset,
                                      tableId=bq_table,
                                      body=table_resource).execute()
            self.log.info('Table patched successfully')
        except HttpError as err:
            raise AirflowException(
                'BigQuery job failed. Error was: {}'.format(err.content))
    return bq_cols
def execute(self, context):
    logging.info('Executing: %s', str(self.bql))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id)
    hook.run(self.bql, self.destination_dataset_table, self.write_disposition)
def get_db_hook(self):
    return BigQueryHook(bigquery_conn_id=self.conn_id).get_conn().cursor()
# ----------------------------------------------------------------------------
conn_id_gcp = 'conn_id_gcp'
conn_id_gcp_cros = 'conn_id_gcp_cros'
sa_cross = Variable.get("config_sa-cross")
slack_token_variable_name = "slack_token"
slack_user_name_daf_airflow = "slack_user_name_to_users"
slack_channel_daf_airflow = "slack_channel_daf_airflow"
github_auth = Variable.get("github_auth", deserialize_json=True)

# ----------------------------------------------------------------------------
# HOOKS - VARIABLES
# ----------------------------------------------------------------------------
bq_hook = BigQueryHook(bigquery_conn_id=conn_id_gcp_cros, use_legacy_sql=False)

# ----------------------------------------------------------------------------
# DATE - VARIABLES
# ----------------------------------------------------------------------------
macro_yesterday_date = '{{ ds }}'
macro_today_date = '{{ next_ds }}'

# ----------------------------------------------------------------------------
# CLUSTER CONFIG - VARIABLES
# ----------------------------------------------------------------------------
predict_config_script = \
    Variable.get("var-food_ontology-predict-configuration",
                 deserialize_json=True)
predict_properties_daily = \
def execute(self, context):
    bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                           delegate_to=self.delegate_to)

    if not self.schema_fields:
        if self.schema_object and self.source_format != 'DATASTORE_BACKUP':
            gcs_hook = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_fields = json.loads(gcs_hook.download(
                self.bucket, self.schema_object).decode("utf-8"))
        elif self.schema_object is None and self.autodetect is False:
            raise ValueError('At least one of `schema_fields`, `schema_object`, '
                             'or `autodetect` must be passed.')
        else:
            schema_fields = None
    else:
        schema_fields = self.schema_fields

    source_uris = ['gs://{}/{}'.format(self.bucket, source_object)
                   for source_object in self.source_objects]
    conn = bq_hook.get_conn()
    cursor = conn.cursor()

    if self.external_table:
        cursor.create_external_table(
            external_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            compression=self.compression,
            skip_leading_rows=self.skip_leading_rows,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            src_fmt_configs=self.src_fmt_configs,
            encryption_configuration=self.encryption_configuration
        )
    else:
        cursor.run_load(
            destination_project_dataset_table=self.destination_project_dataset_table,
            schema_fields=schema_fields,
            source_uris=source_uris,
            source_format=self.source_format,
            autodetect=self.autodetect,
            create_disposition=self.create_disposition,
            skip_leading_rows=self.skip_leading_rows,
            write_disposition=self.write_disposition,
            field_delimiter=self.field_delimiter,
            max_bad_records=self.max_bad_records,
            quote_character=self.quote_character,
            ignore_unknown_values=self.ignore_unknown_values,
            allow_quoted_newlines=self.allow_quoted_newlines,
            allow_jagged_rows=self.allow_jagged_rows,
            encoding=self.encoding,
            schema_update_options=self.schema_update_options,
            src_fmt_configs=self.src_fmt_configs,
            time_partitioning=self.time_partitioning,
            cluster_fields=self.cluster_fields,
            encryption_configuration=self.encryption_configuration)

    if cursor.use_legacy_sql:
        escaped_table_name = '[{}]'.format(self.destination_project_dataset_table)
    else:
        escaped_table_name = '`{}`'.format(self.destination_project_dataset_table)

    if self.max_id_key:
        cursor.execute('SELECT MAX({}) FROM {}'.format(
            self.max_id_key, escaped_table_name))
        row = cursor.fetchone()
        max_id = row[0] if row[0] else 0
        self.log.info(
            'Loaded BQ data with max %s.%s=%s',
            self.destination_project_dataset_table, self.max_id_key, max_id
        )
        return max_id
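# A hedged usage sketch: the execute() above matches the contrib
# GoogleCloudStorageToBigQueryOperator in Airflow 1.10.x. Bucket, object,
# schema, and table names below are illustrative placeholders.
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

load_events = GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_events',
    bucket='my-bucket',
    source_objects=['exports/events/*.csv'],
    destination_project_dataset_table='my-project.my_dataset.events',
    schema_object='schemas/events.json',  # fetched via GoogleCloudStorageHook, as above
    source_format='CSV',
    skip_leading_rows=1,
    write_disposition='WRITE_TRUNCATE',
    max_id_key='event_id',                # triggers the SELECT MAX(...) at the end
    bigquery_conn_id='bigquery_default',
    google_cloud_storage_conn_id='google_cloud_default',
    dag=dag,  # assumes an existing DAG object
)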
def execute(self, context):
    logging.info('Executing: %s', str(self.bql))
    hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                        delegate_to=self.delegate_to)
    conn = hook.get_conn()
    cursor = conn.cursor()
    cursor.run_query(self.bql, self.destination_dataset_table,
                     self.write_disposition, self.allow_large_results,
                     self.udf_config)
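# A hedged usage sketch for an operator with the execute() above (it mirrors
# the early contrib BigQueryOperator, which still took a `bql` argument).
# The query and table names are illustrative placeholders; the legacy-SQL
# table syntax matches that operator's default dialect.
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

aggregate_daily = BigQueryOperator(
    task_id='aggregate_daily',
    bql='SELECT date, COUNT(*) AS n FROM [my-project:my_dataset.events] GROUP BY date',
    destination_dataset_table='my_dataset.daily_counts',
    write_disposition='WRITE_TRUNCATE',
    allow_large_results=True,
    bigquery_conn_id='bigquery_default',
    dag=dag,  # assumes an existing DAG object
)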