def test_snowflake_operator(self, mock_get_hook):
    sql = """
    CREATE TABLE IF NOT EXISTS test_airflow (
        dummy VARCHAR(50)
    );
    """
    t = SnowflakeOperator(task_id='basic_snowflake', sql=sql, dag=self.dag)
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def upload_to_snowflake(task_id):
    sql_statements = []
    # look for CSVs to ingest into Snowflake
    for output_file in glob.glob(output_file_glob + ".csv"):
        s3_file_name = os.path.basename(output_file)
        tablename = os.path.splitext(s3_file_name)[0].replace("-", "_")
        snowflake_stage = Variable.get("SNOWFLAKE_STAGE", default_var="COVID_PROD")
        truncate_st = f'TRUNCATE TABLE {tablename}'
        insert_st = f'copy into {tablename} from @{snowflake_stage}/{s3_file_name} file_format = (type = "csv" field_delimiter = "," NULL_IF = (\'NULL\', \'null\',\'\') EMPTY_FIELD_AS_NULL = true FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'
        sql_statements.append(truncate_st)
        sql_statements.append(insert_st)
        sql_statements.append("COMMIT")
    create_insert_task = SnowflakeOperator(
        task_id=task_id,
        sql=sql_statements,
        autocommit=False,
        snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION", default_var="SNOWFLAKE"),
    )
    return create_insert_task
def load_raw_ingester_table(job_args, defaults, table, beid_list, parent_dag_name,
                            start_date, schedule_interval, sf_conn_id, sf_role,
                            sf_warehouse, sf_database, env):
    dag = DAG(
        "{}.{}".format(parent_dag_name, table),
        default_args=defaults,
        start_date=start_date,
        schedule_interval=schedule_interval,
        catchup=False,
        concurrency=32)
    sql_path = os.path.join(job_args["ingester_sql_path"], table)
    for raw_beid in beid_list:
        beid = re.search(r'.*client=([0-9]+)\/', raw_beid).group(1)
        raw_ingester_query = SqlUtils.load_query(sql_path).replace(
            "[beid_param]", raw_beid).split("---")
        per_beid_task = SnowflakeOperator(
            task_id=beid,
            snowflake_conn_id=sf_conn_id,
            warehouse=sf_warehouse,
            database=sf_database,
            sql=raw_ingester_query,
            params={
                "env": env,
                "table": table
            },
            autocommit=True,
            dag=dag)
    return dag
def execute_script(sql_file):
    script = env.get_template(sql_file).render(**template_params)
    sql_statements = script.split(";")
    return SnowflakeOperator(
        task_id="execute_script_" + sql_file,
        sql=sql_statements,
        autocommit=False,
        snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION", default_var="SNOWFLAKE"),
    )
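This helper assumes a Jinja environment `env` and a `template_params` dict defined elsewhere in the same module; neither is shown above. A minimal sketch of that setup, assuming the SQL templates live in a local sql/ directory, might look like:

# Sketch only: `env` and `template_params` are not shown in the snippet above,
# so this is one plausible way they could be defined. Paths and values are
# hypothetical.
from datetime import date

from jinja2 import Environment, FileSystemLoader

# Load .sql templates from a local directory so env.get_template(sql_file) works.
env = Environment(loader=FileSystemLoader("sql"))

# Values substituted into each template when it is rendered.
template_params = {
    "database": "ANALYTICS",            # hypothetical database name
    "run_date": date.today().isoformat(),
}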
def upload_to_snowflake(task_id):
    insert_st = f'copy into {basename.replace("-","_")} from @COVID_PROD/{basename}.csv file_format = (type = "csv" field_delimiter = "," FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'
    create_insert_task = SnowflakeOperator(
        task_id=task_id,
        sql=insert_st,
        snowflake_conn_id="OJ10999_COVID19",
    )
    return create_insert_task
def upload_to_snowflake(task_id):
    truncate_st = f'TRUNCATE TABLE {basename.replace("-", "_")}'
    insert_st = f'copy into {basename.replace("-","_")} from @COVID_PROD/{basename}.csv file_format = (type = "csv" field_delimiter = "," FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'
    create_insert_task = SnowflakeOperator(
        task_id=task_id,
        sql=[truncate_st, insert_st, "COMMIT"],
        autocommit=False,
        snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION", default_var="SNOWFLAKE"),
    )
    return create_insert_task
def upload_to_snowflake(task_id):
    sql_statements = []
    for output_file in glob.glob(output_file_glob):
        s3_file_name = os.path.basename(output_file)
        tablename = os.path.splitext(s3_file_name)[0].replace("-", "_")
        truncate_st = f'TRUNCATE TABLE {tablename}'
        insert_st = f'copy into {tablename} from @COVID_PROD/{s3_file_name} file_format = (type = "csv" field_delimiter = "," FIELD_OPTIONALLY_ENCLOSED_BY=\'"\' skip_header = 1)'
        sql_statements.append(truncate_st)
        sql_statements.append(insert_st)
        sql_statements.append("COMMIT")
    create_insert_task = SnowflakeOperator(
        task_id=task_id,
        sql=sql_statements,
        autocommit=False,
        snowflake_conn_id=Variable.get("SNOWFLAKE_CONNECTION", default_var="SNOWFLAKE"),
    )
    return create_insert_task
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**']
}

DAG_DESCRIPTION = 'Email discounts to customers that have experienced order delays daily'
DAG = dag = DAG(
    DAG_ID,
    schedule_interval='@weekly',
    default_args=DAG_DEFAULT_ARGS,
    description=DAG_DESCRIPTION)

TASK_ID = 'select'
TASK = SnowflakeOperator(task_id=TASK_ID, snowflake_conn_id=CONN_ID, sql=SQL, dag=DAG)


@mock.patch(
    'openlineage.airflow.extractors.snowflake_extractor.SnowflakeExtractor._get_table_schemas'
)  # noqa
@mock.patch('openlineage.airflow.extractors.postgres_extractor.get_connection')
def test_extract(get_connection, mock_get_table_schemas):
    mock_get_table_schemas.side_effect = \
        [[DB_TABLE_SCHEMA], NO_DB_TABLE_SCHEMA]

    conn = Connection()
    conn.parse_from_uri(uri=CONN_URI)
    get_connection.return_value = conn
from airflow import DAG
from datetime import datetime, timedelta
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2021, 5, 14),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=60),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('sr_etl2', default_args=default_args, schedule_interval='@once')

snowflake = SnowflakeOperator(
    task_id='snowflake_task',
    sql="select * from CURO_CLAIMS.curo_medclaims LIMIT 5",
    snowflake_conn_id='ocudm',
    dag=dag)
body = """ Hi Everyone, <br> <br> There's been an error in the {task_name} job.<br> <br> Forever yours,<br> Airflow bot <br> """.format(**contextDict) send_email_smtp('*****@*****.**', title, body) with dag: create_table = SnowflakeOperator( task_id="snowflake_create", sql=query_for_creation, snowflake_conn_id="snowflake_airflow", ) with dag: load_table = SnowflakeOperator(task_id='snowflake_load', sql=query_for_loading, snowflake_conn_id="snowflake_airflow") with dag: update_data = SnowflakeOperator( task_id='snowflake_update', sql=query_for_update, snowflake_conn_id="snowflake_airflow", ) with dag:
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'snowflake_analytics',
    default_args=default_args,
    description='Snowflake analytics pipeline',
    schedule_interval='@daily',
)

t1 = SnowflakeOperator(
    task_id='create_analytics_calendar_date',
    sql='sql/create_analytics_calendar_date.sql',
    snowflake_conn_id='snowflake_default',
    warehouse='load_wh',
    database='analytics',
    autocommit=True,
    dag=dag)

t2 = SnowflakeOperator(
    task_id='create_analytics_airline',
    sql='sql/create_analytics_airline.sql',
    snowflake_conn_id='snowflake_default',
    warehouse='load_wh',
    database='analytics',
    autocommit=True,
    dag=dag)

t3 = SnowflakeOperator(
    task_id='create_analytics_nyc_taxi',
)

stage_finish = DummyOperator(task_id="adlogs_snowflake_staging_finish")

# staging ad logs hourly
for table in JOB_ARGS["tables"]:
    stage_sql_path = os.path.join(
        JOB_ARGS["stage_sql_path"], table
    )
    query_log = SqlUtils.load_query(stage_sql_path).split("---")
    stage_adlogs_hourly_job = SnowflakeOperator(
        task_id="stage_logs_{}_hourly".format(table),
        snowflake_conn_id=SF_CONN_ID,
        warehouse=SF_WAREHOUSE,
        database=SF_DATABASE,
        sql=query_log,
        params={
            "env": ENV,
            "team_name": TEAM_NAME
        },
        autocommit=True,
        trigger_rule='all_done',
        dag=DAG
    )
    stage_adlogs_hourly_job >> stage_finish
def get_row_count(**context):
    dwh_hook = SnowflakeHook(snowflake_conn_id="snowflake_common")
    result = dwh_hook.get_first(
        "select count(*) from private.{}".format(table_name))
    logging.info("Number of rows in `private.%s` - %s", table_name, result[0])


with dag:
    create_table = SnowflakeOperator(
        task_id="create_table",
        sql=create_table_queries,
        snowflake_conn_id="snowflake_common",
        outlets={
            "datasets": [
                SnowflakeTable(
                    table_alias="cy25812.ap-southeast-1/demo_db/private/{}".format(table_name),
                    name=table_name)
            ]
        })

    count_rows = PythonOperator(
        task_id="count_rows",
        python_callable=get_row_count,
        inlets={
            "datasets": [
                SnowflakeTable(
                    table_alias="cy25812.ap-southeast-1/demo_db/private/{}".format(table_name),
                    name=table_name)
    result = snowflake_hook.get_first(
        "SELECT COUNT(*) FROM @AZURE_DEMO_STAGE/superstore_sample_data.csv (FILE_FORMAT => CSV_FILE_FORMAT);"
    )
    logging.info("Number of rows in raw file - %s", result[0])


# Using a with statement to register the tasks below with the DAG
with dag:
    # Using a python operator to run check_raw, which selects from our raw data in Azure
    check_raw = PythonOperator(task_id="check_raw", python_callable=check_raw)

    # Using a snowflake operator to execute our create command in Snowflake
    create = SnowflakeOperator(
        task_id="snowflake_create",
        sql=create_query,
        snowflake_conn_id="snowflake_conn",
    )

    # Using a snowflake operator to execute our insert command in Snowflake
    insert = SnowflakeOperator(
        task_id="snowflake_insert",
        sql=insert_query,
        snowflake_conn_id="snowflake_conn",
    )

    # Using a python operator to execute a user-defined python function
    check_table = PythonOperator(task_id="check_table", python_callable=check_table)

    # Using python bitwise shift operators to determine the workflow of our dag (the order in which our tasks will be executed)
dag = DAG(
    dag_id="sample_snowflake",
    default_args=default_args,
    start_date=days_ago(1),
    schedule_interval=None
)

status_sql = """
    insert into public.stage_status values (101, '{}', 'create_file', 'success');
""".format(str(datetime.now()))


def row_count(**context):
    dwh_hook = SnowflakeHook(snowflake_conn_id="snowflake_sql")
    result = dwh_hook.get_first("select count(*) from public.stage_status")
    print("Number of rows in `public.test_table` - %s", result[0])


with dag:
    create_insert = SnowflakeOperator(
        task_id="snowflake_insert",
        sql=status_sql,
        snowflake_conn_id="snowflake_sql",
    )
    get_count = PythonOperator(
        task_id="get_count",
        python_callable=row_count
    )

    create_insert >> get_count
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator
from atlan.lineage.assets import SnowflakeTable

args = {"owner": "Atlan", "start_date": airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id="customer_distribution_apac", default_args=args, schedule_interval=None)

with dag:
    ## task definition
    customer_nation_join = SnowflakeOperator(
        task_id="customer_nation_join",
        sql="""CREATE TABLE biw.private.customer_enriched AS
               SELECT c.c_custkey, c.c_acctbal, c.c_mktsegment, n.n_nationkey, n.n_name
               FROM biw.raw.customer c
               INNER JOIN biw.raw.nation n ON c.c_nationkey = n.n_nationkey""",
        snowflake_conn_id="snowflake_common",
        inlets={
            "datasets": [SnowflakeTable(table_alias="mi04151.ap-south-1/biw/raw/customer", name="customer"),
                         SnowflakeTable(table_alias="mi04151.ap-south-1/biw/raw/nation", name="nation")]
        },
        outlets={
            "datasets": [SnowflakeTable(table_alias="mi04151.ap-south-1/biw/private/customer_enriched", name="customer_enriched")]
        }
    )
DAG = DAG(  # creating DAG
    "stage_ad_logs_to_snowflake",  # goal of DAG is to stage ad logs to Snowflake
    default_args=DEFAULTS,
    start_date=datetime(2018, 1, 1),
    schedule_interval=JOB_ARGS["schedule_interval"],
    catchup=False)

stage_int_sql_path = os.path.join(
    JOB_ARGS["stage_sql_path"],  # stage_sql_path = adlogs/load_raw_logs
    "int")
stage_int_hourly_query = SqlUtils.load_query(stage_int_sql_path).split("---")

stage_int_hourly_job = SnowflakeOperator(
    task_id="stage_adlogs_int_hourly",
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=stage_int_hourly_query,
    params={"env": ENV},
    autocommit=True,
    dag=DAG)

stage_int_tables = DummyOperator(task_id="finish_int_rl_staging")

stage_onetag_sql_path = os.path.join(JOB_ARGS["stage_sql_path"], "onetag")
stage_onetag_hourly_query = SqlUtils.load_query(stage_onetag_sql_path).split("---")

stage_onetag_hourly_job = SnowflakeOperator(
    task_id="stage_adlogs_onetag_hourly",
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=stage_onetag_hourly_query,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'snowflake_raw',
    default_args=default_args,
    description='Snowflake raw pipeline',
    schedule_interval='0 */6 * * *',
)

t1 = SnowflakeOperator(
    task_id='copy_raw_airline',
    sql='sql/copy_raw_airline.sql',
    snowflake_conn_id='snowflake_default',
    warehouse='load_wh',
    database='raw',
    autocommit=True,
    dag=dag)

t2 = SnowflakeOperator(
    task_id='copy_raw_nyc_taxi',
    sql='sql/copy_raw_nyc_taxi.sql',
    snowflake_conn_id='snowflake_default',
    warehouse='load_wh',
    database='raw',
    autocommit=True,
    dag=dag)

t1 >> t2
sql_path = os.path.join(
    JOB_ARGS["stage_sql_path"],  # stage_sql_path = adlogs/log_process/filename.sql
    process)

# sql_query is a list of all the queries in a given .sql file, separated by '---'
sql_query = SqlUtils.load_query(sql_path).split("---")
query_log += sql_query
# for i in range(len(sql_query)):
#     query_log.append(sql_query[i])  # using .append() instead of +=

# create task with appropriate sql query
transform_task = SnowflakeOperator(
    task_id="{}_transform".format(table),
    snowflake_conn_id=SF_CONN_ID,
    warehouse=SF_WAREHOUSE,
    database=SF_DATABASE,
    sql=query_log,  # here is where we pass in the query log
    params={  # set strings to reference in the .sql scripts
        "env": ENV,
        "table": table,
        "database": SF_DATABASE,  # ft_db_dev
        "transform_schema": TRANSFORM_SCHEMA,  # transform_schema
        "business_schema": BUSINESS_SCHEMA  # ssc_business
    },
    autocommit=True,
    dag=DAG)

adlogs_sensor >> transform_task >> transform_finish
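This and several of the other snippets rely on a project-specific SqlUtils.load_query helper whose implementation is not shown. Judging only by how it is called (the result is split on '---' to get a list of statements), a hypothetical minimal version could look like:

# Hypothetical stand-in for the SqlUtils helper used above; the real
# implementation is not included in these snippets, so this only mirrors
# how the helper is called.
class SqlUtils:
    @staticmethod
    def load_query(sql_path):
        # Read the .sql file at sql_path and return its full text;
        # callers split the text on '---' to build the list of statements
        # passed to SnowflakeOperator's sql argument.
        with open(sql_path) as sql_file:
            return sql_file.read()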
    # op_kwargs = {"file_name" : file}
)

load_to_s3 = PythonOperator(
    dag=dag,
    task_id="load_to_s3",
    python_callable=load_to_s3,
    provide_context=True,
    op_kwargs={"file_name": "contacts"})

s3_to_snowflake = PythonOperator(
    dag=dag,
    task_id="s3_to_snowflake",
    python_callable=s3_to_snowflake,
    provide_context=True,
    op_kwargs={"file_name": "contacts"})

Contacts_merge_snowflake = SnowflakeOperator(
    dag=dag,
    task_id="Contacts_merge_snowflake",
    sql=merge_query,
    snowflake_conn_id="snowflake_conn")

completed_flag = PythonOperator(
    dag=dag,
    task_id="completed_flag",
    python_callable=completed_flag,
    provide_context=True
    # op_kwargs = {"file_name" : file}
)

dummy = DummyOperator(task_id="dummy", dag=dag)

dummy >> get_latest_Created_contacts >> get_latest_Updated_contacts >> get_contact_properties >> load_to_s3 >> s3_to_snowflake >> Contacts_merge_snowflake >> completed_flag
dag = DAG(dag_id="snowflake_connector_test", default_args=args, schedule_interval='@once') create_insert_query = [ """create table public.test_table (amount number);""", """insert into public.test_table values(1),(2),(3);""" ] #def row_count(**kwargs): # sleep(120) # dwh_hook = SnowflakeHook(snowflake_conn_id="ocudm") # result = dwh_hook.get_first("select count(*) from curo_medclaims") # print("hello") # sleep(120) # logging.info("Number of rows in `curo_medclaims` - %s", result[0]) with dag: create_insert = SnowflakeOperator( task_id="snowfalke_create", sql=create_insert_query, snowflake_conn_id="ocudm", ) #get_count = PythonOperator(task_id="get_count", python_callable=row_count,provide_context=True) sleep = BashOperator(task_id='sleep', bash_command='sleep 30') print_hello = BashOperator(task_id='print_hello', bash_command='echo "hello"') print_hello >> sleep >> create_insert
import logging

import airflow
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.snowflake_hook import SnowflakeHook
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

args = {"owner": "Airflow", "start_date": airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id="snowflake_conn", default_args=args, schedule_interval=None)

create_query = [
    """create table PUBLIC.TEST_TABLE (amount number);""",
]

with dag:
    create = SnowflakeOperator(
        task_id="snowflake_create",
        sql=create_query,
        snowflake_conn_id="snowflake_conn",
    )  # create_query
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('demo',
          max_active_runs=1,
          schedule_interval='@daily',
          default_args=default_args,
          catchup=False)

create_discounts_by_quarter = """
create or replace table {{ var.value.database }}.{{ var.value.spa_schema }}.discounts_by_fiscal_quarter as
select
    c_start.fiscal_quarter,
    c_start.fiscal_year,
    count(*) as discounts
from {{ var.value.database }}.{{ var.value.ods_schema }}.discount d
join {{ var.value.database }}.{{ var.value.ods_schema }}.calendar c_start
    on d.start_date = c_start.date
group by 1, 2
order by 2, 1
"""

with dag:
    start = DummyOperator(task_id='start', dag=dag)

    discount_by_quarter = SnowflakeOperator(
        task_id='discount_by_quarter',
        snowflake_conn_id='snowflake_default',
        sql=create_discounts_by_quarter,
        warehouse='ANALYTICS_LARGE',
        dag=dag)

    start >> discount_by_quarter
from airflow import DAG
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('parameterized_query',
         start_date=datetime(2020, 6, 1),
         max_active_runs=3,
         schedule_interval='@daily',
         default_args=default_args,
         template_searchpath='/usr/local/airflow/include',
         catchup=False) as dag:

    opr_param_query = SnowflakeOperator(task_id='param_query',
                                        snowflake_conn_id='snowflake',
                                        sql='param-query.sql')
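The contents of param-query.sql are not included above; Airflow resolves the file name via template_searchpath and renders it with Jinja before execution. For illustration only, an equivalent inline form of such a templated query might look like the following; the table and column names are hypothetical, not taken from the source.

# Sketch (not from the source): the same operator with an inline templated query
# instead of a .sql file. ORDERS and ORDER_DATE are hypothetical; {{ ds }} is
# Airflow's built-in execution-date template variable.
from airflow.contrib.operators.snowflake_operator import SnowflakeOperator

opr_param_query_inline = SnowflakeOperator(
    task_id='param_query_inline',
    snowflake_conn_id='snowflake',
    sql="SELECT * FROM ORDERS WHERE ORDER_DATE = '{{ ds }}';",
    dag=dag)  # assumes the same `dag` defined in the snippet above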