from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import BashOperator, DummyOperator

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(
    dag_id='example_bash_operator', default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(
    task_id='run_after_loop', bash_command='echo 1', dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_'+i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
task.set_downstream(run_this_last)

default_args = {
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('dag', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
requirements = BashOperator(
    task_id='requirements',
    bash_command='pip install -r requirements.txt',
    dag=dag)

data = BashOperator(
    task_id='data',
    bash_command='python src/make_dataset.py',
    dag=dag)

clean = BashOperator(
    task_id='clean',
    bash_command=r'find . -name "*.pyc" -exec rm {} \;',
    dag=dag)

lint = BashOperator(
    task_id='flake8',
    bash_command='flake8 .',
    dag=dag)

data.set_upstream(requirements)
dag = DAG(
    'bash_pipeline',  # placeholder dag_id; the original value is not shown in this excerpt
    default_args={'owner': 'airflow', 'provide_context': True})



valid_chars = '-_.abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

def sanitize(text):
    return ''.join(c for c in text if c in valid_chars)
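
# A quick illustration of sanitize(): spaces and quotes are not in
# valid_chars, so they are dropped from the generated task_id.
assert sanitize('echo "hi russ"') == 'echohiruss'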

# This is where the pipeline-generated bash commands come in:
bash_commands = ('echo "hi russ"', 'echo "hello again"')
conclusion_command = 'echo "all done"'
conclusion = BashOperator(task_id='conclude', bash_command=conclusion_command, dag=dag)

for cmd in bash_commands:
    cmd = cmd.rstrip()
    run_this = BashOperator(
        task_id=sanitize(cmd), bash_command=cmd, dag=dag)
    run_this.set_downstream(conclusion)
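
# Note: task ids must be unique within a DAG, so if two commands sanitized to
# the same string, the loop above would try to create a duplicate task_id.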








# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

# def push_by_returning(**kwargs):
#     # pushes an XCom without a specific target, just by returning it
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'docker_sample', default_args=default_args, schedule_interval=timedelta(minutes=10))

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

t3 = DockerOperator(api_version='1.19',
    docker_url='tcp://localhost:2375', #Set your docker URL
    command='/bin/sleep 30',
    image='centos:latest',
    network_mode='bridge',
    task_id='docker_op_tester',
    dag=dag)
        dag = DAG(dag_id,
                  default_args=default_args,
                  schedule_interval='@once',
        )

        make_klusta_dir_task = BashOperator(
            task_id='make_klusta_dir',
            bash_command=make_klustadir_cmd,
            params={'klustadir': KLUSTA_DIR},
            on_success_callback = lambda c: set_perms(c['params']['klustadir'],default_args['owner']), 
            dag=dag)
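        # on_success_callback receives the Airflow task context dict, so
        # c['params']['klustadir'] is the same params value passed to the
        # operator above.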

        make_kwd_task = BashOperator(
            task_id='make_kwd',
            # pool='RAM',
            bash_command=make_kwd_command,
            env={'PATH': ANACONDA_PATH},
            params={'klustadir': KLUSTA_DIR,
                    'matfiledir': MATFILE_DIR,
                    'probe': PROBE,
                    'rig': RIG,
                    'omit': OMIT},
            on_failure_callback = lambda c: clean_dir(c['params']['klustadir']),
            on_success_callback = lambda c: set_perms(c['params']['klustadir'],default_args['owner']),
            dag=dag)

        phy_task = BashOperator(
            task_id='phy_spikesort',
            # pool='CPU',
            env={'PATH': PHY_PATH},
            bash_command=sort_spikes_command,
            params={'klustadir': KLUSTA_DIR,
                    'matfiledir': MATFILE_DIR},
            on_failure_callback = lambda c: [clean_dir(c['params']['klustadir'],filt) for filt in ('*.kwik','*.kwx')],
            on_success_callback = lambda c: set_perms(c['params']['klustadir'],default_args['owner']),
            dag=dag)

default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is the DAG's cron expression; any standard Unix cron expression works.
dag = DAG('airflow_task_with_hdfs_sensor', default_args=default_args, schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id='dependency_for_hdfs_sensor',
    bash_command='echo "HDFS sensor would only be enabled after I am done!"',
    dag=dag)

# The sensor operator takes "filepath" and checks whether that file is present in HDFS.
# "hdfs_conn_id" refers to a connection configured in the UI under Admin --> Connections.
hdfs_sensor_task = HdfsSensor(
    task_id='hdfs_sensor_task',
    filepath='/user/samarthg/input2',
    hdfs_conn_id='webhdfs_default',
    dag=dag)

post_hdfs_sensor_task = BashOperator(
    task_id='post_hdfs_sensor_task',
    bash_command='echo "I am done, it means sensor has done its job."',
    dag=dag)

# Setting up the correct dependencies for defined tasks.
hdfs_sensor_task.set_upstream(bash_task)
post_hdfs_sensor_task.set_upstream(hdfs_sensor_task)
Example #7
default_args = {
    'owner': 'anjana',
    'depends_on_past': False,
    'start_date': datetime(2020, 6, 21),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

with DAG('Helloworld', default_args=default_args) as dag:

    t1 = BashOperator(
        task_id='task_1',
        bash_command='echo "Hello World from Task 1"',
        dag=dag)

    t2 = BashOperator(
        task_id='task_2',
        bash_command='echo "Hello World from Task 2"',
        dag=dag)

    t3 = BashOperator(
        task_id='task_3',
        bash_command='echo "Hello World from Task 3"',
        dag=dag)

    t4 = BashOperator(
        task_id='task_4',
        bash_command='echo "Hello World from Task 4"',
Example #8
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_bash_operator',
          default_args=args,
          schedule_interval='0 0 * * *',
          dagrun_timeout=timedelta(minutes=60))

cmd = 'ls -l'
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

run_this = BashOperator(task_id='run_after_loop',
                        bash_command='echo 1',
                        dag=dag)
run_this.set_downstream(run_this_last)

for i in range(3):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='echo "{{ task_instance_key_str }}" && sleep 1',
        dag=dag)
    task.set_downstream(run_this)

task = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"',
    dag=dag)
Example #9
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval= '@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello world" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key='*',
    wildcard_match=True,
    bucket_name='airflow-input-spite',
    s3_conn_id='aws_default',
    timeout=18*60*60,
    poke_interval=120,
    dag=dag)
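# poke_interval=120 makes the sensor check the bucket every two minutes;
# timeout=18*60*60 fails the sensor if no matching key appears within 18 hours.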

t1.set_upstream(sensor)

default_dag_args = {
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # Retries are disabled here; set 'retries' to a positive value to retry
    # failed tasks after waiting for 'retry_delay'.
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with DAG(dag_id='monitor_dag',
         schedule_interval=None,
         default_args=default_dag_args) as dag:

    bash_prerequisites_install_cmd = """sudo apt install -y python-pip"""
    bash_prerequisites_install = BashOperator(
        task_id='bash_prerequisites_install',
        bash_command=bash_prerequisites_install_cmd)

    bash_pip_install_cmd = """sudo pip install pandas google-colab google-cloud-bigquery google-cloud-bigquery-storage pyarrow pyTelegramBotAPI"""
    bash_pip_install = BashOperator(task_id='bash_pip_install',
                                    bash_command=bash_pip_install_cmd)

    bash_colab_export_script_cmd = """python /home/omid/gs_dags/script.py"""
    bash_colab_export_scriptTask = BashOperator(
        task_id='bash_colab_export_script',
        bash_command=bash_colab_export_script_cmd)

bash_prerequisites_install >> bash_pip_install >> bash_colab_export_scriptTask
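# The bitshift chain above (a >> b) is equivalent to a.set_downstream(b): the
# prerequisites install runs first, then the pip install, then the export script.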
Example #11
# @author: uday sharma

from airflow import DAG
from airflow.operators import BashOperator, HiveOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'udaysharma',
    'start_date': datetime(2016, 1, 14),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('incremental_load', default_args=default_args)

sqoop_job = """
 exec ./scripts/sqoop_incremental.sh
"""
# Importing the data from Mysql table to HDFS
task1 = BashOperator(task_id='sqoop_import', bash_command=sqoop_job, dag=dag)

# Inserting the data from Hive external table to the target table
task2 = HiveOperator(
    task_id='hive_insert',
    hql=
    'INSERT INTO TABLE orders_trans SELECT order_id, first_name,last_name, item_code, order_date FROM orders_stg;',
    depends_on_past=True,
    dag=dag)
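# depends_on_past=True makes each scheduled run of hive_insert wait for the
# previous run of this task to have succeeded before it starts.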

# defining the job dependency
task2.set_upstream(task1)
Example #12
pipeline_args = {
    'owner': 'pipeliner',
    'depends_on_past': False,
    'start_date': datetime(2016, 10, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag_id = 'pipeline_sample_dag'

dag = DAG(dag_id, default_args=pipeline_args, schedule_interval='0 0 * * 6')

globals()[dag_id] = dag
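# Registering the DAG in globals() under its dag_id lets the Airflow
# scheduler discover it when this module is parsed.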

# Create sample task operators

task_1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

task_2 = BashOperator(task_id='sleep',
                      bash_command='sleep 60',
                      retries=3,
                      dag=dag)

task_3 = BashOperator(task_id='print_date_again', bash_command='date', dag=dag)

task_2.set_upstream(task_1)
task_3.set_upstream(task_2)
from __future__ import absolute_import, unicode_literals
import os
from airflow.operators import BashOperator
from airflow.models import DAG
from datetime import datetime, timedelta

args = {
    'owner': 'recursive_schedule_interval',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
}

dag = DAG(
    dag_id='recursive_schedule_interval',
    default_args=args,
    schedule_interval="*/5 * * * 1,2,3,4,5",
    catchup=False
)

# Bash command that echoes the job execution timestamp
CMD = 'echo Job executed at: $(date +"%d/%m/%Y %k:%M:%S")'

run_this = BashOperator(
    task_id='bash_operator', bash_command=CMD, dag=dag
)
Example #14
"""
Documentation that goes along with the Airflow tutorial located
[here](http://pythonhosted.org/airflow/tutorial.html)
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta
from settings import default_args

dag = DAG('reddit_comments', default_args=default_args, schedule_interval=timedelta(2))
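# A timedelta schedule_interval runs the DAG on a fixed cadence; timedelta(2)
# here means once every two days.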


pre = BashOperator(
    task_id='setup',
    bash_command='tasks/setup_dirs.sh',
    depends_on_past=False,
    dag=dag)

t1 = BashOperator(
    task_id='ngrams_batch',
    bash_command='tasks/run_ngrams_batch.sh',
    depends_on_past=False,
    dag=dag)

t2 = BashOperator(
    task_id='ngrams_optimize',
    depends_on_past=True,
    bash_command='tasks/optimize_ngrams.sh',
    dag=dag)

t1.set_upstream(pre)
t2.set_upstream(t1)
Example #15
default_args = {
    'retries': 0,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('bharat_sheep_download',
          default_args=default_args,
          schedule_interval="@once")
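# schedule_interval="@once" schedules exactly one run of the DAG after its
# start_date.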

# t1, t2, t3 and t4 are examples of tasks created using operators

dl_cmd = """hive.c4000 --hiveconf start_date=\"\'{}\'\" --hiveconf end_date=\"\'{}\'\" -f /home/prabhakarbha01/sheep/src/data/1_insert_into_download.sql"""

dl_reset_cmd = """hive.c4000 -f /home/prabhakarbha01/sheep/src/data/1_drop_create_download.sql"""

dl_reset = BashOperator(task_id='reset_agg_table',
                        bash_command=dl_reset_cmd,
                        dag=dag)

dl_10_01 = BashOperator(task_id='dl_10_01',
                        bash_command=dl_cmd.format("2018-10-01", "2018-10-08"),
                        dag=dag)

dl_10_02 = BashOperator(task_id='dl_10_02',
                        bash_command=dl_cmd.format("2018-10-08", "2018-10-16"),
                        dag=dag)

dl_10_03 = BashOperator(task_id='dl_10_03',
                        bash_command=dl_cmd.format("2018-10-16", "2018-10-24"),
                        dag=dag)

dl_10_04 = BashOperator(task_id='dl_10_04',
Example #16
def dict_from_cmd_list(cmds):
    # Map unique, sanitized task ids to their original bash commands,
    # appending underscores until each id is unique.
    dic = {}
    for c in cmds:
        n = sanitize(c)
        while n in dic:
            n = n + '_'
        dic[n] = c
    return dic

command_dict = dict_from_cmd_list(bash_commands)
tasks = []






for n, c in command_dict.items():
    task = BashOperator(task_id=n, bash_command=c, dag=dag, pool='default')
    if len(tasks) > 0:
        task.set_upstream(tasks[-1])
    tasks.append(task)



job = DagExecutionJob(dag)





# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)
Example #17
"""
Code that goes along with the Airflow located at:
http://airflow.readthedocs.org/en/latest/tutorial.html
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 4, 22),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 4, 24),
}

dag = DAG('spark_pi', default_args=default_args)

# t1 is an example of a task created by instantiating an operator
t1 = BashOperator(
    task_id='spark_pi',
    bash_command=
    'spark-submit --class org.apache.spark.examples.SparkPi --master spark://127.0.0.1:7077 $SPARK_EXAMPLES_JAR 10',
    dag=dag)
from_channels = ['fromTwitter_A', 'fromTwitter_B', 'fromTwitter_C', 'fromTwitter_D']
to_channels = ['toTwitter_A', 'toTwitter_B', 'toTwitter_C', 'toTwitter_D']
yesterday = date.today() - timedelta(days=1)
dt = yesterday.strftime("%Y-%m-%d")
# define where you want to store the tweets csv file in your local directory
local_dir = "/tmp/"
# define the location in HDFS where you want to store the file
hdfs_dir = " /tmp/"
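# Note: the leading space in hdfs_dir is intentional; when the strings are
# concatenated into bash_command below, it separates the local source file
# from the HDFS destination path.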

for channel in to_channels:

    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)
Example #19
from airflow import DAG
from airflow.operators import BashOperator, TimeSensor
from datetime import datetime, timedelta, time

default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is the DAG's cron expression; any standard Unix cron expression works.
dag = DAG('airflow_task_with_time_sensor',
          default_args=default_args,
          schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id='dependency_for_sensor',
    bash_command='echo "Sensor would only be enabled after I am done!"',
    dag=dag)

# The sensor operator takes "target_time", a specific time of day irrespective of the date.
# The sensor succeeds once the target time has passed; in this case, after 10:55 in the morning.
sensor_task = TimeSensor(task_id='sensor_task',
                         target_time=time(10, 55, 1, 1),
                         dag=dag)

post_sensor_task = BashOperator(
    task_id='post_sensor_task',
    bash_command='echo "I am done, it means sensor has done its job."',
    dag=dag)

# Setting up the correct dependencies for defined tasks.
sensor_task.set_upstream(bash_task)
def my_py_command(ds, **kwargs):
    # Print out the "foo" param passed in via
    # `airflow test example_passing_params_via_test_command run_this <date>
    # -tp '{"foo":"bar"}'`
    if kwargs["test_mode"]:
        print(" 'foo' was passed in via test={} command : kwargs[params][foo] \
               = {}".format( kwargs["test_mode"], kwargs["params"]["foo"]) )
    # Print out the value of "miff", passed in below via the Python Operator
    print(" 'miff' was passed in via task params = {}".format( kwargs["params"]["miff"]) )
    return 1

my_templated_command = """
    echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
    echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(
    task_id='run_this',
    provide_context=True,
    python_callable=my_py_command,
    params={"miff":"agg"},
    dag=dag)

also_run_this = BashOperator(
    task_id='also_run_this',
    bash_command=my_templated_command,
    params={"miff":"agg"},
    dag=dag)
also_run_this.set_upstream(run_this)
default_args = {
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(minutes=1),
    'email': [],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('airflow_task_script_1',
          default_args=default_args,
          schedule_interval=None,
          start_date=datetime.now() - timedelta(minutes=1))
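# schedule_interval=None means this DAG has no schedule and only runs when it
# is triggered manually.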


generating_the_MySql_data= BashOperator(
    task_id='generating_the_MySql_data',
    bash_command="cd ~/Documents/data/ ; python practical_exercise_data_generator.py --load_data;",
    dag=dag)

generating_the_CSV_data= BashOperator(
    task_id='generating_the_CSV_data',
    bash_command="cd ~/Documents/data/ ; python practical_exercise_data_generator.py --create_csv",
    dag=dag)

Sqoop_import_user= BashOperator(
    task_id='Sqoop_import_user',
    bash_command=""" 
sqoop import --connect jdbc:mysql://localhost/practical_exercise_1 --username root --password-file /user/cloudera/root_pwd.txt --table user -m 4 --hive-import --hive-overwrite --hive-database practical_exercise_1 --hive-table user;
if [ $? -ne 0 ];then
	echo Failed at importing user table
	exit 1
fi
""",
    dag=dag)



# Importing the incremental data from Mysql table to HDFS
task1 = BashOperator(
        task_id= 'sqoop_incremental_import',
        #bash_command=sqoop_incremental_job,
        bash_command='./sqoop_incremental.sh',
        dag=dag
)

# merge the data from Mysql table to HDFS
task2 = BashOperator(
        task_id= 'sqoop_merge_import',
        bash_command='./sqoop_merge.sh',
        dag=dag
)

# Inserting the data from Hive external table to the target table
task3 = HiveOperator(
        task_id= 'hive_insert',
        hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;',
        depends_on_past=True,
        dag=dag
)

# Inserting the data from Hive table with masked ssn external table to the target table
task4 = HiveOperator(
        task_id= 'hive_insert_masked',
        hql='add jar /home/cloudera/Masking.jar;create TEMPORARY function masking as \'Masking\';INSERT OVERWRITE table employee_profile_masked SELECT profile_id,first_name,last_name,modified_date,masking(ssn) FROM employee_profile;',
Example #23
default_args = {
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('thirdpay',
          default_args=default_args,
          schedule_interval='0 12 * * *')

EXECUTION_DATE = "{{ ds }}"
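# "{{ ds }}" is an Airflow template macro that renders to the execution date
# of each run in YYYY-MM-DD form.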

start = BashOperator(task_id='start', bash_command='echo start ', dag=dag)

# Generate the ODS tables

ods_fin_third_pay_wxapp = BashOperator(
    task_id='ods_fin_third_pay_wxapp',
    bash_command='CheckTag -d {{ ds }} -l day  -b ods.ods_fin_third_pay_wxapp',
    dag=dag)
ods_fin_third_pay_wxapp.set_upstream(start)

ods_fin_third_pay_wxgzh = BashOperator(
    task_id='ods_fin_third_pay_wxgzh',
    bash_command='CheckTag -d {{ ds }} -l day  -b ods.ods_fin_third_pay_wxgzh',
    dag=dag)
ods_fin_third_pay_wxgzh.set_upstream(start)

task_args = {
  'email_on_retry': False
}


# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)


get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)
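# trigger_rule='all_done' lets the cleanup task run once its upstream tasks
# have finished, whether they succeeded or failed.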


set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
Example #25
"""
Code that goes along with the Airflow located at:
http://airflow.readthedocs.org/en/latest/tutorial.html
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime.now(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 4, 24),
}

dag = DAG('pyspark_submit_pi', default_args=default_args)

t1 = BashOperator(
    task_id='spark_submit',
    bash_command=
    'spark-submit --master spark://spark-master-2-0-1:7077 /root/volumes/source.ml/jupyterhub.ml/spark/pyspark_pi.py',
    dag=dag)
        PROBE = "A1x16-5mm-50"
        RIG = "burung16"

        dag_id = USER + BLOCK
        dag = DAG(dag_id, 
                  default_args=default_args,
                  schedule_interval='@once',
        )


        phy_task = BashOperator(
            task_id='phy_spikesort',
            pool='phy',
            env={'PATH': PHY_PATH},
            bash_command=sort_spikes_command,
            params={'klustadir': KLUSTA_DIR,
                    'matfiledir': MATFILE_DIR},
            on_failure_callback = lambda c: [clean_dir(c['params']['klustadir'],filt) for filt in ('*.kwik','*.kwx')],
            on_success_callback = lambda c: set_perms(c['params']['klustadir'],default_args['owner']),
            dag=dag)

        clear_phy_task = BashOperator(
            task_id='clear_phy',
            bash_command=clear_phy_cmd,
            params={'klustadir': KLUSTA_DIR,
                    'matfiledir': MATFILE_DIR},
            dag=dag)

        make_kwik_bak_dir_task = BashOperator(
            task_id='make_kwik_bak_dir',
            bash_command=make_kwik_bak_dir_cmd,
Example #27
from airflow.operators import BashOperator, MySqlOperator
from airflow.models import DAG
from datetime import datetime

default_args = {
    'owner': 'max',
    'start_date': datetime(2014, 9, 1),
    'mysql_dbid': 'local_mysql',
}

dag = DAG(dag_id='example_3')

run_this = BashOperator(
        task_id='also_run_this', bash_command='ls -l', **default_args)
dag.add_task(run_this)

for i in range(5):
    i = str(i)
    task = BashOperator(
            task_id='runme_'+i, 
            bash_command='sleep {{ 10 + macros.random() * 10 }}', 
            **default_args)
    task.set_upstream(run_this)
    dag.add_task(task)

Example #28
aml_utils = load_source(
    'aml_utils',
    "{pf}/asiamiles_airflow_extensions/utils.py".format(
        pf=configuration.get('core', 'plugins_folder')))

mod_config = aml_utils.load_config(
    "{dag_folder}/pros_etl.cfg".format(
        dag_folder=dirname(realpath(__file__))))

hdfs_home=mod_config['hadoop']['hdfs_home']

copy_rsynced_files_to_hadoop = BashOperator(
    task_id="copy_rsynced_files_to_hadoop",
    bash_command="hadoop fs -put -f /data1/staging/pros/* pros",
    dag=dag)

#spark-shell --master yarn-client

update_seat_idx = BashOperator(
  task_id="update_seat_idx",
  bash_command="cat /data1/airflow/dags/pros-etl/pros_seat_index_hist_load.scala | spark-shell --master yarn-client",
  dag=dag)

update_curve = BashOperator(
  task_id="update_curve",
  bash_command="cat /data1/airflow/dags/pros-etl/pros_bid_price_hist_load.scala | spark-shell --master yarn-client",
  dag=dag)

update_seat_idx.set_upstream(copy_rsynced_files_to_hadoop)
update_curve.set_upstream(copy_rsynced_files_to_hadoop)
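# Both Spark jobs depend only on the HDFS copy, so they can run in parallel
# once copy_rsynced_files_to_hadoop completes.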
"""
Executing tasks at a particular time of the day using sensor operator.
"""
from airflow import DAG
from airflow.operators import BashOperator, TimeSensor
from datetime import datetime, timedelta, time


default_args = {"owner": "Samarth", "start_date": datetime(2016, 3, 15, 12)}

# "schedule_interval" is the DAG's cron expression; any standard Unix cron expression works.
dag = DAG("airflow_task_with_time_sensor", default_args=default_args, schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id="dependency_for_sensor", bash_command='echo "Sensor would only be enabled after I am done!"', dag=dag
)

# The sensor operator takes "target_time", a specific time of day irrespective of the date.
# The sensor succeeds once the target time has passed; in this case, after 10:55 in the morning.
sensor_task = TimeSensor(task_id="sensor_task", target_time=time(10, 55, 1, 1), dag=dag)

post_sensor_task = BashOperator(
    task_id="post_sensor_task", bash_command='echo "I am done, it means sensor has done its job."', dag=dag
)

# Setting up the correct dependencies for defined tasks.
sensor_task.set_upstream(bash_task)
post_sensor_task.set_upstream(sensor_task)
# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)


get_file = PythonOperator(
    task_id='get-file-from-s3',
    python_callable=FileGetter(),
    dag=dag)

hello_world_docker_write_logs = BashOperator(
    task_id='hello-world',
    bash_command=start_hello_world,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)

check_read_logs = PythonOperator(
    task_id='check_read_logs',
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
    python_callable=DataPutter(),
    dag=dag)

delete_object = PythonOperator(
    task_id='delete-object-from-s3',

default_args = {}

dag = DAG(
    'financial_news', default_args=default_args, schedule_interval=timedelta(2))

#Run Camus to pull messages from Kafka into HDFS
camus_a = BashOperator(
        task_id='camus_a',
        bash_command='tasks/run_camus.sh',
        depends_on_past=True,
        dag=dag)

#Run Spark to sum all historical trades and write to Cassandra
trades_batch_a = BashOperator(
        task_id='trades_batch_a',
        bash_command='tasks/run_trades_batch.sh',
        depends_on_past=True,
        dag=dag)

#set trades batch after news batch to give it more memory
trades_batch_a.set_upstream(camus_a)

# Update Cassandra's stream 2 table to include counts from the batch run,
# summing all the trades from stock_count_rts1 (the trades that came in since
# camus_a started running)
sum_batch_a_rts2 = BashOperator(
        task_id='sum_batch_a_rts2',
        bash_command='tasks/sum_batch_rts2.sh',
        depends_on_past=True,
        dag=dag)

sum_batch_a_rts2.set_upstream(trades_batch_a)
Example #32
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'schedule_interval': timedelta(1),
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('tutorial', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml` which gets
rendered in the UI's Task Details page.
![img](http://montcs.bloomu.edu/~bobmon/Semesters/2012-01/491/import%20soul.png)
"""

dag.doc_md = __doc__

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
Example #33
        dag = DAG(dag_id, 
                  default_args=default_args,
                  schedule_interval='@once',
        )
    ############ Post-phy cleanup and merging
        make_postphy_dir_task = BashOperator(
            task_id='make_postphy_dir',
            bash_command=as_user(make_postphy_dir_cmd, USER),
            params={'postphydir': POSTPHY_DIR},
            on_success_callback = lambda c: set_perms(c['params']['postphydir'],default_args['owner']), 
            dag=dag)

        rsync_task = BashOperator(
            task_id='rsync',
            bash_command=as_user(rsync_command, USER),
            params={'postphydir': POSTPHY_DIR,
                    'mansortdir': MANSORT_DIR,
                    'mansorthost': MANSORT_HOST},
            dag=dag)

        merge_events_task = BashOperator(
            task_id='merge_events',
            bash_command=merge_events_cmd,
            env={'PATH': ANACONDA_PATH},
            params={'matfiledir': MATFILE_DIR,
                    'postphydir': POSTPHY_DIR},
            dag=dag)

        kwik2pandas_task = BashOperator(
            task_id='kwik2pandas',
            bash_command=kwik2pandas_cmd,
Example #34
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'tutorial_mod', default_args=default_args, schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)
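
# The templated_command below is a Jinja template: when rendered, it echoes the
# execution date (ds), the date seven days later (macros.ds_add(ds, 7)), and
# params.my_param, five times over.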

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id='templated',
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},
        create_disposition='CREATE_IF_NEEDED',
        skip_leading_rows=0,
        write_disposition='WRITE_TRUNCATE',  # If the table exists, overwrite it.
        max_bad_records=0)

    # Delete the Cloud Dataproc cluster.
    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        # Obviously needs to match the name of the cluster created in the prior two Operators.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # This will tear down the cluster even if there are failures in upstream tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    # Delete  gcs files in the timestamped transformed folder.
    delete_transformed_files = BashOperator(
        task_id='delete_transformed_files',
        bash_command="gsutil -m rm -r gs://" + BUCKET +
        "/{{ dag_run.conf['transformed_path'] }}/")

    # If the Spark job or the BQ load fails, rename the timestamped raw path to
    # a timestamped failed path.
    move_failed_files = BashOperator(task_id='move_failed_files',
                                     bash_command="gsutil mv gs://" + BUCKET +
                                     "/{{ dag_run.conf['raw_path'] }}/ " +
                                     "gs://" + BUCKET +
                                     "/{{ dag_run.conf['failed_path'] }}/",
                                     trigger_rule=TriggerRule.ONE_FAILED)
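    # TriggerRule.ONE_FAILED fires this task as soon as any upstream task has
    # failed, while ALL_DONE (used above) runs once all upstream tasks finish,
    # regardless of outcome.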
    # Set the dag property on the first Operator; it will be inherited by downstream Operators.

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)