Example #1
from airflow import DAG
# BashOperator
from airflow.operators.bash_operator import BashOperator
# days ago function
from airflow.utils.dates import days_ago
from datetime import datetime as dt
from datetime import timedelta
import os

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dt(2020, 12, 5),
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

dag = DAG(
    'Start_hadoop',
    description='Check if Hadoop running',
    default_args=default_args,
    schedule_interval=timedelta(days=1)
)


t1 = BashOperator(
    task_id='Start_hadoop',
    bash_command='hadoop_start',
    dag=dag)

Example #2
# NOTE: truncated excerpt - the indented lines below appear to be the tail of the
# failure callback used by t4; create_podevent, preamble and postamble are assumed
# to be defined earlier in the original file.
def failure(**context):
    print(
        'POSTAMBLE ------------------------------------------------------------------------'
    )
    create_podevent('Finishing ##PHASE## workflow for POD ##UUID##, Failed')
    create_podevent('State changed to: FAILED', level='STATUS')


t1 = PythonOperator(task_id='preamble',
                    provide_context=True,
                    python_callable=preamble,
                    dag=dag)

# Note the space at the end of the bash_command value is REQUIRED
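# (Airflow runs bash_command through Jinja templating; a value ending in .sh or .bash
# with no trailing space is treated as a path to a script template to load, and a
# missing template raises jinja2.exceptions.TemplateNotFound - the trailing space
# keeps the value an inline command.)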
t2 = BashOperator(
    task_id='maintask',
    bash_command=
    'chmod +x /workflow/##PHASE##-##WFINDEX##-##UUID##/##WFNAME##; /bin/bash /workflow/##PHASE##-##WFINDEX##-##UUID##/##WFNAME## ',
    dag=dag)

t3 = PythonOperator(task_id='postamble',
                    provide_context=True,
                    python_callable=postamble,
                    dag=dag)

t4 = PythonOperator(task_id='failure',
                    provide_context=True,
                    python_callable=failure,
                    dag=dag,
                    trigger_rule='all_failed')

t2.set_upstream(t1)
Example #3
    # if a task fails, retry it once after waiting
    # at least 5 minutes
    #'retries': 1,
    #'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'dag_cron',
    default_args=default_args,
    description='A simple tutorial DAG',
    schedule_interval="20 3 3 * *",
)

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)

t1.doc_md = """\
#### Task Documentation
You can document your task using the attributes `doc_md` (markdown),
`doc` (plain text), `doc_rst`, `doc_json`, `doc_yaml`, which get
rendered in the UI's Task Instance Details page
"""

dag.doc_md = __doc__

t2 = BashOperator(
    task_id='sleep',
    depends_on_past=False,
Example #4
import json
import pathlib

import airflow
import requests
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id="download_rocket_launches",
    start_date=airflow.utils.dates.days_ago(14),
    schedule_interval=None,
)

download_launches = BashOperator(
    task_id="download_launches",
    bash_command=
    "curl -o /tmp/launches.json 'https://launchlibrary.net/1.4/launch?next=5&mode=verbose'",
    dag=dag,
    executor_config={
        'request_memory': '128Mi',
        'limit_memory': '128Mi',
        'image': 'airflow/scipy:1.1.5'
    })


def _get_pictures():
    # Ensure directory exists
    pathlib.Path("/tmp/images").mkdir(parents=True, exist_ok=True)

    # Download all pictures in launches.json
    with open("/tmp/launches.json") as f:
        launches = json.load(f)
        image_urls = [
            launch["rocket"]["imageURL"] for launch in launches["launches"]
Example #5
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("test_branch",
          default_args=default_args,
          schedule_interval=timedelta(minutes=5),
          catchup=False)

t1 = BashOperator(
    task_id="init",
    bash_command="echo lol",
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

options = ["wowww", "wowww2"]
t2 = BranchPythonOperator(task_id='branching',
                          python_callable=lambda: random.choice(options),
                          dag=dag)

t3 = BashOperator(
    task_id="wowww",
    bash_command="echo wowwww",
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)
t4 = DummyOperator(task_id='wowww2', trigger_rule='one_success', dag=dag)
Example #6
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
}

dag = DAG(
    dag_id='aadownloaddata',
    default_args=default_args,
    description='descargadedatos',
    dagrun_timeout=timedelta(minutes=2),
    schedule_interval=timedelta(days=1),
)

CreateDir = BashOperator(task_id='create_dir',
                         depends_on_past=False,
                         bash_command='mkdir -p /tmp/airflow/p2/',
                         dag=dag)

DownloadTemperatureData = BashOperator(
    task_id='download_temperature_data',
    depends_on_past=False,
    bash_command=
    'curl -o /tmp/airflow/p2/temperature.csv.zip https://raw.githubusercontent.com/manuparra/MaterialCC2020/master/temperature.csv.zip',
    dag=dag)

DownloadHumidityData = BashOperator(
    task_id='download_humidity_data',
    depends_on_past=False,
    bash_command=
    'curl -o /tmp/airflow/p2/humidity.csv.zip https://raw.githubusercontent.com/manuparra/MaterialCC2020/master/humidity.csv.zip',
    dag=dag)
Example #7
dag = DAG(
    "extract_secmar_s3",
    default_args=default_args,
    max_active_runs=1,
    concurrency=5,
    catchup=False,
    schedule_interval="50 8 * * *",
)
dag.doc_md = __doc__

start = DummyOperator(task_id="start", dag=dag)
end = DummyOperator(task_id="end", dag=dag)

download_s3 = BashOperator(
    task_id="download_s3",
    bash_command="mkdir {base} && aws s3 sync s3://secmar {base}".format(
        base=BASE_PATH),
    dag=dag,
)
download_s3.set_downstream(start)

for table in SECMAR_TABLES + ["operations_valides"]:
    command = "awk 'NR==1{{$0=tolower($0)}} 1' {in_path} > {tmp} && mv {tmp} {out_path}".format(
        tmp="/tmp/lower_" + table,
        in_path=BASE_PATH + "/" + table + ".csv",
        out_path=in_path(table),
    )
    lowercase_header = BashOperator(task_id="lowercase_header_csv_" + table,
                                    bash_command=command,
                                    dag=dag)
    lowercase_header.set_upstream(start)
    lowercase_header.set_downstream(end)
Example #8
    "retries": 1,
    "retry_delay": timedelta(minutes=1),
    'provide_context': True,
}

dag = DAG("Stock-Exchange-V4",
          default_args=default_args,
          schedule_interval="0 13 * * 6,0-3",
          catchup=False)

task_read_stock_exchange_xlsx_file = BashOperator(
    task_id='Download-Stock-Exchange-Xlsx-File',
    bash_command=
    'curl --retry 10 --output {0} -L -H "User-Agent:Chrome/61.0" --compressed "http://members.tsetmc.com/tsev2/excel/MarketWatchPlus.aspx?d=0"'
    .format(
        path.join(
            EXCEL_FILE_PATH,
            "{0}_{1}.{2}".format(EXCEL_FILE_NAME,
                                 date.today().strftime("%Y_%m_%d"),
                                 EXCEL_FILE_EXT_XLSX))),
    dag=dag,
)

task_waiting_file_xlsx = FileSensor(
    task_id="Waiting-Excel-File",
    fs_conn_id="fs_temp",
    filepath=path.join(
        EXCEL_FILE_PATH,
        "{0}_{1}.{2}".format(EXCEL_FILE_NAME,
                             date.today().strftime("%Y_%m_%d"),
                             EXCEL_FILE_EXT_XLSX)),
    poke_interval=10,  # every 10 seconds,
Example #9
from pendulum import Pendulum

args = {
    'owner': 'Anton Kostyliev',
    'start_date': airflow.utils.dates.days_ago(2),
}

dag = DAG(
    dag_id='exercise_2',
    default_args=args,
    schedule_interval=None,
    dagrun_timeout=timedelta(minutes=60),
)

def print_execution_date(execution_date: Pendulum, **context):
    print("Execution date: " + execution_date.to_iso8601_string())

with dag:
    execution_date = PythonOperator(
        task_id="print_execution_date",
        python_callable=print_execution_date,
        provide_context=True
    )

    sleep1 = BashOperator(task_id='sleep1', bash_command="sleep 1")
    sleep5 = BashOperator(task_id='sleep5', bash_command="sleep 5")
    sleep10 = BashOperator(task_id='sleep10', bash_command="sleep 10")
    end = DummyOperator(task_id="finish_task")

execution_date >> [sleep1, sleep5, sleep10] >> end
Example #10
import airflow
from airflow.operators.bash_operator import BashOperator
from airflow.models import DAG

args = {
    'owner': 'Freddy Drennan',
    'start_date': airflow.utils.dates.days_ago(2),
    'email': ['*****@*****.**'],
    'retries': 2,
    'email_on_failure': True,
    'email_on_retry': True
}

dag = DAG(dag_id='restore_postgres_from_backup',
          default_args=args,
          schedule_interval='@daily',
          concurrency=1,
          max_active_runs=1,
          catchup=False)

task_1 = BashOperator(task_id='set_up_aws',
                      bash_command='. /home/scripts/R/shell/aws_configure',
                      dag=dag)

task_2 = BashOperator(
    task_id='get_backup_from_s3',
    bash_command='. /home/scripts/R/shell/get_backup_from_s3',
    dag=dag)

task_1 >> task_2
Example #11
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'royh',
    'start_date': datetime(2020, 5, 5),
    'depends_on_past': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup_by_default': False,
    'email_on_retry': False
}

dag = DAG('seo_rankings',
          default_args=default_args,
          schedule_interval='@daily')

t0 = BashOperator(task_id='where_am_i',
                  bash_command='pwd && cd ~ && ls',
                  dag=dag)

t1 = BashOperator(task_id='get_coaster_rankings',
                  bash_command='cd ~ && cd dags && python etsy_coaster_seo.py',
                  dag=dag)

t2 = BashOperator(
    task_id='get_luggage_rankings',
    bash_command='cd ~ && cd dags && python etsy_luggagetag_seo.py',
    dag=dag)

t0 >> t1 >> t2
Example #12
    'start_date': datetime(2021, 3, 16, 18),  # year, month, day and hour
    'email': ['*****@*****.**', '*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1)
}

# Let's define the DAG - the flow
dag = DAG("treino-04",
          description='Paralelismo',
          default_args=default_args,
          schedule_interval="*/10 * * * *")

start_processing = BashOperator(task_id='start-processing',
                                bash_command='echo Start Preprocessing! Vai!',
                                dag=dag)

# Save the zip under the name that unzip_file() below expects
get_data = BashOperator(
    task_id='get-data',
    bash_command=
    'curl http://download.inep.gov.br/microdados/Enade_Microdados/microdados_enade_2019.zip -o /usr/local/airflow/data/microdados_enade_2019.zip',
    dag=dag)


def unzip_file():
    with zipfile.ZipFile('/usr/local/airflow/data/microdados_enade_2019.zip',
                         'r') as zipped:
        zipped.extractall('/usr/local/airflow/data/')

Example #13
https://github.com/apache/airflow/blob/master/airflow/example_dags/tutorial.py
"""

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime(2020, 2, 4),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('test_pipeline',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

t1 = BashOperator(
    task_id='execute_casa',
    bash_command='casa --nologger --nogui -c ./src/casa_script.py',
    dag=dag)
Example #14
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 11, 6),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
    #"access_control": {"role1": {"can_dag_read", "can_dag_edit"}}
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("rbac_test_2",
          default_args=default_args,
          schedule_interval=timedelta(minutes=1),
          catchup=False,
          access_control={'role1': ['can_dag_read']})

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

t3 = BashOperator(task_id="sleep2", bash_command="sleep 5", retries=3, dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
Example #15
            log_url=context.get('task_instance').log_url,
        )
    failed_alert = SlackWebhookOperator(
        task_id='slack_test',
        http_conn_id='slack',
        webhook_token=slack_webhook_token,
        message=slack_msg,
        username='******',
        )
    return failed_alert.execute(context=context)

default_args = {'owner': 'rdumas',
                'depends_on_past': False,
                'start_date': datetime(2019, 11, 22),
                'email': ['*****@*****.**'],
                'email_on_failure': False,
                'email_on_success': False,
                'retries': 0,
                'retry_delay': timedelta(minutes=5),
                'on_failure_callback': task_fail_slack_alert
                }

dag = DAG('pull_miovision',default_args=default_args, schedule_interval='0 3 * * *')
# Add 3 hours to ensure that the data are at least 2 hours old

t1 = BashOperator(
        task_id = 'pull_miovision',
        bash_command = '/etc/airflow/data_scripts/.venv/bin/python3 /etc/airflow/data_scripts/volumes/miovision/api/intersection_tmc.py run-api --path /etc/airflow/data_scripts/volumes/miovision/api/config.cfg --dupes', 
        retries = 0,
        dag=dag)
Example #16
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'ats_daily', default_args=default_args,
    schedule_interval='0 0 * * *',
    catchup=False,
    max_active_runs=1)

python_executable = '~/venv/bin/python3.7'
python_script_path = '~/PycharmProjects/TwitterStats'
# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='words_trends',
    bash_command='cd {};{} words.py trends'.format(python_script_path, python_executable),
    dag=dag)

t2 = BashOperator(
    task_id='tweeter_promotion',
    bash_command='cd {};{} tweeter_promotion.py'.format(python_script_path, python_executable),
    dag=dag)

hu = BashOperator(
    task_id='hashtag_update',
    bash_command='cd {};{} hashtag_update.py'.format(python_script_path, python_executable),
    dag=dag)

t4 = BashOperator(
    task_id='findbots',
    bash_command='cd {};{} findbots.py {{{{ ds }}}}'.format(python_script_path, python_executable),
Example #17
    'retry_delay': timedelta(minutes=1)
}

# DAG definition - the flow
dag = DAG(
    "treino-03",
    description=
    "Pega dados do Titanic e calcula idade média para homens ou mulheres",
    default_args=default_args,
    schedule_interval=timedelta(minutes=2)
    #schedule_interval="*/2 * * * *"
)

get_data = BashOperator(
    task_id='get_data',
    bash_command=
    'curl https://raw.githubusercontent.com/A3Data/hermione/master/hermione/file_text/train.csv -o /usr/local/airflow/data/train.csv',
    dag=dag)


def sorteia_h_m():
    return random.choice(['male', 'female'])


escolhe_h_m = PythonOperator(task_id='escolhe-h-m',
                             python_callable=sorteia_h_m,
                             dag=dag)


def Mouf(**context):
    value = context['task_instance'].xcom_pull(task_ids='escolhe-h-m')
Example #18
    "start_date": datetime(2019, 1, 1),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG("random-fail",
          default_args=default_args,
          schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="task1", bash_command="echo task1", dag=dag)

t2 = BashOperator(
    task_id="task2",
    bash_command=
    "rnd=$RANDOM; echo $rnd; if [ $rnd -lt 20000 ]; then exit 1; fi",
    retries=0,
    dag=dag)

t3 = BashOperator(task_id="task3", bash_command="echo task3", dag=dag)

t1 >> t2
t2 >> t3
Example #19
    'owner': 'pardha',
    'depends_on_past': False,
    'start_date': datetime(2017, 5, 11),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('HelloWorld', default_args=default_args)

# t1, t2, t3 and t4 are examples of tasks created using operators

t1 = BashOperator(
    task_id='task_1',
    bash_command='echo "Hello World from Task pardha"',
    dag=dag)

t2 = BashOperator(
    task_id='task_2',
    bash_command='echo "Hello World from Task 2"',
    dag=dag)

t3 = BashOperator(
    task_id='task_3',
    bash_command='echo "Hello World from Task 3"',
    dag=dag)

t4 = BashOperator(
    task_id='task_4',
    bash_command='echo "Hello World from Task 4"',
Example #20
    xcom_push=True,
    dag=dag,
)

callSjson = SimpleHttpOperator(
    task_id="pass_JSON",
    method='POST',
    endpoint='/payload',
    data=json.dumps({"channel": "UK"}),
    headers={"Content-Type": "application/json"},
    http_conn_id='cloud_run_gcp_flask',
    xcom_push=True,
    dag=dag,
)

echoS = BashOperator(
    task_id="echo_sucess",
    bash_command="echo sucess",
    dag=dag,
)

echoF = BashOperator(
    task_id="echo_failure",
    bash_command="echo failure",
    dag=dag,
)

echoS.set_upstream(callS1)
echoS.set_upstream(callS2)
echoF.set_upstream(callF1)
Example #21
    "retry_delay": timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    "tutorial",
    default_args=default_args,
    schedule_interval=timedelta(days=1),
    catchup=False,
)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
Example #22
default_args = {
    'owner': 'user',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'adhoc':False,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG('tutorial',
          default_args=default_args,
          description='my first DAG',
          schedule_interval='10 * * * * *')

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
Example #23
        except (IOError, psycopg2.Error) as error:
            print("Error inserting dataframe to DB! ", error)
    else:
        print("Data already exists in the DB! Skipping...")


srcDir = '/home/ubuntu/fault-tolerant-airflow/src/spark/'

# Command to run remote spark batch processing
cmd = 'ssh [email protected] spark-submit' + ' ' + srcDir + 'PDS.py --master ec2-18-235-191-19.compute-1.amazonaws.com --deploy-mode=cluster'

objectKey = 's3n://de-yk-bucket/PDS/XETR/DailyAverages/' + str(todays_date_str) + '.csv'


# Bash operator that synchronizes the Deutsche XETR Public Dataset with my bucket stored in S3
s3_ingest_opr = BashOperator(task_id='s3_ingest', bash_command='aws s3 sync s3://deutsche-boerse-xetra-pds s3://de-yk-bucket/PDS/XETR/ ', dag=dag)

# Remote batch processing operator that calculates the daily averages of stock prices
spark_batch_opr = BashOperator(task_id='spark_batch', bash_command=cmd, dag=dag)

# S3 file sensor operator that senses the temporarily created csv file in S3
s3_file_sensor_opr = S3KeySensor(
    task_id='s3_file_sensor',
    poke_interval=60,
    timeout=10,
    soft_fail=True,
    bucket_key=objectKey,
    bucket_name=None,
    dag=dag)

# Store to DB operator that stores the calculated daily average prices in PostgreSQL
Example #24
    'start_date': datetime(2015, 6, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('test', default_args=default_args, schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='echo_cal', bash_command='cal', dag=dag)

t2 = BashOperator(task_id='sleep',
                  bash_command='sleep 5 && echo "Hello World"',
                  retries=3,
                  dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(task_id='templated',
Example #25
          default_args=args,
          schedule_interval='@daily')

d1 = PythonOperator(task_id='task1',
                    provide_context=True,
                    python_callable=tt1,
                    op_kwargs={'param': 'apple'},
                    dag=dag)
d2 = PythonOperator(task_id='task2',
                    provide_context=True,
                    python_callable=tt2,
                    op_kwargs={'param': 'apple'},
                    dag=dag)

d3 = BashOperator(
    task_id='task3',
    bash_command='source /Users/jigeonho/airflow/venv/bin/activate',
    dag=dag)

d5 = BashOperator(
    task_id='task5',
    bash_command='/bin/bash /Users/jigeonho/airflow/function/runvenv.sh ',
    dag=dag)

# d4  = PythonOperator(task_id='task4',
#                     provide_context=True,
#                     python_callable=pdf_main,
#                     # op_kwargs={'param': 'apple'},
#                     dag=dag)

d1 >> d2 >> d3
Example #26
from airflow.contrib.operators.dataflow_operator import DataFlowJavaOperator
from airflow.operators.bash_operator import BashOperator
from airflow import DAG
from datetime import datetime, timedelta

dataflow_dag = DAG(dag_id="dataflow_pipeline",
                   start_date=datetime(2017, 2, 2),
                   schedule_interval=timedelta(seconds=15),
                   max_active_runs=1,
                   catchup=True)

print_path_task = BashOperator(dag=dataflow_dag,
                               bash_command="pwd",
                               task_id="test_upstream_task_pwd")

jar_task = DataFlowJavaOperator(dag=dataflow_dag,
                                jar="/home/airflow/gcs/dags/"
                                "jar/dataflow_pipeline-bundled-1.0.jar",
                                options={
                                    "project":
                                    "hybrid-elysium-118418",
                                    "stagingLocation":
                                    "gs://hybrid-elysium-118418/dataflow/"
                                },
                                task_id="dataflow_pipeline")

print_path_task.set_downstream(jar_task)
Example #27
    dag_id="atd_visionzero_reassociate_missing_locations_staging",
    description="This script re-processes location associations in VZD",
    default_args=args,
    schedule_interval="0 3 * * *",
    dagrun_timeout=timedelta(minutes=60),
    tags=["staging", "visionzero"],
)

#
# This process will find the locations for CR3 crashes that do not have one but
# fall into a location and they are not mainlanes.
#
process_cr3 = BashOperator(
    task_id="process_cr3",
    bash_command=
    "python3 ~/dags/python_scripts/atd_vzd_update_cr3_locations.py",
    env=environment_vars,
    dag=dag,
)

#
# This process will find the locations for Non-CR3 crashes that do not have one but
# fall into a location and they are not mainlanes.
#
process_noncr3 = BashOperator(
    task_id="process_noncr3",
    bash_command=
    "python3 ~/dags/python_scripts/atd_vzd_update_noncr3_locations.py",
    env=environment_vars,
    dag=dag,
)
Example #28
  {{ params.base_path }}/{{ params.filename }} \
  {{ params.base_path }}
"""
pyspark_date_bash_command = """
spark-submit --master {{ params.master }} \
  {{ params.base_path }}/{{ params.filename }} \
  {{ ts }} {{ params.base_path }}
"""

# Gather the training data for our classifier
extract_features_operator = BashOperator(task_id="pyspark_extract_features",
                                         bash_command=pyspark_bash_command,
                                         params={
                                             "master":
                                             "local[8]",
                                             "filename":
                                             "ch08/extract_features.py",
                                             "base_path":
                                             "{}/".format(PROJECT_HOME)
                                         },
                                         dag=training_dag)

# Train and persist the classifier model
train_classifier_model_operator = BashOperator(
    task_id="pyspark_train_classifier_model",
    bash_command=pyspark_bash_command,
    params={
        "master": "local[8]",
        "filename": "ch08/train_spark_mllib_model.py",
        "base_path": "{}/".format(PROJECT_HOME)
    },
Example #29
    'start_date': datetime(2020,1,30),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag: DAG = DAG('project_dag',
               default_args=default_args,
               schedule_interval=timedelta(hours=1))

# operators (executors) for each task we must run named according to the BigData assignment numbers.
project_2 = BashOperator( task_id='project_2',
                          bash_command='/home/jeff/BigData/002-NetCat/run_howto.sh ',
                          dag=dag
                        )

project_3 = BashOperator( task_id='project_3',
                          bash_command='/home/jeff/BigData/003-SkillsTest/run.sh ',
                          dag=dag
                        )

project_4 = BashOperator( task_id='project_4',
                          bash_command='/home/jeff/BigData/004-WordCount/python/run ',
                          dag=dag
                        )

project_8 = BashOperator( task_id='project_8',
                          bash_command='/home/jeff/BigData/008-WordCountInScala/run ',
                          dag=dag
Example #30
	cnopts.hostkeys = None
    
	#3. Create a pysftp connection with host="35.222.158.208", username="******", password="******",cnopts = cnopts
	
	json_remote_file = "comments."+str(time.time())+".json"
	
	#4. Do a sftp with following parameters
	# Local file (comments.json)
	# Remote file path "Upload/YourName/json_remote_file"
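	# A minimal sketch of steps 3-4 (the placeholder credentials below are hypothetical;
	# the real username/password are redacted above). pysftp.Connection() opens the
	# connection and put() uploads a local file to a remote path:
	#   srv = pysftp.Connection(host="35.222.158.208", username="your_username",
	#                           password="your_password", cnopts=cnopts)
	#   srv.put("comments.json", "Upload/YourName/" + json_remote_file)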

	# Closes the connection
	srv.close()




#5. Add your name to the DAG name below
with DAG('airflow_sftp_YourName', default_args=default_args, schedule_interval='*/3 * * * *',) as dag:

	
	download_json_task = PythonOperator(task_id='Get_comments', python_callable=download_json_comments)
	
	print_json_ok = BashOperator(task_id='Download_check', bash_command='echo "Json comments file downloaded successfully"')

	upload_json_sftp_task = PythonOperator(task_id='Upload_comments_sftp', python_callable=upload_sftp)
	
	print_upload_ok = BashOperator(task_id='Upload_check', bash_command='echo "Json comments file uploaded successfully"')


#6. Create workflow DAG with the following sequence: download_json_task, print_json_ok, upload_json_sftp_task, print_upload_ok
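# A minimal sketch of the sequence described in step 6, chaining the tasks defined
# in the DAG block above with Airflow's >> dependency operator:
download_json_task >> print_json_ok >> upload_json_sftp_task >> print_upload_ok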