Example 1
    'start_date': datetime.datetime(2020, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("msisensor",
          default_args=default_args,
          schedule_interval=None,
          concurrency=10000,
          max_active_runs=2000)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

msisensor_task = PythonOperator(task_id='msisensor',
                                python_callable=run_msisensor,
                                provide_context=True,
                                dag=dag)

msisensor_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
Example 2
from datetime import datetime


dag = DAG(
	dag_id = 'my_first_dag',
	start_date = datetime(2020,10,31),
	schedule_interval = '0 2 * * *')

def print_hello():
	print("hello!")
	return "hello!"

def print_goodbye():
	print("goodbye!")
	return "goodbye!"

print_hello = PythonOperator(
	task_id = 'print_hello',
	#python_callable param points to the function you want to run 
	python_callable = print_hello,
	#dag param points to the DAG that this task is a part of
	dag = dag)

print_goodbye = PythonOperator(
	task_id = 'print_goodbye',
	python_callable = print_goodbye,
	dag = dag)

#Assign the order of the tasks in our DAG
print_hello >> print_goodbye
Example 3
dag = DAG(
    dag_id = 'week5_2',
    start_date = datetime(2020,11,8), # will not run if the start date is in the future
    schedule_interval = '@once',  # adjust as needed
    max_active_runs = 1,
    concurrency=2,
    catchup=False
)

dir_path = os.path.dirname(os.path.realpath(__file__))
print(dir_path)
print(os.getcwd())

prev_task = PythonOperator(
    task_id = 'create_tables',
    python_callable = create_tables,
    dag = dag)

for table in tables:
    s3_key=s3_key_prefix+'/'+table+'.tsv'

    postgrestos3 = PostgresToS3Operator(
        table="public."+table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        data_dir=local_dir,
        dag=dag,
        task_id="Postgres_to_S3"+"_"+table
    )

    s3toredshift = S3ToRedshiftOperator(
Example 4
  'email_on_retry': False
}


# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)


get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)


set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def connect_to_monary_and_print_aggregation(ds, **kwargs):
    m = Monary()
    pipeline = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
    states, population = m.aggregate("zips", "data", pipeline, ["_id", "totPop"], ["string:2", "int64"])
    strs = list(map(lambda x: x.decode("utf-8"), states))
    result = list("%s: %d" % (state, pop) for (state, pop) in zip(strs, population))
    print(result)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_monary_and_print_aggregation',
    provide_context=True,
    python_callable=connect_to_monary_and_print_aggregation,
    dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': i},
        dag=dag)
    task.set_upstream(run_this)
Example 6
                                      auth=basic_auth("neo4j", "password"))
        session = driver.session()
        result = session.run(query_code, inputs)
        # ret = {x:[] for x in outputs}
        try:
            for record in result:
                print(record)
                for out in output_vars:
                    if (isinstance(record[out], bytes)):
                        ret[out].append(record[out].decode("utf-8"))
                    else:
                        ret[out].append(record[out])
        except Exception:
            print("Came into except")
    if (query_type == "mongoDB"):
        if (query_code == "mp_ht_in_total"):
            print(inputs["num"])
            ret = mongoQuery.mp_ht_in_total(limit=inputs["num"])
    print(ret)
    for k, v in ret.items():
        context['task_instance'].xcom_push(k, v)
    print("========================================")
    return ret


task_0 = PythonOperator(task_id='node_{}'.format("n1"),
                        python_callable=execute_query,
                        op_kwargs={'node_name': "n1"},
                        provide_context=True,
                        dag=dag)
Example 7
}

dag = DAG("delly", default_args=default_args,
          schedule_interval=None, concurrency=10000, max_active_runs=2000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)


validate_sample_task = PythonOperator(
    task_id="validate_sample",
    python_callable=validate_sample,
    provide_context=True,
    dag=dag)

validate_sample_task.set_upstream(start_analysis_run_task)

delly_task = PythonOperator(
    task_id="delly_genotype",
    python_callable=run_delly,
    provide_context=True,
    dag=dag)

delly_task.set_upstream(validate_sample_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'example_twitter_dag', default_args=default_args,
    schedule_interval="@daily")

# --------------------------------------------------------------------------------
# This task should call the Twitter API and retrieve yesterday's tweets sent from
# and to the four twitter users (Twitter_A, ..., Twitter_D). There should be eight
# csv output files generated by this task, and the naming convention is
# direction(from or to)_twitterHandle_date.csv
# --------------------------------------------------------------------------------
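# A minimal sketch of what a `fetchtweets` callable following that convention
# could look like; `get_tweets_for_handle` is a hypothetical helper standing in
# for the actual Twitter API call, and the handle list is only illustrative.
import csv
from datetime import date, timedelta


def fetchtweets():
    yesterday = date.today() - timedelta(days=1)
    for handle in ('Twitter_A', 'Twitter_B', 'Twitter_C', 'Twitter_D'):
        for direction in ('from', 'to'):
            rows = get_tweets_for_handle(direction, handle, yesterday)  # hypothetical helper
            filename = '{}_{}_{}.csv'.format(direction, handle, yesterday.isoformat())
            with open(filename, 'w', newline='') as f:
                w = csv.writer(f)
                w.writerow(['created_at', 'user', 'text'])  # assumed columns
                w.writerows(rows)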

fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can drop or cherry-pick columns and
# different parts of the text.
# --------------------------------------------------------------------------------
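# A minimal pandas sketch of a `cleantweets` step; the glob pattern and the
# cherry-picked columns ('created_at', 'text') are assumptions about the files
# produced by the previous task.
import glob
import pandas as pd


def cleantweets():
    for path in glob.glob('*_Twitter_*.csv'):
        tweets = pd.read_csv(path)
        cleaned = tweets[['created_at', 'text']].dropna()  # keep only the columns we need
        cleaned.to_csv(path, index=False)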

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag)

clean_tweets.set_upstream(fetch_tweets)

# --------------------------------------------------------------------------------
Example 9
    print("second")


def third():
    print("third")


with DAG('demo_1',
         description='123',
         schedule_interval=None,
         start_date=datetime(2018, 1, 1),
         catchup=False) as dag:

    first = PythonOperator(
        task_id='first',
        python_callable=first,
        dag=dag,
    )

    second = PythonOperator(
        task_id='second',
        python_callable=second,
        dag=dag,
    )

    third = PythonOperator(
        task_id='third',
        python_callable=third,
        dag=dag,
    )
Example 10
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = PythonOperator(task_id='check_trips_data',
                             dag=dag,
                             python_callable=check_greater_than_zero,
                             provide_context=True,
                             params={
                                 'table': 'trips',
                             })

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline', dag=dag)

pg_unload = PostgresOperator(
    dag=dag,
    task_id='pg_unload',
    sql=unload_user_purchase,
    postgres_conn_id='postgres_default',
    params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
    depends_on_past=True,
    wait_for_downstream=True)

user_purchase_to_s3_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_s3_stage',
    python_callable=_local_to_s3,
    op_kwargs={
        'filename': temp_filtered_user_purchase,
        'key': temp_filtered_user_purchase_key,
    },
)

remove_local_user_purchase_file = PythonOperator(
    dag=dag,
    task_id='remove_local_user_purchase_file',
    python_callable=remove_local_file,
    op_kwargs={
        'filelocation': temp_filtered_user_purchase,
    },
)

movie_review_to_s3_stage = PythonOperator(
    fetch_logs=True,
    tags='airflow_example_run',
    dag=dag)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://dev.canopydata.com/airflow/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

""" Simple subdag example """
from airflow import DAG
from airflow.operators import PythonOperator
from twitter_airflow import csv_to_sqlite, identify_popular_links
from datetime import datetime, timedelta


default_args = {
    'owner': 'admin',
    'depends_on_past': False,
    'start_date': datetime(2016, 1, 1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

subdag = DAG('generate_twitter_dags.insert_and_id_pop',
             default_args=default_args)

move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite',
                                       provide_context=True,
                                       python_callable=csv_to_sqlite,
                                       dag=subdag)

id_popular = PythonOperator(task_id='identify_popular_links',
                            provide_context=True,
                            python_callable=identify_popular_links,
                            dag=subdag,
                            params={'write_mode': 'a'})

id_popular.set_upstream(move_tweets_to_sqlite)
Example 14
}

dag = DAG("freebayes", default_args=default_args,
          schedule_interval=None, concurrency=10000, max_active_runs=2000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)


validate_sample_task = PythonOperator(
    task_id="validate_sample",
    python_callable=validate_sample,
    provide_context=True,
    dag=dag)

validate_sample_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

for contig_name in tracker.util.workflow_common.CONTIG_NAMES:
    freebayes_task = PythonOperator(
        task_id="freebayes_" + contig_name,
        python_callable=run_freebayes,
    message = f'''Report on ad 121288 for April 2
    Spend: {df.Total_cost[1]} rubles ({Total_cost_diff} %)
    Impressions: {df.view[1]} ({view_diff} %)
    Clicks: {df.click[1]} ({click_diff} %)
    CTR: {df.CTR[1]} ({CTR_diff} %)'''

    token = '66117ac9424a6b67d404d24a1cb0fcfec6a150abeb21fb62b1edea6ddda943c35975ca53f7084347b094c'

    vk_session = vk_api.VkApi(token=token)
    vk = vk_session.get_api()

    vk.messages.send(user_id='7768141', random_id=2, message=message)


t1 = PythonOperator(task_id='send_vk_report_task',
                    python_callable=send_vk_report,
                    dag=dag)
'''
1. Read the csv file located at the link below using the pandas library

https://docs.google.com/spreadsheets/d/e/2PACX-1vR-ti6Su94955DZ4Tky8EbwifpgZf_dTjpBdiVH0Ukhsq94jZdqoHuUytZsFZKfwpXEUCKRFteJRc9P/pub?gid=889004448&single=true&output=csv
2. In the data you will find information about the events that happened to ad 121288 over two days. Calculate the following metrics for each day:

number of impressions
number of clicks
CTR
amount of money spent
That is, for each metric you should get two numbers - one for 2019-04-01 and one for 2019-04-02

The amount of money spent can be calculated with the following formula: divide the value in the ad_cost column by 1000 and multiply it by the number of ad impressions (a worked sketch follows)
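A minimal pandas sketch of the metrics described above; the column names date, event ('view'/'click') and ad_cost describe an assumed CSV layout, not one given by the exercise:

import pandas as pd

CSV_URL = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vR-ti6Su94955DZ4Tky8EbwifpgZf_dTjpBdiVH0Ukhsq94jZdqoHuUytZsFZKfwpXEUCKRFteJRc9P/pub?gid=889004448&single=true&output=csv'
df = pd.read_csv(CSV_URL)

# Assumed layout: one row per event with columns 'date', 'event' and 'ad_cost'.
viewed = df[df.event == 'view']
views = viewed.groupby('date').size()                    # impressions per day
clicks = df[df.event == 'click'].groupby('date').size()  # clicks per day
ctr = clicks / views                                     # CTR per day
# ad_cost / 1000 summed over view rows == ad_cost / 1000 * number of impressions
spend = viewed.groupby('date').ad_cost.sum() / 1000

print(pd.DataFrame({'views': views, 'clicks': clicks, 'CTR': ctr, 'spend': spend}))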
    # Task 3 git clone project
    bash_git = BashOperator(
        task_id='bash_git',
        bash_command='pushd /home/admin/gta_scripts && ./git_clone.sh '
                     '{{ dag_run.conf["git_dire"] }} {{ dag_run.conf["git_url"] }} ')

    # Task 4 change branch of git bash_ch_branch=BashOperator(task_id='bash_ch_branch',bash_command='pushd
    # /home/admin/gta_scripts && ./change_branch.sh ' '{{dag_run.conf["pro_dire"]}} {{dag_run.conf["git_branch"]}} ')

    # Task 5 Give status of Git clone git_status=BashOperator(task_id='git_status',bash_command='pushd
    # /home/admin/gta_scripts && python db_status_git.py -i ' '{{dag_run.conf["user_id"]}} {{dag_run.conf[
    # "state_git"]}} {{dag_run.conf["status_git"]}}')

    git_status = PythonOperator(task_id='git_status',
                                provide_context=True,
                                python_callable=write_db_git)

    # Task 6 ping the server
    ping_ser = BashOperator(
        task_id='ping_ser',
        bash_command='pushd /home/admin/gta_scripts && ./ping.sh ')

    # Task 7 params update params_update=BashOperator(task_id='params_update',bash_command='pushd
    # /home/admin/gta_scripts && python sutas_params_update.py -i ' '{{dag_run.conf["raisebugs"]}} {{dag_run.conf[
    # "jiraenv"]}} {{dag_run.conf["loglevel"]}} {{dag_run.conf["slack"]}} {{dag_run.conf["emailnotifications"]}} {{
    # dag_run.conf["symmetrickey"]}} {{dag_run.conf["teamsnotifications"]}} {{dag_run.conf["consolidatedmail"]}} {{
    # dag_run.conf["enabledatebase"]}} {{dag_run.conf["enabletestmanagement"]}} {{dag_run.conf[
    # "enablepushtestartifacts"]}} -d {{dag_run.conf["ref_path"]}} ')

    docker_params_update = PythonOperator(task_id='docker_params_update',
dag6_task1 = DummyOperator(
    task_id='test_depends_on_past',
    depends_on_past=True,
    dag=dag6,)
dag6_task2 = DummyOperator(
    task_id='test_depends_on_past_2',
    depends_on_past=True,
    dag=dag6,)
dag6_task2.set_upstream(dag6_task1)


# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
Example 18
def process_utilization_kpi(
        parent_dag_name, child_dag_name, start_date, schedule_interval,
        celery_queue, ss_tech_sites, hostnames_ss_per_site, ss_name,
        utilization_attributes, config_sites
):  # here config_sites is the list of all sites in the system_config var

    utilization_kpi_subdag_dag = DAG(
        dag_id="%s.%s" % (parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        start_date=start_date,
    )
    for service in utilization_attributes:
        sv_to_ds_mapping[service.get("service_name")] = {
            "data_source": service.get("data_source"),
            "sector_type": service.get("sector_type")
        }

    def get_calculated_ss_data():
        ss_data = redis_hook_util_10.rget("calculated_ss_utilization_kpi")
        combined_site_data = {}
        for site_data in ss_data:
            site_data = eval(site_data)
            combined_site_data.update(site_data)

        return combined_site_data

    #To create SS dict
    def format_data(**kwargs):

        device_type = kwargs.get("params").get("technology")
        utilization_attributes = kwargs.get("params").get("attributes")
        machine_name = kwargs.get("params").get("machine_name")
        ss_kpi_dict = {
            'site_name': 'unknown',
            'device_name': 'unknown',
            'service_name': 'unknown',
            'ip_address': 'unknown',
            'severity': 'unknown',
            'age': 'unknown',
            'data_source': 'unknown',
            'current_value': 'unknown',
            'warning_threshold': 'unknown',
            'critical_threshold': 'unknown',
            'check_timestamp': 'unknown',
            'sys_timestamp': 'unknown',
            'refer': 'unknown',
            'min_value': 'unknown',
            'max_value': 'unknown',
            'avg_value': 'unknown',
            'machine_name': 'unknown'
        }

        ss_data = redis_hook_util_10.rget("calculated_utilization_%s_%s" %
                                          (device_type, machine_name))
        cur_processing_time = backtrack_x_min(
            time.time(), 300
        ) + 120  # rewind the time to the previous multiple of 5 minutes so that the KPI can be shown accordingly
        ss_devices_list = []
        for ss_device in ss_data:
            ss_device = eval(ss_device)
            hostname = ss_device.get('hostname')

            for service in ss_device.get('services'):

                data_source = sv_to_ds_mapping.get(service).get("data_source")
                pmp_type = sv_to_ds_mapping.get(service).get("sector_type")
                thresholds = get_severity_values(service)
                ss_kpi_dict['critical_threshold'] = thresholds[0]
                ss_kpi_dict['data_source'] = data_source
                ss_kpi_dict['site_name'] = ss_device.get('site')
                #TODO: ok and unknown are the only 2 severities for SS; we can include this in rules later
                ss_kpi_dict['service_name'] = service

                ss_kpi_dict['machine_name'] = machine_name
                ss_kpi_dict['check_timestamp'] = cur_processing_time
                ss_kpi_dict['device_name'] = ss_device.get('hostname')
                ss_kpi_dict['sys_timestamp'] = cur_processing_time
                ss_kpi_dict['refer'] = ss_device.get("%s_sector" % (pmp_type))
                ss_kpi_dict['ip_address'] = ss_device.get('ipaddress')
                ss_kpi_dict['warning_threshold'] = thresholds[1]

                if not isinstance(ss_device.get(service), dict):
                    # cap cur_value if it is greater than 100
                    cur_value = ss_device.get(service)
                    if cur_value is not None:
                        try:
                            if isinstance(cur_value, float) and cur_value > 100.00:
                                cur_value = 100
                        except Exception:
                            logging.error(
                                "Exception while handling above 100 entries")

                    ss_kpi_dict['severity'] = calculate_severity(
                        service, ss_device.get(service))
                    ss_kpi_dict['age'] = calculate_age(
                        hostname, ss_kpi_dict['severity'],
                        ss_device.get('device_type'), cur_processing_time,
                        service)
                    ss_kpi_dict['current_value'] = cur_value
                    ss_kpi_dict['avg_value'] = cur_value
                    ss_kpi_dict['min_value'] = cur_value
                    ss_kpi_dict['max_value'] = cur_value

                    if ss_kpi_dict['current_value'] != None:
                        ss_devices_list.append(ss_kpi_dict.copy())
                else:
                    for data_source in ss_device.get(service):
                        ds_values = ss_device.get(service)
                        curr_value = ss_device.get(service).get(data_source)
                        if isinstance(curr_value, str):
                            try:
                                curr_value = float(curr_value)
                                if isinstance(curr_value, float):
                                    if curr_value > 100.00:
                                        curr_value = 100
                            except Exception:
                                logging.error("Unable to convert to float")
                        else:
                            if curr_value > 100.00:
                                curr_value = 100

                        ss_kpi_dict['data_source'] = data_source
                        ss_kpi_dict['severity'] = calculate_severity(
                            service, ds_values.get(data_source))
                        ss_kpi_dict['age'] = calculate_age(
                            hostname, ss_kpi_dict['severity'],
                            ss_device.get('device_type'), cur_processing_time,
                            service)
                        ss_kpi_dict['current_value'] = curr_value
                        ss_kpi_dict['avg_value'] = curr_value
                        ss_kpi_dict['min_value'] = curr_value
                        ss_kpi_dict['max_value'] = curr_value
                        if ss_kpi_dict['current_value'] != None:
                            ss_devices_list.append(ss_kpi_dict.copy())

        try:

            if len(ss_devices_list) > 0:
                redis_hook_util_10.rpush(
                    "formatted_util_%s_%s" % (device_type, machine_name),
                    ss_devices_list)
            else:
                logging.info("No %s device found in %s after formatting " %
                             (device_type, machine_name))
        except Exception:
            logging.error("Unable to push formatted SS data to redis")

    def get_required_data_ss(**kwargs):
        site_name = kwargs.get("params").get("site_name")
        device_type = kwargs.get("params").get("technology")
        utilization_attributes = kwargs.get("params").get("attributes")
        if "vrfprv" in site_name:
            memc_con = vrfprv_memc_con

        elif "pub" in site_name:
            memc_con = pub_memc_con
        else:
            memc_con = memc_con_cluster

        ss_data_dict = {}
        all_ss_data = []
        if site_name not in hostnames_ss_per_site.keys():
            logging.warning("No SS devices found for %s" % (site_name))
            return 1

        for hostnames_dict in hostnames_ss_per_site.get(site_name):
            host_name = hostnames_dict.get("hostname")
            ip_address = hostnames_dict.get("ip_address")
            ss_data_dict['hostname'] = host_name
            ss_data_dict['ipaddress'] = ip_address
            ss_data_dict['site_name'] = site_name

            if host_name not in down_and_unresponsive_devices:
                for service in utilization_attributes:
                    ss_data_dict[service.get('service_name')] = memc_con.get(
                        service.get('utilization_key') % (host_name))

                all_ss_data.append(ss_data_dict.copy())

        if len(all_ss_data) == 0:
            logging.info("No data Fetched ! Aborting Successfully")
            return 0
        try:

            #redis_hook_util_10.rpush("%s_%s"%(device_type,site_name),all_ss_data)
            print "++++++++++++"
            print site_name.split("_")[0]
            redis_hook_util_10.rpush(
                "%s_%s" % (device_type, site_name.split("_")[0]), all_ss_data)
        except Exception:
            logging.warning("Unable to insert ss data into redis")

        #pprint(all_ss_data)

    def calculate_utilization_data_ss(**kwargs):

        machine_name = kwargs.get("params").get("machine_name")
        device_type = kwargs.get("params").get("technology")
        utilization_attributes = kwargs.get("params").get("attributes")

        devices_data_dict = redis_hook_util_10.rget(
            "%s_%s" % (device_type, machine_name))
        if len(devices_data_dict) == 0:
            logging.info("No Data found for ss %s " % (machine_name))
            return 1

        ss_data = []
        for devices in devices_data_dict:

            devices = eval(devices)
            site_name = devices.get("site_name")
            devices['site'] = site_name
            devices['device_type'] = device_type

            for service_attributes in utilization_attributes:  # loop over all the configured services
                service = service_attributes.get('service_name')

                if service_attributes.get('isKpi'):
                    if 'services' in devices and devices.get('services') is not None:
                        devices['services'].append(service)
                    elif service and devices.get('services') is None:
                        devices['services'] = [service]
                    else:
                        devices['services'] = []

                if service_attributes.get('isKpi'):
                    utilization_type = service_attributes.get(
                        "utilization_type")
                    capacity = None
                    if "capacity" in service_attributes.keys():
                        capacity = service_attributes.get("capacity")
                    try:
                        formula = kpi_rules.get(service).get('formula')

                        devices[service] = eval(formula)

                    except Exception:
                        print "Exception in calculating data"
                        pass
                else:
                    continue

            #ip_ul_mapper[devices.get('ipaddress')] = devices
            ss_data.append(devices.copy())

        #ss_utilization_list.append(ip_ul_mapper.copy())
        key = "calculated_utilization_%s_%s" % (device_type, machine_name)
        redis_hook_util_10.rpush(key, ss_data)
        print "Setting ....."
        print "calculated_utilization_%s_%s" % (device_type, machine_name)
        #redis_hook_util_10.rpush("calculated_ss_utilization_kpi",ss_utilization_list)

    def aggregate_utilization_data(*args, **kwargs):
        print "Aggregating Data"
        machine_name = kwargs.get("params").get("machine_name")
        device_type = kwargs.get("params").get("technology")

        #device_type = kwargs.get("params").get("device_type")
        formatted_data = redis_hook_util_10.rget("formatted_util_%s_%s" %
                                                 (device_type, machine_name))
        machine_data = []

        for site_data in formatted_data:
            machine_data.append(eval(site_data))

        redis_hook_util_10.set(
            "aggregated_utilization_%s_%s" % (machine_name, device_type),
            str(machine_data))

    machine_names = set([site.split("_")[0] for site in ss_tech_sites])
    config_machines = set([site.split("_")[0] for site in config_sites])
    aggregate_dependency_ss = {}
    aggregate_dependency_bs = {}
    calculate_task_list = {}
    format_task_list = {}

    #TODO: Remove this if ss >> bs task
    # calculate_utilization_lost_ss_bs_task = PythonOperator(
    # 			task_id = "calculate_bs_utilization_lost_ss",
    # 			provide_context=True,
    # 			python_callable=calculate_utilization_data_bs,
    # 			params={"lost_n_found":True},
    # 			dag=utilization_kpi_subdag_dag
    # 			)

    for each_machine_name in machine_names:
        if each_machine_name in config_machines:

            aggregate_utilization_data_ss_task = PythonOperator(
                task_id="aggregate_utilization_ss_%s" % each_machine_name,
                provide_context=True,
                python_callable=aggregate_utilization_data,
                params={
                    "machine_name": each_machine_name,
                    "technology": ss_name
                },
                dag=utilization_kpi_subdag_dag,
                queue=O7_CALC_Q,
                trigger_rule='all_done')
            aggregate_dependency_ss[
                each_machine_name] = aggregate_utilization_data_ss_task

            calculate_utilization_data_ss_task = PythonOperator(
                task_id="calculate_ss_utilization_kpi_of_%s" %
                each_machine_name,
                provide_context=True,
                trigger_rule='all_done',
                python_callable=calculate_utilization_data_ss,
                params={
                    "machine_name": each_machine_name,
                    "technology": ss_name,
                    'attributes': utilization_attributes
                },
                dag=utilization_kpi_subdag_dag,
                queue=O7_CALC_Q,
            )

            format_data_ss_task = PythonOperator(
                task_id="format_data_of_ss_%s" % each_machine_name,
                provide_context=True,
                python_callable=format_data,
                trigger_rule='all_done',
                params={
                    "machine_name": each_machine_name,
                    "technology": ss_name,
                    'attributes': utilization_attributes
                },
                dag=utilization_kpi_subdag_dag,
                queue=celery_queue,
            )
            calculate_task_list[
                each_machine_name] = calculate_utilization_data_ss_task
            calculate_utilization_data_ss_task >> format_data_ss_task
            format_data_ss_task >> aggregate_utilization_data_ss_task

            # we gotta create the crazy queries WTF this is so unsafe

            INSERT_QUERY = INSERT_HEADER % ("nocout_" +
                                            each_machine_name) + INSERT_TAIL
            UPDATE_QUERY = UPDATE_HEADER % ("nocout_" +
                                            each_machine_name) + UPDATE_TAIL
            INSERT_QUERY = INSERT_QUERY.replace('\n', '')
            UPDATE_QUERY = UPDATE_QUERY.replace('\n', '')

            #ss_name == Device_type
            if not DEBUG:
                insert_data_in_mysql = MySqlLoaderOperator(
                    task_id="upload_data_%s" % (each_machine_name),
                    dag=utilization_kpi_subdag_dag,
                    query=INSERT_QUERY,
                    #data="",
                    redis_key="aggregated_utilization_%s_%s" %
                    (each_machine_name, ss_name),
                    redis_conn_id="redis_hook_util_10",
                    mysql_conn_id='mysql_uat',
                    queue=O7_CALC_Q,
                    trigger_rule='all_done')
                update_data_in_mysql = MySqlLoaderOperator(
                    task_id="update_data_%s" % (each_machine_name),
                    query=UPDATE_QUERY,
                    #data="",
                    redis_key="aggregated_utilization_%s_%s" %
                    (each_machine_name, ss_name),
                    redis_conn_id="redis_hook_util_10",
                    mysql_conn_id='mysql_uat',
                    dag=utilization_kpi_subdag_dag,
                    queue=O7_CALC_Q,
                    trigger_rule='all_done')

                update_data_in_mysql << aggregate_utilization_data_ss_task
                insert_data_in_mysql << aggregate_utilization_data_ss_task

    db_list = []
    for each_site_name in ss_tech_sites:
        if each_site_name in config_sites:
            machine = each_site_name.split("_")[0]
            get_required_data_ss_task = PythonOperator(
                task_id="get_utilization_data_of_ss_%s" % each_site_name,
                provide_context=True,
                trigger_rule='all_done',
                python_callable=get_required_data_ss,
                params={
                    "site_name": each_site_name,
                    "technology": ss_name,
                    'attributes': utilization_attributes
                },
                dag=utilization_kpi_subdag_dag,
                queue=celery_queue)

            get_required_data_ss_task >> calculate_task_list.get(machine)
            #calculate_utilization_data_ss_task >> format_data_ss_task
            #calculate_utilization_data_ss_task >> calculate_utilization_data_bs_task

            # try:

            # 	aggregate_dependency_ss[machine_name] << format_data_ss_task

            # except:
            # 	logging.info("Site Not Found %s"%(machine_name))
            # 	pass

        else:
            logging.info("Skipping %s" % (each_site_name))

    return utilization_kpi_subdag_dag
Example 19
    if table_exists:
        sqls = [drop_table, create_table, load_data]
        for i in sqls:
            redshift_call(i)
    else:
        sqls = [create_table, load_data]
        for i in sqls:
            redshift_call(i)

postgres_to_local_csv = PythonOperator(
    task_id='postgres_to_local_csv',
    provide_context=True,
    python_callable=get_orders_with_bellhops,
    dag=dag)

local_csv_to_s3 = PythonOperator(
    task_id='local_csv_to_s3',
    provide_context=True,
    python_callable=store_orders_with_bellhops,
    dag=dag)

s3_to_redshift = PythonOperator(
    task_id='s3_to_redshift',
    provide_context=True,
    python_callable=transfer_orders_with_bellhops,
    dag=dag) 

local_csv_to_s3.set_upstream(postgres_to_local_csv)
s3_to_redshift.set_upstream(local_csv_to_s3)
    
                'h': '$time.h'
            },
            'hourly': {
                '$sum': 1
            }
        }
    }, {
        '$out': tmp_created_collection_per_hour_name
    }]
    results = db.logs.aggregate(pipeline)
    print("Aggregated hour metrics")
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(task_id='connect_to_mongodb_and_aggregate_day',
                          provide_context=True,
                          python_callable=connect_to_mongodb_and_aggregate_day,
                          dag=dag)

run_this_also = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_hour',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_hour,
    dag=dag)

run_this_also.set_upstream(run_this)

send_email_notification_flow_successful = EmailOperator(
    task_id='send_email_notification_flow_successful',
    to="*****@*****.**",
    subject='custom email from airflow',
    html_content="{{ params['foo'](execution_date) }}",
Example 21
    sms_result = context['task_instance'].xcom_pull(task_ids='send_sms')
    mail_result = context['task_instance'].xcom_pull(task_ids='send_mail')
    call_result = context['task_instance'].xcom_pull(task_ids='send_call')
    # pdb.set_trace()
    print('success')

# p0, p1, p2 and p3 are examples of tasks created by instantiating operators
p0 = PythonOperator(
    task_id='set_group',
    python_callable=set_group,
    dag=dag)

p1 = PythonOperator(
    task_id='set_call',
    provide_context=True,
    python_callable=set_call,
    dag=dag)

p2 = PythonOperator(
    task_id='set_mail',
    provide_context=True,
    python_callable=set_mail,
    dag=dag)

p3 = PythonOperator(
    task_id='set_sms',
    provide_context=True,
    python_callable=set_sms,
    dag=dag)
Example 22
# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

t4 = PythonOperator(
    task_id="python_code",
    python_callable=some_function,
    dag=dag
)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t1)
Example 23
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_bwa", default_args=default_args,
          schedule_interval=None, concurrency=500, max_active_runs=500)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

run_bwa_task = PythonOperator(
    task_id="run_bwa",
    python_callable=run_bwa,
    provide_context=True,
    dag=dag)

run_bwa_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(run_bwa_task)
Example 24
    schedule_interval='25 10 * * *',
)

run_file_1 = BashOperator(
    task_id='run_file_1',
    bash_command=f'python /home/airflow/gcs/data/file_1.py',
    email_on_failure=False,
    dag=dag)

run_file_2 = BashOperator(
    task_id='run_file_2',
    bash_command=f"python /home/airflow/gcs/data/file_2.py",
    email_on_failure=False,
    dag=dag)

push_1 = PythonOperator(task_id='xom_push_try',
                        provide_context=True,
                        python_callable=push,
                        dag=dag)

pull_1 = PythonOperator(task_id='xom_pull_try',
                        provide_context=True,
                        python_callable=pull,
                        dag=dag)

x_com_push_try = BashOperator(task_id='bigquery_ls',
                              bash_command="bq ls",
                              email_on_failure=False,
                              dag=dag)

x_com_push_try >> run_file_1 >> [run_file_2, push_1 >> pull_1]
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("filter-vcf", default_args=default_args,
          schedule_interval=None, concurrency=20000, max_active_runs=20000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)



filter_task = PythonOperator(
    task_id="filter_variants",
    python_callable=filter_variants,
    provide_context=True,
    dag=dag)

filter_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(filter_task)
Example 26
def waitfor_gatk(run_id='runid2', task_id='taskid2'):
    update_key = run_id + '.' + task_id
    while not r.get(update_key):
        print("Job in progress")
        time.sleep(30)
    else:
        print("Completed task GATK")


t1 = BashOperator(
    task_id='bwa_cc',
    bash_command=
    'export KUBECONFIG=/root/.kube/kind-config-kind && kubectl apply -f /tmp/bwapod.yaml',
    dag=dag)
t2 = PythonOperator(task_id='bwa_wait',
                    python_callable=waitfor_bwa,
                    op_kwargs={},
                    dag=dag)
t3 = BashOperator(
    task_id='gatk_cc',
    bash_command=
    'export KUBECONFIG=/root/.kube/kind-config-kind && kubectl apply -f /tmp/gatkpod.yaml',
    dag=dag)
t4 = PythonOperator(task_id='gatk_wait',
                    python_callable=waitfor_gatk,
                    op_kwargs={},
                    dag=dag)

t1 >> t2 >> t3 >> t4
    # clean the entire bucket
    private = [(private_bucket, name) for name in hook.list(private_bucket)]
    shared = [(shared_bucket, name) for name in hook.list(shared_bucket)]

    for bucket_name, object_name in private + shared:
        logging.info("Deleting gs://{}/{}".format(bucket_name, object_name))
        hook.delete(bucket_name, object_name)
        total += 1
    logging.info("Deleted {} objects".format(total))


clean_processor_a = PythonOperator(
    task_id="clean_processor_a",
    python_callable=clean_buckets,
    op_kwargs={
        "private_bucket": BUCKET_PRIVATE_A,
        "shared_bucket": BUCKET_SHARED_A,
        "google_cloud_storage_conn_id": PRIO_A_CONN,
    },
    dag=dag,
)

clean_processor_b = PythonOperator(
    task_id="clean_processor_b",
    python_callable=clean_buckets,
    op_kwargs={
        "private_bucket": BUCKET_PRIVATE_B,
        "shared_bucket": BUCKET_SHARED_B,
        "google_cloud_storage_conn_id": PRIO_B_CONN,
    },
    dag=dag,
)
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)


sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)


clear_latest = BashOperator(bash_command='rm -rf {}/latest_links.txt'.format(
    RAW_TWEET_DIR), task_id='clear_latest', dag=dag)


gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={'query': term})
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
Example 29
    'owner': 'ryan',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Run at the top of the hour Monday to Friday.
# Note: This doesn't line up with the market hours of
# 10PM Sunday till 10PM Friday GMT.
dag = DAG(dag_id='stocks',
          default_args=args,
          schedule_interval='0 * * * 1,2,3,4,5',
          dagrun_timeout=timedelta(seconds=30))
# loop through the stocks we want to use to build up our dag
for stock in stocks:
    get_stocks_task = \
        PythonOperator(task_id='get_stocks_' + str(stock),
                       provide_context=True,
                       op_kwargs={"stock": stock},
                       python_callable=get_stocks,
                       dag=dag)

    cache_latest_stocks_task = \
        PythonOperator(task_id='cache_latest_stocks_' + str(stock),
                       provide_context=True,
                       python_callable=cache_latest_stocks,
                       dag=dag)

    get_stocks_task.set_downstream(cache_latest_stocks_task)
Example 30
    'start_date': datetime.now(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

# DAG is scheduled to run every 8 hours
dag = DAG('PostTweet',
          schedule_interval=timedelta(hours=8),
          default_args=default_args)

# This task will stage all the tweets to the csv file
t1 = PythonOperator(task_id='stage_tweets',
                    python_callable=stage_tweets,
                    dag=dag)

# This task will commit all the tweets to the csv file
t2 = PythonOperator(task_id='commit_tweets',
                    python_callable=commit_tweets,
                    dag=dag)

# This task is used to send the tweets to twitter
t3 = PythonOperator(task_id='post_status', python_callable=post_tweet, dag=dag)

# Backup all the files and tweets to google drive
t4 = PythonOperator(task_id='backup', python_callable=upload.main, dag=dag)

t1.set_downstream(t2)
t2.set_downstream(t3)
Example 31
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval="@monthly",
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
)

check_trips = PythonOperator(
    task_id='check_trips_data',
    dag=dag,
    python_callable=check_greater_than_zero,
    provide_context=True,
    params={
        'table': 'trips',
    }
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
Example 32
    'depends_on_past': False,
    'start_date': dt.datetime.strptime('2020-03-24T00:00:00',
                                       '%Y-%m-%dT%H:%M:%S'),
    'provide_context': True
}

# creating a new dag
dag = DAG('dataflow_process_dag',
          default_args=default_args,
          schedule_interval='0 0 * * 2',
          max_active_runs=1)

# Integrating the different operator tasks in the airflow dag
# Integrating read_data operator in airflow dag
read_table = PythonOperator(task_id='read_table',
                            python_callable=read_data,
                            op_kwargs={'fig_path': fig_path},
                            dag=dag)
# Integrating data_report operator in airflow dag
data_report = PythonOperator(task_id='data_report',
                             python_callable=data_report,
                             op_kwargs={'fig_path': fig_path},
                             dag=dag)
# Integrating plots operator in airflow dag
plots = PythonOperator(task_id='var_dist_plots',
                       python_callable=plot_var_distributions,
                       op_kwargs={'fig_path': fig_path},
                       dag=dag)
# Integrating train_test operator in airflow dag
train_test = PythonOperator(task_id='train_test',
                            python_callable=make_train_test,
                            op_kwargs={'fig_path': fig_path},
Example 33
    for dag in dags:
        if not os.path.exists(dag.fileloc):
            logging.info("After checking DAG '" + str(dag) +
                         "', the Python definition file DOES NOT exist.")
            entries_to_delete.append(dag)
        else:
            logging.info("After checking DAG '" + str(dag) +
                         "', the Python definition file does exist.")

    logging.info("Process will be Deleting the DAG(s) from the DB:")
    for entry in entries_to_delete:
        logging.info("\tEntry: " + str(entry))
    logging.info("Process will be Deleting " + str(len(entries_to_delete)) +
                 " DAG(s)")

    if ENABLE_DELETE:
        logging.info("Performing Delete...")
        for entry in entries_to_delete:
            session.delete(entry)
        logging.info("Finished Performing Delete")
    else:
        logging.warn("You're opted to skip deleting the DAG entries!!!")

    logging.info("Finished Running Clear Process")


clear_missing_dags = PythonOperator(task_id='clear_missing_dags',
                                    python_callable=clear_missing_dags_fn,
                                    provide_context=True,
                                    dag=dag)
import logging

from airflow import DAG
from airflow.operators import PythonOperator

from datetime import datetime, timedelta

args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 4, 1),
    'provide_context': True
}

dag = DAG('spark_count_lines',
          start_date=datetime(2019, 4, 1),
          schedule_interval='@monthly',
          dagrun_timeout=timedelta(minutes=60),
          default_args=args)


def run_spark(**kwargs):
    import pyspark
    sc = pyspark.SparkContext()
    df = sc.textFile('file:////Users/kajariverma/airflow/dags/test.py')
    logging.info('Number of lines in people.txt = {0}'.format(df.count()))
    sc.stop()


t_main = PythonOperator(task_id='call_spark',
                        dag=dag,
                        python_callable=run_spark)
"""
Example with PythonOperator
"""
from airflow import DAG
from airflow.operators import PythonOperator
from datetime import datetime, timedelta

default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is your cron expression you can write any cron expression like unix cron.
dag = DAG('airflow_with_python_operator',
          default_args=default_args,
          schedule_interval="1 * * * *")


def my_function():
    '''This is a function that will run within the DAG execution'''
    return "Check me in the logs"


# Note that unlike the other examples we are using PythonOperator here.
# The 'python_callable' parameter determines which python function to
# execute.
run_this = PythonOperator(task_id='run_my_function',
                          python_callable=my_function,
                          dag=dag)
Example 36
    task_id='setup_jobs',
    provide_context=True,
    python_callable=setup_jobs_fn,
    dag=dag)


def collect_results_fn(ds, **kwargs):
    pprint(kwargs)
    print(ds)


collect_results = PythonOperator(
    task_id='collect_results',
    provide_context=True,
    python_callable=collect_results_fn,
    dag=dag)


for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i)/10},
        dag=dag)
    task.set_upstream(setup_jobs)
    task.set_downstream(collect_results)
Example 37
def pusher_dynamic(my_task_id, **kwargs):
	#print(ds)
	print("pushing| my task id: "+str(my_task_id)+" Notice, the task operator id is also pushed, imliciltly")
	print(kwargs)
	print(kwargs['ti'])
	kwargs['ti'].xcom_push(key='value from pusher dynamic', value=int(my_task_id) )
	return 'Whatever you return gets printed in the logs'

def puller_dynamic(my_task_id,**kwargs):
        ti = kwargs['ti']
        pulled_value = ti.xcom_pull(key='value from pusher dynamic', task_ids='push_'+str(my_task_id) )
        print ("pulled value based on pusher_id: " +str(pulled_value))

i=1
push1 = PythonOperator(task_id='push_1',	provide_context=True,dag=dag,python_callable=pusher)
pull1 = PythonOperator(task_id='pull_1',	provide_context=True,dag=dag,python_callable=puller)

#notice I am pulling based on the push_1 id; expected value to push is 2, for pull is 1, b/c we are sending the push_1 id...
i=i+1
push2 = PythonOperator(task_id='push_2', 	provide_context=True,dag=dag,python_callable=pusher)
pull2 = PythonOperator(task_id='pull_2',       	provide_context=True,dag=dag,python_callable=puller)



#trying to create a dynamic pusher called pusher_dynamic, accepting a counter and pushing it to MySQL
i=i+1
my_task_id=i
push3 = PythonOperator(task_id='push_'+str(i),        provide_context=True,python_callable=pusher_dynamic,op_kwargs={'my_task_id': my_task_id},dag=dag)
pull3 = PythonOperator(task_id='pull_'+str(i),        provide_context=True,python_callable=puller_dynamic,op_kwargs={'my_task_id': my_task_id},dag=dag)
Example 38
    # list files in ftp server
    ftp_files = set(key for key in sftp.listdir("upload")
                    if key.lower().endswith(('.pp', '.nc')))
    print(ftp_files)

    # find the files in ftp but not in s3
    upload_files = ftp_files - s3_files
    print(upload_files)

    # upload each file in list of files to uploaded
    for key in upload_files:
        print('Uploading ' + key)
        upload(key)


dag = DAG(dag_id='ftp-to-s3',
          default_args={
              'owner': 'airflow',
              'start_date': airflow.utils.dates.days_ago(2)
          },
          schedule_interval='* * * * *',
          dagrun_timeout=timedelta(minutes=60),
          catchup=False,
          max_active_runs=1)

task = PythonOperator(task_id='ftp', python_callable=ftp, dag=dag)

if __name__ == "__main__":
    dag.cli()
def connect_to_mongodb_and_aggregate_hour(ds, **kwargs):
    db = MongoClient().test
    tmp_created_collection_per_hour_name = 'page_per_hour_hits_tmp'
    pipeline = [
        {'$project': {'page': '$PAGE',
                      'time': {'y': {'$year': '$DATE'}, 'm': {'$month': '$DATE'},
                               'day': {'$dayOfMonth': '$DATE'}, 'h': {'$hour': '$DATE'}}}},
        {'$group': {'_id': {'p': '$page', 'y': '$time.y', 'm': '$time.m',
                            'd': '$time.day', 'h': '$time.h'},
                    'hourly': {'$sum': 1}}},
        {'$out': tmp_created_collection_per_hour_name}
    ]
    results = db.logs.aggregate(pipeline)
    print("Aggregated hour metrics")
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_day',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_day,
    dag=dag)

run_this_also = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_hour',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_hour,
    dag=dag)

run_this_also.set_upstream(run_this)

send_email_notification_flow_successful = EmailOperator(
    task_id='send_email_notification_flow_successful',
    to="*****@*****.**",
    subject='custom email from airflow',
    html_content="{{ params['foo'](execution_date) }}",
    params=params,
    dag=dag)

send_email_notification_flow_successful.set_upstream(run_this_also)
Example 40
dag = DAG(dag_id='example_python_operator', default_args=args)


def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


run_this = PythonOperator(task_id='print_the_context',
                          provide_context=True,
                          python_callable=print_context,
                          dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds
    respectively
    '''
    task = PythonOperator(task_id='sleep_for_' + str(i),
                          python_callable=my_sleeping_function,
                          op_kwargs={'random_base': float(i) / 10},
                          dag=dag)

    task.set_upstream(run_this)
    conn = sqlite.get_conn()
    query = """select * from tweets where
    created > date('now', '-1 days') and urls is not null
    order by favorite_count"""
    df = pd.read_sql_query(query, conn)
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open('{}/latest_links.txt'.format(directory), write_mode) as latest:
        wrtr = writer(latest)
        wrtr.writerow(['url', 'count'])
        wrtr.writerows(cntr.most_common(5))


simple_search = PythonOperator(task_id='search_twitter',
                               provide_context=True,
                               python_callable=search_twitter,
                               dag=dag,
                               params={'query': '#python'})


move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite',
                                       provide_context=True,
                                       python_callable=csv_to_sqlite,
                                       dag=dag)


id_popular = PythonOperator(task_id='identify_popular_links',
                            provide_context=True,
                            python_callable=identify_popular_links,
                            dag=dag)
end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline', dag=dag)

pg_unload = PostgresOperator(
    dag=dag,
    task_id='pg_unload',
    sql=unload_user_purchase,
    postgres_conn_id='postgres_default',
    params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
    depends_on_past=True,
    wait_for_downstream=True)

user_purchase_to_s3_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_s3_stage',
    python_callable=_local_to_s3,
    op_kwargs={
        'filename': temp_filtered_user_purchase,
        'key': temp_filtered_user_purchase_key,
    },
)
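# The _local_to_s3 callable used above is defined outside this excerpt. A
# minimal sketch, assuming it simply uploads the local file with boto3; the
# BUCKET_NAME constant is hypothetical.
import boto3

BUCKET_NAME = 'user-purchase-staging'  # hypothetical bucket name


def _local_to_s3(filename, key, bucket_name=BUCKET_NAME):
    """Upload a local file to S3 so downstream staging steps can read it."""
    s3 = boto3.client('s3')
    s3.upload_file(filename, bucket_name, key)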

# remove_local_user_purchase_file = PythonOperator(
#     dag=dag,
#     task_id='remove_local_user_purchase_file',
#     python_callable=remove_local_file,
#     op_kwargs={
#         'filelocation': temp_filtered_user_purchase,
#     },
# )

# movie_review_to_s3_stage = PythonOperator(
Esempio n. 43
    attachment.add_header("Content-Disposition", "attachment", filename=filename)
    msg = MIMEMultipart()
    msg.attach(attachment)
    msg["Subject"] = str("Resultado Analise Notebook")
    msg["From"] = "*****@*****.**"
    msg["Reply-to"] = "*****@*****.**"

    # Credentials are hardcoded in the original example; in practice they should
    # come from an Airflow connection or environment variables (see the sketch
    # at the end of this example).
    server = smtplib.SMTP("smtp.gmail.com:587")
    server.ehlo()
    server.starttls()
    server.login("*****@*****.**", "a.24423242")
    server.sendmail(msg["From"], "*****@*****.**", msg.as_string())
    server.quit()


def acessNotebook():
    # "Acessando Notebook no Swift" = "Accessing the notebook in Swift"
    print("Acessando Notebook no Swift")


runZero = PythonOperator(task_id="Acess_Notebook", provide_context=False, python_callable=acessNotebook, dag=dag)

runFirst = PythonOperator(task_id="ExecNotebook", provide_context=False, python_callable=execNotebook, dag=dag)

runSecond = PythonOperator(task_id="Get_Output", provide_context=False, python_callable=getOutput, dag=dag)

runThird = PythonOperator(task_id="send_email", provide_context=False, python_callable=send_email, dag=dag)

runZero.set_downstream(runFirst)
runFirst.set_downstream(runSecond)
runSecond.set_downstream(runThird)
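# The example above logs in with credentials embedded in the source. A minimal
# sketch of building the SMTP connection from environment variables instead;
# the SMTP_USER and SMTP_PASSWORD variable names are assumptions.
import os
import smtplib


def get_smtp_server():
    """Open an authenticated connection to Gmail using credentials from the environment."""
    user = os.environ['SMTP_USER']
    password = os.environ['SMTP_PASSWORD']
    server = smtplib.SMTP('smtp.gmail.com', 587)
    server.ehlo()
    server.starttls()
    server.login(user, password)
    return server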
Esempio n. 44
        
        response = elastic.index(index=INDEX_NAME, doc_type=TYPE_NAME, id=uuid, body=Doc_Source)
        print('uuid is', uuid)
        uuid += 1
        
    Variable.set("uuid", uuid)    
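# The loop above persists its running document id with Variable.set; on the
# next run the counter would presumably be read back the same way. A minimal
# sketch of that read (falling back to 0 on the first run is an assumption):
uuid = int(Variable.get("uuid", default_var=0))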

    
with DAG(dag_id='ETL_Import_Yearly',
         description='Trade',
         start_date=datetime(2020, 6, 15),
         end_date=None,
         schedule_interval='@yearly',
         default_args=args) as dag:
    for countrycode, countryname in country_dict.items():
        for hslevelcode in hslevel_list:
            for currency in currency_list:

                task1 = SeleniumOperator(
                    script=get_df,
                    script_args=[data_folder, countrycode, hslevelcode, currency],
                    task_id='Extract_Data_' + countrycode + '_' + hslevelcode + '_' + currency)

                task2 = PythonOperator(
                    task_id='Transform_Data_' + countrycode + '_' + hslevelcode + '_' + currency,
                    # op_kwargs={'countrycode': countrycode},
                    python_callable=transform_data)

                task3 = PythonOperator(
                    task_id='Load_Data_' + countrycode + '_' + hslevelcode + '_' + currency,
                    op_kwargs={'countryname': countryname, 'hslevel': hslevelcode,
                               'currency': currency, 'countrycode': countrycode},
                    python_callable=load_data)

                # Set Dependencies
                task1 >> task2 >> task3
Esempio n. 45
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)

run_this_last.set_upstream(t2)
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    # 'catchup' is a DAG-level argument, not a task default_arg; it is passed
    # to the DAG constructor below.
}

dag = DAG('sparkify_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          catchup=False)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_in_redshift = PythonOperator(
    task_id="create_tables_in_redshift",
    dag=dag,
    provide_context=True,
    python_callable=create_table,
    # PythonOperator does not accept redshift_conn_id or aws_credentials_id as
    # its own arguments; pass them to the callable through op_kwargs instead.
    op_kwargs={
        "redshift_conn_id": "redshift",
        "aws_credentials_id": "aws_credentials",
    })
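# The create_table callable itself is not shown in this excerpt. A minimal
# sketch, assuming it simply runs DDL through a PostgresHook bound to the
# 'redshift' connection; the CREATE_TABLES_SQL constant is hypothetical.
from airflow.hooks.postgres_hook import PostgresHook

CREATE_TABLES_SQL = "CREATE TABLE IF NOT EXISTS staging_events (event_id INT);"  # hypothetical DDL


def create_table(redshift_conn_id="redshift", **context):
    """Create the target tables in Redshift before the staging tasks load data."""
    redshift = PostgresHook(postgres_conn_id=redshift_conn_id)
    redshift.run(CREATE_TABLES_SQL)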

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    copy_json_option="s3://udacity-dend/log_json_path.json",
)
    'email': [alert_email],
    'email_on_failure': True,
    'email_on_retry': False,
}


# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)


get_file = PythonOperator(
    task_id='get-file-from-s3',
    python_callable=FileGetter(),
    dag=dag)

hello_world_docker_write_logs = BashOperator(
    task_id='hello-world',
    bash_command=start_hello_world,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)

check_read_logs = PythonOperator(
    task_id='check_read_logs',
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
Esempio n. 48

args = {
    'owner': 'mark',
    'depends_on_past': False,
    # Note: a dynamic start_date such as utcnow() makes the schedule hard to
    # reason about; a fixed date is generally preferable.
    'start_date': datetime.utcnow(),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Run at the top of the hour Monday to Friday.
# Note: This doesn't line up with the market hours of
# 10PM Sunday till 10PM Friday GMT.
dag = DAG(dag_id='rates',
          default_args=args,
          schedule_interval='0 * * * 1,2,3,4,5',
          dagrun_timeout=timedelta(seconds=30))

get_rates_task = \
    PythonOperator(task_id='get_rates',
                   provide_context=True,
                   python_callable=get_rates,
                   dag=dag)

cache_latest_rates_task = \
    PythonOperator(task_id='cache_latest_rates',
                   provide_context=True,
                   python_callable=cache_latest_rates,
                   dag=dag)

get_rates_task.set_downstream(cache_latest_rates_task)
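# Neither get_rates nor cache_latest_rates appears in this excerpt. Under the
# assumption that get_rates pushes its payload to XCom, a minimal sketch of the
# caching callable; using an Airflow Variable as the cache is purely for
# illustration.
from airflow.models import Variable


def cache_latest_rates(**context):
    """Pull whatever get_rates pushed to XCom and cache it for quick lookup."""
    rates = context['ti'].xcom_pull(task_ids='get_rates')
    Variable.set('latest_rates', rates)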
Esempio n. 49
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("bcftools", default_args=default_args,
          schedule_interval=None, concurrency=20000, max_active_runs=20000)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

bcftools_task = PythonOperator(
    task_id="bcftools",
    python_callable=bcftools,
    provide_context=True,
    dag=dag)

bcftools_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(bcftools_task)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_variant_calling", default_args=default_args,
          schedule_interval=None, concurrency=500, max_active_runs=500)


start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

run_sanger_callers_task = PythonOperator(
    task_id="run_sanger_callers",
    python_callable=run_sanger_callers,
    provide_context=True,
    dag=dag)

run_sanger_callers_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

complete_analysis_run_task.set_upstream(run_sanger_callers_task)