    conn = sqlite.get_conn()
    query = "select * from twitter_terms"
    df = pd.read_sql_query(query, conn)
    return random.choice([
        'search_{}_twitter'.format(re.sub(r'\W+', '', t))
        for t in df.search_term.values
    ])


fill_search_terms = PythonOperator(task_id='fill_terms',
                                   provide_context=True,
                                   python_callable=fill_terms,
                                   dag=dag)

gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)

email_links = EmailOperator(task_id='email_best_links',
                            to='*****@*****.**',
                            subject='Latest popular links',
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)

sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)
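# fill_terms and generate_search_terms are defined earlier in the source
# (the return block above is the tail of generate_search_terms); a hedged
# sketch of what fill_terms might do (the seed terms and table layout are
# assumptions), seeding the twitter_terms table the branch callable reads:
def fill_terms(**context):
    conn = sqlite.get_conn()
    df = pd.DataFrame(['airflow', 'data'], columns=['search_term'])
    df.to_sql('twitter_terms', conn, if_exists='replace', index=False)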
def _get_task_id(execution_date, **context):
    return 'email_' + weekday_person_to_email[execution_date.weekday()]


def _print_weekday(execution_date: datetime, **context):
    print(execution_date.strftime('%a'))


with dag:
    print_weekday = PythonOperator(
        task_id='print_weekday',
        python_callable=_print_weekday,
        provide_context=True,
    )

    branching = BranchPythonOperator(
        task_id='branching',
        python_callable=_get_task_id,
        provide_context=True,
    )

    users = ['bob', 'alice', 'joe']
    branches = [DummyOperator(task_id='email_' + user) for user in users]

    end = BashOperator(task_id='end',
                       bash_command='echo "That\'s it folks!"',
                       trigger_rule=TriggerRule.ONE_SUCCESS)

    print_weekday >> branching >> branches >> end
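# _get_task_id above relies on a weekday_person_to_email mapping that is
# not shown in this snippet; a minimal sketch of the shape it would need
# (the weekday-to-person assignments are hypothetical) so that it always
# resolves to one of the email_* task ids:
weekday_person_to_email = {
    0: 'bob',    # Monday
    1: 'joe',    # Tuesday
    2: 'alice',  # Wednesday
    3: 'joe',    # Thursday
    4: 'alice',  # Friday
    5: 'bob',    # Saturday
    6: 'bob',    # Sunday
}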
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())
args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
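# The unchosen branches end up skipped, and that state cascades to their
# follow_* tasks; a fan-in task (a hypothetical addition, not part of the
# original example) therefore needs a permissive trigger rule:
join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)
for option in options:
    dag.get_task('follow_' + option).set_downstream(join)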
        else:
            df.at[i, 'status'] = status
            print("Failed record {}, message - {}".format(i, status))
    return 'success'


# Note: the original passed the string 'catchup=False' positionally, which
# lands in the DAG's description argument and leaves catchup enabled; the
# keyword form below is what was intended.
with airflow.DAG('arya_kx_delta_load',
                 catchup=False,
                 default_args=default_args,
                 schedule_interval=datetime.timedelta(hours=12)) as dag:

    Start = DummyOperator(task_id='Start')

    is_new_delta_file = BranchPythonOperator(task_id='is_new_delta_file',
                                             provide_context=True,
                                             python_callable=delta_file_check,
                                             trigger_rule="all_done",
                                             dag=dag)

    load_delta_files = PythonOperator(task_id='load_delta_files',
                                      provide_context=True,
                                      python_callable=load_data,
                                      dag=dag)

    End = DummyOperator(task_id='End')

    Start >> is_new_delta_file
    is_new_delta_file >> [load_delta_files, End]
    load_delta_files >> End
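# delta_file_check is defined elsewhere in the source; a hedged sketch of
# the contract it must satisfy (the directory and file suffix below are
# assumptions): a branch callable returns the task_id to follow, here
# either 'load_delta_files' or 'End'.
import os

def delta_file_check(**context):
    delta_dir = '/data/arya_kx/delta'  # hypothetical landing directory
    has_new_files = any(f.endswith('.delta') for f in os.listdir(delta_dir))
    return 'load_delta_files' if has_new_files else 'End'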
            logging.info(
                "Opting to send an email to alert the users that processes were killed"
            )
            return send_processes_killed_email.task_id
        else:
            logging.info("enable_kill is set to False")
    else:
        logging.info("Processes to kill list was either None or Empty")
    logging.info(
        "Opting to skip sending an email since no processes were killed")
    return ""


email_or_not_branch = BranchPythonOperator(task_id="email_or_not_branch",
                                           python_callable=branch_function,
                                           provide_context=True,
                                           dag=dag)

send_processes_killed_email = EmailOperator(
    task_id="send_processes_killed_email",
    to=PROCESS_KILLED_EMAIL_ADDRESSES,
    subject=PROCESS_KILLED_EMAIL_SUBJECT,
    html_content="""
    <html>
    <body>
        <h2>Dag Run Information</h2>
        <table>
            <tr><td><b> ID: </b></td><td>{{ dag_run.id }}</td></tr>
            <tr><td><b> DAG ID: </b></td><td>{{ dag_run.dag_id }}</td></tr>
            <tr><td><b> Execution Date: </b></td><td>{{ dag_run.execution_date }}</td></tr>
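# Returning "" from branch_function leaves no branch selected, which,
# depending on the Airflow version, either skips all downstream tasks or
# errors out. A more explicit pattern (the 'skip_email' task is a
# hypothetical addition) is to always return a real task_id:
skip_email = DummyOperator(task_id="skip_email", dag=dag)
# ...and in branch_function, replace `return ""` with `return "skip_email"`.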
Column 2 is the hour, 0-23 (0 means midnight)
Column 3 is the day of month, 1-31
Column 4 is the month, 1-12
Column 5 is the day of week, 0-6 (0 means Sunday)
Column 6 is the command to run
'''

dag = DAG('source_data_count',
          default_args=default_args,
          schedule_interval="0 12 * * *")

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: 'source_count'
    if datetime.now().day <= 7 and datetime.today().weekday() == 6
    else 'ignore_not_sunday',
    dag=dag)
branching.set_upstream(run_this_first)

esucc = EmailOperator(task_id='email_success_' + dag.dag_id,
                      to=email_addr,
                      subject=dag.dag_id + ' [success] on ' +
                      datetime.now().strftime('%Y-%m-%d'),
                      html_content='Congratulations!',
                      trigger_rule='all_success',
                      dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='/disk1/source_data_count; ./daily_table_count.sh > out.log ',
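# The lambda above selects 'source_count' only on the first Sunday of the
# month (day <= 7 and weekday() == 6); a named callable, sketched here,
# states that intent more plainly and is easier to unit-test:
def pick_branch():
    today = datetime.now()
    is_first_sunday = today.day <= 7 and today.weekday() == 6
    return 'source_count' if is_first_sunday else 'ignore_not_sunday'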
from airflow.models import DAG
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.utils import chain
from datetime import datetime, timedelta
import random

# timedelta(7) is a week, so this start date is seven days ago
seven_days_ago = datetime.combine(datetime.today() - timedelta(7),
                                  datetime.min.time())

default_args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG('branch', default_args=default_args)

t1 = DummyOperator(task_id='task1', dag=dag)
b1 = DummyOperator(task_id='branch1', dag=dag)
b2 = DummyOperator(task_id='branch2', dag=dag)
b3 = DummyOperator(task_id='branch3', dag=dag)

select = BranchPythonOperator(
    task_id='select',
    python_callable=lambda: random.choice(['branch1', 'branch2', 'branch3']),
    dag=dag)

chain(t1, select)
chain(select, b1)
chain(select, b2)
chain(select, b3)
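# The three chain() calls fan select out to b1/b2/b3; the same wiring can
# be written with the bitshift API (shown commented out so the
# dependencies are not registered twice):
# t1 >> select >> [b1, b2, b3]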
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag)

t4 = QuboleOperator(
    task_id='hadoop_jar_cmd',
    command_type='hadoopcmd',
    sub_command='jar s3://paid-qubole/HadoopAPIExamples/jars/hadoop-0.20.1-dev-streaming.jar '
                '-mapper wc -numReduceTasks 0 '
                '-input s3://paid-qubole/HadoopAPITests/data/3.tsv '
                '-output s3://paid-qubole/HadoopAPITests/data/3_wc',
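# t4 and the other branch tasks would fan back into 'join'; because the
# join uses trigger_rule='one_success', it fires as soon as the randomly
# chosen branch succeeds and ignores the skipped ones, e.g.:
# t4.set_downstream(join)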
}

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):
    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(task_id='condition',
                            provide_context=True,
                            python_callable=should_run,
                            dag=dag)

oper_1 = DummyOperator(task_id='oper_1', dag=dag)
oper_1.set_upstream(cond)

oper_2 = DummyOperator(task_id='oper_2', dag=dag)
oper_2.set_upstream(cond)
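# Exactly one of oper_1/oper_2 runs per tick, so a common follow-up (a
# hypothetical addition, not in the original example) is a join that
# fires on whichever branch ran:
done = DummyOperator(task_id='done', trigger_rule='one_success', dag=dag)
[oper_1, oper_2] >> done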
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):
    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)
    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### processing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""
    start = DummyOperator(task_id="start", dag=dag)

    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for Spanish accents
                date = '/'.join(
                    list(
                        re.compile(r"([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(
                    date, languages=['pt', 'es'],
                    date_formats=['%d/%b']).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )
    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]
    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = ('https://www.smiles.com.ar/emission'
                     '?originAirportCode={}&destinationAirportCode={}'
                     '&departureDate={}&adults=1&children=0&infants=0'
                     '&isFlexibleDateChecked=false&tripType=3'
                     '&currencyCode=BRL&segments=2&departureDate2={}'
                     '&originAirportCode2={}'
                     '&destinationAirportCode2={}').format(
                         airpots_codes[0][0], airpots_codes[1], date_ml,
                         date_ml, airpots_codes[0][1], airpots_codes[1])
        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )
    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """

    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)
    return dag
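# create_dag is typically invoked at module import time so the scheduler
# discovers one DAG per route; a minimal sketch of such a call (the route
# tuple, schedule, start date, and default_args are all hypothetical):
for codes in [(('EZE', 'AEP'), 'MIA')]:
    dag_id = 'smiles_{}_{}'.format(codes[0][0], codes[1])
    globals()[dag_id] = create_dag(dag_id,
                                   schedule='@daily',
                                   start_date=datetime(2019, 1, 1),
                                   delta_sensor=60,
                                   airpots_codes=codes,
                                   default_args={'owner': 'airflow'})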
    # s.connect((socket_ip, socket_port))
    s.connect(socket_path)
    s.settimeout(60.0)
    s.send(query)
    s.shutdown(socket.SHUT_WR)
    output = b''
    while True:
        try:
            out = s.recv(100000000)
        except socket.timeout as e:
            err = e.args[0]
            print('socket timeout ..Exiting')
            if err == 'timed out':
                sys.exit(1)
        # recv() returns an empty bytestring at EOF
        if not len(out):
            break
        output += out
    return output


extract_data = BranchPythonOperator(task_id="ExtractData",
                                    provide_context=False,
                                    python_callable=connect_and_extract,
                                    dag=main_service_dag)
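# connect_and_extract returns the raw socket payload, but a
# BranchPythonOperator expects its callable to return the task_id of the
# branch to follow, so wiring it in directly is suspect. A hedged sketch
# of a wrapper (the downstream task ids are hypothetical):
def branch_on_extract():
    output = connect_and_extract()
    return 'process_data' if output else 'no_data'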