def test_logging_head_error_request(self, mock_session_send):
    def resp_check(resp):
        return True

    response = requests.Response()
    response.status_code = 404
    response.reason = 'Not Found'
    mock_session_send.return_value = response

    task = HttpSensor(
        dag=self.dag,
        task_id='http_sensor_head_method',
        http_conn_id='http_default',
        endpoint='',
        request_params={},
        method='HEAD',
        response_check=resp_check,
        timeout=5,
        poke_interval=1
    )

    with mock.patch.object(task.hook.log, 'error') as mock_errors:
        with self.assertRaises(AirflowSensorTimeout):
            task.execute(None)
        self.assertTrue(mock_errors.called)
        mock_errors.assert_called_with('HTTP error: %s', 'Not Found')
def test_head_method(self, mock_session_send):
    def resp_check(resp):
        return True

    task = HttpSensor(
        dag=self.dag,
        task_id='http_sensor_head_method',
        http_conn_id='http_default',
        endpoint='',
        request_params={},
        method='HEAD',
        response_check=resp_check,
        timeout=5,
        poke_interval=1)

    task.execute(None)

    args, kwargs = mock_session_send.call_args
    received_request = args[0]

    prep_request = requests.Request(
        'HEAD',
        'https://www.google.com',
        {}).prepare()

    self.assertEqual(prep_request.url, received_request.url)
    # assertEqual (not assertTrue) so the two HTTP methods are actually compared
    self.assertEqual(prep_request.method, received_request.method)
def test_sensor(self):
    sensor = HttpSensor(
        task_id='http_sensor_check',
        http_conn_id='http_default',
        endpoint='/search',
        request_params={"client": "ubuntu", "q": "airflow", 'date': '{{ds}}'},
        headers={},
        response_check=lambda response: (
            "airbnb/airflow/" + DEFAULT_DATE.strftime('%Y-%m-%d') in response.text),
        poke_interval=5,
        timeout=15,
        dag=self.dag)
    sensor.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_poke_exception(self, mock_session_send):
    """
    An exception raised in the poke function should not be ignored.
    """
    response = requests.Response()
    response.status_code = 200
    mock_session_send.return_value = response

    def resp_check(resp):
        raise AirflowException('AirflowException raised here!')

    task = HttpSensor(
        task_id='http_sensor_poke_exception',
        http_conn_id='http_default',
        endpoint='',
        request_params={},
        response_check=resp_check,
        timeout=5,
        poke_interval=1)

    with self.assertRaisesRegexp(AirflowException, 'AirflowException raised here!'):
        task.execute(None)
              'r') as readfile:
        reader = csv.DictReader(readfile, delimiter=',')
        for row in reader:
            print(row)

with DAG(dag_id="willshire_dag", schedule_interval="@daily",
         default_args=default_args, catchup=False) as dag:

    ## WILLSHIRE RATES
    is_willshire_5000_available = HttpSensor(
        task_id="is_willshire_5000_available",
        method="GET",
        http_conn_id="willshire_api",
        endpoint='latest',
        response_check=lambda response: "DATE" in response.text,
        poke_interval=5,
        timeout=20)

    downloading_willshire_rates = PythonOperator(
        task_id="downloading_willshire_rates",
        python_callable=download_willshire_rates)

    is_willshire_file_available = FileSensor(
        task_id="is_willshire_file_available",
        fs_conn_id="willshire_path",
        filepath="willshire.csv",
        poke_interval=5,
        timeout=20)
"email_on_failure": False, "email_on_retry": False, "email": "*****@*****.**", "retries": 1, "retry_delay": timedelta(minutes=5) } slack_token = BaseHook.get_connection("slack_conn").password with DAG(dag_id="pycon_dag", schedule_interval="*/5 * * * *", default_args=default_args, catchup=False) as dag: esta_todo_bien = HttpSensor( task_id="ping_webpage", method="GET", http_conn_id="pyconar", endpoint="events/pyconar2020/", response_check=lambda response: 200 == response.status_code, poke_interval=5, timeout=20 ) sending_slack_notification = SlackWebhookOperator( task_id='sending_slack', http_conn_id='slack_conn', webhook_token=slack_token, message="Esta todo bien! \n Ahora toma un gatito! " "https://www.youtube.com/watch?v=J---aiyznGQ", username='******', icon_url='https://raw.githubusercontent.com/apache/' 'airflow/master/airflow/www/static/pin_100.png', dag=dag
              'a') as outfile:
        json.dump(outdata, outfile)
        outfile.write('\n')

with DAG(
    dag_id="forex_data_pipeline",
    schedule_interval="@daily",
    default_args=default_args,
    catchup=False,
) as dag:

    is_forex_rates_available = HttpSensor(
        task_id="is_forex_rates_available",
        http_conn_id="forex_api",
        endpoint="marclamberti/f45f872dea4dfd3eaa015a4a1af4b39b",
        response_check=lambda response: "rates" in response.text,
        poke_interval=5,
        timeout=20,
    )

    is_forex_currencies_file_available = FileSensor(
        task_id="is_forex_currencies_file_available",
        fs_conn_id="forex_path",
        filepath="forex_currencies.csv",
        poke_interval=5,
        timeout=20,
    )

    downloading_rates = PythonOperator(
        task_id="downloading_rates",
        python_callable=download_rates)
DAG_NAME = 'HTTP_OPERATOR_TEST'

args = {'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(10)}

dag = DAG(
    dag_id=DAG_NAME,
    catchup=False,
    default_args=args,
    schedule_interval='3 12 * * *',
)

start_task = DummyOperator(task_id='starting_task', dag=dag)

http_sensor_task = HttpSensor(task_id='http_sensor_task',
                              http_conn_id='https_default',
                              method='GET',
                              endpoint='dog.ceo/api/breed/hound/images',
                              headers={"Content-Type": "application/json"},
                              xcom_push=True,
                              dag=dag)

t1 = SimpleHttpOperator(task_id='get_labrador',
                        method='GET',
                        http_conn_id='https_default',
                        endpoint='dog.ceo/api/breed/hound/images',
                        headers={"Content-Type": "application/json"},
                        xcom_push=True,
                        dag=dag)

http_sensor_task >> start_task >> t1
    hook = S3_hook.S3Hook('s3_connection')
    hook.load_file(filename, key, bucket_name)

# Initiate our dag definition
dag = DAG(
    dag_id="us_covid_daily_report_pipeline",
    start_date=datetime(2020, 12, 1),
    schedule_interval="@daily"
)

# Check if Covid api has data for a given date
is_covid_api_available = HttpSensor(
    task_id="is_covid_api_available",
    method="GET",
    http_conn_id="covid_api",
    endpoint="v1/us/{{ ds_nodash }}.json",
    response_check=lambda response: "date" in response.text,
    poke_interval=5,
    timeout=20
)

# Download Covid US states data for a given date
fetch_covid_us_data = BashOperator(
    task_id="fetch_covid_us_data",
    bash_command="curl -o /opt/airflow/data/{{ ds_nodash }}.json \
        --request GET \
        --url https://api.covidtracking.com/v1/us/{{ ds_nodash }}.json",
    dag=dag
)

# Upload Covid data to S3 using a helper function
        message=slack_msg,
        username='******')
    return slack_send_message.execute(context=context)

with DAG(
    dag_id='get_gh_archive_2',
    default_args=default_args,
    schedule_interval='0 * * * *',
    catchup=True,
    max_active_runs=1,
) as dag:

    sensor = HttpSensor(
        task_id='check_if_present',
        http_conn_id=GH_ARCHIVE_CONN_ID,
        endpoint='{{execution_date.strftime("%Y-%m-%d-%-H")}}.json.gz',
        method='HEAD',
        poke_interval=5,
        dag=dag,
    )

    get_gh_archive = DownloadGZIPOperator(
        task_id='download',
        http_conn_id=GH_ARCHIVE_CONN_ID,
        endpoint='{{execution_date.strftime("%Y-%m-%d-%-H")}}.json.gz',
        method='GET',
        storage_location='day_utc={{ ds }}/{{ execution_date.strftime("%Y-%m-%d-%-H") }}.json.gz',
        headers={'Accept-Encoding': 'deflate'},
    )

    unzip = BashOperator(
def check_http(response):
    content = response.text
    if len(content) > 0:
        log.info('the server responded with HTTP content - %s' % content)
        return True
    log.info('the server did not respond')
    return False

sensor = HttpSensor(
    task_id='http_sensor',
    # 'http_default' goes to https://google.com
    http_conn_id='http_default',
    endpoint='',
    request_params={},
    response_check=check_http,
    poke_interval=5,
    dag=dag,
)

handler = PythonOperator(
    task_id='python_operator',
    provide_context=True,
    python_callable=handler,
    dag=dag
)

sensor >> handler
# we use the "with" python keyword to ensure that the DAG object is well closed once we are done using it with DAG(dag_id="forex_data_pipeline", schedule_interval="@daily", default_args=default_args, # to prevent from running past dag runs, catchup = False catchup=False) as dag: # check if the rates at the link are available using the httpsensor is_forex_rates_available = HttpSensor( task_id = "is_forex_rates_available", method="GET", # for conn_id you need to put the link of the connection that you are connecting to # we use the name forex_api because we will create a connection in airflow that is called forex_api # the connection we refer to here refers to the connections available in http: // localhost:8080 / admin / connection / http_conn_id = "forex_api", endpoint = "latest", # we have to give a lambda function that returns true when you get a response from the http sensor # this lambda function basically returns True if the field rates is available/returned in the response.text response_check=lambda response: "rates" in response.text, # the http sensor should send an http request every 5 seconds poke_interval = 5, # for at most 20 seconds before it times out timeout = 20 ) # check if the currencies pair file is available in the directory is_forex_currencies_file_available = FileSensor( task_id="is_forex_currencies_file_available", fs_conn_id="forex_path", filepath="forex_currencies.csv", poke_interval=5, timeout=20
def throw_task(dag, init, code_path, name='', debug=False):
    if name:
        name = '-' + name
    with open(code_path, 'r') as f:
        code = f.read()

    spark_session = SparkLivykHook(
        http_conn_id=CONNECTION,
        task_id='start-session' + name,
        data=json.dumps({'kind': 'spark'}),
        headers={'Content-Type': 'application/json'},
        endpoint='sessions',
        dag=dag,
    )

    sensor = HttpSensor(
        task_id='wait_spark_ready' + name,
        http_conn_id=CONNECTION,
        endpoint="{{'/sessions/'+ti.xcom_pull(task_ids='start-session" + name + "')+'/state'}}",
        request_params={},
        response_check=lambda response: response.json()['state'] == 'idle',
        poke_interval=5,
        dag=dag,
    )

    code = SparkLivykHook(
        http_conn_id=CONNECTION,
        task_id='send-task' + name,
        data=json.dumps({'code': code}),
        headers={'Content-Type': 'application/json'},
        endpoint="{{'/sessions/'+ti.xcom_pull(task_ids='start-session" + name + "')+'/statements'}}",
        dag=dag,
    )

    end_task = HttpSensor(
        task_id='end-task' + name,
        http_conn_id=CONNECTION,
        endpoint="{{'/sessions/'+ti.xcom_pull(task_ids='start-session" + name + "')+'/statements/'+ti.xcom_pull(task_ids='send-task" + name + "')}}",
        request_params={},
        response_check=statment_status,
        poke_interval=5,
        dag=dag,
    )

    if not debug:
        close_task = SimpleHttpOperator(
            method='DELETE',
            task_id='close-task' + name,
            http_conn_id=CONNECTION,
            endpoint="{{'/sessions/'+ti.xcom_pull(task_ids='start-session" + name + "')}}",
            dag=dag,
        )
        init >> spark_session >> sensor >> code >> end_task >> close_task
        return close_task
    else:
        init >> spark_session >> sensor >> code >> end_task
        return end_task
from airflow.sensors.http_sensor import HttpSensor

# Python libraries
from datetime import datetime

# Initiate our dag definition
dag = DAG(dag_id="nyc_collisions_pipeline",
          start_date=datetime(2020, 12, 19),
          schedule_interval="@daily")

# Check if collisions api has data for a given date
is_collisions_api_available = HttpSensor(
    task_id="is_collisions_api_available",
    method="GET",
    http_conn_id="nyc_collisions_api",
    endpoint="resource/h9gi-nx95.json?crash_date={{ ds }}",
    response_check=lambda response: "crash_date" in response.text,
    poke_interval=5,
    timeout=20,
    dag=dag)

# Download collisions data for a given date
fetch_collisions_data = BashOperator(
    task_id="fetch_collisions_data",
    bash_command="curl -o /usr/local/airflow/data/{{ ds }}.json \
        --request GET \
        --url https://data.cityofnewyork.us/resource/h9gi-nx95.json?crash_date={{ ds }}",
    dag=dag)

# Define the dependencies
is_collisions_api_available >> fetch_collisions_data
    'provide_context': True
}

dag = DAG(
    dag_id='github_commits_loader',
    default_args=default_args,
    schedule_interval=timedelta(days=1),
    max_active_runs=1,
    catchup=False,
)

check_commits = HttpSensor(
    task_id='pull_commits',
    http_conn_id='',
    headers={'Accept': 'application/vnd.github.cloak-preview'},
    method='GET',
    endpoint=f'https://api.github.com/search/commits?q=committer:{GITHUB_USERNAME}&sort=committer-date',
    response_check=lambda response: response.json()['total_count'] > 0,
    dag=dag,
)

load_github_commits = HttpToGcsOperator(
    task_id='load_github_commits',
    http_conn_id='',
    headers={'Accept': 'application/vnd.github.cloak-preview'},
    method='GET',
    endpoint=f'https://api.github.com/search/commits?q=committer:{GITHUB_USERNAME}&sort=committer-date',
    bucket=GOOGLE_STORAGE_BUCKET,
    filename=OUTPUT_FILENAME,
    dag=dag,
""" Filter changes Return a list of items that matched by criteria ``entity`` and ``action`` """ for task in tasks: _entity = get_entity(task["id"]) if _entity == entity and task.get("task") == action: yield task http_kernel_check = HttpSensor( task_id='http_kernel_check', http_conn_id='kernel_conn', endpoint='/changes', request_params={}, poke_interval=5, dag=dag, ) read_changes_task = ShortCircuitOperator( task_id="read_changes_task", provide_context=True, python_callable=read_changes, dag=dag, ) def transform_journal(data): metadata = data["metadata"]
    @apply_defaults
    def __init__(self, name, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.name = name

    def execute(self, context):
        message = "Hello {}".format(self.name)
        print(message)
        return message

task1 = BashOperator(task_id='t1', bash_command='eacho hello', dag=dag)
task2 = BashOperator(task_id='t2', bash_command='eacho t2', dag=dag)
task3 = BashOperator(task_id='t3', bash_command='eacho t3', dag=dag, trigger_rule='all_failed')
task4 = MyOperator(name='Akshay', task_id='t4', dag=dag, trigger_rule='one_success')

# Create connection id in UI.
sensor = HttpSensor(task_id='sensor', endpoint='/', http_conn_id='my_httpcon', dag=dag,
                    retries=20, retry_delay=timedelta(seconds=10))

sensor >> task1 >> [task2, task3] >> task4
        outdata['rates'][pair] = indata['rates'][pair]

    with open('/usr/local/airflow/dags/files/forex_rates.json', 'a') as outfile:
        json.dump(outdata, outfile)
        outfile.write('\n')

with DAG(dag_id="forex_data_pipeline", schedule_interval="@daily",
         default_args=default_args, catchup=False) as dag:

    is_forex_rates_available = HttpSensor(
        task_id="is_forex_rates_available",
        method='GET',
        http_conn_id='forex_api',
        endpoint='latest',
        response_check=lambda response: "rates" in response.text,
        poke_interval=5,
        timeout=20,
    )

    is_forex_file_available = FileSensor(
        task_id="is_forex_file_available",
        fs_conn_id='forex_path',
        filepath="forex_currencies.csv",
        poke_interval=5,
        timeout=20,
    )

    downloading_rates = PythonOperator(
        task_id='forex_downloading_rates',
ac = AirflowController()
ac.createHttpConnection("securethebox", "https://securethebox.us")

def t2_error_task(**context):
    instance = context['task_instance']
    print("Failed...", instance)

http_sensor = HttpSensor(
    task_id='http_sensor_task',
    http_conn_id='securethebox',
    endpoint='',
    method='GET',
    request_params=None,
    headers=None,
    response_check=None,
    extra_options=None,
    poke_interval=1,  # check the site every second
    timeout=30,  # give up after 30 seconds
    on_failure_callback=t2_error_task,
    dag=dag)

def printMessage(**context):
    xcomdata = context['task_instance'].xcom_pull(task_ids='http_sensor_task')
    print("print", xcomdata)

print_message = PythonOperator(task_id='print_message', python_callable=printMessage,
t3 = SimpleHttpOperator(
    task_id='put_op',
    method='PUT',
    endpoint='api/v1.0/nodes',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    dag=dag)

t4 = SimpleHttpOperator(
    task_id='del_op',
    method='DELETE',
    endpoint='api/v1.0/nodes',
    data="some=data",
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    dag=dag)

sensor = HttpSensor(
    task_id='http_sensor_check',
    http_conn_id='http_default',
    endpoint='',
    request_params={},
    # response.text (not response.content) so the substring check works on str rather than bytes
    response_check=lambda response: "Google" in response.text,
    poke_interval=5,
    dag=dag)

t1.set_upstream(sensor)
t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
from airflow.operators.hive_operator import HiveOperator
from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator
from airflow.operators.email_operator import EmailOperator
from airflow.operators.slack_operator import SlackAPIPostOperator

import json
import csv
import requests

default_args = {
    "owner": 'test',
    "start_date": datetime(2021, 1, 1),
    "depends_on_past": False,
    "email_on_failure": False,
    "email": "*****@*****.**",
    "retries": 1,
    "retry_delay": timedelta(minutes=5)
}

with DAG(dag_id="test", schedule_interval="@hourly",
         default_args=default_args, catchup=True) as dag:

    collect_data = HttpSensor(
        task_id="collect_data",
        method="GET",
        http_conn_id="data_api",
        endpoint="latest",
        response_check=lambda response: "rates" in response.text,
        poke_interval=5,
        timeout=20)
""" Filter changes Return a list of items that matched by criteria ``entity`` and ``action`` """ for task in tasks: _entity = get_entity(task["id"]) if _entity == entity and task.get("task") == action: yield task http_kernel_check = HttpSensor( task_id="http_kernel_check", http_conn_id="kernel_conn", endpoint="/changes", request_params={}, poke_interval=5, dag=dag, ) read_changes_task = ShortCircuitOperator( task_id="read_changes_task", provide_context=True, python_callable=read_changes, dag=dag, ) def JournalFactory(data): """Produz instância de `models.Journal` a partir dos dados retornados do endpoint `/journals/:journal_id` do Kernel.
        outdata['rates'][pair] = indata['rates'][pair]

    with open('/usr/local/airflow/dags/files/forex_rates.json', 'a') as outfile:
        json.dump(outdata, outfile)
        outfile.write('\n')

with DAG(dag_id="forex_data_pipeline", schedule_interval="@daily",
         default_args=default_args, catchup=False) as dag:

    is_forex_rates_available = HttpSensor(
        task_id="is_forex_rates_available",
        method="GET",
        http_conn_id="forex_api",
        endpoint="latest",
        response_check=lambda response: "rates" in response.text,
        poke_interval=5,
        timeout=20)

    is_forex_currencies_file_available = FileSensor(
        task_id="is_forex_currencies_file_available",
        fs_conn_id="forex_path",
        filepath="forex_currencies.csv",
        poke_interval=5,
        timeout=20)

    downloading_rates = PythonOperator(
        task_id="downloading_rates",
        python_callable=download_rates)

    saving_rates = BashOperator(
        task_id="saving_rates",
t3 = SimpleHttpOperator(
    task_id='put_op',
    method='PUT',
    endpoint='put',
    data=json.dumps({"priority": 5}),
    headers={"Content-Type": "application/json"},
    dag=dag,
)

t4 = SimpleHttpOperator(
    task_id='del_op',
    method='DELETE',
    endpoint='delete',
    data="some=data",
    headers={"Content-Type": "application/x-www-form-urlencoded"},
    dag=dag,
)

sensor = HttpSensor(
    task_id='http_sensor_check',
    http_conn_id='http_default',
    endpoint='',
    request_params={},
    response_check=lambda response: "httpbin" in response.text,
    poke_interval=5,
    dag=dag,
)

sensor >> t1 >> t2 >> t3 >> t4 >> t5
def test_poke_context(self, mock_session_send):
    """
    test provide_context
    """
    response = requests.Response()
    response.status_code = 200
    mock_session_send.return_value = response

    def resp_check(resp, **context):
        if context:
            if "execution_date" in context:
                if context["execution_date"] == DEFAULT_DATE:
                    return True
        raise AirflowException('AirflowException raised here!')

    task = HttpSensor(
        task_id='http_sensor_poke_exception',
        http_conn_id='http_default',
        endpoint='',
        request_params={},
        response_check=resp_check,
        provide_context=True,
        timeout=5,
        poke_interval=1,
        dag=self.dag)

    task_instance = TaskInstance(task=task, execution_date=DEFAULT_DATE)
    task.execute(task_instance.get_template_context())