def test_default_fs_conn_id(self):
    with tempfile.NamedTemporaryFile() as tmp:
        task = FileSensor(
            task_id="test",
            filepath=tmp.name[1:],
            dag=self.dag,
            timeout=0,
        )
        task._hook = self.hook
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_wildcard_file(self):
    suffix = '.txt'
    with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
        fileglob = os.path.join(os.path.dirname(tmp.name), '*' + suffix)
        task = FileSensor(
            task_id='test',
            filepath=fileglob,
            fs_conn_id='fs_default',
            dag=self.dag,
            timeout=0,
        )
        task._hook = self.hook
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
def test_file_in_dir(self):
    temp_dir = tempfile.mkdtemp()
    task = FileSensor(
        task_id="test",
        filepath=temp_dir[1:],
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
    )
    task._hook = self.hook
    try:
        # `touch` a file inside the dir so the sensor finds a non-empty directory
        open(temp_dir + "/file", "a").close()
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    finally:
        shutil.rmtree(temp_dir)
def test_subdirectory_not_empty(self):
    suffix = '.txt'
    temp_dir = tempfile.mkdtemp()
    subdir = tempfile.mkdtemp(dir=temp_dir)
    with tempfile.NamedTemporaryFile(suffix=suffix, dir=subdir):
        task = FileSensor(
            task_id='test',
            filepath=temp_dir,
            fs_conn_id='fs_default',
            dag=self.dag,
            timeout=0,
        )
        task._hook = self.hook
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    shutil.rmtree(temp_dir)
def test_subdirectory_empty(self):
    temp_dir = tempfile.mkdtemp()
    tempfile.mkdtemp(dir=temp_dir)
    task = FileSensor(
        task_id='test',
        filepath=temp_dir,
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
        poke_interval=1,
    )
    task._hook = self.hook
    with self.assertRaises(AirflowSensorTimeout):
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    shutil.rmtree(temp_dir)
def test_empty_dir(self):
    temp_dir = tempfile.mkdtemp()
    task = FileSensor(
        task_id="test",
        filepath=temp_dir[1:],
        fs_conn_id='fs_default',
        dag=self.dag,
        timeout=0,
        poke_interval=1,
    )
    task._hook = self.hook
    try:
        with self.assertRaises(AirflowSensorTimeout):
            task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
    finally:
        shutil.rmtree(temp_dir)
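# A minimal sketch of the fixtures the test methods above rely on but which are not
# shown here: the test class, self.dag, self.hook and DEFAULT_DATE. The exact names
# and values are assumptions modelled on Airflow's own FileSensor test module.
import datetime
import unittest

from airflow import DAG
from airflow.hooks.filesystem import FSHook

DEFAULT_DATE = datetime.datetime(2015, 1, 1)  # assumed execution date used by task.run()


class TestFileSensor(unittest.TestCase):
    def setUp(self):
        # FSHook() uses the default "fs_default" connection
        self.hook = FSHook()
        self.dag = DAG(
            "unit_test_dag",
            default_args={"owner": "airflow", "start_date": DEFAULT_DATE},
        )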
def get_file_sensor(path: str, filename: str) -> FileSensor:
    label, _ = filename.split(".")
    sensor = FileSensor(
        task_id=f"waiting-{label}",
        poke_interval=10,
        retries=100,
        filepath=os.path.join(path, filename),
    )
    return sensor
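# A hypothetical usage sketch for get_file_sensor() above; the DAG id, directory and
# file names are illustrative assumptions, not from the original source.
from airflow import DAG
from airflow.utils.dates import days_ago

with DAG(dag_id="wait_for_inputs",
         start_date=days_ago(1),
         schedule_interval="@daily") as dag:
    # one sensor per expected input file; task_ids become waiting-orders / waiting-customers
    sensors = [
        get_file_sensor("/data/incoming", name)
        for name in ("orders.csv", "customers.csv")
    ]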
def uploads3(filename: str, key: str, bucket_name: str) -> None:
    hook = S3Hook('s3_conn')
    for f in walk_directory(filename):
        hook.load_file(filename=f, key=f, bucket_name=bucket_name)
        print(f"File {f} upload complete")


with DAG('pipeline',
         start_date=datetime(2022, 3, 28),
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    is_csv_available = FileSensor(task_id='is_csv_available',
                                  fs_conn_id='path',
                                  filepath="owid-covid-data.csv",
                                  poke_interval=5,
                                  timeout=20)

    push_to_hive = BashOperator(task_id="push_to_hive",
                                bash_command="""
        hdfs dfs -mkdir -p /covidData && \
        hdfs dfs -put -f $AIRFLOW_HOME/dags/files/owid-covid-data.csv /covidData
    """)

    create_hive_table = HiveOperator(task_id='create_hive_table',
                                     hive_cli_conn_id='hive_conn',
                                     hql="""
        CREATE EXTERNAL TABLE IF NOT EXISTS cov_data(
            iso_code STRING,
            continent STRING,
         catchup=False) as dag:

    is_forex_rates_available = HttpSensor(
        task_id='is_forex_rates_available',
        method='GET',
        http_conn_id='forex_api',
        endpoint='latest',
        response_check=lambda response: 'rates' in response.text,
        poke_interval=5,
        timeout=20
    )

    is_forex_currencies_file_available = FileSensor(
        task_id='is_forex_currencies_file_available',
        fs_conn_id='forex_path',
        filepath='forex_currencies.csv',
        poke_interval=5,
        timeout=20
    )

    downloading_rates = PythonOperator(
        task_id='downloading_rates',
        python_callable=_download_rates
    )

    saving_rates = BashOperator(
        task_id='saving_rates',
        bash_command="""
            hdfs dfs -mkdir -p /forex && \
            hdfs dfs -put -f $AIRFLOW_HOME/dags/files/forex_rates.json /forex
        """
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago
from utils import default_args, VOLUME
from airflow.sensors.filesystem import FileSensor

prod_model_path = '{{ var.value.PROD_MODEL_PATH }}'

with DAG(dag_id='_prod3_3_predict',
         default_args=default_args,
         schedule_interval="@daily",
         start_date=days_ago(0, 2)) as dag:

    start = DummyOperator(task_id='start')

    data_sensor = FileSensor(task_id='data_sensor',
                             filepath='data/raw/{{ ds }}/data.csv',
                             poke_interval=10,
                             retries=100)

    model_sensor = FileSensor(task_id='model_sensor',
                              filepath='data/model/{{ ds }}/model.pkl',
                              poke_interval=10,
                              retries=100)

    transformer_sensor = FileSensor(
        task_id='transformer_sensor',
        filepath='data/model/{{ ds }}/transformer.pkl',
        poke_interval=10,
        retries=100)

    prediction = DockerOperator(task_id='prediction',
                                image='prediction',
    with open('/opt/airflow/files/forex_rates.json', 'a') as outfile:
        json.dump(outdata, outfile)
        outfile.write('\n')


with DAG("forex_data_pipeline",
         start_date=datetime(2021, 1, 1),
         schedule_interval="@daily",
         default_args=default_args,
         catchup=False) as dag:

    is_forex_rates_available = HttpSensor(
        task_id="is_forex_rates_available",
        http_conn_id="forex_api",
        endpoint="marclamberti/f45f872dea4dfd3eaa015a4a1af4b39b",
        response_check=lambda response: "rates" in response.text,
        poke_interval=5,
        timeout=20)

    is_forex_currencies_file_available = FileSensor(
        task_id="is_forex_currencies_file_available",
        fs_conn_id="forex_path",
        filepath="forex_currencies.csv",
        poke_interval=5,
        timeout=20)

    downloading_rates = PythonOperator(task_id="downloading_rates",
                                       python_callable=download_rates)

    is_forex_rates_available >> is_forex_currencies_file_available >> downloading_rates
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.sensors.filesystem import FileSensor
from utils import DEFAULT_VOLUME, default_args

with DAG("DAG3_inference",
         default_args=default_args,
         schedule_interval="@daily",
         start_date=days_ago(3)) as dag:

    start_task = DummyOperator(task_id='start-prediction')

    data_await = FileSensor(
        filepath='/opt/airflow/data/raw/{{ ds }}/data.csv',
        task_id="await-data",
        poke_interval=10,
        retries=100,
    )

    model_await = FileSensor(
        filepath='/opt/airflow/{{ var.value.model_dir }}/model.pkl',
        task_id="await-model",
        poke_interval=10,
        retries=100,
    )

    preprocessing = DockerOperator(
        task_id="preprocessing",
        image="airflow-preprocess",
        command="--input-dir data/raw/{{ ds }} "
                "--output-dir data/processed/for_preds/{{ ds }} "
                "--prediction",
        network_mode="bridge",
with DAG(dag_id='download_dag',
         schedule_interval='@daily',
         start_date=days_ago(3),
         catchup=True,
         default_args=default_args) as dag:

    download_data = PythonOperator(task_id='download_data',
                                   python_callable=_download_data
                                   # op_kwargs={'my_param': 42}
                                   )

    check_data = PythonOperator(task_id='check_data',
                                python_callable=_check_data)

    wait_data = FileSensor(task_id='wait_data',
                           fs_conn_id='fs_default',
                           filepath='my_file.txt',
                           poke_interval=30)

    process_data = BashOperator(task_id='process_data',
                                bash_command='exit 1',
                                on_failure_callback=_fail_callback)

    # download_data.set_downstream(wait_data)
    # wait_data.set_downstream(process_data)
    # download_data >> [wait_data, process_data]
    # download_data >> wait_data >> process_data
    # chain(download_data, wait_data, process_data)
default_args = {
    'owner': 'ningeen',
    'start_date': airflow.utils.dates.days_ago(7),
    'email': [Variable.get("gmail_user")],
    'email_on_failure': True,
}

with DAG(
        dag_id="03_prediction",
        schedule_interval="@daily",
        default_args=default_args,
) as dag:
    wait_raw_data = FileSensor(
        task_id="wait_raw_data",
        filepath="./data/raw/{{ ds }}/data.csv",
        poke_interval=30,
        retries=100,
    )
    wait_raw_target = FileSensor(
        task_id="wait_raw_target",
        filepath="./data/raw/{{ ds }}/target.csv",
        poke_interval=30,
        retries=100,
    )
    preprocessor = DockerOperator(
        image="airflow-preprocessor",
        command="/data/raw/{{ ds }}/data.csv /data/raw/{{ ds }}/target.csv",
        network_mode="bridge",
        task_id="docker-airflow-preprocessor",
from airflow.sensors.filesystem import FileSensor

default_args = {
    'retries': 5,
    'retry_delay': timedelta(minutes=5),
}


# let's check if the file myfile.txt is in the folder
def _downloading_data(**kwargs):
    with open('/tmp/myfile.txt', 'w') as f:
        f.write('my_data')


with DAG(
        dag_id='simple_dag',
        schedule_interval="*/10 * * * *",
        start_date=datetime(2021, 1, 1),
        catchup=False,  # disable backfilling
        default_args=default_args,
) as dag:

    downloading_data = PythonOperator(
        task_id='downloading_data',
        python_callable=_downloading_data,
    )

    waiting_data = FileSensor(
        task_id='waiting_data',
        fs_conn_id='con_id',
        filepath='my_file.txt',
        poke_interval=15,
    )
import airflow
from airflow import DAG
from airflow.sensors.filesystem import FileSensor
from airflow.operators.bash import BashOperator
from airflow.providers.docker.operators.docker import DockerOperator

RAW_DATA_PATH = "data/raw/data.csv"
RAW_TARGET_PATH = "data/raw/target.csv"

with DAG(
        dag_id="train_val",
        start_date=airflow.utils.dates.days_ago(21),
        schedule_interval="@weekly",
) as dag:
    wait_for_features = FileSensor(task_id="wait-for-features",
                                   poke_interval=10,
                                   retries=5,
                                   filepath=RAW_DATA_PATH)
    wait_for_target = FileSensor(task_id="wait-for-target",
                                 poke_interval=10,
                                 retries=5,
                                 filepath=RAW_TARGET_PATH)
    preprocess = DockerOperator(
        image="airflow-preprocess",
        task_id="preprocess",
        do_xcom_push=False,
        command='--input-dir /data/raw/ --output-dir /data/processed/{{ ds }}',
        volumes=[
            '/d/katka/MADE/semester2/ML_in_production/airflow/airflow-examples/data:/data'
        ],
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.sensors.filesystem import FileSensor

dag = DAG(dag_id='process_numbers',
          schedule_interval=None,
          default_args={
              'owner': 'jie',
              'start_date': datetime(2021, 7, 1),
          })

file_sensor = FileSensor(
    task_id='file_sensor',
    dag=dag,
    filepath='/data/numbers.json',
    poke_interval=5,
)

stage_file = BashOperator(
    task_id='stage_file',
    dag=dag,
    bash_command="""
        STAGE_FILE=/data/stage/numbers_$(date +%s).json
        mkdir -p /data/stage
        mv /data/numbers.json ${STAGE_FILE}
        echo $STAGE_FILE
    """,
)
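# A hypothetical continuation (not from the original source): the last line echoed by
# the bash script is pushed to XCom, so a downstream task could pull the staged path.
# The task name and callable below are illustrative assumptions.
process_file = PythonOperator(
    task_id='process_file',
    dag=dag,
    python_callable=lambda staged_path: print(f"processing {staged_path}"),
    op_args=["{{ ti.xcom_pull(task_ids='stage_file') }}"],  # op_args is a templated field
)

file_sensor >> stage_file >> process_file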
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from airflow.sensors.filesystem import FileSensor
from airflow.utils.dates import days_ago

default_args = {'owner': 'airflow', 'start_date': days_ago(1), 'retries': 3}

with DAG('dag_sensors',
         default_args=default_args,
         schedule_interval='@daily',
         catchup=False) as dag:

    start = DummyOperator(task_id='start')

    task_print_hello_world = BashOperator(
        task_id='task_print_hello_world',
        bash_command='echo "Hello World!" > /opt/airflow/logs/hello.txt')

    task_remove_hello_world = BashOperator(
        task_id='task_remove_hello_world',
        bash_command='rm /opt/airflow/logs/hello.txt')

    task_waiting_for_data = FileSensor(task_id='task_waiting_for_data',
                                       fs_conn_id='fs_path_default',
                                       filepath='hello.txt',
                                       poke_interval=15)

    end = DummyOperator(task_id='end')

    # create the file and wait for it in parallel, then clean it up
    start >> [task_print_hello_world, task_waiting_for_data] >> task_remove_hello_world >> end
"provide_context": True, } with DAG( "load_dataset_create_api", default_args=default_args, description="A demo pipeline that loads a file from the landing zone and automatically generates an API", schedule_interval=timedelta(days=1), start_date=days_ago(2), tags=["example"], ) as dag: wait_for_file = FileSensor( task_id="wait_for_file", poke_interval=5, fs_conn_id="landing_zone", filepath="*", ) get_filename = PythonOperator( task_id='get_filename', python_callable=get_latest_file, op_kwargs={'filepath': '/usr/local/airflow/data/engineering'}, ) load_data = PythonOperator( task_id="load_data", python_callable=load, ) generate_api = GenerateAPIOperator(
"owner": "airflow", "retries": 1, "retry_delay": timedelta(minutes=5), } PATH = "/home/vadim/MADE/vzavadskyi/data" with DAG( "prepare_model", default_args=default_args, schedule_interval="@weekly", start_date=days_ago(0), ) as dag: check_data_ready = FileSensor(task_id="wait-raw-data", filepath="data/raw/{{ ds }}/data.csv", poke_interval=10, retries=2) check_target_data = FileSensor(task_id="wait-target-data", filepath="data/raw/{{ ds }}/target.csv", poke_interval=10, retries=2) preprocess = DockerOperator( image="airflow-preprocess", command= "--input-dir /data/raw/{{ ds }} --output-dir /data/processed/{{ ds }}", task_id="docker-airflow-preprocess", do_xcom_push=False, volumes=[f"{PATH}:/data"], )
         schedule_interval='*/10 * * * *',
         start_date=datetime(2020, 1, 1),
         catchup=False) as dag:

    creating_table = PostgresOperator(task_id='creating_table',
                                      sql='sql/CREATE_TABLE_ACCURACIES.sql',
                                      postgres_conn_id='postgres')

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    sanity_check = PythonOperator(task_id="sanity_check",
                                  python_callable=check_dataset)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    n_estimators = [100, 150]
    max_features = ['auto', 'sqrt']

    training_model_tasks = []
    for feature in max_features:
        for estimator in n_estimators:
            ml_id = f"{feature}_{estimator}"
            training_model_tasks.append(
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=
with DAG(
        dag_id="daily_pipeline",
        start_date=days_ago(8),
        schedule_interval="@daily",
        default_args=DEFAULT_DAG_ARGS,
) as dag:
    SCHEDULE_DATE = "{{ ds }}"
    INPUT_DATA_DIR = os.path.join(DATA_DIR, "raw", SCHEDULE_DATE)
    PREPROCESSED_DATA_DIR = os.path.join(DATA_DIR, "preprocessed", SCHEDULE_DATE)
    PREDICT_DATA_DIR = os.path.join(DATA_DIR, "predict", SCHEDULE_DATE)
    WEEKLY_MODEL_DIR = os.path.join(PRODUCTION_MODEL_DIR, SCHEDULE_DATE)

    wait_daily_data = FileSensor(task_id="wait_daily_data",
                                 filepath=f"{INPUT_DATA_DIR}/data.csv",
                                 poke_interval=30)

    weekly_train_branch = BranchPythonOperator(
        task_id='weekly_train_branch',
        python_callable=weekly_train_model,
        provide_context=True,
    )

    preprocess_data_train = DockerOperator(
        task_id="preprocess_data_train",
        image="airflow-preprocess",
        command=f"--input-data-dir={INPUT_DATA_DIR} "
                f"--output-data-dir={PREPROCESSED_DATA_DIR} "
                "--mode=train ",
        **DOCKER_CONFIG,
from airflow import DAG
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago
from airflow.sensors.filesystem import FileSensor
from utils import default_args, VOLUME

with DAG(dag_id='_prod3_2_train_model',
         schedule_interval='@weekly',
         start_date=days_ago(0, 2),
         default_args=default_args) as dag:

    data_sensor = FileSensor(task_id='data_sensor',
                             filepath='data/raw/{{ ds }}/data.csv',
                             poke_interval=10,
                             retries=100)

    target_sensor = FileSensor(task_id='target_sensor',
                               filepath='data/raw/{{ ds }}/target.csv',
                               poke_interval=10,
                               retries=100)

    build_features = DockerOperator(task_id='build_features',
                                    image='build_features',
                                    command='/data/raw/{{ ds }}',
                                    network_mode='bridge',
                                    volumes=[VOLUME],
                                    do_xcom_push=False)

    split_data = DockerOperator(task_id='split_data',
                                image='split_data',
                                command='/data/processed/{{ ds }}',
with DAG(
        "02_train_pipeline",
        default_args=DEFAULT_ARGS,
        start_date=START_DATE,
        default_view="graph",
        schedule_interval="@weekly",
) as dag:
    start_pipeline = DummyOperator(task_id='start-pipeline')

    wait_data = FileSensor(
        task_id="wait-for-data",
        filepath=str(os.path.join(DATA_RAW_DIR, "data.csv")),
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    wait_target = FileSensor(
        task_id="wait-for-target",
        filepath=str(os.path.join(DATA_RAW_DIR, "target.csv")),
        timeout=6000,
        poke_interval=10,
        retries=100,
        mode="poke",
    )

    data_preprocessing = DockerOperator(
        image="airflow-process",
from airflow.utils.dates import days_ago
from airflow.sensors.filesystem import FileSensor
from utils import default_args, VOLUME

with DAG(
        "2_train_pipeline",
        default_args=default_args,
        schedule_interval="@weekly",
        start_date=days_ago(5),
) as dag:
    start = DummyOperator(task_id="Begin")

    data_sensor = FileSensor(task_id="Wait_for_data",
                             poke_interval=10,
                             retries=100,
                             filepath="data/raw/{{ ds }}/data.csv")

    target_sensor = FileSensor(task_id="Wait_for_target",
                               poke_interval=10,
                               retries=100,
                               filepath="data/raw/{{ ds }}/target.csv")

    preprocess = DockerOperator(
        task_id="Data_preprocess",
        image="airflow-preprocess",
        command="/data/raw/{{ ds }} /data/processed/{{ ds }} /data/model/{{ ds }}",
        network_mode="bridge",
        do_xcom_push=False,
        volumes=[VOLUME],
from airflow import DAG
from airflow.sensors.filesystem import FileSensor
from airflow.operators.python import PythonOperator
from datetime import datetime

from logsparsing import fetch_logs
from logsparsing import preprocess_logs

default_args = {"start_date": datetime(2020, 1, 1), "owner": "airflow"}

with DAG(dag_id="log_dag",
         schedule_interval="@daily",
         default_args=default_args) as dag:

    waiting_to_logs = FileSensor(task_id="waiting_for_dag_logs",
                                 fs_conn_id="fs_logs",
                                 filepath="logs_dags.py.log",
                                 poke_interval=5)

    fetching_logs = PythonOperator(task_id="fetching_logs",
                                   python_callable=fetch_logs.main)

    error_logs = PythonOperator(task_id="preprocess_logs",
                                python_callable=preprocess_logs.main)
import airflow.utils.dates
from airflow import DAG
from airflow.sensors.filesystem import FileSensor

dag = DAG(
    dag_id="listing_6_01",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="0 16 * * *",
    description="A batch workflow for ingesting supermarket promotions data, demonstrating the FileSensor.",
    default_args={"depends_on_past": True},
)

wait = FileSensor(task_id="wait_for_supermarket_1",
                  filepath="/data/supermarket1/data.csv",
                  dag=dag)
def etl_twitter():
    @task
    def start():
        print("Start!")
        return True

    @task
    def read_data_export_json(retorno):
        with open(base_path + "collected_tweets_2021-03-17-19-19-15.txt", "r") as f:
            tweets = f.readlines()
        for i in range(len(tweets)):
            with open(f"{base_path}tweet_{i}.json", "w") as f:
                json.dump(json.loads(json.loads(tweets[i])), f)
        return len(tweets) - 1

    @task
    def read_json_export_pandas(retorno):
        arquivos = [
            file for file in os.listdir(base_path) if file.startswith("tweet_")
        ]
        print(arquivos)
        for arquivo in arquivos:
            with open(base_path + arquivo) as f:
                tweet = f.readlines()
            parsedtweet = json.loads(tweet[0])
            processado = tweet_para_df(parsedtweet)
            if processado is None:
                pass
            else:
                processado.to_csv(base_path + arquivo[:-4] + "csv",
                                  sep=";",
                                  index=False)
        return True

    @task
    def concatenate_all_csvs(retorno):
        arquivos = [
            file for file in os.listdir(base_path) if file.endswith(".csv")
        ]
        dataframes = [
            pd.read_csv(base_path + arquivo, sep=';') for arquivo in arquivos
        ]
        unico = pd.concat(dataframes, ignore_index=True)
        unico.to_csv(f"{base_path}tweets_dataframe_unico.csv",
                     sep=";",
                     index=False)
        return True

    st = start()
    ntweets = read_data_export_json(st)
    check_file = FileSensor(task_id="check_file",
                            filepath=f"{base_path}tweet_{ntweets}.json",
                            poke_interval=10)
    res = read_json_export_pandas(ntweets)
    unico = concatenate_all_csvs(res)

    ntweets >> check_file >> res
    with open('/tmp/my_files.txt', 'w') as f:
        f.write('my_data')


def _checking_data():
    print("checking data")


with DAG(dag_id='sample_af2_dag',
         default_args=default_args,
         schedule_interval='@daily',
         catchup=False,
         start_date=datetime(2021, 1, 1)) as dag:

    downloading_data = PythonOperator(
        task_id='downloading_data',
        python_callable=_downloading_data
    )

    checking_data = PythonOperator(
        task_id='checking_data',
        python_callable=_checking_data
    )

    waiting_for_data = FileSensor(
        task_id='waiting_for_data',
        fs_conn_id='fs_default',
        filepath='my_files.txt'
    )

    processing_data = BashOperator(
        task_id='processing_data',
        bash_command='exit 0'
    )

    cross_downstream([downloading_data, checking_data],
                     [waiting_for_data, processing_data])
          schedule_interval='0 8 * * 1-5',
          # dagrun_timeout=timedelta(minutes=5),
          start_date=datetime(2019, 1, 1),
          tags=['acoes', 'swing', 'b3'])

tsk_B3 = BashOperator(
    task_id='analisa_papeis',
    bash_command='cd /home/fm/Documents/swing_bbot3/Scripts && /home/fm/miniconda3/envs/b3/bin/python /home/fm/Documents/swing_bbot3/Scripts/app.py',
    dag=dag,
    depends_on_past=False)

mail_results = EmailOperator(
    task_id='send_result',
    to=['*****@*****.**'],
    subject='Análise Gráfica Diária',
    html_content="""
    <h3>Bom dia, humano. Seguem resultados anexos.</h3>
    """,
    files=[
        f"/home/fm/Documents/swing_bbot3/Dados/analise_{datetime.now().strftime('%m.%d.%Y')}.xlsx"
    ],
    dag=dag)

check_results_storage = FileSensor(
    task_id='sense_result',
    filepath=f"/home/fm/Documents/swing_bbot3/Dados/analise_{datetime.now().strftime('%m.%d.%Y')}.xlsx",
    poke_interval=3,
    dag=dag)

tsk_B3 >> check_results_storage >> mail_results