def setUp(self):
    self.key = "test_dag_id"
    task = DummyOperator(
        task_id='dummy',
        dag=models.DAG(dag_id=self.key, default_args={'start_date': days_ago(2)}),
        owner='airflow',
    )
    d = days_ago(1)
    with create_session() as session:
        session.add(DM(dag_id=self.key))
        session.add(DR(dag_id=self.key))
        session.add(TI(task=task, execution_date=d, state=State.SUCCESS))
        # flush to ensure the task instance is written before the
        # task reschedule because of the FK constraint
        session.flush()
        session.add(LOG(dag_id=self.key, task_id=None, task_instance=None,
                        execution_date=d, event="varimport"))
        session.add(TF(task=task, execution_date=d, start_date=d, end_date=d))
        session.add(TR(task=task, execution_date=d, start_date=d, end_date=d,
                       try_number=1, reschedule_date=d))
def setUpClass(cls):
    dagbag = models.DagBag(include_examples=True)
    cls.dag1 = dagbag.dags['example_bash_operator']
    cls.dag1.sync_to_db()
    cls.dag2 = dagbag.dags['example_subdag_operator']
    cls.dag2.sync_to_db()
    cls.execution_dates = [days_ago(2), days_ago(1)]
def setUp(self):
    self.dagbag = models.DagBag(include_examples=True)
    self.dag1 = self.dagbag.dags['example_bash_operator']
    self.dag2 = self.dagbag.dags['example_subdag_operator']
    self.execution_dates = [days_ago(2), days_ago(1), days_ago(0)]
    self.session = Session()
def setUp(self):
    self.session = settings.Session()
    self.key = "test_dag_id"
    task = DummyOperator(
        task_id='dummy',
        dag=models.DAG(dag_id=self.key, default_args={'start_date': days_ago(2)}),
        owner='airflow',
    )
    self.session.add(DM(dag_id=self.key))
    self.session.add(DR(dag_id=self.key))
    self.session.add(TI(task=task, execution_date=days_ago(1), state=State.SUCCESS))
    self.session.add(LOG(dag_id=self.key, task_id=None, task_instance=None,
                         execution_date=days_ago(1), event="varimport"))
    self.session.commit()
def setUp(self):
    self.dagbag = models.DagBag(include_examples=True)
    self.dag1 = self.dagbag.dags['example_bash_operator']
    self.dag2 = self.dagbag.dags['example_subdag_operator']
    self.execution_dates = [days_ago(2), days_ago(1)]

    drs = _create_dagruns(self.dag1, self.execution_dates,
                          state=State.RUNNING,
                          run_id_template="scheduled__{}")
    for dr in drs:
        dr.dag = self.dag1
        dr.verify_integrity()

    drs = _create_dagruns(self.dag2,
                          [self.dag2.default_args['start_date']],
                          state=State.RUNNING,
                          run_id_template="scheduled__{}")
    for dr in drs:
        dr.dag = self.dag2
        dr.verify_integrity()
def test_days_ago(self):
    today = pendulum.today()
    today_midnight = pendulum.instance(datetime.fromordinal(today.date().toordinal()))

    self.assertTrue(dates.days_ago(0) == today_midnight)
    self.assertTrue(dates.days_ago(100) == today_midnight + timedelta(days=-100))

    self.assertTrue(dates.days_ago(0, hour=3) == today_midnight + timedelta(hours=3))
    self.assertTrue(dates.days_ago(0, minute=3) == today_midnight + timedelta(minutes=3))
    self.assertTrue(dates.days_ago(0, second=3) == today_midnight + timedelta(seconds=3))
    self.assertTrue(dates.days_ago(0, microsecond=3) == today_midnight + timedelta(microseconds=3))
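# Note: the assertions above pin down what days_ago() computes -- midnight n days
# before today, shifted by the optional hour/minute/second/microsecond arguments.
# A minimal sketch consistent with that contract (an illustration only, not the
# exact Airflow source, which also routes through airflow.utils.timezone):
from datetime import timedelta

import pendulum


def days_ago_sketch(n, hour=0, minute=0, second=0, microsecond=0):
    """Midnight n days ago, plus the given time-of-day offsets."""
    midnight_today = pendulum.today()  # today at 00:00 in the default timezone
    shifted = midnight_today.add(hours=hour, minutes=minute,
                                 seconds=second, microseconds=microsecond)
    return shifted - timedelta(days=n)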
# Example dataset
DATASET = {
    "display_name": "test_video_dataset",
    "video_classification_dataset_metadata": {},
}

IMPORT_INPUT_CONFIG = {"gcs_source": {"input_uris": [GCP_AUTOML_VIDEO_BUCKET]}}

extract_object_id = CloudAutoMLHook.extract_object_id

# Example DAG for AutoML Video Intelligence Classification
with models.DAG(
    "example_automl_video",
    schedule_interval=None,  # Override to match your needs
    start_date=days_ago(1),
    user_defined_macros={"extract_object_id": extract_object_id},
    tags=['example'],
) as example_dag:
    create_dataset_task = AutoMLCreateDatasetOperator(
        task_id="create_dataset_task",
        dataset=DATASET,
        location=GCP_AUTOML_LOCATION,
    )

    dataset_id = create_dataset_task.output["dataset_id"]

    import_dataset_task = AutoMLImportDataOperator(
        task_id="import_dataset_task",
        dataset_id=dataset_id,
        location=GCP_AUTOML_LOCATION,
        input_config=IMPORT_INPUT_CONFIG,
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

dag = DAG(
    'blog_example',
    default_args=default_args,
    description='Example dag for airflow blog',
    schedule_interval="0 10 * * *",
    start_date=days_ago(2),
    tags=['example'],
)

p1 = PythonOperator(
    task_id='spider',
    python_callable=spider,
    dag=dag,
)

p2 = PythonOperator(
    task_id='read_db',
    python_callable=read_db,
    dag=dag,
)
from __future__ import print_function

import xarray as xa
import codecs, pickle, time
from builtins import range
from pprint import pprint

import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='ILTest-xarray1',
    default_args=args,
    schedule_interval=None,
)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'


op_print_context = PythonOperator(
    task_id='print_the_context',
    failed_alert = MessageOperator(
        task_id="failed_alert",
        http_conn_id="slack",
        webhook_token=slack_webhook_token,
        message=f"{cleandoc(message)}\n\n{formatted_exception}",
        username="******",
    )
    return failed_alert.execute(context=context)


# set the local time zone, so the start_date DAG param can use it in its context
# as stated in the Airflow docs, pendulum must be used to set the timezone
amsterdam = pendulum.timezone("Europe/Amsterdam")

# set start_date to 'yesterday', and get the year, month and day as separate integer values
start_date_dag = str(days_ago(1))
YYYY = 0
MM = 0
DD = 0

# extract the YYYY, MM and DD values as integers
get_YYYY_MM_DD_values = re.search("([0-9]{4})-([0-9]{2})-([0-9]{2})", start_date_dag)
if get_YYYY_MM_DD_values:
    YYYY = int(get_YYYY_MM_DD_values.group(1))
    MM = int(get_YYYY_MM_DD_values.group(2))
    DD = int(get_YYYY_MM_DD_values.group(3))

default_args = {
    "owner": "dataservices",
    "depends_on_past": False,
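# Note: days_ago(1) returns a (timezone-aware) datetime, not a string, so the same
# three integers can also be read off directly without the str()/regex round trip.
# A regex-free sketch of the extraction above, assuming the value is kept as a
# datetime instead of being passed through str():
from airflow.utils.dates import days_ago

yesterday = days_ago(1)  # datetime object for yesterday at midnight
YYYY, MM, DD = yesterday.year, yesterday.month, yesterday.day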
import os
import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'email': os.environ['FAILURE_EMAIL'],
    'start_date': days_ago(0),
    'email_on_failure': True,
}

dag = DAG(dag_id='games', default_args=default_args, schedule_interval="* * * * *")

t1 = BashOperator(
    task_id='sklearn_pipeline',
    bash_command='sudo docker run sklearn_pipeline',
    dag=dag,
)
TODO: Review the workflow, change it according to your environment & enable the code.
"""
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import ShortCircuitOperator
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": days_ago(2),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

dag = DAG(
    "docker_sample_copy_data",
    default_args=default_args,
    schedule_interval=timedelta(minutes=10),
)

locate_file_cmd = """
    sleep 10
    find {{params.source_location}} -type f -printf "%f\n" | head -1
    """
from datetime import timedelta

from airflow import DAG
from airflow.utils.dates import days_ago

from dag_test_examples import t_A, t_B

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": days_ago(2),
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
    "dbnd_config": {"databand": {"env": "gcp"}},
}

with DAG(dag_id="dbnd_dag_at_gcp", default_args=default_args) as dag_remote_fs:
    a = t_A()
    b = t_B(a)

if __name__ == "__main__":
    dag_remote_fs.clear()
    dag_remote_fs.run(start_date=days_ago(0), end_date=days_ago(0))
from airflow.providers.docker.operators.docker import DockerOperator
from airflow.utils.dates import days_ago

DATA_PATH = "/Users/mariapopova/Documents/GitHub/chydlife/airflow_ml_dags/data:/data"

default_args = {
    "owner": "airflow",
    "email": ["*****@*****.**"],
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

with DAG(
    "download-train-validate",
    default_args=default_args,
    schedule_interval="@daily",
    start_date=days_ago(5),
) as dag:
    download = DockerOperator(
        image="airflow-download",
        command="/data/raw/{{ ds }}",
        network_mode="bridge",
        task_id="docker-airflow-download",
        do_xcom_push=False,
        # !!! HOST folder (NOT IN CONTAINER) - replace with yours !!!
        volumes=[DATA_PATH],
    )

    preprocess = DockerOperator(
        image="airflow-preprocess",
        command="--input-dir /data/raw/{{ ds }} --output-dir /data/processed/{{ ds }}",
        task_id="docker-airflow-preprocess",
from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.utils.dates import days_ago

PROJECT_ID = Variable.get("project")
LANDING_BUCKET = Variable.get("landing_bucket")
BACKUP_BUCKET = Variable.get("backup_bucket")

default_arguments = {"owner": "YOUR-NAME-HERE", "start_date": days_ago(1)}


def list_objects(bucket=None):
    hook = GoogleCloudStorageHook()
    storage_objects = hook.list(bucket)
    return storage_objects


def move_objects(source_bucket=None, destination_bucket=None, prefix=None, **kwargs):
    storage_objects = kwargs["ti"].xcom_pull(task_ids="list_files")
    hook = GoogleCloudStorageHook()
    for storage_object in storage_objects:
        destination_object = storage_object
DEFAULT_ARGS = {"owner": "airflow"} class GetRequestOperator(BaseOperator): """Custom operator to sand GET request to provided url""" def __init__(self, *, url: str, **kwargs): super().__init__(**kwargs) self.url = url def execute(self, context): return requests.get(self.url).json() # [START dag_decorator_usage] @dag(default_args=DEFAULT_ARGS, schedule_interval=None, start_date=days_ago(2)) def example_dag_decorator(email: str = '*****@*****.**'): """ DAG to send server IP to email. :param email: Email to send IP to. Defaults to [email protected]. :type email: str """ get_ip = GetRequestOperator(task_id='get_ip', url="http://httpbin.org/get") @task(multiple_outputs=True) def prepare_email(raw_json: Dict[str, Any]) -> Dict[str, str]: external_ip = raw_json['origin'] return { 'subject': f'Server connected from {external_ip}', 'body': f'Seems like today your server executing Airflow is connected from IP {external_ip}<br>',
class DummySkipOperator(DummyOperator):
    """Dummy operator which always skips the task."""

    ui_color = '#e8b7e4'

    def execute(self, context):
        raise AirflowSkipException


def create_test_pipeline(suffix, trigger_rule, dag_):
    """
    Instantiate a number of operators for the given DAG.

    :param str suffix: Suffix to append to the operator task_ids
    :param str trigger_rule: TriggerRule for the join task
    :param DAG dag_: The DAG to run the operators on
    """
    skip_operator = DummySkipOperator(task_id=f'skip_operator_{suffix}', dag=dag_)
    always_true = DummyOperator(task_id=f'always_true_{suffix}', dag=dag_)
    join = DummyOperator(task_id=trigger_rule, dag=dag_, trigger_rule=trigger_rule)
    final = DummyOperator(task_id=f'final_{suffix}', dag=dag_)

    skip_operator >> join
    always_true >> join
    join >> final


dag = DAG(dag_id='example_skip_dag', default_args=args, start_date=days_ago(2), tags=['example'])
create_test_pipeline('1', 'all_success', dag)
create_test_pipeline('2', 'one_success', dag)
def test_lineage_backend(mock_emit, inlets, outlets):
    DEFAULT_DATE = days_ago(2)
    mock_emitter = Mock()
    mock_emit.return_value = mock_emitter

    # Using autospec on xcom_pull and xcom_push methods fails on Python 3.6.
    with mock.patch.dict(
        os.environ,
        {
            "AIRFLOW__LINEAGE__BACKEND": "datahub_provider.lineage.datahub.DatahubLineageBackend",
            "AIRFLOW__LINEAGE__DATAHUB_CONN_ID": datahub_rest_connection_config.conn_id,
            "AIRFLOW__LINEAGE__DATAHUB_KWARGS": json.dumps(
                {"graceful_exceptions": False, "capture_executions": False}
            ),
        },
    ), mock.patch("airflow.models.BaseOperator.xcom_pull"), mock.patch(
        "airflow.models.BaseOperator.xcom_push"
    ), patch_airflow_connection(datahub_rest_connection_config):
        func = mock.Mock()
        func.__name__ = "foo"

        dag = DAG(dag_id="test_lineage_is_sent_to_backend", start_date=DEFAULT_DATE)

        with dag:
            op1 = DummyOperator(
                task_id="task1_upstream",
                inlets=inlets,
                outlets=outlets,
            )
            op2 = DummyOperator(
                task_id="task2",
                inlets=inlets,
                outlets=outlets,
            )
            op1 >> op2

        # Airflow < 2.2 requires the execution_date parameter. Newer Airflow
        # versions do not require it, but will attempt to find the associated
        # run_id in the database if execution_date is provided. As such, we
        # must fake the run_id parameter for newer Airflow versions.
        if AIRFLOW_VERSION < packaging.version.parse("2.2.0"):
            ti = TaskInstance(task=op2, execution_date=DEFAULT_DATE)
        else:
            ti = TaskInstance(task=op2, run_id=f"test_airflow-{DEFAULT_DATE}")
        ctx1 = {
            "dag": dag,
            "task": op2,
            "ti": ti,
            "task_instance": ti,
            "execution_date": DEFAULT_DATE,
            "ts": "2021-04-08T00:54:25.771575+00:00",
        }

        prep = prepare_lineage(func)
        prep(op2, ctx1)
        post = apply_lineage(func)
        post(op2, ctx1)

        # Verify that the inlets and outlets are registered and recognized
        # by Airflow correctly, or that our lineage backend forces it to.
        assert len(op2.inlets) == 1
        assert len(op2.outlets) == 1
        assert all(map(lambda let: isinstance(let, Dataset), op2.inlets))
        assert all(map(lambda let: isinstance(let, Dataset), op2.outlets))

        # Check that the right things were emitted.
        assert mock_emitter.emit.call_count == 9

        # Running further checks based on python version because args only exists in python 3.7+
        if sys.version_info[:3] > (3, 7):
            assert mock_emitter.method_calls[0].args[0].aspectName == "dataFlowInfo"
            assert (
                mock_emitter.method_calls[0].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[1].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[1].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[2].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[2].args[0].entityUrn
                == "urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod)"
            )
            assert mock_emitter.method_calls[3].args[0].aspectName == "dataJobInfo"
            assert (
                mock_emitter.method_calls[3].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInputOutput"
            assert (
                mock_emitter.method_calls[4].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatajobs[0]
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task1_upstream)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.inputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert (
                mock_emitter.method_calls[4].args[0].aspect.outputDatasets[0]
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[5].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[5].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableConsumed,PROD)"
            )
            assert mock_emitter.method_calls[6].args[0].aspectName == "status"
            assert (
                mock_emitter.method_calls[6].args[0].entityUrn
                == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableProduced,PROD)"
            )
            assert mock_emitter.method_calls[7].args[0].aspectName == "ownership"
            assert (
                mock_emitter.method_calls[7].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
            assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags"
            assert (
                mock_emitter.method_calls[8].args[0].entityUrn
                == "urn:li:dataJob:(urn:li:dataFlow:(airflow,test_lineage_is_sent_to_backend,prod),task2)"
            )
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START composer_grouping_airflow_1]
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.utils.dates import days_ago

DAG_NAME = 'all_tasks_in_one_dag'

args = {'owner': 'airflow', 'start_date': days_ago(1), 'schedule_interval': "@once"}

with DAG(dag_id=DAG_NAME, default_args=args) as dag:
    start = DummyOperator(task_id='start')

    task_1 = BashOperator(task_id='op-1', bash_command=':', dag=dag)

    task_2 = BashOperator(
        task_id='op-2',
        bash_command=':',
    # 'wait_for_downstream': False,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

with DAG(
    'wf_aws_driver',
    default_args=default_args,
    description='A simple test that triggers another DAG',
    schedule_interval=None,
    start_date=days_ago(0),
    tags=['test'],
) as dag:
    dag.doc_md = dedent("""\
        Test WF that calls another WF
        """)

    aws_test2 = LivyOperator(
        task_id='aws_test2',
        dag=dag,
        livy_conn_id='livy_default',
        file='s3a://datagram/user/root/deployments/autogenerated_tr_aws_test_2/'
             'ru.neoflex.meta.etl2.spark.aws_test_2-1.0-SNAPSHOT.jar',
        proxy_user='******',
        args=[
GCS_STAGING = os.environ.get('GCP_DATAFLOW_GCS_STAGING', 'gs://test-dataflow-example/staging/')
GCS_OUTPUT = os.environ.get('GCP_DATAFLOW_GCS_OUTPUT', 'gs://test-dataflow-example/output')
GCS_JAR = os.environ.get(
    'GCP_DATAFLOW_JAR', 'gs://test-dataflow-example/word-count-beam-bundled-0.1.jar')
GCS_PYTHON = os.environ.get(
    'GCP_DATAFLOW_PYTHON', 'gs://test-dataflow-example/wordcount_debugging.py')

GCS_JAR_PARTS = urlparse(GCS_JAR)
GCS_JAR_BUCKET_NAME = GCS_JAR_PARTS.netloc
GCS_JAR_OBJECT_NAME = GCS_JAR_PARTS.path[1:]

default_args = {
    "start_date": days_ago(1),
    'dataflow_default_options': {
        'tempLocation': GCS_TMP,
        'stagingLocation': GCS_STAGING,
    },
}

with models.DAG(
    "example_gcp_dataflow_native_java",
    default_args=default_args,
    schedule_interval=None,  # Override to match your needs
    tags=['example'],
) as dag_native_java:
    # [START howto_operator_start_java_job]
    start_java_job = DataflowCreateJavaJobOperator(
def __init__(self, dag, name, image=None,
             # Directories
             operator_out_dir=None, input_operator=None,
             # Airflow
             task_id=None, parallel_id=None, trigger_rule=TriggerRule.ALL_SUCCESS,
             ram_mem_mb=500, ram_mem_mb_lmt=None,
             cpu_millicores=None, cpu_millicores_lmt=None,
             gpu_mem_mb=None, gpu_mem_mb_lmt=None,
             retries=1, retry_delay=timedelta(seconds=60),
             priority_weight=1, execution_timeout=timedelta(minutes=90),
             task_concurrency=None, manage_cache=None,
             # Other stuff
             cmds=None, arguments=None, env_vars=None, image_pull_secrets=None,
             startup_timeout_seconds=120, namespace='flow-jobs',
             image_pull_policy=os.getenv('PULL_POLICY_PODS', 'IfNotPresent'),
             training_operator=False, volume_mounts=None, volumes=None,
             pod_resources=None, enable_proxy=False, host_network=False,
             in_cluster=False, cluster_context=None, labels=None, get_logs=True,
             annotations=None, affinity=None, config_file=None, xcom_push=False,
             node_selectors=None, secrets=None, kind="Pod",
             pool=None, pool_slots=None, api_version="v1",
             *args, **kwargs):

    KaapanaBaseOperator.set_defaults(
        self, name=name, task_id=task_id, operator_out_dir=operator_out_dir,
        input_operator=input_operator, parallel_id=parallel_id,
        trigger_rule=trigger_rule, pool=pool, pool_slots=pool_slots,
        ram_mem_mb=ram_mem_mb, ram_mem_mb_lmt=ram_mem_mb_lmt,
        cpu_millicores=cpu_millicores, cpu_millicores_lmt=cpu_millicores_lmt,
        gpu_mem_mb=gpu_mem_mb, gpu_mem_mb_lmt=gpu_mem_mb_lmt,
        manage_cache=manage_cache)

    # Airflow
    self.retries = retries
    self.priority_weight = priority_weight
    self.execution_timeout = execution_timeout
    self.task_concurrency = task_concurrency
    self.retry_delay = retry_delay
    self.training_operator = training_operator

    # Kubernetes
    self.image = image
    self.env_vars = env_vars or {}
    self.namespace = namespace
    self.cmds = cmds or []
    self.arguments = arguments or []
    self.labels = labels or {}
    self.startup_timeout_seconds = startup_timeout_seconds
    self.volume_mounts = volume_mounts or []
    self.volumes = volumes or []
    self.image_pull_secrets = image_pull_secrets or []
    self.in_cluster = in_cluster
    self.cluster_context = cluster_context
    self.get_logs = get_logs
    self.image_pull_policy = image_pull_policy
    self.node_selectors = node_selectors or {}
    self.annotations = annotations or {}
    self.affinity = affinity or {}
    self.xcom_push = xcom_push
    self.pod_resources = pod_resources or None
    self.config_file = config_file
    self.api_version = api_version
    self.secrets = secrets
    self.kind = kind
    self.data_dir = os.getenv('DATADIR', "")
    self.result_message = None
    self.host_network = host_network
    self.enable_proxy = enable_proxy

    self.volume_mounts.append(
        VolumeMount('dcmdata', mount_path='/data', sub_path=None, read_only=False))
    volume_config = {
        'hostPath': {
            'type': 'DirectoryOrCreate',
            'path': self.data_dir
        }
    }
    self.volumes.append(Volume(name='dcmdata', configs=volume_config))

    if self.training_operator:
        self.volume_mounts.append(
            VolumeMount('tensorboard', mount_path='/tensorboard', sub_path=None, read_only=False))
        tb_config = {
            'hostPath': {
                'type': 'DirectoryOrCreate',
                'path': os.path.join(self.data_dir, "tensorboard")
            }
        }
        self.volumes.append(Volume(name='tensorboard', configs=tb_config))

    if self.pod_resources is None:
        pod_resources = PodResources(
            request_cpu="{}m".format(self.cpu_millicores) if self.cpu_millicores is not None else None,
            limit_cpu="{}m".format(self.cpu_millicores + 100) if self.cpu_millicores is not None else None,
            request_memory="{}Mi".format(self.ram_mem_mb),
            limit_memory="{}Mi".format(
                self.ram_mem_mb_lmt if self.ram_mem_mb_lmt is not None else self.ram_mem_mb + 100),
            limit_gpu=1 if self.gpu_mem_mb is not None else None)
        self.pod_resources = pod_resources

    envs = {
        "WORKFLOW_DIR": str(WORKFLOW_DIR),
        "BATCH_NAME": str(BATCH_NAME),
        "OPERATOR_OUT_DIR": str(self.operator_out_dir),
        "OPERATOR_IN_DIR": str(self.operator_in_dir),
        "BATCHES_INPUT_DIR": "/{}/{}".format(WORKFLOW_DIR, BATCH_NAME)
    }
    if http_proxy is not None and http_proxy != "" and self.enable_proxy:
        envs.update({
            "http_proxy": http_proxy,
            "https_proxy": http_proxy,
            "HTTP_PROXY": http_proxy,
            "HTTPS_PROXY": http_proxy,
        })
    envs.update(self.env_vars)
    self.env_vars = envs

    super().__init__(
        dag=dag,
        task_id=self.task_id,
        retries=self.retries,
        priority_weight=self.priority_weight,
        execution_timeout=self.execution_timeout,
        task_concurrency=self.task_concurrency,
        pool=self.pool,
        pool_slots=self.pool_slots,
        retry_delay=self.retry_delay,
        email=None,
        email_on_retry=True,
        email_on_failure=True,
        start_date=days_ago(0),
        depends_on_past=False,
        wait_for_downstream=False,
        trigger_rule=self.trigger_rule,
        on_failure_callback=KaapanaBaseOperator.on_failure,
        on_success_callback=KaapanaBaseOperator.on_success,
        on_retry_callback=KaapanaBaseOperator.on_retry,
        on_execute_callback=KaapanaBaseOperator.on_execute,
        executor_config=self.executor_config,
        *args, **kwargs)
from itertools import cycle
from functools import partial
from datetime import timedelta, datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

from nb_runner import cycle_exp, cycle_mutate, \
    cycle_crossover, cycle_combine, cycle_all, \
    bo_exp, bo_all
from config import cfg

d = days_ago(1)  # + timedelta(hours=10, minutes=31)

default_args = {
    'owner': cfg.OWNER,
    'depends_on_past': False,
    #'start_date': d,
    'email': False,
    'email_on_failure': False,
    'email_on_retry': False,
    #'retries': 0,  # overridden in the PythonOperator down below
    #'retry_delay': timedelta(minutes=5),  # overridden in the PythonOperator down below
}

default_pool = cfg.DAG.DEF_POOL
#schedule_interval = cfg.DAG.SCHED_INTERVAL if cfg.DAG.SCHED_INTERVAL else None  #@daily
description = cfg.DAG.DESC + '\n' + json.dumps(cfg, indent=4)
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from datetime import timedelta

default_args = {
    'owner': 'airflow',  # Run as the airflow user
    'depends_on_past': False,
    'start_date': days_ago(2),  # Start immediately
    'email': ['*****@*****.**'],  # Email address to send the report to on error.
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=10),
}

dag = DAG(
    dag_id='aadownloaddata',
    default_args=default_args,
    description='descargadedatos',
    dagrun_timeout=timedelta(minutes=2),
    schedule_interval=timedelta(days=1),
)

CreateDir = BashOperator(
    task_id='create_dir',
    depends_on_past=False,
    bash_command='mkdir -p /tmp/airflow/p2/',
    dag=dag,
)
from airflow.operators.python_operator import PythonOperator
from airflow.settings import Session
from airflow.utils import timezone
from airflow.utils.dates import days_ago, infer_time_unit, round_time, scale_time_units
from airflow.utils.state import State
from airflow.utils.timezone import datetime
from tests.test_utils.config import conf_vars

DEV_NULL = '/dev/null'
TEST_DAG_FOLDER = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dags')
DEFAULT_DATE = datetime(2015, 1, 1)
DEFAULT_DATE_ISO = DEFAULT_DATE.isoformat()
DEFAULT_DATE_DS = DEFAULT_DATE_ISO[:10]
TEST_DAG_ID = 'unit_tests'
EXAMPLE_DAG_DEFAULT_DATE = days_ago(2)


class OperatorSubclass(BaseOperator):
    """
    An operator to test template substitution
    """

    template_fields = ['some_templated_field']

    def __init__(self, some_templated_field, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.some_templated_field = some_templated_field

    def execute(self, context):
        pass
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
from airflow.utils.dates import days_ago
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.models import DAG

log = LoggingMixin().log

try:
    # Kubernetes is optional, so not available in vanilla Airflow
    # pip install apache-airflow[kubernetes]
    from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator

    args = {'owner': 'airflow', 'start_date': days_ago(2)}

    dag = DAG(dag_id='example_kubernetes_operator', default_args=args, schedule_interval=None)

    tolerations = [{'key': "key", 'operator': 'Equal', 'value': 'value'}]

    k = KubernetesPodOperator(
        namespace='default',
        image="ubuntu:16.04",
        cmds=["bash", "-cx"],
        arguments=["echo", "10"],
        labels={"foo": "bar"},
        name="airflow-test-pod",
        in_cluster=False,
        task_id="task",
GCF_ENTRYPOINT = os.environ.get('GCF_ENTRYPOINT', 'helloWorld')
GCF_RUNTIME = 'nodejs6'
GCP_VALIDATE_BODY = os.environ.get('GCP_VALIDATE_BODY', True)
# [END howto_operator_gcf_deploy_variables]

# [START howto_operator_gcf_deploy_body]
body = {
    "name": FUNCTION_NAME,
    "entryPoint": GCF_ENTRYPOINT,
    "runtime": GCF_RUNTIME,
    "httpsTrigger": {}
}
# [END howto_operator_gcf_deploy_body]

# [START howto_operator_gcf_default_args]
default_args = {'start_date': dates.days_ago(1)}
# [END howto_operator_gcf_default_args]

# [START howto_operator_gcf_deploy_variants]
if GCF_SOURCE_ARCHIVE_URL:
    body['sourceArchiveUrl'] = GCF_SOURCE_ARCHIVE_URL
elif GCF_SOURCE_REPOSITORY:
    body['sourceRepository'] = {'url': GCF_SOURCE_REPOSITORY}
elif GCF_ZIP_PATH:
    body['sourceUploadUrl'] = ''
    default_args['zip_path'] = GCF_ZIP_PATH
elif GCF_SOURCE_UPLOAD_URL:
    body['sourceUploadUrl'] = GCF_SOURCE_UPLOAD_URL
else:
    raise Exception("Please provide one of the source_code parameters")
# [END howto_operator_gcf_deploy_variants]
import sys

sys.path.insert(0, '..')

# from function.make_pdf import pdf_main
# from function.first import main


def tt1(param, **kwargs):
    print('tt1', param)
    # main()


def tt2(param, **kwargs):
    print('tt2', param)


args = {'owner': 'geonho', 'start_date': days_ago(n=1)}

dag = DAG(dag_id='test_20210422', default_args=args, schedule_interval='@daily')

d1 = PythonOperator(task_id='task1', provide_context=True, python_callable=tt1,
                    op_kwargs={'param': 'apple'}, dag=dag)
d2 = PythonOperator(task_id='task2', provide_context=True, python_callable=tt2,
                    op_kwargs={'param': 'apple'}, dag=dag)
""" from os import getenv from airflow import DAG from airflow.providers.amazon.aws.operators.imap_attachment_to_s3 import ImapAttachmentToS3Operator from airflow.utils.dates import days_ago # [START howto_operator_imap_attachment_to_s3_env_variables] IMAP_ATTACHMENT_NAME = getenv("IMAP_ATTACHMENT_NAME", "test.txt") IMAP_MAIL_FOLDER = getenv("IMAP_MAIL_FOLDER", "INBOX") IMAP_MAIL_FILTER = getenv("IMAP_MAIL_FILTER", "All") S3_DESTINATION_KEY = getenv("S3_DESTINATION_KEY", "s3://bucket/key.json") # [END howto_operator_imap_attachment_to_s3_env_variables] default_args = {"start_date": days_ago(1)} with DAG(dag_id="example_imap_attachment_to_s3", default_args=default_args, schedule_interval=None, tags=['example']) as dag: # [START howto_operator_imap_attachment_to_s3_task_1] task_transfer_imap_attachment_to_s3 = ImapAttachmentToS3Operator( imap_attachment_name=IMAP_ATTACHMENT_NAME, s3_key=S3_DESTINATION_KEY, imap_mail_folder=IMAP_MAIL_FOLDER, imap_mail_filter=IMAP_MAIL_FILTER, task_id='transfer_imap_attachment_to_s3', dag=dag) # [END howto_operator_imap_attachment_to_s3_task_1]
"repoSource": { "repoName": GCP_SOURCE_REPOSITORY_NAME, "branchName": "master" } }, "steps": [{ "name": "gcr.io/cloud-builders/docker", "args": ["build", "-t", "gcr.io/$PROJECT_ID/$REPO_NAME", "."], }], "images": ["gcr.io/$PROJECT_ID/$REPO_NAME"], } # [END howto_operator_create_build_from_repo_body] with models.DAG( "example_gcp_cloud_build", default_args=dict(start_date=dates.days_ago(1)), schedule_interval=None, tags=['example'], ) as dag: # [START howto_operator_create_build_from_storage] create_build_from_storage = CloudBuildCreateOperator( task_id="create_build_from_storage", project_id=GCP_PROJECT_ID, body=create_build_from_storage_body) # [END howto_operator_create_build_from_storage] # [START howto_operator_create_build_from_storage_result] create_build_from_storage_result = BashOperator( bash_command= "echo '{{ task_instance.xcom_pull('create_build_from_storage')['images'][0] }}'", task_id="create_build_from_storage_result",
from marquez_airflow import DAG
from airflow.operators.postgres_operator import PostgresOperator
from airflow.operators.sensors import ExternalTaskSensor
from airflow.utils.dates import days_ago

default_args = {
    'owner': 'datascience',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**'],
}

dag = DAG(
    'etl_orders_7_days',
    schedule_interval='@hourly',
    catchup=False,
    default_args=default_args,
    description='Loads newly placed orders weekly.',
)

# Wait for new_food_deliveries DAG to complete
t1 = ExternalTaskSensor(
    task_id='wait_for_new_food_deliveries',
    external_dag_id='new_food_deliveries',
    mode='reschedule',
    dag=dag,
)

# Wait for etl_orders DAG to complete
t2 = ExternalTaskSensor(
    task_id='wait_for_etl_orders',
    external_dag_id='etl_orders',
    mode='reschedule',
    dag=dag,
)
    # task 3
    t3 = PythonOperator(
        task_id='python_write_file',
        depends_on_past=False,
        python_callable=WriteToFile,
        email=['*****@*****.**'],
        email_on_failure=True,
        dag=dag_subdag,
    )
    t3_complete = datetime.now()
    return dag_subdag, t2_complete, t3_complete


dag = DAG(
    'sample_sub_dag',
    default_args=default_args,
    start_date=days_ago(1),  # A start date for the workflow is necessary
    description='A sample workflow',
    schedule_interval=None,
)

# task 1
command1 = """
echo "Time: $(date)" >> /home/karan/Attempt_ApacheAirflow/t1.log
"""
t1 = BashOperator(
    task_id='print_date',
    depends_on_past=False,
    bash_command=command1,
    dag=dag,
)


# task 3 - python function
def WriteToFile():
    },
}  # type: Dict[str, Any]
# [END howto_operator_gcp_transfer_create_job_body_gcp]

# [START howto_operator_gcp_transfer_update_job_body]
update_body = {
    PROJECT_ID: GCP_PROJECT_ID,
    TRANSFER_JOB: {DESCRIPTION: "{}_updated".format(GCP_DESCRIPTION)},
    TRANSFER_JOB_FIELD_MASK: "description",
}
# [END howto_operator_gcp_transfer_update_job_body]

list_filter_dict = {FILTER_PROJECT_ID: GCP_PROJECT_ID, FILTER_JOB_NAMES: []}

# [START howto_operator_gcp_transfer_default_args]
default_args = {'start_date': days_ago(1)}
# [END howto_operator_gcp_transfer_default_args]

with models.DAG(
    'example_gcp_transfer',
    default_args=default_args,
    schedule_interval=None  # Override to match your needs
) as dag:

    # [START howto_operator_gcp_transfer_create_job]
    create_transfer_job_from_aws = GcpTransferServiceJobCreateOperator(
        task_id="create_transfer_job_from_aws", body=aws_to_gcs_transfer_body
    )
    # [END howto_operator_gcp_transfer_create_job]

    wait_for_operation_to_start = GCPTransferServiceWaitForJobStatusSensor(
        task_id="wait_for_operation_to_start",
        job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}",
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator

# Parameters
args = {'owner': 'janilson', 'start_date': days_ago(1)}

# DAG creation
dag = DAG(dag_id="my_simple_dag", default_args=args, schedule_interval=None)


# Python function to be executed by the tasks
def run_this_func(**context):
    print('hi')


with dag:
    # Test task 1
    run_this_task = PythonOperator(
        task_id='run_this',
        python_callable=run_this_func,
        provide_context=True,
    )

    # Test task 2
    run_this_task2 = PythonOperator(
        task_id='run_this2',
        python_callable=run_this_func,
        provide_context=True,
    )
def setUp(self):
    self.dagbag = models.DagBag(include_examples=True)
    self.dag1 = self.dagbag.dags['example_bash_operator']
    self.dag2 = self.dagbag.dags['example_subdag_operator']
    self.execution_dates = [days_ago(2), days_ago(1), days_ago(0)]
]

DB_TABLE_SCHEMA = DbTableSchema(
    schema_name=DB_SCHEMA_NAME,
    table_name=DB_TABLE_NAME,
    columns=DB_TABLE_COLUMNS,
)
NO_DB_TABLE_SCHEMA = []

SQL = f"SELECT * FROM {DB_NAME}.{DB_TABLE_NAME.name};"

DAG_ID = 'email_discounts'
DAG_OWNER = 'datascience'
DAG_DEFAULT_ARGS = {
    'owner': DAG_OWNER,
    'depends_on_past': False,
    'start_date': days_ago(7),
    'email_on_failure': False,
    'email_on_retry': False,
    'email': ['*****@*****.**'],
}
DAG_DESCRIPTION = 'Email discounts to customers that have experienced order delays daily'

DAG = dag = DAG(
    DAG_ID,
    schedule_interval='@weekly',
    default_args=DAG_DEFAULT_ARGS,
    description=DAG_DESCRIPTION,
)

TASK_ID = 'select'
TASK = SnowflakeOperator(
from airflow.utils import dates

project = 'your-project-id'  # Change this to your own GCP project_id
topic = 'example-topic'  # Cloud Pub/Sub topic
subscription = 'subscription-to-example-topic'  # Cloud Pub/Sub subscription

# Sample messages to push/pull
messages = [
    {'data': b64encode(b'Hello World')},
    {'data': b64encode(b'Another message')},
    {'data': b64encode(b'A final message')},
]

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': dates.days_ago(2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'project': project,
    'topic': topic,
    'subscription': subscription,
}

echo_template = '''
{% for m in task_instance.xcom_pull(task_ids='pull-messages') %}
echo "AckID: {{ m.get('ackId') }}, Base64-Encoded: {{ m.get('message') }}"
{% endfor %}
'''
from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.utils.dates import days_ago

with DAG(
    dag_id="dag_referenced_task_dag_id_exists_fail",
    schedule_interval=None,
    start_date=days_ago(1),
) as dag:
    TriggerDagRunOperator(task_id="test_trigger", trigger_dag_id="nonexistent")
    ExternalTaskSensor(task_id="test_sensor_dag", external_dag_id="nonexistent")
    ExternalTaskSensor(
        task_id="test_sensor_task",
        external_dag_id="nonexistent",
        external_task_id="non-task",
    )