Example #1
    )

    get_package = GetPackageOperator(
        task_id="get_package",
        address=ckan_address,
        apikey=ckan_apikey,
        package_name_or_id=PACKAGE_NAME,
    )

    res_new_or_existing = BranchPythonOperator(
        task_id="res_new_or_existing",
        python_callable=is_resource_new,
    )

    transformed_data = PythonOperator(
        task_id="transform_data",
        python_callable=transform_data,
    )

    create_data_dictionary = PythonOperator(
        task_id="create_data_dictionary",
        python_callable=build_data_dict,
    )

    get_or_create_resource = GetOrCreateResourceOperator(
        task_id="get_or_create_resource",
        address=ckan_address,
        apikey=ckan_apikey,
        package_name_or_id=PACKAGE_NAME,
        resource_name=RESOURCE_NAME,
        resource_attributes=dict(
            format="geojson",
    'owner': 'airflow',
}

def print_cwd(ds, **kwargs):
    """Print the Airflow context and ds variable from the context."""
    print (Path.cwd())
    return str(Path.cwd())


dag = DAG(
    dag_id='tika_bash_operator',
    default_args=args,
    start_date=days_ago(2),
    dagrun_timeout=timedelta(minutes=60),
    tags=['curl_tika'],
    params={"example_key": "example_value"},
)

run_this = BashOperator(
    task_id='run_curl',
    bash_command='curl -T /opt/airflow/dags/LICENSE http://0.0.0.0:9998/meta',
    dag=dag,
)

run_this0 = PythonOperator(
    task_id='print_the_context',
    python_callable=print_cwd,
    dag=dag,
)

run_this0 >> run_this
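
A possible extension, not in the original example: BashOperator pushes the last line of its stdout to XCom by default, so the Tika /meta response from run_curl could be read by a downstream task. The task and function names below are illustrative.

def print_tika_meta(ti, **kwargs):
    """Pull and print the XCom value pushed by the run_curl task."""
    print(ti.xcom_pull(task_ids='run_curl'))


show_meta = PythonOperator(
    task_id='print_tika_meta',
    python_callable=print_tika_meta,
    dag=dag,
)

run_this >> show_meta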
    end_date = Variable.get('narrativedx_end_date',
                            default_var=first_of_month - timedelta(days=1))
    start_date = Variable.get('narrativedx_start_date',
                              default_var=first_of_month -
                              timedelta(days=end_date.day))

    sql = sql.format(start_date=start_date, end_date=end_date, surv=service)
    df = pd.read_sql(sql, ppw_engine)

    df.to_csv(basepath.joinpath(f'NarrativeDX - {service} - {exec_date}.csv'))


queries = []
for service in services:
    delete = PythonOperator(task_id=f'delete_older_{service}_file',
                            python_callable=delete_older_file,
                            op_kwargs={'service': service},
                            dag=dag)

    query = PythonOperator(task_id=f'query_narrativedx_{service}',
                           python_callable=query_narrativedx,
                           op_kwargs={'service': service},
                           dag=dag)

    sftp = SFTPOperator(
        task_id=f'upload_{service}_to_sftp',
        ssh_conn_id='coh_sftp',
        local_filepath=str(
            basepath.joinpath(f'NarrativeDX - {service} - {exec_date}.csv')),
        remote_filepath=f'/sftp/NarrativeDX - {service} - {exec_date}.csv',
        operation='put',
        create_intermediate_dirs=True,
Example #4
        0,
        0,
    ),
    tags=['google_ad_api'],
)

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag,
)

t2 = PythonOperator(
    task_id='initiate_database_connection',
    depends_on_past=False,
    python_callable=connect,
    op_kwargs={'engine': 'conn'},
    retries=3,
    dag=dag,
)

t3 = PythonOperator(
    task_id='data_upload',
    depends_on_past=False,
    python_callable=copy_from_dataframe,
    op_kwargs={
        'conn': 'connect(engine)',
        'dff': 'dff'
    },
    retries=3,
    dag=dag,
)
Example #5
            })
        thread.daemon = True
        thread.start()
        time.sleep(consumer.RECEIVE_DURATION)
        consumer_client.close()
        thread.join()
    except KeyboardInterrupt:
        print('Stop receiving.')

    print('Consumer2 has stopped receiving, end time is {}.'.format(
        time.time()))


t1 = PythonOperator(
    task_id='produce_raw_message',
    python_callable=produce_raw_message,
    dag=dag,
)
t2 = PythonOperator(
    task_id='preprocess_raw_message',
    python_callable=preprocess_raw_message,
    dag=dag,
)

t3 = PythonOperator(
    task_id='consume_and_offload_preprocessed_message',
    python_callable=consume_preprocessed_message,
    dag=dag,
)

t1 >> t2 >> t3
    """A callable to upload file to AWS bucket"""
    s3_hook = S3Hook()
    s3_hook.load_file(filename=UPLOAD_FILE, key=PREFIX, bucket_name=S3BUCKET_NAME)


with models.DAG(
    'example_s3_to_gcs',
    schedule_interval=None,
    start_date=days_ago(2),
    tags=['example'],
) as dag:
    create_s3_bucket = S3CreateBucketOperator(
        task_id="create_s3_bucket", bucket_name=S3BUCKET_NAME, region_name='us-east-1'
    )

    upload_to_s3 = PythonOperator(task_id='upload_file_to_s3', python_callable=upload_file)

    create_gcs_bucket = GCSCreateBucketOperator(
        task_id="create_bucket",
        bucket_name=GCS_BUCKET,
        project_id=GCP_PROJECT_ID,
    )
    # [START howto_transfer_s3togcs_operator]
    transfer_to_gcs = S3ToGCSOperator(
        task_id='s3_to_gcs_task', bucket=S3BUCKET_NAME, prefix=PREFIX, dest_gcs="gs://" + GCS_BUCKET
    )
    # [END howto_transfer_s3togcs_operator]

    delete_s3_bucket = S3DeleteBucketOperator(
        task_id='delete_s3_bucket', bucket_name=S3BUCKET_NAME, force_delete=True
    )
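
    # The excerpt ends before the task dependencies are set; one plausible
    # ordering (an assumption, not taken from the source) would be:
    create_s3_bucket >> upload_to_s3 >> create_gcs_bucket >> transfer_to_gcs >> delete_s3_bucket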
Example #7
    print(my_xcom)


def _fail_callback(context):
    print('callback123')
    print(context)


with DAG(dag_id='download_dag',
         schedule_interval='@daily',
         start_date=days_ago(3),
         catchup=True,
         default_args=default_args) as dag:

    download_data = PythonOperator(task_id='download_data',
                                   python_callable=_download_data
                                   #op_kwargs={'my_param':42}
                                   )

    check_data = PythonOperator(task_id='check_data',
                                python_callable=_check_data)

    wait_data = FileSensor(task_id='wait_data',
                           fs_conn_id='fs_default',
                           filepath='my_file.txt',
                           poke_interval=30)

    process_data = BashOperator(task_id='process_data',
                                bash_command='exit 1',
                                on_failure_callback=_fail_callback)

    #download_data.set_downstream(wait_data)
Example #8
import os

from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago


def print_stuff():
    print('HELLO FROM AIRFLOW')


args = {
    'owner': 'airflow',
}

with DAG(
        dag_id='example_kubernetes_executor',
        default_args=args,
        schedule_interval='@once',
        start_date=days_ago(2),
        tags=['example', 'example2'],
) as dag:

    # You don't have to use any special KubernetesExecutor configuration if you don't want to
    start_task = PythonOperator(task_id='start_task',
                                python_callable=print_stuff)
Example #9
    print(f"Message in notebook {message} for {execution_date}")

    if message.data != f"Ran from Airflow at {execution_date}!":
        return False

    return True


with DAG(
    dag_id='example_papermill_operator',
    default_args=default_args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60)
) as dag_2:

    run_this = PapermillOperator(
        task_id="run_example_notebook",
        input_nb=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              "input_notebook.ipynb"),
        output_nb="/tmp/out-{{ execution_date }}.ipynb",
        parameters={"msgs": "Ran from Airflow at {{ execution_date }}!"}
    )

    check_output = PythonOperator(
        task_id='check_out',
        python_callable=check_notebook,
        inlets=AUTO
    )

    check_output.set_upstream(run_this)
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from elasticsearch_plugin.hooks.elastic_hook import ElasticHook
from elasticsearch_plugin.operators.postgres_to_elastic import PostgresToElasticOperator

from datetime import datetime 

default_args = {
    'start_date': datetime(2020, 1, 1)
}

def _print_es_info():
    hook = ElasticHook()
    print(hook.info())

with DAG('elasticsearch_dag', schedule_interval='@daily',
        default_args=default_args, catchup=False) as dag:

        print_es_info = PythonOperator(
            task_id='print_es_info',
            python_callable=_print_es_info
        )

        connections_to_es = PostgresToElasticOperator(
            task_id='connections_to_es',
            sql="SELECT * FROM connections",
            index='connections'
        )

        print_es_info >> connections_to_es
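
For context, and not part of the original plugin: a custom operator like PostgresToElasticOperator could plausibly be implemented along the lines below. The class is a hypothetical sketch that streams rows from Postgres into Elasticsearch; the add_doc method on ElasticHook is an assumption, not a confirmed API.

from airflow.models.baseoperator import BaseOperator
from airflow.providers.postgres.hooks.postgres import PostgresHook


class PostgresToElasticSketchOperator(BaseOperator):
    """Hypothetical sketch: copy rows returned by a SQL query into an ES index."""

    def __init__(self, sql, index, postgres_conn_id='postgres_default', **kwargs):
        super().__init__(**kwargs)
        self.sql = sql
        self.index = index
        self.postgres_conn_id = postgres_conn_id

    def execute(self, context):
        pg_hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        es_hook = ElasticHook()  # plugin hook imported above; add_doc below is assumed
        for row in pg_hook.get_records(self.sql):
            es_hook.add_doc(index=self.index, doc={'row': list(row)})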
Example #11
    # [END transform_function]

    # [START load_function]
    def load(**kwargs):
        ti = kwargs['ti']
        total_value_string = ti.xcom_pull(task_ids='transform',
                                          key='total_order_value')
        total_order_value = json.loads(total_value_string)

        print(total_order_value)

    # [END load_function]

    # [START main_flow]
    extract_task = PythonOperator(
        task_id='extract',
        python_callable=extract,
    )
    extract_task.doc_md = dedent("""\
    #### Extract task
    A simple Extract task to get data ready for the rest of the data pipeline.
    In this case, getting data is simulated by reading from a hardcoded JSON string.
    This data is then put into xcom, so that it can be processed by the next task.
    """)

    transform_task = PythonOperator(
        task_id='transform',
        python_callable=transform,
    )
    transform_task.doc_md = dedent("""\
    #### Transform task
    A simple Transform task which takes in the collection of order data from xcom
Example #12
for gene in regions.keys():

    filepath_prefix = WORKING_DIR + 'data/premsa-processor/' + gene + '/sequences'
    filepath = filepath_prefix + '.fasta'
    stdout = filepath_prefix  + '.stdout.log'
    reference_output_filepath  = filepath_prefix + '.references.fasta'

    nuc_input_filepath = filepath + '_nuc.fas'
    nuc_dupe_output_filepath  = filepath + '_raw_nucleotide.duplicates.json'
    protein_dupe_output_filepath = filepath + '_protein.duplicates.json'
    prot_input_filepath = filepath + '_protein.fas'
    dupe_input_filepath = filepath + '_copies.json'

    export_missing = PythonOperator(
        task_id=f'export_missing_premsa_{gene}',
        python_callable=export_sequences,
        op_kwargs={ "gene" : gene, "output_fn" : filepath },
        dag=dag,
    )

    pre_msa = BashOperator(
        task_id=f'pre_msa_{gene}',
        bash_command=PREMSA,
        params={'regions': regions, 'filepath': filepath, 'gene': gene, 'node' : i % 8, 'stdout' : stdout },
        dag=dag,
    )

    populated_check_task = ShortCircuitOperator(
        task_id=f'check_if_populated_{gene}',
        python_callable=is_export_populated,
        op_kwargs={ 'filepath': filepath },
        dag=dag
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

default_args = {'owner': 'airflow', 'depends_on_past': False}


def task():
    print(
        f"The current time is {datetime.now().strftime('%b. %d, %Y %-I:%M %p UTC')}"
    )


with DAG(dag_id="hello_world",
         description=
         "A hello world DAG which shows the basic execution flow of Airflow",
         default_args=default_args,
         dagrun_timeout=timedelta(hours=2),
         start_date=days_ago(1),
         schedule_interval=None,
         default_view="graph",
         tags=["sample", "python", "bash"]) as dag:
    bash_task = BashOperator(task_id='bash_task',
                             bash_command='echo "Hello from Airflow!"')
    python_task = PythonOperator(task_id='python_task', python_callable=task)

    bash_task >> python_task
Example #14
                earningsId = updateCursor.fetchone()[0]
                print(earningsId)
                updateCursor.execute("""
                  UPDATE trees
                  SET earnings_id = %s
                  WHERE id = ANY(%s)
                """, 
                (earningsId, 
                row['tree_ids']))

            conn.commit()
            return 0
        except Exception as e:
            print("get error when exec SQL:", e)
            print("SQL result:", updateCursor.query)
            raise ValueError('Error executing query')
            return 1

    create_new_person_records = PythonOperator(
        task_id='create_new_person_records',
        python_callable=create_new_person_records,
        )

    earnings_report = PythonOperator(
        task_id='earnings_report',
        python_callable=earnings_report,
        )


    create_new_person_records >> earnings_report >> t1
Example #15
def create_evaluate_ops(  # pylint: disable=too-many-arguments
    task_prefix: str,
    data_format: str,
    input_paths: List[str],
    prediction_path: str,
    metric_fn_and_keys: Tuple[T, Iterable[str]],
    validate_fn: T,
    batch_prediction_job_id: Optional[str] = None,
    region: Optional[str] = None,
    project_id: Optional[str] = None,
    dataflow_options: Optional[Dict] = None,
    model_uri: Optional[str] = None,
    model_name: Optional[str] = None,
    version_name: Optional[str] = None,
    dag: Optional[DAG] = None,
    py_interpreter="python3",
):
    """
    Creates the Operators needed for model evaluation and returns them.

    It gets predictions over the inputs via the Cloud ML Engine BatchPrediction
    API by calling MLEngineBatchPredictionOperator, then summarizes and validates
    the results via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.

    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the metrics that metric_fn generated,
      averaged over all instances.
      The keys of the dictionary match what is given by the metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count', representing the
      total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed when the
      validation result is not good enough to proceed (i.e. to set the trained
      version as default).

    Typical examples are like this:

    .. code-block:: python

        def get_metric_fn_and_keys():
            import math  # imports should be outside of the metric_fn below.
            def error_and_squared_error(inst):
                label = float(inst['input_label'])
                classes = float(inst['classes'])  # 0 or 1
                err = abs(classes-label)
                squared_err = math.pow(classes-label, 2)
                return (err, squared_err)  # returns a tuple.
            return error_and_squared_error, ['err', 'mse']  # key order must match.

        def validate_err_and_count(summary):
            if summary['err'] > 0.2:
                raise ValueError('Too high err>0.2; summary=%s' % summary)
            if summary['mse'] > 0.05:
                raise ValueError('Too high mse>0.05; summary=%s' % summary)
            if summary['count'] < 1000:
                raise ValueError('Too few instances<1000; summary=%s' % summary)
            return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: str

    :param data_format: one of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: str

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list[str]

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: str

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:

        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.

        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list[str]

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: str

    :param project_id: the Google Cloud project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: str

    :param region: the Google Cloud region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: str

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: str

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: str

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: str

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.models.DAG

    :param py_interpreter: Python version of the Beam pipeline.
        If None, this defaults to python3.
        To track python versions supported by beam and related
        issues check: https://issues.apache.org/jira/browse/BEAM-1251
    :type py_interpreter: str

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """
    batch_prediction_job_id = batch_prediction_job_id or ""
    dataflow_options = dataflow_options or {}
    region = region or ""

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix
        )

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args['region']
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineStartBatchPredictionJobOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag,
    )

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True)).decode()
    evaluate_summary = DataflowCreatePythonJobOperator(
        task_id=(task_prefix + "-summary"),
        py_file=os.path.join(os.path.dirname(__file__), 'mlengine_prediction_summary.py'),
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys),
        },
        py_interpreter=py_interpreter,
        py_requirements=['apache-beam[gcp]>=2.14.0'],
        dag=dag,
    )
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, templates_dict, **kwargs):
        prediction_path = templates_dict["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: {}".format(prediction_path))
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GCSHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        templates_dict={"prediction_path": prediction_path},
        dag=dag,
    )
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
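
A usage sketch assembled from the docstring above; the bucket paths, model name, and `example_dag` are placeholders, not taken from the original module.

def get_metric_fn_and_keys():
    import math  # imports should be outside of the metric_fn below

    def error_and_squared_error(inst):
        label = float(inst['input_label'])
        classes = float(inst['classes'])  # 0 or 1
        err = abs(classes - label)
        squared_err = math.pow(classes - label, 2)
        return (err, squared_err)  # returns a tuple

    return error_and_squared_error, ['err', 'mse']  # key order must match


def validate_err_and_count(summary):
    if summary['err'] > 0.2:
        raise ValueError('Too high err>0.2; summary=%s' % summary)
    if summary['count'] < 1000:
        raise ValueError('Too few instances<1000; summary=%s' % summary)
    return summary


pred, summary, validation = create_evaluate_ops(
    task_prefix='eval-example',                                # alphanumeric and hyphens only
    data_format='TEXT',
    input_paths=['gs://example-bucket/prediction-input/*'],    # placeholder path
    prediction_path='gs://example-bucket/prediction-output',   # placeholder path
    metric_fn_and_keys=get_metric_fn_and_keys(),
    validate_fn=validate_err_and_count,
    model_name='example_model',                                # placeholder model
    dag=example_dag,                                           # assumes a DAG defined elsewhere
)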
        for job in jobs:
            dag = DAG(
                dag_id=f'{job}_dag',
                start_date=datetime(2021, 2, 9),
                max_active_runs=3,
                schedule_interval="0 10 * * *",
                default_args=default_args
            )

            dummy_id = DummyOperator(
                task_id=f"trigger_spark_job",
                dag=dag
            )

            with open(f"{dag.dag_id}.pkl", "wb") as f:
                pickle.dump(dag, f)


with DAG(
    dag_id="dag_generator",
    schedule_interval=None,
    max_active_runs=3,
    default_args=default_args
) as dag:

    dag_generator = PythonOperator(
        task_id="create_dags",
        python_callable=create_and_pickle_dag
    )
    dag_id='example_twitter_dag',
    default_args=default_args,
    schedule_interval="@daily",
    start_date=days_ago(5),
    tags=['example'],
) as dag:

    # --------------------------------------------------------------------------------
    # This task should call the Twitter API and retrieve yesterday's tweets sent
    # from and to each of the four Twitter users (Twitter_A, ..., Twitter_D). It
    # should produce eight CSV output files, following the naming convention
    # direction(from or to)_twitterHandle_date.csv; a hypothetical sketch follows
    # the fetch_tweets operator below.
    # --------------------------------------------------------------------------------

    fetch_tweets = PythonOperator(
        task_id='fetch_tweets',
        python_callable=fetchtweets
    )
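
    # A hypothetical sketch of what `fetchtweets` might do, based only on the
    # comment above; handles and filenames are illustrative and this sketch is
    # not wired into the DAG.
    def fetchtweets_sketch(**kwargs):
        for handle in ("Twitter_A", "Twitter_B", "Twitter_C", "Twitter_D"):
            for direction in ("from", "to"):
                # one CSV per direction/handle pair, eight files in total
                filename = f"{direction}_{handle}_{kwargs['ds']}.csv"
                print(f"would fetch tweets and write {filename}")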

    # --------------------------------------------------------------------------------
    # Clean the eight files. In this step you can drop or cherry-pick columns and
    # different parts of the text.
    # --------------------------------------------------------------------------------

    clean_tweets = PythonOperator(
        task_id='clean_tweets',
        python_callable=cleantweets
    )

    clean_tweets << fetch_tweets

    # --------------------------------------------------------------------------------
Example #18
gunzip_files = BashOperator(
    task_id='gunzip_files',
    bash_command='gunzip {{ params.import_dir}}/*.gz',
    dag=dag,
)

new_meta = default_args['params']['import_dir'] + 'new.tsv'
new_fasta = default_args['params']['import_dir'] + 'new.fasta'

# Split out new items from the GISAID export directory
split_out_new_task = PythonOperator(
    task_id='split_out_new',
    python_callable=filter_gisaid_exports_by_dir,
    op_kwargs={
        "dir": default_args['params']['import_dir'],
        "fasta_output": new_fasta,
        "meta_output": new_meta
    },
    pool='mongo',
    dag=dag,
)

import_tsv = BashOperator(
    task_id='import_tsv',
    bash_command=
    'node {{ params.working_dir }}/js/submit-tsv-to-mongo.js {{ params.meta_tsv }}',
    params={'meta_tsv': new_meta},
    dag=dag,
)

update_mongo_with_sequences = BashOperator(
Example #19
        f"The xcom value pushed by task push via return value is {bash_pushed_via_return_value}"
    )
    print(
        f"The xcom value pushed by task push manually is {bash_manually_pushed_value}"
    )


with DAG(
        'example_xcom',
        schedule_interval="@once",
        start_date=days_ago(2),
        tags=['example'],
) as dag:

    push1 = PythonOperator(
        task_id='push',
        python_callable=push,
    )

    push2 = PythonOperator(
        task_id='push_by_returning',
        python_callable=push_by_returning,
    )

    pull = PythonOperator(
        task_id='puller',
        python_callable=puller,
        op_kwargs={
            'pulled_value_1': push1.output['value from pusher 1'],
            'pulled_value_2': push2.output,
        },
    )
def calculator_func(**kwargs):
    ti = kwargs['ti']
    tasks = [f'push_{i}' for i in range(1, 10)]
    values = ti.xcom_pull(task_ids=tasks)
    return sum(values)


with DAG(
        dag_id='xcom_multiple_tasks',
        start_date=datetime(2021, 3, 1),
        schedule_interval='@once',
) as dag:

    tasks = []

    for i in range(1, 10):
        task = PythonOperator(
            task_id=f'push_{i}',
            python_callable=lambda i=i: i,
        )

        tasks.append(task)

    calculator = PythonOperator(
        task_id='calculator',
        python_callable=calculator_func,
    )

    calculator.set_upstream(tasks)
        """
        Tests whether the volume has been mounted.
        """
        with open('/foo/volume_mount_test.txt', 'w') as foo:
            foo.write('Hello')

        return_code = os.system("cat /foo/volume_mount_test.txt")
        if return_code != 0:
            raise ValueError(f"Error when checking volume mount. Return code {return_code}")

    # You can use annotations on your kubernetes pods!
    start_task = PythonOperator(
        task_id="start_task",
        python_callable=print_stuff,
        executor_config={"pod_override": k8s.V1Pod(
            metadata=k8s.V1ObjectMeta(
                annotations={"test": "annotation"}
            )
        )
        }
    )

    # [START task_with_volume]
    volume_task = PythonOperator(
        task_id="task_with_volume",
        python_callable=test_volume_mount,
        executor_config={
            "pod_override": k8s.V1Pod(
                spec=k8s.V1PodSpec(
                    containers=[
                        k8s.V1Container(
                            name="base",
Example #22
            pg_hook.run(insert_statement, parameters=row)

            os.remove(tot_name)
        else:
            print("No file named {}".format(tot_name))
    else:
        print("No file named {}.  No data to load.".format(tot_name))


with DAG('fetch_kc_crime_data-v0.1',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    check_endpoint_availability = HttpSensor(
        task_id='check_endpoint_availability',
        http_conn_id='http_data_kcmo_org',
        endpoint=app_config['endpoint'])

    download_latest_crime_data = PythonOperator(
        task_id='download_latest_crime_data',
        python_callable=download_latest_crime_data)

    load_data = PythonOperator(task_id='load_data_raw',
                               python_callable=load_data_raw)

    fetch_crime_window = PythonOperator(task_id='fetch_crime_window',
                                        python_callable=fetch_crime_window)

    check_endpoint_availability >> fetch_crime_window >> download_latest_crime_data >> load_data
default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
}

DAYS = 14

with DAG(
        dag_id=DAG_ID,
        default_args=default_args,
        schedule_interval="@daily",
        start_date=days_ago(DAYS),
        tags=[WORKFLOW_ID],
) as dag:
    start = DummyOperator(task_id="start")

    def failed():
        raise ValueError("failed test")

    failed_task = PythonOperator(task_id="failed_task", python_callable=failed)

    daily = BashOperator(
        task_id='print_daily_date',
        bash_command=("echo prev_ds :{{ prev_ds }}, "
                      "ds :{{ ds }}, "
                      "next_ds :{{ next_ds }}, "
                      "next_execution_date : {{ next_execution_date }}"))
    end = DummyOperator(task_id="end")

    start >> failed_task >> daily >> end
Example #24
                lastname TEXT NOT NULL,
                country TEXT NOT NULL,
                username TEXT NOT NULL,
                password TEXT NOT NULL,
                email TEXT NOT NULL PRIMARY KEY
            );
            """)

    is_api_available = HttpSensor(task_id='is_api_available',
                                  http_conn_id='user_api',
                                  endpoint='api/')

    extracting_user = SimpleHttpOperator(
        task_id='extracting_user',
        http_conn_id='user_api',
        endpoint='api/',
        method='GET',
        response_filter=lambda response: json.loads(response.text),
        log_response=True)

    processing_user = PythonOperator(task_id='processing_user',
                                     python_callable=_processing_user)

    storing_user = BashOperator(
        task_id='storing_user',
        bash_command=
        'echo -e ".separator ","\n.import /tmp/processed_user.csv users" | sqlite3 /home/airflow/airflow/airflow.db'
    )

    creating_table >> is_api_available >> extracting_user >> processing_user >> storing_user
Example #25
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago

args = {
    'owner': 'airflow',
    'start_date': days_ago(2),
}


def dummy(*args, **kwargs):
    """Dummy function"""
    return "pass"


with DAG(dag_id='example_xcom_args',
         default_args=args,
         schedule_interval=None,
         tags=['example']) as dag:
    task1 = PythonOperator(
        task_id='task1',
        python_callable=dummy,
    )

    task2 = PythonOperator(
        task_id='task2',
        python_callable=dummy,
        op_kwargs={"dummy": task1.output},
    )
    'retries': 3,
    'retry_delay': timedelta(minutes=5)
}

with DAG('avocado_dag',
         default_args=default_args,
         description='Forecasting avocado prices',
         schedule_interval='*/10 * * * *',
         start_date=datetime(2020, 1, 1),
         catchup=False) as dag:

    creating_table = PostgresOperator(task_id='creating_table',
                                      sql='sql/CREATE_TABLE_ACCURACIES.sql',
                                      postgres_conn_id='postgres')

    downloading_data = PythonOperator(task_id='downloading_data',
                                      python_callable=download_dataset)

    sanity_check = PythonOperator(task_id="sanity_check",
                                  python_callable=check_dataset)

    waiting_for_data = FileSensor(task_id='waiting_for_data',
                                  fs_conn_id='fs_default',
                                  filepath='avocado.csv',
                                  poke_interval=15)

    n_estimators = [100, 150]
    max_features = ['auto', 'sqrt']

    training_model_tasks = []
    for feature in max_features:
        for estimator in n_estimators:
Example #27
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Using a DAG context manager, you don't have to specify the dag property of each task
with DAG(
        'rock_content_item_backfill_example_dag',
        start_date=datetime(2021, 2, 22),
        max_active_runs=1,
        schedule_interval='@once',
        default_args=default_args,
        # catchup=False # enable if you don't want historical dag runs to run
) as dag:

    t0 = PythonOperator(
        task_id='fetch_and_save_content_items',
        python_callable=
        fetch_and_save_content_items,  # make sure you don't include the () of the function
        op_kwargs={'client': None})

    t1 = PythonOperator(
        task_id='fetch_and_save_content_items_connections',
        python_callable=
        fetch_and_save_content_items_connections,  # make sure you don't include the () of the function
        op_kwargs={
            'client': None,
            'do_backfill': True
        })

    t0 >> t1
Example #28
        'value': 'airflow'
    }]

    def use_zip_binary():
        """
        Checks whether Zip is installed.

        :return: True if it is installed, False if not.
        :rtype: bool
        """
        return_code = os.system("zip")
        if return_code != 0:
            raise SystemError("The zip binary is missing")

    # You don't have to use any special KubernetesExecutor configuration if you don't want to
    start_task = PythonOperator(task_id="start_task",
                                python_callable=print_stuff)

    # But you can if you want to
    one_task = PythonOperator(
        task_id="one_task",
        python_callable=print_stuff,
        executor_config={"KubernetesExecutor": {
            "image": "airflow/ci:latest"
        }})

    # Use the zip binary, which is only found in this special docker image
    two_task = PythonOperator(task_id="two_task",
                              python_callable=use_zip_binary,
                              executor_config={
                                  "KubernetesExecutor": {
                                      "image": "airflow/ci_zip:latest"
Example #29
    df.to_csv('dags/postgresqldata.csv')
    print("-------Data Saved------")


def insertElasticsearch():
    es = Elasticsearch()
    df = pd.read_csv('dags/postgresqldata.csv')
    for _, r in df.iterrows():
        doc = r.to_json()
        res = es.index(index="frompostgresql", doc_type="doc", body=doc)
        print(res)


default_args = {
    'owner': 'sbahaddi',
    'start_date': dt.datetime(2021, 3, 25),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

with DAG('MyDBdag',
         default_args=default_args,
         schedule_interval='@daily',
         ) as dag:
    getData = PythonOperator(task_id='QueryPostgreSQL',
                             python_callable=queryPostgresql)
    insertData = PythonOperator(
        task_id='InsertDataElasticsearch', python_callable=insertElasticsearch)

getData >> insertData
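
The excerpt cuts off the body of queryPostgresql above; a minimal hypothetical version consistent with the to_csv call shown (connection string and query are placeholders):

def query_postgresql_sketch():
    import pandas as pd
    from sqlalchemy import create_engine

    engine = create_engine('postgresql://user:password@localhost:5432/mydb')  # placeholder DSN
    df = pd.read_sql('SELECT * FROM some_table', engine)                      # placeholder query
    df.to_csv('dags/postgresqldata.csv')
    print("-------Data Saved------")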
Example #30
def _clean_sales_new(**context):
    print("Preprocessing sales data (NEW)...")


with DAG(
        dag_id="03_branch_dag",
        start_date=airflow.utils.dates.days_ago(3),
        schedule_interval="@daily",
) as dag:
    start = DummyOperator(task_id="start")

    pick_erp_system = BranchPythonOperator(task_id="pick_erp_system",
                                           python_callable=_pick_erp_system)

    fetch_sales_old = PythonOperator(task_id="fetch_sales_old",
                                     python_callable=_fetch_sales_old)
    clean_sales_old = PythonOperator(task_id="clean_sales_old",
                                     python_callable=_clean_sales_old)

    fetch_sales_new = PythonOperator(task_id="fetch_sales_new",
                                     python_callable=_fetch_sales_new)
    clean_sales_new = PythonOperator(task_id="clean_sales_new",
                                     python_callable=_clean_sales_new)

    fetch_weather = DummyOperator(task_id="fetch_weather")
    clean_weather = DummyOperator(task_id="clean_weather")

    # Using the wrong trigger rule ("all_success") results in tasks being skipped downstream.
    # join_datasets = DummyOperator(task_id="join_datasets")

    join_datasets = DummyOperator(task_id="join_datasets",