# Imports assumed by the snippets below (Airflow 1.x-era contrib operators).
# PROJECT_ID and BUCKET are module-level constants defined elsewhere in the original
# DAG files; HttpToGcsOperator is a project-specific operator (a sketch appears
# further down in this listing).
import datetime as dt

from airflow import DAG
from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator
from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator,
    DataProcPySparkOperator,
)
from airflow.contrib.operators.postgres_to_gcs_operator import (
    PostgresToGoogleCloudStorageOperator,
)
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id="FirstScript",
    schedule_interval="@daily",
    default_args={
        "owner": "airflow training",
        "start_date": dt.datetime(2018, 8, 1),
        "depends_on_past": True,
        "email_on_failure": True,
        "email": "*****@*****.**",
    },
)

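# Export the execution date's slice of land_registry_price_paid_uk from Postgres to GCS as JSON.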
pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    dag=dag,
    sql="select * from land_registry_price_paid_uk where transfer_date='{{ ds }}'",
    bucket="airflow-training-knab-asv",
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    postgres_conn_id="airflow-training-postgres")

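# Spin up an ephemeral two-worker Dataproc cluster, named after the execution date.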
dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id=PROJECT_ID,
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)

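# Dataflow (Python) job that loads the exported land registry prices into BigQuery.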
land_registry_prices_to_bigquery = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",

Example #2

        "start_date": dt.datetime(2018, 9, 11),
        "depends_on_past": True,
        "email_on_failure": True,
        "email": "*****@*****.**",
    },
)


def print_exec_date(**context):
    print(context["execution_date"])


pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="airflow-training-postgres",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training-knab-jochem",
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    dag=dag,
)


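# auto_delete_ttl makes the cluster remove itself once its TTL expires, so no separate teardown task is needed.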
dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-ea393e48abe0a85089b6b551da",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    auto_delete_ttl=5 * 60,  # Autodelete after 5 minutes
)

Example #3

# project_id = "training-airflow"

dag = DAG(
    dag_id="uk_land_dag2",
    schedule_interval="30 7 * * *",
    default_args={
        "owner": "airflow",
        "start_date": dt.datetime(2018, 10, 1),
        "depends_on_past": True,
    },
)

pg_2_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="pg_2_gcs",
    postgres_conn_id="my_db_connection",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflowbolcom_ghermann_dummybucket",
    filename="mypgdata_{{ ds }}",
    dag=dag)

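# zone and the cluster name are kept as module-level variables and reused below;
# project_id must be defined as well (the assignment above is commented out).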
zone = "europe-west4-a"

dataproc_cluster_name = "my-dp-cluster-{{ ds }}"

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="my_create_dp_cluster",
    cluster_name=dataproc_cluster_name,
    project_id=project_id,
    num_workers=2,
    zone=zone,
    dag=dag,
)

Example #4

        "start_date": dt.datetime(2018, 10, 1),
        "depends_on_past": True,
        "email_on_failure": True,
        "email": "*****@*****.**",
    },
)


def print_exec_date(**context):
    print(context["execution_date"])

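# Same export pattern as above; the result lands under a {{ ds }}-partitioned path in the bucket.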

get_data = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="my_database_connection",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket='airflow_training_bucket',
    filename='land_registry_price_paid_uk/{{ ds }}/result.json',
    dag=dag)

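# provide_context=True (required in Airflow 1.x) passes the template context, including execution_date, to print_exec_date.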
my_task = PythonOperator(task_id="task_name",
                         python_callable=print_exec_date,
                         provide_context=True,
                         dag=dag)

create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id='airflowbolcom-20165e4959a78c1d',
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,  # assumed; the source snippet is cut off here
)

Example #5

            "start_date": dt.datetime(2018, 10, 10),
            "depends_on_past": False,
            "email_on_failure": True,
            "email": "*****@*****.**",
        },
) as dag:
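    # Tasks instantiated inside the `with DAG(...) as dag:` block are added to the DAG
    # automatically, so no explicit dag= argument is needed.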
    usd_conversion_rate = create_conversion_task(task_id="usd_conversion_rate",
                                                 target_currency="USD")

    eur_conversion_rate = create_conversion_task(task_id="eur_conversion_rate",
                                                 target_currency="EUR")

    psql_to_gcs = PostgresToGoogleCloudStorageOperator(
        task_id="read_postgres",
        postgres_conn_id="postgres_training",
        sql="select * from land_registry_price_paid_uk where transfer_date = '{{ ds }}'::date",
        bucket="airflow-training-simple-dag",
        filename="training-price-paid-uk/{{ ds }}/land_registry.json")

    cluster_name = "cluster-{{ ds }}"
    gcs_project_id = "airflowbolcom-544f36a42f5c0d9d"

    create_cluster = DataprocClusterCreateOperator(task_id="create_cluster",
                                                   cluster_name=cluster_name,
                                                   project_id=gcs_project_id,
                                                   num_workers=2,
                                                   zone="europe-west4-a")

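    # PySpark job that analyses the exported pricing data on the cluster created above.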
    cloud_analytics = DataProcPySparkOperator(
        task_id="analyze_data",

Example #6

    },
)


def print_exec_date(**context):
    print(context["execution_date"])


# my_task = PythonOperator(
#     task_id="task_name", python_callable=print_exec_date, provide_context=True, dag=dag
# )

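# ds_nodash renders to the execution date without dashes (YYYYMMDD), giving one export folder per run.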
pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="export_data_to_bucket",
    postgres_conn_id="training_postgres",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow_training_data",
    filename="data_{{ds_nodash}}/land_registry_price.json",
    dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc_cluster",
    cluster_name="dataproc-cluster-dag-training-{{ ds }}",
    project_id="airflowbolcom-b9aabd6971d488d9",
    num_workers=2,
    zone="europe-west1-d",
    dag=dag)

compute_aggregates = DataProcPySparkOperator(
    task_id="compute_aggregates",
    main=

Example #7

    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
    pool="dataproc",
)

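# {{ ds }} is rendered by Jinja to the execution date (YYYY-MM-DD) at run time, so each run exports exactly one day of data.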
query = """
    SELECT *
    FROM land_registry_price_paid_uk
    WHERE transfer_date = '{{ ds }}'
"""

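# Chain the export upstream of the cluster creation. Note that >> returns its right-hand
# operand, so pgsl_to_gcs ends up referring to dataproc_create_cluster here.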
pgsl_to_gcs = (PostgresToGoogleCloudStorageOperator(
    task_id="pgsl_to_gcs",
    postgres_conn_id="postgres_airflow_training",
    sql=query,
    bucket=BUCKET,
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    dag=dag,
) >> dataproc_create_cluster)

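# Fetch GBP-to-EUR and GBP-to-USD exchange rates for the execution date and store each response in GCS.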
for currency in {"EUR", "USD"}:
    HttpToGcsOperator(
        task_id="get_currency_" + currency,
        endpoint="airflow-training-transform-valutas?date={{ ds }}&from=GBP&to="
        + currency,
        bucket=BUCKET,
        method="GET",
        http_conn_id="airflow-training-currency-http",
        gcs_conn_id="airflow-training-data-tim",
        gcs_path="currency/{{ ds }}-" + currency + ".json",
        dag=dag,
    )

Example #8

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-25d677142443a8e2ace1927d48",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)


pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="postgres_airflow_training",
    sql="SELECT * FROM public.land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket='airflow-training-knab-geert',
    filename='land_registry_price_paid_uk/{{ ds }}/properties_{}.json',
    dag=dag
) >> dataproc_create_cluster


for currency in {'EUR', 'USD'}:
    HttpToGcsOperator(
        task_id="get_currency_" + currency,
        method="GET",
        endpoint="airflow-training-transform-valutas?date={{ ds }}&from=GBP&to=" + currency,
        http_conn_id="http_airflow_training",
        gcs_conn_id="google_cloud_default",
        gcs_bucket="airflow-training-knab-geert",
        gcs_path="currency/{{ ds }}-" + currency + ".json",
        dag=dag,
    )
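
The two loops above instantiate HttpToGcsOperator, which is not one of Airflow's contrib operators but a custom operator shipped with the training project. Below is a minimal sketch of what such an operator could look like on Airflow 1.x; the class body, the temp-file upload and the template_fields are assumptions rather than the project's real implementation, and it accepts both bucket= and gcs_bucket= only because the two snippets pass different keyword names.

import tempfile

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook
from airflow.hooks.http_hook import HttpHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults


class HttpToGcsOperator(BaseOperator):
    """Call an HTTP endpoint and write the response body to a GCS object."""

    # endpoint and gcs_path carry {{ ds }} templates in the DAGs above,
    # so they need to be templated fields.
    template_fields = ("endpoint", "gcs_path")

    @apply_defaults
    def __init__(
        self,
        endpoint,
        gcs_path,
        bucket=None,
        gcs_bucket=None,
        method="GET",
        http_conn_id="http_default",
        gcs_conn_id="google_cloud_default",
        *args,
        **kwargs
    ):
        super(HttpToGcsOperator, self).__init__(*args, **kwargs)
        self.endpoint = endpoint
        self.gcs_path = gcs_path
        self.bucket = bucket or gcs_bucket  # the snippets use either keyword
        self.method = method
        self.http_conn_id = http_conn_id
        self.gcs_conn_id = gcs_conn_id

    def execute(self, context):
        # Fetch the payload over HTTP using the configured connection.
        response = HttpHook(method=self.method, http_conn_id=self.http_conn_id).run(self.endpoint)

        # Upload it to GCS via a temporary local file, since the 1.x
        # GoogleCloudStorageHook.upload() works from a filename on disk.
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(response.content)
            tmp.flush()
            GoogleCloudStorageHook(google_cloud_storage_conn_id=self.gcs_conn_id).upload(
                bucket=self.bucket, object=self.gcs_path, filename=tmp.name
            )

With an operator along these lines, a gcs_path such as currency/{{ ds }}-EUR.json is rendered per run before the upload.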

Example #9

dag = DAG(
    dag_id="training_dag",
    schedule_interval="30 7 * * *",
    default_args={
        "owner": "airflow",
        "start_date": dt.datetime(2018, 10, 1),
        "depends_on_past": True,
        "email_on_failure": True,
        "email": "*****@*****.**",
    },
)

copy_task = PostgresToGoogleCloudStorageOperator(
    task_id="copy_postgres_to_gcs",
    postgres_conn_id="training_postgres",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training",
    filename="exports/{{ ds }}/land_registry_price.json",
    dag=dag)

dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="airflowbolcom-1d3b3a0049ce78da",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag)

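# Only create the Dataproc cluster after the Postgres export has finished.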
copy_task >> dataproc_create_cluster

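# PySpark job that computes aggregates on the cluster created above.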
compute_aggregates = DataProcPySparkOperator(