Code Example #1
    def read_dag_yaml(self) -> Dict:
        if self.dag_yaml_dict.get('start_date_type') == 'days_ago':
            start_date = days_ago(self.dag_yaml_dict.get('start_date'))
        else:
            start = self.dag_yaml_dict.get('start_date')
            start_date = datetime(start.year, start.month, start.day)
        if self.dag_yaml_dict.get('schedule_type') == 'minute':
            schedule_interval = timedelta(
                minutes=self.dag_yaml_dict.get('schedule_interval'))
        elif self.dag_yaml_dict.get('schedule_type') == 'hour':
            schedule_interval = timedelta(
                hours=self.dag_yaml_dict.get('schedule_interval'))
        elif self.dag_yaml_dict.get('schedule_type') == 'day':
            schedule_interval = timedelta(
                days=self.dag_yaml_dict.get('schedule_interval'))
        else:
            schedule_interval = self.dag_yaml_dict.get('schedule_interval',
                                                       '0 0 * * *')

        dag_defn = {
            'dag_dir_path': self.dag_yaml_dict.get('root'),
            'dag_name': self.dag_yaml_dict.get('dag_name'),
            'catchup': self.dag_yaml_dict.get('catchup'),
            'default_args': {
                'owner': self.dag_yaml_dict.get('owner', 'airflow'),
                'depends_on_past': self.dag_yaml_dict.get('depends_on_past', False),
                'start_date': start_date,
                'email': self.dag_yaml_dict.get('email'),
                'email_on_failure': self.dag_yaml_dict.get('email_on_failure'),
                'email_on_retry': self.dag_yaml_dict.get('email_on_retry'),
                'retries': self.dag_yaml_dict.get('retries'),
                'retry_delay': timedelta(
                    minutes=int(self.dag_yaml_dict.get('retry_delay_mins'))),
            },
            'schedule_interval': schedule_interval
        }
        return dag_defn
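
    # A minimal sketch (assumed, not part of the original snippet) of how the
    # returned definition could be turned into a DAG object; it assumes
    # `from airflow import DAG` is available in this module.
    def build_dag(self) -> DAG:
        dag_defn = self.read_dag_yaml()
        return DAG(
            dag_id=dag_defn['dag_name'],
            default_args=dag_defn['default_args'],
            schedule_interval=dag_defn['schedule_interval'],
            catchup=dag_defn['catchup'],
        )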
Code Example #2
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import datetime

default_args = {
    'owner': 'mibrahimbila',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 23),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    # 'retries': 1,
    # 'retry_delay': timedelta(minutes=5)
}
dag = DAG(
    'facebook_connector',
    default_args=default_args,
    description='An Airflow DAG to run the Facebook connector script',
    schedule_interval='@once',
)

VIRTUAL_ENV_ACTIVATION = "/home/klox-dev/.venv/bin/activate"

CMD_FACEBOOK_CONNECTOR = "python main.py "

FACEBOOK_CONNECTOR_DIR = "/home/klox-dev/facebook-api-python-test"
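
# Hedged sketch of the task the constants above suggest: activate the
# virtualenv, change into the connector directory, and run the script.
# The task id and exact command composition are assumptions, not from the source.
run_facebook_connector = BashOperator(
    task_id='run_facebook_connector',
    bash_command=(
        f"source {VIRTUAL_ENV_ACTIVATION} && "
        f"cd {FACEBOOK_CONNECTOR_DIR} && {CMD_FACEBOOK_CONNECTOR}"
    ),
    dag=dag,
)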
Code Example #3
import os

from airflow import models
from airflow.providers.amazon.aws.transfers.mongo_to_s3 import MongoToS3Operator
from airflow.utils.dates import datetime

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")
MONGO_DATABASE = os.environ.get("MONGO_DATABASE", "Test")
MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION", "Test")

with models.DAG(
        "example_mongo_to_s3",
        start_date=datetime(2021, 1, 1),
        catchup=False,
) as dag:
    # [START howto_transfer_mongo_to_s3]
    create_local_to_s3_job = MongoToS3Operator(
        task_id="create_mongo_to_s3_job",
        mongo_collection=MONGO_COLLECTION,
        # Mongo query by matching values:
        # returns all documents whose "status" key has the value "OK"
        mongo_query={"status": "OK"},
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
        mongo_db=MONGO_DATABASE,
        replace=True,
    )
    # [END howto_transfer_mongo_to_s3]
Code Example #4
from datetime import timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import datetime

# We're hardcoding this value here for the purpose of the demo, but in a production environment this
# would probably come from a config file and/or environment variables!
DBT_PROJECT_DIR = '/usr/local/airflow/dbt'

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    "owner": "astronomer",
    "depends_on_past": False,
    "start_date": datetime(2020, 12, 23),
    "email": ["*****@*****.**"],
    "email_on_failure": False
}

dag = DAG(
    "dbt_basic_dag",
    default_args=default_args,
    description="A sample Airflow DAG to invoke dbt runs using a BashOperator",
    schedule_interval=None,
    catchup=False,
)

with dag:
    # This task loads the CSV files from dbt/data into the local postgres database for the purpose of this demo.
    # In practice, we'd usually expect the data to have already been loaded to the database.
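    # Hedged sketch of that load task: the original snippet is truncated here,
    # so the task id and the exact `dbt seed` invocation are assumptions.
    dbt_seed = BashOperator(
        task_id="dbt_seed",
        bash_command=f"dbt seed --profiles-dir {DBT_PROJECT_DIR} --project-dir {DBT_PROJECT_DIR}",
    )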
Code Example #5
from datetime import timedelta
from pathlib import Path
import pickle
import json

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import datetime

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_dag_args = {
    "start_date": datetime(2020, 11, 24),
    "retry_delay": timedelta(minutes=10),
    "depends_on_past": False,
    "retries": 0,
}
DAG_NAME = "standard_schedule"
DBT_DIR = "/usr/local/airflow/data-cicd"
DBT_SELECTOR_PICKLE_DIR = "/usr/local/airflow/include/data"
GLOBAL_CLI_FLAGS = "--no-write-json"

dag = DAG(
    dag_id=f"dbt_{DAG_NAME}",
    schedule_interval="@daily",
    max_active_runs=1,
    catchup=False,
    default_args=default_dag_args,
)
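
# Hedged sketch of a first task for this DAG; the original snippet stops at the
# DAG definition, so the task id and the dbt command below are assumptions.
dbt_run = BashOperator(
    task_id="dbt_run",
    bash_command=f"cd {DBT_DIR} && dbt {GLOBAL_CLI_FLAGS} run",
    dag=dag,
)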
Code Example #6
import os

from airflow import models
from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator
from airflow.utils.dates import datetime

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")

with models.DAG(
    "example_local_to_s3",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),  # Override to match your needs
) as dag:
    # [START howto_local_transfer_data_to_s3]
    create_local_to_s3_job = LocalFilesystemToS3Operator(
        task_id="create_local_to_s3_job",
        filename="relative/path/to/file.csv",
        dest_key=S3_KEY,
        dest_bucket=S3_BUCKET,
    )

    create_local_to_s3_job
    # [END howto_local_transfer_data_to_s3]
Code Example #7
from airflow import DAG, AirflowException
from airflow.operators.dummy_operator import DummyOperator
from airflow.providers.google.cloud.sensors.bigquery import BigQueryTableExistenceSensor
from airflow.providers.google.cloud.operators.bigquery import BigQueryValueCheckOperator
from fivetran_provider.operators.fivetran import FivetranOperator
from fivetran_provider.sensors.fivetran import FivetranSensor
from airflow.utils.dates import datetime

TABLE = 'forestfires'
DATASET = 'google_sheets'
# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'astronomer',
    'depends_on_past': False,
    'start_date': datetime(2021, 7, 7),
    'email': ['*****@*****.**'],
    'email_on_failure': False
}

with DAG('example_fivetran_bigquery',
         default_args=default_args,
         description='',
         schedule_interval=None,
         catchup=False) as dag:
    """
    ### Simple EL Pipeline with Data Integrity and Quality Checks
    Before running the DAG, set the following in an Airflow or Environment Variables:
    - key: gcp_project_id
      value: [gcp_project_id]
    - key: connector_id
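      value: [connector_id]
    """
    # Hedged sketch of the Fivetran trigger-and-wait pattern the imports above
    # suggest; the task ids, connection id, Jinja-templated connector id, and
    # poke interval are assumptions rather than part of the original example.
    fivetran_sync_start = FivetranOperator(
        task_id='fivetran-task',
        fivetran_conn_id='fivetran_default',
        connector_id="{{ var.value.connector_id }}",
    )
    fivetran_sync_wait = FivetranSensor(
        task_id='fivetran-sensor',
        fivetran_conn_id='fivetran_default',
        connector_id="{{ var.value.connector_id }}",
        poke_interval=5,
    )
    fivetran_sync_start >> fivetran_sync_wait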
Code Example #8
import pendulum
from datetime import datetime, timedelta

from airflow import DAG
from airflow.providers.postgres.hooks.postgres import PostgresHook
from sqlalchemy import create_engine

# Set the timezone to US Pacific
local_tz = pendulum.timezone("US/Pacific")
# Target database name
db_name = "userdata"
# The New York State health API endpoint to call
NY_API = "https://health.data.ny.gov/resource/xdss-u53e.json?"

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'Anil',
    'dag_id': 'LOAD_NY_COVID_DLY',
    'start_date': datetime(2020, 3, 1, tzinfo=local_tz),
    'schedule_interval': '0 9 * * *'
}

# Use the Postgres hook to get the connection URI, then swap in the right database name
result = PostgresHook(postgres_conn_id='postgres_new').get_uri().split("/")
result[3] = db_name
dbURI = "/".join(result)

with DAG('LOAD_NY_COVID_DLY',
         default_args=default_args,
         catchup=False,
         template_searchpath='/opt/airflow/') as dag:

    @dag.task
    def getTodayDate():
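        # Hedged completion: the original body is truncated here, so returning
        # today's Pacific-time date string (used to filter the daily NY API
        # pull) is an assumption.
        return pendulum.now(local_tz).to_date_string()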
Code Example #9
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import datetime

default_args = {
    'owner': 'owner_name',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'dbt_dag',
    default_args=default_args,
    description='DAG to invoke dbt commands',
    schedule_interval=timedelta(days=1),
)

dbt_run = BashOperator(
    task_id='dbt_run',
    bash_command='dbt run',
    dag=dag
)
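
# Hedged follow-on sketch: a `dbt test` step chained after the run. The task id
# and the ordering are assumptions, not part of the original example.
dbt_test = BashOperator(
    task_id='dbt_test',
    bash_command='dbt test',
    dag=dag
)

dbt_run >> dbt_test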
Code Example #10
from airflow import DAG
from airflow.utils.dates import datetime
from airflow.providers.postgres.hooks.postgres import PostgresHook
from airflow.decorators import dag, task
from airflow.operators.python import get_current_context
from airflow.operators.bash import BashOperator
import pendulum
from pandas import DataFrame, json_normalize
import pandas as pd
import requests
from sqlalchemy import create_engine
from datetime import timedelta

local_tz = pendulum.timezone("US/Pacific")

default_args = {
    'owner': 'Anil',
    'start_date': datetime(2021, 2, 10, tzinfo=local_tz),
    'schedule_interval': '0 9 * * *',
    'email': '*****@*****.**'
}

dag = DAG(dag_id='TEST_BACKFILL_DAG', default_args=default_args, catchup=True)

run_this = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }} | ds={{ ds }}"',
    depends_on_past=True,
    dag=dag)