def read_dag_yaml(self) -> Dict:
    # Resolve the start date either as a relative "days ago" offset or as an absolute date.
    if self.dag_yaml_dict.get('start_date_type') == 'days_ago':
        start_date = days_ago(self.dag_yaml_dict.get('start_date'))
    else:
        start_year = self.dag_yaml_dict.get('start_date').year
        start_month = self.dag_yaml_dict.get('start_date').month
        start_day = self.dag_yaml_dict.get('start_date').day
        start_date = datetime(int(start_year), int(start_month), int(start_day))

    # Build the schedule interval from the configured unit, falling back to a daily cron expression.
    if self.dag_yaml_dict.get('schedule_type') == 'minute':
        schedule_interval = timedelta(
            minutes=self.dag_yaml_dict.get('schedule_interval'))
    elif self.dag_yaml_dict.get('schedule_type') == 'hour':
        schedule_interval = timedelta(
            hours=self.dag_yaml_dict.get('schedule_interval'))
    elif self.dag_yaml_dict.get('schedule_type') == 'day':
        schedule_interval = timedelta(
            days=self.dag_yaml_dict.get('schedule_interval'))
    else:
        schedule_interval = self.dag_yaml_dict.get('schedule_interval', '0 0 * * *')

    dag_defn = {
        'dag_dir_path': self.dag_yaml_dict.get('root'),
        'dag_name': self.dag_yaml_dict.get('dag_name'),
        'catchup': self.dag_yaml_dict.get('catchup'),
        'default_args': {
            'owner': self.dag_yaml_dict.get('owner', 'airflow'),
            'depends_on_past': self.dag_yaml_dict.get('depends_on_past', False),
            'start_date': start_date,
            'email': self.dag_yaml_dict.get('email'),
            'email_on_failure': self.dag_yaml_dict.get('email_on_failure'),
            'email_on_retry': self.dag_yaml_dict.get('email_on_retry'),
            'retries': self.dag_yaml_dict.get('retries'),
            'retry_delay': timedelta(
                minutes=int(self.dag_yaml_dict.get('retry_delay_mins')))
        },
        'schedule_interval': schedule_interval
    }
    return dag_defn

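# Hedged usage sketch (not part of the original snippet): one way the dict returned by
# read_dag_yaml() might be turned into a DAG. The build_dag helper name is an assumption.
from airflow import DAG

def build_dag(dag_defn) -> DAG:
    # Unpack the parsed YAML definition into a DAG object.
    return DAG(
        dag_defn['dag_name'],
        default_args=dag_defn['default_args'],
        schedule_interval=dag_defn['schedule_interval'],
        catchup=dag_defn['catchup'],
    )
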
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'mibrahimbila',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 23),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    # 'retries': 1,
    # 'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'facebook_connector',
    default_args=default_args,
    description='An Airflow DAG to run the Facebook connector',
    schedule_interval='@once',
)

VIRTUAL_ENV_ACTIVATION = "/home/klox-dev/.venv/bin/activate"
CMD_FACEBOOK_CONNECTOR = "python main.py "
FACEBOOK_CONNECTOR_DIR = "/home/klox-dev/facebook-api-python-test"

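# Hedged sketch (not in the original snippet): a BashOperator that activates the virtualenv
# and runs the connector from its project directory, wiring together the constants above.
# The task_id and the exact bash_command composition are assumptions.
run_facebook_connector = BashOperator(
    task_id='run_facebook_connector',
    bash_command=(
        f'source {VIRTUAL_ENV_ACTIVATION} && '
        f'cd {FACEBOOK_CONNECTOR_DIR} && '
        f'{CMD_FACEBOOK_CONNECTOR}'
    ),
    dag=dag,
)
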
import os
from datetime import datetime

from airflow import models
from airflow.providers.amazon.aws.transfers.mongo_to_s3 import MongoToS3Operator

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")
MONGO_DATABASE = os.environ.get("MONGO_DATABASE", "Test")
MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION", "Test")

with models.DAG(
    "example_mongo_to_s3",
    start_date=datetime(2021, 1, 1),
    catchup=False,
) as dag:
    # [START howto_transfer_mongo_to_s3]
    create_local_to_s3_job = MongoToS3Operator(
        task_id="create_mongo_to_s3_job",
        mongo_collection=MONGO_COLLECTION,
        # Mongo query by matching values
        # Here returns all documents which have "OK" as value for the key "status"
        mongo_query={"status": "OK"},
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
        mongo_db=MONGO_DATABASE,
        replace=True,
    )
    # [END howto_transfer_mongo_to_s3]

from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# We're hardcoding this value here for the purpose of the demo, but in a production environment this
# would probably come from a config file and/or environment variables!
DBT_PROJECT_DIR = '/usr/local/airflow/dbt'

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    "owner": "astronomer",
    "depends_on_past": False,
    "start_date": datetime(2020, 12, 23),
    "email": ["*****@*****.**"],
    "email_on_failure": False
}

dag = DAG(
    "dbt_basic_dag",
    default_args=default_args,
    description="A sample Airflow DAG to invoke dbt runs using a BashOperator",
    schedule_interval=None,
    catchup=False,
)

with dag:
    # This task loads the CSV files from dbt/data into the local postgres database for the purpose of this demo.
    # In practice, we'd usually expect the data to have already been loaded to the database.
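    # Hedged completion (not in the original snippet): the task described by the comment above
    # was truncated. Loading the CSVs under dbt/data would plausibly be a `dbt seed`; the
    # task_id and exact command flags here are assumptions.
    dbt_seed = BashOperator(
        task_id="dbt_seed",
        bash_command=f"dbt seed --project-dir {DBT_PROJECT_DIR}",
    )
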
import json
import pickle
from datetime import datetime, timedelta
from pathlib import Path

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_dag_args = {
    "start_date": datetime(2020, 11, 24),
    "retry_delay": timedelta(minutes=10),
    "depends_on_past": False,
    "retries": 0,
}

DAG_NAME = "standard_schedule"
DBT_DIR = "/usr/local/airflow/data-cicd"
DBT_SELECTOR_PICKLE_DIR = "/usr/local/airflow/include/data"
GLOBAL_CLI_FLAGS = "--no-write-json"

dag = DAG(
    dag_id=f"dbt_{DAG_NAME}",
    schedule_interval="@daily",
    max_active_runs=1,
    catchup=False,
    default_args=default_dag_args,
)

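# Hedged sketch (not in the original snippet): a dbt run task wired to the constants above.
# The task_id and the exact dbt command are assumptions about how this DAG continues.
dbt_run = BashOperator(
    task_id="dbt_run",
    bash_command=f"dbt {GLOBAL_CLI_FLAGS} run --project-dir {DBT_DIR}",
    dag=dag,
)
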
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os from airflow import models from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator from airflow.utils.dates import datetime S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket") S3_KEY = os.environ.get("S3_KEY", "key") with models.DAG( "example_local_to_s3", schedule_interval=None, start_date=datetime(2021, 1, 1), # Override to match your needs ) as dag: # [START howto_local_transfer_data_to_s3] create_local_to_s3_job = LocalFilesystemToS3Operator( task_id="create_local_to_s3_job", filename="relative/path/to/file.csv", dest_key=S3_KEY, dest_bucket=S3_BUCKET, ) create_local_to_s3_job # [END howto_local_transfer_data_to_s3]
from datetime import datetime

from airflow import DAG, AirflowException
from airflow.operators.dummy_operator import DummyOperator
from airflow.providers.google.cloud.sensors.bigquery import BigQueryTableExistenceSensor
from airflow.providers.google.cloud.operators.bigquery import BigQueryValueCheckOperator
from fivetran_provider.operators.fivetran import FivetranOperator
from fivetran_provider.sensors.fivetran import FivetranSensor

TABLE = 'forestfires'
DATASET = 'google_sheets'

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'astronomer',
    'depends_on_past': False,
    'start_date': datetime(2021, 7, 7),
    'email': ['*****@*****.**'],
    'email_on_failure': False
}

with DAG('example_fivetran_bigquery',
         default_args=default_args,
         description='',
         schedule_interval=None,
         catchup=False) as dag:
    """
    ### Simple EL Pipeline with Data Integrity and Quality Checks
    Before running the DAG, set the following as Airflow Variables or environment variables:
    - key: gcp_project_id
      value: [gcp_project_id]
    - key: connector_id

import pendulum
from datetime import datetime, timedelta

from airflow import DAG
from airflow.providers.postgres.hooks.postgres import PostgresHook
from sqlalchemy import create_engine

# Setting timezone to pacific
local_tz = pendulum.timezone("US/Pacific")

# Setting database name
db_name = "userdata"

# The API that we need to call
NY_API = "https://health.data.ny.gov/resource/xdss-u53e.json?"

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'Anil',
    'dag_id': 'LOAD_NY_COVID_DLY',
    'start_date': datetime(2020, 3, 1, tzinfo=local_tz),
    'schedule_interval': '0 9 * * *'
}

# Use the Postgres hook to get the connection URI and rewrite it to point at the right database name
result = PostgresHook(postgres_conn_id='postgres_new').get_uri().split("/")
result[3] = db_name
dbURI = "/".join(result)

with DAG('LOAD_NY_COVID_DLY',
         default_args=default_args,
         catchup=False,
         template_searchpath='/opt/airflow/') as dag:

    @dag.task
    def getTodayDate():
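        # Hedged completion (not in the original snippet): getTodayDate was truncated above.
        # A plausible body simply returns today's date in the Pacific timezone as a string;
        # the exact date format is an assumption.
        return pendulum.today(local_tz).strftime("%Y-%m-%d")
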
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'owner_name',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'dbt_dag',
    default_args=default_args,
    description='DAG to invoke dbt commands',
    schedule_interval=timedelta(days=1),
)

dbt_run = BashOperator(
    task_id='dbt_run',
    bash_command='dbt run',
    dag=dag
)

import pendulum
from datetime import datetime, timedelta

import pandas as pd
import requests
from pandas import DataFrame, json_normalize
from sqlalchemy import create_engine

from airflow import DAG
from airflow.decorators import dag, task
from airflow.operators.bash import BashOperator
from airflow.operators.python import get_current_context
from airflow.providers.postgres.hooks.postgres import PostgresHook

local_tz = pendulum.timezone("US/Pacific")

default_args = {
    'owner': 'Anil',
    'start_date': datetime(2021, 2, 10, tzinfo=local_tz),
    'schedule_interval': '0 9 * * *',
    'email': '*****@*****.**'
}

dag = DAG(dag_id='TEST_BACKFILL_DAG', default_args=default_args, catchup=True)

run_this = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }} | ds={{ ds }}"',
    depends_on_past=True,
    dag=dag)