Example #1
    def read_dag_yaml(self) -> Dict:
        """Translate the parsed YAML configuration into a DAG definition dict
        (start date, schedule interval and default_args)."""
        if self.dag_yaml_dict.get('start_date_type') == 'days_ago':
            start_date = days_ago(self.dag_yaml_dict.get('start_date'))
        else:
            start_year = self.dag_yaml_dict.get('start_date').year
            start_month = self.dag_yaml_dict.get('start_date').month
            start_day = self.dag_yaml_dict.get('start_date').day

            start_date = datetime(int(start_year), int(start_month),
                                  int(start_day))
        if self.dag_yaml_dict.get('schedule_type') == 'minute':
            schedule_interval = timedelta(
                minutes=self.dag_yaml_dict.get('schedule_interval'))
        elif self.dag_yaml_dict.get('schedule_type') == 'hour':
            schedule_interval = timedelta(
                hours=self.dag_yaml_dict.get('schedule_interval'))
        elif self.dag_yaml_dict.get('schedule_type') == 'day':
            schedule_interval = timedelta(
                days=self.dag_yaml_dict.get('schedule_interval'))
        else:
            schedule_interval = self.dag_yaml_dict.get('schedule_interval',
                                                       '0 0 * * *')

        dag_defn = {
            'dag_dir_path': self.dag_yaml_dict.get('root'),
            'dag_name': self.dag_yaml_dict.get('dag_name'),
            'catchup': self.dag_yaml_dict.get('catchup'),
            'default_args': {
                'owner': self.dag_yaml_dict.get('owner', 'airflow'),
                'depends_on_past': self.dag_yaml_dict.get('depends_on_past', False),
                'start_date': start_date,
                'email': self.dag_yaml_dict.get('email'),
                'email_on_failure': self.dag_yaml_dict.get('email_on_failure'),
                'email_on_retry': self.dag_yaml_dict.get('email_on_retry'),
                'retries': self.dag_yaml_dict.get('retries'),
                'retry_delay': timedelta(
                    minutes=int(self.dag_yaml_dict.get('retry_delay_mins'))),
            },
            'schedule_interval': schedule_interval,
        }
        return dag_defn
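For reference, a sketch of the configuration this method expects; the key names come from the lookups above, while the values (and the surrounding class) are only illustrative:

from datetime import date

# Illustrative config (assumed values); with these keys the method builds a
# concrete start_date and falls through to the cron-style schedule branch.
example_yaml_dict = {
    'root': '/usr/local/airflow/dags/my_dag',   # becomes dag_dir_path
    'dag_name': 'my_dag',
    'catchup': False,
    'start_date_type': 'date',                  # anything but 'days_ago'
    'start_date': date(2021, 1, 1),             # needs .year/.month/.day
    'schedule_type': 'cron',                    # not minute/hour/day
    'schedule_interval': '0 6 * * *',
    'owner': 'data-eng',
    'email': ['alerts@example.com'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay_mins': 5,
}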
Example #2
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'mibrahimbila',
    'depends_on_past': False,
    'start_date': datetime(2020, 12, 23),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    # 'retries': 1,
    # 'retry_delay': timedelta(minutes=5)
}
dag = DAG(
    'facebook_connector',
    default_args=default_args,
    description='An Airflow DAG to run the Facebook connector',
    schedule_interval='@once',
)

VIRTUAL_ENV_ACTIVATION = "/home/klox-dev/.venv/bin/activate"

CMD_FACEBOOK_CONNECTOR = "python main.py "

FACEBOOK_CONNECTOR_DIR = "/home/klox-dev/facebook-api-python-test"
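The snippet stops before any task is defined; a minimal sketch (assumed, not part of the original) of how these constants could be wired into a BashOperator on the DAG above:

run_facebook_connector = BashOperator(
    task_id='run_facebook_connector',
    # Activate the virtualenv, move into the connector project and run it
    bash_command=(
        f'source {VIRTUAL_ENV_ACTIVATION} && '
        f'cd {FACEBOOK_CONNECTOR_DIR} && '
        f'{CMD_FACEBOOK_CONNECTOR}'
    ),
    dag=dag,
)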
Example #3

import os

from airflow import models
from airflow.providers.amazon.aws.transfers.mongo_to_s3 import MongoToS3Operator
from datetime import datetime

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")
MONGO_DATABASE = os.environ.get("MONGO_DATABASE", "Test")
MONGO_COLLECTION = os.environ.get("MONGO_COLLECTION", "Test")

with models.DAG(
        "example_mongo_to_s3",
        start_date=datetime(2021, 1, 1),
        catchup=False,
) as dag:
    # [START howto_transfer_mongo_to_s3]
    create_local_to_s3_job = MongoToS3Operator(
        task_id="create_mongo_to_s3_job",
        mongo_collection=MONGO_COLLECTION,
        # Mongo query by matching values
        # Here returns all documents which have "OK" as value for the key "status"
        mongo_query={"status": "OK"},
        s3_bucket=S3_BUCKET,
        s3_key=S3_KEY,
        mongo_db=MONGO_DATABASE,
        replace=True,
    )
    # [END howto_transfer_mongo_to_s3]
Example #4
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# We're hardcoding this value here for the purpose of the demo, but in a production environment this
# would probably come from a config file and/or environment variables!
DBT_PROJECT_DIR = '/usr/local/airflow/dbt'

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    "owner": "astronomer",
    "depends_on_past": False,
    "start_date": datetime(2020, 12, 23),
    "email": ["*****@*****.**"],
    "email_on_failure": False
}

dag = DAG(
    "dbt_basic_dag",
    default_args=default_args,
    description="A sample Airflow DAG to invoke dbt runs using a BashOperator",
    schedule_interval=None,
    catchup=False,
)

with dag:
    # This task loads the CSV files from dbt/data into the local postgres database for the purpose of this demo.
    # In practice, we'd usually expect the data to have already been loaded to the database.
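    # The original example is truncated here. A sketch of the described load
    # step (assumed, not from the source): `dbt seed` loads the CSVs from
    # dbt/data into the local Postgres database.
    dbt_seed = BashOperator(
        task_id="dbt_seed",
        bash_command=f"cd {DBT_PROJECT_DIR} && dbt seed",
    )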
Example #5
from datetime import datetime, timedelta
from pathlib import Path
import pickle
import json

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_dag_args = {
    "start_date": datetime(2020, 11, 24),
    "retry_delay": timedelta(minutes=10),
    "depends_on_past": False,
    "retries": 0,
}
DAG_NAME = "standard_schedule"
DBT_DIR = "/usr/local/airflow/data-cicd"
DBT_SELECTOR_PICKLE_DIR = "/usr/local/airflow/include/data"
GLOBAL_CLI_FLAGS = "--no-write-json"

dag = DAG(
    dag_id=f"dbt_{DAG_NAME}",
    schedule_interval="@daily",
    max_active_runs=1,
    catchup=False,
    default_args=default_dag_args,
)
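This example also stops right after the DAG object is created; a minimal sketch of a task the constants above hint at (assumed, not part of the original), running dbt with the global CLI flags from inside the project directory:

with dag:
    dbt_run = BashOperator(
        task_id="dbt_run",
        # Global flags go before the subcommand, e.g. `dbt --no-write-json run`
        bash_command=f"cd {DBT_DIR} && dbt {GLOBAL_CLI_FLAGS} run",
    )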
Example #6

import os

from airflow import models
from airflow.providers.amazon.aws.transfers.local_to_s3 import LocalFilesystemToS3Operator
from datetime import datetime

S3_BUCKET = os.environ.get("S3_BUCKET", "test-bucket")
S3_KEY = os.environ.get("S3_KEY", "key")

with models.DAG(
    "example_local_to_s3",
    schedule_interval=None,
    start_date=datetime(2021, 1, 1),  # Override to match your needs
) as dag:
    # [START howto_local_transfer_data_to_s3]
    create_local_to_s3_job = LocalFilesystemToS3Operator(
        task_id="create_local_to_s3_job",
        filename="relative/path/to/file.csv",
        dest_key=S3_KEY,
        dest_bucket=S3_BUCKET,
    )

    create_local_to_s3_job
    # [END howto_local_transfer_data_to_s3]
Example #7
from airflow import DAG, AirflowException
from airflow.operators.dummy_operator import DummyOperator
from airflow.providers.google.cloud.sensors.bigquery import BigQueryTableExistenceSensor
from airflow.providers.google.cloud.operators.bigquery import BigQueryValueCheckOperator
from fivetran_provider.operators.fivetran import FivetranOperator
from fivetran_provider.sensors.fivetran import FivetranSensor
from datetime import datetime

TABLE = 'forestfires'
DATASET = 'google_sheets'
# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'astronomer',
    'depends_on_past': False,
    'start_date': datetime(2021, 7, 7),
    'email': ['*****@*****.**'],
    'email_on_failure': False
}

with DAG('example_fivetran_bigquery',
         default_args=default_args,
         description='',
         schedule_interval=None,
         catchup=False) as dag:
    """
    ### Simple EL Pipeline with Data Integrity and Quality Checks
    Before running the DAG, set the following in an Airflow or Environment Variables:
    - key: gcp_project_id
      value: [gcp_project_id]
    - key: connector_id
Example #8
import pendulum
from datetime import datetime, timedelta

from sqlalchemy import create_engine

from airflow import DAG
from airflow.providers.postgres.hooks.postgres import PostgresHook

# Setting timezone to pacific
local_tz = pendulum.timezone("US/Pacific")
# Setting database name
db_name = "userdata"
# The api that we need to call
NY_API = "https://health.data.ny.gov/resource/xdss-u53e.json?"

# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'Anil',
    'start_date': datetime(2020, 3, 1, tzinfo=local_tz),
}

# Using the Postgres hook to get the connection URI and swap in the right database name
result = PostgresHook(postgres_conn_id='postgres_new').get_uri().split("/")
result[3] = db_name
dbURI = "/".join(result)

# dag_id and schedule_interval are DAG-level arguments, so they are set here
# rather than in default_args
with DAG('LOAD_NY_COVID_DLY',
         default_args=default_args,
         schedule_interval='0 9 * * *',
         catchup=False,
         template_searchpath='/opt/airflow/') as dag:

    @dag.task
    def getTodayDate():
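        # The snippet is cut off here; a plausible body (assumed, not from the
        # source) returns today's date in the Pacific timezone as YYYY-MM-DD,
        # ready to be used as a date filter against the NY health API.
        return pendulum.now(local_tz).to_date_string()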
Example #9
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator

default_args = {
    'owner': 'owner_name',
    'depends_on_past': False,
    'start_date': datetime(2021, 1, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG(
    'dbt_dag',
    default_args=default_args,
    description='DAG to invoke dbt commands',
    schedule_interval=timedelta(days=1),
)

dbt_run = BashOperator(
    task_id='dbt_run',
    bash_command='dbt run',
    dag=dag
)
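A common follow-up, not present in the original, is a test task chained after the run; a minimal sketch:

dbt_test = BashOperator(
    task_id='dbt_test',
    bash_command='dbt test',
    dag=dag
)

# Run the models first, then run the dbt tests against them
dbt_run >> dbt_test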
Example #10
from datetime import datetime, timedelta

import pendulum
import requests
import pandas as pd
from pandas import DataFrame, json_normalize
from sqlalchemy import create_engine

from airflow import DAG
from airflow.decorators import dag, task
from airflow.operators.bash import BashOperator
from airflow.operators.python import get_current_context
from airflow.providers.postgres.hooks.postgres import PostgresHook

local_tz = pendulum.timezone("US/Pacific")

default_args = {
    'owner': 'Anil',
    'start_date': datetime(2021, 2, 10, tzinfo=local_tz),
    'email': '*****@*****.**'
}

# schedule_interval is a DAG-level argument, so it is set here rather than in default_args
dag = DAG(dag_id='TEST_BACKFILL_DAG',
          default_args=default_args,
          schedule_interval='0 9 * * *',
          catchup=True)

run_this = BashOperator(
    task_id='also_run_this',
    bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }} | ds={{ ds }}"',
    depends_on_past=True,
    dag=dag,
)