def daily_query():
    """
    Construct query parameters that select records modified since
    one day before the time the query is run.
    """
    return {
        "date_modified__gte": datetime.isoformat(
            datetime.now(timezone.utc) - timedelta(days=1)
        )
    }
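# Illustrative usage only: the base URL below is a placeholder, not taken
# from the original code, and the imports daily_query() relies on are shown
# explicitly here.
from datetime import datetime, timedelta, timezone
import requests

params = daily_query()
# -> {'date_modified__gte': '2021-03-08T12:00:00+00:00'} (example value)
resp = requests.get('https://example.com/api/opinions/', params=params)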
from airflow import DAG  # needed for the `with DAG(...)` block below
from airflow.providers.odbc.hooks.odbc import OdbcHook
from airflow.providers.mongo.hooks.mongo import MongoHook
from airflow.contrib.hooks.fs_hook import FSHook
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.utils.dates import days_ago
from airflow.models import Variable
from mongo_plugin.operators.csv_to_mongo_operator import CsvToMongoOperator
from utils import utils
from datetime import datetime, timedelta  # timedelta belongs in datetime, not airflow.utils.dates
import pyodbc
import pandas as pd
import numpy as np
import os

yesterday_date = datetime.strftime(datetime.now() - timedelta(1), '%Y-%m-%d')

default_args = {
    'owner': 'airflow',
    'start_date': datetime(2021, 3, 9),
    'retries': 1,
    'retry_delay': timedelta(seconds=10)
}

with DAG('flowback_data_ingestion_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    check_file_exists = FileSensor(
        task_id='check_file_exists',
from datetime import timedelta  # timedelta belongs in datetime, not airflow.utils.dates

VOLUME = '/abs/path/data:/data'

default_args = {
    'owner': 'airflow',
    'email': ['*****@*****.**'],
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'email_on_failure': True
}
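# The host:container VOLUME string above suggests a DockerOperator bind
# mount. A minimal sketch of how it might be used -- the image name and
# command are placeholders, and passing bind mounts as 'host:container'
# strings via `volumes=` assumes the Airflow 1.10-era operator (newer
# docker providers take `mounts=[docker.types.Mount(...)]` instead).
from airflow.operators.docker_operator import DockerOperator

process_data = DockerOperator(
    task_id='process_data',
    image='my-etl-image:latest',    # placeholder image
    command='python /data/run.py',  # placeholder command
    volumes=[VOLUME],               # bind-mounts /abs/path/data into the container
    auto_remove=True,
)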
import json
from datetime import datetime, timedelta  # not importable from airflow.utils.dates

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.task_group import TaskGroup

default_args = {
    "owner": "astronomer",
    "depends_on_past": False,
    "start_date": datetime(2020, 12, 23),
    "email": ["*****@*****.**"],
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}

DBT_DIR = "/usr/local/airflow/data-cicd"

dag = DAG(
    "dbt_advanced_dag",
    default_args=default_args,
    description="A dbt wrapper for airflow",
    schedule_interval=timedelta(days=1),
)


def load_manifest():
    manifest_filepath = f"{DBT_DIR}/target/manifest.json"
    with open(manifest_filepath) as f:
        data = json.load(f)
    return data
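# A sketch of how the loaded manifest is typically turned into per-model
# dbt tasks (the node filtering and `dbt run --models` command follow the
# common Astronomer pattern; the exact selectors are an assumption, not
# taken from the original file).
data = load_manifest()

dbt_tasks = {}
for node_id, node in data["nodes"].items():
    if node["resource_type"] == "model":
        dbt_tasks[node_id] = BashOperator(
            task_id=node_id.replace(".", "_"),
            bash_command=f"cd {DBT_DIR} && dbt run --models {node['name']}",
            dag=dag,
        )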
    try:
        with connection.connect() as conn:
            conn.execute(f"DELETE FROM {TABLE_NAME};")
            print(f"data removed {repr(data_size)} rows")
    finally:
        connection.dispose()
    return "[+] data removing task completed"


with DAG(
    dag_id="df_to_postgres_sqlalchemy",
    description=f'Load data to postgres table {repr("boliga")}',
    default_args=args,
    start_date=(datetime.now() - timedelta(minutes=10)),  # Start 10 minutes ago # days_ago(2)
    schedule_interval="*/10 * * * *",
) as dag:
    load_dataframe = PythonOperator(
        task_id="load_data_with_sqlalchemy",
        python_callable=load_data,
        dag=dag,
        provide_context=True,
    )

    remove_dataframe = PythonOperator(
        task_id="remove_data_after_50_rows",
        dag=dag,
        python_callable=remove_data,
        provide_context=True,
# just some globals to keep around
START_OP_ID = "start_op"
EXTRACT_OP_ID = "extract_op_{}"
TRANSFORM_OP_ID = "transform_op_{}"
DAG_ID = "CourtListener_Daily"
NUMBER_OF_BATCHES = 5
SKIP_KEY = "skip"

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 5,
    "retry_delay": timedelta(seconds=25),
    "number_of_batches": 5,
    "log_response": True,
    # 'execution_timeout': timedelta(seconds=300),
}

daily_paging_args = {
    "parser": parser,
    "check_more": check_more,
    "next_page": next_page,
    "response_count": response_count,
    "response_valid": response_valid,
    "query_builder": daily_query,
    "endpoint": "opinions",
    "http_conn_id": "default_api",
    "mongo_conn_id": "default_mongo",
from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta  # timedelta belongs in datetime, not airflow.utils.dates
import random

args = {
    "owner": "dennislau",
    "start_date": days_ago(1)
}

dag = DAG(dag_id='simple_dag', default_args=args, schedule_interval=None)


def run_this_func(**context):
    print("hello world!")


def always_fail(**context):
    raise Exception('Exception')


def random_fail(**context):
    if random.random() > 0.7:
        raise Exception('random exception')
    else:
        print('passed!')


with dag:
    task_1 = PythonOperator(task_id='run_this_1',
                            python_callable=random_fail,
                            provide_context=True,
                            retries=10,
                            retry_delay=timedelta(seconds=2))
    task_2 = PythonOperator(task_id='run_this_2',
                            python_callable=run_this_func,
                            provide_context=True)
    task_1 >> task_2
def notify_flow():
    """
    DAG for scheduled sending of email notifications about yesterday's
    Beltelecom topics.

    DAG has 4 tasks:
    - get_recipients: takes no params, returns a tuple of recipients
    - get_topics_count: takes a url, returns topics_count
    - get_pdf: takes a url, returns a pickle of the binary pdf
    - email_send: takes recips, topics_count, pdf=None, returns None
    """

    @task(default_args=default_args, execution_timeout=timedelta(seconds=10))
    def get_recipients():
        """
        Get notification recipients from the Topics DB.
        :return: tuple
        """
        from sqlalchemy import create_engine, MetaData, Table
        from sqlalchemy.orm import sessionmaker

        SQL_DATABASE = os.environ.get('SQL_DATABASE_REMOTE')
        SQL_USER = os.environ.get('SQL_USER_REMOTE')
        SQL_PASSWORD = os.environ.get('SQL_PASSWORD_REMOTE')
        SQL_HOST = os.environ.get('SQL_HOST_REMOTE')
        SQL_PORT = os.environ.get('SQL_PORT_REMOTE')
        DB_STRING = ('postgresql+psycopg2://' + SQL_USER + ':' + SQL_PASSWORD
                     + '@' + SQL_HOST + ':' + SQL_PORT + '/' + SQL_DATABASE)

        engine = create_engine(DB_STRING, encoding='utf-8', echo=False)
        metadata = MetaData(engine)
        metadata.reflect(only=['forumTopics_employees'])
        table = Table('forumTopics_employees', metadata, autoload=True)
        Session = sessionmaker(bind=engine)
        session = Session()
        result = session.query(table.c.lastName,
                               table.c.firstName,
                               table.c.patronymic,
                               table.c.email
                               ).filter(table.c.isActive == '1').all()
        session.close()
        if len(result) < 1:
            raise Exception('No active recipients')
        return tuple(result)

    @task(default_args=default_args, execution_timeout=timedelta(seconds=10))
    def get_topics_count(url):
        """
        Get topics count from the forums API.
        :param url: str
        :return: int
        """
        today = pendulum.now(local_tz)
        yesterday = today - timedelta(days=1)
        date_str = yesterday.strftime('%Y-%m-%d')
        req = requests.models.PreparedRequest()
        req.prepare_url(url, {'date': date_str})
        response = requests.get(req.url)
        if response.status_code != 200:
            raise requests.HTTPError(f'forums API returned {response.status_code}')
        return response.json()['count']

    @task(default_args=default_args)
    def get_pdf(url):
        """
        Fetch the PDF file to attach.
        :param url: str
        :return: pickled pdf bytes
        """
        response = requests.get(url)
        if response.status_code != 200:
            # skip only the attachment if the PDF API is down; email_send
            # still runs thanks to its NONE_FAILED trigger rule
            raise AirflowSkipException
        return pickle.dumps(response.content)

    @task(default_args=default_args, trigger_rule=TriggerRule.NONE_FAILED)
    def email_send(recips, topics_count, pdf=None):
        """
        Send notification emails to active recipients.
        :param recips: tuple of (lastName, firstName, patronymic, email) rows
        :param topics_count: int
        :param pdf: pickled pdf bytes, optional
        :return: None
        """
        with open('/opt/airflow/common/templates/topics_email.html') as f:
            html = f.read()
        template = Template(html)

        today = pendulum.now(local_tz)
        yesterday = today - timedelta(days=1)
        date_string = format_date(yesterday, format='long', locale='ru')

        host = os.environ.get('DJANGO_HOST')
        port = os.environ.get('DJANGO_PORT', '80')
        url = 'http://' + host
        if port != '80':
            url += ':' + port  # non-default port must appear in the link

        subject = 'Отзывы о Белтелеком за ' + yesterday.strftime('%d.%m.%Y')

        # Russian pluralization of "отзыв": counts ending in 11-14 always
        # take the genitive plural, otherwise the last digit decides
        review = 'отзывов'
        left = 'оставлено'
        last_digit = topics_count % 10
        if topics_count % 100 not in (11, 12, 13, 14):
            if last_digit == 1:
                review = 'отзыв'
                left = 'оставлен'
            elif last_digit in (2, 3, 4):
                review = 'отзыва'

        files_paths = []
        if pdf:
            pdf_bytes = pickle.loads(pdf)
            with open('forums.pdf', 'wb') as file:
                file.write(pdf_bytes)  # write the unpickled bytes, not the pickle blob
            files_paths.append('forums.pdf')

        template_context = {
            'left': left,
            'review': review,
            'topicsCount': topics_count,
            'dateString': date_string,
            'url': url,
        }
        content = template.render(template_context)

        for index, recip in enumerate(recips):
            email_operator = email.EmailOperator(
                task_id='send_email_' + str(index),
                to=[recip[3]],
                subject=subject,
                files=files_paths,
                html_content=content,
            )
            # EmailOperator.execute() does not use the context it receives;
            # the operator is run inline within this task
            email_operator.execute(context=template_context)

    recipients = get_recipients()
    topics_count = get_topics_count(os.environ.get('URL_TOPICS_COUNT'))
    pdf = get_pdf(os.environ.get('URL_PDF_API'))
    email_send(recipients, topics_count, pdf)
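# Not shown in this excerpt: a DAG built with the TaskFlow @dag decorator
# (applied to notify_flow in the next snippet) is only registered once the
# decorated function is called at module level, typically:
notify_dag = notify_flow()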
from jinja2 import Template
import pendulum

local_tz = pendulum.timezone(os.environ.get('TIMEZONE'))

default_args = {
    'owner': os.environ.get('OWNER'),
    'email_on_failure': True,
    'email': os.environ.get('ALERT_EMAIL'),
    'retries': 1,
    'depends_on_past': False,
}


@dag(
    default_args=default_args,
    schedule_interval=timedelta(days=int(os.environ.get('INTERVAL'))),
    start_date=days_ago(1,
                        int(os.environ.get('START_HOUR')),
                        int(os.environ.get('START_MINUTE'))),
    catchup=False,
    tags=['topics_notify'],
)
def notify_flow():
    """
    DAG for scheduled sending of email notifications about yesterday's
    Beltelecom topics.

    DAG has 4 tasks:
    - get_recipients: takes no params, returns a tuple of recipients
    - get_topics_count: takes a url, returns topics_count