Ejemplo n.º 1
0
    def email_send(recips, topics_count, pdf=None):
        """
            Send notification emails to active recipients
        :param recips: iterable of recipient rows; index 3 must hold the e-mail address
        :param topics_count: number of yesterday's topics (drives Russian pluralization)
        :param pdf: optional pickled bytes of the PDF attachment
        :return: None
        """

        # Load the HTML e-mail template from disk.
        with open('/opt/airflow/common/templates/topics_email.html') as f:
            html = f.read()

        template = Template(html)
        today = pendulum.now(local_tz)
        yesterday = today - timedelta(days=1)
        dateString = format_date(yesterday, format='long', locale='ru')
        host = os.environ.get('DJANGO_HOST')
        port = os.environ.get('DJANGO_PORT', '80')
        # Bug fix: both branches previously built the same URL, silently
        # dropping a non-default port. Append it explicitly.
        if port != '80':
            url = 'http://' + host + ':' + port
        else:
            url = 'http://' + host

        subject = 'Отзывы о Белтелеком за ' + yesterday.strftime('%d.%m.%Y')

        # Russian pluralization: counts ending in 11-14 always take the
        # genitive plural; otherwise the form depends on the last digit.
        topicsCountLastDigit = topics_count % 10
        lastTwoDigits = topics_count % 100
        review = 'отзывов'
        left = 'оставлено'
        if topicsCountLastDigit == 1 and lastTwoDigits != 11:
            review = "отзыв"
            left = 'оставлен'
        elif topicsCountLastDigit in [2, 3, 4] and not 12 <= lastTwoDigits <= 14:
            review = 'отзыва'

        files_pathes = []
        if pdf:
            # Bug fix: write the unpickled binary content, not the raw pickle
            # blob, so the attachment is a readable PDF.
            p = pickle.loads(pdf)
            with open('forums.pdf', 'wb') as file:
                file.write(p)
            files_pathes.append('forums.pdf')

        context = {'left': left,
                   'review': review,
                   'topicsCount': topics_count,
                   'dateString': dateString,
                   'url': url,
                   }

        content = template.render(context)
        for index, recip in enumerate(recips):
            email_operator = email.EmailOperator(
                task_id="send_email_" + str(index),
                to=[recip[3]],
                subject=subject,
                files=files_pathes,
                html_content=content,
            )
            # NOTE(review): the Jinja context dict is passed as the Airflow
            # execution context; EmailOperator.execute normally expects the
            # task-instance context -- confirm this is intentional.
            email_operator.execute(context)
Ejemplo n.º 2
0
def daily_query():
    """
    Build query parameters selecting endpoints modified within the last day.

    Returns a dict with a single ``date_modified__gte`` key whose value is
    the ISO-8601 timestamp of 24 hours before "now" (UTC).
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=1)
    return {"date_modified__gte": cutoff.isoformat()}
Ejemplo n.º 3
0
 def get_topisc_count(url):
     """
         Get topics count from forums API
         (the function name keeps its historical "topisc" typo)
     :param url: str -- base URL of the forums count endpoint
     :return: value of the ``count`` field in the JSON response
     :raises requests.HTTPError: if the API answers with a non-200 status
     """
     # The query targets "yesterday" relative to the configured local timezone.
     today = pendulum.now(local_tz)
     yesterday = today - timedelta(days=1)
     date_str = yesterday.strftime('%Y-%m-%d')
     # PreparedRequest is used only to append the ?date=... query string safely.
     req = requests.models.PreparedRequest()
     req.prepare_url(url, {'date': date_str})
     response = requests.get(req.url)
     if response.status_code != 200:
         raise requests.HTTPError
     return response.json()['count']
Ejemplo n.º 4
0
from airflow.providers.odbc.hooks.odbc import OdbcHook
from airflow.providers.mongo.hooks.mongo import MongoHook
from airflow.contrib.hooks.fs_hook import FSHook
from airflow.contrib.sensors.file_sensor import FileSensor
from airflow.utils.dates import days_ago, timedelta
from airflow.models import Variable
from mongo_plugin.operators.csv_to_mongo_operator import CsvToMongoOperator
from utils import utils

from datetime import datetime
import pyodbc
import pandas as pd
import numpy as np
import os

# Yesterday's date as YYYY-MM-DD, evaluated once when the DAG file is parsed.
yesterday_date = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d')

# Arguments shared by every task in the DAG below.
default_args = {
    'owner': 'airflow',
    'start_date': datetime(2021, 3, 9),
    'retries': 1,
    'retry_delay': timedelta(seconds=10),
}

with DAG('flowback_data_ingestion_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    check_file_exists = FileSensor(
        task_id='check_file_exists',
Ejemplo n.º 5
0
from airflow.utils.dates import timedelta

# Host:container bind-mount string used by the Docker-based tasks.
VOLUME = '/abs/path/data:/data'

# Arguments shared by every task in the DAG.
default_args = dict(
    owner='airflow',
    email=['*****@*****.**'],
    retries=1,
    retry_delay=timedelta(minutes=5),
    email_on_failure=True,
)
        with connection.connect() as conn:
            conn.execute(f"DELETE FROM {TABLE_NAME};")

        print(f'data removed {repr(data_size)} rows')

    connection.dispose()
    return f'[+] data removing task completed'
    



with DAG(
    dag_id='df_to_postgres_sqlalchemy',
    description=f'Load data to postgress table {repr("boliga")}',
    default_args=args,
    start_date=datetime.now() - timedelta(minutes=10), # Start 10 minutes ago # days_ago(2)
    schedule_interval='*/10 * * * *',
    ) as dag:

    
    load_dataframe = PythonOperator(
        task_id='load_data_with_sqlalchemy',
        python_callable=load_data,
        dag=dag,
        provide_context=True

    )

    remove_dataframe = PythonOperator(
        task_id='remove_data_after_50_rows',
        dag=dag,
Ejemplo n.º 7
0
import json
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.utils.dates import datetime
from airflow.utils.dates import timedelta
from airflow.utils.task_group import TaskGroup

# Defaults applied to every task of the dbt wrapper DAG.
default_args = dict(
    owner="astronomer",
    depends_on_past=False,
    start_date=datetime(2020, 12, 23),
    email=["*****@*****.**"],
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
    # execution_timeout=timedelta(seconds=300),  # kept for reference
)

# Location of the dbt project inside the Airflow image.
DBT_DIR = "/usr/local/airflow/data-cicd"

dag = DAG(
    "dbt_advanced_dag",
    description="A dbt wrapper for airflow",
    default_args=default_args,
    schedule_interval=timedelta(days=1),
)


def load_manifest():
    manifest_filepath = f"{DBT_DIR}/target/manifest.json"
    with open(manifest_filepath) as f:
Ejemplo n.º 8
0
            with connection.connect() as conn:
                conn.execute(f"DELETE FROM {TABLE_NAME};")

            print(f"data removed {repr(data_size)} rows")
    finally:
        connection.dispose()
    return "[+] data removing task completed"


with DAG(
        dag_id="df_to_postgres_sqlalchemy",
        description=f'Load data to postgress table {repr("boliga")}',
        default_args=args,
        start_date=(
            datetime.now() -
            timedelta(minutes=10)),  # Start 10 minutes ago # days_ago(2)
        schedule_interval="*/10 * * * *",
) as dag:

    load_dataframe = PythonOperator(
        task_id="load_data_with_sqlalchemy",
        python_callable=load_data,
        dag=dag,
        provide_context=True,
    )

    remove_dataframe = PythonOperator(
        task_id="remove_data_after_50_rows",
        dag=dag,
        python_callable=remove_data,
        provide_context=True,
Ejemplo n.º 9
0
# Module-level constants: task-id templates and batch configuration.
START_OP_ID = "start_op"
EXTRACT_OP_ID = "extract_op_{}"
TRANSFORM_OP_ID = "transform_op_{}"
DAG_ID = "CourtListener_Daily"
NUMBER_OF_BATCHES = 5

# NOTE(review): presumably an XCom key used to flag skipped batches --
# confirm at the usage site.
SKIP_KEY = "skip"

# Defaults applied to every task in the DAG.
default_args = dict(
    owner="airflow",
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=5,
    retry_delay=timedelta(seconds=25),
    number_of_batches=5,
    log_response=True,
    # execution_timeout=timedelta(seconds=300),
)

daily_paging_args = {
    "parser": parser,
    "check_more": check_more,
    "next_page": next_page,
    "response_count": response_count,
    "response_valid": response_valid,
    "query_builder": daily_query,
    "endpoint": "opinions",
    "http_conn_id": "default_api",
    "mongo_conn_id": "default_mongo",
Ejemplo n.º 10
0
from airflow.models import DAG
from airflow.utils.dates import days_ago, timedelta
from airflow.operators.python_operator import PythonOperator
import random

# Defaults shared by both tasks; start one day in the past so the DAG is
# immediately schedulable.
args = dict(
    owner="dennislau",
    start_date=days_ago(1),
)

# schedule_interval=None: the DAG only runs when triggered manually.
dag = DAG(dag_id='simple_dag', default_args=args, schedule_interval=None)

def run_this_func(**context):
    """Trivial success task: print a greeting and return None.

    :param context: Airflow task context (unused).
    """
    # Fixed typo in the printed message ("hellow" -> "hello").
    print("hello world!")

def always_fail(**context):
    """Task that unconditionally raises, for exercising failure handling.

    :param context: Airflow task context (unused).
    :raises Exception: always.
    """
    raise Exception('Exception')

def random_fail(**context):
    """Fail roughly 30% of the time, for exercising Airflow retries.

    :param context: Airflow task context (unused).
    :raises Exception: with probability ~0.3.
    """
    # Guard clause: succeed on the common path, raise otherwise.
    if random.random() <= 0.7:
        print('passed!')
        return
    raise Exception('random exception')

with dag:
    # First task flakes randomly; retry quickly and often so the run
    # usually succeeds in the end.
    task_1 = PythonOperator(
        task_id='run_this_1',
        python_callable=random_fail,
        provide_context=True,
        retries=10,
        retry_delay=timedelta(seconds=2),
    )
    task_2 = PythonOperator(
        task_id='run_this_2',
        python_callable=run_this_func,
        provide_context=True,
    )
    # run_this_2 executes only after run_this_1 finishes.
    task_1 >> task_2

Ejemplo n.º 11
0
def notify_flow():
    """
        DAG for scheduled sending email notifications of
        yesterday topics about Beltelecom.
        DAG has 4 tasks:
           - get_recipients :param None
                            :return tuple of recipients
           - get_topisc_count :param url
                              :return topics_count
           - get_pdf :param url
                     :return pickle of binary pdf (task skipped on error)
           - email_send :param recips, topics_count, pdf=None
                        :return None

    """

    @task(default_args=default_args,
          execution_timeout=timedelta(seconds=10))
    def get_recipients():
        """
            Get notification recipients from Topics DB
        :return: tuple of (lastName, firstName, patronymic, email) rows
        :raises Exception: when no active recipient exists
        """
        from sqlalchemy import create_engine, MetaData, Table
        from sqlalchemy.orm import sessionmaker

        SQL_DATABASE = os.environ.get('SQL_DATABASE_REMOTE')
        SQL_USER = os.environ.get('SQL_USER_REMOTE')
        SQL_PASSWORD = os.environ.get('SQL_PASSWORD_REMOTE')
        SQL_HOST = os.environ.get('SQL_HOST_REMOTE')
        SQL_PORT = os.environ.get('SQL_PORT_REMOTE')

        # NOTE(review): SQL_PORT is read but never used -- the connection
        # string relies on the default PostgreSQL port. Confirm intentional.
        DB_STRING = 'postgresql+psycopg2://' + SQL_USER + ':' + SQL_PASSWORD + '@' + SQL_HOST + '/' + SQL_DATABASE

        engine = create_engine(DB_STRING, encoding='utf-8', echo=False)
        metadata = MetaData(engine)
        metadata.reflect(only=['forumTopics_employees'])
        table = Table('forumTopics_employees', metadata, autoload=True)
        Session = sessionmaker(bind=engine)
        session = Session()
        # Robustness fix: close the session even when the query raises.
        try:
            result = session.query(table.c.lastName, table.c.firstName,
                                       table.c.patronymic, table.c.email
                                       ).filter(table.c.isActive == '1').all()
        finally:
            session.close()
        if len(result) < 1:
            raise Exception('No active recipients')
        return tuple(result)

    @task(default_args=default_args,
          execution_timeout=timedelta(seconds=10))
    def get_topisc_count(url):
        """
            Get topics count from forums API
            (name keeps its historical "topisc" typo so the task_id stays stable)
        :param url: str -- base URL of the forums count endpoint
        :return: value of the ``count`` field in the JSON response
        :raises requests.HTTPError: on a non-200 response
        """
        # The query targets "yesterday" relative to the configured timezone.
        today = pendulum.now(local_tz)
        yesterday = today - timedelta(days=1)
        date_str = yesterday.strftime('%Y-%m-%d')
        # PreparedRequest is used only to append the ?date=... query safely.
        req = requests.models.PreparedRequest()
        req.prepare_url(url, {'date': date_str})
        response = requests.get(req.url)
        if response.status_code != 200:
            raise requests.HTTPError
        return response.json()['count']

    @task(default_args=default_args)
    def get_pdf(url):
        """
            create pdf file for attach
        :param url: str -- URL of the PDF report API
        :return: pickled bytes of the binary pdf
        :raises AirflowSkipException: on a non-200 response, so the e-mail
            is still sent (without attachment) via the NONE_FAILED rule below
        """
        response = requests.get(url)
        if response.status_code != 200:
            raise AirflowSkipException
        return pickle.dumps(response.content)


    @task(default_args=default_args, trigger_rule=TriggerRule.NONE_FAILED)
    def email_send(recips, topics_count, pdf=None):
        """
            Send notification emails to active recipients
        :param recips: recipient rows; index 3 must hold the e-mail address
        :param topics_count: number of yesterday's topics
        :param pdf: optional pickled bytes of the PDF attachment
        :return: None
        """

        # Load the HTML e-mail template from disk.
        with open('/opt/airflow/common/templates/topics_email.html') as f:
            html = f.read()

        template = Template(html)
        today = pendulum.now(local_tz)
        yesterday = today - timedelta(days=1)
        dateString = format_date(yesterday, format='long', locale='ru')
        host = os.environ.get('DJANGO_HOST')
        port = os.environ.get('DJANGO_PORT', '80')
        # Bug fix: both branches previously built the same URL, silently
        # dropping a non-default port. Append it explicitly.
        if port != '80':
            url = 'http://' + host + ':' + port
        else:
            url = 'http://' + host

        subject = 'Отзывы о Белтелеком за ' + yesterday.strftime('%d.%m.%Y')

        # Russian pluralization: counts ending in 11-14 always take the
        # genitive plural; otherwise the form depends on the last digit.
        topicsCountLastDigit = topics_count % 10
        lastTwoDigits = topics_count % 100
        review = 'отзывов'
        left = 'оставлено'
        if topicsCountLastDigit == 1 and lastTwoDigits != 11:
            review = "отзыв"
            left = 'оставлен'
        elif topicsCountLastDigit in [2, 3, 4] and not 12 <= lastTwoDigits <= 14:
            review = 'отзыва'

        files_pathes = []
        if pdf:
            # Bug fix: write the unpickled binary content, not the raw pickle
            # blob, so the attachment is a readable PDF.
            p = pickle.loads(pdf)
            with open('forums.pdf', 'wb') as file:
                file.write(p)
            files_pathes.append('forums.pdf')

        context = {'left': left,
                   'review': review,
                   'topicsCount': topics_count,
                   'dateString': dateString,
                   'url': url,
                   }

        content = template.render(context)
        for index, recip in enumerate(recips):
            email_operator = email.EmailOperator(
                task_id="send_email_" + str(index),
                to=[recip[3]],
                subject=subject,
                files=files_pathes,
                html_content=content,
            )
            # NOTE(review): the Jinja context dict is passed as the Airflow
            # execution context; EmailOperator.execute normally expects the
            # task-instance context -- confirm this is intentional.
            email_operator.execute(context)

    recipients = get_recipients()
    topics_count = get_topisc_count(os.environ.get('URL_TOPICS_COUNT'))
    pdf = get_pdf(os.environ.get('URL_PDF_API'))
    email_send(recipients, topics_count, pdf)
Ejemplo n.º 12
0
from jinja2 import Template
import pendulum

# Timezone used for all date arithmetic, taken from the environment.
local_tz = pendulum.timezone(os.environ.get('TIMEZONE'))

# Defaults shared by every task of the notification DAG.
default_args = dict(
    owner=os.environ.get('OWNER'),
    email_on_failure=True,
    email=os.environ.get('ALERT_EMAIL'),
    retries=1,
    depends_on_past=False,
)

@dag(
    default_args=default_args,
    schedule_interval=timedelta(days=int(os.environ.get('INTERVAL'))),
    start_date=days_ago(1,
                        int(os.environ.get('START_HOUR')),
                        int(os.environ.get('START_MINUTE'))),
    catchup=False,
    tags=['topics_notify'],
)
def notify_flow():
    """
        DAG for scheduled sending email notifications of
        yesterday topics about Beltelecom.
        DAG has 4 tasks:
           -  get_recipients :param None
                            :return tuple of recipients
           - get_topics_count :param url
                              :return topics_count