Code example #1
File: cli.py Project: cjquinon/incubator-airflow
def variables(args):

    if args.get:
        try:
            var = Variable.get(args.get,
                               deserialize_json=args.json,
                               default_var=args.default)
            print(var)
        except ValueError as e:
            print(e)
    if args.delete:
        session = settings.Session()
        session.query(Variable).filter_by(key=args.delete).delete()
        session.commit()
        session.close()
    if args.set:
        Variable.set(args.set[0], args.set[1])
    # Work around 'import' as a reserved keyword
    imp = getattr(args, 'import')
    if imp:
        if os.path.exists(imp):
            import_helper(imp)
        else:
            print("Missing variables file.")
    if args.export:
        export_helper(args.export)
    if not (args.set or args.get or imp or args.export or args.delete):
        # list all variables
        session = settings.Session()
        vars = session.query(Variable)
        msg = "\n".join(var.key for var in vars)
        print(msg)
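The `args.json` and `args.default` flags above map directly onto `Variable.get`'s keyword arguments. A minimal sketch of the underlying API calls (assuming a reachable Airflow metadata database; the `serialize_json` argument to `Variable.set` is available in recent Airflow releases):

from airflow.models import Variable

# Plain string variable: set it, then read it back.
Variable.set("greeting", "hello")
print(Variable.get("greeting"))  # -> "hello"

# JSON variable: store a dict and deserialize it on read.
Variable.set("service_config", {"host": "db", "port": 5432}, serialize_json=True)
config = Variable.get("service_config", deserialize_json=True)
print(config["port"])  # -> 5432

# Missing key: default_var sidesteps the missing-variable error handled in the try/except above.
print(Variable.get("does_not_exist", default_var="fallback"))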
Code example #2
File: tuto2.py Project: VViles/airflow_test
def set_sms(*args, **context):
    group = Variable.get('group')
    if group == 'night_shift':
        context['task_instance'].xcom_push('recipient', '0011223344')
        context['task_instance'].xcom_push('message', 'night airflow message')
    else:
        context['task_instance'].xcom_push('recipient', '0011223344')
        context['task_instance'].xcom_push('message', 'day airflow message')
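A downstream task would read these XComs back with `xcom_pull`. A sketch of such a consumer (the upstream task id 'set_sms' and the `send_sms` callable are illustrative, not taken from the original project):

def send_sms(*args, **context):
    ti = context['task_instance']
    # Pull the values pushed by the upstream task (assumed to have task id 'set_sms').
    recipient = ti.xcom_pull(task_ids='set_sms', key='recipient')
    message = ti.xcom_pull(task_ids='set_sms', key='message')
    print("sending '%s' to %s" % (message, recipient))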
Code example #3
File: cli.py Project: yogesh2021/airflow
def variables(args):
    if args.get:
        try:
            var = Variable.get(args.get, deserialize_json=args.json, default_var=args.default)
            print(var)
        except ValueError as e:
            print(e)
    if args.set:
        Variable.set(args.set[0], args.set[1])
    if not args.set and not args.get:
        # list all variables
        session = settings.Session()
        vars = session.query(Variable)
        msg = "\n".join(var.key for var in vars)
        print(msg)
Code example #4
    def failed(self, context):
        self.conf = context["conf"]
        self.task = context["task"]
        self.execution_date = context["execution_date"]
        self.dag = context["dag"]
        self.errors = SlackAPIPostOperator(
            task_id='task_failed',
            token=Variable.get('slack_token'),
            channel='C1SRU2R33',
            text="Your DAG has encountered an error, please follow the link to view the log details:  " + "http://localhost:8080/admin/airflow/log?" + "task_id=" + task.task_id + "&" +\
            "execution_date=" + execution_date.isoformat() + "&" + "dag_id=" + dag.dag_id,
            dag=pipeline
        )

        self.errors.execute()
Code example #5
def ReportDailySuccessful(task_instance, **kwargs):
  date = kwargs['execution_date']
  latest_run = float(Variable.get('latest_daily_timestamp'))

  timestamp = time.mktime(date.timetuple())
  logging.info('Current run\'s timestamp: %s \n'
               'latest_daily\'s timestamp: %s', timestamp, latest_run)
  if timestamp >= latest_run:
    Variable.set('latest_daily_timestamp', timestamp)
    run_sha = task_instance.xcom_pull(task_ids='get_git_commit')
    latest_version = GetSettingPython(task_instance, 'VERSION')
    logging.info('setting latest green daily to: %s', run_sha)
    Variable.set('latest_sha', run_sha)
    Variable.set('latest_daily', latest_version)
    logging.info('latest_sha set to %s', run_sha)
Code example #6
        def wrapped(context):
            """ping error in slack on failure and provide link to the log"""
            conf = context["conf"]
            task = context["task"]
            execution_date = context["execution_date"]
            dag = context["dag"]
            base_url = conf.get('webserver', 'base_url')

            # Get the ID of the target slack channel
            slack_token = Variable.get(slack_token_variable)
            sc = SlackClient(slack_token)

            response = sc.api_call('channels.list')
            for channel in response['channels']:
                if channel['name'].lower() == channel_name.lower():
                    break
            else:
                raise AirflowException('No channel named {} found.'.format(channel_name))

            # Construct a slack operator to send the message off.
            notifier = cls(
                task_id='task_failed',
                token=slack_token,
                channel=channel['id'],
                text=(
                    "Your DAG has encountered an error, please follow the link "
                    "to view the log details:  "
                    "{}/admin/airflow/log?"
                        "task_id={}&"
                        "dag_id={}&"
                        "execution_date={}"
                    ).format(base_url, task.task_id, dag.dag_id,
                             execution_date.isoformat()),
                dag=dag,
            )
            notifier.execute()
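The enclosing factory (not shown here) returns `wrapped`, which is then wired into a DAG as an `on_failure_callback` so Airflow invokes it with the task context when a task fails. A sketch of that wiring, where the factory name `make_slack_failure_callback` and its arguments are hypothetical placeholders:

# Hypothetical factory call that returns the `wrapped` callback defined above.
notify_slack = make_slack_failure_callback(SlackAPIPostOperator,
                                           slack_token_variable='slack_token',
                                           channel_name='airflow-alerts')

default_args = {
    'owner': 'airflow',
    # Airflow calls this with the task context whenever a task in the DAG fails.
    'on_failure_callback': notify_slack,
}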
Code example #7
import json
from datetime import datetime, timedelta

import requests
from airflow import DAG
from airflow.hooks.mysql_hook import MySqlHook
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

# Utils
api_key = Variable.get('api_key')
cities_ = Variable.get('cities').split(',')
cities = [x.encode('utf-8') for x in cities_]


def open_weather_response_parser(response_text, city):
    response_dict = json.loads(response_text)
    main = response_dict['main']
    temp_live = round(main['temp'] - 273.15, 2)
    temp_max = round(main['temp_max'] - 273.15, 2)
    temp_min = round(main['temp_min'] - 273.15, 2)
    humidity = main['humidity']
    pressure = main['pressure']
    weather = response_dict['weather'][0]['main']
    wind_speed = response_dict['wind']['speed']
    time = datetime.utcnow()
    parsed_response = {
        'city': city,
        'temp_live': temp_live,
        'temp_max': temp_max,
Code example #8
File: utils.py Project: techalchemy/airflow-sync
 def DEFAULT_SQL_DIR(cls):
     sql_dir = Path(Variable.get("sql_dir"))
     if not sql_dir.exists():
         PKG_PARENT = Path(__file__).absolute().parent.parent.parent.parent
         sql_dir = PKG_PARENT / "airflow-core/sql"
     return sql_dir / "salesforce"
Code example #9
import json
import os
import sys
from airflow.models import Variable
sys.path.append(Variable.get('module_path'))
# sys.path.append(os.getenv('MODULE_PATH'))
from models.dag_task_model import DagTaskModel
from models.notification_message_model import NotificationMessageModel, NotificationType
from models.notification_subject_model import NotificationSubjectModel
from models.slack_model import SlackModel
from messengers.slack_notification import SlackNotification
# import Log


class SlackNotificationService:
    def __init__(self, notification_type: str) -> None:
        self.notification_type = NotificationType(notification_type)
        self.COUNT = 0
        self.RETRY_LIMIT = 5

    def send_message(self, status: str) -> dict:
        dag_id = str(status['dag']).replace('<', '').replace('>', '')
        task_id = str(status['task']).replace('<', '').replace('>', '')

        dag_task = DagTaskModel(dag_id, task_id)

        notification_message: NotificationMessageModel = NotificationMessageModel(
            dag_task, self.notification_type)
        notification_subject: NotificationSubjectModel = NotificationSubjectModel(
            dag_task)
        slack_model = SlackModel(notification_subject, notification_message)
Code example #10
File: core.py Project: moritzpein/airflow
 def test_variable_set_get_round_trip(self):
     Variable.set("tested_var_set_id", "Monday morning breakfast")
     assert "Monday morning breakfast" == Variable.get("tested_var_set_id")
Code example #11
def test_access_var():
    my_var = Variable.get("hsfjskdfjhk")
    print("my var message : {}".format(my_var))
    return ("Access Var Success!")
Code example #12
default_args = {
    'owner': 'airflow',
    'description': 'Gathers MDS data from Bird',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'on_failure_callback': task_fail_slack_alert,
}

mds_provider = "bird"
current_time = datetime.now() + timedelta(days=-1, hours=-6)
time_max = f"{current_time.year}-{current_time.month}-{current_time.day}-{(current_time.hour)}"
environment_vars = Variable.get("atd_mds_config_staging",
                                deserialize_json=True)
docker_image = 'atddocker/atd-mds-etl:master'

with DAG(
        f"atd_mds_{mds_provider}_staging",
        default_args=default_args,
        schedule_interval="15 * * * *",
        catchup=False,
        tags=["staging", "mds"],
) as dag:
    #
    # Task: provider_extract
    # Description: Given a schedule block, the script extracts data from the MDS provider within the schedule's time window
    # then it uploads the data into S3 for further processing.
    #
    t1 = DockerOperator(
Code example #13
File: istio_common_dag.py Project: yushihui/istio
def AirflowGetVariableOrBaseCase(var, base):
    try:
        return Variable.get(var)
    except KeyError:
        return base
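The same fallback can be written without the try/except by using `Variable.get`'s built-in `default_var`, as the CLI and test examples on this page do; a sketch:

def AirflowGetVariableOrBaseCase(var, base):
    # Equivalent for the missing-variable case: fall back to `base` when `var` is not defined.
    return Variable.get(var, default_var=base)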
Code example #14
from airflow import DAG
from datetime import datetime, timedelta
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.operators.dummy_operator import DummyOperator
from kubernetes.client import models as k8s
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.operators.http_operator import SimpleHttpOperator
import urllib.request
import json

default_args = {
    'owner': 'datagap'
}

basePath = Variable.get("permit_data_base_url")
templateUrl = Variable.get("permit_data_weekly_index_url")
permitDataSource = Variable.get("permit_datasource")

def downloadTemplate(templateUrl):
  request = urllib.request.urlopen(templateUrl)
  response = request.read().decode('utf-8')

  return response

def replace(jsonContent, dataSource, interval, basePath, date, market):
  
  result = json.loads(jsonContent)
  # base data source
  result['spec']['ioConfig']['inputSource']['dataSource'] = dataSource
  # ingest data url
Code example #15
# 2. Fact tables dependent on the stage tables are also loaded
# (Fact tables dependent upon two different stages will not be loaded).
# This gives little bit of flexibility over db based taskgroups
#######################################################################################
from airflow import DAG
import base64
from datetime import timedelta
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflowcommon import getBatchId, getpythonoperator, getbashoperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.task_group import TaskGroup

# set the default config for the dag
dset = Variable.get("factloadjob1", deserialize_json=True)
kinitparms = Variable.get("kinitparms", deserialize_json=True)
password = kinitparms["kinitpass"]
password = base64.b64decode(password).decode('utf-8')
#kinitprincipal = kinitparms["kinitprincipal"]
crpmdevicedict = dset["crpm_device_mapping"]
sqoopjobs = crpmdevicedict["jobs"]
factdb = crpmdevicedict["factdb"]
srctoland = crpmdevicedict["src2land"]
land2stg = crpmdevicedict["land2stg"]
scriptpaths = dset["scriptpaths"]
kinitprincipal = kinitparms["kinitprincipal"]
kinitdomain = kinitparms["kinitdomain"]
edgenodehost = kinitparms["edgenodehost"]

default_args = {
Code example #16
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
}

dag = airflow.DAG('dm_oride_passenger_base_multi_cube',
                  schedule_interval="45 00 * * *",
                  default_args=args)
##----------------------------------------- Variables ---------------------------------------##

db_name = "oride_dw"
table_name = "dm_oride_passenger_base_multi_cube"

##----------------------------------------- Dependencies ---------------------------------------##
# Get variables
code_map = eval(Variable.get("sys_flag"))

# Check for ufile (CDH environment)
if code_map["id"].lower() == "ufile":
    # depends on the previous day's partition
    dwm_oride_passenger_order_base_di_prev_day_task = UFileSensor(
        task_id='dwm_oride_passenger_order_base_di_prev_day_task',
        filepath='{hdfs_path_str}/dt={pt}/_SUCCESS'.format(
            hdfs_path_str=
            "oride/oride_dw/dwm_oride_passenger_order_base_di/country_code=NG",
            pt='{{ds}}'),
        bucket_name='opay-datalake',
        poke_interval=60,  # if the dependency is not met, check its status once a minute
        dag=dag)
    # path
    hdfs_path = "ufile://opay-datalake/oride/oride_dw/" + table_name
Code example #17
    def dividend_probability_calculator():
        credentials = service_account.Credentials.from_service_account_info(
            Variable.get("key", deserialize_json=True))

        destination_bucket_name = 'dividend_declarations_hackathon'
        storage_client = storage.Client()
        destination_bucket = storage_client.bucket(destination_bucket_name)

        project_id = 'hackathon-wpb'
        table_id = 'hackathon-wpb.customer_relations.customer_dividend_malaysia'
        query_string = """
           SELECT * 
           FROM hackathon-wpb.customer_relations.customer_dividend_malaysia"""

        table_schema = [{
            'name': 'Ticker',
            'type': 'STRING',
            'mode': 'REQUIRED'
        }, {
            'name': 'Mic',
            'type': 'STRING',
            'mode': 'REQUIRED'
        }, {
            'name':
            'Contacts',
            'type':
            'RECORD',
            'mode':
            'REPEATED',
            'fields': [{
                'name': 'Name',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'email',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }, {
            'name':
            'Dividend',
            'type':
            'RECORD',
            'mode':
            'REPEATED',
            'fields': [{
                'name': 'DeclarationYear',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'DeclaratioMonth',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }, {
                'name': 'DeclarationDate',
                'type': 'STRING',
                'mode': 'NULLABLE'
            }]
        }, {
            'name': 'RecentDeclarationDate',
            'type': 'DATE',
            'mode': 'NULLABLE'
        }, {
            'name': 'NextPayableDate',
            'type': 'DATE',
            'mode': 'NULLABLE'
        }, {
            'name': 'ExpectedStartDate',
            'type': 'DATE',
            'mode': 'NULLABLE'
        }, {
            'name': 'ExpectedEndDate',
            'type': 'DATE',
            'mode': 'NULLABLE'
        }, {
            'name': 'LastRunDate',
            'type': 'DATE',
            'mode': 'NULLABLE'
        }, {
            'name': 'ProbabilityNextMonthDeclaration',
            'type': 'NUMERIC',
            'mode': 'NULLABLE'
        }, {
            'name': 'Period',
            'type': 'INTEGER',
            'mode': 'NULLABLE'
        }]

        project_id = 'hackathon-wpb'
        dataset_id = 'customer_relations'
        table_id = 'customer_dividend_malaysia'

        client = bigquery.Client(project=project_id)
        dataset = client.dataset(dataset_id)
        table = dataset.table(table_id)

        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        job_config.schema = table_schema
        job_config.write_disposition = 'WRITE_TRUNCATE'

        dataframe_complete = pdgbq.read_gbq(query=query_string,
                                            project_id=project_id)
        dataframe = dataframe_complete
        print(dataframe.dtypes)
        print(len(dataframe))
        base = datetime.today().date()
        start_date = base + timedelta(days=30)
        end_date = base + timedelta(days=70)
        df_companies = pd.DataFrame(dataframe_complete.Ticker.unique())
        df_companies.rename(columns={0: 'Ticker'}, inplace=True)
        df_companies['ProbabilityNextMonthDeclaration'] = 0.0
        df_companies['ExpectedStartDate'] = ''
        df_companies['ExpectedEndDate'] = ''
        convert_dict = {
            'Ticker': str,
            'ProbabilityNextMonthDeclaration': float,
            'ExpectedStartDate': np.datetime64,
            'ExpectedEndDate': np.datetime64
        }

        df_companies = df_companies.astype(convert_dict)
        for ind in df_companies.index:
            company_name = df_companies['Ticker'][ind]
            df_company_temp = pd.DataFrame(
                dataframe.loc[dataframe['Ticker'] == company_name])
            df_company_temp_list = df_company_temp['Dividend']
            index = df_company_temp_list.index
            try:
                df_company_temp_2 = json_normalize(
                    df_company_temp_list[index[0]])
                df = pd.DataFrame({
                    'year': df_company_temp_2['DeclarationYear'],
                    'month': df_company_temp_2['DeclaratioMonth'],
                    'day': df_company_temp_2['DeclarationDate']
                })
                df_company_temp_2['Date'] = pd.to_datetime(df)
                if (company_name == 'CIMB.XKLS'):
                    print(df_company_temp_2)
                df_company_temp_2.drop_duplicates(subset=[
                    'DeclarationYear', 'DeclaratioMonth', 'DeclarationDate'
                ],
                                                  inplace=True)
                if (company_name == 'CIMB.XKLS'):
                    print(df_company_temp_2)
            except:
                continue
            total_declarations = len(
                df_company_temp_2['DeclarationYear'].unique())
            recent_years = []
            non_recent_years = []
            count_recent = 0
            count_recent_two_years = 0
            number_of_recent_years = 0
            number_of_non_recent_years = 0
            count_non_recent = 0
            base_date_minus_2 = base - timedelta(days=730)
            base_date_minus_5 = base - timedelta(days=1825)
            year_considered = []
            for ind2 in df_company_temp_2.index:
                months = dataframe.loc[dataframe['Ticker'] == company_name,
                                       'Period']
                if (not (math.isnan(months))):
                    start_date = base + timedelta(days=int(months) * 30)
                    end_date = start_date + timedelta(days=40)
                start_dates, end_dates = date_to_months(start_date, end_date)
                date_temp = df_company_temp_2['Date'][ind2]
                if (company_name == 'CIMB.XKLS'):
                    print(date_temp)
                if (base_date_minus_2 <= date_temp < base):
                    recent_years.append(date_temp.year)
                elif (base_date_minus_5 <= date_temp < base_date_minus_2):
                    recent_years.append(date_temp.year)
                else:
                    non_recent_years.append(date_temp.year)
                out_fmt = '%Y-%m-%d'
                #for every month check if previous declaration month/date falls in the range
                for start, end in zip(start_dates, end_dates):
                    year = start.year
                    if (company_name == 'CIMB.XKLS'):
                        print('base')
                        print(base)
                        print('start')
                        print(start)
                        print('end')
                        print(end)
                    try:
                        if start.replace(year=year) <= date_temp.replace(
                                year=year) <= end.replace(year=year):
                            if (not date_temp.year in year_considered):
                                if (base_date_minus_2 <= date_temp < base):
                                    count_recent_two_years = count_recent_two_years + 1
                                    if (company_name == 'CIMB.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (base_date_minus_5 <= date_temp <
                                      base_date_minus_2):
                                    count_recent = count_recent + 1
                                    if (company_name == 'CIMB.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (date_temp <= base_date_minus_5):
                                    count_non_recent = count_non_recent + 1
                                    if (company_name == 'CIMB.XKLS'):
                                        print(date_temp)
                                        print("count_non_recent incremented")
                                year_considered.append(date_temp.year)
                    except:
                        #to handle 29th feb
                        #print(date_temp)
                        one_day = timedelta(1)
                        date_temp = date_temp - one_day
                        if start.replace(year=year) <= date_temp.replace(
                                year=year) <= end.replace(year=year):
                            if (not date_temp.year in year_considered):
                                if (base_date_minus_2 <= date_temp < base):
                                    count_recent_two_years = count_recent_two_years + 1
                                    if (company_name == 'UEMS.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (base_date_minus_5 <= date_temp <
                                      base_date_minus_2):
                                    count_recent = count_recent + 1
                                    if (company_name == 'UEMS.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (date_temp < base_date_minus_5):
                                    count_non_recent = count_non_recent + 1
                                    #print("count_non_recent incremented")
                                    year_considered.append(date_temp.year)
            number_of_latest_years = 2
            number_of_recent_years = 3
            #(pd.Series(recent_years)).nunique()
            number_of_non_recent_years = (
                pd.Series(non_recent_years)).nunique()
            probability = (
                (3 * weird_division(count_recent_two_years,
                                    number_of_latest_years)) +
                (2 * weird_division(count_recent, number_of_recent_years)) +
                (weird_division(count_non_recent,
                                number_of_non_recent_years))) / 6
            if (company_name == 'CIMB.XKLS'):
                print(count_recent_two_years)
                print(number_of_latest_years)
                print(count_recent)
                print(number_of_recent_years)
                print(count_non_recent)
                print(non_recent_years)
                print(number_of_non_recent_years)
                print(probability)
            df_companies['ProbabilityNextMonthDeclaration'][ind] = round(
                probability, 3)
            df_companies['ExpectedStartDate'][ind] = np.datetime64(start_date)
            df_companies['ExpectedEndDate'][ind] = np.datetime64(end_date)
            dataframe.loc[dataframe['Ticker'] == company_name,
                          'ProbabilityNextMonthDeclaration'] = str(probability)
            dataframe.loc[dataframe['Ticker'] == company_name,
                          'ExpectedStartDate'] = np.datetime64(start_date)
            dataframe.loc[dataframe['Ticker'] == company_name,
                          'ExpectedEndDate'] = np.datetime64(end_date)
        dataframe_complete.drop([
            'ProbabilityNextMonthDeclaration', 'ExpectedStartDate',
            'ExpectedEndDate', 'LastRunDate'
        ],
                                axis=1,
                                inplace=True)
        df_update = pd.merge(dataframe_complete,
                             df_companies,
                             left_on='Ticker',
                             right_on='Ticker')
        df_update['LastRunDate'] = np.datetime64(base)
        df_update['NextPayableDate'] = df_update[
            'NextPayableDate'].dt.strftime('%Y-%m-%d')
        df_update['ExpectedStartDate'] = df_update[
            'ExpectedStartDate'].dt.strftime('%Y-%m-%d')
        df_update['ExpectedEndDate'] = df_update[
            'ExpectedEndDate'].dt.strftime('%Y-%m-%d')
        df_update['LastRunDate'] = df_update['LastRunDate'].dt.strftime(
            '%Y-%m-%d')
        df_update['RecentDeclarationDate'] = df_update[
            'RecentDeclarationDate'].dt.strftime('%Y-%m-%d')
        json_data = df_update.to_json(orient="records")
        json_object = json.loads(json_data)
        job = client.load_table_from_json(
            json_object, table, job_config=job_config)  # Make an API request.
        filename = 'customer_dividend_malaysia_probability_update_' + str(
            datetime.now()) + '.json'
        blob = destination_bucket.blob(filename)
        blob.upload_from_string(data=json.dumps(json_object),
                                content_type='application/json')
        job.result()
        count_mails = 0  #remove for actual code
        for ind3 in df_companies.index:
            company_name = df_companies['Ticker'][ind3]
            print(company_name)
            print(ind3)
            probability = df_companies['ProbabilityNextMonthDeclaration'][ind3]
            expected_start_date = df_update.loc[dataframe['Ticker'] ==
                                                company_name,
                                                'ExpectedStartDate'].iloc[0]
            expected_end_date = df_update.loc[
                dataframe['Ticker'] == company_name, 'ExpectedEndDate'].iloc[0]
            if (float(probability) > 0.9 and count_mails < 10):
                df_contacts_temp = pd.DataFrame(
                    dataframe.loc[dataframe['Ticker'] == company_name])
                df_contacts_temp_list = df_company_temp['Contacts']
                index = df_contacts_temp_list.index
                df_contacts = json_normalize(df_contacts_temp_list[index[0]])
                contacts = df_contacts.drop_duplicates(subset=['email'],
                                                       keep='last')
                html_string = None
                with open(
                        '/opt/bitnami/airflow/dags/git-github-com-jainita95-dividend-tracker-git/EmailTemplateUpcomingDividend.html',
                        'r') as f:
                    html_string = f.read()
                html_string = html_string.format(code=company_name,
                                                 startDate=expected_start_date,
                                                 endDate=expected_end_date,
                                                 probability=math.ceil(
                                                     probability * 100))
                name = []
                emails = []
                for ind4 in contacts.index:
                    name_contact = contacts['Name'][ind4]
                    email = contacts['email'][ind4]
                    name.append(name_contact)
                    emails.append(To(email))
                message = Mail(
                    from_email='*****@*****.**',
                    to_emails=emails,
                    subject=
                    "Notice: An Upcoming Dividend Declaration cited for " +
                    company_name,
                    html_content=html_string)
                with open(
                        '/opt/bitnami/airflow/dags/git-github-com-jainita95-dividend-tracker-git/hsbcLogo.png',
                        'rb') as f:
                    data = f.read()
                    f.close()
                encoded = base64.b64encode(data).decode()
                attachment = Attachment()
                attachment.file_content = FileContent(encoded)
                attachment.file_type = FileType('image/png')
                attachment.file_name = FileName('hsbcLogo.png')
                attachment.disposition = Disposition('inline')
                attachment.content_id = ContentId('hsbclogo')
                message.add_attachment(attachment)
                try:
                    sg = SendGridAPIClient(Variable.get("sendgridapikey"))
                    response = sg.send(message)
                    count_mails = count_mails + 1
                    #print(response.status_code)
                    #print(response.body)
                    #print(response.headers)
                except Exception as e:
                    print(e)
Code example #18
from datetime import datetime, timedelta

from airflow.operators.subdag_operator import SubDagOperator
from airflow.models import DAG, Variable
from tester_collector.subdags.sub import all_process

PROJECT_VERSION = '1.0'
PROJECT_NAME = 'tester-collector'

# MAIN DAGS
# interval = "0 3 */1 * *"
interval = "*/10 * * * *"
DAG_ID = 'tester_collector'
start_date = datetime.strptime(Variable.get("tester_collector_start_date"),
                               "%Y-%m-%d %H:%M:%S")
emails = Variable.get('support_email_list').split(',')
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': emails,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=2)
}

with DAG(dag_id=DAG_ID,
         default_args=default_args,
         schedule_interval=interval,
         start_date=start_date) as dag:
Code example #19
File: tuto2.py Project: VViles/airflow_test
def set_mail(*args, **context):
    group = Variable.get('group')
    if group == 'night_shift':
        context['task_instance'].xcom_push(key='recipient', value='*****@*****.**')
    else:
        context['task_instance'].xcom_push(key='recipient', value='*****@*****.**')
Code example #20
from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta, datetime
from tasks.fetch_covid_cases import fetch_daily_data
from tasks.find_upload_percentage import find_percentage
from tasks.upload_csv_to_big_table import upload_csv_to_big_table
import yaml

# #fetching constants

# from airflow
dag_config = Variable.get("bigquery_variables", deserialize_json=True)
BQ_CONN_ID = dag_config["bq_conn_id"]
BQ_PROJECT = dag_config["bq_project"]
BQ_TABLE = dag_config["bq_table"]
BQ_DATASET = dag_config["bq_dataset"]

# form config yaml file
with open("config/pipelines/covid_pipeline.yaml", 'r') as stream:
    try:
        dag_info = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# form library yaml file
with open("library/pipeline_defaults.yaml", 'r') as stream:
    try:
        dag_defaults = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
Code example #21
File: istio_common_dag.py Project: veggiemonk/istio
  def GenerateTestArgs(**kwargs):
    """Loads the configuration that will be used for this Iteration."""
    conf = kwargs['dag_run'].conf
    if conf is None:
      conf = dict()

    """ Airflow gives the execution date when the job is supposed to be run,
        however we dont backfill and only need to run one build therefore use
        the current date instead of the date that is passed in """
#    date = kwargs['execution_date']
    date = datetime.datetime.now()

    timestamp = time.mktime(date.timetuple())

    # Monthly releases started in Nov 2017 with 0.3.0, so minor is # of months
    # from Aug 2017.
    minor_version = (date.year - 2017) * 12 + (date.month - 1) - 7
    major_version = AirflowGetVariableOrBaseCase('major_version', 0)
    # This code gets information about the latest released version so we know
    # what version number to use for this round.
    r_minor = int(AirflowGetVariableOrBaseCase('released_version_minor', 0))
    r_patch = int(AirflowGetVariableOrBaseCase('released_version_patch', 0))
    # If we have already released a monthly release for this month then bump
    # the patch number for the remainder of the month.
    if r_minor == minor_version:
      patch = r_patch + 1
    else:
      patch = 0
    # If the version is overridden then we should use it, otherwise we use its
    # default or monthly value.
    version = conf.get('VERSION')
    if monthly and not version:
      version = '{}.{}.{}'.format(major_version, minor_version, patch)

    default_conf = environment_config.get_airflow_config(
        version,
        timestamp,
        major=major_version,
        minor=minor_version,
        patch=patch,
        date=date.strftime('%Y%m%d'),
        rc=date.strftime('%H-%M'))
    config_settings = dict(VERSION=default_conf['VERSION'])
    config_settings_name = [
        'PROJECT_ID',
        'MFEST_URL',
        'MFEST_FILE',
        'GCS_STAGING_BUCKET',
        'SVC_ACCT',
        'GITHUB_ORG',
        'GITHUB_REPO',
        'GCS_GITHUB_PATH',
        'TOKEN_FILE',
        'GCR_STAGING_DEST',
        'GCR_RELEASE_DEST',
        'GCS_MONTHLY_RELEASE_PATH',
        'DOCKER_HUB',
        'GCS_BUILD_BUCKET',
        'RELEASE_PROJECT_ID',
    ]

    for name in config_settings_name:
      config_settings[name] = conf.get(name) or default_conf[name]

    if monthly:
      config_settings['MFEST_COMMIT'] = conf.get(
          'MFEST_COMMIT') or Variable.get('latest_sha')
      gcs_path = conf.get('GCS_MONTHLY_STAGE_PATH')
      if not gcs_path:
        gcs_path = default_conf['GCS_MONTHLY_STAGE_PATH']
    else:
      config_settings['MFEST_COMMIT'] = conf.get(
          'MFEST_COMMIT') or default_conf['MFEST_COMMIT']
      gcs_path = conf.get('GCS_DAILY_PATH') or default_conf['GCS_DAILY_PATH']

    config_settings['GCS_STAGING_PATH'] = gcs_path
    config_settings['GCS_BUILD_PATH'] = '{}/{}'.format(
        config_settings['GCS_BUILD_BUCKET'], gcs_path)
    config_settings['GCS_RELEASE_TOOLS_PATH'] = '{}/release-tools/{}'.format(
        config_settings['GCS_BUILD_BUCKET'], gcs_path)
    config_settings['GCS_FULL_STAGING_PATH'] = '{}/{}'.format(
        config_settings['GCS_STAGING_BUCKET'], gcs_path)
    config_settings['ISTIO_REPO'] = 'https://github.com/{}/{}.git'.format(
        config_settings['GITHUB_ORG'], config_settings['GITHUB_REPO'])

    return config_settings
Code example #22
import os
from os.path import expanduser

from airflow import DAG
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.http_operator import SimpleHttpOperator

from airflow.operators.idea_plugin import BigQueryTableModifiedSensor

home = expanduser("~")

STATE_PATH = '{0}/gcs/data/nwea_assessment_results_last_modified.text'.format(
    home)
IDEA2_API_KEY = Variable.get('idea2_api_key')
"""
DAG for updating Illuminate grade review data
"""

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 3, 12),
    "email": ["*****@*****.**"],
    "email_on_failure": True,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=2),
    "provide_context": True
    # 'queue': 'bash_queue',
Code example #23
File: core.py Project: moritzpein/airflow
 def test_get_non_existing_var_should_return_default(self):
     default_value = "some default val"
     assert default_value == Variable.get("thisIdDoesNotExist",
                                          default_var=default_value)
Code example #24
    }]

    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(task_id='task_timeout_monitor',
                                      python_callable=fun_task_timeout_monitor,
                                      provide_context=True,
                                      dag=dag)

##----------------------------------------- Variables ---------------------------------------##
db_name = "opay_dw_ods"

table_name = "ods_sqoop_base_message_record_di"
hdfs_path = "oss://opay-datalake/opay_dw_sqoop_di/opay_sms/message_record"
config = eval(Variable.get("opay_time_zone_config"))


def ods_sqoop_base_message_record_di_sql_task(ds):
    HQL = '''

    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.parallel=true;
    insert overwrite table {db}.{table} partition (dt)
    SELECT 
        id,
        template_name,
        country_code,
        message_type,
        mobile,
        content,
Code example #25
File: find_neighbors.py Project: wprazuch/Astral
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

dag = DAG(
    "3_find_neighbours",
    default_args=default_args,
    description="Find neighbouring waves in a timespace",
    schedule_interval=timedelta(days=1),
)

rootdir = "/app/data"

filename = Variable.get("filename")

if filename == "all":
    files = [file for file in os.listdir(rootdir) if file.endswith(".tif")]
else:
    files = [filename]

tolerance_xy = Variable.get("tolerance_xy")
tolerance_t = Variable.get("tolerance_t")
intersect_threshold = Variable.get("intersection_threshold")

for file in files:
    filename = file
    directory = filename.split(".")[0]
    directory = process_task_name(directory)
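Note that `Variable.get` returns the raw string unless `deserialize_json=True` is passed, so numeric settings like the tolerances above usually need an explicit cast before use. A sketch (the default values are illustrative):

# Cast string-valued Variables to numbers before using them in computations.
tolerance_xy = float(Variable.get("tolerance_xy", default_var="1.0"))
tolerance_t = float(Variable.get("tolerance_t", default_var="1.0"))
intersect_threshold = float(Variable.get("intersection_threshold", default_var="0.5"))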
Code example #26
default_args = {
    'owner': 'chr0nomaton',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('Automata',
          start_date=datetime(2018, 12, 30),
          default_args=default_args,
          schedule_interval=timedelta(days=1))

user_id = Variable.get("SPOTIFY_CLIENT_USER_ID")
auth_token = Variable.get("SPOTIFY_CLIENT_TOKEN_CACHE")
with open(f"/usr/local/airflow/.cache-{user_id}", "w+") as f:
    print(f"Wrote {auth_token} to /usr/local/airflow/.cache-{user_id}")
    f.write(auth_token)
spotify = SpotifyAPI(
    user_id=user_id,
    client_id=Variable.get("SPOTIFY_CLIENT_ID"),
    client_secret=Variable.get("SPOTIFY_CLIENT_SECRET"),
    redirect_uri=Variable.get("SPOTIFY_CLIENT_REDIRECT_URI"),
)
with open(f"/usr/local/airflow/.cache-{user_id}") as f:
    Variable.set("SPOTIFY_CLIENT_TOKEN_CACHE", f.read())

t1_get_bands = PythonOperator(task_id='fetch_reddit_posts',
                              python_callable=get_reddit_posts,
Code example #27
File: utils.py Project: techalchemy/airflow-sync
def get_sql_dir():
    sql_dir = Path(Variable.get("sql_dir"))
    if not sql_dir.exists():
        PKG_PARENT = Path(__file__).absolute().parent.parent.parent.parent
        sql_dir = PKG_PARENT / "airflow-core/sql"
    return sql_dir
Code example #28
def get_reddit_posts(**context):
    reddit = CurrentDaysBands(
        client_id=Variable.get("REDDIT_CLIENT_ID"),
        client_secret=Variable.get("REDDIT_CLIENT_SECRET"),
        user_agent=Variable.get("REDDIT_USER_AGENT"))
    return reddit.get_bands(context['yesterday_ds'])
Code example #29
from postgres_check_operator import (
    PostgresMultiCheckOperator,
    COUNT_CHECK,
    GEO_CHECK,
)

from sql.wior import (
    DROP_COLS,
    SQL_DROP_TMP_TABLE,
    SQL_GEOM_VALIDATION,
    SQL_ADD_PK,
    SQL_SET_DATE_DATA_TYPES,
)

dag_id: str = "wior"
variables: Dict = Variable.get(dag_id, deserialize_json=True)
data_endpoint: Dict = variables["data_endpoints"]["wfs"]
tmp_dir: str = f"{SHARED_DIR}/{dag_id}"
data_file: str = f"{tmp_dir}/{dag_id}.geojson"
db_conn: DatabaseEngine = DatabaseEngine()
password: str = env("AIRFLOW_CONN_WIOR_PASSWD")
user: str = env("AIRFLOW_CONN_WIOR_USER")
base_url: str = URL(env("AIRFLOW_CONN_WIOR_BASE_URL"))
total_checks: list = []
count_checks: list = []
geo_checks: list = []
to_zone: Optional[tzinfo] = tz.gettz("Europe/Amsterdam")


class DataSourceError(Exception):
    """Custom exeception for not available data source"""
Code example #30
}

DOCKER_IMAGE = "atddocker/atd-knack-services:production"

# command args
SCRIPT_TASK_1 = "records_to_postgrest"
SCRIPT_TASK_2 = "records_to_agol"
SCRIPT_TASK_3 = "agol_build_markings_segment_geometries"
SCRIPT_TASK_4 = "records_to_socrata"
APP_NAME = "signs-markings"
ENV = "prod"
POOL_KNACK = "knack_signs_markings"
POOL_POSTGREST = "atd_knack_postgrest_pool"
CONTAINER = "view_3100"

env_vars = Variable.get("atd_knack_services_postgrest", deserialize_json=True)
atd_knack_auth = Variable.get("atd_knack_auth", deserialize_json=True)
env_vars["KNACK_APP_ID"] = atd_knack_auth[APP_NAME][ENV]["app_id"]
env_vars["KNACK_API_KEY"] = atd_knack_auth[APP_NAME][ENV]["api_key"]
env_vars["AGOL_USERNAME"] = Variable.get("agol_username")
env_vars["AGOL_PASSWORD"] = Variable.get("agol_password")
env_vars["SOCRATA_API_KEY_ID"] = Variable.get(
    "atd_service_bot_socrata_api_key_id")
env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get(
    "atd_service_bot_socrata_api_key_secret")
env_vars["SOCRATA_APP_TOKEN"] = Variable.get(
    "atd_service_bot_socrata_app_token")

with DAG(
        dag_id="atd_knack_markings_work_orders_jobs",
        description=
Code example #31
File: operators.py Project: UnityTech/docker-airflow
    def execute(self, context):

        if self.provide_context:
            context.update(self.op_kwargs)

        log_date = context['ti'].execution_date.strftime('%Y-%m-%d')
        if self.log_date_fun is not None:
            log_date = self.log_date_fun(context)
        if self.log_hour_fun is not None:
            self.log_hour = self.log_hour_fun(context)

        current_dir = os.getcwd()
        s3_bucket_with_env = tools.s3_name_generator(self.s3_bucket, "-json", "-staging")
        s3_conn_id_with_env = tools.s3_name_generator(self.s3_conn_id, "_json", "_staging")
        s3_key_path = "druid-json-template" + "/" + self.template
        self.download_template_file(s3_conn_id_with_env, s3_bucket_with_env, s3_key_path, current_dir + "/json")

        druid_host, druid_port = Variable.get("druid_overlord").split(":")
        key, secret = GenericHook(s3_conn_id_with_env).get_credentials()
        druid = DruidAccess(druid_host, druid_port, "", "", self.data_source)
        s3 = S3Access(key, secret, False)

        if self.folder is None:
            self.folder = tools.s3_name_generator(self.prefix, "-prod", "-staging")

        s3_file_location = ""
        if self.aggregate == "DAILY":
            s3_file_location = "s3://{bucket}/{folder}/{topic}/{day_key}={log_date}/".format(
                bucket=s3_bucket_with_env, folder=self.folder,
                topic=self.topic, day_key=self.date_key,
                log_date=log_date)
        if self.aggregate == "HOURLY":
            s3_file_location = "s3://{bucket}/{folder}/{topic}/{day_key}={log_date}/{hour_key}={log_hour}/".format(
                bucket=s3_bucket_with_env, folder=self.folder,
                topic=self.topic, day_key=self.date_key,
                log_date=log_date, hour_key=self.hour_key, log_hour=self.log_hour)

        logging.info("Launching importer for %s.." % s3_file_location)
        s3_files = s3.get_filenames(s3_file_location)
        logging.info("Files Name " + ','.join(s3_files))
        running_tasks = []
        s3_files = ['"' + f + '"' for f in sorted(s3_files)]
        task = Task(s3_files)
        log_timestamp = log_date + ' ' + self.log_hour + ":00:00"
        log_timestamp_ts = (parser.parse(log_timestamp)).isoformat()
        next_log_timestamp_ts = (parser.parse(log_timestamp) + datetime.timedelta(days=1)).isoformat()
        if self.aggregate == "HOURLY":
            next_log_timestamp_ts = (parser.parse(log_timestamp) + datetime.timedelta(hours=1)).isoformat()

        logging.info("Handling task %r" % task)
        task.id = druid.upload(log_timestamp_ts, next_log_timestamp_ts, s3_files, self.template)
        logging.info("Uploading task to druid and task id is %r" % task.id)
        running_tasks.append(task)
        # Cleaning and waiting
        druid.clean_tasks(running_tasks)
        while len(running_tasks) >= self.slots:
            logging.info("Waiting for %r tasks" % len(running_tasks))
            time.sleep(10)
            running_tasks = druid.clean_tasks(running_tasks)
        while len(running_tasks) > 0:
            logging.info("Waiting for finalization of %r tasks" % len(running_tasks))
            time.sleep(10)
            running_tasks = druid.clean_tasks(running_tasks)
        logging.info("Importing done..")
Code example #32
import os
import time
import boto3
import airflow.hooks.S3_hook

from airflow import DAG
from airflow.models import Variable
from airflow.operators import BashOperator
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.hooks.postgres_hook import PostgresHook

## API key to connect to the weather API
## Alternatively, store it in a file and source it
API_KEY = Variable.get("weather_api_key")

# Following are defaults which can be overridden later on
# dag variables
default_args = {
    'owner': 'Srilekha',
    'depends_on_past': False,
    'start_date': datetime(2020, 11, 6),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}
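In line with the snippet's own comment about sourcing the key from elsewhere, one alternative to the `weather_api_key` Variable is a fallback to the process environment. A sketch (the WEATHER_API_KEY environment variable name is illustrative):

# Fall back to an environment variable when the Airflow Variable is not set.
API_KEY = Variable.get("weather_api_key", default_var=os.environ.get("WEATHER_API_KEY"))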

Code example #33
File: project-workflow.py Project: dkyos/dev-samples
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from datetime import datetime,timedelta
from airflow.models import Variable

SRC=Variable.get("SRC")
#SRC='./'
COUNTRY=Variable.get("COUNTRY")
#COUNTRY='PL'

dag = DAG('project-workflow',description='Project Workflow DAG',
        schedule_interval = '*/5 0 * * *',
        start_date=datetime(2017,7,1),
        catchup=False)

xlsx_to_csv_task = BashOperator(
        task_id='xlsx_to_csv',
        bash_command='"$src"/test.sh "$country" 2nd_param_xlsx',
        env={'src': SRC, 'country': COUNTRY},
        dag=dag)

merge_command = SRC + '/test.sh ' + COUNTRY + ' 2nd_param_merge'
merge_task = BashOperator(
        task_id='merge',
        bash_command=merge_command ,
        dag=dag)

my_templated_command = """
{{ params.src }}/test.sh {{ params.country}} 2nd_param_cleansing
Code example #34
File: utilization_kpi.py Project: vipul-tm/DAGS-PROD
}

Q_PUBLIC = "poller_queue"
Q_PRIVATE = "formatting_queue"
Q_OSPF = "poller_queue"
Q_PING = "poller_queue"

PARENT_DAG_NAME = "UTILIZATION_KPI"
utilization_kpi_dag = DAG(dag_id=PARENT_DAG_NAME,
                          default_args=default_args,
                          schedule_interval='4-59/5 * * * *')

redis_hook_util_10 = RedisHook(redis_conn_id="redis_hook_util_10")
redis_hook_2 = RedisHook(redis_conn_id="redis_hook_2")

technologies = eval(Variable.get('utilization_kpi_technologies'))
machines = eval(Variable.get("system_config_no_o1"))
devices = eval(Variable.get('hostmk.dict.site_mapping'))
attributes = eval(Variable.get('utilization_kpi_attributes'))

all_sites = []


def init_kpi():
    logging.info("TODO : Check All vars and Airflow ETL Environment here")
    redis_hook_util_10.flushall("*")
    logging.info("Flushed all in redis_hook_util_10 connection")


def get_previous_device_states(device_type):
    prev_state = eval(redis_hook_2.get("kpi_ul_prev_state_%s" % device_type))
Code example #35
File: tuto2.py Project: VViles/airflow_test
def set_call(*args, **context):
    group = Variable.get('group')
    if group == 'night_shift':
        context['task_instance'].xcom_push(key='recipient', value='0011223344')
    else:
        context['task_instance'].xcom_push(key='recipient', value='0011223344')
Code example #36
def GetVariableOrDefault(var, default):
  try:
    return Variable.get(var)
  except KeyError:
    return default
Code example #37
def load_dimension_subdag(parent_dag_name, task_id, redshift_conn_id, *args,
                          **kwargs):
    """
    A python function with arguments, which creates a dag
    :param parent_dag_name: imp ({parent_dag_name}.{task_id})
    :param task_id: imp {task_id}
    :param redshift_conn_id: {any connection id}
    :param args: {verbose}
    :param kwargs: {verbose and context variables}
    :return:
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    copy_ports = StageToRedshiftOperator(task_id='copy_ports',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         aws_credentials_id="aws_default",
                                         file='i94port.csv',
                                         delimiter=',',
                                         table='i94ports',
                                         s3_bucket=Variable.get("s3_bucket"),
                                         s3_key="csv",
                                         sql_stmt=SqlQueries.copy_csv_cmd,
                                         provide_context=True)

    copy_visa = StageToRedshiftOperator(task_id='copy_visa',
                                        dag=dag,
                                        redshift_conn_id="redshift",
                                        aws_credentials_id="aws_default",
                                        file='i94visa.csv',
                                        delimiter=',',
                                        table='i94visa',
                                        s3_bucket=Variable.get("s3_bucket"),
                                        s3_key="csv",
                                        sql_stmt=SqlQueries.copy_csv_cmd,
                                        provide_context=True)

    copy_modes = StageToRedshiftOperator(task_id='copy_modes',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         aws_credentials_id="aws_default",
                                         file='i94mode.csv',
                                         delimiter=',',
                                         table='i94mode',
                                         s3_bucket=Variable.get("s3_bucket"),
                                         s3_key="csv",
                                         sql_stmt=SqlQueries.copy_csv_cmd,
                                         provide_context=True)

    copy_addr = StageToRedshiftOperator(task_id='copy_addr',
                                        dag=dag,
                                        redshift_conn_id="redshift",
                                        aws_credentials_id="aws_default",
                                        file='i94addr.csv',
                                        delimiter=',',
                                        table='i94addr',
                                        s3_bucket=Variable.get("s3_bucket"),
                                        s3_key="csv",
                                        sql_stmt=SqlQueries.copy_csv_cmd,
                                        provide_context=True)

    copy_country_codes = StageToRedshiftOperator(
        task_id='copy_country_codes',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_default",
        file='i94cit&i94res.csv',
        delimiter=',',
        table='i94res',
        s3_bucket=Variable.get("s3_bucket"),
        s3_key="csv",
        sql_stmt=SqlQueries.copy_csv_cmd,
        provide_context=True)

    copy_cities_demographics = StageToRedshiftOperator(
        task_id='copy_cities_demographics',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_default",
        file='us-cities-demographics.csv',
        delimiter=';',
        table='us_cities_demographics',
        s3_bucket=Variable.get("s3_bucket"),
        s3_key="csv",
        sql_stmt=SqlQueries.copy_csv_cmd,
        provide_context=True)

    copy_airports = StageToRedshiftOperator(
        task_id='copy_airports',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_default",
        file='airport-codes_csv.csv',
        delimiter=',',
        table='airport_codes',
        s3_bucket=Variable.get("s3_bucket"),
        s3_key="csv",
        sql_stmt=SqlQueries.copy_csv_cmd,
        provide_context=True)

    def parquet_to_redshift(table, s3_bucket, s3_key, iam_role, sql_stmt,
                            redshift_conn_id, **kwargs):
        """
        This function reads parquet files and copies them to redshift
        schema.db
        :param table:
        :param s3_bucket:
        :param s3_key:
        :param iam_role:
        :param sql_stmt:
        :param redshift_conn_id:
        :param kwargs:
        :return:
        """
        redshift = PostgresHook(postgres_conn_id=redshift_conn_id)
        logging.info("Copying data from S3 to Redshift")
        s3_path = "s3://{}/{}".format(s3_bucket, s3_key)
        formatted_sql = sql_stmt.format(table, s3_path, iam_role)
        redshift.run(formatted_sql)
        aws_hook = AwsHook("aws_default")
        credentials = aws_hook.get_credentials()
        client = boto3.client('s3',
                              aws_access_key_id=credentials.access_key,
                              aws_secret_access_key=credentials.secret_key)
        objects_to_delete = client.list_objects(
            Bucket=Variable.get("s3_bucket"), Prefix="parquet")
        delete_keys = {'Objects': []}
        delete_keys['Objects'] = [
            {
                'Key': k
            } for k in
            [obj['Key'] for obj in objects_to_delete.get('Contents', [])]
        ]
        client.delete_objects(Bucket=Variable.get("s3_bucket"),
                              Delete=delete_keys)

    copy_immigration = PythonOperator(
        task_id='copy_immigration',
        python_callable=parquet_to_redshift,  # changed
        provide_context=True,
        op_kwargs={
            'table': "immigration",
            's3_bucket': Variable.get("s3_bucket"),
            's3_key': 'parquet',
            'iam_role': Variable.get('iam_role'),
            'sql_stmt': SqlQueries.copy_parquet_cmd,
            'redshift_conn_id': 'redshift'
        },
        dag=dag)

    copy_ports
    copy_visa
    copy_modes
    copy_addr
    copy_country_codes
    copy_airports
    copy_cities_demographics
    copy_immigration

    return dag
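A factory like this is typically mounted into a parent DAG through `SubDagOperator` (imported in other examples on this page). A sketch of that wiring, with the parent DAG, start date, and ids as illustrative placeholders:

# parent_dag is assumed to have dag_id 'capstone_dag' so the subdag id matches
# the '{parent_dag_name}.{task_id}' convention used by the factory above.
load_dimensions = SubDagOperator(
    task_id='load_dimensions',
    subdag=load_dimension_subdag(parent_dag_name='capstone_dag',
                                 task_id='load_dimensions',
                                 redshift_conn_id='redshift',
                                 start_date=start_date,
                                 schedule_interval='@daily'),
    dag=parent_dag,
)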
Code example #38
try:
    from airflow.utils import timezone  # airflow.utils.timezone is available from v1.10 onwards
    now = timezone.utcnow
except ImportError:
    now = datetime.utcnow

DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(
    ".py", "")  # airflow-db-cleanup
START_DATE = airflow.utils.dates.days_ago(1)
SCHEDULE_INTERVAL = "@daily"  # How often to Run. @daily - Once a day at Midnight (UTC)
DAG_OWNER_NAME = "operations"  # Who is listed as the owner of this DAG in the Airflow Web Server
ALERT_EMAIL_ADDRESSES = [
]  # List of email address to send email alerts to if this job fails
DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS = int(
    Variable.get("airflow_db_cleanup__max_db_entry_age_in_days", 30)
)  # Length to retain the log files if not already provided in the conf. If this is set to 30, the job will remove those files that are 30 days old or older.
ENABLE_DELETE = True  # Whether the job should delete the db entries or not. Included if you want to temporarily avoid deleting the db entries.
DATABASE_OBJECTS = [  # List of all the objects that will be deleted. Comment out the DB objects you want to skip.
    {
        "airflow_db_model": DagRun,
        "age_check_column": DagRun.execution_date,
        "keep_last": True,
        "keep_last_filters": [DagRun.external_trigger == False],
        "keep_last_group_by": DagRun.dag_id
    },
    {
        "airflow_db_model": TaskInstance,
        "age_check_column": TaskInstance.execution_date,
        "keep_last": False,
        "keep_last_filters": None,
Code example #39
# This DAG prints the date and then runs a task that is configured to fail
# (see the expect_failure bash_command), which sends an e-mail to the address
# stored in the 'email' Variable on task failure.

from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

YESTERDAY = datetime.combine(
    datetime.today() - timedelta(days=1), datetime.min.time())

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': YESTERDAY,
    'email': [ Variable.get('email') ],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
}

with DAG('hello_world_email_bonus', default_args=default_args) as dag:
    t1 = BashOperator(task_id='print_date', bash_command='date')
    t2 = BashOperator(task_id='expect_failure', bash_command='exit 1')
    t1 >> t2
Code example #40
File: pre-msa.py Project: veg/SARS-CoV-2
import datetime
import os
import sys
import pathlib
from pathlib import Path

from airflow.models import Variable

p = os.path.abspath(
    str(pathlib.Path(__file__).parent.absolute()) + '/../../python/')
if p not in sys.path:
    sys.path.append(p)

from export_sequences_without_premsa import export_sequences
from store_premsa import store_premsa_file
from premsa_log_parse import mark_troubled
from mark_premsa_dupes import mark_premsa_dupes
from get_raw_duplicates import write_raw_duplicates
from mark_duplicates import mark_duplicates

WORKING_DIR = Variable.get("WORKING_DIR")
DATE_STRING = datetime.date.today().strftime('%Y-%m-%d')

default_args = {
    'owner': 'sweaver',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'params': {
        'working_dir': WORKING_DIR,
        'num_procs': 16,
        'python':
        "/data/shares/veg/SARS-CoV-2/SARS-CoV-2-devel/env/bin/python3",
        'hyphy': "/data/shares/veg/SARS-CoV-2/hyphy/hyphy",
        'hyphy_mpi': "/data/shares/veg/SARS-CoV-2/hyphy/HYPHYMPI",
Code example #41
File: istio_common_dag.py Project: veggiemonk/istio
def AirflowGetVariableOrBaseCase(var, base):
  try:
    return Variable.get(var)
  except KeyError:
    return base
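A minimal usage sketch (the variable name and fallback value below are made up for illustration); note that Variable.get's own default_var argument covers the same missing-variable case:

# Hypothetical variable name and fallback value, for illustration only.
branch = AirflowGetVariableOrBaseCase('istio_release_branch', 'master')

# Roughly equivalent, using Variable.get's built-in default handling.
branch = Variable.get('istio_release_branch', default_var='master')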
Code example #42
def my_function():
    from airflow.models import Variable
    catalogs_folder = Variable.get("CATALOGS_FOLDER")
    
    import numpy as np
    import pandas as pd
    from pplaa import Project
    
    prj = Project()
    prj.init(catalogs_folder + '/example_006')

    prj.cat.raw.pokemon.load()

    prj.cat.raw.pokemon.load()['HP'].max()  # Max HP

    validation_rules = {
        'raw.pokemon': {
            'rules': [
                {
                    'rtype': 'REQUIRED_COLUMNS_RULE',
                    'mandatory': 1,
                    'columns': [
                        'Name', 'Type 1', 'Total', 'HP'
                    ],
                    'strict': 0,
                    'paused': 0
                }, 
                {
                    'rtype': 'MIN_MAX_RULE',
                    'mandatory': 1,
                    'column': 'HP',
                    'min_value': 0,
                    'max_value': 255   # <-- Max HP
                }
            ]
        }
    }

    prj.cat.set_validation_rules(validation_rules)

    prj.cat.validate('raw.pokemon').passed

    # The cat.validate() method returns a ValidationReport object
    type(prj.cat.validate('raw.pokemon'))

    # When we print a ValidationReport, we obtain a report of the result
    print(prj.cat.validate('raw.pokemon'))

    # Forcing fail in the validation changing the max_value for HP (MIN_MAX_RULE)
    validation_rules['raw.pokemon']['rules'][1]['max_value'] = 254

    prj.cat.set_validation_rules(validation_rules)

    prj.cat.validate('raw.pokemon').passed

    print(prj.cat.validate('raw.pokemon'))

    # REQUIRED_COLUMNS_RULE -> Ok
    vars(prj.cat.validate('raw.pokemon').validation_result.results[0]['result'])

    # MIN_MAX_RULE -> Fail
    vars(prj.cat.validate('raw.pokemon').validation_result.results[1]['result'])
Code example #43
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
    'queue': queue,
}

# initiate the DAG
dag = DAG(
    dag_name,
    default_args=default_args,
    description='Running multiple HEC-HMSs using Google Kubernetes Engine',
    schedule_interval=schedule_interval)

hec_config = Variable.get(hec_config_var, deserialize_json=True)

for i in range(parallel_runs):
    generate_run_id = PythonOperator(
        task_id='gen-run-id',
        python_callable=af_kube_utils.generate_random_run_id,
        op_args=[run_id_prefix],
        op_kwargs={"suffix": "%04d" % i},
        provide_context=True,
        dag=dag)

    logging.info('Initializing hec-hms pod')
    hec_pod = get_base_pod()
    hec_pod.metadata.name = 'kube-pod-{{ ti.xcom_pull(task_ids=\'gen-run-id\') }}'
    hec_pod.spec.containers[
        0].name = 'kube-cont-{{ ti.xcom_pull(task_ids=\'gen-run-id\') }}'
Code example #44
# # create formatter and add it to the handlers
# formatter = logging.Formatter('%(asctime)s - %(process)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
# fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# # add the handlers to the logger
# logger.addHandler(fh)
# logger.addHandler(ch)

import datetime
from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from openpyxl import Workbook

project_folder = Variable.get("project_folder")
order_output_folder = Variable.get("order_output_folder")
store_order_file = Variable.get("store_order_file_name")

default_args = {
    'owner': 'Carrefour',
    'start_date': datetime.datetime(2019, 8, 19),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'end_date': datetime.datetime(2030, 1, 1),
}

dag = DAG('get_sales',
Code example #45
File: core.py Project: moritzpein/airflow
    def test_variable_set_get_round_trip_json(self):
        value = {"a": 17, "b": 47}
        Variable.set("tested_var_set_id", value, serialize_json=True)
        assert value == Variable.get("tested_var_set_id", deserialize_json=True)
Code example #46
from pyspark.sql.types import (
    StructType,
    StringType,
    DoubleType,
    IntegerType
)
from pyspark.sql.functions import udf, monotonically_increasing_id

from helpers import (
    practice_prescribing_schema,
    chemicals_schema,
    practices_schema,
    practice_size_schema,
    bnf_codes_schema
)

from airflow.models import BaseOperator, Variable
from airflow.utils.decorators import apply_defaults

S3_STAGING = Variable.get('s3_output_bucket')
S3_RAW_DATA = Variable.get('s3_input_bucket')
aws_access_key_id = Variable.get('aws_access_key_id')
aws_secret_key = Variable.get('aws_secret_access_key')


class PreprocessToS3Operator(BaseOperator):

    ui_color = '#80BD9E'

    @apply_defaults
    def __init__(self,
                 schema="",
                 s3_bucket="",
                 s3_key="",
                 filename="",
Code example #47
File: core.py Project: moritzpein/airflow
    def test_get_non_existing_var_should_not_deserialize_json_default(self):
        default_value = "}{ this is a non JSON default }{"
        assert default_value == Variable.get("thisIdDoesNotExist",
                                             default_var=default_value,
                                             deserialize_json=True)
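As a companion case (not part of the original test file): when the variable does exist but its stored value is not valid JSON, deserialize_json=True tries to json.loads the string and raises ValueError (json.JSONDecodeError) instead of returning the raw value. A minimal sketch in the same unittest style:

    def test_existing_var_with_invalid_json_should_raise(self):
        # Hypothetical companion test; the variable key is made up for illustration.
        Variable.set("tested_var_invalid_json", "}{ not valid JSON }{")
        with self.assertRaises(ValueError):
            Variable.get("tested_var_invalid_json", deserialize_json=True)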
Code example #48
File: varialbes.py Project: Aleks-Ya/yaal_examples
"""
Working with Variables.
Doc: https://airflow.apache.org/concepts.html?highlight=variable#variables
"""

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import datetime
from airflow.models import Variable

text_variable = Variable.get("user")

# Getting a JSON var doesn't work, so it is left commented out
#json_variable = Variable.get("json_var", deserialize_json = True)

default_args = {
    'start_date': datetime.now()
}

dag = DAG('varialbes', default_args=default_args)

text_message = f"echo 'The user variable is {text_variable}'"
#json_message = f"echo 'The json_var={json_variable}'"
t1 = BashOperator(
    task_id='text_variable',
    bash_command=text_message,
    dag=dag)
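For reference, the deserialize_json pattern hinted at by the commented-out lines looks roughly like this; the variable name json_var and its keys are assumptions, and it only works when the stored value is valid JSON:

# Assumes an Airflow Variable "json_var" whose value is valid JSON,
# e.g. {"env": "dev", "greeting": "hello"} (hypothetical keys).
json_variable = Variable.get("json_var", deserialize_json=True)  # returns a dict
json_message = f"echo 'env={json_variable['env']} greeting={json_variable['greeting']}'"

t2 = BashOperator(
    task_id='json_variable',
    bash_command=json_message,
    dag=dag)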