def variables(args):
    if args.get:
        try:
            var = Variable.get(args.get,
                               deserialize_json=args.json,
                               default_var=args.default)
            print(var)
        except ValueError as e:
            print(e)
    if args.set:
        Variable.set(args.set[0], args.set[1])
    if not args.set and not args.get:
        # list all variables
        session = settings.Session()
        vars = session.query(Variable)
        msg = "\n".join(var.key for var in vars)
        print(msg)
def set_sms(*args, **context):
    group = Variable.get('group')
    if group == 'night_shift':
        context['task_instance'].xcom_push('recipient', '0011223344')
        context['task_instance'].xcom_push('message', 'night airflow message')
    else:
        context['task_instance'].xcom_push('recipient', '0011223344')
        context['task_instance'].xcom_push('message', 'day airflow message')
def variables(args):
    if args.get:
        try:
            var = Variable.get(args.get,
                               deserialize_json=args.json,
                               default_var=args.default)
            print(var)
        except ValueError as e:
            print(e)
    if args.delete:
        session = settings.Session()
        session.query(Variable).filter_by(key=args.delete).delete()
        session.commit()
        session.close()
    if args.set:
        Variable.set(args.set[0], args.set[1])
    # Work around 'import' as a reserved keyword
    imp = getattr(args, 'import')
    if imp:
        if os.path.exists(imp):
            import_helper(imp)
        else:
            print("Missing variables file.")
    if args.export:
        export_helper(args.export)
    if not (args.set or args.get or imp or args.export or args.delete):
        # list all variables
        session = settings.Session()
        vars = session.query(Variable)
        msg = "\n".join(var.key for var in vars)
        print(msg)
def failed(self, context):
    self.conf = context["conf"]
    self.task = context["task"]
    self.execution_date = context["execution_date"]
    self.dag = context["dag"]
    self.errors = SlackAPIPostOperator(
        task_id='task_failed',
        token=Variable.get('slack_token'),
        channel='C1SRU2R33',
        text="Your DAG has encountered an error, please follow the link "
             "to view the log details: "
             "http://localhost:8080/admin/airflow/log?"
             "task_id=" + self.task.task_id + "&"
             "execution_date=" + self.execution_date.isoformat() + "&"
             "dag_id=" + self.dag.dag_id,
        dag=self.dag,
    )
    self.errors.execute()
def wrapped(context):
    """ping error in slack on failure and provide link to the log"""
    conf = context["conf"]
    task = context["task"]
    execution_date = context["execution_date"]
    dag = context["dag"]
    base_url = conf.get('webserver', 'base_url')

    # Get the ID of the target slack channel
    slack_token = Variable.get(slack_token_variable)
    sc = SlackClient(slack_token)
    response = sc.api_call('channels.list')
    for channel in response['channels']:
        if channel['name'].lower() == channel_name.lower():
            break
    else:
        raise AirflowException('No channel named {} found.'.format(channel_name))

    # Construct a slack operator to send the message off.
    notifier = cls(
        task_id='task_failed',
        token=slack_token,
        channel=channel['id'],
        text=(
            "Your DAG has encountered an error, please follow the link "
            "to view the log details: "
            "{}/admin/airflow/log?"
            "task_id={}&"
            "dag_id={}&"
            "execution_date={}"
        ).format(base_url, task.task_id, dag.dag_id,
                 execution_date.isoformat()),
        dag=dag,
    )
    notifier.execute()
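A callback of this shape is normally wired up through on_failure_callback in a DAG's default_args (as one of the later snippets does with task_fail_slack_alert). A minimal, self-contained sketch — the DAG id and the print-based callback are illustrative only, standing in for the Slack notifier built above:

from datetime import datetime

from airflow import DAG
from airflow.operators.bash_operator import BashOperator


def notify_failure(context):
    # In the snippet above this would build and execute a Slack operator;
    # here we only log, to keep the sketch self-contained.
    print("Task {} in DAG {} failed".format(
        context["task"].task_id, context["dag"].dag_id))


default_args = {
    "owner": "airflow",
    "start_date": datetime(2021, 1, 1),
    # Airflow calls this with the task context whenever a task fails.
    "on_failure_callback": notify_failure,
}

with DAG("failure_callback_example", default_args=default_args,
         schedule_interval="@daily") as dag:
    BashOperator(task_id="always_fails", bash_command="exit 1")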
try:
    # airflow.utils.timezone is available from v1.10 onwards
    from airflow.utils import timezone
    now = timezone.utcnow
except ImportError:
    now = datetime.utcnow

DAG_ID = os.path.basename(__file__).replace(".pyc", "").replace(".py", "")  # airflow-db-cleanup
START_DATE = airflow.utils.dates.days_ago(1)
# How often to Run. @daily - Once a day at Midnight (UTC)
SCHEDULE_INTERVAL = "@daily"
# Who is listed as the owner of this DAG in the Airflow Web Server
DAG_OWNER_NAME = "operations"
# List of email addresses to send email alerts to if this job fails
ALERT_EMAIL_ADDRESSES = []
# Length of time to retain the entries if not already provided in the conf.
# If this is set to 30, the job will remove entries that are 30 days old or older.
DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS = int(
    Variable.get("airflow_db_cleanup__max_db_entry_age_in_days", 30))
# Whether the job should delete the db entries or not. Included if you want to
# temporarily avoid deleting the db entries.
ENABLE_DELETE = True
# List of all the objects that will be deleted. Comment out the DB objects you want to skip.
DATABASE_OBJECTS = [
    {
        "airflow_db_model": DagRun,
        "age_check_column": DagRun.execution_date,
        "keep_last": True,
        "keep_last_filters": [DagRun.external_trigger == False],
        "keep_last_group_by": DagRun.dag_id
    },
    {
        "airflow_db_model": TaskInstance,
        "age_check_column": TaskInstance.execution_date,
        "keep_last": False,
        "keep_last_filters": None,
# 2. Fact tables dependent on the stage tables are also loaded
#    (Fact tables dependent upon two different stages will not be loaded).
#    This gives a little bit of flexibility over db-based task groups
#######################################################################################
from airflow import DAG
import base64
from datetime import timedelta
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflowcommon import getBatchId, getpythonoperator, getbashoperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator
from airflow.utils.task_group import TaskGroup

# set the default config for the dag
dset = Variable.get("factloadjob1", deserialize_json=True)
kinitparms = Variable.get("kinitparms", deserialize_json=True)
password = kinitparms["kinitpass"]
password = base64.b64decode(password).decode('utf-8')
#kinitprincipal = kinitparms["kinitprincipal"]
crpmdevicedict = dset["crpm_device_mapping"]
sqoopjobs = crpmdevicedict["jobs"]
factdb = crpmdevicedict["factdb"]
srctoland = crpmdevicedict["src2land"]
land2stg = crpmdevicedict["land2stg"]
scriptpaths = dset["scriptpaths"]
kinitprincipal = kinitparms["kinitprincipal"]
kinitdomain = kinitparms["kinitdomain"]
edgenodehost = kinitparms["edgenodehost"]

default_args = {
def my_function():
    from airflow.models import Variable
    catalogs_folder = Variable.get("CATALOGS_FOLDER")

    import numpy as np
    import pandas as pd
    from pplaa import Project

    prj = Project()
    prj.init(catalogs_folder + '/example_006')
    prj.cat.raw.pokemon.load()
    prj.cat.raw.pokemon.load()['HP'].max()  # Max HP

    validation_rules = {
        'raw.pokemon': {
            'rules': [
                {
                    'rtype': 'REQUIRED_COLUMNS_RULE',
                    'mandatory': 1,
                    'columns': ['Name', 'Type 1', 'Total', 'HP'],
                    'strict': 0,
                    'paused': 0
                },
                {
                    'rtype': 'MIN_MAX_RULE',
                    'mandatory': 1,
                    'column': 'HP',
                    'min_value': 0,
                    'max_value': 255  # <-- Max HP
                }
            ]
        }
    }
    prj.cat.set_validation_rules(validation_rules)
    prj.cat.validate('raw.pokemon').passed

    # The cat.validate() method returns a ValidationReport object
    type(prj.cat.validate('raw.pokemon'))
    # When we print a ValidationReport, we obtain a report of the result
    print(prj.cat.validate('raw.pokemon'))

    # Force the validation to fail by lowering the max_value for HP (MIN_MAX_RULE)
    validation_rules['raw.pokemon']['rules'][1]['max_value'] = 254
    prj.cat.set_validation_rules(validation_rules)
    prj.cat.validate('raw.pokemon').passed
    print(prj.cat.validate('raw.pokemon'))

    # REQUIRED_COLUMNS_RULE -> Ok
    vars(prj.cat.validate('raw.pokemon').validation_result.results[0]['result'])
    # MIN_MAX_RULE -> Fail
    vars(prj.cat.validate('raw.pokemon').validation_result.results[1]['result'])
    }]
    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(task_id='task_timeout_monitor',
                                      python_callable=fun_task_timeout_monitor,
                                      provide_context=True,
                                      dag=dag)

##----------------------------------------- Variables ---------------------------------------##

db_name = "opay_dw_ods"
table_name = "ods_sqoop_base_message_record_di"
hdfs_path = "oss://opay-datalake/opay_dw_sqoop_di/opay_sms/message_record"
config = eval(Variable.get("opay_time_zone_config"))


def ods_sqoop_base_message_record_di_sql_task(ds):
    HQL = '''
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.parallel=true;
    insert overwrite table {db}.{table} partition (dt)
    SELECT
        id,
        template_name,
        country_code,
        message_type,
        mobile,
        content,
default_args = {
    'owner': 'chr0nomaton',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('Automata',
          start_date=datetime(2018, 12, 30),
          default_args=default_args,
          schedule_interval=timedelta(days=1))

user_id = Variable.get("SPOTIFY_CLIENT_USER_ID")
auth_token = Variable.get("SPOTIFY_CLIENT_TOKEN_CACHE")
with open(f"/usr/local/airflow/.cache-{user_id}", "w+") as f:
    print(f"Wrote {auth_token} to /usr/local/airflow/.cache-{user_id}")
    f.write(auth_token)

spotify = SpotifyAPI(
    user_id=user_id,
    client_id=Variable.get("SPOTIFY_CLIENT_ID"),
    client_secret=Variable.get("SPOTIFY_CLIENT_SECRET"),
    redirect_uri=Variable.get("SPOTIFY_CLIENT_REDIRECT_URI"),
)

with open(f"/usr/local/airflow/.cache-{user_id}") as f:
    Variable.set("SPOTIFY_CLIENT_TOKEN_CACHE", f.read())

t1_get_bands = PythonOperator(task_id='fetch_reddit_posts',
                              python_callable=get_reddit_posts,
default_args = {
    'owner': 'airflow',
    'description': 'Gathers MDS data from Bird',
    'depends_on_past': False,
    'start_date': datetime(2018, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'on_failure_callback': task_fail_slack_alert,
}

mds_provider = "bird"
current_time = datetime.now() + timedelta(days=-1, hours=-6)
time_max = f"{current_time.year}-{current_time.month}-{current_time.day}-{(current_time.hour)}"
environment_vars = Variable.get("atd_mds_config_staging", deserialize_json=True)
docker_image = 'atddocker/atd-mds-etl:master'

with DAG(
        f"atd_mds_{mds_provider}_staging",
        default_args=default_args,
        schedule_interval="15 * * * *",
        catchup=False,
        tags=["staging", "mds"],
) as dag:
    #
    # Task: provider_extract
    # Description: Given a schedule block, the script extracts data from the MDS provider
    # within the schedule's time window, then it uploads the data into S3 for further processing.
    #
    t1 = DockerOperator(
def ReportDailySuccessful(task_instance, **kwargs):
    date = kwargs['execution_date']
    latest_run = float(Variable.get('latest_daily_timestamp'))

    timestamp = time.mktime(date.timetuple())
    logging.info('Current run\'s timestamp: %s \n'
                 'latest_daily\'s timestamp: %s', timestamp, latest_run)
    if timestamp >= latest_run:
        Variable.set('latest_daily_timestamp', timestamp)
        run_sha = task_instance.xcom_pull(task_ids='get_git_commit')
        latest_version = GetSettingPython(task_instance, 'VERSION')
        logging.info('setting latest green daily to: %s', run_sha)
        Variable.set('latest_sha', run_sha)
        Variable.set('latest_daily', latest_version)
        logging.info('latest_sha set to %s', run_sha)
def GetVariableOrDefault(var, default):
    try:
        return Variable.get(var)
    except KeyError:
        return default
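For comparison, Variable.get also accepts a default directly (as the default_var test cases further down show), so a helper like the one above can be collapsed to a single call. A minimal sketch; the variable name mirrors the 'major_version' lookup used elsewhere in these snippets:

from airflow.models import Variable

# default_var is returned when the Variable does not exist, instead of raising KeyError.
major_version = Variable.get("major_version", default_var=0)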
def set_mail(*args, **context):
    group = Variable.get('group')
    if group == 'night_shift':
        context['task_instance'].xcom_push(key='recipient', value='*****@*****.**')
    else:
        context['task_instance'].xcom_push(key='recipient', value='*****@*****.**')
from datetime import datetime, timedelta

from airflow.operators.subdag_operator import SubDagOperator
from airflow.models import DAG, Variable

from tester_collector.subdags.sub import all_process

PROJECT_VERSION = '1.0'
PROJECT_NAME = 'tester-collector'

# MAIN DAGS
# interval = "0 3 */1 * *"
interval = "*/10 * * * *"
DAG_ID = 'tester_collector'
start_date = datetime.strptime(Variable.get("tester_collector_start_date"),
                               "%Y-%m-%d %H:%M:%S")
emails = Variable.get('support_email_list').split(',')

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': start_date,
    'email': emails,
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=2)
}

with DAG(dag_id=DAG_ID,
         default_args=default_args,
         schedule_interval=interval,
         start_date=start_date) as dag:
def dividend_probability_calculator():
    credentials = service_account.Credentials.from_service_account_info(
        Variable.get("key", deserialize_json=True))
    destination_bucket_name = 'dividend_declarations_hackathon'
    storage_client = storage.Client()
    destination_bucket = storage_client.bucket(destination_bucket_name)
    project_id = 'hackathon-wpb'
    table_id = 'hackathon-wpb.customer_relations.customer_dividend_malaysia'
    query_string = """ SELECT * FROM hackathon-wpb.customer_relations.customer_dividend_malaysia"""
    table_schema = [{
        'name': 'Ticker', 'type': 'STRING', 'mode': 'REQUIRED'
    }, {
        'name': 'Mic', 'type': 'STRING', 'mode': 'REQUIRED'
    }, {
        'name': 'Contacts', 'type': 'RECORD', 'mode': 'REPEATED',
        'fields': [{'name': 'Name', 'type': 'STRING', 'mode': 'NULLABLE'},
                   {'name': 'email', 'type': 'STRING', 'mode': 'NULLABLE'}]
    }, {
        'name': 'Dividend', 'type': 'RECORD', 'mode': 'REPEATED',
        'fields': [{'name': 'DeclarationYear', 'type': 'STRING', 'mode': 'NULLABLE'},
                   {'name': 'DeclaratioMonth', 'type': 'STRING', 'mode': 'NULLABLE'},
                   {'name': 'DeclarationDate', 'type': 'STRING', 'mode': 'NULLABLE'}]
    }, {
        'name': 'RecentDeclarationDate', 'type': 'DATE', 'mode': 'NULLABLE'
    }, {
        'name': 'NextPayableDate', 'type': 'DATE', 'mode': 'NULLABLE'
    }, {
        'name': 'ExpectedStartDate', 'type': 'DATE', 'mode': 'NULLABLE'
    }, {
        'name': 'ExpectedEndDate', 'type': 'DATE', 'mode': 'NULLABLE'
    }, {
        'name': 'LastRunDate', 'type': 'DATE', 'mode': 'NULLABLE'
    }, {
        'name': 'ProbabilityNextMonthDeclaration', 'type': 'NUMERIC', 'mode': 'NULLABLE'
    }, {
        'name': 'Period', 'type': 'INTEGER', 'mode': 'NULLABLE'
    }]
    project_id = 'hackathon-wpb'
    dataset_id = 'customer_relations'
    table_id = 'customer_dividend_malaysia'
    client = bigquery.Client(project=project_id)
    dataset = client.dataset(dataset_id)
    table = dataset.table(table_id)
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
    job_config.schema = table_schema
    job_config.write_disposition = 'WRITE_TRUNCATE'

    dataframe_complete = pdgbq.read_gbq(query=query_string, project_id=project_id)
    dataframe = dataframe_complete
    print(dataframe.dtypes)
    print(len(dataframe))

    base = datetime.today().date()
    start_date = base + timedelta(days=30)
    end_date = base + timedelta(days=70)

    df_companies = pd.DataFrame(dataframe_complete.Ticker.unique())
    df_companies.rename(columns={0: 'Ticker'}, inplace=True)
    df_companies['ProbabilityNextMonthDeclaration'] = 0.0
    df_companies['ExpectedStartDate'] = ''
    df_companies['ExpectedEndDate'] = ''
    convert_dict = {
        'Ticker': str,
        'ProbabilityNextMonthDeclaration': float,
        'ExpectedStartDate': np.datetime64,
        'ExpectedEndDate': np.datetime64
    }
    df_companies = df_companies.astype(convert_dict)

    for ind in df_companies.index:
        company_name = df_companies['Ticker'][ind]
        df_company_temp = pd.DataFrame(
            dataframe.loc[dataframe['Ticker'] == company_name])
        df_company_temp_list = df_company_temp['Dividend']
        index = df_company_temp_list.index
        try:
            df_company_temp_2 = json_normalize(df_company_temp_list[index[0]])
            df = pd.DataFrame({
                'year': df_company_temp_2['DeclarationYear'],
                'month': df_company_temp_2['DeclaratioMonth'],
                'day': df_company_temp_2['DeclarationDate']
            })
            df_company_temp_2['Date'] = pd.to_datetime(df)
            if (company_name == 'CIMB.XKLS'):
                print(df_company_temp_2)
            df_company_temp_2.drop_duplicates(subset=[
                'DeclarationYear', 'DeclaratioMonth', 'DeclarationDate'
            ], inplace=True)
            if (company_name == 'CIMB.XKLS'):
                print(df_company_temp_2)
        except:
            continue

        total_declarations = len(df_company_temp_2['DeclarationYear'].unique())
        recent_years = []
        non_recent_years = []
        count_recent = 0
        count_recent_two_years = 0
        number_of_recent_years = 0
        number_of_non_recent_years = 0
        count_non_recent = 0
        base_date_minus_2 = base - timedelta(days=730)
        base_date_minus_5 = base - timedelta(days=1825)
        year_considered = []

        for ind2 in df_company_temp_2.index:
            months = dataframe.loc[dataframe['Ticker'] == company_name, 'Period']
            if (not (math.isnan(months))):
                start_date = base + timedelta(days=int(months) * 30)
                end_date = start_date + timedelta(days=40)
                start_dates, end_dates = date_to_months(start_date, end_date)
                date_temp = df_company_temp_2['Date'][ind2]
                if (company_name == 'CIMB.XKLS'):
                    print(date_temp)
                if (base_date_minus_2 <= date_temp < base):
                    recent_years.append(date_temp.year)
                elif (base_date_minus_5 <= date_temp < base_date_minus_2):
                    recent_years.append(date_temp.year)
                else:
                    non_recent_years.append(date_temp.year)
                out_fmt = '%Y-%m-%d'
                # for every month check if previous declaration month/date falls in the range
                for start, end in zip(start_dates, end_dates):
                    year = start.year
                    if (company_name == 'CIMB.XKLS'):
                        print('base')
                        print(base)
                        print('start')
                        print(start)
                        print('end')
                        print(end)
                    try:
                        if start.replace(year=year) <= date_temp.replace(year=year) <= end.replace(year=year):
                            if (not date_temp.year in year_considered):
                                if (base_date_minus_2 <= date_temp < base):
                                    count_recent_two_years = count_recent_two_years + 1
                                    if (company_name == 'CIMB.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (base_date_minus_5 <= date_temp < base_date_minus_2):
                                    count_recent = count_recent + 1
                                    if (company_name == 'CIMB.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (date_temp <= base_date_minus_5):
                                    count_non_recent = count_non_recent + 1
                                    if (company_name == 'CIMB.XKLS'):
                                        print(date_temp)
                                        print("count_non_recent incremented")
                                year_considered.append(date_temp.year)
                    except:
                        # to handle 29th feb
                        #print(date_temp)
                        one_day = timedelta(1)
                        date_temp = date_temp - one_day
                        if start.replace(year=year) <= date_temp.replace(year=year) <= end.replace(year=year):
                            if (not date_temp.year in year_considered):
                                if (base_date_minus_2 <= date_temp < base):
                                    count_recent_two_years = count_recent_two_years + 1
                                    if (company_name == 'UEMS.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (base_date_minus_5 <= date_temp < base_date_minus_2):
                                    count_recent = count_recent + 1
                                    if (company_name == 'UEMS.XKLS'):
                                        print(date_temp)
                                        print("count_recent incremented")
                                elif (date_temp < base_date_minus_5):
                                    count_non_recent = count_non_recent + 1
                                    #print("count_non_recent incremented")
                                year_considered.append(date_temp.year)

        number_of_latest_years = 2
        number_of_recent_years = 3  #(pd.Series(recent_years)).nunique()
        number_of_non_recent_years = (pd.Series(non_recent_years)).nunique()
        probability = (
            (3 * weird_division(count_recent_two_years, number_of_latest_years)) +
            (2 * weird_division(count_recent, number_of_recent_years)) +
            (weird_division(count_non_recent, number_of_non_recent_years))) / 6
        if (company_name == 'CIMB.XKLS'):
            print(count_recent_two_years)
            print(number_of_latest_years)
            print(count_recent)
            print(number_of_recent_years)
            print(count_non_recent)
            print(non_recent_years)
            print(number_of_non_recent_years)
            print(probability)

        df_companies['ProbabilityNextMonthDeclaration'][ind] = round(probability, 3)
        df_companies['ExpectedStartDate'][ind] = np.datetime64(start_date)
        df_companies['ExpectedEndDate'][ind] = np.datetime64(end_date)
        dataframe.loc[dataframe['Ticker'] == company_name,
                      'ProbabilityNextMonthDeclaration'] = str(probability)
        dataframe.loc[dataframe['Ticker'] == company_name,
                      'ExpectedStartDate'] = np.datetime64(start_date)
        dataframe.loc[dataframe['Ticker'] == company_name,
                      'ExpectedEndDate'] = np.datetime64(end_date)

    dataframe_complete.drop([
        'ProbabilityNextMonthDeclaration', 'ExpectedStartDate',
        'ExpectedEndDate', 'LastRunDate'
    ], axis=1, inplace=True)
    df_update = pd.merge(dataframe_complete, df_companies,
                         left_on='Ticker', right_on='Ticker')
    df_update['LastRunDate'] = np.datetime64(base)
    df_update['NextPayableDate'] = df_update['NextPayableDate'].dt.strftime('%Y-%m-%d')
    df_update['ExpectedStartDate'] = df_update['ExpectedStartDate'].dt.strftime('%Y-%m-%d')
    df_update['ExpectedEndDate'] = df_update['ExpectedEndDate'].dt.strftime('%Y-%m-%d')
    df_update['LastRunDate'] = df_update['LastRunDate'].dt.strftime('%Y-%m-%d')
    df_update['RecentDeclarationDate'] = df_update['RecentDeclarationDate'].dt.strftime('%Y-%m-%d')

    json_data = df_update.to_json(orient="records")
    json_object = json.loads(json_data)
    job = client.load_table_from_json(
        json_object, table, job_config=job_config)  # Make an API request.
    filename = 'customer_dividend_malaysia_probability_update_' + str(datetime.now()) + '.json'
    blob = destination_bucket.blob(filename)
    blob.upload_from_string(data=json.dumps(json_object),
                            content_type='application/json')
    job.result()

    count_mails = 0  # remove for actual code
    for ind3 in df_companies.index:
        company_name = df_companies['Ticker'][ind3]
        print(company_name)
        print(ind3)
        probability = df_companies['ProbabilityNextMonthDeclaration'][ind3]
        expected_start_date = df_update.loc[dataframe['Ticker'] == company_name,
                                            'ExpectedStartDate'].iloc[0]
        expected_end_date = df_update.loc[dataframe['Ticker'] == company_name,
                                          'ExpectedEndDate'].iloc[0]
        if (float(probability) > 0.9 and count_mails < 10):
            df_contacts_temp = pd.DataFrame(
                dataframe.loc[dataframe['Ticker'] == company_name])
            df_contacts_temp_list = df_company_temp['Contacts']
            index = df_contacts_temp_list.index
            df_contacts = json_normalize(df_contacts_temp_list[index[0]])
            contacts = df_contacts.drop_duplicates(subset=['email'], keep='last')

            html_string = None
            with open(
                    '/opt/bitnami/airflow/dags/git-github-com-jainita95-dividend-tracker-git/EmailTemplateUpcomingDividend.html',
                    'r') as f:
                html_string = f.read()
            html_string = html_string.format(code=company_name,
                                             startDate=expected_start_date,
                                             endDate=expected_end_date,
                                             probability=math.ceil(probability * 100))
            name = []
            emails = []
            for ind4 in contacts.index:
                name_contact = contacts['Name'][ind4]
                email = contacts['email'][ind4]
                name.append(name_contact)
                emails.append(To(email))
            message = Mail(
                from_email='*****@*****.**',
                to_emails=emails,
                subject="Notice: An Upcoming Dividend Declaration cited for " + company_name,
                html_content=html_string)

            with open(
                    '/opt/bitnami/airflow/dags/git-github-com-jainita95-dividend-tracker-git/hsbcLogo.png',
                    'rb') as f:
                data = f.read()
                f.close()
            encoded = base64.b64encode(data).decode()
            attachment = Attachment()
            attachment.file_content = FileContent(encoded)
            attachment.file_type = FileType('image/png')
            attachment.file_name = FileName('hsbcLogo.png')
            attachment.disposition = Disposition('inline')
            attachment.content_id = ContentId('hsbclogo')
            message.add_attachment(attachment)

            try:
                sg = SendGridAPIClient(Variable.get("sendgridapikey"))
                response = sg.send(message)
                count_mails = count_mails + 1
                #print(response.status_code)
                #print(response.body)
                #print(response.headers)
            except Exception as e:
                print(e.message)
import json
from datetime import datetime, timedelta

import requests

from airflow import DAG
from airflow.hooks.mysql_hook import MySqlHook
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator

# Utils
api_key = Variable.get('api_key')
cities_ = Variable.get('cities').split(',')
cities = [x.encode('utf-8') for x in cities_]


def open_weather_response_parser(response_text, city):
    response_dict = json.loads(response_text)
    main = response_dict['main']
    temp_live = round(main['temp'] - 273.15, 2)
    temp_max = round(main['temp_max'] - 273.15, 2)
    temp_min = round(main['temp_min'] - 273.15, 2)
    humidity = main['humidity']
    pressure = main['pressure']
    weather = response_dict['weather'][0]['main']
    wind_speed = response_dict['wind']['speed']
    time = datetime.utcnow()
    parsed_response = {
        'city': city,
        'temp_live': temp_live,
        'temp_max': temp_max,
def DEFAULT_SQL_DIR(cls):
    sql_dir = Path(Variable.get("sql_dir"))
    if not sql_dir.exists():
        PKG_PARENT = Path(__file__).absolute().parent.parent.parent.parent
        sql_dir = PKG_PARENT / "airflow-core/sql"
    return sql_dir / "salesforce"
import json
import os
import sys

from airflow.models import Variable

sys.path.append(Variable.get('module_path'))
# sys.path.append(os.getenv('MODULE_PATH'))

from models.dag_task_model import DagTaskModel
from models.notification_message_model import NotificationMessageModel, NotificationType
from models.notification_subject_model import NotificationSubjectModel
from models.slack_model import SlackModel
from messengers.slack_notification import SlackNotification
# import Log


class SlackNotificationService:
    def __init__(self, notification_type: str) -> None:
        self.notification_type = NotificationType(notification_type)
        self.COUNT = 0
        self.RETRY_LIMIT = 5

    def send_message(self, status: str) -> dict:
        # str.translate() needs ordinal keys, so build the translation tables
        # with str.maketrans to strip the angle brackets (and colon) from the reprs.
        dag_id = str(status['dag']).translate(str.maketrans('', '', '<:>'))
        task_id = str(status['task']).translate(str.maketrans('', '', '<>'))
        dag_task = DagTaskModel(dag_id, task_id)
        notification_message: NotificationMessageModel = NotificationMessageModel(
            dag_task, self.notification_type)
        notification_subject: NotificationSubjectModel = NotificationSubjectModel(
            dag_task)
        slack_model = SlackModel(notification_subject, notification_message)
}

DOCKER_IMAGE = "atddocker/atd-knack-services:production"

# command args
SCRIPT_TASK_1 = "records_to_postgrest"
SCRIPT_TASK_2 = "records_to_agol"
SCRIPT_TASK_3 = "agol_build_markings_segment_geometries"
SCRIPT_TASK_4 = "records_to_socrata"

APP_NAME = "signs-markings"
ENV = "prod"
POOL_KNACK = "knack_signs_markings"
POOL_POSTGREST = "atd_knack_postgrest_pool"
CONTAINER = "view_3100"

env_vars = Variable.get("atd_knack_services_postgrest", deserialize_json=True)
atd_knack_auth = Variable.get("atd_knack_auth", deserialize_json=True)
env_vars["KNACK_APP_ID"] = atd_knack_auth[APP_NAME][ENV]["app_id"]
env_vars["KNACK_API_KEY"] = atd_knack_auth[APP_NAME][ENV]["api_key"]
env_vars["AGOL_USERNAME"] = Variable.get("agol_username")
env_vars["AGOL_PASSWORD"] = Variable.get("agol_password")
env_vars["SOCRATA_API_KEY_ID"] = Variable.get(
    "atd_service_bot_socrata_api_key_id")
env_vars["SOCRATA_API_KEY_SECRET"] = Variable.get(
    "atd_service_bot_socrata_api_key_secret")
env_vars["SOCRATA_APP_TOKEN"] = Variable.get(
    "atd_service_bot_socrata_app_token")

with DAG(
        dag_id="atd_knack_markings_work_orders_jobs",
        description=
def get_reddit_posts(**context):
    reddit = CurrentDaysBands(
        client_id=Variable.get("REDDIT_CLIENT_ID"),
        client_secret=Variable.get("REDDIT_CLIENT_SECRET"),
        user_agent=Variable.get("REDDIT_USER_AGENT"))
    return reddit.get_bands(context['yesterday_ds'])
import os
from os.path import expanduser
from datetime import datetime, timedelta  # needed for start_date / retry_delay below

from airflow import DAG
from airflow.models import Variable
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.http_operator import SimpleHttpOperator
from airflow.operators.idea_plugin import BigQueryTableModifiedSensor

home = expanduser("~")
STATE_PATH = '{0}/gcs/data/nwea_assessment_results_last_modified.text'.format(home)
IDEA2_API_KEY = Variable.get('idea2_api_key')

"""
DAG for updating Illuminate grade review data
"""

default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    "start_date": datetime(2019, 3, 12),
    "email": ["*****@*****.**"],
    "email_on_failure": True,
    "email_on_retry": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=2),
    "provide_context": True
    # 'queue': 'bash_queue',
from airflow import DAG
from airflow.models import Variable
from airflow.operators.python_operator import PythonOperator
from datetime import timedelta, datetime

from tasks.fetch_covid_cases import fetch_daily_data
from tasks.find_upload_percentage import find_percentage
from tasks.upload_csv_to_big_table import upload_csv_to_big_table

import yaml

# fetching constants from airflow
dag_config = Variable.get("bigquery_variables", deserialize_json=True)
BQ_CONN_ID = dag_config["bq_conn_id"]
BQ_PROJECT = dag_config["bq_project"]
BQ_TABLE = dag_config["bq_table"]
BQ_DATASET = dag_config["bq_dataset"]

# from config yaml file
with open("config/pipelines/covid_pipeline.yaml", 'r') as stream:
    try:
        dag_info = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)

# from library yaml file
with open("library/pipeline_defaults.yaml", 'r') as stream:
    try:
        dag_defaults = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
import os
import time

import boto3

import airflow.hooks.S3_hook
from airflow import DAG
from airflow.models import Variable
from airflow.operators import BashOperator
from datetime import datetime, timedelta
from airflow.operators.python_operator import PythonOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.hooks.postgres_hook import PostgresHook

## API key to connect to the weather API
## Alternatively, store it in a file and source it
API_KEY = Variable.get("weather_api_key")

# Following are defaults which can be overridden later on
# dag variables
default_args = {
    'owner': 'Srilekha',
    'depends_on_past': False,
    'start_date': datetime(2020, 11, 6),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}
def AirflowGetVariableOrBaseCase(var, base):
    try:
        return Variable.get(var)
    except KeyError:
        return base
}

Q_PUBLIC = "poller_queue"
Q_PRIVATE = "formatting_queue"
Q_OSPF = "poller_queue"
Q_PING = "poller_queue"
PARENT_DAG_NAME = "UTILIZATION_KPI"

utilization_kpi_dag = DAG(dag_id=PARENT_DAG_NAME,
                          default_args=default_args,
                          schedule_interval='4-59/5 * * * *')

redis_hook_util_10 = RedisHook(redis_conn_id="redis_hook_util_10")
redis_hook_2 = RedisHook(redis_conn_id="redis_hook_2")
technologies = eval(Variable.get('utilization_kpi_technologies'))
machines = eval(Variable.get("system_config_no_o1"))
devices = eval(Variable.get('hostmk.dict.site_mapping'))
attributes = eval(Variable.get('utilization_kpi_attributes'))
all_sites = []


def init_kpi():
    logging.info("TODO : Check All vars and Airflow ETL Environment here")
    redis_hook_util_10.flushall("*")
    logging.info("Flushed all in redis_hook_util_10 connection")


def get_previous_device_states(device_type):
    prev_state = eval(redis_hook_2.get("kpi_ul_prev_state_%s" % device_type))
def get_sql_dir():
    sql_dir = Path(Variable.get("sql_dir"))
    if not sql_dir.exists():
        PKG_PARENT = Path(__file__).absolute().parent.parent.parent.parent
        sql_dir = PKG_PARENT / "airflow-core/sql"
    return sql_dir
def load_dimension_subdag(parent_dag_name, task_id, redshift_conn_id, *args, **kwargs):
    """
    A python function with arguments, which creates a dag
    :param parent_dag_name: imp ({parent_dag_name}.{task_id})
    :param task_id: imp {task_id}
    :param redshift_conn_id: {any connection id}
    :param args: {verbose}
    :param kwargs: {verbose and context variables}
    :return:
    """
    dag = DAG(f"{parent_dag_name}.{task_id}", **kwargs)

    copy_ports = StageToRedshiftOperator(task_id='copy_ports',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         aws_credentials_id="aws_default",
                                         file='i94port.csv',
                                         delimiter=',',
                                         table='i94ports',
                                         s3_bucket=Variable.get("s3_bucket"),
                                         s3_key="csv",
                                         sql_stmt=SqlQueries.copy_csv_cmd,
                                         provide_context=True)

    copy_visa = StageToRedshiftOperator(task_id='copy_visa',
                                        dag=dag,
                                        redshift_conn_id="redshift",
                                        aws_credentials_id="aws_default",
                                        file='i94visa.csv',
                                        delimiter=',',
                                        table='i94visa',
                                        s3_bucket=Variable.get("s3_bucket"),
                                        s3_key="csv",
                                        sql_stmt=SqlQueries.copy_csv_cmd,
                                        provide_context=True)

    copy_modes = StageToRedshiftOperator(task_id='copy_modes',
                                         dag=dag,
                                         redshift_conn_id="redshift",
                                         aws_credentials_id="aws_default",
                                         file='i94mode.csv',
                                         delimiter=',',
                                         table='i94mode',
                                         s3_bucket=Variable.get("s3_bucket"),
                                         s3_key="csv",
                                         sql_stmt=SqlQueries.copy_csv_cmd,
                                         provide_context=True)

    copy_addr = StageToRedshiftOperator(task_id='copy_addr',
                                        dag=dag,
                                        redshift_conn_id="redshift",
                                        aws_credentials_id="aws_default",
                                        file='i94addr.csv',
                                        delimiter=',',
                                        table='i94addr',
                                        s3_bucket=Variable.get("s3_bucket"),
                                        s3_key="csv",
                                        sql_stmt=SqlQueries.copy_csv_cmd,
                                        provide_context=True)

    copy_country_codes = StageToRedshiftOperator(
        task_id='copy_country_codes',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_default",
        file='i94cit&i94res.csv',
        delimiter=',',
        table='i94res',
        s3_bucket=Variable.get("s3_bucket"),
        s3_key="csv",
        sql_stmt=SqlQueries.copy_csv_cmd,
        provide_context=True)

    copy_cities_demographics = StageToRedshiftOperator(
        task_id='copy_cities_demographics',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_default",
        file='us-cities-demographics.csv',
        delimiter=';',
        table='us_cities_demographics',
        s3_bucket=Variable.get("s3_bucket"),
        s3_key="csv",
        sql_stmt=SqlQueries.copy_csv_cmd,
        provide_context=True)

    copy_airports = StageToRedshiftOperator(
        task_id='copy_airports',
        dag=dag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_default",
        file='airport-codes_csv.csv',
        delimiter=',',
        table='airport_codes',
        s3_bucket=Variable.get("s3_bucket"),
        s3_key="csv",
        sql_stmt=SqlQueries.copy_csv_cmd,
        provide_context=True)

    def parquet_to_redshift(table, s3_bucket, s3_key, iam_role, sql_stmt,
                            redshift_conn_id, **kwargs):
        """
        This function reads parquet files and copies them to redshift schema.db
        :param table:
        :param s3_bucket:
        :param s3_key:
        :param iam_role:
        :param sql_stmt:
        :param redshift_conn_id:
        :param kwargs:
        :return:
        """
        redshift = PostgresHook(postgres_conn_id=redshift_conn_id)
        logging.info("Copying data from S3 to Redshift")
        s3_path = "s3://{}/{}".format(s3_bucket, s3_key)
        formatted_sql = sql_stmt.format(table, s3_path, iam_role)
        redshift.run(formatted_sql)

        aws_hook = AwsHook("aws_default")
        credentials = aws_hook.get_credentials()
        client = boto3.client('s3',
                              aws_access_key_id=credentials.access_key,
                              aws_secret_access_key=credentials.secret_key)
        objects_to_delete = client.list_objects(
            Bucket=Variable.get("s3_bucket"), Prefix="parquet")
        delete_keys = {'Objects': []}
        delete_keys['Objects'] = [
            {'Key': k}
            for k in [obj['Key'] for obj in objects_to_delete.get('Contents', [])]
        ]
        client.delete_objects(Bucket=Variable.get("s3_bucket"),
                              Delete=delete_keys)

    copy_immigration = PythonOperator(
        task_id='copy_immigration',
        python_callable=parquet_to_redshift,  # changed
        provide_context=True,
        op_kwargs={
            'table': "immigration",
            's3_bucket': Variable.get("s3_bucket"),
            's3_key': 'parquet',
            'iam_role': Variable.get('iam_role'),
            'sql_stmt': SqlQueries.copy_parquet_cmd,
            'redshift_conn_id': 'redshift'
        },
        dag=dag)

    copy_ports
    copy_visa
    copy_modes
    copy_addr
    copy_country_codes
    copy_airports
    copy_cities_demographics
    copy_immigration

    return dag
from postgres_check_operator import (
    PostgresMultiCheckOperator,
    COUNT_CHECK,
    GEO_CHECK,
)

from sql.wior import (
    DROP_COLS,
    SQL_DROP_TMP_TABLE,
    SQL_GEOM_VALIDATION,
    SQL_ADD_PK,
    SQL_SET_DATE_DATA_TYPES,
)

dag_id: str = "wior"
variables: Dict = Variable.get(dag_id, deserialize_json=True)
data_endpoint: Dict = variables["data_endpoints"]["wfs"]
tmp_dir: str = f"{SHARED_DIR}/{dag_id}"
data_file: str = f"{tmp_dir}/{dag_id}.geojson"
db_conn: DatabaseEngine = DatabaseEngine()
password: str = env("AIRFLOW_CONN_WIOR_PASSWD")
user: str = env("AIRFLOW_CONN_WIOR_USER")
base_url: str = URL(env("AIRFLOW_CONN_WIOR_BASE_URL"))
total_checks: list = []
count_checks: list = []
geo_checks: list = []
to_zone: Optional[tzinfo] = tz.gettz("Europe/Amsterdam")


class DataSourceError(Exception):
    """Custom exception for a data source that is not available."""
import pathlib
from pathlib import Path

p = os.path.abspath(
    str(pathlib.Path(__file__).parent.absolute()) + '/../../python/')
if p not in sys.path:
    sys.path.append(p)

from export_sequences_without_premsa import export_sequences
from store_premsa import store_premsa_file
from premsa_log_parse import mark_troubled
from mark_premsa_dupes import mark_premsa_dupes
from get_raw_duplicates import write_raw_duplicates
from mark_duplicates import mark_duplicates

WORKING_DIR = Variable.get("WORKING_DIR")
DATE_STRING = datetime.date.today().strftime('%Y-%m-%d')

default_args = {
    'owner': 'sweaver',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'params': {
        'working_dir': WORKING_DIR,
        'num_procs': 16,
        'python': "/data/shares/veg/SARS-CoV-2/SARS-CoV-2-devel/env/bin/python3",
        'hyphy': "/data/shares/veg/SARS-CoV-2/hyphy/hyphy",
        'hyphy_mpi': "/data/shares/veg/SARS-CoV-2/hyphy/HYPHYMPI",
def execute(self, context):
    if self.provide_context:
        context.update(self.op_kwargs)

    log_date = context['ti'].execution_date.strftime('%Y-%m-%d')
    if self.log_date_fun is not None:
        log_date = self.log_date_fun(context)
    if self.log_hour_fun is not None:
        self.log_hour = self.log_hour_fun(context)

    current_dir = os.getcwd()
    s3_bucket_with_env = tools.s3_name_generator(self.s3_bucket, "-json", "-staging")
    s3_conn_id_with_env = tools.s3_name_generator(self.s3_conn_id, "_json", "_staging")
    s3_key_path = "druid-json-template" + "/" + self.template
    self.download_template_file(s3_conn_id_with_env, s3_bucket_with_env,
                                s3_key_path, current_dir + "/json")

    druid_host, druid_port = Variable.get("druid_overlord").split(":")
    key, secret = GenericHook(s3_conn_id_with_env).get_credentials()
    druid = DruidAccess(druid_host, druid_port, "", "", self.data_source)
    s3 = S3Access(key, secret, False)

    if self.folder is None:
        self.folder = tools.s3_name_generator(self.prefix, "-prod", "-staging")

    s3_file_location = ""
    if self.aggregate == "DAILY":
        s3_file_location = "s3://{bucket}/{folder}/{topic}/{day_key}={log_date}/".format(
            bucket=s3_bucket_with_env,
            folder=self.folder,
            topic=self.topic,
            day_key=self.date_key,
            log_date=log_date)
    if self.aggregate == "HOURLY":
        s3_file_location = "s3://{bucket}/{folder}/{topic}/{day_key}={log_date}/{hour_key}={log_hour}/".format(
            bucket=s3_bucket_with_env,
            folder=self.folder,
            topic=self.topic,
            day_key=self.date_key,
            log_date=log_date,
            hour_key=self.hour_key,
            log_hour=self.log_hour)

    logging.info("Launching importer for %s.." % s3_file_location)
    s3_files = s3.get_filenames(s3_file_location)
    logging.info("Files Name " + ','.join(s3_files))

    running_tasks = []
    s3_files = ['"' + f + '"' for f in sorted(s3_files)]
    task = Task(s3_files)

    log_timestamp = log_date + ' ' + self.log_hour + ":00:00"
    log_timestamp_ts = (parser.parse(log_timestamp)).isoformat()
    next_log_timestamp_ts = (parser.parse(log_timestamp) +
                             datetime.timedelta(days=1)).isoformat()
    if self.aggregate == "HOURLY":
        next_log_timestamp_ts = (parser.parse(log_timestamp) +
                                 datetime.timedelta(hours=1)).isoformat()

    logging.info("Handling task %r" % task)
    task.id = druid.upload(log_timestamp_ts, next_log_timestamp_ts, s3_files,
                           self.template)
    logging.info("Uploading task to druid and task id is %r" % task.id)
    running_tasks.append(task)

    # Cleaning and waiting
    druid.clean_tasks(running_tasks)
    while len(running_tasks) >= self.slots:
        logging.info("Waiting for %r tasks" % len(running_tasks))
        time.sleep(10)
        running_tasks = druid.clean_tasks(running_tasks)

    while len(running_tasks) > 0:
        logging.info("Waiting for finalization of %r tasks" % len(running_tasks))
        time.sleep(10)
        running_tasks = druid.clean_tasks(running_tasks)

    logging.info("Importing done..")
def GenerateTestArgs(**kwargs):
    """Loads the configuration that will be used for this Iteration."""
    conf = kwargs['dag_run'].conf
    if conf is None:
        conf = dict()

    # Airflow gives the execution date when the job is supposed to be run;
    # however, we don't backfill and only need to run one build, therefore we
    # use the current date instead of the date that is passed in.
    # date = kwargs['execution_date']
    date = datetime.datetime.now()
    timestamp = time.mktime(date.timetuple())

    # Monthly releases started in Nov 2017 with 0.3.0, so minor is # of months
    # from Aug 2017.
    minor_version = (date.year - 2017) * 12 + (date.month - 1) - 7
    major_version = AirflowGetVariableOrBaseCase('major_version', 0)
    # This code gets information about the latest released version so we know
    # what version number to use for this round.
    r_minor = int(AirflowGetVariableOrBaseCase('released_version_minor', 0))
    r_patch = int(AirflowGetVariableOrBaseCase('released_version_patch', 0))
    # If we have already released a monthly for this month then bump
    # the patch number for the remainder of the month.
    if r_minor == minor_version:
        patch = r_patch + 1
    else:
        patch = 0
    # If the version is overridden then we should use it, otherwise we use its
    # default or monthly value.
    version = conf.get('VERSION')
    if monthly and not version:
        version = '{}.{}.{}'.format(major_version, minor_version, patch)

    default_conf = environment_config.get_airflow_config(
        version,
        timestamp,
        major=major_version,
        minor=minor_version,
        patch=patch,
        date=date.strftime('%Y%m%d'),
        rc=date.strftime('%H-%M'))
    config_settings = dict(VERSION=default_conf['VERSION'])
    config_settings_name = [
        'PROJECT_ID',
        'MFEST_URL',
        'MFEST_FILE',
        'GCS_STAGING_BUCKET',
        'SVC_ACCT',
        'GITHUB_ORG',
        'GITHUB_REPO',
        'GCS_GITHUB_PATH',
        'TOKEN_FILE',
        'GCR_STAGING_DEST',
        'GCR_RELEASE_DEST',
        'GCS_MONTHLY_RELEASE_PATH',
        'DOCKER_HUB',
        'GCS_BUILD_BUCKET',
        'RELEASE_PROJECT_ID',
    ]
    for name in config_settings_name:
        config_settings[name] = conf.get(name) or default_conf[name]

    if monthly:
        config_settings['MFEST_COMMIT'] = conf.get(
            'MFEST_COMMIT') or Variable.get('latest_sha')
        gcs_path = conf.get('GCS_MONTHLY_STAGE_PATH')
        if not gcs_path:
            gcs_path = default_conf['GCS_MONTHLY_STAGE_PATH']
    else:
        config_settings['MFEST_COMMIT'] = conf.get(
            'MFEST_COMMIT') or default_conf['MFEST_COMMIT']
        gcs_path = conf.get('GCS_DAILY_PATH') or default_conf['GCS_DAILY_PATH']

    config_settings['GCS_STAGING_PATH'] = gcs_path
    config_settings['GCS_BUILD_PATH'] = '{}/{}'.format(
        config_settings['GCS_BUILD_BUCKET'], gcs_path)
    config_settings['GCS_RELEASE_TOOLS_PATH'] = '{}/release-tools/{}'.format(
        config_settings['GCS_BUILD_BUCKET'], gcs_path)
    config_settings['GCS_FULL_STAGING_PATH'] = '{}/{}'.format(
        config_settings['GCS_STAGING_BUCKET'], gcs_path)
    config_settings['ISTIO_REPO'] = 'https://github.com/{}/{}.git'.format(
        config_settings['GITHUB_ORG'], config_settings['GITHUB_REPO'])

    return config_settings
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
from airflow.models import Variable

SRC = Variable.get("SRC")
#SRC='./'
COUNTRY = Variable.get("COUNTRY")
#COUNTRY='PL'

dag = DAG('project-workflow',
          description='Project Workflow DAG',
          schedule_interval='*/5 0 * * *',
          start_date=datetime(2017, 7, 1),
          catchup=False)

xlsx_to_csv_task = BashOperator(
    task_id='xlsx_to_csv',
    bash_command='"$src"/test.sh "$country" 2nd_param_xlsx',
    env={'src': SRC, 'country': COUNTRY},
    dag=dag)

merge_command = SRC + '/test.sh ' + COUNTRY + ' 2nd_param_merge'
merge_task = BashOperator(
    task_id='merge',
    bash_command=merge_command,
    dag=dag)

my_templated_command = """
{{ params.src }}/test.sh {{ params.country }} 2nd_param_cleansing
def test_variable_set_get_round_trip(self):
    Variable.set("tested_var_set_id", "Monday morning breakfast")
    assert "Monday morning breakfast" == Variable.get("tested_var_set_id")
def set_call(*args, **context):
    group = Variable.get('group')
    if group == 'night_shift':
        context['task_instance'].xcom_push(key='recipient', value='0011223344')
    else:
        context['task_instance'].xcom_push(key='recipient', value='0011223344')
# # create formatter and add it to the handlers
# formatter = logging.Formatter('%(asctime)s - %(process)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s')
# fh.setFormatter(formatter)
# ch.setFormatter(formatter)
# # add the handlers to the logger
# logger.addHandler(fh)
# logger.addHandler(ch)

import datetime

from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from openpyxl import Workbook

project_folder = Variable.get("project_folder")
order_output_folder = Variable.get("order_output_folder")
store_order_file = Variable.get("store_order_file_name")

default_args = {
    'owner': 'Carrefour',
    'start_date': datetime.datetime(2019, 8, 19),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'end_date': datetime.datetime(2030, 1, 1),
}

dag = DAG('get_sales',
def test_access_var():
    my_var = Variable.get("hsfjskdfjhk")
    print("my var message : {}".format(my_var))
    return "Access Var Success!"
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': False,
}

dag = airflow.DAG('dm_oride_passenger_base_multi_cube',
                  schedule_interval="45 00 * * *",
                  default_args=args)

##----------------------------------------- Variables ---------------------------------------##

db_name = "oride_dw"
table_name = "dm_oride_passenger_base_multi_cube"

##----------------------------------------- Dependencies ---------------------------------------##

# fetch the variable
code_map = eval(Variable.get("sys_flag"))

# check for ufile (CDH environment)
if code_map["id"].lower() == "ufile":
    # depends on the previous day's partition
    dwm_oride_passenger_order_base_di_prev_day_task = UFileSensor(
        task_id='dwm_oride_passenger_order_base_di_prev_day_task',
        filepath='{hdfs_path_str}/dt={pt}/_SUCCESS'.format(
            hdfs_path_str=
            "oride/oride_dw/dwm_oride_passenger_order_base_di/country_code=NG",
            pt='{{ds}}'),
        bucket_name='opay-datalake',
        poke_interval=60,  # when the dependency is not met, check its status once a minute
        dag=dag)

    # path
    hdfs_path = "ufile://opay-datalake/oride/oride_dw/" + table_name
# This DAG is configured to print the date and sleep for 5 seconds.
# However, it is configured to fail (see the expect_failure bash_command)
# and send an e-mail to your specified email on task failure.
from airflow import DAG
from airflow.models import Variable
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

YESTERDAY = datetime.combine(
    datetime.today() - timedelta(days=1), datetime.min.time())

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': YESTERDAY,
    'email': [Variable.get('email')],
    'email_on_failure': True,
    'email_on_retry': False,
    'retries': 0,
}

with DAG('hello_world_email_bonus', default_args=default_args) as dag:
    t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
    t2 = BashOperator(task_id='expect_failure', bash_command='exit 1', dag=dag)
    t1 >> t2
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

dag = DAG(
    "3_find_neighbours",
    default_args=default_args,
    description="Find neighbouring waves in a timespace",
    schedule_interval=timedelta(days=1),
)

rootdir = "/app/data"
filename = Variable.get("filename")
if filename == "all":
    files = [file for file in os.listdir(rootdir) if file.endswith(".tif")]
else:
    files = [filename]

tolerance_xy = Variable.get("tolerance_xy")
tolerance_t = Variable.get("tolerance_t")
intersect_threshold = Variable.get("intersection_threshold")

for file in files:
    filename = file
    directory = filename.split(".")[0]
    directory = process_task_name(directory)
from airflow import DAG
from datetime import datetime, timedelta
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.operators.dummy_operator import DummyOperator
from kubernetes.client import models as k8s
from airflow.utils.dates import days_ago
from airflow.models import Variable
from airflow.operators.http_operator import SimpleHttpOperator
import urllib.request
import json

default_args = {
    'owner': 'datagap'
}

basePath = Variable.get("permit_data_base_url")
templateUrl = Variable.get("permit_data_weekly_index_url")
permitDataSource = Variable.get("permit_datasource")


def downloadTemplate(templateUrl):
    request = urllib.request.urlopen(templateUrl)
    response = request.read().decode('utf-8')
    return response


def replace(jsonContent, dataSource, interval, basePath, date, market):
    result = json.loads(jsonContent)

    # base data source
    result['spec']['ioConfig']['inputSource']['dataSource'] = dataSource

    # ingest data url
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
    'queue': queue,
}

# initiate the DAG
dag = DAG(
    dag_name,
    default_args=default_args,
    description='Running multiple HEC-HMSs using Google Kubernetes Engine',
    schedule_interval=schedule_interval)

hec_config = Variable.get(hec_config_var, deserialize_json=True)

for i in range(parallel_runs):
    generate_run_id = PythonOperator(
        task_id='gen-run-id',
        python_callable=af_kube_utils.generate_random_run_id,
        op_args=[run_id_prefix],
        op_kwargs={"suffix": "%04d" % i},
        provide_context=True,
        dag=dag)

    logging.info('Initializing hec-hms pod')
    hec_pod = get_base_pod()
    hec_pod.metadata.name = 'kube-pod-{{ ti.xcom_pull(task_ids=\'gen-run-id\') }}'
    hec_pod.spec.containers[0].name = 'kube-cont-{{ ti.xcom_pull(task_ids=\'gen-run-id\') }}'
    StructType, StringType, DoubleType, IntegerType
)
from pyspark.sql.functions import udf, monotonically_increasing_id

from helpers import (
    practice_prescribing_schema, chemicals_schema, practices_schema,
    practice_size_schema, bnf_codes_schema
)

S3_STAGING = Variable.get('s3_output_bucket')
S3_RAW_DATA = Variable.get('s3_input_bucket')
aws_access_key_id = Variable.get('aws_access_key_id')
aws_secret_key = Variable.get('aws_secret_access_key')


class PreprocessToS3Operator(BaseOperator):
    ui_color = '#80BD9E'

    @apply_defaults
    def __init__(self,
                 schema="",
                 s3_bucket="",
                 s3_key="",
                 filename="",
def test_get_non_existing_var_should_not_deserialize_json_default(self):
    default_value = "}{ this is a non JSON default }{"
    assert default_value == Variable.get("thisIdDoesNotExist",
                                         default_var=default_value,
                                         deserialize_json=True)
""" Working with Variables. Doc: https://airflow.apache.org/concepts.html?highlight=variable#variables """ from airflow import DAG from airflow.operators.bash_operator import BashOperator from datetime import datetime from airflow.models import Variable text_variable = Variable.get("user") # Getting a JSON var doesn't work () #json_variable = Variable.get("json_var", deserialize_json = True) default_args = { 'start_date': datetime.now() } dag = DAG('varialbes', default_args=default_args) text_message = f"echo 'The user variable is {text_variable}'" #json_message = f"echo 'The json_var={json_variable}'" t1 = BashOperator( task_id='text_variable', bash_command=text_message, dag=dag)
def test_get_non_existing_var_should_return_default(self):
    default_value = "some default val"
    assert default_value == Variable.get("thisIdDoesNotExist",
                                         default_var=default_value)
def test_variable_set_get_round_trip_json(self):
    value = {"a": 17, "b": 47}
    Variable.set("tested_var_set_id", value, serialize_json=True)
    assert value == Variable.get("tested_var_set_id", deserialize_json=True)