    elif active_tdy and not active_ydy:
        result = 'NEW' if new_user else 'RESURRECTED'
    elif not active_tdy and not active_ydy:
        result = 'STALE'
    elif not active_tdy and active_ydy:
        result = 'CHURN'
    return result
$$ language plpythonu;
""",
    },
}

queries = []
for function, config in udfs.items():
    desc = config.get('desc', '')
    # Check for a missing query body *before* formatting; calling .format()
    # on None would raise AttributeError.
    query = config.get('query')
    if query is None:
        continue
    queries.append(query.format(function=function))

register_udfs = FBRedshiftOperator(
    task_id='register_udfs',
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql="{{ params.queries }}",
    params={
        'queries': "\n".join(queries),
    },
    dag=dag,
)
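# The branch chain above implements this mapping (the both-active case is
# handled by an earlier branch, outside this excerpt):
#
#   active_tdy  active_ydy  new_user   result
#   True        False       True       NEW
#   True        False       False      RESURRECTED
#   False       False       any        STALE
#   False       True        any        CHURN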
    table='course_assignment_sections',
    dag=dag,
)

create_course_assignment_sections = FBRedshiftOperator(
    task_id='create_course_assignment_sections',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.course_assignment_sections_historical (
        id NUMERIC(10,0),
        course_assignment_id NUMERIC(10,0),
        section_id NUMERIC(10,0),
        created_at TIMESTAMP WITHOUT TIME ZONE,
        updated_at TIMESTAMP WITHOUT TIME ZONE,
        as_of DATE
    )
    DISTSTYLE KEY
    DISTKEY (course_assignment_id)
    SORTKEY (as_of, course_assignment_id, section_id);
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_course_assignment_sections = FBHistoricalOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
    task_id='insert_course_assignment_sections',
    schema=STAGING_SCRAPES_SCHEMA,
    table='sections',
    dag=dag,
)

create_sections = FBRedshiftOperator(
    task_id='create_sections',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.sections_historical (
        id numeric(10,0),
        sis_id character varying(65535),
        name character varying(255),
        clever_id character varying(65535),
        site_id numeric(10,0),
        created_at timestamp without time zone,
        updated_at timestamp without time zone,
        sis_course_id numeric(10,0),
        academic_year numeric(10,0),
        create_source character varying(256),
        as_of date
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_sections = FBHistoricalOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
    task_id='insert_sections',
    view_name='sections',
    schema=STAGING_SCRAPES_SCHEMA,
    table='section_teachers',
    dag=dag,
)

create_section_teachers_historical = FBRedshiftOperator(
    task_id='create_section_teachers_historical',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.section_teachers_historical (
        id NUMERIC(10,0),
        section_id NUMERIC(10,0),
        teacher_id NUMERIC(10,0),
        created_at TIMESTAMP WITHOUT TIME ZONE,
        updated_at TIMESTAMP WITHOUT TIME ZONE,
        visibility VARCHAR(256),
        as_of DATE
    )
    DISTKEY (teacher_id)
    SORTKEY (as_of, teacher_id, section_id, visibility);
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

select_and_insert = FBHistoricalOperator(
    task_id='select_and_insert',
    view_name='section_teachers',
    select_sql="""
    dag=dag,
)

create_dim_sites = FBRedshiftOperator(
    task_id='create_dim_sites',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.dim_sites_historical (
        id numeric(10,0) NOT NULL,
        nces_site_id numeric(10,0),
        name character varying(255),
        enrollment_group character varying(256),
        district_id numeric(10,0),
        nces_district_id numeric(10,0),
        district_name character varying(65535),
        as_of date,
        status character varying(256),
        school_type character varying(65535),
        student_data_sharing character varying(256),
        teacher_sync_enabled bool,
        should_sync_clever bool,
        student_sync_enabled bool,
        section_sync_enabled bool
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_dim_sites = FBHistoricalOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
def get_scrape_subdag(table_name):
    dag = DAG(
        '%s.%s' % (PARENT_DAG_NAME, table_name),
        default_args=default_args,
        schedule_interval=SCHEDULE_INTERVAL,
    )

    copy_transaction = FBRedshiftOperator(
        task_id='copy_transaction',
        postgres_conn_id=REDSHIFT_CONN_ID,
        sql="""
        BEGIN;
        DROP TABLE IF EXISTS {{ params.schema }}."{{ params.table_name }}_{{ macros.ds_add(ds, -4) }}" CASCADE; -- CASCADE will DROP the VIEW, too
        DROP TABLE IF EXISTS {{ params.schema }}."{{ params.table_name }}_{{ ds }}" CASCADE;
        DROP VIEW IF EXISTS {{ params.schema }}."{{ params.table_name }}" CASCADE;

        -- Copy Table
        -- Step 0: Drop
        DROP TABLE IF EXISTS {{ params.schema }}."{{ params.table_name }}";
        -- Step 1: Create
        CREATE TABLE {{ params.schema }}."{{ params.table_name }}"
            (LIKE heroku_public."{{ params.table_name }}");
        -- Step 2: Copy
        INSERT INTO {{ params.schema }}."{{ params.table_name }}"
            SELECT * FROM heroku_public."{{ params.table_name }}";
        GRANT ALL PRIVILEGES ON {{ params.schema }}."{{ params.table_name }}" TO GROUP data_eng;
        ALTER TABLE {{ params.schema }}.{{ params.table_name }}
            RENAME TO "{{ params.table_name }}_{{ ds }}";
        CREATE VIEW {{ params.schema }}.{{ params.table_name }} AS
            (SELECT * FROM {{ params.schema }}."{{ params.table_name }}_{{ ds }}");
        GRANT ALL PRIVILEGES ON {{ params.schema }}.{{ params.table_name }} TO GROUP data_eng;
        COMMIT;
        """,
        params={
            'schema': STAGING_SCRAPES_WRITE_SCHEMA,
            'table_name': table_name,
        },
        dag=dag,
    )

    signal = FBWriteSignalOperator(
        conn_id=REDSHIFT_CONN_ID,
        task_id='write_signal',
        schema=STAGING_SCRAPES_WRITE_SCHEMA,
        table=table_name,
        dag=dag,
    )
    signal.set_upstream(copy_transaction)

    unload = FBRedshiftToS3Transfer(
        task_id='unload',
        schema=STAGING_SCRAPES_WRITE_SCHEMA,
        table='{{ params.table_name }}_{{ ds }}',
        s3_bucket='plp-data-lake',
        s3_key='scrapes-opt-prod-airflow/{{ params.table_name }}/as_of={{ ds }}/',
        redshift_conn_id=REDSHIFT_CONN_ID,
        unload_options=[
            'ALLOWOVERWRITE',
            'DELIMITER AS \',\'',
            'GZIP',
            'ESCAPE ADDQUOTES',
        ],
        params={'table_name': table_name},
        dag=dag,
    )
    unload.set_upstream(copy_transaction)

    return dag
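# A minimal sketch of attaching these per-table subdags to a parent DAG.
# SCRAPE_TABLES and main_dag are hypothetical names (not defined in this
# excerpt); main_dag's dag_id must equal PARENT_DAG_NAME so that the subdag
# id '%s.%s' % (PARENT_DAG_NAME, table_name) lines up with the task_id.
from airflow.operators.subdag_operator import SubDagOperator

for scrape_table in SCRAPE_TABLES:  # hypothetical, e.g. ['sections', 'users']
    SubDagOperator(
        task_id=scrape_table,
        subdag=get_scrape_subdag(scrape_table),
        dag=main_dag,
    )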
insert_know_do_masteries = FBRedshiftOperator(
    task_id='insert_know_do_masteries',
    postgres_conn_id=REDSHIFT_CONN_ID,
    dag=dag,
    sql="""
    BEGIN;
    DROP TABLE IF EXISTS {output_schema}.know_do_masteries;
    CREATE TABLE IF NOT EXISTS {output_schema}.know_do_masteries (
        id numeric(10,0) NOT NULL,
        student_id numeric(10,0),
        know_do_id numeric(10,0),
        mastery character varying(255),
        reason character varying(255),
        created_at timestamp without time zone,
        updated_at timestamp without time zone,
        created_by numeric(10,0),
        updated_by numeric(10,0)
    )
    DISTKEY (student_id)
    SORTKEY (created_at, student_id, know_do_id);
    INSERT INTO {output_schema}.know_do_masteries (
        SELECT
            id,
            student_id,
            know_do_id,
            mastery,
            reason,
            created_at,
            updated_at,
            created_by,
            updated_by
        FROM {input_schema}."know_do_masteries_{today}"
    );
    COMMIT;
    """.format(
        output_schema=DIM_AND_FCT_SCHEMA,
        input_schema=STAGING_SCRAPES_SCHEMA,
        today='{{ ds }}',
    ),
)
# set_upstream returns None, so don't chain it off the constructor --
# that would bind insert_know_do_masteries to None.
insert_know_do_masteries.set_upstream(wait_for_know_do_masteries)
    STAGING_SCRAPES_SCHEMA,
    STAGING_SCRAPES_WRITE_SCHEMA,
    'wild_west',
    'zendesk',
]

add_ddl = []
for group, users in permission_groups.items():
    for user in users:
        add_ddl.append("ALTER GROUP {group_name} ADD USER {user};".format(
            user=user,
            group_name=group,
        ))

add_members = FBRedshiftOperator(
    task_id='add_members',
    postgres_conn_id=REDSHIFT_ADMIN_CONN_ID,
    sql="\n".join(add_ddl),
    dag=dag,
)

perms_sql = []
for schema in schemas:
    for user in user_list:
        perms_sql.append("GRANT ALL ON SCHEMA {schema} TO {user};".format(
            schema=schema,
            user=user,
        ))
        # Use the loop variable in the template; the original hardcoded
        # TABLES, which made the FUNCTIONS iteration a no-op.
        for target in ('TABLES', 'FUNCTIONS'):
            perms_sql.append(
                """
                GRANT {perms} ON ALL {target} IN SCHEMA {schema} TO {user};
                """.format(
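# For illustration (hypothetical values): with
#   permission_groups = {'data_eng': ['alice', 'bob']}
# add_ddl renders to
#   ALTER GROUP data_eng ADD USER alice;
#   ALTER GROUP data_eng ADD USER bob;
# and, assuming the truncated .format() call fills perms with 'ALL', each
# (schema, user) pair contributes one schema-level grant plus one
# object-level grant per target:
#   GRANT ALL ON SCHEMA wild_west TO alice;
#   GRANT ALL ON ALL TABLES IN SCHEMA wild_west TO alice;
#   GRANT ALL ON ALL FUNCTIONS IN SCHEMA wild_west TO alice;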
    schema=STAGING_SCRAPES_SCHEMA,
    table='users',
    dag=dag,
)

# Bigint instead of integer in preparation for using larger IDs
create_dim_parents = FBRedshiftOperator(
    task_id='create_dim_parents',
    sql=[
        "BEGIN;",
        """
        CREATE TABLE IF NOT EXISTS {schema}.dim_parents_historical (
            id bigint NOT NULL,
            email character varying(1024),
            first_name character varying(1024),
            last_name character varying(1024),
            as_of date NOT NULL
        )
        DISTKEY (id)
        SORTKEY (as_of, id);
        """.format(schema=DIM_AND_FCT_SCHEMA),
        "COMMIT;",
    ],
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_dim_parents = FBHistoricalOperator(
    task_id='insert_dim_parents',
    view_name='dim_parents',
    select_sql="""
    SELECT
insert_assessment_takes = FBRedshiftOperator(
    task_id='insert_assessment_takes',
    postgres_conn_id=REDSHIFT_CONN_ID,
    dag=dag,
    sql="""
    BEGIN;
    DROP TABLE IF EXISTS {output_schema}.assessment_takes;
    CREATE TABLE IF NOT EXISTS {output_schema}.assessment_takes (
        id NUMERIC(10,0),
        student_id NUMERIC(10,0),
        num_correct NUMERIC(10,0),
        created_at TIMESTAMP WITHOUT TIME ZONE,
        updated_at TIMESTAMP WITHOUT TIME ZONE,
        know_do_id NUMERIC(10,0),
        num_possible NUMERIC(10,0),
        taken_at TIMESTAMP WITHOUT TIME ZONE,
        approved_by_id NUMERIC(10,0),
        assessment_id NUMERIC(10,0),
        approved_time TIMESTAMP WITHOUT TIME ZONE,
        start_time TIMESTAMP WITHOUT TIME ZONE,
        is_content_assessment VARCHAR(5),
        created_by NUMERIC(10,0),
        updated_by NUMERIC(10,0),
        ell_locale VARCHAR(65535),
        objective_level_stats VARCHAR(65535),
        additional_time_given NUMERIC(10,0),
        academic_mindset_popup VARCHAR(5),
        student_signature VARCHAR(65535),
        time_zone VARCHAR(65535),
        ip_for_request VARCHAR(65535),
        ip_for_start VARCHAR(65535),
        ip_for_finish VARCHAR(65535),
        session_id_for_request VARCHAR(65535),
        session_id_for_start VARCHAR(65535),
        session_id_for_finish VARCHAR(65535),
        browser_id_for_request VARCHAR(65535),
        browser_id_for_start VARCHAR(65535),
        browser_id_for_finish VARCHAR(65535),
        invalidation_description VARCHAR(65535),
        visibility VARCHAR(256),
        invalidation_code VARCHAR(256),
        pcnt_to_pass NUMERIC(3,2),
        passed BOOLEAN
    )
    DISTKEY (student_id)
    SORTKEY (created_at, student_id, know_do_id);
    INSERT INTO {output_schema}.assessment_takes (
        SELECT
            ats.id,
            ats.student_id,
            ats.num_correct,
            ats.created_at,
            ats.updated_at,
            ats.know_do_id,
            ats.num_possible,
            ats.taken_at,
            ats.approved_by_id,
            ats.assessment_id,
            ats.approved_time,
            ats.start_time,
            ats.is_content_assessment,
            ats.created_by,
            ats.updated_by,
            ats.ell_locale,
            ats.objective_level_stats,
            ats.additional_time_given,
            ats.academic_mindset_popup,
            ats.student_signature,
            ats.time_zone,
            ats.ip_for_request,
            ats.ip_for_start,
            ats.ip_for_finish,
            ats.session_id_for_request,
            ats.session_id_for_start,
            ats.session_id_for_finish,
            ats.browser_id_for_request,
            ats.browser_id_for_start,
            ats.browser_id_for_finish,
            ats.invalidation_description,
            enum_name_for_value('visibility', ats.visibility,
                                'assessment_takes', 'AssessmentTake'),
            enum_name_for_value('invalidation_code', ats.invalidation_code,
                                'assessment_takes', 'AssessmentTake'),
            kd.pcnt_to_pass,
            (100 * ats.num_correct / CAST(ats.num_possible AS float))
                >= CAST(ROUND(kd.pcnt_to_pass * 100, 0) AS int)
        FROM {input_schema}."assessment_takes_{today}" ats
        LEFT JOIN {input_schema}."know_dos_{today}" kd
            ON (ats.know_do_id = kd.id)
    );
    COMMIT;
    """.format(
        output_schema=DIM_AND_FCT_SCHEMA,
        input_schema=STAGING_SCRAPES_SCHEMA,
        today='{{ ds }}',
    ),
)
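# Worked example of the `passed` expression above, with illustrative numbers:
# num_correct = 8, num_possible = 10, pcnt_to_pass = 0.70 gives
# 100 * 8 / 10.0 = 80.0 on the left and round(0.70 * 100) = 70 on the right,
# so 80.0 >= 70 and the take passes. Comparing against a rounded integer
# percent sidesteps float-precision artifacts in pcnt_to_pass, which is
# stored as NUMERIC(3,2).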
]

activity_cols = ['controller', 'action', 'status']

params = {
    'schema': DIM_AND_FCT_SCHEMA,
    'daily': 'engagement_daily',
    'datelist': 'engagement_datelist',
    'visitation': 'engagement_visitation',
    'datelist_max': 2**30,
}

create = FBRedshiftOperator(
    dag=dag,
    params=params,
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql=creates.values(),
    task_id='creates',
)

daily = FBRedshiftOperator(
    dag=dag,
    params=params,
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql=[
        'BEGIN;',
        def_delete("{{ params.schema }}.{{ params.daily }}"),
        """
        INSERT INTO {{ params.schema }}.{{ params.daily }}
        SELECT
            timestamp::date AS as_of,
            acting_user_id,
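# A sketch of the datelist convention this pipeline appears to use (an
# assumption -- the convention is not spelled out in this excerpt): bit i of
# the engagement_datelist integer records activity i days ago, and
# datelist_max = 2**30 bounds the history window. Shifting left by one each
# day ages every bit; OR-ing in 1 marks activity today.
def was_active_days_ago(datelist, days_ago):
    """True if the datelist bitmask has the bit for `days_ago` set."""
    return bool((datelist >> days_ago) & 1)

# e.g. active today and two days ago -> datelist == 0b101
assert was_active_days_ago(0b101, 0) and was_active_days_ago(0b101, 2)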
create_agg_perf = FBRedshiftOperator(
    task_id='create_agg_perf',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.agg_perf_historical (
        as_of date NOT NULL,
        controller varchar(256),
        action varchar(256),
        release_version varchar(256),
        release_created_at varchar(256),
        duration_p10 float8,
        duration_p25 float8,
        duration_p50 float8,
        duration_p75 float8,
        duration_p90 float8,
        duration_p95 float8,
        duration_cnt int8,
        db_p10 float8,
        db_p25 float8,
        db_p50 float8,
        db_p75 float8,
        db_p90 float8,
        db_p95 float8,
        db_cnt int8,
        response_size_p10 float8,
        response_size_p25 float8,
        response_size_p50 float8,
        response_size_p75 float8,
        response_size_p90 float8,
        response_size_p95 float8,
        response_size_cnt int8
    )
    SORTKEY (as_of, controller, action, release_version);
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)
)

create_dim_teachers = FBRedshiftOperator(
    task_id='create_dim_teachers',
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.dim_teachers_historical (
        id integer,
        email character varying(1020),
        first_name character varying(1020),
        last_name character varying(1020),
        site_id integer,
        verified_teacher boolean,
        visibility character varying(256),
        site_name character varying(1020),
        district_id integer,
        site_min_grade_level integer,
        site_max_grade_level integer,
        nces_site_id integer,
        site_enrollment_group character varying(256),
        district_name character varying(256),
        nces_district_id integer,
        as_of date,
        created_at timestamp without time zone
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
)

insert_dim_teachers = FBHistoricalOperator(
    'redshift_enum_udf',
    default_args=default_args,
    schedule_interval=DEFAULT_SCHEDULE_INTERVAL,
)

table_name = '{}.enums_translations'.format(STAGING_SCRAPES_WRITE_SCHEMA)

create_enums_translations = FBRedshiftOperator(
    task_id='create_enums_translations',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {table_name} (
        table_name VARCHAR(256),
        class_name VARCHAR(256),
        "column" VARCHAR(256),
        key NUMERIC(10,0),
        value VARCHAR(256)
    )
    SORTKEY (table_name, class_name, "column", key);
    COMMIT;
    """.format(table_name=table_name),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

s3_key = '//plp-data-lake/enums/translations_{{ ds }}.csv.gz'

load_enums_translations = FBS3ToRedshiftOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
    task_id='load_enums_translations',
    table=table_name,
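# Illustrative only -- the UDF body is not shown in this excerpt: a row such as
#   ('assessment_takes', 'AssessmentTake', 'visibility', 0, 'visible')
# is presumably what lets enum_name_for_value('visibility', 0,
# 'assessment_takes', 'AssessmentTake') resolve the numeric enum key 0 to
# the name 'visible' when the fact tables are populated.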
    'depends_on_past': False,
    'start_date': datetime(2017, 2, 3),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    # In case the warehouse is down, we don't want to dogpile the airflow
    # server with minute-long jobs. Add an execution timeout to prevent
    # going over one minute.
    'execution_timeout': timedelta(seconds=30),
}

dag = DAG(
    'redshift_availability_minutely',
    default_args=default_args,
    schedule_interval='* * * * *',
)

# Note: we use a different user because the "airflow" user is marked with
# the etl_users GROUP and thus uses the etl queue. To track the availability
# of the normal query queue we use the "airflow_nonetl" user.
availability_check = FBRedshiftOperator(
    task_id='availability_check',
    sql="""-- Availability check query
    SELECT * FROM staging_scrapes.sites LIMIT 5;
    """,
    postgres_conn_id=REDSHIFT_NON_ETL_CONN_ID,
    dag=dag,
)
create_course_assignments = FBRedshiftOperator(
    task_id='create_course_assignments',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.course_assignments_historical (
        id integer NOT NULL,
        course_id integer,
        student_id integer,
        created_at timestamp without time zone,
        updated_at timestamp without time zone,
        project_score integer,
        raw_cog_skill_score double precision,
        cog_skill_pcnt integer,
        num_projects_graded integer,
        num_projects_overdue integer,
        num_projects_turned_in integer,
        num_projects_total integer,
        num_projects_ungraded integer,
        num_projects_exempted integer,
        manually_graded boolean,
        overall_score integer,
        letter_grade character varying(1020),
        power_num_mastered integer,
        power_out_of integer,
        power_expected_pcnt integer,
        power_on_track boolean,
        addl_num_mastered integer,
        addl_out_of integer,
        addl_expected_pcnt integer,
        power_expected double precision,
        addl_expected double precision,
        target_letter_grade character varying(1020),
        power_num_behind integer,
        overall_score_changed_at timestamp without time zone,
        start_date date,
        end_date date,
        raw_concept_score double precision,
        concept_pcnt integer,
        site_id integer,
        visibility character varying(256),
        as_of date
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)