Ejemplo n.º 1
0
                elif (active_tdy and not active_ydy):
                    result = 'NEW' if new_user else 'RESURRECTED'
                elif (not active_tdy and not active_ydy):
                    result = 'STALE'
                elif (not active_tdy and active_ydy):
                    result = 'CHURN'

                return result
            $$ language plpythonu;
        """,
    },
}

# Render the CREATE FUNCTION SQL for each configured UDF, skipping any
# entry that does not define a 'query'.
#
# Fix: the None check must happen BEFORE .format() is called.  The
# original code did `config.get('query', None).format(...)`, which
# raises AttributeError when 'query' is missing and made the
# `if query is None` guard unreachable.
queries = []
for function, config in udfs.items():
    desc = config.get('desc', '')  # informational only; not used below
    query = config.get('query')
    if query is None:
        continue
    queries.append(query.format(function=function))

# Register every UDF in a single task: the rendered CREATE FUNCTION
# statements are joined into one SQL string and delivered through a
# Jinja param so the operator executes them together.
register_udfs = FBRedshiftOperator(
    task_id='register_udfs',
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql="{{ params.queries }}",
    params={
        'queries': "\n".join(queries),
    },
    dag=dag,
)
Ejemplo n.º 2
0
    table='course_assignment_sections',
    dag=dag,
)

# Idempotently create the historical course_assignment_sections table.
# DISTKEY on course_assignment_id co-locates rows for joins on that
# column; the SORTKEY leads with as_of for date-partitioned scans.
create_course_assignment_sections = FBRedshiftOperator(
    task_id='create_course_assignment_sections',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.course_assignment_sections_historical (
         id NUMERIC(10,0), 
         course_assignment_id NUMERIC(10,0),
         section_id NUMERIC(10,0),
         created_at TIMESTAMP WITHOUT TIME ZONE,
         updated_at TIMESTAMP WITHOUT TIME ZONE,
         as_of DATE
    )
         DISTSTYLE KEY
         DISTKEY (course_assignment_id)
         SORTKEY (as_of
                 ,course_assignment_id
                 ,section_id
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_course_assignment_sections = FBHistoricalOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
    task_id='insert_course_assignment_sections',
Ejemplo n.º 3
0
    schema=STAGING_SCRAPES_SCHEMA,
    table='sections',
    dag=dag,
)

# Idempotently create the historical sections table (no dist/sort keys).
# NOTE(review): column `create_at` looks like a typo for `created_at`,
# but it presumably mirrors the upstream scrape schema — confirm before
# renaming, since downstream SELECTs would need to change too.
create_sections = FBRedshiftOperator(
    task_id='create_sections',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.sections_historical (
        id numeric(10,0),
        sis_id character varying(65535),
        name character varying(255),
        clever_id character varying(65535),
        site_id numeric(10,0),
        create_at timestamp without time zone,
        updated_at timestamp without time zone,
        sis_course_id numeric(10,0),
        academic_year numeric(10,0),
        create_source character varying(256),
        as_of date
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_sections = FBHistoricalOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
    task_id='insert_sections',
    view_name='sections',
Ejemplo n.º 4
0
    schema=STAGING_SCRAPES_SCHEMA,
    table='section_teachers',
    dag=dag,
)

# Idempotently create the historical section_teachers table, distributed
# by teacher_id and sorted for as_of/teacher-centric queries.
create_section_teachers_historical = FBRedshiftOperator(
    task_id='create_section_teachers_historical',
    sql="""
    BEGIN;

    CREATE TABLE IF NOT EXISTS {schema}.section_teachers_historical (
        id NUMERIC(10,0),
        section_id NUMERIC(10,0),
        teacher_id NUMERIC(10,0),
        created_at TIMESTAMP WITHOUT TIME ZONE,
        updated_at TIMESTAMP WITHOUT TIME ZONE,
        visibility VARCHAR(256),
        as_of DATE
    )
    DISTKEY (teacher_id)
    SORTKEY (as_of, teacher_id, section_id, visibility);

    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

select_and_insert = FBHistoricalOperator(
    task_id='select_and_insert',
    view_name='section_teachers',
    select_sql="""
Ejemplo n.º 5
0
    dag=dag,
)

# Idempotently create the historical dim_sites dimension table, which
# denormalizes district attributes alongside each site snapshot.
create_dim_sites = FBRedshiftOperator(
    task_id='create_dim_sites',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.dim_sites_historical (
        id numeric(10,0) NOT NULL,
        nces_site_id numeric(10,0),
        name character varying(255),
        enrollment_group character varying(256),
        district_id numeric(10,0),
        nces_district_id numeric(10,0),
        district_name character varying(65535),
        as_of date,
        status character varying(256),
        school_type character varying(65535),
        student_data_sharing character varying(256),
        teacher_sync_enabled bool,
        should_sync_clever bool,
        student_sync_enabled bool,
        section_sync_enabled bool
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_dim_sites = FBHistoricalOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
Ejemplo n.º 6
0
def get_scrape_subdag(table_name):
    """Build the per-table scrape sub-DAG.

    For one source table this creates three tasks:
      1. copy_transaction — snapshot heroku_public.<table> into a dated
         staging table and repoint a stable VIEW at it (also prunes the
         snapshot from 4 days ago).
      2. write_signal     — publish a readiness signal after the copy.
      3. unload           — export the dated snapshot to S3.

    Returns the constructed DAG.
    """
    # Child DAG named "<parent>.<table>" per Airflow's subdag convention.
    dag = DAG(
        '%s.%s' % (PARENT_DAG_NAME, table_name),
        default_args=default_args,        
        schedule_interval=SCHEDULE_INTERVAL,
    )
    # Single transaction: drop stale snapshot + view, copy the source
    # table into a fresh dated table, and recreate the stable view.
    copy_transaction = FBRedshiftOperator(
        task_id='copy_transaction',
        postgres_conn_id=REDSHIFT_CONN_ID,
        sql="""
            BEGIN;

            DROP TABLE IF EXISTS
            {{ params.schema }}."{{ params.table_name }}_{{ macros.ds_add(ds, -4) }}"
            CASCADE; -- CASCADE will DROP the VIEW, too
            DROP TABLE IF EXISTS {{ params.schema }}."{{ params.table_name }}_{{ ds }}" CASCADE;
            DROP VIEW IF EXISTS {{ params.schema }}."{{ params.table_name }}" CASCADE;            

            -- Copy Table
            -- Step 0: Drop
            DROP TABLE IF EXISTS {{ params.schema }}."{{ params.table_name }}";
            -- Step 1: Create
            CREATE TABLE {{ params.schema }}."{{ params.table_name }}"
            (LIKE heroku_public."{{ params.table_name }}");
            -- Step 2: Copy
            INSERT INTO {{ params.schema }}."{{ params.table_name }}"
            SELECT * FROM heroku_public."{{ params.table_name }}";

            GRANT ALL PRIVILEGES ON {{ params.schema }}."{{ params.table_name }}"
            TO GROUP data_eng;

            ALTER TABLE {{ params.schema }}.{{ params.table_name }}
            RENAME TO "{{ params.table_name }}_{{ ds }}";

            CREATE VIEW {{ params.schema }}.{{ params.table_name }} AS
            (SELECT * FROM {{ params.schema }}."{{ params.table_name }}_{{ ds }}");

            GRANT ALL PRIVILEGES ON {{ params.schema }}.{{ params.table_name }}
            TO GROUP data_eng;

            COMMIT;
        """,
        params={
          'schema': STAGING_SCRAPES_WRITE_SCHEMA,
          'table_name': table_name,
        },
        dag=dag,
    )

    # Signal downstream consumers that today's snapshot is ready.
    signal = FBWriteSignalOperator(
        conn_id=REDSHIFT_CONN_ID,
        task_id='write_signal',
        schema=STAGING_SCRAPES_WRITE_SCHEMA,
        table=table_name,
        dag=dag,
    )
    signal.set_upstream(copy_transaction)

    # Export the dated snapshot to the data lake, partitioned by as_of.
    unload = FBRedshiftToS3Transfer(
        task_id='unload',
        schema=STAGING_SCRAPES_WRITE_SCHEMA,
        table='{{ params.table_name }}_{{ ds }}',
        s3_bucket='plp-data-lake',
        s3_key='scrapes-opt-prod-airflow/{{ params.table_name }}/as_of={{ ds }}/',
        redshift_conn_id=REDSHIFT_CONN_ID,
        unload_options=[
            'ALLOWOVERWRITE',
            'DELIMITER AS \',\'',
            'GZIP',
            'ESCAPE ADDQUOTES',
        ],
        params={'table_name': table_name},
        dag=dag,
    )
    unload.set_upstream(copy_transaction)

    return dag
Ejemplo n.º 7
0
# Rebuild {output_schema}.know_do_masteries from today's scrape in one
# transaction: drop, recreate with student-centric dist/sort keys, then
# bulk-insert from the dated staging table ({{ ds }} is rendered by
# Jinja after .format()).
insert_know_do_masteries = FBRedshiftOperator(
    task_id='insert_know_do_masteries',
    postgres_conn_id=REDSHIFT_CONN_ID,
    dag=dag,
    sql="""
    BEGIN;
    DROP TABLE IF EXISTS {output_schema}.know_do_masteries;
    
    CREATE TABLE IF NOT EXISTS {output_schema}.know_do_masteries (
        id numeric(10,0) NOT NULL,
        student_id numeric(10,0),
        know_do_id numeric(10,0),
        mastery character varying(255),
        reason character varying(255),
        created_at timestamp without time zone,
        updated_at timestamp without time zone,
        created_by numeric(10,0),
        updated_by numeric(10,0)
    ) 
    DISTKEY (student_id)
    SORTKEY (created_at, student_id, know_do_id);

    INSERT INTO {output_schema}.know_do_masteries (
        SELECT 
            id,
            student_id,
            know_do_id,
            mastery,
            reason,
            created_at,
            updated_at,
            created_by,
            updated_by
        FROM {input_schema}."know_do_masteries_{today}"
    );
    COMMIT;
    """.format(output_schema=DIM_AND_FCT_SCHEMA,
               input_schema=STAGING_SCRAPES_SCHEMA,
               today='{{ ds }}'),
)
# Fix: the original chained `.set_upstream(...)` onto the constructor,
# which bound insert_know_do_masteries to None (set_upstream returns
# None).  Assign the operator first, then wire the dependency.
insert_know_do_masteries.set_upstream(wait_for_know_do_masteries)
Ejemplo n.º 8
0
    STAGING_SCRAPES_SCHEMA,
    STAGING_SCRAPES_WRITE_SCHEMA,
    'wild_west',
    'zendesk',
]

# One ALTER GROUP ... ADD USER statement for every (group, member) pair
# declared in permission_groups.
add_ddl = [
    "ALTER GROUP {group_name} ADD USER {user};".format(
        user=member,
        group_name=group_name,
    )
    for group_name, members in permission_groups.items()
    for member in members
]
# Apply all group-membership DDL in one task, using the admin connection
# since ALTER GROUP requires elevated privileges.
add_members = FBRedshiftOperator(
    task_id='add_members',
    postgres_conn_id=REDSHIFT_ADMIN_CONN_ID,
    sql="\n".join(add_ddl),
    dag=dag,
)

perms_sql = []
for schema in schemas:
    for user in user_list:
        perms_sql.append("GRANT ALL ON SCHEMA {schema} TO {user};".format(
            schema=schema,
            user=user,
        ))
        for target in ('TABLES', 'FUNCTIONS'):
            perms_sql.append(
                """
                    GRANT {perms} ON ALL TABLES IN SCHEMA {schema} TO {user};
                """.format(
Ejemplo n.º 9
0
    schema=STAGING_SCRAPES_SCHEMA,
    table='users',
    dag=dag,
)

# Bigint instead of integer in preparation for using larger IDs
# Idempotently create the historical dim_parents table.  The sql is a
# list so BEGIN/CREATE/COMMIT run as separate statements.
create_dim_parents = FBRedshiftOperator(
    task_id='create_dim_parents',
    sql=[
        "BEGIN;",
        """
        CREATE TABLE IF NOT EXISTS {schema}.dim_parents_historical (
            id bigint NOT NULL,
            email character varying(1024),
            first_name character varying(1024),
            last_name character varying(1024),
            as_of date NOT NULL
        )
        DISTKEY (id)
        SORTKEY (as_of, id);
        """.format(schema=DIM_AND_FCT_SCHEMA),
        "COMMIT;",
    ],
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

insert_dim_parents = FBHistoricalOperator(
    task_id='insert_dim_parents',
    view_name='dim_parents',
    select_sql="""
        SELECT
Ejemplo n.º 10
0
# Rebuild {output_schema}.assessment_takes from today's scrape in one
# transaction.  Besides a straight column copy, the INSERT derives:
#   - visibility / invalidation_code: translated from numeric enums via
#     the enum_name_for_value UDF;
#   - pcnt_to_pass: joined in from the know_dos snapshot;
#   - passed: computed by comparing the percent-correct score against
#     the rounded pass threshold (LEFT JOIN, so NULL when no know_do).
# {{ ds }} survives .format() and is rendered later by Jinja.
insert_assessment_takes = FBRedshiftOperator(
    task_id='insert_assessment_takes',
    postgres_conn_id=REDSHIFT_CONN_ID,
    dag=dag,
    sql="""
    BEGIN;
    DROP TABLE IF EXISTS {output_schema}.assessment_takes;
    
    CREATE TABLE IF NOT EXISTS {output_schema}.assessment_takes (
        id NUMERIC(10,0),
        student_id NUMERIC(10,0),
        num_correct NUMERIC(10,0),
        created_at TIMESTAMP WITHOUT TIME ZONE,
        updated_at TIMESTAMP WITHOUT TIME ZONE,
        know_do_id NUMERIC(10,0),
        num_possible NUMERIC(10,0),
        taken_at TIMESTAMP WITHOUT TIME ZONE,
        approved_by_id NUMERIC(10,0),
        assessment_id NUMERIC(10,0),
        approved_time TIMESTAMP WITHOUT TIME ZONE,
        start_time TIMESTAMP WITHOUT TIME ZONE,
        is_content_assessment VARCHAR(5),
        created_by NUMERIC(10,0),
        updated_by NUMERIC(10,0),
        ell_locale VARCHAR(65535),
        objective_level_stats VARCHAR(65535),
        additional_time_given NUMERIC(10,0),
        academic_mindset_popup VARCHAR(5),
        student_signature VARCHAR(65535),
        time_zone VARCHAR(65535),
        ip_for_request VARCHAR(65535),
        ip_for_start VARCHAR(65535),
        ip_for_finish VARCHAR(65535),
        session_id_for_request VARCHAR(65535),
        session_id_for_start VARCHAR(65535),
        session_id_for_finish VARCHAR(65535),
        browser_id_for_request VARCHAR(65535),
        browser_id_for_start VARCHAR(65535),
        browser_id_for_finish VARCHAR(65535),
        invalidation_description VARCHAR(65535),
        visibility VARCHAR(256),
        invalidation_code VARCHAR(256),
        pcnt_to_pass NUMERIC(3,2),
        passed BOOLEAN
    )
    DISTKEY (student_id)
    SORTKEY (created_at, student_id, know_do_id);

    INSERT INTO {output_schema}.assessment_takes (
        SELECT
            ats.id,
            ats.student_id,
            ats.num_correct,
            ats.created_at,
            ats.updated_at,
            ats.know_do_id,
            ats.num_possible,
            ats.taken_at,
            ats.approved_by_id,
            ats.assessment_id,
            ats.approved_time,
            ats.start_time,
            ats.is_content_assessment,
            ats.created_by,
            ats.updated_by,
            ats.ell_locale,
            ats.objective_level_stats,
            ats.additional_time_given,
            ats.academic_mindset_popup,
            ats.student_signature,
            ats.time_zone,
            ats.ip_for_request,
            ats.ip_for_start,
            ats.ip_for_finish,
            ats.session_id_for_request,
            ats.session_id_for_start,
            ats.session_id_for_finish,
            ats.browser_id_for_request,
            ats.browser_id_for_start,
            ats.browser_id_for_finish,
            ats.invalidation_description,
            enum_name_for_value('visibility', ats.visibility, 'assessment_takes', 'AssessmentTake'),
            enum_name_for_value('invalidation_code', ats.invalidation_code, 'assessment_takes', 'AssessmentTake'),
            kd.pcnt_to_pass,
            (100 * ats.num_correct / cast(ats.num_possible as float)) >= cast(round(kd.pcnt_to_pass * 100, 0) as int)
        FROM {input_schema}."assessment_takes_{today}" ats
        LEFT JOIN {input_schema}."know_dos_{today}" kd
        ON (ats.know_do_id = kd.id)
    );
    COMMIT;
    """.format(
        output_schema=DIM_AND_FCT_SCHEMA,
        input_schema=STAGING_SCRAPES_SCHEMA,
        today='{{ ds }}',
    ))
Ejemplo n.º 11
0
]

# Dimensions used to classify activity rows in the engagement queries.
activity_cols = ['controller', 'action', 'status']

# Shared Jinja params for the engagement tasks: table names plus the
# datelist bitfield capacity (2**30 distinct day slots).
params = {
    'schema': DIM_AND_FCT_SCHEMA,
    'daily': 'engagement_daily',
    'datelist': 'engagement_datelist',
    'visitation': 'engagement_visitation',
    'datelist_max': 2**30,
}

# Run every CREATE statement collected in `creates` (defined above,
# out of view here) as one task.
create = FBRedshiftOperator(
    dag=dag,
    params=params,
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql=creates.values(),
    task_id='creates',
)
daily = FBRedshiftOperator(
    dag=dag,
    params=params,
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql=[
        'BEGIN;',
        def_delete("{{ params.schema }}.{{ params.daily }}"),
        """
        INSERT INTO {{ params.schema }}.{{ params.daily }}
        SELECT
            timestamp::date AS as_of,
            acting_user_id,
Ejemplo n.º 12
0
# Idempotently create the historical request-performance aggregate
# table: per (day, controller, action, release) percentile stats for
# request duration, DB time, and response size.
create_agg_perf = FBRedshiftOperator(
    task_id='create_agg_perf',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.agg_perf_historical (
        as_of date NOT NULL,
        controller varchar(256),
        action varchar(256),
        release_version varchar(256),
        release_created_at varchar(256),
        duration_p10 float8,
        duration_p25 float8,
        duration_p50 float8,
        duration_p75 float8,
        duration_p90 float8,
        duration_p95 float8,
        duration_cnt int8,
        db_p10 float8,
        db_p25 float8,
        db_p50 float8,
        db_p75 float8,
        db_p90 float8,
        db_p95 float8,
        db_cnt int8,
        response_size_p10 float8,
        response_size_p25 float8,
        response_size_p50 float8,
        response_size_p75 float8,
        response_size_p90 float8,
        response_size_p95 float8,
        response_size_cnt int8
    )
    SORTKEY (as_of, controller, action, release_version);
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)
Ejemplo n.º 13
0
)

# Idempotently create the historical dim_teachers dimension table,
# denormalizing site and district attributes onto each teacher snapshot.
create_dim_teachers = FBRedshiftOperator(
    task_id='create_dim_teachers',
    postgres_conn_id=REDSHIFT_CONN_ID,
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.dim_teachers_historical (
        id integer,
        email character varying(1020),
        first_name character varying(1020),
        last_name character varying(1020),
        site_id integer,
        verified_teacher boolean,
        visibility character varying(256),
        site_name character varying(1020),
        district_id integer,
        site_min_grade_level integer,
        site_max_grade_level integer,
        nces_site_id integer,
        site_enrollment_group character varying(256),
        district_name character varying(256),
        nces_district_id integer,
        as_of date,
        created_at timestamp without time zone
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
)

insert_dim_teachers = FBHistoricalOperator(
Ejemplo n.º 14
0
    'redshift_enum_udf',
    default_args=default_args,
    schedule_interval=DEFAULT_SCHEDULE_INTERVAL,
)

# Fully-qualified target table for the enum translation mapping.
table_name = '{}.enums_translations'.format(STAGING_SCRAPES_WRITE_SCHEMA)

# Idempotently create the (table, class, column, key) -> value mapping
# table used by the enum-translation UDF.  "column" and key are quoted
# because they are reserved words in Redshift.
create_enums_translations = FBRedshiftOperator(
    task_id='create_enums_translations',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {table_name} (
        table_name VARCHAR(256),
        class_name VARCHAR(256),
        "column" VARCHAR(256),
        key NUMERIC(10,0),
        value VARCHAR(256)
    )
    SORTKEY (table_name, class_name, "column", key);
    COMMIT;
    """.format(table_name=table_name),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)

# Source object for the daily load; {{ ds }} is rendered by Jinja.
# NOTE(review): the leading '//' looks odd for an S3 key — presumably
# the loader tolerates or strips it; confirm against FBS3ToRedshiftOperator.
s3_key = '//plp-data-lake/enums/translations_{{ ds }}.csv.gz'

load_enums_translations = FBS3ToRedshiftOperator(
    redshift_conn_id=REDSHIFT_CONN_ID,
    task_id='load_enums_translations',
    table=table_name,
Ejemplo n.º 15
0
    'depends_on_past': False,
    'start_date': datetime(2017, 2, 3),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    # In case the warehouse is down, we don't want to dogpile
    # the airflow server with minute long jobs. Add an execution
    # timeout to prevent going over one minute.
    'execution_timeout': timedelta(seconds=30),
}

# Minutely heartbeat DAG: one trivial query per minute to measure
# Redshift query-queue availability.
dag = DAG(
    'redshift_availability_minutely',
    default_args=default_args,
    schedule_interval='* * * * *',
)

# Note: We use a different user because
# the "airflow" user is marked with the etl_users GROUP
# and thus uses the etl queue. To track the availability
# of the normal query queue we use the "airflow_nonetl" user.
availability_check = FBRedshiftOperator(
    task_id='availability_check',
    sql="""-- Availability check query
        SELECT * FROM staging_scrapes.sites LIMIT 5;
    """,
    postgres_conn_id=REDSHIFT_NON_ETL_CONN_ID,
    dag=dag,
)
Ejemplo n.º 16
0
# Idempotently create the historical course_assignments table: one row
# per student-course snapshot with grading/progress rollup columns.
create_course_assignments = FBRedshiftOperator(
    task_id='create_course_assignments',
    sql="""
    BEGIN;
    CREATE TABLE IF NOT EXISTS {schema}.course_assignments_historical (
        id integer NOT NULL,
        course_id integer,
        student_id integer,
        created_at timestamp without time zone,
        updated_at timestamp without time zone,
        project_score integer,
        raw_cog_skill_score double precision,
        cog_skill_pcnt integer,
        num_projects_graded integer,
        num_projects_overdue integer,
        num_projects_turned_in integer,
        num_projects_total integer,
        num_projects_ungraded integer,
        num_projects_exempted integer,
        manually_graded boolean,
        overall_score integer,
        letter_grade character varying(1020),
        power_num_mastered integer,
        power_out_of integer,
        power_expected_pcnt integer,
        power_on_track boolean,
        addl_num_mastered integer,
        addl_out_of integer,
        addl_expected_pcnt integer,
        power_expected double precision,
        addl_expected double precision,
        target_letter_grade character varying(1020),
        power_num_behind integer,
        overall_score_changed_at timestamp without time zone,
        start_date date,
        end_date date,
        raw_concept_score double precision,
        concept_pcnt integer,
        site_id integer,
        visibility character varying(256),
        as_of date
    );
    COMMIT;
    """.format(schema=DIM_AND_FCT_SCHEMA),
    dag=dag,
    postgres_conn_id=REDSHIFT_CONN_ID,
)