Code example #1
    def execute(self, **context):
        hive = HiveHelper(hive_conn=context['hive_conn_id'])
        impala = ImpalaHelper(hive_conn=context['impala_conn_id'])
        list_all_dates = AirflowMetaStoreHelper().set_granularity(
            list_all_dates=context['list_all_dates'],
            agg_by=context['agg_by']
        )

        for date in list_all_dates:
            hdfs_path = HdfsHelper().generate_hdfs_path(env=context["env"],
                                                        layer=context['layer'],
                                                        dag_id=context['dag_name'],
                                                        date=date)

            self.log.info("HIVE: Creating partition")
            hive.add_partition(date=date,
                               db=context['dag_name'],
                               table=context['layer'],
                               hdfs_path=hdfs_path)

            self.log.info("IMPALA: Creating partition")
            impala.add_partition(date=date,
                                 db=context['dag_name'],
                                 table=context['layer'],
                                 hdfs_path=hdfs_path)

            self.log.info("IMPALA: Refreshing table")
            impala.execute_refresh_table(db=context['dag_name'],
                                         table=context['layer'])
Code example #2
    def generate_all_partitions(self, oracle_conn: str, table_ctrl: str,
                                table_ctrl_col_dt_ref: str, agg_by: str,
                                env: str, layer: str, data: str) -> None:
        """
        Generates all partitions in Hive and Impala.
        Useful when databases, tables, or HDFS data directories need to be recreated or changed.
        """
        list_all_dates = OracleHelper(oracle_conn).get_all_dates(
            table_ctrl=table_ctrl, table_ctrl_col_dt_ref=table_ctrl_col_dt_ref)
        list_all_dates = [dt for dt in list_all_dates if dt is not None]
        list_dates = AirflowMetaStoreHelper().set_granularity(
            list_all_dates=list_all_dates, agg_by=agg_by)

        for date in list_dates:
            hdfs_path = HdfsHelper().generate_hdfs_path(env=env,
                                                        layer=layer,
                                                        dag_id=data,
                                                        date=date)

            self.log.info(f"Creating partition:")
            self.add_partition(date=date,
                               db=data,
                               table=layer,
                               hdfs_path=hdfs_path)
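
A minimal usage sketch for generate_all_partitions, assuming it lives on the same HiveHelper shown in example #1 (it calls self.add_partition); every argument value below is a placeholder:

# Hypothetical invocation; connection IDs, table and column names are placeholders.
HiveHelper(hive_conn='hive').generate_all_partitions(
    oracle_conn='oracle_default',
    table_ctrl='CTRL_DOCS',
    table_ctrl_col_dt_ref='DT_REF',
    agg_by='month',
    env='prod',
    layer='raw',
    data='my_dag')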
Code example #3
def main():
    template = '-' * 79
    list_data = get_list_docs(path=f'{here}/../../configs/data/')

    for tag in ('oracle', 'redis', 'big_data'):
        logging.info(f'\n{template}\nCreating connections\n{template}')
        for conn_name in dict_config_conn[tag]:
            c = dict_config_conn[tag][conn_name]
            AirflowMetaStoreHelper(airflow_conn=c['conn']) \
                .create_conn(conn_type=c['conn_type'],
                             host=c['host'],
                             login=c['login'],
                             passwd=c['password'],
                             port=c['port'],
                             extra=c['extra'],
                             describe=c['describe'])

    impala = ImpalaHelper(hive_conn='impala')
    hive = HiveHelper(hive_conn='hive')
    hdfs = HdfsHelper(hdfs_conn='webhdfs')

    Variable.set(key='total_rows', value='100000')
    Variable.set(key='env', value=os.environ.get('ENV').lower())

    for data_name in list_data:
        dict_cfg_data = read_data_config(data_name=data_name)
        cfg = dict_cfg_data[data_name]
        cfg_hdfs = dag_cfg["paths"]["path_hdfs"]

        list_all_layers = list(cfg["hdfs_data_schema"].keys())
        list_layers = [
            layer for layer in list_all_layers if layer != 'db_comment'
        ]

        for layer in list_layers:

            # control var
            try:
                logging.info(
                    f'\n{template}\nLoading {data_name} control_var to airflow variables\n{template}'
                )
                val_control_var = hdfs.get_control_var(data_name=data_name,
                                                       layer=layer)
                Variable.set(key=f'{data_name}_control_var',
                             value=val_control_var)
                logging.info(f'control_var = {val_control_var}')
            except Exception as err:
                if layer == 'raw':
                    logging.error(err.__class__)
                    logging.info('File does not exist. Creating ...')
                    hdfs.create_file_state(data_name=data_name, layer=layer)

            # avro schema
            logging.info(
                f'\n{template}\nGenerating and saving {data_name} avro schema\n{template}'
            )
            avro_schema = prepare_avro_schema(
                layer=layer,
                data_name=data_name,
                template=template,
                path_ojdbc=
                f"{dag_cfg['paths']['path_libs']}/{dag_cfg['libs']['ojdbc']}",
                path_native_lib=dag_cfg['paths']['path_native_lib'],
                doc_type=cfg['doc_type'],
                list_dict_cols=cfg["hdfs_data_schema"][layer]["cols"])
            hdfs.save_avro_schema_hdfs(data_name=data_name,
                                       hdfs_path=cfg_hdfs["path_avro_schemas"],
                                       layer=layer,
                                       avro_schema=avro_schema)
            hdfs.download_file(
                hdfs_path=f'{cfg_hdfs["path_avro_schemas"]}/{layer}/{data_name}.avsc',
                local_path=f'{dag_cfg["paths"]["path_local_avro_schemas"]}/{layer}')

            # db, tables, partitions
            hive.hive_prepare_env(env='prod',
                                  layer=layer,
                                  data_name=data_name,
                                  cfg=dict_cfg_data[data_name],
                                  cfg_hdfs=cfg_hdfs,
                                  template=template)

            logging.info(f'\n{template}\nInvalidating Impala metadata\n{template}')
            impala.execute_invalidate_metadata()

            logging.info(
                f'\n{template}\nDownloaded {data_name} avro schemas to'
                f' {dag_cfg["paths"]["path_local_avro_schemas"]}\n{template}')
Code example #4
def create_dag(
    dag_name: str,
    agg_by: str,
    cache_blob: str,
    path_avro_schema: str,
    executor_cores: str,
    executor_memory: str,
    driver_memory: str,
    max_registry_by_avro: str,
    table_id: str,
    table_id_col_fk: str,
    table_id_col_dt_ref: str,
    table_id_col_dt_created: str,
    table_blob_col_blob: str
) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 4, 12),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 100,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Pre-processes and cleans data from {dag_name}',
             schedule_interval=None,
             default_args=args) as dag:
        dag.doc_md = __doc__ or ''
        dag.doc_md += '\n![image alt <](http://www.xpto.company/layout/site/images/logo_xpto.png)'

        origin_layer = 'raw'
        dest_layer = 'pre_processed'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        list_all_dates = eval(Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_partitions = eval(Variable.get(f'{dag_name}_list_partitions', default_var='[]'))
        list_inconsistency = eval(Variable.get(f'{dag_name}_{dest_layer}_list_inconsistency', default_var='[]'))

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(
            task_id='start',
            python_callable=start_time,
            depends_on_past=False,
            op_kwargs={'dag_name': dag_name,
                       'execution_date': '{{ ts }}'}
        )
        
        with TaskGroup(group_id='group_pre_process_clean_load') as group_pre_process_clean_load:
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'pre_process_clean_load-{date}',
                    python_callable=pre_processAndCleanData().execute,
                    trigger_rule=TriggerRule.ALL_SUCCESS,
                    op_kwargs={
                        'dag_id': dag.dag_id,
                        'step': step,
                        'dag_name': dag_name,
                        'date': date,
                        'env': env,
                        'origin_layer': origin_layer,
                        'dest_layer': dest_layer,
                        'table_blob_col_blob': table_blob_col_blob,
                        'path_spark_xml': path_spark_xml,
                        'path_spark_avro': path_spark_avro,
                        'path_native_lib': path_native_lib,
                        'path_ojdbc': path_ojdbc,
                        'executor_cores': executor_cores,
                        'executor_memory': executor_memory,
                        'driver_memory': driver_memory,
                        'hdfs_conn_id': 'webhdfs',
                        'path_avro_schema': path_avro_schema,
                        'list_all_dates': list_all_dates,
                        'max_registry_by_avro': max_registry_by_avro
                    }
                )
        
        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={'dag_name': dag_name,
                       'current_dag_name': current_dag_name,
                       'list_all_dates': list_all_dates,
                       'agg_by': agg_by,
                       'env': env,
                       'hive_conn_id': 'hive',
                       'impala_conn_id': 'impala',
                       'layer': dest_layer}
        )
        
        with TaskGroup(group_id='group_compare_data') as group_compare_data:
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
        
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'compare_{origin_layer}_and_{dest_layer}_{date}',
                    python_callable=CompareDataImpalaImpala().execute,
                    trigger_rule=TriggerRule.ALL_SUCCESS,
                    op_kwargs={'dag_name': dag_name,
                               'control_var': control_var,
                               'last_control_var': last_control_var,
                               'date': date,
                               'hive_conn_id': 'impala',
                               'origin_layer': origin_layer,
                               'dest_layer': dest_layer,
                               'redis_conn_id': cache_blob,
                               'table_id': table_id,
                               'dt_ref': table_id_col_dt_ref,
                               'agg_by': agg_by,
                               'table_id_col_fk': table_id_col_fk,
                               'table_id_col_dt_created': table_id_col_dt_created,
                               'list_partitions': list_partitions,
                               'list_all_dates': list_all_dates}
                )
        
        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id='check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper('airflow_db').check_if_contains_inconsistency,
            op_kwargs={'dag_name': dag_name,
                       'last_control_var': last_control_var,
                       'layer': dest_layer,
                       'true_case': 'generate_list_inconsistency',
                       'false_case': 'clear_airflow_var',
                       'redis_conn_id': cache_blob,
                       'get_redis_key': f'{dag_name}_inconsistency_date'}
        )
        
        task_generate_list_inconsistency = PythonOperator(
            task_id='generate_list_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=AirflowMetaStoreHelper('airflow_db').generate_list_inconsistency,
            op_kwargs={'dag_name': dag_name,
                       'last_control_var': last_control_var,
                       'layer': dest_layer}
        )
        
        task_hdfs_clear_inconsistency_data = PythonOperator(
            task_id='hdfs_clear_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=HdfsHelper('webhdfs').clear_inconsistency_data,
            op_kwargs={'dag_name': dag_name,
                       'list_inconsistency': list_inconsistency,
                       'layer': dest_layer}
        )
        
        task_crash_dag = PythonOperator(
            task_id='crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag
        )

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={'airflow_conn_id': 'airflow_db',
                       'list_pattern_name_var': f"["
                                                f" '{dag_name}_%_total_row_id',"
                                                f" '{dag_name}_raw_total_registry_%',"
                                                f" '{dag_name}_list_dags',"
                                                f" '{dag_name}_list_path_to_concat',"
                                                f" '{dag_name}_list_path_to_rename',"
                                                f" '{dag_name}_list_all_dates'"
                                                f"]"}
        )

        task_no_inconsistencies_were_found = DummyOperator(
            task_id='no_inconsistencies_were_found'
        )

        task_end = PythonOperator(
            task_id='end',
            python_callable=end_time,
            op_kwargs={'current_dag_name': current_dag_name,
                       'dag_name': dag_name,
                       'last_control_var_name': f'{dag_name}_last_control_var',
                       'list_dates': f'{current_dag_name}_list_dates',
                       'postgres_conn_id': 'airflow_db'}
        )

    # -----------------
    #      GRAPH
    # -----------------
    task_start >> group_pre_process_clean_load >> task_create_partitions >> group_compare_data >> task_check_if_contains_inconsistency >> task_generate_list_inconsistency >> task_hdfs_clear_inconsistency_data >> task_crash_dag
    task_start >> group_pre_process_clean_load >> task_create_partitions >> group_compare_data >> task_check_if_contains_inconsistency >> task_clear_airflow_var >> task_no_inconsistencies_were_found >> task_end
    task_start >> task_clear_airflow_var >> task_no_inconsistencies_were_found >> task_end

    return dag
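
A factory like create_dag is presumably invoked once per configured dataset at module import time so Airflow can discover the resulting DAGs. A minimal registration sketch under that assumption; every config value below is a placeholder:

# Hypothetical registration; the config values are placeholders.
dag_configs = [
    {
        'dag_name': 'my_dag',
        'agg_by': 'month',
        'cache_blob': 'redis_blob',
        'path_avro_schema': '/data/avro_schemas',
        'executor_cores': '2',
        'executor_memory': '4g',
        'driver_memory': '2g',
        'max_registry_by_avro': '100000',
        'table_id': 'CTRL_DOCS',
        'table_id_col_fk': 'ID_DOC',
        'table_id_col_dt_ref': 'DT_REF',
        'table_id_col_dt_created': 'DT_CREATED',
        'table_blob_col_blob': 'DOC_BLOB',
    },
]

for dag_kwargs in dag_configs:
    dag = create_dag(**dag_kwargs)
    # Airflow discovers DAG objects exposed as module-level globals.
    globals()[dag.dag_id] = dag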
Code example #5
def create_dag(dag_name: str, agg_by: str, doc_type: str, cache_blob: str,
               path_avro_schema: str, path_local_avro_schemas: str,
               executor_cores: str, executor_memory: str,
               executor_instances: str, driver_memory: str, col_type_pk: str,
               extra_cols: str, max_registry_by_file: str, oracle_conn_id: str,
               table_ctrl: str, table_ctrl_col_control_var: str,
               table_ctrl_col_fk: str, table_ctrl_col_dt_ref: str,
               table_ctrl_col_dt_created: str, oracle_conn_blob: str,
               table_blob: str, table_blob_col_pk: str,
               table_blob_col_blob: str) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 17),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=60),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Import data from {dag_name}',
             schedule_interval='00 19 * * *',
             catchup=False,
             default_args=args) as dag:
        dag.doc_md = __doc__ or ''
        dag.doc_md += '\n![image alt <](../big_data.wiki/.attachments/xpto_company.png)'

        layer = 'raw'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var',
                                        default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        total_pg = int(
            Variable.get(f'{current_dag_name}_total_pg', default_var=1))
        list_all_dates = eval(
            Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_current_dates = eval(
            Variable.get(f'{current_dag_name}_current_dates',
                         default_var='[]'))
        list_dags = eval(
            Variable.get(f'{dag_name}_list_dags', default_var='[]'))
        total_rows = Variable.get('total_rows', default_var='100000')
        items_by_query = 1000

        sql_id = f'''
        SELECT
            {table_ctrl_col_fk} id,
            {table_ctrl_col_control_var} control_var,
            to_char({table_ctrl_col_dt_ref}, 'DD-MM-YYYY') dt_ref
        FROM {table_ctrl}
        WHERE
           {table_ctrl_col_control_var} > :control_var
           AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {table_ctrl_col_control_var} ASC
        FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': f'{control_var}',
            'total_rows': f'{total_rows}'
        }

        sql_count_id = f'''
        SELECT COUNT({table_ctrl_col_fk})
        FROM {table_ctrl}
        WHERE
           {table_ctrl_col_control_var} > :control_var
           AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': f'{control_var}'}

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(task_id='start',
                                    python_callable=start_time,
                                    depends_on_past=False,
                                    op_kwargs={
                                        'dag_name': dag_name,
                                        'execution_date': '{{ ts }}'
                                    })

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper().check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original',
                'true_case': 'get_id',
                'false_case': 'check_len_list_processed_dates'
            })

        task_get_id = OracleToRedisTransfer(
            task_id='get_id',
            oracle_conn_id=oracle_conn_id,
            redis_conn_id=cache_blob,
            sql=sql_id,
            dict_bind=dict_bind_sql_get_data,
            name_redis_key=f'{dag_name}_original')

        task_fill_data_gap = PythonOperator(
            task_id='fill_data_gap',
            python_callable=RedisHelper(cache_blob).fill_data_gaps,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original'
            })

        task_get_dag_name = PythonOperator(
            task_id='get_dag_name',
            python_callable=AirflowMetaStoreHelper().get_dag_name,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'name_list_dags': f'{dag_name}_list_dags',
                'list_dags': list_dags
            })

        task_get_date = PythonOperator(
            task_id='get_date',
            python_callable=RedisHelper(cache_blob).get_date,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_columns': "['id', 'control_var', 'date']",
                'redis_key': current_dag_name
            })

        task_split_id_by_date = PythonOperator(
            task_id='split_id_by_date',
            python_callable=RedisHelper(cache_blob).split_id_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_pagination = PythonOperator(
            task_id='generate_pagination',
            python_callable=RedisHelper(cache_blob).generate_pagination,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'items_by_query': items_by_query,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_sql_by_date = PythonOperator(
            task_id='generate_sql_by_date',
            python_callable=RedisHelper(cache_blob).generate_sql_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'oracle_conn': oracle_conn_blob,
                'table_ctrl': table_ctrl,
                'table_ctrl_col_fk': table_ctrl_col_fk,
                'table_blob': table_blob,
                'table_blob_col_pk': table_blob_col_pk,
                'table_blob_col_blob': table_blob_col_blob,
                'items_by_query': items_by_query,
                'total_pg': total_pg,
                'extra_cols': extra_cols,
                'redis_key': current_dag_name
            })

        task_extract_decompress_load = OracleBlobToHdfsTransfer(
            task_id='extract_decompress_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            query_id=sql_id,
            table_ctrl_col_fk=table_ctrl_col_fk,
            extra_cols=extra_cols,
            oracle_conn_blob=oracle_conn_blob,
            table_blob_col_pk=table_blob_col_pk,
            table_blob_col_blob=table_blob_col_blob,
            path_avro_schema=path_avro_schema,
            path_local_avro_schemas=
            f'{path_local_avro_schemas}/{layer}/{dag_name}.avsc',
            total_pg=total_pg,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            col_type_pk=col_type_pk,
            compress_type='snappy',
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            list_current_dates=list_current_dates)

        task_update_control_var = PythonOperator(
            task_id='update_control_var',
            python_callable=AirflowMetaStoreHelper().update_control_var,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'total_pg': total_pg,
                'list_current_dates': list_current_dates,
                'list_all_dates': list_all_dates,
                'redis_key': current_dag_name
            })

        task_clear_environment = PythonOperator(
            task_id='clear_environment',
            python_callable=clear_environment,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'redis_conn_id': cache_blob,
                'airflow_conn_id': 'airflow_db',
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'redis_key': current_dag_name
            })

        task_check_len_list_processed_dates = BranchPythonOperator(
            task_id='check_len_list_processed_dates',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_len_list_processed_dates,
            op_kwargs={
                'dag_name': dag_name,
                'list_all_dates': list_all_dates,
                'true_case': 'prepare_execution',
                'false_case': 'waiting_execution'
            })

        task_prepare_execution = DummyOperator(task_id='prepare_execution')

        with TaskGroup(
                group_id='group_hdfs_concat_file') as group_hdfs_concat_file:
            task_hdfs_prepare_concat = PythonOperator(
                task_id='hdfs_prepare_concat',
                trigger_rule=TriggerRule.ALL_SUCCESS,
                python_callable=HdfsPrepareConcat('webhdfs').execute,
                op_kwargs={
                    'dag_name': dag_name,
                    'current_dag_name': current_dag_name,
                    'hdfs_path': f'/data/{env}/{layer}/{dag_name}',
                    'agg_by': agg_by,
                    'layer': layer,
                    'env': env,
                    'list_all_dates': list_all_dates,
                    'path_avro_tools': path_avro_tools
                })

            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)
            for date in list_all_dates:
                task_concat_file = HdfsConcatFiles(
                    task_id=f'hdfs_concat_file-{date}',
                    retries=100,
                    dag_name=dag_name,
                    date=date,
                    layer=layer,
                    env=env,
                    col_name_control_var=table_ctrl_col_control_var,
                    path_avro_schema=path_avro_schema,
                    hdfs_conn_id='webhdfs',
                    executor_cores=executor_cores,
                    executor_memory=executor_memory,
                    driver_memory=driver_memory,
                    path_ojdbc=path_ojdbc,
                    path_spark_avro=path_spark_avro,
                    path_native_lib=path_native_lib,
                    format_data='avro',
                    compress_type='snappy',
                    max_registry_by_avro=max_registry_by_file)
                task_hdfs_prepare_concat >> task_concat_file

        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_all_dates': list_all_dates,
                'hive_conn_id': 'hive',
                'impala_conn_id': 'impala',
                'agg_by': agg_by,
                'layer': layer,
                'env': env
            })

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'layer': layer,
                'control_var': control_var
            })

        with TaskGroup(group_id='group_generate_statistics'
                       ) as group_generate_statistics:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)

            for date in list_all_dates:
                PythonOperator(task_id=f'generate_statistics-{date}',
                               retries=50,
                               python_callable=GenerateStatistics().execute,
                               op_kwargs={
                                   'dag_name': dag_name,
                                   'date': date,
                                   'layer': layer,
                                   'impala_conn_id': 'impala',
                                   'hive_conn_id': 'hive'
                               })

        with TaskGroup(group_id='group_check_data_quality'
                       ) as group_check_data_quality:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)

            for date in list_all_dates:
                CompareDataOracleImpala(
                    task_id=f'compare_oracle_impala_{date}',
                    retries=100,
                    dag_name=dag_name,
                    last_control_var=last_control_var,
                    layer=layer,
                    date=date,
                    table_ctrl=table_ctrl,
                    dt_ref=table_ctrl_col_dt_ref,
                    agg_by=agg_by,
                    oracle_conn_id=oracle_conn_id,
                    hive_conn='impala',
                    table_ctrl_col_fk=table_ctrl_col_fk,
                    table_ctrl_col_dt_created=table_ctrl_col_dt_created)

        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id='check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper('airflow_db').check_if_contains_inconsistency,
            op_kwargs={
                'dag_name': dag_name,
                'last_control_var': last_control_var,
                'layer': layer,
                'true_case': 'prepare_reprocessing_inconsistency_data',
                'false_case': f'check_next_dag',
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_inconsistency_date'
            })

        task_prepare_reprocessing_inconsistency_data = PrepareReprocessingInconsistencyData(
            task_id='prepare_reprocessing_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            layer=layer,
            last_control_var=last_control_var,
            list_all_dates=list_all_dates,
            table_ctrl=table_ctrl,
            table_ctrl_col_fk=table_ctrl_col_fk,
            table_ctrl_col_control_var=table_ctrl_col_control_var,
            table_ctrl_col_dt_ref=table_ctrl_col_dt_ref,
            table_ctrl_col_dt_created=table_ctrl_col_dt_created,
            hive_conn_id='impala',
            hdfs_conn_id='webhdfs',
            airflow_conn_id='airflow_db',
            oracle_conn_id=oracle_conn_id)

        task_crash_dag = PythonOperator(
            task_id='crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag,
        )

        task_check_next_dag = BranchPythonOperator(
            task_id='check_next_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_next_dag,
            op_kwargs={
                'dag_name': dag_name,
                'doc_type': doc_type,
                'true_case': f'trigger_pre_process_{dag_name}',
                'false_case': f'trigger_parser_{dag_name}'
            })

        task_trigger_pre_process = TriggerDagRunOperator(
            task_id=f'trigger_pre_process_{dag_name}',
            trigger_dag_id=f"pre_process_{dag_name}")

        task_trigger_parser = TriggerDagRunOperator(
            task_id=f'trigger_parser_{dag_name}',
            trigger_dag_id=f"parser_{dag_name}")

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_waiting_execution = DummyOperator(
            trigger_rule=TriggerRule.ALL_DONE, task_id='waiting_execution')

        task_end = PythonOperator(task_id='end',
                                  python_callable=end_time,
                                  op_kwargs={
                                      'current_dag_name': current_dag_name,
                                      'dag_name': dag_name,
                                      'last_control_var_name':
                                      f'{dag_name}_last_control_var',
                                      'list_dates':
                                      f'{current_dag_name}_list_dates',
                                      'postgres_conn_id': 'airflow_db'
                                  })

    # -----------------
    #      GRAPH
    # -----------------
    # task_check_if_contains_data_in_oracle: true
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_get_id >> task_fill_data_gap >> [
        task_get_date, task_get_dag_name
    ] >> task_split_id_by_date >> task_generate_pagination >> task_generate_sql_by_date >> task_extract_decompress_load >> task_update_control_var >> [
        task_clear_environment, task_trigger_import_file
    ] >> task_waiting_execution >> task_end

    # task_check_if_contains_data_in_oracle: false
    #   task_check_len_list_processed_dates: true
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [
        group_hdfs_concat_file, task_save_execution_state_hdfs
    ] >> task_create_partitions >> [
        group_check_data_quality, group_generate_statistics
    ] >> task_check_if_contains_inconsistency
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [
        group_hdfs_concat_file, task_save_execution_state_hdfs
    ] >> task_create_partitions >> task_check_if_contains_inconsistency >> task_prepare_reprocessing_inconsistency_data >> task_crash_dag

    # task_check_next_dag: true
    task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_pre_process >> task_waiting_execution >> task_end
    # task_check_next_dag: false
    task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_parser >> task_waiting_execution >> task_end

    # task_check_if_contains_data_in_oracle: false
    #   task_check_len_list_processed_dates: false
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_waiting_execution >> task_end

    return dag
Code example #6
def create_dag(dag_name: str, max_registry_by_file: str, db_name: str,
               table_name: str, col_control_var: str, col_name_dt_ref: str,
               oracle_conn_id: str, path_avro_schema: str,
               path_config_docs: str, executor_cores: str,
               executor_memory: str, executor_instances: str,
               driver_memory: str) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 1),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(
            dag_id=f'{step}_{dag_name}',
            description=f'Imports data from {dag_name}',
            # schedule_interval=None,
            schedule_interval='00 19 * * *',
            catchup=False,
            default_args=args) as dag:
        dag.doc_md = __doc__ or ''
        dag.doc_md += '\n![image alt <](../big_data.wiki/.attachments/xpto_company.png)'
        layer = 'raw'
        env = Variable.get('env', default_var='prod')
        last_control_var = Variable.get(f'{dag_name}_last_control_var',
                                        default_var='000000000000000')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        current_dag_name = dag_name + '_' + control_var
        total_rows = Variable.get('total_rows', default_var='100000')

        sql_get_data = f'''
        SELECT
            *
        FROM {db_name}.{table_name}
        WHERE
            {col_control_var} > :control_var
            AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {col_control_var} ASC
        FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': control_var,
            'total_rows': total_rows
        }

        sql_count_id = f'''
        SELECT COUNT({col_control_var})
        FROM {db_name}.{table_name}
        WHERE
            {col_control_var} > :control_var
            AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': control_var}

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(task_id='start',
                                    python_callable=start_time,
                                    depends_on_past=False,
                                    op_kwargs={
                                        'dag_name': dag_name,
                                        'execution_date': '{{ ts }}'
                                    })

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper().check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'true_case': 'extract_transform_load',
                'false_case': 'sync_data'
            })

        task_extract_transform_load = OracleTableToHdfsTransfer(
            task_id='extract_transform_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            sql_get_data=sql_get_data,
            dict_bind=dict_bind_sql_get_data,
            col_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            oracle_conn_id=oracle_conn_id,
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            max_registry_by_file=max_registry_by_file)

        task_sync_data = SyncData(
            task_id='sync_data',
            dag_name=dag_name,
            db_name=db_name,
            table_name=table_name,
            col_name_control_var=col_control_var,
            col_name_dt_ref=col_name_dt_ref,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            hdfs_conn_id='webhdfs',
            oracle_conn_id=oracle_conn_id,
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            max_registry_by_file=max_registry_by_file)

        task_concat_file = HdfsConcatFiles(
            task_id='hdfs_concat_file',
            retries=100,
            dag_name=dag_name,
            layer=layer,
            env=env,
            col_name_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            hdfs_conn_id='webhdfs',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            format_data='parquet',
            compress_type='snappy',
            max_registry_by_avro=max_registry_by_file)

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'control_var': control_var,
                'layer': layer
            })

        task_update_statistics = UpdateStasByTable(
            task_id='update_statistics',
            retries=50,
            db_name=dag_name,
            table_name=layer,
            impala_conn_id='impala')

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={
                'airflow_conn_id':
                'airflow_db',
                'list_pattern_name_var':
                f"["
                f" '{dag_name}_%_total_row_id',"
                f" '{dag_name}_raw_total_registry_%',"
                f" '{dag_name}_list_dags',"
                f" '{dag_name}_list_path_to_concat',"
                f" '{dag_name}_list_path_to_rename',"
                f" '{dag_name}_list_all_dates'"
                f"]"
            })

        task_update_control_var = OracleUpdateControlVar(
            task_id='update_control_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            control_var=control_var,
            last_control_var=last_control_var,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            col_control_var=col_control_var,
            oracle_conn_id=oracle_conn_id,
            dict_bind=dict_bind_sql_get_data,
            sql=sql_get_data)

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_end = PythonOperator(task_id='end',
                                  trigger_rule=TriggerRule.ALL_DONE,
                                  python_callable=end_time,
                                  op_kwargs={
                                      'current_dag_name': current_dag_name,
                                      'dag_name': dag_name,
                                      'last_control_var_name':
                                      f'{dag_name}_last_control_var',
                                      'postgres_conn_id': 'airflow_db'
                                  })

    # -----------------
    #      GRAPH
    # -----------------
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_extract_transform_load >> task_update_control_var >> task_trigger_import_file >> task_end
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_sync_data >> [
        task_concat_file, task_save_execution_state_hdfs
    ] >> task_update_statistics >> task_clear_airflow_var >> task_end

    return dag
Code example #7
def clear_environment(**context) -> None:
    RedisHelper(context['redis_conn_id']) \
        .clear_redis(dag_name=context['dag_name'])
    AirflowMetaStoreHelper(context['airflow_conn_id']) \
        .delete_airflow_var(dag_name=context['dag_name'],
                            last_control_var=context['last_control_var'])
Code example #8
def clear_airflow_var(**context) -> None:
    AirflowMetaStoreHelper(context['airflow_conn_id']) \
        .delete_pattern_var(context['list_pattern_name_var'])