def execute(self, **context):
    hive = HiveHelper(hive_conn=context['hive_conn_id'])
    impala = ImpalaHelper(hive_conn=context['impala_conn_id'])

    list_all_dates = AirflowMetaStoreHelper().set_granularity(
        list_all_dates=context['list_all_dates'],
        agg_by=context['agg_by']
    )

    for date in list_all_dates:
        hdfs_path = HdfsHelper().generate_hdfs_path(
            env=context['env'],
            layer=context['layer'],
            dag_id=context['dag_name'],
            date=date
        )

        self.log.info("HIVE: Creating partition")
        hive.add_partition(
            date=date,
            db=context['dag_name'],
            table=context['layer'],
            hdfs_path=hdfs_path
        )

        self.log.info("IMPALA: Creating partition")
        impala.add_partition(
            date=date,
            db=context['dag_name'],
            table=context['layer'],
            hdfs_path=hdfs_path
        )

        self.log.info("IMPALA: Refreshing table")
        impala.execute_refresh_table(
            db=context['dag_name'],
            table=context['layer']
        )
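# A minimal sketch of what HiveHelper.add_partition could look like, assuming the helper
# wraps Airflow's HiveServer2Hook and that the table is partitioned by a `dt` column.
# The class body, connection handling and partition column below are assumptions for
# illustration, not the project's actual implementation.
from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook


class HiveHelperSketch:
    def __init__(self, hive_conn: str):
        self.hook = HiveServer2Hook(hiveserver2_conn_id=hive_conn)

    def add_partition(self, date: str, db: str, table: str, hdfs_path: str) -> None:
        # Idempotent DDL: registers the existing HDFS directory as a partition of db.table.
        ddl = (f"ALTER TABLE {db}.{table} "
               f"ADD IF NOT EXISTS PARTITION (dt='{date}') "
               f"LOCATION '{hdfs_path}'")
        conn = self.hook.get_conn()
        try:
            conn.cursor().execute(ddl)
        finally:
            conn.close()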
def generate_all_partitions(self,
                            oracle_conn: str,
                            table_ctrl: str,
                            table_ctrl_col_dt_ref: str,
                            agg_by: str,
                            env: str,
                            layer: str,
                            data: str) -> None:
    """
    Generates all partitions in Hive and Impala.
    Useful when it is necessary to recreate/change DBs, tables or data directories in HDFS.
    """
    list_all_dates = OracleHelper(oracle_conn).get_all_dates(
        table_ctrl=table_ctrl,
        table_ctrl_col_dt_ref=table_ctrl_col_dt_ref
    )
    list_all_dates = [dt for dt in list_all_dates if dt is not None]

    list_dates = AirflowMetaStoreHelper().set_granularity(
        list_all_dates=list_all_dates,
        agg_by=agg_by
    )

    for date in list_dates:
        hdfs_path = HdfsHelper().generate_hdfs_path(
            env=env,
            layer=layer,
            dag_id=data,
            date=date
        )
        self.log.info("Creating partition")
        self.add_partition(
            date=date,
            db=data,
            table=layer,
            hdfs_path=hdfs_path
        )
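# Hypothetical one-off usage of generate_all_partitions for a backfill after the table
# DDL or HDFS layout changes. It assumes the method lives on HiveHelper (it calls
# self.add_partition); the connection id, table and column names are placeholders.
if __name__ == '__main__':
    HiveHelper(hive_conn='hive').generate_all_partitions(oracle_conn='oracle_default',
                                                         table_ctrl='CTRL_TABLE',
                                                         table_ctrl_col_dt_ref='DT_REF',
                                                         agg_by='month',
                                                         env='prod',
                                                         layer='raw',
                                                         data='my_dataset')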
def main():
    template = '-' * 79
    list_data = get_list_docs(path=''.join(here + '/../../configs/data/'))

    for tag in ('oracle', 'redis', 'big_data'):
        logging.info(f'\n{template}\nCreating connections\n{template}')
        for conn_name in dict_config_conn[tag]:
            c = dict_config_conn[tag][conn_name]
            AirflowMetaStoreHelper(airflow_conn=c['conn']) \
                .create_conn(conn_type=c['conn_type'],
                             host=c['host'],
                             login=c['login'],
                             passwd=c['password'],
                             port=c['port'],
                             extra=c['extra'],
                             describe=c['describe'])

    impala = ImpalaHelper(hive_conn='impala')
    hive = HiveHelper(hive_conn='hive')
    hdfs = HdfsHelper(hdfs_conn='webhdfs')

    Variable.set(key='total_rows', value='100000')
    Variable.set(key='env', value=os.environ.get('ENV').lower())

    for data_name in list_data:
        dict_cfg_data = read_data_config(data_name=data_name)
        cfg = dict_cfg_data[data_name]
        cfg_hdfs = dag_cfg["paths"]["path_hdfs"]
        list_all_layers = list(cfg["hdfs_data_schema"].keys())
        list_layers = [layer for layer in list_all_layers if layer != 'db_comment']

        for layer in list_layers:
            # control var
            try:
                logging.info(f'\n{template}\nLoading {data_name} control_var to airflow variables\n{template}')
                val_control_var = hdfs.get_control_var(data_name=data_name, layer=layer)
                Variable.set(key=f'{data_name}_control_var', value=val_control_var)
                logging.info(f'control_var = {val_control_var}')
            except Exception as err:
                if layer == 'raw':
                    logging.error(err.__class__)
                    logging.info('File does not exist. Creating ...')
                    hdfs.create_file_state(data_name=data_name, layer=layer)

            # avro schema
            logging.info(f'\n{template}\nGenerating and saving {data_name} avro schema\n{template}')
            avro_schema = prepare_avro_schema(
                layer=layer,
                data_name=data_name,
                template=template,
                path_ojdbc=f"{dag_cfg['paths']['path_libs']}/{dag_cfg['libs']['ojdbc']}",
                path_native_lib=dag_cfg['paths']['path_native_lib'],
                doc_type=cfg['doc_type'],
                list_dict_cols=cfg["hdfs_data_schema"][layer]["cols"])

            hdfs \
                .save_avro_schema_hdfs(data_name=data_name,
                                       hdfs_path=cfg_hdfs["path_avro_schemas"],
                                       layer=layer,
                                       avro_schema=avro_schema)

            hdfs \
                .download_file(hdfs_path=f'{cfg_hdfs["path_avro_schemas"]}/{layer}/{data_name}.avsc',
                               local_path=f'{dag_cfg["paths"]["path_local_avro_schemas"]}/{layer}')

            # db, tables, partitions
            hive \
                .hive_prepare_env(env='prod',
                                  layer=layer,
                                  data_name=data_name,
                                  cfg=dict_cfg_data[data_name],
                                  cfg_hdfs=cfg_hdfs,
                                  template=template)

            logging.info(f'\n{template}\nUpdating DB statistics\n{template}')
            impala \
                .execute_invalidate_metadata()

            logging.info(f'\n{template}\nDownload {data_name} avro schemas to'
                         f' {dag_cfg["paths"]["path_local_avro_schemas"]}\n{template}')
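# A minimal sketch of how AirflowMetaStoreHelper.create_conn might upsert a connection in
# the Airflow metastore; the ORM approach and the argument mapping below are assumptions
# about the helper, not its actual code.
from airflow.models import Connection
from airflow.utils.session import provide_session


@provide_session
def create_conn_sketch(conn_id, conn_type, host, login, passwd, port, extra, describe, session=None):
    # Replace any existing definition with the same conn_id, then insert the new one.
    session.query(Connection).filter(Connection.conn_id == conn_id).delete()
    session.add(Connection(conn_id=conn_id,
                           conn_type=conn_type,
                           description=describe,  # Connection.description exists in Airflow 2.x
                           host=host,
                           login=login,
                           password=passwd,
                           port=port,
                           extra=extra))
    session.commit()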
def create_dag(dag_name: str,
               agg_by: str,
               cache_blob: str,
               path_avro_schema: str,
               executor_cores: str,
               executor_memory: str,
               driver_memory: str,
               max_registry_by_avro: str,
               table_id: str,
               table_id_col_fk: str,
               table_id_col_dt_ref: str,
               table_id_col_dt_created: str,
               table_blob_col_blob: str) -> airflow.models.dag.DAG:
    # -----------------
    # DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 4, 12),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 100,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Pre-processes and cleans the data from {dag_name}',
             schedule_interval=None,
             default_args=args) as dag:

        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](http://www.xpto.company/layout/site/images/logo_xpto.png)"""

        origin_layer = 'raw'
        dest_layer = 'pre_processed'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        list_all_dates = eval(Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_partitions = eval(Variable.get(f'{dag_name}_list_partitions', default_var='[]'))
        list_inconsistency = eval(Variable.get(f'{dag_name}_{dest_layer}_list_inconsistency', default_var='[]'))

        # -----------------
        # TASKS
        # -----------------
        task_start = PythonOperator(
            task_id='start',
            python_callable=start_time,
            depends_on_past=False,
            op_kwargs={'dag_name': dag_name,
                       'execution_date': '{{ ts }}'}
        )

        with TaskGroup(group_id='group_pre_process_clean_load') as group_pre_process_clean_load:
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'pre_process_clean_load-{date}',
                    python_callable=pre_processAndCleanData().execute,
                    trigger_rule=TriggerRule.ALL_SUCCESS,
                    op_kwargs={
                        'dag_id': dag.dag_id,
                        'step': step,
                        'dag_name': dag_name,
                        'date': date,
                        'env': env,
                        'origin_layer': origin_layer,
                        'dest_layer': dest_layer,
                        'table_blob_col_blob': table_blob_col_blob,
                        'path_spark_xml': path_spark_xml,
                        'path_spark_avro': path_spark_avro,
                        'path_native_lib': path_native_lib,
                        'path_ojdbc': path_ojdbc,
                        'executor_cores': executor_cores,
                        'executor_memory': executor_memory,
                        'driver_memory': driver_memory,
                        'hdfs_conn_id': 'webhdfs',
                        'path_avro_schema': path_avro_schema,
                        'list_all_dates': list_all_dates,
                        'max_registry_by_avro': max_registry_by_avro
                    }
                )

        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={'dag_name': dag_name,
                       'current_dag_name': current_dag_name,
                       'list_all_dates': list_all_dates,
                       'agg_by': agg_by,
                       'env': env,
                       'hive_conn_id': 'hive',
                       'impala_conn_id': 'impala',
                       'layer': dest_layer}
        )

        with TaskGroup(group_id='group_compare_data') as group_compare_data:
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'compare_{origin_layer}_and_{dest_layer}_{date}',
                    python_callable=CompareDataImpalaImpala().execute,
                    trigger_rule=TriggerRule.ALL_SUCCESS,
                    op_kwargs={'dag_name': dag_name,
                               'control_var': control_var,
                               'last_control_var': last_control_var,
                               'date': date,
                               'hive_conn_id': 'impala',
                               'origin_layer': origin_layer,
                               'dest_layer': dest_layer,
                               'redis_conn_id': cache_blob,
                               'table_id': table_id,
                               'dt_ref': table_id_col_dt_ref,
                               'agg_by': agg_by,
                               'table_id_col_fk': table_id_col_fk,
                               'table_id_col_dt_created': table_id_col_dt_created,
                               'list_partitions': list_partitions,
                               'list_all_dates': list_all_dates}
                )

        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id='check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper('airflow_db').check_if_contains_inconsistency,
            op_kwargs={'dag_name': dag_name,
                       'last_control_var': last_control_var,
                       'layer': dest_layer,
                       'true_case': 'generate_list_inconsistency',
                       'false_case': 'clear_airflow_var',
                       'redis_conn_id': cache_blob,
                       'get_redis_key': f'{dag_name}_inconsistency_date'}
        )

        task_generate_list_inconsistency = PythonOperator(
            task_id='generate_list_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=AirflowMetaStoreHelper('airflow_db').generate_list_inconsistency,
            op_kwargs={'dag_name': dag_name,
                       'last_control_var': last_control_var,
                       'layer': dest_layer}
        )

        task_hdfs_clear_inconsistency_data = PythonOperator(
            task_id='hdfs_clear_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=HdfsHelper('webhdfs').clear_inconsistency_data,
            op_kwargs={'dag_name': dag_name,
                       'list_inconsistency': list_inconsistency,
                       'layer': dest_layer}
        )

        task_crash_dag = PythonOperator(
            task_id='crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag
        )

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={'airflow_conn_id': 'airflow_db',
                       'list_pattern_name_var': f"["
                                                f" '{dag_name}_%_total_row_id',"
                                                f" '{dag_name}_raw_total_registry_%',"
                                                f" '{dag_name}_list_dags',"
                                                f" '{dag_name}_list_path_to_concat',"
                                                f" '{dag_name}_list_path_to_rename',"
                                                f" '{dag_name}_list_all_dates'"
                                                f"]"}
        )

        task_no_inconsistencies_were_found = DummyOperator(
            task_id='no_inconsistencies_were_found'
        )

        task_end = PythonOperator(
            task_id='end',
            python_callable=end_time,
            op_kwargs={'current_dag_name': current_dag_name,
                       'dag_name': dag_name,
                       'last_control_var_name': f'{dag_name}_last_control_var',
                       'list_dates': f'{current_dag_name}_list_dates',
                       'postgres_conn_id': 'airflow_db'}
        )

        # -----------------
        # GRAPH
        # -----------------
        (task_start >> group_pre_process_clean_load >> task_create_partitions >> group_compare_data
         >> task_check_if_contains_inconsistency >> task_generate_list_inconsistency
         >> task_hdfs_clear_inconsistency_data >> task_crash_dag)

        (task_start >> group_pre_process_clean_load >> task_create_partitions >> group_compare_data
         >> task_check_if_contains_inconsistency >> task_clear_airflow_var
         >> task_no_inconsistencies_were_found >> task_end)

        task_start >> task_clear_airflow_var >> task_no_inconsistencies_were_found >> task_end

    return dag
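# Hypothetical module-level wiring for the factory above: one pre_process DAG per
# configured dataset, registered in globals() so the scheduler can discover it. The
# get_list_docs / read_data_config calls appear elsewhere in the project; the config keys
# used here are illustrative assumptions.
for data_name in get_list_docs(path=f'{here}/../../configs/data/'):
    cfg = read_data_config(data_name=data_name)[data_name]
    globals()[f'{step}_{data_name}'] = create_dag(dag_name=data_name,
                                                  agg_by=cfg['agg_by'],
                                                  cache_blob='redis',
                                                  path_avro_schema=cfg['path_avro_schema'],
                                                  executor_cores=cfg['executor_cores'],
                                                  executor_memory=cfg['executor_memory'],
                                                  driver_memory=cfg['driver_memory'],
                                                  max_registry_by_avro=cfg['max_registry_by_avro'],
                                                  table_id=cfg['table_id'],
                                                  table_id_col_fk=cfg['table_id_col_fk'],
                                                  table_id_col_dt_ref=cfg['table_id_col_dt_ref'],
                                                  table_id_col_dt_created=cfg['table_id_col_dt_created'],
                                                  table_blob_col_blob=cfg['table_blob_col_blob'])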
def create_dag(dag_name: str,
               agg_by: str,
               doc_type: str,
               cache_blob: str,
               path_avro_schema: str,
               path_local_avro_schemas: str,
               executor_cores: str,
               executor_memory: str,
               executor_instances: str,
               driver_memory: str,
               col_type_pk: str,
               extra_cols: str,
               max_registry_by_file: str,
               oracle_conn_id: str,
               table_ctrl: str,
               table_ctrl_col_control_var: str,
               table_ctrl_col_fk: str,
               table_ctrl_col_dt_ref: str,
               table_ctrl_col_dt_created: str,
               oracle_conn_blob: str,
               table_blob: str,
               table_blob_col_pk: str,
               table_blob_col_blob: str) -> airflow.models.dag.DAG:
    # -----------------
    # DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 17),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=60),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Import data from {dag_name}',
             schedule_interval='00 19 * * *',
             catchup=False,
             default_args=args) as dag:

        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)"""

        layer = 'raw'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        total_pg = int(Variable.get(f'{current_dag_name}_total_pg', default_var=1))
        list_all_dates = eval(Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_current_dates = eval(Variable.get(f'{current_dag_name}_current_dates', default_var='[]'))
        list_dags = eval(Variable.get(f'{dag_name}_list_dags', default_var='[]'))
        total_rows = Variable.get('total_rows', default_var='100000')
        items_by_query = 1000

        sql_id = f'''
            SELECT {table_ctrl_col_fk}          id,
                   {table_ctrl_col_control_var} control_var,
                   to_char({table_ctrl_col_dt_ref}, 'DD-MM-YYYY') dt_ref
            FROM {table_ctrl}
            WHERE {table_ctrl_col_control_var} > :control_var
              AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY') <
                  TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
            ORDER BY {table_ctrl_col_control_var} ASC
            FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': f'{control_var}',
            'total_rows': f'{total_rows}'
        }

        sql_count_id = f'''
            SELECT COUNT({table_ctrl_col_fk})
            FROM {table_ctrl}
            WHERE {table_ctrl_col_control_var} > :control_var
              AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY') <
                  TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': f'{control_var}'}

        # -----------------
        # TASKS
        # -----------------
        task_start = PythonOperator(
            task_id='start',
            python_callable=start_time,
            depends_on_past=False,
            op_kwargs={'dag_name': dag_name,
                       'execution_date': '{{ ts }}'})

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper().check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original',
                'true_case': 'get_id',
                'false_case': 'check_len_list_processed_dates'
            })

        task_get_id = OracleToRedisTransfer(
            task_id='get_id',
            oracle_conn_id=oracle_conn_id,
            redis_conn_id=cache_blob,
            sql=sql_id,
            dict_bind=dict_bind_sql_get_data,
            name_redis_key=f'{dag_name}_original')

        task_fill_data_gap = PythonOperator(
            task_id='fill_data_gap',
            python_callable=RedisHelper(cache_blob).fill_data_gaps,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original'
            })

        task_get_dag_name = PythonOperator(
            task_id='get_dag_name',
            python_callable=AirflowMetaStoreHelper().get_dag_name,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'name_list_dags': f'{dag_name}_list_dags',
                'list_dags': list_dags
            })

        task_get_date = PythonOperator(
            task_id='get_date',
            python_callable=RedisHelper(cache_blob).get_date,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_columns': "['id', 'control_var', 'date']",
                'redis_key': current_dag_name
            })

        task_split_id_by_date = PythonOperator(
            task_id='split_id_by_date',
            python_callable=RedisHelper(cache_blob).split_id_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_pagination = PythonOperator(
            task_id='generate_pagination',
            python_callable=RedisHelper(cache_blob).generate_pagination,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'items_by_query': items_by_query,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_sql_by_date = PythonOperator(
            task_id='generate_sql_by_date',
            python_callable=RedisHelper(cache_blob).generate_sql_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'oracle_conn': oracle_conn_blob,
                'table_ctrl': table_ctrl,
                'table_ctrl_col_fk': table_ctrl_col_fk,
                'table_blob': table_blob,
                'table_blob_col_pk': table_blob_col_pk,
                'table_blob_col_blob': table_blob_col_blob,
                'items_by_query': items_by_query,
                'total_pg': total_pg,
                'extra_cols': extra_cols,
                'redis_key': current_dag_name
            })

        task_extract_decompress_load = OracleBlobToHdfsTransfer(
            task_id='extract_decompress_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            query_id=sql_id,
            table_ctrl_col_fk=table_ctrl_col_fk,
            extra_cols=extra_cols,
            oracle_conn_blob=oracle_conn_blob,
            table_blob_col_pk=table_blob_col_pk,
            table_blob_col_blob=table_blob_col_blob,
            path_avro_schema=path_avro_schema,
            path_local_avro_schemas=f'{path_local_avro_schemas}/{layer}/{dag_name}.avsc',
            total_pg=total_pg,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            col_type_pk=col_type_pk,
            compress_type='snappy',
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            list_current_dates=list_current_dates)

        task_update_control_var = PythonOperator(
            task_id='update_control_var',
            python_callable=AirflowMetaStoreHelper().update_control_var,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'total_pg': total_pg,
                'list_current_dates': list_current_dates,
                'list_all_dates': list_all_dates,
                'redis_key': current_dag_name
            })

        task_clear_environment = PythonOperator(
            task_id='clear_environment',
            python_callable=clear_environment,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'redis_conn_id': cache_blob,
                'airflow_conn_id': 'airflow_db',
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'redis_key': current_dag_name
            })

        task_check_len_list_processed_dates = BranchPythonOperator(
            task_id='check_len_list_processed_dates',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_len_list_processed_dates,
            op_kwargs={
                'dag_name': dag_name,
                'list_all_dates': list_all_dates,
                'true_case': 'prepare_execution',
                'false_case': 'waiting_execution'
            })

        task_prepare_execution = DummyOperator(task_id='prepare_execution')

        with TaskGroup(group_id='group_hdfs_concat_file') as group_hdfs_concat_file:
            task_hdfs_prepare_concat = PythonOperator(
                task_id='hdfs_prepare_concat',
                trigger_rule=TriggerRule.ALL_SUCCESS,
                python_callable=HdfsPrepareConcat('webhdfs').execute,
                op_kwargs={
                    'dag_name': dag_name,
                    'current_dag_name': current_dag_name,
                    'hdfs_path': f'/data/{env}/{layer}/{dag_name}',
                    'agg_by': agg_by,
                    'layer': layer,
                    'env': env,
                    'list_all_dates': list_all_dates,
                    'path_avro_tools': path_avro_tools
                })

            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                task_concat_file = HdfsConcatFiles(
                    task_id=f'hdfs_concat_file-{date}',
                    retries=100,
                    dag_name=dag_name,
                    date=date,
                    layer=layer,
                    env=env,
                    col_name_control_var=table_ctrl_col_control_var,
                    path_avro_schema=path_avro_schema,
                    hdfs_conn_id='webhdfs',
                    executor_cores=executor_cores,
                    executor_memory=executor_memory,
                    driver_memory=driver_memory,
                    path_ojdbc=path_ojdbc,
                    path_spark_avro=path_spark_avro,
                    path_native_lib=path_native_lib,
                    format_data='avro',
                    compress_type='snappy',
                    max_registry_by_avro=max_registry_by_file)
                task_hdfs_prepare_concat >> task_concat_file

        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_all_dates': list_all_dates,
                'hive_conn_id': 'hive',
                'impala_conn_id': 'impala',
                'agg_by': agg_by,
                'layer': layer,
                'env': env
            })

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'layer': layer,
                'control_var': control_var
            })

        with TaskGroup(group_id='group_generate_statistics') as group_generate_statistics:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                PythonOperator(
                    task_id=f'generate_statistics-{date}',
                    retries=50,
                    python_callable=GenerateStatistics().execute,
                    op_kwargs={
                        'dag_name': dag_name,
                        'date': date,
                        'layer': layer,
                        'impala_conn_id': 'impala',
                        'hive_conn_id': 'hive'
                    })

        with TaskGroup(group_id='group_check_data_quality') as group_check_data_quality:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                                      agg_by=agg_by)
            for date in list_all_dates:
                CompareDataOracleImpala(
                    task_id=f'compare_oracle_impala_{date}',
                    retries=100,
                    dag_name=dag_name,
                    last_control_var=last_control_var,
                    layer=layer,
                    date=date,
                    table_ctrl=table_ctrl,
                    dt_ref=table_ctrl_col_dt_ref,
                    agg_by=agg_by,
                    oracle_conn_id=oracle_conn_id,
                    hive_conn='impala',
                    table_ctrl_col_fk=table_ctrl_col_fk,
                    table_ctrl_col_dt_created=table_ctrl_col_dt_created)

        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id='check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper('airflow_db').check_if_contains_inconsistency,
            op_kwargs={
                'dag_name': dag_name,
                'last_control_var': last_control_var,
                'layer': layer,
                'true_case': 'prepare_reprocessing_inconsistency_data',
                'false_case': 'check_next_dag',
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_inconsistency_date'
            })

        task_prepare_reprocessing_inconsistency_data = PrepareReprocessingInconsistencyData(
            task_id='prepare_reprocessing_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            layer=layer,
            last_control_var=last_control_var,
            list_all_dates=list_all_dates,
            table_ctrl=table_ctrl,
            table_ctrl_col_fk=table_ctrl_col_fk,
            table_ctrl_col_control_var=table_ctrl_col_control_var,
            table_ctrl_col_dt_ref=table_ctrl_col_dt_ref,
            table_ctrl_col_dt_created=table_ctrl_col_dt_created,
            hive_conn_id='impala',
            hdfs_conn_id='webhdfs',
            airflow_conn_id='airflow_db',
            oracle_conn_id=oracle_conn_id)

        task_crash_dag = PythonOperator(
            task_id='crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag)

        task_check_next_dag = BranchPythonOperator(
            task_id='check_next_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_next_dag,
            op_kwargs={
                'dag_name': dag_name,
                'doc_type': doc_type,
                'true_case': f'trigger_pre_process_{dag_name}',
                'false_case': f'trigger_parser_{dag_name}'
            })

        task_trigger_pre_process = TriggerDagRunOperator(
            task_id=f'trigger_pre_process_{dag_name}',
            trigger_dag_id=f'pre_process_{dag_name}')

        task_trigger_parser = TriggerDagRunOperator(
            task_id=f'trigger_parser_{dag_name}',
            trigger_dag_id=f'parser_{dag_name}')

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_waiting_execution = DummyOperator(
            trigger_rule=TriggerRule.ALL_DONE,
            task_id='waiting_execution')

        task_end = PythonOperator(
            task_id='end',
            python_callable=end_time,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'dag_name': dag_name,
                'last_control_var_name': f'{dag_name}_last_control_var',
                'list_dates': f'{current_dag_name}_list_dates',
                'postgres_conn_id': 'airflow_db'
            })

        # -----------------
        # GRAPH
        # -----------------
        # task_check_if_contains_data_in_oracle: true
        (task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle
         >> task_get_id >> task_fill_data_gap >> [task_get_date, task_get_dag_name]
         >> task_split_id_by_date >> task_generate_pagination >> task_generate_sql_by_date
         >> task_extract_decompress_load >> task_update_control_var
         >> [task_clear_environment, task_trigger_import_file]
         >> task_waiting_execution >> task_end)

        # task_check_if_contains_data_in_oracle: false
        # task_check_len_list_processed_dates: true
        (task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle
         >> task_check_len_list_processed_dates >> task_prepare_execution
         >> [group_hdfs_concat_file, task_save_execution_state_hdfs] >> task_create_partitions
         >> [group_check_data_quality, group_generate_statistics]
         >> task_check_if_contains_inconsistency)

        (task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle
         >> task_check_len_list_processed_dates >> task_prepare_execution
         >> [group_hdfs_concat_file, task_save_execution_state_hdfs] >> task_create_partitions
         >> task_check_if_contains_inconsistency >> task_prepare_reprocessing_inconsistency_data
         >> task_crash_dag)

        # task_check_next_dag: true
        (task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_pre_process
         >> task_waiting_execution >> task_end)

        # task_check_next_dag: false
        (task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_parser
         >> task_waiting_execution >> task_end)

        # task_check_if_contains_data_in_oracle: false
        # task_check_len_list_processed_dates: false
        (task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle
         >> task_check_len_list_processed_dates >> task_waiting_execution >> task_end)

    return dag
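# A minimal sketch of the pagination arithmetic that the generate_pagination /
# generate_sql_by_date steps rely on: the ids fetched for one reference date are split
# into pages of items_by_query rows, and each page becomes one blob query. Function and
# variable names here are illustrative, not the RedisHelper implementation.
from math import ceil


def paginate_ids(list_ids: list, items_by_query: int = 1000) -> list:
    total_pg = ceil(len(list_ids) / items_by_query)
    return [list_ids[pg * items_by_query:(pg + 1) * items_by_query]
            for pg in range(total_pg)]


# e.g. 2500 ids with items_by_query=1000 -> 3 pages of 1000, 1000 and 500 ids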
def create_dag(dag_name: str,
               max_registry_by_file: str,
               db_name: str,
               table_name: str,
               col_control_var: str,
               col_name_dt_ref: str,
               oracle_conn_id: str,
               path_avro_schema: str,
               path_config_docs: str,
               executor_cores: str,
               executor_memory: str,
               executor_instances: str,
               driver_memory: str) -> airflow.models.dag.DAG:
    # -----------------
    # DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 1),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Imports the data from {dag_name}',
             # schedule_interval=None,
             schedule_interval='00 19 * * *',
             catchup=False,
             default_args=args) as dag:

        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)"""

        layer = 'raw'
        env = Variable.get('env', default_var='prod')
        last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        current_dag_name = dag_name + '_' + control_var
        total_rows = Variable.get('total_rows', default_var='100000')

        sql_get_data = f'''
            SELECT *
            FROM {db_name}.{table_name}
            WHERE {col_control_var} > :control_var
              AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY') <
                  TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
            ORDER BY {col_control_var} ASC
            FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': control_var,
            'total_rows': total_rows
        }

        sql_count_id = f'''
            SELECT COUNT({col_control_var})
            FROM {db_name}.{table_name}
            WHERE {col_control_var} > :control_var
              AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY') <
                  TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': control_var}

        # -----------------
        # TASKS
        # -----------------
        task_start = PythonOperator(
            task_id='start',
            python_callable=start_time,
            depends_on_past=False,
            op_kwargs={'dag_name': dag_name,
                       'execution_date': '{{ ts }}'})

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper().check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'true_case': 'extract_transform_load',
                'false_case': 'sync_data'
            })

        task_extract_transform_load = OracleTableToHdfsTransfer(
            task_id='extract_transform_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            sql_get_data=sql_get_data,
            dict_bind=dict_bind_sql_get_data,
            col_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            oracle_conn_id=oracle_conn_id,
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            max_registry_by_file=max_registry_by_file)

        task_sync_data = SyncData(
            task_id='sync_data',
            dag_name=dag_name,
            db_name=db_name,
            table_name=table_name,
            col_name_control_var=col_control_var,
            col_name_dt_ref=col_name_dt_ref,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            hdfs_conn_id='webhdfs',
            oracle_conn_id=oracle_conn_id,
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            max_registry_by_file=max_registry_by_file)

        task_concat_file = HdfsConcatFiles(
            task_id='hdfs_concat_file',
            retries=100,
            dag_name=dag_name,
            layer=layer,
            env=env,
            col_name_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            hdfs_conn_id='webhdfs',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            format_data='parquet',
            compress_type='snappy',
            max_registry_by_avro=max_registry_by_file)

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'control_var': control_var,
                'layer': layer
            })

        task_update_statistics = UpdateStasByTable(
            task_id='update_statistics',
            retries=50,
            db_name=dag_name,
            table_name=layer,
            impala_conn_id='impala')

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={
                'airflow_conn_id': 'airflow_db',
                'list_pattern_name_var': f"["
                                         f" '{dag_name}_%_total_row_id',"
                                         f" '{dag_name}_raw_total_registry_%',"
                                         f" '{dag_name}_list_dags',"
                                         f" '{dag_name}_list_path_to_concat',"
                                         f" '{dag_name}_list_path_to_rename',"
                                         f" '{dag_name}_list_all_dates'"
                                         f"]"
            })

        task_update_control_var = OracleUpdateControlVar(
            task_id='update_control_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            control_var=control_var,
            last_control_var=last_control_var,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            col_control_var=col_control_var,
            oracle_conn_id=oracle_conn_id,
            dict_bind=dict_bind_sql_get_data,
            sql=sql_get_data)

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_end = PythonOperator(
            task_id='end',
            trigger_rule=TriggerRule.ALL_DONE,
            python_callable=end_time,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'dag_name': dag_name,
                'last_control_var_name': f'{dag_name}_last_control_var',
                'postgres_conn_id': 'airflow_db'
            })

        # -----------------
        # GRAPH
        # -----------------
        (task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle
         >> task_extract_transform_load >> task_update_control_var
         >> task_trigger_import_file >> task_end)

        (task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle
         >> task_sync_data >> [task_concat_file, task_save_execution_state_hdfs]
         >> task_update_statistics >> task_clear_airflow_var >> task_end)

    return dag
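# A minimal sketch of how the bound count query above could be executed, assuming
# OracleGetResults ultimately goes through the Oracle provider hook; the get_first call
# with a named-bind dict is an assumption about the operator's internals, not its code.
from airflow.providers.oracle.hooks.oracle import OracleHook


def count_pending_rows(oracle_conn_id: str, sql_count_id: str, dict_bind: dict) -> int:
    hook = OracleHook(oracle_conn_id=oracle_conn_id)
    # cx_Oracle resolves :control_var (and :total_rows, if present) from the dict of named binds.
    return int(hook.get_first(sql_count_id, parameters=dict_bind)[0])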
def clear_environment(**context) -> None:
    RedisHelper(context['redis_conn_id']) \
        .clear_redis(dag_name=context['dag_name'])
    AirflowMetaStoreHelper(context['airflow_conn_id']) \
        .delete_airflow_var(dag_name=context['dag_name'],
                            last_control_var=context['last_control_var'])
def clear_airflow_var(**context) -> None:
    AirflowMetaStoreHelper(context['airflow_conn_id']) \
        .delete_pattern_var(context['list_pattern_name_var'])
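# A minimal sketch of what AirflowMetaStoreHelper.delete_pattern_var might do with the
# pattern list passed by clear_airflow_var: drop every Variable whose key matches one of
# the SQL LIKE patterns. The ORM approach is an assumption, not the helper's actual code.
from ast import literal_eval

from airflow.models import Variable
from airflow.utils.session import provide_session


@provide_session
def delete_pattern_var_sketch(list_pattern_name_var: str, session=None) -> None:
    for pattern in literal_eval(list_pattern_name_var):
        session.query(Variable) \
            .filter(Variable.key.like(pattern.strip())) \
            .delete(synchronize_session=False)
    session.commit()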