def get_pyspark_df_to_process(oracle_conn_id: str,
                              oracle_conn_blob: str,
                              oracle_driver: str,
                              spark: pyspark.sql.session.SparkSession,
                              n_partitions: int,
                              query_blob: str,
                              table_blob_col_pk: str,
                              table_blob_col_blob: str,
                              current_dag_name: str,
                              extra_cols: str,
                              date: str) -> pyspark.sql.dataframe.DataFrame:
    """Read the blob table into a partitioned PySpark DataFrame and, when extra
    columns are configured, join them in via the SQL stored in an Airflow Variable."""
    df = OracleHelper(oracle_conn_blob) \
        .get_pyspark_df_from_table(oracle_driver=oracle_driver,
                                   spark=spark,
                                   table=f'({query_blob})',
                                   partition_col='COL_PARTITION',
                                   n_partitions=n_partitions * 5) \
        .select(table_blob_col_pk, table_blob_col_blob)

    if len(extra_cols) > 0:
        query_extra_col = Variable.get(f'{current_dag_name}_sql_extra_cols_{date}')
        df_extra_cols = OracleHelper(oracle_conn_id) \
            .get_pyspark_df(spark=spark,
                            oracle_driver=oracle_driver,
                            sql=query_extra_col)
        return join_pyspark_df(df=df,
                               df_extra_cols=df_extra_cols,
                               id_df=table_blob_col_pk)
    return df
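# NOTE: join_pyspark_df is called above but not shown in this section. A minimal sketch of
# what it is assumed to do: left-join the blob DataFrame with the extra-cols DataFrame on
# the primary-key column. The signature and join type are assumptions, not the repo's
# actual implementation.
from pyspark.sql import DataFrame

def join_pyspark_df(df: DataFrame, df_extra_cols: DataFrame, id_df: str) -> DataFrame:
    # keep every blob row, attach extra columns where the primary key matches
    return df.join(df_extra_cols, on=id_df, how='left')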
def get_pandas_df_to_process(oracle_conn_id: str,
                             oracle_conn_blob: str,
                             query_blob: str,
                             table_blob_col_pk: str,
                             table_blob_col_blob: str,
                             extra_cols: str,
                             current_dag_name: str,
                             date: str) -> pd.DataFrame:
    """Pandas counterpart of get_pyspark_df_to_process: read the blob table and,
    when extra columns are configured, join them in."""
    df = OracleHelper(oracle_conn_blob) \
        .get_pandas_df(query_blob)
    df = df[[table_blob_col_pk, table_blob_col_blob]]

    if len(extra_cols) > 0:
        query_extra_col = Variable.get(f'{current_dag_name}_sql_extra_cols_{date}')
        df_extra_cols = OracleHelper(oracle_conn_id) \
            .get_pandas_df(sql=query_extra_col)
        return join_pandas_df(pdf=df,
                              pdf_extra_cols=df_extra_cols,
                              id_df=table_blob_col_pk.upper())
    return df
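# NOTE: join_pandas_df is called above but not shown in this section. A minimal sketch
# under the assumption that it mirrors join_pyspark_df with a pandas merge on the
# primary-key column (signature and join type are assumptions).
import pandas as pd

def join_pandas_df(pdf: pd.DataFrame, pdf_extra_cols: pd.DataFrame, id_df: str) -> pd.DataFrame:
    # keep every blob row, attach extra columns where the primary key matches
    return pdf.merge(pdf_extra_cols, on=id_df, how='left')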
def execute(self, context):
    oracle = OracleHelper(self.oracle_conn_id)
    self.log.info(f"Executing SQL:{self.sql_count_id}\nParameters: {self.dict_bind}")
    count_id = oracle.get_rows_with_bind(sql=self.sql_count_id,
                                         bind=self.dict_bind)[0][0]
    Variable.set(key=f'{self.current_dag_name}_total_row_id', value=count_id)
    self.log.info(f"{count_id} rows are not in HDFS.")
def execute(self, context):
    oracle = OracleHelper(self.oracle_conn_id)
    redis = RedisHook(self.redis_conn_id)
    self.log.info(f"Executing SQL:{self.sql}")

    self.log.info("Extracting data from Oracle")
    conn_redis = redis.get_conn()
    records = oracle.get_rows_with_bind(sql=self.sql, bind=self.dict_bind)

    self.log.info("Inserting rows into Redis")
    pipe = conn_redis.pipeline()
    for row in records:
        pipe.lpush(self.name_redis_key, str(row))
    pipe.execute()
    self.log.info(f"Inserted {len(records)} rows.")
def test_oracle_conn_db_transfers():
    """
    Tests whether the Airflow connection has been created correctly
    and validates that the connection is open.
    """
    assert 1 == OracleHelper('DB_trans') \
        .get_rows('SELECT 1 FROM db_name.table_name FETCH FIRST 1 ROWS ONLY')
def prepare_avro_schema(layer: str,
                        data_name: str,
                        template: str,
                        path_ojdbc: str,
                        path_native_lib: str,
                        doc_type: str,
                        list_dict_cols: list) -> dict:
    """Generate a dynamic Avro schema from the source tables."""
    if doc_type != 'table_test':
        return generate_avro_schema(data_name=data_name,
                                    layer=layer,
                                    list_dict_cols=list_dict_cols)

    # env and context are expected to be available in the enclosing scope
    spark, sc = init_spark(app_name='generate_data_schema',
                           step='generate_data_schema',
                           dag_name=data_name,
                           layer=layer,
                           env=env,
                           path_ojdbc=path_ojdbc,
                           path_native_lib=path_native_lib,
                           executor_cores='4',
                           executor_memory='4g',
                           executor_instances='2',
                           driver_memory='1g')

    logging.info(f'\n{template}\nGetting data from Oracle\n{template}')
    df_oracle_table = OracleHelper(context['oracle_conn_table']) \
        .get_pyspark_df(spark=spark,
                        oracle_driver='oracle.jdbc.driver.OracleDriver',
                        sql=f'SELECT * FROM {data_name}')
    df_preprocessed = preprocess_data_table(df_oracle_table)

    return generate_avro_schema_from_df(dag_name=data_name,
                                        layer=layer,
                                        df=df_preprocessed)
def generate_sql_by_date(self, **context) -> None:
    self.log.info('Generating SQL ...')
    items_by_query = int(context['items_by_query'])

    for date in context['list_current_dates']:
        list_records_unsorted = self.get_list_redis(context['redis_key'] + '_' + date)
        list_records = sorted(list_records_unsorted, key=lambda tup: tup[1])
        # the Variable is expected to hold a Python literal, hence eval
        total_pg_date = eval(
            Variable.get(context['current_dag_name'] + '_' + 'total_pg' + '_' + date))
        self.log.info(f'Getting {context["redis_key"] + "_" + date} in Redis')

        sql_blob = OracleHelper(context['oracle_conn']) \
            .generate_sql_get_data(total_pg_date=total_pg_date,
                                   list_id_by_date=[x[-3] for x in list_records],
                                   items_by_query=items_by_query,
                                   date=date,
                                   table_blob=context['table_blob'],
                                   table_blob_col_pk=context['table_blob_col_pk'],
                                   table_blob_col_blob=context['table_blob_col_blob'])
        Variable.set(key=context['current_dag_name'] + '_' + 'sql_blob' + '_' + date,
                     value=sql_blob)

        if len(context['extra_cols']) > 0:
            self.log.info('Generating extra cols ...')
            sql_id_extra_cols = OracleHelper(context['oracle_conn']) \
                .generate_sql_get_data(total_pg_date=total_pg_date,
                                       list_id_by_date=[x[-3] for x in list_records],
                                       date=date,
                                       items_by_query=items_by_query,
                                       table_ctrl=context['table_ctrl'],
                                       table_ctrl_col_fk=context['table_ctrl_col_fk'],
                                       has_extra_cols=True,
                                       extra_cols=context['extra_cols'])
            Variable.set(key=context['current_dag_name'] + '_' + 'sql_extra_cols' + '_' + date,
                         value=sql_id_extra_cols)
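# NOTE: get_list_redis is called above but not shown in this section. A minimal sketch of
# the assumed read-back counterpart to the Redis-loading operator earlier: fetch every
# entry of the list key and turn the stringified rows back into tuples. The signature and
# the literal-parsing step are assumptions (parsing only works while the stored rows
# contain plain Python literals), not the repo's actual implementation.
import ast

def get_list_redis(self, redis_key: str) -> list:
    conn_redis = RedisHook(self.redis_conn_id).get_conn()
    raw_records = conn_redis.lrange(redis_key, 0, -1)  # all items currently in the list
    return [ast.literal_eval(record.decode()) for record in raw_records]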
def execute(self, context):
    oracle = OracleHelper(self.oracle_conn_id)
    sql = f"SELECT MAX({self.col_control_var}) FROM ({self.sql})"
    self.log.info(f"Executing SQL:\n{sql}")
    self.log.info(f"Parameters:\n{self.dict_bind}")

    # zero-padded to 15 digits: 000.000.000.000.000
    max_value = f"{oracle.get_rows_with_bind(sql=sql, bind=self.dict_bind)[0][0]:015d}"

    Variable.set(key=f'{self.dag_name}_control_var', value=max_value)
    Variable.set(key=f'{self.dag_name}_last_control_var', value=self.control_var)
    self.log.info(f'Updated Airflow variables:\n'
                  f'current_dag_name: {self.current_dag_name}\n'
                  f'last_control_var to: {self.control_var}\n'
                  f'control_var to: {max_value}')
def generate_all_partitions(self,
                            oracle_conn: str,
                            table_ctrl: str,
                            table_ctrl_col_dt_ref: str,
                            agg_by: str,
                            env: str,
                            layer: str,
                            data: str) -> None:
    """
    Generates all partitions in Hive and Impala. Useful when it is necessary
    to recreate or change databases, tables or data directories in HDFS.
    """
    list_all_dates = OracleHelper(oracle_conn).get_all_dates(
        table_ctrl=table_ctrl,
        table_ctrl_col_dt_ref=table_ctrl_col_dt_ref)
    list_all_dates = [dt for dt in list_all_dates if dt is not None]
    list_dates = AirflowMetaStoreHelper().set_granularity(list_all_dates=list_all_dates,
                                                          agg_by=agg_by)

    for date in list_dates:
        hdfs_path = HdfsHelper().generate_hdfs_path(env=env,
                                                    layer=layer,
                                                    dag_id=data,
                                                    date=date)
        self.log.info(f"Creating partition: {date}")
        self.add_partition(date=date,
                           db=data,
                           table=layer,
                           hdfs_path=hdfs_path)
def execute(self, context):
    template = '-' * 79
    start = time.time()
    hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
    oracle = OracleHelper(self.oracle_conn_id)
    spark, sc = init_spark(app_name=f'{self.step}_{self.dag_name}',
                           step=self.step,
                           env=self.env,
                           dag_name=self.dag_name,
                           layer=self.layer,
                           path_ojdbc=self.path_ojdbc,
                           path_spark_avro=self.path_spark_avro,
                           path_native_lib=self.path_native_lib,
                           executor_cores=self.executor_cores,
                           executor_memory=self.executor_memory,
                           executor_instances=self.executor_instances,
                           driver_memory=self.driver_memory)

    avro_schema = hdfs.read_avro_schema(path_avro_schema=self.path_avro_schema,
                                        layer=self.layer,
                                        dag_name=self.dag_name)
    hdfs_path = hdfs.generate_hdfs_path(dag_id=self.dag_name,
                                        env=self.env,
                                        layer=self.layer,
                                        is_partitioned=False)

    self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
    self.log.info(f'query: {self.sql_get_data}\nparameters:\n{self.dict_bind}')
    records = oracle.get_rows_with_bind(sql=self.sql_get_data, bind=self.dict_bind)

    data_config = read_data_config(self.dag_name)
    list_dict_cols = data_config[self.dag_name]['hdfs_data_schema']['raw']['cols']
    df_oracle_table = convert_type_oracle_to_spark(spark=spark,
                                                   records=records,
                                                   list_dict_cols=list_dict_cols)
    df_preprocessed = preprocess_data_table(df_oracle_table)
    df_preprocessed.explain()
    df_preprocessed.printSchema()
    df_preprocessed.show(n=1)

    total_registry = df_oracle_table.count()
    n_partitions = calculate_partitions(total_registry=total_registry,
                                        max_registry_by_avro=int(self.max_registry_by_file))

    # TODO: analyze and test ORC (supports ACID)
    self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
    hdfs.save_pyspark_df(df=df_preprocessed,
                         format='parquet',
                         avro_schema=avro_schema,
                         compress_type=self.compress_type,
                         mode='append',
                         partitions=n_partitions,
                         hdfs_path=hdfs_path)

    self.log.info(f'\n***** REPORT *****\n'
                  f'Location = {hdfs_path}\n'
                  f'Total time = {time.time() - start} sec\n'
                  f'Total rows = {total_registry}\n'
                  f'Total partitions = {df_preprocessed.rdd.getNumPartitions()}')
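# NOTE: calculate_partitions is called above but not defined in this section. A minimal
# sketch of the assumed behaviour: ceiling-divide the row count by the maximum number of
# rows allowed per output file, never returning fewer than one partition (the real helper
# may differ).
import math

def calculate_partitions(total_registry: int, max_registry_by_avro: int) -> int:
    return max(1, math.ceil(total_registry / max_registry_by_avro))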
def execute(self, **context):
    template = '-' * 79
    hdfs = HdfsHelper(hdfs_conn=self.hdfs_conn_id)
    spark, sc = init_spark(app_name=f'sync_data-{self.dag_name}',
                           step='sync',
                           env=self.env,
                           dag_name=self.dag_name,
                           layer=self.layer,
                           path_ojdbc=self.path_ojdbc,
                           path_spark_avro=self.path_spark_avro,
                           path_native_lib=self.path_native_lib,
                           executor_cores=self.executor_cores,
                           executor_memory=self.executor_memory,
                           executor_instances=self.executor_instances,
                           driver_memory=self.driver_memory)

    avro_schema = hdfs.read_avro_schema(path_avro_schema=self.path_avro_schema,
                                        layer=self.layer,
                                        dag_name=self.dag_name)
    hdfs_path = hdfs.generate_hdfs_path(env=self.env,
                                        layer=self.layer,
                                        dag_id=self.dag_name,
                                        is_partitioned=False)

    sql_get_data = f'''
        SELECT {self.col_name_control_var}, {self.col_name_dt_ref}
        FROM {self.db_name}.{self.table_name}
        WHERE TO_DATE(to_char({self.col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
            < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {self.col_name_control_var} ASC
    '''

    self.log.info(f'\n{template}\nGetting data from Oracle\n{template}')
    df_oracle_table = OracleHelper(self.oracle_conn_id) \
        .get_pyspark_df_from_table(oracle_driver=self.oracle_driver,
                                   spark=spark,
                                   table=f'({sql_get_data})',
                                   partition_col=self.col_name_control_var,
                                   n_partitions=250) \
        .orderBy(self.col_name_control_var) \
        .withColumn(self.col_name_control_var, col(self.col_name_control_var).cast(LongType())) \
        .withColumn(self.col_name_dt_ref, col(self.col_name_dt_ref).cast(StringType()))
    total_oracle = df_oracle_table.count()
    self.log.info(f'Total rows from Oracle = {total_oracle}')

    self.log.info(f'\n{template}\nGetting data from HDFS\n{template}')
    hdfs.mv_files(hdfs_src_path=hdfs_path,
                  hdfs_dst_path=f'{hdfs_path}/../.tmp_{self.dag_name}')
    df_hdfs = hdfs \
        .load_pyspark_df(spark=spark,
                         data_format='parquet',
                         path=f'../../{hdfs_path}/../.tmp_{self.dag_name}') \
        .orderBy(self.col_name_control_var) \
        .withColumn(self.col_name_control_var, col(self.col_name_control_var).cast(LongType())) \
        .withColumn(self.col_name_dt_ref, col(self.col_name_dt_ref).cast(StringType()))
    df_hdfs_filtered = df_hdfs.select(col(self.col_name_control_var),
                                      col(self.col_name_dt_ref))
    total_hdfs = df_hdfs_filtered.count()
    self.log.info(f'Total rows from HDFS = {total_hdfs}')

    if total_hdfs > total_oracle:
        self.log.warning(f'\n{template}\nRow counts do not match!\n{template}')
        self.log.warning(f'\nOracle = {total_oracle}'
                         f'\nHDFS = {total_hdfs}')

    self.log.info(f'\n{template}\nExecuting: df_hdfs - df_oracle_table\n{template}')
    df_row_to_delete_hdfs = df_hdfs_filtered.subtract(df_oracle_table)
    list_row_to_delete_hdfs = [row[0] for row in
                               df_row_to_delete_hdfs.select(self.col_name_control_var).collect()]
    self.log.info(f'Total rows to delete = {df_row_to_delete_hdfs.count()}')

    self.log.info(f'\n{template}\nDeleting rows from HDFS\n{template}')
    df = df_hdfs.filter(~df_hdfs[self.col_name_control_var].isin(list_row_to_delete_hdfs))
    total_registry = df.count()
    self.log.info(f'Total rows in new df = {total_registry}')
    df.show(n=1, truncate=False)
    n_files = calculate_partitions(total_registry=total_registry,
                                   max_registry_by_avro=int(self.max_registry_by_file))

    self.log.info(f'\n{template}\nWriting table in HDFS\n{template}')
    hdfs.save_pyspark_df(df=df,
                         format='parquet',
                         avro_schema=avro_schema,
                         compress_type=self.compress_type,
                         mode='overwrite',
                         partitions=n_files,
                         hdfs_path=hdfs_path)

    hdfs.remove_all_files(hdfs_path=f'{hdfs_path}/../.tmp_{self.dag_name}')
    try:
        hdfs.mv_files(hdfs_src_path=f'{hdfs_path}/../.tmp_{self.dag_name}',
                      hdfs_dst_path=hdfs_path)
    except Exception as e:
        self.log.error(e)