from typing import Any, Union
from unittest.mock import Mock

from pyspark.sql import DataFrame, SparkSession

from etl_logger import EtlLogger
# create_spark_session() builds the project's SparkSession; its module is
# not shown in this excerpt.


class EtlProcess:
    """ EtlProcess orchestrates the ETL process """
    spark: SparkSession
    extractor: Any     # better: create abstract base class Extractor
    transformer: Any
    loader: Any
    logger: EtlLogger

    def __init__(self, extractor, transformer, loader,
                 spark: Union[SparkSession, Mock, None] = None) -> None:
        """ Initialize the EtlProcess """
        self.logger = EtlLogger()
        # Create the session lazily: a default of create_spark_session()
        # would be evaluated once at import time, which defeats injecting
        # a Mock session in tests.
        self.spark = spark if spark is not None else create_spark_session()
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

    def run(self) -> None:
        """ Run the ETL process """
        self.logger.debug('starting run')
        try:
            # Call the methods the collaborators actually define
            # (extract/transform/load), so the names line up with the
            # ExtractorCsv, TransformerTopFiveCust, and LoaderCsv classes below.
            initial_df: DataFrame = self.extractor.extract(self.spark)
            transformed_df: DataFrame = \
                self.transformer.transform(self.spark, initial_df)
            self.loader.load(self.spark, transformed_df)
        finally:
            self.spark.stop()
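The `Any` annotations and the inline comment above point at the missing abstraction: the three collaborators share no declared interface. A minimal sketch of what those base classes could look like, using `typing.Protocol` for structural typing (the names `Extractor`, `Transformer`, and `Loader` are assumptions, not part of the original code):

from typing import Protocol

from pyspark.sql import DataFrame, SparkSession


class Extractor(Protocol):
    def extract(self, spark: SparkSession) -> DataFrame: ...


class Transformer(Protocol):
    def transform(self, spark: SparkSession, df: DataFrame) -> DataFrame: ...


class Loader(Protocol):
    def load(self, spark: SparkSession, df: DataFrame) -> None: ...

With these in place, the `Any` annotations in EtlProcess could be tightened to `Extractor`, `Transformer`, and `Loader`; because Protocol matching is structural, the concrete classes below would satisfy them without inheriting from anything.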
from typing import Any, ClassVar, Dict, List

from pyspark.sql import DataFrame, SparkSession

from etl_logger import EtlLogger


class LoaderCsv:
    """ Loader implements the "load" process of ETL """
    output_cols: ClassVar[List[str]] = ['Customer ID', 'Total Orders']
    logger: EtlLogger

    def __init__(self, config: Dict[str, Any]):
        self.path = config['path']
        self.logger = EtlLogger()

    def load(self, spark: SparkSession, df: DataFrame):
        self.logger.info(f'Load: {self.path}')
        # Convert the Spark DataFrame to a Pandas DataFrame, because Pandas
        # can write a single, plain CSV file instead of a distributed HDFS
        # directory of part files (for demo purposes only; at scale the
        # distributed write is usually the right way to go)
        df.toDF(*LoaderCsv.output_cols) \
            .toPandas() \
            .to_csv(self.path, header=True, index=False)
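For contrast with the Pandas shortcut above, here is a sketch of the distributed write that the comment alludes to, framed as a hypothetical extra method on LoaderCsv (the method name load_distributed is an assumption, and coalesce(1) is only there to force a single part file):

    def load_distributed(self, spark: SparkSession, df: DataFrame):
        # Let Spark itself write the CSV: the path becomes a directory of
        # part files. coalesce(1) funnels all rows through one task to get
        # a single part file, at the cost of write parallelism.
        df.toDF(*LoaderCsv.output_cols) \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .csv(self.path, header=True)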
from typing import Any, ClassVar, Dict

from pyspark.sql import DataFrame, SparkSession

from etl_logger import EtlLogger


class ExtractorCsv:
    """ Extractor implements the "extract" process of ETL """
    input_schema: ClassVar[str] = \
        '`Customer ID` integer, `Order ID` integer, `Order Total` double'
    path: str
    logger: EtlLogger

    def __init__(self, config: Dict[str, Any]):
        self.path = config['path']
        self.logger = EtlLogger()

    def extract(self, spark: SparkSession) -> DataFrame:
        self.logger.info(f'Extract: {self.path}')
        df: DataFrame = spark.read.csv(self.path, header=True,
                                       schema=ExtractorCsv.input_schema)
        return df
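The `input_schema` string is Spark's DDL shorthand; supplying a schema up front spares Spark an extra pass over the file to infer column types. For reference, an equivalent explicit schema (a sketch for illustration, not used by the code above) would be:

from pyspark.sql.types import DoubleType, IntegerType, StructField, StructType

input_schema_explicit = StructType([
    StructField('Customer ID', IntegerType(), nullable=True),
    StructField('Order ID', IntegerType(), nullable=True),
    StructField('Order Total', DoubleType(), nullable=True),
])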
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F

from etl_logger import EtlLogger


class TransformerTopFiveCust:
    """ Transformer implements the "transform" process of ETL """
    logger: EtlLogger

    def __init__(self):
        self.logger = EtlLogger()

    def transform(self, spark: SparkSession, df: DataFrame) -> DataFrame:
        """ Apply transformations to a DataFrame """
        self.logger.info('Transform: top 5 customer totals')
        result_limit = 5
        # Use pyspark.sql.functions.sum (here F.sum), not the Python
        # builtin sum, which cannot aggregate a Spark column.
        result_df = df.toDF('cust_id', 'order_id', 'amount') \
            .select('cust_id', 'amount') \
            .groupBy('cust_id') \
            .agg(F.sum('amount').alias('total')) \
            .orderBy('total', ascending=False) \
            .limit(result_limit)
        return result_df
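Because every dependency is injected, the transformer can be tested in isolation with a small local session, which is what the Mock-friendly EtlProcess signature is angling at. A minimal pytest-style sketch (the test name, sample data, and local[1] master are assumptions):

from pyspark.sql import SparkSession

from transformer import TransformerTopFiveCust


def test_transform_returns_top_five():
    spark = SparkSession.builder \
        .master('local[1]') \
        .appName('transformer-test') \
        .getOrCreate()
    try:
        # Seven customers, one order each, amount equal to the customer id.
        rows = [(cust_id, 100 + cust_id, float(cust_id))
                for cust_id in range(1, 8)]
        df = spark.createDataFrame(
            rows, 'cust integer, order_no integer, amount double')
        result = TransformerTopFiveCust().transform(spark, df)
        assert result.count() == 5                # only the top five survive
        assert result.first()['total'] == 7.0     # largest total sorts first
    finally:
        spark.stop()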
""" Driver for an ETL process that extracts a CSV file and loads a CSV file. """ from etl_process import EtlProcess from extractor import ExtractorCsv from transformer import TransformerTopFiveCust from loader import LoaderCsv from etl_logger import EtlLogger if __name__ == '__main__': logger = EtlLogger() logger.info('ETL Process starting') extract_file = 'customer-orders.csv' # path: str = f'file://{Path().absolute()}/{file}' # read local file extract_path = f'hdfs://localhost:9000/user/sutter/data/{extract_file}' # read from Hadoop server extractor = ExtractorCsv({'path': extract_path}) transformer = TransformerTopFiveCust() load_path = './customer-orders-totals.csv' loader = LoaderCsv({'path': load_path}) etl_process = EtlProcess(extractor, transformer, loader) etl_process.run() logger.info('ETL Process complete')