Example #1
from typing import Any, Optional, Union
from unittest.mock import Mock

from pyspark.sql import DataFrame, SparkSession

from etl_logger import EtlLogger
# Module name assumed; the create_spark_session() helper is not shown here.
from spark_session import create_spark_session


class EtlProcess:
    """ EtlProcess orchestrates the ETL process """

    spark: SparkSession
    extractor: Any  # better: create abstract base class Extractor
    transformer: Any
    loader: Any
    logger: EtlLogger

    def __init__(self, extractor, transformer, loader,
                 spark: Optional[Union[SparkSession, Mock]] = None) -> None:
        """ Initialize the EtlProcess """
        self.logger = EtlLogger()

        # Create the session lazily: a default argument of
        # create_spark_session() would be evaluated once at import time,
        # even when a test injects a Mock instead.
        self.spark = spark if spark is not None else create_spark_session()
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

    def run(self) -> None:
        """ Run the ETL process """
        self.logger.debug('starting run')
        try:
            initial_df: DataFrame = self.extractor.extract(self.spark)

            transformed_df: DataFrame = \
                self.transformer.transform(self.spark, initial_df)

            self.loader.load(self.spark, transformed_df)
        finally:
            self.spark.stop()
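The comment on the extractor field above suggests replacing the Any annotations with abstract base classes. A minimal sketch of what those interfaces could look like, assuming the extract/transform/load method names used by the concrete classes below (the class names Extractor, Transformer, and Loader are illustrative, not part of the original code):

from abc import ABC, abstractmethod

from pyspark.sql import DataFrame, SparkSession


class Extractor(ABC):
    """ Interface for the "extract" step; ExtractorCsv would implement it """

    @abstractmethod
    def extract(self, spark: SparkSession) -> DataFrame: ...


class Transformer(ABC):
    """ Interface for the "transform" step """

    @abstractmethod
    def transform(self, spark: SparkSession, df: DataFrame) -> DataFrame: ...


class Loader(ABC):
    """ Interface for the "load" step """

    @abstractmethod
    def load(self, spark: SparkSession, df: DataFrame) -> None: ...

With these in place, the Any annotations on extractor, transformer, and loader could be tightened to Extractor, Transformer, and Loader.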
Example #2
    def __init__(self, extractor, transformer, loader,
                 spark: Optional[Union[SparkSession, Mock]] = None) -> None:
        """ Initialize the EtlProcess """
        self.logger = EtlLogger()

        self.spark = spark if spark is not None else create_spark_session()
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader
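Injecting spark (instead of creating it inside the constructor) is what makes the process testable without a cluster: create_spark_session() only runs when no session is supplied. A minimal sketch of a unit test built on that, assuming unittest.mock and the method names used in run():

from unittest.mock import Mock

from etl_process import EtlProcess

# Stub out every collaborator; no real SparkSession is created.
mock_spark = Mock()
mock_extractor = Mock()
mock_transformer = Mock()
mock_loader = Mock()

process = EtlProcess(mock_extractor, mock_transformer, mock_loader,
                     spark=mock_spark)
process.run()

# run() should wire the stages together and always stop the session.
mock_extractor.extract.assert_called_once_with(mock_spark)
mock_loader.load.assert_called_once()
mock_spark.stop.assert_called_once()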
Example #3
from typing import Any, ClassVar, Dict, List

from pyspark.sql import DataFrame, SparkSession

from etl_logger import EtlLogger


class LoaderCsv:
    """ Loader implements the "load" process of ETL """
    output_cols: ClassVar[List[str]] = ['Customer ID', 'Total Orders']
    path: str
    logger: EtlLogger

    def __init__(self, config: Dict[str, Any]):
        self.path = config['path']
        self.logger = EtlLogger()

    def load(self, spark: SparkSession, df: DataFrame) -> None:
        self.logger.info(f'Load: {self.path}')

        # Convert Spark DataFrame to Pandas DataFrame, because Pandas can write
        # to a single, plain CSV file instead of writing a distributed HDFS file
        # (for demo purposes only; usually HDFS is the right way to go)
        df.toDF(*LoaderCsv.output_cols) \
          .toPandas() \
          .to_csv(self.path, header=True, index=False)
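As the comment notes, the Pandas round-trip is a demo shortcut to get a single plain CSV file. The Spark-native alternative the comment alludes to would write through the DataFrame writer instead; a minimal sketch (the output path is illustrative):

# Native Spark write: produces a directory of part-files rather than a
# single CSV; coalesce(1) forces one part-file at the cost of funnelling
# all partitions through a single executor.
df.toDF(*LoaderCsv.output_cols) \
  .coalesce(1) \
  .write \
  .mode('overwrite') \
  .csv('hdfs://localhost:9000/user/sutter/data/customer-orders-totals',
       header=True)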
Example #4
from typing import Any, ClassVar, Dict

from pyspark.sql import DataFrame, SparkSession

from etl_logger import EtlLogger


class ExtractorCsv:
    """ Extractor implements the "extract" process of ETL """
    input_schema: ClassVar[str] = \
        '`Customer ID` integer, `Order ID` integer, `Order Total` double'
    path: str
    logger: EtlLogger

    def __init__(self, config: Dict[str, Any]):
        self.path = config['path']
        self.logger = EtlLogger()

    def extract(self, spark: SparkSession) -> DataFrame:
        self.logger.info(f'Extract: {self.path}')

        df: DataFrame = spark.read.csv(self.path,
                                       header=True,
                                       schema=ExtractorCsv.input_schema)
        return df
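input_schema above is a DDL string; the same schema can be declared programmatically with StructType, which spark.read.csv accepts just as readily. An equivalent sketch:

from pyspark.sql.types import (DoubleType, IntegerType, StructField,
                               StructType)

# Programmatic equivalent of the DDL string
# '`Customer ID` integer, `Order ID` integer, `Order Total` double'
input_schema = StructType([
    StructField('Customer ID', IntegerType()),
    StructField('Order ID', IntegerType()),
    StructField('Order Total', DoubleType()),
])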
Example #5
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql import functions as F

from etl_logger import EtlLogger


class TransformerTopFiveCust:
    """ Transformer implements the "transform" process of ETL """
    logger: EtlLogger

    def __init__(self):
        self.logger = EtlLogger()

    def transform(self, spark: SparkSession, df: DataFrame) -> DataFrame:
        """ Apply transformations to a DataFrame """
        self.logger.info('Transform: top 5 customer totals')

        result_limit = 5
        result_df = df.toDF('cust_id', 'order_id', 'amount') \
                      .select('cust_id', 'amount') \
                      .groupBy('cust_id') \
                      .agg(F.sum('amount').alias('total')) \
                      .orderBy('total', ascending=False) \
                      .limit(result_limit)
        return result_df
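transform() never touches its spark parameter, but keeping it in the signature leaves room for implementations that need the session, such as one written in Spark SQL. A sketch of the same top-5 aggregation expressed that way (the view name is illustrative):

from pyspark.sql import DataFrame, SparkSession


def transform_sql(spark: SparkSession, df: DataFrame) -> DataFrame:
    """ Same top-5 customer aggregation, expressed in Spark SQL """
    df.toDF('cust_id', 'order_id', 'amount') \
      .createOrReplaceTempView('orders')
    return spark.sql("""
        SELECT cust_id, SUM(amount) AS total
        FROM orders
        GROUP BY cust_id
        ORDER BY total DESC
        LIMIT 5
    """)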
Example #6
    def __init__(self, config: Dict[str, Any]):
        self.path = config['path']
        self.logger = EtlLogger()
Example #7
    def __init__(self):
        self.logger = EtlLogger()
"""
Driver for an ETL process that extracts a CSV file from HDFS and loads the
result into a local CSV file.
"""

from etl_process import EtlProcess
from extractor import ExtractorCsv
from transformer import TransformerTopFiveCust
from loader import LoaderCsv
from etl_logger import EtlLogger


if __name__ == '__main__':
    logger = EtlLogger()
    logger.info('ETL Process starting')

    extract_file = 'customer-orders.csv'
    # To read a local file instead (requires `from pathlib import Path`):
    # extract_path = f'file://{Path().absolute()}/{extract_file}'
    extract_path = f'hdfs://localhost:9000/user/sutter/data/{extract_file}'  # read from the Hadoop server
    extractor = ExtractorCsv({'path': extract_path})

    transformer = TransformerTopFiveCust()

    load_path = './customer-orders-totals.csv'
    loader = LoaderCsv({'path': load_path})

    etl_process = EtlProcess(extractor, transformer, loader)
    etl_process.run()

    logger.info('ETL Process complete')
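Every class above depends on etl_logger.EtlLogger, which is not shown. A minimal sketch of what it might look like, assuming it is a thin wrapper over the standard logging module (the logger name and format are illustrative):

""" etl_logger.py - thin wrapper over the standard logging module """

import logging


class EtlLogger:
    """ Exposes the debug/info methods used by the ETL classes """

    def __init__(self, name: str = 'etl', level: int = logging.DEBUG):
        self.logger = logging.getLogger(name)
        if not self.logger.handlers:  # avoid adding duplicate handlers
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
            self.logger.addHandler(handler)
            self.logger.setLevel(level)

    def debug(self, msg: str) -> None:
        self.logger.debug(msg)

    def info(self, msg: str) -> None:
        self.logger.info(msg)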