def getSparkDF(self, input_path):
    # Build the SparkSession from the shared configuration helper.
    conf = SparkConfiguration().getSparkConf()
    spark = SparkSession.builder. \
        config(conf=conf). \
        appName("pyspark postgres performance test"). \
        getOrCreate()

    # Read the CSV with an explicit schema; inferSchema has no effect when a schema is supplied.
    schema = FileSchema().empSchema()
    df = spark.read.format("csv").load(input_path,
                                       schema=schema,
                                       inferSchema=False)
    print("No of Partitions: {0}".format(df.rdd.getNumPartitions()))
    return df
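A minimal usage sketch follows, assuming the app name's "postgres performance test" implies a JDBC write of the returned DataFrame. The instance name `loader`, the input path, the connection URL, table name, and credentials are placeholders, not values from the original code:

# Hypothetical usage of getSparkDF; all connection details below are placeholders.
df = loader.getSparkDF("data/emp.csv")  # 'loader' is an instance of the enclosing class
df.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://localhost:5432/testdb") \
    .option("dbtable", "public.emp") \
    .option("user", "postgres") \
    .option("password", "postgres") \
    .option("driver", "org.postgresql.Driver") \
    .mode("overwrite") \
    .save()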
Example #2
from pyspark.sql import SparkSession

from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
import sys
import os

from com.rposam.util.logger import Log4j

if __name__ == "__main__":
    conf = SparkConfiguration.getSparkConf()

    Driver = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("ETL Pipeline using Airflow CSV To Parquet"). \
        getOrCreate()

    logger = Log4j(Driver)

    logger.info("Fetching schema of source file")
    schema = FileSchema.empSchema()

    source = sys.argv[1]
    target = sys.argv[2]

    logger.info("Source is {0} and target is {1}".format(source, target))

    logger.info("Started reading data from sources")

    empDF = Driver.read. \