Example #1
import sys

from pyspark.sql import SparkSession

# Assumed module path for the SparkConf helper used below; the import was
# missing from the snippet as shown.
from com.rposam.config.SparkConfiguration import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

if __name__ == "__main__":
    conf = SparkConfiguration.getSparkConf()

    Driver = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("ETL Pipeline using Airflow CSV To Parquet"). \
        getOrCreate()

    logger = Log4j(Driver)

    logger.info("Fetching schema of source file")
    schema = FileSchema.empSchema()

    source = sys.argv[1]
    target = sys.argv[2]

    logger.info("Source is {0} and target is {1}".format(source, target))

    logger.info("Started reading data from sources")

    empDF = Driver.read. \
        format("csv"). \
        schema(schema=schema). \
        option("header", "false"). \

Example #2

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import BooleanType, StringType

# Assumed module path for the SparkConf helper, as in Example #1.
from com.rposam.config.SparkConfiguration import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

# Bind the schema to a name that does not shadow the FileSchema class.
fileSchema = FileSchema()._20000recordsSchema()
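
# The helpers wrapped as UDFs in the main block below are not shown in this
# snippet; minimal sketches, assuming a simple regex email check and an
# M/F normalization (only the names isValidEmail and parseGender come from
# the original code):
import re

def isValidEmail(email):
    # True when the value loosely matches a user@domain.tld pattern.
    return email is not None and re.match(r"^[^@\s]+@[^@\s]+\.[^@\s]+$", email) is not None

def parseGender(gender):
    # Normalize free-form gender strings to "Male"/"Female"/"Unknown".
    if gender is None:
        return "Unknown"
    g = gender.strip().lower()
    if g in ("m", "male"):
        return "Male"
    if g in ("f", "female"):
        return "Female"
    return "Unknown"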

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    # warehouseLocation was unused in the snippet as shown; it is wired into
    # spark.sql.warehouse.dir here so the setting actually takes effect.
    spark = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("Custom UDF for Email and Gender validations and also an LTI Assignment"). \
        config("hive.metastore.uris", "thrift://localhost:9083"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        enableHiveSupport(). \
        config("spark.sql.shuffle.partitions", 10). \
        getOrCreate()

    logger = Log4j(spark)
    logger.info("Reading csv file with dropmalformed mode")
    df = spark.read.format("com.databricks.spark.csv"). \
        option("header", "true"). \
        schema(schema=fileSchema). \
        option("mode", "DROPMALFORMED"). \
        load(r"csv\2000000_records.csv")

    df.printSchema()

    logger.info("Creating custome UDF for email and gender")
    # Wrap the plain-Python helpers as Spark UDFs under names that do not
    # shadow the underlying functions.
    isValidEmailUdf = f.udf(isValidEmail, returnType=BooleanType())
    parseGenderUdf = f.udf(parseGender, returnType=StringType())

    # df.printSchema()
    # df.show()
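
    # A typical next step (truncated in the original): apply the UDFs to flag
    # invalid emails and normalize gender; a sketch assuming the schema
    # contains "email" and "gender" columns (hypothetical names):
    validatedDF = df. \
        withColumn("is_valid_email", isValidEmailUdf(f.col("email"))). \
        withColumn("gender_normalized", parseGenderUdf(f.col("gender")))
    validatedDF.show(10)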