def getSparkDF(self, input_path):
    conf = SparkConfiguration().getSparkConf()
    spark = SparkSession.builder. \
        config(conf=conf). \
        appName("pyspark postgres performance test"). \
        getOrCreate()

    schema = FileSchema().empSchema()
    df = spark.read.format("csv").load(input_path, schema=schema, inferSchema=False)
    print("No of Partitions:{0}".format(df.rdd.getNumPartitions()))
    return df
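# The app name above suggests the returned DataFrame feeds a Postgres load, but the write
# side is not part of this excerpt. The helper below is only a hedged sketch of that step:
# the function name, connection URL, table, and credentials are placeholders, not taken
# from the original project.
def writeToPostgres(df, table):
    df.write. \
        format("jdbc"). \
        option("url", "jdbc:postgresql://localhost:5432/testdb"). \
        option("dbtable", table). \
        option("user", "postgres"). \
        option("password", "postgres"). \
        option("driver", "org.postgresql.Driver"). \
        mode("append"). \
        save()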
import sys

from pyspark.sql import SparkSession

from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    Driver = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("ETL Pipeline using Airflow CSV To Parquet"). \
        getOrCreate()

    logger = Log4j(Driver)

    logger.info("Fetching schema of source file")
    schema = FileSchema().empSchema()

    # Source and target paths are supplied as command-line arguments (e.g. by the Airflow task).
    source = sys.argv[1]
    target = sys.argv[2]
    logger.info("Source is {0} and target is {1}".format(source, target))

    logger.info("Started reading data from sources")
    empDF = Driver.read. \
        format("csv"). \
        schema(schema=schema). \
        option("header", "false"). \
        load(source)
    empDF.show()
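    # Hedged sketch of the likely remaining step (not shown in the excerpt): the job name
    # says "CSV To Parquet", so the DataFrame presumably lands at the target path passed
    # in sys.argv[2]. The save mode and log message here are assumptions.
    logger.info("Writing data to {0} in parquet format".format(target))
    empDF.write. \
        mode("overwrite"). \
        parquet(target)

    Driver.stop()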
    if re.search(regex, str(email)):
        return True
    else:
        return False


def parseGender(gender):
    # Map the many free-form gender spellings onto "M", "F", or "N" (not specified).
    if str(gender) in ["Male", "M", "male", "m", "MALE"]:
        return "M"
    elif str(gender) in ["Female", "F", "f", "female", "feMale", "FEMALE"]:
        return "F"
    else:
        return "N"


FileSchema = FileSchema()._20000recordsSchema()

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()
    warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    spark = SparkSession. \
        builder. \
        config(conf=conf). \
        appName("Custom UDF for Email and Gender validations and also a LTI Assignment"). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", "thrift://localhost:9083"). \
        enableHiveSupport(). \
        config("spark.sql.shuffle.partitions", 10). \
        getOrCreate()

    logger = Log4j(spark)

    logger.info("Reading csv file with dropmalformed mode")
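    # Hedged sketch of what likely follows (not shown in the excerpt): register the helpers
    # above as UDFs and read the 20000-records CSV with DROPMALFORMED mode. The input path,
    # column names ("email", "gender"), and "validateEmail" (a stand-in for the email-check
    # function whose definition is cut off above) are assumptions for illustration.
    from pyspark.sql.functions import col, udf
    from pyspark.sql.types import BooleanType, StringType

    parseGenderUDF = udf(parseGender, StringType())
    validateEmailUDF = udf(validateEmail, BooleanType())

    df = spark.read. \
        schema(FileSchema). \
        option("mode", "DROPMALFORMED"). \
        option("header", "true"). \
        csv("/path/to/20000 Records.csv")

    cleanedDF = df. \
        withColumn("gender", parseGenderUDF(col("gender"))). \
        withColumn("is_valid_email", validateEmailUDF(col("email")))
    cleanedDF.show()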
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, expr
from pyspark.sql.types import StructType, StructField, ArrayType, StringType

from com.rposam.config.SparkConf import SparkConfiguration
from com.rposam.schema.FileSchema import FileSchema
from com.rposam.util.logger import Log4j

schema = FileSchema().randomuserapiSchema()

if __name__ == "__main__":
    conf = SparkConfiguration().getSparkConf()

    # For locally installed Spark and Hadoop:
    # warehouseLocation = "hdfs://localhost:8020/user/hive/warehouse/sparkdb.db"
    # thriftServer = "thrift://localhost:9083"

    # For the itversity cluster:
    warehouseLocation = "hdfs://nn01.itversity.com:8020/user/rposam2021/warehouse/rposam2021_hivedb.db"
    thriftServer = "thrift://gw02.itversity.com:9083"
    os.environ["HADOOP_USER_NAME"] = "rposam2021"

    spark = SparkSession.builder. \
        appName("Read json and write to local installed spark on ubuntu"). \
        config(conf=conf). \
        config("spark.sql.warehouse.dir", warehouseLocation). \
        config("hive.metastore.uris", thriftServer). \
        enableHiveSupport(). \
        getOrCreate()

    logger = Log4j(spark)
    logger.info("Spark session created using enableHiveSupport")

    df = spark.read.schema(schema=schema).option(