class Persist:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def __init__(self, spark, file_config):
        self.spark = spark
        self.file_config = file_config

    def persist_data(self, df):
        try:
            logger = logging.getLogger("Persist")
            logger.info('Persisting')
            #config = configparser.ConfigParser()
            #config.read('pipeline/resources/pipeline.ini')
            target_table = self.file_config.get('DB_CONFIGS', 'TARGET_PG_TABLE')
            logger.info('PG Target table is ' + str(target_table))
            #df.coalesce(1).write.option("header", "true").csv("transformed_retailstore")
            df.write \
                .mode("append") \
                .format("jdbc") \
                .option("url", "jdbc:postgresql://localhost:5432/postgres") \
                .option("dbtable", target_table) \
                .option("user", "postgres") \
                .option("password", "admin") \
                .save()
        except Exception as exp:
            logger.error("An error occurred while persisting data > " + str(exp))
            # store in database table
            # send an email notification
            raise

    def insert_into_pg(self):
        connection = psycopg2.connect(user='******', password='******',
                                      host='localhost', database='postgres')
        cursor = connection.cursor()
        insert_query = ("INSERT INTO futurexschema.futurex_course_catalog "
                        "(course_id, course_name, author_name, course_section, creation_date) "
                        "VALUES (%s, %s, %s, %s, %s)")
        insert_tuple = (3, 'Machine Learning', 'FutureX', '{}', '2020-10-20')
        cursor.execute(insert_query, insert_tuple)
        cursor.close()
        connection.commit()
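# --- Editor's addition: a hedged sketch, not part of the original Persist class. ---
# persist_data() above sets the JDBC options one .option() at a time; DataFrameWriter.jdbc()
# expresses the same append-mode write with a properties dict. The url, user and password below
# simply mirror the placeholders already used above; swap in real connection details before use.
def persist_data_with_jdbc_shorthand(spark_df, target_table):
    spark_df.write.jdbc(
        url="jdbc:postgresql://localhost:5432/postgres",
        table=target_table,
        mode="append",
        properties={"user": "postgres", "password": "admin", "driver": "org.postgresql.Driver"},
    )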
class Transform:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def __init__(self, spark, file_config):
        self.spark = spark
        self.file_config = file_config

    def transform_data(self, df):
        logger = logging.getLogger("Transform")
        logger.info("Transforming")
        logger.warning("Warning in Transformer")
        # drop all the rows having null values
        #df1 = df.na.drop()
        df1 = df.na.fill("Unknown", ["author_name"])
        df2 = df1.na.fill("0", ["no_of_reviews"])
        return df2
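# --- Editor's addition: a hedged sketch, not part of the original Transform class. ---
# transform_data() above fills the two columns with two chained na.fill() calls; the same result
# can be produced in a single call by passing a column -> replacement dict, which na.fill()/fillna()
# also accept.
def transform_data_single_fill(df):
    return df.na.fill({"author_name": "Unknown", "no_of_reviews": "0"})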
class Ingest:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def __init__(self, spark, file_config):
        self.spark = spark
        self.file_config = file_config

    def ingest_data(self):
        logger = logging.getLogger("Ingest")
        logger.info('Ingesting from csv')
        #customer_df = self.spark.read.csv("retailstore.csv", header=True)
        course_df = self.spark.sql("select * from fxxcoursedb.fx_course_table")
        logger.info('DataFrame created')
        logger.warning('DataFrame created with warning')
        return course_df

    def read_from_pg(self):
        connection = psycopg2.connect(user='******', password='******',
                                      host='localhost', database='postgres')
        cursor = connection.cursor()
        sql_query = "select * from futurexschema.futurex_course_catalog"
        pdDF = sqlio.read_sql_query(sql_query, connection)
        sparkDf = self.spark.createDataFrame(pdDF)
        sparkDf.show()

    def read_from_pg_using_jdbc_driver(self):
        jdbcDF = self.spark.read \
            .format("jdbc") \
            .option("url", "jdbc:postgresql://localhost:5432/postgres") \
            .option("dbtable", "futurexschema.futurex_course_catalog") \
            .option("user", "postgres") \
            .option("password", "admin") \
            .load()
        jdbcDF.show()
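# --- Editor's addition: a hedged sketch, not part of the original Ingest class. ---
# read_from_pg() above pulls the table through psycopg2/pandas and then converts to a Spark
# DataFrame; when the Postgres JDBC driver is on the classpath (as read_from_pg_using_jdbc_driver
# assumes), the same table can be read directly into Spark, avoiding the driver-side pandas copy.
def read_course_catalog(spark):
    return spark.read.jdbc(
        url="jdbc:postgresql://localhost:5432/postgres",
        table="futurexschema.futurex_course_catalog",
        properties={"user": "postgres", "password": "admin", "driver": "org.postgresql.Driver"},
    )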
class ExplodeMapArraysToRows:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
            arrayData = [
                ('James', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
                ('Michael', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
                ('Robert', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
                ('Washington', None, None),
                ('Jefferson', ['1', '2'], {})
            ]
            df = self.spark.createDataFrame(data=arrayData, schema=['name', 'knownLanguages', 'properties']).cache()
            df.printSchema()
            df.show()

            from pyspark.sql.functions import explode
            df2 = df.select(df.name, explode(df.knownLanguages))
            df2.printSchema()
            df2.show()

            df3 = df.withColumn("ExplodedColumn", explode(df.knownLanguages))
            df3.printSchema()
            df3.show()

            # Exploding both the map and the array
            logging.info("Asmath --> Only one generator allowed per select clause but found 2: explode(knownLanguages), explode(properties);")
            # Error: Only one generator allowed per select clause but found 2: explode(knownLanguages), explode(properties);
            #df5 = df.select(df.name, explode(df.knownLanguages), explode(df.properties))
            df5 = df.withColumn("ExplodedArrayColumn", explode(df.knownLanguages))
            df5.printSchema()  # printSchema is a method: without the parentheses nothing is printed, and no error is raised
            df6 = df5.withColumn("ExplodedMapColumn", explode(df5.properties))  # explode df5 here, not df
            df6.printSchema()
            df6.show()

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
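# --- Editor's addition: a hedged sketch, not part of the original ExplodeMapArraysToRows class. ---
# explode() above drops rows whose array/map is null or empty (e.g. 'Washington' and 'Jefferson'
# lose their map rows). explode_outer() keeps such rows with nulls, and posexplode() also emits the
# element position; both come from pyspark.sql.functions.
from pyspark.sql.functions import explode_outer, posexplode

def explode_keeping_empty_rows(df):
    # one row per language; rows with a null/empty knownLanguages array are kept with a null language
    kept = df.select(df.name, explode_outer(df.knownLanguages).alias("language"))
    # position + value for each array element
    positioned = df.select(df.name, posexplode(df.knownLanguages).alias("pos", "language"))
    return kept, positioned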
class GroupByExamples:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
            simpleData = [("James", "Sales", "NY", 90000, 34, 10000),
                          ("Michael", "Sales", "NY", 86000, 56, 20000),
                          ("Robert", "Sales", "CA", 81000, 30, 23000),
                          ("Maria", "Finance", "CA", 90000, 24, 23000),
                          ("Raman", "Finance", "CA", 99000, 40, 24000),
                          ("Scott", "Finance", "NY", 83000, 36, 19000),
                          ("Jen", "Finance", "NY", 79000, 53, 15000),
                          ("Jeff", "Marketing", "CA", 80000, 25, 18000),
                          ("Kumar", "Marketing", "NY", 91000, 50, 21000)]
            # agg() is used to compute multiple aggregates in one groupBy; the individual
            # functions (sum, min, max, ...) can also be called directly on the grouped data.
            # SQL HAVING is a WHERE on aggregates; in the DataFrame API use .where() after .agg().
            # https://sparkbyexamples.com/pyspark/pyspark-groupby-explained-with-example/
            # Aggregate functions require a GROUP BY when other columns are selected;
            # with no other columns selected, no GROUP BY is needed - this is the default
            # behaviour in Spark, and the other columns are simply not returned.
            # https://stackoverflow.com/questions/6467216/is-it-possible-to-use-aggregate-function-in-a-select-statment-without-using-grou/6467287
            schema = ["employee_name", "department", "state", "salary", "age", "bonus"]
            df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
            df.printSchema()
            df.show(truncate=False)

            # Sum - the grouped aggregate functions take column names as strings, not Column objects
            df.groupby(df.department).sum("salary").alias("sum_salary").show(truncate=False)
            df.groupBy("department").sum("salary").show(truncate=False)
            df.groupBy(F.col("department")).sum("salary").show(truncate=False)

            # Count
            df.groupby(df.department).count().show(truncate=False)
            df.groupBy("department").count().show(truncate=False)
            df.groupBy(F.col("department")).count().show(truncate=False)

            # Min
            df.groupby(df.department).min("salary").show(truncate=False)
            df.groupBy("department").min("salary").show(truncate=False)
            df.groupBy(F.col("department")).min("salary").show(truncate=False)

            # Max
            df.groupby(df.department).max("salary").show(truncate=False)
            df.groupBy("department").max("salary").show(truncate=False)
            df.groupBy(F.col("department")).max("salary").show(truncate=False)

            # the sum/avg/max used inside agg() must be the pyspark functions, not the Python builtins
            from pyspark.sql.functions import sum, avg, max
            df.groupBy("department") \
                .agg(sum("salary").alias("sum_salary"),
                     avg("salary").alias("avg_salary"),
                     sum("bonus").alias("sum_bonus"),
                     max("bonus").alias("max_bonus")) \
                .show(truncate=False)

            logging.info("using only one function inside agg; agg is meant for multiple functions")
            df.groupBy("department") \
                .agg(sum("salary").alias("sum_salary")) \
                .show(truncate=False)

            from pyspark.sql.functions import col
            df.groupBy("department") \
                .agg(sum("salary").alias("sum_salary"),
                     avg("salary").alias("avg_salary"),
                     sum("bonus").alias("sum_bonus"),
                     max("bonus").alias("max_bonus")) \
                .where(col("sum_bonus") >= 50000) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
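# --- Editor's addition: a hedged sketch, not part of the original GroupByExamples class. ---
# The .agg(...).where(...) chain above is the DataFrame form of SQL GROUP BY ... HAVING. The same
# report can be written against a temp view; "employees" is just an illustrative view name.
def group_by_having_sql(spark, df):
    df.createOrReplaceTempView("employees")
    return spark.sql("""
        SELECT department,
               SUM(salary) AS sum_salary,
               AVG(salary) AS avg_salary,
               SUM(bonus)  AS sum_bonus,
               MAX(bonus)  AS max_bonus
        FROM employees
        GROUP BY department
        HAVING SUM(bonus) >= 50000
    """)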
class FilterColumns:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-withcolumn/')
            from pyspark.sql.types import StructType, StructField, StringType, ArrayType
            from pyspark.sql.functions import array_contains
            arrayStructureData = [
                (("James", "", "Smith"), ["Java", "Scala", "C++"], "OH", "M"),
                (("Anna", "Rose", ""), ["Spark", "Java", "C++"], "NY", "F"),
                (("Julia", "", "Williams"), ["CSharp", "VB"], "OH", "F"),
                (("Maria", "Anne", "Jones"), ["CSharp", "VB"], "NY", "M"),
                (("Jen", "Mary", "Brown"), ["CSharp", "VB"], "NY", "M"),
                (("Mike", "Mary", "Williams"), ["Python", "VB"], "OH", "M")
            ]
            arrayStructureSchema = StructType([
                StructField('name', StructType([
                    StructField('firstname', StringType(), True),
                    StructField('middlename', StringType(), True),
                    StructField('lastname', StringType(), True)
                ])),
                StructField('languages', ArrayType(StringType()), True),
                StructField('state', StringType(), True),
                StructField('gender', StringType(), True)
            ])
            df = self.spark.createDataFrame(data=arrayStructureData, schema=arrayStructureSchema)
            df.printSchema()
            df.show(truncate=False)

            # filter dataframe where state = OH
            df.filter(df.state == "OH").show(truncate=False)
            df.filter(F.col("state") == "OH").show(truncate=False)

            # Multiple conditions: each comparison must be wrapped in parentheses,
            # because & and | bind tighter than ==
            df.filter((df.state == "OH") & (df.gender == "M")).show(truncate=False)
            df.filter((F.col("state") == "OH") & (F.col("gender") == "M")).show(truncate=False)

            # Filter on an array column with array_contains
            df.filter(array_contains(df.languages, "Java") & (df.state == "OH") & (df.gender == "M")) \
                .show(truncate=False)
            df.filter((array_contains(F.col("languages"), "Java")) & (F.col("state") == "OH") & (F.col("gender") == "M")).show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
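# --- Editor's addition: a hedged sketch, not part of the original FilterColumns class. ---
# Besides top-level columns and array_contains(), filter() also works on nested struct fields
# (dot notation) and on value lists via isin(); shown here against the same name/state schema.
from pyspark.sql import functions as F

def filter_nested_and_isin(df):
    by_lastname = df.filter(df.name.lastname == "Williams")   # nested struct field
    by_states = df.filter(F.col("state").isin("OH", "NY"))    # membership test
    return by_lastname, by_states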
class AggregateFunctions:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark/pyspark-aggregate-functions/")
            # check collect_list and collect_set
            # collect_set() returns all values from an input column with duplicate values eliminated
            # collect_list() returns all values from an input column, duplicates included
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
            simpleData = [("James", "Sales", 3000), ("Michael", "Sales", 4600),
                          ("Robert", "Sales", 4100), ("Maria", "Finance", 3000),
                          ("James", "Sales", 3000), ("Scott", "Finance", 3300),
                          ("Jen", "Finance", 3900), ("Jeff", "Marketing", 3000),
                          ("Kumar", "Marketing", 2000), ("Saif", "Sales", 4100)]
            schema = ["employee_name", "department", "salary"]
            df = self.spark.createDataFrame(data=simpleData, schema=schema).cache()
            df.show(truncate=False)

            from pyspark.sql.functions import approx_count_distinct, collect_list
            from pyspark.sql.functions import collect_set, sum, avg, max, countDistinct, count
            from pyspark.sql.functions import first, last, kurtosis, min, mean, skewness
            from pyspark.sql.functions import stddev, stddev_samp, stddev_pop, sumDistinct
            from pyspark.sql.functions import variance, var_samp, var_pop

            df.printSchema()
            df.show(truncate=False)

            print("approx_count_distinct: " +
                  str(df.select(approx_count_distinct("salary")).collect()[0][0]))
            print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

            df.select(collect_list("salary")).show(truncate=False)
            df.select(collect_set("salary")).show(truncate=False)

            df2 = df.select(countDistinct("department", "salary"))
            df2.show(truncate=False)
            print("Distinct Count of Department & Salary: " + str(df2.collect()[0][0]))
            print("count: " + str(df.select(count("salary")).collect()[0][0]))

            dffirst = df.select(first("salary"))
            dffirst.show(truncate=False)
            df.select(last("salary")).show(truncate=False)
            df.select(kurtosis("salary")).show(truncate=False)
            df.select(max("salary")).show(truncate=False)
            df.select(min("salary")).show(truncate=False)
            df.select(mean("salary")).show(truncate=False)
            df.select(skewness("salary")).show(truncate=False)
            df.select(stddev("salary"), stddev_samp("salary"),
                      stddev_pop("salary")).show(truncate=False)
            df.select(sum("salary")).show(truncate=False)
            df.select(sumDistinct("salary")).show(truncate=False)
            df.select(variance("salary"), var_samp("salary"), var_pop("salary")) \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
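# --- Editor's addition: a hedged sketch, not part of the original AggregateFunctions class. ---
# The comments above note the one difference between the two collectors: collect_list() keeps
# duplicates, collect_set() removes them. On the salary column used above, 3000 and 4100 appear
# more than once, so the list is longer than the set.
from pyspark.sql.functions import collect_list, collect_set

def compare_collectors(df):
    return df.select(
        collect_list("salary").alias("all_salaries"),       # duplicates kept
        collect_set("salary").alias("distinct_salaries"),   # duplicates removed, order not defined
    )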
class HandleNulls:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info("Asmath --> Easy usage is to follow the json format while replacing values")
            data = [("James", "", "Smith", "36636", "M", 60000),
                    ("Michael", "Rose", "", "40288", "M", 70000),
                    ("Robert", "", "Williams", "42114", "", 400000),
                    ("Maria", "Anne", "Jones", "39192", "F", 500000),
                    ("Jen", "Mary", "Brown", "", "F", 0)]
            columns = ["first_name", "middle_name", "last_name", "dob", "gender", "salary"]
            df = self.spark.createDataFrame(data=data, schema=columns)
            df.printSchema()
            df.show(truncate=False)

            # Using when / otherwise
            from pyspark.sql.functions import col, when
            df2 = df.withColumn("new_gender",
                                when(col("gender") == "M", "Male")
                                .when(col("gender") == "F", "Female")
                                .otherwise("Unknown"))
            df2.show(truncate=False)

            df.select(col("*"),
                      when(col("gender") == "M", "Male")
                      .when(col("gender") == "F", "Female")
                      .otherwise("Unknown").alias("new_gender")).show(truncate=False)

            # Using case when inside expr()
            from pyspark.sql.functions import expr
            df3 = df.withColumn("new_gender",
                                expr("case when gender = 'M' then 'Male' " +
                                     "when gender = 'F' then 'Female' " +
                                     "else 'Unknown' end"))
            df3.show(truncate=False)

            # Using case when with select
            df4 = df.select(col("*"),
                            expr("case when gender = 'M' then 'Male' " +
                                 "when gender = 'F' then 'Female' " +
                                 "else 'Unknown' end").alias("new_gender"))
            df4.show(truncate=False)

            data2 = [(66, "a", "4"), (67, "a", "0"), (70, "b", "4"), (71, "d", "4")]
            df5 = self.spark.createDataFrame(data=data2, schema=["id", "code", "amt"])
            # each comparison must be parenthesized: & and | bind tighter than ==
            df5.withColumn("new_column",
                           when((col("code") == "a") | (col("code") == "d"), "A")
                           .when((col("code") == "b") & (col("amt") == "4"), "B")
                           .otherwise("A1")).show()

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
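# --- Editor's addition: a hedged sketch, not part of the original HandleNulls class. ---
# The last example above wraps every comparison in parentheses; in Python, & and | bind tighter
# than ==, so col("code") == "a" | col("code") == "d" would be parsed as a chained comparison and
# fail at runtime. A standalone version of the corrected pattern:
from pyspark.sql.functions import col, when

def tag_codes(df5):
    return df5.withColumn(
        "new_column",
        when((col("code") == "a") | (col("code") == "d"), "A")
        .when((col("code") == "b") & (col("amt") == "4"), "B")
        .otherwise("A1"),
    )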
class CaseWhenOtherWise:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info("Asmath --> Easy usage is to follow the json format while replacing values")
            filePath = str(get_project_root()) + "/resources/data/small_zipcode.csv"
            df = self.spark.read.options(header='true', inferSchema='true') \
                .csv(filePath)

            logging.info("Drop nulls using df.na.drop()")
            df.na.drop().show(truncate=False)

            logging.info("Asmath --> Easy usage is to follow the json format while replacing values; the second way is to pass the column list and then the value")
            df.fillna({"city": "unknown", "type": ""}) \
                .show()

            df.printSchema()
            df.show(truncate=False)

            df.fillna(value=0).show()
            df.fillna(value=0, subset=["population"]).show()
            df.na.fill(value=0).show()
            df.na.fill(value=0, subset=["population"]).show()
            df.fillna(value="").show()
            df.na.fill(value="").show()
            df.fillna("unknown", ["city"]) \
                .fillna("", ["type"]).show()
            df.fillna({"city": "unknown", "type": ""}) \
                .show()
            df.na.fill("unknown", ["city"]) \
                .na.fill("", ["type"]).show()
            df.na.fill({"city": "unknown", "type": ""}) \
                .show()

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
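# --- Editor's addition: a hedged sketch, not part of the original class. ---
# df.na.drop() above removes a row if any column is null; the how/thresh/subset parameters give
# finer control and complement the fillna()/na.fill() variants shown.
def drop_null_variants(df):
    all_null_rows_removed = df.na.drop(how="all")           # drop only rows where every column is null
    missing_population = df.na.drop(subset=["population"])  # drop rows where population is null
    at_least_two_non_null = df.na.drop(thresh=2)            # keep rows with at least 2 non-null values
    return all_null_rows_removed, missing_population, at_least_two_non_null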
class SortOrderBy:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-orderby-and-sort-explained/')
            simpleData = [("James", "Sales", "NY", 90000, 34, 10000),
                          ("Michael", "Sales", "NY", 86000, 56, 20000),
                          ("Robert", "Sales", "CA", 81000, 30, 23000),
                          ("Maria", "Finance", "CA", 90000, 24, 23000),
                          ("Raman", "Finance", "CA", 99000, 40, 24000),
                          ("Scott", "Finance", "NY", 83000, 36, 19000),
                          ("Jen", "Finance", "NY", 79000, 53, 15000),
                          ("Jeff", "Marketing", "CA", 80000, 25, 18000),
                          ("Kumar", "Marketing", "NY", 91000, 50, 21000)]
            # SQL SORT BY sorts within each partition, so global order is not guaranteed;
            # ORDER BY guarantees a total ordering of the result.
            columns = ["employee_name", "department", "state", "salary", "age", "bonus"]
            df = self.spark.createDataFrame(data=simpleData, schema=columns)
            df.printSchema()
            df.show(truncate=False)

            # Default is ascending order
            df.sort(df.department.asc(), df.state.asc()).show(truncate=False)
            df.sort("department", "state").show(truncate=False)
            df.sort(F.col("department").asc(), F.col("state").desc()).show(truncate=False)
            df.orderBy(df.department.asc(), df.state.desc()).show(truncate=False)
            df.orderBy("department", "state").show(truncate=False)
            df.orderBy(F.col("department").asc(), F.col("state").desc()).show(truncate=False)

            # Nulls-first / nulls-last variants of ascending and descending order
            df.sort(df.department.asc_nulls_first(), df.state.desc_nulls_first()).show(truncate=False)
            df.sort("department", "state").show(truncate=False)
            df.sort(F.col("department").asc_nulls_last(), F.col("state").desc_nulls_last()).show(truncate=False)
            df.orderBy(df.department.asc(), df.state.desc()).show(truncate=False)
            df.orderBy("department", "state").show(truncate=False)
            df.orderBy(F.col("department").asc(), F.col("state").desc()).show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
class Pipeline:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info('run_pipeline method started')
            ingest_process = ingest.Ingest(self.spark, self.file_config)
            #ingest_process.read_from_pg()
            #ingest_process.read_from_pg_using_jdbc_driver()
            df = ingest_process.ingest_data()
            df.show()

            transform_process = transform.Transform(self.spark, self.file_config)
            transformed_df = transform_process.transform_data(df)
            transformed_df.show()

            persist_process = persist.Persist(self.spark, self.file_config)
            #persist_process.insert_into_pg()
            persist_process.persist_data(transformed_df)
            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", str(get_project_root()) + "/resources/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                # '-h' is not in the getopt option string, so this branch is effectively unreachable
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
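# --- Editor's addition: an assumed usage sketch, not shown in the original source. ---
# A typical entry point for the Pipeline class above would parse the -c argument, build the
# session, create the demo Hive table and run the pipeline. This wiring is an assumption for
# illustration; the repo's actual launcher may differ.
if __name__ == '__main__':
    pipeline = Pipeline()
    pipeline.verifyUsage(sys.argv[1:])
    pipeline.create_spark_session()
    pipeline.create_hive_table()
    pipeline.run_pipeline()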
class UpdateColumns:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://sparkbyexamples.com/pyspark-tutorial/")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-withcolumn/')
            data = [('James', '', 'Smith', '1991-04-01', 'M', 3000),
                    ('Michael', 'Rose', '', '2000-05-19', 'M', 4000),
                    ('Robert', '', 'Williams', '1978-09-05', 'M', 4000),
                    ('Maria', 'Anne', 'Jones', '1967-12-01', 'F', 4000),
                    ('Jen', 'Mary', 'Brown', '1980-02-17', 'F', -1)]
            columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]
            df = self.spark.createDataFrame(data=data, schema=columns).cache()

            # 1. Update column values and change the data type
            df2 = df.withColumn("salary", (F.col("salary") * 2).cast("Integer"))
            df2.printSchema()

            # 2. Update the value of an existing column
            df3 = df.withColumn("salary", F.col("salary") * 100)
            df3.printSchema()

            # 3. Create a new column from an existing one
            df4 = df.withColumn("CopiedColumn", F.col("salary") * -1)
            df4.printSchema()

            # 4. Add a new constant column using lit
            df5 = df.withColumn("Country", F.lit("USA"))
            df5.printSchema()
            df5.show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
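# --- Editor's addition: a hedged sketch, not part of the original UpdateColumns class. ---
# withColumn() above updates or adds columns; the two operations that usually round out these
# examples are renaming and dropping, shown here on the same schema.
def rename_and_drop(df):
    renamed = df.withColumnRenamed("gender", "sex")   # rename an existing column
    trimmed = df.drop("salary")                       # remove a column entirely
    return renamed, trimmed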
class Joins:
    logging.config.fileConfig(str(get_project_root()) + "/resources/configs/logging.conf")

    def run_pipeline(self):
        try:
            logging.info("https://github.com/khajaasmath786/pyspark-examples/blob/master/pyspark-join.py")
            logging.info('run_pipeline method started --> https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/')
            emp = [(1, "Smith", -1, "2018", "10", "M", 3000),
                   (2, "Rose", 1, "2010", "20", "M", 4000),
                   (3, "Williams", 1, "2010", "10", "M", 1000),
                   (4, "Jones", 2, "2005", "10", "F", 2000),
                   (5, "Brown", 2, "2010", "40", "", -1),
                   (6, "Brown", 2, "2010", "50", "", -1)]
            empColumns = ["emp_id", "name", "superior_emp_id", "year_joined",
                          "emp_dept_id", "gender", "salary"]
            empDF = self.spark.createDataFrame(data=emp, schema=empColumns)
            empDF.printSchema()
            empDF.show(truncate=False)

            from pyspark.sql.functions import col
            dept = [("Finance", 10),
                    ("Marketing", 20),
                    ("Sales", 30),
                    ("IT", 40)]
            deptColumns = ["dept_name", "dept_id"]
            deptDF = self.spark.createDataFrame(data=dept, schema=deptColumns)
            deptDF.printSchema()
            deptDF.show(truncate=False)

            # self-join: each employee matched with their superior
            df5 = empDF.alias("emp1").join(empDF.alias("emp2"),
                                           col("emp1.superior_emp_id") == col("emp2.emp_id"), "inner")
            df5.printSchema()
            empDF.alias("emp1").join(empDF.alias("emp2"),
                                     col("emp1.superior_emp_id") == col("emp2.emp_id"), "inner") \
                .select(col("emp1.emp_id"), col("emp1.name"),
                        col("emp2.emp_id").alias("superior_emp_id"),
                        col("emp2.name").alias("superior_emp_name")) \
                .show(truncate=False)

            # SQL equivalents on temp views
            empDF.createOrReplaceTempView("EMP")
            deptDF.createOrReplaceTempView("DEPT")
            self.spark.sql("select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id") \
                .show(truncate=False)
            self.spark.sql("select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id") \
                .show(truncate=False)

            logging.info('run_pipeline method ended')
        except Exception as exp:
            logging.error("An error occurred while running the pipeline > " + str(exp))
            # send email notification
            # log error to database
            sys.exit(1)
        return

    def create_spark_session(self):
        app_name = self.file_config.get('APP_CONFIGS', 'APP_NAME')
        self.spark = SparkSession.builder \
            .appName(str(app_name)) \
            .config("spark.driver.extraClassPath", "pipeline/postgresql-42.2.18.jar") \
            .enableHiveSupport().getOrCreate()

    def create_hive_table(self):
        self.spark.sql("create database if not exists fxxcoursedb")
        self.spark.sql("create table if not exists fxxcoursedb.fx_course_table (course_id string,course_name string,author_name string,no_of_reviews string)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (1,'Java','FutureX',45)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (2,'Java','FutureXSkill',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (3,'Big Data','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (4,'Linux','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (5,'Microservices','Future',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (6,'CMS','',100)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (7,'Python','FutureX','')")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (8,'CMS','Future',56)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (9,'Dot Net','FutureXSkill',34)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (10,'Ansible','FutureX',123)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (11,'Jenkins','Future',32)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (12,'Chef','FutureX',121)")
        self.spark.sql("insert into fxxcoursedb.fx_course_table VALUES (13,'Go Lang','',105)")
        # Treat empty strings as null
        self.spark.sql("alter table fxxcoursedb.fx_course_table set tblproperties('serialization.null.format'='')")

    def verifyUsage(self, arguments):
        self.config_file = ''
        self.file_config = None
        try:
            opts, args = getopt.getopt(arguments, "c:")
        except getopt.GetoptError:
            logging.error('test.py -c <inputfile>')
            sys.exit(2)
        for opt, arg in opts:
            if opt not in ("-c",):
                logging.error('test.py -c <configfile>')
                sys.exit()
            elif opt == '-h':
                logging.info('test.py -c <configfile>')
            elif opt in ("-c",):
                self.config_file = arg
                self.file_config = configparser.ConfigParser()
                self.file_config.read(str(get_project_root()) + "/resources/pipeline.ini")
                logging.info('Input file is ' + str(self.config_file))
                logging.info('file config is ' + str(self.file_config))
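# --- Editor's addition: a hedged sketch, not part of the original Joins class. ---
# The examples above only use the inner join form; the same emp_dept_id == dept_id condition works
# for the other join types. dept_id 30 ("Sales") has no employees and emp_dept_id "50" has no
# department, so the left/right/full variants differ in which of those rows survive.
def other_join_types(empDF, deptDF):
    cond = empDF.emp_dept_id == deptDF.dept_id
    left = empDF.join(deptDF, cond, "left")       # keep all employees
    right = empDF.join(deptDF, cond, "right")     # keep all departments
    full = empDF.join(deptDF, cond, "full")       # keep both sides
    anti = empDF.join(deptDF, cond, "leftanti")   # employees with no matching department
    return left, right, full, anti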