def main():
    """Read a CSV file with Spark, count under-40 rows per country, and print the result size.

    Reads the job name and input file from the command line (via
    ``get_cmdline_options``), builds a SparkSession from the project config,
    and runs a filter → select → groupBy → count pipeline. No return value;
    output goes to stdout and the session is stopped before returning.
    """
    print("Starting Spark Read CSV Program")
    options = get_cmdline_options()
    conf = get_spark_app_config(options.job_name)

    print("Create Spark Session")
    spark = SparkSession \
        .builder \
        .config(conf=conf) \
        .getOrCreate()

    # spark.read returns a DataFrameReader — the gateway for reading data
    # into Spark; read_df wraps it for this project.
    df = read_df(spark, filename=options.file_name)

    # Repartition so the transformations below run across two partitions.
    partitioned_df = df.repartition(2)

    # Transformations: keep under-40 rows, project the needed columns,
    # then count records per country.
    cnt_df = partitioned_df.where("Age < 40") \
        .select("Age", "Gender", "Country", "state") \
        .groupBy("Country") \
        .count()

    # collect() pulls the grouped result to the driver; the group-by output
    # is one row per country, so this is small.
    print("Length of Output", len(cnt_df.collect()))
    spark.stop()
import sys

from pyspark import SparkConf
from pyspark.sql import SparkSession

from lib.logger import Log4J
from lib.utils import get_spark_app_config, load_data_file, count_by_country

if __name__ == "__main__":
    # The script takes exactly one argument: the path of the data file.
    # Fail fast with a usage message instead of an IndexError below.
    if len(sys.argv) != 2:
        print("Usage: spark_app <data-file>")
        sys.exit(1)

    conf = get_spark_app_config()
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()

    logger = Log4J(spark)

    sample_df = load_data_file(spark=spark, datafile=sys.argv[1])

    # Approach 1: DataFrame API (ORM-like).
    logger.info("Using Spark Dataframe (like ORM)")
    partitioned_df = sample_df.repartition(2)
    counted_df = count_by_country(df=partitioned_df)
    logger.info(counted_df.collect())

    # Approach 2: Spark SQL over a temp view of the same data.
    logger.info("Using Spark SQL")
    # createOrReplaceTempView returns None — register the view without
    # capturing the (meaningless) return value.
    sample_df.createOrReplaceTempView("sample_tbl")
    counted_df = spark.sql(
        "SELECT Country, COUNT(1) AS count FROM sample_tbl WHERE Age<40 GROUP BY Country"
    )
    # Log the SQL result too, mirroring the DataFrame-API path above
    # (previously computed but never used).
    logger.info(counted_df.collect())
    # NOTE(review): consider spark.stop() here if nothing follows in this file — confirm.