def readFromCsv(self, spark):
    print("Reading from CSV")
    # Start with an empty DataFrame so the method still returns something if the read fails.
    schema = StructType([])
    df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
    print("First SparkContext:")
    print("APP Name : " + spark.sparkContext.appName)
    print("Master : " + spark.sparkContext.master)
    messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Reading from file.....")
    try:
        messageLogger.logInfo("Reading from CSV file.")
        df = spark.read.csv(const.csv_file_project_1, inferSchema=True, header=True)
        messageLogger.logInfo("File reading finished successfully.")
    except Exception as e:
        messageLogger.logError("Unable to read the file; exception " + str(e.__class__) + " occurred.")
    if df.count() > 0:
        messageLogger.logInfo("Number of records in file: " + str(df.count()))

    # Display Data Frame Results
    # processedData = pf.ProcessedData()
    # processedData.processOutput("hellodcddd")
    # df.select('*').show()  # 100, False)

    # Data Frame Filter Statements
    # df.filter(df['eq_site_limit'] == 0).select('*').show()
    # Each comparison must be parenthesised: '&' binds tighter than '==' and '>'.
    df.filter((df['eq_site_limit'] == 0) & (df['hu_site_limit'] > 20000)).select('*').show()
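# Hedged example (not part of the original module): a minimal, self-contained sketch showing the
# parenthesised filter condition used above on an in-memory DataFrame. The column names
# eq_site_limit and hu_site_limit come from the code above; the sample rows are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("filter-demo").getOrCreate()
demo = spark.createDataFrame(
    [(0, 25000.0), (0, 15000.0), (500, 30000.0)],
    ["eq_site_limit", "hu_site_limit"],
)
# Wrapping each comparison in parentheses before combining with '&' avoids the precedence error.
demo.filter((demo["eq_site_limit"] == 0) & (demo["hu_site_limit"] > 20000)).show()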
def __init__(self, spark, db_instance):
    self.db_instance = db_instance
    self.spark = spark
    self.messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Logger Started")
    self.messageLogger.logInfo("Processing read and write operations from Database")
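# Hedged sketch (assumption, not the project's actual code): write_to_db / read_from_db for this
# class are not shown in this section. A minimal version using Spark's standard JDBC reader and
# writer with the PostgreSQL driver configured in main() could look like the following; the URL,
# table name and credentials are placeholders.
def write_to_db(self, df):
    self.messageLogger.logInfo("Writing DataFrame to PostgreSQL")
    (df.write
       .format("jdbc")
       .option("url", "jdbc:postgresql://localhost:5432/analytics")  # placeholder URL
       .option("dbtable", "public.project_data")                     # placeholder table
       .option("user", "spark_user")                                 # placeholder credentials
       .option("password", "change_me")
       .option("driver", "org.postgresql.Driver")
       .mode("overwrite")
       .save())

def read_from_db(self):
    self.messageLogger.logInfo("Reading DataFrame from PostgreSQL")
    return (self.spark.read
            .format("jdbc")
            .option("url", "jdbc:postgresql://localhost:5432/analytics")
            .option("dbtable", "public.project_data")
            .option("user", "spark_user")
            .option("password", "change_me")
            .option("driver", "org.postgresql.Driver")
            .load())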
def readFromCsv(self):
    print("Reading from CSV")
    messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Reading from file.....")
    readDataFromCsv = dataO.ProcessFileOps(self.spark)
    df = readDataFromCsv.readCSV(const.csv_file_project_1)
    if df.count() > 0:
        messageLogger.logInfo("Number of records in file: " + str(df.count()))
    return df
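# Hedged sketch (assumption): ProcessFileOps.readCSV is called above but not shown in this
# section. Based on that call site, a minimal implementation wrapping spark.read.csv could look
# like this; the inferSchema/header options mirror the direct CSV read used elsewhere.
def readCSV(self, file_path):
    self.messageLogger.logInfo("Reading CSV file: " + str(file_path))
    return self.spark.read.csv(file_path, inferSchema=True, header=True)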
def main():
    spark = SparkSession \
        .builder \
        .master("local[1]") \
        .appName("Spark Data Operations") \
        .config('spark.driver.extraClassPath', r'C:\Users\amitk\Documents\softwares\postgresql-42.2.18.jar') \
        .config("spark.sql.catalogImplementation", "hive") \
        .getOrCreate()

    df, msg = testSparkInstallation(spark, const)

    # Start Logger
    messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Logger Started")
    messageLogger.logInfo("Logger Started")
    messageLogger.logInfo(msg)

    # Write temp df into CSV, Parquet and JSON
    # test_spark_installation_output = "test_spark_installation_output"
    # processData = dataO.ProcessFileOps(spark)
    # processData.processCSVoutput(df, test_spark_installation_output)
    # processData.processParquetOutput(df, test_spark_installation_output, 2, 'gender')
    # processData.processJsonOutput(df, test_spark_installation_output)

    # File operations with Data Frames
    # import com.dataanalytics.services.project1 as p1
    # project1 = p1.DataFrameOperations(spark)
    # df = project1.readFromCsv()
    # project1.dataFrameExamples(df)
    # project1.writeToDB(df)

    # Database operations
    import com.dataanalytics.services.project2 as p2
    proj2 = p2.DatabaseOps(spark)
    proj2.write_to_db(df)
    # proj2.read_from_db()

    # Assignment: SQL TempViews
    # import com.dataanalytics.services.project3 as p3
    # project3 = p3.ReadCsv(spark, "messageLogger")
    # project3.readFromCsv()

    ## Test Printing
    # print(spark.sparkContext.appName)
    # configurations = spark.sparkContext.getConf().getAll()
    # for conf in configurations:
    #     print(conf)

    const.updateJobRunId()
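# Hedged sketch (assumption): testSparkInstallation is called by main() but not shown in this
# section. A minimal version that builds a small DataFrame and returns it with a status message
# could look like this; the sample data and message text are illustrative only.
def testSparkInstallation(spark, const):
    df = spark.createDataFrame(
        [("Alice", "F", 34), ("Bob", "M", 41)],
        ["name", "gender", "age"],
    )
    msg = "Spark installation verified; sample DataFrame created with " + str(df.count()) + " rows."
    return df, msg


# Standard module entry point so the script runs main() when executed directly.
if __name__ == "__main__":
    main()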
def __init__(self, spark):
    self.spark = spark
    self.messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Logger Started")
    self.messageLogger.logInfo("Processing read and write operations from files")
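# Hedged sketch (assumption): the ml.MessageLogger used throughout these classes is not shown in
# this section. A minimal stand-in built on Python's standard logging module, matching the
# logInfo/logError calls seen above, could look like this.
import logging

class MessageLogger:
    def __init__(self, project_name, start_message):
        self._logger = logging.getLogger(project_name)
        if not self._logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
            self._logger.addHandler(handler)
        self._logger.setLevel(logging.INFO)
        self._logger.info(start_message)

    def logInfo(self, message):
        self._logger.info(message)

    def logError(self, message):
        self._logger.error(message)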