def readFromCsv(self, spark):
        print("Reading from CSV")

        # Build an empty DataFrame up front so df is defined even if the read fails
        schema = StructType([])
        df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)
        print("First SparkContext:")
        print("APP Name : {}".format(spark.sparkContext.appName))
        print("Master : " + spark.sparkContext.master)
        messageLogger = ml.MessageLogger(const.getProjectName(__file__),
                                         "Reading from file.....")
        try:
            messageLogger.logInfo("Reading from CSV file.")
            df = spark.read.csv(const.csv_file_project_1,
                                inferSchema=True,
                                header=True)
            messageLogger.logInfo("File reading finished successfully.")
        except Exception as e:
            messageLogger.logError(
                "Unable to read the file; exception " +
                str(e.__class__) + " occurred.")
        # count() triggers a full Spark job, so compute it once and reuse it
        record_count = df.count()
        if record_count > 0:
            messageLogger.logInfo("Number of records in file: " +
                                  str(record_count))

        # Display Data Frame Results
        #         processedData = pf.ProcessedData()
        #         processedData.processOutput("hellodcddd")
        # df.select('*').show()  # 100, False)

        # Data Frame Filter Statements
        # df.filter(df['eq_site_limit'] == 0).select('*').show()
        # Parentheses are required around each comparison: & binds tighter
        # than == and > in Python
        df.filter((df['eq_site_limit'] == 0)
                  & (df['hu_site_limit'] > 20000)).select('*').show()
def __init__(self, spark, db_instance):
    self.db_instance = db_instance
    self.spark = spark
    self.messageLogger = ml.MessageLogger(const.getProjectName(__file__),
                                          "Logger Started")
    self.messageLogger.logInfo(
        "Processing read and write operations from Database")
Example #3
def readFromCsv(self):
    print("Reading from CSV")
    messageLogger = ml.MessageLogger(const.getProjectName(__file__),
                                     "Reading from file.....")
    readDataFromCsv = dataO.ProcessFileOps(self.spark)
    df = readDataFromCsv.readCSV(const.csv_file_project_1)
    # Compute the count once; count() triggers a full Spark job
    record_count = df.count()
    if record_count > 0:
        messageLogger.logInfo("Number of records in file: " +
                              str(record_count))
    return df
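
ProcessFileOps.readCSV is not shown in these examples; a minimal sketch of what it might look like, assuming it simply wraps spark.read.csv with header and schema inference enabled:

class ProcessFileOps:
    def __init__(self, spark):
        self.spark = spark

    def readCSV(self, file_path):
        # Assumed behavior: delegate to spark.read.csv with headers and
        # schema inference, mirroring the direct read in the first snippet
        return self.spark.read.csv(file_path, inferSchema=True, header=True)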
Example #4
def main():
    spark = SparkSession \
        .builder \
        .master("local[1]") \
        .appName("Spark Data Operations") \
        .config('spark.driver.extraClassPath', r'C:\Users\amitk\Documents\softwares\postgresql-42.2.18.jar') \
        .config("spark.sql.catalogImplementation", "hive") \
        .getOrCreate()
    df, msg = testSparkInstallation(spark, const)

    # Start Logger
    messageLogger = ml.MessageLogger(const.getProjectName(__file__), "Logger Started")
    messageLogger.logInfo("Logger Started")
    messageLogger.logInfo(msg)

    # write temp df into csv, parquet and json
    # test_spark_installation_output = "test_spark_installation_output"
    # processData = dataO.ProcessFileOps(spark)
    # processData.processCSVoutput(df, test_spark_installation_output)
    # processData.processParquetOutput(df, test_spark_installation_output, 2, 'gender')
    # processData.processJsonOutput(df, test_spark_installation_output)

    # File operations with Data Frames

    # import com.dataanalytics.services.project1 as p1
    # project1 = p1.DataFrameOperations(spark)
    # df = project1.readFromCsv()
    # project1.dataFrameExamples(df)
    # project1.writeToDB(df)

    # Database operations

    import com.dataanalytics.services.project2 as p2
    proj2 = p2.DatabaseOps(spark)
    proj2.write_to_db(df)
    # proj2.read_from_db()

    # Assignment: SQL TempViews

    # import com.dataanalytics.services.project3 as p3
    # project3 = p3.ReadCsv(spark, "messageLogger")
    # project3.readFromCsv()

    # Test Printing
    # print(spark.sparkContext.appName)
    # configurations = spark.sparkContext.getConf().getAll()
    # for conf in configurations:
    #     print(conf)

    const.updateJobRunId()
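
The SQL TempViews assignment above is left commented out; a minimal sketch of the pattern it refers to, assuming the goal is to register the DataFrame as a temporary view and query it with Spark SQL (the view name and query are illustrative):

# Register the DataFrame as a SQL temporary view and query it
df.createOrReplaceTempView("insurance_data")
spark.sql("SELECT COUNT(*) AS record_count FROM insurance_data").show()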
Example #5
def __init__(self, spark):
    self.spark = spark
    self.messageLogger = ml.MessageLogger(const.getProjectName(__file__),
                                          "Logger Started")
    self.messageLogger.logInfo(
        "Processing read and write operations from files")