Example #1
def test_file_path(spark_session):
    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)

    json_file = "C://Datasetprocessing//tests//test1.json"

    with open(json_file) as jfile:
        df_dict = json.load(jfile)

    actual_filepath = dataframeconfig.get_source_driverFilerPath(df_dict)

    expected_filePath = "C:\\Test\\Paid_Parking.csv"

    assert actual_filepath == expected_filePath
Example #2
def test_dataframe_partition(spark_session):

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)

    json_file = "C://Datasetprocessing//tests//test1.json"

    with open(json_file) as jfile:
        df_dict = json.load(jfile)

    actual_partition = dataframeconfig.partition_column(df_dict)

    expected_partition = "MONTH"

    assert actual_partition == expected_partition
Example #3
def test_historic_read_occupancy(spark_session):
    spark_session.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    occ_pr = OccupancyProcessing(spark_session)
    dataframeconfig = DataframeConfig(spark_session)
    gen_logs = GenerateLogs(spark_session)

    occ_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")

    occfilePath = dataframeconfig.get_source_driverFilerPath(occ_config_dict)

    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_historic_dataframe_schema(
        occ_config_dict)
    import glob

    file_names = glob.glob(occfilePath)

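    # Take the year prefix from each file path and only read files from 2014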
    for file in file_names:
        year = file.split("\\")[3][:4]
        if year == "2014":  # the slice is a string, so compare against a string
            (occupancy, source_data_info) = occ_pr.sourceOccupancyReadParquet(
                occfilePath, TargetDataframeSchema, "MONTH")
            occupancy.printSchema()
            occupancy.head(4)
            break

    # Removing unwanted files
    if os.path.isfile("./tests/._SUCCESS.crc") and os.path.isfile(
            "./tests/_SUCCESS"):
        os.remove("./tests/._SUCCESS.crc")
        os.remove("./tests/_SUCCESS")
Example #4
    def parse_config(self, caller_function, filename, option_char="="):
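        # Parses "option = value" lines from a config file, skipping blank lines and '#' comments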
        ReturnCode = 0
        OPTION_CHAR = option_char
        options = {}
        param_list = caller_function + "\n"

        f = open(filename)

        for line in f:
            # Ignore Empty lines
            if not line.strip():
                continue
            # First, remove comments:
            if COMMENT_CHAR in line:
                # if first char is '#' on the line, skip
                strip_line = line.strip()
                if strip_line[0] == "#":
                    continue
                # split on the comment char and keep only the part before it
                line, comment = line.split(COMMENT_CHAR, 1)
                line += "\n"

            # Second, find lines with an option = value
            if OPTION_CHAR in line:
                param_list += "{}".format(line)
                # split on the option char
                option, value = line.split(OPTION_CHAR, 1)
                # strip spaces:

                option = option.strip()
                value = value.strip()

                value = self.remove_whitespace(value)
                options[option] = value

            else:
                GenerateLogs.log_error(
                    SCRIPT_NAME,
                    "ERROR: WRONG PARAMETER ASSIGNMENT ON LINE: {}".format(
                        line.strip()), 1)
                ReturnCode = 1
                break

        f.close()
        GenerateLogs.log_info(SCRIPT_NAME, param_list)
        return options, ReturnCode
Example #5
def test_column_list(spark_session):

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)

    json_file = "C://Datasetprocessing//tests//test1.json"

    with open(json_file) as jfile:
        df_dict = json.load(jfile)

    actual_list = dataframeconfig.build_dataframe_column_list(df_dict)

    expected_list = ["occupancydatetime", "occupied_spots"]
    print(actual_list)

    assert actual_list[0] == expected_list[0]
    assert actual_list[1] == expected_list[1]
Example #6
    def sourceOccupancyReadParquet(self, occupancyFilePath, custom_schema,
                                   partition_value):
        gen_logs = GenerateLogs(self.spark)
        gen_logs.log_info(SCRIPT_NAME, "Reading Occupancy CSV file...")
        print("Reading Occupancy CSV file")

        source_data_info = {}
        source_data_info["type"] = "CSV"

        # filepath = source_config['sources']['driverSource']["filePath"]
        print("Occupancy file path : {}".format(occupancyFilePath))

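        # Read the occupancy CSV with a header row, enforcing the supplied schema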
        try:
            occupancy = (self.spark.read.format("csv").option(
                "header", True).schema(custom_schema).load(occupancyFilePath))

        except Exception as e:
            gen_logs.log_info(SCRIPT_NAME,
                              "error in reading csv: {}".format(e))
            raise  # re-raise so `occupancy` is never referenced while unbound

        source_data_info["occupancyFilePath"] = occupancyFilePath
        source_data_info["partition"] = str(partition_value)

        occupancy.show(3)

        return (occupancy, source_data_info)
Example #7
def test_read_occupancy(spark_session):
    spark_session.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    occ_pr = OccupancyProcessing(spark_session)
    dataframeconfig = DataframeConfig(spark_session)
    gen_logs = GenerateLogs(spark_session)

    occ_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")

    occfilePath = dataframeconfig.get_source_driverFilerPath(occ_config_dict)

    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        occ_config_dict)

    (occupancy, source_data_info) = occ_pr.sourceOccupancyReadParquet(
        occfilePath, TargetDataframeSchema, "MONTH")

    occupancy.printSchema()

    occupancy.head(4)
Example #8
    def sourceBlockfaceReadParquet(self, blockfacefilePath, cust_schema):
        gen_logs = GenerateLogs(self.spark)
        gen_logs.log_info(SCRIPT_NAME, "Reading CSV file...")
        print("Reading CSV file")

        source_data_info = {}
        source_data_info["type"] = "CSV"

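        # Read the blockface CSV with a header row, enforcing the supplied schema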
        try:
            blockface = self.spark.read.format("csv").option("header", True).schema(cust_schema).load(blockfacefilePath)

        except Exception as e:
            gen_logs.log_info(SCRIPT_NAME, "error in reading csv: {}".format(e))
            raise  # re-raise so `blockface` is never referenced while unbound

        source_data_info["blockfacefilePath"] = blockfacefilePath

        return (blockface, source_data_info)
Example #9
    def executeBlockfaceOperations(self, src_df, output, cols_list, max_retry_count, retry_delay):
        gen_logs = GenerateLogs(self.spark)

        gen_logs.log_info(SCRIPT_NAME, "Starting the Blockface Execute Operations")

        # src_df.printSchema()

        ReturnCode = 0
        rec_cnt = 0
        RetryCt = 0
        Success = False

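        # Retry loop: pick up the source dataframe, retrying up to max_retry_count times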
        while (RetryCt < max_retry_count) and not Success:

            try:
                Success = True
                input_df = src_df

            except:
                Success = False
                RetryCt += 1
                if RetryCt == max_retry_count:
                    gen_logs.log_info(
                        SCRIPT_NAME, "Failed on reading input file after {} tries".format(max_retry_count)
                    )
                    ReturnCode = 1
                    return ReturnCode, rec_cnt

                else:
                    gen_logs.log_info(
                        SCRIPT_NAME, "Failed on reading input file, re-try in {} seconds ".format(retry_delay)
                    )
                    time.sleep(retry_delay)

        select_df = input_df.select([colname for colname in input_df.columns if colname in (cols_list)])

        select_df = (
            select_df.withColumn("wkd_start1", self.format_minstoHHMMSS(F.col("wkd_start1")))
            .withColumn("wkd_end1", self.format_minstoHHMMSS(F.col("wkd_end1")))
            .withColumn("wkd_start2", self.format_minstoHHMMSS(F.col("wkd_start2")))
            .withColumn("wkd_end2", self.format_minstoHHMMSS(F.col("wkd_end2")))
            .withColumn("wkd_end3", self.format_minstoHHMMSS(F.col("wkd_end3")))
            .withColumn("sat_start1", self.format_minstoHHMMSS(F.col("sat_start1")))
            .withColumn("sat_end1", self.format_minstoHHMMSS(F.col("sat_end1")))
            .withColumn("sat_start2", self.format_minstoHHMMSS(F.col("sat_start2")))
            .withColumn("sat_end2", self.format_minstoHHMMSS(F.col("sat_end2")))
            .withColumn("sat_start3", self.format_minstoHHMMSS(F.col("sat_start3")))
            .withColumn("sat_end3", self.format_minstoHHMMSS(F.col("sat_end3")))
        )

        # miscProcess.log_print("Writing to output file: {}".format(output))

        select_df = select_df.select([colname for colname in input_df.columns if colname in (cols_list)])

        RetryCt = 0
        Success = False

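        # Retry loop: write the selected columns to a single Parquet file under the output path, retrying on failure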
        while (RetryCt < max_retry_count) and not Success:
            try:
                Success = True
                gen_logs.log_info(SCRIPT_NAME, "Writing to Parquet file")
                select_df.show(3)
                print("Output file {}".format(output))
                select_df.coalesce(1).write.mode("overwrite").parquet(output + "//Blockface.parquet")
            except:
                Success = False
                RetryCt += 1
                if RetryCt == max_retry_count:
                    gen_logs.log_info(
                        SCRIPT_NAME, "Failed on writing File after {} tries: {} ".format(max_retry_count, output)
                    )
                    ReturnCode = 2
                    return ReturnCode, rec_cnt
                else:
                    gen_logs.log_info(SCRIPT_NAME, "Failed on writing File, re-try in {} seconds ".format(retry_delay))
                    time.sleep(retry_delay)

        gen_logs.log_print("Number of Records Processed: {}".format(rec_cnt))

        return ReturnCode, rec_cnt
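The read and write blocks above follow one retry pattern: try the operation, and on failure either give up with a non-zero return code after max_retry_count attempts or wait retry_delay seconds and try again. A minimal stand-alone sketch of that pattern (function and parameter names here are illustrative, not taken from the repo):

import time

def with_retries(action, max_retry_count, retry_delay, logger=print):
    # Run `action` up to max_retry_count times, sleeping retry_delay seconds between attempts.
    attempt = 0
    while attempt < max_retry_count:
        try:
            return 0, action()  # success: return code 0 plus the action's result
        except Exception as e:
            attempt += 1
            if attempt == max_retry_count:
                logger("Failed after {} tries: {}".format(max_retry_count, e))
                return 2, None  # give up with a non-zero return code
            logger("Failed, re-try in {} seconds".format(retry_delay))
            time.sleep(retry_delay)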
Example #10
    def get_currentDate(self):
        current_time = datetime.now()
        str_current_time = current_time.strftime("%Y-%m-%d")
        gen_logs = GenerateLogs(self.spark)
        gen_logs.log_print(str_current_time)
Example #11
    def executeHistoricOccupancyOperations(self, src_df, output,
                                           path_of_year_tob_processed,
                                           partn_col, max_retry_count,
                                           retry_delay, custom_schema):
        gen_logs = GenerateLogs(self.spark)

        PartitionColumn = partn_col
        station_id_lookup = self.createStationIDDF(path_of_year_tob_processed,
                                                   custom_schema)

        src_df = src_df.withColumn("station_id", F.col("sourceelementkey"))
        src_df = src_df.drop("sourceelementkey")

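        # Enrich occupancy records with longitude/latitude from the station_id lookup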
        occ_df = src_df.join(station_id_lookup, ["station_id"],
                             how="left_outer").select(
                                 src_df.occupancydatetime,
                                 src_df.station_id,
                                 src_df.paidoccupancy,
                                 src_df.available_spots,
                                 station_id_lookup.longitude,
                                 station_id_lookup.latitude,
                             )

        self.spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

        occ_df = occ_df.withColumn(
            "occupancydatetime",
            self.timestamp_format(F.col("occupancydatetime"),
                                  "MM/dd/yyyy hh:mm:ss a"))

        occ_df = occ_df.withColumn(
            PartitionColumn,
            self.date__format(F.col("occupancydatetime"), "MMMM"))

        ReturnCode = 0
        rec_cnt = 0
        RetryCt = 0
        Success = False

        while (RetryCt < max_retry_count) and not Success:
            try:
                Success = True
                occ_df.write.mode("append").partitionBy(
                    PartitionColumn).parquet(output)
            except:
                Success = False
                RetryCt += 1
                if RetryCt == max_retry_count:
                    gen_logs.log_info(
                        SCRIPT_NAME,
                        "Failed on writing to Output after {} tries: {} ".
                        format(max_retry_count, output))
                    ReturnCode = 2
                    return ReturnCode, rec_cnt
                else:
                    gen_logs.log_info(
                        SCRIPT_NAME,
                        "Failed on writing to Output, re-try in {} seconds ".
                        format(retry_delay))
                    time.sleep(retry_delay)

        gen_logs.log_print("Number of Records Processed: {}".format(rec_cnt))
        return ReturnCode, rec_cnt
Example #12
    def executeOccupancyOperations(self, src_df, output, datedimoutputpath,
                                   cols_list, partn_col, max_retry_count,
                                   retry_delay):

        PartitionColumn = partn_col
        gen_logs = GenerateLogs(self.spark)

        ReturnCode = 0
        rec_cnt = 0
        RetryCt = 0
        Success = False

        while (RetryCt < max_retry_count) and not Success:

            try:
                Success = True
                # reading from DBFS
                input_df = src_df

            except:
                Success = False
                RetryCt += 1
                if RetryCt == max_retry_count:
                    gen_logs.log_info(
                        SCRIPT_NAME,
                        "Failed on reading input file after {} tries".format(
                            max_retry_count))
                    ReturnCode = 1
                    return ReturnCode, rec_cnt

                else:
                    gen_logs.log_info(
                        SCRIPT_NAME,
                        "Failed on reading input file, re-try in {} seconds ".
                        format(retry_delay))
                    time.sleep(retry_delay)

        select_df = input_df.select([colname for colname in input_df.columns])
        # if colname in (cols_list)])

        select_df = select_df.withColumn("station_id",
                                         F.col("sourceelementkey"))
        select_df = select_df.drop("sourceelementkey")

        print("Reading inside transformation function")
        select_df.show(5)

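        # Apply column-specific transformations for each configured column name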
        for x in range(len(cols_list)):
            if cols_list[x] == "station_id":
                column = cols_list[x]

                select_df = select_df.withColumn(
                    column,
                    self.remove_non_word_characters(F.col("station_id")))
                select_df = select_df.withColumn(
                    column, select_df[column].cast(IntegerType()))

            if cols_list[x] == "occupancydatetime":
                column = cols_list[x]

                self.spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
                select_df = select_df.withColumn(
                    column,
                    self.timestamp_format(F.col(column),
                                          "MM/dd/yyyy hh:mm:ss a"))

                select_df = select_df.withColumn(
                    PartitionColumn, self.date__format(F.col(column), "MMMM"))

                date_dim = select_df.withColumn(
                    "day_of_week",
                    self.date__format(F.col(column), "EEEE")).withColumn(
                        "month", self.date__format(F.col(column), "MMMM"))

                date_dim = date_dim.select("occupancydatetime", "day_of_week",
                                           "month")

                select_df = select_df.withColumn(
                    PartitionColumn, self.date__format(F.col(column), "MMMM"))

            if cols_list[x] == "location":

                column = cols_list[x]
                split_col = ["longitude", "latitude"]

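                # Location strings look like "POINT (<longitude> <latitude>)"; items 1 and 2 of the split hold the coordinates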
                select_df = select_df.withColumn(
                    split_col[0],
                    F.split(column, " ").getItem(1)).withColumn(
                        split_col[1],
                        F.split(column, " ").getItem(2))

                select_df = select_df.withColumn(
                    split_col[0],
                    self.remove__parenthesis(F.col(split_col[0]))).withColumn(
                        split_col[1],
                        self.remove__parenthesis(F.col(split_col[1])))

                select_df = select_df.withColumn(
                    split_col[0],
                    select_df[split_col[0]].cast(DoubleType())).withColumn(
                        split_col[1],
                        select_df[split_col[1]].cast(DoubleType()))

                select_df = select_df.drop(column)

            #   select_df = select_df.select(cols_list)
            # select_df = select_df.select([colname for colname in input_df.columns if colname in (cols_list)])

        RetryCt = 0
        Success = False

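        # Retry loop: write the transformed occupancy dataframe partitioned by PartitionColumn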
        while (RetryCt < max_retry_count) and not Success:
            try:
                Success = True
                select_df.show(3)
                gen_logs.log_print(
                    "Writing occupancy dataframe to output file: {}".format(
                        output))

                select_df.write.mode("append").partitionBy(
                    PartitionColumn).parquet(output)

                gen_logs.log_print(
                    "Writing date dimension to output file: {}".format(
                        datedimoutputpath))
                date_dim.show(3)
                # date_dim.write.mode("append").partitionBy(PartitionColumn).parquet(datedimoutputpath)
            except:
                Success = False
                RetryCt += 1
                if RetryCt == max_retry_count:
                    gen_logs.log_info(
                        SCRIPT_NAME,
                        "Failed on writing to Output after {} tries: {} ".
                        format(max_retry_count, output),
                    )
                    ReturnCode = 2
                    return ReturnCode, rec_cnt
                else:
                    gen_logs.log_info(
                        SCRIPT_NAME,
                        "Failed on writing to Output, re-try in {} seconds ".
                        format(retry_delay))
                    time.sleep(retry_delay)

        gen_logs.log_print("Number of Records Processed: {}".format(rec_cnt))
        return ReturnCode, rec_cnt
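For occupancydatetime, the transformations above amount to parsing the source string with the pattern "MM/dd/yyyy hh:mm:ss a" and deriving the full month name that becomes the MONTH partition value. A minimal sketch of that step using plain PySpark functions (the session and column names are illustrative; the repo wraps these calls in its own helpers):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

df = spark.createDataFrame([("04/14/2021 04:26:00 PM",)], ["occupancydatetime"])

df = (df.withColumn("occupancydatetime",
                    F.to_timestamp("occupancydatetime", "MM/dd/yyyy hh:mm:ss a"))
        .withColumn("MONTH", F.date_format("occupancydatetime", "MMMM")))

df.show(truncate=False)  # the MONTH column holds "April", matching the MONTH=April output folder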
Example #13
def update_control_table(job_id, JOBNAME, status, dataset, loadtype, step, stepdesc, year_processed, date):
    JobTracker.insert_job_details(job_id, JOBNAME, status, dataset, loadtype, step, stepdesc, year_processed, date)


# =========================================================================================================
# ================ Open Spark Context Session =============================================================
# =========================================================================================================

spark = create_sparksession()

# Make the SQLContext Session available to sub-scripts
BlockfaceProcessing.global_SQLContext(spark)
OccupancyProcessing.global_SQLContext(spark)
# ReadEnvironmentParameters.global_SQLContext(spark)
GenerateLogs.global_SQLContext(spark)


# =========================================================================================================
# ================ Initialize log Filename =============================================================
# =========================================================================================================

GenerateLogs.initial_log_file(LogFileName)


# =========================================================================================================
# PROCESS ALL PARAMETERS
STEP, STEP_DESC = (10, "Read Job Specific Parameter Files")
# =========================================================================================================

GenerateLogs.log_step(SCRIPT_NAME, "PERFORMING STEP {}:{} ".format(STEP, STEP_DESC))
Example #14
    def read_job_control(self, paramFile):
        param, ReturnCode = self.parse_config("Job Control Parameters",
                                              paramFile, "=")
        globals().update(param)
        return param, ReturnCode

    def read_runtime_control(self, paramFile):
        param, ReturnCode = self.parse_config("Runtime Tracker Parameters",
                                              paramFile, "=")
        globals().update(param)
        return param, ReturnCode

    def main(self, paramFile):

        paramFile, ReturnCode = self.read_job_control(paramFile)


if __name__ == "__main__":
    log_file = "test.log"
    GenerateLogs.initial_log_file(log_file)
    ReadEnvironmentParameters.main(sys.argv[1])
Example #15
def test_blockface_transformations(spark_session):
    data2 = [
        (
            1065,
            79025,
            11078,
            7900,
            90,
            "JOHN ST BETWEEN 9TH AVE N AND WESTLAKE AVE N",
            "N",
            "N2-09",
            900,
            "Y",
            "Paid Parking",
            1,
            1,
            2,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            24,
            "",
            "South Lake",
            120,
            "South",
            "08AM",
            "06PM",
            "08AM",
            "06PM",
            "DISTRICT7",
            "",
            "N",
            "",
            380.306908791687,
        ),
        (
            1491,
            56353,
            11076,
            7900,
            75,
            "JOHN ST BETWEEN DEXTER AVE N AND 8TH AVE N",
            "N",
            "N2-07",
            900,
            "Y",
            "Paid Parking",
            1,
            1,
            2,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            24,
            "",
            "South Lake",
            120,
            "South",
            "08AM",
            "06PM",
            "08AM",
            "06PM",
            "DISTRICT7",
            "",
            "N",
            "",
            268.306908791687,
        ),
    ]

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)
    blockface_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\blockface.json")
    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        blockface_config_dict)

    blockface_pr = BlockfaceProcessing(spark_session)
    # rdd = spark_session.sparkContext.parallelize(data)

    df = spark_session.createDataFrame(data=data2,
                                       schema=TargetDataframeSchema)

    cols = [
        "station_id",
        "station_address",
        "side",
        "block_nbr",
        "parking_category",
        "wkd_rate1",
        "wkd_start1",
        "wkd_end1",
        "wkd_rate2",
        "wkd_start2",
        "wkd_end2",
        "wkd_rate3",
        "wkd_start3",
        "wkd_end3",
        "sat_rate1",
        "sat_start1",
        "sat_end1",
        "sat_rate2",
        "sat_start2",
        "sat_end2",
        "sat_rate3",
        "sat_start3",
        "sat_end3",
        "parking_time_limit",
        "subarea",
    ]

    blockface_pr.executeBlockfaceOperations(src_df=df,
                                            output="./tests/",
                                            cols_list=cols,
                                            max_retry_count=1,
                                            retry_delay=3)

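    # Read back the written Parquet output and check that minute offsets were converted to HH:MM:SS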
    actual_df = spark_session.read.parquet("./tests/*.parquet")
    actual_df.show(truncate=False)

    actual_wkd_start_data_list = actual_df.select("wkd_start1").collect()

    actual_wkd_end_data_list = actual_df.select("wkd_end1").collect()

    actual_wkd_start1_array = [
        str(row["wkd_start1"]) for row in actual_wkd_start_data_list
    ]

    actual_wkd_end1_array = [
        str(row["wkd_end1"]) for row in actual_wkd_end_data_list
    ]

    assert actual_wkd_start1_array[0] == "08:00:00"
    assert actual_wkd_end1_array[0] == "10:59:00"

    # Removing folders
    shutil.rmtree("./tests/Blockface.parquet/")
Example #16
    def create_station_id_lookup(spark_session):

        data2 = [
            (
                "05/14/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "59,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.32498613 47.6808223)",
            ),
            (
                "05/15/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "89,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.33297326 47.59872593)",
            ),
        ]

        gen_logs = GenerateLogs(spark_session)
        gen_logs.initial_log_file("test_log.log")
        dataframeconfig = DataframeConfig(spark_session)
        occupancy_config_dict = dataframeconfig.json_reader(
            "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
        # Get Target Table Schema
        TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
            occupancy_config_dict)

        occ_df = spark_session.createDataFrame(data=data2,
                                               schema=TargetDataframeSchema)

        cols = [
            "occupancydatetime",
            "paidoccupancy",
            "blockfacename",
            "sideofstreet",
            "station_id",
            "pakingtimelimitcategory",
            "available_spots",
            "paidparkingarea",
            "paidparkingsubarea",
            "paidparkingrate",
            "parkingcategory",
            "location",
        ]

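        # Write the lookup as a single-part CSV with a header row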
        occ_df.coalesce(1).write.format("csv").save(
            "./tests/station_id_lookup.csv", header="true")
Example #17
def test_historic_occupancy(spark_session):
    def create_station_id_lookup(spark_session):

        data2 = [
            (
                "05/14/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "59,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.32498613 47.6808223)",
            ),
            (
                "05/15/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "89,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.33297326 47.59872593)",
            ),
        ]

        gen_logs = GenerateLogs(spark_session)
        gen_logs.initial_log_file("test_log.log")
        dataframeconfig = DataframeConfig(spark_session)
        occupancy_config_dict = dataframeconfig.json_reader(
            "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
        # Get Target Table Schema
        TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
            occupancy_config_dict)

        occ_df = spark_session.createDataFrame(data=data2,
                                               schema=TargetDataframeSchema)

        cols = [
            "occupancydatetime",
            "paidoccupancy",
            "blockfacename",
            "sideofstreet",
            "station_id",
            "pakingtimelimitcategory",
            "available_spots",
            "paidparkingarea",
            "paidparkingsubarea",
            "paidparkingrate",
            "parkingcategory",
            "location",
        ]

        occ_df.coalesce(1).write.format("csv").save(
            "./tests/station_id_lookup.csv", header="true")

    create_station_id_lookup(spark_session)

    hist_data2 = [
        (
            "05/14/2017 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "59013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
        ),
        (
            "05/15/2021 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "89013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
        ),
    ]

    occupancy_pr = OccupancyProcessing(spark_session)
    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)
    occupancy_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
    # Get Target Table Schema
    TargetHistOccpDFSchema = dataframeconfig.get_historic_dataframe_schema(
        occupancy_config_dict)
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        occupancy_config_dict)

    hist_occ_df = spark_session.createDataFrame(data=hist_data2,
                                                schema=TargetHistOccpDFSchema)

    occupancy_pr.executeHistoricOccupancyOperations(
        hist_occ_df, "./tests/", "./tests/station_id_lookup.csv/*.csv",
        "MONTH", 1, 3, TargetDataframeSchema)

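    # Read back the May partition and verify the latitude/longitude pulled from the lookup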
    actual_occ_df = spark_session.read.parquet("./tests/MONTH=May/*.parquet")
    actual_occ_df.show(truncate=False)

    actual_lat_list = actual_occ_df.select("latitude").collect()

    actual_long_list = actual_occ_df.select("longitude").collect()

    actual_lat_array = [str(row["latitude"]) for row in actual_lat_list]

    actual_long_array = [str(row["longitude"]) for row in actual_long_list]

    assert actual_lat_array[0] == "47.6808223"
    assert actual_long_array[0] == "-122.32498613"

    # rdd = spark_session.sparkContext.parallelize(data)

    # Removing folders
    shutil.rmtree("./tests/MONTH=May/")
    # Removing folders
    shutil.rmtree("./tests/station_id_lookup.csv/")
Example #18
def test_occupancy_transformations(spark_session):
    data2 = [
        (
            "04/14/2021 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "59,013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
            "POINT (-122.32498613 47.6808223)",
        ),
        (
            "04/15/2021 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "89,013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
            "POINT (-122.33297326 47.59872593)",
        ),
    ]

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)
    occupancy_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        occupancy_config_dict)

    occupancy_pr = OccupancyProcessing(spark_session)
    # rdd = spark_session.sparkContext.parallelize(data)

    occ_df = spark_session.createDataFrame(data=data2,
                                           schema=TargetDataframeSchema)

    cols = [
        "occupancydatetime",
        "paidoccupancy",
        "blockfacename",
        "sideofstreet",
        "station_id",
        "pakingtimelimitcategory",
        "available_spots",
        "paidparkingarea",
        "paidparkingsubarea",
        "paidparkingrate",
        "parkingcategory",
        "location",
    ]

    occupancy_pr.executeOccupancyOperations(
        src_df=occ_df,
        output="./tests/",
        datedimoutputpath="./tests/",
        cols_list=cols,
        partn_col="MONTH",
        max_retry_count=1,
        retry_delay=3,
    )

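    # Read back the April partition and verify longitude/latitude extracted from the POINT location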
    actual_occ_df = spark_session.read.parquet("./tests/MONTH=April/*.parquet")
    actual_occ_df.show(truncate=False)

    actual_lat_list = actual_occ_df.select("latitude").collect()

    actual_long_list = actual_occ_df.select("longitude").collect()

    actual_lat_array = [str(row["latitude"]) for row in actual_lat_list]

    actual_long_array = [str(row["longitude"]) for row in actual_long_list]

    assert actual_lat_array[0] == "47.6808223"
    assert actual_long_array[0] == "-122.32498613"

    # Removing folders
    shutil.rmtree("./tests/MONTH=April/")