Example #1
def test_historic_read_occupancy(spark_session):
    spark_session.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    occ_pr = OccupancyProcessing(spark_session)
    dataframeconfig = DataframeConfig(spark_session)
    gen_logs = GenerateLogs(spark_session)

    occ_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")

    occfilePath = dataframeconfig.get_source_driverFilerPath(occ_config_dict)

    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_historic_dataframe_schema(
        occ_config_dict)
    import glob

    file_names = glob.glob(occfilePath)

    for file in file_names:
        year = file.split("\\")[3][:4]
        if year == "2014":  # the slice above yields a string, so compare against "2014", not 2014
            (occupancy, source_data_info) = occ_pr.sourceOccupancyReadParquet(
                occfilePath, TargetDataframeSchema, "MONTH")
            occupancy.printSchema()
            occupancy.head(4)
            break

    # Removing unwanted files
    if os.path.isfile("./tests/._SUCCESS.crc") and os.path.isfile(
            "./tests/_SUCCESS"):
        os.remove("./tests/._SUCCESS.crc")
        os.remove("./tests/_SUCCESS")
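
Note: every example on this page receives a spark_session pytest fixture that the excerpts themselves never define. The pytest-spark plugin provides a fixture with exactly that name; alternatively a local conftest.py can supply one. A minimal sketch of the conftest.py variant (the builder options are illustrative assumptions, not the project's actual configuration):

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # One local SparkSession shared by the whole test run.
    spark = (
        SparkSession.builder
        .master("local[2]")
        .appName("dataset-processing-tests")
        .getOrCreate()
    )
    yield spark
    spark.stop()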
Example #2
def test_file_path(spark_session):
    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)

    json_file = "C://Datasetprocessing//tests//test1.json"

    with open(json_file) as jfile:
        df_dict = json.load(jfile)

    actual_filepath = dataframeconfig.get_source_driverFilerPath(df_dict)

    expected_filePath = "C:\\Test\\Paid_Parking.csv"

    assert actual_filepath == expected_filePath
Example #3
def test_dataframe_partition(spark_session):

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)

    json_file = "C://Datasetprocessing//tests//test1.json"

    with open(json_file) as jfile:
        df_dict = json.load(jfile)

    actual_partition = dataframeconfig.partition_column(df_dict)

    expected_partition = "MONTH"

    assert actual_partition == expected_partition
Example #4
def test_column_list(spark_session):

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)

    json_file = "C://Datasetprocessing//tests//test1.json"

    with open(json_file) as jfile:
        df_dict = json.load(jfile)

    actual_list = dataframeconfig.build_dataframe_column_list(df_dict)

    expected_list = ["occupancydatetime", "occupied_spots"]
    print(actual_list)

    assert actual_list[0] == expected_list[0]
    assert actual_list[1] == expected_list[1]
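
Examples #2 through #4 parse the same tests/test1.json fixture. The exact key names depend on how DataframeConfig reads the file, which is not shown here; a hypothetical fixture consistent with the three assertions could be written out like this (key names are guesses, only the values come from the tests):

import json

# Hypothetical content for tests/test1.json; the key names are assumptions.
test1_config = {
    "driver_file_path": "C:\\Test\\Paid_Parking.csv",
    "partition_column": "MONTH",
    "columns": ["occupancydatetime", "occupied_spots"],
}

with open("C://Datasetprocessing//tests//test1.json", "w") as jfile:
    json.dump(test1_config, jfile, indent=2)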
Example #5
def test_read_occupancy(spark_session):
    spark_session.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    occ_pr = OccupancyProcessing(spark_session)
    dataframeconfig = DataframeConfig(spark_session)
    gen_logs = GenerateLogs(spark_session)

    occ_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")

    occfilePath = dataframeconfig.get_source_driverFilerPath(occ_config_dict)

    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        occ_config_dict)

    (occupancy, source_data_info) = occ_pr.sourceOccupancyReadParquet(
        occfilePath, TargetDataframeSchema, "MONTH")

    occupancy.printSchema()

    occupancy.head(4)
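
Several tests begin by setting spark.sql.legacy.timeParserPolicy=LEGACY, which restores the pre-Spark-3.0 (SimpleDateFormat-based) date/time parsing that the processing classes apparently rely on for timestamps such as "05/14/2021 04:26:00 PM". A self-contained illustration of the same kind of parse; the pattern string is an assumption, since the project's actual format is not shown:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Same switch the tests perform through spark_session.sql(...).
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

sample = spark.createDataFrame([("05/14/2021 04:26:00 PM",)], ["occupancydatetime"])
# Assumed pattern; OccupancyProcessing's real format string may differ.
sample.select(
    F.to_timestamp("occupancydatetime", "MM/dd/yyyy hh:mm:ss a").alias("parsed")
).show(truncate=False)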
Example #6
def test_read_blockface(spark_session):
    spark_session.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    blockface_pr = BlockfaceProcessing(spark_session)
    dataframeconfig = DataframeConfig(spark_session)

    blockface_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\blockface.json")

    blockfacefilePath = dataframeconfig.get_source_driverFilerPath(
        blockface_config_dict)

    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        blockface_config_dict)

    (blockface, source_data_info) = blockface_pr.sourceBlockfaceReadParquet(
        blockfacefilePath, TargetDataframeSchema)

    blockface.printSchema()

    blockface.head(4)
Example #7
    def create_station_id_lookup(spark_session):

        data2 = [
            (
                "05/14/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "59,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.32498613 47.6808223)",
            ),
            (
                "05/15/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "89,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.33297326 47.59872593)",
            ),
        ]

        gen_logs = GenerateLogs(spark_session)
        gen_logs.initial_log_file("test_log.log")
        dataframeconfig = DataframeConfig(spark_session)
        occupancy_config_dict = dataframeconfig.json_reader(
            "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
        # Get Target Table Schema
        TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
            occupancy_config_dict)

        occ_df = spark_session.createDataFrame(data=data2,
                                               schema=TargetDataframeSchema)

        cols = [
            "occupancydatetime",
            "paidoccupancy",
            "blockfacename",
            "sideofstreet",
            "station_id",
            "pakingtimelimitcategory",
            "available_spots",
            "paidparkingarea",
            "paidparkingsubarea",
            "paidparkingrate",
            "parkingcategory",
            "location",
        ]

        occ_df.coalesce(1).write.format("csv").save(
            "./tests/station_id_lookup.csv", header="true")
Example #8
def test_historic_occupancy(spark_session):
    def create_station_id_lookup(spark_session):

        data2 = [
            (
                "05/14/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "59,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.32498613 47.6808223)",
            ),
            (
                "05/15/2021 04:26:00 PM",
                4,
                "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
                "SW",
                "89,013",
                120,
                6,
                "Green Lake",
                "",
                0.00,
                "Paid Parking",
                "POINT (-122.33297326 47.59872593)",
            ),
        ]

        gen_logs = GenerateLogs(spark_session)
        gen_logs.initial_log_file("test_log.log")
        dataframeconfig = DataframeConfig(spark_session)
        occupancy_config_dict = dataframeconfig.json_reader(
            "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
        # Get Target Table Schema
        TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
            occupancy_config_dict)

        occ_df = spark_session.createDataFrame(data=data2,
                                               schema=TargetDataframeSchema)

        cols = [
            "occupancydatetime",
            "paidoccupancy",
            "blockfacename",
            "sideofstreet",
            "station_id",
            "pakingtimelimitcategory",
            "available_spots",
            "paidparkingarea",
            "paidparkingsubarea",
            "paidparkingrate",
            "parkingcategory",
            "location",
        ]

        occ_df.coalesce(1).write.format("csv").save(
            "./tests/station_id_lookup.csv", header="true")

    create_station_id_lookup(spark_session)

    hist_data2 = [
        (
            "05/14/2017 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "59013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
        ),
        (
            "05/15/2021 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "89013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
        ),
    ]

    occupancy_pr = OccupancyProcessing(spark_session)
    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)
    occupancy_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
    # Get Target Table Schema
    TargetHistOccpDFSchema = dataframeconfig.get_historic_dataframe_schema(
        occupancy_config_dict)
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        occupancy_config_dict)

    hist_occ_df = spark_session.createDataFrame(data=hist_data2,
                                                schema=TargetHistOccpDFSchema)

    occupancy_pr.executeHistoricOccupancyOperations(
        hist_occ_df, "./tests/", "./tests/station_id_lookup.csv/*.csv",
        "MONTH", 1, 3, TargetDataframeSchema)

    actual_occ_df = spark_session.read.parquet("./tests/MONTH=May/*.parquet")
    actual_occ_df.show(truncate=False)

    actual_lat_list = actual_occ_df.select("latitude").collect()

    actual_long_list = actual_occ_df.select("longitude").collect()

    actual_lat_array = [str(row["latitude"]) for row in actual_lat_list]

    actual_long_array = [str(row["longitude"]) for row in actual_long_list]

    assert actual_lat_array[0] == "47.6808223"
    assert actual_long_array[0] == "-122.32498613"

    # rdd = spark_session.sparkContext.parallelize(data)

    # Removing folders
    shutil.rmtree("./tests/MONTH=May/")
    # Removing folders
    shutil.rmtree("./tests/station_id_lookup.csv/")
Example #9
def test_occupancy_transformations(spark_session):
    data2 = [
        (
            "04/14/2021 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "59,013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
            "POINT (-122.32498613 47.6808223)",
        ),
        (
            "04/15/2021 04:26:00 PM",
            4,
            "WOODLAWN AVE NE BETWEEN NE 72ND ST AND NE 73RD ST",
            "SW",
            "89,013",
            120,
            6,
            "Green Lake",
            "",
            0.00,
            "Paid Parking",
            "POINT (-122.33297326 47.59872593)",
        ),
    ]

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)
    occupancy_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\occupancy.json")
    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        occupancy_config_dict)

    occupancy_pr = OccupancyProcessing(spark_session)
    # rdd = spark_session.sparkContext.parallelize(data)

    occ_df = spark_session.createDataFrame(data=data2,
                                           schema=TargetDataframeSchema)

    cols = [
        "occupancydatetime",
        "paidoccupancy",
        "blockfacename",
        "sideofstreet",
        "station_id",
        "pakingtimelimitcategory",
        "available_spots",
        "paidparkingarea",
        "paidparkingsubarea",
        "paidparkingrate",
        "parkingcategory",
        "location",
    ]

    occupancy_pr.executeOccupancyOperations(
        src_df=occ_df,
        output="./tests/",
        datedimoutputpath="./tests/",
        cols_list=cols,
        partn_col="MONTH",
        max_retry_count=1,
        retry_delay=3,
    )

    actual_occ_df = spark_session.read.parquet("./tests/MONTH=April/*.parquet")
    actual_occ_df.show(truncate=False)

    actual_lat_list = actual_occ_df.select("latitude").collect()

    actual_long_list = actual_occ_df.select("longitude").collect()

    actual_lat_array = [str(row["latitude"]) for row in actual_lat_list]

    actual_long_array = [str(row["longitude"]) for row in actual_long_list]

    assert actual_lat_array[0] == "47.6808223"
    assert actual_long_array[0] == "-122.32498613"

    # Removing folders
    shutil.rmtree("./tests/MONTH=April/")
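
Examples #8 and #9 both assert latitude and longitude columns derived from the WKT-style location string, "POINT (<longitude> <latitude>)". The actual derivation lives inside OccupancyProcessing and is not shown; a sketch of an equivalent transformation:

from pyspark.sql import functions as F

# Longitude is the first number inside the POINT string, latitude the second,
# which matches the assertions above. Sketch only, not the project's code.
POINT_RE = r"POINT \((-?[\d.]+) (-?[\d.]+)\)"


def with_lat_long(df):
    return (
        df.withColumn("longitude", F.regexp_extract("location", POINT_RE, 1))
          .withColumn("latitude", F.regexp_extract("location", POINT_RE, 2))
    )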
Example #10
def test_blockface_transformations(spark_session):
    data2 = [
        (
            1065,
            79025,
            11078,
            7900,
            90,
            "JOHN ST BETWEEN 9TH AVE N AND WESTLAKE AVE N",
            "N",
            "N2-09",
            900,
            "Y",
            "Paid Parking",
            1,
            1,
            2,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            24,
            "",
            "South Lake",
            120,
            "South",
            "08AM",
            "06PM",
            "08AM",
            "06PM",
            "DISTRICT7",
            "",
            "N",
            "",
            380.306908791687,
        ),
        (
            1491,
            56353,
            11076,
            7900,
            75,
            "JOHN ST BETWEEN DEXTER AVE N AND 8TH AVE N",
            "N",
            "N2-07",
            900,
            "Y",
            "Paid Parking",
            1,
            1,
            2,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            1.0,
            480,
            659,
            0.5,
            660,
            1079,
            1.0,
            1080,
            1499,
            24,
            "",
            "South Lake",
            120,
            "South",
            "08AM",
            "06PM",
            "08AM",
            "06PM",
            "DISTRICT7",
            "",
            "N",
            "",
            268.306908791687,
        ),
    ]

    gen_logs = GenerateLogs(spark_session)
    gen_logs.initial_log_file("test_log.log")
    dataframeconfig = DataframeConfig(spark_session)
    blockface_config_dict = dataframeconfig.json_reader(
        "C:\\Datasetprocessing\\dataset_processing\\data\\blockface.json")
    # Get Target Table Schema
    TargetDataframeSchema = dataframeconfig.get_dataframe_schema(
        blockface_config_dict)

    blockface_pr = BlockfaceProcessing(spark_session)
    # rdd = spark_session.sparkContext.parallelize(data)

    df = spark_session.createDataFrame(data=data2,
                                       schema=TargetDataframeSchema)

    cols = [
        "station_id",
        "station_address",
        "side",
        "block_nbr",
        "parking_category",
        "wkd_rate1",
        "wkd_start1",
        "wkd_end1",
        "wkd_rate2",
        "wkd_start2",
        "wkd_end2",
        "wkd_rate3",
        "wkd_start3",
        "wkd_end3",
        "sat_rate1",
        "sat_start1",
        "sat_end1",
        "sat_rate2",
        "sat_start2",
        "sat_end2",
        "sat_rate3",
        "sat_start3",
        "sat_end3",
        "parking_time_limit",
        "subarea",
    ]

    blockface_pr.executeBlockfaceOperations(src_df=df,
                                            output="./tests/",
                                            cols_list=cols,
                                            max_retry_count=1,
                                            retry_delay=3)

    actual_df = spark_session.read.parquet("./tests/*.parquet")
    actual_df.show(truncate=False)

    actual_wkd_start_data_list = actual_df.select("wkd_start1").collect()

    actual_wkd_end_data_list = actual_df.select("wkd_end1").collect()

    actual_wkd_start1_array = [
        str(row["wkd_start1"]) for row in actual_wkd_start_data_list
    ]

    actual_wkd_end1_array = [
        str(row["wkd_end1"]) for row in actual_wkd_end_data_list
    ]

    assert actual_wkd_start1_array[0] == "08:00:00"
    assert actual_wkd_end1_array[0] == "10:59:00"

    # Removing folders
    shutil.rmtree("./tests/Blockface.parquet/")
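
The final assertions imply that wkd_start1 and wkd_end1 arrive as minutes past midnight (480 and 659) and are rendered as clock times ("08:00:00", "10:59:00") by executeBlockfaceOperations. That conversion is not shown; one way to express it (a sketch, not the project's implementation):

from pyspark.sql import functions as F


def minutes_to_time(col_name):
    # 480 -> "08:00:00", 659 -> "10:59:00": whole hours, remaining minutes, zero seconds.
    return F.format_string(
        "%02d:%02d:00",
        (F.col(col_name) / 60).cast("int"),
        (F.col(col_name) % 60).cast("int"),
    )

# Usage sketch: df.withColumn("wkd_start1", minutes_to_time("wkd_start1"))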
# ==============================================================================================================#
(STEP, STEP_DESC) = (40, "Processing Blockface Dataframe configuration file")
# ===============================================================================================================#
GenerateLogs.log_step(SCRIPT_NAME, "PERFORMING STEP {}:{} ".format(STEP, STEP_DESC))
today = datetime.now()
current_year = today.year


if StartStep <= STEP and StopStep >= STEP:
    GenerateLogs.log_step(SCRIPT_NAME, "PERFORMING STEP {}: {} ".format(STEP, STEP_DESC))
    if os.path.isfile(blockface_config_filename):
        GenerateLogs.log_info(
            SCRIPT_NAME, "Blockface Dataframe Configuration filename: {} exists ".format(blockface_config_filename)
        )
        blockface_config_dict = DataframeConfig.json_reader(blockface_config_filename)
    else:
        GenerateLogs.log_error(
            SCRIPT_NAME,
            "ERROR: Dataframe Configuration file: {} does not exist ".format(blockface_config_filename),
            STEP,
        )
        exit(STEP)


# Get Dataframe Column List
cols_list = DataframeConfig.build_dataframe_column_list(blockface_config_dict)

# Get Blockface file path
blockfacefilePath = DataframeConfig.get_source_driverFilerPath(blockface_config_dict)