def test_should_maintain_all_data_it_reads() -> None:
    """The transform must preserve every input column and schema field."""
    ingest_folder, transform_folder = __create_ingest_and_transform_folders()
    input_dataframe = SPARK.read.parquet(ingest_folder)

    distance_transformer.run(SPARK, ingest_folder, transform_folder)

    output_dataframe = SPARK.read.parquet(transform_folder)

    # Column names must match exactly; the input schema fields must all
    # survive in the output (output may carry additional fields).
    assert set(input_dataframe.columns) == set(output_dataframe.columns)
    assert set(input_dataframe.schema).issubset(set(output_dataframe.schema))
def test_should_add_distance_column_with_calculated_distance() -> None:
    """The transform appends a nullable double 'distance' column with the expected values."""
    ingest_folder, transform_folder = __create_ingest_and_transform_folders()

    distance_transformer.run(SPARK, ingest_folder, transform_folder)

    result_dataframe = SPARK.read.parquet(transform_folder)
    # Expected output: the three sample rows, each with its computed distance appended.
    distances = [1.07, 0.92, 1.99]
    expected_dataframe = SPARK.createDataFrame(
        [row + [dist] for row, dist in zip(SAMPLE_DATA[:3], distances)],
        BASE_COLUMNS + ['distance'],
    )

    assert result_dataframe.schema['distance'] == StructField('distance',
                                                              DoubleType(),
                                                              nullable=True)
    assert expected_dataframe.collect() == result_dataframe.collect()
import logging

import sys
from pyspark.sql import SparkSession

from data_transformations.citibike import distance_transformer

LOG_FILENAME = 'project.log'
APP_NAME = "Citibike Pipeline: Distance Calculation"

if __name__ == '__main__':
    logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO)
    arguments = sys.argv

    # Usage: <script> <dataset_path> <output_path>
    # BUG FIX: the original used `is not 3` — identity comparison on an int,
    # which relies on CPython interning and emits a SyntaxWarning on 3.8+.
    if len(arguments) != 3:
        logging.warning("Dataset file path and output path not specified!")
        sys.exit(1)

    # BUG FIX: argv[0] is the script name, so the two user arguments are at
    # indices 1 and 2. The original read arguments[2] and arguments[3];
    # arguments[3] is out of range whenever the length check above passes.
    dataset_path = arguments[1]
    output_path = arguments[2]

    spark = SparkSession.builder.appName(APP_NAME).getOrCreate()
    # Lazy %-style args avoid building the message when INFO is disabled.
    logging.info("Application Initialized: %s", spark.sparkContext.appName)
    distance_transformer.run(spark, dataset_path, output_path)
    logging.info("Application Done: %s", spark.sparkContext.appName)

    spark.stop()