def __create_ingest_and_transform_folders() -> Tuple[str, str]:
    """Prepare a temporary ingest/transform folder pair for a test.

    A fresh temporary directory is created and two paths beneath it are
    derived: ``ingest`` and ``transform``. SAMPLE_DATA (with BASE_COLUMNS as
    the column names) is written as parquet to the ingest path; the transform
    path is only computed, nothing is written there.

    Returns:
        A ``(ingest_folder, transform_folder)`` tuple of path strings.
    """
    temp_root = tempfile.mkdtemp()
    separator = os.path.sep
    ingest_folder = separator.join((temp_root, "ingest"))
    transform_folder = separator.join((temp_root, "transform"))

    # Seed the ingest folder so transformers have parquet input to read.
    SPARK.createDataFrame(SAMPLE_DATA, BASE_COLUMNS).write.parquet(
        ingest_folder, mode='overwrite')

    return ingest_folder, transform_folder
def test_should_sanitize_column_names() -> None:
    """Verify that ingest.run rewrites CSV header names in its parquet output.

    A small CSV whose headers contain inner and outer spaces is written to
    disk and passed to ``ingest.run``. The parquet output is expected to carry
    the same rows with every space in the column names replaced by an
    underscore.
    """
    given_ingest_folder, given_transform_folder = (
        __create_ingest_and_transform_folders())
    # Join with the OS separator so the CSV is created inside the ingest
    # folder; plain concatenation produced a sibling file named
    # "ingestinput.csv" next to it.
    input_csv_path = os.path.join(given_ingest_folder, 'input.csv')
    csv_content = [
        ['first_field', 'field with space', ' fieldWithOuterSpaces '],
        ['3', '4', '1'],
        ['1', '5', '2'],
    ]
    __write_csv_file(input_csv_path, csv_content)

    ingest.run(SPARK, input_csv_path, given_transform_folder)

    actual = SPARK.read.parquet(given_transform_folder)
    expected = SPARK.createDataFrame(
        [['3', '4', '1'], ['1', '5', '2']],
        ['first_field', 'field_with_space', '_fieldWithOuterSpaces_'])

    assert expected.collect() == actual.collect()
def test_should_add_distance_column_with_calculated_distance() -> None:
    """Verify that distance_transformer.run appends a 'distance' column.

    The transformer is run over the seeded ingest folder. Its parquet output
    must expose ``distance`` as a nullable double field, and the rows must
    equal the first three SAMPLE_DATA rows, each extended with its expected
    distance value.
    """
    source_dir, target_dir = __create_ingest_and_transform_folders()

    distance_transformer.run(SPARK, source_dir, target_dir)

    result = SPARK.read.parquet(target_dir)

    # Pair each sample row (looked up by index) with its expected distance.
    rows_with_distance = [
        SAMPLE_DATA[position] + [distance]
        for position, distance in enumerate((1.07, 0.92, 1.99))
    ]
    wanted = SPARK.createDataFrame(rows_with_distance,
                                   BASE_COLUMNS + ['distance'])

    wanted_field = StructField('distance', DoubleType(), nullable=True)
    found_field = result.schema['distance']

    assert wanted_field == found_field
    assert wanted.collect() == result.collect()
# Example no. 4
# 0
def test_should_tokenize_words_and_count_them() -> None:
    """Verify word_count_transformer.run against a fixed set of passages.

    Five text passages are handed to ``_get_file_paths`` (presumably it
    writes them to an input file; confirm against the helper), the word-count
    transformer is run, and its CSV output is read back with a header and an
    inferred schema. The collected rows must equal the expected
    ``word``/``count`` pairs below, in the same order: lower-cased words
    listed alphabetically.
    """
    passages = [
        "In my younger and more vulnerable years my father gave me some advice that I've been "
        "turning over in my mind ever since. \"Whenever you feel like criticising any one,\""
        " he told me, \"just remember that all the people in this world haven't had the advantages"
        " that you've had.\"",
        "Most of the big shore places were closed now and there were hardly any lights except the "
        "shadowy, moving glow of a ferryboat across the Sound. And as the moon rose higher the "
        "inessential houses began to melt away until gradually I became aware of the old island "
        "here that flowered once for Dutch sailors' eyes--a fresh, green breast of the new world. "
        "Its vanished trees, the trees that had made way for Gatsby's house, had once pandered in "
        "whispers to the last and greatest of all human dreams; for a transitory enchanted moment "
        "man must have held his breath in the presence of this continent, compelled into an "
        "aesthetic contemplation he neither understood nor desired, face to face for the last time "
        "in history with something commensurate to his capacity for wonder.",
        "And as I sat there, brooding on the old unknown world, I thought of Gatsby's wonder when "
        "he first picked out the green light at the end of Daisy's dock. He had come a long way to "
        "this blue lawn and his dream must have seemed so close that he could hardly fail to grasp "
        "it. He did not know that it was already behind him, somewhere back in that vast obscurity "
        "beyond the city, where the dark fields of the republic rolled on under the night.",
        "Gatsby believed in the green light, the orgastic future that year by year recedes before "
        "us. It eluded us then, but that's no matter--tomorrow we will run faster, stretch out our "
        "arms farther.... And one fine morning----",
        "So we beat on, boats against the current, borne back ceaselessly into the past.      "
    ]
    source_path, result_path = _get_file_paths(passages)

    word_count_transformer.run(SPARK, source_path, result_path)

    produced = SPARK.read.csv(result_path, header=True, inferSchema=True)

    # Word -> occurrences; insertion order is the expected output row order.
    word_totals = {
        "a": 4,
        "across": 1,
        "advantages": 1,
        "advice": 1,
        "aesthetic": 1,
        "against": 1,
        "all": 2,
        "already": 1,
        "an": 1,
        "and": 7,
        "any": 2,
        "arms": 1,
        "as": 2,
        "at": 1,
        "aware": 1,
        "away": 1,
        "back": 2,
        "beat": 1,
        "became": 1,
        "been": 1,
        "before": 1,
        "began": 1,
        "behind": 1,
        "believed": 1,
        "beyond": 1,
        "big": 1,
        "blue": 1,
        "boats": 1,
        "borne": 1,
        "breast": 1,
        "breath": 1,
        "brooding": 1,
        "but": 1,
        "by": 1,
        "capacity": 1,
        "ceaselessly": 1,
        "city": 1,
        "close": 1,
        "closed": 1,
        "come": 1,
        "commensurate": 1,
        "compelled": 1,
        "contemplation": 1,
        "continent": 1,
        "could": 1,
        "criticising": 1,
        "current": 1,
        "daisy's": 1,
        "dark": 1,
        "desired": 1,
        "did": 1,
        "dock": 1,
        "dream": 1,
        "dreams": 1,
        "dutch": 1,
        "eluded": 1,
        "enchanted": 1,
        "end": 1,
        "ever": 1,
        "except": 1,
        "eyes": 1,
        "face": 2,
        "fail": 1,
        "farther": 1,
        "faster": 1,
        "father": 1,
        "feel": 1,
        "ferryboat": 1,
        "fields": 1,
        "fine": 1,
        "first": 1,
        "flowered": 1,
        "for": 5,
        "fresh": 1,
        "future": 1,
        "gatsby": 1,
        "gatsby's": 2,
        "gave": 1,
        "glow": 1,
        "gradually": 1,
        "grasp": 1,
        "greatest": 1,
        "green": 3,
        "had": 5,
        "hardly": 2,
        "have": 2,
        "haven't": 1,
        "he": 6,
        "held": 1,
        "here": 1,
        "higher": 1,
        "him": 1,
        "his": 3,
        "history": 1,
        "house": 1,
        "houses": 1,
        "human": 1,
        "i": 3,
        "i've": 1,
        "in": 8,
        "inessential": 1,
        "into": 2,
        "island": 1,
        "it": 3,
        "its": 1,
        "just": 1,
        "know": 1,
        "last": 2,
        "lawn": 1,
        "light": 2,
        "lights": 1,
        "like": 1,
        "long": 1,
        "made": 1,
        "man": 1,
        "matter": 1,
        "me": 2,
        "melt": 1,
        "mind": 1,
        "moment": 1,
        "moon": 1,
        "more": 1,
        "morning": 1,
        "most": 1,
        "moving": 1,
        "must": 2,
        "my": 3,
        "neither": 1,
        "new": 1,
        "night": 1,
        "no": 1,
        "nor": 1,
        "not": 1,
        "now": 1,
        "obscurity": 1,
        "of": 9,
        "old": 2,
        "on": 3,
        "once": 2,
        "one": 2,
        "orgastic": 1,
        "our": 1,
        "out": 2,
        "over": 1,
        "pandered": 1,
        "past": 1,
        "people": 1,
        "picked": 1,
        "places": 1,
        "presence": 1,
        "recedes": 1,
        "remember": 1,
        "republic": 1,
        "rolled": 1,
        "rose": 1,
        "run": 1,
        "sailors'": 1,
        "sat": 1,
        "seemed": 1,
        "shadowy": 1,
        "shore": 1,
        "since": 1,
        "so": 2,
        "some": 1,
        "something": 1,
        "somewhere": 1,
        "sound": 1,
        "stretch": 1,
        "that": 9,
        "that's": 1,
        "the": 24,
        "then": 1,
        "there": 2,
        "this": 3,
        "thought": 1,
        "time": 1,
        "to": 6,
        "told": 1,
        "tomorrow": 1,
        "transitory": 1,
        "trees": 2,
        "turning": 1,
        "under": 1,
        "understood": 1,
        "unknown": 1,
        "until": 1,
        "us": 2,
        "vanished": 1,
        "vast": 1,
        "vulnerable": 1,
        "was": 1,
        "way": 2,
        "we": 2,
        "were": 2,
        "when": 1,
        "whenever": 1,
        "where": 1,
        "whispers": 1,
        "will": 1,
        "with": 1,
        "wonder": 2,
        "world": 3,
        "year": 2,
        "years": 1,
        "you": 1,
        "you've": 1,
        "younger": 1,
    }
    wanted_rows = [[word, total] for word, total in word_totals.items()]
    wanted = SPARK.createDataFrame(wanted_rows, ["word", "count"])

    assert produced.collect() == wanted.collect()