def test_get_table_def(self):
        data = [
            (1001, 1, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, 2, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, 3, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (None, None, None, None, None, None, None)  # Test Nulls
        ]
        get_spark_session()\
            .createDataFrame(data, ["id", "dept_id", "first_name", "last_name", "dob", "age", "is_temp"])\
            .createOrReplaceTempView("employees")
        df = get_spark_session().sql(
            "select id, cast(dept_id as short), first_name, "
            "last_name, dob, age, is_temp from employees")
        table_def = get_table_def(df, "Extract", "Extract")

        # Ensure that the Table Name matches
        assert (table_def.table_name.name == Name("Extract"))

        # Ensure that the TableDefinition column names match
        assert (table_def.get_column(0).name == Name("id"))
        assert (table_def.get_column(1).name == Name("dept_id"))
        assert (table_def.get_column(2).name == Name("first_name"))
        assert (table_def.get_column(3).name == Name("last_name"))
        assert (table_def.get_column(4).name == Name("dob"))
        assert (table_def.get_column(5).name == Name("age"))
        assert (table_def.get_column(6).name == Name("is_temp"))

        # Ensure that the column data types were converted correctly
        assert (table_def.get_column(0).type == SqlType.big_int())
        assert (table_def.get_column(1).type == SqlType.small_int())
        assert (table_def.get_column(2).type == SqlType.text())
        assert (table_def.get_column(3).type == SqlType.text())
        assert (table_def.get_column(4).type == SqlType.text())
        assert (table_def.get_column(5).type == SqlType.double())
        assert (table_def.get_column(6).type == SqlType.bool())
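
The assertions above encode a Spark-to-Hyper type mapping: LongType to big_int, ShortType to small_int, StringType to text, DoubleType to double and BooleanType to bool. A minimal sketch of such a conversion is shown below; the helper name and the exact mapping used by hyperleaup's get_table_def are assumptions, so treat it as illustration only.

from pyspark.sql import DataFrame
from pyspark.sql.types import (BooleanType, DoubleType, IntegerType, LongType,
                               ShortType, StringType)
from tableauhyperapi import SqlType, TableDefinition, TableName

# Hypothetical mapping that reproduces the type assertions in the test above.
_SPARK_TO_HYPER = {
    LongType: SqlType.big_int,
    IntegerType: SqlType.int,
    ShortType: SqlType.small_int,
    StringType: SqlType.text,
    DoubleType: SqlType.double,
    BooleanType: SqlType.bool,
}


def build_table_def_sketch(df: DataFrame,
                           schema_name: str = "Extract",
                           table_name: str = "Extract") -> TableDefinition:
    """Illustration only: derive a Hyper TableDefinition from a Spark schema."""
    columns = [
        TableDefinition.Column(field.name, _SPARK_TO_HYPER[type(field.dataType)]())
        for field in df.schema.fields
    ]
    return TableDefinition(table_name=TableName(schema_name, table_name),
                           columns=columns)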
Example #2
 def test_print_table_definition(self):
     data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
             (1002, "John", "Doe", "1988-05-03", 29, False),
             (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
     df = get_spark_session().createDataFrame(
         data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
     hf = HyperFile(name="employees", df=df)
     hf.print_table_def()
 def test_get_rows(self):
     data = [(1001, "Jane", "Doe", "2000-05-01", 29.0, False),
             (1002, "John", "Doe", "1988-05-03", 33.0, False),
             (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True)]
     df = get_spark_session().createDataFrame(
         data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
     rows = get_rows(df)
     expected_row = [1001, "Jane", "Doe", "2000-05-01", 29.0, False]
     assert (len(rows) == 3)
     assert (rows[0] == expected_row)
 def test_write_parquet_to_local_file_system(self):
     data = [(1001, "Jane", "Doe", "2000-05-01", 29.0, False),
             (1002, "John", "Doe", "1988-05-03", 33.0, False),
             (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
             (2202, "James", "Towdry", "1980-05-03", 45.0, False),
             (2235, "Susan", "Sanders", "1980-05-03", 43.0, True)]
     df = get_spark_session().createDataFrame(
         data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
     parquet_file = write_parquet_to_local_file_system(df, "employees")
     assert (parquet_file.startswith("/tmp/hyperleaup/employees/"))
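
The two tests above pin down the expected behaviour of get_rows (plain Python lists, one per DataFrame row) and write_parquet_to_local_file_system (a single parquet file under /tmp/hyperleaup/<name>/). The sketches below are assumptions about how such helpers could be written, not hyperleaup's actual implementation.

import glob
from pyspark.sql import DataFrame


def get_rows_sketch(df: DataFrame) -> list:
    # Collect to the driver and convert each Row to a plain list so that
    # rows[0] == [1001, "Jane", "Doe", "2000-05-01", 29.0, False] holds.
    return [list(row) for row in df.collect()]


def write_parquet_sketch(df: DataFrame, name: str) -> str:
    # Write a single part file under a per-dataset directory and return its path.
    output_dir = f"/tmp/hyperleaup/{name}"
    df.coalesce(1).write.mode("overwrite").parquet(output_dir)
    return glob.glob(f"{output_dir}/part-*.parquet")[0]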
Example #5
    def test_print_rows(self):
        # Ensure that a HyperFile can be created from a Spark DataFrame
        data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
                (1002, "John", "Doe", "1988-05-03", 29, False),
                (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
        hf = HyperFile(name="employees", df=df)
        hf.print_rows()

        # Ensure that a HyperFile can be created from Spark SQL
        data = [(101, "IT"), (103, "Engineering"), (104, "Management"),
                (105, "HR")]
        get_spark_session()\
            .createDataFrame(data, ["id", "department"])\
            .createOrReplaceGlobalTempView("departments")
        sql = "SELECT * FROM global_temp.departments"
        hf = HyperFile(name="employees", sql=sql)
        hf.print_rows()
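
This test builds a HyperFile either from a DataFrame or from a SQL statement. A minimal sketch of how the SQL path can be resolved to a DataFrame, consistent with the get_spark_dataframe helper at the end of this page (the real HyperFile constructor may do more validation):

from pyspark.sql import DataFrame


def resolve_dataframe_sketch(df: DataFrame = None, sql: str = None) -> DataFrame:
    # Prefer an explicit DataFrame; otherwise run the SQL on the active session.
    if df is not None:
        return df
    return get_spark_session().sql(sql)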
Example #6
 def test_creation_mode(self):
     data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
             (1002, "John", "Doe", "1988-05-03", 29, False),
             (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
     df = get_spark_session().createDataFrame(
         data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
     hf = HyperFile(name="employees",
                    df=df,
                    is_dbfs_enabled=False,
                    creation_mode="insert")
     assert (hf.path == "/tmp/hyperleaup/employees/employees.hyper")
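
The path assertion above pins the local output layout used when DBFS is disabled. Expressed as a one-line helper (illustrative only; the real path construction lives inside hyperleaup):

def local_hyper_path_sketch(name: str) -> str:
    # Matches the assertion: /tmp/hyperleaup/employees/employees.hyper
    return f"/tmp/hyperleaup/{name}/{name}.hyper"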
Example #7
    def test_append(self):
        # Ensure that new data can be appended to an existing Hyper File
        existing_hf_path = '/tmp/save/employees.hyper'
        hf = HyperFile.load(path=existing_hf_path, is_dbfs_enabled=False)
        num_rows = TestUtils.get_row_count("Extract", "Extract",
                                           "/tmp/save/employees.hyper")
        assert (num_rows == 3)

        # Create new data
        data = [(3001, "Will", "Girten", "1990-05-01", 31, True),
                (3002, "Sammy", "Smith", "1988-05-03", 29, True),
                (3003, "Gregory", "Denver", "1990-05-03", 29, True)]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
        hf.append(df=df)
        num_rows = TestUtils.get_row_count("Extract", "Extract",
                                           "/tmp/save/employees.hyper")
        assert (num_rows == 6)
    def test_creation_mode(self):
        data = [
            (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
            (2235, "", "", "1980-05-03", 43.0, True)
        ]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

        # creation_mode using a str
        creator = Creator(df=df,
                          name='employees',
                          is_dbfs_enabled=False,
                          creation_mode="Insert")
        hyper_file_path = creator.create()
        assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
        num_rows = TestUtils.get_row_count(
            "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (num_rows == 5)
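
test_append above expects the row count of the existing Extract.Extract table to grow from 3 to 6. With the Tableau Hyper API, appending to an existing .hyper file can be done with an Inserter against the existing table; the sketch below shows that pattern under the assumption that hyperleaup's append works this way.

from tableauhyperapi import (Connection, CreateMode, HyperProcess, Inserter,
                             TableName, Telemetry)


def append_rows_sketch(hyper_path: str, rows: list) -> None:
    """Illustration only: insert extra rows into an existing Extract.Extract table."""
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint,
                        database=hyper_path,
                        create_mode=CreateMode.NONE) as connection:
            with Inserter(connection, TableName("Extract", "Extract")) as inserter:
                inserter.add_rows(rows)
                inserter.execute()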
Example #9
    def test_save(self):
        data = [(1001, "Jane", "Doe", "2000-05-01", 29, False),
                (1002, "John", "Doe", "1988-05-03", 29, False),
                (2201, "Elonzo", "Smith", "1990-05-03", 29, True)]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])
        hf = HyperFile(name="employees",
                       df=df,
                       is_dbfs_enabled=False,
                       creation_mode="insert")

        # Ensure that the Hyper File can be saved to an alternative location
        current_path = hf.path
        new_path = '/tmp/save/'
        expected_path = '/tmp/save/employees.hyper'
        hf.save(new_path)

        # Save operation should not update the current Hyper File's path
        assert (current_path == hf.path)
        assert (os.path.exists(expected_path))
        assert (os.path.isfile(expected_path))
    def test_create(self):
        data = [
            (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
            (2235, "", "", "1980-05-03", 43.0, True)
        ]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

        # Ensure that a Hyper file can be created with date and timestamp columns
        df.withColumn("hire_date", current_date())
        df.withColumn("last_updated", current_timestamp())

        creator = Creator(df, 'employees', False)
        hyper_file_path = creator.create()
        assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
        tables = TestUtils.get_tables(
            "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (len(tables) == 1)
        num_rows = TestUtils.get_row_count(
            "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (num_rows == 5)
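
Several tests on this page call TestUtils.get_tables and TestUtils.get_row_count to verify the written Hyper file. Comparable checks can be expressed directly with the Tableau Hyper API; a sketch under that assumption (the actual TestUtils implementation may differ):

from tableauhyperapi import Connection, HyperProcess, TableName, Telemetry


def get_tables_sketch(schema: str, hyper_path: str) -> list:
    """Illustration only: list the tables in a schema of a Hyper file."""
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint, database=hyper_path) as connection:
            return connection.catalog.get_table_names(schema)


def get_row_count_sketch(schema: str, table: str, hyper_path: str) -> int:
    """Illustration only: count the rows of a table in a Hyper file."""
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        with Connection(endpoint=hyper.endpoint, database=hyper_path) as connection:
            return connection.execute_scalar_query(
                f"SELECT COUNT(*) FROM {TableName(schema, table)}")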
Example #11
def get_spark_dataframe(sql: str) -> DataFrame:
    """Run a SQL statement against the active Spark session and return the result."""
    return get_spark_session().sql(sql)
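
A short usage sketch of this helper, mirroring the SQL-backed HyperFile in test_print_rows above (the view name is the one registered in that test):

# Hypothetical usage: resolve the global temp view registered in test_print_rows.
departments_df = get_spark_dataframe("SELECT * FROM global_temp.departments")
departments_df.show()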