Example #1
    def test_map(self):
        # 'a' is present in the labels map, so value_col receives 1.0.
        result = StringMap(
            labels={'a': 1.0},
            inputCol='key_col',
            outputCol='value_col',
        ).transform(self.input)
        expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
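For context, every example here refers to a shared test fixture (self.input, OUTPUT_SCHEMA, assert_df) that the listing does not show. Below is a minimal sketch of what it could look like, reconstructed from the expected DataFrames; the pass-through column name 'other_col' and the comparison helper are assumptions, not part of the original tests:

    import os
    import unittest

    from pyspark.sql import SparkSession

    # Inferred from the expected rows ['a', 'b', <double>]; 'other_col' as
    # the name of the untouched second column is an assumption.
    OUTPUT_SCHEMA = 'key_col: string, other_col: string, value_col: double'

    def assert_df(expected, actual):
        # Order-insensitive comparison; sufficient for these single-row frames.
        assert sorted(expected.collect()) == sorted(actual.collect())

    class StringMapTest(unittest.TestCase):
        spark = SparkSession.builder.master('local[1]').getOrCreate()

        def setUp(self):
            # One row: key_col='a' is looked up in the labels map,
            # other_col='b' passes through unchanged.
            self.input = StringMapTest.spark.createDataFrame(
                [['a', 'b']], 'key_col: string, other_col: string')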
Example #2
    def test_map_from_dataframe(self):
        # Build the labels map from a two-column (key, value) DataFrame
        # instead of a Python dict.
        labels_df = StringMapTest.spark.createDataFrame([['a', 1.0]], 'key_col: string, value_col: double')
        result = StringMap.from_dataframe(
            labels_df=labels_df,
            inputCol='key_col',
            outputCol='value_col'
        ).transform(self.input)
        expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
Example #3
    def test_serialize_to_bundle(self):
        # Round-trip the fitted pipeline through an MLeap bundle and verify
        # the deserialized pipeline still produces the same mapping.
        string_map = StringMap({'a': 1.0}, 'key_col', 'value_col')
        pipeline = Pipeline(stages=[string_map]).fit(self.input)
        pipeline_file = os.path.join(os.path.dirname(__file__), '..', '..',
                                     'target', 'test_serialize_to_bundle-pipeline.zip')
        _serialize_to_file(pipeline_file, self.input, pipeline)
        deserialized_pipeline = _deserialize_from_file(pipeline_file)
        result = deserialized_pipeline.transform(self.input)
        expected = self.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
Example #4
    def test_map_custom_default_value(self):
        # 'a' is not in the labels map; handleInvalid='keep' keeps the row
        # and fills value_col with the custom defaultValue.
        result = StringMap(
            labels={'z': 1.0},
            inputCol='key_col',
            outputCol='value_col',
            handleInvalid='keep',
            defaultValue=-1.0
        ).transform(self.input)
        expected = StringMapTest.spark.createDataFrame([['a', 'b', -1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
Example #5
    def test_serialize_to_bundle(self):
        string_map = StringMap(
            labels={'a': 1.0},
            inputCol='key_col',
            outputCol='value_col',
        )
        pipeline = Pipeline(stages=[string_map]).fit(self.input)
        serialization_dataset = pipeline.transform(self.input)

        # Serialize to an MLeap bundle, read it back, and check that the
        # deserialized pipeline behaves identically.
        jar_file_path = _serialize_to_file(pipeline, serialization_dataset)
        deserialized_pipeline = _deserialize_from_file(jar_file_path)

        result = deserialized_pipeline.transform(self.input)
        expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
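Examples #3 and #5 rely on _serialize_to_file and _deserialize_from_file helpers that the listing omits (note the signatures differ: #3 passes the target path in, #5 lets the helper pick one and return it). A minimal sketch matching Example #5's signature, assuming MLeap's PySpark serializer; the temp-directory bundle path is an assumption:

    import os
    import tempfile

    import mleap.pyspark  # noqa: F401
    from mleap.pyspark.spark_support import SimpleSparkSerializer  # noqa: F401
    from pyspark.ml import PipelineModel

    def _serialize_to_file(pipeline, dataset):
        # Importing mleap.pyspark.spark_support patches serializeToBundle
        # onto fitted Spark pipelines; the bundle is written as a zip archive.
        path = os.path.join(tempfile.gettempdir(), 'string_map-pipeline.zip')
        if os.path.exists(path):
            os.remove(path)
        uri = 'jar:file:' + path
        pipeline.serializeToBundle(uri, dataset)
        return uri

    def _deserialize_from_file(jar_file_path):
        # deserializeFromBundle is likewise patched onto PipelineModel.
        return PipelineModel.deserializeFromBundle(jar_file_path)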
Example #6
    def test_map_default_value(self):
        # With handleInvalid='keep' and no explicit defaultValue,
        # unmatched keys fall back to 0.0.
        result = StringMap({'z': 1.0}, 'key_col', 'value_col', handleInvalid='keep').transform(self.input)
        expected = self.spark.createDataFrame([['a', 'b', 0.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)
Example #7
    def test_map(self):
        # Same mapping as Example #1, but with positional constructor arguments.
        result = StringMap({'a': 1.0}, 'key_col', 'value_col').transform(self.input)
        expected = self.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
        assert_df(expected, result)