def test_map(self):
    result = StringMap(
        labels={'a': 1.0},
        inputCol='key_col',
        outputCol='value_col',
    ).transform(self.input)
    expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
    assert_df(expected, result)
def test_map_from_dataframe(self):
    labels_df = StringMapTest.spark.createDataFrame(
        [['a', 1.0]], 'key_col: string, value_col: double')
    result = StringMap.from_dataframe(
        labels_df=labels_df,
        inputCol='key_col',
        outputCol='value_col',
    ).transform(self.input)
    expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
    assert_df(expected, result)
def test_map_custom_default_value(self):
    result = StringMap(
        labels={'z': 1.0},
        inputCol='key_col',
        outputCol='value_col',
        handleInvalid='keep',
        defaultValue=-1.0,
    ).transform(self.input)
    expected = StringMapTest.spark.createDataFrame([['a', 'b', -1.0]], OUTPUT_SCHEMA)
    assert_df(expected, result)
def test_serialize_to_bundle(self):
    string_map = StringMap(
        labels={'a': 1.0},
        inputCol='key_col',
        outputCol='value_col',
    )
    pipeline = Pipeline(stages=[string_map]).fit(self.input)
    serialization_dataset = pipeline.transform(self.input)
    jar_file_path = _serialize_to_file(pipeline, serialization_dataset)
    deserialized_pipeline = _deserialize_from_file(jar_file_path)
    result = deserialized_pipeline.transform(self.input)
    expected = StringMapTest.spark.createDataFrame([['a', 'b', 1.0]], OUTPUT_SCHEMA)
    assert_df(expected, result)
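# _serialize_to_file and _deserialize_from_file are helpers defined outside this
# excerpt. A rough sketch of what they are assumed to do, using MLeap's PySpark
# serialization extensions (serializeToBundle / deserializeFromBundle); the real
# helpers, bundle path, and import layout may differ:
#
#     import os
#     import tempfile
#
#     from pyspark.ml import PipelineModel
#     # importing spark_support registers the serialize/deserialize extensions
#     from mleap.pyspark.spark_support import SimpleSparkSerializer  # noqa: F401
#
#     def _serialize_to_file(model, dataset):
#         # write the fitted pipeline to a local bundle and return its path
#         path = os.path.join(tempfile.mkdtemp(), 'pipeline.zip')
#         model.serializeToBundle('jar:file:' + path, dataset)
#         return path
#
#     def _deserialize_from_file(path):
#         # load the bundle back as a PipelineModel
#         return PipelineModel.deserializeFromBundle('jar:file:' + path)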
def test_map_default_value(self):
    result = StringMap(
        labels={'z': 1.0},
        inputCol='key_col',
        outputCol='value_col',
        handleInvalid='keep',
    ).transform(self.input)
    expected = self.spark.createDataFrame([['a', 'b', 0.0]], OUTPUT_SCHEMA)
    assert_df(expected, result)
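# These tests rely on shared fixtures (the Spark session, self.input, OUTPUT_SCHEMA,
# assert_df) defined elsewhere in the test case. A minimal sketch of what they are
# assumed to look like; the second input column name, the schema literal, and the
# assert_df comparison are guesses inferred from the expected DataFrames above:
#
#     import unittest
#
#     from pyspark.ml import Pipeline
#     from pyspark.sql import SparkSession
#
#     from mleap.pyspark.feature.string_map import StringMap
#
#     OUTPUT_SCHEMA = 'key_col: string, other_col: string, value_col: double'
#
#     def assert_df(expected, actual):
#         # compare the two DataFrames row by row
#         assert expected.collect() == actual.collect()
#
#     class StringMapTest(unittest.TestCase):
#         @classmethod
#         def setUpClass(cls):
#             cls.spark = SparkSession.builder.getOrCreate()
#
#         def setUp(self):
#             # single-row input keyed on 'a'; value_col is added by the transformer
#             self.input = StringMapTest.spark.createDataFrame(
#                 [['a', 'b']], 'key_col: string, other_col: string')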