def test_idf(self): dataset = self.spark.createDataFrame([ (DenseVector([1.0, 2.0]),), (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"]) idf0 = IDF(inputCol="tf") self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol]) idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"}) self.assertEqual(idf0m.uid, idf0.uid, "Model should inherit the UID from its parent estimator.") output = idf0m.transform(dataset) self.assertIsNotNone(output.head().idf) # Test that parameters transferred to Python Model check_params(self, idf0m)
def test_idf(self): dataset = self.spark.createDataFrame([(DenseVector([1.0, 2.0]), ), (DenseVector([0.0, 1.0]), ), (DenseVector([3.0, 0.2]), )], ["tf"]) idf0 = IDF(inputCol="tf") self.assertListEqual(idf0.params, [idf0.inputCol, idf0.minDocFreq, idf0.outputCol]) idf0m = idf0.fit(dataset, {idf0.outputCol: "idf"}) self.assertEqual( idf0m.uid, idf0.uid, "Model should inherit the UID from its parent estimator.") output = idf0m.transform(dataset) self.assertIsNotNone(output.head().idf) # Test that parameters transferred to Python Model check_params(self, idf0m)
def test_java_params(self): import pyspark.ml.feature import pyspark.ml.classification import pyspark.ml.clustering import pyspark.ml.evaluation import pyspark.ml.pipeline import pyspark.ml.recommendation import pyspark.ml.regression modules = [ pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation, pyspark.ml.regression ] for module in modules: for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and not name.endswith('Params') \ and issubclass(cls, JavaParams) and not inspect.isabstract(cls): # NOTE: disable check_params_exist until there is parity with Scala API check_params(self, cls(), check_params_exist=False) # Additional classes that need explicit construction from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'), check_params_exist=False) check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'), check_params_exist=False)
def test_java_params(self): import pyspark.ml.feature import pyspark.ml.classification import pyspark.ml.clustering import pyspark.ml.evaluation import pyspark.ml.pipeline import pyspark.ml.recommendation import pyspark.ml.regression modules = [pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation, pyspark.ml.regression] for module in modules: for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and not name.endswith('Params') \ and issubclass(cls, JavaParams) and not inspect.isabstract(cls): # NOTE: disable check_params_exist until there is parity with Scala API check_params(self, cls(), check_params_exist=False) # Additional classes that need explicit construction from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'), check_params_exist=False) check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'), check_params_exist=False)
def test_java_params(self): import re import pyspark.ml.feature import pyspark.ml.classification import pyspark.ml.clustering import pyspark.ml.evaluation import pyspark.ml.pipeline import pyspark.ml.recommendation import pyspark.ml.regression modules = [ pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation, pyspark.ml.regression, ] for module in modules: for name, cls in inspect.getmembers(module, inspect.isclass): if (not name.endswith("Model") and not name.endswith("Params") and issubclass(cls, JavaParams) and not inspect.isabstract(cls) and not re.match("_?Java", name) and name != "_LSH" and name != "_Selector"): check_params(self, cls(), check_params_exist=True) # Additional classes that need explicit construction from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel check_params(self, CountVectorizerModel.from_vocabulary(["a"], "input"), check_params_exist=True) check_params(self, StringIndexerModel.from_labels(["a", "b"], "input"), check_params_exist=True)