def test_count_vectorizer_from_vocab(self):
    model = CountVectorizerModel.from_vocabulary(
        ["a", "b", "c"], inputCol="words", outputCol="features", minTF=2
    )
    self.assertEqual(model.vocabulary, ["a", "b", "c"])
    self.assertEqual(model.getMinTF(), 2)

    dataset = self.spark.createDataFrame(
        [
            (0, "a a a b b c".split(" "), SparseVector(3, {0: 3.0, 1: 2.0})),
            (1, "a a".split(" "), SparseVector(3, {0: 2.0})),
            # With minTF=2, terms occurring fewer than twice in a row are
            # dropped, so this row maps to an empty vector.
            (2, "a b".split(" "), SparseVector(3, {})),
        ],
        ["id", "words", "expected"],
    )

    transformed_list = model.transform(dataset).select("features", "expected").collect()
    for r in transformed_list:
        feature, expected = r
        self.assertEqual(feature, expected)

    # Test an empty vocabulary
    with QuietTest(self.sc):
        with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"):
            CountVectorizerModel.from_vocabulary([], inputCol="words")

    # Test model with default settings can transform
    model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words")
    transformed_list = (
        model_default.transform(dataset)
        .select(model_default.getOrDefault(model_default.outputCol))
        .collect()
    )
    self.assertEqual(len(transformed_list), 3)

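# For contrast, a brief sketch of the conventional route: fitting a
# CountVectorizer estimator so the vocabulary is learned from data rather than
# supplied up front via from_vocabulary. Assumes an active SparkSession named
# `spark`; the DataFrame here is illustrative only.
from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(["a", "a", "b", "c"],)], ["words"])
cv = CountVectorizer(inputCol="words", outputCol="features", minTF=2.0)
fitted = cv.fit(df)  # vocabulary is ordered by descending corpus frequency
fitted.transform(df).show(truncate=False)
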
def test_java_params(self):
    import pyspark.ml.feature
    import pyspark.ml.classification
    import pyspark.ml.clustering
    import pyspark.ml.evaluation
    import pyspark.ml.pipeline
    import pyspark.ml.recommendation
    import pyspark.ml.regression

    modules = [
        pyspark.ml.feature, pyspark.ml.classification, pyspark.ml.clustering,
        pyspark.ml.evaluation, pyspark.ml.pipeline, pyspark.ml.recommendation,
        pyspark.ml.regression,
    ]
    for module in modules:
        for name, cls in inspect.getmembers(module, inspect.isclass):
            if not name.endswith('Model') and not name.endswith('Params') \
                    and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                # NOTE: disable check_params_exist until there is parity with Scala API
                check_params(self, cls(), check_params_exist=False)

    # Additional classes that need explicit construction
    from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
    check_params(self, CountVectorizerModel.from_vocabulary(['a'], 'input'),
                 check_params_exist=False)
    check_params(self, StringIndexerModel.from_labels(['a', 'b'], 'input'),
                 check_params_exist=False)

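# A rough sketch of the kind of invariant a check_params-style helper can
# assert for a JavaParams instance: every declared Param is reachable as an
# attribute of the same name, and any declared default is materializable.
# This is an illustrative stand-in, not pyspark's actual test utility.
def check_params_sketch(testcase, py_stage):
    for p in py_stage.params:
        testcase.assertEqual(p.name, getattr(py_stage, p.name).name)
        if py_stage.hasDefault(p):
            py_stage.getOrDefault(p)  # defaults must resolve without error
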
def test_java_params(self):
    import re

    import pyspark.ml.feature
    import pyspark.ml.classification
    import pyspark.ml.clustering
    import pyspark.ml.evaluation
    import pyspark.ml.pipeline
    import pyspark.ml.recommendation
    import pyspark.ml.regression

    modules = [
        pyspark.ml.feature,
        pyspark.ml.classification,
        pyspark.ml.clustering,
        pyspark.ml.evaluation,
        pyspark.ml.pipeline,
        pyspark.ml.recommendation,
        pyspark.ml.regression,
    ]
    for module in modules:
        for name, cls in inspect.getmembers(module, inspect.isclass):
            if (
                not name.endswith("Model")
                and not name.endswith("Params")
                and issubclass(cls, JavaParams)
                and not inspect.isabstract(cls)
                and not re.match("_?Java", name)
                and name != "_LSH"
                and name != "_Selector"
            ):
                check_params(self, cls(), check_params_exist=True)

    # Additional classes that need explicit construction
    from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
    check_params(self, CountVectorizerModel.from_vocabulary(["a"], "input"),
                 check_params_exist=True)
    check_params(self, StringIndexerModel.from_labels(["a", "b"], "input"),
                 check_params_exist=True)

def multionehot(self, df, column):
    """Multi-hot encode an array-of-strings column by building a
    CountVectorizerModel directly from the distinct values. Scala
    equivalent, for reference:

    // Prepare training documents from a list of (id, text, label) tuples.
    val data = spark.createDataFrame(Seq(
        (0L, Seq("A", "B")),
        (1L, Seq("B")),
        (2L, Seq.empty),
        (3L, Seq("D", "E"))
    )).toDF("id", "categories")

    // Get distinct tags array
    val tags = data
        .flatMap(r ⇒ r.getAs[Seq[String]]("categories"))
        .distinct()
        .collect()
        .sortWith(_ < _)

    val cvmData = new CountVectorizerModel(tags)
        .setInputCol("categories")
        .setOutputCol("sparseFeatures")
        .transform(data)

    val asDense = udf((v: Vector) ⇒ v.toDense)

    cvmData
        .withColumn("features", asDense($"sparseFeatures"))
        .select("id", "categories", "features")
        .show()

    :param df: input DataFrame
    :param column: name of the array<string> column to encode
    :return: DataFrame with an added integer multi-hot array column
        ``features_vec``
    """
    from pyspark.ml.feature import CountVectorizerModel
    from pyspark.sql.functions import udf
    from pyspark.sql.types import ArrayType, IntegerType

    # Collect the distinct category values; rows whose array is null
    # contribute nothing.
    categories = list(set(
        df.select(column).distinct().rdd
        .flatMap(lambda row: row[0] if row[0] is not None else [])
        .collect()
    ))
    categories.sort()

    # Build the model straight from the vocabulary; no fitting pass needed.
    cvm = CountVectorizerModel.from_vocabulary(
        categories, inputCol=column, outputCol=column + "_sparse_vec"
    ).transform(df)

    @udf(ArrayType(IntegerType()))
    def toDense(v):
        # Densify the sparse count vector into a plain list of ints.
        return [int(x) for x in v.toArray()]

    result = cvm.withColumn("features_vec", toDense(column + "_sparse_vec"))
    return result.drop(column + "_sparse_vec")

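# A minimal usage sketch for multionehot, assuming an active SparkSession
# named `spark` and that the method lives on some helper instance `encoder`
# (its enclosing class is not shown above, so both names are hypothetical).
df = spark.createDataFrame(
    [(0, ["A", "B"]), (1, ["B"]), (2, []), (3, ["D", "E"])],
    "id: bigint, categories: array<string>",
)
result = encoder.multionehot(df, "categories")
result.show(truncate=False)
# Over the sorted vocabulary ["A", "B", "D", "E"], row 0 should yield a
# features_vec of [1, 1, 0, 0] and row 2 an all-zero vector.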