Example #1
    def test_count_vectorizer_from_vocab(self):
        model = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words",
                                                     outputCol="features", minTF=2)
        self.assertEqual(model.vocabulary, ["a", "b", "c"])
        self.assertEqual(model.getMinTF(), 2)

        dataset = self.spark.createDataFrame([
            (0, "a a a b b c".split(' '), SparseVector(3, {0: 3.0, 1: 2.0}),),
            (1, "a a".split(' '), SparseVector(3, {0: 2.0}),),
            (2, "a b".split(' '), SparseVector(3, {}),)], ["id", "words", "expected"])

        transformed_list = model.transform(dataset).select("features", "expected").collect()

        for r in transformed_list:
            feature, expected = r
            self.assertEqual(feature, expected)

        # Test an empty vocabulary
        with QuietTest(self.sc):
            with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"):
                CountVectorizerModel.from_vocabulary([], inputCol="words")

        # Test model with default settings can transform
        model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words")
        transformed_list = model_default.transform(dataset) \
            .select(model_default.getOrDefault(model_default.outputCol)).collect()
        self.assertEqual(len(transformed_list), 3)
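A minimal standalone sketch of the same minTF behaviour, outside the test harness (the local SparkSession here is an assumption for illustration, not part of the test above):

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import CountVectorizerModel

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    model = CountVectorizerModel.from_vocabulary(
        ["a", "b", "c"], inputCol="words", outputCol="features", minTF=2)
    df = spark.createDataFrame([(["a", "a", "b", "c"],)], ["words"])
    # minTF=2 keeps only terms occurring at least twice per row, so "b"
    # and "c" are dropped and the output row is (3,[0],[2.0]).
    model.transform(df).show(truncate=False)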
Example #2
    def test_java_params(self):
        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [
            pyspark.ml.feature, pyspark.ml.classification,
            pyspark.ml.clustering, pyspark.ml.evaluation, pyspark.ml.pipeline,
            pyspark.ml.recommendation, pyspark.ml.regression
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if not name.endswith('Model') and not name.endswith('Params') \
                        and issubclass(cls, JavaParams) and not inspect.isabstract(cls):
                    # NOTE: disable check_params_exist until there is parity with Scala API
                    check_params(self, cls(), check_params_exist=False)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel
        check_params(self,
                     CountVectorizerModel.from_vocabulary(['a'], 'input'),
                     check_params_exist=False)
        check_params(self,
                     StringIndexerModel.from_labels(['a', 'b'], 'input'),
                     check_params_exist=False)
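check_params here is a helper from Spark's own test suite, not a public pyspark API. A minimal sketch of the core of such a check, assuming the wrapped JVM peer is reachable via the internal _java_obj attribute (check_params_sketch and its body are illustrative, not Spark's actual helper):

    def check_params_sketch(test, py_stage):
        # Every Python-side Param should have a counterpart on the JVM peer;
        # org.apache.spark.ml.param.Params exposes hasParam(name).
        java_obj = py_stage._java_obj
        for p in py_stage.params:
            test.assertTrue(java_obj.hasParam(p.name))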
Example #3
    def test_count_vectorizer_from_vocab(self):
        model = CountVectorizerModel.from_vocabulary(
            ["a", "b", "c"], inputCol="words", outputCol="features", minTF=2
        )
        self.assertEqual(model.vocabulary, ["a", "b", "c"])
        self.assertEqual(model.getMinTF(), 2)

        dataset = self.spark.createDataFrame(
            [
                (
                    0,
                    "a a a b b c".split(" "),
                    SparseVector(3, {0: 3.0, 1: 2.0}),
                ),
                (
                    1,
                    "a a".split(" "),
                    SparseVector(3, {0: 2.0}),
                ),
                (
                    2,
                    "a b".split(" "),
                    SparseVector(3, {}),
                ),
            ],
            ["id", "words", "expected"],
        )

        transformed_list = model.transform(dataset).select("features", "expected").collect()

        for r in transformed_list:
            feature, expected = r
            self.assertEqual(feature, expected)

        # Test an empty vocabulary
        with QuietTest(self.sc):
            with self.assertRaisesRegex(Exception, "vocabSize.*invalid.*0"):
                CountVectorizerModel.from_vocabulary([], inputCol="words")

        # Test model with default settings can transform
        model_default = CountVectorizerModel.from_vocabulary(["a", "b", "c"], inputCol="words")
        transformed_list = (
            model_default.transform(dataset)
            .select(model_default.getOrDefault(model_default.outputCol))
            .collect()
        )
        self.assertEqual(len(transformed_list), 3)
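The assertions above rely on value-based vector equality; in particular, SparseVector(3, {}) with an empty index map is simply the zero vector in three dimensions. A quick illustration:

    from pyspark.ml.linalg import SparseVector

    v = SparseVector(3, {})
    assert v.toArray().tolist() == [0.0, 0.0, 0.0]
    assert SparseVector(3, {0: 2.0}) == SparseVector(3, {0: 2.0})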
Example #4
    def test_java_params(self):
        import re

        import pyspark.ml.feature
        import pyspark.ml.classification
        import pyspark.ml.clustering
        import pyspark.ml.evaluation
        import pyspark.ml.pipeline
        import pyspark.ml.recommendation
        import pyspark.ml.regression

        modules = [
            pyspark.ml.feature,
            pyspark.ml.classification,
            pyspark.ml.clustering,
            pyspark.ml.evaluation,
            pyspark.ml.pipeline,
            pyspark.ml.recommendation,
            pyspark.ml.regression,
        ]
        for module in modules:
            for name, cls in inspect.getmembers(module, inspect.isclass):
                if (not name.endswith("Model") and not name.endswith("Params")
                        and issubclass(cls, JavaParams)
                        and not inspect.isabstract(cls)
                        and not re.match("_?Java", name) and name != "_LSH"
                        and name != "_Selector"):
                    check_params(self, cls(), check_params_exist=True)

        # Additional classes that need explicit construction
        from pyspark.ml.feature import CountVectorizerModel, StringIndexerModel

        check_params(self,
                     CountVectorizerModel.from_vocabulary(["a"], "input"),
                     check_params_exist=True)
        check_params(self,
                     StringIndexerModel.from_labels(["a", "b"], "input"),
                     check_params_exist=True)
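StringIndexerModel.from_labels is the StringIndexerModel analogue of from_vocabulary: it pins the label-to-index mapping up front instead of fitting it from data. A minimal sketch (the local SparkSession is an assumption):

    from pyspark.sql import SparkSession
    from pyspark.ml.feature import StringIndexerModel

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    model = StringIndexerModel.from_labels(
        ["a", "b"], inputCol="label", outputCol="indexed")
    df = spark.createDataFrame([("a",), ("b",), ("a",)], ["label"])
    # "a" -> 0.0 and "b" -> 1.0, fixed by the order of the labels list
    model.transform(df).show()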
Example #5
    def multionehot(self, df, column):
        """
        // Prepare training documents from a list of (id, text, label) tuples.
        val data = spark.createDataFrame(Seq(
          (0L, Seq("A", "B")),
          (1L, Seq("B")),
          (2L, Seq.empty),
          (3L, Seq("D", "E"))
        )).toDF("id", "categories")

        // Get distinct tags array
        val tags = data
          .flatMap(r ⇒ r.getAs[Seq[String]]("categories"))
          .distinct()
          .collect()
          .sortWith(_ < _)

        val cvmData = new CountVectorizerModel(tags)
          .setInputCol("categories")
          .setOutputCol("sparseFeatures")
          .transform(data)

        val asDense = udf((v: Vector) ⇒ v.toDense)

        cvmData
          .withColumn("features", asDense($"sparseFeatures"))
          .select("id", "categories", "features")
          .show()

        :param df: input DataFrame
        :param column: name of the array-of-strings column to encode
        :return: `df` with `column` encoded into an integer array column
            named "features_vec"
        """
        from pyspark.ml.feature import CountVectorizerModel
        from pyspark.sql.functions import udf
        from pyspark.sql.types import ArrayType, IntegerType

        # Build the vocabulary: each row holds a list of tags, so flatMap
        # flattens the distinct rows into a single sorted category list.
        categories = list(
            set(
                df.select(column).distinct().rdd.flatMap(
                    lambda x: x[0] if x[0] is not None else []).collect()))
        categories.sort()
        cvm = CountVectorizerModel.from_vocabulary(
            categories, inputCol=column,
            outputCol=column + "_sparse_vec").transform(df)

        # Expand the sparse count vector into a plain list of ints so the
        # encoded column survives without Vector types downstream.
        @udf(ArrayType(IntegerType()))
        def toDense(v):
            return [int(x) for x in v.toArray()]

        result = cvm.withColumn('features_vec',
                                toDense(column + "_sparse_vec"))
        result = result.drop(column + "_sparse_vec")

        return result
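A hedged usage sketch for multionehot, assuming it is a method on a class that already holds a SparkSession as self.spark (the data mirrors the Scala snippet in the docstring):

    df = self.spark.createDataFrame(
        [(0, ["A", "B"]), (1, ["B"]), (2, []), (3, ["D", "E"])],
        "id: bigint, categories: array<string>")
    encoded = self.multionehot(df, "categories")
    # "categories" is encoded against the sorted distinct tag vocabulary
    # ["A", "B", "D", "E"]; features_vec holds the per-row counts.
    encoded.show(truncate=False)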