# Assumed imports; the helper modules (arrays, strings, spacy, distance,
# dataframe, to_dicts) are project-local and their import paths are not
# shown in the source, so only the standard pyspark imports appear here.
import pyspark.sql.functions as F
from pyspark.sql import DataFrame, SQLContext
from pyspark.sql.types import ArrayType, StringType, StructType, StructField


class TestSparkHelpers:
    # Hypothetical class name; `spark` is assumed to be a pytest fixture, and
    # self.spacy_version() / self.validate_to_decimal_places() are helpers
    # defined elsewhere in the suite.

    def test_length(self, spark):
        cols = ["col_a"]
        data = [(["a", "b", "c"],), (["x", "y"],)]
        df = spark.createDataFrame(data, cols)
        res = df.withColumn("col_length", arrays.length(df["col_a"]))
        expected = [3, 2]
        actual = to_dicts(res)

        assert [i["col_length"] for i in actual] == expected
    def test_array_intersection(self, spark):
        cols = ["col_a", "col_b"]
        data = [
            (["a", "b", "c", "a"], ["a", "b"]),
            (["a", "a", "d", "e"], ["d"]),
            (["d", "b", "c", "a"], ["b", "c", "d"]),
            (["apple", "b", "a", "d"], ["apple", "d"]),
        ]
        df = spark.createDataFrame(data, cols)
        df_intersection = df.withColumn(
            "col_intersection", arrays.array_intersection(df["col_a"], df["col_b"])
        )
        expected = [["a", "b"], ["d"], ["d", "b", "c"], ["apple", "d"]]
        actual = to_dicts(df_intersection)
        assert [i["col_intersection"] for i in actual] == expected
    def test_string_length(self, spark):
        original_col = "column_a"
        updated_col = "length_column_a"

        data = [("a",), ("ab",), ("abc",), ("abcd",)]

        raw = spark.createDataFrame(data, [original_col])
        df = strings.count_str_length(original_col, updated_col, raw)
        assert df.count() == 4

        res = to_dicts(df.orderBy(updated_col))

        names = [i[original_col] for i in res]
        lengths = [i[updated_col] for i in res]

        assert lengths == [1, 2, 3, 4]
        assert names == ["a", "ab", "abc", "abcd"]
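
    # A hedged sketch of strings.count_str_length, assuming it appends a
    # column holding F.length (character count) of the source column; the
    # real helper may differ in naming and null handling.
    @staticmethod
    def _count_str_length_sketch(src: str, dest: str, df: DataFrame) -> DataFrame:
        return df.withColumn(dest, F.length(F.col(src)))
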
    def test_spacy_cosine_similarity(self, spark: SQLContext):
        """Confirm that the pyspark cosine calculations are
        the same as the spacy cosine calculations"""

        id_col = "document_id"
        text_col = "document_text"
        primary_col = "primary_col"
        secondary_col = "secondary_col"
        output_col = "output_col"

        data = [
            (0, "ale"),
            (1, "rum"),
            (2, "mojito"),
            (3, "beer"),
            (4, "lager"),
            (5, "vodka"),
        ]

        df = spark.createDataFrame(data, [id_col, text_col])

        docs = spacy.get_spacy_docs(id_col,
                                    text_col,
                                    df,
                                    spacy_model_version=self.spacy_version())

        vectors = spacy.extract_document_vectors(docs)

        # pair the first document's vector with every document's vector
        # (including itself) so each row compares doc 0 against doc i
        input_data = [(vectors[0][1], vectors[i][1]) for i in range(len(vectors))]

        df = spark.createDataFrame(input_data, [primary_col, secondary_col])
        res = distance.cosine_similarity(primary_col, secondary_col,
                                         output_col, df)

        actual = [i[output_col] for i in to_dicts(res)]
        expected = [docs[0].similarity(doc) for doc in docs]

        # spacy and pyspark must give the same value to at least 6 decimal places
        self.validate_to_decimal_places(actual, expected, decimal_places=6)
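
    # A hedged sketch of distance.cosine_similarity:
    # cos(a, b) = dot(a, b) / (||a|| * ||b||). The project's helper works on
    # pyspark columns; this illustrative version uses a plain Python UDF.
    @staticmethod
    def _cosine_similarity_sketch(primary: str, secondary: str,
                                  output: str, df: DataFrame) -> DataFrame:
        import math

        @F.udf("double")
        def cos_sim(a, b):
            dot = sum(x * y for x, y in zip(a, b))
            norms = (math.sqrt(sum(x * x for x in a))
                     * math.sqrt(sum(y * y for y in b)))
            return float(dot / norms) if norms else 0.0

        return df.withColumn(output, cos_sim(F.col(primary), F.col(secondary)))
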
    def test_array_union(self, spark):
        cols = ["col_a", "col_b"]
        data = [
            (["a", "b", "c"], ["a", "d"]),
            (["f", "g", "h"], ["e", "d"]),
            (["q", "a", "c"], ["a", "q", "f"]),
            (["p", "o", "c"], ["r", "t", "c"]),
        ]
        df = spark.createDataFrame(data, cols)
        df_union = df.withColumn(
            "col_union", arrays.array_union(df["col_a"], df["col_b"])
        )
        expected = [
            ["a", "b", "c", "d"],
            ["f", "g", "h", "e", "d"],
            ["q", "a", "c", "f"],
            ["p", "o", "c", "r", "t"],
        ]
        actual = to_dicts(df_union)

        assert [i["col_union"] for i in actual] == expected
    def test_merge_collected_sets(self, spark):
        """Merge collected sets into a set of unique values"""

        col_a = "col_a"
        col_b = "col_b"
        col_c = "col_c"

        data = [
            (["a", "b", "c", "d"], ["c", "d", "e"]),
            (["x", "y", "z"], ["x"]),
            (None, ["foo", "bar", "baz"]),
            (["random"], None),
            (None, None),
        ]

        expected = [
            {"a", "b", "c", "d", "e"},
            {"x", "y", "z"},
            {"foo", "bar", "baz"},
            {"random"},
            set(),
        ]

        schema = StructType(
            [
                StructField(col_a, ArrayType(StringType())),
                StructField(col_b, ArrayType(StringType())),
            ]
        )

        input_df = spark.createDataFrame(data, schema)
        output_df = input_df.select(
            arrays.merge_collected_sets(F.col(col_a), F.col(col_b)).alias(col_c)
        )

        res = [set(i[col_c]) for i in to_dicts(output_df)]

        assert res == expected
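
    # A hedged sketch of arrays.merge_collected_sets, assuming null-safe
    # concatenation followed by de-duplication: coalesce each side to an
    # empty string array, concat, then F.array_distinct.
    @staticmethod
    def _merge_collected_sets_sketch(col_a, col_b):
        empty = F.array().cast("array<string>")
        return F.array_distinct(F.concat(F.coalesce(col_a, empty),
                                         F.coalesce(col_b, empty)))
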
    def test_to_dicts(self, spark):
        cols = ["col_a", "col_b", "col_c"]
        data = [("a", 2, "c"), ("d", 5, "f"), ("g", 8, "i")]
        df = spark.createDataFrame(data, cols)
        expected = [
            {
                "col_a": "a",
                "col_b": 2,
                "col_c": "c"
            },
            {
                "col_a": "d",
                "col_b": 5,
                "col_c": "f"
            },
            {
                "col_a": "g",
                "col_b": 8,
                "col_c": "i"
            },
        ]

        assert dataframe.to_dicts(df) == expected
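
    # A minimal sketch of the to_dicts helper used throughout: collect the
    # DataFrame and turn each Row into a plain dict (assumption, not the
    # project's exact implementation).
    @staticmethod
    def _to_dicts_sketch(df: DataFrame):
        return [row.asDict() for row in df.collect()]
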
    def _get_transformed_data(self, df: DataFrame, col_name: str):
        """Collect a single column from a transformed DataFrame as a list."""
        return [i[col_name] for i in to_dicts(df)]