    def test_recursive_struct_validation(self, spark: SQLContext):

        nested_nested_schema = StructType([
            StructField("num_2", IntegerType()),
            StructField("arr_2", ArrayType(StringType())),
        ])

        nested_schema = StructType([
            StructField("num_1", IntegerType()),
            StructField("arr_1", ArrayType(StringType())),
            StructField("col_c", nested_nested_schema),
        ])

        schema = StructType([
            StructField("col_a", nested_schema),
            StructField("col_b", nested_schema)
        ])

        # rows are represented as tuples, whereas arrays are lists
        a = [
            (
                (1, ["a1", "b1", "c1"], (11, ["x1", "y1", "z1"])),
                (2, ["a2", "b2", "c2"], (12, ["x2", "y2", "z2"])),
            ),
            (
                (3, ["a3", "b3", "c3"], (13, ["x3", "y3", "z3"])),
                (4, ["a4", "b4", "c4"], (14, ["x4", "y4", "z4"])),
            ),
        ]

        # same as a but with one wrong val
        b = [
            (
                (1, ["a1", "b1", "c1"], (11, ["x1", "y1", "z1"])),
                (2, ["a2", "b2", "c2"], (12, ["x2", "y2", "z2"])),
            ),
            (
                (3, ["a3", "b3", "c3"], (13, ["x3", "y3", "WRONG VAL"])),
                (4, ["a4", "b4", "c4"], (14, ["x4", "y4", "z4"])),
            ),
        ]

        df = spark.createDataFrame(a, schema)

        print("res")
        print(to_tuples(df))
        # should be the exact specification defined by the to_tuples helper
        assert to_tuples(df) == a

        # should pass the validation helper
        validators.validate_values(df, schema, a)

        # should fail with an incorrect nested value
        with pytest.raises(ValueError):
            validators.validate_values(df, schema, b)
Example 2
    def test_token_vectors_pipeline(self, spark: SQLContext):
        input_data = [
            ("foo bar baz biz", ),
            ("foo    baz   bar", ),
            ("bar baz  ", ),
            ("  foo  biz  ", ),
            ("", ),
            (None, ),
        ]

        raw = spark.createDataFrame(input_data, ["text"])
        res = tfidf.token_vectors_pipeline("text", "vectors", raw)

        actual = to_tuples(res.select("vectors"))

        row_0 = set(actual[0][0])
        row_1 = set(actual[1][0])
        row_2 = set(actual[2][0])
        row_3 = set(actual[3][0])
        row_4 = set(actual[4][0])
        row_5 = set(actual[5][0])

        assert len(row_0) == 4
        assert len(row_1) == 3
        assert len(row_2) == 2
        assert len(row_3) == 2
        assert len(row_4) == 0
        assert len(row_5) == 0

        assert row_1.issubset(row_0)
        assert row_2.issubset(row_0)
        assert row_3.issubset(row_0)

        assert len(row_1.intersection(row_2)) == 2
        assert len(row_2.intersection(row_3)) == 0
Example 3
    def test_tf_ngrams_pipeline(self, spark: SQLContext):
        input_data = [
            ("foo bar baz biz", ),
            ("foo baz bar", ),
            ("bar baz", ),
            ("foo biz", ),
            ("", ),
            (None, ),
        ]
        raw = spark.createDataFrame(input_data, ["text"])
        res = tfidf.tf_ngrams_pipeline("text", "vectors", raw)

        actual = [i[0] for i in to_tuples(res.select("vectors"))]

        for v in actual:
            assert isinstance(v, SparseVector)

        row_0 = set(actual[0].indices)
        row_1 = set(actual[1].indices)
        row_2 = set(actual[2].indices)
        row_3 = set(actual[3].indices)
        row_4 = set(actual[4].indices)
        row_5 = set(actual[5].indices)

        assert row_1.issubset(row_0)
        assert row_2.issubset(row_0)
        assert row_3.issubset(row_0)

        assert len(row_4) == 0
        assert len(row_5) == 0
Example 4
def get_spacy_docs(
    document_id_col: str,
    document_text_col: str,
    df: DataFrame,
    spacy_model_version="en_core_web_lg",
):
    """Retrieve the spacy docs as a dataframe. Note that this is done in the driver"""
    log.info("initate spacy pipeline")

    # select both the document id (can be a row number for instance)
    # as well as the raw document text
    raw = to_tuples(df.select(F.col(document_id_col), F.col(document_text_col)))

    # load spacy
    nlp = get_spacy(spacy_model_version)

    # each entry of raw is an (id, text) tuple; keep the text, substituting
    # an empty string for any non-string (e.g. null) values
    raw_texts = [i if isinstance(i, str) else "" for _, i in raw]

    # use the spacy pipe method to process all the docs at once
    docs = list(nlp.pipe(raw_texts))

    # set the id as an "extension attribute" on each doc object
    Doc.set_extension(DOCUMENT_ID, default=None)

    for i in range(len(raw_texts)):
        docs[i]._.document_id = raw[i][0]

    return docs
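
# NOTE: a hypothetical usage sketch of get_spacy_docs. The sample data, column
# names and the availability of the default spaCy model are assumptions; only
# get_spacy_docs and the document_id extension come from the function above.
sample_df = spark.createDataFrame(
    [(1, "Alice visited Sydney."), (2, None)], ["doc_id", "text"]
)
docs = get_spacy_docs("doc_id", "text", sample_df)

for doc in docs:
    # the row id travels with each Doc via the extension attribute set above
    print(doc._.document_id, [ent.text for ent in doc.ents])

# Doc.set_extension raises an error if the same extension is registered twice,
# so calling get_spacy_docs repeatedly in one process would need force=True or
# an existence check around the set_extension call.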
Example 5
    def test_to_tuples(self, spark):
        cols = ["col_a", "col_b", "col_c"]
        data = [("a", 2, "c"), ("d", 5, "f"), ("g", 8, "i")]
        df = spark.createDataFrame(data, cols)
        expected = [("a", 2, "c"), ("d", 5, "f"), ("g", 8, "i")]

        assert dataframe.to_tuples(df) == expected
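
# NOTE: the to_tuples helper is not part of this excerpt. A minimal sketch that
# is consistent with the tests here (nested rows become tuples, arrays stay
# lists) and that simply collects to the driver might look like this.
from pyspark.sql import DataFrame, Row


def to_tuples(df: DataFrame) -> list:
    """Collect a DataFrame into plain Python tuples (illustrative sketch only)."""

    def convert(value):
        # Row objects, including nested structs, become tuples; arrays stay lists
        if isinstance(value, Row):
            return tuple(convert(v) for v in value)
        if isinstance(value, list):
            return [convert(v) for v in value]
        return value

    return [convert(row) for row in df.collect()]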
Example 6
    def test_remove_empty_strings(self, spark):
        data = [(["foo", "bar", "", None],), (["", None],), (None,)]

        df = spark.createDataFrame(data, ["input"])
        res = df.withColumn("output", arrays.remove_empty_strings(F.col("input")))

        expected = [a + b for a, b in zip(data, [(["foo", "bar"],), ([],), ([],)])]

        assert to_tuples(res) == expected
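
# NOTE: arrays.remove_empty_strings is not included in this excerpt. A sketch
# matching the expected output above (empty and null elements dropped, a null
# array coalesced to an empty one), assuming Spark 3.1+ for F.filter:
from pyspark.sql import Column, functions as F


def remove_empty_strings(col: Column) -> Column:
    # drop null and empty-string elements from the array column
    cleaned = F.filter(col, lambda x: x.isNotNull() & (x != F.lit("")))
    # a null input array becomes an empty array rather than staying null
    return F.coalesce(cleaned, F.expr("cast(array() as array<string>)"))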
Example 7
    def test_utc_timestamps(self, spark):

        t = timestamp.utcnow()
        e = timestamp.format_timestamp(t)

        data = [("a",), ("b",), ("c",)]
        raw = spark.createDataFrame(data, ["key"])

        df = timestamp.with_timestamp("val", t, raw)

        for row in dataframe.to_tuples(df):
            assert row[1] == e
Example 8
    def test_snowball_stemmer(self, spark: SQLContext):
        input_col = "tokens"
        output_col = "res"
        raw = self._get_stemmer_input(spark, input_col)
        df = tokens.snowball_tokens(input_col, output_col, raw)

        expected = [
            (["i", "may", "be", "use"], ),
            (["a", "simplist", "stem", "algorithm"], ),
            (["but", "the", "result", "are", "great"], ),
        ]

        assert to_tuples(df.select(output_col)) == expected
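
# NOTE: tokens.snowball_tokens is not shown in this excerpt. One plausible
# implementation consistent with the expected stems above, assuming NLTK's
# SnowballStemmer wrapped in a UDF (the real module may differ):
from nltk.stem.snowball import SnowballStemmer
from pyspark.sql import DataFrame, functions as F, types as T


def snowball_tokens(input_col: str, output_col: str, df: DataFrame) -> DataFrame:
    stemmer = SnowballStemmer("english")

    @F.udf(T.ArrayType(T.StringType()))
    def stem_tokens(tokens):
        # stem each token, passing null arrays through untouched
        return None if tokens is None else [stemmer.stem(t) for t in tokens]

    return df.withColumn(output_col, stem_tokens(F.col(input_col)))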
Example 9
    def test_non_utc_timestamps(self, spark):
        au = pytz.timezone("Australia/Sydney")
        t1 = timestamp.utcnow()
        e = timestamp.format_timestamp(t1)
        t2 = t1.astimezone(au)

        data = [("a",), ("b",), ("c",)]
        raw = spark.createDataFrame(data, ["key"])

        df = timestamp.with_timestamp("val", t2, raw)

        for row in dataframe.to_tuples(df):
            assert row[1] == e
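
# NOTE: both timestamp tests only pass if format_timestamp normalises to UTC
# before formatting. A sketch of what with_timestamp and format_timestamp might
# look like; the format string and the assumption that timestamp.utcnow()
# returns a timezone-aware datetime are mine, not from the source.
from datetime import datetime, timezone
from pyspark.sql import DataFrame, functions as F


def format_timestamp(ts: datetime) -> str:
    # normalise to UTC so the same instant always formats identically,
    # regardless of the timezone attached to the input
    return ts.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def with_timestamp(col_name: str, ts: datetime, df: DataFrame) -> DataFrame:
    # attach the formatted timestamp as a constant string column
    return df.withColumn(col_name, F.lit(format_timestamp(ts)))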
Example 10
    def test_normalize_dense_vectors(self, spark: SQLContext):

        input_data = [(Vectors.dense([1, 4, 16]), ), (Vectors.dense([1, 0, 9]), )]
        df = spark.createDataFrame(input_data, ["vectors"])
        res = vectors.normalize_vectors("vectors", "normalized",
                                        df).select("normalized")

        vals = [i[0].toArray() for i in to_tuples(res)]

        # after being normalized the magnitude of each vector should be 1
        magnitudes = [np.linalg.norm(v) for v in vals]
        expected = [1.0 for _ in range(len(magnitudes))]
        # some magnitudes might come out as 0.999999 etc
        self.validate_to_decimal_places(magnitudes, expected)
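
# NOTE: vectors.normalize_vectors is not shown here. A minimal sketch that is
# consistent with the unit-magnitude expectation, assuming it wraps
# pyspark.ml.feature.Normalizer:
from pyspark.ml.feature import Normalizer
from pyspark.sql import DataFrame


def normalize_vectors(input_col: str, output_col: str, df: DataFrame) -> DataFrame:
    # L2 (Euclidean) normalisation scales every vector to magnitude 1
    return Normalizer(p=2.0, inputCol=input_col, outputCol=output_col).transform(df)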
Example 11
    def test_cosine_similarity(self, spark: SQLContext):

        input_data = [
            # two_dim_normals
            (Vectors.dense([1, 0]), Vectors.dense([0, 1])),
            # three_dim_normals
            (Vectors.dense([0, 1, 0]), Vectors.dense([0, 0, 1])),
            # two_dim_colinear
            (Vectors.dense([1, 0]), Vectors.dense([1, 0])),
            # three_dim_colinear
            (Vectors.dense([1, 1, 0]), Vectors.dense([1, 1, 0])),
        ]

        df = spark.createDataFrame(input_data, ["col_a", "col_b"])
        res = distance.cosine_similarity("col_a", "col_b", "col_c", df)

        actual = [i[0] for i in to_tuples(res.select("col_c"))]
        expected = [0.0, 0.0, 1.0, 1.0]
        self.validate_to_decimal_places(actual, expected, decimal_places=6)
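
# NOTE: distance.cosine_similarity is not included here. One plausible
# implementation matching the expected values above (orthogonal vectors give
# 0.0, colinear vectors give 1.0), using a plain Python UDF over the vectors:
import numpy as np
from pyspark.sql import DataFrame, functions as F, types as T


def cosine_similarity(col_a: str, col_b: str, output_col: str, df: DataFrame) -> DataFrame:
    @F.udf(T.DoubleType())
    def cosine(a, b):
        # dot(a, b) / (|a| * |b|), guarding against zero-magnitude vectors
        a, b = a.toArray(), b.toArray()
        denom = float(np.linalg.norm(a) * np.linalg.norm(b))
        return float(np.dot(a, b)) / denom if denom else 0.0

    return df.withColumn(output_col, cosine(F.col(col_a), F.col(col_b)))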
Example 12
    def test_stemmed_token_vectors_pipeline(self, spark: SQLContext):
        # the exact same setup as the previous test, except here we expect
        # the stemmer to reduce the tokens to 4 stems: run, walk, jog, sprint
        input_data = [
            ("running walks jogged sprinted", ),
            ("runs jogging walked", ),
            ("walking jogs", ),
            ("running sprinting", ),
            ("", ),
            (None, ),
        ]

        raw = spark.createDataFrame(input_data, ["text"])

        res = tfidf.token_vectors_pipeline("text",
                                           "vectors",
                                           raw,
                                           stemmer_func=tokens.porter_tokens)

        actual = to_tuples(res.select("vectors"))

        row_0 = set(actual[0][0])
        row_1 = set(actual[1][0])
        row_2 = set(actual[2][0])
        row_3 = set(actual[3][0])
        row_4 = set(actual[4][0])
        row_5 = set(actual[5][0])

        assert len(row_0) == 4
        assert len(row_1) == 3
        assert len(row_2) == 2
        assert len(row_3) == 2
        assert len(row_4) == 0
        assert len(row_5) == 0

        assert row_1.issubset(row_0)
        assert row_2.issubset(row_0)
        assert row_3.issubset(row_0)

        assert len(row_1.intersection(row_2)) == 2
        assert len(row_1.intersection(row_3)) == 1
        assert len(row_2.intersection(row_3)) == 0
Example 13
def validate_values(
    df: DataFrame,
    expected_schema,
    expected_values: list,
    enforce_array_order=True,
    verbose=False,
):
    """Validate that the dataframe contains an exact list of rows and columns"""
    # validate the expected columns
    validate_schema(df, expected_schema, verbose=verbose)

    row_count = df.count()
    if row_count == 0:
        raise DataFrameException("DataFrame has 0 rows")

    if row_count != len(expected_values):
        raise DataFrameException(
            f"Incorrect number of rows: Received {row_count} - Expected: {len(expected_values)}"
        )

    res = to_tuples(df)
    col_count = len(res[0])
    for row_index, expected in enumerate(expected_values):
        actual = res[row_index]
        if verbose:
            print("Actual:")
            print(actual)
            print("Expected:")
            print(expected)
        # should have the same number of columns in each row
        if len(actual) != len(expected):
            raise DataFrameException(
                f"Incorrect number of columns: Received {len(actual)} - Expected: {len(expected)}"
            )

        for col_index in range(col_count):
            _recursive_validator(
                actual[col_index],
                expected[col_index],
                enforce_array_order=enforce_array_order,
            )
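
# NOTE: _recursive_validator is referenced above but not shown. A sketch of the
# recursion implied by the tests (nested structs compared field by field,
# arrays optionally order-insensitive, mismatches raising ValueError):
def _recursive_validator(actual, expected, enforce_array_order=True):
    # structs arrive as tuples: recurse into each field
    if isinstance(expected, tuple):
        if len(actual) != len(expected):
            raise ValueError(f"Struct length mismatch: {actual} != {expected}")
        for a, e in zip(actual, expected):
            _recursive_validator(a, e, enforce_array_order=enforce_array_order)
    # arrays arrive as lists: optionally ignore element order
    elif isinstance(expected, list):
        a = list(actual) if enforce_array_order else sorted(actual)
        e = list(expected) if enforce_array_order else sorted(expected)
        if a != e:
            raise ValueError(f"Array mismatch: {actual} != {expected}")
    # everything else is compared as a scalar leaf
    elif actual != expected:
        raise ValueError(f"Value mismatch: {actual} != {expected}")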
Example 14
    def test_select_longest_string(self, spark):

        # group the rows by the partition col
        partition_col = "partition_col"

        # rank the rows by the aggregation col
        agg_col = "agg_col"

        # more than 1 string might have the same length
        # alphabetical ordering should be used to break the tie
        data = [
            ("A", "longest_a"),
            ("A", "longest_a"),  # duplicate
            ("A", "longest_b"),
            ("A", "short"),
            ("A", None),
            ("B", "longest_x"),
            ("B", "longest_x"),  # duplicate
            ("B", "longest_y"),
            ("B", "longest_y"),
            ("B", "short"),
            ("B", None),
            ("B", None),
            ("C", None),
            ("C", None),
        ]

        raw = spark.createDataFrame(data, [partition_col, agg_col])

        window = Window.partitionBy("partition_col")
        df = strings.select_longest_string("agg_col", window, raw).orderBy(
            "partition_col"
        )
        assert df.count() == 3
        res = to_tuples(df)
        expected = [["A", "longest_a"], ["B", "longest_x"], ["C", None]]

        for i, expected_row in enumerate(expected):
            actual = res[i]
            assert actual[0] == expected_row[0]
            assert actual[1] == expected_row[1]
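
# NOTE: strings.select_longest_string is not part of this excerpt. A sketch
# consistent with the comments above (longest string per partition, ties broken
# alphabetically, one row kept per partition) might rank over the given window:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.window import WindowSpec


def select_longest_string(agg_col: str, window: WindowSpec, df: DataFrame) -> DataFrame:
    # longest string first, alphabetical order breaks ties; under a descending
    # sort nulls land last, so all-null partitions still keep a single row
    ranked = window.orderBy(F.length(F.col(agg_col)).desc(), F.col(agg_col).asc())
    return (
        df.withColumn("_rank", F.row_number().over(ranked))
        .filter(F.col("_rank") == 1)
        .drop("_rank")
    )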
Example 15
    def test_normalize_sparse_vectors(self, spark: SQLContext):

        # based on the following imaginary tokens
        # [a, c, c]
        # [a, a, b]
        # [b, b, d]
        input_data = [
            (Vectors.sparse(4, [0, 2], [1.0, 2.0]), ),
            (Vectors.sparse(4, [0, 1], [2.0, 1.0]), ),
            (Vectors.sparse(4, [1, 3], [2.0, 1.0]), ),
        ]

        df = spark.createDataFrame(input_data, ["vectors"])

        res = vectors.normalize_vectors("vectors", "normalized",
                                        df).select("normalized")

        vals = [i[0].toArray() for i in to_tuples(res)]

        # after being normalized the magnitude of each vector should be 1
        magnitudes = [np.linalg.norm(v) for v in vals]
        expected = [1.0 for _ in range(len(magnitudes))]
        # some magnitudes might come out as 0.999999 etc
        self.validate_to_decimal_places(magnitudes, expected)