def test_length(self, spark):
    """The length of each array in the column is counted."""
    cols = ["col_a"]
    data = [(["a", "b", "c"],), (["x", "y"],)]
    df = spark.createDataFrame(data, cols)
    res = df.withColumn("col_length", arrays.length(df["col_a"]))
    expected = [3, 2]
    actual = to_dicts(res)
    assert [i["col_length"] for i in actual] == expected
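
# A minimal reference sketch of what arrays.length could look like, kept here to
# document the contract the test above asserts. This is an assumption for
# illustration, not the arrays module's actual implementation.
from pyspark.sql import Column
from pyspark.sql import functions as F

def _length_sketch(col: Column) -> Column:
    # PySpark's built-in size() returns the element count of an array column.
    return F.size(col)
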
def test_array_intersection(self, spark):
    """Intersection keeps elements common to both arrays, in first-array order."""
    cols = ["col_a", "col_b"]
    data = [
        (["a", "b", "c", "a"], ["a", "b"]),
        (["a", "a", "d", "e"], ["d"]),
        (["d", "b", "c", "a"], ["b", "c", "d"]),
        (["apple", "b", "a", "d"], ["apple", "d"]),
    ]
    df = spark.createDataFrame(data, cols)
    df_intersection = df.withColumn(
        "col_intersection", arrays.array_intersection(df["col_a"], df["col_b"])
    )
    expected = [["a", "b"], ["d"], ["d", "b", "c"], ["apple", "d"]]
    actual = to_dicts(df_intersection)
    assert [i["col_intersection"] for i in actual] == expected
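
# Hedged sketch: arrays.array_intersection plausibly wraps PySpark's built-in
# array_intersect, which keeps first-array order and drops duplicates, exactly
# the behaviour the expected values above assert. An assumption, not the real code.
from pyspark.sql import Column
from pyspark.sql import functions as F

def _array_intersection_sketch(col_a: Column, col_b: Column) -> Column:
    # e.g. array_intersect(["d","b","c","a"], ["b","c","d"]) -> ["d","b","c"]
    return F.array_intersect(col_a, col_b)
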
def test_string_length(self, spark):
    """Each string's character count is written to the new column."""
    original_col = "column_a"
    updated_col = "length_column_a"
    data = [("a",), ("ab",), ("abc",), ("abcd",)]
    raw = spark.createDataFrame(data, [original_col])
    df = strings.count_str_length(original_col, updated_col, raw)
    assert df.count() == 4
    res = to_dicts(df.orderBy(updated_col))
    names = [i[original_col] for i in res]
    lengths = [i[updated_col] for i in res]
    assert lengths == [1, 2, 3, 4]
    assert names == ["a", "ab", "abc", "abcd"]
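
# Hedged sketch of strings.count_str_length, assuming it simply appends an
# F.length column; the real helper may differ.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def _count_str_length_sketch(
    original_col: str, updated_col: str, df: DataFrame
) -> DataFrame:
    # F.length counts the characters in a string column.
    return df.withColumn(updated_col, F.length(F.col(original_col)))
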
def test_spacy_cosine_similarity(self, spark: SQLContext):
    """Confirm that the PySpark cosine calculations match the spaCy cosine
    calculations."""
    id_col = "document_id"
    text_col = "document_text"
    primary_col = "primary_col"
    secondary_col = "secondary_col"
    output_col = "output_col"
    data = [
        (0, "ale"),
        (1, "rum"),
        (2, "mojito"),
        (3, "beer"),
        (4, "lager"),
        (5, "vodka"),
    ]
    df = spark.createDataFrame(data, [id_col, text_col])
    docs = spacy.get_spacy_docs(
        id_col, text_col, df, spacy_model_version=self.spacy_version()
    )
    vectors = spacy.extract_document_vectors(docs)
    # Pair the first document's vector with every document's vector (itself included).
    input_data = [(vectors[0][1], vectors[i][1]) for i in range(len(vectors))]
    df = spark.createDataFrame(input_data, [primary_col, secondary_col])
    res = distance.cosine_similarity(primary_col, secondary_col, output_col, df)
    actual = [i[output_col] for i in to_dicts(res)]
    expected = [docs[0].similarity(doc) for doc in docs]
    # spaCy and PySpark must give the same value to at least 6 decimal places.
    self.validate_to_decimal_places(actual, expected, decimal_places=6)
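
# Hedged sketch of distance.cosine_similarity: one plausible implementation via
# a plain Python UDF over the two vector columns. The actual module may use
# native Spark expressions instead; this is an assumption for illustration.
import math
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

@F.udf(returnType=DoubleType())
def _cosine_similarity_sketch(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|); 0.0 when either vector is all zeros.
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0
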
def test_array_union(self, spark):
    """Union keeps first-array order and appends unseen elements from the second."""
    cols = ["col_a", "col_b"]
    data = [
        (["a", "b", "c"], ["a", "d"]),
        (["f", "g", "h"], ["e", "d"]),
        (["q", "a", "c"], ["a", "q", "f"]),
        (["p", "o", "c"], ["r", "t", "c"]),
    ]
    df = spark.createDataFrame(data, cols)
    df_union = df.withColumn(
        "col_union", arrays.array_union(df["col_a"], df["col_b"])
    )
    expected = [
        ["a", "b", "c", "d"],
        ["f", "g", "h", "e", "d"],
        ["q", "a", "c", "f"],
        ["p", "o", "c", "r", "t"],
    ]
    actual = to_dicts(df_union)
    assert [i["col_union"] for i in actual] == expected
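
# Hedged sketch: arrays.array_union plausibly wraps PySpark's built-in
# array_union, which keeps first-array order, appends unseen elements from the
# second array, and drops duplicates. An assumption, not the module's code.
from pyspark.sql import Column
from pyspark.sql import functions as F

def _array_union_sketch(col_a: Column, col_b: Column) -> Column:
    return F.array_union(col_a, col_b)
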
def test_merge_collected_sets(self, spark):
    """Merge collected sets into a set of unique values"""
    col_a = "col_a"
    col_b = "col_b"
    col_c = "col_c"
    data = [
        (["a", "b", "c", "d"], ["c", "d", "e"]),
        (["x", "y", "z"], ["x"]),
        (None, ["foo", "bar", "baz"]),
        (["random"], None),
        (None, None),
    ]
    expected = [
        {"a", "b", "c", "d", "e"},
        {"x", "y", "z"},
        {"foo", "bar", "baz"},
        {"random"},
        set(),
    ]
    schema = StructType(
        [
            StructField(col_a, ArrayType(StringType())),
            StructField(col_b, ArrayType(StringType())),
        ]
    )
    input_df = spark.createDataFrame(data, schema)
    output_df = input_df.select(
        arrays.merge_collected_sets(F.col(col_a), F.col(col_b)).alias(col_c)
    )
    res = [set(i[col_c]) for i in to_dicts(output_df)]
    assert res == expected
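
# Hedged sketch of merge_collected_sets, assuming it coalesces null inputs to
# empty arrays before a deduplicating union; the real implementation may differ.
from pyspark.sql import Column
from pyspark.sql import functions as F

def _merge_collected_sets_sketch(col_a: Column, col_b: Column) -> Column:
    # Coalesce nulls to a typed empty array so array_union (which also removes
    # duplicates) handles the (None, None) row as an empty set.
    empty = F.array().cast("array<string>")
    return F.array_union(F.coalesce(col_a, empty), F.coalesce(col_b, empty))
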
def test_to_dicts(self, spark):
    """Each row becomes a dict keyed by column name."""
    cols = ["col_a", "col_b", "col_c"]
    data = [("a", 2, "c"), ("d", 5, "f"), ("g", 8, "i")]
    df = spark.createDataFrame(data, cols)
    expected = [
        {"col_a": "a", "col_b": 2, "col_c": "c"},
        {"col_a": "d", "col_b": 5, "col_c": "f"},
        {"col_a": "g", "col_b": 8, "col_c": "i"},
    ]
    assert dataframe.to_dicts(df) == expected
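
# Hedged sketch of dataframe.to_dicts: presumably it collects the frame to the
# driver and converts each Row into a plain dict. An assumption for illustration.
from pyspark.sql import DataFrame

def _to_dicts_sketch(df: DataFrame) -> list:
    return [row.asDict(recursive=True) for row in df.collect()]
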
def _get_transformed_data(self, df: DataFrame, col_name: str):
    """Collect a single column from the DataFrame as a plain Python list."""
    return [i[col_name] for i in to_dicts(df)]