Ejemplo n.º 1
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join two dataframes through ``self.pl_utils`` on their pandas forms.

     :param df1: left dataframe; converted with ``as_pandas()``
     :param df2: right dataframe; converted with ``as_pandas()``
     :param how: join type, forwarded unchanged as ``join_type``
     :param on: key column names handed to ``get_join_schemas``
     :param metadata: passed through to the resulting ``PandasDataFrame``
     :return: a ``PandasDataFrame`` built from the joined data (index
         reset) and the output schema from ``get_join_schemas``
     """
     join_keys, result_schema = get_join_schemas(df1, df2, how=how, on=on)
     utils = self.pl_utils
     left_pdf = df1.as_pandas()
     right_pdf = df2.as_pandas()
     joined = utils.join(
         left_pdf,
         right_pdf,
         join_type=how,
         on=join_keys.names,
     )
     # Discard whatever index the join produced before wrapping the result.
     joined = joined.reset_index(drop=True)
     return PandasDataFrame(joined, result_schema, metadata)
Ejemplo n.º 2
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join two dataframes through ``self.pl_utils`` on their native forms.

     Both inputs are converted with ``self.to_df`` and the ``native``
     objects of the converted frames are what gets joined.

     :param df1: left dataframe
     :param df2: right dataframe
     :param how: join type, forwarded unchanged as ``join_type``
     :param on: key column names handed to ``get_join_schemas``
     :param metadata: passed through to the resulting ``DaskDataFrame``
     :return: a ``DaskDataFrame`` wrapping the joined data with the output
         schema from ``get_join_schemas``
     """
     join_keys, result_schema = get_join_schemas(df1, df2, how=how, on=on)
     utils = self.pl_utils
     left_native = self.to_df(df1).native
     right_native = self.to_df(df2).native
     joined = utils.join(
         left_native,
         right_native,
         join_type=how,
         on=join_keys.names,
     )
     return DaskDataFrame(joined, result_schema, metadata)
Ejemplo n.º 3
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join two dataframes using the native join of their converted forms.

     ``how`` is normalized (lower-cased, underscores and spaces removed)
     and translated through ``_TO_SPARK_JOIN_MAP``. A translated value of
     ``"cross"`` uses ``crossJoin``; anything else uses a keyed ``join``.
     The result is projected onto the output schema's column names.

     :param df1: left dataframe; converted with ``self.to_df``
     :param df2: right dataframe; converted with ``self.to_df``
     :param how: join type, matched case-insensitively and ignoring
         underscores and spaces
     :param on: key column names handed to ``get_join_schemas``
     :param metadata: passed through to ``self.to_df`` for the result
     :return: ``self.to_df`` applied to the joined, projected data
     :raises ValueError: presumably raised by ``assert_or_throw`` when the
         normalized ``how`` is not a key of ``_TO_SPARK_JOIN_MAP``
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     how = how.lower().replace("_", "").replace(" ", "")
     is_supported = how in _TO_SPARK_JOIN_MAP
     assert_or_throw(
         is_supported,
         ValueError(f"{how} is not supported as a join type"),
     )
     native_how = _TO_SPARK_JOIN_MAP[how]
     left = self.to_df(df1).native
     right = self.to_df(df2).native
     selected = [col(name) for name in output_schema.names]
     if native_how == "cross":
         joined = left.crossJoin(right)
     else:
         joined = left.join(right, on=key_schema.names, how=native_how)
     # Keep only the output-schema columns, in output-schema order.
     projected = joined.select(*selected)
     return self.to_df(projected, output_schema, metadata)
Ejemplo n.º 4
0
 def join(
     self,
     df1: DataFrame,
     df2: DataFrame,
     how: str,
     on: List[str] = _DEFAULT_JOIN_KEYS,
     metadata: Any = None,
 ) -> DataFrame:
     """Join two dataframes with ``pandas.DataFrame.merge``.

     ``how`` is normalized (lower-cased, underscores and spaces removed)
     before dispatching:

     * ``cross``: both sides get a constant helper column, are merged on
       it, and the helper column is dropped again.
     * ``semi`` / ``leftsemi``: inner merge of the left side against the
       *distinct* key combinations of the right side, so a left row is
       returned at most once no matter how often its key repeats on the
       right.
     * ``anti`` / ``leftanti``: left merge against the right side's keys
       plus a marker column; only rows whose marker is null are kept.
     * ``leftouter`` / ``rightouter`` / ``fullouter``: mapped to pandas
       ``left`` / ``right`` / ``outer``. The side(s) that can receive
       missing rows are checked with ``self._validate_outer_joinable`` and
       the merged frame is post-processed with ``self._fix_nan``
       (presumably to repair nulls introduced by the merge; see those
       helpers for the exact contract).
     * anything else is passed to ``merge`` as-is.

     The frames returned by ``as_pandas()`` are never modified in place.

     :param df1: left dataframe
     :param df2: right dataframe
     :param how: join type, matched case-insensitively and ignoring
         underscores and spaces
     :param on: key column names handed to ``get_join_schemas``
     :param metadata: passed through to the resulting ``PandasDataFrame``
     :return: a ``PandasDataFrame`` with a reset index and the output
         schema from ``get_join_schemas``
     """
     key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
     how = how.lower().replace("_", "").replace(" ", "")
     keys = key_schema.names
     if how == "cross":
         marker = "__cross_join_index__"
         # ``assign`` returns new frames, so the helper column never leaks
         # into the objects handed out by ``as_pandas()``.
         d1 = df1.as_pandas().assign(**{marker: 1})
         d2 = df2.as_pandas().assign(**{marker: 1})
         d = d1.merge(d2, on=marker).drop(marker, axis=1)
         return PandasDataFrame(d.reset_index(drop=True), output_schema,
                                metadata)
     if how in ("semi", "leftsemi"):
         d1 = df1.as_pandas()
         # Without de-duplication an inner merge repeats a left row once
         # per matching right row, which is not semi-join behavior.
         d2 = df2.as_pandas()[keys].drop_duplicates()
         d = d1.merge(d2, on=keys, how="inner")
         return PandasDataFrame(d.reset_index(drop=True), output_schema,
                                metadata)
     if how in ("anti", "leftanti"):
         marker = "__anti_join_dummy__"
         d1 = df1.as_pandas()
         # ``assign`` builds a new frame instead of writing into a column
         # subset of the right-hand data.
         d2 = df2.as_pandas()[keys].assign(**{marker: 1.0})
         d = d1.merge(d2, on=keys, how="left")
         # Left rows without a match carry a null marker after the merge.
         d = d[d[marker].isnull()]
         return PandasDataFrame(
             d.drop([marker], axis=1).reset_index(drop=True),
             output_schema,
             metadata,
         )
     fix_left, fix_right = False, False
     if how == "leftouter":
         how = "left"
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_right = True
     elif how == "rightouter":
         how = "right"
         self._validate_outer_joinable(df1.schema, key_schema)
         fix_left = True
     elif how == "fullouter":
         how = "outer"
         self._validate_outer_joinable(df1.schema, key_schema)
         self._validate_outer_joinable(df2.schema, key_schema)
         fix_left, fix_right = True, True
     d1 = df1.as_pandas()
     d2 = df2.as_pandas()
     d = d1.merge(d2, on=keys, how=how)
     if fix_left:
         # Columns that exist only on the left side.
         d = self._fix_nan(
             d, output_schema,
             df1.schema.exclude(list(df2.schema.keys())).keys())
     if fix_right:
         # Columns that exist only on the right side.
         d = self._fix_nan(
             d, output_schema,
             df2.schema.exclude(list(df1.schema.keys())).keys())
     return PandasDataFrame(d.reset_index(drop=True), output_schema,
                            metadata)
Ejemplo n.º 5
0
def test_get_join_schemas():
    """Check ``get_join_schemas`` on accepted and rejected join requests."""
    left = ArrayDataFrame([], "a:int,b:int")
    right = ArrayDataFrame([], "c:int")
    other = ArrayDataFrame([], "d:str,a:int")

    # Cross join without keys: empty key schema, all columns in the output.
    key_schema, out_schema = get_join_schemas(left, right, how="cross", on=[])
    assert key_schema == ""
    assert out_schema == "a:int,b:int,c:int"

    # Each request below must raise the listed exception type.
    rejected = [
        (NoneArgumentError, left, right, None, []),
        (ValueError, left, right, "x", []),
        (SchemaError, left, right, "CROSS", ["a"]),
        (SchemaError, left, other, "CROSS", ["a"]),
        (SchemaError, left, other, "CROSS", []),
        (SchemaError, left, right, "inner", ["a"]),
        (ValueError, left, other, "outer", ["a"]),
    ]
    for error, df_x, df_y, join_type, keys in rejected:
        raises(
            error,
            lambda: get_join_schemas(df_x, df_y, how=join_type, on=keys),
        )

    # Explicit key first, then an empty key list (keys are inferred).
    for keys in (["a"], []):
        key_schema, out_schema = get_join_schemas(
            left, other, how="inner", on=keys)
        assert key_schema == "a:int"
        assert out_schema == "a:int,b:int,d:str"

    wide_left = ArrayDataFrame([], "a:int,b:int,c:int")
    wide_right = ArrayDataFrame([], "c:int,b:int,x:int")
    raises(
        SchemaError,
        lambda: get_join_schemas(wide_left, wide_right, how="inner", on=["a"]),
    )
    key_schema, out_schema = get_join_schemas(
        wide_left, wide_right, how="inner", on=["c", "b"])
    assert key_schema == "b:int,c:int"
    assert out_schema == "a:int,b:int,c:int,x:int"

    # Semi and anti spellings all yield the first frame's schema as output.
    for join_type in ("SEMI", "LEFT_Semi", "Anti", "left_Anti"):
        key_schema, out_schema = get_join_schemas(
            other, wide_left, how=join_type, on=["a"])
        assert key_schema == "a:int"
        assert out_schema == "d:str,a:int"