def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes via ``self.pl_utils.join`` on their pandas forms.

    :param df1: left dataframe, converted with ``as_pandas()``
    :param df2: right dataframe, converted with ``as_pandas()``
    :param how: join type, forwarded unchanged as ``join_type``
    :param on: join keys handed to ``get_join_schemas``, which resolves
        the key schema actually used for the join
    :param metadata: passed through to the resulting dataframe
    :return: a ``PandasDataFrame`` with a fresh 0..n-1 index and the
        output schema computed by ``get_join_schemas``
    """
    keys, result_schema = get_join_schemas(df1, df2, how=how, on=on)
    left_pdf = df1.as_pandas()
    right_pdf = df2.as_pandas()
    joined = self.pl_utils.join(
        left_pdf,
        right_pdf,
        join_type=how,
        on=keys.names,
    )
    # Discard whatever index the join produced before wrapping the result.
    joined = joined.reset_index(drop=True)
    return PandasDataFrame(joined, result_schema, metadata)
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes via ``self.pl_utils.join`` on their native frames.

    Both inputs are first converted with ``self.to_df`` and the ``native``
    object of each conversion is what gets joined.

    :param df1: left dataframe
    :param df2: right dataframe
    :param how: join type, forwarded unchanged as ``join_type``
    :param on: join keys handed to ``get_join_schemas``, which resolves
        the key schema actually used for the join
    :param metadata: passed through to the resulting dataframe
    :return: a ``DaskDataFrame`` carrying the output schema computed by
        ``get_join_schemas``
    """
    keys, result_schema = get_join_schemas(df1, df2, how=how, on=on)
    left_native = self.to_df(df1).native
    right_native = self.to_df(df2).native
    joined = self.pl_utils.join(
        left_native,
        right_native,
        join_type=how,
        on=keys.names,
    )
    return DaskDataFrame(joined, result_schema, metadata)
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes with the native ``join`` / ``crossJoin`` methods.

    The join type is normalized (lower-cased, underscores and spaces
    removed) and translated through ``_TO_SPARK_JOIN_MAP``. The joined
    result is projected onto the columns of the output schema, in that
    schema's order.

    :param df1: left dataframe
    :param df2: right dataframe
    :param how: join type; spelling variants differing only in case,
        ``_`` or spaces are treated as the same type
    :param on: join keys handed to ``get_join_schemas``, which resolves
        the key schema actually used for the join
    :param metadata: passed through to the resulting dataframe
    :raises ValueError: if the normalized join type is not a key of
        ``_TO_SPARK_JOIN_MAP``
    :return: the joined dataframe, converted back with ``self.to_df``
    """
    keys, result_schema = get_join_schemas(df1, df2, how=how, on=on)

    # Normalize first so the error message reports the normalized spelling.
    how = how.lower().replace("_", "").replace(" ", "")
    assert_or_throw(
        how in _TO_SPARK_JOIN_MAP,
        ValueError(f"{how} is not supported as a join type"),
    )
    native_how = _TO_SPARK_JOIN_MAP[how]

    left = self.to_df(df1).native
    right = self.to_df(df2).native
    projection = [col(name) for name in result_schema.names]

    if native_how == "cross":
        # Cross joins take no keys and go through a dedicated method.
        joined = left.crossJoin(right)
    else:
        joined = left.join(right, on=keys.names, how=native_how)

    return self.to_df(joined.select(*projection), result_schema, metadata)
def join(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str,
    on: List[str] = _DEFAULT_JOIN_KEYS,
    metadata: Any = None,
) -> DataFrame:
    """Join two dataframes using pandas ``merge``.

    The join type is normalized (lower-cased, underscores and spaces
    removed) and then dispatched:

    * ``cross``: every row of ``df1`` paired with every row of ``df2``,
      implemented by merging on a temporary constant column.
    * ``semi`` / ``leftsemi``: rows of ``df1`` whose keys appear in
      ``df2``; each ``df1`` row is returned at most once.
    * ``anti`` / ``leftanti``: rows of ``df1`` whose keys do not appear
      in ``df2``.
    * ``leftouter`` / ``rightouter`` / ``fullouter``: mapped to pandas
      ``left`` / ``right`` / ``outer``; the side that can be padded is
      checked with ``self._validate_outer_joinable`` and the padded
      columns are post-processed with ``self._fix_nan``.
    * anything else is passed to ``DataFrame.merge`` as ``how`` unchanged.

    The pandas frames returned by ``as_pandas()`` are never modified.

    :param df1: left dataframe
    :param df2: right dataframe
    :param how: join type (see above)
    :param on: join keys handed to ``get_join_schemas``, which resolves
        the key schema actually used for the join
    :param metadata: passed through to the resulting dataframe
    :return: a ``PandasDataFrame`` with a fresh 0..n-1 index and the
        output schema computed by ``get_join_schemas``
    """
    key_schema, output_schema = get_join_schemas(df1, df2, how=how, on=on)
    how = how.lower().replace("_", "").replace(" ", "")
    keys = key_schema.names

    if how == "cross":
        # assign() builds new frames, so the helper column never leaks
        # into the objects returned by as_pandas().
        helper = {"__cross_join_index__": 1}
        d1 = df1.as_pandas().assign(**helper)
        d2 = df2.as_pandas().assign(**helper)
        d = d1.merge(d2, on="__cross_join_index__").drop(
            "__cross_join_index__", axis=1
        )
        return PandasDataFrame(d.reset_index(drop=True), output_schema, metadata)

    if how in ["semi", "leftsemi"]:
        d1 = df1.as_pandas()
        # Deduplicate the right-hand keys: otherwise the inner merge
        # repeats a left row once per matching right row.
        d2 = df2.as_pandas()[keys].drop_duplicates()
        d = d1.merge(d2, on=keys, how="inner")
        return PandasDataFrame(d.reset_index(drop=True), output_schema, metadata)

    if how in ["anti", "leftanti"]:
        d1 = df1.as_pandas()
        # The marker column is non-null exactly for left rows that found
        # a match; deduplicating keeps the intermediate merge small.
        d2 = (
            df2.as_pandas()[keys]
            .drop_duplicates()
            .assign(**{"__anti_join_dummy__": 1.0})
        )
        d = d1.merge(d2, on=keys, how="left")
        d = d[d["__anti_join_dummy__"].isnull()]
        return PandasDataFrame(
            d.drop(["__anti_join_dummy__"], axis=1).reset_index(drop=True),
            output_schema,
            metadata,
        )

    # fix_left / fix_right mark the side whose non-key columns may be
    # padded with missing values by the outer join.
    fix_left, fix_right = False, False
    if how == "leftouter":
        how = "left"
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_right = True
    elif how == "rightouter":
        how = "right"
        self._validate_outer_joinable(df1.schema, key_schema)
        fix_left = True
    elif how == "fullouter":
        how = "outer"
        self._validate_outer_joinable(df1.schema, key_schema)
        self._validate_outer_joinable(df2.schema, key_schema)
        fix_left, fix_right = True, True

    d1 = df1.as_pandas()
    d2 = df2.as_pandas()
    d = d1.merge(d2, on=keys, how=how)
    if fix_left:
        # Only columns that exist solely on the left side.
        d = self._fix_nan(
            d, output_schema, df1.schema.exclude(list(df2.schema.keys())).keys()
        )
    if fix_right:
        # Only columns that exist solely on the right side.
        d = self._fix_nan(
            d, output_schema, df2.schema.exclude(list(df1.schema.keys())).keys()
        )
    return PandasDataFrame(d.reset_index(drop=True), output_schema, metadata)
def test_get_join_schemas():
    """Check key/output schemas and error cases of ``get_join_schemas``."""
    ab = ArrayDataFrame([], "a:int,b:int")
    only_c = ArrayDataFrame([], "c:int")
    da = ArrayDataFrame([], "d:str,a:int")

    # Cross join: no keys, output is the concatenation of both schemas.
    key_schema, out_schema = get_join_schemas(ab, only_c, how="cross", on=[])
    assert key_schema == ""
    assert out_schema == "a:int,b:int,c:int"

    # (expected error, left, right, how, on)
    failing_calls = [
        (NoneArgumentError, ab, only_c, None, []),
        (ValueError, ab, only_c, "x", []),
        (SchemaError, ab, only_c, "CROSS", ["a"]),
        (SchemaError, ab, da, "CROSS", ["a"]),
        (SchemaError, ab, da, "CROSS", []),
        (SchemaError, ab, only_c, "inner", ["a"]),
        (ValueError, ab, da, "outer", ["a"]),
    ]
    for error, left, right, join_type, join_on in failing_calls:
        raises(
            error,
            lambda: get_join_schemas(left, right, how=join_type, on=join_on),
        )

    # Inner join with explicit keys.
    key_schema, out_schema = get_join_schemas(ab, da, how="inner", on=["a"])
    assert key_schema == "a:int"
    assert out_schema == "a:int,b:int,d:str"

    # Inner join with keys inferred from the common columns.
    key_schema, out_schema = get_join_schemas(ab, da, how="inner", on=[])
    assert key_schema == "a:int"
    assert out_schema == "a:int,b:int,d:str"

    abc = ArrayDataFrame([], "a:int,b:int,c:int")
    cbx = ArrayDataFrame([], "c:int,b:int,x:int")
    raises(SchemaError, lambda: get_join_schemas(abc, cbx, how="inner", on=["a"]))
    key_schema, out_schema = get_join_schemas(abc, cbx, how="inner", on=["c", "b"])
    assert key_schema == "b:int,c:int"
    assert out_schema == "a:int,b:int,c:int,x:int"

    # Semi and anti joins keep only the left schema, whatever the spelling.
    for join_type in ["SEMI", "LEFT_Semi", "Anti", "left_Anti"]:
        key_schema, out_schema = get_join_schemas(da, abc, how=join_type, on=["a"])
        assert key_schema == "a:int"
        assert out_schema == "d:str,a:int"