def _select_top(self, df: DataFrame, top_n: int):
    """Limit ``df`` to at most ``top_n`` rows via the default SQL engine.

    If ``top_n <= 0`` the input is returned untouched. With partition keys
    set, the limit is applied per partition using a ROW_NUMBER window
    (respecting the partition presort); otherwise a global
    ``ORDER BY ... LIMIT`` is used, with the ordering taken from the
    optional ``order_by`` parameter.
    """
    if top_n <= 0:  # no limit requested
        return df

    engine = self.execution_engine.default_sql_engine

    if len(self.partition_spec.partition_by) > 0:
        # Per-partition top-N: rank rows inside each partition and keep
        # only those ranked <= top_n, then drop the helper rank column
        # by re-selecting the original columns.
        keys_expr = ", ".join(self.partition_spec.partition_by)
        if len(self.partition_spec.presort) > 0:
            sort_expr = f"ORDER BY {self.partition_spec.presort_expr}"
        else:
            sort_expr = ""
        cols_expr = ", ".join(df.schema.names)
        sql = f"""
        SELECT {cols_expr} FROM (
            SELECT *,
            ROW_NUMBER() OVER(PARTITION BY {keys_expr} {sort_expr})
                AS __top_row_number__
            FROM __plot_df__)
        WHERE __top_row_number__ <= {top_n}
        """
        return engine.select(DataFrames(__plot_df__=df), sql)

    # Global top-N: optionally ordered by the "order_by" parameter
    # (True -> ASC, False -> DESC per key).
    order_expr = ""
    if "order_by" in self.params:
        order_by = parse_presort_exp(self.params.get_or_throw("order_by", object))
        if len(order_by) > 0:
            parts = (
                k + " " + ("ASC" if asc else "DESC") for k, asc in order_by.items()
            )
            order_expr = "ORDER BY " + ", ".join(parts)
    sql = f"""
    SELECT * FROM __plot_df__ {order_expr} LIMIT {top_n}
    """
    return engine.select(DataFrames(__plot_df__=df), sql)
def test_comap_with_key(self):
    """comap over keyed (zipped) dataframes must expose key values and names."""
    e = self.engine
    df_a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")
    df_b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    df_c = e.to_df([[6, 1]], "c:int,a:int")

    # Three zipped inputs: explicit two-df zip, named zip_all of three,
    # and a single-df zip_all with an explicit partition spec.
    z1 = e.persist(e.zip(df_a, df_b, df1_name="x", df2_name="y"))
    z2 = e.persist(e.zip_all(DataFrames(x=df_a, y=df_b, z=df_b)))
    z3 = e.persist(
        e.zip_all(DataFrames(z=df_c), partition_spec=PartitionSpec(by=["a"]))
    )

    def comap(cursor, dfs):
        # Keys must be available, and each sub-dataframe keeps its name.
        assert dfs.has_key
        v = ",".join(k + str(v.count()) for k, v in dfs.items())
        keys = cursor.key_value_array
        return ArrayDataFrame([keys + [v]], cursor.key_schema + "v:str")

    def on_init(partition_no, dfs):
        assert dfs.has_key
        assert partition_no >= 0
        assert len(dfs) > 0

    cases = [
        (z1, [[1, "x2,y1"]]),
        (z2, [[1, "x2,y1,z1"]]),
        (z3, [[1, "z1"]]),
    ]
    for zipped, expected in cases:
        res = e.comap(
            zipped,
            comap,
            "a:int,v:str",
            PartitionSpec(),
            metadata=dict(a=1),
            on_init=on_init,
        )
        df_eq(res, expected, "a:int,v:str", metadata=dict(a=1), throw=True)
def test_zip_all(self):
    """zip_all must serialize dataframes and track whether names were given."""
    e = self.engine
    df_a = e.to_df([[1, 2], [3, 4], [1, 5]], "a:int,b:int")

    # Single unnamed dataframe: one serialized row, no name metadata.
    zipped = e.persist(e.zip_all(DataFrames(df_a)))
    assert 1 == zipped.count()
    assert zipped.metadata.get("serialized", False)
    assert not zipped.metadata.get("serialized_has_name", False)

    # Single named dataframe: name metadata is recorded.
    zipped = e.persist(e.zip_all(DataFrames(x=df_a)))
    assert 1 == zipped.count()
    assert zipped.metadata.get("serialized", False)
    assert zipped.metadata.get("serialized_has_name", False)

    # Partitioning by "a" yields one serialized row per key (values 1 and 3).
    zipped = e.persist(
        e.zip_all(DataFrames(x=df_a), partition_spec=PartitionSpec(by=["a"]))
    )
    assert 2 == zipped.count()
    assert zipped.metadata.get("serialized", False)
    assert zipped.metadata.get("serialized_has_name", False)

    df_b = e.to_df([[6, 1], [2, 7]], "c:int,a:int")
    df_c = e.to_df([[6, 1], [2, 7]], "d:int,a:int")

    # Multiple dataframes, unnamed vs named.
    zipped = e.persist(e.zip_all(DataFrames(df_a, df_b, df_c)))
    assert 1 == zipped.count()
    assert not zipped.metadata.get("serialized_has_name", False)

    zipped = e.persist(e.zip_all(DataFrames(x=df_a, y=df_b, z=df_c)))
    assert 1 == zipped.count()
    assert zipped.metadata.get("serialized_has_name", False)

    # Zipping a dataframe with itself: keys come from common columns,
    # and the non-key columns survive in the zipped schema.
    zipped = e.persist(e.zip_all(DataFrames(df_b, df_b)))
    assert 2 == zipped.count()
    assert not zipped.metadata.get("serialized_has_name", False)
    assert ["a", "c"] in zipped.schema

    zipped = e.persist(e.zip_all(DataFrames(x=df_b, y=df_b)))
    assert 2 == zipped.count()
    assert zipped.metadata.get("serialized_has_name", False)
    assert ["a", "c"] in zipped.schema

    # With an explicit partition key, only key columns remain in the schema.
    zipped = e.persist(
        e.zip_all(DataFrames(df_b, df_b), partition_spec=PartitionSpec(by=["a"]))
    )
    assert 2 == zipped.count()
    assert not zipped.metadata.get("serialized_has_name", False)
    assert "c" not in zipped.schema