コード例 #1
0
def to_validation_rules(data: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize raw validation settings into their canonical form.

    Each recognised rule name is converted as follows:

    * ``partitionby_has`` / ``partitionby_is``: a string value is split on
      commas and each piece stripped; the result is passed to
      ``PartitionSpec(by=...)`` and its ``partition_by`` is stored.
    * ``presort_has`` / ``presort_is``: the value is parsed with
      ``parse_presort_exp`` and stored as a list of the parsed items.
    * ``input_has``: a string has its spaces removed and is split on commas;
      otherwise the value is checked to be a list and the spaces are removed
      from every element.
    * ``input_is``: stored as ``str(Schema(value))``.

    :param data: mapping of rule name to raw rule value
    :raises SyntaxError: when ``Schema`` rejects the ``input_is`` value with
        a ``SyntaxError``; a ``SyntaxError`` is also handed to
        ``assert_or_throw`` when ``input_has`` is neither a string nor a list
    :raises NotImplementedError: for any rule name not listed above
    :return: a new dictionary holding the normalized rules
    """
    rules: Dict[str, Any] = {}
    for name, v in data.items():
        if name in ("partitionby_has", "partitionby_is"):
            keys = [x.strip() for x in v.split(",")] if isinstance(v, str) else v
            rules[name] = PartitionSpec(by=keys).partition_by
            continue

        if name in ("presort_has", "presort_is"):
            rules[name] = list(parse_presort_exp(v).items())
            continue

        if name == "input_has":
            if isinstance(v, str):
                rules[name] = v.replace(" ", "").split(",")
                continue
            assert_or_throw(
                isinstance(v, list),
                lambda: SyntaxError(f"{v} is neither a string or a list"),
            )
            rules[name] = [x.replace(" ", "") for x in v]
            continue

        if name == "input_is":
            try:
                rules[name] = str(Schema(v))
            except SyntaxError:
                # Re-raised with a message that names the offending rule
                raise SyntaxError(  # pylint: disable=W0707
                    f"for input_is, the input must be a schema expression {v}")
            continue

        raise NotImplementedError(name)
    return rules
コード例 #2
0
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        """Take the first ``n`` rows of ``df``, overall or per partition.

        :param df: input dataframe, converted through ``self.to_df``
        :param n: number of rows to keep (per partition when
            ``partition_spec.partition_by`` is not empty)
        :param presort: presort expression; when it is truthy it is parsed
            with ``parse_presort_exp`` and is preferred over
            ``partition_spec.presort``
        :param na_position: ``"last"`` sorts nulls last, any other value
            sorts nulls first
        :param partition_spec: partition keys and fallback presort
        :param metadata: forwarded to ``self.to_df`` for the result
        :return: the result wrapped by ``self.to_df`` with the schema of ``df``
        """
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        sdf = self.to_df(df).native
        nulls_last = na_position == "last"

        # An explicit presort wins over the one carried by partition_spec
        if presort:
            presort = parse_presort_exp(presort)
        order: IndexedOrderedDict = presort or partition_spec.presort

        def _sort_col(name: str, ascending: bool) -> Any:
            # Pick the column ordering that matches direction and null placement
            column = col(name)
            if ascending:
                if nulls_last:
                    return column.asc_nulls_last()
                return column.asc_nulls_first()
            if nulls_last:
                return column.desc_nulls_last()
            return column.desc_nulls_first()

        sort_cols = [_sort_col(name, asc) for name, asc in order.items()]

        if len(partition_spec.partition_by) == 0:
            # No partitioning: optional global sort followed by a plain limit
            if len(sort_cols) > 0:
                sdf = sdf.orderBy(sort_cols)
            sdf = sdf.limit(n)
        else:
            # Partitioned: number the rows inside each window and keep the top n
            window = Window.partitionBy(
                [col(key) for key in partition_spec.partition_by]
            )
            if len(sort_cols) > 0:
                window = window.orderBy(sort_cols)
            else:
                # row_number() still needs an orderBy
                window = window.orderBy(lit(1))

            numbered = sdf.select(
                col("*"), row_number().over(window).alias("__row_number__")
            )
            sdf = numbered.filter(col("__row_number__") <= n).drop(
                "__row_number__"
            )

        return self.to_df(sdf, df.schema, metadata)
コード例 #3
0
 def _select_top(self, df: DataFrame, top_n: int):
     if top_n > 0:
         if len(self.partition_spec.partition_by) > 0:
             p_keys = ", ".join(self.partition_spec.partition_by)
             if len(self.partition_spec.presort) > 0:
                 sort_expr = f"ORDER BY {self.partition_spec.presort_expr}"
             else:
                 sort_expr = ""
             cols = ", ".join(df.schema.names)
             sql = """
             SELECT {cols} FROM (
                 SELECT *, ROW_NUMBER() OVER(PARTITION BY {p_keys} {sort_expr})
                                             AS __top_row_number__
                 FROM __plot_df__) WHERE __top_row_number__ <= {top_n}
             """.format(cols=cols,
                        p_keys=p_keys,
                        sort_expr=sort_expr,
                        top_n=top_n)
             df = self.execution_engine.default_sql_engine.select(
                 DataFrames(__plot_df__=df), sql)
         else:
             order_expr = ""
             if "order_by" in self.params:
                 order_by = parse_presort_exp(
                     self.params.get_or_throw("order_by", object))
                 if len(order_by) > 0:
                     order_expr = "ORDER BY " + ", ".join(
                         k + " " + ("ASC" if v else "DESC")
                         for k, v in order_by.items())
             sql = """
             SELECT * FROM __plot_df__ {order_expr} LIMIT {top_n}
             """.format(order_expr=order_expr, top_n=top_n)
             df = self.execution_engine.default_sql_engine.select(
                 DataFrames(__plot_df__=df), sql)
     return df
コード例 #4
0
 def process(self, dfs: DataFrames) -> None:
     """Plot the first dataframe of ``dfs`` using the configured parameters.

     The dataframe is first reduced by ``_select_top`` with the ``top_n``
     parameter (default ``0``) and converted with ``as_pandas``. Every
     parameter that is not one of the reserved plot options is forwarded
     verbatim to ``_plot`` as a keyword argument.

     :param dfs: input dataframes; only ``dfs[0]`` is used
     """
     reserved = (
         "top_n", "order_by", "x", "y", "kind", "width", "height", "group",
     )
     extra: Dict[str, Any] = {}
     for key, value in self.params.items():
         if key not in reserved:
             extra[key] = value

     limit = self.params.get("top_n", 0)
     pdf = self._select_top(dfs[0], limit).as_pandas()

     # An explicit order_by parameter wins over the partition spec presort
     order_by: Any = (
         parse_presort_exp(self.params.get_or_throw("order_by", object))
         if "order_by" in self.params
         else self.partition_spec.presort
     )

     self._plot(
         pdf,
         self.partition_spec.partition_by,
         x=self.params.get_or_throw("x", str),
         y=self.params.get_or_none("y", object),
         kind=self.params.get("kind", self.kind),
         width=self.params.get("width", 1.0),
         height=self.params.get("height", 0.5),
         order_by=order_by,
         group=self.params.get_or_none("group", object),
         **extra,
     )
コード例 #5
0
 def validate_on_compile(self) -> None:
     """Check the plot parameters before execution.

     The following rules are enforced:

     * when ``self.kind`` is empty, a string ``kind`` parameter is required;
       otherwise ``kind`` must not appear in the parameters
     * ``order_by`` (when given) must be accepted by ``parse_presort_exp``
     * ``x`` is required and must be a string
     * when ``group`` is set, ``y`` must be a string
     * ``width`` must be either ``0.5`` or ``1.0``

     :raises ValueError: passed to ``assert_or_throw`` when ``width`` is not
         one of the accepted values
     """
     if self.kind == "":
         self.params.get_or_throw("kind", str)
     else:
         assert_or_throw("kind" not in self.params,
                         f"can't reset kind {self.kind}")
     # NOTE(review): several lookups below discard their result; presumably
     # they are made so that the params object rejects values whose type does
     # not match the default -- confirm against the params implementation.
     self.params.get("top_n", 0)
     # "a" is only a placeholder so the parser also runs when order_by is absent
     parse_presort_exp(self.params.get("order_by", "a"))
     self.params.get_or_throw("x", str)
     y = self.params.get_or_none("y", object)
     gp = self.params.get_or_none("group", object)
     assert_or_throw(
         gp is None or isinstance(y, str),
         "when group is set, y must be set as a string",
     )
     self.params.get("height", 0.5)
     width = self.params.get("width", 1.0)
     # Give the failure a message instead of raising an empty ValueError
     assert_or_throw(
         width in [0.5, 1.0],
         ValueError(f"width must be 0.5 or 1.0, got {width}"),
     )
コード例 #6
0
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        """Take the first ``n`` rows of ``df``, overall or per partition.

        :param df: input dataframe, converted through ``self.to_df``
        :param n: number of rows to keep (per group when
            ``partition_spec.partition_by`` is not empty)
        :param presort: presort expression; when it is truthy it is parsed
            with ``parse_presort_exp`` and is preferred over
            ``partition_spec.presort``
        :param na_position: forwarded to ``sort_values``
        :param partition_spec: partition keys and fallback presort
        :param metadata: forwarded to the resulting ``DaskDataFrame``
        :return: a ``DaskDataFrame`` carrying the schema of ``df``
        """
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        ddf = self.to_df(df).native
        meta = [(ddf[name].name, ddf[name].dtype) for name in ddf.columns]

        # An explicit presort wins over the one carried by partition_spec
        if presort:
            presort = parse_presort_exp(presort)
        order: IndexedOrderedDict = presort or partition_spec.presort

        def _partition_take(partition, n, presort):
            # Sort one chunk (only when a presort exists) and keep its head
            if len(presort.keys()) == 0:
                return partition.head(n)
            ordered = partition.sort_values(
                list(presort.keys()),
                ascending=list(presort.values()),
                na_position=na_position,
            )
            return ordered.head(n)

        if len(partition_spec.partition_by) > 0:
            grouped = ddf.groupby(partition_spec.partition_by, dropna=False)
            result = grouped.apply(
                _partition_take, n=n, presort=order, meta=meta
            ).reset_index(drop=True)
        elif len(order.keys()) == 0:
            result = ddf.head(n)
        else:
            # Take the top n of every default partition first ...
            per_partition = ddf.map_partitions(
                _partition_take, n, order, meta=meta
            )
            # ... compute() brings this to Pandas so we can use pandas
            collected = per_partition.reset_index(drop=True).compute()
            result = collected.sort_values(
                list(order.keys()),
                ascending=list(order.values()),
                na_position=na_position,
            ).head(n)

        return DaskDataFrame(result, df.schema, metadata)
コード例 #7
0
    def take(
        self,
        df: DataFrame,
        n: int,
        presort: str,
        na_position: str = "last",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        metadata: Any = None,
    ) -> DataFrame:
        """Take the first ``n`` rows of ``df``, overall or per partition.

        :param df: input dataframe, read through ``as_pandas``
        :param n: number of rows to keep (per group when
            ``partition_spec.partition_by`` is not empty)
        :param presort: presort expression; when it is truthy it is parsed
            with ``parse_presort_exp`` and is preferred over
            ``partition_spec.presort``
        :param na_position: forwarded to ``sort_values``
        :param partition_spec: partition keys and fallback presort
        :param metadata: forwarded to the resulting ``PandasDataFrame``
        :return: a ``PandasDataFrame`` carrying the schema of ``df``
        """
        assert_or_throw(
            isinstance(n, int),
            ValueError("n needs to be an integer"),
        )
        pdf = df.as_pandas()

        # An explicit presort wins over the one carried by partition_spec
        if presort:
            presort = parse_presort_exp(presort)
        order: IndexedOrderedDict = presort or partition_spec.presort

        if len(order.keys()) > 0:
            sort_keys = list(order.keys())
            directions = list(order.values())
            pdf = pdf.sort_values(
                sort_keys, ascending=directions, na_position=na_position
            )

        if len(partition_spec.partition_by) > 0:
            grouped = pdf.groupby(by=partition_spec.partition_by, dropna=False)
            top = grouped.head(n)
        else:
            top = pdf.head(n)

        return PandasDataFrame(
            top.reset_index(drop=True),
            df.schema,
            metadata,
            pandas_df_wrapper=True,
        )
コード例 #8
0
def test_parse_presort_exp():
    """Check accepted and rejected inputs of ``parse_presort_exp``."""
    # (raw input, expected (column, ascending) pairs)
    accepted = [
        (None, []),
        (IndexedOrderedDict([("c", True)]), [("c", True)]),
        ("c", [("c", True)]),
        ("         c", [("c", True)]),
        ("c           desc", [("c", False)]),
        ("b desc, c asc", [("b", False), ("c", True)]),
        ("DESC DESC, ASC ASC", [("DESC", False), ("ASC", True)]),
        ([("b", False), ("c", True)], [("b", False), ("c", True)]),
        ("B DESC, C ASC", [("B", False), ("C", True)]),
        # same input as an earlier case, kept so coverage is unchanged
        ("b desc, c asc", [("b", False), ("c", True)]),
    ]
    for raw, pairs in accepted:
        assert parse_presort_exp(raw) == IndexedOrderedDict(pairs)

    rejected = [
        "b dsc, c asc",  # misspelling of desc
        "c true",  # string format needs desc/asc
        # cannot contain duplicates
        # NOTE(review): "true" is also not desc/asc, so this input may fail
        # before any duplicate check is reached -- confirm the intent
        "c true, c true",
        # instead of desc and asc, needs to be bool
        [("b", "desc"), ("c", "asc")],
    ]
    for raw in rejected:
        with raises(SyntaxError):
            parse_presort_exp(raw)