Example #1
def _load_csv(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    kw = dict(kwargs)
    header = kw.get("header", False)
    if "header" in kw:
        del kw["header"]
    if str(header) in ["True", "0"]:
        pdf = pd.read_csv(p.uri, **{"index_col": False, "header": 0, **kw})
        if columns is None:
            return pdf, None
        if isinstance(columns, list):  # column names
            return pdf[columns], None
        schema = Schema(columns)
        return pdf[schema.names], schema
    if header is None or str(header) == "False":
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            pdf = pd.read_csv(
                p.uri, **{"index_col": False, "header": None, "names": columns, **kw}
            )
            return pdf, None
        schema = Schema(columns)
        pdf = pd.read_csv(
            p.uri, **{"index_col": False, "header": None, "names": schema.names, **kw}
        )
        return pdf, schema
    else:
        raise NotImplementedError(f"{header} is not supported")
Example #2
 def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
     kw = ParamDict(kwargs)
     infer_schema = kw.get("infer_schema", False)
     if infer_schema:
         kw["inferSchema"] = True
     if "infer_schema" in kw:
         del kw["infer_schema"]
     header = str(kw.get_or_none("header", object)).lower()
     if "header" in kw:
         del kw["header"]
     reader = self._session.read.format("csv")
     reader.options(**kw)
     if header == "true":
         reader.option("header", "true")
         if columns is None:
             return SparkDataFrame(reader.load(p))
         if isinstance(columns, list):  # column names
             return SparkDataFrame(reader.load(p)[columns])
         schema = Schema(columns)
         return SparkDataFrame(reader.load(p)[schema.names], schema)
     if header in ["false", "none"]:
         reader.option("header", "false")
         if columns is None:
             raise InvalidOperationError("columns must be set if without header")
         if isinstance(columns, list):  # column names
             sdf = reader.load(p)
             inferred = to_schema(sdf)
             renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
             return SparkDataFrame(sdf.selectExpr(*renames))
         schema = Schema(columns)
         sdf = reader.schema(to_spark_schema(schema)).load(p)
         return SparkDataFrame(sdf, schema)
     else:
         raise NotImplementedError(f"{header} is not supported")
Example #3
def to_taskspec(
        obj: Any,
        parent_workflow_spec: Optional[WorkflowSpec] = None) -> TaskSpec:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, str):
        return to_taskspec(json.loads(obj))
    if isinstance(obj, TaskSpec):
        return obj
    if isinstance(obj, Dict):
        d: Dict[str, Any] = dict(obj)
        node_spec: Optional[_NodeSpec] = None
        if "node_spec" in d:
            aot(
                parent_workflow_spec is not None,
                lambda: InvalidOperationError("parent workflow must be set"),
            )
            node_spec = _NodeSpec(
                workflow=parent_workflow_spec,
                **d["node_spec"]  # type: ignore
            )
            del d["node_spec"]
        if "tasks" in d:
            ts: TaskSpec = WorkflowSpec(**d)
        else:
            ts = TaskSpec(**d)
        if node_spec is not None:
            ts._node_spec = node_spec
        return ts
    raise TypeError(f"can't convert {obj} to TaskSpec")  # pragma: no cover
Example #4
    def skip(self, key: str) -> None:  # pragma: no cover
        """Skip `key`

        :param key: uuid string
        """
        raise InvalidOperationError(
            "skip is not valid in FugueInteractiveCache")
Example #5
def _make_top_level_workflow(spec: WorkflowSpec, ctx: WorkflowContext,
                             configs: Dict[str, Any]) -> _Workflow:
    aot(
        len(spec.inputs) == 0,
        InvalidOperationError("Can't have inputs for top level workflow"),
    )
    wf = _Workflow(spec, ctx)
    for k, vv in configs.items():
        wf.configs[k].set(vv)
    for k, v in wf.configs.items():
        try:
            v.get()
        except Exception:
            raise InvalidOperationError(
                f"config {k}'s value is not set for top level workflow")
    wf._init_tasks()
    return wf
Example #6
 def _append(self, value: Any):
     assert_or_throw(not self.has_key,
                     InvalidOperationError("this DataFrames must have key"))
     assert_or_throw(
         isinstance(value, DataFrame),
         lambda: ValueError(f"{value} is not a DataFrame"),
     )
     super().__setitem__("_" + str(len(self)), value)
Example #7
 def __setitem__(  # type: ignore
         self, key: str, value: DataFrame, *args: Any, **kwds: Any) -> None:
     assert isinstance(key, str)
     assert_or_throw(
         len(self) == 0 or self.has_key,
         InvalidOperationError("this DataFrames can's have key"),
     )
     assert_or_throw(isinstance(value, DataFrame),
                     ValueError(f"{key} has non DataFrame value"))
     super().__setitem__(key, value, *args, **kwds)  # type: ignore
     self._has_key = True
Example #8
 def transit(state_from: "_State", state_to: "_State") -> "_State":
     if state_from == _State.CREATED:
         if state_to in [
                 _State.RUNNING,
                 _State.ABORTED,
                 _State.SKIPPED,
                 _State.FINISHED,
         ]:
             return state_to
     elif state_from == _State.RUNNING:
         if state_to in [_State.FINISHED, _State.ABORTED, _State.FAILED]:
             return state_to
     raise InvalidOperationError(
         f"Unable to transit from {state_from} to {state_to}")
Example #9
    def get(self, key: str) -> Tuple[bool, bool, Any]:
        """Try to get value for `key`

        :param key: uuid string
        :return: <hasvalue>, <skipped>, <value>
        """
        with self._lock:
            if key in self._data:
                return True, False, self._data[key]
            has_value, skipped, value = self._cache.get(key)
            assert_or_throw(
                not skipped,
                InvalidOperationError(
                    "skip is not valid in FugueInteractiveCache"),
            )
            if has_value:
                self._data[key] = value
            return has_value, skipped, value
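The get above is a read-through lookup guarded by a lock: check the fast local dict first, fall back to the underlying cache, and record the value locally on a hit, returning a tuple that tells the caller whether a value exists. A minimal standalone sketch of that contract follows; ReadThroughCache and its dict-backed store are hypothetical, and the skipped flag is dropped for brevity.

from threading import RLock
from typing import Any, Dict, Tuple


class ReadThroughCache:
    """Illustrative only: caches successful lookups from a slower backing store."""

    def __init__(self, backing: Dict[str, Any]):
        self._backing = backing            # slower, shared store
        self._local: Dict[str, Any] = {}   # fast local copies
        self._lock = RLock()

    def get(self, key: str) -> Tuple[bool, Any]:
        with self._lock:
            if key in self._local:
                return True, self._local[key]
            if key in self._backing:
                self._local[key] = self._backing[key]  # write through on first hit
                return True, self._local[key]
            return False, None


cache = ReadThroughCache({"a": 1})
assert cache.get("a") == (True, 1)
assert cache.get("b") == (False, None)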
Example #10
 def _append_task(self, task: TaskSpec) -> TaskSpec:
     name = task.name
     assert_triad_var_name(name)
     aot(
         name not in self.tasks,
         lambda: KeyError(f"{name} already exists in workflow"),
     )
     aot(
         task.parent_workflow is self,
         lambda: InvalidOperationError(f"{task} has mismatching node_spec"),
     )
     try:
         task._validate_config()
         task._validate_dependency()
     except DependencyDefinitionError:
         raise
     except Exception as e:
         raise DependencyDefinitionError(e)
     self.tasks[name] = task
     return task
Example #11
def serialize_df(
    df: Optional[DataFrame],
    threshold: int = -1,
    file_path: Optional[str] = None,
    fs: Optional[FileSystem] = None,
) -> str:
    """Serialize input dataframe to base64 string or to file
    if it's larger than threshold

    :param df: input DataFrame
    :param threshold: file byte size threshold, defaults to -1
    :param file_path: file path to store the data (used only if the serialized data
      is larger than ``threshold``), defaults to None
    :param fs: :class:`~triad:triad.collections.fs.FileSystem`, defaults to None
    :raises InvalidOperationError: if file is large but ``file_path`` is not provided
    :return: a json string either containing the base64 data or the file path

    .. note::

        If fs is not provided but it needs to write to disk, then it will use
        :meth:`~fs:fs.opener.registry.Registry.open_fs` to try to open the file to
        write.
    """
    if df is None:
        return json.dumps(dict())
    data = pickle_df(df)
    size = len(data)
    if threshold < 0 or size <= threshold:
        res = dict(data=base64.b64encode(data).decode())
    else:
        if file_path is None:
            raise InvalidOperationError("file_path is not provided")
        if fs is None:
            with open_fs(os.path.dirname(file_path),
                         writeable=True,
                         create=False) as _fs:
                _fs.writebytes(os.path.basename(file_path), data)
        else:
            fs.writebytes(file_path, data)
        res = dict(path=file_path)
    return json.dumps(res)
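The docstring describes a simple threshold rule: payloads at or under the threshold travel inline as base64 inside the returned JSON, anything larger is written to a file and only the path is recorded. Here is a standard-library-only sketch of that rule; demo_serialize and its parameters are illustrative and not fugue's serialize_df.

import base64
import json
import pickle
from typing import Any, Optional


def demo_serialize(obj: Any, threshold: int = -1, file_path: Optional[str] = None) -> str:
    data = pickle.dumps(obj)
    if threshold < 0 or len(data) <= threshold:
        # inline: the payload travels inside the JSON string
        return json.dumps(dict(data=base64.b64encode(data).decode()))
    if file_path is None:
        raise ValueError("file_path is not provided")
    with open(file_path, "wb") as f:  # spill to disk, return only the path
        f.write(data)
    return json.dumps(dict(path=file_path))


small = demo_serialize([1, 2, 3])       # threshold=-1 always inlines
assert "data" in json.loads(small)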
Example #12
 def __getitem__(self, key: str) -> Any:  # pragma: no cover
     raise InvalidOperationError("Can't get items from outputs")
Example #13
    def zip(
        self,
        df1: DataFrame,
        df2: DataFrame,
        how: str = "inner",
        partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
        temp_path: Optional[str] = None,
        to_file_threshold: Any = -1,
        df1_name: Optional[str] = None,
        df2_name: Optional[str] = None,
    ):
        """Partition the two dataframes in the same way with ``partition_spec`` and
        zip the partitions together on the partition keys.

        :param df1: the first dataframe
        :param df2: the second dataframe
        :param how: can accept ``inner``, ``left_outer``, ``right_outer``,
          ``full_outer``, ``cross``, defaults to ``inner``
        :param partition_spec: partition spec to partition each dataframe,
          defaults to empty.
        :type partition_spec: PartitionSpec, optional
        :param temp_path: file path to store the data (used only if the serialized data
          is larger than ``to_file_threshold``), defaults to None
        :param to_file_threshold: file byte size threshold, defaults to -1
        :param df1_name: df1's name in the zipped dataframe, defaults to None
        :param df2_name: df2's name in the zipped dataframe, defaults to None

        :return: a zipped dataframe, the metadata of the
          dataframe will indicate it's zipped

        :Notice:

        * Different from join, ``df1`` and ``df2`` can have common columns that you will
          not use as partition keys.
        * If ``on`` is not specified it will also use the common columns of the two
          dataframes (if it's not a cross zip)
        * For non-cross zip, the two dataframes must have common columns, or an error
          will be thrown

        For more details and examples, read
        :ref:`Zip & Comap <tutorial:/tutorials/execution_engine.ipynb#zip-&-comap>`.
        """
        on = list(partition_spec.partition_by)
        how = how.lower()
        assert_or_throw(
            "semi" not in how and "anti" not in how,
            InvalidOperationError("zip does not support semi or anti joins"),
        )
        serialized_cols: Dict[str, Any] = {}
        schemas: Dict[str, Any] = {}
        if len(on) == 0:
            if how != "cross":
                on = df1.schema.extract(df2.schema.names,
                                        ignore_key_mismatch=True).names
        else:
            assert_or_throw(
                how != "cross",
                InvalidOperationError("can't specify keys for cross join"),
            )
        partition_spec = PartitionSpec(partition_spec, by=on)

        def update_df(df: DataFrame, name: Optional[str]) -> DataFrame:
            if name is None:
                name = f"_{len(serialized_cols)}"
            if not df.metadata.get("serialized", False):
                df = self._serialize_by_partition(
                    df,
                    partition_spec,
                    name,
                    temp_path,
                    to_file_threshold,
                    has_name=name is not None,
                )
            for k in df.metadata["serialized_cols"].keys():
                assert_or_throw(k not in serialized_cols,
                                lambda: ValueError(f"{k} is duplicated"))
                serialized_cols[k] = df.metadata["serialized_cols"][k]
                schemas[k] = df.metadata["schemas"][k]
            return df

        df1 = update_df(df1, df1_name)
        df2 = update_df(df2, df2_name)
        metadata = dict(
            serialized=True,
            serialized_cols=serialized_cols,
            schemas=schemas,
            serialized_has_name=df1_name is not None or df2_name is not None,
        )
        return self.join(df1, df2, how=how, on=on, metadata=metadata)
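The docstring explains that zip partitions both dataframes by their common key columns and pairs the partitions key by key. The pandas-only sketch below illustrates that pairing idea under the assumption of an inner zip on the shared column; it is not fugue's implementation.

import pandas as pd

df1 = pd.DataFrame({"k": [1, 1, 2], "a": [10, 11, 12]})
df2 = pd.DataFrame({"k": [1, 2, 2], "b": ["x", "y", "z"]})

# common columns act as the partition keys (here: "k")
keys = [c for c in df1.columns if c in set(df2.columns)]

g1 = dict(tuple(df1.groupby(keys)))
g2 = dict(tuple(df2.groupby(keys)))

# inner "zip": only keys present on both sides are paired
for k in sorted(set(g1) & set(g2)):
    print(k, len(g1[k]), len(g2[k]))  # each pair of partitions could now be comapped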
Example #14
 def __deepcopy__(self, memo: Any) -> "FugueTask":
     raise InvalidOperationError("can't copy")
Example #15
 def __copy__(self) -> "FugueTask":
     raise InvalidOperationError("can't copy")
Example #16
 def __init__(  # noqa: C901
     self,
     df: Any = None,
     schema: Any = None,
     metadata: Any = None,
     pandas_df_wrapper: bool = False,
 ):
     try:
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
             arr = [pa.array([])] * len(schema)
             self._native = pa.Table.from_arrays(arr,
                                                 schema=schema.pa_schema)
             super().__init__(schema, metadata)
             return
         elif isinstance(df, pa.Table):
             assert_or_throw(
                 schema is None,
                 InvalidOperationError("can't reset schema for pa.Table"),
             )
             self._native = df
             super().__init__(Schema(df.schema), metadata)
             return
         elif isinstance(df, (pd.DataFrame, pd.Series)):
             if isinstance(df, pd.Series):
                 df = df.to_frame()
             pdf = df
             if schema is None:
                 self._native = pa.Table.from_pandas(
                     pdf,
                     schema=Schema(pdf).pa_schema,
                     preserve_index=False,
                     safe=True,
                 )
                 schema = Schema(self._native.schema)
             else:
                 schema = _input_schema(schema).assert_not_empty()
                 self._native = pa.Table.from_pandas(
                     pdf,
                     schema=schema.pa_schema,
                     preserve_index=False,
                     safe=True)
             super().__init__(schema, metadata)
             return
         elif isinstance(df, Iterable):
             schema = _input_schema(schema).assert_not_empty()
             # n = len(schema)
             # arr = []
             # for i in range(n):
             #     arr.append([])
             # for row in df:
             #     for i in range(n):
             #         arr[i].append(row[i])
             # cols = [pa.array(arr[i], type=schema.types[i]) for i in range(n)]
             # self._native = pa.Table.from_arrays(cols, schema=schema.pa_schema)
             pdf = pd.DataFrame(df, columns=schema.names)
             for f in schema.fields:
                 if pa.types.is_timestamp(f.type) or pa.types.is_date(
                         f.type):
                     pdf[f.name] = pd.to_datetime(pdf[f.name])
             schema = _input_schema(schema).assert_not_empty()
             self._native = pa.Table.from_pandas(pdf,
                                                 schema=schema.pa_schema,
                                                 preserve_index=False,
                                                 safe=True)
             super().__init__(schema, metadata)
             return
         else:
             raise ValueError(f"{df} is incompatible with ArrowDataFrame")
     except Exception as e:
         raise FugueDataFrameInitError from e
Example #17
 def items(self) -> Iterable[Tuple[str, Any]]:  # pragma: no cover
     raise InvalidOperationError("Can't get items from outputs")
Example #18
 def count(self) -> int:
     """
     :raises InvalidOperationError: You can't count an unbounded dataframe
     """
     raise InvalidOperationError("Impossible to count an LocalUnboundedDataFrame")
Example #19
 def _pre_update(self, op: str, need_reindex: bool = True) -> None:
     if self.readonly:
         raise InvalidOperationError("This dict is readonly")
     self._need_reindex = need_reindex
Example #20
 def node_spec(self) -> "_NodeSpec":
     if self._node_spec is not None:
         return self._node_spec
     raise InvalidOperationError(  # pragma: no cover
         f"node_spec is not set for {self}")