def _load_csv(
    p: FileParser, columns: Any = None, **kwargs: Any
) -> Tuple[pd.DataFrame, Any]:
    kw = dict(kwargs)
    header = kw.get("header", False)
    if "header" in kw:
        del kw["header"]
    if str(header) in ["True", "0"]:
        # with a header row: let pandas use row 0 as the header
        pdf = pd.read_csv(p.uri, **{"index_col": False, "header": 0, **kw})
        if columns is None:
            return pdf, None
        if isinstance(columns, list):  # column names
            return pdf[columns], None
        schema = Schema(columns)
        return pdf[schema.names], schema
    if header is None or str(header) == "False":
        # without a header row: the caller must provide column names or a schema
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            pdf = pd.read_csv(
                p.uri, **{"index_col": False, "header": None, "names": columns, **kw}
            )
            return pdf, None
        schema = Schema(columns)
        pdf = pd.read_csv(
            p.uri,
            **{"index_col": False, "header": None, "names": schema.names, **kw},
        )
        return pdf, schema
    else:
        raise NotImplementedError(f"{header} is not supported")
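# Hedged usage sketch for the pandas ``_load_csv`` above; the CSV paths are
# hypothetical and ``FileParser`` is assumed to be constructible from a path.
# Illustrative only, not part of the original module.
def _example_load_csv_pandas() -> None:
    # header=True (or "0") reads row 0 as the header and selects by name;
    # passing a schema expression returns (frame, Schema).
    pdf, schema = _load_csv(
        FileParser("/tmp/with_header.csv"), columns="a:int,b:str", header=True
    )
    # header=False requires columns, which become the column names.
    pdf2, _ = _load_csv(
        FileParser("/tmp/no_header.csv"), columns=["a", "b"], header=False
    )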
def _load_csv(self, p: List[str], columns: Any = None, **kwargs: Any) -> DataFrame:
    kw = ParamDict(kwargs)
    # translate fugue's infer_schema flag into Spark's inferSchema reader option
    infer_schema = kw.get("infer_schema", False)
    if infer_schema:
        kw["inferSchema"] = True
    if "infer_schema" in kw:
        del kw["infer_schema"]
    header = str(kw.get_or_none("header", object)).lower()
    if "header" in kw:
        del kw["header"]
    reader = self._session.read.format("csv")
    reader.options(**kw)
    if header == "true":
        reader.option("header", "true")
        if columns is None:
            return SparkDataFrame(reader.load(p))
        if isinstance(columns, list):  # column names
            return SparkDataFrame(reader.load(p)[columns])
        schema = Schema(columns)
        return SparkDataFrame(reader.load(p)[schema.names], schema)
    if header in ["false", "none"]:
        reader.option("header", "false")
        if columns is None:
            raise InvalidOperationError("columns must be set if without header")
        if isinstance(columns, list):  # column names
            # rename Spark's default _c0, _c1, ... columns to the given names
            sdf = reader.load(p)
            inferred = to_schema(sdf)
            renames = [f"{k} AS {v}" for k, v in zip(inferred.names, columns)]
            return SparkDataFrame(sdf.selectExpr(*renames))
        schema = Schema(columns)
        sdf = reader.schema(to_spark_schema(schema)).load(p)
        return SparkDataFrame(sdf, schema)
    else:
        raise NotImplementedError(f"{header} is not supported")
def to_taskspec(
    obj: Any, parent_workflow_spec: Optional[WorkflowSpec] = None
) -> TaskSpec:
    assert_arg_not_none(obj, "obj")
    if isinstance(obj, str):
        return to_taskspec(json.loads(obj))
    if isinstance(obj, TaskSpec):
        return obj
    if isinstance(obj, Dict):
        d: Dict[str, Any] = dict(obj)
        node_spec: Optional[_NodeSpec] = None
        if "node_spec" in d:
            aot(
                parent_workflow_spec is not None,
                lambda: InvalidOperationError("parent workflow must be set"),
            )
            node_spec = _NodeSpec(
                workflow=parent_workflow_spec, **d["node_spec"]  # type: ignore
            )
            del d["node_spec"]
        if "tasks" in d:
            ts: TaskSpec = WorkflowSpec(**d)
        else:
            ts = TaskSpec(**d)
        if node_spec is not None:
            ts._node_spec = node_spec
        return ts
    raise TypeError(f"can't convert {obj} to TaskSpec")  # pragma: no cover
def skip(self, key: str) -> None:  # pragma: no cover
    """Skip `key`

    :param key: uuid string
    """
    raise InvalidOperationError("skip is not valid in FugueInteractiveCache")
def _make_top_level_workflow(
    spec: WorkflowSpec, ctx: WorkflowContext, configs: Dict[str, Any]
) -> _Workflow:
    aot(
        len(spec.inputs) == 0,
        InvalidOperationError("Can't have inputs for top level workflow"),
    )
    wf = _Workflow(spec, ctx)
    for k, vv in configs.items():
        wf.configs[k].set(vv)
    for k, v in wf.configs.items():
        try:
            v.get()
        except Exception:
            raise InvalidOperationError(
                f"config {k}'s value is not set for top level workflow"
            )
    wf._init_tasks()
    return wf
def _append(self, value: Any):
    assert_or_throw(
        not self.has_key,
        InvalidOperationError("this DataFrames must not have key"),
    )
    assert_or_throw(
        isinstance(value, DataFrame),
        lambda: ValueError(f"{value} is not a DataFrame"),
    )
    super().__setitem__("_" + str(len(self)), value)
def __setitem__(  # type: ignore
    self, key: str, value: DataFrame, *args: Any, **kwds: Any
) -> None:
    assert isinstance(key, str)
    assert_or_throw(
        len(self) == 0 or self.has_key,
        InvalidOperationError("this DataFrames can't have key"),
    )
    assert_or_throw(
        isinstance(value, DataFrame),
        ValueError(f"{key} has non DataFrame value"),
    )
    super().__setitem__(key, value, *args, **kwds)  # type: ignore
    self._has_key = True
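# Hedged construction sketch for the keyed/keyless rules enforced above; it assumes
# ``DataFrames`` routes positional constructor args through ``_append`` and keyword
# args through ``__setitem__``, and that ``ArrayDataFrame`` is available in scope.
def _example_dataframes_keys() -> None:
    df = ArrayDataFrame([[0]], "a:int")
    DataFrames(df, df)      # keyless: stored under generated names "_0", "_1"
    DataFrames(x=df, y=df)  # keyed: stored under "x" and "y"
    try:
        DataFrames(df, y=df)  # mixing positional and keyed entries is rejected
    except InvalidOperationError:
        pass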
def transit(state_from: "_State", state_to: "_State") -> "_State":
    # CREATED may move to any active or terminal state; RUNNING may only
    # finish, abort or fail; every other transition is invalid.
    if state_from == _State.CREATED:
        if state_to in [
            _State.RUNNING,
            _State.ABORTED,
            _State.SKIPPED,
            _State.FINISHED,
        ]:
            return state_to
    elif state_from == _State.RUNNING:
        if state_to in [_State.FINISHED, _State.ABORTED, _State.FAILED]:
            return state_to
    raise InvalidOperationError(f"Unable to transit from {state_from} to {state_to}")
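# Hedged transition sketch; it assumes ``transit`` is reachable as ``_State.transit``
# (e.g. a staticmethod on the enum). Illustrative only.
def _example_state_transitions() -> None:
    _State.transit(_State.CREATED, _State.RUNNING)  # ok -> RUNNING
    _State.transit(_State.RUNNING, _State.FAILED)   # ok -> FAILED
    try:
        _State.transit(_State.FINISHED, _State.RUNNING)  # terminal states can't restart
    except InvalidOperationError:
        pass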
def get(self, key: str) -> Tuple[bool, bool, Any]:
    """Try to get value for `key`

    :param key: uuid string

    :return: <hasvalue>, <skipped>, <value>
    """
    with self._lock:
        if key in self._data:
            return True, False, self._data[key]
        has_value, skipped, value = self._cache.get(key)
        assert_or_throw(
            not skipped,
            InvalidOperationError("skip is not valid in FugueInteractiveCache"),
        )
        if has_value:
            self._data[key] = value
        return has_value, skipped, value
def _append_task(self, task: TaskSpec) -> TaskSpec:
    name = task.name
    assert_triad_var_name(name)
    aot(
        name not in self.tasks,
        lambda: KeyError(f"{name} already exists in workflow"),
    )
    aot(
        task.parent_workflow is self,
        lambda: InvalidOperationError(f"{task} has mismatching node_spec"),
    )
    try:
        task._validate_config()
        task._validate_dependency()
    except DependencyDefinitionError:
        raise
    except Exception as e:
        raise DependencyDefinitionError(e)
    self.tasks[name] = task
    return task
def serialize_df(
    df: Optional[DataFrame],
    threshold: int = -1,
    file_path: Optional[str] = None,
    fs: Optional[FileSystem] = None,
) -> str:
    """Serialize input dataframe to base64 string or to file
    if it's larger than threshold

    :param df: input DataFrame
    :param threshold: file byte size threshold, defaults to -1
    :param file_path: file path to store the data (used only if the serialized data
      is larger than ``threshold``), defaults to None
    :param fs: :class:`~triad:triad.collections.fs.FileSystem`, defaults to None
    :raises InvalidOperationError: if file is large but ``file_path`` is not provided
    :return: a json string either containing the base64 data or the file path

    .. note::

        If fs is not provided but it needs to write to disk, then it will use
        :meth:`~fs:fs.opener.registry.Registry.open_fs` to try to open the file
        to write.
    """
    if df is None:
        return json.dumps(dict())
    data = pickle_df(df)
    size = len(data)
    if threshold < 0 or size <= threshold:
        res = dict(data=base64.b64encode(data).decode())
    else:
        if file_path is None:
            raise InvalidOperationError("file_path is not provided")
        if fs is None:
            with open_fs(
                os.path.dirname(file_path), writeable=True, create=False
            ) as _fs:
                _fs.writebytes(os.path.basename(file_path), data)
        else:
            fs.writebytes(file_path, data)
        res = dict(path=file_path)
    return json.dumps(res)
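# Hedged usage sketch for ``serialize_df``; it assumes ``ArrayDataFrame`` is
# available and that a matching ``deserialize_df`` exists elsewhere to reverse it.
# The file path is hypothetical.
def _example_serialize_df() -> None:
    df = ArrayDataFrame([[0, "x"]], "a:int,b:str")
    s_inline = serialize_df(df)  # within threshold: json with base64-encoded payload
    s_file = serialize_df(df, threshold=0, file_path="/tmp/df.bin")  # spills to file
    try:
        serialize_df(df, threshold=0)  # over threshold without a file path
    except InvalidOperationError:
        pass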
def __getitem__(self, key: str) -> Any:  # pragma: no cover
    raise InvalidOperationError("Can't get items from outputs")
def zip(
    self,
    df1: DataFrame,
    df2: DataFrame,
    how: str = "inner",
    partition_spec: PartitionSpec = EMPTY_PARTITION_SPEC,
    temp_path: Optional[str] = None,
    to_file_threshold: Any = -1,
    df1_name: Optional[str] = None,
    df2_name: Optional[str] = None,
):
    """Partition the two dataframes in the same way with ``partition_spec`` and
    zip the partitions together on the partition keys.

    :param df1: the first dataframe
    :param df2: the second dataframe
    :param how: can accept ``inner``, ``left_outer``, ``right_outer``,
      ``full_outer``, ``cross``, defaults to ``inner``
    :param partition_spec: partition spec to partition each dataframe,
      defaults to empty.
    :type partition_spec: PartitionSpec, optional
    :param temp_path: file path to store the data (used only if the serialized data
      is larger than ``to_file_threshold``), defaults to None
    :param to_file_threshold: file byte size threshold, defaults to -1
    :param df1_name: df1's name in the zipped dataframe, defaults to None
    :param df2_name: df2's name in the zipped dataframe, defaults to None

    :return: a zipped dataframe, the metadata of the dataframe will
      indicate it's zipped

    :Notice:

    * Different from join, ``df1`` and ``df2`` can have common columns that you will
      not use as partition keys.
    * If ``on`` is not specified it will also use the common columns of the two
      dataframes (if it's not a cross zip)
    * For non-cross zip, the two dataframes must have common columns, or error will
      be thrown

    For more details and examples, read
    :ref:`Zip & Comap <tutorial:/tutorials/execution_engine.ipynb#zip-&-comap>`.
    """
    on = list(partition_spec.partition_by)
    how = how.lower()
    assert_or_throw(
        "semi" not in how and "anti" not in how,
        InvalidOperationError("zip does not support semi or anti joins"),
    )
    serialized_cols: Dict[str, Any] = {}
    schemas: Dict[str, Any] = {}
    if len(on) == 0:
        if how != "cross":
            on = df1.schema.extract(
                df2.schema.names, ignore_key_mismatch=True
            ).names
    else:
        assert_or_throw(
            how != "cross",
            InvalidOperationError("can't specify keys for cross join"),
        )
    partition_spec = PartitionSpec(partition_spec, by=on)

    def update_df(df: DataFrame, name: Optional[str]) -> DataFrame:
        if name is None:
            name = f"_{len(serialized_cols)}"
        if not df.metadata.get("serialized", False):
            df = self._serialize_by_partition(
                df,
                partition_spec,
                name,
                temp_path,
                to_file_threshold,
                has_name=name is not None,
            )
        for k in df.metadata["serialized_cols"].keys():
            assert_or_throw(
                k not in serialized_cols, lambda: ValueError(f"{k} is duplicated")
            )
            serialized_cols[k] = df.metadata["serialized_cols"][k]
            schemas[k] = df.metadata["schemas"][k]
        return df

    df1 = update_df(df1, df1_name)
    df2 = update_df(df2, df2_name)
    metadata = dict(
        serialized=True,
        serialized_cols=serialized_cols,
        schemas=schemas,
        serialized_has_name=df1_name is not None or df2_name is not None,
    )
    return self.join(df1, df2, how=how, on=on, metadata=metadata)
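# Hedged usage sketch for ``zip``; it assumes this method lives on an ExecutionEngine
# (e.g. ``NativeExecutionEngine``) and that ``ArrayDataFrame`` is available in scope.
# Illustrative only.
def _example_zip() -> None:
    engine = NativeExecutionEngine()
    df1 = ArrayDataFrame([[0, 1]], "a:int,b:int")
    df2 = ArrayDataFrame([[0, 2]], "a:int,c:int")
    # no partition keys given, so the common column "a" becomes the zip key
    zipped = engine.zip(df1, df2, how="inner")
    # zipped.metadata carries serialized=True per the metadata attached above,
    # so a comap step can consume the zipped partitions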
def __deepcopy__(self, memo: Any) -> "FugueTask":
    raise InvalidOperationError("can't copy")
def __copy__(self) -> "FugueTask":
    raise InvalidOperationError("can't copy")
def __init__(  # noqa: C901
    self,
    df: Any = None,
    schema: Any = None,
    metadata: Any = None,
    pandas_df_wrapper: bool = False,
):
    try:
        if df is None:
            # empty dataframe: a non-empty schema must be provided
            schema = _input_schema(schema).assert_not_empty()
            arr = [pa.array([])] * len(schema)
            self._native = pa.Table.from_arrays(arr, schema=schema.pa_schema)
            super().__init__(schema, metadata)
            return
        elif isinstance(df, pa.Table):
            assert_or_throw(
                schema is None,
                InvalidOperationError("can't reset schema for pa.Table"),
            )
            self._native = df
            super().__init__(Schema(df.schema), metadata)
            return
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            if isinstance(df, pd.Series):
                df = df.to_frame()
            pdf = df
            if schema is None:
                self._native = pa.Table.from_pandas(
                    pdf,
                    schema=Schema(pdf).pa_schema,
                    preserve_index=False,
                    safe=True,
                )
                schema = Schema(self._native.schema)
            else:
                schema = _input_schema(schema).assert_not_empty()
                self._native = pa.Table.from_pandas(
                    pdf, schema=schema.pa_schema, preserve_index=False, safe=True
                )
            super().__init__(schema, metadata)
            return
        elif isinstance(df, Iterable):
            # build through pandas, coercing datetime-like fields so the
            # arrow conversion is safe
            schema = _input_schema(schema).assert_not_empty()
            # n = len(schema)
            # arr = []
            # for i in range(n):
            #     arr.append([])
            # for row in df:
            #     for i in range(n):
            #         arr[i].append(row[i])
            # cols = [pa.array(arr[i], type=schema.types[i]) for i in range(n)]
            # self._native = pa.Table.from_arrays(cols, schema=schema.pa_schema)
            pdf = pd.DataFrame(df, columns=schema.names)
            for f in schema.fields:
                if pa.types.is_timestamp(f.type) or pa.types.is_date(f.type):
                    pdf[f.name] = pd.to_datetime(pdf[f.name])
            self._native = pa.Table.from_pandas(
                pdf, schema=schema.pa_schema, preserve_index=False, safe=True
            )
            super().__init__(schema, metadata)
            return
        else:
            raise ValueError(f"{df} is incompatible with ArrowDataFrame")
    except Exception as e:
        raise FugueDataFrameInitError from e
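# Hedged construction sketch for ``ArrowDataFrame`` covering the branches above;
# illustrative only.
def _example_arrow_dataframe() -> None:
    ArrowDataFrame(pd.DataFrame({"a": [1, 2]}))          # schema inferred from pandas
    ArrowDataFrame([[1, "x"], [2, "y"]], "a:int,b:str")  # iterable data needs a schema
    ArrowDataFrame(None, "a:int")                        # empty frame with given schema
    try:
        # providing a schema together with a pa.Table is rejected
        ArrowDataFrame(pa.Table.from_pydict({"a": [1]}), "a:long")
    except FugueDataFrameInitError:
        pass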
def items(self) -> Iterable[Tuple[str, Any]]:  # pragma: no cover
    raise InvalidOperationError("Can't get items from outputs")
def count(self) -> int:
    """
    :raises InvalidOperationError: You can't count an unbounded dataframe
    """
    raise InvalidOperationError("Impossible to count a LocalUnboundedDataFrame")
def _pre_update(self, op: str, need_reindex: bool = True) -> None:
    if self.readonly:
        raise InvalidOperationError("This dict is readonly")
    self._need_reindex = need_reindex
def node_spec(self) -> "_NodeSpec":
    if self._node_spec is not None:
        return self._node_spec
    raise InvalidOperationError(  # pragma: no cover
        f"node_spec is not set for {self}"
    )