def _delegate(self, path) -> Tuple[FSBase, str]:
    """Resolve ``path`` through the parent class, mounting its root on first use.

    While holding ``self._fs_lock`` the path is parsed with ``_FSPath``; if
    its root is not yet in ``self._fs_store``, a filesystem is created with
    ``self.create_fs`` and mounted under ``to_uuid(root)``. The request is
    then forwarded to the parent ``_delegate`` using the rewritten path
    ``to_uuid(root) + "/" + relative_path``.

    :param path: the path to resolve
    :return: whatever the parent class ``_delegate`` returns for the
        rewritten path
    """
    with self._fs_lock:
        if self._in_create:  # pragma: no cover
            # Re-entrant call made while a root is being created/mounted:
            # hand the path to the parent class unchanged.
            return super()._delegate(path)
        self._in_create = True
        try:
            fp = _FSPath(path)
            if fp.root not in self._fs_store:
                self._fs_store[fp.root] = self.create_fs(fp.root)
                self.mount(to_uuid(fp.root), self._fs_store[fp.root])
        finally:
            # Always clear the flag, otherwise one failure in path parsing,
            # create_fs or mount would make every later call take the
            # re-entrant shortcut above.
            self._in_create = False
    m_path = to_uuid(fp.root) + "/" + fp.relative_path
    return super()._delegate(m_path)
def __init__(
    self,
    file_id: str,
    deterministic: bool,
    permanent: bool,
    lazy: bool = False,
    partition: Any = None,
    single: bool = False,
    namespace: Any = None,
    **save_kwargs: Any,
):
    """Initialise the parent with ``to_file=True`` and set up the yielded file.

    :param file_id: combined with ``namespace`` through ``to_uuid`` to form
        the stored file id
    :param deterministic: forwarded to the parent initialiser
    :param permanent: forwarded to the parent initialiser
    :param lazy: forwarded to the parent initialiser, defaults to False
    :param partition: wrapped in ``PartitionSpec`` before being forwarded
    :param single: forwarded to the parent initialiser, defaults to False
    :param namespace: forwarded to the parent initialiser and also used
        for the file id
    :param save_kwargs: copied into a new dict and forwarded as
        ``save_kwargs``
    """
    parent_args = dict(
        to_file=True,
        deterministic=deterministic,
        permanent=permanent,
        lazy=lazy,
        fmt="",
        partition=PartitionSpec(partition),
        single=single,
        namespace=namespace,
        save_kwargs=dict(save_kwargs),
    )
    super().__init__(**parent_args)
    self._yield_func: Any = None
    resolved_id = to_uuid(file_id, namespace)
    self._file_id = resolved_id
    self._yielded = YieldedFile(resolved_id)
def __uuid__(self) -> str:
    """Return the id built from the parent id and the outputter's settings."""
    base_id = super().__uuid__()
    outputter = self._outputter
    return to_uuid(
        base_id, outputter, outputter._params, outputter._partition_spec
    )
def __uuid__(self) -> str:
    """Return the id built from the parent id and the processor's settings."""
    base_id = super().__uuid__()
    processor = self._processor
    return to_uuid(
        base_id, processor, processor._params, processor._partition_spec
    )
def __uuid__(self) -> str:
    """Return the id built from the wrapper, the two flags and the schema text."""
    parts = (
        self._wrapper,
        self._need_engine,
        self._need_output_schema,
        # The output schema contributes through its string form.
        str(self._output_schema),
    )
    return to_uuid(*parts)
def __uuid__(self) -> str:
    """Return the id built from the wrapper, engine settings and schema text."""
    parts = (
        self._wrapper,
        self._engine_param,
        self._use_dfs,
        self._need_output_schema,
        # The output schema contributes through its string form.
        str(self._output_schema),
    )
    return to_uuid(*parts)
def __uuid__(self) -> str:
    """Return this object's id, computing and caching it on first access.

    An empty ``self._id`` means the id has not been computed yet. In that
    case ``_ensure_fully_connected`` is called first; the id is then
    derived from ``spec``, ``configs`` and ``inputs`` when
    ``self.deterministic`` is truthy, and is a random ``uuid4`` string
    otherwise.
    """
    if self._id != "":
        return self._id
    self._ensure_fully_connected()
    self._id = (
        to_uuid(self.spec, self.configs, self.inputs)
        if self.deterministic
        else str(uuid4())
    )
    return self._id
def __uuid__(self) -> str:
    """Return the id built from this object's configuration and wiring.

    The function contributes through ``get_full_type_path(self.func)``.
    """
    parts = (
        self.configs,
        self.inputs,
        self.outputs,
        get_full_type_path(self.func),
        self.metadata,
        self.deterministic,
        self.lazy,
        self._node_spec,
    )
    return to_uuid(*parts)
def test__to_processor_determinism():
    """Processors built from equivalent inputs are distinct objects with equal ids."""
    base, again, by_name, different = (
        _to_processor(source, None) for source in (t1, t1, "t1", "t2")
    )
    for other in (again, by_name):
        assert base is not other
        assert to_uuid(base) == to_uuid(other)
    # A different function must produce a different id.
    assert to_uuid(base) != to_uuid(different)

    from_class = _to_processor(MockProcessor)
    from_name = _to_processor("MockProcessor")
    assert from_class is not from_name
    assert to_uuid(from_class) == to_uuid(from_name)
def test__to_outputter_determinism():
    """Outputters built from equivalent inputs are distinct objects with equal ids."""
    base, again, by_name, different = (
        _to_outputter(source) for source in (t1, t1, "t1", "t2")
    )
    for other in (again, by_name):
        assert base is not other
        assert to_uuid(base) == to_uuid(other)
    # A different function must produce a different id.
    assert to_uuid(base) != to_uuid(different)

    from_class = _to_outputter(MockOutputter)
    from_name = _to_outputter("MockOutputter")
    assert from_class is not from_name
    assert to_uuid(from_class) == to_uuid(from_name)
def test__to_creator_determinism():
    """Creators built from equivalent inputs are distinct objects with equal ids."""
    base, again, by_name, different = (
        _to_creator(source, None) for source in (t1, t1, "t1", "t2")
    )
    for other in (again, by_name):
        assert base is not other
        assert to_uuid(base) == to_uuid(other)
    # A different function must produce a different id.
    assert to_uuid(base) != to_uuid(different)

    from_class = _to_creator(T0)
    from_name = _to_creator("T0")
    assert from_class is not from_name
    assert to_uuid(from_class) == to_uuid(from_name)
def test__to_transformer_determinism():
    """Transformers built from equivalent inputs are distinct objects with equal ids."""

    def _assert_same_id(left, right):
        # Two separately built objects that must share one id.
        assert left is not right
        assert to_uuid(left) == to_uuid(right)

    base = _to_transformer(t1, None)
    again = _to_transformer(t1, None)
    by_name = _to_transformer("t1", None)
    _assert_same_id(base, again)
    _assert_same_id(base, by_name)

    # Schema given as an expression string vs. as a Schema object.
    _assert_same_id(
        _to_transformer(t4, "a:int,b:int"),
        _to_transformer("t4", Schema("a:int,b:int")),
    )
    _assert_same_id(
        _to_transformer(MockTransformer),
        _to_transformer("MockTransformer"),
    )
    _assert_same_id(
        _to_transformer(t7, "a:int,b:int"),
        _to_transformer("t7", "a:int,b:int"),
    )
def test__to_output_transformer_determinism():
    """Output transformers from equivalent inputs are distinct objects with equal ids."""
    base = _to_output_transformer(t1)
    again = _to_output_transformer(t1)
    by_name = _to_output_transformer("t1")
    for other in (again, by_name):
        assert base is not other
        assert to_uuid(base) == to_uuid(other)

    # Each object/name pair must resolve to the same id.
    for by_object, by_text in (
        (t4, "t4"),
        (MockTransformer, "MockTransformer"),
        (t7, "t7"),
    ):
        left = _to_output_transformer(by_object)
        right = _to_output_transformer(by_text)
        assert left is not right
        assert to_uuid(left) == to_uuid(right)
def __uuid__(self) -> str:
    """Return the id built from configuration, wiring and dependency ids."""
    # _checkpoint is not part of determinism
    # _yield_name is not part of determinism
    # get_full_type_path(self.func) is also left out of the id (it was
    # commented out in the argument list).
    parts = (
        self.configs,
        self.inputs,
        self.outputs,
        self.metadata,
        self.deterministic,
        self.lazy,
        self._get_dependency_uuid(),
        self._broadcast,
    )
    return to_uuid(*parts)
def _get_dependency_uuid(self) -> Any:
    """Return the cached id describing this node's dependencies.

    Each value in ``self.node_spec.dependency`` is split on its first
    ``"."`` and must produce exactly two pieces. The first piece is used as
    a key into ``self.parent_workflow.tasks``. For every entry the
    dependency key, the second piece and the looked-up task's
    ``__uuid__()`` are collected, in that order, and the whole list is
    passed to ``to_uuid``. The result is stored in
    ``self._dependency_uuid`` and reused on later calls.
    """
    # TODO: this should be a part of adagio!!
    if self._dependency_uuid is None:
        collected: List[Any] = []
        for dep_key, dep_ref in self.node_spec.dependency.items():
            pieces = dep_ref.split(".", 1)
            assert_or_throw(len(pieces) == 2)
            upstream = self.parent_workflow.tasks[pieces[0]]
            collected.extend((dep_key, pieces[1], upstream.__uuid__()))
        self._dependency_uuid = to_uuid(collected)
    return self._dependency_uuid
def map(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to each pandas chunk of ``df`` and wrap the result.

    With no ``partition_by`` keys the dataframe is repartitioned by
    ``partition_spec`` and mapped with ``map_partitions``; otherwise it is
    repartitioned by partition count only and mapped per group through
    ``self.pl_utils.safe_groupby_apply``.

    :param df: input dataframe; its schema is read before conversion
    :param map_func: called with a cursor and the chunk wrapped as a
        ``PandasDataFrame``
    :param output_schema: converted with ``Schema`` and used for the
        pandas ``meta`` as well as for the returned dataframe
    :param partition_spec: supplies presort, partition keys, partition
        count and the cursor
    :param metadata: passed to the returned ``DaskDataFrame``
    :param on_init: optional callback, wrapped in ``RunOnce`` and invoked
        with ``(0, chunk)`` before ``map_func``
    :return: a ``DaskDataFrame`` over the mapped result
    """
    sort_spec = partition_spec.presort
    sort_cols = list(sort_spec.keys())
    sort_asc = list(sort_spec.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema

    def _init_key(*args: Any, **kwargs: Any) -> Any:
        # Key handed to RunOnce: identity of the callback plus identity of
        # its first positional argument.
        return to_uuid(id(on_init), id(args[0]))

    on_init_once: Any = None
    if on_init is not None:
        on_init_once = RunOnce(on_init, _init_key)

    def _map(pdf: Any) -> pd.DataFrame:
        # Empty chunks short-circuit to an empty frame of the output schema.
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if sort_cols:
            pdf = pdf.sort_values(sort_cols, ascending=sort_asc)
        local_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, local_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(local_df.peek_array(), 0, 0)
        return map_func(cursor, local_df).as_pandas()

    df = self.to_df(df)
    if len(partition_spec.partition_by) > 0:
        df = self.repartition(df, PartitionSpec(num=partition_spec.num_partitions))
        result = self.pl_utils.safe_groupby_apply(
            df.native,
            partition_spec.partition_by,
            _map,
            meta=output_schema.pandas_dtype,
        )
    else:
        repartitioned = self.repartition(df, partition_spec)
        result = repartitioned.native.map_partitions(
            _map, meta=output_schema.pandas_dtype
        )
    return DaskDataFrame(result, output_schema, metadata)
def _map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to ``df`` through ``mapInPandas``.

    ``df`` is repartitioned by ``partition_spec`` first. Inside the udf the
    incoming pandas frames with at least one row are wrapped lazily into a
    ``LocalDataFrameIterableDataFrame`` which is handed to ``map_func``.

    :param df: input dataframe
    :param map_func: called with a cursor and the iterable dataframe
    :param output_schema: converted with ``Schema``; used for empty output
        and for the Spark schema of the result
    :param partition_spec: used for repartitioning and to create the cursor
    :param metadata: passed to the returned ``SparkDataFrame``
    :param on_init: optional callback, wrapped in ``RunOnce`` and invoked
        with ``(0, input_df)`` before ``map_func``
    :return: a ``SparkDataFrame`` over the mapped result
    """
    df = self.to_df(self.repartition(df, partition_spec))
    output_schema = Schema(output_schema)
    input_schema = df.schema
    on_init_once: Any = (
        None
        if on_init is None
        else RunOnce(
            on_init, lambda *args, **kwargs: to_uuid(id(on_init), id(args[0]))
        )
    )

    def _udf(
        dfs: Iterable[pd.DataFrame],
    ) -> Iterable[pd.DataFrame]:  # pragma: no cover
        def get_dfs() -> Iterable[LocalDataFrame]:
            # Frames without rows are skipped.
            for pdf in dfs:
                if pdf.shape[0] > 0:
                    yield PandasDataFrame(
                        pdf.reset_index(drop=True),
                        input_schema,
                        pandas_df_wrapper=True,
                    )

        input_df = LocalDataFrameIterableDataFrame(get_dfs(), input_schema)
        if input_df.empty:
            # This function is a generator: a ``return <value>`` here would
            # discard the empty frame, so it is yielded explicitly. This
            # matches the empty-input handling of the other map
            # implementations in this file.
            yield PandasDataFrame([], output_schema).as_pandas()
            return
        if on_init_once is not None:
            on_init_once(0, input_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(input_df.peek_array(), 0, 0)
        output_df = map_func(cursor, input_df)
        if isinstance(output_df, LocalDataFrameIterableDataFrame):
            # Stream each inner frame instead of materialising the output.
            for res in output_df.native:
                yield res.as_pandas()
        else:
            yield output_df.as_pandas()

    df = self.to_df(df)
    sdf = df.native.mapInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(sdf, metadata=metadata)
def test_output():
    """Walk ``_Output`` through failed, successful and skipped states."""

    def _assert_state(out, is_set, is_skipped, is_successful, is_failed):
        # Compare the four status flags by truthiness in one place.
        assert bool(out.is_set) == is_set
        assert bool(out.is_skipped) == is_skipped
        assert bool(out.is_successful) == is_successful
        assert bool(out.is_failed) == is_failed

    task = MockTaskForVar()
    spec = OutputSpec("o", dict, False)
    o = _Output(task, spec)
    assert to_uuid(task, spec) == o.__uuid__()
    _assert_state(o, False, False, False, False)

    raises(ValueError, lambda: o.set(1))
    _assert_state(o, True, False, False, True)
    assert isinstance(o.exception, ValueError)
    assert o.trace is not None

    o.set(dict())  # when is_set, setting again will do nothing
    _assert_state(o, True, False, False, True)
    assert isinstance(o.exception, ValueError)

    o = _Output(task, spec)
    # setting a bad value will cause exception on both setters and getters
    raises(ValueError, lambda: o.set(None))
    _assert_state(o, True, False, False, True)
    assert isinstance(o.exception, ValueError)

    # With the third OutputSpec argument set to True, None is accepted.
    o = _Output(task, OutputSpec("o", dict, True))
    o.set(None)
    _assert_state(o, True, False, True, False)
    assert o.exception is None

    o = _Output(task, OutputSpec("o", dict, True))
    o.skip()
    _assert_state(o, True, True, False, False)
    assert o.exception is None
def __uuid__(self) -> str:
    """Return the id built from configuration, wiring and persistence settings."""
    # get_full_type_path(self.func) is left out of the id (it was
    # commented out in the argument list).
    parts = (
        self.configs,
        self.inputs,
        self.outputs,
        self.metadata,
        self.deterministic,
        self.lazy,
        self.node_spec,
        # The persist setting contributes through its string form.
        str(self._persist),
        self._broadcast,
        self._checkpoint,
        self._checkpoint_namespace,
    )
    return to_uuid(*parts)
def test_determinism():
    """Equivalent ``PartitionSpec`` values share an id; key order changes it."""
    # num=0 is the same as giving no arguments.
    assert to_uuid(PartitionSpec(num=0)) == to_uuid(PartitionSpec())

    # Keyword order and int-vs-string num do not matter.
    left = PartitionSpec(by=["a"], num=2)
    right = PartitionSpec(num="2", by=["a"])
    assert to_uuid(left) == to_uuid(right)

    # The order of the partition keys does matter.
    forward = PartitionSpec(by=["a", "b"])
    backward = PartitionSpec(by=["b", "a"])
    assert to_uuid(forward) != to_uuid(backward)
def _group_map_by_pandas_udf(
    self,
    df: DataFrame,
    map_func: Callable[[PartitionCursor, LocalDataFrame], LocalDataFrame],
    output_schema: Any,
    partition_spec: PartitionSpec,
    metadata: Any = None,
    on_init: Optional[Callable[[int, DataFrame], Any]] = None,
) -> DataFrame:
    """Apply ``map_func`` to each group of ``df`` through ``applyInPandas``.

    Groups are formed with ``groupBy`` over ``partition_spec.partition_by``;
    each group is optionally sorted by the presort keys before being
    wrapped and passed to ``map_func``.

    :param df: input dataframe; its schema is read before conversion
    :param map_func: called with a cursor and the group wrapped as a
        ``PandasDataFrame``
    :param output_schema: converted with ``Schema``; used for empty output
        and for the Spark schema of the result
    :param partition_spec: supplies presort, grouping keys and the cursor
    :param metadata: passed to the returned ``SparkDataFrame``
    :param on_init: optional callback, wrapped in ``RunOnce`` and invoked
        with ``(0, group)`` before ``map_func``
    :return: a ``SparkDataFrame`` over the mapped result
    """
    sort_spec = partition_spec.presort
    sort_cols = list(sort_spec.keys())
    sort_asc = list(sort_spec.values())
    output_schema = Schema(output_schema)
    input_schema = df.schema

    def _init_key(*args: Any, **kwargs: Any) -> Any:
        # Key handed to RunOnce: identity of the callback plus identity of
        # its first positional argument.
        return to_uuid(id(on_init), id(args[0]))

    on_init_once: Any = None
    if on_init is not None:
        on_init_once = RunOnce(on_init, _init_key)

    def _udf(pdf: Any) -> pd.DataFrame:  # pragma: no cover
        # Empty groups short-circuit to an empty frame of the output schema.
        if pdf.shape[0] == 0:
            return PandasDataFrame([], output_schema).as_pandas()
        if sort_cols:
            pdf = pdf.sort_values(sort_cols, ascending=sort_asc)
        local_df = PandasDataFrame(
            pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
        )
        if on_init_once is not None:
            on_init_once(0, local_df)
        cursor = partition_spec.get_cursor(input_schema, 0)
        cursor.set(local_df.peek_array(), 0, 0)
        return map_func(cursor, local_df).as_pandas()

    df = self.to_df(df)
    grouped = df.native.groupBy(*partition_spec.partition_by)
    mapped = grouped.applyInPandas(_udf, schema=to_spark_schema(output_schema))
    return SparkDataFrame(mapped, metadata=metadata)
def __uuid__(self) -> str:
    """Return the id built from the function's type path, ``_params`` and ``_rt``."""
    func_path = get_full_type_path(self._func)
    return to_uuid(func_path, self._params, self._rt)
def __uuid__(self) -> str:
    """Return the id built from the wrapper and the engine/dfs flags."""
    parts = (self._wrapper, self._need_engine, self._use_dfs)
    return to_uuid(*parts)
def __uuid__(self) -> str:
    """Return the id built from this object's full type path."""
    type_path = get_full_type_path(self)
    return to_uuid(type_path)
def __uuid__(self) -> str:
    """Return the deterministic unique id of this object.

    The id is computed by ``to_uuid`` from ``self.jsondict``.
    """
    payload = self.jsondict
    return to_uuid(payload)
def __uuid__(self) -> str:
    """Return the id built from the wrapper's own id and the output schema argument."""
    wrapper_id = self._wrapper.__uuid__()
    return to_uuid(wrapper_id, self._output_schema_arg)
def __uuid__(self) -> str:
    """Return the id built from the wrapper, the engine parameter and the dfs flag."""
    parts = (self._wrapper, self._engine_param, self._use_dfs)
    return to_uuid(*parts)
def __uuid__(self) -> str:
    """Return the id built from the parent id, the creator and its ``_params``."""
    base_id = super().__uuid__()
    creator = self._creator
    return to_uuid(base_id, creator, creator._params)
def test_function_wrapper_determinism():
    """Two wrappers built from identical arguments are distinct but share an id."""
    wrapper_args = (f20, "^[ldsp][ldsp]$", "[ldsp]")
    first = FunctionWrapper(*wrapper_args)
    second = FunctionWrapper(*wrapper_args)
    assert first is not second
    assert to_uuid(first) == to_uuid(second)
def __uuid__(self) -> str:
    """Return the id built from ``code``, ``annotation`` and ``_type``."""
    parts = (self.code, self.annotation, self._type)
    return to_uuid(*parts)