def get_futures(self, kf: KTSFrame) -> Tuple[Dict[Tuple, ObjectID], Dict[Tuple, AnyFrame]]:
    """Fetch or schedule the result for every part produced by ``self.split(kf)``.

    For each ``args`` tuple a ``RunID`` is built from the part's scope, the
    frame's fold and the frame's hash. If ``self.request_resource`` already
    has a result for that run, it is reused. Otherwise a previously stored
    state (looked up under ``run_id.state_id``) is installed into the frame
    passed to the computation, the run is handed to ``self.schedule`` and its
    outputs are passed to ``self.sync``.

    Returns:
        A pair ``(scheduled, ready)`` keyed by ``args``. Reused results always
        go to ``ready``; freshly scheduled results go to ``scheduled`` when
        ``self.parallel`` is truthy and to ``ready`` otherwise.
    """
    pending = {}
    ready = {}
    for args in self.split(kf):
        scope = self.get_scope(*args)
        run_id = RunID(scope, kf._fold, kf.hash())

        # Reuse an already available result instead of recomputing it.
        available = self.request_resource(run_id, kf)
        if available is not None:
            ready[args] = available
            continue

        saved_state = self.request_resource(run_id.state_id, kf)
        # In parallel mode the computation gets the frame returned by clear_states().
        kf_arg = kf.clear_states() if self.parallel else kf
        if saved_state is not None:
            kf_arg.set_scope(scope)
            kf_arg.__states__[kf_arg._state_key] = saved_state

        run_id, res_df, res_state, stats = self.schedule(*args, scope=scope, kf=kf_arg)
        self.sync(run_id, res_df, res_state, stats, kf)

        destination = pending if self.parallel else ready
        destination[args] = res_df
    return pending, ready
def local_worker(self, *args, kf: KTSFrame):
    """Run ``self.compute`` on ``kf`` in the current process and report progress.

    Progress is announced before and after the computation: through
    ``rs.send`` when ``in_worker()`` is true, through the report stored in
    ``kf.__meta__['report']`` otherwise, and not at all unless
    ``self.verbose`` is truthy.

    Returns:
        A tuple ``(result, state, stats_data)`` where ``state`` is
        ``kf._state`` or ``None`` depending on the caching policy below.
    """
    run_id = RunID(kf._scope, kf._fold, kf.hash())

    # Default for cached FCs or first calls of not cached FCs.
    return_state = kf._train
    if not self.cache and bool(kf._state):
        # The second call of a not cached FC does not return state, as it was saved previously;
        # refer to https://github.com/konodyuk/kts/tree/master/kts/core#caching-policy
        return_state = False

    stats = Stats(kf)

    # Choose the output channel and announce the start of the run.
    if in_worker() and self.verbose:
        report = None
        io_context = self.remote_io(run_id)
        rs.send(ProgressSignal(0, 1, None, None, None, run_id))
    elif not in_worker() and self.verbose:
        report = kf.__meta__['report']
        io_context = self.local_io(report, run_id)
        report.update(run_id, 0, 1)
    else:
        report = None
        io_context = self.suppress_io()

    with stats, io_context, self.suppress_stderr(), pbar.local_mode(report, run_id):
        computed = self.compute(*args, kf)

    # Remember the output columns the first time a column-bearing result is seen.
    if 'columns' in dir(computed) and '__columns' not in kf._state:
        kf._state['__columns'] = list(computed.columns)

    res_state = kf._state if return_state else None

    # Announce completion together with the measured duration.
    if in_worker() and self.verbose:
        rs.send(ProgressSignal(1, 1, stats.data['took'], None, None, run_id))
    elif not in_worker() and self.verbose:
        report = kf.__meta__['report']
        report.update(run_id, 1, 1, stats.data['took'])

    return computed, res_state, stats.data
def worker(self, *args, df: pd.DataFrame, meta: Dict):
    """Run ``self.compute`` on a frame rebuilt from ``df`` and ``meta``.

    ``schedule`` invokes this function through ``worker.remote(...)``, so it
    is presumably executed as a Ray remote task (the decorator is not visible
    here -- confirm at the definition site).

    Args:
        *args: Positional arguments forwarded to ``self.compute``.
        df: Data used to build the ``KTSFrame``.
        meta: Frame metadata. Must contain ``'pid'`` and must not contain
            ``'run_manager'`` or ``'report'``.

    Returns:
        ``(result, state, stats_data)`` on success, where ``state`` is
        ``kf._state`` if ``kf._train`` is truthy and ``None`` otherwise.
        ``(None, None, None)`` if the computation raised an ``Exception``;
        in that case the formatted traceback is sent as an ``ErrorSignal``.
    """
    assert 'run_manager' not in meta
    assert 'report' not in meta
    assert 'pid' in meta

    # Make signals and address lookups refer to the pid recorded in the metadata.
    signal.pid = meta['pid']
    address_manager.pid = meta['pid']

    kf = KTSFrame(df, meta=meta)
    kf.__meta__['remote'] = True
    return_state = kf._train

    if self.verbose:
        rs.send(ProgressSignal(0, 1, None, None, None))
        io = self.remote_io()
    else:
        io = self.suppress_io()
    rs.send(RunPID(os.getpid()))

    stats = Stats(df)
    with stats, io, self.suppress_stderr():
        try:
            res_kf = self.compute(*args, kf)
        except Exception:
            # Only ordinary errors are reported and swallowed. KeyboardInterrupt and
            # SystemExit (e.g. an interrupt or cancellation of this task) must
            # propagate instead of being turned into an error report.
            rs.send(rs.ErrorSignal(traceback.format_exc()))
            return None, None, None

    # Remember the output columns the first time a column-bearing result is seen.
    if 'columns' in dir(res_kf) and '__columns' not in kf._state:
        kf._state['__columns'] = list(res_kf.columns)

    res_state = kf._state if return_state else None

    if self.verbose:
        rs.send(ProgressSignal(1, 1, stats.data['took'], None, None))
    return res_kf, res_state, stats.data
def assemble_futures(self, scheduled_dfs: Dict[Tuple, ObjectID], result_dfs: Dict[Tuple, AnyFrame], kf: KTSFrame) -> KTSFrame:
    """Resolve scheduled results and combine all parts into a single frame.

    Every entry of ``scheduled_dfs`` is resolved with ``ray.get`` and stored
    into ``result_dfs`` (which is modified in place). The parts are then
    collected in the order given by ``self.split(kf)``, merged with
    ``self.reduce`` and wrapped in a ``KTSFrame`` that shares ``kf.__meta__``.
    """
    for key, future in scheduled_dfs.items():
        result_dfs[key] = ray.get(future)

    ordered_parts = [result_dfs[args] for args in self.split(kf)]
    assembled = KTSFrame(self.reduce(ordered_parts))
    assembled.__meta__ = kf.__meta__
    return assembled
def safe_put(kf: KTSFrame):
    """Return an object-store id for ``kf``, reusing a registered one if present.

    The address manager is queried by the frame's hash. If it already knows
    the hash, the id it returns is used. Otherwise the frame is stored with
    ``ray.put`` and ``(hash, id, False)`` is registered with the manager; that
    registration call is not awaited.
    """
    manager = get_address_manager()
    frame_hash = kf.hash()

    if ray.get(manager.has.remote(frame_hash)):
        return ray.get(manager.get.remote(frame_hash))

    object_id = ray.put(kf)
    manager.put.remote((frame_hash, object_id, False))
    return object_id
def schedule(self, *args, scope: str, kf: KTSFrame) -> Tuple[RunID, Union[ObjectID, AnyFrame], Union[ObjectID, Dict], Union[ObjectID, Dict]]:
    """Start one computation for ``kf`` under ``scope``.

    The ``RunID`` is built from ``scope``, ``kf._fold`` and ``kf.hash()``
    before the scope is applied. Inside ``self.set_scope(kf, scope)`` the
    work is either dispatched through ``worker.remote`` (when
    ``self.parallel`` is truthy; the frame is first stored via ``safe_put``)
    or executed by ``self.local_worker``.

    Returns:
        ``(run_id, result, state, stats)`` as produced by the chosen worker.
    """
    run_id = RunID(scope, kf._fold, kf.hash())
    with self.set_scope(kf, scope):
        if not self.parallel:
            res_df, res_state, stats = self.local_worker(*args, kf=kf)
        else:
            frame_meta = kf.__meta__
            frame_ref = safe_put(kf)
            res_df, res_state, stats = worker.remote(self, *args, df=frame_ref, meta=frame_meta)
    return run_id, res_df, res_state, stats
def run(self, feature_constructors: List[BaseFeatureConstructor], frame: AnyFrame, *, train: bool, fold: str, ret: bool = False, report=None) -> Optional[Dict[str, AnyFrame]]:
    """Apply each feature constructor to ``frame`` and optionally collect the outputs.

    Args:
        feature_constructors: Callables with a ``name`` attribute; each is
            called as ``feature_constructor(frame, ret=ret)``.
        frame: Input data; wrapped in a ``KTSFrame``.
        train: Stored in the frame metadata under ``'train'``.
        fold: Stored in the frame metadata under ``'fold'``.
        ret: When truthy, the per-constructor outputs are returned.
        report: Progress report; a ``SilentFeatureComputingReport`` is
            created when omitted.

    Returns:
        A dict mapping constructor name to its output when ``ret`` is truthy,
        otherwise ``None``.
    """
    ensure_ray()
    if report is None:
        report = SilentFeatureComputingReport()
    frame = KTSFrame(frame)

    outputs = {}
    for feature_constructor in feature_constructors:
        # The metadata is written again before every constructor call.
        meta_fields = {
            'train': train,
            'fold': fold,
            'run_manager': self,
            'report': report,
            'pid': os.getpid(),
        }
        for field, value in meta_fields.items():
            frame.__meta__[field] = value

        run_id = RunID(feature_constructor.name, frame._fold, frame.hash())
        with pbar.local_mode(report, run_id):
            outputs[feature_constructor.name] = feature_constructor(frame, ret=ret)

    if not ret:
        return None
    return outputs
def compute(self, kf: KTSFrame):
    """Call ``self.func`` on ``kf`` and normalise its output to a DataFrame.

    Dependencies listed in ``self.dependencies`` are resolved through
    ``self.request_resource`` and passed to ``self.func`` as keyword
    arguments. The output must have as many rows as ``kf``. A DataFrame
    output must share ``kf``'s index; any other output is wrapped in a
    DataFrame indexed like ``kf`` with columns named ``{self.name}_{i}``.

    When ``kf.train`` is falsy and ``kf._state`` already stores
    ``'__columns'`` that differ from the output's columns, missing columns
    are added filled with ``None`` and the output is returned restricted to
    (and ordered by) the stored columns. Otherwise the output's columns are
    recorded in ``kf._state['__columns']`` if no record exists yet.
    """
    resources = {}
    for arg_name, resource_id in self.dependencies.items():
        resources[arg_name] = self.request_resource(resource_id, kf)

    output = self.func(kf, **resources)
    assert output.shape[0] == kf.shape[0]

    if not isinstance(output, pd.DataFrame):
        generated_names = [f"{self.name}_{i}" for i in range(output.shape[1])]
        output = pd.DataFrame(data=output, index=kf.index, columns=generated_names)
    else:
        assert all(output.index == kf.index)

    if not kf.train and '__columns' in kf._state:
        expected = kf._state['__columns']
        same_columns = len(output.columns) == len(expected) and all(output.columns == expected)
        if not same_columns:
            # Align the output with the columns recorded earlier.
            for missing in set(expected) - set(output.columns):
                output[missing] = None
            return output[expected]

    if '__columns' not in kf._state:
        kf._state['__columns'] = list(output.columns)
    return output
def set_scope(self, kf: KTSFrame, scope: str):
    """Temporarily set ``kf.__meta__['scope']`` to ``scope``.

    Written as a generator with a single ``yield``; ``schedule`` uses it in a
    ``with`` statement, so it is presumably wrapped by
    ``contextlib.contextmanager`` where it is defined (the decorator is not
    visible here -- confirm).

    The previous scope is restored in a ``finally`` clause, so it is put back
    even when the managed body raises.
    """
    previous_scope = kf.__meta__['scope']
    kf.__meta__['scope'] = scope
    try:
        yield
    finally:
        kf.__meta__['scope'] = previous_scope