self.save(df_train) class TaskTrain(d6tflow.tasks.TaskPickle): # save output as pickle do_preprocess = luigi.BoolParameter(default=True) def requires(self): return TaskPreprocess(do_preprocess=self.do_preprocess) def run(self): df_train = self.input().load() model = sklearn.svm.SVC() model.fit(df_train.iloc[:,:-1], df_train['y']) self.save(model) # Check task dependencies and their execution status d6tflow.show(TaskTrain()) d6tflow.show([TaskTrain(do_preprocess=False)]) ''' └─--[TaskTrain-{'do_preprocess': 'True'} (PENDING)] └─--[TaskPreprocess-{'do_preprocess': 'True'} (PENDING)] └─--[TaskGetData-{} (PENDING)] ''' # Execute the model training task including dependencies d6tflow.run(TaskTrain()) ''' ===== Luigi Execution Summary ===== Scheduled 3 tasks of which:
def test_pipes_advanced(cleanup_pipe):
    """Exercise push/pull flows across two d6tpipe pipes and profiles."""
    import d6tflow.pipes

    # local_pipe=True must hand back a Local* pipe implementation.
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'], profile=cfg['d6tpipe_profile'],
                       local_pipe=True, reset=True)
    assert 'Local' in d6tflow.pipes.get_pipe().__class__.__name__

    # Re-init against the remote pipe for the rest of the test.
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'], profile=cfg['d6tpipe_profile'],
                       reset=True)

    class Task1(d6tflow.tasks.TaskPqPandas):
        def run(self):
            self.save(df)

    t1 = Task1()
    pipe1 = t1.get_pipe()
    pipedir = pipe1.dirpath
    t1filepath = t1.output().path
    t1file = str(PurePosixPath(t1filepath.relative_to(pipedir)))

    d6tflow.preview(t1)
    assert d6tflow.run(t1)
    assert t1.complete()

    # Best-effort cleanup of any stale remote copy; `fuckit` swallows errors
    # if the file is not present remotely.
    with fuckit:
        pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.scan_remote(cached=False) == []

    # Remote is empty, so pull previews nothing and push previews the new file.
    assert t1.pull_preview() == []
    assert t1.push_preview() == [t1file]
    assert d6tflow.pipes.all_push_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert d6tflow.pipes.all_push(t1) == {cfg['d6tpipe_pipe1']: [t1file]}

    # Deliberate redefinition: Task1 now models an external task that lives
    # on pipe1, consumed by Task2 on a different pipe.
    class Task1(d6tflow.tasks.TaskPqPandas):
        external = True
        pipename = cfg['d6tpipe_pipe1']

    class Task2(d6tflow.tasks.TaskPqPandas):
        persist = ['df2', 'df4']

        def requires(self):
            return Task1()

        def run(self):
            df2fun(self)

    # Reload so module-level pipe state is rebuilt, then switch profiles.
    import importlib
    importlib.reload(d6tflow)
    importlib.reload(d6tflow.pipes)
    d6tflow.cache.pipes = {}
    d6tflow.pipes.init(cfg['d6tpipe_pipe2'], profile=cfg['d6tpipe_profile2'],
                       reset=True)

    t1 = Task1()
    assert t1.get_pipename() == cfg['d6tpipe_pipe1']
    assert not t1.complete()
    assert t1.pull_preview() == [str(t1file)]
    assert d6tflow.pipes.all_pull_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert t1.pull() == [str(t1file)]
    assert t1.complete()
    assert t1.output().load().equals(df)

    t2 = Task2()
    d6tflow.show([t2])
    assert d6tflow.run([t2])  # run as list
    pipe2 = t2.get_pipe()
    pipedir = t2.get_pipe().dirpath
    # assert False
    t2files = [
        str(PurePosixPath(p.path.relative_to(pipedir)))
        for p in t2.output().values()
    ]
    assert d6tflow.pipes.all_push_preview(t2) == {
        cfg['d6tpipe_pipe2']: t2files
    }

    # cleanup
    pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.scan_remote(cached=False) == []
import d6tflow

import cfg
import tasks
import visualize

# Check task dependencies and their execution status
d6tflow.show(tasks.TaskTrain())

# Execute the model training task including dependencies
d6tflow.run(tasks.TaskTrain())

# use output
visualize.accuracy()
visualize.plot_importances()

# change parameter and rerun
d6tflow.run(tasks.TaskTrain(do_preprocess=False))
visualize.accuracy(do_preprocess=False)  # task output is parameter specific

# rerun flow after code changes
import importlib

importlib.reload(cfg)
importlib.reload(tasks)

# say you changed TaskGetData, reset all tasks depending on TaskGetData
d6tflow.invalidate_downstream(tasks.TaskGetData(), tasks.TaskTrain())

d6tflow.show(tasks.TaskTrain())
d6tflow.run(tasks.TaskTrain())