Esempio n. 1
0
        self.save(df_train)

class TaskTrain(d6tflow.tasks.TaskPickle):  # task output is stored as a pickle
    """Fit a support vector classifier on the output of ``TaskPreprocess``.

    The ``do_preprocess`` flag (default ``True``) is passed straight through
    to the upstream ``TaskPreprocess`` task.
    """

    do_preprocess = luigi.BoolParameter(default=True)

    def requires(self):
        """Return the upstream ``TaskPreprocess`` built with this task's flag."""
        upstream = TaskPreprocess(do_preprocess=self.do_preprocess)
        return upstream

    def run(self):
        """Load the upstream frame, fit an ``SVC`` on it and save the model."""
        frame = self.input().load()
        classifier = sklearn.svm.SVC()
        # Features: every column except the last; target: the 'y' column.
        features = frame.iloc[:, :-1]
        target = frame['y']
        classifier.fit(features, target)
        self.save(classifier)

# Check task dependencies and their execution status:
# first for TaskTrain with its default parameters ...
d6tflow.show(TaskTrain())
# ... then for the do_preprocess=False variant, passed as a one-element list.
d6tflow.show([TaskTrain(do_preprocess=False)])

# The bare string below is never used by the program. It appears to record the
# dependency tree printed for TaskTrain() with do_preprocess=True, with every
# task still PENDING (i.e. before anything has been run).
'''
└─--[TaskTrain-{'do_preprocess': 'True'} (PENDING)]
   └─--[TaskPreprocess-{'do_preprocess': 'True'} (PENDING)]
      └─--[TaskGetData-{} (PENDING)]
'''

# Execute the model training task including its dependencies
d6tflow.run(TaskTrain())

'''
===== Luigi Execution Summary =====

Scheduled 3 tasks of which:
Esempio n. 2
0
def test_pipes_advanced(cleanup_pipe):
    """Exercise push/pull of task outputs across two d6tflow pipes.

    Flow, as visible in this test:

    1. Initialise pipe 1 (first with ``local_pipe=True``, then again without
       it), run ``Task1`` and check the push previews and the push result.
    2. Redefine ``Task1`` as an external task bound to pipe 1, reload
       ``d6tflow`` and initialise pipe 2; check that the ``Task1`` output has
       to be pulled before the task reports complete.
    3. Run ``Task2`` (which requires ``Task1``) and check its push preview
       against pipe 2.
    4. Remove the ``Task1`` file from pipe 1's remote again.

    Relies on names defined outside this function: ``cfg``, ``df``,
    ``df2fun``, ``fuckit`` and ``PurePosixPath``.

    Args:
        cleanup_pipe: Not referenced in the body; presumably a pytest fixture
            requested for its setup/teardown side effects -- confirm.
    """
    import d6tflow.pipes
    # Initialise pipe 1 as a local pipe first and check that the resulting
    # pipe object's class name contains 'Local' ...
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'],
                       profile=cfg['d6tpipe_profile'],
                       local_pipe=True,
                       reset=True)
    assert 'Local' in d6tflow.pipes.get_pipe().__class__.__name__
    # ... then re-initialise the same pipe without local_pipe for the rest of
    # the test.
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'],
                       profile=cfg['d6tpipe_profile'],
                       reset=True)

    # Minimal task: its run() only saves the `df` object defined outside this
    # function.
    class Task1(d6tflow.tasks.TaskPqPandas):
        def run(self):
            self.save(df)

    t1 = Task1()
    pipe1 = t1.get_pipe()
    pipedir = pipe1.dirpath
    t1filepath = t1.output().path
    # Output path relative to the pipe directory, rendered as a POSIX-style
    # string; the preview/push/pull assertions below compare against it.
    t1file = str(PurePosixPath(t1filepath.relative_to(pipedir)))

    d6tflow.preview(t1)
    assert d6tflow.run(t1)
    assert t1.complete()

    # NOTE(review): `fuckit` is not defined in this function; presumably the
    # `fuckit` package used as an error-suppressing context manager, making
    # this pre-clean of the remote best-effort -- confirm.
    with fuckit:
        pipe1._pullpush_luigi([t1file], op='remove')

    # Expected state: empty remote scan, empty pull preview, and t1file as the
    # only push candidate (per task and via the all-pipes helpers).
    assert pipe1.scan_remote(cached=False) == []
    assert t1.pull_preview() == []
    assert t1.push_preview() == [t1file]
    assert d6tflow.pipes.all_push_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert d6tflow.pipes.all_push(t1) == {cfg['d6tpipe_pipe1']: [t1file]}

    # Redefine Task1 under the same name, now flagged `external` and tied to
    # pipe 1 by name; this version has no run() of its own.
    class Task1(d6tflow.tasks.TaskPqPandas):
        external = True
        pipename = cfg['d6tpipe_pipe1']

    # Task2 requires Task1 and declares two persisted outputs, 'df2' and
    # 'df4'; the actual work is delegated to df2fun (defined elsewhere).
    class Task2(d6tflow.tasks.TaskPqPandas):
        persist = ['df2', 'df4']

        def requires(self):
            return Task1()

        def run(self):
            df2fun(self)

    # Reload d6tflow and clear its cached pipes before initialising pipe 2
    # with the second profile -- presumably to mimic a fresh session; confirm.
    import importlib
    importlib.reload(d6tflow)
    importlib.reload(d6tflow.pipes)
    d6tflow.cache.pipes = {}
    d6tflow.pipes.init(cfg['d6tpipe_pipe2'],
                       profile=cfg['d6tpipe_profile2'],
                       reset=True)
    t1 = Task1()
    # Task1 still reports pipe 1 as its pipe although pipe 2 was just
    # initialised.
    assert t1.get_pipename() == cfg['d6tpipe_pipe1']
    # Not complete until pulled: the previews list t1file, pull() returns it,
    # and afterwards the task is complete and its loaded output equals df.
    assert not t1.complete()
    assert t1.pull_preview() == [str(t1file)]
    assert d6tflow.pipes.all_pull_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert t1.pull() == [str(t1file)]
    assert t1.complete()
    assert t1.output().load().equals(df)

    t2 = Task2()
    d6tflow.show([t2])
    assert d6tflow.run([t2])  # run as list

    # NOTE(review): pipe2 is assigned here but not used again in this test.
    pipe2 = t2.get_pipe()
    pipedir = t2.get_pipe().dirpath
    # Paths of all Task2 outputs, relative to Task2's pipe directory, as
    # POSIX-style strings.
    t2files = [
        str(PurePosixPath(p.path.relative_to(pipedir)))
        for p in t2.output().values()
    ]

    # Task2's outputs are expected to be the push candidates for pipe 2.
    assert d6tflow.pipes.all_push_preview(t2) == {
        cfg['d6tpipe_pipe2']: t2files
    }

    # cleanup: remove the Task1 file from pipe 1 and check the remote scan is
    # empty again
    pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.scan_remote(cached=False) == []
Esempio n. 3
0
import d6tflow
import cfg, tasks, visualize

# Check task dependencies and their execution status
d6tflow.show(tasks.TaskTrain())

# Execute the model training task including its dependencies
d6tflow.run(tasks.TaskTrain())

# Use the task output through the project's `visualize` helpers
visualize.accuracy()
visualize.plot_importances()

# Change a parameter and rerun; the same parameter is passed to the helper
# because task output is stored per parameter set
d6tflow.run(tasks.TaskTrain(do_preprocess=False))
visualize.accuracy(do_preprocess=False)  # task output is parameter specific

# Rerun the flow after code changes: reload the project modules so that the
# edited definitions in `cfg` and `tasks` are re-executed in this session
import importlib
importlib.reload(cfg)
importlib.reload(tasks)

# Say you changed TaskGetData: reset all tasks depending on TaskGetData.
# The second argument is presumably the downstream end of the flow (here
# TaskTrain) -- confirm against the d6tflow documentation.
d6tflow.invalidate_downstream(tasks.TaskGetData(), tasks.TaskTrain())

# Inspect the status again, then rerun the flow
d6tflow.show(tasks.TaskTrain())
d6tflow.run(tasks.TaskTrain())