def run(config_path):
    """Download the experiment config from GCS, execute the d6tflow pipeline and persist the model.

    Args:
        config_path: full GCS path to the YAML config, e.g. 'gs://bucket/dir/config.yaml'.

    Returns:
        True on successful completion.
    """
    local_path = './experimentos/producao/'
    filename = 'config.yaml'
    # Bucket name is the first path segment after the 5-char 'gs://' scheme prefix.
    bucket_name = config_path[5:].split('/')[0]
    Utils.download_file_from_gcp(config_path,
                                 local_path=local_path,
                                 filename=filename,
                                 bucket_name=bucket_name)

    # Load environment-specific settings (development vs production).
    config = config_pre_tratamento(local_path + filename)
    project = config['project']  # read eagerly so a missing key fails early
    config['caminho_saida_dados'] = local_path
    d6tflow.set_dir(config['caminho_saida_dados'])

    # Build and execute the production-report task graph.
    task_params = get_tasks(config)
    report_task = tasks.TaskPrdReport(**task_params)
    d6tflow.preview(report_task)
    d6tflow.run(report_task, workers=config['workers'])

    # Load the trained model produced upstream and persist it.
    trained_model = tasks.TaskTrainModel(
        task_engineer_params=task_params['task_engineer_params'],
        task_te_params=task_params['task_te_params'],
        task_ps_params=task_params['task_ps_params'],
        task_model_params=task_params['task_model_params']).output().load()
    salvar_modelo(report_task, trained_model, config)
    return True
def test_plot(cleanup):
    """Matplotlib task outputs: single save/invalidate, multi-target persist, preview."""
    frame_a = pd.DataFrame({'a': range(10)})
    frame_b = pd.DataFrame({'b': range(10, 20)})
    axes_a = frame_a.plot.bar()
    axes_b = frame_b.plot.bar()

    class TaskPlot(d6tflow.tasks.TaskMatplotlib):
        def run(self):
            self.save(axes_a)

    # single-output task: saved file appears and invalidate removes it
    TaskPlot().run()
    assert TaskPlot().output().exists()
    TaskPlot().invalidate(confirm=False)
    assert not TaskPlot().output().exists()

    class TaskPlot2(d6tflow.tasks.TaskMatplotlib):
        persist = ['plot1', 'plot2']

        def run(self):
            self.save({'plot1': axes_a, 'plot2': axes_b})

    # multi-output task: completeness tracks both persisted targets
    TaskPlot2().run()
    assert TaskPlot2().complete()
    TaskPlot2().invalidate(confirm=False)
    assert not TaskPlot2().complete()

    d6tflow.preview(TaskPlot2(), clip_params=True)
def test_preview():
    """d6tflow.preview reports PENDING after invalidation and COMPLETE after a run."""
    import io
    from contextlib import redirect_stdout

    t1, t2, t3 = Task1(), Task2(), Task3()
    d6tflow.invalidate_upstream(t3, confirm=False)

    # all three tasks pending after upstream invalidation
    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.preview(t3)
        captured = buf.getvalue()
    assert captured.count('PENDING') == 3
    assert captured.count('COMPLETE') == 0

    # after running the flow, everything reads COMPLETE
    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.run(t3)
        d6tflow.preview(t3)
        captured = buf.getvalue()
    assert captured.count('PENDING') == 0
    assert captured.count('COMPLETE') == 3

    # a new parameterisation only leaves the parameterised tasks pending
    with io.StringIO() as buf, redirect_stdout(buf):
        d6tflow.preview(Task3(do_preprocess=False))
        captured = buf.getvalue()
    assert captured.count('PENDING') == 1
    assert captured.count('COMPLETE') == 2
def preview(self, func_to_preview, params: dict = None):
    """Preview the d6tflow task graph for a registered step.

    Args:
        func_to_preview: step function whose task graph should be previewed.
        params: explicit task parameters. The original annotated this as a
            required ``dict`` yet treated falsy values as "not given"; it now
            defaults to ``None`` (backward compatible — callers that passed a
            dict are unaffected). When falsy, falls back to every parameter
            set previously recorded for this step in ``self.params_used``;
            when none were recorded, previews the task with default params.
    """
    name = func_to_preview.__name__
    if params:
        # explicit parameters take precedence
        d6tflow.preview(self.steps[name](**params))
        return
    used_param_sets = self.params_used.get(name)
    if used_param_sets:
        # preview once per historically-used parameter set
        # (reuse the fetched list instead of re-indexing self.params_used)
        for used in used_param_sets:
            d6tflow.preview(self.steps[name](**used))
    else:
        d6tflow.preview(self.steps[name]())
import d6tflow
import cfg, tasks, visualize

# Inspect the dependency graph and execution status before running.
d6tflow.preview(tasks.TaskTrain())

# Train the model, executing any incomplete upstream tasks.
# See https://d6tflow.readthedocs.io/en/latest/run.html
d6tflow.run(tasks.TaskTrain())

# Consume the trained model's outputs.
visualize.accuracy()
visualize.plot_importances()

# Rerun with a different parameter; task output is parameter-specific.
# See https://d6tflow.readthedocs.io/en/latest/advparam.html
d6tflow.run(tasks.TaskTrain(do_preprocess=False))
visualize.accuracy(do_preprocess=False)

# Rerun the flow after code changes: reload the edited modules...
import importlib
importlib.reload(cfg)
importlib.reload(tasks)

# ...then, if TaskGetData changed, reset every task downstream of it.
d6tflow.invalidate_downstream(tasks.TaskGetData(), tasks.TaskTrain())
d6tflow.preview(tasks.TaskTrain())
d6tflow.run(tasks.TaskTrain())
def test_pipes_advanced(cleanup_pipe):
    """Exercise d6tflow.pipes end-to-end: local vs remote init, push/pull
    previews, cross-profile pull of an external task's output, and pushing a
    multi-output task to a second pipe.
    """
    import d6tflow.pipes
    # init with local_pipe=True should yield a Local* pipe implementation
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'], profile=cfg['d6tpipe_profile'], local_pipe=True, reset=True)
    assert 'Local' in d6tflow.pipes.get_pipe().__class__.__name__
    # re-init as a (non-local) pipe for the rest of the test
    d6tflow.pipes.init(cfg['d6tpipe_pipe1'], profile=cfg['d6tpipe_profile'], reset=True)

    class Task1(d6tflow.tasks.TaskPqPandas):
        def run(self):
            self.save(df)

    t1 = Task1()
    pipe1 = t1.get_pipe()
    pipedir = pipe1.dirpath
    t1filepath = t1.output().path
    # t1's output path relative to the pipe dir, normalised to POSIX form
    t1file = str(PurePosixPath(t1filepath.relative_to(pipedir)))

    d6tflow.preview(t1)
    assert d6tflow.run(t1)
    assert t1.complete()
    # best-effort remote cleanup; `fuckit` suppresses any exception raised here
    with fuckit:
        pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.scan_remote(cached=False) == []

    # remote is empty: nothing to pull, t1's file is the only push candidate
    assert t1.pull_preview() == []
    assert t1.push_preview() == [t1file]
    assert d6tflow.pipes.all_push_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    assert d6tflow.pipes.all_push(t1) == {cfg['d6tpipe_pipe1']: [t1file]}

    # redefine Task1 as an external task whose data lives in pipe1
    class Task1(d6tflow.tasks.TaskPqPandas):
        external = True
        pipename = cfg['d6tpipe_pipe1']

    class Task2(d6tflow.tasks.TaskPqPandas):
        persist = ['df2', 'df4']

        def requires(self):
            return Task1()

        def run(self):
            df2fun(self)

    # reload d6tflow to clear module-level pipe state, then switch to a
    # second profile/pipe
    import importlib
    importlib.reload(d6tflow)
    importlib.reload(d6tflow.pipes)
    d6tflow.cache.pipes = {}
    d6tflow.pipes.init(cfg['d6tpipe_pipe2'], profile=cfg['d6tpipe_profile2'], reset=True)

    t1 = Task1()
    # the external task still reports pipe1 even though pipe2 is now active
    assert t1.get_pipename() == cfg['d6tpipe_pipe1']
    assert not t1.complete()
    assert t1.pull_preview() == [str(t1file)]
    assert d6tflow.pipes.all_pull_preview(t1) == {
        cfg['d6tpipe_pipe1']: [t1file]
    }
    # pulling makes the external task complete with the original data
    assert t1.pull() == [str(t1file)]
    assert t1.complete()
    assert t1.output().load().equals(df)

    t2 = Task2()
    d6tflow.show([t2])
    assert d6tflow.run([t2])  # run as list
    pipe2 = t2.get_pipe()
    pipedir = t2.get_pipe().dirpath
    # assert False
    t2files = [
        str(PurePosixPath(p.path.relative_to(pipedir)))
        for p in t2.output().values()
    ]
    # both persisted outputs of Task2 are push candidates for pipe2
    assert d6tflow.pipes.all_push_preview(t2) == {
        cfg['d6tpipe_pipe2']: t2files
    }

    # cleanup
    pipe1._pullpush_luigi([t1file], op='remove')
    assert pipe1.scan_remote(cached=False) == []
# NOTE(review): fragment begins mid-class — idx3/export/run below are members
# of an enclosing Task class whose header is outside this view.
idx3 = luigi.Parameter(default='test3')
export = False  # presumably excludes this task from pipe export — TODO confirm

def run(self):
    # persist the same module-level df under two target names
    self.save({'df': df, 'df2': df})

# Task requiring all three Task1A/B/C tasks via the @requires shorthand
@d6tflow.requires(Task1A, Task1B, Task1C)
class Task1All(d6tflow.tasks.TaskCache):
    def run(self):
        self.save(df)

d6tflow.run(Task1All())
d6tflow.invalidate_upstream(Task1All(), confirm=False)
d6tflow.preview(Task1All())
task = Task1All()

#**************************************************
# tests
#**************************************************
import pytest
import d6tflow.pipes

def readfile(file_dir):
    # reads the whole file as text into a local that is never returned —
    # NOTE(review): looks like a missing `return file`, unless the function
    # continues beyond this chunk; verify before relying on it.
    with open(file_dir, 'r') as f:
        file = f.read()
import importlib
import d6tflow
import luigi
import pandas as pd

# shared fixture frame saved by both tasks below
df = pd.DataFrame({'a': range(10)})


class Task1(d6tflow.tasks.TaskCache):
    """Parent task with two string parameters and a single persisted target."""
    persist = ['df']
    idx = luigi.Parameter(default='test')
    idx2 = luigi.Parameter(default='test')

    def run(self):
        self.save({'df': df})


# Task2 inherits Task1's parameters and requires a parameter-matched clone
# of its parent.
@d6tflow.inherits(Task1)
@d6tflow.clone_parent
class Task2(d6tflow.tasks.TaskCache):
    def run(self):
        self.save({'df': df})


d6tflow.preview(Task2(), clip_params=True)
import d6tflow

# Workflow task definitions and output visualisations.
import flow_tasks
import flow_viz

# Instantiate the terminal task with its parameters.
params = {'data_size': 6, 'mini_batch_size': 2}
task = flow_tasks.TaskModelTrain(**params)

# Optional: reset the whole flow on every invocation.
d6tflow.invalidate_upstream(task, confirm=False)

# Show the dependency graph, then execute the terminal task.
d6tflow.preview(task, clip_params=True)
d6tflow.run(task)

# Display results once the terminal task has completed.
if task.complete():
    flow_viz.show_test_prints(params)
def preview(self, func_to_preview, params=None):
    """Instantiate the task behind ``func_to_preview`` (optionally with
    ``params``) and render its d6tflow dependency-graph preview."""
    self._instantiate([func_to_preview], params=params)
    task = self.instantiated_tasks[func_to_preview.__name__]
    return d6tflow.preview(task)
# NOTE(review): fragment starts mid-class — the lines below are the tail of a
# requires() method and the run() method of an enclosing Task class.
    return self.clone_parent()  # automatically pass parameters upstream

    def run(self):
        # train the estimator selected by the `model` parameter
        df_train = self.input().load()
        if self.model == 'ols':
            model = sklearn.linear_model.LogisticRegression()
        elif self.model == 'svm':
            model = sklearn.svm.SVC()
        else:
            # fail loudly on unsupported model names
            raise ValueError('invalid model selection')
        # last column 'y' is the target; preceding columns are features
        model.fit(df_train.iloc[:, :-1], df_train['y'])
        self.save(model)

# Check task dependencies and their execution status
d6tflow.preview(TaskTrain())
''' └─--[TaskTrain-{'do_preprocess': 'False', 'model': 'ols'} (PENDING)] └─--[TaskPreprocess-{'do_preprocess': 'False'} (PENDING)] └─--[TaskGetData-{} (PENDING)] '''
# Execute the model training task including dependencies
d6tflow.run(TaskTrain())
''' ===== Luigi Execution Summary ===== Scheduled 3 tasks of which: * 3 ran successfully: - 1 TaskGetData() - 1 TaskPreprocess(do_preprocess=False)
# NOTE(review): fragment starts mid-call — these are the trailing keyword
# arguments of a strategy-dict constructor begun outside this view.
    symbols = ['CAT','WMT'],
    lookback_period = 1
    )

# derive strategy variants from the base strategy dict
strategy2 = strategy1.copy()
strategy2['symbols']=['MSFT','FB'] # run another universe
strategy3 = strategy1.copy()
strategy3['date_start']= datetime.date(2019,1,1) # run another time period

#************************************************************
# run backtests
#************************************************************

# run backtest including necessary dependencies
for istrat, strategy in enumerate([strategy1,strategy2,strategy3]):
    print(f'run strategy #{istrat+1}')
    print(d6tflow.preview(Backtest(**strategy))) # show which tasks will be run
    d6tflow.run(Backtest(**strategy))
    df_pnl1 = Backtest(**strategy).output()['pnl'].load() # load task output
    # total PnL across all symbols and dates, rounded for display
    print(f'pnl strategy #{istrat+1}:', df_pnl1.sum().sum().round(3))

def dev():
    # helper used during development only
    TradingSignals(**strategy1).reset() # reset after making updates

#************************************************************
# backtest output
#************************************************************
''' run strategy #1
# run workflow for model 1 d6tflow.run(TaskTrain(**params_model1)) ''' ===== Luigi Execution Summary ===== Scheduled 3 tasks of which: * 3 ran successfully: - 1 TaskGetData() - 1 TaskPreprocess(do_preprocess=False) - 1 TaskTrain(do_preprocess=False, model=ols) ''' # Intelligently rerun workflow after changing parameters d6tflow.preview(TaskTrain(**params_model2)) ''' └─--[TaskTrain-{'do_preprocess': 'False'} (PENDING)] └─--[TaskPreprocess-{'do_preprocess': 'False'} (PENDING)] └─--[TaskGetData-{} (COMPLETE)] => this doesn't change and doesn't need to rerun ''' # run workflow for model 2 d6tflow.run(TaskTrain(**params_model2)) # compare results from new model # Load task output to pandas dataframe and model object for model evaluation model1 = TaskTrain(**params_model1).output().load() df_train = TaskPreprocess(**params_model1).output().load()
self.save(df_train) class TaskTrain(dt.tasks.TaskPickle): do_preprocess = li.BoolParameter(default = True) def requires(self): return TaskPreprocess(do_preprocess = self.do_preprocess) def run(self): df_train = self.input().load() class_weights = {0:1, 1:2} model = RandomForestClassifier(n_estimators = 400, random_state = 0, max_depth=20, class_weight=class_weights, min_samples_split=5,min_samples_leaf=4) model.fit(df_train.iloc[:, :-1], df_train['LuxL']) self.save(model) dt.preview(TaskTrain()) dt.run(TaskTrain()) model = TaskTrain().output().load() # run model on test data def load_test_data(file_name): test_df = pd.read_csv(test_file_name) return(test_df) def get_test_preds(test_data): test_df = test test_df = test_df.fillna(test_df.mean()) test_df.iloc[:,:-1] = sk.preprocessing.scale(test_df.iloc[:,:-1]) preds = model.predict(test_df.iloc[:, :-1]) return(preds)
# NOTE(review): fragment starts mid-method — these asserts are the tail of a
# ModelEval-style task's run() defined outside this view.
    assert df_train.equals(self.input()['data'].load())
    assert df_train.equals(self.inputLoad(task='data'))
    # NOTE(review): 'data-drain' looks like a typo for 'data-train' (the key
    # used on the next lines) — confirm against the requires() mapping before
    # changing, since it is a runtime dict key.
    assert df_trainX.equals(self.input()['data-drain']['x'].load())
    assert df_trainX.equals(self.inputLoad(task='data-train')[0])
    assert df_trainX.equals(
        self.inputLoad(task='data-train', as_dict=True)['x'])

    # append baseline and per-model predictions as columns
    df_train['target_naive1'] = df_train['target'].mean()
    df_train['target_ols'] = data['ols'].predict(df_trainX)
    df_train['target_lgbm'] = data['lgbm'].predict(df_trainX)
    self.save(df_train)

params = dict()
d6tflow.preview(ModelEval(**params))
d6tflow.run(ModelEval(
    **params))  #,forced_all=True,confirm=False, forced_all_upstream=True)

# multi model comparison
df_train = ModelEval(**params).outputLoad()
print('insample errors')
print('naive mean',
      mean_squared_error(df_train[cfg_col_Y], df_train['target_naive1']))
print('ols', mean_squared_error(df_train[cfg_col_Y], df_train['target_ols']))
print('gbm', mean_squared_error(df_train[cfg_col_Y], df_train['target_lgbm']))
print('cv errors')
model_ols = ModelTrainOLS(**params)
mod_lgbm = ModelTrainLGBM(**params)
df_trainX, df_trainY = DataTrain(**params).outputLoad()
import d6tflow
from d6tflow.tasks import TaskCSVPandas, TaskJson
import pandas as pd
from pandas.io.json import json_normalize


class Task_Dual(TaskJson):
    '''
    illustration of adding status information
    '''

    def run(self):
        # bundle the frame (serialised to JSON text) with a status flag
        frame = pd.DataFrame({'Test': [1, 2, 3]})
        payload = {'dataframe': frame.to_json(), 'status': True}
        self.save(payload)


if __name__ == "__main__":
    print(d6tflow.preview(Task_Dual()))
    d6tflow.run(Task_Dual())
    # round-trip the saved frame back out of the JSON payload
    serialized = Task_Dual().output().load()['dataframe']
    restored = pd.read_json(serialized)
    print(restored)