def test_run(self):
    """Run gokart with the test config registered and expect exit code 0."""
    # Build the path to the test ini in two steps: directory first, then file.
    config_dir = os.path.join(os.path.dirname(__name__), 'config')
    ini_path = os.path.join(config_dir, 'test_config.ini')
    luigi.configuration.LuigiConfigParser.add_config_path(ini_path)

    # setdefault leaves an already-exported value untouched.
    os.environ.setdefault('test_param', 'test')

    with self.assertRaises(SystemExit) as raised:
        gokart.run()
    self.assertEqual(raised.exception.code, 0)
def _test_run_with_empty_data_frame(cmdline_args: List[str], test_run_params: test_run):
    """Run a workflow normally, then re-run each of its tasks on empty data frames.

    Steps performed:

    1. ``gokart.run`` is invoked with ``cmdline_args``; a ``SystemExit`` whose
       code is not ``0`` is reported as an ``AssertionError``.
    2. The task object is resolved through ``CmdlineParser`` and expanded with
       ``_get_all_tasks``; when ``test_run_params.namespace`` is not ``None``
       only tasks with that ``task_namespace`` are kept.
    3. ``gokart.TaskOnKart.load_data_frame`` is patched to return an empty
       ``pd.DataFrame`` whose columns are ``required_columns``, and
       ``gokart.TaskOnKart.dump`` is patched to do nothing, while every task is
       passed to ``_run_with_test_status``.
    4. The formatted statuses are logged; the process exits with code ``1`` if
       any status reports a failure.

    Args:
        cmdline_args: Command line arguments forwarded to ``gokart.run`` and
            ``CmdlineParser``.
        test_run_params: Object whose ``namespace`` attribute optionally
            restricts which tasks are checked.

    Raises:
        AssertionError: If the original workflow exits with a non-zero code.
        SystemExit: With code ``1`` when at least one task status fails.
    """
    # Local imports keep this helper self-contained, as in the original.
    import sys
    from unittest.mock import patch

    try:
        gokart.run(cmdline_args=cmdline_args)
    except SystemExit as e:
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if e.code != 0:
            raise AssertionError(f'original workflow does not run properly. It exited with error code {e}.')

    with CmdlineParser.global_instance(cmdline_args) as cp:
        all_tasks = _get_all_tasks(cp.get_task_obj())

    if test_run_params.namespace is not None:
        all_tasks = [t for t in all_tasks if t.task_namespace == test_run_params.namespace]

    with patch('gokart.TaskOnKart.load_data_frame',
               new=lambda *args, required_columns=None, **kwargs: pd.DataFrame(columns=required_columns)):
        with patch('gokart.TaskOnKart.dump', new=lambda *args, **kwargs: None):
            test_status_list = [_run_with_test_status(t) for t in all_tasks]

    test_logger.info('gokart test results:\n' + '\n'.join(s.format() for s in test_status_list))
    if any(s.fail() for s in test_status_list):
        # sys.exit is always available; the bare `exit` builtin is injected by
        # the `site` module and is absent under `python -S` or in frozen apps.
        sys.exit(1)
def test_run_with_undefined_environ(self):
    """Expect ``gokart.run`` to raise ``MissingParameterException``.

    The test ini is registered, but unlike the sibling tests this one does not
    put ``test_param`` into ``os.environ`` before running.
    NOTE(review): presumably the ini resolves a parameter from that variable;
    confirm against ``test_config.ini``.
    """
    config_file_path = os.path.join(os.path.dirname(__name__), 'config', 'test_config.ini')
    luigi.configuration.LuigiConfigParser.add_config_path(config_file_path)

    # The raised exception is never inspected, so no `as` binding is needed.
    with self.assertRaises(luigi.parameter.MissingParameterException):
        gokart.run()
def test_run_tree_info(self):
    """Run gokart with the test config, then look at the tree-info output."""
    # Register the test ini before gokart parses its configuration.
    config_file_path = os.path.join(os.path.dirname(__name__), 'config', 'test_config.ini')
    luigi.configuration.LuigiConfigParser.add_config_path(config_file_path)
    # setdefault leaves an already-exported value untouched.
    os.environ.setdefault('test_param', 'test')
    tree_info = gokart.tree_info(mode='simple', output_path='tree.txt')
    # gokart.run() is expected to terminate via SystemExit.
    with self.assertRaises(SystemExit):
        gokart.run()
    # NOTE(review): assertTrue(expr, msg) only checks that `expr` is truthy;
    # the loaded tree info is passed as the failure *message*, so the two
    # values are never compared. If equality was intended this should be
    # assertEqual; confirm the expected output before changing it.
    self.assertTrue(gokart.make_tree_info(_DummyTask(param='test')), tree_info.output().load())
def test_run_with_error(self):
    """Run the failing dummy workflow with ``--test-run-pandas``.

    Expects exit code 1 and that the last message sent to the patched
    ``info`` method of the checker's logger mentions ``DummyModelTask``.
    """
    cmdline = [
        f'{__name__}.DummyWorkFlowWithError',
        '--local-scheduler',
        '--test-run-pandas',
        '--log-level=CRITICAL',
        '--no-lock',
    ]
    checker_logger = logging.getLogger('gokart.testing.check_if_run_with_empty_data_frame')

    # Capture what the checker reports through logger.info.
    with patch.object(checker_logger, 'info') as mock_info:
        with self.assertRaises(SystemExit) as raised:
            gokart.run(cmdline)

    # First positional argument of the most recent info(...) call.
    last_message = mock_info.call_args[0][0]
    self.assertEqual(raised.exception.code, 1)
    self.assertTrue('DummyModelTask' in last_message)
def test_run_tree_info(self):
    """Run gokart with the test config and look at the value it returns."""
    # Register the test ini before gokart parses its configuration.
    config_file_path = os.path.join(os.path.dirname(__name__), 'test_config.ini')
    luigi.configuration.LuigiConfigParser.add_config_path(config_file_path)
    # setdefault leaves an already-exported value untouched.
    os.environ.setdefault('test_param', 'test')
    tree = gokart.run()
    # NOTE(review): assertTrue(expr, msg) only checks that `expr` is truthy;
    # `tree` is passed as the failure *message* and is never compared with the
    # generated tree info. If equality was intended this should be
    # assertEqual; confirm what gokart.run() returns here before changing it.
    self.assertTrue(gokart.make_tree_info(_DummyTask(param='test')), tree)
class TaskB(gokart.TaskOnKart):
    """Example task: load the output of ``TaskA`` and dump a message built from it."""

    param = luigi.Parameter()

    def requires(self):
        """Depend on ``TaskA`` created with a fixed ``param``."""
        return TaskA(param='called by TaskB')

    def output(self):
        """Return the target this task writes to."""
        # `make_target` builds a `luigi.Target` instance; it infers the output
        # format and the destination of the output object.
        # The resulting file path is
        # '{TaskOnKart.workspace_directory}/output_of_task_b_{self.make_unique_id()}.pkl'.
        return self.make_target('output_of_task_b.pkl')

    def run(self):
        """Load the input, wrap it in a message and dump the message."""
        # `load` reads the input data, which here is the output of `TaskA`.
        loaded = self.load()
        # `dump` writes its argument to the file path of `self.output()`.
        self.dump(f'"{loaded}" is loaded in TaskB.')


if __name__ == '__main__':
    # Alternative invocations:
    # luigi.build([TaskB(param='Hello')], local_scheduler=True)
    # gokart.run(['--tree-info-mode=simple', '--tree-info-output-path=tree_simple.txt', 'TaskB', '--param=Hello', '--local-scheduler'])
    tree_info_args = ['--tree-info-mode=all', '--tree-info-output-path=tree_all.txt']
    task_args = ['TaskB', '--param=Hello', '--local-scheduler']
    gokart.run(tree_info_args + task_args)
import luigi
import numpy as np
import gokart
# NOTE(review): not referenced below; presumably imported so that its tasks
# are available to gokart.run(). Confirm before removing.
import kaggle_disaster_tweets_gokart

if __name__ == "__main__":
    # Register the parameter ini before gokart.run() is called.
    luigi.configuration.LuigiConfigParser.add_config_path("./conf/param.ini")
    # Seed NumPy's global random number generator.
    np.random.seed(57)
    # No argument list is passed explicitly to gokart.run().
    gokart.run()
def test_success(self):
    """``gokart.run`` is expected to exit with status code 0."""
    with self.assertRaises(SystemExit) as raised:
        gokart.run()
    status = raised.exception.code
    self.assertEqual(status, 0)
def test_fail_with_None(self):
    """``gokart.run`` is expected to exit with a non-zero status code."""
    with self.assertRaises(SystemExit) as raised:
        gokart.run()
    status = raised.exception.code
    # The run is expected to end in an error, i.e. any code other than 0.
    self.assertNotEqual(status, 0)
return self.make_target('netflix/example_results.txt') def run(self): tf.reset_default_graph() model = self.load('model') # type: redshells.model.MatrixFactorization test_data = self.load_data_frame('test_data') predictions = model.predict(user_ids=test_data['user_id'], item_ids=test_data['item_id'], service_ids=test_data['service_id']) valid_indices = np.where(~np.isnan(predictions))[0] error = np.sqrt( sklearn.metrics.mean_squared_error( predictions[valid_indices], test_data['rating'].values[valid_indices])) logger.info(f'error={error}') self.dump(error) if __name__ == '__main__': # Please download Netflix data from https://www.kaggle.com/netflix-inc/netflix-prize-data, and copy combined_data_*.txt to resources/netflix/. luigi.configuration.add_config_path('./config/example.ini') gokart.run([ 'examples.MatrixFactorizationExample', '--data-size-rate', '1.0', '--local-scheduler', ])
class WordItemSimilarityExample(gokart.TaskOnKart):
    """Example task that prints the result of ``BuildWordItemSimilarity`` on dummy data."""

    task_namespace = 'examples'

    def requires(self):
        """Wire the dummy data tasks into ``BuildWordItemSimilarity``."""
        # The dict preserves the original construction order of the upstream tasks.
        upstream_tasks = dict(
            word_data_task=MakeDummyWordData(),
            item_train_data_task=MakeDummyItemData(),
            click_data_task=MakeDummyClickData(),
            item_predict_data_task=MakeDummyItemData(data_size=1000),
        )
        return redshells.app.word_item_similarity.BuildWordItemSimilarity(**upstream_tasks)

    def output(self):
        """Return the target for this example."""
        return self.make_target('word_item_similarity/example.pkl')

    def run(self):
        """Load the required task's output and print it."""
        print(self.load())


if __name__ == '__main__':
    luigi.configuration.add_config_path('./config/example.ini')
    example_args = ['examples.WordItemSimilarityExample', '--local-scheduler']
    gokart.run(example_args)
import gokart
from gokart.info import tree_info


class SampleTaskLog(gokart.TaskOnKart):
    """Sample task that stores one key/value pair in ``self.task_log``."""

    def run(self):
        """Record a sample entry in the task log."""
        self.task_log['sample key'] = 'sample value'


if __name__ == '__main__':
    # Call run() directly on a fresh instance, outside of gokart.run().
    SampleTaskLog().run()
    # NOTE(review): the return value of tree_info() is discarded; confirm this
    # call is still needed.
    tree_info()
    # Run the task through gokart with tree-info output written to
    # sample_task_log.txt.
    gokart.run([
        '--tree-info-mode=all',
        '--tree-info-output-path=sample_task_log.txt',
        'SampleTaskLog',
        '--local-scheduler'
    ])
def requires(self): return self.task def run(self): params = self.load() params.update({'trained': True}) # training model self.dump(params) class TaskC(gokart.TaskOnKart): '''Output trained model file using optimized parameter from TaskA.''' def requires(self): return TaskB(task=self.clone(TaskA, sample='hoge')) def run(self): model = self.load() model.update({'task_name': 'task_c'}) self.dump(model) if __name__ == '__main__': ''' ./resource/ └─ __main__/ ├── TaskA_74416d6e12945172d2ae8a4eaa6bc9de.pkl # {'param_a': 1, 'param_b': 'hoge'} ├── TaskB_c48d5b5c44a9d87b0e3be5b7dbc2df68.pkl # {'param_a': 1, 'param_b': 'hoge', 'trained': True} └── TaskC_12de1c6b5eca6cc86d18424d5d1e16e5.pkl # {'param_a': 1, 'param_b': 'hoge', 'trained': True, 'task_name': 'task_c'} ''' gokart.run(['TaskC', '--local-scheduler'])
save_directory_path=os.path.join(self.local_temporary_directory, 'factorization_machine'), scope_name='FactorizationMachineExample'), output_file_path='criteo/validation.zip') return dict(model=validation_task, test_data=test_data_task) def output(self): return self.make_target('criteo/example_results.txt') def run(self): tf.reset_default_graph() model = self.load('model') # type: redshells.model.FactorizationMachine test_data = self.load_data_frame('test_data') y = test_data['label'].copy() x = test_data.drop('label', axis=1) predict = model.predict(x) auc = redshells.model.utils.calculate_auc(y, predict) self.dump(f'auc={auc}') if __name__ == '__main__': # Please download criteo data from https://www.kaggle.com/c/criteo-display-ad-challenge and put train.txt on ./resouces/criteo/train.txt. luigi.configuration.add_config_path('./config/example.ini') gokart.run([ 'examples.FactorizationMachineExample', '--text-data-file-path', './resources/criteo/train.txt', '--data-size-rate', '0.1', '--local-scheduler', ])
for report in reports: print('====================================================') print(report[0]) print(report[-1]) tf.reset_default_graph() model = self.load( 'model' )['model'] # type: redshells.model.GraphConvolutionalMatrixCompletion test_data = self.load_data_frame('test_data') predictions = model.predict(user_ids=test_data['user_id'], item_ids=test_data['item_id']) error = np.sqrt( sklearn.metrics.mean_squared_error(predictions, test_data['rating'].values)) logger.info(f'error={error}') # self.dump(error) if __name__ == '__main__': # Please download ml100k data from http://files.grouplens.org/datasets/movielens/ml-100k/, and copy combined_data_*.txt to resources/ml_100k/. luigi.configuration.add_config_path('./config/example.ini') gokart.run([ 'examples.GraphConvolutionalMatrixCompletionExample', '--local-scheduler', '--text-data-file-path=ml_data/100k.txt', # '--tree-info-mode=all', # '--tree-info-output-path=sample_task_log.txt', ])
x = [random.randint(0, 100) for _ in range(0, 10)] y = [np.random.randint(0, 100) for _ in range(0, 10)] try: import torch z = [torch.randn(1).tolist()[0] for _ in range(0, 5)] except ImportError: z = [] self.dump({'random': x, 'numpy': y, 'torch': z}) if __name__ == '__main__': # //--------------------------------------------------------------------- # Please set fix_random_seed_methods parameter. # Change seed if you change sample_param. # # //--- The output is as follows every time (with pytorch installed). --- # {'random': [65, 41, 61, 37, 55, 81, 48, 2, 94, 21], # 'numpy': [79, 86, 5, 22, 79, 98, 56, 40, 81, 37], 'torch': []} # 'torch': [0.14460121095180511, -0.11649507284164429, # 0.6928958296775818, -0.916053831577301, 0.7317505478858948]} # # //------------------------- without pytorch --------------------------- # {'random': [65, 41, 61, 37, 55, 81, 48, 2, 94, 21], # 'numpy': [79, 86, 5, 22, 79, 98, 56, 40, 81, 37], 'torch': []} # # //--------------------------------------------------------------------- gokart.run([ 'sample_fix_random_seed.SampleTask', '--local-scheduler', '--rerun', '--sample-param=a', '--fix-random-seed-methods=["random.seed","numpy.random.seed","torch.random.manual_seed"]', '--fix-random-seed-value=57' ])
class OptimizeModelExample(gokart.TaskOnKart):
    """Example task that logs the result of ``OptimizeBinaryClassificationModel``."""

    task_namespace = 'examples'

    def requires(self):
        """Register the XGBoost classifier and return the optimization task."""
        train_data = MakeData()
        # Make the model available to redshells under the name used below.
        redshells.factory.register_prediction_model('XGBClassifier', xgboost.XGBClassifier)
        optimizer_kwargs = {
            'rerun': True,
            'train_data_task': train_data,
            'target_column_name': 'y',
            'model_name': 'XGBClassifier',
            'model_kwargs': {'n_estimators': 50},
            'test_size': 0.2,
            'optuna_param_name': 'XGBClassifier_default',
        }
        return redshells.train.OptimizeBinaryClassificationModel(**optimizer_kwargs)

    def output(self):
        """Return the target for this example."""
        return self.make_target('binary_classification/results.pkl')

    def run(self):
        """Load the required task's output and log it."""
        logger.info(self.load())


if __name__ == '__main__':
    luigi.configuration.add_config_path('./config/example.ini')
    example_args = ['examples.OptimizeModelExample', '--local-scheduler']
    gokart.run(example_args)
# `Dict` and `Any` are used in the `type_dict` annotation, which Python
# evaluates when the method is defined, so they must be imported.
from typing import Any, Dict

import gokart
import pandas as pd


# Please define a class which inherits `gokart.PandasTypeConfig`.
# **In practice, please import `SamplePandasTypeConfig` in `__init__`.**
class SamplePandasTypeConfig(gokart.PandasTypeConfig):
    """Column type declaration for the ``sample_pandas_type_check`` namespace."""

    task_namespace = 'sample_pandas_type_check'

    @classmethod
    def type_dict(cls) -> Dict[str, Any]:
        """Return the mapping from column name to its expected type."""
        return {'int_column': int}


class SampleTask(gokart.TaskOnKart):
    """Sample task that dumps a data frame whose column type violates the config."""

    # Please set the same `task_namespace` as `SamplePandasTypeConfig`.
    task_namespace = 'sample_pandas_type_check'

    def run(self):
        """Build a one-row data frame with a string in ``int_column`` and dump it."""
        df = pd.DataFrame(dict(int_column=['a']))
        # This line causes PandasTypeError, because the expected type is `int`,
        # but `str` is passed.
        self.dump(df)


if __name__ == '__main__':
    gokart.run([
        'sample_pandas_type_check.SampleTask',
        '--local-scheduler',
        '--rerun',
    ])
task_namespace = 'examples' def requires(self): return TrainClassificationModel() def output(self): return self.make_target('output/results.txt') def run(self): score_texts = self.load()["scores"] scores = np.array( [self._extract_average(text) for text in score_texts]) averages = dict( zip(['precision', 'recall', 'f1-score', 'support'], np.average(scores, axis=0))) self.dump(averages) @staticmethod def _extract_average(score_text: str): # return 'precision', 'recall', 'f1-score', 'support' return [float(x) for x in score_text.split()[-4:]] if __name__ == '__main__': gokart.run([ 'examples.ReportClassificationResults', '--local-scheduler', ])