Beispiel #1
0
 def test_run(self):
     """Run gokart with the test config registered and expect exit code 0."""
     # NOTE(review): this takes the dirname of `__name__` (the module name),
     # not of `__file__` -- confirm that this is the intended base directory.
     config_dir = os.path.join(os.path.dirname(__name__), 'config')
     luigi.configuration.LuigiConfigParser.add_config_path(
         os.path.join(config_dir, 'test_config.ini'))
     # `setdefault` leaves an already-present `test_param` untouched.
     os.environ.setdefault('test_param', 'test')
     with self.assertRaises(SystemExit) as raised:
         gokart.run()
     self.assertEqual(raised.exception.code, 0)
def _test_run_with_empty_data_frame(cmdline_args: List[str],
                                    test_run_params: test_run):
    """Run the workflow normally, then re-run its tasks with data-frame I/O stubbed.

    Steps, in order:

    1. ``gokart.run`` is invoked with ``cmdline_args``. If it raises
       ``SystemExit`` with a code other than ``0`` the check is aborted.
    2. The task object parsed from ``cmdline_args`` is expanded with
       ``_get_all_tasks`` and, when ``test_run_params.namespace`` is not
       ``None``, filtered to tasks whose ``task_namespace`` matches it.
    3. Every remaining task is passed to ``_run_with_test_status`` while
       ``gokart.TaskOnKart.load_data_frame`` is patched to return an empty
       ``DataFrame`` whose columns are the requested ``required_columns``,
       and ``gokart.TaskOnKart.dump`` is patched to do nothing.
    4. The formatted statuses are logged through ``test_logger``.

    Args:
        cmdline_args: Command line arguments forwarded to ``gokart.run`` and
            to ``CmdlineParser.global_instance``.
        test_run_params: Object whose ``namespace`` attribute optionally
            restricts which tasks are checked.

    Raises:
        AssertionError: If the initial run exits with a non-zero code.
        SystemExit: With code ``1`` if any test status reports a failure.
    """
    import sys
    from unittest.mock import patch

    try:
        gokart.run(cmdline_args=cmdline_args)
    except SystemExit as e:
        # Raised explicitly rather than via `assert`, so the check is not
        # silently removed when Python runs with `-O`.
        if e.code != 0:
            raise AssertionError(
                f'original workflow does not run properly. It exited with error code {e}.')

    with CmdlineParser.global_instance(cmdline_args) as cp:
        all_tasks = _get_all_tasks(cp.get_task_obj())

    if test_run_params.namespace is not None:
        all_tasks = [
            t for t in all_tasks
            if t.task_namespace == test_run_params.namespace
        ]

    def _empty_data_frame(*args, required_columns=None, **kwargs):
        # Stand-in for `load_data_frame`: no rows, only the requested columns.
        return pd.DataFrame(columns=required_columns)

    def _skip_dump(*args, **kwargs):
        # Stand-in for `dump`: nothing is written during the check.
        return None

    with patch('gokart.TaskOnKart.load_data_frame', new=_empty_data_frame), \
            patch('gokart.TaskOnKart.dump', new=_skip_dump):
        test_status_list = [_run_with_test_status(t) for t in all_tasks]

    test_logger.info('gokart test results:\n' +
                     '\n'.join(s.format() for s in test_status_list))
    if any(s.fail() for s in test_status_list):
        # `sys.exit` is always available; the bare `exit` builtin is injected
        # by the `site` module and is missing under `python -S`.
        sys.exit(1)
Beispiel #3
0
 def test_run_with_undefined_environ(self):
     """Expect `MissingParameterException` from `gokart.run()`.

     This test registers the test config but does not set `test_param`
     in `os.environ` before running.
     """
     config_file_path = os.path.join(
         os.path.dirname(__name__), 'config', 'test_config.ini')
     luigi.configuration.LuigiConfigParser.add_config_path(config_file_path)
     expected_error = luigi.parameter.MissingParameterException
     with self.assertRaises(expected_error):
         gokart.run()
Beispiel #4
0
 def test_run_tree_info(self):
     """Run gokart with a `tree_info` config and inspect the tree output."""
     config_dir = os.path.join(os.path.dirname(__name__), 'config')
     luigi.configuration.LuigiConfigParser.add_config_path(
         os.path.join(config_dir, 'test_config.ini'))
     os.environ.setdefault('test_param', 'test')
     tree_info_config = gokart.tree_info(mode='simple', output_path='tree.txt')
     with self.assertRaises(SystemExit):
         gokart.run()
     rendered_tree = gokart.make_tree_info(_DummyTask(param='test'))
     written_tree = tree_info_config.output().load()
     # NOTE(review): the second positional argument of `assertTrue` is the
     # failure message, so this only checks that `rendered_tree` is truthy;
     # the two trees are never compared. If equality was intended, switch to
     # `assertEqual` after confirming both renderings are expected to match.
     self.assertTrue(rendered_tree, written_tree)
 def test_run_with_error(self):
     """Run the failing workflow in test-run mode; expect exit code 1 and a log naming the task."""
     cmdline = [
         f'{__name__}.DummyWorkFlowWithError',
         '--local-scheduler',
         '--test-run-pandas',
         '--log-level=CRITICAL',
         '--no-lock',
     ]
     target_logger = logging.getLogger(
         'gokart.testing.check_if_run_with_empty_data_frame')
     # Capture `info` calls on that logger while the run exits.
     with patch.object(target_logger, 'info') as mock_info, \
             self.assertRaises(SystemExit) as raised:
         gokart.run(cmdline)
     # First positional argument of the most recent `info` call.
     logged_message = mock_info.call_args[0][0]
     self.assertEqual(raised.exception.code, 1)
     self.assertIn('DummyModelTask', logged_message)
Beispiel #6
0
 def test_run_tree_info(self):
     """Run gokart with the test config and inspect the value it returns."""
     config_file_path = os.path.join(
         os.path.dirname(__name__), 'test_config.ini')
     luigi.configuration.LuigiConfigParser.add_config_path(config_file_path)
     os.environ.setdefault('test_param', 'test')
     returned_tree = gokart.run()
     expected_tree = gokart.make_tree_info(_DummyTask(param='test'))
     # NOTE(review): `assertTrue` treats its second positional argument as the
     # failure message, so `returned_tree` is never compared with
     # `expected_tree`. Confirm whether `assertEqual` was intended.
     self.assertTrue(expected_tree, returned_tree)
Beispiel #7
0

class TaskB(gokart.TaskOnKart):
    """Example task that loads the output of `TaskA` and dumps a message built from it."""
    param = luigi.Parameter()

    def requires(self):
        """Declare `TaskA` as the upstream dependency."""
        upstream_task = TaskA(param='called by TaskB')
        return upstream_task

    def output(self):
        """Return the target this task writes its result to."""
        # `make_target` creates a `luigi.Target` instance, inferring the output
        # format and the destination of the output object. The resulting path is
        #     '{TaskOnKart.workspace_directory}/output_of_task_b_{self.make_unique_id()}.pkl'.
        target = self.make_target('output_of_task_b.pkl')
        return target

    def run(self):
        """Load the input, wrap it in a message and write the message out."""
        # `load` reads the input data, which here is the output of `TaskA`.
        loaded_value = self.load()
        message = f'"{loaded_value}" is loaded in TaskB.'
        # `dump` writes the message to the file path of `self.output()`.
        self.dump(message)


if __name__ == '__main__':
    # Alternative invocations, kept for reference:
    #   luigi.build([TaskB(param='Hello')], local_scheduler=True)
    #   gokart.run(['--tree-info-mode=simple', '--tree-info-output-path=tree_simple.txt', 'TaskB', '--param=Hello', '--local-scheduler'])
    #
    # Run `TaskB` with the local scheduler, passing tree-info mode `all`
    # and output path `tree_all.txt`.
    gokart.run([
        '--tree-info-mode=all',
        '--tree-info-output-path=tree_all.txt',
        'TaskB',
        '--param=Hello',
        '--local-scheduler',
    ])
import luigi
import numpy as np
import gokart

import kaggle_disaster_tweets_gokart

if __name__ == "__main__":
    # Seed NumPy's global random state with a fixed value.
    np.random.seed(57)
    # Register the parameter file, then let gokart run with the process arguments.
    luigi.configuration.LuigiConfigParser.add_config_path("./conf/param.ini")
    gokart.run()
Beispiel #9
0
 def test_success(self):
     """`gokart.run()` is expected to exit with code 0."""
     with self.assertRaises(SystemExit) as raised:
         gokart.run()
     exit_status = raised.exception.code
     self.assertEqual(exit_status, 0)
Beispiel #10
0
 def test_fail_with_None(self):
     """`gokart.run()` is expected to exit with a non-zero code."""
     with self.assertRaises(SystemExit) as raised:
         gokart.run()
     exit_status = raised.exception.code
     # Any code other than 0 counts as the expected error exit.
     self.assertNotEqual(exit_status, 0)
Beispiel #11
0
        return self.make_target('netflix/example_results.txt')

    def run(self):
        """Predict ratings for the test data and dump the root mean squared error."""
        tf.reset_default_graph()
        model = self.load('model')  # type: redshells.model.MatrixFactorization
        test_data = self.load_data_frame('test_data')

        predictions = model.predict(
            user_ids=test_data['user_id'],
            item_ids=test_data['item_id'],
            service_ids=test_data['service_id'],
        )
        # Only positions whose prediction is not NaN take part in the score.
        is_valid = ~np.isnan(predictions)
        valid_indices = np.where(is_valid)[0]

        predicted = predictions[valid_indices]
        observed = test_data['rating'].values[valid_indices]
        mean_squared = sklearn.metrics.mean_squared_error(predicted, observed)
        error = np.sqrt(mean_squared)

        logger.info(f'error={error}')
        self.dump(error)


if __name__ == '__main__':
    # Prerequisite: download the Netflix data from
    # https://www.kaggle.com/netflix-inc/netflix-prize-data and copy
    # combined_data_*.txt to resources/netflix/.
    luigi.configuration.add_config_path('./config/example.ini')
    # Run the example with the local scheduler and a data size rate of 1.0.
    gokart.run(['examples.MatrixFactorizationExample', '--data-size-rate', '1.0', '--local-scheduler'])

class WordItemSimilarityExample(gokart.TaskOnKart):
    """Example task that feeds dummy data tasks into `BuildWordItemSimilarity` and prints the result."""
    task_namespace = 'examples'

    def requires(self):
        """Build the similarity task from dummy word, item and click data tasks."""
        build_task = redshells.app.word_item_similarity.BuildWordItemSimilarity(
            word_data_task=MakeDummyWordData(),
            item_train_data_task=MakeDummyItemData(),
            click_data_task=MakeDummyClickData(),
            # The prediction data uses its own item task with `data_size=1000`.
            item_predict_data_task=MakeDummyItemData(data_size=1000),
        )
        return build_task

    def output(self):
        """Return the target for this example."""
        target = self.make_target('word_item_similarity/example.pkl')
        return target

    def run(self):
        """Load the required task's output and print it."""
        loaded = self.load()
        print(loaded)


if __name__ == '__main__':
    # Register the example configuration, then run the task with the local scheduler.
    luigi.configuration.add_config_path('./config/example.ini')
    gokart.run(['examples.WordItemSimilarityExample', '--local-scheduler'])
Beispiel #13
0
import gokart
from gokart.info import tree_info


class SampleTaskLog(gokart.TaskOnKart):
    """Minimal task whose `run` records a single entry in `self.task_log`."""

    def run(self):
        # Store one sample key/value pair in the task log.
        self.task_log['sample key'] = 'sample value'


if __name__ == '__main__':
    # Call the task's `run` directly, then `tree_info()`, before the gokart run.
    SampleTaskLog().run()
    tree_info()
    # Run `SampleTaskLog` with the local scheduler, passing tree-info mode
    # `all` and output path `sample_task_log.txt`.
    gokart.run([
        '--tree-info-mode=all',
        '--tree-info-output-path=sample_task_log.txt',
        'SampleTaskLog',
        '--local-scheduler',
    ])
    def requires(self):
        """Return the task stored in `self.task` as this task's dependency."""
        return self.task

    def run(self):
        """Load the input, flag it as trained and dump it."""
        loaded_params = self.load()
        # Stands in for model training: only the 'trained' flag is set.
        trained_flag = {'trained': True}
        loaded_params.update(trained_flag)
        self.dump(loaded_params)


class TaskC(gokart.TaskOnKart):
    """Output trained model file using optimized parameter from TaskA."""

    def requires(self):
        """Require `TaskB`, handing it a `TaskA` cloned from this task with `sample='hoge'`."""
        cloned_task_a = self.clone(TaskA, sample='hoge')
        return TaskB(task=cloned_task_a)

    def run(self):
        """Load the input, tag it with this task's name and dump it."""
        loaded_model = self.load()
        name_tag = {'task_name': 'task_c'}
        loaded_model.update(name_tag)
        self.dump(loaded_model)


if __name__ == '__main__':
    # Files listed by the original example, with their contents:
    #
    # ./resource/
    # └─ __main__/
    #    ├── TaskA_74416d6e12945172d2ae8a4eaa6bc9de.pkl   # {'param_a': 1, 'param_b': 'hoge'}
    #    ├── TaskB_c48d5b5c44a9d87b0e3be5b7dbc2df68.pkl    # {'param_a': 1, 'param_b': 'hoge', 'trained': True}
    #    └── TaskC_12de1c6b5eca6cc86d18424d5d1e16e5.pkl   # {'param_a': 1, 'param_b': 'hoge', 'trained': True, 'task_name': 'task_c'}
    task_arguments = ['TaskC', '--local-scheduler']
    gokart.run(task_arguments)
                save_directory_path=os.path.join(self.local_temporary_directory, 'factorization_machine'),
                scope_name='FactorizationMachineExample'),
            output_file_path='criteo/validation.zip')
        return dict(model=validation_task, test_data=test_data_task)

    def output(self):
        """Return the target for the example results."""
        results_target = self.make_target('criteo/example_results.txt')
        return results_target

    def run(self):
        """Predict on the test data and dump the AUC as an `auc=...` string."""
        tf.reset_default_graph()
        model = self.load('model')  # type: redshells.model.FactorizationMachine
        test_data = self.load_data_frame('test_data')
        # Split the frame into the 'label' column and everything else.
        labels = test_data['label'].copy()
        features = test_data.drop('label', axis=1)
        predicted = model.predict(features)
        auc = redshells.model.utils.calculate_auc(labels, predicted)
        self.dump(f'auc={auc}')


if __name__ == '__main__':
    # Prerequisite: download the criteo data from
    # https://www.kaggle.com/c/criteo-display-ad-challenge and put train.txt
    # at ./resources/criteo/train.txt.
    luigi.configuration.add_config_path('./config/example.ini')
    # Run the example on that file with a data size rate of 0.1.
    gokart.run([
        'examples.FactorizationMachineExample',
        '--text-data-file-path', './resources/criteo/train.txt',
        '--data-size-rate', '0.1',
        '--local-scheduler',
    ])
Beispiel #16
0
        for report in reports:
            print('====================================================')
            print(report[0])
            print(report[-1])
        tf.reset_default_graph()
        model = self.load(
            'model'
        )['model']  # type: redshells.model.GraphConvolutionalMatrixCompletion
        test_data = self.load_data_frame('test_data')

        predictions = model.predict(user_ids=test_data['user_id'],
                                    item_ids=test_data['item_id'])
        error = np.sqrt(
            sklearn.metrics.mean_squared_error(predictions,
                                               test_data['rating'].values))

        logger.info(f'error={error}')
        # self.dump(error)


if __name__ == '__main__':
    # Prerequisite (from the original note): download the ml100k data from
    # http://files.grouplens.org/datasets/movielens/ml-100k/, and copy
    # combined_data_*.txt to resources/ml_100k/.
    # NOTE(review): the path passed below is 'ml_data/100k.txt', which does
    # not match the location named in that note -- confirm which is current.
    luigi.configuration.add_config_path('./config/example.ini')
    gokart.run([
        'examples.GraphConvolutionalMatrixCompletionExample',
        '--local-scheduler',
        '--text-data-file-path=ml_data/100k.txt',
        # Optional tree-info arguments, currently disabled:
        # '--tree-info-mode=all',
        # '--tree-info-output-path=sample_task_log.txt',
    ])
Beispiel #17
0
        x = [random.randint(0, 100) for _ in range(0, 10)]
        y = [np.random.randint(0, 100) for _ in range(0, 10)]
        try:
            import torch
            z = [torch.randn(1).tolist()[0] for _ in range(0, 5)]
        except ImportError:
            z = []
        self.dump({'random': x, 'numpy': y, 'torch': z})


if __name__ == '__main__':
    # //---------------------------------------------------------------------
    # Please set the fix_random_seed_methods parameter.
    # Change the seed if you change sample_param.
    #
    # //--- The output is as follows every time (with pytorch installed). ---
    # {'random': [65, 41, 61, 37, 55, 81, 48, 2, 94, 21],
    #   'numpy': [79, 86, 5, 22, 79, 98, 56, 40, 81, 37],
    #   'torch': [0.14460121095180511, -0.11649507284164429,
    #            0.6928958296775818, -0.916053831577301, 0.7317505478858948]}
    #
    # //------------------------- without pytorch ---------------------------
    # {'random': [65, 41, 61, 37, 55, 81, 48, 2, 94, 21],
    #   'numpy': [79, 86, 5, 22, 79, 98, 56, 40, 81, 37], 'torch': []}
    #
    # //---------------------------------------------------------------------
    gokart.run([
        'sample_fix_random_seed.SampleTask',
        '--local-scheduler',
        '--rerun',
        '--sample-param=a',
        '--fix-random-seed-methods=["random.seed","numpy.random.seed","torch.random.manual_seed"]',
        '--fix-random-seed-value=57',
    ])
class OptimizeModelExample(gokart.TaskOnKart):
    """Example task that requires an `OptimizeBinaryClassificationModel` run for an XGBoost classifier."""
    task_namespace = 'examples'

    def requires(self):
        """Register `XGBClassifier` with redshells and return the optimization task."""
        train_data_task = MakeData()
        # The name registered here is the one passed as `model_name` below.
        redshells.factory.register_prediction_model(
            'XGBClassifier', xgboost.XGBClassifier)
        optimize_task = redshells.train.OptimizeBinaryClassificationModel(
            rerun=True,
            train_data_task=train_data_task,
            target_column_name='y',
            model_name='XGBClassifier',
            model_kwargs={'n_estimators': 50},
            test_size=0.2,
            optuna_param_name='XGBClassifier_default',
        )
        return optimize_task

    def output(self):
        """Return the target for this example."""
        results_target = self.make_target('binary_classification/results.pkl')
        return results_target

    def run(self):
        """Load the required task's output and log it."""
        loaded_model = self.load()
        logger.info(loaded_model)


if __name__ == '__main__':
    # Register the example configuration, then run the task with the local scheduler.
    luigi.configuration.add_config_path('./config/example.ini')
    gokart.run(['examples.OptimizeModelExample', '--local-scheduler'])
import gokart
import pandas as pd


# Define a class that inherits from `gokart.PandasTypeConfig`.
# **In practice, import `SamplePandasTypeConfig` in `__init__`.**
# NOTE(review): `Dict` and `Any` come from `typing`; that import is not
# visible in this excerpt -- confirm it exists at the top of the file.
class SamplePandasTypeConfig(gokart.PandasTypeConfig):
    """Type configuration declaring that `int_column` is expected to hold `int`."""
    task_namespace = 'sample_pandas_type_check'

    @classmethod
    def type_dict(cls) -> Dict[str, Any]:
        """Return the mapping from column name to expected type."""
        expected_types = dict(int_column=int)
        return expected_types


class SampleTask(gokart.TaskOnKart):
    """Task that dumps a data frame whose `int_column` holds a string."""
    # Use the same `task_namespace` as `SamplePandasTypeConfig`.
    task_namespace = 'sample_pandas_type_check'

    def run(self):
        """Build the mismatching data frame and dump it."""
        mismatching_frame = pd.DataFrame({'int_column': ['a']})
        # This call causes PandasTypeError: the expected type is `int`,
        # but a `str` value is passed.
        self.dump(mismatching_frame)


if __name__ == '__main__':
    # Run the sample task with the local scheduler and the `--rerun` flag.
    gokart.run([
        'sample_pandas_type_check.SampleTask',
        '--local-scheduler',
        '--rerun',
    ])
    task_namespace = 'examples'

    def requires(self):
        """Declare `TrainClassificationModel` as the upstream dependency."""
        training_task = TrainClassificationModel()
        return training_task

    def output(self):
        """Return the target for the results."""
        results_target = self.make_target('output/results.txt')
        return results_target

    def run(self):
        """Average the values extracted from each score text and dump them by metric name."""
        score_texts = self.load()["scores"]
        # One row of four floats per score text.
        rows = [self._extract_average(text) for text in score_texts]
        scores = np.array(rows)
        metric_names = ['precision', 'recall', 'f1-score', 'support']
        # Averaging along axis 0 yields one mean per metric column.
        column_means = np.average(scores, axis=0)
        averages = dict(zip(metric_names, column_means))

        self.dump(averages)

    @staticmethod
    def _extract_average(score_text: str):
        # return 'precision', 'recall', 'f1-score', 'support'
        return [float(x) for x in score_text.split()[-4:]]


if __name__ == '__main__':
    # Run the reporting task with the local scheduler.
    gokart.run(['examples.ReportClassificationResults', '--local-scheduler'])