    def test_columns_type_mismatch(self):
        numgen_spec = copy.deepcopy(self.numgen_spec)
        numproc_spec = copy.deepcopy(self.numproc_spec)

        numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnotnums'}

        tspec_list = [numgen_spec, numproc_spec]
        tgraph_invalid = TaskGraph(tspec_list)

        with self.assertRaises(LookupError) as cm:
            tgraph_invalid.run(['numproc.sum'])
        outerr_msg = '{}'.format(cm.exception)

        errmsg = 'Task "numproc" column "list" expected type "numbers" got '\
            'type "notnumbers" instead.'
        self.assertIn(errmsg, outerr_msg)

    def test_columns_name_mismatch(self):
        numgen_spec = copy.deepcopy(self.numgen_spec)
        numproc_spec = copy.deepcopy(self.numproc_spec)

        numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'rangenums'}

        tspec_list = [numgen_spec, numproc_spec]
        tgraph_invalid = TaskGraph(tspec_list)

        with self.assertRaises(LookupError) as cm:
            tgraph_invalid.run(['numproc.sum'])
        outerr_msg = '{}'.format(cm.exception)

        errmsg = 'Task "numproc" missing required column "list" from '\
            '"numgen.numlist".'
        self.assertIn(errmsg, outerr_msg)


def mortgage_greenflow_run(run_params_dict):
    '''Run the tasks/workflow specified in run_params_dict using
    dataframe-flow. Example of the expected run_params_dict:
        run_params_dict = {
            'replace_spec': replace_spec,
            'task_spec_list': greenflow_task_spec_list,
            'out_list': out_list
        }

    greenflow_task_spec_list - List of task-specs for the mortgage ETL
        workflow. Refer to the mortgage_etl_workflow_def function in the
        mortgage_common module.

    out_list - Expected to specify one output, which should be the final
        dataframe produced by the mortgage ETL workflow.

    :param run_params_dict: Dictionary with parameters and the greenflow
        task list needed to run the mortgage workflow.

    '''
    from greenflow.dataframe_flow import TaskGraph

    task_spec_list = run_params_dict['task_spec_list']
    out_list = run_params_dict['out_list']

    replace_spec = run_params_dict['replace_spec']
    task_graph = TaskGraph(task_spec_list)

    (final_perf_acq_df, ) = task_graph.run(out_list, replace_spec)

    return final_perf_acq_df
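

# A minimal usage sketch of mortgage_greenflow_run -- the values here are
# hypothetical; see main() below for how the run-params dicts are actually
# generated:
#
#     task_spec_list = mortgage_etl_workflow_def()
#     run_params_dict = {
#         'replace_spec': replace_spec,      # per-task conf replacements
#         'task_spec_list': task_spec_list,  # mortgage ETL task-specs
#         'out_list': out_list,              # single final-dataframe output
#     }
#     final_df = mortgage_greenflow_run(run_params_dict)
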
    def test_ports_connection_subclass_type_mismatch(self):
        numgen_spec = copy.deepcopy(self.numgen_spec)
        numproc_spec = copy.deepcopy(self.numproc_spec)

        numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}
        numproc_spec[TaskSpecSchema.conf] = {'port_type': MyList}

        tspec_list = [numgen_spec, numproc_spec]
        tgraph_invalid = TaskGraph(tspec_list)

        with self.assertRaises(TypeError) as cm:
            tgraph_invalid.run(['numproc.sum'])
        outerr_msg = '{}'.format(cm.exception)

        errmsg = 'Connected nodes do not have matching port types. '\
            'Fix port types.'
        self.assertIn(errmsg, outerr_msg)

    def test_ports_output_type_mismatch(self):
        numgen_spec = copy.deepcopy(self.numgen_spec)
        numproc_spec = copy.deepcopy(self.numproc_spec)

        numgen_spec[TaskSpecSchema.conf] = {
            'columns_option': 'listnums',
            'out_type': 'rangenums'
        }

        tspec_list = [numgen_spec, numproc_spec]
        tgraph_invalid = TaskGraph(tspec_list)

        with self.assertRaises(TypeError) as cm:
            tgraph_invalid.run(['numproc.sum'])
        outerr_msg = '{}'.format(cm.exception)

        errmsg = 'Node "numgen" output port "numlist" produced wrong type '\
            '"<class \'range\'>". Expected type "[<class \'list\'>]"'
        self.assertEqual(errmsg, outerr_msg)

    def test_columns_and_ports_types_match(self):
        numgen_spec = copy.deepcopy(self.numgen_spec)
        numproc_spec = copy.deepcopy(self.numproc_spec)

        numgen_spec[TaskSpecSchema.conf] = {'columns_option': 'listnums'}

        tspec_list = [numgen_spec, numproc_spec]
        tgraph_valid = TaskGraph(tspec_list)

        sumout, = tgraph_valid.run(['numproc.sum'])

        self.assertEqual(sumout, 45)

    def test_ports_connection_subclass_type_match(self):
        numgen_spec = copy.deepcopy(self.numgen_spec)
        numproc_spec = copy.deepcopy(self.numproc_spec)

        numgen_spec[TaskSpecSchema.conf] = {
            'port_type': MyList,
            'columns_option': 'mylistnums'
        }
        numproc_spec[TaskSpecSchema.conf] = {'port_type': list}

        tspec_list = [numgen_spec, numproc_spec]
        tgraph_valid = TaskGraph(tspec_list)

        sumout, = tgraph_valid.run(['numproc.sum'])

        self.assertEqual(sumout, 45)
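

# For reference, a minimal sketch of the MyList port type used above --
# assuming it is simply a list subclass provided alongside the custom test
# nodes (the real definition may carry more behavior):
#
#     class MyList(list):
#         pass
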
class TestTaskGraphAPI(unittest.TestCase):
    def setUp(self):
        import gc  # python garbage collector

        # warmup
        s = pd.Series([1, 2, 3, None, 4])
        del s
        gc.collect()

        os.environ['GREENFLOW_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

        points_task_spec = {
            TaskSpecSchema.task_id: 'points_task',
            TaskSpecSchema.node_type: 'PointNode',
            TaskSpecSchema.conf: {
                'npts': 1000
            },
            TaskSpecSchema.inputs: []
        }

        distance_task_spec = {
            TaskSpecSchema.task_id: 'distance_by_df',
            TaskSpecSchema.node_type: 'DistanceNode',
            TaskSpecSchema.conf: {},
            TaskSpecSchema.inputs: {
                'points_df_in': 'points_task.points_df_out'
            }
        }

        tspec_list = [points_task_spec, distance_task_spec]

        self.tgraph = TaskGraph(tspec_list)

        # Create a temporary directory
        self._test_dir = tempfile.mkdtemp()
        os.environ['GREENFLOW_CACHE_DIR'] = os.path.join(
            self._test_dir, '.cache')

    def tearDown(self):
        global DEFAULT_MODULE
        os.environ['GREENFLOW_PLUGIN_MODULE'] = DEFAULT_MODULE
        os.environ['GREENFLOW_CACHE_DIR'] = Node.cache_dir
        shutil.rmtree(self._test_dir)

    @ordered
    def test_viz_graph(self):
        '''Test taskgraph to networkx graph conversion for graph visualization.
        '''
        nx_graph = self.tgraph.viz_graph(show_ports=True)
        nx_nodes = [
            'points_task', 'points_task.points_df_out', 'distance_by_df',
            'distance_by_df.distance_df', 'distance_by_df.distance_abs_df'
        ]
        nx_edges = [('points_task', 'points_task.points_df_out'),
                    ('points_task.points_df_out', 'distance_by_df'),
                    ('distance_by_df', 'distance_by_df.distance_df'),
                    ('distance_by_df', 'distance_by_df.distance_abs_df')]
        self.assertEqual(list(nx_graph.nodes), nx_nodes)
        self.assertEqual(list(nx_graph.edges), nx_edges)

    @ordered
    def test_build(self):
        '''Test building a taskgraph and that all inputs and outputs are set
        for the tasks within the taskgraph.
        '''
        self.tgraph.build()

        points_node = self.tgraph['points_task']
        distance_node = self.tgraph['distance_by_df']

        onode_info = {
            'to_node': distance_node,
            'to_port': 'points_df_in',
            'from_port': 'points_df_out'
        }
        self.assertIn(onode_info, points_node.outputs)

        onode_cols = {
            'points_df_out': {
                'x': 'float64',
                'y': 'float64'
            },
            'points_ddf_out': {
                'x': 'float64',
                'y': 'float64'
            }
        }
        self.assertEqual(onode_cols, points_node.meta_setup().outports)

        inode_info = {
            'from_node': points_node,
            'from_port': 'points_df_out',
            'to_port': 'points_df_in'
        }
        self.assertIn(inode_info, distance_node.inputs)

        inode_in_cols = {'points_df_in': {'x': 'float64', 'y': 'float64'}}
        self.assertEqual(inode_in_cols, distance_node.get_input_meta())

        inode_out_cols = {
            'distance_df': {
                'distance_df': 'float64',
                'x': 'float64',
                'y': 'float64'
            },
            'distance_abs_df': {
                'distance_abs_df': 'float64',
                'x': 'float64',
                'y': 'float64'
            }
        }
        self.assertEqual(inode_out_cols, distance_node.meta_setup().outports)

    @ordered
    def test_run(self):
        '''Test that a taskgraph can run successfully.
        '''
        outlist = ['distance_by_df.distance_df']
        # Use a numpy random seed for repeatable and deterministic results.
        # For seed 2335 the sum should be approximately 761.062831178.
        replace_spec = {
            'points_task': {
                TaskSpecSchema.conf: {
                    'npts': 1000,
                    'nseed': 2335
                }
            }
        }
        (dist_df_w_df, ) = self.tgraph.run(outputs=outlist,
                                           replace=replace_spec)
        dist_sum = dist_df_w_df['distance_df'].sum()
        # self.assertAlmostEqual(dist_sum, 0.0, places, msg, delta)
        self.assertAlmostEqual(dist_sum, 761.062831178)  # match to 7 places

    @ordered
    def test_save(self):
        '''Test that a taskgraph can be saved to a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_save_taskgraph.yaml')
        self.tgraph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # Verify the workflow contents match the expected yaml. The diff is
        # an empty list if they are the same.
        global TASKGRAPH_YAML
        cdiff = list(context_diff(TASKGRAPH_YAML, workflow_str))
        cdiff_empty = cdiff == []

        err_msg = 'Taskgraph yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=TASKGRAPH_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)

    @ordered
    def test_load(self):
        '''Test that a taskgraph can be loaded from a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_load_taskgraph.yaml')

        global TASKGRAPH_YAML
        with open(workflow_file, 'w') as wf:
            wf.write(TASKGRAPH_YAML)

        tspec_list = [task._task_spec for task in self.tgraph]

        tgraph = TaskGraph.load_taskgraph(workflow_file)
        all_tasks_exist = True
        for task in tgraph:
            if task._task_spec not in tspec_list:
                all_tasks_exist = False
                break

        with StringIO() as yf:
            yaml.dump(tspec_list,
                      yf,
                      default_flow_style=False,
                      sort_keys=False)
            yf.seek(0)

            err_msg = 'Load taskgraph failed. Missing expected task items.\n'\
                'EXPECTED TASKGRAPH YAML:\n\n'\
                '{wyaml}\n\n'\
                'GOT TASKS FORMATTED AS YAML:\n\n'\
                '{tlist}\n\n'.format(wyaml=TASKGRAPH_YAML, tlist=yf.read())

            self.assertTrue(all_tasks_exist, err_msg)

    @ordered
    def test_save_load_cache(self):
        '''Test caching of task outputs within a taskgraph.

            1. Save points_task output to cache when running the taskgraph.
            2. Load points_task df from cache when running the taskgraph.
        '''
        replace_spec = {'points_task': {TaskSpecSchema.save: True}}
        outlist = ['distance_by_df.distance_df']

        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to write HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to write HDF dataset',
                category=UserWarning,
            )
            # ignore RuntimeWarning: numpy.ufunc size changed
            warnings.filterwarnings('ignore',
                                    category=RuntimeWarning,
                                    message='numpy.ufunc size changed')
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)

        cache_dir = os.path.join(self._test_dir, '.cache', 'points_task.hdf5')
        self.assertTrue(os.path.exists(cache_dir))

        replace_spec = {'points_task': {TaskSpecSchema.load: True}}
        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to read HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to read HDF dataset',
                category=UserWarning)
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)
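

# For reference, a minimal sketch of the `ordered` decorator assumed by the
# tests above (the real implementation may differ): it records definition
# order so a unittest loader can run the test methods in source order.
#
#     import itertools
#
#     _order = itertools.count()
#     _registry = {}
#
#     def ordered(func):
#         _registry[func.__name__] = next(_order)  # record definition order
#         return func
#
#     unittest.TestLoader.sortTestMethodsUsing = staticmethod(
#         lambda a, b: _registry.get(a, 0) - _registry.get(b, 0))
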
def main():
    _basedir = os.path.dirname(__file__)

    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # Using some default csv files for testing.
    # csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    # acq_data_path = os.path.join(mortgage_data_path, 'acq')
    # perf_data_path = os.path.join(mortgage_data_path, 'perf')
    # csvfile_acqdata = os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    # csvfile_perfdata = \
    #     os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    # mortgage_etl_workflow_def(
    #     csvfile_names, csvfile_acqdata, csvfile_perfdata)

    greenflow_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against
    part_count = 12  # the number of data files to train against
    # part_count = 4  # the number of data files to train against

    mortgage_run_params_dict_list = generate_mortgage_greenflow_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        greenflow_task_spec_list)

    _basedir = os.path.dirname(__file__)
    mortgage_lib_module = os.path.join(_basedir, 'mortgage_greenflow_plugins.py')

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
            MortgageTaskNames.mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'MortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    # Can be multi-gpu: set ngpus > 1. This is different from dask-xgboost,
    # which is distributed multi-gpu, i.e. dask-xgboost can distribute across
    # one node or multiple nodes. In distributed mode the dmatrix is
    # distributed.
    ngpus = 1
    xgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2 ** 8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': ngpus,
        # 'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    xgb_trainer_task = {
        TaskSpecSchema.task_id: MortgageTaskNames.xgb_trainer_task_name,
        TaskSpecSchema.node_type: 'XgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'delete_dataframes': False,
            'xgb_gpu_params': xgb_gpu_params
        },
        TaskSpecSchema.inputs: [
            MortgageTaskNames.mortgage_workflow_runner_task_name
        ],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, xgb_trainer_task]
    task_graph = TaskGraph(task_spec_list)

    # out_list = [MortgageTaskNames.mortgage_workflow_runner_task_name]
    # ((mortgage_feat_df_pandas, delinq_df_pandas),) = task_graph.run(out_list)

    out_list = [MortgageTaskNames.xgb_trainer_task_name]
    (bst,) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)
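

# Hypothetical entry point for this example (not part of the original
# listing); assumes the script is run standalone:
if __name__ == '__main__':
    main()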


# Example #10: Dask-distributed variant of the mortgage workflow.


def main():

    memory_limit = 128e9
    threads_per_worker = 4
    cluster = LocalCUDACluster(memory_limit=memory_limit,
                               threads_per_worker=threads_per_worker)
    client = Client(cluster)
    sched_info = client.scheduler_info()

    print('CLIENT: {}'.format(client))
    print('SCHEDULER INFO:\n{}'.format(json.dumps(sched_info, indent=2)))

    # Import here in case RMM is used later on. The client must be started
    # prior to importing cudf-related modules when using RMM.
    from greenflow.dataframe_flow import (TaskSpecSchema, TaskGraph)

    # workers_names = \
    #     [iw['name'] for iw in client.scheduler_info()['workers'].values()]
    # nworkers = len(workers_names)

    _basedir = os.path.dirname(__file__)
    # mortgage_data_path = '/datasets/rapids_data/mortgage'
    mortgage_data_path = os.path.join(_basedir, 'mortgage_data')

    # Using some default csv files for testing.
    # csvfile_names = os.path.join(mortgage_data_path, 'names.csv')
    # acq_data_path = os.path.join(mortgage_data_path, 'acq')
    # perf_data_path = os.path.join(mortgage_data_path, 'perf')
    # csvfile_acqdata = os.path.join(acq_data_path, 'Acquisition_2000Q1.txt')
    # csvfile_perfdata = \
    #     os.path.join(perf_data_path, 'Performance_2000Q1.txt_0')
    # mortgage_etl_workflow_def(
    #     csvfile_names, csvfile_acqdata, csvfile_perfdata)

    greenflow_task_spec_list = mortgage_etl_workflow_def()

    start_year = 2000
    end_year = 2001  # end_year is inclusive
    # end_year = 2016  # end_year is inclusive
    # part_count = 16  # the number of data files to train against

    # create_dmatrix_serially - When False, creating the dmatrix on a single
    # node can hit a race condition if there is not enough host RAM. Ensure
    # sufficient host RAM, otherwise set this to True.
    # create_dmatrix_serially = False

    # Able to handle 18 parts with create_dmatrix_serially set to True.
    part_count = 18  # the number of data files to train against
    create_dmatrix_serially = True
    # part_count = 4  # the number of data files to train against

    # Use RAPIDS Memory Manager. Seems to work fine without it.
    use_rmm = False

    # Clean up intermediate dataframes in the xgboost training task.
    delete_dataframes = True

    mortgage_run_params_dict_list = generate_mortgage_greenflow_run_params_list(
        mortgage_data_path, start_year, end_year, part_count,
        greenflow_task_spec_list)

    _basedir = os.path.dirname(__file__)
    mortgage_lib_module = os.path.join(_basedir,
                                       'mortgage_greenflow_plugins.py')

    filter_dask_logger = False

    mortgage_workflow_runner_task = {
        TaskSpecSchema.task_id:
        MortgageTaskNames.dask_mortgage_workflow_runner_task_name,
        TaskSpecSchema.node_type: 'DaskMortgageWorkflowRunner',
        TaskSpecSchema.conf: {
            'mortgage_run_params_dict_list': mortgage_run_params_dict_list,
            'client': client,
            'use_rmm': use_rmm,
            'filter_dask_logger': filter_dask_logger,
        },
        TaskSpecSchema.inputs: [],
        TaskSpecSchema.filepath: mortgage_lib_module
    }

    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        'distributed_dask': True,
        'loss': 'ls',
        # 'objective': 'gpu:reg:linear',
        'objective': 'reg:squarederror',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    dxgb_trainer_task = {
        TaskSpecSchema.task_id:
        MortgageTaskNames.dask_xgb_trainer_task_name,
        TaskSpecSchema.node_type:
        'DaskXgbMortgageTrainer',
        TaskSpecSchema.conf: {
            'create_dmatrix_serially': create_dmatrix_serially,
            'delete_dataframes': delete_dataframes,
            'dxgb_gpu_params': dxgb_gpu_params,
            'client': client,
            'filter_dask_logger': filter_dask_logger
        },
        TaskSpecSchema.inputs:
        [MortgageTaskNames.dask_mortgage_workflow_runner_task_name],
        TaskSpecSchema.filepath:
        mortgage_lib_module
    }

    task_spec_list = [mortgage_workflow_runner_task, dxgb_trainer_task]

    out_list = [MortgageTaskNames.dask_xgb_trainer_task_name]
    task_graph = TaskGraph(task_spec_list)
    (bst, ) = task_graph.run(out_list)

    print('XGBOOST BOOSTER:\n', bst)
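

# Hypothetical entry point for this Dask example (not part of the original
# listing); assumes the script is run standalone:
if __name__ == '__main__':
    main()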