コード例 #1
0
    def test_save_workflow(self):
        '''Test saving a workflow to yaml.

        Builds a TaskGraph from ``self._task_list``, saves it to a yaml file
        inside ``self._test_dir`` and checks that the file contents are
        identical to ``WORKFLOW_YAML``.
        '''
        from gquant.dataframe_flow import TaskGraph
        task_graph = TaskGraph(self._task_list)
        workflow_file = os.path.join(self._test_dir, 'test_save_workflow.yaml')
        task_graph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # Verify the workflow contents are the same as expected.
        # context_diff works on sequences of lines; handing it the raw
        # strings would diff character by character and make the report
        # unreadable. The resulting list is empty when the contents match.
        cdiff = list(context_diff(
            WORKFLOW_YAML.splitlines(keepends=True),
            workflow_str.splitlines(keepends=True),
            fromfile='expected', tofile='saved'))
        cdiff_empty = not cdiff

        err_msg = 'Workflow yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=WORKFLOW_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)
コード例 #2
0
ファイル: test_taskgraph_api.py プロジェクト: idanre1/gQuant
class TestTaskGraphAPI(unittest.TestCase):
    '''Tests of the TaskGraph API: visualization, build, run, save/load to
    yaml and caching of task outputs.

    Every test works on a two-task graph built in setUp: a 'PointNode' task
    ('points_task') whose 'points_df_out' port feeds the 'points_df_in' port
    of a 'DistanceNode' task ('distance_by_cudf').
    '''

    def setUp(self):
        '''Build the taskgraph under test and a temporary working directory.

        Also points the GQUANT_PLUGIN_MODULE and GQUANT_CACHE_DIR environment
        variables at test-specific values; tearDown restores them.
        '''
        import gc  # python garbage collector
        import cudf

        # warmup: create and discard a small cudf Series before the tests
        # run (presumably to pay one-time cudf/GPU start-up costs here --
        # TODO confirm).
        s = cudf.Series([1, 2, 3, None, 4], nan_as_null=False)
        del s
        gc.collect()

        os.environ['GQUANT_PLUGIN_MODULE'] = 'tests.unit.custom_port_nodes'

        points_task_spec = {
            TaskSpecSchema.task_id: 'points_task',
            TaskSpecSchema.node_type: 'PointNode',
            TaskSpecSchema.conf: {'npts': 1000},
            TaskSpecSchema.inputs: []
        }

        distance_task_spec = {
            TaskSpecSchema.task_id: 'distance_by_cudf',
            TaskSpecSchema.node_type: 'DistanceNode',
            TaskSpecSchema.conf: {},
            TaskSpecSchema.inputs: {
                'points_df_in': 'points_task.points_df_out'
            }
        }

        tspec_list = [points_task_spec, distance_task_spec]

        self.tgraph = TaskGraph(tspec_list)

        # Create a temporary directory; the cache lives in a sub-directory
        # of it so that tearDown removes everything in one go.
        self._test_dir = tempfile.mkdtemp()
        os.environ['GQUANT_CACHE_DIR'] = os.path.join(self._test_dir, '.cache')

    def tearDown(self):
        '''Restore the environment variables changed in setUp and delete the
        temporary directory.
        '''
        os.environ['GQUANT_PLUGIN_MODULE'] = DEFAULT_MODULE
        os.environ['GQUANT_CACHE_DIR'] = Node.cache_dir
        shutil.rmtree(self._test_dir)

    @ordered
    def test_viz_graph(self):
        '''Test taskgraph to networkx graph conversion for graph visualization.
        '''
        nx_graph = self.tgraph.viz_graph(show_ports=True)
        # With show_ports=True the expected graph has one node per task and
        # one per output port, in this exact order.
        nx_nodes = ['points_task', 'points_task.points_df_out',
                    'points_task.points_ddf_out',
                    'distance_by_cudf', 'distance_by_cudf.distance_df',
                    'distance_by_cudf.distance_abs_df']
        nx_edges = [('points_task', 'points_task.points_df_out'),
                    ('points_task', 'points_task.points_ddf_out'),
                    ('points_task.points_df_out', 'distance_by_cudf'),
                    ('distance_by_cudf', 'distance_by_cudf.distance_df'),
                    ('distance_by_cudf', 'distance_by_cudf.distance_abs_df')]
        self.assertEqual(list(nx_graph.nodes), nx_nodes)
        self.assertEqual(list(nx_graph.edges), nx_edges)

    @ordered
    def test_build(self):
        '''Test build of a taskgraph and that all inputs and outputs are set
        for the tasks within a taskgraph.
        '''
        self.tgraph.build()

        points_node = self.tgraph['points_task']
        distance_node = self.tgraph['distance_by_cudf']

        # The points node must list the distance node among its outputs.
        onode_info = {
            'to_node': distance_node,
            'to_port': 'points_df_in',
            'from_port': 'points_df_out'
        }
        self.assertIn(onode_info, points_node.outputs)

        onode_cols = {'points_df_out': {'x': 'float64', 'y': 'float64'},
                      'points_ddf_out': {'x': 'float64', 'y': 'float64'}}
        self.assertEqual(onode_cols, points_node.meta_setup().outports)

        # The distance node must list the points node among its inputs.
        inode_info = {
            'from_node': points_node,
            'from_port': 'points_df_out',
            'to_port': 'points_df_in'
        }
        self.assertIn(inode_info, distance_node.inputs)

        inode_in_cols = {
            'points_df_in': {
                'x': 'float64',
                'y': 'float64'
            }
        }
        self.assertEqual(inode_in_cols, distance_node.get_input_meta())

        inode_out_cols = {'distance_df': {'distance_cudf': 'float64',
                                          'x': 'float64',
                                          'y': 'float64'},
                          'distance_abs_df': {'distance_abs_cudf': 'float64',
                                              'x': 'float64', 'y': 'float64'}}
        self.assertEqual(inode_out_cols, distance_node.meta_setup().outports)

    @ordered
    def test_run(self):
        '''Test that a taskgraph can run successfully.
        '''
        outlist = ['distance_by_cudf.distance_df']
        # Using numpy random seed to get repeatable and deterministic results.
        # For seed 2335 should get something around 761.062831178.
        replace_spec = {
            'points_task': {
                TaskSpecSchema.conf: {
                    'npts': 1000,
                    'nseed': 2335
                }
            }
        }
        (dist_df_w_cudf, ) = self.tgraph.run(
            outputs=outlist, replace=replace_spec)
        dist_sum = dist_df_w_cudf['distance_cudf'].sum()
        # assertAlmostEqual defaults to places=7, i.e. match to 7 places.
        self.assertAlmostEqual(dist_sum, 761.062831178)

    @ordered
    def test_save(self):
        '''Test that a taskgraph can be saved to a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_save_taskgraph.yaml')
        self.tgraph.save_taskgraph(workflow_file)

        with open(workflow_file) as wf:
            workflow_str = wf.read()

        # Verify the saved contents are the same as expected.
        # context_diff works on sequences of lines; handing it the raw
        # strings would diff character by character and make the report
        # unreadable. The resulting list is empty when the contents match.
        cdiff = list(context_diff(
            TASKGRAPH_YAML.splitlines(keepends=True),
            workflow_str.splitlines(keepends=True),
            fromfile='expected', tofile='saved'))
        cdiff_empty = not cdiff

        err_msg = 'Taskgraph yaml contents do not match expected results.\n'\
            'SHOULD HAVE SAVED:\n\n'\
            '{wyaml}\n\n'\
            'INSTEAD FILE CONTAINS:\n\n'\
            '{fcont}\n\n'\
            'DIFF:\n\n'\
            '{diff}'.format(wyaml=TASKGRAPH_YAML, fcont=workflow_str,
                            diff=''.join(cdiff))

        self.assertTrue(cdiff_empty, err_msg)

    @ordered
    def test_load(self):
        '''Test that a taskgraph can be loaded from a yaml file.
        '''
        workflow_file = os.path.join(self._test_dir,
                                     'test_load_taskgraph.yaml')

        with open(workflow_file, 'w') as wf:
            wf.write(TASKGRAPH_YAML)

        # Task specs of the in-memory graph built in setUp; every task
        # loaded from the yaml file must match one of them.
        tspec_list = [task._task_spec for task in self.tgraph]

        tgraph = TaskGraph.load_taskgraph(workflow_file)
        all_tasks_exist = all(
            task._task_spec in tspec_list for task in tgraph)

        # Render the in-memory task specs as yaml for the failure message.
        with StringIO() as yf:
            yaml.dump(tspec_list, yf,
                      default_flow_style=False, sort_keys=False)
            yf.seek(0)

            err_msg = 'Load taskgraph failed. Missing expected task items.\n'\
                'EXPECTED TASKGRAPH YAML:\n\n'\
                '{wyaml}\n\n'\
                'GOT TASKS FORMATTED AS YAML:\n\n'\
                '{tlist}\n\n'.format(wyaml=TASKGRAPH_YAML, tlist=yf.read())

            self.assertTrue(all_tasks_exist, err_msg)

    @ordered
    def test_save_load_cache(self):
        '''Test caching of tasks outputs within a taskgraph.

            1. Save points_task output to cache when running the taskgraph.
            2. Load points_task df from cache when running the taskgraph.
        '''
        replace_spec = {'points_task': {TaskSpecSchema.save: True}}
        outlist = ['distance_by_cudf.distance_df']

        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to write HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to write HDF dataset',
                category=UserWarning,)
            # ignore RuntimeWarning: numpy.ufunc size changed
            warnings.filterwarnings('ignore',
                                    category=RuntimeWarning,
                                    message='numpy.ufunc size changed')
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)

        # The first run must have written the cache file for points_task
        # under the cache directory configured in setUp.
        cache_file = os.path.join(self._test_dir, '.cache', 'points_task.hdf5')
        self.assertTrue(os.path.exists(cache_file))

        replace_spec = {'points_task': {TaskSpecSchema.load: True}}
        with warnings.catch_warnings():
            # ignore UserWarning: Using CPU via Pandas to read HDF dataset
            warnings.filterwarnings(
                'ignore',
                message='Using CPU via Pandas to read HDF dataset',
                category=UserWarning)
            (_, ) = self.tgraph.run(outputs=outlist, replace=replace_spec)