Example #1
0
 def test_allow_different_columns(self):
     # Tabs with disjoint columns: the expected output keeps both columns
     # and holds NaN wherever a tab had no such column.
     tab1_table = pd.DataFrame({"A": [1, 2]})
     tab2 = TabOutput(
         "tab-2",
         "Tab 2",
         {"B": RenderColumn("B", "number", "{}")},
         pd.DataFrame({"B": [3, 4]}),
     )
     params = {
         "tabs": [tab2],
         "add_source_column": False,
         "source_column_name": "",
     }
     result = render(
         tab1_table,
         params=params,
         tab_name="Tab 1",
         input_columns={"A": RenderColumn("A", "number", "{}")},
     )
     # This tests the ordering of columns, too
     expected = pd.DataFrame({
         "A": [1, 2, np.nan, np.nan],
         "B": [np.nan, np.nan, 3, 4],
     })
     assert_frame_equal(result, expected)
Example #2
0
    def test_prevent_overwrite(self):
        # Adding right-hand column "B" when the left table already has a "B"
        # is expected to produce an error message rather than a table.
        left = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
        right = pd.DataFrame({'A': ['1', '2'], 'B': ['X', 'Y']})
        right_columns = {
            'A': RenderColumn('A', 'number', '{}'),
            'B': RenderColumn('B', 'text', None),
        }
        params = {
            'right_tab': TabOutput('slug', 'Tab 2', right_columns, right),
            'join_columns': {'on': ['A'], 'right': ['B']},
            'type': 'left',
        }
        left_columns = {
            'A': RenderColumn('A', 'number', '{}'),
            'B': RenderColumn('B', 'text', None),
        }
        result = render(left, params, input_columns=left_columns)

        expected_error = (
            'You tried to add "B" from Tab 2, but your table already has that '
            'column. Please rename the column in one of the tabs, or unselect '
            'the column.'
        )
        self.assertEqual(result, expected_error)
Example #3
0
 def test_add_source_column(self):
     # With add_source_column set, the expected output has an extra column
     # ("S") holding the name of the tab each row came from.
     tab2 = TabOutput(
         "tab-2",
         "Tab 2",
         {"A": RenderColumn("A", "number", "{}")},
         pd.DataFrame({"A": [3, 4]}),
     )
     params = {
         "tabs": [tab2],
         "add_source_column": True,
         "source_column_name": "S",
     }
     result = render(
         pd.DataFrame({"A": [1, 2]}),
         params=params,
         tab_name="Tab 1",
         input_columns={"A": RenderColumn("A", "number", "{}")},
     )
     # Source column comes _first_. It should be categorical: no need to
     # load it with useless copied bytes.
     expected = pd.DataFrame({
         "S": ["Tab 1", "Tab 1", "Tab 2", "Tab 2"],
         "A": [1, 2, 3, 4],
     }).astype({"S": "category"})
     assert_frame_equal(result, expected)
Example #4
0
    def test_on_types_differ(self):
        # The join column "A" is declared number on the left and text on the
        # right; the expected result is an error message naming both types.
        left = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]})
        right = pd.DataFrame({"A": ["1", "2"], "C": ["X", "Y"]})
        right_tab = TabOutput(
            "slug",
            "Tab 2",
            {
                "A": RenderColumn("A", "text", None),
                "C": RenderColumn("C", "text", None),
            },
            right,
        )
        params = {
            "right_tab": right_tab,
            "join_columns": {"on": ["A"], "right": ["C"]},
            "type": "left",
        }
        left_columns = {
            "A": RenderColumn("A", "number", "{}"),
            "B": RenderColumn("B", "text", None),
        }
        result = render(left, params, input_columns=left_columns)

        expected_error = (
            'Column "A" is *number* in this tab and *text* in Tab 2. '
            "Please convert one or the other so they are both the same type."
        )
        self.assertEqual(result, expected_error)
Example #5
0
    def test_import_columns_without_formats(self):
        # Only column "A" declares a format; "B" and "C" have None, and the
        # expected column_formats holds just the entry for "A".
        datetimes = pd.Series(
            ["2012-01-01", "2015-02-03", "2019-05-23"], dtype="datetime64[ns]"
        )
        dataframe = pd.DataFrame(
            {"A": [1, 2, 3], "B": datetimes, "C": ["a", "b", "c"]}
        )
        tab_columns = {
            "A": RenderColumn("A", "number", "{,.2f}"),
            "B": RenderColumn("B", "datetime", None),
            "C": RenderColumn("C", "text", None),
        }
        tab = TabOutput("tab-2", "Tab 2", tab_columns, dataframe)

        result = render(pd.DataFrame(), {"tab": tab})

        assert_frame_equal(result["dataframe"], dataframe)
        self.assertEqual(result["column_formats"], {"A": "{,.2f}"})
Example #6
0
 def test_add_source_column(self):
     # With add_source_column set, the expected output has an extra column
     # ('S') holding the name of the tab each row came from.
     other_tab = TabOutput(
         'tab-2',
         'Tab 2',
         {'A': RenderColumn('A', 'number')},
         pd.DataFrame({'A': [3, 4]}),
     )
     params = {
         'tabs': [other_tab],
         'add_source_column': True,
         'source_column_name': 'S',
     }
     result = render(
         pd.DataFrame({'A': [1, 2]}),
         params=params,
         tab_name='Tab 1',
         input_columns={'A': RenderColumn('A', 'number')},
     )

     # Source column comes _first_
     tab_names = ['Tab 1', 'Tab 1', 'Tab 2', 'Tab 2']
     expected = pd.DataFrame({'S': tab_names, 'A': [1, 2, 3, 4]})
     # Source column should be categorical: no need to load it with useless
     # copied bytes.
     expected['S'] = expected['S'].astype('category')
     assert_frame_equal(result, expected)
Example #7
0
 def test_left(self):
     # Left join on 'A': every left row is expected in the output, with NaN
     # in the added columns where the right table has no matching 'A'.
     left = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
     right = pd.DataFrame({'A': [1, 2], 'C': ['X', 'Y'], 'D': [0.1, 0.2]})
     right_columns = {
         'A': RenderColumn('A', 'number', '{:,.2f}'),
         'C': RenderColumn('C', 'text', None),
         'D': RenderColumn('D', 'number', '{:,}'),
     }
     params = {
         'right_tab': TabOutput('slug', 'name', right_columns, right),
         'join_columns': {'on': ['A'], 'right': ['C', 'D']},
         'type': 'left',
     }
     left_columns = {
         'A': RenderColumn('A', 'number', '{:d}'),
         'B': RenderColumn('B', 'text', None),
     }
     result = render(left, params, input_columns=left_columns)

     expected_table = pd.DataFrame({
         'A': [1, 2, 3],
         'B': ['x', 'y', 'z'],
         'C': ['X', 'Y', np.nan],
         'D': [0.1, 0.2, np.nan],
     })
     assert_frame_equal(result['dataframe'], expected_table)
     # Formats are expected only for the columns added from the right tab.
     self.assertEqual(result['column_formats'], {'C': None, 'D': '{:,}'})
Example #8
0
    def test_prevent_overwrite(self):
        # Adding right-hand column "B" when the left table already has a "B"
        # is expected to produce an error message rather than a table.
        left = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]})
        right = pd.DataFrame({"A": ["1", "2"], "B": ["X", "Y"]})
        right_tab = TabOutput(
            "slug",
            "Tab 2",
            {
                "A": RenderColumn("A", "number", "{}"),
                "B": RenderColumn("B", "text", None),
            },
            right,
        )
        params = {
            "right_tab": right_tab,
            "join_columns": {"on": ["A"], "right": ["B"]},
            "type": "left",
        }
        left_columns = {
            "A": RenderColumn("A", "number", "{}"),
            "B": RenderColumn("B", "text", None),
        }
        result = render(left, params, input_columns=left_columns)

        expected_error = (
            'You tried to add "B" from Tab 2, but your table already has that '
            "column. Please rename the column in one of the tabs, or unselect "
            "the column."
        )
        self.assertEqual(result, expected_error)
Example #9
0
 def test_inner_join_delete_unused_categories_in_all_columns(self):
     left = pd.DataFrame({
         'A': pd.Series(['a', 'b'], dtype='category'),  # join column
         'B': pd.Series(['c', 'd'], dtype='category'),  # other column
     })
     right = pd.DataFrame({
         'A': pd.Series(['a', 'x'], dtype='category'),  # join column
         'C': pd.Series(['e', 'y'], dtype='category'),  # other column
     })
     right_columns = {
         'A': RenderColumn('A', 'text', None),
         'C': RenderColumn('C', 'text', None),
     }
     params = {
         'right_tab': TabOutput('slug', 'Tab 2', right_columns, right),
         'join_columns': {'on': ['A'], 'right': ['C']},
         'type': 'inner',
     }
     left_columns = {
         'A': RenderColumn('A', 'text', None),
         'B': RenderColumn('B', 'text', None),
     }
     result = render(left, params, input_columns=left_columns)

     # 'b', 'd', 'x' and 'y' categories don't appear in the result, so the
     # dtypes should not contain them.
     expected = pd.DataFrame(
         {'A': ['a'], 'B': ['c'], 'C': ['e']},
         dtype='category',
     )
     assert_frame_equal(result['dataframe'], expected)
Example #10
0
 def test_left_join_delete_unused_categories_in_added_columns(self):
     left = pd.DataFrame({'A': ['a', 'b']}, dtype='category')
     right = pd.DataFrame({
         'A': pd.Series(['a', 'z'], dtype='category'),
         'B': pd.Series(['x', 'y'], dtype='category'),
     })
     right_columns = {
         'A': RenderColumn('A', 'text', None),
         'B': RenderColumn('B', 'text', None),
     }
     params = {
         'right_tab': TabOutput('slug', 'Tab 2', right_columns, right),
         'join_columns': {'on': ['A'], 'right': ['B']},
         'type': 'left',
     }
     result = render(
         left,
         params,
         input_columns={'A': RenderColumn('A', 'text', None)},
     )

     # 'z' category does not appear in result, so it should not be a
     # category in the 'B' column.
     expected = pd.DataFrame({
         'A': pd.Series(['a', 'b'], dtype='category'),
         'B': pd.Series(['x', np.nan], dtype='category'),
     })
     assert_frame_equal(result['dataframe'], expected)
Example #11
0
 def test_error_different_types(self):
     # Column "A" is text in "Tab 1" and number in "Tab 2"; the expected
     # result is an error message instead of a concatenated table.
     tab2 = TabOutput(
         "tab-2",
         "Tab 2",
         {"A": RenderColumn("A", "number", "{}")},
         pd.DataFrame({"A": [3, 4]}),
     )
     params = {
         "tabs": [tab2],
         "add_source_column": False,
         "source_column_name": "",
     }
     result = render(
         pd.DataFrame({"A": ["x", "y"]}),
         params=params,
         tab_name="Tab 1",
         input_columns={"A": RenderColumn("A", "text", None)},
     )
     expected_error = (
         'Cannot concatenate column "A" of type "number" in "Tab 2" to '
         'column "A" of type "text" in "Tab 1". Please convert one or the '
         "other so they are the same type."
     )
     self.assertEqual(result, expected_error)
Example #12
0
 def test_allow_different_columns(self):
     # Tabs with disjoint columns: the expected output keeps both columns
     # and holds NaN wherever a tab had no such column.
     other_tab = TabOutput(
         'tab-2',
         'Tab 2',
         {'B': RenderColumn('B', 'number', '{}')},
         pd.DataFrame({'B': [3, 4]}),
     )
     params = {
         'tabs': [other_tab],
         'add_source_column': False,
         'source_column_name': '',
     }
     result = render(
         pd.DataFrame({'A': [1, 2]}),
         params=params,
         tab_name='Tab 1',
         input_columns={'A': RenderColumn('A', 'number', '{}')},
     )
     # This tests the ordering of columns, too
     expected = pd.DataFrame({
         'A': [1, 2, np.nan, np.nan],
         'B': [np.nan, np.nan, 3, 4],
     })
     assert_frame_equal(result, expected)
 def test_convert_float(self):
     # Expected text follows each column's own format:
     # "{:.2f}" gives "1.11" and "{:d}" gives "2".
     table = pd.DataFrame({"A": [1.111], "B": [2.6]})
     input_columns = {
         "A": RenderColumn("A", "number", "{:.2f}"),
         "B": RenderColumn("B", "number", "{:d}"),
     }
     result = render(
         table, {"colnames": ["A", "B"]}, input_columns=input_columns
     )
     expected = pd.DataFrame({"A": ["1.11"], "B": ["2"]})
     assert_frame_equal(result, expected)
 def test_convert_float(self):
     # Expected text follows each column's own format:
     # '{:.2f}' gives '1.11' and '{:d}' gives '2'.
     params = {'colnames': ['A', 'B']}
     column_a = RenderColumn('A', 'number', '{:.2f}')
     column_b = RenderColumn('B', 'number', '{:d}')
     result = render(
         pd.DataFrame({'A': [1.111], 'B': [2.6]}),
         params,
         input_columns={'A': column_a, 'B': column_b},
     )
     expected = pd.DataFrame({'A': ['1.11'], 'B': ['2']})
     assert_frame_equal(result, expected)
 def test_convert_str(self):
     # A text column is expected to come back unchanged.
     table = pd.DataFrame({"A": ["a"]})
     text_column = RenderColumn("A", "text", None)
     result = render(
         table, {"colnames": ["A"]}, input_columns={"A": text_column}
     )
     expected = pd.DataFrame({"A": ["a"]})
     assert_frame_equal(result, expected)
Example #16
0
    def render(self, input_result: Optional[ProcessResult],
               params: Dict[str, Any], tab_name: str,
               fetch_result: Optional[ProcessResult]) -> ProcessResult:
        """
        Call the module's `render_impl` on `input_result` and coerce its output.

        `render_impl` always receives the input dataframe and `params`. It
        also receives `fetch_result`, `tab_name` and `input_columns` as
        keyword arguments -- each one only when `render_impl` declares it as
        a keyword-only argument or accepts `**kwargs`.

        An exception raised by `render_impl` is logged and converted with
        `_wrap_exception()`. A `ValueError` raised while coercing the output
        is logged and replaced by a generic error result. The result is then
        passed through `truncate_in_place_if_too_big()` and the timing is
        logged.

        This synchronous method can be slow for complex modules or large
        datasets. Consider calling it from an executor.
        """
        argspec = inspect.getfullargspec(self.render_impl)
        takes_any_kwarg = bool(argspec.varkw)  # render_impl has **kwargs

        def wanted(name: str) -> bool:
            # Should render_impl be passed the optional kwarg `name`?
            return takes_any_kwarg or name in argspec.kwonlyargs

        extra_kwargs = {}
        if wanted('fetch_result'):
            extra_kwargs['fetch_result'] = fetch_result
        if wanted('tab_name'):
            extra_kwargs['tab_name'] = tab_name
        if wanted('input_columns'):
            extra_kwargs['input_columns'] = {
                column.name: RenderColumn(
                    column.name,
                    column.type.name,
                    # Column types without a `format` attribute get None.
                    getattr(column.type, 'format', None),
                )
                for column in input_result.table_shape.columns
            }

        table = input_result.dataframe
        fallback_columns = input_result.columns

        started_at = time.time()

        try:
            out = self.render_impl(table, params, **extra_kwargs)
        except Exception as err:
            logger.exception('Exception in %s.render', self.module_id_name)
            out = self._wrap_exception(err)

        try:
            out = ProcessResult.coerce(
                out,
                try_fallback_columns=fallback_columns,
            )
        except ValueError as err:
            logger.exception('Exception coercing %s.render output',
                             self.module_id_name)
            message = (
                'Something unexpected happened. We have been notified and are '
                'working to fix it. If this persists, contact us. Error code: '
                + str(err)
            )
            out = ProcessResult(error=message)

        out.truncate_in_place_if_too_big()

        finished_at = time.time()
        if out is not None:
            n_rows_out, n_cols_out = out.dataframe.shape
        else:
            n_rows_out, n_cols_out = -1, -1
        elapsed_ms = int((finished_at - started_at) * 1000)
        logger.info('%s rendered (%drows,%dcols)=>(%drows,%dcols) in %dms',
                    self.name, table.shape[0], table.shape[1], n_rows_out,
                    n_cols_out, elapsed_ms)

        return out
 def test_convert_numbers_all_null(self):
     # An all-NaN number column is expected to stay all-NaN, as dtype object.
     all_null = [np.nan, np.nan]
     table = pd.DataFrame({"A": all_null}, dtype=np.float64)
     result = render(
         table,
         {"colnames": ["A"]},
         input_columns={"A": RenderColumn("A", "number", "{:d}")},
     )
     expected = pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object)
     assert_frame_equal(result, expected)
 def test_convert_null(self):
     # The number is expected as text; the null is expected to stay null.
     table = pd.DataFrame({"A": [1, np.nan]})
     number_column = RenderColumn("A", "number", "{:,d}")
     result = render(
         table, {"colnames": ["A"]}, input_columns={"A": number_column}
     )
     expected = pd.DataFrame({"A": ["1", np.nan]})
     assert_frame_equal(result, expected)
 def test_NOP(self):
     # should NOP when first applied
     table = pd.DataFrame({"A": [0.006]})
     no_columns_selected = {"colnames": []}
     result = render(
         table,
         no_columns_selected,
         input_columns={"A": RenderColumn("A", "number", "{:.2f}")},
     )
     expected = pd.DataFrame({"A": [0.006]})
     assert_frame_equal(result, expected)
Example #20
0
 def test_inner_join_delete_unused_categories_in_all_columns(self):
     left = pd.DataFrame({
         "A": pd.Series(["a", "b"], dtype="category"),  # join column
         "B": pd.Series(["c", "d"], dtype="category"),  # other column
     })
     right = pd.DataFrame({
         "A": pd.Series(["a", "x"], dtype="category"),  # join column
         "C": pd.Series(["e", "y"], dtype="category"),  # other column
     })
     right_tab = TabOutput(
         "slug",
         "Tab 2",
         {
             "A": RenderColumn("A", "text", None),
             "C": RenderColumn("C", "text", None),
         },
         right,
     )
     params = {
         "right_tab": right_tab,
         "join_columns": {"on": ["A"], "right": ["C"]},
         "type": "inner",
     }
     left_columns = {
         "A": RenderColumn("A", "text", None),
         "B": RenderColumn("B", "text", None),
     }
     result = render(left, params, input_columns=left_columns)

     # 'b', 'd', 'x' and 'y' categories don't appear in the result, so the
     # dtypes should not contain them.
     expected = pd.DataFrame(
         {"A": ["a"], "B": ["c"], "C": ["e"]},
         dtype="category",
     )
     assert_frame_equal(result["dataframe"], expected)
Example #21
0
    def test_clean_tabs_preserve_ordering(self):
        tab2_output = ProcessResult(pd.DataFrame({"A": [1, 2]}))
        tab3_output = ProcessResult(pd.DataFrame({"B": [2, 3]}))

        workflow = Workflow.create_and_init()
        tab1 = workflow.tabs.first()
        tab2 = workflow.tabs.create(position=1, slug="tab-2", name="Tab 2")
        tab3 = workflow.tabs.create(position=1, slug="tab-3", name="Tab 3")

        # Give tab 2 and tab 3 one step each, with a cached render result.
        wfm2 = tab2.wf_modules.create(
            order=0,
            slug="step-1",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        wfm2.cache_render_result(workflow.last_delta_id, tab2_output)
        wfm3 = tab3.wf_modules.create(
            order=0,
            slug="step-2",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        wfm3.cache_render_result(workflow.last_delta_id, tab3_output)

        # RenderContext's dict ordering determines desired tab order. (Python
        # 3.7 spec: dict is ordered in insertion order. CPython 3.6 and PyPy 7
        # do this, too.)
        tab_shapes = {
            tab1.slug: None,
            tab2.slug: StepResultShape("ok", tab2_output.table_shape),
            tab3.slug: StepResultShape("ok", tab3_output.table_shape),
        }
        context = RenderContext(workflow.id, None, None, tab_shapes, None)

        # Supply wrongly-ordered tabs. Cleaned, they should be in order.
        wrongly_ordered = [tab3.slug, tab2.slug]
        result = clean_value(ParamDType.Multitab(), wrongly_ordered, context)

        expectations = [(tab2, "A", [1, 2]), (tab3, "B", [2, 3])]
        for position, (tab, colname, values) in enumerate(expectations):
            cleaned = result[position]
            self.assertEqual(cleaned.slug, tab.slug)
            self.assertEqual(cleaned.name, tab.name)
            self.assertEqual(
                cleaned.columns,
                {colname: RenderColumn(colname, "number", "{:,}")},
            )
            assert_frame_equal(
                cleaned.dataframe, pd.DataFrame({colname: values})
            )
Example #22
0
 def test_right_join_delete_unused_categories_in_input_columns(self):
     left = pd.DataFrame({
         "A": pd.Series(["a", "b"], dtype="category"),  # join column
         "B": pd.Series(["c", "d"], dtype="category"),  # other column
     })
     right = pd.DataFrame({
         "A": pd.Series(["a"], dtype="category"),  # join column
         "C": ["e"],
     })
     right_tab = TabOutput(
         "slug",
         "Tab 2",
         {
             "A": RenderColumn("A", "text", None),
             "C": RenderColumn("C", "text", None),
         },
         right,
     )
     params = {
         "right_tab": right_tab,
         "join_columns": {"on": ["A"], "right": ["C"]},
         "type": "right",
     }
     left_columns = {
         "A": RenderColumn("A", "text", None),
         "B": RenderColumn("B", "text", None),
     }
     result = render(left, params, input_columns=left_columns)

     # 'b' and 'd' categories don't appear in result, so it should not be
     # categories in the result dataframe.
     expected = pd.DataFrame({
         "A": pd.Series(["a"], dtype="category"),
         "B": pd.Series(["c"], dtype="category"),
         "C": ["e"],
     })
     assert_frame_equal(result["dataframe"], expected)
 def test_convert_str(self):
     # A text column is expected to come back unchanged.
     params = {'colnames': ['A']}
     input_columns = {'A': RenderColumn('A', 'text', None)}
     result = render(
         pd.DataFrame({'A': ['a']}),
         params,
         input_columns=input_columns,
     )
     expected = pd.DataFrame({'A': ['a']})
     assert_frame_equal(result, expected)
 def test_convert_datetime(self):
     # Datetimes are expected to come back as date-only text.
     dates = [np.datetime64("2018-01-01"), np.datetime64("2019-02-13")]
     result = render(
         pd.DataFrame({"A": dates}),
         {"colnames": ["A"]},
         input_columns={"A": RenderColumn("A", "datetime", None)},
     )
     expected = pd.DataFrame({"A": ["2018-01-01", "2019-02-13"]})
     assert_frame_equal(result, expected)
Example #25
0
 def test_happy_path(self):
     # Two tabs sharing column 'A': the other tab's rows are expected
     # after this tab's rows.
     other_tab = TabOutput(
         'tab-2',
         'Tab 2',
         {'A': RenderColumn('A', 'number')},
         pd.DataFrame({'A': [3, 4]}),
     )
     params = {
         'tabs': [other_tab],
         'add_source_column': False,
         'source_column_name': '',
     }
     result = render(
         pd.DataFrame({'A': [1, 2]}),
         params=params,
         tab_name='Tab 1',
         input_columns={'A': RenderColumn('A', 'number')},
     )
     expected = pd.DataFrame({'A': [1, 2, 3, 4]})
     assert_frame_equal(result, expected)
 def test_convert_null(self):
     # The number is expected as text; the null is expected to stay null.
     params = {'colnames': ['A']}
     input_columns = {'A': RenderColumn('A', 'number', '{:,d}')}
     result = render(
         pd.DataFrame({'A': [1, np.nan]}),
         params,
         input_columns=input_columns,
     )
     expected = pd.DataFrame({'A': ['1', np.nan]})
     assert_frame_equal(result, expected)
 def test_happy_path(self):
     # The selected tab's table and its column format are both expected in
     # the result.
     tab = TabOutput(
         'tab-2',
         'Tab 2',
         {'A': RenderColumn('A', 'number', '{}')},
         pd.DataFrame({'A': [3, 4]}),
     )
     result = render(pd.DataFrame(), {'tab': tab})

     expected_table = pd.DataFrame({'A': [3, 4]})
     assert_frame_equal(result['dataframe'], expected_table)
     self.assertEqual(result['column_formats'], {'A': '{}'})
 def test_NOP(self):
     # should NOP when first applied
     params = {'colnames': []}
     input_columns = {'A': RenderColumn('A', 'number', '{:.2f}')}
     result = render(
         pd.DataFrame({'A': [0.006]}),
         params,
         input_columns=input_columns,
     )
     expected = pd.DataFrame({'A': [0.006]})
     assert_frame_equal(result, expected)
 def test_convert_numbers_all_null(self):
     # An all-NaN number column is expected to stay all-NaN, as dtype object.
     table = pd.DataFrame({'A': [np.nan, np.nan]}, dtype=np.float64)
     input_columns = {'A': RenderColumn('A', 'number', '{:d}')}
     result = render(
         table, {'colnames': ['A']}, input_columns=input_columns
     )
     expected = pd.DataFrame({'A': [np.nan, np.nan]}, dtype=object)
     assert_frame_equal(result, expected)
Example #30
0
 def test_coerce_categories_and_str(self):
     # A categorical 'A' concatenated with a plain-str 'A' is expected to
     # give a plain-str column.
     categorical_table = pd.DataFrame({'A': ['a', 'b']}, dtype='category')
     str_table = pd.DataFrame({'A': ['c', 'd']})
     other_tab = TabOutput(
         'tab-2',
         'Tab 2',
         {'A': RenderColumn('A', 'text')},
         str_table,
     )
     params = {
         'tabs': [other_tab],
         'add_source_column': False,
         'source_column_name': '',
     }
     result = render(
         categorical_table,
         params=params,
         tab_name='Tab 1',
         input_columns={'A': RenderColumn('A', 'text')},
     )
     expected = pd.DataFrame({'A': ['a', 'b', 'c', 'd']})  # str
     assert_frame_equal(result, expected)