Example #1
def test_coerce_validate_unused_categories(self):
    with self.assertRaisesRegex(ValueError, "unused category 'b'"):
        ProcessResult.coerce(
            pd.DataFrame({'foo': ['a', 'a']},
                         dtype=pd.CategoricalDtype(['a', 'b']))
        )
Example #2
def test_coerce_tuple_none_str_none(self):
    expected = ProcessResult(error='hi')
    result = ProcessResult.coerce((None, 'hi', None))
    self.assertEqual(result, expected)
Example #3
def test_coerce_tuple_none_none_none(self):
    expected = ProcessResult()
    result = ProcessResult.coerce((None, None, None))
    self.assertEqual(result, expected)
Example #4
def test_coerce_tuple_dataframe_str(self):
    df = DataFrame({'foo': ['bar']})
    expected = ProcessResult(dataframe=df, error='hi')
    result = ProcessResult.coerce((df, 'hi'))
    self.assertEqual(result, expected)
Example #5
def test_coerce_tuple_dataframe_none_none(self):
    df = DataFrame({'foo': ['bar']})
    expected = ProcessResult(df)
    result = ProcessResult.coerce((df, None, None))
    self.assertEqual(result, expected)
Example #6
def test_coerce_empty_dict(self):
    result = ProcessResult.coerce({})
    expected = ProcessResult()
    self.assertEqual(result, expected)
Example #7
def test_coerce_dataframe(self):
    df = DataFrame({'foo': ['bar']})
    expected = ProcessResult(dataframe=df)
    result = ProcessResult.coerce(df)
    self.assertEqual(result, expected)
Example #8
def test_coerce_validate_colnames_all_str(self):
    with self.assertRaisesRegex(ValueError, 'column names'):
        # df.columns is object, but not all are str
        ProcessResult.coerce(pd.DataFrame({'A': [1], 2: [2]}))
Example #9
def render(has_header, fetch_result):
    x = uploadfile.render(pd.DataFrame(), {'has_header': has_header},
                          fetch_result=fetch_result)
    result = ProcessResult.coerce(x)
    result.sanitize_in_place()
    return result
Example #10
def test_coerce_validate_numpy_dtype(self):
    # Numpy dtypes should be treated just like pandas dtypes.
    dataframe = pd.DataFrame({'A': np.array([1, 2, 3])})
    result = ProcessResult.coerce(dataframe)
    assert_frame_equal(result.dataframe, dataframe)
Example #11
def test_coerce_validate_colnames_dtype_object(self):
    with self.assertRaisesRegex(ValueError, 'column names'):
        # df.columns is numeric
        ProcessResult.coerce(pd.DataFrame({1: [1]}))
Example #12
def test_coerce_validate_empty_colname(self):
    dataframe = pd.DataFrame({'': [1], 'B': [2]})
    with self.assertRaisesRegex(ValueError, 'empty column name'):
        ProcessResult.coerce(dataframe)
Example #13
def test_coerce_validate_unique_colnames(self):
    dataframe = pd.DataFrame({'A': [1], 'B': [2]})
    dataframe.columns = ['A', 'A']
    with self.assertRaisesRegex(ValueError, 'duplicate column name'):
        ProcessResult.coerce(dataframe)
Example #14
def test_coerce_validate_empty_categories(self):
    df = pd.DataFrame({'A': []}, dtype='category')
    result = ProcessResult.coerce(df)
    assert_frame_equal(result.dataframe, df)
Example #15
def test_coerce_processresult(self):
    expected = ProcessResult()
    result = ProcessResult.coerce(expected)
    self.assertIs(result, expected)
Example #16
def render(table, reorder_history):
    params = {'reorder-history': reorder_history}
    result = reordercolumns.render(table.copy(), params)
    return ProcessResult.coerce(result)
Example #17
def test_coerce_dict_wrong_key(self):
    with self.assertRaises(ValueError):
        ProcessResult.coerce({'table': DataFrame({'A': [1]})})
Example #18
def test_parse_empty_csv(self):
    result = parse_bytesio(io.BytesIO(b""), "text/csv", "utf-8")
    expected = ProcessResult.coerce(pd.DataFrame().reset_index(drop=True))
    self.assertEqual(result, expected)
Example #19
def test_coerce_invalid_value(self):
    result = ProcessResult.coerce([None, 'foo'])
    self.assertIsNotNone(result.error)
Example #20
def render(table, params):
    return ProcessResult.coerce(countbydate.render(table, params))
Example #21
def test_coerce_str(self):
    expected = ProcessResult(error='yay')
    result = ProcessResult.coerce('yay')
    self.assertEqual(result, expected)
Example #22
    async def fetch(
        self,
        params: Params,
        *,
        workflow_id: int,
        get_input_dataframe: Callable[[], Awaitable[pd.DataFrame]],
        get_stored_dataframe: Callable[[], Awaitable[pd.DataFrame]],
        get_workflow_owner: Callable[[], Awaitable[User]]
    ) -> ProcessResult:
        """
        Process `params` with the module's `fetch` method to build a
        ProcessResult.

        If the `fetch` method raises an exception, this method returns a
        ProcessResult with an error message. It is always an error for a
        module to raise an exception.
        """
        kwargs = {}
        spec = inspect.getfullargspec(self.fetch_impl)
        varkw = bool(spec.varkw)  # if True, function accepts **kwargs
        kwonlyargs = spec.kwonlyargs
        if varkw or 'workflow_id' in kwonlyargs:
            kwargs['workflow_id'] = workflow_id
        if varkw or 'get_input_dataframe' in kwonlyargs:
            kwargs['get_input_dataframe'] = get_input_dataframe
        if varkw or 'get_stored_dataframe' in kwonlyargs:
            kwargs['get_stored_dataframe'] = get_stored_dataframe
        if varkw or 'get_workflow_owner' in kwonlyargs:
            kwargs['get_workflow_owner'] = get_workflow_owner

        # Pass input to params.to_painful_dict().
        #
        # TODO consider ... _not_ doing this. It's only needed if the module
        # has 'column' params ... which [2019-01-31, adamhooper] is unwise. We
        # use it in old-style 'join' and 'concat' (which require fetch of
        # another workflow) and in 'urlscraper' (which seems like a unique
        # case).

        input_dataframe_future = get_input_dataframe()

        input_dataframe = await input_dataframe_future
        if input_dataframe is None:
            input_dataframe = pd.DataFrame()
        params = params.to_painful_dict(input_dataframe)

        # If we're passing get_input_dataframe via kwargs, short-circuit it
        # because we already know the result.
        async def get_input_dataframe_again():
            return input_dataframe
        if 'get_input_dataframe' in kwargs:
            kwargs['get_input_dataframe'] = get_input_dataframe_again

        time1 = time.time()

        if inspect.iscoroutinefunction(self.fetch_impl):
            future_result = self.fetch_impl(params, **kwargs)
        else:
            loop = asyncio.get_event_loop()
            func = partial(self.fetch_impl, params, **kwargs)
            future_result = loop.run_in_executor(None, func)

        try:
            out = await future_result
        except Exception as err:
            logger.exception('Exception in %s.fetch', self.module_id_name)
            out = self._wrap_exception(err)

        time2 = time.time()

        if out is None:
            shape = (-1, -1)
        else:
            out = ProcessResult.coerce(out)
            out.truncate_in_place_if_too_big()
            out.sanitize_in_place()
            shape = out.dataframe.shape

        logger.info('%s fetched =>(%drows,%dcols) in %dms',
                    self.name, shape[0], shape[1],
                    int((time2 - time1) * 1000))

        return out
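
The dispatcher above only forwards `workflow_id`, `get_input_dataframe`, `get_stored_dataframe`, and `get_workflow_owner` when the module's `fetch` implementation declares them as keyword-only arguments (or accepts `**kwargs`). A minimal module-side sketch that opts in to a single hook and returns a value `ProcessResult.coerce` accepts, here a `(dataframe, error)` tuple as exercised by the tests above, could look like the following; the module itself is hypothetical.

async def fetch(params, *, get_input_dataframe):
    # Hypothetical module code, for illustration only. Because only
    # `get_input_dataframe` is declared keyword-only, the dispatcher's
    # getfullargspec() check forwards just that one callable.
    input_df = await get_input_dataframe()
    if input_df is None or input_df.empty:
        return 'No input table'  # a bare str coerces to an error result
    return input_df, ''  # a (DataFrame, error) tuple also coerces cleanly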
Example #23
def test_coerce_tuple_dataframe_none_dict(self):
    df = DataFrame({'foo': ['bar']})
    expected = ProcessResult(df, '', json={'a': 'b'})
    result = ProcessResult.coerce((df, None, {'a': 'b'}))
    self.assertEqual(result, expected)
Example #24
    async def fetch(
        self,
        *,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        workflow_id: int,
        get_input_dataframe: Callable[[], Awaitable[pd.DataFrame]],
        get_stored_dataframe: Callable[[], Awaitable[pd.DataFrame]],
        get_workflow_owner: Callable[[], Awaitable[User]]
    ) -> ProcessResult:
        """
        Call the module's `fetch(...)` method to build a `ProcessResult`.

        If the `fetch` method raises an exception, this method returns a
        ProcessResult with an error message. It is always an error for a
        module to raise an exception.
        """
        kwargs = {}
        spec = inspect.getfullargspec(self.fetch_impl)
        varkw = bool(spec.varkw)  # if True, function accepts **kwargs
        kwonlyargs = spec.kwonlyargs
        get_input_dataframe = _memoize_async_func(get_input_dataframe)
        if varkw or 'secrets' in kwonlyargs:
            kwargs['secrets'] = secrets
        if varkw or 'workflow_id' in kwonlyargs:
            kwargs['workflow_id'] = workflow_id
        if varkw or 'get_input_dataframe' in kwonlyargs:
            kwargs['get_input_dataframe'] = get_input_dataframe
        if varkw or 'get_stored_dataframe' in kwonlyargs:
            kwargs['get_stored_dataframe'] = get_stored_dataframe
        if varkw or 'get_workflow_owner' in kwonlyargs:
            kwargs['get_workflow_owner'] = get_workflow_owner

        time1 = time.time()

        if inspect.iscoroutinefunction(self.fetch_impl):
            future_result = self.fetch_impl(params, **kwargs)
        else:
            loop = asyncio.get_event_loop()
            func = partial(self.fetch_impl, params, **kwargs)
            future_result = loop.run_in_executor(None, func)

        try:
            out = await future_result
        except asyncio.CancelledError:
            raise
        except Exception as err:
            logger.exception('Exception in %s.fetch', self.module_id_name)
            out = self._wrap_exception(err)

        time2 = time.time()

        if out is None:
            shape = (-1, -1)
        else:
            try:
                out = ProcessResult.coerce(out)
            except ValueError as err:
                logger.exception(
                    '%s.fetch gave invalid output. workflow=%d, params=%s' %
                    (self.module_id_name, workflow_id, json.dumps(params)))
                out = ProcessResult(error=('Fetch produced invalid data: %s' %
                                           (str(err), )))
            out.truncate_in_place_if_too_big()
            shape = out.dataframe.shape

        logger.info('%s fetched =>(%drows,%dcols) in %dms', self.name,
                    shape[0], shape[1], int((time2 - time1) * 1000))

        return out
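
Example #24 wraps `get_input_dataframe` in `_memoize_async_func` instead of short-circuiting it by hand the way Example #22 does with `get_input_dataframe_again`. The helper itself is not shown here; a minimal sketch, assuming it merely caches the first awaited result of a zero-argument coroutine function, might be:

def _memoize_async_func(func):
    # Sketch only: the real helper may handle concurrent callers or
    # exceptions differently.
    sentinel = object()
    cached = sentinel

    async def wrapper():
        nonlocal cached
        if cached is sentinel:
            cached = await func()
        return cached

    return wrapper

With a wrapper like this, repeated awaits inside the module's `fetch` run the underlying loader at most once.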
Example #25
def test_coerce_tuple_none_str_dict(self):
    expected = ProcessResult(error='hi', json={'a': 'b'})
    result = ProcessResult.coerce((None, 'hi', {'a': 'b'}))
    self.assertEqual(result, expected)
Example #26
def test_coerce_3tuple_no_dataframe(self):
    result = ProcessResult.coerce(('foo', 'bar', {'a': 'b'}))
    self.assertIsNotNone(result.error)
Example #27
def test_coerce_tuple_none_none_dict(self):
    expected = ProcessResult(json={'a': 'b'})
    result = ProcessResult.coerce((None, None, {'a': 'b'}))
    self.assertEqual(result, expected)
Example #28
def test_coerce_none(self):
    result = ProcessResult.coerce(None)
    expected = ProcessResult(dataframe=DataFrame())
    self.assertEqual(result, expected)
Example #29
def test_coerce_bad_tuple(self):
    result = ProcessResult.coerce(('foo', 'bar', 'baz', 'moo'))
    self.assertIsNotNone(result.error)
Example #30
def test_coerce_validate_non_str_categories(self):
    with self.assertRaisesRegex(ValueError, 'must all be str'):
        ProcessResult.coerce(
            pd.DataFrame({'foo': ['a', 1]}, dtype='category')
        )