def test_coerce_validate_unused_categories(self):
    """A categorical column carrying an unused category must be rejected."""
    table = pd.DataFrame({'foo': ['a', 'a']},
                         dtype=pd.CategoricalDtype(['a', 'b']))
    with self.assertRaisesRegex(ValueError, "unused category 'b'"):
        ProcessResult.coerce(table)
def test_coerce_tuple_none_str_none(self):
    """(None, 'hi', None) coerces to an error-only result."""
    actual = ProcessResult.coerce((None, 'hi', None))
    self.assertEqual(actual, ProcessResult(error='hi'))
def test_coerce_tuple_none_none_none(self):
    """(None, None, None) coerces to an empty result."""
    actual = ProcessResult.coerce((None, None, None))
    self.assertEqual(actual, ProcessResult())
def test_coerce_tuple_dataframe_str(self):
    """(df, 'hi') coerces to a result carrying both table and error."""
    table = DataFrame({'foo': ['bar']})
    actual = ProcessResult.coerce((table, 'hi'))
    self.assertEqual(actual, ProcessResult(dataframe=table, error='hi'))
def test_coerce_tuple_dataframe_none_none(self):
    """(df, None, None) coerces to a table-only result."""
    table = DataFrame({'foo': ['bar']})
    actual = ProcessResult.coerce((table, None, None))
    self.assertEqual(actual, ProcessResult(table))
def test_coerce_empty_dict(self):
    """An empty dict coerces to an empty result."""
    self.assertEqual(ProcessResult.coerce({}), ProcessResult())
def test_coerce_dataframe(self):
    """A bare DataFrame coerces to a table-only result."""
    table = DataFrame({'foo': ['bar']})
    self.assertEqual(ProcessResult.coerce(table),
                     ProcessResult(dataframe=table))
def test_coerce_validate_colnames_all_str(self):
    """Column names must all be str, even when columns dtype is object."""
    # df.columns is object, but not all are str
    table = pd.DataFrame({'A': [1], 2: [2]})
    with self.assertRaisesRegex(ValueError, 'column names'):
        ProcessResult.coerce(table)
def render(has_header, fetch_result):
    """Run uploadfile.render() and return its output, coerced and sanitized."""
    raw = uploadfile.render(pd.DataFrame(), {'has_header': has_header},
                            fetch_result=fetch_result)
    coerced = ProcessResult.coerce(raw)
    coerced.sanitize_in_place()
    return coerced
def test_coerce_validate_numpy_dtype(self):
    """Numpy dtypes should be treated just like pandas dtypes."""
    table = pd.DataFrame({'A': np.array([1, 2, 3])})
    assert_frame_equal(ProcessResult.coerce(table).dataframe, table)
def test_coerce_validate_colnames_dtype_object(self):
    """Non-str (numeric) column names must be rejected."""
    # df.columns is numeric
    table = pd.DataFrame({1: [1]})
    with self.assertRaisesRegex(ValueError, 'column names'):
        ProcessResult.coerce(table)
def test_coerce_validate_empty_colname(self):
    """An empty-string column name must be rejected."""
    table = pd.DataFrame({'': [1], 'B': [2]})
    with self.assertRaisesRegex(ValueError, 'empty column name'):
        ProcessResult.coerce(table)
def test_coerce_validate_unique_colnames(self):
    """Duplicate column names must be rejected."""
    table = pd.DataFrame({'A': [1], 'B': [2]})
    table.columns = ['A', 'A']
    with self.assertRaisesRegex(ValueError, 'duplicate column name'):
        ProcessResult.coerce(table)
def test_coerce_validate_empty_categories(self):
    """A categorical column with no values is valid and passes through."""
    table = pd.DataFrame({'A': []}, dtype='category')
    assert_frame_equal(ProcessResult.coerce(table).dataframe, table)
def test_coerce_processresult(self):
    """Coercing a ProcessResult must return the very same object."""
    value = ProcessResult()
    self.assertIs(ProcessResult.coerce(value), value)
def render(table, reorder_history):
    """Run reordercolumns.render() on a copy of `table` and coerce its output."""
    raw = reordercolumns.render(table.copy(),
                                {'reorder-history': reorder_history})
    return ProcessResult.coerce(raw)
def test_coerce_dict_wrong_key(self):
    """A dict with an unrecognized key must raise ValueError."""
    table = DataFrame({'A': [1]})
    with self.assertRaises(ValueError):
        ProcessResult.coerce({'table': table})
def test_parse_empty_csv(self):
    """Parsing zero bytes of CSV yields an empty result."""
    actual = parse_bytesio(io.BytesIO(b""), "text/csv", "utf-8")
    empty = pd.DataFrame().reset_index(drop=True)
    self.assertEqual(actual, ProcessResult.coerce(empty))
def test_coerce_invalid_value(self):
    """An unrecognized value (a list) coerces to a result with an error."""
    coerced = ProcessResult.coerce([None, 'foo'])
    self.assertIsNotNone(coerced.error)
def render(table, params):
    """Run countbydate.render() and coerce its output to a ProcessResult."""
    raw = countbydate.render(table, params)
    return ProcessResult.coerce(raw)
def test_coerce_str(self):
    """A bare string coerces to an error-only result."""
    self.assertEqual(ProcessResult.coerce('yay'),
                     ProcessResult(error='yay'))
async def fetch(
    self,
    params: Params,
    *,
    workflow_id: int,
    get_input_dataframe: Callable[[], Awaitable[pd.DataFrame]],
    get_stored_dataframe: Callable[[], Awaitable[pd.DataFrame]],
    get_workflow_owner: Callable[[], Awaitable[User]]
) -> ProcessResult:
    """
    Process `params` with module `fetch` method, to build a ProcessResult.

    If the `fetch` method raises an exception, this method will return an
    error string. It is always an error for a module to raise an exception.

    Returns None when the module's fetch() returned None; otherwise a
    coerced, truncated, sanitized ProcessResult.
    """
    # Introspect fetch_impl and pass only the keyword arguments it declares
    # (or all of them, if it accepts **kwargs).
    kwargs = {}
    spec = inspect.getfullargspec(self.fetch_impl)
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs
    if varkw or 'workflow_id' in kwonlyargs:
        kwargs['workflow_id'] = workflow_id
    if varkw or 'get_input_dataframe' in kwonlyargs:
        kwargs['get_input_dataframe'] = get_input_dataframe
    if varkw or 'get_stored_dataframe' in kwonlyargs:
        kwargs['get_stored_dataframe'] = get_stored_dataframe
    if varkw or 'get_workflow_owner' in kwonlyargs:
        kwargs['get_workflow_owner'] = get_workflow_owner

    # Pass input to params.to_painful_dict().
    #
    # TODO consider ... _not_ doing this. It's only needed if the module
    # has 'column' params ... which [2019-01-31, adamhooper] is unwise. We
    # use it in old-style 'join' and 'concat' (which require fetch of
    # another workflow) and in 'urlscraper' (which seems like a unique
    # case).
    input_dataframe_future = get_input_dataframe()
    input_dataframe = await input_dataframe_future
    if input_dataframe is None:
        input_dataframe = pd.DataFrame()
    params = params.to_painful_dict(input_dataframe)

    # If we're passing get_input_dataframe via kwargs, short-circuit it
    # because we already know the result.
    async def get_input_dataframe_again():
        # Closure over the dataframe fetched above: avoids a second fetch.
        return input_dataframe
    if 'get_input_dataframe' in kwargs:
        kwargs['get_input_dataframe'] = get_input_dataframe_again

    time1 = time.time()

    # Coroutine fetch() runs on the event loop; a plain function runs in an
    # executor thread so it can't block the loop.
    if inspect.iscoroutinefunction(self.fetch_impl):
        future_result = self.fetch_impl(params, **kwargs)
    else:
        loop = asyncio.get_event_loop()
        func = partial(self.fetch_impl, params, **kwargs)
        future_result = loop.run_in_executor(None, func)

    try:
        out = await future_result
    except Exception as err:
        # A module raising is always a module bug: log it and convert it to
        # an error result instead of propagating.
        logger.exception('Exception in %s.fetch', self.module_id_name)
        out = self._wrap_exception(err)

    time2 = time.time()

    if out is None:
        # Sentinel shape for the log line below: no table was produced.
        shape = (-1, -1)
    else:
        out = ProcessResult.coerce(out)
        out.truncate_in_place_if_too_big()
        out.sanitize_in_place()
        shape = out.dataframe.shape
    logger.info('%s fetched =>(%drows,%dcols) in %dms',
                self.name, shape[0], shape[1],
                int((time2 - time1) * 1000))

    return out
def test_coerce_tuple_dataframe_none_dict(self):
    """(df, None, dict) coerces to a result with table, empty error, json."""
    table = DataFrame({'foo': ['bar']})
    actual = ProcessResult.coerce((table, None, {'a': 'b'}))
    self.assertEqual(actual, ProcessResult(table, '', json={'a': 'b'}))
async def fetch(
        self,
        *,
        params: Dict[str, Any],
        secrets: Dict[str, Any],
        workflow_id: int,
        get_input_dataframe: Callable[[], Awaitable[pd.DataFrame]],
        get_stored_dataframe: Callable[[], Awaitable[pd.DataFrame]],
        get_workflow_owner: Callable[[], Awaitable[User]]) -> ProcessResult:
    """
    Call module `fetch(...)` method to build a `ProcessResult`.

    If the `fetch` method raises an exception, this method will return an
    error string. It is always an error for a module to raise an exception.

    Returns None when the module's fetch() returned None; otherwise a
    coerced, truncated ProcessResult.
    """
    # Introspect fetch_impl and pass only the keyword arguments it declares
    # (or all of them, if it accepts **kwargs).
    kwargs = {}
    spec = inspect.getfullargspec(self.fetch_impl)
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs

    # Memoize, so repeated calls by the module do not re-fetch the input.
    get_input_dataframe = _memoize_async_func(get_input_dataframe)

    if varkw or 'secrets' in kwonlyargs:
        kwargs['secrets'] = secrets
    if varkw or 'workflow_id' in kwonlyargs:
        kwargs['workflow_id'] = workflow_id
    if varkw or 'get_input_dataframe' in kwonlyargs:
        kwargs['get_input_dataframe'] = get_input_dataframe
    if varkw or 'get_stored_dataframe' in kwonlyargs:
        kwargs['get_stored_dataframe'] = get_stored_dataframe
    if varkw or 'get_workflow_owner' in kwonlyargs:
        kwargs['get_workflow_owner'] = get_workflow_owner

    time1 = time.time()

    # Coroutine fetch() runs on the event loop; a plain function runs in an
    # executor thread so it can't block the loop.
    if inspect.iscoroutinefunction(self.fetch_impl):
        future_result = self.fetch_impl(params, **kwargs)
    else:
        loop = asyncio.get_event_loop()
        func = partial(self.fetch_impl, params, **kwargs)
        future_result = loop.run_in_executor(None, func)

    try:
        out = await future_result
    except asyncio.CancelledError:
        # Cancellation is control flow, not a module error: propagate it.
        raise
    except Exception as err:
        # A module raising is always a module bug: log it and convert it to
        # an error result instead of propagating.
        logger.exception('Exception in %s.fetch', self.module_id_name)
        out = self._wrap_exception(err)

    time2 = time.time()

    if out is None:
        # Sentinel shape for the log line below: no table was produced.
        shape = (-1, -1)
    else:
        try:
            out = ProcessResult.coerce(out)
        except ValueError as err:
            # Invalid output is also a module bug; report it to the user as
            # an error result.
            logger.exception(
                '%s.fetch gave invalid output. '
                'workflow=%d, params=%s'
                % (self.module_id_name, workflow_id, json.dumps(params))
            )
            out = ProcessResult(error=(
                'Fetch produced invalid data: %s' % (str(err),)
            ))
        out.truncate_in_place_if_too_big()
        shape = out.dataframe.shape
    logger.info('%s fetched =>(%drows,%dcols) in %dms',
                self.name, shape[0], shape[1],
                int((time2 - time1) * 1000))

    return out
def test_coerce_tuple_none_str_dict(self):
    """(None, 'hi', dict) coerces to a result with error and json."""
    actual = ProcessResult.coerce((None, 'hi', {'a': 'b'}))
    self.assertEqual(actual, ProcessResult(error='hi', json={'a': 'b'}))
def test_coerce_3tuple_no_dataframe(self):
    """A 3-tuple whose first item is not a DataFrame yields an error."""
    coerced = ProcessResult.coerce(('foo', 'bar', {'a': 'b'}))
    self.assertIsNotNone(coerced.error)
def test_coerce_tuple_none_none_dict(self):
    """(None, None, dict) coerces to a json-only result."""
    actual = ProcessResult.coerce((None, None, {'a': 'b'}))
    self.assertEqual(actual, ProcessResult(json={'a': 'b'}))
def test_coerce_none(self):
    """None coerces to a result holding an empty DataFrame."""
    actual = ProcessResult.coerce(None)
    self.assertEqual(actual, ProcessResult(dataframe=DataFrame()))
def test_coerce_bad_tuple(self):
    """A tuple of the wrong arity coerces to a result with an error."""
    coerced = ProcessResult.coerce(('foo', 'bar', 'baz', 'moo'))
    self.assertIsNotNone(coerced.error)
def test_coerce_validate_non_str_categories(self):
    """Categorical values that are not all str must be rejected."""
    table = pd.DataFrame({'foo': ['a', 1]}, dtype='category')
    with self.assertRaisesRegex(ValueError, 'must all be str'):
        ProcessResult.coerce(table)