def test_coerce_str(self):
    """A bare str coerces to an error-only ProcessResult."""
    self.assertEqual(ProcessResult.coerce("yay"), ProcessResult(error="yay"))
def test_csv_detect_separator_semicolon(self):
    """Semicolon-delimited CSV bytes parse into the expected two-column table."""
    parsed = parse_bytesio(io.BytesIO(b'A;C\nB;D'), 'text/csv', 'utf-8')
    self.assertEqual(
        parsed,
        ProcessResult(pd.DataFrame({'A': ['B'], 'C': ['D']})),
    )
def test_workflow_does_not_exist(self):
    """Fetching a nonexistent target workflow id reports an error."""
    missing_target = self.workflow.id + 2
    result = self._fetch(self.workflow.id + 1, self.user, missing_target)
    self.assertEqual(
        result, ProcessResult(error='Target workflow does not exist')
    )
async def fetch(params, *, get_input_dataframe):
    """Scrape the URLs selected by *params* and return a ProcessResult table.

    ``params['urlsource']`` chooses where URLs come from:

    * ``'list'`` -- one URL per line of ``params['urllist']``
    * ``'column'`` -- the ``params['urlcol']`` column of the input table
    * ``'paged'`` -- ``params['pagedurl']``, optionally suffixed with page
      numbers ``startpage``..``endpage`` (capped at 10 pages)

    Returns ``None`` when the relevant params are empty; raises ValueError
    on an unknown urlsource.
    """
    urlsource = params['urlsource']

    if urlsource == 'list':
        if are_params_empty(params, None):
            return None
        urllist_text: str = params['urllist']
        urls = []
        for line in urllist_text.split('\n'):
            stripped = line.strip()
            if not stripped:
                continue
            # Fix in case user adds an URL without http(s) prefix
            if re.match('^https?://.*', stripped):
                urls.append(stripped)
            else:
                urls.append('http://{}'.format(stripped))
    elif urlsource == 'column':
        # We won't execute here -- there's no need: the user clicked a
        # button so should be pretty clear on what the input is.
        prev_table = await get_input_dataframe()
        if prev_table is None:
            prev_table = pd.DataFrame()
        if are_params_empty(params, prev_table):
            return None
        # get our list of URLs from a column in the input table
        urlcol: str = params['urlcol']
        if urlcol in prev_table.columns:
            urls = prev_table[urlcol].tolist()
        else:
            urls = []
    elif urlsource == 'paged':
        # Count through a list of page numbers, appending each to the URL
        if are_params_empty(params, None):
            return None
        pagedurl: str = params['pagedurl']
        # Fix in case user adds an URL without http(s) prefix
        if not re.match('^https?://.*', pagedurl):
            pagedurl = 'http://' + pagedurl
        # Generate multiple urls by adding page numbers, if user says so
        if params['addpagenumbers']:
            # limit the number of pages we can scrape with this method
            maxpages = 10
            pagenums = range(params['startpage'], params['endpage'] + 1)[:maxpages]
            urls = [pagedurl + str(num) for num in pagenums]
        else:
            urls = [pagedurl]
    else:
        raise ValueError('Unrecognized urlsource %r' % urlsource)

    if urls:
        table = pd.DataFrame(
            {'url': urls, 'status': ''},
            columns=['url', 'date', 'status', 'html']
        )
        await scrape_urls(urls, table)
    else:
        table = pd.DataFrame()

    # TODO make `date` datetime
    table['date'] = (
        timezone.now().isoformat(timespec='seconds').replace('+00:00', 'Z')
    )

    # No need to truncate: input is already truncated
    # No need to sanitize: we only added text+date+status
    return ProcessResult(dataframe=table)
def test_json_syntax_error(self):
    """Malformed JSON produces a ProcessResult carrying a parse error."""
    parsed = parse_bytesio(io.BytesIO(b'{not JSON'), 'application/json')
    self.assertEqual(
        parsed,
        ProcessResult(
            error=('Invalid JSON (Unexpected character found when '
                   "decoding 'null')")),
    )
def test_404(self):
    """An HTTP 404 response becomes a server-error ProcessResult."""
    self.assertEqual(
        fetch(url='http://example.org'),
        ProcessResult(error='Error from server: 404 Not Found'),
    )
def test_table_index_over(self):
    """Requesting a table number beyond the page's tables is an error."""
    self.assertEqual(
        fetch(url='http://example.org', tablenum=2),
        ProcessResult(error='The maximum table number on this page is 1'),
    )
def test_coerce_tuple_none_none_dict(self):
    """(None, None, dict) coerces to a json-only ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, None, {"a": "b"})),
        ProcessResult(json={"a": "b"}),
    )
def test_coerce_tuple_none_none_none(self):
    """(None, None, None) coerces to an empty ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, None, None)), ProcessResult()
    )
def test_coerce_tuple_none_str_dict(self):
    """(None, str, dict) coerces to an error-plus-json ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, "hi", {"a": "b"})),
        ProcessResult(error="hi", json={"a": "b"}),
    )
def test_coerce_tuple_none_str_none(self):
    """(None, str, None) coerces to an error-only ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, "hi", None)), ProcessResult(error="hi")
    )
def test_coerce_tuple_dataframe_none_none(self):
    """(df, None, None) coerces to a dataframe-only ProcessResult."""
    df = pd.DataFrame({"foo": ["bar"]})
    self.assertEqual(
        ProcessResult.coerce((df, None, None)), ProcessResult(df)
    )
def test_coerce_tuple_dataframe_none_dict(self):
    """(df, None, dict) coerces to dataframe + empty error + json."""
    df = pd.DataFrame({"foo": ["bar"]})
    self.assertEqual(
        ProcessResult.coerce((df, None, {"a": "b"})),
        ProcessResult(df, "", json={"a": "b"}),
    )
def test_coerce_tuple_dataframe_str(self):
    """A two-element (df, str) tuple coerces to dataframe + error."""
    df = pd.DataFrame({"foo": ["bar"]})
    self.assertEqual(
        ProcessResult.coerce((df, "hi")),
        ProcessResult(dataframe=df, error="hi"),
    )
def test_empty_table_shape(self):
    """An empty ProcessResult has a zero-row, zero-column shape."""
    self.assertEqual(ProcessResult().table_shape, TableShape(0, []))
def test_coerce_empty_dict(self):
    """An empty dict coerces to an empty ProcessResult."""
    self.assertEqual(ProcessResult.coerce({}), ProcessResult())
def test_bad_server(self):
    """An HTTP 500 response becomes a server-error ProcessResult."""
    self.assertEqual(
        fetch(url='http://example.org'),
        ProcessResult(error='Error from server: 500 Server Error'),
    )
def test_status_ok_with_warning(self):
    """A result with data is 'ok' even when a warning message is set."""
    warned = ProcessResult(pd.DataFrame({"A": [1]}), "warning")
    self.assertEqual(warned.status, "ok")
def test_table_index_under(self):
    """tablenum below 1 is rejected before the URL is even looked at."""
    url = 'http:INVALID:URL'  # we should never even validate the URL
    self.assertEqual(
        fetch(url=url, tablenum=0),
        ProcessResult(error='Table number must be at least 1'),
    )
def test_status_ok_with_no_rows(self):
    """A zero-row table with columns still counts as status 'ok'."""
    rowless = ProcessResult(pd.DataFrame({"A": []}), "")
    self.assertEqual(rowless.status, "ok")
def test_invalid_url(self):
    """A syntactically invalid URL yields an 'Invalid URL' error."""
    self.assertEqual(
        fetch(url='http:NOT:A:URL'), ProcessResult(error='Invalid URL')
    )
def test_status_error(self):
    """An empty table plus an error message gives status 'error'."""
    errored = ProcessResult(pd.DataFrame(), "error")
    self.assertEqual(errored.status, "error")
def Err(error):
    """Shorthand: wrap *error* in an error-only ProcessResult."""
    return ProcessResult(error=error)
def test_status_unreachable(self):
    """An empty table with no error message gives status 'unreachable'."""
    bare = ProcessResult(pd.DataFrame(), "")
    self.assertEqual(bare.status, "unreachable")
def test_txt_detect_separator_comma(self):
    """text/plain input with commas is parsed as comma-separated."""
    parsed = parse_bytesio(io.BytesIO(b'A,C\nB,D'), 'text/plain', 'utf-8')
    self.assertEqual(
        parsed,
        ProcessResult(pd.DataFrame({'A': ['B'], 'C': ['D']})),
    )
def test_empty_columns(self):
    """An empty ProcessResult exposes no columns and no column names."""
    empty = ProcessResult()
    self.assertEqual(empty.column_names, [])
    self.assertEqual(empty.columns, [])
def test_deny_import_from_same_workflow(self):
    """Importing a workflow into itself is rejected with an error."""
    same_id = self.workflow.id
    result = self._fetch(same_id, self.user, same_id)
    self.assertEqual(
        result, ProcessResult(error='Cannot import the current workflow')
    )
def test_table_shape(self):
    """A 3-row numeric table reports a matching TableShape."""
    result = ProcessResult(pd.DataFrame({"A": [1, 2, 3]}))
    self.assertEqual(
        result.table_shape,
        TableShape(3, [Column("A", ColumnType.NUMBER())]),
    )
def test_workflow_has_no_modules(self):
    """A target workflow whose modules were deleted yields an 'empty' error."""
    self.wf_module.delete()
    # NOTE(review): passing workflow.id + 1 here looks inconsistent with the
    # 'Target workflow is empty' expectation (compare
    # test_workflow_does_not_exist) -- confirm the intended target id.
    result = self._fetch(self.workflow.id + 1, self.user, self.workflow.id)
    self.assertEqual(
        result, ProcessResult(error='Target workflow is empty')
    )
def test_coerce_dataframe(self):
    """A bare DataFrame coerces to a dataframe-only ProcessResult."""
    df = pd.DataFrame({"foo": ["bar"]})
    self.assertEqual(ProcessResult.coerce(df), ProcessResult(dataframe=df))