예제 #1
0
 def test_coerce_str(self):
     """Coercing a bare string must equal a result built with it as `error`."""
     coerced = ProcessResult.coerce("yay")
     self.assertEqual(coerced, ProcessResult(error="yay"))
예제 #2
0
 def test_csv_detect_separator_semicolon(self):
     """Semicolon-delimited 'text/csv' bytes must parse into columns A and C."""
     payload = io.BytesIO(b'A;C\nB;D')
     parsed = parse_bytesio(payload, 'text/csv', 'utf-8')
     frame = pd.DataFrame({'A': ['B'], 'C': ['D']})
     self.assertEqual(parsed, ProcessResult(frame))
예제 #3
0
 def test_workflow_does_not_exist(self):
     """`_fetch` must answer with the 'Target workflow does not exist' error."""
     base_id = self.workflow.id
     outcome = self._fetch(base_id + 1, self.user, base_id + 2)
     missing = ProcessResult(error='Target workflow does not exist')
     self.assertEqual(outcome, missing)
예제 #4
0
async def fetch(params, *, get_input_dataframe):
    """Collect the URLs selected by `params`, scrape them, wrap the table.

    `params['urlsource']` picks where the URLs come from:

    * 'list'   -- one URL per line of `params['urllist']`; blank lines are
                  dropped and 'http://' is prepended when the line does
                  not start with an http(s) scheme.
    * 'column' -- the values of column `params['urlcol']` in the table
                  awaited from `get_input_dataframe()` (no URLs when the
                  column is absent).
    * 'paged'  -- `params['pagedurl']`, optionally suffixed with the page
                  numbers `startpage`..`endpage` (first 10 at most).

    Returns None when `are_params_empty` reports there is nothing to do,
    otherwise a ProcessResult wrapping the resulting table. Raises
    ValueError for any other `urlsource`.
    """
    urls = []
    source_kind = params['urlsource']

    if source_kind == 'list':
        if are_params_empty(params, None):
            return None
        raw_text: str = params['urllist']
        candidates = (line.strip() for line in raw_text.split('\n'))
        # Skip blank lines; repair entries typed without an http(s) prefix
        urls = [
            candidate
            if re.match('^https?://.*', candidate)
            else 'http://{}'.format(candidate)
            for candidate in candidates
            if len(candidate) != 0
        ]
    elif source_kind == 'column':
        # The upstream table is taken as-is rather than re-executed: the
        # user clicked a button so should be pretty clear on the input.
        input_table = await get_input_dataframe()
        if input_table is None:
            input_table = pd.DataFrame()

        if are_params_empty(params, input_table):
            return None

        # The URLs are whatever the chosen input column holds
        column_name: str = params['urlcol']
        urls = (
            input_table[column_name].tolist()
            if column_name in input_table.columns
            else []
        )
    elif source_kind == 'paged':
        # Count through a list of page numbers, appending each to the URL
        if are_params_empty(params, None):
            return None

        base_url: str = params['pagedurl']
        # Repair a URL typed without an http(s) prefix
        if re.match('^https?://.*', base_url) is None:
            base_url = 'http://' + base_url

        if params['addpagenumbers']:
            # Cap how many pages can be scraped with this method
            page_limit = 10
            first_page = params['startpage']
            last_page = params['endpage']
            page_numbers = range(first_page, last_page + 1)[:page_limit]
            urls = [base_url + str(page) for page in page_numbers]
        else:
            urls = [base_url]
    else:
        raise ValueError('Unrecognized urlsource %r' % source_kind)

    if len(urls) == 0:
        table = pd.DataFrame()
    else:
        table = pd.DataFrame(
            {'url': urls, 'status': ''},
            columns=['url', 'date', 'status', 'html']
        )
        # NOTE(review): scrape_urls presumably fills `table` in place --
        # its return value is not used here; confirm against its definition.
        await scrape_urls(urls, table)

    # TODO make `date` datetime
    stamp = timezone.now().isoformat(timespec='seconds')
    table['date'] = stamp.replace('+00:00', 'Z')

    # No need to truncate: input is already truncated
    # No need to sanitize: we only added text+date+status
    return ProcessResult(dataframe=table)
예제 #5
0
 def test_json_syntax_error(self):
     """Malformed 'application/json' bytes must give the 'Invalid JSON' error."""
     broken = io.BytesIO(b'{not JSON')
     parsed = parse_bytesio(broken, 'application/json')
     message = ('Invalid JSON (Unexpected character found when '
                "decoding 'null')")
     self.assertEqual(parsed, ProcessResult(error=message))
예제 #6
0
 def test_404(self):
     """`fetch` must report 'Error from server: 404 Not Found'."""
     # NOTE(review): the 404 response is presumably arranged by a patch
     # outside this snippet -- confirm against the enclosing test class.
     outcome = fetch(url='http://example.org')
     not_found = ProcessResult(error='Error from server: 404 Not Found')
     self.assertEqual(outcome, not_found)
예제 #7
0
 def test_table_index_over(self):
     """Asking for tablenum=2 must give the 'maximum table number' error."""
     outcome = fetch(url='http://example.org', tablenum=2)
     too_high = ProcessResult(
         error='The maximum table number on this page is 1')
     self.assertEqual(outcome, too_high)
예제 #8
0
 def test_coerce_tuple_none_none_dict(self):
     """(None, None, dict) must coerce to a result carrying only `json`."""
     coerced = ProcessResult.coerce((None, None, {"a": "b"}))
     self.assertEqual(coerced, ProcessResult(json={"a": "b"}))
예제 #9
0
 def test_coerce_tuple_none_none_none(self):
     """(None, None, None) must coerce to a default-constructed result."""
     coerced = ProcessResult.coerce((None, None, None))
     self.assertEqual(coerced, ProcessResult())
예제 #10
0
 def test_coerce_tuple_none_str_dict(self):
     """(None, str, dict) must coerce to a result with `error` and `json`."""
     coerced = ProcessResult.coerce((None, "hi", {"a": "b"}))
     wanted = ProcessResult(error="hi", json={"a": "b"})
     self.assertEqual(coerced, wanted)
예제 #11
0
 def test_coerce_tuple_none_str_none(self):
     """(None, str, None) must coerce to a result carrying only `error`."""
     coerced = ProcessResult.coerce((None, "hi", None))
     self.assertEqual(coerced, ProcessResult(error="hi"))
예제 #12
0
 def test_coerce_tuple_dataframe_none_none(self):
     """(DataFrame, None, None) must coerce to a result wrapping the frame."""
     frame = pd.DataFrame({"foo": ["bar"]})
     wanted = ProcessResult(frame)
     coerced = ProcessResult.coerce((frame, None, None))
     self.assertEqual(coerced, wanted)
예제 #13
0
 def test_coerce_tuple_dataframe_none_dict(self):
     """(DataFrame, None, dict) must equal a result with "" error plus json."""
     frame = pd.DataFrame({"foo": ["bar"]})
     wanted = ProcessResult(frame, "", json={"a": "b"})
     coerced = ProcessResult.coerce((frame, None, {"a": "b"}))
     self.assertEqual(coerced, wanted)
예제 #14
0
 def test_coerce_tuple_dataframe_str(self):
     """(DataFrame, str) must coerce to a result with that frame and error."""
     frame = pd.DataFrame({"foo": ["bar"]})
     wanted = ProcessResult(dataframe=frame, error="hi")
     coerced = ProcessResult.coerce((frame, "hi"))
     self.assertEqual(coerced, wanted)
예제 #15
0
 def test_empty_table_shape(self):
     """A default-constructed result must report TableShape(0, [])."""
     shape = ProcessResult().table_shape
     self.assertEqual(shape, TableShape(0, []))
예제 #16
0
 def test_coerce_empty_dict(self):
     """An empty dict must coerce to a default-constructed result."""
     coerced = ProcessResult.coerce({})
     self.assertEqual(coerced, ProcessResult())
예제 #17
0
    def test_bad_server(self):
        """`fetch` must report 'Error from server: 500 Server Error'."""
        # NOTE(review): the 500 response is presumably arranged by a patch
        # outside this snippet -- confirm against the enclosing test class.
        outcome = fetch(url='http://example.org')
        server_error = ProcessResult(
            error='Error from server: 500 Server Error')
        self.assertEqual(outcome, server_error)
예제 #18
0
 def test_status_ok_with_warning(self):
     """A one-row frame plus a non-empty message must still have status "ok"."""
     frame = pd.DataFrame({"A": [1]})
     status = ProcessResult(frame, "warning").status
     self.assertEqual(status, "ok")
예제 #19
0
 def test_table_index_under(self):
     """tablenum=0 must be rejected with 'Table number must be at least 1'."""
     # Deliberately malformed: we should never even validate the URL
     bogus_url = 'http:INVALID:URL'
     outcome = fetch(url=bogus_url, tablenum=0)
     too_low = ProcessResult(error='Table number must be at least 1')
     self.assertEqual(outcome, too_low)
예제 #20
0
 def test_status_ok_with_no_rows(self):
     """A frame with column A but no rows, and "" error, has status "ok"."""
     rowless = pd.DataFrame({"A": []})
     status = ProcessResult(rowless, "").status
     self.assertEqual(status, "ok")
예제 #21
0
 def test_invalid_url(self):
     """A malformed URL must make `fetch` return the 'Invalid URL' error."""
     outcome = fetch(url='http:NOT:A:URL')
     invalid = ProcessResult(error='Invalid URL')
     self.assertEqual(outcome, invalid)
예제 #22
0
 def test_status_error(self):
     """An empty frame plus a non-empty message must have status "error"."""
     status = ProcessResult(pd.DataFrame(), "error").status
     self.assertEqual(status, "error")
예제 #23
0
def Err(error):
    """Build a ProcessResult with `error` passed as its `error` keyword."""
    failure = ProcessResult(error=error)
    return failure
예제 #24
0
 def test_status_unreachable(self):
     """An empty frame with "" as error must have status "unreachable"."""
     status = ProcessResult(pd.DataFrame(), "").status
     self.assertEqual(status, "unreachable")
예제 #25
0
 def test_txt_detect_separator_comma(self):
     """Comma-delimited 'text/plain' bytes must parse into columns A and C."""
     payload = io.BytesIO(b'A,C\nB,D')
     parsed = parse_bytesio(payload, 'text/plain', 'utf-8')
     frame = pd.DataFrame({'A': ['B'], 'C': ['D']})
     self.assertEqual(parsed, ProcessResult(frame))
예제 #26
0
 def test_empty_columns(self):
     """A default result must expose empty `column_names` and `columns`."""
     blank = ProcessResult()
     no_names = blank.column_names
     self.assertEqual(no_names, [])
     no_columns = blank.columns
     self.assertEqual(no_columns, [])
예제 #27
0
 def test_deny_import_from_same_workflow(self):
     """Passing the same workflow id twice to `_fetch` must be refused."""
     own_id = self.workflow.id
     outcome = self._fetch(own_id, self.user, own_id)
     refusal = ProcessResult(error='Cannot import the current workflow')
     self.assertEqual(outcome, refusal)
예제 #28
0
 def test_table_shape(self):
     """Three integers in column A must give a 3-row, one NUMBER column shape."""
     frame = pd.DataFrame({"A": [1, 2, 3]})
     shape = ProcessResult(frame).table_shape
     wanted = TableShape(3, [Column("A", ColumnType.NUMBER())])
     self.assertEqual(shape, wanted)
예제 #29
0
 def test_workflow_has_no_modules(self):
     """After deleting `wf_module`, `_fetch` must say the target is empty."""
     self.wf_module.delete()
     own_id = self.workflow.id
     outcome = self._fetch(own_id + 1, self.user, own_id)
     empty_target = ProcessResult(error='Target workflow is empty')
     self.assertEqual(outcome, empty_target)
예제 #30
0
 def test_coerce_dataframe(self):
     """A bare DataFrame must coerce to a result wrapping that frame."""
     frame = pd.DataFrame({"foo": ["bar"]})
     wanted = ProcessResult(dataframe=frame)
     coerced = ProcessResult.coerce(frame)
     self.assertEqual(coerced, wanted)