Example #1
async def fetch(params, **kwargs):
    url: str = params['url'].strip()

    mimetypes = ','.join(AllowedMimeTypes)
    headers = {'Accept': mimetypes}
    timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30)

    try:
        async with utils.spooled_data_from_url(url, headers,
                                               timeout) as (bytes_io, headers,
                                                            charset):
            content_type = headers.get('Content-Type', '') \
                    .split(';')[0] \
                    .strip()
            mime_type = guess_mime_type_or_none(content_type, url)

            if mime_type:
                result = parse_bytesio(bytes_io, mime_type, charset)
                result.truncate_in_place_if_too_big()
                return result
            else:
                return ProcessResult(
                    error=(f'Error fetching {url}: '
                           f'unknown content type {content_type}'))
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s' %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
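Note: every fetch() variant in these examples treats utils.spooled_data_from_url as an async context manager that yields a (file, headers, charset) triple. A minimal sketch of what such a helper could look like, assuming it spools the HTTP response body to a temporary file (the signature, buffer sizes and size threshold are assumptions, not the project's actual code):

import tempfile
from contextlib import asynccontextmanager

import aiohttp


@asynccontextmanager
async def spooled_data_from_url(url, headers=None, timeout=None):
    """Download url and yield (file, response headers, charset).

    Sketch only: buffer the body in memory, spilling to disk past a
    size threshold, so callers can re-read it like a regular file.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers or {}, timeout=timeout,
                               raise_for_status=True) as response:
            with tempfile.SpooledTemporaryFile(max_size=16 * 1024 * 1024) as spool:
                async for chunk in response.content.iter_chunked(64 * 1024):
                    spool.write(chunk)
                spool.seek(0)
                yield spool, response.headers, response.charset

With raise_for_status=True, a non-2xx response would surface as the aiohttp.ClientResponseError that each fetch() example catches.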
Example #2
    async def fetch(wfm):
        async def fail(error: str) -> None:
            result = ProcessResult(error=error)
            await ModuleImpl.commit_result(wfm, result)

        params = wfm.get_params()

        table = None
        url = params.get_param_string('url').strip()
        tablenum = params.get_param_integer('tablenum') - 1  # 1 based for user

        if tablenum < 0:
            return await fail(_('Table number must be at least 1'))

        result = None

        try:
            async with utils.spooled_data_from_url(url) as (spool, headers,
                                                            charset):
                # TODO use charset for encoding detection
                tables = pd.read_html(spool,
                                      encoding=charset,
                                      flavor='html5lib')
        except asyncio.TimeoutError:
            result = ProcessResult(error=f'Timeout fetching {url}')
        except aiohttp.InvalidURL:
            result = ProcessResult(error='Invalid URL')
        except aiohttp.ClientResponseError as err:
            result = ProcessResult(error=('Error from server: %d %s' %
                                          (err.status, err.message)))
        except aiohttp.ClientError as err:
            result = ProcessResult(error=str(err))
        except ValueError:
            result = ProcessResult(
                error=_('Did not find any <table> tags on that page'))

        if not result:
            if not tables:
                result = ProcessResult(
                    error=_('Did not find any <table> tags on that page'))
            elif tablenum >= len(tables):
                result = ProcessResult(
                    error=(_('The maximum table number on this page is %d') %
                           len(tables)))
            else:
                table = tables[tablenum]
                merge_colspan_headers_in_place(table)
                result = ProcessResult(dataframe=table)

        result.truncate_in_place_if_too_big()
        result.sanitize_in_place()

        await ModuleImpl.commit_result(wfm, result)
Example #3
    async def fetch(params, **kwargs):
        table = None
        url = params.get_param_string('url').strip()
        tablenum = params.get_param_integer('tablenum') - 1  # 1 based for user

        if tablenum < 0:
            return ProcessResult(error='Table number must be at least 1')

        result = None

        try:
            async with utils.spooled_data_from_url(url) as (spool, headers,
                                                            charset):
                # TODO use charset for encoding detection
                tables = pd.read_html(spool, encoding=charset,
                                      flavor='html5lib')
        except asyncio.TimeoutError:
            return ProcessResult(error=f'Timeout fetching {url}')
        except aiohttp.InvalidURL:
            return ProcessResult(error='Invalid URL')
        except aiohttp.ClientResponseError as err:
            return ProcessResult(error=('Error from server: %d %s' %
                                        (err.status, err.message)))
        except aiohttp.ClientError as err:
            return ProcessResult(error=str(err))
        except ValueError:
            return ProcessResult(
                error=_('Did not find any <table> tags on that page')
            )
        except IndexError:
            # pandas.read_html() gives this unhelpful error message....
            return ProcessResult(error='Table has no columns')

        if not tables:
            return ProcessResult(
                error=_('Did not find any <table> tags on that page')
            )

        if tablenum >= len(tables):
            return ProcessResult(error=(
                f'The maximum table number on this page is {len(tables)}'
            ))

        table = tables[tablenum]
        merge_colspan_headers_in_place(table)
        result = ProcessResult(dataframe=table)
        result.truncate_in_place_if_too_big()
        result.sanitize_in_place()
        return result
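Both table-scraping variants call merge_colspan_headers_in_place() right after pd.read_html(). When a <th colspan> spans several columns, pandas returns a MultiIndex of header tuples; a plausible sketch of the helper (the flattening rule is an assumption, not the project's implementation):

import pandas as pd


def merge_colspan_headers_in_place(table: pd.DataFrame) -> None:
    # pd.read_html() models colspan/rowspan headers as a MultiIndex.
    # Flatten each tuple of header levels into one string, skipping
    # blank and repeated levels, so the table has a single header row.
    if isinstance(table.columns, pd.MultiIndex):
        table.columns = [
            ' - '.join(dict.fromkeys(
                str(level) for level in levels if str(level).strip()))
            for levels in table.columns
        ]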
Example #4
async def fetch(params, **kwargs):
    url: str = params['url'].strip()

    mimetypes = ','.join(AllowedMimeTypes)
    headers = {'Accept': mimetypes}
    timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30)

    try:
        async with utils.spooled_data_from_url(url, headers,
                                               timeout) as (bytesio, headers,
                                                            charset):
            content_type = headers.get('Content-Type', '') \
                    .split(';')[0] \
                    .strip()
            mime_type = guess_mime_type_or_none(content_type, url)

            if mime_type:
                # FIXME has_header=True always, because of a stupid decision
                # ages ago that we can't fix because everything we've stored
                # was stored with has_header=True (which is lossy).
                #
                # In https://www.pivotaltracker.com/story/show/166712967 we'll
                # store the input file instead of parsed file; then we'll be
                # able to parse correctly moving forward.
                #
                # FIXME move this to render(). In the meantime, we need to
                # run_in_executor() so we continue to send AMQP heartbeats and
                # handle other HTTP connections, even when parsing a big file.
                return await asyncio.get_event_loop().run_in_executor(
                    None,
                    parse_bytesio,
                    bytesio,
                    charset,
                    mime_type,
                    True  # has_header
                )
            else:
                return ProcessResult(
                    error=(f'Error fetching {url}: '
                           f'unknown content type {content_type}'))
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s' %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
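Examples #1 and #4 route the Content-Type header through guess_mime_type_or_none() before parsing. A hypothetical sketch of that helper, assuming it checks the server's claim against AllowedMimeTypes (the collection the examples join into the Accept header) and falls back to the URL's file extension; both the fallback and the return contract are assumptions:

import mimetypes


def guess_mime_type_or_none(content_type: str, url: str):
    # Trust the server's Content-Type when it's a type we can parse.
    if content_type in AllowedMimeTypes:
        return content_type
    # Otherwise guess from the URL's file extension (e.g. '.csv').
    guessed, _encoding = mimetypes.guess_type(url)
    if guessed in AllowedMimeTypes:
        return guessed
    return None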
Example #5
async def inner():
    async with spooled_data_from_url("mailto:[email protected]"):
        pass
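Examples #5, #6 and #8 read like unit-test bodies: each builds a coroutine that opens spooled_data_from_url on a malformed URL, presumably so a surrounding assertion can verify it raises. A hedged reconstruction of such a test, assuming the expected exception is aiohttp.InvalidURL (the one the fetch() examples catch) and that spooled_data_from_url is importable from the module under test:

import asyncio

import aiohttp
import pytest


def test_rejects_mailto_url():
    async def inner():
        # spooled_data_from_url: imported from the module under test
        async with spooled_data_from_url("mailto:[email protected]"):
            pass

    # Expect the URL to be rejected before any network I/O happens.
    with pytest.raises(aiohttp.InvalidURL):
        asyncio.run(inner())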
Example #6
async def inner():
    async with spooled_data_from_url("//a/b"):
        pass
Example #7
async def fetch(params):
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    table = None
    url: str = params['url'].strip()
    tablenum: int = params['tablenum'] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error='Table number must be at least 1')

    result = None

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with utils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor='html5lib',  # force algorithm, for reproducibility
                    io=textio,
                    match='.+',
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s' %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error='Did not find any <table> tags on that page')
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error='Did not find any <table> tags on that page')

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f'The maximum table number on this page is {len(tables)}'))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    utils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
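Example #7 reaches into pd.io.html._parse() because pd.read_html()'s public signature accepts no na_filter or dtype arguments; those only exist on the underlying TextParser. For comparison, the closest public-API call (a sketch, not a drop-in replacement: keep_default_na=False keeps empty cells as '' rather than NaN, but numeric columns still get autoconverted):

import pandas as pd

# html_text: assumed to be already-decoded HTML, e.g. the textio that
# utils.wrap_text() yields in the example above.
tables = pd.read_html(
    html_text,
    match='.+',
    flavor='html5lib',
    displayed_only=False,
    keep_default_na=False,  # keep '' / 'NA' cells as strings, not NaN
)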
Example #8
async def inner():
    async with spooled_data_from_url('/foo'):
        pass