Beispiel #1
0
def unparse_response(prefix, params, data):
    ''' Assemble a raw http response from its parts.

        Joins 'prefix', the encoded 'params', and 'data' using the
        protocol's end-of-line and header/body separators.

        This is essentially the reverse of parse_response().
    '''
    head = prefix + HTTP_EOL + params_to_bytes(params) + HTTP_SEPARATOR
    return head + to_bytes(data)
Beispiel #2
0
def parse_request(request):
    '''
        Parse raw http request data into prefix, params, and data.

        'request' must be a string or bytes. A string is converted with
        to_bytes() before parsing, so the results are always based on bytes.

        Returns a (prefix, params, content) tuple:
        'prefix' is everything before the first HTTP_EOL, as bytes.
        'params' is the result of parse_params() on the remaining header
        lines (per the original documentation, a dict with names and
        values as strings).
        'content' is the bytes after the first HTTP_SEPARATOR, or None if
        the request contains no HTTP_SEPARATOR.

        Raises TypeError if 'request' is neither a string nor bytes.
    '''

    # isinstance() also accepts subclasses of str and bytes
    if isinstance(request, str):
        request = to_bytes(request)
    if not isinstance(request, bytes):
        raise TypeError("'request' must be bytes")

    prefix, _, remainder = request.partition(HTTP_EOL)

    # partition() returns an empty separator when HTTP_SEPARATOR is absent;
    # that distinguishes "no body at all" (None) from an empty body (b'')
    raw_params, separator, content = remainder.partition(HTTP_SEPARATOR)
    if not separator:
        content = None
    params = parse_params(raw_params.split(HTTP_EOL))

    return prefix, params, content
Beispiel #3
0
def create_response(status, params=None, data=None):
    ''' Return raw http response.

        'status' is an integer. http lib defines status constants.
        It must be a key of client_responses, which supplies the reason
        phrase; an unknown status raises KeyError.

        'params' is an optional dict. It is copied, so the caller's dict
        is not modified. A 'Content-Length' entry is always set from the
        encoded size of 'data', replacing any value passed in.

        'data' is an optional string or bytes. It is converted with
        to_bytes() before its length is measured. '''

    # copy so adding Content-Length does not leak into the caller's dict
    params = {} if params is None else dict(params)

    # encode first: Content-Length must count bytes, not characters
    body = to_bytes(b'' if data is None else data)

    # bytes objects have no format() method in python 3, so the status
    # line is built as text and then encoded
    status_line = 'HTTP/1.1 {} {}'.format(status, client_responses[status])
    prefix = status_line.encode('ascii')

    params['Content-Length'] = len(body)
    return prefix + HTTP_EOL + params_to_bytes(params) + HTTP_SEPARATOR + body
Beispiel #4
0
def get_title_using_re(html):
    ''' DEPRECATED. Use get_title()

        Extract the text of the first <title> element with a regular
        expression.

        The input is only searched when is_html() accepts it. The title
        text is stripped of surrounding whitespace and converted with
        to_string().

        Return title, or None if no title found.

        >>> html = """
        ...     <html lang="en">
        ...         <head>
        ...              <title>
        ...                  Test title
        ...              </title>
        ...
        ...              <meta charset="utf-8">
        ...              </meta>
        ...         </head>
        ...
        ...         <body>
        ...             just a test
        ...         </body>
        ...
        ...     </html>
        ... """

        >>> print(get_title_using_re(html))
        Test title
    '''

    title_re = re.compile(
        rb'''<\s* title .*?>
                                  (.*?)
                              <\s*/\s* title \s*>
                         ''', re.VERBOSE | re.DOTALL | re.IGNORECASE)

    # anything that does not look like html has no title
    if not is_html(html):
        return None

    found = title_re.search(to_bytes(html))
    if not found:
        return None

    raw_title = found.group(1).strip()
    return to_string(raw_title)
Beispiel #5
0
def parse_response(response):
    '''
        Parse raw http response data into (prefix, params, content).

        'response' must be a string or bytes. A string is converted with
        to_bytes() before parsing.

        The header is everything before the first HTTP_SEPARATOR and is
        parsed by parse_header(). The params and content are then passed
        through uncompress_content(), whose results are returned.

        Returns:
        'prefix' and 'content' are byte sequences.
        'params' is a dict.

        Raises TypeError if 'response' is neither a string nor bytes.
    '''

    # isinstance() also accepts subclasses of str and bytes
    if isinstance(response, str):
        response = to_bytes(response)
    if not isinstance(response, bytes):
        raise TypeError("'response' must be bytes")

    header, _, content = response.partition(HTTP_SEPARATOR)
    prefix, params = parse_header(header)
    params, content = uncompress_content(params, content)

    return prefix, params, content
Beispiel #6
0
def is_xml(s):
    ''' Return True if likely xml, else False.

        's' is converted with to_bytes() and searched for anything that
        looks like a tag: '<', optional whitespace, a word, then the
        nearest '>' on the same line.

        >>> is_xml('text')
        False

        >>> is_xml('<a>')
        True

        >>> is_xml('<p>')
        True

        >>> is_xml('<a> <p>')
        True
    '''

    # whitespace in the pattern is ignored because of re.VERBOSE
    PATTERN = rb'< \s* \w+ .*? >'
    match = re.search(PATTERN, to_bytes(s), flags=(re.IGNORECASE | re.VERBOSE))

    # identity test is the idiomatic comparison against None
    return match is not None
Beispiel #7
0
def is_html(s):
    ''' Report whether 's' is likely html.

        True when an opening <html ...> tag is found anywhere in 's',
        ignoring case. Otherwise False.

        >>> is_html('text')
        False

        >>> is_html('<a>')
        False

        >>> is_html('<html>')
        True
    '''

    html_tag = rb'< \s* html .*? >'
    options = re.IGNORECASE | re.VERBOSE

    return bool(re.search(html_tag, to_bytes(s), flags=options))
Beispiel #8
0
def get_links(htmlpath, exclude=None):
    ''' Get links from an html file.

        Not well tested. See reinhardt.feeds for examples of more reliable parsing.

        Returns a list. Each item is a list of [PATH, URL, SUMMARY], where
        PATH is 'htmlpath', URL is the anchor's href, and SUMMARY is the
        anchor's stripped text. Only hrefs starting with 'http' are included.

        'htmlpath' is path of html file.

        'exclude' is string in href to exclude, without top level domain.
        Example: To exclude links to google, use "exclude='google'".
        If 'exclude' is None or empty, no links are excluded.

        Raises Exception if pyquery is not installed.

        Very ad hoc.
    '''

    # fallible import delayed until needed
    try:
        from pyquery.pyquery import PyQuery
    except ModuleNotFoundError as exc:
        raise Exception('pyquery not installed') from exc

    with open(htmlpath) as infile:
        raw_html = to_bytes(infile.read())

    html = PyQuery(raw_html)

    results = []
    for item in html.items('a'):
        href = item.attr('href')
        if not href or not href.startswith('http'):
            continue
        # only filter when an exclusion was requested; without this guard
        # the default exclude=None would drop every link
        if exclude and exclude in href:
            continue
        results.append([htmlpath, href, item.text().strip()])

    return results
Beispiel #9
0
def params_to_bytes(params):
    ''' Encode a params dict as the byte sequence used for http params.

        The dict is rendered by params_to_str(), then converted with
        to_bytes().
    '''

    text = params_to_str(params)
    return to_bytes(text)