Exemple #1
0
def _prepare_response_message(response, send_body):
    return {
        'url': to_unicode(response.url),
        'status_code': response.status_code,
        'meta': dict_to_unicode(response.meta),
        'body': to_unicode(b64encode(response.body)) if send_body else None
    }
Exemple #2
0
def _prepare_request_message(request):
    return {
        'url': to_unicode(request.url),
        'method': to_unicode(request.method),
        'headers': dict_to_unicode(request.headers),
        'cookies': dict_to_unicode(request.cookies),
        'meta': dict_to_unicode(request.meta)
    }
Exemple #3
0
def get_meta_refresh(
    text: AnyStr,
    baseurl: str = "",
    encoding: str = "utf-8",
    ignore_tags: Iterable[str] = ("script", "noscript"),
) -> Tuple[Optional[float], Optional[str]]:
    """Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    try:
        utext = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    utext = remove_tags_with_content(utext, ignore_tags)
    utext = remove_comments(replace_entities(utext))
    m = _meta_refresh_re.search(utext)
    if m:
        interval = float(m.group("int"))
        url = safe_url_string(m.group("url").strip(" \"'"), encoding)
        url = urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
Exemple #4
0
    def _request_from_dict(cls, d, spider=None, method=None, properties=None):
        """Create Request object from a dict.

        If a spider is given, it will try to resolve the callbacks looking at the
        spider for methods with the same name.
        """
        cb = d.get("callback", None)
        if cb and spider:
            cb = _get_method(spider, cb)
        eb = d.get("errback", None)
        if eb and spider:
            eb = _get_method(spider, eb)
        request_cls = load_object(d.get("_class",
                                        None)) if "_class" in d else Request
        return request_cls(
            url=to_unicode(d.get("url", None)),
            callback=cb,
            errback=eb,
            method=d.get("method", None),
            headers=d.get("headers", None),
            body=d.get("body", None),
            cookies=d.get("cookies", None),
            meta=d.get("meta", None),
            encoding=d.get("_encoding", None),
            priority=d.get("priority", 0),
            dont_filter=d.get("dont_filter", True),
            flags=d.get("flags", None),
        )
Exemple #5
0
def _convert_and_save_type(obj):
    """
    :param obj: dict object

    The purpose of this method is to transform the given dict
    into a form that would be able to serialize with JSONEncoder.
    In order to implement this, this method converts all byte strings
    inside a dict to unicode and saves their type for reverting to its
    original state. The type and the value are stored as a tuple in the
    following format: (original_type, converted value). All other objects
    like dict, tuple, list are converted to the same format for the sake
    of serialization and for the ease of reverting.
    Refer `https://github.com/scrapinghub/frontera/pull/233#discussion_r97432868`
    for the detailed explanation about the design.
    """
    if isinstance(obj, bytes):
        return 'bytes', to_unicode(obj)
    elif isinstance(obj, dict):
        return 'dict', [(_convert_and_save_type(k), _convert_and_save_type(v))
                        for k, v in six.iteritems(obj)]
    elif isinstance(obj, (list, tuple)):
        return type(obj).__name__, [
            _convert_and_save_type(item) for item in obj
        ]
    return 'other', obj
Exemple #6
0
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
        encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """

    text = to_unicode(text, encoding)
    for ec in which_ones:
        text = text.replace(ec, to_unicode(replace_by, encoding))
    return text
Exemple #7
0
def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
    """
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    """

    def _get_fragments(txt, pattern):
        offset = 0
        for match in pattern.finditer(txt):
            match_s, match_e = match.span(1)
            yield txt[offset:match_s]
            yield match
            offset = match_e
        yield txt[offset:]

    text = to_unicode(text, encoding)
    ret_text = u''
    for fragment in _get_fragments(text, _cdata_re):
        if isinstance(fragment, six.string_types):
            # it's not a CDATA (so we try to remove its entities)
            ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
        else:
            # it's a CDATA (so we just extract its content)
            ret_text += fragment.group('cdata_d')
    return ret_text
Exemple #8
0
def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
    """Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
Exemple #9
0
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
Exemple #10
0
def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
        encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """

    text = to_unicode(text, encoding)
    for ec in which_ones:
        text = text.replace(ec, to_unicode(replace_by, encoding))
    return text
Exemple #11
0
def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
    """
    This function receives markup as a text (always a unicode string or
    a UTF-8 encoded string) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
        that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    """
    def _get_fragments(txt, pattern):
        offset = 0
        for match in pattern.finditer(txt):
            match_s, match_e = match.span(1)
            yield txt[offset:match_s]
            yield match
            offset = match_e
        yield txt[offset:]

    text = to_unicode(text, encoding)
    ret_text = u''
    for fragment in _get_fragments(text, _cdata_re):
        if isinstance(fragment, six.string_types):
            # it's not a CDATA (so we try to remove its entities)
            ret_text += replace_entities(fragment,
                                         keep=keep,
                                         remove_illegal=remove_illegal)
        else:
            # it's a CDATA (so we just extract its content)
            ret_text += fragment.group('cdata_d')
    return ret_text
Exemple #12
0
def get_meta_refresh(text,
                     baseurl='',
                     encoding='utf-8',
                     ignore_tags=('script', 'noscript')):
    """Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
Exemple #13
0
    def remove_tags_attribute(text, which_ones=(), keep=(), encoding=None):
        """ Remove HTML Tags attribute only.

        `which_ones` and `keep` are both tuples, there are four cases:

        |``which_ones``  |  ``keep``      | what it does                                |
        |--------------  |:---------------|:------------------------------------------- |
        | **not empty**  |   empty        | remove all tags in ``which_ones``           |
        | empty          | **not empty**  | remove all tags except the ones in ``keep`` |
        | empty          | empty          | remove all tags                             |
        | **not empty**  |  **not empty** | not allowed                                 |
        """

        assert not (which_ones and keep
                    ), 'which_ones and keep can not be given at the same time'

        which_ones = {tag.lower() for tag in which_ones}
        keep = {tag.lower() for tag in keep}

        def will_remove(tag):
            tag = tag.lower()
            if which_ones:
                return tag in which_ones
            else:
                return tag not in keep

        def remove_tag(m):
            tag = m.group(1)
            return "<{}>".format(
                m.group(1)) if will_remove(tag) else m.group(0)

        regex = r"<([a-zA-Z!].*?)\s+.*?>"
        retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
        return retags.sub(remove_tag, to_unicode(text, encoding))
Exemple #14
0
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
Exemple #15
0
def safe_url_string(
    url: StrOrBytes,
    encoding: str = "utf8",
    path_encoding: str = "utf8",
    quote_path: bool = True,
) -> str:
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). If quote_path is True (default),
    path_encoding ('utf-8' by default) is used to encode URL path component
    which is then quoted. Otherwise, if quote_path is False, path component
    is not encoded or quoted. Given encoding is used for query string
    or form data.

    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors="percentencode")
    parts = urlsplit(_ascii_tab_newline_re.sub("", decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc_bytes = parts.netloc.encode("idna")
    except UnicodeError:
        netloc = parts.netloc
    else:
        netloc = netloc_bytes.decode()

    # default encoding for path component SHOULD be UTF-8
    if quote_path:
        path = quote(parts.path.encode(path_encoding), _path_safe_chars)
    else:
        path = parts.path

    return urlunsplit(
        (
            parts.scheme,
            netloc.rstrip(":"),
            path,
            quote(parts.query.encode(encoding), _safe_chars),
            quote(parts.fragment.encode(encoding), _safe_chars),
        )
    )
Exemple #16
0
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """
    def convert_entity(m):
        groups = m.groupdict()
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                return m.group(0)
            else:
                number = (moves.html_entities.name2codepoint.get(entity_name)
                          or moves.html_entities.name2codepoint.get(
                              entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                else:
                    return six.unichr(number)
            except ValueError:
                pass

        return u'' if remove_illegal and groups.get('semicolon') else m.group(
            0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
Exemple #17
0
def parse_url(
    url: Union[StrOrBytes, ParseResult], encoding: Optional[str] = None
) -> ParseResult:
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
Exemple #18
0
def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed).

    It supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are removed.
    If `remove_illegal` is ``False``, entities that can't be converted are kept "as
    is". For more information see the tests.

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m):
        groups = m.groupdict()
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                return m.group(0)
            else:
                number = (moves.html_entities.name2codepoint.get(entity_name) or
                    moves.html_entities.name2codepoint.get(entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                else:
                    return six.unichr(number)
            except ValueError:
                pass

        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))
Exemple #19
0
def replace_escape_chars(
    text: AnyStr,
    which_ones: Iterable[str] = ("\n", "\t", "\r"),
    replace_by: StrOrBytes = "",
    encoding: Optional[str] = None,
) -> str:
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """

    utext = to_unicode(text, encoding)
    for ec in which_ones:
        utext = utext.replace(ec, to_unicode(replace_by, encoding))
    return utext
Exemple #20
0
def remove_comments(text, encoding=None):
    """ Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    """

    text = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub(u'', text)
Exemple #21
0
def remove_comments(text: AnyStr, encoding: Optional[str] = None) -> str:
    """Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    'test  whatever'
    >>>

    """

    utext = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub("", utext)
Exemple #22
0
def remove_comments(text, encoding=None):
    """ Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    """

    text = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub(u'', text)
Exemple #23
0
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """

    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return urljoin(safe_url_string(baseurl),
                       safe_url_string(m.group(1), encoding=encoding))
    else:
        return safe_url_string(baseurl)
Exemple #24
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986. Also, ASCII tabs and newlines are removed
    as per https://url.spec.whatwg.org/#url-parsing.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overriden by path_encoding), and given
    encoding is used for query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    decoded = to_unicode(url, encoding=encoding, errors='percentencode')
    parts = urlsplit(_ascii_tab_newline_re.sub('', decoded))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc).rstrip(':'),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Exemple #25
0
def get_base_url(text: AnyStr,
                 baseurl: StrOrBytes = "",
                 encoding: str = "utf-8") -> str:
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """

    utext = to_unicode(text, encoding)
    m = _baseurl_re.search(utext)
    if m:
        return urljoin(safe_url_string(baseurl),
                       safe_url_string(m.group(1), encoding=encoding))
    else:
        return safe_url_string(baseurl)
Exemple #26
0
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """

    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return moves.urllib.parse.urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)
Exemple #27
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overriden by path_encoding), and given
    encoding is used for query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding,
                                errors='percentencode'))

    # IDNA encoding can fail for too long labels (>63 characters)
    # or missing labels (e.g. http://.example.com)
    try:
        netloc = parts.netloc.encode('idna')
    except UnicodeError:
        netloc = parts.netloc

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(netloc),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Exemple #28
0
def create_spiders(spiders, schemas, extractors, items, selector='css'):
    """Create all spiders from slybot spiders."""
    item_classes = ''
    if items:
        item_classes = '\nfrom ..items import {}'.format(', '.join(
            (v().__class__.__name__ for v in items.values())))
    spider_data = []
    for name, (spider, spec) in spiders.items():
        log.info('Creating spider "%s"' % spider.name)
        spider = create_spider(name, spider, spec, schemas, extractors, items,
                               selector)
        cleaned_name = _clean(name)
        filename = 'spiders/{}.py'.format(cleaned_name)
        data = '\n'.join(
            (SPIDER_FILE(item_classes=item_classes), spider.strip()))
        code = fix_code(to_unicode(data), OPTIONS)
        spider_data.append((filename, code))
    return spider_data
Exemple #29
0
def create_spiders(spiders, schemas, extractors, items, selector='css'):
    """Create all spiders from slybot spiders."""
    item_classes = ''
    if items:
        item_classes = '\nfrom ..items import {}'.format(
            ', '.join((v().__class__.__name__ for v in items.values()))
        )
    spider_data = []
    for name, (spider, spec) in spiders.items():
        log.info('Creating spider "%s"' % spider.name)
        spider = create_spider(name, spider, spec, schemas, extractors, items,
                               selector)
        cleaned_name = _clean(name)
        filename = 'spiders/{}.py'.format(cleaned_name)
        data = '\n'.join((SPIDER_FILE(item_classes=item_classes),
                          spider.strip()))
        code = fix_code(to_unicode(data), OPTIONS)
        spider_data.append((filename, code))
    return spider_data
Exemple #30
0
def remove_tags_with_content(text, which_ones=(), encoding=None):
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """

    text = to_unicode(text, encoding)
    if which_ones:
        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text
Exemple #31
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overriden by path_encoding), and given
    encoding is used for query string or form data.
    When passing a encoding, you should use the encoding of the
    original page (the page from which the url was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding.
    #
    # it is assumed that a raw bytes input comes from the page
    # corresponding to the encoding
    #
    # Note: if this assumption is wrong, this will fail;
    #       in the general case, users are required to use Unicode
    #       or safe ASCII bytes input
    parts = urlsplit(to_unicode(url, encoding=encoding))

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Exemple #32
0
def replace_tags(text, token='', encoding=None):
    """Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\\xe7ais --  -- '
    >>>

    """

    return _tag_re.sub(token, to_unicode(text, encoding))
Exemple #33
0
def remove_tags_with_content(text, which_ones=(), encoding=None):
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """

    text = to_unicode(text, encoding)
    if which_ones:
        tags = '|'.join(
            [r'<%s.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text
Exemple #34
0
def replace_tags(text, token='', encoding=None):
    """Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\\xe7ais --  -- '
    >>>

    """

    return _tag_re.sub(token, to_unicode(text, encoding))
Exemple #35
0
def safe_url_string(url, encoding='utf8', path_encoding='utf8'):
    """Convert the given URL into a legal URL by escaping unsafe characters
    according to RFC-3986.

    If a bytes URL is given, it is first converted to `str` using the given
    encoding (which defaults to 'utf-8'). 'utf-8' encoding is used for
    URL path component (unless overriden by path_encoding), and given
    encoding is used for query string or form data.
    When passing an encoding, you should use the encoding of the
    original page (the page from which the URL was extracted from).

    Calling this function on an already "safe" URL will return the URL
    unmodified.

    Always returns a native `str` (bytes in Python2, unicode in Python3).
    """
    # Python3's urlsplit() chokes on bytes input with non-ASCII chars,
    # so let's decode (to Unicode) using page encoding:
    #   - it is assumed that a raw bytes input comes from a document
    #     encoded with the supplied encoding (or UTF8 by default)
    #   - if the supplied (or default) encoding chokes,
    #     percent-encode offending bytes
    parts = urlsplit(to_unicode(url, encoding=encoding,
                                errors='percentencode'))

    # quote() in Python2 return type follows input type;
    # quote() in Python3 always returns Unicode (native str)
    return urlunsplit((
        to_native_str(parts.scheme),
        to_native_str(parts.netloc.encode('idna')),

        # default encoding for path component SHOULD be UTF-8
        quote(to_bytes(parts.path, path_encoding), _safe_chars),

        # encoding of query and fragment follows page encoding
        # or form-charset (if known and passed)
        quote(to_bytes(parts.query, encoding), _safe_chars),
        quote(to_bytes(parts.fragment, encoding), _safe_chars),
    ))
Exemple #36
0
def _convert_and_save_type(obj):
    """
    :param obj: dict object

    The purpose of this method is to transform the given dict
    into a form that would be able to serialize with JSONEncoder.
    In order to implement this, this method converts all byte strings
    inside a dict to unicode and saves their type for reverting to its
    original state. The type and the value are stored as a tuple in the
    following format: (original_type, converted value). All other objects
    like dict, tuple, list are converted to the same format for the sake
    of serialization and for the ease of reverting.
    Refer `https://github.com/scrapinghub/frontera/pull/233#discussion_r97432868`
    for the detailed explanation about the design.
    """
    if isinstance(obj, bytes):
        return 'bytes', to_unicode(obj)
    elif isinstance(obj, dict):
        return 'dict', [(_convert_and_save_type(k), _convert_and_save_type(v)) for k, v in six.iteritems(obj)]
    elif isinstance(obj, (list, tuple)):
        return type(obj).__name__, [_convert_and_save_type(item) for item in obj]
    return 'other', obj
Exemple #37
0
def remove_tags_with_content(text: AnyStr,
                             which_ones: Iterable[str] = (),
                             encoding: Optional[str] = None) -> str:
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    '<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """

    utext = to_unicode(text, encoding)
    if which_ones:
        tags = "|".join(
            [fr"<{tag}\b.*?</{tag}>|<{tag}\s*/>" for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        utext = retags.sub("", utext)
    return utext
Exemple #38
0
def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """ Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
        ...
    ValueError: Cannot use both which_ones and keep
    >>>

    """
    if which_ones and keep:
        raise ValueError('Cannot use both which_ones and keep')

    which_ones = {tag.lower() for tag in which_ones}
    keep = {tag.lower() for tag in keep}

    def will_remove(tag):
        tag = tag.lower()
        if which_ones:
            return tag in which_ones
        else:
            return tag not in keep

    def remove_tag(m):
        tag = m.group(1)
        return u'' if will_remove(tag) else m.group(0)

    regex = '</?([^ >/]+).*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, to_unicode(text, encoding))
Exemple #39
0
def create_schemas(items):
    """Create and write schemas from definitions."""
    schema_classes, schema_names = create_schemas_classes(items)
    items_py = '\n'.join(chain([ITEMS_IMPORTS], schema_classes)).strip()
    items_py = fix_code(to_unicode(items_py), OPTIONS)
    return items_py, schema_names
Exemple #40
0
def has_entities(text, encoding=None):
    return bool(_ent_re.search(to_unicode(text, encoding)))
Exemple #41
0
def _quote_byte(error):
    return (to_unicode(quote(error.object[error.start:error.end])), error.end)
Exemple #42
0
def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """ Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
        assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
    AssertionError: which_ones and keep can not be given at the same time
    >>>

    """

    assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'

    which_ones = {tag.lower() for tag in which_ones}
    keep = {tag.lower() for tag in keep}

    def will_remove(tag):
        tag = tag.lower()
        if which_ones:
            return tag in which_ones
        else:
            return tag not in keep

    def remove_tag(m):
        tag = m.group(1)
        return u'' if will_remove(tag) else m.group(0)

    regex = '</?([^ >/]+).*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, to_unicode(text, encoding))
Exemple #43
0
def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """ Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================


    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
        assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
    AssertionError: which_ones and keep can not be given at the same time
    >>>

    """

    assert not (which_ones and
                keep), 'which_ones and keep can not be given at the same time'

    which_ones = {tag.lower() for tag in which_ones}
    keep = {tag.lower() for tag in keep}

    def will_remove(tag):
        tag = tag.lower()
        if which_ones:
            return tag in which_ones
        else:
            return tag not in keep

    def remove_tag(m):
        tag = m.group(1)
        return u'' if will_remove(tag) else m.group(0)

    regex = '</?([^ >/]+).*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, to_unicode(text, encoding))
Exemple #44
0
def create_schemas(items):
    """Create and write schemas from definitions."""
    schema_classes, schema_names = create_schemas_classes(items)
    items_py = '\n'.join(chain([ITEMS_IMPORTS], schema_classes)).strip()
    items_py = fix_code(to_unicode(items_py), OPTIONS)
    return items_py, schema_names
Exemple #45
0
def _prepare_response_message(response, send_body):
    return {'url': to_unicode(response.url),
            'status_code': response.status_code,
            'meta': dict_to_unicode(response.meta),
            'body': to_unicode(b64encode(response.body)) if send_body else None}
Exemple #46
0
def _prepare_request_message(request):
    return {'url': to_unicode(request.url),
            'method': to_unicode(request.method),
            'headers': dict_to_unicode(request.headers),
            'cookies': dict_to_unicode(request.cookies),
            'meta': dict_to_unicode(request.meta)}
Exemple #47
0
def has_entities(text: AnyStr, encoding: Optional[str] = None) -> bool:
    return bool(_ent_re.search(to_unicode(text, encoding)))
Exemple #48
0
def has_entities(text, encoding=None):
    return bool(_ent_re.search(to_unicode(text, encoding)))
Exemple #49
0
def _quote_byte(error):
    return (to_unicode(quote(error.object[error.start:error.end])), error.end)