コード例 #1
0
ファイル: test_utils_markup.py プロジェクト: chzealot/scrapy
    def test_remove_comments(self):
        """remove_comments() strips HTML comments and always returns unicode."""
        # the return type must be unicode whether or not a comment is present
        assert isinstance(remove_comments('without comments'), unicode)
        assert isinstance(remove_comments('<!-- with comments -->'), unicode)

        # comment-free text passes through untouched; comment markup is dropped
        for markup, expected in [
            (u'text without comments', u'text without comments'),
            (u'<!--text with comments-->', u''),
            (u'Hello<!--World-->', u'Hello'),
        ]:
            self.assertEqual(remove_comments(markup), expected)
コード例 #2
0
    def test_remove_comments(self):
        """remove_comments() must drop HTML comments and yield unicode output."""
        # return type check: always unicode, regardless of input content
        assert isinstance(remove_comments('without comments'), unicode)
        assert isinstance(remove_comments('<!-- with comments -->'), unicode)

        cases = (
            (u'text without comments', u'text without comments'),  # no comment
            (u'<!--text with comments-->', u''),                   # all comment
            (u'Hello<!--World-->', u'Hello'),                      # mixed
        )
        for source, expected in cases:
            self.assertEqual(remove_comments(source), expected)
コード例 #3
0
ファイル: extractors.py プロジェクト: chzealot/scrapy
def _process_markup(region, textf, tagf):
    """Yield transformed pieces of *region*.

    Tags are passed through ``tagf`` and text fragments (with HTML comments
    stripped) through ``textf``; falsy results are dropped.  Everything
    between an opening tag listed in ``_TAGS_TO_PURGE`` and its matching
    close tag is skipped entirely.  If the region exposes no
    ``parsed_fragments``, the region itself is handed to ``textf`` as a
    single text chunk.
    """
    fragments = getattr(region, 'parsed_fragments', None)
    if fragments is None:
        # unparsed region: treat the whole thing as one text chunk
        yield textf(region)
        return
    fiter = iter(fragments)
    for fragment in fiter:
        if not isinstance(fragment, HtmlTag):
            # plain text fragment: strip comments, then transform
            text = textf(remove_comments(region.htmlpage.fragment_data(fragment)))
            if text:
                yield text
            continue
        name = fragment.tag
        if name not in _TAGS_TO_PURGE:
            result = tagf(fragment)
            if result:
                yield result
        elif fragment.tag_type == HtmlTagType.OPEN_TAG:
            # purge: advance the shared iterator past everything up to and
            # including the matching close tag
            for candidate in fiter:
                if isinstance(candidate, HtmlTag) \
                        and candidate.tag == name \
                        and candidate.tag_type == HtmlTagType.CLOSE_TAG:
                    break
コード例 #4
0
def _process_markup(region, textf, tagf):
    """Generate the transformed fragments of *region*.

    Each :class:`HtmlTag` fragment is mapped through ``tagf`` and each text
    fragment (comments removed first) through ``textf``; only truthy values
    are yielded.  Content enclosed by a tag from ``_TAGS_TO_PURGE`` is
    discarded up to the matching close tag.  A region without
    ``parsed_fragments`` is transformed as one whole text chunk.
    """
    fragments = getattr(region, 'parsed_fragments', None)
    if fragments is None:
        # no parse available: hand the raw region to the text transform
        yield textf(region)
        return
    fragment_iter = iter(fragments)
    for item in fragment_iter:
        if isinstance(item, HtmlTag):
            opened = item.tag
            if opened in _TAGS_TO_PURGE:
                # swallow the tag's whole content by consuming the shared
                # iterator until the corresponding close tag appears
                if item.tag_type == HtmlTagType.OPEN_TAG:
                    for skipped in fragment_iter:
                        closes = (isinstance(skipped, HtmlTag) and
                                  skipped.tag == opened and
                                  skipped.tag_type == HtmlTagType.CLOSE_TAG)
                        if closes:
                            break
            else:
                tag_output = tagf(item)
                if tag_output:
                    yield tag_output
        else:
            raw = region.htmlpage.fragment_data(item)
            cleaned = textf(remove_comments(raw))
            if cleaned:
                yield cleaned
コード例 #5
0
ファイル: response.py プロジェクト: chenhbzl/book-crawler
def get_meta_refresh(response):
    """Parse the http-equiv="refresh" meta element of *response*.

    Returns a tuple ``(interval, url)`` where ``interval`` is the delay in
    seconds (as a float) and ``url`` is the absolute redirect target.
    ``(None, None)`` is returned when no meta refresh is found.  Results are
    memoized per response in ``_metaref_cache``.
    """
    if response not in _metaref_cache:
        # only the first 4096 characters of the body are scanned
        chunk = remove_comments(
            remove_entities(response.body_as_unicode()[0:4096]))
        m = META_REFRESH_RE.search(chunk)
        if m is None:
            _metaref_cache[response] = (None, None)
        else:
            interval = float(m.group('int'))
            target = safe_url_string(m.group('url').strip(' "\''))
            _metaref_cache[response] = (interval, urljoin_rfc(response.url, target))
    return _metaref_cache[response]
コード例 #6
0
ファイル: response.py プロジェクト: richard-ma/CodeReading
def get_meta_refresh(response):
    """Return ``(interval, url)`` parsed from the page's meta-refresh tag.

    ``interval`` is the delay in seconds (a float) and ``url`` the absolute
    redirect target; ``(None, None)`` when the scanned body prefix contains
    no meta refresh.  The result is cached per response in
    ``_metaref_cache``.
    """
    if response in _metaref_cache:
        return _metaref_cache[response]
    # entities are replaced first, then comments stripped, before scanning
    # a 4K prefix of the body
    head = response.body_as_unicode()[0:4096]
    refresh = META_REFRESH_RE.search(remove_comments(remove_entities(head)))
    result = (None, None)
    if refresh:
        delay = float(refresh.group('int'))
        absolute = urljoin_rfc(
            response.url, safe_url_string(refresh.group('url').strip(' "\'')))
        result = (delay, absolute)
    _metaref_cache[response] = result
    return _metaref_cache[response]
コード例 #7
0
ファイル: response.py プロジェクト: robyoung/scrapy
def get_meta_refresh(response):
    """Parse the http-equiv="refresh" meta element of *response*.

    Scans every ``<meta>`` tag in the first 4096 characters of the body and,
    for the first one declaring ``http-equiv="refresh"`` with a parseable
    ``content`` attribute, returns ``(interval, url)`` — the delay in
    seconds (a float) and the absolute redirect target.  ``(None, None)``
    is returned when no such tag is found.  Results are cached per response
    in ``_metaref_cache``.
    """
    if response not in _metaref_cache:
        chunk = remove_comments(
            remove_entities(response.body_as_unicode()[0:4096]))
        for tag_match in META_TAG_RE.finditer(chunk):
            # collect this tag's key/value attribute pairs
            attrs = {}
            for attr_match in META_TAG_ATTRS_RE.finditer(tag_match.group(1)):
                attrs[attr_match.group("key")] = attr_match.group("value")
            if attrs.get("http-equiv") != "refresh":
                continue
            content_match = META_CONTENT_RE.search(attrs.get("content", ""))
            if not content_match:
                # malformed content attribute: keep scanning later tags
                continue
            interval = float(content_match.group("int"))
            raw_url = (content_match.group("url") or "").strip(' "\'')
            url = urljoin_rfc(response.url, safe_url_string(raw_url))
            _metaref_cache[response] = (interval, url)
            return (interval, url)
        _metaref_cache[response] = (None, None)
    return _metaref_cache[response]