Code Example #1
File: test_url.py Project: scrapy/w3lib
    def test_safe_url_string_with_query(self):
        safeurl = safe_url_string("http://www.example.com/£?unit=µ")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

        safeurl = safe_url_string("http://www.example.com/£?unit=µ",
                                  encoding="utf-8")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

        safeurl = safe_url_string("http://www.example.com/£?unit=µ",
                                  encoding="latin-1")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")

        safeurl = safe_url_string("http://www.example.com/£?unit=µ",
                                  path_encoding="latin-1")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%C2%B5")

        safeurl = safe_url_string(
            "http://www.example.com/£?unit=µ",
            encoding="latin-1",
            path_encoding="latin-1",
        )
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")
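As a quick reference, the behaviour exercised by the assertions above can be reproduced directly; a minimal sketch, assuming w3lib is installed and, as the test suggests, that encoding= controls how the query string is percent-encoded while path_encoding= overrides only the path:

from w3lib.url import safe_url_string

# Default: both path and query are percent-encoded as UTF-8.
print(safe_url_string("http://www.example.com/£?unit=µ"))
# -> http://www.example.com/%C2%A3?unit=%C2%B5

# encoding= changes the query-string encoding; the path stays UTF-8
# unless path_encoding= is given as well.
print(safe_url_string("http://www.example.com/£?unit=µ", encoding="latin-1"))
# -> http://www.example.com/%C2%A3?unit=%B5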
Code Example #2
    def item_completed(self, results, item, info):
        print(results)
        if len(results) > 0:
            for result in results:
                if result[0]:
                    url = result[1]['url']
                    path = result[1]['path']
                    audio_url = item['question_audio_url']
                    if audio_url is not None and str(audio_url) != "" and url.find("mp3") != -1:
                        if url == safe_url_string(audio_url):
                            item['question_audio_url'] = path
                    elif len(item['question_content_file_url_list']) > 0:
                        for index, file in enumerate(item['question_content_file_url_list']):
                            if safe_url_string(file) == url:
                                article_html = item['question_content'][0]
                                count = 0
                                new_article_html = ""
                                new_article_html_list = []
                                for i in range(len(article_html) - 1):
                                    if article_html[i:i + len("$img")] == "$img":
                                        if count == index:
                                            new_article_html = article_html[:i]
                                            path_new = str(path).replace("\\", "/")
                                            new_article_html += "<img src='upload/upload/img/" + path_new + "'/>"
                                            new_article_html += article_html[i + len("$img"):]
                                            new_article_html_list.append(new_article_html)
                                            item['question_content'] = new_article_html_list
                                            break
                                        else:
                                            count += 1

            print(results)
        data = dict(item)
        self.client.insert(data)
        return item
Code Example #3
File: test_url.py Project: scrapy/w3lib
 def test_safe_url_port_number(self):
     self.assertEqual(
         safe_url_string(u"http://www.example.com:80/résumé?q=résumé"),
         "http://www.example.com:80/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(
         safe_url_string(u"http://www.example.com:/résumé?q=résumé"),
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
Code Example #4
File: test_url.py Project: hackrush01/w3lib
 def test_safe_url_port_number(self):
     self.assertEqual(
         safe_url_string(u"http://www.example.com:80/résumé?q=résumé"),
         "http://www.example.com:80/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
     self.assertEqual(
         safe_url_string(u"http://www.example.com:/résumé?q=résumé"),
         "http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
Code Example #5
File: test_url.py Project: codecov-test/w3lib
    def test_safe_url_string_misc(self):
        # mixing Unicode and percent-escaped sequences
        safeurl = safe_url_string(u"http://www.example.com/£?unit=%C2%B5")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

        safeurl = safe_url_string(u"http://www.example.com/%C2%A3?unit=µ")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
Code Example #6
    def test_safe_url_idna(self):
        # adapted from:
        # https://ssl.icu-project.org/icu-bin/idnbrowser
        # http://unicode.org/faq/idn.html
        # + various others
        websites = (
            (u'http://www.färgbolaget.nu/färgbolaget',
             'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'),
            (u'http://www.räksmörgås.se/?räksmörgås=yes',
             'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'
             ),
            (u'http://www.brændendekærlighed.com/brændende/kærlighed',
             'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'
             ),
            (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'),
            (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'),
            (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'),

            # --- real websites ---

            # in practice, this redirect (301) to http://www.buecher.de/?q=b%C3%BCcher
            (u'http://www.bücher.de/?q=bücher',
             'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),

            # Japanese
            (u'http://はじめよう.みんな/?query=サ&maxResults=5',
             'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'
             ),

            # Russian
            (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
            (u'http://кто.рф/index.php?domain=Что',
             'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),

            # Korean
            (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
            (u'http://맨체스터시티축구단.한국/',
             'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),

            # Arabic
            (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),

            # Chinese
            (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
            (u'https://www2.xn--0kwr83e.在线',
             'https://www2.xn--0kwr83e.xn--3ds443g'),
            (u'https://www3.贷款.xn--3ds443g',
             'https://www3.xn--0kwr83e.xn--3ds443g'),
        )
        for idn_input, safe_result in websites:
            safeurl = safe_url_string(idn_input)
            self.assertEqual(safeurl, safe_result)

        # make sure the safe URL is unchanged when made safe a 2nd time
        for _, safe_result in websites:
            safeurl = safe_url_string(safe_result)
            self.assertEqual(safeurl, safe_result)
Code Example #7
File: test_url.py Project: scrapy/w3lib
    def test_safe_url_string_misc(self):
        # mixing Unicode and percent-escaped sequences
        safeurl = safe_url_string("http://www.example.com/£?unit=%C2%B5")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

        safeurl = safe_url_string("http://www.example.com/%C2%A3?unit=µ")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")
Code Example #8
File: test_url.py Project: scrapy/w3lib
    def test_safe_url_string_quote_path(self):
        safeurl = safe_url_string('http://google.com/"hello"', quote_path=True)
        self.assertEqual(safeurl, "http://google.com/%22hello%22")

        safeurl = safe_url_string('http://google.com/"hello"',
                                  quote_path=False)
        self.assertEqual(safeurl, 'http://google.com/"hello"')

        safeurl = safe_url_string('http://google.com/"hello"')
        self.assertEqual(safeurl, "http://google.com/%22hello%22")
Code Example #9
File: test_url.py Project: scrapy/w3lib
 def test_safe_url_string_encode_idna_domain_with_username_and_empty_password_and_port_number(
     self, ):
     self.assertEqual(
         safe_url_string("ftp://admin:@新华网.中国:21"),
         "ftp://admin:@xn--xkrr14bows.xn--fiqs8s:21",
     )
     self.assertEqual(
         safe_url_string("ftp://admin@新华网.中国:21"),
         "ftp://admin@xn--xkrr14bows.xn--fiqs8s:21",
     )
Code Example #10
File: __init__.py Project: zhangcheng/scrapy
 def _set_url(self, url):
     if isinstance(url, str):
         self._url = safe_url_string(url)
     elif isinstance(url, unicode):
         if self.encoding is None:
             raise TypeError('Cannot convert unicode url - %s has no encoding' %
                 type(self).__name__)
         unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding)
         self._url = safe_url_string(unicode_url, self.encoding)
     else:
         raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
Code Example #11
def main():
    total = 0
    time = 0
    time_file_uri_to_path = 0
    time_safe_url_string = 0
    time_canonicalize_url = 0

    tar = tarfile.open("sites.tar.gz")
    urls = []

    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        response = HtmlResponse(url="local", body=html, encoding='utf8')

        links = response.css('a::attr(href)').extract()
        urls.extend(links)

    for url in urls:
        start_file_uri_to_path = timer()
        file_uri_to_path(url)
        end_file_uri_to_path = timer()
        time_file_uri_to_path += (end_file_uri_to_path -
                                  start_file_uri_to_path)
        time += (end_file_uri_to_path - start_file_uri_to_path)

        start_safe_url_string = timer()
        safe_url_string(url)
        end_safe_url_string = timer()
        time_safe_url_string += (end_safe_url_string - start_safe_url_string)
        time += (end_safe_url_string - start_safe_url_string)

        start_canonicalize_url = timer()
        canonicalize_url(url)
        end_canonicalize_url = timer()
        time_canonicalize_url += (end_canonicalize_url -
                                  start_canonicalize_url)
        time += (end_canonicalize_url - start_canonicalize_url)

        # any_to_uri(url) # Error on Python 2: KeyError: u'\u9996'

        total += 1

    print("\nTotal number of items extracted = {0}".format(total))
    print("Time spent on file_uri_to_path = {0}".format(time_file_uri_to_path))
    print("Time spent on safe_url_string = {0}".format(time_safe_url_string))
    print("Time spent on canonicalize_url = {0}".format(time_canonicalize_url))
    print("Total time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)),
                bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Code Example #12
    def item_completed(self, results, item, info):
        if len(results) > 0:
            for result in results:
                if result[0]:
                    url = result[1]['url']
                    path = result[1]['path']
                    if item.__contains__('question_content_file') and len(
                            item['question_content_file']) > 0:
                        for index, file in enumerate(
                                item['question_content_file']):
                            if safe_url_string(file, encoding="utf8") == url:
                                article_html = item['question_title']
                                count = 0
                                new_article_html = ""
                                for i in range(len(article_html) - 1):
                                    if article_html[i:i +
                                                    len("$img")] == "$img":
                                        if count == index:
                                            new_article_html = article_html[:i]
                                            path_new = str(path).replace(
                                                "\\", "/")
                                            new_article_html += "<img src='upload/upload/img/gmat/" + path_new + "'/>"
                                            new_article_html += article_html[
                                                i + len("$img"):]
                                            item[
                                                'question_title'] = new_article_html
                                            break
                                        else:
                                            count += 1
                            elif item.__contains__(
                                    "article_content_file"
                            ) and item["article_content_file"] != "":
                                if safe_url_string(
                                        item["article_content_file"],
                                        encoding="utf8") == url:
                                    path_new = str(path).replace("\\", "/")
                                    new_article_content = item[
                                        "article_content"]
                                    new_article_content += "<p><img src='upload/upload/img/gmat/" + path_new + "'/></p>"
                    else:
                        if item.__contains__("article_content_file") and item[
                                "article_content_file"] != "":
                            if safe_url_string(item["article_content_file"],
                                               encoding="utf8") == url:
                                path_new = str(path).replace("\\", "/")
                                new_article_content = item["article_content"]
                                new_article_content += "<p><img src='upload/upload/img/gmat/" + path_new + "'/></p>"
                                item["article_content"] = new_article_content

        else:
            print(results)
        data = dict(item)
        self.client.insert(data)
        return item
Code Example #13
File: test_url.py Project: sibiryakov/w3lib
    def test_safe_url_string_bytes_input_nonutf8(self):
        # latin1
        safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")

        # cp1251
        # >>> u'Россия'.encode('cp1251')
        # '\xd0\xee\xf1\xf1\xe8\xff'
        safeurl = safe_url_string(b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/country/%D0%EE%F1%F1%E8%FF")
Code Example #14
    def test_safe_url_idna_encoding_failure(self):
        # missing DNS label
        self.assertEqual(
            safe_url_string(u"http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # DNS label too long
        self.assertEqual(
            safe_url_string(u"http://www.{label}.com/résumé?q=résumé".format(
                label=u"example" * 11)),
            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".
            format(label=u"example" * 11))
Code Example #15
File: test_url.py Project: codecov-test/w3lib
    def test_safe_url_string_bytes_input_nonutf8(self):
        # latin1
        safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")

        # cp1251
        # >>> u'Россия'.encode('cp1251')
        # '\xd0\xee\xf1\xf1\xe8\xff'
        safeurl = safe_url_string(b"http://www.example.com/country/\xd0\xee\xf1\xf1\xe8\xff")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/country/%D0%EE%F1%F1%E8%FF")
Code Example #16
File: test_url.py Project: scrapy/w3lib
    def test_safe_url_idna_encoding_failure(self):
        # missing DNS label
        self.assertEqual(
            safe_url_string("http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
        )

        # DNS label too long
        self.assertEqual(
            safe_url_string(
                f"http://www.{'example' * 11}.com/résumé?q=résumé"),
            f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9",
        )
Code Example #17
File: __init__.py Project: SeaBear/scrapy
 def _set_url(self, url):
     if isinstance(url, str):
         self._url = escape_ajax(safe_url_string(url))
     elif isinstance(url, unicode):
         if self.encoding is None:
             raise TypeError('Cannot convert unicode url - %s has no encoding' %
                 type(self).__name__)
         unicode_url = url if isinstance(url, unicode) else url.decode(self.encoding)
         self._url = safe_url_string(unicode_url, self.encoding)
     else:
         raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
     if ':' not in self._url:
         raise ValueError('Missing scheme in request url: %s' % self._url)
Code Example #18
File: test_url.py Project: Preetwinder/w3lib
    def test_safe_url_idna_encoding_failure(self):
        # missing DNS label
        self.assertEqual(
            safe_url_string(u"http://.example.com/résumé?q=résumé"),
            "http://.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")

        # DNS label too long
        self.assertEqual(
            safe_url_string(
                u"http://www.{label}.com/résumé?q=résumé".format(
                    label=u"example"*11)),
            "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
                    label=u"example"*11))
Code Example #19
File: test_url.py Project: scrapy/w3lib
 def test_safe_url_string_encode_idna_domain_with_username_password_and_port_number(
     self, ):
     self.assertEqual(
         safe_url_string("ftp://*****:*****@新华网.中国:21"),
         "ftp://*****:*****@xn--xkrr14bows.xn--fiqs8s:21",
     )
     self.assertEqual(
         safe_url_string("http://Åsa:abc123@➡.ws:81/admin"),
         "http://%C3%85sa:[email protected]:81/admin",
     )
     self.assertEqual(
         safe_url_string("http://japão:não@️i❤️.ws:8000/"),
         "http://jap%C3%A3o:n%C3%[email protected]:8000/",
     )
Code Example #20
 def url(self, url):
     if isinstance(url, str):
         self._url = safe_url_string(url)
     elif isinstance(url, six.text_type):
         if self.encoding is None:
             raise TypeError(
                 'Cannot convert unicode url - %s has no encoding' %
                 type(self).__name__)
         self._url = safe_url_string(url.encode(self.encoding))
     else:
         raise TypeError('Response url must be str or unicode, got %s:' %
                         type(url).__name__)
     if ':' not in self._url:
         raise ValueError('Missing scheme in request url: %s' % self._url)
Code Example #21
    def process_response(self, request, response, spider):
        if request.meta.get('dont_redirect', False):
            return response

        if response.status in [302, 303] and 'Location' in response.headers:
            redirected_url = urljoin(request.url, safe_url_string(response.headers['location']))
            redirected = self._redirect_request_using_get(request, redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if response.status in [301, 307] and 'Location' in response.headers:
            redirected_url = urljoin(request.url, safe_url_string(response.headers['location']))
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)
        return response
Code Example #22
 def _set_url(self, url):
     if isinstance(url, str):
         self._url = safe_url_string(url)
     elif isinstance(url, unicode):
         if self.encoding is None:
             raise TypeError(
                 'Cannot convert unicode url - %s has no encoding' %
                 type(self).__name__)
         unicode_url = url if isinstance(url, unicode) else url.decode(
             self.encoding)
         self._url = safe_url_string(unicode_url, self.encoding)
     else:
         raise TypeError('Request url must be str or unicode, got %s:' %
                         type(url).__name__)
Code Example #23
File: html.py Project: zanachka/w3lib
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """

    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return urljoin(safe_url_string(baseurl),
                       safe_url_string(m.group(1), encoding=encoding))
    else:
        return safe_url_string(baseurl)
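A brief usage sketch of the function above (the HTML snippet is hypothetical; the import path w3lib.html matches the file shown):

from w3lib.html import get_base_url

html = '<html><head><base href="http://example.com/docs/"></head><body></body></html>'
# A declared <base href> is returned, resolved against the fallback base URL.
print(get_base_url(html, baseurl="http://example.com/"))
# -> http://example.com/docs/

# With no <base> element, the fallback baseurl is returned, made safe.
print(get_base_url("<html></html>", baseurl="http://example.com/£"))
# -> http://example.com/%C2%A3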
Code Example #24
File: html.py Project: rrosajp/w3lib
def get_meta_refresh(
    text: AnyStr,
    baseurl: str = "",
    encoding: str = "utf-8",
    ignore_tags: Iterable[str] = ("script", "noscript"),
) -> Tuple[Optional[float], Optional[str]]:
    """Return the http-equiv refresh parameter of the HTML meta element in the
    given HTML text as a tuple ``(interval, url)``, where ``interval`` is a float
    with the delay in seconds (or zero if not present) and ``url`` is a string
    with the absolute URL to redirect to.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    try:
        utext = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    utext = remove_tags_with_content(utext, ignore_tags)
    utext = remove_comments(replace_entities(utext))
    m = _meta_refresh_re.search(utext)
    if m:
        interval = float(m.group("int"))
        url = safe_url_string(m.group("url").strip(" \"'"), encoding)
        url = urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
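A brief usage sketch of get_meta_refresh (hypothetical markup, same import path assumption as above):

from w3lib.html import get_meta_refresh

html = '<head><meta http-equiv="refresh" content="5; url=relative/page.html"></head>'
interval, url = get_meta_refresh(html, baseurl="http://example.com/a/")
print(interval, url)
# -> 5.0 http://example.com/a/relative/page.html

# No meta refresh in the document: (None, None) is returned.
print(get_meta_refresh("<html></html>", baseurl="http://example.com/"))
# -> (None, None)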
Code Example #25
	def process_response(self, request, response, spider):
		if (request.meta.get('dont_redirect', False) or
				response.status in getattr(spider, 'handle_httpstatus_list', []) or
				response.status in request.meta.get('handle_httpstatus_list', []) or
				request.meta.get('handle_httpstatus_all', False)):
			return response

		allowed_status = (301, 302, 303, 307, 308)
		if 'Location' not in response.headers or response.status not in allowed_status:
			return response

		location = safe_url_string(response.headers['location'])

		redirected_url = urljoin(request.url, location)

		if response.status in (301, 307, 308) or request.method == 'HEAD':
			redirected = request.replace(url=redirected_url)
			return self._redirect(redirected, request, spider, response.status)

		redirected = self._redirect_request_using_get(request, redirected_url)
		redirected_0 = self._redirect(redirected, request, spider, response.status)
		logger.debug('Redirect detected, pausing and changing IP...')
		self.crawler.engine.pause()
		tor_controller.change_identity()
		self.crawler.engine.unpause()
		return Request(url=redirected_0.meta['redirect_urls'][0])
Code Example #26
File: pdf.py Project: muzichenglong/scrapyc
    def parse_all(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        base_site = get_url_site(base_url)

        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)

            filename = abs_url.split("?")[0].split("/")[-1]
            if filename :
                ctype  = filename.split(".")[-1].lower() 
            else:
                ctype = None
            if ctype in ["jpeg","jpg","swf","rar","zip","gz","gif","mov","png","bmp","exe","pps","db","txt","pptx",'xls',"ppt","xlsx"]:
                continue

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})

            site = get_url_site(abs_url)
            if site != base_site:
                continue
            if ctype in ["pdf","doc","docx","rtf",]:
                continue
            yield scrapy.Request(url=abs_url,callback=self.parse_all)
Code Example #27
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)

        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if 'firewall' in redirected_url:
            # to avoid case 1 and case 2: real_url -> firewall
            return Request(request.url, callback=spider.parse_detail, meta=request.meta, dont_filter=True)

        if 'Jump' in redirected_url:
            # to avoid case 3: fake_url -> jump_url -> jump_url -> jump_url, give up the url
            new_request = request.replace(url=redirected_url, method='GET', body='', meta=request.meta)  # every hit on this jump url adds another retry, i.e. endless retries

        else:
            new_request = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(new_request, request, spider, response.status)
Code Example #28
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])
        redirected_url = urljoin(request.url, location)
        spider.logger.info('original_url:{}'.format(request.url))
        spider.logger.info('location:{}'.format(location))
        spider.logger.info('redirected_url:{}'.format(redirected_url))

        if location != 'http://gd.chinavnet.com':
            redirected_url = urljoin(request.url, location)
        else:
            redirected_url = request.url

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
Code Example #29
File: middlewares.py Project: zhangda7/spider
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])

        redirected_url = urljoin(request.url, location)


        if(redirected_url == request.url):
            logger.info("Url %s %s, equal", redirected_url, request.url)
            #must return response to pass to next middleware
            # return response
            pass
        else:
            logger.info("Url %s %s, not equal, just retry request from scratch", redirected_url, request.url)
            #set redirect url
            redirected_url = request.url
            return request

        if response.status in (301, 307) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
Code Example #30
    def share_post(self,
                   toSharepost_url,
                   toforumname,
                   title='分享一下',
                   content='转发一下'):
        postdata = self._get_data_for_sharePost(toshareurl=toSharepost_url,
                                                toforumname=toforumname,
                                                content=content,
                                                title=title)
        post_url = 'http://tieba.baidu.com/f/commit/share/commitShareApi'
        headers1 = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'tieba.baidu.com',
            'Origin': 'http://tieba.baidu.com',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

        headerreferurl = postdata['referurl']
        headerreferurlsafe = safe_url_string(url=headerreferurl)
        headers1['Referer'] = headerreferurlsafe
        del postdata['referurl']
        response1 = self.session.post(url=post_url,
                                      data=postdata,
                                      headers=headers1)
        print response1.text
Code Example #31
def get_meta_refresh(text,
                     baseurl='',
                     encoding='utf-8',
                     ignore_tags=('script', 'noscript')):
    """Return the http-equiv refresh parameter of the HTML meta element in the
    given HTML text as a tuple ``(interval, url)``, where ``interval`` is a float
    with the delay in seconds (or zero if not present) and ``url`` is a string
    with the absolute URL to redirect to.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    try:
        text = to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
Code Example #32
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin(
                response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, six.text_type):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = to_unicode(link.text,
                                   response_encoding,
                                   errors='replace').strip()
            ret.append(link)

        return ret
Code Example #33
File: middlewares.py Project: mingyue33/-
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        if response.status in self.proxy_status:
            if 'Location' in response.headers:
                location = safe_url_string(response.headers['location'])
                redirected_url = urljoin(request.url, location)
            else:
                redirected_url = ''

            # AutoProxy for first time
            if not request.meta.get('auto_proxy'):
                request.meta.update({'auto_proxy': True, 'proxy': self.proxy_config})
                new_request = request.replace(meta=request.meta, dont_filter=True)
                new_request.priority = request.priority + 2

                spider.log('Will AutoProxy for <{} {}> {}'.format(
                    response.status, request.url, redirected_url))
                return new_request

            # IgnoreRequest for second time
            else:
                spider.logger.warn('Ignoring response <{} {}>: HTTP status code still in {} after AutoProxy'.format(
                    response.status, request.url, self.proxy_status))
                raise IgnoreRequest

        return response
Code Example #34
File: html.py Project: fubuki/w3lib
def get_meta_refresh(text, baseurl='', encoding='utf-8'):
    """Return the http-equiv refresh parameter of the HTML meta element in the
    given HTML text as a tuple ``(interval, url)``, where ``interval`` is a float
    with the delay in seconds (or zero if not present) and ``url`` is a string
    with the absolute URL to redirect to.

    If no meta redirect is found, ``(None, None)`` is returned.

    """

    if six.PY2:
        baseurl = unicode_to_str(baseurl, encoding)
    try:
        text = str_to_unicode(text, encoding)
    except UnicodeDecodeError:
        print(text)
        raise
    text = remove_comments(remove_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None
Code Example #35
    def process_response(self, request, response, spider):

        if request.meta.get("local_redirect"):
            location_url = ""
            # logger.debug("local redirect middlewares: {}".format(response.url))
            if response.status == 302:
                location_url = safe_url_string(
                    response.headers.get("location", ""))

            for off_key in off_keys:
                if off_key in location_url:
                    # response.status = 200
                    # request.meta["is_404"] = True
                    # ignore the page
                    # request.meta["dont_redirect"] = True
                    raise IgnoreRequest

            if location_url.startswith("http"):
                reason = "local pan middlewares, redirected!!!"
                request.headers.pop("Referer", None)
                request.priority += 100
                redirected = request.replace(url=location_url)
                return self._redirect(redirected, request, spider,
                                      reason) or response

        return response
Code Example #36
File: redirect.py Project: atharwa-24/scrapy
    def process_response(self, request, response, spider):
        if (request.meta.get("dont_redirect", False) or response.status
                in getattr(spider, "handle_httpstatus_list", [])
                or response.status in request.meta.get(
                    "handle_httpstatus_list", [])
                or request.meta.get("handle_httpstatus_all", False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if "Location" not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers["Location"])
        if response.headers["Location"].startswith(b"//"):
            request_scheme = urlparse(request.url).scheme
            location = request_scheme + "://" + location.lstrip("/")

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == "HEAD":
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
Code Example #37
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or response.status
                in getattr(spider, 'handle_httpstatus_list', [])
                or response.status in request.meta.get(
                    'handle_httpstatus_list', [])
                or request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['Location'])
        if response.headers['Location'].startswith(b'//'):
            request_scheme = urlparse(request.url).scheme
            location = request_scheme + '://' + location.lstrip('/')

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
Code Example #38
 def _set_url(self, url):
     if isinstance(url, str):
         self._url = escape_ajax(safe_url_string(url))
     elif isinstance(url, unicode):
         if self.encoding is None:
             raise TypeError(
                 'Cannot convert unicode url - %s has no encoding' %
                 type(self).__name__)
         unicode_url = url if isinstance(url, unicode) else url.decode(
             self.encoding)
         self._url = safe_url_string(unicode_url, self.encoding)
     else:
         raise TypeError('Request url must be str or unicode, got %s:' %
                         type(url).__name__)
     if ':' not in self._url:
         raise ValueError('Missing scheme in request url: %s' % self._url)
Code Example #39
File: utils.py Project: UncleJim/project
def std_url(url, keep_blank_values=True, keep_fragments=False):
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    keyvals = cgi.parse_qsl(query, keep_blank_values)
    keyvals.sort()
    query = urllib.urlencode(keyvals)
    path = safe_url_string(path) or '/'
    fragment = '' if not keep_fragments else fragment
    return urlparse.urlunparse((scheme, netloc.lower(), path, params, query, fragment))
Code Example #40
File: html.py Project: scrapy/w3lib
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned.

    """

    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return moves.urllib.parse.urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)
Code Example #41
File: test_url.py Project: codecov-test/w3lib
    def test_safe_url_idna(self):
        # adapted from:
        # https://ssl.icu-project.org/icu-bin/idnbrowser
        # http://unicode.org/faq/idn.html
        # + various others
        websites = (
            (u'http://www.färgbolaget.nu/färgbolaget', 'http://www.xn--frgbolaget-q5a.nu/f%C3%A4rgbolaget'),
            (u'http://www.räksmörgås.se/?räksmörgås=yes', 'http://www.xn--rksmrgs-5wao1o.se/?r%C3%A4ksm%C3%B6rg%C3%A5s=yes'),
            (u'http://www.brændendekærlighed.com/brændende/kærlighed', 'http://www.xn--brndendekrlighed-vobh.com/br%C3%A6ndende/k%C3%A6rlighed'),
            (u'http://www.예비교사.com', 'http://www.xn--9d0bm53a3xbzui.com'),
            (u'http://理容ナカムラ.com', 'http://xn--lck1c3crb1723bpq4a.com'),
            (u'http://あーるいん.com', 'http://xn--l8je6s7a45b.com'),

            # --- real websites ---

            # in practice, this redirect (301) to http://www.buecher.de/?q=b%C3%BCcher
            (u'http://www.bücher.de/?q=bücher', 'http://www.xn--bcher-kva.de/?q=b%C3%BCcher'),

            # Japanese
            (u'http://はじめよう.みんな/?query=サ&maxResults=5', 'http://xn--p8j9a0d9c9a.xn--q9jyb4c/?query=%E3%82%B5&maxResults=5'),

            # Russian
            (u'http://кто.рф/', 'http://xn--j1ail.xn--p1ai/'),
            (u'http://кто.рф/index.php?domain=Что', 'http://xn--j1ail.xn--p1ai/index.php?domain=%D0%A7%D1%82%D0%BE'),

            # Korean
            (u'http://내도메인.한국/', 'http://xn--220b31d95hq8o.xn--3e0b707e/'),
            (u'http://맨체스터시티축구단.한국/', 'http://xn--2e0b17htvgtvj9haj53ccob62ni8d.xn--3e0b707e/'),

            # Arabic
            (u'http://nic.شبكة', 'http://nic.xn--ngbc5azd'),

            # Chinese
            (u'https://www.贷款.在线', 'https://www.xn--0kwr83e.xn--3ds443g'),
            (u'https://www2.xn--0kwr83e.在线', 'https://www2.xn--0kwr83e.xn--3ds443g'),
            (u'https://www3.贷款.xn--3ds443g', 'https://www3.xn--0kwr83e.xn--3ds443g'),
        )
        for idn_input, safe_result in websites:
            safeurl = safe_url_string(idn_input)
            self.assertEqual(safeurl, safe_result)

        # make sure the safe URL is unchanged when made safe a 2nd time
        for _, safe_result in websites:
            safeurl = safe_url_string(safe_result)
            self.assertEqual(safeurl, safe_result)
Code Example #42
File: __init__.py Project: runt18/scrapy
    def _set_url(self, url):
        if not isinstance(url, six.string_types):
            raise TypeError('Request url must be str or unicode, got {0!s}:'.format(type(url).__name__))

        url = to_native_str(url, self.encoding)
        self._url = escape_ajax(safe_url_string(url))

        if ':' not in self._url:
            raise ValueError('Missing scheme in request url: {0!s}'.format(self._url))
Code Example #43
File: __init__.py Project: JohnDoes95/project_parser
    def _set_url(self, url):
        if not isinstance(url, six.string_types):
            raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)

        s = safe_url_string(url, self.encoding)
        self._url = escape_ajax(s)

        if ':' not in self._url:
            raise ValueError('Missing scheme in request url: %s' % self._url)
Code Example #44
File: test_url.py Project: codecov-test/w3lib
    def test_safe_url_string_bytes_input(self):
        safeurl = safe_url_string(b"http://www.example.com/")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/")

        # bytes input is assumed to be UTF-8
        safeurl = safe_url_string(b"http://www.example.com/\xc2\xb5")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%B5")

        # page-encoding encoded bytes still end up as UTF-8 sequences in path
        safeurl = safe_url_string(b"http://www.example.com/\xb5", encoding='latin1')
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%B5")

        safeurl = safe_url_string(b"http://www.example.com/\xa3?unit=\xb5", encoding='latin1')
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")
Code Example #45
File: pdf.py Project: muzichenglong/scrapyc
    def parse_zgyszz(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        #base_site = get_url_site(base_url)
        if  "qklist/show-" in response.url:
            base_url  = get_base_url(response)

            downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
            relative_url = downLink.split("'")[1]

            abs_url = urljoin_rfc(base_url,relative_url)
            yield scrapy.Request(abs_url,callback=self.parse_zgyszz)

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
            
            return
        if '/upload/qklist/' in response.url:
            yield self.baidu_rpc_request({"url":response.url,"src_id":22})
            return

        base_url  = response.url
        for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            #request.meta["dont_redirect"] = True
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        
        for sel in response.xpath("//div[@class='flickr']/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue         
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
Code Example #46
File: html.py Project: TheRinger/find_books
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to the
    given base url. If no base url is found, the given base url is returned
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
Code Example #47
File: extractors.py Project: 4iji/scrapely
def image_url(txt):
    """Convert text to a URL.

    This is quite conservative, since relative URLs are supported.
    Example:

        >>> image_url('')

        >>> image_url('   ')

        >>> image_url(' \\n\\n  ')

        >>> image_url('foo-bar.jpg')
        ['foo-bar.jpg']
        >>> image_url('/images/main_logo12.gif')
        ['/images/main_logo12.gif']
        >>> image_url("http://www.image.com/image.jpg")
        ['http://www.image.com/image.jpg']
        >>> image_url("http://www.domain.com/path1/path2/path3/image.jpg")
        ['http://www.domain.com/path1/path2/path3/image.jpg']
        >>> image_url("/path1/path2/path3/image.jpg")
        ['/path1/path2/path3/image.jpg']
        >>> image_url("path1/path2/image.jpg")
        ['path1/path2/image.jpg']
        >>> image_url("background-image : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background-image : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background-image : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url(http://www.site.com/path1/path2/image.jpg)")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url("background : url('http://www.site.com/path1/path2/image.jpg')")
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('background : url("http://www.site.com/path1/path2/image.jpg")')
        ['http://www.site.com/path1/path2/image.jpg']
        >>> image_url('/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350')
        ['http://www.site.com/getimage.php?image=totalgardens/outbbq2_400.jpg&type=prod&resizeto=350']
        >>> image_url('http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80')
        ['http://s7d4.scene7.com/is/image/Kohler/jaa03267?hei=425&wid=457&op_usm=2,1,2,1&qlt=80']
        >>> image_url('../image.aspx?thumb=true&amp;boxSize=175&amp;img=Unknoportrait[1].jpg')
        ['../image.aspx?thumb=true&boxSize=175&img=Unknoportrait%5B1%5D.jpg']
        >>> image_url('http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff')
        ['http://www.sundancecatalog.com/mgen/catalog/test.ms?args=%2245932|MERIDIAN+PENDANT|.jpg%22&is=336,336,0xffffff']
        >>> image_url('http://www.site.com/image.php')
        ['http://www.site.com/image.php']
        >>> image_url('background-image:URL(http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&amp;defaultImage=noimage_wasserstrom)')
        ['http://s7d5.scene7.com/is/image/wasserstrom/165133?wid=227&hei=227&defaultImage=noimage_wasserstrom']

    """
    imgurl = extract_image_url(txt)
    return [safe_url_string(remove_entities(url(imgurl)))] if imgurl else None
Code Example #48
File: test_url.py Project: codecov-test/w3lib
    def test_safe_url_string_with_query(self):
        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ")
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='utf-8')
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%C2%B5")

        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1')
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%C2%A3?unit=%B5")

        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", path_encoding='latin-1')
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%C2%B5")

        safeurl = safe_url_string(u"http://www.example.com/£?unit=µ", encoding='latin-1', path_encoding='latin-1')
        self.assertTrue(isinstance(safeurl, str))
        self.assertEqual(safeurl, "http://www.example.com/%A3?unit=%B5")
Code Example #49
File: pdf.py Project: wjianwei126/scrapyc
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        count = 0
        for a in response.xpath('//a'):
            text = a.xpath("string(.)").extract()
            text = "".join(text).strip()
            if len(text) > 5 or "PDF" not in text:
                continue
            href = a.xpath("@href").extract()
            if len(href) != 1:
                continue
            href = href[0]
            if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
                onclick = a.xpath("@onclick").extract()[0]
                onclick = onclick.split(",")
                if len(onclick) < 2:
                    continue
                if onclick[0].startswith("showArticleFile"):
                    id = onclick[-1].split(")", 1)[0].replace("'", "")
                else:
                    id = onclick[1].split(")", 1)[0].replace("'", "")
                if "/CN/" in response.url:
                    pdf = response.url.split("/CN/", 1)[
                              0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                elif "/EN/" in response.url:
                    pdf = response.url.split("/EN/", 1)[
                              0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                else:
                    continue
            elif "attachType=PDF&id=" in href:

                abs_url = urljoin_rfc(response.url, href)
                pdf = abs_url
            else:
                continue
            # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id="+id
            # print pdf
            self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
            count += 1

        base_url = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            abs_url = safe_url_string(abs_url, encoding=response.encoding)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
Code Example #50
File: pdf.py Project: muzichenglong/scrapyc
 def parse_cameo(self, response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         return
     base_url  = get_base_url(response)
     for sel in response.xpath('//a/@href'):
         relative_url = sel.extract().encode(response.encoding)
         if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
             continue                  
         abs_url = urljoin_rfc(base_url,relative_url)
         abs_url = safe_url_string(abs_url,encoding=response.encoding)
         yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) 
Code Example #51
File: lxmlparser.py Project: netconstructor/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors="replace")
            ret.append(link)

        return ret
Code Example #52
File: pdf.py Project: muzichenglong/scrapyc
    def parse2(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url = get_base_url(response)
        for sel in response.xpath('//table/tr/td/div/a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            abs_url = urljoin_rfc(base_url, relative_url)
            abs_url = safe_url_string(abs_url, encoding=response.encoding)

            if relative_url.endswith(".pdf") or relative_url.endswith(".doc"):
                yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
            elif relative_url.startswith("?currPath="):
                yield scrapy.Request(url=abs_url, callback=self.parse2)
コード例 #53
0
ファイル: sgml.py プロジェクト: bihicheng/scrapy
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
            ret.append(link)

        return ret
コード例 #54
0
ファイル: htmlparser.py プロジェクト: bihicheng/scrapy
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
コード例 #55
0
    def _extract_links(self, response_text, response_url, response_encoding):
        links = []
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.tag_func(e.tag):
                if self.attr_func(a):
                    l = safe_url_string(l, response_encoding)
                    text = u''
                    if e.text:
                        text = str_to_unicode(e.text, response_encoding, errors='replace').strip()
                    link = Link(self.process_func(l), text=text)
                    links.append(link)

        links = unique_list(links, key=lambda link: link.url) \
                if self.unique else links

        return links
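The lxml-based extractor above leans on make_links_absolute(), which resolves every link in place, so only the tag/attribute filters and safe_url_string remain to be applied. A minimal runnable sketch of that flow (assumes lxml and w3lib are installed; the HTML snippet is invented):

import lxml.html
from w3lib.url import safe_url_string

html = lxml.html.fromstring('<a href="/menu?dish=café">café</a>')
html.make_links_absolute("http://example.com/")

for element, attribute, link, pos in html.iterlinks():
    if element.tag == "a" and attribute == "href":
        print(safe_url_string(link, "utf-8"))
# http://example.com/menu?dish=caf%C3%A9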
コード例 #56
0
ファイル: sgml.py プロジェクト: quanshengxixin/scrapy
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            link.url = urljoin(base_url, link.url)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = to_unicode(link.text, response_encoding, errors="replace").strip()
            ret.append(link)

        return ret
コード例 #57
0
ファイル: referer.py プロジェクト: ArturGaspar/scrapy
    def request_scheduled(self, request, spider):
        # check redirected request to patch "Referer" header if necessary
        redirected_urls = request.meta.get('redirect_urls', [])
        if redirected_urls:
            request_referrer = request.headers.get('Referer')
            # we don't patch the referrer value if there is none
            if request_referrer is not None:
                # the request's referrer header value acts as a surrogate
                # for the parent response URL
                #
                # Note: if the 3xx response contained a Referrer-Policy header,
                #       the information is not available using this hook
                parent_url = safe_url_string(request_referrer)
                policy_referrer = self.policy(parent_url, request).referrer(
                    parent_url, request.url)
                if policy_referrer != request_referrer:
                    if policy_referrer is None:
                        request.headers.pop('Referer')
                    else:
                        request.headers['Referer'] = policy_referrer
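In the hook above, safe_url_string is applied to a header value rather than to an extracted link: the raw Referer is escaped so the referrer policy compares it against the request URL in the same, already-escaped form. For instance:

from w3lib.url import safe_url_string

# Hypothetical referrer with a non-ASCII path segment.
raw_referer = "https://example.com/sökresultat?q=kaffe"
print(safe_url_string(raw_referer))
# https://example.com/s%C3%B6kresultat?q=kaffe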
コード例 #58
0
ファイル: redirect.py プロジェクト: elacuesta/scrapy
    def process_response(self, request, response, spider):
        if (request.meta.get('dont_redirect', False) or
                response.status in getattr(spider, 'handle_httpstatus_list', []) or
                response.status in request.meta.get('handle_httpstatus_list', []) or
                request.meta.get('handle_httpstatus_all', False)):
            return response

        allowed_status = (301, 302, 303, 307, 308)
        if 'Location' not in response.headers or response.status not in allowed_status:
            return response

        location = safe_url_string(response.headers['location'])

        redirected_url = urljoin(request.url, location)

        if response.status in (301, 307, 308) or request.method == 'HEAD':
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        redirected = self._redirect_request_using_get(request, redirected_url)
        return self._redirect(redirected, request, spider, response.status)
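The redirect middleware above shows the other header-side use: the Location header of a 3xx response may contain unescaped or non-ASCII characters, so it is run through safe_url_string before being joined onto the original request URL. A small sketch with invented values:

from urllib.parse import urljoin
from w3lib.url import safe_url_string

request_url = "http://example.com/start"
location = safe_url_string("/søk?page=2")   # hypothetical raw Location value

print(urljoin(request_url, location))
# http://example.com/s%C3%B8k?page=2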
コード例 #59
0
ファイル: sgml.py プロジェクト: xunyuw/iFlyQA
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        #self.reset()
        #self.feed(response_text)
        #self.close()
        html = lxml.etree.HTML(response_text)
        links = html.xpath("//a")
        self.links = [Link(link.get("href") or "", link.text or "") for link in links]

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            link.url = urljoin(base_url, link.url.strip())
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
            ret.append(link)

        return ret
コード例 #60
0
ファイル: htmlparser.py プロジェクト: amogh14/fintra
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
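The HTMLParser- and SGML-based extractors above all repeat the same recipe: decode the raw link, join it against the base URL, escape it with safe_url_string, and normalise the link text. A condensed, self-contained version of that recipe in modern Python (stdlib urljoin, invented input data; the helper name normalize_links is made up):

from urllib.parse import urljoin
from w3lib.url import safe_url_string

def normalize_links(base_url, raw_links, encoding="utf-8"):
    """Resolve, escape and de-duplicate (url, text) pairs, keeping order."""
    seen = set()
    result = []
    for raw_url, raw_text in raw_links:
        url = safe_url_string(urljoin(base_url, raw_url), encoding)
        if url not in seen:
            seen.add(url)
            result.append((url, raw_text.strip()))
    return result

print(normalize_links("http://example.com/docs/", [
    ("guide one.pdf", " Guide one "),
    ("../café.html", "Café"),
    ("guide%20one.pdf", "duplicate entry"),
]))
# [('http://example.com/docs/guide%20one.pdf', 'Guide one'),
#  ('http://example.com/caf%C3%A9.html', 'Café')]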