Example No. 1
    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        for href in response.xpath('//table/tr/td/strong/a/@href').extract():
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            #yield scrapy.Request(url=abs_url,callback=self.parse)

        # parse PDF links
        for href in response.xpath('//table[@class="object_table"]/tr/td[4]/a/@href').extract():
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            #yield scrapy.Request(url=abs_url,callback=self.parse)

        # parse pagination links
        for href in response.xpath('//table/tr/td/table/tr/td/a/@href').extract():
            if ("page=" not in href  and "browse-date?top=" not in href ) or "itemsPerPage=" in href:
                continue

            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            #self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
            yield scrapy.Request(url=abs_url,callback=self.parse)
Example No. 2
    def process_response(self, request, response, spider):
        if "dont_redirect" in request.meta:
            return response
        if request.method.upper() == "HEAD":
            if response.status in [301, 302, 303, 307] and "Location" in response.headers:
                redirected_url = urljoin_rfc(request.url, response.headers["location"])
                redirected = request.replace(url=redirected_url)
                return self._redirect(redirected, request, spider, response.status)
            else:
                return response

        if response.status in [302, 303] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = self._redirect_request_using_get(request, redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if response.status in [301, 307] and "Location" in response.headers:
            redirected_url = urljoin_rfc(request.url, response.headers["location"])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if isinstance(response, HtmlResponse):
            interval, url = get_meta_refresh(response)
            if url and interval < self.max_metarefresh_delay:
                redirected = self._redirect_request_using_get(request, url)
                return self._redirect(redirected, request, spider, "meta refresh")

        return response
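At its core, the redirect handling above just resolves the (possibly relative) Location header against the URL of the request that produced the response, then rebuilds the request. A minimal standalone sketch of that resolution step, using the standard-library join rather than the deprecated urljoin_rfc (the function name here is illustrative, not Scrapy's):

    # Illustrative only: resolve a Location header the way the middleware
    # above does before constructing the redirected request.
    from urllib.parse import urljoin  # urlparse.urljoin on Python 2

    def resolve_redirect(request_url, location_header):
        # urljoin handles absolute, relative and protocol-relative values alike.
        return urljoin(request_url, location_header)

    # resolve_redirect('http://example.com/a/b', '/login')
    # -> 'http://example.com/login'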
Example No. 3
    def parse_index(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url,callback=self.parse_index)
            return
        base_url  = get_base_url(response)
        # parse journal home pages
        count = 0
        for href in response.xpath("//div[@id='divperilist']/ul/li/a/@href").extract():
            if href.startswith("Rss.ashx?"):
                continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            #self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
            yield scrapy.Request(url=abs_url,callback=self.parse_content)
            count += 1
        self.log("F**k %s %d"%(response.url,count),level=scrapy.log.INFO)

        # parse index-page pagination
        for href in response.xpath("//div[@id='divperilist']/table//a/@href").extract():
            if "PageNo" not in href:
                continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            self.log("Parse %s %s"%(response.url,abs_url),level=scrapy.log.INFO)
            yield scrapy.Request(url=abs_url,callback=self.parse_index)
Example No. 4
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        # Categories
        for url in hxs.select('//a[@class="category-link"]/@href').extract():
            url = urljoin_rfc(base_url, url)
            yield Request(url)

        for url in hxs.select(
                '//*[@class="CategoryChildCategories"]//a/@href').extract():
            url = urljoin_rfc(base_url, url)
            yield Request(url)

        # Pages
        for url in hxs.select(
                '//ul[@class="pagination"]//a[not(contains(@href, "#"))]/@href'
        ).extract():
            url = urljoin_rfc(base_url, url)
            yield Request(url)

        # Products
        for url in hxs.select(
                '//a[@class="category-item-name"]/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)
Example No. 5
    def process_response(self, request, response, spider):
        if 'dont_redirect' in request.meta:
            return response
        if request.method.upper() == 'HEAD':
            if response.status in [301, 302, 303, 307
                                   ] and 'Location' in response.headers:
                redirected_url = urljoin_rfc(request.url,
                                             response.headers['location'])
                redirected = request.replace(url=redirected_url)
                return self._redirect(redirected, request, spider,
                                      response.status)
            else:
                return response

        if response.status in [302, 303] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url,
                                         response.headers['location'])
            redirected = self._redirect_request_using_get(
                request, redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if response.status in [301, 307] and 'Location' in response.headers:
            redirected_url = urljoin_rfc(request.url,
                                         response.headers['location'])
            redirected = request.replace(url=redirected_url)
            return self._redirect(redirected, request, spider, response.status)

        if isinstance(response, HtmlResponse):
            interval, url = get_meta_refresh(response)
            if url and interval < self.max_metarefresh_delay:
                redirected = self._redirect_request_using_get(request, url)
                return self._redirect(redirected, request, spider,
                                      'meta refresh')

        return response
Example No. 6
    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        base_url = xs.select('//base/@href').extract()
        base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret
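The extractor above follows a common post-processing pattern: resolve every link against the page's <base href> (falling back to the response URL), optionally de-duplicate by URL, optionally canonicalize. A standalone sketch of just that loop, assuming link objects with a mutable url attribute and substituting the standard-library join and w3lib's canonicalize_url for the deprecated helpers:

    # Sketch only: the resolve/dedupe/canonicalize loop from extract_links.
    from urllib.parse import urljoin
    from w3lib.url import canonicalize_url

    def postprocess_links(base_url, links, unique=True, canonicalize=True):
        seen, ret = set(), []
        for link in links:
            link.url = urljoin(base_url, link.url)
            if unique:
                if link.url in seen:
                    continue
                seen.add(link.url)
            if canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)
        return ret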
Example No. 7
    def extract_links(self, response):
        xs = HtmlXPathSelector(response)
        base_url = xs.select('//base/@href').extract()
        base_url = urljoin_rfc(response.url,
                               base_url[0]) if base_url else response.url

        links = []
        for location in self.locations:
            if isinstance(location, basestring):
                selectors = xs.select(location)
            elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
                selectors = [location] if isinstance(
                    location, HtmlXPathSelector) else location
            else:
                continue

            for selector in selectors:
                links.extend(
                    self.extract_from_selector(selector, response.encoding))

        seen, ret = set(), []
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response.encoding)
            if self.unique:
                if link.url in seen:
                    continue
                else:
                    seen.add(link.url)
            if self.canonicalize:
                link.url = canonicalize_url(link.url)
            ret.append(link)

        return ret
Example No. 8
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text)) for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example No. 9
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        count = 0
        for a in response.xpath('//a'):
            text = a.xpath("string(.)").extract()
            text = "".join(text).strip()
            if len(text) > 5 or "PDF" not in text:
                continue
            href = a.xpath("@href").extract()
            if len(href) != 1:
                continue
            href = href[0]
            if (href == "#" or href.startswith("javascript")) and len(a.xpath("@onclick").extract()) == 1:
                onclick = a.xpath("@onclick").extract()[0]
                onclick = onclick.split(",")
                if len(onclick) < 2:
                    continue
                if onclick[0].startswith("showArticleFile"):
                    id = onclick[-1].split(")", 1)[0].replace("'", "")
                else:
                    id = onclick[1].split(")", 1)[0].replace("'", "")
                if "/CN/" in response.url:
                    pdf = response.url.split("/CN/", 1)[
                              0] + "/CN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                elif "/EN/" in response.url:
                    pdf = response.url.split("/EN/", 1)[
                              0] + "/EN/article/downloadArticleFile.do?attachType=PDF&id=" + id
                else:
                    continue
            elif "attachType=PDF&id=" in href:

                abs_url = urljoin_rfc(response.url, href)
                pdf = abs_url
            else:
                continue
            # url = "http://www.zjnyxb.cn/CN/article/downloadArticleFile.do?attachType=PDF&id="+id
            # print pdf
            self.log("PDF_URL %s" % (pdf), level=scrapy.log.INFO)
            yield self.baidu_rpc_request({"url": pdf, "src_id": 22})
            count += 1

        base_url = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url == "#":
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            abs_url = safe_url_string(abs_url, encoding=response.encoding)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22})
        self.log("PDF_TOTAL %s %d" % (response.url, count), level=scrapy.log.INFO)
Example No. 10
 def test_urljoin_rfc(self):
     self.assertEqual(urljoin_rfc('http://example.com/some/path', 'newpath/test'),
                                  'http://example.com/some/newpath/test')
     self.assertEqual(urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
                                  'http://example.com/some/key/other')
     u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
     self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
     assert isinstance(u, str)
     u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', 'lala/\xa3', encoding='latin-1')
     self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
     assert isinstance(u, str)
     u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
     self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
     assert isinstance(u, str)
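For the plain-ASCII cases, the standard-library join gives the same results as urljoin_rfc, which is why later Scrapy/w3lib releases point users at it (see the deprecation test further down). The encoding-sensitive \xa3 cases, however, are specific to the old helper, which returns encoded byte strings. A quick sketch of the ASCII equivalence:

    # Plain-ASCII joins behave identically with the standard library.
    from urllib.parse import urljoin  # urlparse.urljoin on Python 2

    assert urljoin('http://example.com/some/path', 'newpath/test') == \
        'http://example.com/some/newpath/test'
    assert urljoin('http://example.com/some/path/a.jpg', '../key/other') == \
        'http://example.com/some/key/other'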
Example No. 11
    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors="replace")
            ret.append(link)

        return ret
Example No. 12
    def _extract_links(self, response_text, response_url, response_encoding):
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url

        clean_url = lambda u: urljoin_rfc(
            base_url, remove_entities(clean_link(u.decode(response_encoding))))
        clean_text = lambda t: replace_escape_chars(
            remove_tags(t.decode(response_encoding))).strip()

        links_text = linkre.findall(response_text)
        urlstext = set([(clean_url(url), clean_text(text))
                        for url, _, text in links_text])

        return [Link(url, text) for url, text in urlstext]
Example No. 13
    def parse(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        
        site = get_url_site(response.url)

        if site in self.parses:
            parser = self.parses[site]
            #self.log("Parser %s %s"%(response.url,parser.name),level=scrapy.log.INFO)
            for item in parser.parse(response) :
                yield item
            return

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()

            abs_url =urljoin_rfc(base_url,relative_url)
            #print abs_url
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue            
            site = get_url_site(abs_url)
            yield NimeiItem(url=abs_url,furl=response.url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
Example No. 14
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            # self.log(response.headers,level=scrapy.log.INFO)
            yield scrapy.Request(response.url)
            return
        if response.__class__ != scrapy.http.HtmlResponse:
            return

        base_site = get_url_site(response.url)
        # print response.url,response.status
        base_url = response.url
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if not self.is_valid_url(relative_url):
                continue
            abs_url = urljoin_rfc(base_url, relative_url)
            # print abs_url
            schema = get_url_scheme(abs_url)
            if schema not in ["http", "https"]:
                continue
            site = get_url_site(abs_url)

            # yield NimeiItem(url=abs_url,furl=response.url)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 22}, furl=response.url)
            if site != base_site and site not in self.settings.get("ALLOW_SITES", []):
                continue
            self.log("SendCrawl %s" % (abs_url), level=scrapy.log.INFO)
            yield scrapy.Request(abs_url)
Example No. 15
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Example No. 16
    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            if relative_url.startswith("javascript:"):
                continue
            if "mod=redirect" in relative_url or "redirect.php" in relative_url:
                continue
                
            abs_url =urljoin_rfc(base_url,relative_url)
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue  

            #yield NimeiItem(url=abs_url,furl=response.url)
            abs_url = self.remove_param(abs_url,["extra","orderby","typeid","filter","sortid","searchsort","vk_payway_13","sid","recommend","digest"])


            if self.PATTERN1.match(abs_url):
                abs_url = re.sub("\-\d+\-\d+\.html.*","-1-1.html",abs_url,1)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4})
            if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/") or relative_url.startswith("forumdisplay.php?fid=") or relative_url.startswith("forum.php?mod=forumdisplay&fid="):
                
                yield scrapy.Request(abs_url)
Example No. 17
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text, response_encoding, errors='replace')
            ret.append(link)

        return ret
Example No. 18
    def parse(self, response):
        self.log("Crawled (%d) <GET %s>" % (response.status, response.url),
                 level=scrapy.log.INFO)
        if response.status != 200:
            yield response.request
            return
        if not isinstance(response, scrapy.http.HtmlResponse):
            return
        depth = response.meta.get("depth", 1)
        for href in response.xpath("//a/@href").extract():
            href = href.strip()

            if href.startswith("javascript:") or href.startswith(
                    "rtsp:") or href.startswith("ftp:"):
                continue
            scheme, netloc, path, params, query, fragment = parse_url(href)
            if path:
                suffix = path.split('.')[-1]
                if suffix in [
                        "png", "jpg", "gif", "rar", "zip", "mp3", ".pdf",
                        "doc", ".txt", "docx", "swf", "mp4"
                ]:
                    continue
            abs_url = urljoin_rfc(response.url, href)
            yield UrlItem(url=abs_url, fromurl=response.url)
            if depth < 1:
                depth += 1
                yield scrapy.Request(abs_url, meta={"depth": depth})
Example No. 19
    def parse_all(self, response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        base_url  = get_base_url(response)
        base_site = get_url_site(base_url)

        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)

            filename = abs_url.split("?")[0].split("/")[-1]
            if filename :
                ctype  = filename.split(".")[-1].lower() 
            else:
                ctype = None
            if ctype in ["jpeg","jpg","swf","rar","zip","gz","gif","mov","png","bmp","exe","pps","db","txt","pptx",'xls',"ppt","xlsx"]:
                continue

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})

            site = get_url_site(abs_url)
            if site != base_site:
                continue
            if ctype in ["pdf","doc","docx","rtf",]:
                continue
            yield scrapy.Request(url=abs_url,callback=self.parse_all)
Example No. 20
 def test_urljoin_rfc(self):
     self.assertEqual(
         urljoin_rfc('http://example.com/some/path', 'newpath/test'),
         'http://example.com/some/newpath/test')
     self.assertEqual(
         urljoin_rfc('http://example.com/some/path/a.jpg', '../key/other'),
         'http://example.com/some/key/other')
     u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele', u'lala/\xa3')
     self.assertEqual(u, 'http://example.com/lolo/\xc2\xa3/lala/\xc2\xa3')
     assert isinstance(u, str)
     u = urljoin_rfc(u'http://example.com/lolo/\xa3/lele',
                     'lala/\xa3',
                     encoding='latin-1')
     self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
     assert isinstance(u, str)
     u = urljoin_rfc('http://example.com/lolo/\xa3/lele', 'lala/\xa3')
     self.assertEqual(u, 'http://example.com/lolo/\xa3/lala/\xa3')
     assert isinstance(u, str)
Example No. 21
    def parse_zgyszz(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        #base_site = get_url_site(base_url)
        if  "qklist/show-" in response.url:
            base_url  = get_base_url(response)

            downLink = response.xpath("//div[@id='down']//a/@onclick").extract()[0]
            relative_url = downLink.split("'")[1]

            abs_url = urljoin_rfc(base_url,relative_url)
            yield scrapy.Request(abs_url,callback=self.parse_zgyszz)

            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
            
            return
        if '/upload/qklist/' in response.url:
            yield self.baidu_rpc_request({"url":response.url,"src_id":22})
            return

        base_url  = response.url
        for sel in response.xpath("//div[@class='main_box']//table/tr[1]/td/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue              
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            #request.meta["dont_redirect"] = True
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
        
        for sel in response.xpath("//div[@class='flickr']/a/@href"):
            relative_url = sel.extract().encode(response.encoding)
            if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
                continue         
            abs_url = urljoin_rfc(base_url,relative_url)
            abs_url = safe_url_string(abs_url,encoding=response.encoding)
            request = scrapy.Request(abs_url,callback=self.parse_zgyszz)
            yield request
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
Example No. 22
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to the
    given base url. If no base url is found, the given base url is returned
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin_rfc(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
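A hypothetical usage of the helper above, assuming its regex and string helpers are available in the same module: when the HTML declares a <base href>, that value is resolved against the page URL; otherwise the page URL comes back unchanged.

    # Illustrative call; the exact output also passes through safe_url_string.
    html = '<html><head><base href="/static/"></head><body></body></html>'
    get_base_url(html, baseurl='http://example.com/a/b')
    # expected: 'http://example.com/static/'
    get_base_url('<html></html>', baseurl='http://example.com/a/b')
    # expected: the given baseurl, 'http://example.com/a/b'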
Example No. 23
    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(
            self.links,
            key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret
Example No. 24
    def _extract_links(self, response_text, response_url, response_encoding):
        self.base_url, self.links = etree.HTML(response_text, self.parser)

        links = unique_list(
            self.links,
            key=lambda link: link.url) if self.unique else self.links

        ret = []
        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url
        for link in links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text,
                                       response_encoding,
                                       errors='replace')
            ret.append(link)

        return ret
Example No. 25
    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url)
            return
        base_url  = get_base_url(response)

        for href in response.xpath('//div[@class="center_bottom_list"]//a/@href').extract():
            if not self.is_valid_url(href):
                continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},response.url)

        # pagination
        for href in response.xpath('//div[@class="article_list_page"]//a/@href').extract():
            abs_url =urljoin_rfc(base_url,href)
            yield scrapy.Request(url=abs_url)
Example No. 26
def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given html text, relative to the
    given base url. If no base url is found, the given base url is returned
    """
    text = str_to_unicode(text, encoding)
    baseurl = unicode_to_str(baseurl, encoding)
    m = _baseurl_re.search(text)
    if m:
        baseurl = urljoin_rfc(baseurl, m.group(1).encode(encoding))
    return safe_url_string(baseurl)
Example No. 27
    def _extract_requests(self, response_text, response_url, response_encoding):
        """Extract requests with absolute urls"""
        self.reset()
        self.feed(response_text)
        self.close()

        base_url = urljoin_rfc(response_url, self.base_url) if self.base_url else response_url
        self._make_absolute_urls(base_url, response_encoding)
        self._fix_link_text_encoding(response_encoding)

        return self.requests
Example No. 28
 def parse_index(self,response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         return
     for href in response.xpath('//div[@class="az"]/ul/li/p/a/@href').extract():
         if "policy.php" in href:
             continue
         abs_url =urljoin_rfc(response.url,href)
         yield scrapy.Request(url=abs_url+"/article/latestArticlesByJournal")
         yield self.baidu_rpc_request({"url":abs_url,"src_id":22},response.url)
Example No. 29
    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return 
        base_url  = get_base_url(response)
        for href in response.xpath('//form[@name="itemlist"]/table/tr[@class="itemLine"]/td/span/a/@href').extract():
            relative_url = href
            if relative_url.startswith("/simple-search?"):
                continue

            abs_url =urljoin_rfc(base_url,relative_url.split("?",1)[0])
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
Example No. 30
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        categories = hxs.select(
            "//div[@id='box_left_ctl03_livello_box']//table[@class='tabellaMenu']/tr/td[2]/a/@href"
        ).extract()
        for category in categories:
            yield Request(urljoin_rfc(base_url, category), callback=self.parse)

        pages = hxs.select(
            "//div[@id='box_center_span_navigazione']//a/@href").extract()
        for page in pages:
            yield Request(urljoin_rfc(base_url, page), callback=self.parse)

        items = hxs.select(
            '//td[@class="centerPagina"]//a[contains(@href, "prodotto") and not(contains(@href, ".jpg") and not(contains(@href, ".pdf")))]/@href'
        ).extract()
        for item in items:
            yield Request(urljoin_rfc(base_url, item),
                          callback=self.parse_item)
Example No. 31
 def parse_content(self,response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         yield scrapy.Request(url=response.url,callback=self.parse_content)     
         return
     base_url  = get_base_url(response)
     # parse articles
     for href in response.xpath("//em/a/@href").extract():
         relative_url = href
         abs_url =urljoin_rfc(base_url,relative_url)            
         yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
Example No. 32
    def parse_dir(self, response):
        hxs = Selector(response)
        sec_nodes = hxs.xpath('//table[@id="at"]//td[@class="L"]/a')
        secs = OrderedDict()
        curr_url = response._get_url()
        for sn in sec_nodes:
            url = urljoin_rfc(curr_url, sn.xpath('@href').extract()[0])
            name = sn.xpath('child::text()').extract()[0]
            secs[url] = name
#         vs = RedisStrHelper.split(response.meta['info'])
#         yield ItemHelper.gene_sections_item(self.source_short_name, self.source_zh_name, vs[0], vs[1], self.name, secs, 1)
        yield ItemHelper.gene_sections_item(self.source_short_name, self.source_zh_name, self._id, self.start_urls[0], self.name, secs, 0)
Example No. 33
    def parse(self, response):
        self.log("Crawled %s %d" % (response.url, response.status), level=scrapy.log.INFO)
        # self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return

        base_url = get_base_url(response)
        for sel in response.xpath("//*/@onclick").extract():
            if not sel.startswith("gotourl"):
                continue
            relative_url = sel.split("'")[1]
            abs_url = urljoin_rfc(base_url, relative_url)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
            if "v" in relative_url:
                yield scrapy.Request(url=abs_url)
        for href in response.xpath("//*/@href").extract():
            if not href.endswith("html"):
                continue
            relative_url = href
            abs_url = urljoin_rfc(base_url, relative_url)
            yield self.baidu_rpc_request({"url": abs_url, "src_id": 4})
Example No. 34
    def _extract_requests(self, response_text, response_url,
                          response_encoding):
        """Extract requests with absolute urls"""
        self.reset()
        self.feed(response_text)
        self.close()

        base_url = urljoin_rfc(
            response_url, self.base_url) if self.base_url else response_url
        self._make_absolute_urls(base_url, response_encoding)
        self._fix_link_text_encoding(response_encoding)

        return self.requests
Example No. 35
    def parse(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url)
            return
        base_url  = get_base_url(response)
        # parse articles
        for href in response.xpath("//table[@id='articleList']/tr/td/a/@href").extract():
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)

        # parse this year's issues
        # only update the two most recent issues, to reduce data volume and speed up the update cycle
        for href in response.xpath("//table[@id='issueList']/tr/td/a/@href").extract()[-2:]:

            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            #yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
            yield scrapy.Request(url=abs_url)
            self.log("Parse %s %s "%(response.url,abs_url),level=scrapy.log.INFO)


        # parse past years' issues
         # for href in response.xpath("//table[@id='yearList']//a/@href").extract():

         #    relative_url = href
         #    abs_url =urljoin_rfc(base_url,relative_url)
         #    yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
         #    yield scrapy.Request(url=abs_url)
                   
        # parse journal home pages
        for href in response.xpath("//table[@class='r_list']/tr/td/span/span[1]/a/@href").extract():

            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            #yield self.baidu_rpc_request({"url":abs_url,"src_id":22})
            yield scrapy.Request(url=abs_url)
Example No. 36
 def parse_cameo(self, response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         return
     base_url  = get_base_url(response)
     for sel in response.xpath('//a/@href'):
         relative_url = sel.extract().encode(response.encoding)
         if relative_url.startswith("javascript:") or relative_url.startswith("mailto:") or relative_url=="#":
             continue                  
         abs_url = urljoin_rfc(base_url,relative_url)
         abs_url = safe_url_string(abs_url,encoding=response.encoding)
         yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) 
Example No. 37
    def _extract_links(self,
                       response_text,
                       response_url,
                       response_encoding,
                       base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin_rfc(
                response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            link.url = urljoin_rfc(base_url, link.url, response_encoding)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = str_to_unicode(link.text,
                                       response_encoding,
                                       errors='replace')
            ret.append(link)

        return ret
Example No. 38
    def parse(self,response):

        base_url  = get_base_url(response)
        for sel in response.xpath('//a/@href'):
            relative_url = sel.extract()
            abs_url =urljoin_rfc(base_url,relative_url)
            schema = get_url_scheme(abs_url)
            if schema not in ["http","https"]:
                continue  

            yield NimeiItem(url=abs_url,furl=response.url)
            if relative_url.startswith("forum_") or relative_url.startswith("forum-") or relative_url.startswith("/archives/"):
                
                yield scrapy.Request(abs_url)
Example No. 39
    def parse(self, response):

        base_url = get_base_url(response)
        for sel in response.xpath('//a/@href')[1:]:
            relative_url = sel.extract()
            abs_url = urljoin_rfc(base_url, relative_url)
            # print abs_url
            # schema = get_url_scheme(abs_url)
            # if schema not in ["http","https"]:
            #     continue
            if abs_url[-1] == "/":
                yield scrapy.Request(abs_url, callback=self.parse)
            else:
                yield NimeiItem(url=abs_url, furl=response.url)
Example No. 40
    def parse_index(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            yield scrapy.Request(url=response.url,callback=self.parse_index)   
            return
        base_url  = get_base_url(response)
        count = 0
        for href in response.xpath("//a/@href").extract():
            if re.match("/[Jj]ournal/\d+(_\d+)?\.shtml",href)   :
                relative_url = href
                abs_url =urljoin_rfc(base_url,relative_url)
                #yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
                self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
                yield scrapy.Request(url=abs_url,callback=self.parse_index)

            # parse journal home pages
            if "QK" in href or "qk" in href:
                relative_url = href
                abs_url =urljoin_rfc(base_url,relative_url)
                self.log("Parse %s %s"%(abs_url,response.url),level=scrapy.log.INFO)
                #yield self.baidu_rpc_request({"url":abs_url,"src_id":22},furl=response.url)
                yield scrapy.Request(url=abs_url,callback=self.parse_content)   
                count += 1
        self.log("F**k %s %d"%(response.url,count),level=scrapy.log.INFO)
Example No. 41
 def parse2(self, response):
     self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
     #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
     if response.status / 100 != 2:
         return
     base_url  = get_base_url(response)
     for sel in response.xpath('//table/tr/td/div/a/@href'):
         relative_url = sel.extract().encode(response.encoding)
         abs_url = urljoin_rfc(base_url,relative_url)
         abs_url = safe_url_string(abs_url,encoding=response.encoding)
     
         if relative_url.endswith(".pdf") or relative_url.endswith(".doc"):
             yield self.baidu_rpc_request({"url":abs_url,"src_id":22}) 
         elif  relative_url.startswith("?currPath=") :
             yield scrapy.Request(url=abs_url,callback=self.parse2)
Example No. 42
    def parse_unit(self,response):
        self.log("Crawled %s %d"%(response.url,response.status),level=scrapy.log.INFO)
        #self.log("Crawled (%d) <GET %s>"%(response.status,response.url),level=scrapy.log.INFO)
        if response.status / 100 != 2:
            return
        site = get_url_site(response.url)
        base_url  = get_base_url(response)

        for href in response.xpath("//a[@class='zt_name']/@href").extract():
            # if not self.is_valid_url(href):
            #     continue
            if href == "#":continue
            relative_url = href
            abs_url =urljoin_rfc(base_url,relative_url)
            yield self.baidu_rpc_request({"url":abs_url,"src_id":4},furl=response.url)
            yield scrapy.Request(url=abs_url,callback=self.parse_cdmd)
Example No. 43
    def get_matched_products(self, website_id):
        api_url = urljoin_rfc(self.host,
                              '/api/get_matched_products_paged.json')
        api_url = add_or_replace_parameter(api_url, 'website_id',
                                           str(website_id))
        api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

        page = 0
        count = 1000
        continue_next_page = True
        matched_products = []

        while continue_next_page:
            api_url = add_or_replace_parameter(api_url, 'start',
                                               str(page * count))
            api_url = add_or_replace_parameter(api_url, 'count', str(count))

            try:
                try_no = 1
                try_query = True
                while try_query:
                    try:
                        r = requests.get(api_url)
                        data = r.json()
                        new_matches = data.get('matches', [])
                    except Exception, e:
                        if not (try_no <= 10 and self.retry):
                            raise e
                        else:
                            try_no += 1
                            time.sleep(1)
                    else:
                        try_query = False
            except Exception:
                continue_next_page = False
            else:
                matched_products.extend(new_matches)
                if len(new_matches) < count:
                    continue_next_page = False
                else:
                    page += 1

        return matched_products
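The paging logic above leans on a bounded-retry fetch that shows up again in the API helpers further down: request a URL, retry transient failures a limited number of times with a short sleep, then give up. A compact standalone sketch of that retry step, where the attempt count and delay are assumptions for illustration rather than anything the original code mandates:

    # Sketch of the bounded-retry fetch used by these API helpers
    # (10 attempts and a 1-second pause are illustrative defaults).
    import time
    import requests

    def get_json_with_retry(url, attempts=10, delay=1.0):
        for attempt in range(1, attempts + 1):
            try:
                return requests.get(url).json()
            except Exception:
                if attempt == attempts:
                    raise
                time.sleep(delay)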
Example No. 44
    def parse_item(self, response):
        if not isinstance(response, HtmlResponse):
            return
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        content = hxs.select(
            "//td[@class='centerPagina']/div[@class='tabMargini']/table[@class='tabellaBoxCentrale']/form/tr[2]/td/table/tr/td[2]"
        )
        name = content.select(
            "//td[@class='centerPagina']/div[@class='tabMargini']/table[@class='tabellaBoxCentrale']/form/tr[1]/td/h1[@class='titolo']/text()"
        ).extract()
        if not name:
            logging.error("NO NAME!")
            return
        name = name[0]
        url = response.url

        # adding product
        price = content.select(
            "span[@id='box_center_span_prezzo']/span[@class='prezzo']/strong/text()"
        ).extract()
        if not price:
            logging.error("NO PRICE")
            return
        price = price[0].replace(".", "").replace(",", ".")

        l = ProductLoader(item=Product(), response=response)
        l.add_xpath('identifier', '//input[@id="pid"]/@value')
        l.add_value('name', name)
        l.add_value('url', url)
        l.add_value('price', price)
        yield l.load_item()

        items = hxs.select(
            '//td[@class="centerPagina"]//a[contains(@href, "prodotto") and not(contains(@href, ".jpg") and not(contains(@href, ".pdf")))]/@href'
        ).extract()
        for item in items:
            yield Request(urljoin_rfc(base_url, item),
                          callback=self.parse_item)
Example No. 45
    def get_main_website_id(self, member_id):
        main_website_id = 0

        api_url = urljoin_rfc(self.host, '/api/get_account_info.json')
        api_url = add_or_replace_parameter(api_url, 'member_id',
                                           str(member_id))
        api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

        try_no = 1
        try_query = True
        while try_query:
            try:
                r = requests.get(api_url)
                data = r.json()
                main_website_id = data['main_site']
            except Exception, e:
                if not (try_no <= 10 and self.retry):
                    raise e
                else:
                    try_no += 1
                    time.sleep(1)
            else:
                try_query = False

        return main_website_id
Example No. 46
def get_meta_refresh(text, baseurl='', encoding='utf-8'):
    """Return  the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple (interval, url) where interval is an integer
    containing the delay in seconds (or zero if not present) and url is a
    string with the absolute url to redirect.

    If no meta redirect is found, (None, None) is returned.
    """
    baseurl = unicode_to_str(baseurl, encoding)
    try:
        text = str_to_unicode(text, encoding)
    except UnicodeDecodeError:
        print text
        raise
    text = remove_comments(remove_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        url = safe_url_string(m.group('url').strip(' "\''))
        url = urljoin_rfc(baseurl, url)
        return interval, url
    else:
        return None, None
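A hypothetical call to the helper above: a meta refresh tag yields the delay (parsed as a float by this implementation) and the redirect target resolved against the page URL, while a page without one yields (None, None).

    # Illustrative usage of get_meta_refresh as defined above.
    html = '<meta http-equiv="refresh" content="5; url=/new/location">'
    interval, url = get_meta_refresh(html, baseurl='http://example.com/old')
    # expected: interval == 5.0, url == 'http://example.com/new/location'
    assert get_meta_refresh('<html></html>') == (None, None)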
Example No. 47
    def get_match_rate_website(self, website_id):
        rate = 0

        api_url = urljoin_rfc(self.host, '/api/get_match_rate_website.json')
        api_url = add_or_replace_parameter(api_url, 'website_id',
                                           str(website_id))
        api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

        try_no = 1
        try_query = True
        while try_query:
            try:
                r = requests.get(api_url)
                data = r.json()
                rate = data['rate']
            except Exception, e:
                if not (try_no <= 10 and self.retry):
                    raise e
                else:
                    try_no += 1
                    time.sleep(1)
            else:
                try_query = False

        return rate
Example No. 48
    def get_products_total_account(self, member_id):
        total = 0

        api_url = urljoin_rfc(self.host,
                              '/api/get_products_total_account.json')
        api_url = add_or_replace_parameter(api_url, 'member_id',
                                           str(member_id))
        api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

        try_no = 1
        try_query = True
        while try_query:
            try:
                r = requests.get(api_url)
                data = r.json()
                total = data['total']
            except Exception, e:
                if not (try_no <= 10 and self.retry):
                    raise e
                else:
                    try_no += 1
                    time.sleep(1)
            else:
                try_query = False

        return total
Example No. 49
    def retrieve_all_products_website(self, website_id, path):
        api_url = urljoin_rfc(self.host,
                              '/api/get_all_products_website_optimized')
        api_url = add_or_replace_parameter(api_url, 'website_id',
                                           str(website_id))
        api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

        try_no = 1
        try_query = True
        while try_query:
            r = requests.get(api_url, stream=True)
            if r.status_code == 200:
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(1024):
                        f.write(chunk)
                try_query = False
            else:
                if not (try_no <= 10 and self.retry):
                    raise Exception(
                        'Could not retrieve the website products for {}'.
                        format(website_id))
                else:
                    try_no += 1
                    time.sleep(1)
Example No. 50
 def _make_absolute_urls(self, base_url, encoding):
     """Makes all request's urls absolute"""
     self.requests = [x.replace(url=safe_url_string(urljoin_rfc(base_url, \
         x.url, encoding), encoding)) for x in self.requests]
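Unrolled from the comprehension above into a plain loop, with the standard-library join standing in for the deprecated urljoin_rfc, the same step reads roughly as below; the request objects are assumed to expose .url and .replace() the way Scrapy requests do.

    # Equivalent, unrolled form of the comprehension above (illustrative).
    from urllib.parse import urljoin
    from w3lib.url import safe_url_string

    def make_absolute(request_list, base_url, encoding):
        absolute = []
        for req in request_list:
            abs_url = safe_url_string(urljoin(base_url, req.url), encoding)
            absolute.append(req.replace(url=abs_url))
        return absolute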
Example No. 51
 def test_urljoin_rfc_deprecated(self):
     jurl = urljoin_rfc("http://www.example.com/", "/test")
     self.assertEqual(jurl, b"http://www.example.com/test")
Example No. 52
    def parse_product(self, response):
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)

        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//*[@itemprop="name"]/text()')
        product_loader.add_xpath('brand',
                                 '//*[@itemprop="manufacturer"]/@content')
        img_src = hxs.select('//a[@itemprop="image"]/img/@src').extract()
        if img_src:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, img_src[0]))
        price = hxs.select('//*[@itemprop="price"]//*[@id="lblPrice"]').re(
            r'([\d,.]+)')
        if not price:
            price = hxs.select('//*[@itemprop="price"]//*[@id="lblSalePrice"]'
                               ).re(r'([\d,.]+)')
            if not price:
                price = 0
        product_loader.add_value('price', price)
        product_loader.add_value(
            'category',
            hxs.select('//*[@id="lblCategoryTrail"]//a/text()')[-1].extract())
        product_loader.add_xpath('identifier',
                                 '//input[@id="hfItemID"]/@value')
        product_loader.add_xpath('sku', '//input[@id="hfItemID"]/@value')
        product_loader.add_value('url', response.url)

        product_item = product_loader.load_item()

        ajax_url = 'http://www.protechonline.net/Store/Controls/ScriptService.asmx/GetPrice'

        params = {
            'itemID': int(product_item['identifier']),
            'personalizationIds': [],
            'personalizationStrings': [],
            'quantity': 1,
            'variantIDs': [],
        }

        options_select = hxs.select('//div[@id="dvProductVariations"]//select')
        if options_select:
            options_variants = product(
                *[opt.select('option') for opt in options_select])
            for variant in options_variants:
                variant_name = ' '.join([
                    opt.select('text()').extract()[0].split('/')[0]
                    for opt in variant
                ])
                variant_ids_list = [
                    int(opt.select('@value').extract()[0]) for opt in variant
                ]
                variant_id = '_'.join(
                    [str(ident) for ident in variant_ids_list])

                option_item = Product(product_item)
                option_item['name'] = product_item['name'] + ' ' + variant_name
                option_item['identifier'] = product_item[
                    'identifier'] + '_' + variant_id

                params['variantIDs'] = variant_ids_list

                yield Request(ajax_url,
                              method='POST',
                              body=json.dumps(params),
                              headers={
                                  'Content-Type':
                                  'application/json; charset=utf-8'
                              },
                              dont_filter=True,
                              callback=self.parse_ajax_price,
                              meta={'product_item': option_item})
        else:
            yield product_item
Example No. 53
    def get_proxy_list(self,
                       target_id,
                       length=10,
                       profile=None,
                       locations='',
                       types='',
                       ignore_ips='',
                       blocked=None,
                       log=None):
        proxy_list = []

        proxy_list_url = urljoin_rfc(self.host, 'proxy_list')
        proxy_list_url = add_or_replace_parameter(proxy_list_url, 'target_id',
                                                  str(target_id))
        proxy_list_url = add_or_replace_parameter(proxy_list_url, 'length',
                                                  str(length))
        if profile and isinstance(profile, int):
            proxy_list_url = add_or_replace_parameter(proxy_list_url,
                                                      'profile', str(profile))
        else:
            if locations:
                proxy_list_url = add_or_replace_parameter(
                    proxy_list_url, 'locations', str(locations))
            if types:
                proxy_list_url = add_or_replace_parameter(
                    proxy_list_url, 'types', str(types))
        if ignore_ips:
            proxy_list_url = add_or_replace_parameter(proxy_list_url, 'ignore',
                                                      ignore_ips)
        if blocked and isinstance(blocked, list):
            proxy_list_url = add_or_replace_parameter(
                proxy_list_url, 'blocked', '|'.join(map(str, blocked)))

        try_no = 1
        try_query = True
        while try_query:
            try:
                if log:
                    log('PROXY SERVICE: get list => %s' % proxy_list_url)
                r = requests.get(proxy_list_url,
                                 auth=HTTPBasicAuth(self.user, self.password))
                data = r.json()
                if log:
                    log('PROXY SERVICE: data received => %r' % data)
                if not data['proxy_list']:
                    proxy_list_url = add_or_replace_parameter(
                        proxy_list_url, 'refresh', str(1))
                    r = requests.get(proxy_list_url,
                                     auth=HTTPBasicAuth(
                                         self.user, self.password))
                    data = r.json()
                    if log:
                        log('PROXY SERVICE: data received => %r' % data)
                proxy_list = data['proxy_list']
            except Exception, e:
                if not (try_no <= 10 and self.retry):
                    raise e
                else:
                    try_no += 1
                    time.sleep(1)
            else:
                try_query = False