async def scrape_page(session, url):
    async with session.get(url) as resp:
        content = await resp.text()
    print('parsing url: {}'.format(url))
    doc = PyQuery(content)
    doc.make_links_absolute(base_url=url)
    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
    results = []
    for row in table.items('tr:gt(0)'):
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)
        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }
        results.append(company)
    return results
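# A minimal usage sketch for the coroutine above, not part of the original example.
# It assumes aiohttp is installed and that scrape_page is importable from this module;
# the search URL is an assumed placeholder.
import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        companies = await scrape_page(session, 'https://www.rigzone.com/search/alpha/a/')
        for company in companies:
            print(company['name'], company['url'])

if __name__ == '__main__':
    asyncio.run(main())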
def get_urls(base_url, exclude=set()):
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
    doc = PyQuery(base_url + "/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("a"):
        try:
            href = a.attrib["href"]
        except KeyError:
            continue
        if not href.startswith(base_url):
            continue
        if href.endswith(".html") or href.endswith(".png"):
            continue
        if href not in urls and href not in exclude:
            urls.append(href)
    url_start = base_url + "/p"
    for i in range(2, 10):
        url = url_start + str(i)
        if url in exclude:
            continue
        urls.append(url)
    return urls
class SegmentfaultTagSpider(object):

    def __init__(self, tag_name, page=1):
        self.url = 'http://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(base_url="http://segmentfault.com/")  # turn relative links into absolute ones
        return self._dom

    @property
    def questions(self):
        return [question.attr('href') for question in self.dom('h2.title > a').items()]

    @property
    def has_next_page(self):
        # check whether there is a next page
        return bool(self.dom('ul.pagination > li.next'))

    def next_page(self):
        # re-initialise this spider so it crawls the next page; it is a verb, so no @property
        if self.has_next_page:
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)
    data = []
    worker = u'daodao_poi_base_data'
    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps({
                u'target_url': unicode(href),
                u'city_id': unicode(city_id),
                u'type': u'rest'
            })
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args, unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
def list_page(self, response):
    result_content = {}
    content_iter = re.finditer(
        r"STK && STK.pageletM && STK.pageletM.view\((?P<content>\{.*?\})\)",
        response.content)
    for iter in content_iter:
        ok, content = safe_loads(iter.groupdict()['content'])
        if ok and "pl_weibo_direct" == content.get("pid"):
            result_content = content
            break
    else:
        return {}
    pyquery_doc = PyQuery(result_content["html"])
    pyquery_doc.make_links_absolute(response.url)
    items = []
    for item in pyquery_doc("DIV.feed_lists>DIV.WB_cardwrap>DIV").items():
        weibo_href = item("DIV.content>DIV.feed_from>A").attr.href
        if weibo_href:
            weibo_pics = []
            for pic in item("DIV.feed_content DIV.media_box IMG").items():
                weibo_pics.append(pic.attr.src)
            data = {
                "content": item("DIV.feed_content P.comment_txt").text(),
                "nickname": item("DIV.feed_content A.W_texta").attr.title,
                "href": weibo_href,
                "quote_nickname": item("DIV.feed_content DIV.comment DIV.comment_info A.W_texta").attr.title,
                "quote_content": item("DIV.feed_content DIV.comment DIV.comment_info P.comment_txt").text(),
                "pics": ''.join(weibo_pics)
            }
            self.crawl("data:,%s" % weibo_href, callback=self.detail_page, data_fetch_content=data)
class SegmentfaultTagSpider(object):

    def __init__(self, tag_name, page=1):
        self.url = 'https://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(base_url='http://segmentfault.com/')
        return self._dom

    @property
    def questions(self):
        return [question.attr('href') for question in self.dom('h2.title > a').items()]

    @property
    def has_next_page(self):
        return bool(self.dom('ul.pagination > li.next'))

    def next_page(self):
        if self.has_next_page:
            print(self.page)
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
def get_urls():
    doc = PyQuery('https://www.peterbe.com/plog/')
    doc.make_links_absolute(base_url='https://www.peterbe.com')
    urls = []
    for a in doc('dd a'):
        urls.append(a.attrib['href'])
    return urls
def download(threadUrl):
    d = PyQuery(url=threadUrl, parser='soup')
    links = d('a[href^="job.php?action=download&aid="]')
    # get the value of verify
    tmp = d('script:contains("var verifyhash =")').text()
    verify = re.search(r"var verifyhash = '(.*?)'", tmp).group(1)
    total = len(links)
    d.make_links_absolute()
    for i, e in enumerate(links.items(), start=1):
        filename = e.text()
        print('%s/%s %s' % (i, total, filename))
        if not os.path.exists(os.path.join(SAVE_PATH, filename)):
            params = urlencode({'check': 1, 'verify': verify, 'nowtime': int(time.time() * 1000)})
            url = '%s?%s' % (e.attr['href'], params)
            print('  fetch: ' + url)
            downDoc = PyQuery(url, headers=headers)
            # index 0 is the China Telecom download mirror, index 1 is the China Mobile mirror
            downUrl = BASE_URL + downDoc('a[href^="remotedown.php"]').eq(1).attr('href')
            addToIDM(downUrl, SAVE_PATH, filename)
            time.sleep(1.5)
    wefiler_urls = checkWefiler(d)
    if wefiler_urls:
        print(wefiler_urls)
def run():
    """Search session cookies in pastebin.com"""
    parser = argparse.ArgumentParser()
    parser.add_argument("-w", "--word", help="keyword to search for")
    parser.add_argument("-u", "--url", help="site to search in")
    args = parser.parse_args()
    if args.url:
        try:
            title = args.word + " " + args.url
            for page in range(10):
                params_1 = urlencode({'as_q': "%s" % title})
                params = params_1.replace("%22", '')
                jq = Pq(
                    url="https://www.google.com/search?%s&source=lnt&tbs=qdr:d&sa=X&ved=0ahUKEwi35d68i-XXAhUDuRQKHVVpB88QpwUIHQ&biw=1366&bih=647" % params,
                    headers={"user-agent": "Mozilla/7.0 (Windows NT 6.1; rv:24.0) Gecko/20140129 Firefox/24.0"})
                jq.make_links_absolute("http://www.google.com")
                for flix in jq("div.rc").children().items():
                    url = flix.find("a").attr("href")
                    if url == "http://www.google.com":
                        url = ""
                    print url
        except:
            print "network error"
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)
    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(
                index_url.replace('-g' + g_num, '-g{0}-oa{1}'.format(g_num, page_num)),
                city_id, part)
    except:
        pass
class SegmentfaultTagSpider(object):

    def __init__(self, tag_name, page=1):
        self.url = 'http://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = PyQuery(document.text)
            self._dom.make_links_absolute(base_url="http://segmentfault.com/")  # turn relative links into absolute ones
        return self._dom

    @property
    def questions(self):
        return [question.attr('href') for question in self.dom('h2.title > a').items()]

    @property
    def has_next_page(self):
        # check whether there is a next page
        return bool(self.dom('ul.pagination > li.next'))

    def next_page(self):
        # re-initialise this spider so it crawls the next page; it is a verb, so no @property
        if self.has_next_page:
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
class Getlist(object):
    # zurl: listing URL to query, e.g. http://www.0731gch.com/paixie/bianmi/index_26_
    # scatid: id of the category to save into
    # getpages: number of pages to collect
    # page: current page number

    def __init__(self, zurl, scatid, getpages, page=1):
        self.zurl = zurl
        self.url = zurl + "%d.html" % (page)
        self.catid = zurl.split('_')[1]
        self.page = page
        self._dom = None
        self.getpages = getpages
        self.scatid = scatid

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(base_url="http://www.0731gch.com/")  # turn relative links into absolute ones
        return self._dom

    @property
    def urls(self):
        return [url.attr('href') for url in self.dom('.case_list dl dd h3 a').items()]

    @property
    def has_next_page(self):
        # check whether there is a next page
        return bool(self.dom('.fy ul .nextPage'))

    def next_page(self):
        # re-initialise this spider so it crawls the next page; it is a verb, so no @property
        if self.has_next_page:
            self.__init__(zurl=self.zurl, scatid=self.scatid, getpages=self.getpages, page=self.page + 1)
        else:
            return None

    def crawl(self):
        # collect the current page
        # sf_ids = [url for url in self.urls]
        con = len(self.urls)
        print('%s articles to collect on this page' % con)
        i = 1
        for url in self.urls:
            print('collecting article %d of this page' % i)
            Getshow(url).mysave(self.scatid)
            i += 1
            time.sleep(1)

    def crawl_all_pages(self):
        while True:
            print(u'crawling category page: %s%d.html, page %d, %d pages in total' % (self.zurl, self.page, self.page, self.getpages))
            self.crawl()
            if int(self.page) >= int(self.getpages) or not self.has_next_page:
                # if not self.has_next_page:
                print('stopping')
                break
            else:
                self.next_page()
def take(self, *args, **kwargs):
    base_url = kwargs.pop('base_url', None) or self.base_url
    _doc = PyQuery(*args, **kwargs)
    if base_url:
        _doc.make_links_absolute(base_url)
    rv = {}
    self.node.do(None, rv=rv, value=_doc, last_value=_doc)
    return rv
def doc(html, url):
    """Returns a PyQuery object of a request's content"""
    parser = lxml.html.HTMLParser(encoding='utf-8')
    elements = lxml.html.fromstring(html, parser=parser)
    if isinstance(elements, lxml.etree._ElementTree):
        elements = elements.getroot()
    doc = PyQuery(elements)
    doc.make_links_absolute(url)
    return doc
def doc(rsp):
    """Returns a PyQuery object of a request's content"""
    parser = lxml.html.HTMLParser(encoding=encoding(rsp))
    elements = lxml.html.fromstring(rsp.content, parser=parser)
    if isinstance(elements, lxml.etree._ElementTree):
        elements = elements.getroot()
    doc = PyQuery(elements)
    doc.make_links_absolute(rsp.url)
    return doc
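# A hedged usage sketch for the doc(rsp) helper above, not part of the original.
# It assumes requests is available and that the encoding(rsp) helper referenced
# by the snippet is defined elsewhere in the same module; the URL is a placeholder.
import requests

rsp = requests.get('https://example.com/')
page = doc(rsp)
for a in page('a').items():
    print(a.attr('href'))  # hrefs are already absolute thanks to make_links_absolute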
def get_hosters_for_episode(url):
    dom = PyQuery(url=url)
    dom.make_links_absolute(base_url=SerienStream.base_url)
    hosters = dom('div.hosterSiteVideo > ul > li > div > a')
    hoster_list = list()
    for h in hosters:
        hoster = PyQuery(h)
        hoster_list.append((hoster.find('h4').text(), hoster.attr.href))
    return hoster_list
def get_episodes(url):
    dom = PyQuery(url=url)
    dom.make_links_absolute(base_url=SerienStream.base_url)
    episodes = dom('table.seasonEpisodesList td.seasonEpisodeTitle > a')
    episode_list = list()
    for e in episodes:
        episode = PyQuery(e)
        episode_list.append((episode.text(), episode.attr.href))
    return episode_list
def get_seasons(url):
    dom = PyQuery(url=url)
    dom.make_links_absolute(base_url=SerienStream.base_url)
    seasons = dom('div#stream ul:first-child a')
    season_list = list()
    for s in seasons:
        season = PyQuery(s)
        season_list.append((season.text(), season.attr.href))
    return season_list
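# A minimal sketch of how the three helpers above chain together, not part of the
# original code. The series URL below is an assumed placeholder; only
# SerienStream.base_url comes from the snippets themselves.
series_url = SerienStream.base_url + '/serie/stream/example-series'
for season_title, season_url in get_seasons(series_url):
    for episode_title, episode_url in get_episodes(season_url):
        for hoster_name, hoster_url in get_hosters_for_episode(episode_url):
            print(season_title, episode_title, hoster_name, hoster_url)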
def dom(self):
    if not self._dom:
        d = requests.get(self.url)
        d.encoding = self.encoding
        __dom = Pq(d.text)
        if self.absolute_link:
            try:
                __dom.make_links_absolute(base_url=self.base_url)
            except ValueError:
                raise ValueError('When absolute_link is enabled, a base_url must be specified')
        self._dom = __dom
    return self._dom
def process_spider_input(self, response, spider):
    """Attach a PyQuery object of the response's content as `response.pq`"""
    if response.meta.has_key('_splash_processed'):
        splash_setting = response.meta['_splash_processed']
        endpoint = splash_setting['endpoint']
        if endpoint in ['render.json', 'execute']:
            splash_key_html = spider._splash_json_key_html
            body = response.body_as_unicode()
            splash_result = demjson.decode(body)
            if splash_result.has_key(splash_key_html):
                body = splash_result[splash_key_html]
                setattr(response, 'splash_result', splash_result)
            else:
                setattr(response, 'pq', None)
                return
        elif endpoint in ['render.png', 'render.jpeg', 'render.har']:
            # binary endpoints: pyquery is unavailable, do nothing
            setattr(response, 'pq', None)
            return
        elif endpoint in ['render.html']:
            # plain HTML, continue as usual
            body = response.body
    else:
        body = response.body
    if not body:
        setattr(response, 'pq', None)
        return
    enc = self.encoding(body, response)
    try:
        parser = lxml.html.HTMLParser(encoding=enc)
        elements = lxml.html.fromstring(body, parser=parser)
    except (LookupError, ) as e:
        # lxml raises LookupError when the encoding is not supported;
        # retry fromstring without an explicit encoding.
        # On Windows, "unicode" is not available as an encoding for lxml.
        elements = lxml.html.fromstring(body)
    if isinstance(elements, lxml.etree._ElementTree):
        elements = elements.getroot()
    pq = PyQuery(elements)
    if response.meta.get('_splash_processed'):
        pq.make_links_absolute(response.meta["_splash_processed"]["args"]["url"])
    else:
        pq.make_links_absolute(response.url)
    setattr(response, 'pq', pq)
def collect_variable_listing_sources(data_source, output_dir, verbose):
    for letter in string.ascii_uppercase:
        i, url = 0, variable_listing_url(data_source, letter)
        while url:
            if verbose:
                print("\tFetching: %s" % url)
            src = requests.get(url).text
            save_source(src, output_dir, letter, i)
            doc = PyQuery(src, parser='html')
            doc.make_links_absolute("https://%s.ipums.org/" % data_source)
            next_page = doc('a.next_page')
            if next_page:
                url = next_page.attr['href']
                i += 1
            else:
                url = None
def get_urls(base_url, top_urls, exclude=set()):
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
        if len(urls) >= top_urls:
            break
    return urls
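# Minimal usage sketch for the capped get_urls above (values are assumed examples):
# limit the crawl to the ten newest posts and skip anything already processed.
already_done = {'https://www.peterbe.com/plog/some-old-post'}
urls = get_urls('https://www.peterbe.com', 10, exclude=already_done)
print(urls)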
def get_urls(base_url, exclude=set()):
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
        if len(urls) > 200:
            break
    return urls
def get_all_links():
    try:
        return pickle.load(open('.links'))
    except IOError:
        URL_BASE = "http://www.casarosada.gob.ar/informacion/discursos?start={}"
        links = []
        for start in pages:
            url = URL_BASE.format(start)
            logging.info('Downloading links from {}'.format(url))
            pq = PyQuery(url=url, headers=headers)
            pq.make_links_absolute()
            page_links = pq('div.category-item-title a')
            links.extend(list(reversed(page_links)))
        links = [pq(a).attr('href') for a in links]
        pickle.dump(links, open('.links', 'w'))
        return links
def get_doc_hyperlinking(doc: PyQuery, base_url: str) -> List[HyperLinkingInPage]:
    """
    Collect the hyperlinks of a web page.

    Parameters
    ----------
    doc : PyQuery
        PyQuery object for the whole document
    base_url : str
        URL of the page, used to turn relative addresses into absolute ones
    """
    rlt = []
    doc.make_links_absolute(base_url=base_url)
    all_href = doc("a")
    body_text = get_pq_object_inner_text(doc)
    ls_href_to_query = []
    for link in all_href:
        link_obj = PyQuery(link)
        url = str(link_obj.attr("href"))
        if not url.startswith("http"):
            continue
        ls_href_to_query.append(link_obj)
    ls_start_pos = batch_get_dom_node_start_pos(doc, ls_href_to_query)
    for ui_ele, start_pos in zip(ls_href_to_query, ls_start_pos):
        if start_pos < 0:
            logger.error(f"Can't find ui object '{ui_ele}'")
        text = get_pq_object_inner_text(ui_ele)
        if text != body_text[start_pos:start_pos + len(text)]:
            logger.error(
                f"inner text is not equal with doc body '{text}' ?= '{body_text[start_pos:start_pos + len(text)]}'"
            )
        url = str(ui_ele.attr("href"))
        hyperlinking_in_page = HyperLinkingInPage(
            start_pos=start_pos,
            end_pos=start_pos + len(text),
            text=text,
            url=url,
            query_obj=ui_ele)
        rlt.append(hyperlinking_in_page)
    return rlt
def scrape_page(url):
    print('getting url: {}'.format(url))
    doc = PyQuery(url)
    doc.make_links_absolute()
    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
    for row in table.items('tr:gt(0)'):
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)
        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }
        yield company
def scrape_page(url):
    doc = PyQuery(url)
    doc.make_links_absolute()
    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
    for row in table.items('tr:gt(0)'):
        company_col = row('td').eq(0)
        phone_col = row('td').eq(1)
        website_col = row('td').eq(2)
        company = {
            'name': company_col.text(),
            'phone': phone_col.text(),
            'url': website_col('a').attr('href'),
            'details_url': company_col('a').attr('href'),
        }
        yield company
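# Sketch of consuming the generator above; PyQuery(url) fetches the page itself,
# so no HTTP client is needed. The URL is an assumed example value.
for company in scrape_page('https://www.rigzone.com/search/alpha/a/'):
    print(company['name'], company['phone'])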
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
def search_youtube_video(title, pages):
    print("Starting the search")
    cont = 0
    lista_url = []
    lista_views = []
    for page in range(pages):
        params = urllib.parse.urlencode({
            'search_query': 'intitle:"%s", video' % title,
            'page': page
        })
        jq = Pq(
            url="http://www.youtube.com/results?%s" % params,
            headers={"user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20140129 Firefox/24.0"})
        jq.make_links_absolute("http://www.youtube.com")
        for video in jq("ol.item-section").children().items():
            url = video.find("a.yt-uix-tile-link").attr("href")
            lista_url.append(url)
            views = video.find("ul.yt-lockup-meta-info li").eq(1).html()
            if views is not None:
                # the view counter is localized, e.g. "1.234 visualizaciones"
                res = int(views.split('visualizaciones')[0].strip().replace('.', ''))
            else:
                res = 0
            lista_views.append(res)
            cont = cont + 1
            if cont == 8:
                indice = lista_views.index(max(lista_views))
                print("views: {} ".format(max(lista_views)))
                print("indice: {}".format(indice))
                print("url: " + lista_url[indice])
                return lista_url[indice]
    indice = lista_views.index(max(lista_views))
    return lista_url[indice]
def get_urls_from_podcast(url, verbose=False):
    """given the url to a podcast, return the list of urls to each audiocut"""
    pq = PyQuery(url)
    pq.make_links_absolute()
    return [PyQuery(a).attr('href') for a in pq('.cut_brief h4 a')]
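# Hypothetical usage of get_urls_from_podcast above; the podcast URL is a placeholder,
# not a value taken from the original code.
cut_urls = get_urls_from_podcast('http://radiocut.fm/radiostation/example/podcast/example/')
for cut_url in cut_urls:
    print(cut_url)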
from pyquery import PyQuery

doc = PyQuery('https://www.rigzone.com/search/alpha/a/')
doc.make_links_absolute()
table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
for row in table.items('tr:gt(0)'):
    company_col = row('td').eq(0)
    phone_col = row('td').eq(1)
    website_col = row('td').eq(2)
    details_url = company_col('a').attr('href')
    company_name = company_col.text()
    company_phone = phone_col.text()
    company_url = website_col('a').attr('href')
    print(company_name, company_phone, company_url, details_url)
    break
class Getlist(object):
    # tocatid: id of the category to save into
    # getpages: number of pages to collect
    # page: current page number

    def __init__(self, catid, tocatid, getpages, page=1):
        self.url = "http://www.vccoo.com/category/?id=%d&page=%d" % (catid, page)
        self.catid = catid
        self.getpages = getpages
        self.tocatid = tocatid
        self.page = page
        self._dom = None

    @property
    def dom(self):
        if not self._dom:
            document = requests.get(self.url)
            document.encoding = 'utf-8'
            self._dom = Pq(document.text)
            self._dom.make_links_absolute(base_url="http://www.vccoo.com/")  # turn relative links into absolute ones
        return self._dom

    @property
    def urls(self):
        return [url.attr('href') for url in self.dom('.list-con h3 > a').items()]

    @property
    def has_next_page(self):
        # check whether there is a next page
        return bool(self.dom('.pages ul li .next-page'))

    def next_page(self):
        # re-initialise this spider so it crawls the next page; it is a verb, so no @property
        if self.has_next_page:
            self.__init__(catid=self.catid, tocatid=self.tocatid, getpages=self.getpages, page=self.page + 1)
        else:
            return None

    def crawl(self):
        # collect the current page
        sf_ids = [url.split('/')[-1] for url in self.urls]
        con = len(sf_ids)
        print('%s articles to collect on this page' % con)
        i = 1
        for sf_id in sf_ids:
            print('collecting article %d of this page' % i)
            Getshow(sf_id).save(self.tocatid)
            i += 1
            print('resting 3s before article %d' % i)
            time.sleep(3)

    def crawl_all_pages(self):
        while True:
            print(u'crawling category page: http://www.vccoo.com/category/?id=%d&page=%d, page %d, %d pages in total' % (self.catid, self.page, self.page, self.getpages))
            self.crawl()
            if int(self.page) >= int(self.getpages) or not self.has_next_page:
                print('crawl finished!')
                break
            else:
                self.next_page()