def parse(self):
    # Category ids: 9 = fashion, 4 = gossip, 11 = travel, 2 = wellness
    categories = [9, 4, 11, 2]
    purl = 'http://weixin.sogou.com/pcindex/pc/pc_{category}/{page}.html'
    for category in categories:
        urls = [
            purl.format(page=page, category=category)
            for page in range(1, 5)
        ]
        urls.insert(
            0,
            'http://weixin.sogou.com/pcindex/pc/pc_{category}/pc_{category}.html'
            .format(category=category))
        for url in urls:
            resp = rget(url)
            if not resp:
                continue
            html = etree.HTML(resp.content)
            hrefs = html.xpath(
                '//ul[@id="pc_0_0"]//li/div[@class="txt-box"]/h3/a/@href')
            if not hrefs:
                hrefs = html.xpath('//li/div[@class="img-box"]/a/@href')
            if not hrefs:
                # Neither known layout matched; log it and skip this page.
                logger.warning('No article links found on {}'.format(url))
                continue
            logger.debug("\033[92m Start crawling: {} \033[0m".format(url))
            details = []
            for href in hrefs:
                try:
                    item = self._extract(href, url)
                    if not item:
                        continue
                    details.append(item)
                except IndexError:
                    # Failures like this are most likely network related; the
                    # failed href should be pushed to a retry queue and
                    # re-crawled later.
                    continue
            NewsPipeline().save(details)
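# `rget` is called throughout these parsers but its definition is not shown
# here. A minimal sketch, assuming it is a thin requests.get wrapper that
# retries transient failures and returns None when every attempt fails (the
# name comes from the source; the retry count, timeout, and backoff below
# are assumptions):
import time

import requests


def rget(url, retries=3, timeout=10):
    """GET `url`, retrying on network errors; return None if all attempts fail."""
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=timeout)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            pass
        if attempt < retries - 1:
            time.sleep(2 ** attempt)  # simple exponential backoff between attempts
    return None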
def parse(self):
    resp = rget('http://meilibaobao.com/artlist-217.html')
    html = etree.HTML(resp.content)
    try:
        last_page = ''.join(
            html.xpath('//td[@class="pagernum"]/a[last()]/text()'))
        last_page = int(last_page)
    except ValueError:
        # Pager missing or not numeric; fall back to a known upper bound.
        last_page = 180
    pages = self._construct_pages(last_page + 1)
    details = []
    for page_url in pages:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        divs = html.xpath('//div[@id="columns"]/div')
        for dd in divs:
            try:
                href = ''.join(dd.xpath('./div[@class="pic"]/a/@href'))
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; the failed href should be
                # pushed to a retry queue and re-crawled later.
                continue
    NewsPipeline().save(details)
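# `_construct_pages` is not shown. A plausible sketch for the
# meilibaobao.com listing above, assuming paginated URLs follow an
# 'artlist-217-<n>.html' pattern (the pattern itself is an assumption,
# not confirmed by the source):
def _construct_pages(self, last_page):
    # One URL per listing page, pages 1 .. last_page - 1.
    return [
        'http://meilibaobao.com/artlist-217-{}.html'.format(page)
        for page in range(1, last_page)
    ]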
def parse(self):
    resp = rget(self.site_url)
    html = etree.HTML(resp.content)
    try:
        total_url = ''.join(
            html.xpath('//div[@id="pager"]/a[@class="last"]/@href'))
        pages = parse.parse_qs(parse.urlsplit(total_url).query)['page'][0]
    except (KeyError, IndexError):
        # Pager link missing or malformed; fall back to a known page count.
        pages = 12
    urls = self._construct_page_url(int(pages) + 1)
    details = []
    for page_url in urls:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@class="art_cat_box"]/table//a/@href')
        for href in hrefs:
            try:
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; the failed href should be
                # pushed to a retry queue and re-crawled later.
                continue
    NewsPipeline().save(details)
def parse(self):
    resp = rget('http://www.milanstand.com/article-zixun-1/')
    html = etree.HTML(resp.content)
    try:
        last_page = html.xpath(
            '//p[@class="nx"]/following-sibling::p/a/@href')[0]
        # The href ends in '-<n>/'; drop the trailing slash and keep the number.
        last_page = int(last_page.split('-')[-1][:-1])
    except (IndexError, ValueError):
        last_page = 55
    pages = self._construct_pages(last_page + 1)
    details = []
    for page_url in pages:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        # Keep only entries whose text mentions "包" (bags).
        divs = html.xpath(
            '//div[@class="box_3"]/table/tr//div[contains(text(), "包")]')
        for dd in divs:
            try:
                href = ''.join(dd.xpath('./a/@href'))
                href = urljoin(self.site_url, href)
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; the failed href should be
                # pushed to a retry queue and re-crawled later.
                continue
    NewsPipeline().save(details)
def parse(self):
    pages = [
        urljoin(self.site_url, 'list_{}.html'.format(page))
        for page in range(1, 494)
    ]
    details = []
    for page_url in pages:
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@class="newlist"]//h6//a[2]/@href')
        for href in hrefs:
            try:
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; the failed href should be
                # pushed to a retry queue and re-crawled later.
                continue
    NewsPipeline().save(details)
def parse(self):
    url = 'http://m.sohu.com/ch/23/'
    resp = rget(url)
    html = etree.HTML(resp.content)
    hrefs = html.xpath('//div[@class="swiper-wrapper"]/div/a/@href') + \
        html.xpath('//ul[@class="feed-list-area"]//li/a/@href')
    if not hrefs:
        # Nothing found, likely a transient failure; retry by re-entering
        # parse (note: this retries indefinitely on persistent failure).
        return self.parse()
    details = []
    for href in hrefs:
        if href.startswith('http'):
            # Absolute links point off-site; skip them.
            continue
        time.sleep(1)
        try:
            href = urljoin(self.url, href)
            logger.debug("\033[92m Start crawling: {} \033[0m".format(href))
            item = self._extract(href, url)
            if not item:
                continue
            details.append(item)
        except IndexError:
            # Most likely a network failure; the failed href should be
            # pushed to a retry queue and re-crawled later.
            continue
    NewsPipeline().save(details)
def parse(self):
    pages_url = [self.site_url]
    pages_url += [
        urljoin(self.site_url, 'newsp{}.html'.format(page))
        for page in range(2, 20)
    ]
    for page_url in pages_url:
        details = []
        resp = rget(page_url)
        if not resp:
            continue
        html = etree.HTML(resp.content)
        hrefs = html.xpath('//div[@id="brand"]/table//tr/td/h3/a/@href')
        for href in hrefs:
            href = urljoin(self.site_url, href)
            try:
                item = self._extract(href, page_url)
                if not item:
                    continue
                details.append(item)
            except IndexError:
                # Most likely a network failure; the failed href should be
                # pushed to a retry queue and re-crawled later.
                continue
        NewsPipeline().save(details)
def parse(self):
    url = 'https://www.toutiao.com/ch/news_fashion/'
    chrome_options = get_chrome_options()
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = etree.HTML(driver.page_source)
    finally:
        driver.quit()  # always release the browser
    hrefs = html.xpath('//a[@class="link title"]/@href')
    if not hrefs:
        # Page probably failed to render; retry by re-entering parse
        # (note: this retries indefinitely on persistent failure).
        return self.parse()
    logger.debug("\033[92m Start crawling: {} \033[0m".format(url))
    details = []
    for href in hrefs:
        if href.startswith('http'):
            continue
        time.sleep(1)
        try:
            href = urljoin(self.url, href)
            item = self._extract(href, url)
            if not item:
                continue
            details.append(item)
        except IndexError:
            # Most likely a network failure; the failed href should be
            # pushed to a retry queue and re-crawled later.
            continue
    NewsPipeline().save(details)
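# `get_chrome_options` is not shown. A minimal sketch using Selenium's
# public Options API; the headless/no-sandbox flags are common choices for
# server-side scraping, not confirmed by the source:
from selenium.webdriver.chrome.options import Options


def get_chrome_options():
    options = Options()
    options.add_argument('--headless')     # render without a visible window
    options.add_argument('--disable-gpu')  # avoids issues on some platforms
    options.add_argument('--no-sandbox')   # often required inside containers
    return options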
def parse(self):
    resp = rget(self.site_url)
    html = etree.HTML(resp.content)
    type_hrefs = html.xpath(
        '//div[@class="sub_nav"]/div[@class="wrapper"]/ul/li//a/@href')
    for url in type_hrefs:
        pages = [
            urljoin(url, 'p{}.html'.format(page)) for page in range(1, 5)
        ]
        pages[0] = url  # the first page has no page suffix
        details = []
        for page_url in pages:
            resp = rget(page_url)
            if not resp:
                continue
            html = etree.HTML(resp.content)
            # Note: the second alternative already matches every anchor, so
            # this effectively collects all links on the page, deduplicated.
            hrefs = set(
                html.xpath('//dl[position()<last()]//a/@href|//a/@href'))
            for href in hrefs:
                try:
                    item = self._extract(href, page_url)
                    if not item:
                        continue
                    details.append(item)
                except IndexError:
                    # Most likely a network failure; the failed href should
                    # be pushed to a retry queue (sketched below) and
                    # re-crawled later.
                    continue
        NewsPipeline().save(details)
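# Every parser above notes that hrefs failing with IndexError are probably
# network casualties that should be queued for a later re-crawl. A minimal
# sketch of such a queue backed by Redis; the key name, client settings, and
# the `push_retry`/`pop_retry` helpers are all assumptions, not part of the
# original codebase:
import redis

RETRY_KEY = 'news:retry_hrefs'
_redis = redis.StrictRedis()


def push_retry(href):
    """Queue a failed href so a later pass can re-crawl it."""
    _redis.rpush(RETRY_KEY, href)


def pop_retry():
    """Pop the oldest failed href, or return None when the queue is empty."""
    href = _redis.lpop(RETRY_KEY)
    return href.decode() if href else None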