def getTotalPage(self, response):
    # The page is GBK-encoded; decode before building the Selector.
    article_urls = Selector(text=response.body.decode("gbk")).xpath(self.xpath["article_link"]).extract()
    # article_urls = Selector(text=response.body.decode("gbk")).xpath('//td[@class="unnamed1"]/a[contains(@href,"fileview")]/@href').extract()
    if len(article_urls) > 0:
        for url in article_urls:
            # Article metadata is carried in the link's query string.
            result = urlparse.urlparse(url)
            params = urlparse.parse_qs(result.query, True)
            meta = response.meta
            meta['title'] = str(params['title'][0]) if "title" in params else ""
            meta['author'] = str(params['name'][0]) if "name" in params else ""
            if "date" in params:
                date_str = str(params['date'][0])
                meta['publish_time'] = '%s-%s-%s' % (date_str[0:4], date_str[4:6], date_str[6:8])
            if response.url.find("ld_history_jianghua") != -1:
                meta['url'] = self.url_prefix1 + url
                url = self.url_prefix1 + url.encode("gbk")
            else:
                meta['url'] = self.url_prefix2 + url
                url = self.url_prefix2 + url.encode("gbk")
            if not isArticleExist(url):
                yield Request(url, callback=self.getContent, meta=meta)
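# For reference, a minimal standalone sketch of the query-string parsing used
# above. The sample link is hypothetical; only the query keys (title/name/date)
# come from the code above.
import urlparse

link = "fileview.asp?title=example&name=author&date=20160101"  # hypothetical
result = urlparse.urlparse(link)
params = urlparse.parse_qs(result.query, True)  # True keeps blank values

print params['title'][0]      # 'example'
print params['date'][0][0:4]  # '2016' (year slice, as in publish_time)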
def getTotalPage(self, response):
    meta = response.meta
    # meta['dont_redirect'] = True
    article_urls = response.xpath(self.xpath['article_link']).extract()
    # article_urls = response.xpath('//ul[@class="bottom_ul" or @class="news_ul"]/li/a/@href').extract()
    if len(article_urls) > 0:
        for url in article_urls:
            if url.startswith("../"):
                # str.strip("../") strips any '.' or '/' characters from both
                # ends, not the literal prefix; slice off the prefix instead.
                url = self.url_prefix + url[3:]
            if not url.startswith("http://"):
                # Resolve a relative link against the current directory.
                index = response.url.rfind("/")
                url = response.url[0:index + 1] + url
            meta['url'] = url
            if not isArticleExist(url):
                yield Request(url, callback=self.getContent, meta=meta)
    next_page_urls = response.xpath(self.xpath['next_page_link']).extract()
    # next_page_urls = response.xpath('//div[@id="displaypagenum"]//a[contains(text(),">>")]/@href').extract()
    if len(next_page_urls) > 0:
        index = response.url.rfind("/")
        next_page_url = response.url[0:index + 1] + next_page_urls[0]
        # print next_page_url
        yield Request(next_page_url, callback=self.getTotalPage, meta=meta)
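# The manual "../" and relative-path handling above could arguably be replaced
# by urlparse.urljoin, which resolves "./", "../", and bare relative links
# against the page URL in one call. A sketch, as an alternative rather than a
# drop-in (its results can differ from the prefix-based logic above):
import urlparse

def resolve(base_url, href):
    # e.g. resolve("http://example.com/a/b.html", "../c.html")
    #   -> "http://example.com/c.html"
    return urlparse.urljoin(base_url, href)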
def getArticleUrl(self, response):
    meta = response.meta
    if response.url.find("gkml") != -1:
        article_urls = response.xpath(self.xpath["article_link_GKML"]).extract()
        # article_urls = response.xpath('//div[@id="documentContainer"]/div[@class="row"]/li[@class="mc"]//a/@href').extract()
        if len(article_urls) > 0:
            for url in article_urls:
                if url.startswith("../../"):
                    # str.lstrip("../../") strips every leading '.' and '/'
                    # character, not the literal prefix; slice it off instead.
                    url = self.url_prefix + url[6:]
                if not url.startswith("http://"):
                    continue
                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url, callback=self.getContentGKML, meta=meta)
    else:
        article_urls = response.xpath(self.xpath["article_link_SYrlzyhshbzb"]).extract()
        # article_urls = response.xpath('//div[@class="serviceMainListConType"]/div/div[@class="serviceMainListTxt"]/span/a/@href').extract()
        if len(article_urls) > 0:
            for url in article_urls:
                index = response.url.rfind("/")
                if url.startswith("./"):
                    # "./path" -> "<current dir>/path": drop only the leading dot.
                    url = response.url[0:index] + url[1:]
                if not url.startswith("http://"):
                    continue
                meta['url'] = url
                if not isArticleExist(url):
                    yield Request(url, callback=self.getContentSYrlzyhshbzb, meta=meta)
def getArticleUrl(self, response):
    meta = response.meta
    article_urls = response.xpath(self.xpath['article_link']).extract()
    # article_urls = response.xpath('//table[@width="610"]//tr/td/a/@href').extract()
    if len(article_urls) > 0:
        for url in article_urls:
            # Drop a literal leading "./" before appending to the listing URL;
            # str.strip("./") would also eat '.' and '/' at the end of the link.
            if url.startswith("./"):
                url = url[2:]
            url = response.url + url
            meta['url'] = url
            if not isArticleExist(url):
                yield Request(url, callback=self.getContent, meta=meta)
def getArticleUrl(self, response):
    meta = response.meta
    article_urls = response.xpath(self.xpath["article_link"]).extract()
    # article_urls = response.xpath('(//div[@class="newsList"])//div[@class="title"]/a/@href').extract()
    # print response.url, len(article_urls)
    if len(article_urls) > 0:
        for url in article_urls:
            url = self.url_prefix + url
            meta['url'] = url
            if not isArticleExist(url):
                yield Request(url, callback=self.getContent, meta=meta)
def getArticleUrl(self, response):
    article_urls = response.xpath(self.xpath["article_link"]).extract()
    # article_urls = response.xpath('//ul[@id="ContentPlaceHolder1_MainMiddleControl1_WebPageDocumentsByUId1"]/li/div[@class="m_sub"]/a/@href').extract()
    if len(article_urls) > 0:
        for url in article_urls:
            if not url.startswith("http://"):
                continue
            if not isArticleExist(url):
                yield Request(url, callback=self.getContent, meta=response.meta)
def getArticlesID(self, response):
    meta = response.meta
    article_ids = response.xpath(self.xpath["article_id"]).extract()
    # article_ids = response.xpath('//d/r1/id/text()').extract()
    for article_id in article_ids:
        article_url = self.url_prefix + article_id
        if not isArticleExist(article_url):
            # The article body is served by a separate text API, keyed by id.
            url = "http://ipub.exuezhe.com/Qk/GetTextArt?id=%s&pn=1&ps=100" % article_id
            meta["url"] = article_url
            meta["article_id"] = article_id
            yield Request(url, callback=self.getContent, meta=meta)
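# The pn and ps parameters of the GetTextArt endpoint presumably mean page
# number and page size; the source never spells this out, so treat it as an
# assumption. Under that reading, a later page of a long article would be
# fetched like so:
article_id = "12345"  # hypothetical id
page_no = 2           # assumption: pn = page number, ps = page size
url = "http://ipub.exuezhe.com/Qk/GetTextArt?id=%s&pn=%d&ps=100" % (article_id, page_no)
# -> http://ipub.exuezhe.com/Qk/GetTextArt?id=12345&pn=2&ps=100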
def getArticleUrl(self, response):
    meta = response.meta
    article_urls = response.xpath(self.xpath["article_link"]).extract()
    # article_urls = response.xpath('//div[@class="f-main-leftMain-content clear"]//ol/li/a/@href').extract()
    if len(article_urls) > 0:
        for url in article_urls:
            if url.startswith("./"):
                # Drop the literal "./" prefix; str.lstrip("./") would strip
                # every leading '.' and '/' character.
                url = response.meta['url_prefix'] + url[2:]
            if not url.startswith("http://"):
                continue
            meta['url'] = url
            if not isArticleExist(url):
                # dont_filter=True bypasses Scrapy's duplicate-request filter.
                yield Request(url, callback=self.getContent, meta=meta, dont_filter=True)
def getArticleUrl(self, response):
    meta = response.meta
    # The listing may redirect: if the page we landed on differs from the
    # requested page number, re-enter the pagination logic.
    current_page_no = self.getPageNo(response.url)
    if current_page_no != meta['page_no']:
        yield Request(response.url, callback=self.getTotalPage, meta=response.meta)
    article_urls = response.xpath(self.xpath['article_link']).extract()
    # article_urls = response.xpath('//div[@id="Content1"]/div[@class="xin"]/ul/li/span/a/@href').extract()
    # print response.url, len(article_urls)
    if len(article_urls) > 0:
        for url in article_urls:
            url = self.url_prefix + url
            meta['url'] = url
            if not isArticleExist(url):
                yield Request(url, callback=self.getContent, meta=meta)
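# getPageNo is referenced above but not defined in this section. A hypothetical
# sketch of what it might look like, assuming list URLs carry the page number
# in an "index_N.html" suffix (both the name pattern and the regex are guesses,
# not taken from the source):
import re

def getPageNo(self, url):
    match = re.search(r'index_(\d+)', url)
    return int(match.group(1)) if match else 1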