def parse(self, response):
    """Parse the Huxiu index page (the first page is a plain GET).

    Yields one HuxiuItem per article teaser, then requests the next
    page of the listing through the site's POST article-list endpoint
    (handled by ``parse_nextpage``).
    """
    base = "https://www.huxiu.com"
    if self.pg == 1:  # only the first page arrives as a GET of the index HTML
        div_list = response.xpath(
            "//div[@class='container']//div[@class='mod-info-flow']/div")
        for div in div_list:
            item = HuxiuItem()
            item["title"] = div.xpath(
                ".//div[contains(@class,'mob-ctt')]/h2/a/text()").extract_first()
            # Guard: extract_first() returns None when the node is missing;
            # the original concatenated unconditionally, which raises
            # TypeError on a card without a link.
            href = div.xpath("./div/a/@href").extract_first()
            item["article_url"] = base + href if href else None
            item["img"] = div.xpath("./div/a/img/@src").extract_first()
            item["img2"] = div.xpath(
                "./div[2]/div/div/a/img/@src").extract_first()
            href2 = div.xpath("./div[2]/div/div/a/@href").extract_first()
            item["href2"] = base + href2 if href2 else None
            yield item
    self.pg += 1
    if self.pg > 1:  # subsequent pages come from the POST article_list API
        params = {
            "huxiu_hash_code": "b46b6dad804d7d9362a29fe56a8e47a2",
            "page": str(self.pg),
        }
        url = "https://www.huxiu.com/v2_action/article_list"
        yield scrapy.FormRequest(
            url, callback=self.parse_nextpage, method="POST", formdata=params)
def post_parse(self, response):
    """Parse one JSON page of the Huxiu article-list API.

    The endpoint responds with ``{"data": <html fragment>,
    "total_page": N, "last_dateline": ts}``; titles/urls/authors are
    pulled from the fragment and the next page is requested while
    pages remain.
    """
    page = response.meta['page'] + 1
    data = json.loads(response.text)
    if not data:
        return
    # .get() avoids the original's NameError when any key is absent.
    html_post = data.get('data')
    total_page = data.get('total_page')
    last_dateline = data.get('last_dateline')
    if html_post is None:
        return

    item = HuxiuItem()
    sel = etree.HTML(html_post)
    item['title'] = sel.xpath('//h2//a/text()')
    item['url'] = ['https://www.huxiu.com' + href
                   for href in sel.xpath('//h2//a/@href')]
    item['author'] = sel.xpath('//span[@class="author-name"]/text()')
    # BUG FIX: the original printed this error when all three fields WERE
    # present; the warning must fire when any of them is empty.
    if not (item['title'] and item['url'] and item['author']):
        print('获取item内容出错', page - 1)
    item['updata'] = datetime.datetime.now()
    yield item

    # total_page is presumably numeric in the JSON -- TODO confirm.
    if total_page is not None and page < int(total_page) + 1:
        form = {
            'huxiu_hash_code': '27ab1e6d0b9252b75cefec3c71dbcfba',
            'page': str(page),
            'last_dateline': str(last_dateline),
        }
        yield scrapy.FormRequest(
            'https://www.huxiu.com/v2_action/article_list',
            formdata=form,
            callback=self.post_parse,
            meta={'page': page})
def parse_nextpage(self, response):
    """Parse one POST page of the article-list API and paginate.

    The response is JSON whose "data" field holds an HTML fragment of
    article cards; an empty fragment means the listing is exhausted.
    """
    print("nextpage")
    json_dict = json.loads(response.body_as_unicode())
    # print(json_dict)
    data_str = json_dict["data"]
    # print(data_str)
    # print(type(data_str))
    if len(data_str) > 0:
        html_str = etree.HTML(data_str)
        div_list = html_str.xpath("//div[@class='mod-b mod-art']")
        for div in div_list:
            item = HuxiuItem()
            item["title"] = div.xpath(".//div[@class='mob-ctt']/h2/a/text()")[0]
            item["article_url"] = "{}".format("https://www.huxiu.com") + div.xpath(".//div[@class='mob-ctt']/h2/a/@href")[0]
            # Lazy-loaded thumbnails keep the real URL in data-original.
            item["img"] = div.xpath(".//div[contains(@class,'mod-thumb')]//img/@data-original")[0] if len(div.xpath(".//div[contains(@class,'mod-thumb')]//img/@data-original")) > 0 else None
            item["img2"] = div.xpath(".//div[@class='mob-ctt']/div[@class='mob-author']/div/a/img/@src")[0]
            item["href2"] = "{}".format("https://www.huxiu.com") + div.xpath(".//div[@class='mob-ctt']/div[@class='mob-author']/a/@href")[0]
            # print(item)
            yield item
        self.pg += 1
        print(self.pg)
        # For testing only -- comment this block out later ====
        if self.pg == 3:
            # return
            # NOTE(review): an empty item is yielded here, apparently as an
            # end-of-crawl sentinel for the pipeline -- confirm before removing.
            item = HuxiuItem()
            yield item
            print("虎嗅爬蟲結束.........")
            return
        # ======================================
        params = {
            "huxiu_hash_code": "b46b6dad804d7d9362a29fe56a8e47a2",
            "page": "{}".format(self.pg),
        }
        url = "https://www.huxiu.com/v2_action/article_list"
        yield scrapy.FormRequest(
            url, callback=self.parse_nextpage, method="POST", formdata=params
        )
    else:
        # No more data: yield the final (empty, sentinel) item and stop.
        item = HuxiuItem()
        yield item
        print("虎嗅爬蟲結束.........")
def parse(self, response):
    """Print title/link/description for each teaser card on the index page.

    Note: this callback only prints — it yields no items or requests.
    The original also built an absolute URL into an unused local
    (``url``); that dead assignment is removed here.
    """
    for sel in response.xpath(
            '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]'):
        item = HuxiuItem()
        item['title'] = sel.xpath('h3/a/text()')[0].extract()
        item['link'] = sel.xpath('h3/a/@href')[0].extract()
        item['desc'] = sel.xpath(
            'div[@class="mob-sub"]/text()')[0].extract()
        print(item['title'], item['link'], item['desc'])
def parse_article(self, response):
    """Extract title/link/post time from an article detail page.

    If the expected nodes are absent (e.g. a placeholder/anti-bot page),
    the same URL is re-requested.
    """
    try:
        detail = response.xpath('//div[@class="article-wrap"]')
        item = HuxiuItem()
        item['title'] = detail.xpath('h1/text()')[0].extract().strip()
        item['link'] = response.url
        item['post_time'] = detail.xpath(
            "div[@class='article-author']/div[@class='column-link-box']/"
            "span[@class='article-time pull-left']/text()")[0].extract()
        yield item
    except IndexError:
        # BUG FIX: without dont_filter=True, Scrapy's duplicate filter
        # drops a request for an already-seen URL, so the retry never ran.
        yield scrapy.Request(response.url, callback=self.parse_article,
                             dont_filter=True)
def parse_article(self, response):
    """Extract article detail fields plus the 'next article' teaser."""
    # NOTE: the original called time.sleep(1) here. A blocking sleep in a
    # callback stalls Scrapy's (Twisted) event loop for ALL requests;
    # throttle with the DOWNLOAD_DELAY setting instead.
    item = HuxiuItem()
    detail = response.xpath(
        '//div[@class="container"]/div[@class="wrap-left pull-left"]'
        '/div[@class="article-wrap"]')
    detail_1 = response.xpath(
        '//div[@class="container"]/div[@class="wrap-right pull-right"]')
    item['title'] = detail.xpath('h1/text()')[0].extract()
    item['posttime'] = detail.xpath(
        'div/div/span[@class="article-time pull-left"]/text()')[0].extract()
    item['author'] = detail.xpath('div/span/a/text()')[0].extract()
    item['nexttitle'] = detail_1.xpath(
        'div/div[@class="author-next-article"]/a/text()')[0].extract()
    item['link'] = response.url
    # Ingestion timestamp, local time, 'YYYY-mm-dd HH:MM:SS'.
    item['instime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
    yield item
def parse(self, response):
    """Collect teaser metadata from the index page and follow each article.

    A fresh item is created per card. (The original reused one mutable
    item across iterations and deep-copied it into each request's meta —
    equivalent in effect but error-prone; the copy is no longer needed.)
    """
    for data in response.css('div.mob-ctt'):
        item = HuxiuItem()
        item['author'] = '钱德虎'  # fixed author for this section, per original
        item['date'] = data.css(
            'div.mob-author span.time::text').extract_first()
        item['sub'] = data.css('div.mob-sub::text').extract_first()
        href = response.urljoin(
            data.css('h3 a::attr(href)').extract_first())
        yield scrapy.Request(href, callback=self.detail_parse,
                             dont_filter=True, meta={'item': item})
def parse(self, response):
    """Walk the index-page article cards and follow each article link."""
    ctt = "div[@class='mob-ctt index-article-list-yh']"
    rows = response.xpath(
        "//div[@class='mod-info-flow']"
        "/div[@class='mod-b mod-art clearfix ']")
    for row in rows:
        item = HuxiuItem()
        # extract_first(default='') is the "first match or empty string"
        # pattern the original spelled out with index-and-guard.
        item['title'] = row.xpath(ctt + "/h2/a/text()").extract_first(default='')
        item['link'] = row.xpath(ctt + "/h2/a/@href").extract_first(default='')
        item['desc'] = row.xpath(
            ctt + "/div[@class='mob-sub']/text()").extract_first(default='')
        article_url = response.urljoin(item['link'])
        yield scrapy.Request(article_url, callback=self.parse_article)
def parse(self, response):
    """Scrape one article page, then chain to the first 'hot article' link."""
    item = HuxiuItem()
    selector = scrapy.Selector(response)
    title = str(selector.xpath(
        '//h1[@class="t-h1"]/text()').extract()[0]).strip('\n').strip()
    # Renamed from 'time' -- the original shadowed the stdlib time module.
    post_time = selector.xpath(
        '//span[@class="article-time pull-left"]/text() | '
        '//span[@class="article-time"]/text()').extract()[0]
    author = selector.xpath(
        '//span[@class="author-name"]/a/text()').extract()[0]
    collection_num = selector.xpath(
        '//span[@class="article-share pull-left"]/text() | '
        '//span[@class="article-share"]/text()').extract()[0].strip("收藏")
    comment_num = selector.xpath(
        '//span[@class="article-pl pull-left"]/text() | '
        '//span[@class="article-pl"]/text()').extract()[0].strip("评论")
    # ''.join replaces the original's quadratic += concatenation loops.
    content = ''.join(selector.xpath(
        '//div[@class="article-content-wrap"]/p/text()').extract())
    category = ''.join(selector.xpath(
        '//div[@class="column-link-box"]/a/text()').extract())
    url = 'url'  # placeholder kept from the original -- TODO: response.url?
    print(title)
    item['title'] = title
    item['time'] = post_time
    item['author'] = author
    item['collection_num'] = collection_num
    item['comment_num'] = comment_num
    item['content'] = content
    item['url'] = url
    item['category'] = category
    yield item
    url_next = "https://www.huxiu.com" + selector.xpath(
        '//div[@class="hot-article-img"]/a/@href').extract()[0]
    print("@@@@@@@@@@@@@@@2", url_next)
    yield scrapy.Request(url_next, callback=self.parse)
def parse(self, response):
    """Load list-page fields via an ItemLoader, then start POST paging."""
    item_loader = loader.ItemLoader(item=HuxiuItem(), response=response)
    item_loader.add_xpath('title', '//div[@class="mod-info-flow"]//h2//a/text()')
    item_loader.add_xpath(
        'url', '//div[@class="mod-info-flow"]//h2//a/@href',
        MapCompose(lambda href: urljoin('https://www.huxiu.com', href)))
    # Trailing space in the class name matches the site's markup exactly.
    item_loader.add_xpath('author', '//span[@class="author-name "]/text()')
    item_loader.add_value('updata', datetime.datetime.now())

    last_dateline = response.xpath(
        '//div[@class="get-mod-more js-get-mod-more-list transition"]'
        '/@data-last_dateline').extract()[0]
    next_page = 2
    form = {
        'huxiu_hash_code': '27ab1e6d0b9252b75cefec3c71dbcfba',
        'page': '2',
        'last_dateline': str(last_dateline),
    }
    yield scrapy.FormRequest(
        'https://www.huxiu.com/v2_action/article_list',
        formdata=form,
        callback=self.post_parse,
        meta={'page': next_page})
    yield item_loader.load_item()