def crawl_tieba(self, response):
    trends = response.css('div.main > ul > li a')
    for trend in trends:
        item = ScrapyNewsItem()
        item['source'] = 'tieba_spider'
        item['title'] = trend.css('a::text').extract_first()
        item['url'] = trend.css('a').attrib['href']
        item['remark'] = ''
        yield item
def crawl_topbaidu(self, response):
    trends = response.css('td.keyword > a:nth-child(1)')
    for trend in trends:
        item = ScrapyNewsItem()
        item['source'] = 'topbaidu_spider'
        item['title'] = trend.css('a::text').extract_first()
        item['url'] = trend.css('a').attrib['href']
        item['remark'] = ''
        yield item
def crawl_douban(self, response):
    trends = response.css('ul.trend > li > a')
    for trend in trends:
        item = ScrapyNewsItem()
        item['source'] = 'douban_spider'
        item['title'] = trend.css('a::text').extract_first()
        item['url'] = trend.css('a').attrib['href']
        item['remark'] = ''
        yield item
def parse_item(self, response):
    # Detail-page fields, addressed by absolute XPaths from <body>;
    # these break easily if the page layout changes.
    item = ScrapyNewsItem()
    item['title'] = response.xpath('/html/body/div[3]/div[2]/div[2]/p/text()').extract()
    item['date'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[2]/text()').extract()
    item['state'] = response.xpath('normalize-space(/html/body/div[3]/div[2]/div[2]/div[1]/span[3]/text())').extract()
    item['number'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract()
    item['content'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract()
    item['url'] = response.url
    yield item
def crawl_hupu(self, response):
    trends = response.css('div.list > ul > li > span:nth-child(1) > a')
    for trend in trends:
        item = ScrapyNewsItem()
        item['source'] = 'hupu_spider'
        item['title'] = trend.css('a').attrib['title']
        item['url'] = "https://bbs.hupu.com" + trend.css('a').attrib['href']
        item['remark'] = ''
        yield item
def crawl_weibo(self, response):
    # Weibo hot search: titles sit in the second table column.
    trends = response.css('td.td-02 > a')
    for trend in trends:
        item = ScrapyNewsItem()
        item['source'] = 'weibo_spider'
        item['title'] = trend.css('a::text').extract_first()
        href = self.get_weibo_href(trend)  # see the sketch after this function
        item['url'] = "https://s.weibo.com" + href
        item['remark'] = ''
        yield item
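# A possible body for the get_weibo_href helper used above; its real
# implementation is not shown in this section. This sketch assumes that
# promoted rows on s.weibo.com carry a javascript stub in href and hide
# the real link in an href_to attribute.
def get_weibo_href(self, trend):
    href = trend.attrib.get('href', '')
    if href.startswith('javascript'):
        # Assumed fallback attribute for ad/promoted entries.
        href = trend.attrib.get('href_to', '')
    return href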
def crawl_github(self, response):
    # GitHub trending: one repository per article.Box-row.
    # Requires `import re` at module top.
    trends = response.css('div > article.Box-row')
    for trend in trends:
        item = ScrapyNewsItem()
        item['source'] = 'github_spider'
        title = "".join(trend.css('p::text').extract())
        # re.sub and str.replace return new strings, so reassign the
        # cleaned title instead of discarding the result.
        title = re.sub(r'[\\*|“<>:/()()0123456789]', '', title)
        title = title.replace('\n', '').replace(' ', '')
        item['title'] = title
        item['url'] = "https://github.com" + trend.css('h1 > a').attrib['href']
        item['remark'] = ''
        yield item
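# A minimal sketch of how these callbacks could be wired to their start
# pages; the real spider defines its own start_requests or rules, and the
# mapping below is illustrative (only the Weibo and GitHub endpoints are
# well-known URLs). Requires `import scrapy` at module top.
def start_requests(self):
    starts = {
        'https://s.weibo.com/top/summary': self.crawl_weibo,
        'https://github.com/trending': self.crawl_github,
    }
    for url, callback in starts.items():
        yield scrapy.Request(url, callback=callback)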
def agricultural_detail(self, response):
    item = ScrapyNewsItem()
    # Article body text can live directly under TRS_Editor, in <p> tags,
    # or inside a Custom_UnionStyle wrapper; collect all three.
    base_contents_list = response.xpath(
        '//div[@class="content"]/div[@class="TRS_Editor"]/div/text()').extract()
    tag_p_contents_list = response.xpath(
        '//div[@class="content"]/div[@class="TRS_Editor"]/p/text()').extract()
    tag_image_contents_list = response.xpath(
        '//div[@class="content"]/div[@class="TRS_Editor"]/div[@class="Custom_UnionStyle"]/p/text()'
    ).extract()
    base_contents = "\n".join(base_contents_list).strip()
    tag_p_contents = "\n".join(tag_p_contents_list).strip()
    tag_image_contents = "\n".join(tag_image_contents_list).strip()
    contents = base_contents + tag_p_contents + tag_image_contents
    if not contents:
        return
    try:
        item["image"] = response.xpath(
            '//div[@class="content"]/div[@class="TRS_Editor"]/div[@class="Custom_UnionStyle"]/p/img/@src'
        ).extract()[0]
    except IndexError:
        # Not every article carries an inline image.
        pass
    item["title"] = response.xpath(
        '//div[@class="zhengwen-left-container"]/h1[@class="wtitle"]/text()').extract()[0]
    item["contents"] = contents
    # "责任编辑" ("responsible editor") marks the byline div.
    item["auth"] = response.xpath(
        '//div[@class="content"]/div[contains(text(),"责任编辑")]/text()').extract()[0]
    item["occurd_time"] = response.xpath(
        '//div[@class="yui3-g"]/div[@class="yui3-u"]/p[@class="wlaiyuan"]/text()').extract()[0]
    item["source_url"] = response.url
    item["source"] = "中国农业新闻网"  # China Agricultural News Network
    yield item
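# A sketch of the ScrapyNewsItem these callbacks populate, inferred from
# the fields assigned above; the project's actual items.py definition may
# differ.
import scrapy

class ScrapyNewsItem(scrapy.Item):
    # trend-list fields
    source = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    remark = scrapy.Field()
    # parse_item fields
    date = scrapy.Field()
    state = scrapy.Field()
    number = scrapy.Field()
    content = scrapy.Field()
    # agricultural_detail fields
    image = scrapy.Field()
    contents = scrapy.Field()
    auth = scrapy.Field()
    occurd_time = scrapy.Field()
    source_url = scrapy.Field()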