def parse_content(self, response):
    try:
        item = NewsspiderItem()
        item_type = 'society'
        item_url = response.url
        item_crawl_time = GetCrawlTime.CrawlTime
        # Strip non-breaking spaces from the extracted text fields.
        item_title = ' '.join(response.xpath('//h2[@id="article_title"]/text()').extract()).strip().replace(u'\xa0', u' ')
        item_time = ''.join(response.xpath('//span[@id="pubtime_baidu"]/text()').extract()).strip().replace(u'\xa0', u' ')
        item_content = '\n'.join(response.xpath('//div[@id="content"]//p//text()').extract()).strip().replace(u'\xa0', u' ')
        item_img = response.xpath('//div[@id="content"]//img/@src').extract()
        item_author = ''.join(response.xpath('//div[@class="m-editor"]/text()').extract()).strip().replace(u'\xa0', u' ')
        item_tags = ''
        item_source = u'南方网'
        item_summary = item_content[:100]
        item_img = str(CommonParse.array_uft8_parse(item_img))
        # Keep only the "YYYY-MM-DD HH:MM" prefix before normalizing the publish time.
        item_time = item_time[:16]
        item_time = TimeParse.time_parse(item_time)
        item = item.getItem(url=item_url, title=item_title, tags=item_tags,
                            source=item_source, publish_date=item_time,
                            content=item_content, author=item_author,
                            type=item_type, img=item_img,
                            summary=item_summary, crawl_time=item_crawl_time)
        item.parseStr(item)
        yield item
    except Exception as e:
        print('=' * 100)
        print('%s %s' % (e, response.url))
        # Log failed URLs to a per-spider text file for later inspection.
        with open('%s.txt' % self.name, 'a') as f:
            f.write('%s - %s\n' % (e, response.url))
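# Hypothetical sketch of the TimeParse.time_parse() helper used above, assuming
# it only normalizes the 16-character "YYYY-MM-DD HH:MM" prefix kept by
# parse_content(); the project's real helper may behave differently.
from datetime import datetime


class TimeParse(object):
    @staticmethod
    def time_parse(raw):
        # Normalize "YYYY-MM-DD HH:MM" into "YYYY-MM-DD HH:MM:SS";
        # fall back to the raw string if the format does not match.
        try:
            return datetime.strptime(raw.strip(), '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M:%S')
        except ValueError:
            return raw.strip()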
def parse_class(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    meta = response.request.meta
    category = meta['category']
    current_url = response.url
    content = soup.find("div", class_="container cc-body-content")
    if content is not None:
        item = NewsspiderItem()
        item['category'] = category
        item['url'] = current_url
        item['page'] = content.decode()
        yield item
    else:
        logging.info("failed to get content: " + current_url)
    all_reviews = soup.find("div", class_="course-all-reviews")
    if all_reviews:
        for review_title in all_reviews.find_all(
                "div", class_="review-title title-with-image"):
            a = review_title.find("a")
            if a is not None:
                link = self.index_url + a['href']
                if link not in self.user_dict.keys():
                    logging.info("add new user: " + link)
                    self.user_dict[link] = 'user'
def second_parse(self, response):
    # Get the item passed in under the 'meta_1' key of the request meta.
    meta_1 = response.meta['meta_1']
    items = []
    # Iterate over the child links found on the sub-category page.
    for each in response.xpath('//a/@href'):
        # Keep only links that start with the parent-category URL and end with .shtml.
        if each.extract().encode('utf-8').startswith(
                meta_1['parentUrl'].encode('utf-8')) and each.extract(
                ).encode('utf-8').endswith('.shtml'.encode('utf-8')):
            item = NewsspiderItem()
            item['parentUrl'] = meta_1['parentUrl']
            item['parentTitle'] = meta_1['parentTitle']
            item['subUrl'] = meta_1['subUrl']
            item['subTitle'] = meta_1['subTitle']
            item['subpath'] = meta_1['subpath']
            item['sonUrl'] = each.extract()
            items.append(item)
    # Send a request for each child link.
    for each in items:
        yield scrapy.Request(each['sonUrl'], meta={'meta_2': each}, callback=self.detail_parse)
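# For reference, a minimal sketch of the NewsspiderItem fields that the
# parse()/second_parse()/detail_parse() chain above assumes exist. This is an
# assumption for illustration only; the project's real items.py may declare
# these (and other) fields differently.
import scrapy


class NewsspiderItem(scrapy.Item):
    parentUrl = scrapy.Field()    # top-level category URL
    parentTitle = scrapy.Field()  # top-level category title
    subUrl = scrapy.Field()       # sub-category URL
    subTitle = scrapy.Field()     # sub-category title
    subpath = scrapy.Field()      # local directory used to store this sub-category
    sonUrl = scrapy.Field()       # article URL found on the sub-category page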
def parse(self, response):
    # filename = response.url.split("/")[-2]
    filename = "huxiu"
    with open(filename, 'w', encoding='utf-8') as f:
        for newsCell in response.css("div.mod-b"):
            item = NewsspiderItem()
            # Take the title text from the cell's <h3><a> element.
            item["title"] = newsCell.css("h3 a::text").extract_first(default='').strip()
            f.write(item["title"] + '\n')
            yield item
def parse_item(self, response):
    i = NewsspiderItem()
    i['title'] = response.xpath(
        "/html/head/meta[@property='og:title']/@content").extract()
    i['keywords'] = response.xpath(
        "/html/head/meta[@name='keywords']/@content").extract()
    content = response.xpath("//div[@id='artibody']").extract()
    i['content'] = str(content[0])
    i['link'] = response.url
    i['source'] = 'sina.com.cn'
    i['author'] = response.xpath(
        "//p[@class='article-editor']/text()").extract()
    i['publish_time'] = response.xpath(
        "//meta[@property='article:published_time']/@content").extract()
    return i
def parse_user(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    meta = response.request.meta
    category = 'user'  # meta['category']
    current_url = response.url
    content = soup.find("div", class_="container cc-body-content")
    if content is not None:
        item = NewsspiderItem()
        item['category'] = category
        item['url'] = current_url
        item['page'] = content.decode()
        yield item
    else:
        logging.info("failed to get content: " + current_url)
def parse_detail(self, response):
    news_content = ''
    news_img = ''
    for p in response.xpath('//*[@id="newscontent"]/div/div[1]/div/div/p'):
        content = p.xpath('.//text()').extract()
        img = p.xpath('./img/@src').extract()
        # Check for empty results first, otherwise indexing would raise IndexError.
        if content:
            # print("body text: " + content[0])
            news_content += content[0] + "\r\n"
        if img:
            # print("image: " + img[0])
            news_img += "http://news.xmu.edu.cn" + img[0] + ";"
    item = NewsspiderItem()
    item["news_title"] = response.xpath(
        '//*[@id="mainContainer"]/div[3]/table/tr/td/span/span/span/text()'
    ).extract()[0]
    item["news_content"] = news_content
    item["news_source"] = '厦门大学新闻网'
    item["news_link"] = response.url
    item["news_release_time"] = response.xpath(
        '//*[@id="mainContainer"]/div[4]/table/tr[2]/td/span[1]/span/span/text()'
    ).extract()[0]
    item["news_read_status"] = '1'
    item["news_get_time"] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    item["news_imgs"] = news_img

    release_time = item['news_release_time']
    print("the latest_release_time from web_xmu_news is {0}".format(
        self.latest_release_time))
    # Compare the page's publish date with the newest date already in the database.
    st1 = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
    st2 = time.mktime(time.strptime(self.latest_release_time, "%Y-%m-%d"))
    result = int(st1) - int(st2)
    print("publish time: {0}, continue: {1}".format(release_time, result))
    # Stop crawling once the publish time is not newer than the latest time in the
    # database; skip items whose link is already stored.
    if result <= 0:
        self.crawler.engine.close_spider(self, 'Finished crawling Xiamen University news.')
    elif myMysql.myMysql().columnExist(tableName="web_news", columnValue=item['news_link']):
        return
    else:
        yield item
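# Hypothetical sketch of the myMysql.columnExist() duplicate check called above.
# Assumptions for illustration only: a pymysql connection, a web_news table, and
# that the helper matches on the news_link column; the project's real helper and
# its connection settings may differ.
import pymysql


class myMysql(object):
    def __init__(self, host='localhost', user='root', password='', db='news'):
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    db=db, charset='utf8mb4')

    def columnExist(self, tableName, columnValue):
        # Return True if a row with this news_link already exists in the table.
        sql = 'SELECT 1 FROM {} WHERE news_link = %s LIMIT 1'.format(tableName)
        with self.conn.cursor() as cursor:
            cursor.execute(sql, (columnValue,))
            return cursor.fetchone() is not None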
def second_parse(self, response):
    meta_1 = response.meta['meta_1']
    items = []
    # Collect the article URLs on this page: keep links that start with the
    # first-level URL and end with .shtml.
    for each in response.xpath('//a/@href'):
        if each.extract().encode('utf-8').startswith(
                meta_1['first_url'].encode('utf-8')) and each.extract(
                ).encode('utf-8').endswith('.shtml'.encode('utf-8')):
            item = NewsspiderItem()
            item['first_url'] = meta_1['first_url']
            item['second_url'] = meta_1['second_url']
            item['article_url'] = each.extract()
            items.append(item)
    # Request each article page.
    for each in items:
        yield scrapy.Request(each['article_url'], meta={'meta_2': each}, callback=self.detail_parse)
def parse(self, response):
    # Loop over every first-level section under #tab01, skipping the
    # '地方站' (local sites) block, to collect first-level URLs and titles.
    for each in response.xpath(
            "//div[@id='tab01']/div[@data-sudaclick!='citynav']"):
        # First-level section URL.
        first_url = each.xpath('./h3/a/@href').extract()[0]
        # Loop over the second-level links.
        for other in each.xpath("./ul/li/a"):
            # Keep second-level URLs that start with the first-level URL.
            if other.xpath('./@href').extract()[0].startswith(first_url):
                item = NewsspiderItem()
                second_url = other.xpath('./@href').extract()[0]
                item['first_url'] = first_url
                item['second_url'] = second_url
                # Request the second-level page, passing the item along in meta.
                yield scrapy.Request(url=item['second_url'],
                                     meta={'meta_1': item},
                                     callback=self.second_parse)
def parse_detail(self, response):
    print(response.status)
    # Article title.
    print(response.xpath('//h1[@class="main-title"]/text()').extract()[0])
    news = NewsspiderItem()
    news["url"] = response.url
    news["title"] = response.xpath(
        '//h1[@class="main-title"]/text()').extract()[0]
    news["time"] = response.xpath(
        '//*[@id="top_bar"]/div/div[2]/span/text()').extract()[0]
    news["origin"] = response.xpath(
        '//*[@id="top_bar"]/div/div[2]/a/text()').extract()
    news["origin_url"] = response.xpath(
        '//*[@id="top_bar"]/div/div[2]/a/@href').extract()[0]
    # The body can live under several layouts, so join the candidate nodes together.
    news["detail"] = "\n".join(response.xpath('//div[@class="article"]/div/p/text()').extract()) + \
        "\n".join(response.xpath('//div[@class="article"]/p/text()').extract()) + \
        "\n".join(response.xpath('//div[@class="article"]/div/div/text()').extract())
    yield news
def parse(self, response):
    # Walk the top-level category links, using the #tab01 node as the root.
    for each in response.xpath(
            "//div[@id='tab01']/div[@data-sudaclick!='citynav']"):
        # Top-level category URL and title.
        # encode('utf-8') would turn the str into bytes.
        parentUrl = each.xpath('./h3/a/@href').extract()[0]
        # parentTitle = each.xpath('./h3/a/text()').extract()[0].encode('utf-8')
        parentTitle = each.xpath('./h3/a/text()').extract()[0]
        # Directory used to store this top-level category.
        parentpath = './data/' + parentTitle
        # parentpath = parentTitle
        if not os.path.exists(parentpath):
            os.makedirs(parentpath)
        # Loop over the sub-category links.
        for other in each.xpath("./ul/li/a"):
            # Keep sub-category links that start with the top-level URL.
            if other.xpath('./@href').extract()[0].startswith(parentUrl):
                # Note where the item is created: a different position gives a
                # different result. Avoid filling one item partly in the outer
                # loop and partly in the inner loop; if that is unavoidable,
                # collect the items in a list instead.
                item = NewsspiderItem()
                subUrl = other.xpath('./@href').extract()[0]
                subTitle = other.xpath('./text()').extract()[0]
                subpath = parentpath + '/' + subTitle
                item['parentUrl'] = parentUrl
                item['parentTitle'] = parentTitle
                item['subUrl'] = subUrl
                item['subTitle'] = subTitle
                item['subpath'] = subpath
                if not os.path.exists(subpath):
                    os.makedirs(subpath)
                # Request the sub-category page, passing the item through the meta
                # parameter so the callback can read it via response.meta['meta_1'].
                yield scrapy.Request(url=item['subUrl'],
                                     meta={'meta_1': item},
                                     callback=self.second_parse)
def archive(self, response):
    title_tag = response.css('.noBorder::text').extract_first().strip()
    art_author = response.css('.art_author::text').extract_first().strip()
    # The author line looks like "作者:xxx"; keep only the part after the colon,
    # otherwise fall back to "佚名" (anonymous).
    art_authors = art_author.split(':')
    if len(art_authors) == 2:
        art_author = art_authors[1]
    else:
        art_author = u'佚名'
    art_publish = response.css('.art_publish::text').extract_first().strip()
    art_publishs = art_publish.split(':')
    if len(art_publishs) == 2:
        art_publish = art_publishs[1]
    else:
        art_publish = '2001-01-01'
    art_con = response.css('.atr_con').extract_first()
    item = NewsspiderItem()
    item['title'] = title_tag
    item['author'] = art_author
    item['date'] = art_publish
    item['content'] = art_con
    item['url'] = response.url
    # print(item['title'])
    yield item
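# Note (assumption): Chinese pages often label fields with a full-width colon
# ("作者：xxx") rather than an ASCII ":". If archive() above ever sees that form,
# a split that accepts either separator is safer. A minimal sketch; the helper
# name is hypothetical and not part of the original spider.
import re


def split_label(text):
    # Split "作者：张三" or "作者:张三" into ["作者", "张三"].
    return re.split(u'[:：]', text, maxsplit=1)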