def parse_item(self, response):
    """Extract title, date, source and content into a SpiderItem.

    On any extraction error a placeholder item is emitted instead; the
    url, collection_name and website fields are always populated.
    """
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        l.add_value('title',
                    response.xpath('//title/text()').extract_first() or '')
        # BUG FIX: the original wrote `l.add_value(...) or '1970-01-01
        # 00:00:00'`, applying the fallback to add_value's return value
        # (always None) so the default date never reached the item.  The
        # `or` belongs on extract_first().
        l.add_value(
            'date',
            response.xpath('//span[@class="pubTime"]/text()').extract_first()
            or '1970-01-01 00:00:00')
        l.add_value(
            'source',
            response.xpath('//a[@rel="nofollow"]/text()').extract_first()
            or self.website)
        l.add_value(
            'content',
            ''.join(
                response.xpath(
                    '//div[@id="article_text"]/descendant-or-self::p/text()'
                ).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        # Start over with a fresh loader so a half-filled item is not emitted.
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_item(self, response):
    """Parse one article page; the info div holds [date, source, author]."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        info = response.xpath('//div[@class="info"]/span/text()').extract()
        loader.add_value(
            'title', response.xpath('//title/text()').extract_first() or '')
        # Fall back to the epoch when the info block is missing pieces.
        loader.add_value('date',
                         info[0] if info else '1970-01-01 00:00:00')
        loader.add_value('source', info[1] if len(info) > 1 else '')
        body = response.xpath(
            '//div[@class="TRS_Editor"]/descendant-or-self::text()').extract()
        loader.add_value('content', ''.join(body))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
    finally:
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def parse_item(self, response):
    """Parse an article whose date is rendered as 'YYYY年MM月DD日'."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value('title',
                         response.xpath('//title/text()').extract_first())
        raw = response.xpath('//td[@align="center"]/text()').re(
            u'\d+年\d+月\d+日')[0]
        # Swap the CJK date markers for '-' / ' ' and append midnight.
        for marker, repl in ((u'年', '-'), (u'月', '-'), (u'日', ' ')):
            raw = raw.replace(marker, repl)
        loader.add_value('date', raw + '00:00:00')
        loader.add_value('source', self.website)
        text_nodes = response.xpath(
            '//td[@style="line-height: 30px;font-size:16px; padding-top:10px;"]'
            '/descendant-or-self::text()').extract()
        loader.add_value('content', ''.join(text_nodes))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
    finally:
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def parse_item(self, response):
    """Parse an article page; content may live under several div ids."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value(
            'title', response.xpath('//title/text()').extract_first() or '')
        loader.add_value(
            'date',
            response.xpath('//div[@class="titleInfo"]/span[1]/text()')
            .extract_first() or '')
        loader.add_value('source', self.website)
        # Concatenate the text of every known content container id.
        pieces = [
            ''.join(response.xpath(
                '//div[@id="%s"]/descendant-or-self::text()' % cid).extract())
            for cid in ['artibody']
        ]
        loader.add_value('content', ''.join(pieces))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
    finally:
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def get_news(self, response):
    """Parse a forum thread page into a SpiderItem."""
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value(
            'title',
            response.xpath('//span[@id="thread_subject"]/text()').extract())
        loader.add_value(
            'date',
            response.xpath('//div[@class="authi"]/em/text()').extract())
        # Keep only the first 'YYYY-M-D HH:MM' timestamp found in the
        # collected date text.
        stamp_re = re.compile(r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}")
        joined = ''.join(loader.get_collected_values('date'))
        loader.replace_value('date', stamp_re.findall(joined)[0])
        # The post body shows up under many different node shapes; probe
        # each known selector in turn.
        content_paths = (
            '//div[@class="t_fsz"]/table/tr/td/div/text()',
            '//div[@class="t_fsz"]/table/tr/td/br/text()',
            '//div[@class="t_fsz"]/table/tr/td/text()',
            '//div[@class="t_fsz"]/table/tr/td/div/font/font/text()',
            '//div[@class="t_fsz"]/table/tr/td/font/font/text()',
            '//div[@class="t_fsz"]/table/tr/td/font/font/font/text()',
            '//div[@class="t_fsz"]/table/tr/td/p/font/font/text()',
            '//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()',
        )
        for xp in content_paths:
            loader.add_value('content', response.xpath(xp).extract())
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def get_news(self, response):
    """Parse an article page; the raw date string is stored unmodified."""
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value(
            'title', response.xpath('//div[@class="l_tit"]/text()').extract())
        loader.add_value(
            'date',
            response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
        for xp in ('//div[@class="TRS_Editor"]/p/text()',
                   '//div[@class="TRS_Editor"]/p/strong/text()',
                   '//div[@class="article"]/div/p/text()'):
            loader.add_value('content', response.xpath(xp).extract())
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def parse_item(self, response):
    """Parse an article page, guarding against soft-404 pages.

    Emits a placeholder item (epoch date, empty fields) when anything
    fails; url/collection_name/website are always populated.
    """
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        # BUG FIX: the original matched a str pattern against
        # response.body, which is bytes on Python 3 -- re.search raised
        # TypeError and every page was logged as an error.  Match the
        # decoded response.text instead, and escape the '.' so the
        # pattern no longer matches any character after '404'.
        if re.search(r'/404\.h', response.text) is not None:
            raise Exception('this item may be deleted')
        if response.status != 200:
            raise Exception('response status %s' % response.status)
        l.add_value('title',
                    response.xpath('//title/text()').extract_first() or '')
        date = response.xpath('//td[@width="25%"]/text()').re(
            u"\d+年\d+月\d+日")[0]
        # Normalise 'YYYY年MM月DD日' to 'YYYY-MM-DD ' and append midnight.
        date = date.replace(u'年', '-').replace(u'月', '-').replace(
            u'日', ' ') + '00:00:00'
        l.add_value('date', date)
        l.add_value('source', self.website)
        l.add_value('content', ''.join(
            response.xpath('//td[@class="font02"]/text()').extract()))
        l.add_value('content', ''.join(
            response.xpath(
                '//div[@id="ozoom"]/descendant-or-self::p/text()').extract())
            or '')
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    """Parse an article page, probing the selectors of several layouts."""
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        for xp in ('//div[@id="biaoti"]/text()',
                   '//h1[@id="biaoti"]/text()'):
            loader.add_value('title', response.xpath(xp).extract())
        for xp in ('//span[@id="pubtime_baidu"]/text()',
                   '//div[@class="center lh32 grey12a"]/text()',
                   '//div[@id="left"]/h2/text()'):
            loader.add_value('date', response.xpath(xp).extract())
        for xp in ('//div[@id="zw"]/p/text()',
                   '//div[@id="zw"]/strong/p/text()'):
            loader.add_value('content', response.xpath(xp).extract())
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        # Pages from the shzfzz host use a CJK date that needs
        # normalising to 'YYYY-MM-DD HH:MM:SS'.
        if response.url[11:17] == "shzfzz":
            raw = ''.join(loader.get_collected_values('date'))
            parsed = time.strptime(raw.split()[0], u'%Y年%m月%d日')
            loader.replace_value(
                'date', time.strftime('%Y-%m-%d %H:%M:%S', parsed))
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def get_news(self, response):
    """Build a SpiderItem directly (no ItemLoader) and yield it."""
    try:
        item = SpiderItem()
        item['title'] = ''.join(
            response.xpath('//div[@class="STYLE67"]/text()').extract())
        item['content'] = ''.join(response.xpath('//p/text()').extract())
        item['collection_name'] = self.name
        item['url'] = response.url
        # The first ten characters of the date cell are taken as the
        # 'YYYY-MM-DD' part; a midnight time is appended.
        raw_date = ''.join(response.xpath(
            '//td[@class="style5"]/div[2]/text()').extract())
        item['date'] = raw_date[0:10] + ' 00:00:00'
        item['website'] = self.website
        yield item
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        yield l.load_item()
def parse_item(self, response):
    """Parse a page whose header span holds source and date separated by
    four non-breaking spaces."""
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title',
                         response.xpath('//title/text()').extract_first())
        header = response.xpath(
            '//div[@class="conpage"]/span/text()').extract_first()
        parts = header.split(u'\xa0\xa0\xa0\xa0')
        stamp = search(r'\d+-\d+-\d+\s+(\d+){0,1}(:\d+){0,2}',
                       parts[1]).group()
        # A 'HH:MM' time gets ':00' seconds appended.
        if stamp.count(':') == 1:
            stamp += ':00'
        loader.add_value('date', stamp)
        loader.add_value('source', parts[0])
        loader.add_value('content', ''.join(response.xpath(
            '//div[@class="conpage"][2]/descendant-or-self::text()'
        ).extract()))
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
def parse_item(self, response):
    """Dispatch field extraction to self.get<field> helper methods."""
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        for field in ('title', 'date', 'content'):
            # e.g. 'title' -> self.gettitle(response)
            extractor = getattr(self, 'get' + field, None)
            if extractor:
                l.add_value(field, extractor(response))
            else:
                self.logger.error('no method for %s' % field)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def parse_item(self, response):
    """Extract article links from an XML <record> listing and schedule
    a get_news request for each new one.

    BUG FIX: the original looped over every <record> node but never used
    the node -- it re-scanned the whole response and re-yielded the same
    Requests once per record, and with dont_filter=True each duplicate
    was actually fetched.  The scan now runs exactly once (still only
    when at least one <record> is present, preserving the no-records
    behavior).  Matching is done on response.text because response.body
    is bytes on Python 3 and a str pattern would raise TypeError.
    """
    sel = Selector(response)
    if sel.xpath('//record'):
        title_pattern = re.compile(".*?title='(.*?)'", re.S)
        href_pattern = re.compile(".*?href='(.*?)'", re.S)
        item = SpiderItem()
        item['title'] = re.findall(title_pattern, response.text)
        for href in re.findall(href_pattern, response.text):
            url = 'http://gzgy.lss.gov.cn' + href
            # Stop at the first already-crawled url (same cutoff the
            # original applied).
            if self.filter.url_exist(url):
                break
            yield Request(url, callback=self.get_news, dont_filter=True)
def parse_item(self, response):
    """Parse an article page; the date may sit in one of two spans and,
    when in CJK form, is normalised to 'YYYY-MM-DD HH:MM:SS'."""
    l = ItemLoader(item=SpiderItem(), response=response)

    def _normalised(raw):
        # Convert 'YYYY年MM月DD日 HH:MM:SS' to ISO-ish form; anything
        # else (None, empty, already formatted) passes through unchanged.
        if raw is not None and raw != '' and raw.find(u'年') != -1:
            parsed = time.strptime(raw, u'%Y年%m月%d日 %H:%M:%S')
            return time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.mktime(parsed)))
        return raw

    try:
        l.add_value('title',
                    response.xpath('//title/text()').extract_first() or '')
        l.add_value('date', _normalised(response.xpath(
            '//span[@class="padding-left cccc"]/text()').extract_first()))
        l.add_value('date', _normalised(response.xpath(
            '//span[@class="time"]/text()').extract_first()))
        l.add_value('source', self.website)
        # Concatenate the text of every known content container class.
        text = ''
        for cname in ['newstext']:
            text += ''.join(response.xpath(
                '//div[@class="%s"]/descendant-or-self::text()'
                % cname).extract())
        l.add_value('content', text)
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    """Parse a table-layout article page (no error fallback here)."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    loader.add_value('title', response.xpath('//h3/text()').extract())
    loader.add_value('date', response.xpath(
        '//table[3]/tr[2]/td/table/tr/td/div/text()').extract())
    # The body may be one level deeper on some pages; try both shapes.
    for xp in ('//table[3]/tr[2]/td/table/tr[3]/td/div/p/text()',
               '//table[3]/tr[2]/td/table/tr[3]/td/div/div/p/text()'):
        loader.add_value('content', response.xpath(xp).extract())
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    return loader.load_item()
def get_news(self, response):
    """Parse an article page; CJK dates are normalised to ISO form."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        for xp in ('//h1[@id="title"]/text()',
                   '//span[@id="title"]/text()'):
            loader.add_value("title", response.xpath(xp).extract_first())
        for xp in ('//span[@class="time"]/text()',
                   '//span[@id="pubtime"]/text()'):
            loader.add_value("date", response.xpath(xp).extract_first())
        raw = ''.join(loader.get_collected_values("date")).strip()
        parsed = time.strptime(raw, '%Y年%m月%d日 %H:%M:%S')
        loader.replace_value("date",
                             time.strftime("%Y-%m-%d %H:%M:%S", parsed))
        for xp in ('//div[@id="content"]/descendant-or-self::text()',
                   '//div[@class="article"]/descendant-or-self::text()'):
            loader.add_value("content",
                             ''.join(response.xpath(xp).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    """Fill in date/content/view_num on the item passed via request meta
    and yield it.

    BUG FIX: the original only decoded the body when chardet reported a
    non-utf-8 encoding; for utf-8 pages `html_content` was never
    assigned and the subsequent re.findall raised NameError, so every
    utf-8 page produced a placeholder item.  The body is now always
    decoded.  The original also rebound `response` to a Selector, which
    broke `response.url` in the except path; a separate local is used.
    """
    try:
        item = response.meta['item']
        detected = chardet.detect(response.body)
        # Fall back to utf-8 when chardet cannot tell (encoding is None).
        encoding = detected['encoding'] or 'utf-8'
        html_content = response.body.decode(encoding, 'ignore')
        pattern = re.compile('align=center bgcolor="#FFFFFF">(.*?) ', re.S)
        date = re.findall(pattern, html_content)[0]
        selector = Selector(text=html_content)
        view_num = selector.xpath(
            '//font[@color="red"]/text()').extract_first()
        data = selector.xpath('//td[@id="fontzoom"]')
        content = data.xpath('string(.)').extract_first().replace(
            u'\xa0', '')
        # The date cell reads 'label:YYYY年MM月DD日'; keep the part after
        # the colon and normalise it.
        date = time.strptime(date.split(u':')[1].strip(), u'%Y年%m月%d日')
        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', date)
        item['content'] = content
        item['view_num'] = view_num
        item['collection_name'] = self.name
        item['website'] = self.website
        yield item
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        yield l.load_item()
def get_news(self, response):
    """Parse a table-styled article page."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    # Every field lives under the same style-matched table cell.
    base = '//td[@style=" padding-bottom:5px; padding-top:5px"]'
    try:
        loader.add_value("title", response.xpath(
            base + '//tr[2]/td/text()').extract_first())
        stamp = response.xpath(
            base + '//td[@style="font-size:12px"]/text()').extract_first()
        # Slice [5:15] of the info line is taken as the date.
        loader.add_value("date", stamp[5:15])
        for tail in ('//div/descendant-or-self::text()',
                     '//p/descendant-or-self::text()'):
            loader.add_value("content", ''.join(
                response.xpath(base + tail).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    """Parse an announcement page; the date follows a '日期:' label."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value(
            'title',
            response.xpath('//p[@class="title1"]/text()').extract_first())
        meta_line = response.xpath(
            '//pre[@class="f_title"]/text()').extract_first()
        # Take the ten characters right after the '日期:' label.
        start = meta_line.find(u"日期:") + 3
        loader.replace_value('date', meta_line[start:][0:10])
        loader.add_value('content', ''.join(response.xpath(
            '//div[@class="contents"]/descendant-or-self::text()'
        ).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    """Parse a news page across several known layouts.

    The date is the second-to-last path segment of the url (presumably a
    'YYYY-MM-DD' directory -- TODO confirm against the crawled site).
    BUG FIX: '//h1[@id="artibodyTitle"]/text()' was queried and added
    twice, so matching pages collected the same title value twice; the
    duplicate lookup has been removed.
    """
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        title_paths = (
            '//h1[@id="artibodyTitle"]/text()',
            '//div[@class="article-header clearfix"]/h1/text()',
            '//h2[@id="titleText"]/text()',
            '//h1[@id="main_title"]/text()',
            '//span[@class="location"]/h1/text()',
        )
        for xp in title_paths:
            loader.add_value("title", response.xpath(xp).extract_first())
        url = response.url
        # Text between the last two '/' separators.
        loader.add_value(
            "date",
            url[url.rfind('/', 0, url.rfind('/')) + 1:url.rfind('/')])
        content_paths = (
            '//div[@id="artibody"]//p/descendant-or-self::text()',
            '//div[@class="article-body main-body"]//p/descendant-or-self::text()',
            '//div[@id="mainContent"]//p/descendant-or-self::text()',
        )
        for xp in content_paths:
            loader.add_value("content",
                             ''.join(response.xpath(xp).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    """Parse an article page and yield the resulting item."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        for xp in ('//div[@class="articleTitle"]/h2/text()',
                   '//h1[@class=" BSHARE_POP"]/text()'):
            loader.add_value("title", response.xpath(xp).extract_first())
        for xp in ('//span[@class="yearMsg"]/text()',
                   '//span[@id="pubtime_baidu"]/text()'):
            loader.add_value("date", response.xpath(xp).extract_first())
        loader.add_value("source", response.xpath(
            '//span[@class="sourceMsg"]/text()').extract_first())
        for xp in (
                '//div[@class="articleBox mb20 cfix"]/descendant-or-self::text()',
                '//div[@id="pzoom"]/p/descendant::text()'):
            loader.add_value("content",
                             ''.join(response.xpath(xp).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value("source", '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    yield loader.load_item()
def get_news(self, response):
    """Parse an article page; the info span ends with a minute-precision
    timestamp that gets ':00' seconds appended."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath(
            '//h1[@id="articleTitle"]/text()').extract_first())
        info = response.xpath(
            '//div[@class="content-th-info"]/span/text()').extract_first()
        # The last 16 characters of the span are taken as the timestamp.
        loader.add_value("date", info[-16:] + ":00")
        loader.add_value("content", ''.join(response.xpath(
            '//div[@class="document"]/descendant-or-self::text()'
        ).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news_list(self, response):
    """Walk a paginated ASP.NET news list, scheduling a get_news request
    per entry and POSTing back for the next page.

    Stops as soon as an already-crawled url is seen (entries are
    presumably newest-first -- TODO confirm).
    """
    for sel in response.xpath('//ul[contains(@class, "main_right_box")]/li'):
        url = urljoin(response.url, sel.xpath('a/@href').extract_first())
        # Already seen: abandon both this page and further pagination.
        if self.filter.url_exist(url):
            return
        title = sel.xpath('a/@title').extract_first()
        date = sel.xpath('span/text()').extract_first()
        item = SpiderItem()
        item['url'] = url
        item['title'] = title
        date = time.strptime(date.strip(), '%Y-%m-%d %H:%M:%S')
        item['date'] = time.strftime('%Y-%m-%d %H:%M:%S', date)
        # Hand the partially-filled item to get_news via request meta.
        request = Request(url=url, callback=self.get_news)
        request.meta['item'] = item
        yield request
    # ASP.NET pager: the second-to-last anchor carries the "next" href.
    next_page = response.xpath('//div[@id="AspNetPager1"]/a')[-2].xpath('@href')
    if next_page:
        # Current page number is rendered in the highlighted span; the
        # next page is requested by simulating the pager's postback.
        next_one = int(response.xpath('//span[@style="margin-right:5px;font-weight:Bold;color:red;"]/text()').extract()[0]) + 1
        view_state = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
        data = {
            '__EVENTTARGET': 'AspNetPager1',
            '__EVENTARGUMENT': str(next_one),
            '__VIEWSTATE': view_state
        }
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
            'Host': "gyxfdj.gov.cn",
            'Referer': response.url
        }
        yield FormRequest(url=response.url, method="POST", formdata=data, headers=headers, callback=self.get_news_list)
def get_news_list(self, response):
    """Parse a script-generated news list and follow index_<n>.html pages."""
    script = response.xpath('//ul[@class="list01"]').extract_first()
    # Entries are emitted by inline JS: str_1 is the relative url,
    # str_3 the title, and the <span> holds the date.
    pattern = re.compile('str_1 = "(.*?)".*?str_3 = "(.*?)".*?<span>(.*?)</span>', re.S)
    results = re.findall(pattern, ''.join(script))
    for result in results:
        url = urljoin(response.url, result[0])
        # Stop at the first already-crawled url (abandons pagination too).
        if self.filter.url_exist(url):
            return
        title = result[1]
        date = result[2]
        item = SpiderItem()
        item['url'] = url
        item['title'] = title
        item['date'] = date.strip() + " 00:00:00"
        # Hand the partially-filled item to get_news via request meta.
        request = Request(url=url, callback=self.get_news)
        request.meta['item'] = item
        yield request
    # Pagination data is embedded as createPageHTML(<total>, <current>, ...).
    next_page = response.xpath('//div[@class="page"]/script').extract_first()
    pattern = re.compile('createPageHTML\((.*?), (.*?),', re.S)
    result = re.findall(pattern, ''.join(next_page))[0]
    all_page = int(result[0]) - 1
    current_page = int(result[1])
    if all_page != current_page:
        url = response.url
        # Rebuild the listing url as .../index_<n+1>.html
        url = url[0:url.find('index')]
        yield Request(url=url + 'index_' + str(current_page + 1) + ".html", callback=self.get_news_list)
def get_news(self, response):
    """Parse an article page into a SpiderItem."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath(
            '//div[@id="cont_1_1_2"]/h1[1]/text()').extract_first())
        loader.add_value("date", response.xpath(
            '//span[@id="pubtime_baidu"]/text()').extract_first())
        body = response.xpath(
            '//div[@class="left_zw"]/descendant-or-self::text()').extract()
        loader.add_value("content", ''.join(body))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    """Parse a table-layout article; the date is a 19-char CJK timestamp."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath(
            '//span[@class="jiatype16"]/strong[1]/text()').extract_first())
        cells = response.xpath(
            '//td[@class="news-bg"]/table//tr[2]/td/text()').extract()
        # The last cell ends with 'YYYY年MM月DD日HH:MM:SS' (19 chars).
        parsed = time.strptime(cells[-1][-19:], "%Y年%m月%d日%H:%M:%S")
        loader.add_value("date", time.strftime("%Y-%m-%d %H:%M:%S", parsed))
        loader.add_value("content", ''.join(response.xpath(
            '//td[@class="jiawenzhang-type"]/descendant-or-self::text()'
        ).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    return loader.load_item()
def get_news(self, response):
    """Parse an article page and yield the item."""
    loader = ItemLoader(item=SpiderItem(), response=response)
    try:
        loader.add_value("title", response.xpath(
            '//div[@class="artz"]/h2[1]/text()').extract_first())
        stamp = response.xpath(
            '//p[@class="fbsj"]/text()').extract_first()
        # Slice [3:19] skips a label prefix; ':00' seconds are appended.
        loader.add_value("date", stamp[3:19] + ":00")
        loader.add_value("content", ''.join(response.xpath(
            '//div[@class="artz"]/div/descendant-or-self::text()'
        ).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('content', '')
    loader.add_value('url', response.url)
    loader.add_value('collection_name', self.name)
    loader.add_value('website', self.website)
    yield loader.load_item()
def get_news_list(self, response):
    """Crawl a legacy-encoded news list.

    If a "more" link is present, follow it first; otherwise schedule one
    get_news request per row and follow the '下一页' (next page) link as
    long as no already-crawled url has been seen.
    """
    content_type = chardet.detect(response.body)
    if content_type['encoding'] != 'utf-8':
        html_content = response.body.decode(content_type['encoding'], 'ignore')
        selector = Selector(text=html_content)
    # NOTE(review): `selector` is only bound when the page is NOT utf-8;
    # a utf-8 response would raise NameError on the next line.
    # Presumably this site always serves a legacy encoding -- confirm.
    more = selector.xpath('//a[@class="class"]/@href')
    if more:
        url = urljoin(response.url, more.extract_first())
        yield Request(url=url, callback=self.get_news_list)
    else:
        follow = True
        for sel in response.xpath('//td[@width="62%"]'):
            item = SpiderItem()
            url = urljoin(response.url, sel.xpath('a/@href').extract_first())
            # Stop scheduling and paging once a known url shows up.
            if self.filter.url_exist(url):
                follow = False
                break
            title = sel.xpath('a/@title').extract_first()
            item['url'] = url
            item['title'] = title
            # Hand the partially-filled item to get_news via request meta.
            request = Request(url=item['url'], callback=self.get_news)
            request.meta['item'] = item
            yield request
        if follow:
            next_page = response.xpath('//a[@class="black"]')
            if next_page and next_page.xpath(
                    'text()').extract_first() == u'下一页':
                next_one = next_page.xpath('@href').extract()[-1]
                yield Request(url=next_one, callback=self.get_news_list)
def parse_item(self, response):
    """Parse an article page; the publication date is taken from the url.

    Urls embed a 't' followed by an 8-digit date (e.g. t20180504) which
    is reformatted to 'YYYY-MM-DD'.
    """
    loader = ItemLoader(item=SpiderItem(), response=response)
    contents = ''
    try:
        title = response.xpath(
            '//div[@class="bt font24_lan"]/text()').extract_first()
        # Extract the date from the url -- simpler and more reliable
        # than matching it out of the article body.
        match = re.search(r't(20[0-9]{6})', response.url)
        parsed = time.strptime(match.group(1), "%Y%m%d")
        formatted_date = time.strftime("%Y-%m-%d", parsed)
        # //text() on descendants gathers every text node under the
        # article paragraphs.
        for fragment in response.xpath(
                r'//div[@class="nr font14_lan"]/descendant-or-self::p//text()'
        ).extract():
            contents = contents + fragment
        loader.add_value('title', title)
        loader.add_value('date', formatted_date)
    except Exception as e:
        loader.add_value('date', '1970-01-01')
        loader.add_value('title', '')
    finally:
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        if contents == '':
            self.logger.warning(' url: %s msg: %s' %
                                (response.url, ' content is None'))
        loader.add_value('content', contents)
        return loader.load_item()
def parse_item(self, response):
    """Parse an article page; the date is regex-matched from the header."""
    l = ItemLoader(item=SpiderItem(), response=response)
    try:
        l.add_value('title',
                    ''.join(response.xpath('//h1/text()').extract()))
        stamp = response.xpath(
            '//div[@class="news_time_source"]/text()').re(
                '\d+-\d+-\d+\W\d+:\d+:\d+')
        l.add_value('date', ''.join(stamp))
        l.add_value('source', self.website)
        l.add_value('content', ''.join(response.xpath(
            '//div[@class="news_main"]/descendant-or-self::text()'
        ).extract()))
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title', '')
        l.add_value('date', '1970-01-01 00:00:00')
        l.add_value('source', '')
        l.add_value('content', '')
    finally:
        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
def get_news(self, response):
    """Parse an article page, probing the selectors of several layouts."""
    try:
        loader = ItemLoader(item=SpiderItem(), response=response)
        for xp in ('//div[@class="wh645 left"]/p[1]/text()',
                   '//p[@class="f22 lh30 yahei"]/a/text()',
                   '//p[@class="f22 lh40 fb"]/text()',
                   '//div[@class="top"]/p/text()'):
            loader.add_value('title', response.xpath(xp).extract())
        for xp in ('//p[@class="lh30 left f14 yahei"]/text()',
                   '//span[@id="pubtime_baidu"]/text()'):
            loader.add_value('date', response.xpath(xp).extract())
        for xp in ('//div[@class="TRS_Editor"]/p/text()',
                   '//div[@class="TRS_Editor"]/div/p/text()',
                   '//div[@class="sanji_left"]/p/text()',
                   '//div[@class="news-con"]/div/p/text()',
                   '//div[@class="news-con"]/div/div/div/font/text()',
                   '//div[@id="news-con"]/div/p/text()',
                   '//div[@id="news-con"]/div/font/font/p/text()'):
            loader.add_value('content', response.xpath(xp).extract())
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        return loader.load_item()
    except Exception as e:
        self.logger.error('error url: %s error msg: %s' % (response.url, e))
        loader = ItemLoader(item=SpiderItem(), response=response)
        loader.add_value('title', '')
        loader.add_value('date', '1970-01-01 00:00:00')
        loader.add_value('source', '')
        loader.add_value('content', '')
        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()