def parse(self, response):
    """Parse an article-list page ('ul.article-list') and yield one item per link.

    Each NoticespiderItem carries:
      url    -- absolute URL (self.host + relative href)
      urlmd5 -- hex md5 of the absolute URL, used for de-duplication
      title  -- anchor text extracted from the link markup
    """
    try:
        article_list = response.xpath('//ul[@class="article-list"]/li/a').extract()
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)" class="', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            item['title'] = re.findall(r'class="article-list-link">(.*?)</a>', anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse_item(self, response):
    """Parse a 'ul#list' listing page and yield one NoticespiderItem per link.

    Fields: url (self.host + href), urlmd5 (md5 hex of url), title (anchor text).
    """
    try:
        self.logger.debug(response.headers)
        article_list = response.xpath('//ul[@id="list"]/li/a').extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)"', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            # Do not delete any space in the regex -- the page markup contains
            # this exact whitespace around the title text.
            item['title'] = re.findall(r'</span>\n (.*?) </a>', anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse_item(self, response):
    """Parse a 'div#articles-list' page and yield one NoticespiderItem per article link.

    Fields: url (self.host + href), urlmd5 (md5 hex of url), title (inner <h2> text).
    """
    try:
        self.logger.debug(response.headers)
        article_list = response.xpath(
            '//div[@id="articles-list"]/article/div/a').extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)"', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            item['title'] = re.findall(r'<h2>(.*?)</h2>', anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse(self, response):
    """Parse a 'latnewslist' news page and yield one NoticespiderItem per entry.

    Fields: url (self.host + href from the title attribute anchor),
    urlmd5 (md5 hex of url), title (inner <h3> text).
    """
    try:
        article_list = response.xpath(
            '//div[@class="leftlatnews" and @id="lcontentnews"]/div[@class="latnewslist"]/div/a'
        ).extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)" title="', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            # The page uses CRLF line endings, so anchor the pattern on \r\n.
            item['title'] = re.findall(r'<h3>(.*?)</h3>\r\n', anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse(self, response):
    """Parse a 'news-list' card layout and yield one NoticespiderItem per link.

    Fields: url (self.host + href), urlmd5 (md5 hex of url),
    title (text of the 'link-1' anchor).
    """
    try:
        article_list = response.xpath(
            '//div[@class="news-list bg-color-white mb20"]/div/div/div/div/div[2]/div[1]/a'
        ).extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)" class="', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            item['title'] = re.findall(r'class="link-1">(.*?)</a>', anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse(self, response):
    """Parse a striped-table listing and yield one NoticespiderItem per row link.

    Fields: url (self.host + href of the 'pull-left' anchor),
    urlmd5 (md5 hex of url), title (tab-indented anchor text).
    """
    try:
        article_list = response.xpath(
            '//table[@class="table table-hover table-striped"]/tbody/tr/td/a'
        ).extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a class="pull-left" href="(.*?)"', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            # The markup indents the title with a fixed run of tabs; match it exactly.
            item['title'] = re.findall(
                r'">\r\n\t\t\t\t\t\t\t\t(.*?)\t\t\t\t\t\t\t\t</a>',
                anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse(self, response):
    """Parse a 'news-preview' grid and yield one NoticespiderItem per preview link.

    Fields: url (self.host + href of the 'news-preview-link' anchor),
    urlmd5 (md5 hex of url), title (inner preview-title <h2> text).
    """
    try:
        article_list = response.xpath(
            '//div[@class="container"]/div[@class="news-preview-wrap col-sm-6 col-md-4"]/a'
        ).extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a class="news-preview-link" href="(.*?)"', anchor, re.S)[0]
            item['url'] = self.host + url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            item['title'] = re.findall(
                r'<h2 class="post-title news-preview-content-title">(.*?)</h2>',
                anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)
def parse(self, response):
    # NOTE(review): this callback looks like work-in-progress / debugging code:
    # the bare `return` below exits before the extraction loop ever runs, and a
    # second `return` makes `yield item` unreachable. Left untouched pending
    # confirmation of intent.
    try:
        # Dump the full decoded page body for inspection (very verbose).
        body = response.body.decode(response.encoding)
        self.logger.info(body)
        # NOTE(review): missing .extract() — this is a SelectorList, not a list
        # of strings, so the re.findall calls below would fail if reached.
        article_list = response.xpath('//*[@id="notice"]/ul')
        #article_list = response.xpath('//ul[@class="page_notice_list_content"]/li/a').extract()
        self.logger.debug(article_list)
        # Early exit: everything below is currently dead code.
        return
        for i in range(0, len(article_list)):
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)">', article_list[i], re.S)[0]
            item['url'] = self.host + url
            item['urlmd5'] = md5(item['url']).hexdigest()
            item['title'] = re.findall(
                r'<h2 class="page_notice_title">(.*?)</h2>', article_list[i],
                re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s', item['url'],
                             item['urlmd5'], item['title'])
            # NOTE(review): this return makes the yield below unreachable — at
            # most it would stop after logging the first item.
            return
            yield item
    except Exception, e:
        self.logger.critical(e)
def parse(self, response):
    """Parse a 'cbp_tmtimeline' timeline page and yield one NoticespiderItem per entry.

    Fields: url (self.host + href with HTML '&amp;' unescaped to '&'),
    urlmd5 (md5 hex of url), title (anchor text before the tab-indented close tag).
    """
    try:
        article_list = response.xpath(
            '//ul[@class="cbp_tmtimeline"]/li/div[@class="cbp_tmlabel"]/article/header/h3/a'
        ).extract()
        self.logger.debug(article_list)
        for anchor in article_list:
            item = NoticespiderItem()
            url = re.findall(r'<a href="(.*?)" target=', anchor, re.S)[0]
            # The extracted href is HTML-escaped; dropping 'amp;' turns
            # '&amp;' back into '&' so the query string is usable.
            rep_url = url.replace('amp;', '')
            item['url'] = self.host + rep_url
            self.logger.info(item['url'])
            # md5() needs bytes on Python 3; encoding is a no-op for ASCII URLs on Python 2.
            item['urlmd5'] = md5(item['url'].encode('utf-8')).hexdigest()
            # The markup closes the anchor after a fixed run of tabs; match it exactly.
            item['title'] = re.findall(
                r'_blank">(.*?)\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t</a>',
                anchor, re.S)[0]
            self.logger.info('ITEM--> url:%s, urlmd5:%s, title:%s',
                             item['url'], item['urlmd5'], item['title'])
            yield item
    except Exception as e:
        # Top-level boundary for this callback: log and swallow so the crawl continues.
        self.logger.critical(e)