def get_newsinfo(self, url):
    """Fetch one news detail page and build a News model.

    :param url: detail-page URL
    :return: News instance on success, 'timeout' on request failure,
             'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if html.status_code != 200:
        log_line('请求状态不是200')
        return 'error'

    response = etree.HTML(html.text)
    # Bug fix: the original called self.parse_item(response) twice and
    # discarded the first result; parse once and use the values.
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'bjjrj'
    return news
def parser_data(self, url):
    """Fetch a news detail page and extract its date and content.

    :param url: detail-page URL
    :return: (date, content) on success; ('timeout', 'timeout') on request
             failure, ('error', 'error') on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        resp = requests.get(url, headers=self.get_news_header(), timeout=3)
        resp.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        self.__class__.retry = 1
        return 'timeout', 'timeout'

    if resp.status_code != 200:
        return 'error', 'error'

    tree = etree.HTML(resp.text)
    text_nodes = tree.xpath(
        '//div[@class="ldContent"]/descendant-or-self::*/text()')
    content = ''.join(text_nodes).strip()
    raw_date = tree.xpath('//div[@class="ldDate"]/text()')[0]
    # The page renders the date as "label:value"; keep only the value part.
    date = raw_date.split(':')[1]
    # log('内容', content)
    return date, content
def get_newsinfo(self, urls):
    """Visit every news detail URL and persist the parsed item to MongoDB.

    :param urls: iterable of news detail-page URLs
    :return: None (items are stored via MogoMgr as a side effect)
    """
    for url in urls:
        t_sleep()
        log('当前访问的URL', url)
        try:
            resp = requests.get(url, timeout=3)
            resp.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            continue

        # Skip pages that did not load cleanly.
        if resp.status_code != 200:
            continue

        tree = etree.HTML(resp.text)
        item = self.parse_item(tree, resp.url)
        MogoMgr().insert(item)
def get_newsinfo(self, url, parser_item_fuc):
    """Fetch one news detail page and build a News model.

    :param url: detail-page URL
    :param parser_item_fuc: callable(response) -> (title, date, content)
    :return: News instance on success, 'error' on request/status failure
    """
    t_sleep()
    log('当前访问的URL', url)
    html = self.get_html(url)
    if html == 'timeout':
        return 'error'

    log('当前访问的URL', url, html.status_code)
    # Bug fix: validate the status code BEFORE parsing the body; the
    # original built the etree first, doing wasted work on error pages.
    if html.status_code != 200:
        log('访问的URL出错!!!', url)
        return 'error'

    response = etree.HTML(html.text)
    title, date, content = parser_item_fuc(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'pbc'
    return news
def get_iteminfo(self, url):
    """Fetch one news detail page and build a News model.

    :param url: detail-page URL
    :return: News instance on success, 'timeout' on request failure,
             'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        resp = requests.get(url, headers=self.get_news_header(), timeout=3)
        resp.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        return 'timeout'

    if resp.status_code != 200:
        log('访问的URL出错!!!', url)
        return 'error'

    tree = etree.HTML(resp.text)
    title, date, content = self.parse_item(tree)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'xinhua'
    return news
def get_content(self, url):
    """Fetch one news detail page and return its text content.

    :param url: detail-page URL
    :return: joined text of the "union" div on success, 'timeout' on
             request failure, 'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        resp = requests.get(url, headers=self.get_news_header(), timeout=3)
        resp.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if resp.status_code != 200:
        log('访问的URL出错!!!', url)
        return 'error'

    tree = etree.HTML(resp.text)
    pieces = tree.xpath('//div[@class="union"]/descendant-or-self::*/text()')
    return ''.join(pieces).strip()
def get_content(self, url):
    """Fetch one news detail page and return the parsed item.

    :param url: detail-page URL
    :return: self.parse_item(...) result on success, 'timeout' on request
             failure, 'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        resp = requests.get(url, headers=self.get_news_header(), timeout=3)
        resp.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if resp.status_code != 200:
        return 'error'

    tree = etree.HTML(resp.text)
    return self.parse_item(tree)
def get_newsinfo(self, url):
    """Fetch one (GBK-encoded) news detail page and build a News model.

    :param url: detail-page URL
    :return: News instance on success, 'timeout' on request failure or a
             non-200 response
    """
    t_sleep()
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'gbk'
    except Exception as e:
        log_line('访问出错')
        print(e)
        return 'timeout'

    # Robustness fix: every sibling method validates the status code, but
    # this one parsed the body unconditionally. Reuse the 'timeout'
    # sentinel callers already handle rather than introducing a new one.
    if html.status_code != 200:
        log_line('请求状态不是200')
        return 'timeout'

    response = etree.HTML(html.text)
    log('当前访问的URL', url)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    return news
def get_html(self, url):
    """Fetch a list page and hand each '//ul[@class="list"]/li' node to
    self.parser_item.

    NOTE(review): the try/except + status-code guard used by the sibling
    methods was commented out here, so a network error propagates to the
    caller. That behavior is preserved as-is — confirm it is intended.

    :param url: list-page URL
    :return: None (items are processed for side effects only)
    """
    t_sleep()
    resp = requests.get(url, headers=self.get_news_header(), timeout=3)
    resp.encoding = 'utf-8'
    # Fix: the original reused the name `html` for both the HTTP response
    # and the parsed etree (shadowing); the dead commented-out error
    # handling block was removed.
    tree = etree.HTML(resp.text)
    items = tree.xpath('//ul[@class="list"]/li')
    for item in items:
        self.parser_item(item)