def __init__(self):
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    self.start_urls = [
        'http://www.amac.org.cn/xydt/xyxx/',
    ]
def get_newsinfo(self, urls):
    '''
    Visit each news detail page.
    :param urls: collection of news links
    :return: news model
    '''
    for url in urls:
        t_sleep()
        log('当前访问的URL', url)
        try:
            html = requests.get(url, timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            continue

        if html.status_code != 200:
            continue

        response = etree.HTML(html.text)
        item = self.parse_item(response, html.url)
        MogoMgr().insert(item)
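# The snippet above and the spider classes below all lean on shared helpers (BaseSpider with its
# retry/re_send machinery, MogoMgr for MongoDB access, t_sleep, log, log_line, randomUserAgent,
# get_today and the News model) that are defined elsewhere in the project and not shown in this
# section. The following is only a minimal, hypothetical sketch of what those interfaces could
# look like, written so the call sites here make sense; the real implementations may differ.
import datetime
import random
import time


def t_sleep():
    # Sleep a short random interval between requests to stay polite.
    time.sleep(random.uniform(1, 3))


def log(*args):
    print(*args)


def log_line(msg):
    print('-' * 20, msg, '-' * 20)


def randomUserAgent():
    # Pick a User-Agent at random; the real helper presumably keeps a longer list.
    agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6)',
    ]
    return random.choice(agents)


def get_today():
    today = datetime.date.today()
    return today.year, today.month, today.day


class News:
    def __init__(self, title=None, date=None, content=None, url=None):
        self.title = title
        self.date = date
        self.content = content
        self.url = url
        self.spider_name = None


class BaseSpider:
    # Class-level flag the spiders set via self.__class__.retry = 1 when a request fails.
    retry = -1

    def re_send(self):
        # Assumed behaviour: if any request failed during the run, start one more pass.
        if self.__class__.retry == 1:
            self.__class__.retry = -1
            self.run()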
class StcnSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() # def get_date(self): # year, month, day = get_today() # date = str(year) + '-' + str(month) + '-' + str(day) # return date def get_host(self, url): host = url.split('/')[2] return host def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { # 'Host': 'epaper.zqrb.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://www.stcn.com/', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'gbk' # log(html.text) pattern = r"http://[a-z]+\.stcn.com/\d+/\d+/\d+.shtml" urls = re.findall(pattern, html.text) # new_urls = [] # for ur in urls: # log(ur) # new_urls.append(self.parser_url(ur)) # log('数量', len(urls)) return urls def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'error' or news == 'timeout': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' response = etree.HTML(html.text) if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'stcn' return news def parse_item(self, response): try: title = response.xpath('//div[@class="intal_tit"]/h2/text()') title = ''.join(title).strip() except Exception as e: title = '未知' try: date = response.xpath('//div[@class="info"]/text()')[0].split()[0] except Exception as e: date = '未知' try: con_list = response.xpath( '//div[@id="ctrlfscont"]/descendant-or-self::*/text()') except Exception as e: con_list = ['未知'] content = ''.join(con_list).strip() # log('content', content) return title, date, content def run(self): log_line('StcnSpider 启动!!!') url = 'http://www.stcn.com/' urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class HeXunSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_host(self, url): host = url.split('/')[2] return host def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { # 'Host': '', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://www.hexun.com/', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'gbk' pattern = r"http://[a-z]{4,10}.hexun.com/\d+-\d+-\d+/\d+.html" urls = re.findall(pattern, html.text) # log(html.text) # for ur in urls: # log(type(ur), ur) # # # log('数量', len(urls)) return urls def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'error' or news == 'timeout': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'gbk' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: log('访问的url 状态不是200', url) return 'error' response = etree.HTML(html.text) self.parse_item(response) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'hexun' return news def parse_item(self, response): try: title = response.xpath('//div[@class="layout mg articleName"]/h1/text()')[0].strip() except Exception as e: title = '未知' try: date = response.xpath('//span[@class="pr20"]/text()')[0].split()[0] except Exception as e: date = '未知' try: con_list = response.xpath('//div[@class="art_contextBox"]/descendant-or-self::*/text()') except Exception as e: con_list = '未知' content = ''.join(con_list).strip() # log('content', content) return title, date, content def run(self): log_line('HeXunSpider 启动!!!') start_urls = [ 'http://www.hexun.com/', ] for url in start_urls: urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class CircSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_urls = [
            'http://www.circ.gov.cn/web/site0/tab5176/',
            'http://www.circ.gov.cn/web/site0/tab7924/',
            'http://www.circ.gov.cn/web/site0/tab5207/',
        ]

    def get_news_header(self):
        '''
        The headers for requesting the news list differ from those for news detail pages.
        :return:
        '''
        return {
            'Host': 'www.circ.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//td[@class="hui14"]')
        for item in items:
            self.parser_item(item)

    def parser_item(self, item):
        url = item.xpath('./a/@href')[0]
        if 'search' in url:
            return
        date = item.getnext().xpath('./text()')[0][1:-1]

        news = News()
        news.spider_name = 'circ'
        news.url = self.parser_url(url, 'http://www.circ.gov.cn')
        news.title = item.xpath('./a/text()')[0]
        news.date = date
        self.newslist.append(news)

    def parser_url(self, url, base_url):
        return base_url + url

    def get_newsUrls(self):
        return [news.url for news in self.newslist]

    def send_request(self, urls):
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            content = self.get_content(url)
            if content == 'timeout' or content == 'error':
                continue
            self.update_content(url, content)

    def get_content(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)
        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)
        con_list = response.xpath(
            '//span[@id="zoom"]/descendant-or-self::*/text()')
        return ''.join(con_list).strip().replace('\r\n', '')

    def update_content(self, url, content):
        for news in self.newslist:
            if news.url == url:
                news.content = content

    def run(self):
        log_line('CircSpider 启动!!!')
        for url in self.start_urls:
            self.get_html(url)
        self.send_request(self.get_newsUrls())

        for news in self.newslist:
            find_one = self.mgr.find_one('url', news.url)
            if find_one is not None:
                log_line('该URL已经存在 无需写入')
                log(news.url)
                continue
            self.mgr.insert(news)
        self.__class__().re_send()
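# CircSpider.parser_url, like most spiders in this file, builds absolute links by plain string
# concatenation against a hard-coded base. A minimal alternative sketch using the standard
# library's urljoin, which also copes with hrefs that are already absolute, is shown below for
# comparison; it is not what the project currently does.
from urllib.parse import urljoin


def join_url(base_url, href):
    # urljoin('http://www.circ.gov.cn', '/web/site0/tab5176/info123.htm')
    #   -> 'http://www.circ.gov.cn/web/site0/tab5176/info123.htm'
    # urljoin('http://www.circ.gov.cn', 'http://other.site/x.htm') -> 'http://other.site/x.htm'
    return urljoin(base_url, href)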
class ShangHaiSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'www.shanghai.gov.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'utf-8' html = etree.HTML(html.text) urls = html.xpath('//ul[@class="uli14 pageList"]/li/a/@href') return self.parser_url(urls) def parser_url(self, urls): base_url = 'http://www.shanghai.gov.cn' new_urls = [] for url in urls: if str(url).endswith('.pdf'): continue url = base_url + url new_urls.append(url) return new_urls def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'timeout' or news == 'error': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'gbk' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' response = etree.HTML(html.text) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'shanghai' return news def parse_item(self, response): title = response.xpath('//div[@id="ivs_title"]/text()')[0].strip() date = response.xpath('//div[@id="ivs_date"]/text()')[0][1:-1].strip() date = arrow.get(date).format('YYYY-MM-DD') con_list = response.xpath( '//div[@id="ivs_content"]/descendant-or-self::*/text()') content = ''.join(con_list).strip() return title, date, content def run(self): log_line('ShangHaiSpider 启动!!!') url = 'http://www.shanghai.gov.cn/nw2/nw2314/nw2319/nw41893/index.html' urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class CnstockSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { # 'Host': '', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://www.cnstock.com/', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'utf-8' # log(html.text) pattern_1 = r"http://[a-z]+.cnstock.com/[a-z]+,[a-z]+-\d+-\d+.htm" pattern_2 = r"http://[a-z]+.cnstock.com/[a-z]+/[a-z]+_[a-z]+/\d+/\d+.htm" pattern_3 = r"http://[a-z]+.cnstock.com/[a-z]+/[a-z]+_[a-z]+/[a-z]+_[a-z]+/\d+/\d+.htm" # pattern = r'http://[a-z]+.cnstock.com/.*?/\d+.htm' # pattern = r'<a href=".*?\d+.htm"' # pattern = r'"http://.*?/\d+.htm"' # pattern = r'"http://(\.|[a-z]|/|,|-)+\d+.htm"' # pattern = r'"http://(\.|[a-z]|/|,|-)*\d+.htm"' pattern = '|'.join([pattern_1, pattern_2, pattern_3]) urls = re.findall(pattern, html.text) # for ur in urls: # log(ur) # log('数量', len(urls)) return set(urls) def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'error' or news == 'timeout': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'gbk' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' # log(html.text) response = etree.HTML(html.text) if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' # self.parse_item(response) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'cnstock' return news def parse_item(self, response): try: title = response.xpath('//h1[@class="title"]/text()') title = ''.join(title).strip() except Exception as e: title = '未知' try: date = response.xpath('//span[@class="timer"]/text()')[0].split()[0] except Exception as e: date = '未知' try: con_list = response.xpath('//div[@id="qmt_content_div"]/descendant-or-self::*/text()') except Exception as e: con_list = ['未知'] content = ''.join(con_list).strip() # log('content', content) return title, date, content def run(self): log_line('CnstockSpider 启动!!!') url = 'http://www.cnstock.com/' urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class ZqrbSpider(BaseSpider): def __init__(self): self.headers = {} self.date = self.get_date() self.mgr = MogoMgr() # self.retry = -1 # self.retry_flag = -1 self.failurls = [] def get_date(self): year, month, day = get_today() date = str(year) + '-' + str(month) + '-' + str(day) return date def get_host(self, url): host = url.split('/')[2] return host def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'epaper.zqrb.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://epaper.zqrb.cn/', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' html = etree.HTML(html.text) urls = html.xpath('//a[@class="vote_content12px"]/@href') new_urls = [] for ur in urls: # log(self.parser_url(ur)) new_urls.append(self.parser_url(ur)) # log('数量', len(urls)) return new_urls def parser_url(self, url): return self.get_base_url() + url def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'error': log('访问的新闻不存在 继续访问下一个URL') continue if news == 'timeout': log('访问的新闻超时 暂时跳过') continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) header = self.get_news_header() try: html = requests.get(url, headers=header, timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') self.__class__.retry = 1 print(e) return 'timeout' response = etree.HTML(html.text) if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' # self.parse_item(response) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'zqrb' return news def parse_item(self, response): try: title = response.xpath('//td[@class="h1"]/text()') title = ''.join(title).strip() except Exception as e: title = '未知' date = self.date try: con_list = response.xpath( '//div[@id="ozoom"]/descendant-or-self::*/text()') except Exception as e: con_list = '未知' content = ''.join(con_list).strip() # log('content', content) return title, date, content def get_base_url(self): year, month, day = get_today() year = str(year) month = str(month) if month >= 10 else '0' + str(month) day = str(day) if day >= 10 else '0' + str(day) return 'http://epaper.zqrb.cn/html/{0}-{1}/{2}/'.format( year, month, day) def get_start_url(self): year, month, day = get_today() year = str(year) month = str(month) if month >= 10 else '0' + str(month) day = str(day) if day >= 10 else '0' + str(day) return 'http://epaper.zqrb.cn/html/{0}-{1}/{2}/node_2.htm'.format( year, month, day) def run(self): log_line('ZqrbSpider 启动!!!') url = self.get_start_url() urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
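# ZqrbSpider.get_base_url and get_start_url zero-pad the month and day by hand before building
# the e-paper URL. Assuming get_today() returns integers, the same URL can be produced with a
# format spec; a small sketch (not the project's code) for comparison:
def zqrb_node_url(year, month, day):
    # e.g. zqrb_node_url(2019, 3, 7) -> 'http://epaper.zqrb.cn/html/2019-03/07/node_2.htm'
    return 'http://epaper.zqrb.cn/html/{0}-{1:02d}/{2:02d}/node_2.htm'.format(year, month, day)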
class BjjrjSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'www.bjjrj.gov.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'utf-8' html = etree.HTML(html.text) urls = html.xpath('//div[@class="erjiUL-word"]/a/@href') return self.parser_url(urls, url) def parser_url(self, urls, originalurl): base_url = originalurl.rsplit('/', 1)[0] new_urls = [] for url in urls: url = base_url + '/' + url new_urls.append(url) return new_urls def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'timeout' or news == 'error': log_line('timeout error') continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: log_line('请求状态不是200') return 'error' response = etree.HTML(html.text) self.parse_item(response) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'bjjrj' return news def parse_item(self, response): title = response.xpath('//div[@class="article"]/h1/text()')[0].strip() date = response.xpath( '//h5[@class="articleTitleSub"]/text()')[-1].split(':')[1] con_list = response.xpath( '//div[@id="zoom"]/descendant-or-self::*/text()') content = ''.join(con_list).strip() return title, date, content def run(self): log_line('BjjrjSpider 启动!!!') urls = [] url = 'http://www.bjjrj.gov.cn/zcfg/c19-list-1.html' urls_1 = self.get_html(url) url = 'http://www.bjjrj.gov.cn/zyzc/c138-list-1.html' urls_2 = self.get_html(url) urls.extend(urls_1) urls.extend(urls_2) news_list = self.send_request(urls) log_line(len(news_list)) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class PbcSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.host_url = 'http://www.pbc.gov.cn'

    def get_news_header(self):
        return {
            # 'Host': '',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
            'Referer': 'http://www.cnstock.com/',
        }

    def get_html(self, dest_url):
        '''
        Decode the PBC JavaScript challenge, then request the page again to get the real HTML.
        :param dest_url: the PBC URL to visit
        :return: HTML source, as a requests response
        '''
        r = requests.session()
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
        # dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
        # dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'

        # Use a session to keep cookies: the first request sets cookies such as
        # {'wzwsconfirm': 'ab3039756ba3ee041f7e68f634d28882', 'wzwsvtime': '1488938461'};
        # only combined with the cookies computed from the JS do we pass the check.
        content = r.get(dest_url).content

        # Extract the inline script from the page
        re_script = re.search(r'<script type="text/javascript">(?P<script>.*)</script>',
                              content.decode('utf-8'), flags=re.DOTALL)
        # With re.DOTALL '.' matches any character; (?P<name>...) captures by name:
        # https://docs.python.org/3/howto/regex.html#regex-howto
        # cheatsheet: https://github.com/tartley/python-regex-cheatsheet/blob/master/cheatsheet.rst
        script = re_script.group('script')
        script = script.replace('\r\n', '')  # stripping \r\n first gives a cleaner beautify result
        res = jsbeautifier.beautify(script)  # beautify (and partly normalize) the JS: https://github.com/beautify-web/js-beautify
        with open('x.js', 'w') as f:
            f.write(res)  # dump to a file for inspection

        jscode_list = res.split('function')
        var_ = jscode_list[0]
        var_list = var_.split('\n')
        template_js = var_list[3]  # taken by position; a regex would also work
        template_py = js2py.eval_js(template_js)

        # Inject the global variables into the first function as locals, then evaluate it
        function1_js = 'function' + jscode_list[1]
        position = function1_js.index('{') + 1
        function1_js = function1_js[:position] + var_ + function1_js[position:]
        function1_py = js2py.eval_js(function1_js)
        cookie1 = function1_py(str(template_py))  # result looks like 'NA=='

        # Save the first computed cookie
        cookies = {}
        cookies['wzwstemplate'] = cookie1

        # Do the same for the third function
        function3_js = 'function' + jscode_list[3]
        position = function3_js.index('{') + 1
        function3_js = function3_js[:position] + var_ + function3_js[position:]
        function3_py = js2py.eval_js(function3_js)
        middle_var = function3_py()  # a str such as 'WZWS_CONFIRM_PREFIX_LABEL4132209'
        cookie2 = function1_py(middle_var)
        cookies['wzwschallenge'] = cookie2
        # On document.cookie in the JS, see
        # https://developer.mozilla.org/zh-CN/docs/Web/API/Document/cookie

        dynamicurl = js2py.eval_js(var_list[0])

        # Requesting the dynamic URL with the new cookies returns the real content page
        r.cookies.update(cookies)
        # content = r.get(self.host_url + dynamicurl).content.decode('utf-8')
        try:
            content = r.get(self.host_url + dynamicurl, timeout=3)
            content.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            return 'timeout'

        return content

    def send_request(self, urls, parser_item_fuc):
        '''
        Request each concrete news link.
        :param urls: news detail URLs
        :param parser_item_fuc: function used to parse each news detail page
        :return: list of parsed News objects
        '''
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url, parser_item_fuc)
            if news == 'error' or news == 'timeout':
                continue
            news_list.append(news)
        return news_list

    def get_newsinfo(self, url, parser_item_fuc):
        '''
        Request a single news detail page.
        '''
        t_sleep()
        log('当前访问的URL', url)
        html = self.get_html(url)
        if html == 'timeout':
            return 'error'

        response = etree.HTML(html.text)
        log('当前访问的URL', url, html.status_code)
        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        title, date, content = parser_item_fuc(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'pbc'
        return news

    def parser_gonggao_list(self, content):
        '''
        Parse the announcement list page.
        :param content: HTML of the announcement list page
        :return: detail-page URLs of the announcements
        '''
        html = etree.HTML(content.text)
        doms = html.xpath('//font[@class="newslist_style"]')
        urls = []
        for e in doms:
            url = self.host_url + e.xpath('./a/@href')[0].strip()
            urls.append(url)
        return urls

    def parser_falvfagui(self, content):
        html = etree.HTML(content.text)
        doms = html.xpath('//td[@class="font14 bgdian"]')
        urls = []
        for e in doms:
            url = self.host_url + e.xpath('./a/@href')[0].strip()
            urls.append(url)
        return urls

    def parser_huobi(self, content):
        html = etree.HTML(content.text)
        doms = html.xpath('//a[@class="hei12jj"]')
        urls = []
        for e in doms:
            url = self.host_url + e.xpath('./@href')[0].strip()
            urls.append(url)
        return urls

    def parser_xindai(self, content):
        html = etree.HTML(content.text)
        doms = html.xpath('//a[@class="hei12jj"]')
        urls = []
        for e in doms:
            url = self.host_url + e.xpath('./@href')[0].strip()
            urls.append(url)
        return urls

    def parse_gonggao_item(self, response):
        '''
        Parse an announcement detail page.
        :param response:
        :return:
        '''
        try:
            title = response.xpath('//h2[@style="FONT-SIZE: 16px"]/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'
        try:
            date = response.xpath('//td[@class="hui12"][@align="right"]/text()')[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath('//font[@id="zoom"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = ['未知']
        content = ''.join(con_list).strip()
        return title, date, content

    def parser_common_item(self, response):
        '''
        Parse laws-and-regulations and credit-policy detail pages.
        '''
        try:
            title = response.xpath('//h2[@style="font-size: 16px;color: #333;"]/text()')
            title = ''.join(title).strip()
        except Exception as e:
            title = '未知'
        try:
            date = response.xpath('//span[@id="shijian"]/text()')[0].split()[0]
        except Exception as e:
            date = '未知'
        try:
            con_list = response.xpath('//div[@id="zoom"]/descendant-or-self::*/text()')
        except Exception as e:
            con_list = ['未知']
        content = ''.join(con_list).strip()
        return title, date, content

    def send(self, dest_url, get_news_list, parser_news):
        '''
        Fetch a PBC section page, parse it, then request each news detail page.
        :param dest_url: target URL
        :param get_news_list: extracts the news URLs from the target page
        :param parser_news: parses the title, date, etc. of a news detail page
        :return: None
        '''
        content = self.get_html(dest_url)
        urls = get_news_list(content)
        news_list = self.send_request(urls, parser_news)
        for news in news_list:
            self.mgr.insert(news)

    def run(self):
        log_line('PbcSpider 启动!!!')
        # Announcements
        dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
        self.send(dest_url, self.parser_gonggao_list, self.parse_gonggao_item)

        # Laws and regulations
        dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
        self.send(dest_url, self.parser_falvfagui, self.parser_common_item)

        # Monetary policy - not finished yet
        # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
        # self.send(dest_url, self.parser_xindai, self.parser_common_item)

        # Credit policy
        dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'
        self.send(dest_url, self.parser_xindai, self.parser_common_item)

        self.__class__().re_send()
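# PbcSpider.get_html passes fragments of the challenge script to js2py.eval_js and gets back
# Python callables. A tiny standalone illustration of that js2py behaviour (unrelated to the
# actual WZWS challenge functions, which are generated per request):
import js2py

add = js2py.eval_js('function add(a, b) { return a + b; }')
print(add(1, 2))   # 3
label = js2py.eval_js('"WZWS_" + "CONFIRM"')
print(label)       # WZWS_CONFIRM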
class GzjrjSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_news_header(self):
        '''
        The headers for requesting the news list differ from those for news detail pages.
        :return:
        '''
        return {
            'Host': 'www.gzjr.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        '''
        :param url:
        :return:
        '''
        html = requests.get(url)
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="mainContent"]/ul/li/a/@href')
        log('提取的URL', urls)
        return self.parser_url(urls)

    def parser_url(self, urls):
        base_url = 'http://www.gzjr.gov.cn/'
        new_urls = []
        for url in urls:
            if str(url).endswith('.pdf'):
                continue
            url = base_url + url.split('../../')[1]
            log('拼接后的url', url)
            new_urls.append(url)
        return new_urls

    def send_request(self, urls):
        news_list = []
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_newsinfo(url)
            if news == 'timeout':
                continue
            news_list.append(news)
        return news_list

    def get_newsinfo(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'gbk'
        except Exception as e:
            log_line('访问出错')
            print(e)
            return 'timeout'

        response = etree.HTML(html.text)
        log('当前访问的URL', url)
        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        return news

    def parse_item(self, response):
        title = response.xpath('//div[@id="ivs_title"]/text()')[0].strip()
        date = response.xpath('//div[@id="ivs_date"]/text()')[0][1:-1].strip()
        date = arrow.get(date).format('YYYY-MM-DD')
        con_list = response.xpath('//div[@id="ivs_content"]/descendant-or-self::*/text()')
        content = ''.join(con_list).strip()
        return title, date, content

    def run(self):
        log_line('GzjrjSpider 启动!!!')
        pass
class MoHurdSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() self.newslist = [] self.start_urls = [ 'http://www.mohurd.gov.cn/zcjd/index.html', 'http://www.mohurd.gov.cn/fdcy/fdcyzcfb/index.html', 'http://www.mohurd.gov.cn/fdcy/fdcyxydt/index.html', 'http://www.mohurd.gov.cn/fdcy/fdcydfxx/index.html', ] def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'www.mohurd.gov.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url, headers=self.get_news_header()) html.encoding = 'utf-8' html = etree.HTML(html.text) items = html.xpath('//a[@style="color:#000;;font-size:12px;"]') # log_line(len(items)) for item in items: self.parser_item(item) def parser_item(self, item): news = News() news.spider_name = 'mohurd' news.url = item.xpath('./@href')[0] news.title = item.xpath('./text()')[0] news.date = item.getparent().getnext().xpath( './text()')[0][1:-1].replace('.', '-').strip() self.newslist.append(news) def get_newsUrls(self): return [news.url for news in self.newslist] def send_request(self, urls): for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue content = self.get_content(url) if content == 'error' or content == 'timeout': continue self.update_content(url, content) def get_content(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' response = etree.HTML(html.text) con_list = response.xpath( '//div[@class="union"]/descendant-or-self::*/text()') return ''.join(con_list).strip() def update_content(self, url, content): for news in self.newslist: if news.url == url: news.content = content def run(self): log_line('MoHurdSpider 启动!!!') for url in self.start_urls: self.get_html(url) self.send_request(self.get_newsUrls()) for news in self.newslist: find_one = self.mgr.find_one('url', news.url) if find_one is not None: log_line('该URL已经存在 无需写入') log(news.url) continue self.mgr.insert(news) self.__class__().re_send()
class CsrcSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() self.newslist = [] self.start_urls = [ 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3302/index_7401.htm', 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3311/index_7401.htm', ] def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'www.csrc.gov.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url, headers=self.get_news_header()) html.encoding = 'utf-8' html = etree.HTML(html.text) items = html.xpath('//div[@class="row"]') # log_line(len(items)) for item in items: self.parser_item(item) def parser_item(self, item): url = item.xpath('./li[@class="mc"]/div/a/@href')[0] date = item.xpath('./li[@class="fbrq"]/text()')[0] news = News() news.spider_name = 'csrc' news.url = self.parser_url(url, 'http://www.csrc.gov.cn/pub/zjhpublic') news.title = item.xpath('./li[@class="mc"]/div/a/text()')[0] news.date = arrow.get(date).format('YYYY-MM-DD') # log(news.url, news.title, news.date) self.newslist.append(news) def parser_url(self, url, base_url): return base_url + url.split('../..')[1] def get_newsUrls(self): return [news.url for news in self.newslist] def send_request(self, urls): for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue content = self.get_content(url) if content == 'error' or content == 'timeout': continue self.update_content(url, content) def get_content(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' response = etree.HTML(html.text) con_list = response.xpath( '//div[@id="ContentRegion"]/descendant-or-self::*/text()') return ''.join(con_list).strip().replace('\r\n', '') def update_content(self, url, content): for news in self.newslist: if news.url == url: news.content = content def run(self): log_line('CsrcSpider 启动!!!') for url in self.start_urls: self.get_html(url) self.send_request(self.get_newsUrls()) for news in self.newslist: find_one = self.mgr.find_one('url', news.url) if find_one is not None: log_line('该URL已经存在 无需写入') log(news.url) continue self.mgr.insert(news) self.__class__().re_send()
class XinHuaSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()

    def get_newlist_header(self):
        '''
        The headers for requesting the news list differ from those for news detail pages.
        :return:
        '''
        return {
            'Host': 'qc.wa.news.cn',
            'Referer': 'http://www.news.cn/fortune/',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_news_header(self):
        '''
        The headers for requesting the news list differ from those for news detail pages.
        :return:
        '''
        return {
            'Host': 'www.xinhuanet.com',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_caijing_header(self):
        return {
            'Host': 'www.news.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_money(self):
        '''
        Finance section.
        :return:
        '''
        url = 'http://www.xinhuanet.com/money/index.htm'
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)

        urls_all = []
        urls_1 = html.xpath('//li[@class="clearfix"]/h3/a/@href')  # only handle the news list
        urls_2 = html.xpath('//li[@class="imp"]/a/@href')
        urls_3 = html.xpath('//div[@class="swiper-slide"]/a/@href')
        urls_all.extend(urls_1)
        urls_all.extend(urls_2)
        urls_all.extend(urls_3)

        news_list = []
        for url in urls_all:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            news = self.get_iteminfo(url)
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list

    def get_lunbo(self):
        '''
        Fortune section carousel.
        :return:
        '''
        url = 'http://www.news.cn/fortune/'
        html = requests.get(url, headers=self.get_caijing_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        urls = html.xpath('//div[@class="swiper-slide"]/a/@href')
        year = arrow.now().date().year

        news_list = []
        for url in urls:
            if str(year) in url:
                log('需要访问的URL 轮播图', url)
                find_one = self.mgr.find_one('url', url)
                if find_one is not None:
                    log_line('该URL已经存在 无需请求')
                    log(url)
                    continue
                news = self.get_iteminfo(url)
                if news == 'timeout' or news == 'error':
                    continue
                news_list.append(news)
        return news_list

    def get_itemlist(self, page='1'):
        '''
        Fetch all news details from the Xinhua fortune list API.
        :return: list of news models
        '''
        # Xinhua fortune - news list
        url = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={0}&cnt=16&tp=1&orderby=1'.format(page)
        html = requests.get(url, headers=self.get_newlist_header())
        items = json.loads(html.text[1:-1])
        items = items['data']['list']

        news_list = []
        for item in items:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', item['LinkUrl'])
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(item['LinkUrl'])
                continue
            news = self.get_iteminfo(item['LinkUrl'])
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
        return news_list

    def get_iteminfo(self, url):
        '''
        Visit a single news detail page.
        :param url: news link
        :return: news model
        '''
        t_sleep()
        log('当前访问的URL', url)
        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            return 'timeout'

        if html.status_code != 200:
            log('访问的URL出错!!!', url)
            return 'error'

        response = etree.HTML(html.text)
        title, date, content = self.parse_item(response)
        news = News(title=title, date=date, content=content, url=url)
        news.spider_name = 'xinhua'
        return news

    def parse_item(self, response):
        try:
            con_list = response.xpath('//div[@id="p-detail"]/p')
            content = self.pasre_content(con_list)
            title = response.xpath('//div[@class="h-title"]/text()')[0].strip()
            date = response.xpath('//span[@class="h-time"]/text()')[0].split()[0]
        except Exception as e:
            title = '页面不存在'
            date = '页面不存在'
            content = '页面不存在'
        return title, date, content

    def pasre_content(self, con_list):
        '''
        Parse the article body.
        :param con_list:
        :return:
        '''
        content = ''
        for con in con_list:
            c = con.xpath('./text()')
            if len(c) != 0:
                content = content + c[0].replace(' ', '')
        return content

    def run(self):
        log_line('XinHuaSpider 启动!!!')
        news_list = []
        # Crawl the first two pages of the fortune list
        news_list_1 = self.get_itemlist(page='1')
        news_list_2 = self.get_itemlist(page='2')
        news_list_3 = self.get_lunbo()
        news_list_4 = self.get_money()
        news_list.extend(news_list_1)
        news_list.extend(news_list_2)
        news_list.extend(news_list_3)
        news_list.extend(news_list_4)

        for news in news_list:
            self.mgr.insert(news)
        self.__class__().re_send()
class FangChanSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'www.fangchan.com', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://www.fangchan.com/policy/28/', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'utf-8' html = etree.HTML(html.text) urls = html.xpath('//ul[@class="related-news-list"]/li/a/@href') return urls def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'error' or news == 'timeout': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' response = etree.HTML(html.text) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'fangchan' return news def parse_item(self, response): try: title = response.xpath( '//div[@class="section top"]/h1/text()')[0].strip() except Exception as e: title = response.xpath('//h1[@class="clearfix"]/text()')[0].strip() try: date = response.xpath( '/html/body/div[1]/div[2]/div[1]/p/span[2]/text()')[0].split( )[0] except Exception as e: try: date = response.xpath( '/html/body/div/div[2]/div/div[2]/ul/li[2]/span/text()' )[0].split()[0] except Exception as e: date = '未知' con_list = response.xpath( '//div[@class="summary-text"]/descendant-or-self::*/text()') if len(con_list) == 0: con_list = response.xpath( '//div[@class="summary_text"]/descendant-or-self::*/text()') content = ''.join(con_list).strip() return title, date, content def run(self): log_line('FangChanSpider 启动!!!') start_urls = [ 'http://www.fangchan.com/policy/28/', 'http://www.fangchan.com/plus/nlist.php?tid=2&tags=%E5%8E%9F%E5%88%9B', 'http://www.fangchan.com/plus/nlist.php?tid=2&column=%E5%AE%8F%E8%A7%82', 'http://www.fangchan.com/news/6/', 'http://www.fangchan.com/news/1/', 'http://www.fangchan.com/news/9/', 'http://www.fangchan.com/news/5/', 'http://www.fangchan.com/news/7/', 'http://www.fangchan.com/news/4/', ] for url in start_urls: urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class CctvSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'jingji.cctv.com', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', } def get_html(self, url): ''' 获取顶部 轮播图 :param url: :return: ''' html = requests.get(url) html.encoding = 'utf-8' # log(html.text) html = etree.HTML(html.text) urls = html.xpath('//div[@class="shadow"]/ul/li/p/a/@href') # log(len(urls), urls) return urls def get_jsondata(self): ''' 直接访问json接口 :return: ''' url = 'http://jingji.cctv.com/data/index.json' html = requests.get(url) html.encoding = 'gbk' news_list = json.loads(html.text)['rollData'] urls = [] for news in news_list: urls.append(news['url']) return urls def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'timeout' or news == 'error': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' if html.status_code != 200: return 'error' response = etree.HTML(html.text) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'cctv' return news def parse_item(self, response): try: title = response.xpath( '//div[@class="cnt_bd"]/h1/text()')[0].strip() except Exception as e: title = '页面不存在' try: date = response.xpath( '//span[@class="info"]/i/text()')[1].split()[0] date = arrow.get(date).format('YYYY-MM-DD') except Exception as e: try: date = response.xpath( '//span[@class="info"]/i/text()')[0].split()[1] date = arrow.get(date).format('YYYY-MM-DD') except Exception as e: date = '未知' try: con_list = response.xpath('//div[@class="cnt_bd"]/p') content = self.pasre_content(con_list) except Exception as e: content = '页面不存在' # log(content) # log(title, date) return title, date, content def pasre_content(self, con_list): ''' 解析正文 :param response: :return: ''' content = '' for con in con_list: c = con.xpath('./text()') if len(c) != 0: content = content + c[0].replace(' ', '') return content def run(self): log_line('CctvSpider 启动!!!') urls = [] url = 'http://jingji.cctv.com/' urls_1 = self.get_html(url) urls_2 = self.get_jsondata() urls.extend(urls_1) urls.extend(urls_2) urls = set(urls) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class CsSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() def get_host(self, url): host = url.split('/')[2] return host def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { # 'Host': '', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://www.cs.com.cn/', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url) html.encoding = 'gbk' # log(html.text) pattern = r"\./*[a-z]*/*[a-z]*/[a-z]+/\d+/t\d+_\d+.html" urls = re.findall(pattern, html.text) new_urls = [] for ur in urls: new_urls.append(self.parser_url(ur)) # log(new_urls) return new_urls def parser_url(self, url): log(url) return 'http://www.cs.com.cn' + url[1:] def send_request(self, urls): news_list = [] for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue news = self.get_newsinfo(url) if news == 'error' or news == 'timeout': continue news_list.append(news) return news_list def get_newsinfo(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'gbk' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout' # log(html.text) response = etree.HTML(html.text) if html.status_code != 200: log('访问的URL出错!!!', url) return 'error' # self.parse_item(response) title, date, content = self.parse_item(response) news = News(title=title, date=date, content=content, url=url) news.spider_name = 'cs' return news def parse_item(self, response): try: title = response.xpath('//div[@class="artical_t"]/h1/text()')[0].strip() except Exception as e: title = '未知' try: date = response.xpath('//span[@class="Ff"]/text()')[0].split()[0] except Exception as e: date = response.xpath('//span[@class="ctime01"]/text()')[0].split()[0] try: con_list = response.xpath('//div[@class="artical_c"]/descendant-or-self::*/text()') except Exception as e: con_list = '未知' # contents = [re.sub(r'[a-z]+|\s+', '', cc) for cc in con_list] content = ''.join(con_list).strip() # log('content', content) return title, date, content def run(self): log_line('CsSpider 启动!!!') start_urls = [ 'http://www.cs.com.cn/', ] for url in start_urls: urls = self.get_html(url) news_list = self.send_request(urls) for news in news_list: self.mgr.insert(news) self.__class__().re_send()
class CbrcSpider(BaseSpider):

    def __init__(self):
        self.headers = {}
        self.mgr = MogoMgr()
        self.newslist = []
        self.start_url = 'http://www.cbrc.gov.cn/chinese/zhengcefg.html'

    def get_news_header(self):
        '''
        The headers for requesting the news list differ from those for news detail pages.
        :return:
        '''
        return {
            'Host': 'www.cbrc.gov.cn',
            'User-Agent': randomUserAgent(),
            'Pragma': 'no-cache',
        }

    def get_html(self, url):
        html = requests.get(url, headers=self.get_news_header())
        html.encoding = 'utf-8'
        html = etree.HTML(html.text)
        items = html.xpath('//a[@class="STYLE8"]')
        for item in items:
            news = News()
            news.spider_name = 'cbrc'
            news.url = item.xpath('./@href')[0]
            news.title = item.xpath('./@title')[0]
            news.date = item.getparent().getnext().xpath('./text()')[0].strip()
            self.newslist.append(news)
        return self.parser_url(self.newslist)

    def parser_url(self, newslist):
        base_url = 'http://www.cbrc.gov.cn'
        new_urls = []
        for news in newslist:
            url = base_url + news.url
            news.url = url
            new_urls.append(url)
        return new_urls

    def send_request(self, urls):
        for url in urls:
            # Skip URLs that have already been crawled
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('该URL已经存在 无需请求')
                log(url)
                continue
            content = self.get_content(url)
            if content in ('timeout', 'error'):
                continue
            for news in self.newslist:
                if news.url == url:
                    news.content = content

    def get_content(self, url):
        '''
        Request a single news detail page.
        :param url:
        :return:
        '''
        t_sleep()
        log('当前访问的URL', url)
        try:
            html = requests.get(url, headers=self.get_news_header(), timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            return 'timeout'

        if html.status_code != 200:
            return 'error'

        response = etree.HTML(html.text)
        return self.parse_item(response)

    def parse_item(self, response):
        try:
            con_list = response.xpath(
                '//div[@class="notice_t"]/descendant-or-self::*/text()')
            content = ''.join(con_list).strip().replace('\r\n', '')
        except Exception as e:
            content = '页面不存在'
        return content

    def run(self):
        log_line('CbrcSpider 启动!!!')
        urls = self.get_html(self.start_url)
        self.send_request(urls)

        for news in self.newslist:
            find_one = self.mgr.find_one('url', news.url)
            if find_one is not None:
                log_line('该URL已经存在 无需写入')
                log(news.url)
                continue
            self.mgr.insert(news)
        self.__class__().re_send()
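# CbrcSpider, like every other spider in this file, repeats the same duplicate check against
# MongoDB before requesting and again before inserting. If BaseSpider were extended, that check
# could live in one place; a hypothetical sketch (not part of the current code base), where
# self.mgr is the MogoMgr instance each spider creates in __init__:
class DedupMixin:
    def seen(self, url):
        # True if the URL is already stored, so the caller can skip the request or the insert.
        if self.mgr.find_one('url', url) is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            return True
        return False

# Usage inside a spider's send_request loop would then reduce to:
#     if self.seen(url):
#         continue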
class AmacSpider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() self.newslist = [] self.retry = -1 self.retry_flag = -1 self.failurls = [] self.start_urls = [ 'http://www.amac.org.cn/flfg/flfgwb/', ] def get_news_header(self): ''' 请求新闻列表的请求头与请求新闻详情的请求头不一样 :return: ''' return { 'Host': 'www.amac.org.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', } def get_html(self, url): ''' :param url: :return: ''' html = requests.get(url, headers=self.get_news_header()) html.encoding = 'utf-8' html = etree.HTML(html.text) items = html.xpath('//div[@class="newsTrTitle"]/a') # log_line(len(items)) for item in items: self.parser_item(item) def parser_item(self, item): news = News() news.spider_name = 'amac' news.url = self.parser_url( item.xpath('./@href')[0], 'http://www.amac.org.cn') news.title = item.xpath('./text()')[0] self.newslist.append(news) def parser_url(self, url, base_url): return base_url + url.split('../..')[1] def get_newsUrls(self): return [news.url for news in self.newslist] def send_request(self, urls): for url in urls: # 避免重复请求 find_one = self.mgr.find_one('url', url) if find_one is not None: log_line('该URL已经存在 无需请求') log(url) continue date, content = self.parser_data(url) if content in ('error', 'timeout'): continue self.update_news(url, content, date) def parser_data(self, url): ''' 请求每一个新闻详情 :param url: :return: ''' t_sleep() log('当前访问的URL', url) try: html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' except Exception as e: log_line('访问出错') print(e) self.__class__.retry = 1 return 'timeout', 'timeout' if html.status_code != 200: return 'error', 'error' response = etree.HTML(html.text) con_list = response.xpath( '//div[@class="ldContent"]/descendant-or-self::*/text()') content = ''.join(con_list).strip() date = response.xpath('//div[@class="ldDate"]/text()')[0] date = date.split(':')[1] # log('内容', content) return date, content def update_news(self, url, content, date): for news in self.newslist: if news.url == url: news.content = content news.date = date def run(self): log_line('AmacSpider 启动!!!') for url in self.start_urls: self.get_html(url) self.send_request(self.get_newsUrls()) for news in self.newslist: find_one = self.mgr.find_one('url', news.url) if find_one is not None: log_line('该URL已经存在 无需写入') log(news.url) continue self.mgr.insert(news) self.__class__().re_send()
class Circ2Spider(BaseSpider): def __init__(self): self.headers = {} self.mgr = MogoMgr() self.newslist = [] self.start_urls = [ 'http://www.gov.cn/pushinfo/v150203/base_14px_pubdate.htm', ] def get_news_header(self): return { # 'Host': 'www.gov.cn', 'User-Agent': randomUserAgent(), 'Pragma': 'no-cache', 'Referer': 'http://www.circ.gov.cn/web/site0/tab7642/', } def get_html(self, url): ''' :param url: :return: ''' t_sleep() # log('当前访问的URL', url) html = requests.get(url, headers=self.get_news_header(), timeout=3) html.encoding = 'utf-8' # try: # html = requests.get(url, headers=self.get_news_header(), timeout=3) # html.encoding = 'utf-8' # except Exception as e: # log_line('访问出错') # print(e) # self.__class__.retry = 1 # # return 'timeout' # if html.status_code != 200: # return 'error' html = etree.HTML(html.text) items = html.xpath('//ul[@class="list"]/li') # log(len(items)) for item in items: self.parser_item(item) def parser_item(self, item): url = item.xpath('./a/@href')[0] date = item.xpath('./span/text()')[0] news = News() news.spider_name = 'circ' news.url = self.parser_url(url, 'http://www.gov.cn') news.title = item.xpath('./a/text()')[0] news.date = date # log(news.url, news.title, news.date) self.newslist.append(news) def parser_url(self, url, base_url): if str(url).startswith('http'): return url else: return base_url + url def get_newsUrls(self): return [news.url for news in self.newslist] def run(self): log_line('Circ2Spider 启动!!!') for url in self.start_urls: self.get_html(url) for news in self.newslist: find_one = self.mgr.find_one('url', news.url) if find_one is not None: log_line('该URL已经存在 无需写入') log(news.url) continue self.mgr.insert(news) self.__class__().re_send()
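# Each spider exposes run() and re-runs itself once via re_send() when a request failed, so a
# scheduler only needs to call run() on every class. A minimal, assumed entry point (the real
# project may wire this up differently, e.g. from a cron job or a main module not shown here):
if __name__ == '__main__':
    spiders = [
        XinHuaSpider(), CctvSpider(), CsSpider(), StcnSpider(), HeXunSpider(),
        CnstockSpider(), ZqrbSpider(), ShangHaiSpider(), BjjrjSpider(), PbcSpider(),
        CircSpider(), Circ2Spider(), CsrcSpider(), CbrcSpider(), MoHurdSpider(),
        AmacSpider(), FangChanSpider(),
    ]
    for spider in spiders:
        try:
            spider.run()
        except Exception as e:
            # Keep going if one source breaks; individual spiders already log their own errors.
            log_line('爬虫运行出错')
            print(e)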