def get_content(self, url):
    """Fetch one news detail page and return its concatenated body text.

    :param url: detail-page URL to fetch
    :return: stripped text content, 'timeout' on a request failure,
             or 'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        resp = requests.get(url, headers=self.get_news_header(), timeout=3)
        resp.encoding = 'utf-8'
    except Exception as err:
        log_line('访问出错')
        print(err)
        self.__class__.retry = 1
        return 'timeout'
    if resp.status_code != 200:
        log('访问的URL出错!!!', url)
        return 'error'
    tree = etree.HTML(resp.text)
    fragments = tree.xpath('//div[@class="union"]/descendant-or-self::*/text()')
    return ''.join(fragments).strip()
def parser_data(self, url):
    """Fetch a news detail page and extract its publish date and body text.

    :param url: detail-page URL
    :return: (date, content) on success, ('timeout', 'timeout') on a request
             failure, ('error', 'error') on a bad status or missing page nodes
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        self.__class__.retry = 1
        return 'timeout', 'timeout'
    if html.status_code != 200:
        return 'error', 'error'
    response = etree.HTML(html.text)
    con_list = response.xpath(
        '//div[@class="ldContent"]/descendant-or-self::*/text()')
    content = ''.join(con_list).strip()
    # Guard against layout changes: a missing or malformed date node
    # previously raised IndexError and aborted the caller's whole loop.
    date_nodes = response.xpath('//div[@class="ldDate"]/text()')
    if not date_nodes or ':' not in date_nodes[0]:
        return 'error', 'error'
    date = date_nodes[0].split(':')[1]
    return date, content
def get_newsinfo(self, url):
    """Fetch a news detail page and build a News model from it.

    :param url: detail-page URL
    :return: News on success, 'timeout' on request failure,
             'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        self.__class__.retry = 1
        return 'timeout'
    if html.status_code != 200:
        log_line('请求状态不是200')
        return 'error'
    response = etree.HTML(html.text)
    # parse_item was previously invoked twice (first result discarded);
    # one call is sufficient and avoids duplicated work/side effects.
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'bjjrj'
    return news
def get_lunbo(self):
    """Crawl the carousel on the Xinhua finance front page.

    Only links containing the current year are considered; URLs already
    stored in the database are skipped.

    :return: list of freshly scraped News objects
    """
    page = requests.get('http://www.news.cn/fortune/',
                        headers=self.get_caijing_header())
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    links = tree.xpath('//div[@class="swiper-slide"]/a/@href')
    this_year = str(arrow.now().date().year)
    collected = []
    for link in links:
        if this_year not in link:
            continue
        log('需要访问的URL 轮播图', link)
        if self.mgr.find_one('url', link) is not None:
            log_line('该URL已经存在 无需请求')
            log(link)
            continue
        news = self.get_iteminfo(link)
        if news == 'timeout' or news == 'error':
            continue
        collected.append(news)
    return collected
def re_send(cls):
    """Re-run the spider once when some news requests failed.

    Fires only when a retry was flagged (retry != -1) and no retry has
    happened yet (retry_flag == -1); marks the retry as consumed before
    running so it cannot loop.
    """
    if cls.retry == -1 or cls.retry_flag != -1:
        return
    log_line('部分新闻访问出错 再次进行访问')
    log('再次运行的爬虫类型', cls)
    cls.retry_flag = 1
    cls().run()
def get_content(self, url):
    """Fetch a news detail page and delegate parsing to parse_item.

    :param url: detail-page URL
    :return: parse_item's result, 'timeout' on request failure,
             or 'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        page = requests.get(url, headers=self.get_news_header(), timeout=3)
        page.encoding = 'utf-8'
    except Exception as exc:
        log_line('访问出错')
        print(exc)
        self.__class__.retry = 1
        return 'timeout'
    if page.status_code != 200:
        return 'error'
    return self.parse_item(etree.HTML(page.text))
def get_iteminfo(self, url):
    """Fetch one news detail page and build a News model.

    :param url: detail-page URL
    :return: News on success, 'timeout' on request failure,
             'error' on a non-200 response
    """
    t_sleep()
    log('当前访问的URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('访问出错')
        print(e)
        # Flag the retry so re_send() can re-run this spider — every other
        # request helper in this project sets it; this one had been missed.
        self.__class__.retry = 1
        return 'timeout'
    if html.status_code != 200:
        log('访问的URL出错!!!', url)
        return 'error'
    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'xinhua'
    return news
def get_itemlist(self, page='1'):
    """Fetch one page of the Xinhua finance list API and scrape new items.

    :param page: page number (as a string) for the list endpoint
    :return: list of News objects for URLs not yet stored
    """
    # Xinhua finance — news list endpoint.
    api = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={0}&cnt=16&tp=1&orderby=1'.format(page)
    raw = requests.get(api, headers=self.get_newlist_header())
    # The endpoint wraps its JSON payload in one extra character on each side.
    payload = json.loads(raw.text[1:-1])
    results = []
    for entry in payload['data']['list']:
        link = entry['LinkUrl']
        # Avoid re-requesting links that are already in the database.
        if self.mgr.find_one('url', link) is not None:
            log_line('该URL已经存在 无需请求')
            log(link)
            continue
        news = self.get_iteminfo(link)
        if news == 'timeout' or news == 'error':
            continue
        results.append(news)
    return results
def get_newsinfo(self, urls):
    """Visit each news detail URL and insert the parsed item into Mongo.

    :param urls: iterable of news detail URLs
    """
    # One manager for the whole batch — previously a new MogoMgr (and its
    # connection) was constructed for every single URL inside the loop.
    mgr = MogoMgr()
    for url in urls:
        t_sleep()
        log('当前访问的URL', url)
        try:
            html = requests.get(url, timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('访问出错')
            print(e)
            self.__class__.retry = 1
            continue
        if html.status_code != 200:
            continue
        response = etree.HTML(html.text)
        item = self.parse_item(response, html.url)
        mgr.insert(item)
def get_newsinfo(self, url):
    """Fetch one news detail page (GBK encoded) and build a News model.

    :param url: detail-page URL
    :return: News on success, 'timeout' on a request failure
    """
    t_sleep()
    try:
        page = requests.get(url, headers=self.get_news_header(), timeout=3)
        page.encoding = 'gbk'
    except Exception as exc:
        log_line('访问出错')
        print(exc)
        return 'timeout'
    tree = etree.HTML(page.text)
    log('当前访问的URL', url)
    title, date, content = self.parse_item(tree)
    return News(title=title, date=date, content=content, url=url)
def parser_url(self, urls):
    """Resolve relative article links against the gzjr site root.

    PDF links are dropped; each remaining '../../'-style path is re-rooted
    onto the base URL.

    :param urls: raw hrefs scraped from the list page
    :return: list of absolute URLs
    """
    base_url = 'http://www.gzjr.gov.cn/'
    resolved = []
    for raw in urls:
        if str(raw).endswith('.pdf'):
            continue
        absolute = base_url + raw.split('../../')[1]
        log('拼接后的url', absolute)
        resolved.append(absolute)
    return resolved
def send_request(self, urls):
    """Request each URL and collect the resulting News models.

    :param urls: iterable of news detail URLs
    :return: list of News objects (failed fetches are skipped)
    """
    news_list = []
    for url in urls:
        # Avoid re-requesting URLs that are already stored.
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            continue
        news = self.get_newsinfo(url)
        # get_newsinfo returns the sentinel string 'timeout' (or 'error')
        # on failure; previously those strings were appended to news_list
        # and broke downstream inserts that expect News objects.
        if news == 'timeout' or news == 'error':
            continue
        news_list.append(news)
    return news_list
def get_html(self, url):
    """Download the list page and extract the article hrefs.

    :param url: list-page URL
    :return: absolute article URLs produced by parser_url
    """
    page = requests.get(url)
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    hrefs = tree.xpath('//div[@class="mainContent"]/ul/li/a/@href')
    log('提取的URL', hrefs)
    return self.parser_url(hrefs)
def send_request(self, urls):
    """Fetch date/content for each new URL and push updates to storage.

    :param urls: iterable of news detail URLs
    """
    for url in urls:
        # Already stored → nothing to do.
        if self.mgr.find_one('url', url) is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            continue
        date, content = self.parser_data(url)
        if content in ('error', 'timeout'):
            continue
        self.update_news(url, content, date)
def send_request(self, urls):
    """Fetch content for each new URL and store it via update_content.

    :param urls: iterable of news detail URLs
    """
    for url in urls:
        # Skip URLs that are already in the database.
        if self.mgr.find_one('url', url) is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            continue
        content = self.get_content(url)
        if content in ('error', 'timeout'):
            continue
        self.update_content(url, content)
def run(self):
    """Entry point: crawl the start URL, persist new items, then retry if flagged."""
    log_line('CbrcSpider 启动!!!')
    self.send_request(self.get_html(self.start_url))
    for news in self.newslist:
        # Do not write URLs that already exist.
        if self.mgr.find_one('url', news.url) is not None:
            log_line('该URL已经存在 无需写入')
            log(news.url)
            continue
        self.mgr.insert(news)
    self.__class__().re_send()
def send_request(self, urls):
    """Fetch content for each new URL and attach it to the matching News.

    :param urls: iterable of news detail URLs
    """
    for url in urls:
        # Avoid re-requesting URLs that are already stored.
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            continue
        content = self.get_content(url)
        # BUG FIX: the original condition was
        #   `if content == 'timeout' or 'error':`
        # which is always truthy, so every URL was skipped and news.content
        # was never filled in.
        if content in ('timeout', 'error'):
            continue
        for news in self.newslist:
            if news.url == url:
                news.content = content
def get_newslist(self):
    """Scrape the 21jingji homepage for article links.

    :return: list of hrefs taken from the headline anchors
    """
    page = requests.get('http://www.21jingji.com/')
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    links = tree.xpath('//a[@class="listTit"]/@href')
    log(len(links))
    return links
def run(self):
    """Entry point: crawl every start URL, persist new items, then retry if flagged."""
    log_line('CircSpider 启动!!!')
    for url in self.start_urls:
        self.get_html(url)
    self.send_request(self.get_newsUrls())
    for news in self.newslist:
        # Do not write URLs that already exist.
        if self.mgr.find_one('url', news.url) is not None:
            log_line('该URL已经存在 无需写入')
            log(news.url)
            continue
        self.mgr.insert(news)
    self.__class__().re_send()
def get_newsinfo(self, url, parser_item_fuc):
    """Fetch one news detail page and build a News via the supplied parser.

    :param url: detail-page URL
    :param parser_item_fuc: callable(response) -> (title, date, content)
    :return: News on success, 'error' on a request failure or bad status
    """
    t_sleep()
    log('当前访问的URL', url)
    html = self.get_html(url)
    # get_html signals a failed request with the sentinel 'timeout';
    # this helper folds that into the single 'error' return value.
    if html == 'timeout':
        return 'error'
    log('当前访问的URL', url, html.status_code)
    if html.status_code != 200:
        log('访问的URL出错!!!', url)
        return 'error'
    # Build the DOM only after the status check: the original parsed the
    # body first, doing wasted work (and risking a parse crash) on
    # responses that were about to be rejected anyway.
    response = etree.HTML(html.text)
    title, date, content = parser_item_fuc(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'pbc'
    return news
def get_money(self):
    """Crawl the Xinhua money index page.

    Collects links from the news list, the 'imp' items and the carousel,
    then scrapes each link that is not already stored.

    :return: list of News objects
    """
    page = requests.get('http://www.xinhuanet.com/money/index.htm',
                        headers=self.get_news_header())
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    candidates = []
    # Only the news list, 'imp' items and carousel are processed.
    candidates.extend(tree.xpath('//li[@class="clearfix"]/h3/a/@href'))
    candidates.extend(tree.xpath('//li[@class="imp"]/a/@href'))
    candidates.extend(tree.xpath('//div[@class="swiper-slide"]/a/@href'))
    collected = []
    for link in candidates:
        # Skip links that are already in the database.
        if self.mgr.find_one('url', link) is not None:
            log_line('该URL已经存在 无需请求')
            log(link)
            continue
        news = self.get_iteminfo(link)
        if news == 'timeout' or news == 'error':
            continue
        collected.append(news)
    return collected
def send_request(self, urls, parser_item_fuc):
    """Fetch every new URL and parse it with the supplied item parser.

    :param urls: specific news URLs to request
    :param parser_item_fuc: function used to parse one news detail page
    :return: list of parsed News objects
    """
    collected = []
    for url in urls:
        # Skip URLs that are already stored.
        if self.mgr.find_one('url', url) is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            continue
        news = self.get_newsinfo(url, parser_item_fuc)
        if news in ('error', 'timeout'):
            continue
        collected.append(news)
    return collected
def __init__(self, model, input_shape, train_data_path=None):
    """Initialize the client-side model/data handler.

    :param model: user-defined network model; per the original note it is
        expected to be an MXNet ``nn.Block`` subclass — TODO confirm
    :param input_shape: input shape used when randomly initializing the model
    :param train_data_path: path to the local training data (may be None)
    """
    # Model initialization: keep the user model and pick compute contexts
    # (try_all_gpus falls back as implemented in Tools.utils).
    self.__net = model
    self.input_shape = input_shape
    self.__ctx = Tools.utils.try_all_gpus()
    self.__random_init_model()
    # Local gradient bookkeeping: per-layer weight/bias gradient lists.
    self.local_gradient = {"weight": [], "bias": []}
    self.__init_gradient_list()
    # Path to the local training data.
    self.train_data_path = train_data_path
    # Logger instance (Windows-style path join — presumably intentional
    # for this project's target platform; verify if run elsewhere).
    self.log = log(path_base + "\\Fed_Client\\log")
    print("-Client Data Handler初始化完成-")
    print(self.__net)
def insert(self, item):
    """Insert one model object into the Mongo collection.

    Items containing sensitive keywords are flagged via 'show_sended';
    duplicate URLs (unique index) are logged and ignored.

    :param item: object whose __dict__ becomes the stored document
    """
    doc = item.__dict__
    if has_keywords(doc):
        doc['show_sended'] = '1'
        log('含有敏感关键字')
    log('插入数据')
    try:
        self.sheet.insert(doc)
    except DuplicateKeyError:
        log('数据重复 无需插入', doc['url'])
def __init__(self, model, input_shape, train_data):
    """Initialize the federated-learning client.

    Loads server address/config from client_config.json, builds the model
    handler, then synchronously pulls training parameters from the server.

    :param model: user-defined network model passed to Client_data_handler
    :param input_shape: model input shape
    :param train_data: path to the local training data
    """
    # Networking: read server address and default model save path from config.
    with open(path_base + "\\Fed_Client\\client_config.json", 'r') as f:
        json_data = json.load(f)
    self.server_addr = (json_data['server_ip'], json_data['server_port'])
    self.recv_model_savepath = json_data[
        'default_path']  # recv_model.params
    self.client_sock = socket.socket()
    # Model-handling delegate.
    self.data_handler = Client_data_handler(model, input_shape=input_shape,
                                            train_data_path=train_data)
    # Training mode and hyperparameters: fetched from the server side by
    # __param_sync below, so they start empty/None here.
    self.train_mode = ""
    self.learning_rate = None
    self.batch_size = None
    self.epoch = None
    self.__param_sync()  # synchronize parameters from the server
    # Logger instance.
    self.log = log(path_base + "\\Fed_Client\\log")
def send_request(self, urls):
    """Request each new URL and collect successful News results.

    :param urls: iterable of news detail URLs
    :return: list of News objects
    """
    collected = []
    for url in urls:
        # Skip URLs that are already stored.
        if self.mgr.find_one('url', url) is not None:
            log_line('该URL已经存在 无需请求')
            log(url)
            continue
        news = self.get_newsinfo(url)
        if news == 'error':
            log('访问的新闻不存在 继续访问下一个URL')
            continue
        if news == 'timeout':
            log('访问的新闻超时 暂时跳过')
            continue
        collected.append(news)
    return collected
def parser_url(self, url):
    """Convert a site-relative href (leading '.') into an absolute URL.

    :param url: relative path such as './xxx/yyy.html'
    :return: absolute URL rooted at www.cs.com.cn
    """
    log(url)
    absolute = 'http://www.cs.com.cn' + url[1:]
    return absolute
# szjrj.SzJrjSpider().run, # xinhua.XinHuaSpider().run, # zqrb.ZqrbSpider().run, ] start = time.time() threads = [] for index, target in enumerate(targets): t = threading.Thread(target=target) threads.append(t) t.setDaemon(True) t.start() for t in threads: t.join() # 发送邮件 log('准备发送邮件') manager = EmailManager(Setting.SERVER_USER, Setting.SERVER_PASSWORD) manager.send(manager.get_emails()) log('邮件发送完成') cost = time.time() - start log('耗时', cost) ''' 邮件内容需要增加: 新闻所在的网站 新闻关键词 '''