def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset')

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)

    for header in headers:
        # "查看更多相关新闻" (view more related news) link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url, depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # Save to database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(uuid, title, author, release_time,
                                                                website_name, website_domain,
                                                                website_position, url, content)
                if not is_continue:
                    break
    else:
        # The loop ended normally: every article on this page was stored, so queue the next page
        offset += 50
        url = tools.replace_str(root_url, r'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0, remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
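# Illustrative sketch (not part of the original module): the next-page URL above is built by
# rewriting Baidu's "pn" query parameter in place. Assuming tools.replace_str is a thin wrapper
# around re.sub, that step amounts to:
import re

def next_page_url(root_url, offset):
    """Advance the pn= offset of a Baidu News result URL, e.g. pn=0 -> pn=50."""
    return re.sub(r'pn=\d*', 'pn=%d' % offset, root_url)

# next_page_url('http://news.baidu.com/ns?word=test&pn=0', 50)
# -> 'http://news.baidu.com/ns?word=test&pn=50'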
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    fit_url = tools.fit_url(urls, FIT_URLS)
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article on the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.replace_str(title, '&.*?;')

    # Content
    regexs = ['<div id="end_text".*?>(.*?)<div class="post_btmshare">',
              '<div class="post_text".*?>(.*?)<div class="post_btmshare">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth   = %d
                url     = %s
                title   = %s
                content = %s
             ''' % (depth + 1, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
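# Illustrative sketch (assumption: tools.get_info wraps re.findall). The language filter above
# keeps only pages containing at least one character in the CJK range U+4E00-U+9FA5:
import re

def has_chinese(html):
    """Return True if the page contains any Chinese character."""
    return bool(re.search('[\u4e00-\u9fa5]', html or ''))

# has_chinese('<p>hello</p>')       -> False
# has_chinese('<p>人民网新闻</p>')   -> True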
def get_article_content(self, data, req_url):
    log.debug('fetching article content')
    if data:
        # Articles flagged as unverified return no content on the first request
        # and redirect to https://mp.weixin.qq.com/mp/rumor
        req_url = req_url.replace('amp;', '')
        mid = tools.get_param(req_url, 'mid') or tools.get_param(
            req_url, 'appmsgid')  # message id; posts published on the same day share the same mid
        idx = tools.get_param(req_url, 'idx') or tools.get_param(
            req_url, 'itemidx')  # position of the article within the post, counted from 1
        article_id = mid + idx  # concatenating mid and idx identifies one unique article, e.g. mid = 2650492260, idx = 1 -> article_id = 26504922601
        WechatAction._current_aritcle_id = article_id  # remember the current article id so the comment fetcher can map comments back to it
        print('current article id ' + WechatAction._current_aritcle_id)

        regex = '(<div class="rich_media_content ".*?)<script nonce'
        content = tools.get_info(data, regex, fetch_one=True)
        if content:
            # Cache the article content
            WechatAction._article_info[article_id]['content'] = content

            # Official account name
            regex = '<title>(.*?)</title>'
            account = tools.get_info(data, regex, fetch_one=True)
            WechatAction._article_info[article_id]['account'] = account
        else:
            # Articles flagged as untrue never request view/like counts, so store them immediately
            regex = '<title>(.*?)</title>'
            content = tools.get_info(data, regex, fetch_one=True)
            WechatAction._article_info[article_id]['content'] = content

            # Save to database
            print('article flagged as untrue; view/like counts are never requested, storing it now')
            WechatAction._wechat_service.add_article_info(
                WechatAction._article_info.pop(article_id))

        # If the next url is the article-list link, refresh its appmsg_token so the list link does not expire
        if (len(WechatAction._todo_urls) == 1) and ('/mp/profile_ext' in WechatAction._todo_urls[-1]):
            regex = 'appmsg_token = "(.*?)"'
            appmsg_token = tools.get_info(data, regex, fetch_one=True).strip()
            WechatAction._todo_urls[-1] = tools.replace_str(
                WechatAction._todo_urls[-1], 'appmsg_token=.*?&',
                'appmsg_token=%s&' % appmsg_token)

        return self.__open_next_page()
    else:
        # No article content
        pass
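# Illustrative sketch (not the project's own helper): tools.get_param presumably extracts one
# query parameter from the article URL. With the standard library alone, the mid/idx
# concatenation used above for article_id looks roughly like this:
from urllib.parse import urlparse, parse_qs

def make_article_id(req_url):
    """Concatenate mid and idx, e.g. mid=2650492260, idx=1 -> '26504922601'."""
    params = parse_qs(urlparse(req_url).query)
    mid = (params.get('mid') or params.get('appmsgid') or [''])[0]
    idx = (params.get('idx') or params.get('itemidx') or [''])[0]
    return mid + idx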
def del_not_use_tag(content):
    # Strip scripts, styles and HTML comments
    content = tools.replace_str(content, '<script(.|\n)*?</script>')
    content = tools.replace_str(content, '<style(.|\n)*?</style>')
    content = tools.replace_str(content, '<!--(.|\n)*?-->')
    content = content.replace('</p>', '/p')            # protect paragraph ends before stripping tags
    content = tools.replace_str(content, '<[^p].*?>')  # drop every tag except <p ...>
    content = content.replace('/p', '</p>')            # restore the protected paragraph ends
    content = tools.replace_str(content, '&.*?;')      # drop HTML entities
    content = tools.replace_str(content, '[ \f\r\t\v]')
    return content
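# Illustrative sketch (assumption: tools.replace_str(text, regex) behaves like re.sub(regex, '', text)):
# the cleaner above keeps paragraph markup and removes every other tag, script, style, comment,
# entity and stray whitespace. A standard-library equivalent would be roughly:
import re

def del_not_use_tag_plain(content):
    for pattern in ('<script(.|\n)*?</script>', '<style(.|\n)*?</style>', '<!--(.|\n)*?-->'):
        content = re.sub(pattern, '', content)
    content = content.replace('</p>', '/p')       # protect paragraph ends
    content = re.sub('<[^p].*?>', '', content)    # strip all tags except <p ...>
    content = content.replace('/p', '</p>')       # restore paragraph ends
    content = re.sub('&.*?;', '', content)        # drop HTML entities
    return re.sub('[ \f\r\t\v]', '', content)

# del_not_use_tag_plain('<p>你好<script>x()</script> 世界</p>')  -> '<p>你好世界</p>'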
def format_keys(keywords):
    '''
    @summary: & expresses AND, | expresses OR; terms wrapped in parentheses form one group
    ---------
    @param keywords:
    ---------
    @result:
    '''
    keywords = keywords.replace('（', '(')
    keywords = keywords.replace('）', ')')
    keywords = keywords.replace(')(', ')&(')
    print(keywords)

    chinese_word = tools.get_chinese_word(keywords)
    keywords = keywords.split(',')
    for i in range(len(keywords)):
        keywords[i] = keywords[i].strip()
        # print('--------------------------')
        # print(keywords[i])
        # chinese_word = tools.get_chinese_word(keywords[i])

        regex = '[a-zA-Z 0-9:]+'
        english_words = tools.get_info(keywords[i], regex, allow_repeat=True)
        while ' ' in english_words:
            english_words.remove(' ')
        # print(english_words)

        print('=========================')
        for j in range(len(english_words)):
            english_words[j] = english_words[j].strip()
            if english_words[j]:
                keywords[i] = keywords[i].replace(english_words[j], '%s')

        keywords[i] = tools.replace_str(keywords[i], ' +', '&')
        print(keywords[i])
        print(english_words)

        keywords[i] = keywords[i] % (tuple(english_words)) if '%s' in keywords[i] else keywords[i]

    keywords = ')|('.join(keywords)
    keywords = '(' + keywords + ')' if not keywords.startswith('(') and keywords else keywords

    return keywords
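# Illustrative usage (assuming the tools.* helpers behave as described above): comma-separated
# groups become OR terms joined by |, spaces inside a Chinese group become & (AND), and an
# English phrase passes through unchanged via the %s placeholder.
if __name__ == '__main__':
    # Expected shape of the result, roughly: '(北京&雾霾)|(upstream news)'
    print(format_keys('北京 雾霾,upstream news'))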
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='GB2312')
    if html is None:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('article_urls', source_url, Constance.EXCEPTION)
        return

    # Get every url on the current page
    urls = tools.get_urls(html, STOP_URLS)
    # Filter out external links, then queue the rest in the database
    fit_url = tools.fit_url(urls, "people.com.cn")
    for url in fit_url:
        # log.debug('url = ' + url)
        base_parser.add_url('article_urls', website_id, url, depth + 1)

    # Extract the article on the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.replace_str(title, '&.*?;')

    # Content
    regexs = ['div class="box_pic"></div>(.*?)<div class="box_pic"></div>',
              '<div class="content clear clearfix">(.*?)<div class="edit clearfix">',
              '<div class="show_text">(.*?)<div class="edit">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth      = %d
                source_url = %s
                title      = %s
                content    = %s
             ''' % (depth, source_url, title, content))

    if content and title:
        base_parser.add_article_info('article_text_info', website_id, source_url, title, content)

    # Mark source_url as done
    base_parser.update_url('article_urls', source_url, Constance.DONE)
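# Illustrative sketch (assumption: tools.get_info accepts a list of patterns and returns the
# matches of the first one that hits). The content extraction above tries several page layouts
# in order; with the standard library alone the fallback logic looks roughly like this:
import re

def first_match(html, patterns):
    """Return the first capture group of the first pattern that matches, else ''."""
    for pattern in patterns:
        match = re.search(pattern, html, re.S)
        if match:
            return match.group(1)
    return ''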