def __init__(self, url, html=None, language='zh'):
    self._html = html
    self._url = url
    if not html:
        self._html = tools.get_html(url)
    self._text = self.__del_html_tag(self._html, save_useful_tag=True)
def __init__(self, url, html=None, language='zh'):
    self._html = html
    self._url = url
    self._content_start_pos = ''
    self._content_end_pos = ''
    self._content_center_pos = ''
    self._paragraphs = ''
    if not html:
        self._html = tools.get_html(url)
    self._text = self.__del_html_tag(self._html, save_useful_tag=True)
def delete_site(request):
    site_id = request.POST.get("site_id", None)
    if site_id:
        site = Site.objects.filter(pk=int(site_id)).first()
        if site:
            site.delete()
            msg = '删除成功'  # "deleted successfully"
            html = tools.get_html(html_path="app/msg.html",
                                  data={'msg': msg, 'msg_type': 'success'})
            return jsonSuccess(msg=msg, data={"html": html})
    return jsonFailed(code=401, msg="没有找到该站点!该站点可能已经被删除")  # "site not found; it may already have been deleted"
def parser(url_info):
    (root_url, depth, remark, website_name, website_position, website_url,
     website_domain, spider_depth) = parser_url_info(url_info)

    html = tools.get_html(root_url)
    if not html:
        log.debug('请求url失败')  # request for the url failed
        # base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # collect the next batch of urls to crawl
    add_html_url(html, depth, spider_depth, website_url, website_name,
                 website_domain, remark)

    # parse the page
    parser_article(root_url, html, website_name, website_domain, website_position)
def __init__(self, url, html=None, language='zh'):
    self._html = html
    self._url = url
    self._content_start_pos = ''
    self._content_end_pos = ''
    self._content_center_pos = ''
    self._paragraphs = ''
    if not html:
        self._html = tools.get_html(url)
    self._text = self.__del_html_tag(self._html)

    self.stripper = re.compile(r'\s+')
    self.anchor_ratio_limit = 0.3
    self.impurity_threshold = 30

    self.doc = lxml.html.fromstring(self._text)
    self.region = Region(self.doc)
def add_site(request):
    form = SiteForm(request.POST)
    if form.is_valid():
        site_id = request.POST.get("site_id")
        name = form.cleaned_data.get('name', None)
        href = form.cleaned_data.get('href', None)
        coding = form.cleaned_data.get('coding', None)
        restart = form.cleaned_data.get('restart', None)
        deploy = form.cleaned_data.get('deploy', None)
        update_cert = form.cleaned_data.get('update_cert', None)
        copy = form.cleaned_data.get('copy', None)
        need_verification = request.POST.get("custom-switch-checkbox") == 'on'
        overseas = request.POST.get("overseas-switch-checkbox") == 'on'

        if site_id:
            site = Site.objects.filter(id=int(site_id)).first()
            action = "change"
            site_tr_html = None
            if site:
                site.name = name
                site.href = href
                site.coding = coding
                site.need_verification = need_verification
                site.restart = restart
                site.deploy = deploy
                site.update_cert = update_cert
                site.copy = copy
                site.overseas = overseas
                site.save()
                msg = "修改成功"  # "updated successfully"
                msg_type = "success"
                site_tr_html = tools.get_html('app/site-tr.html', {'site': site})
            else:
                msg = "修改失败,没有找到该站点,也许站点已经被删除"  # "update failed: site not found, it may have been deleted"
                msg_type = "danger"
        else:
            site = Site.objects.create(name=name,
                                       href=href,
                                       coding=coding,
                                       need_verification=need_verification,
                                       copy=copy,
                                       deploy=deploy,
                                       restart=restart,
                                       update_cert=update_cert,
                                       overseas=overseas)
            msg = "添加成功"  # "added successfully"
            msg_type = "success"
            action = "create"
            site_tr_html = tools.get_html('app/site-tr.html', {'site': site})

        msg_html = tools.get_html('app/msg.html', {
            'msg': msg,
            'msg_type': msg_type
        })
        return jsonSuccess(msg="添加成功",
                           data={"site_tr_html": site_tr_html,
                                 'msg_html': msg_html,
                                 'action': action,
                                 'site_id': site.id})
    else:
        errors = form.get_errors(data_type='list')
        msg = errors[0].get("message", None) if errors else '添加失败'  # "failed to add"
        msg_html = tools.get_html('app/msg.html', {
            'msg': msg,
            'msg_type': 'danger'
        })
        return jsonFailed(code=401, msg=msg, data={'msg_html': msg_html})
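Note that in the two Django views above, tools.get_html is called with a template path and a context dict, so the same helper name doubles as a template renderer rather than an HTTP fetcher. Its source is not part of these snippets; the sketch below is only an assumption about what such a wrapper could look like, using Django's standard render_to_string, not the project's actual implementation.

# Hypothetical sketch only: the real tools.get_html used by add_site/delete_site
# is not shown here; this assumes it simply wraps django.template.loader.render_to_string.
from django.template.loader import render_to_string

def get_html(html_path, data=None):
    # Render a template file (e.g. 'app/msg.html') with a context dict into an HTML string.
    return render_to_string(html_path, data or {})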
urls = [
    # 'http://cnews.chinadaily.com.cn/2017-12/06/content_35230092.htm',
    # 'http://e.gmw.cn/2017-12/04/content_26998661.htm',
    # 'http://www.sohu.com/a/208241102_570245',
    # 'http://cnews.chinadaily.com.cn/2017-12/06/content_35230092.htm',
    # 'http://news.eastday.com/eastday/13news/auto/news/society/20171206/u7ai7256226.html',
    # 'http://cj.sina.com.cn/article/detail/6185269244/510492',
    # 'http://0575gwy.com/index.php/Index/show/id/2130',
    # 'http://hdmedicine.com.cn/News_info.aspx?News_Id=787&CateId=24',
    # 'http://www.qz001.gov.cn/info/view/86ec076d71a44869ab71e00e5707f89e',
    # 'http://payh.gov.cn/Art/Art_2/Art_2_795.aspx',
    'http://qiushi.nbgxedu.com/show.aspx?id=d479b45a-1747-4f60-83f3-f1e2dc85a0d2'
]

for url in urls:
    html = tools.get_html(url)
    article_extractor = ArticleExtractor(url, html)
    title = article_extractor.get_title()
    release_time = article_extractor.get_release_time()
    author = article_extractor.get_author()
    content = article_extractor.get_content()

    print('---------------------------')
    print(url)
    print('title : ', title)
    print('release_time: ', release_time)
    print('author: ', author)
    print('content : ', content)
    print('---------------------------')
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))  # "processing ..."

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # collect the next batch of urls to crawl
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':  # "Baidu News"
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)

    # parse the page
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()

        uuid = tools.get_uuid(title, website_domain) if title != website_name \
            else tools.get_uuid(root_url, ' ')

        log.debug('''
            uuid            %s
            title           %s
            author          %s
            release_time    %s
            website_name    %s
            domain          %s
            position        %s
            url             %s
            content         %s
            ''' % (uuid, title, author, release_time, website_name,
                   website_domain, website_position, root_url, content))

        if tools.is_have_chinese(content):
            # save the article to the database
            self_base_parser.add_news_acticle(uuid, title, author, release_time,
                                              website_name, website_domain,
                                              website_position, root_url, content)

    log.debug('%s 处理完成' % root_url)  # "... done"
    base_parser.update_url('news_urls', root_url, Constance.DONE)
def parser_video_info(root_url, depth, site_id, remark):
    program_id = remark.get('program_id')
    chan_name = remark.get('chan_name')
    program_name = remark.get('program_name')
    program_type = remark.get('program_type')
    is_need_update = not remark.get('image_url') or False

    html, r = tools.get_html_by_requests(root_url)
    if not html:
        base_parser.update_url('mms_urls', root_url, Constance.EXCEPTION)
        return

    regex = '(<li class="list_item".*?</li>)'
    video_blocks = tools.get_info(html, regex)
    for video_block in video_blocks:
        regex = '<a class="figure figure-180236.*?href="(.*?)"'
        url = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<img width="140" height="187" alt="(.*?)"'
        name = tools.get_info(video_block, regex, fetch_one=True)

        if (not url) or (program_name not in name):
            continue

        regex = '<em class="fs12 c999"> (.*?)</em>'
        release_year = tools.get_info(video_block, regex, fetch_one=True)

        regex = '<label class="result_info_lbl">.*?</label>[^<]*?<a data-searchpingback-elem="link.*?>(.*?)</a>'
        director = tools.get_info(video_block, regex, fetch_one=True)

        html = tools.get_html(url)

        # program category
        regex = '<a href=.*?class="channelTag".*?>(.*?)</a>'
        video_type = tools.get_info(html, regex, fetch_one=True)
        # if program_type != '其他' and video_type and program_type != video_type:
        #     # print(video_type, name)
        #     continue

        regex = [
            '<div class="info-img">.*?<img src="(.*?)"',
            '<div class="result_pic pr" >.*?<img.*?src="(.*?)"'
        ]
        image_url = tools.get_info(html, regex, fetch_one=True)

        regex = '<em>导演:.*?"director">(.*?)</a>'
        director = director or tools.get_info(html, regex, fetch_one=True)

        regex = [
            '<p class="episodeIntro-time" itemprop="datePublished">.*?<span>(.*?)</span>' #,
            '<em class="ml50">年份:</em><a>(.*?)</a>',
            '<em>更新至.*?>(.*?)</a>'
        ]
        release_year = release_year or tools.get_info(html, regex, fetch_one=True)

        regex = '<em>类型.*?<a href.*?>(.*?)</a>'
        classify = tools.get_info(html, regex, fetch_one=True)

        regex = '<em>电视台:</em><span>(.*?)</span>'
        institution = tools.get_info(html, regex, fetch_one=True)

        # synopsis
        regex = [
            'data-moreorless="moreinfo".*?<span class="briefIntroTxt">(.*?)</span>',
            '<span class="briefIntroTxt">(.*?)</span>',
            '<span class="showMoreText" data-moreorless="moreinfo".*?简介:</em>(.*?)</span>'
        ]
        description = tools.get_info(html, regex, fetch_one=True)

        # cast
        regex = [
            '<div class="headImg-top">.*?<img title="(.*?)"',
            '<div class="headImg-top">.*?<img.*?alt="(.*?)"'
        ]
        actor = tools.get_info(html, regex, split=',')

        # program id
        regex = 'data-score-tvid="(.*?)"'
        video_id = tools.get_info(html, regex, fetch_one=True)

        # rating
        score_url = 'http://score-video.iqiyi.com/beaver-api/get_sns_score?qipu_ids={video_id}&appid=21&tvid={video_id}&pageNo=1'.format(
            video_id=video_id)
        score_html, r = tools.get_html_by_requests(score_url)
        regex = '"sns_score":(.*?)}'
        score = tools.get_info(score_html, regex, fetch_one=True)

        log.debug('''
            url:      %s
            名称:     %s
            id:       %s
            贴图:     %s
            导演:     %s
            节目类别  %s
            类型:     %s
            电视台:   %s
            年份:     %s
            简介:     %s
            演员:     %s
            评分:     %s
            ''' % (url, name, video_id, image_url, director, video_type, classify,
                   institution, release_year, description, actor, score))

        if is_need_update:
            sql = '''
                update tab_mms_program t
                   set t.image_url = '%s', t.director = '%s', t.description = '%s',
                       t.score = %s, t.actor = '%s'
                 where t.program_id = %d
                ''' % (image_url, director, description, score, actor, program_id)
            print(sql)
            db.update(sql)

        # comment-section comments, e.g. http://www.iqiyi.com/a_19rrhcvhph.html
        parser_comment_article(html, video_id, program_id, url)

        # plot discussion, e.g. http://www.iqiyi.com/a_19rrhebm2l.html
        parser_first_page_article(html, program_id, url)
        # fetch wall_id, feed_id and sns_time, used for paging through comments
        regex = "\['wallId'\] = \"(.*?)\""
        wall_id = tools.get_info(html, regex, fetch_one=True)

        regex = "\['feedId'\] = (\d*?);"
        feed_id = tools.get_info(html, regex, fetch_one=True)

        regex = "\['snsTime'\] = (\d*?);"
        sns_time = tools.get_info(html, regex, fetch_one=True)

        if wall_id:
            parser_next_page_article(video_id, wall_id, feed_id, sns_time, url)

        break  # the target program has been found; do not crawl comments for the remaining results

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
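The parser above leans almost entirely on tools.get_info(html, regex, fetch_one=..., split=...), whose implementation is not included in these snippets. The sketch below is only an assumption about its contract, inferred from the call sites: regex may be a single pattern or a list of fallback patterns, fetch_one returns the first capture, and split joins all captures into one string.

# Assumed behaviour only, inferred from how it is called above; not the project's real tools.get_info.
import re

def get_info(html, regex, fetch_one=False, split=None):
    patterns = regex if isinstance(regex, list) else [regex]
    for pattern in patterns:
        matches = re.findall(pattern, html or '', re.S)
        if matches:
            if fetch_one:
                return matches[0]           # first captured group
            if split is not None:
                return split.join(matches)  # e.g. actor names joined by ','
            return matches                  # full list of captures
    return '' if (fetch_one or split is not None) else []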