def save_video_info(release_time='', content='', url='', author='', title='', image_url='',
                    site_name='', play_count=None, comment_count=None, praise_count=None,
                    summary='', time_length=None):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'uuid': tools.get_uuid(title, domain),
        'site_name': site_name,
        'image_url': image_url,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'play_count': play_count,
        'comment_count': comment_count,
        'praise_count': praise_count,
        'time_length': time_length,
        'record_time': tools.get_current_date(),
        'summary': summary
    }

    log.debug(tools.dumps_json(content_info))
    es.add('video_news', content_info, content_info['uuid'])
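
# A note on dedupe: tools.get_uuid(title, domain) doubles as the Elasticsearch
# document id above, so re-crawling the same article overwrites the previous
# document instead of duplicating it. The tools helper is not shown in this
# file; a minimal sketch of such a deterministic id, assuming uuid5 over
# domain + title (the real tools.get_uuid may differ):

import uuid

def _get_uuid_sketch(title, domain):
    # Same title + domain always yields the same id, run after run.
    return str(uuid.uuid5(uuid.NAMESPACE_URL, domain + title))

# _get_uuid_sketch('some title', 'example.com') is stable across runs.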
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    offset = remark.get('offset', 0)  # default to 0 so the paging arithmetic below never sees None

    html = tools.get_html_by_webdirver(root_url)
    headers = tools.get_tag(html, 'div', {'class': 'result'}, find_all=True)
    if not headers:
        base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
        return  # nothing on this page; do not queue a next page

    for header in headers:
        # "view more related news" link
        regex = ' <span class="c-info"><a.*?href="(.*?)".*?查看更多相关新闻'
        more_news_url = tools.get_info(str(header), regex, fetch_one=True)
        if more_news_url:
            more_news_url = tools.get_full_url('http://news.baidu.com', more_news_url)
            more_news_url = more_news_url.replace('amp;', '')
            base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, more_news_url,
                                depth=1, remark={'offset': 0})

        url = header.h3.a['href']
        article_extractor = ArticleExtractor(url)
        content = title = release_time = author = website_domain = ''
        content = article_extractor.get_content()
        if content:
            title = article_extractor.get_title()
            release_time = article_extractor.get_release_time()
            author = article_extractor.get_author()
            website_domain = tools.get_domain(url)
            uuid = tools.get_uuid(title, website_domain)
            website_name = ''
            website_position = None

            log.debug('''
                uuid         %s
                title        %s
                author       %s
                release_time %s
                domain       %s
                url          %s
                content      %s
                ''' % (uuid, title, author, release_time, website_domain, url, '...'))

            # save to database
            if tools.is_have_chinese(content):
                is_continue = self_base_parser.add_news_acticle(
                    uuid, title, author, release_time, website_name,
                    website_domain, website_position, url, content)
                if not is_continue:
                    break
    else:
        # the loop ended normally: every result on this page was saved, so queue the next page
        offset += 50
        url = tools.replace_str(root_url, r'pn=\d*', 'pn=%d' % offset)
        base_parser.add_url('BAIDU_NEWS_urls', SITE_ID, url, depth=0,
                            remark={'offset': offset})

    base_parser.update_url('BAIDU_NEWS_urls', root_url, Constance.DONE)
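
# The paging above relies on Python's for ... else: the else block runs only
# when the loop was not ended by break, i.e. only when every article on the
# current page was new and saved, in which case the next results page
# (pn += 50) is queued. A self-contained sketch of the idiom:

def _page_demo(items, is_old):
    for item in items:
        if is_old(item):
            break                  # an already-seen article: stop paging
    else:
        return 'queue next page'   # no break: the whole page was new
    return 'stop'

# _page_demo([1, 2, 3], lambda x: x == 2) -> 'stop'
# _page_demo([1, 2, 3], lambda x: False)  -> 'queue next page'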
def add_website_info(table, site_id, url, name, domain='', ip='', address='',
                     video_license='', public_safety='', icp=''):
    '''
    @summary: add website information
    ---------
    @param table: table name
    @param site_id: website id
    @param url: website url
    @param name: website name
    @param domain: domain name
    @param ip: server ip
    @param address: server address
    @param video_license: online audio-visual program license number
    @param public_safety: public security filing number
    @param icp: ICP filing number
    ---------
    @result:
    '''
    # TODO: obtain domain, ip, address, video_license, public_safety, icp etc. programmatically
    domain = tools.get_domain(url)
    site_info = {
        'site_id': site_id,
        'name': name,
        'domain': domain,
        'url': url,
        'ip': ip,
        'address': address,
        'video_license': video_license,
        'public_safety': public_safety,
        'icp': icp,
        'read_status': 0,
        'record_time': tools.get_current_date()
    }
    mongodb.add(table, site_info)
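
# A call might look like the following; the table name and filing numbers here
# are made-up placeholders, and the optional fields default to empty strings:
#
#     add_website_info('website_info', site_id=1,
#                      url='http://news.example.com', name='Example News',
#                      icp='ICP-00000000')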
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']

    # page through the urls matched by the search term
    start = 0
    while True:
        urls = mg.search_url(query=root_url, num=50, start=start,
                             pause=random.randint(MIN_SLEEP_TIME, MAX_SLEEP_TIME))
        if not urls:
            break

        for url in urls:
            url = url.replace('amp;', '')
            article_extractor = ArticleExtractor(url)
            content = title = release_time = author = website_domain = ''
            content = article_extractor.get_content()
            if content:
                title = article_extractor.get_title()
                release_time = article_extractor.get_release_time()
                author = article_extractor.get_author()
                website_domain = tools.get_domain(url)
                uuid = tools.get_uuid(title, website_domain)
                website_name = ''
                website_position = 35  # overseas

                log.debug('''
                    uuid         %s
                    title        %s
                    author       %s
                    release_time %s
                    domain       %s
                    url          %s
                    content      %s
                    ''' % (uuid, title, author, release_time, website_domain, url, '...'))

                # save to database
                if tools.is_have_chinese(content):
                    is_continue = self_base_parser.add_news_acticle(
                        uuid, title, author, release_time, website_name,
                        website_domain, website_position, url, content)
                    if not is_continue:
                        break
        else:
            # the loop ended normally: every result on this page was saved, so fetch the next page
            start += 50
            continue
        break  # an already-seen article ended the inner loop; stop paging too

    base_parser.update_url('google_news_urls', root_url, Constance.DONE)
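
# As in the Baidu parser, for ... else drives the paging, but here the inner
# break must also end the enclosing while True. The else/continue/break
# arrangement above propagates the inner break; in isolation:

def _nested_break_demo(pages):
    page_no = 0
    while page_no < len(pages):
        for item in pages[page_no]:
            if item < 0:
                break          # old item: stop everything
        else:
            page_no += 1       # whole page ok: move to the next one
            continue
        break                  # re-raise the inner break to the while loop
    return page_no

# _nested_break_demo([[1, 2], [3, -1], [5, 6]]) -> 1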
def add_html_url(html, depth, spider_depth, website_url, website_name, website_domain, remark):
    # collect the next level of urls to crawl
    if depth < spider_depth - 1:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url(SITE_ID, url, depth + 1, remark=remark)
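
# tools.get_full_url resolves relative hrefs against the page url before they
# are queued. Its implementation is not shown; the standard-library equivalent
# is urllib.parse.urljoin (urlparse.urljoin on Python 2):

from urllib.parse import urljoin

# urljoin('http://news.example.com/world/', 'story.html')
#     -> 'http://news.example.com/world/story.html'
# urljoin('http://news.example.com/world/', '/sports/a.html')
#     -> 'http://news.example.com/sports/a.html'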
def get_task_from_oracle(self):
    tasks = []
    offset = 0
    while True:
        # fetch a page of tasks
        task_sql = '''
            select * from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                           from TAB_IOPM_SITE t
                           where classify = 1 and t.mointor_status = 701
                             and (t.position != 35 or t.position is null)
                             and rownum < {page_size})
            where r >= {offset}
        '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

        results = self._oracledb.find(task_sql)
        offset += ONE_PAGE_SIZE
        if not results:
            break

        # assemble each row into a json-style url task
        for task in results:
            website_id = task[0]
            website_name = task[1]
            website_position = task[2]
            website_url = task[3]
            website_domain = tools.get_domain(website_url)
            spider_depth = task[4]
            remark = {
                'website_name': website_name,
                'website_position': website_position,
                'website_url': website_url,
                'website_domain': website_domain,
                'spider_depth': spider_depth
            }
            url_dict = {
                'site_id': 1,
                'url': website_url,
                'depth': 0,
                'remark': remark,
                'retry_times': 0
            }
            tasks.append(url_dict)

    return tasks
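
# The nested query above is the classic Oracle ROWNUM paging idiom: ROWNUM is
# assigned as rows are fetched, so "rownum >= offset" alone would match
# nothing; the inner query caps the upper bound and aliases rownum as r, and
# the outer query applies the lower bound to r. The windows it produces,
# assuming ONE_PAGE_SIZE is 100 (note the first window is one row short
# because rownum starts at 1; "rownum <= upper" would make the pages uniform):

def _rownum_windows(one_page_size, pages):
    offset, windows = 0, []
    for _ in range(pages):
        upper = offset + one_page_size
        # inner filter: rownum < upper; outer filter: r >= offset
        windows.append((max(offset, 1), upper - 1))
        offset += one_page_size
    return windows

# _rownum_windows(100, 3) -> [(1, 99), (100, 199), (200, 299)]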
def save_baidu_info(release_time='', content='', url='', author='', title='', is_debug=False):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': release_time,
    }

    log.debug(tools.dumps_json(content_info))
def is_have_video_by_site(url):
    '''
    @summary: judge by features specific to known sites
    ---------
    @param url:
    ---------
    @result:
    '''
    domain = tools.get_domain(url)
    feas = db.find('FeaVideo_site', {'domain': domain})
    for fea in feas:
        video_fea = fea['video_fea'].split(',')
        if tools.get_info(url, video_fea):
            return True
    return False
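
# tools.get_info(url, video_fea) presumably tries each comma-separated feature
# pattern against the url; a minimal stand-in using the re module (the real
# helper may return captured groups instead):

import re

def _match_any_sketch(text, patterns):
    for pattern in patterns:
        if re.search(pattern, text):
            return pattern
    return None

# feas = r'v\.example\.com/video/,/play\?vid='.split(',')
# _match_any_sketch('http://v.example.com/video/123.html', feas)
#     -> 'v\\.example\\.com/video/'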
def add_root_url(parser_params={}):
    log.debug('''
        add root url
        parser_params : %s
        ''' % str(parser_params))

    for task in parser_params:
        website_id = task[0]
        website_name = task[1]
        website_position = task[2]
        website_url = task[3]
        website_domain = tools.get_domain(website_url)

        base_parser.add_url('news_urls', SITE_ID, website_url,
                            remark={
                                'website_name': website_name,
                                'website_position': website_position,
                                'website_url': website_url,
                                'website_domain': website_domain
                            })
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']
    website_name = remark['website_name']
    website_position = remark['website_position']
    website_url = remark['website_url']
    website_domain = remark['website_domain']

    html = tools.get_html(root_url)
    if not html:
        base_parser.update_url('news_urls', root_url, Constance.EXCEPTION)
        return

    # collect the next level of urls to crawl
    if depth < DEPTH:
        urls = tools.get_urls(html)
        for url in urls:
            url = tools.get_full_url(website_url, url)
            if website_name == '百度新闻':
                remark['website_name'] = ''
                remark['website_domain'] = tools.get_domain(url)
                remark['website_position'] = None
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)
            elif website_domain in url:
                base_parser.add_url('news_urls', SITE_ID, url, depth + 1, remark=remark)

    # parse the page
    content = title = release_time = author = ''
    article_extractor = ArticleExtractor(root_url, html)
    content = article_extractor.get_content()
    if content:
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        # if the extracted title is just the site name, key the uuid on the url instead
        uuid = tools.get_uuid(title, website_domain) if title != website_name \
            else tools.get_uuid(root_url, ' ')

        log.debug('''
            uuid         %s
            title        %s
            author       %s
            release_time %s
            website_name %s
            domain       %s
            position     %s
            url          %s
            content      %s
            ''' % (uuid, title, author, release_time, website_name,
                   website_domain, website_position, root_url, content))

        if tools.is_have_chinese(content):
            # save to database
            self_base_parser.add_news_acticle(uuid, title, author, release_time,
                                              website_name, website_domain,
                                              website_position, root_url, content)

    log.debug('%s done' % root_url)
    base_parser.update_url('news_urls', root_url, Constance.DONE)
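
# tools.is_have_chinese gates the save on the extracted content actually
# containing Chinese text. The helper is not shown; a common implementation
# checks for characters in the CJK unified ideograph range (sketch only):

import re

def _is_have_chinese_sketch(content):
    return bool(re.search(u'[\u4e00-\u9fa5]', content or ''))

# _is_have_chinese_sketch(u'新闻正文')  -> True
# _is_have_chinese_sketch('english')   -> False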