def parser(url_info):
    """Parse one www.naxi.gov.cn page: enqueue discovered links and store the article.

    ``url_info`` is a crawl-queue record with at least ``_id``, ``url``,
    ``depth``, ``site_id`` and ``remark`` keys.  Title, release time, origin,
    watched count and body are extracted with regexes and persisted via
    ``base_parser.add_op_info``; the source URL's status is updated to DONE
    (or EXCEPTION when the download fails).
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url)
    if html is None:  # idiom fix: identity test instead of `== None`
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Enqueue every link on the page, absolutizing relative hrefs.
    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("/", url):
            new_url = 'http://www.naxi.gov.cn' + url
        else:
            new_url = 'http://www.naxi.gov.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = '<DIV class=news_conent_two_title>(.*?)</DIV>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<SPAN>日期:(.*?)</SPAN>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # origin (publishing source)
    regexs = '<SPAN>来源:(.*?)</SPAN>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count (raw string so \d is an explicit regex escape)
    regexs = r'<SPAN>点击数:(\d*?)</SPAN>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<DIV id=news_conent_two_text class=news_conent_two_text>(.*?)</DIV>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth + 1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.luzhou.gov.cn page: enqueue discovered links and store the article.

    Fixes vs. original: the ``/GovPublicInfo`` path extraction is guarded
    against a non-match (it previously indexed ``[0]`` unconditionally and
    could raise IndexError), and the ``url``/``title`` arguments of the debug
    log were swapped relative to their labels.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("\n", url):  # hrefs that begin with a newline in this site's markup
            regex = '.*?(/GovPublicInfo.+?000)'
            path = tools.get_info(url, regex)
            # FIX: guard against no match instead of an unguarded path[0]
            path = path and path[0] or ''
            new_url = 'http://www.luzhou.gov.cn' + path
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # origin
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count is served by a separate counter script; fetch it too
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    # FIX: source_url and title are now in the order the labels claim
    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth + 1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse an article page fetched via urllib: enqueue links, store the article.

    Fixes vs. original: the author pattern was assigned to ``author`` instead
    of ``regexs`` (so a stale pattern was used for the author lookup), and the
    author guard tested ``release_time`` instead of ``author``.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html = tools.get_html_by_urllib(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Enqueue every discovered link as-is.
    urls = tools.get_urls(html)
    for url in urls:
        base_parser.add_url('op_urls', website_id, url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '发表时间:(.*?) '
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)

    # author
    # FIX: assign the pattern to `regexs` (was mistakenly bound to `author`)
    # and guard on `author` itself (was guarded on `release_time`).
    regexs = '编辑:(.*?)</div>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # origin
    regexs = '来源:(.*?) '
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count (comment counter on this template)
    regexs = r'评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<td height="2" class="graphic10">(.*?)来源']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.scpolicec.edu.cn page: enqueue links and store the article.

    The page is fetched twice: once with a gb2312 fallback to read the real
    charset from the HTML, then again with the detected encoding.

    Fix vs. original: the content pattern list was missing a comma, so two
    adjacent string literals were silently concatenated into one broken regex.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    # First fetch only to sniff the declared charset, then refetch with it.
    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title (several templates coexist on this site)
    regexs = ['<div class="main_title">(.*?)<div class="top_about">',
              '<h1>(.*?)</h1>',
              '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time (with one extra fallback pattern)
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>',
              '<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # origin
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count (second pattern spans a line break in the page source)
    regexs = ['浏览:<font id="hits">(\d*?)</font>次',
              '点击数:(\d*?)\n发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    # FIX: restored the comma that was missing between the 2nd and 3rd
    # patterns (implicit string concatenation made both patterns dead).
    regexs = ['<p style="text-align: center;">(.*?)</table>',
              '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.sccc.edu.cn/new page: enqueue links and store the article.

    Fix vs. original: ``release_time`` deliberately takes the *second* regex
    match, but the index was unguarded (``release_time[1]``) and raised
    IndexError whenever fewer than two matches were found.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.sccc.edu.cn/new' + url
        else:
            new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Script-sourced links are queued as well.
    regexs = '<script type="text/javascript" language="JavaScript" src="(.*?)"'
    urls = tools.get_info(html, regexs)
    for url in urls:
        new_url = 'http://www.sccc.edu.cn/new/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = 'td height="60" align="center" valign="bottom" class="nrbt">(.*?)</td>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time: the date sits in the SECOND matching cell on this template
    regexs = '<td height="3" align="center" valign="top">(.*?)</td>'
    release_time = tools.get_info(html, regexs)
    # FIX: guard the [1] access; fall back to '' when <2 matches
    release_time = release_time[1] if len(release_time) > 1 else ''

    # author
    regexs = '<td width="250">(.*?)</td>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # origin
    regexs = '<td width="300">(.*?)</td>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count
    regexs = r' <td>阅读(\d*?)次</td>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<td class="nr">(.*?)</td>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        # consistency: pass url as a keyword like the sibling parsers do
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Crawl one www.xuyong.gov.cn page: queue its outgoing links and save the article.

    ``url_info`` is a crawl-queue record (``_id``, ``url``, ``depth``,
    ``site_id``, ``remark``).  Title, release time and body are extracted and
    written with ``base_parser.add_op_info``; the URL status becomes DONE, or
    EXCEPTION when the page cannot be fetched.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused, mirrors the sibling parsers

    html, _request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    def first_match(pattern):
        # Return the first regex capture from the page, or '' when absent.
        found = tools.get_info(html, pattern)
        return found[0] if found else ''

    # Queue every link, absolutizing anything that is not already http.
    for link in tools.get_urls(html):
        absolute = link if re.match("http", link) else 'http://www.xuyong.gov.cn' + link
        base_parser.add_url('op_urls', website_id, absolute, depth + 1)

    # Article fields of the current page.
    title = tools.del_html_tag(
        first_match('<td class="titlestyle1037" align="center">(.*?)</td></tr>'))

    release_time = tools.format_date(
        first_match('<span class="timestyle1037" >(.*?)</span>'))

    content = tools.del_html_tag(first_match(
        '<tr><td class="contentstyle1037" >(.*?) \n<tr><td class="pagestyle1037" align="left">'))

    log.debug('''
                depth        = %s
                url          = %s
                title        = %s
                release_time = %s
                content      = %s
             ''' % (depth + 1, source_url, title, release_time, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                content=content)
    # This page is fully processed.
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.luzhoutianli.com page: enqueue links and store the article.

    Uses a browser-like header set and gb2312 decoding.  Fix vs. original:
    the author guard tested ``release_time`` instead of ``author`` and could
    raise IndexError when the author pattern matched nothing.

    NOTE(review): ``origin`` and ``watched_count`` are extracted but not
    logged or persisted — kept as-is to preserve behavior; confirm intent.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Cookie':'__cfduid=d17ca6a5def98d8c14f73dcee28042c7f1492065760',
        'Host': 'www.luzhoutianli.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    html, request = tools.get_html_by_requests(source_url, headers=headers, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        # new_url = tools.get_full_url('http://www.luzhoutianli.com', url)
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.luzhoutianli.com' + url
        else:
            new_url = 'http://www.luzhoutianli.com/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title: prefer the second match when the pattern hits more than once
    regexs = '<strong class="NameTxt"><a >(.*?)</a></strong>.*?</td>'
    title = tools.get_info(html, regexs)
    if len(title) > 1:
        title = title[1]
    else:
        title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = ' <span class="FC_Time">时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # author
    regexs = '<span class="FC_Time">作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    # FIX: guard on `author` (was guarded on `release_time`, which could
    # index an empty author list)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # origin (pattern spans a line break in the page source)
    regexs = '来源:(.*?) \n'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count
    regexs = r'评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="articleDetails">.*?</script>(.*?)<td class="MoBodyR"']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth        = %s
                url          = %s
                title        = %s
                release_time = %s
                author       = %s
                content      = %s
             ''' % (depth, source_url, title, release_time, author, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.scmu.edu.cn page: enqueue links and store the article.

    Fix vs. original: the author guard tested ``release_time`` instead of
    ``author`` and could raise IndexError when the author pattern matched
    nothing.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scmu.edu.cn' + url
        else:
            new_url = 'http://www.scmu.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title (two templates on this site)
    regexs = ['<div class="main_list_right_2_7">(.*?)<div class="main_list_right_2_7_1">',
              '<div class="articlett">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '发布时间:(.*?) 点击数'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # author
    regexs = '作者:(.*?) 来源'
    author = tools.get_info(html, regexs)
    # FIX: guard on `author` (was guarded on `release_time`)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # origin (pattern spans a line break in the page source)
    regexs = '来源: (.*?) \n发布时间'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count
    regexs = r'评论:<span class="style1">(\d*?)</span>'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content (two templates on this site)
    regexs = ['<div class="main_list_right_2_5">(.*?)<div class="main_list_right_2_7_2">',
              '<div class="content">(.*?)<div id="pages" class="text-c"></div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Crawl one www.jiangyang.gov.cn page: queue its links and save the article.

    Several templates coexist on this site, so title, release time and
    content each try a primary pattern and fall back to a secondary one.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused, mirrors the sibling parsers

    html, _request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Queue outgoing links; relative paths live under /template/default/.
    for link in tools.get_urls(html):
        if re.match("http", link):
            absolute = link
        else:
            absolute = 'http://www.jiangyang.gov.cn/template/default/' + link
        base_parser.add_url('op_urls', website_id, absolute, depth + 1)

    # Title: primary template, then the <h1> fallback.
    matches = tools.get_info(html, '<div class="tit">(.*?)</div>')
    if not matches:
        matches = tools.get_info(html, '<h1>(.*?)</h1>')
    title = tools.del_html_tag(matches[0] if matches else '')

    # Release time: <label> template first, then the "发布时间" span.
    matches = tools.get_info(html, '<label>(.*?)</label>')
    release_time = matches[0] if matches else ''
    if release_time:
        release_time = tools.format_date(release_time)
    if not release_time:
        matches = tools.get_info(
            html, '<span class="time">发布时间:(.*?)</span><span class="source"></span></p>')
        release_time = matches[0] if matches else ''
        # release_time = tools.format_date(release_time)

    # Origin.
    matches = tools.get_info(html, '<label>来源:(.*?)</label>')
    origin = tools.del_html_tag(matches[0] if matches else '')

    # Content: primary div, then a centered-paragraph fallback.
    matches = tools.get_info(html, ['<div class="content" id="nr" style="">(.*?)</div>'])
    content = tools.del_html_tag(matches[0] if matches else '')
    if not content:
        matches = tools.get_info(
            html, '<p style="text-align: center;"(.*?)</div>.*?<div class="content">')
        content = tools.del_html_tag(matches[0] if matches else '')

    log.debug('''
                depth        = %s
                url          = %s
                title        = %s
                release_time = %s
                origin       = %s
                content      = %s
             ''' % (depth + 1, source_url, title, release_time, origin, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, content=content)
    # This page is fully processed.
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Crawl one www.lx2hs.com page (gb2312): queue its links and save the article."""
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused, mirrors the sibling parsers

    html, _request = tools.get_html_by_requests(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    def first_match(pattern):
        # First capture from the page, or '' when the pattern finds nothing.
        found = tools.get_info(html, pattern)
        return found[0] if found else ''

    # Queue outgoing links, absolutizing relative ones against the site root.
    for link in tools.get_urls(html):
        if re.match("http", link):
            absolute = link
        elif re.match('/', link):
            absolute = 'http://www.lx2hs.com' + link
        else:
            absolute = 'http://www.lx2hs.com/' + link
        base_parser.add_url('op_urls', website_id, absolute, depth + 1)

    # Article fields of the current page (origin is not extracted on this site).
    title = tools.del_html_tag(first_match('<title>(.*?)</title>'))
    release_time = first_match('发表时间:(.*?) ')
    author = tools.del_html_tag(first_match('<span>编辑:(.*?)</div>'))
    watched_count = tools.del_html_tag(
        first_match(r'点击/评论:<span class="style1">(\d*?)</span>'))
    content = tools.del_html_tag(first_match(['0</span>(.*?)来源']))

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                watched_count = %s
                content       = %s
             ''' % (depth, source_url, title, release_time, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                watched_count=watched_count, content=content)
    # This page is fully processed.
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.gulin.gov.cn page: enqueue links and store the article.

    Fix vs. original: the release-time pattern was left with a hard-coded
    date (``时间:2017-04-07``, presumably from a debugging session) and had
    no capture group, so it could never extract the actual date; it now
    captures the date like the sibling patterns do.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.gulin.gov.cn' + url
        else:
            new_url = 'http://www.gulin.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = '<div class="news_titile">(.*?)</div>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    # FIX: capture the date instead of matching a literal hard-coded one
    regexs = '<div class="news_info">时间:(.*?) 采编'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''

    # origin
    regexs = '采编: (.*?)阅读'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count
    regexs = r'阅读:(\d*?)次'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = '<div class="news_content" id="news_content">(.*?)</span></b></p>'
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth + 1, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Crawl one www.lzzjw.com page (gb2312): queue its links and save the article.

    The view counter lives in a separate count.asp script keyed by the
    article ID taken from the source URL, so a second fetch retrieves it.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused, mirrors the sibling parsers

    html = tools.get_html_by_urllib(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    def first_match(page, pattern):
        # First capture from `page`, or '' when the pattern finds nothing.
        found = tools.get_info(page, pattern)
        return found[0] if found else ''

    # Queue outgoing links, absolutizing relative ones against the site root.
    for link in tools.get_urls(html):
        if re.match("http", link):
            absolute = link
        elif re.match('/', link):
            absolute = 'http://www.lzzjw.com' + link
        else:
            absolute = 'http://www.lzzjw.com/' + link
        base_parser.add_url('op_urls', website_id, absolute, depth + 1)

    # Article fields of the current page.
    title = tools.del_html_tag(first_match(html, '<h1>(.*?)</h1>'))
    release_time = tools.del_html_tag(first_match(html, '<h3>时间:(.*?) 点击'))
    author = tools.del_html_tag(first_match(html, '<div id="copy">作者:(.*?)来源'))
    origin = tools.del_html_tag(
        first_match(html, ' <div id="copy">作者:.*? \n来源:(.*?)</div>'))

    # Watched count: article ID from the URL drives the counter script.
    article_id = ''.join(tools.get_info(source_url, 'ID=(.*)'))
    counter_url = 'http://www.lzzjw.com/js/count.asp?id=' + article_id
    counter_html, _request = tools.get_html_by_requests(counter_url)
    watched_count = tools.del_html_tag(first_match(counter_html, '\'(\d*?)\''))

    content = tools.del_html_tag(
        first_match(html, ['<div id="content">(.*?)<div id="copy">']))

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)
    # This page is fully processed.
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Save the article found on one page (gb2312, fetched via urllib).

    Unlike the sibling parsers, this one does not enqueue outgoing links:
    it only extracts title, release time, watched count and body, stores
    them, and marks the URL as DONE (EXCEPTION when the fetch fails).
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused, mirrors the sibling parsers

    html = tools.get_html_by_urllib(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    def first_match(pattern):
        # First capture from the page, or '' when the pattern finds nothing.
        found = tools.get_info(html, pattern)
        return found[0] if found else ''

    title = tools.del_html_tag(first_match('<h1><b>.*?</b>(.*?)</h1>'))
    release_time = first_match('<li>发表于:(.*?)</li>')
    watched_count = tools.del_html_tag(first_match(r'<li>阅读:(\d*?)</li>'))
    content = tools.del_html_tag(first_match(['新浪微博登录</a></p>(.*?)</td>']))

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                watched_count = %s
                content       = %s
             ''' % (depth, source_url, title, release_time, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                watched_count=watched_count, content=content)
    # This page is fully processed.
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.lzy.edu.cn page: enqueue links and store the article.

    Fixes vs. original: the link loop computed ``new_url`` but then enqueued
    the raw ``url`` (relative links were queued unresolved), and the origin
    cleanup used ``origin.find('|')`` as a boolean — which is truthy for -1
    (not found) and falsy for index 0 — instead of simply stripping.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        else:
            new_url = 'http://www.lzy.edu.cn/' + url
        # FIX: enqueue the absolutized new_url (the original enqueued `url`)
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = '<p class="atcTitle1a">(.*?)</p>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<div class="atcTitle">.*?</script>(.*?)</div>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.del_html_tag(release_time)
    release_time = release_time.strip('|')

    # origin
    regexs = '文章来源:(.*?)点击数'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)
    # FIX: strip unconditionally — str.strip is a no-op without '|' and the
    # old `if origin.find('|')` skipped stripping exactly when '|' led
    origin = origin.strip('|')

    # watched count is served by a separate counter script; fetch it too
    regexs = '点击数:<script type="text/javascript" src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.lzy.edu.cn/' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<span style="font-size: 18px">(.*?)<div style="text-align:right">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth, source_url, title, release_time, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    """Parse one www.longmatan.gov.cn page: enqueue links and store the article.

    Pages with no Chinese text are marked EXCEPTION and skipped.  Fix vs.
    original: in the ``re.match("\\n")`` branch, a failed ``/Survey`` path
    extraction left ``new_url`` as an empty list, which was then enqueued;
    it now falls back to simple prefixing like the other branches.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']  # unused here; kept for parity with sibling parsers

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese characters at all.
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match("\n", url):  # hrefs that begin with a newline in this site's markup
            regex = '.*?(/Survey.+?html)'
            path = tools.get_info(url, regex)
            if path:
                new_url = 'http://www.longmatan.gov.cn' + path[0]
            else:
                # FIX: fall back to plain prefixing instead of enqueuing []
                new_url = 'http://www.longmatan.gov.cn' + url
        else:
            new_url = 'http://www.longmatan.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # --- article fields of the current page ---
    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # author
    regexs = '<span>作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # origin
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # watched count
    regexs = r'<span>点击数:(\d*?)<span'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
                depth         = %s
                url           = %s
                title         = %s
                release_time  = %s
                author        = %s
                origin        = %s
                watched_count = %s
                content       = %s
             ''' % (depth + 1, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)
    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)