def inner_add_url(url):
    html = tools.get_html_by_urllib(url)

    # The total page count is embedded in the page's js: pg.pageCount = parseInt('N',10)
    regexs = 'pg.pageCount = parseInt\(\'(\d*?)\',10\)'
    pages = tools.get_info(html, regexs)
    pages = int(pages[0])

    # Enqueue one url per result page
    for i in range(1, pages + 1):
        new_url = url + '=%d' % i
        base_parser.add_url('WWA_search_app_urls', SITE_ID, new_url)
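# A minimal sketch of the page-count extraction above, using the standard-library
# re module directly. tools.get_info is project-internal; the assumption here is
# that it behaves like re.findall. The html snippet is made up for illustration.
import re

def _demo_page_count():
    html = "var pg = new Pager(); pg.pageCount = parseInt('12',10);"  # hypothetical page source
    pages = re.findall(r"pg.pageCount = parseInt\('(\d*?)',10\)", html)
    assert int(pages[0]) == 12  # inner_add_url would then enqueue pages 1..12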
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    depth = url_info['depth']
    site_id = url_info['site_id']
    remark = url_info['remark']

    html = tools.get_html_by_urllib(root_url)

    # Regexes for each field of a result row; the Chinese literals match the site's own markup
    title = '<tr height="25"><td><a href=".*?" title="(.*?)"'
    video_url = ['<tr height="25"><td><a href="(.*?)"']
    author = ['<a href="user-.*?.html" target="_blank">(.*?)</a>']
    watched_count = ['浏览次数: </span>(.*?) ']
    file_size = ['资料大小: </span>(.*?) ']
    download_count = ['下载次数: </span>(.*?) ']

    titles = tools.get_info(html, title, allow_repeat=True)
    video_urls = tools.get_info(html, video_url, allow_repeat=True)
    authors = tools.get_info(html, author, allow_repeat=True)
    watched_counts = tools.get_info(html, watched_count, allow_repeat=True)
    file_sizes = tools.get_info(html, file_size, allow_repeat=True)
    download_counts = tools.get_info(html, download_count, allow_repeat=True)

    for i in range(len(titles)):
        title = tools.del_html_tag(titles[i])
        video_url = tools.get_full_url('http://www.sobaidupan.com', video_urls[i])
        author = authors[i]
        watched_count = watched_counts[i]
        file_size = file_sizes[i]
        download_count = download_counts[i]

        log.debug('''
            title:          %s
            video url:      %s
            author:         %s
            watched count:  %s
            file size:      %s
            download count: %s
            ''' % (title, video_url, author, watched_count, file_size, download_count))

        # Keep only results whose title contains one of the search keywords
        contained_key, contained_key_count = base_parser.get_contained_key(
            title, '', remark['search_keyword1'],
            remark['search_keyword2'], remark['search_keyword3'])
        if not contained_key:
            continue

        base_parser.add_content_info('VA_content_info', SITE_ID, video_url, title,
                                     file_size=file_size, file_name=title, author=author,
                                     watched_count=watched_count, download_count=download_count,
                                     search_type=search_type, keyword=contained_key,
                                     keyword_count=contained_key_count, task_id=remark['task_id'])

    base_parser.update_url('VA_urls', root_url, Constance.DONE)
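# The index loop above walks six parallel result lists; zip expresses the same
# row pairing, and urllib.parse.urljoin approximates what tools.get_full_url
# presumably does with the relative links. The sample rows below are made up.
from urllib.parse import urljoin

def _demo_rows():
    titles = ['resource A', 'resource B']
    video_urls = ['/file-1.html', '/file-2.html']
    for row_title, rel_url in zip(titles, video_urls):
        print(row_title, urljoin('http://www.sobaidupan.com', rel_url))
        # -> resource A http://www.sobaidupan.com/file-1.html, etc.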
def add_root_url(parser_params={}):
    log.debug('''
        Adding root url
        parser_params : %s
        ''' % str(parser_params))

    url = "http://www.luzhou.ccoo.cn/tieba/"
    html = tools.get_html_by_urllib(url, 'gb2312')

    # Total page count is stored in a hidden input
    regexs = '<input name="maxpage" value="(\d*?)" type="hidden" />'
    pages = tools.get_info(html, regexs)
    pages = int(pages[0])

    for x in range(1, pages + 1):
        url = 'http://www.luzhou.ccoo.cn/tieba/index-0-%d-1.html' % x
        html2 = tools.get_html_by_urllib(url, code='gb2312')

        # Grab each topic block, then pull the thread link out of the block
        regexs = '<div class="topic_show_l">(.*?)</a></div>.*?</div>.*?</div>'
        htmls2 = tools.get_info(html2, regexs)
        for ever_html in htmls2:
            regex = '<a href="(.*?)".*?</a>'
            url = tools.get_info(ever_html, regex)
            url = 'http://www.luzhou.ccoo.cn' + url[0]
            base_parser.add_url('op_urls', SITE_ID, url, depth=1)
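# The two-stage extraction above can be reproduced with plain re calls. The
# html below is a made-up stand-in for a ccoo.cn topic row; re.S lets .*?
# cross newlines, which tools.get_info is assumed to allow as well.
import re

def _demo_nested_extract():
    html2 = ('<div class="topic_show_l"><a href="/tieba/thread-1.html">title</a></div>'
             '<div></div><div></div>')
    blocks = re.findall('<div class="topic_show_l">(.*?)</a></div>.*?</div>.*?</div>', html2, re.S)
    for block in blocks:
        links = re.findall('<a href="(.*?)"', block)
        print('http://www.luzhou.ccoo.cn' + links[0])
        # -> http://www.luzhou.ccoo.cn/tieba/thread-1.html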
def add_root_url(parser_params={}):
    log.debug('''
        Adding root url
        parser_params : %s
        ''' % str(parser_params))

    search_keyword1 = parser_params['search_keyword1']
    search_keyword2 = parser_params['search_keyword2']
    search_keyword3 = parser_params['search_keyword3']
    remark = parser_params

    # Combine every keyword in group 1 with every keyword in group 2;
    # if either group is empty, fall back to the other group alone
    search_keywords = []
    for str_key1 in search_keyword1:
        for str_key2 in search_keyword2:
            search_keywords.append((str_key1 + str_key2).strip())

    if not search_keyword1:
        search_keywords = search_keyword2
    if not search_keyword2:
        search_keywords = search_keyword1

    for search_keyword in search_keywords:
        if not search_keyword.strip():
            continue

        # Fetch the first result page to read the total page count
        url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=0' % search_keyword
        html = tools.get_html_by_urllib(url)
        regex = ['分页:1/(.*?)页']
        page_count = tools.get_info(html, regex)
        page_count = int(page_count[0]) if page_count else 0  # zero pages when nothing matches
        log.debug(page_count)

        for page in range(0, page_count):
            url = 'http://www.sobaidupan.com/search.asp?r=2&wd=%s&p=0&page=%d' % (search_keyword, page)
            if not base_parser.add_url('VA_urls', SITE_ID, url, remark=remark):
                base_parser.update_url('VA_urls', url, Constance.TODO)
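# The nested keyword loops above build the cross product of two keyword groups.
# itertools.product does the same pairing; a sketch with made-up sample groups:
from itertools import product

def _demo_keywords():
    search_keyword1 = ['成都', '重庆']  # hypothetical group 1
    search_keyword2 = ['美食', '旅游']  # hypothetical group 2
    keywords = [(k1 + k2).strip() for k1, k2 in product(search_keyword1, search_keyword2)]
    # keywords == ['成都美食', '成都旅游', '重庆美食', '重庆旅游']
    return keywords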
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html = tools.get_html_by_urllib(source_url, code='gb2312')
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Queue every link on the page, normalizing relative urls against the site root
    urls = tools.get_urls(html)
    for url in urls:
        if re.match("http", url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.lzzjw.com' + url
        else:
            new_url = 'http://www.lzzjw.com/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Extract the article info on the current page
    # Title
    regexs = '<h1>(.*?)</h1>'
    title = tools.get_info(html, regexs)
    title = title[0] if title else ''
    title = tools.del_html_tag(title)

    # Release time
    regexs = '<h3>时间:(.*?) 点击'
    release_time = tools.get_info(html, regexs)
    release_time = release_time[0] if release_time else ''
    release_time = tools.del_html_tag(release_time)

    # Author
    regexs = '<div id="copy">作者:(.*?)来源'
    author = tools.get_info(html, regexs)
    author = author[0] if author else ''
    author = tools.del_html_tag(author)

    # Origin
    regexs = ' <div id="copy">作者:.*? 来源:(.*?)</div>'
    origin = tools.get_info(html, regexs)
    origin = origin[0] if origin else ''
    origin = tools.del_html_tag(origin)

    # Watched count is served by a separate counter script, keyed by the article ID
    regexs = 'ID=(.*)'
    times_script_url = tools.get_info(source_url, regexs)
    times_script_url = 'http://www.lzzjw.com/js/count.asp?id=' + ''.join(times_script_url)
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = '\'(\d*?)\''
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count[0] if watched_count else ''
    watched_count = tools.del_html_tag(watched_count)

    # Content
    regexs = ['<div id="content">(.*?)<div id="copy">']
    content = tools.get_info(html, regexs)
    content = content[0] if content else ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth         = %s
        url           = %s
        title         = %s
        release_time  = %s
        author        = %s
        origin        = %s
        watched_count = %s
        content       = %s
        ''' % (depth + 1, source_url, title, release_time, author, origin, watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time, author=author,
                                origin=origin, watched_count=watched_count, content=content)

    # Mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
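# The three branches above (absolute url / leading slash / bare path) normalize
# links by hand. urllib.parse.urljoin covers the same cases in one call, since
# it leaves absolute urls untouched and resolves both '/path' and 'path' forms
# against the base. A sketch of that alternative, not the project's own helper:
from urllib.parse import urljoin

def _demo_normalize(urls):
    return [urljoin('http://www.lzzjw.com/', url) for url in urls]

# _demo_normalize(['http://other.com/x', '/news/1.html', 'news/2.html'])
# -> ['http://other.com/x', 'http://www.lzzjw.com/news/1.html',
#     'http://www.lzzjw.com/news/2.html']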