# Article-content parser for zhuanlan.zhihu.com column posts.
def parser_content(task):
    title = task['title']
    new_tasks = []
    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')
    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps happening while Zhihu opens fine in a browser, Zhihu may have changed its markup; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string \
        if bs.select('.PostIndex-authorName') else ''
    voteup_count = re.search(r'likesCount":(\d+),', response.text).group(1) \
        if re.search(r'likesCount":(\d+),', response.text) else ''
    created_time = str(bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']) \
        if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    # [article_id, title, content, created_time, voteup_count, author_name, version]
    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {'headers': img_header, 'verify': False},
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
# Article-content parser for www.jianshu.com posts (paired with the Jianshu
# list parser further down).
def parser_content(task):
    title = task['title']
    new_tasks = []
    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')
    content_tab = bs.select('.show-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps happening while the site opens fine in a browser, its markup may have changed; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.post .author .name a')[0].string \
        if bs.select('.post .author .name a') else ''
    voteup_count = bs.select('.post .author .meta .likes-count')[0].string \
        if bs.select('.post .author .meta .likes-count') else ''
    created_time = bs.select('.post .author .meta .publish-time')[0].string \
        if bs.select('.post .author .meta .publish-time') else ''
    article_url = task['url']

    download_img_list, content = format_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {'headers': img_header, 'verify': False},
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
def resulter_downloader_img(task):
    # Zhihu serves formulas from www.zhihu.com/equation; those are saved as
    # <md5>.svg, everything else under its original URL path.
    if 'www.zhihu.com/equation' not in task['url']:
        write(os.path.join(task['save']['save_path'], 'static'),
              urlparse(task['response'].url).path[1:],
              task['response'].content,
              mode='wb')
    else:
        write(os.path.join(task['save']['save_path'], 'static'),
              md5string(task['url']) + '.svg',
              task['response'].content,
              mode='wb')
def convert_link(x):
    if 'www.zhihu.com/equation' not in x.group(1):
        return 'src="./static/{}"'.format(urlparse(x.group(1)).path[1:])
    else:
        # Saving SVG equations: normalise the scheme, then name the local
        # file after the MD5 of the absolute URL (matching
        # resulter_downloader_img above).
        url = x.group(1)
        if url.startswith('//'):
            url = 'http:' + url
        else:
            url = 'http://' + url
        return 'src="./static/{}.svg"'.format(md5string(url))
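# A minimal usage sketch (hypothetical URLs) showing how convert_link and
# resulter_downloader_img agree on file names: the article HTML is rewritten
# with the same regex the content parsers use, so an equation <img> points at
# the .svg saved under the MD5 of its absolute URL, while a normal image keeps
# its URL path under ./static/.
def _demo_convert_link():
    demo = ('<img src="https://pic1.zhimg.com/v2-abc123.jpg">'
            '<img src="//www.zhihu.com/equation?tex=a%2Bb">')
    # -> <img src="./static/v2-abc123.jpg">
    #    <img src="./static/<md5 of http://www.zhihu.com/equation?tex=a%2Bb>.svg">
    return re.sub(r'src="(.*?)"', convert_link, demo)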
# Article-content parser for Zhihu Daily (news-at.zhihu.com story API).
def parser_content(task):
    title = task['title']
    new_tasks = []
    response = task['response']
    if not response:
        raise RetryDownload

    try:
        content = response.json()['body']
    except Exception as e:
        LOG.log_it('Failed to parse JSON. (If this keeps happening while the site opens fine in a browser, its API may have changed; please notify the developer.) ERRINFO: {}'.format(str(e)), 'WARN')
        raise RetryDownload

    bs = BeautifulSoup(content, 'lxml')
    content = str(bs.select('div.content')[0])
    author_name = bs.select('.author')[0].string if bs.select('.author') else ''
    voteup_count = ''
    created_time = ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {'headers': img_header, 'verify': False},
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
# List parser for Zhihu Daily: one task per story, plus a task for the next
# page while the date cursor has not reached 'end'.
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True
    if not response:
        raise RetryDownload

    try:
        data = response.json()['stories']
    except Exception as e:
        LOG.log_it('Failed to parse JSON. (If this keeps happening while the site opens fine in a browser, its API may have changed; please notify the developer.) ERRINFO: {}'.format(str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Stop paging once we hit an article that is already in the database.
        url = 'http://news-at.zhihu.com/api/4/story/' + str(item['id'])
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue
        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if not IS_TODAY_URL and to_next:
        next_datetime = get_next_datetime_string(task['save']['cursor'], '%Y%m%d', 1)
        # The cursor will eventually equal 'end', which terminates the paging.
        if compare_datetime_string(task['save']['end'], next_datetime, '%Y%m%d') and len(data) != 0:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url': re.sub(r'before/\d+',
                              'before/{}'.format(next_datetime),
                              next_page_task['url'])
            })
            next_page_task['save'].update({'cursor': next_datetime})
            new_tasks.append(next_page_task)

    return None, new_tasks
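# The paging above leans on two project helpers whose implementations are not
# shown here. A minimal sketch of the behaviour this code appears to assume;
# the names, the direction of the day step, and the comparison ordering are
# assumptions, not the project's actual code:
from datetime import datetime, timedelta

def _assumed_get_next_datetime_string(cursor, fmt, days):
    # Shift a '%Y%m%d' cursor string by `days` (sign per the real helper).
    return (datetime.strptime(cursor, fmt) + timedelta(days=days)).strftime(fmt)

def _assumed_compare_datetime_string(end, candidate, fmt):
    # True while the crawl should continue, i.e. `candidate` is still inside
    # the window bounded by `end` (exact ordering per the real helper).
    return datetime.strptime(end, fmt) >= datetime.strptime(candidate, fmt)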
# List parser for a zhuanlan.zhihu.com column: the posts API is paged with an
# 'offset' query parameter in steps of 20.
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True
    if not response:
        raise RetryDownload

    try:
        data = response.json()
        data.reverse()
    except Exception as e:
        LOG.log_it('Failed to parse JSON. (If this keeps happening while Zhihu opens fine in a browser, Zhihu may have changed its API; please notify the developer.) ERRINFO: {}'.format(str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Stop paging once we hit an article that is already in the database.
        url = 'https://zhuanlan.zhihu.com' + item['url']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue
        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(data) != 0:
        if task['save']['cursor'] < task['save']['end'] - 20:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url': re.sub(r'offset=\d+',
                              'offset={}'.format(task['save']['cursor'] + 20),
                              next_page_task['url'])
            })
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 20})
            new_tasks.append(next_page_task)

    return None, new_tasks
# Article-content parser for www.qdaily.com (vote count and creation time are
# passed along from the list task).
def parser_content(task):
    title = task['title']
    items = []
    new_tasks = []
    response = task['response']
    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')
    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps happening while the site opens fine in a browser, its markup may have changed; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = '未知'  # "unknown"
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']
    article_id = md5string(article_url)

    download_img_list, content = format_content(content, task)

    items.append([article_id, title, content, created_time, voteup_count,
                  author_name, int(time.time() * 100000)])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(Task.make_task({
                'url': img_url,
                'method': 'GET',
                'meta': {'headers': img_header, 'verify': False},
                'parser': parser_downloader_img,
                'resulter': resulter_downloader_img,
                'save': task['save'],
                'priority': 10,
            }))

    task.update({'parsed_data': items})
    return task, new_tasks
def parser_content(task): response = task['response'] if not response: LOG.log_it("Not Response", 'WARN') raise RetryDownload new_tasks = [] items = [] content = response.text # 去除空格 content = content.replace('</p><p>', '').replace('<br/>', '') soup = BeautifulSoup(content, 'lxml') title = task['save']['title'] article_url = task['url'] created_time = soup.select('.content-th-info span')[0].string[3:] author = soup.select('.content-th-info a')[0].string download_img_list, content = format_content(soup, task) items.append([ md5string(article_url), title, content, created_time, '', author, int(time.time() * 100000) ]) if task['save']['kw'].get('img', True): img_header = deepcopy(DEFAULT_HEADERS) img_header.update({'Referer': response.url}) for img_url in download_img_list: new_tasks.append( Task.make_task({ 'url': img_url, 'method': 'GET', 'meta': { 'headers': img_header, 'verify': False }, 'parser': parser_downloader_img, 'resulter': resulter_downloader_img, 'priority': 2, 'save': task['save'] })) task.update({'parsed_data': items}) return task, new_tasks
def make_task(params):
    if 'parser' not in params:
        # FIXME Can't raise Exception in there
        raise Exception("Need a parser")
    if 'method' not in params:
        raise Exception("Need a method")
    if 'url' not in params:
        raise Exception("Need a url")

    tid = md5string(params['url'] + str(params.get('data')) + str(params.get('params')))
    params.setdefault('meta', {})
    params.setdefault('priority', 0)
    params.setdefault('retry', 3)
    params.setdefault('tid', tid)

    if not params['url'].startswith('http'):
        if params['url'].startswith('//'):
            params['url'] = 'http:' + params['url']
        else:
            params['url'] = 'http://' + params['url']

    return Task(**params)
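# A minimal usage sketch (hypothetical URL; the callbacks are parsers defined
# in this file): only 'url', 'method' and 'parser' are required. 'tid'
# defaults to the MD5 of url + data + params, so identical requests share an
# id, and scheme-less URLs are normalised to http before the Task is built.
def _demo_make_task():
    return Task.make_task({
        'url': '//example.com/articles/1.html',  # becomes http://example.com/...
        'method': 'GET',
        'parser': parser_content,
        'resulter': resulter_content,
        'priority': 5,
        'save': {'kw': {}},
    })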
def parser_list(task): response = task['response'] new_tasks = [] to_next = True if not response: LOG.log_it("Not Response", 'WARN') raise RetryDownload try: data = response.json() except Exception as e: LOG.log_it( '解析JSON出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)\nERRINFO:{}'.format( str(e)), 'WARN') raise RetryDownload try: for each_result in data['result']: url = each_result['url'] article_id = md5string(url) if article_id not in ARTICLE_ID_SET: title = each_result['title'] date_group = re.search('(.*?)T(.*?)\+', each_result['date_created']) date = date_group.group(1) + ' ' + date_group.group(2) meta = deepcopy(task['meta']) save = deepcopy(task['save']) save.update({'title': title, 'date': date}) new_task = Task.make_task({ 'url': url, 'method': 'GET', 'parser': parser_content, 'resulter': resulter_content, 'priority': 1, 'meta': meta, 'save': save }) new_tasks.append(new_task) else: to_next = False except KeyError: LOG.log_it('JSON KEY出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN') raise RetryDownload # 获取下一页 if to_next and task['save']['cursor'] < task['save']['end'] and not len( data['result']) < 20: meta = deepcopy(task['meta']) save = deepcopy(task['save']) save['cursor'] += 20 new_task = Task.make_task({ 'url': API_URL.format(save['cursor']), 'method': 'GET', 'meta': meta, 'parser': parser_list, 'priority': 0, 'save': save, 'retry': 10, 'retry_delay': 10 }) new_tasks.append(new_task) return None, new_tasks
# List parser for a Jianshu collection: collects article links, pages by
# incrementing the cursor, and records the collection name as book metadata.
def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True
    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('Failed to parse the page. (If this keeps happening while the site opens fine in a browser, its markup may have changed; please notify the developer.) ERRINFO: {}'.format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Store the collection name ('简书专题_' is a "Jianshu collection" prefix).
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(['BOOK_NAME', format_file_name('简书专题_' + book_name)],
                                    update=False)

    # Reverse the order.
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip articles that are already in the database.
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue
        try:
            title = item.string
        except Exception:
            LOG.log_it('Failed to parse the title. (If this keeps happening while the site opens fine in a browser, its markup may have changed; please notify the developer.)', 'WARN')
            raise RetryDownload
        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url': API_URL.format(task['save']['name'],
                                      task['save']['order_by'],
                                      task['save']['cursor'] + 1)
            })
            next_page_task['save'].update({'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks
# Article-content parser for zhuanlan.zhihu.com posts, with the HTML clean-up
# done inline rather than via format_zhihu_content.
def parser_content(task):
    title = task['title']
    download_img_list = []
    new_tasks = []
    response = task['response']
    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')
    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps happening while Zhihu opens fine in a browser, Zhihu may have changed its markup; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string \
        if bs.select('.PostIndex-authorName') else ''
    voteup_count = re.search(r'likesCount":(\d+),', response.text).group(1) \
        if re.search(r'likesCount":(\d+),', response.text) else ''
    created_time = str(bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']) \
        if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')

    # Remove useless data-URI <img> tags.
    for tab in bs.select('img[src^="data"]'):
        tab.decompose()

    for tab in bs.select('img'):
        # Drop GIFs first (when disabled), so decompose() does not leave an
        # empty centering <div> behind.
        if task['save']['kw']['gif'] is False and 'gif' in tab['src']:
            tab.decompose()
            continue
        # Centre images; equation images stay inline.
        if 'equation' not in tab['src']:
            tab.wrap(bs.new_tag('div', style='text-align:center;'))
            tab['style'] = "display: inline-block;"

    content = str(bs)
    # bs4 wraps the fragment in <html><body>; strip them.
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1), content, flags=re.S)

    # Equation URL conversion (Zhihu changed the URL scheme again).
    # content = content.replace('//www.zhihu.com', 'http://www.zhihu.com')

    download_img_list.extend(re.findall('src="(http.*?)"', content))
    # Rewrite image links to local relative paths.
    content = re.sub('src="(.*?)"', convert_link, content)
    # Unwrap link.zhihu.com redirect URLs, restoring the closing quote that
    # the pattern consumes.
    content = re.sub(r'//link.zhihu.com/\?target=(.*?)"',
                     lambda x: unquote(x.group(1)) + '"', content)
    content = re.sub('<noscript>(.*?)</noscript>',
                     lambda x: x.group(1), content, flags=re.S)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {'headers': img_header, 'verify': False},
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks
# Article-content parser for www.qdaily.com, with the HTML clean-up inline.
def parser_content(task):
    title = task['title']
    items = []
    download_img_list = []
    new_tasks = []
    response = task['response']
    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')
    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("Cannot find the article content. (If this keeps happening while the site opens fine in a browser, its markup may have changed; please notify the developer.)", 'WARN')
        raise RetryDownload

    author_name = '未知'  # "unknown"
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')
    for tab in bs.select('img'):
        # Skip <img> tags that carry extra classes.
        if len(tab.attrs['class']) != 1:
            tab.decompose()
            continue
        # Drop GIFs when the user disabled them.
        if task['save']['kw']['gif'] is False and 'gif' in tab['data-src']:
            tab.decompose()
            continue
        # Centre the image.
        tab.wrap(bs.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

    content = str(bs)
    # bs4 wraps the fragment in <html><body>; strip them.
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1), content, flags=re.S)

    download_img_list.extend(re.findall('src="(http.*?)"', content))
    # Rewrite image links to local relative paths.
    content = re.sub('src="(.*?)"', convert_link, content)
    content = content.replace('data-src', 'src')

    items.append([
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {'headers': img_header, 'verify': False},
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({'parsed_data': items})
    return task, new_tasks
# List parser for the qdaily.com feed API: one task per article, plus a task
# for the next page while 'last_key' is still newer than 'end'.
def parser_list(task):
    response = task['response']
    new_tasks = []
    opf = []
    to_next = True
    if not response:
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it('Failed to parse JSON. (If this keeps happening while the site opens fine in a browser, its API may have changed; please notify the developer.) ERRINFO: {}'.format(str(e)), 'WARN')
        raise RetryDownload

    try:
        for item in data['data']['feeds']:
            if item['datatype'] == 'article':
                article_url = 'https://www.qdaily.com/articles/{}.html'.format(str(item['post']['id']))
                article_id = md5string(article_url)
                # Skip articles that are already in the database.
                if article_id not in ARTICLE_ID_SET:
                    item = item['post']
                    # Overlong file names break mobi generation: cap the
                    # title at 55 characters, ellipsis included.
                    title = item['title']
                    if len(title) > 55:
                        title = title[:52] + '...'
                    opf.append({'href': format_file_name(title, '.html')})
                    new_task = Task.make_task({
                        'url': article_url,
                        'method': 'GET',
                        'meta': task['meta'],
                        'parser': parser_content,
                        'resulter': resulter_content,
                        'priority': 5,
                        'save': task['save'],
                        'title': item['title'],
                        'created_time': item['publish_time'],
                        'voteup_count': item['praise_count']
                    })
                    new_tasks.append(new_task)
                else:
                    to_next = False

        # Next page
        if to_next:
            if len(data['data']) != 0:
                if data['data']['last_key'] > task['save']['end']:
                    next_page_task = deepcopy(task)
                    next_page_task.update({'url': API_URL.format(data['data']['last_key'])})
                    next_page_task['save'].update({'cursor': data['data']['last_key'],
                                                   'page': task['save']['page'] + 1})
                    new_tasks.append(next_page_task)
            else:
                LOG.log_it('Cannot read the list. (If this keeps happening while the site opens fine in a browser, its API may have changed; please notify the developer.)', 'WARN')
                raise RetryDownload
    except KeyError:
        LOG.log_it('Unexpected JSON key. (If this keeps happening while the site opens fine in a browser, its API may have changed; please notify the developer.)', 'WARN')
        raise RetryDownload

    return None, new_tasks
def parser_content(task): response = task['response'] if not response: LOG.log_it("Not Response", 'WARN') raise RetryDownload new_tasks = [] download_img_list = [] items = [] soup = BeautifulSoup(response.text, 'lxml') content_select = soup.select('.document') # 移除每页后面无用的信息 if content_select: for to_del in soup.select('.copyright'): to_del.decompose() content = str(content_select) # bs4会自动加html和body 标签 content = re.sub('<html><body>(.*?)</body></html>', lambda x: x.group(1), content, flags=re.S) download_img_list.extend(re.findall('src="(http.*?)"', content)) # 更换为本地相对路径 content = re.sub('src="(.*?)"', convert_link, content) # 去掉"[]" content = content[1:-1] title = task['save']['title'] article_url = task['url'] created_time = soup.select('.content-th-info span')[0].string[3:] author = soup.select('.content-th-info a')[0].string bs2 = BeautifulSoup(content, 'lxml') # 居中图片 for tab in bs2.select('img'): tab.wrap(bs2.new_tag('div', style='text-align:center;')) tab['style'] = "display: inline-block;" # 删除gif if task['save']['kw']['gif'] is False: if 'gif' in tab['src']: tab.decompose() continue content = str(bs2) items.append([ md5string(article_url), title, content, created_time, '', author, int(time.time() * 100000) ]) if task['save']['kw'].get('img', True): img_header = deepcopy(DEFAULT_HEADERS) img_header.update({'Referer': response.url}) for img_url in download_img_list: new_tasks.append( Task.make_task({ 'url': img_url, 'method': 'GET', 'meta': { 'headers': img_header, 'verify': False }, 'parser': parser_downloader_img, 'resulter': resulter_downloader_img, 'priority': 2, 'save': task['save'] })) task.update({'parsed_data': items}) return task, new_tasks