async def data_extraction_for_web_duck(client, html):
    """Extract one DuckDuckGo search-result node into a result dict.

    :param client: HTTP client session (unused here; kept so all engine
        extractors share the same signature).
    :param html: BeautifulSoup node for a single result entry.
    :return: dict with title/url/time/is_parse/is_recommend/timestamp/netloc,
        or None when the entry is filtered out or parsing fails.
    """
    with async_timeout.timeout(15):
        try:
            anchor = html.select('h2 a')[0]
            title = anchor.get_text()
            url = anchor.get('href', None)
            # DuckDuckGo hrefs are redirects carrying the real target in the
            # 'uddg' query parameter. Parse the query string, not the whole
            # URL: parse_qs on the full href keys the first pair on the path
            # prefix, so a single-parameter redirect would miss 'uddg'.
            url = parse_qs(urlparse(url).query).get('uddg', ['#'])[0]
            netloc = urlparse(url).netloc
            url = url.replace('index.html', '').replace('Index.html', '')
            # Drop blacklisted/uninteresting domains and plain .html pages.
            if not url or 'baidu' in url or 'baike.so.com' in url \
                    or netloc in BLACK_DOMAIN or '.html' in url:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
            timestamp = 0
            time = ''
            return {
                'title': title,
                'url': url,
                'time': time,
                'is_parse': is_parse,
                'is_recommend': is_recommend,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_web_so(client, html):
    """Pull title/url out of a single so.com search-result node.

    Returns a result dict on success, or None when the entry is
    filtered out or cannot be parsed.
    """
    with async_timeout.timeout(15):
        try:
            # Newer result markup uses h3.res-title; fall back to h3.title.
            try:
                anchor = html.select('h3.res-title a')[0]
            except IndexError:
                anchor = html.select('h3.title a')[0]
            except Exception as e:
                LOGGER.exception(e)
                return None
            url = anchor.get('href', None)
            title = anchor.get_text()
            # so.com wraps the target in a redirect; the real address lives
            # in the 'url' query parameter.
            candidates = parse_qs(urlparse(url).query).get('url', None)
            url = candidates[0] if candidates else None
            netloc = urlparse(url).netloc
            if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
                return None
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_web(html):
    """Extract one search result (font-tag layout) into a dict, or None."""
    with async_timeout.timeout(10):
        try:
            url = html.find('a').get('href', None)
            if not url or 'baidu' in url or urlparse(url).netloc in BLACK_DOMAIN:
                return None
            netloc = urlparse(url).netloc
            is_parse = 1 if netloc in RULES.keys() else 0
            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()
            # Publication date appears inside the source line as Y-M-D.
            matches = re.findall(r'\d+-\d+-\d+', source)
            time = matches[0] if matches else None
            timestamp = 0
            if time:
                try:
                    year, month, day = (int(part) for part in time.split('-'))
                    timestamp = arrow.get(year, month, day).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0
            return {'title': title,
                    'url': url.replace('index.html', '').replace('Index.html', ''),
                    'time': time,
                    'is_parse': is_parse,
                    'timestamp': timestamp,
                    'netloc': netloc}
        except Exception as e:
            LOGGER.exception(e)
            return None
async def chapter(request):
    """Render a novel's chapter-index page.

    Query args:
        url: source URL of the chapter index (drives page generation).
        novels_name: the novel's title.
    Redirects to the source site when no parse rule matches the domain.
    """
    url = request.args.get('url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    if netloc in REPLACE_RULES.keys():
        rule = REPLACE_RULES[netloc]
        url = url.replace(rule['old'], rule['new'])
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if not content:
        return text('解析失败,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))
    # Scrub scraped markup: stray brackets, inline js hooks and site ads.
    content = str(content).strip('[],, Jjs').replace(', ', '').replace(
        'onerror', '').replace('js', '').replace('加入书架', '')
    return template('chapter.html', novels_name=novels_name, url=url,
                    content_url=content_url, soup=content)
async def data_extraction_for_web_baidu(client, html):
    """Resolve and extract one Baidu web-search result, or None.

    Baidu result links are redirects, so the final address is resolved
    via get_real_url before filtering.
    """
    with async_timeout.timeout(20):
        try:
            href = html.select('h3.t a')[0].get('href', None)
            real_url = await get_real_url(client=client, url=href) if href else None
            if not real_url:
                return None
            netloc = urlparse(real_url).netloc
            if 'baidu' in real_url or netloc in BLACK_DOMAIN:
                return None
            title = html.select('h3.t a')[0].get_text()
            return {'title': title,
                    'url': real_url.replace('index.html', ''),
                    'time': "",
                    'is_parse': 1 if netloc in RULES.keys() else 0,
                    'timestamp': 0,
                    'netloc': netloc}
        except Exception as e:
            LOGGER.exception(e)
            return None
async def chapter(request):
    """Render a novel's chapter-index page, optionally queueing it for Kindle.

    Query args:
        url: source URL of the chapter index (drives page generation).
        novels_name: the novel's title.
        add_kindle: when present, parse the chapter list and hand it off to a
            local delivery daemon instead of rendering the page.
    """
    url = request.args.get('url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    if netloc in REPLACE_RULES.keys():
        url = url.replace(REPLACE_RULES[netloc]['old'], REPLACE_RULES[netloc]['new'])
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if not content:
        return text('解析失败,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))
    # Scrub scraped markup: stray brackets, inline js hooks and site ads.
    content = str(content).strip('[],, Jjs').replace(', ', '').replace(
        'onerror', '').replace('js', '').replace('加入书架', '')
    if request.args.get('add_kindle', None):
        h = areader()
        h.feed(content)
        # content_url encodes how per-chapter links are built for this site.
        if content_url == '1':
            content_url = ''
        elif content_url in ('0', '-1'):
            content_url = url
        # Fail fast (IndexError -> handled upstream) when no chapters parsed.
        (a, b) = h.chapters[0]
        link = "http://127.0.0.1:8001/owllook_content?url=" + content_url + "%s&name=%s&chapter_url=" + url + "&novels_name=%s"
        # Context managers guarantee the socket and the link file are closed
        # even if the dump/send fails (the originals leaked both handles).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.connect(('127.0.0.1', 31419))
            with open('/tmp/ow_links', 'w') as f:
                jjj.dump([{
                    'title': title,
                    'url': link % (curl, urllib.parse.quote(title), urllib.parse.quote(novels_name))
                } for (title, curl) in h.chapters], f)
            # NOTE(review): pickle over a raw localhost socket — only safe
            # because the receiving daemon is local and trusted; pickle must
            # never be exposed to untrusted peers.
            s.send(
                pickle.dumps(
                    (novels_name, len(h.chapters), "*****@*****.**")))
        return redirect("https://fss.cjwddtc.win")
    return template('chapter.html', novels_name=novels_name, url=url,
                    content_url=content_url, soup=content)
async def data_extraction_for_web_so(client, html):
    """Extract a single so.com result (2017-09 markup) into a dict, or None."""
    with async_timeout.timeout(15):
        try:
            # Grab title && url from the generic h3 anchor.
            try:
                anchor = html.select('h3 a')[0]
                title = anchor.get_text()
                url = anchor.get('href', None)
            except Exception as e:
                LOGGER.exception(e)
                return None
            # so.com serves two redirect styles; unwrap each accordingly.
            if "www.so.com/link?m=" in url:
                url = html.select('h3 a')[0].get('data-url', None)
            if "www.so.com/link?url=" in url:
                target = parse_qs(urlparse(url).query).get('url', None)
                url = target[0] if target else None
            netloc = urlparse(url).netloc
            if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
                return None
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'is_recommend': 1 if netloc in LATEST_RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_web_bing(client, html):
    """Extract one Bing search-result node into a result dict, or None."""
    with async_timeout.timeout(15):
        try:
            anchor = html.select('h2 a')[0]
            title = anchor.get_text()
            url = anchor.get('href', None)
            netloc = urlparse(url).netloc
            url = url.replace('index.html', '').replace('Index.html', '')
            # Skip blacklisted domains and bare .html article pages.
            if not url or 'baidu' in url or 'baike.so.com' in url \
                    or netloc in BLACK_DOMAIN or '.html' in url:
                return None
            return {
                'title': title,
                'url': url,
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'is_recommend': 1 if netloc in LATEST_RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def owllook_content(request):
    """Render a novel chapter's content page (ajax/cache-aware variant).

    Query args:
        url: source URL of the chapter content page.
        chapter_url: source URL of the novel's chapter index.
        novels_name: the novel's title.
        name: chapter title (may be overridden by the parsed title).
        is_ajax: "owl_cache" marks an ajax/cache request; such requests get a
            JSON payload and trigger last-read bookkeeping for logged-in users.
    :return: rendered content page, JSON payload, or a redirect.
    """
    url = request.args.get('url', None)
    chapter_url = request.args.get('chapter_url', None)
    novels_name = request.args.get('novels_name', None)
    name = request.args.get('name', '')
    is_ajax = request.args.get('is_ajax', '')
    # When the content URL's domain has no parse rule, redirect to the source.
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    user = request['session'].get('user', None)
    # Build the novel's chapter-index URL.
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url, novels_name=novels_name)
    motor_db = motor_base.get_db()
    if url == chapter_url:
        # Reached the last chapter: persist the latest-read chapter for the
        # user (ajax/cache requests only), then fall back to the index page.
        if user and is_ajax == "owl_cache":
            owl_referer = request.headers.get('Referer', '').split('owllook_content')[1]
            if owl_referer:
                latest_read = "/owllook_content" + owl_referer
                await motor_db.user_message.update_one(
                    {'user': user, 'books_url.book_url': book_url},
                    {'$set': {'books_url.$.last_read_url': latest_read}})
        return redirect(book_url)
    content_url = RULES[netloc].content_url
    content_data = await cache_owllook_novels_content(url=url, netloc=netloc)
    if content_data:
        try:
            content = content_data.get('content', '获取失败')
            next_chapter = content_data.get('next_chapter', [])
            title = content_data.get('title', '').replace(novels_name, '')
            name = title if title else name
            # Build the bookmark URL for this exact chapter view.
            bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
                path=request.path,
                url=url,
                name=name,
                chapter_url=chapter_url,
                novels_name=novels_name
            )
            # Break ad links embedded in the scraped content ('http' -> 'hs').
            content = str(content).strip('[]Jjs,').replace('http', 'hs')
            if user:
                bookmark = await motor_db.user_message.find_one({'user': user, 'bookmarks.bookmark': bookmark_url})
                book = await motor_db.user_message.find_one({'user': user, 'books_url.book_url': book_url})
                bookmark = 1 if bookmark else 0
                if book:
                    # The book is on the user's shelf.
                    book = 1
                    # Persist the most recent reading position.
                    if is_ajax == "owl_cache":
                        owl_referer = request.headers.get('Referer', bookmark_url).split('owllook_content')[1]
                        latest_read = "/owllook_content" + owl_referer
                        await motor_db.user_message.update_one(
                            {'user': user, 'books_url.book_url': book_url},
                            {'$set': {'books_url.$.last_read_url': latest_read}})
                else:
                    book = 0
                if is_ajax == "owl_cache":
                    owl_cache_dict = dict(
                        is_login=1,
                        user=user,
                        name=name,
                        url=url,
                        bookmark=bookmark,
                        book=book,
                        content_url=content_url,
                        chapter_url=chapter_url,
                        novels_name=novels_name,
                        next_chapter=next_chapter,
                        soup=content
                    )
                    return json(owl_cache_dict)
                return template(
                    'content.html',
                    is_login=1,
                    user=user,
                    name=name,
                    url=url,
                    bookmark=bookmark,
                    book=book,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
            else:
                # Anonymous visitor: no bookmark/shelf state.
                if is_ajax == "owl_cache":
                    owl_cache_dict = dict(
                        is_login=0,
                        name=name,
                        url=url,
                        bookmark=0,
                        book=0,
                        content_url=content_url,
                        chapter_url=chapter_url,
                        novels_name=novels_name,
                        next_chapter=next_chapter,
                        soup=content
                    )
                    return json(owl_cache_dict)
                return template(
                    'content.html',
                    is_login=0,
                    name=name,
                    url=url,
                    bookmark=0,
                    book=0,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
        except Exception as e:
            LOGGER.exception(e)
            return redirect(book_url)
    else:
        # Parsing failed: show the error page with login state preserved.
        if user:
            is_login = 1
            user = user
            return template('parse_error.html', url=url, is_login=is_login, user=user)
        else:
            is_login = 0
            return template('parse_error.html', url=url, is_login=is_login)
async def owllook_content(request):
    """Render a novel chapter's content page (simple variant).

    Query args:
        url: source URL of the chapter content page.
        chapter_url: source URL of the novel's chapter index.
        novels_name: the novel's title.
        name: chapter title (may be overridden by the parsed title).
    :return: rendered content page, a redirect, or a plain-text error.
    """
    url = request.args.get('url', None)
    chapter_url = request.args.get('chapter_url', None)
    novels_name = request.args.get('novels_name', None)
    name = request.args.get('name', '')
    # When the content URL's domain has no parse rule, redirect to the source.
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    # Build the novel's chapter-index URL.
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url, novels_name=novels_name)
    if url == chapter_url:
        return redirect(book_url)
    content_url = RULES[netloc].content_url
    content_data = await cache_owllook_novels_content(url=url, netloc=netloc)
    if content_data:
        user = request['session'].get('user', None)
        try:
            content = content_data.get('content', '获取失败')
            next_chapter = content_data.get('next_chapter', [])
            title = content_data.get('title', '').replace(novels_name, '')
            name = title if title else name
            # Build the bookmark URL for this exact chapter view.
            bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
                path=request.path,
                url=url,
                name=name,
                chapter_url=chapter_url,
                novels_name=novels_name
            )
            # Break ad links embedded in the scraped content ('http' -> 'hs').
            content = str(content).strip('[]Jjs,').replace('http', 'hs')
            if user:
                motor_db = motor_base.db
                bookmark = await motor_db.user_message.find_one({'user': user, 'bookmarks.bookmark': bookmark_url})
                book = await motor_db.user_message.find_one({'user': user, 'books_url.book_url': book_url})
                bookmark = 1 if bookmark else 0
                if book:
                    # The book is on the user's shelf.
                    book = 1
                    # Persist the most recent reading position.
                    await motor_db.user_message.update_one(
                        {'user': user, 'books_url.book_url': book_url},
                        {'$set': {'books_url.$.last_read_url': bookmark_url}})
                else:
                    book = 0
                return template(
                    'content.html',
                    is_login=1,
                    user=user,
                    name=name,
                    url=url,
                    bookmark=bookmark,
                    book=book,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
            else:
                # Anonymous visitor: no bookmark/shelf state.
                return template(
                    'content.html',
                    is_login=0,
                    name=name,
                    url=url,
                    bookmark=0,
                    book=0,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
        except Exception as e:
            LOGGER.exception(e)
            return redirect(book_url)
    else:
        return text('解析失败或者是没有下一页了,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))