import re
from urllib.parse import parse_qs, urlparse

import arrow
import async_timeout

# RULES, REPLACE_RULES, BLACK_DOMAIN, LOGGER, get_real_url, MotorBase,
# cache_owllook_novels_chapter, cache_owllook_novels_content, and the Sanic
# helpers (template, text, redirect) are project-level names assumed to be
# imported from the surrounding package.


async def data_extraction_for_web_so(client, html):
    with async_timeout.timeout(15):
        try:
            try:
                url = html.select('h3.res-title a')[0].get('href', None)
                title = html.select('h3.res-title a')[0].get_text()
            except IndexError:
                url = html.select('h3.title a')[0].get('href', None)
                title = html.select('h3.title a')[0].get_text()
            except Exception as e:
                LOGGER.exception(e)
                url, title = None, None
            # so.com result links wrap the real target in a "url" query parameter
            url = parse_qs(urlparse(url).query).get('url', None)
            url = url[0] if url else None
            netloc = urlparse(url).netloc
            if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            time = ''
            timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_web(html):
    with async_timeout.timeout(10):
        try:
            url = html.find('a').get('href', None)
            if not url or 'baidu' in url or urlparse(url).netloc in BLACK_DOMAIN:
                return None
            netloc = urlparse(url).netloc
            is_parse = 1 if netloc in RULES.keys() else 0
            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()
            time = re.findall(r'\d+-\d+-\d+', source)
            time = time[0] if time else None
            timestamp = 0
            if time:
                try:
                    time_list = [int(i) for i in time.split('-')]
                    # arrow < 1.0 exposes timestamp as a property (a method in 1.x)
                    timestamp = arrow.get(time_list[0], time_list[1], time_list[2]).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_web_baidu(client, html):
    with async_timeout.timeout(20):
        try:
            url = html.select('h3.t a')[0].get('href', None)
            # Baidu wraps results in a redirect; resolve it to the real target
            real_url = await get_real_url(client=client, url=url) if url else None
            if real_url:
                netloc = urlparse(real_url).netloc
                if 'baidu' in real_url or netloc in BLACK_DOMAIN:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                title = html.select('h3.t a')[0].get_text()
                source = real_url
                # time = re.findall(r'\d+-\d+-\d+', source)
                # time = time[0] if time else None
                timestamp = 0
                time = ""
                # if time:
                #     try:
                #         time_list = [int(i) for i in time.split('-')]
                #         timestamp = arrow.get(time_list[0], time_list[1], time_list[2]).timestamp
                #     except Exception as e:
                #         LOGGER.exception(e)
                #         timestamp = 0
                return {
                    'title': title,
                    'url': real_url.replace('index.html', ''),
                    'time': time,
                    'is_parse': is_parse,
                    'timestamp': timestamp,
                    'netloc': netloc
                }
            else:
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
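# A minimal usage sketch for the extractors above, assuming an aiohttp client
# session and BeautifulSoup result nodes. demo_baidu_extract and the choice of
# result container (node.parent) are hypothetical; the 'h3.t' selector comes
# from data_extraction_for_web_baidu.
import asyncio

import aiohttp
from bs4 import BeautifulSoup


async def demo_baidu_extract(search_url):
    async with aiohttp.ClientSession() as client:
        async with client.get(search_url) as resp:
            page = await resp.text()
        soup = BeautifulSoup(page, 'html5lib')
        tasks = [data_extraction_for_web_baidu(client=client, html=node.parent)
                 for node in soup.select('h3.t')]
        results = await asyncio.gather(*tasks)
        # The extractors return None for blacklisted or unparsable hits
        return [each for each in results if each]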
async def chapter(request):
    """
    Return the novel's chapter list page.
    :content_url: determines how URLs on the current page are generated
    :url: source URL of the chapter list page
    :novels_name: novel title
    :return: the novel chapter list page
    """
    url = request.args.get('url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = urlparse(url).netloc
    if netloc not in RULES.keys():
        return redirect(url)
    if netloc in REPLACE_RULES.keys():
        url = url.replace(REPLACE_RULES[netloc]['old'], REPLACE_RULES[netloc]['new'])
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if content:
        content = str(content).strip('[],, Jjs')
        return template('chapter.html', novels_name=novels_name, url=url,
                        content_url=content_url, soup=content)
    else:
        return text(
            'Parsing failed. Please report this page to the site, refresh and try again, '
            'or visit the source page: {url}'.format(url=url))
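# chapter() looks up REPLACE_RULES[netloc]['old'] and ['new'], which implies a
# per-domain mapping of URL fragments to rewrite before fetching. A sketch of
# that shape; the domain and fragments below are invented placeholders.
REPLACE_RULES_EXAMPLE = {
    'www.example.com': {
        'old': 'www.example.com/wap',
        'new': 'www.example.com/book',
    },
}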
async def owllook_content(request):
    url = request.args.get('url', None)
    chapter_url = request.args.get('chapter_url', None)
    novels_name = request.args.get('novels_name', None)
    name = request.args.get('name', None)
    bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
        path=request.url, url=url, name=name, chapter_url=chapter_url, novels_name=novels_name)
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url, novels_name=novels_name)
    netloc = urlparse(url).netloc
    if netloc not in RULES.keys():
        return redirect(url)
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_content(url=url, netloc=netloc)
    if content:
        user = request['session'].get('user', None)
        # Break ad links
        content = str(content).strip('[]Jjs,').replace('http', 'hs')
        if user:
            motor_db = MotorBase().db
            bookmark = await motor_db.user_message.find_one(
                {'bookmarks.bookmark': bookmark_url})
            book = await motor_db.user_message.find_one(
                {'books_url.book_url': book_url})
            bookmark = 1 if bookmark else 0
            book = 1 if book else 0
            return template('content.html', is_login=1, user=user, name=name, url=url,
                            bookmark=bookmark, book=book, content_url=content_url,
                            chapter_url=chapter_url, novels_name=novels_name, soup=content)
        else:
            return template('content.html', is_login=0, name=name, url=url,
                            bookmark=0, book=0, content_url=content_url,
                            chapter_url=chapter_url, novels_name=novels_name, soup=content)
    else:
        return text(
            'Parsing failed. Please report this page to the site, refresh and try again, '
            'or visit the source page: {url}'.format(url=url))
async def chapter(request):
    url = request.args.get('url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = urlparse(url).netloc
    if netloc not in RULES.keys():
        return redirect(url)
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if content:
        content = str(content).strip('[],')
        return template('chapter.html', novels_name=novels_name, url=url,
                        content_url=content_url, soup=content)
    else:
        return text('Parsing failed. Please report this page to the site.')
async def owllook_content(request): """ 返回小说章节内容页 : content_url 这决定当前U页面url的生成方式 : url 章节内容页源url : chapter_url 小说目录源url : novels_name 小说名称 :return: 小说章节内容页 """ url = request.args.get('url', None) chapter_url = request.args.get('chapter_url', None) novels_name = request.args.get('novels_name', None) name = request.args.get('name', None) # 当小说内容url不在解析规则内 跳转到原本url netloc = urlparse(url).netloc if netloc not in RULES.keys(): return redirect(url) # 拼接小说书签url bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format( path=request.path, url=url, name=name, chapter_url=chapter_url, novels_name=novels_name) # 拼接小说目录url book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format( chapter_url=chapter_url, novels_name=novels_name) content_url = RULES[netloc].content_url content = await cache_owllook_novels_content(url=url, netloc=netloc) if content: user = request['session'].get('user', None) # 破坏广告链接 content = str(content).strip('[]Jjs,').replace('http', 'hs') if user: motor_db = MotorBase().db bookmark = await motor_db.user_message.find_one({ 'user': user, 'bookmarks.bookmark': bookmark_url }) book = await motor_db.user_message.find_one({ 'user': user, 'books_url.book_url': book_url }) bookmark = 1 if bookmark else 0 if book: # 当书架中存在该书源 book = 1 # 保存最后一次阅读记录 await motor_db.user_message.update_one( { 'user': user, 'books_url.book_url': book_url }, {'$set': { 'books_url.$.last_read_url': bookmark_url }}) else: book = 0 return template('content.html', is_login=1, user=user, name=name, url=url, bookmark=bookmark, book=book, content_url=content_url, chapter_url=chapter_url, novels_name=novels_name, soup=content) else: return template('content.html', is_login=0, name=name, url=url, bookmark=0, book=0, content_url=content_url, chapter_url=chapter_url, novels_name=novels_name, soup=content) else: return text('解析失败,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))