async def update_all_books(loop, timeout=15):
    """
    Iterate every user's bookshelf and refresh the latest chapter for each
    distinct book URL.

    :param loop: event loop forwarded to get_the_latest_chapter.
    :param timeout: per-book timeout in seconds forwarded to the fetcher.
    :return: False on a top-level failure, otherwise None.
    """
    try:
        motor_db = MotorBase().get_db()
        # Cursor over all bookshelf links (project only book_url fields).
        books_url_cursor = motor_db.user_message.find(
            {}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
        async for document in books_url_cursor:
            if not document:
                continue
            for book_url in document['books_url']:
                chapter_url = book_url['book_url']
                # Skip URLs already refreshed during this sweep.
                if chapter_url in already_urls:
                    continue
                try:
                    await get_the_latest_chapter(chapter_url, loop, timeout)
                except Exception as e:
                    # Best-effort: one failing book must not stop the sweep.
                    LOGGER.exception(e)
                already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
async def data_extraction_for_web_so(client, html):
    """
    Extract one search result (title/url/flags) from a so.com result node.

    :param client: aiohttp client session (unused here, kept for interface parity).
    :param html: BeautifulSoup node for a single search result.
    :return: result dict, or None when the entry is unusable.
    """
    with async_timeout.timeout(15):
        try:
            try:
                url = html.select('h3.res-title a')[0].get('href', None)
                title = html.select('h3.res-title a')[0].get_text()
            except IndexError:
                # Older result markup uses h3.title instead of h3.res-title.
                url = html.select('h3.title a')[0].get('href', None)
                title = html.select('h3.title a')[0].get_text()
            except Exception as e:
                LOGGER.exception(e)
                return None
            # so.com wraps the real target in a redirect: ?url=<target>.
            url = parse_qs(urlparse(url).query).get('url', None)
            url = url[0] if url else None
            netloc = urlparse(url).netloc
            if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            time = ''
            timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def auth_param(request, *args, **kwargs): request_params = {} # POST request if request.method == 'POST' or request.method == 'DELETE': try: post_data = json_loads(str(request.body, encoding='utf-8')) except Exception as e: LOGGER.exception(e) return response_handle(request, UniResponse.PARAM_PARSE_ERR, status=400) else: request_params.update(post_data) params = [key for key, value in post_data.items() if value] elif request.method == 'GET': request_params.update(request.args) params = [key for key, value in request.args.items() if value] else: # TODO return response_handle(request, UniResponse.PARAM_UNKNOWN_ERR, status=400) if set(keys).issubset(set(params)): try: kwargs['request_params'] = request_params response = await func(request, *args, **kwargs) return response except Exception as e: LOGGER.exception(e) return response_handle(request, UniResponse.SERVER_UNKNOWN_ERR, 500) else: return response_handle(request, UniResponse.PARAM_ERR, status=400)
async def owllook_delete_book(request):
    """
    Remove a book from the current user's bookshelf.

    Status codes returned:
      -1  session expired, user must log in again
       0  delete failed
       1  delete succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    if not user:
        return json({'status': -1})
    if data.get('book_url', None):
        book_url = data.get('book_url', None)[0]
    else:
        # Rebuild the shelf URL from its components when not sent directly.
        novels_name = data.get('novels_name', '')
        chapter_url = data.get('chapter_url', '')
        book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
            chapter_url=chapter_url[0], novels_name=novels_name[0])
    try:
        motor_db = motor_base.db
        await motor_db.user_message.update_one(
            {'user': user},
            {'$pull': {'books_url': {"book_url": unquote(book_url)}}})
        LOGGER.info('删除书架成功')
        return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
async def wrapper(*args, **kwargs):
    # NOTE(review): inner wrapper of a caching decorator — `cache`,
    # `serializer`, `plugins`, `cache_kwargs`, `func`, `key`, `key_from_attr`
    # and `ttl` come from an enclosing scope that is not visible here;
    # confirm against the decorator definition.
    cache_instance = _get_cache(cache=cache, serializer=serializer, plugins=plugins, **cache_kwargs)
    args_dict = _get_args_dict(func, args, kwargs)
    # Cache key precedence: explicit `key`, then the attr-derived key, then a
    # key built from module + function name + the call arguments.
    cache_key = key or args_dict.get(
        key_from_attr,
        (func.__module__ or 'stub') + func.__name__ + str(args) + str(kwargs))
    try:
        if await cache_instance.exists(cache_key):
            return await cache_instance.get(cache_key)
    except Exception:
        # Cache backend failure is non-fatal: fall through and recompute.
        LOGGER.exception("Unexpected error with %s", cache_instance)
    result = await func(*args, **kwargs)
    # Only truthy results are cached, so empty/failed results get retried.
    if result:
        try:
            await cache_instance.set(cache_key, result, ttl=ttl)
        except Exception:
            LOGGER.exception("Unexpected error with %s", cache_instance)
    return result
async def fetch(client, url, name, is_web):
    """
    Fetch Baidu search results for a novel name.

    :param client: aiohttp client session.
    :param url: search endpoint URL.
    :param name: novel name to search for.
    :param is_web: truthy for desktop web-search params, else mobile params.
    :return: response body text (or raw bytes fallback), None on failure.
    """
    with async_timeout.timeout(15):
        try:
            headers = {'user-agent': get_random_user_agent()}
            if is_web:
                params = {
                    'wd': name,
                    'ie': 'utf-8',
                    'rn': CONFIG.BAIDU_RN,
                    'vf_bl': 1
                }
            else:
                params = {'word': name}
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fix: was a bare `except:` — fall back to raw bytes
                    # when text decoding fails.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def change_email(request):
    """
    Update the logged-in user's e-mail address.

    Status codes returned:
      -1  session expired, user must log in again
       0  update failed
       1  update succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    if not user:
        return json({'status': -1})
    try:
        email = data.get('email', None)[0]
        motor_db = motor_base.get_db()
        await motor_db.user.update_one(
            {'user': user}, {'$set': {'email': email}})
        LOGGER.info('修改邮箱成功')
        return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
async def data_extraction_for_web_baidu(client, html):
    """
    Extract one search result from a Baidu web-search result node.

    :param client: aiohttp client session used to resolve the redirect URL.
    :param html: BeautifulSoup node for a single search result.
    :return: result dict, or None when the entry is unusable.
    """
    with async_timeout.timeout(20):
        try:
            url = html.select('h3.t a')[0].get('href', None)
            # Baidu result links are redirects; resolve to the real target.
            real_url = await get_real_url(client=client, url=url) if url else None
            if not real_url:
                return None
            netloc = urlparse(real_url).netloc
            if 'baidu' in real_url or netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            title = html.select('h3.t a')[0].get_text()
            # No date is extracted from Baidu results; keep placeholders.
            timestamp = 0
            time = ""
            return {
                'title': title,
                'url': real_url.replace('index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def owl_novels_chapters(request):
    """
    Return the chapter list for a novel (generic across sources).

    Query args:
      chapters_url: URL of the source's chapter-index page.
      novels_name: novel name.
    :return: JSON response with the sorted chapter list.
    """
    chapters_url = request.args.get('chapters_url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = get_netloc(chapters_url)
    try:
        res = await cache_owllook_novels_chapter(url=chapters_url, netloc=netloc)
        chapters_sorted = []
        if res:
            chapters_sorted = extract_chapters(chapters_url, res)
            result = {'status': 200}
        else:
            result = {'status': 204}
        result.update({
            'data': {
                'novels_name': novels_name,
                'chapter_url': chapters_url,
                'all_chapters': chapters_sorted
            },
            'msg': "ok"
        })
    except Exception as e:
        LOGGER.exception(e)
        # Fix: serialize the exception as text — a raw exception object is
        # not JSON serializable and would break response.json().
        result = {'status': 500, 'msg': str(e)}
    result.update({'finished_at': get_time()})
    return response.json(result)
async def change_pass(request):
    """
    Change the logged-in user's password.

    Status codes returned:
      -1  session expired, user must log in again
      -2  original password is wrong
       0  change failed
       1  change succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    if user:
        try:
            new_pass = data.get('new_pass', None)[0]
            old_pass = data.get('old_pass', None)[0]
            motor_db = motor_base.db
            user_data = await motor_db.user.find_one({'user': user})
            if user_data:
                # Stored form is md5(md5(TOKEN + plaintext)).
                pass_first = hashlib.md5((CONFIG.WEBSITE["TOKEN"] + old_pass).encode("utf-8")).hexdigest()
                pass_second = hashlib.md5((CONFIG.WEBSITE["TOKEN"] + new_pass).encode("utf-8")).hexdigest()
                new_password = hashlib.md5(pass_second.encode("utf-8")).hexdigest()
                password = hashlib.md5(pass_first.encode("utf-8")).hexdigest()
                if password == user_data.get('password'):
                    await motor_db.user.update_one(
                        {'user': user}, {'$set': {'password': new_password}})
                    LOGGER.info('修改密码成功')
                    return json({'status': 1})
                else:
                    return json({'status': -2})
            else:
                # Fix: original fell through and implicitly returned None
                # when the user record was missing — return failure instead.
                return json({'status': 0})
        except Exception as e:
            LOGGER.exception(e)
            return json({'status': 0})
    else:
        return json({'status': -1})
async def owl_novels_chapters(request, **kwargs):
    """
    Return the chapter list for a novel (generic across sources).

    :param request: incoming request; validated params arrive via kwargs.
    :return: unified JSON response containing the sorted chapter list.
    """
    request_params = kwargs["request_params"]
    chapters_url = request_params.get('chapters_url', None)
    novels_name = request_params.get('novels_name', None)
    netloc = get_netloc(chapters_url)
    try:
        res = await cache_owllook_novels_chapter(url=chapters_url, netloc=netloc)
        # Empty list when the source returned nothing.
        chapters_sorted = extract_chapters(chapters_url, res) if res else []
        UniResponse.SUCCESS.update({
            ResponseField.DATA: {
                'novels_name': novels_name,
                'chapter_url': chapters_url,
                'all_chapters': chapters_sorted
            },
            ResponseField.FINISH_AT: get_time()
        })
        return response_handle(request, UniResponse.SUCCESS, 200)
    except Exception as e:
        LOGGER.exception(e)
        return response_handle(request, UniResponse.SERVER_UNKNOWN_ERR, 500)
async def owllook_delete_bookmark(request):
    """
    Delete a bookmark for the current user.

    Status codes returned:
      -1  session expired, user must log in again
       0  delete failed
       1  delete succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    bookmarkurl = data.get('bookmarkurl', '')
    if not (user and bookmarkurl):
        return json({'status': -1})
    bookmark = unquote(bookmarkurl[0])
    try:
        motor_db = motor_base.db
        await motor_db.user_message.update_one(
            {'user': user},
            {'$pull': {'bookmarks': {"bookmark": bookmark}}})
        LOGGER.info('删除书签成功')
        return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
async def update_all_books():
    """
    Iterate every user's bookshelf and refresh the latest chapter for each
    distinct book URL, with a 20-second timeout per book.

    :return: False on a top-level failure, otherwise None.
    """
    try:
        motor_db = MotorBase().db
        # Cursor over all bookshelf links (project only book_url fields).
        books_url_cursor = motor_db.user_message.find(
            {}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
        async for document in books_url_cursor:
            if not document:
                continue
            for book_url in document['books_url']:
                chapter_url = book_url['book_url']
                # Skip URLs already refreshed during this sweep.
                if chapter_url in already_urls:
                    continue
                try:
                    with async_timeout.timeout(20):
                        await get_the_latest_chapter(chapter_url)
                except Exception as e:
                    # Best-effort: one failing book must not stop the sweep.
                    LOGGER.exception(e)
                already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
async def data_extraction_for_web_duck(client, html):
    """
    Extract one search result from a DuckDuckGo result node.

    :param client: aiohttp client session (unused here, kept for interface parity).
    :param html: BeautifulSoup node for a single search result.
    :return: result dict, or None when the entry is unusable.
    """
    with async_timeout.timeout(15):
        try:
            try:
                title = html.select('h2 a')[0].get_text()
                url = html.select('h2 a')[0].get('href', None)
                # DDG links wrap the target in a redirect: /l/?uddg=<target>.
                # Fix: parse only the query string — parse_qs() on the full
                # URL never produced a plain 'uddg' key.
                url = parse_qs(urlparse(url).query).get('uddg', ['#'])[0]
                netloc = urlparse(url).netloc
                url = url.replace('index.html', '').replace('Index.html', '')
                if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN or '.html' in url:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
                timestamp = 0
                time = ''
                return {
                    'title': title,
                    'url': url,
                    'time': time,
                    'is_parse': is_parse,
                    'is_recommend': is_recommend,
                    'timestamp': timestamp,
                    'netloc': netloc
                }
            except Exception as e:
                LOGGER.exception(e)
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
async def fetch(client, url, novels_name):
    """
    Fetch so.com search results for a novel name.

    :param client: aiohttp client session.
    :param url: search endpoint URL.
    :param novels_name: novel name to search for.
    :return: response body text (or raw bytes fallback), None on failure.
    """
    with async_timeout.timeout(20):
        try:
            headers = {
                'User-Agent': get_random_user_agent(),
                'Referer': "http://www.so.com/haosou.html?src=home"
            }
            params = {
                'ie': 'utf-8',
                'src': 'noscript_home',
                'shb': 1,
                'q': novels_name,
            }
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fix: was a bare `except:` — fall back to raw bytes
                    # when text decoding fails.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def owllook_add_book(request):
    """
    Add a novel to the current user's bookshelf.

    Status codes returned:
      -1  session expired or required parameters missing
       0  add failed
       1  add succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    novels_name = data.get('novels_name', '')
    chapter_url = data.get('chapter_url', '')
    last_read_url = data.get('last_read_url', '')
    if not (user and novels_name and chapter_url):
        return json({'status': -1})
    url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url[0], novels_name=novels_name[0])
    time = get_time()
    try:
        motor_db = MotorBase().db
        # Touch last_update_time (creates the user doc when absent).
        res = await motor_db.user_message.update_one(
            {'user': user}, {'$set': {'last_update_time': time}}, upsert=True)
        if res:
            # The $ne guard keeps the same book from being pushed twice.
            await motor_db.user_message.update_one(
                {'user': user, 'books_url.book_url': {'$ne': url}},
                {'$push': {'books_url': {
                    'book_url': url,
                    'add_time': time,
                    'last_read_url': unquote(last_read_url[0])
                }}})
            LOGGER.info('书架添加成功')
            return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
async def owllook_add_bookmark(request):
    """
    Add a bookmark for the current user.

    Status codes returned:
      -1  session expired or bookmark URL missing
       0  add failed
       1  add succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    bookmarkurl = data.get('bookmarkurl', '')
    if not (user and bookmarkurl):
        return json({'status': -1})
    url = unquote(bookmarkurl[0])
    time = get_time()
    try:
        motor_db = MotorBase().db
        # Touch last_update_time (creates the user doc when absent).
        res = await motor_db.user_message.update_one(
            {'user': user}, {'$set': {'last_update_time': time}}, upsert=True)
        if res:
            # The $ne guard keeps the same bookmark from being pushed twice.
            await motor_db.user_message.update_one(
                {'user': user, 'bookmarks.bookmark': {'$ne': url}},
                {'$push': {'bookmarks': {'bookmark': url, 'add_time': time}}})
            LOGGER.info('书签添加成功')
            return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
def extract_pre_next_chapter(chapter_url, html):
    """
    Pull "previous / next chapter" navigation links out of a chapter page.

    :param chapter_url: absolute URL of the chapter page (base for hrefs).
    :param html: raw HTML text of the page.
    :return: OrderedDict mapping link text -> absolute URL (empty on failure).
    """
    next_chapter = OrderedDict()
    try:
        # Pattern reference: https://greasyfork.org/zh-CN/scripts/292-my-novel-reader
        anchor_pattern = r'(<a\s+.*?>.*[上前下后][一]?[页张个篇章节步].*?</a>)'
        nav_word_pattern = r'[上前下后][一]?[页张个篇章节步]'
        candidates = re.findall(anchor_pattern, html, re.I)
        # Re-parse the candidate anchors with BeautifulSoup so hrefs and
        # texts are extracted robustly.
        soup = BeautifulSoup('\n'.join(candidates), 'html5lib')
        for anchor in soup.find_all('a'):
            text = (anchor.text or '').replace(' ', '')
            if not novels_list(text):
                continue
            if re.search(nav_word_pattern, text):
                next_chapter[text] = urljoin(chapter_url, anchor.get('href')) or ''
        return next_chapter
    except Exception as e:
        LOGGER.exception(e)
        return next_chapter
async def data_extraction_for_web(html):
    """
    Extract one result entry (title/url/date) from a search-result node.

    :param html: BeautifulSoup node for a single result.
    :return: result dict, or None when the entry is unusable.
    """
    with async_timeout.timeout(10):
        try:
            url = html.find('a').get('href', None)
            if not url or 'baidu' in url or urlparse(url).netloc in BLACK_DOMAIN:
                return None
            netloc = urlparse(url).netloc
            is_parse = 1 if netloc in RULES.keys() else 0
            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()
            # Dates appear as Y-M-D inside the source text.
            dates = re.findall(r'\d+-\d+-\d+', source)
            time = dates[0] if dates else None
            timestamp = 0
            if time:
                try:
                    year, month, day = (int(part) for part in time.split('-'))
                    timestamp = arrow.get(year, month, day).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def data_extraction_for_web_so(client, html):
    """
    Extract one search result from a so.com result node (post-2017.09 markup).

    :param client: aiohttp client session (unused here, kept for interface parity).
    :param html: BeautifulSoup node for a single search result.
    :return: result dict, or None when the entry is unusable.
    """
    with async_timeout.timeout(15):
        try:
            try:
                title = html.select('h3 a')[0].get_text()
                url = html.select('h3 a')[0].get('href', None)
            except Exception as e:
                LOGGER.exception(e)
                return None
            # so.com serves two redirect formats; unwrap either one.
            if "www.so.com/link?m=" in url:
                # Newer format keeps the real target in data-url.
                url = html.select('h3 a')[0].get('data-url', None)
            if "www.so.com/link?url=" in url:
                # Older format carries the target in the ?url= query param.
                url = parse_qs(urlparse(url).query).get('url', None)
                url = url[0] if url else None
            netloc = urlparse(url).netloc
            if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
            time = ''
            timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'is_recommend': is_recommend,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def fetch(client, url):
    """
    Fetch a page with a randomized user-agent.

    :param client: aiohttp client session.
    :param url: target URL.
    :return: response body text (or raw bytes fallback), None on failure.
    """
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.get(url, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fix: was a bare `except:` — fall back to raw bytes
                    # when text decoding fails.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
def requests_target_fetch(url):
    """
    Synchronously fetch a URL and decode the body using charset
    auto-detection.

    :param url: target URL.
    :return: decoded page text, or None on any failure.
    """
    try:
        headers = {'user-agent': get_random_user_agent()}
        response = requests.get(url=url, headers=headers, verify=False)
        response.raise_for_status()
        raw = response.content
        # Detect the charset from the raw bytes rather than trusting headers.
        detected = cchardet.detect(raw)
        return raw.decode(detected['encoding'])
    except Exception as e:
        LOGGER.exception(e)
        return None
async def data_extraction_for_phone(html):
    """
    Extract title/url/metadata from a mobile search result node.

    :param html: BeautifulSoup node carrying a 'data-log' attribute.
    :return: dict with title/url/basic_mess, or None when unusable.
    """
    import ast

    with async_timeout.timeout(10):
        try:
            # data-log holds a Python-literal dict; ast.literal_eval parses
            # it safely — the original used eval() on scraped, untrusted
            # input, which can execute arbitrary code.
            data_log = ast.literal_eval(html['data-log'])
            url = data_log.get('mu', None)
            if not url:
                return None
            # Get title
            title = html.find('h3').get_text()
            # Author / update-time spans are optional.
            novel_mess = html.findAll(class_='c-gap-right-large')
            basic_mess = [i.get_text() for i in novel_mess] if novel_mess else None
            return {'title': title, 'url': url, 'basic_mess': basic_mess}
        except Exception as e:
            LOGGER.exception(e)
            return None
def get_html_by_requests(url, headers, timeout=15):
    """
    Synchronously fetch a URL with custom headers, decoding the body via
    charset auto-detection.

    :param url: target URL.
    :param headers: request headers to send.
    :param timeout: request timeout in seconds.
    :return: decoded page text, or None on any failure.
    """
    try:
        response = requests.get(url=url, headers=headers, verify=False, timeout=timeout)
        response.raise_for_status()
        raw = response.content
        # Detect the charset from the raw bytes rather than trusting headers.
        detected = cchardet.detect(raw)
        return raw.decode(detected['encoding'])
    except Exception as e:
        LOGGER.exception(e)
        return None
async def owl_so_novels(request, name):
    """
    360 (so.com) novel search API.

    :param request: incoming request.
    :param name: URL-encoded novel name.
    :return: unified JSON response with the parsed search results.
    """
    name = unquote(name)
    novels_name = '{name} 小说 免费阅读'.format(name=name)
    try:
        res = await cache_owllook_so_novels_result(novels_name)
        # Drop empty/None entries from the cached result set.
        parse_result = [item for item in res if item] if res else []
        UniResponse.SUCCESS.update({
            ResponseField.DATA: parse_result,
            ResponseField.FINISH_AT: get_time()
        })
        return response_handle(request, UniResponse.SUCCESS, 200)
    except Exception as e:
        LOGGER.exception(e)
        return response_handle(request, UniResponse.SERVER_UNKNOWN_ERR, 500)
async def data_extraction_for_web_bing(client, html):
    """
    Extract one search result from a Bing result node.

    :param client: aiohttp client session (unused here, kept for interface parity).
    :param html: BeautifulSoup node for a single search result.
    :return: result dict, or None when the entry is unusable.
    """
    with async_timeout.timeout(15):
        try:
            try:
                title = html.select('h2 a')[0].get_text()
                url = html.select('h2 a')[0].get('href', None)
                netloc = urlparse(url).netloc
                url = url.replace('index.html', '').replace('Index.html', '')
                if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN or '.html' in url:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
                # No date is extracted from Bing results; keep placeholders.
                timestamp = 0
                time = ''
                return {
                    'title': title,
                    'url': url,
                    'time': time,
                    'is_parse': is_parse,
                    'is_recommend': is_recommend,
                    'timestamp': timestamp,
                    'netloc': netloc
                }
            except Exception as e:
                LOGGER.exception(e)
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
async def fetch(client, url, novels_name):
    """
    Fetch Bing search results for a novel name.

    :param client: aiohttp client session.
    :param url: search endpoint URL.
    :param novels_name: novel name to search for.
    :return: response body text (or raw bytes fallback), None on failure.
    """
    with async_timeout.timeout(20):
        try:
            headers = {
                'user-agent': get_random_user_agent(),
                'referer': "https://www.bing.com/"
            }
            params = {'q': novels_name, 'ensearch': 0}
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fix: was a bare `except:` — fall back to raw bytes
                    # when text decoding fails.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def get_real_url(client, url):
    """
    Follow redirects with a HEAD request and return the final URL.

    :param client: aiohttp client session.
    :param url: possibly-redirecting URL.
    :return: final URL after redirects, or None on failure.
    """
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.head(url, headers=headers, allow_redirects=True) as response:
                assert response.status == 200
                LOGGER.info('Parse url: {}'.format(response.url))
                url = response.url if response.url else None
                return url
        except Exception as e:
            LOGGER.exception(e)
            return None
async def fetch_url(self, client, url, params, headers):
    """
    Shared fetch helper.

    :param client: aiohttp client session.
    :param url: target URL.
    :param params: query parameters.
    :param headers: request headers.
    :return: response body text (or raw bytes fallback), None on failure.
    """
    with async_timeout.timeout(15):
        try:
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except Exception:
                    # Fix: was a bare `except:` — fall back to raw bytes
                    # when text decoding fails.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def target_fetch(url, headers, timeout=15):
    """
    Fetch a target page using its own short-lived client session.

    :param url: target url
    :param headers: request headers to send.
    :param timeout: overall timeout in seconds.
    :return: text (None when both decode and raw-read fail), or None on any
             other failure.
    """
    with async_timeout.timeout(timeout):
        try:
            async with aiohttp.ClientSession() as client:
                async with client.get(url, headers=headers) as response:
                    assert response.status == 200
                    LOGGER.info('Task url: {}'.format(response.url))
                    try:
                        text = await response.text()
                    except Exception:
                        # Fix: was a bare `except:` — decoding failed,
                        # fall back to raw bytes.
                        try:
                            text = await response.read()
                        except aiohttp.ServerDisconnectedError as e:
                            LOGGER.exception(e)
                            text = None
                    return text
        except Exception as e:
            LOGGER.exception(str(e))
            return None