Beispiel #1
0
async def update_all_books(loop, timeout=15):
    """Fetch the latest chapter for every unique book URL on any user's shelf.

    :param loop: event loop passed through to get_the_latest_chapter
    :param timeout: per-request timeout in seconds
    :return: False on a top-level failure, otherwise None
    """
    try:
        motor_db = MotorBase().get_db()
        # Cursor over every user's bookshelf links (book_url field only).
        books_url_cursor = motor_db.user_message.find(
            {}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
        async for document in books_url_cursor:
            # .get(): the projection can yield documents without 'books_url',
            # so direct indexing would raise KeyError.
            for book_url in (document or {}).get('books_url', []):
                chapter_url = book_url['book_url']
                if chapter_url in already_urls:
                    continue
                try:
                    await get_the_latest_chapter(chapter_url, loop, timeout)
                except Exception as e:
                    # One failing book must not abort the whole refresh.
                    LOGGER.exception(e)
                already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
Beispiel #2
0
async def data_extraction_for_web_so(client, html):
    """Extract title/url metadata from a single so.com search-result node.

    :param client: aiohttp session (unused here; kept for a uniform interface)
    :param html: BeautifulSoup node for one result
    :return: dict with title/url/netloc/..., or None when unusable
    """
    with async_timeout.timeout(15):
        try:
            try:
                url = html.select('h3.res-title a')[0].get('href', None)
                title = html.select('h3.res-title a')[0].get_text()
            except IndexError:
                # Fallback layout used by some result pages.
                url = html.select('h3.title a')[0].get('href', None)
                title = html.select('h3.title a')[0].get_text()
            except Exception as e:
                LOGGER.exception(e)
                return None

            # The real target is packed into the 'url' query parameter.
            url = parse_qs(urlparse(url).query).get('url', None)
            url = url[0] if url else None
            # BUG FIX: check for a missing url BEFORE calling urlparse on it;
            # urlparse(None) raised and silently dropped the result.
            if not url or 'baidu' in url or 'baike.so.com' in url:
                return None
            netloc = urlparse(url).netloc
            if netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': '',
                'is_parse': is_parse,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #3
0
 async def auth_param(request, *args, **kwargs):
     """Parse and validate request parameters, then call the wrapped handler.

     Closure variables (from the enclosing decorator, not visible here):
     ``func`` -- the wrapped handler; ``keys`` -- the required parameter names.

     Returns a 400 response when parameters are missing or unparseable, and a
     500 response when the wrapped handler raises.
     """
     request_params = {}
     # POST/DELETE carry their parameters as a JSON body.
     if request.method == 'POST' or request.method == 'DELETE':
         try:
             post_data = json_loads(str(request.body, encoding='utf-8'))
         except Exception as e:
             LOGGER.exception(e)
             return response_handle(request,
                                    UniResponse.PARAM_PARSE_ERR,
                                    status=400)
         else:
             request_params.update(post_data)
             # Only keys with truthy values count as "provided".
             params = [key for key, value in post_data.items() if value]
     elif request.method == 'GET':
         request_params.update(request.args)
         params = [key for key, value in request.args.items() if value]
     else:
         # TODO: other HTTP verbs are currently rejected outright.
         return response_handle(request,
                                UniResponse.PARAM_UNKNOWN_ERR,
                                status=400)
     # All required keys must be present (and truthy) before dispatch.
     if set(keys).issubset(set(params)):
         try:
             kwargs['request_params'] = request_params
             response = await func(request, *args, **kwargs)
             return response
         except Exception as e:
             LOGGER.exception(e)
             return response_handle(request,
                                    UniResponse.SERVER_UNKNOWN_ERR, 500)
     else:
         return response_handle(request,
                                UniResponse.PARAM_ERR,
                                status=400)
Beispiel #4
0
async def owllook_delete_book(request):
    """Remove a book from the logged-in user's bookshelf.

    :param request:
    :return: json status
        -1 -- session expired, re-login required
         0 -- deletion failed
         1 -- deletion succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    if not user:
        return json({'status': -1})
    if data.get('book_url', None):
        book_url = data.get('book_url', None)[0]
    else:
        novels_name = data.get('novels_name', '')
        chapter_url = data.get('chapter_url', '')
        # BUG FIX: parse_qs omits missing keys entirely, so indexing [0]
        # on the '' default raised IndexError outside any handler.
        if not novels_name or not chapter_url:
            return json({'status': 0})
        book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
            chapter_url=chapter_url[0], novels_name=novels_name[0])
    try:
        motor_db = motor_base.db
        # $pull removes the matching entry from the books_url array.
        await motor_db.user_message.update_one(
            {'user': user},
            {'$pull': {
                'books_url': {
                    "book_url": unquote(book_url)
                }
            }})
        LOGGER.info('删除书架成功')
        return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
Beispiel #5
0
        async def wrapper(*args, **kwargs):
            """Cache-through wrapper for the decorated coroutine ``func``.

            Closure variables (from the enclosing decorator, not visible
            here): cache, serializer, plugins, cache_kwargs, key,
            key_from_attr, ttl, func.

            Looks the call up in the cache first; on a miss (or cache
            error) awaits ``func`` and stores a truthy result under the
            computed key.
            """
            cache_instance = _get_cache(cache=cache,
                                        serializer=serializer,
                                        plugins=plugins,
                                        **cache_kwargs)
            args_dict = _get_args_dict(func, args, kwargs)
            # Cache key: explicit `key`, else the `key_from_attr` argument
            # of this call, else module + function name + repr of the args.
            cache_key = key or args_dict.get(
                key_from_attr, (func.__module__ or 'stub') + func.__name__ +
                str(args) + str(kwargs))

            try:
                if await cache_instance.exists(cache_key):
                    return await cache_instance.get(cache_key)

            except Exception:
                # Cache backend trouble must not break the wrapped call.
                LOGGER.exception("Unexpected error with %s", cache_instance)

            result = await func(*args, **kwargs)
            # Only truthy results are cached (None/empty responses are not).
            if result:
                try:
                    await cache_instance.set(cache_key, result, ttl=ttl)
                except Exception:
                    LOGGER.exception("Unexpected error with %s",
                                     cache_instance)

            return result
Beispiel #6
0
async def fetch(client, url, name, is_web):
    """Search Baidu for *name* and return the raw response body.

    :param client: aiohttp session
    :param url: search endpoint
    :param name: novel name to search for
    :param is_web: truthy for the web endpoint, falsy for the mobile one
    :return: response text (bytes fallback), None on failure
    """
    with async_timeout.timeout(15):
        try:
            headers = {'user-agent': get_random_user_agent()}
            if is_web:
                params = {
                    'wd': name,
                    'ie': 'utf-8',
                    'rn': CONFIG.BAIDU_RN,
                    'vf_bl': 1
                }
            else:
                params = {'word': name}
            async with client.get(url, params=params,
                                  headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Mis-declared charset: fall back to raw bytes instead
                    # of a bare except that hides every other error.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #7
0
async def change_email(request):
    """Change the e-mail address stored for the logged-in user.

    :param request:
    :return: json status
        -1 -- session expired, re-login required
         0 -- update failed
         1 -- update succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    # Guard clause: no valid session, bail out immediately.
    if not user:
        return json({'status': -1})
    try:
        new_email = data.get('email', None)[0]
        db = motor_base.get_db()
        await db.user.update_one({'user': user},
                                 {'$set': {'email': new_email}})
        LOGGER.info('修改邮箱成功')
        return json({'status': 1})
    except Exception as e:
        # Missing 'email' param lands here too (TypeError on None[0]).
        LOGGER.exception(e)
        return json({'status': 0})
Beispiel #8
0
async def data_extraction_for_web_baidu(client, html):
    """Extract title/url metadata from one Baidu search-result node.

    :param client: aiohttp session, used to resolve the redirect link
    :param html: BeautifulSoup node for one result
    :return: result dict, or None when the link is missing/blacklisted
    """
    with async_timeout.timeout(20):
        try:
            href = html.select('h3.t a')[0].get('href', None)
            # Baidu links are redirects; resolve to the real target first.
            real_url = await get_real_url(client=client, url=href) if href else None
            if not real_url:
                return None
            netloc = urlparse(real_url).netloc
            if 'baidu' in real_url or netloc in BLACK_DOMAIN:
                return None
            return {
                'title': html.select('h3.t a')[0].get_text(),
                'url': real_url.replace('index.html', ''),
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #9
0
async def owl_novels_chapters(request):
    """Return the chapter index of a novel -- works for most sources.

    :param request: query args: chapters_url (source index page url) and
        novels_name (novel title)
    :return: json with status 200/204/500 and the sorted chapter list
    """
    chapters_url = request.args.get('chapters_url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = get_netloc(chapters_url)
    try:
        res = await cache_owllook_novels_chapter(url=chapters_url,
                                                 netloc=netloc)
        chapters_sorted = []
        if res:
            chapters_sorted = extract_chapters(chapters_url, res)
            result = {'status': 200}
        else:
            # 204: source fetched but produced no chapter data.
            result = {'status': 204}
        result.update({
            'data': {
                'novels_name': novels_name,
                'chapter_url': chapters_url,
                'all_chapters': chapters_sorted
            },
            'msg': "ok"
        })
    except Exception as e:
        LOGGER.exception(e)
        # BUG FIX: str(e) -- a raw exception object is not JSON-serializable
        # and made response.json() itself fail.
        result = {'status': 500, 'msg': str(e)}
    result.update({'finished_at': get_time()})
    return response.json(result)
Beispiel #10
0
async def change_pass(request):
    """Change the logged-in user's password.

    :param request:
    :return: json status
        -1 -- session expired, re-login required
         0 -- change failed
         1 -- change succeeded
        -2 -- old password incorrect
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    if not user:
        return json({'status': -1})
    try:
        new_pass = data.get('new_pass', None)[0]
        old_pass = data.get('old_pass', None)[0]
        motor_db = motor_base.db
        user_data = await motor_db.user.find_one({'user': user})
        if not user_data:
            # BUG FIX: previously fell through and implicitly returned None
            # when the user record was missing.
            return json({'status': 0})
        # Stored scheme is a double md5: md5(md5(TOKEN + password)).
        pass_first = hashlib.md5((CONFIG.WEBSITE["TOKEN"] + old_pass).encode("utf-8")).hexdigest()
        pass_second = hashlib.md5((CONFIG.WEBSITE["TOKEN"] + new_pass).encode("utf-8")).hexdigest()
        new_password = hashlib.md5(pass_second.encode("utf-8")).hexdigest()
        password = hashlib.md5(pass_first.encode("utf-8")).hexdigest()
        if password != user_data.get('password'):
            return json({'status': -2})
        await motor_db.user.update_one({'user': user},
                                       {'$set': {'password': new_password}})
        LOGGER.info('修改密码成功')
        return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
Beispiel #11
0
async def owl_novels_chapters(request, **kwargs):
    """Return the chapter index of a novel -- works for most sources.

    :param request:
    :param kwargs: carries 'request_params' with chapters_url (source index
        page url) and novels_name (novel title)
    :return: handled response with the sorted chapter list
    """
    request_params = kwargs["request_params"]
    chapters_url = request_params.get('chapters_url', None)
    novels_name = request_params.get('novels_name', None)
    netloc = get_netloc(chapters_url)
    try:
        res = await cache_owllook_novels_chapter(url=chapters_url, netloc=netloc)
        chapters_sorted = []
        if res:
            chapters_sorted = extract_chapters(chapters_url, res)
        # BUG FIX: build a per-request copy -- mutating the shared
        # UniResponse.SUCCESS template leaked one request's payload into
        # every later response that reused it.
        payload = dict(UniResponse.SUCCESS)
        payload.update({ResponseField.DATA: {
            'novels_name': novels_name,
            'chapter_url': chapters_url,
            'all_chapters': chapters_sorted
        }, ResponseField.FINISH_AT: get_time()})
        return response_handle(request, payload, 200)
    except Exception as e:
        LOGGER.exception(e)
        return response_handle(request, UniResponse.SERVER_UNKNOWN_ERR, 500)
Beispiel #12
0
async def owllook_delete_bookmark(request):
    """Delete one bookmark from the logged-in user's bookmark list.

    :param request:
    :return: json status
        -1 -- session expired (or no bookmark url supplied)
         0 -- deletion failed
         1 -- deletion succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    bookmarkurl = data.get('bookmarkurl', '')
    # Guard clause: need both a session and a bookmark url.
    if not (user and bookmarkurl):
        return json({'status': -1})
    target = unquote(bookmarkurl[0])
    try:
        db = motor_base.db
        # $pull removes the matching entry from the bookmarks array.
        await db.user_message.update_one(
            {'user': user},
            {'$pull': {'bookmarks': {"bookmark": target}}})
        LOGGER.info('删除书签成功')
        return json({'status': 1})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
Beispiel #13
0
async def update_all_books():
    """Refresh the latest chapter for every unique bookshelf URL.

    :return: False on a top-level failure, otherwise None
    """
    try:
        motor_db = MotorBase().db
        # Cursor over every user's bookshelf links (book_url field only).
        books_url_cursor = motor_db.user_message.find(
            {}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
        async for document in books_url_cursor:
            # .get(): the projection can yield documents without 'books_url',
            # so direct indexing would raise KeyError.
            for book_url in (document or {}).get('books_url', []):
                chapter_url = book_url['book_url']
                if chapter_url in already_urls:
                    continue
                try:
                    with async_timeout.timeout(20):
                        await get_the_latest_chapter(chapter_url)
                except Exception as e:
                    # A single failing book must not stop the sweep.
                    LOGGER.exception(e)
                already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
Beispiel #14
0
async def data_extraction_for_web_duck(client, html):
    """Extract title/url metadata from one DuckDuckGo result node.

    :param client: aiohttp session (unused here; uniform interface)
    :param html: BeautifulSoup node for one result
    :return: result dict, or None when the link is missing/blacklisted
    """
    with async_timeout.timeout(15):
        try:
            anchor = html.select('h2 a')[0]
            title = anchor.get_text()
            link = anchor.get('href', None)
            # DDG wraps the target in the 'uddg' query parameter.
            link = parse_qs(link).get('uddg', ['#'])[0]
            netloc = urlparse(link).netloc
            link = link.replace('index.html', '').replace('Index.html', '')
            if not link or 'baidu' in link or 'baike.so.com' in link \
                    or netloc in BLACK_DOMAIN or '.html' in link:
                return None
            return {
                'title': title,
                'url': link,
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'is_recommend': 1 if netloc in LATEST_RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            # Any parse failure (missing anchor, bad href) ends up here.
            LOGGER.exception(e)
            return None
Beispiel #15
0
async def fetch(client, url, novels_name):
    """Search so.com for *novels_name* and return the raw response body.

    :param client: aiohttp session
    :param url: search endpoint
    :param novels_name: query string
    :return: response text (bytes fallback), None on failure
    """
    with async_timeout.timeout(20):
        try:
            headers = {
                'User-Agent': get_random_user_agent(),
                'Referer': "http://www.so.com/haosou.html?src=home"
            }
            params = {
                'ie': 'utf-8',
                'src': 'noscript_home',
                'shb': 1,
                'q': novels_name,
            }
            async with client.get(url, params=params,
                                  headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Mis-declared charset: fall back to raw bytes instead
                    # of a bare except that hides every other error.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #16
0
async def owllook_add_book(request):
    """Add a novel to the logged-in user's bookshelf.

    :param request:
    :return: json status
        -1 -- session expired / required params missing
         0 -- add failed
         1 -- add succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    novels_name = data.get('novels_name', '')
    chapter_url = data.get('chapter_url', '')
    last_read_url = data.get('last_read_url', '')
    if not (user and novels_name and chapter_url):
        return json({'status': -1})
    # BUG FIX: parse_qs drops absent keys, so a missing last_read_url raised
    # IndexError inside the try; fail explicitly instead.
    if not last_read_url:
        return json({'status': 0})
    url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url[0], novels_name=novels_name[0])
    time = get_time()
    try:
        motor_db = MotorBase().db
        # Touch last_update_time first (upsert creates the user document).
        res = await motor_db.user_message.update_one(
            {'user': user}, {'$set': {'last_update_time': time}}, upsert=True)
        if res:
            # The $ne filter keeps the same book from being pushed twice.
            await motor_db.user_message.update_one(
                {'user': user, 'books_url.book_url': {'$ne': url}},
                {'$push': {
                    'books_url': {'book_url': url, 'add_time': time,
                                  'last_read_url': unquote(last_read_url[0])}}})
            LOGGER.info('书架添加成功')
            return json({'status': 1})
        # BUG FIX: previously returned None (implicit) on a falsy result.
        return json({'status': 0})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
Beispiel #17
0
async def owllook_add_bookmark(request):
    """Add a bookmark for the logged-in user.

    :param request:
    :return: json status
        -1 -- session expired / bookmark url missing
         0 -- add failed
         1 -- add succeeded
    """
    user = request['session'].get('user', None)
    data = parse_qs(str(request.body, encoding='utf-8'))
    bookmarkurl = data.get('bookmarkurl', '')
    if not (user and bookmarkurl):
        return json({'status': -1})
    url = unquote(bookmarkurl[0])
    time = get_time()
    try:
        motor_db = MotorBase().db
        # Touch last_update_time first (upsert creates the user document).
        res = await motor_db.user_message.update_one(
            {'user': user}, {'$set': {'last_update_time': time}}, upsert=True)
        if res:
            # The $ne filter prevents duplicate bookmarks.
            await motor_db.user_message.update_one(
                {'user': user, 'bookmarks.bookmark': {'$ne': url}},
                {'$push': {'bookmarks': {'bookmark': url, 'add_time': time}}})
            LOGGER.info('书签添加成功')
            return json({'status': 1})
        # BUG FIX: previously returned None (implicit) on a falsy result.
        return json({'status': 0})
    except Exception as e:
        LOGGER.exception(e)
        return json({'status': 0})
Beispiel #18
0
def extract_pre_next_chapter(chapter_url, html):
    """Pick out the prev/next chapter links from a chapter page.

    :param chapter_url: absolute url of the current chapter page
    :param html: raw page html
    :return: OrderedDict mapping link text -> absolute url (may be empty)
    """
    next_chapter = OrderedDict()
    try:
        # See https://greasyfork.org/zh-CN/scripts/292-my-novel-reader
        anchor_reg = r'(<a\s+.*?>.*[上前下后][一]?[页张个篇章节步].*?</a>)'
        judge_reg = r'[上前下后][一]?[页张个篇章节步]'
        candidates = re.findall(anchor_reg, html, re.I)
        # Re-parse the matched anchors with bs so attributes are reliable.
        soup = BeautifulSoup('\n'.join(candidates), 'html5lib')
        for link in soup.find_all('a'):
            text = (link.text or '').replace(' ', '')
            if not novels_list(text):
                continue
            if re.search(judge_reg, text):
                next_chapter[text] = urljoin(chapter_url, link.get('href')) or ''
    except Exception as e:
        LOGGER.exception(e)
    return next_chapter
Beispiel #19
0
async def data_extraction_for_web(html):
    """Extract title/url/date metadata from one legacy search-result node.

    :param html: BeautifulSoup node for one result
    :return: result dict, or None when the link is missing/blacklisted
    """
    with async_timeout.timeout(10):
        try:
            link = html.find('a').get('href', None)
            if not link or 'baidu' in link or urlparse(link).netloc in BLACK_DOMAIN:
                return None
            netloc = urlparse(link).netloc
            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()
            dates = re.findall(r'\d+-\d+-\d+', source)
            time = dates[0] if dates else None
            timestamp = 0
            if time:
                try:
                    y, m, d = (int(part) for part in time.split('-'))
                    timestamp = arrow.get(y, m, d).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0
            return {'title': title,
                    'url': link.replace('index.html', '').replace('Index.html', ''),
                    'time': time,
                    'is_parse': 1 if netloc in RULES.keys() else 0,
                    'timestamp': timestamp,
                    'netloc': netloc}
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #20
0
async def data_extraction_for_web_so(client, html):
    """Extract title/url metadata from one so.com result node.

    Handles both redirect-link formats so.com uses: ``link?m=`` (real target
    in the anchor's data-url attribute) and ``link?url=`` (real target in the
    query string).

    :param client: aiohttp session (unused here; uniform interface)
    :param html: BeautifulSoup node for one result
    :return: result dict, or None when unusable
    """
    with async_timeout.timeout(15):
        try:
            # 2017.09.09: grab title && url from the generic h3 anchor.
            try:
                title = html.select('h3 a')[0].get_text()
                url = html.select('h3 a')[0].get('href', None)
            except Exception as e:
                LOGGER.exception(e)
                return None
            # BUG FIX: href (or data-url below) can legitimately be missing;
            # testing `... in None` raised TypeError and lost the result.
            if not url:
                return None
            if "www.so.com/link?m=" in url:
                url = html.select('h3 a')[0].get('data-url', None)
            if url and "www.so.com/link?url=" in url:
                url = parse_qs(urlparse(url).query).get('url', None)
                url = url[0] if url else None
            if not url or 'baidu' in url or 'baike.so.com' in url:
                return None
            netloc = urlparse(url).netloc
            if netloc in BLACK_DOMAIN:
                return None
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'is_recommend': 1 if netloc in LATEST_RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #21
0
async def fetch(client, url):
    """GET *url* with a random user-agent and return the body.

    :param client: aiohttp session
    :param url: target url
    :return: response text (bytes fallback), None on failure
    """
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.get(url, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Mis-declared charset: fall back to raw bytes instead
                    # of a bare except that hides every other error.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #22
0
def requests_target_fetch(url):
    """Fetch *url* synchronously and return its decoded body.

    :param url: target url
    :return: decoded page text, or None on any failure
    """
    try:
        headers = {'user-agent': get_random_user_agent()}
        # NOTE(review): verify=False disables TLS certificate checking --
        # confirm this is intentional for these scraping targets.
        response = requests.get(url=url, headers=headers, verify=False)
        response.raise_for_status()
        content = response.content
        # BUG FIX: cchardet can return {'encoding': None}; fall back to
        # utf-8 instead of raising TypeError in decode() and losing the page.
        charset = cchardet.detect(content)
        text = content.decode(charset['encoding'] or 'utf-8')
        return text
    except Exception as e:
        LOGGER.exception(e)
        return None
Beispiel #23
0
async def data_extraction_for_phone(html):
    """Extract title/url/author metadata from one mobile search-result node.

    :param html: BeautifulSoup node carrying a 'data-log' attribute
    :return: dict with title/url/basic_mess, or None on failure
    """
    with async_timeout.timeout(10):
        try:
            # Pull the real target url out of the data-log attribute.
            # NOTE(review): eval() on scraped page content is an arbitrary
            # code-execution risk -- consider ast.literal_eval or json.loads.
            data_log = eval(html['data-log'])
            url = data_log.get('mu', None)
            if not url:
                return None
            # Get title
            title = html.find('h3').get_text()
            # Get author and update_time (optional fields)
            novel_mess = html.findAll(class_='c-gap-right-large')
            basic_mess = [i.get_text() for i in novel_mess] if novel_mess else None
            return {'title': title, 'url': url, 'basic_mess': basic_mess}
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #24
0
def get_html_by_requests(url, headers, timeout=15):
    """Fetch *url* synchronously and return its decoded body.

    :param url: target url
    :param headers: request headers
    :param timeout: request timeout in seconds
    :return: decoded page text, or None on any failure
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checking --
        # confirm this is intentional for these scraping targets.
        response = requests.get(url=url,
                                headers=headers,
                                verify=False,
                                timeout=timeout)
        response.raise_for_status()
        content = response.content
        # BUG FIX: cchardet can return {'encoding': None}; fall back to
        # utf-8 instead of raising TypeError in decode() and losing the page.
        charset = cchardet.detect(content)
        text = content.decode(charset['encoding'] or 'utf-8')
        return text
    except Exception as e:
        LOGGER.exception(e)
        return None
Beispiel #25
0
async def owl_so_novels(request, name):
    """360 (so.com) novel-search API endpoint.

    :param request:
    :param name: novel name (url-quoted)
    :return: handled response with the search results
    """
    name = unquote(name)
    novels_name = '{name} 小说 免费阅读'.format(name=name)
    try:
        res = await cache_owllook_so_novels_result(novels_name)
        parse_result = []
        if res:
            parse_result = [i for i in res if i]
        # BUG FIX: build a per-request copy -- mutating the shared
        # UniResponse.SUCCESS template leaked one request's data into
        # every later response that reused it.
        payload = dict(UniResponse.SUCCESS)
        payload.update({ResponseField.DATA: parse_result,
                        ResponseField.FINISH_AT: get_time()})
        return response_handle(request, payload, 200)
    except Exception as e:
        LOGGER.exception(e)
        return response_handle(request, UniResponse.SERVER_UNKNOWN_ERR, 500)
Beispiel #26
0
async def data_extraction_for_web_bing(client, html):
    """Extract title/url metadata from one Bing search-result node.

    :param client: aiohttp session (unused here; uniform interface)
    :param html: BeautifulSoup node for one result
    :return: result dict, or None when the link is missing/blacklisted
    """
    with async_timeout.timeout(15):
        try:
            anchor = html.select('h2 a')[0]
            title = anchor.get_text()
            link = anchor.get('href', None)
            netloc = urlparse(link).netloc
            link = link.replace('index.html', '').replace('Index.html', '')
            if not link or 'baidu' in link or 'baike.so.com' in link \
                    or netloc in BLACK_DOMAIN or '.html' in link:
                return None
            return {
                'title': title,
                'url': link,
                'time': '',
                'is_parse': 1 if netloc in RULES.keys() else 0,
                'is_recommend': 1 if netloc in LATEST_RULES.keys() else 0,
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            # Any parse failure (missing anchor, bad href) ends up here.
            LOGGER.exception(e)
            return None
Beispiel #27
0
async def fetch(client, url, novels_name):
    """Search Bing for *novels_name* and return the raw response body.

    :param client: aiohttp session
    :param url: search endpoint
    :param novels_name: query string
    :return: response text (bytes fallback), None on failure
    """
    with async_timeout.timeout(20):
        try:
            headers = {
                'user-agent': get_random_user_agent(),
                'referer': "https://www.bing.com/"
            }
            params = {'q': novels_name, 'ensearch': 0}
            async with client.get(url, params=params,
                                  headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Mis-declared charset: fall back to raw bytes instead
                    # of a bare except that hides every other error.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #28
0
async def get_real_url(client, url):
    """Resolve *url* by following redirects with a HEAD request.

    :param client: aiohttp session
    :param url: possibly-redirecting url
    :return: the final url after redirects, or None on failure
    """
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.head(url, headers=headers,
                                   allow_redirects=True) as response:
                assert response.status == 200
                LOGGER.info('Parse url: {}'.format(response.url))
                final_url = response.url or None
                return final_url
        except Exception as e:
            LOGGER.exception(e)
            return None
Beispiel #29
0
 async def fetch_url(self, client, url, params, headers):
     """
     Shared fetch helper: GET *url* and return the body.
     :param client: aiohttp session
     :param url: target url
     :param params: query parameters
     :param headers: request headers
     :return: response text (bytes fallback), None on failure
     """
     with async_timeout.timeout(15):
         try:
             async with client.get(url, params=params,
                                   headers=headers) as response:
                 assert response.status == 200
                 LOGGER.info('Task url: {}'.format(response.url))
                 try:
                     text = await response.text()
                 except UnicodeDecodeError:
                     # Mis-declared charset: fall back to raw bytes instead
                     # of a bare except that hides every other error.
                     text = await response.read()
                 return text
         except Exception as e:
             LOGGER.exception(e)
             return None
Beispiel #30
0
async def target_fetch(url, headers, timeout=15):
    """Fetch *url* with a throwaway aiohttp session.

    :param url: target url
    :param headers: request headers
    :param timeout: overall timeout in seconds
    :return: response text (bytes fallback), None on failure
    """
    with async_timeout.timeout(timeout):
        try:
            async with aiohttp.ClientSession() as client:
                async with client.get(url, headers=headers) as response:
                    assert response.status == 200
                    LOGGER.info('Task url: {}'.format(response.url))
                    try:
                        text = await response.text()
                    except UnicodeDecodeError:
                        # Mis-declared charset: fall back to raw bytes instead
                        # of a bare except that hides every other error.
                        try:
                            text = await response.read()
                        except aiohttp.ServerDisconnectedError as e:
                            LOGGER.exception(e)
                            text = None
                    return text
        except Exception as e:
            LOGGER.exception(str(e))
            return None