Exemple #1
0
async def data_extraction_for_web_so(client, html):
    """
    Build a result record from one parsed search-result element.

    The result link is looked up with the ``h3.res-title a`` selector first
    and with ``h3.title a`` when the first selector matches nothing. The
    real target is then read from the ``url`` query parameter of that link.

    :param client: not used by this function
    :param html: parsed element exposing ``select()`` (looks like a
                 BeautifulSoup tag - TODO confirm)
    :return: dict with the keys ``title``, ``url``, ``time``, ``is_parse``,
             ``timestamp`` and ``netloc``; ``None`` when the link is missing,
             filtered out, or any error occurs
    """
    # NOTE(review): nothing in this block awaits, so the timeout presumably
    # never has a chance to fire - confirm whether it is still needed.
    with async_timeout.timeout(15):
        try:
            try:
                anchor = html.select('h3.res-title a')[0]
                url = anchor.get('href', None)
                title = anchor.get_text()
            except IndexError:
                # Alternative markup: same data under a different heading class.
                anchor = html.select('h3.title a')[0]
                url = anchor.get('href', None)
                title = anchor.get_text()
            except Exception as e:
                LOGGER.exception(e)
                url, title = None, None

            # The link is a redirect; the destination lives in its "url" param.
            targets = parse_qs(urlparse(url).query).get('url', None)
            url = targets[0] if targets else None
            netloc = urlparse(url).netloc
            if not url:
                return None
            if 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
                return None
            return {
                'title': title,
                'url': url.replace('index.html', ''),
                'time': '',
                # 1 when a parsing rule exists for this host, else 0.
                'is_parse': int(netloc in RULES),
                'timestamp': 0,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
Exemple #2
0
async def data_extraction_for_web(html):
    """
    Build a result record from one parsed search-result element.

    The link comes from the first ``a`` element, the title from
    ``font[size="3"]`` and the date (``digits-digits-digits``) from the text
    of ``font[color="#008000"]``.

    :param html: parsed element exposing ``find()`` and ``select()`` (looks
                 like a BeautifulSoup tag - TODO confirm)
    :return: dict with the keys ``title``, ``url``, ``time``, ``is_parse``,
             ``timestamp`` and ``netloc``; ``None`` when the link is missing,
             filtered out, or any error occurs
    """
    # NOTE(review): nothing in this block awaits, so the timeout presumably
    # never has a chance to fire - confirm whether it is still needed.
    with async_timeout.timeout(10):
        try:
            url = html.find('a').get('href', None)
            if not url or 'baidu' in url:
                return None
            netloc = urlparse(url).netloc
            if netloc in BLACK_DOMAIN:
                return None

            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()

            dates = re.findall(r'\d+-\d+-\d+', source)
            time = dates[0] if dates else None
            timestamp = 0
            if time:
                try:
                    # The pattern guarantees exactly three numeric parts.
                    year, month, day = (int(part) for part in time.split('-'))
                    # NOTE(review): ``timestamp`` is read as an attribute;
                    # confirm this matches the installed arrow version.
                    timestamp = arrow.get(year, month, day).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0

            cleaned_url = url.replace('index.html', '').replace('Index.html', '')
            return {
                'title': title,
                'url': cleaned_url,
                'time': time,
                # 1 when a parsing rule exists for this host, else 0.
                'is_parse': int(netloc in RULES),
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
Exemple #3
0
async def data_extraction_for_web_baidu(client, html):
    """
    Build a result record from one parsed search-result element.

    The link found under ``h3.t a`` is passed to ``get_real_url`` to obtain
    the final address; results whose final address contains ``baidu`` or
    whose host is in ``BLACK_DOMAIN`` are dropped.

    :param client: passed through to ``get_real_url``
    :param html: parsed element exposing ``select()`` (looks like a
                 BeautifulSoup tag - TODO confirm)
    :return: dict with the keys ``title``, ``url``, ``time``, ``is_parse``,
             ``timestamp`` and ``netloc``; ``None`` when no usable link is
             found, the result is filtered out, or any error occurs
    """
    # NOTE(review): recent async_timeout releases deprecate the plain ``with``
    # form in favour of ``async with`` - confirm against the pinned version.
    with async_timeout.timeout(20):
        try:
            anchor = html.select('h3.t a')[0]
            url = anchor.get('href', None)
            if not url:
                return None
            real_url = await get_real_url(client=client, url=url)
            if not real_url:
                return None
            netloc = urlparse(real_url).netloc
            if 'baidu' in real_url or netloc in BLACK_DOMAIN:
                return None
            return {
                'title': anchor.get_text(),
                'url': real_url.replace('index.html', ''),
                # No date is extracted for these results.
                'time': '',
                # 1 when a parsing rule exists for this host, else 0.
                'is_parse': 1 if netloc in RULES else 0,
                'timestamp': 0,
                'netloc': netloc,
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
async def chapter(request):
    """
    Return the chapter index page of a novel.

    Request arguments:
    : url           source URL of the chapter index page
    : novels_name   name of the novel

    ``content_url`` is taken from the RULES entry for the URL's host and is
    forwarded to the template, where it decides how page URLs are generated.

    :return: the rendered ``chapter.html`` page, a redirect to ``url`` when
             its host has no RULES entry, or a plain-text failure message
    """
    args = request.args
    url = args.get('url', None)
    novels_name = args.get('novels_name', None)

    netloc = urlparse(url).netloc
    if netloc not in RULES:
        return redirect(url)

    if netloc in REPLACE_RULES:
        # Some hosts need part of the URL rewritten before fetching.
        rewrite = REPLACE_RULES[netloc]
        url = url.replace(rewrite['old'], rewrite['new'])

    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if not content:
        return text('解析失败,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))

    # Trim the listed characters from both ends of the stringified content.
    soup = str(content).strip('[],, Jjs')
    return template('chapter.html',
                    novels_name=novels_name,
                    url=url,
                    content_url=content_url,
                    soup=soup)
Exemple #5
0
async def owllook_content(request):
    """
    Return the chapter content page of a novel.

    Request arguments:
    : url           source URL of the chapter content page
    : chapter_url   source URL of the novel's chapter index
    : novels_name   name of the novel
    : name          forwarded to the template and stored in the bookmark URL

    For a logged-in user the page also reports whether this chapter is
    bookmarked and whether the novel is on the bookshelf. Both lookups are
    restricted to that user's own record, so another user's bookmark or
    bookshelf entry cannot mark the page.

    :return: the rendered ``content.html`` page, a redirect to ``url`` when
             its host has no RULES entry, or a plain-text failure message
    """
    url = request.args.get('url', None)
    chapter_url = request.args.get('chapter_url', None)
    novels_name = request.args.get('novels_name', None)
    name = request.args.get('name', None)
    # NOTE(review): ``request.url`` is used as the path prefix here; confirm
    # it does not already include the query string on the Sanic version in use.
    bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
        path=request.url,
        url=url,
        name=name,
        chapter_url=chapter_url,
        novels_name=novels_name)
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url, novels_name=novels_name)
    netloc = urlparse(url).netloc
    if netloc not in RULES:
        return redirect(url)
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_content(url=url, netloc=netloc)
    if not content:
        return text('解析失败,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))

    user = request['session'].get('user', None)
    # Break ad links by mangling every "http" in the content.
    content = str(content).strip('[]Jjs,').replace('http', 'hs')

    # Values shared by the anonymous and the logged-in rendering.
    context = {
        'is_login': 0,
        'name': name,
        'url': url,
        'bookmark': 0,
        'book': 0,
        'content_url': content_url,
        'chapter_url': chapter_url,
        'novels_name': novels_name,
        'soup': content,
    }
    if user:
        motor_db = MotorBase().db
        bookmark = await motor_db.user_message.find_one(
            {'user': user, 'bookmarks.bookmark': bookmark_url})
        book = await motor_db.user_message.find_one(
            {'user': user, 'books_url.book_url': book_url})
        context.update(
            is_login=1,
            user=user,
            bookmark=1 if bookmark else 0,
            book=1 if book else 0)
    return template('content.html', **context)
async def chapter(request):
    """
    Return the chapter index page of a novel.

    Request arguments:
    : url           source URL of the chapter index page
    : novels_name   name of the novel

    :return: the rendered ``chapter.html`` page, a redirect to ``url`` when
             its host has no RULES entry, or a plain-text failure message
    """
    args = request.args
    url = args.get('url', None)
    novels_name = args.get('novels_name', None)

    netloc = urlparse(url).netloc
    if netloc not in RULES:
        return redirect(url)

    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if not content:
        return text('解析失败,请将失败页面反馈给本站')

    # Trim brackets and commas from both ends of the stringified content.
    soup = str(content).strip('[],')
    return template('chapter.html',
                    novels_name=novels_name,
                    url=url,
                    content_url=content_url,
                    soup=soup)
async def owllook_content(request):
    """
    Return the chapter content page of a novel.

    Request arguments:
    : url           source URL of the chapter content page
    : chapter_url   source URL of the novel's chapter index
    : novels_name   name of the novel
    : name          forwarded to the template and stored in the bookmark URL

    ``content_url`` is taken from the RULES entry for the URL's host and is
    forwarded to the template, where it decides how page URLs are generated.

    For a logged-in user whose bookshelf contains this novel, the bookmark
    URL of the current page is stored as that book's ``last_read_url``.

    :return: the rendered ``content.html`` page, a redirect to ``url`` when
             its host has no RULES entry, or a plain-text failure message
    """
    args = request.args
    url = args.get('url', None)
    chapter_url = args.get('chapter_url', None)
    novels_name = args.get('novels_name', None)
    name = args.get('name', None)

    # A host without a parsing rule is sent back to the original URL.
    netloc = urlparse(url).netloc
    if netloc not in RULES:
        return redirect(url)

    # Bookmark URL for this page.
    bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
        path=request.path,
        url=url,
        name=name,
        chapter_url=chapter_url,
        novels_name=novels_name)
    # Chapter index URL for this novel.
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url, novels_name=novels_name)

    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_content(url=url, netloc=netloc)
    if not content:
        return text('解析失败,请将失败页面反馈给本站,请重新刷新一次,或者访问源网页:{url}'.format(url=url))

    user = request['session'].get('user', None)
    # Break ad links by mangling every "http" in the content.
    content = str(content).strip('[]Jjs,').replace('http', 'hs')

    # Values shared by the anonymous and the logged-in rendering.
    page = {
        'is_login': 0,
        'name': name,
        'url': url,
        'bookmark': 0,
        'book': 0,
        'content_url': content_url,
        'chapter_url': chapter_url,
        'novels_name': novels_name,
        'soup': content,
    }
    if not user:
        return template('content.html', **page)

    motor_db = MotorBase().db
    shelf_filter = {'user': user, 'books_url.book_url': book_url}
    bookmarked = await motor_db.user_message.find_one(
        {'user': user, 'bookmarks.bookmark': bookmark_url})
    on_shelf = await motor_db.user_message.find_one(shelf_filter)
    if on_shelf:
        # The book source is on the shelf: save the last-read record.
        await motor_db.user_message.update_one(
            shelf_filter,
            {'$set': {'books_url.$.last_read_url': bookmark_url}})

    page.update(
        is_login=1,
        user=user,
        bookmark=1 if bookmarked else 0,
        book=1 if on_shelf else 0)
    return template('content.html', **page)