Example #1
async def data_extraction_for_web_duck(client, html):
    with async_timeout.timeout(15):
        try:
            try:
                title = html.select('h2 a')[0].get_text()
                url = html.select('h2 a')[0].get('href', None)
                # DuckDuckGo wraps the target in the 'uddg' query parameter
                url = parse_qs(urlparse(url).query).get('uddg', ['#'])[0]
                netloc = urlparse(url).netloc
                url = url.replace('index.html', '').replace('Index.html', '')
                if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN or '.html' in url:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
                timestamp = 0
                time = ''
                return {
                    'title': title,
                    'url': url,
                    'time': time,
                    'is_parse': is_parse,
                    'is_recommend': is_recommend,
                    'timestamp': timestamp,
                    'netloc': netloc
                }

            except Exception as e:
                LOGGER.exception(e)
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
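
DuckDuckGo wraps each result target in an 'uddg' query parameter, which the extractor above unwraps. A minimal, self-contained illustration of that decoding step (the href value is made up):

from urllib.parse import parse_qs, urlparse, quote

# Hypothetical DuckDuckGo-style redirect href
href = '//duckduckgo.com/l/?uddg=' + quote('https://www.example.com/book/index.html', safe='')

url = parse_qs(urlparse(href).query).get('uddg', ['#'])[0]
print(url)                            # https://www.example.com/book/index.html
print(urlparse(url).netloc)           # www.example.com
print(url.replace('index.html', ''))  # https://www.example.com/book/
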
Example #2
async def data_extraction_for_web_so(client, html):
    with async_timeout.timeout(15):
        try:
            try:
                url = html.select('h3.res-title a')[0].get('href', None)
                title = html.select('h3.res-title a')[0].get_text()
            except IndexError:
                url = html.select('h3.title a')[0].get('href', None)
                title = html.select('h3.title a')[0].get_text()
            except Exception as e:
                LOGGER.exception(e)
                url, title = None, None
                return None

            url = parse_qs(urlparse(url).query).get('url', None)
            url = url[0] if url else None
            # Guard before parsing further: url is None when the query param was absent
            if not url or 'baidu' in url or 'baike.so.com' in url:
                return None
            netloc = urlparse(url).netloc
            if netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            time = ''
            timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
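
A sketch of how such an extractor might be driven over a whole results page, assuming aiohttp and BeautifulSoup; the 'li.res-list' selector for a single search hit is an assumption, and data_extraction_for_web_so is the coroutine above:

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch_so_results(query):
    async with aiohttp.ClientSession() as client:
        async with client.get('https://www.so.com/s', params={'q': query}) as resp:
            page = await resp.text()
        soup = BeautifulSoup(page, 'html.parser')
        # Run the extractor concurrently over every result node
        extracted = await asyncio.gather(
            *[data_extraction_for_web_so(client, node)
              for node in soup.select('li.res-list')])
    # The extractor returns None for rejected hits; drop them
    return [item for item in extracted if item]
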
Example #3
async def data_extraction_for_web(html):
    with async_timeout.timeout(10):
        try:
            url = html.find('a').get('href', None)
            if not url or 'baidu' in url:
                return None
            netloc = urlparse(url).netloc
            if netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            title = html.select('font[size="3"]')[0].get_text()
            source = html.select('font[color="#008000"]')[0].get_text()
            time = re.findall(r'\d+-\d+-\d+', source)
            time = time[0] if time else None
            timestamp = 0
            if time:
                try:
                    time_list = [int(i) for i in time.split('-')]
                    timestamp = arrow.get(time_list[0], time_list[1], time_list[2]).timestamp
                except Exception as e:
                    LOGGER.exception(e)
                    timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
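
The date handling above leans on arrow; note that .timestamp was a property in arrow < 1.0 (as used here) and became a method, .timestamp(), in 1.0+. The same conversion with only the standard library, on a made-up source string:

import re
from datetime import datetime, timezone

source = 'www.example.com 2017-09-09'  # made-up snippet standing in for the <font> text
found = re.findall(r'\d+-\d+-\d+', source)
time = found[0] if found else None
timestamp = 0
if time:
    y, m, d = (int(i) for i in time.split('-'))
    # arrow.get(y, m, d) builds midnight UTC; datetime can do the same
    timestamp = int(datetime(y, m, d, tzinfo=timezone.utc).timestamp())
print(time, timestamp)  # 2017-09-09 1504915200
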
Example #4
async def chapter(request):
    """
    返回小说章节目录页
    : content_url   这决定当前U页面url的生成方式
    : url           章节目录页源url
    : novels_name   小说名称
    :return: 小说章节内容页
    """
    url = request.args.get('url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    if netloc in REPLACE_RULES.keys():
        url = url.replace(REPLACE_RULES[netloc]['old'],
                          REPLACE_RULES[netloc]['new'])
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if content:
        # Strip list markers, defang scripts, and drop the "add to bookshelf" text
        content = str(content).strip('[],, Jjs').replace(', ', '').replace(
            'onerror', '').replace('js', '').replace('加入书架', '')
        return template('chapter.html',
                        novels_name=novels_name,
                        url=url,
                        content_url=content_url,
                        soup=content)
    else:
        return text('Parsing failed. Please report the failing page to this site, try refreshing once, or visit the source page: {url}'.format(url=url))
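
The handler assumes REPLACE_RULES maps a netloc to an old/new substring pair. A hypothetical entry and the rewrite it produces:

# Hypothetical rule: rewrite a mobile chapter URL to the desktop host
REPLACE_RULES = {
    'm.example.com': {'old': 'm.example.com', 'new': 'www.example.com'},
}

url = 'https://m.example.com/book/123/'
netloc = 'm.example.com'
if netloc in REPLACE_RULES:
    url = url.replace(REPLACE_RULES[netloc]['old'], REPLACE_RULES[netloc]['new'])
print(url)  # https://www.example.com/book/123/
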
Example #5
async def data_extraction_for_web_baidu(client, html):
    with async_timeout.timeout(20):
        try:
            url = html.select('h3.t a')[0].get('href', None)
            real_url = await get_real_url(client=client, url=url) if url else None
            if real_url:
                netloc = urlparse(real_url).netloc
                if 'baidu' in real_url or netloc in BLACK_DOMAIN:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                title = html.select('h3.t a')[0].get_text()
                timestamp = 0
                time = ""
                return {
                    'title': title,
                    'url': real_url.replace('index.html', ''),
                    'time': time,
                    'is_parse': is_parse,
                    'timestamp': timestamp,
                    'netloc': netloc
                }
            else:
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
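
Baidu result hrefs point at a redirector, so the extractor resolves each one via get_real_url before filtering. One plausible implementation, assuming an aiohttp client and that following HTTP redirects is sufficient (the real project may also handle meta-refresh pages); LOGGER comes from the surrounding module:

import async_timeout

async def get_real_url(client, url):
    # Sketch: follow the link shim's redirects and return the final URL
    try:
        async with async_timeout.timeout(10):
            async with client.get(url, allow_redirects=True) as resp:
                return str(resp.url)
    except Exception as e:
        LOGGER.exception(e)
        return None
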
Example #6
async def chapter(request):
    """
    返回小说章节目录页
    : content_url   这决定当前U页面url的生成方式
    : url           章节目录页源url
    : novels_name   小说名称
    :return: 小说章节内容页
    """
    url = request.args.get('url', None)
    novels_name = request.args.get('novels_name', None)
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    if netloc in REPLACE_RULES.keys():
        url = url.replace(REPLACE_RULES[netloc]['old'],
                          REPLACE_RULES[netloc]['new'])
    content_url = RULES[netloc].content_url
    content = await cache_owllook_novels_chapter(url=url, netloc=netloc)
    if content:
        # Strip list markers, defang scripts, and drop the "add to bookshelf" text
        content = str(content).strip('[],, Jjs').replace(', ', '').replace(
            'onerror', '').replace('js', '').replace('加入书架', '')
        if request.args.get('add_kindle', None):
            # areader (defined elsewhere) is presumably an HTMLParser subclass
            # that collects (title, url) chapter tuples into h.chapters
            h = areader()
            h.feed(content)
            # content_url flags how chapter links are built; semantics inferred
            # from this handler: '1' -> no prefix, '0'/'-1' -> prefix with the page url
            if content_url == '1':
                content_url = ''
            elif content_url in ('0', '-1'):
                content_url = url
            (a, b) = h.chapters[0]  # peek at the first chapter; also asserts the list is non-empty
            link = "http://127.0.0.1:8001/owllook_content?url=" + content_url + "%s&name=%s&chapter_url=" + url + "&novels_name=%s"
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.connect(('127.0.0.1', 31419))
            f = open('/tmp/ow_links', 'w')
            print(len(h.chapters))
            # 'jjj' is presumably the json module under an alias ('json' is taken by
            # sanic's json response helper)
            jjj.dump([{
                'title': title,
                'url': link % (curl, urllib.parse.quote(title),
                               urllib.parse.quote(novels_name))
            } for (title, curl) in h.chapters], f)
            f.close()
            # Notify the local push daemon with a pickled summary tuple
            s.send(
                pickle.dumps(
                    (novels_name, len(h.chapters), "*****@*****.**")))
            return redirect("https://fss.cjwddtc.win")
        return template('chapter.html',
                        novels_name=novels_name,
                        url=url,
                        content_url=content_url,
                        soup=content)
    else:
        return text('Parsing failed. Please report the failing page to this site, try refreshing once, or visit the source page: {url}'.format(url=url))
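
The add_kindle branch hands off through two ad-hoc channels: a JSON chapter list written to /tmp/ow_links and a pickled (novels_name, chapter_count, email) tuple sent to a local daemon on port 31419. The daemon itself is not shown in the source; a minimal receiver sketch under those assumptions:

import json
import pickle
import socket

def serve_once(host='127.0.0.1', port=31419):
    # Accept one connection and unpickle the summary tuple. pickle is unsafe
    # for untrusted peers, tolerable here only because the socket is loopback-only.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as srv:
        srv.bind((host, port))
        srv.listen(1)
        conn, _ = srv.accept()
        with conn:
            payload = conn.recv(65536)  # assumes the tuple fits in one recv
    novels_name, count, email = pickle.loads(payload)
    with open('/tmp/ow_links') as f:
        links = json.load(f)  # the chapter list written by the handler above
    assert len(links) == count
    return novels_name, links, email
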
Example #7
async def data_extraction_for_web_so(client, html):
    with async_timeout.timeout(15):
        try:
            # 2017.09.09 change: grab title && url more robustly
            try:
                title = html.select('h3 a')[0].get_text()
                url = html.select('h3 a')[0].get('href', None)
            except Exception as e:
                LOGGER.exception(e)
                url, title = None, None
                return None
            # Extract the real url according to the link shim format
            if "www.so.com/link?m=" in url:
                url = html.select('h3 a')[0].get('data-url', None)
            if "www.so.com/link?url=" in url:
                url = parse_qs(urlparse(url).query).get('url', None)
                url = url[0] if url else None

            # Guard before parsing: url may be None if both shims failed to yield one
            if not url or 'baidu' in url or 'baike.so.com' in url:
                return None
            netloc = urlparse(url).netloc
            if netloc in BLACK_DOMAIN:
                return None
            is_parse = 1 if netloc in RULES.keys() else 0
            is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
            time = ''
            timestamp = 0
            return {
                'title': title,
                'url': url.replace('index.html', '').replace('Index.html', ''),
                'time': time,
                'is_parse': is_parse,
                'is_recommend': is_recommend,
                'timestamp': timestamp,
                'netloc': netloc
            }
        except Exception as e:
            LOGGER.exception(e)
            return None
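
The two so.com link shims handled above can be exercised in isolation. A self-contained check with made-up hrefs:

from urllib.parse import parse_qs, urlparse, quote
from bs4 import BeautifulSoup

target = 'https://www.example.com/book/'
doc = BeautifulSoup(
    '<h3><a href="https://www.so.com/link?m=abc" data-url="{0}">A</a></h3>'
    '<h3><a href="https://www.so.com/link?url={1}">B</a></h3>'.format(
        target, quote(target, safe='')),
    'html.parser')

for a in doc.select('h3 a'):
    url = a.get('href', None)
    if 'www.so.com/link?m=' in url:
        url = a.get('data-url', None)  # newer shim: target sits in data-url
    if 'www.so.com/link?url=' in url:
        url = parse_qs(urlparse(url).query).get('url', [None])[0]  # older shim
    print(url)  # https://www.example.com/book/ in both cases
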
Example #8
async def data_extraction_for_web_bing(client, html):
    with async_timeout.timeout(15):
        try:
            try:
                title = html.select('h2 a')[0].get_text()
                url = html.select('h2 a')[0].get('href', None)
                netloc = urlparse(url).netloc
                url = url.replace('index.html', '').replace('Index.html', '')
                if not url or 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN or '.html' in url:
                    return None
                is_parse = 1 if netloc in RULES.keys() else 0
                is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
                timestamp = 0
                time = ''
                return {
                    'title': title,
                    'url': url,
                    'time': time,
                    'is_parse': is_parse,
                    'is_recommend': is_recommend,
                    'timestamp': timestamp,
                    'netloc': netloc
                }

            except Exception as e:
                LOGGER.exception(e)
                return None
        except Exception as e:
            LOGGER.exception(e)
            return None
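
All of these extractors enter the timeout with a synchronous 'with', which async_timeout supported through 3.x (with a deprecation warning); 4.x accepts only the asynchronous form. The modern equivalent of the guard used above:

import async_timeout

async def guarded_extract(html):
    # async_timeout >= 4 requires 'async with'; the extraction body is unchanged
    async with async_timeout.timeout(15):
        ...  # same select/filter logic as above

# On Python 3.11+, asyncio.timeout(15) drops the third-party dependency entirely.
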
Example #9
async def owllook_content(request):
    """
    返回小说章节内容页
    : content_url   这决定当前U页面url的生成方式
    : url           章节内容页源url
    : chapter_url   小说目录源url
    : novels_name   小说名称
    :return: 小说章节内容页
    """
    url = request.args.get('url', None)
    chapter_url = request.args.get('chapter_url', None)
    novels_name = request.args.get('novels_name', None)
    name = request.args.get('name', '')
    is_ajax = request.args.get('is_ajax', '')
    # If the content url is not covered by any parsing rule, redirect to the source url
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    user = request['session'].get('user', None)
    # Build the chapter-index url
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url,
        novels_name=novels_name)
    motor_db = motor_base.get_db()
    if url == chapter_url:
        # When the reader reaches the newest chapter, persist the latest-read position
        if user and is_ajax == "owl_cache":
            referer = request.headers.get('Referer', '')
            owl_referer = referer.split('owllook_content')[1] if 'owllook_content' in referer else ''
            if owl_referer:
                latest_read = "/owllook_content" + owl_referer
                await motor_db.user_message.update_one(
                    {'user': user, 'books_url.book_url': book_url},
                    {'$set': {'books_url.$.last_read_url': latest_read}})
        return redirect(book_url)
    content_url = RULES[netloc].content_url
    content_data = await cache_owllook_novels_content(url=url, netloc=netloc)
    if content_data:
        try:
            content = content_data.get('content', 'Fetch failed')
            next_chapter = content_data.get('next_chapter', [])
            title = content_data.get('title', '')
            if novels_name:
                title = title.replace(novels_name, '')
            name = title if title else name
            # Build the bookmark url for this page
            bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
                path=request.path,
                url=url,
                name=name,
                chapter_url=chapter_url,
                novels_name=novels_name
            )
            # Defang ad links: strip list markers and break embedded 'http' URLs
            content = str(content).strip('[]Jjs,').replace('http', 'hs')
            if user:
                bookmark = await motor_db.user_message.find_one({'user': user, 'bookmarks.bookmark': bookmark_url})
                book = await motor_db.user_message.find_one({'user': user, 'books_url.book_url': book_url})
                bookmark = 1 if bookmark else 0
                if book:
                    # The book already exists on the user's bookshelf
                    book = 1
                    # Save the last reading position
                    if is_ajax == "owl_cache":
                        referer = request.headers.get('Referer', bookmark_url)
                        if 'owllook_content' in referer:
                            owl_referer = referer.split('owllook_content')[1]
                            latest_read = "/owllook_content" + owl_referer
                            await motor_db.user_message.update_one(
                                {'user': user, 'books_url.book_url': book_url},
                                {'$set': {'books_url.$.last_read_url': latest_read}})
                else:
                    book = 0
                if is_ajax == "owl_cache":
                    owl_cache_dict = dict(
                        is_login=1,
                        user=user,
                        name=name,
                        url=url,
                        bookmark=bookmark,
                        book=book,
                        content_url=content_url,
                        chapter_url=chapter_url,
                        novels_name=novels_name,
                        next_chapter=next_chapter,
                        soup=content
                    )
                    return json(owl_cache_dict)
                return template(
                    'content.html',
                    is_login=1,
                    user=user,
                    name=name,
                    url=url,
                    bookmark=bookmark,
                    book=book,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
            else:
                if is_ajax == "owl_cache":
                    owl_cache_dict = dict(
                        is_login=0,
                        name=name,
                        url=url,
                        bookmark=0,
                        book=0,
                        content_url=content_url,
                        chapter_url=chapter_url,
                        novels_name=novels_name,
                        next_chapter=next_chapter,
                        soup=content
                    )
                    return json(owl_cache_dict)
                return template(
                    'content.html',
                    is_login=0,
                    name=name,
                    url=url,
                    bookmark=0,
                    book=0,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
        except Exception as e:
            LOGGER.exception(e)
            return redirect(book_url)
    else:
        if user:
            return template('parse_error.html', url=url, is_login=1, user=user)
        else:
            return template('parse_error.html', url=url, is_login=0)
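
The bookshelf updates rely on MongoDB's positional '$' operator: the array element matched by 'books_url.book_url' has its last_read_url rewritten in place. A sketch of the assumed user_message document shape, with the update repeated for reference (motor's update_one is the awaitable twin of pymongo's):

# Assumed document shape, with field names taken from the handler above
user_message_doc = {
    'user': 'alice',
    'books_url': [
        {'book_url': '/chapter?url=...&novels_name=...',
         'last_read_url': '/owllook_content?...'},
    ],
    'bookmarks': [
        {'bookmark': '/owllook_content?url=...'},
    ],
}

# await motor_db.user_message.update_one(
#     {'user': user, 'books_url.book_url': book_url},
#     {'$set': {'books_url.$.last_read_url': latest_read}})
# '$' resolves to the index of the first books_url element matched by the filter.
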
Example #10
async def owllook_content(request):
    """
    返回小说章节内容页
    : content_url   这决定当前U页面url的生成方式
    : url           章节内容页源url
    : chapter_url   小说目录源url
    : novels_name   小说名称
    :return: 小说章节内容页
    """
    url = request.args.get('url', None)
    chapter_url = request.args.get('chapter_url', None)
    novels_name = request.args.get('novels_name', None)
    name = request.args.get('name', '')
    # If the content url is not covered by any parsing rule, redirect to the source url
    netloc = get_netloc(url)
    if netloc not in RULES.keys():
        return redirect(url)
    # Build the chapter-index url
    book_url = "/chapter?url={chapter_url}&novels_name={novels_name}".format(
        chapter_url=chapter_url,
        novels_name=novels_name)
    if url == chapter_url:
        return redirect(book_url)
    content_url = RULES[netloc].content_url
    content_data = await cache_owllook_novels_content(url=url, netloc=netloc)
    if content_data:
        user = request['session'].get('user', None)
        try:
            content = content_data.get('content', 'Fetch failed')
            next_chapter = content_data.get('next_chapter', [])
            title = content_data.get('title', '')
            if novels_name:
                title = title.replace(novels_name, '')
            name = title if title else name
            # Build the bookmark url for this page
            bookmark_url = "{path}?url={url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
                path=request.path,
                url=url,
                name=name,
                chapter_url=chapter_url,
                novels_name=novels_name
            )
            # Defang ad links: strip list markers and break embedded 'http' URLs
            content = str(content).strip('[]Jjs,').replace('http', 'hs')
            if user:
                motor_db = motor_base.db
                bookmark = await motor_db.user_message.find_one({'user': user, 'bookmarks.bookmark': bookmark_url})
                book = await motor_db.user_message.find_one({'user': user, 'books_url.book_url': book_url})
                bookmark = 1 if bookmark else 0
                if book:
                    # The book already exists on the user's bookshelf
                    book = 1
                    # Save the last reading position
                    await motor_db.user_message.update_one(
                        {'user': user, 'books_url.book_url': book_url},
                        {'$set': {'books_url.$.last_read_url': bookmark_url}})
                else:
                    book = 0
                return template(
                    'content.html',
                    is_login=1,
                    user=user,
                    name=name,
                    url=url,
                    bookmark=bookmark,
                    book=book,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
            else:
                return template(
                    'content.html',
                    is_login=0,
                    name=name,
                    url=url,
                    bookmark=0,
                    book=0,
                    content_url=content_url,
                    chapter_url=chapter_url,
                    novels_name=novels_name,
                    next_chapter=next_chapter,
                    soup=content)
        except Exception as e:
            LOGGER.exception(e)
            return redirect(book_url)
    else:
        return text('Parsing failed or there is no next page. Please report the failing page to this site, try refreshing once, or visit the source page: {url}'.format(url=url))
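
cache_owllook_novels_content is a cached wrapper around the rule-driven fetch. One plausible shape, assuming aiocache's cached decorator; fetch_novel_content is a made-up stand-in for the project's real per-site fetcher:

from aiocache import cached
from aiocache.serializers import PickleSerializer

@cached(ttl=300, serializer=PickleSerializer())
async def cache_owllook_novels_content(url, netloc):
    # Hypothetical dispatch; the real project routes through RULES[netloc]
    return await fetch_novel_content(url=url, netloc=netloc)
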