Example 1
0
async def cache_owllook_novels_content(url, netloc):
    """Fetch a novel chapter page and extract its content, title and next-chapter link.

    :param url: chapter page URL to fetch
    :param netloc: host key used to look up the extraction rules in ``RULES``
    :return: dict with keys ``content``, ``next_chapter`` and ``title`` on success;
             ``None`` when the page could not be fetched or no content matched
    """
    async with aiohttp.ClientSession() as client:
        html = await target_fetch(client=client, url=url)
        if not html:
            return None
        soup = BeautifulSoup(html, 'html5lib')
        selector = RULES[netloc].content_selector
        # Selector priority: id > class > bare tag name.
        if selector.get('id', None):
            content = soup.find_all(id=selector['id'])
        elif selector.get('class', None):
            content = soup.find_all(class_=selector['class'])
        else:
            content = soup.find_all(selector.get('tag'))
        if not content:
            return None
        # Extract the real chapter title (e.g. "第十二章 ..."), stopping at a
        # "_", "," or "-" separator commonly used in page <title> strings.
        title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
        # BUG FIX: <title> may be missing or empty, in which case
        # soup.title / soup.title.string is None and re.findall raised TypeError.
        title = soup.title.string if soup.title and soup.title.string else ''
        extract_title = re.findall(title_reg, title, re.I)
        title = extract_title[0] if extract_title else title
        next_chapter = extract_pre_next_chapter(chapter_url=url,
                                                html=str(soup))
        # BUG FIX: str(content) serialized the ResultSet *list*, embedding
        # "[", "]" and ", " separators in the stored HTML. Join the stringified
        # tags instead, matching the behavior of the newer variant of this
        # function that takes a chapter_url parameter.
        return {
            'content': ''.join(str(tag) for tag in content),
            'next_chapter': next_chapter,
            'title': title
        }
Example 2
0
async def cache_owllook_novels_content(url, chapter_url, netloc):
    """Fetch a chapter page (async first, sync fallback) and extract its content.

    :param url: chapter page URL to fetch
    :param chapter_url: originating chapter URL, forwarded to next-chapter extraction
    :param netloc: host key used to look up the extraction rules in ``RULES``
    :return: dict with keys ``content``, ``next_chapter`` and ``title`` on success;
             ``None`` when the page could not be fetched or no content matched
    """
    headers = {'user-agent': await get_random_user_agent()}
    html = await target_fetch(headers=headers, url=url)
    if not html:
        # Fall back to a synchronous requests-based fetch when the async fetch fails.
        html = get_html_by_requests(url=url, headers=headers)
    if not html:
        return None
    soup = BeautifulSoup(html, 'html5lib')
    selector = RULES[netloc].content_selector
    # Selector priority: id > class > bare tag name.
    if selector.get('id', None):
        content = soup.find_all(id=selector['id'])
    elif selector.get('class', None):
        content = soup.find_all(class_=selector['class'])
    else:
        content = soup.find_all(selector.get('tag'))
    if not content:
        return None
    # Extract the real chapter title (e.g. "第十二章 ..."), stopping at a
    # "_", "," or "-" separator commonly used in page <title> strings.
    title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
    # BUG FIX: <title> may be missing or empty, in which case
    # soup.title / soup.title.string is None and re.findall raised TypeError.
    page_title = soup.title.string if soup.title and soup.title.string else ''
    extract_title = re.findall(title_reg, page_title, re.I)
    if extract_title:
        title = extract_title[0]
    else:
        # BUG FIX: pages without an <h1> made soup.select('h1')[0] raise
        # IndexError; fall back to the raw page title instead.
        h1_tags = soup.select('h1')
        title = h1_tags[0].get_text() if h1_tags else ''
    if not title:
        title = page_title
    next_chapter = extract_pre_next_chapter(url=url,
                                            chapter_url=chapter_url,
                                            html=str(soup))
    # Join the stringified tags; the elements are already str so the extra
    # str(...) wrapper the original applied around ''.join was redundant.
    return {
        'content': ''.join(str(tag) for tag in content),
        'next_chapter': next_chapter,
        'title': title
    }