Example #1
0
async def update_all_books(loop, timeout=15):
    """Sweep every user's bookshelf and refresh the latest chapter of each book.

    :param loop: event loop (kept for interface compatibility; unused here).
    :param timeout: per-request timeout in seconds, forwarded to the fetcher.
    :return: ``False`` if the outer query/iteration fails, otherwise ``None``.
    """
    try:
        motor_db = MotorBase().get_db()
        # Cursor over every user's bookshelf links (only the URLs are projected).
        books_url_cursor = motor_db.user_message.find({}, {
            'books_url.book_url': 1,
            '_id': 0
        })
        already_urls = set()
        async for document in books_url_cursor:
            if document:
                books_url = document['books_url']
                for book_url in books_url:
                    chapter_url = book_url['book_url']
                    # Deduplicate: several users may shelve the same book.
                    if chapter_url not in already_urls:
                        try:
                            await get_the_latest_chapter(chapter_url, timeout)
                        except Exception as e:
                            # Best-effort: one failing book must not stop the sweep.
                            LOGGER.exception(e)
                        already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
Example #2
0
class ZHNovelInfoSpider(Spider):
    """Spider for a single Zongheng novel's detail page."""

    start_urls = []
    request_config = {'RETRIES': 3, 'DELAY': 2, 'TIMEOUT': 10}
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Parse one novel-info page and upsert the record into Mongo."""
        item = await ZHNovelInfoItem.get_item(html=res.html)

        info = {
            'novel_name': item.novel_name,
            'author': item.author,
            'cover': item.cover,
            'abstract': item.abstract,
            'status': item.status,
            'novels_type': item.novels_type,
            'novel_chapter_url': item.novel_chapter_url,
            'target_url': res.url,
            'spider': 'zongheng',
            'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
        }

        print('获取 {} 小说信息成功'.format(info['novel_name']))
        print(info)
        # Upsert keyed on (novel_name, spider) so re-crawls refresh in place.
        query = {'novel_name': info['novel_name'], 'spider': 'zongheng'}
        await self.motor_db.all_novels_info.update_one(
            query, {'$set': info}, upsert=True)
class ZHNovelsSpider(Spider):
    """Spider that crawls Zongheng's novel listing page and upserts each novel."""
    start_urls = [
        'http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html'
    ]

    request_config = {'RETRIES': 8, 'DELAY': 0, 'TIMEOUT': 3}
    concurrency = 60
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Extract every novel entry from the listing page and save them concurrently."""
        items_data = await ZHNovelsItem.get_items(html=res.html)
        tasks = []
        for item in items_data:
            if item.novel_url:
                res_dic = {
                    'novel_url': item.novel_url,
                    'novel_name': item.novel_name,
                    'novel_author': item.novel_author,
                    'novel_author_home_url': item.novel_author_home_url,
                    'novel_type': item.novel_type,
                    'novel_cover': item.novel_cover,
                    'novel_abstract': item.novel_abstract,
                    'novel_latest_chapter': item.novel_latest_chapter,
                    'spider': 'zongheng',
                    'updated_at': time.strftime("%Y-%m-%d %X",
                                                time.localtime()),
                }
                tasks.append(asyncio.ensure_future(self.save(res_dic)))
        good_nums = 0
        if tasks:
            # Wait for every save; count those that reported success (True).
            done_list, pending_list = await asyncio.wait(tasks)
            for task in done_list:
                if task.result():
                    good_nums += 1
        print(f"共{len(tasks)}本小说,抓取成功{good_nums}本")

    async def save(self, res_dic):
        """Upsert one novel record into ``all_novels``.

        :param res_dic: novel metadata dict produced by ``parse``.
        :return: ``True`` when the write succeeded, ``False`` otherwise.
        """
        try:
            # Upsert keyed on (novel_url, novel_name) so re-crawls update in place.
            await self.motor_db.all_novels.update_one(
                {
                    'novel_url': res_dic['novel_url'],
                    'novel_name': res_dic['novel_name']
                }, {'$set': res_dic},
                upsert=True)
            print(res_dic['novel_name'] + ' - 抓取成功')
            return True
        except Exception as e:
            self.logger.exception(e)
            return False
Example #4
0
async def cache_others_search_ranking(spider='qidian', novel_type='全部类别'):
    """Fetch the cached ranking document for a given spider and novel type.

    :param spider: source-site identifier (defaults to qidian).
    :param novel_type: ranking category name.
    :return: the matching document's ``data`` field wrapper, or ``None``.
    """
    db = MotorBase().get_db()
    query = {'spider': spider, 'type': novel_type}
    projection = {'data': 1, '_id': 0}
    return await db.novels_ranking.find_one(query, projection)
Example #5
0
 async def save(self, res_dic):
     """Upsert the scraped heiyan novel info into Mongo (best-effort).

     :param res_dic: novel metadata dict keyed by ``novel_name``.
     """
     # Persist to the database.
     try:
         motor_db = MotorBase().get_db()
         # Upsert keyed on (novel_name, spider) so re-crawls update in place.
         await motor_db.all_novels_info.update_one(
             {
                 'novel_name': res_dic['novel_name'],
                 'spider': 'heiyan'
             }, {'$set': res_dic},
             upsert=True)
     except Exception as e:
         # Deliberately swallow after logging: saving is best-effort.
         self.logger.exception(e)
Example #6
0
class QidianNovelsSpider(Spider):
    """Spider that walks Qidian listing pages and stores each novel record."""
    # start_urls = ['https://www.qidian.com/all?page=1']

    request_config = {'RETRIES': 15, 'DELAY': 0, 'TIMEOUT': 3}
    concurrency = 20
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Schedule a concurrent save for every novel found on the page."""
        items_data = await QidianNovelsItem.get_items(html=res.html)
        tasks = [
            asyncio.ensure_future(self.save({
                'novel_url': item.novel_url,
                'novel_name': item.novel_name,
                'novel_author': item.novel_author,
                'novel_author_home_url': item.novel_author_home_url,
                'novel_type': item.novel_type,
                'novel_cover': item.novel_cover,
                'novel_abstract': item.novel_abstract,
                'spider': 'qidian',
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }))
            for item in items_data
        ]

        good_nums = 0
        if tasks:
            done_list, _pending = await asyncio.wait(tasks)
            good_nums = sum(1 for task in done_list if task.result())
        print(f"共{len(tasks)}本小说,抓取成功{good_nums}本")

    async def save(self, res_dic):
        """Upsert one novel into ``all_novels``; True on success, False on error."""
        try:
            query = {
                'novel_url': res_dic['novel_url'],
                'novel_name': res_dic['novel_name']
            }
            await self.motor_db.all_novels.update_one(
                query, {'$set': res_dic}, upsert=True)
            print(res_dic['novel_name'] + ' - 抓取成功')
            return True
        except Exception as e:
            self.logger.exception(e)
            return False
Example #7
0
async def cache_owllook_search_ranking():
    """Return the top 35 search keywords with at least 50 hits, by count desc.

    :return: list of ``{'keyword', 'count', 'index'}`` dicts, index starting at 1.
    """
    db = MotorBase().get_db()
    cursor = db.search_records.find(
        {'count': {'$gte': 50}},
        {'keyword': 1, 'count': 1, '_id': 0},
    ).sort('count', -1).limit(35)
    ranking = []
    # NOTE: enumerate() cannot wrap an async iterator, so the 1-based
    # rank is derived from the list length instead.
    async for doc in cursor:
        ranking.append({
            'keyword': doc['keyword'],
            'count': doc['count'],
            'index': len(ranking) + 1,
        })
    return ranking
Example #8
0
    async def parse(self, res):
        """Scrape a heiyan novel-info page and hand the record to ``save``.

        :param res: fetched response object exposing ``html`` and ``url``.
        """
        self.motor_db = MotorBase(loop=self.loop).get_db()
        item = await HYNovelInfoItem.get_item(html=res.html)

        # Flatten the parsed item into the document stored in Mongo.
        record = {
            'novel_name': item.novel_name,
            'author': item.author,
            'cover': item.cover,
            'abstract': item.abstract,
            'status': item.status,
            'novels_type': item.novels_type,
            'novel_chapter_url': item.novel_chapter_url,
            'latest_chapter': item.latest_chapter,
            'latest_chapter_time': item.latest_chapter_time,
            'spider': 'heiyan',
            'target_url': res.url,
            'updated_at': time.strftime("%Y-%m-%d %X", time.localtime())
        }

        print('获取 {} 小说信息成功'.format(record['novel_name']))
        await self.save(res_dic=record)
Example #9
0
async def get_the_latest_chapter(chapter_url, timeout=15):
    """Resolve and persist the latest chapter of a novel from an owllook chapter URL.

    The source page URL and novel name are carried in ``chapter_url``'s query
    string (``url`` and ``novels_name``). The host must have an entry in
    ``LATEST_RULES``; extraction then follows either the rule's meta-tag plan
    or its CSS/id/class selector. On success the result is upserted into the
    ``latest_chapter`` collection.

    :param chapter_url: owllook chapter URL whose query string embeds the target.
    :param timeout: overall timeout in seconds for fetching the page.
    :return: the extracted-data dict, or ``None`` on failure / no match.
    """
    try:
        with async_timeout.timeout(timeout):
            # Pull the real source URL and novel name out of the query string.
            url = parse_qs(urlparse(chapter_url).query).get('url', '')
            novels_name = parse_qs(urlparse(chapter_url).query).get(
                'novels_name', '')
            data = None
            if url and novels_name:
                # parse_qs returns lists; take the first value of each.
                url = url[0]
                novels_name = novels_name[0]
                netloc = urlparse(url).netloc
                if netloc in LATEST_RULES.keys():
                    headers = {'user-agent': await get_random_user_agent()}
                    # Fetch the page: async first, then fall back to the
                    # blocking requests-based fetcher if that yields nothing.
                    try:
                        html = await target_fetch(url=url,
                                                  headers=headers,
                                                  timeout=timeout)
                        if html is None:
                            html = get_html_by_requests(url=url,
                                                        headers=headers,
                                                        timeout=timeout)
                    except TypeError:
                        # NOTE(review): TypeError from target_fetch is treated
                        # as "retry synchronously" — presumably a known quirk.
                        html = get_html_by_requests(url=url,
                                                    headers=headers,
                                                    timeout=timeout)
                    except Exception as e:
                        LOGGER.exception(e)
                        return None
                    try:
                        soup = BeautifulSoup(html, 'html5lib')
                    except Exception as e:
                        LOGGER.exception(e)
                        return None
                    latest_chapter_name, latest_chapter_url = None, None
                    if LATEST_RULES[netloc].plan:
                        # Plan A: read the chapter name/URL from <meta> tags,
                        # trying property= first, then name=.
                        meta_value = LATEST_RULES[netloc].meta_value
                        latest_chapter_name = soup.select(
                            'meta[property="{0}"]'.format(
                                meta_value["latest_chapter_name"])
                        ) or soup.select('meta[name="{0}"]'.format(
                            meta_value["latest_chapter_name"]))

                        latest_chapter_name = latest_chapter_name[0].get(
                            'content', None) if latest_chapter_name else None
                        latest_chapter_url = soup.select(
                            'meta[property="{0}"]'.format(
                                meta_value["latest_chapter_url"])
                        ) or soup.select('meta[name="{0}"]'.format(
                            meta_value["latest_chapter_url"]))
                        latest_chapter_url = urljoin(
                            chapter_url, latest_chapter_url[0].get(
                                'content',
                                None)) if latest_chapter_url else None
                    else:
                        # Plan B: locate the chapter link via the rule's
                        # id / class / tag selector, in that priority order.
                        selector = LATEST_RULES[netloc].selector
                        content_url = selector.get('content_url')
                        if selector.get('id', None):
                            latest_chapter_soup = soup.find_all(
                                id=selector['id'])
                        elif selector.get('class', None):
                            latest_chapter_soup = soup.find_all(
                                class_=selector['class'])
                        else:
                            latest_chapter_soup = soup.select(
                                selector.get('tag'))
                        if latest_chapter_soup:
                            if content_url == '1':
                                # TODO
                                pass
                            elif content_url == '0':
                                # TODO
                                pass
                            else:
                                # content_url acts as the URL prefix here.
                                latest_chapter_url = content_url + latest_chapter_soup[
                                    0].get('href', None)
                            latest_chapter_name = latest_chapter_soup[0].get(
                                'title', None)
                    if latest_chapter_name and latest_chapter_url:
                        time_current = get_time()
                        # print(latest_chapter_url)
                        data = {
                            "latest_chapter_name":
                            latest_chapter_name,
                            "latest_chapter_url":
                            latest_chapter_url,
                            "owllook_chapter_url":
                            chapter_url,
                            "owllook_content_url":
                            "/owllook_content?url={latest_chapter_url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}"
                            .format(
                                latest_chapter_url=latest_chapter_url,
                                name=latest_chapter_name,
                                chapter_url=url,
                                novels_name=novels_name,
                            ),
                        }
                        # Store the latest chapter.
                        motor_db = MotorBase().get_db()
                        await motor_db.latest_chapter.update_one(
                            {
                                "novels_name": novels_name,
                                'owllook_chapter_url': chapter_url
                            }, {
                                '$set': {
                                    'data': data,
                                    "finished_at": time_current
                                }
                            },
                            upsert=True)
            return data
    except Exception as e:
        LOGGER.exception(e)
        return None
Example #10
0
def setup_db(rank_bp, loop):
    """Initialise the module-level ``motor_base`` connection holder.

    :param rank_bp: blueprint this hook is registered on (unused here).
    :param loop: event loop supplied by the framework (unused here).
    """
    global motor_base
    motor_base = MotorBase()
Example #11
0
def setup_db(admin_bp, loop):
    """Initialise the module-level ``motor_base`` connection holder.

    :param admin_bp: blueprint this hook is registered on (unused here).
    :param loop: event loop supplied by the framework (unused here).
    """
    global motor_base
    motor_base = MotorBase()
Example #12
0
def setup_db(operate_bp, loop):
    """Initialise the module-level ``motor_base`` connection holder.

    :param operate_bp: blueprint this hook is registered on (unused here).
    :param loop: event loop supplied by the framework (unused here).
    """
    global motor_base
    motor_base = MotorBase()
Example #13
0
def setup_db(novels_bp, loop):
    """Initialise the module-level ``motor_base`` connection holder.

    :param novels_bp: blueprint this hook is registered on (unused here).
    :param loop: event loop supplied by the framework (unused here).
    """
    global motor_base
    motor_base = MotorBase()