Example 1
class QidianNovelsSpider(Spider):
    start_urls = ['https://www.qidian.com/all?page=1']
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    request_config = {'RETRIES': 3, 'DELAY': 0, 'TIMEOUT': 10}
    all_novels_col = MongoDb().db.all_novels

    def parse(self, res):
        urls = [
            'https://www.qidian.com/all?page={i}'.format(i=i)
            for i in range(1, 41645)
        ]
        for url in urls:
            headers = {"User-Agent": get_random_user_agent()}
            yield Request(url,
                          request_config=self.request_config,
                          headers=headers,
                          callback=self.parse_item)

    def parse_item(self, res):
        items_data = QidianNovelsItem.get_items(html=res.html)
        for item in items_data:
            data = {
                'novel_url': item.novel_url,
                'novel_name': item.novel_name,
                'novel_author': item.novel_author,
                'novel_author_home_url': item.novel_author_home_url,
                'spider': 'qidian'
            }
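            # Deduplicate by novel_name: insert only novels that are not already stored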
            if self.all_novels_col.find_one({"novel_name": item.novel_name}) is None:
                self.all_novels_col.insert_one(data)
                print(item.novel_name + ' - crawled successfully')
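All of these examples build their request headers with get_random_user_agent() from talospider.utils. As a rough, hedged sketch of what such a helper can look like (an illustration only, not the actual talospider implementation), it just needs to pick one entry at random from a pool of browser User-Agent strings:

import random

# Hypothetical pool for illustration; a real helper would ship a much larger list.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0",
]


def get_random_user_agent():
    # Return one User-Agent string chosen uniformly at random.
    return random.choice(_USER_AGENTS)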
Example 2
 def parse(self, res):
     # 752
     urls = ['http://book.zongheng.com/store/c0/c0/b9/u0/p{i}/v9/s9/t0/ALL.html'.format(i=i) for i in range(1, 752)]
     for url in urls:
         headers = {
             "User-Agent": get_random_user_agent()
         }
         yield Request(url, request_config=self.request_config, headers=headers, callback=self.parse_item)
Example 3
class ZHNovelsSpider(Spider):
    start_urls = [
        'http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html'
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    request_config = {'RETRIES': 3, 'DELAY': 1, 'TIMEOUT': 10}
    all_novels_col = MongoDb().db.all_novels

    def parse(self, res):
        # 752
        urls = [
            'http://book.zongheng.com/store/c0/c0/b9/u0/p{i}/v9/s9/t0/ALL.html'
            .format(i=i) for i in range(1, 752)
        ]
        for url in urls:
            headers = {"User-Agent": get_random_user_agent()}
            yield Request(url,
                          request_config=self.request_config,
                          headers=headers,
                          callback=self.parse_item)

    def parse_item(self, res):
        items_data = ZHNovelsItem.get_items(html=res.html)

        for item in items_data:
            if item.novel_url:
                res_dic = {
                    'novel_url': item.novel_url,
                    'novel_name': item.novel_name,
                    'novel_author': item.novel_author,
                    'novel_author_home_url': item.novel_author_home_url,
                    'spider': 'zongheng',
                    'updated_at': time.strftime("%Y-%m-%d %X",
                                                time.localtime()),
                }
                if self.all_novels_col.find_one(
                    {
                        "novel_name": item.novel_name,
                        'novel_author': item.novel_author
                    }) is None:
                    self.all_novels_col.insert_one(res_dic)
                    # async_callback(self.save, res_dic=res_dic)
                    print(item.novel_name + ' - crawled successfully')

    async def save(self, **kwargs):
        # Save into the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.all_novels.update_one(
                {
                    'novel_url': res_dic['novel_url'],
                    'novel_author': res_dic['novel_author']
                }, {'$set': res_dic},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
Example 4
class BdNovelSpider(Spider):
    start_urls = [
        'http://book.zongheng.com/api/rank/getZongHengRankList.htm?rankType=1&pageNum=1&pageSize=20'
    ]
    set_mul = True
    headers = {"User-Agent": get_random_user_agent()}

    def start_request(self):
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config'),
                          headers=getattr(self, 'headers', None),
                          callback=self.parse,
                          file_type="json")

    def parse(self, res):
        # The request was made with file_type="json", so res.html holds the parsed
        # ranking entries rather than raw HTML.
        rank_data = res.html
        result = []
        res_dic = {}
        if rank_data:
            for each_data in rank_data:
                data = {
                    'name': each_data.get('bookName', ''),
                    'type': each_data.get('bookShortCateName', ''),
                    'num': each_data.get('orderNo', ''),
                    'updated_at': time.strftime("%Y-%m-%d %X",
                                                time.localtime()),
                }
                result.append(data)
            res_dic['data'] = result
            res_dic['target_url'] = res.url
            res_dic['type'] = "全部类别"
            res_dic['spider'] = "zh_bd_novels"
        async_callback(self.save, res_dic=res_dic)

    async def save(self, **kwargs):
        # Save into the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
Example 5
 def parse(self, res):
     urls = [
         'https://www.qidian.com/all?page={i}'.format(i=i)
         for i in range(1, 41645)
     ]
     for url in urls:
         headers = {"User-Agent": get_random_user_agent()}
         yield Request(url,
                       request_config=self.request_config,
                       headers=headers,
                       callback=self.parse_item)
Example 6
 def parse(self, res):
     # Convert the html into an etree
     etree = self.e_html(res.html)
     # Extract the target values to build the new urls
     pages = [i.get('href') for i in etree.cssselect('.paginator>a')]
     pages.insert(0, '?start=0&filter=')
     headers = {
         "User-Agent": get_random_user_agent()
     }
     for page in pages:
         url = self.start_urls[0] + page
         yield Request(url, request_config=self.request_config, headers=headers, callback=self.parse_item)
Example 7
class QidianRankingSpider(Spider):
    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    def parse(self, res):
        items_data = RankingItem.get_items(html=res.html)
        result = []
        res_dic = {}
        for item in items_data:
            each_book_list = []
            # Only keep the top ten books of each ranking
            for index, value in enumerate(item.book_list[:10]):
                item_data = NameItem.get_item(html_etree=value)
                name = item_data.get('top_name') or item_data.get('other_name')
                each_book_list.append({'num': index + 1, 'name': name})
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = self.qidian_type.get(res.url.split('=')[-1])
        res_dic['spider'] = "qidian"
        with open('qidian_ranking.txt', 'a+') as f:
            f.write(json.dumps(res_dic) + '\n')
Example 8
 def _get_html(cls, html, url, html_etree, params, **kwargs):
     if html:
         html = etree.HTML(html)
     elif url:
         if not kwargs.get('headers', None):
             kwargs['headers'] = {"User-Agent": get_random_user_agent()}
         response = requests.get(url, params, **kwargs)
         response.raise_for_status()
         content = response.content
         charset = cchardet.detect(content)
         text = content.decode(charset['encoding'])
         html = etree.HTML(text)
     elif html_etree is not None:
         return html_etree
     else:
         raise ValueError("one of html, url or html_etree is expected")
     return html
Example 9
class QidianHonorSpider(Spider):
    start_urls = ['https://book.qidian.com/honor/1009531496']
    headers = {
        "User-Agent": get_random_user_agent()
    }
    set_mul = True
    request_config = {
        'RETRIES': 3,
        'DELAY': 0,
        'TIMEOUT': 10
    }

    def parse(self, res):
        items_data = QidianHonorItem.get_items(html=res.html)
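        # Buckets for click, collection, recommendation-ticket and all other honor records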
        click_list, col_list, rec_list, other_list = [], [], [], []
        for item in items_data:
            data = {
                'honor_text': item.honor_text,
                'honor_time': item.honor_time,
            }
            if "点击" in data['honor_text'] and '月点击' not in data['honor_text']:
                click_list.append(data)
            elif "收藏" in data['honor_text']:
                col_list.append(data)
            elif "推荐票" in data['honor_text']:
                rec_list.append(data)
            else:
                other_list.append(data)
        print('Click honors\n')
        for i in click_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))

        print('Collection honors\n')
        for i in col_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))

        print('Recommendation ticket honors\n')
        for i in rec_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))

        print('Featured recommendation honors\n')
        for i in other_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))
Example 10
class BaiduImgSpider(Spider):
    start_urls = ['https://tieba.baidu.com/p/5062084136']
    img_path = 'data/'
    set_mul = True
    headers = {"User-Agent": get_random_user_agent()}

    def parse(self, res):
        # Convert the html into an etree
        etree = self.e_html(res.html)
        # Extract the target values to build the new urls
        pages = list(
            set(i.get('href') for i in etree.cssselect('li.pb_list_pager>a')))

        pages.append(self.start_urls[0])
        for page in pages:
            url = urljoin(self.start_urls[0], page)
            yield Request(url, headers=self.headers, callback=self.parse_item)

    def parse_item(self, res):
        items_data = BaiduImgItem.get_item(html=res.html)
        img_urls = items_data['img_url']
        for index, url in enumerate(img_urls):
            yield Request(url,
                          headers=self.headers,
                          callback=self.save_img,
                          file_type='bytes',
                          extra_value={'index': index})

    def save_img(self, res):
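        # res.html holds the raw image bytes because this request was made with file_type='bytes'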
        if not os.path.exists(self.img_path):
            os.makedirs(self.img_path)
        img_name = str(res.extra_value['index']) + "_" + res.url[-10:].replace(
            '/', '-')
        with open(self.img_path + img_name, 'wb') as file:
            file.write(res.html)
            logging.info('Img downloaded successfully in {dir}'.format(
                dir=self.img_path + img_name))
Example 11
class HYNovelInfoSpider(Spider):
    start_urls = []
    request_config = {
        'RETRIES': 3,
        'TIMEOUT': 10
    }

    headers = {
        "User-Agent": get_random_user_agent()
    }

    all_novels_col = PyMongoDb().db.all_novels
    all_novels_info_col = PyMongoDb().db.all_novels_info

    def parse(self, res):
        item_data = HYNovelInfoItem.get_item(html=res.html)
        item_data['target_url'] = res.url
        item_data['spider'] = 'heiyan'
        item_data['updated_at'] = time.strftime("%Y-%m-%d %X", time.localtime())
        print('Fetched novel info for {} successfully'.format(item_data['novel_name']))
        print(item_data)
        # Upsert the full novel-info document keyed by novel_name and spider
        self.all_novels_info_col.replace_one(
            {'novel_name': item_data['novel_name'], 'spider': 'heiyan'},
            item_data,
            upsert=True)
        async_callback(self.save, res_dic=item_data)

    async def save(self, **kwargs):
        # Save into the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBase().get_db()
            await motor_db.all_novels_info.update_one({
                'novel_name': res_dic['novel_name'], 'spider': 'heiyan'},
                {'$set': res_dic},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
Example 12
class QidianRankingSpider(Spider):
    start_urls = ["http://r.qidian.com/?chn=" + str(url) for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]]
    headers = {
        "User-Agent": get_random_user_agent()
    }
    set_mul = True
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    def parse(self, res):
        items_data = RankingItem.get_items(html=res.html)
        result = []
        res_dic = {}
        for item in items_data:
            each_book_list = []
            # Only keep the top ten books of each ranking
            for index, value in enumerate(item.book_list[:10]):
                item_data = NameItem.get_item(html_etree=value)
                name = item_data.get('top_name') or item_data.get('other_name')
                each_book_list.append({
                    'num': index + 1,
                    'name': name
                })
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = self.qidian_type.get(res.url.split('=')[-1])
        res_dic['spider'] = "qidian"
        async_callback(self.save, res_dic=res_dic)

    async def save(self, **kwargs):
        # Save into the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one({
                'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
Example 13
 def __init__(self, crawler):
     super(RandomUserAgentMiddlware, self).__init__()
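     # Pick one random User-Agent when the middleware is instantiated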
     self.ua = get_random_user_agent()
Example 14
from tornado import httpclient
from talospider.utils import get_random_user_agent

http_client = httpclient.HTTPClient()

try:
    headers = {"User-Agent": get_random_user_agent()}
    response = http_client.fetch("http://www.baidu.com/", headers=headers)
    print(response.body.decode())
except httpclient.HTTPError as e:
    # HTTPError is raised for non-200 responses; the response
    # can be found in e.response.
    print("Error: " + str(e))
except Exception as e:
    # Other errors are possible, such as IOError.
    print("Error: " + str(e))
http_client.close()
Example 15
    def get_collection(self, db_name, collection):
        """
        :param db_name: database name
        :param collection: collection name
        :return: the motor collection instance
        """
        collection_key = db_name + collection
        if collection_key not in self._collection:
            self._collection[collection_key] = self.get_db(db_name)[collection]

        return self._collection[collection_key]


import asyncio

import aiohttp

from talospider.utils import get_random_user_agent

REQUEST_TIMEOUT = 0
REQUEST_DELAY = 0
HEADERS = {"User-Agent": get_random_user_agent()}


async def _get_page(url, sleep, headers):
    """
    Fetch and return the page content
    """
    async with aiohttp.ClientSession() as session:
        try:
            await asyncio.sleep(sleep)
            async with session.get(url,
                                   headers=headers,
                                   timeout=REQUEST_TIMEOUT) as resp:
                return await resp.text()
        except Exception:
            # Swallow any fetch failure (timeout, connection error, bad status) and return an empty page
            return ""