Beispiel #1
0
class BaiduImgSpider(Spider):
    """Spider that downloads the images of a Baidu Tieba thread.

    ``parse`` collects the pagination links of the start page,
    ``parse_item`` extracts the image URLs of each page, and ``save_img``
    writes every downloaded image into ``img_path``.
    """

    start_urls = ['https://tieba.baidu.com/p/4429779987']
    # Directory the downloaded images are written to.
    img_path = 'data/'
    set_mul = True
    # The User-Agent is picked once, when the class body is evaluated.
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def parse(self, res):
        """Yield one request per thread page, handled by ``parse_item``.

        Each request carries its position in the page list as
        ``extra_value['key']`` so image file names can include it.
        """
        # Convert the html into an etree
        etree = self.e_html(res.html)
        # Extract the pager links (de-duplicated) to build the new urls
        pages = list(set(i.get('href') for i in etree.cssselect('li.pb_list_pager>a')))

        # The start page itself also has to be processed.
        pages.append(self.start_urls[0])
        for key, page in enumerate(pages):
            url = urljoin(self.start_urls[0], page)
            yield Request(url, headers=self.headers, callback=self.parse_item, extra_value={'key': key})

    def parse_item(self, res):
        """Yield one download request per image URL found on a thread page."""
        items_data = BaiduImgItem.get_item(html=res.html)
        img_urls = items_data['img_url']
        for index, url in enumerate(img_urls):
            yield Request(url, headers=self.headers, callback=self.save_img, file_type='bytes',
                          extra_value={'index': index, 'key': res.extra_value['key']})

    def save_img(self, res):
        """Write the downloaded payload in ``res.html`` to a file in ``img_path``.

        The file name is ``<key>_<index>_<last six characters of the url>``,
        with any ``/`` in the url tail replaced by ``-``.
        """
        # exist_ok=True replaces the exists()/makedirs() pair, which could
        # raise FileExistsError if two callbacks raced to create the folder.
        os.makedirs(self.img_path, exist_ok=True)
        extra_value = res.extra_value
        img_name = str(extra_value['key']) + "_" + str(extra_value['index']) + "_" + res.url[-6:].replace('/', '-')
        # os.path.join keeps the path valid even if img_path lacks a trailing '/'.
        img_file = os.path.join(self.img_path, img_name)
        with open(img_file, 'wb') as file:
            file.write(res.html)
        # Report success only after the file has been flushed and closed.
        logging.info('Img downloaded successfully in {dir}'.format(dir=img_file))
Beispiel #2
0
class QidianRankingSpider(Spider):
    """Spider that prints the first ten books of every Qidian ranking list."""

    # One start url per ``chn`` value.
    start_urls = [
        "http://r.qidian.com/?chn=" + str(chn)
        for chn in (-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12)
    ]
    set_mul = True
    headers = {"User-Agent": get_random_user_agent()}

    def parse(self, html):
        """Collect every ranking list found in ``html`` and print the result.

        For each ranking item the title, the ``more`` value, the first ten
        book names and a local timestamp are gathered into a dict.
        """
        result = []
        for item in RankingItem.get_items(html=html):
            top_books = []
            # Only the first ten books of each ranking are kept
            for rank, node in enumerate(item.book_list[:10], start=1):
                fields = NameItem.get_item(html_etree=node)
                # Fall back to 'other_name' when 'top_name' is missing or empty.
                book_name = fields.get('top_name') or fields.get('other_name')
                top_books.append({'num': rank, 'name': book_name})
            result.append({
                'title': item.ranking_title,
                'more': item.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                'spider': 'qidian'
            })
        print(result)
Beispiel #3
0
class BaiduImgSpider(Spider):
    """Spider that downloads the thumbnails listed in a JSON search response.

    ``start_request`` fetches every start url as JSON, ``parse`` extracts
    the ``thumbURL`` values, and ``save_img`` stores each downloaded image
    in ``img_path``.
    """

    start_urls = []
    set_mul = True
    # Directory the downloaded images are written to.
    img_path = 'data/'
    # The User-Agent is picked once, when the class body is evaluated.
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def start_request(self):
        """Yield a JSON request for every url in ``start_urls``."""
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config'),
                          headers=getattr(self, 'headers', None),
                          callback=self.parse, file_type="json")

    def parse(self, res):
        """Yield one download request per non-empty ``thumbURL`` entry."""
        data = res.html['data']
        # Entries without a (truthy) 'thumbURL' are skipped.
        img_urls = [each_data.get('thumbURL') for each_data in data if each_data.get('thumbURL')]
        for url in img_urls:
            yield Request(url, headers=self.headers, callback=self.save_img, file_type='bytes')

    def save_img(self, res):
        """Write the downloaded payload in ``res.html`` to a file in ``img_path``.

        The file name is a fresh ``uuid1`` followed by the last ten
        characters of the url, with any ``/`` replaced by ``-``.
        """
        # exist_ok=True replaces the exists()/makedirs() pair, which could
        # raise FileExistsError if two callbacks raced to create the folder.
        os.makedirs(self.img_path, exist_ok=True)
        img_name = str(uuid.uuid1()) + "_" + res.url[-10:].replace('/', '-')
        # os.path.join keeps the path valid even if img_path lacks a trailing '/'.
        img_file = os.path.join(self.img_path, img_name)
        with open(img_file, 'wb') as file:
            file.write(res.html)
        # Report success only after the file has been flushed and closed.
        logging.info('Img downloaded successfully in {dir}'.format(dir=img_file))
Beispiel #4
0
 def parse(self, res):
     """Yield one request per paginator link, handled by ``parse_item``.

     The query string of the first page is always requested first,
     followed by every ``href`` found under ``.paginator``.
     """
     # Convert the html into an etree
     tree = self.e_html(res.html)
     # Extract the target values to build the new urls
     page_queries = ['?start=0&filter=']
     page_queries.extend(link.get('href') for link in tree.cssselect('.paginator>a'))
     # A single random User-Agent is shared by all requests of this call.
     ua_headers = {"User-Agent": get_random_user_agent()}
     for query in page_queries:
         yield Request(self.start_urls[0] + query,
                       request_config=self.request_config,
                       headers=ua_headers,
                       callback=self.parse_item)
Beispiel #5
0
class QidianRankingSpider(Spider):
    """Spider that stores the first ten books of every Qidian ranking list.

    ``parse`` builds the ranking payload for one page and hands it to
    ``save`` through ``async_callback``.
    """

    # One start url per ``chn`` value.
    start_urls = [
        "http://r.qidian.com/?chn=" + str(chn)
        for chn in (-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12)
    ]
    set_mul = True
    headers = {"User-Agent": get_random_user_agent()}

    def parse(self, res):
        """Collect every ranking list of the page and schedule it for saving."""
        rankings = []
        for item in RankingItem.get_items(html=res.html):
            top_books = []
            # Only the first ten books of each ranking are kept
            for rank, node in enumerate(item.book_list[:10], start=1):
                fields = NameItem.get_item(html_etree=node)
                # Fall back to 'other_name' when 'top_name' is missing or empty.
                book_name = fields.get('top_name') or fields.get('other_name')
                top_books.append({'num': rank, 'name': book_name})
            rankings.append({
                'title': item.ranking_title,
                'more': item.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })
        payload = {
            'data': rankings,
            'target_url': res.url,
            'spider': "qidian",
        }
        async_callback(self.save, res_dic=payload)

    async def save(self, **kwargs):
        """Upsert the payload passed as ``res_dic`` into ``novels_ranking``.

        The document is matched on ``target_url``. Any exception raised
        while saving is logged through ``self.logger`` and not re-raised.
        """
        # Save into the database
        payload = kwargs.get('res_dic')
        try:
            collection = MotorBase().db.novels_ranking
            selector = {'target_url': payload['target_url']}
            changes = {
                '$set': {
                    'data': payload['data'],
                    'spider': payload['spider'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                }
            }
            await collection.update_one(selector, changes, upsert=True)
        except Exception as e:
            self.logger.exception(e)
Beispiel #6
0
class BdNovelSpider(Spider):
    """Spider that stores the ranking list returned by the Zongheng rank API.

    ``start_request`` fetches the start url as JSON, ``parse`` converts the
    entries into a payload, and ``save`` upserts it into ``novels_ranking``.
    """

    start_urls = ['http://book.zongheng.com/api/rank/getZongHengRankList.htm?rankType=1&pageNum=1&pageSize=20']
    set_mul = True
    # The User-Agent is picked once, when the class body is evaluated.
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def start_request(self):
        """Yield a JSON request for every url in ``start_urls``."""
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config'),
                          headers=getattr(self, 'headers', None),
                          callback=self.parse, file_type="json")

    def parse(self, res):
        """Build the ranking payload from ``res.html`` and schedule ``save``.

        When the response carries no entries nothing is scheduled: an empty
        payload has no ``target_url`` and ``save`` could only fail on it.
        """
        entries = res.html
        if not entries:
            self.logger.warning('No ranking data returned from %s', res.url)
            return
        # A distinct name per entry avoids rebinding the sequence being iterated.
        result = [
            {
                'name': each_data.get('bookName', ''),
                'type': each_data.get('bookShortCateName', ''),
                'num': each_data.get('orderNo', ''),
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            for each_data in entries
        ]
        res_dic = {
            'data': result,
            'target_url': res.url,
            'type': "全部类别",
            'spider': "zh_bd_novels",
        }
        async_callback(self.save, res_dic=res_dic)

    async def save(self, **kwargs):
        """Upsert the payload passed as ``res_dic`` into ``novels_ranking``.

        The document is matched on ``target_url``. Any exception raised
        while saving is logged through ``self.logger`` and not re-raised.
        """
        # Save into the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one({
                'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
Beispiel #7
0
class QidianRankingSpider(Spider):
    """Spider that appends every Qidian ranking page to ``qidian_ranking.txt``.

    Each parsed page becomes one JSON document on its own line.
    """

    # One start url per ``chn`` value; the same values are the keys of
    # ``qidian_type`` below.
    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    # Maps the ``chn`` query value of a url to its category name.
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    def parse(self, res):
        """Collect the ranking lists of the page and append them as a JSON line.

        The category is looked up from the text after the last ``=`` in
        ``res.url``; it is ``None`` when that text is not in ``qidian_type``.
        """
        items_data = RankingItem.get_items(html=res.html)
        result = []
        res_dic = {}
        for item in items_data:
            each_book_list = []
            # Only the first ten books of each ranking are kept
            for index, value in enumerate(item.book_list[:10]):
                item_data = NameItem.get_item(html_etree=value)
                # Fall back to 'other_name' when 'top_name' is missing or empty.
                name = item_data.get('top_name') or item_data.get('other_name')
                each_book_list.append({'num': index + 1, 'name': name})
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = self.qidian_type.get(res.url.split('=')[-1])
        res_dic['spider'] = "qidian"
        with open('qidian_ranking.txt', 'a+') as f:
            # write() emits the line in one call; writelines() on a str would
            # iterate it character by character.
            f.write(json.dumps(res_dic) + '\n')
Beispiel #8
0
#!/usr/bin/env python
"""
 Created by howie.hu at  17-10-12.
"""
import os
import uuid
import logging

from talonspider import Spider, Request
from talonspider.utils import get_random_user_agent
from pprint import pprint

# Module-level request headers; the User-Agent is obtained from
# get_random_user_agent() once, when this module is imported.
headers = {
    "User-Agent": get_random_user_agent()
}

# URL template for the Baidu image search JSON endpoint, with three
# placeholders: {word}, {pn} and {rn}.
# NOTE(review): presumably {word} is the search term and {pn}/{rn} are the
# result offset and page size -- confirm against the code that formats it.
demo = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&cl=2&lm=-1&ie=utf-8&oe=utf-8&word={word}&pn={pn}&rn={rn}"


class BaiduImgSpider(Spider):
    start_urls = []
    set_mul = True
    img_path = 'data/'
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def start_request(self):
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config'),