class BaiduImgSpider(Spider):
    """Crawl a Baidu Tieba thread and download every image linked from its pages."""

    start_urls = ['https://tieba.baidu.com/p/4429779987']
    img_path = 'data/'
    set_mul = True
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def parse(self, res):
        """Collect the thread's pager links and schedule each page for parsing."""
        # Parse the raw html into an element tree.
        doc = self.e_html(res.html)
        # De-duplicate pager hrefs, then include the thread's first page itself.
        page_links = list({a.get('href') for a in doc.cssselect('li.pb_list_pager>a')})
        page_links.append(self.start_urls[0])
        for key, link in enumerate(page_links):
            target = urljoin(self.start_urls[0], link)
            yield Request(target,
                          headers=self.headers,
                          callback=self.parse_item,
                          extra_value={'key': key})

    def parse_item(self, res):
        """Extract image URLs from one page and schedule the binary downloads."""
        extracted = BaiduImgItem.get_item(html=res.html)
        for index, img_url in enumerate(extracted['img_url']):
            yield Request(img_url,
                          headers=self.headers,
                          callback=self.save_img,
                          file_type='bytes',
                          extra_value={'index': index, 'key': res.extra_value['key']})

    def save_img(self, res):
        """Write one downloaded image (bytes payload) to disk under img_path."""
        if not os.path.exists(self.img_path):
            os.makedirs(self.img_path)
        extra = res.extra_value
        # Name pattern: <page key>_<image index>_<url tail>; '/' made path-safe.
        img_name = "{0}_{1}_{2}".format(extra['key'], extra['index'],
                                        res.url[-6:].replace('/', '-'))
        with open(self.img_path + img_name, 'wb') as file:
            file.write(res.html)
        logging.info('Img downloaded successfully in {dir}'.format(dir=self.img_path + img_name))
class QidianRankingSpider(Spider):
    """Scrape qidian.com ranking pages and print the top-ten list of each ranking."""

    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True

    def parse(self, html):
        """Build one dict per ranking (title, more-link, top-10 books) and print the list."""
        result = []
        for ranking in RankingItem.get_items(html=html):
            top_books = []
            # Keep only the first ten books of each ranking.
            for position, book_etree in enumerate(ranking.book_list[:10]):
                name_data = NameItem.get_item(html_etree=book_etree)
                # A book exposes either a top_name or an other_name field.
                book_name = name_data.get('top_name') or name_data.get('other_name')
                top_books.append({'num': position + 1, 'name': book_name})
            result.append({
                'title': ranking.ranking_title,
                'more': ranking.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                'spider': 'qidian',
            })
        print(result)
class BaiduImgSpider(Spider):
    """Download the thumbnails returned by Baidu's image-search JSON API."""

    start_urls = []
    set_mul = True
    img_path = 'data/'
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def start_request(self):
        """Schedule every start URL as a JSON request.

        Fix: ``getattr(self, 'request_config')`` had no default and raised
        AttributeError when a subclass defined no request_config; it now falls
        back to None, matching the adjacent headers lookup.
        """
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config', None),
                          headers=getattr(self, 'headers', None),
                          callback=self.parse,
                          file_type="json")

    def parse(self, res):
        """Extract every thumbURL from the JSON payload and fetch it as bytes."""
        data = res.html['data']
        # Some result entries carry no thumbURL; skip those.
        img_urls = [each.get('thumbURL') for each in data if each.get('thumbURL')]
        for url in img_urls:
            yield Request(url,
                          headers=self.headers,
                          callback=self.save_img,
                          file_type='bytes')

    def save_img(self, res):
        """Persist one downloaded image under img_path with a uuid-prefixed name."""
        if not os.path.exists(self.img_path):
            os.makedirs(self.img_path)
        # uuid1 prefix avoids name collisions; '/' in the URL tail is not path-safe.
        img_name = str(uuid.uuid1()) + "_" + res.url[-10:].replace('/', '-')
        with open(self.img_path + img_name, 'wb') as file:
            file.write(res.html)
        logging.info('Img downloaded successfully in {dir}'.format(dir=self.img_path + img_name))
def parse(self, res):
    """Turn the listing's pager links into one item request per page."""
    # Parse the response html into an element tree.
    doc = self.e_html(res.html)
    # Pager hrefs, preceded by the first page's own query string.
    page_params = ['?start=0&filter='] + [a.get('href') for a in doc.cssselect('.paginator>a')]
    ua_headers = {"User-Agent": get_random_user_agent()}
    for param in page_params:
        yield Request(self.start_urls[0] + param,
                      request_config=self.request_config,
                      headers=ua_headers,
                      callback=self.parse_item)
class QidianRankingSpider(Spider):
    """Scrape qidian.com rankings and upsert each page's result into MongoDB."""

    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True

    def parse(self, res):
        """Collect the top-ten books of every ranking and hand the payload to save()."""
        rankings = []
        for ranking in RankingItem.get_items(html=res.html):
            top_books = []
            # Keep only the first ten books of each ranking.
            for position, book_etree in enumerate(ranking.book_list[:10]):
                name_data = NameItem.get_item(html_etree=book_etree)
                # A book exposes either a top_name or an other_name field.
                book_name = name_data.get('top_name') or name_data.get('other_name')
                top_books.append({'num': position + 1, 'name': book_name})
            rankings.append({
                'title': ranking.ranking_title,
                'more': ranking.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })
        payload = {
            'data': rankings,
            'target_url': res.url,
            'spider': "qidian",
        }
        async_callback(self.save, res_dic=payload)

    async def save(self, **kwargs):
        """Upsert one ranking payload into novels_ranking, keyed by target_url."""
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBase().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
class BdNovelSpider(Spider):
    """Fetch the Zongheng ranking JSON API and store the list in MongoDB."""

    start_urls = ['http://book.zongheng.com/api/rank/getZongHengRankList.htm?rankType=1&pageNum=1&pageSize=20']
    set_mul = True
    headers = {
        "User-Agent": get_random_user_agent()
    }

    def start_request(self):
        """Schedule each API URL as a JSON request.

        Fix: ``getattr(self, 'request_config')`` had no default and raised
        AttributeError when no request_config was defined; it now falls back
        to None, matching the headers lookup.
        """
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config', None),
                          headers=getattr(self, 'headers', None),
                          callback=self.parse,
                          file_type="json")

    def parse(self, res):
        """Normalize the ranking entries and hand the payload to save().

        Fix: the original rebound the name ``data`` to each normalized entry
        inside ``for each_data in data``, shadowing the very collection being
        iterated; the entry now uses a distinct name.
        """
        data = res.html
        result = []
        res_dic = {}
        if data:
            for each_data in data:
                book = {
                    'name': each_data.get('bookName', ''),
                    'type': each_data.get('bookShortCateName', ''),
                    'num': each_data.get('orderNo', ''),
                    'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                }
                result.append(book)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = "全部类别"
        res_dic['spider'] = "zh_bd_novels"
        async_callback(self.save, res_dic=res_dic)

    async def save(self, **kwargs):
        """Upsert the ranking payload into novels_ranking, keyed by target_url."""
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
class QidianRankingSpider(Spider):
    """Scrape qidian.com rankings and append each page's result as a JSON line to a file."""

    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    # Maps the chn query value to its human-readable category name.
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    def parse(self, res):
        """Build the per-ranking top-ten payload and append it to qidian_ranking.txt."""
        rankings = []
        for ranking in RankingItem.get_items(html=res.html):
            top_books = []
            # Keep only the first ten books of each ranking.
            for position, book_etree in enumerate(ranking.book_list[:10]):
                name_data = NameItem.get_item(html_etree=book_etree)
                # A book exposes either a top_name or an other_name field.
                book_name = name_data.get('top_name') or name_data.get('other_name')
                top_books.append({'num': position + 1, 'name': book_name})
            rankings.append({
                'title': ranking.ranking_title,
                'more': ranking.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })
        res_dic = {
            'data': rankings,
            'target_url': res.url,
            # The chn value is the last '='-separated token of the URL.
            'type': self.qidian_type.get(res.url.split('=')[-1]),
            'spider': "qidian",
        }
        with open('qidian_ranking.txt', 'a+') as f:
            f.writelines(json.dumps(res_dic) + '\n')
#!/usr/bin/env python """ Created by howie.hu at 17-10-12. """ import os import uuid import logging from talonspider import Spider, Request from talonspider.utils import get_random_user_agent from pprint import pprint headers = { "User-Agent": get_random_user_agent() } demo = "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&cl=2&lm=-1&ie=utf-8&oe=utf-8&word={word}&pn={pn}&rn={rn}" class BaiduImgSpider(Spider): start_urls = [] set_mul = True img_path = 'data/' headers = { "User-Agent": get_random_user_agent() } def start_request(self): for url in self.start_urls: yield Request(url=url, request_config=getattr(self, 'request_config'),