class QidianNovelsSpider(Spider):
    start_urls = ['https://www.qidian.com/all?page=1']
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    request_config = {'RETRIES': 3, 'DELAY': 0, 'TIMEOUT': 10}
    all_novels_col = MongoDb().db.all_novels

    def parse(self, res):
        # Pages 1..41644 of the "all works" listing
        urls = [
            'https://www.qidian.com/all?page={i}'.format(i=i)
            for i in range(1, 41645)
        ]
        for url in urls:
            headers = {"User-Agent": get_random_user_agent()}
            yield Request(url,
                          request_config=self.request_config,
                          headers=headers,
                          callback=self.parse_item)

    def parse_item(self, res):
        items_data = QidianNovelsItem.get_items(html=res.html)
        for item in items_data:
            data = {
                'novel_url': item.novel_url,
                'novel_name': item.novel_name,
                'novel_author': item.novel_author,
                'novel_author_home_url': item.novel_author_home_url,
                'spider': 'qidian'
            }
            # Insert only novels that are not stored yet
            if self.all_novels_col.find_one({"novel_name": item.novel_name}) is None:
                self.all_novels_col.insert_one(data)
                print(item.novel_name + ' - crawled successfully')
class ZHNovelsSpider(Spider):
    start_urls = [
        'http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html'
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    request_config = {'RETRIES': 3, 'DELAY': 1, 'TIMEOUT': 10}
    all_novels_col = MongoDb().db.all_novels

    def parse(self, res):
        # Pages 1..751 of the category listing
        urls = [
            'http://book.zongheng.com/store/c0/c0/b9/u0/p{i}/v9/s9/t0/ALL.html'.format(i=i)
            for i in range(1, 752)
        ]
        for url in urls:
            headers = {"User-Agent": get_random_user_agent()}
            yield Request(url,
                          request_config=self.request_config,
                          headers=headers,
                          callback=self.parse_item)

    def parse_item(self, res):
        items_data = ZHNovelsItem.get_items(html=res.html)
        for item in items_data:
            if item.novel_url:
                res_dic = {
                    'novel_url': item.novel_url,
                    'novel_name': item.novel_name,
                    'novel_author': item.novel_author,
                    'novel_author_home_url': item.novel_author_home_url,
                    'spider': 'zongheng',
                    'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                }
                # Insert only novels not stored yet (keyed by name + author)
                if self.all_novels_col.find_one({
                        "novel_name": item.novel_name,
                        'novel_author': item.novel_author
                }) is None:
                    self.all_novels_col.insert_one(res_dic)
                    # async_callback(self.save, res_dic=res_dic)
                    print(item.novel_name + ' - crawled successfully')

    async def save(self, **kwargs):
        # Persist to the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.all_novels.update_one(
                {
                    'novel_url': res_dic['novel_url'],
                    'novel_author': res_dic['novel_author']
                },
                {'$set': res_dic},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
class BdNovelSpider(Spider):
    start_urls = [
        'http://book.zongheng.com/api/rank/getZongHengRankList.htm?rankType=1&pageNum=1&pageSize=20'
    ]
    set_mul = True
    headers = {"User-Agent": get_random_user_agent()}

    def start_request(self):
        for url in self.start_urls:
            yield Request(url=url,
                          request_config=getattr(self, 'request_config'),
                          headers=getattr(self, 'headers', None),
                          callback=self.parse,
                          file_type="json")

    def parse(self, res):
        data = res.html
        result = []
        res_dic = {}
        if data:
            for each_data in data:
                book = {
                    'name': each_data.get('bookName', ''),
                    'type': each_data.get('bookShortCateName', ''),
                    'num': each_data.get('orderNo', ''),
                    'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                }
                result.append(book)
            res_dic['data'] = result
            res_dic['target_url'] = res.url
            res_dic['type'] = "全部类别"  # "all categories"
            res_dic['spider'] = "zh_bd_novels"
            async_callback(self.save, res_dic=res_dic)

    async def save(self, **kwargs):
        # Persist to the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
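# Note: parse() above assumes the ranking endpoint returns a JSON array of
# book records. The actual payload is not shown in the source, so the sample
# below is purely illustrative, limited to the three keys the parser reads.
sample_response = [
    {"bookName": "示例书名", "bookShortCateName": "奇幻", "orderNo": 1},
    {"bookName": "another title", "bookShortCateName": "都市", "orderNo": 2},
]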
def parse(self, res):
    # Convert the html into an etree
    etree = self.e_html(res.html)
    # Extract the pagination hrefs to build the page urls
    pages = [i.get('href') for i in etree.cssselect('.paginator>a')]
    pages.insert(0, '?start=0&filter=')
    headers = {"User-Agent": get_random_user_agent()}
    for page in pages:
        url = self.start_urls[0] + page
        yield Request(url,
                      request_config=self.request_config,
                      headers=headers,
                      callback=self.parse_item)
class QidianRankingSpider(Spider):
    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    # Channel id -> category name
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    def parse(self, res):
        items_data = RankingItem.get_items(html=res.html)
        result = []
        res_dic = {}
        for item in items_data:
            each_book_list = []
            # Keep only the top ten books of each ranking
            for index, value in enumerate(item.book_list[:10]):
                item_data = NameItem.get_item(html_etree=value)
                name = item_data.get('top_name') or item_data.get('other_name')
                each_book_list.append({'num': index + 1, 'name': name})
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = self.qidian_type.get(res.url.split('=')[-1])
        res_dic['spider'] = "qidian"
        # Append one JSON line per crawled ranking page
        with open('qidian_ranking.txt', 'a+') as f:
            f.write(json.dumps(res_dic) + '\n')
def _get_html(cls, html, url, html_etree, params, **kwargs):
    if html:
        html = etree.HTML(html)
    elif url:
        if not kwargs.get('headers', None):
            kwargs['headers'] = {"User-Agent": get_random_user_agent()}
        response = requests.get(url, params, **kwargs)
        response.raise_for_status()
        content = response.content
        charset = cchardet.detect(content)
        text = content.decode(charset['encoding'])
        html = etree.HTML(text)
    elif html_etree is not None:
        return html_etree
    else:
        raise ValueError("html(url or html_etree) is expected")
    return html
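# A minimal usage sketch for _get_html (hypothetical call sites; `SomeItem` is
# an assumed Item subclass exposing it as a classmethod). Exactly one of the
# three sources is used, checked in the order html -> url -> html_etree:
doc = SomeItem._get_html(html='<p>inline markup</p>', url=None, html_etree=None, params=None)
doc = SomeItem._get_html(html=None, url='https://example.com', html_etree=None, params={'page': 1})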
class QidianHonorSpider(Spider):
    start_urls = ['https://book.qidian.com/honor/1009531496']
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    request_config = {'RETRIES': 3, 'DELAY': 0, 'TIMEOUT': 10}

    def parse(self, res):
        items_data = QidianHonorItem.get_items(html=res.html)
        click_list, col_list, rec_list, other_list = [], [], [], []
        for item in items_data:
            data = {
                'honor_text': item.honor_text,
                'honor_time': item.honor_time,
            }
            # Honor texts are Chinese; classify them by keyword:
            # 点击 = clicks (excluding 月点击, monthly clicks),
            # 收藏 = collections, 推荐票 = recommendation tickets
            if "点击" in data['honor_text'] and '月点击' not in data['honor_text']:
                click_list.append(data)
            elif "收藏" in data['honor_text']:
                col_list.append(data)
            elif "推荐票" in data['honor_text']:
                rec_list.append(data)
            else:
                other_list.append(data)
        print('Click honors\n')
        for i in click_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))
        print('Collection honors\n')
        for i in col_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))
        print('Recommendation ticket honors\n')
        for i in rec_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))
        print('Strong recommendation honors\n')
        for i in other_list:
            print(str(i.get('honor_time')) + " - " + str(i.get('honor_text')))
class BaiduImgSpider(Spider):
    start_urls = ['https://tieba.baidu.com/p/5062084136']
    img_path = 'data/'
    set_mul = True
    headers = {"User-Agent": get_random_user_agent()}

    def parse(self, res):
        # Convert the html into an etree
        etree = self.e_html(res.html)
        # Extract the pagination hrefs to build the page urls
        pages = list(
            set(i.get('href') for i in etree.cssselect('li.pb_list_pager>a')))
        pages.append(self.start_urls[0])
        for page in pages:
            url = urljoin(self.start_urls[0], page)
            yield Request(url, headers=self.headers, callback=self.parse_item)

    def parse_item(self, res):
        items_data = BaiduImgItem.get_item(html=res.html)
        img_urls = items_data['img_url']
        for index, url in enumerate(img_urls):
            yield Request(url,
                          headers=self.headers,
                          callback=self.save_img,
                          file_type='bytes',
                          extra_value={'index': index})

    def save_img(self, res):
        if not os.path.exists(self.img_path):
            os.makedirs(self.img_path)
        img_name = str(res.extra_value['index']) + "_" + res.url[-10:].replace('/', '-')
        with open(self.img_path + img_name, 'wb') as file:
            file.write(res.html)
        logging.info('Img downloaded successfully in {dir}'.format(
            dir=self.img_path + img_name))
class HYNovelInfoSpider(Spider):
    start_urls = []
    request_config = {'RETRIES': 3, 'TIMEOUT': 10}
    headers = {"User-Agent": get_random_user_agent()}
    all_novels_col = PyMongoDb().db.all_novels
    all_novels_info_col = PyMongoDb().db.all_novels_info

    def parse(self, res):
        item_data = HYNovelInfoItem.get_item(html=res.html)
        item_data['target_url'] = res.url
        item_data['spider'] = 'heiyan'
        item_data['updated_at'] = time.strftime("%Y-%m-%d %X", time.localtime())
        print('Fetched info for novel {} successfully'.format(item_data['novel_name']))
        print(item_data)
        # Replace the stored info document (upsert), keyed by name + spider
        self.all_novels_info_col.replace_one(
            {'novel_name': item_data['novel_name'], 'spider': 'heiyan'},
            item_data,
            upsert=True)
        async_callback(self.save, res_dic=item_data)

    async def save(self, **kwargs):
        # Persist to the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBase().get_db()
            await motor_db.all_novels_info.update_one(
                {'novel_name': res_dic['novel_name'], 'spider': 'heiyan'},
                {'$set': res_dic},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
class QidianRankingSpider(Spider):
    start_urls = [
        "http://r.qidian.com/?chn=" + str(url)
        for url in [-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12]
    ]
    headers = {"User-Agent": get_random_user_agent()}
    set_mul = True
    # Channel id -> category name
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    def parse(self, res):
        items_data = RankingItem.get_items(html=res.html)
        result = []
        res_dic = {}
        for item in items_data:
            each_book_list = []
            # Keep only the top ten books of each ranking
            for index, value in enumerate(item.book_list[:10]):
                item_data = NameItem.get_item(html_etree=value)
                name = item_data.get('top_name') or item_data.get('other_name')
                each_book_list.append({'num': index + 1, 'name': name})
            data = {
                'title': item.ranking_title,
                'more': item.more,
                'book_list': each_book_list,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            result.append(data)
        res_dic['data'] = result
        res_dic['target_url'] = res.url
        res_dic['type'] = self.qidian_type.get(res.url.split('=')[-1])
        res_dic['spider'] = "qidian"
        async_callback(self.save, res_dic=res_dic)

    async def save(self, **kwargs):
        # Persist to the database
        res_dic = kwargs.get('res_dic')
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {'$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                }},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
def __init__(self, crawler):
    super(RandomUserAgentMiddlware, self).__init__()
    self.ua = get_random_user_agent()
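# The snippet above shows only the constructor. A minimal sketch of how such a
# downloader middleware is typically completed in Scrapy (from_crawler and
# process_request are Scrapy's standard middleware hooks; rotating the agent
# per request rather than reusing self.ua is an assumption, not the author's
# confirmed behavior):
class RandomUserAgentMiddlware(object):

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = get_random_user_agent()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy instantiates middlewares through this hook
        return cls(crawler)

    def process_request(self, request, spider):
        # Pick a fresh user agent for every outgoing request
        request.headers['User-Agent'] = get_random_user_agent()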
from tornado import httpclient
from talospider.utils import get_random_user_agent

http_client = httpclient.HTTPClient()
try:
    headers = {"User-Agent": get_random_user_agent()}
    response = http_client.fetch("http://www.baidu.com/", headers=headers)
    print(response.body.decode())
except httpclient.HTTPError as e:
    # HTTPError is raised for non-200 responses; the response
    # can be found in e.response.
    print("Error: " + str(e))
except Exception as e:
    # Other errors are possible, such as IOError.
    print("Error: " + str(e))
http_client.close()
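# For comparison, a sketch of the same fetch with tornado's non-blocking
# AsyncHTTPClient (the coroutine wiring below is an illustrative addition,
# not from the source):
from tornado import httpclient, ioloop
from talospider.utils import get_random_user_agent

async def fetch_async():
    client = httpclient.AsyncHTTPClient()
    headers = {"User-Agent": get_random_user_agent()}
    try:
        response = await client.fetch("http://www.baidu.com/", headers=headers)
        print(response.body.decode())
    except httpclient.HTTPError as e:
        print("Error: " + str(e))

ioloop.IOLoop.current().run_sync(fetch_async)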
def get_collection(self, db_name, collection):
    """
    :return: the motor collection instance
    """
    collection_key = db_name + collection
    if collection_key not in self._collection:
        self._collection[collection_key] = self.get_db(db_name)[collection]
    return self._collection[collection_key]


import asyncio

import aiohttp
from talospider.utils import get_random_user_agent

REQUEST_TIMEOUT = 0
REQUEST_DELAY = 0
HEADERS = {"User-Agent": get_random_user_agent()}

async def _get_page(url, sleep, headers):
    """Fetch and return the page content"""
    async with aiohttp.ClientSession() as session:
        try:
            await asyncio.sleep(sleep)
            async with session.get(url, headers=headers, timeout=REQUEST_TIMEOUT) as resp:
                return await resp.text()
        except Exception:
            # Treat any failure as an empty page
            return ""
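# A hypothetical driver for _get_page (the url and the REQUEST_DELAY wiring
# below are illustrative assumptions, not part of the source):
async def main():
    text = await _get_page('https://www.baidu.com/', REQUEST_DELAY, HEADERS)
    print(text[:200])

asyncio.get_event_loop().run_until_complete(main())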