async def fetch(client, url, name, is_web):
    """Fetch search-result HTML for a novel name from a Baidu endpoint.

    Args:
        client: aiohttp ClientSession used to issue the GET request.
        url: search endpoint to query.
        name: novel name used as the search keyword.
        is_web: truthy -> web-search params ('wd', result count from
            CONFIG.BAIDU_RN); falsy -> the simpler 'word' param.

    Returns:
        The response body as text (falls back to raw bytes when the
        charset cannot be decoded), or None on any failure.
    """
    with async_timeout.timeout(15):
        try:
            headers = {'user-agent': get_random_user_agent()}
            if is_web:
                params = {
                    'wd': name,
                    'ie': 'utf-8',
                    'rn': CONFIG.BAIDU_RN,
                    'vf_bl': 1
                }
            else:
                params = {'word': name}
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed; return the undecoded body
                    # instead of silently swallowing every exception type
                    # (the previous bare `except:` also caught CancelledError).
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def fetch(client, url, novels_name):
    """Fetch search-result HTML for a novel name from so.com.

    Args:
        client: aiohttp ClientSession used to issue the GET request.
        url: so.com search endpoint.
        novels_name: novel name used as the query keyword ('q').

    Returns:
        The response body as text (falls back to raw bytes when the
        charset cannot be decoded), or None on any failure.
    """
    with async_timeout.timeout(20):
        try:
            headers = {
                'User-Agent': get_random_user_agent(),
                'Referer': "http://www.so.com/haosou.html?src=home"
            }
            params = {
                'ie': 'utf-8',
                'src': 'noscript_home',
                'shb': 1,
                'q': novels_name,
            }
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed; return the undecoded body
                    # rather than catching every exception with a bare except.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
class ZHRankingSpider(Spider):
    """Spider for the zongheng.com popularity-ranking page.

    Scrapes every ranking section, keeps each section's top-ten books,
    and upserts the aggregate document into MongoDB.
    """

    start_urls = ['http://book.zongheng.com/rank.html']
    headers = {
        "User-Agent": asyncio.get_event_loop().run_until_complete(get_random_user_agent())
    }
    concurrency = 3

    async def parse(self, res):
        """Extract all ranking sections from the page and persist them."""
        rankings = []
        for section in await RankingItem.get_items(html=res.html):
            top_ten = []
            # Only the ten highest-ranked books of each section are kept.
            for rank, book_node in enumerate(section.book_list[:10], start=1):
                name_item = await NameItem.get_item(html_etree=book_node)
                top_ten.append({
                    'num': rank,
                    'name': name_item.top_name or name_item.other_name,
                })
            rankings.append({
                'title': section.ranking_title,
                'more': section.more,
                'book_list': top_ten,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })
        await self.save({
            'data': rankings,
            'target_url': res.url,
            'type': "人气榜单",
            'spider': "zongheng",
        })

    async def save(self, res_dic):
        """Upsert the scraped ranking document, keyed by its target URL."""
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {
                    '$set': {
                        'data': res_dic['data'],
                        'spider': res_dic['spider'],
                        'type': res_dic['type'],
                        'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                    }
                },
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
async def fetch(client, url):
    """Fetch the body of *url* with a randomized User-Agent.

    Args:
        client: aiohttp ClientSession used to issue the GET request.
        url: page to retrieve.

    Returns:
        The response body as text (falls back to raw bytes when the
        charset cannot be decoded), or None on any failure.
    """
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.get(url, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed; return the undecoded body
                    # rather than catching every exception with a bare except.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def fetch(client, url, novels_name):
    """Fetch search-result HTML for a novel name from Bing.

    Args:
        client: aiohttp ClientSession used to issue the GET request.
        url: Bing search endpoint.
        novels_name: novel name used as the query keyword ('q').

    Returns:
        The response body as text (falls back to raw bytes when the
        charset cannot be decoded), or None on any failure.
    """
    with async_timeout.timeout(20):
        try:
            headers = {
                'user-agent': get_random_user_agent(),
                'referer': "https://www.bing.com/"
            }
            params = {'q': novels_name, 'ensearch': 0}
            async with client.get(url, params=params, headers=headers) as response:
                assert response.status == 200
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    text = await response.text()
                except UnicodeDecodeError:
                    # Charset detection failed; return the undecoded body
                    # rather than catching every exception with a bare except.
                    text = await response.read()
                return text
        except Exception as e:
            LOGGER.exception(e)
            return None
async def get_real_url(client, url):
    """Resolve *url* to its final location by following redirects.

    Issues a HEAD request with redirects enabled so no body is
    downloaded; aiohttp tracks the post-redirect URL on the response.

    Args:
        client: aiohttp ClientSession used to issue the HEAD request.
        url: possibly-redirecting URL to resolve.

    Returns:
        The final URL after redirects, or None on any failure.
    """
    with async_timeout.timeout(10):
        try:
            headers = {'user-agent': get_random_user_agent()}
            async with client.head(url, headers=headers, allow_redirects=True) as response:
                assert response.status == 200
                LOGGER.info('Parse url: {}'.format(response.url))
                # Dead commented-out body-scraping fallback removed; the
                # redirect-following HEAD request already yields the target.
                url = response.url if response.url else None
                return url
        except Exception as e:
            LOGGER.exception(e)
            return None
class QidianRankingSpider(Spider):
    """Spider for qidian.com per-category ranking pages.

    One start URL per channel id; each page's ranking sections are
    scraped (top ten books each) and upserted into MongoDB, tagged with
    the channel's human-readable category name.
    """

    start_urls = [
        "http://r.qidian.com/?chn={}".format(chn)
        for chn in (-1, 21, 1, 2, 22, 4, 15, 6, 5, 7, 8, 9, 10, 12)
    ]
    headers = {
        "User-Agent": asyncio.get_event_loop().run_until_complete(get_random_user_agent())
    }
    concurrency = 3
    # Channel id (as it appears in the URL query) -> category name.
    qidian_type = {
        '-1': '全部类别',
        '21': '玄幻',
        '1': '奇幻',
        '2': '武侠',
        '22': '仙侠',
        '4': '都市',
        '15': '职场',
        '6': '军事',
        '5': '历史',
        '7': '游戏',
        '8': '体育',
        '9': '科幻',
        '10': '灵异',
        '12': '二次元',
    }

    async def parse(self, res):
        """Extract all ranking sections from one channel page and persist them."""
        rankings = []
        for section in await RankingItem.get_items(html=res.html):
            top_ten = []
            # Only the ten highest-ranked books of each section are kept.
            for rank, book_node in enumerate(section.book_list[:10], start=1):
                name_item = await NameItem.get_item(html_etree=book_node)
                top_ten.append({
                    'num': rank,
                    'name': name_item.top_name or name_item.other_name,
                })
            rankings.append({
                'title': section.ranking_title,
                'more': section.more,
                'book_list': top_ten,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })
        await self.save(res_dic={
            'data': rankings,
            'target_url': res.url,
            # The channel id is the value of the trailing '=' in the URL.
            'type': self.qidian_type.get(res.url.split('=')[-1]),
            'spider': "qidian",
        })

    async def save(self, res_dic):
        """Upsert the scraped ranking document, keyed by its target URL."""
        try:
            motor_db = MotorBaseOld().db
            await motor_db.novels_ranking.update_one(
                {'target_url': res_dic['target_url']},
                {
                    '$set': {
                        'data': res_dic['data'],
                        'spider': res_dic['spider'],
                        'type': res_dic['type'],
                        'finished_at': time.strftime("%Y-%m-%d %X", time.localtime())
                    }
                },
                upsert=True)
        except Exception as e:
            self.logger.exception(e)