def __init__(self):
    super().__init__()
    self.hashs = HashTool()
    # Seed page 1, then append history pages 2-13.
    self.start_urls = ['https://www.qiushibaike.com/history/page/1/']
    for i in range(2, 14):
        self.start_urls.append(
            'https://www.qiushibaike.com/history/page/{}/'.format(i))
    print(self.start_urls)
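# HashTool is not defined in this file; the following is a minimal,
# hypothetical stand-in inferred from the get_hash() calls below. It
# assumes the `simhash` package (pip install simhash) backs the
# fingerprinting -- the actual project implementation may differ.
from simhash import Simhash


class HashTool:
    def get_hash(self, content):
        # 64-bit simhash fingerprint of the text, used for
        # near-duplicate detection across the spiders.
        return Simhash(content).value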
import json

import requests


def get_page(i):
    hashs = HashTool()
    headers = {
        # 'GET': '/article/list/text?count=30&page=1 HTTP/1.1',
        'Host': 'm2.qiushibaike.com',
        # 'Source': 'ios_11.8.0',
        # 'Accept': '*/*',
        # 'app': '1',
        'Uuid': 'ios_1e111a1d65d34295b21321e18671b97e',
        # 'screen': '414,736',
        # 'qbaid': 'D5B0B037-0DF9-410C-B3E9-4C1AB8A55C6B',
        # 'User-Agent': 'QiuBai/11.8.0 rv:31 (iPhone; iOS 11.1; zh_CN) PLHttpClient/1_WIFI',
        # 'Accept-Language': 'zh-Hans-CN;q=1',
        # 'Accept-Encoding': 'br, gzip, deflate',
        # 'Connection': 'keep-alive',
    }
    # The bare IP serves the mobile API, so the TLS certificate cannot
    # match and verification is disabled.
    url = 'https://119.29.47.97/article/list/text?count=30&page=' + str(i)
    response = requests.get(url=url, headers=headers, verify=False)
    data = json.loads(response.text)
    for item in data['items']:
        item_dic = {}
        content = item['content']
        article_id = item['id']
        item_dic['tag'] = item['format']
        item_dic['content'] = content
        item_dic['comments'] = item['comments_count']
        # item_dic['types'] = types
        item_dic['oid'] = article_id
        # 'down' is negated so unlikes is stored as a positive count.
        item_dic['unlikes'] = -item['votes']['down']
        item_dic['likes'] = item['votes']['up']
        item_dic['title'] = ''
        item_dic['shares'] = item['share_count']
        item_dic['url'] = 'https://www.qiushibaike.com/article/' + str(article_id)
        item_dic['simhash'] = hashs.get_hash(content)
        item_dic['platform'] = 1
        item_dic['weight'] = 0
        # Optional user fields, kept commented out as in the original:
        # item_dic['astrology'] = item['user']['astrology']
        # item_dic['user_id'] = item['user']['uid']
        # item_dic['gender'] = item['user']['gender']
        # item_dic['age'] = item['user']['age']
        print(content)
        save_sql(item_dic)
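# A minimal driver for get_page, shown as a sketch: the 13-page range
# mirrors the web history section above and is an assumption about how
# far the mobile API paginates.
def crawl_history_api(pages=13):
    import urllib3

    # verify=False above raises an InsecureRequestWarning per request;
    # silence it for the session.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    for page in range(1, pages + 1):
        get_page(page)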
class QiuBaiFreshSpider(scrapy.Spider):
    name = 'qbfreshspider'

    def __init__(self):
        super().__init__()
        self.hashs = HashTool()
        # Seed page 1, then append "textnew" pages 2-31.
        self.start_urls = ['https://www.qiushibaike.com/textnew/page/1/']
        for i in range(2, 32):
            self.start_urls.append(
                'https://www.qiushibaike.com/textnew/page/{}/'.format(i))
        print(self.start_urls)

    def parse(self, response):
        flag = 1
        print('reached parse')
        print(response.body)
        bodys = response.xpath(
            '//div[@class="article block untagged mb15"]').extract()
        print(bodys)
        for body in bodys:
            content = re.findall('<div class="content">.*?<span>(.*?)</span>',
                                 body, re.S)
            actor = re.findall('<h2>(.*?)</h2>', body, re.S)
            oid = re.findall('<a href="/article/(.*?)" target="_blank" ',
                             body, re.S)
            item = ShortnewsInfoItem()
            item['oid'] = oid[0]
            item['title'] = actor[0].replace('\n', '')
            try:
                item['content'] = content[0].replace('\n', '')
            except IndexError:
                item['content'] = '暂无'  # placeholder: "no content"
            item['platform'] = flag
            item['tag'] = 'Fresh'
            item['url'] = 'https://www.qiushibaike.com/article/' + item['oid']
            item['simhash'] = self.hashs.get_hash(item['content'])
            print(item)
            yield item
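# Hypothetical definition of the ShortnewsInfoItem the spiders fill in.
# The field set is inferred from the keys assigned above and in
# get_page, not taken from the project's actual items.py.
import scrapy


class ShortnewsInfoItem(scrapy.Item):
    oid = scrapy.Field()       # source article id
    title = scrapy.Field()     # poster name (from <h2>)
    content = scrapy.Field()   # post text
    platform = scrapy.Field()  # source platform flag (1 = qiushibaike)
    tag = scrapy.Field()       # section label, e.g. 'Fresh' or '24hours'
    url = scrapy.Field()       # canonical article URL
    simhash = scrapy.Field()   # fingerprint from HashTool.get_hash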
class QiuBai24HourSpider(scrapy.Spider):
    name = 'qb24hourspider'

    def __init__(self):
        super().__init__()
        self.hashs = HashTool()
        # Seed page 1, then append hot pages 2-13.
        self.start_urls = ['https://www.qiushibaike.com/hot/page/1/']
        for i in range(2, 14):
            self.start_urls.append(
                'https://www.qiushibaike.com/hot/page/{}/'.format(i))
        print(self.start_urls)

    def parse(self, response):
        # Match every post container whose id starts with "qiushi_tag_".
        bodys = response.xpath(
            '//div[re:test(@id, "qiushi_tag_.*?")]').extract()
        flag = 1
        i = 0
        for body in bodys:
            i = i + 1
            content = re.findall('<div class="content">\n<span>(.*?)</span>',
                                 body, re.S)
            actor = re.findall('<h2>(.*?)</h2>', body, re.S)
            oid = re.findall('<a href="/article/(.*?)" target="_blank" ',
                             body, re.S)
            item = ShortnewsInfoItem()
            item['oid'] = oid[0]
            item['title'] = actor[0].replace('\n', '')
            try:
                item['content'] = content[0].replace('\n', '')
            except IndexError:
                item['content'] = '暂无'  # placeholder: "no content"
            item['platform'] = flag
            item['tag'] = '24hours'
            item['url'] = 'https://www.qiushibaike.com/article/' + item['oid']
            item['simhash'] = self.hashs.get_hash(item['content'])
            print(item)
            yield item
        print(i)  # number of posts parsed on this page
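# One way to run both Scrapy spiders in a single process; a sketch
# using CrawlerProcess as an alternative to the `scrapy crawl
# qbfreshspider` / `scrapy crawl qb24hourspider` CLI invocations.
from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(QiuBaiFreshSpider)
    process.crawl(QiuBai24HourSpider)
    process.start()  # blocks until both crawls finish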