class InsertBysjDataSpider(Spider):
    # spider name
    name = 'InsertBysjData'
    # start URL
    start_urls = ['http://baidu.com/']
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # # Earlier version: seed 60 anonymous user/reader records.
    # def parse(self, response):
    #     for i in range(60):
    #         item = readerItem()
    #         item1 = userItem()
    #         item1["nick"] = "匿名用户" + str(i)
    #         item1["password"] = "******"
    #         item1["head"] = "https://xiaoliwaer.top:525/headerimg/temp.jpg"
    #         item1["usertype"] = "r"
    #         item["nick"] = "匿名用户" + str(i)
    #         item["birthday"] = "1998-01-25"
    #         item["telephone"] = "13656252033"
    #         item["sex"] = "男"
    #         item["work"] = "程序员"
    #         item["likeread"] = "默认"
    #         item["city"] = "广东东莞"
    #         item["intro"] = "无"
    #         yield item1
    #         yield item

    # Handle the response for start_urls: assign a default reader type to every stored reader id.
    def parse(self, response):
        self.db = BLDBHelper()
        readers = self.db.selectreaderid()
        for row in readers:
            readerid = row[0]
            item = readertypeItem()
            item["readertype"] = "默认"
            item["readerid"] = int(readerid)
            item["level"] = "1"
            item["priority"] = "A"
            yield item

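# A minimal sketch of what the readertypeItem definition used above might look like,
# with its fields inferred from the assignments in parse(); the project's real items.py
# is not shown here, so treat this as an assumption rather than the actual definition.
import scrapy


class readertypeItem(scrapy.Item):
    readertype = scrapy.Field()  # preference label, e.g. "默认"
    readerid = scrapy.Field()    # numeric reader id taken from the readers table
    level = scrapy.Field()       # preference level, stored as a string
    priority = scrapy.Field()    # preference priority, e.g. "A"
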
class HaokanspSpider(Spider):
    # spider name
    name = 'videodownload'
    # start URL
    start_urls = ['https://baidu.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # Collect every stored video URL for one category and hand them to the
    # files pipeline as a single item.
    def parse(self, response):
        self.db = BLDBHelper()
        enames = self.db.selectenamebyvideo()
        ename = enames[7][0]  # fixed index: download the eighth category only
        cidt = self.db.selectcidforename(ename)
        cid = cidt[0]
        videos = self.db.selectVideo(cid)

        item = ExamplesItem()
        item["file_urls"] = [row[0] for row in videos]
        item["name"] = ename
        yield item

class VideoPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Issue one download request per video URL in the item,
        # carrying the item along so file_path() can read its name.
        for video_url in item['file_urls']:
            yield Request(video_url, meta={'item': item})

    def file_path(self, request, response=None, info=None):
        # Store each file as "<category name>/<video id>.mp4",
        # looking the video id up in the database by its original URL.
        image_name = request.meta['item']['name']
        videooriginurl = request.url
        self.db = BLDBHelper()
        videoidt = self.db.selectVideobyurl(videooriginurl)
        videoid = str(videoidt[0][0])
        videoname = videoid + '.mp4'
        path = image_name + '/' + videoname
        return path

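# For VideoPipeline to run, the project settings must register it and set a storage root
# (standard Scrapy FilesPipeline configuration). A minimal sketch; the dotted pipeline
# path and the FILES_STORE directory below are assumptions, not taken from this project.
ITEM_PIPELINES = {
    'myproject.pipelines.VideoPipeline': 300,  # assumed module path to the class above
}
FILES_STORE = '/data/videos'  # paths returned by file_path() are created under this root
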
class NewscontentSpider(Spider):
    p = ""
    # spider name
    name = 'newscontent'
    # start URL
    start_urls = ['https://new.qq.com/omn/20200325/20200325A0LQ9J00.html']
    page_num = "0"
    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # Handle the response for start_urls: build a content-page URL for every stored
    # news id (the first 8 characters of the id are the date) and request it.
    def parse(self, response):
        self.db = BLDBHelper()
        newsids = self.db.selectnewsurl()
        for row in newsids:
            contentid = row[0]
            date = contentid[0:8]
            contenturl = "https://new.qq.com/omn/" + date + "/" + contentid + ".html"
            yield scrapy.Request(url=contenturl, callback=self.parse_content_url)

    # Extract the article body (and title) from a content page and yield it as an item.
    def parse_content_url(self, response):
        body = response.body.decode('GBK')
        newcontentp = Selector(text=body).xpath('//div[@class="content-article"]/p').extract()
        newcontenth1 = Selector(text=body).xpath('//div[@class="LEFT"]/h1[1]/text()').extract()
        temh1 = self.p.join(item for item in newcontenth1)
        temp = self.p.join(item for item in newcontentp)
        # Rewrite protocol-relative URLs ("//...") in the article HTML to https.
        temp1 = temp.replace("//", "https://")

        item = newscontentItem()
        url = response.url
        urltemp = url.split('/')[-1]
        item["newsurl"] = urltemp.split('.')[0]
        item["newscontent"] = temp1
        # item["newstitle"] = temh1
        yield item

class HaokanspSpider(Spider):
    # spider name
    name = 'haokansp'
    # start URL
    start_urls = ['https://baidu.com']
    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # Handle the response for start_urls: request the Haokan recommendation feed
    # for every video category name stored in the database.
    def parse(self, response):
        self.db = BLDBHelper()
        enames = self.db.selectenamebyvideo()
        baseurl = "https://haokan.baidu.com/videoui/api/videorec?tab="
        baseurl2 = "&act=pcFeed&pd=pc&num=20&shuaxin_id=1587787443537"
        for row in enames:
            tyasf = row[0]
            yield scrapy.Request(url=baseurl + str(tyasf) + baseurl2,
                                 callback=self.parse_format2_url)

    # Parse the JSON feed: walk data -> response -> video list and yield one VideoItem per video.
    def parse_format2_url(self, response):
        haokanspurllist_dict = json.loads(response.text)
        ename = response.url[50:-49]
        cid = self.db.selectcidforename(ename)
        if haokanspurllist_dict.get("errno") == 0:
            for name, info in haokanspurllist_dict.items():
                if name != "data":
                    continue
                for key, value in info.items():
                    if key != "response":
                        continue
                    for ikey, ivalue in value.items():
                        # ivalue is the list of video entries in the feed
                        for j in ivalue:
                            item = VideoItem()
                            item["videoorigin"] = "hksp"
                            item["cid"] = str(cid[0][0])
                            item["videoctime"] = datetime.datetime.now()
                            for jkey, jvalue in j.items():
                                if jkey == "title":
                                    item["videoname"] = jvalue
                                if jkey == "poster":
                                    item["videoheadurl"] = jvalue
                                if jkey == "url":
                                    item["videooriginurl"] = jvalue + "&"
                                if jkey == "source_name":
                                    item["videokeyword"] = jvalue
                            yield item

class NewsnbalistSpider(Spider):
    # spider name
    name = 'newslist'
    # start URL
    start_urls = ['https://www.qq.com']
    # page_firsturl = 'https://pacaio.match.qq.com/vlike/category?cid=1&num=20&page=0'
    # page_baseurl = 'https://pacaio.match.qq.com/vlike/category?cid=1&num=20&page='
    page_num = "0"
    token = "49cbb2154853ef1a74ff4e53723372ce"
    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'no-cache',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        },
    }

    # Handle the response for start_urls: request the first feed page for every news category name.
    def parse(self, response):
        self.db = BLDBHelper()
        enames = self.db.selectnewsename()
        for row in enames:
            self.tyasf = row[0]
            self.page_firsturl = ("https://pacaio.match.qq.com/irs/rcd?cid=146&token="
                                  + self.token + "&ext=" + self.tyasf + "&page=0")
            yield scrapy.Request(url=self.page_firsturl, callback=self.parse_page_url)

    # Parse one feed page: yield a NewsItem per article, then request pages 4-7 of the same category.
    def parse_page_url(self, response):
        strename = response.url[87:]
        strename1 = strename[:-7]
        cid = self.db.selectcidforename(strename1)
        urllist_dict = json.loads(response.text)
        if urllist_dict.get("datanum") > 0:
            data = urllist_dict.get('data')
            for i in data:
                item = NewsItem()
                item["newsoriginurl"] = i["app_id"]
                item["newstitle"] = i["title"]
                item["cid"] = cid[0][0]
                item["newsorigin"] = "txxw"
                item["newskeyword"] = i["keywords"]
                item["newsheadurl"] = i["img"]
                item["newsctime"] = datetime.datetime.now()
                yield item
            for i in range(4, 8):
                self.page_num = str(i)
                page_url = ("https://pacaio.match.qq.com/irs/rcd?cid=146&token="
                            + self.token + "&ext=" + strename1 + "&page=" + self.page_num)
                yield scrapy.Request(url=page_url, callback=self.parse_page_url)

class NewscontentSpider(Spider):
    p = ""
    # spider name
    name = 'picscontent'
    # start URL
    start_urls = ['https://photo.sina.cn/']
    page_num = "0"
    custom_settings = {
        'CONCURRENT_REQUESTS': 64,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # Handle the response for start_urls: request every stored slideshow URL.
    def parse(self, response):
        self.db = BLDBHelper()
        picsurls = self.db.selectpicsurl()
        for row in picsurls:
            contenturl = row[0]
            yield scrapy.Request(url=contenturl + "?vt=4&hd=1",
                                 callback=self.parse_content_url)

    # Extract the slideshow sections, rewrite lazy-load image attributes, and yield the page content.
    def parse_content_url(self, response):
        newcontentp = Selector(text=response.text).xpath('//section[@class="section-item"]').extract()
        newcontenth1 = Selector(text=response.text).xpath('//h1[1]/text()').extract()
        temh1 = self.p.join(item for item in newcontenth1)
        temp = self.p.join(item for item in newcontentp)
        # Drop the inline placeholder images and promote data-src to src so the real images load.
        temp2 = temp.replace('src="data:', 'id="')
        temp1 = temp2.replace("data-src", "src")

        item = picscontentItem()
        tempurl = response.url
        item["picsurl"] = tempurl.split('?')[0]
        item["picscontent"] = temp1
        yield item

class XltpSpider(Spider):
    # spider name
    name = 'xltp'
    # start URL
    start_urls = ['http://slide.ent.sina.com.cn/']
    page_num = "0"
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0,
        'COOKIES_ENABLED': False,
        'RETRY_TIMES': 15,
        'DEFAULT_REQUEST_HEADERS': {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'cache-control': 'max-age=0',
        },
    }

    # Handle the response for start_urls: build an album-list API URL for every stored
    # channel id ("<ch_id>&<sub_ch>") and request it.
    # Example query string: ch_id=5&sub_ch=mobile&id=SI_Cont&page=1&num=20
    def parse(self, response):
        self.db = BLDBHelper()
        baseurl = "http://api.slide.news.sina.com.cn/interface/api_album.php?activity_size=198_132&size=img&ch_id="
        baseurl1 = "&id=SI_Cont&num=20&page=1"
        channels = self.db.selectenameforxltp()
        for row in channels:
            ch_id = row[0]
            b = str(ch_id).split("&")[0]
            c = str(ch_id).split("&")[1]
            contenturl = baseurl + b + "&sub_ch=" + c + baseurl1
            yield scrapy.Request(url=contenturl, callback=self.parse_content_url)

    # Parse one album-list JSON response and yield a PicsItem per album.
    def parse_content_url(self, response):
        xltp_dict = json.loads(response.text)
        if xltp_dict.get("count") == "20":
            data = xltp_dict.get('data')
            for i in data:
                item = PicsItem()
                item["picsorigin"] = "xltp"
                item["picsctime"] = datetime.datetime.now()
                for jkey, jvalue in i.items():
                    if jkey == "name":
                        item["picsname"] = jvalue
                    if jkey == "img_url":
                        item["picsheadurl"] = jvalue
                    if jkey == "url":
                        # Rewrite the desktop slideshow URL into its mobile album form.
                        strtemp = str(jvalue).split("/")[4]
                        picsurl = "https://photo.sina.cn/" + strtemp.replace("slide", "album")
                        item["picsoriginurl"] = picsurl
                    if jkey == "short_name":
                        item["picskeyword"] = jvalue
                    if jkey == "sub_ch":
                        cid = self.db.selectcidforchname(str(jvalue))
                        item["cid"] = cid[0][0]
                yield item

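# A rough sketch of the BLDBHelper interface these spiders rely on, inferred purely from
# how its return values are indexed above (every select* call is treated as a sequence of
# row tuples); the real helper and its SQL are project-specific and not shown here.
class BLDBHelper:
    def selectreaderid(self): ...              # rows of (readerid,)
    def selectnewsurl(self): ...               # rows of (contentid,) such as "20200325A0LQ9J00"
    def selectnewsename(self): ...             # rows of (news category ename,)
    def selectenamebyvideo(self): ...          # rows of (video category ename,)
    def selectenameforxltp(self): ...          # rows of ("<ch_id>&<sub_ch>",)
    def selectpicsurl(self): ...               # rows of (slideshow url,)
    def selectcidforename(self, ename): ...    # rows of (cid,)
    def selectcidforchname(self, chname): ...  # rows of (cid,)
    def selectVideo(self, cid): ...            # rows of (video url,)
    def selectVideobyurl(self, url): ...       # rows of (video id,)
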