def parse(self, response):
    """Old XPath-based extraction, kept for reference:
    hatype="dota2"
    vkey="longzhu"
    root_path='//div[@class="list-con"]/a'
    title_path='./h3[@class="listcard-caption"]/text()'
    href_path='.//@href'
    img_path='.//img/@src'
    author_path='.//strong/text()'
    audian_path='.//span[@class="livecard-meta-item-text"]/text()'
    url_prefix=""
    SpiderPathUrl.process(response,root_path,title_path,href_path,img_path,author_path,audian_path,url_prefix,hatype,vkey)
    """
    # The Longzhu endpoint returns JSONP; strip the callback wrapper
    # "_callbacks_._36bxu1(...)" to recover plain JSON.
    responseStr = response.body_as_unicode()
    tripResponse = responseStr.replace("_callbacks_._36bxu1(", '')
    index = tripResponse.rfind(")")
    print("position of last ')': " + str(index) + "\n")
    tripResponse = tripResponse[0:index]
    resultJson = json.loads(tripResponse)
    title_j = "channel.name"
    href_j = "channel.url"
    img_j = "preview"
    author_j = "channel.status"
    audian_j = "viewers"
    vkey = "channel.domain"
    url_prefix = ""
    vType_j = "game.0.name"
    SpiderPathUrl.processJSON(resultJson["data"]["items"], title_j, href_j, img_j,
                              author_j, audian_j, url_prefix, vType_j, vkey, "longzhu", "")
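# The callback name in the Longzhu JSONP wrapper ("_callbacks_._36bxu1") looks
# session-specific, so hard-coding it in replace() is fragile. A minimal sketch
# of a more robust unwrapping helper -- strip_jsonp is a hypothetical name, not
# part of SpiderPathUrl -- assuming the payload is one callback call around a
# single JSON value:
import json
import re

def strip_jsonp(body):
    """Strip a JSONP wrapper like 'callbackName({...})' and parse the JSON."""
    match = re.match(r'^\s*[\w.$]+\s*\((.*)\)\s*;?\s*$', body, re.DOTALL)
    if match is None:
        raise ValueError("response does not look like JSONP")
    return json.loads(match.group(1))

# Usage: resultJson = strip_jsonp(response.body_as_unicode())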
def parse(self, response):
    """Old XPath-based extraction, kept for reference:
    hatype="dota2"
    vkey="zhanqi"
    root_path='//div[@class="live-list-tabc active"]/ul/li'
    #root_path='//div[@class="live-list-tabc tabc js-room-list-tabc"]/ul/li'
    title_path='.//span[@class="name"]/text()'
    href_path='.//a/@href'
    img_path='.//img/@src'
    author_path='.//span[@class="anchor anchor-to-cut dv"]/text()'
    audian_path='.//span[@class="dv"]/text()'
    url_prefix="http://www.zhanqi.tv"
    SpiderPathUrl.process(response,root_path,title_path,href_path,img_path,author_path,audian_path,url_prefix,hatype,vkey)
    """
    # The Zhanqi API returns plain JSON; map the room fields directly.
    resultJson = json.loads(response.body_as_unicode())
    title_j = "title"
    href_j = "url"
    img_j = "bpic"
    author_j = "nickname"
    audian_j = "online"
    vkey = "id"
    url_prefix = "http://www.zhanqi.tv"
    vType_j = "gameName"
    SpiderPathUrl.processJSON(resultJson["data"]["rooms"], title_j, href_j, img_j,
                              author_j, audian_j, url_prefix, vType_j, vkey, "zhanqi", "")
def parse(self, response):
    # Old XPath-based extraction, kept for reference:
    # hatype="dota2"
    # vkey="panda"
    # root_path='//ul[@class="video-list clearfix"]/li'
    # root_path='//div[@class="list-container"]/ul/li'
    # title_path='.//div[@class="video-title"]/text()'
    # href_path='.//a/@href'
    # img_path='.//img/@data-original'
    # author_path='.//span[@class="video-nickname"]/text()'
    # audian_path='.//span[@class="video-number"]/text()'
    # url_prefix="http://www.panda.tv"
    # SpiderPathUrl.process(response,root_path,title_path,href_path,img_path,author_path,audian_path,url_prefix,hatype,vkey)

    # Panda TV API response; nested fields use dotted paths (e.g. "pictures.img").
    resultJson = json.loads(response.body_as_unicode())
    title_j = "name"
    href_j = "id"
    img_j = "pictures.img"
    author_j = "userinfo.nickName"
    audian_j = "person_num"
    vkey = "id"
    url_prefix = "http://www.panda.tv/"
    vType_j = "classification.cname"
    SpiderPathUrl.processJSON(resultJson["data"]["items"], title_j, href_j, img_j,
                              author_j, audian_j, url_prefix, vType_j, vkey, "panda", "")
def parse(self, response):
    # Quanmin API: the room list sits directly under "data".
    resultJson = json.loads(response.body_as_unicode())
    title_j = "title"
    href_j = "slug"
    img_j = "thumb"
    author_j = "nick"
    audian_j = "view"
    vkey = "slug"  # could also use "uid"
    vType_j = "category_name"
    url_prefix = "http://www.quanmin.tv/v/"
    SpiderPathUrl.processJSON(resultJson["data"], title_j, href_j, img_j,
                              author_j, audian_j, url_prefix, vType_j, vkey, "quanming", "")
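# The field names handed to SpiderPathUrl.processJSON above use dotted paths
# ("channel.name", "pictures.img", even a numeric index in "game.0.name"),
# which implies a nested lookup. The real processJSON is not shown in this
# file, so the resolver below is only a sketch of how such paths might be
# walked:
def resolve_path(item, path):
    """Resolve a dotted path like "userinfo.nickName" or "game.0.name"
    against nested dicts and lists."""
    current = item
    for part in path.split("."):
        if isinstance(current, list):
            current = current[int(part)]  # a numeric segment indexes a list
        else:
            current = current[part]
    return current

# Example with a Panda-style item:
# resolve_path({"pictures": {"img": "cover.jpg"}}, "pictures.img") -> "cover.jpg"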
def parse(self, response):
    # Huya serves the room list as HTML; extract each field with XPath.
    #root_path='//div[@id="live-list-content"]/ul/li'
    root_path = '//div[@class="video-unit"]/ul[@class="video-list"]/li'
    title_path = './/div[@class="all_live_tit"]/a/text()'
    href_path = './/a/@href'
    img_path = './/span[@class="txt all_live_txt"]/span/img/@src'
    author_path = './/a/img/@alt'
    audian_path = './/span[@class="txt all_live_txt"]/span[@class="num"]/i/text()'
    url_prefix = ""
    vtype = './/a/@eid_desc'
    vkey = href_path  # the room href doubles as the unique key
    SpiderPathUrl.process(response, root_path, title_path, href_path, img_path,
                          author_path, audian_path, url_prefix, vtype, vkey, "huya", "")
def parse(self, response):
    # Douyu serves the room list as HTML; every room sits in an <li> element.
    #root_path='//div[@id="live-list-content"]/ul/li'
    root_path = '//li'
    title_path = './/h3[@class="ellipsis"]/text()'
    href_path = './/a/@href'
    img_path = './/img/@data-original'
    author_path = './/span[@class="dy-name ellipsis fl"]/text()'
    audian_path = './/span[@class="dy-num fr"]/text()'
    url_prefix = "http://www.douyu.com/"
    vtype = './/span[@class="tag ellipsis"]/text()'
    vkey = './/@data-rid'
    SpiderPathUrl.process(response, root_path, title_path, href_path, img_path,
                          author_path, audian_path, url_prefix, vtype, vkey, "douyu", "")
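# SpiderPathUrl.process itself is not shown in this file. A plausible sketch,
# assuming it selects every root node and then evaluates each field XPath
# relative to that node (process_sketch is a hypothetical name, not the
# project's actual implementation):
def process_sketch(response, root_path, title_path, href_path, img_path,
                   author_path, audian_path, url_prefix, vtype, vkey,
                   site, extra):
    for node in response.xpath(root_path):
        yield {
            "site": site,
            "title": node.xpath(title_path).extract_first(default="").strip(),
            "url": url_prefix + (node.xpath(href_path).extract_first() or ""),
            "img": node.xpath(img_path).extract_first(),
            "author": node.xpath(author_path).extract_first(),
            "audience": node.xpath(audian_path).extract_first(),
            "type": node.xpath(vtype).extract_first(),
            "key": node.xpath(vkey).extract_first(),
        }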
# Example parse/parse_question pattern from the Scrapy tutorial, kept for reference:
# def parse(self, response):
#     for href in response.css('.question-summary h3 a::attr(href)'):
#         full_url = response.urljoin(href.extract())
#         yield scrapy.Request(full_url, callback=self.parse_question)
#
# def parse_question(self, response):
#     yield {
#         'title': response.css('h1 a::text').extract()[0],
#         'votes': response.css('.question .vote-count-post::text').extract()[0],
#         'body': response.css('.question .post-text').extract()[0],
#         'tags': response.css('.question .post-tag::text').extract(),
#         'link': response.url,
#     }

process = CrawlerProcess(get_project_settings())

# Open the result database and clear results from the previous crawl.
SpiderPathUrl.opendb()
SpiderPathUrl.clearDb()
# SpiderPathUrl.commitdb()
# SpiderPathUrl.opendb()

#process.crawl(douyuStar)
process.crawl(huya)
process.crawl(douyu)
process.crawl(panda)
process.crawl(zhanqi)
process.crawl(quanming)
process.crawl(longzhu)
process.start()
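# SpiderPathUrl.opendb/clearDb/commitdb are also not defined in this file.
# A minimal sqlite3 sketch of what such helpers could look like -- class name,
# table, and schema are all assumptions, not the project's actual code:
import sqlite3

class SpiderDbSketch:
    conn = None

    @classmethod
    def opendb(cls, path="rooms.db"):
        cls.conn = sqlite3.connect(path)
        cls.conn.execute(
            "CREATE TABLE IF NOT EXISTS rooms ("
            "site TEXT, title TEXT, url TEXT, img TEXT, "
            "author TEXT, audience TEXT, vtype TEXT, vkey TEXT)")

    @classmethod
    def clearDb(cls):
        # Wipe the previous crawl's rows before a fresh run.
        cls.conn.execute("DELETE FROM rooms")
        cls.conn.commit()

    @classmethod
    def commitdb(cls):
        cls.conn.commit()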