def parse(self, response):
    """Yield one ``educationItem`` per matched ``div`` of the listing page.

    Every item gets a fixed ``museumID`` of 3, the entry title
    (``eduName``), the image URL (``eduImg``) and the description with the
    activity time appended (``eduContent``).

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        educationItem: One populated item per matched ``div``.
    """
    div_list = response.xpath(
        '/html/body/div[1]/div/div[4]/div[2]/div[@class="item clearfix"]')
    for div in div_list:
        item = educationItem()
        item['museumID'] = 3
        item['eduName'] = ''.join(
            div.xpath('./div[5]/div[1]/text()').extract())

        # A missing ``data-src`` attribute becomes '' rather than None, so
        # the prefix check below can never raise on an absent image.
        img = div.xpath('./div[4]/@data-src').extract_first(default='')
        print(img)
        # Only site-relative paths need the host prefix.
        if img.startswith("/"):
            img = "http://www.sstm.org.cn" + img
        print(img)
        item['eduImg'] = img

        # Named ``event_time`` so the stdlib ``time`` module is not shadowed.
        event_time = ''.join(
            div.xpath('./div[5]/div[2]/span[2]/text()').extract())
        cont = ''.join(
            div.xpath('./div[5]/div[3]/div[3]/text()').extract())
        item['eduContent'] = cont + ' 活动时间:' + event_time
        yield item
        # NOTE(review): counter is bumped once per yielded item here; confirm
        # against the other uses of ``self.num`` that per-item (not per-page)
        # counting is what is intended.
        self.num += 1
def parse(self, response):
    """Follow every link found inside the ``train_5`` listing block.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``href`` (prefixed with the site
        host), handled by ``self.parse_content``.
    """
    hrefs = response.xpath(
        "//div[@class='train_5']//div[@class='li']//a/@href").getall()
    for href in hrefs:
        yield scrapy.Request("http://www.zunyihy.cn" + href,
                             callback=self.parse_content)
def parse(self, response):
    """Print the first link text of every ``li`` entry on the page.

    Nothing is yielded or returned; the method only writes to stdout.

    Args:
        response: Page response; queried with XPath.
    """
    # Run with: scrapy crawl education147
    for li in response.xpath('/html/body/div[3]/div[2]/div[2]/div/div/li'):
        # ``extract_first()`` gives None when the entry has no link text.
        print(li.xpath('.//a/text()').extract_first())
def parse(self, response):
    """Follow every link of the ``newslist`` unordered list.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``href`` (prefixed with the site
        host), handled by ``self.parse_content``.
    """
    hrefs = response.xpath('//ul[@class="newslist"]/li/a/@href').getall()
    for href in hrefs:
        yield scrapy.Request("http://www.beilin-museum.com" + href,
                             callback=self.parse_content)
def parse(self, response):
    """Follow every link inside the ``li scaleimg`` blocks.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``href``, resolved against the
        response URL and handled by ``self.parse_content``.
    """
    hrefs = response.xpath("//div[@class='li scaleimg']//a/@href").getall()
    for href in hrefs:
        yield scrapy.Request(response.urljoin(href),
                             callback=self.parse_content)
def parse(self, response):
    """Yield an ``educationItem`` for each ``div`` directly under ``#p_list``.

    Each item carries the link text as ``eduName``, the host-prefixed image
    source as ``eduImg``, the placeholder text ``'暂无'`` as ``eduContent``
    and a fixed ``museumID`` of 12.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        educationItem: One populated item per matched ``div``.
    """
    for entry in response.xpath('//*[@id="p_list"]/div'):
        record = educationItem()

        title_parts = entry.xpath(
            './table/tbody/tr[2]/td/div/a/text()').extract()
        title = ''.join(title_parts)
        print(title)
        record['eduName'] = title

        src_parts = entry.xpath(
            './table/tbody/tr[1]/td/table/tbody/tr/td/a/img/@src').extract()
        # The host is prepended unconditionally, even if no src matched.
        image_url = 'http://www.ssmzd.com' + ''.join(src_parts)
        print(image_url)
        record['eduImg'] = image_url

        record['eduContent'] = '暂无'
        record['museumID'] = 12
        yield record
        # NOTE(review): counter is bumped once per yielded item here; confirm
        # against the other uses of ``self.num``.
        self.num += 1
def parse(self, response):
    """Yield an ``educationItem`` for each ``li`` of the listing.

    The item's ``eduContent`` is the description text followed by
    ``'\\n时间:'`` and the time text; ``eduName`` is the title text,
    ``eduImg`` the host-prefixed image source and ``museumID`` is 13.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        educationItem: One populated item per matched ``li``.
    """
    for entry in response.xpath('/html/body/section[2]/div[2]/div[2]/ul/li'):
        record = educationItem()

        title = ''.join(
            entry.xpath('./div[@class="title"]//text()').extract())
        print(title)

        image_url = 'http://www.wuhouci.net.cn' + ''.join(
            entry.xpath('./div[@class="l"]/img[1]/@src').extract())
        print(image_url)

        summary = ''.join(
            entry.xpath('./div[@class="r"]/div[2]//text()').extract())
        when = ''.join(
            entry.xpath('./div[2]/div[3]/div[2]//text()').extract())
        content = summary + '\n时间:' + when
        print(content)

        record['eduContent'] = content
        record['museumID'] = 13
        record['eduName'] = title
        record['eduImg'] = image_url
        yield record
        # NOTE(review): counter is bumped once per yielded item here; confirm
        # against the other uses of ``self.num``.
        self.num += 1
def parse(self, response):
    """Request the detail page of every ``jchg-cont`` block.

    For each block an ``educationItem`` is created with ``eduImg`` set, and
    a request for the linked page is issued with the item and the detail
    URL stored in ``meta``.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per block that has a link, handled by
        ``self.parse_detail``.
    """
    base_url = "https://cstm.cdstm.cn/jyhd/zkgdjt"
    div_list = response.xpath(
        '/html/body/div[4]/div[3]/div[2]/div/div[2]/div/div[@class="jchg-cont"]'
    )
    for div in div_list:
        href = div.xpath('./a/@href').extract_first()
        if not href:
            # No link means no detail page; requesting a URL built from the
            # text "None" would only produce a bogus request.
            continue

        item = educationItem()
        src = div.xpath('./a/img/@src').extract_first()
        # The first character of the relative path is dropped before the
        # path is appended to the section URL; a missing image becomes ''.
        img = base_url + src[1:] if src else ''
        print(img)
        item['eduImg'] = img

        url_use = base_url + href[1:]
        yield scrapy.Request(url_use,
                             callback=self.parse_detail,
                             meta={
                                 'item': item,
                                 'url': url_use
                             })
def parse(self, response):
    """Print the first entry's link text and request its detail page.

    Args:
        response: Page response; queried with XPath.

    Yields:
        scrapy.Request: A single request for the host-prefixed ``href``
        (only when the link exists), handled by ``self.parse_detail`` with
        a new ``educationItem`` in ``meta``.
    """
    # Both queries target the same anchor element, so its path is built once.
    anchor_xpath = ('/html/body/table[4]/tbody/tr/td[3]/table/tbody/tr[1]/td'
                    '/table[2]/tbody/tr[1]/td/table/tbody/tr/td[1]/a')
    record = educationItem()

    edu_name = response.xpath(anchor_xpath + '/text()').extract_first()
    print(edu_name)

    href_nodes = response.xpath(anchor_xpath + '/@href')
    if not href_nodes:
        return
    detail_url = 'http://www.tengzhoumuseum.com' + href_nodes.extract_first()
    yield scrapy.Request(detail_url,
                         callback=self.parse_detail,
                         meta={'item': record})
def parse(self, response):
    """Print title, abstract and image URL of every entry in the JSON body.

    The response text is decoded as JSON and its ``"data"`` list is walked.
    Nothing is yielded or returned; each entry is printed as a tuple.

    Args:
        response: Object whose ``text`` attribute holds the JSON payload.

    Raises:
        KeyError: If ``"data"`` or one of the per-entry keys is missing.
    """
    for entry in json.loads(response.text)["data"]:
        name = entry["title"]
        description = entry["abstracts"]
        # ``imgurl`` is appended directly to the site host.
        image_url = "https://www.gzchenjiaci.com" + entry["imgurl"]
        print((name, description, image_url))
def parse(self, response):
    """Print title, description and picture URL of every listed entry.

    The response text is decoded as JSON and the list under
    ``["body"]["list"]`` is walked. Nothing is yielded or returned.

    Args:
        response: Object whose ``text`` attribute holds the JSON payload.

    Raises:
        KeyError: If the list path or one of the per-entry keys is missing.
    """
    for entry in json.loads(response.text)["body"]["list"]:
        name = entry["title"]
        description = entry["description"]
        image_url = entry["litPic"]
        print((name, description, image_url))
def parse(self, response):
    """Print title, summary and image URL of every record in the JSON body.

    The response text is decoded as JSON and the list under
    ``["data"]["recordsList"]`` is walked. Nothing is yielded or returned.

    Args:
        response: Object whose ``text`` attribute holds the JSON payload.

    Raises:
        KeyError: If the list path or one of the per-record keys is missing.
    """
    for record in json.loads(response.text)["data"]["recordsList"]:
        name = record["vTitle"]
        description = record["contentSummary"]
        image_url = record["imgUrl"]
        print((name, description, image_url))
def parse(self, response):
    """Print title, image URL and brief description of every listed entry.

    The response text is decoded as JSON and the list under
    ``["data"]["old_list"]`` is walked. Nothing is yielded or returned.

    Args:
        response: Object whose ``text`` attribute holds the JSON payload.

    Raises:
        KeyError: If the list path or one of the per-entry keys is missing.
    """
    for entry in json.loads(response.text)['data']['old_list']:
        name = entry["title"]
        description = entry["brief_desc"]
        # ``list_img`` is appended to the site root, which ends with "/".
        image_url = "http://www.tibetmuseum.com.cn/" + entry["list_img"]
        # Printed order is name, image, description.
        print((name, image_url, description))
def parse_content(self, response):
    """Print name, image URL and description found on a detail page.

    Nothing is yielded or returned; the three values are printed as one
    tuple.

    Args:
        response: Detail page response; queried with XPath.
    """
    image_src = response.xpath("//div[@class='conBox']//img/@src").get()
    # ``get()`` returns None when no image matches; concatenating the host
    # onto None would raise TypeError, so the URL is left as None instead.
    if image_src is None:
        image_url = None
    else:
        image_url = "http://www.sunyat-sen.org" + image_src

    name = response.xpath("//h3/text()").get()

    paragraphs = response.xpath(
        "//div[@class='contentBox']/p/text()").getall()
    # Joining, splitting on whitespace and re-joining removes all whitespace.
    description = "".join("".join(paragraphs).split())

    print((name, image_url, description))
def parse(self, response):
    """Print the activity name and picture URL of every entry in the JSON body.

    The response text is decoded as JSON and its ``"data"`` list is walked.
    Nothing is yielded or returned; name and URL go on separate lines.

    Args:
        response: Object whose ``text`` attribute holds the JSON payload.

    Raises:
        KeyError: If ``"data"`` or one of the per-entry keys is missing.
    """
    for entry in json.loads(response.text)["data"]:
        print(entry["activityName"])
        print(entry["picUrl"])
def parse(self, response):
    """Print a fixed name, image URL and description.

    The response is not read; all three values are literals. Nothing is
    yielded or returned.

    Args:
        response: Unused.
    """
    # Run with: scrapy crawl education136
    name = "徽博研学游"
    img = "http://www.hzwhbwg.com/upfiles/image/15528711260.jpg"
    cont = "随着我国素质教育的全面推进,学生的综合实践活动课越来越受到人们的关注和追捧。其中,研学旅行是研究性学习和旅行体验相结合的校外教育活动和综合实践活动课程,而博物馆因其丰富的文化资源以及独特的文化魅力,渐渐地成为了各类研学旅行的重要目的地之一。黄山市已成为“全国首批研学旅行目的地城市”,中国徽州文化博物馆作为全国唯一全面展现徽州文化的历史文化专题博物馆,吸引了众多大中小学校研学团队前来探秘学习。中国徽州文化博物馆日接待各地各级学生研学团队等观众1500人次以上仍为常态,同学们以班级为单位,在博物馆讲解员和志愿者的带领下,有序地进入各大展厅,欣赏着一件件文物展品,观摩着一处处复古场景,瞻仰着一尊尊人物雕像,聆听着一个个源远流长的徽州历史人文故事,一种肃然起敬的感情油然而生。不少同学纷纷感慨,在中国徽州文化博物馆,琳琅满目、造型各异的展品让人们大开眼界,更为古人精湛的技艺折服。"
    print(name)
    print(img)
    print(cont)
def parse(self, response):
    """Print a (name, image URL, description) tuple per listed image.

    Titles and image sources are collected with two separate XPath queries
    and paired by position. The description is always the literal
    ``"无介绍"``. Nothing is yielded or returned.

    Args:
        response: Listing page response; queried with XPath.
    """
    titles = response.xpath("//span[@class='ct']//a/text()").getall()
    image_urls = response.xpath("//li[@class='clearfix']//img/@src").getall()
    # zip() pairs by position and stops at the shorter list, so a page with
    # more images than titles cannot raise IndexError.
    for name, image_url in zip(titles, image_urls):
        print((name, image_url, "无介绍"))
def parse_content(self, response):
    """Print name, image URL and description found on a detail page.

    Nothing is yielded or returned; the three values are printed as one
    tuple.

    Args:
        response: Detail page response; queried with XPath.
    """
    image_src = response.xpath(
        "//div[@class='dsj-item-detail-content']//img/@src").get()
    # The image source is resolved against the response URL.
    image_url = response.urljoin(image_src)

    name = response.xpath("//p[@class='nbsp-sp-detail-title']/text()").get()

    spans = response.xpath(
        "//div[@class='dsj-item-detail-content']/p/span/text()").getall()
    # Joining, splitting on whitespace and re-joining removes all whitespace.
    description = "".join("".join(spans).split())

    print((name, image_url, description))
def parse(self, response):
    """Print each entry's heading and request its detail page.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``li`` that has a link, handled by
        ``self.parse_detail``.
    """
    li_list = response.xpath(
        '/html/body/div/div[3]/div/div/div[2]/div[2]/div/div/ul/li')
    for li in li_list:
        name = ''.join(li.xpath('./a/div[2]/h3/text()').extract())
        print(name)

        href = li.xpath('./a/@href').extract_first()
        if href is None:
            # No link on this entry; prefixing None would raise TypeError.
            continue
        yield scrapy.Request(url="https://www.tjbwg.com/cn/" + href,
                             callback=self.parse_detail)
def parse(self, response):
    """Print the name and description of every row in the JSON body.

    The response text is decoded as JSON and its ``"Rows"`` list is walked.
    Nothing is yielded or returned; name and description go on separate
    lines.

    Args:
        response: Object whose ``text`` attribute holds the JSON payload.

    Raises:
        KeyError: If ``"Rows"`` or one of the per-row keys is missing.
    """
    for row in json.loads(response.text)["Rows"]:
        print(row["Name"])
        print(row["Describe"])
def parse(self, response):
    """Print the concatenated text of the page's target ``h2`` heading.

    Nothing is yielded or returned; the method only writes to stdout.

    Args:
        response: Page response; queried with XPath.
    """
    heading_parts = response.xpath(
        '/html/body/div[5]/div/div[1]/div/div[1]/h2//text()').extract()
    # All text nodes under the heading are joined without a separator.
    print(''.join(heading_parts))
def parse_content(self, response):
    """Print name, image URL and description found on a detail page.

    Nothing is yielded or returned; the three values are printed as one
    tuple.

    Args:
        response: Detail page response; queried with XPath.
    """
    name = response.xpath(
        "//div[@class='news_conent_two_title']/text()").get()

    body_text = "".join(
        response.xpath("//p[@class='MsoNormal']//text()").getall())
    # Whitespace is removed by split/join; non-breaking spaces are also
    # stripped explicitly.
    description = "".join(body_text.split()).replace("\xa0", "")

    # Pages without an inline image fall back to the site logo path.
    image_src = response.xpath("//p[@class='MsoNormal']/img/@src").get(
        default="images/main_logo.png")
    image_url = "http://www.gzsmzmuseum.cn/" + image_src

    print((name, image_url, description))
def parse(self, response):
    """Print each entry's link text and URL, then request its detail page.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``li`` that has a link, handled by
        ``self.parse_detail`` with a new ``educationItem`` in ``meta``.
    """
    # Run with: scrapy crawl education160
    li_list = response.xpath(
        '/html/body/div[2]/div[2]/div/div[2]/div[2]/div[1]/ul/li')
    for li in li_list:
        name = li.xpath('./a/text()').extract_first()
        print(name)

        href = li.xpath('./a/@href').extract_first()
        if href is None:
            # No link on this entry; prefixing None would raise TypeError.
            continue
        detail_url = 'https://www.shmmc.com.cn' + href
        print(detail_url)

        # Each request carries its own item so that callbacks cannot end up
        # mutating one shared instance.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': educationItem()})
def parse(self, response):
    """Print a (name, image URL, description) tuple per listed entry.

    Titles, descriptions and image sources come from three separate XPath
    queries and are paired by position. Nothing is yielded or returned.

    Args:
        response: Listing page response; queried with XPath.
    """
    # Standard ``//h4`` descendant query (the path previously had a stray
    # third slash).
    titles = response.xpath("//h4/a/text()").getall()
    descriptions = response.xpath("//li[@class='top']/p/text()").getall()
    image_srcs = response.xpath("//li[@class='top']/a/img/@src").getall()

    # zip() stops at the shortest list, so mismatched counts cannot raise
    # IndexError.
    for title, description, src in zip(titles, descriptions, image_srcs):
        # Only the first whitespace-separated token of the title is kept;
        # a blank title yields an empty name.
        title_words = title.split()
        name = title_words[0] if title_words else ""
        compact_description = "".join(description.split())
        image_url = "https://www.gzam.com.cn" + src
        print((name, image_url, compact_description))
def parse(self, response):
    """Print each entry's link text and URL, then request its detail page.

    The ``href`` is used as-is, without a host prefix.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``li`` that has a link, handled by
        ``self.parse_detail``.
    """
    # Run with: scrapy crawl education124
    for li in response.xpath('/html/body/div[1]/div[3]/div/div[2]/ul/li'):
        name = li.xpath('./a/text()').extract_first()
        print(name)

        detail_url = li.xpath('./a/@href').extract_first()
        print(detail_url)
        if detail_url is None:
            # ``scrapy.Request`` needs a string URL; skip link-less entries.
            continue
        yield scrapy.Request(url=detail_url, callback=self.parse_detail)
def parse(self, response):
    """Print the bold link text of the first table entry on the page.

    Nothing is yielded or returned; the method only writes to stdout.

    Args:
        response: Page response; queried with XPath.
    """
    edu_name = response.xpath(
        '/html/body/div[6]/div[4]/div[1]/div[9]/div/table/tbody/tr[1]/td/table[2]/tbody/tr[1]/td[2]/a/b/text()'
    ).extract_first()
    # ``extract_first()`` gives None when the path matches nothing.
    print(edu_name)
def parse(self, response):
    """Print each entry's link text and request its detail page.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``li`` that has a link, handled by
        ``self.parse_detail`` with a new ``educationItem`` in ``meta``.
    """
    for li in response.xpath('/html/body/div[3]/div/div[2]/ul/li'):
        edu_name = li.xpath('./a/text()').extract_first()
        print(edu_name)

        href = li.xpath('./a/@href').extract_first()
        if href is None:
            # No link on this entry; prefixing None would raise TypeError.
            continue
        detail_url = 'http://museum.linyi.cn/' + href

        # Each request carries its own item so that callbacks cannot end up
        # mutating one shared instance.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': educationItem()})
def parse(self, response):
    """Print each entry's title and image URL, then request its detail page.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``div`` that has a link, handled by
        ``self.parse_detail`` with a new ``educationItem`` in ``meta``.
    """
    for div in response.xpath('//*[@id="c"]/div'):
        edu_name = div.xpath('./div[3]/a/@title').extract_first()
        print(edu_name)

        href = div.xpath('./div[3]/a/@href').extract_first()
        if href is None:
            # No link on this entry; prefixing None would raise TypeError.
            continue
        detail_url = 'http://museum.sdu.edu.cn/' + href

        img_src = div.xpath('./div[2]/a/img/@src').extract_first()
        # A missing image is reported as None instead of raising.
        edu_img = ('http://museum.sdu.edu.cn' + img_src
                   if img_src is not None else None)
        print(edu_img)

        # Each request carries its own item so that callbacks cannot end up
        # mutating one shared instance.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': educationItem()})
def parse(self, response):
    """Request the detail page linked from each listing ``div``.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``div`` whose first anchor has an
        ``href``, handled by ``self.parse_detail`` with a new
        ``educationItem`` in ``meta``.
    """
    for div in response.xpath('/html/body/div[5]/div/div/div[2]/div'):
        # Query the href once; None means the div has no usable link.
        href = div.xpath('./a[1]/@href').extract_first()
        if href is None:
            continue
        detail_url = 'http://www.qdyzyzmuseum.com' + href

        # Each request carries its own item so that callbacks cannot end up
        # mutating one shared instance.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': educationItem()})
def parse(self, response):
    """Print each entry's link text and URL, then request its detail page.

    Args:
        response: Listing page response; queried with XPath.

    Yields:
        scrapy.Request: One request per ``li`` that has a link, handled by
        ``self.parse_detail`` with a new ``educationItem`` in ``meta``.
    """
    # Run with: scrapy crawl education130
    for li in response.xpath('//*[@class="articleBox_list"]/ul/li'):
        name = li.xpath('./a/text()').extract_first()
        print(name)

        href = li.xpath('./a/@href').extract_first()
        if href is None:
            # No link on this entry; prefixing None would raise TypeError.
            continue
        detail_url = 'http://www.qzhjg.cn' + href
        print(detail_url)

        # Each request carries its own item so that callbacks cannot end up
        # mutating one shared instance.
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             meta={'item': educationItem()})