def parse(self, response):
    """Yield the exhibition shown on this page, then follow the other tabs.

    The selected tab (``content_head_item onfocus``) is scraped directly from
    ``response`` and yielded as an item.  Every ``content_head_item `` tab is
    then requested and handed to ``self.parse_detail`` together with a
    partially filled item in ``meta['item']``.

    Missing nodes do not abort the callback: absent text becomes ``""``, an
    absent picture becomes ``None`` and tabs without a link are skipped.
    """
    item = exhibition75Item()
    item["museumID"] = 48

    style = response.xpath("//div[@id='img1']/@style").extract_first()
    if style is None:
        item["exhibition_picture"] = None
    else:
        # NOTE(review): the slice drops the first 22 characters and the last
        # character of the style value -- presumably a
        # ``background-image: url(`` wrapper and its closing ``)``; confirm
        # against the live markup.
        item["exhibition_picture"] = (
            'https://www.yzmuseum.com' + style.strip()[22:-1]
        )

    item["exhibitionTheme"] = response.xpath(
        "//div[@class='content_head_item onfocus']/a/text()"
    ).extract_first(default="").strip()

    paragraphs = response.xpath(
        "//div[@class='content_text']/p/text()").extract()
    item["exhibitionIntroduction"] = "".join(paragraphs).strip()
    yield item

    for tab in response.xpath("//div[@class='content_head_item ']"):
        href = tab.xpath("./a/@href").extract_first()
        if href is None:
            # Nothing to request for a tab without a link.
            continue

        # A new item is built for every tab, so it can be passed to the
        # callback as-is without copying.
        tab_item = exhibition75Item()
        tab_item["museumID"] = 48
        tab_item["exhibitionTheme"] = tab.xpath(
            "./a/text()").extract_first(default="").strip()

        # Process the detail page.
        yield scrapy.Request(
            'https://www.yzmuseum.com/' + href.strip(),
            callback=self.parse_detail,
            meta={'item': tab_item},
        )
def parse(self, response):
    """Yield a single exhibition item scraped from the page.

    Theme, picture and introduction are read from fixed locations under the
    ``exhibition exhibitionD`` container.  A missing picture node yields
    ``None`` for ``exhibition_picture`` instead of raising ``TypeError`` on
    the string concatenation.
    """
    item = exhibition75Item()
    item["museumID"] = 15

    item["exhibitionTheme"] = response.xpath(
        "//div[@class='mainC maxW1100']/div[@class='exhibition exhibitionD']"
        "/div[@class='tit_menu_ex']/h3/text()"
    ).extract_first()

    src = response.xpath(
        "//div[@class='mainC maxW1100']/div[@class='exhibition exhibitionD']"
        "/div[@class='exh_us clearfix']/div[@class='exhUs_l']"
        "/div[@class='img']/img/@src"
    ).extract_first()
    # NOTE(review): the prefix carries no URL scheme, so the stored value is
    # not an absolute URL -- confirm whether consumers expect that.
    item["exhibition_picture"] = (
        ('www.tjbwg.com' + src) if src is not None else None
    )

    item["exhibitionIntroduction"] = response.xpath(
        "//div[@class='mainC maxW1100']/div[@class='exhibition exhibitionD']"
        "/div[@class='exh_us clearfix']/div[@class='exhUs_r']/p/text()"
    ).extract_first()

    yield item
def parse(self, response):
    """Yield five hard-coded exhibition items with ``museumID`` 14.

    Nothing is read from ``response``; theme, picture URL and introduction
    come from the parallel tuples below, combined position by position.
    """
    themes = (
        '清代王府文化展',
        '恭王府历史沿革展',
        '恭王府博物馆馆史展',
        '《红楼梦》与恭王府',
        '神音禅韵——恭王府宗教生活展',
    )
    pictures = (
        'http://www.pgm.org.cn/pgm/qdwfzzljj/201810/d95a77520ccd48309dc71541de698ca0/images/d87ba0327b744684b315c52770cf1d5e.jpg',
        'http://www.pgm.org.cn/pgm/hlmzljjf/201810/c528df40fe9449c291c290623954fac7/images/cc8815729076447e930b7071d5e33267.jpg',
        'http://www.pgm.org.cn/pgm/gwfbwgzljj/201810/27d838a1b65245e6a914cf3fb09a679b/images/862b31a5d09e4efaa6ac041feac22e5e.jpg',
        'http://www.pgm.org.cn/pgm/hlmzljj/201810/199e2cd0555d4f52bc7a483dcea21801/images/eb68f11f2b3b4865a3cfdc4cbaceff5e.jpg',
        'http://www.pgm.org.cn/pgm/sycyzljj/201810/4cf7f78dc13e44108272222488e010f2/images/44442f63028942dda374b111f521d8c8.jpg',
    )
    descriptions = (
        '展览分为三大部分,第一部分为“清代的封爵制度”,第二部分为“王府的建筑和规制”,第三部分为“身系国家的大清王公”。让观众从封爵制度、王府和王公的政治、军事、外交作用来初步了解清代王府文化。',
        '展览分为“和珅”、“恭亲王奕訢”、“私属皇室宅园”、“公共文化空间”四部分,全面介绍恭王府曾经作为和珅宅、和孝公主府、庆王府、恭王府、辅仁大学和现代文化空间的历史沿革情况。展览同时展示:天花脊檩彩绘及地面旧有金砖。葆光室正殿天花彩绘保存完整,傅彩古雅,其中脊檩部分为清中期包袱式宋锦图案苏式彩绘,是难得一见的真迹。',
        '在恭王府全面开放十周年之际,全新改版的“恭王府博物馆馆史展”与观众见面。展览在葆光室东西两个配殿分别讲述恭王府博物馆的历史进程与业务建设,让馆史展成为公众了解恭王府240年历史脉络的窗口、展示恭王府博物馆从修缮队到国家一级博物馆的40年筚路蓝缕奋斗历程的平台。',
        '《红楼梦》是中国传统文化的优秀代表,在世界文学史上占有重要地位。曹雪芹在《红楼梦》中所描绘的荣宁二府及大观园,引起了诸多读者的探索兴趣。恭王府是我国现今保存最完好的王府,与《红楼梦》渊源甚深,红学新时期以来,更成为《红楼梦》研究的重镇。一个是“名著”,一个是“名府”,二者竟有着深远而微妙的关系。展览分为“曹家与北京王府”、“《红楼梦》与恭王府”、“大观园‘原型’之谜”三大版块,通过大量翔实、有趣的文献和文物资料,向大家述说一段不寻常的历史与文学的渊源。',
        '展览通过历史资料、遗迹遗存照片和部分实物的展示,全面、客观地介绍了清代恭亲王奕䜣时期,恭王府内的宗教活动情况,包括府主人对萨满教、佛教及其他宗教的信仰和膜拜。',
    )

    for theme, picture, description in zip(themes, pictures, descriptions):
        exhibition = exhibition75Item()
        exhibition["museumID"] = 14
        exhibition["exhibitionTheme"] = theme
        exhibition["exhibition_picture"] = picture
        exhibition["exhibitionIntroduction"] = description
        yield exhibition
def parse(self, response):
    """Yield seven hard-coded exhibition items with ``museumID`` 21.

    Nothing is read from ``response``; theme, picture URL and introduction
    come from the parallel tuples below, combined position by position.
    """
    themes = (
        '文明摇篮',
        '夏商踪迹',
        '晋国霸业',
        '民族熔炉',
        '佛风遗韵',
        '戏曲故乡',
        '明清晋商',
    )
    pictures = (
        'http://www.shanximuseum.com/Uploads/Picture/2019/08/14/s5d53e2be8b97c_1510_848_225_190.jpg',
        'http://www.shanximuseum.com/Uploads/Picture/2019/04/19/s5cb99aab2fec6_792_445_20_0.jpg',
        'http://www.shanximuseum.com/Uploads/Picture/2019/08/14/s5d53e3a5e27ca.jpg',
        'http://www.shanximuseum.com/Uploads/Picture/2019/08/14/s5d53e427d93f6.jpg',
        'http://www.shanximuseum.com/Uploads/Picture/2019/08/14/s5d53e467c321d.jpg',
        'http://www.shanximuseum.com/Uploads/Picture/2019/08/14/s5d53e48a8af8d.jpg',
        'http://www.shanximuseum.com/Uploads/Picture/2019/08/14/s5d53e4b6358ed.jpg',
    )
    descriptions = (
        '母亲河九曲如龙,奔流向东。在其最大最急的转弯处,有一个叫“西侯度”的小山村。180万年前,这里的人们制造出中国最早的石器工具,燃起了中国第一堆文明之火。沧海桑田,生生不息。古人类艰难的行踪踏遍了太行和吕梁之间,不灭的篝火闪亮在汾水与桑干两岸。新石器时代,先民们创造的灿烂文化,遍布山西南北。涓涓细流,百川归海;星散古国,辐辏升华。塔儿山下,尘封4500年后重见天日的城市、宫殿、文字、铜器、“礼器”、观象台和中华民族的精神图腾——龙,昭示着我们的祖先跨进文明之门。最早的“中国”人从这里出发,走向未来。',
        '公元前21世纪,中国历史进入文明时代。“父传子、家天下”的第一个王朝——夏朝建立。山西南部古有“夏墟”之称,夏文化遗存,分布密集,灿若繁星。“东下冯遗址”的发现,清楚地表明晋南是夏文化的中心区域之一。继夏而起的商朝,是中国有文字记载的历史的开端,国家体制趋于完备,文明程度更高。山西南部发现的商代早期青铜重器和完整城池,表明这里是商王朝的经略要地。商时期,山西中西部吕梁山一线属于各部族“方国”领域。这些方国与华夏民族长期交往,深受影响,文化丰富多彩而独具地域特色。时至今日,我们还无缘窥其全貌。但他们留在黄土地上时断时续的踪迹,却也清晰可辨。',
        '3000多年前,武王克商,西周建立,分封诸侯,屏藩王室。成王“桐叶封弟”,叔虞入主唐国,其子燮父改号为“晋”。其后励精图治,开疆拓土,逐渐强盛。周室东迁,文侯首功;践土会盟,文公称霸;其后纵横中原,九合诸侯,成就百年霸业。晋国鼎盛时期,地跨晋、豫、冀、鲁,疆域辽阔。春秋晚期,公室衰落,六卿专权,最终导致“三家分晋”。韩、赵、魏变法图强,称雄战国。晋国六百年伟业,奠定了山西历史文化的基石。晋南是晋国的始封地和中心区域,遗存丰厚。“曲村—天马遗址”为晋国早期都城,“晋侯墓地”震动学界。“侯马晋国遗址”是晋国晚期都城——新田,“铸铜遗址”和“侯马盟书”名扬中外。中部的“晋阳古城”则是晋国末期执掌政柄的赵简子的政治军事基地,后来成为赵国的初期都城,“赵卿大墓”气势恢宏,新人耳目。',
        '山西北通塞外草原,南临中原腹地,不仅极具军事战略价值,而且是农耕社会与草原民族交汇的前沿地带,成为华夏各民族和文化交融的“大熔炉”',
        '佛教是世界三大宗教之一。两汉之际传入中国。南北朝社会动荡,佛教成为乱世百姓的精神寄托,经帝王显贵推崇,炽烈传播,隋唐达到极盛。与之相应,佛教艺术发展迅速,辉煌迭现。佛教作为中国雕塑艺术创作的主要题材,历时漫长。石雕和彩塑佛像,金铜造像,经久不衰。',
        '中国戏曲起源于原始宗教仪式中祈福或酬神的歌舞。千百年后,由于滑稽戏和说唱等艺术形式的加盟,最终形成唱、念、做、打的综合表演形式——戏曲。同古希腊戏剧和印度梵剧一样,成为世界古代文明中的艺术瑰宝。',
        '明朝初年(14世纪),山西商人以明朝北部边塞巨大的军事需求和“开中”盐法的推行为契机,开始经营粮和盐,渐渐崛起。其后不断扩大经营地域。二三百年间,足迹遍及全中国,采取多种经营,开拓对外贸易。“晋商”称富海内,名闻天下,成为中国明清时代最重要的商帮之一。',
    )

    for theme, picture, description in zip(themes, pictures, descriptions):
        exhibition = exhibition75Item()
        exhibition["museumID"] = 21
        exhibition["exhibitionTheme"] = theme
        exhibition["exhibition_picture"] = picture
        exhibition["exhibitionIntroduction"] = description
        yield exhibition
def parse(self, response):
    """Yield seven hard-coded exhibition items with ``museumID`` 22.

    Nothing is read from ``response``.  Each item's introduction text is the
    same string as its theme; the picture URLs are listed alongside.
    """
    themes = (
        '洗煤厂设备流程模拟',
        '煤炭发电原理',
        '煤炭与蒸气机的发明',
        '矿井的大脑·调度中心',
        '5米长的硅化木',
        '砌起的模拟矿井·运输大巷',
        '为开采光明的人提供光明·矿灯充电处',
    )
    pictures = (
        'http://www.coalmus.org.cn/UploadFiles/2016-01/20074222312325613.jpg',
        'http://www.coalmus.org.cn/UploadFiles/2016-01/200742222313582736.jpg',
        'http://www.coalmus.org.cn/UploadFiles/2016-01/200742222325298119.jpg',
        'http://www.coalmus.org.cn/UploadFiles/2016-01/20074231334890322.jpg',
        'http://www.coalmus.org.cn/UploadFiles/2016-01/200742022233428746.jpg',
        'http://www.coalmus.org.cn/UploadFiles/2016-01/2007423126897976.jpg',
        'http://www.coalmus.org.cn/UploadFiles/2016-01/20074231322143122.jpg',
    )
    descriptions = (
        '洗煤厂设备流程模拟',
        '煤炭发电原理',
        '煤炭与蒸气机的发明',
        '矿井的大脑·调度中心',
        '5米长的硅化木',
        '砌起的模拟矿井·运输大巷',
        '为开采光明的人提供光明·矿灯充电处',
    )

    for theme, picture, description in zip(themes, pictures, descriptions):
        exhibition = exhibition75Item()
        exhibition["museumID"] = 22
        exhibition["exhibitionTheme"] = theme
        exhibition["exhibition_picture"] = picture
        exhibition["exhibitionIntroduction"] = description
        yield exhibition
def parse(self, response):
    """Request every link found under the fourth ``li`` of ``allcenter``.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 89 in ``meta["item"]``.
    """
    hrefs = response.xpath(
        "//div[@class='allcenter']/ul/li[4]//li//@href").getall()
    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 89
        yield scrapy.Request(
            'http://www.chinajiandu.cn' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Request the detail page linked from each ``<tr>`` row of the page.

    For every ``<tr>...</tr>`` match in the response text a new item is
    created with ``museumID`` 55 and ``exhibitionTime`` set to the joined
    contents of the row's styled ``<td>`` cells.  The item is passed to
    ``self.parse_detail`` through ``meta["item"]``.

    Rows without any ``href`` are skipped rather than producing a request for
    the bare site root, and when a row contains several links only the first
    is followed instead of concatenating them into one URL.
    """
    # NOTE(review): without re.DOTALL, "." does not match newlines, so only
    # rows written on a single line are found -- confirm this is intended.
    rows = re.findall(r'<tr>(.*?)</tr>', response.text)
    for row in rows:
        links = re.findall('href="(.*?)"', row)
        if not links:
            continue

        cells = re.findall(
            '<td height="45" align="left" style="color:#ffffff; '
            'padding-left:8px; font-size:13px;">(.*?)</td>',
            row,
        )

        item = exhibition75Item()
        item["museumID"] = 55
        item["exhibitionTime"] = ''.join(cells)

        yield scrapy.Request(
            'http://www.nbmuseum.cn' + links[0],
            callback=self.parse_detail,
            meta={"item": item},  # pass the item along to the callback
        )
def parse(self, response):
    """Request every link listed under the ``con_zzjs_1`` block.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 92 in ``meta["item"]``.
    """
    hrefs = response.xpath("//div[@id='con_zzjs_1']/ul/li//@href").getall()
    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 92
        yield scrapy.Request(
            'http://www.sunyat-sen.org' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Request every ``h3`` link inside the ``view-content`` block.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 86 in ``meta["item"]``.
    """
    hrefs = response.xpath(
        "//div[@class='view-content']//h3//@href").getall()
    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 86
        yield scrapy.Request(
            'http://www.hnmuseum.com/' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Request every link in the ``collectlist basicexhlist`` list.

    Each request is handled by ``self.Info`` and carries an
    ``exhibition75Item`` with ``museumID`` 84 in ``meta["item"]``.
    """
    hrefs = response.xpath(
        "//ul[@class='collectlist basicexhlist']/li//@href").getall()
    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 84
        yield scrapy.Request(
            'http://www.1911museum.com' + href,
            callback=self.Info,
            meta={"item": item},
        )
def parse(self, response):
    """Request every link found in the ``jyzn_l`` list.

    Each request is handled by ``self.Others`` and carries an
    ``exhibition75Item`` with ``museumID`` 80 in ``meta["item"]``.
    """
    hrefs = response.xpath("//div[@class='jyzn_l']/ul//@href").getall()
    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 80
        yield scrapy.Request(
            'http://www.jzmsm.org' + href,
            callback=self.Others,
            meta={"item": item},
        )
def parse(self, response):
    """Yield one exhibition item per ``li`` of the ``colInfoBox`` list.

    Every entry gets its own ``exhibition75Item`` (``museumID`` 75) so that
    yielded items are independent objects rather than one instance mutated
    on each iteration.  An entry without an image gets ``None`` as
    ``exhibition_picture`` instead of raising ``TypeError``.
    """
    for entry in response.xpath("//div[@class='colInfoBox']/ul/li"):
        item = exhibition75Item()
        item["museumID"] = 75
        item["exhibitionTheme"] = entry.xpath(
            "./h5/a/text()").extract_first()

        src = entry.xpath("./a/img/@src").extract_first()
        item["exhibition_picture"] = (
            ('http://www.chnmus.net' + src) if src is not None else None
        )
        yield item
def parse(self, response):
    """Request every ``area`` link of the element with id ``Map``.

    Each request is handled by ``self.Others`` and carries an
    ``exhibition75Item`` with ``museumID`` 79 in ``meta["item"]``.
    """
    hrefs = response.xpath('//*[@id="Map"]/area/@href').extract()
    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 79
        yield scrapy.Request(
            'http://www.kfsbwg.com' + href,
            callback=self.Others,
            meta={"item": item},
        )
def parse(self, response):
    """Yield one exhibition item per ``li`` of the ``ul_exh`` list.

    Theme, picture, introduction and time are read from each entry and
    stored on a new item with ``museumID`` 56.  An entry without an image
    gets ``None`` as ``exhibition_picture`` instead of raising ``TypeError``
    on the string concatenation.
    """
    for entry in response.xpath("//div[@class='ul_exh']/ul/li"):
        item = exhibition75Item()
        item["museumID"] = 56
        item["exhibitionTheme"] = entry.xpath(
            "./div[@class='div2 rg']/p/a/text()").extract_first()

        src = entry.xpath("./div[@class='div1 lf']/a/img/@src").extract_first()
        item["exhibition_picture"] = (
            ('http://www.hzmuseum.com/' + src) if src is not None else None
        )

        item["exhibitionIntroduction"] = entry.xpath(
            "./div[@class='div2 rg']/p[@class='p1']/text()").extract_first()
        item["exhibitionTime"] = entry.xpath(
            "./div[@class='div2 rg']/p[3]/text()").extract_first()
        yield item
def parse(self, response):
    """Yield a single exhibition item scraped from the page.

    The theme comes from the ``title`` meta tag, the picture from the first
    centred paragraph image and the introduction from the first ``p`` text
    node.  A missing image yields ``None`` for ``exhibition_picture``
    instead of raising ``TypeError`` on the string concatenation.
    """
    item = exhibition75Item()
    item["museumID"] = 13
    item["exhibitionTheme"] = response.xpath(
        "//meta[@name='title']/@content").extract_first()

    src = response.xpath(
        "//p[@style='text-align:center;']/img/@src").extract_first()
    # NOTE(review): the prefix carries no URL scheme, so the stored value is
    # not an absolute URL -- confirm whether consumers expect that.
    item["exhibition_picture"] = (
        ('www.bjp.org.cn' + src) if src is not None else None
    )

    item["exhibitionIntroduction"] = response.xpath(
        "//p/text()").extract_first()
    yield item
def parse(self, response):
    """Yield one exhibition item per ``li`` of the ``mainbar_pic_nr`` list.

    Theme and picture are taken from the entry's anchor; time and
    introduction are always stored as empty strings.
    """
    for entry in response.xpath("//div[@class='mainbar_pic_nr']/ul/li"):
        theme = entry.xpath("./a/@title").extract_first()
        picture = entry.xpath("./a/img/@src").extract_first()

        exhibition = exhibition75Item()
        exhibition["museumID"] = 66
        exhibition["exhibitionTheme"] = theme
        exhibition["exhibition_picture"] = picture
        # This page provides neither value, so both are left blank.
        exhibition["exhibitionTime"] = ''
        exhibition["exhibitionIntroduction"] = ''
        yield exhibition
def parse(self, response):
    """Request the ``menu-sidebar`` links, skipping the 2nd and 4th.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 104 in ``meta["item"]``.

    The position is taken from ``enumerate``.  The previous hand-written
    counter was only advanced for links that were not skipped, so it stayed
    at 2 once the second link was reached and every later link was dropped.
    """
    hrefs = response.xpath("//ul[@class='menu-sidebar']/li//@href").getall()
    for position, href in enumerate(hrefs, start=1):
        if position in (2, 4):
            continue
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 104
        yield scrapy.Request(href, callback=self.Intro, meta={"item": item})
def parse(self, response):
    """Request the first and second anchors of every ``column_list`` block.

    All first anchors are requested before the second anchors.  Each request
    is handled by ``self.Intro`` and carries an ``exhibition75Item`` with
    ``museumID`` 90 in ``meta["item"]``.
    """
    hrefs = response.xpath(
        "//div[@class='column_list']/a[1]/@href").getall()
    hrefs += response.xpath(
        "//div[@class='column_list']/a[2]/@href").getall()

    for href in hrefs:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 90
        yield scrapy.Request(
            'http://www.gdmuseum.com/' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Request the first three links of the ``d3`` list.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 96 in ``meta["item"]``.
    """
    hrefs = response.xpath("//ul[@class='d3']/li/a[1]/@href").getall()
    # Only the first three links are followed.
    for href in hrefs[:3]:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 96
        yield scrapy.Request(
            'http://www.gxmuseum.cn' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Yield one exhibition item per ``li.item`` of the ``clear`` list.

    Every entry gets its own ``exhibition75Item`` (``museumID`` 93) so that
    yielded items are independent objects rather than one instance mutated
    on each iteration.  An entry without heading text gets ``""`` as its
    theme instead of raising ``AttributeError``.  ``exhibitionTime`` is not
    set by this callback.
    """
    for entry in response.xpath("//ul[@class='clear']/li[@class='item']"):
        item = exhibition75Item()
        item["museumID"] = 93
        item['exhibitionTheme'] = entry.xpath(
            ".//h3//text()").get(default="").strip()
        item['exhibitionIntroduction'] = "".join(
            entry.xpath(".//p//text()").getall()).strip()
        item['exhibition_picture'] = entry.xpath(".//div/img/@src").get()
        yield item
def parse(self, response):
    """Yield one exhibition item per ``li`` of the ``exbul`` list.

    Picture, theme and introduction are read from each entry and stored on
    a new item with ``museumID`` 52.  Missing nodes do not abort the
    callback: an absent image gives ``None`` and an absent title gives
    ``""``.
    """
    for entry in response.xpath("//ul[@class='exbul']/li"):
        item = exhibition75Item()
        item["museumID"] = 52

        src = entry.xpath(".//img/@src").extract_first()
        item["exhibition_picture"] = (
            ('http://www.wzmuseum.cn' + src.strip())
            if src is not None else None
        )

        item["exhibitionTheme"] = entry.xpath(
            ".//div[@class='titlebox']/span/text()"
        ).extract_first(default="").strip()

        # Text may sit in <p> children or directly inside the container.
        fragments = entry.xpath(
            ".//div[@class='exbtxt']/p/text()|.//div[@class='exbtxt']/text()"
        ).extract()
        item["exhibitionIntroduction"] = "".join(fragments).strip()
        yield item
def parse(self, response):
    """Request the first five links of the ``zldtlist`` block.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 91 in ``meta["item"]``.
    """
    # The expression used to start with "///", which is not a well-formed
    # XPath abbreviation; "//" selects the same nodes.
    hrefs = response.xpath("//div[@id='zldtlist']/div//@href").getall()
    # Only the first five links are followed.
    for href in hrefs[:5]:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 91
        yield scrapy.Request(
            'https://www.gznywmuseum.org/' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Request the first six links of the ``new-body`` list.

    Each request is handled by ``self.Intro`` and carries an
    ``exhibition75Item`` with ``museumID`` 98 in ``meta["item"]``.
    """
    hrefs = response.xpath("//ul[@class='new-body']/li//@href").getall()
    # Only the first six links are followed.
    for href in hrefs[:6]:
        # One item per request: a single shared instance would let every
        # callback write into (and overwrite) the same object.
        item = exhibition75Item()
        item["museumID"] = 98
        yield scrapy.Request(
            'http://www.hainanmuseum.org/' + href,
            callback=self.Intro,
            meta={"item": item},
        )
def parse(self, response):
    """Request the detail page of every ``perexh_items`` block.

    Each request is handled by ``self.parse_detail`` and carries a new
    ``exhibition75Item`` with ``museumID`` 8 in ``meta["item"]``.  Blocks
    without a link are skipped instead of raising ``TypeError`` on the
    string concatenation.
    """
    # Named ``blocks`` rather than ``list`` to avoid shadowing the builtin.
    blocks = response.xpath("//div[@class='perexh_items']")
    for block in blocks:
        href = block.xpath("./a/@href").extract_first()
        if href is None:
            continue

        item = exhibition75Item()
        item["museumID"] = 8
        yield scrapy.Request(
            'http://www.bmnh.org.cn' + href,
            callback=self.parse_detail,
            meta={"item": item},  # pass the item along to the callback
        )
def parse(self, response):
    """Yield a single exhibition item scraped from the page.

    The theme comes from the ``k-d`` heading, the picture from the first
    ``li`` of the ``focus`` list and the introduction from all paragraph
    text nodes of the ``text`` block joined together.  A page without
    ``focus`` entries or without an image yields ``None`` for
    ``exhibition_picture`` instead of raising ``IndexError`` or
    ``TypeError``.
    """
    item = exhibition75Item()
    item["museumID"] = 18
    item["exhibitionTheme"] = response.xpath(
        "//div[@class='k-d']/h3/text()").extract_first()

    slides = response.xpath("//div[@id='focus']/ul/li")
    src = slides[0].xpath("./img/@src").extract_first() if slides else None
    # NOTE(review): the prefix carries no URL scheme, so the stored value is
    # not an absolute URL -- confirm whether consumers expect that.
    item["exhibition_picture"] = (
        ('www.hebeimuseum.org.cn' + src) if src is not None else None
    )

    paragraphs = response.xpath(
        "//div[@class='k-d']/div[@class='text']/p/text()").extract()
    item["exhibitionIntroduction"] = "".join(paragraphs)
    yield item
def parse(self, response):
    """Request every ``am-nav`` entry except the first one.

    Each request is handled by ``self.parse_detail`` and carries a deep copy
    of an ``exhibition75Item`` with ``museumID`` 40 in ``meta['item']``.
    Entries without a link are skipped instead of raising
    ``AttributeError`` on ``None.strip()``.
    """
    template = exhibition75Item()
    template["museumID"] = 40

    for entry in response.xpath("//ul[@class='am-nav']/li[position()>1]"):
        href = entry.xpath("./a/@href").extract_first()
        if href is None:
            continue

        # Process the detail page; each request gets its own copy of the
        # template so callbacks never share an item.
        yield scrapy.Request(
            'http://www.luxunmuseum.cn' + href.strip(),
            callback=self.parse_detail,
            meta={'item': copy.deepcopy(template)},
        )
def parse(self, response):
    """Request the detail page of every ``li`` in the ``win_a`` list.

    Each request is handled by ``self.parse_detail`` and carries a new
    ``exhibition75Item`` (``museumID`` 60, theme taken from the anchor's
    ``p`` text) in ``meta["item"]``.  Entries without a link are skipped
    instead of raising ``TypeError`` on the string concatenation.
    """
    for entry in response.xpath("//div[@class='win_a']/ul/li"):
        href = entry.xpath("./a/@href").extract_first()
        if href is None:
            continue

        item = exhibition75Item()
        item["museumID"] = 60
        item["exhibitionTheme"] = entry.xpath("./a/p/text()").extract_first()
        yield scrapy.Request(
            'http://www.gthyjng.com/gqjs/' + href,
            callback=self.parse_detail,
            meta={"item": item},
        )
def parse(self, response):
    """Request the detail page of every ``li`` in the ``divList`` list.

    Each request is handled by ``self.parse_detail`` and carries a new
    ``exhibition75Item`` (``museumID`` 65, picture taken from the anchor's
    image ``src`` as-is) in ``meta["item"]``.  Entries without a link are
    skipped instead of raising ``TypeError`` on the string concatenation.
    """
    for entry in response.xpath("//div[@id='divList']/ul/li"):
        href = entry.xpath("./a/@href").extract_first()
        if href is None:
            continue

        item = exhibition75Item()
        item["museumID"] = 65
        item["exhibition_picture"] = entry.xpath(
            "./a/img/@src").extract_first()
        yield scrapy.Request(
            'http://www.jxmuseum.cn' + href,
            callback=self.parse_detail,
            meta={"item": item},
        )
def parse(self, response):
    """Yield a single exhibition item scraped from the page.

    Theme and picture come from the ``D1pic1`` anchor, the introduction from
    the ``xlzl_intro fr`` block.  Missing nodes do not abort the callback:
    absent text becomes ``""`` and an absent image becomes ``None``.
    ``exhibitionTime`` is not set by this callback.
    """
    item = exhibition75Item()
    item["museumID"] = 80
    item['exhibitionTheme'] = response.xpath(
        "//div[@id='D1pic1']/div/a/@title").get(default="").strip()
    item['exhibitionIntroduction'] = response.xpath(
        "//div[@class='xlzl_intro fr']/text()").get(default="").strip()

    src = response.xpath("//div[@id='D1pic1']/div/a/img/@src").get()
    item['exhibition_picture'] = (
        ('http://www.eywsqsfbwg.com/' + src) if src is not None else None
    )
    yield item
def parse(self, response):
    """Request the detail page of every exhibition in the ``cj_zlv_mar`` list.

    Each request is handled by ``self.parse_detail`` and carries a new
    ``exhibition75Item`` (``museumID`` 11, theme from the image ``alt``,
    picture from the image ``src``) in ``meta["item"]``.  Entries without a
    link are skipped and a missing image gives ``None`` for
    ``exhibition_picture``; both cases used to raise ``TypeError`` when
    ``None`` was sliced.
    """
    entries = response.xpath(
        "//div[@class='cj_ercom_wai cj_huise cj_pb4577']"
        "/div[@class='cj_zlv_mar']/ul/li"
    )
    for entry in entries:
        href = entry.xpath("./a/@href").extract_first()
        if href is None:
            continue

        item = exhibition75Item()
        item["museumID"] = 11
        item["exhibitionTheme"] = entry.xpath("./a/img/@alt").extract_first()

        # NOTE(review): ``[2:]`` drops the first two characters of each
        # path -- presumably a leading "./" or ".." of a relative link;
        # confirm against the live markup.
        src = entry.xpath("./a/img/@src").extract_first()
        item["exhibition_picture"] = (
            ('http://www.chnmuseum.cn/zl' + src[2:])
            if src is not None else None
        )

        yield scrapy.Request(
            'http://www.chnmuseum.cn/zl' + href[2:],
            callback=self.parse_detail,
            meta={"item": item},  # pass the item along to the callback
        )