def parse(self, response):
    """Build the item for 西安博物院 (ID 120) and request its detail page.

    Location and link are fixed values; opening hours and telephone are
    read from ``response`` with ``normalize-space`` XPath expressions and
    have non-breaking spaces turned into ordinary spaces. The finished
    item is handed to ``self.parse_detail`` through ``meta["item"]``.
    """

    def _spaces_for_nbsp(raw):
        # Swap the escaped "\xa0" text as well as real NBSP characters
        # for a plain space.
        return str(raw).replace(u'\\xa0', u' ').replace(u'\xa0', u' ')

    hours = response.xpath(
        'normalize-space(/html//div[4]/div[2]/div[2]/text()[1])'
    ).extract_first()
    phone = response.xpath(
        'normalize-space(/html//div[4]/div[2]/div[2]/text()[2])'
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 120
    museum["museumName"] = '西安博物院'
    museum["Location"] = '陕西省西安市南门外友谊西路'
    museum["Link"] = 'http://www.xabwy.com'
    museum["opentime"] = _spaces_for_nbsp(hours)
    museum["telephone"] = _spaces_for_nbsp(phone)

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.xabwy.com/Statics/2020.01/66.html',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 遵义会议纪念馆 (ID 107) and request its detail page.

    Location, opening hours and telephone are scraped from ``response``
    and have non-breaking spaces removed. When a node is missing the
    field is stored as an empty string rather than the text "None".
    The item is handed to ``self.parse_detail`` through ``meta["item"]``.
    """

    def _strip_nbsp(raw):
        # Drop the escaped "\xa0" text as well as real NBSP characters.
        return str(raw).replace('\\xa0', '').replace('\xa0', '')

    item = MuseumsItem()
    item["museumID"] = 107
    item["museumName"] = '遵义会议纪念馆'
    # default='' matters for the plain text() queries: without it a missing
    # node yields None, which str() would turn into the string "None".
    item["Location"] = _strip_nbsp(
        response.xpath(
            '/html//div[4]/div[2]/div/div[1]/div/div[2]/div[3]/div[1]/text()'
        ).extract_first(default='')
    )
    item["Link"] = 'http://www.zunyihy.cn/'
    item["opentime"] = _strip_nbsp(
        response.xpath(
            'normalize-space(/html//div[4]/div[2]/div/div[1]/div/div[2]/div[2])'
        ).extract_first(default='')
    )
    item["telephone"] = _strip_nbsp(
        response.xpath(
            '/html/body/div[4]/div[2]/div/div[1]/div/div[2]/div[3]/div[3]/text()'
        ).extract_first(default='')
    )

    url = 'http://www.zunyihy.cn/about.html#about2'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 大唐西市博物馆 (ID 122) and request its detail page.

    Location and telephone are read from ``response`` with
    ``normalize-space`` XPath expressions and have non-breaking spaces
    turned into ordinary spaces; link and opening hours are fixed values.
    The item is handed to ``self.parse_detail`` through ``meta["item"]``.
    """

    def _spaces_for_nbsp(raw):
        # Swap the escaped "\xa0" text as well as real NBSP characters
        # for a plain space.
        return str(raw).replace('\\xa0', ' ').replace('\xa0', ' ')

    item = MuseumsItem()
    item["museumID"] = 122
    item["museumName"] = '大唐西市博物馆'
    item["Location"] = _spaces_for_nbsp(
        response.xpath(
            'normalize-space(/html/body/div/form/div[4]/div[2]/div[2]/p[5]/span)'
        ).extract_first()
    )
    item["Link"] = 'http://www.dtxsmuseum.com/'
    # The winter ticket cut-off previously read "16::0", a malformed time;
    # it is written as 16:00 here.
    item["opentime"] = (
        '全年开放(每周一及除夕闭馆,法定节假日正常开放),'
        '夏季:9:00-17:30(16:30停止票务办理),'
        '冬季:9:00-17:00(16:00停止票务办理)'
    )
    item["telephone"] = _spaces_for_nbsp(
        response.xpath(
            'normalize-space(/html/body/div/form/div[4]/div[2]/div[2]/p[9]/span)'
        ).extract_first()
    )

    url = 'http://www.dtxsmuseum.com/news_show.aspx?id=1'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 重庆红岩历史博物馆 (ID 111) and request its detail page.

    Every field is a fixed value; ``response`` is not read. The item is
    handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 111),
        ("museumName", '重庆红岩历史博物馆'),
        ("Location", '重庆市渝中区红岩村52号'),
        ("Link", 'http://www.hongyan.info/'),
        ("opentime", '1月1日-12月31日 09:00-17:00'),
        ("telephone", '023-63300192 63303065'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.hongyan.info/',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 西安半坡博物馆 (ID 119) and request its detail page.

    Every field is a fixed value; ``response`` is not read. The item is
    handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    homepage = 'http://www.banpomuseum.com.cn/'
    fixed_fields = (
        ("museumID", 119),
        ("museumName", '西安半坡博物馆'),
        ("Location", '西安市半坡路155号'),
        ("Link", 'http://www.banpomuseum.com.cn/'),
        ("opentime", '旺季:(3月1日-11月30日)8:00----17:30 淡季:(12月1日-2月底)8:00----17:00'),
        ("telephone", '联系电话:029-62815385 投诉电话:18729251954'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # The detail callback is pointed at the homepage; the item rides in meta.
    detail_request = scrapy.Request(
        homepage,
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 西藏博物馆 (ID 113) and request its detail page.

    Every field is a fixed value; ``response`` is not read. The item is
    handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 113),
        ("museumName", '西藏博物馆'),
        ("Location", '西藏自治区拉萨市城关区民族南路2号'),
        ("Link", 'http://www.tibetmuseum.com.cn'),
        (
            "opentime",
            '夏秋季(5月1日至10月31日):09:30-17:30 (17:00游客停止入场)冬春季(11月1日至次年4月30日):10:30-17:00(16:30游客停止入场)',
        ),
        ("telephone", '0891-6835244 0891-6812210'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.tibetmuseum.com.cn/zh-CN/brief/historyRecord?isNav=yes&navIndex=1',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 成都金沙遗址博物馆 (ID 105) and request its detail page.

    Only the telephone is scraped from ``response``; the remaining fields
    are fixed values. The item is handed to ``self.parse_detail`` through
    ``meta["item"]``.
    """
    phone = response.xpath(
        "/html/body/div[4]/div[1]/div[1]/div/a[5]/text()"
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 105
    museum["museumName"] = '成都金沙遗址博物馆'
    museum["Location"] = '四川省成都市青羊区金沙遗址路2号'
    museum["Link"] = 'http://www.jinshasitemuseum.com/'
    museum["opentime"] = '夏令时8:30-20:00;冬令时8:30-18:30;周一闭馆'
    museum["telephone"] = phone

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.jinshasitemuseum.com/About/Introduction',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 汉阳陵博物馆 (ID 117) and request its detail page.

    Only the opening hours are scraped from ``response`` (via a
    ``normalize-space`` XPath); the remaining fields are fixed values.
    The item is handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    hours = response.xpath(
        "normalize-space(/html/body/div[3]/div[2]/div[2]/div/div[1]/div[2]/text()[1])"
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 117
    museum["museumName"] = '汉阳陵博物馆'
    museum["Location"] = '地址:西安咸阳国际机场专线公路东段'
    museum["Link"] = 'http://www.hylae.com/'
    museum["opentime"] = hours
    museum["telephone"] = '029-62657569'

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.hylae.com/index.php?ac=article&at=list&tid=10',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 陕西历史博物馆 (ID 114) and request its detail page.

    Location and telephone are scraped from ``response``; link and
    opening hours are fixed values. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    address = response.xpath(
        "/html/body/div[3]/div/div[2]/p[3]/text()"
    ).extract_first()
    phone = response.xpath(
        "/html/body/div[3]/div/div[2]/p[1]/text()"
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 114
    museum["museumName"] = '陕西历史博物馆'
    museum["Location"] = address
    museum["Link"] = 'http://www.sxhm.com/'
    museum["opentime"] = '周二至周日 09:00-17:30;遇法定节假日周一除外'
    museum["telephone"] = phone

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.sxhm.com/index.php?ac=article&at=list&tid=230',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 秦始皇帝陵兵马俑博物馆 (ID 115) and request its detail page.

    Opening hours and telephone are scraped from ``response``; location
    and link are fixed values. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    item = MuseumsItem()
    item["museumID"] = 115
    item["museumName"] = '秦始皇帝陵兵马俑博物馆'
    # The previous value was a Zhengzhou (Henan) street address, which does
    # not belong to this museum; it is located in Lintong District, Xi'an.
    item["Location"] = '陕西省西安市临潼区秦陵北路'
    item["Link"] = 'http://bmy.com.cn/'
    item["opentime"] = response.xpath(
        "/html/body/div[4]/div[1]/div[3]/div/div[2]/div[2]/text()"
    ).extract_first()
    item["telephone"] = response.xpath(
        "/html/body/div[3]/div/div/div[2]/p/span/text()"
    ).extract_first()

    url = 'http://www.bmy.com.cn/html/gov/jggk/8eaf8a3015b643b7adcb9d6815e0f845.html'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 自贡市盐业历史博物馆 (ID 106) and request its detail page.

    Location and telephone are read from ``response`` with
    ``normalize-space`` XPath expressions. Ideographic spaces (U+3000)
    are removed from the location and replaced by an ordinary space in
    the telephone. The item is handed to ``self.parse_detail`` through
    ``meta["item"]``.
    """
    wide_space = u'\u3000'

    address = response.xpath(
        "normalize-space(/html/body/div[6]/div/div[2]/p[1])"
    ).extract_first()
    phone = response.xpath(
        "normalize-space(/html/body/div[6]/div/div[2]/p[2]/text())"
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 106
    museum["museumName"] = '自贡市盐业历史博物馆'
    museum["Location"] = str(address).replace(wide_space, u'')
    museum["Link"] = 'http://www.zgshm.cn/index.html'
    museum["opentime"] = '1月1日-12月31日 08:30-16:30'
    museum["telephone"] = str(phone).replace(wide_space, u' ')

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.zgshm.cn/content.jsp?id=297e0fc26362ffbb016380a82d360199',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 重庆中国三峡博物馆 (ID 110) and request its detail page.

    Location and telephone are scraped from two ``li`` entries of the
    same list in ``response``; link and opening hours are fixed values.
    The item is handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    address = response.xpath(
        "/html//div/div/div[5]/div[2]/div[2]/div/div[2]/ul/li[5]/p/text()"
    ).extract_first()
    phone = response.xpath(
        "/html//div/div/div[5]/div[2]/div[2]/div/div[2]/ul/li[1]/p/text()"
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 110
    museum["museumName"] = '重庆中国三峡博物馆'
    museum["Location"] = address
    museum["Link"] = 'http://www.3gmuseum.cn/'
    museum["opentime"] = '每日9:00-17:00(16:00禁止入馆) 周一闭馆(法定节假日除外)'
    museum["telephone"] = phone

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.3gmuseum.cn/web/article/toArticleNo.do?articleno=1&base=&fullPath=http%3A%2F%2Fwww.3gmuseum.cn&type=&itemsonno=12121212&topitemno=402880b25a3bb962015a3bc512212223&itemno=402880b25a3bb962015a3bc512212223',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 延安革命纪念馆 (ID 116) and request its detail page.

    Location and telephone are scraped as text from ``response``; link
    and opening hours are fixed values. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    item = MuseumsItem()
    item["museumID"] = 116
    item["museumName"] = '延安革命纪念馆'
    # Select the text node, as the telephone query below does; selecting the
    # <strong> element itself would store its serialized HTML markup.
    item["Location"] = response.xpath(
        "/html/body/div[2]/div[2]/div[2]/div/div[2]/p[2]/span/strong/text()"
    ).extract_first()
    item["Link"] = 'http://www.yagmjng.com/'
    item["opentime"] = '每日09:00至17:00(16:00停止入馆)'
    item["telephone"] = response.xpath(
        "/html/body/div[2]/div[2]/div[2]/div/div[2]/p[4]/span/strong/text()"
    ).extract_first()

    url = 'http://www.yagmjng.com/rsf/site/jinianguan/zhanguanjianjie/info/2020/81013.html'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 重庆自然博物馆 (ID 112) and request its detail page.

    Location, opening hours and telephone are all scraped from
    ``response``; only the link is a fixed value. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    address = response.xpath(
        'normalize-space(//div[3]/div[5]//div[3]/div[1]/text())'
    ).extract_first()
    hours = response.xpath(
        '//div[3]/div[1]//div[3]//div[1]/p[1]/text()[3]'
    ).extract_first()
    phone = response.xpath(
        'normalize-space(//div/div[3]/div[5]//div[3]/h3)'
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 112
    museum["museumName"] = '重庆自然博物馆'
    museum["Location"] = address
    museum["Link"] = 'https://www.cmnh.org.cn/'
    museum["opentime"] = hours
    museum["telephone"] = phone

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'https://www.cmnh.org.cn/about/?4.html',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 云南省博物馆 (ID 108) and request its detail page.

    Opening hours and telephone are scraped from ``response``; location
    and link are fixed values. The detail URL is the site root plus the
    first matching ``href`` found in ``response``. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    item = MuseumsItem()
    item["museumID"] = 108
    item["museumName"] = '云南省博物馆'
    item["Location"] = '云南省昆明市广福路6393号'
    # The stored link previously contained a stray space inside the host
    # name; it now matches the host used for the detail URL below.
    item["Link"] = 'http://www.ynmuseum.org'
    item["opentime"] = response.xpath(
        "/html/body/div/div[3]/div[2]/div/div[1]/div/div[3]/div/text()"
    ).extract_first()
    item["telephone"] = response.xpath(
        "/html//div/div[3]/div[2]/div/div[1]/div/div[3]/div/text()"
    ).extract_first()

    # NOTE(review): extract_first() gives None when no link matches, which
    # makes this concatenation raise TypeError - confirm the link is always
    # present on the page.
    url = 'http://www.ynmuseum.org' + response.xpath(
        "//div/div[1]/div/ul/li/div/div/div/a/@href"
    ).extract_first()
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 青海省博物馆 (ID 128) and request its detail page.

    Every field is a fixed value; ``response`` is not read. The item is
    handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 128),
        ("museumName", '青海省博物馆'),
        ("Location", '青海省西宁市西关大街58号'),
        ("Link", 'http://www.qhmuseum.cn/'),
        ("opentime", '夏季:9:00—16:30;冬季:9:30—16:00,每周一闭馆休整。'),
        ("telephone", '0971--6118691'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'https://baike.baidu.com/item/%E9%9D%92%E6%B5%B7%E7%9C%81%E5%8D%9A%E7%89%A9%E9%A6%86/1627225?fr=aladdin',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 宁夏固原博物馆 (ID 126) and request its detail page.

    Location, opening hours and telephone are read from ``response`` with
    ``normalize-space`` XPath expressions and have non-breaking spaces
    removed; only the link is a fixed value. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """

    def _without_nbsp(raw):
        # Drop the escaped "\xa0" text as well as real NBSP characters.
        return str(raw).replace(u'\\xa0', u'').replace(u'\xa0', u'')

    address = response.xpath(
        'normalize-space(/html//div[3]/div/div[2]/p[2]/text())'
    ).extract_first()
    hours = response.xpath(
        'normalize-space(/html//div[2]/div/div[7]/div/div/div[1]/div/div[1]/p/text()[2])'
    ).extract_first()
    phone = response.xpath(
        'normalize-space(/html//div[3]/div/div[2]/p[3]/text())'
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 126
    museum["museumName"] = '宁夏固原博物馆'
    museum["Location"] = _without_nbsp(address)
    museum["Link"] = 'http://www.nxgybwg.com/'
    museum["opentime"] = _without_nbsp(hours)
    museum["telephone"] = _without_nbsp(phone)

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.nxgybwg.com/e/action/ShowInfo.php?classid=1&id=307',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 甘肃省博物馆 (ID 123) and request its detail page.

    Only the opening hours are scraped from ``response``; non-breaking
    spaces in them become ordinary spaces, and a missing node is stored
    as an empty string rather than the text "None". The remaining fields
    are fixed values. The item is handed to ``self.parse_detail`` through
    ``meta["item"]``.
    """
    item = MuseumsItem()
    item["museumID"] = 123
    item["museumName"] = '甘肃省博物馆'
    # The street address previously stopped at the house number without
    # its "号" suffix.
    item["Location"] = '甘肃省兰州市七里河区西津西路3号'
    item["Link"] = 'http://www.gansumuseum.com/'
    # default='' keeps a missing node from becoming the string "None"
    # once it is passed through str().
    hours = response.xpath(
        '/html/body/div[1]/div[1]/div[2]/div/ul/li[3]/text()'
    ).extract_first(default='')
    # Swap the escaped "\xa0" text as well as real NBSP characters for a
    # plain space.
    item["opentime"] = str(hours).replace('\\xa0', ' ').replace('\xa0', ' ')
    item["telephone"] = '0931-2346308'

    url = 'http://www.gansumuseum.com/about/show-1.html'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 宝鸡青铜器博物院 (ID 121) and request its detail page.

    Location, opening hours and telephone are scraped from ``response``.
    Location and telephone have non-breaking spaces turned into ordinary
    spaces; the opening hours are stored exactly as extracted. The item
    is handed to ``self.parse_detail`` through ``meta["item"]``.
    """

    def _spaces_for_nbsp(raw):
        # Swap the escaped "\xa0" text as well as real NBSP characters
        # for a plain space.
        return str(raw).replace(u'\\xa0', u' ').replace(u'\xa0', u' ')

    address = response.xpath(
        'normalize-space(/html/body/div[3]/div/div[2]/text()[4])'
    ).extract_first()
    hours = response.xpath(
        '/html/body/div[1]/div[2]/div[1]/p[1]/text()[1]'
    ).extract_first()
    phone = response.xpath(
        'normalize-space(/html/body/div[1]/div[2]/div[1]/p[2])'
    ).extract_first()

    museum = MuseumsItem()
    museum["museumID"] = 121
    museum["museumName"] = '宝鸡青铜器博物院'
    museum["Location"] = _spaces_for_nbsp(address)
    museum["Link"] = 'http://www.bjqtm.com/'
    museum["opentime"] = hours
    museum["telephone"] = _spaces_for_nbsp(phone)

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.bjqtm.com/index.php?ac=article&at=list&tid=44',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 吐鲁番博物馆 (ID 130) and request its detail page.

    Every field is a fixed value (the link is left empty); ``response``
    is not read. The item is handed to ``self.parse_detail`` through
    ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 130),
        ("museumName", '吐鲁番博物馆'),
        ("Location", '吐鲁番市高昌区木纳尔路1268号'),
        ("Link", ''),
        ("opentime", '周二至周日10:00-18:00'),
        ("telephone", '0995-7619644;0995-7619645;0995-7619650'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # NOTE(review): this detail URL is on the www.xabwy.com host, which does
    # not obviously belong to this museum - confirm it is the intended page.
    detail_request = scrapy.Request(
        'http://www.xabwy.com/Statics/2020.01/66.html',
        callback=self.parse_detail,
        meta={"item": museum},  # the item rides along to the callback
    )
    yield detail_request
def parse(self, response):
    """Build the item for 天水市博物馆 (ID 124) and request its detail page.

    Every field is a fixed value; ``response`` is not read. The item is
    handed to ``self.parse_detail`` through ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 124),
        ("museumName", '天水市博物馆'),
        ("Location", '甘肃省天水市秦州区伏羲路110号'),
        ("Link", 'http://www.tssbwg.com.cn/'),
        ("opentime", '每天上午8:00 - 12:00;下午 14:00 - 18:00 开放'),
        ("telephone", '0938-8291377'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'http://www.tssbwg.com.cn/html/2013/zzjg_1127/218.html',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 敦煌研究院 (ID 125) and request its detail page.

    Every field is a fixed value (the opening hours are left empty);
    ``response`` is not read. The item is handed to ``self.parse_detail``
    through ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 125),
        ("museumName", '敦煌研究院'),
        ("Location", '甘肃省酒泉市敦煌市'),
        ("Link", 'https://www.dha.ac.cn/'),
        ("opentime", ''),
        ("telephone", '敦煌研究院网络中心 : 0937-8869123'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # The detail callback is pointed at the homepage; the item rides in meta.
    detail_request = scrapy.Request(
        'https://www.dha.ac.cn/',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request
def parse(self, response):
    """Build the item for 云南民族博物馆 (ID 109) and request its detail page.

    Location and telephone are read from ``response`` with
    ``normalize-space`` XPath expressions and have non-breaking spaces
    turned into ordinary spaces; link and opening hours are fixed values.
    The item is handed to ``self.parse_detail`` through ``meta["item"]``.
    """

    def _spaces_for_nbsp(raw):
        # Swap the escaped "\xa0" text as well as real NBSP characters
        # for a plain space.
        return str(raw).replace('\\xa0', ' ').replace('\xa0', ' ')

    item = MuseumsItem()
    item["museumID"] = 109
    # The name was previously misspelled with "名族" instead of "民族".
    item["museumName"] = '云南民族博物馆'
    item["Location"] = _spaces_for_nbsp(
        response.xpath(
            'normalize-space(/html//div/div/div/div/div[2]/div/div[1]/div[2]/div/div/div[2]/div/div[1]/div[2]/div/div/p[1])'
        ).extract_first()
    )
    item["Link"] = 'http://www.ynnmuseum.com/main.html'
    item["opentime"] = '开放时间:周二至周日上午9:00——下午4:30(周一闭馆)'
    item["telephone"] = _spaces_for_nbsp(
        response.xpath(
            'normalize-space(/html//div/div/div/div/div[2]/div/div[1]/div[2]/div/div/div[2]/div/div[1]/div[2]/div/div/p[3])'
        ).extract_first()
    )

    url = 'http://www.ynnmuseum.com/abouts.html'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 新疆维吾尔自治区博物馆 (ID 129) and request its detail page.

    Every field is a fixed value (the link is left empty); ``response``
    is not read. The item is handed to ``self.parse_detail`` through
    ``meta["item"]``.
    """
    fixed_fields = (
        ("museumID", 129),
        ("museumName", '新疆维吾尔自治区博物馆'),
        ("Location", '乌鲁木齐市沙依巴克区西北路581号'),
        ("Link", ''),
        ("opentime", '每周二至周日的10:30—18:00'),
        ("telephone", '0991-4536436'),
    )

    museum = MuseumsItem()
    for field, value in fixed_fields:
        museum[field] = value

    # NOTE(review): this detail URL is on the www.xabwy.com host, which does
    # not obviously belong to this museum - confirm it is the intended page.
    detail_request = scrapy.Request(
        'http://www.xabwy.com/Statics/2020.01/66.html',
        callback=self.parse_detail,
        meta={"item": museum},  # the item rides along to the callback
    )
    yield detail_request
def parse(self, response):
    """Build the item for 西安碑林博物馆 (ID 118) and request its detail page.

    The opening hours are assembled from the first three text nodes of
    one paragraph in ``response``, with non-breaking spaces removed; a
    missing text node contributes nothing instead of raising. The
    remaining fields are fixed values. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    item = MuseumsItem()
    item["museumID"] = 118
    item["museumName"] = '西安碑林博物馆'
    item["Location"] = '西安三学街15号'
    item["Link"] = 'http://www.beilin-museum.com/'

    hours_paragraph = "//table[4]//table[2]//td[3]/table[3]//div[3]/p[2]"
    # default="" guards the concatenation: a text node that is absent would
    # otherwise come back as None and break the string join.
    pieces = [
        response.xpath(
            "%s/text()[%d]" % (hours_paragraph, position)
        ).extract_first(default="")
        for position in (1, 2, 3)
    ]
    item["opentime"] = "".join(pieces).replace('\xa0', '')
    item["telephone"] = '87210764'

    url = 'http://www.beilin-museum.com/contents/45/976.html'
    # Continue on the detail page, passing the item along in meta.
    yield scrapy.Request(
        url,
        callback=self.parse_detail,
        meta={"item": item},
    )
def parse(self, response):
    """Build the item for 宁夏回族自治区博物馆 (ID 127) and request its detail page.

    Only the location is scraped from ``response`` (a ``normalize-space``
    XPath, with non-breaking spaces turned into ordinary spaces); the
    remaining fields are fixed values. The item is handed to
    ``self.parse_detail`` through ``meta["item"]``.
    """
    address = response.xpath(
        'normalize-space(/html//footer/div/div[2]/div/div[1]/span[3])'
    ).extract_first()
    # Swap the escaped "\xa0" text as well as real NBSP characters for a
    # plain space.
    address = str(address).replace(u'\\xa0', u' ').replace(u'\xa0', u' ')

    museum = MuseumsItem()
    museum["museumID"] = 127
    museum["museumName"] = '宁夏回族自治区博物馆'
    museum["Location"] = address
    museum["Link"] = 'https://www.nxbwg.com/'
    museum["opentime"] = '周一闭馆,周二-周日:9:00-16:50'
    museum["telephone"] = '电话:(0951)5085093'

    # Follow up on the detail page, carrying the item along in meta.
    detail_request = scrapy.Request(
        'https://www.nxbwg.com/a/30.html',
        callback=self.parse_detail,
        meta={"item": museum},
    )
    yield detail_request