def parse(self, response):
     """Request the detail page behind every link in the listing box.

     Collects each ``href`` found under a ``div`` whose class attribute is
     exactly ``list_box1 wf100``, resolves it against the response URL and
     yields a ``scrapy.Request`` handled by ``self.parse_content``.
     """
     item = exhibitionItem()  # instantiated here but not used by this method
     hrefs = response.xpath(
         "//div[@class='list_box1 wf100']//a/@href"
     ).getall()
     for href in hrefs:
         detail_url = response.urljoin(href)
         yield scrapy.Request(detail_url, callback=self.parse_content)
 def parse(self, response):
     """Print the title and image ``src`` of two exhibition cells.

     Only the first and third ``td`` of the first row of the first table
     under ``/html/body/div[3]`` are read. Nothing is yielded.
     """
     item = exhibitionItem()  # not populated yet
     # Original author's note (translated): the listing is split into
     # separate sections (table[1..3], td[1] / td[3]), so it could not be
     # walked with one relative loop; the absolute paths are spelled out.
     # Detail pages are not followed yet; the disabled code built the link as
     # 'http://jzmsm.org' + the td[1]/a/@href of the same cell.
     cells = (
         # 1
         (
             '/html/body/div[3]/table[1]/tbody/tr[1]/td[1]/ul/table/tbody/tr/td[3]/table/tbody/tr[1]/td/a/text()',
             '/html/body/div[3]/table[1]/tbody/tr[1]/td[1]/ul/table/tbody/tr/td[1]/a/img/@src',
         ),
         # 2
         (
             '/html/body/div[3]/table[1]/tbody/tr[1]/td[3]/ul/table/tbody/tr/td[3]/table/tbody/tr[1]/td/a/text()',
             '/html/body/div[3]/table[1]/tbody/tr[1]/td[3]/ul/table/tbody/tr/td[1]/a/img/@src',
         ),
     )
     for title_xpath, image_xpath in cells:
         exhib_name = response.xpath(title_xpath).extract_first()
         print(exhib_name)
         img = response.xpath(image_xpath).extract_first()
         print(img)
Ejemplo n.º 3
0
 def parse(self, response):
     """Create an item per exhibition card and request its detail page.

     For every card under ``#exhibitionIndex`` the title and the image URL
     (taken from ``@data-src``) are stored on a fresh ``exhibitionItem``
     together with ``museumID`` 6. The detail URL is appended to
     ``self.deep_urls`` and a request for it is yielded with the item in
     ``meta['item']`` for ``self.parse_detail``.

     Cards without a link are skipped instead of aborting the whole page.
     """
     base_url = 'http://www.njmuseum.com'
     div_list = response.xpath(
         '//*[@id="exhibitionIndex"]/div[2]/div[4]/div')
     for div in div_list:
         name = ''.join(div.xpath('./a/div[2]/h5/text()').extract())
         print(name)

         img = ''.join(div.xpath('./a/div[1]/img/@data-src').extract())
         # startswith() is safe on an empty string (card without data-src),
         # whereas the former img[0] raised IndexError.
         if img.startswith('/'):
             img = base_url + img
         print(img)

         href = div.xpath('./a/@href').extract_first()
         if href is None:
             # No link, so there is no detail page to request. Concatenating
             # None used to raise TypeError and stop the generator.
             continue
         detail_url = base_url + href
         print(detail_url)
         self.deep_urls.append(detail_url)

         item = exhibitionItem()
         item['exhibName'] = name
         item['exhibImg'] = img
         item['museumID'] = 6
         yield scrapy.Request(url=detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
Ejemplo n.º 4
0
 def parse(self, response):
     """Yield one exhibition item per ``<li>`` of the listing.

     Each item receives:

     * ``exhibName``  - text of the ``div.title`` element,
     * ``exhibIntro`` - text of ``div.des`` followed by a line with the
       exhibition period (label "展览时间:" plus the ``div.d2`` text),
     * ``exhibImg``   - the site host plus whatever stands between the
       parentheses of the inline ``style`` attribute, or ``''`` when the
       attribute is missing or contains no parenthesised value,
     * ``museumID``   - 13.

     ``self.num`` is incremented after each yielded item has been consumed.
     """
     div_list = response.xpath('/html/body/section[2]/div[2]/div[3]/ul//li')
     for div in div_list:
         name = ''.join(
             div.xpath('./div[2]/div[@class="title"]/text()').extract())
         print(name)

         cont = ''.join(
             div.xpath('./div[2]/div[@class="des"]//text()').extract())
         # Named "period" rather than "time" so the stdlib module name is
         # not shadowed inside this method.
         period = ''.join(div.xpath(
             './div[2]/div[@class="date a"]/div[@class="d2"]//text()'
         ).extract())
         cont = cont + "\n展览时间:" + period
         print(cont)

         style = div.xpath('./div[1]/div[1]/@style').extract_first()
         # Raw string: "\(" is an invalid escape in a normal literal.
         # "style or ''" keeps re.search from receiving None.
         match = re.search(r'(?<=\()\S+(?=\))', style or '')
         if match:
             img = "http://www.wuhouci.net.cn" + match.group()
         else:
             # Previously an AttributeError/TypeError aborted the page here.
             img = ''
         print(img)

         item = exhibitionItem()
         item['exhibImg'] = img
         item['exhibName'] = name
         item['exhibIntro'] = cont
         item['museumID'] = 13
         yield item
         self.num += 1
 def parse(self, response):
     """Print the ``span/strong`` text of every paragraph in the content box.

     Nothing is yielded; the item is created but not filled in.
     """
     item = exhibitionItem()
     paragraphs = response.xpath('/html/body/div[6]/div/div[1]/div[2]/div/p')
     for paragraph in paragraphs:
         text_nodes = paragraph.xpath('./span/strong//text()').extract()
         print(''.join(text_nodes))
 def parse(self, response):
     """Create an item per exhibition card and request its detail page.

     Cards are the ``div`` children of elements whose class attribute is
     exactly ``list clearfix``. Each item gets ``museumID`` 1, the card
     title as ``exhibName`` and the host-prefixed image ``src`` as
     ``exhibImg`` (``''`` when the card has no image). The item travels to
     ``self.parse_detail`` in ``meta['item']``.

     Cards without a detail link are skipped instead of aborting the page.
     """
     base_url = 'https://www.dpm.org.cn'
     for div in response.xpath('//*[@class="list clearfix"]/div'):
         detail_url = div.xpath('./div[1]/a/@href').extract_first()
         if not detail_url:
             # None or '' used to raise TypeError/IndexError on
             # detail_url[0] and stop the whole listing.
             continue
         # Only site-relative links need the host prepended.
         if detail_url.startswith('/'):
             detail_url = base_url + detail_url

         item = exhibitionItem()
         item["museumID"] = 1
         item["exhibName"] = div.xpath(
             './div[2]/div/div[1]/a[1]/text()').extract_first()

         img_src = div.xpath('./div[1]/a/img/@src').extract_first()
         if img_src is None:
             # Concatenating None used to raise TypeError.
             item['exhibImg'] = ''
         else:
             # As before, the host is always prefixed to the image path.
             item['exhibImg'] = base_url + img_src

         yield scrapy.Request(detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
 def parse(self, response):
     """Create an item per ``#list1`` entry and request its detail page.

     The entry's ``./div[2]`` text becomes ``exhibName``. The detail URL is
     the ``./div[1]/a/@href`` value appended to the site's ``/mu/`` prefix,
     recorded in ``self.deep_urls`` and requested with the item in
     ``meta['item']`` for ``self.parse_detail``.

     Image extraction is not implemented here (it was disabled in the
     original code). Entries without a link are skipped.
     """
     url_prefix = 'https://www.shanghaimuseum.net/mu/'
     for entry in response.xpath('//*[@id="list1"]/li'):
         item = exhibitionItem()
         name = entry.xpath('./div[2]/text()').extract_first()
         print(name)
         item['exhibName'] = name

         href = entry.xpath('./div[1]/a/@href').extract_first()
         if href is None:
             # Prefix + None used to raise TypeError and abort the page.
             continue
         detail_url = url_prefix + href
         self.deep_urls.append(detail_url)
         yield scrapy.Request(detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
 def parse_content(self, response):
     """Print name, image URL and description of one exhibition page.

     The image path is read from ``response.meta["img"]`` and prefixed with
     the site host. The description is the text of all ``<p>`` elements
     inside ``div.info_txt`` with every whitespace character removed.
     """
     item = exhibitionItem()  # not populated yet
     image_url = "https://www.gzam.com.cn" + response.meta["img"]
     text_nodes = response.xpath(
         "//div[@class='info_txt']//p//text()").getall()
     # join -> split -> join drops all whitespace from the text.
     description = "".join("".join(text_nodes).split())
     name = response.xpath("//h3/text()").get()
     print((name, image_url, description))
 def parse(self, response):
     """Request the ``qy.html`` variant of every link in ``ul.basiclist``.

     The n-th link is paired with the n-th ``h3.ellipsis`` text node, which
     is passed to ``self.parse_content`` as ``meta["title"]``.
     """
     item = exhibitionItem()  # instantiated here but not used by this method
     hrefs = response.xpath("//ul[@class='basiclist']//a/@href").getall()
     titles = response.xpath("//h3[@class='ellipsis']//text()").getall()
     for position, href in enumerate(hrefs):
         target = href.replace("Index.html", "qy.html")
         request_meta = {"title": titles[position]}
         yield scrapy.Request(target,
                              callback=self.parse_content,
                              meta=request_meta)
Ejemplo n.º 10
0
 def parse(self, response):
     """Print title, content and picture URL of every JSON record.

     The response body is decoded as JSON and the records are read from
     ``data -> recordsList``. Nothing is yielded.
     """
     item = exhibitionItem()  # not populated yet
     payload = json.loads(response.text)
     for record in payload["data"]["recordsList"]:
         title = record["title"]
         content = record["content"]
         picture_url = record["mainPicUrl"]
         print((title, content, picture_url))
Ejemplo n.º 11
0
 def parse(self, response):
     """Print name, image URL and theme of every entry in the JSON feed.

     The response body is decoded as JSON and the entries are read from
     ``data -> zhuanti``. The image path is prefixed with the site root.
     """
     item = exhibitionItem()  # not populated yet
     payload = json.loads(response.text)
     for entry in payload['data']['zhuanti']:
         name = entry["title"]
         theme = entry["theme"]
         image_url = "http://www.tibetmuseum.com.cn/" + entry["list_img"]
         print((name, image_url, theme))
Ejemplo n.º 12
0
 def parse_content(self, response):
     """Print name, image URL and description of one exhibition page.

     The name is the first whitespace-separated token of the ``<h1>`` text.
     The image URL is the first ``img`` ``src`` inside ``div.cont``, resolved
     against the response URL. The description is all text inside
     ``div.cont`` with whitespace removed.
     """
     item = exhibitionItem()  # not populated yet
     image_src = response.xpath("//div[@class='cont']//img/@src").get()
     image_url = response.urljoin(image_src)
     heading = response.xpath("//h1/text()").get()
     name = heading.split()[0]
     text_nodes = response.xpath("//div[@class='cont']//text()").getall()
     # join -> split -> join drops all whitespace from the text.
     description = "".join("".join(text_nodes).split())
     print((name, image_url, description))
 def parse(self, response):
     """Print title, description and picture URL of every JSON record.

     The response body is decoded as JSON and the records are read from
     ``body -> list``. Nothing is yielded.
     """
     item = exhibitionItem()  # not populated yet
     payload = json.loads(response.text)
     for record in payload["body"]["list"]:
         title = record["title"]
         description = record["description"]
         picture_url = record["litPic"]
         print((title, description, picture_url))
Ejemplo n.º 14
0
    def parse(self, response):
        """Print the title of every entry under ``#caseListDIV``.

        Nothing is yielded; the item is created but not filled in.
        """
        item = exhibitionItem()
        entries = response.xpath('//*[@id="caseListDIV"]/div')
        for entry in entries:
            title = entry.xpath('./div/a/span/text()').extract_first()
            print(title)
Ejemplo n.º 15
0
 def parse(self, response):
     """Print name, introduction and cover picture of every JSON record.

     The response body is decoded as JSON and the records are read from
     ``data -> list``. Nothing is yielded.
     """
     item = exhibitionItem()  # not populated yet
     payload = json.loads(response.text)
     for record in payload["data"]["list"]:
         name = record["name"]
         introduction = record["exhibitIntroduce"]
         cover_url = record["coverPic"]
         print((name, introduction, cover_url))
    def parse(self, response):
        """Print the text of one exhibition title located by absolute XPath.

        Nothing is yielded; the item is created but not filled in.
        """
        item = exhibitionItem()
        # A list-based XPath ('/html/body/div[7]/div[2]/div[2]/div[1]/ul/li')
        # was tried earlier and left disabled in the original code.
        title_xpath = (
            '/html/body/div[3]/div/div[2]/table[1]/tbody/tr/td/div/table/tbody/tr[1]/td[1]/span/a/span/text()'
        )
        title = response.xpath(title_xpath).extract_first()
        print(title)
Ejemplo n.º 17
0
 def parse(self, response):
     """Print the title and contents of every row in the JSON response.

     The response body is decoded as JSON and the rows are read from the
     top-level ``Rows`` key. No image field is read. Nothing is yielded.
     """
     item = exhibitionItem()  # not populated yet
     rows = json.loads(response.text)["Rows"]
     for row in rows:
         title = row["Title"]
         contents = row["Contents"]
         print(title)
         print(contents)
Ejemplo n.º 18
0
 def parse_content(self, response):
     """Print name, image URL and description of one exhibition page.

     The image URL is the site host plus the first ``img`` ``src`` inside
     ``div.conBox``. The description is the direct ``<p>`` text of
     ``div.contentBox`` with every whitespace character removed.
     """
     item = exhibitionItem()  # not populated yet
     image_src = response.xpath("//div[@class='conBox']//img/@src").get()
     image_url = "http://www.sunyat-sen.org" + image_src
     name = response.xpath("//h3/text()").get()
     text_nodes = response.xpath(
         "//div[@class='contentBox']/p/text()").getall()
     # join -> split -> join drops all whitespace from the text.
     description = "".join("".join(text_nodes).split())
     print((name, image_url, description))
 def parse(self, response):
     """Print the name and description of every show in the JSON response.

     The response body is decoded as JSON and the shows are read from the
     top-level ``data`` key. No image field is read. Nothing is yielded.
     """
     item = exhibitionItem()  # not populated yet
     shows = json.loads(response.text)["data"]
     for show in shows:
         show_name = show["showName"]
         show_description = show["showDescription"]
         print(show_name)
         print(show_description)
Ejemplo n.º 20
0
 def parse(self, response):
     """Print the bold title text located by one absolute XPath.

     All text nodes under the matched ``<b>`` element are concatenated.
     Nothing is yielded; the item is created but not filled in.
     """
     item = exhibitionItem()
     # A loop over '/html/body/div[4]/div' was sketched in the original code
     # but left disabled.
     title_xpath = '/html/body/div[6]/div[4]/div[1]/div[9]/table/tbody/tr[1]/td/table[1]/tbody/tr[1]/td/b//text()'
     fragments = response.xpath(title_xpath).extract()
     print(''.join(fragments))
Ejemplo n.º 21
0
 def parse(self, response):
     """Request the description page of every exhibition listed in ``#LB``.

     For each ``<li>`` a fresh ``exhibitionItem`` receives the stripped
     link text as ``exhibName`` (``''`` when the text node is missing) and
     is passed to ``self.parse_desc`` in ``meta['item']``.

     Entries without an ``href`` are skipped. The link is resolved against
     the response URL, so absolute links are requested as before and
     relative ones no longer make ``scrapy.Request`` raise.
     """
     for li in response.xpath('//div[@id="LB"]/ul/li'):
         href = li.xpath('./h2/a/@href').extract_first()
         if href is None:
             continue

         item = exhibitionItem()
         # "or ''" guards the .strip() call against a missing text node.
         title = li.xpath('./h2/a/text()').extract_first() or ''
         item['exhibName'] = title.strip()
         yield scrapy.Request(response.urljoin(href),
                              callback=self.parse_desc,
                              meta={'item': item})
 def parse(self, response):
     """Request the detail page of every entry in the exhibition list.

     Each request carries its own fresh ``exhibitionItem`` in
     ``meta['item']`` for ``self.parse_detail``. Entries without an
     ``href`` are skipped.
     """
     li_list = response.xpath(
         '/html/body/div[4]/div/div[2]/div[2]/div[1]/ul/li')
     for li in li_list:
         href = li.xpath('./a/@href').extract_first()
         if href is None:
             # Host + None used to raise TypeError and abort the page.
             continue
         # One item per request. The original created a single item before
         # the loop, so every callback received the same object.
         # NOTE(review): parse_detail is not visible here; presumably it
         # fills in meta['item'], which a shared object would overwrite.
         item = exhibitionItem()
         yield scrapy.Request('http://www.bjqtm.com' + href,
                              callback=self.parse_detail,
                              meta={'item': item})
    def parse(self, response):
        """Print the ``<p>`` text of every ``div`` in the exhibition block.

        Nothing is yielded; the item is created but not filled in.
        """
        item = exhibitionItem()
        blocks = response.xpath(
            '/html/body/div[4]/div[3]/div/div[2]/div[2]/div[2]/div')
        # Reference path noted by the original author:
        # /html/body/div[4]/div[3]/div/div[2]/div[2]/div[2]/a[1]/p
        for block in blocks:
            title = block.xpath('./p/text()').extract_first()
            print(title)
Ejemplo n.º 24
0
 def parse(self, response):
     """Print the page's ``<h1>`` title and its concatenated body text.

     Nothing is yielded; the item is created but not filled in.
     """
     item = exhibitionItem()
     title = response.xpath(
         '/html/body/div[2]/div/div[2]/h1/text()').extract_first()
     print(title)
     body_nodes = response.xpath(
         '/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/div[1]//text()'
     ).extract()
     print(''.join(body_nodes))
Ejemplo n.º 25
0
 def parse(self, response):
     """Request the description page of every ``div.list-item`` entry.

     A fresh ``exhibitionItem`` receives the text of the ``div.h18`` link as
     ``exhibName`` and is passed to ``self.parse_desc`` in ``meta['item']``.
     The site host is prepended to the link's ``href``.

     Entries without an ``href`` are skipped instead of aborting the page.
     """
     base_url = 'https://www.wmhg.com.cn'
     for entry in response.xpath('//div[@class="list-item"]'):
         href = entry.xpath(
             './/div[@class="cont"]/div[@class="h18"]/a/@href'
         ).extract_first()
         if href is None:
             # Host + None used to raise TypeError.
             continue

         item = exhibitionItem()
         item['exhibName'] = entry.xpath(
             './/div[@class="h18"]/a/text()').extract_first()
         yield scrapy.Request(base_url + href,
                              callback=self.parse_desc,
                              meta={'item': item})
 def parse(self, response):
     """Request the description page of every entry in the title list.

     A fresh ``exhibitionItem`` receives the stripped first text node of
     the entry's link as ``exhibName`` (``''`` when there is none) and is
     passed to ``self.parse_desc`` in ``meta['item']``. The site host is
     prepended to the link's ``href``.

     Entries without an ``href`` are skipped instead of aborting the page.
     """
     base_url = 'http://www.hljmuseum.com'
     for li in response.xpath('//ul[@class="titlist04  f14"]/div/li'):
         href = li.xpath('./a/@href').extract_first()
         if href is None:
             # Host + None used to raise TypeError.
             continue

         item = exhibitionItem()
         # "or ''" guards the .strip() call against a missing text node.
         title = li.xpath('./a//text()').extract_first() or ''
         item['exhibName'] = title.strip()
         yield scrapy.Request(base_url + href,
                              callback=self.parse_desc,
                              meta={'item': item})
Ejemplo n.º 27
0
 def parse(self, response):
     """Request the content page of every ``li.arr_w`` exhibition entry.

     Title, description and link are read relative to each entry's
     ``div.con`` container, so the three values always belong to the same
     entry. The original indexed three independent page-wide lists, which
     misaligned or raised IndexError as soon as one entry lacked a node.

     The title (``@title`` of the heading link) and the first description
     text node are passed to ``self.parse_content`` as ``meta["title"]``
     and ``meta["description"]``; either may be ``None`` when absent.
     Entries without an ``href`` are skipped.
     """
     containers = response.xpath("//li[@class='arr_w']/div[@class='con']")
     for container in containers:
         href = container.xpath("./div[@class='h3']/a/@href").get()
         if href is None:
             continue
         title = container.xpath("./div[@class='h3']/a/@title").get()
         description = container.xpath("./div[@class='p']/text()").get()
         yield scrapy.Request("http://www.ynmuseum.org" + href,
                              callback=self.parse_content,
                              meta={"title": title,
                                    "description": description})
    def parse(self, response):
        """Print the link text found in the second cell of every table row.

        Nothing is yielded; the item is created but not filled in.
        """
        item = exhibitionItem()
        rows = response.xpath(
            '/html/body/div[2]/table/tbody/tr[3]/td/table/tbody/tr')
        # Reference path noted by the original author:
        # /html/body/div[2]/table/tbody/tr[3]/td/table[1]/tbody/tr[1]/td[2]/a/span
        for row in rows:
            title = row.xpath('./td[2]/a/span/text()').extract_first()
            print(title)
 def parse(self, response):
     """Request the detail page of every entry in the exhibition list.

     Each detail URL (site host + ``href``) is appended to
     ``self.deep_urls`` and requested with its own fresh ``exhibitionItem``
     in ``meta['item']`` for ``self.parse_detail``. Entries without an
     ``href`` are skipped.
     """
     for li in response.xpath('/html/body/div[5]/div/div[2]/ul/li'):
         href = li.xpath('./a/@href').extract_first()
         if href is None:
             # Host + None used to raise TypeError and abort the page.
             continue
         detail_url = "http://www.sxgm.org" + href
         self.deep_urls.append(detail_url)
         # One item per request. The original created a single item before
         # the loop, so every callback received the same object.
         # NOTE(review): parse_detail is not visible here; presumably it
         # fills in meta['item'], which a shared object would overwrite.
         item = exhibitionItem()
         yield scrapy.Request(detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
Ejemplo n.º 30
0
 def parse_content(self, response):
     """Print name, image URL and description of one exhibition page.

     All three values are read from ``div#simTestContent``: the first
     ``img`` ``src``, the ``<h1>`` text, and the text of every ``span``
     whose style is exactly ``font-size: 9pt`` with whitespace removed.
     """
     item = exhibitionItem()  # not populated yet
     image_url = response.xpath(
         "//div[@id='simTestContent']//img/@src").get()
     name = response.xpath(
         "//div[@id='simTestContent']/h1/text()").get()
     text_nodes = response.xpath(
         "//div[@id='simTestContent']//span[@style='font-size: 9pt']/text()"
     ).getall()
     # join -> split -> join drops all whitespace from the text.
     description = "".join("".join(text_nodes).split())
     print((name, image_url, description))