Example #1
 def parse(self, response):
     #scrapy crawl collection145
     url1 = 'http://www.wzmuseum.cn/Col/Col29/Index.aspx'
     url2 = 'http://www.wzmuseum.cn/Col/Col30/Index.aspx'
     url3 = 'http://www.wzmuseum.cn/Col/Col31/Index.aspx'
     url4 = 'http://www.wzmuseum.cn/Col/Col32/Index.aspx'
     url5 = 'http://www.wzmuseum.cn/Col/Col33/Index.aspx'
     url6 = 'http://www.wzmuseum.cn/Col/Col34/Index.aspx'
     url7 = 'http://www.wzmuseum.cn/Col/Col29/Index_2.aspx'
     urlll = ('1', url1, url2, url3, url4, url5, url6, url7)  # dummy first entry so self.cnt can index from 1
     item = collectionItem()
     a = (6, 2, 1, 1)
     _list = response.xpath('/html/body/div[1]/div[4]/div[2]/div[2]/ul/li')
     for li in _list:
         coll_name = li.xpath('.//span/text()').extract_first()
         #detail_url='http://www.westlakemuseum.com'+li.xpath('./td/a/@href').extract_first()
         #coll_name=str.strip(coll_name)
         coll_img = li.xpath('./a/@href').extract_first()
         print(coll_name)
         print(coll_img)
         coll_desc = ''
     # First walk the paginated template up to page 6, then visit the category URLs in urlll one by one.
     if self.page_num <= 6:
         new_url = (self.url % self.page_num)
         print(new_url)
         self.page_num += 1
         yield scrapy.Request(new_url, callback=self.parse)
     else:
         if self.cnt <= 7:
             new_url = urlll[self.cnt]
             print(new_url)
             self.cnt += 1
             yield scrapy.Request(new_url, callback=self.parse)
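A note on Example #1: the loop extracts coll_name and coll_img but never copies them into item or yields anything downstream. A minimal sketch of one way to finish the loop body, assuming the collectionItem field names used by the later examples (collectionName, collectionImage, museumID); the museumID value here is a placeholder, not taken from the source:

 # Sketch only: field names follow the later examples; museumID is a placeholder.
 for li in _list:
     item = collectionItem()
     name = li.xpath('.//span/text()').extract_first()
     item['collectionName'] = name.strip() if name else name
     item['collectionImage'] = li.xpath('./a/@href').extract_first()
     item['museumID'] = 0  # placeholder
     yield item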
Example #2
 def parse(self, response):
     # //*[@id="building2"]/div/div[2]/table/tbody
     coll_list = response.xpath('/html/body/div')
     for li in coll_list:
         item = collectionItem()
         # if li.xpath('./td/a/text()').extract_first() != None:
             # //*[@id="227613"]/text()
         # coll_name = li.xpath('./td/a/text()').extract_first()
         # # coll_name = ''.join(coll_name)
         # print(coll_name)
         # print(li.xpath('./td/a/@href').extract_first())
         img_new = li.xpath('./section/div[1]/div/a/img/@src').extract_first()
         img_new = img_new.replace(".", '', 2)  # assumes a "../"-style path: strip the two leading dots before joining
         img = "http://www.19371213.com.cn/collection" + img_new
         print(img)
         url_new = li.xpath('./section/div[1]/div/a/@href').extract_first()
         url_new = url_new.replace(".",'',2)
         detail_url = "http://www.19371213.com.cn/collection" + url_new
         # detail_url = 'https://www.dpm.org.cn/' + li.xpath('./td/a/@href').extract_first()
         item['museumID']=11
         item['collectionImage']=img
         yield scrapy.Request(detail_url,callback=self.parse_detail,meta={'item':item})
     
     if self.page_num <= 17:
         new_url = (self.url%self.page_num)
         self.page_num += 1
         yield scrapy.Request(new_url,callback=self.parse)
Example #3
 def parse(self, response):
     item = collectionItem()
     self.cnt += 1
     if (self.cnt == 5):
         self.urll = 'http://www.gthyjng.com/gcww/wwjs/krzzsq/'
     if (self.cnt == 6):
         self.urll = 'http://www.gthyjng.com/gcww/wwjs/jfzzsq/'
     if (self.cnt == 7):
         self.urll = 'http://www.gthyjng.com/gcww/wwjs/gjdww/'
     # Indices 0 and 1 are placeholders; the real listing URLs are selected via urlll[self.cnt + 1] below.
     urlll = ('1', '1',
              'http://www.gthyjng.com/gcww/wwjs/tdgmsq/index_2.htm',
              'http://www.gthyjng.com/gcww/wwjs/tdgmsq',
              'http://www.gthyjng.com/gcww/wwjs/tdgmsq/index_3.htm',
              'http://www.gthyjng.com/gcww/wwjs/krzzsq/',
              'http://www.gthyjng.com/gcww/wwjs/jfzzsq/',
              'http://www.gthyjng.com/gcww/wwjs/jfzzsq/index_1.htm',
              'http://www.gthyjng.com/gcww/wwjs/gjdww/')
     x = response.xpath('/html/body/div[4]/div/div[2]/div[2]/ul/li')
     for li in x:
         l1 = li.xpath('.//img/@src').extract_first()
         l1 = l1[1:len(l1)]
         coll_img = self.urll + l1
         print(coll_img)
         l1 = li.xpath('./a/@href').extract_first()
         l1 = l1[1:len(l1)]
         detail_url = self.urll + l1
         print(detail_url)
         print(self.cnt)
         yield scrapy.Request(detail_url,
                              callback=self.parse_detail,
                              meta={'item': item})
     if (self.cnt <= 7):  # move on to the next listing page/category in urlll
         new_url = urlll[self.cnt + 1]
         yield scrapy.Request(new_url, callback=self.parse)
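Example #3 rebuilds absolute URLs by slicing off the leading "." and concatenating self.urll, which only stays correct while self.urll happens to match the page being parsed. Scrapy's response.urljoin() resolves relative paths against the URL that was actually fetched; a sketch of the same loop using it (the item is still passed through empty, exactly as in the original):

 for li in x:
     coll_img = response.urljoin(li.xpath('.//img/@src').extract_first())
     detail_url = response.urljoin(li.xpath('./a/@href').extract_first())
     yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})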
Example #4
    def parse(self, response):
        item = collectionItem()
        coll_list = response.xpath(
            '/html/body/div[2]/div[3]/div[2]/div[2]/ul/li')

        for div in coll_list:
            #/html/body/div[2]/div[3]/div[2]/div[2]/ul/li[1]/div[1]/span
            coll_name = div.xpath('./div[1]/span/text()').extract_first()
            print(coll_name)
            coll_img = 'http:' + div.xpath(
                './div[2]/div[1]/a/img/@src').extract_first()
            #/html/body/div[2]/div[3]/div[2]/div[2]/ul/li[4]/div[2]/div[1]/a/img
            #http://services.ytta.cn
            print(coll_img)
            #/html/body/div[2]/div[3]/div[2]/div[2]/ul/li[1]/div[2]/div[1]/a
            detail_url = 'http://www.ytmuseum.com' + div.xpath(
                './div[2]/div[1]/a/@href').extract_first()
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num <= 8:
            new_url = (self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)
Example #5
 def parse2(self, response):
     item = collectionItem()
     a = (18, 1, 5, 1)
     div_list = response.xpath(
         '/html/body/div[1]/div[5]/div[3]/div[2]/ul/li')
     for li in div_list:
         coll_name = li.xpath('./div[2]/h3/a/text()').extract_first()
         #print(coll_name)
         x = li.xpath('./div[2]/p//text()').extract()
         x = switch(x)
         #x=ge(x)
         #print(x)
         coll_desc = x
         if (len(x) > 100):
             detail_url = 'http://www.81-china.com' + li.xpath(
                 './div[2]/h3/a/@href').extract_first()
             print(detail_url)
             yield scrapy.Request(detail_url,
                                  callback=self.parse_detail,
                                  meta={'item': item})
         coll_img = 'http://www.81-china.com' + li.xpath(
             './/img/@src').extract_first()
         print(coll_name)
         #print(coll_desc)
         print(coll_img)
Example #6
    def parse(self, response):

        # //*[@id="building2"]/div/div[2]/table/tbody
        coll_list = response.xpath(
            '//*[@id="app"]/div/div/div/div/main/ul/li[@class="col-list-i"]')
        # print(coll_list)
        for li in coll_list:
            item = collectionItem()
            # if li.xpath('./td/a/text()').extract_first() != None:
            # //*[@id="227613"]/text()
            coll_name = li.xpath('./a/h3/text()').extract_first()
            # coll_name = ''.join(coll_name)
            print(coll_name)
            # print(li.xpath('./td/a/@href').extract_first())
            detail_url = 'http://www.zhejiangmuseum.com' + li.xpath(
                './a/@href').extract_first()
            img = li.xpath('./a/figure/img/@src').extract_first()
            if img[0] == '/':
                img = 'http://www.zhejiangmuseum.com' + img
            print(img)
            self.deep_urls.append(detail_url)
            item['collectionName'] = coll_name
            item['museumID'] = 5
            item['collectionImage'] = img
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num <= 291:
            new_url = (self.url % self.page_num)
            self.page_num += 1
            self.new_urls.append(new_url)
            yield scrapy.Request(new_url, callback=self.parse)
Example #7
 def parse(self, response):
     #scrapy crawl collection148
     item = collectionItem()
     a=(126,35,5,3)
     url1='https://www.chinasilkmuseum.com/zgxd/list_22.aspx?page=%d'
     url2='https://www.chinasilkmuseum.com/xf/list_23.aspx?page=%d'
     url3='https://www.chinasilkmuseum.com/mzx/list_24.aspx?page=%d'
     urlll=('1',url1,url2,url3)
     _list=response.xpath('/html/body/div[1]/div/div[8]/div/ul/li')
     for li in _list:
         coll_name=li.xpath('./p/a/text()').extract_first()
         #detail_url='http://www.westlakemuseum.com'+li.xpath('./td/a/@href').extract_first()
         coll_name=str.strip(coll_name)
         coll_img='https://www.chinasilkmuseum.com'+li.xpath('./a/img/@src').extract_first()
         print(coll_name)
         print(coll_img)
         detail_url='https://www.chinasilkmuseum.com'+li.xpath('./a/@href').extract_first()
         #print((detail_url))
         yield scrapy.Request(detail_url,callback=self.parse_detail,meta={'item':item})
     # a[] holds the last page for the current URL template (indexed by self.cnt); once exhausted, switch to the next template in urlll.
     if self.page_num <= a[self.cnt-1]:
         new_url = (self.url%self.page_num)
         print(new_url)
         self.page_num += 1
         yield scrapy.Request(new_url,callback=self.parse)
     else :
         if self.cnt<=3:
             self.url=urlll[self.cnt]
             self.cnt+=1
             new_url=(self.url%1)
             print(new_url)
             self.page_num=2
             yield scrapy.Request(new_url,callback=self.parse)
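The page_num/cnt bookkeeping in Example #7 rotates through three list templates whose page counts sit in the tuple a. Assuming those counts line up positionally with url1..url3 (126, 35 and 5 pages), the same crawl could be enumerated up front in start_requests, which removes the mutable spider state; a sketch:

 def start_requests(self):
     # Sketch: page counts taken from a = (126, 35, 5, ...) in Example #7 (assumed mapping).
     listings = [
         ('https://www.chinasilkmuseum.com/zgxd/list_22.aspx?page=%d', 126),
         ('https://www.chinasilkmuseum.com/xf/list_23.aspx?page=%d', 35),
         ('https://www.chinasilkmuseum.com/mzx/list_24.aspx?page=%d', 5),
     ]
     for template, pages in listings:
         for page in range(1, pages + 1):
             yield scrapy.Request(template % page, callback=self.parse)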
Example #8
    def parse(self, response):
        # maxn = response.xpath('//*[@class="active"]/text()').extract_first()
        # maxn = ''.join(maxn)
        # maxn = int(maxn)
        # //*[@id="building2"]/div/div[2]/table/tbody
        # if maxn == '1':
        #     self.cot += 1
        coll_list = response.xpath('//*[@id="articleListTable"]/ul/li')
        for li in coll_list:
            item = collectionItem()
            # //*[@id="227613"]/text()
            coll_name = li.xpath('./a/h5/text()').extract_first()
            # coll_name = ''.join(coll_name)
            print(coll_name)
            # print(li.xpath('./td/a/@href').extract_first())
            detail_url = 'http://www.chnmus.net' + li.xpath(
                './a/@href').extract_first()
            coll_img = li.xpath('./a/img/@src').extract_first()
            coll_img = 'http://www.chnmus.net' + coll_img
            item['collectionName'] = coll_name
            item['museumID'] = 9
            item['collectionImage'] = coll_img
            print(coll_img)
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num <= 10:
            # new_url = (self.url%(self.co_list[self.cot],self.page_num))
            new_url = (self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)
Example #9
    def parse(self, response):
        item = collectionItem()
        coll_list = response.xpath(
            '/html/body/div[3]/div/div[2]/table[1]/tbody/tr/td/div/table/tbody/tr[1]/td'
        )
        coll_name = response.xpath(
            '/html/body/div[3]/div/div[2]/table[1]/tbody/tr/td/div/table/tbody/tr[1]/td[1]/span/a/span/text()'
        ).extract_first()
        print(coll_name)
        #/html/body/div[3]/div/div[2]/table[1]/tbody/tr/td/div/table/tbody/tr[1]/td[1]
        for div in coll_list:

            coll_name = div.xpath('./span/a/span/text()').extract_first()
            print(coll_name)

            coll_img = div.xpath(
                './table/tbody/tr/td/a/img/@src').extract_first()
            print(coll_img)

            detail_url = response.urljoin(  # urljoin keeps absolute hrefs as-is and resolves relative ones
                div.xpath('./table/tbody/tr/td/a/@href').extract_first())
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

        if self.page_num <= 4:
            new_url = (self.url % self.page_num)
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)
Example #10
 def parse(self, response):
     item = collectionItem()
     coll_name = response.xpath(
         '/html/body/table[3]/tbody/tr/td[3]/table/tbody/tr/td/table/tbody/tr[3]/td/ul/li[1]/a[1]/p//text()'
     ).extract()
     coll_name = ''.join(coll_name)
     print(coll_name)
Example #11
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["data"]["recordsList"]
     for i in coll_list:
         collectionName = i["name"]
         collectionDescription = i["introduce"]
         collectionImageUrl = i["picUrl"]
         print((collectionName, collectionDescription, collectionImageUrl))
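Example #11 reads the JSON records but only prints them. A minimal sketch of yielding items instead, assuming collectionItem exposes the collectionName/collectionImage fields seen in the XPath-based examples plus a collectionDescription field (an assumption; no example above stores a description on the item):

 for i in coll_list:
     item = collectionItem()
     item['collectionName'] = i["name"]
     item['collectionDescription'] = i["introduce"]  # assumed item field
     item['collectionImage'] = i["picUrl"]
     yield item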
Example #12
 def parse_content(self, response):
     item = collectionItem()
     collectionImageUrl = response.urljoin(response.xpath("//div[@class='collectdetail clearfix ']//img/@src").get())
     collectionName = response.xpath(
         "//h1/text()").get()
     collectionDescription = "".join("".join(response.xpath(
         "//div[@class='cont']/p/text()").getall()).split())
     print((collectionName, collectionImageUrl,  collectionDescription))
Example #13
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["body"]["list"]
     for i in coll_list:
         collectionName = i["title"]
         collectionDescription = i["description"]
         collectionImageUrl = i["litPic"]
         print((collectionName, collectionDescription, collectionImageUrl))
Example #14
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["data"]
     for i in coll_list:
         collectionName = i["title"]
         collectionDescription = i["texture"]
         collectionImageUrl = "https://www.gzchenjiaci.com" + i["imgurl"]
         print((collectionName, collectionDescription, collectionImageUrl))
Example #15
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["data"]["records"]
     for i in coll_list:
         coll_name = i["exhibitName"]
         coll_desc = i["description"]
         #coll_img = i[""]
         print((coll_name, coll_desc))
Example #16
 def parse_content(self, response):
     item = collectionItem()
     collectionName = response.xpath("//h1[2]").get()
     collectionDescription = "".join("".join(
         response.xpath('//span[@style]/text()').getall()).split())
     collectionImageUrl = response.xpath(
         '//div[@class="newsxx_nr"]//img/@src').get()
     print((collectionName, collectionImageUrl, collectionDescription))
Example #17
 def parse_content(self, response):
     item = collectionItem()
     collectionName = response.xpath("//h2/text()").get()
     collectionDescription = "".join("".join(
         response.xpath(
             '//div[@class="neirong"]/p/text()').getall()).split())
     collectionImageUrl = "http://www.gxmuseum.cn" + response.xpath(
         '//div[@class="neirong"]//img/@src').get()
     print((collectionName, collectionImageUrl, collectionDescription))
Example #18
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["Rows"]
     for i in coll_list:
         coll_name = i["Title"]
         coll_desc = i["Contents"]
         #coll_img = i[""]
         print(coll_name)
         print(coll_desc)
Example #19
 def parse(self, response):
     item = collectionItem()
     li_list = json.loads(response.text)['data']
     for li in li_list:
         collectionName = li['name']
         # print(collectionName)
         collectionIntroduction = li['introduce']
         # print(collectionIntroduction)
         collectionImage = li['imgPath']
Example #20
 def parse_content(self, response):
     item = collectionItem()
     collectionImageUrl = "http://www.sunyat-sen.org" + response.xpath(
         "//div[@class='zwpic']/img/@src").get()
     collectionName = response.xpath("//h3/text()").get()
     collectionDescription = "".join("".join(
         response.xpath(
             "//div[@class='contentBox']//text()").getall()).split())
     print((collectionName, collectionImageUrl, collectionDescription))
Example #21
 def parse_content(self, response):
     item = collectionItem()
     collectionImageUrl = response.meta['img']
     collectionName = response.xpath(
         "//div[@class='titleBox']/p/text()").get()
     collectionDescription = "".join("".join(
         response.xpath(
             "//div[@class='textBox']//text()").getall()).split())
     print((collectionName, collectionImageUrl, collectionDescription))
Example #22
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["data"]
     for i in coll_list:
         coll_name = i["collectionName"]
         #coll_desc = i["mipOpenCulturalrelicInfo"]["collectionsCategory"]
         coll_img = i["picUrl"]
         print(coll_name)
         print(coll_img)
Example #23
 def parse(self, response):
     item = collectionItem()
     coll_list = response.xpath('/html/body/div[5]/div/div/div[2]/div')

     for div in coll_list:
         # Only follow entries that actually carry a link; yielding outside the
         # if-block could reuse a stale detail_url (or raise NameError on the first div).
         if div.xpath('./a/@href'):
             detail_url = 'http://www.qdyzyzmuseum.com' + div.xpath('./a/@href').extract_first()
             yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})
Example #24
 def parse_content(self, response):
     item = collectionItem()
     collectionImageUrl = "https://www.jc-museum.cn" + \
         response.xpath("//div[@class='box2 wf100']/img/@src").get()
     collectionName = response.xpath(
         "//div[@class='box1 wf100']/span/text()").get()
     description = "".join("".join(response.xpath(
         "//div[@class='box2 wf100']//text()").getall()).split())
     print((collectionName, collectionImageUrl, description))
Example #25
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["data"]["data"]
     for i in coll_list:
         collectionName = i["mingchen"]
         collectionDescription = i["niandai"] + " " + i["leibie"] + " " + i[
             "chicun"] + " " + i["baocun_zhuangtai"]
         collectionImageUrl = "http://www.hainanmuseum.org/cms/1/image/public/wenwu/" + i[
             "pics"][0] + ".png"
         print((collectionName, collectionDescription, collectionImageUrl))
Example #26
 def content_parse(self, response):
     item = collectionItem()
     doc = response.xpath("//div[@class='zhanlan-pic']")
     collectionName = doc.xpath(
         "./div[@class='list-right-bt']/text()").get()
     collectionImageUrl = "http://www.hylae.com" + doc.xpath(
         ".//img/@src").get()
     collectionDescription = "".join("".join(
         doc.xpath(".//p//text()").getall()).split())  #去除\xa0字符
     print((collectionName, collectionImageUrl, collectionDescription))
Example #27
 def parse(self, response):
     item = collectionItem()
     coll_list = json.loads(response.text)["data"]
     for i in coll_list:
         collectionName = i["Title"]
         contentHTML = i["Contents"]
         Selector = etree.HTML(contentHTML)
         collectionDescription = "".join(Selector.xpath("//p/text()"))
         collectionImageUrl = Selector.xpath("//img/@src")[0]
         print((collectionName, collectionDescription, collectionImageUrl))
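Example #27 leans on lxml's etree.HTML to pull text and image URLs out of the HTML fragment stored in each record's "Contents" field, so the spider module needs "from lxml import etree" (and "import json") at the top. A self-contained sketch of that extraction as a standalone helper; the helper name is hypothetical:

 import json
 from lxml import etree

 def extract_from_contents(record):
     # Parse the HTML fragment embedded in the JSON "Contents" field.
     selector = etree.HTML(record["Contents"])
     description = "".join(selector.xpath("//p/text()"))
     images = selector.xpath("//img/@src")
     return record["Title"], description, images[0] if images else None

 # Usage against a response shaped like the one above:
 # for record in json.loads(response.text)["data"]:
 #     print(extract_from_contents(record))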
Example #28
 def parse_content(self, response):
     item = collectionItem()
     collectionImageUrl = "http://www.ynmuseum.org" + \
         response.xpath(
             "//div[@class='yc_info']/img/@src").get()
     collectionDescription = "".join("".join(
         response.xpath(
             "//div[@class='yc_infoCon']/p//text()").getall()).split())
     collectionName = collectionDescription.split("》")[0] + "》"
     print((collectionName, collectionImageUrl, collectionDescription))
Example #29
 def parse(self, response):
     item = collectionItem()
     url_list = response.xpath(
         "//ul[@class='prod_list cf']/li/a/@href").getall()
     for i in url_list:
         yield scrapy.Request("http://www.ynmuseum.org" + i,
                              callback=self.parse_content)
     next_page = response.xpath(
         "//div[@class='page_w']/a[@class='next']/@href").get()
     if next_page is not None:
         yield scrapy.Request("http://www.ynmuseum.org" + next_page)
Example #30
 def parse(self, response):
     item = collectionItem()
     doc = response.xpath("//div[@class='list-right']")
     educationName = doc.xpath(
         ".//div[@class='list-right-bt']/text()").get()
     educationImageUrl = "http://www.hylae.com" + doc.xpath(
         ".//img/@src").get(
             default="/upfile/2019/07/20190731171111_475.jpg")
     educationDescription = "".join("".join(
         doc.xpath(".//p//text()").getall()).split())  #去除\xa0字符
     print((educationName, educationImageUrl, educationDescription))
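None of the examples show the collectionItem definition itself. A plausible items.py sketch, inferred from the fields the spiders above actually assign (collectionName, museumID, collectionImage); collectionDescription is an assumption, since the examples compute a description but never store it on the item:

 import scrapy

 class collectionItem(scrapy.Item):
     collectionName = scrapy.Field()         # assigned by several spiders above
     museumID = scrapy.Field()               # assigned as a hard-coded integer per museum
     collectionImage = scrapy.Field()        # assigned by several spiders above
     collectionDescription = scrapy.Field()  # assumed: computed above but never stored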