Ejemplo n.º 1
0
 def parse_exhibit(self, response):
     """Build an ExhibitItem for a single exhibition detail page.

     The title, image link and raw date string are read from
     ``response.meta``; the description is scraped from the page body.
     The fellowship-exhibitions URL is skipped and yields nothing.
     """
     if response.url == 'https://www.vmfa.museum/exhibitions/exhibitions/fellowship-exhibitions/':
         return

     item = ExhibitItem()
     meta = response.meta
     exhibit_title = meta['title']
     picture = meta['image_link']
     raw_date = meta['date']

     # An en dash separates the opening date from the closing date.
     date_parts = raw_date.split("–")
     if len(date_parts) > 1:
         opens, closes = date_parts[0], date_parts[1]
     else:
         opens, closes = raw_date, ""

     # Keep fragments longer than three characters; a fragment with a
     # period among its last three characters also gets a line break.
     stripped = (
         text.strip()
         for text in response.css('.tab.tab_content > p::text').getall()
     )
     body_text = "".join(
         fragment + "\n" if "." in fragment[-3:] else fragment
         for fragment in stripped
         if len(fragment) > 3
     )

     item['title'] = exhibit_title
     item['start_date'] = opens
     item['end_date'] = closes
     item['description'] = body_text
     item['exhibit_url'] = response.url
     item['image_link'] = picture
     item['museum'] = 'Virgina Museum of Fine Arts'
     item['exhibit_html'] = response.body
     yield item
Ejemplo n.º 2
0
    def parse_exhibit(self, response):
        """Build an ExhibitItem for one Art Gallery of Ontario exhibition page.

        Title, image link and the raw date string come from
        ``response.meta``. The date is split on the first separator found,
        tried in this order: hyphen, " to ", en dash. The description is
        the text of the body field, cut off at the first run of five
        consecutive newlines.
        """
        item = ExhibitItem()
        meta = response.meta
        exhibit_title = meta['title']
        picture = meta['image_link']
        raw_date = meta['date']

        # Without any separator the whole string is the start date.
        opens, closes = raw_date, ""
        for separator in ("-", " to ", "–"):
            if raw_date.find(separator) != -1:
                pieces = raw_date.split(separator)
                opens, closes = pieces[0], pieces[1]
                break

        body_field = response.css(
            ".clearfix.text-formatted.field.field--name-body.field--type-text-with-summary"
            ".field--label-hidden.field__item")
        full_text = "".join(body_field.css("*::text").getall())
        # Only the text before the first five-newline gap is kept.
        body_text = full_text.split("\n\n\n\n\n", 1)[0]

        item['title'] = exhibit_title
        item['start_date'] = opens
        item['end_date'] = closes
        item['description'] = body_text
        item['exhibit_url'] = response.url
        item['image_link'] = picture
        item['museum'] = 'Art Gallery of Ontario'
        item['exhibit_html'] = response.body
        yield item
Ejemplo n.º 3
0
    def parse_exhibit(self, response):
        """Build an ExhibitItem from one exhibition page.

        The title and description are scraped from the page; the image
        link comes from ``response.meta['image_link']``. The dates are
        taken from the description fragments themselves: a fragment that
        names a month and contains an en dash is split into start and end
        date, and a fragment equal to "Year-round" becomes the start date
        with an empty end date. When several fragments match, the last one
        wins; when none match, both dates are empty strings.

        Yields:
            One populated ExhibitItem (``exhibit_html`` is not set).
        """
        self.logger.info("Visting {}".format(response.url))
        exhibit_item = ExhibitItem()
        title = response.css('.page_title > h1::text').get()
        description_list = response.css(
            '.module.wysiwyg.row > .col-xs-12 > p > *::text').getall()
        self.logger.debug("Description fragments: %s", description_list)

        # Compiled once instead of re-parsing the pattern per fragment.
        month_pattern = re.compile(
            "september|october|november|december|january|february|march|april|may|june|july|august")

        # Default to empty dates so a page without a recognizable date
        # line still yields an item instead of raising UnboundLocalError.
        start_date = ""
        end_date = ""
        description = []
        for d in description_list:
            if month_pattern.search(d.lower()) and "–" in d:
                pieces = d.split("–")
                start_date = pieces[0]
                end_date = pieces[1]
            if d == "Year-round":
                start_date = d
                end_date = ""
            description.append(d.strip() + "\n")

        image_link = response.meta['image_link']
        description = "".join(description)
        exhibit_item['title'] = title
        exhibit_item['start_date'] = start_date
        exhibit_item['end_date'] = end_date
        exhibit_item['description'] = description
        exhibit_item['exhibit_url'] = response.url
        exhibit_item['image_link'] = image_link
        exhibit_item['museum'] = 'New York Botanical Gardens'
        #exhibit_item['exhibit_html'] = response.body
        yield exhibit_item
Ejemplo n.º 4
0
 def parse_description(self, response):
     """Build an ExhibitItem for one SFMOMA exhibition detail page.

     ``response.meta`` must carry ``'title'``, ``'date'`` and
     ``'image link'`` (the key contains a space). The date string is split
     on a hyphen or, failing that, an en dash. If the start part has no
     ", "-separated year, the second ", "-separated piece of the end date
     is appended to it, when the end date has one. A date without either
     separator becomes the start date with an empty end date.

     Yields:
         One populated ExhibitItem.
     """
     title = response.meta['title']
     date = str(response.meta['date'])

     start_date, end_date = date, ""
     # The hyphen is tried first, then the en dash.
     for separator in ("-", "–"):
         pieces = date.split(separator)
         if len(pieces) > 1:
             start_date, end_date = pieces[0], pieces[1]
             # No year provided for the start date: borrow it from the end
             # date, but only if the end date actually has one. Indexing
             # blindly raised IndexError for ranges such as "May 1-May 5".
             if len(start_date.split(", ")) == 1:
                 end_parts = end_date.split(", ")
                 if len(end_parts) > 1:
                     start_date = start_date + ", " + end_parts[1]
             break

     image_link = response.meta['image link']
     url = response.url
     description = "".join(
         response.css('.exhibitioncard-wrapper-copy').css(
             '*::text').getall()).strip()
     exhibit_item = ExhibitItem()
     exhibit_item['title'] = title
     exhibit_item['start_date'] = start_date
     exhibit_item['end_date'] = end_date
     exhibit_item['description'] = description
     exhibit_item['exhibit_url'] = url
     exhibit_item['image_link'] = image_link
     exhibit_item['museum'] = "San Francisco Museum of Modern Art"
     exhibit_item['exhibit_html'] = response.body
     yield exhibit_item
Ejemplo n.º 5
0
    def parse_exhibit(self, response):
        """Build an ExhibitItem for one Denver Art Museum exhibition page.

        Pages without an ``.exhibition`` element are not scraped; their
        URL is printed instead. Otherwise the title (with the subtitle
        appended after ": " when present), dates, image link and overview
        text are all read from the page.
        """
        self.logger.info("Visting {}".format(response.url))
        item = ExhibitItem()
        if not response.css('.exhibition'):
            print(response.url)
            return

        heading = response.css('.title.knockout').css('h1::text').get().strip()
        subtitle = response.css('.subtitle::text').get()
        if subtitle:
            heading = heading + ": " + subtitle.strip()

        recurring = response.css('.date-recur-date')
        if recurring:
            # The first and third text nodes hold the start and end date.
            date_texts = recurring.css("*::text").getall()
            opens = date_texts[0]
            closes = date_texts[2]
        else:
            opens = response.css('.dateline::text').get().strip()
            closes = ""

        # Prefer the header picture; fall back to the header video source.
        picture = (
            response.xpath('/html/body/div[1]/div[1]/main/div/header/div[1]/div[1]/div[1]/picture/img/@data-src').get()
            or response.xpath('//*[@id="header-video"]/source//@src').get()
        )

        overview = "".join(response.css(".overview").css("*::text").getall()).strip()

        item['title'] = heading
        item['start_date'] = opens
        item['end_date'] = closes
        item['description'] = overview
        item['exhibit_url'] = response.url
        item['image_link'] = picture
        item['museum'] = 'Denver Art Museum'
        item['exhibit_html'] = response.body
        yield item
 def parse_exhibit(self, response):
     """Build an ExhibitItem for one Barnes Foundation exhibition page.

     Title, image link and the raw date string come from
     ``response.meta``. A date containing an en dash is split into start
     and end; a date containing "Until" only has an end date; any other
     non-empty date is used as the start date. The description is every
     paragraph text node, stripped and newline-terminated.
     """
     item = ExhibitItem()
     meta = response.meta
     exhibit_title = meta['title']
     picture = meta['image_link']
     raw_date = meta['date']

     opens, closes = "", ""
     if raw_date:
         pieces = raw_date.split("–")
         if len(pieces) > 1:
             opens, closes = pieces[0], pieces[1]
         elif "Until" in raw_date:
             closes = raw_date.split("Until ")[1]
         else:
             opens = raw_date

     paragraph_texts = response.css('.m-block__column').css(
         'p *::text').getall()
     body_text = "".join(text.strip() + "\n" for text in paragraph_texts)

     item['title'] = exhibit_title
     item['start_date'] = opens
     item['end_date'] = closes
     item['description'] = body_text
     item['exhibit_url'] = response.url
     item['image_link'] = picture
     item['museum'] = 'The Barnes Foundation'
     item['exhibit_html'] = response.body
     yield item
Ejemplo n.º 7
0
 def parse_html(self, response):
     """Yield an ExhibitItem assembled from request meta plus the page HTML.

     All descriptive fields were collected earlier and are copied from
     ``response.meta``; this callback only adds the page URL and body.
     """
     meta = response.meta
     item = ExhibitItem()
     # Copied verbatim from the request meta, in item field order.
     for field in ('title', 'start_date', 'end_date', 'description'):
         item[field] = meta[field]
     item['exhibit_url'] = response.url
     item['museum'] = 'The Museum of Fine Arts, Houston'
     item['image_link'] = meta['image_link']
     item['exhibit_html'] = response.body
     yield item
 def parse_exhibit(self, response):
     """Build an ExhibitItem for one California Science Center page.

     The title, image link and description are scraped from the page. The
     start date is ``response.meta['date']`` and the end date is always
     an empty string.
     """
     item = ExhibitItem()
     heading = response.css('.main-article__header-inner > h1::text').get()
     picture = response.css('picture > img::attr(src)').get()
     opens = response.meta['date']
     content_texts = response.css('.main-article__content > *::text').getall()
     body_text = "".join(content_texts).strip()

     item['title'] = heading
     item['start_date'] = opens
     item['end_date'] = ""
     item['description'] = body_text
     item['exhibit_url'] = response.url
     item['image_link'] = picture
     item['museum'] = 'California Science Center'
     item['exhibit_html'] = response.body
     yield item
Ejemplo n.º 9
0
    def parse_exhibit(self, response):
        """Build an ExhibitItem for one Milwaukee Art Museum exhibition page.

        Title, date and image link come from ``response.meta``. The
        description is taken from the first of three containers that
        yields any paragraph text. Date handling: an en dash splits start
        and end; "Until" gives only an end date; any other non-empty date
        is the start date.
        """
        item = ExhibitItem()
        meta = response.meta
        exhibit_title = meta['title']
        raw_date = meta['date']
        picture = meta['image_link']
        page_url = response.url

        # Try progressively broader containers until one has text.
        fragments = []
        for container in ('.exhibition-description', '.exhibition-details', '#main'):
            fragments = response.css(container).css('p *::text').getall()
            if fragments:
                break

        # Fragments of three characters or fewer are dropped. A period
        # among the last three characters ends the line; otherwise the
        # fragment is followed by a space.
        cleaned = (text.strip() for text in fragments)
        body_text = "".join(
            piece + ("\n" if "." in piece[-3:] else " ")
            for piece in cleaned
            if len(piece) > 3
        )

        opens, closes = "", ""
        if raw_date:
            pieces = raw_date.split("–")
            if len(pieces) > 1:
                opens, closes = pieces[0], pieces[1]
            elif "Until" in raw_date:
                closes = raw_date.split("Until ")[1]
            else:
                opens = raw_date

        item['title'] = exhibit_title
        item['start_date'] = opens
        item['end_date'] = closes
        item['description'] = body_text
        item['exhibit_url'] = page_url
        item['image_link'] = picture
        item['museum'] = 'Milwaukee Art Museum'
        item['exhibit_html'] = response.body
        yield item
Ejemplo n.º 10
0
 def parse(self, response):
     """Schedule a detail-page request for every exhibition card listed.

     For each bordered grid item the title, date text, link and image
     source are extracted and passed to ``self.parse_description`` through
     the request meta under the keys ``'title'``, ``'date'`` and
     ``'image link'`` (the last key contains a space).

     Cards without a link are skipped with a warning, and relative links
     are resolved against the listing page URL, so one malformed card no
     longer aborts the remaining ones. A card without a date node is
     forwarded with an empty date string.

     Yields:
         One Request per exhibition card that has a link.
     """
     exhibits = response.css('.exhibitionsgrid-wrapper-grid').css(
         '.exhibitionsgrid-wrapper-grid-item.item-with-border')
     for exhibit in exhibits:
         title = exhibit.css('::attr(title)').get()
         url = exhibit.css('::attr(href)').get()
         if not url:
             # Request() rejects a missing URL; skip just this card.
             self.logger.warning(
                 "Skipping exhibit without a link: %r", title)
             continue
         # .get() returns None when the date node is absent.
         date = (exhibit.css(
             '.exhibitionsgrid-wrapper-grid-item-text-date::text').get()
             or "").strip()
         image_link = exhibit.css(
             '.exhibitionsgrid-wrapper-grid-item-image::attr(src)').get()
         # urljoin leaves absolute URLs untouched and resolves relative
         # ones, which Request() would otherwise reject.
         yield Request(url=response.urljoin(url),
                       callback=self.parse_description,
                       meta={
                           'title': title,
                           'date': date,
                           'image link': image_link
                       })
Ejemplo n.º 11
0
 def parse_exhibit(self, response):
     """Build an ExhibitItem for one High Museum of Art exhibition page.

     Title, image link and the raw date string come from
     ``response.meta``. Date handling: an en dash splits start and end;
     "Through" gives only an end date; any other non-empty date is the
     start date; a falsy date leaves both empty.

     The description is read from the one-column paragraphs, falling back
     to the second ``.overview`` block. If that block does not exist the
     description is empty. A screenshot of the page is also written to
     ``./screenshots/<title>.jpg`` via ``imgkit``.

     Yields:
         One populated ExhibitItem.
     """
     exhibit_item = ExhibitItem()
     title = response.meta['title']
     image_link = response.meta['image_link']
     date = response.meta['date']
     if date:
         if date.find("–") != -1:
             start_date = date.split("–")[0]
             end_date = date.split("–")[1]
         elif date.find("Through") != -1:
             start_date = ""
             end_date = date.split("Through ")[1]
         else:
             start_date = date
             end_date = ""
     else:
         start_date = ""
         end_date = ""
     description_list = response.css('.two-columns > .one-column > p').css("*::text").getall()
     if not description_list:
         overviews = response.css('.two-columns > .overview')
         # Indexing [1] unconditionally raised IndexError on pages with
         # fewer than two overview blocks; those now get an empty
         # description instead of losing the whole item.
         if len(overviews) > 1:
             description_list = overviews[1].css('*::text').getall()
     description = []
     for d in description_list:
         d = d.strip()
         # Fragments of three characters or fewer are dropped.
         if len(d) > 3:
             # A period among the last three characters ends the line;
             # otherwise the fragment is followed by a space.
             description.append(d + ("\n" if "." in d[-3:] else " "))
     # NOTE(review): the title is used verbatim as a file name; titles
     # containing path separators may not produce the intended path.
     imgkit.from_url(response.url, './screenshots/{}.jpg'.format(title))
     description = "".join(description)
     exhibit_item['title'] = title
     exhibit_item['start_date'] = start_date
     exhibit_item['end_date'] = end_date
     exhibit_item['description'] = description
     exhibit_item['exhibit_url'] = response.url
     exhibit_item['image_link'] = image_link
     exhibit_item['museum'] = 'High Museum of Art'
     exhibit_item['exhibit_html'] = response.body
     yield exhibit_item
Ejemplo n.º 12
0
    def parse_exhibit(self, response):
        """Yield an ExhibitItem for one Library of Congress exhibition.

        Title, description, image link and the raw date string are all
        read from ``response.meta``; the page contributes only its URL
        and body. A date containing an en dash is split into start and
        end, otherwise the whole string is the start date.
        """
        item = ExhibitItem()
        meta = response.meta
        exhibit_title = meta['title']
        body_text = meta['description']
        picture = meta['image_link']
        raw_date = meta['date']

        pieces = raw_date.split("–")
        if len(pieces) > 1:
            opens, closes = pieces[0], pieces[1]
        else:
            opens, closes = raw_date, ""

        item['title'] = exhibit_title
        item['start_date'] = opens
        item['end_date'] = closes
        item['description'] = body_text
        item['exhibit_url'] = response.url
        item['image_link'] = picture
        item['museum'] = 'The Library of Congress'
        item['exhibit_html'] = response.body
        yield item
Ejemplo n.º 13
0
 def parse_exhibit(self, response):
     """Build an ExhibitItem for one LACMA exhibition page.

     Dates and the image link come from ``response.meta`` under the keys
     ``'start date'``, ``'end date'`` and ``'image link'`` (each contains
     a space). When only an end date is present it is moved into the
     start date. Title and description are scraped from the page.
     """
     item = ExhibitItem()
     heading = response.css('.fixed-page-title::text').get()
     meta = response.meta
     opens = meta['start date']
     closes = meta['end date']
     if not opens and closes:
         # Only one date is known: report it as the start date.
         opens, closes = closes, ""
     picture = meta['image link']
     page_url = response.url

     body_field = response.css(
         '.field.field--name-body.field--type-text-with-summary.field--label-hidden.field--item'
     )[0]
     body_text = "".join(body_field.css('*::text').getall()).strip()

     item['title'] = heading
     item['start_date'] = opens
     item['end_date'] = closes
     item['description'] = body_text
     item['exhibit_url'] = page_url
     item['image_link'] = picture
     item['museum'] = 'Los Angeles County Museum of Art'
     item['exhibit_html'] = response.body
     yield item
Ejemplo n.º 14
0
 def parse_exhibit(self, response):
     """Build an ExhibitItem for one Cleveland Museum of Art page.

     The image link comes from ``response.meta['image link']`` (the key
     contains a space); title, description and dates are scraped from the
     page. The description is the text of the first paragraph of the
     event field, or of the third paragraph when the first is empty.
     """
     item = ExhibitItem()
     heading = response.css('.panel-pane.pane-node-content').css(
         '.pane-title::text').get().strip()
     picture = response.meta['image link']
     page_url = response.url

     paragraphs = response.css(
         '.entity.entity-field-collection-item.field-collection-item-field-event.clearfix'
     ).css('.field-item.even').css('p')
     fragments = paragraphs[0].css('::text').getall()
     if not fragments:
         # On at least one page the body text sits in the third <p>.
         fragments = paragraphs[2].css('::text').getall()

     # Fragments shorter than two characters are dropped; a period in
     # either of the last two characters ends the line.
     body_text = "".join(
         piece + '\n' if "." in piece[-2:] else piece
         for piece in fragments
         if len(piece) >= 2
     )

     # The first and third text nodes of the date range are used; each
     # contributes its second ", "-separated piece.
     range_texts = response.css('.date-display-range').css(
         '*::text').getall()
     opens = range_texts[0].split(", ")[1]
     closes = range_texts[2].split(", ")[1]

     item['title'] = heading
     item['start_date'] = opens
     item['end_date'] = closes
     item['description'] = body_text
     item['exhibit_url'] = page_url
     item['image_link'] = picture
     item['museum'] = 'The Cleveland Museum of Art'
     item['exhibit_html'] = response.body
     yield item
Ejemplo n.º 15
0
    def parse_exhibit(self, response):
        """Build an ExhibitItem for one Seattle Art Museum exhibition page.

        Title, image link and the raw date string come from
        ``response.meta``. A date containing an en dash is split into
        start and end, otherwise the whole string is the start date. The
        description is read from ``.entry-content``, falling back to
        ``.sam-event-description`` when the former has no text.
        """
        item = ExhibitItem()
        meta = response.meta
        exhibit_title = meta['title']
        picture = meta['image_link']
        raw_date = meta['date']

        pieces = raw_date.split("–")
        if len(pieces) > 1:
            opens, closes = pieces[0], pieces[1]
        else:
            opens, closes = raw_date, ""

        fragments = response.css('.entry-content > *').css('::text').getall()
        if not fragments:
            fragments = response.css('.sam-event-description > *').css(
                '::text').getall()

        # Keep fragments longer than three characters; a fragment with a
        # period among its last three characters also gets a line break.
        cleaned = (text.strip() for text in fragments)
        body_text = "".join(
            piece + "\n" if "." in piece[-3:] else piece
            for piece in cleaned
            if len(piece) > 3
        )

        item['title'] = exhibit_title
        item['start_date'] = opens
        item['end_date'] = closes
        item['description'] = body_text
        item['exhibit_url'] = response.url
        item['image_link'] = picture
        item['museum'] = 'Seattle Art Museum'
        item['exhibit_html'] = response.body
        yield item