Exemple #1
0
    def parse_event_page(self, response):
        """Fill in event details from an event page and yield the loaded item.

        Continues populating the item passed in ``response.meta["item"]``
        (a fresh ``EventItem`` when none was passed) with:

        * ``name`` -- only when the incoming item does not already have one;
        * ``description``;
        * ``start_date`` -- taken from a ``/YYYY/MM/DD/`` segment of the
          response URL, when the URL contains one;
        * ``image_urls`` / ``image_name`` -- from the first image matched on
          the page, when there is one.

        Yields:
            The item produced by ``loader.load_item()``.
        """
        hxs = HtmlXPathSelector(response)
        item = response.meta.get("item", EventItem())
        loader = GuggenheimLoader(item=item, response=response)
        if not item.get(u"name", ""):
            loader.add_xpath(u"name", '//div[@class="cal_event_catname"]/following-sibling::h4/text()')
        loader.add_xpath(u"description", "//div[@class='cal_event_description']/p/text()")

        # The start date is read from the URL path (.../YYYY/MM/DD/...).
        # re.search() returns None for URLs without such a segment; in that
        # case the item is still yielded, just without a start_date.
        date_match = re.search(r"(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/", response.url)
        if date_match:
            loader.add_value(
                u"start_date",
                "%s/%s/%s" % (
                    date_match.group("year"),
                    date_match.group("month"),
                    date_match.group("day"),
                ),
            )

        image_urls = hxs.select(
            '//div[contains(@class,"calendar_item")]/div[@class="row-picture"]/img/@src'
        ).extract()
        if image_urls:
            # Only the first matched image is kept.
            first_image = image_urls[0]
            loader.add_value(u"image_urls", [first_image])
            loader.add_value(u"image_name", os.path.basename(first_image))

        yield loader.load_item()
Exemple #2
0
    def parse(self, response):
        """Parse the calendar listing and schedule one request per event link.

        For every anchor found inside a clickable month-item block, a loader
        is pre-filled with the fixed venue fields, the time text that
        precedes the link, the link text as ``name`` and the listing URL as
        ``link``. The partially loaded item is handed to
        ``parse_event_page`` through ``meta["item"]``.

        Time text handling:

        * text containing a comma is split on the comma (see FIXME below);
        * text containing an en dash or a hyphen is treated as a
          ``start - end`` range; when the start has no am/pm marker, the
          marker of the end time is appended to it (``6:30-8 pm`` ->
          start ``6:30 pm``);
        * anything else is stored as ``start_time`` unchanged.

        Yields:
            ``Request`` objects whose callback is ``self.parse_event_page``.
        """
        # Function-scope import: the file's import block is not visible here.
        import logging

        log = logging.getLogger(__name__)

        venue_fields = (
            (u"rr_identifier", u"GHM"),
            (u"rr_publisher_market", u"NYC"),
            (u"place_name", u"Solomon R. Guggenheim Museum"),
            (u"place_street_addr", u"1071 5th Ave (at E 89th St)"),
            (u"place_city", u"New York"),
            (u"place_state", u"NY"),
            (u"place_zip", u"10128"),
            (u"place_lat", u"40.783018"),
            (u"place_long", u"-73.958888"),
            (u"foursquare_id", u"41706480f964a520a51d1fe3"),
        )

        hxs = HtmlXPathSelector(response)
        events = hxs.select('//div[contains(@class,"clickable")][contains(@class,"cal_events_month_item")]//a')
        for e in events:
            hrefs = e.select("./@href").extract()
            if not hrefs:
                # An anchor without href cannot be followed; skip it instead
                # of aborting the whole listing with an IndexError.
                continue

            loader = GuggenheimLoader(item=EventItem(), response=response)
            for field, value in venue_fields:
                loader.add_value(field, value)

            date = "".join(e.select("../preceding-sibling::text()").extract()).strip()
            if date:
                if "," in date:
                    log.debug("time text contains a comma: %r", date)
                    # FIXME
                    # a comma in the datetime values here means that the event will take place
                    # twice in the same day.
                    # need to create another EventItem.
                    # NOTE(review): until then the text after the comma is
                    # stored as end_time, although it presumably is the
                    # second showing's time -- confirm against real data.
                    parts = date.split(",")
                    loader.add_value(u"start_time", parts[0].strip())
                    loader.add_value(u"end_time", parts[1].strip())
                else:
                    # Split a "start - end" range on an en dash (U+2013) or a
                    # plain hyphen. Assumes the selector returned unicode text.
                    parts = [p.strip() for p in re.split(u"[\u2013-]", date, maxsplit=1)]
                    if len(parts) == 2 and parts[0] and parts[1]:
                        log.debug("time text is a range: %r", date)
                        start, end = parts
                        start_lower = start.lower()
                        if "am" not in start_lower and "pm" not in start_lower:
                            # Ranges such as "6:30-8 pm" carry the meridiem
                            # only on the end time; copy it to the start.
                            end_lower = end.lower()
                            for meridiem in ("am", "pm"):
                                if meridiem in end_lower:
                                    start = " ".join((start, meridiem))
                                    break
                        loader.add_value(u"start_time", start)
                        loader.add_value(u"end_time", end)
                    else:
                        log.debug("time text is a single value: %r", date)
                        loader.add_value(u"start_time", date)

            loader.add_value(u"name", e.select("./text()").extract())
            url = urlparse.urljoin(response.url, hrefs[0])
            # NOTE(review): "link" is set to the listing page URL, not the
            # event page URL -- confirm this is intended.
            loader.add_value(u"link", response.url)
            yield Request(url=url, callback=self.parse_event_page, meta={"item": loader.load_item()})