def parse_event_page(self, response):
    """Complete an event item from its detail page and yield it.

    The item started by ``parse`` is read from ``response.meta["item"]``;
    a fresh ``EventItem`` is used when none was passed. The name (only if
    still missing), the description, the start date (taken from the
    ``YYYY/MM/DD/`` segment of the page URL) and the first calendar image
    are added through a ``GuggenheimLoader``.

    Yields:
        The item returned by ``loader.load_item()``.
    """
    hxs = HtmlXPathSelector(response)
    item = response.meta.get("item", EventItem())
    loader = GuggenheimLoader(item=item, response=response)

    # Only scrape the name here when the incoming item does not have one.
    if not item.get(u"name", ""):
        loader.add_xpath(
            u"name",
            '//div[@class="cal_event_catname"]/following-sibling::h4/text()',
        )
    loader.add_xpath(
        u"description", "//div[@class='cal_event_description']/p/text()"
    )
    # Page-level venue extraction is disabled:
    # loader.add_xpath("place_name", '//text()[contains(.,"Venue:")]/following-sibling::a[1]/text()')

    # The event date is encoded in the URL path as .../YYYY/MM/DD/...
    date_match = re.search(
        r"(?P<year>\d{4})\/(?P<month>\d{2})\/(?P<day>\d{2})\/", response.url
    )
    # re.search returns None when the URL has no date segment; in that case
    # start_date is left unset instead of failing on None.groupdict().
    if date_match:
        loader.add_value(
            u"start_date",
            "%s/%s/%s" % (
                date_match.group("year"),
                date_match.group("month"),
                date_match.group("day"),
            ),
        )

    image_srcs = hxs.select(
        '//div[contains(@class,"calendar_item")]'
        '/div[@class="row-picture"]/img/@src'
    ).extract()
    if image_srcs:
        # Only the first matching image is kept.
        first_src = image_srcs[0]
        loader.add_value(u"image_urls", [first_src])
        loader.add_value(u"image_name", os.path.basename(first_src))

    yield loader.load_item()
def parse(self, response):
    """Yield one event-page request per event link on the listing page.

    For every matching anchor a partially filled ``EventItem`` is built:
    fixed publisher/venue fields, start and end time parsed from the text
    that precedes the link's parent element, and the link text as the name.
    The loaded item is handed to ``parse_event_page`` through
    ``Request.meta["item"]``.

    Anchors without an ``href`` attribute are skipped.
    """
    hxs = HtmlXPathSelector(response)
    events = hxs.select(
        '//div[contains(@class,"clickable")]'
        '[contains(@class,"cal_events_month_item")]//a'
    )

    # Separator between start and end time: a real en dash (U+2013), the
    # mis-decoded UTF-8 byte sequence the previous pattern looked for, or a
    # plain hyphen. Compiled once, outside the loop.
    range_sep = re.compile(u"\u2013|\u00e2\u0080\u0093|-")

    for e in events:
        hrefs = e.select("./@href").extract()
        if not hrefs:
            # Nothing to follow; previously this raised IndexError.
            continue

        loader = GuggenheimLoader(item=EventItem(), response=response)
        loader.add_value(u"rr_identifier", u"GHM")
        loader.add_value(u"rr_publisher_market", u"NYC")
        loader.add_value(u"place_name", u"Solomon R. Guggenheim Museum")
        loader.add_value(u"place_street_addr", u"1071 5th Ave (at E 89th St)")
        loader.add_value(u"place_city", u"New York")
        loader.add_value(u"place_state", u"NY")
        loader.add_value(u"place_zip", u"10128")
        loader.add_value(u"place_lat", u"40.783018")
        loader.add_value(u"place_long", u"-73.958888")
        loader.add_value(u"foursquare_id", u"41706480f964a520a51d1fe3")

        date = "".join(
            e.select("../preceding-sibling::text()").extract()
        ).strip()
        if date:
            if "," in date:
                print("got a `,`")
                # FIXME
                # a comma in the datetime values here means that the event
                # will take place twice in the same day.
                # need to create another EventItem.
                # for now adding only the first one.
                parts = date.split(",")
                loader.add_value(u"start_time", parts[0])
                loader.add_value(u"end_time", parts[1])
            else:
                # Detect and split with the same separator so that a
                # detected range always produces two parts.
                parts = range_sep.split(date, 1)
                if len(parts) > 1:
                    print("got a `-`")
                    start = parts[0].strip()
                    end = parts[1].strip()
                    start_lower = start.lower()
                    # A range such as "6:30-8 pm" states am/pm only on the
                    # end time; copy it to the start time unless the start
                    # already carries its own marker ("11 am-1 pm").
                    if (start and "am" not in start_lower
                            and "pm" not in start_lower):
                        end_lower = end.lower()
                        if "pm" in end_lower:
                            start = " ".join((start, "pm"))
                        elif "am" in end_lower:
                            start = " ".join((start, "am"))
                    if start:
                        loader.add_value(u"start_time", start)
                    if end:
                        loader.add_value(u"end_time", end)
                else:
                    print("got  %s" % date)
                    loader.add_value(u"start_time", date)

        loader.add_value(u"name", e.select("./text()").extract())
        url = urlparse.urljoin(response.url, hrefs[0])
        if url:
            # NOTE(review): this stores the listing page URL, not the event
            # page URL being requested - confirm that is intended.
            loader.add_value(u"link", response.url)
            yield Request(
                url=url,
                callback=self.parse_event_page,
                meta={"item": loader.load_item()},
            )