elems = self.root.findall('body//div[@class="addresspanel"]//p[@class="address"]') return '%s %s' % (tostring(elems[0], True), tostring(elems[1], True)) def get_primary_photo(self): return self.absolute_url(self.root.find('body//div[@id="mastHeadCarousel"]//li//img').get('data-original')) def get_photos(self): return [self.absolute_url(e.get('data-original')) for e in self.root.findall('body//div[@id="mastHeadCarousel"]//li//img')] def get_category(self): return values.Category.LODGING def get_sub_category(self): return values.SubCategory.HOTEL @staticmethod def expand_reservation_page(url, page_source_tree): new_url = page_source_tree.getroot().find('body//li[@class="img_info"]//p[@class="bw"]//a').get('href') return (new_url,) @staticmethod def expand_deep_info_page(url, ignored): host = urlparse.urlparse(url).netloc.lower() new_url = 'http://%s/en/hotel/home.html' % host return (new_url,) HyattScraper.HANDLEABLE_URL_PATTERNS = scraped_page.urlpatterns( '^http(s)?://[^/]+\.hyatt.com/[a-z]+/hotel/home.html.*$', ('^http(s)?://[^/]+\.hyatt.com/hyatt/reservations.*$', HyattScraper.expand_reservation_page, False, REQUIRES_CLIENT_PAGE_SOURCE), ('^http(s)?://[^/]+\.hyatt.com/[a-z]+/hotel/(?!home).*$', HyattScraper.expand_deep_info_page))
return [thumb.replace('_tn.jpg', '_lg.jpg') for thumb in thumb_srcs if '_tn.jpg' in thumb] def get_photo_page_url(self): return 'http://www.starwoodhotels.com/preferredguest/property/photos/index.html?propertyID=%s' % self.get_site_specific_entity_id() def get_photo_page(self): if not hasattr(self, '_photo_page'): self._photo_page = html_parsing.parse_tree(self.get_photo_page_url()) return self._photo_page def get_primary_photo(self): photo_url_re = re.compile('''entity\.thumbnailUrl=([^'",]+)''') for script in self.root.findall('body//script'): match = photo_url_re.search(script.text) if match: return self.absolute_url(match.group(1).replace('_tn.jpg', '_lg.jpg')) return None def get_site_specific_entity_id(self): return urlparse.parse_qs(urlparse.urlparse(self.url.lower()).query)['propertyid'][0] @staticmethod def expand_using_property_id(url, ignored): property_id = urlparse.parse_qs(urlparse.urlparse(url.lower()).query)['propertyid'][0] new_url = 'http://www.starwoodhotels.com/preferredguest/property/overview/index.html?propertyID=%s' % property_id return (new_url,) StarwoodScraper.HANDLEABLE_URL_PATTERNS = scraped_page.urlpatterns( '^http(s)?://www\.starwoodhotels\.com/preferredguest/property/overview/index\.html\?propertyID=\d+.*$', ('(?i)^http(s)?://www\.starwoodhotels\.com/.*propertyid=\d+.*$', StarwoodScraper.expand_using_property_id))
url = url.replace('/thumb/', '/main/') if '81x50' in url: url = url.replace('81x50', '675x359') else: url = url.replace('.jpg', '_675x359_FitToBoxSmallDimension_Center.jpg') return url INFO_PAGE_RE = re.compile('^http(?:s)?://www(?:\d)?\.hilton\.com/([a-z]+)/hotels/([\w-]+)/([\w-]+)/[\w-]+/[\w-]+\.html.*$') @staticmethod def expand_info_page_url(url, ignored): match = HiltonScraper.INFO_PAGE_RE.match(url) language, region, property_name = match.group(1), match.group(2), match.group(3) new_url = 'http://www3.hilton.com/%s/hotels/%s/%s/index.html' % (language, region, property_name) return (new_url,) @staticmethod def expand_reservation_page(url, page_source_tree): details_popup = page_source_tree.getroot().find('body//div[@class="resHeaderHotelInfo"]//span[@class="links"]//a[@class="popup"]') if details_popup is not None: details_url = details_popup.get('href') new_url = details_url.replace('/popup/hotelDetails.html', '/index.html') return (new_url,) return () HiltonScraper.HANDLEABLE_URL_PATTERNS = scraped_page.urlpatterns( '^http(s)?://www(\d)?\.hilton\.com/[a-z]+/hotels/[\w-]+/[\w-]+/index\.html.*$', ('^http(s)?://www(\d)?\.hilton\.com/[a-z]+/hotels/[\w-]+/[\w-]+/[\w-]+/[\w-]+\.html.*$', HiltonScraper.expand_info_page_url), ('^http(s)?://secure(\d)?\.hilton\.com/.*$', HiltonScraper.expand_reservation_page, False, REQUIRES_CLIENT_PAGE_SOURCE))