def scrape_entity_page(self, url):
    """Scrape a single entity page into a ``data.Entity``.

    Args:
        url: Page address; it is handed to ``html_parsing.parse_tree``.

    Returns:
        A ``data.Entity`` whose name is the text of the first ``<h1>`` in
        the "title-desc-inner" div, whose description is the text of the
        content paragraphs joined by blank lines, and whose photo_urls are
        the ``data-src`` values of the images inside the content div.

    Raises:
        IndexError: If the page has no matching ``<h1>``.
    """
    root = html_parsing.parse_tree(url).getroot()
    title_node = root.xpath('.//div[@class="title-desc-inner"]//h1')[0]
    name = html_parsing.tostring(title_node)
    # Paragraphs whose parent div is an image caption are left out.
    paragraphs = root.xpath(".//div[@class='content']//div[not(@class='image-caption')]/p")
    paragraph_texts = [html_parsing.tostring(paragraph) for paragraph in paragraphs]
    return data.Entity(
        name=name,
        description='\n\n'.join(paragraph_texts),
        photo_urls=root.xpath(".//div[@class='content']//img/@data-src"))
 def placemark_to_entity(self, placemark_elem):
     """Convert one Placemark element into a ``data.Entity``.

     Args:
         placemark_elem: Element queried with ``self.xpath`` for its
             ``ns:name``, ``ns:description`` and ``ns:Point`` children.

     Returns:
         A ``data.Entity`` with ``name``, ``description`` and ``latlng``.
         ``name`` and ``description`` are None when the corresponding child
         is missing (description also when its text is empty).
     """
     name_elems = self.xpath(placemark_elem, 'ns:name')
     # Test for emptiness, not identity: an empty result list is "not None"
     # and would otherwise fail on the [0] lookup.
     name = tostring(name_elems[0]) if name_elems else None
     description_elems = self.xpath(placemark_elem, 'ns:description')
     description_html = tostring(description_elems[0]) if description_elems else None
     # The description is stored as HTML; convert it to plain text.
     description = self.html_str_to_text(description_html) if description_html else None
     latlng = self.parse_latlng(self.xpath(placemark_elem, 'ns:Point'))
     return data.Entity(name=name, description=description, latlng=latlng)
Exemple #3
0
 def get_opening_hours(self):
     """Collect opening hours from the "hours-open" table.

     Every table row yields one line made of the text of its first cell,
     a tab, and the text of its second cell. The lines are joined with
     newlines and wrapped in ``data.OpeningHours`` as ``source_text``.

     Raises:
         IndexError: If a row has fewer than two ``<td>`` cells.
     """
     rows = self.root.xpath('.//div[@class="place-resume"]//table[@class="hours-open"]//tr')
     lines = []
     for row in rows:
         cells = row.xpath('.//td')
         lines.append('%s\t%s' % (tostring(cells[0]), tostring(cells[1])))
     return data.OpeningHours(source_text='\n'.join(lines))
 def get_opening_hours(self):
     """Collect opening hours from the "timeframes" list of the venue.

     Each "timeframe" list item yields one line: the text of its
     "timeframeDays" span, a tab, and the text of its "timeframeHours"
     span. The newline-joined lines become the ``source_text`` of the
     returned ``data.OpeningHours``.

     Raises:
         IndexError: If a list item lacks either span.
     """
     entries = self.root.xpath('.//div[@class="venueDetail"]//div[@class="allHours"]//ul[@class="timeframes"]//li[@class="timeframe"]')
     lines = []
     for entry in entries:
         days = tostring(entry.xpath('.//span[@class="timeframeDays"]')[0])
         hours = tostring(entry.xpath('.//span[@class="timeframeHours"]')[0])
         lines.append('%s\t%s' % (days, hours))
     return data.OpeningHours(source_text='\n'.join(lines))
 def parse_latlng(self):
     """Read the coordinates from the "geo-default" span of the page.

     Two layouts are tried in order: separate "latitude"/"longitude" spans
     (converted through ``utils.latlng_to_decimal``), then a single "geo"
     span holding "lat;lng" as decimal numbers.

     Returns:
         Whatever ``utils.latlng_to_decimal`` returns for the first layout,
         a ``{'lat': float, 'lng': float}`` dict for the second, or None
         when neither layout is present.
     """
     latitude_node = self.root.find('.//span[@class="geo-default"]//span[@class="latitude"]')
     longitude_node = self.root.find('.//span[@class="geo-default"]//span[@class="longitude"]')
     if latitude_node is None:
         combined_node = self.root.find('.//span[@class="geo-default"]//span[@class="geo"]')
         if combined_node is None:
             return None
         # Exactly two ";"-separated fields are expected here.
         lat, lng = tostring(combined_node).split(';')
         return {
             'lat': float(lat.strip()),
             'lng': float(lng.strip())
             }
     return utils.latlng_to_decimal(tostring(latitude_node), tostring(longitude_node))
 def get_address(self):
     """Assemble the address string for the current page.

     Hotel pages (URL containing "/hotels/") use the combined street/city
     subtitle followed by the country. Other pages use the street from the
     info list followed by city and country; when that street is missing,
     the address of the Google place lookup is returned instead.

     Returns:
         The address string, or None when the street is missing and the
         Google place lookup yields nothing.
     """
     city, country = self.get_city_and_country()
     if '/hotels/' in self.url:
         subtitle_nodes = self.root.xpath('.//span[contains(@class, "lodging__subtitle--address")]')
         street_and_city = tostring(subtitle_nodes[0], True)
         return '%s %s' % (street_and_city, country)

     street_node = self.root.find('.//dl[@class="info-list"]//dd[@class="copy--meta"]//strong')
     if street_node is None:
         place = self.lookup_google_place()
         if not place:
             return None
         return place.address
     return '%s %s %s' % (tostring(street_node, True), city, country)
 def get_address(self):
     """Build the address from the page's schema.org address markup.

     The street and locality spans are required; the postal code span is
     appended when present. Commas are stripped from every part and the
     parts are joined with single spaces.

     Returns:
         The assembled address. When street or locality is missing, the
         address of the Google place lookup is returned, or None if that
         lookup yields nothing.
     """
     street_node = self.root.find('.//li[@class="address"]//span[@itemprop="streetAddress"]')
     locality_node = self.root.find('.//li[@class="address"]//span[@itemprop="addressLocality"]')
     postal_node = self.root.find('.//li[@class="address"]//span[@itemprop="postalCode"]')

     if street_node is None or locality_node is None:
         # Guard the fallback: a failed lookup must yield None rather than
         # an AttributeError on ``.address``.
         google_place = self.lookup_google_place()
         return google_place.address if google_place else None

     address_nodes = [street_node, locality_node]
     if postal_node is not None:
         address_nodes.append(postal_node)
     return ' '.join(tostring(node, True).replace(',', '') for node in address_nodes)
 def get_address(self):
     """Return the address listed in the page's "infobox vcard" table.

     The table rows are scanned in order; the first row whose header cell
     text is exactly "Address" supplies the result, taken from that row's
     data cell.

     Returns:
         The text of the matching data cell, or None if no row matches.
     """
     for row in self.root.findall('.//table[@class="infobox vcard"]//tr'):
         header = row.find('.//th')
         if header is None or header.text != 'Address':
             continue
         return tostring(row.find('.//td'))
     return None
 def get_location_name(self):
     """Extract the location name from a "Travel Guide for ..." heading.

     Returns:
         The text of the first ``<h1>`` whose class contains "header", with
         the "Travel Guide for " prefix removed. None when there is no such
         heading or it does not contain "Travel Guide for".
     """
     header_nodes = self.root.xpath('.//h1[contains(@class, "header")]')
     if not header_nodes:
         return None
     header_text = html_parsing.tostring(header_nodes[0])
     if 'Travel Guide for' not in header_text:
         return None
     return header_text.replace('Travel Guide for ', '')
    def get_sub_category(self):
        """Derive the sub-category from the page's category list and URL.

        The comma-separated category text (if present) is lower-cased and
        split into labels. The result then depends on ``self.get_category()``:

        * food and drink: always RESTAURANT;
        * attractions: LANDMARK or MUSEUM depending on the labels;
        * lodging: HOTEL or VACATION_RENTAL depending on how the value of
          ``self.get_path_root()`` ends.

        Returns:
            A ``values.SubCategory`` member, or None when nothing matches.
        """
        category_nodes = self.root.xpath('.//li[contains(@class, "categoriesList")]//div[contains(@class, "categories")]')
        if category_nodes:
            categories = [s.strip().lower() for s in tostring(category_nodes[0]).split(',')]
        else:
            categories = []

        path_root = self.get_path_root()
        tc_category = self.get_category()
        if tc_category == values.Category.FOOD_AND_DRINK:
            # Gogobot doesn't seem to have categories like restaurant/bar/bakery.
            # They do have cuisine types like French though.
            return values.SubCategory.RESTAURANT
        elif tc_category == values.Category.ATTRACTIONS:
            if contains_any(categories, ['monument', 'historic site']):
                return values.SubCategory.LANDMARK
            elif contains_any(categories, ['sights and museums', 'art museum']):
                return values.SubCategory.MUSEUM
        # tc_category is a Category, so it must be compared against
        # Category.LODGING; comparing to a SubCategory never selects this
        # branch.
        elif tc_category == values.Category.LODGING:
            if path_root.endswith('hotel'):
                return values.SubCategory.HOTEL
            elif path_root.endswith('vacation-rental'):
                return values.SubCategory.VACATION_RENTAL
        return None
 def get_sub_category(self):
     """Map the page's category text to a ``values.SubCategory``.

     The lower-cased text of the "categories" div is searched for keywords.
     Which keywords are considered depends on ``self.get_category()``; the
     first keyword that matches, in the order written below, wins.

     Returns:
         The matching ``values.SubCategory`` member, or None when the
         category is not handled or no keyword matches.
     """
     node = self.root.find('.//div[@class="primaryInfo"]//div[@class="categories"]')
     lowered = tostring(node).lower()
     category = self.get_category()

     if category == values.Category.FOOD_AND_DRINK:
         if 'restaurant' in lowered:
             return values.SubCategory.RESTAURANT
         elif 'coffee' in lowered:
             return values.SubCategory.COFFEE_SHOP
         elif 'bar' in lowered:
             return values.SubCategory.BAR
         elif contains_any(lowered, ('ice cream', 'dessert')):
             return values.SubCategory.DESSERT
         elif 'bakery' in lowered:
             return values.SubCategory.BAKERY

     if category == values.Category.LODGING:
         if contains_any(lowered, ('hotel', 'motel')):
             return values.SubCategory.HOTEL
         elif 'hostel' in lowered:
             return values.SubCategory.HOSTEL

     if category == values.Category.ENTERTAINMENT:
         if contains_any(lowered, ('concert hall', 'jazz club', 'rock club')):
             return values.SubCategory.MUSIC
         elif 'stadium' in lowered:
             return values.SubCategory.SPORTS

     return None
Exemple #12
0
 def get_sub_category(self):
     """Classify the place as a bar, a nightclub, or a restaurant.

     The decision is based on the text before the first "|" in the "date"
     span of the "place-post" div, lower-cased. "bar" is checked before
     "club"; anything else is treated as a restaurant.

     Raises:
         IndexError: If the span is not present on the page.
     """
     date_node = self.root.xpath('.//div[@class="place-post"]//span[@class="date"]')[0]
     leading_field = tostring(date_node).split('|')[0]
     label = leading_field.strip().lower()
     if 'bar' in label:
         return values.SubCategory.BAR
     if 'club' in label:
         return values.SubCategory.NIGHTCLUB
     return values.SubCategory.RESTAURANT
Exemple #13
0
 def get_category(self):
     """Decide between lodging and food-and-drink from the category list.

     The comma-separated text of the "category-str-list" span is split into
     lower-cased labels. If any label is exactly "hotel", "hotels" or
     "bed & breakfast", the page is lodging; otherwise it is food and drink.
     """
     parent = self.root.find('body//span[@class="category-str-list"]')
     labels = [part.strip().lower() for part in tostring(parent).split(',')]
     is_lodging = any(label in labels for label in ('hotel', 'hotels', 'bed & breakfast'))
     if is_lodging:
         return values.Category.LODGING
     return values.Category.FOOD_AND_DRINK
 def parse(self):
     """Turn the parsed document into a ``data.TripPlan``.

     Every ``ns:Placemark`` under the root is converted with
     ``placemark_to_entity``; the resulting entities are then passed, one
     per argument tuple, to ``self.augment_entity`` via
     ``utils.parallelize``. The plan name is the document's ``ns:name``.

     Raises:
         IndexError: If the document has no ``ns:Document/ns:name``.
     """
     placemarks = self.xpath(self.root, './/ns:Placemark')
     raw_entities = [self.placemark_to_entity(placemark) for placemark in placemarks]
     arg_tuples = [(entity,) for entity in raw_entities]
     entities = utils.parallelize(self.augment_entity, arg_tuples)
     name_elem = self.xpath(self.root, 'ns:Document/ns:name')[0]
     # TODO: Parse the latlngs into a Bounds object for the trip plan.
     # Right now this happens in the javascript as a hack.
     return data.TripPlan(name=tostring(name_elem), entities=entities)
Exemple #15
0
 def get_opening_hours(self):
     """Collect opening hours from the page's "hours-table" table.

     Each row yields one line: the text of its ``<th>`` child, a tab, and
     the text of its ``<td>`` child as rendered by ``tostring_with_breaks``.
     The newline-joined lines become the ``source_text`` of the returned
     ``data.OpeningHours``.
     """
     rows = self.root.xpath('.//table[contains(@class, "hours-table")]//tr')
     lines = [
         '%s\t%s' % (tostring(row.find('th')), tostring_with_breaks(row.find('td')))
         for row in rows]
     return data.OpeningHours(source_text='\n'.join(lines))
 def get_raw_entities(self):
     """Build one entity per list item of the "Top Things to ..." section.

     Each ``<li>`` following a "Top Things to Do" or "Top Things to See and
     Do" heading is split once at the first en dash, hyphen or colon
     (each optionally surrounded by one whitespace character). The part
     before the separator is the name, the part after it the description.

     Returns:
         A list of ``data.Entity`` objects. Items without any separator
         get the whole text as name and a description of None.
     """
     items = self.root.xpath(
         ".//div[@id='guides']//h3[text() = 'Top Things to Do' or text() = 'Top Things to See and Do']/following-sibling::ul//li")
     entities = []
     for item in items:
         raw_text = html_parsing.tostring(item).strip()
         # Backslashes are doubled so the literal has no invalid "\s"
         # escape; maxsplit/flags are passed by keyword for clarity.
         parts = re.split(u'\\s?(?:\u2013|-|:)\\s?', raw_text, maxsplit=1, flags=re.UNICODE)
         name = parts[0]
         # A bare name with no separator must not abort the whole scrape.
         desc = parts[1] if len(parts) > 1 else None
         entities.append(data.Entity(name=name, description=desc))
     return entities
 def get_raw_entities(self):
     """Build entities from the paragraphs of the "At a Glance" section.

     For every paragraph:

     * the name is the text of its first ``<strong>`` child;
     * it is starred when the paragraph's leading text, stripped, is
       exactly three characters long (presumably a run of star markers --
       TODO confirm against the page);
     * the description is the text that follows the ``<strong>`` element,
       obtained by replacing that element with a marker, re-parsing, and
       taking what comes after the marker.

     Returns:
         A list of ``data.Entity`` objects in document order.
     """
     paragraphs = self.root.xpath('.//h2[@class="accordion-title" and contains(., "At a Glance")]/following-sibling::div//p')
     entities = []
     for paragraph in paragraphs:
         leading_text = paragraph.text.strip()
         name = paragraph.xpath('.//strong')[0].text.strip()
         serialized = etree.tostring(paragraph)
         marked_html = re.sub('<strong>.*</strong>', 'SPLIT_POINT', serialized)
         marked_node = html_parsing.parse_tree_from_string(marked_html.encode('utf-8'))
         trailing_text = html_parsing.tostring(marked_node).split('SPLIT_POINT')[1]
         entities.append(data.Entity(
             name=name,
             starred=(len(leading_text) == 3),
             description=trailing_text.strip()))
     return entities
Exemple #18
0
 def get_sub_category(self):
     """Map the page's category list to a ``values.SubCategory``.

     The comma-separated text of the "category-str-list" span is split into
     lower-cased labels, which are checked in this order:

     1. a label equal to "bed & breakfast" -> BED_AND_BREAKFAST;
     2. a label equal to "hotel" or "hotels" -> HOTEL;
     3. any label containing "bar" -> BAR;
     4. otherwise RESTAURANT.
     """
     parent = self.root.find('body//span[@class="category-str-list"]')
     labels = [part.strip().lower() for part in tostring(parent).split(',')]
     if 'bed & breakfast' in labels:
         return values.SubCategory.BED_AND_BREAKFAST
     if 'hotel' in labels or 'hotels' in labels:
         return values.SubCategory.HOTEL
     if any('bar' in label for label in labels):
         return values.SubCategory.BAR
     return values.SubCategory.RESTAURANT
 def get_category(self):
     """Map the page's category text to a ``values.Category``.

     The lower-cased text of the "categories" div is tested against keyword
     groups with ``contains_any``. The groups are tried in the order below
     and the first one that matches decides the result.

     Returns:
         The matching ``values.Category`` member, or None if no group
         matches.
     """
     node = self.root.find('.//div[@class="primaryInfo"]//div[@class="categories"]')
     lowered = tostring(node).lower()

     def mentions(*keywords):
         # ``keywords`` arrives as a tuple, the same shape contains_any was
         # given before.
         return contains_any(lowered, keywords)

     if mentions('restaurant', 'bar', 'ice cream', 'dessert', 'bakery', 'coffee'):
         return values.Category.FOOD_AND_DRINK
     elif mentions('hotel', 'motel', 'hostel'):
         return values.Category.LODGING
     elif mentions('monument', 'landmark'):
         return values.Category.ATTRACTIONS
     elif mentions('store', 'shop', 'boutique'):
         return values.Category.SHOPPING
     elif mentions('concert hall', 'jazz club', 'rock club', 'stadium'):
         return values.Category.ENTERTAINMENT
     return None
 def get_sub_category(self):
     """Derive the sub-category from the page URL and lodging subtitle.

     URLs containing "/hotels/" (case-insensitively) are classified by the
     text of the first "lodging__subtitle" span: "Guesthouse" maps to
     BED_AND_BREAKFAST, "Hostel" to HOSTEL, anything else to HOTEL. URLs
     containing "/restaurants/" map to RESTAURANT.

     Returns:
         A ``values.SubCategory`` member, or None for any other URL.
     """
     lowered_url = self.url.lower()
     if '/hotels/' not in lowered_url:
         if '/restaurants/' in lowered_url:
             return values.SubCategory.RESTAURANT
         return None

     subtitle_node = self.root.xpath('.//span[contains(@class, "lodging__subtitle")]')[0]
     lodging_type = tostring(subtitle_node, True)
     if lodging_type == 'Guesthouse':
         return values.SubCategory.BED_AND_BREAKFAST
     if lodging_type == 'Hostel':
         return values.SubCategory.HOSTEL
     return values.SubCategory.HOTEL
 def run(self):
     """Parse the numbered lines of the story footer into entities.

     Only footer paragraphs whose text matches ``NUMBERED_LINE_RE`` are
     processed. Within such a paragraph:

     * a ``<strong>`` child starts a new ``EntityData``; a leading
       "N." number is dropped from its name;
     * an ``<a>`` child supplies the website of the current entity;
     * text trailing any child is split on ";" into the address and,
       when present, the phone number of the current entity.

     The collected entities are handed to ``self.build_from_entity_data``.
     """
     entity_datas = []
     # Entity being filled in; None until the first <strong> name is seen.
     current_entity = None
     for p in self.getroot().findall('.//footer//div[@class="story-info"]//p'):
         line_text = html_parsing.tostring(p, with_tail=False)
         if not self.NUMBERED_LINE_RE.match(line_text):
             continue
         for child in p.iterchildren():
             tag = child.tag.lower()
             if tag == 'strong':
                 text = html_parsing.tostring(child, with_tail=False)
                 name = text
                 if self.NUMBERED_LINE_RE.match(text):
                     # Drop only the leading number so that names which
                     # themselves contain a period are kept whole.
                     _, separator, remainder = text.partition('.')
                     if separator:
                         name = remainder
                 current_entity = EntityData(name=name.strip().strip(string.punctuation))
                 entity_datas.append(current_entity)
             elif current_entity is None:
                 # Links or stray text ahead of the first name have no
                 # entity to attach to.
                 continue
             elif tag == 'a':
                 current_entity.website = child.get('href')
             tail = child.tail.strip().strip(string.punctuation) if child.tail else ''
             if tail:
                 parts = tail.split(';')
                 current_entity.address = parts[0].strip()
                 if len(parts) >= 2:
                     current_entity.phone = parts[1].strip()
     self.build_from_entity_data(entity_datas)
 def get_description(self):
     """Return the listing's description text.

     Three sources are tried, in order:

     1. If the description block contains a link, the linked page is
        fetched and its "articleBody" div is used: its paragraphs joined
        by blank lines, or the div's whole text if it has no paragraphs.
     2. Otherwise, the text of the block's "onShow" span, if it has one.
     3. Otherwise, the block's own direct text.

     Returns:
         The description string, or None if the page has no
         "listing_description" block.
     """
     blocks = self.root.xpath('.//div[@id="listing_main"]//div[@class="listing_description"]')
     if not blocks:
         return None
     block = blocks[0]

     links = block.xpath('.//a/@href')
     if links:
         details_tree = html_parsing.parse_tree(self.absolute_url(links[0]))
         article = details_tree.getroot().xpath('.//div[@class="articleBody"]')[0]
         if not article.xpath('.//p'):
             return html_parsing.tostring(article)
         return html_parsing.join_element_text_using_xpaths(article, ['.//p'], '\n\n')

     if block.xpath('.//span[@class="onShow"]'):
         text_nodes = block.xpath('.//span[@class="onShow"]/text()')
     else:
         text_nodes = block.xpath('text()')
     return ''.join(text_nodes).strip()
    def get_entity_overrides(self):
        """Collect per-entity overrides from the guide's day-by-day listing.

        The siblings that follow the "guideOverview" div are walked in
        order. An ``<h5>`` sets the current day (its text with "Day"
        removed, parsed as an integer); each ``<div>`` is one guide item
        and is tagged with the day in effect at that point.

        Returns:
            A dict mapping the absolute URL of each item's title link to a
            ``data.Entity`` carrying a "Day N" tag and, when the item has
            a "shortDesc" paragraph, that paragraph's text as description.

        Raises:
            IndexError: If the overview div or an item's title link is
                missing.
        """
        overview = self.root.xpath(
            './/div[@id="GUIDE_DETAIL"]//div[contains(@class, "guideOverview")]')[0]
        overrides = {}
        current_day = 0
        for sibling in overview.itersiblings():
            if sibling.tag == 'h5':
                current_day = int(sibling.text.replace('Day', '').strip())
                continue
            if sibling.tag != 'div':
                continue

            tags = [data.Tag(text='Day %d' % current_day)]

            # Items with a long description on the entity page have no
            # 'shortDesc' paragraph (only an unmarked <p> with a 'more'
            # link); their description is left as None.
            short_desc_nodes = sibling.xpath('.//p[contains(@id, "shortDesc")]')
            desc = html_parsing.tostring(short_desc_nodes[0]) if short_desc_nodes else None

            rel_source_url = sibling.xpath('div[@class="guideItemInfo"]//a[@class="titleLink"]/@href')[0]
            overrides[self.absolute_url(rel_source_url)] = data.Entity(tags=tags, description=desc)

        return overrides
 def get_trip_plan_name(self):
     """Return the trip plan name.

     An already-set, non-empty ``self.trip_plan_name`` takes precedence;
     otherwise the text of the page's ``<h1 id="HEADING">`` is used.
     """
     explicit_name = self.trip_plan_name
     if not explicit_name:
         heading = self.getroot().find('.//h1[@id="HEADING"]')
         return html_parsing.tostring(heading)
     return explicit_name
 def get_description(self):
     """Return the text of the first paragraph directly under the excerpt div.

     Raises:
         IndexError: If the page has no such paragraph.
     """
     excerpt_paragraphs = self.root.xpath('.//div[@class="excerpt"]/p')
     first_paragraph = excerpt_paragraphs[0]
     return html_parsing.tostring(first_paragraph)
Exemple #26
0
 def get_address(self):
     """Join the first two address lines of the address panel with a space.

     Raises:
         IndexError: If the panel has fewer than two "address" paragraphs.
     """
     address_lines = self.root.findall('body//div[@class="addresspanel"]//p[@class="address"]')
     first_line = tostring(address_lines[0], True)
     second_line = tostring(address_lines[1], True)
     return '%s %s' % (first_line, second_line)
 def get_description(self):
     """Return the introductory summary of the guides section.

     The summary is the text of the "guides" div up to the heading that
     introduces the list of things to do, with a leading copy of the page
     title (``self.get_title()``) removed.

     Returns:
         The stripped summary text. If neither heading variant occurs, the
         whole guides text is used.
     """
     guide_text = html_parsing.tostring(self.root.find(".//div[@id='guides']")).strip()
     # Both heading variants the guides section is queried with elsewhere
     # are considered. str.find returns -1 when a heading is absent, which
     # must not be used as a slice end (it would drop the last character).
     heading_positions = [
         position
         for position in (guide_text.find(heading)
                          for heading in ('Top Things to Do', 'Top Things to See and Do'))
         if position != -1]
     if heading_positions:
         summary_text = guide_text[:min(heading_positions)].strip()
     else:
         summary_text = guide_text
     title = self.get_title()
     if summary_text.startswith(title):
         summary_text = summary_text[len(title):].strip()
     return summary_text
 def get_trip_plan_name(self):
     """Return the trip plan name, falling back to the article headline.

     The parent class's name is used when it is non-empty; otherwise the
     text of the page's ``<h1 itemprop="headline">`` is returned.
     """
     inherited_name = super(Nytimes36hours, self).get_trip_plan_name()
     if not inherited_name:
         headline = self.getroot().find('.//h1[@itemprop="headline"]')
         return html_parsing.tostring(headline)
     return inherited_name
 def get_opening_hours(self):
     """Return the opening hours listed next to the info list's time icon.

     The text of the first ``<dd>`` following the ``<dt>`` whose class
     contains "icon--time" becomes the ``source_text`` of the returned
     ``data.OpeningHours``.

     Raises:
         IndexError: If the page has no such ``<dd>``.
     """
     hours_nodes = self.root.xpath('.//dl[@class="info-list"]//dt[contains(@class, "icon--time")]/following-sibling::dd')
     hours_text = tostring(hours_nodes[0])
     return data.OpeningHours(source_text=hours_text)
 def get_location_name(self):
     """Return the text of the first ``<h3>`` in the left sidebar.

     Raises:
         IndexError: If the sidebar has no ``<h3>``.
     """
     sidebar_headings = self.root.xpath('.//div[contains(@class, "left-sidebar")]//h3')
     first_heading = sidebar_headings[0]
     return html_parsing.tostring(first_heading)