def geocode_url(self, url, attempted=None):
    """Geocode the wiki page at ``url``, following its relations if needed.

    The page is fetched, its RDF export is located and parsed, and the
    attributes of the described thing are scanned for a value that
    ``util.parse_geo`` can turn into a complete (latitude, longitude)
    pair. If none is found, each related resource is geocoded recursively
    until one yields a name and a complete position.

    :param url: URL of the page to geocode.
    :param attempted: set of URLs already visited during this lookup. It
        is shared with (and mutated by) the recursive calls so that cyclic
        relationships are not followed forever. Callers normally omit it.
    :returns: ``(name, (latitude, longitude))``. Latitude and longitude
        are ``None`` when no position could be found. When relations were
        followed, ``name`` is the one returned by the last relation tried.
    """
    if attempted is None:
        attempted = set()
    # Remember this URL up front so no nested call can loop back to it.
    attempted.add(url)

    util.logger.debug("Fetching %s...", url)
    page = self._call_geocoder(url)
    soup = BeautifulSoup(page)

    rdf_url = self.parse_rdf_link(soup)
    util.logger.debug("Fetching %s...", rdf_url)
    page = self.urlopen(rdf_url)

    things, thing = self.parse_rdf(page)  # TODO
    name = self.get_label(thing)

    # Start from "unknown" so an empty attribute list cannot leave the
    # coordinates unbound.
    latitude = longitude = None
    for _, value in self.get_attributes(thing):
        latitude, longitude = util.parse_geo(value)
        if None not in (latitude, longitude):
            break

    if None in (latitude, longitude):
        # No usable position on this page: try the resources it relates to.
        for _, resource in self.get_relations(thing):
            related_url = things.get(resource, resource)  # pylint: disable=E1103
            if related_url in attempted:  # Avoid cyclic relationships.
                continue
            name, (latitude, longitude) = self.geocode_url(
                related_url, attempted)
            if None not in (name, latitude, longitude):
                break

    return (name, (latitude, longitude))
def parse_rdf_link(page, mime_type='application/rdf+xml'):
    """Parse the URL of the RDF link from the <head> of ``page``.

    :param page: markup to parse, or an already-parsed ``BeautifulSoup``
        document (which is used as-is rather than parsed a second time).
    :param mime_type: value the ``type`` attribute of the
        ``<link rel="alternate">`` element must have.
    :returns: the ``href`` of the first matching link, or ``None`` when
        the document has no ``<head>``, no matching link, or an empty
        ``href``.
    """
    soup = page if isinstance(page, BeautifulSoup) else BeautifulSoup(page)

    head = soup.head
    if head is None:
        # Nothing to search: report "no link" instead of failing on None.
        return None

    link = head.find(  # pylint: disable=E1101,E1103
        'link', rel='alternate', type=mime_type)
    if not link:
        return None
    # An empty href is reported as "no link".
    return link['href'] or None
def parse_xhtml(self, page):
    """Extract a place name and position from the ``geo.*`` meta tags.

    :param page: markup to parse, or an already-parsed ``BeautifulSoup``
        document.
    :returns: ``(name, (latitude, longitude))``. ``name`` comes from the
        ``geo.placename`` meta tag and the coordinates from
        ``geo.position``; each is ``None`` when the corresponding tag is
        missing, and all are ``None`` when the document has no ``<head>``.
    """
    soup = page if isinstance(page, BeautifulSoup) else BeautifulSoup(page)

    head = soup.head
    if head is None:
        # Without a <head> there are no meta tags to read.
        return (None, (None, None))

    meta = head.find('meta', {'name': 'geo.placename'})
    # An empty placename is reported as None.
    name = (meta['content'] or None) if meta else None

    latitude = longitude = None
    meta = head.find('meta', {'name': 'geo.position'})
    if meta:
        position = meta['content']
        # no parse_geo? TODO
        latitude, longitude = parse_geo(position)
        # A zero in either coordinate is treated as "no position".
        # NOTE(review): presumably parse_geo yields 0 for unparseable
        # input -- confirm; this also discards genuine positions on the
        # equator or the prime meridian.
        if latitude == 0 or longitude == 0:
            latitude = longitude = None

    return (name, (latitude, longitude))