import re

from HTMLParser import HTMLParseError  # Python 2 stdlib exception for malformed markup

# Database, Downloader, Geocoder, GoogleMaps, decode_unicode_entities and log
# come from the project's own modules.


def run():
    # connect to db
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://brnonow.com'})

    # fetch all article urls
    url = 'http://brnonow.com/page/%s'
    articles = []
    for i in range(1000):
        log('page', i)
        try:
            links = Downloader(url, i).html().findAll('a', {'rel': 'bookmark'})
            if not links:
                break
            for link in links:
                articles.append(link['href'])
        except HTMLParseError:
            log('error', 'parsing failure, skipping')

    # make it unique
    articles = set(articles)

    # process articles
    for url in articles:
        log('article', url)
        try:
            html = Downloader(url).html()
            links = html.findAll(lambda tag: tag.name == 'a' and
                                 re.match(r'http://[^\.]+\.google\.[^/]+/maps',
                                          tag.get('href', '')))

            # get title & save article
            title = unicode(decode_unicode_entities(html.find('h1', 'entry-title').string))
            article_id = db.insert_update('article',
                                          {'title': title, 'url': url, 'source_id': source_id})

            # get places
            for link in links:
                # either coordinates like 49.234553,16.567812 or a name like u'krav\xc3\xad hora'
                query = GoogleMaps().parse_link_url(link['href'])
                log('link', query)
                geocoded = Geocoder(query, resolve_coords=False).fetch()
                if geocoded:
                    geocoded['name'] = None
                    place_id = db.insert_update('place', geocoded)
                    # save relations
                    db.insert_update('is_about',
                                     {'place_id': place_id, 'article_id': article_id},
                                     last_id=False)
                    db.commit()
        except HTMLParseError:
            log('error', 'parsing failure, skipping')
        except AttributeError:
            log('error', 'attribute error, skipping')
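The scraper leans on a project-specific Database.insert_update upsert helper that returns the id of the affected row so records can be linked together. Below is a minimal sketch of what such a helper might look like, assuming an sqlite3 backend and UNIQUE constraints on the natural keys (e.g. url) so re-runs update rather than duplicate rows; the class, table layout and sqlite3 choice are assumptions, not the project's actual implementation.

import sqlite3


class Database(object):
    """Hypothetical sqlite3-backed sketch of the Database helper used above."""

    def __init__(self, path='scraper.db'):
        self.conn = sqlite3.connect(path)

    def insert_update(self, table, values, last_id=True):
        # Insert the row, replacing any existing one that collides on a
        # unique key, so repeated runs do not create duplicates.
        columns = ', '.join(values)
        placeholders = ', '.join('?' for _ in values)
        sql = 'INSERT OR REPLACE INTO %s (%s) VALUES (%s)' % (table, columns, placeholders)
        cursor = self.conn.execute(sql, list(values.values()))
        # run() uses the returned id to relate places and articles.
        return cursor.lastrowid if last_id else None

    def commit(self):
        self.conn.commit()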