Example no. 1
import re
from HTMLParser import HTMLParseError  # Python 2 stdlib; assumed source of the exception caught below

# Database, Downloader, GoogleMaps, Geocoder, log and decode_unicode_entities
# are project-specific helpers that are not shown here

def run():
    # connect to db
    db = Database()
    source_id = db.insert_update('source', {'url': 'http://brnonow.com'})
    
    # fetch all article urls
    url = 'http://brnonow.com/page/%s'
    articles = []
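    # walk the paginated archive (Downloader presumably fills the %s in the
    # URL template with the page number) until a page has no article links;
    # range(1000) is just a safety cap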
    for i in range(1000):
        log('page', i)
        try:
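            # rel="bookmark" marks permalinks to individual articles
            # (a WordPress convention)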
            links = Downloader(url, i).html().findAll('a', {'rel': 'bookmark'})
            if not links:
                break
            
            for link in links:
                articles.append(link['href'])
                
        except HTMLParseError:
            log('error', 'parsing failure, skipping')
    
    # make it unique
    articles = set(articles)
    
    # process articles
    for url in articles:
        log('article', url)
        try:
            html = Downloader(url).html()
            links = html.findAll(lambda tag: tag.name == 'a' and
                re.match(r'http://[^\.]+\.google\.[^/]+/maps', tag.get('href', '')))
            
            # get title & save article
            title = unicode(decode_unicode_entities(html.find('h1', 'entry-title').string))
            article_id = db.insert_update('article', {'title': title, 'url': url, 'source_id': source_id})
            
            # get places
            for link in links:
                query = GoogleMaps().parse_link_url(link['href'])  # either 49.234553,16.567812 or u'krav\xc3\xad hora'
                log('link', query)
    
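                # resolve_coords=False presumably leaves raw coordinates
                # untouched instead of reverse-geocoding them (the Geocoder
                # helper is not shown, so this is an assumption)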
                geocoded = Geocoder(query, resolve_coords=False).fetch()
                if geocoded:
                    geocoded['name'] = None
                    place_id = db.insert_update('place', geocoded)
                    
                    # save relations
                    db.insert_update('is_about', {'place_id': place_id, 'article_id': article_id}, last_id=False)
            
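            # commit once per processed article, so earlier articles
            # survive if a later one crashes the loop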
            db.commit()
                
        except HTMLParseError:
            log('error', 'parsing failure, skipping')
            
        except AttributeError:
            log('error', 'attribute error, skipping')
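
The snippet leans on several project helpers that are not shown. The most
load-bearing one is Database.insert_update, which the code above expects to
behave as an upsert returning the row id (unless called with last_id=False).
A minimal sketch of such a helper, assuming SQLite as the backend and a
UNIQUE constraint on each table's natural key, could look like this:

import sqlite3

class Database(object):
    """Hypothetical sketch of the Database helper used above."""

    def __init__(self, path='brnonow.db'):
        # the actual storage backend is unknown; SQLite is an assumption
        self.connection = sqlite3.connect(path)

    def insert_update(self, table, values, last_id=True):
        # naive upsert: insert the row, silently ignore duplicates;
        # column names come from the dict keys (trusted input only)
        columns = ', '.join(values.keys())
        placeholders = ', '.join(['?'] * len(values))
        sql = 'INSERT OR IGNORE INTO %s (%s) VALUES (%s)' % (
            table, columns, placeholders)
        cursor = self.connection.cursor()
        cursor.execute(sql, list(values.values()))
        # note: if the insert was ignored as a duplicate, a real
        # implementation would look up the existing row's id instead
        if last_id:
            return cursor.lastrowid

    def commit(self):
        self.connection.commit()

The real helper presumably also covers the update half of the upsert; the
sketch only implements enough to make the control flow above legible.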