Example #1
0
def scrape_and_store_subcategories(url, location_regex, db_collection):
    """Recursively walk the subcategories of *url* and store their pages.

    Every ``(text, url)`` pair yielded by
    ``CategoryScraper.scrape_subcategories`` is matched (case-insensitively)
    against ``location_regex``. For each match the function first recurses
    into the subcategory, then stores that subcategory's own pages through
    ``scrape_and_store_pages``. Subcategories whose text does not match are
    skipped entirely and are not descended into.

    Args:
        url: Address of the category whose subcategories are scraped.
        location_regex: Pattern searched in each subcategory's text. It must
            define the named groups ``location`` and ``type``.
        db_collection: Passed through unchanged to ``scrape_and_store_pages``.

    NOTE(review): no record of visited categories is kept, so a cycle among
    matching subcategories would recurse without end -- confirm the scraped
    category graph cannot contain one.
    """
    wiki = CategoryScraper()
    # The pattern is identical for every subcategory, so build it once
    # instead of once per loop iteration.
    location_pattern = re.compile(location_regex, flags=re.IGNORECASE)
    # Distinct loop names keep the ``url`` argument and the ``type`` builtin
    # from being shadowed.
    for text, subcategory_url in wiki.scrape_subcategories(url):
        location_search = location_pattern.search(text)
        if not location_search:
            continue
        logging.debug(text)
        location = location_search.group('location')
        category_type = location_search.group('type')
        scrape_and_store_subcategories(subcategory_url, location_regex,
                                       db_collection)
        scrape_and_store_pages(subcategory_url, location, category_type,
                               db_collection)
Example #2
0
def scrape_and_store_subcategories(url, location_regex, db_collection):
    """Recursively walk the subcategories of *url* and store their pages.

    Every ``(text, url)`` pair yielded by
    ``CategoryScraper.scrape_subcategories`` is matched (case-insensitively)
    against ``location_regex``. For each match the function first recurses
    into the subcategory, then stores that subcategory's own pages through
    ``scrape_and_store_pages``. Subcategories whose text does not match are
    skipped entirely and are not descended into.

    Args:
        url: Address of the category whose subcategories are scraped.
        location_regex: Pattern searched in each subcategory's text. It must
            define the named groups ``location`` and ``type``.
        db_collection: Passed through unchanged to ``scrape_and_store_pages``.

    NOTE(review): no record of visited categories is kept, so a cycle among
    matching subcategories would recurse without end -- confirm the scraped
    category graph cannot contain one.
    """
    wiki = CategoryScraper()
    # The pattern is identical for every subcategory, so build it once
    # instead of once per loop iteration.
    location_pattern = re.compile(location_regex, flags=re.IGNORECASE)
    # Distinct loop names keep the ``url`` argument and the ``type`` builtin
    # from being shadowed.
    for text, subcategory_url in wiki.scrape_subcategories(url):
        location_search = location_pattern.search(text)
        if not location_search:
            continue
        logging.debug(text)
        location = location_search.group('location')
        category_type = location_search.group('type')
        scrape_and_store_subcategories(subcategory_url, location_regex,
                                       db_collection)
        scrape_and_store_pages(subcategory_url, location, category_type,
                               db_collection)
Example #3
0
def scrape_and_store_pages(url, location, type, db_collection):
    """Store every page listed under the category at *url*.

    For each page returned by ``CategoryScraper.scrape_pages`` one
    ``db_collection.update`` call is issued with ``upsert=True``. The record
    is selected by the lower-cased page name, its ``type`` field is set to
    ``type``, and the parsed location is added to its ``locations`` set.

    Args:
        url: Address of the category whose pages are scraped.
        location: Raw location value; converted with ``parse_location``
            before being stored.
        type: Value written to each record's ``type`` field. The name
            shadows the builtin but is kept for keyword-argument callers.
        db_collection: Object exposing ``update(filter, change, upsert=...)``.

    NOTE(review): the operators look like MongoDB's. If ``db_collection`` is a
    PyMongo collection, ``update`` is deprecated since PyMongo 3.0 and removed
    in 4.0 in favour of ``update_one`` -- confirm the driver version in use.
    """
    wiki = CategoryScraper()
    pages = wiki.scrape_pages(url)
    # Lazy logging arguments: the message is only formatted when DEBUG is on.
    logging.debug('%s, %s pages', location, len(pages))
    if not pages:
        # Nothing to store; as before, parse_location is never called here.
        return
    # The location is the same for every page, so parse it once.
    # Assumes parse_location has no per-call side effects -- TODO confirm.
    parsed_location = parse_location(location)
    for page in pages:
        db_collection.update(
            {'name': page.lower()},
            {
                '$set': {'type': type},
                '$addToSet': {'locations': parsed_location},
            },
            upsert=True,
        )
Example #4
0
def scrape_and_store_pages(url, location, type, db_collection):
    """Store every page listed under the category at *url*.

    For each page returned by ``CategoryScraper.scrape_pages`` one
    ``db_collection.update`` call is issued with ``upsert=True``. The record
    is selected by the lower-cased page name, its ``type`` field is set to
    ``type``, and the parsed location is added to its ``locations`` set.

    Args:
        url: Address of the category whose pages are scraped.
        location: Raw location value; converted with ``parse_location``
            before being stored.
        type: Value written to each record's ``type`` field. The name
            shadows the builtin but is kept for keyword-argument callers.
        db_collection: Object exposing ``update(filter, change, upsert=...)``.

    NOTE(review): the operators look like MongoDB's. If ``db_collection`` is a
    PyMongo collection, ``update`` is deprecated since PyMongo 3.0 and removed
    in 4.0 in favour of ``update_one`` -- confirm the driver version in use.
    """
    wiki = CategoryScraper()
    pages = wiki.scrape_pages(url)
    # Lazy logging arguments: the message is only formatted when DEBUG is on.
    logging.debug('%s, %s pages', location, len(pages))
    if not pages:
        # Nothing to store; as before, parse_location is never called here.
        return
    # The location is the same for every page, so parse it once.
    # Assumes parse_location has no per-call side effects -- TODO confirm.
    parsed_location = parse_location(location)
    for page in pages:
        db_collection.update(
            {'name': page.lower()},
            {
                '$set': {'type': type},
                '$addToSet': {'locations': parsed_location},
            },
            upsert=True,
        )