def scrape_and_store_subcategories(url, location_regex, db_collection):
    """Recursively walk a category tree and store pages for matching subcategories.

    For each subcategory link under ``url`` whose title matches
    ``location_regex`` (which must define ``location`` and ``type`` named
    groups), recurse into that subcategory and store its pages via
    ``scrape_and_store_pages``.

    Args:
        url: Category URL to start scraping from.
        location_regex: Regex string with named groups ``location`` and ``type``.
        db_collection: MongoDB-style collection passed through to
            ``scrape_and_store_pages``.
    """
    wiki = CategoryScraper()
    # Compile once per call instead of on every loop iteration (the original
    # recompiled the same pattern for each subcategory).
    location_pattern = re.compile(location_regex, flags=re.IGNORECASE)
    # Renamed loop variable: the original reused `url`, shadowing the parameter.
    for text, subcategory_url in wiki.scrape_subcategories(url):
        location_search = location_pattern.search(text)
        if location_search:
            logging.debug(text)
            location = location_search.group('location')
            # `page_type` avoids shadowing the builtin `type`.
            page_type = location_search.group('type')
            # Depth-first: descend into nested subcategories, then store this
            # subcategory's own pages.
            scrape_and_store_subcategories(subcategory_url, location_regex,
                                           db_collection)
            scrape_and_store_pages(subcategory_url, location, page_type,
                                   db_collection)
def scrape_and_store_pages(url, location, type, db_collection):
    """Scrape all pages under a category URL and upsert them into the collection.

    Each page is keyed by its lowercased name; the document's ``type`` is set
    and the parsed location is added to its ``locations`` set.

    Args:
        url: Category URL whose pages are scraped.
        location: Raw location string; parsed via ``parse_location`` before storing.
        type: Page type stored under the ``type`` field. (Parameter name kept
            for caller compatibility, although it shadows the builtin.)
        db_collection: MongoDB-style collection to upsert into.

    NOTE(review): this function is redefined verbatim later in the file; the
    later definition wins at import time — the duplicate should be removed.
    NOTE(review): ``collection.update`` is deprecated in pymongo 3.x in favor
    of ``update_one`` — confirm the driver version before migrating.
    """
    wiki = CategoryScraper()
    pages = wiki.scrape_pages(url)
    # Lazy %-args: the message is only formatted if DEBUG logging is enabled.
    logging.debug('%s, %s pages', location, len(pages))
    for page in pages:
        db_collection.update(
            {'name': page.lower()},
            {
                '$set': {'type': type},
                '$addToSet': {'locations': parse_location(location)},
            },
            upsert=True,
        )
def scrape_and_store_pages(url, location, type, db_collection):
    """Scrape all pages under a category URL and upsert them into the collection.

    Each page is keyed by its lowercased name; the document's ``type`` is set
    and the parsed location is added to its ``locations`` set.

    Args:
        url: Category URL whose pages are scraped.
        location: Raw location string; parsed via ``parse_location`` before storing.
        type: Page type stored under the ``type`` field. (Parameter name kept
            for caller compatibility, although it shadows the builtin.)
        db_collection: MongoDB-style collection to upsert into.

    NOTE(review): this is a verbatim duplicate of an earlier definition of the
    same function in this file; being later, this one wins at import time.
    One of the two copies should be deleted.
    NOTE(review): ``collection.update`` is deprecated in pymongo 3.x in favor
    of ``update_one`` — confirm the driver version before migrating.
    """
    wiki = CategoryScraper()
    pages = wiki.scrape_pages(url)
    # Lazy %-args: the message is only formatted if DEBUG logging is enabled.
    logging.debug('%s, %s pages', location, len(pages))
    for page in pages:
        db_collection.update(
            {'name': page.lower()},
            {
                '$set': {'type': type},
                '$addToSet': {'locations': parse_location(location)},
            },
            upsert=True,
        )