def scrape_cdcd():
    """Scrape information for each building from the CDCD site (cdcd.vt.edu).

    For every building already in ``db.buildings``, fetch its CDCD page,
    extract a few fields via regexes over the page's paragraphs, and save
    the updated document back to the collection.
    """
    URL = 'http://www.cdcd.vt.edu/Building.Info/%s.html'
    # Pre-compiled patterns, keyed by the building attribute they populate.
    # (The original called re.compile(*regex) on plain strings, which
    # star-unpacks the string's characters and raises TypeError.)
    scrapers = {
        'abbrev': re.compile(r'Abbreviation:\s+([A-Z0-9\s]+)'),
        'address': re.compile(r'Address:\s(.+)'),
        'built': re.compile(r'Original\s+Construction\s+Year:\s+([0-9]{4})'),
        'gross_sq_ft': re.compile(r'\(GSF\):\s([0-9,]+)'),
    }
    for bldg in db.buildings.find():
        try:
            # BUG FIX: the original referenced an undefined name `URI`
            # (the constant is URL) and called `lxml.parse`, which does
            # not exist -- the parser lives at `lxml.html.parse`.
            html = urllib.urlopen(URL % bldg['id'])
            html = lxml.html.parse(html).getroot()
        except Exception:
            # Best-effort scrape: a bad/missing page is logged and skipped.
            logger.warning('The CDCD page for %s is malformed.' % bldg['id'])
            continue
        for paragraph in html.cssselect('#vt_body_col > p'):
            text = paragraph.text_content()
            # BUG FIX: iterate items() -- iterating the dict directly
            # yields keys only, so the two-name unpack raised ValueError.
            for key, regex in scrapers.items():
                match = regex.search(text)
                if match:
                    bldg[key] = match.group(1).strip()
        db.buildings.save(bldg)
def scrape_about():
    """Scrape the "about" pages from the Virginia Tech website.

    First pass: read the index page and record each building's page slug
    (``about_slug``) on the matching document in ``db.buildings``.
    Second pass: fetch each building's about page and extract attributes
    (abbreviation, address, year built, floors, map grid, lat/long).
    """
    URL = 'http://www.vt.edu/about/buildings/%s.html'
    # Load the index
    html = lxml.html.parse(urllib.urlopen(URL % 'index')).getroot()
    for anchor in html.cssselect('#vt_body_col li a'):
        slug, name = anchor.get('href'), anchor.text_content()
        # Extract page slug (filename without the '.html' extension)
        slug = slug[slug.rfind('/') + 1: -5]
        try:
            bldg = fuzzy_find('name', slug)[0]
            bldg['about_slug'] = slug
        except Exception:
            # Best-effort match: unmatched buildings are logged and skipped.
            logger.warning("Couldn't find matching building for '%s'." % slug)
            continue
        db.buildings.save(bldg)
    # Pre-compiled patterns, keyed by the building attribute they populate.
    # (The original mixed plain strings with one (pattern, flags) tuple and
    # called re.compile(*regex), which raises TypeError for the strings.)
    scrapers = {
        'abbrev': re.compile(r'Abbreviation:\s+([A-Z0-9\s]+)'),
        # BUG FIX: (\w\s+) captured a single word character; capture the
        # whole address lazily up to the ' |' separator instead.
        'address': re.compile(r'Address:\s+(.+?)\s+\|'),
        'built': re.compile(r'Originally\s+Built:\s+([0-9]{4})\s+\|'),
        # BUG FIX: str.join requires strings, but range() yields ints.
        'floors': re.compile(
            r'(%s)\s+floors' % '|'.join(str(n) for n in range(1, 11)), re.I),
        'grid': re.compile(r'Map\s+Grid:\s+([A-Z]-[0-9])'),
        'latitude': re.compile(r'Latitude:\s+(\-?[0-9]+\.[0-9]+)'),
        'longitude': re.compile(r'Longitude:\s+(\-?[0-9]+\.[0-9]+)'),
    }
    for bldg in db.buildings.find({'about_slug': {'$exists': True}}):
        try:
            html = urllib.urlopen(URL % bldg['about_slug'])
            # BUG FIX: `lxml.parse` does not exist; use `lxml.html.parse`.
            html = lxml.html.parse(html).getroot()
        except Exception:
            logger.warning('The about page for %s is malformed.' % bldg['id'])
            continue
        for paragraph in html.cssselect('#vt_body_col > p'):
            text = paragraph.text_content()
            # BUG FIX: iterate items(); plain dict iteration yields keys
            # only, so the two-name unpack raised ValueError.
            for key, regex in scrapers.items():
                match = regex.search(text)
                if match:
                    bldg[key] = match.group(1).strip()
        db.buildings.save(bldg)
def scrape_housing():
    """Scrape the housing site (housing.vt.edu) for each building to
    determine whether it's a residence hall.

    Reads the index of halls, fuzzy-matches each link's text against the
    building names in ``db.buildings``, and tags matches with their
    ``housing_slug`` and ``type='residence_hall'``.
    """
    URL = 'http://www.housing.vt.edu/halls/%s.php'
    # Load the index
    html = lxml.html.parse(urllib.urlopen(URL % 'index')).getroot()
    for anchor in html.cssselect('#content_container_middle li a'):
        slug = anchor.get('href')
        text = anchor.text_content()
        try:
            bldg = fuzzy_find('name', text)[0]
            bldg.update({'housing_slug': slug, 'type': 'residence_hall'})
        except Exception:
            # BUG FIX: the lookup above is by the anchor's text, so report
            # that -- the original message interpolated the slug instead.
            logger.warning("Couldn't find matching building for '%s'." % text)
            continue
        db.buildings.save(bldg)