Example #1
# Module-level imports assumed by these snippets; `db` (a pymongo database
# handle), `logger` (a logging.Logger), and `fuzzy_find` are defined
# elsewhere in the same module.
import re
import urllib

import lxml.html


def scrape_cdcd():
    """Scrape information for each building from the CDCD site (cdcd.vt.edu)."""
    URL = 'http://www.cdcd.vt.edu/Building.Info/%s.html'

    # Each value is a (pattern[, flags]) tuple, unpacked into re.compile() below.
    scrapers = {
        'abbrev':      (r'Abbreviation:\s+([A-Z0-9\s]+)',),
        'address':     (r'Address:\s(.+)',),
        'built':       (r'Original\s+Construction\s+Year:\s+([0-9]{4})',),
        'gross_sq_ft': (r'\(GSF\):\s([0-9,]+)',),
    }

    for bldg in db.buildings.find():
        try:
            html = urllib.urlopen(URL % bldg['id'])
            html = lxml.html.parse(html).getroot()
        except Exception:
            logger.warning('The CDCD page for %s is malformed.' % bldg['id'])
            continue

        for paragraph in html.cssselect('#vt_body_col > p'):
            text = paragraph.text_content()

            for key, regex in scrapers.items():
                match = re.compile(*regex).search(text)

                if match:
                    bldg[key] = match.group(1).strip()

        db.buildings.save(bldg)
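
The scrapers mapping pairs each field with a (pattern[, flags]) tuple that is unpacked straight into re.compile(). A minimal, self-contained sketch of that convention, run against an invented sample string:

import re

scrapers = {
    'built':  (r'Original\s+Construction\s+Year:\s+([0-9]{4})',),  # no flags
    'floors': (r'([0-9]+)\s+floors', re.I),                        # with a flag
}

sample = 'Original Construction Year: 1905 | 4 Floors'  # invented example text

results = {}
for key, regex in scrapers.items():
    match = re.compile(*regex).search(sample)
    if match:
        results[key] = match.group(1).strip()

# results == {'built': '1905', 'floors': '4'}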
Example #2
def scrape_about():
    """Scrape the "about" pages from the Virginia Tech website."""
    URL = 'http://www.vt.edu/about/buildings/%s.html'

    # Load the index
    html = lxml.html.parse(urllib.urlopen(URL % 'index')).getroot()

    for anchor in html.cssselect('#vt_body_col li a'):
        slug, name = anchor.get('href'), anchor.text_content()

        # Extract page slug (filename without the extension)
        slug = slug[slug.rfind('/') + 1: -5]

        try:
            bldg = fuzzy_find('name', slug)[0]
            bldg['about_slug'] = slug
        except IndexError:
            logger.warning("Couldn't find matching building for '%s'." % slug)
            continue

        db.buildings.save(bldg)

    # (pattern[, flags]) tuples, as in the first example.
    scrapers = {
        'abbrev':    (r'Abbreviation:\s+([A-Z0-9\s]+)',),
        'address':   (r'Address:\s+(.+?)\s+\|',),
        'built':     (r'Originally\s+Built:\s+([0-9]{4})\s+\|',),
        'floors':    (r'(%s)\s+floors' % '|'.join(map(str, range(1, 11))), re.I),
        'grid':      (r'Map\s+Grid:\s+([A-Z]-[0-9])',),
        'latitude':  (r'Latitude:\s+(\-?[0-9]+\.[0-9]+)',),
        'longitude': (r'Longitude:\s+(\-?[0-9]+\.[0-9]+)',),
    }

    for bldg in db.buildings.find({ 'about_slug': { '$exists': True } }):
        try:
            html = urllib.urlopen(URL % bldg['about_slug'])
            html = lxml.html.parse(html).getroot()
        except Exception:
            logger.warning('The about page for %s is malformed.' % bldg['id'])
            continue

        for paragraph in html.cssselect('#vt_body_col > p'):
            text = paragraph.text_content()

            for key, regex in scrapers.items():
                match = re.compile(*regex).search(text)

                if match:
                    bldg[key] = match.group(1).strip()

        db.buildings.save(bldg)
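
The slug extraction in scrape_about keeps everything after the last '/' in the href and drops the five-character '.html' suffix. A quick illustration with a hypothetical href:

href = '/about/buildings/burruss-hall.html'  # hypothetical value
slug = href[href.rfind('/') + 1: -5]         # strip the path and '.html'
# slug == 'burruss-hall'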
Example #3
def scrape_housing():
    """Scrape the housing site (housing.vt.edu) for each building to determine
    whether it's a residence hall."""
    URL = 'http://www.housing.vt.edu/halls/%s.php'

    # Load the index
    html = lxml.html.parse(urllib.urlopen(URL % 'index')).getroot()

    for anchor in html.cssselect('#content_container_middle li a'):
        slug = anchor.get('href')
        text = anchor.text_content()

        try:
            bldg = fuzzy_find('name', text)[0]
            bldg.update({ 'housing_slug': slug, 'type': 'residence_hall' })
        except IndexError:
            logger.warning("Couldn't find matching building for '%s'." % text)
            continue

        db.buildings.save(bldg)
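
All three scrapers lean on a fuzzy_find helper that is defined elsewhere in the module and not shown in these examples. A plausible sketch, assuming it ranks buildings by how closely a stored field resembles the query string (a reconstruction, not the project's actual implementation):

import difflib

def fuzzy_find(field, value, cutoff=0.6):
    # Hypothetical sketch; the real helper lives elsewhere in this module.
    # Return buildings whose `field` most closely resembles `value`, best
    # match first; `db` is the same module-level Mongo handle as above.
    buildings = dict((bldg.get(field, ''), bldg) for bldg in db.buildings.find())
    close = difflib.get_close_matches(value, buildings.keys(), n=3, cutoff=cutoff)
    return [buildings[name] for name in close]

Under this sketch, fuzzy_find('name', 'Burruss')[0] returns the best match, and an empty result raises the IndexError the scrapers catch.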