def scrape_recurse(self, data, parent=None):
    """Build a Location tree from one node of a nested outage-data dict.

    *data* is a dict whose recognized keys (all optional, per the reads
    below) are: 'area_name' (str), 'total_custs', 'custs_out' (ints),
    'etrmillis' (estimated restoration time in epoch milliseconds, -1 or
    absent when unknown), and 'areas' (list of child dicts of the same
    shape).

    :param data: one node of the scraped outage feed
    :param parent: Location to attach this node to (None for the root)
    :returns: the Location built for *data* -- previously the root
              result was silently discarded; returning it is
              backward-compatible and lets callers keep the tree
    """
    me = Location()
    if 'area_name' in data:
        me.name = data['area_name'].strip()
    if 'total_custs' in data:
        me.total_customers = data['total_custs']
    if 'custs_out' in data:
        out = data['custs_out']
        # 'etrmillis' is not guaranteed alongside 'custs_out'; default to
        # -1 (the feed's "no estimate" sentinel) instead of raising KeyError.
        etr = data.get('etrmillis', -1)
        if out > 0:
            outage = Outage()
            outage.affected_customers = out
            if etr >= 0:
                # Feed supplies milliseconds; fromtimestamp wants seconds.
                outage.proposed_end_time = datetime.fromtimestamp(etr / 1000.0)
            me.outage = outage
    if 'areas' in data:
        for area in data['areas']:
            self.scrape_recurse(area, me)
    if parent:
        parent.locations.append(me)
    return me
def scrape(self, url, parent=None): print "Getting:", url soup = self.get_soup(url) table = self.extract_table(soup) update_time, location_level = self.get_metadata(table) if location_level is None: return first_data_row = None # get all rows that have no attributes on them for row in table.findAll(lambda tag : tag.name == 'tr' and not tag.attrs): if not row.findAll('td'): continue first_data_row = row break # Get data-rows then prepend the first rows = first_data_row.findNextSiblings('tr') rows.insert(0, first_data_row) # The last row is junk we don't need rows.pop() locations = [] for row in rows: loc = Location() loc.update_time = update_time loc.location_level = location_level if parent: parent.locations.append(loc) cells = row.findAll('td') loc.total_customers = int(cells[1].string.replace(',','')) out_customers = int(cells[2].string.replace(',','')) if cells[0].findAll('a'): # Theres more data here, recurse child_url = urljoin(url, cells[0].contents[0]['href']) loc.name = cells[0].contents[0].contents[0].string self.scrape(child_url, loc) else: # This is drilled-down as far as we can go, make an outage object loc.name = cells[0].string outage = Outage() outage.affected_customers = out_customers try: outage.proposed_end_time = datetime.strptime(cells[3].string, "%b %d, %Y %I:%M %p") except: # no proposed time pass loc.outage = outage