class DepartmentScraper(Scraper):
    """Scraper for a single department's index page.

    Locates the links to the department's "Standard Classes of Records"
    and "Institution-Specific Classes of Records" pages and dispatches a
    category scraper for each one that exists.
    """

    def __init__(self, department):
        # Remember the department and resolve its index URL via InfoSource
        # before handing the URL to the base Scraper (which, per the
        # self.soup / self.cache usage below, presumably fetches and
        # parses the page — TODO confirm against the Scraper base class).
        self.department = department
        self.infosource = InfoSource()
        url = self.infosource.department_index(department)
        # NOTE: __init__ must not return a value; just delegate.
        super(DepartmentScraper, self).__init__(url)

    def institution_data_page(self):
        """Return the link to the institution-specific list of data
        categories for this department, or None if the page has no such
        link (not every department provides one)."""
        link_text = self.soup.find(
            text=re.compile("Institution-Specific Classes of Records"))
        if link_text:
            link = link_text.findParent('a')
            return self.infosource.department_page(self.department, link['href'])
        return None

    def standard_data_page(self):
        """Return the link to the standard list of data categories for
        this department.

        Every department is expected to have one; if the link is missing
        a warning is printed and None is returned.
        """
        link_text = self.soup.find(
            text=re.compile("Standard Classes of Records"))
        if link_text:
            link = link_text.findParent('a')
            return self.infosource.department_page(self.department, link['href'])
        print("All departments should have a standard classes of record. None found on %s." % (self.cache.path))
        return None

    def scrape(self):
        """Scrape the standard and (if present) institution-specific
        category pages for this department."""
        print("scraping department " + self.department.name)
        # A list of (page, scraper_class) pairs, not a dict: dict keys
        # would silently collide if both lookups returned the same value
        # (e.g. None when both pages are missing), dropping one scraper.
        to_scrape = [
            (self.standard_data_page(), DepartmentStandardCategoryScraper),
            (self.institution_data_page(), DepartmentSpecificCategoryScraper),
        ]
        for page, scraper_class in to_scrape:
            if page:
                scraper = scraper_class(self.department, page)
                scraper.scrape()