def scrape(self, scrape_depts=True):
    """Discover departments on the infosource index page and register them.

    Scans ``self.soup`` for links to per-institution ``fed00-eng.asp``
    pages, extracts each department's three-letter abbreviation and
    display name, and ensures a matching ``Department`` row exists
    (creating it if necessary).

    Args:
        scrape_depts: When True (the default), also run a
            ``DepartmentScraper`` against each department found.
    """
    # Raw strings so \s etc. are regex escapes, not (invalid) string escapes.
    links = self.soup.findAll(href=re.compile(r"/inst/.../fed00-eng.asp"))
    for link in links:
        match = re.search(r"/inst/(...)/fed00-eng.asp", link['href'])
        abbrev = match.group(1)
        name = link.contents[0]
        # Some index entries are cross-references ("See X"); strip the
        # leading "See" so only the real department name remains.
        name = re.sub(r'\s*See\s+', '', name)
        # get_or_create returns (obj, created); only the object is needed.
        dept, _created = Department.objects.get_or_create(name=name, abbrev=abbrev)
        if scrape_depts:
            dept_scraper = DepartmentScraper(dept)
            dept_scraper.scrape()
def test_resolve_pages(self):
    """DepartmentScraper resolves the correct infosource data-page URLs."""
    # A department present in both listings resolves both page URLs.
    scraper_w_both = DepartmentScraper(self.dept_w_both)
    # assertEqual, not the deprecated assertEquals alias (removed in 3.12).
    self.assertEqual(scraper_w_both.institution_data_page(),
                     'http://infosource.gc.ca/inst/agr/fed05-eng.asp')
    self.assertEqual(scraper_w_both.standard_data_page(),
                     'http://infosource.gc.ca/inst/agr/fed06-eng.asp')
    # A department with only standard data has no institution page.
    scraper_w_only_standard = DepartmentScraper(self.dept_w_only_standard)
    self.assertIsNone(scraper_w_only_standard.institution_data_page())
    self.assertEqual(scraper_w_only_standard.standard_data_page(),
                     'http://infosource.gc.ca/inst/apf/fed05-eng.asp')
def handle(self, *args, **options):
    """Management-command entry point: scrape a single department.

    The first positional argument is the department's abbreviation;
    a matching ``Department`` row must already exist.
    """
    target = Department.objects.get(abbrev=args[0])
    DepartmentScraper(target).scrape()