def scrape(self, scrape_depts=True):
    """Create a Department for every institution link found in ``self.soup``.

    Every element whose ``href`` matches ``/inst/<abbrev>/fed00-eng.asp``
    (``<abbrev>`` being any three characters) produces one
    ``Department.objects.get_or_create`` call, using the link's first child
    as the name and the abbreviation captured from the URL.

    Args:
        scrape_depts: When true (the default), each department is also
            passed to a ``DepartmentScraper`` and scraped immediately.
    """
    # One compiled pattern serves both the link filter and the abbreviation
    # capture; the capture group does not change which hrefs match.
    index_href = re.compile("/inst/(...)/fed00-eng.asp")
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    see_marker = re.compile(r'\s*See\s+')

    for link in self.soup.findAll(href=index_href):
        abbrev = index_href.search(link['href']).group(1)
        # Remove every "See " marker (plus surrounding whitespace) from the
        # link text -- presumably cross-reference entries; verify on the page.
        name = see_marker.sub('', link.contents[0])
        # The "created" flag returned by get_or_create is not needed here.
        dept, _created = Department.objects.get_or_create(name=name, abbrev=abbrev)
        if scrape_depts:
            DepartmentScraper(dept).scrape()
Example #2
0
 def test_resolve_pages(self):
     """Check the data-page URLs that DepartmentScraper resolves.

     For ``self.dept_w_both`` the institution page is expected at ``fed05``
     and the standard page at ``fed06``. For ``self.dept_w_only_standard``
     no institution page is expected and the standard page is at ``fed05``.
     """
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which newer unittest releases no longer provide.
     both = DepartmentScraper(self.dept_w_both)
     self.assertEqual(
         both.institution_data_page(),
         'http://infosource.gc.ca/inst/agr/fed05-eng.asp',
     )
     self.assertEqual(
         both.standard_data_page(),
         'http://infosource.gc.ca/inst/agr/fed06-eng.asp',
     )

     standard_only = DepartmentScraper(self.dept_w_only_standard)
     # Identity check: the method must return None itself, not merely
     # something that compares equal to it.
     self.assertIsNone(standard_only.institution_data_page())
     self.assertEqual(
         standard_only.standard_data_page(),
         'http://infosource.gc.ca/inst/apf/fed05-eng.asp',
     )
 def handle(self, *args, **options):
     """Scrape the single department whose abbreviation is the first argument.

     The first positional argument is used as the ``abbrev`` lookup for
     ``Department.objects.get``; the resulting department is handed to a
     ``DepartmentScraper`` and scraped. ``options`` is accepted but unused.
     NOTE(review): looks like a management-command entry point -- confirm.
     """
     # Read the argument first so a missing argument fails before any lookup.
     wanted_abbrev = args[0]
     DepartmentScraper(Department.objects.get(abbrev=wanted_abbrev)).scrape()