def _fetch_company(self, company, path):
    # print("Fetching HTML for '{}' ...".format(company))
    # Each company's register profile is retrieved by POSTing its name to
    # the profile URL; the response is cached under a slugified filename.
    url = "{}/members/register/register-profile/".format(self.base_url)
    filename = "{}.html".format(slugify(company))
    headers = {'User-Agent': 'Mozilla/5.0'}
    data = {"company": company}
    t = helpers.fetch_text(url, filename, path=path, headers=headers,
                           data=data, method="post", refresh=self.refresh)
    return t
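# helpers.fetch_text does the fetching and caching for every method here but
# is defined outside this excerpt. The function below is a minimal sketch of
# how such a cache-aware fetcher might behave; the "data/" cache root, the
# requests-based transport, and the name _fetch_text_sketch are all
# assumptions made for illustration, not the project's actual helper.

def _fetch_text_sketch(url, filename, path="", headers=None, data=None,
                       method="get", refresh=False):
    import os
    import requests

    # Reuse a previously downloaded copy unless a refresh is forced.
    filepath = os.path.join("data", path, filename)
    if os.path.exists(filepath) and not refresh:
        with open(filepath) as f:
            return f.read()
    # Otherwise fetch (POST with form data when requested, GET by default)
    # and cache the body to disk before returning it.
    if method == "post":
        response = requests.post(url, headers=headers, data=data)
    else:
        response = requests.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "w") as f:
        f.write(response.text)
    return response.text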
def _download_mps_interests(self):
    helpers.create_data_folder("mpsinterests")
    parl_data_path = join(self.mps_datadir, 'parldata', 'scrapedxml', 'regmem')
    if not exists(parl_data_path):
        raise CommandError(
            "You should fetch historical MPs’ interests data with "
            "`git submodule update`")
    # changedates.txt lists the register files available for download.
    url = "{}changedates.txt".format(self.base_url)
    r = helpers.fetch_text(url, "changedates.txt", path="mpsinterests",
                           refresh=self.refresh)
    to_fetch = [x.split(",") for x in r.split("\n") if x != ""]
    for timestamp, filename in to_fetch:
        # The filename carries an ISO date at characters 6-16.
        date = filename[6:16]
        if date <= "2015-01-06":
            # we already have this as part of the historical data
            continue
        filepath = join(self.mps_datadir, filename)
        url = self.base_url + filename
        print("Fetching %s ..." % url)
        helpers.fetch_text(url, filepath, refresh=self.refresh)
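# For reference: judging purely by the parsing in _download_mps_interests
# above, changedates.txt is assumed to contain one "<timestamp>,<filename>"
# pair per line, with an ISO date embedded at characters 6-16 of the
# filename. Hypothetical example lines:
#
#   1420588800,regmem2015-01-07.xml
#   1421193600,regmem2015-01-14.xml
#
# "regmem2015-01-07.xml"[6:16] gives "2015-01-07", and ISO dates compare
# correctly as plain strings, which is what the cut-off check relies on.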
def handle(self, *args, **options):
    self.refresh = options.get('refresh')
    helpers.create_data_folder("appc")
    # The register index is always re-fetched (refresh=True) so that the
    # current date range is picked up even when cached pages are reused.
    index_url = "{}/members/register/".format(self.base_url)
    t = helpers.fetch_text(index_url, "index.html", path="appc", refresh=True)
    soup = BeautifulSoup(t, "html5lib")
    date_range = self.get_dates(soup.h1.text)
    # Cache this register's pages in a folder named after date_range[1]
    # (presumably the end of the register period).
    path = join("appc", date_range[1])
    helpers.create_data_folder(path)
    # Every company on the register appears as a form input named "company".
    companies = [x["value"]
                 for x in soup.find_all("input", {"name": "company"})]
    for company in companies:
        html = self._fetch_company(company, path)
        self._scrape_company_html(html, date_range)
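# handle() marks this class as a Django management command, and
# options.get('refresh') implies a --refresh flag defined in an
# add_arguments method not shown in this excerpt. A minimal sketch of what
# that definition could look like (an assumption, not necessarily the
# project's actual code):

def add_arguments(self, parser):
    # Hypothetical: exposes the --refresh flag read by handle() above.
    parser.add_argument('--refresh', action='store_true',
                        help='re-fetch pages instead of reusing cached copies')

# With a command named e.g. scrape_appc (also hypothetical), the scraper
# would then be run as:
#
#   python manage.py scrape_appc
#   python manage.py scrape_appc --refresh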