Ejemplo n.º 1
0
 def _fetch_company(self, company, path):
     """Return the text fetched for ``company``'s register-profile page.

     The request goes to the ``register-profile`` URL under
     ``self.base_url`` with ``method="post"`` and ``{"company": company}``
     as ``data``.  The slugified company name plus ``.html`` is handed to
     ``helpers.fetch_text`` as the file name, ``path`` as its ``path``
     argument and ``self.refresh`` as its ``refresh`` argument.
     """
     profile_url = "{}/members/register/register-profile/".format(self.base_url)
     html_name = "{}.html".format(slugify(company))
     return helpers.fetch_text(
         profile_url,
         html_name,
         path=path,
         headers={'User-Agent': 'Mozilla/5.0'},
         data={"company": company},
         method="post",
         refresh=self.refresh,
     )
    def _download_mps_interests(self):
        """Fetch the MPs' interests files listed in ``changedates.txt``.

        Creates the ``mpsinterests`` data folder, checks that the historical
        data directory under ``self.mps_datadir`` exists, fetches
        ``changedates.txt`` from ``self.base_url`` and then fetches every
        listed file whose date is later than 2015-01-06.

        Raises:
            CommandError: if the historical data directory is missing.
            ValueError: if a non-blank line of ``changedates.txt`` does not
                consist of exactly two comma-separated fields.
        """
        helpers.create_data_folder("mpsinterests")

        parl_data_path = join(self.mps_datadir, 'parldata', 'scrapedxml', 'regmem')
        if not exists(parl_data_path):
            raise CommandError("You should fetch historical MPs’ interests data with `git submodule update`")

        url = "{}changedates.txt".format(self.base_url)
        r = helpers.fetch_text(url, "changedates.txt", path="mpsinterests", refresh=self.refresh)

        # Use splitlines() and strip() so that CRLF line endings, trailing
        # whitespace or whitespace-only lines never leak into the filename
        # (which would corrupt both the date slice and the download URL).
        to_fetch = [
            [field.strip() for field in line.split(",")]
            for line in r.splitlines()
            if line.strip()
        ]

        # The first field of each line is not needed here.
        for _timestamp, filename in to_fetch:
            # NOTE(review): characters 6-15 are presumably an ISO date, i.e.
            # names shaped like "regmemYYYY-MM-DD.xml" -- confirm against the
            # real changedates.txt. ISO dates compare correctly as strings.
            date = filename[6:16]
            if date <= "2015-01-06":
                # we already have this as part of the historical data
                continue

            filepath = join(self.mps_datadir, filename)
            url = self.base_url + filename
            print("Fetching %s ..." % url)
            helpers.fetch_text(url, filepath, refresh=self.refresh)
Ejemplo n.º 3
0
    def handle(self, *args, **options):
        """Fetch the register index, then fetch and scrape each company on it.

        Stores the ``refresh`` option on ``self.refresh``, downloads the
        index page, derives a date range from its ``<h1>`` text via
        ``self.get_dates`` and processes every ``<input name="company">``
        value found on the page.
        """
        self.refresh = options.get('refresh')

        helpers.create_data_folder("appc")

        # The index itself is always fetched with refresh=True, whatever the
        # ``refresh`` option says.
        index_html = helpers.fetch_text(
            "{}/members/register/".format(self.base_url),
            "index.html",
            path="appc",
            refresh=True,
        )
        page = BeautifulSoup(index_html, "html5lib")

        date_range = self.get_dates(page.h1.text)

        # Per-register folder named after the second element of the range.
        register_dir = join("appc", date_range[1])
        helpers.create_data_folder(register_dir)

        # Collect every company name up front, before any profile is fetched.
        company_inputs = page.find_all("input", {"name": "company"})
        company_names = [field["value"] for field in company_inputs]

        for name in company_names:
            profile_html = self._fetch_company(name, register_dir)
            self._scrape_company_html(profile_html, date_range)