def _fetch_companies_house(self, identifiers):
    # Update each Organization with company data fetched from the
    # Companies House JSON API.
    address_parts = (
        'CareofName', 'PoBox', 'AddressLine1', 'AddressLine2',
        'PostTown', 'Postcode', 'County', 'Country',
    )
    helpers.create_data_folder('companieshouse')
    for identifier in identifiers:
        url = "http://data.companieshouse.gov.uk/doc/company/{}.json".format(identifier.identifier)
        filename = "ch_{}.json".format(identifier.identifier)
        try:
            j = helpers.fetch_json(url, filename, path='companieshouse', refresh=self.refresh)
        except ValueError:
            # Skip companies whose record doesn't parse as JSON.
            continue
        org = models.Organization.objects.get(identifiers=identifier)
        org.founding_date = self._parse_date(j['primaryTopic'].get('IncorporationDate'))
        org.dissolution_date = self._parse_date(j['primaryTopic'].get('DissolutionDate'))
        classification = j['primaryTopic']['CompanyCategory']
        if classification is None:
            classification = ''
        org.classification = classification
        org.save()
        # # TODO: Other names
        # name = j['primaryTopic']['CompanyName']
        # other_names = ...
        # Join whichever registered-address parts are present into a
        # single string, in the canonical order given by address_parts.
        address = j['primaryTopic'].get('RegAddress')
        if address:
            address = ', '.join([address[k] for k in address_parts if k in address])
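# For reference, the method above relies only on this subset of the
# Companies House response; a sketch of the assumed shape (illustrative,
# field values invented):
#
# {
#     "primaryTopic": {
#         "CompanyName": "EXAMPLE CONSULTING LTD",
#         "CompanyCategory": "Private Limited Company",
#         "IncorporationDate": "...",
#         "DissolutionDate": "...",
#         "RegAddress": {"AddressLine1": "...", "PostTown": "...", "Postcode": "..."}
#     }
# }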
def _download_lords_interests(self):
    # Page through the Lords registered interests API, following the
    # 'next' link until the last page, and return the combined items.
    url = "http://lda.data.parliament.uk/lordsregisteredinterests.json?_view=Registered+Interest&_pageSize=50&_page=0"
    page = 0
    helpers.create_data_folder("lordsinterests")
    data = []
    while url:
        j = helpers.fetch_json(
            url,
            "lords_interests_{:02d}.json".format(page),
            path="lordsinterests",
            refresh=self.refresh,
        )
        data += j['result']['items']
        url = j['result'].get('next')
        page += 1
    return data
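# The loop above assumes each page of the Linked Data API looks roughly
# like this (illustrative sketch; 'next' is absent on the final page,
# which is what terminates the while loop):
#
# {
#     "result": {
#         "items": [ ...up to 50 registered interests... ],
#         "next": "http://lda.data.parliament.uk/lordsregisteredinterests.json?...&_page=1"
#     }
# }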
def handle(self, *args, **options):
    self.refresh = options.get('refresh')
    helpers.create_data_folder("appc")
    # Always refetch the index page so we see the current edition of the
    # register.
    index_url = "{}/members/register/".format(self.base_url)
    t = helpers.fetch_text(index_url, "index.html", path="appc", refresh=True)
    soup = BeautifulSoup(t, "html5lib")
    # The <h1> gives the date range this edition of the register covers.
    date_range = self.get_dates(soup.h1.text)
    path = join("appc", date_range[1])
    helpers.create_data_folder(path)
    # Each member company appears as a form input on the index page.
    companies = [x["value"] for x in soup.find_all("input", {"name": "company"})]
    for company in companies:
        html = self._fetch_company(company, path)
        self._scrape_company_html(html, date_range)
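# As a Django management command entry point, handle() is invoked via
# manage.py, e.g. (the command name here is hypothetical — use whatever
# name this module is registered under — and --refresh is assumed to be
# declared in the command's add_arguments()):
#
#     ./manage.py scrape_appc --refresh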
def _download_mps_interests(self):
    helpers.create_data_folder("mpsinterests")
    parl_data_path = join(self.mps_datadir, 'parldata', 'scrapedxml', 'regmem')
    if not exists(parl_data_path):
        raise CommandError("You should fetch historical MPs’ interests data with `git submodule update`")
    url = "{}changedates.txt".format(self.base_url)
    r = helpers.fetch_text(url, "changedates.txt", path="mpsinterests", refresh=self.refresh)
    # Each non-empty line is "<timestamp>,<filename>".
    to_fetch = [x.split(",") for x in r.split("\n") if x != ""]
    for timestamp, filename in to_fetch:
        # The date is embedded in the filename at a fixed offset.
        date = filename[6:16]
        if date <= "2015-01-06":
            # we already have this as part of the historical data
            continue
        filepath = join(self.mps_datadir, filename)
        url = self.base_url + filename
        print("Fetching {} ...".format(url))
        helpers.fetch_text(url, filepath, refresh=self.refresh)
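# Given the filename[6:16] slice above, a changedates.txt line is assumed
# to look something like this (illustrative, values invented):
#
#     1420545600,regmem2015-01-06.xml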
def _process_people(self, people):
    person_rels = {
        'links': models.Link,
        'identifiers': models.Identifier,
    }
    # Top-level keys that are handled specially (or skipped) rather than
    # set directly as Person attributes.
    ignore_fields = (
        'id', 'links', 'identifiers', 'images',
        'name', 'contact_details', 'other_names',
    )
    path_to_images = join('..', '..', 'media', 'actors')
    helpers.create_data_folder(path_to_images)
    for person in people:
        # Match people on their uk.org.publicwhip identifier; skip anyone
        # not already in the database.
        try:
            i = models.Identifier.objects.get(identifier=person['id'], scheme="uk.org.publicwhip")
        except models.Identifier.DoesNotExist:
            continue
        p = models.Person.objects.get(identifiers=i)
        for k, v in person.items():
            if k not in ignore_fields:
                setattr(p, k, v)
        p.save()
        if p.image:
            filename = str(p.id)
            if p.image.startswith('http://yournextmp.popit.mysociety.org'):
                filename += '.png'
            else:
                print(p.image)
            helpers.fetch_file(p.image, filename, path=path_to_images, refresh=self.refresh)
        # Attach related links and identifiers, then contact details.
        for rel_id, rel_model in person_rels.items():
            for rel_dict in person.get(rel_id, []):
                getattr(p, rel_id).add(rel_model.objects.get_or_create(**rel_dict)[0])
        for contact_dict in person.get('contact_details', []):
            contact_dict = {'contact_type': contact_dict['type'], 'value': contact_dict['value']}
            p.contact_details.add(models.ContactDetail.objects.get_or_create(**contact_dict)[0])
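# The person dicts consumed above are assumed to be Popolo-style records;
# a rough sketch covering only the keys the code reads (illustrative,
# values and exact field names hypothetical):
#
# {
#     "id": "uk.org.publicwhip/person/10001",
#     "links": [{"url": "...", "note": "..."}],
#     "identifiers": [{"identifier": "...", "scheme": "..."}],
#     "contact_details": [{"type": "twitter", "value": "@example"}],
#     "image": "http://..."
# }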