def _fetch_companies_house(self, identifiers):
        """Fetch Companies House JSON for each identifier and update the
        matching Organization's founding/dissolution dates and classification.

        NOTE(review): the ``def`` line sits at column 0 while the body is at
        method depth — presumably a paste artifact and this is really a method
        on a command class; confirm against the original file.
        """
        # Address keys, in the order they should appear in a display string.
        address_parts = ('CareofName', 'PoBox', 'AddressLine1', 'AddressLine2', 'PostTown', 'Postcode', 'County', 'Country',)
        helpers.create_data_folder('companieshouse')

        for identifier in identifiers:
            url = "http://data.companieshouse.gov.uk/doc/company/{}.json".format(identifier.identifier)
            filename = "ch_{}.json".format(identifier.identifier)
            try:
                j = helpers.fetch_json(url, filename, path='companieshouse', refresh=self.refresh)
            except ValueError:
                # Malformed or missing JSON for this company — skip it.
                continue

            org = models.Organization.objects.get(identifiers=identifier)

            # Dates may be absent; _parse_date is given None in that case.
            org.founding_date = self._parse_date(j['primaryTopic'].get('IncorporationDate'))
            org.dissolution_date = self._parse_date(j['primaryTopic'].get('DissolutionDate'))
            classification = j['primaryTopic']['CompanyCategory']
            if classification is None:
                classification = ''
            org.classification = classification
            org.save()

            # # TODO: Other names
            # name = j['primaryTopic']['CompanyName']
            # other_names = ...

            address = j['primaryTopic'].get('RegAddress')
            if address:
                # Join whichever address components are present into one string.
                # NOTE(review): `address` is built but never stored anywhere in
                # this view — the rest of the function appears truncated here.
                address = ', '.join([address[k] for k in address_parts if k in address])
 def _download_lords_interests(self):
     """Download every page of Lords registered interests.

     Follows the API's `next` links until exhausted and returns the
     combined list of result items from all pages.
     """
     helpers.create_data_folder("lordsinterests")

     items = []
     page_number = 0
     next_url = "http://lda.data.parliament.uk/lordsregisteredinterests.json?_view=Registered+Interest&_pageSize=50&_page=0"
     while next_url:
         result = helpers.fetch_json(
             next_url,
             "lords_interests_{:02d}.json".format(page_number),
             path="lordsinterests",
             refresh=self.refresh,
         )['result']
         items.extend(result['items'])
         # Absent `next` key means this was the final page.
         next_url = result.get('next')
         page_number += 1
     return items
# Ejemplo n.º 3 — paste-site artifact (example label and vote count), not code.
    def handle(self, *args, **options):
        """Entry point: scrape the APPC members register.

        Fetches the register index, derives the register period from the
        page heading, then fetches and scrapes each member company's page.
        """
        self.refresh = options.get('refresh')

        helpers.create_data_folder("appc")

        # The index page is always re-fetched so we see the latest register.
        index_url = "{}/members/register/".format(self.base_url)
        index_html = helpers.fetch_text(index_url, "index.html", path="appc", refresh=True)
        soup = BeautifulSoup(index_html, "html5lib")

        # The register's date range is embedded in the page's <h1> heading.
        date_range = self.get_dates(soup.h1.text)

        period_path = join("appc", date_range[1])
        helpers.create_data_folder(period_path)

        # Each member company appears as an <input name="company" value="...">.
        for company_input in soup.find_all("input", {"name": "company"}):
            company = company_input["value"]
            company_html = self._fetch_company(company, period_path)
            self._scrape_company_html(company_html, date_range)
    def _download_mps_interests(self):
        """Download MPs' register-of-interests files newer than the
        historical snapshot bundled as a git submodule.

        Raises:
            CommandError: if the historical data directory is missing.
        """
        helpers.create_data_folder("mpsinterests")

        parl_data_path = join(self.mps_datadir, 'parldata', 'scrapedxml', 'regmem')
        if not exists(parl_data_path):
            raise CommandError("You should fetch historical MPs’ interests data with `git submodule update`")

        # changedates.txt lists "timestamp,filename" pairs, one per line.
        listing = helpers.fetch_text(
            "{}changedates.txt".format(self.base_url),
            "changedates.txt",
            path="mpsinterests",
            refresh=self.refresh,
        )
        entries = [line.split(",") for line in listing.split("\n") if line != ""]

        for timestamp, filename in entries:
            # Filenames embed an ISO date at characters 6..15; lexicographic
            # comparison therefore orders them chronologically.
            if filename[6:16] <= "2015-01-06":
                # Already covered by the historical submodule data.
                continue

            url = self.base_url + filename
            print("Fetching %s ..." % url)
            helpers.fetch_text(url, join(self.mps_datadir, filename), refresh=self.refresh)
    def _process_people(self, people):
        """Update Person records from popolo-style ``people`` dicts.

        For each dict, find the matching Person via its uk.org.publicwhip
        Identifier, copy simple attributes onto the model, download the
        person's image, and attach related links, identifiers and contact
        details. People with no matching identifier are skipped.
        """
        # Relation key on the person dict -> model used to create/attach it.
        person_rels = {
            'links': models.Link,
            'identifiers': models.Identifier,
        }

        # Keys handled specially (or intentionally ignored) rather than being
        # set as plain attributes.
        ignore_fields = (
            'id', 'links', 'identifiers', 'images',
            'name', 'contact_details', 'other_names',
        )

        path_to_images = join('..', '..', 'media', 'actors')
        helpers.create_data_folder(path_to_images)

        for person in people:
            try:
                i = models.Identifier.objects.get(identifier=person['id'], scheme="uk.org.publicwhip")
            except models.Identifier.DoesNotExist:
                # No matching identifier — nothing to update for this person.
                continue
            p = models.Person.objects.get(identifiers=i)

            # Copy plain attributes straight onto the model.
            for k, v in person.items():
                if k not in ignore_fields:
                    setattr(p, k, v)
            p.save()

            if p.image:
                filename = str(p.id)
                if p.image.startswith('http://yournextmp.popit.mysociety.org'):
                    # YourNextMP image URLs carry no extension; assume PNG.
                    filename += '.png'
                else:
                    print(p.image)
                helpers.fetch_file(p.image, filename, path=path_to_images, refresh=self.refresh)

            for rel_id, rel_model in person_rels.items():
                for rel_dict in person.get(rel_id, []):
                    getattr(p, rel_id).add(rel_model.objects.get_or_create(**rel_dict)[0])

            # BUG FIX: this loop was nested inside the person_rels loop above,
            # so contact details were processed once per relation type (i.e.
            # twice per person). Run it exactly once per person instead.
            for contact_dict in person.get('contact_details', []):
                contact_dict = {'contact_type': contact_dict['type'], 'value': contact_dict['value']}
                p.contact_details.add(models.ContactDetail.objects.get_or_create(**contact_dict)[0])