Example #1
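 # Scrape the parliamentarian's OpenParliament profile for their name, Wikipedia (EN/FR) links, and Twitter link.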
 def augment_parliamentarian_open_parliament(self, parliamentarian, url):
     soup = BeautifulSoup(fetch_url(url), "html.parser")
     for lang in (EN, FR):
         parliamentarian.names[lang][sources.NAME_OP[lang]] = soup.find(
             "h1").text
     parliamentarian.links[EN][sources.NAME_OP[EN]] = url
     for link in soup.select("ul.bulleted a"):
         if link.text == "Wikipedia":
             wiki_soup = BeautifulSoup(
                 fetch_url(link.attrs["href"], allow_redirects=True),
                 "html.parser")
             parliamentarian.links[EN][sources.NAME_WIKI[EN]] = urljoin(
                 link.attrs["href"],
                 wiki_soup.select("#ca-nstab-main a")[0].attrs["href"])
             try:
                 parliamentarian.links[FR][
                     sources.NAME_WIKI[FR]] = wiki_soup.select(
                         ".interwiki-fr a.interlanguage-link-target"
                     )[0].attrs["href"]
             except Exception:  # no French-language interwiki link found
                 pass
         elif link.text == "Twitter":
             for lang in (EN, FR):
                 parliamentarian.links[lang][
                     sources.NAME_TWITTER[lang]] = link.attrs["href"]
     parliamentarian.save()
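    # Record the session's EN/FR vote-list URLs, then fetch the votes XML export and process each vote.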
    def fetch_votes_session(self, session, list_url, remote_session_id):
        session.links[EN][sources.NAME_HOC_VOTES[EN]] = url_tweak(
            list_url, update={"sessionId": remote_session_id})
        session.links[FR][sources.NAME_HOC_VOTES[FR]] = get_french_parl_url(
            session.links[EN][sources.NAME_HOC_VOTES[EN]],
            BeautifulSoup(
                fetch_url(session.links[EN][sources.NAME_HOC_VOTES[EN]]),
                "lxml"),
        )
        session.save()

        parl_soup = BeautifulSoup(
            fetch_url(url_tweak(
                "http://www.ourcommons.ca/Parliamentarians/en/HouseVotes/ExportVotes?output=XML",
                update={"sessionId": remote_session_id},
            ),
                      use_cache=session.parliament.number < 42), "lxml")

        for overview in tqdm(
                parl_soup.find_all(
                    "voteparticipant"
                ),  # Oddly named considering the previous format we found this in
                desc=str(session),
                unit="vote",
        ):
            self.fetch_vote(overview, session)
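 # Create a Committee for each entry in the session's HoC committee list, scraping EN/FR names, codes, and links.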
 def fetch_hoc_committees_session(self, session, session_url):
     for link in tqdm(
             BeautifulSoup(
                 fetch_url(session_url),
                 "html.parser",
             ).select(".committees-list .accordion-content a"),
             desc=str(session),
             unit="committee",
     ):
         committee_url = {
             EN: url_tweak(urljoin(session_url, link.attrs["href"]))
         }
         committee = models.Committee(
             session=session,
             chamber=models.Committee.CHAMBER_HOC,
         )
         for lang in (EN, FR):
             soup = BeautifulSoup(fetch_url(committee_url[lang]),
                                  "html.parser")
             committee.names[lang][sources.NAME_PARL_COMMITTEE[
                 lang]] = soup.select(".institution-brand")[0].text
             committee.names[lang][
                 sources.NAME_PARL_COMMITTEE_CODE[lang]] = soup.select(
                     ".header-title.current-committee-profile")[0].text
             committee.links[lang][
                 sources.NAME_PARL_COMMITTEE[lang]] = committee_url[lang]
             if not committee.slug:
                 if "Joint" in committee.names[lang][
                         sources.NAME_PARL_COMMITTEE[lang]]:
                     committee.chamber = models.Committee.CHAMBER_JOINT
                 committee.slug = self.get_slug(committee)
                 committee_url[FR] = get_french_parl_url(
                     committee_url[lang], soup)
         committee.save()
Example #4
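    # Build ridings for the parliament from the LoP general-election and by-election tables, then add their French names.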
    def fetch_ridings(self, parliament):
        logger.debug("Fetch ridings, {}".format(parliament))
        skipped_codes = set()
        codes_to_ridings = dict()
        soup = BeautifulSoup(
            fetch_url(parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]),
            "html.parser",
        )
        for select in (
            "#ctl00_cphContent_ctl04_repGeneralElection_ctl00_grdMembers tr",
            "#ctl00_cphContent_ctl04_pnlSectionByElectionContent tr",
        ):
            for row in soup.select(select):
                cells = row.find_all("td", recursive=False)
                if cells:
                    riding_name, province_name = sources.LOP_RIDING_AND_PROVINCE.search(cells[1].text.strip()).groups()
                    province_slug = slugify(province_name)
                    riding_slug = slugify(" ".join((province_slug, riding_name)))
                    code = sources.LOP_CODE.search(cells[0].a.attrs["href"]).group().lower()
                    if riding_slug not in self.known_ridings:
                        try:
                            province = self.cache_provinces[province_slug]
                        except KeyError:
                            province = models.Province.objects.get(slug=province_slug)
                            self.cache_provinces[province_slug] = province
                        riding, created = models.Riding.objects.get_or_create(
                            slug=riding_slug,
                            province=province,
                        )
                        if created:
                            riding.names[EN][sources.NAME_LOP_PARLIAMENT[EN]] = riding_name
                            riding.save()
                        self.known_ridings.add(riding_slug)
                        codes_to_ridings[code] = riding
                    else:
                        skipped_codes.add(code)

        soup = BeautifulSoup(
            fetch_url(parliament.links[FR][sources.NAME_LOP_PARLIAMENT[FR]]),
            "html.parser",
        )
        for select in (
            "#ctl00_cphContent_ctl04_repGeneralElection_ctl00_grdMembers tr",
            "#ctl00_cphContent_ctl04_pnlSectionByElectionContent tr",
        ):
            for row in soup.select(select):
                cells = row.find_all("td", recursive=False)
                if cells:
                    code = sources.LOP_CODE.search(cells[0].a.attrs["href"]).group().lower()
                    if code not in skipped_codes:
                        riding_name, province_name = sources.LOP_RIDING_AND_PROVINCE.search(cells[1].text.strip()).groups()
                        riding = codes_to_ridings[code]
                        riding.names[FR][sources.NAME_LOP_PARLIAMENT[FR]] = riding_name
                        riding.save()
Example #5
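 # Record the riding's EN/FR LoP riding-history links, pre-fetching each page, and cache the riding by slug.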
 def fetch_riding(self, riding, url):
     for lang in (EN, FR):
         riding.links[lang][
             sources.NAME_LOP_RIDING_HISTORY[lang]] = url_tweak(
                 url, update={"Language": sources.LANG_LOP[lang]})
         try:
             fetch_url(
                 riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]])
         except Exception as e:
             logger.exception(e)
     riding.save()
     self.cached_ridings[riding.slug] = riding
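    # Name the riding from its LoP history page and link it to the ridings it previously was or later became.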
    def augment_riding(self, riding):
        try:
            for lang in (FR, EN):
                url = riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]
                soup = BeautifulSoup(fetch_url(url), "html.parser")
                riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[
                    lang]] = soup.select("h4")[0].text.split(", ")[0]
        except (KeyError, FetchFailure, FetchSuppressed) as e:
            logger.exception(e)
            return

        riding.save()
        for tag_id in ("#previous", "#became"):
            related_ridings = soup.select(tag_id)
            if related_ridings:
                for link in related_ridings[0].parent.select("a"):
                    match = re.search(
                        r"^(?P<name>.*) \((?P<province>.*)\)\((?P<daterange>.*)\)",
                        link.text).groupdict()
                    riding_slug = slugify("{province}-{name}".format(**match))
                    try:
                        related_riding = get_cached_obj(
                            self.cached_ridings, riding_slug)
                    except AssertionError:
                        province = get_cached_obj(self.cached_provinces,
                                                  match["province"])
                        related_riding, created = models.Riding.objects.get_or_create(
                            slug=riding_slug, province=province)
                        logger.debug("Auxilliary riding detected: {}".format(
                            riding_slug))
                    for lang in (EN, FR):
                        if sources.NAME_LOP_RIDING_HISTORY[
                                lang] not in related_riding.links[lang]:
                            related_riding.links[lang][
                                sources.
                                NAME_LOP_RIDING_HISTORY[lang]] = url_tweak(
                                    urljoin(url, link.attrs["href"]),
                                    update={
                                        "Language": sources.LANG_LOP[lang]
                                    },
                                )
                            related_riding.names[lang][
                                sources.
                                NAME_LOP_RIDING_HISTORY[lang]] = BeautifulSoup(
                                    fetch_url(related_riding.links[lang][
                                        sources.NAME_LOP_RIDING_HISTORY[lang]]
                                              ),
                                    "html.parser",
                                ).select("h4")[0].text.split(", ")[0]
                            related_riding.save()
                    riding.related_historically.add(related_riding)
Example #7
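    # Gather postal-code FSAs from Wikipedia, then use the HoC floor plan's MP lookup to attach each FSA to its riding.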
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        fsas = set()
        index_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_in_Canada"
        index_all = BeautifulSoup(fetch_url(index_url), "html.parser")
        for link in tqdm(index_all.findAll("a", {"title": LIST})):
            index_letter = BeautifulSoup(
                fetch_url(urljoin(index_url, link.attrs["href"])),
                "html.parser")
            for fsa in tqdm(index_letter.findAll("b", text=XNX)):
                if cssutils.parseStyle(fsa.parent.attrs.get(
                        "style", "")).color != "#CCC":
                    fsas.add(fsa.text)

        cached_ridings = get_cached_dict(
            models.Riding.objects.filter(
                election_ridings__date__year__gte=2015))
        person_id_to_riding = {}
        for person in BeautifulSoup(
                fetch_url(
                    "http://www.ourcommons.ca/Parliamentarians/en/floorplan"),
                "html.parser",
        ).select(".FloorPlanSeat .Person"):
            riding = get_cached_obj(cached_ridings,
                                    person.attrs["constituencyname"])
            person_id_to_riding[int(person.attrs["personid"])] = riding
            riding.post_code_fsas = set()

        for fsa in tqdm(fsas):
            result = fetch_url(
                "http://www.ourcommons.ca/Parliamentarians/en/FloorPlan/FindMPs?textCriteria={}"
                .format(fsa))
            try:
                result = result.decode()
            except AttributeError:
                pass
            for person_id in filter(None, result.split(",")):
                try:
                    person_id_to_riding[int(person_id)].post_code_fsas.add(fsa)
                except (KeyError, ValueError):
                    logger.warning(
                        f"Person ID {person_id} expected for FSA {fsa}, but that wasn't found in the floorplan"
                    )

        for riding in person_id_to_riding.values():
            riding.post_code_fsas = sorted(riding.post_code_fsas)
            riding.save()
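    # Match Elections Canada's riding search results to our ridings, record each electoral district number, then augment each riding.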
    def augment_ridings_ec(self):
        for row in tqdm(
                BeautifulSoup(
                    fetch_url(
                        url_tweak(
                            "http://www.elections.ca/Scripts/vis/SearchProvinces?PROV=CA&PROVID=99999&QID=-1&PAGEID=20",
                            update={"L": sources.LANG_EC[EN]})),
                    "html.parser").select("table tr")):
            cells = row.find_all("td", recursive=False)
            if cells:
                riding = models.Riding.objects.get(slug=slugify("{} {}".format(
                    cells[1].text,
                    cells[0].text,
                )))
                riding.electoral_district_number = parse_qs(
                    urlparse(cells[0].a.attrs["href"]).query)["ED"][0]
                self.cached_ridings[riding.electoral_district_number] = riding
                riding.save()

        for riding in tqdm(
                models.Riding.objects.filter(
                    electoral_district_number__isnull=False),
                desc="Augment Ridings, Elections Canada",
                unit="riding",
        ):
            self.augment_riding_ec(riding)
Example #9
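 # Normalize the EN URL and derive the FR URL from the page's language-toggle link.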
 def fetch_item(self, url):
     url[EN] = ensure_trailing_slash(url[EN])
     soup = BeautifulSoup(fetch_url(url[EN]), "html.parser")
     url[FR] = ensure_trailing_slash(
         urljoin(url[EN],
                 one_or_none(
                     soup.select("#language-toggle")).attrs["href"]))
Example #10
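    # Fetch the sitting's EN/FR Hansard XML, clean up the parsed trees, then reset parser state and walk the English root.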
    def fetch_hansard(self, sitting):

        # Fetch and parse the hansard XML
        self.tree = {
            lang: etree.ElementTree(
                etree.fromstring(
                    fetch_url(
                        sitting.links[lang][
                            sources.NAME_HOC_HANSARD_XML[lang]], )))
            for lang in (EN, FR)
        }

        # Strip out incorrect elements
        for lang in (EN, FR):
            strip_empty_elements(self.tree[lang].getroot())
            for duplicate in self.tree[lang].xpath(
                    "//PersonSpeaking/Affiliation[2]"):
                duplicate.getparent().remove(duplicate)
            merge_adjacent_quotes(self.tree[lang].getroot())

        # If the structure checks out, parse down from the root
        self.floor_language = None
        self.hansard_block = None
        self.hansard_block_number = 0
        self.metadata = {}
        self.parliamentarian = None
        self.person_speaking = None
        self.previous_hansard_block = None
        self.sitting = sitting
        self.timestamp = datetimeparse(self.tree[EN].find(
            "//ExtractedItem[@Name='MetaCreationTime']").text)
        self.new_hansard_block()
        self.parse_element(self.tree[EN].getroot())
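 # Set the party's colour from its Wikipedia table styling and record its EN/FR Wikipedia names and links.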
 def augment_party_by_wikipedia(self, party, link_en, style):
     party.color = cssutils.parseStyle(style).background
     party.color = re.sub(r"^#([0-9a-f])([0-9a-f])([0-9a-f])$", r"#\1\1\2\2\3\3", party.color, flags=re.I)
     if party.color == "#DCDCDC":
         party.color = ""
     try:
         party.links[EN][sources.NAME_WIKI[EN]] = link_en
         soup_en = BeautifulSoup(fetch_url(link_en), "html.parser")
         party.names[EN][sources.NAME_WIKI[EN]] = soup_en.select("#firstHeading")[0].text.strip()
         link_fr = soup_en.select(".interwiki-fr a.interlanguage-link-target")[0].attrs["href"]
         party.links[FR][sources.NAME_WIKI[FR]] = link_fr
         soup_fr = BeautifulSoup(fetch_url(link_fr), "html.parser")
         party.names[FR][sources.NAME_WIKI[FR]] = soup_fr.select("#firstHeading")[0].text.strip()
     except IndexError:
         logger.debug("{} doesn't have a French-language equivalent in Wikipedia at the moment".format(party))
     party.save()
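    # Walk Wikipedia's list of federal parties, augmenting each mapped party, then default any remaining blank colours.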
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        cached_parties = get_cached_dict(models.Party.objects.all())
        list_url = "https://en.wikipedia.org/wiki/List_of_federal_political_parties_in_Canada"
        for tr in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select("table.wikitable > tr"),
            desc="Augment Parties, Wikipedia",
            unit="party",
        ):
            if tr.find_all("td", recursive=False):
                for link in tr.find_all("td", recursive=False)[1].find_all("a"):
                    name = link.attrs["title"].strip()
                    name = WIKI_MAPPING.get(name, name)
                    if name is None:
                        continue
                    try:
                        party = get_cached_obj(cached_parties, name)
                    except AssertionError:
                        logger.warning("Wikipedia mentions {}, but we don't have a mapping for it".format(link.attrs["title"].strip()))
                        continue
                    self.augment_party_by_wikipedia(
                        party,
                        urljoin(list_url, link.attrs["href"]),
                        tr.find_all("td", recursive=False)[0].attrs["style"],
                    )
        models.Party.objects.filter(color="").update(color="#666666")
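    # Parse the {{Infobox election}} wikitext of the election's Wikipedia article into a dict (with per-party sub-dicts).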
    def augment_election_wiki(self, election):
        soup = BeautifulSoup(fetch_url(url_tweak(
            election.links[EN][sources.NAME_WIKI[EN]],
            update={"action": "edit"},
        )), "html.parser")

        # Get the info box
        page_source = soup.select("#wpTextbox1")[0].text
        infobox_lines = re.search("{{Infobox election\n(.*?)\n}}", page_source, re.S | re.I).groups()[0].splitlines()
        infobox = {}
        infobox["parties"] = []
        for key, value in [
            line[2:].split("=", 1)
            for line in infobox_lines
            if line.startswith("| ")
        ]:
            key = key.strip()
            value = value.strip()
            try:
                party_place = int(key[-1]) - 1
                while len(infobox["parties"]) <= party_place:
                    infobox["parties"].append({})
                infobox["parties"][party_place][key[:-1]] = value
            except ValueError:
                infobox[key] = value
        election.wiki_info_box = infobox
        election.save()
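    # Create provinces from the LoP listing (EN pass), then match the FR listing back to them and augment both languages.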
    def fetch_provinces(self):
        url = url_tweak(self.ROOT_URL,
                        update={"Language": sources.LANG_LOP[EN]})
        for link in tqdm(
                BeautifulSoup(
                    fetch_url(url),
                    "html.parser",
                ).select("#ctl00_pnlContent a"),
                desc="Fetch Provinces, LoP (EN)",
                unit="province",
        ):
            if link.attrs.get("id",
                              "").startswith("ctl00_cphContent_repProvinces_"):
                province, created = models.Province.objects.get_or_create(
                    slug=slugify(link.text.strip()))
                url_en = url_tweak(
                    urljoin(url, link.attrs["href"]),
                    remove=("MenuID", "MenuQuery"),
                    update={"Section": "All"},
                )
                self.augment_province(province, EN, url_en)

        url = url_tweak(self.ROOT_URL, update={"Language": sources.LANG_LOP[FR]})
        for link in tqdm(
                BeautifulSoup(
                    fetch_url(url),
                    "html.parser",
                ).select("#ctl00_pnlContent a"),
                desc="Fetch Provinces, LoP (FR)",
                unit="province",
        ):
            if link.attrs.get("id",
                              "").startswith("ctl00_cphContent_repProvinces_"):
                url_fr = url_tweak(
                    urljoin(url, link.attrs["href"]),
                    remove=("MenuID", "MenuQuery"),
                    update={"Section": "All"},
                )
                province = models.Province.objects.get(
                    links__contains=url_tweak(
                        url_fr,
                        update={"Language": sources.LANG_LOP[EN]},
                    ))
                self.augment_province(province, FR, url_fr)
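 # Fetch the ParlVU calendar for the year and process each listed day.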
 def fetch_year(self, year):
     days = [
         dateparse(day)
         for day in json.loads(fetch_url(
             "http://parlvu.parl.gc.ca/XRender/en/api/Data/GetCalendarYearData/{}0101/-1".format(year),
             use_cache=year < 2017,
         ))
     ]
     for day in tqdm(days, desc=str(year), unit="day"):
         self.fetch_day(day)
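    # Walk the LoP party list in both languages, matching each entry to a party and recording names, links, and LoP item codes.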
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        cached_parties = get_cached_dict(models.Party.objects.all())
        list_url = "https://lop.parl.ca/parlinfo/Lists/Party.aspx"
        for lang in (EN, FR):
            for a in tqdm(
                    BeautifulSoup(
                        fetch_url(
                            url_tweak(
                                list_url,
                                update={"Language": sources.LANG_LOP[lang]})),
                        "html.parser").select("td > a"),
                    desc="Augment Parties, LoP",
                    unit="party",
            ):
                if "_lnkParty_" not in a.attrs.get("id", ""):
                    continue
                url = url_tweak(
                    urljoin(list_url, a.attrs["href"]),
                    update={"Section": "ALL"},
                    remove=("MenuID", "MenuQuery"),
                )
                lop_item_code = sources.LOP_CODE.search(url).group().lower()
                party = models.Party.objects.filter(
                    lop_item_code=lop_item_code).first()
                if not party:
                    name = sources.WHITESPACE.sub(" ", a.text.strip())
                    name = LOP_LIST_MAPPING.get(name, name)
                    if name is None:
                        continue
                    party = get_cached_obj(cached_parties, name)
                party.links[lang][sources.NAME_LOP_PARTY[lang]] = url
                party.names[lang][
                    sources.NAME_LOP_PARTY[lang]] = a.text.strip()
                party.lop_item_code = sources.LOP_CODE.search(
                    url).group().lower()
                soup = BeautifulSoup(fetch_url(url), "html.parser")
                for link in soup.select("#ctl00_cphContent_dataLinks a"):
                    party.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                        "", link.text.strip())] = link.attrs["href"]
                party.save()
Example #17
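    # Page through CPAC's video listings, fetching each item until there is no next-page link.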
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        url = "http://www.cpac.ca/en/page/1/?s&category=all&person=all&order=newest&type=videos"
        while url:
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            for item in soup.select(".vidlist-main__item"):
                self.fetch_item(
                    {EN: urljoin(url,
                                 item.select("a")[0].attrs["href"])})
            button_next = one_or_none(soup.select("a.latest-slider__next"))
            if button_next:
                url = urljoin(url, button_next.attrs["href"])
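    # Create a Committee for each Senate committee in the session (skipping joint committees), scraping EN/FR names, codes, and links.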
    def fetch_senate_committees_session(self, session, session_url):
        for link in tqdm(
                BeautifulSoup(
                    fetch_url(session_url),
                    "html.parser").select(".committee-list-boxes-wrapper a"),
                desc=str(session),
                unit="committee",
        ):
            committee_url = {
                EN: url_tweak(urljoin(session_url, link.attrs["href"]))
            }
            if link.select(".joint-committee-list-boxes"):
                logger.debug(
                    "Skipping {} (broken, reported, joint committees are covered in HoC anyway)"
                    .format(committee_url[EN]))
                continue

            committee = models.Committee(
                session=session,
                chamber=models.Committee.CHAMBER_SEN,
            )
            for lang in (EN, FR):
                soup = BeautifulSoup(fetch_url(committee_url[lang]),
                                     "html.parser")
                committee.names[lang][
                    sources.NAME_PARL_COMMITTEE[lang]] = soup.select(
                        "meta[name=dc.description]")[0].attrs["content"]
                committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[
                    lang]] = committee_url[lang].strip("/").split(
                        "/")[-2].upper()
                committee.links[lang][
                    sources.NAME_PARL_COMMITTEE[lang]] = committee_url[lang]
                if not committee.slug:
                    committee.slug = self.get_slug(committee)
                    committee_url[FR] = get_french_parl_url(
                        committee_url[lang], soup)
            committee.save()
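 # Create a Parliament for each LoP listing entry, wiring up Wikipedia, LoP, and Canadiana links and recording the seat count.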
 def fetch_parliaments(self):
     url = "https://lop.parl.ca/parlinfo/Lists/Parliament.aspx"
     for link in tqdm(
         BeautifulSoup(
             fetch_url(url),
             "html.parser",
         ).select("#ctl00_cphContent_ctl00_grdParliamentList td > a"),
         desc="Fetch Parliaments, LoP",
         unit="parliament",
     ):
         parliament, created = models.Parliament.objects.get_or_create(
             number=int(REVERSE_ORDINAL.sub(r"\1", link.text)),
         )
         if created or parliament.number >= 42:
             url = url_tweak(
                 urljoin(url, link.attrs["href"]),
                 remove=("MenuID", "MenuQuery"),
                 update={"Section": "All"},
             )
             parliament.links = {
                 EN: {sources.NAME_WIKI[EN]: "https://en.wikipedia.org/wiki/{}_Canadian_Parliament".format(inflector.ordinal(parliament.number))},
                 FR: {sources.NAME_WIKI[FR]: "https://fr.wikipedia.org/wiki/{}{}_législature_du_Canada".format(parliament.number, "re" if parliament.number == 1 else "e")},
             }
             for lang in (EN, FR):
                 parliament.links[lang][sources.NAME_LOP_PARLIAMENT[lang]] = url_tweak(url, update={"Language": sources.LANG_LOP[lang]})
                 if parliament.number <= 35:
                     parliament.links[lang][sources.NAME_CANADIANA[lang]] = "http://parl.canadiana.ca/search?usrlang={}&lang={}&identifier=P{}".format(
                         sources.LANG_CANADIANA_UI[lang],
                         sources.LANG_CANADIANA_CONTENT[lang],
                         parliament.number,
                     )
             parliament.seats = int(BeautifulSoup(
                 fetch_url(parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]),
                 "html.parser",
             ).select("#ctl00_cphContent_ctl06_pnlSectionPartyStandingsContent .GridRows")[0].contents[-1].text)
             parliament.save()
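    # Walk LEGISinfo's session selector and fetch the bills for each listed session.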
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        list_url = "http://www.parl.gc.ca/LegisInfo/Home.aspx?Page=1"
        for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url, allow_redirects=True),
                "html.parser",
            ).select("#ctl00_PageContentSection_BillListingControl_BillFacetSearch_SessionSelector1_pnlSessions a"),
            desc="Fetch Bills, LEGISinfo",
            unit="session",
        ):
            if " - " in link.text:
                parliament_number, session_number = link.text.split()[0].split("-")
                self.fetch_bills_session(Session.objects.get(parliament__number=parliament_number, number=session_number))
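 # For each session in the HoC committee list, fetch that session's committees.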
 def fetch_hoc_committees(self):
     list_url = "http://www.ourcommons.ca/Committees/en/List"
     for link in tqdm(
             BeautifulSoup(
                 fetch_url(list_url),
                 "html.parser",
             ).select(".session-selector"),
             desc="Fetch Committees, HoC",
             unit="session",
     ):
         querydict = parse_qs(urlparse(link.attrs["href"]).query)
         self.fetch_hoc_committees_session(
             Session.objects.get(parliament__number=querydict["parl"][0],
                                 number=querydict["session"][0]),
             url_tweak(urljoin(list_url, link.attrs["href"])),
         )
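 # For each session listed on the Senate committees site, fetch that session's committees.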
 def fetch_senate_committees(self):
     list_url = "https://sencanada.ca/en/committees/"
     for link in tqdm(
             BeautifulSoup(
                 fetch_url(list_url),
                 "html.parser",
             ).select(".session-dropdown-session a"),
             desc="Fetch Committees, Senate",
             unit="session",
     ):
         parliament_number, session_number = link.attrs["href"].strip(
             "/").rsplit("/", 1)[1].split("-")
         self.fetch_senate_committees_session(
             Session.objects.get(parliament__number=parliament_number,
                                 number=session_number),
             url_tweak(urljoin(list_url, link.attrs["href"])),
         )
Example #23
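    # Create a parliamentarian from their LoP profile: EN/FR names and links, LoP item code, birthdate, and photo (if any).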
    def fetch_parliamentarian(self, slug, name, lang_naive_url):
        parliamentarian, created = models.Parliamentarian.objects.get_or_create(
            slug=slug)
        if not created:
            return

        for lang in (EN, FR):
            parliamentarian.names[lang][
                sources.NAME_LOP_PARLIAMENT[lang]] = name
            url = url_tweak(lang_naive_url,
                            update={"Language": sources.LANG_LOP[lang]})
            parliamentarian.links[lang][
                sources.NAME_LOP_PARLIAMENTARIAN[lang]] = url
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            parliamentarian.names[lang][sources.NAME_LOP_PARLIAMENTARIAN[
                lang]] = sources.WHITESPACE.sub(
                    " ",
                    soup.select("#ctl00_cphContent_lblTitle")[0].text)
            for link in soup.select("#ctl00_cphContent_dataLinks a"):
                parliamentarian.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                    "", link.text.strip())] = link.attrs["href"]
        try:
            parliamentarian.lop_item_code = sources.LOP_CODE.search(
                url).group().lower()
            parliamentarian.birthdate = soup.select(
                "#ctl00_cphContent_DateOfBirthData")[0].text.strip().replace(
                    ".", "-")
        except (AttributeError, IndexError):
            # LoP item code or birthdate not available for this parliamentarian
            pass

        # Download the parliamentarian's photo if they have one
        photo_url = urljoin(
            url,
            soup.select("#ctl00_cphContent_imgParliamentarianPicture")
            [0].attrs["src"])
        code = sources.LOP_CODE.search(photo_url).group().lower()
        if code != "00000000-0000-0000-0000-000000000000":
            filename = "{}.jpg".format(code)
            filepath = parliamentarian.photo.field.upload_to(None, filename)
            if os.path.exists(os.path.join(settings.MEDIA_ROOT, filepath)):
                parliamentarian.photo = filepath
            else:
                parliamentarian.photo.save(
                    filename, ContentFile(requests.get(photo_url).content))

        parliamentarian.save()
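    # Download the session's LEGISinfo XML and create each bill with EN/FR links and titles, attaching any referenced committees.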
    def fetch_bills_session(self, session):
        cached_committees = get_cached_dict(models.Committee.objects.filter(session=session))

        url = "http://www.parl.ca/LegisInfo/Home.aspx?download=xml&ParliamentSession={}-{}".format(session.parliament.number, session.number)
        soup = BeautifulSoup(fetch_url(url, use_cache=session.parliament.number < 42), "lxml")
        for bill_soup in tqdm(
            soup.find_all("bill"),
            desc=str(session),
            unit="bill",
        ):
            bill_number = bill_soup.select("billnumber")[0]
            bill_number = "-".join(filter(None, (
                bill_number.attrs["prefix"],
                bill_number.attrs["number"],
                bill_number.get("suffix", None),
            )))
            bill = models.Bill(
                session=session,
                slug=slugify("{}-{}".format(
                    session.slug,
                    bill_number,
                )),
            )
            for lang in (EN, FR):
                bill.links[lang][sources.NAME_LEGISINFO[lang]] = url_tweak(
                    "http://www.parl.gc.ca/LegisInfo/BillDetails.aspx",
                    update={
                        "billId": bill_soup.attrs["id"],
                        "Language": sources.LANG_LEGISINFO_UI[lang],
                    },
                )
                bill.names[lang][sources.NAME_LEGISINFO_NUMBER[lang]] = bill_number
                bill.names[lang][sources.NAME_LEGISINFO_TITLE[lang]] = bill_soup.select("billtitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
                title_short = bill_soup.select("shorttitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
                if title_short:
                    bill.names[lang][sources.NAME_LEGISINFO_TITLE_SHORT[lang]] = title_short
            bill.save()

            for event_soup in bill_soup.select("event"):
                try:
                    committee_soup = bill_soup.select("committee[accronym]")[0]  # They misspelled "acronym" in their XML
                    code = committee_soup.attrs["accronym"]
                    if code != "WHOL":
                        bill.committees.add(get_cached_obj(cached_committees, code))
                except IndexError:
                    pass
Example #25
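    # Resolve a party from its LoP popup, disambiguating reused party names and creating/caching the Party when it's new.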
    def fetch_party(self, name, popup, election_riding):
        if name in ("Unknown", "Ind.", "N/A") or name.startswith("I "):
            return None

        # Some parties share the same name, but are effectively separate
        if name in ("Rhino", "Nrhino"):
            name = "Rhino ({})".format(
                "1" if election_riding.date.year < 2000 else "2")
        elif name == "C.P.":
            name = "C.P. ({})".format(
                "1" if election_riding.date.year < 2000 else "2")
        elif name == "Soc":
            name = "Soc ({})".format(
                "1" if election_riding.date.year < 1930 else "2")

        # Others just slugify ambiguously
        if name == "NCP":
            name = "NCP (1)"
        elif name == "N.C.P.":
            name = "N.C.P. (2)"
        elif name == "BPC":
            name = "BPC (1)"
        elif name == "B.P.C.":
            name = "B.P.C. (2)"

        try:
            party = self.cached_parties[name]
        except KeyError:
            party = Party()
            for lang in (EN, FR):
                popup_soup = BeautifulSoup(
                    fetch_url(
                        "https://lop.parl.ca/About/Parliament/FederalRidingsHistory/hfer-party.asp?lang={}&Party={}"
                        .format(
                            sources.LANG_LOP[lang],
                            PARTY_POPUP.search(popup).groups()[0],
                        )), "html.parser")
                party.names[lang][sources.NAME_LOP_PARTY_SHORT[
                    lang]] = popup_soup.find_all("td")[0].text.strip()
                party.names[lang][sources.NAME_LOP_RIDING_HISTORY[
                    lang]] = popup_soup.find_all("td")[1].text.strip()
            party.slug = slugify(name)
            party.lop_item_code = None
            party.save()
            self.cached_parties[name] = party
        return party
Example #26
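 # Collect the LoP detail-page links for a parliament's parliamentarians into the name-keyed cache.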
 def fetch_parliamentarians(self, parliament):
     logger.debug("Fetch parliamentarians, {}".format(parliament))
     url = parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]
     for link in tqdm(
             BeautifulSoup(
                 fetch_url(url),
                 "html.parser").select("a[href^=Parliamentarian]"),
             desc=str(parliament),
             unit="parliamentarian",
     ):
         # We slugify the parliamentarian's name to disambiguate
         # names like "Marcel Masse" and "Marcel Massé"
         self.cache_parliamentarians[slugify(link.text)][url_tweak(
             urljoin(url, link.attrs["href"]),
             update={
                 "MoreInfo": "True",
                 "Section": "All",
             },
         )] = link.text
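    # For each session in the HoC Hansard session selector, parse its sittings.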
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        for session_link in tqdm(
                BeautifulSoup(
                    fetch_url(
                        "http://www.ourcommons.ca/DocumentViewer/en/42-1/house/sitting-1/hansard",
                        allow_redirects=True,
                        use_cache=False,
                    ), "html.parser").select(".session-selector"),
                desc="Fetch Sittings, HoC",
                unit="session",
        ):
            session = Session.objects.get(
                parliament__number=session_link.attrs["data-parliament"],
                number=session_link.attrs["data-session"],
            )
            self.parse_session(session)
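 # List the session's sitting publications from the HoC calendar widget and parse each sitting's URL.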
 def parse_session(self, session):
     session_url = url_tweak(
         "http://www.ourcommons.ca/DocumentViewer/en/SessionPublicationCalendarsWidget?organization=HOC&publicationTypeId=37",
         update={
             "parliament": session.parliament.number,
             "session": session.number
         },
     )
     for sitting_link in tqdm(
             BeautifulSoup(
                 fetch_url(
                     session_url,
                     use_cache=session.parliament.number < 42,
                 ), "html.parser").select("td a"),
             desc=str(session),
             unit="sitting",
     ):
         self.parse_sitting_url(
             urljoin(session_url, sitting_link.attrs["href"]), session)
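 # Record the province's LoP name and link for this language, a derived Wikipedia link, and any related LoP data links.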
 def augment_province(self, province, lang, url):
     soup = BeautifulSoup(
         fetch_url(url),
         "html.parser",
     )
     province.links[lang][sources.NAME_LOP_PROVINCE[lang]] = url
     province.names[lang][sources.NAME_LOP_PROVINCE[lang]] = soup.select(
         "#ctl00_cphContent_lblTitle")[0].text
     province.links[lang][sources.NAME_WIKI[
         lang]] = "https://{}.wikipedia.org/wiki/{}".format(
             sources.LANG_WIKI[lang],
             province.names[lang][sources.NAME_LOP_PROVINCE[lang]].replace(
                 " ", "_"),
         )
     province.links[lang].update(
         dict((sources.AVAILABILITY_WARNINGS.sub("", link.text.strip()),
               link.attrs["href"])
              for link in soup.select("#ctl00_cphContent_dataLinks a")))
     province.save()
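    # Prime the lookup caches, then map each parliament refinement in the HoC votes index to its session and fetch that session's votes.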
    def handle(self, *args, **options):
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        self.cached_parliamentarians = get_cached_dict(
            Parliamentarian.objects.filter(
                election_candidates__election_riding__date__year__gte=2000))
        self.cached_ridings = get_cached_dict(
            Riding.objects.filter(election_ridings__date__year__gte=2000))
        self.cached_parties = get_cached_dict(Party.objects.all())
        self.cached_parties.update({
            "Independent": [None],
            "Conservative Independent": [None],
            "Independent Conservative": [None],
        })

        list_url = "http://www.ourcommons.ca/Parliamentarians/en/HouseVotes/Index"
        parl_soup = BeautifulSoup(fetch_url(list_url), "html.parser")

        # Super-irritating that Parliament uses parliament=X&session=Y in some places, but then session=PK in others
        default_session_id = parse_qs(
            urlparse(
                parl_soup.select(".refiner-display-daterange .refinement a")
                [0].attrs["href"]).query)["sessionId"][0]

        for link in tqdm(
                parl_soup.select(".refiner-display-parliament .refinement a"),
                desc="Fetch Votes, HoC",
                unit="session",
        ):
            groupdict = re.search(
                r"(?P<parliament>[345][0-9])(st|nd|rd|th) Parliament\s+(?P<session>[1-9])(st|nd|rd|th)\s+",
                link.text).groupdict()
            self.fetch_votes_session(
                Session.objects.get(
                    parliament__number=groupdict["parliament"],
                    number=groupdict["session"],
                ),
                list_url,
                parse_qs(urlparse(link.attrs["href"]).query).get(
                    "sessionId", [default_session_id])[0],
            )