    def augment_riding(self, riding):
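        # Record this riding's official name from its Library of Parliament
        # history page (in both languages), then link it to the ridings it
        # replaced ("#previous") or became ("#became").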
        try:
            for lang in (FR, EN):
                url = riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]
                soup = BeautifulSoup(fetch_url(url), "html.parser")
                riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = (
                    soup.select("h4")[0].text.split(", ")[0])
        except (KeyError, FetchFailure, FetchSuppressed) as e:
            logger.exception(e)
            return

        riding.save()
        for tag_id in ("#previous", "#became"):
            related_ridings = soup.select(tag_id)
            if related_ridings:
                for link in related_ridings[0].parent.select("a"):
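                    # Link text has the form "Name (Province)(daterange)".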
                    match = re.search(
                        r"^(?P<name>.*) \((?P<province>.*)\)\((?P<daterange>.*)\)",
                        link.text).groupdict()
                    riding_slug = slugify("{province}-{name}".format(**match))
                    try:
                        related_riding = get_cached_obj(
                            self.cached_ridings, riding_slug)
                    except AssertionError:
                        province = get_cached_obj(self.cached_provinces,
                                                  match["province"])
                        related_riding, _ = models.Riding.objects.get_or_create(
                            slug=riding_slug, province=province)
                        logger.debug("Auxiliary riding detected: {}".format(
                            riding_slug))
                    for lang in (EN, FR):
                        source_key = sources.NAME_LOP_RIDING_HISTORY[lang]
                        if source_key not in related_riding.links[lang]:
                            related_riding.links[lang][source_key] = url_tweak(
                                urljoin(url, link.attrs["href"]),
                                update={"Language": sources.LANG_LOP[lang]},
                            )
                            related_soup = BeautifulSoup(
                                fetch_url(related_riding.links[lang][source_key]),
                                "html.parser",
                            )
                            related_riding.names[lang][source_key] = (
                                related_soup.select("h4")[0].text.split(", ")[0])
                            related_riding.save()
                    riding.related_historically.add(related_riding)

    def handle(self, *args, **options):
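        # Walk Wikipedia's list of federal political parties and augment each
        # known Party with its article link and the row's style attribute
        # (which carries the party colour).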
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        cached_parties = get_cached_dict(models.Party.objects.all())
        list_url = "https://en.wikipedia.org/wiki/List_of_federal_political_parties_in_Canada"
        for tr in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select("table.wikitable > tr"),
            desc="Augment Parties, Wikipedia",
            unit="party",
        ):
            cells = tr.find_all("td", recursive=False)
            if cells:
                for link in cells[1].find_all("a"):
                    name = link.attrs["title"].strip()
                    name = WIKI_MAPPING.get(name, name)
                    if name is None:
                        continue
                    try:
                        party = get_cached_obj(cached_parties, name)
                    except AssertionError:
                        logger.warning(
                            "Wikipedia mentions {}, but we don't have a "
                            "mapping for it".format(link.attrs["title"].strip()))
                        continue
                    self.augment_party_by_wikipedia(
                        party,
                        urljoin(list_url, link.attrs["href"]),
                        cells[0].attrs["style"],
                    )
        models.Party.objects.filter(color="").update(color="#666666")
Example #3
    def augment_parliamentarians_open_parliament(self):
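        # Consider only parliamentarians who ran in the 35th Parliament or
        # later, matching OpenParliament.ca's coverage.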
        cached_provinces = get_cached_dict(models.Province.objects.all())
        cached_parliamentarians = get_cached_dict(
            models.Parliamentarian.objects.filter(
                Q(election_candidates__election_riding__general_election__parliament__number__gte=35) |
                Q(election_candidates__election_riding__by_election__parliament__number__gte=35)))

        for url in (
                "https://openparliament.ca/politicians/former/",
                "https://openparliament.ca/politicians/",
        ):
            for row in tqdm(
                    BeautifulSoup(fetch_url(url),
                                  "html.parser").select(".content > .row"),
                    desc="Augment Parliamentarians, OpenParliament.ca",
                    unit="parliamentarian",
            ):
                columns = row.find_all("div", recursive=False)
                if (len(columns) == 2 and columns[0].find("h2")
                        and columns[1].find("a")):
                    province_name = columns[0].find("h2").text.strip()
                    province = get_cached_obj(
                        cached_provinces,
                        PROVINCE_MAPPING.get(province_name, province_name))
                    if sources.NAME_OP[EN] not in province.names[EN]:
                        province.names[EN][sources.NAME_OP[EN]] = province_name
                        province.save()
                    for link in columns[1].select('a[href^="/politicians/"]'):
                        if link.attrs["href"] in ("/politicians/",
                                                  "/politicians/former/"):
                            continue
                        slug = PARLIAMENTARIAN_MAPPING.get(
                            (link.text, province.slug),
                            slugify(link.text),
                        )
                        self.augment_parliamentarian_open_parliament(
                            get_cached_obj(cached_parliamentarians, slug),
                            urljoin(url, link.attrs["href"]),
                        )
Example #4
    def handle(self, *args, **options):
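        # Scrape Elections Canada's registered-party index in both languages,
        # recording each party's long and short names and an anchor link.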
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)
        cached_parties = get_cached_dict(models.Party.objects.all())
        url = "http://www.elections.ca/content.aspx?dir=par&document=index&section=pol"
        for lang in (EN, FR):
            url_lang = url_tweak(url, update={"lang": sources.LANG_EC[lang]})
            ec_soup = BeautifulSoup(fetch_url(url_lang), "html.parser")
            for h3 in ec_soup.select("h3.partytitle"):
                name = h3.text.strip()
                name_short = h3.attrs["id"]
                name = EC_MAPPING.get(name, name)
                try:
                    party = get_cached_obj(cached_parties, name)
                except AssertionError:
                    party = get_cached_obj(cached_parties, name_short)
                party.names[lang][sources.NAME_EC[lang]] = name
                party.names[lang][sources.NAME_EC_SHORT[lang]] = name_short
                party.links[lang][sources.NAME_EC[lang]] = "{}#{}".format(
                    url_lang, name_short)
                party.save()
                cached_parties[name].add(party)
                cached_parties[name_short].add(party)
Example #5
    def handle(self, *args, **options):
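        # Map forward sortation areas (FSAs) to ridings by cross-referencing
        # Wikipedia's postal-code lists with the House of Commons floorplan.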
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        fsas = set()
        index_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_in_Canada"
        index_all = BeautifulSoup(fetch_url(index_url), "html.parser")
        for link in tqdm(index_all.find_all("a", {"title": LIST})):
            index_letter = BeautifulSoup(
                fetch_url(urljoin(index_url, link.attrs["href"])),
                "html.parser")
            for fsa in tqdm(index_letter.find_all("b", text=XNX)):
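                # Greyed-out (#CCC) FSAs in Wikipedia's tables are not in
                # use; collect only the active ones.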
                if cssutils.parseStyle(fsa.parent.attrs.get(
                        "style", "")).color != "#CCC":
                    fsas.add(fsa.text)

        cached_ridings = get_cached_dict(
            models.Riding.objects.filter(
                election_ridings__date__year__gte=2015))
        person_id_to_riding = {}
        for person in BeautifulSoup(
                fetch_url(
                    "http://www.ourcommons.ca/Parliamentarians/en/floorplan"),
                "html.parser",
        ).select(".FloorPlanSeat .Person"):
            riding = get_cached_obj(cached_ridings,
                                    person.attrs["constituencyname"])
            person_id_to_riding[int(person.attrs["personid"])] = riding
            riding.post_code_fsas = set()

        for fsa in tqdm(fsas):
            result = fetch_url(
                "http://www.ourcommons.ca/Parliamentarians/en/FloorPlan/FindMPs?textCriteria={}"
                .format(fsa))
            try:
                result = result.decode()
            except AttributeError:
                pass
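            # FindMPs returns a comma-separated list of person IDs whose
            # ridings contain the queried FSA.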
            for person_id in filter(None, result.split(",")):
                try:
                    person_id_to_riding[int(person_id)].post_code_fsas.add(fsa)
                except KeyError:
                    logger.warning(
                        f"Person ID {person_id} expected for FSA {fsa}, but that wasn't found in the floorplan"
                    )

        for riding in person_id_to_riding.values():
            riding.post_code_fsas = sorted(riding.post_code_fsas)
            riding.save()

    def fetch_bills_session(self, session):
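        # Download the LEGISinfo XML dump for this session and create a Bill
        # record, with bilingual names and links, for each bill listed.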
        cached_committees = get_cached_dict(models.Committee.objects.filter(session=session))

        url = "http://www.parl.ca/LegisInfo/Home.aspx?download=xml&ParliamentSession={}-{}".format(session.parliament.number, session.number)
        soup = BeautifulSoup(fetch_url(url, use_cache=session.parliament.number < 42), "lxml")
        for bill_soup in tqdm(
            soup.find_all("bill"),
            desc=str(session),
            unit="bill",
        ):
            bill_number = bill_soup.select("billnumber")[0]
            bill_number = "-".join(filter(None, (
                bill_number.attrs["prefix"],
                bill_number.attrs["number"],
                bill_number.get("suffix", None),
            )))
            bill = models.Bill(
                session=session,
                slug=slugify("{}-{}".format(
                    session.slug,
                    bill_number,
                )),
            )
            for lang in (EN, FR):
                bill.links[lang][sources.NAME_LEGISINFO[lang]] = url_tweak(
                    "http://www.parl.gc.ca/LegisInfo/BillDetails.aspx",
                    update={
                        "billId": bill_soup.attrs["id"],
                        "Language": sources.LANG_LEGISINFO_UI[lang],
                    },
                )
                bill.names[lang][sources.NAME_LEGISINFO_NUMBER[lang]] = bill_number
                bill.names[lang][sources.NAME_LEGISINFO_TITLE[lang]] = bill_soup.select("billtitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
                title_short = bill_soup.select("shorttitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
                if title_short:
                    bill.names[lang][sources.NAME_LEGISINFO_TITLE_SHORT[lang]] = title_short
            bill.save()

            for event_soup in bill_soup.select("event"):
                try:
                    # They misspelled "acronym" in their XML.
                    committee_soup = event_soup.select("committee[accronym]")[0]
                    code = committee_soup.attrs["accronym"]
                    if code != "WHOL":
                        bill.committees.add(get_cached_obj(cached_committees, code))
                except IndexError:
                    pass

    def handle(self, *args, **options):
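        # Crawl the Library of Parliament's party list in both languages,
        # matching each entry to a Party by LoP item code or mapped name.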
        if options["verbosity"] > 1:
            logger.setLevel(logging.DEBUG)

        cached_parties = get_cached_dict(models.Party.objects.all())
        list_url = "https://lop.parl.ca/parlinfo/Lists/Party.aspx"
        for lang in (EN, FR):
            for a in tqdm(
                    BeautifulSoup(
                        fetch_url(
                            url_tweak(
                                list_url,
                                update={"Language": sources.LANG_LOP[lang]})),
                        "html.parser").select("td > a"),
                    desc="Augment Parties, LoP",
                    unit="party",
            ):
                if "_lnkParty_" not in a.attrs.get("id", ""):
                    continue
                url = url_tweak(
                    urljoin(list_url, a.attrs["href"]),
                    update={"Section": "ALL"},
                    remove=("MenuID", "MenuQuery"),
                )
                lop_item_code = sources.LOP_CODE.search(url).group().lower()
                party = models.Party.objects.filter(
                    lop_item_code=lop_item_code).first()
                if not party:
                    name = sources.WHITESPACE.sub(" ", a.text.strip())
                    name = LOP_LIST_MAPPING.get(name, name)
                    if name is None:
                        continue
                    party = get_cached_obj(cached_parties, name)
                party.links[lang][sources.NAME_LOP_PARTY[lang]] = url
                party.names[lang][sources.NAME_LOP_PARTY[lang]] = a.text.strip()
                party.lop_item_code = lop_item_code
                soup = BeautifulSoup(fetch_url(url), "html.parser")
                for link in soup.select("#ctl00_cphContent_dataLinks a"):
                    party.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                        "", link.text.strip())] = link.attrs["href"]
                party.save()
Example #8
    def personspeaking_open(self, element, lang):
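        # Resolve the speaker of a Hansard PersonSpeaking element to a
        # Parliamentarian, falling back through several name formats.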
        assert not self.person_speaking and not self.parliamentarian, "Person speaking opened, but wasn't flushed"
        affiliation = element.find("Affiliation")
        if affiliation is None:
            return
        try:
            self.person_speaking = normalize_whitespace(
                {
                    EN: affiliation.text,
                    FR: self.get_french_element(element).xpath(
                        "Affiliation")[0].text,
                },
                strip=True)
        except Exception:
            pass
        if not self.person_speaking or not self.person_speaking[EN]:
            self.person_speaking = normalize_whitespace(
                {
                    EN: element.getparent().attrib["ToCText"],
                    FR: self.get_french_element(
                        element.getparent(), by_attrib="id").attrib["ToCText"],
                },
                strip=True)

        if self.person_speaking[EN] not in UNMAPPED_NAMES:
            try:
                self.parliamentarian = get_cached_obj(
                    CACHED_PARLIAMENTARIANS, affiliation.attrib["DbId"])
            except (AssertionError, KeyError):
                try:
                    self.parliamentarian = get_cached_obj(
                        CACHED_PARLIAMENTARIANS, self.person_speaking[EN])
                except AssertionError:
                    for speaker_format in SPEAKER_FORMATS:
                        match = speaker_format.search(self.person_speaking[EN])
                        if match:
                            try:
                                self.parliamentarian = get_cached_obj(
                                    CACHED_PARLIAMENTARIANS,
                                    normalize_whitespace(
                                        match.groupdict()["name"], strip=True),
                                )
                            except AssertionError:
                                logger.warning(
                                    "UNMATCHED SPEAKER %s %s %s %s",
                                    self.sitting, affiliation.attrib,
                                    [self.person_speaking[EN],
                                     match.groupdict()["name"].strip()],
                                    element.getparent().attrib)
                            break
                    else:
                        logger.warning(
                            "SPEAKER FORMAT MISMATCH %s %s %s",
                            self.sitting, [self.person_speaking[EN]],
                            element.getparent().attrib)
                if self.parliamentarian:
                    CACHED_PARLIAMENTARIANS[affiliation.attrib["DbId"]].add(
                        self.parliamentarian)
        return {}

    def fetch_vote_participant(self, row, vote, soup):
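        # Parse one row of a House of Commons vote-detail table into a
        # HouseVoteParticipant, resolving the MP, party, and recorded vote.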
        hvp = models.HouseVoteParticipant(house_vote=vote)
        cells = row.find_all("td", recursive=False)
        mp_link = {EN: cells[0].a}
        mp_name = {EN: mp_link[EN].text.strip()}
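        # The riding appears in a trailing span, wrapped in brackets that
        # [1:-1] strips off.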
        riding_name = cells[0].find_all(
            "span", recursive=False)[1].text.strip()[1:-1]
        party_name = cells[1].text.strip()
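        # An icon in one of the next three cells marks the member's recorded
        # position; the resulting bool triple keys RECORDED_VOTE_MAPPING.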
        recorded_votes = (bool(cells[2].img), bool(cells[3].img),
                          bool(cells[4].img))

        try:
            without_honorific = HONORIFIC.sub("", mp_name[EN])
            parliamentarian = get_cached_obj(
                self.cached_parliamentarians,
                without_honorific,
            )
        except AssertionError:
            try:
                riding = get_cached_obj(self.cached_ridings,
                                        riding_name.replace("—", "--"))
            except AssertionError:
                logger.warning("ERR RIDING {}: {}".format(vote, riding_name))
                return
            try:
                parliamentarian = get_cached_obj(
                    self.cached_parliamentarians,
                    PARLIAMENTARIAN_MAPPING.get(
                        (without_honorific, riding.slug)),
                )
            except AssertionError:
                logger.warning("ERR PARLIMENTARIAN {}: {}".format(
                    vote, (without_honorific, riding.slug)))
                return
        if sources.NAME_HOC_VOTES[EN] not in parliamentarian.names[EN]:
            mp_link[FR] = soup[FR].find(
                "a",
                href=re.compile(r"/ParlDataWidgets/fr/affiliation/{}".format(
                    WIDGET_ID.search(cells[0].a.attrs["href"]).groups()[0])))
            mp_name[FR] = mp_link[FR].text.strip()
            for lang in (EN, FR):
                parliamentarian.names[lang][
                    sources.NAME_HOC_VOTES[lang]] = mp_name[lang]
                parliamentarian.links[lang][
                    sources.NAME_HOC_VOTES[lang]] = urljoin(
                        vote.links[lang][sources.NAME_HOC_VOTE_DETAILS[lang]],
                        mp_link[lang].attrs["href"])
            parliamentarian.save()
        hvp.parliamentarian = parliamentarian
        hvp.slug = f"{vote.slug}-{parliamentarian.slug}"

        try:
            party = get_cached_obj(self.cached_parties,
                                   PARTY_MAPPING.get(party_name, party_name))
            hvp.party = party
        except AssertionError:
            logger.warning("ERR PARTY {}".format(party_name))
            return

        try:
            hvp.recorded_vote = RECORDED_VOTE_MAPPING[recorded_votes]
        except KeyError:
            logger.warning("ERR VOTE {} {}: {}".format(vote, mp_name,
                                                       recorded_votes))
            return
        hvp.save()