def augment_riding(self, riding):
    # Pull the riding's historical name from its LoP riding-history page in each language.
    try:
        for lang in (FR, EN):
            url = riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = soup.select("h4")[0].text.split(", ")[0]
    except (KeyError, FetchFailure, FetchSuppressed) as e:
        logger.exception(e)
        return
    riding.save()

    # Link up any predecessor ("#previous") or successor ("#became") ridings.
    for tag_id in ("#previous", "#became"):
        related_ridings = soup.select(tag_id)
        if related_ridings:
            for link in related_ridings[0].parent.select("a"):
                match = re.search(
                    r"^(?P<name>.*) \((?P<province>.*)\)\((?P<daterange>.*)\)",
                    link.text,
                ).groupdict()
                riding_slug = slugify("{province}-{name}".format(**match))
                try:
                    related_riding = get_cached_obj(self.cached_ridings, riding_slug)
                except AssertionError:
                    province = get_cached_obj(self.cached_provinces, match["province"])
                    related_riding, created = models.Riding.objects.get_or_create(
                        slug=riding_slug,
                        province=province,
                    )
                    logger.debug("Auxiliary riding detected: {}".format(riding_slug))
                for lang in (EN, FR):
                    if sources.NAME_LOP_RIDING_HISTORY[lang] not in related_riding.links[lang]:
                        related_riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = url_tweak(
                            urljoin(url, link.attrs["href"]),
                            update={"Language": sources.LANG_LOP[lang]},
                        )
                        related_riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = BeautifulSoup(
                            fetch_url(related_riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]),
                            "html.parser",
                        ).select("h4")[0].text.split(", ")[0]
                related_riding.save()
                riding.related_historically.add(related_riding)
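# For orientation: get_cached_dict/get_cached_obj are used throughout these
# commands but defined elsewhere in the repo. A minimal sketch of their
# apparent behaviour, inferred from usage (the *_sketch names are
# hypothetical, not the real implementations): the dict maps every name and
# slug an object is known by to a set of candidates, and the lookup asserts
# exactly one match -- which is why callers treat AssertionError as "not found".
from collections import defaultdict

def get_cached_dict_sketch(queryset):
    cached = defaultdict(set)
    for obj in queryset:
        cached[obj.slug].add(obj)
        for names in obj.names.values():  # obj.names: {lang: {source: name}}
            for name in names.values():
                cached[name].add(obj)
    return cached

def get_cached_obj_sketch(cached, name):
    matches = cached.get(name) or set()
    assert len(matches) == 1, "Expected exactly one match for {!r}".format(name)
    return next(iter(matches))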
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    list_url = "https://en.wikipedia.org/wiki/List_of_federal_political_parties_in_Canada"
    for tr in tqdm(
        BeautifulSoup(fetch_url(list_url), "html.parser").select("table.wikitable > tr"),
        desc="Augment Parties, Wikipedia",
        unit="party",
    ):
        cells = tr.find_all("td", recursive=False)
        if not cells:
            continue  # header row
        for link in cells[1].find_all("a"):
            name = link.attrs["title"].strip()
            name = WIKI_MAPPING.get(name, name)
            if name is None:
                continue  # explicitly mapped to nothing
            try:
                party = get_cached_obj(cached_parties, name)
            except AssertionError:
                logger.warning("Wikipedia mentions {}, but we don't have a mapping for it".format(
                    link.attrs["title"].strip()))
                continue
            self.augment_party_by_wikipedia(
                party,
                urljoin(list_url, link.attrs["href"]),
                cells[0].attrs["style"],  # style attribute presumably carrying the party colour
            )
    # Any party left uncoloured falls back to a neutral grey.
    models.Party.objects.filter(color="").update(color="#666666")
def augment_parliamentarians_open_parliament(self):
    cached_provinces = get_cached_dict(models.Province.objects.all())
    cached_parliamentarians = get_cached_dict(
        models.Parliamentarian.objects.filter(
            Q(election_candidates__election_riding__general_election__parliament__number__gte=35) |
            Q(election_candidates__election_riding__by_election__parliament__number__gte=35)
        )
    )
    for url in (
        "https://openparliament.ca/politicians/former/",
        "https://openparliament.ca/politicians/",
    ):
        for row in tqdm(
            BeautifulSoup(fetch_url(url), "html.parser").select(".content > .row"),
            desc="Augment Parliamentarians, OpenParliament.ca",
            unit="parliamentarian",
        ):
            columns = row.find_all("div", recursive=False)
            if len(columns) == 2 and columns[0].find("h2") and columns[1].find("a"):
                province_name = columns[0].find("h2").text.strip()
                province = get_cached_obj(
                    cached_provinces,
                    PROVINCE_MAPPING.get(province_name, province_name),
                )
                if sources.NAME_OP[EN] not in province.names[EN]:
                    province.names[EN][sources.NAME_OP[EN]] = province_name
                    province.save()
                for link in columns[1].select('a[href^="/politicians/"]'):
                    if link.attrs["href"] not in ("/politicians/", "/politicians/former/"):
                        self.augment_parliamentarian_open_parliament(
                            get_cached_obj(
                                cached_parliamentarians,
                                PARLIAMENTARIAN_MAPPING.get(
                                    (link.text, province.slug),
                                    slugify(link.text),
                                ),
                            ),
                            urljoin(url, link.attrs["href"]),
                        )
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    url = "http://www.elections.ca/content.aspx?dir=par&document=index&section=pol"
    for lang in (EN, FR):
        url_lang = url_tweak(url, update={"lang": sources.LANG_EC[lang]})
        ec_soup = BeautifulSoup(fetch_url(url_lang), "html.parser")
        for h3 in ec_soup.select("h3.partytitle"):
            name = h3.text.strip()
            name_short = h3.attrs["id"]
            name = EC_MAPPING.get(name, name)
            try:
                party = get_cached_obj(cached_parties, name)
            except AssertionError:
                party = get_cached_obj(cached_parties, name_short)
            party.names[lang][sources.NAME_EC[lang]] = name
            party.names[lang][sources.NAME_EC_SHORT[lang]] = name_short
            party.links[lang][sources.NAME_EC[lang]] = "{}#{}".format(url_lang, name_short)
            party.save()
            # Register both the long and short names so later lookups hit.
            cached_parties[name].add(party)
            cached_parties[name_short].add(party)
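# For orientation: url_tweak (defined elsewhere in the repo) appears to
# rewrite a URL's query string, setting the keys in `update` and dropping
# those in `remove`. A minimal standard-library sketch of that behaviour
# (url_tweak_sketch is hypothetical):
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def url_tweak_sketch(url, update=None, remove=()):
    parsed = urlparse(url)
    query = parse_qs(parsed.query)
    for key, value in (update or {}).items():
        query[key] = [value]
    for key in remove:
        query.pop(key, None)
    return urlunparse(parsed._replace(query=urlencode(query, doseq=True)))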
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)

    # Collect all forward sortation areas (FSAs) from Wikipedia's per-letter
    # postal code indexes, skipping entries greyed out (#CCC) as not in use.
    fsas = set()
    index_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_in_Canada"
    index_all = BeautifulSoup(fetch_url(index_url), "html.parser")
    for link in tqdm(index_all.find_all("a", {"title": LIST})):
        index_letter = BeautifulSoup(
            fetch_url(urljoin(index_url, link.attrs["href"])),
            "html.parser",
        )
        for fsa in tqdm(index_letter.find_all("b", text=XNX)):
            if cssutils.parseStyle(fsa.parent.attrs.get("style", "")).color != "#CCC":
                fsas.add(fsa.text)

    # Map each MP's person ID on the House floorplan to their riding.
    cached_ridings = get_cached_dict(
        models.Riding.objects.filter(election_ridings__date__year__gte=2015))
    person_id_to_riding = {}
    for person in BeautifulSoup(
        fetch_url("http://www.ourcommons.ca/Parliamentarians/en/floorplan"),
        "html.parser",
    ).select(".FloorPlanSeat .Person"):
        riding = get_cached_obj(cached_ridings, person.attrs["constituencyname"])
        person_id_to_riding[int(person.attrs["personid"])] = riding
        riding.post_code_fsas = set()

    # Ask the FindMPs widget which MPs serve each FSA.
    for fsa in tqdm(fsas):
        result = fetch_url(
            "http://www.ourcommons.ca/Parliamentarians/en/FloorPlan/FindMPs?textCriteria={}".format(fsa))
        try:
            result = result.decode()
        except AttributeError:
            pass  # fetch_url may already return str
        for person_id in filter(None, result.split(",")):
            try:
                person_id_to_riding[int(person_id)].post_code_fsas.add(fsa)
            except KeyError:
                logger.warning(
                    f"Person ID {person_id} expected for FSA {fsa}, but that wasn't found in the floorplan")

    for riding in person_id_to_riding.values():
        riding.post_code_fsas = sorted(riding.post_code_fsas)
        riding.save()
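# The LIST and XNX constants above are module-level patterns defined
# elsewhere; judging from usage they look roughly like the following
# (hypothetical values): LIST matches Wikipedia's per-letter index page
# titles, and XNX matches a letter-digit-letter forward sortation area.
# LIST = re.compile(r"^List of postal codes of Canada: [A-Z]$")
# XNX = re.compile(r"^[A-Z]\d[A-Z]$")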
def fetch_bills_session(self, session):
    cached_committees = get_cached_dict(models.Committee.objects.filter(session=session))
    url = "http://www.parl.ca/LegisInfo/Home.aspx?download=xml&ParliamentSession={}-{}".format(
        session.parliament.number, session.number)
    # Only trust the cache for parliaments that have already ended.
    soup = BeautifulSoup(fetch_url(url, use_cache=session.parliament.number < 42), "lxml")
    for bill_soup in tqdm(
        soup.find_all("bill"),
        desc=str(session),
        unit="bill",
    ):
        bill_number = bill_soup.select("billnumber")[0]
        # e.g. prefix "C", number "11", no suffix -> "C-11"
        bill_number = "-".join(filter(None, (
            bill_number.attrs["prefix"],
            bill_number.attrs["number"],
            bill_number.get("suffix", None),
        )))
        bill = models.Bill(
            session=session,
            slug=slugify("{}-{}".format(session.slug, bill_number)),
        )
        for lang in (EN, FR):
            bill.links[lang][sources.NAME_LEGISINFO[lang]] = url_tweak(
                "http://www.parl.gc.ca/LegisInfo/BillDetails.aspx",
                update={
                    "billId": bill_soup.attrs["id"],
                    "Language": sources.LANG_LEGISINFO_UI[lang],
                },
            )
            bill.names[lang][sources.NAME_LEGISINFO_NUMBER[lang]] = bill_number
            bill.names[lang][sources.NAME_LEGISINFO_TITLE[lang]] = bill_soup.select(
                "billtitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
            title_short = bill_soup.select(
                "shorttitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
            if title_short:
                bill.names[lang][sources.NAME_LEGISINFO_TITLE_SHORT[lang]] = title_short
        bill.save()
        for event_soup in bill_soup.select("event"):
            try:
                committee_soup = event_soup.select("committee[accronym]")[0]  # they misspelled "acronym" in their XML
                code = committee_soup.attrs["accronym"]
                if code != "WHOL":  # skip the Committee of the Whole
                    bill.committees.add(get_cached_obj(cached_committees, code))
            except IndexError:
                pass  # event has no committee
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    list_url = "https://lop.parl.ca/parlinfo/Lists/Party.aspx"
    for lang in (EN, FR):
        for a in tqdm(
            BeautifulSoup(
                fetch_url(url_tweak(list_url, update={"Language": sources.LANG_LOP[lang]})),
                "html.parser",
            ).select("td > a"),
            desc="Augment Parties, LoP",
            unit="party",
        ):
            if "_lnkParty_" not in a.attrs.get("id", ""):
                continue
            url = url_tweak(
                urljoin(list_url, a.attrs["href"]),
                update={"Section": "ALL"},
                remove=("MenuID", "MenuQuery"),
            )
            lop_item_code = sources.LOP_CODE.search(url).group().lower()
            party = models.Party.objects.filter(lop_item_code=lop_item_code).first()
            if not party:
                name = sources.WHITESPACE.sub(" ", a.text.strip())
                name = LOP_LIST_MAPPING.get(name, name)
                if name is None:
                    continue
                party = get_cached_obj(cached_parties, name)
            party.links[lang][sources.NAME_LOP_PARTY[lang]] = url
            party.names[lang][sources.NAME_LOP_PARTY[lang]] = a.text.strip()
            party.lop_item_code = lop_item_code
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            for link in soup.select("#ctl00_cphContent_dataLinks a"):
                party.links[lang][sources.AVAILABILITY_WARNINGS.sub("", link.text.strip())] = link.attrs["href"]
            party.save()
def personspeaking_open(self, element, lang):
    assert not self.person_speaking and not self.parliamentarian, "Person speaking opened, but wasn't flushed"
    affiliation = element.find("Affiliation")
    if affiliation is None:
        return
    try:
        self.person_speaking = normalize_whitespace({
            EN: affiliation.text,
            FR: self.get_french_element(element).xpath("Affiliation")[0].text,
        }, strip=True)
    except (AttributeError, IndexError, TypeError):  # missing or empty French affiliation
        pass
    if not self.person_speaking or not self.person_speaking[EN]:
        # Fall back on the table-of-contents text when the affiliation is empty.
        self.person_speaking = normalize_whitespace({
            EN: element.getparent().attrib["ToCText"],
            FR: self.get_french_element(element.getparent(), by_attrib="id").attrib["ToCText"],
        }, strip=True)
    if self.person_speaking[EN] not in UNMAPPED_NAMES:
        try:
            self.parliamentarian = get_cached_obj(CACHED_PARLIAMENTARIANS, affiliation.attrib["DbId"])
        except (KeyError, AssertionError):
            try:
                self.parliamentarian = get_cached_obj(CACHED_PARLIAMENTARIANS, self.person_speaking[EN])
            except AssertionError:
                # Try each known speaker-line format before giving up.
                for speaker_format in SPEAKER_FORMATS:
                    match = speaker_format.search(self.person_speaking[EN])
                    if match:
                        try:
                            self.parliamentarian = get_cached_obj(
                                CACHED_PARLIAMENTARIANS,
                                normalize_whitespace(match.groupdict()["name"], strip=True),
                            )
                        except AssertionError:
                            logger.warning("UNMATCHED SPEAKER: {} {} {} {}".format(
                                self.sitting,
                                affiliation.attrib,
                                [self.person_speaking[EN], match.groupdict()["name"].strip()],
                                element.getparent().attrib,
                            ))
                        break
                else:
                    logger.warning("SPEAKER FORMAT MISMATCH: {} {} {}".format(
                        self.sitting,
                        [self.person_speaking[EN]],
                        element.getparent().attrib,
                    ))
    if self.parliamentarian:
        CACHED_PARLIAMENTARIANS[affiliation.attrib["DbId"]].add(self.parliamentarian)
    return {}
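# For orientation: normalize_whitespace (defined elsewhere) is called above
# both with a plain string and with an {EN: ..., FR: ...} dict, so it
# presumably collapses whitespace runs and maps itself over dict values.
# A minimal sketch (normalize_whitespace_sketch is hypothetical):
import re

def normalize_whitespace_sketch(value, strip=False):
    if isinstance(value, dict):
        return {key: normalize_whitespace_sketch(text, strip=strip) for key, text in value.items()}
    normalized = re.sub(r"\s+", " ", value)
    return normalized.strip() if strip else normalized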
def fetch_vote_participant(self, row, vote, soup):
    hvp = models.HouseVoteParticipant(house_vote=vote)
    cells = row.find_all("td", recursive=False)
    mp_link = {EN: cells[0].a}
    mp_name = {EN: mp_link[EN].text.strip()}
    riding_name = cells[0].find_all("span", recursive=False)[1].text.strip()[1:-1]  # strip the wrapping parentheses
    party_name = cells[1].text.strip()
    recorded_votes = (bool(cells[2].img), bool(cells[3].img), bool(cells[4].img))
    try:
        without_honorific = HONORIFIC.sub("", mp_name[EN])
        parliamentarian = get_cached_obj(self.cached_parliamentarians, without_honorific)
    except AssertionError:
        try:
            riding = get_cached_obj(self.cached_ridings, riding_name.replace("—", "--"))
        except AssertionError:
            logger.warning("ERR RIDING {}: {}".format(vote, riding_name))
            return
        try:
            parliamentarian = get_cached_obj(
                self.cached_parliamentarians,
                PARLIAMENTARIAN_MAPPING.get((without_honorific, riding.slug)),
            )
        except AssertionError:
            logger.warning("ERR PARLIAMENTARIAN {}: {}".format(vote, (without_honorific, riding.slug)))
            return
    if sources.NAME_HOC_VOTES[EN] not in parliamentarian.names[EN]:
        mp_link[FR] = soup[FR].find(
            "a",
            href=re.compile(r"/ParlDataWidgets/fr/affiliation/{}".format(
                WIDGET_ID.search(cells[0].a.attrs["href"]).groups()[0])),
        )
        mp_name[FR] = mp_link[FR].text.strip()
        for lang in (EN, FR):
            parliamentarian.names[lang][sources.NAME_HOC_VOTES[lang]] = mp_name[lang]
            parliamentarian.links[lang][sources.NAME_HOC_VOTES[lang]] = urljoin(
                vote.links[lang][sources.NAME_HOC_VOTE_DETAILS[lang]],
                mp_link[lang].attrs["href"],
            )
        parliamentarian.save()
    hvp.parliamentarian = parliamentarian
    hvp.slug = f"{vote.slug}-{parliamentarian.slug}"
    try:
        hvp.party = get_cached_obj(self.cached_parties, PARTY_MAPPING.get(party_name, party_name))
    except AssertionError:
        logger.warning("ERR PARTY {}".format(party_name))
        return
    try:
        hvp.recorded_vote = RECORDED_VOTE_MAPPING[recorded_votes]
    except KeyError:
        logger.warning("ERR VOTE {} {}: {}".format(vote, mp_name, recorded_votes))
        return
    hvp.save()