def augment_parliamentarian_open_parliament(self, parliamentarian, url):
    soup = BeautifulSoup(fetch_url(url), "html.parser")
    for lang in (EN, FR):
        parliamentarian.names[lang][sources.NAME_OP[lang]] = soup.find("h1").text
    parliamentarian.links[EN][sources.NAME_OP[EN]] = url
    for link in soup.select("ul.bulleted a"):
        if link.text == "Wikipedia":
            wiki_soup = BeautifulSoup(
                fetch_url(link.attrs["href"], allow_redirects=True),
                "html.parser",
            )
            parliamentarian.links[EN][sources.NAME_WIKI[EN]] = urljoin(
                link.attrs["href"],
                wiki_soup.select("#ca-nstab-main a")[0].attrs["href"],
            )
            try:
                parliamentarian.links[FR][sources.NAME_WIKI[FR]] = wiki_soup.select(
                    ".interwiki-fr a.interlanguage-link-target"
                )[0].attrs["href"]
            except IndexError:
                pass  # No French-language Wikipedia equivalent
        elif link.text == "Twitter":
            for lang in (EN, FR):
                parliamentarian.links[lang][sources.NAME_TWITTER[lang]] = link.attrs["href"]
    parliamentarian.save()
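# `fetch_url` is used throughout this repo but defined elsewhere. As a rough
# sketch of the assumed contract (an HTTP GET with an on-disk cache; the real
# helper presumably does more, e.g. throttling and raising FetchFailure /
# FetchSuppressed), with CACHE_DIR being a hypothetical location:
import hashlib
import os

import requests

CACHE_DIR = "/tmp/fetch-cache"  # hypothetical

def fetch_url_sketch(url, allow_redirects=False, use_cache=True):
    cache_path = os.path.join(CACHE_DIR, hashlib.sha256(url.encode()).hexdigest())
    if use_cache and os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return f.read()  # Serve from cache without hitting the network
    response = requests.get(url, allow_redirects=allow_redirects)
    response.raise_for_status()
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(cache_path, "wb") as f:
        f.write(response.content)
    return response.content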
def fetch_votes_session(self, session, list_url, remote_session_id):
    session.links[EN][sources.NAME_HOC_VOTES[EN]] = url_tweak(
        list_url, update={"sessionId": remote_session_id})
    session.links[FR][sources.NAME_HOC_VOTES[FR]] = get_french_parl_url(
        session.links[EN][sources.NAME_HOC_VOTES[EN]],
        BeautifulSoup(
            fetch_url(session.links[EN][sources.NAME_HOC_VOTES[EN]]), "lxml"),
    )
    session.save()
    parl_soup = BeautifulSoup(
        fetch_url(
            url_tweak(
                "http://www.ourcommons.ca/Parliamentarians/en/HouseVotes/ExportVotes?output=XML",
                update={"sessionId": remote_session_id},
            ),
            use_cache=session.parliament.number < 42,
        ),
        "lxml",
    )
    for overview in tqdm(
            parl_soup.find_all("voteparticipant"),  # Oddly named considering the previous format we found this in
            desc=str(session),
            unit="vote",
    ):
        self.fetch_vote(overview, session)
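# `get_french_parl_url` is defined elsewhere; from its call sites it takes an
# English parl.gc.ca URL plus that page's soup and returns the French
# equivalent. A sketch under the assumption that the page carries a language
# toggle link (the selector here is a guess, not the real one):
from urllib.parse import urljoin

def get_french_parl_url_sketch(url_en, soup):
    toggles = soup.select("a[hreflang=fr]")  # hypothetical selector
    if not toggles:
        raise ValueError("No language toggle found on {}".format(url_en))
    return urljoin(url_en, toggles[0].attrs["href"])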
def fetch_hoc_committees_session(self, session, session_url):
    for link in tqdm(
            BeautifulSoup(
                fetch_url(session_url),
                "html.parser",
            ).select(".committees-list .accordion-content a"),
            desc=str(session),
            unit="committee",
    ):
        committee_url = {EN: url_tweak(urljoin(session_url, link.attrs["href"]))}
        committee = models.Committee(
            session=session,
            chamber=models.Committee.CHAMBER_HOC,
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(fetch_url(committee_url[lang]), "html.parser")
            committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]] = soup.select(
                ".institution-brand")[0].text
            committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[lang]] = soup.select(
                ".header-title.current-committee-profile")[0].text
            committee.links[lang][sources.NAME_PARL_COMMITTEE[lang]] = committee_url[lang]
            if not committee.slug:
                if "Joint" in committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]]:
                    committee.chamber = models.Committee.CHAMBER_JOINT
                committee.slug = self.get_slug(committee)
            # The EN pass discovers the FR URL via the page's language toggle,
            # so this loop must run in (EN, FR) order
            committee_url[FR] = get_french_parl_url(committee_url[lang], soup)
        committee.save()
def fetch_ridings(self, parliament):
    logger.debug("Fetch ridings, {}".format(parliament))
    skipped_codes = set()
    codes_to_ridings = dict()
    soup = BeautifulSoup(
        fetch_url(parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]),
        "html.parser",
    )
    for select in (
            "#ctl00_cphContent_ctl04_repGeneralElection_ctl00_grdMembers tr",
            "#ctl00_cphContent_ctl04_pnlSectionByElectionContent tr",
    ):
        for row in soup.select(select):
            cells = row.find_all("td", recursive=False)
            if cells:
                riding_name, province_name = sources.LOP_RIDING_AND_PROVINCE.search(
                    cells[1].text.strip()).groups()
                province_slug = slugify(province_name)
                riding_slug = slugify(" ".join((province_slug, riding_name)))
                code = sources.LOP_CODE.search(cells[0].a.attrs["href"]).group().lower()
                if riding_slug not in self.known_ridings:
                    try:
                        province = self.cache_provinces[province_slug]
                    except KeyError:
                        province = models.Province.objects.get(slug=province_slug)
                        self.cache_provinces[province_slug] = province
                    riding, created = models.Riding.objects.get_or_create(
                        slug=riding_slug,
                        province=province,
                    )
                    if created:
                        riding.names[EN][sources.NAME_LOP_PARLIAMENT[EN]] = riding_name
                        riding.save()
                    self.known_ridings.add(riding_slug)
                    codes_to_ridings[code] = riding
                else:
                    skipped_codes.add(code)
    # Second pass over the French page to pick up the French riding names
    soup = BeautifulSoup(
        fetch_url(parliament.links[FR][sources.NAME_LOP_PARLIAMENT[FR]]),
        "html.parser",
    )
    for select in (
            "#ctl00_cphContent_ctl04_repGeneralElection_ctl00_grdMembers tr",
            "#ctl00_cphContent_ctl04_pnlSectionByElectionContent tr",
    ):
        for row in soup.select(select):
            cells = row.find_all("td", recursive=False)
            if cells:
                code = sources.LOP_CODE.search(cells[0].a.attrs["href"]).group().lower()
                if code not in skipped_codes:
                    riding_name, province_name = sources.LOP_RIDING_AND_PROVINCE.search(
                        cells[1].text.strip()).groups()
                    riding = codes_to_ridings[code]
                    riding.names[FR][sources.NAME_LOP_PARLIAMENT[FR]] = riding_name
                    riding.save()
def fetch_riding(self, riding, url):
    for lang in (EN, FR):
        riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = url_tweak(
            url, update={"Language": sources.LANG_LOP[lang]})
        try:
            fetch_url(riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]])
        except Exception as e:
            logger.exception(e)
    riding.save()
    self.cached_ridings[riding.slug] = riding
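# `url_tweak` is another repo-wide helper; judging by its call sites, `update`
# sets query parameters and `remove` drops them. A minimal stdlib-only sketch
# of that assumed behaviour:
from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

def url_tweak_sketch(url, update=None, remove=()):
    scheme, netloc, path, query, fragment = urlsplit(url)
    params = {k: v[0] for k, v in parse_qs(query).items()}
    for key in remove:
        params.pop(key, None)
    params.update(update or {})
    return urlunsplit((scheme, netloc, path, urlencode(params), fragment))

# e.g. url_tweak_sketch("http://x/?a=1&b=2", update={"a": "3"}, remove=("b",))
#      == "http://x/?a=3"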
def augment_riding(self, riding):
    try:
        for lang in (FR, EN):
            url = riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = soup.select(
                "h4")[0].text.split(", ")[0]
    except (KeyError, FetchFailure, FetchSuppressed) as e:
        logger.exception(e)
        return
    riding.save()
    # The loop above runs (FR, EN), so soup and url now hold the English page
    for tag_id in ("#previous", "#became"):
        related_ridings = soup.select(tag_id)
        if related_ridings:
            for link in related_ridings[0].parent.select("a"):
                match = re.search(
                    r"^(?P<name>.*) \((?P<province>.*)\)\((?P<daterange>.*)\)",
                    link.text).groupdict()
                riding_slug = slugify("{province}-{name}".format(**match))
                try:
                    related_riding = get_cached_obj(self.cached_ridings, riding_slug)
                except AssertionError:
                    province = get_cached_obj(self.cached_provinces, match["province"])
                    related_riding, created = models.Riding.objects.get_or_create(
                        slug=riding_slug, province=province)
                    logger.debug("Auxiliary riding detected: {}".format(riding_slug))
                for lang in (EN, FR):
                    if sources.NAME_LOP_RIDING_HISTORY[lang] not in related_riding.links[lang]:
                        related_riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = url_tweak(
                            urljoin(url, link.attrs["href"]),
                            update={"Language": sources.LANG_LOP[lang]},
                        )
                        related_riding.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = BeautifulSoup(
                            fetch_url(related_riding.links[lang][sources.NAME_LOP_RIDING_HISTORY[lang]]),
                            "html.parser",
                        ).select("h4")[0].text.split(", ")[0]
                related_riding.save()
                riding.related_historically.add(related_riding)
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    fsas = set()
    index_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_in_Canada"
    index_all = BeautifulSoup(fetch_url(index_url), "html.parser")
    for link in tqdm(index_all.findAll("a", {"title": LIST})):
        index_letter = BeautifulSoup(
            fetch_url(urljoin(index_url, link.attrs["href"])), "html.parser")
        for fsa in tqdm(index_letter.findAll("b", text=XNX)):
            # Greyed-out FSAs (#CCC) are listed but not yet in use
            if cssutils.parseStyle(fsa.parent.attrs.get("style", "")).color != "#CCC":
                fsas.add(fsa.text)
    cached_ridings = get_cached_dict(
        models.Riding.objects.filter(election_ridings__date__year__gte=2015))
    person_id_to_riding = {}
    for person in BeautifulSoup(
            fetch_url("http://www.ourcommons.ca/Parliamentarians/en/floorplan"),
            "html.parser",
    ).select(".FloorPlanSeat .Person"):
        riding = get_cached_obj(cached_ridings, person.attrs["constituencyname"])
        person_id_to_riding[int(person.attrs["personid"])] = riding
        riding.post_code_fsas = set()
    for fsa in tqdm(fsas):
        result = fetch_url(
            "http://www.ourcommons.ca/Parliamentarians/en/FloorPlan/FindMPs?textCriteria={}".format(fsa))
        try:
            result = result.decode()
        except AttributeError:
            pass  # Already a str
        for person_id in filter(None, result.split(",")):
            try:
                person_id_to_riding[int(person_id)].post_code_fsas.add(fsa)
            except (KeyError, ValueError):
                logger.warning(
                    f"Person ID {person_id} expected for FSA {fsa}, but that wasn't found in the floorplan")
    for riding in person_id_to_riding.values():
        riding.post_code_fsas = sorted(riding.post_code_fsas)
        riding.save()
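# `get_cached_dict` / `get_cached_obj` are assumed to index a queryset by the
# names its objects are known under, so the scrapers can resolve free-text
# names without repeated database hits. A sketch of the assumed contract: the
# `names` layout is inferred from its use elsewhere in this repo, and
# `get_cached_obj` raising AssertionError on a non-unique match is inferred
# from the `except AssertionError` call sites.
from django.utils.text import slugify  # the repo may use its own slugify

def get_cached_dict_sketch(queryset):
    cached = {}
    for obj in queryset:
        for names in obj.names.values():  # names is {lang: {source: name}}
            for name in names.values():
                cached.setdefault(slugify(name), []).append(obj)
    return cached

def get_cached_obj_sketch(cached, name):
    matches = cached.get(slugify(name), [])
    assert len(matches) == 1, "Expected exactly one match for {}".format(name)
    return matches[0]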
def augment_ridings_ec(self):
    for row in tqdm(
            BeautifulSoup(
                fetch_url(
                    url_tweak(
                        "http://www.elections.ca/Scripts/vis/SearchProvinces?PROV=CA&PROVID=99999&QID=-1&PAGEID=20",
                        update={"L": sources.LANG_EC[EN]})),
                "html.parser").select("table tr")):
        cells = row.find_all("td", recursive=False)
        if cells:
            riding = models.Riding.objects.get(slug=slugify("{} {}".format(
                cells[1].text,
                cells[0].text,
            )))
            riding.electoral_district_number = parse_qs(
                urlparse(cells[0].a.attrs["href"]).query)["ED"][0]
            self.cached_ridings[riding.electoral_district_number] = riding
            riding.save()
    for riding in tqdm(
            models.Riding.objects.filter(electoral_district_number__isnull=False),
            desc="Augment Ridings, Elections Canada",
            unit="riding",
    ):
        self.augment_riding_ec(riding)
def fetch_item(self, url):
    url[EN] = ensure_trailing_slash(url[EN])
    soup = BeautifulSoup(fetch_url(url[EN]), "html.parser")
    url[FR] = ensure_trailing_slash(
        urljoin(url[EN], one_or_none(soup.select("#language-toggle")).attrs["href"]))
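# `ensure_trailing_slash` and `one_or_none` are small helpers defined
# elsewhere; sketches of what they are assumed to do:
def ensure_trailing_slash_sketch(url):
    return url if url.endswith("/") else url + "/"

def one_or_none_sketch(items):
    assert len(items) <= 1, "Expected at most one item"
    return items[0] if items else None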
def fetch_hansard(self, sitting):
    # Fetch and parse the hansard XML
    self.tree = {
        lang: etree.ElementTree(
            etree.fromstring(
                fetch_url(sitting.links[lang][sources.NAME_HOC_HANSARD_XML[lang]])))
        for lang in (EN, FR)
    }

    # Strip out incorrect elements
    for lang in (EN, FR):
        strip_empty_elements(self.tree[lang].getroot())
        for duplicate in self.tree[lang].xpath("//PersonSpeaking/Affiliation[2]"):
            duplicate.getparent().remove(duplicate)
        merge_adjacent_quotes(self.tree[lang].getroot())

    # If the structure checks out, parse down from the root
    self.floor_language = None
    self.hansard_block = None
    self.hansard_block_number = 0
    self.metadata = {}
    self.parliamentarian = None
    self.person_speaking = None
    self.previous_hansard_block = None
    self.sitting = sitting
    self.timestamp = datetimeparse(
        self.tree[EN].find("//ExtractedItem[@Name='MetaCreationTime']").text)
    self.new_hansard_block()
    self.parse_element(self.tree[EN].getroot())
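# `strip_empty_elements` and `merge_adjacent_quotes` are hansard-specific
# cleanup passes defined elsewhere. A sketch of the assumed shape of the
# former (depth-first removal of childless, textless elements); the real
# helper presumably also preserves tail text:
def strip_empty_elements_sketch(element):
    for child in list(element):
        strip_empty_elements_sketch(child)
    if (len(element) == 0
            and not (element.text or "").strip()
            and element.getparent() is not None):
        element.getparent().remove(element)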
def augment_party_by_wikipedia(self, party, link_en, style):
    party.color = cssutils.parseStyle(style).background
    # Expand three-digit hex colors (e.g. #abc) to six digits (#aabbcc)
    party.color = re.sub(
        r"^#([0-9a-f])([0-9a-f])([0-9a-f])$",
        r"#\1\1\2\2\3\3",
        party.color,
        flags=re.I,
    )
    if party.color == "#DCDCDC":
        party.color = ""
    try:
        party.links[EN][sources.NAME_WIKI[EN]] = link_en
        soup_en = BeautifulSoup(fetch_url(link_en), "html.parser")
        party.names[EN][sources.NAME_WIKI[EN]] = soup_en.select("#firstHeading")[0].text.strip()
        link_fr = soup_en.select(".interwiki-fr a.interlanguage-link-target")[0].attrs["href"]
        party.links[FR][sources.NAME_WIKI[FR]] = link_fr
        soup_fr = BeautifulSoup(fetch_url(link_fr), "html.parser")
        party.names[FR][sources.NAME_WIKI[FR]] = soup_fr.select("#firstHeading")[0].text.strip()
    except IndexError:
        logger.debug("{} doesn't have a French-language equivalent in Wikipedia at the moment".format(party))
    party.save()
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    list_url = "https://en.wikipedia.org/wiki/List_of_federal_political_parties_in_Canada"
    for tr in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select("table.wikitable > tr"),
            desc="Augment Parties, Wikipedia",
            unit="party",
    ):
        cells = tr.find_all("td", recursive=False)
        if cells:
            for link in cells[1].find_all("a"):
                name = link.attrs["title"].strip()
                name = WIKI_MAPPING.get(name, name)
                if name is None:
                    continue
                try:
                    party = get_cached_obj(cached_parties, name)
                except AssertionError:
                    logger.warning("Wikipedia mentions {}, but we don't have a mapping for it".format(
                        link.attrs["title"].strip()))
                    continue
                self.augment_party_by_wikipedia(
                    party,
                    urljoin(list_url, link.attrs["href"]),
                    cells[0].attrs["style"],
                )
    models.Party.objects.filter(color="").update(color="#666666")
def augment_election_wiki(self, election):
    soup = BeautifulSoup(
        fetch_url(url_tweak(
            election.links[EN][sources.NAME_WIKI[EN]],
            update={"action": "edit"},
        )),
        "html.parser",
    )
    # Get the info box
    page_source = soup.select("#wpTextbox1")[0].text
    infobox_lines = re.search(
        "{{Infobox election\n(.*?)\n}}",
        page_source,
        re.S | re.I,
    ).groups()[0].splitlines()
    infobox = {"parties": []}
    for key, value in [
            line[2:].split("=", 1)
            for line in infobox_lines
            if line.startswith("| ")
    ]:
        key = key.strip()
        value = value.strip()
        try:
            # Keys ending in a digit (e.g. party1, seats2) describe a party
            party_place = int(key[-1]) - 1
            while len(infobox["parties"]) <= party_place:
                infobox["parties"].append({})
            infobox["parties"][party_place][key[:-1]] = value
        except ValueError:
            infobox[key] = value
    election.wiki_info_box = infobox
    election.save()
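# For reference, a wikitext fragment shaped like the following (a made-up
# example, not taken from a real page):
#
#   {{Infobox election
#   | election_name = Hypothetical election
#   | party1 = Party A
#   | seats1 = 99
#   | party2 = Party B
#   }}
#
# would parse into:
#
#   {"election_name": "Hypothetical election",
#    "parties": [{"party": "Party A", "seats": "99"},
#                {"party": "Party B"}]}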
def fetch_provinces(self):
    url = url_tweak(self.ROOT_URL, update={"Language": sources.LANG_LOP[EN]})
    for link in tqdm(
            BeautifulSoup(
                fetch_url(url),
                "html.parser",
            ).select("#ctl00_pnlContent a"),
            desc="Fetch Provinces, LoP (EN)",
            unit="province",
    ):
        if link.attrs.get("id", "").startswith("ctl00_cphContent_repProvinces_"):
            province, created = models.Province.objects.get_or_create(
                slug=slugify(link.text.strip()))
            url_en = url_tweak(
                urljoin(url, link.attrs["href"]),
                remove=("MenuID", "MenuQuery"),
                update={"Section": "All"},
            )
            self.augment_province(province, EN, url_en)
    url = url_tweak(self.ROOT_URL, update={"Language": sources.LANG_LOP[FR]})
    for link in tqdm(
            BeautifulSoup(
                fetch_url(url),
                "html.parser",
            ).select("#ctl00_pnlContent a"),
            desc="Fetch Provinces, LoP (FR)",
            unit="province",
    ):
        if link.attrs.get("id", "").startswith("ctl00_cphContent_repProvinces_"):
            url_fr = url_tweak(
                urljoin(url, link.attrs["href"]),
                remove=("MenuID", "MenuQuery"),
                update={"Section": "All"},
            )
            province = models.Province.objects.get(
                links__contains=url_tweak(
                    url_fr,
                    update={"Language": sources.LANG_LOP[EN]},
                ))
            self.augment_province(province, FR, url_fr)
def fetch_year(self, year):
    days = [
        dateparse(day)
        for day in json.loads(fetch_url(
            "http://parlvu.parl.gc.ca/XRender/en/api/Data/GetCalendarYearData/{}0101/-1".format(year),
            use_cache=year < 2017,
        ))
    ]
    for day in tqdm(days, desc=str(year), unit="day"):
        self.fetch_day(day)
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    cached_parties = get_cached_dict(models.Party.objects.all())
    list_url = "https://lop.parl.ca/parlinfo/Lists/Party.aspx"
    for lang in (EN, FR):
        for a in tqdm(
                BeautifulSoup(
                    fetch_url(url_tweak(
                        list_url,
                        update={"Language": sources.LANG_LOP[lang]})),
                    "html.parser").select("td > a"),
                desc="Augment Parties, LoP",
                unit="party",
        ):
            if "_lnkParty_" not in a.attrs.get("id", ""):
                continue
            url = url_tweak(
                urljoin(list_url, a.attrs["href"]),
                update={"Section": "ALL"},
                remove=("MenuID", "MenuQuery"),
            )
            lop_item_code = sources.LOP_CODE.search(url).group().lower()
            party = models.Party.objects.filter(lop_item_code=lop_item_code).first()
            if not party:
                name = sources.WHITESPACE.sub(" ", a.text.strip())
                name = LOP_LIST_MAPPING.get(name, name)
                if name is None:
                    continue
                party = get_cached_obj(cached_parties, name)
            party.links[lang][sources.NAME_LOP_PARTY[lang]] = url
            party.names[lang][sources.NAME_LOP_PARTY[lang]] = a.text.strip()
            party.lop_item_code = lop_item_code
            soup = BeautifulSoup(fetch_url(url), "html.parser")
            for link in soup.select("#ctl00_cphContent_dataLinks a"):
                party.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                    "", link.text.strip())] = link.attrs["href"]
            party.save()
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    url = "http://www.cpac.ca/en/page/1/?s&category=all&person=all&order=newest&type=videos"
    while url:
        soup = BeautifulSoup(fetch_url(url), "html.parser")
        for item in soup.select(".vidlist-main__item"):
            self.fetch_item({EN: urljoin(url, item.select("a")[0].attrs["href"])})
        button_next = one_or_none(soup.select("a.latest-slider__next"))
        if button_next:
            url = urljoin(url, button_next.attrs["href"])
        else:
            url = None  # Last page reached; stop paginating
def fetch_senate_committees_session(self, session, session_url):
    for link in tqdm(
            BeautifulSoup(
                fetch_url(session_url),
                "html.parser").select(".committee-list-boxes-wrapper a"),
            desc=str(session),
            unit="committee",
    ):
        committee_url = {EN: url_tweak(urljoin(session_url, link.attrs["href"]))}
        if link.select(".joint-committee-list-boxes"):
            logger.debug(
                "Skipping {} (broken, reported, joint committees are covered in HoC anyway)".format(
                    committee_url[EN]))
            continue
        committee = models.Committee(
            session=session,
            chamber=models.Committee.CHAMBER_SEN,
        )
        for lang in (EN, FR):
            soup = BeautifulSoup(fetch_url(committee_url[lang]), "html.parser")
            committee.names[lang][sources.NAME_PARL_COMMITTEE[lang]] = soup.select(
                "meta[name=dc.description]")[0].attrs["content"]
            committee.names[lang][sources.NAME_PARL_COMMITTEE_CODE[lang]] = committee_url[
                lang].strip("/").split("/")[-2].upper()
            committee.links[lang][sources.NAME_PARL_COMMITTEE[lang]] = committee_url[lang]
            if not committee.slug:
                committee.slug = self.get_slug(committee)
            committee_url[FR] = get_french_parl_url(committee_url[lang], soup)
        committee.save()
def fetch_parliaments(self):
    list_url = "https://lop.parl.ca/parlinfo/Lists/Parliament.aspx"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select("#ctl00_cphContent_ctl00_grdParliamentList td > a"),
            desc="Fetch Parliaments, LoP",
            unit="parliament",
    ):
        parliament, created = models.Parliament.objects.get_or_create(
            number=int(REVERSE_ORDINAL.sub(r"\1", link.text)))
        if created or parliament.number >= 42:
            url = url_tweak(
                urljoin(list_url, link.attrs["href"]),
                remove=("MenuID", "MenuQuery"),
                update={"Section": "All"},
            )
            parliament.links = {
                EN: {sources.NAME_WIKI[EN]: "https://en.wikipedia.org/wiki/{}_Canadian_Parliament".format(
                    inflector.ordinal(parliament.number))},
                FR: {sources.NAME_WIKI[FR]: "https://fr.wikipedia.org/wiki/{}{}_législature_du_Canada".format(
                    parliament.number, "re" if parliament.number == 1 else "e")},
            }
            for lang in (EN, FR):
                parliament.links[lang][sources.NAME_LOP_PARLIAMENT[lang]] = url_tweak(
                    url, update={"Language": sources.LANG_LOP[lang]})
                if parliament.number <= 35:
                    parliament.links[lang][sources.NAME_CANADIANA[lang]] = (
                        "http://parl.canadiana.ca/search?usrlang={}&lang={}&identifier=P{}".format(
                            sources.LANG_CANADIANA_UI[lang],
                            sources.LANG_CANADIANA_CONTENT[lang],
                            parliament.number,
                        ))
            parliament.seats = int(BeautifulSoup(
                fetch_url(parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]),
                "html.parser",
            ).select("#ctl00_cphContent_ctl06_pnlSectionPartyStandingsContent .GridRows")[0].contents[-1].text)
            parliament.save()
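# `REVERSE_ORDINAL` is assumed to strip English ordinal suffixes so that link
# text like "42nd" reduces to "42" before the int() call above; a sketch:
import re

REVERSE_ORDINAL_SKETCH = re.compile(r"(\d+)(?:st|nd|rd|th)")
# REVERSE_ORDINAL_SKETCH.sub(r"\1", "42nd") == "42"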
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    list_url = "http://www.parl.gc.ca/LegisInfo/Home.aspx?Page=1"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url, allow_redirects=True),
                "html.parser",
            ).select("#ctl00_PageContentSection_BillListingControl_BillFacetSearch_SessionSelector1_pnlSessions a"),
            desc="Fetch Bills, LEGISinfo",
            unit="session",
    ):
        if " - " in link.text:
            parliament_number, session_number = link.text.split()[0].split("-")
            self.fetch_bills_session(Session.objects.get(
                parliament__number=parliament_number,
                number=session_number,
            ))
def fetch_hoc_committees(self):
    list_url = "http://www.ourcommons.ca/Committees/en/List"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select(".session-selector"),
            desc="Fetch Committees, HoC",
            unit="session",
    ):
        querydict = parse_qs(urlparse(link.attrs["href"]).query)
        self.fetch_hoc_committees_session(
            Session.objects.get(
                parliament__number=querydict["parl"][0],
                number=querydict["session"][0],
            ),
            url_tweak(urljoin(list_url, link.attrs["href"])),
        )
def fetch_senate_committees(self):
    list_url = "https://sencanada.ca/en/committees/"
    for link in tqdm(
            BeautifulSoup(
                fetch_url(list_url),
                "html.parser",
            ).select(".session-dropdown-session a"),
            desc="Fetch Committees, Senate",
            unit="session",
    ):
        parliament_number, session_number = link.attrs["href"].strip("/").rsplit("/", 1)[1].split("-")
        self.fetch_senate_committees_session(
            Session.objects.get(
                parliament__number=parliament_number,
                number=session_number,
            ),
            url_tweak(urljoin(list_url, link.attrs["href"])),
        )
def fetch_parliamentarian(self, slug, name, lang_naive_url):
    parliamentarian, created = models.Parliamentarian.objects.get_or_create(slug=slug)
    if not created:
        return
    for lang in (EN, FR):
        parliamentarian.names[lang][sources.NAME_LOP_PARLIAMENT[lang]] = name
        url = url_tweak(lang_naive_url, update={"Language": sources.LANG_LOP[lang]})
        parliamentarian.links[lang][sources.NAME_LOP_PARLIAMENTARIAN[lang]] = url
        soup = BeautifulSoup(fetch_url(url), "html.parser")
        parliamentarian.names[lang][sources.NAME_LOP_PARLIAMENTARIAN[lang]] = sources.WHITESPACE.sub(
            " ", soup.select("#ctl00_cphContent_lblTitle")[0].text)
        for link in soup.select("#ctl00_cphContent_dataLinks a"):
            parliamentarian.links[lang][sources.AVAILABILITY_WARNINGS.sub(
                "", link.text.strip())] = link.attrs["href"]
    try:
        parliamentarian.lop_item_code = sources.LOP_CODE.search(url).group().lower()
        parliamentarian.birthdate = soup.select(
            "#ctl00_cphContent_DateOfBirthData")[0].text.strip().replace(".", "-")
    except (AttributeError, IndexError):
        pass  # Some parliamentarians have no recorded birthdate
    # Download the parliamentarian's photo if they have one
    photo_url = urljoin(
        url,
        soup.select("#ctl00_cphContent_imgParliamentarianPicture")[0].attrs["src"])
    code = sources.LOP_CODE.search(photo_url).group().lower()
    if code != "00000000-0000-0000-0000-000000000000":
        filename = "{}.jpg".format(code)
        filepath = parliamentarian.photo.field.upload_to(None, filename)
        if os.path.exists(os.path.join(settings.MEDIA_ROOT, filepath)):
            parliamentarian.photo = filepath
        else:
            parliamentarian.photo.save(
                filename, ContentFile(requests.get(photo_url).content))
    parliamentarian.save()
def fetch_bills_session(self, session):
    cached_committees = get_cached_dict(models.Committee.objects.filter(session=session))
    url = "http://www.parl.ca/LegisInfo/Home.aspx?download=xml&ParliamentSession={}-{}".format(
        session.parliament.number, session.number)
    soup = BeautifulSoup(fetch_url(url, use_cache=session.parliament.number < 42), "lxml")
    for bill_soup in tqdm(
            soup.find_all("bill"),
            desc=str(session),
            unit="bill",
    ):
        bill_number = bill_soup.select("billnumber")[0]
        bill_number = "-".join(filter(None, (
            bill_number.attrs["prefix"],
            bill_number.attrs["number"],
            bill_number.get("suffix", None),
        )))
        bill = models.Bill(
            session=session,
            slug=slugify("{}-{}".format(session.slug, bill_number)),
        )
        for lang in (EN, FR):
            bill.links[lang][sources.NAME_LEGISINFO[lang]] = url_tweak(
                "http://www.parl.gc.ca/LegisInfo/BillDetails.aspx",
                update={
                    "billId": bill_soup.attrs["id"],
                    "Language": sources.LANG_LEGISINFO_UI[lang],
                },
            )
            bill.names[lang][sources.NAME_LEGISINFO_NUMBER[lang]] = bill_number
            bill.names[lang][sources.NAME_LEGISINFO_TITLE[lang]] = bill_soup.select(
                "billtitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
            title_short = bill_soup.select(
                "shorttitle > title[language={}]".format(sources.LANG_LEGISINFO_XML[lang]))[0].text
            if title_short:
                bill.names[lang][sources.NAME_LEGISINFO_TITLE_SHORT[lang]] = title_short
        bill.save()
        for event_soup in bill_soup.select("event"):
            try:
                committee_soup = bill_soup.select("committee[accronym]")[0]  # They misspelled "acronym" in their XML
                code = committee_soup.attrs["accronym"]
                if code != "WHOL":  # Committee of the Whole
                    bill.committees.add(get_cached_obj(cached_committees, code))
            except IndexError:
                pass
def fetch_party(self, name, popup, election_riding):
    if name in ("Unknown", "Ind.", "N/A") or name.startswith("I "):
        return None

    # Some parties share the same name, but are effectively separate
    if name in ("Rhino", "Nrhino"):
        name = "Rhino ({})".format("1" if election_riding.date.year < 2000 else "2")
    elif name == "C.P.":
        name = "C.P. ({})".format("1" if election_riding.date.year < 2000 else "2")
    elif name == "Soc":
        name = "Soc ({})".format("1" if election_riding.date.year < 1930 else "2")

    # Others just slugify ambiguously
    if name == "NCP":
        name = "NCP (1)"
    elif name == "N.C.P.":
        name = "N.C.P. (2)"
    elif name == "BPC":
        name = "BPC (1)"
    elif name == "B.P.C.":
        name = "B.P.C. (2)"

    try:
        party = self.cached_parties[name]
    except KeyError:
        party = Party()
        for lang in (EN, FR):
            popup_soup = BeautifulSoup(
                fetch_url(
                    "https://lop.parl.ca/About/Parliament/FederalRidingsHistory/hfer-party.asp?lang={}&Party={}".format(
                        sources.LANG_LOP[lang],
                        PARTY_POPUP.search(popup).groups()[0],
                    )),
                "html.parser")
            party.names[lang][sources.NAME_LOP_PARTY_SHORT[lang]] = popup_soup.find_all("td")[0].text.strip()
            party.names[lang][sources.NAME_LOP_RIDING_HISTORY[lang]] = popup_soup.find_all("td")[1].text.strip()
        party.slug = slugify(name)
        party.lop_item_code = None
        party.save()
        self.cached_parties[name] = party
    return party
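# `PARTY_POPUP` is assumed to pull the LoP party identifier out of the popup
# markup passed in; a sketch against a guessed input shape (the real pattern
# and input may differ):
import re

PARTY_POPUP_SKETCH = re.compile(r"Party=([^&'\"]+)")
# PARTY_POPUP_SKETCH.search("hfer-party.asp?lang=E&Party=17").groups()[0] == "17"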
def fetch_parliamentarians(self, parliament):
    logger.debug("Fetch parliamentarians, {}".format(parliament))
    url = parliament.links[EN][sources.NAME_LOP_PARLIAMENT[EN]]
    for link in tqdm(
            BeautifulSoup(
                fetch_url(url),
                "html.parser").select("a[href^=Parliamentarian]"),
            desc=str(parliament),
            unit="parliamentarian",
    ):
        # We slugify the parliamentarian's name to disambiguate
        # names like "Marcel Masse" and "Marcel Massé"
        self.cache_parliamentarians[slugify(link.text)][url_tweak(
            urljoin(url, link.attrs["href"]),
            update={
                "MoreInfo": "True",
                "Section": "All",
            },
        )] = link.text
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    for session_link in tqdm(
            BeautifulSoup(
                fetch_url(
                    "http://www.ourcommons.ca/DocumentViewer/en/42-1/house/sitting-1/hansard",
                    allow_redirects=True,
                    use_cache=False,
                ),
                "html.parser").select(".session-selector"),
            desc="Fetch Sittings, HoC",
            unit="session",
    ):
        session = Session.objects.get(
            parliament__number=session_link.attrs["data-parliament"],
            number=session_link.attrs["data-session"],
        )
        self.parse_session(session)
def parse_session(self, session):
    session_url = url_tweak(
        "http://www.ourcommons.ca/DocumentViewer/en/SessionPublicationCalendarsWidget?organization=HOC&publicationTypeId=37",
        update={
            "parliament": session.parliament.number,
            "session": session.number,
        },
    )
    for sitting_link in tqdm(
            BeautifulSoup(
                fetch_url(
                    session_url,
                    use_cache=session.parliament.number < 42,
                ),
                "html.parser").select("td a"),
            desc=str(session),
            unit="sitting",
    ):
        self.parse_sitting_url(urljoin(session_url, sitting_link.attrs["href"]), session)
def augment_province(self, province, lang, url):
    soup = BeautifulSoup(
        fetch_url(url),
        "html.parser",
    )
    province.links[lang][sources.NAME_LOP_PROVINCE[lang]] = url
    province.names[lang][sources.NAME_LOP_PROVINCE[lang]] = soup.select(
        "#ctl00_cphContent_lblTitle")[0].text
    province.links[lang][sources.NAME_WIKI[lang]] = "https://{}.wikipedia.org/wiki/{}".format(
        sources.LANG_WIKI[lang],
        province.names[lang][sources.NAME_LOP_PROVINCE[lang]].replace(" ", "_"),
    )
    province.links[lang].update(
        dict((sources.AVAILABILITY_WARNINGS.sub("", link.text.strip()), link.attrs["href"])
             for link in soup.select("#ctl00_cphContent_dataLinks a")))
    province.save()
def handle(self, *args, **options):
    if options["verbosity"] > 1:
        logger.setLevel(logging.DEBUG)
    self.cached_parliamentarians = get_cached_dict(
        Parliamentarian.objects.filter(
            election_candidates__election_riding__date__year__gte=2000))
    self.cached_ridings = get_cached_dict(
        Riding.objects.filter(election_ridings__date__year__gte=2000))
    self.cached_parties = get_cached_dict(Party.objects.all())
    self.cached_parties.update({
        "Independent": [None],
        "Conservative Independent": [None],
        "Independent Conservative": [None],
    })
    list_url = "http://www.ourcommons.ca/Parliamentarians/en/HouseVotes/Index"
    parl_soup = BeautifulSoup(fetch_url(list_url), "html.parser")
    # Super-irritating that Parliament uses parliament=X&session=Y in some
    # places, but then session=PK in others
    default_session_id = parse_qs(
        urlparse(
            parl_soup.select(".refiner-display-daterange .refinement a")[0].attrs["href"]
        ).query)["sessionId"][0]
    for link in tqdm(
            parl_soup.select(".refiner-display-parliament .refinement a"),
            desc="Fetch Votes, HoC",
            unit="session",
    ):
        groupdict = re.search(
            r"(?P<parliament>[345][0-9])(st|nd|rd|th) Parliament\s+(?P<session>[1-9])(st|nd|rd|th)\s+",
            link.text).groupdict()
        self.fetch_votes_session(
            Session.objects.get(
                parliament__number=groupdict["parliament"],
                number=groupdict["session"],
            ),
            list_url,
            parse_qs(urlparse(link.attrs["href"]).query).get(
                "sessionId", [default_session_id])[0],
        )