def _save_a_written_question(link):
    """Scrape one written-question dossier (fr + nl pages) and persist it.

    *link* is a BeautifulSoup node whose ``<a href>`` points at the
    question's dossier page.  Both language versions are fetched, each
    page's label/value table is turned into an AccessControlDict, and a
    WrittenQuestion is created or updated from the two dicts.
    """
    url = link.a["href"]
    # The dossier id is embedded in the url; extract it once instead of
    # running the same regex twice (once for the log label, once for
    # lachambre_id, as the original did).
    lachambre_id = re.search(r"dossierID=([0-9A-Z-]+).xml", url).groups()[0]
    soupsoup, suppesuppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + url,
                                              "written question %s" % lachambre_id)
    # Map each row's label cell (first <td>) to its value cell (second <td>).
    data = AccessControlDict(((x.td.text, x('td')[1])
                              for x in soupsoup.find('table', 'txt')('tr')
                              if x.td.text))
    data_nl = AccessControlDict(((x.td.text, x('td')[1])
                                 for x in suppesuppe.find('table', 'txt')('tr')
                                 if x.td.text))
    get_or_create(WrittenQuestion,
                  _id="lachambre_id",
                  lachambre_id=lachambre_id,
                  title={"fr": data["Titre"].text, "nl": data_nl["Titel"].text},
                  departement={"fr": data[u"Département"].text, "nl": data_nl[u"Departement"].text},
                  sub_departement={"fr": data[u"Sous-département"].text, "nl": data_nl[u"Sub-departement"].text},
                  deposition_date=data[u"Date de dépôt"].text,
                  delay_date=dico_get_text(data, u"Date de délai"),
                  publication_date=dico_get_text(data, "Date publication"),
                  # TODO: link to the actual deputy
                  author=data[u"Auteur"].text,
                  language=data[u"Langue"].text,
                  question_status={"fr": dico_get_text(data, "Statut question"),
                                   "nl": dico_get_text(data_nl, "Status vraag")},
                  status={"fr": dico_get_text(data, "Statut"),
                          "nl": dico_get_text(data_nl, "Status")},
                  question={"fr": u"%s" % data["Question"], "nl": "%s" % data_nl["Vraag"]},
                  answer={"fr": dico_get_text(data, u"Réponse"),
                          "nl": dico_get_text(data_nl, u"Antwoord")},
                  publication_reponse_pdf_url=get_href_else_blank(data, u"Publication réponse"),
                  publication_question_pdf_url=get_href_else_blank(data, u"Publication question"),
                  publication_reponse=get_text_else_blank(data, u"Publication réponse"),
                  publication_question=get_text_else_blank(data, u"Publication question"),
                  eurovoc_descriptors={"fr": get_items_list_else_empty_list(data, "Descripteurs Eurovoc"),
                                       "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-descriptoren")},
                  eurovoc_candidats_descriptors={"fr": get_items_list_else_empty_list(data, "Candidats-descripteurs Eurovoc"),
                                                 "nl": get_items_list_else_empty_list(data_nl, "Eurovoc kandidaat-descriptoren")},
                  keywords={"fr": get_items_list_else_empty_list(data, u"Mots-clés libres"),
                            "nl": get_items_list_else_empty_list(data_nl, u"Vrije trefwoorden")},
                  url=url,
                  )
    # Fail loudly if the fr table contained labels we never consumed:
    # that means the page layout changed.  NOTE(review): data_nl is not
    # checked — presumably deliberate, but worth confirming.
    data.die_if_got_not_accessed_keys()
def scrape():
    """Scrape the commission list (fr + nl) then each commission page.

    The second "story" div interleaves <h4> section headers (the
    commission *type*) with <a> links to individual commissions; the
    most recent type is attached to every commission that follows it.
    """
    soup, suppe = read_or_dl_with_nl(
        "http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra",
        "commissions list")
    # BUGFIX: initialise BOTH defaults.  The original only set _type, so
    # a commission link appearing before any <h4> header raised a
    # NameError on _type_nl.
    _type = ""
    _type_nl = ""
    for i, j in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if not isinstance(i, NavigableString) and (i.h4 or i.a):
            if i.h4:
                _type = i.h4.text
                _type_nl = j.h4.text
            elif i.a:
                commission = get_or_create(Commission, lachambre_id=int(
                    re.search(r"com=(\d+)", i.a["href"]).groups()[0]))
                commission.type["fr"] = _type
                commission.type["nl"] = _type_nl
                commission.name["fr"] = i.a.text
                commission.name["nl"] = j.a.text
                commission.url = i.a["href"]
                commission.save()
    for com in list(Commission.objects.all()):
        handle_commission(com)
def handle_commission(commission):
    """Fill one Commission from its fr and nl detail pages.

    Sets the full name, rebuilds the membership list, and counts the
    seats per role/party for both languages.
    """
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + commission.url,
                                     "commission %s" % commission.lachambre_id)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}
    for i, j in zip(soup('p'), suppe('p')):
        # The first <b> of the paragraph is the role name ending with ':'.
        role = i.b.text[:-1]
        role_nl = j.b.text[:-1]
        for dep in i('a'):
            # Some lachambre ids start with a literal 'O', hence [O0-9].
            deputy = Deputy.objects.get(lachambre_id=re.search(
                "key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership,
                                       deputy=deputy,
                                       commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        # Each remaining <b> is a party name followed by a comma
        # separated deputies list; count the names per party.
        seats["fr"][role] = map(
            lambda x: (x[0], len(x[1].split(','))),
            zip(map(lambda x: x.text[:-1], i('b')[1:]),
                str(i).split("<br />")[1:]))
        # BUGFIX: the Dutch seats were computed from the French
        # paragraph "i"; use the Dutch paragraph "j" instead.
        seats["nl"][role_nl] = map(
            lambda x: (x[0], len(x[1].split(','))),
            zip(map(lambda x: x.text[:-1], j('b')[1:]),
                str(j).split("<br />")[1:]))
    commission.seats = seats
    commission.save()
def scrape():
    """Fetch both annual-report listing pages (latest and older) and
    store an AnnualReport for every entry found."""
    listing_urls = (
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n',
    )
    for idx, page_url in enumerate(listing_urls):
        soup, suppe = read_or_dl_with_nl(page_url, "annual repports %i" % idx)
        # Each report occupies 5 consecutive <tr>; keep the first row of
        # every group, in both languages.
        rows_fr = soup.find('div', id="story")('table')[1]('tr', recursive=False)[::5]
        rows_nl = suppe.find('div', id="story")('table')[1]('tr', recursive=False)[::5]
        for row_fr, row_nl in zip(rows_fr, rows_nl):
            cells_fr = row_fr('td')
            cells_nl = row_nl('td')
            if cells_fr[1].a:
                pdf_url = cells_fr[1].a["href"]
            else:
                pdf_url = ""
            get_or_create(
                AnnualReport,
                title={"fr": cells_fr[2].text, "nl": cells_nl[2].text},
                date=cells_fr[0].text,
                law_and_article={"fr": cells_fr[4].text, "nl": cells_nl[4].text},
                # Keep only the digits of the periodicity cell.
                periodicity=re.sub("[^0-9]", "", cells_fr[5].text),
                pdf_url=pdf_url,
            )
def check_for_new_documents():
    """Walk the document index and hand every document not yet in the
    database over to handle_document."""
    index = read_or_dl(
        "http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm",
        "all documents")
    for document_page in index('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"],
                                         "document %s" % document_page.a.text)
        # Iterate the fr/nl rows under distinct names instead of
        # rebinding soup/suppe as the original did.
        row_pairs = zip(soup('table')[4]('tr', valign="top"),
                        suppe('table')[4]('tr', valign="top"))
        for fr_row, nl_row in row_pairs:
            lachambre_id = fr_row.div.text
            if Document.objects.filter(lachambre_id=lachambre_id):
                continue
            url = fr_row.a["href"]
            title = fr_row("div")[1].text
            logger.info("find a new document: %s - [%s] - %s" % (LACHAMBRE_PREFIX + url, lachambre_id, title))
            document = Document(title={"fr": fr_row('div')[1].text,
                                       "nl": nl_row('div')[1].text},
                                lachambre_id=lachambre_id,
                                url=fr_row.a["href"])
            handle_document(document)
def scrape():
    """Store an AnnualReport for every entry of the two report listing
    pages (most recent reports, then the older ones)."""
    pages = enumerate((
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n',
    ))
    for a, url in pages:
        soup, suppe = read_or_dl_with_nl(url, "annual repports %i" % a)
        body_fr = soup.find('div', id="story")('table')[1].tbody
        body_nl = suppe.find('div', id="story")('table')[1].tbody
        # One report spans 5 <tr>: only the first row of each group is
        # interesting.
        for fr, nl in zip(body_fr('tr', recursive=False)[::5],
                          body_nl('tr', recursive=False)[::5]):
            get_or_create(AnnualReport,
                          title={"fr": fr('td')[2].text, "nl": nl('td')[2].text},
                          date=fr('td')[0].text,
                          law_and_article={"fr": fr('td')[4].text, "nl": nl('td')[4].text},
                          # Strip everything but the digits.
                          periodicity=re.sub("[^0-9]", "", fr('td')[5].text),
                          pdf_url=fr('td')[1].a["href"] if fr('td')[1].a else "",
                          )
def _handle_deputy(deputy, reset=False):
    """Refresh one deputy's language, CV and sex from the detail page
    and save the document."""
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + deputy.url,
                                     deputy.full_name, reset)
    if soup.i:
        deputy.language = soup.i.parent.text.split(":")[1]
    else:
        deputy.language = None
    # Collapse runs of spaces in both CV texts.
    squeeze = lambda text: re.sub(' +', ' ', text)
    deputy.cv["fr"] = squeeze(soup('table')[5].p.text)
    deputy.cv["nl"] = squeeze(suppe('table')[5].p.text)
    # The French CV opens with "Députée" for women and "Député" for
    # men; test the feminine form first since the masculine is a prefix
    # of it.
    cv_fr = deputy.cv["fr"].encode("Utf-8")
    if cv_fr.startswith("Députée"):
        deputy.sex = "F"
    elif cv_fr.startswith("Député"):
        deputy.sex = "M"
    else:
        deputy.sex = None
    _split_deputy_full_name(deputy, soup)
    # _get_deputie_commissions(soup, deputy)
    # _deputy_documents(soup, deputy)
    deputy.save()
def handle_deputy(deputy, reset=False):
    """Refresh one deputy's photo, CV and sex from the detail page and
    save the document."""
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + deputy.url,
                                     deputy.full_name, reset)
    deputy.photo_uri = "http://www.lachambre.be" + soup.table.img["src"]
    # XXX can't get this anymore I guess :(
    # deputy.language = soup.table.i.parent.text.split(":")[1] if soup.i else None
    squeeze = lambda text: re.sub(' +', ' ', text)
    deputy.cv["fr"] = squeeze(soup('table')[1].p.text)
    deputy.cv["nl"] = squeeze(suppe('table')[1].p.text)
    # "Députée" (feminine) must be tested before its prefix "Député".
    cv_fr = deputy.cv["fr"].encode("Utf-8")
    if cv_fr.startswith("Députée"):
        deputy.sex = "F"
    elif cv_fr.startswith("Député"):
        deputy.sex = "M"
    else:
        deputy.sex = None
    split_deputy_full_name(deputy, soup)
    # _get_deputie_commissions(soup, deputy)
    # _deputy_documents(soup, deputy)
    deputy.save()
def handle_commission(commission):
    """Fill one Commission from its fr and nl detail pages: full name,
    membership list, and seats per role/party in both languages."""
    soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + commission.url,
                                     "commission %s" % commission.lachambre_id)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}
    for i, j in zip(soup('p'), suppe('p')):
        # First <b> of each paragraph is the role name ending with ':'.
        role = i.b.text[:-1]
        role_nl = j.b.text[:-1]
        for dep in i('a'):
            # Ids may start with a literal 'O', hence [O0-9].
            deputy = Deputy.objects.get(lachambre_id=re.search(
                "key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership,
                                       deputy=deputy,
                                       commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        # Count deputies per party: each later <b> is a party name
        # followed by a comma separated list.
        seats["fr"][role] = map(
            lambda x: (x[0], len(x[1].split(','))),
            zip(map(lambda x: x.text[:-1], i('b')[1:]),
                str(i).split("<br />")[1:]))
        # BUGFIX: the nl seats were derived from the French paragraph
        # "i"; they must come from the Dutch paragraph "j".
        seats["nl"][role_nl] = map(
            lambda x: (x[0], len(x[1].split(','))),
            zip(map(lambda x: x.text[:-1], j('b')[1:]),
                str(j).split("<br />")[1:]))
    commission.seats = seats
    commission.save()
def scrape():
    """Scrape the commission list (fr + nl), create/update each
    Commission, then scrape every commission detail page.

    The second "story" div interleaves <h4> headers (commission type)
    with <a> links to individual commissions.
    """
    soup, suppe = read_or_dl_with_nl(
        "http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra",
        "commissions list")
    # BUGFIX: also default _type_nl; the original only initialised
    # _type, so a commission link before the first <h4> raised a
    # NameError on _type_nl.
    _type = ""
    _type_nl = ""
    for i, j in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if not isinstance(i, NavigableString) and (i.h4 or i.a):
            if i.h4:
                _type = i.h4.text
                _type_nl = j.h4.text
            elif i.a:
                commission = get_or_create(Commission, lachambre_id=int(
                    re.search(r"com=(\d+)", i.a["href"]).groups()[0]))
                commission.type["fr"] = _type
                commission.type["nl"] = _type_nl
                commission.name["fr"] = i.a.text
                commission.name["nl"] = j.a.text
                commission.url = i.a["href"]
                commission.save()
    for com in list(Commission.objects.all()):
        handle_commission(com)
def get_new_documents():
    """Scan the whole document index and upsert a Document for every
    row of every listing page."""
    listing = read_or_dl(
        "http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm",
        "all documents")
    for document_page in listing('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"],
                                         "document %s" % document_page.a.text)
        fr_rows = soup('table')[4]('tr', valign="top")
        nl_rows = suppe('table')[4]('tr', valign="top")
        for fr, nl in zip(fr_rows, nl_rows):
            get_or_create(Document,
                          _id="lachambre_id",
                          title={"fr": fr('div')[1].text, "nl": nl('div')[1].text},
                          lachambre_id=fr.div.text,
                          url=fr.a["href"])