def deputies_list(reset=False):
    soup = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", reset)
    for dep in soup('table')[4]('tr'):
        items = dep('td')
        full_name = re.sub(' +', ' ', items[0].a.text).strip()
        url = items[0].a['href']
        party = get_or_create(Party, name=items[1].a.text, url=dict(items[1].a.attrs)['href'])
        email = items[2].a.text
        website = items[3].a['href'] if items[3].a else None
        # yes, one deputy's key contains an O instead of a 0, I'm not joking
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        if not Deputy.objects.filter(lachambre_id=lachambre_id):
            deputy = Deputy.objects.create(full_name=full_name,
                                           party=party,
                                           url=url,
                                           websites=[website] if website else [],
                                           lachambre_id=lachambre_id,
                                           emails=[email])
            logger.debug('adding new deputy %s %s %s %s %s' % (lachambre_id.encode("Utf-8"),
                                                               full_name.encode("Utf-8"),
                                                               party,
                                                               email.encode("Utf-8"),
                                                               website.encode("Utf-8") if website else ''))
            logger.info("[NEW] deputy: %s" % deputy)
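# read_or_dl, read_or_dl_with_nl and lxml_read_or_dl_with_nl are project-local
# helpers that aren't shown in this section. Below is a minimal sketch of what
# read_or_dl could look like, inferred from its call sites (a url, a
# human-readable cache key, an optional reset flag); the cache layout and the
# use of urllib2 are assumptions, not the project's actual implementation.

import os
import urllib2

from BeautifulSoup import BeautifulSoup


def read_or_dl(url, name, reset=False):
    """Return a BeautifulSoup of `url`, hitting the network only once per `name`."""
    cache_path = os.path.join("cache", name)  # hypothetical cache location
    if not reset and os.path.exists(cache_path):
        html = open(cache_path).read()
    else:
        html = urllib2.urlopen(url).read()
        if not os.path.exists("cache"):
            os.makedirs("cache")
        open(cache_path, "w").write(html)
    return BeautifulSoup(html)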
def _get_written_question_bulletin():
    for i in range(48, 54):
        soup = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/qrva&language=fr&rightmenu=right?legislat=52&cfm=/site/wwwcfm/qrva/qrvaList.cfm?legislat=%i" % i,
                          "bulletin list %i" % i)
        for b in soup('table')[4]('tr')[1:]:
            try:
                if i == 53:
                    get_or_create(WrittenQuestionBulletin,
                                  legislature="53",
                                  lachambre_id=b('td')[0]('a')[-1].text.split()[-1],
                                  date=b('td')[2].text,
                                  publication_date=b('td')[3].text,
                                  url=b('td')[1].a["href"],
                                  pdf_url=b('td')[0].a["href"])
                else:
                    get_or_create(WrittenQuestionBulletin,
                                  legislature=str(i),
                                  lachambre_id=b('td')[0]('a')[-1].text.split()[-1],
                                  publication_date=b('td')[2].text,
                                  url=b('td')[1].a["href"] if b('td')[1].a else None,
                                  pdf_url=b('td')[0].a["href"])
                logger.debug("%s" % b('td')[0]('a')[-1].text.split()[-1])
            except TypeError:
                continue
def handle_document(document):
    soup = read_or_dl((LACHAMBRE_PREFIX + document.url) if not document.url.startswith("http") else document.url,
                      "a document %s" % document.lachambre_id)
    document.full_details_url = soup.table.a["href"]
    # f*****g stupid hack because BeautifulSoup fails to parse the html correctly
    soup, suppe = lxml_read_or_dl_with_nl((LACHAMBRE_PREFIX + document.url) if not document.url.startswith("http") else document.url,
                                          "a document %s" % document.lachambre_id)
    table = BeautifulSoup(etree.tostring(soup.xpath('//table')[0], pretty_print=True))
    table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[0], pretty_print=True))
    dico = document_to_dico(list(table.table('tr', recursive=False)))
    dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))
    del dico[""]
    del dico_nl[""]
    _get_first_level_data(dico, dico_nl, document)
    _get_in_charged_commissions(dico, dico_nl, document)
    _get_plenaries(dico, dico_nl, document)
    _get_senat_plenaries(dico, dico_nl, document)
    _get_competences(dico, dico_nl, document)
    _get_document_chambre(dico, dico_nl, document)
    _get_document_senat(dico, dico_nl, document)
    document.done = True
    document.save()
    logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
    dico.die_if_got_not_accessed_keys()
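# document_to_dico isn't shown here; the final die_if_got_not_accessed_keys()
# call suggests it returns a dict subclass that records every key that gets
# read, so the scraper can blow up when a page contains a field it doesn't
# know how to parse. A sketch of that idea (the class name and the exception
# are assumptions, not the project's actual code):


class AccessControlDict(dict):
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed = set()

    def __getitem__(self, key):
        self.accessed.add(key)
        return dict.__getitem__(self, key)

    def die_if_got_not_accessed_keys(self):
        not_accessed = set(self.keys()) - self.accessed
        if not_accessed:
            raise ValueError("these keys were never accessed while parsing: %s"
                             % ", ".join(sorted(not_accessed)))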
def check_for_new_documents():
    for document_page in read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm",
                                    "all documents")('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"],
                                         "document %s" % document_page.a.text)
        for soup, suppe in zip(soup('table')[4]('tr', valign="top"), suppe('table')[4]('tr', valign="top")):
            if not Document.objects.filter(lachambre_id=soup.div.text):
                url = soup.a["href"]
                title = soup("div")[1].text
                lachambre_id = soup.div.text
                logger.info("found a new document: %s - [%s] - %s" % (LACHAMBRE_PREFIX + url, lachambre_id, title))
                document = Document(title={"fr": title, "nl": suppe('div')[1].text},
                                    lachambre_id=lachambre_id,
                                    url=url)
                handle_document(document)
def deputies_list(reset=False):
    soup = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", reset)
    for dep in soup.table('tr'):
        items = dep('td')
        url = items[0].a['href']
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        matches = Deputy.objects.filter(lachambre_id=lachambre_id)
        deputy = matches[0] if matches else Deputy(lachambre_id=lachambre_id)
        deputy.full_name = items[0].a.text.strip()
        deputy.url = url
        deputy.save()
        if not matches:
            logger.info("[NEW] deputy: %s" % deputy)
def _get_deputy_analysis(url, deputy, type, reset=False):
    soupsoup = read_or_dl(LACHAMBRE_PREFIX + lame_url(url), '%s %s' % (deputy.full_name, type), reset)
    setattr(deputy, "analysis_%s_url" % type, url)
    setattr(deputy, "analysis_%s_list" % type, [])
    for i in soupsoup('table')[3]('tr', valign="top"):
        logger.debug("add %s %s" % (type, i.tr('td')[1].text.strip()))
        dico = table2dic(i.table('td'))
        logger.debug("%s" % dico)
        getattr(deputy, "analysis_%s_list" % type).\
            append(get_or_create(Analysis,
                                 _id="lachambre_id",
                                 lachambre_id=re.search("dossierID=([0-9A-Za-z-]+)", i.a["href"]).groups()[0],
                                 title=dico["Titre"],
                                 descriptor=dico["Descripteurs"],
                                 url=i.a['href'],
                                 type=type))
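# table2dic isn't defined in this section either. Judging from how its result
# is used (dico["Titre"], dico.get(u"Compte rendu intégral", ...)["href"]), it
# plausibly folds the <td> cells of a two-column label/value table into a
# dict, keeping the <a> tag itself when the value cell holds a link. A hedged
# sketch, not the project's actual implementation:


def table2dic(cells):
    """Pair a flat list of <td> cells [label, value, label, value, ...] into
    a dict; values that contain a link stay as tags so ["href"] still works."""
    dico = {}
    for label, value in zip(cells[::2], cells[1::2]):
        dico[label.text.strip()] = value.a if value.a else value.text.strip()
    return dico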
def scrape():
    _get_written_question_bulletin()
    for bulletin in list(WrittenQuestionBulletin.objects.filter(done=False, url__isnull=False)):
        soup = read_or_dl(LACHAMBRE_PREFIX + bulletin.url,
                          "bulletin %s %s" % (bulletin.lachambre_id, bulletin.legislature))
        if not soup.find('table', 'txt'):
            continue
        for link in soup.find('table', 'txt')('tr', recursive=False):
            if link.a is None:  # temporary
                continue
            _id = re.search("dossierID=([0-9A-Z-]+).xml", link.a["href"]).groups()[0]
            if WrittenQuestion.objects.filter(lachambre_id=_id):
                logger.debug("pass %s, already parsed" % _id)
                continue
            _save_a_written_question(link)
        bulletin.done = True
        bulletin.save()
def _get_deputy_questions(url, deputy, type, reset=False):
    soupsoup = read_or_dl(LACHAMBRE_PREFIX + lame_url(url), '%s %s' % (deputy.full_name, type), reset)
    setattr(deputy, "questions_%s_url" % type, url)
    setattr(deputy, "questions_%s_list" % type, [])
    for i in soupsoup('table')[3]('tr', valign="top"):
        logger.debug("add %s %s" % (type, i.tr('td')[1].text.strip()))
        dico = table2dic(i.table('td'))
        logger.debug("%s" % dico)
        getattr(deputy, "questions_%s_list" % type).\
            append(get_or_create(Question,
                                 _id="lachambre_id",
                                 title=dico["Titre"],
                                 lachambre_id=re.search("dossierID=([0-9A-Za-z-]+)", i.a["href"]).groups()[0],
                                 reunion_type=dico.get(u"Réunion"),
                                 reunion_date=dico.get("Date discussion"),
                                 session_id=dico.get("Session"),
                                 pdf_url=dico.get(u"Compte rendu intégral", {"href": None})["href"],
                                 eurovoc_descriptors=map(lambda x: x.strip(), dico.get("Descripteurs Eurovoc", "").split('|')),
                                 keywords=map(lambda x: x.strip(), dico.get(u"Mots-clés libres", "").split("|")),
                                 url=i.a['href'],
                                 type=type))
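# get_or_create is a project-local shortcut (the model class comes in as the
# first argument, so it's not Django's built-in manager method). What the call
# sites suggest: look the instance up by the field named in _id, or by every
# keyword argument when _id is omitted, and create it, with a log line, when
# it doesn't exist yet. A sketch under those assumptions:


def get_or_create(klass, _id=None, **kwargs):
    if _id is None:
        existing = klass.objects.filter(**kwargs)
    else:
        existing = klass.objects.filter(**{_id: kwargs[_id]})
    if existing:
        return existing[0]
    obj = klass.objects.create(**kwargs)
    logger.info("[NEW] %s: %s" % (klass.__name__, obj))
    return obj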
def handle_document(document):
    soup = read_or_dl((LACHAMBRE_PREFIX + document.url) if not document.url.startswith("http") else document.url,
                      "a document %s" % document.lachambre_id)
    document.full_details_url = soup('table')[4].a["href"]
    # f*****g stupid hack because BeautifulSoup fails to parse the html correctly
    soup, suppe = lxml_read_or_dl_with_nl((LACHAMBRE_PREFIX + document.url) if not document.url.startswith("http") else document.url,
                                          "a document %s" % document.lachambre_id)
    table = BeautifulSoup(etree.tostring(soup.xpath('//table')[4], pretty_print=True))
    table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[4], pretty_print=True))
    dico = document_to_dico(list(table.table('tr', recursive=False)))
    dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))
    _get_first_level_data(dico, dico_nl, document)
    _get_in_charged_commissions(dico, dico_nl, document)
    _get_plenaries(dico, dico_nl, document)
    _get_senat_plenaries(dico, dico_nl, document)
    _get_competences(dico, dico_nl, document)
    _get_document_chambre(dico, dico_nl, document)
    _get_document_senat(dico, dico_nl, document)
    document.done = True
    document.save()
    logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
    dico.die_if_got_not_accessed_keys()
def get_new_documents():
    for document_page in read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm",
                                    "all documents")('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"],
                                         "document %s" % document_page.a.text)
        for soup, suppe in zip(soup('table')[4]('tr', valign="top"), suppe('table')[4]('tr', valign="top")):
            get_or_create(Document,
                          _id="lachambre_id",
                          title={"fr": soup('div')[1].text, "nl": suppe('div')[1].text},
                          lachambre_id=soup.div.text,
                          url=soup.a["href"])
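# A hedged example of how these entry points could be wired together for a
# full run (the order and the top-level guard are assumptions, not part of
# the original module):

if __name__ == "__main__":
    deputies_list()       # refresh the deputies
    get_new_documents()   # pick up newly published documents
    scrape()              # walk the written-question bulletins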