Exemple #1
0
def deputies_list(reset=False):
    """Scrape the deputies index page and create any Deputy not yet stored."""
    soup = read_or_dl(
        "http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm",
        "deputies", reset)

    for row in soup('table')[4]('tr'):
        cells = row('td')
        full_name = re.sub('  +', ' ', cells[0].a.text).strip()
        url = cells[0].a['href']
        party = get_or_create(Party,
                              name=cells[1].a.text,
                              url=dict(cells[1].a.attrs)['href'])
        email = cells[2].a.text
        website = cells[3].a['href'] if cells[3].a else None
        # yes, one deputies key contains a O instead of an 0, I'm not joking
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        # guard clause: skip deputies that are already in the database
        if Deputy.objects.filter(lachambre_id=lachambre_id):
            continue
        deputy = Deputy.objects.create(
            full_name=full_name,
            party=party,
            url=url,
            websites=[website] if website else [],
            lachambre_id=lachambre_id,
            emails=[email])
        logger.debug(
            'adding new deputy %s %s %s %s %s' %
            (lachambre_id.encode("Utf-8"),
             full_name.encode("Utf-8"), party, email.encode("Utf-8"),
             website.encode("Utf-8") if website else ''))
        logger.info("[NEW] deputy: %s" % deputy)
Exemple #2
0
def _get_written_question_bulletin():
    """Scrape the written-question bulletin listings for legislatures 48-53."""
    for legislature in range(48, 54):
        listing = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/qrva&language=fr&rightmenu=right?legislat=52&cfm=/site/wwwcfm/qrva/qrvaList.cfm?legislat=%i" % legislature, "bulletin list %i" % legislature)
        # first row is the table header, skip it
        for row in listing('table')[4]('tr')[1:]:
            try:
                cells = row('td')
                bulletin_id = cells[0]('a')[-1].text.split()[-1]
                if legislature == 53:
                    # current legislature rows carry an extra date column
                    get_or_create(WrittenQuestionBulletin,
                                  legislature="53",
                                  lachambre_id=bulletin_id,
                                  date=cells[2].text,
                                  publication_date=cells[3].text,
                                  url=cells[1].a["href"],
                                  pdf_url=cells[0].a["href"],
                                 )
                else:
                    get_or_create(WrittenQuestionBulletin,
                                  legislature=str(legislature),
                                  lachambre_id=bulletin_id,
                                  publication_date=cells[2].text,
                                  url=cells[1].a["href"] if cells[1].a else None,
                                  pdf_url=cells[0].a["href"],
                                 )
                    logger.debug("%s" % bulletin_id)
            except TypeError:
                # malformed row (missing link/cell): best-effort skip
                continue
Exemple #3
0
def handle_document(document):
    """Scrape one document's detail pages (fr + nl) and persist it."""
    # document.url may already be absolute; only prefix relative urls
    soup = read_or_dl(LACHAMBRE_PREFIX + document.url if not document.url.startswith("http") else document.url, "a document %s" % document.lachambre_id)
    document.full_details_url = soup.table.a["href"]
    # f*****g stupid hack because BeautifulSoup fails to parse correctly the html
    soup, suppe = lxml_read_or_dl_with_nl(LACHAMBRE_PREFIX + document.url if not document.url.startswith("http") else document.url, "a document %s" % document.lachambre_id)
    # re-serialize the first <table> of each page through lxml, then parse
    # that clean fragment with BeautifulSoup to get a usable tree
    table = BeautifulSoup(etree.tostring(soup.xpath('//table')[0], pretty_print=True))
    table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[0], pretty_print=True))
    dico = document_to_dico(list(table.table('tr', recursive=False)))
    dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))

    # drop the empty-key entry (presumably from header/spacer rows — TODO confirm)
    del dico[""]
    del dico_nl[""]

    _get_first_level_data(dico, dico_nl, document)
    _get_in_charged_commissions(dico, dico_nl, document)
    _get_plenaries(dico, dico_nl, document)
    _get_senat_plenaries(dico, dico_nl, document)
    _get_competences(dico, dico_nl, document)
    _get_document_chambre(dico, dico_nl, document)
    _get_document_senat(dico, dico_nl, document)

    document.done = True
    document.save()
    logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
    # fail loudly if some scraped keys were never consumed (fr dict only;
    # NOTE(review): dico_nl is not checked — confirm that is intentional)
    dico.die_if_got_not_accessed_keys()
Exemple #4
0
def check_for_new_documents():
    """Walk the full document listing and hand every unseen document to handle_document."""
    listing = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")
    for document_page in listing('div', **{'class': re.compile("linklist_[01]")}):
        page_fr, page_nl = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        for row_fr, row_nl in zip(page_fr('table')[4]('tr', valign="top"), page_nl('table')[4]('tr', valign="top")):
            # skip documents already stored (matched on lachambre_id)
            if Document.objects.filter(lachambre_id=row_fr.div.text):
                continue
            url = row_fr.a["href"]
            title = row_fr("div")[1].text
            lachambre_id = row_fr.div.text
            logger.info("find a new document: %s - [%s] -  %s" % (LACHAMBRE_PREFIX + url, lachambre_id, title))
            document = Document(title={"fr": row_fr('div')[1].text, "nl": row_nl('div')[1].text}, lachambre_id=lachambre_id, url=row_fr.a["href"])
            handle_document(document)
Exemple #5
0
def check_for_new_documents():
    """Scan the document index pages and process any document we have not seen yet."""
    index = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")
    for document_page in index('div', **{'class': re.compile("linklist_[01]")}):
        fr, nl = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        rows = zip(fr('table')[4]('tr', valign="top"), nl('table')[4]('tr', valign="top"))
        for fr_row, nl_row in rows:
            lachambre_id = fr_row.div.text
            if Document.objects.filter(lachambre_id=lachambre_id):
                # already in the database, nothing to do
                continue
            url = fr_row.a["href"]
            title = fr_row("div")[1].text
            logger.info("find a new document: %s - [%s] -  %s" % (LACHAMBRE_PREFIX + url, lachambre_id, title))
            handle_document(Document(title={"fr": fr_row('div')[1].text, "nl": nl_row('div')[1].text}, lachambre_id=lachambre_id, url=fr_row.a["href"]))
Exemple #6
0
def deputies_list(reset=False):
    """Scrape the deputies index and upsert each Deputy's name and URL.

    Deputies already stored (matched on lachambre_id) are updated in
    place; unseen ones are created.
    """
    soup = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", reset)

    for dep in soup.table('tr'):
        items = dep('td')
        url = items[0].a['href']
        # one deputies key contains a O instead of a 0, hence the [0-9O] class
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]

        deputy = Deputy.objects.filter(lachambre_id=lachambre_id)
        if not deputy:
            # bug fix: the queryset is empty at this point, so logging it
            # produced a useless message — log the new deputy's id instead
            logger.info("[NEW] deputy: %s" % lachambre_id)
        deputy = deputy[0] if deputy else Deputy(lachambre_id=lachambre_id)

        deputy.full_name = items[0].a.text.strip()
        deputy.url = url
        deputy.save()
Exemple #7
0
def _get_deputy_analysis(url, deputy, type, reset=False):
    """Scrape a deputy's analysis listing page and attach Analysis objects.

    Stores the listing url on deputy.analysis_<type>_url and fills
    deputy.analysis_<type>_list with get_or_create'd Analysis entries.
    """
    soupsoup = read_or_dl(LACHAMBRE_PREFIX + lame_url(url), '%s %s' % (deputy.full_name, type), reset)
    setattr(deputy, "analysis_%s_url" % type, url)
    setattr(deputy, "analysis_%s_list" % type, [])
    for i in soupsoup('table')[3]('tr', valign="top"):
        # bug fix: logging.debug treats extra positional args as %-format
        # args; logger.debug("add", type, ...) raised a formatting error
        # ("not all arguments converted") at emit time
        logger.debug("add %s %s", type, i.tr('td')[1].text.strip())
        dico = table2dic(i.table('td'))
        logger.debug("%s" % dico)
        getattr(deputy, "analysis_%s_list" % type).append(
            get_or_create(Analysis,
                          _id="lachambre_id",
                          lachambre_id=re.search("dossierID=([0-9A-Za-z-]+)", i.a["href"]).groups()[0],
                          title=dico["Titre"],
                          descriptor=dico["Descripteurs"],
                          url=i.a['href'],
                          type=type))
Exemple #8
0
def _get_deputy_analysis(url, deputy, type, reset=False):
    """Scrape a deputy's analysis listing page and attach Analysis objects.

    Stores the listing url on deputy.analysis_<type>_url and fills
    deputy.analysis_<type>_list with get_or_create'd Analysis entries.
    """
    soupsoup = read_or_dl(LACHAMBRE_PREFIX + lame_url(url),
                          '%s %s' % (deputy.full_name, type), reset)
    setattr(deputy, "analysis_%s_url" % type, url)
    setattr(deputy, "analysis_%s_list" % type, [])
    for i in soupsoup('table')[3]('tr', valign="top"):
        # bug fix: logging.debug treats extra positional args as %-format
        # args; logger.debug("add", type, ...) raised a formatting error
        # ("not all arguments converted") at emit time
        logger.debug("add %s %s", type, i.tr('td')[1].text.strip())
        dico = table2dic(i.table('td'))
        logger.debug("%s" % dico)
        getattr(deputy, "analysis_%s_list" % type).append(
            get_or_create(Analysis,
                          _id="lachambre_id",
                          lachambre_id=re.search("dossierID=([0-9A-Za-z-]+)", i.a["href"]).groups()[0],
                          title=dico["Titre"],
                          descriptor=dico["Descripteurs"],
                          url=i.a['href'],
                          type=type))
Exemple #9
0
def scrape():
    """Fetch new bulletins, then parse every pending written question."""
    _get_written_question_bulletin()

    for bulletin in list(WrittenQuestionBulletin.objects.filter(done=False, url__isnull=False)):
        soup = read_or_dl(LACHAMBRE_PREFIX + bulletin.url, "bulletin %s %s" % (bulletin.lachambre_id, bulletin.legislature))
        if not soup.find('table', 'txt'):
            continue
        for link in soup.find('table', 'txt')('tr', recursive=False):
            # bug fix: the None guard must come BEFORE dereferencing
            # link.a["href"] — the old order crashed on rows without a link
            if link.a is None:
                continue
            _id = re.search("dossierID=([0-9A-Z-]+).xml", link.a["href"]).groups()[0]
            # tempory
            if WrittenQuestion.objects.filter(lachambre_id=_id):
                logger.debug("pass %s, already parsed" % _id)
                continue
            _save_a_written_question(link)
        bulletin.done = True
        bulletin.save()
Exemple #10
0
def _get_deputy_questions(url, deputy, type, reset=False):
    """Scrape a deputy's questions listing page and attach Question objects.

    Stores the listing url on deputy.questions_<type>_url and fills
    deputy.questions_<type>_list with get_or_create'd Question entries.
    """
    soupsoup = read_or_dl(LACHAMBRE_PREFIX + lame_url(url), '%s %s' % (deputy.full_name, type), reset)
    setattr(deputy, "questions_%s_url" % type, url)
    setattr(deputy, "questions_%s_list" % type, [])
    for i in soupsoup('table')[3]('tr', valign="top"):
        # bug fix: logging.debug treats extra positional args as %-format
        # args; logger.debug("add", type, ...) raised a formatting error
        # ("not all arguments converted") at emit time
        logger.debug("add %s %s", type, i.tr('td')[1].text.strip())
        dico = table2dic(i.table('td'))
        logger.debug("%s" % dico)
        getattr(deputy, "questions_%s_list" % type).append(
            get_or_create(Question,
                          _id="lachambre_id",
                          title=dico["Titre"],
                          lachambre_id=re.search("dossierID=([0-9A-Za-z-]+)", i.a["href"]).groups()[0],
                          reunion_type=dico.get(u"Réunion"),
                          reunion_date=dico.get("Date discussion"),
                          session_id=dico.get("Session"),
                          pdf_url=dico.get(u"Compte rendu intégral", {"href": None})["href"],
                          eurovoc_descriptors=map(lambda x: x.strip(), dico.get("Descripteurs Eurovoc", "").split('|')),
                          keywords=map(lambda x: x.strip(), dico.get(u"Mots-clés libres", "").split("|")),
                          url=i.a['href'],
                          type=type))
def deputies_list(reset=False):
    """Scrape the deputies index page and create any Deputy not yet stored."""
    soup = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", reset)

    for row in soup('table')[4]('tr'):
        cells = row('td')
        full_name = re.sub('  +', ' ', cells[0].a.text).strip()
        url = cells[0].a['href']
        party = get_or_create(Party, name=cells[1].a.text, url=dict(cells[1].a.attrs)['href'])
        email = cells[2].a.text
        website = cells[3].a['href'] if cells[3].a else None
        # yes, one deputies key contains a O instead of an 0, I'm not joking
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        # guard clause: already stored, nothing to create
        if Deputy.objects.filter(lachambre_id=lachambre_id):
            continue
        deputy = Deputy.objects.create(full_name=full_name,
                                       party=party,
                                       url=url,
                                       websites=[website] if website else [],
                                       lachambre_id=lachambre_id,
                                       emails=[email])
        logger.debug('adding new deputy %s %s %s %s %s' % (lachambre_id.encode("Utf-8"), full_name.encode("Utf-8"), party, email.encode("Utf-8"), website.encode("Utf-8") if website else ''))
        logger.info("[NEW] deputy: %s" % deputy)
Exemple #12
0
def handle_document(document):
    """Scrape one document's detail pages (fr + nl) and persist it."""
    # document.url may already be absolute; only prefix relative urls
    soup = read_or_dl(LACHAMBRE_PREFIX + document.url if not document.url.startswith("http") else document.url, "a document %s" % document.lachambre_id)
    document.full_details_url = soup('table')[4].a["href"]
    # f*****g stupid hack because BeautifulSoup fails to parse correctly the html
    soup, suppe = lxml_read_or_dl_with_nl(LACHAMBRE_PREFIX + document.url if not document.url.startswith("http") else document.url, "a document %s" % document.lachambre_id)
    # re-serialize the fifth <table> of each page through lxml, then parse
    # that clean fragment with BeautifulSoup to get a usable tree
    table = BeautifulSoup(etree.tostring(soup.xpath('//table')[4], pretty_print=True))
    table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[4], pretty_print=True))
    dico = document_to_dico(list(table.table('tr', recursive=False)))
    dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))

    _get_first_level_data(dico, dico_nl, document)
    _get_in_charged_commissions(dico, dico_nl, document)
    _get_plenaries(dico, dico_nl, document)
    _get_senat_plenaries(dico, dico_nl, document)
    _get_competences(dico, dico_nl, document)
    _get_document_chambre(dico, dico_nl, document)
    _get_document_senat(dico, dico_nl, document)

    document.done = True
    document.save()
    logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
    # fail loudly if some scraped keys were never consumed (fr dict only;
    # NOTE(review): dico_nl is not checked — confirm that is intentional)
    dico.die_if_got_not_accessed_keys()
Exemple #13
0
def _get_deputy_questions(url, deputy, type, reset=False):
    """Scrape a deputy's questions listing page and attach Question objects.

    Stores the listing url on deputy.questions_<type>_url and fills
    deputy.questions_<type>_list with get_or_create'd Question entries.
    """
    soupsoup = read_or_dl(LACHAMBRE_PREFIX + lame_url(url),
                          '%s %s' % (deputy.full_name, type), reset)
    setattr(deputy, "questions_%s_url" % type, url)
    setattr(deputy, "questions_%s_list" % type, [])
    for i in soupsoup('table')[3]('tr', valign="top"):
        # bug fix: logging.debug treats extra positional args as %-format
        # args; logger.debug("add", type, ...) raised a formatting error
        # ("not all arguments converted") at emit time
        logger.debug("add %s %s", type, i.tr('td')[1].text.strip())
        dico = table2dic(i.table('td'))
        logger.debug("%s" % dico)
        getattr(deputy, "questions_%s_list" % type).append(
            get_or_create(Question,
                          _id="lachambre_id",
                          title=dico["Titre"],
                          lachambre_id=re.search("dossierID=([0-9A-Za-z-]+)", i.a["href"]).groups()[0],
                          reunion_type=dico.get(u"Réunion"),
                          reunion_date=dico.get("Date discussion"),
                          session_id=dico.get("Session"),
                          pdf_url=dico.get(u"Compte rendu intégral", {"href": None})["href"],
                          eurovoc_descriptors=map(lambda x: x.strip(), dico.get("Descripteurs Eurovoc", "").split('|')),
                          keywords=map(lambda x: x.strip(), dico.get(u"Mots-clés libres", "").split("|")),
                          url=i.a['href'],
                          type=type))
Exemple #14
0
def get_new_documents():
    """Register every document from the global listing via get_or_create."""
    listing = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")
    for document_page in listing('div', **{'class': re.compile("linklist_[01]")}):
        fr, nl = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        for fr_row, nl_row in zip(fr('table')[4]('tr', valign="top"), nl('table')[4]('tr', valign="top")):
            get_or_create(Document,
                          _id="lachambre_id",
                          title={"fr": fr_row('div')[1].text, "nl": nl_row('div')[1].text},
                          lachambre_id=fr_row.div.text,
                          url=fr_row.a["href"])
Exemple #15
0
def get_new_documents():
    """Walk the document index pages and upsert every listed document."""
    index = read_or_dl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")
    for document_page in index('div', **{'class': re.compile("linklist_[01]")}):
        page_fr, page_nl = read_or_dl_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        rows = zip(page_fr('table')[4]('tr', valign="top"), page_nl('table')[4]('tr', valign="top"))
        for row_fr, row_nl in rows:
            title = {"fr": row_fr('div')[1].text, "nl": row_nl('div')[1].text}
            get_or_create(Document, _id="lachambre_id", title=title, lachambre_id=row_fr.div.text, url=row_fr.a["href"])