Example #1
    def fetch_one(klass, document, cache=False, sync=False):
        soup = scraper.get(LACHAMBRE_PREFIX + document.url if not document.url.startswith("http") else document.url, "a document %s" % document.lachambre_id)
        document.full_details_url = soup.table.a["href"]
        # ugly workaround: BeautifulSoup alone fails to parse this HTML correctly, so re-parse with lxml and feed the serialized table back to BeautifulSoup
        soup, suppe = scraper.lxml_get_with_nl(LACHAMBRE_PREFIX + document.url if not document.url.startswith("http") else document.url, "a document %s" % document.lachambre_id)
        table = BeautifulSoup(etree.tostring(soup.xpath('//table')[0], pretty_print=True))
        table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[0], pretty_print=True))
        dico = document_to_dico(list(table.table('tr', recursive=False)))
        dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))

        del dico[""]
        del dico_nl[""]

        klass._get_first_level_data(dico, dico_nl, document)
        klass._get_in_charged_commissions(dico, dico_nl, document)
        klass._get_plenaries(dico, dico_nl, document)
        klass._get_senat_plenaries(dico, dico_nl, document)
        klass._get_competences(dico, dico_nl, document)
        klass._get_document_chambre(dico, dico_nl, document)
        klass._get_document_senat(dico, dico_nl, document)

        document.done = True
        document.save()
        logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
        dico.die_if_got_not_accessed_keys()
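
The final die_if_got_not_accessed_keys() call suggests that document_to_dico returns a dict subclass that records every key the parsing helpers read, so table rows that nobody handled are flagged instead of being silently ignored. A minimal sketch of such an access-tracking dict; the class and attribute names here are chosen for illustration, not taken from the project:

    class AccessTrackingDict(dict):
        """Dict that remembers which keys were read (illustrative sketch)."""

        def __init__(self, *args, **kwargs):
            super(AccessTrackingDict, self).__init__(*args, **kwargs)
            self.accessed_keys = set()

        def __getitem__(self, key):
            self.accessed_keys.add(key)
            return super(AccessTrackingDict, self).__getitem__(key)

        def die_if_got_not_accessed_keys(self):
            # fail loudly if a parsed table row was never consumed
            missed = set(self.keys()) - self.accessed_keys
            if missed:
                raise AssertionError("unhandled keys: %s" % ", ".join(sorted(missed)))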
Example #2
    def fetch_list(klass, cache=False, sync=False):
        for i in range(48, 55):
            soup = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/qrva&language=fr&rightmenu=right?legislat=52&cfm=/site/wwwcfm/qrva/qrvaList.cfm?legislat=%i" % i, "bulletin list %i" % i)
            for b in soup.table('tr')[1:]:
                try:
                    klass.fetch_one(b, legislation=i)
                except TypeError, e:
                    logger.debug("Error on written question bulletin of legislation %s: %s" % (i, e))
                    continue
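
All of these examples go through a project-specific scraper.get(url, description, ...) helper that downloads a page and hands back a parsed soup. A rough sketch of what such a helper could look like, assuming requests and BeautifulSoup4 and an invented on-disk cache location; the real helper (logging the description, encoding handling, the sync flag) may well differ:

    import hashlib
    import os

    import requests
    from bs4 import BeautifulSoup

    CACHE_DIR = "/tmp/scraper_cache"  # assumed location, not the project's

    def get(url, description, cache=False, sync=False):
        # sync is accepted only for signature parity; this sketch ignores it
        cache_file = os.path.join(CACHE_DIR, hashlib.md5(url.encode("utf-8")).hexdigest())
        if cache and os.path.exists(cache_file):
            with open(cache_file) as f:
                html = f.read()
        else:
            html = requests.get(url).text
            if cache:
                if not os.path.isdir(CACHE_DIR):
                    os.makedirs(CACHE_DIR)
                with open(cache_file, "w") as f:
                    f.write(html)
        return BeautifulSoup(html, "html.parser")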
Example #3
    def fetch_list(klass, cache=False, sync=False):
        WrittenQuestionBulletin.fetch_list()

        for bulletin in list(WrittenQuestionBulletin.objects.filter(url__isnull=False)):
            soup = scraper.get(LACHAMBRE_PREFIX + bulletin.url, "bulletin %s %s" % (bulletin.lachambre_id, bulletin.legislature))
            if not soup.find('table', 'txt'):
                continue
            for link in soup.find('table', 'txt').tbody('tr', recursive=False):
                if link.a is None:
                    raise Exception("I should check that")
                klass.fetch_one(link)
            bulletin.done = True
            bulletin.save()
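
The bulletin objects iterated above (lachambre_id, legislature, url, done) point to a MongoDB-backed document model. A sketch of what that model could look like under MongoEngine; the field names are inferred from the examples and the field types are assumptions:

    import mongoengine

    class WrittenQuestionBulletin(mongoengine.Document):
        lachambre_id = mongoengine.StringField()
        legislature = mongoengine.StringField()
        url = mongoengine.StringField()
        done = mongoengine.BooleanField(default=False)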
Example #4
    def fetch_list(klass, cache=False, sync=False):
        for document_page in scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")('div', **{'class': re.compile("linklist_[01]")}):
            soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
            for row, row_nl in zip(soup.table('tr'), suppe.table('tr')):
                get_or_create(Document, _id="lachambre_id", title={"fr": row('div')[1].text, "nl": row_nl('div')[1].text}, lachambre_id=row.div.text, url=row.a["href"])

        # list otherwise mongodb will timeout if we stay in a query mode
        for document in list(Document.objects.filter(done=False)):
            if document.lachambre_id == 25:
                continue
            try:
                klass.fetch_one(document)
            except Exception, e:
                traceback.print_exc(file=sys.stdout)
                logger.error("/!\ %s didn't succed! Error: while reparsing document %s" % (document.lachambre_id, e))
Example #5
    def fetch_list(klass, cache=False, sync=False):
        soup = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", cache=cache, sync=sync)

        for dep in soup.table('tr'):
            items = dep('td')
            url = items[0].a['href']
            lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]

            deputy = Deputy.objects.filter(lachambre_id=lachambre_id)
            full_name = re.sub(" +", " ", items[0].a.text.strip())

            if not deputy:
                logger.info("[NEW] deputy: %s" % full_name)
            deputy = deputy[0] if deputy else Deputy(lachambre_id=lachambre_id)

            if items[1].a.text.strip():
                deputy.party = get_or_create(Party, name=items[1].a.text.strip(), url=dict(items[1].a.attrs)['href'])

            email = items[2].a.text
            website = items[3].a['href'] if items[3].a else None

            if email not in deputy.emails:
                deputy.emails.append(email)
            if website not in deputy.websites:
                deputy.websites.append(website)

            deputy.full_name = full_name
            deputy.url = url
            deputy.save()

        tasks = []

        for index, deputy in enumerate(list(Deputy.objects.all())):
            logger.debug("%s %s" % (index, deputy.full_name))
            if sync:
                klass.fetch_one(deputy, cache=cache, sync=sync)
            else:
                tasks.append(klass.fetch_one.delay(deputy, cache=cache, sync=sync))

        if not sync:
            # wait for every asynchronous task to finish
            for task in tasks:
                task.get()
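
The sync/async split at the end follows the usual Celery pattern: .delay() schedules fetch_one on a worker and returns an AsyncResult whose .get() blocks until the task has run. A stripped-down sketch of that pattern, assuming Celery with a local Redis broker and result backend (the app name and URLs are made up here):

    from celery import Celery

    app = Celery("lachambre", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")

    @app.task
    def fetch_one(lachambre_id, cache=False, sync=False):
        # placeholder body; the real task would parse one deputy page
        return lachambre_id

    def fetch_all(ids, sync=False):
        if sync:
            for lachambre_id in ids:
                fetch_one(lachambre_id)
        else:
            tasks = [fetch_one.delay(lachambre_id) for lachambre_id in ids]
            for task in tasks:
                task.get()  # block until the worker has finished this task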
Example #6
def test_get():
    company_data = scraper.get()
    assert len(company_data) > 0
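
Example #6 appears to exercise a different scraper (its get takes no arguments and returns company data) and hits the network as written. One way to keep such a test offline, assuming pytest and that the scraper object is importable from the module under test; the stub data below is invented:

    import scraper  # assumption: the module under test exposes this object

    def test_get_offline(monkeypatch):
        # replace the network call with a canned response for the test
        monkeypatch.setattr(scraper, "get", lambda: [{"name": "ACME"}])
        assert len(scraper.get()) > 0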