def fetch_one(klass, document, cache=False, sync=False):
    url = document.url if document.url.startswith("http") else LACHAMBRE_PREFIX + document.url
    soup = scraper.get(url, "a document %s" % document.lachambre_id)
    document.full_details_url = soup.table.a["href"]
    # stupid hack: BeautifulSoup fails to parse this html correctly, so
    # re-fetch both language versions and parse them with lxml instead
    soup, suppe = scraper.lxml_get_with_nl(url, "a document %s" % document.lachambre_id)
    table = BeautifulSoup(etree.tostring(soup.xpath('//table')[0], pretty_print=True))
    table_nl = BeautifulSoup(etree.tostring(suppe.xpath('//table')[0], pretty_print=True))
    dico = document_to_dico(list(table.table('tr', recursive=False)))
    dico_nl = document_to_dico(list(table_nl.table('tr', recursive=False)))
    del dico[""]
    del dico_nl[""]
    klass._get_first_level_data(dico, dico_nl, document)
    klass._get_in_charged_commissions(dico, dico_nl, document)
    klass._get_plenaries(dico, dico_nl, document)
    klass._get_senat_plenaries(dico, dico_nl, document)
    klass._get_competences(dico, dico_nl, document)
    klass._get_document_chambre(dico, dico_nl, document)
    klass._get_document_senat(dico, dico_nl, document)
    document.done = True
    document.save()
    logger.info("parsed document [%s] %s" % (document.lachambre_id, document.title["fr"]))
    dico.die_if_got_not_accessed_keys()
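# Hypothetical sketch of the document_to_dico helper used above — it is not
# defined in this file, so everything here is an assumption inferred from the
# call sites: it is built from a list of <tr> rows, behaves like a dict
# (del-able keys), and exposes die_if_got_not_accessed_keys(). The idea: map
# each row's first cell to the remaining cells, and track which keys the
# parser actually read so unhandled fields blow up loudly instead of being
# silently dropped.
class AccessControlDict(dict):
    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        self.accessed = set()

    def __getitem__(self, key):
        self.accessed.add(key)
        return dict.__getitem__(self, key)

    def die_if_got_not_accessed_keys(self):
        # fail hard if the page contained fields the parser never looked at
        not_accessed = set(self.keys()) - self.accessed
        if not_accessed:
            raise Exception("Untreated keys: %s" % sorted(not_accessed))


def document_to_dico(rows):
    # assumed row shape: first <td> is the label, the rest is the value
    dico = AccessControlDict()
    for row in rows:
        cells = row('td')
        if cells:
            dico[cells[0].text.strip()] = cells[1:]
    return dico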
def fetch_list(klass, cache=False, sync=False):
    for i in range(48, 55):
        soup = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/qrva&language=fr&rightmenu=right?legislat=52&cfm=/site/wwwcfm/qrva/qrvaList.cfm?legislat=%i" % i, "bulletin list %i" % i)
        for b in soup.table('tr')[1:]:
            try:
                # parse each row of the bulletin list, not the whole page
                klass.fetch_one(b, legislation=i)
            except TypeError as e:
                logger.debug("Error on written question bulletin of legislation %s: %s" % (i, e))
                continue
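# Hypothetical sketch of the scraper.get helper these fetchers rely on; its
# real implementation lives elsewhere, so names and behaviour here are
# assumptions inferred from the call sites: it takes a url and a
# human-readable label for logging, honours optional cache/sync flags, and
# returns a BeautifulSoup tree.
import logging
import requests
from BeautifulSoup import BeautifulSoup  # BS3, matching the project's era

logger = logging.getLogger(__name__)

_page_cache = {}

def get(url, description, cache=False, sync=False):
    # sync is threaded through for the task machinery; unused here
    logger.info("fetching %s (%s)" % (description, url))
    if cache and url in _page_cache:
        return _page_cache[url]
    soup = BeautifulSoup(requests.get(url).content)
    if cache:
        _page_cache[url] = soup
    return soup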
def fetch_list(klass, cache=False, sync=False):
    WrittenQuestionBulletin.fetch_list()
    for bulletin in list(WrittenQuestionBulletin.objects.filter(url__isnull=False)):
        soup = scraper.get(LACHAMBRE_PREFIX + bulletin.url, "bulletin %s %s" % (bulletin.lachambre_id, bulletin.legislature))
        if not soup.find('table', 'txt'):
            continue
        for link in soup.find('table', 'txt').tbody('tr', recursive=False):
            if link.a is None:
                raise Exception("I should check that")
            klass.fetch_one(link)
        bulletin.done = True
        bulletin.save()
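# The WrittenQuestionBulletin model is defined elsewhere; a minimal sketch of
# what it must look like given the fields used above. This assumes
# Django-style models (the url__isnull lookup points at the Django ORM, while
# the mongodb comment in the next fetcher suggests a MongoDB backend such as
# django-mongodb-engine); field types beyond those actually referenced are
# guesses.
from django.db import models

class WrittenQuestionBulletin(models.Model):
    lachambre_id = models.CharField(max_length=255)
    legislature = models.IntegerField()
    url = models.CharField(max_length=255, null=True)  # relative, prefixed with LACHAMBRE_PREFIX when fetched
    done = models.BooleanField(default=False)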
def fetch_list(klass, cache=False, sync=False):
    for document_page in scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        # walk the french and dutch versions of the table side by side
        for row_fr, row_nl in zip(soup.table('tr'), suppe.table('tr')):
            get_or_create(Document, _id="lachambre_id",
                          title={"fr": row_fr('div')[1].text, "nl": row_nl('div')[1].text},
                          lachambre_id=row_fr.div.text,
                          url=row_fr.a["href"])
    # list() otherwise mongodb will timeout if we stay in a query mode
    for document in list(Document.objects.filter(done=False)):
        if document.lachambre_id == 25:
            continue
        try:
            klass.fetch_one(document)
        except Exception as e:
            traceback.print_exc(file=sys.stdout)
            logger.error("/!\ %s didn't succeed! Error while reparsing document: %s" % (document.lachambre_id, e))
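# Hypothetical sketch of the get_or_create helper used above — again an
# assumption from its call sites: the _id keyword names the field used for
# the lookup (when omitted, all keyword arguments are used), and the keyword
# arguments fill the new instance when nothing matches.
def get_or_create(klass, _id=None, **kwargs):
    if _id is None:
        existing = klass.objects.filter(**kwargs)
    else:
        existing = klass.objects.filter(**{_id: kwargs[_id]})
    if existing:
        return existing[0]
    instance = klass(**kwargs)
    instance.save()
    return instance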
def fetch_list(klass, cache=False, sync=False):
    soup = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", cache=cache, sync=sync)
    for dep in soup.table('tr'):
        items = dep('td')
        url = items[0].a['href']
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        deputy = Deputy.objects.filter(lachambre_id=lachambre_id)
        full_name = re.sub(" +", " ", items[0].a.text.strip())
        if not deputy:
            logger.info("[NEW] deputy: %s" % full_name)
        deputy = deputy[0] if deputy else Deputy(lachambre_id=lachambre_id)
        if items[1].a.text.strip():
            deputy.party = get_or_create(Party, name=items[1].a.text.strip(), url=dict(items[1].a.attrs)['href'])
        email = items[2].a.text
        website = items[3].a['href'] if items[3].a else None
        if email not in deputy.emails:
            deputy.emails.append(email)
        if website and website not in deputy.websites:
            deputy.websites.append(website)
        deputy.full_name = full_name
        deputy.url = url
        deputy.save()
    tasks = []
    for index, deputy in enumerate(list(Deputy.objects.all())):
        logger.debug("%s %s" % (index, deputy.full_name))
        if sync:
            klass.fetch_one(deputy, cache=cache, sync=sync)
        else:
            # keep the async results around so we can wait on them below
            tasks.append(klass.fetch_one.delay(deputy, cache=cache, sync=sync))
    if not sync:
        # block until every queued task has finished
        map(lambda x: x.get(), tasks)
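# The sync/async split above assumes fetch_one is a celery task:
# fetch_one.delay(...) queues it on a worker and returns an AsyncResult
# immediately. A minimal sketch of that pattern — the app name, broker url,
# and task body are assumptions, not the project's real setup:
from celery import Celery

app = Celery("lachambre", broker="redis://localhost")

@app.task
def fetch_one(deputy_id, cache=False, sync=False):
    # placeholder body; the real task parses the deputy's page
    return deputy_id

def fetch_all(deputy_ids):
    results = [fetch_one.delay(i) for i in deputy_ids]
    # .get() blocks until the worker is done and re-raises its exceptions
    return [r.get() for r in results]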
def test_get():
    company_data = scraper.get()
    assert len(company_data) > 0