def fetch_list(klass, cache=False, sync=False):
    """Scrape the two annual-report index pages (last reports: yes/no) and store each entry.

    For every report row the FR and NL pages are walked in lockstep so each
    AnnualReport gets both translations of its title and legal basis.
    """
    urls = (
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=y',
        'http://www.lachambre.be/kvvcr/showpage.cfm?section=none&language=fr&cfm=/site/wwwcfm/rajv/rajvlist.cfm?lastreports=n',
    )
    for page_index, url in enumerate(urls):
        soup, suppe = scraper.get_with_nl(url, "annual repports %i" % page_index)
        # the listing table packs each report into groups of 5 <tr>; only the first row of each group matters
        rows_fr = soup.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]
        rows_nl = suppe.find('div', id="story")('table')[1].tbody('tr', recursive=False)[::5]
        for row_fr, row_nl in zip(rows_fr, rows_nl):
            cells_fr = row_fr('td')
            cells_nl = row_nl('td')
            get_or_create(AnnualReport,
                          title={"fr": cells_fr[2].text, "nl": cells_nl[2].text},
                          date=cells_fr[0].text,
                          law_and_article={"fr": cells_fr[4].text, "nl": cells_nl[4].text},
                          # keep only the digits of the periodicity column
                          periodicity=re.sub("[^0-9]", "", cells_fr[5].text),
                          pdf_url=cells_fr[1].a["href"] if cells_fr[1].a else "",
                          )
def fetch_list(klass, cache=False, sync=False):
    """Scrape the full document index, upsert Document stubs, then parse every unfinished one.

    First pass: walk every alphabetical sub-page of the document list and
    create/update a Document per row (FR + NL titles). Second pass: fetch the
    detail page of every document not yet marked done.
    """
    index = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/flwb&language=fr&rightmenu=right&cfm=ListDocument.cfm", "all documents")
    for document_page in index('div', **{'class': re.compile("linklist_[01]")}):
        soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + document_page.a["href"], "document %s" % document_page.a.text)
        # renamed from soup/suppe to avoid shadowing the page soups inside the loop
        for row_fr, row_nl in zip(soup.table('tr'), suppe.table('tr')):
            get_or_create(Document, _id="lachambre_id",
                          title={"fr": row_fr('div')[1].text, "nl": row_nl('div')[1].text},
                          lachambre_id=row_fr.div.text,
                          url=row_fr.a["href"])
    # list() otherwise mongodb will timeout if we stay in a query mode
    for document in list(Document.objects.filter(done=False)):
        # NOTE(review): magic skip — presumably a known-broken document on the site; confirm
        if document.lachambre_id == 25:
            continue
        try:
            klass.fetch_one(document)
        except Exception as e:  # py3-compatible form; works on python >= 2.6 too
            traceback.print_exc(file=sys.stdout)
            # BUGFIX: message said "didn't succed" and substituted the exception
            # where the text announced the document id
            logger.error("/!\\ %s didn't succeed! Error while reparsing document: %s" % (document.lachambre_id, e))
def fetch_one(klass, soup, legislation, cache=False, sync=False):
    """Store one written-question bulletin row; the column layout differs for legislation 54."""
    cells = soup('td')
    # the bulletin number is the last word of the last link in the first cell
    bulletin_id = cells[0]('a')[-1].text.split()[-1]
    if legislation == 54:
        # NOTE(review): the branch tests 54 but stores legislature "53" — looks odd, confirm
        get_or_create(WrittenQuestionBulletin,
                      legislature="53",
                      lachambre_id=bulletin_id,
                      date=cells[2].text,
                      publication_date=cells[3].text,
                      url=cells[1].a["href"],
                      pdf_url=cells[0].a["href"],
                      )
    else:
        get_or_create(WrittenQuestionBulletin,
                      legislature=str(legislation),
                      lachambre_id=bulletin_id,
                      publication_date=cells[2].text,
                      url=cells[1].a["href"] if cells[1].a else None,
                      pdf_url=cells[0].a["href"],
                      )
    logger.debug("%s" % bulletin_id)
def _get_competences(dico, dico_nl, document):
    """Fill document.timeline from the FR/NL competence sections and attach the analysis link.

    Resolves the old "FIXME: meh, DRY": the three near-identical branches
    (both languages / FR only / NL only) now share one splitting helper and one
    append loop. The timeline is only reset when at least one section exists,
    exactly as before.
    """
    def _entries(sub_dico, key):
        # each entry is "date \xa0 title"; contents[::2] skips the <br/> separators
        return [clean_text(x).split(u" \xa0 ", 1) for x in sub_dico[key]["head"].contents[::2]]

    has_fr = bool(dico.get(u"Compétence"))
    has_nl = bool(dico_nl.get(u"Bevoegdheid"))
    if has_fr and has_nl:
        rows = [(_date, _title, _title_nl)
                for (_date, _title), (_, _title_nl)
                in zip(_entries(dico, u"Compétence"), _entries(dico_nl, u"Bevoegdheid"))]
    elif has_fr:
        rows = [(_date, _title, "") for (_date, _title) in _entries(dico, u"Compétence")]
    elif has_nl:
        rows = [(_date, "", _title_nl) for (_date, _title_nl) in _entries(dico_nl, u"Bevoegdheid")]
    else:
        rows = None  # neither section: leave document.timeline untouched (original behavior)

    if rows is not None:
        document.timeline = []
        for _date, _title, _title_nl in rows:
            logger.debug("append time line %s %s %s" % (_date, _title, _title_nl))
            document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(Analysis, _id="lachambre_id",
                                          lachambre_id=dico["Analyse des interventions"]["head"].a.text,
                                          url=dico["Analyse des interventions"]["head"].a["href"])
def fetch_one(klass, commission, cache=False, sync=False):
    """Scrape one commission page (FR + NL): full name, memberships, and seat counts per role.

    BUGFIX: the NL seat counts were computed from the French paragraph (`i`)
    instead of the Dutch one (`j`), so seats["nl"] duplicated the FR data under
    Dutch role names. They are now derived from the NL paragraph.
    """
    soup, suppe = scraper.get_with_nl(LACHAMBRE_PREFIX + commission.url, "commission %s" % commission.lachambre_id)
    commission.full_name["fr"] = soup.h1.text
    commission.full_name["nl"] = suppe.h1.text
    commission.deputies = []
    seats = {"fr": {}, "nl": {}}

    def _seat_counts(paragraph):
        # pair each party name (<b> text minus trailing ':') with the number of
        # comma-separated deputy names that follow it before the next <br />
        parties = [b.text[:-1] for b in paragraph('b')[1:]]
        name_groups = str(paragraph).split("<br />")[1:]
        return [(party, len(names.split(','))) for party, names in zip(parties, name_groups)]

    for par_fr, par_nl in zip(soup('p')[2:], suppe('p')[2:]):
        role = par_fr.b.text[:-1]
        role_nl = par_nl.b.text[:-1]
        for dep in par_fr('a'):
            # lachambre ids sometimes contain the letter O instead of the digit 0
            deputy = Deputy.objects.get(lachambre_id=re.search("key=([O0-9]+)", dep["href"]).groups()[0])
            membership = get_or_create(CommissionMembership, deputy=deputy, commission=commission)
            membership.role = role
            membership.save()
            commission.deputies.append(membership.id)
        seats["fr"][role] = _seat_counts(par_fr)
        seats["nl"][role_nl] = _seat_counts(par_nl)  # was par_fr — see docstring
    commission.seats = seats
    commission.save()
def fetch_list(klass, cache=False, sync=False):
    """Walk the commissions index (FR + NL), upsert each Commission, then fetch each in detail.

    <h4> nodes open a new commission-type section; <a> nodes are commissions of
    the current section.

    BUGFIX: `_type_nl` was only assigned once the first <h4> had been seen, so a
    commission link appearing before any header raised NameError. Both section
    names are now initialized to "" (matching the FR side's original default).
    """
    soup, suppe = scraper.get_with_nl("http://www.lachambre.be/kvvcr/showpage.cfm?section=/comm/commissions&language=fr&cfm=/site/wwwcfm/comm/LstCom.cfm&rightmenu=right_cricra", "commissions list")
    _type = ""
    _type_nl = ""
    for node_fr, node_nl in zip(soup("div", id="story")[1], suppe("div", id="story")[1]):
        if isinstance(node_fr, NavigableString):
            continue
        if node_fr.h4:
            # a header starts a new group of commissions of the same type
            _type = node_fr.h4.text
            _type_nl = node_nl.h4.text
        elif node_fr.a:
            commission = get_or_create(Commission, lachambre_id=int(re.search("com=(\d+)", node_fr.a["href"]).groups()[0]))
            commission.type["fr"] = _type
            commission.type["nl"] = _type_nl
            commission.name["fr"] = node_fr.a.text
            commission.name["nl"] = node_nl.a.text
            commission.url = node_fr.a["href"]
            commission.save()
    # list() so mongodb doesn't keep a long-lived cursor open while we scrape
    for com in list(Commission.objects.all()):
        klass.fetch_one(com, cache=cache, sync=sync)
def fetch_list(klass, cache=False, sync=False):
    """Scrape the deputies list, upsert Deputy records, then fetch each one (async unless sync).

    BUGFIX: the async branch never collected its Celery results — `tasks` was
    never appended to, and `map(lambda x: x.get, tasks)` referenced `.get`
    without calling it — so task failures went unnoticed. Delayed tasks are now
    collected and awaited. Also, a missing personal website no longer appends
    None to deputy.websites.
    """
    soup = scraper.get("http://www.lachambre.be/kvvcr/showpage.cfm?section=/depute&language=fr&rightmenu=right_depute&cfm=/site/wwwcfm/depute/cvlist.cfm", "deputies", cache=cache, sync=sync)
    for row in soup.table('tr'):
        items = row('td')
        url = items[0].a['href']
        # lachambre ids sometimes contain the letter O instead of the digit 0
        lachambre_id = re.search('key=([0-9O]+)', url).groups()[0]
        existing = Deputy.objects.filter(lachambre_id=lachambre_id)
        full_name = re.sub(" +", " ", items[0].a.text.strip())
        if not existing:
            logger.info("[NEW] deputy: %s" % full_name)
        deputy = existing[0] if existing else Deputy(lachambre_id=lachambre_id)
        if items[1].a.text.strip():
            deputy.party = get_or_create(Party, name=items[1].a.text.strip(), url=dict(items[1].a.attrs)['href'])
        email = items[2].a.text
        website = items[3].a['href'] if items[3].a else None
        if email not in deputy.emails:
            deputy.emails.append(email)
        if website and website not in deputy.websites:  # don't record a missing website as None
            deputy.websites.append(website)
        deputy.full_name = full_name
        deputy.url = url
        deputy.save()

    tasks = []
    for index, deputy in enumerate(list(Deputy.objects.all())):
        logger.debug("%s %s" % (index, deputy.full_name))
        if sync:
            klass.fetch_one(deputy, cache=cache, sync=sync)
        else:
            tasks.append(klass.fetch_one.delay(deputy, cache=cache, sync=sync))
    # wait for every async task so errors surface here rather than being dropped
    for task in tasks:
        task.get()