def _get_document_chambre(dico, dico_nl, document):
    """Fill ``document.document_chambre`` from the "Document Chambre" section.

    Returns immediately when ``dico`` has no "Document Chambre" entry.
    Otherwise a ``DocumentChambre`` is populated from the French section and
    from its Dutch counterpart ``dico_nl['Document Kamer']``, its main PDF
    record is created, authors, follow-up documents and joined documents are
    attached, and the object is saved before being assigned to ``document``.
    """
    if not dico.get("Document Chambre"):
        return

    section_fr = dico['Document Chambre']
    section_nl = dico_nl['Document Kamer']

    chambre = DocumentChambre()
    chambre.deposition_date = get_text_else_blank(section_fr, u'Date de dépôt')
    chambre.type["fr"] = section_fr[u'Type de document'].text
    chambre.type["nl"] = section_nl[u'Document type'].text
    chambre.taken_in_account_date = get_text_else_blank(section_fr, u'Prise en considération')
    chambre.distribution_date = get_text_else_blank(section_fr, u'Date de distribution')
    chambre.sending_date = get_text_else_blank(section_fr, u'Date d\'envoi')
    chambre.ending_date = get_text_else_blank(section_fr, u'Date de fin')
    chambre.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    chambre.status["nl"] = get_text_else_blank(section_nl, u'Status')
    chambre.comments["fr"] = get_text_else_blank(section_fr, u'Commentaire').split(' ')
    chambre.comments["nl"] = get_text_else_blank(section_nl, u'Commentaar').split(' ')

    Document._get_authors(section_fr, section_nl, chambre)

    # The section head is "<url or link><br /><type><br /><session>".
    head_fr = clean_text(str(section_fr[u'head']).replace(" ", ""))
    url, type_fr, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace(" ", ""))
    _, type_nl, _ = head_nl.split("<br />")
    if "href" in url:
        # Keep only the link target when the first part is an anchor.
        url = re.search('href="([^"]+)', url).group(1)
    chambre.pdf = DocumentChambrePdf.objects.create(
        url=url,
        type={"fr": type_fr.strip(), "nl": type_nl.strip()},
        session=session.split()[-2],
    )

    Document._get_next_documents(section_fr, section_nl, chambre)

    if section_fr.get(u'Document(s) joint(s)/lié(s)'):
        joint_pdfs = []
        cells = zip(section_fr[u'Document(s) joint(s)/lié(s)'],
                    section_nl[u'Gekoppeld(e)/verbonden document(en)'])
        for cell_fr, cell_nl in cells:
            joint_pdfs.append({
                "url": cell_fr.a["href"],
                # [1:-1] drops the first and last character of the title text
                "title": {"fr": cell_fr.contents[0][1:-1],
                          "nl": cell_nl.contents[0][1:-1]},
            })
        chambre.joint_pdfs = joint_pdfs

    chambre.save()
    document.document_chambre = chambre
def _get_in_charged_commissions(dico, dico_nl, document):
    """Build ``document.in_charge_commissions`` from the commission sections.

    French sections are the keys of ``dico`` matching
    ``(\\d+. )?COMMISSION CHAMBRE`` and Dutch ones the keys of ``dico_nl``
    matching ``(\\d+. )?COMMISSIE KAMER``; both key lists are sorted and
    paired positionally. One ``InChargeCommissions`` is saved and appended
    per pair.
    """
    def dated_entries(cell):
        # Take every second child of the cell, split each into
        # [date, label] and drop entries whose first part is empty.
        pairs = [clean_text(node).split(u" \xa0 ", 1) for node in cell.contents[::2]]
        return [pair for pair in pairs if pair[0]]

    document.in_charge_commissions = []

    keys_fr = sorted(k for k in dico.keys() if re.match("(\d+. )?COMMISSION CHAMBRE", k))
    keys_nl = sorted(k for k in dico_nl.keys() if re.match("(\d+. )?COMMISSIE KAMER", k))

    for key, key_nl in zip(keys_fr, keys_nl):
        section_fr = dico[key]
        section_nl = dico_nl[key_nl]

        # Last word of the head is the visibility, the rest is the name.
        head_words_fr = clean_text(section_fr["head"].text).split()
        head_words_nl = clean_text(section_nl["head"].text).split()

        icc = InChargeCommissions()
        icc.visibility["fr"] = head_words_fr[-1]
        icc.visibility["nl"] = head_words_nl[-1]
        icc.commission["fr"] = " ".join(head_words_fr[:-1])
        icc.commission["nl"] = " ".join(head_words_nl[:-1])

        if section_fr.get("Rapporteur"):
            # FIXME link to actual deputies
            raw_names = section_fr["Rapporteur"].text.split("\n\t\t\t\t\t")
            icc.rapporters = [clean_text(name) for name in raw_names]

        icc.incident = []
        if section_fr.get("Incident"):
            entries_fr = dated_entries(section_fr["Incident"])
            entries_nl = dated_entries(section_nl["Incident"])
            for (_date, _type), (_, _type_nl) in zip(entries_fr, entries_nl):
                icc.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        icc.agenda = []
        if section_fr.get("Calendrier"):
            entries_fr = dated_entries(section_fr["Calendrier"])
            entries_nl = dated_entries(section_nl["Kalender"])
            for (_date, _type), (_, _type_nl) in zip(entries_fr, entries_nl):
                icc.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        if section_fr.get("Rapport"):
            report_cell = section_fr["Rapport"]
            icc.rapport = {"url": report_cell.a["href"],
                           "date": clean_text(report_cell.contents[-2])}

        icc.save()
        document.in_charge_commissions.append(icc)
def _get_plenaries(dico, dico_nl, document):
    """Build ``document.plenaries`` from the plenary-session sections.

    French sections are the keys of ``dico`` matching
    ``(\\d+. )?SEANCE PLENIERE CHAMBRE`` and Dutch ones the keys of
    ``dico_nl`` matching ``(\\d+. )?PLENAIRE VERGADERING KAMER``; both key
    lists are sorted and paired positionally. One ``DocumentPlenary`` is
    saved and appended per pair.
    """
    def split_rows(cell):
        # Every second child of the cell becomes a [date, label] pair;
        # pairs with an empty first part are discarded.
        rows = [clean_text(child).split(u" \xa0 ", 1) for child in cell.contents[::2]]
        return [row for row in rows if row[0]]

    document.plenaries = []

    french_keys = sorted(k for k in dico.keys() if re.match("(\d+. )?SEANCE PLENIERE CHAMBRE", k))
    dutch_keys = sorted(k for k in dico_nl.keys() if re.match("(\d+. )?PLENAIRE VERGADERING KAMER", k))

    for key, key_nl in zip(french_keys, dutch_keys):
        plenary_fr = dico[key]
        plenary_nl = dico_nl[key_nl]

        # Last word of the head is the visibility, the rest is the type.
        words_fr = clean_text(plenary_fr["head"].text).split()
        words_nl = clean_text(plenary_nl["head"].text).split()

        pl = DocumentPlenary()
        pl.visibility["fr"] = words_fr[-1]
        pl.visibility["nl"] = words_nl[-1]
        pl.type["fr"] = " ".join(words_fr[:-1])
        pl.type["nl"] = " ".join(words_nl[:-1])

        pl.agenda = []
        if plenary_fr.get("Calendrier"):
            rows_fr = split_rows(plenary_fr["Calendrier"])
            rows_nl = split_rows(plenary_nl["Kalender"])
            for (_date, _type), (_, _type_nl) in zip(rows_fr, rows_nl):
                pl.agenda.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.incident = []
        if plenary_fr.get("Incident"):
            rows_fr = split_rows(plenary_fr["Incident"])
            rows_nl = split_rows(plenary_nl["Incident"])
            for (_date, _type), (_, _type_nl) in zip(rows_fr, rows_nl):
                pl.incident.append({"date": _date, "type": {"fr": _type, "nl": _type_nl}})

        pl.save()
        document.plenaries.append(pl)
def _build_sub_section(i, dico):
    """Open a new sub-section in ``dico`` and return its name.

    The name is the cleaned bold text of the first cell of row ``i``. The new
    entry is an ``AccessControlDict`` whose ``"head"`` is the second ``td``
    of the row.

    Raises ``Exception`` when ``dico`` already holds a truthy value under
    that name.
    """
    name = clean_text(i.td.b.text)

    if dico.get(name):
        raise Exception("'%s' is already use as a key for '%s'" % (name, dico[name]))

    dico[name] = AccessControlDict()
    cells = i('td')
    dico[name]["head"] = cells[1]
    return name
def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
    """Attach the follow-up PDFs of a Chambre document.

    Does nothing when ``chambre_dico`` has no 'Document(s) suivant(s)' entry.
    Otherwise the French and Dutch cells are cut into per-document row groups
    with ``document_pdf_part_cutter`` and paired positionally; for each pair
    an ``OtherDocumentChambrePdf`` is filled, saved and appended to
    ``document_chambre.other_pdfs``.
    """
    if not chambre_dico.get('Document(s) suivant(s)'):
        return

    groups_fr = document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)'])
    groups_nl = document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])

    for rows, rows_nl in zip(groups_fr, groups_nl):
        header = rows[0]
        logger.debug("add pdf %s" % clean_text(header.font.text))

        pdf = OtherDocumentChambrePdf()
        if header.a:
            pdf.url = header.a['href']
        else:
            # No link in the header row: fall back on the cell text.
            pdf.url = header.td.text
        pdf.type["fr"] = clean_text(header.font.text)
        pdf.type["nl"] = clean_text(rows_nl[0].font.text)
        pdf.distribution_date = rows[1]('td')[-1].text

        # Rows after the first two each describe one author.
        for author_row, author_row_nl in zip(rows[2:], rows_nl[2:]):
            if author_row.a:
                lachambre_id = re.search('key=(\d+)', author_row.a["href"]).groups()[0]
                deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                author = {"lachambre_id": deputy.lachambre_id,
                          "id": deputy.id,
                          "full_name": deputy.full_name}
            else:
                # Author without a link: store -1 ids and the raw name.
                author = {"lachambre_id": -1,
                          "id": -1,
                          "full_name": author_row('td')[-1].contents[2].strip()}
            # [1:-1] drops the first and last character of the role text
            author["role"] = {"fr": author_row('td')[-1].i.text[1:-1],
                              "nl": author_row_nl('td')[-1].i.text[1:-1]}
            pdf.authors.append(author)

        pdf.save()
        document_chambre.other_pdfs.append(pdf)
def _get_competences(dico, dico_nl, document):
    """Fill ``document.timeline`` and ``document.analysis``.

    The timeline is read from the head of the French competence section of
    ``dico`` and/or the "Bevoegdheid" section of ``dico_nl``:

    * both present: entries are paired positionally, the date comes from the
      French entry and both titles are stored;
    * only one present: the missing language's title is stored as ``""``.

    When neither is present ``document.timeline`` is left untouched.
    ``document.analysis`` is set only when ``dico`` has an
    "Analyse des interventions" section.
    """
    def dated_titles(section):
        # Every second child of the section head, split into [date, title].
        return [clean_text(node).split(u" \xa0 ", 1) for node in section["head"].contents[::2]]

    def add_entry(date, title_fr, title_nl):
        logger.debug("append time line %s %s %s" % (date, title_fr, title_nl))
        document.timeline.append(
            DocumentTimeLine.objects.create(title={"fr": title_fr, "nl": title_nl}, date=date))

    has_fr = dico.get(u"Compétence")
    has_nl = dico_nl.get(u"Bevoegdheid")

    if has_fr and has_nl:
        document.timeline = []
        entries_fr = dated_titles(dico[u"Compétence"])
        entries_nl = dated_titles(dico_nl[u"Bevoegdheid"])
        for (_date, _title), (_, _title_nl) in zip(entries_fr, entries_nl):
            add_entry(_date, _title, _title_nl)
    elif has_fr:
        document.timeline = []
        for (_date, _title) in dated_titles(dico[u"Compétence"]):
            add_entry(_date, _title, "")
    elif has_nl:
        document.timeline = []
        for (_date, _title_nl) in dated_titles(dico_nl[u"Bevoegdheid"]):
            add_entry(_date, "", _title_nl)

    if dico.get("Analyse des interventions"):
        document.analysis = get_or_create(
            Analysis,
            _id="lachambre_id",
            lachambre_id=dico["Analyse des interventions"]["head"].a.text,
            url=dico["Analyse des interventions"]["head"].a["href"],
        )
def _build_first_level(i, dico):
    """Store the second cell of row ``i`` in ``dico`` under the row's label.

    The label is the cleaned text of the row's first ``td``. Labels
    'Moniteur erratum' and 'Staatsblad erratum' accumulate their cells in a
    list; any other label may hold only one value.

    Raises ``Exception`` when a non-repeatable label already has a truthy
    value in ``dico``.
    """
    key = clean_text(i.td.text)

    if unicode(key) not in ('Moniteur erratum', 'Staatsblad erratum'):
        if dico.get(key):
            raise Exception("'%s' is already use as a key for '%s'" % (key, dico[key]))
        dico[key] = i('td')[1]
        return

    # Several "Moniteur erratum" rows can appear: collect them in a list.
    if not dico.get(key):
        dico[key] = []
    dico[key].append(i('td')[1])
def _build_pdf_sub_section(i, dico, sub_section):
    """Store the second cell of row ``i`` inside ``dico[sub_section]``.

    The key is the cleaned text of the row's first ``td``. The two
    joined-document labels (French and Dutch) accumulate their cells in a
    list; any other key may hold only one value.

    Raises ``Exception`` when a non-repeatable key already has a truthy
    value in the sub-section.

    NOTE(review): an identical definition of this function appears further
    down in the file; if both live in the same scope the later one wins.
    """
    key = clean_text(i.td.text)
    joined_labels = (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)')

    if unicode(key) in joined_labels:
        # We can have a list of joined documents.
        section = dico[sub_section]
        if not section.get(key):
            section[key] = []
        section[key].append(i('td')[1])
        return

    section = dico[sub_section]
    if section.get(key):
        raise Exception("'%s' is already use as a key in the sub_section '%s' for '%s'" % (key, sub_section, section[key]))
    section[key] = i('td')[1]
def document_pdf_part_cutter(soup):
    """Group the ``tr`` rows of ``soup`` into one list of rows per part.

    The first row always opens the first group. Each later row whose text is
    empty after ``clean_text`` is skipped; a row containing an ``img`` whose
    ``class`` equals ``"picto"`` closes the current group and opens a new
    one; every other row joins the current group.

    Returns the list of groups (each a list of rows).
    """
    rows = soup('tr')
    current = [rows[0]]
    groups = []

    for row in rows[1:]:
        if not clean_text(row.text):
            continue

        image = row.img
        opens_new_part = image and image.get("class") and image["class"] == "picto"
        if opens_new_part:
            groups.append(current)
            current = [row]
        else:
            current.append(row)

    # Flush the group that was still being filled.
    groups.append(current)
    return groups
def _build_pdf_sub_section(i, dico, sub_section):
    """Record row ``i`` under its label inside the ``sub_section`` of ``dico``.

    The label is the cleaned text of the first ``td`` and the stored value is
    the second ``td``. For the joined-document labels
    (u'Document(s) joint(s)/lié(s)' and its Dutch equivalent) values are
    appended to a list; for every other label a second truthy value is
    rejected with an ``Exception``.
    """
    key = clean_text(i.td.text)
    is_joined_documents = unicode(key) in (
        u'Document(s) joint(s)/lié(s)',
        u'Gekoppeld(e)/verbonden document(en)',
    )
    bucket = dico[sub_section]
    already_present = bucket.get(key)

    if is_joined_documents:
        # Joined documents may be listed several times: keep them all.
        if not already_present:
            bucket[key] = []
        bucket[key].append(i('td')[1])
    elif already_present:
        raise Exception(
            "'%s' is already use as a key in the sub_section '%s' for '%s'" % (key, sub_section, bucket[key]))
    else:
        bucket[key] = i('td')[1]
def _get_document_senat(dico, dico_nl, document):
    """Fill ``document.document_senat`` from the Senate document section.

    Returns immediately when ``dico`` has no Senate document entry.
    Otherwise a ``DocumentSenat`` is populated from the French section and
    from ``dico_nl[u"Document Senaat"]``, its main PDF record is created,
    each follow-up document is saved as an ``OtherDocumentSenatPdf`` and
    appended to ``other_pdfs``, and the object is saved before being
    assigned to ``document``.
    """
    if not dico.get(u"Document Sénat"):
        return

    section_fr = dico[u"Document Sénat"]
    section_nl = dico_nl[u"Document Senaat"]

    senat = DocumentSenat()
    senat.deposition_date = section_fr[u"Date de dépôt"].text
    senat.ending_date = get_text_else_blank(section_fr, u"Date de fin")
    senat.type["fr"] = section_fr[u"Type de document"].text
    senat.type["nl"] = section_nl[u"Document type"].text
    senat.comments["fr"] = get_text_else_blank(section_fr, u'Commentaire').split(' - ')
    senat.comments["nl"] = get_text_else_blank(section_nl, u'Commentaar').split(' - ')
    senat.author = clean_text(get_text_else_blank(section_fr, u"Auteur(s)"))
    senat.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    senat.status["nl"] = get_text_else_blank(section_nl, u'Status')

    # The section head is "<url or link><br /><type><br /><session>".
    head_fr = clean_text(str(section_fr[u'head']).replace(" ", ""))
    url, type_fr, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace(" ", ""))
    _, type_nl, _ = head_nl.split("<br />")
    if "href" in url:
        # Keep only the link target when the first part is an anchor.
        url = re.search('href="([^"]+)', url).group(1)
    senat.pdf = DocumentSenatPdf.objects.create(
        url=url,
        type={"fr": type_fr.strip(), "nl": type_nl.strip()},
        session=session.split()[-2],
    )

    if section_fr.get('Document(s) suivant(s)'):
        groups_fr = document_pdf_part_cutter(section_fr[u'Document(s) suivant(s)'])
        groups_nl = document_pdf_part_cutter(section_nl[u'Opvolgend(e) document(en)'])
        for rows, rows_nl in zip(groups_fr, groups_nl):
            header = rows[0]
            logger.debug("add pdf %s" % clean_text(header.font.text))

            pdf = OtherDocumentSenatPdf()
            if header.a:
                pdf.url = header.a['href']
            else:
                # No link in the header row: fall back on the cell text.
                pdf.url = header.td.text
            pdf.type["fr"] = clean_text(header.font.text)
            pdf.type["nl"] = clean_text(rows_nl[0].font.text)
            pdf.date = header('td')[-1].contents[0]

            # Rows after the header each describe one author.
            authors = []
            pdf.authors = authors
            for author_row, author_row_nl in zip(rows[1:], rows_nl[1:]):
                last_cell = author_row('td')[-1]
                authors.append({
                    "full_name": unicode(last_cell.contents[2]).strip(),
                    # [1:-1] drops the first and last character of the role
                    "role": {"fr": author_row('td')[-1].i.text[1:-1],
                             "nl": author_row_nl('td')[-1].i.text[1:-1]},
                })

            pdf.save()
            senat.other_pdfs.append(pdf)

    senat.save()
    document.document_senat = senat
def _get_first_level_data(dico, dico_nl, document):
    """Copy the top-level rows of a dossier onto ``document``.

    ``dico`` holds the rows of the French page and ``dico_nl`` the rows of
    the Dutch page. French values are stored under ``"fr"`` and Dutch values
    under ``"nl"`` of the corresponding ``document`` attribute. Optional rows
    are copied only when present.

    Each Dutch-labelled row is guarded with ``dico_nl.get`` so that the
    presence test and the read are made on the same dictionary.
    """
    def pipe_separated(section):
        # The section head holds one "a | b | c" string; return the items.
        return [item.strip() for item in section["head"].text.split("|")]

    def stripped_nodes(cell):
        # Children of the cell, minus "<br>" separators, stripped.
        return [node.strip() for node in cell.contents if node != "<br>"]

    document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
    document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
    document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))

    if dico.get("Descripteur Eurovoc principal"):
        document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
    if dico_nl.get("Eurovoc-hoofddescriptor"):
        document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text

    document.vote_date = get_text_else_blank(dico, "Vote Chambre")
    document.law_date = get_text_else_blank(dico, "Date de la loi")
    document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
    document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
    document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
    document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

    # The progress cell holds the Chambre status first and, when the cell
    # has at least three children, the Senate status as its third child.
    if dico.get("Etat d'avancement"):
        progress_fr = dico["Etat d'avancement"]
        document.status_chambre["fr"] = clean_text(progress_fr.contents[0])
        document.status_senat["fr"] = clean_text(progress_fr.contents[2]) if len(progress_fr) >= 3 else None
    if dico_nl.get("Stand van zaken"):
        progress_nl = dico_nl["Stand van zaken"]
        document.status_chambre["nl"] = clean_text(progress_nl.contents[0])
        document.status_senat["nl"] = clean_text(progress_nl.contents[2]) if len(progress_nl) >= 3 else None

    if dico.get("Descripteurs Eurovoc"):
        document.eurovoc_descriptors["fr"] = pipe_separated(dico["Descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc descriptoren"):
        document.eurovoc_descriptors["nl"] = pipe_separated(dico_nl["Eurovoc descriptoren"])

    if dico.get("Candidats-descripteurs Eurovoc"):
        document.eurovoc_candidats_descriptors["fr"] = pipe_separated(dico["Candidats-descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc kandidaat-descriptoren"):
        document.eurovoc_candidats_descriptors["nl"] = pipe_separated(dico_nl["Eurovoc kandidaat-descriptoren"])

    if dico.get(u"Mots-clés libres"):
        document.keywords["fr"] = pipe_separated(dico[u"Mots-clés libres"])
    if dico_nl.get(u"Vrije trefwoorden"):
        document.keywords["nl"] = pipe_separated(dico_nl[u"Vrije trefwoorden"])

    if dico.get("Documents principaux"):
        document.main_docs["fr"] = stripped_nodes(dico["Documents principaux"])
    # NOTE(review): "Hoodfdocumenten" looks like a misspelling of
    # "Hoofddocumenten"; the key is kept as-is -- confirm against the label
    # actually used on the Dutch page.
    if dico_nl.get("Hoodfdocumenten"):
        document.main_docs["nl"] = stripped_nodes(dico_nl["Hoodfdocumenten"])