Example #1
0
def _save_a_written_question(link):
    """Download one written question (French and Dutch pages) and store it.

    ``link`` must expose ``link.a["href"]``.  The href has to contain a
    ``dossierID=<id>.xml`` fragment; when it does not, ``re.search`` returns
    ``None`` and the ``.groups()`` call raises ``AttributeError``.

    The record is persisted through ``get_or_create`` keyed on
    ``lachambre_id``.  Nothing is returned.
    """
    href = link.a["href"]
    # The dossier id serves both as the download label and as the lookup key,
    # so extract it once instead of running the same regex twice.
    lachambre_id = re.search("dossierID=([0-9A-Z-]+).xml", href).groups()[0]

    soupsoup, suppesuppe = read_or_dl_with_nl(LACHAMBRE_PREFIX + href, "written question %s" % lachambre_id)

    # One entry per table row whose first cell has text:
    # first-cell text -> second <td> element.
    data = AccessControlDict(((x.td.text, x('td')[1]) for x in soupsoup.find('table', 'txt')('tr') if x.td.text))
    data_nl = AccessControlDict(((x.td.text, x('td')[1]) for x in suppesuppe.find('table', 'txt')('tr') if x.td.text))

    get_or_create(WrittenQuestion,
                  _id="lachambre_id",
                  lachambre_id=lachambre_id,
                  title={"fr": data["Titre"].text, "nl": data_nl["Titel"].text},
                  departement={"fr": data[u"Département"].text, "nl": data_nl[u"Departement"].text},
                  sub_departement={"fr": data[u"Sous-département"].text, "nl": data_nl[u"Sub-departement"].text},
                  deposition_date=data[u"Date de dépôt"].text,
                  delay_date=dico_get_text(data, u"Date de délai"),
                  publication_date=dico_get_text(data, "Date publication"),
                  # TODO: link to the actual deputy
                  author=data[u"Auteur"].text,
                  language=data[u"Langue"].text,
                  question_status={"fr": dico_get_text(data, "Statut question"), "nl": dico_get_text(data_nl, "Status vraag")},
                  status={"fr": dico_get_text(data, "Statut"), "nl": dico_get_text(data_nl, "Status")},
                  # Both languages are rendered with a unicode format string so
                  # the stored markup has the same type for "fr" and "nl".
                  question={"fr": u"%s" % data["Question"], "nl": u"%s" % data_nl["Vraag"]},
                  answer={"fr": dico_get_text(data, u"Réponse"), "nl": dico_get_text(data_nl, u"Antwoord")},
                  publication_reponse_pdf_url=get_href_else_blank(data, u"Publication réponse"),
                  publication_question_pdf_url=get_href_else_blank(data, u"Publication question"),
                  publication_reponse=get_text_else_blank(data, u"Publication réponse"),
                  publication_question=get_text_else_blank(data, u"Publication question"),
                  eurovoc_descriptors={"fr": get_items_list_else_empty_list(data, "Descripteurs Eurovoc"),
                                       "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-descriptoren")},
                  eurovoc_candidats_descriptors={"fr": get_items_list_else_empty_list(data, "Candidats-descripteurs Eurovoc"),
                                                 "nl": get_items_list_else_empty_list(data_nl, "Eurovoc kandidaat-descriptoren")},
                  keywords={"fr": get_items_list_else_empty_list(data, u"Mots-clés libres"),
                            "nl": get_items_list_else_empty_list(data_nl, u"Vrije trefwoorden")},
                  url=href,
                 )

    # Only the French dict is checked here.
    # NOTE(review): presumably this fails loudly when the page carried fields
    # that were never read above -- confirm against AccessControlDict.
    data.die_if_got_not_accessed_keys()
Example #2
0
def _get_document_chambre(dico, dico_nl, document):
    """Create the DocumentChambre of ``document`` from the "Document Chambre" section.

    Returns immediately, leaving ``document`` untouched, when ``dico`` has no
    truthy "Document Chambre" entry.  Otherwise the French section and its
    Dutch counterpart ("Document Kamer" in ``dico_nl``) are copied onto a new
    ``DocumentChambre``, which is saved and then assigned to
    ``document.document_chambre``.
    """
    if not dico.get("Document Chambre"):
        return

    section_fr = dico['Document Chambre']
    section_nl = dico_nl['Document Kamer']

    chambre = DocumentChambre()
    chambre.deposition_date = get_text_else_blank(section_fr, u'Date de dépôt')
    chambre.type["fr"] = section_fr[u'Type de document'].text
    chambre.type["nl"] = section_nl[u'Document type'].text
    chambre.taken_in_account_date = get_text_else_blank(section_fr, u'Prise en considération')
    chambre.distribution_date = get_text_else_blank(section_fr, u'Date de distribution')
    chambre.sending_date = get_text_else_blank(section_fr, u'Date d\'envoi')
    chambre.ending_date = get_text_else_blank(section_fr, u'Date de fin')
    chambre.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    chambre.status["nl"] = get_text_else_blank(section_nl, u'Status')
    chambre.comments["fr"] = get_text_else_blank(section_fr, u'Commentaire').split(' ')
    chambre.comments["nl"] = get_text_else_blank(section_nl, u'Commentaar').split(' ')

    _get_authors(section_fr, section_nl, chambre)

    # The stringified 'head' entry must split on "<br />" into exactly three
    # parts (url, type, session); any other count raises ValueError below.
    head_fr = clean_text(str(section_fr[u'head']).replace("&#160;", ""))
    url, tipe, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace("&#160;", ""))
    _, tipe_nl, _ = head_nl.split("<br />")

    # When the first part still carries a link, keep only its href value.
    if "href" in url:
        url = re.search('href="([^"]+)', url).groups()[0]

    pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
    # The session value is the second-to-last whitespace-separated token.
    session_token = session.split()[-2]
    chambre.pdf = DocumentChambrePdf.objects.create(url=url, type=pdf_type, session=session_token)

    _get_next_documents(section_fr, section_nl, chambre)

    if section_fr.get(u'Document(s) joint(s)/lié(s)'):
        joint_pdfs = []
        paired = zip(section_fr[u'Document(s) joint(s)/lié(s)'],
                     section_nl[u'Gekoppeld(e)/verbonden document(en)'])
        for item_fr, item_nl in paired:
            # [1:-1] drops the first and last character of the leading text node.
            joint_pdfs.append({
                "url": item_fr.a["href"],
                "title": {"fr": item_fr.contents[0][1:-1], "nl": item_nl.contents[0][1:-1]},
            })
        chambre.joint_pdfs = joint_pdfs

    chambre.save()
    document.document_chambre = chambre
Example #3
0
def _get_document_chambre(dico, dico_nl, document):
    """Create the DocumentChambre of ``document`` from the "Document Chambre" section.

    Returns immediately, leaving ``document`` untouched, when ``dico`` has no
    truthy "Document Chambre" entry.  Otherwise the French section and its
    Dutch counterpart ("Document Kamer" in ``dico_nl``) are copied onto a new
    ``DocumentChambre``, which is saved and then assigned to
    ``document.document_chambre``.
    """
    if not dico.get("Document Chambre"):
        return

    section_fr = dico['Document Chambre']
    section_nl = dico_nl['Document Kamer']

    chambre = DocumentChambre()
    chambre.deposition_date = get_text_else_blank(section_fr, u'Date de dépôt')
    chambre.type["fr"] = section_fr[u'Type de document'].text
    chambre.type["nl"] = section_nl[u'Document type'].text
    chambre.taken_in_account_date = get_text_else_blank(section_fr, u'Prise en considération')
    chambre.distribution_date = get_text_else_blank(section_fr, u'Date de distribution')
    chambre.sending_date = get_text_else_blank(section_fr, u'Date d\'envoi')
    chambre.ending_date = get_text_else_blank(section_fr, u'Date de fin')
    chambre.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    chambre.status["nl"] = get_text_else_blank(section_nl, u'Status')
    chambre.comments["fr"] = get_text_else_blank(section_fr, u'Commentaire').split(' ')
    chambre.comments["nl"] = get_text_else_blank(section_nl, u'Commentaar').split(' ')

    _get_authors(section_fr, section_nl, chambre)

    # The stringified 'head' entry must split on "<br />" into exactly three
    # parts (url, type, session); any other count raises ValueError below.
    head_fr = clean_text(str(section_fr[u'head']).replace("&#160;", ""))
    url, tipe, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace("&#160;", ""))
    _, tipe_nl, _ = head_nl.split("<br />")

    # When the first part still carries a link, keep only its href value.
    if "href" in url:
        url = re.search('href="([^"]+)', url).groups()[0]

    pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
    # The session value is the second-to-last whitespace-separated token.
    session_token = session.split()[-2]
    chambre.pdf = DocumentChambrePdf.objects.create(url=url, type=pdf_type, session=session_token)

    _get_next_documents(section_fr, section_nl, chambre)

    if section_fr.get(u'Document(s) joint(s)/lié(s)'):
        joint_pdfs = []
        paired = zip(section_fr[u'Document(s) joint(s)/lié(s)'],
                     section_nl[u'Gekoppeld(e)/verbonden document(en)'])
        for item_fr, item_nl in paired:
            # [1:-1] drops the first and last character of the leading text node.
            joint_pdfs.append({
                "url": item_fr.a["href"],
                "title": {"fr": item_fr.contents[0][1:-1], "nl": item_nl.contents[0][1:-1]},
            })
        chambre.joint_pdfs = joint_pdfs

    chambre.save()
    document.document_chambre = chambre
Example #4
0
def _get_first_level_data(dico, dico_nl, document):
    """Copy the top-level fields of a parsed dossier onto ``document``.

    ``dico`` holds the French page keyed by French labels and ``dico_nl`` the
    Dutch page keyed by Dutch labels.  Both are used through ``.get(key)`` and
    ``[key]``.  ``document`` is mutated in place; nothing is returned and the
    document is not saved here.

    Optional sections are only copied when their key is present (and truthy)
    in the dict of the matching language.
    """
    def _pipe_separated(entry):
        # "a | b | c" in the entry's head text -> ["a", "b", "c"]
        return [item.strip() for item in entry["head"].text.split("|")]

    def _stripped_contents(entry):
        # Keep every child that is not the literal "<br>" string, stripped.
        return [item.strip() for item in entry.contents if item != "<br>"]

    document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
    document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
    document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))
    if dico.get("Descripteur Eurovoc principal"):
        document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
    # Dutch keys only exist in dico_nl, so the guard must look there too.
    if dico_nl.get("Eurovoc-hoofddescriptor"):
        document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text
    document.vote_date = get_text_else_blank(dico, "Vote Chambre")
    document.law_date = get_text_else_blank(dico, "Date de la loi")
    document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
    document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
    document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
    document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

    # The status entry's first child is the chambre status; the third child,
    # when there are at least three, is the senat status.
    if dico.get("Etat d'avancement"):
        document.status_chambre["fr"] = clean_text(dico["Etat d'avancement"].contents[0])
        document.status_senat["fr"] = clean_text(dico["Etat d'avancement"].contents[2]) if len(dico["Etat d'avancement"]) >= 3 else None
    if dico_nl.get("Stand van zaken"):
        document.status_chambre["nl"] = clean_text(dico_nl["Stand van zaken"].contents[0])
        document.status_senat["nl"] = clean_text(dico_nl["Stand van zaken"].contents[2]) if len(dico_nl["Stand van zaken"]) >= 3 else None

    if dico.get("Descripteurs Eurovoc"):
        document.eurovoc_descriptors["fr"] = _pipe_separated(dico["Descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc descriptoren"):
        document.eurovoc_descriptors["nl"] = _pipe_separated(dico_nl["Eurovoc descriptoren"])
    if dico.get("Candidats-descripteurs Eurovoc"):
        document.eurovoc_candidats_descriptors["fr"] = _pipe_separated(dico["Candidats-descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc kandidaat-descriptoren"):
        document.eurovoc_candidats_descriptors["nl"] = _pipe_separated(dico_nl["Eurovoc kandidaat-descriptoren"])
    if dico.get(u"Mots-clés libres"):
        document.keywords["fr"] = _pipe_separated(dico[u"Mots-clés libres"])
    if dico_nl.get(u"Vrije trefwoorden"):
        document.keywords["nl"] = _pipe_separated(dico_nl[u"Vrije trefwoorden"])
    if dico.get("Documents principaux"):
        document.main_docs["fr"] = _stripped_contents(dico["Documents principaux"])
    # NOTE(review): "Hoodfdocumenten" looks like a misspelling of
    # "Hoofddocumenten"; the key is kept as-is until the label used by the
    # Dutch page is confirmed.
    if dico_nl.get("Hoodfdocumenten"):
        document.main_docs["nl"] = _stripped_contents(dico_nl["Hoodfdocumenten"])
Example #5
0
def _get_first_level_data(dico, dico_nl, document):
    """Copy the top-level fields of a parsed dossier onto ``document``.

    ``dico`` holds the French page keyed by French labels and ``dico_nl`` the
    Dutch page keyed by Dutch labels.  Both are used through ``.get(key)`` and
    ``[key]``.  ``document`` is mutated in place; nothing is returned and the
    document is not saved here.

    Optional sections are only copied when their key is present (and truthy)
    in the dict of the matching language.
    """
    def _pipe_separated(entry):
        # "a | b | c" in the entry's head text -> ["a", "b", "c"]
        return [item.strip() for item in entry["head"].text.split("|")]

    def _stripped_contents(entry):
        # Keep every child that is not the literal "<br>" string, stripped.
        return [item.strip() for item in entry.contents if item != "<br>"]

    document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
    document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
    document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))
    if dico.get("Descripteur Eurovoc principal"):
        document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
    # Dutch keys only exist in dico_nl, so the guard must look there too.
    if dico_nl.get("Eurovoc-hoofddescriptor"):
        document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text
    document.vote_date = get_text_else_blank(dico, "Vote Chambre")
    document.law_date = get_text_else_blank(dico, "Date de la loi")
    document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
    document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
    document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
    document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

    # The status entry's first child is the chambre status; the third child,
    # when there are at least three, is the senat status.
    if dico.get("Etat d'avancement"):
        document.status_chambre["fr"] = clean_text(dico["Etat d'avancement"].contents[0])
        document.status_senat["fr"] = clean_text(dico["Etat d'avancement"].contents[2]) if len(dico["Etat d'avancement"]) >= 3 else None
    if dico_nl.get("Stand van zaken"):
        document.status_chambre["nl"] = clean_text(dico_nl["Stand van zaken"].contents[0])
        document.status_senat["nl"] = clean_text(dico_nl["Stand van zaken"].contents[2]) if len(dico_nl["Stand van zaken"]) >= 3 else None

    if dico.get("Descripteurs Eurovoc"):
        document.eurovoc_descriptors["fr"] = _pipe_separated(dico["Descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc descriptoren"):
        document.eurovoc_descriptors["nl"] = _pipe_separated(dico_nl["Eurovoc descriptoren"])
    if dico.get("Candidats-descripteurs Eurovoc"):
        document.eurovoc_candidats_descriptors["fr"] = _pipe_separated(dico["Candidats-descripteurs Eurovoc"])
    if dico_nl.get("Eurovoc kandidaat-descriptoren"):
        document.eurovoc_candidats_descriptors["nl"] = _pipe_separated(dico_nl["Eurovoc kandidaat-descriptoren"])
    if dico.get(u"Mots-clés libres"):
        document.keywords["fr"] = _pipe_separated(dico[u"Mots-clés libres"])
    if dico_nl.get(u"Vrije trefwoorden"):
        document.keywords["nl"] = _pipe_separated(dico_nl[u"Vrije trefwoorden"])
    if dico.get("Documents principaux"):
        document.main_docs["fr"] = _stripped_contents(dico["Documents principaux"])
    # NOTE(review): "Hoodfdocumenten" looks like a misspelling of
    # "Hoofddocumenten"; the key is kept as-is until the label used by the
    # Dutch page is confirmed.
    if dico_nl.get("Hoodfdocumenten"):
        document.main_docs["nl"] = _stripped_contents(dico_nl["Hoodfdocumenten"])
Example #6
0
def _get_document_senat(dico, dico_nl, document):
    """Create the DocumentSenat of ``document`` from the "Document Sénat" section.

    Returns immediately, leaving ``document`` untouched, when ``dico`` has no
    truthy "Document Sénat" entry.  Otherwise the French section and its Dutch
    counterpart ("Document Senaat" in ``dico_nl``) are copied onto a new
    ``DocumentSenat``, including its main pdf and any follow-up pdfs.  The
    result is saved and then assigned to ``document.document_senat``.
    """
    if not dico.get(u"Document Sénat"):
        return

    section_fr = dico[u"Document Sénat"]
    section_nl = dico_nl[u"Document Senaat"]

    senat = DocumentSenat()
    senat.deposition_date = section_fr[u"Date de dépôt"].text
    senat.ending_date = get_text_else_blank(section_fr, u"Date de fin")
    senat.type["fr"] = section_fr[u"Type de document"].text
    senat.type["nl"] = section_nl[u"Document type"].text
    senat.comments["fr"] = get_text_else_blank(section_fr, u'Commentaire').split(' - ')
    senat.comments["nl"] = get_text_else_blank(section_nl, u'Commentaar').split(' - ')
    senat.author = clean_text(get_text_else_blank(section_fr, u"Auteur(s)"))
    senat.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    senat.status["nl"] = get_text_else_blank(section_nl, u'Status')

    # The stringified 'head' entry must split on "<br />" into exactly three
    # parts (url, type, session); any other count raises ValueError below.
    head_fr = clean_text(str(section_fr[u'head']).replace("&#160;", ""))
    url, tipe, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace("&#160;", ""))
    _, tipe_nl, _ = head_nl.split("<br />")

    # When the first part still carries a link, keep only its href value.
    if "href" in url:
        url = re.search('href="([^"]+)', url).groups()[0]

    pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
    # The session value is the second-to-last whitespace-separated token.
    session_token = session.split()[-2]
    senat.pdf = DocumentSenatPdf.objects.create(url=url, type=pdf_type, session=session_token)

    if senat_has_followups(section_fr) if False else section_fr.get('Document(s) suivant(s)'):
        parts_fr = document_pdf_part_cutter(section_fr[u'Document(s) suivant(s)'])
        parts_nl = document_pdf_part_cutter(section_nl[u'Opvolgend(e) document(en)'])
        for part_fr, part_nl in zip(parts_fr, parts_nl):
            # The first element describes the pdf; the remaining ones its authors.
            header_fr = part_fr[0]
            logger.debug("add pdf %s" % clean_text(header_fr.font.text))

            other_pdf = OtherDocumentSenatPdf()
            if header_fr.a:
                other_pdf.url = header_fr.a['href']
            else:
                other_pdf.url = header_fr.td.text
            other_pdf.type["fr"] = clean_text(header_fr.font.text)
            other_pdf.type["nl"] = clean_text(part_nl[0].font.text)
            other_pdf.date = header_fr('td')[-1].contents[0]

            other_pdf.authors = []
            for row_fr, row_nl in zip(part_fr[1:], part_nl[1:]):
                full_name = unicode(row_fr('td')[-1].contents[2]).strip()
                # [1:-1] drops the first and last character of the <i> text.
                role = {"fr": row_fr('td')[-1].i.text[1:-1], "nl": row_nl('td')[-1].i.text[1:-1]}
                other_pdf.authors.append({"full_name": full_name, "role": role})

            other_pdf.save()
            senat.other_pdfs.append(other_pdf)

    senat.save()
    document.document_senat = senat
Example #7
0
def _get_document_senat(dico, dico_nl, document):
    """Create the DocumentSenat of ``document`` from the "Document Sénat" section.

    Returns immediately, leaving ``document`` untouched, when ``dico`` has no
    truthy "Document Sénat" entry.  Otherwise the French section and its Dutch
    counterpart ("Document Senaat" in ``dico_nl``) are copied onto a new
    ``DocumentSenat``, including its main pdf and any follow-up pdfs.  The
    result is saved and then assigned to ``document.document_senat``.
    """
    if not dico.get(u"Document Sénat"):
        return

    section_fr = dico[u"Document Sénat"]
    section_nl = dico_nl[u"Document Senaat"]

    senat = DocumentSenat()
    senat.deposition_date = section_fr[u"Date de dépôt"].text
    senat.ending_date = get_text_else_blank(section_fr, u"Date de fin")
    senat.type["fr"] = section_fr[u"Type de document"].text
    senat.type["nl"] = section_nl[u"Document type"].text
    senat.comments["fr"] = get_text_else_blank(section_fr, u'Commentaire').split(' - ')
    senat.comments["nl"] = get_text_else_blank(section_nl, u'Commentaar').split(' - ')
    senat.author = clean_text(get_text_else_blank(section_fr, u"Auteur(s)"))
    senat.status["fr"] = get_text_else_blank(section_fr, u'Statut')
    senat.status["nl"] = get_text_else_blank(section_nl, u'Status')

    # The stringified 'head' entry must split on "<br />" into exactly three
    # parts (url, type, session); any other count raises ValueError below.
    head_fr = clean_text(str(section_fr[u'head']).replace("&#160;", ""))
    url, tipe, session = head_fr.split("<br />")
    head_nl = clean_text(str(section_nl[u'head']).replace("&#160;", ""))
    _, tipe_nl, _ = head_nl.split("<br />")

    # When the first part still carries a link, keep only its href value.
    if "href" in url:
        url = re.search('href="([^"]+)', url).groups()[0]

    pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
    # The session value is the second-to-last whitespace-separated token.
    session_token = session.split()[-2]
    senat.pdf = DocumentSenatPdf.objects.create(url=url, type=pdf_type, session=session_token)

    if section_fr.get('Document(s) suivant(s)'):
        parts_fr = document_pdf_part_cutter(section_fr[u'Document(s) suivant(s)'])
        parts_nl = document_pdf_part_cutter(section_nl[u'Opvolgend(e) document(en)'])
        for part_fr, part_nl in zip(parts_fr, parts_nl):
            # The first element describes the pdf; the remaining ones its authors.
            header_fr = part_fr[0]
            logger.debug("add pdf %s" % clean_text(header_fr.font.text))

            other_pdf = OtherDocumentSenatPdf()
            if header_fr.a:
                other_pdf.url = header_fr.a['href']
            else:
                other_pdf.url = header_fr.td.text
            other_pdf.type["fr"] = clean_text(header_fr.font.text)
            other_pdf.type["nl"] = clean_text(part_nl[0].font.text)
            other_pdf.date = header_fr('td')[-1].contents[0]

            other_pdf.authors = []
            for row_fr, row_nl in zip(part_fr[1:], part_nl[1:]):
                full_name = unicode(row_fr('td')[-1].contents[2]).strip()
                # [1:-1] drops the first and last character of the <i> text.
                role = {"fr": row_fr('td')[-1].i.text[1:-1], "nl": row_nl('td')[-1].i.text[1:-1]}
                other_pdf.authors.append({"full_name": full_name, "role": role})

            other_pdf.save()
            senat.other_pdfs.append(other_pdf)

    senat.save()
    document.document_senat = senat