コード例 #1
0
    def _get_document_chambre(dico, dico_nl, document):
        """Build the Chambre sub-document and attach it to ``document``.

        Returns immediately when ``dico`` has no truthy "Document Chambre"
        entry. Otherwise a ``DocumentChambre`` is filled from the French
        section (``dico``) and its Dutch counterpart (``dico_nl``), a
        ``DocumentChambrePdf`` is created for the main PDF, the sub-document
        is saved and stored on ``document.document_chambre``.

        :param dico: French page data, indexed like a mapping.
        :param dico_nl: Dutch page data, indexed like a mapping.
        :param document: object that receives ``document_chambre``.
        """
        if not dico.get("Document Chambre"):
            return

        fr = dico['Document Chambre']
        nl = dico_nl['Document Kamer']

        chambre = DocumentChambre()
        chambre.deposition_date = get_text_else_blank(fr, u'Date de dépôt')
        chambre.type["fr"] = fr[u'Type de document'].text
        chambre.type["nl"] = nl[u'Document type'].text
        chambre.taken_in_account_date = get_text_else_blank(fr, u'Prise en considération')
        chambre.distribution_date = get_text_else_blank(fr, u'Date de distribution')
        chambre.sending_date = get_text_else_blank(fr, u'Date d\'envoi')
        chambre.ending_date = get_text_else_blank(fr, u'Date de fin')
        chambre.status["fr"] = get_text_else_blank(fr, u'Statut')
        chambre.status["nl"] = get_text_else_blank(nl, u'Status')

        comment_fr = get_text_else_blank(fr, u'Commentaire')
        chambre.comments["fr"] = comment_fr.split(' ')
        comment_nl = get_text_else_blank(nl, u'Commentaar')
        chambre.comments["nl"] = comment_nl.split(' ')

        Document._get_authors(fr, nl, chambre)

        # The section head must split into exactly three "<br />"-separated
        # parts (url, type, session); any other count raises ValueError.
        head_fr = clean_text(str(fr[u'head']).replace("&#160;", ""))
        url, tipe, session = head_fr.split("<br />")
        head_nl = clean_text(str(nl[u'head']).replace("&#160;", ""))
        _url_nl, tipe_nl, _session_nl = head_nl.split("<br />")

        # When the first part still contains markup, keep only the link target.
        if "href" in url:
            url = re.search('href="([^"]+)', url).group(1)

        pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
        # The stored session is the second-to-last whitespace-separated token.
        session_token = session.split()[-2]
        chambre.pdf = DocumentChambrePdf.objects.create(url=url, type=pdf_type, session=session_token)

        Document._get_next_documents(fr, nl, chambre)

        if fr.get(u'Document(s) joint(s)/lié(s)'):
            # French and Dutch entries are paired positionally; zip stops at
            # the shorter of the two sequences.
            pairs = zip(fr[u'Document(s) joint(s)/lié(s)'],
                        nl[u'Gekoppeld(e)/verbonden document(en)'])
            joint_pdfs = []
            for joint_fr, joint_nl in pairs:
                joint_pdfs.append({
                    "url": joint_fr.a["href"],
                    # [1:-1] drops the first and last character of the title
                    # node (presumably surrounding delimiters -- TODO confirm).
                    "title": {"fr": joint_fr.contents[0][1:-1], "nl": joint_nl.contents[0][1:-1]},
                })
            chambre.joint_pdfs = joint_pdfs

        chambre.save()
        document.document_chambre = chambre
コード例 #2
0
    def fetch_one(klass, link, cache=False, sync=False):
        """Scrape one written question (French + Dutch pages) and upsert it.

        The page for ``link.a["href"]`` is fetched in both languages through
        ``scraper.get_with_nl``; the rows of the ``table.txt`` element are
        turned into label -> cell mappings and passed to ``update_or_create``
        for ``WrittenQuestion``, keyed on ``lachambre_id``.

        :param klass: class object (first argument of the method); not used
            in this body.
        :param link: element whose ``.a["href"]`` is the relative question URL.
        :param cache: accepted for interface compatibility; not used here.
        :param sync: accepted for interface compatibility; not used here.
        :raises AttributeError: if the href does not match ``DOSSIER_ID_REGEX``
            (``re.search`` returns ``None``).
        """
        import logging

        href = link.a["href"]
        # Extract the dossier id once; it is used for the fetch label and as
        # the record key.
        lachambre_id = re.search(DOSSIER_ID_REGEX, href).groups()[0]

        def labelled_cells(soup):
            # One (label, value cell) pair per table row that has a non-empty
            # first cell.
            return ((row.td.text.strip(), row('td')[1]) for row in soup.find('table', 'txt')('tr') if row.td.text)

        soupsoup, suppesuppe = scraper.get_with_nl(LACHAMBRE_PREFIX + href, "written question %s" % lachambre_id)
        data = AccessControlDict(labelled_cells(soupsoup))
        data_nl = AccessControlDict(labelled_cells(suppesuppe))

        # Diagnostic output goes through logging instead of stdout.
        log = logging.getLogger(__name__)
        log.debug("written question %s keys (fr): %s", lachambre_id, sorted(data.keys()))
        log.debug("written question %s keys (nl): %s", lachambre_id, sorted(data_nl.keys()))

        update_or_create(WrittenQuestion,
                      _id="lachambre_id",
                      lachambre_id=lachambre_id,
                      title={"fr": data["Titre"].text, "nl": data_nl["Titel"].text},
                      departement={"fr": data[u"D\xe9partement"].text, "nl": data_nl[u"Departement"].text},
                      sub_departement={"fr": data[u"Sous-d\xe9partement"].text, "nl": data_nl[u"Sub-departement"].text},
                      deposition_date=data[u"Date de d\xe9p\xf4t"].text,
                      delay_date=dico_get_text(data, u"Date de d\xe9lai"),
                      publication_date=dico_get_text(data, "Date publication"),
                      # TODO: link to the actual deputy
                      author=data[u"Auteur"].text,
                      language=data[u"Langue"].text,
                      question_status={"fr": dico_get_text(data, "Statut question"), "nl": dico_get_text(data_nl, "Status vraag")},
                      status={"fr": dico_get_text(data, "Statut"), "nl": dico_get_text(data_nl, "Status")},
                      # Both languages are rendered with a unicode format
                      # string so they are stored as the same string type
                      # (a plain "%s" goes through str() on Python 2).
                      question={"fr": u"%s" % data["Question"], "nl": u"%s" % data_nl["Vraag"]},
                      answer={"fr": dico_get_text(data, u"R\xe9ponse"), "nl": dico_get_text(data_nl, u"Antwoord")},
                      publication_reponse_pdf_url=get_href_else_blank(data, u"Publication r\xe9ponse"),
                      publication_question_pdf_url=get_href_else_blank(data, u"Publication question"),
                      publication_reponse=get_text_else_blank(data, u"Publication r\xe9ponse"),
                      publication_question=get_text_else_blank(data, u"Publication question"),
                      eurovoc_descriptors={"fr": get_items_list_else_empty_list(data, "Descripteurs Eurovoc"),
                                           "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-descriptoren")},
                      eurovoc_principal_descriptors={"fr": get_items_list_else_empty_list(data, "Desc. Eurovoc principal"),
                                           "nl": get_items_list_else_empty_list(data_nl, "Eurovoc-hoofddescriptor")},
                      eurovoc_candidats_descriptors={"fr": get_items_list_else_empty_list(data, "Candidats-descripteurs Eurovoc"),
                                                     "nl": get_items_list_else_empty_list(data_nl, "Eurovoc kandidaat-descriptoren")},
                      keywords={"fr": get_items_list_else_empty_list(data, u"Mots-cl\xe9s libres"),
                                "nl": get_items_list_else_empty_list(data_nl, u"Vrije trefwoorden")},
                      url=href,
                         )

        # Only the French mapping is checked for unread keys; judging by the
        # method name this aborts when the page carried a label the scraper
        # did not consume -- confirm against AccessControlDict.
        data.die_if_got_not_accessed_keys()
コード例 #3
0
    def _get_first_level_data(dico, dico_nl, document):
        """Copy the top-level fields of a dossier page onto ``document``.

        Dates and numbers are read from the French data only; bilingual
        fields are stored under the ``"fr"`` / ``"nl"`` keys of the matching
        ``document`` attribute. Optional entries are only written when the
        corresponding key is present (and truthy) in the mapping for that
        language.

        :param dico: French page data, indexed like a mapping.
        :param dico_nl: Dutch page data, indexed like a mapping.
        :param document: object whose attributes are filled in place.
        """
        def pipe_separated(entry):
            # "a | b | c" in the entry head -> ["a", "b", "c"]
            return [item.strip() for item in entry["head"].text.split("|")]

        def stripped_contents(tag):
            # Child nodes of the tag, skipping any equal to "<br>".
            return [node.strip() for node in tag.contents if node != "<br>"]

        document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
        document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
        document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))
        if dico.get("Descripteur Eurovoc principal"):
            document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
        # Dutch keys are looked up in dico_nl: testing them against the French
        # mapping would leave every "nl" branch below unreachable.
        if dico_nl.get("Eurovoc-hoofddescriptor"):
            document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text
        document.vote_date = get_text_else_blank(dico, "Vote Chambre")
        document.law_date = get_text_else_blank(dico, "Date de la loi")
        document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
        document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
        document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
        document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

        if dico.get("Etat d'avancement"):
            progress = dico["Etat d'avancement"]
            document.status_chambre["fr"] = clean_text(progress.contents[0])
            # The Senat status is the third child node, when there is one.
            document.status_senat["fr"] = clean_text(progress.contents[2]) if len(progress) >= 3 else None
        if dico_nl.get("Stand van zaken"):
            progress_nl = dico_nl["Stand van zaken"]
            document.status_chambre["nl"] = clean_text(progress_nl.contents[0])
            document.status_senat["nl"] = clean_text(progress_nl.contents[2]) if len(progress_nl) >= 3 else None

        if dico.get("Descripteurs Eurovoc"):
            document.eurovoc_descriptors["fr"] = pipe_separated(dico["Descripteurs Eurovoc"])
        if dico_nl.get("Eurovoc descriptoren"):
            document.eurovoc_descriptors["nl"] = pipe_separated(dico_nl["Eurovoc descriptoren"])
        if dico.get("Candidats-descripteurs Eurovoc"):
            document.eurovoc_candidats_descriptors["fr"] = pipe_separated(dico["Candidats-descripteurs Eurovoc"])
        if dico_nl.get("Eurovoc kandidaat-descriptoren"):
            document.eurovoc_candidats_descriptors["nl"] = pipe_separated(dico_nl["Eurovoc kandidaat-descriptoren"])
        if dico.get(u"Mots-clés libres"):
            document.keywords["fr"] = pipe_separated(dico[u"Mots-clés libres"])
        if dico_nl.get(u"Vrije trefwoorden"):
            document.keywords["nl"] = pipe_separated(dico_nl[u"Vrije trefwoorden"])
        if dico.get("Documents principaux"):
            document.main_docs["fr"] = stripped_contents(dico["Documents principaux"])
        # NOTE(review): "Hoodfdocumenten" looks like a misspelling of
        # "Hoofddocumenten"; kept verbatim until the label used by the Dutch
        # page is confirmed.
        if dico_nl.get("Hoodfdocumenten"):
            document.main_docs["nl"] = stripped_contents(dico_nl["Hoodfdocumenten"])
コード例 #4
0
    def _get_document_senat(dico, dico_nl, document):
        """Build the Senat sub-document and attach it to ``document``.

        Returns immediately when ``dico`` has no truthy "Document Sénat"
        entry. Otherwise a ``DocumentSenat`` is filled from the French
        section and its Dutch counterpart, a ``DocumentSenatPdf`` is created
        for the main PDF, one ``OtherDocumentSenatPdf`` is saved per following
        document, and the result is saved and stored on
        ``document.document_senat``.

        :param dico: French page data, indexed like a mapping.
        :param dico_nl: Dutch page data, indexed like a mapping.
        :param document: object that receives ``document_senat``.
        """
        if not dico.get(u"Document Sénat"):
            return

        fr = dico[u"Document Sénat"]
        nl = dico_nl[u"Document Senaat"]

        senat = DocumentSenat()
        senat.deposition_date = fr[u"Date de dépôt"].text
        senat.ending_date = get_text_else_blank(fr, u"Date de fin")
        senat.type["fr"] = fr[u"Type de document"].text
        senat.type["nl"] = nl[u"Document type"].text

        comment_fr = get_text_else_blank(fr, u'Commentaire')
        senat.comments["fr"] = comment_fr.split(' - ')
        comment_nl = get_text_else_blank(nl, u'Commentaar')
        senat.comments["nl"] = comment_nl.split(' - ')

        senat.author = clean_text(get_text_else_blank(fr, u"Auteur(s)"))
        senat.status["fr"] = get_text_else_blank(fr, u'Statut')
        senat.status["nl"] = get_text_else_blank(nl, u'Status')

        # The section head must split into exactly three "<br />"-separated
        # parts (url, type, session); any other count raises ValueError.
        head_fr = clean_text(str(fr[u'head']).replace("&#160;", ""))
        url, tipe, session = head_fr.split("<br />")
        head_nl = clean_text(str(nl[u'head']).replace("&#160;", ""))
        _url_nl, tipe_nl, _session_nl = head_nl.split("<br />")

        # When the first part still contains markup, keep only the link target.
        if "href" in url:
            url = re.search('href="([^"]+)', url).group(1)

        pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
        # The stored session is the second-to-last whitespace-separated token.
        session_token = session.split()[-2]
        senat.pdf = DocumentSenatPdf.objects.create(url=url, type=pdf_type, session=session_token)

        if fr.get('Document(s) suivant(s)'):
            parts_fr = document_pdf_part_cutter(fr[u'Document(s) suivant(s)'])
            parts_nl = document_pdf_part_cutter(nl[u'Opvolgend(e) document(en)'])
            # French and Dutch parts are paired positionally.
            for part, part_nl in zip(parts_fr, parts_nl):
                # First row of a part describes the PDF; remaining rows are
                # its authors.
                header = part[0]
                logger.debug("add pdf %s" % clean_text(header.font.text))

                other = OtherDocumentSenatPdf()
                anchor = header.a
                if anchor:
                    other.url = anchor['href']
                else:
                    # No link in the row: fall back to the first cell's text.
                    other.url = header.td.text
                other.type["fr"] = clean_text(header.font.text)
                other.type["nl"] = clean_text(part_nl[0].font.text)
                other.date = header('td')[-1].contents[0]

                other.authors = []
                for row, row_nl in zip(part[1:], part_nl[1:]):
                    cell = row('td')[-1]
                    cell_nl = row_nl('td')[-1]
                    other.authors.append({
                        "full_name": unicode(cell.contents[2]).strip(),
                        # [1:-1] drops the first and last character of the
                        # italic role text (presumably enclosing brackets --
                        # TODO confirm).
                        "role": {"fr": cell.i.text[1:-1], "nl": cell_nl.i.text[1:-1]},
                    })
                other.save()
                senat.other_pdfs.append(other)

        senat.save()
        document.document_senat = senat