Python clean_textの例、scraper.utils.clean_text Pythonの例

コード例 #1

0

ファイルを表示

    def _get_document_chambre(dico, dico_nl, document):
        """Build the Chambre sub-document from the parsed FR/NL pages.

        Returns immediately when ``dico`` has no truthy "Document Chambre"
        entry.  Otherwise a new ``DocumentChambre`` is filled from the French
        section ``dico['Document Chambre']`` and the Dutch section
        ``dico_nl['Document Kamer']``, its main pdf record is created, it is
        saved, and it is stored on ``document.document_chambre``.
        """
        if not dico.get("Document Chambre"):
            return

        fr_section = dico['Document Chambre']
        nl_section = dico_nl['Document Kamer']

        chambre_doc = DocumentChambre()
        chambre_doc.deposition_date = get_text_else_blank(fr_section, u'Date de dépôt')
        chambre_doc.type["fr"] = fr_section[u'Type de document'].text
        chambre_doc.type["nl"] = nl_section[u'Document type'].text
        chambre_doc.taken_in_account_date = get_text_else_blank(fr_section, u'Prise en considération')
        chambre_doc.distribution_date = get_text_else_blank(fr_section, u'Date de distribution')
        chambre_doc.sending_date = get_text_else_blank(fr_section, u'Date d\'envoi')
        chambre_doc.ending_date = get_text_else_blank(fr_section, u'Date de fin')
        chambre_doc.status["fr"] = get_text_else_blank(fr_section, u'Statut')
        chambre_doc.status["nl"] = get_text_else_blank(nl_section, u'Status')
        comment_fr = get_text_else_blank(fr_section, u'Commentaire')
        chambre_doc.comments["fr"] = comment_fr.split(' ')
        comment_nl = get_text_else_blank(nl_section, u'Commentaar')
        chambre_doc.comments["nl"] = comment_nl.split(' ')

        Document._get_authors(fr_section, nl_section, chambre_doc)

        # The head cell is expected to hold exactly three "<br />"-separated
        # parts: link/url, document type, session.  Anything else makes the
        # tuple unpacking raise ValueError.
        head_fr = clean_text(str(fr_section[u'head']).replace("&#160;", ""))
        url, tipe, session = head_fr.split("<br />")
        head_nl = clean_text(str(nl_section[u'head']).replace("&#160;", ""))
        _, tipe_nl, _ = head_nl.split("<br />")
        if "href" in url:
            # keep only the link target when the first part is an anchor tag
            url = re.search('href="([^"]+)', url).group(1)
        pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
        chambre_doc.pdf = DocumentChambrePdf.objects.create(url=url, type=pdf_type, session=session.split()[-2])

        Document._get_next_documents(fr_section, nl_section, chambre_doc)

        if fr_section.get(u'Document(s) joint(s)/lié(s)'):
            joint_pdfs = []
            for cell_fr, cell_nl in zip(fr_section[u'Document(s) joint(s)/lié(s)'],
                                        nl_section[u'Gekoppeld(e)/verbonden document(en)']):
                joint_pdfs.append({
                    "url": cell_fr.a["href"],
                    # [1:-1] drops the first and last character of the title text
                    "title": {"fr": cell_fr.contents[0][1:-1], "nl": cell_nl.contents[0][1:-1]},
                })
            chambre_doc.joint_pdfs = joint_pdfs

        chambre_doc.save()
        document.document_chambre = chambre_doc

コード例 #2

0

ファイルを表示

    def _get_in_charged_commissions(dico, dico_nl, document):
        """Populate ``document.in_charge_commissions`` from the commission sections.

        French sections are the keys of ``dico`` matching
        ``(\\d+. )?COMMISSION CHAMBRE``; Dutch ones are the keys of ``dico_nl``
        matching ``(\\d+. )?COMMISSIE KAMER``.  Both key lists are sorted and
        paired positionally with ``zip``, so extra keys on one side are
        silently ignored.  One ``InChargeCommissions`` object is created,
        saved and appended per pair.
        """
        def _dated_entries(cell):
            # Every second child of the cell is cleaned and split once on
            # " \xa0 " into [date, label]; entries with an empty first part
            # are dropped.
            parts = [clean_text(x).split(u" \xa0 ", 1) for x in cell.contents[::2]]
            return [p for p in parts if p[0]]

        def _bilingual_entries(cell_fr, cell_nl):
            # Pair FR and NL entries positionally; the date is taken from the
            # FR side only.
            return [{"date": _date, "type": {"fr": _type, "nl": _type_nl}}
                    for (_date, _type), (_, _type_nl) in zip(_dated_entries(cell_fr), _dated_entries(cell_nl))]

        # NOTE(review): the '.' after the digits is unescaped, so it matches
        # any character -- kept as in the original pattern; confirm whether a
        # literal dot was intended.
        keys_fr = sorted([k for k in dico.keys() if re.match(r"(\d+. )?COMMISSION CHAMBRE", k)])
        keys_nl = sorted([k for k in dico_nl.keys() if re.match(r"(\d+. )?COMMISSIE KAMER", k)])

        document.in_charge_commissions = []
        for key, key_nl in zip(keys_fr, keys_nl):
            section = dico[key]
            section_nl = dico_nl[key_nl]

            icc = InChargeCommissions()
            # The last word of the head is stored as the visibility, the
            # remaining words as the commission name.
            head_fr = clean_text(section["head"].text).split()
            head_nl = clean_text(section_nl["head"].text).split()
            icc.visibility["fr"] = head_fr[-1]
            icc.visibility["nl"] = head_nl[-1]
            icc.commission["fr"] = " ".join(head_fr[:-1])
            icc.commission["nl"] = " ".join(head_nl[:-1])

            if section.get("Rapporteur"):
                # FIXME link to actual deputies
                icc.rapporters = [clean_text(name) for name in section["Rapporteur"].text.split("\n\t\t\t\t\t")]

            icc.incident = []
            if section.get("Incident"):
                icc.incident = _bilingual_entries(section["Incident"], section_nl["Incident"])

            icc.agenda = []
            if section.get("Calendrier"):
                icc.agenda = _bilingual_entries(section["Calendrier"], section_nl["Kalender"])

            if section.get("Rapport"):
                rapport = section["Rapport"]
                icc.rapport = {"url": rapport.a["href"], "date": clean_text(rapport.contents[-2])}

            icc.save()
            document.in_charge_commissions.append(icc)

コード例 #3

0

ファイルを表示

    def _get_plenaries(dico, dico_nl, document):
        """Populate ``document.plenaries`` from the plenary-session sections.

        French sections are the keys of ``dico`` matching
        ``(\\d+. )?SEANCE PLENIERE CHAMBRE``; Dutch ones are the keys of
        ``dico_nl`` matching ``(\\d+. )?PLENAIRE VERGADERING KAMER``.  Both key
        lists are sorted and paired positionally with ``zip``.  One
        ``DocumentPlenary`` is created, saved and appended per pair.
        """
        def _dated_entries(cell):
            # Every second child of the cell is cleaned and split once on
            # " \xa0 " into [date, label]; entries with an empty first part
            # are dropped.
            parts = [clean_text(x).split(u" \xa0 ", 1) for x in cell.contents[::2]]
            return [p for p in parts if p[0]]

        def _bilingual_entries(cell_fr, cell_nl):
            # Pair FR and NL entries positionally; the date is taken from the
            # FR side only.
            return [{"date": _date, "type": {"fr": _type, "nl": _type_nl}}
                    for (_date, _type), (_, _type_nl) in zip(_dated_entries(cell_fr), _dated_entries(cell_nl))]

        # NOTE(review): the '.' after the digits is unescaped, so it matches
        # any character -- kept as in the original pattern.
        keys_fr = sorted([k for k in dico.keys() if re.match(r"(\d+. )?SEANCE PLENIERE CHAMBRE", k)])
        keys_nl = sorted([k for k in dico_nl.keys() if re.match(r"(\d+. )?PLENAIRE VERGADERING KAMER", k)])

        document.plenaries = []
        for key, key_nl in zip(keys_fr, keys_nl):
            section = dico[key]
            section_nl = dico_nl[key_nl]

            pl = DocumentPlenary()
            # The last word of the head is stored as the visibility, the
            # remaining words as the type.
            head_fr = clean_text(section["head"].text).split()
            head_nl = clean_text(section_nl["head"].text).split()
            pl.visibility["fr"] = head_fr[-1]
            pl.visibility["nl"] = head_nl[-1]
            pl.type["fr"] = " ".join(head_fr[:-1])
            pl.type["nl"] = " ".join(head_nl[:-1])

            pl.agenda = []
            if section.get("Calendrier"):
                pl.agenda = _bilingual_entries(section["Calendrier"], section_nl["Kalender"])

            pl.incident = []
            if section.get("Incident"):
                pl.incident = _bilingual_entries(section["Incident"], section_nl["Incident"])

            pl.save()
            document.plenaries.append(pl)

コード例 #4

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def _build_sub_section(i, dico):
    """Open a new sub-section in ``dico`` and return its name.

    The name is the cleaned text of the bold element in the first cell of
    row ``i``.  The new entry is an ``AccessControlDict`` whose "head" item is
    the second ``td`` of the row.

    Raises ``Exception`` when ``dico`` already holds a truthy value under
    that name.
    """
    title = clean_text(i.td.b.text)
    if not dico.get(title):
        dico[title] = AccessControlDict()
        dico[title]["head"] = i('td')[1]
        return title
    raise Exception("'%s' is already use as a key for '%s'" % (title, dico[title]))

コード例 #5

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def _build_sub_section(i, dico):
    """Create the sub-section named by row ``i`` inside ``dico``.

    The section name comes from the cleaned bold text of the row's first
    cell; the row's second ``td`` is stored under the "head" key of a fresh
    ``AccessControlDict``.  Returns the section name.

    Raises ``Exception`` if a truthy entry with that name already exists.
    """
    label = clean_text(i.td.b.text)
    if dico.get(label):
        details = (label, dico[label])
        raise Exception("'%s' is already use as a key for '%s'" % details)
    dico[label] = AccessControlDict()
    dico[label]["head"] = i('td')[1]
    return label

コード例 #6

0

ファイルを表示

 def _get_next_documents(chambre_dico, chambre_dico_nl, document_chambre):
     """Append the follow-up pdfs of a Chambre document to ``document_chambre.other_pdfs``.

     Does nothing when ``chambre_dico`` has no truthy 'Document(s) suivant(s)'
     entry.  The FR and NL cells are cut into per-document row groups with
     ``document_pdf_part_cutter`` and paired positionally.  In each group,
     row 0 supplies the url and type, row 1 the distribution date, and every
     further row one author.
     """
     if not chambre_dico.get('Document(s) suivant(s)'):
         return

     parts_fr = document_pdf_part_cutter(chambre_dico[u'Document(s) suivant(s)'])
     parts_nl = document_pdf_part_cutter(chambre_dico_nl[u'Opvolgend(e) document(en)'])
     for d, d_nl in zip(parts_fr, parts_nl):
         type_fr = clean_text(d[0].font.text)
         logger.debug("add pdf %s", type_fr)
         doc = OtherDocumentChambrePdf()
         # fall back to the cell text when the first row carries no link
         doc.url = d[0].a['href'] if d[0].a else d[0].td.text
         doc.type["fr"] = type_fr
         doc.type["nl"] = clean_text(d_nl[0].font.text)
         doc.distribution_date = d[1]('td')[-1].text
         for dep, dep_nl in zip(d[2:], d_nl[2:]):
             # [1:-1] drops the first and last character of the italic role text
             role = {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}
             if dep.a:
                 # linked author: resolve the deputy through the id in the link
                 lachambre_id = re.search(r'key=(\d+)', dep.a["href"]).groups()[0]
                 deputy = Deputy.objects.get(lachambre_id=lachambre_id)
                 author = {"lachambre_id": deputy.lachambre_id, "id": deputy.id, "full_name": deputy.full_name, "role": role}
             else:
                 # unlinked author: stored with -1 ids and the raw name text
                 author = {"lachambre_id": -1, "id": -1, "full_name": dep('td')[-1].contents[2].strip(), "role": role}
             doc.authors.append(author)
         doc.save()
         document_chambre.other_pdfs.append(doc)

コード例 #7

0

ファイルを表示

 def _get_competences(dico, dico_nl, document):
     """Fill ``document.timeline`` and ``document.analysis`` from the parsed pages.

     The timeline is rebuilt when ``dico`` has a truthy u"Compétence" entry
     and/or ``dico_nl`` has a truthy u"Bevoegdheid" entry.  When both exist,
     entries are paired positionally and the date comes from the French side.
     When only one exists, the title of the missing language is "".  One
     ``DocumentTimeLine`` is created per entry.

     ``document.analysis`` is set via ``get_or_create`` when ``dico`` has an
     "Analyse des interventions" entry.
     """
     def _dated_titles(section):
         # Every second child of the section head is cleaned and split once
         # on " \xa0 " into [date, title].
         return [clean_text(x).split(u" \xa0 ", 1) for x in section["head"].contents[::2]]

     has_fr = dico.get(u"Compétence")
     has_nl = dico_nl.get(u"Bevoegdheid")
     if has_fr or has_nl:
         document.timeline = []
         if has_fr and has_nl:
             rows = [(_date, _title, _title_nl)
                     for (_date, _title), (_, _title_nl) in zip(_dated_titles(dico[u"Compétence"]),
                                                                _dated_titles(dico_nl[u"Bevoegdheid"]))]
         elif has_fr:
             rows = [(_date, _title, "") for (_date, _title) in _dated_titles(dico[u"Compétence"])]
         else:
             rows = [(_date, "", _title_nl) for (_date, _title_nl) in _dated_titles(dico_nl[u"Bevoegdheid"])]

         for _date, _title, _title_nl in rows:
             logger.debug("append time line %s %s %s", _date, _title, _title_nl)
             document.timeline.append(DocumentTimeLine.objects.create(title={"fr": _title, "nl": _title_nl}, date=_date))

     if dico.get("Analyse des interventions"):
         analysis_link = dico["Analyse des interventions"]["head"].a
         document.analysis = get_or_create(Analysis, _id="lachambre_id", lachambre_id=analysis_link.text, url=analysis_link["href"])

コード例 #8

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def _build_first_level(i, dico):
    """Store the second cell of row ``i`` in ``dico`` under the row's label.

    The label is the cleaned text of the row's first cell.  For the labels
    'Moniteur erratum' and 'Staatsblad erratum' the cells are accumulated in
    a list; any other label may be stored only once.

    Raises ``Exception`` when a non-erratum label already has a truthy value.
    """
    label = clean_text(i.td.text)
    repeatable = unicode(label) in ('Moniteur erratum', 'Staatsblad erratum')
    if not repeatable:
        if dico.get(label):
            raise Exception("'%s' is already use as a key for '%s'" % (label, dico[label]))
        dico[label] = i('td')[1]
        return

    # several "Moniteur erratum" rows can appear, so collect them all
    if not dico.get(label):
        dico[label] = []
    dico[label].append(i('td')[1])

コード例 #9

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def _build_pdf_sub_section(i, dico, sub_section):
    """Store the second cell of row ``i`` inside ``dico[sub_section]``.

    The key is the cleaned text of the row's first cell.  The joined/linked
    documents labels (French and Dutch) are accumulated in a list; any other
    key may be stored only once per sub-section.

    Raises ``Exception`` when a single-valued key already has a truthy value.
    """
    label = clean_text(i.td.text)
    multi_valued = unicode(label) in (u'Document(s) joint(s)/lié(s)', u'Gekoppeld(e)/verbonden document(en)')
    section = dico[sub_section]

    if multi_valued:
        # joined documents can span several rows, so keep them all
        if not section.get(label):
            section[label] = []
        section[label].append(i('td')[1])
        return

    if section.get(label):
        raise Exception("'%s' is already use as a key in the sub_section '%s' for '%s'" % (label, sub_section, section[label]))
    section[label] = i('td')[1]

コード例 #10

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def _build_first_level(i, dico):
    """Record row ``i`` in ``dico``, keyed by the cleaned text of its first cell.

    The value is the row's second ``td``.  The erratum labels
    ('Moniteur erratum' / 'Staatsblad erratum') collect their cells in a
    list; every other label must be unique.

    Raises ``Exception`` if a non-erratum label is already present with a
    truthy value.
    """
    name = clean_text(i.td.text)
    is_erratum = unicode(name) in ('Moniteur erratum', 'Staatsblad erratum')
    if is_erratum:
        # several "Moniteur erratum" rows can appear for one document
        if not dico.get(name):
            dico[name] = []
        dico[name].append(i('td')[1])
        return

    if dico.get(name):
        duplicate = (name, dico[name])
        raise Exception("'%s' is already use as a key for '%s'" % duplicate)
    dico[name] = i('td')[1]

コード例 #11

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def document_pdf_part_cutter(soup):
    """Split the ``tr`` rows of ``soup`` into consecutive groups.

    The first row always opens the first group.  Rows whose cleaned text is
    empty are skipped.  A later row opens a new group when it contains an
    ``img`` whose "class" attribute equals "picto"; any other row is added to
    the current group.

    Returns the list of groups (each a list of rows).  Fails with an index
    error when ``soup('tr')`` yields no row.
    """
    rows = soup('tr')  # looked up once instead of once per use
    result = []
    blob = [rows[0]]
    for i in rows[1:]:
        if not clean_text(i.text):
            continue
        # NOTE(review): this compares the class attribute to a plain string;
        # confirm the parser in use returns "class" as a string, not a list.
        opens_new_part = i.img and i.img.get("class") and i.img["class"] == "picto"
        if opens_new_part:
            result.append(blob)
            blob = [i]
        else:
            blob.append(i)

    result.append(blob)
    return result

コード例 #12

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def _build_pdf_sub_section(i, dico, sub_section):
    """Record row ``i`` under ``dico[sub_section]``, keyed by its first cell's text.

    The value is the row's second ``td``.  The joined/linked documents keys
    (French and Dutch labels) gather their cells in a list; all other keys
    must be unique within the sub-section.

    Raises ``Exception`` if a single-valued key is already present with a
    truthy value.
    """
    name = clean_text(i.td.text)
    list_keys = (u'Document(s) joint(s)/lié(s)',
                 u'Gekoppeld(e)/verbonden document(en)')
    holds_list = unicode(name) in list_keys
    target = dico[sub_section]

    if not holds_list:
        if target.get(name):
            raise Exception(
                "'%s' is already use as a key in the sub_section '%s' for '%s'" %
                (name, sub_section, target[name]))
        target[name] = i('td')[1]
        return

    # a list of joined documents can be spread over several rows
    if not target.get(name):
        target[name] = []
    target[name].append(i('td')[1])

コード例 #13

0

ファイルを表示

ファイル: documents_parsing_utils.py プロジェクト: Psycojoker/dierentheater

def document_pdf_part_cutter(soup):
    """Cut the ``tr`` rows of ``soup`` into consecutive groups of rows.

    The first row starts the first group.  Rows with empty cleaned text are
    ignored.  A row containing an ``img`` whose "class" attribute equals
    "picto" starts a new group; every other row joins the current one.

    Returns a list of row lists.  Fails with an index error when
    ``soup('tr')`` yields no row.
    """
    rows = soup('tr')  # single lookup shared by the head row and the loop
    result = []
    blob = [rows[0]]
    for i in rows[1:]:
        if not clean_text(i.text):
            continue
        # NOTE(review): string comparison on the class attribute; confirm the
        # parser in use does not return "class" as a list.
        is_marker_row = i.img and i.img.get("class") and i.img["class"] == "picto"
        if is_marker_row:
            result.append(blob)
            blob = [i]
        else:
            blob.append(i)

    result.append(blob)
    return result

コード例 #14

0

ファイルを表示

    def _get_document_senat(dico, dico_nl, document):
        """Build the Senate sub-document from the parsed FR/NL pages.

        Returns immediately when ``dico`` has no truthy u"Document Sénat"
        entry.  Otherwise a new ``DocumentSenat`` is filled from
        ``dico[u"Document Sénat"]`` and ``dico_nl[u"Document Senaat"]``, its
        main pdf record and any follow-up pdfs are created, it is saved, and
        it is stored on ``document.document_senat``.
        """
        if not dico.get(u"Document Sénat"):
            return

        fr_section = dico[u"Document Sénat"]
        nl_section = dico_nl[u"Document Senaat"]

        senat_doc = DocumentSenat()
        senat_doc.deposition_date = fr_section[u"Date de dépôt"].text
        senat_doc.ending_date = get_text_else_blank(fr_section, u"Date de fin")
        senat_doc.type["fr"] = fr_section[u"Type de document"].text
        senat_doc.type["nl"] = nl_section[u"Document type"].text
        comment_fr = get_text_else_blank(fr_section, u'Commentaire')
        senat_doc.comments["fr"] = comment_fr.split(' - ')
        comment_nl = get_text_else_blank(nl_section, u'Commentaar')
        senat_doc.comments["nl"] = comment_nl.split(' - ')
        senat_doc.author = clean_text(get_text_else_blank(fr_section, u"Auteur(s)"))
        senat_doc.status["fr"] = get_text_else_blank(fr_section, u'Statut')
        senat_doc.status["nl"] = get_text_else_blank(nl_section, u'Status')

        # The head cell is expected to hold exactly three "<br />"-separated
        # parts: link/url, document type, session.  Anything else makes the
        # tuple unpacking raise ValueError.
        head_fr = clean_text(str(fr_section[u'head']).replace("&#160;", ""))
        url, tipe, session = head_fr.split("<br />")
        head_nl = clean_text(str(nl_section[u'head']).replace("&#160;", ""))
        _, tipe_nl, _ = head_nl.split("<br />")
        if "href" in url:
            # keep only the link target when the first part is an anchor tag
            url = re.search('href="([^"]+)', url).group(1)
        pdf_type = {"fr": tipe.strip(), "nl": tipe_nl.strip()}
        senat_doc.pdf = DocumentSenatPdf.objects.create(url=url, type=pdf_type, session=session.split()[-2])

        if fr_section.get('Document(s) suivant(s)'):
            parts_fr = document_pdf_part_cutter(fr_section[u'Document(s) suivant(s)'])
            parts_nl = document_pdf_part_cutter(nl_section[u'Opvolgend(e) document(en)'])
            for part, part_nl in zip(parts_fr, parts_nl):
                first_row = part[0]
                logger.debug("add pdf %s" % clean_text(first_row.font.text))
                doc = OtherDocumentSenatPdf()
                # fall back to the cell text when the first row carries no link
                if first_row.a:
                    doc.url = first_row.a['href']
                else:
                    doc.url = first_row.td.text
                doc.type["fr"] = clean_text(first_row.font.text)
                doc.type["nl"] = clean_text(part_nl[0].font.text)
                doc.date = first_row('td')[-1].contents[0]
                # every row after the first one describes one author
                doc.authors = [
                    {"full_name": unicode(dep('td')[-1].contents[2]).strip(),
                     "role": {"fr": dep('td')[-1].i.text[1:-1], "nl": dep_nl('td')[-1].i.text[1:-1]}}
                    for dep, dep_nl in zip(part[1:], part_nl[1:])
                ]
                doc.save()
                senat_doc.other_pdfs.append(doc)

        senat_doc.save()
        document.document_senat = senat_doc

コード例 #15

0

ファイルを表示

    def _get_first_level_data(dico, dico_nl, document):
        """Copy the top-level fields of the parsed pages onto ``document``.

        ``dico`` holds the French page and ``dico_nl`` the Dutch one.  Dates
        and numbers are read from the French page only; bilingual fields get
        an "fr" value from ``dico`` and an "nl" value from ``dico_nl``.

        Each Dutch field is guarded by a lookup in ``dico_nl``.  The previous
        code probed ``dico`` with the Dutch key, which the French page is not
        expected to contain, so the "nl" values were never stored.
        """
        def _pipe_separated(section):
            # "a | b | c" in the section head -> ["a", "b", "c"]
            return [x.strip() for x in section["head"].text.split("|")]

        def _main_docs(cell):
            # keep every child of the cell except the "<br>" separators
            return [x.strip() for x in cell.contents if x != "<br>"]

        document.deposition_date = get_text_else_blank(dico, u"Date de dépôt")
        document.constitution_article["fr"] = clean_text(get_text_else_blank(dico, "Article Constitution"))
        document.constitution_article["nl"] = clean_text(get_text_else_blank(dico_nl, "Artikel Grondwet"))
        if dico.get("Descripteur Eurovoc principal"):
            document.eurovoc_main_descriptor["fr"] = dico["Descripteur Eurovoc principal"]["head"].text
        if dico_nl.get("Eurovoc-hoofddescriptor"):
            document.eurovoc_main_descriptor["nl"] = dico_nl["Eurovoc-hoofddescriptor"]["head"].text
        document.vote_date = get_text_else_blank(dico, "Vote Chambre")
        document.law_date = get_text_else_blank(dico, "Date de la loi")
        document.moniteur_number = get_text_else_blank(dico, u"Moniteur n°")
        document.moniteur_date = get_text_else_blank(dico, u"Date moniteur")
        document.vote_senat_date = get_text_else_blank(dico, u"Vote Sénat")
        document.candidature_vote_date = get_text_else_blank(dico, u"Vote candidature")

        # Child 0 of the status cell is the Chambre status; child 2, when the
        # cell has at least three children, is the Senate status.
        if dico.get("Etat d'avancement"):
            status_fr = dico["Etat d'avancement"]
            document.status_chambre["fr"] = clean_text(status_fr.contents[0])
            document.status_senat["fr"] = clean_text(status_fr.contents[2]) if len(status_fr) >= 3 else None
        if dico_nl.get("Stand van zaken"):
            status_nl = dico_nl["Stand van zaken"]
            document.status_chambre["nl"] = clean_text(status_nl.contents[0])
            document.status_senat["nl"] = clean_text(status_nl.contents[2]) if len(status_nl) >= 3 else None

        if dico.get("Descripteurs Eurovoc"):
            document.eurovoc_descriptors["fr"] = _pipe_separated(dico["Descripteurs Eurovoc"])
        if dico_nl.get("Eurovoc descriptoren"):
            document.eurovoc_descriptors["nl"] = _pipe_separated(dico_nl["Eurovoc descriptoren"])
        if dico.get("Candidats-descripteurs Eurovoc"):
            document.eurovoc_candidats_descriptors["fr"] = _pipe_separated(dico["Candidats-descripteurs Eurovoc"])
        if dico_nl.get("Eurovoc kandidaat-descriptoren"):
            document.eurovoc_candidats_descriptors["nl"] = _pipe_separated(dico_nl["Eurovoc kandidaat-descriptoren"])
        if dico.get(u"Mots-clés libres"):
            document.keywords["fr"] = _pipe_separated(dico[u"Mots-clés libres"])
        if dico_nl.get(u"Vrije trefwoorden"):
            document.keywords["nl"] = _pipe_separated(dico_nl[u"Vrije trefwoorden"])
        if dico.get("Documents principaux"):
            document.main_docs["fr"] = _main_docs(dico["Documents principaux"])
        # NOTE(review): "Hoodfdocumenten" looks like a misspelling of
        # "Hoofddocumenten"; the key is kept unchanged -- confirm against the
        # label actually present on the Dutch page.
        if dico_nl.get("Hoodfdocumenten"):
            document.main_docs["nl"] = _main_docs(dico_nl["Hoodfdocumenten"])