Code example #1
File: dossier_like_senapy.py  Project: mdamien/anpy
def download_historic_dosleg(url):
    resp = download(url)

    if '/dyn/' in resp.url:
        # fallback to backed-up doslegs when the redirect is forced
        legislature, slug = parse_national_assembly_url(url)
        display_url = AN_OLD_URL_TEMPLATE.format(legislature=legislature,
                                                 slug=slug)
        download_url = 'https://raw.githubusercontent.com/regardscitoyens/archive-AN-doslegs/master/archive/' \
            + display_url.split('.fr/')[1]
        resp = download(download_url)
        resp.url = display_url

    resp.encoding = 'Windows-1252'
    return resp
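Every example on this page relies on a shared download() helper that is not reproduced here. As a point of reference only, here is a minimal sketch of such a helper, assuming it is a thin wrapper around requests.get (the real implementations in these projects may add caching, retries, or custom headers):

import requests

def download(url, timeout=30):
    # Hypothetical stand-in for the projects' download() helper:
    # fetch the URL and return the requests.Response object so callers
    # can use .text, .content, .json(), .status_code, etc.
    return requests.get(url, timeout=timeout)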
Code example #2
def process(output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    yesterday = time.time() - 86400
    for url in "2007-2012.nosdeputes", "2012-2017.nosdeputes", "2017-2022.nosdeputes", "www.nosdeputes", "www.nossenateurs":
        dfile = '%s-groupes.json' % url
        destfile = os.path.join(output_directory, dfile)
        if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
            print('downloading', dfile)
            open(destfile, 'w').write(download("https://%s.fr/organismes/groupe/json" % url).text)
        dfile = '%s.parlementaires.json' % url
        destfile = os.path.join(output_directory, dfile)
        if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
            print('downloading', dfile)
            open(destfile, 'w').write(download("http://%s.fr/%s/json" %
                (url, 'deputes' if 'deputes' in url else 'senateurs')).text)
Code example #3
def download_open_data_file(filename, file_url):
    raw_data = download(file_url)
    data_zip = zipfile.ZipFile(io.BytesIO(raw_data.content))
    if filename:
        with data_zip.open(filename) as d:
            return json.loads(d.read().decode('utf-8'))
    data = {
        "export": {
            "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "dossiersLegislatifs": {
                "dossier": []
            },
            "textesLegislatifs": {
                "document": []
            }
        }
    }
    for filename in data_zip.namelist():
        with data_zip.open(filename) as d:
            filedata = json.loads(d.read().decode('utf-8'))
            if "dossierParlementaire" in filename:
                data["export"]["dossiersLegislatifs"]["dossier"].append(
                    filedata)
            else:
                data["export"]["textesLegislatifs"]["document"].append(
                    filedata["document"])
    return data
Code example #4
def download_open_data_file(filename, file_url):
    raw_data = download(file_url)
    data_zip = zipfile.ZipFile(io.BytesIO(raw_data.content))
    if filename:
        with data_zip.open(filename) as d:
            return json.loads(d.read().decode('utf-8'))
    return data_zip
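A hypothetical usage sketch for the two download_open_data_file variants above: when a filename is given, that single JSON member of the archive is decoded and returned; when it is omitted (example #4), the whole ZipFile is returned for the caller to inspect. The URL and filename below are taken from a later example (#21) and are only illustrative:

url = ("http://data.assemblee-nationale.fr/static/openData/repository/15/"
       "loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip")
# decode one member directly
data = download_open_data_file("Dossiers_Legislatifs_XV.json", url)
# or get the ZipFile and iterate it yourself
archive = download_open_data_file(None, url)
print(archive.namelist())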
Code example #5
def test_status(url):
    try:
        resp = download(url)
        if resp.status_code != 200:
            return False
    except Exception:
        return False
    return resp
Code example #6
def test_status(url):
    resp = download(url)
    if resp.status_code != 200:
        return False
    # TODO: do this in download()
    if 'assemblee-nationale.fr' in url:
        resp.encoding = 'Windows-1252'
    return resp
Code example #7
def test_status(url):
    resp = download(url)
    if resp.status_code != 200:
        return False
    # TODO: do this in download()
    if 'assemblee-nationale.fr' in url:
        resp.encoding = 'Windows-1252'
    return resp
Code example #8
def download_senat(url, log=sys.stderr):
    print('  [] download SENAT version')
    resp = download(url)
    if resp.status_code != 200:
        print('WARNING: Invalid response -', resp.status_code)
        return
    html = resp.text
    print('  [] parse SENAT version')
    senat_dos = senapy_parse(html, url, logfile=log)
    debug_file(senat_dos, 'senat_dos.json')
    return senat_dos
Code example #9
def extract_full_decision(url):
    decision_src = download(url).text
    if '<a name=\'visa\' id="visa"></a>' not in decision_src:
        print("ERROR: could not find visa in decision CC", url, file=sys.stderr)
        return None
    decision_txt = decision_src.split('<a name=\'visa\' id="visa"></a>')[1]
    if not re_delibere.search(decision_txt):
        print("ERROR: could not find siège in décision CC", url, file=sys.stderr)
        return None
    decision_txt = clean_delib(decision_txt)
    return strip_text(decision_txt)
Code example #10
def download_senat(url, log=sys.stderr):
    print('  [] download SENAT version')
    resp = download(url)
    if resp.status_code != 200:
        print('WARNING: Invalid response -', resp.status_code)
        return
    html = resp.text
    print('  [] parse SENAT version')
    senat_dos = senapy_parse(html, url, logfile=log)
    debug_file(senat_dos, 'senat_dos.json')
    return senat_dos
Code example #11
def are_same_doslegs(senat_dos, an_dos):
    # same dosleg url ?
    if an_dos['url_dossier_senat'] == senat_dos['url_dossier_senat']:
        return True
    elif download(an_dos['url_dossier_senat']).status_code == 404:
        return True
    # same first text  ?
    if senat_dos.get('steps') and an_dos.get('steps') \
        and senat_dos['steps'][0].get('source_url') == an_dos['steps'][0].get('source_url'):
        return True
    # it's not the same dosleg !
    return False
Code example #12
File: parser.py  Project: regardscitoyens/senapy
def find_an_url(data):
    if not data['steps']:
        return
    an_text_url = [step['source_url'] for step in data['steps'] if step.get('source_url') and 'assemblee-nationale' in step.get('source_url')]
    for url in an_text_url:
        html = download(url).text
        soup = BeautifulSoup(html, 'lxml')
        btn = soup.select_one('#btn_dossier')
        if btn:
            a = btn.parent
            if a.attrs.get('href'):
                return clean_url(urljoin(url, a.attrs['href']))
Code example #13
File: parser.py  Project: regardscitoyens/senapy
def parse_table_concordance(url):
    html = download(url).text
    soup = BeautifulSoup(html, 'html5lib')

    old_to_adopted = {}
    confusing_entries = set()

    rows = soup.select('div[align="center"] > table tr') + soup.select('div[align="left"] > table tr')

    def normalize(entry):
        if entry.lower() in ('unique', '1'):
            return '1er'
        return entry

    def add(old, adopted):
        nonlocal old_to_adopted, confusing_entries
        if ' et ' in old:
            for el in old.split(' et '):
                add(el, adopted)
            return
        adopted, old = normalize(adopted), normalize(old)
        if adopted.lower() in ('id', 'idem'):  # id: Abbreviation of the Latin idem (“same”)
            adopted = old
        if adopted == '':
            adopted = 'supprimé'
        if 'suppr' in adopted.lower():
            adopted = adopted.lower()
        if old in old_to_adopted:
            print('## ERROR ###', 'DOUBLE ENTRY IN CONCORDANCE TABLE FOR', old, file=sys.stderr)
            confusing_entries.add(old)
        else:
            if 'suppr' not in adopted and adopted in old_to_adopted.values():
                print('## WARNING ###', 'MULTIPLE ARTICLES MERGED INTO ONE IN CONCORDANCE TABLE FOR', adopted, file=sys.stderr)
                adopted += ' (supprimé)'
            old_to_adopted[old] = adopted

    for line in rows:
        cells = [x.text.strip() for x in line.select('td')]
        old, adopted, *_ = cells
        if 'numérotation' in old.lower() or not old:
            continue
        add(old, adopted)

        # there can be two concordances per line
        # ex: https://www.senat.fr/dossier-legislatif/tc/tc_pjl08-155.html
        if len(cells) == 5:
            *_, old, adopted = cells
            add(old, adopted)

    for entry in confusing_entries:
        del old_to_adopted[entry]

    return old_to_adopted, list(confusing_entries)
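A hypothetical usage sketch showing the shape of the return value: a mapping from old article numbers to adopted ones, plus the list of entries dropped as ambiguous. The URL comes from the comment in the code above:

old_to_adopted, confusing = parse_table_concordance(
    "https://www.senat.fr/dossier-legislatif/tc/tc_pjl08-155.html")
print(old_to_adopted.get("1er"))  # adopted number for old article 1er, if present
print(confusing)                  # old entries discarded because they appeared twice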
Code example #14
def are_same_doslegs(senat_dos, an_dos):
    # same dosleg url ?
    if an_dos['url_dossier_senat'] == senat_dos['url_dossier_senat']:
        return True
    elif download(an_dos['url_dossier_senat']).status_code == 404:
        return True
    # same first text  ?
    if senat_dos.get('steps') and an_dos.get('steps') \
        and senat_dos['steps'][0].get('source_url') == an_dos['steps'][0].get('source_url'):
        return True
    # it's not the same dosleg !
    return False
Code example #15
File: dossier_like_senapy.py  Project: mdamien/anpy
def find_senat_url(data):
    if not data['steps']:
        return
    senat_text_url = [
        step['source_url'] for step in data['steps']
        if step.get('source_url') and 'senat.fr' in step.get('source_url')
    ]
    for url in senat_text_url:
        html = download(url).text
        soup = BeautifulSoup(html, 'lxml')
        for a in soup.select('#primary a'):
            href = urljoin(url, a.attrs.get('href', ''))
            if 'dossier-legislatif/' in href or 'dossierleg/' in href:
                return clean_url(href)
Code example #16
File: parser.py  Project: regardscitoyens/senapy
def find_an_url(data):
    if not data['steps']:
        return
    an_text_url = [
        step['source_url'] for step in data['steps'] if step.get('source_url')
        and 'assemblee-nationale' in step.get('source_url')
    ]
    for url in an_text_url:
        html = download(url).text
        soup = BeautifulSoup(html, 'html5lib')
        btn = soup.select_one('#btn_dossier')
        if btn:
            a = btn.parent
            if a.attrs.get('href'):
                return clean_url(urljoin(url, a.attrs['href']))
Code example #17
def download_daily(url_or_collecter, filename, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    yesterday = time.time() - 86400
    destfile = os.path.join(output_directory, filename + ".json")
    if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
        print('downloading', filename)
        if isinstance(url_or_collecter, str):
            jsondata = download(url_or_collecter).json()
        else:
            jsondata = url_or_collecter()
        print_json(jsondata, destfile)
    else:
        jsondata = open_json(destfile)
    return jsondata
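download_daily accepts either a URL string or a zero-argument callable as its first parameter, and only re-downloads when the cached file is more than a day old. A hypothetical usage sketch of both call styles (the URL follows code example #2; the filenames, the "data" directory and collect_groupes are made-up names):

groupes = download_daily("https://www.nosdeputes.fr/organismes/groupe/json",
                         "groupes_an", "data")

def collect_groupes():
    return {"organismes": []}

groupes = download_daily(collect_groupes, "groupes_custom", "data")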
Code example #18
def extract_full_decision(url):
    decision_src = download(url).text
    if '<a name=\'visa\' id="visa"></a>' not in decision_src:
        print("ERROR: could not find visa in decision CC",
              url,
              file=sys.stderr)
        return None
    decision_txt = decision_src.split('<a name=\'visa\' id="visa"></a>')[1]
    if not re_delibere.search(decision_txt):
        print("ERROR: could not find siège in décision CC",
              url,
              file=sys.stderr)
        return None
    decision_txt = clean_delib(decision_txt)
    return strip_text(decision_txt)
Code example #19
def download_daily(url_or_collecter, filename, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    yesterday = time.time() - 86400
    destfile = os.path.join(output_directory, filename + ".json")
    if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
        print('downloading', filename)
        if isinstance(url_or_collecter, str):
            jsondata = download(url_or_collecter).json()
        else:
            jsondata = url_or_collecter()
        print_json(jsondata, destfile)
    else:
        jsondata = open_json(destfile)
    return jsondata
Code example #20
def download_an(url, url_senat=False, log=sys.stderr, verbose=True):
    if verbose: print('  [] download AN version')
    resp = download(url)
    resp.encoding = 'Windows-1252'
    html = resp.text
    if verbose: print('  [] parse AN version')
    # TODO: do both instead of first
    results = anpy_parse(html, url, logfile=log, verbose=verbose)
    if len(results) > 1:
        if url_senat:
            for result in results:
                if result.get('url_dossier_senat') == url_senat:
                    return result
        if verbose:
            print('     WARNING: TOOK FIRST DOSLEG BUT THERE ARE %d OF THEM' %
                  len(results))
    return results[0]
Code example #21
File: dossier_from_opendata.py  Project: mdamien/anpy
def download_open_data_doslegs(legislature):
    files = {
        15: (
            "Dossiers_Legislatifs_XV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/15/loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip",
        ),
        14: (
            "Dossiers_Legislatifs_XIV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/14/loi/dossiers_legislatifs/Dossiers_Legislatifs_XIV.json.zip",
        ),
    }
    file, file_url = files[legislature]
    doslegs_resp = download(file_url)
    doslegs_zip = zipfile.ZipFile(io.BytesIO(doslegs_resp.content))
    DATA = json.loads(doslegs_zip.open(file).read().decode("utf-8"))

    return DATA
Code example #22
def parse(url, resp=None, DEBUG=False, include_annexes=False):
    """
    Parse the text of a URL. An already cached response can be passed as `resp` to avoid an extra network request.
    """
    all_articles = []
    def pr_js(article):
        nonlocal all_articles, texte
        if not len(all_articles):
            add_to_articles(texte, all_articles)
        add_to_articles(article, all_articles)

    if url.endswith('.pdf'):
        print("WARNING: text url is a pdf: %s skipping it..." % url)
        return all_articles
    if 'assemblee-nat.fr' in url:
        print("WARNING: url corresponds to old AN website: %s skipping it..." % url)
        return all_articles


    if url.startswith('http'):
        resp = download(url) if resp is None else resp
        if '/textes/' in url:
            resp.encoding = 'utf-8'
        if 'assemblee-nationale.fr' in url:
            resp.encoding = 'Windows-1252'
        string = resp.text
    elif url == '-':
        string = sys.stdin.read()
    else:
        string = open(url).read()

    string, has_multiple_expose = clean_extra_expose_des_motifs(string)

    if 'legifrance.gouv.fr' in url:
        for reg, res in clean_legifrance_regexps:
            string = reg.sub(res, string)
    else:
        for reg, res in clean_texte_regexps:
            string = reg.sub(res, string)

    # fix weird Sénat formatting: single-cell tables wrapped around pieces of text, sometimes multiline... ex: https://www.senat.fr/leg/ppl15-246.html
    for match in re.findall(r'(<table[^>]*>\s*(?:<t(?:body|r|d|h)[^>]*>\s*)+)(.*?)((?:\s*</t(?:body|r|d|h)[^>]*>)+\s*</table>)', string, re.I):
        if not re.search(r'<t(r|d|h)[^>]*>', match[1], re.I):
            string = string.replace(''.join(match), match[1])

    srclst = []
    source_avenants = False
    m = re.search(r"NB(\s|&nbsp;)+:(\s|&nbsp;)+[lL]es? textes? d(u |es |e la |e l&#8217;)((convention|traité|avenant)s? et de(s| l&#8217;))?(accord|convention)s?(-cadres?)? figuren?t? (respectivement )?en annexe aux (deux |trois )?projets de loi \(n", re.sub(r'</?span[^>]*>', '', string), re.I)
    if m:
        try:
            srclst = [int(s.strip('no ')) for s in (
                    string.replace('<sup>', '').replace('</sup>', '').replace('&nbsp;', ' ')
                    .replace('aux deux projets', 'aux projets').replace('aux trois projets', 'aux projets')
                    .replace('°', 'o').replace('nos ', 'no ').replace('ns ', 'no ').replace('(n ', '(no ')
                    .split(' en annexe aux projets de loi (no ')[1]
                    .strip()
                    .split(')')[0]
                    .strip()
                    .replace(' et ', ', ')
                    .split(', '))]
            source_avenants = True
        except Exception as e:
            if DEBUG:
                print("WARNING, multi-reports detected with NB method crashing (%s: %s), trying regular method..." % (type(e), e))
    if not source_avenants and "/rapports/r" in url and "TEXTES ADOPTÉS PAR LA COMMISSION" in string and string.count(">Article unique<") == 2:
        m = re.search(r'<i>Assemblée nationale&nbsp;:&nbsp;</i><b>(\d+) </b>et<b> (\d+)</b>', string)
        if m:
            srclst = [int(m.group(1)), int(m.group(2))]
            source_avenants = True

    definitif = re_definitif.search(string) is not None or re_definitif_new_format.search(string) is not None or 'legifrance.gouv.fr' in url
    soup = BeautifulSoup(string, "html5lib")
    texte = {"type": "texte", "source": url, "definitif": definitif}

    # Generate Senat or AN ID from URL
    if url.startswith('http'):
        if "legifrance.gouv.fr" in url:
            m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
            if m:
                texte["id"] = m.group(1)
            elif "/jo/texte" in url:
                texte["id"] = url.split('/')[-3]
        elif re.search(r"assemblee-?nationale", url, re.I):
            m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "A" + m.group(1) + "-"
            if m.group(2) is not None:
                texte["id"] += m.group(2)
            texte["id"] += str(numero)
            texte["nosdeputes_id"] = get_text_id(url)
        else:
            m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})(rec)?\d?(_mono)?\.", url, re.I)
            if m is None:
                m = re.search(r"/(-)?20(\d+)-\d+/(\d+)(_mono)?.html", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "S" + m.group(2) + "-"
            if m.group(1) is not None:
                texte["id"] += m.group(1)
            texte["id"] += "%03d" % numero
            texte["nossenateurs_id"] = get_text_id(url)

    texte["titre"] = clean_html(re_clean_title_legif.sub('', soup.title.string.strip())) if soup.title else ""
    texte["expose"] = ""
    expose = False

    # states the 'read' flag can be set to:
    READ_DISABLED = -1  # the text has not been detected yet
    READ_TEXT = 0  # reading the text
    READ_TITLE = 1  # reading section titles
    READ_ALINEAS = 2  # reading alineas

    read = READ_TEXT
    art_num = ali_num = 0
    article = {}
    indextext = -1
    curtext = -1
    section = {"type": "section", "id": ""}

    rejected_all_articles = []  # by default only the last detected text is kept; previously detected texts are stored here

    def should_be_parsed(x):
        """returns True if x can contain useful information"""
        if x.name not in ('p', 'table', 'h1', 'h2', 'h4'):
            return False
        # hack: we don't want to parse the table containing the conclusion from the senat
        #       ex: https://www.senat.fr/leg/tas12-040.html
        if x.name == "table" and re.search("SESSION (EXTRA)?ORDINAIRE DE", str(x)):
            return False
        # hack: senate can copy paste the /textes/ output from the AN
        #       ex: https://www.senat.fr/leg/ppl17-545.html
        # TODO: they also mess up the encoding by doing that
        if x.name == "table" and re.search("<!-- Col de controle de taille -->", str(x)):
            return False
        return True


    def should_be_ignored(x):
        if hasattr(x, 'attrs') and 'display: none' in x.attrs.get('style', ''):
            return True
        return False


    for text in non_recursive_find_all(soup, should_be_parsed, should_be_ignored):
        line = clean_html(str(text))
        if DEBUG:
            print(read, article.get('titre') or art_num, ali_num, line, file=sys.stderr)

        # limit h2/h4 matches to PPL headers or Article unique
        if text.name not in ('p', 'table') and not re_mat_ppl.match(line) and not re_mat_tco.match(line) and 'Article unique' not in line:
            if DEBUG:
                print(" -> IGNORING LINE", file=sys.stderr)
            continue

        if re_stars.match(line):
            continue
        if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
            read = READ_DISABLED
        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0
        srcl = re_src_mult.search(line)
        if not source_avenants and srcl and read in (READ_DISABLED, READ_TEXT):
            srclst.append(int(srcl.group(1)))
            continue
        cl_line = re_cl_html.sub("", line).strip()
        if re_rap_mult.match(line):
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line) or (
                read == READ_DISABLED and line == "<b>Article 1er</b>"):
            read = READ_TEXT
            if len(all_articles):
                if DEBUG:
                    print('WARNING: Found articles before the real text')
                if article is not None:
                    pr_js(article)
                rejected_all_articles.append(all_articles)
                all_articles = []
                article = {}
                art_num = 0
        elif re_mat_exp.match(line):
            read = READ_DISABLED  # deactivate reading for the exposé des motifs
            expose = True
        elif read == READ_TEXT and definitif_before_congres in line or definitif_after_congres in line:
            texte['definitif'] = True
            if all_articles:
                all_articles[0]['definitif'] = True
            continue
        elif (re_echec_cmp.search(cl_line)
                or re_echec_com.search(cl_line)
                or re_echec_com2.search(cl_line)
                or re_echec_com3.search(cl_line)
                or re_echec_com4.search(cl_line)
                or re_echec_com5.search(cl_line)
                or re_echec_com6.search(cl_line)
                or re_echec_hemi.match(cl_line)
                or re_echec_hemi2.search(cl_line)
                or re_echec_hemi3.search(cl_line)
            ) and 'dont la teneur suit' not in cl_line:
            pr_js({"type": "echec", "texte": cl_line})
            break
        elif read == READ_DISABLED:
            continue
        # or (indextext != -1 and curtext != indextext):  # keep all texts resulting from multi-reports; the right one is now selected later, in 'complete'

        # crazy edge case: "(Conforme)Article 24 bis A (nouveau)" on one line
        # http://www.assemblee-nationale.fr/13/projets/pl3324.asp
        # simplified, just do the "(Conforme)" case
        if '<i>(Conforme)</i>' in line and re_mat_art.search(line):
            article["statut"] = 'conforme'
            line = line.replace('<i>(Conforme)</i>', '')
            cl_line = cl_line.replace('(Conforme)', '')

        # another crazy edge case: the text is inside the annexe
        # ex: http://www.assemblee-nationale.fr/13/rapports/r2083.asp
        # TODO: could detect via "le présent projet de loi dans le texte figurant en annexe"
        #       like the source_avenants logic
        if read != READ_ALINEAS and re_approb.match(line):
            art_num += 1
            article = {
                "type": "article",
                "order": art_num,
                "alineas": {},
                "statut": "none",
                "titre": "1er"
            }
            read = READ_ALINEAS

        # Identify section zones
        line = normalize_section_title(line, text, has_multiple_expose)
        m = re_mat_sec.match(line)
        if m:
            read = READ_TITLE  # start reading section titles
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"

            if re.search(re_préliminaire, line) or " LIMINAIRE" in line.upper():
                section_num = "L"
            else:
                section_num = re_cl_html.sub('', m.group(5).strip())
                if word_to_number(section_num) is not None:
                    section_num = word_to_number(section_num)
                section_num = normalize_1(section_num, '1')
                section_num = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest:
                        section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(r"" + section_typ + r"[\dL].*$", "", section["id"])
            section["id"] = section_par + section_typ + str(section_num)
            # check_section_is_not_a_duplicate(section["id"])

            titre = blank_none(m.group('titre')).strip()
            if titre:
                section['titre'] = titre
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT
        elif re_mat_end.match(line) and not include_annexes:
            if not expose:
                break
            expose = False
            continue
        # Annexes.
        elif read == READ_ALINEAS and re_mat_ann.match(line):
            if include_annexes:
                if article is not None:
                    pr_js(article)
                titre = re_cl_html.sub("", re_mat_ann.sub("", line))
                art_num += 1
                article = {
                    "type": "annexe",
                    "order": art_num,
                    "alineas": {},
                    "statut": "none",
                    "titre": titre
                }
                ali_num = 0
            else:
                break
        # Identify titles and new article zones
        elif (re.match(r"(<i>)?<b>", line) or
                re_art_uni.match(cl_line) or
                re.match(r"^Articles? ", line)
              ) and not re.search(r">Articles? supprimé", line):

            line = cl_line.strip()
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    pr_js(article)
                read = READ_ALINEAS  # start reading alineas
                expose = False
                art_num += 1
                ali_num = 0
                article = {"type": "article", "order": art_num, "alineas": {}, "statut": "none"}
                if srclst:
                    article["source_text"] = srclst[curtext]
                m = re_mat_art.match(clean_article_name(text))
                article["titre"] = normalize_1(m.group(1), "1er")

                assert article["titre"]  # avoid empty titles
                assert not texte['definitif'] or ' bis' not in article["titre"]  # detect invalid article names

                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == READ_TITLE and line:
                section["titre"] = lower_but_first(line)
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT

        # detect dots, used as hints for later completion
        if read != READ_DISABLED:
            if re_mat_dots.match(line):
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js({"type": "dots"})
                read = READ_TEXT # ignore alineas after the dots
                continue

        # Read articles with alineas
        if read == READ_ALINEAS and not m:
            line = re_clean_coord.sub('', line)
            # if the line was only "Pour coordination", ignore it
            if not line:
                continue
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip()).strip()
                continue
            if "<table>" in line:
                cl_line = cl_html_except_tables(line)
            line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "")+m.group(3)+"er", line)
            line = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), line)
            # Clean different versions of same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub(r'\1(Non modifié)', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)

            tmp = line
            line = re_clean_punc_space.sub(r'\1 \2', tmp)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1))+x.group(4) if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
        else:
            #metas
            continue

    # sometimes we find multiple text starts inside one (mainly due to annotations),
    # by default we normally keep only the latest one, but if it is empty,
    # try to find a good one from previously parsed articles
    # ex: http://www.assemblee-nationale.fr/15/propositions/pion0965.asp
    if not all_articles:
        for rejected in rejected_all_articles:
            articles_parsed = [art for art in rejected if art.get('type') == 'article']
            if len(articles_parsed):
                print('WARNING: retrieving parsed text from a previously rejected text')
                all_articles = rejected
                break

    if article is not None:
        pr_js(article)

    if indextext != -1 and curtext + 1 != len(srclst):
        print("WARNING: multiple texts announced but %d/%d found %s" % (curtext + 1, len(srclst), srclst), indextext)

    return all_articles
Code example #23
File: parser.py  Project: regardscitoyens/senapy
def parse_table_concordance(url):
    html = download(url).text
    soup = BeautifulSoup(html, 'html5lib')

    old_to_adopted = {}
    confusing_entries = set()

    rows = soup.select('div[align="center"] > table tr') + soup.select(
        'div[align="left"] > table tr')

    def normalize(entry):
        if entry.lower() in ('unique', '1'):
            return '1er'
        return entry

    def add(old, adopted):
        nonlocal old_to_adopted, confusing_entries
        if ' et ' in old:
            for el in old.split(' et '):
                add(el, adopted)
            return
        adopted, old = normalize(adopted), normalize(old)
        if adopted.lower() in (
                'id', 'idem'):  # id: Abbreviation of the Latin idem (“same”)
            adopted = old
        if adopted == '':
            adopted = 'supprimé'
        if 'suppr' in adopted.lower():
            adopted = adopted.lower()
        if old in old_to_adopted:
            print('## ERROR ###',
                  'DOUBLE ENTRY IN CONCORDANCE TABLE FOR',
                  old,
                  file=sys.stderr)
            confusing_entries.add(old)
        else:
            if 'suppr' not in adopted and adopted in old_to_adopted.values():
                print(
                    '## WARNING ###',
                    'MULTIPLE ARTICLES MERGED INTO ONE IN CONCORDANCE TABLE FOR',
                    adopted,
                    file=sys.stderr)
                adopted += ' (supprimé)'
            old_to_adopted[old] = adopted

    for line in rows:
        cells = [x.text.strip() for x in line.select('td')]
        old, adopted, *_ = cells
        if 'numérotation' in old.lower() or not old:
            continue
        add(old, adopted)

        # there can be two concordances per line
        # ex: https://www.senat.fr/dossier-legislatif/tc/tc_pjl08-155.html
        if len(cells) == 5:
            *_, old, adopted = cells
            add(old, adopted)

    for entry in confusing_entries:
        del old_to_adopted[entry]

    return old_to_adopted, list(confusing_entries)
Code example #24
from tlfp.tools._step_logic import get_previous_step, use_old_procedure


def article_to_markdown(art):
    texte = ""
    for key in sorted(art["alineas"].keys()):
        if art["alineas"][key] != "":
            texte += art["alineas"][key] + "\n\n"
    return texte


def amendement_to_markdown(texte):
    return html2markdown.convert(texte)


dossiers = download(f"https://www.lafabriquedelaloi.fr/api/dossiers.csv")

for csv_dos in csv.DictReader(dossiers.text.splitlines(), delimiter=";"):
    law = csv_dos['id']
    dos = download(
        f"https://www.lafabriquedelaloi.fr/api/{law}/viz/procedure.json").json(
        )

    for step_index, step in enumerate(dos["steps"]):
        if step.get("nb_amendements", 0) > 0:
            amendements = download(
                f"https://www.lafabriquedelaloi.fr/api/{law}/viz/amendements_{step['directory']}.json"
            ).json()
            try:
                texte = download(
                    f"https://www.lafabriquedelaloi.fr/api/{law}/procedure/{step['directory']}/texte/texte.json"
Code example #25
def process(OUTPUT_DIR, procedure):
    context = Context([0, OUTPUT_DIR], load_parls=True)

    #['Indéfini', 'Adopté', 'Irrecevable', 'Rejeté', 'Retiré', 'Tombe', 'Non soutenu', 'Retiré avant séance', 'Rectifié', 'Favorable' ,'Satisfait']
    def simplify_sort(sort):
        sort = sort.lower()
        if sort in "adopté favorable":
            return "adopté"
        if sort in "rejeté ":
            return "rejeté"
        if sort in "indéfini":
            return "en attente"
        return "non-voté"

    re_clean_first = re.compile(r'^(.*?)(,| et) .*$')
    def first_author(signataires):
        if signataires is None or "gouvernement" in signataires.lower():
            return ""
        return re_clean_first.sub(r'\1, …', signataires)

    def find_groupe(amd):
        if amd['signataires'] and "gouvernement" in amd['signataires'].lower():
            return "Gouvernement"
        ct = {}
        maxc = 0
        result = ""
        for gpe in amd['groupes_parlementaires']:
            g = slug_groupe(gpe['groupe'])
            if g not in ct:
                ct[g] = 0
            ct[g] += 1
            if ct[g] > maxc:
                maxc = ct[g]
                result = g
        return result

    def add_link(links, pA, pB, weight=1):
        p1 = min(pA, pB)
        p2 = max(pA, pB)
        linkid = "%s-%s" % (p1, p2)
        if linkid not in links:
            links[linkid] = {
              "1": p1,
              "2": p2,
              "w": 0
            }
        links[linkid]["w"] += weight

    article_number_regexp = re.compile(r'article (1er.*|(\d+).*)$', re.I)
    def sort_amendements(texte, amendements):
        articles = {}
        for article in texte:
            if article['type'] == 'article':
                titre = article.get('titre')
                if titre:
                    articles[titre.lower()] = article.get('order') * 10

        def solveorder(art):
            nonlocal articles
            art = art.lower()
            order = 10000
            if art == 'titre' or art.startswith('intitul'):
                return 0
            elif art.startswith('motion'):
                return 1
            elif art.startswith('projet') \
                or art.startswith('proposition') \
                or art.startswith('texte'):
                return 5
            else:
                m = article_number_regexp.search(art)
                if m:
                    if articles.get(m.group(1)):
                        order = articles.get(m.group(1))
                    elif articles.get(m.group(2)):
                        order = articles.get(m.group(2))
                    if 'avant' in art:
                        order -= 1
                    elif 'après' in art or 'apres' in art:
                        order += 1
            return order


        for amendement in amendements:
            amdt = amendement['amendement']
            amdt['ordre_article'] = solveorder(amdt['sujet'])

        return amendements


    CACHE_BUSTING = 'cache=%d' % time()
    if 'url_jo' in procedure:
        CACHE_BUSTING = 'cache=5feb2018' # fixed cache busting for promulgated laws
    steps = {}
    last_text_id, last_text_typeparl = None, None
    steps = procedure['steps']
    for i, step in enumerate(steps):
        print('    * step -', step.get('stage'), step.get('step'), step.get('source_url'))
        if step.get('step') not in ('commission', 'hemicycle'):
            continue
        if step.get('step') == 'commission' and step.get('stage') == 'CMP':
            continue

        if i == 0:
            continue

        last_step_index = get_previous_step(steps, i, is_old_procedure=procedure.get('use_old_procedure'))
        last_step = steps[last_step_index]
        last_step_with_good_text_number = steps[get_previous_step(steps, i,
            is_old_procedure=procedure.get('use_old_procedure'), get_depot_step=True)
        ]
        texte_url = last_step_with_good_text_number.get('source_url')

        if step.get('stage') != 'CMP' and last_step_with_good_text_number.get('institution') != step.get('institution'):
            print('ERROR - last step is from another institution', file=sys.stderr)
            continue

        # for a CMP hemicycle we have to get the right text inside the CMP commission
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            urls = [last_step.get('source_url')]
            if 'cmp_commission_other_url' in last_step:
                urls.append(last_step.get('cmp_commission_other_url'))
            an_url = [url for url in urls if 'nationale.fr' in url]
            senat_url = [url for url in urls if 'senat.fr' in url]
            if step.get('institution') == 'assemblee' and an_url:
                texte_url = an_url[0]
            elif step.get('institution') == 'senat' and senat_url:
                texte_url = senat_url[0]
            else:
                print('WARNING - missing the CMP commission text for', step.get('source_url'), file=sys.stderr)
                continue

        if texte_url is None:
            print('ERROR - no texte url', step.get('source_url'), file=sys.stderr)
            continue

        texte = open_json(os.path.join(context.sourcedir, 'procedure', last_step['directory']), 'texte/texte.json')

        amdt_url = None
        if "nationale.fr" in texte_url:
            if 'assemblee_legislature' not in procedure:
                print('         + no AN legislature - pass text')
                continue
            amdt_url = 'https://nosdeputes.fr/%s/amendements/%s/json?%s' % (procedure.get('assemblee_legislature'), get_text_id(texte_url), CACHE_BUSTING)
        elif "senat.fr" in texte_url:
            amdt_url = 'https://nossenateurs.fr/amendements/%s/json?%s' % (get_text_id(texte_url), CACHE_BUSTING)

        if amdt_url is None:
            continue

        print('      * downloading amendments:', amdt_url, 'for', texte_url)

        amendements_src = download(amdt_url).json().get('amendements', [])

        # TA texts can be zero-padded or not (TA0XXX or TAXXX), we try both
        if 'amendements/TA' in amdt_url:
            textid = get_text_id(texte_url)
            if 'TA0' in textid:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').lstrip('0'))
            else:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').zfill(4))
            print(' WARNING: TA - trying alternative url too', alternative_url)
            amendements_src += download(alternative_url).json().get('amendements', [])

        print('        parsing amendments:', len(amendements_src))

        # ignore amendments if they are not for the correct step
        amendements_src_filtered = []
        for amd in amendements_src:
            a = amd['amendement']
            if step.get('institution') == 'assemblee':
                # commission amendments can have two forms
                #    - /amendements/LOI/NUM.asp (13th legislature)
                #    - /amendements/LOI/COMMISSION_NAME/NUM.asp (14+ legislature)
                # hemicycle amendments are:
                #    - /amendements/LOI/NUM.asp (13th legislature)
                #    - /amendements/LOI/AN/NUM.asp (14+ legislature)
                amdt_step = 'hemicycle'
                if '/cr-' in a['source']:
                    amdt_step = 'commission'
                else:
                    url_parts = a['source'].split('amendements/')[1].split('/')
                    if len(url_parts) == 3 and url_parts[1] != 'AN':
                        amdt_step = 'commission'
            elif step.get('institution') == 'senat':
                amdt_step = 'commission' if '/commissions/' in a['source'] else 'hemicycle'
            else:
                # CMP - there's no way for now to distinguish the step
                amdt_step = step['step']
            if step['step'] != amdt_step:
                continue
            amendements_src_filtered.append(amd)

        if len(amendements_src_filtered) != len(amendements_src):
            print('WARNING: amendments ignored (not the right step) %s' %
                    (len(amendements_src) - len(amendements_src_filtered)), file=sys.stderr)
        amendements_src = amendements_src_filtered

        step['nb_amendements'] = len(amendements_src)

        if len(amendements_src) > 0:
            amendements_src = sort_amendements(texte['articles'], amendements_src)

            typeparl, urlapi = identify_room(texte_url,
                legislature=step.get('assemblee_legislature', procedure.get('assemblee_legislature')))

            sujets = {}
            groupes = {}

            fix_order = False
            orders = []
            parls = {}
            links = {}
            idents = {}
            for amd in amendements_src:
                a = amd['amendement']
                if "sort" not in a:
                    print('WARNING: amendment has no sort %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if a["sort"] == "Rectifié":
                    continue
                if "sujet" not in a or not a["sujet"]:
                    if a["sort"] not in ["Irrecevable", "Retiré avant séance"]:
                        print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                key = a['sujet']
                if not key:
                    print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if key not in sujets:
                    orders.append(key)
                    sujets[key] = {
                      'titre': key,
                      'order': a['ordre_article'],
                      'amendements': []
                    }
                if a['ordre_article'] > 9000:
                    fix_order = True

                gpe = find_groupe(a)
                if not gpe:
                    if a["sort"] != "Irrecevable":
                        sys.stderr.write('WARNING: no groupe found for %s\n' % a['url_nos%ss' % typeparl])
                    gpe = "Inconnu"
                context.add_groupe(groupes, gpe, urlapi)

                sujets[key]['amendements'].append({
                  'numero': a['numero'],
                  'date': a['date'],
                  'sort': simplify_sort(a['sort']),
                  'groupe': gpe,
                  'id_api': a['id'],
                  'aut': first_author(a['signataires'])
                })

                cosign = []
                hmd5 = a["cle_unicite"]
                if hmd5 not in idents:
                    idents[hmd5] = []
                for parll in a["parlementaires"]:
                    parl = parll["parlementaire"]
                    if parl not in parls:
                        p = context.get_parlementaire(urlapi, parl)
                        parls[parl] = {
                          "i": p["id"],
                          "s": parl,
                          "a": 0,
                          "n": p["nom"],
                          "g": p["groupe_sigle"],
                          "p": p["place_en_hemicycle"]
                        }
                    pid = parls[parl]["i"]
                    parls[parl]["a"] += 1
                    for cid in cosign:
                        add_link(links, pid, cid)
                        #add_link(links, pid, cid, 2)
                    cosign.append(pid)
                    for cid in idents[hmd5]:
                        add_link(links, pid, cid)
                    idents[hmd5].append(pid)

            if fix_order:
                orders.sort(key=cmp_to_key(compare_articles))
                for i, k in enumerate(orders):
                    sujets[k]["order"] = i

            amdtsfile = os.path.join(context.sourcedir, 'viz', 'amendements_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'api_root_url': amdapi_link(urlapi),
                    'groupes': groupes,
                    'sujets': sujets}
            print_json(data, amdtsfile)

            linksfile = os.path.join(context.sourcedir, 'viz', 'amendements_links_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'links': list(links.values()),
                    'parlementaires': dict((p["i"], dict((k, p[k]) for k in "psang")) for p in list(parls.values()))}
            # print_json(data, linksfile)


        ###########  INTERVENTIONS #############
        # TODO: move this to a dedicated file

        print('      * downloading interventions')
        typeparl, urlapi = identify_room(texte_url,
            legislature=step.get('assemblee_legislature', procedure.get('assemblee_legislature')))
        inter_dir = os.path.join(context.sourcedir, 'procedure', step['directory'], 'interventions')
        commission_or_hemicycle = '?commission=1' if step.get('step') == 'commission' else '?hemicycle=1'
        # TODO: TA texts can be zero-padded or not (TA0XXX or TAXXX), we should try both
        seance_name = None
        intervention_files = []

        texts = (get_text_id(texte_url),)
        if last_text_typeparl == typeparl:
            texts = (get_text_id(texte_url), last_text_id)

        for loiid in texts:
            url_seances = 'https://{}.fr/seances/{}/json{}'.format(urlapi, loiid, commission_or_hemicycle)
            print('        * downloading seances - ', url_seances)
            for id_seance_obj in sorted(download(url_seances).json().get('seances', []), key=lambda x: x["seance"]):
                url_seance = 'https://{}.fr/seance/{}/{}/json'.format(urlapi, id_seance_obj['seance'], loiid)
                print('           downloading seance - ', url_seance)
                resp = download(url_seance).json()
                if resp.get('seance'):
                    inter = resp.get('seance')[0]['intervention']
                    seance_name = inter['date'] + 'T' + inter['heure'] + '_' + inter['seance_id']
                    print('            dumping seance -', seance_name)
                    intervention_files.append(seance_name)
                    if not os.path.exists(inter_dir):
                        os.makedirs(inter_dir)
                    print_json(resp, os.path.join(inter_dir, seance_name + '.json'))
            if seance_name:
                step['has_interventions'] = True
                step['intervention_files'] = intervention_files
                break

        last_text_id = get_text_id(texte_url)
        last_text_typeparl = typeparl

    return procedure
Code example #26
def process(OUTPUT_DIR, procedure):
    context = Context(OUTPUT_DIR, load_parls=True)

    #['Indéfini', 'Adopté', 'Irrecevable', 'Rejeté', 'Retiré', 'Tombe', 'Non soutenu', 'Retiré avant séance', 'Rectifié', 'Favorable' ,'Satisfait']
    def simplify_sort(sort):
        sort = sort.lower()
        if sort in "adopté favorable":
            return "adopté"
        if sort in "rejeté ":
            return "rejeté"
        if sort in "indéfini":
            return "en attente"
        return "non-voté"

    re_clean_first = re.compile(r'^(.*?)(,| et) .*$')
    def first_author(signataires):
        if signataires is None or "gouvernement" in signataires.lower():
            return ""
        return re_clean_first.sub(r'\1, …', signataires)

    def find_groupe(amd, typeparl, urlapi):
        if amd['signataires'] and "gouvernement" in amd['signataires'].lower():
            return "Gouvernement"

        # Fix groupes not historicized in NosSénateurs
        if typeparl == "senateur" and amd["parlementaires"]:
            return context.get_senateur_groupe(amd["parlementaires"][0]["parlementaire"], amd["date"], urlapi)

        return amd['auteur_groupe_acronyme']

    def add_link(links, pA, pB, weight=1):
        p1 = min(pA, pB)
        p2 = max(pA, pB)
        linkid = "%s-%s" % (p1, p2)
        if linkid not in links:
            links[linkid] = {
              "1": p1,
              "2": p2,
              "w": 0
            }
        links[linkid]["w"] += weight

    article_number_regexp = re.compile(r'article (1er.*|(\d+).*)$', re.I)
    def sort_amendements(texte, amendements):
        articles = {}
        for article in texte:
            if article['type'] == 'article':
                titre = article.get('titre')
                if titre:
                    articles[titre.lower()] = article.get('order') * 10

        def solveorder(art):
            nonlocal articles
            art = art.lower()
            order = 10000
            if art == 'titre' or art.startswith('intitul'):
                return 0
            elif art.startswith('motion'):
                return 1
            elif art.startswith('projet') \
                or art.startswith('proposition') \
                or art.startswith('texte'):
                return 5
            else:
                m = article_number_regexp.search(art)
                if m:
                    if articles.get(m.group(1)):
                        order = articles.get(m.group(1))
                    elif articles.get(m.group(2)):
                        order = articles.get(m.group(2))
                    if 'avant' in art:
                        order -= 1
                    elif 'après' in art or 'apres' in art:
                        order += 1
            return order


        for amendement in amendements:
            amdt = amendement['amendement']
            amdt['ordre_article'] = solveorder(amdt['sujet'])

        return amendements


    CACHE_BUSTING = 'cache=%d' % time()
    if 'url_jo' in procedure:
        CACHE_BUSTING = 'cache=lfdll-prod' # fixed cache busting for promulgated laws
    steps = {}
    last_text_id, last_text_typeparl = None, None
    steps = procedure['steps']
    for i, step in enumerate(steps):
        print('    * step -', step.get('stage'), step.get('step'), step.get('source_url'))
        if step.get('step') not in ('commission', 'hemicycle'):
            continue
        if step.get('step') == 'commission' and step.get('stage') == 'CMP':
            continue

        if i == 0:
            continue

        last_step_index = get_previous_step(steps, i, is_old_procedure=procedure.get('use_old_procedure'))
        last_step = steps[last_step_index]
        last_step_with_good_text_number = steps[get_previous_step(steps, i,
            is_old_procedure=procedure.get('use_old_procedure'), get_depot_step=True)
        ]
        texte_url = last_step_with_good_text_number.get('source_url')

        if last_step.get('in_discussion'):
            print('WARNING: ignoring future steps further than current discussion', file=sys.stderr)
            break

        if step.get('stage') != 'CMP' and last_step_with_good_text_number.get('institution') != step.get('institution'):
            print('ERROR - last step is from another institution', file=sys.stderr)
            continue

        # for a CMP hemicycle we have to get the right text inside the CMP commission
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            urls = [last_step.get('source_url')]
            if 'cmp_commission_other_url' in last_step:
                urls.append(last_step.get('cmp_commission_other_url'))
            an_url = [url for url in urls if 'nationale.fr' in url]
            senat_url = [url for url in urls if 'senat.fr' in url]
            if step.get('institution') == 'assemblee' and an_url:
                texte_url = an_url[0]
            elif step.get('institution') == 'senat' and senat_url:
                texte_url = senat_url[0]
            else:
                print('WARNING - missing the CMP commission text for', step.get('source_url'), file=sys.stderr)
                continue

        if texte_url is None:
            print('ERROR - no texte url', step.get('source_url'), file=sys.stderr)
            continue

        legislature = None
        if 'assemblee-nationale.fr' in texte_url:
            legislature = national_assembly_text_legislature(texte_url)

        texte = open_json(os.path.join(context.sourcedir, 'procedure', last_step['directory']), 'texte/texte.json')

        typeparl, urlapi = identify_room(texte_url, legislature)

        amdt_url = None
        if "nationale.fr" in texte_url:
            if 'assemblee_legislature' not in procedure:
                print('         + no AN legislature - pass text')
                continue
            amdt_url = 'https://%s.fr/%s/amendements/%s/json?%s' % (urlapi, legislature, get_text_id(texte_url), CACHE_BUSTING)
        elif "senat.fr" in texte_url:
            amdt_url = 'https://%s.fr/amendements/%s/json?%s' % (urlapi, get_text_id(texte_url), CACHE_BUSTING)

        if amdt_url is None:
            continue

        print('      * downloading amendments:', amdt_url, 'for', texte_url)

        try:
            amendements_src = download(amdt_url).json().get('amendements', [])
        except Exception:
            raise Exception("ERROR: amendements JSON at %s is badly formatted, it should probably be hardcached on ND/NS" % amdt_url)

        # TA texts can be zero-padded or not (TA0XXX or TAXXX), we try both
        if 'amendements/TA' in amdt_url:
            textid = get_text_id(texte_url)
            if 'TA0' in textid:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').lstrip('0'))
            else:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').zfill(4))
            print(' WARNING: TA - trying alternative url too', alternative_url)
            try:
                amendements_src += download(alternative_url).json().get('amendements', [])
            except Exception:
                raise Exception("ERROR: amendements JSON at %s is badly formatted, it should probably be hardcached on ND/NS" % alternative_url)

        print('        parsing amendments:', len(amendements_src))

        # ignore amendments if they are not for the correct step
        amendements_src_filtered = []
        for amd in amendements_src:
            a = amd['amendement']
            if step.get('institution') == 'assemblee':
                # commission amendments can have two forms
                #    - /amendements/LOI/NUM.asp (13th legislature)
                #    - /amendements/LOI/COMMISSION_NAME/NUM.asp (14+ legislature)
                # hemicycle amendments are:
                #    - /amendements/LOI/NUM.asp (13th legislature)
                #    - /amendements/LOI/AN/NUM.asp (14+ legislature)
                amdt_step = 'hemicycle'
                if '/cr-' in a['source']:
                    amdt_step = 'commission'
                else:
                    url_parts = a['source'].split('amendements/')[1].split('/')
                    if len(url_parts) == 3 and url_parts[1] != 'AN':
                        amdt_step = 'commission'
            elif step.get('institution') == 'senat':
                amdt_step = 'commission' if '/commissions/' in a['source'] else 'hemicycle'
            else:
                # CMP - there's no way for now to distinguish the step
                amdt_step = step['step']
            if step['step'] != amdt_step:
                continue
            amendements_src_filtered.append(amd)

        if len(amendements_src_filtered) != len(amendements_src):
            print('WARNING: amendments ignored (not the right step) %s' %
                    (len(amendements_src) - len(amendements_src_filtered)), file=sys.stderr)
        amendements_src = amendements_src_filtered

        step['nb_amendements'] = len(amendements_src)

        if len(amendements_src) > 0:
            amendements_src = sort_amendements(texte['articles'], amendements_src)

            sujets = {}
            groupes = {}

            fix_order = False
            orders = []
            parls = {}
            links = {}
            idents = {}
            for amd in amendements_src:
                a = amd['amendement']
                if "sort" not in a:
                    print('WARNING: amendment has no sort %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if a["sort"] == "Rectifié":
                    continue
                if "sujet" not in a or not a["sujet"]:
                    if a["sort"] not in ["Irrecevable", "Retiré avant séance"]:
                        print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                key = a['sujet']
                if not key:
                    print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if key not in sujets:
                    orders.append(key)
                    sujets[key] = {
                      'titre': key,
                      'order': a['ordre_article'],
                      'amendements': []
                    }
                if a['ordre_article'] > 9000:
                    fix_order = True

                gpe = find_groupe(a, typeparl, urlapi)
                if not gpe:
                    if a["sort"] != "Irrecevable":
                        sys.stderr.write('WARNING: no groupe found for %s\n' % a['url_nos%ss' % typeparl])
                    gpe = "Inconnu"
                context.add_groupe(groupes, gpe, urlapi)

                sujets[key]['amendements'].append({
                  'numero': a['numero'],
                  'date': a['date'],
                  'sort': simplify_sort(a['sort']),
                  'groupe': gpe,
                  'id_api': a['id'],
                  'aut': first_author(a['signataires'])
                })

                cosign = []
                hmd5 = a["cle_unicite"]
                if hmd5 not in idents:
                    idents[hmd5] = []
                for parll in a["parlementaires"]:
                    parl = parll["parlementaire"]
                    if parl not in parls:
                        p = context.get_parlementaire(urlapi, parl)
                        parls[parl] = {
                          "i": p["id"],
                          "s": parl,
                          "a": 0,
                          "n": p["nom"],
                          "g": p["groupe_sigle"],
                          "p": p["place_en_hemicycle"]
                        }
                    pid = parls[parl]["i"]
                    parls[parl]["a"] += 1
                    for cid in cosign:
                        add_link(links, pid, cid)
                        #add_link(links, pid, cid, 2)
                    cosign.append(pid)
                    for cid in idents[hmd5]:
                        add_link(links, pid, cid)
                    idents[hmd5].append(pid)

            if fix_order:
                orders.sort(key=cmp_to_key(compare_articles))
                for i, k in enumerate(orders):
                    sujets[k]["order"] = i

            amdtsfile = os.path.join(context.sourcedir, 'viz', 'amendements_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'api_root_url': amdapi_link(urlapi),
                    'groupes': groupes,
                    'sujets': sujets}
            print_json(data, amdtsfile)

            linksfile = os.path.join(context.sourcedir, 'viz', 'amendements_links_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'links': list(links.values()),
                    'parlementaires': dict((p["i"], dict((k, p[k]) for k in "psang")) for p in list(parls.values()))}
            # print_json(data, linksfile)

        ###########  INTERVENTIONS #############
        # TODO: move this to a dedicated file

        print('      * downloading interventions')
        typeparl, urlapi = identify_room(texte_url, legislature)
        inter_dir = os.path.join(context.sourcedir, 'procedure', step['directory'], 'interventions')
        commission_or_hemicycle = '?commission=1' if step.get('step') == 'commission' else '?hemicycle=1'
        # TODO: TA texts can be zero-padded or not (TA0XXX or TAXXX), we should try both
        seance_name = None
        intervention_files = []

        texts = (get_text_id(texte_url),)
        if last_text_typeparl == typeparl:
            texts = (get_text_id(texte_url), last_text_id)

        for loiid in texts:
            if typeparl == 'depute':
                url_seances = 'https://%s.fr/%s/seances/%s/json%s' % (urlapi, legislature, loiid, commission_or_hemicycle)
            else:
                url_seances = 'https://%s.fr/seances/%s/json%s' % (urlapi, loiid, commission_or_hemicycle)

            print('        * downloading seances - ', url_seances)
            for id_seance_obj in sorted(download(url_seances).json().get('seances', []), key=lambda x: x["seance"]):
                if typeparl == 'depute':
                    url_seance = 'https://%s.fr/%s/seance/%s/%s/json' % (urlapi, legislature, id_seance_obj['seance'], loiid)
                else:
                    url_seance = 'https://%s.fr/seance/%s/%s/json' % (urlapi, id_seance_obj['seance'], loiid)

                print('           downloading seance - ', url_seance)
                resp = download(url_seance).json()
                if resp.get('seance'):
                    inter = resp.get('seance')[0]['intervention']
                    seance_name = inter['date'] + 'T' + inter['heure'] + '_' + inter['seance_id']
                    print('            dumping seance -', seance_name)
                    intervention_files.append(seance_name)
                    if not os.path.exists(inter_dir):
                        os.makedirs(inter_dir)
                    print_json(resp, os.path.join(inter_dir, seance_name + '.json'))
            if seance_name:
                step['has_interventions'] = True
                step['intervention_files'] = intervention_files
                break

        last_text_id = get_text_id(texte_url)
        last_text_typeparl = typeparl

    return procedure
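
The TA id alternation handled above can also be expressed as a small standalone helper. A minimal sketch, assuming ids of the form TAXXX/TA0XXX; alternative_ta_id is a hypothetical name used only here, not part of the original code.

def alternative_ta_id(texte_id):
    # TA text ids can appear zero-padded or not (TA0XXX vs TAXXX);
    # return the other spelling so both amendment URLs can be tried.
    number = texte_id.replace('TA', '')
    if texte_id.startswith('TA0'):
        return 'TA' + number.lstrip('0')  # 'TA0123' -> 'TA123'
    return 'TA' + number.zfill(4)         # 'TA123'  -> 'TA0123'

assert alternative_ta_id('TA0123') == 'TA123'
assert alternative_ta_id('TA123') == 'TA0123'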
Code example #27
def test_status(url):
    resp = download(url)
    if resp.status_code != 200:
        return False
    return resp
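
Illustrative use only (the URL is an arbitrary example): since test_status() returns the response object on HTTP 200 and False otherwise, callers can branch on its truthiness.

resp = test_status('https://www.senat.fr/leg/ppl15-246.html')
if resp:
    print('reachable:', resp.url, len(resp.text), 'chars')
else:
    print('unreachable or non-200')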
Code example #28
def download_senat(url, log=sys.stderr, verbose=True):
    if verbose: print('  [] download SENAT version')
    html = download(url).text
    if verbose: print('  [] parse SENAT version')
    return senapy_parse(html, url, logfile=log)
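
A hedged usage sketch: the dosleg URL is only an illustration, and the return value is whatever senapy_parse produces (typically a dossier dict).

dossier = download_senat('https://www.senat.fr/dossier-legislatif/ppl15-246.html', verbose=False)
print(type(dossier))
if isinstance(dossier, dict):
    print(sorted(dossier.keys()))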
Code example #29
def parse(url, resp=None, DEBUG=False, include_annexes=False):
    """
    Parse the text at the given url; an already downloaded response can be passed via `resp` to avoid an extra network request.
    """
    all_articles = []

    def pr_js(article):
        nonlocal all_articles, texte
        if not len(all_articles):
            add_to_articles(texte, all_articles)
        add_to_articles(article, all_articles)

    if url.endswith('.pdf'):
        print("WARNING: text url is a pdf: %s skipping it..." % url)
        return all_articles
    if 'assemblee-nat.fr' in url:
        print("WARNING: url corresponds to old AN website: %s skipping it..." %
              url)
        return all_articles

    if url.startswith('http'):
        resp = download(url) if resp is None else resp
        if '/textes/' in url:
            resp.encoding = 'utf-8'
        if 'assemblee-nationale.fr' in url:
            resp.encoding = 'Windows-1252'
        string = resp.text
    elif url == '-':
        string = sys.stdin.read()
    else:
        try:
            string = open(url).read()
        except:
            string = open(url, encoding="Windows-1252").read()

    string, has_multiple_expose = clean_extra_expose_des_motifs(string)

    if 'legifrance.gouv.fr' in url:
        for reg, res in clean_legifrance_regexps:
            string = reg.sub(res, string)
    else:
        for reg, res in clean_texte_regexps:
            string = reg.sub(res, string)

    # fix weird Sénat formatting: single-cell tables wrapped around pieces of text, sometimes multiline; ex: https://www.senat.fr/leg/ppl15-246.html
    for match in re.findall(
            r'(<table[^>]*>\s*(?:<t(?:body|r|d|h)[^>]*>\s*)+)(.*?)((?:\s*</t(?:body|r|d|h)[^>]*>)+\s*</table>)',
            string, re.I):
        if not re.search(r'<t(r|d|h)[^>]*>', match[1], re.I):
            string = string.replace(''.join(match), match[1])

    srclst = []
    source_avenants = False
    m = re.search(
        r"NB(\s|&nbsp;)+:(\s|&nbsp;)+[lL]es? textes? d(u |es |e la |e l&#8217;)((convention|traité|avenant)s? et de(s| l&#8217;))?(accord|convention)s?(-cadres?)? figuren?t? (respectivement )?en annexe aux (deux |trois )?projets de loi \(n",
        re.sub(r'</?span[^>]*>', '', string), re.I)
    if m:
        try:
            srclst = [
                int(s.strip('no ')) for s in
                (string.replace('<sup>', '').replace('</sup>', '').replace(
                    '&nbsp;', ' ').replace('aux deux projets', 'aux projets').
                 replace('aux trois projets', 'aux projets').replace(
                     '°', 'o').replace('nos ', 'no ').replace('ns ', 'no ').
                 replace('(n ', '(no ').split(
                     ' en annexe aux projets de loi (no ')[1].strip().split(
                         ')')[0].strip().replace(' et ', ', ').split(', '))
            ]
            source_avenants = True
        except Exception as e:
            if DEBUG:
                print(
                    "WARNING, multi-reports detected with NB method crashing (%s: %s), trying regular method..."
                    % (type(e), e))
    if not source_avenants and "/rapports/r" in url and "TEXTES ADOPTÉS PAR LA COMMISSION" in string and string.count(
            ">Article unique<") == 2:
        m = re.search(
            r'<i>Assemblée nationale&nbsp;:&nbsp;</i><b>(\d+) </b>et<b> (\d+)</b>',
            string)
        if m:
            srclst = [int(m.group(1)), int(m.group(2))]
            source_avenants = True

    definitif = re_definitif.search(
        string) is not None or re_definitif_new_format.search(
            string) is not None or 'legifrance.gouv.fr' in url
    soup = BeautifulSoup(string, "html5lib")
    texte = {"type": "texte", "source": url, "definitif": definitif}

    # Generate Senat or AN ID from URL
    if url.startswith('http'):
        if "legifrance.gouv.fr" in url:
            m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
            if m:
                texte["id"] = m.group(1)
            elif "/jo/texte" in url:
                texte["id"] = url.split('/')[-3]
        elif re.search(r"assemblee-?nationale", url, re.I):
            m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "A" + m.group(1) + "-"
            if m.group(2) is not None:
                texte["id"] += m.group(2)
            texte["id"] += str(numero)
            texte["nosdeputes_id"] = get_text_id(url)
        else:
            m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})(rec)?\d?(_mono)?\.", url,
                          re.I)
            if m is None:
                m = re.search(r"/(-)?20(\d+)-\d+/(\d+)(_mono)?.html", url,
                              re.I)
            numero = int(m.group(3))
            texte["id"] = "S" + m.group(2) + "-"
            if m.group(1) is not None:
                texte["id"] += m.group(1)
            texte["id"] += "%03d" % numero
            texte["nossenateurs_id"] = get_text_id(url)

    texte["titre"] = clean_html(
        re_clean_title_legif.sub(
            '', soup.title.string.strip())) if soup.title else ""
    texte["expose"] = ""
    expose = False

    # states 'read' can be set to:
    READ_DISABLED = -1  # the text is not detected yet
    READ_TEXT = 0  # read the text
    READ_TITLE = 1  # titles lecture
    READ_ALINEAS = 2  # alineas lecture

    read = READ_TEXT
    art_num = ali_num = 0
    article = {}
    indextext = -1
    curtext = -1
    section = {"type": "section", "id": ""}

    # by default we only keep the last detected text; previously detected texts are stored here
    rejected_all_articles = []

    def should_be_parsed(x):
        """returns True if x can contain useful information"""
        if x.name not in ('p', 'table', 'h1', 'h2', 'h4'):
            return False
        # hack: we don't want to parse the table containing the conclusion from the senat
        #       ex: https://www.senat.fr/leg/tas12-040.html
        if x.name == "table" and re.search("SESSION (EXTRA)?ORDINAIRE DE",
                                           str(x)):
            return False
        # hack: senate can copy paste the /textes/ output from the AN
        #       ex: https://www.senat.fr/leg/ppl17-545.html
        # TODO: they also mess up the encoding by doing that
        if x.name == "table" and re.search(
                "<!-- Col de controle de taille -->", str(x)):
            return False
        return True

    def should_be_ignored(x):
        if hasattr(x, 'attrs') and 'display: none' in x.attrs.get('style', ''):
            return True
        return False

    for text in non_recursive_find_all(soup, should_be_parsed,
                                       should_be_ignored):
        line = clean_html(str(text))
        if DEBUG:
            print(read,
                  article.get('titre') or art_num,
                  ali_num,
                  line,
                  file=sys.stderr)

        # limit h2/h4 matches to PPL headers or Article unique
        if text.name not in ('p', 'table') and not re_mat_ppl.match(
                line) and not re_mat_tco.match(
                    line) and 'Article unique' not in line:
            if DEBUG:
                print(" -> IGNORING LINE", file=sys.stderr)
            continue

        if re_stars.match(line):
            continue
        if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
            read = READ_DISABLED
        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0
        srcl = re_src_mult.search(line)
        if not source_avenants and srcl and read in (READ_DISABLED, READ_TEXT):
            srclst.append(int(srcl.group(1)))
            continue
        cl_line = re_cl_html.sub("", line).strip()
        if re_rap_mult.match(line):
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line) or (
                read == READ_DISABLED and line == "<b>Article 1er</b>"):
            read = READ_TEXT
            if len(all_articles):
                if DEBUG:
                    print('WARNING: Found articles before the real text')
                if article is not None:
                    pr_js(article)
                rejected_all_articles.append(all_articles)
                all_articles = []
                article = {}
                art_num = 0
        elif re_mat_exp.match(line):
            read = READ_DISABLED  # Deactivate description lecture
            expose = True
        elif read == READ_TEXT and (definitif_before_congres in line or definitif_after_congres in line):
            texte['definitif'] = True
            if all_articles:
                all_articles[0]['definitif'] = True
            continue
        elif (re_echec_cmp.search(cl_line) or re_echec_com.search(cl_line)
              or re_echec_com2.search(cl_line) or re_echec_com3.search(cl_line)
              or re_echec_com4.search(cl_line) or re_echec_com5.search(cl_line)
              or re_echec_com6.search(cl_line) or re_echec_hemi.match(cl_line)
              or re_echec_hemi2.search(cl_line)
              or re_echec_hemi3.search(cl_line)
              ) and 'dont la teneur suit' not in cl_line:
            pr_js({"type": "echec", "texte": cl_line})
            break
        elif read == READ_DISABLED:
            continue
        # or (indextext != -1 and curtext != indextext): #keep all texts resulting from multireport now it's selected then in complete

        # crazy edge case: "(Conforme)Article 24 bis A (nouveau)" on one line
        # http://www.assemblee-nationale.fr/13/projets/pl3324.asp
        # simplified, just do the "(Conforme)" case
        if '<i>(Conforme)</i>' in line and re_mat_art.search(line):
            article["statut"] = 'conforme'
            line = line.replace('<i>(Conforme)</i>', '')
            cl_line = cl_line.replace('(Conforme)', '')

        # another crazy edge case: the text is inside the annexe
        # ex: http://www.assemblee-nationale.fr/13/rapports/r2083.asp
        # TODO: could detect via "le présent projet de loi dans le texte figurant en annexe"
        #       like the source_avenants logic
        if read != READ_ALINEAS and re_approb.match(line):
            art_num += 1
            article = {
                "type": "article",
                "order": art_num,
                "alineas": {},
                "statut": "none",
                "titre": "1er"
            }
            read = READ_ALINEAS

        # Identify section zones
        line = normalize_section_title(line, text, has_multiple_expose)
        m = re_mat_sec.match(line)
        if m:
            read = READ_TITLE  # Activate titles lecture
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"

            if re.search(re_préliminaire,
                         line) or " LIMINAIRE" in line.upper():
                section_num = "L"
            else:
                section_num = re_cl_html.sub('', m.group(5).strip())
                if word_to_number(section_num) is not None:
                    section_num = word_to_number(section_num)
                section_num = normalize_1(section_num, '1')
                section_num = re_clean_bister.sub(
                    lambda m: m.group(1) + " " + real_lower(m.group(2)),
                    section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest:
                        section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(r"" + section_typ + r"[\dL].*$", "",
                                 section["id"])
            section["id"] = section_par + section_typ + str(section_num)
            # check_section_is_not_a_duplicate(section["id"])

            titre = blank_none(m.group('titre')).strip()
            if titre:
                section['titre'] = titre
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT
        elif re_mat_end.match(line) and not include_annexes:
            if not expose:
                if DEBUG:
                    print("DEBUG: END OF TEXT OF DETECTED")
                if len(all_articles) > 0:
                    break
            expose = False
            continue
        # Annexes.
        elif read == READ_ALINEAS and re_mat_ann.match(line):
            if include_annexes:
                if article is not None:
                    pr_js(article)
                titre = re_cl_html.sub("", re_mat_ann.sub("", line))
                art_num += 1
                article = {
                    "type": "annexe",
                    "order": art_num,
                    "alineas": {},
                    "statut": "none",
                    "titre": titre
                }
                ali_num = 0
            else:
                break
        # Identify titles and new article zones
        elif (re.match(r"(<i>)?<b>", line) or re_art_uni.match(cl_line)
              or re.match(r"^Articles? ", line)) and not re.search(
                  r">Articles? supprimé", line):

            line = cl_line.strip()
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    pr_js(article)
                read = READ_ALINEAS  # Activate alineas lecture
                expose = False
                art_num += 1
                ali_num = 0
                article = {
                    "type": "article",
                    "order": art_num,
                    "alineas": {},
                    "statut": "none"
                }
                if srclst:
                    article["source_text"] = srclst[curtext]
                m = re_mat_art.match(clean_article_name(text))
                article["titre"] = normalize_1(m.group(1),
                                               "1er").replace(u"İ", "I")

                assert article["titre"]  # avoid empty titles
                assert not texte['definitif'] or ' bis' not in article[
                    "titre"]  # detect invalid article names

                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(
                        m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == READ_TITLE and line:
                section["titre"] = lower_but_first(line)
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT

        # detect dots, used as hints for later completion
        if read != READ_DISABLED:
            if re_mat_dots.match(line):
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js({"type": "dots"})
                read = READ_TEXT  # ignore alineas after the dots
                continue

        # Read articles with alineas
        if read == READ_ALINEAS and not m:
            line = re_clean_coord.sub('', line)
            # if the line was only "Pour coordination", ignore it
            if not line:
                continue
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub(
                    "",
                    re_cl_par.sub("", real_lower(line)).strip()).strip()
                continue
            if "<table>" in line:
                cl_line = cl_html_except_tables(line)
            line = re_clean_art_spaces2.sub(
                '. - ',
                re_clean_art_spaces.sub(
                    r'\1',
                    re_clean_idx_spaces.sub(
                        r'\1. ',
                        re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(
                lambda m: (real_lower(m.group(0))
                           if m.group(1) else "") + m.group(3) + "er", line)
            line = re_clean_bister.sub(
                lambda m: m.group(1) + " " + real_lower(m.group(2)), line)
            # Clean different versions of same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub(r'\1(Non modifié)', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)

            tmp = line
            line = re_clean_punc_space.sub(r'\1 \2', tmp)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(
                lambda x: lower_but_first(x.group(1)) + x.group(4)
                if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
        else:
            #metas
            continue

    # sometimes we find multiple text starts inside one (mainly due to annotations),
    # by default we normally keep only the latest one, but if it is empty,
    # try to find a good one from previously parsed articles
    # ex: http://www.assemblee-nationale.fr/15/propositions/pion0965.asp
    if not all_articles:
        for rejected in rejected_all_articles:
            articles_parsed = [
                art for art in rejected if art.get('type') == 'article'
            ]
            if len(articles_parsed):
                print(
                    'WARNING: retrieving parsed text from a previously rejected text'
                )
                all_articles = rejected
                break

    if article is not None:
        pr_js(article)

    if indextext != -1 and curtext + 1 != len(srclst):
        print(
            "WARNING: multiple texts announced but %d/%d found %s" %
            (curtext + 1, len(srclst), srclst), indextext)

    return all_articles
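
A hedged usage sketch of parse(): as the code above shows, it returns a flat list of dicts with the "texte" metadata entry first, followed by "section" / "article" entries whose alineas are keyed "001", "002", ...; the URL is only an illustration.

items = parse('https://www.senat.fr/leg/ppl15-246.html')
for item in items:
    if item.get('type') == 'article':
        print('Article', item.get('titre'), '-', len(item.get('alineas', {})), 'alineas')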
Code example #30
File: opendata.py Project: regardscitoyens/senapy
def fetch_csv():
    csv_resp = download(
        "http://data.senat.fr/data/dosleg/dossiers-legislatifs.csv").text
    return list(csv.DictReader(csv_resp.split('\n'), delimiter=';'))
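
Illustrative call: each row is a dict keyed by the CSV header names published at data.senat.fr (the exact columns are not fixed here).

dossiers = fetch_csv()
print(len(dossiers), 'dossiers')
if dossiers:
    print(sorted(dossiers[0].keys()))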
Code example #31
def download_texte(url):
    text = download(url).text
    return clean_fioritures(clean_br(text))
Code example #32
def parse(url, resp=None):
    """
    Parse the text at the given url; an already downloaded response can be passed via `resp` to avoid an extra network request.
    """
    all_articles = []
    def pr_js(article):
        nonlocal all_articles
        add_to_articles(article, all_articles)

    def save_text(txt):
        if "done" not in txt:
            pr_js(txt)
        txt["done"] = True
        return txt

    if url.endswith('.pdf'):
        print("WARNING: text url is a pdf: %s skipping it..." % url)
        return all_articles
    if 'assemblee-nat.fr' in url:
        print("WARNING: url corresponds to old AN website: %s skipping it..." % url)
        return all_articles


    if url.startswith('http'):
        resp = download(url) if resp is None else resp
        if '/textes/' in url:
            resp.encoding = 'utf-8'
        string = resp.text
    else:
        string = open(url).read()

    string, has_multiple_expose = clean_extra_expose_des_motifs(string)

    if 'legifrance.gouv.fr' in url:
        for reg, res in clean_legifrance_regexps:
            string = reg.sub(res, string)
    else:
        for reg, res in clean_texte_regexps:
            string = reg.sub(res, string)


    definitif = re_definitif.search(string) is not None or 'legifrance.gouv.fr' in url
    soup = BeautifulSoup(string, "html5lib")
    texte = {"type": "texte", "source": url, "definitif": definitif}

    # Generate Senat or AN ID from URL
    if url.startswith('http'):
        if "legifrance.gouv.fr" in url:
            m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
            texte["id"] = m.group(1)
        elif re.search(r"assemblee-?nationale", url, re.I):
            m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "A" + m.group(1) + "-"
            if m.group(2) is not None:
                texte["id"] += m.group(2)
            texte["id"] += str(numero)
            texte["nosdeputes_id"] = get_text_id(url)
        else:
            m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})(rec)?\d?(_mono)?\.", url, re.I)
            if m is None:
                m = re.search(r"/(-)?20(\d+)-\d+/(\d+)(_mono)?.html", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "S" + m.group(2) + "-"
            if m.group(1) is not None:
                texte["id"] += m.group(1)
            texte["id"] += "%03d" % numero
            texte["nossenateurs_id"] = get_text_id(url)

    texte["titre"] = re_clean_title_legif.sub('', soup.title.string.strip()) if soup.title else ""
    texte["expose"] = ""
    expose = False

    # 'read' can be
    #     -1 : the text is not detected yet
    #      0 : read the text
    #      1 : titles lecture
    #      2 : alineas lecture
    read = art_num = ali_num = 0
    section_id = ""
    article = None
    indextext = -1
    curtext = -1
    srclst = []
    section = {"type": "section", "id": ""}

    def should_be_parsed(x):
        """returns True if x can contain useful information"""
        if x.name not in ('p', 'table', 'h2', 'h4'):
            return False
        # hack: we don't want to parse the table containing the conclusion from the senat
        # ex: https://www.senat.fr/leg/tas12-040.html
        if x.name == "table" and re.search("SESSION (EXTRA)?ORDINAIRE DE", str(x)):
            return False
        return True

    for text in non_recursive_find_all(soup, should_be_parsed):
        line = clean_html(str(text))

        # limit h2/h4 matches to PPL headers or Article unique
        if text.name not in ('p', 'table') and not re_mat_ppl.match(line) and 'Article unique' not in line:
            continue

        if re_stars.match(line):
            continue
        if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
            read = -1
        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0
        srcl = re_src_mult.search(line)
        cl_line = re_cl_html.sub("", line).strip()
        if srcl and read < 1:
            srclst.append(int(srcl.group(1)))
            continue
        elif re_rap_mult.match(line):
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line):
            read = 0
            texte = save_text(texte)
        elif re_mat_exp.match(line):
            read = -1 # Deactivate description lecture
            expose = True
        elif read == 0 and (definitif_before_congres in line or definitif_after_congres in line):
            texte['definitif'] = True
            if all_articles:
                all_articles[0]['definitif'] = True
            continue
        elif (re_echec_cmp.search(cl_line)
                or re_echec_com.search(cl_line)
                or re_echec_com2.search(cl_line)
                or re_echec_com3.search(cl_line)
                or re_echec_com4.search(cl_line)
                or re_echec_com5.search(cl_line)
                or re_echec_com6.search(cl_line)
                or re_echec_hemi.match(cl_line)
                or re_echec_hemi2.search(cl_line)
                or re_echec_hemi3.search(cl_line)
            ) and 'dont la teneur suit' not in cl_line:
            texte = save_text(texte)
            pr_js({"type": "echec", "texte": cl_line})
            break
        elif read == -1 or (indextext != -1 and curtext != indextext):
            continue

        # crazy edge case: "(Conforme)Article 24 bis A (nouveau)" on one line
        # http://www.assemblee-nationale.fr/13/projets/pl3324.asp
        # simplified, just do the "(Conforme)" case
        if '<i>(Conforme)</i>' in line and re_mat_art.search(line):
            article["statut"] = 'conforme'
            line = line.replace('<i>(Conforme)</i>', '')
            cl_line = cl_line.replace('(Conforme)', '')

        # Identify section zones
        line = normalize_section_title(line, text, has_multiple_expose)
        m = re_mat_sec.match(line)
        if m:
            read = 1 # Activate titles lecture
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"

            if " LIMINAIRE" in line:
                section_num = "L"
            else:
                section_num = re_cl_html.sub('', m.group(5).strip())
                if word_to_number(section_num) is not None:
                    section_num = word_to_number(section_num)
                section_num = normalize_1(section_num, '1')
                section_num = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest: section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(r"" + section_typ + r"[\dL].*$", "", section["id"])
            section["id"] = section_par + section_typ + str(section_num)
            # check_section_is_not_a_duplicate(section["id"])

            titre = blank_none(m.group('titre')).strip()
            if titre:
                texte = save_text(texte)
                section['titre'] = titre
                if article is not None:
                    pr_js(article)
                    article = None
                pr_js(section)
                read = 0
        # Identify titles and new article zones
        elif (not expose and re_mat_end.match(line)) or (read == 2 and re_mat_ann.match(line)):
            break
        elif (re.match(r"(<i>)?<b>", line) or re_art_uni.match(cl_line) or re.match(r"^Articles? ", line)
            ) and not re.search(r">Articles? supprimé", line):

            line = cl_line.strip()
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    texte = save_text(texte)
                    pr_js(article)
                read = 2 # Activate alineas lecture
                expose = False
                art_num += 1
                ali_num = 0
                article = {"type": "article", "order": art_num, "alineas": {}, "statut": "none"}
                if srclst:
                    article["source_text"] = srclst[curtext]
                m = re_mat_art.match(clean_article_name(text))
                article["titre"] = normalize_1(m.group(1), "1er")

                assert article["titre"]  # avoid empty titles
                assert not texte['definitif'] or ' bis' not in article["titre"]  # detect invalid article names

                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == 1 and line:
                texte = save_text(texte)
                section["titre"] = lower_but_first(line)
                if article is not None:
                    pr_js(article)
                    article = None
                pr_js(section)
                read = 0

        # detect dots, used as hints for later completion
        if read != -1 and len(all_articles) > 0:
            if re_mat_dots.match(line):
                if article is not None:
                    texte = save_text(texte)
                    pr_js(article)
                    article = None
                pr_js({"type": "dots"})
                read = 0
                continue

        # Read articles with alineas
        if read == 2 and not m:
            line = re_clean_coord.sub('', line)
            # if the line was only "Pour coordination", ignore it
            if not line:
                continue
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip()).strip()
                continue
            if "<table>" in line:
                cl_line = cl_html_except_tables(line)
            line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "")+m.group(3)+"er", line)
            line = re_clean_bister.sub(lambda m: m.group(1)+" "+real_lower(m.group(2)), line)
            # Clean different versions of same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub(r'\1(Non modifié)', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)

            tmp = line
            line = re_clean_punc_space.sub(r'\1 \2', tmp)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1))+x.group(4) if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
        else:
            #metas
            continue

    if article is not None:
        save_text(texte)
        pr_js(article)

    return all_articles
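
A minimal standalone illustration of the save_text() pattern above: the "done" flag guarantees the texte metadata is emitted at most once, right before the first section or article that gets saved; _save_once and the sample data are only for this sketch.

def _save_once(txt, out):
    if "done" not in txt:
        out.append(txt)
    txt["done"] = True
    return txt

emitted = []
texte_meta = {"type": "texte", "source": "<url>", "definitif": False}
_save_once(texte_meta, emitted)
_save_once(texte_meta, emitted)
assert len(emitted) == 1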
Code example #33
File: opendata.py Project: regardscitoyens/senapy
def fetch_csv():
    csv_resp = download("http://data.senat.fr/data/dosleg/dossiers-legislatifs.csv").text
    return list(csv.DictReader(csv_resp.split('\n'), delimiter=';'))