def download_historic_dosleg(url):
    resp = download(url)
    if '/dyn/' in resp.url:
        # fallback to backed-up doslegs when the redirect is forced
        legislature, slug = parse_national_assembly_url(url)
        display_url = AN_OLD_URL_TEMPLATE.format(legislature=legislature, slug=slug)
        download_url = 'https://raw.githubusercontent.com/regardscitoyens/archive-AN-doslegs/master/archive/' \
            + display_url.split('.fr/')[1]
        resp = download(download_url)
        resp.url = display_url
        resp.encoding = 'Windows-1252'
    return resp
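# Usage sketch for download_historic_dosleg (the URL below is illustrative):
# when the AN site force-redirects an old dossier to its new /dyn/ pages, the
# page is re-fetched from the regardscitoyens/archive-AN-doslegs GitHub backup,
# while resp.url and resp.encoding are rewritten so downstream parsers still
# see the historic address.
#
#   resp = download_historic_dosleg('http://www.assemblee-nationale.fr/14/dossiers/some_dosleg.asp')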
def process(output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    yesterday = time.time() - 86400

    for url in "2007-2012.nosdeputes", "2012-2017.nosdeputes", "2017-2022.nosdeputes", "www.nosdeputes", "www.nossenateurs":
        dfile = '%s-groupes.json' % url
        destfile = os.path.join(output_directory, dfile)
        if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
            print('downloading', dfile)
            open(destfile, 'w').write(download("https://%s.fr/organismes/groupe/json" % url).text)

        dfile = '%s.parlementaires.json' % url
        destfile = os.path.join(output_directory, dfile)
        if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
            print('downloading', dfile)
            open(destfile, 'w').write(download("http://%s.fr/%s/json" % (url, 'deputes' if 'deputes' in url else 'senateurs')).text)
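# The freshness test above ("file missing or older than 24h") is the same
# pattern download_daily() uses further down; a minimal standalone sketch of
# it (needs_refresh is a hypothetical helper name, not part of the repo):
import os
import time

def needs_refresh(path, max_age_seconds=86400):
    """Return True when `path` does not exist or is older than `max_age_seconds`."""
    return not os.path.exists(path) or os.path.getmtime(path) < time.time() - max_age_seconds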
def download_open_data_file(filename, file_url):
    raw_data = download(file_url)
    data_zip = zipfile.ZipFile(io.BytesIO(raw_data.content))

    if filename:
        with data_zip.open(filename) as d:
            return json.loads(d.read().decode('utf-8'))

    data = {
        "export": {
            "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "dossiersLegislatifs": {"dossier": []},
            "textesLegislatifs": {"document": []},
        }
    }
    for filename in data_zip.namelist():
        with data_zip.open(filename) as d:
            filedata = json.loads(d.read().decode('utf-8'))
            if "dossierParlementaire" in filename:
                data["export"]["dossiersLegislatifs"]["dossier"].append(filedata)
            else:
                data["export"]["textesLegislatifs"]["document"].append(filedata["document"])
    return data
def download_open_data_file(filename, file_url):
    raw_data = download(file_url)
    data_zip = zipfile.ZipFile(io.BytesIO(raw_data.content))
    if filename:
        with data_zip.open(filename) as d:
            return json.loads(d.read().decode('utf-8'))
    return data_zip
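# Usage sketch for the two download_open_data_file variants above: with a
# filename, both return the parsed JSON of that single zip entry; without one,
# the first variant merges every entry into one "export" dict (dossiers under
# dossiersLegislatifs, texts under textesLegislatifs) while this shorter
# variant simply hands back the ZipFile for the caller to iterate.
#
#   dossier = download_open_data_file("Dossiers_Legislatifs_XV.json", file_url)
#   whole_zip = download_open_data_file(None, file_url)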
def test_status(url):
    try:
        resp = download(url)
        if resp.status_code != 200:
            return False
    except Exception:
        return False
    return resp
def test_status(url):
    resp = download(url)
    if resp.status_code != 200:
        return False
    # TODO: do this in download()
    if 'assemblee-nationale.fr' in url:
        resp.encoding = 'Windows-1252'
    return resp
def download_senat(url, log=sys.stderr):
    print('  [] download SENAT version')
    resp = download(url)
    if resp.status_code != 200:
        print('WARNING: Invalid response -', resp.status_code)
        return
    html = resp.text
    print('  [] parse SENAT version')
    senat_dos = senapy_parse(html, url, logfile=log)
    debug_file(senat_dos, 'senat_dos.json')
    return senat_dos
def extract_full_decision(url):
    decision_src = download(url).text
    if '<a name=\'visa\' id="visa"></a>' not in decision_src:
        print("ERROR: could not find visa in decision CC", url, file=sys.stderr)
        return None
    decision_txt = decision_src.split('<a name=\'visa\' id="visa"></a>')[1]
    if not re_delibere.search(decision_txt):
        print("ERROR: could not find siège in décision CC", url, file=sys.stderr)
        return None
    decision_txt = clean_delib(decision_txt)
    return strip_text(decision_txt)
def are_same_doslegs(senat_dos, an_dos):
    # same dosleg url ?
    if an_dos['url_dossier_senat'] == senat_dos['url_dossier_senat']:
        return True
    elif download(an_dos['url_dossier_senat']).status_code == 404:
        return True
    # same first text ?
    if senat_dos.get('steps') and an_dos.get('steps') \
            and senat_dos['steps'][0].get('source_url') == an_dos['steps'][0].get('source_url'):
        return True
    # it's not the same dosleg !
    return False
def find_an_url(data):
    if not data['steps']:
        return
    an_text_url = [step['source_url'] for step in data['steps']
                   if step.get('source_url') and 'assemblee-nationale' in step.get('source_url')]
    for url in an_text_url:
        html = download(url).text
        soup = BeautifulSoup(html, 'lxml')
        btn = soup.select_one('#btn_dossier')
        if btn:
            a = btn.parent
            if a.attrs.get('href'):
                return clean_url(urljoin(url, a.attrs['href']))
def parse_table_concordance(url):
    html = download(url).text
    soup = BeautifulSoup(html, 'html5lib')

    old_to_adopted = {}
    confusing_entries = set()

    rows = soup.select('div[align="center"] > table tr') + soup.select('div[align="left"] > table tr')

    def normalize(entry):
        if entry.lower() in ('unique', '1'):
            return '1er'
        return entry

    def add(old, adopted):
        nonlocal old_to_adopted, confusing_entries
        if ' et ' in old:
            for el in old.split(' et '):
                add(el, adopted)
            return
        adopted, old = normalize(adopted), normalize(old)
        if adopted.lower() in ('id', 'idem'):  # id: abbreviation of the Latin idem ("same")
            adopted = old
        if adopted == '':
            adopted = 'supprimé'
        if 'suppr' in adopted.lower():
            adopted = adopted.lower()
        if old in old_to_adopted:
            print('## ERROR ###', 'DOUBLE ENTRY IN CONCORDANCE TABLE FOR', old, file=sys.stderr)
            confusing_entries.add(old)
        else:
            if 'suppr' not in adopted and adopted in old_to_adopted.values():
                print('## WARNING ###', 'MULTIPLE ARTICLES MERGED INTO ONE IN CONCORDANCE TABLE FOR', adopted, file=sys.stderr)
                adopted += ' (supprimé)'
            old_to_adopted[old] = adopted

    for line in rows:
        cells = [x.text.strip() for x in line.select('td')]
        old, adopted, *_ = cells
        if 'numérotation' in old.lower() or not old:
            continue
        add(old, adopted)
        # there can be two concordances per line
        # ex: https://www.senat.fr/dossier-legislatif/tc/tc_pjl08-155.html
        if len(cells) == 5:
            *_, old, adopted = cells
            add(old, adopted)

    for entry in confusing_entries:
        del old_to_adopted[entry]

    return old_to_adopted, list(confusing_entries)
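# Illustration of the normalization rules applied by add()/normalize() above
# (values are illustrative, not taken from a real concordance table):
#   - 'unique' or '1'  -> '1er'
#   - 'id' / 'idem'    -> the old article number is kept as the new one
#   - ''               -> 'supprimé'
#   - '1 et 2' -> '3'  -> two entries; the second becomes '3 (supprimé)' since
#                         two old articles merged into the same adopted one
# so parse_table_concordance(url) would return e.g.
#   ({'1er': '3', '2': '3 (supprimé)'}, [])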
def find_senat_url(data):
    if not data['steps']:
        return
    senat_text_url = [step['source_url'] for step in data['steps']
                      if step.get('source_url') and 'senat.fr' in step.get('source_url')]
    for url in senat_text_url:
        html = download(url).text
        soup = BeautifulSoup(html, 'lxml')
        for a in soup.select('#primary a'):
            href = urljoin(url, a.attrs.get('href', ''))
            if 'dossier-legislatif/' in href or 'dossierleg/' in href:
                return clean_url(href)
def find_an_url(data):
    if not data['steps']:
        return
    an_text_url = [step['source_url'] for step in data['steps']
                   if step.get('source_url') and 'assemblee-nationale' in step.get('source_url')]
    for url in an_text_url:
        html = download(url).text
        soup = BeautifulSoup(html, 'html5lib')
        btn = soup.select_one('#btn_dossier')
        if btn:
            a = btn.parent
            if a.attrs.get('href'):
                return clean_url(urljoin(url, a.attrs['href']))
def download_daily(url_or_collecter, filename, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    yesterday = time.time() - 86400
    destfile = os.path.join(output_directory, filename + ".json")

    if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
        print('downloading', filename)
        if isinstance(url_or_collecter, str):
            jsondata = download(url_or_collecter).json()
        else:
            jsondata = url_or_collecter()
        print_json(jsondata, destfile)
    else:
        jsondata = open_json(destfile)
    return jsondata
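# Usage sketch: download_daily() accepts either a URL string or a
# zero-argument callable, so both forms work (URL and names illustrative):
#   data = download_daily('https://example.org/data.json', 'data', '/tmp/cache')
#   data = download_daily(lambda: {'a': 1}, 'demo', '/tmp/cache')
# The result is cached in <output_directory>/<filename>.json for 24 hours.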
def download_an(url, url_senat=False, log=sys.stderr, verbose=True):
    if verbose:
        print('  [] download AN version')
    resp = download(url)
    resp.encoding = 'Windows-1252'
    html = resp.text
    if verbose:
        print('  [] parse AN version')
    # TODO: do both instead of first
    results = anpy_parse(html, url, logfile=log, verbose=verbose)
    if len(results) > 1:
        if url_senat:
            for result in results:
                if result.get('url_dossier_senat') == url_senat:
                    return result
        if verbose:
            print('     WARNING: TOOK FIRST DOSLEG BUT THERE ARE %d OF THEM' % len(results))
    return results[0]
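# Usage sketch: an AN dosleg page can describe several doslegs; passing the
# matching Sénat dossier URL disambiguates instead of silently taking the
# first one (URL illustrative):
#   dos = download_an(an_url, url_senat='https://www.senat.fr/dossier-legislatif/pjl17-000.html')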
def download_open_data_doslegs(legislature):
    files = {
        15: (
            "Dossiers_Legislatifs_XV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/15/loi/dossiers_legislatifs/Dossiers_Legislatifs_XV.json.zip",
        ),
        14: (
            "Dossiers_Legislatifs_XIV.json",
            "http://data.assemblee-nationale.fr/static/openData/repository/14/loi/dossiers_legislatifs/Dossiers_Legislatifs_XIV.json.zip",
        ),
    }
    file, file_url = files[legislature]

    doslegs_resp = download(file_url)
    doslegs_zip = zipfile.ZipFile(io.BytesIO(doslegs_resp.content))

    DATA = json.loads(doslegs_zip.open(file).read().decode("utf-8"))
    return DATA
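# Usage sketch: only legislatures 14 and 15 are mapped above, so any other
# value raises a KeyError; the returned structure mirrors the one built by
# download_open_data_file():
#   DATA = download_open_data_doslegs(15)
#   dossiers = DATA["export"]["dossiersLegislatifs"]["dossier"]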
def parse(url, resp=None, DEBUG=False, include_annexes=False):
    """
    Parse the text of an URL; an already downloaded response can be passed
    as `resp` to avoid an extra network request.
    """
    all_articles = []

    def pr_js(article):
        nonlocal all_articles, texte
        if not len(all_articles):
            add_to_articles(texte, all_articles)
        add_to_articles(article, all_articles)

    if url.endswith('.pdf'):
        print("WARNING: text url is a pdf: %s skipping it..." % url)
        return all_articles
    if 'assemblee-nat.fr' in url:
        print("WARNING: url corresponds to old AN website: %s skipping it..." % url)
        return all_articles

    if url.startswith('http'):
        resp = download(url) if resp is None else resp
        if '/textes/' in url:
            resp.encoding = 'utf-8'
        if 'assemblee-nationale.fr' in url:
            resp.encoding = 'Windows-1252'
        string = resp.text
    elif url == '-':
        string = sys.stdin.read()
    else:
        string = open(url).read()

    string, has_multiple_expose = clean_extra_expose_des_motifs(string)

    if 'legifrance.gouv.fr' in url:
        for reg, res in clean_legifrance_regexps:
            string = reg.sub(res, string)
    else:
        for reg, res in clean_texte_regexps:
            string = reg.sub(res, string)

    # fix weird Sénat formatting with single-cell tables around pieces of text, sometimes multiline...
    # ex: https://www.senat.fr/leg/ppl15-246.html
    for match in re.findall(r'(<table[^>]*>\s*(?:<t(?:body|r|d|h)[^>]*>\s*)+)(.*?)((?:\s*</t(?:body|r|d|h)[^>]*>)+\s*</table>)', string, re.I):
        if not re.search(r'<t(r|d|h)[^>]*>', match[1], re.I):
            string = string.replace(''.join(match), match[1])

    srclst = []
    source_avenants = False
    m = re.search(r"NB(\s|&nbsp;)+:(\s|&nbsp;)+[lL]es? textes? d(u |es |e la |e l’)((convention|traité|avenant)s? et de(s| l’))?(accord|convention)s?(-cadres?)? figuren?t? (respectivement )?en annexe aux (deux |trois )?projets de loi \(n",
                  re.sub(r'</?span[^>]*>', '', string), re.I)
    if m:
        try:
            srclst = [int(s.strip('no ')) for s in (
                string.replace('<sup>', '').replace('</sup>', '').replace('&nbsp;', ' ')
                      .replace('aux deux projets', 'aux projets').replace('aux trois projets', 'aux projets')
                      .replace('°', 'o').replace('nos ', 'no ').replace('ns ', 'no ').replace('(n ', '(no ')
                      .split(' en annexe aux projets de loi (no ')[1]
                      .strip()
                      .split(')')[0]
                      .strip()
                      .replace(' et ', ', ')
                      .split(', '))]
            source_avenants = True
        except Exception as e:
            if DEBUG:
                print("WARNING, multi-reports detected with NB method crashing (%s: %s), trying regular method..."
                      % (type(e), e))

    if not source_avenants and "/rapports/r" in url and "TEXTES ADOPTÉS PAR LA COMMISSION" in string and string.count(">Article unique<") == 2:
        m = re.search(r'<i>Assemblée nationale : </i><b>(\d+) </b>et<b> (\d+)</b>', string)
        if m:
            srclst = [int(m.group(1)), int(m.group(2))]
            source_avenants = True

    definitif = re_definitif.search(string) is not None or re_definitif_new_format.search(string) is not None or 'legifrance.gouv.fr' in url

    soup = BeautifulSoup(string, "html5lib")
    texte = {"type": "texte", "source": url, "definitif": definitif}

    # Generate Senat or AN ID from URL
    if url.startswith('http'):
        if "legifrance.gouv.fr" in url:
            m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
            if m:
                texte["id"] = m.group(1)
            elif "/jo/texte" in url:
                texte["id"] = url.split('/')[-3]
        elif re.search(r"assemblee-?nationale", url, re.I):
            m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "A" + m.group(1) + "-"
            if m.group(2) is not None:
                texte["id"] += m.group(2)
            texte["id"] += str(numero)
            texte["nosdeputes_id"] = get_text_id(url)
        else:
            m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})(rec)?\d?(_mono)?\.", url, re.I)
            if m is None:
                m = re.search(r"/(-)?20(\d+)-\d+/(\d+)(_mono)?.html", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "S" + m.group(2) + "-"
            if m.group(1) is not None:
                texte["id"] += m.group(1)
            texte["id"] += "%03d" % numero
            texte["nossenateurs_id"] = get_text_id(url)

    texte["titre"] = clean_html(re_clean_title_legif.sub('', soup.title.string.strip())) if soup.title else ""
    texte["expose"] = ""
    expose = False

    # states 'read' can be set to:
    READ_DISABLED = -1  # the text is not detected yet
    READ_TEXT = 0       # read the text
    READ_TITLE = 1      # titles lecture
    READ_ALINEAS = 2    # alineas lecture
    read = READ_TEXT
    art_num = ali_num = 0
    article = {}
    indextext = -1
    curtext = -1
    section = {"type": "section", "id": ""}
    rejected_all_articles = []  # we only keep the last detected text by default, here are stored the previous texts

    def should_be_parsed(x):
        """returns True if x can contain useful information"""
        if x.name not in ('p', 'table', 'h1', 'h2', 'h4'):
            return False
        # hack: we don't want to parse the table containing the conclusion from the senat
        # ex: https://www.senat.fr/leg/tas12-040.html
        if x.name == "table" and re.search("SESSION (EXTRA)?ORDINAIRE DE", str(x)):
            return False
        # hack: senate can copy-paste the /textes/ output from the AN
        # ex: https://www.senat.fr/leg/ppl17-545.html
        # TODO: they also mess up the encoding by doing that
        if x.name == "table" and re.search("<!-- Col de controle de taille -->", str(x)):
            return False
        return True

    def should_be_ignored(x):
        if hasattr(x, 'attrs') and 'display: none' in x.attrs.get('style', ''):
            return True
        return False

    for text in non_recursive_find_all(soup, should_be_parsed, should_be_ignored):
        line = clean_html(str(text))

        if DEBUG:
            print(read, article.get('titre') or art_num, ali_num, line, file=sys.stderr)

        # limit h2/h4 matches to PPL headers or Article unique
        if text.name not in ('p', 'table') and not re_mat_ppl.match(line) and not re_mat_tco.match(line) and 'Article unique' not in line:
            if DEBUG:
                print(" -> IGNORING LINE", file=sys.stderr)
            continue

        if re_stars.match(line):
            continue
        if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
            read = READ_DISABLED
        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0
        srcl = re_src_mult.search(line)
        if not source_avenants and srcl and read in (READ_DISABLED, READ_TEXT):
            srclst.append(int(srcl.group(1)))
            continue

        cl_line = re_cl_html.sub("", line).strip()
        if re_rap_mult.match(line):
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line) or (read == READ_DISABLED and line == "<b>Article 1er</b>"):
            read = READ_TEXT
            if len(all_articles):
                if DEBUG:
                    print('WARNING: Found articles before the real text')
                if article is not None:
                    pr_js(article)
                rejected_all_articles.append(all_articles)
                all_articles = []
                article = {}
                art_num = 0
        elif re_mat_exp.match(line):
            read = READ_DISABLED  # Deactivate description lecture
            expose = True
        elif read == READ_TEXT and definitif_before_congres in line or definitif_after_congres in line:
            texte['definitif'] = True
            if all_articles:
                all_articles[0]['definitif'] = True
            continue
        elif (re_echec_cmp.search(cl_line)
              or re_echec_com.search(cl_line)
              or re_echec_com2.search(cl_line)
              or re_echec_com3.search(cl_line)
              or re_echec_com4.search(cl_line)
              or re_echec_com5.search(cl_line)
              or re_echec_com6.search(cl_line)
              or re_echec_hemi.match(cl_line)
              or re_echec_hemi2.search(cl_line)
              or re_echec_hemi3.search(cl_line)
              ) and 'dont la teneur suit' not in cl_line:
            pr_js({"type": "echec", "texte": cl_line})
            break
        elif read == READ_DISABLED:
            continue
        # or (indextext != -1 and curtext != indextext):
        # keep all texts resulting from multireport now, it's selected then in complete

        # crazy edge case: "(Conforme)Article 24 bis A (nouveau)" on one line
        # http://www.assemblee-nationale.fr/13/projets/pl3324.asp
        # simplified, just do the "(Conforme)" case
        if '<i>(Conforme)</i>' in line and re_mat_art.search(line):
            article["statut"] = 'conforme'
            line = line.replace('<i>(Conforme)</i>', '')
            cl_line = cl_line.replace('(Conforme)', '')

        # another crazy edge case: the text is inside the annexe
        # ex: http://www.assemblee-nationale.fr/13/rapports/r2083.asp
        # TODO: could detect via "le présent projet de loi dans le texte figurant en annexe"
        # like the source_avenants logic
        if read != READ_ALINEAS and re_approb.match(line):
            art_num += 1
            article = {
                "type": "article",
                "order": art_num,
                "alineas": {},
                "statut": "none",
                "titre": "1er"
            }
            read = READ_ALINEAS

        # Identify section zones
        line = normalize_section_title(line, text, has_multiple_expose)
        m = re_mat_sec.match(line)
        if m:
            read = READ_TITLE  # Activate titles lecture
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"
            if re.search(re_préliminaire, line) or " LIMINAIRE" in line.upper():
                section_num = "L"
            else:
                section_num = re_cl_html.sub('', m.group(5).strip())
                if word_to_number(section_num) is not None:
                    section_num = word_to_number(section_num)
                section_num = normalize_1(section_num, '1')
                section_num = re_clean_bister.sub(lambda m: m.group(1) + " " + real_lower(m.group(2)), section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest:
                        section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(r"" + section_typ + r"[\dL].*$", "", section["id"])
            section["id"] = section_par + section_typ + str(section_num)
            # check_section_is_not_a_duplicate(section["id"])
            titre = blank_none(m.group('titre')).strip()
            if titre:
                section['titre'] = titre
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT
        elif re_mat_end.match(line) and not include_annexes:
            if not expose:
                break
            expose = False
            continue
        # Annexes.
        elif read == READ_ALINEAS and re_mat_ann.match(line):
            if include_annexes:
                if article is not None:
                    pr_js(article)
                titre = re_cl_html.sub("", re_mat_ann.sub("", line))
                art_num += 1
                article = {
                    "type": "annexe",
                    "order": art_num,
                    "alineas": {},
                    "statut": "none",
                    "titre": titre
                }
                ali_num = 0
            else:
                break
        # Identify titles and new article zones
        elif (re.match(r"(<i>)?<b>", line) or re_art_uni.match(cl_line) or re.match(r"^Articles? ", line)) and not re.search(r">Articles? supprimé", line):
            line = cl_line.strip()
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    pr_js(article)
                read = READ_ALINEAS  # Activate alineas lecture
                expose = False
                art_num += 1
                ali_num = 0
                article = {"type": "article", "order": art_num, "alineas": {}, "statut": "none"}
                if srclst:
                    article["source_text"] = srclst[curtext]
                m = re_mat_art.match(clean_article_name(text))
                article["titre"] = normalize_1(m.group(1), "1er")
                assert article["titre"]  # avoid empty titles
                assert not texte['definitif'] or ' bis' not in article["titre"]  # detect invalid article names
                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == READ_TITLE and line:
                section["titre"] = lower_but_first(line)
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT

        # detect dots, used as hints for later completion
        if read != READ_DISABLED:
            if re_mat_dots.match(line):
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js({"type": "dots"})
                read = READ_TEXT
                # ignore alineas after the dots
                continue

        # Read articles with alineas
        if read == READ_ALINEAS and not m:
            line = re_clean_coord.sub('', line)
            # if the line was only "Pour coordination", ignore it
            if not line:
                continue
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip()).strip()
                continue
            if "<table>" in line:
                cl_line = cl_html_except_tables(line)
                line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "") + m.group(3) + "er", line)
            line = re_clean_bister.sub(lambda m: m.group(1) + " " + real_lower(m.group(2)), line)
            # Clean different versions of same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub(r'\1(Non modifié)', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)
            tmp = line
            line = re_clean_punc_space.sub(r'\1 \2', tmp)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1)) + x.group(4) if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
        else:
            # metas
            continue

    # sometimes we find multiple text starts inside one (mainly due to annotations),
    # by default we normally keep only the latest one, but if it is empty,
    # try to find a good one from previously parsed articles
    # ex: http://www.assemblee-nationale.fr/15/propositions/pion0965.asp
    if not all_articles:
        for rejected in rejected_all_articles:
            articles_parsed = [art for art in rejected if art.get('type') == 'article']
            if len(articles_parsed):
                print('WARNING: retrieving parsed text from a previously rejected text')
                all_articles = rejected
                break

    if article is not None:
        pr_js(article)

    if indextext != -1 and curtext + 1 != len(srclst):
        print("WARNING: multiple texts announced but %d/%d found %s" % (curtext + 1, len(srclst), srclst), indextext)

    return all_articles
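# Shape of the list parse() returns (illustrative values): the first element
# is always the "texte" dict, followed by "section"/"article" entries, and
# possibly "echec" or "dots" markers:
#   [{'type': 'texte', 'source': url, 'definitif': False, 'titre': '...', ...},
#    {'type': 'section', 'id': 'T1', 'type_section': 'titre', ...},
#    {'type': 'article', 'order': 1, 'titre': '1er', 'statut': 'none',
#     'alineas': {'001': 'Premier alinéa...'}}]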
from tlfp.tools._step_logic import get_previous_step, use_old_procedure


def article_to_markdown(art):
    texte = ""
    for key in sorted(art["alineas"].keys()):
        if art["alineas"][key] != "":
            texte += art["alineas"][key] + "\n\n"
    return texte


def amendement_to_markdown(texte):
    return html2markdown.convert(texte)


dossiers = download("https://www.lafabriquedelaloi.fr/api/dossiers.csv")
for csv_dos in csv.DictReader(dossiers.text.splitlines(), delimiter=";"):
    law = csv_dos['id']
    dos = download(f"https://www.lafabriquedelaloi.fr/api/{law}/viz/procedure.json").json()
    for step_index, step in enumerate(dos["steps"]):
        if step.get("nb_amendements", 0) > 0:
            amendements = download(
                f"https://www.lafabriquedelaloi.fr/api/{law}/viz/amendements_{step['directory']}.json"
            ).json()
            try:
                texte = download(
                    f"https://www.lafabriquedelaloi.fr/api/{law}/procedure/{step['directory']}/texte/texte.json"
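# Quick illustration of article_to_markdown() on the article structure
# produced by parse() (values illustrative): empty alineas are skipped and
# each alinea becomes its own paragraph.
#   article_to_markdown({'alineas': {'001': 'Premier alinéa.', '002': ''}})
#   # -> 'Premier alinéa.\n\n'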
def process(OUTPUT_DIR, procedure):
    context = Context([0, OUTPUT_DIR], load_parls=True)

    # ['Indéfini', 'Adopté', 'Irrecevable', 'Rejeté', 'Retiré', 'Tombe', 'Non soutenu',
    #  'Retiré avant séance', 'Rectifié', 'Favorable', 'Satisfait']
    def simplify_sort(sort):
        sort = sort.lower()
        if sort in "adopté favorable":
            return "adopté"
        if sort in "rejeté ":
            return "rejeté"
        if sort in "indéfini":
            return "en attente"
        return "non-voté"

    re_clean_first = re.compile(r'^(.*?)(,| et) .*$')

    def first_author(signataires):
        if signataires is None or "gouvernement" in signataires.lower():
            return ""
        return re_clean_first.sub(r'\1, …', signataires)

    def find_groupe(amd):
        if amd['signataires'] and "gouvernement" in amd['signataires'].lower():
            return "Gouvernement"
        ct = {}
        maxc = 0
        result = ""
        for gpe in amd['groupes_parlementaires']:
            g = slug_groupe(gpe['groupe'])
            if g not in ct:
                ct[g] = 0
            ct[g] += 1
            if ct[g] > maxc:
                maxc = ct[g]
                result = g
        return result

    def add_link(links, pA, pB, weight=1):
        p1 = min(pA, pB)
        p2 = max(pA, pB)
        linkid = "%s-%s" % (p1, p2)
        if linkid not in links:
            links[linkid] = {
                "1": p1,
                "2": p2,
                "w": 0
            }
        links[linkid]["w"] += weight

    article_number_regexp = re.compile(r'article (1er.*|(\d+).*)$', re.I)

    def sort_amendements(texte, amendements):
        articles = {}
        for article in texte:
            if article['type'] == 'article':
                titre = article.get('titre')
                if titre:
                    articles[titre.lower()] = article.get('order') * 10

        def solveorder(art):
            nonlocal articles
            art = art.lower()
            order = 10000
            if art == 'titre' or art.startswith('intitul'):
                return 0
            elif art.startswith('motion'):
                return 1
            elif art.startswith('projet') \
                    or art.startswith('proposition') \
                    or art.startswith('texte'):
                return 5
            else:
                m = article_number_regexp.search(art)
                if m:
                    if articles.get(m.group(1)):
                        order = articles.get(m.group(1))
                    elif articles.get(m.group(2)):
                        order = articles.get(m.group(2))
                    if 'avant' in art:
                        order -= 1
                    elif 'après' in art or 'apres' in art:
                        order += 1
            return order

        for amendement in amendements:
            amdt = amendement['amendement']
            amdt['ordre_article'] = solveorder(amdt['sujet'])

        return amendements

    CACHE_BUSTING = 'cache=%d' % time()
    if 'url_jo' in procedure:
        CACHE_BUSTING = 'cache=5feb2018'  # fixed cache busting for promulgated laws

    last_text_id, last_text_typeparl = None, None
    steps = procedure['steps']
    for i, step in enumerate(steps):
        print(' * step -', step.get('stage'), step.get('step'), step.get('source_url'))

        if step.get('step') not in ('commission', 'hemicycle'):
            continue
        if step.get('step') == 'commission' and step.get('stage') == 'CMP':
            continue
        if i == 0:
            continue

        last_step_index = get_previous_step(steps, i, is_old_procedure=procedure.get('use_old_procedure'))
        last_step = steps[last_step_index]
        last_step_with_good_text_number = steps[get_previous_step(steps, i,
                                                is_old_procedure=procedure.get('use_old_procedure'),
                                                get_depot_step=True)]
        texte_url = last_step_with_good_text_number.get('source_url')

        if step.get('stage') != 'CMP' and last_step_with_good_text_number.get('institution') != step.get('institution'):
            print('ERROR - last step is from another institution', file=sys.stderr)
            continue

        # for a CMP hemicycle we have to get the right text inside the CMP commission
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            urls = [last_step.get('source_url')]
            if 'cmp_commission_other_url' in last_step:
                urls.append(last_step.get('cmp_commission_other_url'))
            an_url = [url for url in urls if 'nationale.fr' in url]
            senat_url = [url for url in urls if 'senat.fr' in url]
            if step.get('institution') == 'assemblee' and an_url:
                texte_url = an_url[0]
            elif step.get('institution') == 'senat' and senat_url:
                texte_url = senat_url[0]
            else:
                print('WARNING - missing the CMP commission text for', step.get('source_url'), file=sys.stderr)
                continue

        if texte_url is None:
            print('ERROR - no texte url', step.get('source_url'), file=sys.stderr)
            continue

        texte = open_json(os.path.join(context.sourcedir, 'procedure', last_step['directory']), 'texte/texte.json')

        amdt_url = None
        if "nationale.fr" in texte_url:
            if 'assemblee_legislature' not in procedure:
                print('  + no AN legislature - pass text')
                continue
            amdt_url = 'https://nosdeputes.fr/%s/amendements/%s/json?%s' % (procedure.get('assemblee_legislature'), get_text_id(texte_url), CACHE_BUSTING)
        elif "senat.fr" in texte_url:
            amdt_url = 'https://nossenateurs.fr/amendements/%s/json?%s' % (get_text_id(texte_url), CACHE_BUSTING)

        if amdt_url is None:
            continue

        print(' * downloading amendments:', amdt_url, 'for', texte_url)

        amendements_src = download(amdt_url).json().get('amendements', [])

        # TA texts can be zero-padded or not (TA0XXX or TAXXX), we try both
        if 'amendements/TA' in amdt_url:
            textid = get_text_id(texte_url)
            if 'TA0' in textid:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').lstrip('0'))
            else:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').zfill(4))
            print(' WARNING: TA - trying alternative url too', alternative_url)
            amendements_src += download(alternative_url).json().get('amendements', [])

        print(' parsing amendments:', len(amendements_src))

        # ignore amendments if they are not for the correct step
        amendements_src_filtered = []
        for amd in amendements_src:
            a = amd['amendement']
            if step.get('institution') == 'assemblee':
                # commission amendments can have two forms
                # - /amendements/LOI/NUM.asp (13th legislature)
                # - /amendements/LOI/COMMISSION_NAME/NUM.asp (14+ legislature)
                # hemicycle amendments are:
                # - /amendements/LOI/NUM.asp (13th legislature)
                # - /amendements/LOI/AN/NUM.asp (14+ legislature)
                amdt_step = 'hemicycle'
                if '/cr-' in a['source']:
                    amdt_step = 'commission'
                else:
                    url_parts = a['source'].split('amendements/')[1].split('/')
                    if len(url_parts) == 3 and url_parts[1] != 'AN':
                        amdt_step = 'commission'
            elif step.get('institution') == 'senat':
                amdt_step = 'commission' if '/commissions/' in a['source'] else 'hemicycle'
            else:
                # CMP - there's no way for now to distinguish the step
                amdt_step = step['step']

            if step['step'] != amdt_step:
                continue
            amendements_src_filtered.append(amd)

        if len(amendements_src_filtered) != len(amendements_src):
            print('WARNING: amendments ignored (not the right step) %s' % (len(amendements_src) - len(amendements_src_filtered)), file=sys.stderr)
        amendements_src = amendements_src_filtered

        step['nb_amendements'] = len(amendements_src)

        if len(amendements_src) > 0:
            amendements_src = sort_amendements(texte['articles'], amendements_src)

            typeparl, urlapi = identify_room(texte_url,
                legislature=step.get('assemblee_legislature', procedure.get('assemblee_legislature')))

            sujets = {}
            groupes = {}

            fix_order = False
            orders = []
            parls = {}
            links = {}
            idents = {}
            for amd in amendements_src:
                a = amd['amendement']
                if "sort" not in a:
                    print('WARNING: amendment has no sort %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if a["sort"] == "Rectifié":
                    continue
                if "sujet" not in a or not a["sujet"]:
                    if a["sort"] not in ["Irrecevable", "Retiré avant séance"]:
                        print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                key = a['sujet']
                if not key:
                    print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if key not in sujets:
                    orders.append(key)
                    sujets[key] = {
                        'titre': key,
                        'order': a['ordre_article'],
                        'amendements': []
                    }
                    if a['ordre_article'] > 9000:
                        fix_order = True

                gpe = find_groupe(a)
                if not gpe:
                    if a["sort"] != "Irrecevable":
                        sys.stderr.write('WARNING: no groupe found for %s\n' % a['url_nos%ss' % typeparl])
                    gpe = "Inconnu"
                context.add_groupe(groupes, gpe, urlapi)

                sujets[key]['amendements'].append({
                    'numero': a['numero'],
                    'date': a['date'],
                    'sort': simplify_sort(a['sort']),
                    'groupe': gpe,
                    'id_api': a['id'],
                    'aut': first_author(a['signataires'])
                })

                cosign = []
                hmd5 = a["cle_unicite"]
                if hmd5 not in idents:
                    idents[hmd5] = []
                for parll in a["parlementaires"]:
                    parl = parll["parlementaire"]
                    if parl not in parls:
                        p = context.get_parlementaire(urlapi, parl)
                        parls[parl] = {
                            "i": p["id"],
                            "s": parl,
                            "a": 0,
                            "n": p["nom"],
                            "g": p["groupe_sigle"],
                            "p": p["place_en_hemicycle"]
                        }
                    pid = parls[parl]["i"]
                    parls[parl]["a"] += 1
                    for cid in cosign:
                        add_link(links, pid, cid)
                        # add_link(links, pid, cid, 2)
                    cosign.append(pid)
                    for cid in idents[hmd5]:
                        add_link(links, pid, cid)
                    idents[hmd5].append(pid)

            if fix_order:
                orders.sort(key=cmp_to_key(compare_articles))
                for i, k in enumerate(orders):
                    sujets[k]["order"] = i

            amdtsfile = os.path.join(context.sourcedir, 'viz', 'amendements_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'api_root_url': amdapi_link(urlapi),
                    'groupes': groupes,
                    'sujets': sujets}
            print_json(data, amdtsfile)

            linksfile = os.path.join(context.sourcedir, 'viz', 'amendements_links_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'links': list(links.values()),
                    'parlementaires': dict((p["i"], dict((k, p[k]) for k in "psang")) for p in list(parls.values()))}
            # print_json(data, linksfile)

        ########### INTERVENTIONS #############
        # TODO: move this to a dedicated file
        print(' * downloading interventions')
        typeparl, urlapi = identify_room(texte_url,
            legislature=step.get('assemblee_legislature', procedure.get('assemblee_legislature')))
        inter_dir = os.path.join(context.sourcedir, 'procedure', step['directory'], 'interventions')
        commission_or_hemicycle = '?commission=1' if step.get('step') == 'commission' else '?hemicycle=1'
        # TODO: TA texts can be zero-padded or not (TA0XXX or TAXXX), we should try both
        seance_name = None
        intervention_files = []

        texts = (get_text_id(texte_url),)
        if last_text_typeparl == typeparl:
            texts = (get_text_id(texte_url), last_text_id)

        for loiid in texts:
            url_seances = 'https://{}.fr/seances/{}/json{}'.format(urlapi, loiid, commission_or_hemicycle)
            print(' * downloading seances - ', url_seances)
            for id_seance_obj in sorted(download(url_seances).json().get('seances', []), key=lambda x: x["seance"]):
                url_seance = 'https://{}.fr/seance/{}/{}/json'.format(urlapi, id_seance_obj['seance'], loiid)
                print(' downloading seance - ', url_seance)
                resp = download(url_seance).json()
                if resp.get('seance'):
                    inter = resp.get('seance')[0]['intervention']
                    seance_name = inter['date'] + 'T' + inter['heure'] + '_' + inter['seance_id']
                    print(' dumping seance -', seance_name)
                    intervention_files.append(seance_name)
                    if not os.path.exists(inter_dir):
                        os.makedirs(inter_dir)
                    print_json(resp, os.path.join(inter_dir, seance_name + '.json'))
            if seance_name:
                step['has_interventions'] = True
                step['intervention_files'] = intervention_files
                break

        last_text_id = get_text_id(texte_url)
        last_text_typeparl = typeparl

    return procedure
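# Illustration of the co-signature graph built with add_link() above
# (illustrative parliamentarian ids): link keys are "min-max" so the pair is
# undirected, and the weight counts repeated co-signatures.
#   links = {}
#   add_link(links, 12, 7)
#   add_link(links, 7, 12)
#   # links == {'7-12': {'1': 7, '2': 12, 'w': 2}}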
def process(OUTPUT_DIR, procedure):
    context = Context(OUTPUT_DIR, load_parls=True)

    # ['Indéfini', 'Adopté', 'Irrecevable', 'Rejeté', 'Retiré', 'Tombe', 'Non soutenu',
    #  'Retiré avant séance', 'Rectifié', 'Favorable', 'Satisfait']
    def simplify_sort(sort):
        sort = sort.lower()
        if sort in "adopté favorable":
            return "adopté"
        if sort in "rejeté ":
            return "rejeté"
        if sort in "indéfini":
            return "en attente"
        return "non-voté"

    re_clean_first = re.compile(r'^(.*?)(,| et) .*$')

    def first_author(signataires):
        if signataires is None or "gouvernement" in signataires.lower():
            return ""
        return re_clean_first.sub(r'\1, …', signataires)

    def find_groupe(amd, typeparl, urlapi):
        if amd['signataires'] and "gouvernement" in amd['signataires'].lower():
            return "Gouvernement"
        # Fix groupes not historicized in NosSénateurs
        if typeparl == "senateur" and amd["parlementaires"]:
            return context.get_senateur_groupe(amd["parlementaires"][0]["parlementaire"], amd["date"], urlapi)
        return amd['auteur_groupe_acronyme']

    def add_link(links, pA, pB, weight=1):
        p1 = min(pA, pB)
        p2 = max(pA, pB)
        linkid = "%s-%s" % (p1, p2)
        if linkid not in links:
            links[linkid] = {
                "1": p1,
                "2": p2,
                "w": 0
            }
        links[linkid]["w"] += weight

    article_number_regexp = re.compile(r'article (1er.*|(\d+).*)$', re.I)

    def sort_amendements(texte, amendements):
        articles = {}
        for article in texte:
            if article['type'] == 'article':
                titre = article.get('titre')
                if titre:
                    articles[titre.lower()] = article.get('order') * 10

        def solveorder(art):
            nonlocal articles
            art = art.lower()
            order = 10000
            if art == 'titre' or art.startswith('intitul'):
                return 0
            elif art.startswith('motion'):
                return 1
            elif art.startswith('projet') \
                    or art.startswith('proposition') \
                    or art.startswith('texte'):
                return 5
            else:
                m = article_number_regexp.search(art)
                if m:
                    if articles.get(m.group(1)):
                        order = articles.get(m.group(1))
                    elif articles.get(m.group(2)):
                        order = articles.get(m.group(2))
                    if 'avant' in art:
                        order -= 1
                    elif 'après' in art or 'apres' in art:
                        order += 1
            return order

        for amendement in amendements:
            amdt = amendement['amendement']
            amdt['ordre_article'] = solveorder(amdt['sujet'])

        return amendements

    CACHE_BUSTING = 'cache=%d' % time()
    if 'url_jo' in procedure:
        CACHE_BUSTING = 'cache=lfdll-prod'  # fixed cache busting for promulgated laws

    last_text_id, last_text_typeparl = None, None
    steps = procedure['steps']
    for i, step in enumerate(steps):
        print(' * step -', step.get('stage'), step.get('step'), step.get('source_url'))

        if step.get('step') not in ('commission', 'hemicycle'):
            continue
        if step.get('step') == 'commission' and step.get('stage') == 'CMP':
            continue
        if i == 0:
            continue

        last_step_index = get_previous_step(steps, i, is_old_procedure=procedure.get('use_old_procedure'))
        last_step = steps[last_step_index]
        last_step_with_good_text_number = steps[get_previous_step(steps, i,
                                                is_old_procedure=procedure.get('use_old_procedure'),
                                                get_depot_step=True)]
        texte_url = last_step_with_good_text_number.get('source_url')

        if last_step.get('in_discussion'):
            print('WARNING: ignoring future steps further than current discussion', file=sys.stderr)
            break

        if step.get('stage') != 'CMP' and last_step_with_good_text_number.get('institution') != step.get('institution'):
            print('ERROR - last step is from another institution', file=sys.stderr)
            continue

        # for a CMP hemicycle we have to get the right text inside the CMP commission
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            urls = [last_step.get('source_url')]
            if 'cmp_commission_other_url' in last_step:
                urls.append(last_step.get('cmp_commission_other_url'))
            an_url = [url for url in urls if 'nationale.fr' in url]
            senat_url = [url for url in urls if 'senat.fr' in url]
            if step.get('institution') == 'assemblee' and an_url:
                texte_url = an_url[0]
            elif step.get('institution') == 'senat' and senat_url:
                texte_url = senat_url[0]
            else:
                print('WARNING - missing the CMP commission text for', step.get('source_url'), file=sys.stderr)
                continue

        if texte_url is None:
            print('ERROR - no texte url', step.get('source_url'), file=sys.stderr)
            continue

        legislature = None
        if 'assemblee-nationale.fr' in texte_url:
            legislature = national_assembly_text_legislature(texte_url)

        texte = open_json(os.path.join(context.sourcedir, 'procedure', last_step['directory']), 'texte/texte.json')

        typeparl, urlapi = identify_room(texte_url, legislature)

        amdt_url = None
        if "nationale.fr" in texte_url:
            if 'assemblee_legislature' not in procedure:
                print('  + no AN legislature - pass text')
                continue
            amdt_url = 'https://%s.fr/%s/amendements/%s/json?%s' % (urlapi, legislature, get_text_id(texte_url), CACHE_BUSTING)
        elif "senat.fr" in texte_url:
            amdt_url = 'https://%s.fr/amendements/%s/json?%s' % (urlapi, get_text_id(texte_url), CACHE_BUSTING)

        if amdt_url is None:
            continue

        print(' * downloading amendments:', amdt_url, 'for', texte_url)

        try:
            amendements_src = download(amdt_url).json().get('amendements', [])
        except Exception:
            raise Exception("ERROR: amendements JSON at %s is badly formatted, it should probably be hardcached on ND/NS" % amdt_url)

        # TA texts can be zero-padded or not (TA0XXX or TAXXX), we try both
        if 'amendements/TA' in amdt_url:
            textid = get_text_id(texte_url)
            if 'TA0' in textid:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').lstrip('0'))
            else:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').zfill(4))
            print(' WARNING: TA - trying alternative url too', alternative_url)
            try:
                amendements_src += download(alternative_url).json().get('amendements', [])
            except Exception:
                raise Exception("ERROR: amendements JSON at %s is badly formatted, it should probably be hardcached on ND/NS" % alternative_url)

        print(' parsing amendments:', len(amendements_src))

        # ignore amendments if they are not for the correct step
        amendements_src_filtered = []
        for amd in amendements_src:
            a = amd['amendement']
            if step.get('institution') == 'assemblee':
                # commission amendments can have two forms
                # - /amendements/LOI/NUM.asp (13th legislature)
                # - /amendements/LOI/COMMISSION_NAME/NUM.asp (14+ legislature)
                # hemicycle amendments are:
                # - /amendements/LOI/NUM.asp (13th legislature)
                # - /amendements/LOI/AN/NUM.asp (14+ legislature)
                amdt_step = 'hemicycle'
                if '/cr-' in a['source']:
                    amdt_step = 'commission'
                else:
                    url_parts = a['source'].split('amendements/')[1].split('/')
                    if len(url_parts) == 3 and url_parts[1] != 'AN':
                        amdt_step = 'commission'
            elif step.get('institution') == 'senat':
                amdt_step = 'commission' if '/commissions/' in a['source'] else 'hemicycle'
            else:
                # CMP - there's no way for now to distinguish the step
                amdt_step = step['step']

            if step['step'] != amdt_step:
                continue
            amendements_src_filtered.append(amd)

        if len(amendements_src_filtered) != len(amendements_src):
            print('WARNING: amendments ignored (not the right step) %s' % (len(amendements_src) - len(amendements_src_filtered)), file=sys.stderr)
        amendements_src = amendements_src_filtered

        step['nb_amendements'] = len(amendements_src)

        if len(amendements_src) > 0:
            amendements_src = sort_amendements(texte['articles'], amendements_src)

            sujets = {}
            groupes = {}

            fix_order = False
            orders = []
            parls = {}
            links = {}
            idents = {}
            for amd in amendements_src:
                a = amd['amendement']
                if "sort" not in a:
                    print('WARNING: amendment has no sort %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if a["sort"] == "Rectifié":
                    continue
                if "sujet" not in a or not a["sujet"]:
                    if a["sort"] not in ["Irrecevable", "Retiré avant séance"]:
                        print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                key = a['sujet']
                if not key:
                    print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if key not in sujets:
                    orders.append(key)
                    sujets[key] = {
                        'titre': key,
                        'order': a['ordre_article'],
                        'amendements': []
                    }
                    if a['ordre_article'] > 9000:
                        fix_order = True

                gpe = find_groupe(a, typeparl, urlapi)
                if not gpe:
                    if a["sort"] != "Irrecevable":
                        sys.stderr.write('WARNING: no groupe found for %s\n' % a['url_nos%ss' % typeparl])
                    gpe = "Inconnu"
                context.add_groupe(groupes, gpe, urlapi)

                sujets[key]['amendements'].append({
                    'numero': a['numero'],
                    'date': a['date'],
                    'sort': simplify_sort(a['sort']),
                    'groupe': gpe,
                    'id_api': a['id'],
                    'aut': first_author(a['signataires'])
                })

                cosign = []
                hmd5 = a["cle_unicite"]
                if hmd5 not in idents:
                    idents[hmd5] = []
                for parll in a["parlementaires"]:
                    parl = parll["parlementaire"]
                    if parl not in parls:
                        p = context.get_parlementaire(urlapi, parl)
                        parls[parl] = {
                            "i": p["id"],
                            "s": parl,
                            "a": 0,
                            "n": p["nom"],
                            "g": p["groupe_sigle"],
                            "p": p["place_en_hemicycle"]
                        }
                    pid = parls[parl]["i"]
                    parls[parl]["a"] += 1
                    for cid in cosign:
                        add_link(links, pid, cid)
                        # add_link(links, pid, cid, 2)
                    cosign.append(pid)
                    for cid in idents[hmd5]:
                        add_link(links, pid, cid)
                    idents[hmd5].append(pid)

            if fix_order:
                orders.sort(key=cmp_to_key(compare_articles))
                for i, k in enumerate(orders):
                    sujets[k]["order"] = i

            amdtsfile = os.path.join(context.sourcedir, 'viz', 'amendements_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'api_root_url': amdapi_link(urlapi),
                    'groupes': groupes,
                    'sujets': sujets}
            print_json(data, amdtsfile)

            linksfile = os.path.join(context.sourcedir, 'viz', 'amendements_links_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'links': list(links.values()),
                    'parlementaires': dict((p["i"], dict((k, p[k]) for k in "psang")) for p in list(parls.values()))}
            # print_json(data, linksfile)

        ########### INTERVENTIONS #############
        # TODO: move this to a dedicated file
        print(' * downloading interventions')
        typeparl, urlapi = identify_room(texte_url, legislature)
        inter_dir = os.path.join(context.sourcedir, 'procedure', step['directory'], 'interventions')
        commission_or_hemicycle = '?commission=1' if step.get('step') == 'commission' else '?hemicycle=1'
        # TODO: TA texts can be zero-padded or not (TA0XXX or TAXXX), we should try both
        seance_name = None
        intervention_files = []

        texts = (get_text_id(texte_url),)
        if last_text_typeparl == typeparl:
            texts = (get_text_id(texte_url), last_text_id)

        for loiid in texts:
            if typeparl == 'depute':
                url_seances = 'https://%s.fr/%s/seances/%s/json%s' % (urlapi, legislature, loiid, commission_or_hemicycle)
            else:
                url_seances = 'https://%s.fr/seances/%s/json%s' % (urlapi, loiid, commission_or_hemicycle)
            print(' * downloading seances - ', url_seances)
            for id_seance_obj in sorted(download(url_seances).json().get('seances', []), key=lambda x: x["seance"]):
                if typeparl == 'depute':
                    url_seance = 'https://%s.fr/%s/seance/%s/%s/json' % (urlapi, legislature, id_seance_obj['seance'], loiid)
                else:
                    url_seance = 'https://%s.fr/seance/%s/%s/json' % (urlapi, id_seance_obj['seance'], loiid)
                print(' downloading seance - ', url_seance)
                resp = download(url_seance).json()
                if resp.get('seance'):
                    inter = resp.get('seance')[0]['intervention']
                    seance_name = inter['date'] + 'T' + inter['heure'] + '_' + inter['seance_id']
                    print(' dumping seance -', seance_name)
                    intervention_files.append(seance_name)
                    if not os.path.exists(inter_dir):
                        os.makedirs(inter_dir)
                    print_json(resp, os.path.join(inter_dir, seance_name + '.json'))
            if seance_name:
                step['has_interventions'] = True
                step['intervention_files'] = intervention_files
                break

        last_text_id = get_text_id(texte_url)
        last_text_typeparl = typeparl

    return procedure
def test_status(url):
    resp = download(url)
    if resp.status_code != 200:
        return False
    return resp
def download_senat(url, log=sys.stderr, verbose=True):
    if verbose:
        print('  [] download SENAT version')
    html = download(url).text
    if verbose:
        print('  [] parse SENAT version')
    return senapy_parse(html, url, logfile=log)
def parse(url, resp=None, DEBUG=False, include_annexes=False): """ parse the text of an url, an already cached to`resp` can be passed to avoid an extra network request """ all_articles = [] def pr_js(article): nonlocal all_articles, texte if not len(all_articles): add_to_articles(texte, all_articles) add_to_articles(article, all_articles) if url.endswith('.pdf'): print("WARNING: text url is a pdf: %s skipping it..." % url) return all_articles if 'assemblee-nat.fr' in url: print("WARNING: url corresponds to old AN website: %s skipping it..." % url) return all_articles if url.startswith('http'): resp = download(url) if resp is None else resp if '/textes/' in url: resp.encoding = 'utf-8' if 'assemblee-nationale.fr' in url: resp.encoding = 'Windows-1252' string = resp.text elif url == '-': string = sys.stdin.read() else: try: string = open(url).read() except: string = open(url, encoding="Windows-1252").read() string, has_multiple_expose = clean_extra_expose_des_motifs(string) if 'legifrance.gouv.fr' in url: for reg, res in clean_legifrance_regexps: string = reg.sub(res, string) else: for reg, res in clean_texte_regexps: string = reg.sub(res, string) #fix weird Sénat formatting with single cells tables around pieces of text sometimes multiline... ex: https://www.senat.fr/leg/ppl15-246.html for match in re.findall( r'(<table[^>]*>\s*(?:<t(?:body|r|d|h)[^>]*>\s*)+)(.*?)((?:\s*</t(?:body|r|d|h)[^>]*>)+\s*</table>)', string, re.I): if not re.search(r'<t(r|d|h)[^>]*>', match[1], re.I): string = string.replace(''.join(match), match[1]) srclst = [] source_avenants = False m = re.search( r"NB(\s| )+:(\s| )+[lL]es? textes? d(u |es |e la |e l’)((convention|traité|avenant)s? et de(s| l’))?(accord|convention)s?(-cadres?)? figuren?t? (respectivement )?en annexe aux (deux |trois )?projets de loi \(n", re.sub(r'</?span[^>]*>', '', string), re.I) if m: try: srclst = [ int(s.strip('no ')) for s in (string.replace('<sup>', '').replace('</sup>', '').replace( ' ', ' ').replace('aux deux projets', 'aux projets'). replace('aux trois projets', 'aux projets').replace( '°', 'o').replace('nos ', 'no ').replace('ns ', 'no '). replace('(n ', '(no ').split( ' en annexe aux projets de loi (no ')[1].strip().split( ')')[0].strip().replace(' et ', ', ').split(', ')) ] source_avenants = True except Exception as e: if DEBUG: print( "WARNING, multi-reports detected with NB method crashing (%s: %s), trying regular method..." 
          % (type(e), e))

    if not source_avenants and "/rapports/r" in url \
            and "TEXTES ADOPTÉS PAR LA COMMISSION" in string \
            and string.count(">Article unique<") == 2:
        m = re.search(r'<i>Assemblée nationale : </i><b>(\d+) </b>et<b> (\d+)</b>', string)
        if m:
            srclst = [int(m.group(1)), int(m.group(2))]
            source_avenants = True

    definitif = (re_definitif.search(string) is not None
                 or re_definitif_new_format.search(string) is not None
                 or 'legifrance.gouv.fr' in url)

    soup = BeautifulSoup(string, "html5lib")
    texte = {"type": "texte", "source": url, "definitif": definitif}

    # Generate Senat or AN ID from URL
    if url.startswith('http'):
        if "legifrance.gouv.fr" in url:
            m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
            if m:
                texte["id"] = m.group(1)
            elif "/jo/texte" in url:
                texte["id"] = url.split('/')[-3]
        elif re.search(r"assemblee-?nationale", url, re.I):
            m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "A" + m.group(1) + "-"
            if m.group(2) is not None:
                texte["id"] += m.group(2)
            texte["id"] += str(numero)
            texte["nosdeputes_id"] = get_text_id(url)
        else:
            m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})(rec)?\d?(_mono)?\.", url, re.I)
            if m is None:
                m = re.search(r"/(-)?20(\d+)-\d+/(\d+)(_mono)?.html", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "S" + m.group(2) + "-"
            if m.group(1) is not None:
                texte["id"] += m.group(1)
            texte["id"] += "%03d" % numero
            texte["nossenateurs_id"] = get_text_id(url)

    texte["titre"] = clean_html(re_clean_title_legif.sub('', soup.title.string.strip())) if soup.title else ""
    texte["expose"] = ""
    expose = False

    # states 'read' can be set to:
    READ_DISABLED = -1  # the text is not detected yet
    READ_TEXT = 0       # reading the text itself
    READ_TITLE = 1      # reading section titles
    READ_ALINEAS = 2    # reading article alineas

    read = READ_TEXT
    art_num = ali_num = 0
    article = {}
    indextext = -1
    curtext = -1
    section = {"type": "section", "id": ""}
    # we only keep the last detected text by default; the previously
    # detected texts are stored here
    rejected_all_articles = []

    def should_be_parsed(x):
        """returns True if x can contain useful information"""
        if x.name not in ('p', 'table', 'h1', 'h2', 'h4'):
            return False
        # hack: we don't want to parse the table containing the conclusion from the senat
        # ex: https://www.senat.fr/leg/tas12-040.html
        if x.name == "table" and re.search("SESSION (EXTRA)?ORDINAIRE DE", str(x)):
            return False
        # hack: senate can copy-paste the /textes/ output from the AN
        # ex: https://www.senat.fr/leg/ppl17-545.html
        # TODO: they also mess up the encoding by doing that
        if x.name == "table" and re.search("<!-- Col de controle de taille -->", str(x)):
            return False
        return True

    def should_be_ignored(x):
        if hasattr(x, 'attrs') and 'display: none' in x.attrs.get('style', ''):
            return True
        return False

    for text in non_recursive_find_all(soup, should_be_parsed, should_be_ignored):
        line = clean_html(str(text))

        if DEBUG:
            print(read, article.get('titre') or art_num, ali_num, line, file=sys.stderr)

        # limit h2/h4 matches to PPL headers or Article unique
        if text.name not in ('p', 'table') and not re_mat_ppl.match(line) \
                and not re_mat_tco.match(line) and 'Article unique' not in line:
            if DEBUG:
                print(" -> IGNORING LINE", file=sys.stderr)
            continue

        if re_stars.match(line):
            continue

        if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
            read = READ_DISABLED

        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0

        srcl = re_src_mult.search(line)
        if not source_avenants and srcl and read in (READ_DISABLED, READ_TEXT):
            srclst.append(int(srcl.group(1)))
            continue

        cl_line = re_cl_html.sub("", line).strip()
        if re_rap_mult.match(line):
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line) \
                or (read == READ_DISABLED and line == "<b>Article 1er</b>"):
            read = READ_TEXT
            if len(all_articles):
                if DEBUG:
                    print('WARNING: Found articles before the real text')
                if article is not None:
                    pr_js(article)
                rejected_all_articles.append(all_articles)
                all_articles = []
                article = {}
                art_num = 0
        elif re_mat_exp.match(line):
            read = READ_DISABLED  # deactivate reading during the exposé des motifs
            expose = True
        elif read == READ_TEXT and (definitif_before_congres in line or definitif_after_congres in line):
            texte['definitif'] = True
            if all_articles:
                all_articles[0]['definitif'] = True
            continue
        elif (re_echec_cmp.search(cl_line)
              or re_echec_com.search(cl_line)
              or re_echec_com2.search(cl_line)
              or re_echec_com3.search(cl_line)
              or re_echec_com4.search(cl_line)
              or re_echec_com5.search(cl_line)
              or re_echec_com6.search(cl_line)
              or re_echec_hemi.match(cl_line)
              or re_echec_hemi2.search(cl_line)
              or re_echec_hemi3.search(cl_line)
              ) and 'dont la teneur suit' not in cl_line:
            pr_js({"type": "echec", "texte": cl_line})
            break
        elif read == READ_DISABLED:
            # note: the old condition "or (indextext != -1 and curtext != indextext)"
            # was dropped here: we now keep all texts resulting from a multi-report
            # and select the right one later, in the completion step
            continue

        # crazy edge case: "(Conforme)Article 24 bis A (nouveau)" on one line
        # http://www.assemblee-nationale.fr/13/projets/pl3324.asp
        # simplified, just do the "(Conforme)" case
        if '<i>(Conforme)</i>' in line and re_mat_art.search(line):
            article["statut"] = 'conforme'
            line = line.replace('<i>(Conforme)</i>', '')
            cl_line = cl_line.replace('(Conforme)', '')

        # another crazy edge case: the text is inside the annexe
        # ex: http://www.assemblee-nationale.fr/13/rapports/r2083.asp
        # TODO: could detect via "le présent projet de loi dans le texte figurant en annexe"
        # like the source_avenants logic
        if read != READ_ALINEAS and re_approb.match(line):
            art_num += 1
            article = {
                "type": "article",
                "order": art_num,
                "alineas": {},
                "statut": "none",
                "titre": "1er",
            }
            read = READ_ALINEAS

        # Identify section zones
        line = normalize_section_title(line, text, has_multiple_expose)
        m = re_mat_sec.match(line)
        if m:
            read = READ_TITLE  # activate titles reading
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"
            if re.search(re_préliminaire, line) or " LIMINAIRE" in line.upper():
                section_num = "L"
            else:
                section_num = re_cl_html.sub('', m.group(5).strip())
                if word_to_number(section_num) is not None:
                    section_num = word_to_number(section_num)
                section_num = normalize_1(section_num, '1')
                section_num = re_clean_bister.sub(lambda m: m.group(1) + " " + real_lower(m.group(2)), section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest:
                        section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(r"" + section_typ + r"[\dL].*$", "", section["id"])
            section["id"] = section_par + section_typ + str(section_num)
            # check_section_is_not_a_duplicate(section["id"])
            titre = blank_none(m.group('titre')).strip()
            if titre:
                section['titre'] = titre
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT
        elif re_mat_end.match(line) and not include_annexes:
            if not expose:
                if DEBUG:
                    print("DEBUG: END OF TEXT DETECTED")
                if len(all_articles) > 0:
                    break
            expose = False
            continue
        # Annexes
        elif read == READ_ALINEAS and re_mat_ann.match(line):
            if include_annexes:
                if article is not None:
                    pr_js(article)
                titre = re_cl_html.sub("", re_mat_ann.sub("", line))
                art_num += 1
                article = {
                    "type": "annexe",
                    "order": art_num,
                    "alineas": {},
                    "statut": "none",
                    "titre": titre,
                }
                ali_num = 0
            else:
                break
        # Identify titles and new article zones
        elif (re.match(r"(<i>)?<b>", line) or re_art_uni.match(cl_line)
                or re.match(r"^Articles? ", line)) \
                and not re.search(r">Articles? supprimé", line):
            line = cl_line.strip()
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    pr_js(article)
                read = READ_ALINEAS  # activate alineas reading
                expose = False
                art_num += 1
                ali_num = 0
                article = {
                    "type": "article",
                    "order": art_num,
                    "alineas": {},
                    "statut": "none",
                }
                if srclst:
                    article["source_text"] = srclst[curtext]
                m = re_mat_art.match(clean_article_name(text))
                article["titre"] = normalize_1(m.group(1), "1er").replace(u"İ", "I")
                assert article["titre"]  # avoid empty titles
                # detect invalid article names
                assert not texte['definitif'] or ' bis' not in article["titre"]
                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == READ_TITLE and line:
                section["titre"] = lower_but_first(line)
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js(section)
                read = READ_TEXT

        # detect dots, used as hints for later completion
        if read != READ_DISABLED:
            if re_mat_dots.match(line):
                if article is not None:
                    pr_js(article)
                    article = {}
                pr_js({"type": "dots"})
                read = READ_TEXT
                # ignore alineas after the dots
                continue

        # Read articles with alineas
        if read == READ_ALINEAS and not m:
            line = re_clean_coord.sub('', line)
            # if the line was only "Pour coordination", ignore it
            if not line:
                continue
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip()).strip()
                continue
            if "<table>" in line:
                cl_line = cl_html_except_tables(line)
            line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(
                r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "") + m.group(3) + "er", line)
            line = re_clean_bister.sub(lambda m: m.group(1) + " " + real_lower(m.group(2)), line)
            # Clean different versions of the same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub(r'\1(Non modifié)', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)
            tmp = line
            line = re_clean_punc_space.sub(r'\1 \2', tmp)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1)) + x.group(4)
                                  if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
        else:
            # metas
            continue

    # sometimes we find multiple text starts inside one (mainly due to annotations);
    # by default we only keep the latest one, but if it is empty,
    # try to find a good one among the previously rejected articles
    # ex: http://www.assemblee-nationale.fr/15/propositions/pion0965.asp
    if not all_articles:
        for rejected in rejected_all_articles:
            articles_parsed = [art for art in rejected if art.get('type') == 'article']
            if len(articles_parsed):
                print('WARNING: retrieving parsed text from a previously rejected text')
                all_articles = rejected
                break

    if article is not None:
        pr_js(article)

    if indextext != -1 and curtext + 1 != len(srclst):
        print("WARNING: multiple texts announced but %d/%d found %s"
              % (curtext + 1, len(srclst), srclst), indextext)

    return all_articles
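# Usage sketch (illustration only, not from the original source): it assumes the
# enclosing function above is the same `parse(url, resp=None)` entry point as the
# older variant further down, and reuses a URL already quoted in the comments.
# It shows the shape of the return value: a flat list of dicts in reading order,
# mixing "section" and "article" entries (plus "annexe", "dots" or "echec" markers).
def _demo_parse():
    import json
    articles = parse("https://www.senat.fr/leg/tas12-040.html")
    print(json.dumps(articles, ensure_ascii=False, indent=2))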
def fetch_csv():
    csv_resp = download("http://data.senat.fr/data/dosleg/dossiers-legislatifs.csv").text
    return list(csv.DictReader(csv_resp.split('\n'), delimiter=';'))
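# Usage sketch (illustration only): fetch_csv() yields one dict per dosleg, keyed
# by the header row of the data.senat.fr CSV. The exact column names are not
# shown in this codebase, so the sketch inspects them rather than hard-coding any.
def _demo_fetch_csv():
    rows = fetch_csv()
    print(len(rows), "doslegs in the Senate CSV")
    print(sorted(rows[0].keys()))  # discover the available columns first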
def download_texte(url):
    text = download(url).text
    return clean_fioritures(clean_br(text))
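# Usage sketch (illustration only): download_texte() returns the page body as
# text after clean_br() and clean_fioritures() have been applied (both helpers
# are defined elsewhere in this codebase); the URL is one already quoted in the
# comments above.
def _demo_download_texte():
    html = download_texte("https://www.senat.fr/leg/ppl17-545.html")
    print(html[:200])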
def parse(url, resp=None):
    """
    Parse the text at `url`; an already downloaded response can be
    passed as `resp` to avoid an extra network request.
    """
    all_articles = []

    def pr_js(article):
        nonlocal all_articles
        add_to_articles(article, all_articles)

    def save_text(txt):
        if "done" not in txt:
            pr_js(txt)
        txt["done"] = True
        return txt

    if url.endswith('.pdf'):
        print("WARNING: text url is a pdf: %s skipping it..." % url)
        return all_articles
    if 'assemblee-nat.fr' in url:
        print("WARNING: url corresponds to old AN website: %s skipping it..." % url)
        return all_articles

    if url.startswith('http'):
        resp = download(url) if resp is None else resp
        if '/textes/' in url:
            resp.encoding = 'utf-8'
        string = resp.text
    else:
        string = open(url).read()

    string, has_multiple_expose = clean_extra_expose_des_motifs(string)

    if 'legifrance.gouv.fr' in url:
        for reg, res in clean_legifrance_regexps:
            string = reg.sub(res, string)
    else:
        for reg, res in clean_texte_regexps:
            string = reg.sub(res, string)

    definitif = re_definitif.search(string) is not None or 'legifrance.gouv.fr' in url
    soup = BeautifulSoup(string, "html5lib")
    texte = {"type": "texte", "source": url, "definitif": definitif}

    # Generate Senat or AN ID from URL
    if url.startswith('http'):
        if "legifrance.gouv.fr" in url:
            m = re.search(r"cidTexte=(JORFTEXT\d+)(\D|$)", url, re.I)
            texte["id"] = m.group(1)
        elif re.search(r"assemblee-?nationale", url, re.I):
            m = re.search(r"/(\d+)/.+/(ta)?[\w\-]*(\d{4})[\.\-]", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "A" + m.group(1) + "-"
            if m.group(2) is not None:
                texte["id"] += m.group(2)
            texte["id"] += str(numero)
            texte["nosdeputes_id"] = get_text_id(url)
        else:
            m = re.search(r"(ta|l)?s?(\d\d)-(\d{1,3})(rec)?\d?(_mono)?\.", url, re.I)
            if m is None:
                m = re.search(r"/(-)?20(\d+)-\d+/(\d+)(_mono)?.html", url, re.I)
            numero = int(m.group(3))
            texte["id"] = "S" + m.group(2) + "-"
            if m.group(1) is not None:
                texte["id"] += m.group(1)
            texte["id"] += "%03d" % numero
            texte["nossenateurs_id"] = get_text_id(url)

    texte["titre"] = re_clean_title_legif.sub('', soup.title.string.strip()) if soup.title else ""
    texte["expose"] = ""
    expose = False

    # 'read' can be set to:
    # -1 : the text is not detected yet
    #  0 : reading the text itself
    #  1 : reading section titles
    #  2 : reading article alineas
    read = art_num = ali_num = 0
    section_id = ""
    article = None
    indextext = -1
    curtext = -1
    srclst = []
    section = {"type": "section", "id": ""}

    def should_be_parsed(x):
        """returns True if x can contain useful information"""
        if x.name not in ('p', 'table', 'h2', 'h4'):
            return False
        # hack: we don't want to parse the table containing the conclusion from the senat
        # ex: https://www.senat.fr/leg/tas12-040.html
        if x.name == "table" and re.search("SESSION (EXTRA)?ORDINAIRE DE", str(x)):
            return False
        return True

    for text in non_recursive_find_all(soup, should_be_parsed):
        line = clean_html(str(text))

        # limit h2/h4 matches to PPL headers or Article unique
        if text.name not in ('p', 'table') and not re_mat_ppl.match(line) and 'Article unique' not in line:
            continue
        if re_stars.match(line):
            continue
        if line == "<b>RAPPORT</b>" or line == "Mesdames, Messieurs,":
            read = -1
        if (srclst or indextext != -1) and re_sep_text.match(line):
            curtext += 1
            art_num = 0
        srcl = re_src_mult.search(line)
        cl_line = re_cl_html.sub("", line).strip()
        if srcl and read < 1:
            srclst.append(int(srcl.group(1)))
            continue
        elif re_rap_mult.match(line):
            line = cl_line
            line = re_clean_mult_1.sub(",", line)
            line = re_clean_mult_2.sub("", line)
            cl_line = re_cl_html.sub("", line).strip()
            for n_t in line.split(','):
                indextext += 1
                if int(n_t) == numero:
                    break
        elif re_mat_ppl.match(line) or re_mat_tco.match(line):
            read = 0
            texte = save_text(texte)
        elif re_mat_exp.match(line):
            read = -1  # deactivate reading during the exposé des motifs
            expose = True
        elif read == 0 and (definitif_before_congres in line or definitif_after_congres in line):
            texte['definitif'] = True
            if all_articles:
                all_articles[0]['definitif'] = True
            continue
        elif (re_echec_cmp.search(cl_line)
              or re_echec_com.search(cl_line)
              or re_echec_com2.search(cl_line)
              or re_echec_com3.search(cl_line)
              or re_echec_com4.search(cl_line)
              or re_echec_com5.search(cl_line)
              or re_echec_com6.search(cl_line)
              or re_echec_hemi.match(cl_line)
              or re_echec_hemi2.search(cl_line)
              or re_echec_hemi3.search(cl_line)
              ) and 'dont la teneur suit' not in cl_line:
            texte = save_text(texte)
            pr_js({"type": "echec", "texte": cl_line})
            break
        elif read == -1 or (indextext != -1 and curtext != indextext):
            continue

        # crazy edge case: "(Conforme)Article 24 bis A (nouveau)" on one line
        # http://www.assemblee-nationale.fr/13/projets/pl3324.asp
        # simplified, just do the "(Conforme)" case
        if '<i>(Conforme)</i>' in line and re_mat_art.search(line):
            article["statut"] = 'conforme'
            line = line.replace('<i>(Conforme)</i>', '')
            cl_line = cl_line.replace('(Conforme)', '')

        # Identify section zones
        line = normalize_section_title(line, text, has_multiple_expose)
        m = re_mat_sec.match(line)
        if m:
            read = 1  # activate titles reading
            section["type_section"] = real_lower(m.group(1))
            section_typ = m.group(1).upper()[0]
            if m.group(3) is not None:
                section_typ += "S"
            if " LIMINAIRE" in line:
                section_num = "L"
            else:
                section_num = re_cl_html.sub('', m.group(5).strip())
                if word_to_number(section_num) is not None:
                    section_num = word_to_number(section_num)
                section_num = normalize_1(section_num, '1')
                section_num = re_clean_bister.sub(lambda m: m.group(1) + " " + real_lower(m.group(2)), section_num)
                section_num = re_mat_new.sub('', section_num).strip()
                m2 = re_mat_romans.match(section_num)
                if m2:
                    rest = section_num.replace(m2.group(0), '')
                    section_num = romans(m2.group(0))
                    if rest:
                        section_num = str(section_num) + rest
            # Get parent section id to build current section id
            section_par = re.sub(r"" + section_typ + r"[\dL].*$", "", section["id"])
            section["id"] = section_par + section_typ + str(section_num)
            # check_section_is_not_a_duplicate(section["id"])
            titre = blank_none(m.group('titre')).strip()
            if titre:
                texte = save_text(texte)
                section['titre'] = titre
                if article is not None:
                    pr_js(article)
                    article = None
                pr_js(section)
                read = 0
        # Identify titles and new article zones
        elif (not expose and re_mat_end.match(line)) or (read == 2 and re_mat_ann.match(line)):
            break
        elif (re.match(r"(<i>)?<b>", line) or re_art_uni.match(cl_line)
                or re.match(r"^Articles? ", line)) \
                and not re.search(r">Articles? supprimé", line):
            line = cl_line.strip()
            # Read a new article
            if re_mat_art.match(line):
                if article is not None:
                    texte = save_text(texte)
                    pr_js(article)
                read = 2  # activate alineas reading
                expose = False
                art_num += 1
                ali_num = 0
                article = {"type": "article", "order": art_num, "alineas": {}, "statut": "none"}
                if srclst:
                    article["source_text"] = srclst[curtext]
                m = re_mat_art.match(clean_article_name(text))
                article["titre"] = normalize_1(m.group(1), "1er")
                assert article["titre"]  # avoid empty titles
                # detect invalid article names
                assert not texte['definitif'] or ' bis' not in article["titre"]
                if m.group(2) is not None:
                    article["statut"] = re_cl_par.sub("", real_lower(m.group(2))).strip()
                if section["id"] != "":
                    article["section"] = section["id"]
            # Read a section's title
            elif read == 1 and line:
                texte = save_text(texte)
                section["titre"] = lower_but_first(line)
                if article is not None:
                    pr_js(article)
                    article = None
                pr_js(section)
                read = 0

        # detect dots, used as hints for later completion
        if read != -1 and len(all_articles) > 0:
            if re_mat_dots.match(line):
                if article is not None:
                    texte = save_text(texte)
                    pr_js(article)
                    article = None
                pr_js({"type": "dots"})
                read = 0
                continue

        # Read articles with alineas
        if read == 2 and not m:
            line = re_clean_coord.sub('', line)
            # if the line was only "Pour coordination", ignore it
            if not line:
                continue
            # Find extra status information
            if ali_num == 0 and re_mat_st.match(line):
                article["statut"] = re_cl_html.sub("", re_cl_par.sub("", real_lower(line)).strip()).strip()
                continue
            if "<table>" in line:
                cl_line = cl_html_except_tables(line)
            line = re_clean_art_spaces2.sub('. - ', re_clean_art_spaces.sub(
                r'\1', re_clean_idx_spaces.sub(r'\1. ', re_mat_new.sub(" ", cl_line).strip())))
            # Clean low/upcase issues with BIS TER etc.
            line = line.replace("oeUVRE", "OEUVRE")
            line = clean_full_upcase(line)
            line = re_clean_premier.sub(lambda m: (real_lower(m.group(0)) if m.group(1) else "") + m.group(3) + "er", line)
            line = re_clean_bister.sub(lambda m: m.group(1) + " " + real_lower(m.group(2)), line)
            # Clean different versions of the same comment.
            line = re_clean_supr.sub('(Supprimé)', line)
            line = re_clean_conf.sub(r'\1(Non modifié)', line)
            line = re_clean_subsec_space.sub(r'\1\4 \5', line)
            line = re_clean_subsec_space2.sub(r'\1 \2 \3\4', line)
            tmp = line
            line = re_clean_punc_space.sub(r'\1 \2', tmp)
            line = re_clean_spaces.sub(' ', line)
            line = re_mat_sec.sub(lambda x: lower_but_first(x.group(1)) + x.group(4)
                                  if re_mat_n.match(x.group(4)) else x.group(0), line)
            line = re_clean_footer_notes.sub(".", line)
            # Clean comments (Texte du Sénat), (Texte de la Commission), ...
            if ali_num == 0 and re_mat_texte.match(line):
                continue
            line = re_mat_single_char.sub("", line)
            line = line.strip()
            if line:
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
        else:
            # metas
            continue

    if article is not None:
        save_text(texte)
        pr_js(article)

    return all_articles