def find_texts_discussed_after(min_date, senate_urls=False, include_resolutions=False):
    """
    Return the set of dosleg urls referenced by agenda meetings held on or after min_date

    min_date is compared as a string against the date part (before the "T") of each
    meeting's "timeStampDebut".
    When senate_urls is true and the dossier has a "senatChemin", the cleaned Senate
    url is used instead of the National Assembly one.
    Dossiers whose procedure label is "Résolution" are skipped unless
    include_resolutions is true.
    """
    OPEN_DATA_REUNIONS_URL = "http://data.assemblee-nationale.fr/static/openData/repository/15/vp/reunions/Agenda_XV.json.zip"
    agenda = convert_reunions_open_data_file(
        download_open_data_file(None, OPEN_DATA_REUNIONS_URL))

    # 1. collect every dossier reference found in a recent enough meeting agenda
    discussed_refs = set()
    for meeting in to_arr(agenda['reunions']['reunion']):
        meeting_day = meeting['timeStampDebut'].split('T')[0]
        if meeting_day < min_date:
            continue
        if not (meeting.get('ODJ') and meeting['ODJ'].get('pointsODJ')):
            continue
        for agenda_item in to_arr(meeting['ODJ']['pointsODJ']['pointODJ']):
            refs = agenda_item['dossiersLegislatifsRefs']
            if refs:
                discussed_refs.update(to_arr(refs['dossierRef']))

    # 2. index the open data dossiers by uid
    open_data = download_open_data_doslegs(15)
    dossiers_by_uid = {}
    for entry in open_data["export"]["dossiersLegislatifs"]["dossier"]:
        parliamentary = entry['dossierParlementaire']
        dossiers_by_uid[parliamentary["uid"]] = parliamentary

    # 3. turn each reference into an url
    found_urls = set()
    for ref in discussed_refs:
        if ref not in dossiers_by_uid:
            # TODO: dossierAbsorbantRef
            print('[anpy]', ref, ' dosleg in ODJ but not found in Open Data', file=sys.stderr)
            continue

        dossier = dossiers_by_uid[ref]
        is_legislative = dossier["@xsi:type"] == "DossierLegislatif_Type"
        if not is_legislative:
            continue

        is_resolution = dossier["procedureParlementaire"]["libelle"] == "Résolution"
        if is_resolution and not include_resolutions:
            continue

        titles = dossier["titreDossier"]
        slug = titles["titreChemin"]
        an_url = "http://www.assemblee-nationale.fr/dyn/{}/dossiers/{}".format(
            dossier["legislature"], slug)
        senate_link = titles["senatChemin"]
        if senate_link and senate_urls:
            found_urls.add(clean_url(senate_link))
        else:
            found_urls.add(an_url)

    return found_urls
def find_an_url(data):
    """
    Try to find the National Assembly dosleg url by following the AN texts of data['steps']

    Each step whose source_url contains 'assemblee-nationale' is downloaded and
    parsed with lxml; the href of the parent of the '#btn_dossier' element, when
    there is one, is resolved against the text url, cleaned and returned.
    Returns None when there is no step or when no page gives a link.
    """
    steps = data['steps']
    if not steps:
        return None

    candidates = []
    for step in steps:
        source = step.get('source_url')
        if source and 'assemblee-nationale' in source:
            candidates.append(source)

    for text_url in candidates:
        page = BeautifulSoup(download(text_url).text, 'lxml')
        button = page.select_one('#btn_dossier')
        if not button:
            continue
        # the button is wrapped by the link to the dosleg
        href = button.parent.attrs.get('href')
        if href:
            return clean_url(urljoin(text_url, href))
    return None
def find_senat_url(data):
    """
    Try to find the Senate dosleg url by following the Senate texts of data['steps']

    Each step whose source_url contains 'senat.fr' is downloaded and parsed with
    lxml; the first link under '#primary' whose resolved url contains
    'dossier-legislatif/' or 'dossierleg/' is cleaned and returned.
    Returns None when there is no step or when no page gives such a link.
    """
    steps = data['steps']
    if not steps:
        return None

    senate_texts = []
    for step in steps:
        source = step.get('source_url')
        if source and 'senat.fr' in source:
            senate_texts.append(source)

    dosleg_markers = ('dossier-legislatif/', 'dossierleg/')
    for text_url in senate_texts:
        page = BeautifulSoup(download(text_url).text, 'lxml')
        for anchor in page.select('#primary a'):
            target = urljoin(text_url, anchor.attrs.get('href', ''))
            if any(marker in target for marker in dosleg_markers):
                return clean_url(target)
    return None
def find_an_url(data):
    """
    Try to find the National Assembly dosleg url by following the AN texts of data['steps']

    Same lookup as the '#btn_dossier' one described by the selectors below, but
    the downloaded pages are parsed with html5lib.
    Returns the cleaned absolute url, or None when nothing is found.
    """
    if not data['steps']:
        return None

    def is_an_text(step):
        source = step.get('source_url')
        return bool(source) and 'assemblee-nationale' in source

    an_texts = [step['source_url'] for step in data['steps'] if is_an_text(step)]

    for text_url in an_texts:
        markup = download(text_url).text
        button = BeautifulSoup(markup, 'html5lib').select_one('#btn_dossier')
        if button:
            # the button is wrapped by the link to the dosleg
            wrapper_attrs = button.parent.attrs
            if wrapper_attrs.get('href'):
                return clean_url(urljoin(text_url, wrapper_attrs['href']))
    return None
def parse(url, verbose=True, logfile=sys.stderr, cached_opendata_an={}):
    """
    Parse a National Assembly dosleg url and return a list of doslegs

    Urls containing '/dyn/' are first looked up through opendata_parse; a truthy
    result is returned as a one-element list. Otherwise (or when the open data
    lookup gives nothing) the page is downloaded and handed to
    historic_doslegs_parse, whose result is returned as is.

    NOTE: cached_opendata_an is a shared mutable default, it is only forwarded
    to opendata_parse here
    """
    url = clean_url(url)

    is_open_data_url = '/dyn/' in url
    if is_open_data_url:
        dosleg = opendata_parse(
            url,
            verbose=verbose,
            logfile=logfile,
            cached_opendata_an=cached_opendata_an,
        )
        if dosleg:
            return [dosleg]

    response = download_historic_dosleg(url)
    page_html, final_url = response.text, response.url
    return historic_doslegs_parse(page_html, final_url, verbose=verbose, logfile=logfile)
def parse(url, logfile=sys.stderr, cached_opendata_an={}):
    """
    Parse a National Assembly dosleg url and return a list of doslegs

    Urls containing '/dyn/' are first looked up through opendata_parse; a truthy
    result is returned as a one-element list, otherwise a warning is printed to
    logfile and the historic parser is tried.
    Returns [] (after a warning) when the historic page does not answer with a
    200 status code.

    NOTE: cached_opendata_an is a shared mutable default, it is only forwarded
    to opendata_parse here
    """
    url = clean_url(url)

    if '/dyn/' in url:
        dosleg = opendata_parse(
            url,
            logfile=logfile,
            cached_opendata_an=cached_opendata_an,
        )
        if dosleg:
            return [dosleg]
        print('WARNING: NOT FOUND IN OPEN-DATA', file=logfile)

    response = download_historic_dosleg(url)
    if response.status_code == 200:
        page_html, final_url = response.text, response.url
        return historic_doslegs_parse(page_html, final_url, logfile=logfile)

    print('WARNING: NOT FOUND IN HISTORIC DOSLEGS', file=logfile)
    return []
def parse(url, logfile=sys.stderr, cached_opendata_an={}):
    """
    Build a dosleg dict from the National Assembly open data for the given url

    The legislature is extracted from the url; the open data export for that
    legislature is taken from cached_opendata_an when present, downloaded
    otherwise. The first "DossierLegislatif_Type" dossier whose
    "<legislature>/dossiers/<titreChemin>" is a suffix of url is converted.

    Returns the data dict of that dossier (urls, titles, "urgence", "steps",
    and when available "url_jo", "end", "proposal_type", "use_old_procedure",
    "beginning"), or [] when no dossier matches the url.

    NOTE: cached_opendata_an is a mutable default; it is only read in this function
    """
    def _log(*args):
        nonlocal logfile
        print(*args, file=logfile)

    legislature, _ = parse_national_assembly_url(url)
    if legislature and legislature in cached_opendata_an:
        dossiers_json = cached_opendata_an[legislature]
    else:
        dossiers_json = download_open_data_doslegs(legislature)

    # texts indexed by uid, used to resolve the texteAdopte/texteAssocie references
    docs = {
        doc["uid"]: doc
        for doc in dossiers_json["export"]["textesLegislatifs"]["document"]
    }

    for dossier in dossiers_json["export"]["dossiersLegislatifs"]["dossier"]:
        dossier = dossier["dossierParlementaire"]

        if dossier["@xsi:type"] != "DossierLegislatif_Type":
            continue

        titreChemin = dossier["titreDossier"]["titreChemin"]

        # find the right dosleg even if it's an old url
        url_common_part = "{}/dossiers/{}".format(dossier["legislature"],
                                                  titreChemin)
        if not url.endswith(url_common_part):
            continue
        # from here on url is the canonical /dyn/ url of the dossier
        url = "http://www.assemblee-nationale.fr/dyn/{}".format(
            url_common_part)

        data = {}
        data["urgence"] = False
        url_senat = dossier["titreDossier"]["senatChemin"]
        if url_senat:
            data["url_dossier_senat"] = clean_url(url_senat)
        data["long_title"] = dossier["titreDossier"]["titre"]
        data["url_dossier_assemblee"] = clean_url(url)
        data["assemblee_legislature"] = int(dossier["legislature"])
        data["assemblee_slug"] = dossier["titreDossier"]["titreChemin"]
        data["assemblee_id"] = "%s-%s" % (dossier["legislature"],
                                          data["assemblee_slug"])

        if dossier["procedureParlementaire"]["libelle"] in (
                "Projet de loi de finances de l'année",
                "Projet de loi de financement de la sécurité sociale",
                "Projet de loi de finances rectificative",
                "Projet ou proposition de loi constitutionnelle",
        ):
            data['use_old_procedure'] = True

        data["steps"] = []
        step = None
        # first step seen for the current (stage, step, institution), used as
        # the predicted step for unfinished dossiers at the end
        start_step = None
        for etape in to_arr(dossier["actesLegislatifs"]["acteLegislatif"]):
            for path, sous_etape in yield_leafs(etape):
                if sous_etape["@xsi:type"] in ("EtudeImpact_Type",
                                               "DepotAvisConseilEtat_Type"):
                    continue

                step = {}

                date = sous_etape.get("dateActe")
                if date:
                    # keep only the day part of the timestamp
                    step["date"] = date.split("T")[0]

                if sous_etape["@xsi:type"] == "ProcedureAccelere_Type":
                    data["urgence"] = True
                    continue
                elif sous_etape["@xsi:type"] == "Promulgation_Type":
                    url = clean_url(
                        sous_etape.get("urlLegifrance")
                        or sous_etape["infoJO"]["urlLegifrance"])
                    data["url_jo"] = url
                    # NOTE(review): raises KeyError when the promulgation has no dateActe
                    data["end"] = step["date"]

                    step["institution"] = "gouvernement"
                    step["stage"] = "promulgation"
                    step["source_url"] = url
                    data["steps"].append(step)
                    continue
                elif sous_etape["@xsi:type"] == "ConclusionEtapeCC_Type":
                    step["institution"] = "conseil constitutionnel"
                    step["stage"] = "constitutionnalité"
                    step["source_url"] = clean_url(sous_etape["urlConclusion"])
                    data["steps"].append(step)
                    # no continue here: the code-based classification below
                    # still runs for this leaf

                if "textesAssocies" in sous_etape:
                    # TODO review
                    # only the first associated text is kept
                    sous_etape["texteAssocie"] = to_arr(
                        sous_etape["textesAssocies"]
                        ["texteAssocie"])[0]["refTexteAssocie"]

                code = sous_etape.get("codeActe")

                if "AVIS-RAPPORT" in code or code == 'CMP-DEPOT':
                    continue
                if '-DPTLETTRECT' in code:
                    continue

                # institution from the code prefix
                if code.startswith("AN"):
                    step["institution"] = "assemblee"
                elif code.startswith("SN"):
                    step["institution"] = "senat"

                # step from the code
                if "-DEPOT" in code:
                    step["step"] = "depot"
                elif "-COM" in code:
                    step["step"] = "commission"
                elif "-DEBATS" in code:
                    step["step"] = "hemicycle"
                else:
                    _log(" - WARNING Unknown step type", code)
                    continue

                # stage from the code
                if "1-" in code:
                    step["stage"] = "1ère lecture"
                elif "2-" in code:
                    step["stage"] = "2ème lecture"
                elif "3-" in code:
                    step["stage"] = "3ème lecture"
                # TODO: else libelleCourt
                elif "NLEC-" in code:
                    step["stage"] = "nouv. lect."
                elif "ANLDEF-" in code:
                    step["stage"] = "l. définitive"
                    # commission steps of a "l. définitive" are dropped
                    if step["step"] == "commission":
                        continue
                elif "CMP-" in code:
                    step["stage"] = "CMP"
                    if "-AN" in code:
                        step["institution"] = "CMP"
                    elif "-SN" in code:
                        step["institution"] = "senat"
                        if "RAPPORT-SN" in code:
                            # ignore the cmp_commission_other_url for now
                            continue
                    else:
                        step["institution"] = "CMP"
                elif "ANLUNI-" in code:
                    step["stage"] = "l. unique"

                step["id_opendata"] = sous_etape["uid"]

                # keep first step for a step-type (ex: first hemicycle)
                if start_step is None or not same_stage_step_instit(
                        start_step, step):
                    start_step = step

                if "texteAdopte" in sous_etape or "texteAssocie" in sous_etape:
                    # there is no multiple depot in the National Assembly
                    # simply the senate re-submitting the same text
                    if data['steps']:
                        last_step = data['steps'][-1]
                        if last_step[
                                'institution'] == 'assemblee' and last_step.get(
                                    'step') == step.get('step') == 'depot':
                            # ignore the depot we already have (since the new one is the same)
                            data['steps'] = data['steps'][:-1]

                    # step['xsi-type'] = sous_etape.get('@xsi:type')
                    # step['code'] = sous_etape.get('codeActe')

                    id_text = sous_etape.get("texteAdopte") or sous_etape.get(
                        "texteAssocie")
                    if id_text:
                        # the proposal type is guessed from the first text id seen
                        if "proposal_type" not in data:
                            if id_text.startswith("PRJL"):
                                data["proposal_type"] = "PJL"
                            elif id_text.startswith("PION"):
                                data["proposal_type"] = "PPL"

                        doc = {}
                        if id_text in docs:
                            doc = docs[id_text]
                        else:
                            _log(" - ERROR missing text", id_text)

                        url = None
                        if step.get(
                                "institution") == "assemblee" or "-AN" in code:
                            doc_code = None
                            if doc:
                                doc_code = doc['classification']['type'][
                                    'code']
                                # texts classified 'ACIN' are skipped entirely
                                if doc_code == 'ACIN':
                                    continue
                            url = an_text_url(id_text, doc_code)
                            if url:
                                step['source_url'] = url

                    data["steps"].append(step)

                else:
                    pass

        if data['steps']:
            # add predicted step
            if not data.get('url_jo'):
                # NOTE(review): start_step is None when only
                # "conseil constitutionnel" steps were added - confirm it cannot happen
                if data['steps'][-1].get('step') != start_step.get(
                        'step') and start_step.get('step'):
                    # TODO: we could also add all the dates into a steps['dates'] = [..]
                    data['steps'].append(start_step)
            data["beginning"] = data["steps"][0]["date"]
        else:
            _log(" - WARNING no steps found for", url)

        return data
    return []
def historic_doslegs_parse(html,
                           url_an=None,
                           verbose=True,
                           logfile=sys.stderr,
                           nth_dos_in_page=0,
                           parse_previous_works=True,
                           parse_next_works=True):
    """
    Parse an AN dosleg like http://www.assemblee-nationale.fr/13/dossiers/accord_Montenegro_mobilite_jeunes.asp

    The page is read line by line; each line can update the current
    institution/stage and/or produce a step (institution, stage, step,
    source_url and, when detected, date).

    html is the page content and url_an its url (used to resolve relative links).
    Errors and warnings are printed to logfile unless verbose is false.

    nth_dos_in_page, parse_previous_works and parse_next_works are for internal logic
    (they are set by the recursive calls made for the other doslegs of the page,
    the previous works and the more recent works)

    Returns a list of dosleg dicts (the parsed one followed by the other doslegs
    found inside the page), or None for a "commission d'enquête" or a
    "proposition de résolution européenne".
    """

    data = {
        'url_dossier_assemblee': clean_url(url_an),
        'urgence': False,
    }

    def _log_error(*error):
        print('## ERROR ###', *error, file=logfile)

    def _log_warning(*error):
        print('## WARNING ###', *error, file=logfile)

    log_error = _log_error
    log_warning = _log_warning
    if not verbose:
        # silent replacements for the loggers

        def log_error(*x):
            return None

        def log_warning(*x):
            return None

    soup = BeautifulSoup(html, 'lxml')

    legislature, slug = parse_national_assembly_url(
        data['url_dossier_assemblee'])
    data['assemblee_slug'] = slug
    if legislature:
        data['assemblee_legislature'] = legislature
    else:  # strange link (old dosleg)
        log_error('NO LEGISLATURE IN AN LINK: ' +
                  data['url_dossier_assemblee'])
    data['assemblee_id'] = '%s-%s' % (data.get('assemblee_legislature', ''),
                                      data['assemblee_slug'])

    data['steps'] = []
    curr_institution = 'assemblee'
    curr_stage = '1ère lecture'
    last_section = None  # Travaux des commissions/Discussion en séance publique
    last_step_index = 0  # index of the line that produced the last step
    travaux_prep_already = False
    another_dosleg_inside = None
    predicted_next_step = None  # For unfinished projects, we try to catch the next step
    previous_works = None
    url_jo = None

    html_lines = html.split('\n')
    for i, line in enumerate(html_lines):

        def parse_line():
            return BeautifulSoup(line, 'lxml')

        def line_text():
            return parse_line().text.strip()

        def get_last_step():
            if len(data['steps']) > 0:
                return data['steps'][-1]
            return {}

        if '<COMMENTAIRE>' in line or '<table border="1"' in line:
            continue

        if '<font face="ARIAL" size="3" color="#000080">' in line:
            data['long_title'] = line_text()

        if '<br><b><font color="#000099">Travaux des commissions</font></b><br>' in line:
            last_section = line_text()

        if '<p align="center"><b><font color="#000080">Travaux préparatoires</font></b><br>' in line:
            # a second "Travaux préparatoires" header means another dosleg starts here
            if travaux_prep_already:
                if parse_next_works and not nth_dos_in_page:
                    log_warning('FOUND ANOTHER DOSLEG INSIDE THE DOSLEG')
                    # keep everything after the last step for a later recursive parse
                    another_dosleg_inside = '\n'.join(
                        html.split('\n')[last_step_index + 1:])
                if not nth_dos_in_page:
                    break
                travaux_prep_already = False
            else:
                travaux_prep_already = True

        if not parse_next_works and travaux_prep_already and nth_dos_in_page:
            continue

        # Senat 1ère lecture, CMP, ...
        if '<font color="#000099" size="2" face="Arial">' in line:
            text = line_text()
            last_section = None
            if 'Dossier en ligne sur le site du Sénat' in text:
                data['url_dossier_senat'] = clean_url(
                    parse_line().select('a')[-1].attrs['href'])
                text = text.replace('(Dossier en ligne sur le site du Sénat)',
                                    '')
            if 'Sénat' in text:
                curr_institution = 'senat'
            elif 'Assemblée nationale' in text:
                curr_institution = 'assemblee'
            elif 'Commission Mixte Paritaire' in text or 'Lecture texte CMP' in text:
                curr_institution = 'CMP'
                curr_stage = 'CMP'
            elif 'Conseil Constitutionnel' in text:
                curr_institution = 'conseil constitutionnel'
                curr_stage = 'constitutionnalité'
            elif 'Congrès du Parlement' in text:
                curr_institution = 'congrès'
                curr_stage = 'congrès'

            if '1ère lecture' in text:
                curr_stage = '1ère lecture'
            elif '2e lecture' in text:
                curr_stage = '2ème lecture'
            elif 'Nouvelle lecture' in text:
                curr_stage = 'nouv. lect.'
            elif 'Lecture définitive' in text:
                curr_stage = 'l. définitive'
            if not curr_stage:
                curr_stage = text.split('-')[-1].strip().lower()

            if curr_stage == "création de la commission d'enquête":
                log_warning('COMMISSION D\'ENQUETE')
                return None

        if '>Proposition de résolution européenne<' in line:
            log_warning('PROPOSITION DE RESOLUTION EUROPEENE')
            return None

        # only the first link to the previous works is kept
        if '>Accès aux Travaux préparatoires' in line and not previous_works:
            previous_works = clean_url(
                urljoin(url_an,
                        parse_line().find('a').attrs['href']))

        curr_step = None
        # conseil. consti. has no step but we should get the link
        no_step_but_good_link = False
        if 'Rapport portant également sur les propositions' in line:
            continue
        elif re.search(
                r'<a[^>]* href=[^>]*>(projet de loi|proposition de loi|proposition de résolution)',
                line, re.I):
            curr_step = 'depot'
            if curr_stage == 'CMP':
                continue
        elif ">Texte de la commission" in line or '/ta-commission/' in line:
            curr_step = 'commission'
        elif '/ta/' in line or '/leg/tas' in line:
            # an adopted text is the depot of a new stage, the hemicycle otherwise
            if get_last_step().get('stage') != curr_stage:
                curr_step = 'depot'
                if curr_stage == 'CMP':
                    curr_step = 'commission'
            else:
                curr_step = 'hemicycle'
        elif ('/rapports/' in line or '/rap/' in line
              ) and last_section and 'commissions' in last_section:
            if get_last_step().get('step') == 'commission':
                # log_warning('DOUBLE COMMISSION LINE: %s' % line)
                continue
            curr_step = 'commission'

        elif 'www.conseil-constitutionnel.fr/decision/' in line:
            no_step_but_good_link = True

        # no commissions for l. définitive
        if curr_stage == 'l. définitive' and curr_step == 'commission':
            continue

        if curr_step or no_step_but_good_link:
            # if same step previously, replace or not the url
            if get_last_step().get('step') == curr_step:
                # log_warning('DOUBLE STEP: %s' % line)
                # remove last step since we prefer text links instead of reports links
                # TODO: add report link as bonus_url
                last_url = get_last_step().get('source_url')
                if not last_url or ('/rapports/' in last_url
                                    or '/rap/' in last_url):
                    data['steps'] = data['steps'][:-1]
                # looks like the last url was already a text, let's assume it's a multi-depot
                else:
                    # multi-depot if not CMP
                    # TODO: re-order multi depot
                    if curr_institution == 'senat' and curr_stage != 'CMP':
                        curr_step = 'depot'

            links = [a.attrs.get('href') for a in parse_line().select('a')]
            # drop the links that cannot be a text or a report
            links = [
                href for href in links
                if href and 'fiches_id' not in href
                and '/senateur/' not in href and 'javascript:' not in href
            ]
            if not links:
                log_error('NO LINK IN LINE: %s' % (line, ))
                continue
            urls_raps = []
            urls_others = []
            for href in links:
                if '/rap/' in href or '/rapports/' in href:
                    urls_raps.append(href)
                else:
                    urls_others.append(href)

            cmp_commission_other_url = None
            # a non-report link is preferred over a report link
            if len(urls_others) > 0:
                url = urls_others[0]
                # CMP commission should produce two texts, one for each institution
                if curr_step == 'commission' and curr_stage == 'CMP' and len(
                        urls_others) > 1:
                    cmp_commission_other_url = clean_url(
                        urljoin(url_an, urls_others[1]))
            else:
                url = urls_raps[0]

            url = clean_url(urljoin(url_an, url))

            # for a CMP hemicycle, the institution is the one hosting the text
            real_institution = curr_institution
            if curr_stage == 'CMP' and curr_step == 'hemicycle':
                if 'assemblee-nationale.fr' in url:
                    real_institution = 'assemblee'
                elif 'senat.fr' in url:
                    real_institution = 'senat'

            step = {
                'institution': real_institution,
                'stage': curr_stage,
                'source_url': url,
            }

            if curr_step:
                step['step'] = curr_step

            if cmp_commission_other_url:
                step['cmp_commission_other_url'] = cmp_commission_other_url

            # try to detect a date
            # (in the current line, then in the previous one; for the first
            # line of the page, html_lines[i - 1] is the last line)
            for test_line in (line, html_lines[i - 1]):
                test_line = test_line.replace('1<sup>er</sup>', '1')
                date_match = re.search(
                    r'(déposée? le|adoptée? .*? le|modifiée? .*?|rejetée? .*?)\s*(\d\d? \w\w\w+ \d\d\d\d)',
                    test_line, re.I)
                if date_match:
                    step['date'] = format_date(date_match.group(2))
                else:
                    date_match = re.search(
                        r'(mis en ligne le)\s*(\d\d? \w\w\w+ \d\d\d\d)',
                        test_line, re.I)
                    if date_match:
                        step['date'] = format_date(date_match.group(2))
                if 'date' in step and 'beginning' not in data:
                    data['beginning'] = step['date']
            data['steps'].append(step)
            predicted_next_step = None
            last_step_index = i

        if 'publiée au Journal Officiel' in line and not url_jo:
            links = [
                clean_url(a.attrs['href']) for a in parse_line().select('a')
                if 'legifrance' in a.attrs.get('href', '')
            ]
            if not links:
                log_error('NO GOOD LINK IN LINE: %s' % (line, ))
                continue
            url_jo = links[-1]

        if 'Le Gouvernement a engagé la procédure accélérée' in line or 'engagement de la procédure accélérée' in line:
            data['urgence'] = True

        # Next step prediction via small clues
        # TODO: this could be done via last_section (we parse two times the same thing)
        # TODO: this fails for CMP hemicycle senat
        if curr_stage != 'CMP':
            if '>Discussion en séance publique<' in line:
                predicted_next_step = {
                    'institution': curr_institution,
                    'stage': curr_stage,
                    'step': 'hemicycle',
                }
            elif '>Travaux des commissions<' in line:
                predicted_next_step = {
                    'institution': curr_institution,
                    'stage': curr_stage,
                    'step': 'commission',
                }

    # no JO link found in the lines: fall back on the <meta> tags of the page
    if not url_jo:
        metas = {}
        for meta in soup.select('meta'):
            if 'name' in meta.attrs:
                metas[meta.attrs['name']] = meta.attrs['content']
        url_jo = metas.get('LIEN_LOI_PROMULGUEE')

    if url_jo:
        data['url_jo'] = clean_url(url_jo)
        data['steps'].append({
            'institution': 'gouvernement',
            'stage': 'promulgation',
            'source_url': data['url_jo'],
        })
    # add predicted next step for unfinished projects
    elif predicted_next_step:
        data['steps'].append(predicted_next_step)

    # look for the Senate dosleg through the texts when the page gave no usable link
    if 'url_dossier_senat' not in data or 'dossier-legislatif' not in data[
            'url_dossier_senat']:
        senat_url = find_senat_url(data)
        if senat_url:
            data['url_dossier_senat'] = senat_url

    # append previous works if there are some
    if previous_works and parse_previous_works:
        log_warning('MERGING %s WITH PREVIOUS WORKS %s' % (url_an,
                                                           previous_works))
        resp = download_historic_dosleg(previous_works)
        prev_data = historic_doslegs_parse(
            resp.text,
            previous_works,
            logfile=logfile,
            verbose=verbose,
            nth_dos_in_page=nth_dos_in_page,
            parse_next_works=False)
        if prev_data:
            prev_data = prev_data[nth_dos_in_page] if len(
                prev_data) > 1 else prev_data[0]
            data = merge_previous_works_an(prev_data, data)
        else:
            log_warning('INVALID PREVIOUS WORKS', previous_works)

    # is this part of a dosleg previous works ?
    # (9999 disables the lookup when the legislature is unknown)
    next_legislature = data[
        'assemblee_legislature'] + 1 if 'assemblee_legislature' in data else 9999
    if parse_next_works and next_legislature < 15:
        # TODO: parse 15th legislature from open data if it exists
        # try the same url in the next legislature
        resp = download_historic_dosleg(
            url_an.replace('/%d/' % data['assemblee_legislature'],
                           '/%d/' % (data['assemblee_legislature'] + 1)))
        if resp.status_code == 200:
            recent_data = historic_doslegs_parse(
                resp.text,
                resp.url,
                logfile=logfile,
                verbose=verbose,
                nth_dos_in_page=nth_dos_in_page,
                parse_previous_works=False)
            if recent_data:
                log_warning('FOUND MORE RECENT WORKS', resp.url)
                recent_data = recent_data[nth_dos_in_page] if len(
                    recent_data) > 1 else recent_data[0]
                data = merge_previous_works_an(data, recent_data)

    # parse the remaining doslegs of the page
    if another_dosleg_inside:
        others = historic_doslegs_parse(
            another_dosleg_inside,
            url_an,
            logfile=logfile,
            verbose=verbose,
            nth_dos_in_page=nth_dos_in_page + 1)
        if others:
            return [data] + others
    return [data]
def parse(html, url_senat=None, logfile=sys.stderr):
    """
    Parse a Senate dosleg page and return a dosleg dict

    html is the page content. url_senat is the url of the page; when it is not
    given, it is read from a "<!-- URL_SENAT=XXXX !-->" comment of the page and,
    failing that, 'http://www.senat.fr/' is used to resolve relative links.
    Errors are printed to logfile.

    Returns the data dict (titles, urls, "urgence", "steps", ...), or None when
    the page has no title block or no timeline.

    Fixes compared to the previous implementation:
    - 'assemblee_id' is built with '%s-%s' (like the AN historic parser) so a
      missing legislature does not raise a TypeError anymore
    - a "Texte retiré" really keeps the previous 'depot' steps (the filter was
      comparing with 'depots', which no step ever has)
    - the Conseil constitutionnel decision is searched in the paragraph AND in
      the top-of-page line, as the code intended (decision_text was unused)
    """
    data = {}

    def log_error(error):
        print('## ERROR ###', error, file=logfile)

    soup = BeautifulSoup(html, 'html5lib')

    data['short_title'] = clean_spaces(
        soup.select_one('.title-dosleg').text.strip())

    if not soup.select('.title .subtitle-01'):
        log_error('NO TITLE - MAYBE A REDIRECT ?')
        return

    title_lines = soup.select_one('.title .subtitle-01').text.strip()
    data['long_title_descr'] = clean_spaces(
        title_lines.split('\n')[0][:-2])  # remove " :" at the end of the line
    data['long_title'] = clean_spaces(
        soup.find("meta", {"name": "Description"})['content'])

    # classify the bullet points displayed under the title
    promulgee_line = None
    ordonnance_line = None
    acceleree_line = None
    cc_line = None
    for line in soup.select('.title .list-disc-03 li'):
        if ' parue ' in line.text:
            promulgee_line = line
        elif 'ordonnance' in line.text:
            ordonnance_line = line
        elif 'accélérée' in line.text or 'Urgence déclarée' in line.text:
            acceleree_line = line
        elif 'Décision du Conseil constitutionnel' in line.text:
            cc_line = line
        else:
            log_error('UNKNOWN SUBTITLE: %s' % line.text)
    if promulgee_line:
        data['law_name'] = clean_spaces(
            promulgee_line.find('strong').text.strip())  # promulgation
        data['end'] = format_date(
            promulgee_line.text.split('JO ')[-1].split('du ')[-1].split('(')
            [0].strip())  # publication in the JO
        if promulgee_line.find('a'):
            data['url_jo'] = clean_url(promulgee_line.find('a').attrs['href'])
            url_jo_params = parse_qs(urlparse(data['url_jo']).query)
            if 'cidTexte' in url_jo_params:
                data['legifrance_cidTexte'] = url_jo_params['cidTexte'][0]
            else:
                # the id can also be the last part of the url path
                last_part = [
                    part for part in data['url_jo'].split('/') if part
                ][-1]
                if last_part.startswith('JORFTEXT'):
                    data['legifrance_cidTexte'] = last_part
        else:
            log_error('NO JO LINK')
    # TOPARSE: ordonnance_line
    # TOPARSE: CC decision

    data[
        'urgence'] = acceleree_line is not None or 'procédure accélérée engagée par le' in title_lines
    if not url_senat:
        # the url is in a comment like "<!-- URL_SENAT=XXXX !-->" for downloaded pages
        comment = soup.find(text=lambda text: isinstance(text, Comment) and
                            'URL_SENAT' in text)
        if comment:
            url_senat = comment.split('=')[1].strip()
    if url_senat:
        data['url_dossier_senat'] = clean_url(url_senat)
        data['senat_id'] = data['url_dossier_senat'].split('/')[-1].replace(
            '.html', '')
    else:
        url_senat = 'http://www.senat.fr/'

    tableau_comparatif = soup.select_one('.button-tableau-comparatifs')
    if tableau_comparatif:
        data['tableau_comparatif_url'] = clean_url(
            urljoin(url_senat, tableau_comparatif.attrs['href']))

    # purpose of the text (very basic)
    for div in soup.select('#main div.scroll'):
        if div.find('h3') and 'Objet du texte' in div.find('h3').text:
            data['objet_du_texte'] = div.text.replace('Objet du texte\n', '') \
                .replace("Lire le billet de l'Espace presse", '').strip()
            continue

    # TODO: dodgy selector ?
    for link in soup.select('h4.title.title-06.link-type-02 a'):
        if 'Assemblée' in link.text:
            url_an = link.attrs['href']
            if 'documents/index-' not in url_an:
                data['url_dossier_assemblee'] = clean_url(url_an)
                legislature, data[
                    'assemblee_slug'] = parse_national_assembly_url(
                        data['url_dossier_assemblee'])
                if legislature:
                    data['assemblee_legislature'] = legislature
                else:
                    log_error('NO LEGISLATURE IN AN LINK: ' + url_an)
                # '%s' and not '%d': the legislature falls back to '' when
                # it was not found, which '%d' cannot format
                data['assemblee_id'] = '%s-%s' % (data.get(
                    'assemblee_legislature', ''), data['assemblee_slug'])
            else:
                log_error('INVALID URL AN: ' + url_an)

    data['steps'] = []
    steps_shortcuts = soup.select('.list-timeline li')  # icons on top
    if not steps_shortcuts:
        log_error('VERY SPECIAL CASE - PAS DE NAVETTES NORMALES')
        return

    themes_box = soup.select_one('#box-themes')
    if themes_box:
        data['themes'] = [x.text.strip() for x in themes_box.select('.theme')]

    for t in [
            'financement de la sécurité', 'règlement des comptes',
            'règlement du budget', 'approbation des comptes',
            'loi de finances rectificative',
            'loi de financement rectificative', 'de loi constitutionnelle'
    ]:
        if t in data['long_title']:
            data['use_old_procedure'] = True
    if 'plfss' in data.get('senat_id', '') or 'pjlf' in data.get(
            'senat_id', ''):
        data['use_old_procedure'] = True

    if 'pjl' in data.get('senat_id', '') or 'plfss' in data.get(
            'senat_id', ''):
        data['proposal_type'] = 'PJL'
    elif 'ppl' in data.get('senat_id', ''):
        data['proposal_type'] = 'PPL'
    else:
        log_error('UNKNOWN PROPOSAL TYPE (PPL/PJL)')

    # detailed blocks, in the same order as the timeline icons
    steps_contents = []
    for item in soup.select('#box-timeline > div div'):
        if 'timeline-' in item.attrs.get('id', ''):
            steps_contents.append(item)

    # institution and stage are carried over from one timeline entry to the next
    curr_institution = None
    curr_stage = None
    error_detection_last_date = None
    for timeline_index, step_shortcut in enumerate(steps_shortcuts):
        step = {}

        item = BeautifulSoup('', 'lxml')  # no info block for steps in the future
        if len(steps_contents) > timeline_index:
            item = steps_contents[timeline_index]

        # the title of the section is the text of the closest previous <h3>
        section_title = item
        while section_title.previous_sibling and section_title.previous_sibling.name != 'h3':
            section_title = section_title.previous_sibling
        section_title = section_title.previous_sibling.text if section_title.previous_sibling else ''

        step['date'] = None
        if step_shortcut.select('em'):
            date_text = step_shortcut.select('em')[-1].text.strip()
            if '/' in date_text:
                step['date'] = format_date(date_text)
        if not step['date'] and item.text:
            # TODO: date sometimes is not on the shortcut
            log_error('SHORCUT WITHOUT DATE')

        if 'beginning' not in data and step['date']:
            data['beginning'] = step['date']

        # TODO review this part
        step_shorcut_infos = step_shortcut.select_one(
            'a[title]').attrs['title'].split('|')[-1].split('-')
        step_step = step_shorcut_infos[-1].lower().strip()
        if 'commission' in step_step:
            step_step = 'commission'
        elif 'séance' in step_step:
            step_step = 'hemicycle'

        # TODO: this whole thing looks odd to me
        # stage = 1ere lecture|2eme lecture|CMP
        # institution = assemblee|senat|CMP|gouvernement
        # step = depot|commission|hemicycle
        if step_shortcut.select_one('em'):
            titre = step_shortcut.select_one('em').text.lower().strip()
            if titre == 'loi' or 'promulgation' in titre:
                curr_stage = 'promulgation'
            else:
                curr_stage = step_shorcut_infos[0].lower().strip()
                if curr_stage == 'cmp':
                    curr_stage = 'CMP'

            # sometimes the lecture info is in the date, why not ?
            # ex: http://www.senat.fr/dossier-legislatif/a82831259.html
            if 'lecture' in date_text:
                curr_stage = date_text

        # the icon gives the institution and/or the step
        img = step_shortcut.find('img').attrs['src']
        if 'picto_timeline_01_' in img:
            curr_institution = 'assemblee'
            step_step = 'depot'
        elif 'picto_timeline_02_' in img:
            curr_institution = 'senat'
            step_step = 'depot'
        elif 'picto_timeline_05_' in img:
            curr_institution = 'CMP'
            curr_stage = 'CMP'
            # there is no "depot" step for a CMP
            continue
        elif 'picto_timeline_03_' in img:
            step_step = 'commission'
        elif 'picto_timeline_04_' in img:
            step_step = 'hemicycle'
        elif 'picto_timeline_07_' in img:
            curr_institution = 'gouvernement'
        elif 'picto_timeline_09_' in img:
            # 'nouv. délib.' ex: http://www.senat.fr/dossier-legislatif/pjl02-182.html
            continue
        elif 'picto_timeline_10_' in img:
            curr_institution = 'congrès'

        if curr_stage == 'c. constit.':
            curr_institution = 'conseil constitutionnel'
            curr_stage = 'constitutionnalité'
            step_step = None

        # the picto can be the wrong one...also a depot step for a CMP doesn't makes sense
        # ex: http://www.senat.fr/dossier-legislatif/taan99-406.html
        if curr_stage == 'CMP' and step_step == 'depot':
            curr_institution = 'CMP'
            log_error('DEPOT STEP FOR A CMP')
            continue

        # no commissions for l. définitive
        if curr_stage == 'l. définitive' and step_step == 'commission':
            continue

        step['institution'] = curr_institution

        # standardize on 1ère lecture / 2ème lecture
        curr_stage = curr_stage.replace('eme', 'ème')

        # ignore step rejet like https://www.senat.fr/dossier-legislatif/ppl17-392.html
        if step_step == 'rejet':
            continue

        step['stage'] = curr_stage
        if curr_stage not in ('constitutionnalité', 'promulgation'):
            step['step'] = step_step

        # fill in for special case like http://www.senat.fr/dossier-legislatif/csm.html
        if curr_institution == 'congrès' and not curr_stage:
            step['stage'] = 'congrès'
        if curr_institution == 'congrès' and not step_step:
            step['step'] = 'congrès'

        # pass congrès if not hemicycle
        if step.get('step') == 'congrès':
            continue

        # add a legislature guess if missing
        if curr_institution == 'assemblee' and step['date']:
            legis = guess_legislature(step['date'])
            if legis:
                data['assemblee_legislature'] = legis

        good_urls = []
        if 'Texte renvoyé en commission' in item.text:
            step['echec'] = 'renvoi en commission'
        elif item.text:
            # let's find the texts
            for link in item.select('a'):
                line = link.parent

                if 'Lettre rectificative' in link.text:
                    continue

                if 'href' in link.attrs:
                    href = link.attrs['href']
                    nice_text = link.text.lower().strip()
                    # TODO: assemblée "ppl, ppr, -a0" (to be checked)
                    if (
                        ('/leg/' in href
                         and '/' not in href.replace('/leg/', '')
                         and 'avis-ce' not in href)
                        or nice_text in ('texte', 'texte de la commission',
                                         'décision du conseil constitutionnel')
                        or 'jo n°' in nice_text

                        # TODO: parse the whole block for date + url
                        # ex: http://www.senat.fr/dossier-legislatif/pjl08-641.html
                        or 'conseil-constitutionnel.fr/decision.' in href
                    ):

                        # if we detect a "texte de la commission" in an old procedure, it means it's probably not the old procedure
                        if data.get(
                                'use_old_procedure'
                        ) and nice_text == 'texte de la commission' and step.get(
                                'stage') != 'CMP':
                            del data['use_old_procedure']

                        # motion for a referendum for example
                        # ex: http://www.senat.fr/dossier-legislatif/pjl12-349.html
                        if '/leg/motion' in href:
                            continue
                        href = pre_clean_url(href)

                        url = urljoin(url_senat, href)
                        line_text = line.text.lower()
                        institution = curr_institution
                        if curr_stage != 'promulgation':  # TODO: be more specific, have a way to force the curr_instituion
                            if 'par l\'assemblée' in line_text:
                                institution = 'assemblee'
                            elif 'par le sénat' in line_text:
                                institution = 'senat'
                            else:
                                if curr_stage == 'CMP' and step_step == 'hemicycle' \
                                        and 'texte' in nice_text and not step.get('echec'):
                                    if 'assemblee-nationale.fr' in href:
                                        institution = 'assemblee'
                                    else:
                                        institution = 'senat'
                        date = find_date(line_text, curr_stage)
                        if date:
                            if error_detection_last_date and dateparser.parse(
                                    error_detection_last_date
                            ) > dateparser.parse(date):
                                # TODO: can be incorrect because of multi-depot
                                log_error(
                                    'DATE ORDER IS INCORRECT - last=%s - found=%s'
                                    % (error_detection_last_date, date))
                            error_detection_last_date = date
                        if curr_stage == 'promulgation' and 'end' in data:
                            date = data['end']

                        good_urls.append({
                            'url': url,
                            'institution': institution,
                            'date': date,
                        })

        if not good_urls and item.text:
            # otherwise fall back on an url of slightly lower quality

            if 'Texte retiré par' in item.text:
                # texte retiré means all the previous steps become useless except the depot
                data['steps'] = [
                    prev_step for prev_step in data['steps']
                    if prev_step.get('step') == 'depot'
                ]
                continue
            elif 'Texte rejeté par' in item.text:
                step['echec'] = "rejet"

            if 'source_url' not in step and not step.get('echec'):
                # find the text numbers in the block
                if curr_institution == 'senat' and step.get('date'):
                    url = guess_senate_text_url(item.text, step, data)
                    if url:
                        step['source_url'] = url

            if 'source_url' not in step:
                # use a report
                for link in item.select('.list-disc-02 a'):
                    if 'href' in link.attrs:
                        href = link.attrs['href']
                        href = pre_clean_url(href)
                        nice_text = link.text.lower().strip()
                        if nice_text == 'rapport' or nice_text == 'rapport général':
                            step['source_url'] = urljoin(url_senat, href)
                            break

            # last resort for the AN: build the url from the text number
            if 'source_url' not in step and step.get(
                    'institution'
            ) == 'assemblee' and 'assemblee_legislature' in data:
                legislature = data['assemblee_legislature']
                text_num_match = re.search(r'(Texte|Rapport)\s*n°\s*(\d+)',
                                           item.text, re.I)
                if text_num_match:
                    text_num = text_num_match.group(2)
                    url = None
                    if step.get('step') == 'commission':
                        url = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'
                    elif step.get('step') == 'depot':
                        if data.get('proposal_type') == 'PJL':
                            url = 'http://www.assemblee-nationale.fr/{}/projets/pl{:04d}.asp'
                        else:
                            url = 'http://www.assemblee-nationale.fr/{}/propositions/pion{:04d}.asp'
                    elif step.get('step') == 'hemicycle':
                        url = 'http://www.assemblee-nationale.fr/{}/ta/ta{:04d}.asp'

                    if url:
                        step['source_url'] = url.format(
                            legislature, int(text_num))

            if 'source_url' not in step and not step.get('echec'):
                log_error(
                    'ITEM WITHOUT URL TO TEXT - %s.%s.%s' %
                    (step['institution'], step.get('stage'), step.get('step')))

        # Decision Conseil Constitutionnel
        if curr_stage == 'constitutionnalité':
            # we try to find the decision in the paragraph or at the top of the dosleg
            decision_text = item.text
            if cc_line:
                decision_text += cc_line.text

            # the order matters: 'conforme' is contained in the other labels
            if 'partiellement conforme' in decision_text:
                step['decision'] = 'partiellement conforme'
            elif 'se déclare incompétent' in decision_text:
                step['decision'] = 'se déclare incompétent'
            elif 'non conforme' in decision_text:
                step['decision'] = 'non conforme'
            elif 'conforme' in decision_text:
                step['decision'] = 'conforme'
            else:
                log_error('WARNING: NO DECISION FOR CC')

        # look for Table de concordance
        if curr_stage == 'promulgation':
            for a in item.select('a'):
                if 'table de concordance' in a.text.lower():
                    table, errors = parse_table_concordance(
                        clean_url(urljoin(url_senat, a.attrs['href'])))
                    data['table_concordance'] = table
                    if errors:
                        data['table_concordance_confusing_entries'] = errors

        # CMP commission has two urls: one for the Senate and one for the AN
        if step.get('stage') == 'CMP' and step.get('step') == 'commission':
            match = re.search(
                r"numéro de dépôt à l'Assemblée Nationale : (\d+)",
                clean_spaces(item.text))
            if match:
                text_num = int(match.group(1))
                step['cmp_commission_other_url'] = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'\
                    .format(data['assemblee_legislature'], text_num)

        # one step per good url, or the bare step when there is none
        steps_to_add = []
        if good_urls:
            for url in good_urls:
                sub_step = dict(**step)  # dubstep
                sub_step['source_url'] = url['url']
                sub_step['institution'] = url['institution']
                if url['date']:
                    sub_step['date'] = url['date']
                steps_to_add.append(sub_step)
        else:
            steps_to_add.append(step)

        # remove CMP.CMP.hemicycle if it's a fail
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            if not good_urls:
                last_step = data['steps'][-1]
                if data['steps'][-1].get('stage') == 'CMP' and step.get(
                        'step') == 'hemicycle':
                    if 'désaccord' in section_title:
                        last_step['echec'] = 'echec'
                    else:
                        log_error(
                            'CMP.hemicycle with no links and no fail indicated'
                        )
                    continue
            elif len(good_urls) != 2:
                log_error('CMP.hemicycle WITHOUT BOTH SENAT AND ASSEMBLEE')
                # todo: add empty missing step
                institutions_found = [url['institution'] for url in good_urls]
                if 'assemblee' not in institutions_found:
                    sub_step = dict(**step)  # dubstep
                    sub_step['source_url'] = None
                    sub_step['institution'] = 'assemblee'
                    steps_to_add.append(sub_step)

        # clean urls
        # (step is rebound here: below it is the last of steps_to_add)
        for step in steps_to_add:
            url = step.get('source_url')
            if url:
                step['source_url'] = clean_url(url)

        if len(steps_to_add) > 1:
            # multi-depot
            if step.get('step') == 'depot' and step.get(
                    'institution') == 'senat':
                # put real text as last depot
                steps_to_add = sorted(
                    steps_to_add,
                    key=lambda step: 1 if data.get('senat_id', '') in step.get('source_url', '') else 0)
                # if we are in a later step, the others depot steps must go at the top
                if len(data['steps']) > 0:
                    data['steps'] = steps_to_add[:-1] + data['steps']
                    steps_to_add = steps_to_add[-1:]
            # there can be multiple texts inside an hemicycle step, ok for CMP and multi-depots but not ok for other steps
            elif step.get('stage') != 'CMP':
                log_error(
                    'MULTIPLE TEXTS BUT NOT CMP.hemicycle - %s.%s.%s' %
                    (step['institution'], step.get('stage'), step.get('step')))
                steps_to_add = [steps_to_add[-1]]

        data['steps'] += steps_to_add

    # if there's not url for the AN dosleg, try to find it via the texts links
    if 'url_dossier_assemblee' not in data:
        an_url = find_an_url(data)
        if an_url:
            data['url_dossier_assemblee'] = an_url

    return data
def parse(html, url_senat=None, logfile=sys.stderr):
    """Parse the HTML of a Senate legislative dossier page into a dict.

    Args:
        html: HTML source of the dossier page.
        url_senat: URL of the dossier on the Senate website. When falsy, it is
            looked up in an HTML comment of the form
            ``<!-- URL_SENAT=XXXX !-->``; if none is found,
            ``http://www.senat.fr/`` is used as the base for relative links.
        logfile: file-like object receiving the ``## ERROR ###`` diagnostics.

    Returns:
        A dict describing the dossier (``short_title``, ``long_title``,
        ``urgence``, ``steps``, ... depending on what the page contains), or
        ``None`` when the page has no title block or no timeline shortcuts.
        Each entry of ``data['steps']`` is a dict carrying ``date``,
        ``institution``, ``stage`` and, when available, ``step``,
        ``source_url``, ``echec`` or ``decision``.
    """
    data = {}

    def log_error(error):
        print('## ERROR ###', error, file=logfile)

    soup = BeautifulSoup(html, 'html5lib')

    data['short_title'] = clean_spaces(soup.select_one('.title-dosleg').text.strip())

    if not soup.select('.title .subtitle-01'):
        log_error('NO TITLE - MAYBE A REDIRECT ?')
        return

    title_lines = soup.select_one('.title .subtitle-01').text.strip()
    # remove " :" at the end of the line
    data['long_title_descr'] = clean_spaces(title_lines.split('\n')[0][:-2])
    data['long_title'] = clean_spaces(soup.find("meta", {"name": "Description"})['content'])

    promulgee_line = None
    ordonnance_line = None
    acceleree_line = None
    cc_line = None
    for line in soup.select('.title .list-disc-03 li'):
        if ' parue ' in line.text:
            promulgee_line = line
        elif 'ordonnance' in line.text:
            ordonnance_line = line
        elif 'accélérée' in line.text or 'Urgence déclarée' in line.text:
            acceleree_line = line
        elif 'Décision du Conseil constitutionnel' in line.text:
            cc_line = line
        else:
            log_error('UNKNOWN SUBTITLE: %s' % line.text)

    if promulgee_line:
        # promulgation
        data['law_name'] = clean_spaces(promulgee_line.find('strong').text.strip())
        # publication in the JO
        data['end'] = format_date(
            promulgee_line.text.split('JO ')[-1].split('du ')[-1].split('(')[0].strip())
        if promulgee_line.find('a'):
            data['url_jo'] = clean_url(promulgee_line.find('a').attrs['href'])
            url_jo_params = parse_qs(urlparse(data['url_jo']).query)
            if 'cidTexte' in url_jo_params:
                data['legifrance_cidTexte'] = url_jo_params['cidTexte'][0]
        else:
            log_error('NO JO LINK')
    # TOPARSE: ordonnance_line
    # TOPARSE: CC decision

    data['urgence'] = (acceleree_line is not None
                       or 'procédure accélérée engagée par le' in title_lines)

    if not url_senat:
        # the url is in a comment like "<!-- URL_SENAT=XXXX !-->" for downloaded pages
        comment = soup.find(text=lambda text: isinstance(text, Comment) and 'URL_SENAT' in text)
        if comment:
            url_senat = comment.split('=')[1].strip()

    if url_senat:
        data['url_dossier_senat'] = clean_url(url_senat)
        data['senat_id'] = data['url_dossier_senat'].split('/')[-1].replace('.html', '')
    else:
        url_senat = 'http://www.senat.fr/'

    tableau_comparatif = soup.select_one('.button-tableau-comparatifs')
    if tableau_comparatif:
        data['tableau_comparatif_url'] = clean_url(
            urljoin(url_senat, tableau_comparatif.attrs['href']))

    # purpose of the text (very basic); if several blocks match, the last one wins
    for div in soup.select('#main div.scroll'):
        if div.find('h3') and 'Objet du texte' in div.find('h3').text:
            data['objet_du_texte'] = div.text.replace('Objet du texte\n', '') \
                .replace("Lire le billet de l'Espace presse", '').strip()

    # TODO: dodgy selector?
    for link in soup.select('h4.title.title-06.link-type-02 a'):
        if 'Assemblée' in link.text:
            url_an = link.attrs['href']
            if 'documents/index-' not in url_an:
                data['url_dossier_assemblee'] = clean_url(url_an)
                legislature, data['assemblee_slug'] = parse_national_assembly_url(
                    data['url_dossier_assemblee'])
                if legislature:
                    data['assemblee_legislature'] = legislature
                else:
                    log_error('NO LEGISLATURE IN AN LINK: ' + url_an)
                # '%d' cannot format a missing legislature: only build the id
                # when the legislature is known instead of raising TypeError
                if 'assemblee_legislature' in data:
                    data['assemblee_id'] = '%d-%s' % (data['assemblee_legislature'],
                                                      data['assemblee_slug'])
            else:
                log_error('INVALID URL AN: ' + url_an)

    data['steps'] = []
    steps_shortcuts = soup.select('.list-timeline li')  # icons on top
    if not steps_shortcuts:
        log_error('VERY SPECIAL CASE - PAS DE NAVETTES NORMALES')
        return

    themes_box = soup.select_one('#box-themes')
    if themes_box:
        data['themes'] = [x.text.strip() for x in themes_box.select('.theme')]

    old_procedure_markers = (
        'financement de la sécurité',
        'règlement des comptes',
        'règlement du budget',
        'approbation des comptes',
        'loi de finances rectificative',
        'loi de financement rectificative',
        'de loi constitutionnelle',
    )
    if any(marker in data['long_title'] for marker in old_procedure_markers):
        data['use_old_procedure'] = True
    if 'plfss' in data.get('senat_id', '') or 'pjlf' in data.get('senat_id', ''):
        data['use_old_procedure'] = True

    if 'pjl' in data.get('senat_id', '') or 'plfss' in data.get('senat_id', ''):
        data['proposal_type'] = 'PJL'
    elif 'ppl' in data.get('senat_id', ''):
        data['proposal_type'] = 'PPL'
    else:
        log_error('UNKNOWN PROPOSAL TYPE (PPL/PJL)')

    # detail blocks of the timeline, matched by index with the shortcuts
    steps_contents = [
        item for item in soup.select('#box-timeline > div div')
        if 'timeline-' in item.attrs.get('id', '')
    ]

    # these three carry over from one shortcut to the next
    curr_institution = None
    curr_stage = None
    error_detection_last_date = None
    for timeline_index, step_shortcut in enumerate(steps_shortcuts):
        step = {}

        item = BeautifulSoup('', 'lxml')  # no info block for steps in the future
        if len(steps_contents) > timeline_index:
            item = steps_contents[timeline_index]

        # text of the closest preceding <h3> sibling, '' when there is none
        section_title = item
        while section_title.previous_sibling and section_title.previous_sibling.name != 'h3':
            section_title = section_title.previous_sibling
        section_title = section_title.previous_sibling.text if section_title.previous_sibling else ''

        step['date'] = None
        if step_shortcut.select('em'):
            date_text = step_shortcut.select('em')[-1].text.strip()
            if '/' in date_text:
                step['date'] = format_date(date_text)
        if not step['date'] and item.text:
            # TODO: date sometimes is not on the shortcut
            log_error('SHORCUT WITHOUT DATE')

        if 'beginning' not in data and step['date']:
            data['beginning'] = step['date']

        # TODO review this part
        step_shorcut_infos = step_shortcut.select_one('a[title]').attrs['title'] \
            .split('|')[-1].split('-')
        step_step = step_shorcut_infos[-1].lower().strip()
        if 'commission' in step_step:
            step_step = 'commission'
        elif 'séance' in step_step:
            step_step = 'hemicycle'

        # TODO: this whole thing looks odd to me
        # stage = 1ere lecture|2eme lecture|CMP
        # institution = assemblee|senat|CMP|gouvernement
        # step = depot|commission|hemicycle
        if step_shortcut.select_one('em'):
            titre = step_shortcut.select_one('em').text.lower().strip()
            if titre == 'loi' or 'promulgation' in titre:
                curr_stage = 'promulgation'
            else:
                curr_stage = step_shorcut_infos[0].lower().strip()
                if curr_stage == 'cmp':
                    curr_stage = 'CMP'

            # sometimes the lecture info is in the date, why not ?
            # ex: http://www.senat.fr/dossier-legislatif/a82831259.html
            if 'lecture' in date_text:
                curr_stage = date_text

            img = step_shortcut.find('img').attrs['src']
            if 'picto_timeline_01_' in img:
                curr_institution = 'assemblee'
                step_step = 'depot'
            elif 'picto_timeline_02_' in img:
                curr_institution = 'senat'
                step_step = 'depot'
            elif 'picto_timeline_05_' in img:
                curr_institution = 'CMP'
                curr_stage = 'CMP'
                # there is no "depot" step for a CMP
                continue
            elif 'picto_timeline_03_' in img:
                step_step = 'commission'
            elif 'picto_timeline_04_' in img:
                step_step = 'hemicycle'
            elif 'picto_timeline_07_' in img:
                curr_institution = 'gouvernement'
            elif 'picto_timeline_09_' in img:
                # 'nouv. délib.' ex: http://www.senat.fr/dossier-legislatif/pjl02-182.html
                continue
            elif 'picto_timeline_10_' in img:
                curr_institution = 'congrès'

        if curr_stage == 'c. constit.':
            curr_institution = 'conseil constitutionnel'
            curr_stage = 'constitutionnalité'
            step_step = None

        # the picto can be the wrong one...also a depot step for a CMP doesn't make sense
        # ex: http://www.senat.fr/dossier-legislatif/taan99-406.html
        if curr_stage == 'CMP' and step_step == 'depot':
            curr_institution = 'CMP'
            log_error('DEPOT STEP FOR A CMP')
            continue

        # no commissions for l. définitive
        if curr_stage == 'l. définitive' and step_step == 'commission':
            continue

        step['institution'] = curr_institution

        # standardize on 1ère lecture / 2ème lecture
        # (curr_stage is still None when no shortcut carried a stage so far)
        if curr_stage:
            curr_stage = curr_stage.replace('eme', 'ème')

        step['stage'] = curr_stage
        if curr_stage not in ('constitutionnalité', 'promulgation'):
            step['step'] = step_step

        # fill in for special case like http://www.senat.fr/dossier-legislatif/csm.html
        if curr_institution == 'congrès' and not curr_stage:
            step['stage'] = 'congrès'
        if curr_institution == 'congrès' and not step_step:
            step['step'] = 'congrès'

        # skip the congrès when it is not an hemicycle step
        if step.get('step') == 'congrès':
            continue

        # add a legislature guess if missing
        if curr_institution == 'assemblee' and step['date']:
            legis = guess_legislature(step['date'])
            if legis:
                data['assemblee_legislature'] = legis

        good_urls = []
        if 'Texte renvoyé en commission' in item.text:
            step['echec'] = 'renvoi en commission'
        elif item.text:
            # find the texts
            for link in item.select('a'):
                line = link.parent

                if 'Lettre rectificative' in link.text:
                    continue

                if 'href' not in link.attrs:
                    continue

                href = link.attrs['href']
                nice_text = link.text.lower().strip()
                # TODO: assemblée "ppl, ppr, -a0" (to be checked)
                is_text_link = (
                    ('/leg/' in href and '/' not in href.replace('/leg/', '')
                     and 'avis-ce' not in href)
                    or nice_text in ('texte', 'texte de la commission',
                                     'décision du conseil constitutionnel')
                    or 'jo n°' in nice_text
                    # TODO: parse the whole block for date + url
                    # ex: http://www.senat.fr/dossier-legislatif/pjl08-641.html
                    or 'conseil-constitutionnel.fr/decision.' in href
                )
                if not is_text_link:
                    continue

                # if we detect a "texte de la commission" in an old procedure,
                # it means it's probably not the old procedure
                if data.get('use_old_procedure') and nice_text == 'texte de la commission' \
                        and step.get('stage') != 'CMP':
                    del data['use_old_procedure']

                # motion for a referendum for example
                # ex: http://www.senat.fr/dossier-legislatif/pjl12-349.html
                if '/leg/motion' in href:
                    continue
                href = pre_clean_url(href)

                url = urljoin(url_senat, href)
                line_text = line.text.lower()
                institution = curr_institution
                # TODO: be more specific, have a way to force the curr_institution
                if curr_stage != 'promulgation':
                    if 'par l\'assemblée' in line_text:
                        institution = 'assemblee'
                    elif 'par le sénat' in line_text:
                        institution = 'senat'
                    elif curr_stage == 'CMP' and step_step == 'hemicycle' \
                            and 'texte' in nice_text and not step.get('echec'):
                        if 'assemblee-nationale.fr' in href:
                            institution = 'assemblee'
                        else:
                            institution = 'senat'

                date = find_date(line_text, curr_stage)
                if date:
                    if error_detection_last_date \
                            and dateparser.parse(error_detection_last_date) > dateparser.parse(date):
                        # TODO: can be incorrect because of multi-depot
                        log_error('DATE ORDER IS INCORRECT - last=%s - found=%s'
                                  % (error_detection_last_date, date))
                    error_detection_last_date = date
                if curr_stage == 'promulgation' and 'end' in data:
                    date = data['end']

                good_urls.append({
                    'url': url,
                    'institution': institution,
                    'date': date,
                })

        if not good_urls and item.text:
            # otherwise fall back to a url of somewhat lower quality

            if 'Texte retiré par' in item.text:
                # texte retiré means all the previous steps become useless except the depot
                data['steps'] = [prev for prev in data['steps'] if prev.get('step') == 'depot']
                continue
            elif 'Texte rejeté par' in item.text:
                step['echec'] = "rejet"

            if 'source_url' not in step and not step.get('echec'):
                # find the text numbers in the block text
                if curr_institution == 'senat' and step.get('date'):
                    url = guess_senate_text_url(item.text, step, data)
                    if url:
                        step['source_url'] = url

            if 'source_url' not in step:
                # use a report
                for link in item.select('.list-disc-02 a'):
                    if 'href' in link.attrs:
                        href = pre_clean_url(link.attrs['href'])
                        nice_text = link.text.lower().strip()
                        if nice_text == 'rapport' or nice_text == 'rapport général':
                            step['source_url'] = urljoin(url_senat, href)
                            break

            if 'source_url' not in step and step.get('institution') == 'assemblee' \
                    and 'assemblee_legislature' in data:
                # build the AN url from the text/report number quoted in the block
                legislature = data['assemblee_legislature']
                text_num_match = re.search(r'(Texte|Rapport)\s*n°\s*(\d+)', item.text, re.I)
                if text_num_match:
                    text_num = text_num_match.group(2)
                    url = None
                    if step.get('step') == 'commission':
                        url = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'
                    elif step.get('step') == 'depot':
                        if data.get('proposal_type') == 'PJL':
                            url = 'http://www.assemblee-nationale.fr/{}/projets/pl{:04d}.asp'
                        else:
                            url = 'http://www.assemblee-nationale.fr/{}/propositions/pion{:04d}.asp'
                    elif step.get('step') == 'hemicycle':
                        url = 'http://www.assemblee-nationale.fr/{}/ta/ta{:04d}.asp'

                    if url:
                        step['source_url'] = url.format(legislature, int(text_num))

            if 'source_url' not in step and not step.get('echec'):
                log_error('ITEM WITHOUT URL TO TEXT - %s.%s.%s'
                          % (step['institution'], step.get('stage'), step.get('step')))

        # Decision of the Conseil Constitutionnel
        if curr_stage == 'constitutionnalité':
            # we try to find the decision in the paragraph or at the top of the dosleg
            decision_text = item.text
            if cc_line:
                decision_text += cc_line.text

            # order matters: the more specific wordings contain 'conforme'
            if 'partiellement conforme' in decision_text:
                step['decision'] = 'partiellement conforme'
            elif 'se déclare incompétent' in decision_text:
                step['decision'] = 'se déclare incompétent'
            elif 'non conforme' in decision_text:
                step['decision'] = 'non conforme'
            elif 'conforme' in decision_text:
                step['decision'] = 'conforme'
            else:
                log_error('WARNING: NO DECISION FOR CC')

        # look for Table de concordance
        if curr_stage == 'promulgation':
            for a in item.select('a'):
                if 'table de concordance' in a.text.lower():
                    table, errors = parse_table_concordance(
                        clean_url(urljoin(url_senat, a.attrs['href'])))
                    data['table_concordance'] = table
                    if errors:
                        data['table_concordance_confusing_entries'] = errors

        # CMP commission has two urls: one for the Senate and one for the AN
        if step.get('stage') == 'CMP' and step.get('step') == 'commission':
            match = re.search(r"numéro de dépôt à l'Assemblée Nationale : (\d+)",
                              clean_spaces(item.text))
            if match:
                if 'assemblee_legislature' in data:
                    text_num = int(match.group(1))
                    step['cmp_commission_other_url'] = (
                        'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'
                        .format(data['assemblee_legislature'], text_num))
                else:
                    # the AN url cannot be built without a legislature
                    log_error('CMP.commission WITHOUT KNOWN AN LEGISLATURE')

        steps_to_add = []
        if good_urls:
            # one step per text found, sharing everything but url/institution/date
            for url in good_urls:
                sub_step = dict(**step)  # dubstep
                sub_step['source_url'] = url['url']
                sub_step['institution'] = url['institution']
                if url['date']:
                    sub_step['date'] = url['date']
                steps_to_add.append(sub_step)
        else:
            steps_to_add.append(step)

        # remove CMP.CMP.hemicycle if it's a fail
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            if not good_urls:
                # guard against an empty step list before peeking at the last step
                if data['steps'] and data['steps'][-1].get('stage') == 'CMP' \
                        and step.get('step') == 'hemicycle':
                    last_step = data['steps'][-1]
                    if 'désaccord' in section_title:
                        last_step['echec'] = 'echec'
                    else:
                        log_error('CMP.hemicycle with no links and no fail indicated')
                    continue
            elif len(good_urls) != 2:
                log_error('CMP.hemicycle WITHOUT BOTH SENAT AND ASSEMBLEE')
                # todo: add empty missing step
                institutions_found = [url['institution'] for url in good_urls]
                if 'assemblee' not in institutions_found:
                    sub_step = dict(**step)  # dubstep
                    sub_step['source_url'] = None
                    sub_step['institution'] = 'assemblee'
                    steps_to_add.append(sub_step)

        # clean urls
        # NOTE: this loop rebinds `step`; the checks below therefore look at
        # the last entry of steps_to_add
        for step in steps_to_add:
            url = step.get('source_url')
            if url:
                step['source_url'] = clean_url(url)

        if len(steps_to_add) > 1:
            # multi-depot
            if step.get('step') == 'depot' and step.get('institution') == 'senat':
                # put real text as last depot
                steps_to_add = sorted(
                    steps_to_add,
                    key=lambda s: 1 if data.get('senat_id', '') in s.get('source_url', '') else 0)
                # if we are in a later step, the others depot steps must go at the top
                if len(data['steps']) > 0:
                    data['steps'] = steps_to_add[:-1] + data['steps']
                    steps_to_add = steps_to_add[-1:]
            # there can be multiple texts inside an hemicycle step, ok for CMP
            # and multi-depots but not ok for other steps
            elif step.get('stage') != 'CMP':
                log_error('MULTIPLE TEXTS BUT NOT CMP.hemicycle - %s.%s.%s'
                          % (step['institution'], step.get('stage'), step.get('step')))
                steps_to_add = [steps_to_add[-1]]

        data['steps'] += steps_to_add

    # if there's no url for the AN dosleg, try to find it via the texts links
    if 'url_dossier_assemblee' not in data:
        an_url = find_an_url(data)
        if an_url:
            data['url_dossier_assemblee'] = an_url

    return data