def process(output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    yesterday = time.time() - 86400
    dfile = 'lois_dites.json'
    destfile = os.path.join(output_directory, dfile)
    if not os.path.exists(destfile) or os.path.getmtime(destfile) < yesterday:
        common_laws = {
            l.id_legi: l.common_name
            for l in LawService().common_laws()
        }
        print_json(common_laws, destfile)
    else:
        common_laws = open_json(destfile)

    return common_laws
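# Hedged usage sketch (not part of the original file): how the cached loader
# above might be called. 'data' is a hypothetical output directory; the
# returned dict maps each id_legi to its common name ("loi dite") and is
# refreshed from LawService at most once per 24 hours via lois_dites.json.
common_laws = process('data')
print('%d common law names loaded' % len(common_laws))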
def process(dos, OUTPUT_DIR):
    def log_err(txt, arg=None):
        raise Exception(txt)

    for step_i, step in enumerate(dos['steps']):
        step['directory'] = get_step_id(step_i, step)
        step_dir = os.path.join(OUTPUT_DIR, os.path.join(step['directory'], 'texte'))

        articles = step.get('articles_completed', step.get('articles'))
        if not articles:
            continue

        mkdirs(step_dir)

        for data in articles:
            if not data or "type" not in data:
                log_err("JSON badly formatted, missing field type: %s" % data)
                sys.exit(1)
            if data["type"] == "texte":
                textid = data["id"]
                alldata = dict(data)
                alldata['sections'] = []
                alldata['articles'] = []
            elif textid == "":
                log_err("JSON missing first line with text infos")
                sys.exit(1)
            elif data["type"] == "section":
                alldata['sections'].append(data)
            elif data["type"] == "article":
                alldata['articles'].append(data)
            elif data["type"] == "echec":
                alldata['expose'] = data['texte']

        print_json(alldata, os.path.join(step_dir, 'texte.json'))
        step['texte.json'] = alldata

    return dos


if __name__ == '__main__':
    print_json(process(open_json(sys.argv[1]), 'test_out'))
""" % (get_node_id('CMP'), get_node_id('3ème lecture • assemblee'), get_node_id('3ème lecture • senat')) dot_result += """ { rank=same; %s; %s; } """ % (get_node_id('congrès • congrès'), get_node_id('constitutionnalité • conseil constitutionnel')) else: for stage in ['1ère lecture', '2ème lecture', '3ème lecture', 'CMP']: for step in ['depot', 'commission', 'hemicycle']: if stage == 'CMP' and step == 'commission': continue dot_result += (""" { rank=same; %s; %s; } """ % (get_node_id('%s • assemblee • %s' % (stage, step)), get_node_id('%s • senat • %s' % (stage, step)))) dot_result += '\n}' details = "_detailed" if mode == "detailed" else "" open('_steps%s.log' % details, 'w').write(steps_logs) print_json(step_trans, 'steps%s_transitions.json' % details) print(dot_result) # open('steps.dot','w').write(dot_result) # improve layout: https://stackoverflow.com/questions/11588667/how-to-influence-layout-of-graph-items
def process(dos):
    for step_i, step in enumerate(dos['steps']):
        articles = step.get('articles_completed', step.get('articles'))
        if not articles:
            continue
        for data in articles:
            if data["type"] == "article":
                data['liens'] = []
                for i in range(len(data["alineas"])):
                    text = data["alineas"]["%03d" % (i + 1)]
                    for candidat in metslesliens.donnelescandidats(text, 'structuré'):
                        if 'texte' in candidat:
                            link = text[candidat['index'][0]:candidat['index'][1]]
                            data['liens'].append(link)
                            """
                            data['liens'].append({
                                'url': 'https://duckduckgo.com/?q=!ducky+' + urllib.parse.quote_plus(link),
                                'texte': link,
                                'alinea': i,
                                # 'index': candidat['index'],
                            })
                            """
    return dos


if __name__ == '__main__':
    print_json(process(open_json(sys.argv[1])))
from monitoring_config import config
from measurement_tds_info import calculate_output_data
from common import print_json

print_json(calculate_output_data(config))
def callbackstore(ch, method, properties, body):
    common.sleep_random()
    print_method("store", method.routing_key)
    common.print_json(body)
    ch.basic_ack(delivery_tag=method.delivery_tag)
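# Hedged wiring sketch (assumption, not from the original consumer): how
# callbackstore might be registered with pika. The queue name 'store' and the
# localhost broker are hypothetical.
import pika

def consume_store(queue='store'):
    connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
    channel = connection.channel()
    # manual acks: callbackstore calls ch.basic_ack itself after processing
    channel.basic_consume(queue=queue, on_message_callback=callbackstore)
    channel.start_consuming()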
            out['articles'][id]['steps'] = []
            s = create_step(step_id, step['directory'], article)
            s['n_diff'] = 1
            s['diff'] = 'add'
            if nstep >= depots:
                s['status'] = 'new'
            else:
                s['status'] = 'depot'
            txt = "\n".join(s['text'])
            if s['status'] == 'sup':
                s['length'] = 50
                s['n_diff'] = 0
            else:
                s['length'] = len(txt)
            out['articles'][id]['steps'].append(s)

        if 'step' in step and not echec:
            old_step_id = step_id
    except Exception as e:
        sys.stderr.write("ERROR parsing step %s:\n%s: %s\n" % (step, type(e), e))
        exit(1)

for a in out['articles']:
    new_steps = []
    for s in out['articles'][a]['steps']:
        del(s['text'])
        new_steps.append(s)
    out['articles'][a]['steps'] = new_steps

print_json(out)
if amdt["groupe"] == "Gouvernement": stats["total_amendements_gouvernement"] += 1 else: stats["total_amendements_parlementaire"] += 1 stats["echecs_procedure"] = len( [step for step in dos['steps'] if step.get("echec")]) if 'end' in dos: stats["total_days"] = (datize(dos["end"]) - datize(dos["beginning"])).days + 1 first_text, first_arts, last_text, last_arts = find_first_and_last_texts( dos) stats["total_input_articles"] = len(first_arts) stats["total_output_articles"] = len(last_arts) stats["ratio_articles_growth"] = len(last_arts) / len(first_arts) stats["ratio_texte_modif"] = 1 - compute_similarity_by_articles( first_arts, last_arts) stats["input_text_length"] = len("\n".join(first_text)) stats["output_text_length"] = len("\n".join(last_text)) return stats if __name__ == '__main__': print_json(process(sys.argv[1], open_json(sys.argv[2])))
good_steps = {}
for _, a in articles.items():
    for s in a['steps']:
        stepid = s['directory']
        if stepid not in good_steps:
            good_steps[stepid] = int(s['id_step'][:2])

for i, s in enumerate(procedure['steps']):
    s['debats_order'] = None
    if s.get('has_interventions') and s['directory'] not in intervs:
        print("WARNING: removing nearly empty interventions steps for %s" % s['directory'], file=sys.stderr)
        s['has_interventions'] = False
    if 'directory' in s:
        if i == len(procedure['steps']) - 1 and not s['enddate']:
            s['debats_order'] = max(good_steps.values()) + 1
        else:
            s['debats_order'] = good_steps.get(s['directory'], None)
    if s.get('step', '') == 'depot' and s['debats_order'] is not None:
        if '/propositions/' in s.get('source_url', ''):
            s['auteur_depot'] = "Députés"
        elif '/leg/ppl' in s.get('source_url', ''):
            s['auteur_depot'] = "Sénateurs"
        else:
            s['auteur_depot'] = "Gouvernement"
    for field in dict(s):
        if field.endswith('_directory') or field.endswith('_files'):
            del s[field]

print_json(procedure)
def process(OUTPUT_DIR, procedure):
    context = Context([0, OUTPUT_DIR], load_parls=True)

    # ['Indéfini', 'Adopté', 'Irrecevable', 'Rejeté', 'Retiré', 'Tombe', 'Non soutenu', 'Retiré avant séance', 'Rectifié', 'Favorable', 'Satisfait']
    def simplify_sort(sort):
        sort = sort.lower()
        if sort in "adopté favorable":
            return "adopté"
        if sort in "rejeté ":
            return "rejeté"
        if sort in "indéfini":
            return "en attente"
        return "non-voté"

    re_clean_first = re.compile(r'^(.*?)(,| et) .*$')

    def first_author(signataires):
        if signataires is None or "gouvernement" in signataires.lower():
            return ""
        return re_clean_first.sub(r'\1, …', signataires)

    def find_groupe(amd):
        if amd['signataires'] and "gouvernement" in amd['signataires'].lower():
            return "Gouvernement"
        ct = {}
        maxc = 0
        result = ""
        for gpe in amd['groupes_parlementaires']:
            g = gpe['groupe']
            count = 1
            # the new api compacts the groups
            if ':' in g:
                g, count = gpe['groupe'].split(':')
                count = int(count)
            g = slug_groupe(g)
            if g not in ct:
                ct[g] = 0
            ct[g] += count
            if ct[g] > maxc:
                maxc = ct[g]
                result = g
        return result

    def add_link(links, pA, pB, weight=1):
        p1 = min(pA, pB)
        p2 = max(pA, pB)
        linkid = "%s-%s" % (p1, p2)
        if linkid not in links:
            links[linkid] = {
                "1": p1,
                "2": p2,
                "w": 0
            }
        links[linkid]["w"] += weight

    article_number_regexp = re.compile(r'article (1er.*|(\d+).*)$', re.I)

    def sort_amendements(texte, amendements):
        articles = {}
        for article in texte:
            if article['type'] == 'article':
                titre = article.get('titre')
                if titre:
                    articles[titre.lower()] = article.get('order') * 10

        def solveorder(art):
            nonlocal articles
            art = art.lower()
            order = 10000
            if art == 'titre' or art.startswith('intitul'):
                return 0
            elif art.startswith('motion'):
                return 1
            elif art.startswith('projet') \
                    or art.startswith('proposition') \
                    or art.startswith('texte'):
                return 5
            else:
                m = article_number_regexp.search(art)
                if m:
                    if articles.get(m.group(1)):
                        order = articles.get(m.group(1))
                    elif articles.get(m.group(2)):
                        order = articles.get(m.group(2))
                    if 'avant' in art:
                        order -= 1
                    elif 'après' in art or 'apres' in art:
                        order += 1
            return order

        for amendement in amendements:
            amdt = amendement['amendement']
            amdt['ordre_article'] = solveorder(amdt['sujet'])

        return amendements

    CACHE_BUSTING = 'cache=%d' % time()
    if 'url_jo' in procedure:
        CACHE_BUSTING = 'cache=5feb2018'  # fixed cache busting for promulgated laws

    steps = {}
    last_text_id, last_text_typeparl = None, None
    steps = procedure['steps']
    for i, step in enumerate(steps):
        print(' * step -', step.get('stage'), step.get('step'), step.get('source_url'))
        if step.get('step') not in ('commission', 'hemicycle'):
            continue
        if step.get('step') == 'commission' and step.get('stage') == 'CMP':
            continue
        if i == 0:
            continue

        last_step_index = get_previous_step(steps, i, is_old_procedure=procedure.get('use_old_procedure'))
        last_step = steps[last_step_index]
        last_step_with_good_text_number = steps[get_previous_step(
            steps, i,
            is_old_procedure=procedure.get('use_old_procedure'),
            get_depot_step=True,
        )]
        texte_url = last_step_with_good_text_number.get('source_url')

        if step.get('stage') != 'CMP' and last_step_with_good_text_number.get('institution') != step.get('institution'):
            print('ERROR - last step is from another institution', file=sys.stderr)
            continue

        # for a CMP hemicycle we have to get the right text inside the CMP commission
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            urls = [last_step.get('source_url')]
            if 'cmp_commission_other_url' in last_step:
                urls.append(last_step.get('cmp_commission_other_url'))
            an_url = [url for url in urls if 'nationale.fr' in url]
            senat_url = [url for url in urls if 'senat.fr' in url]
            if step.get('institution') == 'assemblee' and an_url:
                texte_url = an_url[0]
            elif step.get('institution') == 'senat' and senat_url:
                texte_url = senat_url[0]
            else:
                print('WARNING - missing the CMP commission text for', step.get('source_url'), file=sys.stderr)
                continue

        if texte_url is None:
            print('ERROR - no texte url', step.get('source_url'), file=sys.stderr)
            continue

        texte = open_json(os.path.join(context.sourcedir, 'procedure', last_step['directory']), 'texte/texte.json')

        amdt_url = None
        if "nationale.fr" in texte_url:
            if 'assemblee_legislature' not in procedure:
                print('  + no AN legislature - pass text')
                continue
            amdt_url = 'https://nosdeputes.fr/%s/amendements/%s/json?%s' % (
                procedure.get('assemblee_legislature'), get_text_id(texte_url), CACHE_BUSTING)
        elif "senat.fr" in texte_url:
            amdt_url = 'https://nossenateurs.fr/amendements/%s/json?%s' % (get_text_id(texte_url), CACHE_BUSTING)

        if amdt_url is None:
            continue

        print(' * downloading amendments:', amdt_url, 'for', texte_url)

        amendements_src = download(amdt_url).json().get('amendements', [])

        # TA texts can be zero-paded or not (TA0XXX or TAXXX), we try both
        if 'amendements/TA' in amdt_url:
            textid = get_text_id(texte_url)
            if 'TA0' in textid:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').lstrip('0'))
            else:
                alternative_url = amdt_url.replace(textid, 'TA' + textid.replace('TA', '').zfill(4))
            print(' WARNING: TA - trying alternative url too', alternative_url)
            amendements_src += download(alternative_url).json().get('amendements', [])

        print('   parsing amendments:', len(amendements_src))

        # ignore amendments if they are not for the correct step
        amendements_src_filtered = []
        for amd in amendements_src:
            a = amd['amendement']
            if step.get('institution') == 'assemblee':
                # commission amendments can have two forms
                #    - /amendements/LOI/NUM.asp (13th legislature)
                #    - /amendements/LOI/COMMISSION_NAME/NUM.asp (14+ legislature)
                # hemicycle amendments are:
                #    - /amendements/LOI/NUM.asp (13th legislature)
                #    - /amendements/LOI/AN/NUM.asp (14+ legislature)
                amdt_step = 'hemicycle'
                if '/cr-' in a['source']:
                    amdt_step = 'commission'
                else:
                    url_parts = a['source'].split('amendements/')[1].split('/')
                    if len(url_parts) == 3 and url_parts[1] != 'AN':
                        amdt_step = 'commission'
            elif step.get('institution') == 'senat':
                amdt_step = 'commission' if '/commissions/' in a['source'] else 'hemicycle'
            else:
                # CMP - there's no way for now to distinguish the step
                amdt_step = step['step']
            if step['step'] != amdt_step:
                continue
            amendements_src_filtered.append(amd)

        if len(amendements_src_filtered) != len(amendements_src):
            print('WARNING: amendments ignored (not the right step) %s' %
                  (len(amendements_src) - len(amendements_src_filtered)), file=sys.stderr)

        amendements_src = amendements_src_filtered

        step['nb_amendements'] = len(amendements_src)

        if len(amendements_src) > 0:
            amendements_src = sort_amendements(texte['articles'], amendements_src)

            typeparl, urlapi = identify_room(texte_url,
                legislature=step.get('assemblee_legislature', procedure.get('assemblee_legislature')))

            sujets = {}
            groupes = {}

            fix_order = False
            orders = []
            parls = {}
            links = {}
            idents = {}

            for amd in amendements_src:
                a = amd['amendement']
                if "sort" not in a:
                    print('WARNING: amendment has no sort %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if a["sort"] == "Rectifié":
                    continue
                if "sujet" not in a or not a["sujet"]:
                    if a["sort"] not in ["Irrecevable", "Retiré avant séance"]:
                        print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                key = a['sujet']
                if not key:
                    print('WARNING: amendment has no subject %s\n' % a['url_nos%ss' % typeparl], file=sys.stderr)
                    continue
                if key not in sujets:
                    orders.append(key)
                    sujets[key] = {
                        'titre': key,
                        'order': a['ordre_article'],
                        'amendements': []
                    }
                    if a['ordre_article'] > 9000:
                        fix_order = True

                gpe = find_groupe(a)
                if not gpe:
                    if a["sort"] != "Irrecevable":
                        sys.stderr.write('WARNING: no groupe found for %s\n' % a['url_nos%ss' % typeparl])
                    gpe = "Inconnu"
                context.add_groupe(groupes, gpe, urlapi)

                sujets[key]['amendements'].append({
                    'numero': a['numero'],
                    'date': a['date'],
                    'sort': simplify_sort(a['sort']),
                    'groupe': gpe,
                    'id_api': a['id'],
                    'aut': first_author(a['signataires'])
                })

                cosign = []
                hmd5 = a["cle_unicite"]
                if hmd5 not in idents:
                    idents[hmd5] = []
                for parll in a["parlementaires"]:
                    parl = parll["parlementaire"]
                    if parl not in parls:
                        p = context.get_parlementaire(urlapi, parl)
                        parls[parl] = {
                            "i": p["id"],
                            "s": parl,
                            "a": 0,
                            "n": p["nom"],
                            "g": p["groupe_sigle"],
                            "p": p["place_en_hemicycle"]
                        }
                    pid = parls[parl]["i"]
                    parls[parl]["a"] += 1
                    for cid in cosign:
                        add_link(links, pid, cid)
                        # add_link(links, pid, cid, 2)
                    cosign.append(pid)
                    for cid in idents[hmd5]:
                        add_link(links, pid, cid)
                    idents[hmd5].append(pid)

            if fix_order:
                orders.sort(key=cmp_to_key(compare_articles))
                for i, k in enumerate(orders):
                    sujets[k]["order"] = i

            amdtsfile = os.path.join(context.sourcedir, 'viz', 'amendements_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'api_root_url': amdapi_link(urlapi),
                    'groupes': groupes,
                    'sujets': sujets}
            print_json(data, amdtsfile)

            linksfile = os.path.join(context.sourcedir, 'viz', 'amendements_links_%s.json' % step['directory'])
            data = {'id_step': step['directory'],
                    'links': list(links.values()),
                    'parlementaires': dict((p["i"], dict((k, p[k]) for k in "psang")) for p in list(parls.values()))}
            # print_json(data, linksfile)

        ########### INTERVENTIONS #############
        # TODO: move this to a dedicated file
        print(' * downloading interventions')
        typeparl, urlapi = identify_room(texte_url,
            legislature=step.get('assemblee_legislature', procedure.get('assemblee_legislature')))
        inter_dir = os.path.join(context.sourcedir, 'procedure', step['directory'], 'interventions')
        commission_or_hemicycle = '?commission=1' if step.get('step') == 'commission' else '?hemicycle=1'
        # TODO: TA texts can be zero-paded or not (TA0XXX or TAXXX), we should try both
        seance_name = None
        intervention_files = []

        texts = (get_text_id(texte_url),)
        if last_text_typeparl == typeparl:
            texts = (get_text_id(texte_url), last_text_id)

        for loiid in texts:
            url_seances = 'https://{}.fr/seances/{}/json{}'.format(urlapi, loiid, commission_or_hemicycle)
            print('   * downloading seances - ', url_seances)
            for id_seance_obj in sorted(download(url_seances).json().get('seances', []), key=lambda x: x["seance"]):
                url_seance = 'https://{}.fr/seance/{}/{}/json'.format(urlapi, id_seance_obj['seance'], loiid)
                print('      downloading seance - ', url_seance)
                resp = download(url_seance).json()
                if resp.get('seance'):
                    inter = resp.get('seance')[0]['intervention']
                    seance_name = inter['date'] + 'T' + inter['heure'] + '_' + inter['seance_id']
                    print('        dumping seance -', seance_name)
                    intervention_files.append(seance_name)
                    if not os.path.exists(inter_dir):
                        os.makedirs(inter_dir)
                    print_json(resp, os.path.join(inter_dir, seance_name + '.json'))
            if seance_name:
                step['has_interventions'] = True
                step['intervention_files'] = intervention_files
                break

        last_text_id = get_text_id(texte_url)
        last_text_typeparl = typeparl

    return procedure
                ali_num += 1
                article["alineas"]["%03d" % ali_num] = line
            else:
                # metas
                continue

    if article is not None:
        save_text(texte)
        pr_js(article)

    return all_articles


if __name__ == '__main__':
    if '--test' not in sys.argv:
        print_json(parse(sys.argv[1]))
    else:
        def assert_eq(x, y):
            if x != y:
                print(repr(x), "!=", repr(y))
                raise Exception()

        # keep the dots
        assert_eq(clean_html('<i>....................</i>'), '<i>....................</i>')
        # but remove them for status
        assert_eq(clean_html('...........Conforme.........'), 'Conforme')
        assert_eq(clean_html('...……......……..Conforme....……...…….'), 'Conforme')
        # even with spaces