Example #1
0
def download_historic_dosleg(url):
    """Download an old-format AN dossier page.

    When the response URL contains '/dyn/' (the request was redirected),
    the page is fetched from the regardscitoyens GitHub archive instead and
    the response's ``url`` is set to the old-style dossier URL.

    The returned response always has its encoding set to Windows-1252.
    """
    response = download(url)

    if '/dyn/' not in response.url:
        response.encoding = 'Windows-1252'
        return response

    # fallback to backed-up doslegs when the redirect is forced
    legislature, slug = parse_national_assembly_url(url)
    legacy_url = AN_OLD_URL_TEMPLATE.format(legislature=legislature,
                                            slug=slug)
    archive_path = legacy_url.split('.fr/')[1]
    response = download(
        'https://raw.githubusercontent.com/regardscitoyens/archive-AN-doslegs/master/archive/'
        + archive_path)
    response.url = legacy_url
    response.encoding = 'Windows-1252'
    return response
def dump_error_log(url, exception, api_dir, logdir, log):
    """Save the captured log of a failed parsing and raise the failure.

    The log file name is derived from ``url``: "<legislature>-<slug>" for
    an assemblee-nationale URL, the last path component without ".html"
    for a senat.fr URL, and the URL stripped of its slashes otherwise.

    Args:
        url: URL whose parsing failed.
        exception: the original exception, forwarded to the raised one.
        api_dir: base directory; the log goes to ``api_dir/logdir``.
        logdir: log directory, relative to ``api_dir``.
        log: object exposing ``getvalue()`` with the text to write.

    Raises:
        ParsingFailedException: always, built from ``exception`` and the
            log path relative to ``api_dir``.
    """
    if 'assemblee-nationale' in url:
        log_name = "%s-%s" % parse_national_assembly_url(url)
    elif 'senat.fr' in url:
        log_name = url.split('/')[-1].replace('.html', '')
    else:
        log_name = url.replace('/', '')

    target_dir = os.path.join(api_dir, logdir)
    mkdirs(target_dir)
    target_file = os.path.join(target_dir, log_name)

    with open(target_file, 'w') as handle:
        handle.write(log.getvalue())

    print('[error] parsing of', url, 'failed. Details in', target_file)

    raise ParsingFailedException(exception, os.path.join(logdir, log_name))
def dump_error_log(url, exception, api_dir, logdir, log):
    """Write the parsing log to ``api_dir/logdir`` then raise.

    The file is named after the dossier: "<legislature>-<slug>" when the
    URL points to assemblee-nationale, the page name without ".html" when
    it points to senat.fr, else the URL with every "/" removed.

    Always raises ParsingFailedException carrying ``exception`` and the
    path of the log file relative to ``api_dir``.
    """
    is_an_url = 'assemblee-nationale' in url
    is_senat_url = 'senat.fr' in url

    if is_an_url:
        identifier = "%s-%s" % parse_national_assembly_url(url)
    elif is_senat_url:
        page_name = url.split('/')[-1]
        identifier = page_name.replace('.html', '')
    else:
        identifier = url.replace('/', '')

    directory = os.path.join(api_dir, logdir)
    mkdirs(directory)
    destination = os.path.join(directory, identifier)

    with open(destination, 'w') as out:
        out.write(log.getvalue())

    print('[error] parsing of', url, 'failed. Details in', destination)

    relative_path = os.path.join(logdir, identifier)
    raise ParsingFailedException(exception, relative_path)
def parse(url, logfile=sys.stderr, cached_opendata_an=None):
    """Build a dossier description from the AN open data export.

    The export of the legislature found in ``url`` is scanned for the
    legislative dossier whose "<legislature>/dossiers/<titreChemin>" path
    ends ``url``; its acts are then converted into a flat list of steps.

    Args:
        url: URL of the National Assembly dossier (old or new format).
        logfile: file-like object receiving warnings and errors.
        cached_opendata_an: optional mapping legislature -> already loaded
            export. The export is downloaded when the legislature is not
            in the mapping. The mapping is only read, never modified.

    Returns:
        A dict describing the first matching dossier (keys such as
        "steps", "urgence", "long_title", "url_dossier_assemblee",
        "assemblee_id", and when available "url_dossier_senat",
        "proposal_type", "url_jo", "end", "beginning"), or an empty list
        when no dossier matches.
    """

    def _log(*args):
        print(*args, file=logfile)

    # avoid a shared mutable default argument
    if cached_opendata_an is None:
        cached_opendata_an = {}

    legislature, _ = parse_national_assembly_url(url)
    if legislature and legislature in cached_opendata_an:
        dossiers_json = cached_opendata_an[legislature]
    else:
        dossiers_json = download_open_data_doslegs(legislature)

    # index the texts by uid to resolve the references found in the acts
    docs = {
        doc["uid"]: doc
        for doc in dossiers_json["export"]["textesLegislatifs"]["document"]
    }

    for dossier in dossiers_json["export"]["dossiersLegislatifs"]["dossier"]:
        dossier = dossier["dossierParlementaire"]

        if dossier["@xsi:type"] != "DossierLegislatif_Type":
            continue

        titreChemin = dossier["titreDossier"]["titreChemin"]

        # find the right dosleg even if it's an old url
        url_common_part = "{}/dossiers/{}".format(dossier["legislature"],
                                                  titreChemin)
        if not url.endswith(url_common_part):
            continue
        # from here on `url` is the canonical dossier URL; other URLs get
        # their own names so the final warning logs the right one
        url = "http://www.assemblee-nationale.fr/dyn/{}".format(
            url_common_part)

        data = {}
        data["urgence"] = False
        url_senat = dossier["titreDossier"]["senatChemin"]
        if url_senat:
            data["url_dossier_senat"] = clean_url(url_senat)
        data["long_title"] = dossier["titreDossier"]["titre"]
        data["url_dossier_assemblee"] = clean_url(url)
        data["assemblee_legislature"] = int(dossier["legislature"])
        data["assemblee_slug"] = dossier["titreDossier"]["titreChemin"]
        data["assemblee_id"] = "%s-%s" % (dossier["legislature"],
                                          data["assemblee_slug"])

        if dossier["procedureParlementaire"]["libelle"] in (
                "Projet de loi de finances de l'année",
                "Projet de loi de financement de la sécurité sociale",
                "Projet de loi de finances rectificative",
                "Projet ou proposition de loi constitutionnelle",
        ):
            data['use_old_procedure'] = True

        data["steps"] = []
        start_step = None
        for etape in to_arr(dossier["actesLegislatifs"]["acteLegislatif"]):
            for path, sous_etape in yield_leafs(etape):
                xsi_type = sous_etape["@xsi:type"]
                if xsi_type in ("EtudeImpact_Type",
                                "DepotAvisConseilEtat_Type"):
                    continue

                step = {}

                date = sous_etape.get("dateActe")
                if date:
                    # keep only the day part of the timestamp
                    step["date"] = date.split("T")[0]

                if xsi_type == "ProcedureAccelere_Type":
                    data["urgence"] = True
                    continue
                elif xsi_type == "Promulgation_Type":
                    url_jo = clean_url(
                        sous_etape.get("urlLegifrance")
                        or sous_etape["infoJO"]["urlLegifrance"])
                    data["url_jo"] = url_jo
                    data["end"] = step["date"]

                    step["institution"] = "gouvernement"
                    step["stage"] = "promulgation"
                    step["source_url"] = url_jo
                    data["steps"].append(step)
                    continue
                elif xsi_type == "ConclusionEtapeCC_Type":
                    step["institution"] = "conseil constitutionnel"
                    step["stage"] = "constitutionnalité"
                    step["source_url"] = clean_url(sous_etape["urlConclusion"])
                    data["steps"].append(step)
                    # no `continue`: the act still goes through the generic
                    # codeActe handling below

                if "textesAssocies" in sous_etape:
                    # TODO review
                    sous_etape["texteAssocie"] = to_arr(
                        sous_etape["textesAssocies"]
                        ["texteAssocie"])[0]["refTexteAssocie"]

                # an act without codeActe ends up in the "unknown step type"
                # branch instead of raising a TypeError on `in`
                code = sous_etape.get("codeActe") or ""

                if "AVIS-RAPPORT" in code or code == 'CMP-DEPOT':
                    continue
                if '-DPTLETTRECT' in code:
                    continue

                if code.startswith("AN"):
                    step["institution"] = "assemblee"
                elif code.startswith("SN"):
                    step["institution"] = "senat"

                if "-DEPOT" in code:
                    step["step"] = "depot"
                elif "-COM" in code:
                    step["step"] = "commission"
                elif "-DEBATS" in code:
                    step["step"] = "hemicycle"
                else:
                    _log("  - WARNING Unknown step type", code)
                    continue

                if "1-" in code:
                    step["stage"] = "1ère lecture"
                elif "2-" in code:
                    step["stage"] = "2ème lecture"
                elif "3-" in code:
                    step["stage"] = "3ème lecture"  # TODO: else libelleCourt
                elif "NLEC-" in code:
                    step["stage"] = "nouv. lect."
                elif "ANLDEF-" in code:
                    step["stage"] = "l. définitive"
                    if step["step"] == "commission":
                        continue
                elif "CMP-" in code:
                    step["stage"] = "CMP"
                    if "-AN" in code:
                        step["institution"] = "CMP"
                    elif "-SN" in code:
                        step["institution"] = "senat"
                        if "RAPPORT-SN" in code:
                            # ignore the cmp_commission_other_url for now
                            continue
                    else:
                        step["institution"] = "CMP"
                elif "ANLUNI-" in code:
                    step["stage"] = "l. unique"

                step["id_opendata"] = sous_etape["uid"]

                # keep first step for a step-type (ex: first hemiycle)
                if start_step is None or not same_stage_step_instit(
                        start_step, step):
                    start_step = step

                # only acts carrying a text produce a step
                if "texteAdopte" in sous_etape or "texteAssocie" in sous_etape:
                    # there is no multiple depot in the National Assembly
                    # simply the senate re-submitting the same text
                    if data['steps']:
                        last_step = data['steps'][-1]
                        if last_step[
                                'institution'] == 'assemblee' and last_step.get(
                                    'step') == step.get('step') == 'depot':
                            # ignore the depot we already have (since the new one is the same)
                            data['steps'] = data['steps'][:-1]

                    id_text = sous_etape.get("texteAdopte") or sous_etape.get(
                        "texteAssocie")
                    if id_text:
                        if "proposal_type" not in data:
                            if id_text.startswith("PRJL"):
                                data["proposal_type"] = "PJL"
                            elif id_text.startswith("PION"):
                                data["proposal_type"] = "PPL"

                        doc = {}
                        if id_text in docs:
                            doc = docs[id_text]
                        else:
                            _log("  - ERROR missing text", id_text)

                        if step.get(
                                "institution") == "assemblee" or "-AN" in code:
                            doc_code = None
                            if doc:
                                doc_code = doc['classification']['type'][
                                    'code']
                                if doc_code == 'ACIN':
                                    continue
                            text_url = an_text_url(id_text, doc_code)
                            if text_url:
                                step['source_url'] = text_url

                    data["steps"].append(step)

        if data['steps']:
            # add predicted step; start_step stays None when only
            # promulgation / constitutional council steps were collected
            if not data.get('url_jo') and start_step is not None:
                if data['steps'][-1].get('step') != start_step.get(
                        'step') and start_step.get('step'):
                    # TODO: we could also add all the dates into a steps['dates'] = [..]
                    data['steps'].append(start_step)
            data["beginning"] = data["steps"][0]["date"]
        else:
            _log("  - WARNING no steps found for", url)

        return data
    return []
Example #5
0
def historic_doslegs_parse(html,
                           url_an=None,
                           verbose=True,
                           logfile=sys.stderr,
                           nth_dos_in_page=0,
                           parse_previous_works=True,
                           parse_next_works=True):
    """
    Parse an AN dosleg like http://www.assemblee-nationale.fr/13/dossiers/accord_Montenegro_mobilite_jeunes.asp

    The page is scanned line by line; marker strings found in a line update
    the current institution / stage and links found on "step" lines are
    turned into entries of data['steps'].

    Args:
        html: HTML source of the dossier page.
        url_an: URL of the page; used to derive the legislature and slug
            and to resolve relative links.
        verbose: when false, errors and warnings are not written.
        logfile: file-like object receiving errors and warnings.
        nth_dos_in_page: index of the dossier being parsed within the page.
        parse_previous_works: follow the "Travaux préparatoires" link and
            merge the result.
        parse_next_works: look for the same dossier in the next legislature
            and for another dossier embedded in the page.

    nth_dos_in_page, parse_previous_works and parse_next_works are for internal logic

    Returns:
        A list of dossier dicts (this dossier followed by the ones found
        inside the same page), or None when the page is a "commission
        d'enquête" or a "proposition de résolution européenne".
    """

    data = {
        'url_dossier_assemblee': clean_url(url_an),
        'urgence': False,
    }

    def _log_error(*error):
        print('## ERROR ###', *error, file=logfile)

    def _log_warning(*error):
        print('## WARNING ###', *error, file=logfile)

    log_error = _log_error
    log_warning = _log_warning
    if not verbose:
        # silence both loggers

        def log_error(*x):
            return None

        def log_warning(*x):
            return None

    soup = BeautifulSoup(html, 'lxml')

    legislature, slug = parse_national_assembly_url(
        data['url_dossier_assemblee'])
    data['assemblee_slug'] = slug
    if legislature:
        data['assemblee_legislature'] = legislature
    else:  # strange link (old dosleg)
        log_error('NO LEGISLATURE IN AN LINK: ' +
                  data['url_dossier_assemblee'])
    data['assemblee_id'] = '%s-%s' % (data.get('assemblee_legislature',
                                               ''), data['assemblee_slug'])

    data['steps'] = []
    curr_institution = 'assemblee'
    curr_stage = '1ère lecture'
    last_section = None  # Travaux des commissions/Discussion en séance publique
    last_step_index = 0
    travaux_prep_already = False
    another_dosleg_inside = None
    predicted_next_step = None  # For unfinished projects, we try to catch the next step
    previous_works = None
    url_jo = None

    html_lines = html.split('\n')
    for i, line in enumerate(html_lines):

        def parse_line():
            return BeautifulSoup(line, 'lxml')

        def line_text():
            return parse_line().text.strip()

        def get_last_step():
            if len(data['steps']) > 0:
                return data['steps'][-1]
            return {}

        if '<COMMENTAIRE>' in line or '<table border="1"' in line:
            continue

        if '<font face="ARIAL" size="3" color="#000080">' in line:
            data['long_title'] = line_text()
        if '<br><b><font color="#000099">Travaux des commissions</font></b><br>' in line:
            last_section = line_text()
        if '<p align="center"><b><font color="#000080">Travaux préparatoires</font></b><br>' in line:
            # a second "Travaux préparatoires" header means the page holds
            # another dossier
            if travaux_prep_already:
                if parse_next_works and not nth_dos_in_page:
                    log_warning('FOUND ANOTHER DOSLEG INSIDE THE DOSLEG')
                    # everything after the last parsed step is handed to a
                    # recursive call at the end of this function
                    another_dosleg_inside = '\n'.join(
                        html_lines[last_step_index + 1:])
                if not nth_dos_in_page:
                    break
                travaux_prep_already = False
            else:
                travaux_prep_already = True
        if not parse_next_works and travaux_prep_already and nth_dos_in_page:
            continue

        # Senat 1ère lecture, CMP, ...
        if '<font color="#000099" size="2" face="Arial">' in line:
            text = line_text()
            last_section = None
            if 'Dossier en ligne sur le site du Sénat' in text:
                data['url_dossier_senat'] = clean_url(
                    parse_line().select('a')[-1].attrs['href'])
                text = text.replace('(Dossier en ligne sur le site du Sénat)',
                                    '')
            if 'Sénat' in text:
                curr_institution = 'senat'
            elif 'Assemblée nationale' in text:
                curr_institution = 'assemblee'
            elif 'Commission Mixte Paritaire' in text or 'Lecture texte CMP' in text:
                curr_institution = 'CMP'
                curr_stage = 'CMP'
            elif 'Conseil Constitutionnel' in text:
                curr_institution = 'conseil constitutionnel'
                curr_stage = 'constitutionnalité'
            elif 'Congrès du Parlement' in text:
                curr_institution = 'congrès'
                curr_stage = 'congrès'

            if '1ère lecture' in text:
                curr_stage = '1ère lecture'
            elif '2e lecture' in text:
                curr_stage = '2ème lecture'
            elif 'Nouvelle lecture' in text:
                curr_stage = 'nouv. lect.'
            elif 'Lecture définitive' in text:
                curr_stage = 'l. définitive'
            if not curr_stage:
                curr_stage = text.split('-')[-1].strip().lower()

            if curr_stage == "création de la commission d'enquête":
                log_warning('COMMISSION D\'ENQUETE')
                return None

        if '>Proposition de résolution européenne<' in line:
            log_warning('PROPOSITION DE RESOLUTION EUROPEENE')
            return None

        if '>Accès aux Travaux préparatoires' in line and not previous_works:
            previous_works = clean_url(
                urljoin(url_an,
                        parse_line().find('a').attrs['href']))

        curr_step = None
        # conseil. consti. has no step but we should get the link
        no_step_but_good_link = False
        if 'Rapport portant également sur les propositions' in line:
            continue
        elif re.search(
                r'<a[^>]* href=[^>]*>(projet de loi|proposition de loi|proposition de résolution)',
                line, re.I):
            curr_step = 'depot'
            if curr_stage == 'CMP':
                continue
        elif ">Texte de la commission" in line or '/ta-commission/' in line:
            curr_step = 'commission'

        elif '/ta/' in line or '/leg/tas' in line:
            # an adopted text opening a new stage is treated as its depot
            # (commission for a CMP), otherwise as the hemicycle result
            if get_last_step().get('stage') != curr_stage:
                curr_step = 'depot'
                if curr_stage == 'CMP':
                    curr_step = 'commission'
            else:
                curr_step = 'hemicycle'
        elif ('/rapports/' in line or '/rap/'
              in line) and last_section and 'commissions' in last_section:
            if get_last_step().get('step') == 'commission':
                # log_warning('DOUBLE COMMISSION LINE: %s' % line)
                continue
            curr_step = 'commission'

        elif 'www.conseil-constitutionnel.fr/decision/' in line:
            no_step_but_good_link = True

        # no commissions for l. définitive
        if curr_stage == 'l. définitive' and curr_step == 'commission':
            continue

        if curr_step or no_step_but_good_link:
            # if same step previously, replace or not the url
            if get_last_step().get('step') == curr_step:
                # log_warning('DOUBLE STEP: %s' % line)
                # remove last step since we prefer text links instead of reports links
                # TODO: add report link as bonus_url
                last_url = get_last_step().get('source_url')
                if not last_url or ('/rapports/' in last_url
                                    or '/rap/' in last_url):
                    data['steps'] = data['steps'][:-1]
                # looks like the last url was already a text, let's assume it's a multi-depot
                else:
                    # multi-depot if not CMP
                    # TODO: re-order multi depot
                    if curr_institution == 'senat' and curr_stage != 'CMP':
                        curr_step = 'depot'

            links = [a.attrs.get('href') for a in parse_line().select('a')]
            links = [
                href for href in links if href and 'fiches_id' not in href
                and '/senateur/' not in href and 'javascript:' not in href
            ]
            if not links:
                log_error('NO LINK IN LINE: %s' % (line, ))
                continue
            # report links are only used when the line has no other link
            urls_raps = []
            urls_others = []
            for href in links:
                if '/rap/' in href or '/rapports/' in href:
                    urls_raps.append(href)
                else:
                    urls_others.append(href)

            cmp_commission_other_url = None
            if len(urls_others) > 0:
                url = urls_others[0]
                # CMP commission should produce two texts, one for each institution
                if curr_step == 'commission' and curr_stage == 'CMP' and len(
                        urls_others) > 1:
                    cmp_commission_other_url = clean_url(
                        urljoin(url_an, urls_others[1]))
            else:
                url = urls_raps[0]

            url = clean_url(urljoin(url_an, url))

            # for a CMP hemicycle step the institution is read from the link
            real_institution = curr_institution
            if curr_stage == 'CMP' and curr_step == 'hemicycle':
                if 'assemblee-nationale.fr' in url:
                    real_institution = 'assemblee'
                elif 'senat.fr' in url:
                    real_institution = 'senat'

            step = {
                'institution': real_institution,
                'stage': curr_stage,
                'source_url': url,
            }

            if curr_step:
                step['step'] = curr_step

            if cmp_commission_other_url:
                step['cmp_commission_other_url'] = cmp_commission_other_url

            # try to detect a date on this line, then on the previous one;
            # the first line has no previous line (html_lines[-1] would wrap
            # around to the end of the page)
            date_lines = (line, html_lines[i - 1]) if i > 0 else (line, )
            for test_line in date_lines:
                test_line = test_line.replace('1<sup>er</sup>', '1')
                date_match = re.search(
                    r'(déposée? le|adoptée? .*? le|modifiée? .*?|rejetée? .*?)\s*(\d\d? \w\w\w+ \d\d\d\d)',
                    test_line, re.I)
                if date_match:
                    step['date'] = format_date(date_match.group(2))
                else:
                    date_match = re.search(
                        r'(mis en ligne le)\s*(\d\d? \w\w\w+ \d\d\d\d)',
                        test_line, re.I)
                    if date_match:
                        step['date'] = format_date(date_match.group(2))
                if 'date' in step and 'beginning' not in data:
                    data['beginning'] = step['date']
            data['steps'].append(step)
            predicted_next_step = None
            last_step_index = i

        if 'publiée au Journal Officiel' in line and not url_jo:
            links = [
                clean_url(a.attrs['href']) for a in parse_line().select('a')
                if 'legifrance' in a.attrs.get('href', '')
            ]
            if not links:
                log_error('NO GOOD LINK IN LINE: %s' % (line, ))
                continue
            url_jo = links[-1]

        if 'Le Gouvernement a engagé la procédure accélérée' in line or 'engagement de la procédure accélérée' in line:
            data['urgence'] = True

        # Next step prediction via small clues
        # TODO: this could be done via last_section (we parse two times the same thing)
        # TODO: this fails for CMP hemicycle senat
        if curr_stage != 'CMP':
            if '>Discussion en séance publique<' in line:
                predicted_next_step = {
                    'institution': curr_institution,
                    'stage': curr_stage,
                    'step': 'hemicycle',
                }
            elif '>Travaux des commissions<' in line:
                predicted_next_step = {
                    'institution': curr_institution,
                    'stage': curr_stage,
                    'step': 'commission',
                }

    if not url_jo:
        # fall back on the page's <meta name="LIEN_LOI_PROMULGUEE"> tag
        metas = {}
        for meta in soup.select('meta'):
            # skip <meta> tags that have a name but no content attribute
            if 'name' in meta.attrs and 'content' in meta.attrs:
                metas[meta.attrs['name']] = meta.attrs['content']
        url_jo = metas.get('LIEN_LOI_PROMULGUEE')

    if url_jo:
        data['url_jo'] = clean_url(url_jo)
        data['steps'].append({
            'institution': 'gouvernement',
            'stage': 'promulgation',
            'source_url': data['url_jo'],
        })
    # add predicted next step for unfinished projects
    elif predicted_next_step:
        data['steps'].append(predicted_next_step)

    if 'url_dossier_senat' not in data or 'dossier-legislatif' not in data[
            'url_dossier_senat']:
        senat_url = find_senat_url(data)
        if senat_url:
            data['url_dossier_senat'] = senat_url

    # append previous works if there are some
    if previous_works and parse_previous_works:
        log_warning('MERGING %s WITH PREVIOUS WORKS %s' %
                    (url_an, previous_works))
        resp = download_historic_dosleg(previous_works)
        prev_data = historic_doslegs_parse(resp.text,
                                           previous_works,
                                           logfile=logfile,
                                           verbose=verbose,
                                           nth_dos_in_page=nth_dos_in_page,
                                           parse_next_works=False)
        if prev_data:
            prev_data = prev_data[nth_dos_in_page] if len(
                prev_data) > 1 else prev_data[0]
            data = merge_previous_works_an(prev_data, data)
        else:
            log_warning('INVALID PREVIOUS WORKS', previous_works)

    # is this part of a dosleg previous works ?
    # (9999 disables the lookup when the legislature is unknown)
    next_legislature = data[
        'assemblee_legislature'] + 1 if 'assemblee_legislature' in data else 9999
    if parse_next_works and next_legislature < 15:
        #  TODO: parse 15th legislature from open data if it exists
        resp = download_historic_dosleg(
            url_an.replace('/%d/' % data['assemblee_legislature'],
                           '/%d/' % (data['assemblee_legislature'] + 1)))
        if resp.status_code == 200:
            recent_data = historic_doslegs_parse(
                resp.text,
                resp.url,
                logfile=logfile,
                verbose=verbose,
                nth_dos_in_page=nth_dos_in_page,
                parse_previous_works=False)
            if recent_data:
                log_warning('FOUND MORE RECENT WORKS', resp.url)
                recent_data = recent_data[nth_dos_in_page] if len(
                    recent_data) > 1 else recent_data[0]
                data = merge_previous_works_an(data, recent_data)

    if another_dosleg_inside:
        others = historic_doslegs_parse(another_dosleg_inside,
                                        url_an,
                                        logfile=logfile,
                                        verbose=verbose,
                                        nth_dos_in_page=nth_dos_in_page + 1)
        if others:
            return [data] + others
    return [data]
Example #6
0
def parse(html, url_senat=None, logfile=sys.stderr):
    data = {}

    def log_error(error):
        print('## ERROR ###', error, file=logfile)

    soup = BeautifulSoup(html, 'html5lib')

    data['short_title'] = clean_spaces(
        soup.select_one('.title-dosleg').text.strip())

    if not soup.select('.title .subtitle-01'):
        log_error('NO TITLE - MAYBE A REDIRECT ?')
        return

    title_lines = soup.select_one('.title .subtitle-01').text.strip()
    data['long_title_descr'] = clean_spaces(
        title_lines.split('\n')[0][:-2])  # remove " :" at the end of the line
    data['long_title'] = clean_spaces(
        soup.find("meta", {"name": "Description"})['content'])

    promulgee_line = None
    ordonnance_line = None
    acceleree_line = None
    cc_line = None
    for line in soup.select('.title .list-disc-03 li'):
        if ' parue ' in line.text:
            promulgee_line = line
        elif 'ordonnance' in line.text:
            ordonnance_line = line
        elif 'accélérée' in line.text or 'Urgence déclarée' in line.text:
            acceleree_line = line
        elif 'Décision du Conseil constitutionnel' in line.text:
            cc_line = line
        else:
            log_error('UNKNOWN SUBTITLE: %s' % line.text)
    if promulgee_line:
        data['law_name'] = clean_spaces(
            promulgee_line.find('strong').text.strip())  # promulgation
        data['end'] = format_date(
            promulgee_line.text.split('JO ')[-1].split('du ')[-1].split('(')
            [0].strip())  # inscription aux JO
        if promulgee_line.find('a'):
            data['url_jo'] = clean_url(promulgee_line.find('a').attrs['href'])
            url_jo_params = parse_qs(urlparse(data['url_jo']).query)
            if 'cidTexte' in url_jo_params:
                data['legifrance_cidTexte'] = url_jo_params['cidTexte'][0]
            else:
                last_part = [
                    part for part in data['url_jo'].split('/') if part
                ][-1]
                if last_part.startswith('JORFTEXT'):
                    data['legifrance_cidTexte'] = last_part
        else:
            log_error('NO JO LINK')
    # TOPARSE: ordonnance_line
    # TOPARSE: CC decision

    data[
        'urgence'] = acceleree_line is not None or 'procédure accélérée engagée par le' in title_lines
    if not url_senat:
        # the url is in a comment like "<!-- URL_SENAT=XXXX !-->" for downloaded pages
        comment = soup.find(text=lambda text: isinstance(text, Comment) and
                            'URL_SENAT' in text)
        if comment:
            url_senat = comment.split('=')[1].strip()
    if url_senat:
        data['url_dossier_senat'] = clean_url(url_senat)
        data['senat_id'] = data['url_dossier_senat'].split('/')[-1].replace(
            '.html', '')
    else:
        url_senat = 'http://www.senat.fr/'

    tableau_comparatif = soup.select_one('.button-tableau-comparatifs')
    if tableau_comparatif:
        data['tableau_comparatif_url'] = clean_url(
            urljoin(url_senat, tableau_comparatif.attrs['href']))

    # objet du texte (very basic)
    for div in soup.select('#main div.scroll'):
        if div.find('h3') and 'Objet du texte' in div.find('h3').text:
            data['objet_du_texte'] = div.text.replace('Objet du texte\n', '') \
                .replace("Lire le billet de l'Espace presse", '').strip()
            continue

    # TODO: selecteur foireux ?
    for link in soup.select('h4.title.title-06.link-type-02 a'):
        if 'Assemblée' in link.text:
            url_an = link.attrs['href']
            if 'documents/index-' not in url_an:
                data['url_dossier_assemblee'] = clean_url(url_an)
                legislature, data[
                    'assemblee_slug'] = parse_national_assembly_url(
                        data['url_dossier_assemblee'])
                if legislature:
                    data['assemblee_legislature'] = legislature
                else:
                    log_error('NO LEGISLATURE IN AN LINK: ' + url_an)
                data['assemblee_id'] = '%d-%s' % (data.get(
                    'assemblee_legislature', ''), data['assemblee_slug'])
            else:
                log_error('INVALID URL AN: ' + url_an)

    data['steps'] = []
    steps_shortcuts = soup.select('.list-timeline li')  # icons on top
    if not steps_shortcuts:
        log_error('VERY SPECIAL CASE - PAS DE NAVETTES NORMALES')
        return

    themes_box = soup.select_one('#box-themes')
    if themes_box:
        data['themes'] = [x.text.strip() for x in themes_box.select('.theme')]

    for t in [
            'financement de la sécurité', 'règlement des comptes',
            'règlement du budget', 'approbation des comptes',
            'loi de finances rectificative',
            'loi de financement rectificative', 'de loi constitutionnelle'
    ]:
        if t in data['long_title']:
            data['use_old_procedure'] = True
    if 'plfss' in data.get('senat_id', '') or 'pjlf' in data.get(
            'senat_id', ''):
        data['use_old_procedure'] = True

    if 'pjl' in data.get('senat_id', '') or 'plfss' in data.get(
            'senat_id', ''):
        data['proposal_type'] = 'PJL'
    elif 'ppl' in data.get('senat_id', ''):
        data['proposal_type'] = 'PPL'
    else:
        log_error('UNKNOWN PROPOSAL TYPE (PPL/PJL)')

    steps_contents = []
    for item in soup.select('#box-timeline > div div'):
        if 'timeline-' in item.attrs.get('id', ''):
            steps_contents.append(item)

    curr_institution = None
    curr_stage = None
    error_detection_last_date = None
    for timeline_index, step_shortcut in enumerate(steps_shortcuts):
        step = {}

        item = BeautifulSoup('',
                             'lxml')  # no info block for steps in the futur
        if len(steps_contents) > timeline_index:
            item = steps_contents[timeline_index]

        section_title = item
        while section_title.previous_sibling and section_title.previous_sibling.name != 'h3':
            section_title = section_title.previous_sibling
        section_title = section_title.previous_sibling.text if section_title.previous_sibling else ''

        step['date'] = None
        if step_shortcut.select('em'):
            date_text = step_shortcut.select('em')[-1].text.strip()
            if '/' in date_text:
                step['date'] = format_date(date_text)
        if not step['date'] and item.text:
            # TODO: date sometimes is not on the shortcut
            log_error('SHORCUT WITHOUT DATE')

        if 'beginning' not in data and step['date']:
            data['beginning'] = step['date']

        # TODO review this part
        step_shorcut_infos = step_shortcut.select_one(
            'a[title]').attrs['title'].split('|')[-1].split('-')
        step_step = step_shorcut_infos[-1].lower().strip()
        if 'commission' in step_step:
            step_step = 'commission'
        elif 'séance' in step_step:
            step_step = 'hemicycle'

        # TODO: ca me parait bizarre cette histoire
        # stage = 1ere lecture|2eme lecture|CMP
        # institution = assemblee|senat|CMP|gouvernement
        # step = depot|commission|hemicycle
        if step_shortcut.select_one('em'):
            titre = step_shortcut.select_one('em').text.lower().strip()
            if titre == 'loi' or 'promulgation' in titre:
                curr_stage = 'promulgation'
            else:
                curr_stage = step_shorcut_infos[0].lower().strip()
                if curr_stage == 'cmp':
                    curr_stage = 'CMP'

            # sometimes the lecture info is in the date, why not ?
            # ex: http://www.senat.fr/dossier-legislatif/a82831259.html
            if 'lecture' in date_text:
                curr_stage = date_text

            img = step_shortcut.find('img').attrs['src']
            if 'picto_timeline_01_' in img:
                curr_institution = 'assemblee'
                step_step = 'depot'
            elif 'picto_timeline_02_' in img:
                curr_institution = 'senat'
                step_step = 'depot'
            elif 'picto_timeline_05_' in img:
                curr_institution = 'CMP'
                curr_stage = 'CMP'
                # there is no "depot" step for a CMP
                continue
            elif 'picto_timeline_03_' in img:
                step_step = 'commission'
            elif 'picto_timeline_04_' in img:
                step_step = 'hemicycle'
            elif 'picto_timeline_07_' in img:
                curr_institution = 'gouvernement'
            elif 'picto_timeline_09_' in img:
                # 'nouv. délib.' ex: http://www.senat.fr/dossier-legislatif/pjl02-182.html
                continue
            elif 'picto_timeline_10_' in img:
                curr_institution = 'congrès'

        if curr_stage == 'c. constit.':
            curr_institution = 'conseil constitutionnel'
            curr_stage = 'constitutionnalité'
            step_step = None

        # the picto can be the wrong one... also, a depot step for a CMP doesn't make sense
        # ex: http://www.senat.fr/dossier-legislatif/taan99-406.html
        if curr_stage == 'CMP' and step_step == 'depot':
            curr_institution = 'CMP'
            log_error('DEPOT STEP FOR A CMP')
            continue

        # no commissions for l. définitive
        if curr_stage == 'l. définitive' and step_step == 'commission':
            continue

        step['institution'] = curr_institution

        # standardize on 1ère lecture / 2ème lecture
        curr_stage = curr_stage.replace('eme', 'ème')

        # ignore step rejet like https://www.senat.fr/dossier-legislatif/ppl17-392.html
        if step_step == 'rejet':
            continue

        step['stage'] = curr_stage
        if curr_stage not in ('constitutionnalité', 'promulgation'):
            step['step'] = step_step

        # fill in for special case like http://www.senat.fr/dossier-legislatif/csm.html
        if curr_institution == 'congrès' and not curr_stage:
            step['stage'] = 'congrès'
        if curr_institution == 'congrès' and not step_step:
            step['step'] = 'congrès'
        # skip the congrès step if it is not a hemicycle one
        if step.get('step') == 'congrès': continue

        # add a legislature guess if missing
        if curr_institution == 'assemblee' and step['date']:
            legis = guess_legislature(step['date'])
            if legis:
                data['assemblee_legislature'] = legis

        good_urls = []

        if 'Texte renvoyé en commission' in item.text:
            step['echec'] = 'renvoi en commission'
        elif item.text:
            # TROUVONS LES TEXTES
            for link in item.select('a'):
                line = link.parent

                if 'Lettre rectificative' in link.text:
                    continue

                if 'href' in link.attrs:
                    href = link.attrs['href']
                    nice_text = link.text.lower().strip()
                    # TODO: assemblée "ppl, ppr, -a0" (a verif)
                    if (('/leg/' in href
                         and '/' not in href.replace('/leg/', '')
                         and 'avis-ce' not in href) or nice_text
                            in ('texte', 'texte de la commission',
                                'décision du conseil constitutionnel')
                            or 'jo n°' in nice_text

                            # TODO: parse the whole block for date + url
                            # ex: http://www.senat.fr/dossier-legislatif/pjl08-641.html
                            or 'conseil-constitutionnel.fr/decision.' in href):
                        # if we detect a "texte de la commission" in an old procedure, it means it's probably not the old procedure
                        if data.get(
                                'use_old_procedure'
                        ) and nice_text == 'texte de la commission' and step.get(
                                'stage') != 'CMP':
                            del data['use_old_procedure']

                        # motion for a referendum for example
                        # ex: http://www.senat.fr/dossier-legislatif/pjl12-349.html
                        if '/leg/motion' in href:
                            continue
                        href = pre_clean_url(href)

                        url = urljoin(url_senat, href)
                        line_text = line.text.lower()
                        institution = curr_institution
                        if curr_stage != 'promulgation':  # TODO: be more specific, have a way to force the curr_instituion
                            if 'par l\'assemblée' in line_text:
                                institution = 'assemblee'
                            elif 'par le sénat' in line_text:
                                institution = 'senat'
                            else:
                                if curr_stage == 'CMP' and step_step == 'hemicycle' \
                                        and 'texte' in nice_text and not step.get('echec'):
                                    if 'assemblee-nationale.fr' in href:
                                        institution = 'assemblee'
                                    else:
                                        institution = 'senat'

                        date = find_date(line_text, curr_stage)
                        if date:
                            if error_detection_last_date and dateparser.parse(
                                    error_detection_last_date
                            ) > dateparser.parse(date):
                                # TODO: can be incorrect because of multi-depot
                                log_error(
                                    'DATE ORDER IS INCORRECT - last=%s - found=%s'
                                    % (error_detection_last_date, date))
                            error_detection_last_date = date
                        if curr_stage == 'promulgation' and 'end' in data:
                            date = data['end']

                        good_urls.append({
                            'url': url,
                            'institution': institution,
                            'date': date,
                        })

        if not good_urls and item.text:
            # sinon prendre une url d'un peu moins bonne qualité

            if 'Texte retiré par' in item.text:
                # texte retiré means all the previous steps become useless except the depot
                data['steps'] = [
                    step for step in data['steps']
                    if step.get('step') == 'depots'
                ]
                continue
            elif 'Texte rejeté par' in item.text:
                step['echec'] = "rejet"

            if 'source_url' not in step and not step.get('echec'):
                # trouver les numeros dans le texte
                if curr_institution == 'senat' and step.get('date'):
                    url = guess_senate_text_url(item.text, step, data)
                    if url:
                        step['source_url'] = url

                if 'source_url' not in step:
                    # prendre un rapport
                    for link in item.select('.list-disc-02 a'):
                        if 'href' in link.attrs:
                            href = link.attrs['href']
                            href = pre_clean_url(href)
                            nice_text = link.text.lower().strip()
                            if nice_text == 'rapport' or nice_text == 'rapport général':
                                step['source_url'] = urljoin(url_senat, href)
                                break

                if 'source_url' not in step and step.get(
                        'institution'
                ) == 'assemblee' and 'assemblee_legislature' in data:
                    legislature = data['assemblee_legislature']
                    text_num_match = re.search(r'(Texte|Rapport)\s*n°\s*(\d+)',
                                               item.text, re.I)
                    if text_num_match:
                        text_num = text_num_match.group(2)
                        url = None
                        if step.get('step') == 'commission':
                            url = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'
                        elif step.get('step') == 'depot':
                            if data.get('proposal_type') == 'PJL':
                                url = 'http://www.assemblee-nationale.fr/{}/projets/pl{:04d}.asp'
                            else:
                                url = 'http://www.assemblee-nationale.fr/{}/propositions/pion{:04d}.asp'
                        elif step.get('step') == 'hemicycle':
                            url = 'http://www.assemblee-nationale.fr/{}/ta/ta{:04d}.asp'

                        if url:
                            step['source_url'] = url.format(
                                legislature, int(text_num))

            if 'source_url' not in step and not step.get('echec'):
                log_error(
                    'ITEM WITHOUT URL TO TEXT - %s.%s.%s' %
                    (step['institution'], step.get('stage'), step.get('step')))

        # Decision Conseil Constitutionnel
        if curr_stage == 'constitutionnalité':
            # we try to find the decision in the paragraph or at the top of the dosleg
            decision_text = item.text
            if cc_line:
                decision_text += cc_line.text

            if 'partiellement conforme' in item.text:
                step['decision'] = 'partiellement conforme'
            elif 'se déclare incompétent' in item.text:
                step['decision'] = 'se déclare incompétent'
            elif 'non conforme' in item.text:
                step['decision'] = 'non conforme'
            elif 'conforme' in item.text:
                step['decision'] = 'conforme'
            else:
                log_error('WARNING: NO DECISION FOR CC')

        # look for Table de concordance
        if curr_stage == 'promulgation':
            for a in item.select('a'):
                if 'table de concordance' in a.text.lower():
                    table, errors = parse_table_concordance(
                        clean_url(urljoin(url_senat, a.attrs['href'])))
                    data['table_concordance'] = table
                    if errors:
                        data['table_concordance_confusing_entries'] = errors

        # CMP commission has two urls: one for the Senate and one for the AN
        if step.get('stage') == 'CMP' and step.get('step') == 'commission':
            match = re.search(
                r"numéro de dépôt à l'Assemblée Nationale : (\d+)",
                clean_spaces(item.text))
            if match:
                text_num = int(match.group(1))
                step['cmp_commission_other_url'] = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'\
                                                        .format(data['assemblee_legislature'], text_num)

        steps_to_add = []
        if good_urls:
            for url in good_urls:
                sub_step = dict(**step)  # dubstep
                sub_step['source_url'] = url['url']
                sub_step['institution'] = url['institution']
                if url['date']:
                    sub_step['date'] = url['date']
                steps_to_add.append(sub_step)
        else:
            steps_to_add.append(step)

        # remove CMP.CMP.hemicycle if it's a fail
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            if not good_urls:
                last_step = data['steps'][-1]
                if data['steps'][-1].get('stage') == 'CMP' and step.get(
                        'step') == 'hemicycle':
                    if 'désaccord' in section_title:
                        last_step['echec'] = 'echec'
                    else:
                        log_error(
                            'CMP.hemicycle with no links and no fail indicated'
                        )
                    continue
            elif len(good_urls) != 2:
                log_error('CMP.hemicycle WITHOUT BOTH SENAT AND ASSEMBLEE')
                # todo: add empty missing step
                institutions_found = [url['institution'] for url in good_urls]
                if 'assemblee' not in institutions_found:
                    sub_step = dict(**step)  # dubstep
                    sub_step['source_url'] = None
                    sub_step['institution'] = 'assemblee'
                    steps_to_add.append(sub_step)

        # clean urls
        for step in steps_to_add:
            url = step.get('source_url')
            if url:
                step['source_url'] = clean_url(url)

        if len(steps_to_add) > 1:
            # multi-depot
            if step.get('step') == 'depot' and step.get(
                    'institution') == 'senat':
                # put real text as last depot
                steps_to_add = sorted(steps_to_add,
                                      key=lambda step: 1
                                      if data.get('senat_id', '') in step.get(
                                          'source_url', '') else 0)
                # if we are in a later step, the others depot steps must go at the top
                if len(data['steps']) > 0:
                    data['steps'] = steps_to_add[:-1] + data['steps']
                    steps_to_add = steps_to_add[-1:]
            # there can be multiple texts inside an hemicycle step, ok for CMP and multi-depots but not ok for other steps
            elif step.get('stage') != 'CMP':
                log_error(
                    'MULTIPLE TEXTS BUT NOT CMP.hemicycle - %s.%s.%s' %
                    (step['institution'], step.get('stage'), step.get('step')))
                steps_to_add = [steps_to_add[-1]]

        data['steps'] += steps_to_add

    # if there's not url for the AN dosleg, try to find it via the texts links
    if 'url_dossier_assemblee' not in data:
        an_url = find_an_url(data)
        if an_url:
            data['url_dossier_assemblee'] = an_url

    return data
Example #7
0
def parse(html, url_senat=None, logfile=sys.stderr):
    data = {}

    def log_error(error):
        """Print *error* to ``logfile``, prefixed with the error banner."""
        banner = '## ERROR ###'
        print(banner, error, file=logfile)

    soup = BeautifulSoup(html, 'html5lib')

    data['short_title'] = clean_spaces(soup.select_one('.title-dosleg').text.strip())

    if not soup.select('.title .subtitle-01'):
        log_error('NO TITLE - MAYBE A REDIRECT ?')
        return

    title_lines = soup.select_one('.title .subtitle-01').text.strip()
    data['long_title_descr'] = clean_spaces(title_lines.split('\n')[0][:-2])  # remove " :" at the end of the line
    data['long_title'] = clean_spaces(soup.find("meta", {"name": "Description"})['content'])

    promulgee_line = None
    ordonnance_line = None
    acceleree_line = None
    cc_line = None
    for line in soup.select('.title .list-disc-03 li'):
        if ' parue ' in line.text:
            promulgee_line = line
        elif 'ordonnance' in line.text:
            ordonnance_line = line
        elif 'accélérée' in line.text or 'Urgence déclarée' in line.text:
            acceleree_line = line
        elif 'Décision du Conseil constitutionnel' in line.text:
            cc_line = line
        else:
            log_error('UNKNOWN SUBTITLE: %s' % line.text)
    if promulgee_line:
        data['law_name'] = clean_spaces(promulgee_line.find('strong').text.strip())  # promulgation
        data['end'] = format_date(promulgee_line.text.split('JO ')[-1].split('du ')[-1].split('(')[0].strip())  # inscription aux JO
        if promulgee_line.find('a'):
            data['url_jo'] = clean_url(promulgee_line.find('a').attrs['href'])
            url_jo_params = parse_qs(urlparse(data['url_jo']).query)
            if 'cidTexte' in url_jo_params:
                data['legifrance_cidTexte'] = url_jo_params['cidTexte'][0]
        else:
            log_error('NO JO LINK')
    # TOPARSE: ordonnance_line
    # TOPARSE: CC decision

    data['urgence'] = acceleree_line is not None or 'procédure accélérée engagée par le' in title_lines
    if not url_senat:
        # the url is in a comment like "<!-- URL_SENAT=XXXX !-->" for downloaded pages
        comment = soup.find(text=lambda text: isinstance(text, Comment) and 'URL_SENAT' in text)
        if comment:
            url_senat = comment.split('=')[1].strip()
    if url_senat:
        data['url_dossier_senat'] = clean_url(url_senat)
        data['senat_id'] = data['url_dossier_senat'].split('/')[-1].replace('.html', '')
    else:
        url_senat = 'http://www.senat.fr/'

    tableau_comparatif = soup.select_one('.button-tableau-comparatifs')
    if tableau_comparatif:
        data['tableau_comparatif_url'] = clean_url(urljoin(url_senat, tableau_comparatif.attrs['href']))

    # objet du texte (very basic)
    for div in soup.select('#main div.scroll'):
        if div.find('h3') and 'Objet du texte' in div.find('h3').text:
            data['objet_du_texte'] = div.text.replace('Objet du texte\n', '') \
                .replace("Lire le billet de l'Espace presse", '').strip()
            continue

    # TODO: selecteur foireux ?
    for link in soup.select('h4.title.title-06.link-type-02 a'):
        if 'Assemblée' in link.text:
            url_an = link.attrs['href']
            if 'documents/index-' not in url_an:
                data['url_dossier_assemblee'] = clean_url(url_an)
                legislature, data['assemblee_slug'] = parse_national_assembly_url(data['url_dossier_assemblee'])
                if legislature:
                    data['assemblee_legislature'] = legislature
                else:
                    log_error('NO LEGISLATURE IN AN LINK: ' + url_an)
                data['assemblee_id'] = '%d-%s' % (data.get('assemblee_legislature', ''), data['assemblee_slug'])
            else:
                log_error('INVALID URL AN: ' + url_an)

    data['steps'] = []
    steps_shortcuts = soup.select('.list-timeline li')  # icons on top
    if not steps_shortcuts:
        log_error('VERY SPECIAL CASE - PAS DE NAVETTES NORMALES')
        return

    themes_box = soup.select_one('#box-themes')
    if themes_box:
        data['themes'] = [x.text.strip() for x in themes_box.select('.theme')]

    for t in [
            'financement de la sécurité',
            'règlement des comptes',
            'règlement du budget',
            'approbation des comptes',
            'loi de finances rectificative',
            'loi de financement rectificative',
            'de loi constitutionnelle'
        ]:
        if t in data['long_title']:
            data['use_old_procedure'] = True
    if 'plfss' in data.get('senat_id', '') or 'pjlf' in data.get('senat_id', ''):
        data['use_old_procedure'] = True

    if 'pjl' in data.get('senat_id', '') or 'plfss' in data.get('senat_id', ''):
        data['proposal_type'] = 'PJL'
    elif 'ppl' in data.get('senat_id', ''):
        data['proposal_type'] = 'PPL'
    else:
        log_error('UNKNOWN PROPOSAL TYPE (PPL/PJL)')

    steps_contents = []
    for item in soup.select('#box-timeline > div div'):
        if 'timeline-' in item.attrs.get('id', ''):
            steps_contents.append(item)

    curr_institution = None
    curr_stage = None
    error_detection_last_date = None
    for timeline_index, step_shortcut in enumerate(steps_shortcuts):
        step = {}

        item = BeautifulSoup('', 'lxml') # no info block for steps in the futur
        if len(steps_contents) > timeline_index:
            item = steps_contents[timeline_index]

        section_title = item
        while section_title.previous_sibling and section_title.previous_sibling.name != 'h3':
            section_title = section_title.previous_sibling
        section_title = section_title.previous_sibling.text if section_title.previous_sibling else ''

        step['date'] = None
        if step_shortcut.select('em'):
            date_text = step_shortcut.select('em')[-1].text.strip()
            if '/' in date_text:
                step['date'] = format_date(date_text)
        if not step['date'] and item.text:
            # TODO: date sometimes is not on the shortcut
            log_error('SHORCUT WITHOUT DATE')

        if 'beginning' not in data and step['date']:
            data['beginning'] = step['date']

        # TODO review this part
        step_shorcut_infos = step_shortcut.select_one('a[title]').attrs['title'].split('|')[-1].split('-')
        step_step = step_shorcut_infos[-1].lower().strip()
        if 'commission' in step_step:
            step_step = 'commission'
        elif 'séance' in step_step:
            step_step = 'hemicycle'

        # TODO: ca me parait bizarre cette histoire
        # stage = 1ere lecture|2eme lecture|CMP
        # institution = assemblee|senat|CMP|gouvernement
        # step = depot|commission|hemicycle
        if step_shortcut.select_one('em'):
            titre = step_shortcut.select_one('em').text.lower().strip()
            if titre == 'loi' or 'promulgation' in titre:
                curr_stage = 'promulgation'
            else:
                curr_stage = step_shorcut_infos[0].lower().strip()
                if curr_stage == 'cmp':
                    curr_stage = 'CMP'

            # sometimes the lecture info is in the date, why not ?
            # ex: http://www.senat.fr/dossier-legislatif/a82831259.html
            if 'lecture' in date_text:
                curr_stage = date_text

            img = step_shortcut.find('img').attrs['src']
            if 'picto_timeline_01_' in img:
                curr_institution = 'assemblee'
                step_step = 'depot'
            elif 'picto_timeline_02_' in img:
                curr_institution = 'senat'
                step_step = 'depot'
            elif 'picto_timeline_05_' in img:
                curr_institution = 'CMP'
                curr_stage = 'CMP'
                # there is no "depot" step for a CMP
                continue
            elif 'picto_timeline_03_' in img:
                step_step = 'commission'
            elif 'picto_timeline_04_' in img:
                step_step = 'hemicycle'
            elif 'picto_timeline_07_' in img:
                curr_institution = 'gouvernement'
            elif 'picto_timeline_09_' in img:
                # 'nouv. délib.' ex: http://www.senat.fr/dossier-legislatif/pjl02-182.html
                continue
            elif 'picto_timeline_10_' in img:
                curr_institution = 'congrès'

        if curr_stage == 'c. constit.':
            curr_institution = 'conseil constitutionnel'
            curr_stage = 'constitutionnalité'
            step_step = None

        # the picto can be the wrong one... also, a depot step for a CMP doesn't make sense
        # ex: http://www.senat.fr/dossier-legislatif/taan99-406.html
        if curr_stage == 'CMP' and step_step == 'depot':
            curr_institution = 'CMP'
            log_error('DEPOT STEP FOR A CMP')
            continue

        # no commissions for l. définitive
        if curr_stage == 'l. définitive' and step_step == 'commission':
            continue

        step['institution'] = curr_institution

        # standardize on 1ère lecture / 2ème lecture
        curr_stage = curr_stage.replace('eme', 'ème')

        step['stage'] = curr_stage
        if curr_stage not in ('constitutionnalité', 'promulgation'):
            step['step'] = step_step

        # fill in for special case like http://www.senat.fr/dossier-legislatif/csm.html
        if curr_institution == 'congrès' and not curr_stage:
            step['stage'] = 'congrès'
        if curr_institution == 'congrès' and not step_step:
            step['step'] = 'congrès'
        # skip the congrès step if it is not a hemicycle one
        if step.get('step') == 'congrès': continue

        # add a legislature guess if missing
        if curr_institution == 'assemblee' and step['date']:
            legis = guess_legislature(step['date'])
            if legis:
                data['assemblee_legislature'] = legis

        good_urls = []

        if 'Texte renvoyé en commission' in item.text:
            step['echec'] = 'renvoi en commission'
        elif item.text:
            # TROUVONS LES TEXTES
            for link in item.select('a'):
                line = link.parent

                if 'Lettre rectificative' in link.text:
                    continue

                if 'href' in link.attrs:
                    href = link.attrs['href']
                    nice_text = link.text.lower().strip()
                    # TODO: assemblée "ppl, ppr, -a0" (a verif)
                    if (
                        ('/leg/' in href and '/' not in href.replace('/leg/', '') and 'avis-ce' not in href)
                        or nice_text in ('texte', 'texte de la commission', 'décision du conseil constitutionnel')
                        or 'jo n°' in nice_text

                        # TODO: parse the whole block for date + url
                        # ex: http://www.senat.fr/dossier-legislatif/pjl08-641.html
                        or 'conseil-constitutionnel.fr/decision.' in href
                    ):
                        # if we detect a "texte de la commission" in an old procedure, it means it's probably not the old procedure
                        if data.get('use_old_procedure') and nice_text == 'texte de la commission' and step.get('stage') != 'CMP':
                            del data['use_old_procedure']

                        # motion for a referendum for example
                        # ex: http://www.senat.fr/dossier-legislatif/pjl12-349.html
                        if '/leg/motion' in href:
                            continue
                        href = pre_clean_url(href)

                        url = urljoin(url_senat, href)
                        line_text = line.text.lower()
                        institution = curr_institution
                        if curr_stage != 'promulgation':  # TODO: be more specific, have a way to force the curr_instituion
                            if 'par l\'assemblée' in line_text:
                                institution = 'assemblee'
                            elif 'par le sénat' in line_text:
                                institution = 'senat'
                            else:
                                if curr_stage == 'CMP' and step_step == 'hemicycle' \
                                        and 'texte' in nice_text and not step.get('echec'):
                                    if 'assemblee-nationale.fr' in href:
                                        institution = 'assemblee'
                                    else:
                                        institution = 'senat'

                        date = find_date(line_text, curr_stage)
                        if date:
                            if error_detection_last_date and dateparser.parse(error_detection_last_date) > dateparser.parse(date):
                                # TODO: can be incorrect because of multi-depot
                                log_error('DATE ORDER IS INCORRECT - last=%s - found=%s' % (error_detection_last_date, date))
                            error_detection_last_date = date
                        if curr_stage == 'promulgation' and 'end' in data:
                            date = data['end']

                        good_urls.append({
                            'url': url,
                            'institution': institution,
                            'date': date,
                        })



        if not good_urls and item.text:
            # sinon prendre une url d'un peu moins bonne qualité

            if 'Texte retiré par' in item.text:
                # texte retiré means all the previous steps become useless except the depot
                data['steps'] = [step for step in data['steps'] if step.get('step') == 'depots']
                continue
            elif 'Texte rejeté par' in item.text:
                step['echec'] = "rejet"

            if 'source_url' not in step and not step.get('echec'):
                # trouver les numeros dans le texte
                if curr_institution == 'senat' and step.get('date'):
                    url = guess_senate_text_url(item.text, step, data)
                    if url:
                        step['source_url'] = url

                if 'source_url' not in step:
                    # prendre un rapport
                    for link in item.select('.list-disc-02 a'):
                        if 'href' in link.attrs:
                            href = link.attrs['href']
                            href = pre_clean_url(href)
                            nice_text = link.text.lower().strip()
                            if nice_text == 'rapport' or nice_text == 'rapport général':
                                step['source_url'] = urljoin(url_senat, href)
                                break

                if 'source_url' not in step and step.get('institution') == 'assemblee' and 'assemblee_legislature' in data:
                    legislature = data['assemblee_legislature']
                    text_num_match = re.search(r'(Texte|Rapport)\s*n°\s*(\d+)', item.text, re.I)
                    if text_num_match:
                        text_num = text_num_match.group(2)
                        url = None
                        if step.get('step') == 'commission':
                            url = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'
                        elif step.get('step') == 'depot':
                            if data.get('proposal_type') == 'PJL':
                                url = 'http://www.assemblee-nationale.fr/{}/projets/pl{:04d}.asp'
                            else:
                                url = 'http://www.assemblee-nationale.fr/{}/propositions/pion{:04d}.asp'
                        elif step.get('step') == 'hemicycle':
                            url = 'http://www.assemblee-nationale.fr/{}/ta/ta{:04d}.asp'

                        if url:
                            step['source_url'] = url.format(legislature, int(text_num))

            if 'source_url' not in step and not step.get('echec'):
                log_error('ITEM WITHOUT URL TO TEXT - %s.%s.%s' % (step['institution'], step.get('stage'), step.get('step')))

        # Decision Conseil Constitutionnel
        if curr_stage == 'constitutionnalité':
            # we try to find the decision in the paragraph or at the top of the dosleg
            decision_text = item.text
            if cc_line:
                decision_text += cc_line.text

            if 'partiellement conforme' in item.text:
                step['decision'] = 'partiellement conforme'
            elif 'se déclare incompétent' in item.text:
                step['decision'] = 'se déclare incompétent'
            elif 'non conforme' in item.text:
                step['decision'] = 'non conforme'
            elif 'conforme' in item.text:
                step['decision'] = 'conforme'
            else:
                log_error('WARNING: NO DECISION FOR CC')

        # look for Table de concordance
        if curr_stage == 'promulgation':
            for a in item.select('a'):
                if 'table de concordance' in a.text.lower():
                    table, errors = parse_table_concordance(clean_url(urljoin(url_senat, a.attrs['href'])))
                    data['table_concordance'] = table
                    if errors:
                        data['table_concordance_confusing_entries'] = errors

        # CMP commission has two urls: one for the Senate and one for the AN
        if step.get('stage') == 'CMP' and step.get('step') == 'commission':
            match = re.search(r"numéro de dépôt à l'Assemblée Nationale : (\d+)", clean_spaces(item.text))
            if match:
                text_num = int(match.group(1))
                step['cmp_commission_other_url'] = 'http://www.assemblee-nationale.fr/{}/ta-commission/r{:04d}-a0.asp'\
                                                        .format(data['assemblee_legislature'], text_num)

        steps_to_add = []
        if good_urls:
            for url in good_urls:
                sub_step = dict(**step)  # dubstep
                sub_step['source_url'] = url['url']
                sub_step['institution'] = url['institution']
                if url['date']:
                    sub_step['date'] = url['date']
                steps_to_add.append(sub_step)
        else:
            steps_to_add.append(step)

        # remove CMP.CMP.hemicycle if it's a fail
        if step.get('stage') == 'CMP' and step.get('step') == 'hemicycle':
            if not good_urls:
                last_step = data['steps'][-1]
                if data['steps'][-1].get('stage') == 'CMP' and step.get('step') == 'hemicycle':
                    if 'désaccord' in section_title:
                        last_step['echec'] = 'echec'
                    else:
                        log_error('CMP.hemicycle with no links and no fail indicated')
                    continue
            elif len(good_urls) != 2:
                log_error('CMP.hemicycle WITHOUT BOTH SENAT AND ASSEMBLEE')
                # todo: add empty missing step
                institutions_found = [url['institution'] for url in good_urls]
                if 'assemblee' not in institutions_found:
                    sub_step = dict(**step)  # dubstep
                    sub_step['source_url'] = None
                    sub_step['institution'] = 'assemblee'
                    steps_to_add.append(sub_step)

        # clean urls
        for step in steps_to_add:
            url = step.get('source_url')
            if url:
                step['source_url'] = clean_url(url)

        if len(steps_to_add) > 1:
            # multi-depot
            if step.get('step') == 'depot' and step.get('institution') == 'senat':
                # put real text as last depot
                steps_to_add = sorted(steps_to_add, key=lambda step: 1 if data.get('senat_id', '') in step.get('source_url', '') else 0)
                # if we are in a later step, the others depot steps must go at the top
                if len(data['steps']) > 0:
                    data['steps'] = steps_to_add[:-1] + data['steps']
                    steps_to_add = steps_to_add[-1:]
            # there can be multiple texts inside an hemicycle step, ok for CMP and multi-depots but not ok for other steps
            elif step.get('stage') != 'CMP':
                log_error('MULTIPLE TEXTS BUT NOT CMP.hemicycle - %s.%s.%s' % (step['institution'], step.get('stage'), step.get('step')))
                steps_to_add = [steps_to_add[-1]]

        data['steps'] += steps_to_add

    # if there's no url for the AN dosleg, try to find it via the text links
    if 'url_dossier_assemblee' not in data:
        an_url = find_an_url(data)
        if an_url:
            data['url_dossier_assemblee'] = an_url

    return data