def parse_legislature_list(url, html):
    """Extract Legislature entries from the page's h2 headers.

    :param url: source URL of the page (kept for a uniform parser signature)
    :param html: HTML content to parse
    :return: list of Legislature objects (number, start, end)
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    legislatures = []
    for header in soup.find_all('h2'):
        # Collapse whitespace runs so the date regexes match reliably.
        header_text = re.sub(r'\s+', ' ', header.get_text())
        # Each header starts with the legislature number in roman numerals.
        number = parse_roman(re.search('^[MDCLXVI]+', header_text).group(0))
        start = end = None
        open_ended = re.search(
            r'à compter du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', header_text)
        if open_ended:
            # Still-running legislature: only a start date is shown.
            start = parse_date(open_ended.group(1))
        else:
            closed = re.search(
                r'du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4}) '
                r'au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', header_text)
            if closed:
                start = parse_date(closed.group(1))
                end = parse_date(closed.group(2))
        legislatures.append(Legislature(number=number, start=start, end=end))
    return legislatures
def parse_legislature_list(url, html):
    """Extract Legislature entries from the page's h3 headers.

    :param url: source URL of the page (kept for a uniform parser signature)
    :param html: HTML content to parse
    :return: list of Legislature objects (number, start, end)
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    legislatures = []
    for header in soup.find_all('h3'):
        header_text = header.get_text()
        # Each header starts with the legislature number in roman numerals.
        number = parse_roman(re.search('^[MDCLXVI]+', header_text).group(0))
        start = end = None
        open_ended = re.search(
            r'A compter du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', header_text)
        if open_ended:
            # Still-running legislature: only a start date is shown.
            start = parse_date(open_ended.group(1))
        else:
            closed = re.search(
                r'du (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4}) '
                r'au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', header_text)
            if closed:
                start = parse_date(closed.group(1))
                end = parse_date(closed.group(2))
        legislatures.append(Legislature(number=number, start=start, end=end))
    return legislatures
def parse_code(self, url, html):
    """
    Parse the code details and TOC from the given HTML content

    :type url: str
    :param url: source URL of the page
    :type html: unicode
    :param html: Content of the HTML
    :return: the code
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    code = Code(self.id_code,
                date_pub=self.date_pub,
                url_code=cleanup_url(url))

    # -- Code title/subtitle
    code.title = soup.h1.text.strip()
    code.subtitle = soup.find('div', {'class': 'vigor-title'}).text.strip()
    date_match = re.search(
        r'Version (?:en vigueur au|abrogée depuis le) '
        r'(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
        code.subtitle)
    if date_match:
        # Prefer the date displayed on the page over the one we were given.
        code.date_pub = parse_date(date_match.group(1))

    # -- TOC
    toc = soup.find('ul', id='liste-sommaire')
    code.children = []
    for part in toc.find_all('li', recursive=False):
        code.children.append(self.parse_toc_element(url, part))
    return code
def parse_law(url, html, id_legi):
    """Parse a law dossier page into a Law object.

    :param url: source URL of the page
    :param html: HTML content of the page
    :param id_legi: Légifrance identifier of the dossier
    :return: a Law instance, or None when the page carries no usable title
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

    law = Law(
        url_legi=cleanup_url(url),
        id_legi=id_legi
    )

    clean_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()
    if len(law.title) == 0:
        return None

    title_remain = None

    # "LOI <kind> n° <number> <rest>" → type / kind / number
    law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)',
                       law.title, re.I)
    if law_num:
        law.type = 'law'
        law.kind = law_num.group(1)
        law.number = law_num.group(2)
        title_remain = law_num.group(3)

    # "projet de loi <kind>" / "proposition de loi <kind>"
    prop = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                    law.title, re.I)
    if prop:
        law.type = prop.group(1).lower()
        # Membership test instead of LAW_KINDS.index() wrapped in
        # try/except ValueError — same result, clearer intent.
        law.kind = prop.group(2) if prop.group(2) in LAW_KINDS else None

    if title_remain:
        pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                            title_remain)
        if pub_date:
            law.pub_date = parse_date(pub_date.group(1))

    # Anchors without an href would make e['href'] raise KeyError inside
    # the find() predicates; e.get() lets such anchors be skipped instead.
    dos_senat = soup.find(lambda e: e.name == 'a' and (
        re.search(r'/dossier-legislatif/', e.get('href', '')) or
        re.search(r'/dossierleg/', e.get('href', ''))))
    if dos_senat:
        law.url_senat = dos_senat['href'].split('#')[0]
        law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)

    dos_an = soup.find(lambda e: e.name == 'a' and
                       re.search(r'/dossiers/', e.get('href', '')))
    if dos_an:
        law.url_an = dos_an['href'].split('#')[0]
        law.legislature = int(
            re.search(r'/(\d+)/dossiers/', law.url_an).group(1))
        law.id_an = re.search(r'([^/]+)\.asp$', law.url_an).group(1)

    return law
def parse_law(url, html, id_legi):
    """Build a Law object from a Légifrance dossier page.

    :param url: source URL of the page
    :param html: HTML content of the page
    :param id_legi: Légifrance identifier of the dossier
    :return: a Law instance, or None when no title could be extracted
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    law = Law(url_legi=cleanup_url(url), id_legi=id_legi)

    raw_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', raw_title).strip()
    if not law.title:
        return None

    title_remain = None

    # "LOI <kind> n° <number> <rest>" → type / kind / number
    num_match = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)',
                         law.title, re.I)
    if num_match:
        law.type = 'law'
        law.kind, law.number, title_remain = num_match.group(1, 2, 3)

    # "projet de loi <kind>" / "proposition de loi <kind>"
    prop_match = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                          law.title, re.I)
    if prop_match:
        law.type = prop_match.group(1).lower()
        try:
            LAW_KINDS.index(prop_match.group(2))
        except ValueError:
            # not in list
            law.kind = None
        else:
            law.kind = prop_match.group(2)

    if title_remain:
        date_match = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                              title_remain)
        if date_match:
            law.pub_date = parse_date(date_match.group(1))

    def _is_senat_link(tag):
        # Senate dossiers live under /dossier-legislatif/ or /dossierleg/.
        return tag.name == 'a' and (
            re.search(r'/dossier-legislatif/', tag['href']) or
            re.search(r'/dossierleg/', tag['href']))

    senat_link = soup.find(_is_senat_link)
    if senat_link:
        law.url_senat = senat_link['href'].split('#')[0]
        law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)

    an_link = soup.find(
        lambda tag: tag.name == 'a' and re.search(r'/dossiers/', tag['href']))
    if an_link:
        law.url_an = an_link['href'].split('#')[0]
        law.legislature = int(
            re.search(r'/(\d+)/dossiers/', law.url_an).group(1))
        law.id_an = re.search(r'([^/]+)\.asp$', law.url_an).group(1)

    return law
def parse_code(self, url, html):
    """
    Parse the code details and TOC from the given HTML content

    :type url: str
    :param url: source URL of the page
    :type html: unicode
    :param html: Content of the HTML
    :return: the code
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

    # -- main text
    content = soup.find('div', id='content_false')
    data_div = content.find('div', attrs={'class': 'data'})

    code = Code(self.id_code,
                date_pub=self.date_pub,
                url_code=cleanup_url(url))

    # -- Code title/subtitle
    title_div = data_div.find('div', id='titreTexte')
    subtitle_span = title_div.find('span',
                                   attrs={'class': 'sousTitreTexte'})
    if subtitle_span:
        # The subtitle span is nested in the title block: drop its text
        # from the title and keep it separately as the subtitle.
        code.title = title_div.text.replace(subtitle_span.text, '')
        code.subtitle = subtitle_span.text.strip()
        date_match = re.search(
            r'Version consolidée au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
            code.subtitle)
        if date_match:
            code.date_pub = parse_date(date_match.group(1))

    code.title = code.title.strip()

    # -- TOC
    code.children = [self.parse_code_ul(url, toc_ul)
                     for toc_ul in data_div.find_all('ul', recursive=False)]

    return code
def parse_published_law_list(url, html):
    """Parse the list of published laws, grouped under h3 year headers.

    :param url: source URL of the page (used to resolve relative links)
    :param html: HTML content of the page
    :return: list of Law objects, one per recognised "LOI ..." link
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())

        ul = year_header.find_next_sibling('ul')
        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text()
            law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                               link_text, re.I)
            if not law_num:
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            qs_legi = parse_qs(urlparse(url_legi).query)

            # The publication date follows the link as a bare text node.
            # next_sibling may be None or a Tag, which used to crash
            # re.match()/concatenation; fall back to '' in that case
            # (NavigableString subclasses str, so it passes the check).
            title = law_entry.next_sibling
            if not isinstance(title, str):
                title = ''
            pub_date = re.match(
                r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', title)

            results.append(Law(
                year=year,
                legislature=int(qs_legi['legislature'][0]),
                number=law_num.group(2),
                type='law',
                kind=law_num.group(1),
                pub_date=parse_date(pub_date.group(1)) if pub_date else None,
                title=merge_spaces(link_text + title),
                url_legi=url_legi,
                id_legi=qs_legi['idDocument'][0]
            ))

    return results
def parse_published_law_list(url, html, **law_args):
    """Parse the list of published laws, grouped under h2 year headers.

    :param url: source URL of the page (used to resolve relative links)
    :param html: HTML content of the page
    :param law_args: extra keyword arguments forwarded to each Law
    :return: list of Law objects, one per recognised "LOI ..." link
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    laws = []

    for header in soup.find_all('h2'):
        year = int(header.get_text().strip())

        year_list = header.find_next('ul')
        if not year_list:
            print('No ul found')
            continue

        for anchor in year_list.select('li a'):
            text = anchor.get_text().strip()
            num_match = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                                 text, re.I)
            if not num_match:
                continue

            href = anchor['href']
            url_legi = cleanup_url(urljoin(url, href))
            # The identifier is the second path segment of the link.
            id_legi = href.strip('/').split('/')[1]

            # Look for "du <date>" just after the matched "LOI n° ..." part.
            remainder = text[len(num_match.group(0)):]
            date_match = re.match(
                r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', remainder)

            laws.append(Law(
                year=year,
                number=num_match.group(2),
                type='law',
                kind=num_match.group(1),
                pub_date=(parse_date(date_match.group(1))
                          if date_match else None),
                title=merge_spaces(text),
                url_legi=url_legi,
                id_legi=id_legi,
                **law_args))

    return laws
def parse_published_law_list(url, html):
    """Collect Law objects from a page of published laws grouped by year.

    :param url: source URL of the page (used to resolve relative links)
    :param html: HTML content of the page
    :return: list of Law objects, one per recognised "LOI ..." link
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    laws = []

    for header in soup.find_all('h3'):
        year = int(header.get_text())

        listing = header.find_next_sibling('ul')
        if not listing:
            continue

        for anchor in listing.select('li a'):
            text = anchor.get_text()
            num_match = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                                 text, re.I)
            if not num_match:
                continue

            url_legi = cleanup_url(urljoin(url, anchor['href']))
            query = parse_qs(urlparse(url_legi).query)

            # The publication date sits in the text node right after the link.
            tail = anchor.next_sibling
            date_match = re.match(
                r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})', tail)

            laws.append(Law(
                year=year,
                legislature=int(query['legislature'][0]),
                number=num_match.group(2),
                type='law',
                kind=num_match.group(1),
                pub_date=(parse_date(date_match.group(1))
                          if date_match else None),
                title=merge_spaces(text + tail),
                url_legi=url_legi,
                id_legi=query['idDocument'][0]))

    return laws