def parse_code_ul(self, url, ul):
    """Parse one TOC <ul> element into a Section, recursing into nested lists."""
    li_list = ul.find_all('li', recursive=False)
    li = li_list[0]
    span_title = li.find('span',
                         attrs={'class': re.compile(r'TM\d+Code')},
                         recursive=False)
    section = Section(span_title.attrs['id'], span_title.text.strip())

    div_italic = li.find('div', attrs={'class': 'italic'}, recursive=False)
    if div_italic:
        section.content = div_italic.text.strip()

    span_link = li.find('span', attrs={'class': 'codeLienArt'},
                        recursive=False)
    if span_link:
        a_link = span_link.find('a', recursive=False)
        if self.with_articles:
            service = self.section_service
            section.articles = service.articles(self.id_code,
                                                section.id_section,
                                                self.date_pub)
        else:
            section.articles = a_link.text.strip()
        section.url_section = cleanup_url(urljoin(url, a_link.attrs['href']))

    section.children = [self.parse_code_ul(url, child)
                        for child in li.find_all('ul', recursive=False)]

    return section
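# Hedged usage sketch: the Section tree built by parse_code_ul can be walked
# recursively. `print_toc` is a hypothetical helper, not part of the project;
# it only assumes the Section attributes used above (id_section, title,
# children).
def print_toc(section, depth=0):
    """Print a parsed Section tree as an indented outline."""
    print('  ' * depth + '%s %s' % (section.id_section, section.title))
    for child in section.children:
        print_toc(child, depth + 1)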
def parse_pending_law_list(url, html):
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())

        ul = year_header.find_next_sibling('ul')
        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text()
            nor_num = re.search(r'\(([A-Z0-9]+)\)$', link_text)
            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            qs_legi = parse_qs(urlparse(url_legi).query)

            results.append(Law(
                year=year,
                legislature=int(qs_legi['legislature'][0]),
                type=qs_legi['typeLoi'][0],
                title=merge_spaces(link_text),
                nor=nor_num.group(1) if nor_num else None,
                url_legi=url_legi,
                id_legi=qs_legi['idDocument'][0]
            ))

    return results
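# Sketch of the query-string parsing used above. The sample URL and its
# idDocument value are illustrative assumptions; the parameter names
# (legislature, typeLoi, idDocument) are the ones parse_pending_law_list
# actually reads.
from urllib.parse import parse_qs, urlparse

sample = ('https://www.legifrance.gouv.fr/affichLoiPreparation.do'
          '?idDocument=JORFDOLE000000000000&typeLoi=proj&legislature=15')
qs = parse_qs(urlparse(sample).query)
assert qs['typeLoi'] == ['proj']
assert int(qs['legislature'][0]) == 15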
def parse_code(self, url, html):
    """
    Parse the code details and TOC from the given HTML content

    :type url: str
    :param url: source URL of the page
    :type html: unicode
    :param html: Content of the HTML
    :return: the code
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    code = Code(self.id_code,
                date_pub=self.date_pub,
                url_code=cleanup_url(url))

    # -- Code title/subtitle
    code.title = soup.h1.text.strip()
    code.subtitle = soup.find('div', {'class': 'vigor-title'}).text.strip()

    regex = (r'Version (?:en vigueur au|abrogée depuis le) '
             r'(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})')
    m = re.search(regex, code.subtitle)
    if m:
        code.date_pub = parse_date(m.group(1))

    # -- TOC
    toc = soup.find('ul', id='liste-sommaire')
    code.children = [self.parse_toc_element(url, partie)
                     for partie in toc.find_all('li', recursive=False)]

    return code
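# Hedged usage sketch for parse_code: fetch a code page and parse it.
# `requests`, the URL, and the `scraper` instance are assumptions for
# illustration; the class exposing parse_code is constructed elsewhere
# with its id_code and date_pub.
import requests

url = 'https://www.legifrance.gouv.fr/codes/texte_lc/LEGITEXT000006070721'
html = requests.get(url).text
code = scraper.parse_code(url, html)  # `scraper`: assumed parser instance
print(code.title, code.date_pub)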
def parse_pending_law_list(url, html, **law_kwargs):
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h2'):
        year = int(year_header.get_text().strip())

        ul = year_header.find_next('ul')
        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text().strip()
            nor_num = re.search(r'\(([A-Z0-9]+)\)$', link_text)
            type_loi = re.match(r'(Projet|Proposition)\s+de\s+loi\s+({})?'
                                .format('|'.join(LAW_KINDS)), link_text)
            if not type_loi:
                # Entry is neither a "projet" nor a "proposition": skip it
                # rather than crash on type_loi.group() below.
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            id_legi = urlparse(url_legi).path.strip('/').split('/')[-1]

            results.append(Law(
                year=year,
                id_legi=id_legi,
                type=type_loi.group(1).lower()[:4],
                kind=type_loi.group(2),  # optional LAW_KINDS qualifier
                title=merge_spaces(link_text),
                nor=nor_num.group(1) if nor_num else None,
                url_legi=url_legi,
                **law_kwargs
            ))

    return results
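# Sketch of the type/kind regex above against a sample link text. The
# LAW_KINDS stand-in value and the sample string are assumptions; the real
# list lives elsewhere in the project.
import re

LAW_KINDS = ('constitutionnelle', 'organique')  # assumed stand-in
pattern = r'(Projet|Proposition)\s+de\s+loi\s+({})?'.format('|'.join(LAW_KINDS))
m = re.match(pattern, 'Projet de loi organique relatif aux juridictions')
assert m.group(1).lower()[:4] == 'proj'   # -> Law.type
assert m.group(2) == 'organique'          # -> Law.kind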
def parse_law(url, html, id_legi):
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    law = Law(
        url_legi=cleanup_url(url),
        id_legi=id_legi
    )

    clean_title = merge_spaces(soup.h2.get_text()).strip()
    law.title = re.sub(r'^Dossiers législatifs( - )?', '', clean_title).strip()
    if len(law.title) == 0:
        return None

    title_remain = None

    # Published law: "LOI [kind] n° <number> ..."
    law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)',
                       law.title, re.I)
    if law_num:
        law.type = 'law'
        law.kind = law_num.group(1)
        law.number = law_num.group(2)
        title_remain = law_num.group(3)

    # Pending law: "Projet de loi ..." / "Proposition de loi ..."
    prop = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)',
                    law.title, re.I)
    if prop:
        law.type = prop.group(1).lower()
        law.kind = prop.group(2) if prop.group(2) in LAW_KINDS else None

    if title_remain:
        pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                            title_remain)
        if pub_date:
            law.pub_date = parse_date(pub_date.group(1))

    # Link to the Sénat dossier, if any (use .get() so anchors without an
    # href attribute do not raise a KeyError).
    dos_senat = soup.find(lambda e: e.name == 'a' and (
        re.search(r'/dossier-legislatif/', e.get('href', '')) or
        re.search(r'/dossierleg/', e.get('href', ''))))
    if dos_senat:
        law.url_senat = dos_senat['href'].split('#')[0]
        law.id_senat = re.search(r'([^/]+)\.html$', law.url_senat).group(1)

    # Link to the Assemblée nationale dossier, if any
    dos_an = soup.find(lambda e: e.name == 'a' and
                       re.search(r'/dossiers/', e.get('href', '')))
    if dos_an:
        law.url_an = dos_an['href'].split('#')[0]
        law.legislature = int(
            re.search(r'/(\d+)/dossiers/', law.url_an).group(1))
        law.id_an = re.search(r'([^/]+)\.asp$', law.url_an).group(1)

    return law
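# Sketch of the title regexes in parse_law against illustrative dossier
# titles (the sample strings are assumptions, not scraped data).
import re

published = 'LOI organique n° 2020-365 du 30 mars 2020 ...'
m = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)(.*)', published, re.I)
assert (m.group(1), m.group(2)) == ('organique', '2020-365')

pending = 'Projet de loi organique pour la confiance ...'
m = re.match(r'(proj|prop)(?:et de loi|osition de loi) (\w+)', pending, re.I)
assert m.group(1).lower() == 'proj'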
def parse_common_law_list(url, html):
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    div = soup.find('div', {'id': 'content_right'})
    ul = div.find('ul')

    # 'dite : loi "..."' -> primary common name
    re_find_common = re.compile(r'dite?[: ]+(?:loi )?"\s*([^"]+?)\s*"', re.I)
    # '... ou "..."' -> alternative common name
    re_find_second = re.compile(r'"\s*ou ((?:loi )?)"\s*([^"]+?)\s*"', re.I)

    for law_entry in ul.select('li'):
        link = law_entry.find('a')
        if not link:
            continue

        link_text = _clean_typos_legifrance(law_entry.get_text())
        nor_num = re.search(r'NOR\s*([A-Z0-9]+)\n', link_text)
        url_legi = cleanup_url(urljoin(url, link['href']))
        qs_legi = parse_qs(urlparse(url_legi).query)

        text_parts = link_text.strip("\n\r\t )").split('\n')
        title = merge_spaces(text_parts[0])
        common_text = merge_spaces(text_parts[-1]).strip("() ")

        # Fall back to the raw parenthetical when no quoted name is found
        match = re_find_common.search(common_text)
        common = match.group(1) if match else common_text

        second = re_find_second.search(common_text)
        if second:
            common += " ; %s" % "".join(second.groups())

        results.append(Law(
            title=title,
            common_name=common.replace('Loi', 'loi'),
            nor=nor_num.group(1) if nor_num else None,
            url_legi=url_legi,
            id_legi=qs_legi['cidTexte'][0]
        ))

    return results
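# Sketch of the common-name extraction above. The sample parenthetical is
# an illustrative assumption shaped like the Legifrance listing entries.
import re

re_find_common = re.compile(r'dite?[: ]+(?:loi )?"\s*([^"]+?)\s*"', re.I)
sample = 'dite : loi "informatique et libertés"'
assert re_find_common.search(sample).group(1) == 'informatique et libertés'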
def parse_code(self, url, html):
    """
    Parse the code details and TOC from the given HTML content

    :type url: str
    :param url: source URL of the page
    :type html: unicode
    :param html: Content of the HTML
    :return: the code
    """
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')

    # -- main text
    div = (soup
           .find('div', id='content_false')
           .find('div', attrs={'class': 'data'}))

    code = Code(self.id_code,
                date_pub=self.date_pub,
                url_code=cleanup_url(url))

    # -- Code title/subtitle
    div_title = div.find('div', id='titreTexte')
    span_subtitle = div_title.find('span',
                                   attrs={'class': 'sousTitreTexte'})
    if span_subtitle:
        code.title = div_title.text.replace(span_subtitle.text, '')
        code.subtitle = span_subtitle.text.strip()
        regex = r'Version consolidée au (\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})'
        m = re.search(regex, code.subtitle)
        if m:
            code.date_pub = parse_date(m.group(1))

    code.title = code.title.strip()

    # -- TOC
    code.children = [self.parse_code_ul(url, child)
                     for child in div.find_all('ul', recursive=False)]

    return code
def parse_published_law_list(url, html):
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h3'):
        year = int(year_header.get_text())

        ul = year_header.find_next_sibling('ul')
        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text()
            law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                               link_text, re.I)
            if not law_num:
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            qs_legi = parse_qs(urlparse(url_legi).query)
            title = law_entry.next_sibling
            pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                                title)

            results.append(Law(
                year=year,
                legislature=int(qs_legi['legislature'][0]),
                number=law_num.group(2),
                type='law',
                kind=law_num.group(1),
                pub_date=parse_date(pub_date.group(1)) if pub_date else None,
                title=merge_spaces(link_text + title),
                url_legi=url_legi,
                id_legi=qs_legi['idDocument'][0]
            ))

    return results
def parse_published_law_list(url, html, **law_args):
    soup = BeautifulSoup(html, 'html5lib', from_encoding='utf-8')
    results = []

    for year_header in soup.find_all('h2'):
        year = int(year_header.get_text().strip())

        ul = year_header.find_next('ul')
        if not ul:
            continue

        for law_entry in ul.select('li a'):
            link_text = law_entry.get_text().strip()
            law_num = re.match(r'LOI\s+(?:([^\s]+)\s+)?n°\s+([^\s]+)',
                               link_text, re.I)
            if not law_num:
                continue

            url_legi = cleanup_url(urljoin(url, law_entry['href']))
            id_legi = law_entry['href'].strip('/').split('/')[1]
            # The publication date follows the law number in the link text
            pub_date = re.match(r'\s*du\s+(\d{1,2}(?:er)?\s+[^\s]+\s+\d{4})',
                                link_text[len(law_num.group(0)):])

            results.append(Law(
                year=year,
                number=law_num.group(2),
                type='law',
                kind=law_num.group(1),
                pub_date=parse_date(pub_date.group(1)) if pub_date else None,
                title=merge_spaces(link_text),
                url_legi=url_legi,
                id_legi=id_legi,
                **law_args
            ))

    return results
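# Hedged usage sketch for the published-law parser. `requests` and the
# listing URL are assumptions for illustration, and the loop assumes the
# Law model exposes its fields as attributes, as parse_law suggests.
import requests

url = 'https://www.legifrance.gouv.fr/liste/lois'  # assumed listing page
laws = parse_published_law_list(url, requests.get(url).text)
for law in laws:
    print(law.year, law.number, law.title)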