def _get_body_from_html(
    self, list_name: str, url: str, soup: BeautifulSoup
) -> Union[str, None]:
    """
    Lexer for the message body/payload.
    This method first looks whether the body is available in text/plain,
    before it looks for the text/html option. If neither is available it
    returns None. Therefore this method does not try to return the richest
    information content, but simply the ASCII format.

    Parameters
    ----------
    list_name : The name of the LISTSERV Email list.
    url : URL to the Email.
    soup : HTML code from which the Email body can be obtained.
    """
    # TODO re-write using email.parser.Parser
    url_root = ("/").join(url.split("/")[:-2])
    a_tags = soup.select(f'a[href*="A3="][href*="{list_name}"]')
    href_plain_text = [
        tag.get("href") for tag in a_tags if "Fplain" in tag.get("href")
    ]
    href_html_text = [
        tag.get("href") for tag in a_tags if "Fhtml" in tag.get("href")
    ]
    try:
        if href_plain_text:
            body_soup = get_website_content(
                urljoin(url_root, href_plain_text[0])
            )
            if body_soup == "RequestException":
                return body_soup
            else:
                return body_soup.find("pre").text
        elif href_html_text:
            body_soup = get_website_content(
                urljoin(url_root, href_html_text[0])
            )
            if body_soup == "RequestException":
                return body_soup
            else:
                return body_soup.get_text(strip=True)
    except Exception:
        logger.exception(
            f"The message body of {url} which is part of the "
            f"list {list_name} could not be loaded."
        )
        return None

def get_lists_from_url(
    name: str,
    select: dict,
    url_root: str,
    url_home: Optional[str] = None,
    instant_save: bool = True,
    only_mlist_urls: bool = True,
) -> List[Union[W3CMailList, str]]:
    """Docstring in `AbstractMailListDomain`."""
    archive = []
    if url_home is None:
        soup = get_website_content(url_root)
    else:
        soup = get_website_content(url_home)
    mlist_urls = [
        urljoin(url_root, h3_tag.select("a")[0].get("href"))
        for h3_tag in soup.select("h3")
        if h3_tag.select("a")
    ]
    mlist_urls = list(set(mlist_urls))  # remove duplicates

    if only_mlist_urls:
        # collect mailing-list urls
        for mlist_url in tqdm(mlist_urls, ascii=True):
            # check if mailing list contains messages in period
            _period_urls = W3CMailList.get_all_periods_and_their_urls(
                mlist_url
            )[1]
            # check if mailing list is public
            if len(_period_urls) > 0:
                archive.append(mlist_url)
    else:
        # collect mailing-list contents
        for mlist_url in mlist_urls:
            mlist_name = W3CMailList.get_name_from_url(mlist_url)
            mlist = W3CMailList.from_url(
                name=mlist_name,
                url=mlist_url,
                select=select,
            )
            if len(mlist) != 0:
                if instant_save:
                    dir_out = CONFIG.mail_path + name
                    Path(dir_out).mkdir(parents=True, exist_ok=True)
                    mlist.to_mbox(dir_out=dir_out)
                    archive.append(mlist.name)
                else:
                    logger.info(f"Recorded the list {mlist.name}.")
                    archive.append(mlist)
    return archive

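# Illustrative usage sketch (not part of the library): collect the URLs of
# all public W3C mailing lists via the method above. The class name
# `W3CMailListDomain` and the domain URL are assumptions; `select` is not
# consulted on the `only_mlist_urls=True` path, so an empty dict suffices.
def _example_collect_w3c_list_urls() -> List[str]:
    return W3CMailListDomain.get_lists_from_url(
        name="W3C",
        select={},
        url_root="https://lists.w3.org/Archives/Public/",
        only_mlist_urls=True,
    )
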
def from_url(
    self,
    list_name: str,
    url: str,
    fields: str = "total",
) -> Message:
    """
    Parameters
    ----------
    list_name : The name of the mailing list.
    url : URL of this Email.
    fields : Indicates whether to return 'header', 'body', or 'total'/both
        of the Email. The latter is the default.
    """
    soup = get_website_content(url, session=self.session)
    if soup == "RequestException":
        body = "RequestException"
        header = self.empty_header
    else:
        if fields in ["header", "total"]:
            header = self._get_header_from_html(soup)
        else:
            header = self.empty_header
        if fields in ["body", "total"]:
            body = self._get_body_from_html(list_name, url, soup)
        else:
            body = None
    return self.create_email_message(url, body, **header)

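# Illustrative usage sketch (not part of the library): fetch only the header
# of a single Email. `parser` is assumed to be an instance of the message
# parser class that defines `from_url` above; the list name and URL are
# placeholders.
def _example_parse_message_header(parser) -> Message:
    return parser.from_url(
        list_name="3GPP_TSG_RAN_WG1",
        url="https://list.example.org/cgi-bin/wa?A2=ind2101&L=3GPP_TSG_RAN_WG1",
        fields="header",
    )
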
def get_sections(url_root: str, url_home: str) -> dict:
    """
    Get the different sections of the mail list domain.
    On the Listserv 16.5 website they look like:
    [3GPP] [3GPP–AT1] [AT2–CONS] [CONS–EHEA] [EHEA–ERM_] ...
    On the Listserv 17 website they look like:
    [<<][<]1-50(798)[>][>>]

    Returns
    -------
    If sections exist, a dictionary mapping their URLs to their names.
    Otherwise a dictionary containing only the url_home.
    """
    soup = get_website_content(url_home)
    sections = soup.select(
        'a[href*="INDEX="][href*="p="]',
    )
    archive_sections_dict = {}
    if sections:
        for sec in sections:
            key = urljoin(url_root, sec.get("href"))
            value = sec.text
            if value in ["Next", "Previous"]:
                continue
            archive_sections_dict[key] = value
        archive_sections_dict[re.sub(r"p=[0-9]+", "p=1", key)] = "FIRST"
    else:
        archive_sections_dict[url_home] = "Home"
    return archive_sections_dict

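# Illustrative usage sketch (not part of the library): iterate over the
# section URLs discovered by `get_sections`. The domain URLs are placeholders.
def _example_iterate_sections() -> None:
    sections = ListservMailListDomain.get_sections(
        url_root="https://list.example.org/cgi-bin/wa.exe",
        url_home="https://list.example.org/cgi-bin/wa.exe?HOME",
    )
    for section_url, section_label in sections.items():
        # keys are section URLs, values their labels (e.g. "FIRST" or "Home")
        print(section_label, section_url)
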
def get_message_urls_from_mlist_url(name: str, url: str) -> List[str]:
    # Collect the message URLs ("A2=" links) listed on a mailing-list page.
    url_root = ("/").join(url.split("/")[:-2])
    soup = get_website_content(url)
    a_tags = soup.select(f'a[href*="A2="][href*="{name}"]')
    if a_tags:
        a_tags = [urljoin(url_root, a_tag.get("href")) for a_tag in a_tags]
    return a_tags

def get_all_periods_and_their_urls(
    url: str,
) -> Tuple[List[str], List[str]]:
    """
    W3C groups messages into monthly time bundles. This method obtains all
    the URLs that lead to the messages of each time bundle.

    Returns
    -------
    Returns a tuple of two lists that look like:
    (['April 2017', 'January 2001', ...], ['url1', 'url2', ...])
    """
    # wait between loading messages, for politeness
    time.sleep(0.5)
    soup = get_website_content(url)
    periods = []
    urls_of_periods = []
    rows = soup.select("tbody tr")
    for row in rows:
        link = row.select("td:nth-of-type(1) a")
        if len(link) > 0:
            link = link[0]
        else:
            continue
        periods.append(link.text)
        urls_of_periods.append(url + "/" + link.get("href"))
    return periods, urls_of_periods

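# Illustrative usage sketch (not part of the library): keep only the period
# URLs of a single year. `mlist_url` is a placeholder; the call pattern
# follows the method above.
def _example_filter_period_urls_by_year(
    mlist_url: str, year: str = "2021"
) -> List[str]:
    periods, period_urls = W3CMailList.get_all_periods_and_their_urls(mlist_url)
    return [
        period_url
        for period, period_url in zip(periods, period_urls)
        if year in period
    ]
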
def get_message_urls_from_period_url(name: str, url: str) -> List[str]:
    # Collect the message URLs listed on a W3C period (monthly index) page.
    soup = get_website_content(url)
    a_tags = soup.select("div.messages-list a")
    if a_tags:
        a_tags = [
            urljoin(url, a_tag.get("href"))
            for a_tag in a_tags
            if a_tag.get("href") is not None
        ]
    return a_tags

def get_messages_urls(cls, name: str, url: str) -> List[str]:
    """
    Parameters
    ----------
    name : Name of the W3C mailing list.
    url : URL to group of messages that are within the same period.

    Returns
    -------
    List of URLs from which `mboxMessage` can be initialized.
    """
    soup = get_website_content(url)
    if soup == "RequestException":
        return []
    else:
        a_tags = soup.select("div.messages-list a")
        if a_tags:
            a_tags = [
                urljoin(url, a_tag.get("href"))
                for a_tag in a_tags
                if a_tag.get("href") is not None
            ]
        return a_tags

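# Illustrative usage sketch (not part of the library): gather the message
# URLs of one W3C list across all of its monthly periods, combining the two
# class methods above. `W3CMailList` as the defining class is an assumption;
# `name` and `mlist_url` are placeholders.
def _example_collect_w3c_message_urls(name: str, mlist_url: str) -> List[str]:
    msg_urls = []
    _, period_urls = W3CMailList.get_all_periods_and_their_urls(mlist_url)
    for period_url in period_urls:
        msg_urls.extend(W3CMailList.get_messages_urls(name=name, url=period_url))
    return msg_urls
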
def get_all_periods_and_their_urls(
    url: str,
) -> Tuple[List[str], List[str]]:
    """
    LISTSERV groups messages into weekly time bundles. This method obtains
    all the URLs that lead to the messages of each time bundle.

    Returns
    -------
    Returns a tuple of two lists that look like:
    (['April 2017, 2', 'January 2001', ...], ['url1', 'url2', ...])
    """
    # wait between loading messages, for politeness
    time.sleep(0.5)
    url_root = ("/").join(url.split("/")[:-2])
    soup = get_website_content(url)
    periods = [list_tag.find("a").text for list_tag in soup.find_all("li")]
    urls_of_periods = [
        urljoin(url_root, list_tag.find("a").get("href"))
        for list_tag in soup.find_all("li")
    ]
    return periods, urls_of_periods

def get_lists_from_url(
    url_root: str,
    url_home: str,
    select: dict,
    session: Optional[str] = None,
    instant_save: bool = True,
    only_mlist_urls: bool = True,
) -> List[Union[ListservMailList, str]]:
    """Docstring in `AbstractMailList`."""
    archive = []
    # run through archive sections
    for url in list(
        ListservMailListDomain.get_sections(url_root, url_home).keys()
    ):
        soup = get_website_content(url)
        a_tags_in_section = soup.select(
            f'a[href^="{urlparse(url).path}?A0="]',
        )
        mlist_urls = [
            urljoin(url_root, a_tag.get("href"))
            for a_tag in a_tags_in_section
        ]
        mlist_urls = list(set(mlist_urls))  # remove duplicates

        if only_mlist_urls:
            # collect mailing-list urls
            for mlist_url in mlist_urls:
                name = ListservMailList.get_name_from_url(mlist_url)
                # check if mailing list contains messages in period
                _period_urls = ListservMailList.get_all_periods_and_their_urls(
                    mlist_url
                )[1]
                # check if mailing list is public
                if len(_period_urls) > 0:
                    for _period_url in _period_urls:
                        nr_msgs = len(
                            ListservMailList.get_messages_urls(
                                name=name, url=_period_url
                            )
                        )
                        if nr_msgs > 0:
                            archive.append(mlist_url)
                            break
        else:
            # collect mailing-list contents
            for mlist_url in mlist_urls:
                name = ListservMailList.get_name_from_url(mlist_url)
                mlist = ListservMailList.from_url(
                    name=name,
                    url=mlist_url,
                    select=select,
                    session=session,
                )
                if len(mlist) != 0:
                    if instant_save:
                        dir_out = CONFIG.mail_path + name
                        Path(dir_out).mkdir(parents=True, exist_ok=True)
                        mlist.to_mbox(dir_out=dir_out)
                        archive.append(mlist.name)
                    else:
                        logger.info(f"Recorded the list {mlist.name}.")
                        archive.append(mlist)
    return archive

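# Illustrative usage sketch (not part of the library): scrape and save every
# non-empty mailing list of a LISTSERV domain. The class name follows the
# `ListservMailListDomain.get_sections` call above; the URLs and the `select`
# filter are placeholders.
def _example_scrape_listserv_domain() -> List[Union[ListservMailList, str]]:
    return ListservMailListDomain.get_lists_from_url(
        url_root="https://list.example.org/cgi-bin/wa.exe",
        url_home="https://list.example.org/cgi-bin/wa.exe?HOME",
        select={"fields": "total"},
        instant_save=True,
        only_mlist_urls=False,
    )
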