コード例 #1
0
ファイル: listserv.py プロジェクト: nllz/bigbang
    def _get_body_from_html(
        self, list_name: str, url: str, soup: BeautifulSoup
    ) -> Union[str, None]:
        """
        Extract the message body/payload from an Email page.

        The text/plain version of the body is preferred; only if it is
        absent is the text/html version used. If neither is available,
        None is returned. Thus this method does not try to return the
        richest information content, but simply the ascii format.

        Parameters
        ----------
        list_name : The name of the LISTSERV Email list.
        url : URL to the Email.
        soup : HTML code from which the Email body can be obtained.
        """
        # TODO re-write using email.parser.Parser
        url_root = "/".join(url.split("/")[:-2])
        anchors = soup.select(f'a[href*="A3="][href*="{list_name}"]')
        plain_hrefs = []
        html_hrefs = []
        # Sort the candidate links into plain-text and HTML variants.
        for anchor in anchors:
            href = anchor.get("href")
            if "Fplain" in href:
                plain_hrefs.append(href)
            if "Fhtml" in href:
                html_hrefs.append(href)
        try:
            if plain_hrefs:
                body_soup = get_website_content(
                    urljoin(url_root, plain_hrefs[0])
                )
                # A failed download yields a sentinel string, not soup.
                if body_soup == "RequestException":
                    return body_soup
                return body_soup.find("pre").text
            if html_hrefs:
                body_soup = get_website_content(
                    urljoin(url_root, html_hrefs[0])
                )
                if body_soup == "RequestException":
                    return body_soup
                return body_soup.get_text(strip=True)
        except Exception:
            logger.exception(
                f"The message body of {url} which is part of the "
                f"list {list_name} could not be loaded."
            )
            return None
コード例 #2
0
    def get_lists_from_url(
        name: str,
        select: dict,
        url_root: str,
        url_home: Optional[str] = None,
        instant_save: bool = True,
        only_mlist_urls: bool = True,
    ) -> List[Union[W3CMailList, str]]:
        """Docstring in `AbstractMailListDomain`."""
        archive = []
        # Use the dedicated home page when given; otherwise fall back to
        # the archive root.
        if url_home is None:
            soup = get_website_content(url_root)
        else:
            soup = get_website_content(url_home)
        # Each <h3> that wraps an anchor points to one mailing list.
        mlist_urls = [
            urljoin(url_root, h3_tag.select("a")[0].get("href"))
            for h3_tag in soup.select("h3")
            if h3_tag.select("a")
        ]
        mlist_urls = list(set(mlist_urls))  # remove duplicates

        if only_mlist_urls:
            # collect mailing-list urls
            for mlist_url in tqdm(mlist_urls, ascii=True):
                # check if mailing list contains messages in period
                _period_urls = W3CMailList.get_all_periods_and_their_urls(
                    mlist_url)[1]
                # check if mailing list is public
                if len(_period_urls) > 0:
                    archive.append(mlist_url)
        else:
            # collect mailing-list contents
            for mlist_url in mlist_urls:
                mlist_name = W3CMailList.get_name_from_url(mlist_url)
                mlist = W3CMailList.from_url(
                    name=mlist_name,
                    url=mlist_url,
                    select=select,
                )
                if len(mlist) != 0:
                    if instant_save:
                        dir_out = CONFIG.mail_path + name
                        Path(dir_out).mkdir(parents=True, exist_ok=True)
                        # Fix: write into the directory created above;
                        # previously `dir_out` was created but to_mbox()
                        # wrote to the archive root instead.
                        mlist.to_mbox(dir_out=dir_out)
                        archive.append(mlist.name)
                    else:
                        logger.info(f"Recorded the list {mlist.name}.")
                        archive.append(mlist)
        return archive
コード例 #3
0
ファイル: abstract.py プロジェクト: nllz/bigbang
 def from_url(
     self,
     list_name: str,
     url: str,
     fields: str = "total",
 ) -> Message:
     """
     Load a single Email from its URL.

     Parameters
     ----------
     list_name : The name of the mailing list.
     url : URL of this Email
     fields : Indicates whether to return 'header', 'body' or 'total'/both or
         the Email. The latter is the default.
     """
     soup = get_website_content(url, session=self.session)
     if soup == "RequestException":
         # Download failed: propagate the sentinel string as the body
         # and fall back to an empty header.
         return self.create_email_message(
             url, "RequestException", **self.empty_header
         )
     header = (
         self._get_header_from_html(soup)
         if fields in ["header", "total"]
         else self.empty_header
     )
     body = (
         self._get_body_from_html(list_name, url, soup)
         if fields in ["body", "total"]
         else None
     )
     return self.create_email_message(url, body, **header)
コード例 #4
0
ファイル: listserv.py プロジェクト: nllz/bigbang
    def get_sections(url_root: str, url_home: str) -> dict:
        """
        Get different sections of mail list domain.
        On the Listserv 16.5 website they look like:
        [3GPP] [3GPP–AT1] [AT2–CONS] [CONS–EHEA] [EHEA–ERM_] ...
        On the Listserv 17 website they look like:
        [<<][<]1-50(798)[>][>>]

        Parameters
        ----------
        url_root : Root URL of the archive; section hrefs are joined onto it.
        url_home : URL of the home page that lists the sections.

        Returns
        -------
        If sections exist, it returns a dict mapping each section URL to its
        name (plus a "FIRST" entry for page 1). Otherwise it returns
        {url_home: "Home"}.
        """
        soup = get_website_content(url_home)
        # Section links carry both an "INDEX=" and a page-number "p=" query.
        sections = soup.select(
            'a[href*="INDEX="][href*="p="]',
        )
        archive_sections_dict = {}
        if sections:
            for sec in sections:
                key = urljoin(url_root, sec.get("href"))
                value = sec.text
                # "Next"/"Previous" are pager controls, not sections.
                if value in ["Next", "Previous"]:
                    continue
                archive_sections_dict[key] = value
            # `key` still holds the last section URL from the loop; rewrite
            # its page number to also record the first page of sections.
            archive_sections_dict[re.sub(r"p=[0-9]+", "p=1", key)] = "FIRST"
        else:
            archive_sections_dict[url_home] = "Home"
        return archive_sections_dict
コード例 #5
0
ファイル: listserv.py プロジェクト: nllz/bigbang
 def get_message_urls_from_mlist_url(name: str, url: str) -> List[str]:
     """Collect the URLs of all messages linked from a mailing-list page."""
     url_root = "/".join(url.split("/")[:-2])
     soup = get_website_content(url)
     # Message links carry an "A2=" query and the list's name.
     anchors = soup.select(f'a[href*="A2="][href*="{name}"]')
     if not anchors:
         return anchors
     return [urljoin(url_root, anchor.get("href")) for anchor in anchors]
コード例 #6
0
    def get_all_periods_and_their_urls(
        url: str,
    ) -> Tuple[List[str], List[str]]:
        """
        W3C groups messages into monthly time bundles. This method
        obtains all the URLs that lead to the messages of each time bundle.

        Parameters
        ----------
        url : URL of the mailing list's period-overview page.

        Returns
        -------
        Returns a tuple of two lists that look like:
        (['April 2017', 'January 2001', ...], ['url1', 'url2', ...])
        """
        # wait between loading messages, for politeness
        time.sleep(0.5)
        soup = get_website_content(url)
        # Fix: removed leftover debug print statements that polluted stdout.
        periods = []
        urls_of_periods = []
        rows = soup.select("tbody tr")
        for row in rows:
            # The first cell of each row links to one monthly bundle;
            # rows without a link are skipped.
            link = row.select("td:nth-of-type(1) a")
            if len(link) > 0:
                link = link[0]
            else:
                continue
            periods.append(link.text)
            urls_of_periods.append(url + "/" + link.get("href"))
        return periods, urls_of_periods
コード例 #7
0
 def get_message_urls_from_period_url(name: str, url: str) -> List[str]:
     """Collect message URLs from a single period (time-bundle) page."""
     soup = get_website_content(url)
     anchors = soup.select("div.messages-list a")
     if not anchors:
         return anchors
     # Resolve each relative href against the period page's URL.
     return [
         urljoin(url, anchor.get("href"))
         for anchor in anchors
         if anchor.get("href") is not None
     ]
コード例 #8
0
    def get_messages_urls(cls, name: str, url: str) -> List[str]:
        """
        Parameters
        ----------
        name : Name of the W3C mailing list.
        url : URL to group of messages that are within the same period.

        Returns
        -------
        List of URLs from which `mboxMessage` can be initialized.
        """
        soup = get_website_content(url)
        # A failed download yields a sentinel string instead of soup.
        if soup == "RequestException":
            return []
        anchors = soup.select("div.messages-list a")
        if not anchors:
            return anchors
        return [
            urljoin(url, anchor.get("href"))
            for anchor in anchors
            if anchor.get("href") is not None
        ]
コード例 #9
0
ファイル: listserv.py プロジェクト: nllz/bigbang
    def get_all_periods_and_their_urls(
        url: str,
    ) -> Tuple[List[str], List[str]]:
        """
        LISTSERV groups messages into weekly time bundles. This method
        obtains all the URLs that lead to the messages of each time bundle.

        Returns
        -------
        Returns a tuple of two lists that look like:
        (['April 2017,  2', 'January 2001', ...], ['url1', 'url2', ...])
        """
        # wait between loading messages, for politeness
        time.sleep(0.5)

        url_root = "/".join(url.split("/")[:-2])
        soup = get_website_content(url)
        periods = []
        urls_of_periods = []
        # Each <li> holds one period link; collect label and target in
        # a single pass over the list items.
        for list_tag in soup.find_all("li"):
            anchor = list_tag.find("a")
            periods.append(anchor.text)
            urls_of_periods.append(urljoin(url_root, anchor.get("href")))
        return periods, urls_of_periods
コード例 #10
0
ファイル: listserv.py プロジェクト: nllz/bigbang
    def get_lists_from_url(
        url_root: str,
        url_home: str,
        select: dict,
        session: Optional[str] = None,
        instant_save: bool = True,
        only_mlist_urls: bool = True,
    ) -> List[Union[ListservMailList, str]]:
        """Docstring in `AbstractMailList`."""
        archive = []
        # run through archive sections
        for url in list(
            ListservMailListDomain.get_sections(url_root, url_home).keys()
        ):
            soup = get_website_content(url)
            # Mailing-list links on a section page start with "?A0=".
            a_tags_in_section = soup.select(
                f'a[href^="{urlparse(url).path}?A0="]',
            )

            mlist_urls = [
                urljoin(url_root, a_tag.get("href"))
                for a_tag in a_tags_in_section
            ]
            mlist_urls = list(set(mlist_urls))  # remove duplicates

            if only_mlist_urls:
                # collect mailing-list urls
                for mlist_url in mlist_urls:
                    name = ListservMailList.get_name_from_url(mlist_url)
                    # check if mailing list contains messages in period
                    _period_urls = (
                        ListservMailList.get_all_periods_and_their_urls(
                            mlist_url
                        )[1]
                    )
                    # check if mailing list is public: keep the list as soon
                    # as any period contains at least one message.
                    # (Removed dead `loops` counter that was never read.)
                    for _period_url in _period_urls:
                        nr_msgs = len(
                            ListservMailList.get_messages_urls(
                                name=name, url=_period_url
                            )
                        )
                        if nr_msgs > 0:
                            archive.append(mlist_url)
                            break
            else:
                # collect mailing-list contents
                for mlist_url in mlist_urls:
                    name = ListservMailList.get_name_from_url(mlist_url)
                    mlist = ListservMailList.from_url(
                        name=name,
                        url=mlist_url,
                        select=select,
                        session=session,
                    )
                    if len(mlist) != 0:
                        if instant_save:
                            dir_out = CONFIG.mail_path + name
                            Path(dir_out).mkdir(parents=True, exist_ok=True)
                            # Fix: write into the directory created above;
                            # previously `dir_out` was created but to_mbox()
                            # wrote to the archive root instead.
                            mlist.to_mbox(dir_out=dir_out)
                            archive.append(mlist.name)
                        else:
                            logger.info(f"Recorded the list {mlist.name}.")
                            archive.append(mlist)
        return archive