Beispiel #1
0
class BChrome():
    """
    Base class for the sources that need a real browser (Chrome via Selenium).

    The constructor starts a headless Chrome driver and stores it on
    ``self.driver``.  Selenium failures inside the helper methods are logged
    instead of raised, so callers have to cope with empty results.
    """
    def __init__(self) -> None:
        self.logger = Logger(__class__)
        # Only read by driverGetContent() for its error message; nothing in
        # this class assigns it after construction.
        self.uri: str = ""
        # None when Chrome could not be started (see driverStart).
        self.driver = self.driverStart()

    def driverStart(self) -> Chrome:
        """
        Start a headless Chrome instance.

        return: the Chrome driver, or None when it failed to start (the error
        is logged as critical).
        """
        options = ChromeOptions()
        for flag in (
            "--disable-extensions",
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ):
            options.add_argument(flag)

        try:
            return Chrome(options=options)
        except Exception as e:
            self.logger.critical(f"Chrome Driver failed to start! Error: {e}")
            return None

    def driverGetContent(self) -> str:
        """
        Return the source of the page that is currently loaded.

        return: the page source, or an empty string when it could not be read
        (for example because the driver never started).
        """
        try:
            return self.driver.page_source
        except Exception as e:
            self.logger.critical(f"Failed to collect data from {self.uri}. {e}")
            # Honour the declared return type instead of leaking None.
            return ""

    def driverGoTo(self, uri: str) -> None:
        """
        Navigate the driver to ``uri``.  Failures are logged as errors.
        """
        try:
            self.driver.get(uri)
            self.driver.implicitly_wait(10)
        except Exception as e:
            self.logger.error(f"Driver failed to get {uri}. Error: {e}")

    def driverClose(self) -> None:
        """
        Shut the browser down.  Failures are logged as errors.
        """
        try:
            # quit() ends the whole WebDriver session; close() would only
            # close the current window and leave the session running.
            self.driver.quit()
        except Exception as e:
            self.logger.error(f"Driver failed to close. Error: {e}")
Beispiel #2
0
class RequestContent:
    """
    Common helper that downloads a page and exposes lookups on its markup.
    It makes use of the Requests and BeautifulSoup libraries.

    Examples:
    RequestContent(url='www').
    RequestContent().setUrl("www").

    Call getPageDetails() (or setSoup()) before any of the find* methods;
    they all read the parsed document stored on the instance.
    """

    def __init__(self, url: str = "") -> None:
        self.url = url
        self.logger = Logger(__class__)

    def setUrl(self, url: str) -> None:
        """
        If you want to parse a URL, set the value here.
        """
        self.url = url

    def setSoup(self, soup: BeautifulSoup) -> None:
        """
        If the source has already been parsed elsewhere, pass the BeautifulSoup object here.
        """
        # Kept for callers that read the public attribute.
        self.soup = soup
        # This is the attribute getPageDetails() and the find* methods use.
        self.__soup__ = soup

    def __getHeaders__(self) -> dict:
        """Return the HTTP headers sent with every request."""
        return {"User-Agent": "NewsBot - Automated News Delivery"}

    def __getSource__(self) -> str:
        """
        Download self.url and return the response body.

        return: the body text, or an empty string when the request failed or
        the response was not OK (both cases are logged).
        """
        try:
            res: Response = get(self.url, headers=self.__getHeaders__())
            if res.ok:
                self.__response__: Response = res
                return res.text

            self.logger.error(
                f"Attempted to get data from '{self.url}' but did not get any data.  StatusCode={res.status_code}"
            )
            return ""
        except Exception as e:
            self.logger.critical(
                f"Failed to get data from '{self.url}' but resulted in an error. {e} "
            )
            return ""

    def __getSoup__(self) -> BeautifulSoup:
        """
        Parse the downloaded source.  When nothing was downloaded yet an empty
        document is parsed instead.
        """
        try:
            source = getattr(self, "__source__", "")
            return BeautifulSoup(source, features="html.parser")
        except Exception as e:
            self.logger.error(e)
            return BeautifulSoup()

    def getPageDetails(self) -> None:
        """
        This pulls the source code and converts it into a BeautifulSoup object.

        A document that is already stored and has text (for example one given
        to setSoup) is kept as is.
        NOTE(review): this also means a second call after setUrl() downloads
        the new page but keeps the previously parsed document - confirm that
        is intended.
        """
        if self.url == "":
            self.logger.error(
                "Was requested to pull data from a site, but no URL was passed."
            )
        else:
            self.__source__ = self.__getSource__()

        existing = getattr(self, "__soup__", None)
        if existing is None or getattr(existing, "text", "") == "":
            self.__soup__ = self.__getSoup__()

    def findSingle(
        self, name: str = "", attrKey: str = "", attrValue: str = ""
    ) -> BeautifulSoup:
        """
        Return the first element matching the tag name and, when attrKey is
        given, the attribute value.  The result is whatever the document's
        find() returns.
        """
        if attrKey != "":
            return self.__soup__.find(name=name, attrs={attrKey: attrValue})
        return self.__soup__.find(name=name)

    def findMany(
        self, name: str = "", attrKey: str = "", attrValue: str = ""
    ) -> List[BeautifulSoup]:
        """
        Return every element matching the tag name and, when attrKey is given,
        the attribute value.
        """
        if attrKey != "":
            return self.__soup__.find_all(name=name, attrs={attrKey: attrValue})
        return self.__soup__.find_all(name=name)

    def findFeedLink(self) -> dict:
        """
        Look for a feed advertised through a <link type=...> element.

        Atom is preferred over RSS, RSS over JSON.  Links without an href are
        skipped.

        return: {"type": "atom"|"rss"|"json"|"none", "content": href or None}
        """
        candidates = (
            ("atom", "application/atom+xml"),
            ("rss", "application/rss+xml"),
            ("json", "application/json"),
        )
        for feedType, mimeType in candidates:
            link = self.findSingle(name="link", attrKey="type", attrValue=mimeType)
            if link is None:
                continue
            href = link.attrs.get("href")
            if href:
                return self.__buildFeedDict__(feedType, href)

        return self.__buildFeedDict__("none", None)

    def __buildFeedDict__(self, type: str, content: str) -> dict:
        """Build the dictionary returned by findFeedLink()."""
        return {"type": type, "content": content}

    def findSiteIcon(self, siteUrl: str) -> str:
        """
        This will go and attempt to extract the 'apple-touch-icon' from the header.
        The icon with the largest declared size wins; icons without a usable
        'sizes' attribute count as size 0.

        return: str - the icon URL, or "" when the page declares none.
        """
        # if a site url ends with / lets remove it
        if siteUrl.endswith("/"):
            siteUrl = siteUrl.rstrip("/")

        best = None
        bestSize: int = -1
        icons = self.findMany(name="link", attrKey="rel", attrValue="apple-touch-icon")
        # look though all the icons given, find the first one with the largest size.
        for icon in icons:
            try:
                size = int(str(icon.attrs.get("sizes", "")).lower().split("x")[0])
            except ValueError:
                # missing or non numeric value such as "any"
                size = 0
            if size > bestSize:
                best = icon
                bestSize = size

        if best is None:
            return ""

        href = best.attrs.get("href", "")
        if href == "":
            return ""
        if href.startswith(("http://", "https://")):
            return href
        if not href.startswith("/"):
            href = f"/{href}"
        return f"{siteUrl}{href}"

    def findArticleThumbnail(self) -> str:
        """
        This is best used on articles, not on root the main site page.
        It will go and check the page for defined thumbnails and return the first one it finds, if any.

        return: str
        """
        meta = (
            {"name": "meta", "attrKey": "property", "attrValue": "og:image"},
            {"name": "meta", "attrKey": "name", "attrValue": "twitter:image:src"},
        )

        for i in meta:
            try:
                item = self.findSingle(
                    name=i["name"], attrKey=i["attrKey"], attrValue=i["attrValue"]
                )
                thumb = item.attrs["content"]
            except (AttributeError, KeyError):
                # tag not present, or present without a content attribute
                continue
            if thumb != "":
                return thumb
        return ""

    def findArticleDescription(self) -> str:
        """
        Return the text of the first known article body container that is
        present and not empty.

        return: str - "" when none of the known containers was found.
        """
        lookups = (
            {"name": "div", "key": "class", "value": "entry-content e-content"},
            {"name": "div", "key": "class", "value": "engadget-post-contents"},
            {"name": "div", "key": "class", "value": "article-content post-page"},
        )

        for l in lookups:
            content = self.findSingle(
                name=l["name"], attrKey=l["key"], attrValue=l["value"]
            )
            if content is not None and content.text != "":
                return content.text
        return ""
Beispiel #3
0
class Discord(IOutputs):
    """
    Output that posts the articles waiting in the DiscordQueue table to the
    Discord webhooks registered for their site.

    enableThread() is the endless worker loop.  The remaining methods build
    the embed for one article and send it.
    """

    # Sites whose icons are looked up as "Default <full site name>".
    _DEFAULT_ICON_SITES = (
        "Final Fantasy XIV",
        "Phantasy Star Online 2",
        "Pokemon Go Hub",
    )

    def __init__(self) -> None:
        self.logger = Logger(__class__)
        self.table = DiscordQueue()
        # Holds the message built by buildMessage() until sendMessage() posts it.
        self.tempMessage: DiscordWebhook = DiscordWebhook("placeholder")

    def enableThread(self) -> None:
        """
        Poll the queue forever and send every queued article.

        An article is removed from the queue only when every webhook answered
        with status 204.  A failure on one article is logged and does not stop
        the loop.
        """
        while True:
            # Tell the database to give us the queue on the table.
            try:
                queue = self.table.getQueue()
            except Exception as e:
                self.logger.error(f"Failed to read the Discord queue. error {e}")
                queue = []

            for i in queue:
                try:
                    resp = self.sendMessage(i)

                    # Only remove the object from the queue if we sent it out correctly.
                    failed = [r for r in resp if r.status_code != 204]
                    if not failed:
                        i.remove()
                    for r in failed:
                        self.logger.error(
                            f"Failed to post a message. {i.title}. Status_code: {r.status_code}. msg: {r.text}."
                        )
                except Exception as e:
                    self.logger.error(
                        f"Failed to post a message. {getattr(i, 'title', '')}. error {e}"
                    )

                sleep(env.discord_delay_seconds)

            sleep(env.discord_delay_seconds)

    def buildMessage(self, article: DiscordQueue) -> None:
        """
        Build the webhook message (one embed) for the article and store it in
        self.tempMessage.
        """
        # reset the stored message
        self.tempMessage = DiscordWebhook("placeholder")

        # Extract the webhooks that relate to this site
        webhooks: List[str] = self.getHooks(article.siteName)

        # Make a new webhook with the hooks that relate to this site
        hook: DiscordWebhook = DiscordWebhook(webhooks)

        title = article.title
        if len(title) >= 128:
            title = f"{title[0:128]}..."

        # Make a new Embed object
        embed: DiscordEmbed = DiscordEmbed(title=title)

        try:
            authorIcon = self.getAuthorIcon(article.authorImage,
                                            article.siteName)
            embed.set_author(name=article.authorName,
                             url=None,
                             icon_url=authorIcon)
        except Exception:
            # The author line is optional; the message is sent without it
            # when no icon could be resolved.
            pass

        # Discord Embed Description can only contain 2048 characters
        ch = ConvertHtml()
        if article.description != "":
            description: str = str(article.description)
            description = self.convertFromHtml(description)
            description = ch.replaceImages(description, '')
            if len(description) >= 2048:
                description = f"{description[0:2040]}..."
            embed.description = description

        # Figure out if we have video based content
        if article.video != "":
            embed.description = "View the video online!"
            embed.set_video(url=article.video,
                            height=article.videoHeight,
                            width=article.videoWidth)

        try:
            if article.thumbnail != "":
                # When several values are stored separated by spaces only the
                # first one is used.
                embed.set_image(url=article.thumbnail.split(" ")[0])
        except Exception as e:
            self.logger.warning(
                f"Failed to attach a thumbnail. \r\n {e}\r\n thumbnails: {article.thumbnail}"
            )

        # add the link to the embed
        embed.add_embed_field(name="Link:", value=article.link)

        # Build our footer message
        footer = self.buildFooter(article.siteName)
        footerIcon = self.getFooterIcon(article.siteName)
        embed.set_footer(icon_url=footerIcon, text=footer)

        embed.set_color(color=self.getEmbedColor(article.siteName))

        hook.add_embed(embed)
        self.tempMessage = hook

    def sendMessage(self, article: DiscordQueue) -> List[Response]:
        """
        Build and post the message for the article.

        return: one response per webhook, always as a list.
        raises: whatever the webhook call raised; the error is logged as
        critical first so the caller can leave the article in the queue.
        """
        if article.title != "":
            self.logger.info(f"Discord - Sending article '{article.title}'")
        else:
            self.logger.info(
                f"Discord - Sending article '{article.description}'")
        self.buildMessage(article)
        try:
            res = self.tempMessage.execute()
        except Exception as e:
            self.logger.critical(
                f"Failed to send to Discord.  Check to ensure the webhook is correct. Error: {e}"
            )
            raise

        # Checking to see if we returned a single response or multiple.
        if isinstance(res, list):
            return res
        return [res]

    def getHooks(self, newsSource: str) -> List[str]:
        """
        Return the webhook urls registered for the site name.

        return: list of urls; empty when the lookup failed (logged as critical).
        """
        try:
            dwh = DiscordWebHooks(name=newsSource).findAllByName()
            return [i.key for i in dwh]
        except Exception as e:
            self.logger.critical(
                f"Unable to find DiscordWebhook for {newsSource}. Error: {e}")
            return []

    def convertFromHtml(self, msg: str) -> str:
        """
        Replace the HTML tags and entities listed below with Discord markdown
        and then convert the links.
        """
        # Applied in this order; the three character sequence has to be handled
        # before the plain "\xa0" replacement.
        replacements = (
            ("<h2>", "**"),
            ("</h2>", "**"),
            ("<h3>", "**"),
            ("</h3>", "**\r\n"),
            ("<strong>", "**"),
            ("</strong>", "**\r\n"),
            ("<ul>", "\r\n"),
            ("</ul>", ""),
            ("</li>", "\r\n"),
            ("<li>", "> "),
            ("&#8220;", '"'),
            ("&#8221;", '"'),
            ("&#8230;", "..."),
            ("<b>", "**"),
            ("</b>", "**"),
            ("<br>", "\r\n"),
            ("<br/>", "\r\n"),
            ("\xe2\x96\xa0", "*"),
            ("\xa0", "\r\n"),
            ("<p>", ""),
            ("</p>", "\r\n"),
        )
        for old, new in replacements:
            msg = msg.replace(old, new)

        return self.replaceLinks(msg)

    def replaceLinks(self, msg: str) -> str:
        """
        Find the HTML links and replace them with something discord supports.
        """
        # NOTE(review): this turns every apostrophe of the message into a
        # double quote, not only the ones around attributes.  Left unchanged
        # because later processing may depend on it - confirm.
        msg = msg.replace("'", '"')
        links = re.findall("<a(.*?)a>", msg)
        for l in links:
            hrefs = re.findall('href="(.*?)"', l)
            texts = re.findall(">(.*?)</", l)
            if hrefs and texts:
                discordLink = f"[{texts[0]}]({hrefs[0]})"
                msg = msg.replace(f"<a{l}a>", discordLink)
        return msg

    def replaceImages(self, msg: str) -> str:
        """
        Remove every <img ...> tag from the message.
        """
        for i in re.findall("<img (.*?)>", msg):
            # Removing the images for now.
            msg = msg.replace(f"<img {i}>", "")
        return msg

    def getAuthorIcon(self, authorIcon: str, siteName: str) -> str:
        """
        Return authorIcon when it is set, otherwise the file name of the icon
        stored for the site.

        raises: IndexError when no icon is stored for the site (buildMessage
        treats that as "no author line").
        """
        if authorIcon != "":
            return authorIcon

        if siteName in self._DEFAULT_ICON_SITES:
            lookup = f"Default {siteName}"
        else:
            s: List[str] = siteName.split(" ")
            # RSS sites store their icon under the full site name.
            lookup = siteName if s[0] == "RSS" else f"Default {s[0]}"

        res = Icons(site=lookup).findAllByName()
        return res[0].filename

    def buildFooter(self, siteName: str) -> str:
        """
        Build the footer text of the embed.

        Site names are expected to look like "<source> <detail ...>".  When the
        detail part is missing the plain footer is returned.
        """
        end: str = "Brought to you by NewsBot"
        s = siteName.split(" ")
        lowered = siteName.lower()

        if "reddit" in lowered:
            return end
        if "youtube" in lowered:
            return f"{s[1]} - {end}" if len(s) > 1 else end
        if "Instagram" in siteName or "Twitter" in siteName:
            if len(s) > 2 and s[1] == "tag":
                return f"#{s[2]} - {end}"
            if len(s) > 2 and s[1] == "user":
                return f"{s[2]} - {end}"
            return end
        if "RSS" in siteName:
            return f"{s[1]} - {end}" if len(s) > 1 else end
        return end

    def getFooterIcon(self, siteName: str) -> str:
        """
        Return the file name of the icon shown in the footer, or "" when no
        icon is stored for the site.
        """
        if siteName in self._DEFAULT_ICON_SITES:
            res = Icons(site=f"Default {siteName}").findAllByName()
            return res[0].filename if len(res) >= 1 else ""

        s: List[str] = siteName.split(" ")
        values = []
        if len(s) > 1:
            values.append(f"Default {s[1]}")
        values.append(f"Default {s[0]}")
        values.append(siteName)

        # A later entry overrides an earlier one, so the exact site name has
        # the highest priority.  Only unambiguous (single row) results count.
        res = None
        for v in values:
            r = Icons(site=v).findAllByName()
            if len(r) == 1:
                res = r

        if res is None:
            return ""
        return res[0].filename

    def getEmbedColor(self, siteName: str) -> int:
        """
        Return the decimal color of the embed for the site, 0 when the site is
        not known.  The first matching entry wins; matching ignores case.
        """
        # Decimal values can be collected from https://www.spycolor.com
        colors = (
            ("reddit", 16395272),
            ("youtube", 16449542),
            ("instagram", 13303930),
            ("twitter", 1937134),
            ("final fantasy xiv", 11809847),
            ("pokemon go hub", 2081673),
            ("phantasy star online 2", 5557497),
            ("twitch", 9718783),
        )
        lowered = siteName.lower()
        for needle, color in colors:
            if needle in lowered:
                return color
        return 0
Beispiel #4
0
class BSources():
    """
    This class contains some common code found in the sources.  Do not use this on its own!
    """
    def __init__(self) -> None:
        self.uri: str = ""
        self.logger = Logger(__class__)

        # Filled by checkEnv(): webhooks registered for the site.
        self.outputDiscord: bool = False
        # This is an annotation.  The former "= List[...] = list()" was a
        # chained assignment that tried to assign into List and always raised.
        self.hooks: List[DiscordWebHooks] = list()

        # Filled by checkEnv(): source rows registered for the site.
        self.sourceEnabled: bool = False
        self.links: List[Sources] = list()

    def checkEnv(self, siteName: str) -> None:
        """
        Load the webhooks and the source links stored for the site name and
        set the matching enabled flags.
        """
        # Check if site was requested.
        self.outputDiscord = self.isDiscordEnabled(siteName)
        if self.outputDiscord:
            self.hooks = self.getDiscordList(siteName)

        self.sourceEnabled = self.isSourceEnabled(siteName)
        if self.sourceEnabled:
            self.links = self.getSourceList(siteName)

    def getSourceList(self, siteName: str) -> List[Sources]:
        """Return every Sources row found for the site name."""
        return list(Sources(name=siteName).findAllByName())

    def isSourceEnabled(self, siteName: str) -> bool:
        """Return True when at least one Sources row exists for the site name."""
        return len(Sources(name=siteName).findAllByName()) >= 1

    def getDiscordList(self, siteName: str) -> List[DiscordWebHooks]:
        """Return every DiscordWebHooks row found for the site name."""
        return list(DiscordWebHooks(name=siteName).findAllByName())

    def isDiscordEnabled(self, siteName: str) -> bool:
        """Return True when at least one DiscordWebHooks row exists for the site name."""
        return len(DiscordWebHooks(name=siteName).findAllByName()) >= 1

    def getContent(self) -> Response:
        """
        Request self.uri with the default headers.

        return: the response, or None when the request raised (logged as critical).
        """
        try:
            return get(self.uri, headers=self.getHeaders())
        except Exception as e:
            self.logger.critical(f"Failed to collect data from {self.uri}. {e}")
            return None

    def getParser(self, requestsContent: Response = "", seleniumContent: str = "") -> BeautifulSoup:
        """
        Parse page content into a BeautifulSoup object.

        seleniumContent (page source as text) is used when it is not empty,
        otherwise the body of requestsContent is parsed.

        return: the parsed document, or None when parsing failed or neither
        argument was given (logged as critical).
        """
        try:
            if seleniumContent != "":
                return BeautifulSoup(seleniumContent, features="html.parser")
            return BeautifulSoup(requestsContent.content, features="html.parser")
        except Exception as e:
            self.logger.critical(f"failed to parse data returned from requests. {e}")
            return None

    def getHeaders(self) -> dict:
        """Return the HTTP headers sent with every request."""
        return {"User-Agent": "NewsBot - Automated News Delivery"}
Beispiel #5
0
class RssParser:
    """
    Reads an RSS feed and converts its items into Articles.

    The feed is downloaded and parsed in the constructor.  getPosts() returns
    the raw items and processItem() converts one of them.
    """

    def __init__(self, url: str, siteName: str) -> None:
        self.logger = Logger(__class__)
        self.url: str = url
        self.siteName: str = siteName
        self.content: RequestSiteContent = RequestContent(url=url)
        self.content.getPageDetails()

    def getPosts(self) -> List:
        """Return every <item> element of the feed."""
        return self.content.findMany(name="item")

    def processItem(self, item: BeautifulSoup, title: str) -> Articles:
        """
        Convert one feed item into an Articles object.

        item: the <item> element.
        title: stored as the site name of the article.

        return: the new article, or a blank Articles() when the item has no
        usable link or the link is already known.
        NOTE(review): the title and pubdate lookups are not guarded; an item
        lacking one of them presumably raises AttributeError here - confirm.
        """
        # get the link for the article
        url = self.findItemLink(item)
        if not url or url == "https://":
            # did not find a valid url, pass back a blank object
            return Articles()

        # Check if we have already looked at this link
        if Articles(url=url).exists() == False:
            # Request the article page to look for a thumbnail.
            ra = RequestArticleContent(url=url)
            ra.getPageDetails()
            thumb = ra.findArticleThumbnail()

            a = Articles(
                siteName=title,
                title=item.find(name="title").text,
                description=self.findItemDescription(item, ""),
                tags=self.findItemTags(item),
                url=url,
                pubDate=item.find(name="pubdate").text,
                authorName=self.findItemAuthor(item),
            )
            a.thumbnail = thumb
            return a

        return Articles()

    def findItemDescription(self, item: BeautifulSoup, desc: str) -> str:
        """
        Return desc when it is set, otherwise the text of the first of
        "description" / "content:encoded" that is present and not empty.

        return: str - "" when the item has no body (logged as critical).
        """
        if desc != "":
            return desc

        for tagName in ("description", "content:encoded"):
            try:
                text = item.find(name=tagName).text
            except AttributeError:
                # tag not present in this item
                continue
            if text != "":
                return text

        self.logger.critical(
            f"Failed to locate RSS body.  Review {self.url} for the reason"
        )
        return ""

    def findItemLink(self, item: BeautifulSoup) -> str:
        """
        Return the link of the item with surrounding whitespace removed.

        The value is read from the node following the <link> tag.

        return: str - "" when the item has no <link> tag or nothing follows it.
        """
        link = item.find(name="link")
        if link is None or link.next is None:
            return ""

        url: str = link.next
        for ch in ("\n", "\t", "\r"):
            url = url.replace(ch, "")
        return url.strip()

    def findItemTags(self, item: BeautifulSoup) -> str:
        """
        Return the text of every <category> of the item as the string form of
        a list, for example "['a', 'b']".
        """
        tags: List[str] = [category.text for category in item.find_all(name="category")]
        return str(tags)

    def findItemAuthor(self, item: BeautifulSoup) -> str:
        """
        Return the text of the first of "author" / "dc:creator" that is
        present and not empty, otherwise "".
        """
        for tagName in ("author", "dc:creator"):
            try:
                itemAuthor = item.find(name=tagName).text
            except AttributeError:
                # tag not present in this item
                continue
            if itemAuthor != "":
                return itemAuthor

        return ""