Example #1
    def parseItem(self, item: Dict) -> Articles:
        # Skip items we have already stored.
        a = Articles(url=item["url"])
        if not a.exists():
            # Pull the page itself so we can extract a thumbnail.
            rc = RequestContent(url=item["url"])
            rc.getPageDetails()
            a = Articles(
                siteName=self.siteName,
                tags=f"RSS, {self.siteName}",
                title=item["title"],
                url=item["url"],
                pubDate=item["date_published"],
                thumbnail=rc.findArticleThumbnail(),
                authorName=item["author"]["name"],
                description=item["content_html"],
            )
        return a
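
The keys read here (url, title, date_published, content_html, and author.name) match the JSON Feed item schema, so a caller would typically loop parseItem over a fetched feed. A minimal sketch of such a driver; collect, self.feedUrl, and the truthy-title check are illustrative assumptions, not part of the original class.

    import requests

    def collect(self) -> List[Articles]:
        # Hypothetical driver: fetch a JSON Feed and parse each item.
        feed = requests.get(self.feedUrl, timeout=10).json()
        found: List[Articles] = []
        for item in feed.get("items", []):
            a = self.parseItem(item)
            # parseItem returns a bare Articles(url=...) for already-seen
            # links, so (assuming title defaults to "") keep populated ones.
            if a.title:
                found.append(a)
        return found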
Example #2
    def getArticles(self) -> List[Articles]:
        self.driver = self.driverStart()
        allArticles: List[Articles] = list()

        for site in self.links:
            self.currentLink = site

            # Link names look like "Instagram user foo" or "Instagram tag bar".
            nameSplit = site.name.split(" ")
            igType = nameSplit[1]
            self.siteName = f"Instagram {nameSplit[2]}"
            self.logger.debug(f"Instagram - {nameSplit[2]} - Checking for updates.")

            # Land on a public directory page first so the session is primed.
            self.uri = "https://www.instagram.com/directory/profiles/0-0/"
            self.driverGoTo(self.uri)

            # Figure out if we are looking for a user or a tag.
            links: List[str] = []
            if igType == "user":
                # Wait until the navigation bar has rendered before scraping.
                WebDriverWait(self.driver, timeout=5).until(
                    lambda d: d.find_element_by_xpath(
                        "/html/body/div[1]/section/nav/div[2]/div/div/div[2]/div/div/span[2]"
                    )
                )
                self.driver.save_screenshot("ig_hashtag.png")
                links = self.getUserArticleLinks()
            elif igType == "tag":
                self.uri = f"{self.baseUri}explore/tags/{nameSplit[2]}/"
                self.driverGoTo(self.uri)
                links = self.getTagArticleLinks()

            try:
                for l in links:
                    # Check if we have already seen the url.
                    a = Articles(url=l)
                    if not a.exists():
                        # Get the content.
                        allArticles.append(self.getPostInfo(l))
            except Exception as e:
                self.logger.error(
                    f"Failed to parse articles from {self.siteName}. Chances are we have a malformed response. {e}"
                )

            self.logger.debug(f"{self.siteName} - Finished checking.")

        self.driverClose()
        self.siteName = "Instagram"

        return allArticles
Example #3
    def getArticles(self) -> List[Articles]:
        # TODO: Flag NSFW posts.

        self.driver = self.driverStart()

        allArticles: List[Articles] = list()
        for source in self.links:
            authorImage = ""
            authorName = ""
            subreddit = source.name.replace("Reddit ", "")

            self.logger.debug(f"Collecting posts for '/r/{subreddit}'...")

            # Add the info we get via Selenium to the Cache to avoid pulling it each time.
            authorImage = Cache(key=f"reddit {subreddit} authorImage").find()
            authorName = Cache(key=f"reddit {subreddit} authorName").find()
            if authorImage == "":
                # Collect the values that we do not get from the RSS feed.
                self.uri = f"https://reddit.com/r/{subreddit}"
                self.driverGoTo(self.uri)
                soup = self.getParser(seleniumContent=self.driverGetContent())

                subImages = soup.find_all(
                    name="img", attrs={"class": "Mh_Wl6YioFfBc9O1SQ4Jp"})
                if len(subImages) != 0:
                    authorImage = subImages[0].attrs["src"]

                if authorImage == "":
                    # Failed to find a custom icon; the sub might not have custom CSS.
                    # SVG icons are not handled yet, so fall back to the default Reddit icon.
                    subImages = soup.find_all(
                        name="svg", attrs={"class": "ixfotyd9YXZz0LNAtJ25N"})
                    if len(subImages) == 1:
                        authorImage = "https://www.redditstatic.com/desktop2x/img/favicon/android-icon-192x192.png"

                subName = soup.find_all(
                    name="h1", attrs={"class": "_2yYPPW47QxD4lFQTKpfpLQ"})
                authorName = f"/r/{subreddit} - {subName[0].text}"
                Cache(key=f"reddit {subreddit} authorImage",
                      value=authorImage).add()
                Cache(key=f"reddit {subreddit} authorName",
                      value=authorName).add()

            # Now check the RSS feed, skipping posts we have already stored.
            posts = self.getPosts(subreddit)
            for p in posts:
                if not Articles(
                        url=f"https://reddit.com{p['data']['permalink']}").exists():
                    allArticles.append(
                        self.getPostDetails(p["data"], subreddit, authorName,
                                            authorImage))

            # Be polite between subreddits to avoid rate limiting.
            sleep(5.0)

        self.driverClose()
        return allArticles
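
The find-then-add dance above (look a key up in the Cache, scrape and store on a miss) recurs in the Twitch and YouTube collectors too, so it is a natural candidate for a helper. A minimal sketch, built only on the Cache(key=...).find() and Cache(key=..., value=...).add() calls shown in these examples; cachedValue and the fetch callback are hypothetical names.

    from typing import Callable

    def cachedValue(key: str, fetch: Callable[[], str]) -> str:
        # Return the cached value for key; on a miss (find() returns ""),
        # call fetch() and store the result before returning it.
        value = Cache(key=key).find()
        if value == "":
            value = fetch()
            Cache(key=key, value=value).add()
        return value

The subreddit lookups above would then collapse to calls like cachedValue(f"reddit {subreddit} authorName", lambda: self.scrapeAuthorName(subreddit)), with scrapeAuthorName standing in for the Selenium block.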
Example #4
    def processItem(self, item: BeautifulSoup, title: str) -> Articles:
        # Get the link for the article.
        url = self.findItemLink(item)
        if url is None or url in ("", "https://"):
            # Did not find a valid url; pass back a blank object.
            return Articles()

        # Check if we have already looked at this link.
        if Articles(url=url).exists():
            return Articles()

        # Pull the page once and keep the results to avoid extra calls.
        ra = RequestArticleContent(url=url)
        ra.getPageDetails()
        thumb = ra.findArticleThumbnail()

        description = ""
        # description = ra.findArticleDescription()

        a = Articles(
            siteName=title,
            title=item.find(name="title").text,
            description=self.findItemDescription(item, description),
            tags=self.findItemTags(item),
            url=url,
            pubDate=item.find(name="pubdate").text,
            authorName=self.findItemAuthor(item),
        )
        a.thumbnail = thumb
        return a
Example #5
    def getArticles(self) -> List[Articles]:
        allArticles: List[Articles] = list()
        for site in self.links:
            self.logger.debug(f"{site.name} - Checking for updates.")
            self.uri = site.url

            siteContent: Response = self.getContent()
            if siteContent.status_code != 200:
                self.logger.error(
                    f"Unexpected status code from {self.siteName}: expected 200 but got {siteContent.status_code}."
                )
                continue
            page: BeautifulSoup = self.getParser(requestsContent=siteContent)

            try:
                for news in page.find_all("li", {"class": "news-item all sr"}):
                    a = Articles(siteName=self.siteName, authorName=self.authorName)
                    # The thumbnail is embedded in an inline style attribute.
                    a.thumbnail = re.findall(
                        "url[(](.*?)[)]", news.contents[1].attrs["style"]
                    )[0]

                    nc = news.contents[3].contents
                    a.title = nc[1].text
                    a.description = nc[3].text

                    bottom = nc[5].contents
                    a.tags = bottom[1].text
                    a.pubDate = bottom[5].text

                    # The onclick handler carries the id for the details page.
                    link = re.findall(
                        r"ShowDetails\('(.*?)'", bottom[7].attrs["onclick"],
                    )[0]
                    # The category tells us the news type and forms part of the link.
                    cat = bottom[1].text.lower().replace(" ", "-")

                    a.url = f"{self.uri}/{cat}/{link}"

                    allArticles.append(a)
            except UnableToFindContent as e:
                self.logger.error(f"PSO2 - Unable to find articles. {e}")

        self.logger.debug(f"{self.siteName} - Finished collecting articles.")
        return allArticles
Example #6
    def processItem(self, item: object) -> Articles:
        a = Articles(
            siteName=self.siteName,
            authorName=self.authorName,
            tags="pokemon go hub, pokemon, go, hub, news",
        )

        # Map each child element of the RSS item onto the Articles fields.
        for i in item.contents:
            if i.name == "title":
                a.title = i.next
            elif i.name == "link":
                a.url = self.removeHTMLTags(i.next)
            elif i.name == "pubdate":
                a.pubDate = i.next
            elif i.name == "category":
                a.tags = i.next
            elif i.name == "description":
                a.description = self.removeHTMLTags(i.next)
            elif i.name == "content:encoded":
                a.content = i.next
        return a
Example #7
    def getArticles(self) -> List[Articles]:
        allArticles: List[Articles] = list()
        for site in self.links:
            self.logger.debug(f"{site.name} - Checking for updates.")
            self.uri = site.url

            page = self.getParser(requestsContent=self.getContent())

            if "Topics" in site.name:
                try:
                    for news in page.find_all(
                            "li",
                            {"class": "news__list--topics ic__topics--list"}):
                        a = Articles(
                            siteName=self.siteName,
                            tags="ffxiv, topics, news",
                            authorName=self.authorName,
                        )
                        header = news.contents[0].contents
                        body = news.contents[1].contents
                        a.title = header[0].text
                        a.url = f"{self.baseUri}{header[0].contents[0].attrs['href']}"
                        a.thumbnail = body[0].contents[0].attrs["src"]
                        a.description = body[0].contents[0].next_element.text
                        allArticles.append(a)
                except Exception as e:
                    self.logger.error(
                        f"Failed to collect Topics from FFXIV. {e}")

            if "Notices" in site.name:
                try:
                    for news in page.find_all(
                            "a", {"class": "news__list--link ic__info--list"}):
                        a = Articles(
                            siteName=self.siteName,
                            tags="ffxiv, notices, news",
                            authorName=self.authorName,
                        )
                        a.title = news.text
                        a.url = f"{self.baseUri}{news.attrs['href']}"
                        # Pull the linked page for the full description.
                        self.uri = a.url
                        details = self.getParser(
                            requestsContent=self.getContent())
                        for d in details.find_all(
                                "div", {"class": "news__detail__wrapper"}):
                            a.description = d.text
                        allArticles.append(a)
                except Exception as e:
                    self.logger.error(
                        f"Failed to collect Notices from FFXIV. {e}")

            if "Maintenance" in site.name:
                try:
                    for news in page.find_all(
                            "a",
                            {"class": "news__list--link ic__maintenance--list"}):
                        a = Articles(
                            siteName=self.siteName,
                            tags="ffxiv, maintenance, news",
                            authorName=self.authorName,
                        )
                        a.title = news.text
                        a.url = f"{self.baseUri}{news.attrs['href']}"
                        self.uri = a.url
                        details = self.getParser(
                            requestsContent=self.getContent())
                        for d in details.find_all(
                                "div", {"class": "news__detail__wrapper"}):
                            a.description = d.text

                        allArticles.append(a)
                except Exception as e:
                    self.logger.error(
                        f"Failed to collect Maintenance records from FFXIV. {e}"
                    )

            if "Updates" in site.name:
                try:
                    for news in page.find_all(
                            "a",
                            {"class": "news__list--link ic__update--list"}):
                        a = Articles(
                            siteName=self.siteName,
                            tags="ffxiv, updates, news",
                            authorName=self.authorName,
                        )
                        a.title = news.text
                        a.url = f"{self.baseUri}{news.attrs['href']}"
                        self.uri = a.url

                        details = self.getParser(
                            requestsContent=self.getContent())

                        for d in details.find_all(
                                "div", {"class": "news__detail__wrapper"}):
                            a.description = d.text
                        allArticles.append(a)
                except Exception as e:
                    self.logger.error(
                        f"Failed to collect Updates records from FFXIV. {e}"
                    )

            if "Status" in site.name:
                try:
                    for news in page.find_all(
                            "a",
                            {"class": "news__list--link ic__obstacle--list"}):
                        a = Articles(
                            siteName=self.siteName,
                            tags="ffxiv, news, status",
                            authorName=self.authorName,
                        )
                        a.title = news.text
                        a.url = f"{self.baseUri}{news.attrs['href']}"
                        self.uri = a.url

                        details = self.getParser(
                            requestsContent=self.getContent())

                        for d in details.find_all(
                                "div", {"class": "news__detail__wrapper"}):
                            a.description = d.text
                        allArticles.append(a)
                except Exception as e:
                    self.logger.error(
                        f"Failed to collect Status records from FFXIV. {e}"
                    )

        return allArticles
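
The Notices, Maintenance, Updates, and Status branches differ only in the CSS class and tag string, so the loop could be table-driven. A sketch under the same assumptions as the code above (getParser and getContent behave as shown); the category table and collectCategory are illustrative names, not part of the original.

    FFXIV_CATEGORIES = (
        ("Notices", "ic__info--list", "ffxiv, notices, news"),
        ("Maintenance", "ic__maintenance--list", "ffxiv, maintenance, news"),
        ("Updates", "ic__update--list", "ffxiv, updates, news"),
        ("Status", "ic__obstacle--list", "ffxiv, news, status"),
    )

    def collectCategory(self, page, cssClass: str, tags: str) -> List[Articles]:
        # One pass over a single news category; mirrors the branches above.
        found: List[Articles] = []
        for news in page.find_all("a", {"class": f"news__list--link {cssClass}"}):
            a = Articles(siteName=self.siteName, tags=tags,
                         authorName=self.authorName)
            a.title = news.text
            a.url = f"{self.baseUri}{news.attrs['href']}"
            self.uri = a.url
            details = self.getParser(requestsContent=self.getContent())
            for d in details.find_all("div", {"class": "news__detail__wrapper"}):
                a.description = d.text
            found.append(a)
        return found

Each "label in site.name" branch then reduces to one call per (label, cssClass, tags) row, with the Topics branch kept separate since its markup differs.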
Example #8
    def getArticles(self) -> List[Articles]:
        self.logger.debug("Checking Twitch for updates.")
        api = TwitchAPI()
        auth = api.auth()

        allPosts: List[Articles] = list()
        for i in self.links:
            s = i.name.split(" ")
            userName = s[2]
            self.logger.debug(f"Checking Twitch user {userName} for updates.")

            user_id = Cache(key=f"twitch {userName} user_id").find()
            if user_id == "":
                # Cache the values so we don't need to call the API for them again.
                user: TwitchUser = api.getUser(auth, userName)
                user_id = user.id
                display_name = user.display_name
                profile_image_url = user.profile_image_url
                Cache(key=f"twitch {userName} user_id", value=user_id).add()
                Cache(key=f"twitch {userName} display_name",
                      value=display_name).add()
                Cache(key=f"twitch {userName} profile_image_url",
                      value=profile_image_url).add()
            else:
                # We have cached this information already.
                display_name = Cache(
                    key=f"twitch {userName} display_name").find()
                profile_image_url = Cache(
                    key=f"twitch {userName} profile_image_url").find()

            enableClips = Cache(key="twitch clips enabled").find()
            if enableClips.lower() == "true":
                clips: List[TwitchClip] = api.getClips(auth, user_id=user_id)
                for v in clips:
                    try:
                        a = Articles(
                            siteName=f"Twitch user {display_name}",
                            authorName=display_name,
                            authorImage=profile_image_url,
                            tags=f"Twitch, clip, {display_name}",
                            title=v.title,
                            pubDate=v.created_at,
                            url=v.url,
                            thumbnail=v.thumbnail_url,
                            description="A new clip has been posted! You can watch it with the link below.",
                        )
                        allPosts.append(a)
                    except Exception as e:
                        self.logger.error(e)

            enableVoD = Cache(key="twitch vod enable").find()
            if enableVoD.lower() == "true":
                videos: List[TwitchVideo] = api.getVideos(auth,
                                                          user_id=user_id)
                for v in videos:
                    try:
                        a = Articles(
                            siteName=f"Twitch user {display_name}",
                            authorName=display_name,
                            authorImage=profile_image_url,
                            tags=f"Twitch, vod, {display_name}",
                            title=v.title,
                            description="A new video has been posted! You can watch it with the link below.",
                            pubDate=v.published_at,
                            url=v.url,
                        )
                        # Twitch returns a templated thumbnail url; fill in a size.
                        thumb: str = v.thumbnail_url
                        thumb = thumb.replace("%{width}", "600")
                        thumb = thumb.replace("%{height}", "400")
                        a.thumbnail = thumb
                        allPosts.append(a)
                    except Exception as e:
                        self.logger.error(e)

        return allPosts
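
Twitch's Helix API returns video thumbnails as a url template containing literal %{width} and %{height} placeholders, which is why the VoD branch substitutes a size before storing the thumbnail. A standalone illustration of that substitution; the helper name and the sample url are made up.

    def fillThumbnailTemplate(template: str, width: int = 600, height: int = 400) -> str:
        # Substitute Twitch's literal %{width} / %{height} placeholders.
        return template.replace("%{width}", str(width)).replace("%{height}", str(height))

    # fillThumbnailTemplate("https://example.invalid/thumb-%{width}x%{height}.jpg")
    # -> "https://example.invalid/thumb-600x400.jpg"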
Example #9
    def getPostDetails(self, obj: dict, subreddit: str, authorName: str,
                       authorImage: str) -> Articles:
        try:
            a = Articles()
            a.url = f"https://reddit.com{obj['permalink']}"
            a.siteName = f"Reddit {subreddit}"
            a.authorImage = authorImage
            a.authorName = authorName
            a.title = obj["title"]
            a.tags = obj["subreddit"]

            # Figure out what url we are going to display.
            if obj["is_video"]:
                a.video = obj["media"]["reddit_video"]["fallback_url"]
                a.videoHeight = obj["media"]["reddit_video"]["height"]
                a.videoWidth = obj["media"]["reddit_video"]["width"]
                a.thumbnail = self.getVideoThumbnail(obj["preview"])
            elif obj["media_only"]:
                # TODO: Review how media-only posts should be handled.
                pass
            elif "gallery" in obj["url"]:
                # Galleries are not in the feed payload; scrape the page for images.
                self.uri = obj["url"]
                source = self.getContent()
                soup = self.getParser(requestsContent=source)
                try:
                    images = soup.find_all(
                        name="img", attrs={"class": "_1dwExqTGJH2jnA-MYGkEL-"})
                    pictures: str = ""
                    for i in images:
                        pictures += f"{i.attrs['src']} "
                    a.thumbnail = pictures
                except Exception as e:
                    self.logger.error(
                        f"Failed to find the images on a Reddit gallery. CSS might have changed. {e}"
                    )
            else:
                a.thumbnail = obj["url"]

            return a
        except Exception as e:
            self.logger.error(
                f"Failed to extract Reddit post. Too many connections? {e}")
            # Return an empty placeholder so the annotated return type holds.
            return Articles()
Example #10
    def parseItem(self, item: BeautifulSoup) -> Articles:
        a = Articles()
        a.url = item.find(name="link", attrs={"type": "text/html"}).attrs["href"]
        if not a.exists():
            rc = RequestContent(url=a.url)
            rc.getPageDetails()
            thumbnail = rc.findArticleThumbnail()

            a.siteName = self.siteName
            a.tags = f"RSS, {self.siteName}"
            a.title = item.find(name="title").text.replace("\n", "").strip()
            a.pubDate = item.find(name="updated").text
            text: str = item.find(name="content").text
            a.thumbnail = thumbnail

            # Strip the surrounding markup; this works on GitHub commit feeds.
            if ">" in text and "<" in text:
                text = re.findall(">(.*?)<", text)[0]
            a.description = text

            # GitHub nests the author name inside a child element.
            author = item.find(name="author")
            if "github.com" in self.url:
                a.authorName = author.find(name="name").text
            else:
                a.authorName = author.text

        return a
Example #11
    def test_00init(self):
        # A freshly constructed Articles object should get a non-empty id.
        a = Articles()
        assert a.id != ""
Example #12
    def getPostInfo(self, link: str) -> Articles:
        a = Articles(url=link, siteName=self.currentLink.name, tags="instagram, posts")

        self.driverGoTo(link)
        soup = self.getParser(requestsContent=self.getContent())

        nameSplit = self.currentLink.name.split(" ")
        if nameSplit[1] == "tag":
            a.tags += f", tag, {nameSplit[2]}"
        elif nameSplit[1] == "user":
            a.tags += f", user, {nameSplit[2]}"

        # Get the title from the post.
        title = soup.find_all(name="span", attrs={"class": ""})

        # Get the poster's avatar.
        authorImages = soup.find_all(name="img")
        for i in authorImages:
            try:
                if "profile picture" in i.attrs["alt"]:
                    a.authorImage = i.attrs["src"]
                    break
            except KeyError:
                # Image without an alt attribute; skip it.
                pass

        # Get the poster's name.
        authorName = soup.find_all(
            name="a", attrs={"class": "sqdOP yWX7d _8A5w5 ZIAjV"}
        )
        a.authorName = authorName[0].text

        # Make sure the title was not just tags; that has happened.
        # TODO: Find a better placeholder value.
        cleanTitle = self.cleanTitle(title[1].text)
        if cleanTitle == "":
            a.title = "Instagram Post"
        else:
            a.title = cleanTitle

        # The tag regex once swallowed a whole title; it has since been adjusted.
        tags = self.getTags(title[1].text)
        if tags != "":
            a.tags = tags

        # Get when the post went up.
        dt = soup.find_all(name="time", attrs={"class": "FH9sR Nzb55"})
        a.pubDate = dt[0].attrs["datetime"]

        # Video link.
        hasVideo = soup.find_all(
            name="span", attrs={"class": "qBUYS _7CSz9 FGFB7 videoSpritePlayButton"}
        )
        hasCollection = soup.find_all(name="button", attrs={"class": "_6CZji"})
        if len(hasVideo) >= 1:
            video = soup.find(name="video", attrs={"class": "tWeCl"})
            a.description = "This post contains a video, view it online!"
            a.video = video.attrs["src"]

        # Check if it contains multiple pictures.
        elif len(hasCollection) >= 1:
            a.description = "This post contains multiple pictures, view them online!"
            a.thumbnail = self.getPicture(soup)
            # TODO: Figure out if the whole collection can be stored.
            # Discord cannot present them all with a single post anyway.

        # Get a single picture.
        else:
            a.thumbnail = self.getPicture(soup)
        return a
Example #13
    def getArticles(self) -> List[Articles]:
        self.logger.debug("Checking YouTube for new content.")
        self.driver = self.driverStart()

        allArticles: List[Articles] = list()

        for site in self.links:
            s = site.name.split(" ")
            self.authorName = ""
            self.authorImage = ""
            self.logger.debug(f"{site.name} - Checking for updates")

            # Pull the source code from the channel's main YouTube page.
            channelID = Cache(key=f"youtube {s[1]} channelID").find()
            if channelID == "":
                self.uri = site.url
                self.driverGoTo(self.uri)
                siteContent: str = self.driverGetContent()
                page: BeautifulSoup = self.getParser(seleniumContent=siteContent)
                channelID = self.getChannelId(page)
                Cache(key=f"youtube {s[1]} channelID", value=channelID).add()

                # Plain requests does not expose these values, so we use Chrome.
                # We are collecting info that is not present in the RSS feed
                # and storing it on the class.
                try:
                    authorImage = page.find_all(name="img", attrs={"id": "img"})
                    self.authorImage = authorImage[0].attrs["src"]
                    Cache(
                        key=f"youtube {s[1]} authorImage", value=self.authorImage
                    ).add()
                except Exception as e:
                    self.logger.error(
                        f"Failed to find the authorImage for {s[1]}. CSS might have changed. {e}"
                    )

                try:
                    authorName = page.find_all(
                        name="yt-formatted-string",
                        attrs={"class": "style-scope ytd-channel-name", "id": "text"},
                    )
                    self.authorName = authorName[0].text
                    Cache(key=f"youtube {s[1]} authorName", value=self.authorName).add()
                except Exception as e:
                    self.logger.error(
                        f"Failed to find the authorName for {s[1]}. CSS might have changed. {e}"
                    )
            else:
                self.authorName = Cache(key=f"youtube {s[1]} authorName").find()
                self.authorImage = Cache(key=f"youtube {s[1]} authorImage").find()

            # Generate the hidden RSS feed uri.
            self.uri = f"{self.feedBase}{channelID}"
            siteContent = self.getContent()
            page = self.getParser(requestsContent=siteContent)

            # The positional indices follow the fixed layout of the feed's entries.
            root = page.contents[2].contents
            for item in root:
                if item.name == "entry":
                    a = Articles()
                    a.url = item.contents[9].attrs["href"]
                    a.video = a.url
                    a.title = item.contents[7].text
                    a.pubDate = item.contents[13].text
                    a.siteName = site.name
                    a.thumbnail = item.contents[17].contents[5].attrs["url"]
                    a.authorImage = self.authorImage
                    a.authorName = self.authorName

                    allArticles.append(a)

        self.driverClose()
        return allArticles
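
The positional contents[...] lookups above break the moment YouTube adds or reorders a child element, so matching entries by tag name is the sturdier variant. A sketch only, assuming the same BeautifulSoup parsing as above; parseEntry is a hypothetical helper, and media:thumbnail is the standard thumbnail element in YouTube's Atom feed.

    def parseEntry(self, item: BeautifulSoup, siteName: str) -> Articles:
        # Look elements up by tag name instead of by position in .contents.
        a = Articles()
        a.url = item.find(name="link").attrs["href"]
        a.video = a.url
        a.title = item.find(name="title").text
        a.pubDate = item.find(name="published").text
        a.siteName = siteName
        thumb = item.find(name="media:thumbnail")
        if thumb is not None:
            a.thumbnail = thumb.attrs["url"]
        a.authorImage = self.authorImage
        a.authorName = self.authorName
        return a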