Python BeautifulSoup Examples, pyhn.lib.bs4_py3.BeautifulSoup Python Examples

Example #1

0

Show file

 def getStoryNumber(self, source):
     """
     Extracts the rank of a story from HTML and returns it as an int.

     Looks up the first <span class="rank"> in source, removes every '.'
     from its string and converts the remainder with int().
     """
     rank_span = BeautifulSoup(source).find('span', attrs={'class': 'rank'})
     return int(rank_span.string.replace('.', ''))

Example #2

0

Show file

File: hnapi.py Project: Tienenbao/pyhn

 def getStoryNumber(self, source):
     """
     Extracts the rank of a story from HTML and returns it as an int.

     Looks up the first <span class="rank"> in source, removes every '.'
     from its string and converts the remainder with int().
     """
     rank_span = BeautifulSoup(source).find('span', attrs={'class': 'rank'})
     return int(rank_span.string.replace('.', ''))

Example #3

0

Show file

 def getStoryTitle(self, source):
     """
     Returns the title text of a story.

     The title is the text of the first <td class="title"> in source,
     with leading and trailing whitespace removed.
     """
     soup = BeautifulSoup(source)
     title_cell = soup.find('td', attrs={'class': 'title'})
     return title_cell.text.strip()

Example #4

0

Show file

File: hnapi.py Project: Tienenbao/pyhn

 def getStoryTitle(self, source):
     """
     Returns the title text of a story.

     The title is the text of the first <td class="title"> in source,
     with leading and trailing whitespace removed.
     """
     soup = BeautifulSoup(source)
     title_cell = soup.find('td', attrs={'class': 'title'})
     return title_cell.text.strip()

Example #5

0

Show file

 def getStoryDomain(self, source):
     """
     Returns the href of the first link in source as an absolute URL.

     An href that already carries a network location is returned
     unchanged; any other href is resolved against
     https://news.ycombinator.com.
     """
     href = BeautifulSoup(source).find('a').get('href')
     if not urlparse(href).netloc:
         # No host in the href: treat it as relative to Hacker News.
         return urljoin('https://news.ycombinator.com', href)
     return href

Example #6

0

Show file

File: hnapi.py Project: Tienenbao/pyhn

 def getStoryDomain(self, source):
     """
     Returns the href of the first link in source as an absolute URL.

     An href that already carries a network location is returned
     unchanged; any other href is resolved against
     https://news.ycombinator.com.
     """
     href = BeautifulSoup(source).find('a').get('href')
     if not urlparse(href).netloc:
         # No host in the href: treat it as relative to Hacker News.
         return urljoin('https://news.ycombinator.com', href)
     return href

Example #7

0

Show file

 def getMoreLink(self, source):
     """
     Returns the absolute URL of the "More" link, or None if there is none.

     Searches source for <a rel="nofollow"> tags whose text is "More" and
     resolves the href of the first match against
     https://news.ycombinator.com/.
     """
     matches = BeautifulSoup(source).findAll(
         "a", {"rel": "nofollow"}, text="More")
     if not matches:
         return None
     href = matches[0]['href']
     return urljoin('https://news.ycombinator.com/', href)

Example #8

0

Show file

    def getStories(self, source):
        """
        Looks at source, makes stories from it, returns the stories.

        Sets self.numberOfStoriesOnFrontPage to the number of
        'span class="rank"' occurrences in source and returns a list with
        that many HackerNewsStory objects, filled in from the "title" and
        "subtext" table cells of the page.

        Example of a rank cell:
            <td align=right valign=top class="title">31.</td>

        Raises IndexError if fewer cells are parsed than rank spans were
        counted.
        """
        self.numberOfStoriesOnFrontPage = source.count('span class="rank"')

        # Create the empty stories.
        newsStories = [
            HackerNewsStory()
            for _ in range(self.numberOfStoriesOnFrontPage)
        ]

        soup = BeautifulSoup(source)
        # Gives URLs, domains and titles.
        story_details = soup.findAll("td", {"class": "title"})
        # Gives score, submitter, comment count and comment URL.
        story_other_details = soup.findAll("td", {"class": "subtext"})

        # Even-indexed title cells are parsed for the story number; the
        # last cell is never visited. The helpers take markup, so each
        # BeautifulSoup object is converted back to a string first.
        storyNumbers = [
            self.getStoryNumber(str(story_details[i]))
            for i in range(0, len(story_details) - 1, 2)
        ]

        # Every second cell contains a story: (URL, domain, title).
        titleFields = []
        for i in range(1, len(story_details), 2):
            cell = str(story_details[i])
            titleFields.append((
                self.getStoryURL(cell),
                self.getStoryDomain(cell),
                self.getStoryTitle(cell),
            ))

        # One subtext cell per story:
        # (score, submitter, comment count, comments URL, time, id).
        subtextFields = []
        for detail in story_other_details:
            cell = str(detail)
            subtextFields.append((
                self.getStoryScore(cell),
                self.getSubmitter(cell),
                self.getCommentCount(cell),
                self.getCommentsURL(cell),
                self.getPublishedTime(cell),
                self.getHNID(cell),
            ))

        # Associate the values with our newsStories.
        for i, story in enumerate(newsStories):
            story.number = storyNumbers[i]
            story.URL, story.domain, story.title = titleFields[i]

            (score, submitter, commentCount,
             commentsURL, publishedTime, hnID) = subtextFields[i]
            story.score = score
            story.submitter = submitter
            story.submitterURL = \
                "https://news.ycombinator.com/user?id=" + submitter
            story.commentCount = commentCount
            story.commentsURL = commentsURL
            story.publishedTime = publishedTime
            story.id = hnID

            # A negative id is treated as "no comment/submitter data"
            # (presumably getHNID's not-found value -- confirm against
            # getHNID), so the dependent fields are reset.
            if story.id < 0:
                story.commentsURL = ''
                story.submitter = -1
                story.submitterURL = -1

        return newsStories

Example #9

0

Show file

File: hnapi.py Project: Tienenbao/pyhn

 def getMoreLink(self, source):
     """
     Returns the absolute URL of the "More" link, or None if there is none.

     Searches source for <a rel="nofollow"> tags whose text is "More" and
     resolves the href of the first match against
     https://news.ycombinator.com/.
     """
     matches = BeautifulSoup(source).findAll(
         "a", {"rel": "nofollow"}, text="More")
     if not matches:
         return None
     href = matches[0]['href']
     return urljoin('https://news.ycombinator.com/', href)

Example #10

0

Show file

File: hnapi.py Project: Tienenbao/pyhn

    def getStories(self, source):
        """
        Looks at source, makes stories from it, returns the stories.

        Sets self.numberOfStoriesOnFrontPage to the number of
        'span class="rank"' occurrences in source and returns a list with
        that many HackerNewsStory objects, filled in from the "title" and
        "subtext" table cells of the page.

        Example of a rank cell:
            <td align=right valign=top class="title">31.</td>

        Raises IndexError if fewer cells are parsed than rank spans were
        counted.
        """
        self.numberOfStoriesOnFrontPage = source.count('span class="rank"')

        # Create the empty stories.
        newsStories = [
            HackerNewsStory()
            for _ in range(self.numberOfStoriesOnFrontPage)
        ]

        soup = BeautifulSoup(source)
        # Gives URLs, domains and titles.
        story_details = soup.findAll("td", {"class": "title"})
        # Gives score, submitter, comment count and comment URL.
        story_other_details = soup.findAll("td", {"class": "subtext"})

        # Even-indexed title cells are parsed for the story number; the
        # last cell is never visited. The helpers take markup, so each
        # BeautifulSoup object is converted back to a string first.
        storyNumbers = [
            self.getStoryNumber(str(story_details[i]))
            for i in range(0, len(story_details) - 1, 2)
        ]

        # Every second cell contains a story: (URL, domain, title).
        titleFields = []
        for i in range(1, len(story_details), 2):
            cell = str(story_details[i])
            titleFields.append((
                self.getStoryURL(cell),
                self.getStoryDomain(cell),
                self.getStoryTitle(cell),
            ))

        # One subtext cell per story:
        # (score, submitter, comment count, comments URL, time, id).
        subtextFields = []
        for detail in story_other_details:
            cell = str(detail)
            subtextFields.append((
                self.getStoryScore(cell),
                self.getSubmitter(cell),
                self.getCommentCount(cell),
                self.getCommentsURL(cell),
                self.getPublishedTime(cell),
                self.getHNID(cell),
            ))

        # Associate the values with our newsStories.
        for i, story in enumerate(newsStories):
            story.number = storyNumbers[i]
            story.URL, story.domain, story.title = titleFields[i]

            (score, submitter, commentCount,
             commentsURL, publishedTime, hnID) = subtextFields[i]
            story.score = score
            story.submitter = submitter
            story.submitterURL = \
                "https://news.ycombinator.com/user?id=" + submitter
            story.commentCount = commentCount
            story.commentsURL = commentsURL
            story.publishedTime = publishedTime
            story.id = hnID

            # A negative id is treated as "no comment/submitter data"
            # (presumably getHNID's not-found value -- confirm against
            # getHNID), so the dependent fields are reset.
            if story.id < 0:
                story.commentsURL = ''
                story.submitter = -1
                story.submitterURL = -1

        return newsStories