def extract_posts(self, soup, filename):
    """Extract the posts of one topic page into a list of ``Post`` objects.

    Forum and topic details are read once from the page's navigation
    elements and copied onto every post. Each ``div.postbody`` element
    yields one post carrying its date, author (when the author name is a
    link), quotes, remaining text and outgoing links.

    Args:
        soup: Parsed page offering BeautifulSoup-style ``find``/``findAll``.
        filename: Not used by this method; kept for interface compatibility.

    Returns:
        List of ``Post`` objects, one per ``div.postbody``, in the order
        returned by ``findAll``.

    Raises:
        No validation of the page layout is done: a missing navigation
        element or id propagates as an exception (e.g. ``ValueError``
        from ``strptime`` when the date has an unexpected format).
    """
    # Forum and topic info is the same for all posts on the page.
    nav_links = soup.find('td', 'navbar-links')
    forum = nav_links.find('a', {'href': re.compile(r'^viewforum.php\?.*')})
    forum_url = forum['href']
    forum_id = re.search(r'(\d+)', forum_url).group(1)
    forum_name = self.get_text(forum)

    topic = soup.find('td', 'content content-navbar').table.find('span', 'gen').a
    topic_name = topic.b.string
    topic_id = re.search(r'viewtopic.php.*\Wt=(\d+)', topic['href']).group(1)

    posts = []
    for msg in soup.findAll('div', 'postbody'):
        post = Post()
        posts.append(post)

        post.site_name = 'menessentials'
        post.forum_url = forum_url
        post.forum_id = forum_id
        post.forum_name = forum_name
        post.topic_name = topic_name
        post.topic_id = topic_id

        date = msg.parent.find('span', 'postdate')
        post.date = datetime.strptime(self.get_text(date), "Posted: %a %b %d, %Y %I:%M %p")

        # The author is optional: user fields are only set when the name
        # span exists and wraps a profile link.
        name_span = msg.parent.parent.find('span', 'name')
        user = name_span.a if name_span is not None else None
        if user:
            post.user_name = user.string
            post.user_url = user["href"]
            post.user_id = re.search(r'u=(\d+)', post.user_url).group(1)

        # Walk the quote tables in reverse so that nested quotes are
        # detached before the text of the enclosing quote is read.
        post.quote = []
        for quote in reversed(msg.findAll('table', 'quote')):
            quote.extract()
            q = Quote()
            q.text = self.get_text(quote.find('td', 'quote'))
            quote_user = self.get_text(quote.find('td', 'quote_user'))
            q.user_name = re.sub(r'\s*wrote:$', '', quote_user)
            post.quote.append(q)

        # Quotes have been removed from msg, so what is left is the
        # post's own text and its own links.
        post.text = self.get_text(msg)
        post.link = [link['href'] for link in msg.findAll('a', href=True)]

    # Progress line on stderr; same output as the former Python 2
    # ``print >>sys.stderr`` statement, but valid on Python 3 as well.
    sys.stderr.write('     %d posts\n' % len(posts))
    return posts
Exemple #2
0
    def extract_posts(self, soup, filename):
        """Extract the posts of one topic page into a list of ``Post`` objects.

        Forum and topic details are read once from the ``a.nav`` and
        ``a.maintitle`` links and copied onto every post. Posts are found
        through their ``span.postbody`` elements; a post's content may be
        spread over several sibling spans, which are merged into one post.

        Args:
            soup: Parsed page offering BeautifulSoup-style ``find``/``findAll``.
            filename: Not used by this method; kept for interface
                compatibility.

        Returns:
            List of ``Post`` objects. ``post.quote`` is only set on posts
            that contain at least one quote; ``user_url``/``user_id`` are
            only set when a profile link is found.

        Raises:
            No validation of the page layout is done: a missing element or
            id propagates as an exception (e.g. ``ValueError`` from
            ``strptime`` when the date has an unexpected format).
        """
        # Forum and topic info will be the same for all posts
        forum = soup.find('a', 'nav', href=re.compile(r'^viewforum.php\?'))
        forum_url = forum['href']
        forum_id = re.search(r'\Wf=(\d+)', forum_url).group(1)
        forum_name = self.get_text(forum)

        topic = soup.find('a', 'maintitle', href=re.compile(r'^viewtopic.php\?'))
        topic_name = topic.string
        topic_id = re.search(r'\Wt=(\d+)', topic['href']).group(1)

        def new_post():
            """Return a Post pre-filled with the page-wide forum/topic info."""
            post = Post()
            post.site_name = 'shavemyface'
            post.forum_url = forum_url
            post.forum_id = forum_id
            post.forum_name = forum_name
            post.topic_name = topic_name
            post.topic_id = topic_id
            return post

        # Patterns used for every post; compiled once outside the loop.
        posted_re = re.compile(r'Posted: .*')
        subject_re = re.compile(r'Post subject: .*')
        profile_re = re.compile(r'^profile.php\?')

        posts = []
        for msg in soup.findAll('span', 'postbody'):

            # Ignore empty spans, including spans emptied at the end of an
            # earlier iteration because they belonged to a previous post.
            if not msg.contents:
                continue

            # Skip spans that sit directly inside a quote.
            if msg.parent.get('class', None) == 'quote':
                continue

            post = new_post()
            posts.append(post)

            # Extract date and subject
            details = msg.parent.parent.parent.find('span', 'postdetails')
            date = details.find(text=posted_re)
            post.date = datetime.strptime(date.string, "Posted: %a %b %d, %Y %I:%M %p")
            subject = details.find(text=subject_re)
            post.subject = re.sub(r'.*Post subject: ', '', subject.string)

            # Extract user name and id. The name is in the post's own
            # container; the profile link is looked up in the table row
            # that follows it.
            post_row = msg.parent.parent.parent.parent.parent
            post.user_name = post_row.td.find('span', 'name').b.string
            post_footer = post_row.findNextSiblings('tr', limit=1)[0]
            user_link = post_footer.find('a', href=profile_re)
            if user_link:
                post.user_url = user_link['href']
                post.user_id = re.search(r'profile.php\?.*\bu=(\d+)', post.user_url).group(1)

            # Extract quotes. Elements are processed in reverse so that
            # quotes will come out in the correct order; each quote's
            # surrounding table is detached from the post before its text
            # is read.
            for quote in reversed(msg.parent.findAll('td', 'quote')):
                quote_table = quote.parent.parent
                quote_table.extract()
                q = Quote()
                q.text = self.get_text(quote)
                quote_user = quote_table.find('span', 'genmed').b.string
                q.user_name = re.sub(r'\s*wrote:$', '', quote_user)
                if hasattr(post, 'quote'):
                    post.quote.append(q)
                else:
                    post.quote = [q]

            # Extract post content and links.
            # All span.postbody elements under the same parent constitute
            # the content of this post.
            spans = msg.parent.findAll('span', 'postbody')
            post.text = '\n'.join([self.get_text(span) for span in spans])

            post.link = [
                link['href']
                for span in spans
                for link in span.findAll('a', href=True)
            ]

            # Empty the spans that were used, so that the outer loop skips
            # them instead of turning them into separate posts.
            for span in spans:
                span.contents = []

        # Progress line on stderr; same output as the former Python 2
        # ``print >>sys.stderr`` statement, but valid on Python 3 as well.
        sys.stderr.write('     %d posts\n' % len(posts))
        return posts