Ejemplo n.º 1
0
 def new_post():
     """Build a Post pre-populated with the site, forum and topic context.

     The forum and topic values are not parameters: they are looked up by
     name in the surrounding scope each time this function is called.
     """
     fresh = Post()
     shared_fields = (
         ('site_name', 'shavemyface'),
         ('forum_url', forum_url),
         ('forum_id', forum_id),
         ('forum_name', forum_name),
         ('topic_name', topic_name),
         ('topic_id', topic_id),
     )
     for attr_name, attr_value in shared_fields:
         setattr(fresh, attr_name, attr_value)
     return fresh
Ejemplo n.º 2
0
    def extract_posts(self, soup, filename):
        """Extract every post found on a parsed topic page.

        The forum and topic the page belongs to are read once from the
        navigation area and copied onto each post. For every
        ``div.postbody`` element a ``Post`` is built carrying the date, the
        author (when the author name is a link), the quoted blocks, the
        remaining body text and the link targets found in the body.

        Args:
            soup: parsed page; must offer the BeautifulSoup-style
                ``find``/``findAll`` navigation used below.
            filename: not used by this method; kept so the signature stays
                unchanged for callers.

        Returns:
            A list of ``Post`` objects in the order the messages appear on
            the page.

        Raises:
            AttributeError: if the forum/topic navigation elements, or the
                ids inside their URLs, cannot be found.
            ValueError: if a post date does not match the expected format.
        """
        # Raw strings keep the regex escapes intact; the dot before "php" is
        # escaped so it only matches a literal dot.
        forum_href = re.compile(r'^viewforum\.php\?')
        user_id_pattern = re.compile(r'u=(\d+)')
        wrote_suffix = re.compile(r'\s*wrote:$')

        forum = soup.find('td', 'navbar-links').find('a', {'href': forum_href})
        forum_url = forum['href']
        forum_id = re.search(r'(\d+)', forum_url).group(1)
        forum_name = self.get_text(forum)

        topic = soup.find('td', 'content content-navbar').table.find('span', 'gen').a
        topic_name = topic.b.string
        topic_url = topic['href']
        topic_id = re.search(r'viewtopic\.php.*\Wt=(\d+)', topic_url).group(1)

        posts = []
        for msg in soup.findAll('div', 'postbody'):
            post = Post()

            post.site_name = 'menessentials'
            post.forum_url = forum_url
            post.forum_id = forum_id
            post.forum_name = forum_name
            post.topic_name = topic_name
            post.topic_id = topic_id

            # Date text looks like "Posted: Mon Jan 05, 2009 3:07 pm".
            date = msg.parent.find('span', 'postdate')
            post.date = datetime.strptime(self.get_text(date), "Posted: %a %b %d, %Y %I:%M %p")

            # The author is only recorded when the name is rendered as a
            # link; a missing name span is treated the same as a missing link
            # instead of raising AttributeError.
            name_span = msg.parent.parent.find('span', 'name')
            user = name_span.a if name_span is not None else None
            if user is not None:
                post.user_name = user.string
                post.user_url = user["href"]
                post.user_id = user_id_pattern.search(post.user_url).group(1)

            # Walk the quote tables in reverse document order so nested
            # (inner) quotes are detached before the quote that contains
            # them; each quote's text then excludes its inner quotes.
            post.quote = []
            for quote in reversed(msg.findAll('table', 'quote')):
                quote.extract()
                q = Quote()
                q.text = self.get_text(quote.find('td', 'quote'))
                quote_user = self.get_text(quote.find('td', 'quote_user'))
                q.user_name = wrote_suffix.sub('', quote_user)
                post.quote.append(q)

            # With the quotes removed, what is left is the post's own text.
            post.text = self.get_text(msg)

            # Links are collected after quote removal, so links that only
            # appear inside quoted text are not included.
            post.link = [link['href'] for link in msg.findAll('a', href=True)]

            posts.append(post)

        # Same bytes as the former Python 2 print statement, but also valid
        # on Python 3.
        sys.stderr.write('     %d posts\n' % len(posts))
        return posts