def extract_posts(self, soup, filename):
    """Extract the posts found on one 'menessentials' topic page.

    Forum and topic details are read once from the page navigation and
    copied onto every post.  One ``Post`` is built per ``div.postbody``
    element.

    Args:
        soup: Parsed page supporting BeautifulSoup-style ``find`` /
            ``findAll`` lookups.
        filename: Not used by this method.

    Returns:
        A list of ``Post`` objects, in the order ``soup.findAll`` yields the
        ``div.postbody`` elements.

    Side effects:
        Quote tables are removed from ``soup`` (via ``extract()``) while the
        posts are processed, and a post count is written to ``sys.stderr``.

    Note:
        Lookups are chained without guards, so a page that lacks one of the
        expected elements, or whose date does not match the expected
        format, raises instead of being skipped.
    """
    # Forum details come from the first viewforum link in the navbar cell.
    forum = soup.find('td', 'navbar-links').find(
        'a', {'href': re.compile(r'^viewforum.php\?.*')})
    forum_url = forum['href']
    # The first run of digits in the forum URL is taken as the forum id.
    forum_id = re.search(r'(\d+)', forum_url).group(1)
    forum_name = self.get_text(forum)

    topic = soup.find('td', 'content content-navbar').table.find('span', 'gen').a
    topic_name = topic.b.string
    topic_url = topic['href']
    topic_id = re.search(r'viewtopic.php.*\Wt=(\d+)', topic_url).group(1)

    posts = []
    for msg in soup.findAll('div', 'postbody'):
        post = Post()
        posts.append(post)
        post.site_name = 'menessentials'
        post.forum_url = forum_url
        post.forum_id = forum_id
        post.forum_name = forum_name
        post.topic_name = topic_name
        post.topic_id = topic_id

        date = msg.parent.find('span', 'postdate')
        post.date = datetime.strptime(self.get_text(date),
                                      "Posted: %a %b %d, %Y %I:%M %p")

        # User fields are only filled in when the name span holds a link.
        user = msg.parent.parent.find('span', 'name').a
        if user:
            post.user_name = user.string
            post.user_url = user["href"]
            post.user_id = re.search(r'u=(\d+)', post.user_url).group(1)

        post.quote = []
        # Reverse, to handle nested quotes: later (inner) tables are detached
        # before the tables that contain them are read.
        for quote in reversed(msg.findAll('table', 'quote')):
            quote.extract()
            q = Quote()
            q.text = self.get_text(quote.find('td', 'quote'))
            quote_user = self.get_text(quote.find('td', 'quote_user'))
            q.user_name = re.sub(r'\s*wrote:$', '', quote_user)
            post.quote.append(q)

        # Now, after quotes are removed, we can extract the text of the post
        post.text = self.get_text(msg)

        # Extract links (quote tables are already detached, so links inside
        # quotes are not included).
        post.link = [link['href'] for link in msg.findAll('a', href=True)]

    print >>sys.stderr, ' ', len(posts), 'posts'
    return posts
def extract_posts(self, soup, filename):
    """Extract the posts found on one 'shavemyface' topic page.

    Forum and topic details are read once from the page and copied onto
    every post.  Posts are located through their ``span.postbody``
    elements; all ``span.postbody`` elements under the same parent are
    merged into a single post.

    Args:
        soup: Parsed page supporting BeautifulSoup-style ``find`` /
            ``findAll`` / ``findNextSiblings`` lookups.
        filename: Not used by this method.

    Returns:
        A list of ``Post`` objects in the order their first
        ``span.postbody`` element is encountered.

    Side effects:
        ``soup`` is modified: quote blocks are removed with ``extract()``
        and the ``contents`` of every consumed span is emptied.  A post
        count is written to ``sys.stderr``.

    Note:
        Lookups are chained without guards, so a page that lacks one of the
        expected elements, or whose date does not match the expected
        format, raises instead of being skipped.  ``post.quote`` is only
        assigned here when at least one quote is found.
    """
    # Forum and topic info will be the same for all posts
    forum = soup.find('a', 'nav', href=re.compile(r'^viewforum.php\?'))
    forum_url = forum['href']
    forum_id = re.search(r'\Wf=(\d+)', forum_url).group(1)
    forum_name = self.get_text(forum)

    topic = soup.find('a', 'maintitle', href=re.compile(r'^viewtopic.php\?'))
    topic_name = topic.string
    topic_url = topic['href']
    topic_id = re.search(r'\Wt=(\d+)', topic_url).group(1)

    def new_post():
        """Return a Post pre-filled with the page-wide forum/topic fields."""
        post = Post()
        post.site_name = 'shavemyface'
        post.forum_url = forum_url
        post.forum_id = forum_id
        post.forum_name = forum_name
        post.topic_name = topic_name
        post.topic_id = topic_id
        return post

    # Loop-invariant patterns, compiled once.
    posted_re = re.compile('Posted: .*')
    subject_re = re.compile('Post subject: .*')
    profile_href_re = re.compile(r'^profile.php\?')

    posts = []
    for msg in soup.findAll('span', 'postbody'):
        # ignore empty spans (or spans already used; see below)
        if not msg.contents:
            continue
        # check that this is not a quote
        if msg.parent.get('class', None) == 'quote':
            continue

        post = new_post()
        posts.append(post)

        # Extract date and subject
        details = msg.parent.parent.parent.find('span', 'postdetails')
        date = details.find(text=posted_re)
        post.date = datetime.strptime(date.string,
                                      "Posted: %a %b %d, %Y %I:%M %p")
        subject = details.find(text=subject_re)
        post.subject = re.sub(r'.*Post subject: ', '', subject.string)

        # Extract user name and id
        container = msg.parent.parent.parent.parent.parent
        post.user_name = container.td.find('span', 'name').b.string
        post_footer = container.findNextSiblings('tr', limit=1)[0]
        user_link = post_footer.find('a', href=profile_href_re)
        if user_link:
            post.user_url = user_link['href']
            post.user_id = re.search(r'profile.php\?.*\bu=(\d+)',
                                     post.user_url).group(1)

        # Extract quotes.  Elements are reversed, so that quotes will come
        # out in the correct order.
        for quote in reversed(msg.parent.findAll('td', 'quote')):
            # The element two levels above the td.quote cell holds both the
            # quoted text and the "<user> wrote:" header; detach it whole.
            quote_block = quote.parent.parent
            quote_block.extract()
            q = Quote()
            q.text = self.get_text(quote)
            quote_user = quote_block.find('span', 'genmed').b.string
            q.user_name = re.sub(r'\s*wrote:$', '', quote_user)
            if hasattr(post, 'quote'):
                post.quote.append(q)
            else:
                post.quote = [q]

        # Extract post content and links
        # All span.postbody siblings constitute post content
        spans = msg.parent.findAll('span', 'postbody')
        post.text = '\n'.join([self.get_text(span) for span in spans])

        # Extract links
        post.link = []
        for span in spans:
            for link in span.findAll('a', href=True):
                post.link.append(link['href'])

        # Empty spans that we've used, so the outer loop skips them when it
        # reaches them later.
        for span in spans:
            span.contents = []

    print >>sys.stderr, ' ', len(posts), 'posts'
    return posts