def topic_page_posts(self, topic, html):
    '''Parse one topic page and return the posts found on it.

    Every div.postarea on the page is treated as one post.  The author is
    read from the <h4> of the div.poster found under the post area's
    parent; the subject and date come from the <h5> inside div.keyinfo
    (the date text is the next <div> sibling of that <h5>, run through
    fixdate); the post id is the number in the "msg_NNN" id of the
    div.inner inside div.post, and that div.inner (markup included) is
    the body.

    topic.firstpost is set to the first pid seen if it is not set yet.
    Returns a list of ptforum.Post objects in page order.  Any missing
    element raises, as no lookups are guarded.
    '''
    tree = soup.BeautifulSoup(html, convertEntities='html')
    found = []
    for area in tree.findAll(soup.tagclass('div', 'postarea')):
        # The poster block lives outside the post area, so search from
        # the enclosing element.
        poster_div = area.parent.find(soup.tagclass('div', 'poster'))
        author = soup.cdata(poster_div.h4)

        heading = area.find(soup.tagclass('div', 'keyinfo')).h5
        subject = soup.cdata(heading.a)
        when_text = soup.cdata(heading.findNextSibling('div')).strip()
        when = fixdate(when_text)

        post_div = area.find(soup.tagclass('div', 'post'))
        inner = post_div.find(soup.tagclass('div', 'inner'))
        pid = re.match(r'msg_(\d+)', inner['id']).group(1)
        # Serialise the whole inner div (tags included) as the body text.
        body = unicode(str(inner), 'utf-8')

        found.append(ptforum.Post(pid=pid, topic=topic, author=author,
                                  datetime=when, subject=subject, body=body))
        if not topic.firstpost:
            topic.firstpost = pid
    return found
def forum_page_topics(self, forum, html):
    '''Collect the topics listed on one forum index page.

    Looks inside <div id="messageindex"> for every td.subject cell.  The
    first <a> inside the cell's first <span> gives the topic title (its
    string, stripped) and the topic id (the digits after "topic=" or
    "topic," in its href).  The td.starter and td.replies cells of the
    same row give the author and the reply count (both as stripped text).

    Each topic object is obtained from forum.topic_find(tid), updated in
    place and appended to the returned list.  Missing elements raise, as
    no lookups are guarded.
    '''
    tree = soup.BeautifulSoup(html, convertEntities='html')
    message_index = tree.find('div', id='messageindex')
    result = []
    for cell in message_index.findAll(soup.tagclass('td', 'subject')):
        link = cell.find('span').find('a')
        tid = re.search(r'topic[=,](\d+)', link['href']).group(1)
        title = link.string.strip()

        # Starter and reply count sit in sibling cells of the same row.
        row = cell.parent
        starter = soup.cdata(row.find(soup.tagclass('td', 'starter'))).strip()
        reply_count = soup.cdata(row.find(soup.tagclass('td', 'replies'))).strip()

        # Create or update topic
        entry = forum.topic_find(tid)
        entry.title = title
        entry.author = starter
        entry.replies = reply_count
        result.append(entry)
    return result
def topic_page_posts(self, topic, html):
    '''Scan a topic page and get a list of posts.

    Expected page structure:

        <table class="forumline">
          <tr>
            <td>...<a name="425" id="425"></a><strong>someuser</strong>...</td>
            <td><table>...</table>
                <table><tr><td class="postbody">...</td></tr>

    Each direct <tr> child of the "forumline" table that has at least two
    direct <td> children is a candidate post row.  The first cell supplies
    the post id (the "name" of an <a name=...> anchor) and the author (the
    first <strong> or <b>); the second cell supplies the date
    (td.postdetails text passed through fixdate) and the body (the
    children of td.postbody, minus a leading newline and <hr>).  Any of
    pid, author and datetime may be None when its element is absent.

    A ptforum.Post is created only for rows containing a td.postbody; its
    subject is always the empty string.  topic.firstpost is set to the
    pid of the first post found if it is not already set.

    Returns the list of posts in page order.
    '''
    posts = []
    doc = soup.BeautifulSoup(html, fromEncoding='utf-8', convertEntities='html')
    table = doc.find(soup.tagclass('table', 'forumline'))
    for tr in table.findAll('tr', recursive=False):
        tds = tr.findAll('td', recursive=False)
        if len(tds) < 2:
            continue
        nauthor, nmessage = tds[:2]
        pid = author = datetime = None

        namea = nauthor.find('a', {'name': True})
        if namea:
            pid = namea['name']

        b = nauthor.find(['strong', 'b'])
        if b:
            author = soup.cdata(b)

        posttd = nmessage.find(soup.tagclass('td', 'postdetails'))
        if posttd:
            datetime = fixdate(soup.cdata(posttd))

        bodytd = nmessage.find(soup.tagclass('td', 'postbody'))
        if not bodytd:
            # Rows without a body cell are not posts.
            continue

        contents = bodytd.contents
        # Remove guff from start
        if contents and contents[0] == '\n':
            contents = contents[1:]
        # Text nodes have no usable .name (BeautifulSoup 3 raises
        # AttributeError for it), so look it up defensively: a body that
        # starts with plain text must not crash the scan.
        if contents and getattr(contents[0], 'name', None) == 'hr':
            contents = contents[1:]
        body = ''.join(map(unicode, contents))

        posts.append(ptforum.Post(pid=pid, topic=topic, author=author,
                                  datetime=datetime, subject='', body=body))
        if not topic.firstpost:
            topic.firstpost = pid
    return posts
def forum_page_topics(self, forum, html):
    '''Find all the topics on a forum page.

    Page structure:

        <table ...>
        <table class="forumline">
          <tr><th>...
          <tr>
            <td>...</td>
            <td>...<span class="topictitle"></span><a href="...">topic title</a>...</td>
            <td><span>8</span></td>
            <td><span> <a href="...">authorid</a> </span></td>
            <td><span>41</span></td>
            <td><span> Sun May 16, 2010 3:06 pm <br /><a>...</a> <a href="viewtopic...">...</td>
            ...
          </tr>

    Every <tr> under the first "forumline" table with at least six <td>
    descendants is treated as a topic row; of its first six cells only the
    2nd (topic), 3rd (replies) and 4th (author) are used.  The title and
    topic id are taken from an <a> with class "topictitle" in the topic
    cell (the id is the digits after the first "t=" in its href); either
    stays None if the link or the id is missing.  The reply count is the
    cell text as returned by soup.cdata.

    Each topic object is obtained from forum.topic_find(tid), updated in
    place and appended to the returned list.
    '''
    topics = []
    doc = soup.BeautifulSoup(html, convertEntities='html')
    index = doc.find(soup.tagclass('table', 'forumline'))
    for tr in index.findAll('tr'):
        tds = tr.findAll('td')
        if len(tds) < 6:
            # Header/spacer rows do not have the full set of columns.
            continue
        ntopic, nreplies, nauthor = tds[1:4]
        title = tid = author = None

        topica = ntopic.find(soup.tagclass('a', 'topictitle'))
        if topica:
            title = soup.cdata(topica)
            # Raw string: '\d' in a plain literal is an invalid escape.
            m = re.match(r'.*?t=(\d+)', topica['href'])
            if m:
                tid = m.group(1)

        replies = soup.cdata(nreplies)

        authora = nauthor.find('a')
        if authora:
            author = soup.cdata(authora)

        # Create or update topic
        topic = forum.topic_find(tid)
        topic.title = title
        topic.author = author
        topic.replies = replies
        topics.append(topic)
    return topics
def topic_page_posts(self, topic, html):
    '''Scan a topic page and get a list of posts.

    Expected page structure:

        <div...>
          <h2>SUBJECT</h2>
        </div>
        <div id="pPID" class="post ...">
          <div class="postbody ...">
            <div class="author">by <strong><a href="./memberlist.php...">AUTHOR</a></strong>
              » DAY MON DD, YEAR H:MM PM
            </div>
            <div class="content">BODY</div>

    The text of the first <h2> on the page is used as the subject of
    every post.  For each div.post: the pid is its id attribute with
    leading "p" characters stripped; the author is the text of the first
    <a> in div.author; the date is the full div.author text passed
    through fixdate; the body is the concatenated children of
    div.content.  A missing div.author, author link or div.content is
    logged as a warning and leaves the corresponding fields as None.

    topic.firstpost is set to the pid of the first post found if it is
    not already set.  Returns the list of ptforum.Post objects in page
    order.
    '''
    posts = []
    doc = soup.BeautifulSoup(html, fromEncoding='utf-8', convertEntities='html')
    h2 = doc.find('h2')
    title = soup.cdata(h2)
    _logger.info('TITLE %s', title)
    for postdiv in doc.findAll(soup.tagclass('div', 'post')):
        author = datetime = subject = body = None
        pid = postdiv['id'].lstrip('p')    # E.g. 'p12345' -> '12345'

        authdiv = postdiv.find(soup.tagclass('div', 'author'))
        if not authdiv:
            _logger.warning('** No div.author')
        else:
            a = authdiv.find('a')
            if not a:
                _logger.warning('** no a in div.author')
            else:
                author = soup.cdata(a)
                _logger.info('AUTHOR %s', author)
            # The date is parsed from the whole author line, so it does
            # not depend on the author link being present.
            datetime = fixdate(soup.cdata(authdiv))
            _logger.info('DATETIME %s', datetime)

        contdiv = postdiv.find(soup.tagclass('div', 'content'))
        if not contdiv:
            _logger.warning('** no div.content')
        else:
            body = ''.join(map(unicode, contdiv.contents))

        # Individual posts carry no subject here, so fall back to the
        # page title.
        posts.append(ptforum.Post(pid=pid, topic=topic, author=author,
                                  datetime=datetime,
                                  subject=subject or title, body=body))
        if not topic.firstpost:
            topic.firstpost = pid
    return posts
def forum_page_topics(self, forum, html):
    '''Find all the topics on a forum page.

    Page structure:

        <ul class="topiclist topics">
          <li class="row bg1">
            <dl class="row-fluid">
              <dt ...>
                <a...>
                <a class="topictitle" href="./viewtopic.php?f=35&t=10369">@title</a>
              <dd ...> 7 <dfn>Replies</dfn>
              <dd ...> <a href="./memberlist.php...">@USER</a>
              <dd ...> <dfn>Last post</dfn> by
                <a href="./memberlist.php..">@LASTUSER</a>
                <br>
                <a href="./viewtopic.php?f=35&t=10396&p=117246#p117246"><time...></a>
              ...
          <li>...
          ...
        <div class="paging ...">
          <span>
            <a href="./viewforum.php?f=35&start=25">2</a>

    N.B. There may be more than one <ul class="topiclist topics">

    For every <li> under each ul.topics that contains a <dl> with a <dt>:
    the title and topic id come from the a.topictitle link in the <dt>
    (the id is extracted from its href with t_nRE); the author is the
    text of a memberlist.php link in a <dd> that has no <dfn>; the reply
    count is the first whitespace-separated token of the <dd> whose <dfn>
    contains "Replies".  The last-post id is extracted and logged but not
    stored on the topic.  <li> elements without a <dl>/<dt> are skipped.

    Each topic object is obtained from forum.topic_find(tid), updated in
    place and appended to the returned list.
    '''
    topics = []
    doc = soup.BeautifulSoup(html, convertEntities='html')
    for index in doc.findAll(soup.tagclass('ul', 'topics')):
        for li in index.findAll('li'):
            # .get(): an <li> without a class attribute must not raise.
            _logger.debug('-- li %s', li.get('class'))
            dl = li.find('dl')
            dt = dl.find('dt') if dl is not None else None
            if dt is None:
                # findAll('li') is recursive, so it can return <li>
                # elements that are not topic rows; ignore those.
                _logger.debug('-- li without dl/dt, skipped')
                continue

            title = tid = replies = author = lastpost = None
            topica = dt.find(soup.tagclass('a', 'topictitle'))
            if topica:
                title = soup.cdata(topica)
                _logger.info('TITLE %s', title)
                m = t_nRE.search(topica['href'])
                if m:
                    tid = m.group(1)
                    _logger.info('TID %s', tid)
                else:
                    _logger.warning('** no tid')

            for dd in dl.findAll('dd'):
                dfn = dd.find('dfn')
                if not dfn:
                    # Unlabelled cell: the topic starter, when linked.
                    a = dd.find('a')
                    if a is None:
                        continue
                    href = a.get('href') or ''
                    if 'memberlist.php' in href.split('?')[0]:
                        author = soup.cdata(a)
                        _logger.info('AUTHOR %s', author)
                    continue

                text = soup.cdata(dfn)
                if 'Replies' in text:
                    replies = soup.cdata(dd).split()[0]
                    _logger.info('REPLIES %s', replies)
                elif 'Last post' in text:
                    _logger.debug('-- Last post')
                    for a in dd.findAll('a'):
                        href = a.get('href') or ''
                        if 'viewtopic.php' in href.split('?')[0]:
                            m = t_nRE.search(href)
                            if m:
                                lastpost = m.group(1)
                                _logger.info('LASTPOST %s', lastpost)
                            else:
                                _logger.warning('** no lastpost')

            # Create or update topic
            # NOTE(review): tid can still be None here (no topictitle
            # link, or no id in its href) -- confirm topic_find copes.
            topic = forum.topic_find(tid)
            topic.title = title
            topic.author = author
            topic.replies = replies
            topics.append(topic)
    return topics