Beispiel #1
0
    def topic_page_posts(self, topic, html):
        '''Extract the posts found on one topic page.

        topic -- topic object the posts belong to; its ``firstpost``
                 attribute is set to the first post id seen when it is
                 still unset (falsy).
        html  -- markup of the topic page.

        Every div.postarea on the page yields one ptforum.Post.  The post
        id is the number N taken from the ``msg_N`` id of the div.inner
        element, and the body is the markup of that element.

        Returns the list of ptforum.Post objects in page order.
        '''
        tree = soup.BeautifulSoup(html, convertEntities='html')
        found = []
        for area in tree.findAll(soup.tagclass('div', 'postarea')):
            # The poster block is searched for from the parent of the
            # post area, not from inside the post area itself.
            poster_div = area.parent.find(soup.tagclass('div', 'poster'))
            author = soup.cdata(poster_div.h4)

            # Subject link lives in the <h5>; the date text is in the
            # <div> that follows that heading.
            heading = area.find(soup.tagclass('div', 'keyinfo')).h5
            subject = soup.cdata(heading.a)
            date_text = soup.cdata(heading.findNextSibling('div')).strip()
            posted = fixdate(date_text)

            post_div = area.find(soup.tagclass('div', 'post'))
            inner_div = post_div.find(soup.tagclass('div', 'inner'))
            pid = re.match(r'msg_(\d+)', inner_div['id']).group(1)
            body = unicode(str(inner_div), 'utf-8')

            found.append(ptforum.Post(pid=pid,
                                      topic=topic,
                                      author=author,
                                      datetime=posted,
                                      subject=subject,
                                      body=body))

            if not topic.firstpost:
                topic.firstpost = pid
        return found
Beispiel #2
0
    def forum_page_topics(self, forum, html):
        '''Collect the topics listed on one forum index page.

        forum -- object whose ``topic_find(tid)`` supplies the topic
                 object to create or update for a given topic id.
        html  -- markup of the forum page.

        Each td.subject cell inside div#messageindex describes one topic:
        the link in its <span> gives the id (``topic=N`` or ``topic,N`` in
        the href) and the title, while the td.starter and td.replies
        cells of the same row give the author and the reply count (kept
        as text).

        Returns the list of topic objects in page order.
        '''
        tree = soup.BeautifulSoup(html, convertEntities='html')
        listing = tree.find('div', id='messageindex')
        found = []
        for subject_cell in listing.findAll(soup.tagclass('td', 'subject')):
            link = subject_cell.find('span').find('a')
            tid = re.search(r'topic[=,](\d+)', link['href']).group(1)
            title = link.string.strip()

            # Starter and reply count sit in sibling cells of the same row.
            row = subject_cell.parent
            author = soup.cdata(row.find(soup.tagclass('td', 'starter'))).strip()
            replies = soup.cdata(row.find(soup.tagclass('td', 'replies'))).strip()

            # Create or update topic
            entry = forum.topic_find(tid)
            entry.title = title
            entry.author = author
            entry.replies = replies
            found.append(entry)
        return found
Beispiel #3
0
    def topic_page_posts(self, topic, html):
        '''Scan a topic page and get a list of posts.

        Page structure:
         <table class="forumline">
         <tr>
          <td>...<a name="425" id="425"></a><strong>someuser</strong>...</td>
          <td><table>...</table> <table><tr><td class="postbody">...</td></tr>

        topic -- topic object the posts belong to; its ``firstpost``
                 attribute is set from the first row processed while it
                 is still unset (falsy).
        html  -- markup of the topic page, decoded as UTF-8.

        One ptforum.Post is built for every direct-child <tr> of the
        table.forumline that has at least two direct-child <td> cells.
        Fields that cannot be located in a row are left as None; subject
        is '' whenever a td.postbody cell is present.

        Returns the list of ptforum.Post objects in page order.
        '''
        posts = []
        doc = soup.BeautifulSoup(html, fromEncoding='utf-8', convertEntities='html')
        table = doc.find(soup.tagclass('table', 'forumline'))
        for tr in table.findAll('tr', recursive=False):
            tds = tr.findAll('td', recursive=False)
            if len(tds) < 2:
                continue
            nauthor, nmessage = tds[:2]
            pid = author = posted = subject = body = None

            # The post id is the name of the anchor in the author cell.
            namea = nauthor.find('a', {'name': True})
            if namea:
                pid = namea['name']
            b = nauthor.find(['strong', 'b'])
            if b:
                author = soup.cdata(b)

            posttd = nmessage.find(soup.tagclass('td', 'postdetails'))
            if posttd:
                posted = fixdate(soup.cdata(posttd))

            bodytd = nmessage.find(soup.tagclass('td', 'postbody'))
            if bodytd:
                subject = ''
                contents = bodytd.contents
                # Remove guff from start: a lone newline, then an <hr>.
                if contents and contents[0] == '\n':
                    contents = contents[1:]
                # Text nodes have no ``name`` attribute, so use getattr:
                # a body that starts with plain text must not raise here.
                if contents and getattr(contents[0], 'name', None) == 'hr':
                    contents = contents[1:]
                body = ''.join(map(unicode, contents))

            # NOTE(review): rows without a named anchor still produce a
            # Post with pid=None -- confirm that callers expect this.
            posts.append(ptforum.Post(pid=pid, topic=topic,
                                      author=author,
                                      datetime=posted,
                                      subject=subject,
                                      body=body))

            if not topic.firstpost:
                topic.firstpost = pid
        return posts
Beispiel #4
0
    def forum_page_topics(self, forum, html):
        '''Find all the topics on a forum page.

        Page structure:
          <table ...>
           <table class="forumline">
            <tr><th>...
            <tr>
             <td>...</td>
             <td>...<span class="topictitle"></span><a href="...">topic title</a>...</td>
             <td><span>8</span></td>
             <td><span>&nbsp;<a href="...">authorid</a>&nbsp;</span></td>
             <td><span>41</span></td>
             <td><span> Sun May 16, 2010 3:06 pm <br /><a>...</a> <a href="viewtopic...">...</td>
             ...
            </tr>

        forum -- object whose ``topic_find(tid)`` supplies the topic
                 object to create or update for a given topic id.
        html  -- markup of the forum page.

        Only rows of the table.forumline with at least six <td> cells are
        considered.  A row is skipped when it has no a.topictitle link or
        when that link's href carries no ``t=<digits>`` parameter, so
        ``topic_find`` is never called with a missing id.  The reply
        count is kept as text; author is None when the author cell has
        no link.

        Returns the list of topic objects in page order.
        '''
        topics = []
        doc = soup.BeautifulSoup(html, convertEntities='html')
        index = doc.find(soup.tagclass('table', 'forumline'))
        for tr in index.findAll('tr'):
            tds = tr.findAll('td')
            if len(tds) < 6:
                continue
            # Cell order: type, topic, replies, author, views, last post.
            ntopic, nreplies, nauthor = tds[1:4]

            topica = ntopic.find(soup.tagclass('a', 'topictitle'))
            if topica is None:
                continue
            # The lookbehind keeps "t=" from matching the tail of another
            # parameter name such as "start=" or "highlight=".
            m = re.search(r'(?<![A-Za-z_])t=(\d+)', topica.get('href', ''))
            if m is None:
                continue
            tid = m.group(1)
            title = soup.cdata(topica)
            replies = soup.cdata(nreplies)
            authora = nauthor.find('a')
            author = soup.cdata(authora) if authora is not None else None

            # Create or update topic
            topic = forum.topic_find(tid)
            topic.title = title
            topic.author = author
            topic.replies = replies

            topics.append(topic)
        return topics
Beispiel #5
0
    def topic_page_posts(self, topic, html):
        '''Scan a topic page and get a list of posts.

        Page structure:
        <div...>
          <h2>SUBJECT</h2>
        </div>
        <div id="pPID" class="post ...">
         <div class="postbody ...">
          <div class="author">by <strong><a href="./memberlist.php...">AUTHOR</a></strong>
           &raquo; DAY MON DD, YEAR H:MM PM </div>
          <div class="content">BODY</div>

        topic -- topic object the posts belong to; its ``firstpost``
                 attribute is set to the first post id seen when it is
                 still unset (falsy).
        html  -- markup of the topic page, decoded as UTF-8.

        Every div.post yields one ptforum.Post.  The text of the first
        <h2> on the page is used as the subject of every post.  Author,
        datetime and body stay None when the corresponding element is
        missing; a warning is logged in that case.

        Returns the list of ptforum.Post objects in page order.
        '''
        posts = []
        doc = soup.BeautifulSoup(html, fromEncoding='utf-8', convertEntities='html')
        title = soup.cdata(doc.find('h2'))
        _logger.info('TITLE %s', title)
        for postdiv in doc.findAll(soup.tagclass('div', 'post')):
            author = posted = body = None
            # E.g. 'p12345' -> '12345' (lstrip removes every leading 'p').
            pid = postdiv['id'].lstrip('p')

            authdiv = postdiv.find(soup.tagclass('div', 'author'))
            if not authdiv:
                _logger.warning('** No div.author')
            else:
                a = authdiv.find('a')
                if not a:
                    _logger.warning('** no a in div.author')
                else:
                    author = soup.cdata(a)
                    _logger.info('AUTHOR %s', author)
                # The date is parsed from the text of the whole author div.
                posted = fixdate(soup.cdata(authdiv))
                _logger.info('DATETIME %s', posted)

            contdiv = postdiv.find(soup.tagclass('div', 'content'))
            if not contdiv:
                _logger.warning('** no div.content')
            else:
                body = ''.join(map(unicode, contdiv.contents))

            posts.append(ptforum.Post(pid=pid, topic=topic,
                                      author=author,
                                      datetime=posted,
                                      subject=title,
                                      body=body))

            if not topic.firstpost:
                topic.firstpost = pid
        return posts
Beispiel #6
0
    def forum_page_topics(self, forum, html):
        '''Find all the topics on a forum page.

        Page structure:
          <ul class="topiclist topics">
           <li class="row bg1">
            <dl class="row-fluid">
             <dt ...>
              <a...>
              <a class="topictitle" href="./viewtopic.php?f=35&t=10369">@title</a>
             <dd ...>
              7 <dfn>Replies</dfn>
             <dd ...>
              <a href="./memberlist.php...">@USER</a>
             <dd ...>
              <dfn>Last post</dfn> by <a href="./memberlist.php..">@LASTUSER</a>
              <br>
              <a href="./viewtopic.php?f=35&t=10396&p=117246#p117246"><time...></a>
             ...
           <li>...
           ...
         <div class="paging ...">
          <span>
           <a href="./viewforum.php?f=35&start=25">2</a>
        N.B. There may be more than one <ul class="topiclist topics">

        forum -- object whose ``topic_find(tid)`` supplies the topic
                 object to create or update for a given topic id.
        html  -- markup of the forum page.

        An <li> is skipped when it has no <dl>/<dt>, or when no topic id
        can be extracted from its a.topictitle link, so ``topic_find`` is
        never called with a missing id.  The reply count is kept as text.

        Returns the list of topic objects in page order.
        '''
        topics = []
        doc = soup.BeautifulSoup(html, convertEntities='html')
        for index in doc.findAll(soup.tagclass('ul', 'topics')):
            for li in index.findAll('li'):
                # .get(): an <li> without a class must not abort the page
                # just to produce a debug line.
                _logger.debug('-- li %s', li.get('class'))
                dl = li.find('dl')
                dt = dl.find('dt') if dl is not None else None
                if dt is None:
                    # findAll('li') also returns nested <li> elements;
                    # those without a <dl>/<dt> are not topic rows.
                    continue

                title = tid = replies = author = lastpost = None
                topica = dt.find(soup.tagclass('a', 'topictitle'))
                if topica is not None:
                    title = soup.cdata(topica)
                    _logger.info('TITLE %s', title)
                    m = t_nRE.search(topica.get('href', ''))
                    if m:
                        tid = m.group(1)
                        _logger.info('TID %s', tid)
                    else:
                        _logger.warning('** no tid')
                if tid is None:
                    continue

                for dd in dl.findAll('dd'):
                    dfn = dd.find('dfn')
                    if dfn is None:
                        # An unlabelled <dd> holds the author link, if any.
                        a = dd.find('a')
                        if a is None:
                            continue
                        href = a.get('href', '')
                        if 'memberlist.php' in href.split('?')[0]:
                            author = soup.cdata(a)
                            _logger.info('AUTHOR %s', author)
                        continue

                    text = soup.cdata(dfn)
                    if 'Replies' in text:
                        replies = soup.cdata(dd).split()[0]
                        _logger.info('REPLIES %s', replies)
                    elif 'Last post' in text:
                        _logger.debug('-- Last post')
                        # NOTE(review): lastpost is only logged, never
                        # stored on the topic -- confirm this is intended.
                        for a in dd.findAll('a'):
                            href = a.get('href', '')
                            if 'viewtopic.php' in href.split('?')[0]:
                                m = t_nRE.search(href)
                                if m:
                                    lastpost = m.group(1)
                                    _logger.info('LASTPOST %s', lastpost)
                                else:
                                    _logger.warning('** no lastpost')

                # Create or update topic
                topic = forum.topic_find(tid)
                topic.title = title
                topic.author = author
                topic.replies = replies

                topics.append(topic)
        return topics