コード例 #1
0
    def parseTopic(self, response):
        """Build a TopicItem from a topic page response.

        Fills these item fields:
          * ``topic``     -- all text under the topic-content paragraphs, joined.
          * ``groupName`` -- first group title link text, or '' if none is found.
          * ``reply``     -- every comment flattened into one string; each
            comment is encoded as ``speaker:sentence#`` or, when it quotes
            another reply, ``speaker:quote@p sentence#``.
          * ``topicUrl``  -- the URL of the response.

        Returns the populated item.
        """
        self.log('in parseTopic', logging.DEBUG)
        hxs = Selector(response)
        item = TopicItem()

        # get topic content and group name
        item['topic'] = ''.join(hxs.xpath('//div[@class="topic-content"]/p//text()').extract())
        item['groupName'] = list_first_item(hxs.xpath('//div[contains(@class, "group-item")]//div[@class="title"]/a/text()').extract()) or ''

        # Collect one encoded fragment per comment and join once at the end
        # instead of growing a string with += inside the loop.
        replies = []
        for comment in hxs.xpath('//li[contains(@class, "comment-item")]'):
            # if there is a reply quote, gather its short text and pubdate link
            quote = ''
            if comment.xpath('.//div[@class="reply-quote"]'):
                quote = ''.join(comment.xpath('.//div[@class="reply-quote"]/span[@class="short"]//text()').extract())
                # The pubdate link may be absent. list_first_item presumably
                # returns None for an empty list (the other call sites all
                # guard with `or ''`), so fall back to '' here as well to
                # avoid concatenating None onto a string.
                quote += list_first_item(comment.xpath('.//div[@class="reply-quote"]/span[@class="pubdate"]/a/text()').extract()) or ''

            speaker = list_first_item(comment.xpath('div[@class="reply-doc content"]//h4/a/text()').extract()) or ''
            sentence = ''.join(comment.xpath('div[@class="reply-doc content"]//p//text()').extract())
            if quote:
                replies.append(speaker + ':' + quote + '@p ' + sentence + '#')
            else:
                replies.append(speaker + ':' + sentence + '#')

        item['reply'] = ''.join(replies)
        item['topicUrl'] = response.url
        return item
コード例 #2
0
    def parseGroup(self, response):
        """Parse a group page and queue its topic and related-group pages.

        Generator callback. For each topic link found in the group's topic
        table it yields a Request whose callback is ``parseTopic``; afterwards
        it yields a Request (no explicit callback) for each related-group
        link. Every URL handled is also recorded through ``self.addURL2Log``.

        NOTE(review): a GroupItem is populated here (``groupName``,
        ``groupURL``, ``relativeGroups``) but is not yielded within this part
        of the method -- confirm whether it is meant to be emitted.
        """
        self.log('in parseGroup', logging.DEBUG)
        hxs = Selector(response)
        item = GroupItem()

        # get group name: take the <h1> text and capture what lies between the
        # leading and trailing whitespace. The pattern is a raw string so that
        # `\s` is not treated as an (invalid) Python string escape.
        item['groupName'] = list_first_item(hxs.xpath('//h1/text()').re(r"^\s+(.*)\s+$"))

        # get group url and add to log file
        item['groupURL'] = response.url
        self.addURL2Log(response.url, 'groupURL')

        # get topic links and schedule each one for parseTopic
        topics = hxs.xpath('//div[@id="group-topics"]//td[@class="title"]')
        for topic in topics:
            topicUrl = list_first_item(topic.xpath('a/@href').extract())
            if topicUrl:
                self.addURL2Log(topicUrl, 'topics in group: %s' % item['groupName'])
                yield Request(topicUrl, callback=self.parseTopic)
                # NOTE(review): time.sleep blocks the calling thread; Scrapy's
                # DOWNLOAD_DELAY setting is the usual way to throttle -- kept
                # as-is here so crawl pacing does not change.
                time.sleep(0.1)

        time.sleep(2)
        # get related groups: remember their URLs on the item and crawl them
        item['relativeGroups'] = []
        groups = hxs.xpath('//div[contains(@class, "group-list-item") or contains(@class, "group-item")]')
        for group in groups:
            url = list_first_item(group.xpath('div[contains(@class, "title")]/a/@href').extract())
            if url:
                item['relativeGroups'].append(url)
                self.addURL2Log(url, 'relativeGroups')
                yield Request(url)