Esempio n. 1
0
    def get_title(self):
        """Return a cleaned-up title extracted from the page HTML.

        A title pattern is chosen by the first entry in ``title_regexps``
        whose prefix ``self.url`` starts with; the final entry has an empty
        prefix and therefore acts as the catch-all ``<title>`` pattern.

        Returns "UNKNOWN TITLE" when ``self.html_u`` is empty/None or the
        chosen pattern does not match. Otherwise the captured text is
        stripped and passed through ``unescape_XML``, ``sentence_case`` and
        ``smart_punctuation_to_ascii`` (helpers defined elsewhere).
        """
        # (URL prefix, title regexp). Prefixes are plain strings compared with
        # str.startswith, so they must NOT contain regexp syntax such as '.*'.
        title_regexps = (
            ('http://lists.w3.org/', r'<!-- subject="(.*?)" -->'),
            ('http://lists.kde.org/', r"<title>MARC: msg '(.*?)'</title>"),
            ('', r'<title>(.*?)</title>'),    # default: make sure last
        )

        # The empty default prefix matches every URL, so next() always
        # yields a pattern.
        regexp = next(
            pattern
            for prefix, pattern in title_regexps
            if self.url.startswith(prefix)
        )

        title = "UNKNOWN TITLE"
        if self.html_u:
            tmatch = re.search(regexp, self.html_u, re.DOTALL | re.IGNORECASE)
            if tmatch:
                title = tmatch.group(1).strip()
                title = unescape_XML(title)
                title = sentence_case(title)
                title = smart_punctuation_to_ascii(title)
        return title
Esempio n. 2
0
 def get_permalink(self):
     """Return the page's permalink built from its "t-permalink" list item.

     The href captured from ``<li id="t-permalink"><a href="...">`` in
     ``self.html_u`` is appended to the part of ``self.url`` that precedes
     ``/wiki/`` and the result is passed through ``unescape_XML`` (defined
     elsewhere).

     Falls back to ``self.url`` unchanged when ``self.html_u`` is empty/None
     or contains no permalink item, instead of failing on a ``None`` match.
     """
     if not self.html_u:
         return self.url

     link = re.search('''<li id="t-permalink"><a href="(.*?)"''', self.html_u)
     if link is None:
         # No permalink item on this page; the URL is the best link we have.
         return self.url

     site_root = self.url.split('/wiki/')[0]
     return unescape_XML(site_root + link.group(1))