def get_title(self):
    """Extract a title for the page from its HTML.

    A site-specific pattern is chosen by matching ``self.url`` against a
    table of URL regexes; the first entry that matches wins, and the
    final entry (empty pattern) matches every URL and selects the
    generic ``<title>`` element.

    Returns the captured title after stripping whitespace and passing it
    through ``unescape_XML``, ``sentence_case`` and
    ``smart_punctuation_to_ascii`` (helpers defined elsewhere), or the
    string ``"UNKNOWN TITLE"`` when ``self.html_u`` is empty or the
    selected pattern does not match.
    """
    # (URL regex, title regex) pairs, tried in order.
    title_regexps = (
        ('http://lists.w3.org/.*', u'<!-- subject="(.*?)" -->'),
        ('http://lists.kde.org/.*', u"<title>MARC: msg '(.*?)'</title>"),
        ('', u'<title>(.*?)</title>'),  # default: make sure last
    )
    # The URL patterns are regexes (note the trailing ".*"), so they must
    # be tested with re.match; str.startswith would compare them literally
    # and only the empty default would ever be selected.
    for prefix, regexp in title_regexps:
        if re.match(prefix, self.url):
            break

    title = "UNKNOWN TITLE"
    if self.html_u:
        # DOTALL lets a title span line breaks; IGNORECASE accepts <TITLE>.
        tmatch = re.search(regexp, self.html_u, re.DOTALL | re.IGNORECASE)
        if tmatch:
            title = tmatch.group(1).strip()
            title = unescape_XML(title)
            title = sentence_case(title)
            title = smart_punctuation_to_ascii(title)
    return title
def get_permalink(self):
    """Return a permanent link for the page, or ``self.url`` if none is found.

    The link target is read from the ``href`` of the anchor inside the
    ``<li id="t-permalink">`` element of ``self.html_u`` (presumably the
    MediaWiki toolbox "permanent link" item -- confirm against the pages
    this is used on).  That href is appended to the part of ``self.url``
    preceding the first ``/wiki/`` and the result is passed through
    ``unescape_XML`` (defined elsewhere).

    When ``self.html_u`` is empty or contains no such element, the
    original ``self.url`` is returned unchanged instead of raising.
    """
    if not self.html_u:
        return self.url

    link = re.search('''<li id="t-permalink"><a href="(.*?)"''', self.html_u)
    if link is None:
        # No permalink item on this page; the plain URL is the best we have.
        return self.url

    site_root = self.url.split('/wiki/')[0]
    return unescape_XML(site_root + link.group(1))