def file2dict(fn, comments=True, body=True):
    """Load the post named *fn* from disk into a ``Storage`` object.

    The post is read from ``_<fn>.txt``.  Its first line, minus the first two
    characters (presumably a Markdown heading marker such as ``# `` -- confirm
    against the post format), is rendered with ``markdown`` and stripped of
    ``<p>``/``</p>`` tags to form the title.  The second line is skipped and
    the rest of the file is the body.

    The post's timestamp is cached in ``.<fn>.ctime``.  When that file does
    not exist yet it is created from the last field of ``os.stat()`` on the
    post file.

    Args:
        fn: Base name of the post (without the leading ``_`` or ``.txt``).
        comments: When true, load ``_<fn>.comments.<i>.txt`` for
            i = 1, 2, ... up to the first missing file into ``s.comments``.
            Empty files and comments flagged ``moderate`` are skipped, but
            still consume their number.
        body: When true, set ``s.body`` (rendered Markdown) and
            ``s.shortbody`` (first 150 characters of ``striphtml(s.body)``).

    Returns:
        A ``Storage`` with ``file``, ``title``, ``date``, ``isodate`` and,
        depending on the flags, ``body``, ``shortbody`` and ``comments``.
    """
    post_path = '_' + fn + '.txt'
    ctime_path = '.' + fn + '.ctime'

    s = Storage()
    s.file = fn

    # Context managers guarantee the handle is closed even if markdown()
    # raises; the original left every file open until garbage collection.
    with open(post_path) as fh:
        s.title = markdown(fh.readline()[2:].strip()).strip().replace(
            '<p>', '').replace('</p>', '')
        fh.readline()  # skip the line between the title and the body
        if body:
            s.body = markdown(fh.read())
            s.shortbody = striphtml(s.body)[:150]

    if not os.path.exists(ctime_path):
        # Closing before the read below ensures the stamp is flushed to disk
        # on interpreters that do not finalise file objects immediately.
        with open(ctime_path, 'w') as ctime_file:
            ctime_file.write(str(os.stat(post_path)[-1]))
    with open(ctime_path) as ctime_file:
        stamp = ctime_file.read()
    s.date = cleandate(stamp)
    s.isodate = isodate(stamp)

    if comments:
        s.comments = []
        i = 0
        while True:
            i += 1
            furl = '_' + fn + '.comments.' + str(i) + '.txt'
            if not os.path.exists(furl):
                break

            with open(furl) as comment_file:
                is_empty = not comment_file.read()
            if is_empty:
                continue

            # SECURITY NOTE: pickle.load() can execute arbitrary code.  This
            # is only safe while the comment files are written exclusively
            # by trusted code, never taken verbatim from user input.
            with open(furl) as comment_file:
                comment = Storage(pickle.load(comment_file))
            if comment.__dict__.get('moderate'):
                continue

            comment.date = cleandate(comment.date)
            comment.id = i
            comment.content = sanitize.HTML(markdown(comment.content),
                                            addnofollow=True)
            s.comments.append(comment)
    return s
def createItem(self, feed, entry, encoding):
    """Build the dictionary of template values for one feed entry.

    Title and link values are read from ``feed`` and ``entry`` and encoded
    with ``encoding``.  The content comes from ``entry.content`` when that
    key is present (HTML parts sanitized, plain-text parts escaped, other
    types dropped), otherwise from ``entry.summary``, otherwise it is the
    empty string.

    The date keys (``date_822``, ``date``) are only set when the entry has
    an ``updated`` attribute, and the author keys (``author_name``,
    ``author``) only when it has an ``author`` attribute.
    ``author_email`` is always ``False``.
    """
    channel_title = feed.title.encode(encoding)
    entry_title = entry.title.encode(encoding)
    entry_link = entry.link.encode(encoding)

    # NOTE(review): 'channel_link' is filled from feed.title rather than a
    # link -- this looks unintended; confirm before changing it.
    item = {
        'channel_name': channel_title,
        'title': entry_title,
        'title_plain': entry_title,
        'id': entry_link,
        'link': entry_link,
        'channel_link': channel_title,
        'channel_title_name': channel_title,
    }

    text = ''
    if entry.has_key('content'):
        for part in entry.content:
            if part.type == 'text/html':
                text += sanitize.HTML(part.value)
            elif part.type == 'text/plain':
                text += cgi.escape(part.value)
    elif entry.has_key('summary'):
        text = entry.summary.encode(encoding)
    item['content'] = text

    # TODO: no fallback date is provided when the entry lacks 'updated'.
    if hasattr(entry, 'updated'):
        stamp = entry.updated.encode(encoding)
        item['date_822'] = stamp
        item['date'] = stamp

    item['author_email'] = False

    # TODO: no fallback author is provided when the entry lacks 'author'.
    if hasattr(entry, 'author'):
        author = entry.author.encode(encoding)
        item['author_name'] = author
        item['author'] = author

    return item
def update_info(self, feed):
    """Update the cached information from a parsed feed.

    Walks every key of ``feed`` (the feed information supplied by
    feedparser) and stores the interesting ones through
    ``set_as_string`` / ``set_as_date``:

    * keys listed in ``IGNORE_KEYS`` (directly or via their ``_parsed``
      form), keys that have a ``_parsed`` sibling, and ``items`` are
      skipped;
    * ``*_detail`` keys contribute their non-empty ``name`` and ``email``;
    * ``*_parsed`` keys are stored as dates under the unsuffixed name;
    * ``image`` contributes url, link, title, width and height;
    * string values are stored as-is, after sanitizing or escaping when
      the matching ``*_detail`` declares an HTML or plain-text type.
    """
    for key in feed.keys():
        parsed_key = key + "_parsed"

        # Ignored fields
        if key in self.IGNORE_KEYS or parsed_key in self.IGNORE_KEYS:
            continue

        # Unparsed date fields: the parsed sibling is used instead
        if feed.has_key(parsed_key):
            continue

        if key.endswith("_detail"):
            # Retain the name and email sub-fields
            detail = feed[key]
            if detail.has_key('name') and detail.name:
                self.set_as_string(key.replace("_detail", "_name"),
                                   detail.name)
            if detail.has_key('email') and detail.email:
                self.set_as_string(key.replace("_detail", "_email"),
                                   detail.email)
            continue

        # The items field is not part of the feed information
        if key == "items":
            continue

        if key.endswith("_parsed"):
            # Date fields
            if feed[key] is not None:
                self.set_as_date(key[:-len("_parsed")], feed[key])
            continue

        if key == "image":
            # Image field: save all the information
            image = feed[key]
            for field in ("url", "link", "title"):
                if image.has_key(field):
                    self.set_as_string(key + "_" + field,
                                       getattr(image, field))
            for field in ("width", "height"):
                if image.has_key(field):
                    self.set_as_string(key + "_" + field,
                                       str(getattr(image, field)))
            continue

        if not isinstance(feed[key], (str, unicode)):
            continue

        # String fields
        try:
            detail_key = key + '_detail'
            if feed.has_key(detail_key) and feed[detail_key].has_key('type'):
                content_type = feed[detail_key].type
                if content_type == 'text/html':
                    feed[key] = sanitize.HTML(feed[key])
                elif content_type == 'text/plain':
                    feed[key] = escape(feed[key])
            self.set_as_string(key, feed[key])
        except KeyboardInterrupt:
            raise
        except:
            # Best effort: a single malformed field must not abort the update
            log.exception("Ignored '%s' of <%s>, unknown format",
                          key, self.url)
def _html(self, html_source, expected_data, base_uri=None,
          add_nofollow=False):
    """
    Assert that sanitizing ``html_source`` produces ``expected_data``.

    The source is passed to ``sanitize.HTML`` together with ``base_uri``
    and ``add_nofollow``; the result is compared with ``assertEqual``.

    :type html_source: str
    :type expected_data: str
    :type base_uri: str
    :type add_nofollow: bool
    """
    actual = sanitize.HTML(
        htmlSource=html_source,
        baseuri=base_uri,
        addnofollow=add_nofollow,
    )
    self.assertEqual(actual, expected_data)
def update(self, entry):
    """Update the item from the feedparser entry given.

    Walks every key of ``entry`` and stores the interesting ones through
    ``set_as_string`` / ``set_as_date``:

    * keys listed in ``IGNORE_KEYS`` (directly or via their ``_parsed``
      form) and keys that have a ``_parsed`` sibling are skipped;
    * ``*_detail`` keys contribute their non-empty ``name``, ``email`` and
      -- when it differs from the channel's -- ``language``;
    * ``*_parsed`` keys are stored as dates under the unsuffixed name;
    * ``source`` contributes its value (as ``source_name``) and url
      (as ``source_link``);
    * ``content`` parts are sanitized or escaped by type and concatenated;
    * string values are stored as-is, after sanitizing or escaping when
      the matching ``*_detail`` declares an HTML or plain-text type.

    Finally ``get_date("date")`` is called so the date field is generated
    if needed.
    """
    for key in entry.keys():
        parsed_key = key + "_parsed"

        # Ignored fields
        if key in self.IGNORE_KEYS or parsed_key in self.IGNORE_KEYS:
            continue

        # Unparsed date fields: the parsed sibling is used instead
        if entry.has_key(parsed_key):
            continue

        if key.endswith("_detail"):
            # Retain the name, email, and language sub-fields
            detail = entry[key]
            if detail.has_key('name') and detail.name:
                self.set_as_string(key.replace("_detail", "_name"),
                                   detail.name)
            if detail.has_key('email') and detail.email:
                self.set_as_string(key.replace("_detail", "_email"),
                                   detail.email)
            if detail.has_key('language') and detail.language:
                # Only record a language that differs from the channel's
                channel = self._channel
                if (not channel.has_key('language')
                        or detail.language != channel.language):
                    self.set_as_string(key.replace("_detail", "_language"),
                                       detail.language)
            continue

        if key.endswith("_parsed"):
            # Date fields
            if entry[key] is not None:
                self.set_as_date(key[:-len("_parsed")], entry[key])
            continue

        if key == "source":
            # Source field: save both url and value
            source = entry[key]
            if source.has_key("value"):
                self.set_as_string(key + "_name", source.value)
            if source.has_key("url"):
                self.set_as_string(key + "_link", source.url)
            continue

        if key == "content":
            # Content field: concatenate the values
            combined = ""
            for part in entry[key]:
                if part.type == 'text/html':
                    part.value = sanitize.HTML(part.value)
                elif part.type == 'text/plain':
                    part.value = escape(part.value)

                if part.has_key('language') and part.language:
                    channel = self._channel
                    if (not channel.has_key('language')
                            or part.language != channel.language):
                        self.set_as_string(key + "_language", part.language)

                combined += cache.utf8(part.value)
            self.set_as_string(key, combined)
            continue

        if not isinstance(entry[key], (str, unicode)):
            continue

        # String fields
        try:
            detail_key = key + '_detail'
            if entry.has_key(detail_key):
                detail = entry[detail_key]
                if detail.has_key('type'):
                    if detail.type == 'text/html':
                        entry[key] = sanitize.HTML(entry[key])
                    elif detail.type == 'text/plain':
                        entry[key] = escape(entry[key])
            self.set_as_string(key, entry[key])
        except KeyboardInterrupt:
            raise
        except:
            # Best effort: a single malformed field must not abort the update
            log.exception("Ignored '%s' of <%s>, unknown format",
                          key, self.id)

    # Generate the date field if we need to
    self.get_date("date")