Beispiel #1
0
def file2dict(fn, comments=True, body=True):
    """Load the post stored under ``fn`` and return it as a ``Storage``.

    The post text is read from ``'_' + fn + '.txt'``: the first line, minus
    its first two characters, is rendered with ``markdown`` and stripped of
    ``<p>``/``</p>`` to form the title; the second line is skipped; the rest
    of the file is the body.  The post's timestamp is cached in
    ``'.' + fn + '.ctime'``, which is created from the last item of the
    ``os.stat()`` tuple of the post file the first time it is needed.

    :param fn: base name of the post (without the ``_`` prefix or ``.txt``).
    :param comments: when true, load ``_<fn>.comments.<i>.txt`` for
        i = 1, 2, ... up to the first missing file.  Empty files and
        comments whose ``__dict__`` carries a truthy ``moderate`` entry are
        skipped.
    :param body: when true, also set ``body`` and ``shortbody`` (the first
        150 characters of the HTML-stripped body).
    :returns: a ``Storage`` with ``file``, ``title``, ``date``, ``isodate``
        and, depending on the flags, ``body``, ``shortbody``, ``comments``.
    """
    s = Storage()
    s.file = fn
    post_path = '_' + fn + '.txt'
    ctime_path = '.' + fn + '.ctime'

    # Context managers make sure every handle is closed, even on error.
    with open(post_path) as fh:
        raw_title = fh.readline()[2:].strip()
        s.title = markdown(raw_title).strip().replace(
            '<p>', '').replace('</p>', '')
        fh.readline()  # skip the line that follows the title
        if body:
            s.body = markdown(fh.read())
            s.shortbody = striphtml(s.body)[:150]

    if not os.path.exists(ctime_path):
        with open(ctime_path, 'w') as out:
            out.write(str(os.stat(post_path)[-1]))
    # Read the cached stamp once and derive both representations from it.
    with open(ctime_path) as fh:
        stamp = fh.read()
    s.date = cleandate(stamp)
    s.isodate = isodate(stamp)

    if comments:
        s.comments = []
        i = 0
        while True:
            i += 1
            furl = '_' + fn + '.comments.' + str(i) + '.txt'
            if not os.path.exists(furl):
                break
            with open(furl) as cfh:
                if not cfh.read():
                    # Empty placeholder file: keep numbering, skip comment.
                    continue
                cfh.seek(0)
                # NOTE(review): pickle.load executes whatever the file
                # encodes; this is only safe if comment files are written
                # exclusively by trusted code -- confirm.
                comment = Storage(pickle.load(cfh))
            if comment.__dict__.get('moderate'):
                continue
            comment.date = cleandate(comment.date)
            comment.id = i
            comment.content = sanitize.HTML(markdown(comment.content),
                                            addnofollow=True)
            s.comments.append(comment)
    return s
Beispiel #2
0
 def createItem(self, feed, entry, encoding):
     """Build a flat item dictionary from a feed and one of its entries.

     Titles, links, the summary, the update stamp and the author are
     encoded with ``encoding``.  The content is taken from
     ``entry.content`` when present (HTML parts are passed through
     ``sanitize.HTML``, plain-text parts through ``cgi.escape``, other
     types are dropped), else from ``entry.summary``, else it is ``''``.
     The date and author keys are only set when the entry has the
     ``updated`` / ``author`` attribute.
     """
     feed_title = feed.title.encode(encoding)
     entry_title = entry.title.encode(encoding)
     entry_link = entry.link.encode(encoding)

     item = {
         'channel_name': feed_title,
         'title': entry_title,
         'title_plain': entry_title,
         'id': entry_link,
         'link': entry_link,
         # NOTE(review): channel_link is filled from feed.title, exactly as
         # before; confirm whether the feed's link was intended.
         'channel_link': feed_title,
         'channel_title_name': feed_title,
     }

     content = ''
     if entry.has_key('content'):
         for part in entry.content:
             kind = part.type
             if kind == 'text/html':
                 content += sanitize.HTML(part.value)
             elif kind == 'text/plain':
                 content += cgi.escape(part.value)
     elif entry.has_key('summary'):
         content = entry.summary.encode(encoding)
     item['content'] = content

     # TODO: entries without an 'updated' attribute get no date keys.
     if hasattr(entry, 'updated'):
         stamp = entry.updated.encode(encoding)
         item['date_822'] = stamp
         item['date'] = stamp

     item['author_email'] = False
     # TODO: entries without an 'author' attribute get no author keys.
     if hasattr(entry, 'author'):
         author = entry.author.encode(encoding)
         item['author_name'] = author
         item['author'] = author

     return item
Beispiel #3
0
    def update_info(self, feed):
        """Copy the interesting properties of a parsed feed into the cache.

        Walks every key of ``feed`` and stores it through
        ``set_as_string`` / ``set_as_date``:

        * keys listed in ``IGNORE_KEYS`` (directly or as ``<key>_parsed``),
          keys that have a ``<key>_parsed`` sibling, and ``items`` are
          skipped;
        * ``<x>_detail`` contributes ``<x>_name`` and ``<x>_email`` when
          those sub-fields are present and non-empty;
        * ``<x>_parsed`` is stored as the date ``<x>`` unless it is None;
        * ``image`` contributes ``image_url``, ``image_link``,
          ``image_title``, ``image_width`` and ``image_height``;
        * any other string value is stored as-is, after being sanitized or
          escaped (and written back into ``feed``) when its ``_detail``
          entry declares ``text/html`` or ``text/plain``.
        """
        # .keys() is kept on purpose: values may be reassigned while looping.
        for key in feed.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                continue  # explicitly ignored field
            if feed.has_key(key + "_parsed"):
                continue  # unparsed date; the parsed twin is used instead

            if key.endswith("_detail"):
                # Retain only the name and email sub-fields.
                detail = feed[key]
                for field in ("name", "email"):
                    if detail.has_key(field) and getattr(detail, field):
                        self.set_as_string(
                            key.replace("_detail", "_" + field),
                            getattr(detail, field))
            elif key == "items":
                continue  # the items list is not stored here
            elif key.endswith("_parsed"):
                parsed = feed[key]
                if parsed is not None:
                    self.set_as_date(key[:-len("_parsed")], parsed)
            elif key == "image":
                # Keep everything known about the image.
                image = feed[key]
                for field in ("url", "link", "title"):
                    if image.has_key(field):
                        self.set_as_string(key + "_" + field,
                                           getattr(image, field))
                for field in ("width", "height"):
                    if image.has_key(field):
                        self.set_as_string(key + "_" + field,
                                           str(getattr(image, field)))
            elif isinstance(feed[key], (str, unicode)):
                # Deliberately broad handler: any failure other than Ctrl-C
                # is logged and the field is skipped.
                try:
                    detail_key = key + '_detail'
                    if feed.has_key(detail_key) and \
                       feed[detail_key].has_key('type'):
                        declared = feed[detail_key].type
                        if declared == 'text/html':
                            feed[key] = sanitize.HTML(feed[key])
                        elif declared == 'text/plain':
                            feed[key] = escape(feed[key])
                    self.set_as_string(key, feed[key])
                except KeyboardInterrupt:
                    raise
                except:
                    log.exception("Ignored '%s' of <%s>, unknown format", key,
                                  self.url)
 def _html(self,
           html_source,
           expected_data,
           base_uri=None,
           add_nofollow=False):
     """
     Assert that ``sanitize.HTML`` turns ``html_source`` into
     ``expected_data`` for the given base URI and nofollow setting.

     :type html_source: str
     :type expected_data: str
     :type base_uri: str
     :type add_nofollow: bool
     """
     sanitized = sanitize.HTML(htmlSource=html_source,
                               baseuri=base_uri,
                               addnofollow=add_nofollow)
     self.assertEqual(sanitized, expected_data)
Beispiel #5
0
    def update(self, entry):
        """Refresh this item from the given feedparser entry.

        Every key of ``entry`` is stored through ``set_as_string`` /
        ``set_as_date``:

        * keys listed in ``IGNORE_KEYS`` (directly or as ``<key>_parsed``)
          and keys that have a ``<key>_parsed`` sibling are skipped;
        * ``<x>_detail`` contributes ``<x>_name``, ``<x>_email`` and, when
          it differs from the channel's language, ``<x>_language``;
        * ``<x>_parsed`` is stored as the date ``<x>`` unless it is None;
        * ``source`` contributes ``source_name`` and ``source_link``;
        * ``content`` parts are sanitized/escaped in place by type and
          concatenated into one string;
        * any other string value is stored after being sanitized or escaped
          (and written back into ``entry``) according to its ``_detail``
          type.

        ``self.get_date("date")`` is called last.
        """
        def differs_from_channel(language):
            # True when the channel has no language or a different one.
            # self._channel is looked up lazily, only when a language exists.
            return (not self._channel.has_key('language') or
                    language != self._channel.language)

        # .keys() is kept on purpose: values may be reassigned while looping.
        for key in entry.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                continue  # explicitly ignored field
            if entry.has_key(key + "_parsed"):
                continue  # unparsed date; the parsed twin is used instead

            if key.endswith("_detail"):
                # Retain the name, email and language sub-fields.
                detail = entry[key]
                for field in ("name", "email"):
                    if detail.has_key(field) and getattr(detail, field):
                        self.set_as_string(
                            key.replace("_detail", "_" + field),
                            getattr(detail, field))
                if detail.has_key('language') and detail.language and \
                   differs_from_channel(detail.language):
                    self.set_as_string(key.replace("_detail", "_language"),
                                       detail.language)
            elif key.endswith("_parsed"):
                parsed = entry[key]
                if parsed is not None:
                    self.set_as_date(key[:-len("_parsed")], parsed)
            elif key == "source":
                # Keep both the value and the url of the source.
                source = entry[key]
                if source.has_key("value"):
                    self.set_as_string(key + "_name", source.value)
                if source.has_key("url"):
                    self.set_as_string(key + "_link", source.url)
            elif key == "content":
                # Concatenate the cleaned value of every content part.
                combined = ""
                for part in entry[key]:
                    if part.type == 'text/html':
                        part.value = sanitize.HTML(part.value)
                    elif part.type == 'text/plain':
                        part.value = escape(part.value)
                    if part.has_key('language') and part.language and \
                       differs_from_channel(part.language):
                        self.set_as_string(key + "_language", part.language)
                    combined += cache.utf8(part.value)
                self.set_as_string(key, combined)
            elif isinstance(entry[key], (str, unicode)):
                # Deliberately broad handler: any failure other than Ctrl-C
                # is logged and the field is skipped.
                try:
                    detail_key = key + '_detail'
                    if entry.has_key(detail_key) and \
                       entry[detail_key].has_key('type'):
                        declared = entry[detail_key].type
                        if declared == 'text/html':
                            entry[key] = sanitize.HTML(entry[key])
                        elif declared == 'text/plain':
                            entry[key] = escape(entry[key])
                    self.set_as_string(key, entry[key])
                except KeyboardInterrupt:
                    raise
                except:
                    log.exception("Ignored '%s' of <%s>, unknown format", key,
                                  self.id)

        # Generate the date field if we need to
        self.get_date("date")