Example #1
    def _retrieveSingleFeed(self, feedContainer, url):
        # feedparser doesn't understand proper file: URLs
        if url.startswith('file://'):
            url = url[7:]
            if not os.path.exists(url):
                raise IOError("Couldn't locate %r" % url)
        # urllib does not support the 'feed' scheme -- replace with 'http'
        if url.startswith('feed://'):
            url = url.replace('feed://', 'http://', 1)
        portal_transforms = getToolByName(feedContainer, 'portal_transforms')
        parsed = feedparser.parse(url)
        for entry in parsed.entries:
            id = get_uid_from_entry(entry)
            if not id:
                logger.warn("Ignored unidentifiable entry without id or link.")
                continue
            updated = entry.get('updated')
            published = entry.get('published')

            if not updated:
                # property may be blank if item has never
                # been updated -- use published date
                updated = published

            if updated:
                try:
                    updated = extendedDateTime(updated)
                except DateTime.SyntaxError:
                    logger.warn("SyntaxError while parsing %r as DateTime for "
                                "the 'updated' field of entry %s",
                                updated, getattr(entry, 'title', ''))
                    continue

            prev = feedContainer.getItem(id)
            if prev is None:
                # Completely new item, add it.
                addItem = feedContainer.addItem
            elif updated is None:
                logger.warn("No updated or published date known. "
                            "Not updating previously added entry.")
                continue
            elif updated > prev.getFeedItemUpdated():
                # Refreshed item, replace it.
                addItem = feedContainer.replaceItem
            else:
                # Not new, not refreshed: let it be, laddy.  Still,
                # the entry might have changed slightly, so we check
                # this.
                if prev.getObjectInfo() != entry:
                    # Note: no need for a reindexObject here, which
                    # would also update the modification date, which
                    # we do not want.  See
                    # http://plone.org/products/feedfeeder/issues/34
                    prev.setObjectInfo(entry.copy())
                continue

            obj = addItem(id)

            linkDict = getattr(entry, 'link', None)
            if linkDict:
                # Hey, that's not a dict at all; at least not in my test.
                #link = linkDict['href']
                link = linkDict
            else:
                linkDict = getattr(entry, 'links', [{'href': ''}])[0]
                link = linkDict['href']

            if not updated:
                updated = DateTime()
            if published is not None:
                try:
                    published = extendedDateTime(published)
                except DateTime.SyntaxError:
                    logger.warn("SyntaxError while parsing %r as DateTime for "
                                "the 'published' field of entry %s",
                                published, getattr(entry, 'title', ''))
                    continue
                obj.setEffectiveDate(published)

            summary = getattr(entry, 'summary', '')
            logger.debug("1 summary: %r" % summary)
            summary = convert_summary(summary)
            logger.debug("2 summary: %r" % summary)

            obj.update(id=id,
                       title=getattr(entry, 'title', ''),
                       description=summary,
                       feedItemAuthor=getattr(entry, 'author', ''),
                       feedItemUpdated=updated,
                       link=link,
                       feedTitle=parsed['feed'].get('title', ''),
                       objectInfo=entry.copy(),
                       )
            # Tags cannot be handled by the update method, AFAIK,
            # because feed_tags is not an Archetypes field.
            feed_tags = [x.get('term') for x in entry.get('tags', [])]
            obj.feed_tags = feed_tags
            content = None
            if hasattr(entry, 'content'):
                content = entry.content[0]
                ctype = content.get('type')  # the parser sometimes reports no type on Linux
            elif hasattr(entry, 'summary_detail'):
                # If it is an RSS feed with an HTML description, use
                # that as the content.
                ctype = entry.summary_detail.get('type')
                if ctype in ('text/xhtml', 'application/xhtml+xml',
                             'text/html'):
                    content = entry.summary_detail
            if content:
                if ctype in ('text/xhtml', 'application/xhtml+xml'):
                    # Archetypes doesn't distinguish between html and
                    # xhtml, so we set the type to text/html:
                    ctype = 'text/html'
                    # Warning: minidom.parseString needs a byte
                    # string, not a unicode one, so we need to
                    # encode it first, but only for this parsing.
                    # http://evanjones.ca/python-utf8.html
                    encoded_content = content['value'].encode('utf-8')
                    try:
                        doc = minidom.parseString(encoded_content)
                    except Exception:
                        # Might be an ExpatError, but that is
                        # somewhere in a .so file, so we cannot
                        # specifically catch only that error.  One
                        # reason for an ExpatError is that minidom
                        # fails when there is no single enclosing tag,
                        # so we wrap the content in a <div> and try again.
                        encoded_content = "<div>" + encoded_content + "</div>"
                        try:
                            doc = minidom.parseString(encoded_content)
                        except Exception:
                            # Might be that ExpatError again.
                            logger.warn("Error parsing content for %s", id)
                            continue
                    if len(doc.childNodes) > 0 and \
                            doc.firstChild.hasAttributes():
                        handler = None
                        top = doc.firstChild
                        cls = top.getAttribute('class')
                        if cls:
                            handler = component.queryAdapter(
                                obj, IFeedItemContentHandler, name=cls)
                        if handler is None:
                            handler = component.queryAdapter(
                                obj, IFeedItemContentHandler)

                        if handler is None:
                            update_text(obj, content['value'], mimetype=ctype)
                        else:
                            handler.apply(top)
                            # Grab the first <div> node and treat
                            # that as the content.
                            actualContent = None
                            for node in top.childNodes:
                                if node.nodeName == 'div':
                                    actualContent = node.toxml()
                                    update_text(obj, actualContent,
                                                mimetype=ctype)
                                    break
                    else:
                        update_text(obj, content['value'], mimetype=ctype)
                else:
                    update_text(obj, content['value'], mimetype=ctype)
                if summary == convert_summary(content['value']):
                    # The summary and the content are the same, so we
                    # can shorten the summary.  The transform can
                    # stumble over unicode, so we convert to a utf-8
                    # string first.
                    summary = summary.encode('utf-8')
                    if portal_transforms is not None:
                        data = portal_transforms.convert('html_to_text', summary)
                        summary = data.getData()
                    words = summary.split()[:72]
                    summarywords = words[:45]
                    if len(words) > 70:
                        # use roughly the first 45-72 words as the description
                        for word in words[45:]:
                            summarywords.append(word)
                            if word.endswith('.'):
                                # if we encounter a full stop, that will
                                # be the last word appended to the
                                # description
                                break
                        summary = ' '.join(summarywords)
                        if not summary.endswith('.'):
                            summary = summary + ' ...'
                    obj.setDescription(summary)

            if hasattr(entry, 'links'):
                enclosures = [x for x in entry.links if x.rel == 'enclosure']
                real_enclosures = [x for x in enclosures if
                                   not self.isHTMLEnclosure(x)]
                for link in real_enclosures:
                    if MAXSIZE > 0 and int(link.get('length', 0)) > MAXSIZE * 1000:
                        logger.warn("Ignored enclosure {0} size {1} kb exceeds maximum {2} kb".format(
                            link.get('href', ''), int(link.get('length', 0))/1000, MAXSIZE))
                        continue
                    enclosureSig = md5(link.href)
                    enclosureId = enclosureSig.hexdigest()
                    if enclosureId in obj.objectIds():
                        # Two enclosures with the same href in this
                        # entry...
                        continue
                    enclosure = obj.addEnclosure(enclosureId)
                    enclosure.update(title=enclosureId)
                    updateWithRemoteFile(enclosure, link)
                    if enclosure.Title() != enclosure.getId():
                        self.tryRenamingEnclosure(enclosure, obj)
                    # At this moment in time, the
                    # rename-after-creation magic might have changed
                    # the ID of the file. So we recatalog the object.
                    obj.reindexObject()

            # change ksuess 20.11.2013
            # write media_thumbnail into the text field
            if hasattr(entry, 'media_thumbnail'):
                media_thumbnail = entry.media_thumbnail
                url = None
                try:
                    url = media_thumbnail[0]['url']
                except Exception as e:
                    logger.warn(str(e))
                if url:
                    update_text(obj, url, mimetype='text/plain')
            if obj is not None:
                try:
                    event.notify(FeedItemConsumedEvent(obj))
                except UnicodeDecodeError:
                    logger.warn("UnicodeDecodeError: %s" %
                                '/'.join(obj.getPhysicalPath()))
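
A minimal sketch of how this method might be driven, assuming the consumer class also exposes a retrieveFeedItems() entry point and the feed folder provides a getFeeds() accessor; both names are assumptions, not part of the example above:

    # Hypothetical caller, for illustration only: walk the container's
    # configured feed URLs and retrieve each one, logging failures
    # instead of aborting the whole run.
    def retrieveFeedItems(self, feedContainer):
        for url in feedContainer.getFeeds():
            try:
                self._retrieveSingleFeed(feedContainer, url)
            except IOError as e:
                logger.warn("Could not retrieve feed %s: %s", url, e)
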
Example #2
    def _retrieveSingleFeed(self, feedContainer, url):
        # feedparser doesn't understand proper file: URLs
        if url.startswith('file://'):
            url = url[7:]
            if not os.path.exists(url):
                raise IOError("Couldn't locate %r" % url)
        # urllib does not support the 'feed' scheme -- replace with 'http'
        if url.startswith('feed://'):
            url = url.replace('feed://', 'http://', 1)
        portal_transforms = getToolByName(feedContainer, 'portal_transforms')
        parsed = feedparser.parse(url)
        for entry in parsed.entries:
            id = get_uid_from_entry(entry)
            if not id:
                logger.warn("Ignored unidentifiable entry without id or link.")
                continue
            updated = entry.get('updated')
            published = entry.get('published')

            if not updated:
                # property may be blank if item has never
                # been updated -- use published date
                updated = published

            if updated:
                try:
                    updated = extendedDateTime(updated)
                except DateTime.SyntaxError:
                    logger.warn("SyntaxError while parsing %r as DateTime for "
                                "the 'updated' field of entry %s",
                                updated, getattr(entry, 'title', ''))
                    continue

            prev = feedContainer.getItem(id)
            if prev is None:
                # Completely new item, add it.
                addItem = feedContainer.addItem
            elif updated is None:
                logger.warn("No updated or published date known. "
                            "Not updating previously added entry.")
                continue
            elif updated > prev.getFeedItemUpdated():
                # Refreshed item, replace it.
                addItem = feedContainer.replaceItem
            else:
                # Not new, not refreshed: let it be, laddy.  Still,
                # the entry might have changed slightly, so we check
                # this.
                if prev.getObjectInfo() != entry:
                    # Note: no need for a reindexObject here, which
                    # would also update the modification date, which
                    # we do not want.  See
                    # http://plone.org/products/feedfeeder/issues/34
                    prev.setObjectInfo(entry.copy())
                continue

            obj = addItem(id)

            linkDict = getattr(entry, 'link', None)
            if linkDict:
                # Hey, that's not a dict at all; at least not in my test.
                #link = linkDict['href']
                link = linkDict
            else:
                linkDict = getattr(entry, 'links', [{'href': ''}])[0]
                link = linkDict['href']

            if not updated:
                updated = DateTime()
            if published is not None:
                try:
                    published = extendedDateTime(published)
                except DateTime.SyntaxError:
                    logger.warn("SyntaxError while parsing %r as DateTime for "
                                "the 'published' field of entry %s",
                                published, getattr(entry, 'title', ''))
                    continue
                obj.setEffectiveDate(published)

            summary = getattr(entry, 'summary', '')
            logger.debug("1 summary: %r" % summary)
            summary = convert_summary(summary)
            logger.debug("2 summary: %r" % summary)

            obj.update(id=id,
                       title=getattr(entry, 'title', ''),
                       description=summary,
                       feedItemAuthor=getattr(entry, 'author', ''),
                       feedItemUpdated=updated,
                       link=link,
                       feedTitle=parsed['feed'].get('title', ''),
                       objectInfo=entry.copy(),
                       )
            # Tags cannot be handled by the update method, AFAIK,
            # because feed_tags is not an Archetypes field.
            feed_tags = [x.get('term') for x in entry.get('tags', [])]
            obj.feed_tags = feed_tags
            content = None
            if hasattr(entry, 'content'):
                content = entry.content[0]
                ctype = content.get('type')  # the parser sometimes reports no type on Linux
            elif hasattr(entry, 'summary_detail'):
                # If it is an RSS feed with an HTML description, use
                # that as the content.
                ctype = entry.summary_detail.get('type')
                if ctype in ('text/xhtml', 'application/xhtml+xml',
                             'text/html'):
                    content = entry.summary_detail
            if content:
                if ctype in ('text/xhtml', 'application/xhtml+xml'):
                    # Archetypes doesn't distinguish between html and
                    # xhtml, so we set the type to text/html:
                    ctype = 'text/html'
                    # Warning: minidom.parseString needs a byte
                    # string, not a unicode one, so we need to
                    # encode it first, but only for this parsing.
                    # http://evanjones.ca/python-utf8.html
                    encoded_content = content['value'].encode('utf-8')
                    try:
                        doc = minidom.parseString(encoded_content)
                    except Exception:
                        # Might be an ExpatError, but that is
                        # somewhere in a .so file, so we cannot
                        # specifically catch only that error.  One
                        # reason for an ExpatError is that minidom
                        # fails when there is no single enclosing tag,
                        # so we wrap the content in a <div> and try again.
                        encoded_content = "<div>" + encoded_content + "</div>"
                        try:
                            doc = minidom.parseString(encoded_content)
                        except Exception:
                            # Might be that ExpatError again.
                            logger.warn("Error parsing content for %s", id)
                            continue
                    if len(doc.childNodes) > 0 and \
                            doc.firstChild.hasAttributes():
                        handler = None
                        top = doc.firstChild
                        cls = top.getAttribute('class')
                        if cls:
                            handler = component.queryAdapter(
                                obj, IFeedItemContentHandler, name=cls)
                        if handler is None:
                            handler = component.queryAdapter(
                                obj, IFeedItemContentHandler)

                        if handler is None:
                            update_text(obj, content['value'], mimetype=ctype)
                        else:
                            handler.apply(top)
                            # Grab the first <div> node and treat
                            # that as the content.
                            actualContent = None
                            for node in top.childNodes:
                                if node.nodeName == 'div':
                                    actualContent = node.toxml()
                                    update_text(obj, actualContent,
                                                mimetype=ctype)
                                    break
                    else:
                        update_text(obj, content['value'], mimetype=ctype)
                else:
                    update_text(obj, content['value'], mimetype=ctype)
                if summary == convert_summary(content['value']):
                    # The summary and the content are the same, so we
                    # can shorten the summary.  The transform can
                    # stumble over unicode, so we convert to a utf-8
                    # string first.
                    summary = summary.encode('utf-8')
                    data = portal_transforms.convert('html_to_text', summary)
                    summary = data.getData()
                    words = summary.split()[:72]
                    summarywords = words[:45]
                    if len(words) > 70:
                        # use roughly the first 45-72 words as the description
                        for word in words[45:]:
                            summarywords.append(word)
                            if word.endswith('.'):
                                # if we encounter a full stop, that will
                                # be the last word appended to the
                                # description
                                break
                        summary = ' '.join(summarywords)
                        if not summary.endswith('.'):
                            summary = summary + ' ...'
                    obj.setDescription(summary)

            if hasattr(entry, 'links'):
                enclosures = [x for x in entry.links if x.rel == 'enclosure']
                real_enclosures = [x for x in enclosures if
                                   not self.isHTMLEnclosure(x)]

                for link in real_enclosures:
                    enclosureSig = md5(link.href)
                    enclosureId = enclosureSig.hexdigest()
                    if enclosureId in obj.objectIds():
                        # Two enclosures with the same href in this
                        # entry...
                        continue
                    enclosure = obj.addEnclosure(enclosureId)
                    enclosure.update(title=enclosureId)
                    updateWithRemoteFile(enclosure, link)
                    if enclosure.Title() != enclosure.getId():
                        self.tryRenamingEnclosure(enclosure, obj)
                    # At this moment in time, the
                    # rename-after-creation magic might have changed
                    # the ID of the file. So we recatalog the object.
                    obj.reindexObject()

            if obj is not None:
                try:
                    event.notify(FeedItemConsumedEvent(obj))
                except UnicodeDecodeError:
                    logger.warn("UnicodeDecodeError: %s" %
                                obj.getPhysicalPath())
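
Both examples lean on module-level helpers that are not shown here. A plausible sketch of two of them, assuming the uid is a hash of the entry's id or link and that extendedDateTime is a thin wrapper over Zope's DateTime parser; the real feedfeeder implementations may differ:

    # Plausible sketches of helpers used above -- assumptions, not the
    # actual feedfeeder code.
    from hashlib import md5
    from DateTime import DateTime

    def get_uid_from_entry(entry):
        # Derive a stable id for the entry: hash its 'id' if present,
        # falling back to its 'link'; entries with neither are skipped
        # by the caller.
        raw = getattr(entry, 'id', None) or getattr(entry, 'link', None)
        if not raw:
            return None
        if isinstance(raw, unicode):
            raw = raw.encode('utf-8')
        return md5(raw).hexdigest()

    def extendedDateTime(value):
        # Parse a feed date string into a Zope DateTime;
        # DateTime.SyntaxError propagates to the caller, which logs
        # the problem and skips the entry.
        return DateTime(value)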