Code Example #1
def load_cache(path):
    """Load a cached feedparser result."""
    jsonpath = path.replace('dat', 'json')
    if not os.path.exists(jsonpath):
        return None
    with open(jsonpath) as f:
        data = json.loads(f.read())
    ret = feedparser.FeedParserDict()
    ret.update(data)
    if 'updated_parsed' in data['feed'] and data['feed']['updated_parsed']:
        try:
            data['feed']['updated_parsed'] = time.gmtime(
                data['feed']['updated_parsed'])
        except:
            pass

    ret.feed = feedparser.FeedParserDict(data.get('feed', {}))
    entries = []
    for e in data.get('entries', []):
        if 'updated_parsed' in e and e['updated_parsed']:
            try:
                e['updated_parsed'] = time.gmtime(e['updated_parsed'])
            except:
                pass
        entries.append(feedparser.FeedParserDict(e))
    ret.entries = entries
    return ret
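Note: load_cache above rebuilds a FeedParserDict from a JSON file stored next to the .dat file, turning cached epoch timestamps back into struct_time values. A minimal sketch of what the write side might look like, assuming the same 'dat' -> 'json' naming convention and a hypothetical save_cache helper (not part of the original project):

import calendar
import json

def save_cache(path, parsed):
    # Hypothetical counterpart to load_cache: flatten struct_time values into
    # epoch seconds so json can encode them; other non-JSON values are
    # stringified via default=str and will not round-trip exactly.
    jsonpath = path.replace('dat', 'json')
    data = {'feed': dict(parsed.feed),
            'entries': [dict(e) for e in parsed.entries]}
    if data['feed'].get('updated_parsed'):
        data['feed']['updated_parsed'] = calendar.timegm(
            data['feed']['updated_parsed'])
    for e in data['entries']:
        if e.get('updated_parsed'):
            e['updated_parsed'] = calendar.timegm(e['updated_parsed'])
    with open(jsonpath, 'w') as f:
        f.write(json.dumps(data, default=str))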
Code Example #2
def parse(document, clean_html=True, unix_timestamp=False, encoding=None):
    """Parse a document and return a feedparser dictionary with attr key access.
    If clean_html is False, the html in the feed will not be cleaned.  If
    clean_html is True, a sane version of lxml.html.clean.Cleaner will be used.
    If it is a Cleaner object, that cleaner will be used.  If unix_timestamp is
    True, the date information will be a numerical unix timestamp rather than a
    struct_time.  If encoding is provided, the encoding of the document will be
    manually set to that."""
    if isinstance(clean_html, bool):
        cleaner = default_cleaner if clean_html else fake_cleaner
    else:
        cleaner = clean_html
    result = feedparser.FeedParserDict()
    result['feed'] = feedparser.FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    try:
        parser = SpeedParser(document, cleaner, unix_timestamp, encoding)
        parser.update(result)
    except Exception as e:
        if isinstance(e, UnicodeDecodeError) and encoding is True:
            encoding = chardet.detect(document)['encoding']
            document = document.decode(encoding, 'replace').encode('utf-8')
            return parse(document, clean_html, unix_timestamp, encoding)
        import traceback
        result['bozo'] = 1
        result['bozo_exception'] = e
        result['bozo_tb'] = traceback.format_exc()
    return result
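A minimal usage sketch for the parse() function above. The module name speedparser and the feed file name are assumptions; the bozo/bozo_exception convention mirrors what the function itself sets:

import speedparser  # assumed module exposing the parse() shown above

with open('feed.xml', 'rb') as f:
    document = f.read()

result = speedparser.parse(document, clean_html=False, unix_timestamp=True)
if result['bozo']:
    # parsing failed; the exception and formatted traceback are kept on the result
    print(result['bozo_exception'])
    print(result['bozo_tb'])
else:
    print(result.feed.get('title'))
    print(len(result.entries), 'entries')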
Code Example #3
    def test_parse_ensure_proper_return_on_success(self):
        response = feedparser.FeedParserDict()
        response.status = 200
        response['feed'] = feedparser.FeedParserDict()
        response['feed']['title'] = 'Bola'
        f = create_dynamic_parse_func(response)
        scraper = Scraper(f, self.feed)
        self.assertEqual(response, scraper.parse(False))
Code Example #4
def _start_newznab_attr(self, attrsD):
    context = self._getContext()

    context.setdefault('newznab', feedparser.FeedParserDict())
    context['newznab'].setdefault('tags', feedparser.FeedParserDict())

    name = attrsD.get('name')
    value = attrsD.get('value')

    if name == 'category':
        context['newznab'].setdefault('categories', []).append(value)
    else:
        context['newznab'][name] = value
Code Example #5
def get_date_published(entry: feedparser.FeedParserDict) -> str:
    try:
        return entry['published']

    except Exception as exc:
        logger.exception(exc)
        return ""
Code Example #6
def get_rss_link(entry: feedparser.FeedParserDict) -> str:
    try:
        return entry['link']

    except Exception as exc:
        logger.exception(exc)
        return ""
Code Example #7
def get_summary(entry: feedparser.FeedParserDict) -> str:
    try:
        return entry['summary']

    except Exception as exc:
        logger.exception(exc)
        return ""
Code Example #8
    def get_images(self, folder_id, raw=False):
        """ get the contents of a folder """
        response = self._request("%s/%s/images" % (self.folders_url,
            folder_id), headers=self.accept_header)[1]
        if raw:
            return response

        # the image list uses microformats which are not on the feedparser
        # whitelist, so we'll need some custom parsing

        sanitize_html = fp.SANITIZE_HTML
        fp.SANITIZE_HTML = 0
        parsed = fp.parse(response)
        BeautifulSoup = fp.BeautifulSoup.BeautifulSoup
        for image in parsed.entries:
            # TODO: think of a better way to automate this
            content = image.content[0].value
            image.content = fp.FeedParserDict()
            soup = BeautifulSoup(content)
            for prop in ['filename', 'imageurl', 'height', 'width',
                'description']:
                image.content[prop] = getattr(soup, prop).text

        fp.SANITIZE_HTML = sanitize_html
        return parsed
Code Example #9
def restore_sort_feedparserdict(feed_items):
    ''' restore feed items to FeedParserDict - for some reason Django sessions
        converts them to Dict; sort by date
        :arguments: list of news item dicts
        :returns: dictionary of news items dicts
    '''
    # first restore feed_item list to a feedparser dict
    feed_items = [
        feedparser.FeedParserDict(feed_item) for feed_item in feed_items
    ]

    # sort the feed_items list on date
    feed_items_sorted = []
    for feed_item in feed_items:
        feed_items_sorted.append(
            (feed_item, feedparser_time_to_datetime(feed_item)))

    feed_items_sorted.sort(key=lambda k: k[1], reverse=True)

    # now make the new feedparser dict
    new_feed_items = {}
    for i, feed_item in enumerate(feed_items_sorted):
        new_feed_items[i] = feed_item[0]

    return new_feed_items
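The restoration step above matters because FeedParserDict supports attribute-style access that a plain dict (which is what Django sessions hand back) does not. A small illustration with made-up item data:

import feedparser

item = {'title': 'Example item', 'link': 'https://example.com/post'}
wrapped = feedparser.FeedParserDict(item)

print(wrapped.title)   # keys double as attributes: 'Example item'
# dict(item).title would raise AttributeError; only item['title'] works on a plain dict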
Code Example #10
def bulk_import(parsed_feed):
    match = USERNAME_RE.search(parsed_feed.feed.link)
    username = match.group('name')
    if parsed_feed.feed.link in _cached_video_count:
        count = _cached_video_count[parsed_feed.feed.link]
    else:
        count = video_count(parsed_feed)
    post_url = _post_url(username, match.group('type') or 'videos', 'page=%i')
    parsed_feed = feedparser.FeedParserDict(parsed_feed.copy())
    parsed_feed.entries = []
    for i in range(1, int(math.ceil(count / 20.0)) + 1):
        response = open_url_while_lying_about_agent(post_url % i)
        if response.getcode() != 200:
            break
        data = response.read()
        if not data:
            break
        json_data = simplejson.loads(data)
        for video in json_data:
            parsed_feed.entries.append(
                feedparser_dict(_json_to_feedparser(video)))

    # clean up cache
    if parsed_feed.feed.link in _cached_video_count:
        del _cached_video_count[parsed_feed.feed.link]

    return parsed_feed
Code Example #11
    def generateFeeds(self, number, date=None, name='feed'):
        feeds = []
        for i in xrange(number):
            if date is None:
                date = time.gmtime()

            if date == self.NO_DATE:
                entries = [feedparser.FeedParserDict()]
            else:
                entries = [feedparser.FeedParserDict(updated_parsed=date)]

            feeds.append(
                feedparser.FeedParserDict(feed='%s-%i' % (name, i),
                                          encoding='UTF-8',
                                          entries=entries))
        return feeds
Code Example #12
    def test_parse_ensure_temporary_error_on_unknown_status(self):
        response = feedparser.FeedParserDict()
        response.status = 500
        f = create_dynamic_parse_func(response)
        scraper = Scraper(f, self.feed)
        with self.assertRaises(TemporaryFeedError):
            scraper.parse(False)
Code Example #13
File: spider.py Project: aviarypl/venus
def httpThread(thread_index, input_queue, output_queue, log):
    import httplib2
    from httplib import BadStatusLine

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))
        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri: log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']

            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                    feed_info.feed['planet_content_hash'] == \
                    resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri,
                      thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            log.error("socket.gaierror: %s - %s (thread %d)", uri, str(e[1]),
                      thread_index)
Code Example #14
    def test_parse_ensure_broken_feed_on_404(self):
        response = feedparser.FeedParserDict()
        response.status = 404
        f = create_dynamic_parse_func(response)
        scraper = Scraper(f, self.feed)
        with self.assertRaises(BrokenFeed):
            scraper.parse(False)
Code Example #15
    def test_issue_328_fallback_behavior(self):
        warnings.filterwarnings('error')

        d = feedparser.FeedParserDict()
        d['published'] = u'pub string'
        d['published_parsed'] = u'pub tuple'
        d['updated'] = u'upd string'
        d['updated_parsed'] = u'upd tuple'
        # Ensure that `updated` doesn't map to `published` when it exists
        self.assertTrue('published' in d)
        self.assertTrue('published_parsed' in d)
        self.assertTrue('updated' in d)
        self.assertTrue('updated_parsed' in d)
        self.assertEqual(d['published'], 'pub string')
        self.assertEqual(d['published_parsed'], 'pub tuple')
        self.assertEqual(d['updated'], 'upd string')
        self.assertEqual(d['updated_parsed'], 'upd tuple')

        d = feedparser.FeedParserDict()
        d['published'] = u'pub string'
        d['published_parsed'] = u'pub tuple'
        # Ensure that `updated` doesn't actually exist
        self.assertTrue('updated' not in d)
        self.assertTrue('updated_parsed' not in d)
        # Ensure that accessing `updated` throws a DeprecationWarning
        try:
            d['updated']
        except DeprecationWarning:
            # Expected behavior
            pass
        else:
            # Wrong behavior
            self.assertEqual(True, False)
        try:
            d['updated_parsed']
        except DeprecationWarning:
            # Expected behavior
            pass
        else:
            # Wrong behavior
            self.assertEqual(True, False)
        # Ensure that `updated` maps to `published`
        warnings.filterwarnings('ignore')
        self.assertEqual(d['updated'], u'pub string')
        self.assertEqual(d['updated_parsed'], u'pub tuple')
        warnings.resetwarnings()
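The fallback that this test documents can be shown directly: on a FeedParserDict that has 'published' but no 'updated', reading 'updated' maps to the published value (and emits a DeprecationWarning, silenced here):

import warnings
import feedparser

d = feedparser.FeedParserDict()
d['published'] = u'pub string'
d['published_parsed'] = u'pub tuple'

with warnings.catch_warnings():
    warnings.simplefilter('ignore', DeprecationWarning)
    print(d['updated'])         # -> 'pub string'
    print(d['updated_parsed'])  # -> 'pub tuple'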
Code Example #16
def feedparser_dict(obj):
    if isinstance(obj, dict):
        return feedparser.FeedParserDict(
            dict([(key, feedparser_dict(value))
                  for (key, value) in obj.items()]))
    if isinstance(obj, (list, tuple)):
        return [feedparser_dict(member) for member in obj]
    return obj
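A short usage sketch for the recursive converter above (feedparser is assumed to be imported alongside it): it turns arbitrarily nested plain dicts, such as decoded JSON, into FeedParserDicts so attribute access works at every level:

raw = {
    'feed': {'title': 'Example feed'},
    'entries': [{'title': 'First entry', 'tags': [{'term': 'news'}]}],
}

fp = feedparser_dict(raw)
print(fp.feed.title)               # nested dicts become FeedParserDicts
print(fp.entries[0].tags[0].term)  # list members are converted one by one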
Code Example #17
File: tv_cache.py Project: harikrishnan0eh/SiCKRAGE
    def getRSSFeed(self, url, params=None):
        try:
            if self.provider.login():
                resp = WebSession().get(url, params=params).text
                return feedparser.parse(resp)
        except Exception as e:
            sickrage.app.log.debug("RSS Error: {}".format(e))

        return feedparser.FeedParserDict()
Code Example #18
def get_datetime(fp: feedparser.FeedParserDict) -> str:
    try:
        tm = fp['feed']['updated_parsed']
        return str(tm.year) + left_pad(tm.month) + left_pad(tm.day) + \
            left_pad(tm.hour) + left_pad(tm.minute)

    except Exception as exc:
        logger.exception(exc)
        tm = datetime.now()
        return str(tm.year) + left_pad(tm.month) + left_pad(tm.day) + \
            left_pad(tm.hour) + left_pad(tm.minute)
Code Example #19
File: test_word.py Project: deeban25/shakespeare
    def test_01_load_entry(self):
        # TODO: standalone test ...
        import feedparser
        entry = feedparser.FeedParserDict()
        title = u' xxx'
        name = title.strip()
        content = [{'value': u'yyy', 'language': 'en'}]
        entry.title = title
        entry.content = content
        word = wordm.load_entry(entry)
        model.Session.remove()
        word = model.Word.by_name(name)
        assert word.notes == content[0]['value'], word.notes
Code Example #20
    def test_has_updated_past(self):
        self.feed.last_updated_at = make_aware(datetime.now())
        future = datetime.now() - timedelta(days=1)
        entry_list = [
            {'updated_parsed': future.timetuple()},
        ]

        feed_dict = feedparser.FeedParserDict()
        feed_dict['entries'] = entry_list

        f = create_dynamic_parse_func(feed_dict)
        scraper = Scraper(f, self.feed)
        self.assertFalse(scraper._has_updated(feed_dict, False))
Code Example #21
def _feed_dict(d):
    if isinstance(d, dict):
        for key, value in d.iteritems():
            #parsed time tuple
            if key.endswith('_parsed'):
                utc_time = utc.from_string(value)
                d[key] = utc_time.timetuple()
        inst = feedparser.FeedParserDict(d)
    elif 'error' in d:
        inst = pickle.loads(d['error'])
    else:
        inst = d
    return inst
Code Example #22
    def __init__(self,
                 root,
                 namespaces={},
                 encoding='utf-8',
                 type='rss20',
                 cleaner=default_cleaner,
                 unix_timestamp=False):
        """A port of SpeedParserFeed that uses far fewer xpath lookups, which
        ends up simplifying parsing and makes it easier to catch the various
        names that different tags might come under."""
        self.root = root
        self.unix_timestamp = unix_timestamp
        nslookup = reverse_namespace_map(namespaces)
        self.cleaner = cleaner
        self.baseurl = base_url(root)

        feed = feedparser.FeedParserDict()
        tag_map = self.tag_map

        channel = xpath(root, self.channel_xpath, namespaces)
        if len(channel) == 1:
            channel = channel[0]

        for child in channel:
            if isinstance(child, etree._Comment):
                continue
            ns, tag = clean_ns(child.tag)
            mapping = tag_map.get(tag, None)
            if mapping:
                getattr(self, 'parse_%s' % mapping)(child, feed,
                                                    nslookup.get(ns, ns))
            if not ns:
                continue
            fulltag = '%s:%s' % (nslookup.get(ns, ''), tag)
            mapping = tag_map.get(fulltag, None)
            if mapping:
                getattr(self, 'parse_%s' % mapping)(child, feed, nslookup[ns])

        # this copies feedparser behavior if, say, xml:lang is defined in the
        # root feed element, even though this element tends to have garbage like
        # "utf-8" in it rather than an actual language
        if 'language' not in feed:
            for attr in root.attrib:
                if attr.endswith('lang'):
                    feed['language'] = root.attrib[attr]

        if 'id' in feed and 'link' not in feed:
            feed['link'] = feed['id']

        self.feed = feed
Code Example #23
    def test_01_load_entry(self):
        import feedparser
        entry = feedparser.FeedParserDict()
        title = u'Introduction: Sonnet 18'
        name = title.strip()
        content = [{'value': u'yyy', 'language': 'en'}]
        entry.title = title
        entry.content = content

        loader = feed.WorkIntroductionLoader()
        work = loader.load_entry(entry)
        assert work.name == 'test_sonnet18', work
        model.Session.commit()
        model.Session.remove()
        work = model.Work.by_name(TestData.name)
        assert work.notes == content[0]['value'], work.notes
Code Example #24
    def test_builds_valid_message_for_post(self):
        search_term = 'some search'
        url = 'http://example.com/item1'
        feedUrl = 'http://example.com/feed'
        title = 'some title'
        content = 'some content'
        date_published = None
        author = 'some guy'
        entry = feedparser.FeedParserDict({'id': feedUrl})
        post = pshb.PostFactory.createPost(url, feedUrl, title, content,
                                           date_published, author, entry)
        message_builder = MessageBuilder()
        message = message_builder.build_message_from_post(post, search_term)
        expected = '[%s] matched post: [%s] with URL: [%s]' % (search_term,
                                                               title, url)
        self.assertEquals(expected, message)
Code Example #25
    def parse_entry(self, entry):
        """An attempt to parse pieces of an entry out w/o xpath, by looping
        over the entry root's children and slotting them into the right places.
        This is going to be way messier than SpeedParserEntries, and maybe
        less cleanly usable, but it should be faster."""

        e = feedparser.FeedParserDict()
        tag_map = self.tag_map
        nslookup = self.nslookup

        for child in entry.getchildren():
            if isinstance(child, etree._Comment):
                continue
            ns, tag = clean_ns(child.tag)
            mapping = tag_map.get(tag, None)
            if mapping:
                getattr(self, 'parse_%s' % mapping)(child, e,
                                                    nslookup.get(ns, ns))
            if not ns:
                continue
            fulltag = '%s:%s' % (nslookup.get(ns, ''), tag)
            mapping = tag_map.get(fulltag, None)
            if mapping:
                getattr(self, 'parse_%s' % mapping)(child, e, nslookup[ns])

        lacks_summary = 'summary' not in e or e['summary'] is None
        lacks_content = 'content' not in e or not bool(e.get('content', None))

        if not lacks_summary and lacks_content:
            e['content'] = [{'value': e.summary}]

        # feedparser sometimes copies the first content value into the
        # summary field when summary was completely missing;  we want
        # to do that as well, but avoid the case where summary was given as ''
        if lacks_summary and not lacks_content:
            e['summary'] = e['content'][0]['value']

        if e.get('summary', False) is None:
            e['summary'] = u''

        # support feed entries that have a guid but no link
        if 'guid' in e and 'link' not in e:
            e['link'] = full_href(e['guid'], self.baseurl)

        return e
Code Example #26
def extractTags(x):
    # Flatten nested FeedParserDict/list values into one string of leaf values.
    complete = ""
    # copy to a list so it can be extended while iterating
    values = list(x.values())
    for value in values:
        if isinstance(value, feedparser.FeedParserDict):
            # nested mapping: queue its values for the same pass
            values.extend(value.values())
        elif isinstance(value, list):
            # list value: queue the first element's values
            values.extend(value[0].values())
        else:
            complete = complete + "\n , " + str(value)

    compounds = saveCompounds(str(complete))

    return compounds
Code Example #27
File: rssManager.py Project: Raccoon-git/xpebot
class rssManager(object):
    """description of class"""

    rssNewsUrl = main.botConfig.GetRssNewsUrl()
    rssNewsCount = main.botConfig.GetRssNewsCount()

    rssNews = feedparser.FeedParserDict()

    def RssNewsReader(self):
        log.info('Rss News Reader')
        self.rssNews = feedparser.parse(self.rssNewsUrl)
        newsMessage = '<strong>' + self.rssNews.feed.title + '</strong>\n\n'
        for (i, entry) in enumerate(self.rssNews.entries):
            if i == self.rssNewsCount: break
            log.info(entry.title)
            newsMessage += '<a href="%s">%d. %s</a>\n' % (entry.link, i + 1,
                                                          entry.title)

        log.info('Rss News : %s', newsMessage)
        return newsMessage
Code Example #28
    def setUp(self):
        self.d = feedparser.FeedParserDict()
Code Example #29
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet """
    log = planet.logger

    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue
    for thread in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:

                if not hasattr(feed,
                               'headers') or int(feed.headers.status) < 300:
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag',None)
                        try:
                            modified = time.strptime(
                                feed_info.feed.get('planet_http_last_modified',
                                                   None))
                        except:
                            pass

                    data = feedparser.parse(feed, **options)
                else:
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'href': feed.url,
                        'bozo': 0,
                        'status': int(feed.headers.status)
                    })

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id: id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'): href = data.href

                duplicate = None
                if id and id in feeds_seen:
                    duplicate = id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                             (uri, feeds_seen[duplicate]))
                    if href: feed_info.feed['planet_http_location'] = href

                if id: feeds_seen[id] = uri
                if href: feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)

            except Exception, e:
                import sys, traceback
                type, value, tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(type, value) +
                             traceback.format_tb(tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        for index in threads.keys():
            if not threads[index].isAlive():
                del threads[index]
                if not threads:
                    log.info("Finished threaded part of processing.")
Code Example #30
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(
            data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers[
                'last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(
                data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(
                feedparser.FeedParserDict({
                    'rel': 'self',
                    'type': feedtype,
                    'href': feed_uri
                }))
    for name, value in config.feed_options(feed_uri).items():
        if name == "gravatar" and re.match(re_email, value):
            data.feed['planet_' + name] = md5(value.strip()).hexdigest()
        else:
            data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated = entry.published
        if entry.has_key('updated'): updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('', ))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)

        # check if blacklist file exists. If so, skip it.
        if os.path.exists(blacklist_file):
            continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed', None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
            if os.path.exists(cache_file): os.remove(cache_file)
            continue

        # write out and timestamp the results
        write(output, cache_file, mtime)

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index: index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [
            entry.updated_parsed for entry in data.entries
            if entry.has_key('updated_parsed')
        ]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
            updated = [
                feedparser._parse_date_iso8601(data.feed.planet_updated)
            ]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(
                feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'): del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources): os.makedirs(sources)
    xdoc = minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo,
                        data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()