Python scrub Beispiele

Programmiersprache: Python

Namespace / Paketname: scrub

Methode / Funktion: scrub

Beispiele auf hotexamples.com: 5

Python scrub - 5 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die scrub.scrub, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Beispiel #1

Datei anzeigen

def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(
            data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers[
                'last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(
                data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(
                feedparser.FeedParserDict({
                    'rel': 'self',
                    'type': feedtype,
                    'href': feed_uri
                }))
    for name, value in config.feed_options(feed_uri).items():
        if name == "gravatar" and re.match(re_email, value):
            data.feed['planet_' + name] = md5(value.strip()).hexdigest()
        else:
            data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated = entry.published
        if entry.has_key('updated'): updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('', ))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)

        # check if blacklist file exists. If so, skip it.
        if os.path.exists(blacklist_file):
            continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed', None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
            if os.path.exists(cache_file): os.remove(cache_file)
            continue

        # write out and timestamp the results
        write(output, cache_file, mtime)

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index: index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [
            entry.updated_parsed for entry in data.entries
            if entry.has_key('updated_parsed')
        ]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
            updated = [
                feedparser._parse_date_iso8601(data.feed.planet_updated)
            ]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(
                feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'): del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources): os.makedirs(sources)
    xdoc = minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo,
                        data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()

Beispiel #2

Datei anzeigen

Datei: spider.py Projekt: hoonose/aggregator

def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(
            data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.version and feed_info.version:
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = data.headers[
                'last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(
                data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.version:
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(
                feedparser.FeedParserDict({
                    'rel': 'self',
                    'type': feedtype,
                    'href': feed_uri
                }))
    for name, value in config.feed_options(feed_uri).items():
        data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
            if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated = entry.published
        if entry.has_key('updated'): updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('', ))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        #FEEDWORLD: published preferred over updated
        if not entry.has_key(
                'published_parsed') or not entry['published_parsed']:
            entry['published_parsed'] = entry.get('updated_parsed', None)
        if entry.has_key('published_parsed'):
            try:
                mtime = calendar.timegm(entry.published_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('published_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.published_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
            if os.path.exists(cache_file): os.remove(cache_file)
            continue

        #FEEDWORLD: ugly hack to get around feeds that don't have timestamps
        try:
            elements = minidom.parseString(output).getElementsByTagName(
                "feedworld_mtime")
            if len(elements) > 0:
                mtime = int(elements[0].firstChild.data)
                #log.info("mtime=%d\n"%mtime)
        except Exception, e:
            log.debug("error: %s\n" % e)

        # write out and timestamp the results
        write(output, cache_file)
        os.utime(cache_file, (mtime, mtime))

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

Beispiel #3

Datei anzeigen

Datei: __main__.py Projekt: Network-verification/batfish

def main(argv=None): # IGNORE:C0111
    '''Command line options.'''

    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    if __name__ == '__main__':
        program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    else:
        program_shortdesc = __doc__.split("\n")[1]
    program_license = '''%s

  Created by Ari Fogel on %s.
  Copyright 2016 Ari Fogel. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
        parser.add_argument('-o', '--outputdir', dest='outputDir', default=None, help="path to output directory", metavar="path")
        #parser.add_argument("-r", "--recursive", dest="recurse", action="store_true", help="recurse into subfolders [default: %(default)s]")
        parser.add_argument("-v", "--verbose", dest="verbose", action="count", help="set verbosity level [default: %(default)s]")
        #parser.add_argument("-i", "--include", dest="include", help="only include paths matching this regex pattern. Note: exclude is given preference over include. [default: %(default)s]", metavar="RE" )
        #parser.add_argument("-e", "--exclude", dest="exclude", help="exclude paths matching this regex pattern. [default: %(default)s]", metavar="RE" )
        parser.add_argument('-V', '--version', action='version', version=program_version_message)
        parser.add_argument('-i', '--inputdir', dest="inputDir", help="path to directory with source configuration file(s)", default=None, metavar="path", required=True)

        # Process arguments
        args = parser.parse_args()

        inputDir = args.inputDir
        options = Options()
        options.inputDir = inputDir
        options.verbose = args.verbose
        options.outputDir = args.outputDir

        if inputDir == None:
            raise BatfishException("No input directory specified"), None, sys.exc_info()[2]
        inputFiles = [join(inputDir, f) for f in listdir(inputDir) if isfile(join(inputDir, f))]
        for inputFile in inputFiles:
            ### do something with inpath ###
            scrub(inputFile, options)
        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    except BatfishException as e:
        raise BatfishException("error running batfish_scrubber", e), None, sys.exc_info()[2]
    except Exception, e:
        if DEBUG or TESTRUN:
            raise(e)
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2

Beispiel #4

Datei anzeigen

Datei: spider.py Projekt: ArchLinuxJP/archplanet-jp

def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries)>0:
            data.status = 200
        elif data.bozo and \
            data.bozo_exception.__class__.__name__.lower()=='timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time()-86400*config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and len(data.entries)>0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
               return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
               not feed_info.feed.planet_message.startswith("no data"):
               del feed_info.feed['planet_message']

    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo','true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] =  data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified']=data.headers['last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = time.asctime(data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'): data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'): feedtype = 'application/rss+xml'
        if data.version in ['rss090','rss10']: feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(feedparser.FeedParserDict(
                {'rel':'self', 'type':feedtype, 'href':feed_uri}))
    for name, value in config.feed_options(feed_uri).items():
        data.feed['planet_'+name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None: index = idindex.open()
 
    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']: continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'): updated=entry.published
        if entry.has_key('updated'):   updated=entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id,('',))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():

        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)  

        # check if blacklist file exists. If so, skip it. 
        if os.path.exists(blacklist_file):
           continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed',None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime: mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output: break
        if not output:
          if os.path.exists(cache_file): os.remove(cache_file)
          continue

        # write out and timestamp the results
        write(output, cache_file, mtime) 
    
        # optionally index
        if index != None: 
            feedid = data.feed.get('id', data.feed.get('link',None))
            if feedid:
                if type(feedid) == unicode: feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index: index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [entry.updated_parsed for entry in data.entries
            if entry.has_key('updated_parsed')]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
           updated = [feedparser._parse_date_iso8601(data.feed.planet_updated)]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'): del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources): os.makedirs(sources)
    xdoc=minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement,data.feed,data.bozo,data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()

Beispiel #5

Datei anzeigen

def main(argv=None):  # IGNORE:C0111
    '''Command line options.'''

    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version,
                                                     program_build_date)
    if __name__ == '__main__':
        program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    else:
        program_shortdesc = __doc__.split("\n")[1]
    program_license = '''%s

  Created by Ari Fogel on %s.
  Copyright 2016 Ari Fogel. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = ArgumentParser(description=program_license,
                                formatter_class=RawDescriptionHelpFormatter)
        parser.add_argument('-o',
                            '--outputdir',
                            dest='outputDir',
                            default=None,
                            help="path to output directory",
                            metavar="path")
        #parser.add_argument("-r", "--recursive", dest="recurse", action="store_true", help="recurse into subfolders [default: %(default)s]")
        parser.add_argument("-v",
                            "--verbose",
                            dest="verbose",
                            action="count",
                            help="set verbosity level [default: %(default)s]")
        #parser.add_argument("-i", "--include", dest="include", help="only include paths matching this regex pattern. Note: exclude is given preference over include. [default: %(default)s]", metavar="RE" )
        #parser.add_argument("-e", "--exclude", dest="exclude", help="exclude paths matching this regex pattern. [default: %(default)s]", metavar="RE" )
        parser.add_argument('-V',
                            '--version',
                            action='version',
                            version=program_version_message)
        parser.add_argument(
            '-i',
            '--inputdir',
            dest="inputDir",
            help="path to directory with source configuration file(s)",
            default=None,
            metavar="path",
            required=True)

        # Process arguments
        args = parser.parse_args()

        inputDir = args.inputDir
        options = Options()
        options.inputDir = inputDir
        options.verbose = args.verbose
        options.outputDir = args.outputDir

        if inputDir == None:
            raise BatfishException(
                "No input directory specified"), None, sys.exc_info()[2]
        inputFiles = [
            join(inputDir, f) for f in listdir(inputDir)
            if isfile(join(inputDir, f))
        ]
        for inputFile in inputFiles:
            ### do something with inpath ###
            scrub(inputFile, options)
        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    except BatfishException as e:
        raise BatfishException("error running batfish_scrubber",
                               e), None, sys.exc_info()[2]
    except Exception, e:
        if DEBUG or TESTRUN:
            raise (e)
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + "  for help use --help")
        return 2