def get_html_report(cfg_file=None, cfg=None):
    """
    Format a report in HTML
    """
    rval = ""
    if cfg is not None:
        # use it
        pass
    elif cfg_file is not None:
        cfg = CrawlConfig.add_config(filename=cfg_file)
    else:
        cfg = CrawlConfig.add_config()

    db = CrawlDBI.DBI(dbtype="crawler")
    last_rpt_time = rpt_lib.get_last_rpt_time(db)
    rval += '<head><meta http-equiv="refresh" content="60">\n'
    rval += "<title>HPSSIC Dashboard</title></head>"
    rval += ("<body><center><h1>HPSS Integrity Crawler Dashboard</h1>" +
             "<br><h4>Version %s</h4>" % version.__version__ +
             "</center>\n")
    rval += "Report generated at %s\n" % time.strftime("%Y.%m%d %H:%M:%S")
    rval += ("<br>Based on data from %s\n" %
             time.strftime("%Y.%m%d %H:%M:%S",
                           time.localtime(last_rpt_time)))
    rval += get_html_cv_report(db, last_rpt_time)
    rval += get_html_mpra_report(db, last_rpt_time)
    rval += get_html_tcc_report(db, last_rpt_time)
    rval += "</body>"
    db.close()
    return rval


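# A minimal usage sketch for get_html_report(), not part of the crawler
# itself: render the dashboard and write it where a web server can serve
# it. The output path below is hypothetical.
def write_dashboard(outpath="/tmp/hpssic_dashboard.html"):
    """
    Regenerate the dashboard HTML and write it to *outpath* (assumed path).
    The <meta refresh> tag in the report makes browsers re-fetch the page
    every 60 seconds, so rewriting the file on a similar cadence keeps the
    display current.
    """
    html = get_html_report()
    with open(outpath, 'w') as f:
        f.write(html)

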
def cvv_show_next(argv):
    """show_next - Report the Checkables in the order they will be checked

    usage: cvtool show_next
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--config',
                 action='store', default='', dest='config',
                 help='alternate configuration')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-i', '--id',
                 action='store', default='', dest='id',
                 help='id of entry to be checked')
    p.add_option('-l', '--limit',
                 action='store', default=-1, dest='limit', type=int,
                 help='max records to get')
    p.add_option('-p', '--path',
                 action='store', default='', dest='path',
                 help='name of path to be checked')
    p.add_option('-v', '--verbose',
                 action='store_true', default=False, dest='verbose',
                 help='more information')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    if o.config:
        cfg = CrawlConfig.add_config(close=True, filename=o.config)
    else:
        cfg = CrawlConfig.add_config()

    if o.limit < 0:
        limit = int(cfg.get_d('cv', 'operations', '10'))
    else:
        limit = o.limit

    clist = Checkable.Checkable.get_list(limit)
    for c in clist:
        if c.last_check == 0:
            print("%18d %s %s" % (c.last_check, c.type, c.path))
        else:
            print("%s %s %s" % (U.ymdhms(c.last_check), c.type, c.path))


def running_pid(proc_required=True, context=None):
    """
    Return a list of (pid, context, exitpath) tuples for running crawler
    processes (per ps(1)), or [] if none are found
    """
    cfg = CrawlConfig.add_config()
    rval = []
    if proc_required:
        result = pidcmd()
        for line in result.split("\n"):
            if 'crawl start' in line:
                pid = int(line.split()[0])
                pfpath = "%s/%d" % (CrawlConfig.pid_dir(), pid)
                if os.path.exists(pfpath):
                    (ctx, xpath) = util.contents(pfpath).strip().split()
                    rval.append((pid, ctx, xpath))
                elif not os.path.exists(pfpath + '.DEFUNCT'):
                    # crawler is running but the pid file has been lost
                    ctx = context or cfg.get('crawler', 'context')
                    xpath = cfg.get_d('crawler', 'exitpath', '%s.exit' % ctx)
                    make_pidfile(pid, ctx, xpath)
                    rval.append((pid, ctx, xpath))
                # if pfpath + '.DEFUNCT' exists, the crawler is shutting down
                # so we don't want to recreate the pid file.
    else:
        pid_l = glob.glob("%s/*" % CrawlConfig.pid_dir())
        for pid_n in pid_l:
            pid = int(os.path.basename(pid_n))
            (ctx, xpath) = util.contents(pid_n).strip().split()
            rval.append((pid, ctx, xpath))
    return rval


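# An illustrative consumer of running_pid() (not part of the crawler's
# CLI): the return value is a list of (pid, context, exitpath) tuples, so
# a liveness check and a status listing both fall out directly.
def report_running_crawlers():
    """
    Print one line per live crawler process, or a note if none are found.
    """
    procs = running_pid()
    if not procs:
        print("no crawler processes found")
    for (pid, ctx, xpath) in procs:
        print("pid %d: context '%s', exit path '%s'" % (pid, ctx, xpath))

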
def clean_defunct_pidfiles(context):
    """
    Remove .DEFUNCT pid files for *context*
    """
    cfg = CrawlConfig.add_config()
    pdir = CrawlConfig.pid_dir()
    for path in glob.glob(os.path.join(pdir, '*.DEFUNCT')):
        c = util.contents(path)
        if context in c:
            os.unlink(path)


def load_recheck_list(cls, how_many):
    """
    Look to see whether any of the already checksummed items in the database
    have a last check time over the threshold for rechecking. If so, we'll
    shove some of them to the front of the list based on the configuration.
    """
    cfg = CrawlConfig.add_config()
    r_fraction = float(cfg.get_d('cv', 'recheck_fraction', '0.0'))
    r_age = cfg.get_time('cv', 'recheck_age', 365*24*3600)
    threshold = int(time.time() - r_age)
    CrawlConfig.log("threshold = %s (%d)", U.ymdhms(threshold), threshold)
    if r_fraction == 0.0:
        return []

    limit = round(r_fraction * how_many)
    db = CrawlDBI.DBI(dbtype='crawler')
    kw = {'table': 'checkables',
          'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                     'checksum', 'last_check', 'fails', 'reported'],
          'where': 'checksum <> 0 and last_check < %d' % threshold,
          'orderby': 'last_check',
          'limit': limit}
    rows = db.select(**kw)
    db.close()

    rval = []
    for row in rows:
        tmp = list(row)
        new = Checkable(rowid=tmp.pop(0),
                        path=tmp.pop(0),
                        type=tmp.pop(0),
                        cos=tmp.pop(0),
                        cart=tmp.pop(0),
                        ttypes=tmp.pop(0),
                        checksum=tmp.pop(0),
                        last_check=tmp.pop(0),
                        fails=tmp.pop(0),
                        reported=tmp.pop(0),
                        in_db=True,
                        dirty=False)
        rval.append(new)
    return rval


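# A worked example of the recheck arithmetic above, under assumed values:
# with recheck_fraction = 0.3 and how_many = 10, limit = round(0.3 * 10)
# = 3, so at most three overdue items get pulled to the front of the
# list. With the default recheck_age of 365*24*3600 seconds, an item is
# overdue only if its last_check is more than a year in the past.
def _recheck_limit_example(how_many=10, r_fraction=0.3):
    """
    Illustrative only: reproduce load_recheck_list()'s limit computation
    for the assumed values above.
    """
    return round(r_fraction * how_many)     # 3.0 for these defaults

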
def history_load(loadlist, filename):
    """
    Each plugin's sublib has a load_history() routine that knows how to
    load its data to the history file. Unfortunately, we do have to know
    something special here about plugin 'cv': we must warn the user when a
    filename is specified without 'cv' in the load list (or vice versa),
    and we must know when to pass filename to the plugin's load_history()
    method.
    """
    cfg = CrawlConfig.add_config()
    pluglist = U.csv_list(cfg.get_d('crawler', 'plugins',
                                    U.default_plugins()))
    ll = U.csv_list(loadlist)
    if 'all' in ll or ll == []:
        ll = copy.deepcopy(pluglist)

    if filename is None and 'cv' in ll:
        print(MSG.history_cv_not_loaded)
        ll.remove('cv')
    elif filename is not None and 'cv' not in ll:
        print(MSG.history_filename_ignored)

    unk_plugs = [x for x in ll if x not in pluglist]
    if 0 < len(unk_plugs):
        print(MSG.unrecognized_plugin_S % ', '.join(unk_plugs))
        # an explicit loop rather than map() -- we want the side effect,
        # and map() would be a no-op under Python 3
        for unk in unk_plugs:
            ll.remove(unk)

    if ll == []:
        return

    dbschem.make_table('history')
    for plug in [x for x in ll if x in pluglist]:
        print("loading %s..." % plug)
        if plug == 'cv' and filename is not None:
            args = [filename]
        else:
            args = []
        p = CrawlPlugin.CrawlPlugin(name=plug, cfg=cfg)
        p.load_history(*args)


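# Hypothetical history_load() invocations (file names invented). Loading
# 'cv' history requires a crawler log file; the other plugins read from
# their own database tables, so no filename is needed for them.
def _history_load_examples():
    """
    Illustrative only -- each call would actually load history data.
    """
    history_load('cv,mpra', '/var/log/hpssic/crawl.log')    # cv + mpra
    history_load('tcc', None)                               # tcc alone
    history_load('all', '/var/log/hpssic/crawl.log')        # everything

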
def __init__(self, *args, **kwargs):
    """
    Initialize a Checkable object -- set the path, type, checksum, cos, and
    last_check to default values, then update them based on the arguments.
    """
    # where the item is in HPSS
    self.path = '---'
    # file ('f') or directory ('d')
    self.type = '-'
    # which COS the file is in (empty for directories)
    self.cos = ''
    # which tape cartridge(s) the file is stored on
    self.cart = None
    # the type of tape cartridge(s)
    self.ttypes = None
    # 1 if we have a checksum stored, else 0
    self.checksum = 0
    # how many times we've tried and failed to retrieve the file content
    self.fails = 0
    # whether we've reported that retrievals are failing for this file
    self.reported = 0
    # when was the last check of this file (epoch time)
    self.last_check = 0
    # this item's row id in the database
    self.rowid = None
    # how likely are we to add an item to the sample?
    self.probability = 0.1
    # whether this object is in the database
    self.in_db = False
    # whether this object has been changed
    self.dirty = False
    # non-keyword arguments
    self.args = args

    for k in kwargs:
        if k not in ['rowid', 'path', 'type', 'checksum', 'cos', 'cart',
                     'ttypes', 'dim', 'fails', 'reported', 'last_check',
                     'probability', 'in_db', 'dirty']:
            raise StandardError("Attribute %s is invalid for Checkable" % k)
        setattr(self, k, kwargs[k])

    for attr in ['checksum', 'fails', 'reported']:
        if getattr(self, attr) is None:
            setattr(self, attr, 0)

    # Set up dimensions based on configuration. If no dimensions option is
    # set in the configuration, we just leave the dimensions dict empty.
    # Since this class is only used by the cv_plugin, it makes no sense for
    # this code to be running if there is no cv section in the
    # configuration, so we'll let that exception get thrown up the stack.
    cfg = CrawlConfig.add_config()
    self.dim = {}
    try:
        dim_l = util.csv_list(cfg.get('cv', 'dimensions'))
        for dname in dim_l:
            self.dim[dname] = Dimension.get_dim(dname)
    except CrawlConfig.NoOptionError:
        pass

    super(Checkable, self).__init__()


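# A brief construction sketch for Checkable (path and COS invented). This
# assumes a configuration is already loaded, since __init__ consults it
# for cv dimensions. Keyword arguments map directly onto the attributes
# initialized above; anything outside the accepted list raises
# StandardError.
def _checkable_example():
    """
    Build an in-memory Checkable for a hypothetical HPSS file.
    """
    c = Checkable(path='/home/someuser/somefile', type='f', cos='6001')
    # defaults still apply for everything not passed in
    assert c.checksum == 0 and c.last_check == 0 and not c.in_db
    return c

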
def get_list(cls, how_many=-1, prob=0.1, rootlist=[]):
    """
    Return the current list of Checkables from the database.
    """
    if how_many < 0:
        cfg = CrawlConfig.add_config()
        how_many = int(cfg.get_d('cv', 'operations', '30'))

    rval = Checkable.load_priority_list()
    if how_many <= len(rval):
        return rval

    rval.extend(Checkable.load_recheck_list(how_many))
    if how_many <= len(rval):
        return rval

    db = CrawlDBI.DBI(dbtype='crawler')
    kw = {'table': 'checkables',
          'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                     'checksum', 'last_check', 'fails', 'reported'],
          'orderby': 'last_check'}
    if 0 < how_many:
        kw['limit'] = how_many
    rows = db.select(**kw)

    # check whether any roots from rootlist are missing and if so, add them
    # to the table
    reselect = False
    pathlist = [x[1] for x in rows]
    for root in rootlist:
        if root not in pathlist:
            nr = Checkable(path=root, type='d')
            nr.load()
            nr.persist()
            reselect = True
    if reselect:
        rows = db.select(**kw)

    for row in rows:
        tmp = list(row)
        new = Checkable(rowid=tmp.pop(0),
                        path=tmp.pop(0),
                        type=tmp.pop(0),
                        cos=tmp.pop(0),
                        cart=tmp.pop(0),
                        ttypes=tmp.pop(0),
                        checksum=tmp.pop(0),
                        last_check=tmp.pop(0),
                        fails=tmp.pop(0),
                        reported=tmp.pop(0),
                        probability=prob,
                        in_db=True,
                        dirty=False)
        if new not in rval:
            rval.append(new)
        if how_many <= len(rval):
            break

    db.close()
    CrawlConfig.log("returning %d items" % len(rval))
    return rval


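# A usage sketch for get_list() along the lines of what the cv plugin
# would run each pass (the root path is hypothetical). rootlist ensures
# the named directories exist in the checkables table before selection;
# items come back oldest-checked first.
def _next_batch_example():
    """
    Fetch and display the next batch of items to verify.
    """
    for item in Checkable.get_list(how_many=5,
                                   rootlist=['/home/hpss/root']):
        print("%s %s" % (item.type, item.path))

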
def dispatch(self):
    """
    Figure out where we're supposed to send this alert and send it.
    Possible destinations are the log file, one or more e-mail addresses,
    and/or a shell program.

    It's also possible for a 'use' option to show up in the alerts section.
    In this case, we're being redirected to another section. The 'use'
    option can also point at the current alerts section. There's no reason
    to ever do this, but it could happen, so we want to handle it in a
    reasonable way.

    That's why we sort the config options in the while statement below --
    to make 'use' get handled last, so any other options in the section
    will get handled. Once we process 'use', anything not yet processed in
    the current section is ignored.
    """
    if self.cfg is not None:
        cfg = self.cfg
    else:
        cfg = CrawlConfig.add_config()

    if self.caller != '':
        section = cfg.get(self.caller, 'alerts')
    else:
        section = 'alerts'

    done = False
    while not done:
        for opt in sorted(cfg.options(section)):
            if opt == 'log':
                # write to log
                fmt = cfg.get(section, 'log')
                CrawlConfig.log(fmt, self.msg)
                done = True
            elif opt == 'shell':
                # run the program
                cmd = cfg.get(section, 'shell')
                if '%s' in cmd:
                    cmdline = cmd % (self.msg)
                else:
                    cmdline = cmd
                os.system(cmdline)
                CrawlConfig.log("ran: '%s'" % (cmdline))
                done = True
            elif opt == 'email':
                CrawlMail.send(cfg=cfg,
                               to="%s.email" % section,
                               subj="HPSS Integrity Crawler ALERT",
                               msg=self.msg)
                done = True
            elif opt == 'use':
                # delegate to another section
                done = True
                new_section = cfg.get(section, 'use')
                # if it's the same section, ignore the 'use', but we don't
                # want to break the rule that all options after a 'use' are
                # ignored. So we set done to True to terminate the while
                # loop and break unconditionally at the end of this clause
                # to get out of the for loop
                if new_section != section:
                    section = new_section
                    done = False
                break


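# A hedged sample of the configuration dispatch() consumes -- the section
# and option names follow the code above, but the address and command are
# invented. Because options are processed in sorted order, 'email', 'log',
# and 'shell' all fire before 'use' is considered:
#
#     [alerts]
#     email = hpss-admin@example.com
#     log   = Alert raised: %s
#     shell = /usr/local/bin/page_oncall '%s'
#     use   = other_alerts
#
# When 'use' names a different section, dispatching starts over in that
# section; when it names the current one, it is ignored and the loop ends.

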
def crl_history(argv):
    """history - access to the plugin history

    usage: crawl history [--load|--show|--reset]

    --load {all,cv,mpra,tcc,rpt}
        Load the history table from listed plugin tables, log file

    --show
        Read the history table and report its contents.

    --reset
        Drop the history table.

    --read-log FILENAME
        If --load is specified and includes 'cv', read FILENAME and load cv
        history from it.

    To load just cv data,

        --load cv --read-log FILENAME

    To load just mpra data,

        --load mpra

    To load all plugins,

        --load all (or "") --read-log FILENAME

    If --load contains 'cv' but --read-log is not specified, an error
    message will be issued.

    If --load contains 'all' or is empty and --read-log is not specified, a
    warning will be issued to notify the user that cv data is not being
    loaded.

    If --load does not contain 'cv' or 'all' and is not empty and --read-log
    is specified, a warning will be issued that the log file is not being
    read and cv data is not being loaded.
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-n', '--dry-run',
                 action='store_true', default=False, dest='dryrun',
                 help='just report')
    p.add_option('-l', '--load',
                 action='store', default=None, dest='loadlist',
                 help='plugins to load')
    p.add_option('-r', '--read-log',
                 action='store', default=None, dest='filename',
                 help='log file for cv history')
    p.add_option('-R', '--reset',
                 action='store_true', default=False, dest='reset',
                 help='drop the history table')
    p.add_option('-s', '--show',
                 action='store', default='unset', dest='show',
                 help='Report the contents of the history table')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    # This is saying, if any two of our primary command line options are set,
    # we have a problem since they are all mutually exclusive.
    if o.show == 'unset':
        o.show = None
    if any([all([o.loadlist is not None, o.reset]),
            all([o.loadlist is not None, o.show]),
            all([o.reset, o.show])]):
        raise SystemExit(MSG.history_options)

    if o.dryrun:
        cfg = CrawlConfig.add_config()
        table = cfg.get('dbi-crawler', 'tbl_prefix') + '_history'
        dbname = cfg.get('dbi-crawler', 'dbname')
        hostname = cfg.get('dbi-crawler', 'hostname')

    if o.show:
        # This option is non-destructive, so we ignore --dry-run for it.
        history_show(o.show)
    elif o.reset:
        if o.dryrun:
            print(MSG.history_reset_dryrun_SSS % (table, dbname, hostname))
        else:
            print(dbschem.drop_table(table='history'))
    elif o.loadlist is not None:
        if o.dryrun:
            print(MSG.history_load_dryrun_SSSS %
                  (table, dbname, hostname, o.filename))
        else:
            history_load(o.loadlist, o.filename)


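# Hypothetical command lines for this subcommand, matching the options
# defined above (paths and plugin mixes are invented; the value given to
# --show is whatever history_show() accepts):
#
#     crawl history --show SPEC
#     crawl history --reset --dry-run
#     crawl history --load cv --read-log /var/log/hpssic/crawl.log
#     crawl history --load mpra,tcc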