def check_path(path, verbose=False, plugin=True, xof=True):
    """
    Check whether *path* has the expected number of copies and report if not.

    If plugin is True, we want to log and store, which tcc_report does by
    default so we leave those flags alone. If plugin is False, we're
    interactive and we want to write any report to stdout. However, we only
    make a report if 1) verbose is True, or 2) the counts don't match.

    :param path: HPSS namespace path to check
    :param verbose: interactive mode only -- report even when counts match
    :param plugin: True when running inside the crawler; False interactively
    :param xof: interactively, raise SystemExit (True) or re-raise
        HpssicError (False) when the bitfile lookup fails
    """
    cosinfo = get_cos_info()
    nsobj = path_nsobject(path)
    try:
        bfl = get_bitfile_set(int(nsobj), 1)
    except U.HpssicError as e:
        if plugin:
            # crawler mode: log the problem and carry on
            CrawlConfig.log(e.value)
            return
        elif xof:
            # interactive: abort the program with the message
            raise SystemExit(e.value)
        else:
            # interactive library use: propagate a fresh HpssicError
            raise U.HpssicError(e.value)
    bf = U.pop0(bfl)
    # actual segment count vs the count expected for the file's COS
    sc_count = int(bf['SC_COUNT'])
    cos_count = int(cosinfo[bf['BFATTR_COS_ID']])
    if plugin and sc_count != cos_count:
        tcc_report(bf, path=path)
    elif not plugin and (verbose or sc_count != cos_count):
        print(tcc_report(bf, path=path, log=False, store=False))
def verify(self, h):
    """
    Attempt to verify the current file.

    Run 'hashverify' through the hsi session *h* and classify the response.
    """
    CrawlConfig.log("hsi(%d) attempting to verify %s" % (h.pid(), self.path))
    rsp = h.hashverify(self.path)

    if "TIMEOUT" in rsp or "ERROR" in rsp:
        # transfer trouble -- count the failure and drop the hsi session
        self.set('fails', self.fails + 1)
        CrawlConfig.log(
            "hashverify transfer incomplete on %s -- skipping" % self.path)
        h.quit()
        return "skipped"

    if "%s: (md5) OK" % self.path in rsp:
        CrawlConfig.log("hashverify matched on %s" % self.path)
        return "matched"

    if "no valid checksum found" in rsp:
        # no stored checksum yet -- maybe add the file to the sample
        if self.addable(self.cos):
            return self.add_to_sample(h)
        self.set('checksum', 0)
        CrawlConfig.log("hashverify skipped %s" % self.path)
        return "skipped"

    # anything else is treated as a checksum mismatch
    mismatch = Alert.Alert("Checksum mismatch: %s" % rsp)
    CrawlConfig.log("hashverify generated 'Checksum mismatch' " +
                    "alert on %s" % self.path)
    return mismatch
def tcc_report(bitfile, cosinfo=None, path=None, log=True, store=True):
    """
    The bitfile appears to not have the right number of copies. We're going
    to write its information out to a report for manual followup.

    :param bitfile: dict-like bitfile record (uses 'BFID', 'BFATTR_COS_ID',
        'SC_COUNT')
    :param cosinfo: mapping of COS id -> expected copy count; fetched via
        get_cos_info() when None (bug fix: the argument used to be ignored
        and unconditionally re-fetched)
    :param path: file path to report; looked up from the BFID when None
    :param log: when True, write the report line to the crawler log
    :param store: when True, append the report line to the report file
    :return: the formatted report line
    """
    if cosinfo is None:
        cosinfo = get_cos_info()
    fmt = "%7s %8s %8s %s"
    hdr = fmt % ("COS", "Ccopies", "Fcopies", "Filepath")

    # Compute the bitfile's path
    if path is None:
        bfp = get_bitfile_path(bitfile['BFID'])
    else:
        bfp = path
    rpt = fmt % (bitfile['BFATTR_COS_ID'],
                 str(cosinfo[bitfile['BFATTR_COS_ID']]),
                 str(bitfile['SC_COUNT']),
                 bfp)
    if log:
        CrawlConfig.log(rpt)
    if store:
        try:
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
        except AttributeError:
            # First store in this process: open the report file and write
            # the header line. (Bug fix: the header previously lacked a
            # trailing newline and ran into the first report line.)
            cfg = CrawlConfig.get_config()
            rptfname = cfg.get(sectname(), 'report_file')
            tcc_report._f = open(rptfname, 'a')
            tcc_report._f.write(hdr + "\n")
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
    return rpt
def simplug(plugin, args):
    """
    Common plugin simulator. May be used by the interactive tools to
    simulate running the associated plugin.
    """
    prs = optparse.OptionParser()
    prs.add_option('-d', '--debug',
                   action='store_true', default=False, dest='debug',
                   help='run the debugger')
    prs.add_option('-i', '--iterations',
                   action='store', default=1, dest='iterations', type='int',
                   help='how many iterations to run')
    (opts, _) = prs.parse_args(args)
    if opts.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log("starting %s simplug, just got config" % plugin)
    sys.path.append(cfg.get('crawler', 'plugin-dir'))
    modname = cfg.get(plugin, 'module')
    try:
        P = __import__(modname)
    except ImportError:
        # fall back to importing from the hpssic.plugins package
        H = __import__('hpssic.plugins.' + modname)
        P = getattr(H.plugins, modname)
    P.main(cfg)

    # additional iterations, separated by the plugin's configured frequency
    if opts.iterations > 1:
        for _ in range(opts.iterations - 1):
            time.sleep(cfg.get_time(plugin, 'frequency'))
            P.main(cfg)
def tcc_report(bitfile, cosinfo=None, path=None, log=True, store=True):
    """
    The bitfile appears to not have the right number of copies. We're going
    to write its information out to a report for manual followup.

    :param bitfile: dict-like bitfile record (uses 'BFID', 'BFATTR_COS_ID',
        'SC_COUNT')
    :param cosinfo: mapping of COS id -> expected copy count; fetched via
        get_cos_info() when None (bug fix: the argument used to be ignored
        and unconditionally re-fetched)
    :param path: file path to report; looked up from the BFID when None
    :param log: when True, write the report line to the crawler log
    :param store: when True, append the report line to the report file
    :return: the formatted report line
    """
    if cosinfo is None:
        cosinfo = get_cos_info()
    fmt = "%7s %8s %8s %s"
    hdr = fmt % ("COS", "Ccopies", "Fcopies", "Filepath")

    # Compute the bitfile's path
    if path is None:
        bfp = get_bitfile_path(bitfile['BFID'])
    else:
        bfp = path
    rpt = fmt % (bitfile['BFATTR_COS_ID'],
                 str(cosinfo[bitfile['BFATTR_COS_ID']]),
                 str(bitfile['SC_COUNT']),
                 bfp)
    if log:
        CrawlConfig.log(rpt)
    if store:
        try:
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
        except AttributeError:
            # First store in this process: open the report file and write
            # the header line. (Bug fix: the header previously lacked a
            # trailing newline and ran into the first report line.)
            cfg = CrawlConfig.get_config()
            rptfname = cfg.get(sectname(), 'report_file')
            tcc_report._f = open(rptfname, 'a')
            tcc_report._f.write(hdr + "\n")
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
    return rpt
def crl_cfgdump(argv):
    """cfgdump - load a config file and dump its contents

    usage: crawl cfgdump -c <filename> [--to stdout|log] [--logpath <path>]
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg', action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug', action='store_true', default=False,
                 dest='debug', help='run the debugger')
    p.add_option('-t', '--to', action='store', default='', dest='target',
                 help='specify where to send the output')
    p.add_option('-l', '--logpath', action='store', default='',
                 dest='logpath', help='specify where to send the output')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    # default destination is stdout
    if o.target == '':
        o.target = 'stdout'

    cfg = CrawlConfig.get_config(o.config)
    dumpstr = cfg.dump()

    if o.target == 'stdout':
        # fix: print statement -> function, consistent with the print()
        # calls elsewhere in this file (and py3-compatible)
        print(dumpstr)
    elif o.target == 'log':
        # fix: the logger returned by CrawlConfig.log() was bound to an
        # unused local; we only need the call for its side effect of
        # initializing the log destination
        CrawlConfig.log(logpath=o.logpath, cfg=cfg)
        for line in dumpstr.split("\n"):
            CrawlConfig.log(line)
def crl_log(argv):
    """log - write a message to the indicated log file

    usage: crawl log --log <filename> <message>
    """
    parser = optparse.OptionParser()
    parser.add_option('-d', '--debug',
                      action='store_true', default=False, dest='debug',
                      help='run the debugger')
    parser.add_option('-l', '--log',
                      action='store', default=None, dest='logfile',
                      help='specify the log file')
    (opts, words) = parser.parse_args(argv)
    if opts.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    # everything left on the command line is the message
    CrawlConfig.log(" ".join(words), logpath=opts.logfile, cfg=cfg)
def verify(self, h):
    """
    Attempt to verify the current file.

    Runs 'hashverify' through the hsi session *h* and classifies the
    response.

    :param h: hsi session object (provides pid(), hashverify(), quit())
    :return: "skipped", "matched", the result of add_to_sample(), or an
        Alert.Alert object on checksum mismatch
    """
    CrawlConfig.log("hsi(%d) attempting to verify %s" %
                    (h.pid(), self.path))
    rsp = h.hashverify(self.path)
    if "TIMEOUT" in rsp or "ERROR" in rsp:
        # transfer trouble -- count the failure and drop the hsi session
        rval = "skipped"
        self.set('fails', self.fails + 1)
        CrawlConfig.log("hashverify transfer incomplete on %s -- skipping" %
                        self.path)
        h.quit()
    elif "%s: (md5) OK" % self.path in rsp:
        rval = "matched"
        CrawlConfig.log("hashverify matched on %s" % self.path)
    elif "no valid checksum found" in rsp:
        # no stored checksum yet -- maybe add the file to the sample
        if self.addable(self.cos):
            rval = self.add_to_sample(h)
        else:
            self.set('checksum', 0)
            rval = "skipped"
            CrawlConfig.log("hashverify skipped %s" % self.path)
    else:
        # anything else is treated as a checksum mismatch
        rval = Alert.Alert("Checksum mismatch: %s" % rsp)
        CrawlConfig.log("hashverify generated 'Checksum mismatch' " +
                        "alert on %s" % self.path)
    return rval
def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    If we check a range and get no hits (i.e., no NSOBJECT ids exist in the
    range), we'll store
       (<time>, <low-id>, <high-id>, 0, 0)

    If we get a hit with the right copy count, we store it by itself as
       (<time>, <hit-id>, <hit-id>, 1, 0)

    If we get a hit with the wrong copy count, we store it by itself as
       (<time>, <hit-id>, <hit-id>, 0, 1)
    """
    tabname = cfg.get(sectname(), 'table_name')
    # Ensure the table exists; the status string it returns is not needed
    # here (fix: previously bound to an unused local)
    dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))
    db = CrawlDBI.DBI(dbtype="crawler")
    db.insert(table=tabname,
              fields=['check_time', 'low_nsobj_id', 'high_nsobj_id',
                      'correct', 'error'],
              data=[(ts, low, high, correct, error)])
    db.close()
def check_path(path, verbose=False, plugin=True, xof=True):
    """
    Verify the copy count for *path*, reporting a mismatch.

    If plugin is True, we want to log and store, which tcc_report does by
    default so we leave those flags alone. If plugin is False, we're
    interactive and we want to write any report to stdout. However, we only
    make a report if 1) verbose is True, or 2) the counts don't match.
    """
    cosinfo = get_cos_info()
    nsobj = path_nsobject(path)
    try:
        bfl = get_bitfile_set(int(nsobj), 1)
    except U.HpssicError as err:
        if plugin:
            # crawler mode: log the problem and carry on
            CrawlConfig.log(err.value)
            return
        if xof:
            # interactive: abort the program with the message
            raise SystemExit(err.value)
        # interactive library use: propagate a fresh HpssicError
        raise U.HpssicError(err.value)

    bitfile = U.pop0(bfl)
    actual = int(bitfile['SC_COUNT'])
    expected = int(cosinfo[bitfile['BFATTR_COS_ID']])
    mismatch = actual != expected
    if plugin:
        if mismatch:
            tcc_report(bitfile, path=path)
    elif verbose or mismatch:
        print(tcc_report(bitfile, path=path, log=False, store=False))
def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    If we check a range and get no hits (i.e., no NSOBJECT ids exist in the
    range), we'll store
       (<time>, <low-id>, <high-id>, 0, 0)

    If we get a hit with the right copy count, we store it by itself as
       (<time>, <hit-id>, <hit-id>, 1, 0)

    If we get a hit with the wrong copy count, we store it by itself as
       (<time>, <hit-id>, <hit-id>, 0, 1)
    """
    tabname = cfg.get(sectname(), 'table_name')
    # Ensure the table exists; the status string it returns is not needed
    # here (fix: previously bound to an unused local)
    dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))
    db = CrawlDBI.DBI(dbtype="crawler")
    db.insert(table=tabname,
              fields=['check_time', 'low_nsobj_id', 'high_nsobj_id',
                      'correct', 'error'],
              data=[(ts, low, high, correct, error)])
    db.close()
def crl_start(argv):
    """start - if the crawler is not already running as a daemon, start it

    usage: crawl start

    default config file: crawl.cfg, or $CRAWL_CONF, or -c <filename> on
    command line

    default log file: /var/log/crawl.log, or $CRAWL_LOG, or -l <filename> on
    command line
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg', action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug', action='store_true', default=False,
                 dest='debug', help='run the debugger')
    p.add_option('-l', '--log', action='store', default='', dest='logfile',
                 help='specify the log file')
    p.add_option('-C', '--context', action='store', default='',
                 dest='context',
                 help="context of crawler ('TEST' or 'PROD')")
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)

    #
    # Initialize the configuration
    #
    if o.context != '':
        cfg.set('crawler', 'context', o.context)

    try:
        exitpath = cfg.get('crawler', 'exitpath')
    except CrawlConfig.NoOptionError:
        # fix: exception object was bound to an unused local
        print("No exit path is specified in the configuration")
        sys.exit(1)

    vstr = "HPSS Integrity Crawler version %s" % version.__version__
    log = CrawlConfig.log(vstr, logpath=o.logfile, cfg=cfg)
    pfpath = make_pidfile(os.getpid(),
                          cfg.get('crawler', 'context'),
                          exitpath,
                          just_check=True)
    crawler = CrawlDaemon(pfpath,
                          stdout="crawler.stdout",
                          stderr="crawler.stderr",
                          logger=log,
                          workdir='.')
    CrawlConfig.log('crl_start: calling crawler.start()')
    crawler.start()
    # fix: removed dead trailing 'pass'
def load_recheck_list(cls, how_many):
    """
    Look to see whether any of the already checksummed items in the database
    have a last check time over the threshold for rechecking. If so, we'll
    shove some of them to the front of the list based on the configuration.
    """
    cfg = CrawlConfig.add_config()
    fraction = float(cfg.get_d('cv', 'recheck_fraction', '0.0'))
    age = cfg.get_time('cv', 'recheck_age', 365 * 24 * 3600)
    threshold = int(time.time() - age)
    CrawlConfig.log("threshold = %s (%d)", U.ymdhms(threshold), threshold)
    if fraction == 0.0:
        return []

    # pull the oldest-checked items, capped at fraction * how_many
    db = CrawlDBI.DBI(dbtype='crawler')
    rows = db.select(table='checkables',
                     fields=['rowid', 'path', 'type', 'cos', 'cart',
                             'ttypes', 'checksum', 'last_check', 'fails',
                             'reported'],
                     where='checksum <> 0 and last_check < %d' % threshold,
                     orderby='last_check',
                     limit=round(fraction * how_many))
    db.close()

    recheck = []
    for row in rows:
        # consume the fields in select order to build each Checkable
        vals = list(row)
        recheck.append(Checkable(rowid=vals.pop(0),
                                 path=vals.pop(0),
                                 type=vals.pop(0),
                                 cos=vals.pop(0),
                                 cart=vals.pop(0),
                                 ttypes=vals.pop(0),
                                 checksum=vals.pop(0),
                                 last_check=vals.pop(0),
                                 fails=vals.pop(0),
                                 reported=vals.pop(0),
                                 in_db=True,
                                 dirty=False))
    return recheck
def populate_cart(self, h):
    """
    Fill in the cart field
    """
    rsp = h.lsP(self.path)
    detail = rsp.split("\n")[1]
    parsed = Checkable.fdparse(detail)
    try:
        self.cart = parsed.cart
    except AttributeError:
        # parse result carries no cart attribute -- record an empty cart
        self.cart = ''
    CrawlConfig.log("%s <- Checkable.fdparse('%s')" % (parsed, detail))
def populate_cart(self, h):
    """
    Fill in the cart field.

    Runs 'ls -P' on self.path through hsi session *h* and parses the second
    line of the response. If the parse result has no cart attribute, the
    cart is recorded as the empty string.

    :param h: hsi session object (provides lsP())
    """
    rsp = h.lsP(self.path)
    tmp = Checkable.fdparse(rsp.split("\n")[1])
    try:
        self.cart = tmp.cart
    except AttributeError:
        # fdparse may not produce an object with a cart attribute
        self.cart = ''
    CrawlConfig.log("%s <- Checkable.fdparse('%s')" %
                    (tmp, rsp.split("\n")[1]))
def load_recheck_list(cls, how_many):
    """
    Look to see whether any of the already checksummed items in the database
    have a last check time over the threshold for rechecking. If so, we'll
    shove some of them to the front of the list based on the configuration.

    :param how_many: size of the batch being assembled; at most
        recheck_fraction * how_many items are returned
    :return: list of Checkable objects due for recheck (possibly empty)
    """
    cfg = CrawlConfig.add_config()
    r_fraction = float(cfg.get_d('cv', 'recheck_fraction', '0.0'))
    # default recheck age: one year
    r_age = cfg.get_time('cv', 'recheck_age', 365*24*3600)
    threshold = int(time.time() - r_age)
    CrawlConfig.log("threshold = %s (%d)", U.ymdhms(threshold), threshold)
    if r_fraction == 0.0:
        return []
    limit = round(r_fraction * how_many)
    db = CrawlDBI.DBI(dbtype='crawler')
    kw = {'table': 'checkables',
          'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                     'checksum', 'last_check', 'fails', 'reported'],
          'where': 'checksum <> 0 and last_check < %d' % threshold,
          'orderby': 'last_check',
          'limit': limit}
    rows = db.select(**kw)
    db.close()
    rval = []
    for row in rows:
        # pop the fields off in select order to build each Checkable
        tmp = list(row)
        new = Checkable(rowid=tmp.pop(0),
                        path=tmp.pop(0),
                        type=tmp.pop(0),
                        cos=tmp.pop(0),
                        cart=tmp.pop(0),
                        ttypes=tmp.pop(0),
                        checksum=tmp.pop(0),
                        last_check=tmp.pop(0),
                        fails=tmp.pop(0),
                        reported=tmp.pop(0),
                        in_db=True,
                        dirty=False)
        rval.append(new)
    return rval
def fire(self):
    """
    Run the plugin.

    If the plugin is currently firable, run its main() and record the run
    in history. Otherwise, when verbose is configured, log that it was not
    firable and advance last_fired anyway so it stays on schedule.
    """
    if self.firable:
        CrawlConfig.log("%s: firing" % self.name)
        # sys.modules[self.modname].main(self.cfg)
        errors = self.plugin.main(self.cfg)
        self.last_fired = time.time()
        # record the outcome (including any error count) in history
        crawl_sublib.record_history(self.name, self.last_fired, errors)
    elif self.cfg.getboolean('crawler', 'verbose'):
        CrawlConfig.log("%s: not firable" % self.name)
        self.last_fired = time.time()
def add_to_sample(self, hsi, already_hashed=False):
    """
    Add the current Checkable to the sample.

    If already_hashed is True, this is a file for which a checksum has
    already been computed. We just need to record that fact by setting its
    checksum member to 1 and updating the sample count.

    If already_hashed is False, we need to carry out the following steps:

     1) run hashcreate on the file
     2) set checksum to non-zero to record that we have a checksum
     3) update the sample count in the Dimension object

    :param hsi: hsi session object (provides hashcreate(), quit())
    :param already_hashed: skip running hashcreate when True
    :return: "skipped", "access denied", or "checksummed"
    """
    if not already_hashed:
        CrawlConfig.log("starting hashcreate on %s", self.path)
        rsp = hsi.hashcreate(self.path)
        if "TIMEOUT" in rsp or "ERROR" in rsp:
            # transfer trouble -- count the failure and drop the session
            CrawlConfig.log("hashcreate transfer failed on %s", self.path)
            hsi.quit()
            self.set('fails', self.fails + 1)
            return "skipped"
        elif "Access denied" in rsp:
            CrawlConfig.log("hashcreate failed with 'access denied' on %s",
                            self.path)
            hsi.quit()
            return "access denied"
        else:
            CrawlConfig.log("completed hashcreate on %s", self.path)

    if self.checksum == 0:
        # first checksum for this item: bump each dimension's sample count
        for dn in self.dim:
            cat = getattr(self, dn)
            self.dim[dn].addone(cat)
        self.set('checksum', 1)
    return "checksummed"
def add_to_sample(self, hsi, already_hashed=False):
    """
    Add the current Checkable to the sample.

    If already_hashed is True, this is a file for which a checksum has
    already been computed. We just need to record that fact by setting its
    checksum member to 1 and updating the sample count.

    If already_hashed is False, we need to carry out the following steps:

     1) run hashcreate on the file
     2) set checksum to non-zero to record that we have a checksum
     3) update the sample count in the Dimension object
    """
    if not already_hashed:
        CrawlConfig.log("starting hashcreate on %s", self.path)
        rsp = hsi.hashcreate(self.path)

        if "TIMEOUT" in rsp or "ERROR" in rsp:
            # transfer trouble -- count the failure and drop the session
            CrawlConfig.log("hashcreate transfer failed on %s", self.path)
            hsi.quit()
            self.set('fails', self.fails + 1)
            return "skipped"

        if "Access denied" in rsp:
            CrawlConfig.log("hashcreate failed with 'access denied' on %s",
                            self.path)
            hsi.quit()
            return "access denied"

        CrawlConfig.log("completed hashcreate on %s", self.path)

    if self.checksum == 0:
        # first checksum for this item: bump each dimension's sample count
        for name in self.dim:
            self.dim[name].addone(getattr(self, name))
        self.set('checksum', 1)

    return "checksummed"
def crl_fire(argv):
    """fire - run a plugin

    usage: crawl fire --cfg cfgname --logpath logfname --plugin plugname
    """
    parser = optparse.OptionParser()
    parser.add_option('-c', '--cfg',
                      action='store', default='', dest='config',
                      help='config file name')
    parser.add_option('-d', '--debug',
                      action='store_true', default=False, dest='debug',
                      help='run the debugger')
    parser.add_option('-l', '--logpath',
                      action='store', default='', dest='logpath',
                      help='specify where to send the output')
    parser.add_option('-p', '--plugin',
                      action='store', default='', dest='plugname',
                      help='which plugin to fire')
    (opts, _) = parser.parse_args(argv)

    if opts.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(opts.config)
    CrawlConfig.log(logpath=opts.logpath, cfg=cfg)

    if opts.plugname == '':
        print("'-p <plugin-name>' is required")
    elif not cfg.has_section(opts.plugname):
        print("No plugin named '%s' found in configuration" % opts.plugname)
    else:
        # make the plugin directory importable, then run the plugin's main
        sys.path.append(cfg.get('crawler', 'plugin-dir'))
        __import__(opts.plugname)
        CrawlConfig.log('firing %s', opts.plugname)
        sys.modules[opts.plugname].main(cfg)
def crl_cfgdump(argv):
    """cfgdump - load a config file and dump its contents

    usage: crawl cfgdump -c <filename> [--to stdout|log] [--logpath <path>]
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg', action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug', action='store_true', default=False,
                 dest='debug', help='run the debugger')
    p.add_option('-t', '--to', action='store', default='', dest='target',
                 help='specify where to send the output')
    p.add_option('-l', '--logpath', action='store', default='',
                 dest='logpath', help='specify where to send the output')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    # default destination is stdout
    if o.target == '':
        o.target = 'stdout'

    cfg = CrawlConfig.get_config(o.config)
    dumpstr = cfg.dump()

    if o.target == 'stdout':
        # fix: print statement -> function, consistent with the print()
        # calls elsewhere in this file (and py3-compatible)
        print(dumpstr)
    elif o.target == 'log':
        # fix: the logger returned by CrawlConfig.log() was bound to an
        # unused local; we only need the call for its side effect of
        # initializing the log destination
        CrawlConfig.log(logpath=o.logpath, cfg=cfg)
        for line in dumpstr.split("\n"):
            CrawlConfig.log(line)
def highest_nsobject_id():
    """
    Cache and return the largest NSOBJECT id in the DB2 database.

    The variables highest_nsobject_id._max_obj_id and
    highest_nsobject_id._when are local to this function but do not lose
    their values between invocations.
    """
    # the cached value goes stale 60 seconds after it was fetched
    stale = (not hasattr(highest_nsobject_id, '_max_obj_id') or
             not hasattr(highest_nsobject_id, '_when') or
             60 < time.time() - highest_nsobject_id._when)
    if stale:
        highest_nsobject_id._max_obj_id = max_nsobj_id()
        highest_nsobject_id._when = time.time()
        CrawlConfig.log("max object id = %d at %s" %
                        (highest_nsobject_id._max_obj_id,
                         util.ymdhms(highest_nsobject_id._when)))
    return highest_nsobject_id._max_obj_id
def get_last_rpt_time(db):
    """
    Retrieve the last report time from the report table. If the table does
    not exist before make_table ('Created' in result), the table is empty so
    we just return 0 to indicate no last report time.
    """
    created = dbschem.make_table("report")
    if "Created" in created:
        last_time = 0
    else:
        rows = db.select(table='report', fields=['max(report_time)'])
        last_time = rows[0][0]
        # max() over an empty table yields NULL (None)
        if last_time is None:
            last_time = 0
    CrawlConfig.log("time of last report: %d" % last_time)
    return last_time
def highest_nsobject_id():
    """
    Cache and return the largest NSOBJECT id in the DB2 database.

    The variables highest_nsobject_id._max_obj_id and
    highest_nsobject_id._when are local to this function but do not lose
    their values between invocations.

    :return: the (possibly cached) maximum NSOBJECT id
    """
    # refresh the cache if it's missing or older than 60 seconds
    if (not hasattr(highest_nsobject_id, '_max_obj_id') or
            not hasattr(highest_nsobject_id, '_when') or
            60 < time.time() - highest_nsobject_id._when):
        highest_nsobject_id._max_obj_id = max_nsobj_id()
        highest_nsobject_id._when = time.time()
        CrawlConfig.log("max object id = %d at %s" %
                        (highest_nsobject_id._max_obj_id,
                         util.ymdhms(highest_nsobject_id._when)))
    rval = highest_nsobject_id._max_obj_id
    return rval
def get_last_rpt_time(db):
    """
    Retrieve the last report time from the report table. If the table does
    not exist before make_table ('Created' in result), the table is empty so
    we just return 0 to indicate no last report time.

    :param db: open CrawlDBI handle to the crawler database
    :return: epoch time of the last report, or 0 if none
    """
    result = dbschem.make_table("report")
    if "Created" in result:
        rval = 0
    else:
        rows = db.select(table='report', fields=['max(report_time)'])
        (rval) = rows[0][0]
        # max() over an empty table yields NULL (None)
        if rval is None:
            rval = 0
    CrawlConfig.log("time of last report: %d" % rval)
    return rval
def hashcreate(self, pathnames):
    """
    Argument pathnames should reference one or more files. It may be a
    string containing one or more space separated file paths, or a list of
    one or more file paths. If it has type unicode, it will be encoded to
    'ascii' before being treated as a string.

    :param pathnames: str, unicode, or list of file paths to hash
    :return: accumulated hsi output; may end with " TIMEOUT" or " ERROR"
    :raises HSIerror: if pathnames is not str, unicode, or list
    """
    if type(pathnames) == str:
        pathlist = pathnames.split()
    elif type(pathnames) == list:
        pathlist = pathnames
    elif type(pathnames) == unicode:
        pathlist = pathnames.encode('ascii', 'ignore').split()
    else:
        raise HSIerror("%s: Invalid argument (%s: '%s')" %
                       (util.my_name(), type(pathnames), pathnames))
    rval = ""
    for path in pathlist:
        if self.reset_atime:
            # remember the access time so it can be restored after hashing
            prev_time = self.access_time(path)
        if self.hash_algorithm is None:
            cmd = "hashcreate %s" % path
        else:
            cmd = "hashcreate -H %s %s" % (self.hash_algorithm, path)
        self.xobj.sendline(cmd)
        which = self.xobj.expect([self.prompt, pexpect.TIMEOUT] +
                                 self.hsierrs)
        while which == 1 and 1 < len(self.xobj.before):
            # pexpect timed out but output is still arriving -- collect it
            # and keep waiting for the prompt
            CrawlConfig.log("got a timeout, continuing because before " +
                            "is not empty and does not contain an error")
            rval += self.xobj.before
            which = self.xobj.expect([self.prompt, pexpect.TIMEOUT] +
                                     self.hsierrs)
        rval += self.xobj.before
        # index 0 is the prompt (success); 1 is a real timeout; anything
        # else matched one of the known error patterns
        if 1 == which:
            rval += " TIMEOUT"
        elif 0 != which:
            rval += " ERROR"
        if self.reset_atime:
            self.touch(path, when=prev_time)
    return rval
def crl_log(argv):
    """log - write a message to the indicated log file

    usage: crawl log --log <filename> <message>
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug', action='store_true', default=False,
                 dest='debug', help='run the debugger')
    p.add_option('-l', '--log', action='store', default=None,
                 dest='logfile', help='specify the log file')
    (o, a) = p.parse_args(argv)
    if o.debug:
        pdb.set_trace()
    cfg = CrawlConfig.get_config()
    # everything left on the command line is the message
    CrawlConfig.log(" ".join(a), logpath=o.logfile, cfg=cfg)
def __init__(self, name=None, cfg=None):
    """
    Initialize the plugin from configuration data.

    Configuration data is read and copied into the object by method
    init_cfg_data(), called by both the constructor and reload().
    init_cfg_data() reverses the order of cfg and name in its argument list
    from the constructor so name can have a default and reload() doesn't
    have to pass it.

    last_fired is initialized by the constructor but not by reload(). So if
    the plugin is updated by a reconfigure, it won't lose its last fire time
    but will stay on the same schedule.
    """
    assert(name is not None)
    assert(cfg is not None)
    self.cfg = cfg
    # Re-initialize the log against this config; the returned logger is
    # not needed here (fix: it was bound to an unused local 'l', a name
    # easily confused with '1')
    CrawlConfig.log(cfg=cfg, close=True)
    CrawlConfig.log("%s: Initializing plugin data" % name)
    self.init_cfg_data(name, cfg)
    # back-date last_fired so the plugin is immediately eligible to fire
    self.last_fired = time.time() - self.frequency - 1
    super(CrawlPlugin, self).__init__()
def addable(self):
    """
    Determine which Dimensions want this item added. Note that we want this
    routine to be general across dimensions so we don't want it to assume
    anything about the dimension it's checking (like that it's named 'cos'
    for example). That why calls to this pass in cos rather than looking at
    the value in the object.
    """
    for dname in self.dim:
        value = getattr(self, dname)
        if self.dim[dname].vote(value) is False:
            CrawlConfig.log("%s votes against %s -- skipping" %
                            (dname, self.path))
            return False

    # probabilistic sampling: keep the item with probability
    # self.probability
    roll = random.random()
    if self.probability < roll:
        CrawlConfig.log("random votes against %s -- skipping (%g < %g)" %
                        (self.path, self.probability, roll))
        return False

    return True
def addable(self):
    """
    Determine which Dimensions want this item added.

    Note that we want this routine to be general across dimensions so we
    don't want it to assume anything about the dimension it's checking (like
    that it's named 'cos' for example). That why calls to this pass in cos
    rather than looking at the value in the object.

    :return: False if any dimension (or the random draw) votes no,
        otherwise True
    """
    for dn in self.dim:
        cval = getattr(self, dn)
        if self.dim[dn].vote(cval) is False:
            CrawlConfig.log("%s votes against %s -- skipping" %
                            (dn, self.path))
            return False
    # probabilistic sampling: keep the item with probability
    # self.probability
    randval = random.random()
    if self.probability < randval:
        CrawlConfig.log("random votes against %s -- skipping (%g < %g)" %
                        (self.path, self.probability, randval))
        return False
    return True
def mpra_fetch_recent(type):
    """
    Retrieve and return the most recent record reported so we don't report
    the same record repeatedly.

    :param type: record type to look up in the mpra table
    :return: end_time of the most recent scan of *type*, or 0 when nothing
        has been recorded yet
    """
    db = CrawlDBI.DBI(dbtype="crawler")
    try:
        if not db.table_exists(table='mpra'):
            CrawlConfig.log("Fetch from not existent mpra table -- return 0")
            return 0
        rows = db.select(table='mpra',
                         fields=['scan_time, end_time'],
                         where='type = ?',
                         data=(type,))
    finally:
        # fix: the DBI handle was never closed (every other DB user in
        # this module closes its handle)
        db.close()

    # find the end_time belonging to the newest scan_time
    last_end_time = -1
    max_scan_time = 0
    for r in rows:
        if max_scan_time < r[0]:
            max_scan_time = r[0]
            last_end_time = r[1]

    if last_end_time < 0:
        CrawlConfig.log("No '%s' value in mpra -- returning 0" % type)
        return 0
    else:
        CrawlConfig.log("Fetch '%s' from mpra table -- return %d" %
                        (type, last_end_time))
        return last_end_time
def simplug(plugin, args):
    """
    Common plugin simulator. May be used by the interactive tools to
    simulate running the associated plugin.

    :param plugin: name of the plugin's config section
    :param args: command line arguments (after the subcommand)
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug', action='store_true', default=False,
                 dest='debug', help='run the debugger')
    p.add_option('-i', '--iterations', action='store', default=1,
                 dest='iterations', type='int',
                 help='how many iterations to run')
    (o, a) = p.parse_args(args)
    if o.debug:
        pdb.set_trace()
    cfg = CrawlConfig.get_config()
    CrawlConfig.log("starting %s simplug, just got config" % plugin)
    sys.path.append(cfg.get('crawler', 'plugin-dir'))
    modname = cfg.get(plugin, 'module')
    try:
        P = __import__(modname)
    except ImportError:
        # fall back to importing from the hpssic.plugins package
        H = __import__('hpssic.plugins.' + modname)
        P = getattr(H.plugins, modname)
    P.main(cfg)
    if 1 < o.iterations:
        # sleep the plugin's configured frequency between extra iterations
        for count in range(o.iterations - 1):
            stime = cfg.get_time(plugin, 'frequency')
            time.sleep(stime)
            P.main(cfg)
def maybe_update_hsi():
    """
    If the hsi wrapper script has changed, grab and edit a fresh copy.

    Compares the BINARYVERSION line of the installed wrapper (first 'hsi'
    on the path) with the copy found under sources/hpss; when they differ,
    the source copy is edited to exec the real binary and written over the
    installed wrapper.
    """
    l = util.which_all('hsi')
    trg = l[0]
    tc = util.contents(trg).split("\n")
    tv = util.grep('^BINARYVERSION=', tc)

    # locate the pristine copy under sources/hpss
    s = [x for x in l if 'sources/hpss' in x]
    src = s[0]
    sc = util.contents(src).split("\n")
    sv = util.grep('^BINARYVERSION=', sc)

    if tv[0] != sv[0]:
        # prefix the ${EXECUTABLE} invocation line with 'exec'
        z = util.grep("${EXECUTABLE}", sc, regex=False, index=True)
        sc[z[0]] = "exec " + sc[z[0]]
        try:
            # fix: use a context manager so the target file is closed even
            # if the write fails partway through (was a bare open/close
            # that leaked the handle on error)
            with open(trg, 'w') as f:
                f.writelines("\n".join(sc) + "\n")
        except IOError:
            CrawlConfig.log(MSG.hsi_wrap_ood)
def crl_fire(argv):
    """fire - run a plugin

    usage: crawl fire --cfg cfgname --logpath logfname --plugin plugname
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg', action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug', action='store_true', default=False,
                 dest='debug', help='run the debugger')
    p.add_option('-l', '--logpath', action='store', default='',
                 dest='logpath', help='specify where to send the output')
    p.add_option('-p', '--plugin', action='store', default='',
                 dest='plugname', help='which plugin to fire')
    (o, a) = p.parse_args(argv)
    if o.debug:
        pdb.set_trace()
    cfg = CrawlConfig.get_config(o.config)
    CrawlConfig.log(logpath=o.logpath, cfg=cfg)
    if o.plugname == '':
        print("'-p <plugin-name>' is required")
    elif not cfg.has_section(o.plugname):
        print("No plugin named '%s' found in configuration" % o.plugname)
    else:
        # make the plugin directory importable, then run the plugin's main
        plugdir = cfg.get('crawler', 'plugin-dir')
        sys.path.append(plugdir)
        __import__(o.plugname)
        CrawlConfig.log('firing %s', o.plugname)
        sys.modules[o.plugname].main(cfg)
def quit(self):
    """
    All done here. Let's bail.
    """
    try:
        pid = self.xobj.pid
        self.xobj.sendline("quit")
        self.xobj.expect([pexpect.EOF, pexpect.TIMEOUT])
        self.xobj.close()
        CrawlConfig.log("Closing hsi process %d" % pid)
    except OSError as err:
        # the process may already be gone -- log the traceback and move on
        trace = tb.format_exc()
        CrawlConfig.log("Ignoring OSError '%s'" % str(err))
        for tline in trace.split("\n"):
            CrawlConfig.log(tline)
def dispatch(self):
    """
    Figure out where we're supposed to send this alert and send it.

    Possible destinations are the log file, one or more e-mail addresses,
    and/or a shell program. It's also possible for a 'use' option to show up
    in the alerts section. In this case, we're being redirected to another
    section, also 'use' can also point to the current alerts section.
    There's no reason to ever do this, but it could happen so we want to
    handle it in a reasonable way.

    That's why we sort the config options in the while statement below -- to
    make 'use' get handled last, so any other options in the section will
    get handled. Once we process 'use', anything not yet processed in the
    current section is ignored.
    """
    # prefer the alert's own config; fall back to the shared one
    if self.cfg is not None:
        cfg = self.cfg
    else:
        cfg = CrawlConfig.add_config()

    # the caller's section may name a dedicated alerts section
    if self.caller != '':
        section = cfg.get(self.caller, 'alerts')
    else:
        section = 'alerts'

    done = False
    while not done:
        for opt in sorted(cfg.options(section)):
            if opt == 'log':
                # write to log
                fmt = cfg.get(section, 'log')
                CrawlConfig.log(fmt, self.msg)
                done = True
            elif opt == 'shell':
                # run the program; substitute the message if the command
                # contains a %s placeholder
                cmd = cfg.get(section, 'shell')
                if '%s' in cmd:
                    cmdline = cmd % (self.msg)
                else:
                    cmdline = cmd
                os.system(cmdline)
                CrawlConfig.log("ran: '%s'" % (cmdline))
                done = True
            elif opt == 'email':
                CrawlMail.send(cfg=cfg,
                               to="%s.email" % section,
                               subj="HPSS Integrity Crawler ALERT",
                               msg=self.msg)
                done = True
            elif opt == 'use':
                # delegate to another section
                done = True
                new_section = cfg.get(section, 'use')
                # if it's the same section, ignore the 'use', but we don't
                # want to break the rule that all options after a 'use' are
                # ignored. So we set done to True to terminate the while
                # loop and break unconditionally at the end of this clause
                # to get out of the for loop
                if new_section != section:
                    section = new_section
                    done = False
                break
def run(self):
    """
    This routine runs in the background as a daemon. Here's where we fire
    off plug-ins as appropriate.

    Outer loop: (re)load the plugin configuration. Inner loop: once per
    second, fire due plugins, emit a heartbeat, watch for config changes
    (break to the outer loop to reload) and for the exit file (shut down).
    """
    cfgname = ''
    self.cfg = CrawlConfig.get_config(cfgname)
    self.pidfile = "%s/%d" % (self.piddir, os.getpid())
    exit_file = self.cfg.get('crawler', 'exitpath')
    ctx = self.cfg.get('crawler', 'context')
    clean_defunct_pidfiles(ctx)
    make_pidfile(os.getpid(), ctx, exit_file)
    atexit.register(self.delpid)
    keep_going = True
    plugin_d = {}
    while keep_going:
        try:
            pluglstr = self.cfg.get('crawler', 'plugins')
            pluglist = [x.strip() for x in pluglstr.split(',')]
            for s in pluglist:
                self.dlog('crawl: CONFIG: [%s]' % s)
                for o in self.cfg.options(s):
                    self.dlog('crawl: CONFIG: %s: %s' %
                              (o, self.cfg.get(s, o)))
                if s == 'crawler':
                    continue
                elif s in plugin_d.keys():
                    CrawlConfig.log("reloading plugin %s" % s)
                    plugin_d[s].reload(self.cfg)
                else:
                    CrawlConfig.log("initial load of plugin %s" % s)
                    plugin_d[s] = CrawlPlugin.CrawlPlugin(name=s,
                                                          cfg=self.cfg)

            # remove any plugins that are not in the new configuration
            for p in plugin_d.keys():
                if p not in self.cfg.sections():
                    CrawlConfig.log("unloading obsolete plugin %s" % p)
                    del plugin_d[p]

            heartbeat = self.cfg.get_time('crawler', 'heartbeat', 10)
            while keep_going:
                #
                # Fire any plugins that are due
                #
                if not self.cfg.quiet_time(time.time()):
                    hb_msg = "crawl: heartbeat..."
                    if self.fire_plugins(plugin_d):
                        keep_going = False
                else:
                    hb_msg = "crawl: heartbeat... [quiet]"

                # CrawlConfig.log("issue the heartbeat")
                #
                # Issue the heartbeat if it's time
                #
                if 0 == (int(time.time()) % heartbeat):
                    # self.dlog(hb_msg)
                    CrawlConfig.log(hb_msg)

                # CrawlConfig.log("check for config changes")
                #
                # If config file has changed, reload it by resetting the
                # cached config object and breaking out of the inner loop.
                #
                if self.cfg.changed():
                    cfgname = self.cfg.get('crawler', 'filename')
                    self.cfg = CrawlConfig.get_config(reset=True)
                    break

                # CrawlConfig.log("check for exit signal")
                #
                # Check for the exit signal
                #
                if util.conditional_rm(exit_file):
                    self.dlog('crawl: shutting down')
                    keep_going = False

                # CrawlConfig.log("sleep")
                #
                # We cycle once per second so we can detect if the user
                # asks us to stop or if the config file changes and needs
                # to be reloaded
                #
                time.sleep(1.0)

        except:
            # if we get an exception, write the traceback to the log file
            tbstr = tb.format_exc()
            for line in tbstr.split('\n'):
                self.dlog("crawl: '%s'" % line)
            keep_going = False
def run(self):
    """
    This routine runs in the background as a daemon.  Here's where we
    fire off plug-ins as appropriate.

    Outer loop: (re)load the plugin list from the config.  Inner loop:
    once per second, fire due plugins, emit a heartbeat, watch for a
    config change (which breaks back to the outer loop to reload), and
    watch for the exit file (which terminates the daemon).
    """
    # Passing '' forces get_config() to resolve the config name itself
    # (presumably default file / environment -- confirm in CrawlConfig).
    cfgname = ''
    self.cfg = CrawlConfig.get_config(cfgname)
    self.pidfile = "%s/%d" % (self.piddir, os.getpid())
    exit_file = self.cfg.get('crawler', 'exitpath')
    ctx = self.cfg.get('crawler', 'context')
    clean_defunct_pidfiles(ctx)
    make_pidfile(os.getpid(), ctx, exit_file)
    atexit.register(self.delpid)
    keep_going = True
    plugin_d = {}
    while keep_going:
        try:
            # Parse the comma-separated plugin list from the config and
            # load/reload each named plugin section
            pluglstr = self.cfg.get('crawler', 'plugins')
            pluglist = [x.strip() for x in pluglstr.split(',')]
            for s in pluglist:
                self.dlog('crawl: CONFIG: [%s]' % s)
                for o in self.cfg.options(s):
                    self.dlog('crawl: CONFIG: %s: %s' %
                              (o, self.cfg.get(s, o)))
                if s == 'crawler':
                    # 'crawler' is the daemon's own section, not a plugin
                    continue
                elif s in plugin_d.keys():
                    CrawlConfig.log("reloading plugin %s" % s)
                    plugin_d[s].reload(self.cfg)
                else:
                    CrawlConfig.log("initial load of plugin %s" % s)
                    plugin_d[s] = CrawlPlugin.CrawlPlugin(name=s,
                                                          cfg=self.cfg)

            # remove any plugins that are not in the new configuration
            # NOTE(review): deleting from plugin_d while iterating
            # plugin_d.keys() is safe on Python 2 (keys() is a list) but
            # would raise RuntimeError on Python 3 -- confirm target
            # interpreter before porting.
            for p in plugin_d.keys():
                if p not in self.cfg.sections():
                    CrawlConfig.log("unloading obsolete plugin %s" % p)
                    del plugin_d[p]

            heartbeat = self.cfg.get_time('crawler', 'heartbeat', 10)
            while keep_going:
                #
                # Fire any plugins that are due; outside quiet time a
                # True return from fire_plugins() shuts the daemon down
                #
                if not self.cfg.quiet_time(time.time()):
                    hb_msg = "crawl: heartbeat..."
                    if self.fire_plugins(plugin_d):
                        keep_going = False
                else:
                    hb_msg = "crawl: heartbeat... [quiet]"

                #
                # Issue the heartbeat if it's time (every *heartbeat*
                # seconds, checked on the one-second cycle below)
                #
                if 0 == (int(time.time()) % heartbeat):
                    CrawlConfig.log(hb_msg)

                #
                # If the config file has changed, reload it by resetting
                # the cached config object and breaking out of the inner
                # loop (the outer loop re-reads the plugin list)
                #
                if self.cfg.changed():
                    cfgname = self.cfg.get('crawler', 'filename')
                    self.cfg = CrawlConfig.get_config(reset=True)
                    break

                #
                # Check for the exit signal: if the exit file exists,
                # conditional_rm() removes it and returns truthy
                #
                if util.conditional_rm(exit_file):
                    self.dlog('crawl: shutting down')
                    keep_going = False

                #
                # We cycle once per second so we can detect if the user
                # asks us to stop or if the config file changes and needs
                # to be reloaded
                #
                time.sleep(1.0)
        except:
            # if we get an exception, write the traceback to the log file
            # (deliberate catch-all: the daemon must log, not die silently)
            tbstr = tb.format_exc()
            for line in tbstr.split('\n'):
                self.dlog("crawl: '%s'" % line)
            keep_going = False
def crl_start(argv):
    """start - if the crawler is not already running as a daemon, start it

    usage: crawl start

    default config file: crawl.cfg, or $CRAWL_CONF, or -c <filename> on
    command line

    default log file: /var/log/crawl.log, or $CRAWL_LOG, or -l <filename>
    on command line
    """
    # Parse the command line
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--log',
                 action='store', default='', dest='logfile',
                 help='specify the log file')
    p.add_option('-C', '--context',
                 action='store', default='', dest='context',
                 help="context of crawler ('TEST' or 'PROD')")
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)

    #
    # Initialize the configuration: a -C on the command line overrides
    # the context from the config file
    #
    if o.context != '':
        cfg.set('crawler', 'context', o.context)

    # The exit path is required -- the daemon watches it for the stop
    # signal (fix: dropped the unused exception binding)
    try:
        exitpath = cfg.get('crawler', 'exitpath')
    except CrawlConfig.NoOptionError:
        print("No exit path is specified in the configuration")
        sys.exit(1)

    vstr = "HPSS Integrity Crawler version %s" % version.__version__
    log = CrawlConfig.log(vstr, logpath=o.logfile, cfg=cfg)

    # just_check=True: verify no other crawler holds the pidfile without
    # creating one yet (the daemon creates its own in run())
    pfpath = make_pidfile(os.getpid(),
                          cfg.get('crawler', 'context'),
                          exitpath,
                          just_check=True)
    crawler = CrawlDaemon(pfpath,
                          stdout="crawler.stdout",
                          stderr="crawler.stderr",
                          logger=log,
                          workdir='.')
    CrawlConfig.log('crl_start: calling crawler.start()')
    crawler.start()
    # fix: removed a dead trailing 'pass' statement
def send(to='', subj='', msg='', sender='', cfg=None):
    """
    Send e-mail as indicated.

    sender precedence: argument, cfg, default value; if type(sender) is not
    str, throw the exception.

    *to* may be '' (look up 'crawler'/'notify-e-mail' in *cfg*), a
    comma-separated address list, a 'section.option' config reference, or
    a plain recipient token.

    Raises util.HpssicError for non-string arguments, an invalid sender,
    or an empty recipient list with no config to fall back on.
    """
    # Validate argument types up front
    if type(to) != str:
        raise util.HpssicError(MSG.invalid_recip_list)
    if sender is not None and type(sender) != str:
        raise util.HpssicError(MSG.invalid_sender_S % str(sender))
    if type(msg) != str:
        raise util.HpssicError(MSG.invalid_msg_body)
    if subj is not None and type(subj) != str:
        raise util.HpssicError(MSG.invalid_subject_S % str(subj))

    # Prepare a message object based on *msg*
    if msg:
        payload = email.mime.text.MIMEText(msg)
    else:
        payload = email.mime.text.MIMEText(MSG.empty_message)

    # Set the recipient address(es) based on *to*
    default_recip = '*****@*****.**'
    if to == '':
        if cfg is None:
            raise util.HpssicError(MSG.no_recip_list)
        else:
            (section, option) = ('crawler', 'notify-e-mail')
            addrs = cfg.get(section, option)
    elif ',' in to or '@' in to:
        # already an address or address list -- use verbatim
        addrs = to
    elif '.' in to:
        # 'section.option' config reference
        # NOTE(review): to.split('.') assumes exactly one dot; more than
        # one raises ValueError -- confirm intended input format
        if cfg is None:
            addrs = default_recip
        else:
            (section, option) = to.split('.')
            addrs = cfg.get_d(section, option, default_recip)
    else:
        # BUG FIX: previously no branch handled a bare token with no
        # ',', '@', or '.' (e.g. 'root'), leaving *addrs* unassigned and
        # raising UnboundLocalError below.  Treat it as a literal
        # (local) recipient.
        addrs = to

    addrlist = [x.strip() for x in addrs.split(',')]
    payload['To'] = addrs

    # Set the subject based on *subj*
    if subj:
        payload['Subject'] = subj
    else:
        payload['Subject'] = MSG.default_mail_subject

    # Set the from address: argument wins, then cfg, then default
    default_sender = 'hpssic@%s' % util.hostname(long=True)
    if sender is None or sender == '':
        if cfg is not None:
            sender = cfg.get_d('crawler', 'from_address', default_sender)
        else:
            sender = default_sender
    elif '@' not in sender:
        raise util.HpssicError(MSG.invalid_sender_S % str(sender))
    payload['From'] = sender

    # Send the message through the local MTA
    s = smtplib.SMTP('localhost')
    s.sendmail(sender, addrlist, payload.as_string())
    s.quit()

    # Log it
    CrawlConfig.log("sent mail to %s", addrs)
def dispatch(self):
    """
    Figure out where we're supposed to send this alert and send it.
    Possible destinations are the log file, one or more e-mail addresses,
    and/or a shell program.

    It's also possible for a 'use' option to show up in the alerts
    section.  In this case, we're being redirected to another section,
    also 'use' can also point to the current alerts section.  There's no
    reason to ever do this, but it could happen so we want to handle it in
    a reasonable way.

    That's why we sort the config options in the while statement below --
    to make 'use' get handled last, so any other options in the section
    will get handled.  Once we process 'use', anything not yet processed
    in the current section is ignored.
    """
    # Prefer the alert's own config; fall back to the shared one
    if self.cfg is not None:
        cfg = self.cfg
    else:
        cfg = CrawlConfig.add_config()

    # The caller's section names which alerts section applies to it
    if self.caller != '':
        section = cfg.get(self.caller, 'alerts')
    else:
        section = 'alerts'

    done = False
    while not done:
        # sorted() guarantees 'email' < 'log' < 'shell' < 'use', so
        # 'use' is always processed after the real destinations
        for opt in sorted(cfg.options(section)):
            if opt == 'log':
                # write to log
                fmt = cfg.get(section, 'log')
                CrawlConfig.log(fmt, self.msg)
                done = True
            elif opt == 'shell':
                # run the program; substitute the message if the command
                # contains a '%s' placeholder
                cmd = cfg.get(section, 'shell')
                if '%s' in cmd:
                    cmdline = cmd % (self.msg)
                else:
                    cmdline = cmd
                os.system(cmdline)
                CrawlConfig.log("ran: '%s'" % (cmdline))
                done = True
            elif opt == 'email':
                # '<section>.email' tells CrawlMail.send where to find
                # the recipient list in the config
                CrawlMail.send(cfg=cfg,
                               to="%s.email" % section,
                               subj="HPSS Integrity Crawler ALERT",
                               msg=self.msg)
                done = True
            elif opt == 'use':
                # delegate to another section
                done = True
                new_section = cfg.get(section, 'use')
                # if it's the same section, ignore the 'use', but we don't
                # want to break the rule that all options after a 'use' are
                # ignored. So we set done to True to terminate the while
                # loop and break unconditionally at the end of this clause
                # to get out of the for loop
                if new_section != section:
                    section = new_section
                    done = False
                break
def check(self): """ For a directory: - get a list of its contents if possible, - create a Checkable object for each item and persist it to the database - return the list of Checkables found in the directory For a file: - if it already has a hash, add it to the sample if not already and verify it - if it does not have a hash, decide whether to add it or not The value of probability [0.0 .. 1.0] indicates the likelihood with which we should check files. potential outcomes return read a directory list of Checkable objects file checksum fail Alert invalid Checkable type raise StandardError access denied "access denied" verified file checksum "matched" checksum a file "checksummed" skipped a file "skipped" hpss unavailable "unavailable" Here we examine a population member, count it as a member of the population, decide whether to add it to the sample, and if so, count it as a sample member. First, we have to make all the decisions and update the object accordingly. Then, we persist the object to the database. 
""" # fire up hsi # self.probability = probability rval = [] cfg = CrawlConfig.get_config() # hsi_timeout = int(cfg.get_d('crawler', 'hsi_timeout', 300)) try: # h = hpss.HSI(timeout=hsi_timeout, verbose=True) h = hpss.HSI(verbose=True) CrawlConfig.log("started hsi with pid %d" % h.pid()) except hpss.HSIerror as e: return "unavailable" if self.type == 'd': rsp = h.lsP(self.path) if "Access denied" in rsp: rval = "access denied" else: for line in rsp.split("\n"): new = Checkable.fdparse(line) if new is not None: rval.append(new) new.load() new.persist() # returning list of items found in the directory elif self.type == 'f': if self.cart is None: self.populate_cart(h) if self.checksum == 0: if self.has_hash(h): self.add_to_sample(h, already_hashed=True) rval = self.verify(h) # returning "matched", "checksummed", "skipped", or Alert() elif self.addable(): rval = self.add_to_sample(h) # returning "access denied" or "checksummed" else: rval = "skipped" else: rval = self.verify(h) # returning "matched", "checksummed", "skipped", or Alert() else: raise StandardError("Invalid Checkable type: %s" % self.type) if (3 < self.fails) and (0 == self.reported): self.fail_report(h.before()) rval = "skipped" h.quit() self.set('last_check', time.time()) CrawlConfig.log( "Persisting checkable '%s' with %s = %f, %s = %d" % (self.path, 'last_check', self.last_check, 'fails', self.fails)) self.persist() return rval
def get_list(cls, how_many=-1, prob=0.1, rootlist=None):
    """
    Return the current list of Checkables from the database.

    Priority items come first, then items due for recheck, then the
    least-recently-checked rows from the checkables table, up to
    *how_many* entries (default: config 'cv'/'operations', or 30).
    Any paths in *rootlist* missing from the table are inserted first.

    *prob* is passed through to each constructed Checkable as its check
    probability.
    """
    # FIX: rootlist previously defaulted to a mutable [] (shared across
    # calls); use the None-sentinel idiom instead -- backward compatible.
    if rootlist is None:
        rootlist = []

    if how_many < 0:
        cfg = CrawlConfig.add_config()
        how_many = int(cfg.get_d('cv', 'operations', '30'))

    rval = Checkable.load_priority_list()
    if how_many <= len(rval):
        return rval

    rval.extend(Checkable.load_recheck_list(how_many))
    if how_many <= len(rval):
        return rval

    db = CrawlDBI.DBI(dbtype='crawler')
    kw = {'table': 'checkables',
          'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                     'checksum', 'last_check', 'fails', 'reported'],
          'orderby': 'last_check'}
    if 0 < how_many:
        kw['limit'] = how_many
    rows = db.select(**kw)

    # check whether any roots from rootlist are missing and if so, add them
    # to the table
    reselect = False
    pathlist = [x[1] for x in rows]
    for root in rootlist:
        if root not in pathlist:
            nr = Checkable(path=root, type='d')
            nr.load()
            nr.persist()
            reselect = True
    if reselect:
        # re-run the same query so the new roots appear in the result
        rows = db.select(**kw)

    for row in rows:
        # field order in tmp matches kw['fields']
        tmp = list(row)
        new = Checkable(rowid=tmp.pop(0),
                        path=tmp.pop(0),
                        type=tmp.pop(0),
                        cos=tmp.pop(0),
                        cart=tmp.pop(0),
                        ttypes=tmp.pop(0),
                        checksum=tmp.pop(0),
                        last_check=tmp.pop(0),
                        fails=tmp.pop(0),
                        reported=tmp.pop(0),
                        probability=prob,
                        in_db=True,
                        dirty=False)
        if new not in rval:
            rval.append(new)
        if how_many <= len(rval):
            break

    db.close()
    CrawlConfig.log("returning %d items" % len(rval))
    return rval
def get_list(cls, how_many=-1, prob=0.1, rootlist=None):
    """
    Return the current list of Checkables from the database.

    Priority items come first, then items due for recheck, then the
    least-recently-checked rows from the checkables table, up to
    *how_many* entries (default: config 'cv'/'operations', or 30).
    Any paths in *rootlist* missing from the table are inserted first.

    *prob* is passed through to each constructed Checkable as its check
    probability.
    """
    # FIX: rootlist previously defaulted to a mutable [] (shared across
    # calls); use the None-sentinel idiom instead -- backward compatible.
    if rootlist is None:
        rootlist = []

    if how_many < 0:
        cfg = CrawlConfig.add_config()
        how_many = int(cfg.get_d('cv', 'operations', '30'))

    rval = Checkable.load_priority_list()
    if how_many <= len(rval):
        return rval

    rval.extend(Checkable.load_recheck_list(how_many))
    if how_many <= len(rval):
        return rval

    db = CrawlDBI.DBI(dbtype='crawler')
    kw = {'table': 'checkables',
          'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                     'checksum', 'last_check', 'fails', 'reported'],
          'orderby': 'last_check'}
    if 0 < how_many:
        kw['limit'] = how_many
    rows = db.select(**kw)

    # check whether any roots from rootlist are missing and if so, add them
    # to the table
    reselect = False
    pathlist = [x[1] for x in rows]
    for root in rootlist:
        if root not in pathlist:
            nr = Checkable(path=root, type='d')
            nr.load()
            nr.persist()
            reselect = True
    if reselect:
        # re-run the same query so the new roots appear in the result
        rows = db.select(**kw)

    for row in rows:
        # field order in tmp matches kw['fields']
        tmp = list(row)
        new = Checkable(rowid=tmp.pop(0),
                        path=tmp.pop(0),
                        type=tmp.pop(0),
                        cos=tmp.pop(0),
                        cart=tmp.pop(0),
                        ttypes=tmp.pop(0),
                        checksum=tmp.pop(0),
                        last_check=tmp.pop(0),
                        fails=tmp.pop(0),
                        reported=tmp.pop(0),
                        probability=prob,
                        in_db=True,
                        dirty=False)
        if new not in rval:
            rval.append(new)
        if how_many <= len(rval):
            break

    db.close()
    CrawlConfig.log("returning %d items" % len(rval))
    return rval
def check(self): """ For a directory: - get a list of its contents if possible, - create a Checkable object for each item and persist it to the database - return the list of Checkables found in the directory For a file: - if it already has a hash, add it to the sample if not already and verify it - if it does not have a hash, decide whether to add it or not The value of probability [0.0 .. 1.0] indicates the likelihood with which we should check files. potential outcomes return read a directory list of Checkable objects file checksum fail Alert invalid Checkable type raise StandardError access denied "access denied" verified file checksum "matched" checksum a file "checksummed" skipped a file "skipped" hpss unavailable "unavailable" Here we examine a population member, count it as a member of the population, decide whether to add it to the sample, and if so, count it as a sample member. First, we have to make all the decisions and update the object accordingly. Then, we persist the object to the database. 
""" # fire up hsi # self.probability = probability rval = [] cfg = CrawlConfig.get_config() # hsi_timeout = int(cfg.get_d('crawler', 'hsi_timeout', 300)) try: # h = hpss.HSI(timeout=hsi_timeout, verbose=True) h = hpss.HSI(verbose=True) CrawlConfig.log("started hsi with pid %d" % h.pid()) except hpss.HSIerror as e: return "unavailable" if self.type == 'd': rsp = h.lsP(self.path) if "Access denied" in rsp: rval = "access denied" else: for line in rsp.split("\n"): new = Checkable.fdparse(line) if new is not None: rval.append(new) new.load() new.persist() # returning list of items found in the directory elif self.type == 'f': if self.cart is None: self.populate_cart(h) if self.checksum == 0: if self.has_hash(h): self.add_to_sample(h, already_hashed=True) rval = self.verify(h) # returning "matched", "checksummed", "skipped", or Alert() elif self.addable(): rval = self.add_to_sample(h) # returning "access denied" or "checksummed" else: rval = "skipped" else: rval = self.verify(h) # returning "matched", "checksummed", "skipped", or Alert() else: raise StandardError("Invalid Checkable type: %s" % self.type) if (3 < self.fails) and (0 == self.reported): self.fail_report(h.before()) rval = "skipped" h.quit() self.set('last_check', time.time()) CrawlConfig.log("Persisting checkable '%s' with %s = %f, %s = %d" % (self.path, 'last_check', self.last_check, 'fails', self.fails)) self.persist() return rval