def get_html_report(cfg_file=None, cfg=None):
    """
    Format a report in HTML
    """
    rval = ""
    if cfg is not None:
        # use it
        pass
    elif cfg_file is not None:
        cfg = CrawlConfig.add_config(filename=cfg_file)
    else:
        cfg = CrawlConfig.add_config()

    db = CrawlDBI.DBI(dbtype="crawler")
    last_rpt_time = rpt_lib.get_last_rpt_time(db)
    rval += ('<head><meta http-equiv="refresh" content="60">\n')
    rval += ("<title>HPSSIC Dashboard</title></head>")
    rval += ("<body><center><h1>HPSS Integrity Crawler Dashboard</h1>" +
             "<br><h4>Version %s</h4>" % version.__version__ +
             "</center>\n")
    rval += ("Report generated at %s\n" % time.strftime("%Y.%m%d %H:%M:%S"))
    rval += ("<br>Based on data from %s\n" %
             time.strftime("%Y.%m%d %H:%M:%S",
                           time.localtime(last_rpt_time)))
    rval += get_html_cv_report(db, last_rpt_time)
    rval += get_html_mpra_report(db, last_rpt_time)
    rval += get_html_tcc_report(db, last_rpt_time)
    rval += "</body>"
    db.close()
    return rval
def cvv_ttype_missing(argv):
    """ttype_missing - Report records missing ttype information

    usage: cv ttype_missing [-d]
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-c', '--config',
                 action='store', default='', dest='config',
                 help='configuration to use')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    CrawlConfig.get_config(o.config)
    rec_l = cv_lib.ttype_missing()
    for rec in rec_l:
        print("%-40s %-10s %s %s" % (rec[1],
                                     rec[4],
                                     rec[5],
                                     U.ymdhms(int(rec[7]))))
def simplug(plugin, args):
    """
    Common plugin simulator. May be used by the interactive tools to
    simulate running the associated plugin.
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-i', '--iterations',
                 action='store', default=1, dest='iterations', type='int',
                 help='how many iterations to run')
    (o, a) = p.parse_args(args)
    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log("starting %s simplug, just got config" % plugin)
    sys.path.append(cfg.get('crawler', 'plugin-dir'))
    modname = cfg.get(plugin, 'module')
    try:
        P = __import__(modname)
    except ImportError:
        H = __import__('hpssic.plugins.' + modname)
        P = getattr(H.plugins, modname)
    P.main(cfg)
    if 1 < o.iterations:
        for count in range(o.iterations - 1):
            stime = cfg.get_time(plugin, 'frequency')
            time.sleep(stime)
            P.main(cfg)
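# Illustrative sketch (not from the original source): per the docstring above,
# an interactive subcommand can delegate to simplug() by passing its plugin
# section name and its argument list. The 'cv' section name and the function
# name cvv_simplug are assumptions used only for this example.
def cvv_simplug(argv):
    """simplug - simulate running the cv plugin

    usage: cv simplug [-d] [-i N]
    """
    simplug('cv', argv)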
def verify(self, h):
    """
    Attempt to verify the current file.
    """
    CrawlConfig.log("hsi(%d) attempting to verify %s" %
                    (h.pid(), self.path))
    rsp = h.hashverify(self.path)

    if "TIMEOUT" in rsp or "ERROR" in rsp:
        rval = "skipped"
        self.set('fails', self.fails + 1)
        CrawlConfig.log("hashverify transfer incomplete on %s -- skipping" %
                        self.path)
        h.quit()
    elif "%s: (md5) OK" % self.path in rsp:
        rval = "matched"
        CrawlConfig.log("hashverify matched on %s" % self.path)
    elif "no valid checksum found" in rsp:
        if self.addable(self.cos):
            rval = self.add_to_sample(h)
        else:
            self.set('checksum', 0)
            rval = "skipped"
            CrawlConfig.log("hashverify skipped %s" % self.path)
    else:
        rval = Alert.Alert("Checksum mismatch: %s" % rsp)
        CrawlConfig.log("hashverify generated 'Checksum mismatch' " +
                        "alert on %s" % self.path)

    return rval
def crl_log(argv):
    """log - write a message to the indicated log file

    usage: crawl log --log <filename> <message>
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--log',
                 action='store', default=None, dest='logfile',
                 help='specify the log file')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log(" ".join(a), logpath=o.logfile, cfg=cfg)
def tcc_report(bitfile, cosinfo=None, path=None, log=True, store=True):
    """
    The bitfile appears to not have the right number of copies. We're going
    to write its information out to a report for manual followup.
    """
    if cosinfo is None:
        cosinfo = get_cos_info()
    fmt = "%7s %8s %8s %s"
    hdr = fmt % ("COS", "Ccopies", "Fcopies", "Filepath")

    # Compute the bitfile's path
    if path is None:
        bfp = get_bitfile_path(bitfile['BFID'])
    else:
        bfp = path
    rpt = fmt % (bitfile['BFATTR_COS_ID'],
                 str(cosinfo[bitfile['BFATTR_COS_ID']]),
                 str(bitfile['SC_COUNT']),
                 bfp)
    if log:
        CrawlConfig.log(rpt)
    if store:
        try:
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
        except AttributeError:
            cfg = CrawlConfig.get_config()
            rptfname = cfg.get(sectname(), 'report_file')
            tcc_report._f = open(rptfname, 'a')
            tcc_report._f.write(hdr + "\n")
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
    return rpt
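# Sketch of the report layout produced by fmt and hdr above (the data row is
# hypothetical):
#
#       COS  Ccopies  Fcopies Filepath
#      6001        2        1 /hpss/path/to/file.dat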
def running_pid(proc_required=True, context=None):
    """
    Return a list of (pid, context, exitpath) tuples if the crawler is
    running (per ps(1)) or [] otherwise
    """
    cfg = CrawlConfig.add_config()
    rval = []
    if proc_required:
        result = pidcmd()
        for line in result.split("\n"):
            if 'crawl start' in line:
                pid = int(line.split()[0])
                pfpath = "%s/%d" % (CrawlConfig.pid_dir(), pid)
                if os.path.exists(pfpath):
                    (ctx, xpath) = util.contents(pfpath).strip().split()
                    rval.append((pid, ctx, xpath))
                elif not os.path.exists(pfpath + '.DEFUNCT'):
                    # crawler is running but the pid file has been lost
                    ctx = context or cfg.get('crawler', 'context')
                    xpath = cfg.get_d('crawler', 'exitpath', '%s.exit' % ctx)
                    make_pidfile(pid, ctx, xpath)
                    rval.append((pid, ctx, xpath))
                # if pfpath + '.DEFUNCT' exists, the crawler is shutting down
                # so we don't want to recreate the pid file.
    else:
        pid_l = glob.glob("%s/*" % CrawlConfig.pid_dir())
        for pid_n in pid_l:
            pid = int(os.path.basename(pid_n))
            (ctx, xpath) = util.contents(pid_n).strip().split()
            rval.append((pid, ctx, xpath))
    return rval
def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    If we check a range and get no hits (i.e., no NSOBJECT ids exist in the
    range), we'll store

       (<time>, <low-id>, <high-id>, 0, 0)

    If we get a hit with the right copy count, we store it by itself as

       (<time>, <hit-id>, <hit-id>, 1, 0)

    If we get a hit with the wrong copy count, we store it by itself as

       (<time>, <hit-id>, <hit-id>, 0, 1)
    """
    tabname = cfg.get(sectname(), 'table_name')
    result = dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))
    db = CrawlDBI.DBI(dbtype="crawler")
    db.insert(table=tabname,
              fields=['check_time', 'low_nsobj_id', 'high_nsobj_id',
                      'correct', 'error'],
              data=[(ts, low, high, correct, error)])
    db.close()
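# Illustrative calls (ids are hypothetical): the three storage cases in the
# docstring above map onto calls like these, where cfg is a CrawlConfig
# object.
#
#   record_checked_ids(cfg, 100, 199, 0, 0)   # empty range: no ids found
#   record_checked_ids(cfg, 137, 137, 1, 0)   # hit with the right copy count
#   record_checked_ids(cfg, 142, 142, 0, 1)   # hit with the wrong copy count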
def check_path(path, verbose=False, plugin=True, xof=True):
    """
    If plugin is True, we want to log and store, which tcc_report does by
    default so we leave those flags alone. If plugin is False, we're
    interactive and we want to write any report to stdout. However, we only
    make a report if 1) verbose is True, or 2) the counts don't match.
    """
    cosinfo = get_cos_info()
    nsobj = path_nsobject(path)
    try:
        bfl = get_bitfile_set(int(nsobj), 1)
    except U.HpssicError as e:
        if plugin:
            CrawlConfig.log(e.value)
            return
        elif xof:
            raise SystemExit(e.value)
        else:
            raise U.HpssicError(e.value)

    bf = U.pop0(bfl)
    sc_count = int(bf['SC_COUNT'])
    cos_count = int(cosinfo[bf['BFATTR_COS_ID']])
    if plugin and sc_count != cos_count:
        tcc_report(bf, path=path)
    elif not plugin and (verbose or sc_count != cos_count):
        print(tcc_report(bf, path=path, log=False, store=False))
def crl_cfgdump(argv):
    """cfgdump - load a config file and dump its contents

    usage: crawl cfgdump -c <filename> [--to stdout|log] [--logpath <path>]
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-t', '--to',
                 action='store', default='', dest='target',
                 help='specify where to send the output')
    p.add_option('-l', '--logpath',
                 action='store', default='', dest='logpath',
                 help='specify the log file path')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    if o.target == '':
        o.target = 'stdout'

    cfg = CrawlConfig.get_config(o.config)
    dumpstr = cfg.dump()

    if o.target == 'stdout':
        print(dumpstr)
    elif o.target == 'log':
        CrawlConfig.log(logpath=o.logpath, cfg=cfg)
        for line in dumpstr.split("\n"):
            CrawlConfig.log(line)
def crl_start(argv):
    """start - if the crawler is not already running as a daemon, start it

    usage: crawl start

    default config file: crawl.cfg, or $CRAWL_CONF, or -c <filename> on
    command line

    default log file: /var/log/crawl.log, or $CRAWL_LOG, or -l <filename> on
    command line
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--log',
                 action='store', default='', dest='logfile',
                 help='specify the log file')
    p.add_option('-C', '--context',
                 action='store', default='', dest='context',
                 help="context of crawler ('TEST' or 'PROD')")
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)

    #
    # Initialize the configuration
    #
    if o.context != '':
        cfg.set('crawler', 'context', o.context)
    try:
        exitpath = cfg.get('crawler', 'exitpath')
    except CrawlConfig.NoOptionError as e:
        print("No exit path is specified in the configuration")
        sys.exit(1)

    vstr = "HPSS Integrity Crawler version %s" % version.__version__
    log = CrawlConfig.log(vstr, logpath=o.logfile, cfg=cfg)
    pfpath = make_pidfile(os.getpid(),
                          cfg.get('crawler', 'context'),
                          exitpath,
                          just_check=True)
    crawler = CrawlDaemon(pfpath,
                          stdout="crawler.stdout",
                          stderr="crawler.stderr",
                          logger=log,
                          workdir='.')
    CrawlConfig.log('crl_start: calling crawler.start()')
    crawler.start()
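# Sketch of the [crawler] options crl_start() consults (values are
# hypothetical; exitpath is required, and context may also be supplied with
# the -C option):
#
#   [crawler]
#   context  = PROD
#   exitpath = /tmp/crawler.exit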
def clean_defunct_pidfiles(context):
    """
    Remove .DEFUNCT pid files for *context*
    """
    cfg = CrawlConfig.add_config()
    pdir = CrawlConfig.pid_dir()
    for path in glob.glob(os.path.join(pdir, '*.DEFUNCT')):
        c = util.contents(path)
        if context in c:
            os.unlink(path)
def cvv_report(argv):
    """report - show the checksum verifier database status

    select count(*) from checkables where type = 'f';
    select count(*) from checkables where checksum <> 0;
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-p', '--prefix',
                 action='store', default='', dest='prefix',
                 help='table name prefix')
    p.add_option('-v', '--verbose',
                 action='store_true', default=False, dest='verbose',
                 help='pass verbose flag to HSI object')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    if o.config != '':
        cfg = CrawlConfig.get_config(o.config)
    else:
        cfg = CrawlConfig.get_config()

    if o.prefix != '':
        cfg.set('dbi', 'tbl_prefix', o.prefix)

    dim = {}
    dim['cos'] = Dimension.get_dim('cos')
    dim['ttypes'] = Dimension.get_dim('ttypes')

    print(dim['cos'].report())
    print(dim['ttypes'].report())
def populate_cart(self, h):
    """
    Fill in the cart field
    """
    rsp = h.lsP(self.path)
    tmp = Checkable.fdparse(rsp.split("\n")[1])
    try:
        self.cart = tmp.cart
    except AttributeError:
        self.cart = ''
        CrawlConfig.log("%s <- Checkable.fdparse('%s')" %
                        (tmp, rsp.split("\n")[1]))
def load_recheck_list(cls, how_many):
    """
    Look to see whether any of the already checksummed items in the database
    have a last check time over the threshold for rechecking. If so, we'll
    shove some of them to the front of the list based on the configuration.
    """
    cfg = CrawlConfig.add_config()
    r_fraction = float(cfg.get_d('cv', 'recheck_fraction', '0.0'))
    r_age = cfg.get_time('cv', 'recheck_age', 365 * 24 * 3600)
    threshold = int(time.time() - r_age)
    CrawlConfig.log("threshold = %s (%d)", U.ymdhms(threshold), threshold)
    if r_fraction == 0.0:
        return []

    limit = round(r_fraction * how_many)

    db = CrawlDBI.DBI(dbtype='crawler')
    kw = {'table': 'checkables',
          'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                     'checksum', 'last_check', 'fails', 'reported'],
          'where': 'checksum <> 0 and last_check < %d' % threshold,
          'orderby': 'last_check',
          'limit': limit}
    rows = db.select(**kw)
    db.close()

    rval = []
    for row in rows:
        tmp = list(row)
        new = Checkable(rowid=tmp.pop(0),
                        path=tmp.pop(0),
                        type=tmp.pop(0),
                        cos=tmp.pop(0),
                        cart=tmp.pop(0),
                        ttypes=tmp.pop(0),
                        checksum=tmp.pop(0),
                        last_check=tmp.pop(0),
                        fails=tmp.pop(0),
                        reported=tmp.pop(0),
                        in_db=True,
                        dirty=False)
        rval.append(new)
    return rval
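# Sketch of the [cv] options consulted above (values are hypothetical, and
# the exact time-spec syntax accepted by cfg.get_time() is an assumption):
#
#   [cv]
#   recheck_fraction = 0.1
#   recheck_age      = 90 d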
def cvv_show_next(argv):
    """show_next - Report the Checkables in the order they will be checked

    usage: cvtool show_next
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--config',
                 action='store', default='', dest='config',
                 help='alternate configuration')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-i', '--id',
                 action='store', default='', dest='id',
                 help='id of entry to be checked')
    p.add_option('-l', '--limit',
                 action='store', default=-1, dest='limit', type=int,
                 help='max records to get')
    p.add_option('-p', '--path',
                 action='store', default='', dest='path',
                 help='name of path to be checked')
    p.add_option('-v', '--verbose',
                 action='store_true', default=False, dest='verbose',
                 help='more information')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    if o.config:
        cfg = CrawlConfig.add_config(close=True, filename=o.config)
    else:
        cfg = CrawlConfig.add_config()

    if o.limit < 0:
        limit = int(cfg.get_d('cv', 'operations', '10'))
    else:
        limit = o.limit

    clist = Checkable.Checkable.get_list(limit)

    for c in clist:
        if c.last_check == 0:
            print("%18d %s %s" % (c.last_check, c.type, c.path))
        else:
            print("%s %s %s" % (U.ymdhms(c.last_check), c.type, c.path))
def fire(self):
    """
    Run the plugin.
    """
    if self.firable:
        CrawlConfig.log("%s: firing" % self.name)
        # sys.modules[self.modname].main(self.cfg)
        errors = self.plugin.main(self.cfg)
        self.last_fired = time.time()
        crawl_sublib.record_history(self.name, self.last_fired, errors)
    elif self.cfg.getboolean('crawler', 'verbose'):
        CrawlConfig.log("%s: not firable" % self.name)
        self.last_fired = time.time()
def add_to_sample(self, hsi, already_hashed=False):
    """
    Add the current Checkable to the sample.

    If already_hashed is True, this is a file for which a checksum has
    already been computed. We just need to record that fact by setting its
    checksum member to 1 and updating the sample count.

    If already_hashed is False, we need to carry out the following steps:

      1) run hashcreate on the file
      2) set checksum to non-zero to record that we have a checksum
      3) update the sample count in the Dimension object
    """
    if not already_hashed:
        CrawlConfig.log("starting hashcreate on %s", self.path)
        rsp = hsi.hashcreate(self.path)
        if "TIMEOUT" in rsp or "ERROR" in rsp:
            CrawlConfig.log("hashcreate transfer failed on %s", self.path)
            hsi.quit()
            self.set('fails', self.fails + 1)
            return "skipped"
        elif "Access denied" in rsp:
            CrawlConfig.log("hashcreate failed with 'access denied' on %s",
                            self.path)
            hsi.quit()
            return "access denied"
        else:
            CrawlConfig.log("completed hashcreate on %s", self.path)

    if self.checksum == 0:
        for dn in self.dim:
            cat = getattr(self, dn)
            self.dim[dn].addone(cat)
        self.set('checksum', 1)

    return "checksummed"
def crl_fire(argv):
    """fire - run a plugin

    usage: crawl fire --cfg cfgname --logpath logfname --plugin plugname
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--logpath',
                 action='store', default='', dest='logpath',
                 help='specify where to send the output')
    p.add_option('-p', '--plugin',
                 action='store', default='', dest='plugname',
                 help='which plugin to fire')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)
    CrawlConfig.log(logpath=o.logpath, cfg=cfg)

    if o.plugname == '':
        print("'-p <plugin-name>' is required")
    elif not cfg.has_section(o.plugname):
        print("No plugin named '%s' found in configuration" % o.plugname)
    else:
        plugdir = cfg.get('crawler', 'plugin-dir')
        sys.path.append(plugdir)
        __import__(o.plugname)
        CrawlConfig.log('firing %s', o.plugname)
        sys.modules[o.plugname].main(cfg)
def make_pidfile(pid, context, exitpath, just_check=False):
    """
    Generate a pid file in the pid directory (defined in CrawlDaemon),
    creating the directory if necessary.
    """
    ok = False
    piddir = CrawlConfig.pid_dir()
    if not os.path.exists(piddir):
        os.mkdir(piddir)
        ok = True
    if not ok:
        pf_l = [x for x in glob.glob("%s/*" % piddir)
                if not x.endswith('.DEFUNCT')]
        for pf_n in pf_l:
            data = util.contents(pf_n)
            if 0 == len(data):
                continue
            (ctx, xp) = data.strip().split()
            if ctx == context:
                raise StandardError("The pidfile for context %s exists" %
                                    context)
    pfname = "%s/%d" % (piddir, pid)
    if just_check:
        return pfname
    with open(pfname, 'w') as f:
        f.write("%s %s\n" % (context, exitpath))
    return pfname
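# Sketch of the resulting pid file (path and values are hypothetical): a file
# named <pid_dir>/<pid> whose single line carries the context and exit path,
# matching what running_pid() reads back.
#
#   $ cat /var/run/hpssic/12345
#   PROD /tmp/crawler.exit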
def __init__(self, *args, **kwargs):
    """
    Set piddir for the object from the configuration, then call the
    parent's constructor.
    """
    self.piddir = CrawlConfig.pid_dir()
    super(CrawlDaemon, self).__init__(*args, **kwargs)
def crl_dbdrop(argv):
    """dbdrop - drop a database table

    usage: crawl dbdrop [-f] <table-name>

    Drop database table <table-name>
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-f', '--force',
                 action='store_true', default=False, dest='force',
                 help='proceed without confirmation')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    tbpfx = cfg.get('dbi', 'tbl_prefix')
    tname = a[0]
    if not o.force:
        answer = raw_input("About to drop db table %s_%s. Are you sure? > " %
                           (tbpfx, tname))
        if answer[0].lower() != "y":
            sys.exit()

    result = dbschem.drop_table(cfg=cfg, table=tname)
    print(result)
def drop_table(cfg=None, prefix=None, table=None):
    """
    This wraps the table dropping operation.
    """
    if table is None:
        return MSG.nothing_to_drop

    if cfg is None:
        cfg = CrawlConfig.get_config()

    if prefix is None:
        prefix = cfg.get('dbi-crawler', 'tbl_prefix')
    else:
        cfg.set('dbi-crawler', 'tbl_prefix', prefix)

    db = CrawlDBI.DBI(dbtype="crawler", cfg=cfg)
    if not db.table_exists(table=table):
        rval = "Table '%s' does not exist" % (table)
    else:
        db.drop(table=table)
        if db.table_exists(table=table):
            rval = "Attempt to drop table '%s' failed" % (table)
        else:
            rval = "Attempt to drop table '%s' was successful" % (table)

    db.close()
    return rval
def tccp_zreport(args):
    """zreport - show what tcc_report will do with a bitfile id

    usage: tcc zreport NSOBJECT-ID

    Note: This will only report bitfiles where the COS count and file count
    differ. Giving it any old object id won't necessarily generate any
    output.
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    try:
        nsobj_id = a[0]
    except IndexError:
        print("usage: tcc zreport OBJECT_ID")
        return

    cfg = CrawlConfig.get_config()
    outfile = cfg.get(tcc_lib.sectname(), 'report_file')

    cosinfo = tcc_lib.get_cos_info()
    try:
        bfl = tcc_lib.get_bitfile_set(int(nsobj_id), 1)
    except U.HpssicError:
        bfl = []

    print("Writing output to %s" % outfile)
    for bf in bfl:
        tcc_lib.tcc_report(bf, cosinfo)
def __init__(self, connect=True, *args, **kwargs):
    """
    Initialize the object
    """
    self.prompt = "]:"
    self.verbose = False
    self.unavailable = False
    self.xobj = None
    self.timeout = 60

    cmdopts = " ".join(args)
    for key in kwargs:
        setattr(self, key, kwargs[key])

    cfg = CrawlConfig.get_config()
    if not hasattr(self, 'reset_atime'):
        self.reset_atime = cfg.getboolean('cv', 'reset_atime')
    if not hasattr(self, 'hash_algorithm'):
        self.hash_algorithm = cfg.get_d('cv', 'hash_algorithm', None)

    maybe_update_hsi()

    self.cmd = "hsi " + cmdopts
    if connect:
        self.connect()
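# Illustrative construction (values are hypothetical): keyword arguments
# become attributes that override the defaults and config lookups above,
# while connect=False builds the object without opening an hsi session.
#
#   h = HSI(verbose=True, timeout=300, reset_atime=False)
#   h_offline = HSI(connect=False)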
def mprf_reset(args):
    """reset - drop the mpra table and remove mpra_report.txt

    usage: mpra reset
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-f', '--force',
                 action='store_true', default=False, dest='force',
                 help='force the operation')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    if not o.force:
        answer = raw_input(MSG.all_mpra_data_lost)
        if answer[0].lower() != "y":
            raise SystemExit()

    cfg = CrawlConfig.get_config(o.config)
    dbschem.drop_table(cfg=cfg, table='mpra')
    filename = cfg.get('mpra', 'report_file')
    util.conditional_rm(filename)
def get_last_rpt_time(db):
    """
    Retrieve the last report time from the report table. If the table did
    not exist before the make_table() call ('Created' appears in the
    result), the table is empty, so we just return 0 to indicate that there
    is no last report time.
    """
    result = dbschem.make_table("report")
    if "Created" in result:
        rval = 0
    else:
        rows = db.select(table='report', fields=['max(report_time)'])
        rval = rows[0][0]
        if rval is None:
            rval = 0

    CrawlConfig.log("time of last report: %d" % rval)
    return rval
def highest_nsobject_id():
    """
    Cache and return the largest NSOBJECT id in the DB2 database. The
    variables highest_nsobject_id._max_obj_id and highest_nsobject_id._when
    are local to this function but do not lose their values between
    invocations.
    """
    if (not hasattr(highest_nsobject_id, '_max_obj_id') or
            not hasattr(highest_nsobject_id, '_when') or
            60 < time.time() - highest_nsobject_id._when):
        highest_nsobject_id._max_obj_id = max_nsobj_id()
        highest_nsobject_id._when = time.time()
        CrawlConfig.log("max object id = %d at %s" %
                        (highest_nsobject_id._max_obj_id,
                         util.ymdhms(highest_nsobject_id._when)))

    rval = highest_nsobject_id._max_obj_id
    return rval
def hashcreate(self, pathnames):
    """
    Argument pathnames should reference one or more files. It may be a
    string containing one or more space separated file paths, or a list of
    one or more file paths. If it has type unicode, it will be encoded to
    'ascii' before being treated as a string.
    """
    if type(pathnames) == str:
        pathlist = pathnames.split()
    elif type(pathnames) == list:
        pathlist = pathnames
    elif type(pathnames) == unicode:
        pathlist = pathnames.encode('ascii', 'ignore').split()
    else:
        raise HSIerror("%s: Invalid argument (%s: '%s')" %
                       (util.my_name(), type(pathnames), pathnames))

    rval = ""
    for path in pathlist:
        if self.reset_atime:
            prev_time = self.access_time(path)
        if self.hash_algorithm is None:
            cmd = "hashcreate %s" % path
        else:
            cmd = "hashcreate -H %s %s" % (self.hash_algorithm, path)
        self.xobj.sendline(cmd)
        which = self.xobj.expect([self.prompt, pexpect.TIMEOUT] +
                                 self.hsierrs)
        while which == 1 and 1 < len(self.xobj.before):
            CrawlConfig.log("got a timeout, continuing because before " +
                            "is not empty and does not contain an error")
            rval += self.xobj.before
            which = self.xobj.expect([self.prompt, pexpect.TIMEOUT] +
                                     self.hsierrs)
        rval += self.xobj.before
        if 1 == which:
            rval += " TIMEOUT"
        elif 0 != which:
            rval += " ERROR"
        if self.reset_atime:
            self.touch(path, when=prev_time)
    return rval
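# Illustrative calls (paths are hypothetical): per the argument handling
# above, hashcreate() accepts either a space-separated string of paths or a
# list of paths.
#
#   h.hashcreate("/hpss/dir/file1 /hpss/dir/file2")
#   h.hashcreate(["/hpss/dir/file1", "/hpss/dir/file2"])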
def __init__(self, name=None, cfg=None):
    """
    Configuration data is read and copied into the object by method
    init_cfg_data(), called by both the constructor and reload().
    init_cfg_data() reverses the order of cfg and name in its argument list
    from the constructor so name can have a default and reload() doesn't
    have to pass it.

    last_fired is initialized by the constructor but not by reload(). So if
    the plugin is updated by a reconfigure, it won't lose its last fire time
    but will stay on the same schedule.
    """
    assert(name is not None)
    assert(cfg is not None)
    self.cfg = cfg
    l = CrawlConfig.log(cfg=cfg, close=True)
    CrawlConfig.log("%s: Initializing plugin data" % name)
    self.init_cfg_data(name, cfg)
    self.last_fired = time.time() - self.frequency - 1
    super(CrawlPlugin, self).__init__()
def addable(self, category=None):
    """
    Determine which Dimensions want this item added. Note that we want this
    routine to be general across dimensions, so we don't want it to assume
    anything about the dimension it's checking (like that it's named 'cos',
    for example). That's why calls to this pass in cos rather than looking
    at the value in the object.
    """
    for dn in self.dim:
        cval = getattr(self, dn)
        if self.dim[dn].vote(cval) is False:
            CrawlConfig.log("%s votes against %s -- skipping" %
                            (dn, self.path))
            return False

    randval = random.random()
    if self.probability < randval:
        CrawlConfig.log("random votes against %s -- skipping (%g < %g)" %
                        (self.path, self.probability, randval))
        return False

    return True
def mpra_fetch_recent(type):
    """
    Retrieve and return the most recent record reported so we don't report
    the same record repeatedly
    """
    db = CrawlDBI.DBI(dbtype="crawler")
    if not db.table_exists(table='mpra'):
        CrawlConfig.log("Fetch from non-existent mpra table -- return 0")
        return 0

    rows = db.select(table='mpra',
                     fields=['scan_time, end_time'],
                     where='type = ?',
                     data=(type,))
    last_end_time = -1
    max_scan_time = 0
    for r in rows:
        if max_scan_time < r[0]:
            max_scan_time = r[0]
            last_end_time = r[1]

    if last_end_time < 0:
        CrawlConfig.log("No '%s' value in mpra -- returning 0" % type)
        return 0
    else:
        CrawlConfig.log("Fetch '%s' from mpra table -- return %d" %
                        (type, last_end_time))
        return last_end_time
def maybe_update_hsi():
    """
    If the hsi wrapper script has changed, grab and edit a fresh copy
    """
    l = util.which_all('hsi')
    trg = l[0]
    tc = util.contents(trg).split("\n")
    tv = util.grep('^BINARYVERSION=', tc)

    s = [x for x in l if 'sources/hpss' in x]
    src = s[0]
    sc = util.contents(src).split("\n")
    sv = util.grep('^BINARYVERSION=', sc)

    if tv[0] != sv[0]:
        z = util.grep("${EXECUTABLE}", sc, regex=False, index=True)
        sc[z[0]] = "exec " + sc[z[0]]
        try:
            f = open(trg, 'w')
            f.writelines("\n".join(sc) + "\n")
            f.close()
        except IOError:
            CrawlConfig.log(MSG.hsi_wrap_ood)
def fail_report(self, msg):
    """
    Report a failure
    """
    try:
        f = self.fail_report_fh
    except AttributeError:
        cfg = CrawlConfig.get_config()
        filename = cfg.get('checksum-verifier', 'fail_report')
        self.fail_report_fh = open(filename, 'a')
        f = self.fail_report_fh
    f.write("Failure retrieving file %s: '%s'\n" % (self.path, msg))
    self.set('reported', 1)
    f.flush()
def stop_wait(cfg=None):
    """
    Watch for the crawler's exit file to disappear. If it's still there
    after the timeout period, give up and throw an exception.
    """
    if cfg is None:
        cfg = CrawlConfig.get_config()
    context = cfg.get('crawler', 'context')
    exitpath = cfg.get('crawler', 'exitpath')
    timeout = cfg.get_time('crawler', 'stopwait_timeout', 5.0)
    sleep_time = cfg.get_time('crawler', 'sleep_time', 0.25)
    lapse = 0.0

    while is_running(context) and lapse < timeout:
        time.sleep(sleep_time)
        lapse += sleep_time

    if is_running(context) and timeout <= lapse:
        raise util.HpssicError("Stop wait timeout exceeded")
def is_running(context=None):
    """
    Return True if the crawler is running (per ps(1)) or False otherwise.
    """
    running = False
    if context is None:
        cfg = CrawlConfig.get_config()
        try:
            context = cfg.get('crawler', 'context')
        except CrawlConfig.NoOptionError:
            emsg = ("No option 'context' in section 'crawler', file '%s'" %
                    cfg.filename)
            raise StandardError(emsg)

    rpi_l = running_pid(context=context)
    for rpi in rpi_l:
        if rpi[1] == context:
            running = True
    return running