def cvv_ttype_missing(argv):
    """ttype_missing - Report records missing ttype information

    usage: cv ttype_missing [-d]
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-c', '--config',
                 action='store', default='', dest='config',
                 help='configuration to use')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    CrawlConfig.get_config(o.config)
    rec_l = cv_lib.ttype_missing()
    for rec in rec_l:
        print("%-40s %-10s %s %s" % (rec[1],
                                     rec[4],
                                     rec[5],
                                     U.ymdhms(int(rec[7]))))
def cvv_report(argv):
    """report - show the checksum verifier database status

    select count(*) from checkables where type = 'f';
    select count(*) from checkables where checksum <> 0;
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-p', '--prefix',
                 action='store', default='', dest='prefix',
                 help='table name prefix')
    p.add_option('-v', '--verbose',
                 action='store_true', default=False, dest='verbose',
                 help='pass verbose flag to HSI object')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    if o.config != '':
        cfg = CrawlConfig.get_config(o.config)
    else:
        cfg = CrawlConfig.get_config()

    if o.prefix != '':
        cfg.set('dbi', 'tbl_prefix', o.prefix)

    dim = {}
    dim['cos'] = Dimension.get_dim('cos')
    dim['ttypes'] = Dimension.get_dim('ttypes')

    print dim['cos'].report()
    print dim['ttypes'].report()
def crl_cfgdump(argv):
    """cfgdump - load a config file and dump its contents

    usage: crawl cfgdump -c <filename> [--to stdout|log] [--logpath <path>]
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-t', '--to',
                 action='store', default='', dest='target',
                 help='specify where to send the output')
    p.add_option('-l', '--logpath',
                 action='store', default='', dest='logpath',
                 help='specify where to send the output')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    if o.target == '':
        o.target = 'stdout'

    cfg = CrawlConfig.get_config(o.config)
    dumpstr = cfg.dump()

    if o.target == 'stdout':
        print dumpstr
    elif o.target == 'log':
        log = CrawlConfig.log(logpath=o.logpath, cfg=cfg)
        for line in dumpstr.split("\n"):
            CrawlConfig.log(line)
def crl_log(argv):
    """log - write a message to the indicated log file

    usage: crawl log --log <filename> <message>
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--log',
                 action='store', default=None, dest='logfile',
                 help='specify the log file')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log(" ".join(a), logpath=o.logfile, cfg=cfg)
def tccp_zreport(args):
    """zreport - show what tcc_report will do with a bitfile id

    usage: tcc zreport NSOBJECT-ID

    Note: This will only report bitfiles where the COS count and file count
    differ. Giving it any old object id won't necessarily generate any
    output.
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    try:
        nsobj_id = a[0]
    except IndexError:
        print("usage: tcc zreport OBJECT_ID")
        return

    cfg = CrawlConfig.get_config()
    outfile = cfg.get(tcc_lib.sectname(), 'report_file')

    cosinfo = tcc_lib.get_cos_info()
    try:
        bfl = tcc_lib.get_bitfile_set(int(nsobj_id), 1)
    except U.HpssicError as e:
        bfl = []

    print("Writing output to %s" % outfile)
    for bf in bfl:
        tcc_lib.tcc_report(bf, cosinfo)
def crl_dbdrop(argv):
    """dbdrop - drop a database table

    usage: crawl dbdrop [-f] <table-name>

    Drop database table <table-name>
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-f', '--force',
                 action='store_true', default=False, dest='force',
                 help='proceed without confirmation')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    tbpfx = cfg.get('dbi', 'tbl_prefix')
    tname = a[0]

    # honor -f/--force: only prompt for confirmation when it is not set
    if not o.force:
        answer = raw_input("About to drop db table %s_%s. Are you sure? > " %
                           (tbpfx, tname))
        if answer[0].lower() != "y":
            sys.exit()

    result = dbschem.drop_table(cfg, tname)
    print(result)
def simplug(plugin, args):
    """
    Common plugin simulator. May be used by the interactive tools to
    simulate running the associated plugin.
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-i', '--iterations',
                 action='store', default=1, dest='iterations', type='int',
                 help='how many iterations to run')
    (o, a) = p.parse_args(args)
    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log("starting %s simplug, just got config" % plugin)
    sys.path.append(cfg.get('crawler', 'plugin-dir'))
    modname = cfg.get(plugin, 'module')
    try:
        P = __import__(modname)
    except ImportError:
        H = __import__('hpssic.plugins.' + modname)
        P = getattr(H.plugins, modname)
    P.main(cfg)
    if 1 < o.iterations:
        for count in range(o.iterations - 1):
            stime = cfg.get_time(plugin, 'frequency')
            time.sleep(stime)
            P.main(cfg)
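# A sketch of how an interactive tool might delegate a subcommand to
# simplug() above. The subcommand name and the plugin section name 'cv' are
# assumptions for illustration; any plugin section that defines 'module' and
# 'frequency' in the configuration would work the same way.
def cvv_simplug(args):
    """simplug - simulate running the cv plugin

    usage: cv simplug [-d] [-i N]
    """
    simplug('cv', args)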
def tcc_report(bitfile, cosinfo=None, path=None, log=True, store=True):
    """
    The bitfile appears to not have the right number of copies. We're going
    to write its information out to a report for manual followup.
    """
    if cosinfo is None:
        cosinfo = get_cos_info()
    fmt = "%7s %8s %8s %s"
    hdr = fmt % ("COS", "Ccopies", "Fcopies", "Filepath")

    # Compute the bitfile's path
    if path is None:
        bfp = get_bitfile_path(bitfile['BFID'])
    else:
        bfp = path

    rpt = fmt % (bitfile['BFATTR_COS_ID'],
                 str(cosinfo[bitfile['BFATTR_COS_ID']]),
                 str(bitfile['SC_COUNT']),
                 bfp)
    if log:
        CrawlConfig.log(rpt)
    if store:
        # The report file handle is cached as an attribute on the function
        # itself; the AttributeError branch opens it on first use and writes
        # the header line.
        try:
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
        except AttributeError:
            cfg = CrawlConfig.get_config()
            rptfname = cfg.get(sectname(), 'report_file')
            tcc_report._f = open(rptfname, 'a')
            tcc_report._f.write(hdr + "\n")
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
    return rpt
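# A minimal, standalone sketch of the lazy file-handle caching used by
# tcc_report() above (and by fail_report() further down): the open file is
# stashed as an attribute on the function itself, so the first call pays the
# open() cost and later calls reuse the handle. The filename here is a
# hypothetical example, not part of the crawler.
def append_line(text):
    try:
        f = append_line._fh
    except AttributeError:
        f = append_line._fh = open("example_report.txt", 'a')
    f.write(text + "\n")
    f.flush()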
def mprf_reset(args):
    """reset - drop the mpra table and remove mpra_report.txt

    usage: mpra reset
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-f', '--force',
                 action='store_true', default=False, dest='force',
                 help='force the operation')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    if not o.force:
        answer = raw_input(MSG.all_mpra_data_lost)
        if answer[0].lower() != "y":
            raise SystemExit()

    cfg = CrawlConfig.get_config(o.config)
    dbschem.drop_table(cfg=cfg, table='mpra')
    filename = cfg.get('mpra', 'report_file')
    util.conditional_rm(filename)
def drop_table(cfg=None, prefix=None, table=None):
    """
    This wraps the table dropping operation.
    """
    if table is None:
        return(MSG.nothing_to_drop)

    if cfg is None:
        cfg = CrawlConfig.get_config()

    if prefix is None:
        prefix = cfg.get('dbi-crawler', 'tbl_prefix')
    else:
        cfg.set('dbi-crawler', 'tbl_prefix', prefix)

    db = CrawlDBI.DBI(dbtype="crawler", cfg=cfg)
    if not db.table_exists(table=table):
        rval = ("Table '%s' does not exist" % (table))
    else:
        db.drop(table=table)
        if db.table_exists(table=table):
            rval = ("Attempt to drop table '%s' failed" % (table))
        else:
            rval = ("Attempt to drop table '%s' was successful" % (table))

    db.close()
    return rval
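# A minimal usage sketch for drop_table(), assuming a working crawler
# configuration and database connection. The table name 'test_checkables'
# is hypothetical; note that the function reports problems through its
# return message rather than by raising.
if __name__ == '__main__':
    print(drop_table(table='test_checkables'))
    print(drop_table())    # no table specified => MSG.nothing_to_drop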
def __init__(self, connect=True, *args, **kwargs):
    """
    Initialize the object
    """
    self.prompt = "]:"
    self.verbose = False
    self.unavailable = False
    self.xobj = None
    self.timeout = 60

    cmdopts = " ".join(args)
    for key in kwargs:
        setattr(self, key, kwargs[key])

    cfg = CrawlConfig.get_config()
    if not hasattr(self, 'reset_atime'):
        self.reset_atime = cfg.getboolean('cv', 'reset_atime')
    if not hasattr(self, 'hash_algorithm'):
        self.hash_algorithm = cfg.get_d('cv', 'hash_algorithm', None)

    maybe_update_hsi()

    self.cmd = "hsi " + cmdopts
    if connect:
        self.connect()
def crl_start(argv):
    """start - if the crawler is not already running as a daemon, start it

    usage: crawl start

    default config file: crawl.cfg, or $CRAWL_CONF, or -c <filename> on
    command line

    default log file: /var/log/crawl.log, or $CRAWL_LOG, or -l <filename> on
    command line
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--log',
                 action='store', default='', dest='logfile',
                 help='specify the log file')
    p.add_option('-C', '--context',
                 action='store', default='', dest='context',
                 help="context of crawler ('TEST' or 'PROD')")
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)

    #
    # Initialize the configuration
    #
    if o.context != '':
        cfg.set('crawler', 'context', o.context)

    try:
        exitpath = cfg.get('crawler', 'exitpath')
    except CrawlConfig.NoOptionError as e:
        print("No exit path is specified in the configuration")
        sys.exit(1)

    vstr = "HPSS Integrity Crawler version %s" % version.__version__
    log = CrawlConfig.log(vstr, logpath=o.logfile, cfg=cfg)
    pfpath = make_pidfile(os.getpid(),
                          cfg.get('crawler', 'context'),
                          exitpath,
                          just_check=True)
    crawler = CrawlDaemon(pfpath,
                          stdout="crawler.stdout",
                          stderr="crawler.stderr",
                          logger=log,
                          workdir='.')
    CrawlConfig.log('crl_start: calling crawler.start()')
    crawler.start()
def fail_report(self, msg):
    """
    Report a failure
    """
    try:
        f = self.fail_report_fh
    except AttributeError:
        cfg = CrawlConfig.get_config()
        filename = cfg.get('checksum-verifier', 'fail_report')
        self.fail_report_fh = open(filename, 'a')
        f = self.fail_report_fh
    f.write("Failure retrieving file %s: '%s'\n" % (self.path, msg))
    self.set('reported', 1)
    f.flush()
def crl_fire(argv):
    """fire - run a plugin

    usage: crawl fire --cfg cfgname --logpath logfname --plugin plugname
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--logpath',
                 action='store', default='', dest='logpath',
                 help='specify where to send the output')
    p.add_option('-p', '--plugin',
                 action='store', default='', dest='plugname',
                 help='which plugin to fire')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)
    CrawlConfig.log(logpath=o.logpath, cfg=cfg)

    if o.plugname == '':
        print("'-p <plugin-name>' is required")
    elif not cfg.has_section(o.plugname):
        print("No plugin named '%s' found in configuration" % o.plugname)
    else:
        plugdir = cfg.get('crawler', 'plugin-dir')
        sys.path.append(plugdir)
        __import__(o.plugname)
        CrawlConfig.log('firing %s', o.plugname)
        sys.modules[o.plugname].main(cfg)
def stop_wait(cfg=None):
    """
    Watch for the crawler's exit file to disappear. If it's still there after
    the timeout period, give up and throw an exception.
    """
    if cfg is None:
        cfg = CrawlConfig.get_config()
    context = cfg.get('crawler', 'context')
    exitpath = cfg.get('crawler', 'exitpath')
    timeout = cfg.get_time('crawler', 'stopwait_timeout', 5.0)
    sleep_time = cfg.get_time('crawler', 'sleep_time', 0.25)
    lapse = 0.0

    while is_running(context) and lapse < timeout:
        time.sleep(sleep_time)
        lapse += sleep_time

    if is_running(context) and timeout <= lapse:
        raise util.HpssicError("Stop wait timeout exceeded")
def is_running(context=None):
    """
    Return True if the crawler is running (per ps(1)) or False otherwise.
    """
    running = False
    if context is None:
        cfg = CrawlConfig.get_config()
        try:
            context = cfg.get('crawler', 'context')
        except CrawlConfig.NoOptionError as e:
            emsg = ("No option 'context' in section 'crawler', file '%s'" %
                    cfg.filename)
            raise StandardError(emsg)

    rpi_l = running_pid(context=context)
    for rpi in rpi_l:
        if rpi[1] == context:
            running = True

    return running
def xplocks(output=None, mark=False):
    """
    Look for expired purge locks in bfpurgerec.
    """
    cfg = CrawlConfig.get_config()
    now = time.time()
    hits = 0

    opened = True
    if output is None:
        f = open(cfg.get('mpra', 'report_file'), 'a')
    elif type(output) == str:
        f = open(output, 'a')
    elif type(output) == file:
        f = output
        opened = False
    else:
        raise StandardError("output type must be 'str' or 'file' ")

    dbs = CrawlDBI.DBI(dbtype='hpss', dbname='sub')
    lock_min = cfg.getint('mpra', 'lock_duration')

    rows = dbs.select(table='bfpurgerec',
                      fields=['bfid', 'record_lock_time'],
                      where='record_lock_time <> 0')
    if 0 < len(rows):
        f.write("Expired Purge Locks\n")
        for r in rows:
            if (lock_min * 60) < (now - r['RECORD_LOCK_TIME']):
                hits += 1
                f.write("   %s  %s\n" %
                        (CrawlDBI.DBIdb2.hexstr(r['BFID']),
                         util.ymdhms(r['RECORD_LOCK_TIME'])))

    if mark:
        mpra_record_recent('purge', 0, 0, hits)

    if opened:
        f.close()

    return hits
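# A standalone sketch of the output-dispatch convention shared by xplocks()
# above and age() below: the caller may pass nothing (meaning "use the
# configured report file"), a filename string, or an already-open file
# object, and the function only closes handles it opened itself. The default
# filename here is a hypothetical stand-in for the configured report_file.
def open_report(output=None, default='example_report.txt'):
    opened = True
    if output is None:
        f = open(default, 'a')
    elif type(output) == str:
        f = open(output, 'a')
    elif type(output) == file:
        f = output
        opened = False
    else:
        raise StandardError("output type must be 'str' or 'file'")
    return f, opened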
def drop_tables_matching(tablike):
    """
    Drop tables with names matching the *tablike* expression. At the time of
    writing, this is only used for dropping test tables ('test_%')
    """
    tcfg = CrawlConfig.get_config()
    tcfg.set('dbi-crawler', 'tbl_prefix', '')
    db = CrawlDBI.DBI(cfg=tcfg, dbtype='crawler')
    if CrawlDBI.mysql_available and 'mysql' in str(db):
        # db = CrawlDBI.DBI(cfg=tcfg, dbtype='crawler')
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore",
                                    "Can't read dir of .*")
            tlist = db.select(table="information_schema.tables",
                              fields=['table_name'],
                              where="table_name like '%s'" % tablike)
            for (tname, ) in tlist:
                if db.table_exists(table=tname):
                    db.drop(table=tname)
    db.close()
def load_priority_list(cls):
    """
    If one or more priority list files are configured, read them and put
    their contents first in the list of Checkables to be processed
    """
    rval = []
    cfg = CrawlConfig.get_config()
    priglob = cfg.get_d('cv', 'priority', '')
    if priglob == '':
        return rval

    pricomp = cfg.get_d('cv',
                        'completed',
                        U.pathjoin(U.dirname(priglob), 'completed'))

    for pripath in U.foldsort(glob.glob(priglob)):
        with open(pripath, 'r') as f:
            for line in f.readlines():
                path = line.strip()
                rval.append(Checkable(path=path, type='f'))
        os.rename(pripath, U.pathjoin(pricomp, U.basename(pripath)))

    return rval
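# For illustration, the inputs load_priority_list() expects, based on the
# code above: a 'priority' glob and an optional 'completed' directory in the
# [cv] config section, and priority files containing one HPSS path per line.
# The paths and values shown here are hypothetical examples.
#
#   [cv]
#   priority = /var/crawler/priority/*.lst
#   completed = /var/crawler/priority/completed
#
# /var/crawler/priority/urgent.lst might contain:
#
#   /hpss/proj/file0001
#   /hpss/proj/file0002
#
# Each listed path becomes a Checkable(path=..., type='f') at the front of
# the work list, and the priority file is then moved into the completed
# directory so it is not re-read.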
def mprf_age(args):
    """age - list the records in table BFMIGRREC or BFPURGEREC older than age

    usage: mpra age -t [migr|purge] -a/--age N[S|M|H|d|m|Y] [-c/--count]

    Report migration records (or a count of them) older than the age
    indicated.

    --age N     -- report records older than N
    --before D  -- report records from before date D
    --start S   -- report records with timestamps larger than S
    --end E     -- report records with timestamps smaller than E
    """
    p = optparse.OptionParser()
    p.add_option('-a', '--age',
                 action='store', default='', dest='age',
                 help='report records older than this')
    p.add_option('-b', '--before',
                 action='store', default='', dest='before',
                 help='report records from before this epoch')
    p.add_option('-c', '--count',
                 action='store_true', default=False, dest='count',
                 help='report record counts rather than records')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-e', '--end',
                 action='store', default='', dest='end',
                 help='ending epoch time')
    p.add_option('-p', '--path',
                 action='store_true', default=False, dest='path',
                 help='report paths as well as bitfile IDs')
    p.add_option('-s', '--start',
                 action='store', default='', dest='start',
                 help='starting epoch time')
    p.add_option('-t', '--table',
                 action='store', default='', dest='table',
                 help='which table to age')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    start = 0
    end = int(time.time())    # default: report everything older than now
    if o.age and o.before:
        raise StandardError("--age and --before are mutually exclusive")
    elif o.age and '' != o.end:
        raise StandardError("--age and --end are mutually exclusive")
    elif o.before and '' != o.end:
        raise StandardError("--before and --end are mutually exclusive")
    elif o.before:
        end = time.mktime(time.strptime(o.before, "%Y.%m%d"))
    elif o.age:
        end = time.time() - cfg.to_seconds(o.age)
    elif o.end:
        end = util.epoch(o.end)

    if o.start:
        start = util.epoch(o.start)

    if o.table == '':
        o.table = 'migr'

    print("%d, %d" % (start, end))
    mpra_lib.age(o.table, start, end, o.count, sys.stdout, path=o.path)
def run(self):
    """
    This routine runs in the background as a daemon. Here's where we fire
    off plug-ins as appropriate.
    """
    cfgname = ''
    self.cfg = CrawlConfig.get_config(cfgname)
    self.pidfile = "%s/%d" % (self.piddir, os.getpid())
    exit_file = self.cfg.get('crawler', 'exitpath')

    ctx = self.cfg.get('crawler', 'context')
    clean_defunct_pidfiles(ctx)
    make_pidfile(os.getpid(), ctx, exit_file)
    atexit.register(self.delpid)

    keep_going = True
    plugin_d = {}
    while keep_going:
        try:
            pluglstr = self.cfg.get('crawler', 'plugins')
            pluglist = [x.strip() for x in pluglstr.split(',')]
            for s in pluglist:
                self.dlog('crawl: CONFIG: [%s]' % s)
                for o in self.cfg.options(s):
                    self.dlog('crawl: CONFIG: %s: %s' %
                              (o, self.cfg.get(s, o)))
                if s == 'crawler':
                    continue
                elif s in plugin_d.keys():
                    CrawlConfig.log("reloading plugin %s" % s)
                    plugin_d[s].reload(self.cfg)
                else:
                    CrawlConfig.log("initial load of plugin %s" % s)
                    plugin_d[s] = CrawlPlugin.CrawlPlugin(name=s,
                                                          cfg=self.cfg)

            # remove any plugins that are not in the new configuration
            for p in plugin_d.keys():
                if p not in self.cfg.sections():
                    CrawlConfig.log("unloading obsolete plugin %s" % p)
                    del plugin_d[p]

            heartbeat = self.cfg.get_time('crawler', 'heartbeat', 10)
            while keep_going:
                #
                # Fire any plugins that are due
                #
                if not self.cfg.quiet_time(time.time()):
                    hb_msg = "crawl: heartbeat..."
                    if self.fire_plugins(plugin_d):
                        keep_going = False
                else:
                    hb_msg = "crawl: heartbeat... [quiet]"

                # CrawlConfig.log("issue the heartbeat")
                #
                # Issue the heartbeat if it's time
                #
                if 0 == (int(time.time()) % heartbeat):
                    # self.dlog(hb_msg)
                    CrawlConfig.log(hb_msg)

                # CrawlConfig.log("check for config changes")
                #
                # If the config file has changed, reload it by resetting the
                # cached config object and breaking out of the inner loop.
                #
                if self.cfg.changed():
                    cfgname = self.cfg.get('crawler', 'filename')
                    self.cfg = CrawlConfig.get_config(reset=True)
                    break

                # CrawlConfig.log("check for exit signal")
                #
                # Check for the exit signal
                #
                if util.conditional_rm(exit_file):
                    self.dlog('crawl: shutting down')
                    keep_going = False

                # CrawlConfig.log("sleep")
                #
                # We cycle once per second so we can detect if the user asks
                # us to stop or if the config file changes and needs to be
                # reloaded
                #
                time.sleep(1.0)

        except:
            # if we get an exception, write the traceback to the log file
            tbstr = tb.format_exc()
            for line in tbstr.split('\n'):
                self.dlog("crawl: '%s'" % line)
            keep_going = False
def mprf_migr_recs(args):
    """migr_recs - list the records in table BFMIGRREC

    usage: mpra migr_recs [-l/--limit N]
                          [-b/--before DATE-TIME]
                          [-a/--after DATE-TIME]

    with -l N, only report the first N records
    with -b DATE-TIME, only report the records with create times before
        DATE-TIME.
    with -a DATE-TIME, only report the records with create times after
        DATE-TIME.
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--count',
                 action='store_true', default=False, dest='count',
                 help='report record counts rather than records')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--limit',
                 action='store', default='', dest='limit',
                 help='how many records to fetch')
    p.add_option('-b', '--before',
                 action='store', default='', dest='before',
                 help='fetch records from before the date/time')
    p.add_option('-a', '--after',
                 action='store', default='', dest='after',
                 help='fetch records from after the date/time')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    dbargs = {'table': 'bfmigrrec'}

    if o.limit == '' and o.before == '' and o.after == '':
        dbargs['limit'] = 30

    elif o.limit == '' and o.before == '' and o.after != '':
        dbargs['where'] = '? < record_create_time'
        dbargs['data'] = (util.epoch(o.after), )

    elif o.limit == '' and o.before != '' and o.after == '':
        dbargs['where'] = 'record_create_time < ?'
        dbargs['data'] = (util.epoch(o.before), )

    elif o.limit == '' and o.before != '' and o.after != '':
        dbargs['where'] = '? < record_create_time and record_create_time < ?'
        dbargs['data'] = (util.epoch(o.after), util.epoch(o.before))

    elif o.limit != '' and o.before == '' and o.after == '':
        dbargs['limit'] = int(o.limit)

    elif o.limit != '' and o.before == '' and o.after != '':
        dbargs['limit'] = int(o.limit)
        dbargs['where'] = '? < record_create_time'
        dbargs['data'] = (util.epoch(o.after), )

    elif o.limit != '' and o.before != '' and o.after == '':
        dbargs['limit'] = int(o.limit)
        dbargs['where'] = 'record_create_time < ?'
        dbargs['data'] = (util.epoch(o.before), )

    elif o.limit != '' and o.before != '' and o.after != '':
        dbargs['limit'] = int(o.limit)
        dbargs['where'] = '? < record_create_time and record_create_time < ?'
        dbargs['data'] = (util.epoch(o.after), util.epoch(o.before))

    if o.count:
        dbargs['fields'] = ['count(*)']
    else:
        dbargs['fields'] = ['bfid',
                            'record_create_time',
                            'migration_failure_count']
        dbargs['orderby'] = 'record_create_time'

    rows = mpra_lib.lookup_migr_recs(**dbargs)
    for row in rows:
        if o.count:
            print("Records found: %d" % row['1'])
        else:
            print("%s %s %d" %
                  (CrawlDBI.DBIdb2.hexstr(row['BFID']),
                   util.ymdhms(row['RECORD_CREATE_TIME']),
                   row['MIGRATION_FAILURE_COUNT']))
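# For illustration, a hypothetical invocation and the dbargs it produces
# according to the branch logic above (the date value and its YYYY.mmdd form
# are example assumptions):
#
#   mpra migr_recs -l 10 -a 2014.0101
#
# yields
#
#   {'table': 'bfmigrrec',
#    'limit': 10,
#    'where': '? < record_create_time',
#    'data': (util.epoch('2014.0101'), ),
#    'fields': ['bfid', 'record_create_time', 'migration_failure_count'],
#    'orderby': 'record_create_time'}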
def check(self):
    """
    For a directory:
     - get a list of its contents if possible,
     - create a Checkable object for each item and persist it to the
       database
     - return the list of Checkables found in the directory
    For a file:
     - if it already has a hash, add it to the sample if not already and
       verify it
     - if it does not have a hash, decide whether to add it or not

    The value of probability [0.0 .. 1.0] indicates the likelihood with
    which we should check files.

    potential outcomes           return
     read a directory            list of Checkable objects
     file checksum fail          Alert
     invalid Checkable type      raise StandardError
     access denied               "access denied"
     verified file checksum      "matched"
     checksum a file             "checksummed"
     skipped a file              "skipped"
     hpss unavailable            "unavailable"

    Here we examine a population member, count it as a member of the
    population, decide whether to add it to the sample, and if so, count it
    as a sample member.

    First, we have to make all the decisions and update the object
    accordingly.

    Then, we persist the object to the database.
    """
    # fire up hsi
    # self.probability = probability
    rval = []
    cfg = CrawlConfig.get_config()
    # hsi_timeout = int(cfg.get_d('crawler', 'hsi_timeout', 300))
    try:
        # h = hpss.HSI(timeout=hsi_timeout, verbose=True)
        h = hpss.HSI(verbose=True)
        CrawlConfig.log("started hsi with pid %d" % h.pid())
    except hpss.HSIerror as e:
        return "unavailable"

    if self.type == 'd':
        rsp = h.lsP(self.path)
        if "Access denied" in rsp:
            rval = "access denied"
        else:
            for line in rsp.split("\n"):
                new = Checkable.fdparse(line)
                if new is not None:
                    rval.append(new)
                    new.load()
                    new.persist()
            # returning list of items found in the directory
    elif self.type == 'f':
        if self.cart is None:
            self.populate_cart(h)
        if self.checksum == 0:
            if self.has_hash(h):
                self.add_to_sample(h, already_hashed=True)
                rval = self.verify(h)
                # returning "matched", "checksummed", "skipped", or Alert()
            elif self.addable():
                rval = self.add_to_sample(h)
                # returning "access denied" or "checksummed"
            else:
                rval = "skipped"
        else:
            rval = self.verify(h)
            # returning "matched", "checksummed", "skipped", or Alert()
    else:
        raise StandardError("Invalid Checkable type: %s" % self.type)

    if (3 < self.fails) and (0 == self.reported):
        self.fail_report(h.before())
        rval = "skipped"

    h.quit()

    self.set('last_check', time.time())
    CrawlConfig.log("Persisting checkable '%s' with %s = %f, %s = %d" %
                    (self.path, 'last_check', self.last_check,
                     'fails', self.fails))
    self.persist()
    return rval
def age(table, start=None, end=None, count=False, output=None, path=False,
        mark=False):
    """
    Retrieve and return (count of) records older than end and younger than
    start. The result is written to output. If path is True, age_report will
    compute the bitfile pathname and report it as well. If mark is True, we
    update the mpra table with the date/time of the newest record reported.

    Strict less-than comparison is the right thing to do here. We record the
    last time reported in the mpra recent table. We've reported all the
    records with that time. We're looking into the past, so any new records
    added cannot have that time -- they're being added in the present when
    timestamps have larger values. So we want to start with the next one
    after the last one reported.
    """
    cfg = CrawlConfig.get_config()
    opened = True
    if output is None:
        f = open(cfg.get('mpra', 'report_file'), 'a')
    elif type(output) == str:
        f = open(output, 'a')
    elif type(output) == file:
        f = output
        opened = False
    else:
        raise StandardError("output type must be 'str' or 'file' ")

    db = CrawlDBI.DBI(dbtype='hpss', dbname='sub')

    # Here we set selection constraints for the select to retrieve the
    # records of interest, and we also set the time delta into the past,
    # stored in age. Arguments *start* and *end* provide boundaries
    # delimiting a time segment. We store in *age* the distance from the
    # current time back to *end*. If *end* is not set, it is presumed to be
    # the same as the present, so age is 0. *age* is passed to age_report in
    # the count branch below.
    if start is not None and end is not None:
        dbargs = {'where':
                  '? < record_create_time and record_create_time < ?',
                  'data': (start, end)}
        age = int(time.time()) - end
    elif start is None and end is not None:
        dbargs = {'where': 'record_create_time < ?',
                  'data': (end, )}
        age = int(time.time()) - end
    elif start is not None and end is None:
        dbargs = {'where': '? < record_create_time',
                  'data': (start, )}
        age = 0
    else:
        dbargs = {}    # no time constraints
        age = 0

    if count:
        dbargs['fields'] = ['count(*)']
    else:
        dbargs['fields'] = ['bfid',
                            'record_create_time',
                            'migration_failure_count']
        dbargs['orderby'] = 'record_create_time'

    try:
        dbargs['table'] = {'migr': 'bfmigrrec',
                           'purge': 'bfpurgerec'}[table]
    except KeyError:
        dbargs['table'] = 'bfmigrrec'

    rows = db.select(**dbargs)
    recent = 0
    rval = len(rows)
    if count:
        age_report(table, age, count, rows, f, path)
    elif 0 < len(rows):
        for row in rows:
            if recent < row['RECORD_CREATE_TIME']:
                recent = row['RECORD_CREATE_TIME']
        age_report(table, int(time.time()) - recent, count, rows, f, path)

    if mark:
        mpra_record_recent(table,
                           start,
                           recent if 0 < recent else end,
                           len(rows))

    if opened:
        f.close()

    return rval
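# A minimal usage sketch for age(), assuming access to the HPSS 'sub'
# database. The one-day window and output filename are arbitrary examples;
# with count left False, the return value is the number of matching
# bfmigrrec rows written to the report.
if __name__ == '__main__':
    now = int(time.time())
    reported = age('migr', start=now - 86400, end=now,
                   output='example_mpra_report.txt')
    print("migration records reported in the last day: %d" % reported)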