def record_history(name, when, errors):
    """
    Record a plugin name, runtime, and error count in the history table
    """
    db = CrawlDBI.DBI(dbtype='crawler')
    if not db.table_exists(table='history'):
        dbschem.make_table('history')
    db.insert(table='history',
              fields=['plugin', 'runtime', 'errors'],
              data=[(name, when, errors)])
    db.close()

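
# Hedged usage sketch (not from the source): how a plugin driver might
# record a run. The plugin name 'cv' and the zero error count are
# illustrative values only.
def _example_record_history():
    record_history('cv', int(time.time()), 0)
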

def mpra_record_recent(type, start, end, hits):
    """
    Record the most recent record reported so we don't report records
    repeatedly. However, if *end* is not later than the end time already
    stored, we don't want to update it.
    """
    dbschem.make_table('mpra')
    db = CrawlDBI.DBI(dbtype="crawler")
    db.insert(table='mpra',
              fields=['type', 'scan_time', 'start_time', 'end_time', 'hits'],
              data=[(type, int(time.time()), int(start), int(end), hits)])
    db.close()

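
# The docstring above promises not to move the recorded end time backward,
# but the body inserts unconditionally. A minimal sketch of a guard a caller
# could apply first, assuming select() accepts a where clause with '?'
# placeholders as in the select() usage elsewhere in this module:
def _example_guarded_record(type, start, end, hits):
    db = CrawlDBI.DBI(dbtype="crawler")
    rows = db.select(table='mpra', fields=['max(end_time)'],
                     where='type = ?', data=(type,))
    db.close()
    recent_end = rows[0][0] or 0       # no prior record => 0
    if recent_end < int(end):
        mpra_record_recent(type, start, end, hits)
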

def cvv_ttype_table(argv):
    """ttype_table - create (or drop) table tape_types

    usage: cv ttype_table [-d] {-D|-r /opt/hpss}

    Without the -D/--drop option, create the table tape_types in the mysql
    database and populate it with information from an HPSS build tree
    (default: /opt/hpss). With -D or --drop, drop the table.
    """
    p = optparse.OptionParser()
    p.add_option('-D', '--drop',
                 action='store_true', default=False, dest='drop',
                 help='drop the table')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-r', '--root',
                 action='store', default='', dest='hpssroot',
                 help='where to look for data')
    try:
        (o, a) = p.parse_args(argv)
    except SystemExit:
        return

    if o.debug:
        pdb.set_trace()

    if o.drop:
        result = dbschem.drop_table(table="tape_types")
        print result
    else:
        dbschem.make_table("tape_types")
        hpssroot = o.hpssroot
        if hpssroot == '':
            hpssroot = os.getenv("HPSS_ROOT")
        if hpssroot is None:
            hpssroot = "/opt/hpss"
        tape_types_populate(hpssroot)

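
# Illustrative invocations, following the usage string above:
#
#   cv ttype_table -r /opt/hpss     # create and populate tape_types
#   cv ttype_table -D               # drop the table
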

def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    If we check a range and get no hits (i.e., no NSOBJECT ids exist in the
    range), we store
        (<time>, <low-id>, <high-id>, 0, 0)

    If we get a hit with the right copy count, we store it by itself as
        (<time>, <hit-id>, <hit-id>, 1, 0)

    If we get a hit with the wrong copy count, we store it by itself as
        (<time>, <hit-id>, <hit-id>, 0, 1)
    """
    tabname = cfg.get(sectname(), 'table_name')
    dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))
    db = CrawlDBI.DBI(dbtype="crawler")
    db.insert(table=tabname,
              fields=['check_time', 'low_nsobj_id', 'high_nsobj_id',
                      'correct', 'error'],
              data=[(ts, low, high, correct, error)])
    db.close()

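
# Per the docstring, the three row shapes look like this (the epoch time and
# NSOBJECT ids are illustrative):
#
#   (1400000000, 1000, 2000, 0, 0)   # empty range: no ids found
#   (1400000000, 1234, 1234, 1, 0)   # hit with the correct copy count
#   (1400000000, 1234, 1234, 0, 1)   # hit with the wrong copy count
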

def history_load(loadlist, filename):
    """
    Each plugin's sublib has a load_history() routine that knows how to load
    its data into the history table. Unfortunately, we do have to know
    something special about plugin 'cv' here: we warn the user when a
    filename is specified without 'cv' in the load list (or vice versa), and
    we only pass filename to the 'cv' plugin's load_history() method.
    """
    cfg = CrawlConfig.add_config()
    pluglist = U.csv_list(cfg.get_d('crawler', 'plugins',
                                    U.default_plugins()))
    ll = U.csv_list(loadlist)
    if 'all' in ll or ll == []:
        ll = copy.deepcopy(pluglist)

    if filename is None and 'cv' in ll:
        print(MSG.history_cv_not_loaded)
        ll.remove('cv')
    elif filename is not None and 'cv' not in ll:
        print(MSG.history_filename_ignored)

    unk_plugs = [x for x in ll if x not in pluglist]
    if 0 < len(unk_plugs):
        print(MSG.unrecognized_plugin_S % ', '.join(unk_plugs))
        for plug in unk_plugs:
            ll.remove(plug)

    if ll == []:
        return

    dbschem.make_table('history')
    for plug in [x for x in ll if x in pluglist]:
        print("loading %s..." % plug)
        if plug == 'cv' and filename is not None:
            args = [filename]
        else:
            args = []
        p = CrawlPlugin.CrawlPlugin(name=plug, cfg=cfg)
        p.load_history(*args)

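
# Hedged usage sketch: load history for two plugins, passing the failure
# file that the 'cv' plugin requires. The plugin names and path are
# illustrative, not taken from the source.
def _example_history_load():
    history_load('cv,tcc', '/tmp/cv_fail_records.txt')
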

@classmethod
def ex_nihilo(cls, dataroot='/'):
    """
    Start from scratch. Create the database if necessary. Create the
    table(s) if necessary. Bootstrap the queue by adding the root
    director(ies).

    Field path is the location of the file or directory in the HPSS archive.

    Field type is 'f' for files or 'd' for directories.

    Field cos is the class of service for the file. For directories, cos is
    empty.

    Field cart starts with a null value. When populated from hsi, it may be
    set to the name of a tape cartridge or to ''. Empty files take up no
    space on any cartridge, so for them the field is empty.

    Field checksum is 0 if we have not computed or discovered a checksum for
    the file. Once we know a checksum has been stored for the file, we set
    this to 1.

    Field last_check is the epoch time at which the file was last checked.

    Field fails is the number of times hashcreate and/or hashverify has
    failed on the file.

    Field reported is 0 or 1, indicating whether we've reported on the file.
    """
    dbschem.make_table("checkables")
    if type(dataroot) == str:
        dataroot = [dataroot]

    if type(dataroot) == list:
        for root in dataroot:
            r = Checkable(path=root, type='d', in_db=False, dirty=True)
            r.load()
            r.persist()

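
# Hedged usage sketch: bootstrap the checkables queue from two roots
# (the paths are illustrative):
#
#   Checkable.ex_nihilo(dataroot=['/home', '/projects'])
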

def update_stats(cmf):
    """
    Record the values in tuple cmf (matches, failures) in table cvstats in
    the database. If the table does not exist, create it and seed it with a
    single row.
    """
    result = dbschem.make_table(stats_table)
    db = CrawlDBI.DBI(dbtype="crawler")
    if result == "Created":
        db.insert(table=stats_table,
                  fields=["rowid", "matches", "failures"],
                  data=[(1, 0, 0)])
    db.update(table=stats_table,
              fields=["matches", "failures"],
              data=[cmf],
              where="rowid = 1")
    db.close()

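
# Hedged usage sketch: cmf is a (matches, failures) pair, so after a scan
# that found 100 matches and 2 failures a caller might write (values
# illustrative):
#
#   update_stats((100, 2))
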

def get_last_rpt_time(db):
    """
    Retrieve the last report time from the report table. If make_table() had
    to create the table ('Created' in result), the table is empty, so we
    return 0 to indicate no last report time.
    """
    result = dbschem.make_table("report")
    if "Created" in result:
        rval = 0
    else:
        rows = db.select(table='report', fields=['max(report_time)'])
        rval = rows[0][0]
        if rval is None:
            rval = 0
    CrawlConfig.log("time of last report: %d" % rval)
    return rval

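
# Hedged usage sketch: a reporter might use the returned value as the lower
# bound of its next query (the table and field names here are assumptions):
#
#   last = get_last_rpt_time(db)
#   rows = db.select(table='history', fields=['plugin', 'runtime', 'errors'],
#                    where='runtime > ?', data=(last,))
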

def lscos_populate():
    """
    If table lscos already exists, we're done. Otherwise, retrieve the lscos
    info from hsi, create the table, and fill it in.

    We store the min_size and max_size for each COS as text strings
    containing digits because the largest sizes are already within three
    orders of magnitude of a mysql bigint and growing.
    """
    db = CrawlDBI.DBI(dbtype="crawler")
    tabname = 'lscos'
    st = dbschem.make_table(tabname)
    szrgx = r"(\d+([KMGT]B)?)"
    rgx = (r"\s*(\d+)\s*(([-_a-zA-Z0-9]+\s)+)\s+[UGAN]*\s+(\d+)" +
           r"\s+(ALL)?\s+%s\s+-\s+%s" % (szrgx, szrgx))
    if "Created" == st:
        H = hpss.HSI()
        raw = H.lscos()
        H.quit()

        z = [x.strip() for x in raw.split('\r')]
        rules = [q for q in z if '----------' in q]
        first = z.index(rules[0]) + 1
        second = z[first:].index(rules[0]) + first
        lines = z[first:second]

        data = []
        for line in lines:
            m = U.rgxin(rgx, line)
            (cos, desc, copies, lo_i, hi_i) = (m[0],
                                               m[1].strip(),
                                               m[3],
                                               U.scale(m[5], kb=1024),
                                               U.scale(m[7], kb=1024))
            data.append((cos, desc, copies, lo_i, hi_i))
        db.insert(table=tabname,
                  fields=['cos', 'name', 'copies', 'min_size', 'max_size'],
                  data=data)
        rval = MSG.table_created_S % tabname
    else:
        rval = MSG.table_already_S % tabname
    db.close()
    return rval

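
# A hypothetical 'lscos' line the regex above is built to match (the real
# hsi output format may differ; groups: COS id, name words, [UGAN] flags,
# copy count, optional 'ALL', then 'min - max' sizes):
#
#   '   1   Small Files   U   2   ALL   0 - 4GB'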