def populate_cart_field(pc_l, limit, dryrun, verbose):
    """
    Collect cart info from hsi for a list of (path, cart) pairs.

    We get a list of paths and carts in *pc_l*. The cart values may be empty
    or None. We talk to hsi to collect cart info for each of the paths,
    building a return list of (path, dcart, hcart) tuples for every entry
    whose database cart (*dcart*) disagrees with what hsi reports (*hcart*).

    If 0 < limit, processing stops after *limit* mismatches and True is
    returned instead of the list.

    NOTE(review): the original docstring claims the cart value comes first
    in each tuple (for db.update() matching on path), but the code appends
    (path, dcart, hcart) -- confirm which order callers expect before
    changing either.

    TODO(review): *dryrun* is accepted but never consulted; the advertised
    "report only, change nothing" behavior is not implemented here.

    :param pc_l: iterable of (path, dcart) pairs
    :param limit: stop after this many mismatches when > 0
    :param dryrun: currently unused (see TODO above)
    :param verbose: when truthy, print each mismatch as it is found
    :return: list of (path, dcart, hcart) tuples, or True if limit was hit
    """
    h = hpss.HSI(verbose=True)
    rval = []
    for path, dcart in pc_l:
        info = h.lsP(path)
        # Field 5 of hsi's tab-separated 'ls -P' output holds the cart name
        hcart = info.split("\t")[5].strip()
        if dcart != hcart:
            if 0 < limit:
                # Mismatch counter lives on the function object so it
                # persists across calls
                try:
                    populate_cart_field._count += 1
                except AttributeError:
                    populate_cart_field._count = 1
                if 0 < limit and limit < populate_cart_field._count:
                    # Bug fix: shut down the hsi child before the early
                    # return -- previously this leaked the hsi process
                    h.quit()
                    return True
            rval.append((path, dcart, hcart))
            if verbose:
                # Truncate long paths from the left so the tail (filename)
                # stays visible in the 60-column field
                if 60 < len(path):
                    dpath = '...' + path[-57:]
                else:
                    dpath = path
                print("%-60s %-8s %-10s" % (dpath, dcart, hcart))
    h.quit()
    return rval
def ttype_lookup(pathname, cart=None):
    """
    Return media descriptions for the cart(s) holding *pathname*.

    Use hsi to get the name of the cart where this file lives (unless the
    caller already supplied one). Each cart is looked up via
    ttype_cart_to_desc(), which consults table pvlpv and the *_tape_types
    tables for the type/subtype description.

    :param pathname: HPSS path of the file of interest
    :param cart: optional comma-separated cart name(s); looked up via hsi
        when None or empty
    :return: list of (cart, description) tuples, or None if no cart could
        be determined
    """
    # Ask hsi for the cart name if the caller did not provide one
    if cart is None or cart == '':
        session = hpss.HSI()
        listing = session.lsP(pathname)
        session.quit()
        (ftype, fname, cart, fcos) = U.lsp_parse(listing)
        if not cart:
            return None

    # A file may span several carts (comma-separated); describe each one
    return [(c, ttype_cart_to_desc(c)) for c in cart.split(',')]
def copies_by_cos():
    """
    Return a dict mapping COS id to its copy count, per hsi's lscos report.

    Lines that cos_parse() cannot interpret (headers, separators, blanks)
    are skipped.
    """
    session = hpss.HSI()
    report = session.lscos()
    session.quit()
    parsed = (cos_parse(line) for line in report.split("\n"))
    return dict((entry[0], int(entry[1])) for entry in parsed if entry)
def lscos_populate():
    """
    Create and fill table lscos from hsi's 'lscos' report.

    If table lscos already exists, we're done -- return the "already
    exists" message. Otherwise, retrieve the lscos info from hsi, create
    the table, and fill the table in.

    We store the min_size and max_size for each COS as text strings
    containing digits because the largest sizes are already within three
    orders of magnitude of a mysql bigint and growing.

    :return: MSG.table_created_S or MSG.table_already_S, formatted with
        the table name
    """
    db = CrawlDBI.DBI(dbtype="crawler")
    tabname = 'lscos'
    st = dbschem.make_table(tabname)
    # Bug fix: raw strings so backslash sequences like \d and \s reach the
    # regex engine intact instead of being (deprecated) string escapes
    szrgx = r"(\d+([KMGT]B)?)"
    # Note: '%' binds tighter than '+', so only the second literal is
    # formatted with the size sub-patterns before concatenation
    rgx = (r"\s*(\d+)\s*(([-_a-zA-Z0-9]+\s)+)\s+[UGAN]*\s+(\d+)" +
           r"\s+(ALL)?\s+%s\s+-\s+%s" % (szrgx, szrgx))
    if "Created" == st:
        H = hpss.HSI()
        raw = H.lscos()
        H.quit()
        z = [x.strip() for x in raw.split('\r')]
        # The report body sits between the first two '----------' rule
        # lines; assumes hsi always emits them -- IndexError otherwise
        rules = [q for q in z if '----------' in q]
        first = z.index(rules[0]) + 1
        second = z[first:].index(rules[0]) + first
        lines = z[first:second]
        data = []
        for line in lines:
            m = U.rgxin(rgx, line)
            # m[5]/m[7] are sizes like '64KB'; U.scale() converts them to
            # digit strings using 1024-based units
            (cos, desc, copies, lo_i, hi_i) = (m[0],
                                               m[1].strip(),
                                               m[3],
                                               U.scale(m[5], kb=1024),
                                               U.scale(m[7], kb=1024))
            data.append((cos, desc, copies, lo_i, hi_i))
        db.insert(table=tabname,
                  fields=['cos', 'name', 'copies', 'min_size', 'max_size'],
                  data=data)
        rval = MSG.table_created_S % tabname
    else:
        rval = MSG.table_already_S % tabname
    db.close()
    return rval
def check(self): """ For a directory: - get a list of its contents if possible, - create a Checkable object for each item and persist it to the database - return the list of Checkables found in the directory For a file: - if it already has a hash, add it to the sample if not already and verify it - if it does not have a hash, decide whether to add it or not The value of probability [0.0 .. 1.0] indicates the likelihood with which we should check files. potential outcomes return read a directory list of Checkable objects file checksum fail Alert invalid Checkable type raise StandardError access denied "access denied" verified file checksum "matched" checksum a file "checksummed" skipped a file "skipped" hpss unavailable "unavailable" Here we examine a population member, count it as a member of the population, decide whether to add it to the sample, and if so, count it as a sample member. First, we have to make all the decisions and update the object accordingly. Then, we persist the object to the database. 
""" # fire up hsi # self.probability = probability rval = [] cfg = CrawlConfig.get_config() # hsi_timeout = int(cfg.get_d('crawler', 'hsi_timeout', 300)) try: # h = hpss.HSI(timeout=hsi_timeout, verbose=True) h = hpss.HSI(verbose=True) CrawlConfig.log("started hsi with pid %d" % h.pid()) except hpss.HSIerror as e: return "unavailable" if self.type == 'd': rsp = h.lsP(self.path) if "Access denied" in rsp: rval = "access denied" else: for line in rsp.split("\n"): new = Checkable.fdparse(line) if new is not None: rval.append(new) new.load() new.persist() # returning list of items found in the directory elif self.type == 'f': if self.cart is None: self.populate_cart(h) if self.checksum == 0: if self.has_hash(h): self.add_to_sample(h, already_hashed=True) rval = self.verify(h) # returning "matched", "checksummed", "skipped", or Alert() elif self.addable(): rval = self.add_to_sample(h) # returning "access denied" or "checksummed" else: rval = "skipped" else: rval = self.verify(h) # returning "matched", "checksummed", "skipped", or Alert() else: raise StandardError("Invalid Checkable type: %s" % self.type) if (3 < self.fails) and (0 == self.reported): self.fail_report(h.before()) rval = "skipped" h.quit() self.set('last_check', time.time()) CrawlConfig.log( "Persisting checkable '%s' with %s = %f, %s = %d" % (self.path, 'last_check', self.last_check, 'fails', self.fails)) self.persist() return rval