def mongo_cache(app, func, name, duration, force=False):
    """
    Return the result of `func`, cached in the mongodb 'cache' collection.

    A stored result is reused when a cache record called `name` exists
    that is younger than `duration` seconds. Otherwise - or whenever
    `force` is set - `func` is called without arguments, its result is
    stored as a new cache record and returned.
    """
    from mad3.db import get_db

    cache = get_db(app)['cache']
    lg.debug("cache : %s", name)

    # records for this name that are still inside the cache window,
    # newest first
    oldest_allowed = datetime.now() - timedelta(seconds=duration)
    fresh_records = cache.find({
        'name': name,
        'date': {"$gt": oldest_allowed},
    }).sort('date', -1)

    if not force:
        try:
            hit = next(fresh_records)
        except StopIteration:
            # nothing recent enough in the cache
            lg.debug("no record found")
        else:
            return hit['result']

    lg.debug("run function {}".format(name))
    result = func()
    cache.insert({'name': name, 'date': datetime.now(), 'result': result})
    return result
def create_index(app, args):
    """Create the configured mongodb indexes.

    Builds one ascending single-field index for every field listed under
    app.conf['index']['transient'] and app.conf['index']['transaction'],
    each on the collection of the same name. `args` is not used.
    """
    db = get_db(app)
    index_conf = app.conf['index']

    for field in index_conf['transient']:
        db.transient.create_index([(field, pymongo.ASCENDING)])

    for field in index_conf['transaction']:
        db.transaction.create_index([(field, pymongo.ASCENDING)])
def find_in_db(self, strict=False):
    """
    Look up relations in the database that resemble this one.

    A stored relation matches when it has the same template_sha256 and,
    for every io record of this relation, holds an io entry with the
    same sha256, category and group.

    Returns the matching relation documents as a list. The list is empty
    when this relation has no 'io' data, or when any io record lacks a
    sha256. `strict` is not used.
    """
    conditions = [{'template_sha256': self.template_sha256}]

    if 'io' not in self.data:
        self.app.warning('Cannot query db for a relation '
                         'with no IO files')
        return []

    # one $elemMatch condition per io record of this relation
    for io in self.data['io']:
        if not io.get('sha256'):
            lg.info('Can\'t query, no input shasum for {}'
                    .format(io['filename']))
            return []
        conditions.append({
            'io': {
                '$elemMatch': {
                    'sha256': io['sha256'],
                    'category': io['category'],
                    'group': io['group'],
                },
            },
        })

    db = get_db(self.app)
    return list(db.relation.find({'$and': conditions}))
def drop(app, args):
    """Drop all data from one or more collections.

    Nothing is dropped unless `args.i_know_what_im_doing` is set; without
    it a warning is issued and the function returns. With it, each of the
    transient, core and transaction collections is dropped when the flag
    of the same name on `args` is set.
    """
    if not args.i_know_what_im_doing:
        app.warning("Really drop? Add another command line flag")
        # the confirmation flag is a safety catch: stop here instead of
        # carrying on and destroying data
        return

    db = get_db(app)
    if args.transient:
        db.transient.drop()
    if args.core:
        db.core.drop()
    if args.transaction:
        db.transaction.drop()
def get_allkeys():
    """List the keys used by the documents of the transient collection.

    Runs a map-reduce over db.transient that emits, for every key of
    every document, a count of 1 and the document's `size`, and sums both
    per key. The reduced rows are returned as a list, largest summed size
    first. The temporary '_allkeys' output collection is dropped before
    returning.

    NOTE: `app` is not a parameter; it is taken from the enclosing scope.
    """
    import bson.code

    mapper = bson.code.Code(
        " function () { for (var key in this) {"
        " emit(key, {count: 1, size: this.size}); }; }")
    reducer = bson.code.Code(
        " function(key, values) { result = {count: 0, size: 0};"
        " values.forEach(function(value) {"
        " result.count += value.count; result.size += value.size; });"
        " return result; }")

    db = get_db(app)
    reduced = db.transient.map_reduce(mapper, reducer, out="_allkeys")
    by_size = sorted(reduced.find(),
                     key=lambda row: row['value']['size'],
                     reverse=True)
    db['_allkeys'].drop()
    return by_size
def tfind(app: Type[leip.app], args: Type[argparse.Namespace]):
    """Find relations and print them.

    The query on the 'relation' collection is assembled from the command
    line arguments that are set:

    * id: matched exactly when 64 characters long, otherwise used as a
      prefix of the relation id
    * file: relations having an io record with the sha256 of that file
    * hostname, state: matched as is

    With `args.human` every relation is printed as a one line summary
    followed by its io records sorted on category; without it the raw
    record is printed with nicedictprint.
    """
    relations = get_db(app)['relation']
    argsd = vars(args)  # noqa: T484
    query = {}

    # id is a special case: a short id is treated as a prefix
    tid = argsd.get('id')
    if tid:
        if len(tid) == 64:
            query['_id'] = tid
        else:
            query['_id'] = {'$regex': '^{}'.format(tid)}

    filename = argsd.get('file')
    if filename:
        query['io.sha256'] = MadFile(app, filename).sha256

    # arguments that map one-to-one onto a field of the record
    for key in ('hostname', 'state'):
        if argsd.get(key):
            query[key] = argsd[key]

    for r in relations.find(query):
        if not args.human:
            nicedictprint(r)
            continue

        print("{:>6s} {} {:8} {}:{}".format(
            nicetimedelta(r['time']),
            r['_id'][:10],
            r['state'],
            r['hostname'],
            r['pwd'],
        ))
        for io in sorted(r['io'], key=lambda x: x['category']):
            if io['category'] == 'input':
                marker = '<i'
            else:
                marker = 'o>'
            print(' {} {} ({})'.format(marker, io['filename'],
                                       io['sha256'][:8]))
def find(app, args):
    """Print the filename of every transient record matching all terms.

    Every entry of `args.term` is a 'key=value' string. The key is
    resolved with key_info, and the value is converted with the
    'transformer' of that key. Depending on the 'shape' of the key:

    * 'one': the field must equal the value
    * 'set': the field must contain the value; a key given more than
      once must contain all of the values

    Terms for keys of any other shape do not restrict the search.

    Raises:
        NotImplementedError: when a key of shape 'one' is given more
            than once.
    """
    query = {}
    db = get_db(app)

    for term in args.term:
        rawkey, rawval = term.split('=', 1)
        # NOTE(review): forget() and sum() pass app.conf as the first
        # argument of key_info, this call passes app - confirm which one
        # key_info expects
        keyname, keyinfo = key_info(app, rawkey)
        val = keyinfo['transformer'](rawval)
        lg.info('search: {} {}'.format(keyname, val))

        shape = keyinfo['shape']
        if shape == 'one':
            if keyname in query:
                # NotImplemented is a comparison sentinel and cannot be
                # called or raised; NotImplementedError is the exception
                raise NotImplementedError(
                    "more than one value for key '{}'".format(keyname))
            query[keyname] = {"$in": [val]}
        elif shape == 'set':
            # collect every value given for this key, so that a repeated
            # term narrows the search instead of being dropped
            query.setdefault(keyname, {'$all': []})['$all'].append(val)

    for rec in db.transient.find(query):
        print(rec['filename'])
def forget(app, args):
    """Forget a key, or key value combination.

    Only one case is implemented: when `args.value` is given and the key
    has shape 'set', that value is pulled from the key in every record
    of the transient and core collections.

    Raises:
        NotImplementedError: when no value is given, or when the key does
            not have shape 'set' (a key without a shape counts as 'one').
    """
    key, kinfo = key_info(app.conf, args.key)
    db = get_db(app)

    # NotImplemented is a comparison sentinel and cannot be called or
    # raised; NotImplementedError is the exception for missing features

    if not args.value:
        lg.warning("forget key {}".format(key))
        raise NotImplementedError(
            "forgetting a complete key is not implemented")

    lg.warning("forget key=value {}={}".format(key, args.value))
    if kinfo.get('shape', 'one') != 'set':
        raise NotImplementedError(
            "forgetting a value is only implemented for keys of "
            "shape 'set'")

    db.transient.update({}, {"$pull": {key: args.value}},
                        upsert=False, multi=True)
    db.core.update({}, {"$pull": {key: args.value}},
                   upsert=False, multi=True)
def _single_sum(app, group_by=None, force=False):
    """
    Total the size and the number of transient records per group.

    The records are unwound on the `group_by` field and grouped on it.
    Returns a list with one dict per group holding the group value
    ('_id'), the summed size ('total') and the number of records
    ('count'), largest total first. `force` is not used.
    """
    field_ref = "${}".format(group_by)
    pipeline = [
        {'$unwind': field_ref},
        {'$group': {
            "_id": field_ref,
            "total": {"$sum": "$size"},
            "count": {"$sum": 1},
        }},
        {"$sort": {"total": -1}},
    ]
    return list(get_db(app).transient.aggregate(pipeline))
def scan(app, args):
    """Bring the transient collection in line with the files below the cwd.

    Loads the (filename, mtime, size) of every transient record whose
    filename lies below the current working directory, lists the files
    actually present with unix `find` (filtered through ~/.madignore when
    that file exists) and compares the two sets:

    * records whose (filename, mtime, size) is no longer found on disk
      are removed from the transient collection;
    * files that are new or changed - or every file found when
      `args.refresh` is set - are loaded as a MadFile (passing
      `args.quick`) and saved when dirty. Files that cannot be read are
      counted and skipped.

    Counters are kept in `app.counter` and printed along the way; the
    runtime is printed at the end.
    """
    import re
    import shlex

    basedir = os.getcwd().rstrip('/') + '/'
    db = get_db(app)
    starttime = time.time()
    app.bulk_init()

    # The path is matched as a regular expression. Escape it: an
    # unescaped '.', '+' or '(' in a directory name would select records
    # outside this directory (which are then treated as deleted) or be
    # rejected as an invalid pattern.
    ff_regex = "^{}".format(re.escape(basedir))
    lg.info("Query database for files below\n {}".format(basedir))
    allfilesdb = db.transient.find({'filename': {"$regex": ff_regex}},
                                   projection=['filename', 'mtime', 'size'])

    file2id = {}
    allfiles = set()
    for x in allfilesdb:
        allfiles.add((x['filename'], x['mtime'], x['size']))
        file2id[x['filename']] = x['_id']
    lg.info("Found {} files in db".format(len(allfiles)))

    madignore = os.path.expanduser('~/.madignore')
    cwd = os.path.abspath(os.path.normpath(os.getcwd()))
    # quote the path: the command goes through a shell, where a space or
    # a metacharacter in the directory name would split or alter it
    cl = r"find {} -type f -printf '%p\t%T@\t%s\n'".format(shlex.quote(cwd))
    if os.path.exists(madignore):
        cl += ' | grep -v -f ~/.madignore'

    lg.info('running unix find')
    P = sp.Popen(cl, shell=True, stdout=sp.PIPE, stderr=sp.DEVNULL)
    out, _ = P.communicate()

    def cnv2(l):
        # one line of find output: path <tab> mtime <tab> size; split
        # from the right so that a tab inside the path is kept
        p, m, s = l.rsplit("\t", 2)
        m = datetime.fromtimestamp(int(math.floor(float(m))))
        s = int(s)
        return (p, m, s)

    found = [cnv2(line)
             for line in map(str.strip, out.decode().split('\n'))
             if line]
    lg.info("unix find found {} files".format(len(found)))
    now = set(found)

    if args.refresh:
        changed = list(now)
    else:
        changed = list(now - allfiles)
    # a changed file is listed here as well, under its old mtime/size
    deleted = list(allfiles - now)

    app.counter['indb'] = len(allfiles)
    app.counter['onfs'] = len(now)
    app.counter['check'] = len(changed)
    app.counter['rm'] = len(deleted)

    lg.info('in database : {:>8d}'.format(len(allfiles)))
    lg.info('on filesystem : {:>8d}'.format(len(now)))
    lg.info('total new/changed : {:>8d}'.format(len(changed)))
    lg.info('total deleted : {:>8d}'.format(len(deleted)))

    # log a small sample of what changed
    for i, c in enumerate(sorted(changed)[:5]):
        lg.info('changed: {} path {}'.format(i, c[0]))
        lg.info(' mtime {}'.format(c[1]))
        lg.info(' size {}'.format(c[2]))

    delids = [file2id[x[0]] for x in deleted]
    db.transient.remove({'_id': {"$in": delids}})

    changed = {f[0] for f in changed}
    lg.info("{} files seem changed".format(len(changed)))

    lastscreenupdate = time.time()

    # store in database
    for filename in changed:
        app.counter['changed'] += 1
        try:
            mfile = MadFile(app, filename, quick=args.quick)
        except PermissionError:
            app.counter['noaccess'] += 1
            continue

        if mfile.dirty:
            mfile.save()

        if time.time() - lastscreenupdate > 2:
            print_counter(app.counter)
            lastscreenupdate = time.time()

    app.bulk_execute()
    print_counter(app.counter)

    # ensure we end on a newline
    print("\nruntime: {:.4f}".format(time.time() - starttime))
def scan2(app, args):
    """Bring the transient collection in line with the files below the cwd.

    Streaming variant of scan(): the output of unix `find` (filtered
    through ~/.madignore when that file exists) is handled line by line.

    * A file whose (filename, mtime, size) is not in the database - or
      any file found when `args.refresh` is set - is loaded as a MadFile
      (passing `args.quick`) and saved when dirty. Files that cannot be
      read are counted and skipped.
    * After the listing is complete, records whose (filename, mtime,
      size) was not seen on disk are removed from the transient
      collection.

    Counters are kept in `app.counter` and printed along the way; the
    runtime is printed at the end.
    """
    import re
    import shlex

    basedir = os.getcwd().rstrip('/') + '/'
    db = get_db(app)
    lastscreenupdate = starttime = time.time()
    app.bulk_init()

    # The path is matched as a regular expression. Escape it: an
    # unescaped '.', '+' or '(' in a directory name would select records
    # outside this directory (which are then treated as deleted) or be
    # rejected as an invalid pattern.
    ff_regex = "^{}".format(re.escape(basedir))
    lg.info("Query database for files below\n {}".format(basedir))
    allfilesdb = db.transient.find({'filename': {"$regex": ff_regex}},
                                   projection=['filename', 'mtime', 'size'])

    file2id = {}
    allfiles = set()
    for x in allfilesdb:
        allfiles.add((x['filename'], x['mtime'], x['size']))
        file2id[x['filename']] = x['_id']

    lg.info("Found {} files in db".format(len(allfiles)))
    app.counter['indb'] = len(allfiles)
    app.message('Files in db: {}'.format(len(allfiles)))

    # run Unix FIND (seems the fastest way, plus we get the benefit
    # of adding in grep)
    madignore = os.path.expanduser('~/.madignore')
    cwd = os.path.abspath(os.path.normpath(os.getcwd()))
    # quote the path: the command goes through a shell, where a space or
    # a metacharacter in the directory name would split or alter it
    cl = r"find {} -type f -printf '%p\t%T@\t%s\n'".format(shlex.quote(cwd))
    if os.path.exists(madignore):
        cl += ' | grep -v -f ~/.madignore'

    lg.info('running unix find')

    def cnv2(l):
        # one line of find output: path <tab> mtime <tab> size; split
        # from the right so that a tab inside the path is kept
        p, m, s = l.rsplit("\t", 2)
        m = datetime.fromtimestamp(int(math.floor(float(m))))
        s = int(s)
        return (p, m, s)

    changed = set()
    onfs = set()

    # No bufsize=1: line buffering is only available for text mode pipes,
    # and this pipe is read as bytes.
    P = sp.Popen(cl, shell=True, stdout=sp.PIPE, stderr=sp.DEVNULL)
    with P.stdout as uxfind:
        # iterate the pipe itself so files are handled while find runs
        for rawline in uxfind:
            line = rawline.strip().decode()
            if not line:
                # skip blank lines before parsing; cnv2 cannot split them
                continue
            ffile = cnv2(line)

            onfs.add(ffile)
            app.counter['seenonfs'] += 1
            in_allfiles = ffile in allfiles

            if in_allfiles and not args.refresh:
                # ignoring this file, it exists, and has not changed
                app.counter['notnew'] += 1
                continue

            # needs refreshing
            filename = ffile[0]
            changed.add(filename)
            if in_allfiles:
                app.counter['refresh'] += 1
            else:
                app.counter['new'] += 1

            try:
                mfile = MadFile(app, filename, quick=args.quick)
            except PermissionError:
                app.counter['noxs'] += 1
                continue

            if mfile.dirty:
                mfile.save()

            if time.time() - lastscreenupdate > 2:
                print_counter(app.counter)
                lastscreenupdate = time.time()

    # reap the finished child process
    P.wait()

    app.bulk_execute()

    # a changed file is listed here as well, under its old mtime/size
    deleted = list(allfiles - onfs)
    app.counter['onfs'] = len(onfs)
    app.counter['rm'] = len(deleted)

    delids = [file2id[x[0]] for x in deleted]
    db.transient.remove({'_id': {"$in": delids}})

    lg.info("{} files seem changed".format(len(changed)))
    print_counter(app.counter)

    # ensure we end on a newline
    print("\nruntime: {:.4f}".format(time.time() - starttime))
def __init__(self, app, filename, quick=False):
    """Prepare the MadFile.

    Resolves `filename` to an absolute path, determines the transient id
    of the file and loads the transient record with that id, or builds a
    stub record when none exists. Unless `quick` is set, checksums are
    calculated where needed and the fields of the core record for the
    sha256 are copied into the transient record. A transient record that
    ends up dirty is written to the database (queued when the app is in
    bulk mode). Finally the 'onload' hook is run.

    Args:
        app: application object; its counter, conf, run_hook and, when
            present, bulk_mode / bulk_transient attributes are used.
        filename: path of the file; '~' is expanded and the path is made
            absolute.
        quick: when true no checksums are calculated; a new record gets
            '0' for both checksums and the core record is not consulted.

    Raises:
        FileNotFound: when the file does not exist.
        PermissionError: when the file is not readable.
    """
    self.app = app
    self.quick = quick
    self.dirty = False  # True while transient_rec differs from the db

    self.app.counter['init_madfile'] += 1

    filename = os.path.abspath(os.path.expanduser(filename))

    if not os.path.exists(filename):
        lg.warning("{} does not exist:".format(filename))
        raise FileNotFound()

    if not os.access(filename, os.R_OK):
        raise PermissionError("m3 cannot read file {}".format(filename))

    self.filename = filename
    self.filestat = os.stat(filename)

    # step one - determine transient id of this file
    lg.debug('calc transient id for {}'.format(self.filename))
    self.transient_id = self.get_transient_id()
    lg.debug(' -- transient id is {}'.format(self.transient_id))

    # check the database if a record with the transient id exists
    self.db = get_db(app)
    lg.debug('check transient rec for {}'.format(self.filename))
    self.transient_rec = self.db.transient.find_one(
        {'_id': self.transient_id})

    # if there is no transient rec, calculate core id
    if self.transient_rec is None:
        self.app.counter['-trans'] += 1
        lg.debug('transient record does not exist')

        # needs to be saved now
        self.dirty = True

        if self.quick:
            # quick mode: '0' marks checksums that were not calculated
            self.app.counter['nochksum'] += 1
            self.sha1, self.sha256 = '0', '0'
        else:
            # not quick, calculate all shasums
            # calculate fresh sha256
            self.app.counter['chksum'] += 1
            self.sha1, self.sha256 = self.calculate_checksum()

        # create a stub transient rec
        self.transient_rec = {
            '_id': self.transient_id,
            'sha256': self.sha256,
            'sha1': self.sha1,
            'filename': self.filename,
            'hostname': self.app.conf['hostname']
        }

        # and fill up the transient record, see if there are changes
        self.refresh()

    else:
        lg.debug('transient record found!')
        self.app.counter['transload'] += 1
        self.sha256 = self.transient_rec['sha256']
        self.sha1 = self.transient_rec['sha1']

        # check if this was a Q&D (quick) record and this is not a Q&D
        # call: the '0' placeholder checksums are replaced by real ones
        if not self.quick and (self.sha256 == '0' or self.sha1 == '0'):
            self.app.counter['unquicken'] += 1
            self.sha1, self.sha256 = self.calculate_checksum()
            self.transient_rec['sha256'] = self.sha256
            self.transient_rec['sha1'] = self.sha1
            self.dirty = True

        # refresh the transient record - see if there are changes
        # (presumably refresh() sets self.dirty when it finds a
        # difference - TODO confirm)
        self.refresh()

        # if something changed - recalc the sha256
        if self.dirty:
            if self.quick:
                # quick mode: note the record is dirty, skip the checksum
                self.app.counter['~dirty'] += 1
                #print('dirty?', self.filename)
            else:
                self.app.counter['re-chksum'] += 1
                newsha1, newsha256 = self.calculate_checksum()
                if newsha256 == self.transient_rec['sha256']:
                    self.app.counter['sha256_ok'] += 1
                else:
                    self.app.counter['sha256_change!'] += 1
                    self.transient_rec['sha1'] = newsha1
                    self.transient_rec['sha256'] = newsha256
                    # NOTE(review): self.sha1 / self.sha256 keep their
                    # old values here, so the core record lookup below
                    # uses the previous checksum - confirm this is
                    # intended
                    # TODO: Create a transaction!!!
                    # TODO: copy core record data??

    if not self.quick:
        assert self.sha256 != "0"
        # the core record is keyed on the sha256 of the file
        self.core_rec = self.db.core.find_one({'_id': self.sha256})
        if self.core_rec is None:
            lg.debug('Core rec not found')
            self.core_rec = {'_id': self.sha256,
                             'sha1': self.sha1}
        else:
            # copy every core field that is missing from, or different
            # in, the transient record
            for k, v in self.core_rec.items():
                if k == '_id':
                    continue
                if (k not in self.transient_rec) or \
                        (self.transient_rec[k] != v):
                    self.transient_rec[k] = v
                    self.dirty = True

    if self.dirty:
        lg.debug('dirty transient rec, saving')
        if getattr(self.app, 'bulk_mode', False):
            # bulk mode: queue the upsert on the app's bulk operation
            lg.debug('pepare bulk insert for {}'.format(self.filename))
            self.app.bulk_transient\
                .find({'_id': self.transient_id})\
                .upsert()\
                .update({"$set": self.transient_rec})
        else:
            self.db.transient.update({'_id': self.transient_id},
                                     self.transient_rec, upsert=True)
        self.dirty = False

    self.app.run_hook('onload', self)
def bulk_init(app):
    """Switch `app` to bulk mode.

    Sets app.bulk_mode and attaches a fresh unordered bulk operation for
    the transient collection (app.bulk_transient) and one for the core
    collection (app.bulk_core).
    """
    lg.debug("start bulk mode")
    db = get_db(app)
    transient, core = db.transient, db.core

    app.bulk_mode = True
    app.bulk_transient = transient.initialize_unordered_bulk_op()
    app.bulk_core = core.initialize_unordered_bulk_op()
def sum(app, args):
    """Show record counts and total sizes of the database.

    Without `args.key`: print the number of transient records, their
    total size (when there are any) and the number of core records.

    With `args.key`: print, for every value of that key, the total size
    and the number of transient records, followed by a grand total.
    `args.human` selects aligned, human readable output instead of tab
    separated raw numbers; `args.force` is passed on to _single_sum.
    """
    if not args.key:
        db = get_db(app)
        notrans = db.transient.count()
        print("No Transient records: ", notrans)
        if notrans > 0:
            totals = list(db.transient.aggregate([{
                "$group": {
                    "_id": None,
                    "total": {"$sum": "$size"}
                }
            }]))
            print("Total data Transient: ", nicesize(totals[0]['total']))
        # the core collection has its own record count
        print(" No Core records: ", db.core.count())
        return

    kname, kinfo = key_info(app.conf, args.key)
    res = _single_sum(app, group_by=kname, force=args.force)

    undefined_label = "<undefined>"  # shown for a group without a value

    # width of the first column: the widest label that will be printed
    mgn = len("Total")
    for reshost in res:
        gid = reshost['_id']
        if gid is None:
            mgn = max(len(undefined_label), mgn)
        else:
            mgn = max(len(str(gid)), mgn)

    fms = "{:" + str(mgn) + "}\t{:>10}\t{:>9}"

    if args.human:
        print("# {}:".format(kname))

    total_size = 0
    total_count = 0
    for reshost in res:
        total = reshost['total']
        count = reshost['count']
        total_size += int(total)
        total_count += count

        if args.human:
            categ = reshost['_id']
            if categ is None:
                categ = undefined_label
            print(fms.format(categ, nicesize(total), nicenumber(count)))
        else:
            print("{}\t{}\t{}".format(reshost['_id'], total, count))

    if args.human:
        print(fms.format('', '-' * 10, '-' * 9))
        print(fms.format("Total", nicesize(total_size),
                         nicenumber(total_count)))
    else:
        print("Total\t{}\t{}".format(total_size, total_count))
def save(self):
    """Insert the data of this relation into the 'relation' collection."""
    relations = get_db(self.app)['relation']
    relations.insert(self.data)