Beispiel #1
0
def _run_mongo_command(app, name, collection, query, kwargs=None, force=False):
    """
    Run a database command through the transient collection's database.

    `query` is passed as the command and `collection` as its value to
    `database.command`; the entries of `kwargs` (if any) are forwarded as
    extra keyword arguments. `name` and `force` are accepted but not used
    by this function.

    Returns whatever `database.command` returns.
    """
    # A None default avoids sharing one mutable dict between all calls.
    extra = {} if kwargs is None else kwargs
    transient = get_mongo_transient_db(app)
    return transient.database.command(query, collection, **extra)
Beispiel #2
0
def notebooks(app, args):
    """
    List notebook records stored for the current directory tree.

    Queries the transient collection for records with extension 'ipynb'
    on this host whose 'fullpath' starts with the current working
    directory, sorted by 'mtime' descending and limited to `args.limit`.
    For every record the age, kernel and (when present) project are
    printed on one line, followed by the path relative to the current
    directory.
    """
    db = get_mongo_transient_db(app)

    curdir = os.path.abspath(os.getcwd())
    # Raw string: '\.' in a plain literal is an invalid escape sequence
    # (a warning today, an error in future Python versions).
    rex = re.compile('^' + re.escape(curdir) + r'.*\.ipynb$', re.IGNORECASE)

    query = {
        'extension': 'ipynb',
        'host': socket.gethostname(),
        'fullpath': {'$regex': rex},
    }

    results = db.find(query).sort('mtime', -1).limit(args.limit)

    for res in results:
        relpath = os.path.relpath(res['fullpath'])
        age = humanize.naturaltime(datetime.utcnow() - res['mtime'])
        project = res.get('project')
        kernel = res['ipy_kernel']

        # Make it obvious that the printed path is relative.
        if not relpath.startswith('.'):
            relpath = './' + relpath

        cprint(age, 'yellow', end=', ')
        cprint(kernel, 'green', end="")
        if project is not None:
            print(", ", end="")
            cprint(project, 'red')
        else:
            print()
        print("    ", relpath)
Beispiel #3
0
def _run_mongo_command(app, name, collection, query, kwargs=None, force=False):
    """
    Run a database command through the transient collection's database.

    `query` is passed as the command and `collection` as its value to
    `database.command`; the entries of `kwargs` (if any) are forwarded as
    extra keyword arguments. `name` and `force` are accepted but not used
    by this function.

    Returns whatever `database.command` returns.
    """
    # A None default avoids sharing one mutable dict between all calls.
    extra = {} if kwargs is None else kwargs
    transient = get_mongo_transient_db(app)
    return transient.database.command(query, collection, **extra)
Beispiel #4
0
def _run_waste_command(app, name, force=False):
    """
    Run the waste aggregation pipeline on the transient collection.

    Aggregates FIND_WASTER_PIPELINE with `allowDiskUse=True` and returns
    the outcome materialised as a list. `name` and `force` are accepted
    but not used here.
    """
    transient = get_mongo_transient_db(app)
    outcome = transient.aggregate(FIND_WASTER_PIPELINE, allowDiskUse=True)
    return [entry for entry in outcome]
Beispiel #5
0
def mongo_flush(app):
    """
    Write the pending save and remove caches to the transient collection.

    Entries of MONGO_SAVE_CACHE (pairs of id and record) are upserted in
    one unordered bulk operation; ids in MONGO_REMOVE_CACHE are deleted.
    COUNTER['saved'] and COUNTER['removed'] are incremented per record
    and both caches are reset to empty lists afterwards. Returns early,
    touching nothing, when both caches are empty.
    """
    global MONGO_REMOVE_CACHE
    global MONGO_SAVE_CACHE
    global COUNTER

    lg.debug("flush")
    if not MONGO_SAVE_CACHE and not MONGO_REMOVE_CACHE:
        lg.debug("nothing to flush")
        return

    collection = get_mongo_transient_db(app)

    if MONGO_SAVE_CACHE:
        bulk = collection.initialize_unordered_bulk_op()
        for record_id, record in MONGO_SAVE_CACHE:
            COUNTER['saved'] += 1
            bulk.find({'_id': record_id}).upsert().replace_one(record)
        res = bulk.execute()
        lg.debug("Saved %d records", res['nModified'])

    # Remove in batches with a single `$in` query per batch instead of
    # one round trip per id; batching keeps each query document small.
    batch_size = 1000
    pending = list(MONGO_REMOVE_CACHE)
    for offset in range(0, len(pending), batch_size):
        batch = pending[offset:offset + batch_size]
        for record_id in batch:
            lg.info("removing id: %s", record_id)
        collection.remove({'_id': {'$in': batch}})
        COUNTER['removed'] += len(batch)

    MONGO_SAVE_CACHE = []
    MONGO_REMOVE_CACHE = []
Beispiel #6
0
def _run_waste_command(app, name, force=False):
    """
    Run the waste aggregation pipeline on the transient collection.

    Aggregates FIND_WASTER_PIPELINE with `allowDiskUse=True` and hands
    back the object returned by `aggregate` unchanged. `name` and `force`
    are accepted but not used here.
    """
    return get_mongo_transient_db(app).aggregate(
        FIND_WASTER_PIPELINE, allowDiskUse=True)
Beispiel #7
0
def _single_sum(app, group_by=None, force=False):
    """
    Sum file sizes and record counts per value of one field.

    Only records with `orphan` set to False are considered. They are
    grouped on the field named by `group_by`; each group carries the sum
    of `filesize` as "total" and the number of records as "count". Groups
    are sorted by "total", largest first, and returned as a list.
    `force` is accepted but not used here.
    """
    only_non_orphans = {"$match": {"orphan": False}}
    per_group_totals = {
        '$group': {
            "_id": "${}".format(group_by),
            "total": {"$sum": "$filesize"},
            "count": {"$sum": 1},
        }
    }
    largest_first = {"$sort": {"total": -1}}

    collection = get_mongo_transient_db(app)
    outcome = collection.aggregate(
        [only_non_orphans, per_group_totals, largest_first])
    return list(outcome)
Beispiel #8
0
def mongo_flush(app):
    """
    Write the pending save and remove caches to the transient collection.

    Entries of MONGO_SAVE_CACHE (pairs of id and record) are upserted in
    one unordered bulk operation; ids in MONGO_REMOVE_CACHE are deleted.
    COUNTER['saved'] and COUNTER['removed'] are incremented per record
    and both caches are reset to empty lists afterwards. Returns early,
    touching nothing, when both caches are empty.
    """
    global MONGO_REMOVE_CACHE
    global MONGO_SAVE_CACHE
    global COUNTER

    lg.debug("flush")
    if not MONGO_SAVE_CACHE and not MONGO_REMOVE_CACHE:
        lg.debug("nothing to flush")
        return

    collection = get_mongo_transient_db(app)

    if MONGO_SAVE_CACHE:
        bulk = collection.initialize_unordered_bulk_op()
        for record_id, record in MONGO_SAVE_CACHE:
            COUNTER['saved'] += 1
            bulk.find({'_id': record_id}).upsert().replace_one(record)

        res = bulk.execute()
        lg.debug("Saved %d records", res['nModified'])

    # Remove in batches with a single `$in` query per batch instead of
    # one round trip per id; batching keeps each query document small.
    batch_size = 1000
    pending = list(MONGO_REMOVE_CACHE)
    for offset in range(0, len(pending), batch_size):
        batch = pending[offset:offset + batch_size]
        for record_id in batch:
            lg.info("removing id: %s", record_id)
        collection.remove({'_id': {'$in': batch}})
        COUNTER['removed'] += len(batch)

    MONGO_SAVE_CACHE = []
    MONGO_REMOVE_CACHE = []
Beispiel #9
0
def distinct(app, args):
    """
    Print the distinct values stored under `args.keyname`.

    Values come from the transient collection. A None value is reported
    first as '<None>' in yellow; the remaining values are printed in
    sorted order, double-quoted, in green.
    """
    transient = get_mongo_transient_db(app)
    values = transient.distinct(args.keyname)

    if None in values:
        cprint('<None>', 'yellow')

    # None has to be dropped *before* sorting: Python 3 cannot order None
    # against other values, so sorting the raw list raised TypeError.
    for value in sorted(v for v in values if v is not None):
        # format() instead of '+' so non-string values print as well.
        cprint('"{}"'.format(value), 'green')
Beispiel #10
0
def distinct(app, args):
    """
    Print the distinct values stored under `args.keyname`.

    Values come from the transient collection. A None value is reported
    first as '<None>' in yellow; the remaining values are printed in
    sorted order, double-quoted, in green.
    """
    transient = get_mongo_transient_db(app)
    values = transient.distinct(args.keyname)

    if None in values:
        cprint('<None>', 'yellow')

    # None has to be dropped *before* sorting: Python 3 cannot order None
    # against other values, so sorting the raw list raised TypeError.
    for value in sorted(v for v in values if v is not None):
        # format() instead of '+' so non-string values print as well.
        cprint('"{}"'.format(value), 'green')
Beispiel #11
0
def mongo_drop(app, args):
    """
    Drop the transient collection.

    Nothing is dropped unless `args.force` is set: without it a hint is
    printed and the process exits.
    """
    if not args.force:
        print("use --force to really drop the database")
        # Equivalent to the site-provided exit(), but does not depend on
        # the `site` module having injected that helper.
        raise SystemExit

    get_mongo_transient_db(app).drop()
Beispiel #12
0
def mongo_drop(app, args):
    """
    Drop the transient collection.

    Nothing is dropped unless `args.force` is set: without it a hint is
    printed and the process exits.
    """
    if not args.force:
        print("use --force to really drop the database")
        # Equivalent to the site-provided exit(), but does not depend on
        # the `site` module having injected that helper.
        raise SystemExit

    get_mongo_transient_db(app).drop()
Beispiel #13
0
def _complex_sum(app, name, fields=('username', 'host'),
                 force=False):
    """
    Sum file sizes and record counts per combination of `fields`.

    Records of the transient collection are grouped on the given field
    names; every group carries the sum of `filesize` as "total" and the
    number of records as "count". `name` and `force` are accepted but not
    used here.

    Returns the list of group documents.
    """
    collection = get_mongo_transient_db(app)
    group_id = {field: "$" + field for field in fields}
    pipeline = [{'$group': {
        "_id": group_id,
        "total": {"$sum": "$filesize"},
        "count": {"$sum": 1}}}]
    res = collection.aggregate(pipeline)

    # Older pymongo returns a dict holding the documents under 'result';
    # newer versions return a cursor that cannot be subscripted.
    if isinstance(res, dict):
        return res['result']
    return list(res)
Beispiel #14
0
def mongo_del(app, args):
    """
    Delete the record whose id is `args.mongo_id`.

    The record is removed from the core collection when `args.core` is
    set, otherwise from the transient collection.
    """
    open_collection = (
        get_mongo_core_db if args.core else get_mongo_transient_db)
    open_collection(app).remove({'_id': args.mongo_id})
Beispiel #15
0
def mongo_del(app, args):
    """
    Delete the record whose id is `args.mongo_id`.

    The record is removed from the core collection when `args.core` is
    set, otherwise from the transient collection.
    """
    open_collection = (
        get_mongo_core_db if args.core else get_mongo_transient_db)
    open_collection(app).remove({'_id': args.mongo_id})
Beispiel #16
0
def mongo_last(app, args):
    """
    Print the most recently saved records of the transient collection.

    Records are sorted by `save_time`, newest first, and limited to
    `args.no`. Each one is printed as a tab-separated line with the
    humanized save time, the filename and the record id.
    """
    collection = get_mongo_transient_db(app)
    res = collection.aggregate([
        {"$sort": {"save_time": -1}},
        {"$limit": args.no},
    ])

    # Older pymongo returns a dict holding the documents under 'result';
    # newer versions return a cursor that cannot be subscripted.
    records = res['result'] if isinstance(res, dict) else res

    for i, r in enumerate(records):
        # `>=` so that never more than `args.no` lines are printed, even
        # if the server side limit were not applied.
        if i >= args.no:
            break
        print("\t".join(
            [arrow.get(r['save_time']).humanize(),
             r['filename'],
             # str() keeps join() working for ids that are not strings.
             str(r.get('_id', ''))]))
Beispiel #17
0
def waste(app, args):
    """
    Report the files that waste the most space.

    Runs the waste pipeline via `_run_waste_command`. With `args.todb`
    the outcome is stored, together with the current UTC time, in the
    `waste` collection and nothing is printed. Otherwise at most
    `args.no_records` pipeline entries are walked; for each entry with a
    non-blank sha1sum one line is printed showing the sha1sum, the wasted
    size, the file size, the number of copies per host and the owners.
    """
    transient = get_mongo_transient_db(app)

    wasters = _run_waste_command(app, 'waste_pipeline',
                                 force=args.force)

    if args.todb:
        snapshot = {'time': datetime.datetime.utcnow(),
                    'data': wasters}
        mad2.util.get_mongo_db(app).waste.insert_one(snapshot)
        return

    def cprint_nocolor(*values, **options):
        # Colourless stand-in for cprint: drops the colour information
        # and prints only the first positional argument.
        options.pop('color', None)
        print(*values[:1], **options)

    # if args.no_color:
    #     cprint = cprint_nocolor

    for position, entry in enumerate(wasters):
        if position >= args.no_records:
            break

        sha1sum = entry['_id']
        if not sha1sum.strip():
            continue

        cprint(sha1sum, 'yellow', end='')
        cprint(" sz ", "grey", end="")
        cprint("{:>9}".format(humansize(entry['waste'])), end='')
        cprint(" w ", "grey", end="")
        cprint("{:>9}".format(humansize(entry['filesize'])), end='')

        # Tally copies (and size, corrected for hard links) per host.
        hostcount = collections.defaultdict(int)
        hostsize = collections.defaultdict(int)
        owners = set()
        for copy_rec in transient.find({'sha1sum': sha1sum}):
            owners.add(copy_rec['username'])
            hostname = copy_rec['host']
            hostcount[hostname] += 1
            hostsize[hostname] += (
                float(copy_rec['filesize']) / float(copy_rec['nlink']))

        for hostname, n_copies in hostcount.items():
            print(' ', end='')
            cprint(hostname, 'green', end=':')
            cprint(n_copies, 'cyan', end="")

        cprint(" ", end="")
        cprint(", ".join(owners), 'red')
Beispiel #18
0
def mongo_remove_key(app, args):
    """
    Remove one key from every record that has it.

    `args.database` selects the collection ('core' or 'transient') and
    `args.key` names the key to unset. The collection, query and update
    document are printed before the update is issued.

    Raises:
        ValueError: if `args.database` is neither 'core' nor 'transient'.
    """
    lg.warning("removing %s from the %s db", args.key, args.database)
    if args.database == 'core':
        COLLECTION = get_mongo_core_db(app)
    elif args.database == 'transient':
        # The original tested the misspelled `args.databse`, which raised
        # AttributeError for every database other than 'core'.
        COLLECTION = get_mongo_transient_db(app)
    else:
        raise ValueError(
            "unknown database: {!r}".format(args.database))

    print(COLLECTION)
    query = {args.key: {'$exists': True}}
    update = {"$unset": {args.key: ""}}
    print(query)
    print(update)
    COLLECTION.update(query, update, multi=True)
Beispiel #19
0
def mongo_index(app, args):
    """
    Ensure indexes on the relevant fields

    The fields to index are read from the configuration keys
    'plugin.mongo.indici.core' and 'plugin.mongo.indici.transient'; every
    configured field is passed to `ensure_index` on the matching
    collection.

    Raises:
        ValueError: if a configured index value is anything other than 1.
    """
    MONGO_trans = get_mongo_transient_db(app)
    MONGO_core = get_mongo_core_db(app)

    core_index = app.conf['plugin.mongo.indici.core']
    trans_index = app.conf['plugin.mongo.indici.transient']
    for db, flds in [(MONGO_trans, trans_index), (MONGO_core, core_index)]:
        for k, v in flds.items():
            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would then accept any value.
            if v != 1:
                raise ValueError(
                    "unsupported index value for {!r}: {!r}".format(k, v))
            db.ensure_index(k)
Beispiel #20
0
def mongo_remove_key(app, args):
    """
    Remove one key from every record that has it.

    `args.database` selects the collection ('core' or 'transient') and
    `args.key` names the key to unset. The collection, query and update
    document are printed before the update is issued.

    Raises:
        ValueError: if `args.database` is neither 'core' nor 'transient'.
    """
    lg.warning("removing %s from the %s db", args.key, args.database)
    if args.database == 'core':
        COLLECTION = get_mongo_core_db(app)
    elif args.database == 'transient':
        # The original tested the misspelled `args.databse`, which raised
        # AttributeError for every database other than 'core'.
        COLLECTION = get_mongo_transient_db(app)
    else:
        raise ValueError(
            "unknown database: {!r}".format(args.database))

    print(COLLECTION)
    query = {args.key: {'$exists': True}}
    update = {"$unset": {args.key: ""}}
    print(query)
    print(update)
    COLLECTION.update(query, update, multi=True)
Beispiel #21
0
def waste(app, args):
    """
    Report the files that waste the most space.

    Runs the waste pipeline via `_run_waste_command`. With `args.todb`
    the outcome is stored, together with the current UTC time, in the
    `waste` collection and nothing is printed. Otherwise at most
    `args.no_records` pipeline entries are walked; for each entry with a
    non-blank sha1sum one line is printed showing the sha1sum, the wasted
    size, the file size, the number of copies per host and the owners.
    """
    transient = get_mongo_transient_db(app)

    wasters = _run_waste_command(app, 'waste_pipeline', force=args.force)

    if args.todb:
        snapshot = {'time': datetime.datetime.utcnow(), 'data': wasters}
        mad2.util.get_mongo_db(app).waste.insert_one(snapshot)
        return

    def cprint_nocolor(*values, **options):
        # Colourless stand-in for cprint: drops the colour information
        # and prints only the first positional argument.
        options.pop('color', None)
        print(*values[:1], **options)

    # if args.no_color:
    #     cprint = cprint_nocolor

    for position, entry in enumerate(wasters):
        if position >= args.no_records:
            break

        sha1sum = entry['_id']
        if not sha1sum.strip():
            continue

        cprint(sha1sum, 'yellow', end='')
        cprint(" sz ", "grey", end="")
        cprint("{:>9}".format(humansize(entry['waste'])), end='')
        cprint(" w ", "grey", end="")
        cprint("{:>9}".format(humansize(entry['filesize'])), end='')

        # Tally copies (and size, corrected for hard links) per host.
        hostcount = collections.defaultdict(int)
        hostsize = collections.defaultdict(int)
        owners = set()
        for copy_rec in transient.find({'sha1sum': sha1sum}):
            owners.add(copy_rec['username'])
            hostname = copy_rec['host']
            hostcount[hostname] += 1
            hostsize[hostname] += (
                float(copy_rec['filesize']) / float(copy_rec['nlink']))

        for hostname, n_copies in hostcount.items():
            print(' ', end='')
            cprint(hostname, 'green', end=':')
            cprint(n_copies, 'cyan', end="")

        cprint(" ", end="")
        cprint(", ".join(owners), 'red')
Beispiel #22
0
def _single_sum(app, group_by=None, force=False):
    """
    Sum file sizes and record counts per value of one field.

    Only records with `orphan` set to False are considered. They are
    grouped on the field named by `group_by`; each group carries the sum
    of `filesize` as "total" and the number of records as "count". Groups
    are sorted by "total", largest first. `force` is accepted but not
    used here.

    Returns the list of group documents.
    """
    groupby_field = "${}".format(group_by)
    collection = get_mongo_transient_db(app)

    res = collection.aggregate([
        {"$match": {"orphan": False}},
        {'$group': {
            "_id": groupby_field,
            "total": {"$sum": "$filesize"},
            "count": {"$sum": 1}}},
        {"$sort": {"total": -1}},
    ])

    # Older pymongo returns a dict holding the documents under 'result';
    # newer versions return a cursor that cannot be subscripted.
    if isinstance(res, dict):
        return res['result']
    return list(res)
Beispiel #23
0
def mongo_get(app, args):
    """
    Print one record, looked up by id, as YAML.

    The first 24 characters of `args.mongo_id` are used as the `_id` to
    look up, in the core collection when `args.core` is set and in the
    transient collection otherwise. Nothing is printed when no record is
    found.
    """
    open_collection = (
        get_mongo_core_db if args.core else get_mongo_transient_db)
    record = open_collection(app).find_one({'_id': args.mongo_id[:24]})

    if record:
        print(yaml.safe_dump(record, default_flow_style=False))
Beispiel #24
0
def mongo_get(app, args):
    """
    Print one record, looked up by id, as YAML.

    The first 24 characters of `args.mongo_id` are used as the `_id` to
    look up, in the core collection when `args.core` is set and in the
    transient collection otherwise. Nothing is printed when no record is
    found.
    """
    open_collection = (
        get_mongo_core_db if args.core else get_mongo_transient_db)
    record = open_collection(app).find_one({'_id': args.mongo_id[:24]})

    if record:
        print(yaml.safe_dump(record, default_flow_style=False))
Beispiel #25
0
def mongo_show(app, args):
    """
    Show mongodb records

    For every mad file that has a 'uuid', a header line with the uuid and
    filename is printed, followed by the key/value pairs of the matching
    transient record (tab separated). The record's `_id` is shown under
    the name 'uuid'. Files without a uuid or without a stored record
    produce no key/value lines.
    """
    transient_db = get_mongo_transient_db(app)
    for madfile in get_all_mad_files(app, args):
        mongo_id = madfile['uuid']
        if not mongo_id:
            continue

        print('#', mongo_id, madfile['filename'])
        rec = transient_db.find_one({'_id': mongo_id})
        if not rec:
            continue

        for key, value in rec.items():
            if key == '_id':
                # The original template referenced '{2}' with only two
                # arguments, which raised IndexError on every record.
                print('uuid\t{0}'.format(value))
            else:
                print('{0}\t{1}'.format(key, value))
Beispiel #26
0
def mongo_show(app, args):
    """
    Show mongodb records

    For every mad file that has a 'uuid', a header line with the uuid and
    filename is printed, followed by the key/value pairs of the matching
    transient record (tab separated). The record's `_id` is shown under
    the name 'uuid'. Files without a uuid or without a stored record
    produce no key/value lines.
    """
    transient_db = get_mongo_transient_db(app)
    for madfile in get_all_mad_files(app, args):
        mongo_id = madfile['uuid']
        if not mongo_id:
            continue

        print('#', mongo_id, madfile['filename'])
        rec = transient_db.find_one({'_id': mongo_id})
        if not rec:
            continue

        for key, value in rec.items():
            if key == '_id':
                # The original template referenced '{2}' with only two
                # arguments, which raised IndexError on every record.
                print('uuid\t{0}'.format(value))
            else:
                print('{0}\t{1}'.format(key, value))
Beispiel #27
0
def _complex_sum(app, name, fields=('username', 'host'), force=False):
    """
    Sum file sizes and record counts per combination of `fields`.

    Records of the transient collection are grouped on the given field
    names; every group carries the sum of `filesize` as "total" and the
    number of records as "count". `name` and `force` are accepted but not
    used here.

    Returns the group documents as a list.
    """
    # The default is a tuple so no mutable object is shared between calls.
    collection = get_mongo_transient_db(app)
    group_id = {field: "$" + field for field in fields}
    pipeline = [{
        '$group': {
            "_id": group_id,
            "total": {"$sum": "$filesize"},
            "count": {"$sum": 1},
        }
    }]
    return list(collection.aggregate(pipeline))
Beispiel #28
0
def mongo_last(app, args):
    """
    Print the most recently saved records of the transient collection.

    Records are sorted by `save_time`, newest first, and limited to
    `args.no`. Each one is printed as a tab-separated line with the
    humanized save time, the filename and the record id.
    """
    collection = get_mongo_transient_db(app)
    res = collection.aggregate([
        {"$sort": {"save_time": -1}},
        {"$limit": args.no},
    ])

    # Older pymongo returns a dict holding the documents under 'result';
    # newer versions return a cursor that cannot be subscripted.
    records = res['result'] if isinstance(res, dict) else res

    for i, r in enumerate(records):
        # `>=` so that never more than `args.no` lines are printed, even
        # if the server side limit were not applied.
        if i >= args.no:
            break
        print("\t".join([
            arrow.get(r['save_time']).humanize(),
            r['filename'],
            # str() keeps join() working for ids that are not strings.
            str(r.get('_id', '')),
        ]))
Beispiel #29
0
def mongo_index(app, args):
    """
    Ensure indexes on the relevant fields

    The fields to index are read from the configuration keys
    'plugin.mongo.indici.core', '.transient', '.transact' and
    '.sha1sum2transact'; every configured field is printed and passed to
    `ensure_index` on the matching collection.

    Raises:
        ValueError: if a configured index value is anything other than 1.
    """
    MONGO_transient = get_mongo_transient_db(app)
    MONGO_core = get_mongo_core_db(app)
    MONGO_transact, MONGO_sha1sum2transact = get_mongo_transact_db(app)

    core_index = app.conf['plugin.mongo.indici.core']
    transient_index = app.conf['plugin.mongo.indici.transient']
    transact_index = app.conf['plugin.mongo.indici.transact']
    sha2tra_index = app.conf['plugin.mongo.indici.sha1sum2transact']

    for db, flds in [(MONGO_transient, transient_index),
                     (MONGO_core, core_index),
                     (MONGO_transact, transact_index),
                     (MONGO_sha1sum2transact, sha2tra_index)]:
        for k, v in flds.items():
            print(db, k, v)
            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would then accept any value.
            if v != 1:
                raise ValueError(
                    "unsupported index value for {!r}: {!r}".format(k, v))
            db.ensure_index(k)
Beispiel #30
0
def mongo_index(app, args):
    """
    Ensure indexes on the relevant fields

    The fields to index are read from the configuration keys
    'plugin.mongo.indici.core', '.transient', '.transact' and
    '.sha1sum2transact'; every configured field is printed and passed to
    `ensure_index` on the matching collection.

    Raises:
        ValueError: if a configured index value is anything other than 1.
    """
    MONGO_transient = get_mongo_transient_db(app)
    MONGO_core = get_mongo_core_db(app)
    MONGO_transact, MONGO_sha1sum2transact = get_mongo_transact_db(app)

    core_index = app.conf['plugin.mongo.indici.core']
    transient_index = app.conf['plugin.mongo.indici.transient']
    transact_index = app.conf['plugin.mongo.indici.transact']
    sha2tra_index = app.conf['plugin.mongo.indici.sha1sum2transact']

    for db, flds in [(MONGO_transient, transient_index),
                     (MONGO_core, core_index),
                     (MONGO_transact, transact_index),
                     (MONGO_sha1sum2transact, sha2tra_index)]:
        for k, v in flds.items():
            print(db, k, v)
            # Explicit check instead of `assert`, which is stripped when
            # Python runs with -O and would then accept any value.
            if v != 1:
                raise ValueError(
                    "unsupported index value for {!r}: {!r}".format(k, v))
            db.ensure_index(k)
Beispiel #31
0
def forget(app, args):
    """
    Remove the records of the selected mad files from the database.

    The '_id_transient' of every mad file is removed from the transient
    collection; with `args.remove_from_core` the file's '_id' is removed
    from the core collection as well. Ids are collected and removed in
    batches (a batch is sent once it holds more than 100 ids).
    """
    MONGO = get_mongo_transient_db(app)
    MONGO_CORE = get_mongo_core_db(app)
    to_remove = []
    to_remove_core = []

    def go(coll, lst):
        # Nothing to do for an empty batch - skip the round trip.
        if lst:
            coll.remove({'_id': {'$in': lst}})

    for madfile in get_all_mad_files(app, args):
        to_remove.append(madfile['_id_transient'])
        if args.remove_from_core:
            # The original appended the literal list ['_id'] here, so no
            # core record could ever match the removal query.
            to_remove_core.append(madfile['_id'])

        if len(to_remove) > 100:
            go(MONGO, to_remove)
            to_remove = []
        if len(to_remove_core) > 100:
            go(MONGO_CORE, to_remove_core)
            to_remove_core = []

    # Flush whatever is left over from the last, incomplete batches.
    go(MONGO, to_remove)
    go(MONGO_CORE, to_remove_core)
Beispiel #32
0
def forget(app, args):
    """
    Remove the records of the selected mad files from the database.

    The '_id_transient' of every mad file is removed from the transient
    collection; with `args.remove_from_core` the file's '_id' is removed
    from the core collection as well. Ids are collected and removed in
    batches (a batch is sent once it holds more than 100 ids).
    """
    MONGO = get_mongo_transient_db(app)
    MONGO_CORE = get_mongo_core_db(app)
    to_remove = []
    to_remove_core = []

    def go(coll, lst):
        # Nothing to do for an empty batch - skip the round trip.
        if lst:
            coll.remove({'_id': {'$in': lst}})

    for madfile in get_all_mad_files(app, args):
        to_remove.append(madfile['_id_transient'])
        if args.remove_from_core:
            # The original appended the literal list ['_id'] here, so no
            # core record could ever match the removal query.
            to_remove_core.append(madfile['_id'])

        if len(to_remove) > 100:
            go(MONGO, to_remove)
            to_remove = []
        if len(to_remove_core) > 100:
            go(MONGO_CORE, to_remove_core)
            to_remove_core = []

    # Flush whatever is left over from the last, incomplete batches.
    go(MONGO, to_remove)
    go(MONGO_CORE, to_remove_core)
Beispiel #33
0
def _get_mongo_keys(app, collection, force=False):
    """
    Count how often each top-level key occurs in a collection.

    A map/reduce job emits 1 for every key of every record and sums the
    emitted values per key. The job runs on the transient collection when
    `collection` is 'transient' and on the core collection otherwise; its
    output is written to the "my_collection_keys" collection. `force` is
    accepted but not used here.

    Returns a dict mapping key name to its integer count.
    """
    from bson.code import Code

    emit_keys = Code("""
        function() {
            for (var key in this) { emit(key, 1); }
        } """)
    sum_values = Code("function(key, vals) { return Array.sum(vals); }")

    if collection == 'transient':
        message("Get keys from the transient db")
        source = get_mongo_transient_db(app)
    else:
        message("Get keys from the core db")
        source = get_mongo_core_db(app)

    outcome = source.map_reduce(
        emit_keys, sum_values, "my_collection" + "_keys")

    return {row['_id']: int(row['value']) for row in outcome.find()}
Beispiel #34
0
def _get_mongo_keys(app, collection, force=False):
    """
    Count how often each top-level key occurs in a collection.

    A map/reduce job emits 1 for every key of every record and sums the
    emitted values per key. The job runs on the transient collection when
    `collection` is 'transient' and on the core collection otherwise; its
    output is written to the "my_collection_keys" collection. `force` is
    accepted but not used here.

    Returns a dict mapping key name to its integer count.
    """
    from bson.code import Code

    emit_keys = Code("""
        function() {
            for (var key in this) { emit(key, 1); }
        } """)
    sum_values = Code("function(key, vals) { return Array.sum(vals); }")

    if collection == 'transient':
        message("Get keys from the transient db")
        source = get_mongo_transient_db(app)
    else:
        message("Get keys from the core db")
        source = get_mongo_core_db(app)

    outcome = source.map_reduce(
        emit_keys, sum_values, "my_collection" + "_keys")

    return {row['_id']: int(row['value']) for row in outcome.find()}
Beispiel #35
0
def waste(app, args):
    """
    Report the files that waste the most space.

    Runs the waste pipeline via `_run_waste_command` and walks at most
    `args.no_records` of its entries. For each entry with a non-blank
    sha1sum one line is printed showing the sha1sum, the wasted size, the
    file size, the number of copies per host and the owners.
    """
    db = get_mongo_transient_db(app)

    res = _run_waste_command(app, 'waste_pipeline',
                             force=args.force)
    # Only the old dict-style aggregate result carries the documents
    # under 'result'; a list or cursor is iterated as it is.
    if isinstance(res, dict):
        res = res['result']

    for i, r in enumerate(res):
        if i >= args.no_records:
            break

        sha1sum = r['_id']
        if not sha1sum.strip():
            continue

        cprint(sha1sum, 'grey', end='')
        cprint(" sz ", "grey", end="")
        cprint("{:>9}".format(humansize(r['waste'])), end='')
        cprint(" w ", "grey", end="")
        cprint("{:>9}".format(humansize(r['filesize'])), end='')

        # Tally copies (and size, corrected for hard links) per host.
        hostcount = collections.defaultdict(int)
        hostsize = collections.defaultdict(int)
        owners = set()
        for f in db.find({'sha1sum': sha1sum}):
            owners.add(f['username'])
            host = f['host']
            hostcount[host] += 1
            hostsize[host] += float(f['filesize']) / float(f['nlink'])

        for h, count in hostcount.items():
            print(' ', end='')
            cprint(h, 'green', end=':')
            cprint(count, 'cyan', end="")

        cprint(" ", end="")
        cprint(", ".join(owners), 'red')
Beispiel #36
0
def search(app, args):
    """
    Find files

    Builds an equality query from those of the supported filter
    attributes that are present on `args` and not None. With
    `args.delete` the matching records are removed from the transient
    collection; otherwise they are printed using `args.format`, sorted by
    `args.sort` (ascending) or `args.reverse_sort` (descending) and
    limited to `args.limit` when that is positive.
    """

    MONGO_mad = get_mongo_transient_db(app)

    # 'project' was listed twice in the original; once is enough.
    filter_fields = ('username', 'backup', 'volume', 'host',
                     'sha1sum', 'project', 'pi')

    query = {}
    for f in filter_fields:
        if f not in args:
            continue
        v = getattr(args, f)
        if v is not None:
            query[f] = v

    if args.delete:
        # NOTE(review): when no filter was given the query is empty, which
        # presumably matches (and removes) every record - confirm that is
        # intended.
        MONGO_mad.remove(query)
        return

    res = MONGO_mad.find(query)

    if args.sort:
        res = res.sort(args.sort, pymongo.ASCENDING)
    elif args.reverse_sort:
        res = res.sort(args.reverse_sort, pymongo.DESCENDING)

    if args.limit > 0:
        res = res.limit(args.limit)

    for r in res:
        print(args.format.format(**r))
Beispiel #37
0
def transient_delete(app, madfile):
    """
    Remove the transient record that belongs to `madfile`.

    The record is looked up by the file's '_id_transient' value; the
    removal is logged at debug level together with the input file name.
    """
    record_id = madfile.get('_id_transient')
    collection = get_mongo_transient_db(app)
    lg.debug("Deleting %s (%s)", madfile['inputfile'], record_id)
    selector = {'_id': record_id}
    collection.remove(selector)
Beispiel #38
0
def ta_tree(app, args):
    """
    Build the transaction graph around one file and write it as GraphML.

    `args.object` is taken as a sha1sum when it is 40 characters long and
    no such path exists; otherwise it is treated as a file whose sha1sum
    is looked up. Starting from that sha1sum, all transactions it takes
    part in are followed recursively. Every input/output pair of a
    transaction becomes a directed edge (carrying a 'count' and a 'type')
    and every sha1sum becomes a node annotated with data from the
    transient collection. The graph is written to 'test.graphml'.
    """

    import networkx as nx

    G = nx.DiGraph()

    db_t, db_s2t = get_mongo_transact_db(app)
    trans_db = get_mongo_transient_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        madfile = get_mad_file(app, args.object)
        sha1sum = madfile['sha1sum']

    def _get_trarec(sha1sum):
        # Collect the distinct values of a few descriptive fields over
        # all transient records with this sha1sum, joined with ';'.
        rv = defaultdict(set)
        for rec in trans_db.find(dict(sha1sum=sha1sum)):
            for field in [
                    'project', 'filename', 'filesize', 'analyst', 'pi',
                    'username', 'fullpath'
            ]:
                if field in rec:
                    rv[field].add(rec[field])
        return {k: ';'.join(map(str, v)) for (k, v) in rv.items()}

    def _add_node(G, sha1sum):
        # Add the node once, annotated with its transient record data.
        if sha1sum in G:
            return

        # Attributes are passed to add_node directly: the `G.node`
        # attribute used before no longer exists in current networkx.
        G.add_node(sha1sum, **_get_trarec(sha1sum))

    _add_node(G, sha1sum)

    sha1sum_processed = set()

    def _find_relations_shasum(G, sha1sum):
        # Guard against revisiting a sha1sum (and endless recursion).
        if sha1sum in sha1sum_processed:
            return

        sha1sum_processed.add(sha1sum)

        for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
            tra = db_t.find_one(dict(_id=s2t['transaction_id']))
            io = tra['io']
            ioo = [x for x in io if x['category'] == 'output']
            if len(ioo) == 0:
                continue

            # Link every file of the transaction to every output file.
            for fa, fb in itertools.product(io, ioo):
                if fa == fb:
                    continue
                fas, fbs = fa['sha1sum'], fb['sha1sum']
                _add_node(G, fas)
                _add_node(G, fbs)
                ltype = fa['category']
                if ltype == 'output':
                    # output -> output: both come from the same run.
                    ltype = 'sibling'
                G.add_edge(fas, fbs)
                G[fas][fbs]['count'] = G[fas][fbs].get('count', 0) + 1
                G[fas][fbs]['type'] = ltype

                _find_relations_shasum(G, fas)
                _find_relations_shasum(G, fbs)

    _find_relations_shasum(G, sha1sum)
    nx.write_graphml(G, 'test.graphml')
Beispiel #39
0
def mongo_count(app, args):
    """
    Print the result of `count()` on the transient collection.
    """
    n_records = get_mongo_transient_db(app).count()
    print(n_records)
Beispiel #40
0
def update(app, args):
    """
    update the transient db in this directory and below

    Walks the current working directory with os.walk and, per directory,
    compares the transient records stored for this host with the files
    found on disk:

    * records whose file is no longer listed, or can no longer be
      lstat'ed, are queued in MONGO_REMOVE_CACHE;
    * files whose mtime is more than half a second later than the stored
      'sha1sum_time' (or that have no such time) are re-read and passed
      to save_to_mongo;
    * files not yet in the database and at least `args.min_file_size`
      bytes large are passed to save_to_mongo.

    mongo_flush is called after every directory. At the end, the records
    of all directories that were known in the database but in which
    nothing was found during the walk are removed, and the COUNTER
    totals plus the last five modified and new files are logged.
    """

    global MONGO_REMOVE_CACHE
    global MONGO_SAVE_CACHE
    global COUNTER

    # bounded queues: only the last five modified / new paths are kept
    # for the summary at the end
    modfiles = collections.deque([], 5)
    newfiles = collections.deque([], 5)

    transient_db = get_mongo_transient_db(app)
    # fnmatch patterns of directory / file names that are skipped
    ignore_dirs = ['.*', '.git', 'tmp']
    ignore_files = ['.*', '*.log', '*~', '*#', 'SHA1SUMS*']
    basedir = os.getcwd()

    # NOTE(review): the compiled pattern is immediately replaced by the
    # plain string on the next line, and basedir is not regex-escaped.
    find_dir_regex = re.compile('^{}'.format(basedir))
    find_dir_regex = '^{}'.format(basedir)  #.replace('/', '\/')
    lg.debug("searching for dirs with regex: %s", find_dir_regex)
    # NOTE(review): tradirs is not used anywhere else in this function.
    tradirs = []

    # all records on this host whose dirname starts with basedir
    query = {
        'host': socket.gethostname(),
        'dirname': {
            "$regex": find_dir_regex
        }
    }

    trans_dirs = list(transient_db.find(query).distinct('dirname'))
    if args.quiet:
        lg.debug("found %d directories below this directory in transient db",
                 len(trans_dirs))
    else:
        lg.warning("found %d directories below this directory in transient db",
                   len(trans_dirs))

    # to be safe - strip trailing slashes
    trans_dirs = [x.rstrip('/') for x in trans_dirs]
    # every known directory is a removal candidate until the walk below
    # finds something in it
    dirs_to_delete = copy.copy(trans_dirs)
    lg.info("%d dirs with data in the transient db", len(trans_dirs))

    def screen_update(cnt, lud=0, msg=""):
        # Print a one-line progress report (the counters in `cnt` plus as
        # much of `msg` as fits the terminal width) and return the time
        # of the update. Returns None when args.quiet is set.
        if args.quiet: return
        ts = shutil.get_terminal_size().columns - 1

        # NOTE(review): this compares a boolean with 1, so it only
        # returns early when time.time() is not greater than `lud`;
        # presumably `time.time() - lud < 1` (throttle to one update per
        # second) was intended - confirm.
        if (time.time() > lud) < 1:
            return lud

        def _add_sep(b):
            # insert a comma after every three digits, counted from the
            # right-hand side
            return re.sub(r'([0-9][0-9][0-9])', r'\1,',
                          str(b)[::-1])[::-1].strip(',')

        if len(cnt) == 0:
            return time.time()

        out = " ".join(
            ['{}:{}'.format(a, _add_sep(b)) for a, b in cnt.items()])

        rest = ts - (len(out) + 1)

        if rest > 5 and isinstance(msg, str):
            out += ':' + msg[:int(rest) - 1]

        # pad with spaces so leftovers of a longer previous line are
        # overwritten
        rest = ts - (len(out) + 1)
        out += ' ' * rest

        print(out, end='\r')

        return time.time()

    # NOTE(review): `start` is not used anywhere else in this function.
    start = time.time()

    last_screen_update = screen_update(COUNTER, msg='init')

    def _name_match(fn, ignore_list):
        # True when `fn` matches any of the fnmatch patterns
        for i in ignore_list:
            if fnmatch(fn, i):
                return True
        return False

    for root, dirs, files in os.walk(basedir):
        last_screen_update = screen_update(COUNTER,
                                           lud=last_screen_update,
                                           msg=root)

        root = root.rstrip('/')
        COUNTER['dir'] += 1

        # NOTE(review): this tests for mad.ignore in basedir, not in the
        # directory currently being visited (`root`) - confirm intent.
        if os.path.exists(os.path.join(basedir, 'mad.ignore')):
            # assigning to dirs[:] prunes the walk in place
            dirs[:] = []
            must_save_files = []
        else:
            dirs[:] = [x for x in dirs if not _name_match(x, ignore_dirs)]
            # NOTE(review): `x` is a bare directory name, so this path is
            # resolved against the process working directory rather than
            # against `root` - confirm intent.
            dirs[:] = [
                x for x in dirs
                if not os.path.exists(os.path.join(x, 'mad.ignore'))
            ]

            must_save_files = [
                x for x in files if not _name_match(x, ignore_files)
            ]

        def check_access(_root, _fn):
            # True when the file is readable; unreadable files are
            # counted and the first few are logged
            _path = os.path.join(_root, _fn)
            acc = os.access(_path, os.R_OK)
            if not acc:
                COUNTER['no_access'] += 1
                if COUNTER['no_access'] < 10:
                    lg.info("no access to: %s", _path)
            return acc

        must_save_files = [x for x in must_save_files if check_access(root, x)]

        # stays True only if nothing relevant is found in this directory
        remove_dir = True

        if len(must_save_files) > 0:
            lg.debug('files to be saved in %s', root)
            remove_dir = False

        if args.watch:
            # ANSI escape codes: clear the screen, move cursor to top left
            sys.stdout.write(chr(27) + "[2J" + chr(27) + "[1;1f")
            print(datetime.datetime.now())
            print()
            print("dir: {}".format(root))
            for k in sorted(COUNTER.keys()):
                print("  {:<20}:{:<10d}".format(k, COUNTER[k]))

        else:
            lg.info('%s: %s', root[-40:], str(dict(COUNTER)))

        # records stored for this directory on this host, restricted to
        # the fields needed below
        trans_records = transient_db.find(
            {
                "dirname": root,
                "host": socket.gethostname(),
            }, {
                "_id_transient": 1,
                "filename": 1,
                "sha1sum": 1,
                "sha1sum_time": 1,
            })

        # NOTE(review): trec_files is not used anywhere else in this
        # function.
        trec_files = []

        for trec in trans_records:

            last_screen_update = screen_update(COUNTER,
                                               lud=last_screen_update,
                                               msg=root)

            # in the db but no longer on disk: queue the record for
            # removal
            if not trec['filename'] in files:
                COUNTER['rm'] += 1
                lg.debug('deleted: %s', trec['filename'])
                MONGO_REMOVE_CACHE.append(trec['_id_transient'])
                continue

            # this file is in both the db & on disk - check mtime
            remove_dir = False  # stuff in this folder - do not delete!
            fullpath = os.path.abspath(
                os.path.realpath(os.path.join(root, trec['filename'])))
            try:
                fstat = os.lstat(fullpath)
            except FileNotFoundError:
                # this happens for broken symlinks - in which case we should remove the
                # trans_record
                COUNTER['broken'] += 1
                MONGO_REMOVE_CACHE.append(trec['_id_transient'])
                continue

            mtime = datetime.datetime.utcfromtimestamp(fstat.st_mtime)

            if 'sha1sum_time' in trec:
                timediff = (mtime - trec['sha1sum_time']).total_seconds()
            else:
                timediff = 1e12  # force recalculation

            # allow for at least half a second of leeway - at times
            # the difference between modification time and when the
            # system has taken the sha1sum does not have enough
            # resolution
            if timediff > 0.5:
                # might be modified - create a madfile object which will check
                # more thoroughly
                COUNTER['mod?'] += 1
                modfiles.append(fullpath)
                madfile = mad2.util.get_mad_file(app, fullpath)
                save_to_mongo(app, madfile)
            else:
                COUNTER['ok'] += 1

            # remove this file from the "must save" list - it's already
            # present
            if trec['filename'] in must_save_files:
                must_save_files.remove(trec['filename'])

        # save new files
        for filename in must_save_files:
            last_screen_update = screen_update(COUNTER, last_screen_update,
                                               'new - ' + root)

            filename = os.path.join(root, filename)
            remove_dir = False  # again - stuff here - do not remove
            filestat = os.lstat(filename)
            # files smaller than the configured minimum are not stored
            if filestat.st_size < args.min_file_size:
                continue
            COUNTER['new'] += 1
            newfiles.append(filename)
            madfile = mad2.util.get_mad_file(app, filename)
            save_to_mongo(app, madfile)

        # something was found here: the directory's records are kept
        if not remove_dir:
            if root in dirs_to_delete:
                dirs_to_delete.remove(root)

        mongo_flush(app)

    if len(dirs_to_delete) > 0:
        lg.info("lastly: removing records from %d dirs", len(dirs_to_delete))

    for dirname in dirs_to_delete:
        # hmm - skipping the flush step - directly removing here...
        COUNTER['dir_rm'] += 1
        transient_db.remove({
            "dirname": dirname,
            "host": socket.gethostname(),
        })

    mongo_flush(app)

    # final summary: counters and the last few modified / new files
    for k, v in COUNTER.items():
        lg.warning("%10s: %d", k, v)
    if len(modfiles) > 0:
        lg.warning("Modified files: (last 5)")
        for mf in modfiles:
            lg.warning(" - %s", mf)
    if len(newfiles) > 0:
        lg.warning("New files: (last 5)")
        for mf in newfiles:
            lg.warning(" - %s", mf)
Beispiel #41
0
def waste_text_report(app, args):
    """
    Print a plain text report on duplicated ("wasted") storage.

    The report has three parts: a summary of the first entry returned by
    the waste pipeline, an overview table of the first ``no_to_print``
    entries and, for each of those, a per host listing of the individual
    copies found in the transient database.

    :param app: application object handed to the database helpers
    :param args: parsed command line; ``force`` and ``subject`` are read
    """

    db = get_mongo_transient_db(app)

    res = _run_waste_command(app, 'waste_pipeline', force=args.force)['result']

    if args.subject:
        print("Subject: {}".format(args.subject))

    # this week's winner
    top = res[0]
    sha1sum = top['_id']
    owners = set()
    hosts = set()

    total = 0
    for rec in db.find({'sha1sum': sha1sum}):
        total += 1
        hosts.add(rec['host'])
        owners.add(rec['username'])

    print("This week's winner: {}".format(", ".join(owners)))
    # the plural "s" has to go in front of the comma ("2 servers,")
    print("One file, {} location{}, {} server{}, wasting {}.".format(
        total, "s" if total > 1 else "",
        len(hosts), "s" if len(hosts) > 1 else "",
        humansize(top['waste'])))
    print("try:\n   mad repl {}\n\n".format(sha1sum))

    no_to_print = 20
    print("Waste overview: (no / sha1sum / waste / filesize)")
    print("=================================================\n")
    for i, r in enumerate(res):
        if i >= no_to_print:
            break

        sha1sum = r['_id']
        if not sha1sum.strip():
            # skip records without a usable sha1sum
            continue
        print("{:2d} {} {:>10} {:>10}".format(i, sha1sum,
                                              humansize(r['waste']),
                                              humansize(r['filesize'])))

    print("\n\nDetails: (nlink/symlink/size/owner)")
    print("===================================")
    for i, r in enumerate(res):
        if i >= no_to_print:
            break

        sha1sum = r['_id']
        if not sha1sum.strip():
            continue

        print("# {:2d} {} {:>10} {:>10}".format(i, sha1sum,
                                                humansize(r['waste']),
                                                humansize(r['filesize'])))

        # group the copies of this file per host
        records = collections.defaultdict(list)
        hostcount = collections.defaultdict(lambda: 0)
        hostsize = collections.defaultdict(lambda: 0)

        for rec in db.find({'sha1sum': sha1sum}):
            host = rec['host']
            records[host].append(rec)
            hostcount[host] += 1
            # a hard linked file is counted for 1/nlink of its size per link
            hostsize[host] += float(rec['filesize']) / float(rec['nlink'])

        for h in hostcount:
            print("# Host: {}, copies: {}, total use: {}".format(
                h, hostcount[h], humansize(hostsize[h])))
            # list the records of *this* host (not of the last host seen
            # in the loop above)
            for rec in records[h]:
                smarker = 'S' if rec.get('is_symlink') else '.'
                print("  {} {}".format(rec.get('nlink', '?'), smarker),
                      end=' ')
                print(humansize(rec['filesize']), end=' ')
                print(rec['username'])
                print("   " + rec['fullpath'])

        print("")
Beispiel #42
0
def repl(app, args):
    """
    Show copies of a file known to the transient database.

    The entries of ``args.file`` are treated as sha1sums when there is at
    least one entry and every entry is a 40 character string that does
    not exist as a path; otherwise the input is resolved to madfiles via
    ``get_all_mad_files``. For every sha1sum all matching transient
    records are printed, optionally filtered on ``args.volume`` and
    ``args.path_fragment``.

    With ``args.echo`` only the input itself is printed, once, as soon as
    one record passes the filters.

    :param app: application object; ``app.conf['host']`` is read to find
        out which hosts are backup hosts
    :param args: parsed command line
    """

    MONGO_mad = get_mongo_transient_db(app)

    backup_hosts = set()
    for host in app.conf['host']:
        if app.conf['host'][host]['backup']:
            backup_hosts.add(host)

    # sha1sum mode: only if *all* arguments look like a sha1sum and none
    # of them is an existing path
    check_shasums = len(args.file) > 0 and all(
        not os.path.exists(f) and len(f) == 40 for f in args.file)

    def _process_query(query, madfile_in):
        # madfile_in is None when the input was a bare sha1sum
        res = MONGO_mad.find(query)
        for r in res:
            if args.volume and \
               r['volume'] != args.volume:
                continue

            if args.path_fragment and \
               args.path_fragment not in r['fullpath']:
                continue

            if args.echo:
                if madfile_in is None:
                    # no madfile to name - echo the sha1sum that was
                    # given instead of failing on None['inputfile']
                    print(query['sha1sum'])
                else:
                    print(madfile_in['inputfile'])
                break

            days = (arrow.now() - arrow.get(r['save_time'])).days
            symlink = r.get('is_symlink', False)
            if symlink:
                stag = 'S'
            else:
                stag = '.'
            if args.raw_output:
                print("\t".join(map(str, [
                    r['nlink'], stag, (arrow.now() -
                                       arrow.get(r['save_time'])),
                    r['filesize'], r['host'], r['fullpath']
                ])))
            else:
                cprint('%1d%s' % (r['nlink'], stag), 'yellow', end=" ")
                cprint('%3d' % days, 'green', end="d ")
                cprint('%6s' % humansize(r['filesize']), 'white', end=" ")
                if r['host'] in backup_hosts:
                    # backup hosts stand out in bold green
                    cprint(r['host'], 'green', attrs=['bold'], end=':')
                else:
                    cprint(r['host'], 'cyan', end=':')
                cprint(r['fullpath'])

    if check_shasums:
        for sha1sum in args.file:
            query = {'sha1sum': sha1sum}
            _process_query(query, None)
    else:
        for madfile in get_all_mad_files(app, args):
            query = {'sha1sum': madfile['sha1sum']}
            _process_query(query, madfile)
Beispiel #43
0
def transient_delete(app, madfile):
    """
    Remove the transient database record that belongs to a madfile.

    The record is selected on the madfile's ``_id_transient`` value.
    """
    record_id = madfile.get('_id_transient')
    collection = get_mongo_transient_db(app)
    lg.debug("Deleting %s (%s)", madfile['inputfile'], record_id)
    selector = {'_id': record_id}
    collection.remove(selector)
Beispiel #44
0
def update(app, args):
    """
    Update the transient db in this directory and below.

    Walks the current working directory and reconciles what is on disk
    with the transient database records of this host:

    * records of files that are no longer on disk (or that are broken
      symlinks) are queued for removal,
    * files whose mtime is more than half a second later than the
      recorded sha1sum time are saved again,
    * files without a record are saved, unless they are smaller than
      ``args.min_file_size``,
    * directories known to the database in which nothing was found (or
      which were not visited at all) have their records removed at the
      end.

    :param app: application object handed to the database helpers
    :param args: parsed command line; ``quiet``, ``watch`` and
        ``min_file_size`` are read
    """

    global MONGO_REMOVE_CACHE
    global MONGO_SAVE_CACHE
    global COUNTER

    # only the last five of each are kept for the final summary
    modfiles = collections.deque([], 5)
    newfiles = collections.deque([], 5)

    transient_db = get_mongo_transient_db(app)
    ignore_dirs = ['.*', '.git', 'tmp']
    ignore_files = ['.*', '*.log', '*~', '*#', 'SHA1SUMS*']
    basedir = os.getcwd()
    hostname = socket.gethostname()

    # Match basedir itself and everything below it. The path is escaped
    # (it may contain regex metacharacters) and must be followed by a '/'
    # or the end of the string - otherwise a sibling such as '/a/bc'
    # matches '/a/b', is never walked and would get its records deleted.
    find_dir_regex = '^{}(/|$)'.format(re.escape(basedir.rstrip('/')))
    lg.debug("searching for dirs with regex: %s", find_dir_regex)

    query = {'host': hostname,
             'dirname': {"$regex": find_dir_regex}}

    trans_dirs = list(transient_db.find(query).distinct('dirname'))
    if args.quiet:
        lg.debug("found %d directories below this directory in transient db", len(trans_dirs))
    else:
        lg.warning("found %d directories below this directory in transient db", len(trans_dirs))

    # to be safe - strip trailing slashes
    trans_dirs = [x.rstrip('/') for x in trans_dirs]
    # every directory starts out as a removal candidate; directories in
    # which something is found are taken off this list during the walk
    dirs_to_delete = copy.copy(trans_dirs)
    lg.info("%d dirs with data in the transient db", len(trans_dirs))

    def screen_update(cnt, lud=0, msg=""):
        """Print a one line status; returns the time of the last update."""
        if args.quiet:
            return lud

        # redraw at most once per second
        if (time.time() - lud) < 1:
            return lud

        ts = shutil.get_terminal_size().columns - 1

        def _add_sep(b):
            # insert thousands separators
            return re.sub(r'([0-9][0-9][0-9])', r'\1,', str(b)[::-1])[::-1].strip(',')

        if len(cnt) == 0:
            return time.time()

        out = " ".join(['{}:{}'.format(a, _add_sep(b))
                        for a, b in cnt.items()])

        rest = ts - (len(out) + 1)

        if rest > 5 and isinstance(msg, str):
            out += ':' + msg[:int(rest)-1]

        # pad to the terminal width so leftovers of a previous, longer
        # line are overwritten
        rest = ts - (len(out) + 1)
        out += ' ' * rest

        print(out, end='\r')

        return time.time()

    last_screen_update = screen_update(COUNTER, msg='init')

    def _name_match(fn, ignore_list):
        return any(fnmatch(fn, pattern) for pattern in ignore_list)

    def check_access(_root, _fn):
        # unreadable files are counted; only the first few are logged
        _path = os.path.join(_root, _fn)
        acc = os.access(_path, os.R_OK)
        if not acc:
            COUNTER['no_access'] += 1
            if COUNTER['no_access'] < 10:
                lg.info("no access to: %s", _path)
        return acc

    for root, dirs, files in os.walk(basedir):
        last_screen_update = screen_update(COUNTER, lud=last_screen_update, msg=root)

        root = root.rstrip('/')
        COUNTER['dir'] += 1

        if os.path.exists(os.path.join(root, 'mad.ignore')):
            # do not descend, do not save anything from here
            dirs[:] = []
            must_save_files = []
        else:
            # prune in place so os.walk does not descend into ignored
            # directories; the mad.ignore check must be relative to root,
            # not to the current working directory
            dirs[:] = [x for x in dirs
                       if not _name_match(x, ignore_dirs)
                       and not os.path.exists(
                           os.path.join(root, x, 'mad.ignore'))]

            must_save_files = [x for x in files if not _name_match(x, ignore_files)]

        must_save_files = [x for x in must_save_files if check_access(root, x)]

        remove_dir = True

        if len(must_save_files) > 0:
            lg.debug('files to be saved in %s', root)
            remove_dir = False

        if args.watch:
            # clear the screen and move the cursor to the top left
            sys.stdout.write(chr(27) + "[2J" + chr(27) + "[1;1f")
            print(datetime.datetime.now())
            print()
            print("dir: {}".format(root))
            for k in sorted(COUNTER.keys()):
                print("  {:<20}:{:<10d}".format(k, COUNTER[k]))

        else:
            lg.info('%s: %s', root[-40:], str(dict(COUNTER)))

        trans_records = transient_db.find(
            {"dirname": root,
             "host": hostname, },
            {"_id_transient": 1,
             "filename": 1,
             "sha1sum": 1,
             "sha1sum_time": 1, })

        files_on_disk = set(files)

        for trec in trans_records:

            last_screen_update = screen_update(COUNTER, lud=last_screen_update, msg=root)

            if trec['filename'] not in files_on_disk:
                COUNTER['rm'] += 1
                lg.debug('deleted: %s', trec['filename'])
                MONGO_REMOVE_CACHE.append(trec['_id_transient'])
                continue

            # this file is in both the db & on disk - check mtime
            remove_dir = False  # stuff in this folder - do not delete!
            fullpath = os.path.abspath(os.path.realpath(os.path.join(root, trec['filename'])))
            try:
                fstat = os.lstat(fullpath)
            except FileNotFoundError:
                # this happens for broken symlinks - in which case we
                # should remove the trans_record
                COUNTER['broken'] += 1
                MONGO_REMOVE_CACHE.append(trec['_id_transient'])
                continue

            mtime = datetime.datetime.utcfromtimestamp(fstat.st_mtime)

            if 'sha1sum_time' in trec:
                timediff = (mtime - trec['sha1sum_time']).total_seconds()
            else:
                timediff = 1e12  # force recalculation

            # allow for at least half a second of leeway - at times
            # the difference between modification time and when the
            # system has taken the sha1sum does not have enough
            # resolution
            if timediff > 0.5:
                # might be modified - create a madfile object which will
                # check more thoroughly
                COUNTER['mod?'] += 1
                modfiles.append(fullpath)
                madfile = mad2.util.get_mad_file(app, fullpath)
                save_to_mongo(app, madfile)
            else:
                COUNTER['ok'] += 1

            # remove this file from the "must save" list - it's already
            # present
            if trec['filename'] in must_save_files:
                must_save_files.remove(trec['filename'])

        # save new files
        for filename in must_save_files:
            last_screen_update = screen_update(COUNTER, last_screen_update, 'new - ' + root)

            filename = os.path.join(root, filename)
            remove_dir = False  # again - stuff here - do not remove
            filestat = os.lstat(filename)
            if filestat.st_size < args.min_file_size:
                continue
            COUNTER['new'] += 1
            newfiles.append(filename)
            madfile = mad2.util.get_mad_file(app, filename)
            save_to_mongo(app, madfile)

        if not remove_dir:
            if root in dirs_to_delete:
                dirs_to_delete.remove(root)

        mongo_flush(app)

    if len(dirs_to_delete) > 0:
        lg.info("lastly: removing records from %d dirs", len(dirs_to_delete))

    for dirname in dirs_to_delete:
        # hmm - skipping the flush step - directly removing here...
        COUNTER['dir_rm'] += 1
        transient_db.remove(
            {"dirname": dirname,
             "host": hostname, })

    mongo_flush(app)

    for k, v in COUNTER.items():
        lg.warning("%10s: %d", k, v)
    if len(modfiles) > 0:
        lg.warning("Modified files: (last 5)")
        for mf in modfiles:
            lg.warning(" - %s", mf)
    if len(newfiles) > 0:
        lg.warning("New files: (last 5)")
        for mf in newfiles:
            lg.warning(" - %s", mf)
Beispiel #45
0
def search(app, args):
    """
    Query the transient database and print - or delete - the matches.

    A filter is built from the per-field arguments: ``(none)`` selects
    records lacking the field, a value wrapped in slashes is used as a
    regular expression and anything else must match exactly. File size
    and access time bounds are added when given. With ``args.delete`` the
    matching records are removed and nothing is printed; otherwise they
    are printed as tab separated values, as YAML or through
    ``args.format``.
    """

    collection = get_mongo_transient_db(app)

    # ('project' is listed twice; the second pass just repeats the first)
    field_names = [
        'username', 'backup', 'volume', 'host', 'dirname', 'sha1sum',
        'project', 'project', 'pi', 'category', 'filename', 'userid'
    ]

    query = {}
    for name in field_names:
        value = getattr(args, name)
        if value is None:
            continue
        if value == '(none)':
            query[name] = {"$exists": False}
        elif value.startswith('/') and value.endswith('/'):
            query[name] = re.compile(value[1:-1])
        else:
            query[name] = value

    size_bounds = {}
    if args.min_filesize:
        size_bounds["$gt"] = mad2.util.interpret_humansize(args.min_filesize)
    if args.max_filesize:
        size_bounds["$lt"] = mad2.util.interpret_humansize(args.max_filesize)
    if size_bounds:
        query['filesize'] = size_bounds

    if args.atime_older_than:
        age_seconds = pytimeparse.parse(args.atime_older_than)
        cutoffdate = (datetime.datetime.utcnow()
                      - datetime.timedelta(seconds=age_seconds))
        query['atime'] = {"$lte": cutoffdate}

    if args.delete:
        collection.remove(query)
        return

    cursor = collection.find(query)

    if args.sort:
        cursor = cursor.sort(args.sort, pymongo.ASCENDING)
    elif args.reverse_sort:
        cursor = cursor.sort(args.reverse_sort, pymongo.DESCENDING)

    if args.limit > 0:
        cursor = cursor.limit(args.limit)

    if args.tsv:
        # the default format makes no sense as a column list
        if args.format == '{fullpath}':
            columns = 'host fullpath filesize category'.split()
        else:
            columns = args.format.split(',')
        for record in cursor:
            cells = [str(record.get(col, 'n.a.')) for col in columns]
            print("\t".join(cells))
        return

    if args.raw:
        print(yaml.safe_dump(list(cursor), default_flow_style=False))
        return

    # ensure tab characters
    template = args.format.replace(r'\t', '\t')
    for record in cursor:
        # fill in a placeholder for every field the record lacks and
        # retry until the template can be rendered
        while True:
            try:
                line = template.format(**record)
            except KeyError as err:
                record[err.args[0]] = '(no value)'
            else:
                print(line)
                break
Beispiel #46
0
def madfile_init(app, madfile):
    """
    Initialize this madfile - mainly - check if the mongo transient database
    knows about this file, and has the SHA1SUM. The SHA1SUM is then used to get
    the data from the core database

    Three cases are distinguished:

    * no usable sha1sum / sha1sum time on record: a new sha1sum is
      obtained,
    * the file's mtime is unknown or later than the sha1sum time: a new
      sha1sum is obtained and, if it differs from the old one, the core
      record of the old sha1sum is copied to the new core id,
    * otherwise the recorded sha1sum is put on the madfile as is.

    If no sha1sum can be obtained the madfile is left untouched.

    :param app: application object handed to the database helpers
    :param madfile: the madfile to initialize; modified in place
    """
    global COUNTER
    COUNTER['init'] += 1

    trans_db = get_mongo_transient_db(app)
    core_db = get_mongo_core_db(app)

    trans_id = get_mongo_transient_id(madfile)
    rec = trans_db.find_one({'_id': trans_id})
    nowtime = datetime.utcnow()
    mtime = madfile.get('mtime')
    sha1sum = None
    sha1sum_time = None

    #lg.setLevel(logging.DEBUG)

    if isinstance(rec, dict):
        sha1sum = rec.get('sha1sum')
        sha1sum_time = rec.get('sha1sum_time')

    def _prep_madfile(_madfile, sha1, sha1_time):
        # the core id is the first 24 characters of the sha1sum
        _madfile.all['_id_core'] = sha1[:24]
        _madfile.all['sha1sum'] = sha1
        _madfile.mad['sha1sum'] = sha1
        _madfile.all['sha1sum_time'] = sha1_time

    def _create_new_sha1(_madfile):
        # returns the new sha1sum, or False if none could be obtained

        # TODO: temporary hack - see if we can get the data from the
        # SHA1SUM files.

        sha1, sha1_time = mad2.hash.check_sha1sum_file(_madfile['fullpath'])

        if sha1 is not None and arrow.get(mtime).to('local') <= sha1_time:
            COUNTER['shafile'] += 1
            lg.info("recoved sha1 from the SHA1SUM file")
        else:
            #also not in the sha1sum file - recalculate
            lg.debug("recreate shasum for %s", _madfile['inputfile'])
            COUNTER['calc'] += 1
            sha1 = mad2.hash.get_sha1(_madfile['fullpath'])
            sha1_time = datetime.utcnow()

        if sha1 is None:
            #still not?? maybe the file does not exist? Link is broken?? Will not save this
            return False

        lg.info("shasum for %s (%s) is %s", _madfile['inputfile'], trans_id, sha1)

        trans_db.update({'_id': trans_id},
                        {"$set": {'sha1sum': sha1,
                                  'sha1sum_time': nowtime}},
                        upsert=True)
        _prep_madfile(_madfile, sha1, sha1_time)
        return sha1

    if sha1sum is None or not isinstance(sha1sum_time, datetime):
        # no shasum - recreate
        _create_new_sha1(madfile)
    elif mtime is None or mtime > sha1sum_time:
        # changed sha1sum?
        old_sha1sum = sha1sum
        new_sha1sum = _create_new_sha1(madfile)

        if not new_sha1sum:
            # no sha1sum could be obtained (missing file, broken link?) -
            # there is nothing to compare or to copy; slicing the False
            # returned above would raise a TypeError
            return

        if old_sha1sum == new_sha1sum:
            COUNTER['unchanged'] += 1
        else:
            #record has changed - copy the core data from the old to the
            #new record.

            old_core_id = old_sha1sum[:24]
            new_core_id = new_sha1sum[:24]

            lg.info("file changed: %s", madfile['inputfile'][-30:])
            lg.debug("coreid %s -> %s", old_core_id, new_core_id)

            #prepare record
            old_core_record = core_db.find_one({'_id': old_core_id})
            if not old_core_record:
                old_core_record = {}
            if 'old_sha1sums' not in old_core_record:
                old_core_record['old_sha1sums'] = []
            old_core_record['old_sha1sums'].append(old_sha1sum)
            old_core_record['sha1sum'] = new_sha1sum
            # the copy gets the new core id - drop the old one
            if '_id' in old_core_record:
                del old_core_record['_id']

            #store in core database
            core_db.update({'_id': new_core_id},
                           {"$set": old_core_record},
                           upsert=True)
            madfile.mad.update(old_core_record)

            save_to_mongo(app, madfile)
            COUNTER['changed'] += 1
    else:
        _prep_madfile(madfile, sha1sum, sha1sum_time)
Beispiel #47
0
def mongo_count(app, args):
    """
    Print the number of records in the transient database.
    """
    collection = get_mongo_transient_db(app)
    record_count = collection.count()
    print(record_count)
Beispiel #48
0
def madfile_init(app, madfile):
    """
    Initialize this madfile - mainly - check if the mongo transient database
    knows about this file, and has the SHA1SUM. The SHA1SUM is then used to get
    the data from the core database

    Three cases are distinguished:

    * no usable sha1sum / sha1sum time on record: a new sha1sum is
      calculated,
    * the file's mtime is unknown or later than the sha1sum time: a new
      sha1sum is calculated and, if it differs from the old one, the core
      record of the old sha1sum is copied to the new core id,
    * otherwise the recorded sha1sum is put on the madfile as is.

    If no sha1sum can be calculated the madfile is left untouched.

    :param app: application object handed to the database helpers
    :param madfile: the madfile to initialize; modified in place
    """
    global COUNTER
    COUNTER['init'] += 1

    trans_db = get_mongo_transient_db(app)
    core_db = get_mongo_core_db(app)

    trans_id = get_mongo_transient_id(madfile)
    rec = trans_db.find_one({'_id': trans_id})

    mtime = madfile.get('mtime')
    sha1sum = None
    sha1sum_time = None

    #lg.setLevel(logging.DEBUG)

    if isinstance(rec, dict):
        sha1sum = rec.get('sha1sum')
        sha1sum_time = rec.get('sha1sum_time')

    def _prep_madfile(_madfile, sha1, sha1_time):
        # the core id is the first 24 characters of the sha1sum
        _madfile.all['_id_core'] = sha1[:24]
        _madfile.all['sha1sum'] = sha1
        _madfile.mad['sha1sum'] = sha1
        _madfile.all['sha1sum_time'] = sha1_time

    def _create_new_sha1(_madfile):
        # returns the new sha1sum, or False if none could be calculated

        lg.debug("recreate shasum for %s", _madfile['inputfile'])
        COUNTER['calc'] += 1
        sha1 = mad2.hash.get_sha1(_madfile['fullpath'])
        sha1_time = datetime.datetime.utcnow()

        if sha1 is None:
            #still not?? maybe the file does not exist? Link is broken?? Will not save this
            return False

        lg.debug("shasum for %s (%s) is %s", _madfile['inputfile'], trans_id,
                 sha1)

        # NOTE(review): the transient db record is not written here; the
        # new sha1sum only ends up on the madfile - presumably it is
        # persisted when the madfile is saved, confirm against callers
        _prep_madfile(_madfile, sha1, sha1_time)
        return sha1

    if sha1sum is None or not isinstance(sha1sum_time, datetime.datetime):
        # no shasum - recreate
        _create_new_sha1(madfile)
    elif mtime is None or mtime > sha1sum_time:
        # changed sha1sum?
        old_sha1sum = sha1sum
        new_sha1sum = _create_new_sha1(madfile)

        if not new_sha1sum:
            # no sha1sum could be calculated (missing file, broken
            # link?) - there is nothing to compare or to copy; slicing
            # the False returned above would raise a TypeError
            return

        if old_sha1sum == new_sha1sum:
            COUNTER['unchanged'] += 1
        else:
            #record has changed - copy the core data from the old to the
            #new record.

            old_core_id = old_sha1sum[:24]
            new_core_id = new_sha1sum[:24]

            lg.info("file changed: %s", madfile['inputfile'][-30:])
            lg.debug("coreid %s -> %s", old_core_id, new_core_id)

            #prepare record
            old_core_record = core_db.find_one({'_id': old_core_id})
            if not old_core_record:
                old_core_record = {}
            if 'old_sha1sums' not in old_core_record:
                old_core_record['old_sha1sums'] = []
            old_core_record['old_sha1sums'].append(old_sha1sum)
            old_core_record['sha1sum'] = new_sha1sum
            # the copy gets the new core id - drop the old one
            if '_id' in old_core_record:
                del old_core_record['_id']

            #store in core database
            core_db.update({'_id': new_core_id},
                           {"$set": old_core_record},
                           upsert=True)
            madfile.mad.update(old_core_record)

            save_to_mongo(app, madfile)
            COUNTER['changed'] += 1
    else:
        _prep_madfile(madfile, sha1sum, sha1sum_time)
Beispiel #49
0
def update(app, args):
    """
    Update the transient db in this directory and below.

    Walks the current working directory and reconciles what is on disk
    with the transient database records of this host:

    * records of files that are no longer on disk are queued for removal,
    * files whose mtime is more than half a second later than the
      recorded sha1sum time are saved again,
    * files without a record are saved, unless they are smaller than
      ``args.min_file_size``,
    * directories known to the database in which nothing was found (or
      which were not visited at all) have their records removed at the
      end.

    :param app: application object handed to the database helpers
    :param args: parsed command line; ``min_file_size`` is read
    """

    global MONGO_REMOVE_CACHE
    global MONGO_SAVE_CACHE
    global COUNTER

    transient_db = get_mongo_transient_db(app)
    ignore_dirs = ['.*', '.git', 'tmp']
    ignore_files = ['*.log', '*~', '*#', 'SHA1SUMS*', 'mad.config']
    basedir = os.getcwd()
    hostname = socket.gethostname()

    # Match basedir itself and everything below it. The pattern has to be
    # anchored, escaped and end on a path boundary: every directory that
    # matches but is not visited by the walk below gets its records
    # removed, so an unanchored pattern would wipe unrelated directories
    # that merely contain this path.
    find_dir_regex = '^{}(/|$)'.format(re.escape(basedir.rstrip('/')))
    lg.debug("searching for dirs with regex: %s", find_dir_regex)
    query = {'host': hostname,
             'dirname': {"$regex": find_dir_regex}}
    trans_dirs = list(transient_db.find(query).distinct('dirname'))

    #to be safe - strip trailing slashes
    trans_dirs = [x.rstrip('/') for x in trans_dirs]
    # every directory starts out as a removal candidate; directories in
    # which something is found are taken off this list during the walk
    dirs_to_delete = copy.copy(trans_dirs)
    lg.info("%d dirs with data in the transient db", len(trans_dirs))

    def _name_match(fn, ignore_list):
        return any(fnmatch(fn, pattern) for pattern in ignore_list)

    for root, dirs, files in os.walk(basedir):

        root = root.rstrip('/')
        COUNTER['dir'] += 1

        # prune in place so os.walk does not descend into ignored
        # directories; the mad.ignore check must be relative to root, not
        # to the current working directory
        dirs[:] = [x for x in dirs
                   if not _name_match(x, ignore_dirs)
                   and not os.path.exists(os.path.join(root, x, 'mad.ignore'))]

        must_save_files = [x for x in files if not _name_match(x, ignore_files)]

        remove_dir = True

        if len(must_save_files) > 0:
            lg.debug('files to be saved in %s', root)
            remove_dir = False

        lg.info('%s: %s', root[-40:], str(dict(COUNTER)))

        trans_records = transient_db.find(
            {"dirname": root,
             "host": hostname, },
            {"_id_transient": 1,
             "filename": 1,
             "sha1sum": 1,
             "sha1sum_time": 1, })

        files_on_disk = set(files)

        for trec in trans_records:

            if trec['filename'] not in files_on_disk:
                COUNTER['rm'] += 1
                lg.debug('deleted: %s', trec['filename'])
                MONGO_REMOVE_CACHE.append(trec['_id_transient'])
                continue

            # this file is in both the db & on disk - check mtime
            remove_dir = False  # stuff in this folder - do not delete!
            fullpath = os.path.join(root, trec['filename'])
            fstat = os.lstat(fullpath)
            mtime = datetime.utcfromtimestamp(fstat.st_mtime)

            if 'sha1sum_time' in trec:
                timediff = (mtime - trec['sha1sum_time']).total_seconds()
            else:
                timediff = 1e12  # force recalculation

            # allow for at least half a second of leeway - at times the
            # difference between the modification time and the time the
            # sha1sum was taken is too small to resolve
            if timediff > 0.5:
                # might be modified - create a madfile object which will
                # check more thoroughly
                COUNTER['mod?'] += 1
                madfile = mad2.util.get_mad_file(app, fullpath)
                save_to_mongo(app, madfile)
            else:
                COUNTER['ok'] += 1

            # remove this file from the "must save" list - it's already
            # present
            if trec['filename'] in must_save_files:
                must_save_files.remove(trec['filename'])

        # save new files
        for filename in must_save_files:
            filename = os.path.join(root, filename)
            remove_dir = False  # again - stuff here - do not remove
            filestat = os.lstat(filename)
            if filestat.st_size < args.min_file_size:
                continue
            COUNTER['new'] += 1
            madfile = mad2.util.get_mad_file(app, filename)
            save_to_mongo(app, madfile)

        if not remove_dir:
            if root in dirs_to_delete:
                dirs_to_delete.remove(root)
        mongo_flush(app)

    if len(dirs_to_delete) > 0:
        lg.info("lastly: removing records from %d dirs", len(dirs_to_delete))
    for dirname in dirs_to_delete:
        # queue every record of this directory for removal
        trans_records = transient_db.find(
            {"dirname": dirname,
             "host": hostname, },
            {"_id_transient": 1})
        for record in trans_records:
            MONGO_REMOVE_CACHE.append(record['_id_transient'])
        mongo_flush(app)

    mongo_flush(app)

    for k, v in list(COUNTER.items()):
        lg.warning("%10s: %d", k, v)
Beispiel #50
0
def search(app, args):
    """
    Search the transient database and print - or delete - what is found.

    Per-field arguments become query terms: ``(none)`` matches records
    without the field, ``/.../`` is taken as a regular expression, any
    other value is matched literally. Optional file size and access time
    limits narrow the query further. ``args.delete`` removes the matches
    instead of printing them; output is otherwise tab separated, YAML or
    rendered through ``args.format``.
    """

    transient = get_mongo_transient_db(app)

    def _term(raw):
        # translate one command line value into a query term
        if raw == '(none)':
            return {"$exists": False}
        if raw.startswith('/') and raw.endswith('/'):
            return re.compile(raw[1:-1])
        return raw

    query = {}

    # ('project' appears twice; the repeat has no further effect)
    for field in ['username', 'backup', 'volume', 'host', 'dirname',
                  'sha1sum', 'project', 'project', 'pi', 'category',
                  'filename', 'userid']:
        given = getattr(args, field)
        if given is not None:
            query[field] = _term(given)

    if args.min_filesize:
        lower = mad2.util.interpret_humansize(args.min_filesize)
        query['filesize'] = {"$gt": lower}

    if args.max_filesize:
        upper = mad2.util.interpret_humansize(args.max_filesize)
        query.setdefault('filesize', {})["$lt"] = upper

    if args.atime_older_than:
        span = datetime.timedelta(
            seconds=pytimeparse.parse(args.atime_older_than))
        query['atime'] = {"$lte": datetime.datetime.utcnow() - span}

    if args.delete:
        transient.remove(query)
        return

    found = transient.find(query)

    if args.sort:
        found = found.sort(args.sort, pymongo.ASCENDING)
    elif args.reverse_sort:
        found = found.sort(args.reverse_sort, pymongo.DESCENDING)

    if args.limit > 0:
        found = found.limit(args.limit)

    if args.tsv:
        # fall back to a fixed column set when the format was left at
        # its '{fullpath}' value
        if args.format == '{fullpath}':
            columns = 'host fullpath filesize category'.split()
        else:
            columns = args.format.split(',')
        for hit in found:
            print("\t".join(str(hit.get(col, 'n.a.')) for col in columns))
    elif args.raw:
        print(yaml.safe_dump(list(found), default_flow_style=False))
    else:
        #ensure tab characters
        layout = args.format.replace(r'\t', '\t')
        for hit in found:
            rendered = None
            while rendered is None:
                try:
                    rendered = layout.format(**hit)
                except KeyError as missing:
                    # field not on this record - substitute and retry
                    hit[missing.args[0]] = '(no value)'
            print(rendered)
Beispiel #51
0
def mongo_sum2(app, args):
    """
    Print total file size and file count per pair of group-by fields.

    Records with ``orphan`` set to False are grouped on the two fields
    named by ``args.group_by_1`` and ``args.group_by_2``. One line is
    printed per group, followed by a grand total.

    :param app: application object handed to the database helper
    :param args: parsed command line; ``group_by_1``, ``group_by_2``,
        ``sort_on_field`` and ``human`` are read
    """
    gb1_field = "${}".format(args.group_by_1)
    gb2_field = "${}".format(args.group_by_2)

    MONGO_mad = get_mongo_transient_db(app)

    # sort on the group key (ascending) or on the total size (descending)
    if args.sort_on_field:
        sort_field = '_id'
        sort_order = 1
    else:
        sort_field = 'total'
        sort_order = -1

    res = MONGO_mad.aggregate([
        {"$match": {"orphan": False}},
        {'$group': {
            "_id": {
                "group1": gb1_field,
                "group2": gb2_field},
            "total": {"$sum": "$filesize"},
            "count": {"$sum": 1}}},
        # the key must be the chosen field, not the literal "sort_field"
        {"$sort": {
            sort_field: sort_order
        }}
    ])

    rows = [(str(r['_id'].get('group1', '-')),
             str(r['_id'].get('group2', '-')),
             r['total'],
             r['count'])
            for r in res['result']]

    # column widths: wide enough for the longest label and for "Total"
    gl1 = gl2 = len("Total")
    for g1, g2, _, _ in rows:
        gl1 = max(gl1, len(g1))
        gl2 = max(gl2, len(g2))

    fms = "{:" + str(gl1) + "}  {:" + str(gl2) + "}  {:>10}  {:>9}"

    total_size = 0
    total_count = 0
    for g1, g2, total, count in rows:
        total_size += total
        total_count += count
        if args.human:
            print(fms.format(g1, g2, humansize(total), count))
        else:
            print("{}\t{}\t{}\t{}".format(g1, g2, total, count))

    # grand total - the accumulated values, not those of the last row
    if args.human:
        print(fms.format("Total", "", humansize(total_size), total_count))
    else:
        print("Total\t\t{}\t{}".format(total_size, total_count))
Beispiel #52
0
def mongo_sum2(app, args):
    """
    Print total file size and file count per pair of group-by fields.

    Records with ``orphan`` set to False are grouped on the two fields
    named by ``args.group_by_1`` and ``args.group_by_2``. One line is
    printed per group, followed by a grand total.

    :param app: application object handed to the database helper
    :param args: parsed command line; ``group_by_1``, ``group_by_2``,
        ``sort_on_field`` and ``human`` are read
    """
    gb1_field = "${}".format(args.group_by_1)
    gb2_field = "${}".format(args.group_by_2)

    MONGO_mad = get_mongo_transient_db(app)

    # sort on the group key (ascending) or on the total size (descending)
    if args.sort_on_field:
        sort_field = '_id'
        sort_order = 1
    else:
        sort_field = 'total'
        sort_order = -1

    query = [{
        "$match": {
            "orphan": False
        }
    }, {
        '$group': {
            "_id": {
                "group1": gb1_field,
                "group2": gb2_field
            },
            "total": {
                "$sum": "$filesize"
            },
            "count": {
                "$sum": 1
            }
        }
    }, {
        # the key must be the chosen field, not the literal "sort_field"
        "$sort": {
            sort_field: sort_order
        }
    }]

    rows = [(str(r['_id'].get('group1', '-')),
             str(r['_id'].get('group2', '-')),
             r['total'],
             r['count'])
            for r in MONGO_mad.aggregate(query)]

    # column widths: wide enough for the longest label and for "Total"
    gl1 = gl2 = len("Total")
    for g1, g2, _, _ in rows:
        gl1 = max(gl1, len(g1))
        gl2 = max(gl2, len(g2))

    fms = "{:" + str(gl1) + "}  {:" + str(gl2) + "}  {:>10}  {:>9}"

    total_size = 0
    total_count = 0
    for g1, g2, total, count in rows:
        total_size += total
        total_count += count
        if args.human:
            print(fms.format(g1, g2, humansize(total), count))
        else:
            print("{}\t{}\t{}\t{}".format(g1, g2, total, count))

    # grand total - the accumulated values, not those of the last row
    if args.human:
        print(fms.format("Total", "", humansize(total_size), total_count))
    else:
        print("Total\t\t{}\t{}".format(total_size, total_count))
Beispiel #53
0
def repl(app, args):
    """
    Show copies of a file

    For every input, the collection returned by get_mongo_transient_db is
    searched for records carrying the same sha1sum and one line is printed
    per matching record.

    The input is taken from ``args.file`` when that list is non-empty and
    every entry is a 40 character string that is not an existing path; the
    entries are then used as sha1sums directly. In all other cases the
    inputs come from get_all_mad_files and their ``sha1sum`` value is used.

    Other arguments read from ``args``:

    * ``volume`` - if set, skip records with a different ``volume``
    * ``path_fragment`` - if set, skip records whose ``fullpath`` does not
      contain it
    * ``echo`` - do not list records; print the input once when the query
      matched more than one record (counted before filtering) and at
      least one record passes the filters
    * ``raw_output`` - print tab separated values instead of colored text
    """

    MONGO_mad = get_mongo_transient_db(app)

    # hosts flagged as backup in the configuration get highlighted
    backup_hosts = set()
    for host in app.conf['host']:
        if app.conf['host'][host]['backup']:
            backup_hosts.add(host)

    # treat the arguments as sha1sums only if all of them look like one
    check_shasums = len(args.file) > 0 and all(
        not os.path.exists(f) and len(f) == 40 for f in args.file)

    def _process_query(query, madfile_in):
        # madfile_in is None when the query was built from a bare sha1sum
        res = list(MONGO_mad.find(query))
        for r in res:
            if args.volume and r['volume'] != args.volume:
                continue

            if args.path_fragment and \
               args.path_fragment not in r['fullpath']:
                continue

            if args.echo:
                if len(res) > 1:
                    if madfile_in is None:
                        # no mad file to name in sha1sum mode - echo the
                        # sha1sum instead of failing on None['inputfile']
                        print(query['sha1sum'])
                    else:
                        print(madfile_in['inputfile'])
                break

            # time passed since the record was saved; computed once so the
            # day count and the raw value refer to the same moment
            age = arrow.now() - arrow.get(r['save_time'])
            stag = 'S' if r.get('is_symlink', False) else '.'

            if args.raw_output:
                print("\t".join(
                    map(str, [
                        r['nlink'], stag, age,
                        r['filesize'], r['host'], r['fullpath']
                    ])))
            else:
                cprint('%1d%s' % (r['nlink'], stag), 'yellow', end=" ")
                cprint('%3d' % age.days, 'green', end="d ")
                cprint('%6s' % humansize(r['filesize']), 'white', end=" ")
                if r['host'] in backup_hosts:
                    cprint(r['host'], 'green', attrs=['bold'], end=':')
                else:
                    cprint(r['host'], 'cyan', end=':')
                cprint(r['fullpath'])

    if check_shasums:
        for sha1sum in args.file:
            _process_query({'sha1sum': sha1sum}, None)
    else:
        for madfile in get_all_mad_files(app, args):
            _process_query({'sha1sum': madfile['sha1sum']}, madfile)
Beispiel #54
0
def ta_tree(app, args):
    """
    Build the transaction graph around one object and save it as GraphML.

    ``args.object`` is used as a sha1sum when it is 40 characters long and
    not an existing path; otherwise it is resolved through get_mad_file
    and the ``sha1sum`` of the result is used.

    Starting from that sha1sum, all transactions referring to it are
    looked up. Within a transaction every io entry is linked to every
    entry of category ``output`` with a directed edge; the edge carries a
    ``count`` and a ``type`` (the category of the source entry, or
    ``sibling`` when the source is an output as well). Newly found
    sha1sums are followed recursively. Nodes are annotated with the
    values found for that sha1sum in the transient collection.

    The graph is written to ``test.graphml``.
    """

    import networkx as nx

    G = nx.DiGraph()

    db_t, db_s2t = get_mongo_transact_db(app)
    trans_db = get_mongo_transient_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        madfile = get_mad_file(app, args.object)
        sha1sum = madfile['sha1sum']

    def _get_trarec(sha1sum):
        # collect the distinct values per field over all records of this
        # sha1sum and flatten each set into one ';' separated string
        rv = defaultdict(set)
        for rec in trans_db.find(dict(sha1sum=sha1sum)):
            for field in ['project', 'filename', 'filesize', 'analyst',
                          'pi', 'username', 'fullpath']:
                if field in rec:
                    rv[field].add(rec[field])
        return {k: ';'.join(map(str, v)) for (k, v) in rv.items()}

    def _add_node(G, sha1sum):
        if sha1sum in G:
            return

        # pass the attributes to add_node instead of going through
        # G.node[...], which no longer exists in recent networkx releases
        G.add_node(sha1sum, **_get_trarec(sha1sum))

    _add_node(G, sha1sum)

    sha1sum_processed = set()

    def _find_relations_shasum(G, sha1sum):
        if sha1sum in sha1sum_processed:
            return

        sha1sum_processed.add(sha1sum)

        for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
            tra = db_t.find_one(dict(_id=s2t['transaction_id']))
            if tra is None:
                # reference to a transaction that is not in the database
                continue

            io = tra['io']
            ioo = [x for x in io if x['category'] == 'output']
            if len(ioo) == 0:
                continue

            for fa, fb in itertools.product(io, ioo):
                if fa == fb:
                    continue
                fas, fbs = fa['sha1sum'], fb['sha1sum']
                _add_node(G, fas)
                _add_node(G, fbs)
                ltype = fa['category']
                if ltype == 'output':
                    ltype = 'sibling'
                G.add_edge(fas, fbs)
                G[fas][fbs]['count'] = G[fas][fbs].get('count', 0) + 1
                G[fas][fbs]['type'] = ltype

                _find_relations_shasum(G, fas)
                _find_relations_shasum(G, fbs)

    _find_relations_shasum(G, sha1sum)
    nx.write_graphml(G, 'test.graphml')
Beispiel #55
0
def waste_text_report(app, args):
    """
    Print a plain text report on the files wasting the most space.

    The ranking comes from _run_waste_command; every entry is expected to
    provide ``_id`` (the sha1sum), ``waste`` and ``filesize``. The report
    consists of:

    * an optional ``Subject:`` line (``args.subject``)
    * a summary of the first (top) entry
    * an overview table of the first 20 entries
    * per entry, the individual records grouped per host

    ``args.force`` is handed on to _run_waste_command.
    """

    db = get_mongo_transient_db(app)

    res = _run_waste_command(app, 'waste_pipeline', force=args.force)
    if isinstance(res, dict):
        # NOTE(review): this function used to index the return value with
        # ['result'], while _run_waste_command as seen elsewhere in this
        # file returns a plain list - accept both shapes.
        res = res['result']

    if args.subject:
        print("Subject: {}".format(args.subject))

    if not res:
        # nothing was found, so there is no winner to report on
        print("No waste found.")
        return

    # This week's winner
    top = res[0]
    sha1sum = top['_id']
    owners = set()
    hosts = set()

    total = 0
    for rec in db.find({'sha1sum': sha1sum}):
        total += 1
        hosts.add(rec['host'])
        owners.add(rec['username'])

    print("This week's winner: {}".format(", ".join(sorted(owners))))
    # the plural "s" goes in front of the comma ("servers,")
    print("One file, {} location{}, {} server{}, wasting {}.".format(
        total, "s" if total > 1 else "",
        len(hosts), "s" if len(hosts) > 1 else "",
        humansize(top['waste'])))
    print("try:\n   mad repl {}\n\n".format(sha1sum))

    no_to_print = 20
    shortlist = res[:no_to_print]

    print("Waste overview: (no / sha1sum / waste / filesize)")
    print("=================================================\n")
    for i, r in enumerate(shortlist):
        sha1sum = r['_id']
        if not sha1sum.strip():
            continue
        print("{:2d} {} {:>10} {:>10}"
              .format(i, sha1sum, humansize(r['waste']),
                      humansize(r['filesize'])))

    print("\n\nDetails: (nlink/symlink/size/owner)")
    print("===================================")
    for i, r in enumerate(shortlist):
        sha1sum = r['_id']
        if not sha1sum.strip():
            continue

        print("# {:2d} {} {:>10} {:>10}"
              .format(i, sha1sum, humansize(r['waste']),
                      humansize(r['filesize'])))

        records = collections.defaultdict(list)
        hostsize = collections.defaultdict(lambda: 0)

        for rec in db.find({'sha1sum': sha1sum}):
            host = rec['host']
            records[host].append(rec)
            # hard links share their blocks - divide the size over them
            hostsize[host] += float(rec['filesize']) / float(rec['nlink'])

        for h in records:
            print("# Host: {}, copies: {}, total use: {}".format(
                h, len(records[h]), humansize(hostsize[h])))
            # list the records of *this* host (not of whichever host
            # happened to be seen last in the loop above)
            for rec in records[h]:
                smarker = '.'
                if rec.get('is_symlink'):
                    smarker = 'S'
                print("  {} {}".format(rec.get('nlink', '?'), smarker),
                      end=' ')
                print(humansize(rec['filesize']), end=' ')
                print(rec['username'])
                print("   " + rec['fullpath'])

        print("")