Esempio n. 1
0
def madexpand(app, job):
    """Add flattened mad metadata of the job's io files to the job context.

    Every entry of ``job.data['io']`` is looked up by name in ``job.ctx``.
    When the context value is a string naming an existing path, the layers
    of that file's mad ``stack`` are merged into a single dict and stored
    under ``job.ctx['mad'][<io name>]``. Layers are applied in reverse stack
    order, so entries earlier in the stack override later ones.

    ``app`` is not used here; the mad application is obtained from
    ``get_mad_app()``.
    """
    for entry in job.data['io']:
        io_name = entry['name']
        path = job.ctx[io_name]

        # only string values that point at something on disk are expanded
        if not isinstance(path, str) or not os.path.exists(path):
            continue

        merged = {}
        for layer in get_mad_file(get_mad_app(), path).stack[::-1]:
            merged.update(dict(layer))

        if 'mad' not in job.ctx:
            job.ctx['mad'] = {}
        job.ctx['mad'][io_name] = merged
Esempio n. 2
0
def madexpand(app, job):
    """Expose mad metadata of existing io files as ``job.ctx['mad']``.

    For each io declaration in ``job.data['io']`` the matching context
    value is inspected; non-string values and paths that do not exist are
    left alone. For the remaining files the mad ``stack`` is collapsed into
    one dict (walking the stack back to front, so the front of the stack
    has the final say) and saved as ``job.ctx['mad'][<io name>]``.

    The ``app`` argument is unused; ``get_mad_app()`` supplies the mad
    application instead.
    """
    for io_decl in job.data['io']:
        key = io_decl['name']
        candidate = job.ctx[key]

        is_path = isinstance(candidate, str)
        if not is_path:
            continue
        if not os.path.exists(candidate):
            continue

        mad_record = get_mad_file(get_mad_app(), candidate)
        flattened = {}
        for level in mad_record.stack[::-1]:
            flattened.update(dict(level))

        # create the container lazily, on the first expanded file
        if 'mad' not in job.ctx:
            job.ctx['mad'] = {}
        job.ctx['mad'][key] = flattened
Esempio n. 3
0
def ta_sha1sum(app, args):
    """Show transactions associated with a sha1sum

    ``args.object`` is used directly as the sha1sum when it is 40
    characters long and does not name an existing path; otherwise it is
    treated as a file and the sha1sum is read from its mad file.

    For every io entry of every linked transaction that carries this
    sha1sum one line is printed: transaction id, io category, humanized
    transaction time and the command line (whitespace-normalized and
    shortened to at most 50 characters).
    """
    db_t, db_s2t = get_mongo_transact_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        madfile = get_mad_file(app, args.object)
        sha1sum = madfile['sha1sum']

    for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
        tra = db_t.find_one(dict(_id=s2t['transaction_id']))
        if tra is None:
            # link points at a transaction that is no longer stored
            continue

        natime = humanize.naturaldate(tra['time'])

        # the command line is the same for every io entry of a transaction,
        # so format it once per transaction
        raw_cl = tra.get('cl', 'n.a.')
        try:
            ncl = " ".join(shlex.split(raw_cl))
        except ValueError:
            # unbalanced quotes (e.g. a stored script): fall back to plain
            # whitespace normalization instead of aborting the listing
            ncl = " ".join(raw_cl.split())
        if len(ncl) > 50:
            ncl = ncl[:47] + '...'

        for io in tra['io']:
            if io['sha1sum'] != sha1sum:
                continue
            cprint(tra['_id'], color='cyan', end=' (')
            cprint(io['category'], color='yellow', end=') ')
            cprint(natime, color='green', end=": ")
            cprint(ncl)
Esempio n. 4
0
def ta_sha1sum(app, args):
    """Show transactions associated with a sha1sum

    ``args.object`` is used directly as the sha1sum when it is 40
    characters long and does not name an existing path; otherwise it is
    treated as a file and the sha1sum is read from its mad file.

    For every io entry of every linked transaction that carries this
    sha1sum one line is printed: transaction id, io category, humanized
    transaction time and the command line (whitespace-normalized and
    shortened to at most 50 characters).
    """
    db_t, db_s2t = get_mongo_transact_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        madfile = get_mad_file(app, args.object)
        sha1sum = madfile['sha1sum']

    for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
        tra = db_t.find_one(dict(_id=s2t['transaction_id']))
        if tra is None:
            # link points at a transaction that is no longer stored
            continue

        natime = humanize.naturaldate(tra['time'])

        # the command line is the same for every io entry of a transaction,
        # so format it once per transaction
        raw_cl = tra.get('cl', 'n.a.')
        try:
            ncl = " ".join(shlex.split(raw_cl))
        except ValueError:
            # unbalanced quotes (e.g. a stored script): fall back to plain
            # whitespace normalization instead of aborting the listing
            ncl = " ".join(raw_cl.split())
        if len(ncl) > 50:
            ncl = ncl[:47] + '...'

        for io in tra['io']:
            if io['sha1sum'] != sha1sum:
                continue
            cprint(tra['_id'], color='cyan', end=' (')
            cprint(io['category'], color='yellow', end=') ')
            cprint(natime, color='green', end=": ")
            cprint(ncl)
Esempio n. 5
0
def prov(app, args):
    """
    Show provenance data

    For every mad file selected through ``args`` that has a 'provenance'
    entry, the record stored under the last key (in sorted order) is shown.
    With ``args.raw`` set, the key and the record's ``pretty()`` rendering
    are printed and the function returns right after that first record.
    Otherwise a colored summary is printed: date, tool, version, command
    line and the files the record was derived from.

    Files without provenance, or with an empty provenance mapping, are
    skipped.
    """
    def ccp(*cargs, **kwargs):
        # cprint that stays on the current line unless 'end' is given
        kwargs.setdefault('end', '')
        cprint(*cargs, **kwargs)

    def cckv(key, val, **kwargs):
        # "key: value" line with the key in yellow; extra kwargs are ignored
        cprint(key, "yellow", end=": ")
        cprint(val)

    this_host = socket.gethostname()

    for madfile in get_all_mad_files(app, args):
        if 'provenance' not in madfile:
            # nothing to show - continue
            continue

        prov_data = madfile['provenance']
        prov_keys = sorted(prov_data.keys())
        if not prov_keys:
            # provenance entry exists but holds no records
            continue
        latest_key = prov_keys[-1]
        record = prov_data[latest_key]

        if args.raw:
            print("provenance_key: {}".format(latest_key))
            print(record.pretty())
            # NOTE(review): this stops after the first file with provenance,
            # also when several files were given - confirm this is intended
            return

        # pretty output
        cckv("Date", record['stopped_at_time'])
        cckv("Tool", record['tool_name'])
        version = record['tool_version']
        if len(version) > 50:
            ccp('Version: ', "yellow")
            wrapped = textwrap.wrap(version,
                                    initial_indent="         ",
                                    subsequent_indent="     ")
            for i, line in enumerate(wrapped):
                if i == 0:
                    # first line follows the 'Version: ' label directly
                    line = line.strip()
                print(line)
        else:
            cckv("Version", version)

        ccp("Command line:", "yellow", end="\n")
        print(" \\\n".join(
            textwrap.wrap(record['kea_command_line'],
                          initial_indent='  ',
                          subsequent_indent='       ')))
        cprint("Related files:", "yellow")

        derived = record['derived_from']
        for filename in derived:
            finf = derived[filename]
            ccp("  " + finf['category'], 'magenta')
            ccp("/" + filename, "blue")
            ccp("\n")

            # files recorded on another host cannot be checked from here
            is_local = finf['host'] == this_host

            ccp("    Host: ", "yellow")
            ccp("{host}\n".format(**finf), "green" if is_local else "red")

            ccp("    Path: ", "yellow")
            if not is_local:
                path_color = "grey"
            elif os.path.exists(finf['filename']):
                path_color = "green"
            else:
                path_color = "red"
            ccp("{filename}\n".format(**finf), path_color)

            ccp("    Sha1sum: ", "yellow")
            if not is_local:
                sha_color = "grey"
            else:
                fmaf = get_mad_file(app, finf['filename'])
                if fmaf['sha1sum'] == finf['sha1sum']:
                    sha_color = "green"
                else:
                    sha_color = "red"
            ccp("{sha1sum}\n".format(**finf), sha_color)
Esempio n. 6
0
def ta_tree(app, args):
    """Build the transaction graph around a file and write it as GraphML.

    ``args.object`` is used directly as the sha1sum when it is 40
    characters long and does not name an existing path; otherwise the
    sha1sum is read from the mad file of that path.

    Starting at that sha1sum, all linked transactions are followed. Within
    a transaction every (file, output file) pair yields a directed edge
    file -> output. The edge attribute 'count' counts how often the pair
    was seen, 'type' is the category of the source file, or 'sibling' when
    the source is itself an output. Nodes carry the fields found for the
    sha1sum in the transient database, multiple values joined with ';'.

    The graph is written to 'test.graphml' in the current directory.
    """

    import networkx as nx

    G = nx.DiGraph()

    db_t, db_s2t = get_mongo_transact_db(app)
    trans_db = get_mongo_transient_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        madfile = get_mad_file(app, args.object)
        sha1sum = madfile['sha1sum']

    node_fields = ['project', 'filename', 'filesize', 'analyst',
                   'pi', 'username', 'fullpath']

    def _get_trarec(sha1sum):
        # collect the distinct values of each known field for this sha1sum
        rv = defaultdict(set)
        for rec in trans_db.find(dict(sha1sum=sha1sum)):
            for field in node_fields:
                if field in rec:
                    rv[field].add(rec[field])
        return {k: ';'.join(map(str, v)) for (k, v) in rv.items()}

    def _add_node(G, sha1sum):
        if sha1sum in G:
            return

        # attributes are passed as keywords: the `G.node` mapping used
        # before was removed in networkx 2.4
        G.add_node(sha1sum, **_get_trarec(sha1sum))

    _add_node(G, sha1sum)

    sha1sum_processed = set()

    def _find_relations_shasum(G, sha1sum):
        if sha1sum in sha1sum_processed:
            return

        sha1sum_processed.add(sha1sum)

        for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
            tra = db_t.find_one(dict(_id=s2t['transaction_id']))
            if tra is None:
                # link points at a transaction that is no longer stored
                continue
            io = tra['io']
            ioo = [x for x in io if x['category'] == 'output']
            if not ioo:
                continue

            for fa, fb in itertools.product(io, ioo):
                if fa == fb:
                    continue
                fas, fbs = fa['sha1sum'], fb['sha1sum']
                _add_node(G, fas)
                _add_node(G, fbs)
                ltype = fa['category']
                if ltype == 'output':
                    ltype = 'sibling'
                G.add_edge(fas, fbs)
                edge = G[fas][fbs]
                edge['count'] = edge.get('count', 0) + 1
                edge['type'] = ltype

                _find_relations_shasum(G, fas)
                _find_relations_shasum(G, fbs)

    _find_relations_shasum(G, sha1sum)
    nx.write_graphml(G, 'test.graphml')
Esempio n. 7
0
def ta_add(app, args):
    """Record a new transaction

    All files are put in a group, by default, this group has the same
    name as the category, but when more group names are required, they
    can be specified using a colon (e.g. fq_input:filename.fq)

    The transaction id is the sha1 over the sorted set of: a random salt,
    `uname -a` output, hostname, transaction time, command line (when
    given) and the sha1sum of every file. Keywords flagged 'propagate' in
    ``app.conf['keywords']`` that have one consistent value across all
    input files are copied onto the output files. The transaction and the
    sha1sum -> transaction links are stored in the transaction database.

    Exits with status -1 when one of the listed files does not exist.
    """
    salt = str(uuid4())
    uname = sp.getoutput('uname -a')
    host = socket.gethostname()

    items_to_hash = [salt, uname, host]

    transact = dict(io=[],
                    salt=salt,
                    host=host,
                    uname=uname)
    if args.time:
        import dateutil.parser
        time = dateutil.parser.parse(args.time)
    else:
        time = datetime.utcnow()

    # sub-second precision is dropped before hashing and storing
    time = time.replace(microsecond=0)

    items_to_hash.append(time.isoformat())
    transact['time'] = time

    if args.cl:
        cl = args.cl.strip()
        transact['cl'] = cl
        items_to_hash.append(cl)
    elif args.script is not None and os.path.exists(args.script):
        # no command line given: record the script contents instead
        with open(args.script) as F:
            cl = F.read()
        transact['cl'] = cl
        items_to_hash.append(cl)

    db_t, db_s2t = get_mongo_transact_db(app)

    all_file_shasums = []

    to_propagate = {}
    do_not_propagate = set()

    # 'input' comes first so that the propagation table is complete
    # before any output file is handled
    for cat in 'input output db executable misc'.split():
        filenames = getattr(args, cat)
        if filenames is None:
            continue

        for filename in filenames:

            group = cat
            if ':' in filename:
                group, filename = filename.split(':', 1)

            if cat == 'executable':
                filename = exec_expander(filename)

            if not os.path.isfile(filename):
                # fatal: report at error level so the reason for the
                # non-zero exit is visible with default log settings
                lg.error("all files of transaction must exist")
                lg.error("cannot find %s", filename)
                raise SystemExit(-1)

            madfile = get_mad_file(app, filename)

            if cat == 'input':
                # find propagateable properties
                for k, v in madfile.mad.items():
                    propable = app.conf['keywords'][k].get('propagate', False)
                    if not propable:
                        continue
                    if k in do_not_propagate:
                        continue
                    elif k in to_propagate:
                        if to_propagate[k] != v:
                            # if different values for var k in input
                            # do not propagate - exclude from further
                            # consideration
                            do_not_propagate.add(k)
                            del to_propagate[k]
                    else:
                        to_propagate[k] = v

            if cat == 'output' and to_propagate:
                for k, v in to_propagate.items():
                    if k not in madfile.mad:
                        lg.warning(
                            "propagating %s='%s' for %s", k, v, filename)
                        madfile.mad[k] = v
                # NOTE(review): this also overwrites values the output
                # file already has - confirm that is intended
                madfile.mad.update(to_propagate)

            madfile.save()

            file_sha1sum = madfile.mad['sha1sum']
            items_to_hash.append(file_sha1sum)
            all_file_shasums.append(file_sha1sum)

            transact['io'].append(
                dict(filename=filename,
                     category=cat,
                     group=group,
                     sha1sum=file_sha1sum))

    # items are sorted so the hash does not depend on collection order
    hasher = sha1()
    for item in sorted(items_to_hash):
        hasher.update(item.encode('UTF-8'))

    thash = hasher.hexdigest()
    lg.debug("transaction hash: %s", thash)
    transact['_id'] = thash

    # store transaction
    db_t.insert_one(transact)

    # store sha1sum to transaction links; insert_many rejects an empty
    # list, which happens when the transaction has no files at all
    links = [dict(transaction_id=thash, sha1sum=x)
             for x in set(all_file_shasums)]
    if links:
        db_s2t.insert_many(links)
Esempio n. 8
0
def prov(app, args):
    """
    Show provenance data

    For every mad file selected through ``args`` that has a 'provenance'
    entry, the record stored under the last key (in sorted order) is shown.
    With ``args.raw`` set, the key and the record's ``pretty()`` rendering
    are printed and the function returns right after that first record.
    Otherwise a colored summary is printed: date, tool, version, command
    line and the files the record was derived from.

    Files without provenance, or with an empty provenance mapping, are
    skipped.
    """
    def ccp(*cargs, **kwargs):
        # cprint that stays on the current line unless 'end' is given
        kwargs.setdefault('end', '')
        cprint(*cargs, **kwargs)

    def cckv(key, val, **kwargs):
        # "key: value" line with the key in yellow; extra kwargs are ignored
        cprint(key, "yellow", end=": ")
        cprint(val)

    this_host = socket.gethostname()

    for madfile in get_all_mad_files(app, args):
        if 'provenance' not in madfile:
            # nothing to show - continue
            continue

        prov_data = madfile['provenance']
        prov_keys = sorted(prov_data.keys())
        if not prov_keys:
            # provenance entry exists but holds no records
            continue
        latest_key = prov_keys[-1]
        record = prov_data[latest_key]

        if args.raw:
            print("provenance_key: {}".format(latest_key))
            print(record.pretty())
            # NOTE(review): this stops after the first file with provenance,
            # also when several files were given - confirm this is intended
            return

        # pretty output
        cckv("Date", record['stopped_at_time'])
        cckv("Tool", record['tool_name'])
        version = record['tool_version']
        if len(version) > 50:
            ccp('Version: ', "yellow")
            wrapped = textwrap.wrap(version,
                                    initial_indent="         ",
                                    subsequent_indent="     ")
            for i, line in enumerate(wrapped):
                if i == 0:
                    # first line follows the 'Version: ' label directly
                    line = line.strip()
                print(line)
        else:
            cckv("Version", version)

        ccp("Command line:", "yellow", end="\n")
        print(" \\\n".join(textwrap.wrap(record['kea_command_line'],
                                         initial_indent='  ',
                                         subsequent_indent='       ')))
        cprint("Related files:", "yellow")

        derived = record['derived_from']
        for filename in derived:
            finf = derived[filename]
            ccp("  " + finf['category'], 'magenta')
            ccp("/" + filename, "blue")
            ccp("\n")

            # files recorded on another host cannot be checked from here
            is_local = finf['host'] == this_host

            ccp("    Host: ", "yellow")
            ccp("{host}\n".format(**finf), "green" if is_local else "red")

            ccp("    Path: ", "yellow")
            if not is_local:
                path_color = "grey"
            elif os.path.exists(finf['filename']):
                path_color = "green"
            else:
                path_color = "red"
            ccp("{filename}\n".format(**finf), path_color)

            ccp("    Sha1sum: ", "yellow")
            if not is_local:
                sha_color = "grey"
            else:
                fmaf = get_mad_file(app, finf['filename'])
                if fmaf['sha1sum'] == finf['sha1sum']:
                    sha_color = "green"
                else:
                    sha_color = "red"
            ccp("{sha1sum}\n".format(**finf), sha_color)
Esempio n. 9
0
 def get_madfile(self, filename):
     """Return the mad file for *filename*, loaded through ``self.madapp``."""
     loader = mad2util.get_mad_file
     return loader(self.madapp, filename)
Esempio n. 10
0
def ta_tree(app, args):
    """Build the transaction graph around a file and write it as GraphML.

    ``args.object`` is used directly as the sha1sum when it is 40
    characters long and does not name an existing path; otherwise the
    sha1sum is read from the mad file of that path.

    Starting at that sha1sum, all linked transactions are followed. Within
    a transaction every (file, output file) pair yields a directed edge
    file -> output. The edge attribute 'count' counts how often the pair
    was seen, 'type' is the category of the source file, or 'sibling' when
    the source is itself an output. Nodes carry the fields found for the
    sha1sum in the transient database, multiple values joined with ';'.

    The graph is written to 'test.graphml' in the current directory.
    """

    import networkx as nx

    G = nx.DiGraph()

    db_t, db_s2t = get_mongo_transact_db(app)
    trans_db = get_mongo_transient_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        madfile = get_mad_file(app, args.object)
        sha1sum = madfile['sha1sum']

    node_fields = [
        'project', 'filename', 'filesize', 'analyst', 'pi',
        'username', 'fullpath'
    ]

    def _get_trarec(sha1sum):
        # collect the distinct values of each known field for this sha1sum
        rv = defaultdict(set)
        for rec in trans_db.find(dict(sha1sum=sha1sum)):
            for field in node_fields:
                if field in rec:
                    rv[field].add(rec[field])
        return {k: ';'.join(map(str, v)) for (k, v) in rv.items()}

    def _add_node(G, sha1sum):
        if sha1sum in G:
            return

        # attributes are passed as keywords: the `G.node` mapping used
        # before was removed in networkx 2.4
        G.add_node(sha1sum, **_get_trarec(sha1sum))

    _add_node(G, sha1sum)

    sha1sum_processed = set()

    def _find_relations_shasum(G, sha1sum):
        if sha1sum in sha1sum_processed:
            return

        sha1sum_processed.add(sha1sum)

        for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
            tra = db_t.find_one(dict(_id=s2t['transaction_id']))
            if tra is None:
                # link points at a transaction that is no longer stored
                continue
            io = tra['io']
            ioo = [x for x in io if x['category'] == 'output']
            if not ioo:
                continue

            for fa, fb in itertools.product(io, ioo):
                if fa == fb:
                    continue
                fas, fbs = fa['sha1sum'], fb['sha1sum']
                _add_node(G, fas)
                _add_node(G, fbs)
                ltype = fa['category']
                if ltype == 'output':
                    ltype = 'sibling'
                G.add_edge(fas, fbs)
                edge = G[fas][fbs]
                edge['count'] = edge.get('count', 0) + 1
                edge['type'] = ltype

                _find_relations_shasum(G, fas)
                _find_relations_shasum(G, fbs)

    _find_relations_shasum(G, sha1sum)
    nx.write_graphml(G, 'test.graphml')
Esempio n. 11
0
def ta_add(app, args):
    """Record a new transaction

    All files are put in a group, by default, this group has the same
    name as the category, but when more group names are required, they
    can be specified using a colon (e.g. fq_input:filename.fq)

    The transaction id is the sha1 over the sorted set of: a random salt,
    `uname -a` output, hostname, transaction time, command line (when
    given) and the sha1sum of every file. Keywords flagged 'propagate' in
    ``app.conf['keywords']`` that have one consistent value across all
    input files are copied onto the output files. The transaction and the
    sha1sum -> transaction links are stored in the transaction database.

    Exits with status -1 when one of the listed files does not exist.
    """
    salt = str(uuid4())
    uname = sp.getoutput('uname -a')
    host = socket.gethostname()

    items_to_hash = [salt, uname, host]

    transact = dict(io=[], salt=salt, host=host, uname=uname)
    if args.time:
        import dateutil.parser
        time = dateutil.parser.parse(args.time)
    else:
        time = datetime.utcnow()

    # sub-second precision is dropped before hashing and storing
    time = time.replace(microsecond=0)

    items_to_hash.append(time.isoformat())
    transact['time'] = time

    if args.cl:
        cl = args.cl.strip()
        transact['cl'] = cl
        items_to_hash.append(cl)
    elif args.script is not None and os.path.exists(args.script):
        # no command line given: record the script contents instead
        with open(args.script) as F:
            cl = F.read()
        transact['cl'] = cl
        items_to_hash.append(cl)

    db_t, db_s2t = get_mongo_transact_db(app)

    all_file_shasums = []

    to_propagate = {}
    do_not_propagate = set()

    # 'input' comes first so that the propagation table is complete
    # before any output file is handled
    for cat in 'input output db executable misc'.split():
        filenames = getattr(args, cat)
        if filenames is None:
            continue

        for filename in filenames:

            group = cat
            if ':' in filename:
                group, filename = filename.split(':', 1)

            if cat == 'executable':
                filename = exec_expander(filename)

            if not os.path.isfile(filename):
                # fatal: report at error level so the reason for the
                # non-zero exit is visible with default log settings
                lg.error("all files of transaction must exist")
                lg.error("cannot find %s", filename)
                raise SystemExit(-1)

            madfile = get_mad_file(app, filename)

            if cat == 'input':
                # find propagateable properties
                for k, v in madfile.mad.items():
                    propable = app.conf['keywords'][k].get('propagate', False)
                    if not propable:
                        continue
                    if k in do_not_propagate:
                        continue
                    elif k in to_propagate:
                        if to_propagate[k] != v:
                            # if different values for var k in input
                            # do not propagate - exclude from further
                            # consideration
                            do_not_propagate.add(k)
                            del to_propagate[k]
                    else:
                        to_propagate[k] = v

            if cat == 'output' and to_propagate:
                for k, v in to_propagate.items():
                    if k not in madfile.mad:
                        lg.warning("propagating %s='%s' for %s", k, v,
                                   filename)
                        madfile.mad[k] = v
                # NOTE(review): this also overwrites values the output
                # file already has - confirm that is intended
                madfile.mad.update(to_propagate)

            madfile.save()

            file_sha1sum = madfile.mad['sha1sum']
            items_to_hash.append(file_sha1sum)
            all_file_shasums.append(file_sha1sum)

            transact['io'].append(
                dict(filename=filename,
                     category=cat,
                     group=group,
                     sha1sum=file_sha1sum))

    # items are sorted so the hash does not depend on collection order
    hasher = sha1()
    for item in sorted(items_to_hash):
        hasher.update(item.encode('UTF-8'))

    thash = hasher.hexdigest()
    lg.debug("transaction hash: %s", thash)
    transact['_id'] = thash

    # store transaction
    db_t.insert_one(transact)

    # store sha1sum to transaction links; insert_many rejects an empty
    # list, which happens when the transaction has no files at all
    links = [
        dict(transaction_id=thash, sha1sum=x) for x in set(all_file_shasums)
    ]
    if links:
        db_s2t.insert_many(links)