def madexpand(app, job):
    """Attach flattened mad metadata for the job's existing io files.

    Every entry of ``job.data['io']`` names a value in ``job.ctx``.  Values
    that are not strings, or that do not point at an existing path, are
    skipped.  For the others the layers of the mad file's ``stack`` are
    merged into a single dict which is stored at ``job.ctx['mad'][name]``
    (``job.ctx['mad']`` is created on first use).

    ``app`` is not used here; the mad application comes from
    ``get_mad_app()``.
    """
    for io_spec in job.data['io']:
        io_name = io_spec['name']
        path = job.ctx[io_name]
        if not (isinstance(path, str) and os.path.exists(path)):
            continue

        mad_file = get_mad_file(get_mad_app(), path)

        # The stack is walked back to front, so keys set by layers nearer
        # the front of the stack overwrite those of layers further back.
        flattened = {}
        for layer in mad_file.stack[::-1]:
            flattened.update(dict(layer))

        if 'mad' not in job.ctx:
            job.ctx['mad'] = {}
        job.ctx['mad'][io_name] = flattened
def ta_sha1sum(app, args):
    """Show transactions associated with a sha1sum

    ``args.object`` is used as a literal sha1sum when it is 40 characters
    long and does not name an existing path; otherwise it is treated as a
    file and the checksum is read from its mad record.

    For every transaction linked to that checksum, one line is printed per
    io record carrying the checksum: transaction id, io category, humanized
    transaction time and the command line (whitespace-normalised and cut to
    at most 50 characters).
    """
    db_t, db_s2t = get_mongo_transact_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        sha1sum = get_mad_file(app, args.object)['sha1sum']

    for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
        tra = db_t.find_one(dict(_id=s2t['transaction_id']))
        if tra is None:
            # Dangling link: the transaction it points at is not in the
            # transaction collection, so there is nothing to show.
            continue

        matching = [io for io in tra['io'] if io['sha1sum'] == sha1sum]
        if not matching:
            continue

        natime = humanize.naturaldate(tra['time'])

        cl = tra.get('cl', 'n.a.')
        try:
            tokens = shlex.split(cl)
        except ValueError:
            # 'cl' may hold arbitrary text (e.g. a whole script) with
            # unbalanced quotes; fall back to plain whitespace splitting.
            tokens = cl.split()
        ncl = " ".join(tokens)
        if len(ncl) > 50:
            ncl = ncl[:47] + '...'

        for io in matching:
            cprint(tra['_id'], color='cyan', end=' (')
            cprint(io['category'], color='yellow', end=') ')
            cprint(natime, color='green', end=": ")
            cprint(ncl)
def prov(app, args):
    """
    Show provenance data

    For each mad file returned by ``get_all_mad_files(app, args)`` that has
    a non-empty ``provenance`` mapping, the entry with the greatest key is
    shown.  With ``args.raw`` set, the key and ``record.pretty()`` are
    printed and the function returns.  Otherwise a coloured report is
    printed: date, tool, version, command line and, for every file in
    ``derived_from``, its host, path and sha1sum.

    Colours of the related-file fields: green/red for files recorded on
    this host (host always green; path green when it exists; sha1sum green
    when it equals the file's current mad checksum), red host and grey
    path/sha1sum for files recorded on another host.
    """

    def ccp(*cargs, **kwargs):
        # cprint that stays on the same line unless `end` is given
        kwargs.setdefault('end', '')
        cprint(*cargs, **kwargs)

    def cckv(key, val):
        # "key: value" line with a yellow key
        cprint(key, "yellow", end=": ")
        cprint(val)

    this_host = socket.gethostname()

    for madfile in get_all_mad_files(app, args):
        if 'provenance' not in madfile:
            # nothing to show - continue
            continue

        prov_data = madfile['provenance']
        prov_keys = sorted(prov_data.keys())
        if not prov_keys:
            # a provenance mapping without entries has nothing to show
            continue
        latest_key = prov_keys[-1]
        record = prov_data[latest_key]

        if args.raw:
            print("provenance_key: {}".format(latest_key))
            print(record.pretty())
            # NOTE(review): raw mode stops after the first file that has
            # provenance; kept as-is, confirm whether that is intended.
            return

        # pretty output
        cckv("Date", record['stopped_at_time'])
        cckv("Tool", record['tool_name'])

        version = record['tool_version']
        if len(version) > 50:
            ccp('Version: ', "yellow")
            wrapped = textwrap.wrap(version, initial_indent=" ",
                                    subsequent_indent=" ")
            for i, line in enumerate(wrapped):
                if i == 0:
                    # first line continues right after the "Version: " label
                    line = line.strip()
                print(line)
        else:
            cckv("Version", version)

        ccp("Command line:", "yellow", end="\n")
        print(" \\\n".join(
            textwrap.wrap(record['kea_command_line'],
                          initial_indent=' ', subsequent_indent=' ')))

        cprint("Related files:", "yellow")
        for filename in record['derived_from']:
            finf = record['derived_from'][filename]
            is_local = finf['host'] == this_host

            ccp(" " + finf['category'], 'magenta')
            ccp("/" + filename, "blue")
            ccp("\n")

            ccp(" Host: ", "yellow")
            ccp("{host}\n".format(**finf), "green" if is_local else "red")

            ccp(" Path: ", "yellow")
            if not is_local:
                path_colour = "grey"
            elif os.path.exists(finf['filename']):
                path_colour = "green"
            else:
                path_colour = "red"
            ccp("{filename}\n".format(**finf), path_colour)

            ccp(" Sha1sum: ", "yellow")
            if is_local:
                fmaf = get_mad_file(app, finf['filename'])
                if fmaf['sha1sum'] == finf['sha1sum']:
                    sha_colour = "green"
                else:
                    sha_colour = "red"
            else:
                sha_colour = "grey"
            ccp("{sha1sum}\n".format(**finf), sha_colour)
def ta_tree(app, args):
    """Build the transaction graph around a sha1sum and write it as GraphML.

    ``args.object`` is used as a literal sha1sum when it is 40 characters
    long and does not name an existing path; otherwise it is treated as a
    file and the checksum is read from its mad record.

    Nodes are sha1sums, annotated with the values found for them in the
    transient database.  Starting from the requested checksum, every linked
    transaction contributes an edge from each of its io records to each of
    its output records; the edge ``type`` is the source record's category
    (``'sibling'`` when the source is itself an output) and ``count`` is
    incremented every time the edge is visited.  The walk continues from
    both ends of every edge.  The result is written to ``test.graphml`` in
    the current working directory.
    """
    import networkx as nx

    G = nx.DiGraph()
    db_t, db_s2t = get_mongo_transact_db(app)
    trans_db = get_mongo_transient_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        sha1sum = get_mad_file(app, args.object)['sha1sum']

    node_fields = ['project', 'filename', 'filesize', 'analyst', 'pi',
                   'username', 'fullpath']

    def _get_trarec(sha1sum):
        # Collect the distinct values of each field over all transient
        # records of this checksum, joined with ';'.
        rv = defaultdict(set)
        for rec in trans_db.find(dict(sha1sum=sha1sum)):
            for field in node_fields:
                if field in rec:
                    rv[field].add(rec[field])
        # sorted so the attribute string does not depend on set ordering
        return {k: ';'.join(sorted(map(str, v))) for (k, v) in rv.items()}

    def _add_node(G, sha1sum):
        if sha1sum in G:
            return
        # Attributes are passed to add_node() directly; the `G.node`
        # mapping used previously no longer exists in NetworkX >= 2.4.
        G.add_node(sha1sum, **_get_trarec(sha1sum))

    _add_node(G, sha1sum)
    sha1sum_processed = set()

    def _find_relations_shasum(G, sha1sum):
        if sha1sum in sha1sum_processed:
            return
        sha1sum_processed.add(sha1sum)

        for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
            tra = db_t.find_one(dict(_id=s2t['transaction_id']))
            if tra is None:
                # dangling link to a transaction that is not stored
                continue

            io = tra['io']
            ioo = [x for x in io if x['category'] == 'output']
            if not ioo:
                continue

            for fa, fb in itertools.product(io, ioo):
                if fa == fb:
                    continue
                fas, fbs = fa['sha1sum'], fb['sha1sum']
                _add_node(G, fas)
                _add_node(G, fbs)

                ltype = fa['category']
                if ltype == 'output':
                    ltype = 'sibling'

                G.add_edge(fas, fbs)
                edge = G[fas][fbs]
                edge['count'] = edge.get('count', 0) + 1
                edge['type'] = ltype

                _find_relations_shasum(G, fas)
                _find_relations_shasum(G, fbs)

    _find_relations_shasum(G, sha1sum)
    nx.write_graphml(G, 'test.graphml')
def ta_add(app, args):
    """Record a new transaction

    All files are put in a group, by default, this group has the same
    name as the category, but when more group names are required, they
    can be specified using a colon (e.g. fq_input:filename.fq)

    The transaction records a random salt, ``uname -a`` output, host name,
    a timestamp (``args.time`` if given, else the current UTC time; both
    truncated to whole seconds), an optional command line (``args.cl``, or
    the contents of ``args.script`` when that file exists) and one io
    record per file of the categories input, output, db, executable and
    misc, processed in that order.

    Keywords flagged ``propagate`` in ``app.conf['keywords']`` are copied
    from the input files to every output file, unless two inputs disagree
    on the value.  The transaction id is the sha1 over all hashed items
    (salt, uname, host, time, command line, file checksums) in sorted
    order.

    Exits with status -1 (via ``SystemExit``) when a listed file does not
    exist.
    """
    salt = str(uuid4())
    uname = sp.getoutput('uname -a')
    host = socket.gethostname()

    items_to_hash = [salt, uname, host]
    transact = dict(io=[], salt=salt, host=host, uname=uname)

    if args.time:
        import dateutil.parser
        time = dateutil.parser.parse(args.time)
    else:
        time = datetime.utcnow()
    time = time.replace(microsecond=0)
    items_to_hash.append(time.isoformat())
    transact['time'] = time

    cl = None
    if args.cl:
        cl = args.cl.strip()
    elif args.script is not None and os.path.exists(args.script):
        with open(args.script) as F:
            cl = F.read()
    if cl is not None:
        transact['cl'] = cl
        items_to_hash.append(cl)

    db_t, db_s2t = get_mongo_transact_db(app)

    all_file_shasums = []
    to_propagate = {}
    do_not_propagate = set()

    # 'input' comes first so that every propagatable keyword is known by
    # the time the 'output' files are handled.
    for cat in 'input output db executable misc'.split():
        filenames = getattr(args, cat)
        if filenames is None:
            continue

        for filename in filenames:
            group = cat
            if ':' in filename:
                group, filename = filename.split(':', 1)

            if cat == 'executable':
                filename = exec_expander(filename)

            if not os.path.isfile(filename):
                lg.error("all files of transaction must exist")
                lg.error("cannot find %s", filename)
                # same effect as exit(-1), without relying on the
                # site-provided `exit` builtin
                raise SystemExit(-1)

            madfile = get_mad_file(app, filename)

            if cat == 'input':
                # find propagateable properties
                for k, v in madfile.mad.items():
                    if not app.conf['keywords'][k].get('propagate', False):
                        continue
                    if k in do_not_propagate:
                        continue
                    if k not in to_propagate:
                        to_propagate[k] = v
                    elif to_propagate[k] != v:
                        # if different values for var k in input
                        # do not propagate - exclude from further
                        # consideration
                        do_not_propagate.add(k)
                        del to_propagate[k]

            if cat == 'output' and to_propagate:
                for k, v in to_propagate.items():
                    if k not in madfile.mad:
                        lg.warning("propagating %s='%s' for %s",
                                   k, v, filename)
                        madfile.mad[k] = v
                # NOTE(review): this also overwrites keywords the output
                # file already has, without a log message.
                madfile.mad.update(to_propagate)
                madfile.save()

            file_sha1sum = madfile.mad['sha1sum']
            items_to_hash.append(file_sha1sum)
            all_file_shasums.append(file_sha1sum)
            transact['io'].append(
                dict(filename=filename, category=cat, group=group,
                     sha1sum=file_sha1sum))

    thash = sha1()
    for item in sorted(items_to_hash):
        thash.update(item.encode('UTF-8'))
    thash = thash.hexdigest()
    lg.debug("transaction hash: %s", thash)
    transact['_id'] = thash

    # store transaction
    db_t.insert_one(transact)

    # store sha1sum to transaction links; insert_many() rejects an empty
    # list, which happens when the transaction has no files at all
    links = [dict(transaction_id=thash, sha1sum=x)
             for x in sorted(set(all_file_shasums))]
    if links:
        db_s2t.insert_many(links)
def prov(app, args):
    """
    Show provenance data

    For each mad file returned by ``get_all_mad_files(app, args)`` that has
    a non-empty ``provenance`` mapping, the entry with the greatest key is
    shown.  With ``args.raw`` set, the key and ``record.pretty()`` are
    printed and the function returns.  Otherwise a coloured report is
    printed: date, tool, version, command line and, for every file in
    ``derived_from``, its host, path and sha1sum.

    Colours of the related-file fields: green/red for files recorded on
    this host (host always green; path green when it exists; sha1sum green
    when it equals the file's current mad checksum), red host and grey
    path/sha1sum for files recorded on another host.
    """

    def ccp(*cargs, **kwargs):
        # cprint that stays on the same line unless `end` is given
        kwargs.setdefault('end', '')
        cprint(*cargs, **kwargs)

    def cckv(key, val):
        # "key: value" line with a yellow key
        cprint(key, "yellow", end=": ")
        cprint(val)

    this_host = socket.gethostname()

    for madfile in get_all_mad_files(app, args):
        if 'provenance' not in madfile:
            # nothing to show - continue
            continue

        prov_data = madfile['provenance']
        prov_keys = sorted(prov_data.keys())
        if not prov_keys:
            # a provenance mapping without entries has nothing to show
            continue
        latest_key = prov_keys[-1]
        record = prov_data[latest_key]

        if args.raw:
            print("provenance_key: {}".format(latest_key))
            print(record.pretty())
            # NOTE(review): raw mode stops after the first file that has
            # provenance; kept as-is, confirm whether that is intended.
            return

        # pretty output
        cckv("Date", record['stopped_at_time'])
        cckv("Tool", record['tool_name'])

        version = record['tool_version']
        if len(version) > 50:
            ccp('Version: ', "yellow")
            wrapped = textwrap.wrap(version, initial_indent=" ",
                                    subsequent_indent=" ")
            for i, line in enumerate(wrapped):
                if i == 0:
                    # first line continues right after the "Version: " label
                    line = line.strip()
                print(line)
        else:
            cckv("Version", version)

        ccp("Command line:", "yellow", end="\n")
        print(" \\\n".join(
            textwrap.wrap(record['kea_command_line'],
                          initial_indent=' ', subsequent_indent=' ')))

        cprint("Related files:", "yellow")
        for filename in record['derived_from']:
            finf = record['derived_from'][filename]
            is_local = finf['host'] == this_host

            ccp(" " + finf['category'], 'magenta')
            ccp("/" + filename, "blue")
            ccp("\n")

            ccp(" Host: ", "yellow")
            ccp("{host}\n".format(**finf), "green" if is_local else "red")

            ccp(" Path: ", "yellow")
            if not is_local:
                path_colour = "grey"
            elif os.path.exists(finf['filename']):
                path_colour = "green"
            else:
                path_colour = "red"
            ccp("{filename}\n".format(**finf), path_colour)

            ccp(" Sha1sum: ", "yellow")
            if is_local:
                fmaf = get_mad_file(app, finf['filename'])
                if fmaf['sha1sum'] == finf['sha1sum']:
                    sha_colour = "green"
                else:
                    sha_colour = "red"
            else:
                sha_colour = "grey"
            ccp("{sha1sum}\n".format(**finf), sha_colour)
def get_madfile(self, filename):
    """Return the mad file for *filename*.

    Delegates to ``mad2util.get_mad_file`` using this object's ``madapp``.
    """
    lookup = mad2util.get_mad_file
    madapp = self.madapp
    return lookup(madapp, filename)
def ta_tree(app, args):
    """Build the transaction graph around a sha1sum and write it as GraphML.

    ``args.object`` is used as a literal sha1sum when it is 40 characters
    long and does not name an existing path; otherwise it is treated as a
    file and the checksum is read from its mad record.

    Nodes are sha1sums, annotated with the values found for them in the
    transient database.  Starting from the requested checksum, every linked
    transaction contributes an edge from each of its io records to each of
    its output records; the edge ``type`` is the source record's category
    (``'sibling'`` when the source is itself an output) and ``count`` is
    incremented every time the edge is visited.  The walk continues from
    both ends of every edge.  The result is written to ``test.graphml`` in
    the current working directory.
    """
    import networkx as nx

    G = nx.DiGraph()
    db_t, db_s2t = get_mongo_transact_db(app)
    trans_db = get_mongo_transient_db(app)

    if len(args.object) == 40 and not os.path.exists(args.object):
        sha1sum = args.object
    else:
        sha1sum = get_mad_file(app, args.object)['sha1sum']

    node_fields = ['project', 'filename', 'filesize', 'analyst', 'pi',
                   'username', 'fullpath']

    def _get_trarec(sha1sum):
        # Collect the distinct values of each field over all transient
        # records of this checksum, joined with ';'.
        rv = defaultdict(set)
        for rec in trans_db.find(dict(sha1sum=sha1sum)):
            for field in node_fields:
                if field in rec:
                    rv[field].add(rec[field])
        # sorted so the attribute string does not depend on set ordering
        return {k: ';'.join(sorted(map(str, v))) for (k, v) in rv.items()}

    def _add_node(G, sha1sum):
        if sha1sum in G:
            return
        # Attributes are passed to add_node() directly; the `G.node`
        # mapping used previously no longer exists in NetworkX >= 2.4.
        G.add_node(sha1sum, **_get_trarec(sha1sum))

    _add_node(G, sha1sum)
    sha1sum_processed = set()

    def _find_relations_shasum(G, sha1sum):
        if sha1sum in sha1sum_processed:
            return
        sha1sum_processed.add(sha1sum)

        for s2t in db_s2t.find(dict(sha1sum=sha1sum)):
            tra = db_t.find_one(dict(_id=s2t['transaction_id']))
            if tra is None:
                # dangling link to a transaction that is not stored
                continue

            io = tra['io']
            ioo = [x for x in io if x['category'] == 'output']
            if not ioo:
                continue

            for fa, fb in itertools.product(io, ioo):
                if fa == fb:
                    continue
                fas, fbs = fa['sha1sum'], fb['sha1sum']
                _add_node(G, fas)
                _add_node(G, fbs)

                ltype = fa['category']
                if ltype == 'output':
                    ltype = 'sibling'

                G.add_edge(fas, fbs)
                edge = G[fas][fbs]
                edge['count'] = edge.get('count', 0) + 1
                edge['type'] = ltype

                _find_relations_shasum(G, fas)
                _find_relations_shasum(G, fbs)

    _find_relations_shasum(G, sha1sum)
    nx.write_graphml(G, 'test.graphml')
def ta_add(app, args):
    """Record a new transaction

    All files are put in a group, by default, this group has the same
    name as the category, but when more group names are required, they
    can be specified using a colon (e.g. fq_input:filename.fq)

    The transaction records a random salt, ``uname -a`` output, host name,
    a timestamp (``args.time`` if given, else the current UTC time; both
    truncated to whole seconds), an optional command line (``args.cl``, or
    the contents of ``args.script`` when that file exists) and one io
    record per file of the categories input, output, db, executable and
    misc, processed in that order.

    Keywords flagged ``propagate`` in ``app.conf['keywords']`` are copied
    from the input files to every output file, unless two inputs disagree
    on the value.  The transaction id is the sha1 over all hashed items
    (salt, uname, host, time, command line, file checksums) in sorted
    order.

    Exits with status -1 (via ``SystemExit``) when a listed file does not
    exist.
    """
    salt = str(uuid4())
    uname = sp.getoutput('uname -a')
    host = socket.gethostname()

    items_to_hash = [salt, uname, host]
    transact = dict(io=[], salt=salt, host=host, uname=uname)

    if args.time:
        import dateutil.parser
        time = dateutil.parser.parse(args.time)
    else:
        time = datetime.utcnow()
    time = time.replace(microsecond=0)
    items_to_hash.append(time.isoformat())
    transact['time'] = time

    cl = None
    if args.cl:
        cl = args.cl.strip()
    elif args.script is not None and os.path.exists(args.script):
        with open(args.script) as F:
            cl = F.read()
    if cl is not None:
        transact['cl'] = cl
        items_to_hash.append(cl)

    db_t, db_s2t = get_mongo_transact_db(app)

    all_file_shasums = []
    to_propagate = {}
    do_not_propagate = set()

    # 'input' comes first so that every propagatable keyword is known by
    # the time the 'output' files are handled.
    for cat in 'input output db executable misc'.split():
        filenames = getattr(args, cat)
        if filenames is None:
            continue

        for filename in filenames:
            group = cat
            if ':' in filename:
                group, filename = filename.split(':', 1)

            if cat == 'executable':
                filename = exec_expander(filename)

            if not os.path.isfile(filename):
                lg.error("all files of transaction must exist")
                lg.error("cannot find %s", filename)
                # same effect as exit(-1), without relying on the
                # site-provided `exit` builtin
                raise SystemExit(-1)

            madfile = get_mad_file(app, filename)

            if cat == 'input':
                # find propagateable properties
                for k, v in madfile.mad.items():
                    if not app.conf['keywords'][k].get('propagate', False):
                        continue
                    if k in do_not_propagate:
                        continue
                    if k not in to_propagate:
                        to_propagate[k] = v
                    elif to_propagate[k] != v:
                        # if different values for var k in input
                        # do not propagate - exclude from further
                        # consideration
                        do_not_propagate.add(k)
                        del to_propagate[k]

            if cat == 'output' and to_propagate:
                for k, v in to_propagate.items():
                    if k not in madfile.mad:
                        lg.warning("propagating %s='%s' for %s",
                                   k, v, filename)
                        madfile.mad[k] = v
                # NOTE(review): this also overwrites keywords the output
                # file already has, without a log message.
                madfile.mad.update(to_propagate)
                madfile.save()

            file_sha1sum = madfile.mad['sha1sum']
            items_to_hash.append(file_sha1sum)
            all_file_shasums.append(file_sha1sum)
            transact['io'].append(
                dict(filename=filename, category=cat, group=group,
                     sha1sum=file_sha1sum))

    thash = sha1()
    for item in sorted(items_to_hash):
        thash.update(item.encode('UTF-8'))
    thash = thash.hexdigest()
    lg.debug("transaction hash: %s", thash)
    transact['_id'] = thash

    # store transaction
    db_t.insert_one(transact)

    # store sha1sum to transaction links; insert_many() rejects an empty
    # list, which happens when the transaction has no files at all
    links = [dict(transaction_id=thash, sha1sum=x)
             for x in sorted(set(all_file_shasums))]
    if links:
        db_s2t.insert_many(links)