def safe_encode(x, as_literal=True):
    """Encode a Python value for inclusion in a PROV document.

    Parameters
    ----------
    x : object
        Value to encode. ``None``, strings, ints and floats are handled
        specially; any other value is pickled.
    as_literal : bool
        If True (default), wrap the encoded value in a ``prov.Literal``
        with an appropriate XSD datatype; otherwise return the plain
        encoded value.

    Returns
    -------
    ``prov.Literal``/``prov.URIRef`` when ``as_literal`` is True,
    otherwise a plain string/int/float/pickle-bytes value.
    """
    if x is None:
        value = "Unknown"
        if as_literal:
            return prov.Literal(value, prov.XSD['string'])
        return value
    try:
        if isinstance(x, (str, unicode)):
            if os.path.exists(x):
                # Existing filesystem paths become file:// URIs on this host.
                value = 'file://%s%s' % (getfqdn(), x)
                if not as_literal:
                    return value
                try:
                    return prov.URIRef(value)
                except AttributeError:
                    return prov.Literal(value, prov.XSD['anyURI'])
            else:
                if len(x) > max_text_len:
                    # BUG FIX: concatenate the clip marker as a string.
                    # The original appended a *list* (``+ ['...Clipped...']``),
                    # which raised TypeError and sent every over-long string
                    # into the "Could not encode" fallback below.
                    # 13 == len('...Clipped...').
                    value = x[:max_text_len - 13] + '...Clipped...'
                else:
                    value = x
                if not as_literal:
                    return value
                return prov.Literal(value, prov.XSD['string'])
        if isinstance(x, int):
            if not as_literal:
                return x
            return prov.Literal(int(x), prov.XSD['integer'])
        if isinstance(x, float):
            if not as_literal:
                return x
            return prov.Literal(x, prov.XSD['float'])
        # Fallback: pickle anything else and tag it with the nidm pickle type.
        if not as_literal:
            return dumps(x)
        return prov.Literal(dumps(x), nidm['pickle'])
    except TypeError as e:
        # Unpicklable (or otherwise unencodable) values degrade to a
        # diagnostic string rather than aborting provenance capture.
        value = "Could not encode: " + str(e)
        if not as_literal:
            return value
        return prov.Literal(value, prov.XSD['string'])
def create_entity(graph, fs_subject_id, filepath, hostname):
    """Create a PROV entity for a file in a FreeSurfer directory.

    FreeSurfer terms are identified from the path components below the
    subject directory and from the dotted parts of the file name, and
    attached as ``nidm:tag`` attributes together with MD5/SHA512 hashes
    and a file:// location.

    Parameters
    ----------
    graph : prov document/bundle to add the entity to.
    fs_subject_id : str
        Subject id; the portion of ``filepath`` after it is the
        subject-relative path.
    filepath : str
        Absolute path of the file to describe.
    hostname : str
        Host used to build the file:// URL.

    Returns
    -------
    The newly created prov entity.
    """
    # identify FreeSurfer terms based on directory and file names
    _, filename = os.path.split(filepath)
    relpath = filepath.split(fs_subject_id)[1].lstrip(os.path.sep)
    fstypes = relpath.split('/')[:-1]
    additional_types = relpath.split('/')[-1].split('.')

    file_md5_hash = hash_infile(filepath, crypto=hashlib.md5)
    file_sha512_hash = hash_infile(filepath, crypto=hashlib.sha512)
    if file_md5_hash is None:
        print('Empty file: %s' % filepath)

    url = "file://%s%s" % (hostname, filepath)
    obj_attr = [(prov.PROV["label"], filename),
                (fs["relative_path"], "%s" % relpath),
                (prov.PROV["location"], prov.URIRef(url)),
                (crypto["md5"], "%s" % file_md5_hash),
                (crypto["sha512"], "%s" % file_sha512_hash)]

    for key in fstypes:
        obj_attr.append((nidm["tag"], key))
    for key in additional_types:
        obj_attr.append((nidm["tag"], key))
    for key, uris in fs_file_map:
        if key not in filename:
            continue
        # Hoisted: strip leading/trailing dots once instead of twice
        # (``rstrip('.').lstrip('.')`` is equivalent to ``strip('.')``).
        tag = key.strip('.')
        if tag not in fstypes + additional_types:
            obj_attr.append((nidm["tag"], tag))
        for uri in uris:
            if isinstance(uri, tuple):
                obj_attr.append((uri[0], uri[1]))
            else:
                obj_attr.append((prov.PROV["type"], uri))
    # Renamed from ``id`` to avoid shadowing the builtin.
    entity_uuid = uuid.uuid1().hex
    return graph.entity(niiri[entity_uuid], obj_attr)
def safe_encode(x, as_literal=True):
    """Encode a Python value for inclusion in a PROV document.

    Parameters
    ----------
    x : object
        Value to encode. ``None``, str/bytes, int, float, dict, list and
        tuple are handled specially; other objects are JSON-encoded via
        ``__dict__`` when possible, otherwise pickled.
    as_literal : bool
        If True (default), wrap the result in ``pm.Literal`` with an
        appropriate XSD (or pickle) datatype; otherwise return the plain
        encoded value.
    """
    if x is None:
        value = "Unknown"
        if as_literal:
            return pm.Literal(value, pm.XSD['string'])
        else:
            return value

    if isinstance(x, (str, bytes)):
        if isinstance(x, bytes):
            x = str(x, 'utf-8')
        if os.path.exists(x):
            # BUG FIX: the original guard was ``if x[0] != os.pathsep`` —
            # os.pathsep is the PATH-*list* separator (':'), not a path
            # prefix, so the test was meaningless (almost always true).
            # abspath is idempotent/normalizing, so apply it directly.
            x = os.path.abspath(x)
            value = 'file://{}{}'.format(platform.node().lower(), x)
            if not as_literal:
                return value
            try:
                return pm.URIRef(value)
            except AttributeError:
                return pm.Literal(value, pm.XSD['anyURI'])
        else:
            value = x
            if len(x) > max_text_len:
                # Clip over-long strings, keeping room for the marker.
                cliptxt = '...Clipped...'
                value = x[:max_text_len - len(cliptxt)] + cliptxt

            if not as_literal:
                return value
            return pm.Literal(value, pm.XSD['string'])

    if isinstance(x, int):
        if not as_literal:
            return x
        return pm.Literal(int(x), pm.XSD['integer'])

    if isinstance(x, float):
        if not as_literal:
            return x
        return pm.Literal(x, pm.XSD['float'])

    if isinstance(x, dict):
        # Encode values recursively, then serialize the whole dict to JSON.
        outdict = {}
        for key, value in list(x.items()):
            encoded_value = safe_encode(value, as_literal=False)
            if isinstance(encoded_value, pm.Literal):
                outdict[key] = encoded_value.json_representation()
            else:
                outdict[key] = encoded_value

        try:
            jsonstr = json.dumps(outdict)
        except UnicodeDecodeError as excp:
            # Reconstructed: the error string was split by a stray line
            # break in the original source.
            jsonstr = "Could not encode dictionary. {}".format(excp)
            # ``warning`` instead of the deprecated ``warn`` alias.
            iflogger.warning('Prov: %s', jsonstr)

        if not as_literal:
            return jsonstr
        return pm.Literal(jsonstr, pm.XSD['string'])

    if isinstance(x, (list, tuple)):
        x = list(x)
        is_object = False
        try:
            nptype = np.array(x).dtype
            is_object = nptype == np.dtype(object)
        except ValueError:
            is_object = True

        # If the array contains an heterogeneous mixture of data types
        # they should be encoded sequentially
        if is_object:
            outlist = []
            for value in x:
                encoded_value = safe_encode(value, as_literal=False)
                if isinstance(encoded_value, pm.Literal):
                    outlist.append(encoded_value.json_representation())
                else:
                    outlist.append(encoded_value)
            x = outlist

        try:
            jsonstr = json.dumps(x)
        except UnicodeDecodeError as excp:
            jsonstr = "Could not encode list/tuple. {}".format(excp)
            iflogger.warning('Prov: %s', jsonstr)

        if not as_literal:
            return jsonstr
        return pm.Literal(jsonstr, pm.XSD['string'])

    # If is a literal, and as_literal do nothing.
    # else bring back to json.
    if isinstance(x, pm.Literal):
        if as_literal:
            return x
        return dumps(x.json_representation())

    # Last resort: JSON of the object's __dict__, else pickle.
    jsonstr = None
    ltype = pm.XSD['string']
    try:
        jsonstr = json.dumps(x.__dict__)
    except AttributeError:
        pass

    if jsonstr is None:
        try:
            jsonstr = dumps(x)
            ltype = nipype_ns['pickle']
        except TypeError as excp:
            jsonstr = 'Could not encode object. {}'.format(excp)

    if not as_literal:
        return jsonstr
    return pm.Literal(jsonstr, ltype)
def add_results(self, results):
    """Record one interface run in the provenance graph ``self.g``.

    Adds an activity for the run plus collections for its environment,
    inputs, outputs and runtime streams, and associates user/software
    agents with the activity. If ``results`` already carries a
    provenance bundle, that bundle is added instead.

    Parameters
    ----------
    results : object
        Must expose ``provenance``, ``runtime``, ``interface``,
        ``inputs`` and ``outputs``.

    Returns
    -------
    The provenance graph ``self.g``.
    """
    if results.provenance:
        try:
            self.g.add_bundle(results.provenance)
        except pm.ProvException:
            # Identifier clash: retry with a freshly generated id.
            self.g.add_bundle(results.provenance, get_id())
        return self.g

    runtime = results.runtime
    interface = results.interface
    inputs = results.inputs
    outputs = results.outputs
    classname = interface.__name__
    modulepath = "{0}.{1}".format(interface.__module__, interface.__name__)
    activitytype = ''.join([i.capitalize() for i in modulepath.split('.')])

    a0_attrs = {
        nipype_ns['module']: interface.__module__,
        nipype_ns["interface"]: classname,
        pm.PROV["type"]: nipype_ns[activitytype],
        pm.PROV["label"]: classname,
        nipype_ns['duration']: safe_encode(runtime.duration),
        nipype_ns['workingDirectory']: safe_encode(runtime.cwd),
        nipype_ns['returnCode']: safe_encode(runtime.returncode),
        nipype_ns['platform']: safe_encode(runtime.platform),
        nipype_ns['version']: safe_encode(runtime.version),
    }
    try:
        a0_attrs[foaf["host"]] = pm.URIRef(runtime.hostname)
    except AttributeError:
        a0_attrs[foaf["host"]] = pm.Literal(runtime.hostname,
                                            pm.XSD['anyURI'])

    try:
        # Only command-line interfaces carry these runtime attributes.
        a0_attrs.update(
            {nipype_ns['command']: safe_encode(runtime.cmdline)})
        a0_attrs.update(
            {nipype_ns['commandPath']: safe_encode(runtime.command_path)})
        a0_attrs.update(
            {nipype_ns['dependencies']: safe_encode(runtime.dependencies)})
    except AttributeError:
        pass
    a0 = self.g.activity(get_id(), runtime.startTime, runtime.endTime,
                         a0_attrs)

    # environment collection (renamed ids avoid shadowing builtin ``id``)
    env_id = get_id()
    env_collection = self.g.collection(env_id)
    env_collection.add_extra_attributes({
        pm.PROV['type']: nipype_ns['Environment'],
        pm.PROV['label']: "Environment"
    })
    self.g.used(a0, env_id)
    # write environment entities (whitelist of relevant variables only)
    for idx, (key, val) in enumerate(sorted(runtime.environ.items())):
        if key not in [
                'PATH', 'FSLDIR', 'FREESURFER_HOME', 'ANTSPATH',
                'CAMINOPATH', 'CLASSPATH', 'LD_LIBRARY_PATH',
                'DYLD_LIBRARY_PATH', 'FIX_VERTEX_AREA',
                'FSF_OUTPUT_FORMAT', 'FSLCONFDIR', 'FSLOUTPUTTYPE',
                'LOGNAME', 'USER', 'MKL_NUM_THREADS', 'OMP_NUM_THREADS'
        ]:
            continue
        in_attr = {
            pm.PROV["label"]: key,
            nipype_ns["environmentVariable"]: key,
            pm.PROV["value"]: safe_encode(val)
        }
        env_entity_id = get_attr_id(in_attr)
        self.g.entity(env_entity_id, in_attr)
        self.g.hadMember(env_collection, env_entity_id)

    # write input entities
    if inputs:
        input_collection = self.g.collection(get_id())
        input_collection.add_extra_attributes({
            pm.PROV['type']: nipype_ns['Inputs'],
            pm.PROV['label']: "Inputs"
        })
        for idx, (key, val) in enumerate(sorted(inputs.items())):
            in_entity = prov_encode(self.g, val).get_identifier()
            self.g.hadMember(input_collection, in_entity)
            used_attr = {pm.PROV["label"]: key, nipype_ns["inPort"]: key}
            self.g.used(activity=a0, entity=in_entity,
                        other_attributes=used_attr)

    # write output entities
    if outputs:
        output_collection = self.g.collection(get_id())
        if not isinstance(outputs, dict):
            outputs = outputs.get_traitsfree()
        output_collection.add_extra_attributes({
            pm.PROV['type']: nipype_ns['Outputs'],
            pm.PROV['label']: "Outputs"
        })
        self.g.wasGeneratedBy(output_collection, a0)
        for idx, (key, val) in enumerate(sorted(outputs.items())):
            out_entity = prov_encode(self.g, val).get_identifier()
            self.g.hadMember(output_collection, out_entity)
            gen_attr = {pm.PROV["label"]: key, nipype_ns["outPort"]: key}
            self.g.generation(out_entity, activity=a0,
                              other_attributes=gen_attr)

    # write runtime entities (stdout/stderr/merged streams)
    runtime_collection = self.g.collection(get_id())
    runtime_collection.add_extra_attributes({
        pm.PROV['type']: nipype_ns['Runtime'],
        pm.PROV['label']: "RuntimeInfo"
    })
    self.g.wasGeneratedBy(runtime_collection, a0)
    for key, value in sorted(runtime.items()):
        if not value:
            continue
        if key not in ['stdout', 'stderr', 'merged']:
            continue
        attr = {pm.PROV["label"]: key, nipype_ns[key]: safe_encode(value)}
        # BUG FIX: the entity must be created with the SAME id that is
        # registered in the collection. Previously a second get_id()
        # call created the entity under a different id, leaving the
        # hadMember reference dangling.
        stream_id = get_id()
        self.g.entity(stream_id, attr)
        self.g.hadMember(runtime_collection, stream_id)

    # create agents
    user_attr = {
        pm.PROV["type"]: pm.PROV["Person"],
        pm.PROV["label"]: getpass.getuser(),
        foaf["name"]: safe_encode(getpass.getuser())
    }
    user_agent = self.g.agent(get_attr_id(user_attr), user_attr)
    agent_attr = {
        pm.PROV["type"]: pm.PROV["SoftwareAgent"],
        pm.PROV["label"]: "Nipype",
        foaf["name"]: safe_encode("Nipype")
    }
    for key, value in get_info().items():
        agent_attr.update({nipype_ns[key]: safe_encode(value)})
    software_agent = self.g.agent(get_attr_id(agent_attr), agent_attr)
    self.g.wasAssociatedWith(
        a0, user_agent, None, None,
        {pm.PROV["hadRole"]: nipype_ns["LoggedInUser"]})
    self.g.wasAssociatedWith(a0, software_agent)
    return self.g
def safe_encode(x, as_literal=True):
    """Encode a Python value for inclusion in a PROV document.

    Parameters
    ----------
    x : object
        Value to encode. ``None``, strings, ints, floats, dicts and
        lists are handled specially; any other value is pickled.
    as_literal : bool
        If True (default), wrap the result in ``pm.Literal`` with an
        appropriate XSD (or pickle) datatype; otherwise return the plain
        encoded value.
    """
    if x is None:
        value = "Unknown"
        if as_literal:
            return pm.Literal(value, pm.XSD['string'])
        else:
            return value
    try:
        if isinstance(x, (str, unicode)):
            if os.path.exists(x):
                # Existing filesystem paths become file:// URIs.
                value = 'file://%s%s' % (getfqdn(), x)
                if not as_literal:
                    return value
                try:
                    return pm.URIRef(value)
                except AttributeError:
                    return pm.Literal(value, pm.XSD['anyURI'])
            else:
                if len(x) > max_text_len:
                    # BUG FIX: concatenate the clip marker as a string;
                    # the original appended a list, raising TypeError for
                    # every over-long string. 13 == len('...Clipped...').
                    value = x[:max_text_len - 13] + '...Clipped...'
                else:
                    value = x
                if not as_literal:
                    return value
                return pm.Literal(value, pm.XSD['string'])
        if isinstance(x, (int, )):
            if not as_literal:
                return x
            return pm.Literal(int(x), pm.XSD['integer'])
        if isinstance(x, (float, )):
            if not as_literal:
                return x
            return pm.Literal(x, pm.XSD['float'])
        if isinstance(x, dict):
            # Encode values recursively, then JSON-serialize the dict.
            outdict = {}
            for key, value in x.items():
                encoded_value = safe_encode(value, as_literal=False)
                if isinstance(encoded_value, (pm.Literal, )):
                    outdict[key] = encoded_value.json_representation()
                else:
                    outdict[key] = encoded_value
            if not as_literal:
                return json.dumps(outdict)
            return pm.Literal(json.dumps(outdict), pm.XSD['string'])
        if isinstance(x, list):
            try:
                nptype = np.array(x).dtype
                if nptype == np.dtype(object):
                    raise ValueError('dtype object')
            # py3-compatible except syntax (was ``except ValueError, e``).
            except ValueError as e:
                # Heterogeneous list: encode each element sequentially.
                outlist = []
                for value in x:
                    encoded_value = safe_encode(value, as_literal=False)
                    if isinstance(encoded_value, (pm.Literal, )):
                        outlist.append(encoded_value.json_representation())
                    else:
                        outlist.append(encoded_value)
            else:
                outlist = x
            if not as_literal:
                return json.dumps(outlist)
            return pm.Literal(json.dumps(outlist), pm.XSD['string'])
        # Fallback: pickle anything else.
        if not as_literal:
            return dumps(x)
        return pm.Literal(dumps(x), nipype_ns['pickle'])
    except TypeError as e:
        # BUG FIX: the outer ``try`` had no handler in the original,
        # which is a syntax error. Complete it with the same TypeError
        # fallback the sibling implementation of this function uses, so
        # unencodable values degrade to a diagnostic string.
        value = "Could not encode: " + str(e)
        if not as_literal:
            return value
        return pm.Literal(value, pm.XSD['string'])
def encode_fs_directory(g, basedir, project_id, subject_id, n_items=100000):
    """ Convert a FreeSurfer directory to a PROV graph

    Walks ``basedir``, creating one PROV entity per regular file (up to
    ``n_items``), all grouped under a SubjectDirectory collection tagged
    with ``project_id``/``subject_id``. Statistic files additionally get
    parsed via ``parse_stats`` and their terms merged into fsterms.ttl.

    NOTE: Python 2 code (``print`` statements, ``except IOError, e``).
    """
    # directory collection/catalog: the container everything hangs off
    collection_hash = uuid.uuid1().hex
    fsdir_collection = g.collection(niiri[collection_hash])
    fsdir_collection.add_extra_attributes({
        prov.PROV['type']: fs['SubjectDirectory'],
        nidm['tag']: project_id,
        fs['subjectID']: subject_id
    })
    # Entity for the directory itself, located by a file:// URL on this host
    directory_id = g.entity(niiri[uuid.uuid1().hex])
    hostname = getfqdn()
    url = "file://%s%s" % (hostname, os.path.abspath(basedir))
    directory_id.add_extra_attributes(
        {prov.PROV['location']: prov.URIRef(url)})
    g.wasDerivedFrom(fsdir_collection, directory_id)
    # Activity representing this encoding run, attributed to the
    # currently logged-in user
    a0 = g.activity(niiri[uuid.uuid1().hex],
                    startTime=dt.isoformat(dt.utcnow()))
    user_agent = g.agent(
        niiri[uuid.uuid1().hex], {
            prov.PROV["type"]: prov.PROV["Person"],
            prov.PROV["label"]: pwd.getpwuid(os.geteuid()).pw_name,
            foaf["name"]: pwd.getpwuid(os.geteuid()).pw_name
        })
    g.wasAssociatedWith(a0, user_agent, None, None,
                        {prov.PROV["Role"]: "LoggedInUser"})
    g.wasGeneratedBy(fsdir_collection, a0)
    # Walk the directory tree and encode each visible regular file,
    # capping the total number of encoded items at n_items
    i = 0
    for dirpath, dirnames, filenames in os.walk(os.path.realpath(basedir)):
        for filename in sorted(filenames):
            if filename.startswith('.'):
                # skip hidden files
                continue
            i += 1
            if i > n_items:
                break
            file2encode = os.path.realpath(os.path.join(dirpath, filename))
            if not os.path.isfile(file2encode):
                print "%s not a file" % file2encode
                continue
            # Skip files whose path matches any ignore_list substring.
            # NOTE(review): ``continue`` here only advances the key loop;
            # ``break`` would short-circuit once a match is found —
            # behavior is identical, just extra iterations.
            ignore_key_found = False
            for key in ignore_list:
                if key in file2encode:
                    ignore_key_found = True
                    continue
            if ignore_key_found:
                continue
            try:
                entity = create_entity(g, subject_id, file2encode, hostname)
                g.hadMember(fsdir_collection, entity.get_identifier())
                # Serialized turtle is used below for plain substring
                # tests instead of the (commented-out) SPARQL query.
                rdf_g = entity.rdf().serialize(format='turtle')
                '''
                query = """
                PREFIX prov: <http://www.w3.org/ns/prov#>
                PREFIX fs: <http://www.incf.org/ns/nidash/fs#>
                PREFIX crypto: <http://www.w3.org/2000/10/swap/crypto#>
                PREFIX nidm: <http://www.incf.org/ns/nidash/nidm#>

                select ?e ?relpath ?path where {?e fs:fileType fs:StatisticFile;
                    fs:relativePath ?relpath;
                    prov:atLocation ?path .
                FILTER NOT EXISTS { ?e nidm:tag "curv" . }
                }
                """
                results = rdf_g.query(query)
                '''
                # Statistic files (except curv-tagged ones) get their
                # measures parsed and merged into fsterms.ttl
                if 'StatisticFile' in rdf_g and 'curv' not in rdf_g:
                    g, measure_graph = parse_stats(g, file2encode, entity)
                    if os.path.exists('fsterms.ttl'):
                        # merge previously written terms before rewriting
                        measure_graph.parse('fsterms.ttl', format='turtle')
                    measure_graph.serialize('fsterms.ttl', format='turtle')
            except IOError, e:
                # best-effort: report unreadable files and keep walking
                print e