def fmtInfo(self) -> MultiDict:
    """ Return the parsed 'logset:logFormatInfo' values, building them on first use.

    Each value obtained from ``self.get_values('logset:logFormatInfo')`` is
    split at its first '=' into a key and a value (the value is '' when no
    '=' is present) and stored in a MultiDict that is cached on
    ``self._fmtInfo`` for subsequent calls.
    """
    if self._fmtInfo is not None:
        return self._fmtInfo

    parsed = MultiDict()
    # publish the cache before filling it, as the lookups below may fail
    self._fmtInfo = parsed
    for entry in self.get_values('logset:logFormatInfo'):
        name, _, value = entry.partition('=')
        parsed.add(name, value)
    return self._fmtInfo
def instance(cls, uri):
    """ given the uri of a node, instantiate an object of this class based on
        the properties of that node in the graph

        uri may be a URIRef or anything graph.geturi() can convert to one.
        Predicates that correspond to a key of cls.getters are stored under
        that key; any other predicate is stored under its own uri.

        NOTE(review): the loop below reads `self.graph`, but the only
        receiver in this signature is `cls`, so `self` looks like an
        undefined name here (NameError when called). The intended graph
        accessor is not visible from this block - confirm and fix.
    """
    # normalise the argument to a URIRef
    if not isinstance(uri, URIRef):
        uri = graph.geturi(uri)
    mdict = MultiDict(uri=[uri])
    preds = [k for k in cls.getters.keys()]  # impose an order
    # reverse lookup: full predicate uri -> the getter key it was derived from
    preds_by_uri = {graph.geturi(p): p for p in preds}
    for pred_uri, obj in self.graph.predicate_objects(uri):
        if pred_uri in preds_by_uri:
            mdict.add(preds_by_uri[pred_uri], obj)
        else:
            # its a property we're not prepared for, just include it as an Identifier
            mdict.add(pred_uri, obj)
    return cls(properties=mdict)
def __init__(self, properties: MultiDict = None, **kwargs) -> None:
    """ Set up the property store for this entity.

    ``properties`` is copied into a new MultiDict and every keyword argument
    is added to it as an extra key/value. The reserved keys 'uri' and
    'namespace' are then moved out of the property store into ``self._uri``
    and ``self._namespace``. Finally every predicate in ``self.getters``
    is given an entry, even if it has no values.
    """
    # lazy lookup of these is common enough to live in the base class
    self._uri: Optional[str] = None
    self._namespace: Optional[str] = None
    self._label: Optional[str] = None

    # maps a predicate to its values, eg "dct:title": set(Literal("my title"))
    self.properties = MultiDict(properties)
    for name, value in kwargs.items():
        self.properties.add(name, value)

    # 'uri' first, then 'namespace': each becomes a private attribute and
    # is dropped from the property store
    for reserved in ('uri', 'namespace'):
        if reserved in self.properties:
            setattr(self, '_' + reserved, self.properties.one(reserved))
            self.properties.remove(reserved)

    # every attribute we have a getter for gets an entry, possibly empty
    for predicate in self.getters:
        if predicate in self.properties:
            continue
        self.properties.add(predicate)

    # lets recursion through properties skip nodes that were already
    # added to the graph
    self._in_graph = False
def __init__(self, mods : dict, ties : set, name : str = ""):
    """ Build the operations and values described by ``mods`` and ``ties``.

    ``mods`` maps a module name to its opcode; one Operation is created per
    entry. ``ties`` holds (src_name, dst_name, dst_port) triples; each is
    recorded in a MultiDict keyed by the source Operation, and one Value is
    created per key of that MultiDict. The results are stored as frozensets
    in ``self._operations`` and ``self._values``.
    """
    super().__init__(name)

    # one Operation per named module
    ops_by_name = {
        mod_name: Operation(mod_name, opcode)
        for mod_name, opcode in mods.items()
    }

    # src -> (dst, dst_port)
    fanout = MultiDict()
    for src_name, dst_name, dst_port in ties:
        destination = (ops_by_name[dst_name], dst_port)
        fanout[ops_by_name[src_name]] = destination

    # the actual value objects, one per source operation
    values = frozenset(Value(src, fanout[src]) for src in fanout)

    self._operations = frozenset(ops_by_name.values())
    self._values = values
def factory(node: str):
    """ Instantiate the class registered in ``classes`` under ``node``.

    The instance is given a single property, 'uri', holding the class's
    ``rdf_node``.
    """
    node_cls = classes[node]
    return node_cls(properties=MultiDict(uri=[node_cls.rdf_node]))
def known(cls, filters: Dict[str, str] = None):
    """ Generator-constructor over the known nodes of this type.

    Builds one SPARQL SELECT covering every predicate in ``cls.getters``,
    ordered by node uri, and folds consecutive result rows that share a uri
    into a single MultiDict, yielding ``cls(properties=...)`` once per uri.

    Nodes are selected by ``cls.rdf_class`` when ``cls.rdf_superclass`` is
    None, otherwise by any type that is an rdfs:subClassOf* of
    ``cls.rdf_superclass``.

    Args:
        filters: optional mapping of predicate -> required value. A predicate
            listed here becomes a mandatory triple pattern; every other
            predicate is wrapped in OPTIONAL, since no property is enforced
            as required. Identifier values are written as ``<value>``;
            anything else (eg a string like 'ddict:someThing') is inserted
            into the query as-is.

    Yields:
        One instance of cls per distinct ?uri in the query result.
    """
    # None rather than a shared dict() default
    if filters is None:
        filters = {}

    def _as_term(value):
        # uris/rdflib identifiers need < > around them inside the query
        # text; other values stay exactly as they were given
        if isinstance(value, Identifier):
            return "<{0}>".format(str(value))
        return value

    logging.debug("filters has: %s", filters)

    preds = list(cls.getters)  # impose an order
    qvars = ['?v{:d}'.format(i) for i in range(len(preds))]

    clauses = []
    if cls.rdf_superclass is None:
        clauses.append("?uri a {0} . ".format(cls.rdf_class))
    else:
        clauses.append("?uri a ?type . ")
        clauses.append(
            "?type rdfs:subClassOf* {0} . ".format(cls.rdf_superclass))
    for pred, var in zip(preds, qvars):
        if pred in filters:
            # NOTE(review): the ?vN variable of a filtered predicate is
            # selected but never used in the WHERE clause, so presumably it
            # comes back unbound and the filter value is not recorded in
            # the yielded properties - confirm this is intended.
            clauses.append(
                "?uri {0} {1} . ".format(pred, _as_term(filters[pred])))
        else:
            clauses.append("OPTIONAL {{ ?uri {0} {1} . }} ".format(pred, var))

    query = ("SELECT ?uri {0} WHERE {{ ".format(' '.join(qvars))
             + ''.join(clauses)
             + "} ORDER BY ?uri ")
    logging.debug("query is: %s", query)

    curr_uri = None
    mdict = None
    for row in graph.query(query):
        logging.debug("found %s", row)
        next_uri = row[0]  # the uri
        if next_uri != curr_uri:
            # rows are ordered by uri, so a change of uri means the previous
            # node is complete and can be handed out
            if curr_uri is not None:
                logging.debug("making a %s with props %s",
                              cls.__name__, mdict)
                yield cls(properties=mdict)
            mdict = MultiDict(uri=[next_uri])
            curr_uri = next_uri
        # add each var to mdict
        for key, val in zip(preds, row[1:]):
            mdict.add(key, val)

    if mdict is not None:
        logging.debug("making a %s with props %s", cls.__name__, mdict)
        yield cls(properties=mdict)  # the last one
    else:
        logging.debug("no nodes of type %s found", cls.__name__)
def catalog(self, candidates: Set[FileInfo], context: Context) -> Set[FileInfo]:
    """ Claim the candidate files matching this entity and add each to the graph.

    A candidate is selected when its filename matches (``regex.match``) any
    of the compiled patterns in ``context['filepatterns']``; the selection
    is then narrowed using ``self.right_mime_type``. For every remaining
    file a ConcreteLog is built and added to the graph, with 'handler_args'
    pushed onto ``context`` for the duration.

    Args:
        candidates: the files on offer.
        context: read for 'filepatterns', 'logFormatType', 'formatinfo',
            'logseries_uri', 'dcat:accessURL', 'subjects', 'namespace' and
            'logset'.

    Returns:
        The candidates that were not selected here (``candidates - matching``).
    """
    filepatterns = context['filepatterns']
    matching = set()
    for regex in filepatterns:
        logging.info("filepattern: %s: %s", regex, regex.pattern)
        logging.info("%d candidates", len(candidates))
        logging.info("candidates: %s", candidates)
        # materialise the matches once: a bare filter() object would only
        # log its repr, and the same scan is needed for the union below
        matched = {f for f in candidates if regex.match(f.filename)}
        logging.info("after filtering: %s", matched)
        matching |= matched

    # filter by MIMEtype:
    # NOTE(review): mediatypes is looked up but not used in this method;
    # the lookup is kept in case it matters - confirm and remove.
    mediatypes = self.properties['dcat:mediaType']
    # NOTE(review): this removes the files for which right_mime_type() is
    # truthy, which reads as inverted for a "right mime type" test. The
    # original behaviour is preserved; confirm against right_mime_type().
    matching -= {f for f in matching if self.right_mime_type(f)}

    # args for the LogFormatType that will handle the actual file/source:
    handler_args = {
        'rdf_class': context['logFormatType'],
        'fmtinfo': context['formatinfo'],
    }
    context.push(handler_args=handler_args)

    # everything the ConcreteLog infers can just be passed as properties:
    common_properties = MultiDict()
    common_properties.add('logset:isInstanceOf', context['logseries_uri'])
    common_properties.add('dcat:accessURL', context['dcat:accessURL'])
    common_properties.add('logset:subject', *context['subjects'])
    common_properties.add('namespace', context['namespace'])
    logging.info("logset for concretelog has: %s", context['logset'])

    # in most cases we are looking file-by-file .. FilePerTimepoint
    # can override this default behavior
    for f in matching:
        properties = MultiDict(common_properties)
        properties.add('rdfs:label', Literal(f.filename))
        relpath = (f.relpath + os.sep + f.filename).lstrip(os.sep)
        properties.add('dcat:downloadURL', Literal(relpath))
        logging.info("properties for concretelog has: %s", properties)
        # handler_args is the same dict that was pushed onto the context
        handler_args['target_url'] = os.sep.join(f)  # full local path
        log = ConcreteLog(properties=properties)
        logging.info("adding log to graph: %s", log)
        try:
            log.add_to_graph(context)
        except UnsupportedLogFormatHandler as err:
            # best effort: an unsupported format skips this file only
            logging.warning("logformat %s not implemented", err)
        properties.remove('rdfs:label')
        properties.remove('dcat:downloadURL')

    context.pop(('handler_args', ))
    return candidates - matching
def __init__(self, properties: MultiDict = None) -> None:
    """ Initialise the entity and its lazily-computed caches.

    Args:
        properties: initial properties, handed to the parent initialiser.
            When omitted, a fresh empty MultiDict is created for this call
            (a ``MultiDict()`` default in the signature would be built once
            at definition time and shared by every call).
    """
    if properties is None:
        properties = MultiDict()
    super().__init__(properties)
    # caches, filled in on first access
    self._fmtInfo = None
    self._filePatterns = None
    self._logFormatType = None