Example #1
0
class LuceneQuery(object):
    """A single clause of a Lucene/Solr query string.

    A clause is rendered by ``__unicode__`` as the concatenation of:
    local params (``{!key=value ...}``), a required/prohibited modifier
    (``+`` / ``-`` / nothing), the field, a ``:`` separator, the components
    and an optional ``^boost`` suffix.

    Most mutators return ``self`` so calls can be chained.
    """
    MANDATORY = 'mandatory'
    PROHIBITED = 'prohibited'
    OPTIONAL = 'optional'

    # prefix emitted in front of the clause for each "required" state
    required_modifiers = {MANDATORY: '+',
                          PROHIBITED: '-',
                          OPTIONAL: ''}

    def __init__(self, *args, **kw):
        """Create a clause from positional components.

        If the first positional argument has a ``solr_field_name``
        attribute it is taken as the field and removed from the components;
        every remaining argument (strings, other ``LuceneQuery`` objects,
        ...) becomes a component that is rendered with ``unicode()``.

        Calling with no arguments yields an empty clause instead of
        raising ``IndexError``.
        """
        self.field = None
        # Guard on ``args`` so an empty call does not index into ().
        if args and hasattr(args[0], 'solr_field_name'):
            self.field, args = args[0], args[1:]

        self.use_colon = True
        # Nested LuceneQuery arguments are kept as components as-is; they
        # render themselves when this clause is converted to unicode.
        self.components = list(args)
        self.local_params = MultiDict()
        self.required = LuceneQuery.OPTIONAL
        self.boost_factor = None

    def __pow__(self, power):
        """Set the boost factor (``query ** 2`` renders as ``...^2``)."""
        if not self.components:
            # nothing follows the field, so no ':' separator is emitted
            self.use_colon = False
        self.boost_factor = power
        return self

    def tag(self, tag):
        """Add a ``tag`` local param to the clause."""
        self.local_params.add('tag', tag)
        return self

    def require(self):
        """Mark the clause as mandatory (``+`` prefix)."""
        self.required = LuceneQuery.MANDATORY
        return self

    def negate(self):
        """Mark the clause as prohibited (``-`` prefix)."""
        self.required = LuceneQuery.PROHIBITED
        return self

    def fuzzy(self, factor=NotGiven):
        """Append the ``~`` operator, followed by ``factor`` when given."""
        self.components.append('~')
        if factor != NotGiven:
            self.components.append(factor)
        return self

    def __unicode__(self):
        """Render the clause as a unicode query string."""
        modifier = self.required_modifiers[self.required]
        field_clause = '' if not self.field else unicode(self.field)
        separator = ':' if self.use_colon else ''
        local_params = ''
        if self.local_params:
            rendered = ' '.join(
                '%s=%s' % (key, unicode(val))
                for key, val in self.local_params.iteritems())
            local_params = '{!' + rendered + '}'
        component_clause = ''.join(
            unicode(component) for component in self.components)
        # a falsy boost (None, 0) produces no suffix
        boost_factor = '^%s' % self.boost_factor if self.boost_factor else ''
        return u''.join([local_params, modifier, field_clause, separator,
                         component_clause, boost_factor])
Example #2
0
 def instance(cls, uri):
     """ given the uri of a node, instantiate an object of this class based
         on the properties of that node in the graph

         uri may be a URIRef or anything graph.geturi() can convert to one.
         Predicates that have a getter are stored under the getter's
         (string) key; any other predicate is stored under its uri.
     """
     if not isinstance(uri, URIRef):
         uri = graph.geturi(uri)
     mdict = MultiDict(uri=[uri])
     # map each known predicate's uri back to the key used in cls.getters:
     preds_by_uri = {graph.geturi(p): p for p in cls.getters}
     # only cls is bound here (there is no self), so go to the shared graph
     # object directly:
     the_graph = graph.Graph.the_graph
     for pred_uri, obj in the_graph.predicate_objects(uri):
         # a property we're not prepared for is included under its uri
         mdict.add(preds_by_uri.get(pred_uri, pred_uri), obj)
     return cls(properties=mdict)
Example #3
0
class Node:
    """ A node in the global RDF graph.

        Subclasses describe a kind of node through the class variables
        below (rdf class, getters, prompts, targets, label property) and
        instances carry the property values for one node.
    """

    # what RDF class is this? (corresponds to "thing a class" in RDF). Note
    # that this is a string like "logset:ConcreteLog" - we won't know the
    # URI until after the graph has been constructed
    # Should be concrete, ie vcard:Organization not vcard:Kind
    rdf_class: ClassVar[str] = ''  # eg "foaf:Organization"

    # certain node types are created as a specific class (eg foaf:Organization
    # for Agent), but when querying the graph we need to find any subclass of
    # some superclass (eg foaf:Agent for Agent). If that is the case, the
    # Node class should specify the superclass with:
    rdf_superclass: ClassVar[str] = ''  # eg "foaf:Agent"

    # when adding a Node to the graph, triples for a certain set of properties
    # (based on the type of Node) are expected. These might be obtained by
    # querying a file/source or asking the user or inferring from context, etc.
    # Each class specifies the expected properties and how to obtain them via a
    # class-variable 'getters', which is a dict mapping a predicate (str) to
    # the name of the method (of self) that obtains it.
    # getter methods are called like:
    #   values:PropertyValues = self.getter(context:Context)
    getters: ClassVar[PropertyGetterDict] = {}
    # some predicates *must* be present, indicate these with:
    required_properties: ClassVar[Set[str]] = set()

    # to support finding known nodes of a given type, Node classes should
    # define a sparql query that returns rows from which a Node can be
    # instantiated. This is done by mapping each row to the finder_fields
    # and passing the result as a PropertyDict to __init__ (see method known())
    # NOTE(review): known() currently builds its query from 'getters' and
    # does not read these two attributes - confirm whether they are still
    # needed.
    finder_query: str = ''
    finder_fields: List[str] = []

    # for 'select' and 'multi_select' getters, what target class is being selected?
    targets: Dict[str, NodeType] = {}  # eg 'dct:publisher': Agent

    # prompts for simple questions system can "ask" to get some properties:
    prompts: Dict[str, str] = {}  # eg 'dct:title': "Give the LogSet a title"

    # which property to use as a label? This is useful because when defining
    # a new Thing and asking the user questions about it, it is helpful to
    # create the instance with a label that is included in the prompts. The
    # label generally corresponds with one of the properties of the Node, so
    # we require each subclass to indicate which property it will use.
    label_property: str = None
    # fallback label used when the label property has no value:
    label_alternate = 'this'

    @property
    def graph(self):
        """ the shared graph object held at graph.Graph.the_graph """
        the_graph = graph.Graph.the_graph
        return the_graph

    #def __init__(self, properties:PropertyDict = None) -> None:
    def __init__(self, properties: MultiDict = None, **kwargs) -> None:
        """ set up a node from a MultiDict of properties plus any extra
            keyword properties. The special keys 'uri' and 'namespace' are
            moved out of the properties and onto the instance.
        """
        # lazy getting of uri is going to be common enough to just build it
        # into the base class, so these all start out unset:
        self._uri: Optional[str] = None
        self._namespace: Optional[str] = None
        self._label: Optional[str] = None

        # eg "dct:title": set(Literal("my title"))
        self.properties = MultiDict(properties)
        for key in kwargs:
            self.properties.add(key, kwargs[key])

        # 'uri' and 'namespace' are not real predicates: lift each one onto
        # its private attribute and drop it from the properties
        for special, attr in (('uri', '_uri'), ('namespace', '_namespace')):
            if special in self.properties:
                setattr(self, attr, self.properties.one(special))
                self.properties.remove(special)

        # all attributes we have a getter for should have an
        # entry in properties, even if it is empty:
        for predicate in self.getters:
            if predicate not in self.properties:
                self.properties.add(predicate)

        # when adding a node to the graph and recursing into its properties,
        # we want a mechanism to bypass adding nodes that are already in the
        # graph:
        self._in_graph = False

    @classmethod
    def instance(cls, uri):
        """ given the uri of a node, instantiate an object of this class based
            on the properties of that node in the graph

            uri may be a URIRef or anything graph.geturi() can convert to
            one. Predicates that have a getter are stored under the getter's
            (string) key; any other predicate is stored under its uri.
        """
        if not isinstance(uri, URIRef):
            uri = graph.geturi(uri)
        mdict = MultiDict(uri=[uri])
        # map each known predicate's uri back to the key used in cls.getters:
        preds_by_uri = {graph.geturi(p): p for p in cls.getters}
        # this is a classmethod, so there is no self (and the 'graph'
        # property is only usable on instances): go to the shared graph
        # object directly
        the_graph = graph.Graph.the_graph
        for pred_uri, obj in the_graph.predicate_objects(uri):
            # a property we're not prepared for is included under its uri
            mdict.add(preds_by_uri.get(pred_uri, pred_uri), obj)
        return cls(properties=mdict)

    @classmethod
    def known(cls, filters: Dict[str, str] = None):
        """ generator-constructor over known nodes of a given type.

            Builds one SPARQL query selecting ?uri plus one variable per
            predicate in cls.getters, ordered by ?uri, and yields one
            instance of cls per distinct uri with the values of all of its
            rows collected into a MultiDict.

            filters maps a predicate to a value that the node must have; a
            predicate not in filters is matched as OPTIONAL. Filter values
            that are rdflib Identifiers are wrapped in < >, anything else
            (eg a prefixed name like 'ddict:someThing') is used verbatim.
        """
        # TODO required properties are not enforced here: every predicate
        # that is not filtered on is OPTIONAL in the query.
        if filters is None:
            filters = {}

        def as_str(x):
            # identifiers need < > around them, prefixed names stay as is
            return "<{0}>".format(str(x)) if isinstance(x, Identifier) else x

        logging.debug("filters has: {0}".format(filters))
        preds = list(cls.getters)  # impose an order
        qvars = ['?v{:d}'.format(i) for i in range(len(preds))]
        query = "SELECT ?uri {0} WHERE {{ ".format(' '.join(qvars))
        # the default rdf_superclass is '' (not None), so test for "unset"
        # rather than for None, else the query gets an empty superclass:
        if not cls.rdf_superclass:
            query += "?uri a {0} . ".format(cls.rdf_class)
        else:
            query += "?uri a ?type . "
            query += "?type rdfs:subClassOf* {0} . ".format(cls.rdf_superclass)
        for pred, var in zip(preds, qvars):
            if pred in filters:
                query += "?uri {0} {1} . ".format(pred, as_str(filters[pred]))
            else:
                query += "OPTIONAL {{ ?uri {0} {1} . }} ".format(pred, var)
        query += "} ORDER BY ?uri "
        logging.debug("query is: {0}".format(query))

        # rows arrive grouped by uri (ORDER BY), so emit a node each time
        # the uri changes and once more at the end for the last group
        curr_uri = None
        mdict: MultiDict = None
        for row in graph.query(query):
            logging.debug("found {0}".format(str(row)))
            next_uri = row[0]  # the uri
            if next_uri != curr_uri:
                if curr_uri is not None:
                    logging.debug("making a {0} with props {1}".format(
                        cls.__name__, str(mdict)))
                    yield cls(properties=mdict)
                mdict = MultiDict(uri=[next_uri])
                curr_uri = next_uri
            # add each var to mdict
            # NOTE(review): variables left unbound by OPTIONAL presumably
            # come back as None and are added as-is - confirm MultiDict
            # copes with that.
            for key, val in zip(preds, row[1:]):
                mdict.add(key, val)
        if mdict is not None:
            logging.debug("making a {0} with props {1}".format(
                cls.__name__, str(mdict)))
            yield cls(properties=mdict)  # the last one
        else:
            logging.debug("no nodes of type {0} found".format(cls.__name__))

    def get_values(self,
                   predicate: str,
                   context: Optional[Context] = None) -> PropertyValues:
        """ return the values held for a property, calling the getter
            registered for the predicate first if none are held yet
        """
        if context is None:
            context = Context(predicate=predicate)
        logging.debug("looking for {0} in {1}".format(predicate,
                                                      str(self.properties)))
        held = self.properties.get(predicate)
        if len(held) != 0:
            # already have something, no need to go looking
            return self.properties[predicate]

        logging.debug("calling a getter for {0}".format(predicate))
        # getters maps the predicate to the *name* of a method of self:
        fetch = getattr(self, self.getters[predicate])
        logging.debug("got getter {0}, context {1}".format(
            fetch, context))
        self.properties.add(predicate, *fetch(context))
        logging.debug("now {0} has: {1}".format(
            predicate, str(self.properties[predicate])))
        return self.properties[predicate]

    def get_one_value(self,
                      predicate: str,
                      context: Context = None) -> PropertyValue:
        """ return a single value for a property (via get_values, so a
            getter may be called), or None if there are no values
        """
        values = self.get_values(predicate, context)
        if not values:
            return None
        # pick one without pop(): get_values hands back what
        # self.properties holds, and popping from it would remove the value
        # from the node's properties
        return next(iter(values))

    #def label(self, context:Context=None) -> str:
    @property
    def label(self) -> str:
        """ when asking questions of the user, hinting at who is asking
            is helpful. If the label should have more to it than the
            content of the label property, then the subclass should
            override this (eg, the subjecttype label is from
            'skos:prefLabel', but the logset label is "this logset {dct:title}")

            The label is computed once and cached in self._label; it falls
            back to label_alternate when the label property has no value.
        """
        # Note: this should not trigger getters, so we use self.properties
        # not self.get_one_value:
        if self._label is None:
            candidates = self.properties[self.label_property]
            if len(candidates) > 0:
                # take a value without pop(), so that the label is not
                # removed from the node's properties as a side effect
                self._label = next(iter(candidates))
            else:
                self._label = self.label_alternate
        return self._label

    def __str__(self):
        """ render as "<label>: <uri>" """
        # note that this should not trigger getters, so we use _uri not uri
        label, uri_text = self.label, str(self._uri)
        return "{0}: {1}".format(label, uri_text)

    @property
    def uri(self):
        """ the uri of this node, as held in self._uri """
        # many subclasses will override this to lazily set a uri based on
        # the value of a property
        return self._uri

    @uri.setter
    def uri(self, value):
        """ store value as the uri of this node """
        self._uri = value

    def add_to_graph(self, context: Context = None):
        """ add this node to the graph: one triple per value of each
            property, then an rdf:type triple for the node itself.

            Does nothing if the node was already added (self._in_graph).
            A new Context is created when none is passed. All property
            values are collected (which may call getters) before any triple
            is added to the graph.

            Raises Exception if a property value is neither an Identifier
            nor a Node.
        """
        if self._in_graph:
            logging.debug("already in graph, skipping")
            return

        if context is None:
            context = Context()

        # describe my properties first, so subclasses can use them to generate a helpful uri if necessary:
        # I think that to avoid loops we need to do all of the asking (the user)
        # before doing any of the adding to graph:
        triples = []
        # the node and the current predicate are pushed onto the context
        # while values are collected, and popped again afterwards:
        context.push(node=self)
        for predicate in self.properties:
            context.push(predicate=predicate)
            # need to convert string eg foaf:name to an actual uri for adding
            # to graph:
            logging.debug("calling Graph.geturi on {0}".format(predicate))
            pred_uri = graph.geturi(predicate)
            logging.debug("calling get_values with {0}, {1}".format(
                str(predicate), str(context)))
            for v in self.get_values(predicate, context):
                if isinstance(v, Identifier):
                    # already something the graph can hold as an object
                    triples.append((self.uri, pred_uri, v))
                elif isinstance(v, Node):
                    # another node: refer to it by its uri
                    triples.append((self.uri, pred_uri, v.uri))
                else:
                    # I'm pretty sure this should never happen
                    raise Exception("oh oh! " + str(v) + " ... " +
                                    str(type(v)))
            context.pop(('predicate', ))
        context.pop(('node', ))
        for triple in triples:
            logging.debug("adding triple {0}".format(triple))
            self.graph.add(triple)

        # finally, describe me:
        rdf = graph.getns('rdf')
        myclass = graph.geturi(self.rdf_class)
        logging.info("adding me to graph: {0}, {1}, {2}".format(
            self.uri, str(rdf.type), str(myclass)))
        self.graph.add((self.uri, rdf.type, myclass))

        # remember, so a later call returns early:
        self._in_graph = True

    # some common getters:
    def skip(self, context: Context) -> PropertyValues:
        """ getter for optional properties: supplies no values at all """
        logging.debug("calling skip wiht {0}".format(str(context)))
        nothing = set()
        return nothing

    def abort(self, context: Context) -> None:
        """ getter for properties that must already be present: always
            raises an Exception naming the missing predicate
        """
        logging.debug("calling abort wiht {0}".format(str(context)))
        missing = str(context['predicate'])
        raise Exception("{0} {1} missing predicate {2}".format(
            self.rdf_class, str(self.uri), missing))

    def ask(self, context: Context) -> PropertyValues:
        """ getter that asks the user for a simple text value, using the
            prompt registered for the predicate (subclasses should override
            if needing something more complicated). Returns a set holding
            the answer as a Literal, or an empty set if there is no prompt
            or the answer is empty.
        """
        logging.debug("calling ask wiht {0}".format(str(context)))
        answers = set()
        predicate = context['predicate']
        logging.debug("asking about {0} (label property is {1}".format(
            str(predicate), str(self.label_property)))
        # when asking for the label itself, we can't use the label yet:
        if predicate == self.label_property:
            label = self.label_alternate
        else:
            label = self.label
        prompt = self.prompts.get(predicate)
        if prompt is None:
            return answers
        insist = predicate in self.required_properties
        value = UI.ask(prompt.format(label), insist=insist)
        if value != '':
            answers.add(Literal(value))
        return answers

    def truefalse(self, context: Context) -> PropertyValues:
        """ getter that puts the predicate's prompt to the user as a
            true/false question and returns the answer as a set holding one
            xsd:boolean Literal
        """
        prompt = self.prompts.get(context['predicate'])
        if UI.truefalse(prompt):
            response = 'true'
        else:
            response = 'false'
        xsd = graph.getns('xsd')
        logging.debug("truefalse returning " + response)
        return {Literal(response, datatype=xsd.boolean)}

    def select(self, context: Context) -> PropertyValues:
        """ getter that has the user pick exactly one node of the target
            class (or create a new one), via select_from_known
        """
        logging.debug("calling select with {0}".format(str(context)))
        predicate = context['predicate']
        # when selecting the label itself, we can't use the label yet:
        if predicate == self.label_property:
            label = self.label_alternate
        else:
            label = self.label
        return self.select_from_known(context, label, multi=False)

    #@classmethod
    def multi_select(self, context: Context) -> PropertyValues:
        """ getter that has the user pick any number of nodes of the target
            class (or create new ones), via select_from_known
        """
        logging.debug("calling multi_select with {0}".format(str(context)))
        predicate = context['predicate']
        # when selecting the label itself, we can't use the label yet:
        if predicate == self.label_property:
            label = self.label_alternate
        else:
            label = self.label
        return self.select_from_known(context, label)

    @classmethod
    def select_from_known(
        cls,
        context: Context = None,
        label=None,
        multi=True,
        allow_create=True,
        filters: Dict[str, str] = None) -> PropertyValues:
        """ classmethod select getter: show the user the known nodes of the
            target class and return the set of uris of those selected.

            The target class is looked up in cls.targets by the predicate in
            the context, defaulting to cls itself.

            context      -- supplies 'predicate' and optionally 'prompt'; a
                            fresh Context is created when omitted
            label        -- text substituted into the prompt, defaults to
                            the target class name
            multi        -- allow several selections (and keep asking
                            "more? " until an empty selection)
            allow_create -- offer 'n' to create a new node of the target
                            class and add it to the graph
            filters      -- passed to known() to restrict the candidates
        """
        # build defaults per call: a Context()/dict() default in the
        # signature would be one object shared (and mutated) across calls
        if context is None:
            context = Context()
        if filters is None:
            filters = {}

        retval = set()
        predicate = context.get('predicate')
        target_cls = cls.targets.get(predicate, cls)
        known = list(target_cls.known(filters))

        if label is None:
            label = target_cls.__name__

        if multi:
            default_prompt = "Please select one or more {0} "
            default_prompt += "(space-separated{1} or empty when done) "
        else:
            default_prompt = "Please select a {0}{1}"

        prompt_new = ", or (n)ew " if allow_create else ""
        additional = ['n'] if allow_create else []

        prompt = (context.get('prompt') or cls.prompts.get(predicate)
                  or default_prompt).format(label, prompt_new)

        if multi:
            selection = UI.multi_select(prompt, known, *additional)
        else:
            selection = [UI.select(prompt, known, *additional)]

        # loop so user can create multiple new entries
        while len(selection) > 0:
            # for multi, but with single it is same logic:
            for choice in selection:
                if allow_create and str(choice).lower() == 'n':
                    # now it gets giddyingly recursive, make a new target_cls Node
                    # and add that:
                    classname = target_cls.__name__
                    new_label = UI.ask(
                        "please give the new {0} a label: ".format(classname))
                    # NOTE(review): the label is passed as a plain str, while
                    # ask() wraps answers in Literal - confirm this is what
                    # add_to_graph expects.
                    props = {target_cls.label_property: [new_label]}
                    new = target_cls(properties=props)
                    new.add_to_graph(context)
                    uri = new.uri
                else:
                    logging.info(
                        f"got choice {choice} from selection {selection}")
                    uri = choice.uri
                retval.add(uri)
            # only offer 'n' again if creating is allowed, otherwise an 'n'
            # answer would be treated as a node above
            selection = (UI.multi_select("more? ", known, *additional)
                         if multi else [])
        return retval

    def make_uri(self, prop, namespace):
        """ make a candidate uri that is human-readable if possible based on
            the value of a selected property. If that property is not set, or
            the uri would clash with one that already exists, add some
            random characters

            prop is the predicate whose value seeds the name; namespace is
            indexed with the name to produce the uri that is returned.
        """
        # make an rdf-friendly name: drop any leading non-letters and every
        # non-word character
        try:
            name = re.sub(r'^[^A-Za-z]+|\W', '', self.properties.one(prop))
        except KeyError:
            name = ''
        if not name:
            # property not set, or nothing usable was left of its value
            name = ran_str(8)

        # does it exist already? (ie does the graph hold any triple with
        # this uri as its subject)
        while any(True
                  for _ in self.graph.predicate_objects(namespace[name])):
            # yep, choose a new name by adding some random characters to the end
            name = '{0}_{1}'.format(name, ran_str(4))

        return namespace[name]
Example #4
0
    def known(cls, filters: Dict[str, str] = None):
        """ generator-constructor over known nodes of a given type.

            Builds one SPARQL query selecting ?uri plus one variable per
            predicate in cls.getters, ordered by ?uri, and yields one
            instance of cls per distinct uri with the values of all of its
            rows collected into a MultiDict.

            filters maps a predicate to a value that the node must have; a
            predicate not in filters is matched as OPTIONAL. Filter values
            that are rdflib Identifiers are wrapped in < >, anything else
            (eg a prefixed name like 'ddict:someThing') is used verbatim.
        """
        # TODO required properties are not enforced here: every predicate
        # that is not filtered on is OPTIONAL in the query.
        if filters is None:
            filters = {}

        def as_str(x):
            # identifiers need < > around them, prefixed names stay as is
            return "<{0}>".format(str(x)) if isinstance(x, Identifier) else x

        logging.debug("filters has: {0}".format(filters))
        preds = list(cls.getters)  # impose an order
        qvars = ['?v{:d}'.format(i) for i in range(len(preds))]
        query = "SELECT ?uri {0} WHERE {{ ".format(' '.join(qvars))
        # treat any unset superclass ('' as well as None) as "no
        # superclass", else the query gets an empty superclass term:
        if not cls.rdf_superclass:
            query += "?uri a {0} . ".format(cls.rdf_class)
        else:
            query += "?uri a ?type . "
            query += "?type rdfs:subClassOf* {0} . ".format(cls.rdf_superclass)
        for pred, var in zip(preds, qvars):
            if pred in filters:
                query += "?uri {0} {1} . ".format(pred, as_str(filters[pred]))
            else:
                query += "OPTIONAL {{ ?uri {0} {1} . }} ".format(pred, var)
        query += "} ORDER BY ?uri "
        logging.debug("query is: {0}".format(query))

        # rows arrive grouped by uri (ORDER BY), so emit a node each time
        # the uri changes and once more at the end for the last group
        curr_uri = None
        mdict: MultiDict = None
        for row in graph.query(query):
            logging.debug("found {0}".format(str(row)))
            next_uri = row[0]  # the uri
            if next_uri != curr_uri:
                if curr_uri is not None:
                    logging.debug("making a {0} with props {1}".format(
                        cls.__name__, str(mdict)))
                    yield cls(properties=mdict)
                mdict = MultiDict(uri=[next_uri])
                curr_uri = next_uri
            # add each var to mdict
            # NOTE(review): variables left unbound by OPTIONAL presumably
            # come back as None and are added as-is - confirm MultiDict
            # copes with that.
            for key, val in zip(preds, row[1:]):
                mdict.add(key, val)
        if mdict is not None:
            logging.debug("making a {0} with props {1}".format(
                cls.__name__, str(mdict)))
            yield cls(properties=mdict)  # the last one
        else:
            logging.debug("no nodes of type {0} found".format(cls.__name__))
    def catalog(self, candidates: Set[FileInfo],
                context: Context) -> Set[FileInfo]:
        """ pick the candidates whose filename matches one of the file
            patterns in the context, add a ConcreteLog to the graph for
            each, and return the candidates that were not picked.

            Reads 'filepatterns', 'logFormatType', 'formatinfo',
            'logseries_uri', 'dcat:accessURL', 'subjects', 'namespace' and
            'logset' from the context. 'handler_args' is pushed onto the
            context while the logs are added and popped afterwards.

            A log whose format raises UnsupportedLogFormatHandler is logged
            as a warning and skipped.
        """
        filepatterns = context['filepatterns']
        matching = set()
        for regex in filepatterns:
            logging.info("filepattern: {0}: {1}".format(regex, regex.pattern))
            logging.info("{0:d} candidates".format(len(candidates)))
            logging.info("candidates: {0}".format(candidates))
            # filter once, and log the actual matches (str() of a filter
            # object only shows its repr)
            matched = {c for c in candidates if regex.match(c.filename)}
            logging.info("after filtering: " + str(matched))
            matching |= matched

        # filter by MIMEtype:
        # NOTE(review): this drops the files for which right_mime_type()
        # is truthy, which reads as inverted - confirm what
        # right_mime_type() returns before changing it.
        matching -= {f for f in matching if self.right_mime_type(f)}

        # args for the LogFormatType that will handle the actual file/source:
        handler_args = {
            'rdf_class': context['logFormatType'],
            'fmtinfo': context['formatinfo']
        }
        context.push(handler_args=handler_args)

        # hmm, everything the ConcreteLog infers, we can just pass as properties:
        common_properties = MultiDict()
        common_properties.add('logset:isInstanceOf', context['logseries_uri'])
        common_properties.add('dcat:accessURL', context['dcat:accessURL'])
        common_properties.add('logset:subject', *context['subjects'])
        common_properties.add('namespace', context['namespace'])
        logging.info("logset for concretelog has: {0}".format(
            context['logset']))

        # in most cases we are looking file-by-file .. FilePerTimepoint
        # can override this default behavior
        for f in matching:
            properties = MultiDict(common_properties)
            properties.add('rdfs:label', Literal(f.filename))
            relpath = (f.relpath + os.sep + f.filename).lstrip(os.sep)
            properties.add('dcat:downloadURL', Literal(relpath))
            logging.info(
                "properties for concretelog has: {0}".format(properties))

            # handler_args is the dict already pushed onto the context, so
            # the handler sees the per-file target:
            handler_args['target_url'] = os.sep.join(f)  # full local path
            log = ConcreteLog(properties=properties)
            logging.info("adding log to graph: {0}".format(log))
            try:
                log.add_to_graph(context)
            except UnsupportedLogFormatHandler as err:
                logging.warning("logformat {0} not implemented".format(err))

            properties.remove('rdfs:label')
            properties.remove('dcat:downloadURL')
        context.pop(('handler_args', ))

        return candidates - matching
Example #6
0
class LogSeries(Node):
    """ Node for the ``logset:LogSeries`` rdf class.

        The class-level tables below (getters, required_properties, prompts,
        targets) are all keyed by property name. They are presumably consumed
        by the ``Node`` base class, which is not visible from here -- confirm
        their exact meaning there.

        On top of those tables this class lazily derives and caches:
          - fmtInfo:      the ``logset:logFormatInfo`` values as key/value pairs
          - filePatterns: compiled filename regexes built from ``filepattern``
          - uri:          a uri made from the label property
    """
    rdf_class = "logset:LogSeries"
    getters = {
        'rdfs:label': 'ask',
        'logset:logFormatType': 'select',
        'logset:infoType': 'select',
        'logset:subjectType': 'select',
        'logset:logFormatInfo': 'skip'  # need to work this out
    }
    required_properties = {
        'rdfs:label', 'logset:logFormatType', 'logset:infoType',
        'logset:subjectType'
    }
    prompts = {
        'rdfs:label': 'Give {0} a short identifying label: ',
        'logset:logFormatType':
        'what logformattype is {0}? (TODO get a better question!) ',
        'logset:infoType': 'what type of information does {0} hold? ',
        'logset:subjectType': 'what type of system/component is {0} about? '
    }
    targets = {
        'logset:infoType': InfoType,
        'logset:subjectType': SubjectType,
        'logset:logFormatType': LogFormatType
    }

    #    finder_query = ''' SELECT ?uri (SAMPLE(?label) as ?label)
    #                                   (SAMPLE(?fmttype) as ?fmttype)
    #                                   (SAMPLE(?infotype) as ?infotype)
    #                                   (SAMPLE(?subjtype) as ?subjtype)  WHERE {
    #                          ?uri a logset:LogSeries .
    #                          ?uri rdfs:label ?label .
    #                          ?uri logset:logFormatType ?fmttype .
    #                          ?uri logset:infoType ?infotype .
    #                          ?uri logset:subjectType ?subjtype .
    #                        } GROUP BY ?uri
    #                   '''
    #    finder_fields = [ 'uri', 'rdfs:label', 'logset:logFormatType',
    #                      'logset:infoType', 'logset:subjectType' ]
    label_property: str = 'rdfs:label'
    label_alternate: str = 'log series'

    def __init__(self, properties: MultiDict = None) -> None:
        """ Create the node and reset the lazily computed caches.

            When ``properties`` is omitted a fresh, empty MultiDict is made
            for this instance. (A ``MultiDict()`` default in the signature is
            evaluated only once, so every instance created without arguments
            would share -- and could mutate -- the same object.)
        """
        if properties is None:
            properties = MultiDict()
        super().__init__(properties)
        self._fmtInfo = None  # filled in on first access to fmtInfo
        self._filePatterns = None  # filled in on first access to filePatterns
        self._logFormatType = None

    @property
    def fmtInfo(self) -> MultiDict:
        """ the ``logset:logFormatInfo`` values of this node, as a MultiDict.

            Each value is a "key=value" string, split at the first '=', eg:
                logset:logFormatInfo "filepattern=console-<date:YYYYMMDD>$" ;
                logset:logFormatInfo "filepattern=console$" ;
                logset:logFormatInfo "ts_words=0" ;
                logset:logFormatInfo "part_word=1" ;
            The result is computed on first access and cached.
        """
        if self._fmtInfo is None:
            self._fmtInfo = MultiDict()
            for prop in self.get_values('logset:logFormatInfo'):
                key, _, val = prop.partition('=')
                self._fmtInfo.add(key, val)
        return self._fmtInfo

    @property
    def filePatterns(self) -> List:
        """ return a list, sorted by pattern length (as a proxy for "most
            specific to least specific") of filename patterns that generally
            match this logseries

            The list holds compiled regexes, is built on first access and is
            cached. It is empty (not None) when there are no ``filepattern``
            entries to iterate over.
        """
        if self._filePatterns is None:
            tags = re.compile(r'(<[\w:]+>)')
            pats = set()
            for pattern in self.fmtInfo['filepattern']:
                # convert from a tag-based-pattern to a regex. Splitting on a
                # capturing group leaves the tags at the odd indices and the
                # literal text around them at the even indices:
                parts = tags.split(pattern)
                tagpatterns = [tagPattern(t) for t in parts[1::2]]
                # re-interleave text, tag-regex, text, tag-regex, ..., text
                interleaved = (
                    piece
                    for pair in zip(parts[:-1:2], tagpatterns)
                    for piece in pair
                )
                pats.add(''.join(interleaved) + parts[-1])
            # compile once, after every pattern has been collected, so the
            # cache is always a list -- even when nothing was collected
            self._filePatterns = [
                re.compile(p) for p in sorted(pats, key=len, reverse=True)
            ]
        return self._filePatterns

    @property
    def logFormatType(self):
        """ the cached log format type.

            NOTE(review): the handler lookup below is still commented out and
            nothing else in this class sets the cache, so as written this
            returns None.
        """
        if self._logFormatType is None:
            # the value is fetched but not used yet; the call is kept because
            # get_one_value may have side effects -- TODO confirm
            handlerName = self.get_one_value('logset:logFormatType')
            #self._logFormatType = LogFormatType.handlers[handlerName]
        return self._logFormatType

    @property
    def uri(self):
        """ the uri of this node, derived from the label property on first
            access and cached afterwards
        """
        # lazily find uri so there is a chance of deriving a readable one
        if self._uri is None:
            ns = self._namespace or localdict_ns()
            self._uri = self.make_uri(self.label_property, ns)
        return self._uri

    def identify_subjects(self, subject_list: List[URIRef]) -> List[URIRef]:
        """ given a list of high-level subjects, look for specific subject that are
            a partOf one of these and an isSpecific of the subjecttypes relevant to
            this logseries. Returns a list of uris

            One query is run against ``self.graph`` per entry in
            ``subject_list``; the first column of every result row is
            collected, in query order, without de-duplication.
        """
        def as_str(x):
            # Identifiers are wrapped in <> for use inside the query text;
            # anything else is passed through untouched
            return "<{0}>".format(str(x)) if isinstance(x, Identifier) else x

        subjtypes = ', '.join(
            as_str(uri) for uri in self.get_values('logset:subjectType'))
        #logging.debug("my subtypes are: {0}".format(subjtypes))
        subjects = []
        for subj in subject_list:
            # find subjects whose subjecttype corresponds with subjtype
            # and that are partOf something in the subjectlist
            q = ''' SELECT ?uri
                    WHERE {{
                        ?uri a logset:Subject .
                        ?uri logset:isSpecific ?type .
                        ?uri logset:partOf* {0} .
                        FILTER ( ?type in ({1}) ) .
                    }}
                '''.format(as_str(subj), subjtypes)
            subjects.extend(row[0] for row in self.graph.query(q))
        #logging.debug("query is: {0}".format(q))
        return subjects
Example #7
0
class SolrQuery(object):
    """Chainable builder for the parameters of a Solr search request.

    Parameters are accumulated in ``self.params`` (a MultiDict). Builder
    methods return ``self`` so calls can be chained; ``execute`` hands the
    query to the connection and wraps the outcome in a ``SolrResult``.

    ``separators`` maps a parameter name to the string its values are joined
    with in ``join_params``. It is passed to every ``join_with_separator``
    decorator below, which presumably registers the given separator under
    the given key -- confirm in the decorator, which is not visible here.
    """
    separators = {}

    def __init__(self, connections=None, index=None):
        """Start an empty query.

        ``connections`` is expected to offer ``get``; its ``'main'`` entry
        becomes ``self.conn``. ``index`` is the callable ``handle_row``
        applies to each row.
        """
        self.params = MultiDict()
        self.connections = connections
        # connections defaults to None, so guard the lookup: calling
        # SolrQuery() without arguments must not raise AttributeError
        if connections is not None:
            self.conn = connections.get('main', None)
        else:
            self.conn = None
        self.index = index
        self.params_joined = False

    def use_index(self, index):
        """Replace the callable that ``handle_row`` applies to each row."""
        self.index = index

    def handle_row(self, row):
        """Default row handler: return ``self.index(row)``."""
        return self.index(row)

    def join_params(self):
        """Return a new MultiDict with every parameter value as text.

        A value whose key is listed in ``separators`` is iterated and its
        items are joined with that separator; any other value is converted
        with ``unicode`` as a whole. ``self.params`` is left unchanged.
        """
        def as_text(key, val):
            if key in self.separators:
                return self.separators[key].join([unicode(v) for v in val])
            return unicode(val)

        return MultiDict(
            (key, as_text(key, val)) for key, val in self.params.iteritems())

    def execute(self, handler=handle_row, conn='main'):
        """Run the search and return a ``SolrResult``.

        The parameters are joined (see ``join_params``) only on the first
        call; ``params_joined`` prevents joining them a second time.
        ``handler`` is passed to ``SolrResult`` with this query bound as its
        first argument.

        NOTE(review): ``conn`` is accepted but never read; the search always
        goes through ``self.conn`` (see ``_execute_search``).
        """
        if not self.params_joined:
            self.params = self.join_params()
            self.params_joined = True
        return SolrResult(self._execute_search(), handler=partial(handler, self))

    def query(self, query):
        """Add a ``q`` parameter."""
        self.params.add('q', query)
        return self

    def filter(self, *filters):
        """Add one ``fq`` parameter per given filter."""
        for filter_query in filters:
            self.params.add('fq', filter_query)
        return self

    def facet(self, field, method='enum', missing_facet=NotGiven, sort=NotGiven, min_count=1):
        """Turn faceting on and add ``field`` as a ``facet.field``.

        The per-field ``facet.method`` and ``facet.mincount`` are always
        set; ``facet.sort`` and ``facet.missing`` only when given.
        """
        self.params['facet'] = 'true'
        field_name = SolrQuery._name_or_val(field)
        self.params.add('facet.field', field_name)
        self.params['f.%s.facet.method' % field_name] = method
        if sort != NotGiven:
            self.params['f.%s.facet.sort' % field_name] = sort
        if missing_facet != NotGiven:
            self.params['f.%s.facet.missing' % field_name] = missing_facet
        self.params['f.%s.facet.mincount' % field_name] = min_count
        return self

    def facet_date(self, date_field, start_date=NotGiven, end_date=NotGiven,
                   gap=NotGiven, hard_end=NotGiven, other=NotGiven):
        """Turn faceting on and add ``date_field`` as a ``facet.date``.

        Each optional argument that is given is stored under
        ``f.<field>.facet.date.<suffix>``.
        """
        self.params['facet'] = 'true'
        field_name = SolrQuery._name_or_val(date_field)
        self.params.add('facet.date', field_name)

        # (parameter suffix, value) in the order the parameters are written
        optional = (
            ('start', start_date),
            ('end', end_date),
            ('gap', gap),
            ('hardend', hard_end),
            ('other', other),
        )
        for suffix, value in optional:
            if value != NotGiven:
                self.params['f.%s.facet.date.%s' % (field_name, suffix)] = value

        return self

    def facet_query(self, *queries):
        """Turn faceting on and add one ``facet.query`` per given query."""
        self.params['facet'] = 'true'
        for query in queries:
            self.params.add('facet.query', query)
        return self

    def facet_prefix(self, field, prefix):
        """Facet on ``field`` (see ``facet``) and set ``facet.prefix``."""
        self.facet(field)
        self.params['facet.prefix'] = prefix
        return self

    def default_operator(self, op):
        """Set ``q.op``."""
        self.params['q.op'] = op
        return self

    def default_field(self, field):
        """Set ``df``."""
        self.params['df'] = field
        return self

    def def_type(self, type):
        """Set ``defType``."""
        self.params['defType'] = type
        return self

    def tie(self, tie):
        """Set ``tie``."""
        self.params['tie'] = tie
        return self

    def default_query(self, query):
        """Set ``q.alt``."""
        self.params['q.alt'] = query
        return self

    @join_with_separator(separators, 'mm', ' ')
    def min_should_match(self, *args):
        """Append the given items to the ``mm`` parameter."""
        self._add_items_to_key('mm', *args)
        return self

    def query_type(self, query_type):
        """Set ``qt``."""
        self.params['qt'] = query_type
        return self

    def paginate(self, limit, offset):
        """Set ``rows`` to ``limit`` and ``start`` to ``offset``.

        Same as calling ``limit(limit)`` and ``offset(offset)``. (The two
        assignments used to be swapped, contradicting those methods.)
        """
        self.params['rows'] = limit
        self.params['start'] = offset
        return self

    def limit(self, limit):
        """Set ``rows``."""
        self.params['rows'] = limit
        return self

    def offset(self, offset):
        """Set ``start``."""
        self.params['start'] = offset
        return self

    @join_with_separator(separators, 'bf', ' ')
    def boost(self, *fields):
        """Append the given items to the ``bf`` parameter."""
        self._add_items_to_key('bf', *fields)
        return self

    @join_with_separator(separators, 'pf', ' ')
    def phrase_boost(self, *fields):
        """Append the given items to the ``pf`` parameter."""
        self._add_items_to_key('pf', *fields)
        return self

    def query_slop(self, slop):
        """Set ``qs``."""
        self.params['qs'] = slop
        return self

    def phrase_slop(self, slop):
        """Set ``ps``."""
        self.params['ps'] = slop
        return self

    def boost_query(self, query):
        """Add a ``bq`` parameter.

        Returns ``self`` like the other builder methods, so it can be used
        in the middle of a call chain.
        """
        self.params.add('bq', query)
        return self

    @staticmethod
    def _name_or_val(arg):
        """Return ``arg.solr_field_name`` when ``arg`` has one, else ``arg``."""
        return arg.solr_field_name if hasattr(arg, 'solr_field_name') else arg

    # Emulating defaultdict but on MultiDict
    def _add_items_to_key(self, param, *args):
        """Extend the list stored under ``param`` with the given items.

        Starts from an empty list when ``param`` is not set yet; every item
        goes through ``_name_or_val`` first.
        """
        list_args = self.params.get(param, [])
        list_args.extend([SolrQuery._name_or_val(arg) for arg in args])
        self.params[param] = list_args

    @join_with_separator(separators, 'qf', ' ')
    def queried_fields(self, *fields):
        """Append the given items to the ``qf`` parameter."""
        self._add_items_to_key('qf', *fields)
        return self

    def dismax_of(self, *fields):
        """Set up a dismax query over ``fields`` with a tie of 0.1."""
        # Wrapper for queries using the dismax query parser
        self.def_type('dismax')
        self.queried_fields(*fields)
        self.tie(0.1)
        return self

    @join_with_separator(separators, 'fl', ',')
    def fields(self, *args):
        """Append the given items to the ``fl`` parameter."""
        self._add_items_to_key('fl', *args)
        return self

    @join_with_separator(separators, 'sort', ',')
    def sort_by(self, *args):
        """Append the given items to the ``sort`` parameter."""
        self._add_items_to_key('sort', *args)
        return self

    def indent(self, indent):
        """Set ``indent``."""
        self.params['indent'] = indent
        return self

    def debug(self, debug):
        """Set ``debugQuery``."""
        self.params['debugQuery'] = debug
        return self

    def echo_handler(self, echo):
        """Set ``echoHandler``."""
        self.params['echoHandler'] = echo
        return self

    def echo_params(self, echo):
        """Set ``echoParams``."""
        self.params['echoParams'] = echo
        return self

    def _execute_search(self):
        """Pass this query to ``self.conn.search`` and return its result."""
        return self.conn.search(self)