Esempio n. 1
0
    def get_map_strategy(self, alist: Alist):
        """ Choose the map (decomposition) operations to try on an alist

        Args
        ----
        alist : Alist
            The alist that is about to be decomposed.

        Return
        ------
        ops : A list of `(mapper_fn, name)` tuples. It holds the single
            "comparison" mapper when the alist operation is a comparison,
            the single "normalize" mapper when the alist still has
            uninstantiated nesting variables, and otherwise every mapper
            named in the `base_decompositions` config entry, shuffled.

        """
        # TODO: learn to predict best strategy given path of root from
        # node and attributes in alist
        self.last_heartbeat = time.time()

        operation = alist.get(tt.OP).lower()
        if operation in ('eq', 'lt', 'gt', 'lte', 'gte'):
            strategy = "comparison"
            return [(frank.map.map_wrapper.get_mapper_fn(strategy), strategy)]

        # a compound frame (i.e. a nesting point in the frame) is normalized
        if alist.uninstantiated_nesting_variables():
            strategy = "normalize"
            return [(frank.map.map_wrapper.get_mapper_fn(strategy), strategy)]

        candidates = []
        for name in config.config["base_decompositions"]:
            try:
                mapper = frank.map.map_wrapper.get_mapper_fn(name)
            except Exception as ex:
                # a mapper that cannot be resolved is reported and skipped
                print("Error in decomposition mapper: " + str(ex))
            else:
                candidates.append((mapper, name))
        random.shuffle(candidates)
        return candidates
Esempio n. 2
0
    def decompose(self, alist: A, G: InferenceGraph):
        """ Normalize an alist by unpacking one of its nesting variables

        The uninstantiated nesting variables of `alist` are visited in
        order. For the first one whose nested value contains a
        `NormalizeFn.FILTER`, `NormalizeFn.IN`, `NormalizeFn.IS` or
        `tt.OP` entry, a copy of `alist` with operation 'comp' is linked
        below `alist` in `G` as an H-node, and the child alist(s) built
        from the nested value are linked below that copy as Z-nodes.

        Args
        ----
        alist : A
            The alist containing the nested attribute.
        G : InferenceGraph
            Graph that receives the new nodes and links.

        Return
        ------
        The 'comp' alist that was linked below `alist`, or `None` when no
        nesting variable has one of the recognised entries.

        """

        def spawn_parent(attr_name, conjunctive=False):
            # Copy of `alist` without the nested attribute; linked below
            # `alist` as the explored 'comp' H-node for `attr_name`.
            parent = alist.copy()
            parent.set(tt.OPVAR, attr_name)
            parent.set(tt.OP, 'comp')
            del parent.attributes[attr_name]
            parent.cost = alist.cost + 1
            if conjunctive:
                parent.branch_type = br.AND
            parent.state = states.EXPLORED
            parent.parent_decomposition = 'normalize'
            parent.node_type = nt.HNODE
            # alist.link_child(parent)
            G.link(alist, parent, parent.parent_decomposition)
            return parent

        def attach(node, hnode, parent):
            # Copy the H-node's context onto `node`, inject the query
            # context and link the result below `parent`.
            node.set(tt.CONTEXT, hnode.get(tt.CONTEXT))
            node = frank.context.inject_query_context(node)
            G.link(parent, node, hnode.parent_decomposition)
            return node

        pending = alist.uninstantiated_nesting_variables()
        for nest_attr, nested in pending.items():

            if NormalizeFn.FILTER in nested:
                hnode = spawn_parent(nest_attr, conjunctive=True)
                # check for filters that heuristics apply to
                # e.g. type==country and location==Europe
                hints = {}
                for triple in nested[NormalizeFn.FILTER]:
                    prop = str(triple['p'])
                    obj = str(triple['o'])
                    if prop == 'type' and obj in ('country', 'continent'):
                        hints['geopolitical'] = obj
                    elif prop == 'location':
                        hints['location'] = obj

                if 'geopolitical' in hints and 'location' in hints:
                    # use heuristics to create a single alist containing the
                    # conjunction to find the X located in Y
                    node = A()
                    node.set(tt.OP, 'values')
                    node.set(tt.OPVAR, nest_attr)
                    node.set(tt.SUBJECT, nest_attr)
                    node.set(tt.PROPERTY,
                             '__geopolitical:' + hints['geopolitical'])
                    node.set(tt.OBJECT, hints['location'])
                    node.cost = hnode.cost + 1
                    node.state = states.UNEXPLORED
                    node.node_type = nt.ZNODE
                    attach(node, hnode, hnode)
                    return hnode

                # no applicable heuristic: one child alist per filter
                for triple in nested[NormalizeFn.FILTER]:
                    node = A()
                    node.set(tt.OP, 'values')
                    node.set(tt.OPVAR, nest_attr)
                    node.set(tt.SUBJECT, nest_attr)
                    for key, val in triple.items():
                        node.set(key, val)
                    node.cost = hnode.cost + 1
                    node.state = states.UNEXPLORED
                    node.node_type = nt.ZNODE
                    attach(node, hnode, hnode)
                return hnode

            if NormalizeFn.IN in nested:
                hnode = spawn_parent(nest_attr)

                # accept either a list or a ';'-separated string of items
                members = nested[NormalizeFn.IN]
                if isinstance(members, list):
                    items = [str(member) for member in members]
                elif isinstance(members, str):
                    items = [str(part).strip()
                             for part in str(members).split(';')]
                else:
                    items = []

                for item in items:
                    node = A()
                    node.set(tt.OP, 'value')
                    if nest_attr[0] in (vx.AUXILLIARY, vx.PROJECTION,
                                        vx.NESTING):
                        node.set(tt.OPVAR, nest_attr)
                        node.set(nest_attr, item)
                    else:
                        # route the item through a new projection variable
                        fresh = vx.PROJECTION + '_x' + \
                            str(len(hnode.attributes))
                        node.set(tt.OPVAR, fresh)
                        node.set(nest_attr, fresh)
                        node.set(fresh, item)
                    node.state = states.UNEXPLORED
                    node.node_type = nt.ZNODE
                    node.cost = hnode.cost + 1
                    attach(node, hnode, hnode)
                return hnode

            if NormalizeFn.IS in nested:
                hnode = spawn_parent(nest_attr)

                node = A()
                node.set(tt.OP, 'value')
                fresh = vx.PROJECTION + '_x' + str(len(hnode.attributes))
                node.set(tt.OPVAR, fresh)
                node.set(fresh, nested[NormalizeFn.IS])
                node.state = states.REDUCIBLE
                node.cost = hnode.cost + 1
                node.node_type = nt.ZNODE
                node = attach(node, hnode, hnode)

                names_variable = nested[NormalizeFn.IS].startswith(
                    (vx.AUXILLIARY, vx.NESTING, vx.PROJECTION))
                if not names_variable:
                    # this is an instantiation, so a pseudo leaf node should
                    # be created
                    leaf = A()
                    leaf.set(tt.OP, 'value')
                    fresh = vx.PROJECTION + '_x' + \
                        str(len(hnode.attributes))
                    leaf.set(tt.OPVAR, fresh)
                    leaf.set(fresh, nested[NormalizeFn.IS])
                    leaf.state = states.REDUCIBLE
                    leaf.cost = hnode.cost + 1
                    leaf.node_type = nt.ZNODE
                    attach(leaf, hnode, node)

                return hnode

            if tt.OP in nested:
                # here the nested attribute is blanked rather than deleted,
                # and the state of the copy is left as copied
                hnode = alist.copy()
                hnode.set(tt.OPVAR, nest_attr)
                hnode.set(tt.OP, 'comp')
                hnode.set(nest_attr, '')
                hnode.cost = alist.cost + 1
                hnode.parent_decomposition = 'normalize'
                hnode.node_type = nt.HNODE
                # alist.link_child(hnode)
                G.link(alist, hnode, hnode.parent_decomposition)

                next_id = 200
                node = A()
                for key, val in nested.items():
                    if isinstance(val, str):
                        node.set(key, val.strip())
                    elif key == tt.CONTEXT:
                        node.set(key, val)
                    else:
                        # non-string values move behind a new nesting variable
                        placeholder = vx.NESTING + str(next_id)
                        node.set(key, placeholder)
                        node.set(placeholder, val)
                        next_id += 1
                node.cost = hnode.cost + 1
                node.node_type = nt.ZNODE
                attach(node, hnode, hnode)
                return hnode

        return None
Esempio n. 3
0
    def search_kb(self, alist: Alist):
        """ Search knowledge bases to instantiate variables in alist.

        The wikidata, worldbank and musicbrainz sources are queried in
        turn; sources not marked as high trust are skipped when the alist
        context asks for high trust. The alist property is first resolved
        to source-specific property ids, which are remembered in
        `self.property_refs` (and mapped back to the original name through
        `self.reverse_property_refs`). Every fact found is set to the
        REDUCIBLE state, typed as a FACT node and linked below `alist` in
        `self.G` with the "Lookup" decomposition.

        Args
        ----
        alist: Alist
            The alist whose subject, object or time is to be instantiated.

        Return
        ------
        Returns `True` if variable instantiation is successful from a KB
        search, or if the alist is already in the EXPLORED state. Returns
        `False` otherwise, including when the alist still has
        uninstantiated nesting variables.

        """
        self.last_heartbeat = time.time()
        # cannot search if alist has uninstantiated nested variables
        if alist.uninstantiated_nesting_variables():
            # return a bool here like every other exit, not an empty list
            return False

        self.write_trace(
            f"{pcol.MAGENTA}search {alist.id}{pcol.RESET} {alist}{pcol.RESETALL}"
        )
        if alist.state == states.EXPLORED:
            return True

        found_facts = []
        prop_string = alist.get(tt.PROPERTY)
        sources = {
            'wikidata': {
                'fn': wikidata,
                'trust': 'low'
            },
            'worldbank': {
                'fn': worldbank,
                'trust': 'high'
            },
            'musicbrainz': {
                'fn': musicbrainz,
                'trust': 'high'
            }
        }
        context = alist.get(tt.CONTEXT)
        context_store = {
            **context[0],
            **context[1],
            **context[2]
        } if context else {}
        high_trust_only = context_store.get(ctx.trust) == 'high'

        for source_name, source in sources.items():
            # check context for trust
            if high_trust_only and source['trust'] != 'high':
                continue
            search_alist = alist.copy()
            # inject context into IR
            search_alist = frank.context.inject_retrieval_context(
                search_alist, source_name)

            # Resolve the property to ids in this source when the property
            # has no refs yet (internal '__' properties are never resolved)
            # or when none of its stored refs come from this source.
            if prop_string in self.property_refs:
                needs_lookup = all(
                    ref_source != source_name
                    for _, ref_source in self.property_refs[prop_string])
            else:
                needs_lookup = not prop_string.startswith('__')

            if needs_lookup:
                props = source['fn'].search_properties(prop_string)
                # Append to the stored list instead of replacing it, so refs
                # recorded by earlier calls for other sources are kept.
                prop_refs = self.property_refs.setdefault(prop_string, [])
                max_score = 0
                for p in props:
                    # keep taking matches until the score drops
                    if p[2] >= max_score:
                        prop_refs.append((p, source_name))
                        self.reverse_property_refs[p[0]] = prop_string
                        max_score = p[2]
                    else:
                        break

            # instantiate the first missing attribute; default to the subject
            uninstantiated_variables = search_alist.uninstantiated_attributes()
            search_attr = next(
                (attr for attr in (tt.SUBJECT, tt.OBJECT, tt.TIME)
                 if attr in uninstantiated_variables),
                tt.SUBJECT)

            cache_found_flag = False
            if config.config['use_cache']:
                # TODO: the cache backend (frank.cache.neo4j.search_cache) is
                # disabled, so each probe below is hard-wired to a miss.
                # search with original property name
                (cache_found_flag, results) = (False, [])
                if cache_found_flag:
                    found_facts.append(results[0])
                # search with source-specific property IDs; a property
                # without refs (e.g. an internal '__' one) has none to try
                for (propid, _source_name) in self.property_refs.get(
                        prop_string, []):
                    self.last_heartbeat = time.time()
                    search_alist.set(tt.PROPERTY, propid[0])
                    (cache_found_flag, results) = (False, [])
                    if cache_found_flag:
                        found_facts.append(results[0])
                        self.write_trace(
                            f'{pcol.MAGENTA}found: cache{pcol.RESETALL}')

            if not cache_found_flag and prop_string in self.property_refs:
                # search for data for each property reference source
                for propid_label, _source_name in self.property_refs[
                        prop_string]:
                    self.last_heartbeat = time.time()
                    if _source_name != source_name:
                        continue
                    try:
                        search_alist.set(tt.PROPERTY, propid_label[0])
                        found_facts.extend(
                            source['fn'].find_property_values(
                                search_alist, search_attr))
                        # TODO: handle location search in less adhoc manner
                        if prop_string.lower() == "location":
                            if search_attr == tt.SUBJECT:
                                found_facts.extend(
                                    wikidata.part_of_relation_subject(
                                        search_alist))
                            elif search_attr == tt.OBJECT:
                                found_facts.extend(
                                    wikidata.part_of_relation_object(
                                        search_alist))
                    except Exception as ex:
                        # best effort: report and try the next ref
                        self.write_trace(
                            f"{pcol.RED}Search Error{pcol.RESETALL}",
                            processLog.LogLevel.ERROR)
                        print(str(ex))
                    else:
                        # the first ref of this source that works is enough
                        break

            if not found_facts and prop_string.startswith('__geopolitical:'):
                if search_attr == tt.SUBJECT:
                    found_facts.extend(
                        wikidata.part_of_geopolitical_subject(search_alist))
            # TODO: save facts found to cache if caching is enabled

        for ff in found_facts:
            self.last_heartbeat = time.time()
            # expose the found value under the alist's operation variable
            ff.set(tt.OPVAR, alist.get(tt.OPVAR))
            ff.set(ff.get(tt.OPVAR), ff.get(search_attr))
            sourceCov = sourcePrior().get_prior(
                source=list(ff.data_sources)[0]).cov
            ff.set(tt.COV, sourceCov)
            ff.state = states.REDUCIBLE
            ff.set(tt.EXPLAIN, '')
            ff.node_type = nt.FACT
            # restore the original property name in place of the source id
            if ff.get(tt.PROPERTY) in self.reverse_property_refs:
                ff.set(tt.PROPERTY,
                       self.reverse_property_refs[ff.get(tt.PROPERTY)])

            alist.parent_decomposition = "Lookup"
            self.G.add_alist(alist)
            self.G.link(alist, ff, alist.parent_decomposition)

            # fact is considered reduced
            self.write_trace(
                f'  {pcol.MAGENTA}found:{pcol.RESET} {str(ff)}{pcol.RESETALL}'
            )
        return bool(found_facts)