Example #1
    def __induce_level(self, rules):
        '''
        Specializes the rules for the last level with unary predicates.
        '''
        while True:
            old_score = self.group_score(rules)
            new_rules = rules[:]
            for i, rule in enumerate(rules):
                specializations = self.specialize(rule)
                self.extend(new_rules, specializations)

            # Take the first N rules
            rules = sorted(new_rules,
                           key=lambda rule: rule.score,
                           reverse=True)[:self.n]

            new_score = self.group_score(rules)

            logger.debug("Old score: %.3f, New score: %.3f" %
                         (old_score, new_score))

            if 1 - abs(old_score / (new_score + 0.0001)) < 0.01:
                break

        return rules
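
The loop above is a beam-search-style refinement: each pass specializes every rule in the beam, keeps the best self.n rules by score, and stops once the relative change in the group score drops below roughly 1%. A minimal standalone sketch of just that stopping test (the 0.0001 constant guards against division by zero, as in the code above):

def has_converged(old_score, new_score, tol=0.01, eps=0.0001):
    # Relative change between consecutive group scores.
    return 1 - abs(old_score / (new_score + eps)) < tol

print(has_converged(0.50, 0.52))    # False: ~3.9% relative change, keep iterating
print(has_converged(0.520, 0.521))  # True: ~0.2% relative change, stop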
Example #2
 def specialize_optimal_subclass(rule):
     rules = []
     eligible_preds = rule.shared_var[rule.latest_var]
     for pred in filter(is_unary, eligible_preds):
         for sub_class in self.get_subclasses(pred):
             logger.debug('Swapping with %s' % sub_class)
             new_rule = rule.clone_swap_with_subclass(pred, sub_class)
             if self.can_specialize(new_rule):
                 rules.append(new_rule)
                 rules.extend(specialize_optimal_subclass(new_rule))
     return rules
Example #3
def csv_parse_data(g, data_file):
    '''
    Assumes the following csv format:

    example_uri_or_label; attr_uri_1; attr_uri_2; ...; attr_uri_n
    http://example.org/uri_1; 0/1; 0/1; 0/1; 0/1; ...
    http://example.org/uri_2; 0/1; 0/1; 0/1; 0/1; ...
    ...

    Alternatively attribute values can be URIs themselves.
    '''
    attributes = []
    class_labels = []
    examples = []

    with open(data_file) as f:
        data_lines = f.readlines()
        domain = [a.strip() for a in data_lines[0].split(';')]
        attributes = domain[:-1]
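        # Note: the last header entry is dropped here; below, the last value of
        # each row is stored as the example's class label.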

        logger.debug('Attributes: %s' % str(attributes))
        logger.debug('# Examples: %d' % (len(data_lines) - 1))

        for ex_i, example_line in enumerate(data_lines[1:]):
            values = [v.strip() for v in example_line.split(';')]
            if len(values) != len(attributes) + 1:
                raise Exception(
                    'The number of values (%d) does not match the expected '
                    'number of columns (%d) on line %d.'
                    % (len(values), len(attributes) + 1, ex_i + 2))

            examples.append(values)

    for example in examples:
        # Write to rdf graph
        u = build_uri(example[0])
        g.add((u, rdflib.RDF.type, HEDWIG.Example))
        g.add((u, HEDWIG.class_label, rdflib.Literal(example[-1])))

        for att_idx, att in enumerate(attributes):

            # Skip the label
            if att_idx == 0:
                continue

            attribute_value = example[att_idx]
            value_is_uri = attribute_value.startswith('http://')
            if not (value_is_uri or attribute_value == '1'):
                continue
            annotation_uri = build_uri(
                attribute_value) if value_is_uri else build_uri(att)
            blank = rdflib.BNode()
            g.add((u, HEDWIG.annotated_with, blank))
            g.add((blank, HEDWIG.annotation, annotation_uri))
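
For reference, a minimal usage sketch of the parser above. The file name, the attribute URI and the class labels are made up for illustration, and the snippet assumes csv_parse_data and its helpers (build_uri, HEDWIG) are importable from the module shown:

import rdflib

# data.csv (hypothetical); per the code above, the last column of every row is
# read as the class label:
#   example; http://example.org/attr_1; class
#   http://example.org/ex_1; 1; positive
#   http://example.org/ex_2; 0; negative
g = rdflib.Graph()
csv_parse_data(g, 'data.csv')
print(len(g))  # number of triples added to the graph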
Example #4
def csv_parse_data(g, data_file):
    '''
    Assumes the following csv format:

    example_uri_or_label; attr_uri_1; attr_uri_2; ...; attr_uri_n
    http://example.org/uri_1; 0/1; 0/1; 0/1; 0/1; ...
    http://example.org/uri_2; 0/1; 0/1; 0/1; 0/1; ...
    ...

    Alternatively attribute values can be URIs themselves.
    '''
    attributes = []
    class_labels = []
    examples = []

    with open(data_file) as f:
        data_lines = f.readlines()
        domain = [a.strip() for a in data_lines[0].split(';')]
        attributes = domain[:-1]

        logger.debug('Attributes: %s' % str(attributes))
        logger.debug('# Examples: %d' % (len(data_lines) - 1))

        for ex_i, example_line in enumerate(data_lines[1:]):
            values = [v.strip() for v in example_line.split(';')]
            if len(values) != len(attributes) + 1:
                raise Exception('The number of values (%d) does not match the expected number of columns (%d) on line %d.' % (len(values), len(attributes) + 1, ex_i + 2))

            examples.append(values)

    for example in examples:
        # Write to rdf graph
        u = build_uri(example[0])
        g.add((u, rdflib.RDF.type, HEDWIG.Example))
        g.add((u, HEDWIG.class_label, rdflib.Literal(example[-1])))

        for att_idx, att in enumerate(attributes):
            
            # Skip the label 
            if att_idx == 0:
                continue

            attribute_value = example[att_idx]
            value_is_uri = attribute_value.startswith('http://')
            if not (value_is_uri or attribute_value == '1'):
                continue
            annotation_uri = build_uri(attribute_value) if value_is_uri else build_uri(att)
            blank = rdflib.BNode()
            g.add((u, HEDWIG.annotated_with, blank))
            g.add((blank, HEDWIG.annotation, annotation_uri))
Example #5
    def _propagate_annotation_names(self, g):
        # Query for annotation names
        for sub, obj in g.subject_objects(predicate=HEDWIG.annotation_name):
            sub, obj = str(sub), str(obj)
            self.annotation_name[sub].append(obj)
            logger.debug('Annotation name root: %s, %s' % (sub, obj))

        # Propagate the annotation names to children
        annotation_name_roots = list(self.annotation_name.keys())
        for pred in self.predicates:
            for annotation_root in annotation_name_roots:
                if annotation_root in self.super_classes(pred):
                    name = self.annotation_name[annotation_root]
                    self.annotation_name[pred] = name
Example #6
    def _propagate_annotation_names(self, g):
        to_uni = lambda s: unicode(s).encode('ascii', 'ignore')

        # Query for annotation names
        for sub, obj in g.subject_objects(predicate=HEDWIG.annotation_name):
            sub, obj = to_uni(sub), to_uni(obj)
            self.annotation_name[sub].append(obj)
            logger.debug('Annotation name root: %s, %s' % (sub, obj))

        # Propagate the annotation names to children
        annotation_name_roots = self.annotation_name.keys()
        for pred in self.predicates:
            for annotation_root in annotation_name_roots:
                if annotation_root in self.super_classes(pred):
                    name = self.annotation_name[annotation_root]
                    self.annotation_name[pred] = name
Example #7
    def _propagate_annotation_names(self, g):
        to_uni = lambda s: unicode(s).encode("ascii", "ignore")

        # Query for annotation names
        for sub, obj in g.subject_objects(predicate=HEDWIG.annotation_name):
            sub, obj = to_uni(sub), to_uni(obj)
            self.annotation_name[sub].append(obj)
            logger.debug("Annotation name root: %s, %s" % (sub, obj))

        # Propagate the annotation names to children
        annotation_name_roots = self.annotation_name.keys()
        for pred in self.predicates:
            for annotation_root in annotation_name_roots:
                if annotation_root in self.super_classes(pred):
                    name = self.annotation_name[annotation_root]
                    self.annotation_name[pred] = name
Example #8
    def _find_roots(self, all_annotations):
        # list() is needed so that roots can be appended to below
        # (filter() returns a lazy iterator on Python 3).
        roots = list(filter(lambda pred: not self.sub_class_of[pred], self.super_class_of.keys()))

        # Check for annotations not in the ontology to add them as roots
        for annotation in all_annotations:
            if annotation not in self.predicates:
                roots.append(annotation)
                logger.debug("Adding leaf %s as root, as it is not specified in the ontology" % annotation)

        logger.debug("Detected root nodes: %s" % str(roots))

        # Add a dummy root
        self.dummy_root = "root"
        self.predicates.add(self.dummy_root)
        for root in roots:
            self.add_sub_class(root, self.dummy_root)
Example #9
    def _find_roots(self, all_annotations):
        # list() is needed so that roots can be appended to below
        # (filter() returns a lazy iterator on Python 3).
        roots = list(filter(lambda pred: not self.sub_class_of[pred],
                            self.super_class_of.keys()))

        # Check for annotations not in the ontology to add them as roots
        for annotation in all_annotations:
            if annotation not in self.predicates:
                roots.append(annotation)
                logger.debug(
                    'Adding leaf %s as root, as it is not specified in the ontology'
                    % annotation)

        logger.debug('Detected root nodes: %s' % str(roots))

        # Add a dummy root
        self.dummy_root = 'root'
        self.predicates.add(self.dummy_root)
        for root in roots:
            self.add_sub_class(root, self.dummy_root)
Example #10
    def __induce_level(self, rules):
        '''
        Specializes the rules for the last level with unary predicates.
        '''
        while True:
            old_score = self.group_score(rules)
            new_rules = rules[:]
            for i, rule in enumerate(rules):
                specializations = self.specialize(rule)
                self.extend(new_rules, specializations)

            # Take the first N rules
            rules = sorted(new_rules,
                           key=lambda rule: rule.score,
                           reverse=True)[:self.n]

            new_score = self.group_score(rules)

            logger.debug("Old score: %.3f, New score: %.3f" % (old_score, new_score))

            if 1 - abs(old_score/(new_score+0.0001)) < 0.01:
                break

        return rules
Example #11
    def __init__(self, triplets, score_fun, instances_as_leaves=True):
        '''
        Initialize the knowledge base from the given triplet (RDF) graph:
        build the examples, the subClassOf hierarchy and its closure, and
        propagate the annotation names to the predicates.
        '''
        self.instances_as_leaves = instances_as_leaves
        self.score_fun = score_fun
        self.sub_class_of = defaultdict(list)
        self.super_class_of = defaultdict(list)
        self.predicates = set()
        self.binary_predicates = set()
        self.class_values = set()
        self.annotation_name = defaultdict(list)

        self.examples, all_annotations = self._build_examples(triplets)

        # Ranked or class-labeled data
        self.target_type = self.examples[0].target_type

        self._build_subclassof(triplets)
        self._calc_predicate_members(triplets)
        self._find_roots(all_annotations)
        self._calc_members_closure()
        self._calc_binary_members()
        self._propagate_annotation_names(triplets)

        # Statistics
        if self.target_type == Example.Ranked:
            self.mean = avg([ex.score for ex in self.examples])
            self.sd = std([ex.score for ex in self.examples])
        else:
            self.distribution = defaultdict(int)
            for ex in self.examples:
                self.distribution[ex.score] += 1
            logger.debug('Class distribution: %s' % str(self.distribution))
Example #12
    def __init__(self, triplets, score_fun, instances_as_leaves=True):
        """
        Initialize the knowledge base with the given triplet graph.
        The target class is given with 'target_class' - this is the
        class to be described in the induction step.
        """
        self.instances_as_leaves = instances_as_leaves
        self.score_fun = score_fun
        self.sub_class_of = defaultdict(list)
        self.super_class_of = defaultdict(list)
        self.predicates = set()
        self.binary_predicates = set()
        self.class_values = set()
        self.annotation_name = defaultdict(list)

        self.examples, all_annotations = self._build_examples(triplets)

        # Ranked or class-labeled data
        self.target_type = self.examples[0].target_type

        self._build_subclassof(triplets)
        self._calc_predicate_members(triplets)
        self._find_roots(all_annotations)
        self._calc_members_closure()
        self._calc_binary_members()
        self._propagate_annotation_names(triplets)

        # Statistics
        if self.target_type == Example.Ranked:
            self.mean = avg([ex.score for ex in self.examples])
            self.sd = std([ex.score for ex in self.examples])
        else:
            self.distribution = defaultdict(int)
            for ex in self.examples:
                self.distribution[ex.score] += 1
            logger.debug("Class distribution: %s" % str(self.distribution))
Example #13
    def _calc_members_closure(self):
        self.sub_class_of_closure = defaultdict(set)
        for pred in list(self.super_class_of.keys()):
            self.sub_class_of_closure[pred].update(self.sub_class_of[pred])

        # Calc the closure to get the members of the subClassOf hierarchy
        def closure(pred, lvl, visited=[]):
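            # NB: the default list 'visited' is shared between calls, but every
            # frame removes what it appends, so it is empty again after a
            # successful traversal.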

            if pred in visited:
                raise Exception(
                    'Cycle detected in the hierarchy at predicate %s!' % pred)

            children = self.super_class_of[pred]
            self.levels[lvl].add(pred)

            if children:
                mems = set()
                visited.append(pred)
                for child in children:
                    parent_closure = self.sub_class_of_closure[pred]
                    self.sub_class_of_closure[child].update(parent_closure)
                    mems.update(closure(child, lvl + 1, visited=visited))
                self.members[pred].update(mems)
                visited.remove(pred)

                return self.members[pred]
            else:
                return self.members[pred]

        # Level-wise predicates
        self.levels = defaultdict(set)

        # Run the closure from root
        closure(self.dummy_root, 0)
        logger.debug('root members {}'.format(
            len(self.members[self.dummy_root])))
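
To make the recursion above concrete, here is a small standalone sketch of the same pattern over a toy hierarchy, using plain dicts instead of the class attributes. Note that the aggregated sets here are simply the transitive descendants, whereas in the code above members[...] holds example members computed elsewhere:

from collections import defaultdict

super_class_of = {'root': ['animal'],
                  'animal': ['dog', 'cat'],
                  'dog': [], 'cat': []}

levels = defaultdict(set)       # depth -> predicates at that depth
descendants = defaultdict(set)  # predicate -> all transitive children

def closure(pred, lvl, visited=()):
    if pred in visited:
        raise Exception('Cycle detected in the hierarchy at predicate %s!' % pred)
    levels[lvl].add(pred)
    for child in super_class_of.get(pred, []):
        descendants[pred].add(child)
        descendants[pred].update(closure(child, lvl + 1, visited + (pred,)))
    return descendants[pred]

closure('root', 0)
print(dict(levels))         # {0: {'root'}, 1: {'animal'}, 2: {'dog', 'cat'}}
print(descendants['root'])  # {'animal', 'dog', 'cat'}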
Example #14
    def specialize(self, rule):
        '''
        Returns a list of all specializations of 'rule'.
        '''
        is_unary = lambda p: isinstance(p, UnaryPredicate)

        def specialize_optimal_subclass(rule):
            rules = []
            eligible_preds = rule.shared_var[rule.latest_var]
            for pred in filter(is_unary, eligible_preds):
                for sub_class in self.get_subclasses(pred):
                    logger.debug('Swapping with %s' % sub_class)
                    new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                    if self.can_specialize(new_rule):
                        rules.append(new_rule)
                        rules.extend(specialize_optimal_subclass(new_rule))
            return rules

        logger.debug('Specializing rule: %s' % rule)
        specializations = []
        eligible_preds = rule.shared_var[rule.latest_var]

        # Swapping unary predicates with subclasses, swap only
        # the predicates with the latest variable
        if not self.optimal_subclass:
            for pred in filter(is_unary, eligible_preds):
                logger.debug('Predicate to swap: %s' % pred.label)
                for sub_class in self.get_subclasses(pred):
                    logger.debug('Swapping with %s' % sub_class)
                    new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                    if self.can_specialize(new_rule):
                        specializations.append(new_rule)
        else:
            specializations.extend(specialize_optimal_subclass(rule))

        if self.use_negations:
            # Negate the last predicate
            for pred in filter(is_unary, eligible_preds):
                logger.debug('Predicate to negate: %s' % pred.label)
                new_rule = rule.clone_negate(pred)
                if self.can_specialize(new_rule):
                    specializations.append(new_rule)

        # This makes sure we are not specializing a default rule by appending,
        # this rule should instead be reached by the specialization step above.
        if not (len(eligible_preds) == 1 and
           (eligible_preds[0].label == self.kb.get_root().label or
           self.is_implicit_root(eligible_preds[0].label))):

            # Calculate the union of superclasses of each predicate
            supers = set()
            for pred in eligible_preds:
                supers.update(self.get_superclasses(pred.label))
                supers.add(pred)

            # Calculate the top-most left-most non-ancestor
            for lvl in sorted(self.kb.levels.keys()):

                level = self.kb.levels[lvl]
                diff = level.difference(supers)
                if diff:

                    # The next predicate to specialize with is the left-most
                    for pred in sorted(list(diff)):

                        # Appending a new predicate, the last predicate
                        # is always the producer
                        last_pred = rule.predicates[-1]
                        new_rule = rule.clone_append(pred,
                                                     producer_pred=last_pred)
                        if self.can_specialize(new_rule) and \
                           self.non_redundant(rule, new_rule):
                            specializations.append(new_rule)
                            break

        # Introduce new binary relation
        if isinstance(rule.predicates[-1], UnaryPredicate):
            specializations.extend(self.specialize_add_relation(rule))

        logger.debug('All specializations %s'
                     % [str(rule) for rule in specializations])

        return specializations
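
The "top-most left-most" selection above can be illustrated in isolation with plain sets: one candidate predicate is taken per level, namely the alphabetically first one that is not already an ancestor of the rule's predicates (ignoring the can_specialize / non_redundant checks the real code applies). The level layout and names below are made up:

levels = {0: {'root'}, 1: {'animal', 'plant'}, 2: {'cat', 'dog', 'tree'}}
supers = {'root', 'animal', 'dog'}   # the rule's predicates and their superclasses

for lvl in sorted(levels):
    diff = levels[lvl] - supers
    if diff:
        print(lvl, sorted(diff)[0])  # prints: 1 plant, then 2 cat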
Example #15
    def specialize(self, rule):
        '''
        Returns a list of all specializations of 'rule'.
        '''
        is_unary = lambda p: isinstance(p, UnaryPredicate)

        def specialize_optimal_subclass(rule):
            rules = []
            eligible_preds = rule.shared_var[rule.latest_var]
            for pred in filter(is_unary, eligible_preds):
                for sub_class in self.get_subclasses(pred):
                    logger.debug('Swapping with %s' % sub_class)
                    new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                    if self.can_specialize(new_rule):
                        rules.append(new_rule)
                        rules.extend(specialize_optimal_subclass(new_rule))
            return rules

        logger.debug('Specializing rule: %s' % rule)
        specializations = []
        eligible_preds = rule.shared_var[rule.latest_var]

        # Swapping unary predicates with subclasses, swap only
        # the predicates with the latest variable
        if not self.optimal_subclass:
            for pred in filter(is_unary, eligible_preds):
                logger.debug('Predicate to swap: %s' % pred.label)
                for sub_class in self.get_subclasses(pred):
                    logger.debug('Swapping with %s' % sub_class)
                    new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                    if self.can_specialize(new_rule):
                        specializations.append(new_rule)
        else:
            specializations.extend(specialize_optimal_subclass(rule))

        if self.use_negations:
            # Negate the last predicate
            for pred in filter(is_unary, eligible_preds):
                logger.debug('Predicate to negate: %s' % pred.label)
                new_rule = rule.clone_negate(pred)
                if self.can_specialize(new_rule):
                    specializations.append(new_rule)

        # This makes sure we are not specializing a default rule by appending,
        # this rule should instead be reached by the specialization step above.
        if not (len(eligible_preds) == 1 and
                (eligible_preds[0].label == self.kb.get_root().label
                 or self.is_implicit_root(eligible_preds[0].label))):

            # Calculate the union of superclasses of each predicate
            supers = set()
            for pred in eligible_preds:
                supers.update(self.get_superclasses(pred.label))
                supers.add(pred)

            # Calculate the top-most left-most non-ancestor
            for lvl in sorted(self.kb.levels.keys()):

                level = self.kb.levels[lvl]
                diff = level.difference(supers)
                if diff:

                    # The next predicate to specialize with is the left-most
                    for pred in sorted(list(diff)):

                        # Appending a new predicate, the last predicate
                        # is always the producer
                        last_pred = rule.predicates[-1]
                        new_rule = rule.clone_append(pred,
                                                     producer_pred=last_pred)
                        if self.can_specialize(new_rule) and \
                           self.non_redundant(rule, new_rule):
                            specializations.append(new_rule)
                            break

        # Introduce new binary relation
        if isinstance(rule.predicates[-1], UnaryPredicate):
            specializations.extend(self.specialize_add_relation(rule))

        logger.debug('All specializations %s' %
                     [str(rule) for rule in specializations])

        return specializations