def __induce_level(self, rules):
    '''
    Specializes the rules for the last level with unary predicates.

    Repeatedly specializes every current rule, keeps only the best
    `self.n` rules by score, and stops once the relative improvement
    of the group score drops below 1%.
    '''
    while True:
        old_score = self.group_score(rules)
        new_rules = rules[:]

        # Collect every specialization of every current rule.
        # (The previous version used enumerate() but never used the index.)
        for rule in rules:
            specializations = self.specialize(rule)
            self.extend(new_rules, specializations)

        # Take the first N rules
        rules = sorted(new_rules, key=lambda rule: rule.score,
                       reverse=True)[:self.n]

        new_score = self.group_score(rules)
        logger.debug("Old score: %.3f, New score: %.3f" % (old_score, new_score))

        # Converged: relative change of the group score is under 1%.
        # The small epsilon guards against division by zero.
        if 1 - abs(old_score / (new_score + 0.0001)) < 0.01:
            break

    return rules
def specialize_optimal_subclass(rule):
    '''
    Recursively swaps each unary predicate bound to the rule's latest
    variable with every one of its subclasses, collecting all
    admissible specializations and their further specializations.

    NOTE(review): relies on enclosing scope for `self`, `is_unary`
    and `logger` — presumably a nested helper lifted out of a method;
    confirm against the caller.
    '''
    collected = []
    unary_preds = [p for p in rule.shared_var[rule.latest_var] if is_unary(p)]
    for pred in unary_preds:
        for sub_class in self.get_subclasses(pred):
            logger.debug('Swapping with %s' % sub_class)
            candidate = rule.clone_swap_with_subclass(pred, sub_class)
            if self.can_specialize(candidate):
                collected.append(candidate)
                collected.extend(specialize_optimal_subclass(candidate))
    return collected
def csv_parse_data(g, data_file):
    '''
    Assumes the following csv format:

    example_uri_or_label; attr_uri_1; attr_uri_2; ...; attr_uri_n
    http://example.org/uri_1; 0/1; 0/1; 0/1; 0/1; ...
    http://example.org/uri_2; 0/1; 0/1; 0/1; 0/1; ...
    ...

    Alternatively attribute values can be URIs themselves.

    Parses `data_file` and writes the examples into the rdflib graph `g`.
    Raises Exception on rows whose column count does not match the header.
    '''
    # (Removed the unused `class_labels` local and the dead
    # pre-initializations that were immediately overwritten.)
    examples = []
    with open(data_file) as f:
        data_lines = f.readlines()
        # Header row: every column except the last names an attribute;
        # the last column is the class label.
        domain = [a.strip() for a in data_lines[0].split(';')]
        attributes = domain[:-1]

        logger.debug('Attributes: %s' % str(attributes))
        logger.debug('# Examples: %d' % (len(data_lines) - 1))

        for ex_i, example_line in enumerate(data_lines[1:]):
            values = [v.strip() for v in example_line.split(';')]
            if len(values) != len(attributes) + 1:
                raise Exception(
                    'Whoa! The number of values %d != the number of attributes (%d) on line %d.'
                    % (len(values), len(attributes) + 1, ex_i + 2))
            examples.append(values)

    for example in examples:
        # Write to rdf graph
        u = build_uri(example[0])
        g.add((u, rdflib.RDF.type, HEDWIG.Example))
        g.add((u, HEDWIG.class_label, rdflib.Literal(example[-1])))

        for att_idx, att in enumerate(attributes):
            # Skip the label
            if att_idx == 0:
                continue

            attribute_value = example[att_idx]
            value_is_uri = attribute_value.startswith('http://')
            # Only '1' flags or URI values produce annotations.
            if not (value_is_uri or attribute_value == '1'):
                continue

            annotation_uri = build_uri(
                attribute_value) if value_is_uri else build_uri(att)

            blank = rdflib.BNode()
            g.add((u, HEDWIG.annotated_with, blank))
            g.add((blank, HEDWIG.annotation, annotation_uri))
def csv_parse_data(g, data_file):
    '''
    Assumes the following csv format:

    example_uri_or_label; attr_uri_1; attr_uri_2; ...; attr_uri_n
    http://example.org/uri_1; 0/1; 0/1; 0/1; 0/1; ...
    http://example.org/uri_2; 0/1; 0/1; 0/1; 0/1; ...
    ...

    Alternatively attribute values can be URIs themselves.
    '''
    attributes = []
    class_labels = []
    examples = []

    with open(data_file) as f:
        data_lines = f.readlines()

        # First line is the header; the final column is the class label.
        header = [column.strip() for column in data_lines[0].split(';')]
        attributes = header[:-1]

        logger.debug('Attributes: %s' % str(attributes))
        logger.debug('# Examples: %d' % (len(data_lines) - 1))

        for row_idx, line in enumerate(data_lines[1:]):
            row = [cell.strip() for cell in line.split(';')]
            if len(row) != len(attributes) + 1:
                raise Exception('Whoa! The number of values %d != the number of attributes (%d) on line %d.' % (len(row), len(attributes) + 1, row_idx + 2))
            examples.append(row)

    for example in examples:
        # Write to rdf graph
        example_uri = build_uri(example[0])
        g.add((example_uri, rdflib.RDF.type, HEDWIG.Example))
        g.add((example_uri, HEDWIG.class_label, rdflib.Literal(example[-1])))

        for att_idx, att in enumerate(attributes):
            if att_idx == 0:
                # Skip the label
                continue

            attribute_value = example[att_idx]
            value_is_uri = attribute_value.startswith('http://')
            if not (value_is_uri or attribute_value == '1'):
                continue

            if value_is_uri:
                annotation_uri = build_uri(attribute_value)
            else:
                annotation_uri = build_uri(att)

            blank = rdflib.BNode()
            g.add((example_uri, HEDWIG.annotated_with, blank))
            g.add((blank, HEDWIG.annotation, annotation_uri))
def _propagate_annotation_names(self, g):
    '''
    Collects `annotation_name` triples from graph `g` and propagates
    each root's name list down to every predicate beneath it.
    '''
    # Record the annotation name declared for each root node.
    for subject, label in g.subject_objects(predicate=HEDWIG.annotation_name):
        subject, label = str(subject), str(label)
        self.annotation_name[subject].append(label)
        logger.debug('Annotation name root: %s, %s' % (subject, label))

    # Snapshot the roots before the loop below inserts new keys
    # into the same defaultdict.
    roots = list(self.annotation_name.keys())
    for pred in self.predicates:
        for root in roots:
            if root in self.super_classes(pred):
                self.annotation_name[pred] = self.annotation_name[root]
def _propagate_annotation_names(self, g):
    '''
    Collects `annotation_name` triples from graph `g` and propagates
    each root's name list down to every predicate beneath it.
    '''
    # Query for annotation names.
    # FIX: `unicode(s).encode('ascii', 'ignore')` is Python 2 only
    # (`unicode` is a NameError on Python 3); plain str() matches the
    # Python 3 variant of this method used elsewhere in the file.
    for sub, obj in g.subject_objects(predicate=HEDWIG.annotation_name):
        sub, obj = str(sub), str(obj)
        self.annotation_name[sub].append(obj)
        logger.debug('Annotation name root: %s, %s' % (sub, obj))

    # Propagate the annotation names to children. Snapshot the keys
    # first: on Python 3 `.keys()` is a live view and the loop below
    # inserts new keys into the same dict.
    annotation_name_roots = list(self.annotation_name.keys())
    for pred in self.predicates:
        for annotation_root in annotation_name_roots:
            if annotation_root in self.super_classes(pred):
                name = self.annotation_name[annotation_root]
                self.annotation_name[pred] = name
def _propagate_annotation_names(self, g):
    '''
    Collects `annotation_name` triples from graph `g` and propagates
    each root's name list down to every predicate beneath it.
    '''
    # Query for annotation names.
    # FIX: `unicode(s).encode("ascii", "ignore")` is Python 2 only
    # (`unicode` is a NameError on Python 3); plain str() matches the
    # Python 3 variant of this method used elsewhere in the file.
    for sub, obj in g.subject_objects(predicate=HEDWIG.annotation_name):
        sub, obj = str(sub), str(obj)
        self.annotation_name[sub].append(obj)
        logger.debug("Annotation name root: %s, %s" % (sub, obj))

    # Propagate the annotation names to children. Snapshot the keys
    # first: on Python 3 `.keys()` is a live view and the loop below
    # inserts new keys into the same dict.
    annotation_name_roots = list(self.annotation_name.keys())
    for pred in self.predicates:
        for annotation_root in annotation_name_roots:
            if annotation_root in self.super_classes(pred):
                name = self.annotation_name[annotation_root]
                self.annotation_name[pred] = name
def _find_roots(self, all_annotations):
    '''
    Determines the root predicates of the hierarchy (those with no
    superclass), adds any annotation unknown to the ontology as an
    extra root, and attaches all roots under a dummy root node.
    '''
    # FIX: on Python 3, filter() returns an iterator which has no
    # .append(), so the subsequent roots.append(...) raised
    # AttributeError. Materialize a list instead.
    roots = [pred for pred in self.super_class_of.keys()
             if not self.sub_class_of[pred]]

    # Check for annotations not in the ontology to add them as roots
    for annotation in all_annotations:
        if annotation not in self.predicates:
            roots.append(annotation)
            logger.debug("Adding leaf %s as root, as it is not specified in the ontology" % annotation)

    logger.debug("Detected root nodes: %s" % str(roots))

    # Add a dummy root
    self.dummy_root = "root"
    self.predicates.add(self.dummy_root)
    for root in roots:
        self.add_sub_class(root, self.dummy_root)
def _find_roots(self, all_annotations):
    '''
    Determines the root predicates of the hierarchy (those with no
    superclass), adds any annotation unknown to the ontology as an
    extra root, and attaches all roots under a dummy root node.
    '''
    # FIX: on Python 3, filter() returns an iterator which has no
    # .append(), so the subsequent roots.append(...) raised
    # AttributeError. Materialize a list instead.
    roots = [pred for pred in self.super_class_of.keys()
             if not self.sub_class_of[pred]]

    # Check for annotations not in the ontology to add them as roots
    for annotation in all_annotations:
        if annotation not in self.predicates:
            roots.append(annotation)
            logger.debug(
                'Adding leaf %s as root, as it is not specified in the ontology' % annotation)

    logger.debug('Detected root nodes: %s' % str(roots))

    # Add a dummy root
    self.dummy_root = 'root'
    self.predicates.add(self.dummy_root)
    for root in roots:
        self.add_sub_class(root, self.dummy_root)
def __induce_level(self, rules):
    '''
    Specializes the rules for the last level with unary predicates.
    '''
    while True:
        old_score = self.group_score(rules)
        candidates = rules[:]

        # Grow the candidate pool with every specialization.
        for rule in rules:
            self.extend(candidates, self.specialize(rule))

        # Take the first N rules
        rules = sorted(candidates,
                       key=lambda candidate: candidate.score,
                       reverse=True)[:self.n]

        new_score = self.group_score(rules)
        logger.debug("Old score: %.3f, New score: %.3f" % (old_score, new_score))

        # Stop once the relative score change falls under 1%.
        if 1 - abs(old_score / (new_score + 0.0001)) < 0.01:
            break

    return rules
def __init__(self, triplets, score_fun, instances_as_leaves=True):
    '''
    Initialize the knowledge base with the given triplet graph.
    The target class is given with 'target_class' - this is the
    class to be described in the induction step.
    '''
    self.instances_as_leaves = instances_as_leaves
    self.score_fun = score_fun

    # Hierarchy and annotation bookkeeping.
    self.sub_class_of = defaultdict(list)
    self.super_class_of = defaultdict(list)
    self.predicates = set()
    self.binary_predicates = set()
    self.class_values = set()
    self.annotation_name = defaultdict(list)

    self.examples, all_annotations = self._build_examples(triplets)

    # Ranked or class-labeled data
    self.target_type = self.examples[0].target_type

    # Build the ontology structures from the triplet graph
    # (order matters: roots and closures depend on earlier steps).
    self._build_subclassof(triplets)
    self._calc_predicate_members(triplets)
    self._find_roots(all_annotations)
    self._calc_members_closure()
    self._calc_binary_members()
    self._propagate_annotation_names(triplets)

    # Statistics
    if self.target_type == Example.Ranked:
        scores = [ex.score for ex in self.examples]
        self.mean = avg(scores)
        self.sd = std(scores)
    else:
        self.distribution = defaultdict(int)
        for ex in self.examples:
            self.distribution[ex.score] += 1
        logger.debug('Class distribution: %s' % str(self.distribution))
def __init__(self, triplets, score_fun, instances_as_leaves=True):
    """
    Initialize the knowledge base with the given triplet graph.
    The target class is given with 'target_class' - this is the
    class to be described in the induction step.
    """
    # Whether ontology instances are treated as leaves of the hierarchy.
    self.instances_as_leaves = instances_as_leaves
    # Scoring function used to evaluate rules.
    self.score_fun = score_fun
    # subClassOf relation and its inverse, plus predicate bookkeeping.
    self.sub_class_of = defaultdict(list)
    self.super_class_of = defaultdict(list)
    self.predicates = set()
    self.binary_predicates = set()
    self.class_values = set()
    self.annotation_name = defaultdict(list)

    self.examples, all_annotations = self._build_examples(triplets)

    # Ranked or class-labeled data
    self.target_type = self.examples[0].target_type

    # Build the ontology structures from the triplet graph; each step
    # depends on the state set up by the previous ones.
    self._build_subclassof(triplets)
    self._calc_predicate_members(triplets)
    self._find_roots(all_annotations)
    self._calc_members_closure()
    self._calc_binary_members()
    self._propagate_annotation_names(triplets)

    # Statistics
    if self.target_type == Example.Ranked:
        # Mean and standard deviation of the example scores.
        self.mean = avg([ex.score for ex in self.examples])
        self.sd = std([ex.score for ex in self.examples])
    else:
        # Count how many examples carry each class value.
        self.distribution = defaultdict(int)
        for ex in self.examples:
            self.distribution[ex.score] += 1
        logger.debug("Class distribution: %s" % str(self.distribution))
def _calc_members_closure(self):
    '''
    Computes the transitive closure of the subClassOf relation,
    collects the member sets for every predicate, and records the
    depth level of each predicate in `self.levels`.
    Raises Exception if a cycle is detected in the hierarchy.
    '''
    self.sub_class_of_closure = defaultdict(set)
    for pred in list(self.super_class_of.keys()):
        self.sub_class_of_closure[pred].update(self.sub_class_of[pred])

    # Calc the closure to get the members of the subClassOf hierarchy
    def closure(pred, lvl, visited=None):
        # FIX: `visited` previously defaulted to a mutable [] shared
        # across calls; if the cycle exception ever fired, the dirty
        # list survived into later invocations. Use a None sentinel.
        if visited is None:
            visited = []
        if pred in visited:
            raise Exception(
                'Cycle detected in the hierarchy at predicate %s!' % pred)

        children = self.super_class_of[pred]
        self.levels[lvl].add(pred)

        if children:
            mems = set()
            visited.append(pred)
            for child in children:
                # Push this node's ancestors down to each child before
                # recursing, so closures accumulate top-down.
                parent_closure = self.sub_class_of_closure[pred]
                self.sub_class_of_closure[child].update(parent_closure)
                mems.update(closure(child, lvl + 1, visited=visited))
            self.members[pred].update(mems)
            visited.remove(pred)
            return self.members[pred]
        else:
            return self.members[pred]

    # Level-wise predicates
    self.levels = defaultdict(set)

    # Run the closure from root
    closure(self.dummy_root, 0)
    logger.debug('root members {}'.format(
        len(self.members[self.dummy_root])))
def specialize(self, rule):
    '''
    Returns a list of all specializations of 'rule'.
    '''
    # Predicate test: only unary predicates are swapped/negated below.
    is_unary = lambda p: isinstance(p, UnaryPredicate)

    # Exhaustive variant of subclass swapping: after each successful
    # swap, recurse to also collect the deeper specializations.
    def specialize_optimal_subclass(rule):
        rules = []
        eligible_preds = rule.shared_var[rule.latest_var]
        for pred in filter(is_unary, eligible_preds):
            for sub_class in self.get_subclasses(pred):
                logger.debug('Swapping with %s' % sub_class)
                new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                if self.can_specialize(new_rule):
                    rules.append(new_rule)
                    rules.extend(specialize_optimal_subclass(new_rule))
        return rules

    logger.debug('Specializing rule: %s' % rule)
    specializations = []
    # Only predicates sharing the most recently introduced variable
    # are eligible for specialization.
    eligible_preds = rule.shared_var[rule.latest_var]

    # Swapping unary predicates with subclasses, swap only
    # the predicates with the latest variable
    if not self.optimal_subclass:
        # One-step swap: each unary predicate is replaced by each of
        # its direct subclasses (no recursion into deeper subclasses).
        for pred in filter(is_unary, eligible_preds):
            logger.debug('Predicate to swap: %s' % pred.label)
            for sub_class in self.get_subclasses(pred):
                logger.debug('Swapping with %s' % sub_class)
                new_rule = rule.clone_swap_with_subclass(pred, sub_class)
                if self.can_specialize(new_rule):
                    specializations.append(new_rule)
    else:
        specializations.extend(specialize_optimal_subclass(rule))

    if self.use_negations:
        # Negate the last predicate
        for pred in filter(is_unary, eligible_preds):
            logger.debug('Predicate to negate: %s' % pred.label)
            new_rule = rule.clone_negate(pred)
            if self.can_specialize(new_rule):
                specializations.append(new_rule)

    # This makes sure we are not specializing a default rule by appending,
    # this rule should instead be reached by the specialization step above.
    if not (len(eligible_preds) == 1 and
            (eligible_preds[0].label == self.kb.get_root().label or
             self.is_implicit_root(eligible_preds[0].label))):

        # Calculate the union of superclasses of each predicate
        supers = set()
        for pred in eligible_preds:
            supers.update(self.get_superclasses(pred.label))
            supers.add(pred)

        # Calculate the top-most left-most non-ancestor
        for lvl in sorted(self.kb.levels.keys()):
            level = self.kb.levels[lvl]
            diff = level.difference(supers)
            if diff:
                # The next predicate to specialize with is the left-most
                for pred in sorted(list(diff)):
                    # Appending a new predicate, the last predicate
                    # is always the producer
                    last_pred = rule.predicates[-1]
                    new_rule = rule.clone_append(pred, producer_pred=last_pred)
                    if self.can_specialize(new_rule) and \
                       self.non_redundant(rule, new_rule):
                        specializations.append(new_rule)
                # Only the top-most level with candidates is used.
                break

    # Introduce new binary relation
    if isinstance(rule.predicates[-1], UnaryPredicate):
        specializations.extend(self.specialize_add_relation(rule))

    logger.debug('All specializations %s' %
                 [str(rule) for rule in specializations])
    return specializations