def get_children(cat: str, include_listcategories=False) -> set:
    """Return all direct subcategories for the given category."""
    graph = _get_category_graph()
    if cat not in graph:
        # unknown category -> no subcategories
        return set()
    subcats = set()
    for child in graph.successors(cat):
        if include_listcategories or not list_util.is_listcategory(child):
            subcats.add(child)
    return subcats
def get_parents(cat: str, include_listcategories=False) -> set:
    """Return all direct supercategories for the given category."""
    graph = _get_category_graph()
    if cat not in graph:
        # unknown category -> no supercategories
        return set()
    supercats = graph.predecessors(cat)
    if include_listcategories:
        return set(supercats)
    return {p for p in supercats if not list_util.is_listcategory(p)}
def remove_leaf_listcategories(self):
    """Remove leaf list-category nodes whose lists contain no list page; return self (fluent)."""
    obsolete = set()
    for node in self.nodes:
        # same short-circuit order as the filter conditions: cheap checks first
        if not list_util.is_listcategory(node):
            continue
        if self.children(node):
            continue
        if any(list_util.is_listpage(p) for p in self.get_lists(node)):
            continue
        obsolete.add(node)
    self._remove_nodes(obsolete)
    return self
def get_listcategories() -> set:
    """Return all list categories (i.e. categories starting with 'Lists of')."""
    global __LISTCATEGORIES__
    # memoize the result in a module-level global on first call
    if '__LISTCATEGORIES__' not in globals():
        all_categories = cat_store.get_categories(include_listcategories=True)
        __LISTCATEGORIES__ = set(filter(list_util.is_listcategory, all_categories))
    return __LISTCATEGORIES__
def _get_candidate_categories_for_list(lst, cat_graph) -> set:
    """Return all graph nodes whose categories are candidates for the given list."""
    if list_util.is_listcategory(lst):
        # list category: its direct parent categories are the candidates
        candidate_cats = cat_store.get_parents(lst)
    else:
        # list page: union of topic categories and resource categories
        candidate_cats = cat_store.get_topic_categories(lst) | cat_store.get_resource_categories(lst)
    nodes = set()
    for cat in candidate_cats:
        nodes.update(cat_graph.get_nodes_for_category(cat))
    return nodes
def statistics(self) -> str:
    """Return statistics of CaLiGraph in a printable format.

    Computes node/edge/part counts, averaged tree metrics and per-resource
    type/relation statistics and renders them as an aligned two-column table.
    """
    leaf_nodes = {node for node in self.nodes if not self.children(node)}
    node_depths = self.depths()

    # -- class/taxonomy metrics --
    class_count = len(self.nodes)
    classes_connected_to_dbpedia_count = len({n for n in self.nodes if self.get_transitive_dbpedia_types(n)})
    edge_count = len(self.edges)
    predicate_count = len(self.get_all_predicates())
    axiom_predicate_count = len({pred for axioms in self._node_axioms.values() for pred, _ in axioms})
    parts_count = len({p for n in self.nodes for p in self.get_parts(n)})
    cat_parts_count = len({p for n in self.nodes for p in self.get_category_parts(n)})
    list_parts_count = len({p for n in self.nodes for p in self.get_list_parts(n)})
    listcat_parts_count = len({p for n in self.nodes for p in self.get_parts(n) if list_util.is_listcategory(p)})
    # averages over leaves / internal nodes (np.mean of an empty list yields nan)
    classtree_depth_avg = np.mean([node_depths[node] for node in leaf_nodes])
    branching_factor_avg = np.mean([d for _, d in self.graph.out_degree if d > 0])
    # sum(...) over a generator instead of building an intermediate list
    axiom_count = sum(len(axioms) for axioms in self._node_axioms.values())
    direct_node_axiom_count = len({n for n in self.nodes if self.get_axioms(n, transitive=False)})
    node_axiom_count = len({n for n in self.nodes if self.get_axioms(n, transitive=True)})

    # -- resource/relation metrics --
    # NOTE(review): divisions below raise ZeroDivisionError for an empty graph — confirm resources is never empty
    resources = self.get_all_resources()
    # direct types plus all their ancestors, averaged per resource
    types_per_resource = np.mean([
        len(self.get_nodes_for_resource(r) | {tt for t in self.get_nodes_for_resource(r) for tt in self.ancestors(t)})
        for r in resources
    ])
    relations = self.get_all_relations()
    # in-degree counts only relations whose object is a CaLiGraph resource
    in_degree = len({r for r in relations if clg_util.is_clg_resource(r[2])}) / len(resources)
    out_degree = len(relations) / len(resources)

    return '\n'.join([
        '{:^40}'.format('STATISTICS'),
        '=' * 40,
        '{:<30} | {:>7}'.format('nodes', class_count),
        '{:<30} | {:>7}'.format('nodes below root', len(self.children(self.root_node))),
        '{:<30} | {:>7}'.format('nodes connected to DBpedia', classes_connected_to_dbpedia_count),
        '{:<30} | {:>7}'.format('edges', edge_count),
        '{:<30} | {:>7}'.format('predicates', predicate_count),
        '{:<30} | {:>7}'.format('axiom predicates', axiom_predicate_count),
        '{:<30} | {:>7}'.format('parts', parts_count),
        '{:<30} | {:>7}'.format('category parts', cat_parts_count),
        '{:<30} | {:>7}'.format('list parts', list_parts_count),
        '{:<30} | {:>7}'.format('listcat parts', listcat_parts_count),
        '{:<30} | {:>7.2f}'.format('classtree depth', classtree_depth_avg),
        '{:<30} | {:>7.2f}'.format('branching factor', branching_factor_avg),
        '{:<30} | {:>7}'.format('axioms', axiom_count),
        '{:<30} | {:>7}'.format('nodes with direct axiom', direct_node_axiom_count),
        '{:<30} | {:>7}'.format('nodes with axiom', node_axiom_count),
        '-' * 40,
        '{:<30} | {:>7}'.format('resources', len(resources)),
        # fix: these three are floats — format with .2f like the other float metrics to keep the table aligned
        '{:<30} | {:>7.2f}'.format('types per resource', types_per_resource),
        '{:<30} | {:>7}'.format('relations', len(relations)),
        '{:<30} | {:>7.2f}'.format('resource in-degree', in_degree),
        '{:<30} | {:>7.2f}'.format('resource out-degree', out_degree),
    ])
def get_categories(include_listcategories=False) -> set:
    """Return all categories that are not hidden or used as any kind of organisational category."""
    result = set()
    for node in _get_category_graph():
        if include_listcategories or not list_util.is_listcategory(node):
            result.add(node)
    return result
def _extract_axioms(graph, patterns) -> dict:
    """Run actual axiom extraction on CaLiGraph.

    Builds three nested pattern dictionaries (front-anchored, back-anchored,
    enclosing) from the confidence pattern sets, then matches every content
    node's labels against them and keeps the best axiom per predicate.

    :param graph: the CaLiGraph instance whose content nodes are processed
    :param patterns: the raw patterns fed to `_get_confidence_pattern_set`
    :return: dict mapping node -> set of extracted axiom tuples
    """
    utils.get_logger().debug('CaLi2Ax: Extracting axioms..')
    axioms = defaultdict(set)

    # Pattern dict for patterns anchored at the FRONT of a label (front=True, back=False).
    # NOTE(review): the lambda captures `back_pattern`/`axiom_patterns` by reference; this is
    # only safe if `_fill_dict` invokes it within the same loop iteration — confirm it does
    # not store the callable for later use (classic late-binding pitfall otherwise).
    front_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, False).items():
        cat_axioms._fill_dict(front_pattern_dict, list(front_pattern), lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    # Pattern dict for patterns anchored at the BACK of a label (front=False, back=True).
    back_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, False, True).items():
        cat_axioms._fill_dict(back_pattern_dict, list(front_pattern), lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    # Pattern dict for patterns anchored at BOTH ends (front=True, back=True).
    enclosing_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, True).items():
        cat_axioms._fill_dict(enclosing_pattern_dict, list(front_pattern), lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    for node in graph.content_nodes:
        property_frequencies = graph.get_property_frequencies(node)

        # Collect human-readable labels for the node from its category and list parts.
        node_labels = set()
        for part in graph.get_parts(node):
            if cat_util.is_category(part):
                node_labels.add(cat_util.category2name(part))
            elif list_util.is_listcategory(part) or list_util.is_listpage(part):
                node_labels.add(list_util.list2name(part))

        # Strip "by <X>" phrases; return_doc=True keeps the parsed docs for pattern matching.
        labels_without_by_phrases = [nlp_util.remove_by_phrase(label, return_doc=True) for label in node_labels]
        for node_doc in labels_without_by_phrases:
            # Try all three pattern dicts against this label; keep any hits.
            node_axioms = []
            front_prop_axiom = _find_axioms(front_pattern_dict, node, node_doc, property_frequencies)
            if front_prop_axiom:
                node_axioms.append(front_prop_axiom)
            back_prop_axiom = _find_axioms(back_pattern_dict, node, node_doc, property_frequencies)
            if back_prop_axiom:
                node_axioms.append(back_prop_axiom)
            enclosing_prop_axiom = _find_axioms(enclosing_pattern_dict, node, node_doc, property_frequencies)
            if enclosing_prop_axiom:
                node_axioms.append(enclosing_prop_axiom)

            # Group the found axioms by predicate (axiom tuple layout used below:
            # index 1 = predicate, index 2 = value/resource, index 3 = confidence score).
            prop_axioms_by_pred = {a[1]: {x for x in node_axioms if x[1] == a[1]} for a in node_axioms}
            for pred, similar_prop_axioms in prop_axioms_by_pred.items():
                if dbp_store.is_object_property(pred):
                    # For object properties, drop axioms whose resource label is a proper
                    # substring of another candidate's label (keeps the more specific one).
                    res_labels = {a[2]: dbp_store.get_label(a[2]) for a in similar_prop_axioms}
                    similar_prop_axioms = {a for a in similar_prop_axioms if all(res_labels[a[2]] == val or res_labels[a[2]] not in val for val in res_labels.values())}
                # Keep only the highest-confidence axiom per predicate.
                best_prop_axiom = max(similar_prop_axioms, key=operator.itemgetter(3))
                axioms[node].add(best_prop_axiom)

    # NOTE: the generator variable `axioms` shadows the outer dict here; it works, but reads oddly.
    utils.get_logger().debug(f'CaLi2Ax: Extracted {sum(len(axioms) for axioms in axioms.values())} axioms for {len(axioms)} categories.')
    return axioms