Beispiel #1
0
def get_children(cat: str, include_listcategories=False) -> set:
    """Return the direct subcategories of `cat` in the category graph.

    An empty set is returned when `cat` is not a node of the graph. List
    categories (as decided by `list_util.is_listcategory`) are dropped from the
    result unless `include_listcategories` is set.
    """
    graph = _get_category_graph()
    if cat not in graph:
        return set()

    successors = graph.successors(cat)
    if include_listcategories:
        # Nothing to filter: every successor is kept.
        return set(successors)
    return {child for child in successors if not list_util.is_listcategory(child)}
Beispiel #2
0
def get_parents(cat: str, include_listcategories=False) -> set:
    """Return the direct supercategories of `cat` in the category graph.

    An empty set is returned when `cat` is not a node of the graph. List
    categories (as decided by `list_util.is_listcategory`) are dropped from the
    result unless `include_listcategories` is set.
    """
    graph = _get_category_graph()
    if cat not in graph:
        return set()

    predecessors = graph.predecessors(cat)
    if include_listcategories:
        # Nothing to filter: every predecessor is kept.
        return set(predecessors)
    return {parent for parent in predecessors if not list_util.is_listcategory(parent)}
Beispiel #3
0
 def remove_leaf_listcategories(self):
     """Remove childless list-category nodes that have no list page among their lists.

     Returns `self` so that calls can be chained.
     """
     def _is_removable(node) -> bool:
         # Checks run in the same order as a chained `and`: cheapest test first.
         if not list_util.is_listcategory(node):
             return False
         if self.children(node):
             return False
         return not any(list_util.is_listpage(lst) for lst in self.get_lists(node))

     removable_nodes = {node for node in self.nodes if _is_removable(node)}
     self._remove_nodes(removable_nodes)
     return self
Beispiel #4
0
def get_listcategories() -> set:
    """Return all categories that `list_util.is_listcategory` accepts.

    The set is computed on the first call and stored in the module-level global
    `__LISTCATEGORIES__`; later calls return that same set object.
    """
    global __LISTCATEGORIES__
    if '__LISTCATEGORIES__' in globals():
        return __LISTCATEGORIES__

    # First call: scan every category (list categories included) and cache the matches.
    all_categories = cat_store.get_categories(include_listcategories=True)
    __LISTCATEGORIES__ = set(filter(list_util.is_listcategory, all_categories))
    return __LISTCATEGORIES__
Beispiel #5
0
def _get_candidate_categories_for_list(lst, cat_graph) -> set:
    """Return the nodes of `cat_graph` that belong to the categories related to `lst`.

    For a list category the related categories are its parents in the category
    store; for anything else (treated as a list page) they are the union of its
    topic categories and its resource categories.
    """
    if list_util.is_listcategory(lst):
        related_categories = cat_store.get_parents(lst)
    else:
        topic_categories = cat_store.get_topic_categories(lst)
        related_categories = topic_categories | cat_store.get_resource_categories(lst)

    candidate_nodes = set()
    for category in related_categories:
        candidate_nodes.update(cat_graph.get_nodes_for_category(category))
    return candidate_nodes
Beispiel #6
0
    def statistics(self) -> str:
        """Return statistics of CaLiGraph in a printable format.

        The result is a newline-joined, two-column text table. The upper part
        lists graph-level figures (nodes, edges, predicates, parts, depth,
        branching factor, axioms); below the dashed separator come the
        resource-level figures (resources, types per resource, relations and
        the relation counts per resource).

        All averages and ratios are printed with two decimals so that they fit
        the value column. When the graph has no resources, the per-resource
        figures are reported as 0 instead of failing with a division by zero.
        """
        nodes = self.nodes
        leaf_nodes = {node for node in nodes if not self.children(node)}
        node_depths = self.depths()

        # --- graph-level figures ---
        class_count = len(nodes)
        classes_connected_to_dbpedia_count = len(
            {n for n in nodes if self.get_transitive_dbpedia_types(n)})
        edge_count = len(self.edges)
        predicate_count = len(self.get_all_predicates())
        axiom_predicate_count = len(
            {pred for axioms in self._node_axioms.values() for pred, _ in axioms})
        parts_count = len({p for n in nodes for p in self.get_parts(n)})
        cat_parts_count = len({p for n in nodes for p in self.get_category_parts(n)})
        list_parts_count = len({p for n in nodes for p in self.get_list_parts(n)})
        listcat_parts_count = len({
            p for n in nodes for p in self.get_parts(n)
            if list_util.is_listcategory(p)
        })
        # Average depth is taken over leaf nodes only.
        classtree_depth_avg = np.mean([node_depths[node] for node in leaf_nodes])
        # Nodes without outgoing edges are excluded from the branching factor.
        branching_factor_avg = np.mean(
            [d for _, d in self.graph.out_degree if d > 0])
        axiom_count = sum(len(axioms) for axioms in self._node_axioms.values())
        direct_node_axiom_count = len(
            {n for n in nodes if self.get_axioms(n, transitive=False)})
        node_axiom_count = len(
            {n for n in nodes if self.get_axioms(n, transitive=True)})

        # --- resource-level figures ---
        resources = self.get_all_resources()
        resource_count = len(resources)

        # A resource's types are its direct nodes plus all ancestors of those nodes.
        type_counts = []
        for res in resources:
            direct_types = self.get_nodes_for_resource(res)
            inherited_types = {
                anc for t in direct_types for anc in self.ancestors(t)
            }
            type_counts.append(len(direct_types | inherited_types))
        types_per_resource = np.mean(type_counts) if type_counts else 0.0

        relations = self.get_all_relations()
        # In-degree only counts relations whose element at index 2 is accepted
        # by clg_util.is_clg_resource.
        incoming_relations = {
            r for r in relations if clg_util.is_clg_resource(r[2])
        }
        if resource_count:
            in_degree = len(incoming_relations) / resource_count
            out_degree = len(relations) / resource_count
        else:
            # No resources: avoid ZeroDivisionError and report empty ratios.
            in_degree = out_degree = 0.0

        count_row = '{:<30} | {:>7}'.format
        float_row = '{:<30} | {:>7.2f}'.format

        return '\n'.join([
            '{:^40}'.format('STATISTICS'),
            '=' * 40,
            count_row('nodes', class_count),
            count_row('nodes below root', len(self.children(self.root_node))),
            count_row('nodes connected to DBpedia',
                      classes_connected_to_dbpedia_count),
            count_row('edges', edge_count),
            count_row('predicates', predicate_count),
            count_row('axiom predicates', axiom_predicate_count),
            count_row('parts', parts_count),
            count_row('category parts', cat_parts_count),
            count_row('list parts', list_parts_count),
            count_row('listcat parts', listcat_parts_count),
            float_row('classtree depth', classtree_depth_avg),
            float_row('branching factor', branching_factor_avg),
            count_row('axioms', axiom_count),
            count_row('nodes with direct axiom', direct_node_axiom_count),
            count_row('nodes with axiom', node_axiom_count),
            '-' * 40,
            count_row('resources', resource_count),
            float_row('types per resource', types_per_resource),
            count_row('relations', len(relations)),
            float_row('resource in-degree', in_degree),
            float_row('resource out-degree', out_degree),
        ])
Beispiel #7
0
def get_categories(include_listcategories=False) -> set:
    """Return the categories contained in the category graph.

    List categories (as decided by `list_util.is_listcategory`) are left out
    unless `include_listcategories` is set.

    NOTE(review): the previous docstring stated that hidden and organisational
    categories are not part of the result; that filtering is not visible here
    and presumably happens when the category graph is built - confirm.
    """
    all_nodes = set(_get_category_graph())
    if include_listcategories:
        return all_nodes
    return {node for node in all_nodes if not list_util.is_listcategory(node)}
Beispiel #8
0
def _extract_axioms(graph, patterns) -> dict:
    """Run actual axiom extraction on CaLiGraph.

    Three lookup dictionaries are built from `patterns`, one per flag
    combination passed to `_get_confidence_pattern_set`. They are then matched
    against the by-phrase-free labels of every content node of `graph`. Per
    label and predicate only the candidate with the highest value at index 3 is
    kept.

    Returns a mapping from node to the set of axioms extracted for it.
    """
    utils.get_logger().debug('CaLi2Ax: Extracting axioms..')
    axioms = defaultdict(set)

    # Flag pairs in processing order; judging by the original variable names these
    # select front-only, back-only and enclosing patterns respectively.
    pattern_dicts = []
    for flags in ((True, False), (False, True), (True, True)):
        pattern_dict = {}
        confidence_patterns = _get_confidence_pattern_set(patterns, *flags)
        for (front_pattern, back_pattern), axiom_patterns in confidence_patterns.items():
            # The front part is inserted as-is; the callback continues with the
            # reversed back part and finally the axiom patterns.
            cat_axioms._fill_dict(
                pattern_dict,
                list(front_pattern),
                lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns),
            )
        pattern_dicts.append(pattern_dict)

    for node in graph.content_nodes:
        property_frequencies = graph.get_property_frequencies(node)

        # Collect the distinct names of all category / list parts of the node.
        node_labels = set()
        for part in graph.get_parts(node):
            if cat_util.is_category(part):
                node_labels.add(cat_util.category2name(part))
            elif list_util.is_listcategory(part) or list_util.is_listpage(part):
                node_labels.add(list_util.list2name(part))

        label_docs = [nlp_util.remove_by_phrase(label, return_doc=True) for label in node_labels]
        for node_doc in label_docs:
            # One lookup per pattern dictionary; empty results are discarded.
            matches = [
                _find_axioms(candidate_dict, node, node_doc, property_frequencies)
                for candidate_dict in pattern_dicts
            ]
            node_axioms = [match for match in matches if match]

            # Group the candidates by their predicate (index 1).
            axioms_by_pred = {}
            for candidate in node_axioms:
                axioms_by_pred.setdefault(candidate[1], set()).add(candidate)

            for pred, similar_prop_axioms in axioms_by_pred.items():
                if dbp_store.is_object_property(pred):
                    res_labels = {a[2]: dbp_store.get_label(a[2]) for a in similar_prop_axioms}
                    # Drop candidates whose label is contained in a different
                    # candidate's label, i.e. keep only the most specific ones.
                    similar_prop_axioms = {
                        a for a in similar_prop_axioms
                        if all(res_labels[a[2]] == val or res_labels[a[2]] not in val
                               for val in res_labels.values())
                    }
                axioms[node].add(max(similar_prop_axioms, key=operator.itemgetter(3)))

    utils.get_logger().debug(
        f'CaLi2Ax: Extracted {sum(len(axioms) for axioms in axioms.values())} axioms for {len(axioms)} categories.'
    )
    return axioms