Beispiel #1
0
def _extract_axioms(category_graph, patterns):
    """Return axioms extracted from `category_graph` by applying `patterns` to all categories.

    Three lookup dictionaries are built from `patterns` (front-only, back-only and front+back patterns) and
    handed, together with the by-phrase-free label and the statistics of every content node of
    `category_graph`, to `_extract_axioms_for_cat`, which is run in a process pool.

    :param category_graph: graph whose `content_nodes` are the categories to process.
    :param patterns: patterns that are passed on to `_get_confidence_pattern_set`.
    :return: dict mapping each category to its extracted axioms; categories without axioms are omitted.
    """
    utils.get_logger().debug('CATEGORY/CAT2AX: Extracting axioms..')

    # process front/back/front+back patterns individually to reduce computational complexity
    pattern_dicts = []
    for use_front, use_back in ((True, False), (False, True), (True, True)):
        pattern_dict = {}
        confidence_patterns = _get_confidence_pattern_set(patterns, use_front, use_back)
        for (front_pattern, back_pattern), axiom_patterns in confidence_patterns.items():
            # The back pattern is inserted in reversed order below the front pattern.
            # NOTE(review): the lambda closes over the loop variables; this is only correct if `_fill_dict`
            # invokes it before the next iteration (as the original code already assumed) - confirm.
            _fill_dict(pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))
        pattern_dicts.append(pattern_dict)
    front_pattern_dict, back_pattern_dict, enclosing_pattern_dict = pattern_dicts

    # every worker call receives everything it needs as a single tuple
    cat_contexts = [(
        cat,
        nlp_util.remove_by_phrase(cat_store.get_label(cat)),
        cat_store.get_statistics(cat),
        front_pattern_dict, back_pattern_dict, enclosing_pattern_dict
    ) for cat in category_graph.content_nodes]

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        results = tqdm(pool.imap_unordered(_extract_axioms_for_cat, cat_contexts, chunksize=1000), total=len(cat_contexts), desc='CATEGORY/CAT2AX: Extracting axioms')
        # consume the results while the pool is still alive
        all_category_axioms = {cat: axioms for cat, axioms in results}
    category_axioms = {cat: axioms for cat, axioms in all_category_axioms.items() if axioms}  # filter out empty axioms

    utils.get_logger().debug(f'CATEGORY/CAT2AX: Extracted {sum(len(axioms) for axioms in category_axioms.values())} axioms for {len(category_axioms)} categories.')
    return category_axioms
Beispiel #2
0
def _compute_category_sets() -> dict:
    """Iterate over DBpedia categories and identify all category sets.

    1) Retrieve all usable categories (i.e. categories that are not used for maintenance/organisational purposes)
    2) Normalize the names of their children by removing by-phrases (e.g. "X by genre", "Y by country")
    3) For each category, search its children for name patterns (see '_find_child_sets')

    Returns a dict that maps a category to its child sets; categories without any child set are left out.
    """
    category_sets = {}
    for cat in cat_store.get_categories():
        # by-phrase-free label of every direct child of the category
        children_docs = {}
        for child in cat_store.get_children(cat):
            children_docs[child] = nlp_util.remove_by_phrase(cat_store.get_label(child))

        child_sets = _find_child_sets(cat, children_docs)
        if not child_sets:
            continue
        category_sets[cat] = child_sets
    return category_sets
Beispiel #3
0
def _without_by_phrase(with_by_phrase: str, without_by_phrase: str):
    """Assert that removing the by-phrase from `with_by_phrase` yields `without_by_phrase`.

    The comparison ignores case. An AssertionError naming both strings is raised on a mismatch.
    """
    stripped = nlp_util.remove_by_phrase(with_by_phrase, return_doc=False)
    assert stripped.lower() == without_by_phrase.lower(), \
        f'{with_by_phrase} should be converted to {without_by_phrase}'
Beispiel #4
0
def _get_match_for_category(category: str, first_words: tuple, last_words: tuple) -> str:
    """Return variable part of the category name.

    The label of `category` is freed from its by-phrase; afterwards as many leading elements as there are
    `first_words` and as many trailing elements as there are `last_words` are cut off. The text of the
    remaining span is returned.
    """
    label_doc = nlp_util.remove_by_phrase(cat_store.get_label(category))
    start = len(first_words)
    # computed from the total length (instead of a negative index) so that empty `last_words` keep the tail
    end = len(label_doc) - len(last_words)
    return label_doc[start:end].text
Beispiel #5
0
def _extract_axioms(graph, patterns) -> dict:
    """Run actual axiom extraction on CaLiGraph.

    For every content node of `graph`, the names of its category and list parts are collected and freed from
    by-phrases. Each resulting label is matched against the front-only, back-only and front+back patterns via
    `_find_axioms`. The axioms found for one label are grouped by their predicate (index 1 of an axiom) and
    only the axiom with the highest value at index 3 is kept per predicate.

    :param graph: graph providing `content_nodes`, `get_parts` and `get_property_frequencies`.
    :param patterns: patterns that are passed on to `_get_confidence_pattern_set`.
    :return: defaultdict mapping a node to the set of axioms extracted for it.
    """
    utils.get_logger().debug('CaLi2Ax: Extracting axioms..')
    axioms = defaultdict(set)

    # One lookup dict per pattern kind, in the order: front-only, back-only, front+back (enclosing).
    pattern_dicts = []
    for use_front, use_back in ((True, False), (False, True), (True, True)):
        pattern_dict = {}
        confidence_patterns = _get_confidence_pattern_set(patterns, use_front, use_back)
        for (front_pattern, back_pattern), axiom_patterns in confidence_patterns.items():
            # The back pattern is inserted in reversed order below the front pattern.
            # NOTE(review): the lambda closes over the loop variables; this is only correct if `_fill_dict`
            # invokes it before the next iteration (as the original code already assumed) - confirm.
            cat_axioms._fill_dict(
                pattern_dict,
                list(front_pattern),
                lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))
        pattern_dicts.append(pattern_dict)

    for node in graph.content_nodes:
        property_frequencies = graph.get_property_frequencies(node)

        # a node can consist of several categories/lists - collect the name of each of them
        node_labels = set()
        for part in graph.get_parts(node):
            if cat_util.is_category(part):
                node_labels.add(cat_util.category2name(part))
            elif list_util.is_listcategory(part) or list_util.is_listpage(part):
                node_labels.add(list_util.list2name(part))

        labels_without_by_phrases = [nlp_util.remove_by_phrase(label, return_doc=True) for label in node_labels]
        for node_doc in labels_without_by_phrases:
            # at most one axiom per pattern kind
            node_axioms = []
            for pattern_dict in pattern_dicts:
                prop_axiom = _find_axioms(pattern_dict, node, node_doc, property_frequencies)
                if prop_axiom:
                    node_axioms.append(prop_axiom)

            # group the axioms by predicate in a single pass
            prop_axioms_by_pred = defaultdict(set)
            for prop_axiom in node_axioms:
                prop_axioms_by_pred[prop_axiom[1]].add(prop_axiom)

            for pred, similar_prop_axioms in prop_axioms_by_pred.items():
                if dbp_store.is_object_property(pred):
                    res_labels = {a[2]: dbp_store.get_label(a[2]) for a in similar_prop_axioms}
                    # Discard an axiom if the label of its value is contained in a different label of a
                    # competing axiom, i.e. prefer the value with the more specific label.
                    # NOTE(review): assumes `dbp_store.get_label` always returns a string - confirm.
                    similar_prop_axioms = {
                        a for a in similar_prop_axioms
                        if all(res_labels[a[2]] == val or res_labels[a[2]] not in val for val in res_labels.values())
                    }
                best_prop_axiom = max(similar_prop_axioms, key=operator.itemgetter(3))
                axioms[node].add(best_prop_axiom)

    # counted under a separate name so the generator variable does not shadow the result dict
    axiom_count = sum(len(node_axioms) for node_axioms in axioms.values())
    utils.get_logger().debug(f'CaLi2Ax: Extracted {axiom_count} axioms for {len(axioms)} categories.')
    return axioms