Beispiel #1
0
def _find_Y(X: Span, subcat_uri: str):
    """Return Y if the label of `subcat_uri` follows one of the patterns 'YX' or 'X <prep> Y'.

    :param X: the span whose (lower-cased) text is searched for in the label of the subcategory.
    :param subcat_uri: identifier of the subcategory; its label is retrieved from `cat_store`.
    :return: the part of the parsed subcategory label that is not X (pattern 'YX': everything before
        the trailing X; pattern 'X <prep> Y': everything after the single adposition),
        or None if no pattern applies or the match is rejected.
    """
    x_text = X.text.lower()
    subcat_label = cat_store.get_label(subcat_uri)  # fetched once and reused for the check and the parse
    if x_text not in subcat_label.lower():
        # cheap substring test first, so labels that cannot match are never parsed
        return None

    subcat = nlp_util.parse(subcat_label)
    subcat_text = subcat.text.lower()

    if subcat_text.endswith(' ' + x_text):  # "YX"
        # reject if nothing is left for Y or if the word right before X is an adposition
        if len(X) >= len(subcat) or subcat[-(len(X) + 1)].pos_ == 'ADP':
            return None
        return subcat[:-len(X)]

    if subcat_text.startswith(x_text + ' '):  # "X <prep> Y"
        adp_indices = [w.i for w in subcat if w.pos_ == 'ADP']
        if len(adp_indices) != 1:
            # only labels with exactly one adposition are accepted
            return None
        adp_index = adp_indices[0]
        Y = subcat[adp_index + 1:]
        if subcat[adp_index].text == 'by':
            # "X by Y" labels are rejected if the category has many child categories relative to its
            # resources, or if Y occurs in the label of a property used by one of its resources
            childcats = cat_store.get_children(subcat_uri)
            resources = cat_store.get_resources(subcat_uri)
            if len(childcats) * 10 >= len(resources):
                return None
            # the property labels are only looked up if the cheaper ratio check did not already decide;
            # the generator stops at the first label that contains Y
            y_text = Y.text.lower()
            if any(y_text in dbp_store.get_label(pred)
                   for res in resources for pred in dbp_store.get_properties(res)):
                return None
        return Y
    return None
Beispiel #2
0
def _extract_axioms(category_graph, patterns):
    """Return axioms extracted from `category_graph` by applying `patterns` to all categories.

    :param category_graph: graph whose `content_nodes` are the categories to process.
    :param patterns: patterns that are handed to `_get_confidence_pattern_set`.
    :return: dict mapping a category to its extracted axioms; categories without axioms are omitted.
    """
    utils.get_logger().debug('CATEGORY/CAT2AX: Extracting axioms..')

    def build_pattern_dict(front: bool, back: bool) -> dict:
        """Collect the patterns selected by the two flags into one dict via `_fill_dict`."""
        # NOTE(review): the flags are passed through to `_get_confidence_pattern_set` unchanged;
        # presumably they select patterns having a front and/or back part - confirm against the helper.
        pattern_dict = {}
        selected_patterns = _get_confidence_pattern_set(patterns, front, back)
        for (front_pattern, back_pattern), axiom_patterns in selected_patterns.items():
            _fill_dict(pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))
        return pattern_dict

    # process front/back/front+back patterns individually to reduce computational complexity
    front_pattern_dict = build_pattern_dict(True, False)
    back_pattern_dict = build_pattern_dict(False, True)
    enclosing_pattern_dict = build_pattern_dict(True, True)

    # every worker task receives the category, its label without by-phrase, its statistics and all pattern dicts
    cat_contexts = [(
        cat,
        nlp_util.remove_by_phrase(cat_store.get_label(cat)),
        cat_store.get_statistics(cat),
        front_pattern_dict, back_pattern_dict, enclosing_pattern_dict
    ) for cat in category_graph.content_nodes]

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        results = tqdm(pool.imap_unordered(_extract_axioms_for_cat, cat_contexts, chunksize=1000), total=len(cat_contexts), desc='CATEGORY/CAT2AX: Extracting axioms')
        category_axioms = dict(results)
    category_axioms = {cat: axioms for cat, axioms in category_axioms.items() if axioms}  # filter out empty axioms

    utils.get_logger().debug(f'CATEGORY/CAT2AX: Extracted {sum(len(axioms) for axioms in category_axioms.values())} axioms for {len(category_axioms)} categories.')
    return category_axioms
Beispiel #3
0
 def make_conceptual(self):
     """Reduce the graph to its conceptual nodes (i.e. those that represent a class in a taxonomy).

     A node is kept if `nlp_util.has_plural_lexhead_subjects` flags its label; the root node is always kept.
     Returns the graph itself.
     """
     labels = [cat_store.get_label(node) for node in self.nodes]
     nodes = self.nodes
     plural_flags = nlp_util.has_plural_lexhead_subjects(labels)

     # labels were created in node order, so flags and nodes can be walked in lockstep
     nodes_to_keep = set()
     for node, is_conceptual in zip(nodes, plural_flags):
         if is_conceptual:
             nodes_to_keep.add(node)
     nodes_to_keep.add(self.root_node)

     # drop everything that is neither conceptual nor the root
     self._remove_all_nodes_except(nodes_to_keep)
     return self
Beispiel #4
0
def _compute_category_sets() -> dict:
    """Collect the category sets of all categories known to `cat_store`.

    For every category returned by `cat_store.get_categories()`:
    1) Retrieve its children
    2) Normalize the children's labels by removing by-phrases (e.g. "X by genre", "Y by country")
    3) Search the normalized children for name patterns (see '_find_child_sets')

    Only categories for which at least one child set is found appear in the result.
    """
    sets_by_category = {}
    for category in cat_store.get_categories():
        normalized_children = {}
        for child in cat_store.get_children(category):
            normalized_children[child] = nlp_util.remove_by_phrase(cat_store.get_label(child))

        found_sets = _find_child_sets(category, normalized_children)
        if not found_sets:
            continue
        sets_by_category[category] = found_sets
    return sets_by_category
Beispiel #5
0
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules from `pattern_dict` to the label of `cat` and return the implied axioms.

    The words of the label that are not covered by the detected pattern form the value of the main axiom.
    If the rule carries a type, the value is converted to a resource and the rule only fires if that
    resource is known and has the type. An empty set is returned if no rule fires.
    """
    words = cat_store.get_label(cat).split(' ')

    matched_rule, matched_lengths = _detect_pattern(pattern_dict, words)
    if not matched_rule:
        return set()

    (pred, pred_type), extra_axioms = matched_rule
    # a length of 0 is turned into None so that the slice stays open on that side
    start = matched_lengths[0] or None
    stop = -1 * matched_lengths[1] or None
    resource = ' '.join(words[start:stop])

    if pred_type:
        resource = dbp_util.name2resource(resource)
        if resource not in dbp_store.get_resources():
            return set()
        if pred_type not in dbp_store.get_transitive_types(resource):
            return set()

    axioms = {(cat, pred, resource)}
    axioms.update((cat, extra_pred, extra_val) for extra_pred, extra_val in extra_axioms)
    return axioms
Beispiel #6
0
def parse_category(category: str) -> Doc:
    """Look up the label of `category` in `cat_store` and return it parsed by `nlp_util.parse`."""
    return nlp_util.parse(cat_store.get_label(category))
Beispiel #7
0
def _extract_axioms_with_rules(cat_dfs: dict) -> set:
    """Return axioms generated by applying C-DF rules.

    The function works in three steps:
    1) Generate rule candidates: for every non-type fact of a category whose value label occurs in the
       category label, the words before/after the value form the word pattern of a candidate.
    2) Filter the candidates with the thresholds `cdf.min_support` and `cdf.beta`.
    3) Apply the remaining rules to all usable categories (see '_apply_rules').

    :param cat_dfs: maps a category to a pair whose first element is the collection of its facts
        (tuples of predicate and value); the second element is ignored here.
    :return: set of axioms produced by '_apply_rules'.
    """

    def build_pattern_dict(rules: dict) -> dict:
        """Feed every rule into `_fill_dict`, keyed by its front words and then its reversed back words."""
        pattern_dict = {}
        for (front_pattern, back_pattern), axiom_pattern in rules.items():
            _fill_dict(
                pattern_dict, list(front_pattern),
                lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_pattern))
        return pattern_dict

    # generate rule candidates by extracting shared pre-/postfixes
    cdf_rule_candidates = defaultdict(lambda: defaultdict(int))
    label_mapping = dbp_store._get_label_mapping()  # looked up once instead of twice per fact
    for cat, (df, _) in cat_dfs.items():
        cat_label = cat_store.get_label(cat)
        facts = set(df)
        for f in facts:
            if f[0] == rdf_util.PREDICATE_TYPE:
                continue

            is_resource = dbp_util.is_dbp_resource(f[1])
            if is_resource:
                f_label = label_mapping[f[1]] if f[1] in label_mapping else dbp_util.object2name(f[1])
            else:
                f_label = f[1]

            label_start = cat_label.find(f_label)
            if label_start < 0:
                continue  # the value of the fact is not mentioned in the category label

            first_words = cat_label[:label_start].strip()
            first_words = tuple(first_words.split(' ')) if first_words else tuple()
            last_words = cat_label[label_start + len(f_label):].strip()
            last_words = tuple(last_words.split(' ')) if last_words else tuple()
            if not (first_words or last_words):
                continue  # the label consists of the value only, so there is no word pattern

            f_types = dbp_store.get_independent_types(dbp_store.get_types(f[1])) if is_resource else set()
            # pick one (arbitrary) type without removing it from the returned collection
            # NOTE(review): the collection may be shared/cached by dbp_store - confirm
            f_type = next(iter(f_types), None) if f_types else None

            # the remaining facts are part of the candidate key; sorting gives equal fact sets the
            # same key (iteration order of a set is arbitrary and would split the support count)
            other_facts = tuple(sorted(facts - {f}, key=repr))
            cdf_rule_candidates[(first_words, last_words)][((f[0], f_type), other_facts)] += 1

    # filter rules using the threshold parameters min_support and beta
    cdf_rules = {}
    min_support = util.get_config('cdf.min_support')
    beta = util.get_config('cdf.beta')
    for word_patterns, candidates in cdf_rule_candidates.items():
        total_support = sum(candidates.values())
        valid_axiom_patterns = [
            pattern for pattern, support in candidates.items()
            if support >= min_support and (support / total_support) >= beta
        ]
        if valid_axiom_patterns:
            cdf_rules[word_patterns] = valid_axiom_patterns[0]

    # apply the patterns to all categories in order to extract axioms
    # (the rules are applied individually depending on whether the pattern is at the front, back, or front+back in order to reduce computational complexity)
    cdf_front_pattern_dict = build_pattern_dict({
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if word_patterns[0] and not word_patterns[1]
    })
    cdf_back_pattern_dict = build_pattern_dict({
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if not word_patterns[0] and word_patterns[1]
    })
    cdf_enclosing_pattern_dict = build_pattern_dict({
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if word_patterns[0] and word_patterns[1]
    })

    pattern_dicts = (cdf_front_pattern_dict, cdf_back_pattern_dict, cdf_enclosing_pattern_dict)
    rule_axioms = set()
    for cat in cat_store.get_usable_cats():
        for pattern_dict in pattern_dicts:
            rule_axioms.update(_apply_rules(pattern_dict, cat))
    return rule_axioms
Beispiel #8
0
def _get_match_for_category(category: str, first_words: tuple, last_words: tuple) -> str:
    """Return the text of the category label that lies between the front and back pattern words.

    The label is first normalized with `nlp_util.remove_by_phrase`; afterwards as many elements as there
    are `first_words` are cut from the front and as many as there are `last_words` from the back.
    """
    label_doc = nlp_util.remove_by_phrase(cat_store.get_label(category))
    start = len(first_words)
    end = len(label_doc) - len(last_words)
    variable_part = label_doc[start:end]
    return variable_part.text