Example #1
def _extract_patterns() -> dict:
    """Return patterns extracted from parent-child relationships of categories.

    To identify such patterns, we exploit the fact that categories are organized in one of two ways:
    1) The parent name follows the pattern "X by Z" and the child name follows either the pattern "X <prep> Y" or "YX"
    2) The parent name follows the pattern "X" and the child name follows either the pattern "X <prep> Y" or "YX"
    """
    patterns = {}
    for cat in cat_store.get_usable_cats():
        # locate parents that follow the pattern "X by Z" or "X"
        X, Z = _find_X_and_Z(cat)
        if X:
            subcats = [
                c for c in cat_store.get_children(cat)
                if cat_store.is_usable(c)
            ]
            for subcat in subcats:
                # find Y by checking for the patterns "X <prep> Y" and "YX"
                Y = _find_Y(X, subcat)
                if Y:
                    if cat in patterns:
                        patterns[cat][2][subcat] = Y.text
                    else:
                        patterns[cat] = (X.text, Z, {subcat: Y.text})
    return patterns
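
A concrete instance may help: for a parent category "Albums by artist" (X = "Albums", Z = "artist") with the child "Albums by Madonna", the "X <prep> Y" pattern matches with the preposition "by" and yields Y = "Madonna". Below is a minimal sketch of the resulting structure; the category keys and values are illustrative, not taken from the repository:

# hypothetical shape of the dictionary returned by _extract_patterns()
patterns = {
    'Category:Albums_by_artist': (
        'Albums',                                   # X.text
        'artist',                                   # Z
        {'Category:Albums_by_Madonna': 'Madonna'},  # subcat -> Y.text
    ),
}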
Example #2
def _compute_category_sets() -> dict:
    """Iterate over DBpedia categories and identify all category sets.

    1) Retrieve all usable categories (i.e. categories that are not used for maintenance/organisational purposes)
    2) Normalize their names by removing by-phrases (e.g. "X by genre", "Y by country")
    3) For each category, retrieve all its children and search for name patterns (see '_find_child_sets')
    """
    category_sets = {}
    for cat in cat_store.get_usable_cats():
        children = {
            c for c in cat_store.get_children(cat)
            if cat_store.is_usable(c)
        }
        children_docs = {
            c: _remove_by_phrase(cat_nlp.parse_category(c))
            for c in children
        }
        child_sets = _find_child_sets(cat, children_docs)
        if child_sets:
            category_sets[cat] = child_sets
    return category_sets
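
The by-phrase removal in step 2 can be illustrated in isolation. The helper below is a simplified, hypothetical stand-in for _remove_by_phrase, which in the repository operates on parsed spaCy documents rather than raw strings:

import re

def remove_by_phrase_simplified(category_name: str) -> str:
    # strip a trailing 'by <criterion>' phrase, e.g. 'Albums by genre' -> 'Albums'
    return re.sub(r'\s+by\s+\w+$', '', category_name)

assert remove_by_phrase_simplified('Albums by genre') == 'Albums'
assert remove_by_phrase_simplified('People by country') == 'People'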
Example #3
# module-level imports required by this excerpt (cat_store, dbp_store, and
# util are project-internal modules of Cat2Ax)
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def _generate_dbpedia_coverage_graph():
    """Create graph of Figure 4a"""
    # retrieve data from extracted axioms and assertions
    cat2ax_relation_axioms = pd.read_csv(
        util.get_results_file('results.cat2ax.relation_axioms'), sep=';')
    cat2ax_type_axioms = pd.read_csv(
        util.get_results_file('results.cat2ax.type_axioms'), sep=';')
    cat2ax_relation_triples = pd.read_csv(
        util.get_results_file('results.cat2ax.relation_assertions'), sep=';')
    cat2ax_type_triples = pd.read_csv(
        util.get_results_file('results.cat2ax.type_assertions'), sep=';')

    catriple_relation_axioms = pd.read_csv(
        util.get_results_file('results.catriple.relation_axioms'), sep=';')
    catriple_relation_triples = pd.read_csv(
        util.get_results_file('results.catriple.relation_assertions'), sep=';')

    cdf_relation_axioms = pd.read_csv(
        util.get_results_file('results.cdf.relation_axioms'), sep=';')
    cdf_type_axioms = pd.read_csv(
        util.get_results_file('results.cdf.type_axioms'), sep=';')
    cdf_relation_triples = pd.read_csv(
        util.get_results_file('results.cdf.relation_assertions'), sep=';')
    cdf_type_triples = pd.read_csv(
        util.get_results_file('results.cdf.type_assertions'), sep=';')

    # retrieve unique entity counts
    cat2ax_cat_count = len(
        set(cat2ax_relation_axioms['cat'].unique())
        | set(cat2ax_type_axioms['cat'].unique()))
    catriple_cat_count = len(set(catriple_relation_axioms['cat'].unique()))
    cdf_cat_count = len(
        set(cdf_relation_axioms['cat'].unique())
        | set(cdf_type_axioms['cat'].unique()))
    total_cat_count = len(cat_store.get_usable_cats())

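    # count predicates that are used in at least 100 extracted relation assertions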
    cat2ax_preds = cat2ax_relation_triples.groupby(by='pred').count()
    cat2ax_pred_count = len(cat2ax_preds[cat2ax_preds['sub'] >= 100].index)
    catriple_preds = catriple_relation_triples.groupby(by='pred').count()
    catriple_pred_count = len(
        catriple_preds[catriple_preds['sub'] >= 100].index)
    cdf_preds = cdf_relation_triples.groupby(by='pred').count()
    cdf_pred_count = len(cdf_preds[cdf_preds['sub'] >= 100].index)
    total_pred_count = len(dbp_store.get_all_predicates())

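    # count distinct resources that appear as subject of an extracted assertion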
    cat2ax_res_count = len(
        set(cat2ax_relation_triples['sub'].unique())
        | set(cat2ax_type_triples['sub'].unique()))
    catriple_res_count = len(set(catriple_relation_triples['sub'].unique()))
    cdf_res_count = len(
        set(cdf_relation_triples['sub'].unique())
        | set(cdf_type_triples['sub'].unique()))
    total_res_count = len(dbp_store.get_resources())

    # initialise bars
    bars_ca = [
        cat2ax_cat_count / total_cat_count, cat2ax_res_count / total_res_count,
        cat2ax_pred_count / total_pred_count
    ]
    bars_ct = [
        catriple_cat_count / total_cat_count,
        catriple_res_count / total_res_count,
        catriple_pred_count / total_pred_count
    ]
    bars_cdf = [
        cdf_cat_count / total_cat_count, cdf_res_count / total_res_count,
        cdf_pred_count / total_pred_count
    ]

    # arrange bars
    bar_width = 0.25
    r1 = np.arange(len(bars_ca))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]

    # make plot
    plt.figure(figsize=(8, 5))
    plt.bar(r1,
            bars_ca,
            color='#2d7f5e',
            width=bar_width,
            edgecolor='white',
            label='Cat2Ax')
    plt.bar(r2,
            bars_ct,
            color='darkgrey',
            width=bar_width,
            edgecolor='white',
            label='Catriple')
    plt.bar(r3,
            bars_cdf,
            color='black',
            width=bar_width,
            edgecolor='white',
            label='C-DF')
    plt.ylabel('Fraction of items covered', fontsize=16)
    plt.xticks([r + bar_width for r in range(len(bars_ca))],
               ['(1) Categories', '(2) Resources', '(3) Properties'],
               fontsize=16)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=15)
    ax = plt.gca()
    ax.yaxis.grid()

    plt.savefig(util.get_results_file('results.graphs.dbpedia_coverage'),
                bbox_inches='tight')
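
Note the grouped-bar layout: np.arange(len(bars_ca)) places the Cat2Ax bars at integer positions, and each further series is shifted right by bar_width = 0.25, so the three approaches appear side by side for each metric. The x-tick positions [r + bar_width for r in range(len(bars_ca))] center each label under the middle bar of its group.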
Example #4
File: cdf.py Project: nheist/Cat2Ax
import operator  # stdlib import required by this excerpt


def _extract_cat_dfs() -> dict:
    """Return DFs of categories: feature sets that are frequent in the category and infrequent globally."""
    cat_df_candidates = {}
    alpha = util.get_config('cdf.alpha')

    for cat in cat_store.get_usable_cats():
        df_candidates = {}

        if len(cat_store.get_resources(cat)) < 2:
            # discard a category if it has at most one resource (as there is not enough evidence)
            continue

        # collect base features for DF generation
        cat_stats = cat_store.get_statistics(cat)
        base_props = {
            prop for prop, freq in cat_stats['property_frequencies'].items()
            if freq >= alpha
        }
        base_types = {
            (rdf_util.PREDICATE_TYPE, t)
            for t, freq in cat_stats['type_frequencies'].items()
            if freq >= alpha
        }
        # keep only independent types, i.e. discard types that merely
        # subsume another type in the set
        independent_base_types = dbp_store.get_independent_types(
            {val[1] for val in base_types})
        base_types = {
            val for val in base_types if val[1] in independent_base_types
        }
        base_features = base_props | base_types

        if len(base_features) > 20:
            # discard a category if there are far too many base features
            # (the combinatorial search below would get too expensive)
            continue
        df_candidates.update({
            (prop,): (cat_stats['property_counts'][prop],
                      cat_stats['property_frequencies'][prop])
            for prop in base_props
        })
        df_candidates.update({
            (t,): (cat_stats['type_counts'][t[1]],
                   cat_stats['type_frequencies'][t[1]])
            for t in base_types
        })

        # iteratively look for promising DFs
        current_features = {(f,) for f in base_features}
        current_features_strings = {
            _get_feature_set_as_string(f_set) for f_set in current_features
        }
        while True:
            new_features = {}
            new_features_strings = set()
            for cf in current_features:
                for bf in base_features:
                    if bf in cf:
                        continue
                    nf = cf + (bf,)
                    nf_string = _get_feature_set_as_string(nf)
                    if nf_string in new_features_strings:
                        continue
                    # Apriori-style pruning: a candidate is only viable if all
                    # of its subsets one element smaller were viable before
                    if all(_get_feature_set_as_string(set(nf).difference({elem}))
                           in current_features_strings for elem in nf):
                        nf_count = _get_overall_features_count(nf, cat=cat)
                        nf_freq = nf_count / len(cat_store.get_resources(cat))
                        if nf_freq > alpha:
                            new_features[nf] = (nf_count, nf_freq)
                            new_features_strings.add(nf_string)

            if not new_features:
                break
            current_features = set(new_features)
            current_features_strings = new_features_strings
            df_candidates.update(new_features)

        if df_candidates:
            cat_df_candidates[cat] = df_candidates

    # find the best DFs by scoring the candidates
    cat_df_candidate_scores = {}
    for cat, candidates in cat_df_candidates.items():
        candidate_scores = {}
        for features, (count, freq) in candidates.items():
            overall_count = _get_overall_features_count(features)
            candidate_scores[features] = (freq * count / overall_count
                                          if overall_count > 0 else 0)
        cat_df_candidate_scores[cat] = candidate_scores

    cat_dfs = {}
    for cat, candidate_dfs in cat_df_candidate_scores.items():
        best_df, score = max(candidate_dfs.items(),
                             key=operator.itemgetter(1),
                             default=(None, 0))
        if score > alpha:
            cat_dfs[cat] = (best_df, score)
    return cat_dfs
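
The final scoring step rewards feature sets that are both frequent within the category (freq) and rare outside of it (count / overall_count is the share of all matching resources that lie inside the category). A numeric illustration with made-up values:

# hypothetical values, for illustration only
count = 40          # resources in the category that have the feature set
freq = 0.8          # count / |category resources| = 40 / 50
overall_count = 50  # resources in all of DBpedia that have the feature set
score = freq * count / overall_count  # 0.8 * 40 / 50 = 0.64
# the DF is kept if score > alpha (e.g. 0.64 > 0.5 for an assumed alpha of 0.5)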
Example #5
File: cdf.py Project: nheist/Cat2Ax
from collections import defaultdict  # stdlib import required by this excerpt


def _extract_axioms_with_rules(cat_dfs: dict) -> set:
    """Return axioms generated by applying C-DF rules."""

    # generate rule candidates by extracting shared pre-/postfixes
    cdf_rule_candidates = defaultdict(lambda: defaultdict(lambda: 0))
    for cat, (df, _) in cat_dfs.items():
        cat_label = cat_store.get_label(cat)
        for f in {f for f in df if f[0] != rdf_util.PREDICATE_TYPE}:
            # resolve a human-readable label for the feature value
            if dbp_util.is_dbp_resource(f[1]):
                label_mapping = dbp_store._get_label_mapping()
                f_label = (label_mapping[f[1]] if f[1] in label_mapping
                           else dbp_util.object2name(f[1]))
            else:
                f_label = f[1]
            if f_label in cat_label:
                # split the category label into the words before and after
                # the feature label
                first_words = cat_label[:cat_label.index(f_label)].strip()
                first_words = tuple(first_words.split(' ')) if first_words else tuple()
                last_words = cat_label[cat_label.index(f_label) + len(f_label):].strip()
                last_words = tuple(last_words.split(' ')) if last_words else tuple()
                if first_words or last_words:
                    f_types = (dbp_store.get_independent_types(dbp_store.get_types(f[1]))
                               if dbp_util.is_dbp_resource(f[1]) else set())
                    f_type = f_types.pop() if f_types else None
                    remaining_df = tuple(set(df).difference({f}))
                    cdf_rule_candidates[(first_words, last_words)][
                        ((f[0], f_type), remaining_df)] += 1

    # filter rules using the threshold parameters min_support and beta
    cdf_rules = {}
    min_support = util.get_config('cdf.min_support')
    beta = util.get_config('cdf.beta')
    for word_patterns in cdf_rule_candidates:
        total_support = sum(cdf_rule_candidates[word_patterns].values())
        valid_axiom_patterns = [
            pattern
            for pattern, support in cdf_rule_candidates[word_patterns].items()
            if support >= min_support and (support / total_support) >= beta
        ]

        if len(valid_axiom_patterns) > 0:
            cdf_rules[word_patterns] = valid_axiom_patterns[0]

    # apply the patterns to all categories in order to extract axioms
    # (the rules are applied separately depending on whether the word pattern
    # occurs at the front, at the back, or at both ends of the category name,
    # which reduces the computational complexity)
    cdf_front_patterns = {
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if word_patterns[0] and not word_patterns[1]
    }
    cdf_front_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_front_patterns.items():
        _fill_dict(cdf_front_pattern_dict, list(front_pattern),
                   lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    cdf_back_patterns = {
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if not word_patterns[0] and word_patterns[1]
    }
    cdf_back_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_back_patterns.items():
        _fill_dict(cdf_back_pattern_dict, list(front_pattern),
                   lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    cdf_enclosing_patterns = {
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if word_patterns[0] and word_patterns[1]
    }
    cdf_enclosing_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_enclosing_patterns.items():
        _fill_dict(cdf_enclosing_pattern_dict, list(front_pattern),
                   lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    rule_axioms = set()
    for cat in cat_store.get_usable_cats():
        rule_axioms.update(_apply_rules(cdf_front_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_back_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_enclosing_pattern_dict, cat))
    return rule_axioms
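
To make the rule mining concrete, consider a hypothetical category "Songs written by John Lennon" whose DF contains the feature (dbo:writer, dbr:John_Lennon). The label "John Lennon" occurs in the category label, so first_words = ('Songs', 'written', 'by') and last_words = (), producing a front-pattern rule candidate. All identifiers below are illustrative, not taken from the repository's output:

# hypothetical rule candidate, for illustration only
word_patterns = (('Songs', 'written', 'by'), ())  # (first_words, last_words)
axiom_pattern = (
    ('http://dbpedia.org/ontology/writer',   # predicate of the matched feature
     'http://dbpedia.org/ontology/Person'),  # an independent type of dbr:John_Lennon
    (),                                      # remaining features of the DF
)
cdf_rule_candidates[word_patterns][axiom_pattern] += 1

Once enough categories support the same (word pattern, axiom pattern) pair to pass min_support and beta, the rule can assign dbo:writer axioms to any category matching "Songs written by <X>".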