Example #1
def _find_Y(X: Span, subcat_uri: str):
    """Return Y if the category follows one of the patterns 'YX' or 'X <prep> Y'."""
    if X.text.lower() not in cat_store.get_label(subcat_uri).lower():
        return None
    subcat = nlp_util.parse(cat_store.get_label(subcat_uri))
    if subcat.text.lower().endswith(' ' + X.text.lower()):  # "YX"
        if len(X) >= len(subcat) or subcat[-(len(X) + 1)].pos_ == 'ADP':
            return None
        return subcat[:-len(X)]
    elif subcat.text.lower().startswith(X.text.lower() + ' '):  # "X <prep> Y"
        adp_indices = [w.i for w in subcat if w.pos_ == 'ADP']
        if len(adp_indices) != 1:
            return None
        adp_index = adp_indices[0]
        Y = subcat[adp_index + 1:]
        if subcat[adp_index].text == 'by':
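            # for 'by'-categories, discard Y if the category looks like a mere grouping
            # (many subcategories relative to its resources) or Y occurs in a predicate label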
            childcats = cat_store.get_children(subcat_uri)
            resources = cat_store.get_resources(subcat_uri)
            predicate_labels = {
                dbp_store.get_label(pred)
                for res in resources for pred in dbp_store.get_properties(res)
            }
            if len(childcats) * 10 >= len(resources) or any(
                    Y.text.lower() in p for p in predicate_labels):
                return None
        return Y
    return None
Example #2
 def get_resources(self, node: str) -> set:
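     """Return all resources of the categories belonging to the given node."""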
     if not self.has_node(node):
         raise Exception(f'Node {node} not in category graph.')
     return {
         res
         for cat in self.get_categories(node)
         for res in cat_store.get_resources(cat)
     }
Example #3
 def get_resources_from_categories(self, node: str) -> set:
     """Return all DBpedia resources directly associated with the node through Wikipedia categories."""
     if node not in self._node_direct_cat_resources:
         cat_resources = {
             r
             for cat in self.get_category_parts(node)
             for r in cat_store.get_resources(cat)
         }
         self._node_direct_cat_resources[node] = self._filter_invalid_resources(cat_resources)
     return set(self._node_direct_cat_resources[node])
Example #4
def _get_overall_features_count(feats: tuple, cat: str = None) -> int:
    """Return how many resources have all of the given features (optionally restricted to a category)."""
    valid_res_idxs = None
    if cat:
        valid_res_idxs = {
            resource_to_idx_dict[res]
            for res in cat_store.get_resources(cat)
            if res in resource_to_idx_dict
        }

    for f in feats:
        res_idxs_with_f = feature_to_idx_dict[f]
        # use None as the "not yet initialised" marker so that an empty intersection is not reset
        if valid_res_idxs is None:
            valid_res_idxs = res_idxs_with_f
        else:
            valid_res_idxs = valid_res_idxs.intersection(res_idxs_with_f)
    return len(valid_res_idxs) if valid_res_idxs is not None else 0
Example #5
 def get_resource_provenance(self, resource: str) -> set:
     """Return provenance information of a resource (i.e. which categories and lists have been used to extract it)."""
     if not self._resource_provenance:
         for node in self.nodes:
             for cat in self.get_category_parts(node):
                 for res in cat_store.get_resources(cat):
                     self._resource_provenance[clg_util.dbp_resource2clg_resource(res)].add(cat)
         if self.use_listing_resources:
             for res, res_data in listing.get_page_entities(self).items():
                 origins = {dbp_util.name2resource(o) for o in res_data['origins']}
                 self._resource_provenance[clg_util.name2clg_resource(res)].update(origins)
     return self._resource_provenance[resource]
Example #6
    @classmethod
    def create_from_dbpedia(cls):
        """Initialise the graph by combining list categories with list pages."""
        # add nodes and edges for listcategories
        nodes = list_store.get_listcategories()
        edges = set()
        for listcat in nodes:
            listcat_children = {
                child
                for child in cat_store.get_children(
                    listcat, include_listcategories=True) if child in nodes
            }
            edges.update({(listcat, child) for child in listcat_children})

        # add nodes and edges for listpages
        for listcat in list(nodes):
            listpages = {
                dbp_store.resolve_redirect(page)
                for page in cat_store.get_resources(listcat)
                if list_util.is_listpage(page)
            }
            # filter out redirects on non-listpages
            listpages = {lp for lp in listpages if list_util.is_listpage(lp)}
            nodes.update(listpages)
            edges.update({(listcat, lp) for lp in listpages})

        # make sure that all listpages are in the graph
        nodes.update(list_store.get_listpages())

        # initialise graph
        graph = nx.DiGraph(incoming_graph_data=list(edges))
        graph.add_nodes_from(nodes.difference(set(graph.nodes)))
        list_graph = ListGraph(graph)

        for node in graph.nodes:
            list_graph._set_name(node, list_util.list2name(node))
            list_graph._set_parts(node, {node})

        # add root node
        graph.add_node(list_graph.root_node)
        list_graph._set_name(list_graph.root_node,
                             cat_util.category2name(list_graph.root_node))
        list_graph._set_parts(list_graph.root_node, {list_graph.root_node})

        return list_graph
Example #7
def _extract_axioms(patterns: dict) -> set:
    """Return the axioms extracted by applying the patterns to Wikipedia categories."""
    axioms = {}

    for cat, (sub, pred, subcats) in patterns.items():
        if pred:  # simple mapping of label to predicate (case 1)
            if pred.lower() in predicate_names:
                axioms[cat] = (sub, predicate_names[pred.lower()], subcats)
        else:  # Voting required to discover Z (case 2)
            predicate_counts = defaultdict(int)
            for subcat, value in subcats.items():
                value = normalize_val(value)
                for res in cat_store.get_resources(subcat):
                    for pred, values in dbp_store.get_properties(res).items():
                        normalized_values = {
                            normalize_val(val)
                            for val in values
                        }
                        if value in normalized_values:
                            predicate_counts[pred] += 1
            if predicate_counts:
                pred = max(predicate_counts.items(),
                           key=operator.itemgetter(1))[0]
                axioms[cat] = (sub, pred, subcats)

    # map values to DBpedia resources if necessary (only possible if we have an object property)
    valid_axioms = {}

    for cat in axioms:
        _, pred, subcats = axioms[cat]
        if dbp_store.is_object_property(pred):
            for subcat, obj in subcats.items():
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)

    return {(cat, pred, val)
            for pred, cat_vals in valid_axioms.values()
            for cat, val in cat_vals.items()}
Example #8
def _compute_labeled_entities_for_listpage(page_uri: str, page_data: dict,
                                           graph) -> tuple:
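    """Return the positively and negatively labeled (subject) entities of a list page."""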
    positive_SEs, negative_SEs = dict(), set()
    # compute potential subject entities for list page
    page_potential_SEs = {
        dbp_util.resource2name(res)
        for cat in _get_category_descendants_for_list(page_uri)
        for res in cat_store.get_resources(cat)
    }
    # compute types of list page
    page_types = {
        t
        for n in graph.get_nodes_for_part(page_uri)
        for t in dbp_store.get_independent_types(
            graph.get_transitive_dbpedia_types(n))
    }
    page_disjoint_types = {
        dt
        for t in page_types for dt in dbp_heur.get_disjoint_types(t)
    }
    # collect all linked entities on the page
    page_entities = {
        ent['name']
        for s in page_data['sections'] for enum in s['enums'] for entry in enum
        for ent in entry['entities']
    }
    page_entities.update({
        ent['name']
        for s in page_data['sections'] for table in s['tables']
        for row in table['data'] for cell in row for ent in cell['entities']
    })
    for ent in page_entities:
        ent_uri = dbp_util.name2resource(ent)
        if not dbp_store.is_possible_resource(ent_uri):
            negative_SEs.add(ent)
        elif ent in page_potential_SEs:
            positive_SEs[ent] = _compute_entity_label(ent_uri)
        elif page_disjoint_types.intersection(dbp_store.get_types(ent_uri)):
            negative_SEs.add(ent)
    return positive_SEs, negative_SEs
Example #9
def _extract_assertions(axioms: set) -> list:
    """Return assertions generated by applying the extracted axioms to the respective categories."""
    assertions = []

    for cat, pred, value in axioms:
        new_val = normalize_val(value)
        for res in cat_store.get_resources(cat):
            res_props = dbp_store.get_properties(res)

            # discard generated assertion if the value is too similar to an existing relation in DBpedia
            if pred in res_props:
                existing_values = {
                    normalize_val(val)
                    for val in res_props[pred]
                }
                if any((new_val in ex_val) or (ex_val in new_val)
                       for ex_val in existing_values):
                    continue

                if any(
                        edit_distance(new_val, ex_val) < 3
                        for ex_val in existing_values):
                    continue

                if existing_values.intersection(
                        nlp_util.get_synonyms(new_val)):
                    continue

                new_val_words = normalize_to_words(new_val)
                if any(
                        new_val_words.intersection(normalize_to_words(ex_val))
                        for ex_val in existing_values):
                    continue

            assertions.append((res, pred, value))
    return assertions
Example #10
def _extract_cat_dfs() -> dict:
    """Return DFs of categories that are frequent in the category and infrequent globally."""
    cat_df_candidates = {}
    alpha = util.get_config('cdf.alpha')

    for cat in cat_store.get_usable_cats():
        df_candidates = {}

        if len(cat_store.get_resources(cat)) < 2:
            # discard a category if it has at most one resource (as there is not enough evidence)
            continue

        # collect base features for DF generation
        cat_stats = cat_store.get_statistics(cat)
        base_props = {
            prop
            for prop, freq in cat_stats['property_frequencies'].items()
            if freq >= alpha
        }
        base_types = {(rdf_util.PREDICATE_TYPE, t)
                      for t, freq in cat_stats['type_frequencies'].items()
                      if freq >= alpha}
        independent_base_types = dbp_store.get_independent_types(
            {val[1]
             for val in base_types})
        base_types = {
            val
            for val in base_types if val[1] in independent_base_types
        }
        base_features = base_props | base_types

        if len(base_features) > 20:
            # discard a category if there are way too many base features (as computational complexity is too high)
            continue
        df_candidates.update({(prop, ):
                              (cat_stats['property_counts'][prop],
                               cat_stats['property_frequencies'][prop])
                              for prop in base_props})
        df_candidates.update({(t, ): (cat_stats['type_counts'][t[1]],
                                      cat_stats['type_frequencies'][t[1]])
                              for t in base_types})

        # iteratively look for promising DFs
        current_features = {(f, ) for f in base_features}
        current_features_strings = {
            _get_feature_set_as_string(f_set)
            for f_set in current_features
        }
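        # grow feature sets by one base feature at a time; keep a combination only if all of
        # its subsets are frequent (apriori-like pruning) and it is frequent within the category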
        while True:
            new_features = {}
            new_features_strings = set()
            for cf in current_features:
                for bf in base_features:
                    if bf not in cf:
                        nf = cf + (bf, )
                        nf_string = _get_feature_set_as_string(nf)
                        if nf_string not in new_features_strings:
                            if all(
                                    _get_feature_set_as_string(
                                        set(nf).difference({elem})) in
                                    current_features_strings for elem in nf):
                                nf_count = _get_overall_features_count(nf,
                                                                       cat=cat)
                                nf_freq = nf_count / len(
                                    cat_store.get_resources(cat))
                                if nf_freq > alpha:
                                    new_features[nf] = (nf_count, nf_freq)
                                    new_features_strings.add(nf_string)

            if not new_features:
                break
            current_features = set(new_features)
            current_features_strings = new_features_strings
            df_candidates.update(new_features)

        if df_candidates:
            cat_df_candidates[cat] = df_candidates

    # find best DFs by scoring them
    cat_df_candidate_scores = {}
    for cat, candidates in cat_df_candidates.items():
        candidate_scores = {}
        for features, (count, freq) in candidates.items():
            overall_count = _get_overall_features_count(features)
            candidate_scores[features] = freq * count / overall_count if overall_count > 0 else 0
        cat_df_candidate_scores[cat] = candidate_scores

    cat_dfs = {}
    for cat, candidate_dfs in cat_df_candidate_scores.items():
        best_df, score = max(candidate_dfs.items(),
                             key=operator.itemgetter(1),
                             default=(None, 0))
        if score > alpha:
            cat_dfs[cat] = (best_df, score)
    return cat_dfs
Example #11
def run_extraction():
    """Run the C-DF extraction procedure and create result files for relation/type axioms and assertions.

    The extraction is performed in two steps:
    1) Find defining features (DFs) of a category, i.e. sets of types/relations that are frequent in the category and globally infrequent
    2) Use DFs to extract rules and subsequently apply the rules to extract axioms and assertions
    """
    util.get_logger().debug('Step 1: Defining Feature Extraction')
    cat_dfs = _extract_cat_dfs()
    direct_axioms = {(cat, pred, val)
                     for cat, (df, _) in cat_dfs.items() for (pred, val) in df}

    util.get_logger().debug('Step 2: Rule Extraction')
    rule_axioms = _extract_axioms_with_rules(cat_dfs)
    all_axioms = rule_axioms | {(cat, pred, val)
                                for cat, pred, val in direct_axioms
                                if cat not in rule_axioms}
    all_assertions = {(res, pred, val)
                      for cat, pred, val in all_axioms
                      for res in cat_store.get_resources(cat)}

    util.get_logger().debug('Finished extraction - persisting results..')
    relation_axioms = {ax for ax in all_axioms if ax[1] != rdf_util.PREDICATE_TYPE}
    pd.DataFrame(data=relation_axioms, columns=['cat', 'pred', 'val']).to_csv(
        util.get_results_file('results.cdf.relation_axioms'),
        sep=';',
        index=False)

    type_axioms = {ax for ax in all_axioms if ax[1] == rdf_util.PREDICATE_TYPE}
    pd.DataFrame(data=type_axioms, columns=['cat', 'pred', 'val']).to_csv(
        util.get_results_file('results.cdf.type_axioms'), sep=';', index=False)

    relation_assertions = {a for a in all_assertions if a[1] != rdf_util.PREDICATE_TYPE}
    df_relation_assertions = pd.DataFrame(data=relation_assertions,
                                          columns=['sub', 'pred', 'val'])
    df_relation_assertions.to_csv(
        util.get_results_file('results.cdf.relation_assertions'),
        sep=';',
        index=False)
    rdf_util.write_triple_file(
        df_relation_assertions,
        util.get_results_file('results.cdf.relation_assertion_triples'))

    type_assertions = {a for a in all_assertions if a[1] == rdf_util.PREDICATE_TYPE}
    df_type_assertions = pd.DataFrame(data=type_assertions,
                                      columns=['sub', 'pred', 'val'])
    df_type_assertions.to_csv(
        util.get_results_file('results.cdf.type_assertions'),
        sep=';',
        index=False)
    rdf_util.write_triple_file(
        df_type_assertions,
        util.get_results_file('results.cdf.type_assertion_triples'))