def _find_Y(X: Span, subcat_uri: str):
    """Return Y if the category follows one of the patterns 'YX' or 'X <prep> Y'."""
    if X.text.lower() not in cat_store.get_label(subcat_uri).lower():
        return None

    subcat = nlp_util.parse(cat_store.get_label(subcat_uri))
    if subcat.text.lower().endswith(' ' + X.text.lower()):  # "YX"
        if len(X) >= len(subcat) or subcat[-(len(X) + 1)].pos_ == 'ADP':
            return None
        return subcat[:-len(X)]
    elif subcat.text.lower().startswith(X.text.lower() + ' '):  # "X <prep> Y"
        adp_indices = [w.i for w in subcat if w.pos_ == 'ADP']
        if len(adp_indices) != 1:
            return None
        adp_index = adp_indices[0]
        Y = subcat[adp_index + 1:]
        if subcat[adp_index].text == 'by':
            # reject "X by Y" meta-categories that merely group subcategories or where Y names a predicate
            childcats = cat_store.get_children(subcat_uri)
            resources = cat_store.get_resources(subcat_uri)
            predicate_labels = {dbp_store.get_label(pred) for res in resources for pred in dbp_store.get_properties(res)}
            if len(childcats) * 10 >= len(resources) or any(Y.text.lower() in p for p in predicate_labels):
                return None
        return Y
    return None
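# Illustrative sketch (not part of the extraction code): a simplified, string-only version of the two
# label patterns that _find_Y detects. The real function operates on spaCy spans and additionally
# consults cat_store/dbp_store to reject "by-grouping" categories; the labels and the preposition list
# below are hypothetical sample data.
def _find_Y_simplified(x_label: str, subcat_label: str):
    x, sub = x_label.lower(), subcat_label.lower()
    if sub.endswith(' ' + x):  # pattern "YX", e.g. X='albums', subcat='Queen albums' -> Y='Queen'
        return subcat_label[:-(len(x_label) + 1)]
    if sub.startswith(x + ' '):  # pattern "X <prep> Y", e.g. X='albums', subcat='Albums from 1979' -> Y='1979'
        words = subcat_label.split()
        prep_positions = [i for i, w in enumerate(words) if w.lower() in {'by', 'from', 'in', 'of'}]
        if len(prep_positions) == 1:
            return ' '.join(words[prep_positions[0] + 1:])
    return None

assert _find_Y_simplified('albums', 'Queen albums') == 'Queen'
assert _find_Y_simplified('albums', 'Albums from 1979') == '1979'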
def get_resources(self, node: str) -> set:
    """Return all DBpedia resources of the categories associated with the node."""
    if not self.has_node(node):
        raise Exception(f'Node {node} not in category graph.')
    return {res for cat in self.get_categories(node) for res in cat_store.get_resources(cat)}
def get_resources_from_categories(self, node: str) -> set:
    """Return all DBpedia resources directly associated with the node through Wikipedia categories."""
    if node not in self._node_direct_cat_resources:
        cat_resources = {r for cat in self.get_category_parts(node) for r in cat_store.get_resources(cat)}
        self._node_direct_cat_resources[node] = self._filter_invalid_resources(cat_resources)
    return set(self._node_direct_cat_resources[node])
def _get_overall_features_count(feats: tuple, cat: str = None) -> int:
    """Return the number of resources that have all given features (optionally restricted to a category)."""
    valid_res_idxs = set()
    if cat:
        valid_res_idxs = {resource_to_idx_dict[res] for res in cat_store.get_resources(cat) if res in resource_to_idx_dict}
    for f in feats:
        res_idxs_with_f = feature_to_idx_dict[f]
        # an empty set means "no restriction yet", so the first feature initialises the result set
        valid_res_idxs = valid_res_idxs.intersection(res_idxs_with_f) if valid_res_idxs else res_idxs_with_f
    return len(valid_res_idxs)
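# Illustrative sketch of the inverted indices used above, on hypothetical data: resource_to_idx_dict maps
# resources to integer indices and feature_to_idx_dict maps each feature to the indices of resources that
# have it, so counting resources with *all* features of a set reduces to intersecting index sets.
_resources = ['Freddie_Mercury', 'Brian_May', 'Imagine_(album)']
_features_of = {
    'Freddie_Mercury': {('rdf:type', 'dbo:Person'), ('dbo:occupation', 'singer')},
    'Brian_May': {('rdf:type', 'dbo:Person'), ('dbo:occupation', 'guitarist')},
    'Imagine_(album)': {('rdf:type', 'dbo:Album')},
}
_resource_to_idx = {res: idx for idx, res in enumerate(_resources)}
_feature_to_idx = {}
for _res, _feats in _features_of.items():
    for _f in _feats:
        _feature_to_idx.setdefault(_f, set()).add(_resource_to_idx[_res])

_persons = _feature_to_idx[('rdf:type', 'dbo:Person')]
_singers = _feature_to_idx[('dbo:occupation', 'singer')]
assert len(_persons & _singers) == 1  # only Freddie_Mercury has both features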
def get_resource_provenance(self, resource: str) -> set:
    """Return provenance information of a resource (i.e. which categories and lists have been used to extract it)."""
    if not self._resource_provenance:  # lazily build the full provenance mapping on first access
        for node in self.nodes:
            for cat in self.get_category_parts(node):
                for res in cat_store.get_resources(cat):
                    self._resource_provenance[clg_util.dbp_resource2clg_resource(res)].add(cat)
        if self.use_listing_resources:
            for res, res_data in listing.get_page_entities(self).items():
                self._resource_provenance[clg_util.name2clg_resource(res)].update({dbp_util.name2resource(o) for o in res_data['origins']})
    return self._resource_provenance[resource]
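# Illustrative sketch of the provenance bookkeeping above, on hypothetical data: _resource_provenance is
# assumed to be a defaultdict(set) mapping each resource to the categories and listings it was extracted
# from, which is why .add()/.update() can be called without initialising the key first.
from collections import defaultdict

_provenance = defaultdict(set)
_provenance['clg:Bohemian_Rhapsody'].add('Category:Queen_songs')
_provenance['clg:Bohemian_Rhapsody'].update({'List_of_glam_rock_songs'})
assert _provenance['clg:Bohemian_Rhapsody'] == {'Category:Queen_songs', 'List_of_glam_rock_songs'}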
@classmethod
def create_from_dbpedia(cls):
    """Initialise the graph by combining list categories with list pages."""
    # add nodes and edges for listcategories
    nodes = list_store.get_listcategories()
    edges = set()
    for listcat in nodes:
        listcat_children = {child for child in cat_store.get_children(listcat, include_listcategories=True) if child in nodes}
        edges.update({(listcat, child) for child in listcat_children})

    # add nodes and edges for listpages
    for listcat in list(nodes):
        listpages = {dbp_store.resolve_redirect(page) for page in cat_store.get_resources(listcat) if list_util.is_listpage(page)}
        listpages = {lp for lp in listpages if list_util.is_listpage(lp)}  # filter out redirects on non-listpages
        nodes.update(listpages)
        edges.update({(listcat, lp) for lp in listpages})

    # make sure that all listpages are in the graph
    nodes.update(list_store.get_listpages())

    # initialise graph
    graph = nx.DiGraph(incoming_graph_data=list(edges))
    graph.add_nodes_from(nodes.difference(graph.nodes))
    list_graph = ListGraph(graph)

    for node in graph.nodes:
        list_graph._set_name(node, list_util.list2name(node))
        list_graph._set_parts(node, {node})

    # add root node
    graph.add_node(list_graph.root_node)
    list_graph._set_name(list_graph.root_node, cat_util.category2name(list_graph.root_node))
    list_graph._set_parts(list_graph.root_node, {list_graph.root_node})

    return list_graph
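# Illustrative sketch of the graph-construction pattern above, on toy data: nx.DiGraph accepts an edge
# list via incoming_graph_data, and nodes that have no edges must be added explicitly afterwards.
import networkx as nx

_toy_edges = [('Lists_of_albums', 'List_of_Queen_albums')]
_toy_nodes = {'Lists_of_albums', 'List_of_Queen_albums', 'List_of_unconnected_pages'}  # last one has no edge

_g = nx.DiGraph(incoming_graph_data=_toy_edges)
_g.add_nodes_from(_toy_nodes.difference(_g.nodes))
assert set(_g.nodes) == _toy_nodes and _g.number_of_edges() == 1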
def _extract_axioms(patterns: dict) -> set:
    """Return the axioms extracted by applying the patterns to Wikipedia categories."""
    axioms = {}

    for cat, (sub, pred, subcats) in patterns.items():
        if pred:  # simple mapping of label to predicate (case 1)
            if pred.lower() in predicate_names:
                axioms[cat] = (sub, predicate_names[pred.lower()], subcats)
        else:  # voting required to discover Z (case 2)
            predicate_counts = defaultdict(int)
            for subcat, value in subcats.items():
                value = normalize_val(value)
                for res in cat_store.get_resources(subcat):
                    for res_pred, values in dbp_store.get_properties(res).items():
                        normalized_values = {normalize_val(val) for val in values}
                        if value in normalized_values:
                            predicate_counts[res_pred] += 1
            if predicate_counts:
                pred = max(predicate_counts.items(), key=operator.itemgetter(1))[0]
                axioms[cat] = (sub, pred, subcats)

    # map values to DBpedia resources if necessary (only possible if we have an object property)
    valid_axioms = {}
    for cat in axioms:
        _, pred, subcats = axioms[cat]
        if dbp_store.is_object_property(pred):
            for subcat, obj in subcats.items():
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)

    return {(subcat, pred, val) for pred, subcat_vals in valid_axioms.values() for subcat, val in subcat_vals.items()}
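# Illustrative sketch of the voting in case 2 above, on hypothetical data: for every candidate predicate,
# count how many resources of the subcategories already carry the expected value in DBpedia, then pick
# the predicate with the most support. The predicate and resource names are made up for the example.
import operator
from collections import defaultdict

_expected = {'Albums_produced_by_Brian_Eno': 'brian eno'}  # subcat -> normalised value
_existing = {  # resource -> predicate -> normalised values
    'Another_Green_World': {'dbo:producer': {'brian eno'}, 'dbo:artist': {'brian eno'}},
    'Low_(album)': {'dbo:producer': {'brian eno', 'tony visconti'}},
}

_predicate_counts = defaultdict(int)
for _subcat, _value in _expected.items():
    for _res, _props in _existing.items():
        for _pred, _values in _props.items():
            if _value in _values:
                _predicate_counts[_pred] += 1

_best_pred = max(_predicate_counts.items(), key=operator.itemgetter(1))[0]
assert _best_pred == 'dbo:producer'  # two votes vs. one for dbo:artist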
def _compute_labeled_entities_for_listpage(page_uri: str, page_data: dict, graph) -> tuple:
    """Return the positively and negatively labeled subject entities (SEs) of a list page."""
    positive_SEs, negative_SEs = dict(), set()

    # compute potential subject entities for list page
    page_potential_SEs = {dbp_util.resource2name(res) for cat in _get_category_descendants_for_list(page_uri) for res in cat_store.get_resources(cat)}

    # compute types of list page
    page_types = {t for n in graph.get_nodes_for_part(page_uri) for t in dbp_store.get_independent_types(graph.get_transitive_dbpedia_types(n))}
    page_disjoint_types = {dt for t in page_types for dt in dbp_heur.get_disjoint_types(t)}

    # collect all linked entities on the page
    page_entities = {ent['name'] for s in page_data['sections'] for enum in s['enums'] for entry in enum for ent in entry['entities']}
    page_entities.update({ent['name'] for s in page_data['sections'] for table in s['tables'] for row in table['data'] for cell in row for ent in cell['entities']})

    for ent in page_entities:
        ent_uri = dbp_util.name2resource(ent)
        if not dbp_store.is_possible_resource(ent_uri):
            negative_SEs.add(ent)
        elif ent in page_potential_SEs:
            positive_SEs[ent] = _compute_entity_label(ent_uri)
        elif page_disjoint_types.intersection(dbp_store.get_types(ent_uri)):
            negative_SEs.add(ent)
    return positive_SEs, negative_SEs
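# Illustrative sketch of the disjointness check above, with hypothetical types: an entity is labeled as a
# negative subject entity if any of its DBpedia types is disjoint with a type inferred for the list page.
# _disjoint_of stands in for dbp_heur.get_disjoint_types.
_page_types = {'dbo:Album'}
_disjoint_of = {'dbo:Album': {'dbo:Person', 'dbo:Place'}}
_page_disjoint_types = {dt for t in _page_types for dt in _disjoint_of.get(t, set())}

_entity_types = {'dbo:Person', 'dbo:MusicalArtist'}
assert _page_disjoint_types.intersection(_entity_types)  # -> the entity would be added to negative_SEs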
def _extract_assertions(axioms: set) -> list:
    """Return assertions generated by applying the extracted axioms to the respective categories."""
    assertions = []
    for cat, pred, value in axioms:
        new_val = normalize_val(value)
        for res in cat_store.get_resources(cat):
            res_props = dbp_store.get_properties(res)
            # discard generated assertion if the value is too similar to an existing relation in DBpedia
            if pred in res_props:
                existing_values = {normalize_val(val) for val in res_props[pred]}
                if any((new_val in ex_val) or (ex_val in new_val) for ex_val in existing_values):
                    continue
                if any(edit_distance(new_val, ex_val) < 3 for ex_val in existing_values):
                    continue
                if existing_values.intersection(nlp_util.get_synonyms(new_val)):
                    continue
                new_val_words = normalize_to_words(new_val)
                if any(new_val_words.intersection(normalize_to_words(ex_val)) for ex_val in existing_values):
                    continue
            assertions.append((res, pred, value))
    return assertions
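# Illustrative sketch of the similarity filters above, on hypothetical values: a generated assertion is
# dropped if its value is contained in (or contains) an existing value or shares a word with one after
# normalisation. The real code additionally applies an edit-distance threshold and a synonym check.
def _is_redundant_value(new_val: str, existing_values: set) -> bool:
    new_val_words = set(new_val.split())
    for ex_val in existing_values:
        if new_val in ex_val or ex_val in new_val:
            return True
        if new_val_words & set(ex_val.split()):
            return True
    return False

assert _is_redundant_value('john lennon', {'lennon'})               # substring containment
assert _is_redundant_value('john lennon', {'john winston lennon'})  # word overlap
assert not _is_redundant_value('paul mccartney', {'john lennon'})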
def _extract_cat_dfs() -> dict:
    """Return the defining features (DFs) of categories, i.e. feature sets that are frequent within the category but infrequent globally."""
    cat_df_candidates = {}
    alpha = util.get_config('cdf.alpha')

    for cat in cat_store.get_usable_cats():
        df_candidates = {}
        if len(cat_store.get_resources(cat)) < 2:
            # discard a category if it has at most one resource (as there is not enough evidence)
            continue

        # collect base features for DF generation
        cat_stats = cat_store.get_statistics(cat)
        base_props = {prop for prop, freq in cat_stats['property_frequencies'].items() if freq >= alpha}
        base_types = {(rdf_util.PREDICATE_TYPE, t) for t, freq in cat_stats['type_frequencies'].items() if freq >= alpha}
        independent_base_types = dbp_store.get_independent_types({val[1] for val in base_types})
        base_types = {val for val in base_types if val[1] in independent_base_types}
        base_features = base_props | base_types
        if len(base_features) > 20:
            # discard a category if there are way too many base features (as computational complexity is too high)
            continue

        df_candidates.update({(prop,): (cat_stats['property_counts'][prop], cat_stats['property_frequencies'][prop]) for prop in base_props})
        df_candidates.update({(t,): (cat_stats['type_counts'][t[1]], cat_stats['type_frequencies'][t[1]]) for t in base_types})

        # iteratively look for promising DFs
        current_features = {(f,) for f in base_features}
        current_features_strings = {_get_feature_set_as_string(f_set) for f_set in current_features}
        while True:
            new_features = {}
            new_features_strings = set()
            for cf in current_features:
                for bf in base_features:
                    if bf not in cf:
                        nf = cf + (bf,)
                        nf_string = _get_feature_set_as_string(nf)
                        if nf_string not in new_features_strings:
                            # apriori-style pruning: only extend a candidate if all of its subsets are candidates as well
                            if all(_get_feature_set_as_string(set(nf).difference({elem})) in current_features_strings for elem in nf):
                                nf_count = _get_overall_features_count(nf, cat=cat)
                                nf_freq = nf_count / len(cat_store.get_resources(cat))
                                if nf_freq > alpha:
                                    new_features[nf] = (nf_count, nf_freq)
                                    new_features_strings.add(nf_string)
            if not new_features:
                break
            current_features = set(new_features)
            current_features_strings = new_features_strings
            df_candidates.update(new_features)

        if df_candidates:
            cat_df_candidates[cat] = df_candidates

    # find best DFs by scoring them
    cat_df_candidate_scores = {}
    for cat, candidates in cat_df_candidates.items():
        candidate_scores = {}
        for features, (count, freq) in candidates.items():
            overall_count = _get_overall_features_count(features)
            candidate_scores[features] = freq * count / overall_count if overall_count > 0 else 0
        cat_df_candidate_scores[cat] = candidate_scores

    cat_dfs = {}
    for cat, candidate_dfs in cat_df_candidate_scores.items():
        best_df, score = max(candidate_dfs.items(), key=operator.itemgetter(1), default=(None, 0))
        if score > alpha:
            cat_dfs[cat] = (best_df, score)
    return cat_dfs
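# Illustrative sketch of the candidate growth above (apriori-style), on hypothetical data: a feature
# combination is only explored if all of its subsets survived the previous round, and it is only kept if
# its frequency within the category exceeds alpha. freq_of stands in for _get_overall_features_count.
def _grow_candidates(current: set, base_features: set, freq_of, alpha: float) -> dict:
    new_candidates = {}
    for cf in current:
        for bf in base_features:
            if bf in cf:
                continue
            nf = tuple(sorted(set(cf) | {bf}))
            if nf in new_candidates:
                continue
            # apriori pruning: every subset with one feature removed must already be a candidate
            if all(tuple(sorted(set(nf) - {elem})) in current for elem in nf):
                freq = freq_of(nf)
                if freq > alpha:
                    new_candidates[nf] = freq
    return new_candidates

_freqs = {('A',): .9, ('B',): .8, ('C',): .2, ('A', 'B'): .7}
_grown = _grow_candidates({('A',), ('B',)}, {'A', 'B', 'C'}, lambda nf: _freqs.get(nf, 0), alpha=0.5)
assert _grown == {('A', 'B'): .7}  # ('A','C') and ('B','C') are pruned because ('C',) did not survive round 1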
def run_extraction():
    """Run the C-DF extraction procedure and create result files for relation/type axioms and assertions.

    The extraction is performed in two steps:
    1) Find defining features (DFs) of a category, i.e. sets of types/relations that are frequent in the category and globally infrequent
    2) Use DFs to extract rules and subsequently apply the rules to extract axioms and assertions
    """
    util.get_logger().debug('Step 1: Defining Feature Extraction')
    cat_dfs = _extract_cat_dfs()
    direct_axioms = {(cat, pred, val) for cat, (df, _) in cat_dfs.items() for (pred, val) in df}

    util.get_logger().debug('Step 2: Rule Extraction')
    rule_axioms = _extract_axioms_with_rules(cat_dfs)
    rule_axiom_cats = {cat for cat, _, _ in rule_axioms}  # prefer rule-based axioms; keep direct axioms only for categories without one
    all_axioms = rule_axioms | {(cat, pred, val) for cat, pred, val in direct_axioms if cat not in rule_axiom_cats}
    all_assertions = {(res, pred, val) for cat, pred, val in all_axioms for res in cat_store.get_resources(cat)}

    util.get_logger().debug('Finished extraction - persisting results...')
    relation_axioms = {ax for ax in all_axioms if ax[1] != rdf_util.PREDICATE_TYPE}
    pd.DataFrame(data=relation_axioms, columns=['cat', 'pred', 'val']).to_csv(util.get_results_file('results.cdf.relation_axioms'), sep=';', index=False)

    type_axioms = {ax for ax in all_axioms if ax[1] == rdf_util.PREDICATE_TYPE}
    pd.DataFrame(data=type_axioms, columns=['cat', 'pred', 'val']).to_csv(util.get_results_file('results.cdf.type_axioms'), sep=';', index=False)

    relation_assertions = {a for a in all_assertions if a[1] != rdf_util.PREDICATE_TYPE}
    df_relation_assertions = pd.DataFrame(data=relation_assertions, columns=['sub', 'pred', 'val'])
    df_relation_assertions.to_csv(util.get_results_file('results.cdf.relation_assertions'), sep=';', index=False)
    rdf_util.write_triple_file(df_relation_assertions, util.get_results_file('results.cdf.relation_assertion_triples'))

    type_assertions = {a for a in all_assertions if a[1] == rdf_util.PREDICATE_TYPE}
    df_type_assertions = pd.DataFrame(data=type_assertions, columns=['sub', 'pred', 'val'])
    df_type_assertions.to_csv(util.get_results_file('results.cdf.type_assertions'), sep=';', index=False)
    rdf_util.write_triple_file(df_type_assertions, util.get_results_file('results.cdf.type_assertion_triples'))
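# Illustrative sketch of the persistence format above, with a hypothetical axiom: each result file is a
# ';'-separated CSV holding one axiom or assertion per row; sorting gives a deterministic row order.
import pandas as pd

_sample_axioms = {('Category:Queen_albums', 'dbo:artist', 'dbr:Queen_(band)')}
pd.DataFrame(data=sorted(_sample_axioms), columns=['cat', 'pred', 'val']).to_csv('sample_axioms.csv', sep=';', index=False)
# resulting sample_axioms.csv:
# cat;pred;val
# Category:Queen_albums;dbo:artist;dbr:Queen_(band)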