def extract_wiki_corpus_resources():
    """Crawl the Wikipedia corpus for Hearst patterns to retrieve hypernyms and type lexicalisations."""
    if utils.load_cache('wikipedia_type_lexicalisations') is not None:
        return  # only compute hypernyms and type lexicalisations if they do not exist yet

    utils.get_logger().info('WIKIPEDIA/NIF: Computing wikipedia hypernyms and type lexicalisations..')
    total_hypernyms = defaultdict(lambda: defaultdict(int))
    total_type_lexicalisations = defaultdict(lambda: defaultdict(int))

    # initialize some caches to reduce the setup time of the individual processes
    dbp_store.get_types('')
    dbp_store.get_inverse_lexicalisations('')
    spacy_util.get_hearst_pairs('')

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        for hypernyms, type_lexicalisations in pool.imap_unordered(_compute_counts_for_resource, tqdm(_retrieve_plaintexts()), chunksize=1000):
            for (sub, obj), count in hypernyms.items():
                total_hypernyms[sub][obj] += count
            for (sub, obj), count in type_lexicalisations.items():
                total_type_lexicalisations[sub][obj] += count

    wikipedia_hypernyms = {word: dict(hypernym_counts) for word, hypernym_counts in total_hypernyms.items()}
    utils.update_cache('wikipedia_hypernyms', wikipedia_hypernyms)

    type_lexicalisations = {word: dict(type_counts) for word, type_counts in total_type_lexicalisations.items() if word not in STOP_WORDS}
    utils.update_cache('wikipedia_type_lexicalisations', type_lexicalisations)
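# Illustration (not part of the source): a minimal, self-contained sketch of the aggregation
# pattern used above, where per-article (pair -> count) dicts are merged into a nested
# word -> {word: count} mapping. The toy data stands in for the output of _compute_counts_for_resource.
from collections import defaultdict

def _merge_pair_counts(per_article_counts: list) -> dict:
    totals = defaultdict(lambda: defaultdict(int))
    for pair_counts in per_article_counts:
        for (sub, obj), count in pair_counts.items():
            totals[sub][obj] += count
    return {sub: dict(obj_counts) for sub, obj_counts in totals.items()}

# two toy articles that both yield the hypernym pair ('python', 'language')
assert _merge_pair_counts([{('python', 'language'): 2}, {('python', 'language'): 1}]) == {'python': {'language': 3}}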
def _assign_entity_types_for_section(df: pd.DataFrame, section_entity: str) -> pd.DataFrame:
    """Retrieve the types of section entities."""
    section_types = {}
    for ent in df[section_entity].unique():
        types = dbp_store.get_independent_types(dbp_store.get_types(dbp_util.name2resource(str(ent))))
        if types:
            section_types[ent] = dbp_util.type2name(list(types)[0])
    section_types = pd.Series(section_types, name=f'{section_entity}type')
    return pd.merge(how='left', left=df, right=section_types, left_on=section_entity, right_index=True)
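# Illustration (not part of the source): the merge above joins an entity -> type Series onto
# the frame via the Series index. The column names below ('TS_ent', 'TS_enttype') are made up
# for the example.
import pandas as pd

df_example = pd.DataFrame({'TS_ent': ['Paris', 'UnknownEntity']})
type_series = pd.Series({'Paris': 'City'}, name='TS_enttype')
merged = pd.merge(how='left', left=df_example, right=type_series, left_on='TS_ent', right_index=True)
assert merged.loc[0, 'TS_enttype'] == 'City'  # known entity gets its type
assert pd.isna(merged.loc[1, 'TS_enttype'])   # unknown entity stays NaN due to the left join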
def _compute_counts_for_resource(uri_with_text: tuple) -> tuple:
    uri, text = uri_with_text
    hypernyms = defaultdict(int)
    type_lexicalisations = defaultdict(int)
    for sub, obj in spacy_util.get_hearst_pairs(text):
        # collect hypernym statistics in Wikipedia
        hypernyms[(nlp_util.lemmatize_token(sub.root).lower(), nlp_util.lemmatize_token(obj.root).lower())] += 1

        # for each word, count the types that it refers to
        if uri not in dbp_store.get_inverse_lexicalisations(sub.text):
            continue  # discard, if the resource text does not refer to the subject of the article
        for t in dbp_store.get_types(uri):
            for word in obj:
                type_lexicalisations[(nlp_util.lemmatize_token(word).lower(), t)] += 1
    return hypernyms, type_lexicalisations
def _retrieve_training_data_wle(nlp: Language):
    listpages = list_store.get_parsed_listpages(wikipedia.ARTICLE_TYPE_ENUM)
    lp_to_cat_mapping = {lp: list_mapping.get_equivalent_categories(lp) | list_mapping.get_parent_categories(lp) for lp in listpages}
    lp_to_cat_mapping = {lp: cats for lp, cats in lp_to_cat_mapping.items() if cats}

    training_data = []
    # extract entities
    for lp, cats in lp_to_cat_mapping.items():
        lp_data = listpages[lp]
        for section_data in lp_data['sections']:
            for enum_data in section_data['enums']:
                for entry_data in enum_data:
                    text = entry_data['text']
                    if not text:
                        continue
                    entities = entry_data['entities']
                    if not entities:
                        continue
                    valid_entities = []
                    for entity_data in entities:
                        entity_uri = dbp_util.name2resource(entity_data['name'])
                        entity_tag = _get_tag_for_types(dbp_store.get_types(entity_uri))
                        if not entity_tag:
                            continue
                        entity_text = entity_data['text']
                        start = int(entity_data['idx'])
                        end = start + len(entity_text)
                        if end > len(text) or text[start:end] != entity_text:
                            continue
                        valid_entities.append((start, end, entity_tag))
                    if len(entities) == len(valid_entities):
                        training_data.append(Example.from_dict(nlp.make_doc(text), {'entities': valid_entities}))
    return training_data
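# Illustration (not part of the source): an entity mention is only kept when the character span
# reconstructed from its offset reproduces the surface form exactly, which guards against stale
# or shifted annotation offsets. The toy strings below are made up.
text_example = 'Guido van Rossum created Python'
entity_text_example = 'Python'
start_example = text_example.index(entity_text_example)
end_example = start_example + len(entity_text_example)
assert end_example <= len(text_example) and text_example[start_example:end_example] == entity_text_example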
def _compute_type_resource_scores(graph, node: str, direct_resources_only: bool) -> dict:
    node_resources = graph.get_resources_from_categories(node)
    if not direct_resources_only or len([r for r in node_resources if dbp_store.get_types(r)]) < 5:
        node_resources.update({r for sn in graph.descendants(node) for r in graph.get_resources_from_categories(sn)})
    node_resources = node_resources.intersection(dbp_store.get_resources())
    if len(node_resources) < 5:
        return {}  # better not return anything, if number of resources is too small
    type_counts = defaultdict(int)
    for res in node_resources:
        for t in dbp_store.get_transitive_types(res):
            type_counts[t] += 1
    return {t: count / len(node_resources) for t, count in type_counts.items()}
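# Illustration (not part of the source): the returned scores are relative type frequencies over
# the node's resources, i.e. score(t) = |{r : t in transitive_types(r)}| / |node_resources|.
# The toy mapping stands in for dbp_store.get_transitive_types.
from collections import defaultdict

toy_transitive_types = {
    'r1': {'dbo:Person', 'dbo:Agent'},
    'r2': {'dbo:Person', 'dbo:Agent'},
    'r3': {'dbo:Place'},
}
toy_type_counts = defaultdict(int)
for res_types in toy_transitive_types.values():
    for t in res_types:
        toy_type_counts[t] += 1
toy_scores = {t: count / len(toy_transitive_types) for t, count in toy_type_counts.items()}
assert toy_scores['dbo:Person'] == 2 / 3 and toy_scores['dbo:Place'] == 1 / 3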
def _assign_pagetypes(df: pd.DataFrame) -> pd.DataFrame:
    """Assign (most basic and most specific) page types to the existing dataframe."""
    data = []
    for page_name in df['P'].unique():
        if page_name.startswith('List of'):
            data.append((page_name, 'List', 'List'))
            continue
        page_uri = dbp_util.name2resource(page_name)
        P_types = dbp_store.get_independent_types(dbp_store.get_types(page_uri))
        if not P_types:
            data.append((page_name, 'Other', 'Other'))
            continue
        P_type = sorted(P_types)[0]
        P_basetype = _get_basetype(P_type)
        data.append((page_name, dbp_util.type2name(P_type), dbp_util.type2name(P_basetype)))
    return pd.merge(left=df, right=pd.DataFrame(data, columns=['P', 'P_type', 'P_basetype']), on='P')
def _compute_labeled_entities_for_listpage(page_uri: str, page_data: dict, graph) -> tuple:
    positive_SEs, negative_SEs = dict(), set()
    # compute potential subject entities for list page
    page_potential_SEs = {dbp_util.resource2name(res) for cat in _get_category_descendants_for_list(page_uri) for res in cat_store.get_resources(cat)}
    # compute types of list page
    page_types = {t for n in graph.get_nodes_for_part(page_uri) for t in dbp_store.get_independent_types(graph.get_transitive_dbpedia_types(n))}
    page_disjoint_types = {dt for t in page_types for dt in dbp_heur.get_disjoint_types(t)}
    # collect all linked entities on the page
    page_entities = {ent['name'] for s in page_data['sections'] for enum in s['enums'] for entry in enum for ent in entry['entities']}
    page_entities.update({ent['name'] for s in page_data['sections'] for table in s['tables'] for row in table['data'] for cell in row for ent in cell['entities']})
    for ent in page_entities:
        ent_uri = dbp_util.name2resource(ent)
        if not dbp_store.is_possible_resource(ent_uri):
            negative_SEs.add(ent)
        elif ent in page_potential_SEs:
            positive_SEs[ent] = _compute_entity_label(ent_uri)
        elif page_disjoint_types.intersection(dbp_store.get_types(ent_uri)):
            negative_SEs.add(ent)
    return positive_SEs, negative_SEs
def get_resources(self, node: str) -> set:
    """Return all resources of a node."""
    if node not in self._node_resources:
        disjoint_dbp_types = self.get_disjoint_dbp_types(node, transitive=True)
        dbp_resources = self.get_resources_from_categories(node) | {r for t in self.get_type_parts(node) for r in dbp_store.get_direct_resources_for_type(t)}
        dbp_resources = {r for r in dbp_resources if not disjoint_dbp_types.intersection(dbp_store.get_types(r))}
        self._node_resources[node] = {clg_util.dbp_resource2clg_resource(r) for r in dbp_resources}
        if self.use_listing_resources:
            self._node_resources[node].update(self.get_resources_from_listings(node))
    return self._node_resources[node]
def _generate_dbpedia_unknown_resources_graph():
    """Create graph of Figure 4b"""
    # retrieve data from extracted assertions
    cat2ax_relation_triples = pd.read_csv(util.get_results_file('results.cat2ax.relation_assertions'), sep=';')
    cat2ax_new_relation_resources = len({r for r in cat2ax_relation_triples['sub'].unique() if not dbp_store.get_properties(r)})
    cat2ax_type_triples = pd.read_csv(util.get_results_file('results.cat2ax.type_assertions'), sep=';')
    cat2ax_new_type_resources = len({r for r in cat2ax_type_triples['sub'].unique() if not dbp_store.get_types(r)})

    catriple_relation_triples = pd.read_csv(util.get_results_file('results.catriple.relation_assertions'), sep=';')
    catriple_new_relation_resources = len({r for r in catriple_relation_triples['sub'].unique() if not dbp_store.get_properties(r)})

    cdf_relation_triples = pd.read_csv(util.get_results_file('results.cdf.relation_assertions'), sep=';')
    cdf_new_relation_resources = len({r for r in cdf_relation_triples['sub'].unique() if not dbp_store.get_properties(r)})
    cdf_type_triples = pd.read_csv(util.get_results_file('results.cdf.type_assertions'), sep=';')
    cdf_new_type_resources = len({r for r in cdf_type_triples['sub'].unique() if not dbp_store.get_types(r)})

    # initialise bars
    bars_ca = [cat2ax_new_relation_resources, cat2ax_new_type_resources]
    bars_ct = [catriple_new_relation_resources, 0]
    bars_cdf = [cdf_new_relation_resources, cdf_new_type_resources]

    # arrange bars
    bar_width = 0.25
    r1 = np.arange(len(bars_ca))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]

    # make plot
    plt.figure(figsize=(8, 5))
    plt.bar(r1, bars_ca, color='#2d7f5e', width=bar_width, edgecolor='white', label='Cat2Ax')
    plt.bar(r2, bars_ct, color='darkgrey', width=bar_width, edgecolor='white', label='Catriple')
    plt.bar(r3, bars_cdf, color='black', width=bar_width, edgecolor='white', label='C-DF')
    plt.ylabel('Amount of resources', fontsize=16)
    plt.xticks([r + bar_width for r in range(len(bars_ca))], ['(1) Relations', '(2) Types'], fontsize=16)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=15)
    ax = plt.gca()
    ax.yaxis.grid()
    plt.savefig(util.get_results_file('results.graphs.dbpedia_unknown_resources'), bbox_inches='tight')
def _extract_axioms_with_rules(cat_dfs: dict) -> set:
    """Return axioms generated by applying C-DF rules."""
    # generate rule candidates by extracting shared pre-/postfixes
    cdf_rule_candidates = defaultdict(lambda: defaultdict(lambda: 0))
    for cat, (df, _) in cat_dfs.items():
        cat_label = cat_store.get_label(cat)
        for f in {f for f in df if f[0] != rdf_util.PREDICATE_TYPE}:
            if dbp_util.is_dbp_resource(f[1]):
                f_label = dbp_store._get_label_mapping()[f[1]] if f[1] in dbp_store._get_label_mapping() else dbp_util.object2name(f[1])
            else:
                f_label = f[1]
            if f_label in cat_label:
                first_words = cat_label[:cat_label.index(f_label)].strip()
                first_words = tuple(first_words.split(' ')) if first_words else tuple()
                last_words = cat_label[cat_label.index(f_label) + len(f_label):].strip()
                last_words = tuple(last_words.split(' ')) if last_words else tuple()
                if first_words or last_words:
                    f_types = dbp_store.get_independent_types(dbp_store.get_types(f[1])) if dbp_util.is_dbp_resource(f[1]) else set()
                    f_type = f_types.pop() if f_types else None
                    cdf_rule_candidates[(first_words, last_words)][((f[0], f_type), tuple(set(df).difference({f})))] += 1

    # filter rules using the threshold parameters min_support and beta
    cdf_rules = {}
    min_support = util.get_config('cdf.min_support')
    beta = util.get_config('cdf.beta')
    for word_patterns in cdf_rule_candidates:
        total_support = sum(cdf_rule_candidates[word_patterns].values())
        valid_axiom_patterns = [pattern for pattern, support in cdf_rule_candidates[word_patterns].items() if support >= min_support and (support / total_support) >= beta]
        if len(valid_axiom_patterns) > 0:
            cdf_rules[word_patterns] = valid_axiom_patterns[0]

    # apply the patterns to all categories in order to extract axioms
    # (the rules are applied individually depending on whether the pattern is at the front, at the back, or at front+back in order to reduce computational complexity)
    cdf_front_patterns = {word_patterns: axiom_pattern for word_patterns, axiom_pattern in cdf_rules.items() if word_patterns[0] and not word_patterns[1]}
    cdf_front_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_front_patterns.items():
        _fill_dict(cdf_front_pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    cdf_back_patterns = {word_patterns: axiom_pattern for word_patterns, axiom_pattern in cdf_rules.items() if not word_patterns[0] and word_patterns[1]}
    cdf_back_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_back_patterns.items():
        _fill_dict(cdf_back_pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    cdf_enclosing_patterns = {word_patterns: axiom_pattern for word_patterns, axiom_pattern in cdf_rules.items() if word_patterns[0] and word_patterns[1]}
    cdf_enclosing_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in cdf_enclosing_patterns.items():
        _fill_dict(cdf_enclosing_pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    rule_axioms = set()
    for cat in cat_store.get_usable_cats():
        rule_axioms.update(_apply_rules(cdf_front_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_back_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_enclosing_pattern_dict, cat))
    return rule_axioms
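# Illustration (not part of the source): the rule filter above keeps a candidate pattern only
# if its absolute support reaches min_support AND its share of all candidates for the same word
# pattern reaches beta. The toy key, pattern names, and thresholds below are made up.
toy_candidates = {('prefix', 'suffix'): {'pattern_A': 8, 'pattern_B': 2}}
toy_min_support, toy_beta = 5, 0.7
toy_total = sum(toy_candidates[('prefix', 'suffix')].values())
surviving = [p for p, s in toy_candidates[('prefix', 'suffix')].items() if s >= toy_min_support and s / toy_total >= toy_beta]
assert surviving == ['pattern_A']  # support 8 and share 0.8 pass; pattern_B fails both thresholds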
        best_df, score = max(candidate_dfs.items(), key=operator.itemgetter(1), default=(None, 0))
        if score > alpha:
            cat_dfs[cat] = (best_df, score)
    return cat_dfs


# create an index of resources and properties that converts string-uris to integers
# in order to speed up indexing and reduce complexity
resource_features = {res: {(k, v) for k, values in props.items() for v in values} for res, props in dbp_store.get_resource_property_mapping().items()}
for res in resource_features:
    for t in dbp_store.get_types(res):
        resource_features[res].add((rdf_util.PREDICATE_TYPE, t))
resource_to_idx_dict = {res: i for i, res in enumerate(resource_features)}
feature_to_idx_dict = defaultdict(set)
for res, feats in resource_features.items():
    res_idx = resource_to_idx_dict[res]
    for f in feats:
        feature_to_idx_dict[f].add(res_idx)


def _get_overall_features_count(feats: tuple, cat: str = None) -> int:
def _compute_entity_label(resource_uri: str) -> str:
    for t in dbp_store.get_types(resource_uri):
        if t in TYPE_LABEL_MAPPING:
            return TYPE_LABEL_MAPPING[t]
    return LABEL_OTHER
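# Illustration (not part of the source): the first type found in the mapping wins, and resources
# without any mapped type fall back to the 'other' label. The toy constants below are made up.
toy_label_mapping = {'dbo:Person': 'PER', 'dbo:Place': 'LOC'}
toy_entity_types = ['dbo:Agent', 'dbo:Person']
toy_label = next((toy_label_mapping[t] for t in toy_entity_types if t in toy_label_mapping), 'OTHER')
assert toy_label == 'PER'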
def rejects_resource(self, dbp_resource: str) -> bool:
    return self.value in {dt for t in dbp_store.get_types(dbp_resource) for dt in dbp_heur.get_disjoint_types(t)}
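# Illustration (not part of the source): with toy type and disjointness mappings standing in for
# dbp_store / dbp_heur, a resource is rejected when the node's own type (self.value) appears among
# the disjoint types of any of the resource's types.
toy_types = {'dbr:Paris': {'dbo:City'}}
toy_disjoint_types = {'dbo:City': {'dbo:Person', 'dbo:Event'}}
node_type = 'dbo:Person'
rejected = node_type in {dt for t in toy_types['dbr:Paris'] for dt in toy_disjoint_types[t]}
assert rejected  # a node typed dbo:Person rejects the resource dbr:Paris (a dbo:City)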