# Stdlib/third-party imports used in this section; project-internal helpers (utils,
# nlp_util, cat_store, cat_util, list_util, dbp_store, _get_confidence_pattern_set,
# _fill_dict, _find_axioms, _extract_axioms_for_cat, ...) are defined in the
# enclosing modules.
import multiprocessing as mp
import operator
from collections import defaultdict

from tqdm import tqdm


def _extract_axioms(category_graph, patterns):
    """Return axioms extracted from `category_graph` by applying `patterns` to all categories."""
    utils.get_logger().debug('CATEGORY/CAT2AX: Extracting axioms..')

    # process front/back/front+back patterns individually to reduce computational complexity
    front_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, False).items():
        _fill_dict(front_pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    back_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, False, True).items():
        _fill_dict(back_pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    enclosing_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, True).items():
        _fill_dict(enclosing_pattern_dict, list(front_pattern), lambda d: _fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    # prepare one context tuple per category and extract axioms in parallel
    cat_contexts = [(
        cat,
        nlp_util.remove_by_phrase(cat_store.get_label(cat)),
        cat_store.get_statistics(cat),
        front_pattern_dict,
        back_pattern_dict,
        enclosing_pattern_dict
    ) for cat in category_graph.content_nodes]
    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        category_axioms = {cat: axioms for cat, axioms in tqdm(pool.imap_unordered(_extract_axioms_for_cat, cat_contexts, chunksize=1000), total=len(cat_contexts), desc='CATEGORY/CAT2AX: Extracting axioms')}
    category_axioms = {cat: axioms for cat, axioms in category_axioms.items() if axioms}  # discard categories without any extracted axioms

    utils.get_logger().debug(f'CATEGORY/CAT2AX: Extracted {sum(len(axioms) for axioms in category_axioms.values())} axioms for {len(category_axioms)} categories.')
    return category_axioms
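# A minimal sketch of the `_fill_dict` helper used above, for illustration only; the
# real implementation lives elsewhere in this module. Its call pattern implies that it
# nests `keys` into `dictionary` and, at the end of the key chain, either invokes a
# callable value on the innermost dict (this is how the reversed back-pattern words get
# filled in) or stores the value directly; the terminal key '<axioms>' is a hypothetical
# placeholder. Note that the callable must be invoked eagerly, within the same loop
# iteration, since the late-binding lambdas above would otherwise all capture the last
# pattern of the loop.
def _fill_dict_sketch(dictionary: dict, keys: list, value):
    if not keys:
        if callable(value):
            value(dictionary)  # hand the innermost dict to the nested fill
        else:
            dictionary['<axioms>'] = value  # hypothetical terminal key
        return
    if keys[0] not in dictionary:
        dictionary[keys[0]] = {}
    _fill_dict_sketch(dictionary[keys[0]], keys[1:], value)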
def _compute_category_sets() -> dict:
    """Iterate over DBpedia categories and identify all category sets.

    1) Retrieve all usable categories (i.e. categories that are not used for maintenance/organisational purposes)
    2) Normalize their names by removing by-phrases (e.g. "X by genre", "Y by country")
    3) For each category, retrieve all its children and search for name patterns (see `_find_child_sets`)
    """
    category_sets = {}
    for cat in cat_store.get_categories():
        children_docs = {c: nlp_util.remove_by_phrase(cat_store.get_label(c)) for c in cat_store.get_children(cat)}
        child_sets = _find_child_sets(cat, children_docs)
        if child_sets:
            category_sets[cat] = child_sets
    return category_sets
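# Illustrative consumption of the result (the summary below is made up; the keys are
# whatever category identifiers `cat_store.get_categories` yields, the values whatever
# `_find_child_sets` returns):
def _demo_category_set_stats():
    category_sets = _compute_category_sets()
    print(f'Found child sets for {len(category_sets)} of the usable categories.')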
def _without_by_phrase(with_by_phrase: str, without_by_phrase: str):
    """Assert that removing the by-phrase from `with_by_phrase` yields `without_by_phrase` (case-insensitive)."""
    with_removed_by_phrase = nlp_util.remove_by_phrase(with_by_phrase, return_doc=False)
    assert with_removed_by_phrase.lower() == without_by_phrase.lower(), f'{with_by_phrase} should be converted to {without_by_phrase}'
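# Example assertions with this helper (the label pairs are illustrative, mirroring the
# by-phrase examples given in the docstring of `_compute_category_sets`):
def test_remove_by_phrase_examples():
    _without_by_phrase('Albums by genre', 'Albums')
    _without_by_phrase('Musicians by country', 'Musicians')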
def _get_match_for_category(category: str, first_words: tuple, last_words: tuple) -> str:
    """Return the variable part of the category name, i.e. the tokens between `first_words` and `last_words`."""
    doc = nlp_util.remove_by_phrase(cat_store.get_label(category))
    return doc[len(first_words):len(doc) - len(last_words)].text
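# Worked example (the category URI and its label are hypothetical): for a category whose
# normalized label is 'Rivers of France', the tokens covered by `first_words` and
# `last_words` are sliced away and only the variable part remains.
def _demo_get_match():
    # expected to return 'France', assuming the label resolves as described above
    return _get_match_for_category('http://dbpedia.org/resource/Category:Rivers_of_France', ('Rivers', 'of'), tuple())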
def _extract_axioms(graph, patterns) -> dict:
    """Run the actual axiom extraction on CaLiGraph."""
    utils.get_logger().debug('CaLi2Ax: Extracting axioms..')
    axioms = defaultdict(set)

    # process front/back/front+back patterns individually to reduce computational complexity
    front_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, False).items():
        cat_axioms._fill_dict(front_pattern_dict, list(front_pattern), lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    back_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, False, True).items():
        cat_axioms._fill_dict(back_pattern_dict, list(front_pattern), lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    enclosing_pattern_dict = {}
    for (front_pattern, back_pattern), axiom_patterns in _get_confidence_pattern_set(patterns, True, True).items():
        cat_axioms._fill_dict(enclosing_pattern_dict, list(front_pattern), lambda d: cat_axioms._fill_dict(d, list(reversed(back_pattern)), axiom_patterns))

    for node in graph.content_nodes:
        property_frequencies = graph.get_property_frequencies(node)

        # collect the labels of all parts of the node (categories, list categories, list pages)
        node_labels = set()
        for part in graph.get_parts(node):
            if cat_util.is_category(part):
                node_labels.add(cat_util.category2name(part))
            elif list_util.is_listcategory(part) or list_util.is_listpage(part):
                node_labels.add(list_util.list2name(part))

        labels_without_by_phrases = [nlp_util.remove_by_phrase(label, return_doc=True) for label in node_labels]
        for node_doc in labels_without_by_phrases:
            node_axioms = []

            front_prop_axiom = _find_axioms(front_pattern_dict, node, node_doc, property_frequencies)
            if front_prop_axiom:
                node_axioms.append(front_prop_axiom)

            back_prop_axiom = _find_axioms(back_pattern_dict, node, node_doc, property_frequencies)
            if back_prop_axiom:
                node_axioms.append(back_prop_axiom)

            enclosing_prop_axiom = _find_axioms(enclosing_pattern_dict, node, node_doc, property_frequencies)
            if enclosing_prop_axiom:
                node_axioms.append(enclosing_prop_axiom)

            # group the found axioms by predicate and keep only the best axiom per predicate
            prop_axioms_by_pred = {a[1]: {x for x in node_axioms if x[1] == a[1]} for a in node_axioms}
            for pred, similar_prop_axioms in prop_axioms_by_pred.items():
                if dbp_store.is_object_property(pred):
                    # discard axioms whose resource label is a proper substring of another axiom's resource label
                    res_labels = {a[2]: dbp_store.get_label(a[2]) for a in similar_prop_axioms}
                    similar_prop_axioms = {a for a in similar_prop_axioms if all(res_labels[a[2]] == val or res_labels[a[2]] not in val for val in res_labels.values())}
                best_prop_axiom = max(similar_prop_axioms, key=operator.itemgetter(3))
                axioms[node].add(best_prop_axiom)

    utils.get_logger().debug(f'CaLi2Ax: Extracted {sum(len(node_axioms) for node_axioms in axioms.values())} axioms for {len(axioms)} categories.')
    return axioms
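# Toy illustration of the per-predicate selection above, using made-up axiom tuples.
# The tuple layout (node, predicate, value, confidence) is inferred from the indices
# used in `_extract_axioms` (a[1] = predicate, a[2] = value, a[3] = confidence); the
# predicate/resource names below are hypothetical.
def _demo_best_axiom_selection():
    node_axioms = [
        ('node', 'dbo:genre', 'dbr:Rock_music', 0.81),
        ('node', 'dbo:genre', 'dbr:Pop_music', 0.64),
        ('node', 'dbo:country', 'dbr:France', 0.90),
    ]
    prop_axioms_by_pred = {a[1]: {x for x in node_axioms if x[1] == a[1]} for a in node_axioms}
    # for every predicate, only the highest-confidence axiom survives
    return {pred: max(axs, key=operator.itemgetter(3)) for pred, axs in prop_axioms_by_pred.items()}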