def _get_lines_dbpedia_instance_transitive_caligraph_types(graph) -> list:
    """Serialize transitive CaLiGraph types for DBpedia resources."""
    # Precompute the full ancestor closure per node in one top-down pass:
    # when a node is visited, every parent's closure is already complete.
    ancestor_closure = defaultdict(set)
    for node in graph.traverse_nodes_topdown():
        direct_parents = graph.parents(node)
        node_ancestors = set(direct_parents)
        for parent in direct_parents:
            node_ancestors.update(ancestor_closure[parent])
        ancestor_closure[node] = node_ancestors
    triples = []
    for clg_res in graph.get_all_resources():
        dbp_res = clg_util.clg_resource2dbp_resource(clg_res)
        if dbp_res not in dbp_store.get_resources():
            continue
        assigned_types = graph.get_nodes_for_resource(clg_res)
        # a type is "direct" if it is not an ancestor of another assigned type
        inherited = {anc for t in assigned_types for anc in ancestor_closure[t]}
        direct_types = assigned_types.difference(inherited)
        transitive_types = {anc for t in direct_types for anc in graph.ancestors(t)}
        transitive_types.difference_update(direct_types | {rdf_util.CLASS_OWL_THING})
        for t in transitive_types:
            triples.append(serialize_util.as_object_triple(dbp_res, rdf_util.PREDICATE_TYPE, t))
    return triples
def _compute_inverse_type_frequencies() -> dict:
    """Compute an inverse-type-frequency score for every DBpedia predicate.

    A predicate co-occurring with few distinct types is more type-specific and
    thus receives a higher score (IDF-style logarithmic weighting).
    """
    types_per_predicate = defaultdict(set)
    for resource in dbp_store.get_resources():
        for predicate in dbp_store.get_properties(resource):
            types_per_predicate[predicate].update(dbp_store.get_transitive_types(resource))
    total_type_count = len(dbp_store.get_all_types())
    scores = {}
    for predicate in dbp_store.get_all_predicates():
        # +1 in the denominator avoids division by zero for unseen predicates
        scores[predicate] = math.log(total_type_count / (len(types_per_predicate[predicate]) + 1))
    return scores
def _get_resource_surface_scores(text):
    """Return resource lexicalisation scores for the given text."""
    scores = {}
    if not text:
        return scores
    # the literal text is always treated as a perfect match
    scores[text] = 1
    direct_match = dbp_store.resolve_redirect(dbp_util.name2resource(text))
    if direct_match in dbp_store.get_resources():
        scores[direct_match] = 1
    # insert lexicalisation matches in ascending frequency order
    inverse_lexicalisations = dbp_store.get_inverse_lexicalisations(text.lower())
    for matched_resource, frequency in sorted(inverse_lexicalisations.items(), key=operator.itemgetter(1)):
        scores[matched_resource] = frequency
    return scores
def _compute_property_frequencies() -> dict:
    """Return log-damped frequencies of (type, predicate) co-occurrences.

    The result maps every transitive type to a mapping from predicate to a
    dampened count; missing keys default to 0.0 on lookup.
    """
    raw_counts = defaultdict(lambda: defaultdict(int))
    for resource in dbp_store.get_resources():
        transitive_types = dbp_store.get_transitive_types(resource)
        for predicate, values in dbp_store.get_properties(resource).items():
            value_count = len(values)
            for t in transitive_types:
                raw_counts[t][predicate] += value_count
    # dampen counts logarithmically to limit the impact of very frequent predicates
    frequencies = defaultdict(lambda: defaultdict(float))
    for t, predicate_counts in raw_counts.items():
        damped = {predicate: (1 + math.log(count) if count > 0 else 0) for predicate, count in predicate_counts.items()}
        frequencies[t] = defaultdict(float, damped)
    return frequencies
def _get_lines_instances_dbpedia_mapping(graph) -> list:
    """Serialize DBpedia mapping for resources."""
    mapping_lines = []
    # also consider resources that only occur as axiom values
    axiom_value_resources = set()
    for node in graph.nodes:
        for axiom in graph.get_axioms(node, transitive=False):
            if clg_util.is_clg_resource(axiom[1]):
                axiom_value_resources.add(axiom[1])
    for clg_res in graph.get_all_resources() | axiom_value_resources:
        dbp_res = clg_util.clg_resource2dbp_resource(clg_res)
        if dbp_res in dbp_store.get_resources():
            mapping_lines.append(serialize_util.as_object_triple(clg_res, rdf_util.PREDICATE_SAME_AS, dbp_res))
    return mapping_lines
def _extract_axioms(patterns: dict) -> set:
    """Return the axioms extracted by applying the patterns to Wikipedia categories.

    `patterns` maps a category to a tuple (sub, pred, subcats), where `pred` may
    be empty when the predicate still has to be discovered by voting.
    Returns a set of (category, predicate, value) triples — one per subcategory.
    """
    axioms = {}
    for cat, (sub, pred, subcats) in patterns.items():
        if pred:  # simple mapping of label to predicate (case 1)
            if pred.lower() in predicate_names:
                axioms[cat] = (sub, predicate_names[pred.lower()], subcats)
        else:  # Voting required to discover Z (case 2)
            # count, per candidate predicate, how many resources of the
            # subcategories actually carry the expected (normalized) value
            predicate_counts = defaultdict(int)
            for subcat, value in subcats.items():
                value = normalize_val(value)
                for res in cat_store.get_resources(subcat):
                    for pred, values in dbp_store.get_properties(res).items():
                        normalized_values = {normalize_val(val) for val in values}
                        if value in normalized_values:
                            predicate_counts[pred] += 1
            if predicate_counts:
                # elect the predicate with the highest vote count
                pred = max(predicate_counts.items(), key=operator.itemgetter(1))[0]
                axioms[cat] = (sub, pred, subcats)
    # map values to dbpedia resources if necessary (only possible if we have an object property)
    valid_axioms = {}
    for cat in axioms:
        _, pred, subcats = axioms[cat]
        if dbp_store.is_object_property(pred):
            # keep only subcategory values that resolve to known DBpedia resources
            for subcat, obj in subcats.items():
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)
    # NOTE: `cat` here is rebound to the subcategory keys of cat_vals, so the
    # emitted axioms are attached to the subcategories, not the parent category.
    return {(cat, pred, val) for pred, cat_vals in valid_axioms.values() for cat, val in cat_vals.items()}
def _get_lines_dbpedia_instances(graph) -> list:
    """Serialize new DBpedia resources in DBpedia namespace."""
    lines = []
    mapped_resources = {clg_util.clg_resource2dbp_resource(res) for res in graph.get_all_resources()}
    # only resources unknown to DBpedia are serialized as new individuals
    for instance in mapped_resources.difference(dbp_store.get_resources()):
        lines.append(serialize_util.as_object_triple(instance, rdf_util.PREDICATE_TYPE, rdf_util.CLASS_OWL_NAMED_INDIVIDUAL))
        instance_label = graph.get_label(clg_util.dbp_resource2clg_resource(instance))
        if instance_label:
            lines.append(serialize_util.as_literal_triple(instance, rdf_util.PREDICATE_LABEL, instance_label))
    return lines
def _compute_type_resource_scores(graph, node: str, direct_resources_only: bool) -> dict:
    """Return, per transitive DBpedia type, the fraction of the node's resources having it.

    If `direct_resources_only` is False (or the node has too few typed direct
    resources), the resources of all descendant nodes are included as well.
    Returns an empty dict when the resulting resource sample is too small to
    yield reliable scores.
    """
    min_resource_count = 5  # below this, frequency estimates are too noisy
    node_resources = graph.get_resources_from_categories(node)
    # count typed resources lazily (short-circuits when descendants are requested anyway)
    if not direct_resources_only or sum(1 for r in node_resources if dbp_store.get_types(r)) < min_resource_count:
        node_resources.update(r for sn in graph.descendants(node) for r in graph.get_resources_from_categories(sn))
    node_resources = node_resources.intersection(dbp_store.get_resources())
    resource_count = len(node_resources)
    if resource_count < min_resource_count:
        return {}  # better not return anything, if number of resources is too small
    type_counts = defaultdict(int)
    for res in node_resources:
        for t in dbp_store.get_transitive_types(res):
            type_counts[t] += 1
    return {t: count / resource_count for t, count in type_counts.items()}
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply the rules from `pattern_dict` to a category and return the implied axioms."""
    words = cat_store.get_label(cat).split(' ')
    detected_patterns, pattern_lengths = _detect_pattern(pattern_dict, words)
    if not detected_patterns:
        return set()
    (main_pred, pred_type), additional_axioms = detected_patterns
    # strip the matched pattern words from both ends; a zero-length pattern
    # becomes None so the slice keeps that side of the label intact
    start = pattern_lengths[0] if pattern_lengths[0] else None
    end = -pattern_lengths[1] if pattern_lengths[1] else None
    resource = ' '.join(words[start:end])
    if pred_type:
        resource = dbp_util.name2resource(resource)
        resource_is_known = resource in dbp_store.get_resources()
        if not resource_is_known or pred_type not in dbp_store.get_transitive_types(resource):
            return set()
    implied_axioms = {(cat, main_pred, resource)}
    implied_axioms.update((p, v) and (cat, p, v) for p, v in additional_axioms)
    return implied_axioms
def get_resource_stats(self, node: str) -> dict:
    """Return resource stats of a node (i.e. resource count and property count).

    Direct counts cover the node's own resources; transitive counts aggregate
    all children recursively. Results are memoized per node.
    """
    if node not in self._node_resource_stats:
        direct_resources = 0
        direct_new_resources = 0
        direct_properties = defaultdict(int)
        trans_resources = 0
        trans_new_resources = 0
        trans_properties = defaultdict(int)
        for res in self.get_resources_from_categories(node):
            if res not in dbp_store.get_resources():
                direct_new_resources += 1
                trans_new_resources += 1
                continue
            direct_resources += 1
            trans_resources += 1
            for pred, values in dbp_store.get_properties(res).items():
                for val in values:
                    direct_properties[(pred, val)] += 1
                    trans_properties[(pred, val)] += 1
        # fold in the (already transitive) stats of all children
        for child in self.children(node):
            child_stats = self.get_resource_stats(child)
            trans_resources += child_stats['transitive_resource_count']
            trans_new_resources += child_stats['transitive_new_resource_count']
            for prop, count in child_stats['transitive_property_counts'].items():
                trans_properties[prop] += count
        self._node_resource_stats[node] = {
            'resource_count': direct_resources,
            'new_resource_count': direct_new_resources,
            'property_counts': direct_properties,
            'transitive_resource_count': trans_resources,
            'transitive_new_resource_count': trans_new_resources,
            'transitive_property_counts': trans_properties
        }
    return self._node_resource_stats[node]
def _get_lines_dbpedia_instance_types(graph) -> list:
    """Serialize new types for DBpedia resources in DBpedia namespace."""
    new_types_per_resource = defaultdict(set)
    for node in graph.nodes:
        node_types = graph.get_transitive_dbpedia_types(node, force_recompute=True)
        # expand to the full supertype closure, excluding owl:Thing
        type_closure = set()
        for t in node_types:
            type_closure.update(dbp_store.get_transitive_supertype_closure(t))
        type_closure.discard(rdf_util.CLASS_OWL_THING)
        for res in graph.get_resources(node):
            dbp_res = clg_util.clg_resource2dbp_resource(res)
            if dbp_res in dbp_store.get_resources():
                # keep only types DBpedia does not already assert
                new_types_per_resource[dbp_res].update(type_closure.difference(dbp_store.get_transitive_types(dbp_res)))
            else:
                new_types_per_resource[dbp_res].update(type_closure)
    lines = []
    for dbp_res, types in new_types_per_resource.items():
        for t in types:
            lines.append(serialize_util.as_object_triple(dbp_res, rdf_util.PREDICATE_TYPE, t))
    return lines
def _get_lines_dbpedia_instance_relations(graph) -> list:
    """Serialize new facts for DBpedia resources in DBpedia namespace.

    A fact is new if the resource is unknown to DBpedia, or if the
    (predicate, value) pair is not yet asserted for it.
    """
    new_instance_relations = set()
    for node in graph.nodes:
        for prop, val in graph.get_axioms(node):
            dbp_prop = clg_util.clg_type2dbp_type(prop)
            dbp_val = clg_util.clg_resource2dbp_resource(val) if clg_util.is_clg_resource(val) else val
            for res in graph.get_resources(node):
                dbp_res = clg_util.clg_resource2dbp_resource(res)
                if dbp_res not in dbp_store.get_resources():
                    new_instance_relations.add((dbp_res, dbp_prop, dbp_val))
                    continue
                # fetch the property map once instead of twice per check
                existing_values = dbp_store.get_properties(dbp_res).get(dbp_prop, [])
                if dbp_val not in existing_values:
                    new_instance_relations.add((dbp_res, dbp_prop, dbp_val))
    lines_dbpedia_instance_relations = []
    for s, p, o in new_instance_relations:
        if dbp_util.is_dbp_resource(o):
            lines_dbpedia_instance_relations.append(serialize_util.as_object_triple(s, p, o))
        else:
            lines_dbpedia_instance_relations.append(serialize_util.as_literal_triple(s, p, o))
    return lines_dbpedia_instance_relations
def _generate_dbpedia_coverage_graph():
    """Create graph of Figure 4a.

    Loads axiom/assertion result files of the Cat2Ax, Catriple and C-DF
    approaches, computes each approach's coverage of DBpedia categories,
    resources and predicates, and saves a grouped bar chart.
    """
    # retrieve data from extracted axioms and assertions
    # result files are semicolon-separated CSVs with at least 'cat', 'sub', 'pred' columns
    cat2ax_relation_axioms = pd.read_csv(util.get_results_file('results.cat2ax.relation_axioms'), sep=';')
    cat2ax_type_axioms = pd.read_csv(util.get_results_file('results.cat2ax.type_axioms'), sep=';')
    cat2ax_relation_triples = pd.read_csv(util.get_results_file('results.cat2ax.relation_assertions'), sep=';')
    cat2ax_type_triples = pd.read_csv(util.get_results_file('results.cat2ax.type_assertions'), sep=';')
    catriple_relation_axioms = pd.read_csv(util.get_results_file('results.catriple.relation_axioms'), sep=';')
    catriple_relation_triples = pd.read_csv(util.get_results_file('results.catriple.relation_assertions'), sep=';')
    cdf_relation_axioms = pd.read_csv(util.get_results_file('results.cdf.relation_axioms'), sep=';')
    cdf_type_axioms = pd.read_csv(util.get_results_file('results.cdf.type_axioms'), sep=';')
    cdf_relation_triples = pd.read_csv(util.get_results_file('results.cdf.relation_assertions'), sep=';')
    cdf_type_triples = pd.read_csv(util.get_results_file('results.cdf.type_assertions'), sep=';')
    # retrieve unique entity counts
    # categories covered: union of categories appearing in relation and type axioms
    cat2ax_cat_count = len(set(cat2ax_relation_axioms['cat'].unique()) | set(cat2ax_type_axioms['cat'].unique()))
    catriple_cat_count = len(set(catriple_relation_axioms['cat'].unique()))
    cdf_cat_count = len(set(cdf_relation_axioms['cat'].unique()) | set(cdf_type_axioms['cat'].unique()))
    total_cat_count = len(cat_store.get_usable_cats())
    # predicates covered: only predicates with at least 100 assertions are counted
    cat2ax_preds = cat2ax_relation_triples.groupby(by='pred').count()
    cat2ax_pred_count = len(cat2ax_preds[cat2ax_preds['sub'] >= 100].index)
    catriple_preds = catriple_relation_triples.groupby(by='pred').count()
    catriple_pred_count = len(catriple_preds[catriple_preds['sub'] >= 100].index)
    cdf_preds = cdf_relation_triples.groupby(by='pred').count()
    cdf_pred_count = len(cdf_preds[cdf_preds['sub'] >= 100].index)
    total_pred_count = len(dbp_store.get_all_predicates())
    # resources covered: union of assertion subjects
    cat2ax_res_count = len(set(cat2ax_relation_triples['sub'].unique()) | set(cat2ax_type_triples['sub'].unique()))
    catriple_res_count = len(set(catriple_relation_triples['sub'].unique()))
    cdf_res_count = len(set(cdf_relation_triples['sub'].unique()) | set(cdf_type_triples['sub'].unique()))
    total_res_count = len(dbp_store.get_resources())
    # initialise bars (coverage fractions per approach: categories, resources, properties)
    bars_ca = [cat2ax_cat_count / total_cat_count, cat2ax_res_count / total_res_count, cat2ax_pred_count / total_pred_count]
    bars_ct = [catriple_cat_count / total_cat_count, catriple_res_count / total_res_count, catriple_pred_count / total_pred_count]
    bars_cdf = [cdf_cat_count / total_cat_count, cdf_res_count / total_res_count, cdf_pred_count / total_pred_count]
    # arrange bars (three side-by-side bar groups, offset by bar_width)
    bar_width = 0.25
    r1 = np.arange(len(bars_ca))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]
    # make plot
    plt.figure(figsize=(8, 5))
    plt.bar(r1, bars_ca, color='#2d7f5e', width=bar_width, edgecolor='white', label='Cat2Ax')
    plt.bar(r2, bars_ct, color='darkgrey', width=bar_width, edgecolor='white', label='Catriple')
    plt.bar(r3, bars_cdf, color='black', width=bar_width, edgecolor='white', label='C-DF')
    plt.ylabel('Fraction of items covered', fontsize=16)
    # center the x tick labels under the middle bar of each group
    plt.xticks([r + bar_width for r in range(len(bars_ca))], ['(1) Categories', '(2) Resources', '(3) Properties'], fontsize=16)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=15)
    ax = plt.gca()
    ax.yaxis.grid()
    plt.savefig(util.get_results_file('results.graphs.dbpedia_coverage'), bbox_inches='tight')