def _align_section_entity_types(df: pd.DataFrame) -> pd.DataFrame:
    """Align the types of section entities to the most common entity type aggregated by top-section.

    For every top-section ('TS_text'), count the transitive DBpedia types of all
    section entities, reduce the most frequent ones to independent types, and
    assign the winning type's name to every entity that actually carries it.
    Entities without a newly assigned type keep their original 'S_enttype'.

    :param df: frame with at least the columns 'TS_text', 'S_ent', 'S_enttype'
    :return: copy of `df` with 'S_enttype' replaced by the aligned types
    """
    section_types = {}
    for ts, s_df in df.groupby('TS_text'):
        section_ents = set(s_df['S_ent'].unique())
        type_counter = defaultdict(int)
        for s_ent in section_ents:
            for t in dbp_store.get_transitive_types(dbp_util.name2resource(str(s_ent))):
                type_counter[t] += 1
        if not type_counter:
            continue  # no entity in this section has any type -> nothing to align
        # hoist the max out of the comprehension condition (was recomputed per item)
        max_count = max(type_counter.values())
        top_types = dbp_store.get_independent_types(
            {t for t, cnt in type_counter.items() if cnt == max_count})
        if top_types:
            top_type = list(top_types)[0]
            # only assign the new type to entities that actually have it
            section_types.update({
                (ts, se): dbp_util.type2name(top_type)
                for se in section_ents
                if top_type in dbp_store.get_transitive_types(dbp_util.name2resource(str(se)))
            })
    section_types = pd.Series(section_types, name='S_enttype_new')
    df = pd.merge(how='left', left=df, right=section_types,
                  left_on=['TS_text', 'S_ent'], right_index=True)
    # column-level inplace fillna is deprecated (pandas 2.x) and unreliable under
    # copy-on-write (pandas 3.0); assign the filled column instead
    df['S_enttype_new'] = df['S_enttype_new'].fillna(df['S_enttype'])
    return df.drop(columns='S_enttype').rename(columns={'S_enttype_new': 'S_enttype'})
def _compute_inverse_type_frequencies() -> dict:
    """Compute an inverse-type-frequency score for every DBpedia predicate.

    Analogous to IDF: a predicate used by resources of many different types
    gets a low score, a predicate restricted to few types a high one. The
    denominator is smoothed with +1 so predicates seen with no type are safe.

    :return: mapping from predicate to its inverse type frequency
    """
    predicate_types = defaultdict(set)
    for r in dbp_store.get_resources():
        # hoist the loop-invariant type lookup out of the predicate loop
        # (was recomputed once per predicate of each resource)
        r_types = dbp_store.get_transitive_types(r)
        for pred in dbp_store.get_properties(r):
            predicate_types[pred].update(r_types)
    overall_type_count = len(dbp_store.get_all_types())
    return {
        pred: math.log(overall_type_count / (len(predicate_types[pred]) + 1))
        for pred in dbp_store.get_all_predicates()
    }
def _compute_property_frequencies() -> dict:
    """Compute log-dampened frequencies of predicate usage per DBpedia type.

    First accumulates, for every transitive type of every resource, how many
    property values the resources of that type have per predicate. The raw
    counts are then dampened to `1 + log(count)` (0 for a zero count).

    :return: nested defaultdict mapping type -> predicate -> dampened frequency
    """
    raw_counts = defaultdict(lambda: defaultdict(int))
    for resource in dbp_store.get_resources():
        resource_types = dbp_store.get_transitive_types(resource)
        for predicate, values in dbp_store.get_properties(resource).items():
            value_count = len(values)
            for rt in resource_types:
                raw_counts[rt][predicate] += value_count
    frequencies = defaultdict(lambda: defaultdict(float))
    for rt, predicate_counts in raw_counts.items():
        dampened = defaultdict(float)
        for predicate, count in predicate_counts.items():
            dampened[predicate] = 1 + math.log(count) if count > 0 else 0
        frequencies[rt] = dampened
    return frequencies
def _compute_type_resource_scores(graph, node: str, direct_resources_only: bool) -> dict:
    """Score each DBpedia type by the fraction of the node's resources carrying it.

    Resources of descendant nodes are pulled in when sub-resources are allowed
    or when fewer than 5 direct resources have any type. Returns an empty dict
    when fewer than 5 known resources remain, since scores would be unreliable.

    :return: mapping from type to the share of the node's resources having it
    """
    node_resources = graph.get_resources_from_categories(node)
    # short-circuit keeps the typed-resource scan from running when not needed
    if not direct_resources_only or sum(1 for r in node_resources if dbp_store.get_types(r)) < 5:
        for subnode in graph.descendants(node):
            node_resources.update(graph.get_resources_from_categories(subnode))
    node_resources = node_resources.intersection(dbp_store.get_resources())
    if len(node_resources) < 5:
        # better not return anything, if number of resources is too small
        return {}
    type_counts = defaultdict(int)
    for res in node_resources:
        for t in dbp_store.get_transitive_types(res):
            type_counts[t] += 1
    resource_total = len(node_resources)
    return {t: count / resource_total for t, count in type_counts.items()}
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules from `pattern_dict` to the category and return the implied axioms.

    The category label is matched against the patterns; the words covered by
    the front/back pattern are stripped and the remainder becomes the resource
    of the axiom. When the rule asserts a predicate type, the resource must be
    a known DBpedia resource of that type, otherwise no axioms are produced.

    :return: set of (category, predicate, value) axioms, possibly empty
    """
    cat_words = cat_store.get_label(cat).split(' ')
    axiom_patterns, pattern_lengths = _detect_pattern(pattern_dict, cat_words)
    if not axiom_patterns:
        return set()
    (pred, pred_type), additional_axioms = axiom_patterns
    front_length, back_length = pattern_lengths
    start_idx = front_length or None       # 0 -> None: nothing to strip in front
    end_idx = -back_length or None         # 0 -> None: nothing to strip at the back
    resource = ' '.join(cat_words[start_idx:end_idx])
    if pred_type:
        resource = dbp_util.name2resource(resource)
        if resource not in dbp_store.get_resources() or pred_type not in dbp_store.get_transitive_types(resource):
            return set()
    axioms = {(cat, pred, resource)}
    axioms.update((p, v) for p, v in ())  # placeholder removed below
    axioms = {(cat, pred, resource)} | {(cat, p, v) for p, v in additional_axioms}
    return axioms
def _compute_predicate_types(resource_property_mapping: dict, threshold: float) -> dict:
    """Find, for each predicate, the single dominant subject type among its triples.

    :param resource_property_mapping: mapping resource -> {predicate -> set of values}
    :param threshold: minimum relative frequency a type must reach to be accepted
    :return: mapping predicate -> dominant type (only predicates with a clear winner)
    """
    # Count triples per predicate broken down by the subject's transitive types;
    # the reserved key '_sum' accumulates the predicate's total triple count.
    predicate_type_distribution = defaultdict(lambda: defaultdict(int))
    for r in resource_property_mapping:
        for pred, values in resource_property_mapping[r].items():
            triple_count = len(values)
            predicate_type_distribution[pred]['_sum'] += triple_count
            for t in dbp_store.get_transitive_types(r):
                predicate_type_distribution[pred][t] += triple_count
    matching_types = {}
    for pred in predicate_type_distribution:
        t_sum = predicate_type_distribution[pred]['_sum']
        # relative frequency of each type among this predicate's triples
        t_scores = {
            t: t_count / t_sum
            for t, t_count in predicate_type_distribution[pred].items()
            if t != '_sum'
        }
        if t_scores:
            t_score_max = max(t_scores.values())
            if t_score_max >= threshold:
                # all types tied at the maximum score
                type_candidates = {
                    t for t, t_score in t_scores.items() if t_score == t_score_max
                }
                if len(type_candidates) > 1:
                    # keep only the most specific candidates: drop any candidate
                    # that has another candidate among its transitive subtypes
                    type_candidates = {
                        t for t in type_candidates
                        if not type_candidates.intersection(
                            dbp_store.get_transitive_subtypes(t))
                    }
                # accept only an unambiguous winner (or mutually equivalent types)
                if len(type_candidates) == 1 or dbp_store.are_equivalent_types(
                        type_candidates):
                    matching_types[pred] = type_candidates.pop()
    return matching_types
def _get_lines_dbpedia_instance_types(graph) -> list:
    """Serialize new types for DBpedia resources in DBpedia namespace."""
    new_dbpedia_types = defaultdict(set)
    for node in graph.nodes:
        node_types = graph.get_transitive_dbpedia_types(node, force_recompute=True)
        # expand the node's types to their full supertype closure, excluding owl:Thing
        type_closure = set()
        for t in node_types:
            type_closure.update(dbp_store.get_transitive_supertype_closure(t))
        type_closure.discard(rdf_util.CLASS_OWL_THING)
        for res in graph.get_resources(node):
            dbp_res = clg_util.clg_resource2dbp_resource(res)
            if dbp_res in dbp_store.get_resources():
                # known resource: only record types it does not already have
                new_dbpedia_types[dbp_res].update(
                    type_closure.difference(dbp_store.get_transitive_types(dbp_res)))
            else:
                new_dbpedia_types[dbp_res].update(type_closure)
    triples = []
    for res, types in new_dbpedia_types.items():
        for t in types:
            triples.append(serialize_util.as_object_triple(res, rdf_util.PREDICATE_TYPE, t))
    return triples
def accepts_resource(self, dbp_resource: str) -> bool:
    """Return True iff `dbp_resource` transitively carries this restriction's type value."""
    resource_types = dbp_store.get_transitive_types(dbp_resource)
    return self.value in resource_types