def _align_section_entity_types(df: pd.DataFrame) -> pd.DataFrame:
    """Align the types of section entities to the most common entity type aggregated by top-section.

    For each top-section (grouped by 'TS_text'), count the transitive DBpedia
    types over all of its section entities, take an independent type with the
    maximal count, and assign that type's name to every entity in the section
    that actually carries it. Entities that receive no new type keep their
    original 'S_enttype' value.

    Args:
        df: frame with at least the columns 'TS_text', 'S_ent' and 'S_enttype'.

    Returns:
        A copy of ``df`` where 'S_enttype' holds the aligned type names.
    """
    section_types = {}
    for ts, s_df in df.groupby('TS_text'):
        section_ents = set(s_df['S_ent'].unique())
        type_counter = defaultdict(int)
        for s_ent in section_ents:
            for t in dbp_store.get_transitive_types(dbp_util.name2resource(str(s_ent))):
                type_counter[t] += 1
        if not type_counter:
            continue  # no types found for any entity of this section
        # Hoisted out of the comprehension: the original recomputed
        # max(type_counter.values()) once per counted type (O(n^2)).
        max_count = max(type_counter.values())
        top_types = dbp_store.get_independent_types(
            {t for t, cnt in type_counter.items() if cnt == max_count})
        if top_types:
            top_type = next(iter(top_types))
            # Only entities that transitively carry the winning type get it.
            section_types.update({
                (ts, se): dbp_util.type2name(top_type)
                for se in section_ents
                if top_type in dbp_store.get_transitive_types(dbp_util.name2resource(str(se)))
            })
    section_types = pd.Series(section_types, name='S_enttype_new')
    df = pd.merge(how='left', left=df, right=section_types,
                  left_on=['TS_text', 'S_ent'], right_index=True)
    # Plain assignment instead of the deprecated inplace fillna on a column
    # slice, which triggers pandas' chained-assignment warning.
    df['S_enttype_new'] = df['S_enttype_new'].fillna(df['S_enttype'])
    return df.drop(columns='S_enttype').rename(columns={'S_enttype_new': 'S_enttype'})
def _assign_pagetypes(df: pd.DataFrame) -> pd.DataFrame:
    """Assign (most basic and most specific) page types to the existing dataframe.

    Pages named 'List of …' are typed as 'List'; pages without any independent
    DBpedia type fall back to 'Other'. The result gains the columns 'P_type'
    (most specific) and 'P_basetype' (most basic).
    """
    records = []
    for page_name in df['P'].unique():
        if page_name.startswith('List of'):
            records.append((page_name, 'List', 'List'))
            continue
        page_uri = dbp_util.name2resource(page_name)
        independent_types = dbp_store.get_independent_types(dbp_store.get_types(page_uri))
        if not independent_types:
            records.append((page_name, 'Other', 'Other'))
            continue
        # min() picks the same element as sorted(...)[0].
        specific_type = min(independent_types)
        basic_type = _get_basetype(specific_type)
        records.append((page_name,
                        dbp_util.type2name(specific_type),
                        dbp_util.type2name(basic_type)))
    type_df = pd.DataFrame(records, columns=['P', 'P_type', 'P_basetype'])
    return pd.merge(left=df, right=type_df, on='P')
def _create_relation_type_df(dfr: pd.DataFrame) -> pd.DataFrame:
    """Retrieve domains and ranges for predicates.

    For every distinct (pred, inv) pair, look up the predicate's domain when
    the relation is inverted, otherwise its range, defaulting to owl:Thing.
    DBpedia type URIs are converted to plain names.
    """
    rows = []
    for _, rel in dfr[['pred', 'inv']].drop_duplicates().iterrows():
        predicate = rel['pred']
        inverted = rel['inv']
        if inverted:
            entity_type = dbp_heur.get_domain(predicate)
        else:
            entity_type = dbp_heur.get_range(predicate)
        entity_type = entity_type or rdf_util.CLASS_OWL_THING
        if dbp_util.is_dbp_type(entity_type):
            entity_type = dbp_util.type2name(entity_type)
        rows.append({'pred': predicate, 'inv': inverted, 'E_predtype': entity_type})
    return pd.DataFrame(rows)
def _assign_entity_types_for_section(df: pd.DataFrame, section_entity: str) -> pd.DataFrame:
    """Retrieve the types of section entities.

    Looks up one independent DBpedia type per distinct value of the
    ``section_entity`` column and merges it in as '<section_entity>type'.
    Entities without an independent type are left untyped (NaN after merge).
    """
    ent_to_type = {}
    for ent in df[section_entity].unique():
        resource = dbp_util.name2resource(str(ent))
        independent_types = dbp_store.get_independent_types(dbp_store.get_types(resource))
        if independent_types:
            # next(iter(...)) grabs an arbitrary element, like list(...)[0].
            ent_to_type[ent] = dbp_util.type2name(next(iter(independent_types)))
    type_series = pd.Series(ent_to_type, name=f'{section_entity}type')
    return pd.merge(how='left', left=df, right=type_series,
                    left_on=section_entity, right_index=True)
def normalize_to_words(val: str) -> set:
    """Split *val* into lowercase tokens stripped of non-alphanumeric characters.

    DBpedia type URIs are first converted to their plain name before splitting.
    """
    text = dbp_util.type2name(val) if dbp_util.is_dbp_type(val) else val
    words = set()
    for token in text.split():
        words.add(normalizer_regex.sub('', token).lower())
    return words
obj_uri = dbp_util.name2resource(obj) if obj_uri in dbp_store.get_resources(): if cat in valid_axioms: valid_axioms[cat][1][subcat] = obj_uri else: valid_axioms[cat] = (pred, {subcat: obj_uri}) else: valid_axioms[cat] = (pred, subcats) return {(cat, pred, val) for pred, cat_vals in valid_axioms.values() for cat, val in cat_vals.items()} predicate_names = { dbp_util.type2name(pred).lower(): pred for pred in dbp_store.get_all_predicates() } normalizer_regex = re.compile(r'[^a-zA-Z0-9]') def normalize_val(val: str) -> str: if val.startswith('http://dbpedia.org/'): val = dbp_util.object2name(val) return normalizer_regex.sub('', val).lower() # --- ASSERTION EXTRACTION --- def _extract_assertions(axioms: set) -> list: