Example #1
0
def _align_section_entity_types(df: pd.DataFrame) -> pd.DataFrame:
    """Align the types of section entities to the most common entity type aggregated by top-section.

    For each top-section (grouped on 'TS_text'), count the transitive types of all
    section entities, keep the independent types among the most frequent ones, and
    re-label every entity that carries the winning type. Entities without a new
    type keep their original 'S_enttype'.

    :param df: frame with at least the columns 'TS_text', 'S_ent' and 'S_enttype'
    :return: copy of `df` with the 'S_enttype' column realigned
    """
    section_types = {}
    for ts, s_df in df.groupby('TS_text'):
        section_ents = set(s_df['S_ent'].unique())
        type_counter = defaultdict(int)
        for s_ent in section_ents:
            for t in dbp_store.get_transitive_types(
                    dbp_util.name2resource(str(s_ent))):
                type_counter[t] += 1
        if not type_counter:
            continue
        # Hoisted out of the comprehension: the original recomputed
        # max(type_counter.values()) once per type (accidental O(T^2)).
        max_count = max(type_counter.values())
        top_types = dbp_store.get_independent_types(
            {t for t, cnt in type_counter.items() if cnt == max_count})
        if top_types:
            # sorted() makes the tie-break deterministic; list(set)[0] picked
            # an arbitrary element that could differ between runs.
            top_type = sorted(top_types)[0]
            section_types.update({
                (ts, se): dbp_util.type2name(top_type)
                for se in section_ents
                if top_type in dbp_store.get_transitive_types(
                    dbp_util.name2resource(str(se)))
            })
    section_types = pd.Series(section_types, name='S_enttype_new')
    df = pd.merge(how='left',
                  left=df,
                  right=section_types,
                  left_on=['TS_text', 'S_ent'],
                  right_index=True)
    # Plain assignment instead of the deprecated inplace fillna on a column.
    df['S_enttype_new'] = df['S_enttype_new'].fillna(df['S_enttype'])
    return df.drop(columns='S_enttype').rename(
        columns={'S_enttype_new': 'S_enttype'})
Example #2
0
def _assign_pagetypes(df: pd.DataFrame) -> pd.DataFrame:
    """Assign (most basic and most specific) page types to the existing dataframe."""
    records = []
    for name in df['P'].unique():
        if name.startswith('List of'):
            records.append((name, 'List', 'List'))
        else:
            uri = dbp_util.name2resource(name)
            independent = dbp_store.get_independent_types(
                dbp_store.get_types(uri))
            if independent:
                specific = sorted(independent)[0]
                basic = _get_basetype(specific)
                records.append((name, dbp_util.type2name(specific),
                                dbp_util.type2name(basic)))
            else:
                # No type information available for this page.
                records.append((name, 'Other', 'Other'))
    type_df = pd.DataFrame(records, columns=['P', 'P_type', 'P_basetype'])
    return pd.merge(left=df, right=type_df, on='P')
Example #3
0
def _create_relation_type_df(dfr: pd.DataFrame) -> pd.DataFrame:
    """Retrieve domains and ranges for predicates."""
    rows = []
    for _, r in dfr[['pred', 'inv']].drop_duplicates().iterrows():
        predicate = r['pred']
        inverted = r['inv']
        # Inverted predicates use the domain, regular ones the range.
        if inverted:
            entity_type = dbp_heur.get_domain(predicate)
        else:
            entity_type = dbp_heur.get_range(predicate)
        if not entity_type:
            entity_type = rdf_util.CLASS_OWL_THING
        if dbp_util.is_dbp_type(entity_type):
            entity_type = dbp_util.type2name(entity_type)
        rows.append({
            'pred': predicate,
            'inv': inverted,
            'E_predtype': entity_type
        })
    return pd.DataFrame(rows)
Example #4
0
def _assign_entity_types_for_section(df: pd.DataFrame,
                                     section_entity: str) -> pd.DataFrame:
    """Retrieve the types of section entities.

    :param df: frame containing a `section_entity` column of entity names
    :param section_entity: name of the column holding the entities
    :return: `df` left-merged with a new '<section_entity>type' column
    """
    ent_types = {}
    for ent in df[section_entity].unique():
        types = dbp_store.get_independent_types(
            dbp_store.get_types(dbp_util.name2resource(str(ent))))
        if types:
            # sorted() gives a deterministic pick; list(set)[0] returned an
            # arbitrary element that could vary between runs.
            ent_types[ent] = dbp_util.type2name(sorted(types)[0])
    type_series = pd.Series(ent_types, name=f'{section_entity}type')
    return pd.merge(how='left',
                    left=df,
                    right=type_series,
                    left_on=section_entity,
                    right_index=True)
Example #5
0
def normalize_to_words(val: str) -> set:
    """Split *val* into a set of lowercased, alphanumeric-only words."""
    name = dbp_util.type2name(val) if dbp_util.is_dbp_type(val) else val
    words = set()
    for token in name.split():
        words.add(normalizer_regex.sub('', token).lower())
    return words
Example #6
0
                obj_uri = dbp_util.name2resource(obj)
                if obj_uri in dbp_store.get_resources():
                    if cat in valid_axioms:
                        valid_axioms[cat][1][subcat] = obj_uri
                    else:
                        valid_axioms[cat] = (pred, {subcat: obj_uri})
        else:
            valid_axioms[cat] = (pred, subcats)

    return {(cat, pred, val)
            for pred, cat_vals in valid_axioms.values()
            for cat, val in cat_vals.items()}


# Lookup table from lowercased predicate name to its full predicate URI,
# built once at import time from all known predicates.
predicate_names = {
    dbp_util.type2name(pred).lower(): pred
    for pred in dbp_store.get_all_predicates()
}
# Matches every non-alphanumeric character; used by the normalize_* helpers
# to strip punctuation and whitespace.
normalizer_regex = re.compile(r'[^a-zA-Z0-9]')


def normalize_val(val: str) -> str:
    """Normalize *val* to a lowercase alphanumeric string.

    DBpedia URIs are first converted to their plain object name.
    """
    text = dbp_util.object2name(val) if val.startswith('http://dbpedia.org/') else val
    return normalizer_regex.sub('', text).lower()


# --- ASSERTION EXTRACTION ---


def _extract_assertions(axioms: set) -> list: