Beispiel #1
0
def _align_section_entity_types(df: pd.DataFrame) -> pd.DataFrame:
    """Align the types of section entities to the most common entity type aggregated by top-section.

    For every top-section (grouped by 'TS_text'), count the transitive DBpedia
    types over all of its section entities, pick one most-frequent independent
    type, and assign that type's name to every entity of the section that
    actually carries it. Entities without a newly assigned type keep their
    original 'S_enttype' value.
    """
    section_types = {}
    for ts, s_df in df.groupby('TS_text'):
        section_ents = set(s_df['S_ent'].unique())
        # count occurrences of each transitive type among the section's entities
        type_counter = defaultdict(int)
        for s_ent in section_ents:
            for t in dbp_store.get_transitive_types(
                    dbp_util.name2resource(str(s_ent))):
                type_counter[t] += 1
        if not type_counter:
            continue  # no entity has any type -> nothing to align here
        # hoisted: the original re-evaluated max() for every counter entry
        max_count = max(type_counter.values())
        top_types = dbp_store.get_independent_types(
            {t for t, cnt in type_counter.items() if cnt == max_count})
        if top_types:
            top_type = next(iter(top_types))
            section_types.update({
                (ts, se): dbp_util.type2name(top_type)
                for se in section_ents
                if top_type in dbp_store.get_transitive_types(
                    dbp_util.name2resource(str(se)))
            })
    section_types = pd.Series(section_types, name='S_enttype_new')
    df = pd.merge(how='left',
                  left=df,
                  right=section_types,
                  left_on=['TS_text', 'S_ent'],
                  right_index=True)
    # plain assignment instead of chained `inplace=True` fillna, which is
    # deprecated (pandas >= 2.1) and silently stops working under copy-on-write
    df['S_enttype_new'] = df['S_enttype_new'].fillna(df['S_enttype'])
    return df.drop(columns='S_enttype').rename(
        columns={'S_enttype_new': 'S_enttype'})
Beispiel #2
0
def _compute_inverse_type_frequencies() -> dict:
    """Return an inverse type frequency (ITF) score for every predicate.

    Analogous to IDF in information retrieval: predicates whose subjects span
    fewer distinct transitive types receive a higher score.
    """
    # for each predicate, gather the transitive types of all its subjects
    types_per_predicate = defaultdict(set)
    for resource in dbp_store.get_resources():
        for predicate in dbp_store.get_properties(resource):
            types_per_predicate[predicate].update(
                dbp_store.get_transitive_types(resource))

    total_type_count = len(dbp_store.get_all_types())
    itf_scores = {}
    for predicate in dbp_store.get_all_predicates():
        # +1 avoids division by zero for predicates with no typed subjects
        type_count = len(types_per_predicate[predicate])
        itf_scores[predicate] = math.log(total_type_count / (type_count + 1))
    return itf_scores
Beispiel #3
0
def _compute_property_frequencies() -> dict:
    """Return log-damped per-type property frequencies.

    Maps each (transitive) type to a mapping from predicate to a damped
    frequency score of `1 + log(count)` (0 when the count is not positive).
    Both mapping levels default to 0.0 for unseen keys.
    """
    raw_counts = defaultdict(lambda: defaultdict(int))
    for resource in dbp_store.get_resources():
        resource_types = dbp_store.get_transitive_types(resource)
        for predicate, values in dbp_store.get_properties(resource).items():
            value_count = len(values)
            for t in resource_types:
                raw_counts[t][predicate] += value_count

    # dampen counts logarithmically and rewrap as float-defaulting defaultdicts
    frequencies = defaultdict(lambda: defaultdict(float))
    for t, predicate_counts in raw_counts.items():
        damped = defaultdict(float)
        for predicate, count in predicate_counts.items():
            damped[predicate] = 1 + math.log(count) if count > 0 else 0
        frequencies[t] = damped
    return frequencies
Beispiel #4
0
def _compute_type_resource_scores(graph, node: str,
                                  direct_resources_only: bool) -> dict:
    """Return, per type, the fraction of the node's resources carrying it.

    Falls back to including resources of descendant nodes whenever direct
    resources are not required or too few (< 5) of them have types.
    """
    node_resources = graph.get_resources_from_categories(node)
    # short-circuit: the typed-resource count is only computed when needed
    if not direct_resources_only or sum(
            1 for r in node_resources if dbp_store.get_types(r)) < 5:
        for subnode in graph.descendants(node):
            node_resources.update(graph.get_resources_from_categories(subnode))
    node_resources = node_resources.intersection(dbp_store.get_resources())
    if len(node_resources) < 5:
        # better not return anything, if number of resources is too small
        return {}
    type_counts = defaultdict(int)
    for resource in node_resources:
        for t in dbp_store.get_transitive_types(resource):
            type_counts[t] += 1
    resource_total = len(node_resources)
    return {t: n / resource_total for t, n in type_counts.items()}
Beispiel #5
0
def _apply_rules(pattern_dict: dict, cat: str) -> set:
    """Apply rules from `pattern_dict` and return the implied axioms."""
    label_words = cat_store.get_label(cat).split(' ')

    axiom_patterns, pattern_lengths = _detect_pattern(pattern_dict, label_words)
    if not axiom_patterns:
        return set()

    (predicate, predicate_type), additional_axioms = axiom_patterns
    # strip the matched pattern words from the front/back (None = no strip)
    start = pattern_lengths[0] or None
    stop = -1 * pattern_lengths[1] or None
    resource = ' '.join(label_words[start:stop])

    if predicate_type:
        resource = dbp_util.name2resource(resource)
        # discard the match if the resource is unknown or lacks the required type
        if resource not in dbp_store.get_resources() \
                or predicate_type not in dbp_store.get_transitive_types(resource):
            return set()
    axioms = {(cat, predicate, resource)}
    axioms.update((cat, p, v) for p, v in additional_axioms)
    return axioms
Beispiel #6
0
def _compute_predicate_types(resource_property_mapping: dict,
                             threshold: float) -> dict:
    """Find, for each predicate, a single dominant subject type.

    A type is dominant when its share of the predicate's triples is maximal
    and reaches `threshold`. Ties are narrowed to the most specific types;
    predicates without an unambiguous (or equivalent) winner are omitted.
    """
    # per predicate: triple count per subject type, plus a '_sum' grand total
    type_distribution = defaultdict(lambda: defaultdict(int))
    for resource, property_mapping in resource_property_mapping.items():
        resource_types = dbp_store.get_transitive_types(resource)
        for predicate, values in property_mapping.items():
            triple_count = len(values)
            type_distribution[predicate]['_sum'] += triple_count
            for t in resource_types:
                type_distribution[predicate][t] += triple_count

    matching_types = {}
    for predicate, counts in type_distribution.items():
        total = counts['_sum']
        scores = {t: c / total for t, c in counts.items() if t != '_sum'}
        if not scores:
            continue
        best_score = max(scores.values())
        if best_score < threshold:
            continue
        candidates = {t for t, score in scores.items() if score == best_score}
        if len(candidates) > 1:
            # on ties, keep only the most specific candidates (drop any type
            # that has another candidate among its transitive subtypes)
            candidates = {
                t for t in candidates
                if not candidates.intersection(
                    dbp_store.get_transitive_subtypes(t))
            }
        if len(candidates) == 1 or dbp_store.are_equivalent_types(candidates):
            matching_types[predicate] = candidates.pop()
    return matching_types
Beispiel #7
0
def _get_lines_dbpedia_instance_types(graph) -> list:
    """Serialize new types for DBpedia resources in DBpedia namespace."""
    new_dbpedia_types = defaultdict(set)
    for node in graph.nodes:
        node_types = graph.get_transitive_dbpedia_types(node,
                                                        force_recompute=True)
        # expand to the full supertype closure, excluding owl:Thing
        type_closure = set()
        for t in node_types:
            type_closure.update(dbp_store.get_transitive_supertype_closure(t))
        type_closure.discard(rdf_util.CLASS_OWL_THING)
        for resource in graph.get_resources(node):
            dbp_res = clg_util.clg_resource2dbp_resource(resource)
            if dbp_res in dbp_store.get_resources():
                # only record types the resource does not already have
                known_types = dbp_store.get_transitive_types(dbp_res)
                new_dbpedia_types[dbp_res].update(type_closure - known_types)
            else:
                new_dbpedia_types[dbp_res].update(type_closure)
    triples = []
    for res, types in new_dbpedia_types.items():
        for t in types:
            triples.append(
                serialize_util.as_object_triple(res, rdf_util.PREDICATE_TYPE, t))
    return triples
Beispiel #8
0
 def accepts_resource(self, dbp_resource: str) -> bool:
     """Return True iff this object's value is among the resource's transitive types."""
     resource_types = dbp_store.get_transitive_types(dbp_resource)
     return self.value in resource_types