Example No. 1
def extract_wiki_corpus_resources():
    """Crawl the Wikipedia corpus for hearst patterns to retrieve hypernyms and type lexicalisations."""
    if utils.load_cache('wikipedia_type_lexicalisations') is not None:
        return  # only compute hypernyms and type lexicalisations if they do not exist already

    utils.get_logger().info(
        'WIKIPEDIA/NIF: Computing wikipedia hypernyms and type lexicalisations..'
    )
    total_hypernyms = defaultdict(lambda: defaultdict(int))
    total_type_lexicalisations = defaultdict(lambda: defaultdict(int))

    # initialize some caches to reduce the setup time of the individual processes
    dbp_store.get_types('')
    dbp_store.get_inverse_lexicalisations('')
    spacy_util.get_hearst_pairs('')

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        for hypernyms, type_lexicalisations in pool.imap_unordered(
                _compute_counts_for_resource,
                tqdm(_retrieve_plaintexts()),
                chunksize=1000):
            for (sub, obj), count in hypernyms.items():
                total_hypernyms[sub][obj] += count
            for (sub, obj), count in type_lexicalisations.items():
                total_type_lexicalisations[sub][obj] += count

    wikipedia_hypernyms = {
        word: dict(hypernym_counts)
        for word, hypernym_counts in total_hypernyms.items()
    }
    utils.update_cache('wikipedia_hypernyms', wikipedia_hypernyms)

    type_lexicalisations = {
        word: dict(type_counts)
        for word, type_counts in total_type_lexicalisations.items()
        if word not in STOP_WORDS
    }
    utils.update_cache('wikipedia_type_lexicalisations', type_lexicalisations)
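
The aggregation step above, which merges each worker's (sub, obj) count dict into the nested totals, can be tried in isolation. A minimal sketch, using hard-coded worker results in place of the multiprocessing pool and the Hearst-pattern extraction:

# Minimal sketch of the aggregation above; the worker_results list stands in
# for the output of _compute_counts_for_resource.
from collections import defaultdict

def merge_pair_counts(per_resource_counts: list) -> dict:
    totals = defaultdict(lambda: defaultdict(int))
    for counts in per_resource_counts:
        for (sub, obj), count in counts.items():
            totals[sub][obj] += count
    return {sub: dict(obj_counts) for sub, obj_counts in totals.items()}

worker_results = [
    {('lion', 'animal'): 2, ('lion', 'predator'): 1},
    {('lion', 'animal'): 1, ('oak', 'tree'): 3},
]
print(merge_pair_counts(worker_results))
# {'lion': {'animal': 3, 'predator': 1}, 'oak': {'tree': 3}}
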
Example No. 2
def _assign_entity_types_for_section(df: pd.DataFrame,
                                     section_entity: str) -> pd.DataFrame:
    """Retrieve the types of section entities."""
    section_types = {}
    for ent in df[section_entity].unique():
        types = dbp_store.get_independent_types(
            dbp_store.get_types(dbp_util.name2resource(str(ent))))
        if types:
            section_types[ent] = dbp_util.type2name(list(types)[0])
    section_types = pd.Series(section_types, name=f'{section_entity}type')
    return pd.merge(how='left',
                    left=df,
                    right=section_types,
                    left_on=section_entity,
                    right_index=True)
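
The merge in Example No. 2 joins a named pandas Series onto a DataFrame column via left_on/right_index. A minimal, self-contained sketch with illustrative column names and a hard-coded type mapping standing in for the dbp_store lookups:

# Minimal sketch of the merge pattern above; 'ent' and the type mapping are
# illustrative stand-ins for the section-entity column and the DBpedia lookups.
import pandas as pd

df = pd.DataFrame({'ent': ['Berlin', 'Paris', 'Unknown'], 'value': [1, 2, 3]})
section_types = pd.Series({'Berlin': 'City', 'Paris': 'City'}, name='enttype')
merged = pd.merge(how='left', left=df, right=section_types,
                  left_on='ent', right_index=True)
print(merged)
#        ent  value enttype
# 0   Berlin      1    City
# 1    Paris      2    City
# 2  Unknown      3     NaN
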
Example No. 3
def _compute_counts_for_resource(uri_with_text: tuple) -> tuple:
    uri, text = uri_with_text
    hypernyms = defaultdict(int)
    type_lexicalisations = defaultdict(int)
    for sub, obj in spacy_util.get_hearst_pairs(text):
        # collect hypernym statistics in Wikipedia
        hypernyms[(nlp_util.lemmatize_token(sub.root).lower(),
                   nlp_util.lemmatize_token(obj.root).lower())] += 1

        # for each word, count the types that it refers to
        if uri not in dbp_store.get_inverse_lexicalisations(sub.text):
            continue  # discard, if the resource text does not refer to the subject of the article
        for t in dbp_store.get_types(uri):
            for word in obj:
                type_lexicalisations[(nlp_util.lemmatize_token(word).lower(),
                                      t)] += 1
    return hypernyms, type_lexicalisations
Example No. 4
def _retrieve_training_data_wle(nlp: Language):
    listpages = list_store.get_parsed_listpages(wikipedia.ARTICLE_TYPE_ENUM)
    lp_to_cat_mapping = {
        lp: list_mapping.get_equivalent_categories(lp)
        | list_mapping.get_parent_categories(lp)
        for lp in listpages
    }
    lp_to_cat_mapping = {
        lp: cats
        for lp, cats in lp_to_cat_mapping.items() if cats
    }

    training_data = []
    # extract entities
    for lp, cats in lp_to_cat_mapping.items():
        lp_data = listpages[lp]
        for section_data in lp_data['sections']:
            for enum_data in section_data['enums']:
                for entry_data in enum_data:
                    text = entry_data['text']
                    if not text:
                        continue
                    entities = entry_data['entities']
                    if not entities:
                        continue
                    valid_entities = []
                    for entity_data in entities:
                        entity_uri = dbp_util.name2resource(
                            entity_data['name'])
                        entity_tag = _get_tag_for_types(
                            dbp_store.get_types(entity_uri))
                        if not entity_tag:
                            continue
                        entity_text = entity_data['text']
                        start = int(entity_data['idx'])
                        end = start + len(entity_text)
                        if end > len(text) or text[start:end] != entity_text:
                            continue
                        valid_entities.append((start, end, entity_tag))
                    if len(entities) == len(valid_entities):
                        training_data.append(
                            Example.from_dict(nlp.make_doc(text),
                                              {'entities': valid_entities}))
    return training_data
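
Example No. 4 ultimately builds spaCy training data from character offsets. A minimal sketch of that last step, with a blank English pipeline and hand-picked offsets and labels instead of the list-page and DBpedia lookups:

# Minimal sketch of building a spaCy training Example from (start, end, tag)
# offsets, as in the final step above; text, offsets and tags are toy values.
import spacy
from spacy.training import Example

nlp = spacy.blank('en')
text = 'Berlin is the capital of Germany.'
entities = [(0, 6, 'LOCATION'), (25, 32, 'LOCATION')]
example = Example.from_dict(nlp.make_doc(text), {'entities': entities})
print(example.reference.ents)  # (Berlin, Germany)
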
Example No. 5
def _compute_type_resource_scores(graph, node: str,
                                  direct_resources_only: bool) -> dict:
    node_resources = graph.get_resources_from_categories(node)
    if not direct_resources_only or len(
        [r for r in node_resources if dbp_store.get_types(r)]) < 5:
        node_resources.update({
            r
            for sn in graph.descendants(node)
            for r in graph.get_resources_from_categories(sn)
        })
    node_resources = node_resources.intersection(dbp_store.get_resources())
    if len(node_resources) < 5:
        return {}  # do not return any scores if the number of resources is too small
    type_counts = defaultdict(int)
    for res in node_resources:
        for t in dbp_store.get_transitive_types(res):
            type_counts[t] += 1
    return {t: count / len(node_resources) for t, count in type_counts.items()}
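
The scoring in Example No. 5 is simply the share of a node's resources that carry each transitive type. A minimal sketch with a hard-coded type lookup standing in for dbp_store.get_transitive_types:

# Minimal sketch of the type scoring above; transitive_types is a toy
# replacement for dbp_store.get_transitive_types.
from collections import defaultdict

transitive_types = {
    'Berlin': {'City', 'PopulatedPlace', 'Place'},
    'Paris': {'City', 'PopulatedPlace', 'Place'},
    'Rhine': {'River', 'Place'},
}

def type_resource_scores(resources: set) -> dict:
    type_counts = defaultdict(int)
    for res in resources:
        for t in transitive_types.get(res, set()):
            type_counts[t] += 1
    return {t: count / len(resources) for t, count in type_counts.items()}

print(type_resource_scores({'Berlin', 'Paris', 'Rhine'}))
# Place: 1.0, City and PopulatedPlace: ~0.67, River: ~0.33
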
Example No. 6
def _assign_pagetypes(df: pd.DataFrame) -> pd.DataFrame:
    """Assign (most basic and most specific) page types to the existing dataframe."""
    data = []
    for page_name in df['P'].unique():
        if page_name.startswith('List of'):
            data.append((page_name, 'List', 'List'))
            continue
        page_uri = dbp_util.name2resource(page_name)
        P_types = dbp_store.get_independent_types(
            dbp_store.get_types(page_uri))
        if not P_types:
            data.append((page_name, 'Other', 'Other'))
            continue
        P_type = sorted(P_types)[0]
        P_basetype = _get_basetype(P_type)
        data.append((page_name, dbp_util.type2name(P_type),
                     dbp_util.type2name(P_basetype)))
    return pd.merge(left=df,
                    right=pd.DataFrame(data,
                                       columns=['P', 'P_type', 'P_basetype']),
                    on='P')
Example No. 7
def _compute_labeled_entities_for_listpage(page_uri: str, page_data: dict,
                                           graph) -> tuple:
    positive_SEs, negative_SEs = dict(), set()
    # compute potential subject entities for list page
    page_potential_SEs = {
        dbp_util.resource2name(res)
        for cat in _get_category_descendants_for_list(page_uri)
        for res in cat_store.get_resources(cat)
    }
    # compute types of list page
    page_types = {
        t
        for n in graph.get_nodes_for_part(page_uri)
        for t in dbp_store.get_independent_types(
            graph.get_transitive_dbpedia_types(n))
    }
    page_disjoint_types = {
        dt
        for t in page_types for dt in dbp_heur.get_disjoint_types(t)
    }
    # collect all linked entities on the page
    page_entities = {
        ent['name']
        for s in page_data['sections'] for enum in s['enums'] for entry in enum
        for ent in entry['entities']
    }
    page_entities.update({
        ent['name']
        for s in page_data['sections'] for table in s['tables']
        for row in table['data'] for cell in row for ent in cell['entities']
    })
    for ent in page_entities:
        ent_uri = dbp_util.name2resource(ent)
        if not dbp_store.is_possible_resource(ent_uri):
            negative_SEs.add(ent)
        elif ent in page_potential_SEs:
            positive_SEs[ent] = _compute_entity_label(ent_uri)
        elif page_disjoint_types.intersection(dbp_store.get_types(ent_uri)):
            negative_SEs.add(ent)
    return positive_SEs, negative_SEs
Example No. 8
 def get_resources(self, node: str) -> set:
     """Return all resources of a node."""
     if node not in self._node_resources:
         disjoint_dbp_types = self.get_disjoint_dbp_types(node,
                                                          transitive=True)
         dbp_resources = self.get_resources_from_categories(node) | {
             r
             for t in self.get_type_parts(node)
             for r in dbp_store.get_direct_resources_for_type(t)
         }
         dbp_resources = {
             r
             for r in dbp_resources
             if not disjoint_dbp_types.intersection(dbp_store.get_types(r))
         }
         self._node_resources[node] = {
             clg_util.dbp_resource2clg_resource(r)
             for r in dbp_resources
         }
         if self.use_listing_resources:
             self._node_resources[node].update(
                 self.get_resources_from_listings(node))
     return self._node_resources[node]
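
The core of Example No. 8 is the disjointness filter: a resource is kept only if none of its DBpedia types is disjoint with the node. A minimal sketch with hard-coded type sets standing in for dbp_store.get_types and the node's disjoint types:

# Minimal sketch of the disjointness filter above; both structures are toy
# stand-ins for dbp_store.get_types and get_disjoint_dbp_types.
resource_types = {
    'Berlin': {'City', 'Place'},
    'Albert_Einstein': {'Person', 'Scientist'},
}
disjoint_types = {'Person'}  # types a resource of this node must not have

kept = {r for r, types in resource_types.items()
        if not disjoint_types.intersection(types)}
print(kept)  # {'Berlin'}
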
Example No. 9
def _generate_dbpedia_unknown_resources_graph():
    """Create graph of Figure 4b"""
    # retrieve data from extracted assertions
    cat2ax_relation_triples = pd.read_csv(
        util.get_results_file('results.cat2ax.relation_assertions'), sep=';')
    cat2ax_new_relation_resources = len({
        r
        for r in cat2ax_relation_triples['sub'].unique()
        if not dbp_store.get_properties(r)
    })
    cat2ax_type_triples = pd.read_csv(
        util.get_results_file('results.cat2ax.type_assertions'), sep=';')
    cat2ax_new_type_resources = len({
        r
        for r in cat2ax_type_triples['sub'].unique()
        if not dbp_store.get_types(r)
    })

    catriple_relation_triples = pd.read_csv(
        util.get_results_file('results.catriple.relation_assertions'), sep=';')
    catriple_new_relation_resources = len({
        r
        for r in catriple_relation_triples['sub'].unique()
        if not dbp_store.get_properties(r)
    })

    cdf_relation_triples = pd.read_csv(
        util.get_results_file('results.cdf.relation_assertions'), sep=';')
    cdf_new_relation_resources = len({
        r
        for r in cdf_relation_triples['sub'].unique()
        if not dbp_store.get_properties(r)
    })
    cdf_type_triples = pd.read_csv(
        util.get_results_file('results.cdf.type_assertions'), sep=';')
    cdf_new_type_resources = len({
        r
        for r in cdf_type_triples['sub'].unique() if not dbp_store.get_types(r)
    })

    # initialise bars
    bars_ca = [cat2ax_new_relation_resources, cat2ax_new_type_resources]
    bars_ct = [catriple_new_relation_resources, 0]
    bars_cdf = [cdf_new_relation_resources, cdf_new_type_resources]

    # arrange bars
    bar_width = 0.25
    r1 = np.arange(len(bars_ca))
    r2 = [x + bar_width for x in r1]
    r3 = [x + bar_width for x in r2]

    # make plot
    plt.figure(figsize=(8, 5))
    plt.bar(r1,
            bars_ca,
            color='#2d7f5e',
            width=bar_width,
            edgecolor='white',
            label='Cat2Ax')
    plt.bar(r2,
            bars_ct,
            color='darkgrey',
            width=bar_width,
            edgecolor='white',
            label='Catriple')
    plt.bar(r3,
            bars_cdf,
            color='black',
            width=bar_width,
            edgecolor='white',
            label='C-DF')
    plt.ylabel('Amount of resources', fontsize=16)
    plt.xticks([r + bar_width for r in range(len(bars_ca))],
               ['(1) Relations', '(2) Types'],
               fontsize=16)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=15)
    ax = plt.gca()
    ax.yaxis.grid()

    plt.savefig(
        util.get_results_file('results.graphs.dbpedia_unknown_resources'),
        bbox_inches='tight')
Example No. 10
File: cdf.py Project: nheist/Cat2Ax
def _extract_axioms_with_rules(cat_dfs: dict) -> set:
    """Return axioms genered by applying C-DF rules."""

    # generate rule candidates by extracting shared pre-/postfixes
    cdf_rule_candidates = defaultdict(lambda: defaultdict(lambda: 0))
    for cat, (df, _) in cat_dfs.items():
        cat_label = cat_store.get_label(cat)
        for f in {f for f in df if f[0] != rdf_util.PREDICATE_TYPE}:
            if dbp_util.is_dbp_resource(f[1]):
                label_mapping = dbp_store._get_label_mapping()
                f_label = label_mapping[f[1]] if f[1] in label_mapping else dbp_util.object2name(f[1])
            else:
                f_label = f[1]
            if f_label in cat_label:
                first_words = cat_label[:cat_label.index(f_label)].strip()
                first_words = tuple(
                    first_words.split(' ')) if first_words else tuple()
                last_words = cat_label[cat_label.index(f_label) +
                                       len(f_label):].strip()
                last_words = tuple(
                    last_words.split(' ')) if last_words else tuple()
                if first_words or last_words:
                    f_types = dbp_store.get_independent_types(
                        dbp_store.get_types(f[1])) if dbp_util.is_dbp_resource(
                            f[1]) else set()
                    f_type = f_types.pop() if f_types else None
                    remaining_facts = tuple(set(df).difference({f}))
                    cdf_rule_candidates[(first_words, last_words)][((f[0], f_type), remaining_facts)] += 1

    # filter rules using the threshold parameters min_support and beta
    cdf_rules = {}
    min_support = util.get_config('cdf.min_support')
    beta = util.get_config('cdf.beta')
    for word_patterns in cdf_rule_candidates:
        total_support = sum(cdf_rule_candidates[word_patterns].values())
        valid_axiom_patterns = [
            pattern
            for pattern, support in cdf_rule_candidates[word_patterns].items()
            if support >= min_support and (support / total_support) >= beta
        ]

        if len(valid_axiom_patterns) > 0:
            cdf_rules[word_patterns] = valid_axiom_patterns[0]

    # apply the patterns to all categories in order to extract axioms
    # (the rules are applied individually depending on whether the pattern is at the front, back, or front+back in order to reduce computational complexity)
    cdf_front_patterns = {
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if word_patterns[0] and not word_patterns[1]
    }
    cdf_front_pattern_dict = {}
    for (front_pattern,
         back_pattern), axiom_patterns in cdf_front_patterns.items():
        _fill_dict(
            cdf_front_pattern_dict, list(front_pattern), lambda d: _fill_dict(
                d, list(reversed(back_pattern)), axiom_patterns))

    cdf_back_patterns = {
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if not word_patterns[0] and word_patterns[1]
    }
    cdf_back_pattern_dict = {}
    for (front_pattern,
         back_pattern), axiom_patterns in cdf_back_patterns.items():
        _fill_dict(
            cdf_back_pattern_dict, list(front_pattern), lambda d: _fill_dict(
                d, list(reversed(back_pattern)), axiom_patterns))

    cdf_enclosing_patterns = {
        word_patterns: axiom_pattern
        for word_patterns, axiom_pattern in cdf_rules.items()
        if word_patterns[0] and word_patterns[1]
    }
    cdf_enclosing_pattern_dict = {}
    for (front_pattern,
         back_pattern), axiom_patterns in cdf_enclosing_patterns.items():
        _fill_dict(
            cdf_enclosing_pattern_dict,
            list(front_pattern), lambda d: _fill_dict(
                d, list(reversed(back_pattern)), axiom_patterns))

    rule_axioms = set()
    for cat in cat_store.get_usable_cats():
        rule_axioms.update(_apply_rules(cdf_front_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_back_pattern_dict, cat))
        rule_axioms.update(_apply_rules(cdf_enclosing_pattern_dict, cat))
    return rule_axioms
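
The rule-candidate step in Example No. 10 splits a category label into the words before and after an embedded fact label. A minimal sketch of just that splitting, with toy strings and none of the C-DF bookkeeping:

# Minimal sketch of the pre-/postfix extraction above (toy strings only).
def split_around(cat_label: str, f_label: str) -> tuple:
    idx = cat_label.index(f_label)
    first_words = cat_label[:idx].strip()
    last_words = cat_label[idx + len(f_label):].strip()
    return (tuple(first_words.split(' ')) if first_words else tuple(),
            tuple(last_words.split(' ')) if last_words else tuple())

print(split_around('Songs written by Bob Dylan', 'Bob Dylan'))
# (('Songs', 'written', 'by'), ())
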
Example No. 11
File: cdf.py Project: nheist/Cat2Ax
        best_df, score = max(candidate_dfs.items(),
                             key=operator.itemgetter(1),
                             default=(None, 0))
        if score > alpha:
            cat_dfs[cat] = (best_df, score)
    return cat_dfs


# create an index of resources and properties that maps string URIs to integers in order to speed up indexing and reduce complexity
resource_features = {
    res: {(k, v)
          for k, values in props.items() for v in values}
    for res, props in dbp_store.get_resource_property_mapping().items()
}
for res in resource_features:
    for t in dbp_store.get_types(res):
        resource_features[res].add((rdf_util.PREDICATE_TYPE, t))

resource_to_idx_dict = {res: i for i, res in enumerate(resource_features)}

feature_to_idx_dict = defaultdict(set)
for res, feats in resource_features.items():
    res_idx = resource_to_idx_dict[res]
    for f in feats:
        feature_to_idx_dict[f].add(res_idx)


def _get_overall_features_count(feats: tuple, cat: str = None) -> int:
Example No. 12
def _compute_entity_label(resource_uri: str) -> str:
    for t in dbp_store.get_types(resource_uri):
        if t in TYPE_LABEL_MAPPING:
            return TYPE_LABEL_MAPPING[t]
    return LABEL_OTHER
Example No. 13
 def rejects_resource(self, dbp_resource: str) -> bool:
     return self.value in {dt for t in dbp_store.get_types(dbp_resource) for dt in dbp_heur.get_disjoint_types(t)}