Esempio n. 1
0
def get_label_category(label: str) -> str:
    """Return the category that fits the given label best.

    The lookup table is built on first use by inverting the category -> label
    mapping read from the SKOS category file (predicate
    `rdf_util.PREDICATE_PREFLABEL`) and is kept in a module-level global.
    Labels without an entry are converted with `cat_util.name2category`.

    :param label: the label to find a category for.
    :return: the cached category for `label`, or the converted label.
    """
    global __INVERSE_CATEGORY_LABELS__
    if '__INVERSE_CATEGORY_LABELS__' not in globals():
        skos_files = [utils.get_data_file('files.dbpedia.category_skos')]
        category_labels = rdf_util.create_single_val_dict_from_rdf(skos_files, rdf_util.PREDICATE_PREFLABEL)
        # Invert category -> label; if several categories share a label, the last one read wins.
        __INVERSE_CATEGORY_LABELS__ = {cat_label: cat for cat, cat_label in category_labels.items()}

    if label in __INVERSE_CATEGORY_LABELS__:
        return __INVERSE_CATEGORY_LABELS__[label]
    return cat_util.name2category(label)
Esempio n. 2
0
def get_label(dbp_object: str) -> str:
    """Return the label of a DBpedia resource or type.

    Labels are served from a module-level cache that is built on first use
    from the mapping returned by `_get_label_mapping()`, updated with the
    labels read from the taxonomy file (taxonomy entries win on key clashes).
    Objects without a cached label fall back to `dbp_util.object2name`.

    :param dbp_object: the DBpedia object to look up.
    :return: the cached label, or the name derived by `dbp_util.object2name`.
    """
    global __RESOURCE_LABELS__
    if '__RESOURCE_LABELS__' not in globals():
        # Assemble the full mapping in a local and publish it in one step: if
        # reading the taxonomy file fails, no half-filled cache is left behind
        # that later calls would silently treat as complete.
        labels = dict(_get_label_mapping())
        labels.update(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL))
        __RESOURCE_LABELS__ = labels

    if dbp_object in __RESOURCE_LABELS__:
        return __RESOURCE_LABELS__[dbp_object]
    return dbp_util.object2name(dbp_object)
Esempio n. 3
0
def get_label(category: str) -> str:
    """Return the label for the given category.

    The category -> label mapping is read once from the SKOS category file
    (predicate `rdf_util.PREDICATE_PREFLABEL`) and kept in a module-level
    global. Categories without an entry are converted with
    `cat_util.category2name`.
    """
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        skos_files = [utils.get_data_file('files.dbpedia.category_skos')]
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf(skos_files, rdf_util.PREDICATE_PREFLABEL)

    if category not in __CATEGORY_LABELS__:
        return cat_util.category2name(category)
    return __CATEGORY_LABELS__[category]
Esempio n. 4
0
def _create_category_graph() -> nx.DiGraph:
    """Build the directed category graph and strip invalid categories from it.

    Edges are collected from the SKOS category file (predicate
    `rdf_util.PREDICATE_BROADER`) and from `wikipedia.extract_parent_categories()`;
    every edge points from a parent to one of its children and self-loops are
    dropped. All categories typed in the SKOS file are added as nodes.

    A category is then removed when it is a direct successor of one of the
    listed maintenance parent categories, when its name ends with an ignored
    ending or contains a maintenance indicator token, or when it is not among
    the SKOS nodes. The configured root category is never removed.
    """
    skos_nodes = set(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_TYPE))
    broader_categories = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_BROADER)

    # Collect (parent, child) edges: SKOS edges first, then the Wikipedia ones.
    edges = []
    for child, parents in broader_categories.items():
        edges.extend((parent, child) for parent in parents if parent != child)
    for child, parents in wikipedia.extract_parent_categories().items():
        edges.extend((parent, child) for parent in parents if parent != child)
    graph = nx.DiGraph(incoming_graph_data=edges)
    graph.add_nodes_from(skos_nodes)

    # Direct children of these parents are treated as maintenance categories.
    maintenance_parent_names = (
        'Hidden categories', 'Tracking categories', 'Disambiguation categories', 'Non-empty disambiguation categories',
        'All redirect categories', 'Wikipedia soft redirected categories', 'Category redirects with possibilities',
        'Wikipedia non-empty soft redirected categories',
    )
    invalid_categories = set()
    for parent_name in maintenance_parent_names:
        invalid_categories.update(graph.successors(cat_util.name2category(parent_name)))

    # Remaining invalid categories are recognised by indicator tokens in their name.
    ignored_category_endings = ('files', 'images', 'lists', 'articles', 'stubs', 'pages', 'categories')
    maintenance_category_indicators = {
        'wikipedia', 'wikipedians', 'wikimedia', 'wikiproject', 'redirects',
        'mediawiki', 'template', 'templates', 'user', 'portal', 'navigational'
    }

    def _has_maintenance_name(cat) -> bool:
        """Tell whether the name of `cat` matches an ignored ending or an indicator token."""
        name_tokens = {token.lower() for token in cat_util.remove_category_prefix(cat).split('_')}
        if cat.lower().endswith(ignored_category_endings):
            return True
        return not name_tokens.isdisjoint(maintenance_category_indicators)

    invalid_categories.update(cat for cat in graph if _has_maintenance_name(cat))
    # only keep categories mentioned in skos
    invalid_categories.update(node for node in graph.nodes if node not in skos_nodes)
    # make sure to keep root node
    invalid_categories.discard(utils.get_config('category.root_category'))
    graph.remove_nodes_from(invalid_categories)
    return graph
Esempio n. 5
0
def get_maintenance_categories() -> set:
    """Return the set of maintenance categories.

    On first use the set is built from the keys of the mapping that
    `rdf_util.create_single_val_dict_from_rdf` produces for the
    maintenance-categories data file (predicate `rdf_util.PREDICATE_TYPE`);
    afterwards the cached module-level set is returned.
    """
    global __MAINTENANCE_CATS__
    if '__MAINTENANCE_CATS__' in globals():
        return __MAINTENANCE_CATS__

    source_files = [util.get_data_file('files.dbpedia.maintenance_categories')]
    typed_categories = rdf_util.create_single_val_dict_from_rdf(source_files, rdf_util.PREDICATE_TYPE)
    __MAINTENANCE_CATS__ = set(typed_categories)
    return __MAINTENANCE_CATS__
Esempio n. 6
0
def get_range(dbp_predicate: str) -> Optional[str]:
    """Return the range recorded for `dbp_predicate`, or None if there is none.

    The predicate -> range mapping is read once from the taxonomy file
    (predicate `rdf_util.PREDICATE_RANGE`) and kept in a module-level global.
    """
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        taxonomy_files = [util.get_data_file('files.dbpedia.taxonomy')]
        __PREDICATE_RANGE__ = rdf_util.create_single_val_dict_from_rdf(taxonomy_files, rdf_util.PREDICATE_RANGE)

    if dbp_predicate in __PREDICATE_RANGE__:
        return __PREDICATE_RANGE__[dbp_predicate]
    return None
Esempio n. 7
0
def get_label(category: str) -> str:
    """Return the label for the given category.

    The category -> label mapping is read once from the categories data file
    (predicate `rdf_util.PREDICATE_SKOS_LABEL`) and kept in a module-level
    global. Categories without an entry are converted with
    `cat_util.category2name`.
    """
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        category_files = [util.get_data_file('files.dbpedia.categories')]
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf(category_files, rdf_util.PREDICATE_SKOS_LABEL)

    if category not in __CATEGORY_LABELS__:
        return cat_util.category2name(category)
    return __CATEGORY_LABELS__[category]
Esempio n. 8
0
def _get_label_mapping() -> dict:
    """Return the resource label mapping, creating it on first use.

    The mapping is obtained through `util.load_or_create_cache` under the
    name 'dbpedia_resource_labels'; the initializer handed to it reads the
    labels data file with predicate `rdf_util.PREDICATE_LABEL`. The result is
    kept in a module-level global for subsequent calls.
    """
    global __RESOURCE_LABEL_MAPPING__
    if '__RESOURCE_LABEL_MAPPING__' in globals():
        return __RESOURCE_LABEL_MAPPING__

    def _read_labels():
        # NOTE(review): presumably only invoked by load_or_create_cache when no stored cache exists - confirm.
        label_files = [util.get_data_file('files.dbpedia.labels')]
        return rdf_util.create_single_val_dict_from_rdf(label_files, rdf_util.PREDICATE_LABEL)

    __RESOURCE_LABEL_MAPPING__ = util.load_or_create_cache('dbpedia_resource_labels', _read_labels)
    return __RESOURCE_LABEL_MAPPING__
Esempio n. 9
0
def get_categories() -> set:
    """Return the set of categories, creating it on first use.

    The set is obtained through `util.load_or_create_cache` under the name
    'dbpedia_categories'; the initializer handed to it collects the keys of
    the mapping read from the categories data file with predicate
    `rdf_util.PREDICATE_TYPE`. The result is kept in a module-level global.
    """
    global __CATEGORIES__
    if '__CATEGORIES__' in globals():
        return __CATEGORIES__

    def _read_categories():
        category_files = [util.get_data_file('files.dbpedia.categories')]
        return set(rdf_util.create_single_val_dict_from_rdf(category_files, rdf_util.PREDICATE_TYPE))

    __CATEGORIES__ = util.load_or_create_cache('dbpedia_categories', _read_categories)
    return __CATEGORIES__
Esempio n. 10
0
def resolve_redirect(dbp_resource: str, visited=None) -> str:
    """Return the resource to which `dbp_resource` redirects (if any) or `dbp_resource` itself.

    Redirects are followed transitively. The walk stops at the first resource
    that has no redirect, or - if the redirects form a cycle - at the first
    resource that has been encountered before.

    :param dbp_resource: the resource whose redirect target is wanted.
    :param visited: optional set of resources that count as already followed;
        it is not modified.
    :return: the final redirect target, or `dbp_resource` if it has no redirect.
    """
    global __REDIRECTS__
    if '__REDIRECTS__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.redirects')], rdf_util.PREDICATE_REDIRECTS)
        __REDIRECTS__ = utils.load_or_create_cache('dbpedia_resource_redirects', initializer)

    # Walk the chain iteratively: recursing once per hop raises RecursionError
    # on long chains and copies the visited set at every step.
    seen = set(visited) if visited else set()
    current = dbp_resource
    while current in __REDIRECTS__ and current not in seen:
        seen.add(current)
        current = __REDIRECTS__[current]
    return current
Esempio n. 11
0
def get_object_for_label(label: str) -> str:
    """Return the object that fits the given label.

    Two inverse (label -> object) tables are built on first use and kept in
    module-level globals: one from the mapping returned by
    `_get_label_mapping()` and one from the labels in the taxonomy file.
    The taxonomy table is consulted first, then the resource table; if
    neither knows the label it is converted with `dbp_util.name2resource`.

    :param label: the label to find an object for.
    :return: the matching object, or the converted label.
    """
    global __RESOURCE_INVERSE_LABELS__
    global __ONTOLOGY_INVERSE_LABELS__
    module_globals = globals()
    if '__RESOURCE_INVERSE_LABELS__' not in module_globals or '__ONTOLOGY_INVERSE_LABELS__' not in module_globals:
        # Build both tables in locals and publish them together, so a failure
        # while reading the taxonomy cannot leave only one global defined
        # (which would make every later call fail with a NameError).
        resource_inverse = {v: k for k, v in _get_label_mapping().items()}
        ontology_labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL)
        ontology_inverse = {v: k for k, v in ontology_labels.items()}
        __RESOURCE_INVERSE_LABELS__, __ONTOLOGY_INVERSE_LABELS__ = resource_inverse, ontology_inverse

    # Taxonomy labels take precedence over resource labels.
    if label in __ONTOLOGY_INVERSE_LABELS__:
        return __ONTOLOGY_INVERSE_LABELS__[label]
    if label in __RESOURCE_INVERSE_LABELS__:
        return __RESOURCE_INVERSE_LABELS__[label]
    return dbp_util.name2resource(label)
Esempio n. 12
0
def get_range(dbp_predicate: str) -> Optional[str]:
    """Return the range of a given predicate.

    The predicate -> range mapping is read once from the taxonomy file
    (predicate `rdf_util.PREDICATE_RANGE`) and kept in a module-level global.

    :param dbp_predicate: the predicate to look up.
    :return: the recorded range, or None if the predicate has none.
    """
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        __PREDICATE_RANGE__ = defaultdict(lambda: None, rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_RANGE))
    # Read with .get(): indexing a defaultdict stores a None entry for every
    # unknown predicate, so the cache would grow with each miss.
    return __PREDICATE_RANGE__.get(dbp_predicate)