def get_label_category(label: str) -> str:
    """Return the category that fits the given label best.

    The lookup table is the inverse of the preferred-label mapping read from
    the 'files.dbpedia.category_skos' data file. It is built on first use and
    kept in a module-level global. If several categories share a label, the
    one visited last while inverting the mapping wins. Labels without an
    entry are converted with `cat_util.name2category`.
    """
    global __INVERSE_CATEGORY_LABELS__
    if '__INVERSE_CATEGORY_LABELS__' not in globals():
        skos_file = utils.get_data_file('files.dbpedia.category_skos')
        category_to_label = rdf_util.create_single_val_dict_from_rdf([skos_file], rdf_util.PREDICATE_PREFLABEL)
        __INVERSE_CATEGORY_LABELS__ = {lbl: cat for cat, lbl in category_to_label.items()}

    if label in __INVERSE_CATEGORY_LABELS__:
        return __INVERSE_CATEGORY_LABELS__[label]
    return cat_util.name2category(label)
def get_label(dbp_object: str) -> str:
    """Return the label of a DBpedia resource or type.

    Labels come from two sources that are merged on first use and cached in a
    module-level global: the resource label mapping (`_get_label_mapping`) and
    the labels read from the 'files.dbpedia.taxonomy' data file. On conflicts
    the taxonomy label wins because it is merged last. Objects without a
    label fall back to `dbp_util.object2name`.
    """
    global __RESOURCE_LABELS__
    if '__RESOURCE_LABELS__' not in globals():
        # Assemble the mapping locally and publish it only once it is
        # complete. Otherwise a failure while loading the taxonomy labels
        # would leave a half-initialised cache behind that later calls would
        # silently use.
        labels = dict(_get_label_mapping())
        taxonomy_file = utils.get_data_file('files.dbpedia.taxonomy')
        labels.update(rdf_util.create_single_val_dict_from_rdf([taxonomy_file], rdf_util.PREDICATE_LABEL))
        __RESOURCE_LABELS__ = labels

    if dbp_object in __RESOURCE_LABELS__:
        return __RESOURCE_LABELS__[dbp_object]
    return dbp_util.object2name(dbp_object)
def get_label(category: str) -> str:
    """Return the label for the given category.

    Preferred labels are read from the 'files.dbpedia.category_skos' data
    file on first use and cached in a module-level global. Categories without
    a label are converted with `cat_util.category2name`.
    """
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        skos_file = utils.get_data_file('files.dbpedia.category_skos')
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf([skos_file], rdf_util.PREDICATE_PREFLABEL)

    try:
        return __CATEGORY_LABELS__[category]
    except KeyError:
        return cat_util.category2name(category)
def _create_category_graph() -> nx.DiGraph:
    """Build the directed category graph with unwanted categories removed.

    Edges run from parent category to child category. They are collected from
    the broader-relations in the 'files.dbpedia.category_skos' data file and
    from `wikipedia.extract_parent_categories()`; self-loops are dropped.

    A category is removed from the graph if any of the following holds:
      * it is a direct child of one of the hard-coded maintenance parents,
      * its lower-cased name ends with one of the ignored endings or one of
        its underscore-separated tokens is a maintenance indicator,
      * it is not typed in the skos file.
    The configured root category ('category.root_category') is always kept.
    """
    skos_file = utils.get_data_file('files.dbpedia.category_skos')
    skos_nodes = set(rdf_util.create_single_val_dict_from_rdf([skos_file], rdf_util.PREDICATE_TYPE))
    skos_parents = rdf_util.create_multi_val_dict_from_rdf([skos_file], rdf_util.PREDICATE_BROADER)
    skos_edges = [(p, c) for c, parents in skos_parents.items() for p in parents if p != c]
    wiki_category_edges = [(p, c) for c, ps in wikipedia.extract_parent_categories().items() for p in ps if p != c]

    graph = nx.DiGraph(incoming_graph_data=skos_edges + wiki_category_edges)
    graph.add_nodes_from(skos_nodes)

    # identify maintenance categories
    invalid_parent_categories = [
        'Hidden categories', 'Tracking categories', 'Disambiguation categories',
        'Non-empty disambiguation categories', 'All redirect categories',
        'Wikipedia soft redirected categories', 'Category redirects with possibilities',
        'Wikipedia non-empty soft redirected categories',
    ]
    invalid_categories = set()
    for parent_name in invalid_parent_categories:
        parent = cat_util.name2category(parent_name)
        # graph.successors() raises for unknown nodes; a maintenance parent
        # that is absent from the loaded data simply has no children to drop.
        if parent in graph:
            invalid_categories.update(graph.successors(parent))

    # identify any remaining invalid categories (maintenance categories etc) using indicator tokens
    ignored_category_endings = ('files', 'images', 'lists', 'articles', 'stubs', 'pages', 'categories')
    maintenance_category_indicators = {
        'wikipedia', 'wikipedians', 'wikimedia', 'wikiproject', 'redirects',
        'mediawiki', 'template', 'templates', 'user', 'portal', 'navigational',
    }
    for cat in graph:
        cat_tokens = {t.lower() for t in cat_util.remove_category_prefix(cat).split('_')}
        if cat.lower().endswith(ignored_category_endings) or cat_tokens.intersection(maintenance_category_indicators):
            invalid_categories.add(cat)

    invalid_categories.update(set(graph.nodes).difference(skos_nodes))  # only keep categories mentioned in skos
    invalid_categories.discard(utils.get_config('category.root_category'))  # make sure to keep root node
    graph.remove_nodes_from(invalid_categories)
    return graph
def get_maintenance_categories() -> set:
    """Return the set of maintenance categories.

    The set consists of the keys of the mapping that
    `rdf_util.create_single_val_dict_from_rdf` builds for `PREDICATE_TYPE`
    from the 'files.dbpedia.maintenance_categories' data file. It is computed
    on first use and cached in a module-level global.
    """
    global __MAINTENANCE_CATS__
    if '__MAINTENANCE_CATS__' in globals():
        return __MAINTENANCE_CATS__

    source_file = util.get_data_file('files.dbpedia.maintenance_categories')
    typed_categories = rdf_util.create_single_val_dict_from_rdf([source_file], rdf_util.PREDICATE_TYPE)
    __MAINTENANCE_CATS__ = set(typed_categories)
    return __MAINTENANCE_CATS__
def get_range(dbp_predicate: str) -> Optional[str]:
    """Return the range of the given predicate, or None if it has none.

    Ranges are read from the 'files.dbpedia.taxonomy' data file on first use
    and cached in a module-level global.
    """
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        taxonomy_file = util.get_data_file('files.dbpedia.taxonomy')
        __PREDICATE_RANGE__ = rdf_util.create_single_val_dict_from_rdf([taxonomy_file], rdf_util.PREDICATE_RANGE)

    return __PREDICATE_RANGE__.get(dbp_predicate)
def get_label(category: str) -> str:
    """Return the label for the given category.

    Labels are read from the 'files.dbpedia.categories' data file on first
    use and cached in a module-level global. Categories without a label are
    converted with `cat_util.category2name`.
    """
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        categories_file = util.get_data_file('files.dbpedia.categories')
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf([categories_file], rdf_util.PREDICATE_SKOS_LABEL)

    if category in __CATEGORY_LABELS__:
        return __CATEGORY_LABELS__[category]
    return cat_util.category2name(category)
def _get_label_mapping() -> dict: global __RESOURCE_LABEL_MAPPING__ if '__RESOURCE_LABEL_MAPPING__' not in globals(): initializer = lambda: rdf_util.create_single_val_dict_from_rdf([ util.get_data_file('files.dbpedia.labels') ], rdf_util.PREDICATE_LABEL) __RESOURCE_LABEL_MAPPING__ = util.load_or_create_cache( 'dbpedia_resource_labels', initializer) return __RESOURCE_LABEL_MAPPING__
def get_categories() -> set:
    """Return the set of all categories.

    The set is obtained through `util.load_or_create_cache` under the key
    'dbpedia_categories'. The initializer handed to it collects the keys of
    the `PREDICATE_TYPE` mapping read from the 'files.dbpedia.categories'
    data file. The result is additionally kept in a module-level global.
    """
    global __CATEGORIES__
    if '__CATEGORIES__' in globals():
        return __CATEGORIES__

    def _load_categories():
        categories_file = util.get_data_file('files.dbpedia.categories')
        return set(rdf_util.create_single_val_dict_from_rdf([categories_file], rdf_util.PREDICATE_TYPE))

    __CATEGORIES__ = util.load_or_create_cache('dbpedia_categories', _load_categories)
    return __CATEGORIES__
def resolve_redirect(dbp_resource: str, visited=None) -> str:
    """Return the resource to which `dbp_resource` redirects (if any) or `dbp_resource` itself.

    Redirects are followed transitively. `visited` holds the resources that
    were already followed; when a resource shows up a second time (a redirect
    cycle) the walk stops and that resource is returned. The redirect table
    is obtained through `utils.load_or_create_cache` on first use and kept in
    a module-level global.
    """
    global __REDIRECTS__
    if '__REDIRECTS__' not in globals():
        def _load_redirects():
            redirects_file = utils.get_data_file('files.dbpedia.redirects')
            return rdf_util.create_single_val_dict_from_rdf([redirects_file], rdf_util.PREDICATE_REDIRECTS)

        __REDIRECTS__ = utils.load_or_create_cache('dbpedia_resource_redirects', _load_redirects)

    current = dbp_resource
    seen = visited
    while current in __REDIRECTS__:
        seen = seen or set()
        if current in seen:
            break  # redirect cycle
        # Build a new set rather than mutating one passed in by the caller.
        seen = seen | {current}
        current = __REDIRECTS__[current]
    return current
def get_object_for_label(label: str) -> str:
    """Return the object that fits the given label.

    Two inverse label tables are built on first use and cached in
    module-level globals: one from the labels in the 'files.dbpedia.taxonomy'
    data file (ontology objects) and one from `_get_label_mapping()`
    (resources). Ontology matches take precedence over resource matches. If
    neither table contains the label, it is converted with
    `dbp_util.name2resource`.
    """
    global __RESOURCE_INVERSE_LABELS__
    global __ONTOLOGY_INVERSE_LABELS__
    if '__RESOURCE_INVERSE_LABELS__' not in globals():
        # Build both tables before publishing either one, and assign the
        # guard global last. If a load fails part-way, the next call retries
        # instead of hitting a NameError on the missing ontology table.
        resource_inverse = {v: k for k, v in _get_label_mapping().items()}
        taxonomy_file = utils.get_data_file('files.dbpedia.taxonomy')
        ontology_labels = rdf_util.create_single_val_dict_from_rdf([taxonomy_file], rdf_util.PREDICATE_LABEL)
        __ONTOLOGY_INVERSE_LABELS__ = {v: k for k, v in ontology_labels.items()}
        __RESOURCE_INVERSE_LABELS__ = resource_inverse

    if label in __ONTOLOGY_INVERSE_LABELS__:
        return __ONTOLOGY_INVERSE_LABELS__[label]
    if label in __RESOURCE_INVERSE_LABELS__:
        return __RESOURCE_INVERSE_LABELS__[label]
    return dbp_util.name2resource(label)
def get_range(dbp_predicate: str) -> Optional[str]:
    """Return the range of a given predicate, or None if it has none.

    Ranges are read from the 'files.dbpedia.taxonomy' data file on first use
    and cached in a module-level global.
    """
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        taxonomy_file = utils.get_data_file('files.dbpedia.taxonomy')
        __PREDICATE_RANGE__ = rdf_util.create_single_val_dict_from_rdf([taxonomy_file], rdf_util.PREDICATE_RANGE)

    # Use .get() rather than indexing a defaultdict: indexing would insert a
    # None entry for every unknown predicate and grow the cache on each miss.
    return __PREDICATE_RANGE__.get(dbp_predicate)