import bz2
from collections import defaultdict
from typing import Iterator, Optional, Tuple

import networkx as nx
import pynif

# `util`, `rdf_util`, `dbp_util`, and `cat_util` are project-internal helper modules;
# their import paths depend on the package layout and are therefore not shown here.


def get_resource_property_mapping() -> dict:
    global __RESOURCE_PROPERTY_MAPPING__
    if '__RESOURCE_PROPERTY_MAPPING__' not in globals():
        property_files = [
            util.get_data_file('files.dbpedia.mappingbased_literals'),
            util.get_data_file('files.dbpedia.mappingbased_objects')
        ]
        initializer = lambda: rdf_util.create_dict_from_rdf(property_files)
        __RESOURCE_PROPERTY_MAPPING__ = util.load_or_create_cache('dbpedia_resource_properties', initializer)
    return __RESOURCE_PROPERTY_MAPPING__

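# All getters below follow the same lazy-initialisation pattern as above: the mapping
# is built on first access, persisted via `util.load_or_create_cache` where building
# it is expensive, and afterwards served from a module-level global.
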
def _get_resource_type_mapping() -> dict:
    global __RESOURCE_TYPE_MAPPING__
    if '__RESOURCE_TYPE_MAPPING__' not in globals():
        type_files = [
            util.get_data_file('files.dbpedia.instance_types'),
            util.get_data_file('files.dbpedia.transitive_instance_types'),
        ]
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(type_files, rdf_util.PREDICATE_TYPE)
        __RESOURCE_TYPE_MAPPING__ = util.load_or_create_cache('dbpedia_resource_type_mapping', initializer)
    return __RESOURCE_TYPE_MAPPING__

def get_disjoint_types(dbp_type: str) -> set:
    """Return all types that are disjoint with `dbp_type` (excluding the incorrect Agent<->Place disjointness axiom)."""
    global __DISJOINT_TYPE_MAPPING__
    if '__DISJOINT_TYPE_MAPPING__' not in globals():
        __DISJOINT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_DISJOINT_WITH, reflexive=True)
        # remove and add custom disjointness axioms
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {
            k: {v for v in values if {k, v} not in REMOVED_DISJOINTNESS_AXIOMS}
            for k, values in __DISJOINT_TYPE_MAPPING__.items()
        })
        for a, b in ADDED_DISJOINTNESS_AXIOMS:
            __DISJOINT_TYPE_MAPPING__[a].add(b)
            __DISJOINT_TYPE_MAPPING__[b].add(a)
        # complete the disjoint types of every type with the transitive subtypes of its disjoint types
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {
            t: {st for dt in disjoint_types for st in get_transitive_subtypes(dt)}
            for t, disjoint_types in __DISJOINT_TYPE_MAPPING__.items()
        })
    return __DISJOINT_TYPE_MAPPING__[dbp_type]

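# Rationale for the completion step above: if type A is declared disjoint with type B,
# then every transitive subtype of B is also disjoint with A, since each of its
# instances is an instance of B and can therefore not be an instance of A.
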
def _save_to_file(netgen_output, problem_num, density):
    with open(util.get_data_file('%s_%d.txt' % (density, problem_num)), 'w') as f:
        nl = ''
        for line in netgen_output.split('\n'):
            # skip blank lines and DIMACS comment lines (prefixed with 'c')
            if not line or line.startswith('c'):
                continue
            # prepend the newline from the second written line on, so the file has no trailing newline
            f.write('%s%s' % (nl, line))
            nl = '\n'

def get_maintenance_categories() -> set:
    global __MAINTENANCE_CATS__
    if '__MAINTENANCE_CATS__' not in globals():
        __MAINTENANCE_CATS__ = set(rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.maintenance_categories')], rdf_util.PREDICATE_TYPE))
    return __MAINTENANCE_CATS__

def get_equivalent_types(dbp_type: str) -> set:
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_CLASS, reflexive=True)
    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]

def get_range(dbp_predicate: str) -> Optional[str]:
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        __PREDICATE_RANGE__ = rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_RANGE)
    return __PREDICATE_RANGE__.get(dbp_predicate)

def get_label(category: str) -> str:
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.categories')], rdf_util.PREDICATE_SKOS_LABEL)
    # fall back to a label derived from the category IRI if no explicit label exists
    return __CATEGORY_LABELS__[category] if category in __CATEGORY_LABELS__ else cat_util.category2name(category)

def _get_label_mapping() -> dict:
    global __RESOURCE_LABEL_MAPPING__
    if '__RESOURCE_LABEL_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.labels')], rdf_util.PREDICATE_LABEL)
        __RESOURCE_LABEL_MAPPING__ = util.load_or_create_cache('dbpedia_resource_labels', initializer)
    return __RESOURCE_LABEL_MAPPING__

def get_children(category: str) -> set:
    global __CHILDREN__
    if '__CHILDREN__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.categories')], rdf_util.PREDICATE_BROADER, reverse_key=True)
        __CHILDREN__ = util.load_or_create_cache('dbpedia_category_children', initializer)
    # a category must not be its own child
    return __CHILDREN__[category].difference({category})

def get_resources(category: str) -> set:
    global __CATEGORY_RESOURCES__
    if '__CATEGORY_RESOURCES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.article_categories')], rdf_util.PREDICATE_SUBJECT, reverse_key=True)
        __CATEGORY_RESOURCES__ = util.load_or_create_cache('dbpedia_category_resources', initializer)
    return __CATEGORY_RESOURCES__[category]

def get_label(dbp_object: str) -> str:
    global __RESOURCE_LABELS__
    if '__RESOURCE_LABELS__' not in globals():
        __RESOURCE_LABELS__ = dict(_get_label_mapping())
        __RESOURCE_LABELS__.update(rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL))
    # fall back to a label derived from the object IRI if no explicit label exists
    return __RESOURCE_LABELS__[dbp_object] if dbp_object in __RESOURCE_LABELS__ else dbp_util.object2name(dbp_object)

def get_categories() -> set:
    global __CATEGORIES__
    if '__CATEGORIES__' not in globals():
        initializer = lambda: set(rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.categories')], rdf_util.PREDICATE_TYPE))
        __CATEGORIES__ = util.load_or_create_cache('dbpedia_categories', initializer)
    return __CATEGORIES__

def _get_type_graph() -> nx.DiGraph:
    """Return the initialised graph of DBpedia types."""
    global __TYPE_GRAPH__
    if '__TYPE_GRAPH__' not in globals():
        subtype_mapping = rdf_util.create_multi_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_SUBCLASS_OF, reverse_key=True)
        # add missing types (i.e. those that do not have any subtypes)
        all_types = rdf_util.create_set_from_rdf(
            [util.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_TYPE, rdf_util.CLASS_OWL_CLASS)
        subtype_mapping.update({et: set() for t in all_types for et in get_equivalent_types(t) if et not in subtype_mapping})
        # complete the subtypes of every type with the subtypes of its equivalent types
        subtype_mapping = {
            t: {est for et in get_equivalent_types(t) for st in subtype_mapping[et] for est in get_equivalent_types(st)}
            for t in set(subtype_mapping)
        }
        # remove non-DBpedia types from the ontology
        subtype_mapping = {
            t: {st for st in sts if dbp_util.is_dbp_type(st) or st == rdf_util.CLASS_OWL_THING}
            for t, sts in subtype_mapping.items() if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING
        }
        # edges point from a type to each of its direct subtypes
        __TYPE_GRAPH__ = nx.DiGraph(incoming_graph_data=[(t, st) for t, sts in subtype_mapping.items() for st in sts])
    return __TYPE_GRAPH__

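# `get_transitive_subtypes` is used in `get_disjoint_types` but not defined in this
# section. A minimal sketch of how it could be implemented on top of `_get_type_graph()`
# (an assumption, not necessarily the actual implementation): since edges point from a
# type to its direct subtypes, the transitive subtypes of a type are its descendants.
def _get_transitive_subtypes_sketch(dbp_type: str) -> set:
    # reflexive: a type counts among its own transitive subtypes, which the completion
    # step in `get_disjoint_types` relies on to keep the directly declared disjoint types
    type_graph = _get_type_graph()
    if not type_graph.has_node(dbp_type):
        return {dbp_type}
    return {dbp_type} | nx.descendants(type_graph, dbp_type)
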
def _retrieve_plaintexts() -> Iterator[Tuple[str, str]]:
    """Return an iterator over DBpedia resources and their Wikipedia plaintexts."""
    with bz2.open(util.get_data_file('files.dbpedia.nif_context'), mode='rb') as nif_file:
        nif_collection = pynif.NIFCollection.loads(nif_file.read(), format='turtle')
        for nif_context in nif_collection.contexts:
            # strip the query string from the context URI to obtain the plain resource URI
            resource_uri = nif_context.original_uri[:nif_context.original_uri.rfind('?')]
            # remove parenthesized content and line breaks from the text for easier parsing
            resource_plaintext = _remove_parentheses_content(nif_context.mention.replace('\n', ' '))
            yield resource_uri, resource_plaintext

def resolve_redirect(dbp_resource: str, visited=None) -> str:
    """Return the resource to which `dbp_resource` redirects (if any) or `dbp_resource` itself."""
    global __REDIRECTS__
    if '__REDIRECTS__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf(
            [util.get_data_file('files.dbpedia.redirects')], rdf_util.PREDICATE_REDIRECTS)
        __REDIRECTS__ = util.load_or_create_cache('dbpedia_resource_redirects', initializer)
    if dbp_resource in __REDIRECTS__:
        visited = visited or set()
        if dbp_resource not in visited:
            # follow the redirect chain, tracking visited resources to break cycles
            return resolve_redirect(__REDIRECTS__[dbp_resource], visited | {dbp_resource})
    return dbp_resource

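# Usage sketch for `resolve_redirect` (the resource IRIs are illustrative assumptions):
#   resolve_redirect('http://dbpedia.org/resource/NYC')
#   -> 'http://dbpedia.org/resource/New_York_City'
# If the redirects data contains a cycle, the recursion stops as soon as a resource is
# seen a second time and returns the current resource instead of looping forever.
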
def _compute_inverse_lexicalisations():
    inverse_lexicalisation_dict = rdf_util.create_multi_val_freq_dict_from_rdf(
        [util.get_data_file('files.dbpedia.anchor_texts')], rdf_util.PREDICATE_ANCHOR_TEXT, reverse_key=True)
    # merge the frequencies of redirecting resources into their redirect targets
    for lex, resources in inverse_lexicalisation_dict.items():
        for res in set(resources.keys()):
            redirect_res = resolve_redirect(res)
            if res != redirect_res:
                if redirect_res in inverse_lexicalisation_dict[lex]:
                    inverse_lexicalisation_dict[lex][redirect_res] += inverse_lexicalisation_dict[lex][res]
                else:
                    inverse_lexicalisation_dict[lex][redirect_res] = inverse_lexicalisation_dict[lex][res]
                del inverse_lexicalisation_dict[lex][res]
    return inverse_lexicalisation_dict

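# Shape of the returned structure (the entries are illustrative assumptions): a mapping
# from an anchor text to the resources it links to, together with link frequencies, e.g.
#   {'einstein': {'http://dbpedia.org/resource/Albert_Einstein': 42}}
# The merge above ensures the counts of a redirect source are attributed to its target.
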
def _run_maxflow(self, maxflow_func, data_file):
    flow_network = DIMACSGraphFactory.create(util.get_data_file(data_file))
    return self.measure_execution_time(maxflow_func, flow_network)