def _create_category_graph() -> nx.DiGraph:
    skos_nodes = set(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_TYPE))
    skos_edges = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_BROADER)
    skos_edges = [(p, c) for c, parents in skos_edges.items() for p in parents if p != c]
    wiki_category_edges = [(p, c) for c, ps in wikipedia.extract_parent_categories().items() for p in ps if p != c]
    graph = nx.DiGraph(incoming_graph_data=skos_edges + wiki_category_edges)
    graph.add_nodes_from(skos_nodes)

    # identify maintenance categories
    invalid_parent_categories = [
        'Hidden categories', 'Tracking categories', 'Disambiguation categories',
        'Non-empty disambiguation categories', 'All redirect categories',
        'Wikipedia soft redirected categories', 'Category redirects with possibilities',
        'Wikipedia non-empty soft redirected categories'
    ]
    invalid_categories = {c for ipc in invalid_parent_categories for c in graph.successors(cat_util.name2category(ipc))}

    # identify any remaining invalid categories (maintenance categories etc.) using indicator tokens
    ignored_category_endings = ('files', 'images', 'lists', 'articles', 'stubs', 'pages', 'categories')
    maintenance_category_indicators = {
        'wikipedia', 'wikipedians', 'wikimedia', 'wikiproject', 'redirects',
        'mediawiki', 'template', 'templates', 'user', 'portal', 'navigational'
    }
    for cat in graph:
        cat_tokens = {t.lower() for t in cat_util.remove_category_prefix(cat).split('_')}
        if cat.lower().endswith(ignored_category_endings) or cat_tokens.intersection(maintenance_category_indicators):
            invalid_categories.add(cat)
    invalid_categories.update(set(graph.nodes).difference(skos_nodes))  # only keep categories mentioned in skos
    invalid_categories.discard(utils.get_config('category.root_category'))  # make sure to keep the root node
    graph.remove_nodes_from(invalid_categories)
    return graph
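
# Usage sketch (hypothetical category name; assumes the SKOS and Wikipedia dumps
# are available): edges run from parent category to child category, so the
# children of a node can be read off via `graph.successors`, e.g.
#
#   graph = _create_category_graph()
#   children = set(graph.successors(cat_util.name2category('Science')))
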
def _get_type_graph() -> nx.DiGraph:
    """Return the initialised graph of DBpedia types."""
    global __TYPE_GRAPH__
    if '__TYPE_GRAPH__' not in globals():
        subtype_mapping = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_SUBCLASS_OF, reverse_key=True)
        # add missing types (i.e. those that have no subclasses at all)
        all_types = rdf_util.create_set_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_TYPE, rdf_util.CLASS_OWL_CLASS)
        subtype_mapping.update({et: set() for t in all_types for et in get_equivalent_types(t) if et not in subtype_mapping})
        # complete the subtypes of a type with the subtypes of its equivalent types
        subtype_mapping = {t: {est for et in get_equivalent_types(t) for st in subtype_mapping[et] for est in get_equivalent_types(st)} for t in set(subtype_mapping)}
        # remove non-dbpedia types from the ontology
        subtype_mapping = {t: {st for st in sts if dbp_util.is_dbp_type(st) or st == rdf_util.CLASS_OWL_THING} for t, sts in subtype_mapping.items() if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING}
        __TYPE_GRAPH__ = nx.DiGraph(incoming_graph_data=[(t, st) for t, sts in subtype_mapping.items() for st in sts])
    return __TYPE_GRAPH__
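
# Note: the type graph is oriented from supertype to subtype, i.e. an edge
# (t, st) means that st (or one of its equivalent types) is declared a subclass
# of t, so owl:Thing sits at the root of the resulting hierarchy.
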
def get_inverse_resource_property_mapping() -> dict:
    """Return a mapping from DBpedia resources to property-value assignments (i.e. the inverted facts of DBpedia)."""
    global __INVERSE_RESOURCE_PROPERTY_MAPPING__
    if '__INVERSE_RESOURCE_PROPERTY_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_dict_from_rdf([utils.get_data_file('files.dbpedia.mappingbased_objects')], reverse_key=True)
        __INVERSE_RESOURCE_PROPERTY_MAPPING__ = utils.load_or_create_cache('dbpedia_inverse_resource_properties', initializer)
    return __INVERSE_RESOURCE_PROPERTY_MAPPING__
def get_label(dbp_object: str) -> str:
    """Return the label of a DBpedia resource or type."""
    global __RESOURCE_LABELS__
    if '__RESOURCE_LABELS__' not in globals():
        __RESOURCE_LABELS__ = dict(_get_label_mapping())
        __RESOURCE_LABELS__.update(rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL))
    return __RESOURCE_LABELS__[dbp_object] if dbp_object in __RESOURCE_LABELS__ else dbp_util.object2name(dbp_object)
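
# Usage sketch (hypothetical URI): when no rdfs:label is known, the function
# falls back to a name derived from the URI via `dbp_util.object2name`, so a
# string is always returned, e.g.
#
#   label = get_label('http://dbpedia.org/resource/New_York_City')
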
def get_label_category(label: str) -> str:
    """Return the category that fits the given label best."""
    global __INVERSE_CATEGORY_LABELS__
    if '__INVERSE_CATEGORY_LABELS__' not in globals():
        labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_PREFLABEL)
        __INVERSE_CATEGORY_LABELS__ = {v: k for k, v in labels.items()}
    return __INVERSE_CATEGORY_LABELS__[label] if label in __INVERSE_CATEGORY_LABELS__ else cat_util.name2category(label)
def get_label(category: str) -> str:
    """Return the label for the given category."""
    global __CATEGORY_LABELS__
    if '__CATEGORY_LABELS__' not in globals():
        __CATEGORY_LABELS__ = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_skos')], rdf_util.PREDICATE_PREFLABEL)
    return __CATEGORY_LABELS__[category] if category in __CATEGORY_LABELS__ else cat_util.category2name(category)
def get_topics(category: str) -> set:
    """Return the topics for the given category."""
    global __TOPICS__
    if '__TOPICS__' not in globals():
        __TOPICS__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.topical_concepts')], rdf_util.PREDICATE_SUBJECT)
    return __TOPICS__[category]
def get_resource_property_mapping() -> dict:
    """Return a mapping from DBpedia resources to property-value assignments (i.e. the facts of DBpedia)."""
    global __RESOURCE_PROPERTY_MAPPING__
    if '__RESOURCE_PROPERTY_MAPPING__' not in globals():
        property_files = [utils.get_data_file('files.dbpedia.mappingbased_literals'), utils.get_data_file('files.dbpedia.mappingbased_objects')]
        initializer = lambda: rdf_util.create_dict_from_rdf(property_files)
        __RESOURCE_PROPERTY_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_properties', initializer)
    return __RESOURCE_PROPERTY_MAPPING__
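
# Usage sketch (hypothetical URIs; the exact nesting depends on what
# `rdf_util.create_dict_from_rdf` produces): facts appear keyed by subject,
# then by predicate, e.g.
#
#   facts = get_resource_property_mapping()
#   birth_places = facts['http://dbpedia.org/resource/Alan_Turing']['http://dbpedia.org/ontology/birthPlace']
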
def get_equivalent_types(dbp_type: str) -> set:
    """Return the set of equivalent types to the given type (including itself)."""
    global __EQUIVALENT_TYPE_MAPPING__
    if '__EQUIVALENT_TYPE_MAPPING__' not in globals():
        __EQUIVALENT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_CLASS, reflexive=True)
        # remove external types from equivalent mappings as they are prone to errors
        __EQUIVALENT_TYPE_MAPPING__ = defaultdict(set, {t: {et for et in __EQUIVALENT_TYPE_MAPPING__[t] if dbp_util.is_dbp_type(et) or et == rdf_util.CLASS_OWL_THING} for t in __EQUIVALENT_TYPE_MAPPING__ if dbp_util.is_dbp_type(t) or t == rdf_util.CLASS_OWL_THING})
    return {dbp_type} | __EQUIVALENT_TYPE_MAPPING__[dbp_type]
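
# The result is reflexive by construction, so the input type is always part of
# it (hypothetical lookup):
#
#   assert 'http://dbpedia.org/ontology/Person' in get_equivalent_types('http://dbpedia.org/ontology/Person')
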
def get_resources(category: str) -> set:
    """Return all resources of the given category."""
    global __CATEGORY_RESOURCES__
    if '__CATEGORY_RESOURCES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_articles')], rdf_util.PREDICATE_SUBJECT, reverse_key=True)
        __CATEGORY_RESOURCES__ = utils.load_or_create_cache('dbpedia_category_resources', initializer)
    return __CATEGORY_RESOURCES__[category]
def get_resource_categories(dbp_resource: str) -> set:
    """Return all categories the given resource is contained in."""
    global __RESOURCE_CATEGORIES__
    if '__RESOURCE_CATEGORIES__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.category_articles')], rdf_util.PREDICATE_SUBJECT)
        __RESOURCE_CATEGORIES__ = utils.load_or_create_cache('dbpedia_resource_categories', initializer)
    return __RESOURCE_CATEGORIES__[dbp_resource]
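
# Note: get_resources() and get_resource_categories() are inverse views of the
# same category-articles file (one built with reverse_key=True), mapping
# category -> resources and resource -> categories respectively.
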
def load_info(self, body):
    # data files are only available for the eight planets so far; the other
    # body types (star, moon, dwarf planet, ...) do not load anything yet
    data_files = {
        CelestialBodyType.MERCURY: "mercury",
        CelestialBodyType.VENUS: "venus",
        CelestialBodyType.EARTH: "earth",
        CelestialBodyType.MARS: "mars",
        CelestialBodyType.JUPITER: "jupiter",
        CelestialBodyType.SATURN: "saturn",
        CelestialBodyType.URANUS: "uranus",
        CelestialBodyType.NEPTUNE: "neptune",
    }
    if body in data_files:
        self.load_file(get_data_file(data_files[body]))
def resolve_redirect(dbp_resource: str, visited=None) -> str:
    """Return the resource to which `dbp_resource` redirects (if any) or `dbp_resource` itself."""
    global __REDIRECTS__
    if '__REDIRECTS__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.redirects')], rdf_util.PREDICATE_REDIRECTS)
        __REDIRECTS__ = utils.load_or_create_cache('dbpedia_resource_redirects', initializer)
    if dbp_resource in __REDIRECTS__:
        visited = visited or set()
        if dbp_resource not in visited:  # guard against redirect cycles
            return resolve_redirect(__REDIRECTS__[dbp_resource], visited | {dbp_resource})
    return dbp_resource
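
# Usage sketch (hypothetical URIs): redirects are resolved transitively, and
# the `visited` set stops the recursion as soon as a cycle is detected, e.g.
#
#   canonical = resolve_redirect('http://dbpedia.org/resource/NYC')
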
def _compute_inverse_lexicalisations():
    # count how often a lexicalisation points to a given resource
    inv_lex_counts = rdf_util.create_multi_val_count_dict_from_rdf([utils.get_data_file('files.dbpedia.anchor_texts')], rdf_util.PREDICATE_ANCHOR_TEXT, reverse_key=True)
    # merge the counts of redirecting resources into their redirect targets
    for lex, resources in inv_lex_counts.items():
        for res in set(resources):
            redirect_res = resolve_redirect(res)
            if res != redirect_res:
                inv_lex_counts[lex][redirect_res] += inv_lex_counts[lex][res]
                del inv_lex_counts[lex][res]
    # convert the counts to relative frequencies before returning
    return defaultdict(dict, {sub: {obj: count / sum(inv_lex_counts[sub].values()) for obj, count in obj_counts.items()} for sub, obj_counts in inv_lex_counts.items()})
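
# After the conversion, the values of each lexicalisation form a probability
# distribution over resources, e.g. (hypothetical anchor text):
#
#   inv_lex = _compute_inverse_lexicalisations()
#   assert abs(sum(inv_lex['big apple'].values()) - 1.0) < 1e-9
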
def get_object_for_label(label: str) -> str:
    """Return the object that fits the given label."""
    global __RESOURCE_INVERSE_LABELS__
    global __ONTOLOGY_INVERSE_LABELS__
    if '__RESOURCE_INVERSE_LABELS__' not in globals():
        __RESOURCE_INVERSE_LABELS__ = {v: k for k, v in _get_label_mapping().items()}
        ontology_labels = rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_LABEL)
        __ONTOLOGY_INVERSE_LABELS__ = {v: k for k, v in ontology_labels.items()}
    if label in __ONTOLOGY_INVERSE_LABELS__:
        return __ONTOLOGY_INVERSE_LABELS__[label]
    if label in __RESOURCE_INVERSE_LABELS__:
        return __RESOURCE_INVERSE_LABELS__[label]
    return dbp_util.name2resource(label)
def get_disjoint_types(dbp_type: str) -> set:
    """Return all types that are disjoint with `dbp_type` (excluding the wrong disjointness Agent<->Place)."""
    global __DISJOINT_TYPE_MAPPING__
    if '__DISJOINT_TYPE_MAPPING__' not in globals():
        __DISJOINT_TYPE_MAPPING__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_DISJOINT_WITH, reflexive=True)
        # add/remove custom axioms
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {k: {v for v in values if {k, v} not in REMOVED_DISJOINTNESS_AXIOMS} for k, values in __DISJOINT_TYPE_MAPPING__.items()})
        for a, b in ADDED_DISJOINTNESS_AXIOMS:
            __DISJOINT_TYPE_MAPPING__[a].add(b)
            __DISJOINT_TYPE_MAPPING__[b].add(a)
        # complete the disjoint types of each type with the subtypes of its disjoint types
        __DISJOINT_TYPE_MAPPING__ = defaultdict(set, {t: {st for dt in disjoint_types for st in get_transitive_subtype_closure(dt)} for t, disjoint_types in __DISJOINT_TYPE_MAPPING__.items()})
    return __DISJOINT_TYPE_MAPPING__[dbp_type]
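
# Note: the custom axioms are added in both directions to keep disjointness
# symmetric, and disjointness is then propagated downwards: if t is disjoint
# with dt, it is also treated as disjoint with the transitive subtypes of dt.
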
def _retrieve_training_data_gs(nlp: Language):
    training_data = []
    with open(utils.get_data_file('files.listpages.goldstandard_named-entity-tagging'), mode='r') as f:
        for line in f:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                point = annotation['points'][0]
                entities.append((point['start'], point['end'] + 1, annotation['label'][0]))
            training_data.append(Example.from_dict(nlp.make_doc(text), {'entities': entities}))
    return training_data
def _retrieve_plaintexts() -> Iterator[Tuple[str, str]]:
    """Return an iterator over DBpedia resources and their Wikipedia plaintexts."""
    with bz2.open(utils.get_data_file('files.dbpedia.nif_context'), mode='rb') as nif_file:
        nif_collection = pynif.NIFCollection.loads(nif_file.read(), format='turtle')
        for nif_context in nif_collection.contexts:
            resource_uri = nif_context.original_uri[:nif_context.original_uri.rfind('?')]
            # remove bracketed content and line breaks from the text for easier parsing
            resource_plaintext = nif_context.mention.replace('\n', ' ')
            resource_plaintext = nlp_util.remove_bracket_content(resource_plaintext, substitute='')
            resource_plaintext = nlp_util.remove_bracket_content(resource_plaintext, bracket_type='[', substitute='')
            yield resource_uri, resource_plaintext
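
# Usage sketch: the generator yields one (resource URI, cleaned plaintext)
# pair per NIF context, e.g.
#
#   for resource_uri, plaintext in _retrieve_plaintexts():
#       ...
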
def compute_hypernyms(category_graph) -> dict:
    """Retrieve all hypernym relationships from the three sources (Wiki corpus, WebIsALOD, category axioms)."""
    hypernyms = defaultdict(set)

    # collect hypernyms from axiom matches between Wikipedia categories
    cat_headlemmas = category_graph.get_node_LHS()
    axiom_hypernyms = defaultdict(lambda: defaultdict(int))
    for parent, child in category_graph.get_axiom_edges():
        for cl in cat_headlemmas[child]:
            for pl in cat_headlemmas[parent]:
                axiom_hypernyms[cl.lower()][pl.lower()] += 1

    # load the remaining hypernyms
    wiki_hypernyms = utils.load_cache('wikipedia_hypernyms')
    webisalod_data = pickle.load(bz2.open(utils.get_data_file('files.dbpedia.webisalod_hypernyms'), mode='rb'))
    webisalod_hypernyms = defaultdict(dict)
    for parent, child, conf in webisalod_data:
        webisalod_hypernyms[child][parent] = conf

    # merge the hypernyms
    candidates = set(axiom_hypernyms) | set(wiki_hypernyms) | set(webisalod_hypernyms)
    for candidate in candidates:
        hyper_count = defaultdict(int)
        if candidate in axiom_hypernyms:
            for word, count in axiom_hypernyms[candidate].items():
                if count >= THRESHOLD_AXIOM:
                    hyper_count[word] += 2
        if candidate in wiki_hypernyms:
            for word, count in wiki_hypernyms[candidate].items():
                if count >= THRESHOLD_WIKI:
                    hyper_count[word] += 1
        if candidate in webisalod_hypernyms:
            for word, conf in webisalod_hypernyms[candidate].items():
                if conf >= THRESHOLD_WEBISALOD:
                    hyper_count[word] += 1
        hypernyms[candidate] = {word for word, count in hyper_count.items() if count > 1}
    return hypernyms
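
# The merge step is a weighted vote: axiom matches count twice, Wiki corpus and
# WebIsALOD matches count once each, and a pair is kept only with a score above 1.
# A hypernym therefore needs either an axiom match or the agreement of both
# corpora, e.g. (hypothetical lemmas): 'musician' -> 'artist' backed only by
# axioms scores 2 and is kept, while a pair backed only by the Wiki corpus
# scores 1 and is dropped.
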
def get_disambiguation_mapping() -> dict:
    """Return a mapping from disambiguation pages to the resources they disambiguate."""
    global __DISAMBIGUATIONS__
    if '__DISAMBIGUATIONS__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.disambiguations')], rdf_util.PREDICATE_DISAMBIGUATES)
        __DISAMBIGUATIONS__ = defaultdict(set, utils.load_or_create_cache('dbpedia_resource_disambiguations', initializer))
    return __DISAMBIGUATIONS__
def _get_label_mapping() -> dict:
    global __RESOURCE_LABEL_MAPPING__
    if '__RESOURCE_LABEL_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.labels')], rdf_util.PREDICATE_LABEL)
        __RESOURCE_LABEL_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_labels', initializer)
    return __RESOURCE_LABEL_MAPPING__
import more_itertools


class Planet:
    def __init__(self, planet_data):
        self.name, self.resource, self.influence, self.planet_type, self.planet_tech = parse_planet_data(planet_data)


class System:
    def __init__(self, planet_names):
        self.planets = [get_planet(planet_name) for planet_name in planet_names]


PLANETS = [Planet(planet_data) for planet_data in utils.get_data_file(PLANETS_PATH)]
SYSTEMS = [System(planet_names) for planet_names in utils.get_data_file(SYSTEMS_PATH)]

print(len(PLANETS))
names1 = [planet.name for planet in PLANETS]
names2 = [[planet.name for planet in system.planets] for system in SYSTEMS]
print(sorted(names1) == sorted(more_itertools.flatten(names2)))
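
# The two prints above are a quick sanity check (assuming PLANETS_PATH and
# SYSTEMS_PATH describe the same game setup): every planet defined in the
# planets file should appear in exactly one system.
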
def get_main_equivalence_types() -> set:
    """Return all types that appear as the subject of a subclass axiom in the taxonomy (i.e. the main types of their equivalence classes)."""
    global __MAIN_EQUIVALENCE_TYPES__
    if '__MAIN_EQUIVALENCE_TYPES__' not in globals():
        __MAIN_EQUIVALENCE_TYPES__ = rdf_util.create_set_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_SUBCLASS_OF, None)
    return __MAIN_EQUIVALENCE_TYPES__
def get_equivalent_predicates(dbp_predicate: str) -> set:
    """Return all equivalent predicates of a given predicate."""
    global __EQUIVALENT_PREDICATE__
    if '__EQUIVALENT_PREDICATE__' not in globals():
        __EQUIVALENT_PREDICATE__ = rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_EQUIVALENT_PROPERTY)
    return __EQUIVALENT_PREDICATE__[dbp_predicate]
def get_range(dbp_predicate: str) -> Optional[str]:
    """Return the range of a given predicate."""
    global __PREDICATE_RANGE__
    if '__PREDICATE_RANGE__' not in globals():
        __PREDICATE_RANGE__ = defaultdict(lambda: None, rdf_util.create_single_val_dict_from_rdf([utils.get_data_file('files.dbpedia.taxonomy')], rdf_util.PREDICATE_RANGE))
    return __PREDICATE_RANGE__[dbp_predicate]
def _get_resource_type_mapping() -> dict:
    global __RESOURCE_TYPE_MAPPING__
    if '__RESOURCE_TYPE_MAPPING__' not in globals():
        initializer = lambda: rdf_util.create_multi_val_dict_from_rdf([utils.get_data_file('files.dbpedia.instance_types')], rdf_util.PREDICATE_TYPE)
        __RESOURCE_TYPE_MAPPING__ = utils.load_or_create_cache('dbpedia_resource_type_mapping', initializer)
    return __RESOURCE_TYPE_MAPPING__
def parse_race_data(race_data):
    # race_data is assumed to be an item of the races file, i.e. a (name, starting_values) pair
    name, starting_values = race_data
    technologies = starting_values['technologies']
    units = starting_values['units']
    home_planets = starting_values['home planets']
    commodities = starting_values['commodities']
    return name, technologies, units, home_planets, commodities


def print_race_data(race):
    print('Race name:\n\t', race.name)
    print('Techs:\n\t', ', '.join(race.technologies))
    print('Starting planets:\n\t', '\n\t '.join(race.home_planets))
    print('Units:')
    for unit in race.units:
        print('\t', unit, ':', race.units[unit])
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')


class Race:
    def __init__(self, race_data):
        self.name, self.technologies, self.units, planet_names, self.commodities = parse_race_data(race_data)
        self.home_planets = [planet.get_planet(planet_name) for planet_name in planet_names]


RACES = [Race(race_info) for race_info in utils.get_data_file(RACES_PATH).items()]
random.shuffle(RACES)
def _show_info_cb(self, widget):
    if self.screen != Screen.INFO:
        self.set_screen(Screen.INFO)
        self.info_view.load_file(get_data_file("index"))
def _parse_raw_markup_from_xml() -> dict:
    utils.get_logger().info('WIKIPEDIA/XML: Parsing raw markup from XML dump..')
    parser = etree.XMLParser(target=WikiPageParser())
    with bz2.open(utils.get_data_file('files.wikipedia.pages')) as dbp_pages_file:
        page_markup = etree.parse(dbp_pages_file, parser)
        return {dbp_util.name2resource(p): markup for p, markup in page_markup.items()}
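
# Note: when `etree.parse` is given a parser with a target object, it returns
# whatever the target's `close()` method returns; `WikiPageParser` is therefore
# assumed to produce a dict mapping page names to their raw markup.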