def to_graph(self):
    """Assemble and return the complete RDF graph describing this person.

    Adds the core identity triples, folds in the graphs produced by every
    extracted component list, and links to Wikidata when an id was resolved.
    """
    graph = utilities.create_graph()

    # Core identity triples for the biographee.
    graph.add((self.uri, RDF.type, utilities.NS_DICT["cwrc"].NaturalPerson))
    graph.add((self.uri, utilities.NS_DICT["foaf"].name,
               Literal(self.name, datatype=rdflib.namespace.XSD.string)))
    graph.add((self.uri, RDFS.label,
               Literal(self.name, datatype=rdflib.namespace.XSD.string)))
    graph.add((self.uri, utilities.NS_DICT["cwrc"].hasGender, self.gender))
    graph.add((self.uri, utilities.NS_DICT["foaf"].isPrimaryTopicOf, self.url))

    # Component lists merged in the same order as before.
    for component in (self.cf_list, self.context_list, self.location_list,
                      self.event_list, self.education_list,
                      self.occupation_list, self.birth_list):
        graph += self.create_triples(component)

    if self.deathObj is not None:
        graph += self.deathObj.to_triples()

    for component in (self.cohabitants_list, self.family_list,
                      self.friendsAssociates_list,
                      self.intimateRelationships_list, self.childless_list,
                      self.children_list, self.name_list):
        graph += self.create_triples(component)

    # Link to the matching Wikidata entity when one was found.
    if self.wd_id:
        graph.add((self.uri, utilities.NS_DICT["owl"].sameAs, self.wd_id))

    return graph
def to_triple(self):
    """Return a graph typing this organization and attaching its labels."""
    graph = utilities.create_graph()
    graph.add((self.uri, utilities.NS_DICT["foaf"].name, Literal(self.name)))
    graph.add((self.uri, RDFS.label, Literal(self.name)))
    graph.add((self.uri, RDF.type, utilities.NS_DICT["org"].Organization))
    # Alternate labels are recorded as skos:altLabel literals.
    for alt_label in self.altlabels:
        graph.add((self.uri, utilities.NS_DICT["skos"].altLabel, Literal(alt_label)))
    return graph
def create_multiple_triples(self, graph):
    """Serialize each triple of *graph* on its own and rebuild the
    annotation body from the resulting turtle strings.
    """
    scratch = utilities.create_graph()
    body_graph = rdflib.Graph()
    for triple in graph[:]:
        # Serialize exactly one triple at a time: add it to the scratch
        # graph, take the last non-blank line of the turtle output, then
        # remove it again so the scratch graph stays single-triple.
        scratch.add(triple)
        turtle_line = scratch.serialize(format="ttl").decode().splitlines()[-2]
        scratch.remove(triple)
        body_graph += self.create_ttl_body(turtle_line)
    return body_graph
def main():
    """Run occupation extraction over every input file, writing per-person
    turtle files plus aggregate ttl/rdf files, then log mapping failures."""
    from bs4 import BeautifulSoup
    import culturalForm

    file_dict = utilities.parse_args(__file__, "Occupation")
    entry_num = 1
    uber_graph = utilities.create_graph()

    for filename in file_dict:
        with open(filename) as input_file:
            soup = BeautifulSoup(input_file, 'lxml-xml')

        # Person id is the first 6 characters of the file's basename.
        person_id = filename.split("/")[-1][:6]
        print(filename)
        print(file_dict[filename])
        print(person_id)
        print("*" * 55)

        biographee = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_occupation_data(soup, biographee)

        graph = biographee.to_graph()
        temp_path = "extracted_triples/occupation_turtle/" + person_id + "_occupations.ttl"
        utilities.create_extracted_file(temp_path, biographee)

        uber_graph += graph
        entry_num += 1

    print("UberGraph is size:", len(uber_graph))
    temp_path = "extracted_triples/occupations.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)
    temp_path = "extracted_triples/occupations.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")

    # Report any occupation terms that failed to map.
    occupation_key = 'http://sparql.cwrc.ca/ontologies/cwrc#Occupation'
    if occupation_key in fail_dict:
        job_fail_dict = fail_dict[occupation_key]
        logger.info("Missed Terms: " + str(len(job_fail_dict.keys())))
        count = 0
        for term in job_fail_dict.keys():
            logger.info(term + " : " + str(job_fail_dict[term]))
            count += job_fail_dict[term]
        logger.info("Total Terms: " + str(count))
def to_triple(self, person):
    """Return birth date/position/place triples attached to *person*."""
    graph = utilities.create_graph()
    cwrc_ns = utilities.NS_DICT["cwrc"]
    if self.date:
        graph.add((person.uri, cwrc_ns.hasBirthDate, format_date(self.date)))
    # A birth may carry several positions (e.g. ordinal among siblings).
    for birth_position in self.position:
        graph.add((person.uri, cwrc_ns.hasBirthPosition, birth_position))
    if self.place:
        graph.add((person.uri, cwrc_ns.hasBirthPlace, self.place))
    return graph
def to_triple(self, person):
    """Return death date/burial/place triples attached to *person*."""
    graph = utilities.create_graph()
    cwrc_ns = utilities.NS_DICT["cwrc"]
    if self.date:
        graph.add((person.uri, cwrc_ns.hasDeathDate, format_date(self.date)))
    if self.burial:
        graph.add((person.uri, cwrc_ns.hasBurialPlace, self.burial))
    if self.place:
        graph.add((person.uri, cwrc_ns.hasDeathPlace, self.place))
    return graph
def main():
    """Run birth/death extraction over every input file, writing per-person
    turtle files plus aggregate ttl/rdf files."""
    from bs4 import BeautifulSoup
    import culturalForm
    from biography import Biography

    file_dict = utilities.parse_args(__file__, "Birth/Death")
    print("-" * 200)
    entry_num = 1
    uber_graph = utilities.create_graph()

    for filename in file_dict:
        with open(filename) as input_file:
            soup = BeautifulSoup(input_file, 'lxml-xml')

        # Person id is the first 6 characters of the file's basename.
        person_id = filename.split("/")[-1][:6]
        print(filename)
        print(file_dict[filename])
        print(person_id)
        print("*" * 55)

        biographee = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_birth_data(soup, biographee)
        extract_death_data(soup, biographee)
        biographee.name = utilities.get_readable_name(soup)
        print(biographee.to_file())

        temp_path = "extracted_triples/birthdeath_turtle/" + person_id + "_birthdeath.ttl"
        utilities.create_extracted_file(temp_path, biographee)

        uber_graph += biographee.to_graph()
        entry_num += 1
        print("=" * 55)

    print("UberGraph is size:", len(uber_graph))
    temp_path = "extracted_triples/birthdeath.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)
    temp_path = "extracted_triples/birthdeath.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")
def main():
    """Run other-contexts extraction over every input file, writing
    per-person turtle files plus aggregate ttl/rdf files."""
    from bs4 import BeautifulSoup
    import culturalForm

    ext_type = "Violence, Wealth, Leisure and Society, Other Life Event, Health contexts"
    file_dict = utilities.parse_args(__file__, ext_type)
    entry_num = 1
    uber_graph = utilities.create_graph()

    for filename in file_dict:
        with open(filename) as input_file:
            soup = BeautifulSoup(input_file, 'lxml-xml')

        # Person id is the first 6 characters of the file's basename.
        person_id = filename.split("/")[-1][:6]
        print(filename)
        print(file_dict[filename])
        print(person_id)
        print("*" * 55)

        biographee = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_other_contexts_data(soup, biographee)

        graph = biographee.to_graph()
        temp_path = "extracted_triples/other_contexts_turtle/" + person_id + "_other_contexts.ttl"
        utilities.create_extracted_file(temp_path, biographee)

        uber_graph += graph
        entry_num += 1

    print("UberGraph is size:", len(uber_graph))
    temp_path = "extracted_triples/other_contexts.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)
    temp_path = "extracted_triples/other_contexts.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")
def to_triple(self, person):
    """Return a graph describing this name entity and linking it to *person*."""
    # NOTE(review): ``g`` is deliberately bound at module level here; other
    # code in this module may rely on the global, so the binding is kept.
    global g
    g = utilities.create_graph()
    name_entity = self.value
    for type_label in self.typeLabels:
        g.add((name_entity, utilities.NS_DICT["rdf"].type, type_label))
    g.add((name_entity, utilities.NS_DICT["rdf"].label, self.personName))
    # Any extra predicate/value pairs recorded for this name entity.
    if self.otherTriples is not None:
        for extra in self.otherTriples:
            g.add((name_entity, extra["predicate"], Literal(extra["value"])))
    g.add((person.uri, utilities.NS_DICT["cwrc"].hasName, name_entity))
    if self.hasSpareGraph:
        g += self.spareGraph
    return g
def makeBirthGraph(self, givenNameList, surNameList):
    """Build a graph of ordered forename/surname parts for this name entity.

    Sort order runs continuously across the forenames followed by the
    surnames, exactly as the parts are listed.
    """
    g = utilities.create_graph()
    name_entity = self.value
    cwrc_ns = utilities.NS_DICT["cwrc"]
    rdf_ns = utilities.NS_DICT["rdf"]

    sort_order = 1
    for names, part_type in ((givenNameList, cwrc_ns.Forename),
                             (surNameList, cwrc_ns.Surname)):
        for name in names:
            part_uri = utilities.make_standard_uri(name)
            g.add((part_uri, rdf_ns.type, part_type))
            g.add((part_uri, cwrc_ns.hasSortOrder, Literal(sort_order)))
            g.add((part_uri, rdf_ns.label, Literal(name)))
            g.add((name_entity, cwrc_ns.hasNamePart, part_uri))
            sort_order += 1

    g.add((name_entity, rdf_ns.label, self.personName))
    return g
def to_triple(self, person):
    """Return a one-triple graph linking *person* to this value via self.uri."""
    graph = utilities.create_graph()
    graph.add((person.uri, self.uri, self.value))
    return graph
from Utils import utilities from Utils.context import Context from Utils.event import Event from Utils.organizations import get_org_uri """ Status: ~75% TODO: - review unmapped instances - revise method of capturing failed mappings to be similar to culturalforms """ # temp log library for debugging # --> to be eventually replaced with proper logging library logger = utilities.config_logger("occupation") uber_graph = utilities.create_graph() context_count = 0 event_count = 0 class Occupation(object): """docstring for Occupation """ def __init__(self, job_tag, predicate=None, other_attributes=None): super(Occupation, self).__init__() if predicate: self.predicate = predicate self.value = self.get_mapped_term(job_tag) else: self.predicate = self.get_occupation_predicate(job_tag)
def main():
    """Extract the full biography dataset from every input file, write
    per-person and aggregate triple files, and log which entry produced
    the most and the fewest triples."""
    file_dict = utilities.parse_args(__file__, "Majority of biography related data")
    entry_num = 1
    uber_graph = utilities.create_graph()

    # Track the entries that produce the most and the fewest triples.
    highest_triples = 0
    least_triples = 0
    smallest_person = None
    largest_person = None

    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename in file_dict.keys():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # Person id is the first 6 characters of the file's basename.
        person_id = filename.split("/")[-1][:6]
        print(person_id)
        print(file_dict[filename])
        print("*" * 55)

        person = Biography(
            person_id, soup, cf.get_mapped_term("Gender", utilities.get_sex(soup)))

        cf.extract_cf_data(soup, person)
        other_contexts.extract_other_contexts_data(soup, person)
        location.extract_location_data(soup, person)
        occupation.extract_occupation_data(soup, person)
        education.extract_education_data(soup, person)
        # Remaining extractors are currently disabled:
        # personname.extract_person_name(soup, person)
        birthDeath.extract_birth_data(soup, person)
        # birthDeath.extract_death(soup, person)
        # lifeInfo.extract_cohabitants(soup, person)
        # lifeInfo.extract_family(soup, person)
        # lifeInfo.extract_friends_associates(soup, person)
        # lifeInfo.extract_intimate_relationships(soup, person)
        # lifeInfo.extract_childlessness(soup, person)
        # lifeInfo.extract_children(soup, person)

        graph = person.to_graph()
        triple_count = len(graph)
        if triple_count > highest_triples:
            highest_triples = triple_count
            largest_person = filename
        if least_triples == 0 or triple_count < least_triples:
            least_triples = triple_count
            smallest_person = filename

        # triples to files
        temp_path = "extracted_triples/biography_turtle/" + person_id + "_biography.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph
        entry_num += 1

    temp_path = "extracted_triples/biography_triples.ttl"
    # FIX: was a bare create_extracted_uberfile(...) call, inconsistent with
    # every other extraction script (NameError unless imported directly);
    # qualify it through the utilities module like the rest of the codebase.
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    cf.log_mapping_fails()
    logger.info(str(len(uber_graph)) + " total triples created")
    logger.info(
        str(largest_person) + " produces the most triples(" + str(highest_triples) + ")")
    logger.info(
        str(smallest_person) + " produces the least triples(" + str(least_triples) + ")")
    logger.info("Time completed: " + utilities.get_current_time())
def to_triple(self, person=None):
    """Return the annotation graph for this context.

    Always builds the textual-body snippet and the identifying annotation;
    additionally builds a describing annotation when this context's
    motivation is oa:describing. Mentioned NAME tags are typed as
    cwrc:NaturalPerson.
    """
    g = utilities.create_graph()

    # Creating Textual body first
    snippet_uri = rdflib.term.URIRef(str(self.uri) + "_Snippet")
    if person:
        source_url = rdflib.term.URIRef(self.src + person.id + "#" + self.heading)
        snippet_label = person.name + " - " + self.context_label + " snippet"
    else:
        # No person: free-standing entry ("FE") source anchor.
        source_url = rdflib.term.URIRef(self.src + "#FE")
        snippet_label = "FE" + " - " + self.context_label + " snippet"
    g.add((snippet_uri, RDF.type, utilities.NS_DICT["oa"].TextualBody))
    g.add((snippet_uri, RDFS.label, rdflib.term.Literal(snippet_label)))
    g.add((snippet_uri, utilities.NS_DICT["oa"].hasSource, source_url))
    g.add((snippet_uri, utilities.NS_DICT["dcterms"].description,
           rdflib.term.Literal(self.text, datatype=rdflib.namespace.XSD.string)))

    # Creating identifying context first and always
    if person:
        context_label = person.name + " - " + self.context_label + " identifying annotation"
    else:
        context_label = self.context_label + " identifying annotation"
    identifying_uri = utilities.create_uri("data", self.id + "_identifying")
    g.add((identifying_uri, RDF.type, self.context_type))
    g.add((identifying_uri, RDFS.label, rdflib.term.Literal(context_label)))
    g.add((identifying_uri, utilities.NS_DICT["oa"].hasTarget, snippet_uri))
    g.add((identifying_uri, utilities.NS_DICT["oa"].motivatedBy,
           utilities.NS_DICT["oa"].identifying))
    self.subjects += identifying_motivation(self.tag)
    if self.triples and person:
        self.subjects += self.get_subjects(self.triples, person)
    for x in self.subjects:
        g.add((identifying_uri, utilities.NS_DICT["oa"].hasBody, x))
    if person:
        g.add((identifying_uri, utilities.NS_DICT["oa"].hasBody, person.uri))
    if self.event:
        g.add((identifying_uri, utilities.NS_DICT["cwrc"].hasEvent, self.event))

    # Creating describing context if applicable
    if self.motivation == utilities.NS_DICT["oa"].describing:
        self.uri = utilities.create_uri("data", self.id + "_describing")
        context_label = person.name + " - " + self.context_label + " describing annotation"
        g.add((self.uri, RDF.type, self.context_type))
        g.add((self.uri, RDFS.label, rdflib.term.Literal(context_label)))
        g.add((self.uri, utilities.NS_DICT["cwrc"].hasIDependencyOn, identifying_uri))
        g.add((self.uri, utilities.NS_DICT["oa"].hasTarget, person.uri))
        g.add((self.uri, utilities.NS_DICT["oa"].hasTarget, snippet_uri))
        g.add((self.uri, utilities.NS_DICT["oa"].motivatedBy, self.motivation))
        for x in self.subjects:
            g.add((self.uri, utilities.NS_DICT["dcterms"].subject, x))
        for x in self.triples:
            # FIX: build the component graph once instead of calling
            # x.to_triple(person) up to three times per component; also
            # drops a dead triple_str assignment in the multi-line branch.
            component_graph = x.to_triple(person)
            serialized_lines = component_graph.serialize(
                format="ttl").decode().splitlines()
            content_lines = [
                y for y in serialized_lines if "@prefix" not in y and y != ''
            ]
            if len(content_lines) == 1:
                g += self.create_ttl_body(serialized_lines[-2])
            else:
                g += self.create_multiple_triples(component_graph)
        if self.event:
            g.add((self.uri, utilities.NS_DICT["cwrc"].hasEvent, self.event))
            g.add((self.event, utilities.NS_DICT["cwrc"].hasContext, self.uri))

    # Creating the mentioned people as natural person
    # NOTE(review): nesting here was reconstructed from a collapsed source
    # line; this loop is placed at method level — confirm against history.
    for x in self.tag.find_all("NAME"):
        uri = utilities.make_standard_uri(x.get("STANDARD"))
        g.add((uri, RDF.type, utilities.NS_DICT["cwrc"].NaturalPerson))
        g.add((uri, RDFS.label,
               Literal(x.get("STANDARD"), datatype=rdflib.namespace.XSD.string)))
        g.add((uri, utilities.NS_DICT["foaf"].name,
               Literal(x.get("STANDARD"), datatype=rdflib.namespace.XSD.string)))

    return g
def to_triple(self, person=None):
    """Return a graph describing this event (label, type, place, actors,
    time typing/certainty, and time stamps); when *person* is given, the
    event is also linked to that person via cwrc:hasEvent."""
    g = utilities.create_graph()
    # attaching event to person, context will need link event fx
    if person:
        g.add((person.uri, utilities.NS_DICT["cwrc"].hasEvent, self.uri))
        # Not sure if inverse is necessary atm
        # g.add((self.uri, utilities.NS_DICT["cwrc"].eventOf, person.uri))
        # g.add((person.uri, utilities.NS_DICT["sem"].actorType, utilities.NS_DICT["cwrc"].NaturalPerson))

    # Labelling the event
    g.add((self.uri, RDFS.label, Literal(self.title)))
    text = self.date_tag.text + ": " + self.text
    g.add((self.uri, utilities.NS_DICT["dcterms"].description, Literal(text)))

    # Typing of the event
    g.add((self.uri, RDF.type, utilities.NS_DICT["sem"].Event))
    for x in self.event_type:
        g.add((self.uri, utilities.NS_DICT["sem"].eventType, x))

    # Attaching place
    for x in self.place:
        g.add((self.uri, utilities.NS_DICT["sem"].hasPlace, x))

    # Attaching actors, including the biographee incase they're not mentioned
    if person:
        g.add((self.uri, utilities.NS_DICT["sem"].hasActor, person.uri))
    for x in self.actors:
        g.add((self.uri, utilities.NS_DICT["sem"].hasActor, x))

    # Typing of time and attaching certainty
    g.add((self.uri, utilities.NS_DICT["sem"].timeType,
           utilities.create_cwrc_uri(self.time_type)))
    if self.time_certainty:
        g.add((self.uri, utilities.NS_DICT["cwrc"].hasTimeCertainty,
               utilities.create_cwrc_uri(self.time_certainty)))

    # Attaching the time stamp to the event
    if self.predicate:
        # An explicit predicate overrides the sem: time-stamp properties.
        g.add((self.uri, self.predicate, self.date))
    else:
        # assumes self.date is a "start:end" string for both branches
        # below — TODO confirm against the date-parsing code
        if self.time_type == "PunctiveTime":
            g.add((self.uri, utilities.NS_DICT["sem"].hasEarliestBeginTimeStamp,
                   format_date(self.date.split(":")[0])))
            g.add((self.uri, utilities.NS_DICT["sem"].hasLatestEndTimeStamp,
                   format_date(self.date.split(":")[1])))
        elif self.time_type == "IntervalTime":
            g.add((self.uri, utilities.NS_DICT["sem"].hasBeginTimeStamp,
                   format_date(self.date.split(":")[0])))
            g.add((self.uri, utilities.NS_DICT["sem"].hasEndTimeStamp,
                   format_date(self.date.split(":")[1])))
        # NOTE(review): nesting of this DATESTRUCT branch is ambiguous in the
        # collapsed source; it is placed inside the no-predicate path here —
        # confirm against version history before relying on it.
        if self.date_tag.name == "DATESTRUCT":
            # Whitespace-normalize the structured date's text for sem:hasTime.
            g.add((self.uri, utilities.NS_DICT["sem"].hasTime,
                   Literal(' '.join(str(self.date_tag.get_text()).split()))))
    return g