def extract_org_data(bio):
    """Retype organizations named in a biography and link them to cultural forms.

    Looks up every POLITICALAFFILIATION, DENOMINATION and SCHOOL element in
    ``bio``. For each organization that ``get_org`` returns for such an
    element, the module-level ``uber_graph`` gains an ``RDF.type`` triple with
    the matching cwrc class (political, religious or educational organization)
    and loses the generic ``org:Organization`` typing. For the political and
    religious cases the element's value is additionally mapped through
    ``culturalForm.get_mapped_term``; when that yields a URI, a
    ``cwrc:hasOrganization`` triple from the mapped term to the organization
    is added.

    Args:
        bio: Parsed document exposing ``find_all`` (presumably the
            BeautifulSoup tree of one biography entry -- confirm with callers).

    Returns:
        None. ``uber_graph`` is mutated in place.
    """
    # Imported here as in the original (presumably to avoid an import cycle
    # with culturalForm -- confirm before moving it to module level).
    import culturalForm as cf

    cwrc = utilities.NS_DICT["cwrc"]
    generic_org_type = utilities.NS_DICT["org"].Organization

    # (element name, organization class, culturalForm category or None).
    # A tuple keeps the original processing order; SCHOOL has no mapping
    # category because no hasOrganization triple is produced for it.
    org_kinds = (
        ("POLITICALAFFILIATION", cwrc.PoliticalOrganization, "PoliticalAffiliation"),
        ("DENOMINATION", cwrc.ReligiousOrganization, "Religion"),
        ("SCHOOL", cwrc.EducationalOrganization, None),
    )

    for element, org_type, category in org_kinds:
        for instance in bio.find_all(element):
            org = get_org(instance)
            if not org:
                continue

            for x in org:
                org_uri = get_org_uri(x)
                uber_graph.add((org_uri, RDF.type, org_type))
                uber_graph.remove((org_uri, RDF.type, generic_org_type))

                if category is None:
                    continue

                # Adding the hasOrganization relation. The lookup stays inside
                # the loop so get_mapped_term is called once per organization,
                # exactly as before (it may have bookkeeping side effects).
                mapped_value = cf.get_mapped_term(category, cf.get_value(instance))
                if isinstance(mapped_value, rdflib.term.URIRef):
                    uber_graph.add((mapped_value, cwrc.hasOrganization, org_uri))
def main():
    """Extract occupation triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built, ``extract_occupation_data`` is run on it and the
    per-person result is written under ``extracted_triples/occupation_turtle/``.
    All per-person graphs are accumulated and written as Turtle and
    pretty-xml. Finally, any entries recorded in ``fail_dict`` for the cwrc
    Occupation URI are logged together with their total.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    file_dict = utilities.parse_args(__file__, "Occupation")
    entry_num = 1  # running entry counter; not read anywhere in this function
    uber_graph = utilities.create_graph()

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.rsplit("/", 1)[-1][:6]

        print(filename, description, person_id, "*" * 55, sep="\n")

        gender = culturalForm.get_mapped_term("Gender", utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)
        extract_occupation_data(soup, person)

        graph = person.to_graph()

        temp_path = "extracted_triples/occupation_turtle/" + person_id + "_occupations.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph
        entry_num += 1

    print("UberGraph is size:", len(uber_graph))

    # Combined output in both serializations.
    utilities.create_extracted_uberfile("extracted_triples/occupations.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/occupations.rdf", uber_graph, "pretty-xml")

    occupation_key = 'http://sparql.cwrc.ca/ontologies/cwrc#Occupation'
    if occupation_key in fail_dict:
        job_fail_dict = fail_dict[occupation_key]
        logger.info("Missed Terms: " + str(len(job_fail_dict)))

        count = 0
        for term, misses in job_fail_dict.items():
            logger.info(term + " : " + str(misses))
            count += misses
        logger.info("Total Terms: " + str(count))
def main():
    """Extract birth and death triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built, birth and death data are extracted, the person's
    readable name is set and the result of ``person.to_file()`` is printed.
    Each person is written under ``extracted_triples/birthdeath_turtle/`` and
    the accumulated graph is written as Turtle and pretty-xml.
    """
    from bs4 import BeautifulSoup
    import culturalForm
    from biography import Biography

    file_dict = utilities.parse_args(__file__, "Birth/Death")
    print("-" * 200)
    entry_num = 1  # running entry counter; not read anywhere in this function
    uber_graph = utilities.create_graph()

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.rsplit("/", 1)[-1][:6]

        print(filename, description, person_id, "*" * 55, sep="\n")

        gender = culturalForm.get_mapped_term("Gender", utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)

        extract_birth_data(soup, person)
        extract_death_data(soup, person)
        person.name = utilities.get_readable_name(soup)
        print(person.to_file())

        temp_path = "extracted_triples/birthdeath_turtle/" + person_id + "_birthdeath.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += person.to_graph()
        entry_num += 1
        # NOTE(review): reconstructed from whitespace-collapsed source; this
        # separator is assumed to close each entry -- confirm its nesting.
        print("=" * 55)

    print("UberGraph is size:", len(uber_graph))

    # Combined output in both serializations.
    utilities.create_extracted_uberfile("extracted_triples/birthdeath.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/birthdeath.rdf", uber_graph, "pretty-xml")
def main():
    """Extract "other context" triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built and ``extract_other_contexts_data`` is run on it.
    Each person is written under ``extracted_triples/other_contexts_turtle/``
    and the accumulated graph is written as Turtle and pretty-xml.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    ext_type = "Violence, Wealth, Leisure and Society, Other Life Event, Health contexts"
    file_dict = utilities.parse_args(__file__, ext_type)
    entry_num = 1  # running entry counter; not read anywhere in this function
    uber_graph = utilities.create_graph()

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.rsplit("/", 1)[-1][:6]

        print(filename, description, person_id, "*" * 55, sep="\n")

        gender = culturalForm.get_mapped_term("Gender", utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)
        extract_other_contexts_data(soup, person)

        graph = person.to_graph()

        temp_path = "extracted_triples/other_contexts_turtle/" + person_id + "_other_contexts.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph
        entry_num += 1

    print("UberGraph is size:", len(uber_graph))

    # Combined output in both serializations.
    utilities.create_extracted_uberfile("extracted_triples/other_contexts.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/other_contexts.rdf", uber_graph, "pretty-xml")
def main():
    """Run the enabled biography extractors over every input file.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built and the cultural-form, other-context, location,
    occupation, education and birth extractors are applied to it. Each person
    is written under ``extracted_triples/biography_turtle/`` and the
    accumulated graph is written to ``extracted_triples/biography_triples.ttl``.
    Mapping failures, the total triple count, the entries producing the most
    and the fewest triples, and start/finish times are logged.
    """
    file_dict = utilities.parse_args(__file__, "Majority of biography related data")
    uber_graph = utilities.create_graph()

    highest_triples = 0
    least_triples = 0
    smallest_person = None
    largest_person = None

    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(person_id)
        print(description)
        print("*" * 55)

        person = Biography(
            person_id, soup, cf.get_mapped_term("Gender", utilities.get_sex(soup)))

        cf.extract_cf_data(soup, person)
        other_contexts.extract_other_contexts_data(soup, person)
        location.extract_location_data(soup, person)
        occupation.extract_occupation_data(soup, person)
        education.extract_education_data(soup, person)
        # personname.extract_person_name(soup, person)
        birthDeath.extract_birth_data(soup, person)
        # Extractors below are currently disabled; kept for reference.
        # birthDeath.extract_death(soup, person)
        # lifeInfo.extract_cohabitants(soup, person)
        # lifeInfo.extract_family(soup, person)
        # lifeInfo.extract_friends_associates(soup, person)
        # lifeInfo.extract_intimate_relationships(soup, person)
        # lifeInfo.extract_childlessness(soup, person)
        # lifeInfo.extract_children(soup, person)

        graph = person.to_graph()
        triple_count = len(graph)

        # "Not recorded yet" is detected through the person slot being None
        # rather than through a count of 0, so an entry that really produces
        # 0 triples is tracked correctly instead of being treated as unset.
        if largest_person is None or triple_count > highest_triples:
            highest_triples = triple_count
            largest_person = filename
        if smallest_person is None or triple_count < least_triples:
            least_triples = triple_count
            smallest_person = filename

        # triples to files
        temp_path = "extracted_triples/biography_turtle/" + person_id + "_biography.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    temp_path = "extracted_triples/biography_triples.ttl"
    # Called through the utilities module, like create_extracted_file above,
    # so it does not depend on a bare name being in scope.
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    cf.log_mapping_fails()

    logger.info(str(len(uber_graph)) + " total triples created")
    logger.info(
        str(largest_person) + " produces the most triples(" + str(highest_triples) + ")")
    logger.info(
        str(smallest_person) + " produces the least triples(" + str(least_triples) + ")")

    logger.info("Time completed: " + utilities.get_current_time())