def main():
    """Extract occupation triples for each input file and log unmapped terms.

    The files to process come from ``utilities.parse_args``. For every file
    the XML is parsed, a ``Biography`` is built, occupation data is extracted
    into it, and the result is written to a per-person turtle file. All
    per-person graphs are accumulated into one graph that is written out as
    turtle and as pretty-xml. Finally, if ``fail_dict`` has an entry for the
    CWRC Occupation URI, each missed term and the total miss count are
    logged.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    occupation_uri = 'http://sparql.cwrc.ca/ontologies/cwrc#Occupation'

    file_dict = utilities.parse_args(__file__, "Occupation")
    uber_graph = utilities.create_graph()

    for filename, file_entry in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(file_entry)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_occupation_data(soup, person)

        graph = person.to_graph()

        temp_path = ("extracted_triples/occupation_turtle/" + person_id
                     + "_occupations.ttl")
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    print("UberGraph is size:", len(uber_graph))

    temp_path = "extracted_triples/occupations.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/occupations.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")

    # Report occupation terms that could not be mapped, if any were recorded.
    if occupation_uri in fail_dict:
        job_fail_dict = fail_dict[occupation_uri]
        logger.info("Missed Terms: " + str(len(job_fail_dict)))
        for term, misses in job_fail_dict.items():
            logger.info(term + " : " + str(misses))
        logger.info("Total Terms: " + str(sum(job_fail_dict.values())))
def main():
    """Extract cultural-form triples for each input file and write them out.

    The files to process come from ``utilities.parse_args``. For every file
    the XML is parsed, a ``Biography`` is built, cultural-form data is
    extracted into it, and the result is written both as turtle and as
    pretty-xml. Each per-person graph is added to the module-level
    ``uber_graph``, which is then written out in both serializations.
    Start time, completion time, the triple count and the mapping failures
    are logged.
    """
    from biography import Biography
    from bs4 import BeautifulSoup

    # `uber_graph += graph` below rebinds the name, so it must be declared
    # global to update the module-level graph rather than create a local.
    global uber_graph

    file_dict = utilities.parse_args(__file__, "CulturalForm")

    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, file_entry in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print("Running on:", filename)
        logger.info(file_entry)
        print(file_entry)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_cf_data(soup, person)
        person.name = utilities.get_readable_name(soup)

        graph = person.to_graph()

        temp_path = "extracted_triples/cf_turtle/" + person_id + "_cf.ttl"
        utilities.create_extracted_file(temp_path, person)

        temp_path = "extracted_triples/cf_rdf/" + person_id + "_cf.rdf"
        utilities.create_extracted_file(temp_path, person, "pretty-xml")

        uber_graph += graph

    logger.info(str(len(uber_graph)) + " triples created")

    temp_path = "extracted_triples/culturalForms.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/culturalForms.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")

    log_mapping_fails()
    logger.info("Time completed: " + utilities.get_current_time())
def main():
    """Extract birth and death triples for each input file and write them out.

    The files to process come from ``utilities.parse_args``. For every file
    the XML is parsed, a ``Biography`` is built, birth and death data are
    extracted into it, its ``to_file()`` output is printed, and the result
    is written to a per-person turtle file. All per-person graphs are
    accumulated into one graph that is written out as turtle and as
    pretty-xml.
    """
    from bs4 import BeautifulSoup
    import culturalForm
    from biography import Biography

    file_dict = utilities.parse_args(__file__, "Birth/Death")
    print("-" * 200)

    uber_graph = utilities.create_graph()

    for filename, file_entry in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(file_entry)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_birth_data(soup, person)
        extract_death_data(soup, person)
        person.name = utilities.get_readable_name(soup)
        print(person.to_file())

        temp_path = ("extracted_triples/birthdeath_turtle/" + person_id
                     + "_birthdeath.ttl")
        utilities.create_extracted_file(temp_path, person)

        uber_graph += person.to_graph()
        print("=" * 55)

    print("UberGraph is size:", len(uber_graph))

    temp_path = "extracted_triples/birthdeath.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/birthdeath.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")
def new_biography(self, id):
    """Create, save and return a new Biography for the given id.

    The biographies made here are of identified people, so ``id`` must be a
    valid bioport id. The new Biography uses this object as its source.
    """
    new_bio = Biography(id=id, source=self)

    # Only a basic structure is set up here; strictly speaking the result is
    # not a valid biodes document.
    new_bio._set_up_basic_structure()
    new_bio._set_bioport_id(id)

    new_bio.save()
    return new_bio
def main():
    """Extract "other context" triples for each input file and write them out.

    The files to process come from ``utilities.parse_args``. For every file
    the XML is parsed, a ``Biography`` is built, the other-context data is
    extracted into it, and the result is written to a per-person turtle
    file. All per-person graphs are accumulated into one graph that is
    written out as turtle and as pretty-xml.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    ext_type = "Violence, Wealth, Leisure and Society, Other Life Event, Health contexts"
    file_dict = utilities.parse_args(__file__, ext_type)

    uber_graph = utilities.create_graph()

    for filename, file_entry in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(file_entry)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_other_contexts_data(soup, person)

        graph = person.to_graph()

        temp_path = ("extracted_triples/other_contexts_turtle/" + person_id
                     + "_other_contexts.ttl")
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    print("UberGraph is size:", len(uber_graph))

    temp_path = "extracted_triples/other_contexts.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/other_contexts.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")
def main():
    """Run the enabled biography extractors on each input file.

    The files to process come from ``utilities.parse_args``. For every file
    the XML is parsed, a ``Biography`` is built, and the cultural form,
    other contexts, location, occupation, education and birth extractors
    are applied to it. Each person is written to a turtle file and the
    per-person graphs are accumulated into one graph that is written out as
    turtle. The files producing the most and the fewest triples are tracked
    and logged together with the total triple count, the mapping failures,
    and the start and completion times.
    """
    file_dict = utilities.parse_args(
        __file__, "Majority of biography related data")

    uber_graph = utilities.create_graph()

    highest_triples = 0
    least_triples = 0
    smallest_person = None
    largest_person = None

    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, file_entry in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(person_id)
        print(file_entry)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            cf.get_mapped_term("Gender", utilities.get_sex(soup)))

        cf.extract_cf_data(soup, person)
        other_contexts.extract_other_contexts_data(soup, person)
        location.extract_location_data(soup, person)
        occupation.extract_occupation_data(soup, person)
        education.extract_education_data(soup, person)
        birthDeath.extract_birth_data(soup, person)

        # Extractors that are currently disabled:
        # personname.extract_person_name(soup, person)
        # birthDeath.extract_death(soup, person)
        # lifeInfo.extract_cohabitants(soup, person)
        # lifeInfo.extract_family(soup, person)
        # lifeInfo.extract_friends_associates(soup, person)
        # lifeInfo.extract_intimate_relationships(soup, person)
        # lifeInfo.extract_childlessness(soup, person)
        # lifeInfo.extract_children(soup, person)

        graph = person.to_graph()
        triple_count = len(graph)

        # "Nothing recorded yet" is detected through the person variables
        # rather than a count of 0, so that a graph with zero triples is a
        # valid minimum/maximum instead of being mistaken for "unset".
        if largest_person is None or triple_count > highest_triples:
            highest_triples = triple_count
            largest_person = filename
        if smallest_person is None or triple_count < least_triples:
            least_triples = triple_count
            smallest_person = filename

        # triples to files
        temp_path = ("extracted_triples/biography_turtle/" + person_id
                     + "_biography.ttl")
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    temp_path = "extracted_triples/biography_triples.ttl"
    # Called through `utilities`, like every other file helper in this
    # function.
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    cf.log_mapping_fails()

    logger.info(str(len(uber_graph)) + " total triples created")
    logger.info(
        str(largest_person) + " produces the most triples("
        + str(highest_triples) + ")")
    logger.info(
        str(smallest_person) + " produces the least triples("
        + str(least_triples) + ")")

    logger.info("Time completed: " + utilities.get_current_time())