Beispiel #1
0
def main():
    """Extract occupation triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built and filled in by ``extract_occupation_data``, and
    a per-person Turtle file is written.  The per-person graphs are merged
    into one graph that is written as Turtle and as pretty RDF/XML.  Last,
    the entries stored in ``fail_dict`` under the Occupation URI (if any)
    are logged together with their summed counts.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    file_dict = utilities.parse_args(__file__, "Occupation")

    uber_graph = utilities.create_graph()

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(file_info)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_occupation_data(soup, person)

        # Build the graph before writing the file, as the original did.
        graph = person.to_graph()

        temp_path = "extracted_triples/occupation_turtle/" + person_id + "_occupations.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    print("UberGraph is size:", len(uber_graph))
    utilities.create_extracted_uberfile(
        "extracted_triples/occupations.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/occupations.rdf", uber_graph, "pretty-xml")

    # Report occupation terms that were recorded as mapping failures.
    occupation_uri = 'http://sparql.cwrc.ca/ontologies/cwrc#Occupation'
    if occupation_uri in fail_dict:
        job_fail_dict = fail_dict[occupation_uri]
        logger.info("Missed Terms: " + str(len(job_fail_dict)))
        count = 0
        for term, misses in job_fail_dict.items():
            logger.info(term + " : " + str(misses))
            count += misses
        logger.info("Total Terms: " + str(count))
Beispiel #2
0
def main():
    """Extract cultural-form triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built, filled in by ``extract_cf_data`` and given a
    readable name, and per-person Turtle and RDF/XML files are written.
    Each person's graph is added to the module-level ``uber_graph``, which
    is then written as Turtle and as pretty RDF/XML.  Start and completion
    times, the triple count and mapping failures are logged.
    """
    from biography import Biography
    from bs4 import BeautifulSoup

    file_dict = utilities.parse_args(__file__, "CulturalForm")

    # ``uber_graph`` lives at module level; ``+=`` below rebinds that global.
    global uber_graph

    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print("Running on:", filename)
        logger.info(file_info)
        print(file_info)
        print("*" * 55)

        person = Biography(person_id, soup,
                           get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_cf_data(soup, person)
        person.name = utilities.get_readable_name(soup)
        graph = person.to_graph()

        # One Turtle and one RDF/XML file per person.
        temp_path = "extracted_triples/cf_turtle/" + person_id + "_cf.ttl"
        utilities.create_extracted_file(temp_path, person)
        temp_path = "extracted_triples/cf_rdf/" + person_id + "_cf.rdf"
        utilities.create_extracted_file(temp_path, person, "pretty-xml")

        uber_graph += graph

    logger.info(str(len(uber_graph)) + " triples created")

    utilities.create_extracted_uberfile(
        "extracted_triples/culturalForms.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/culturalForms.rdf", uber_graph, "pretty-xml")

    log_mapping_fails()
    logger.info("Time completed: " + utilities.get_current_time())
Beispiel #3
0
def main():
    """Extract birth and death triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built, filled in by ``extract_birth_data`` and
    ``extract_death_data`` and given a readable name.  The result of
    ``person.to_file()`` is printed and a per-person Turtle file is written.
    The per-person graphs are merged into one graph that is written as
    Turtle and as pretty RDF/XML.
    """
    from bs4 import BeautifulSoup
    import culturalForm
    from biography import Biography

    file_dict = utilities.parse_args(__file__, "Birth/Death")
    print("-" * 200)

    uber_graph = utilities.create_graph()

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(file_info)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_birth_data(soup, person)
        extract_death_data(soup, person)
        person.name = utilities.get_readable_name(soup)
        print(person.to_file())

        temp_path = "extracted_triples/birthdeath_turtle/" + person_id + "_birthdeath.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += person.to_graph()
        print("=" * 55)

    print("UberGraph is size:", len(uber_graph))
    utilities.create_extracted_uberfile(
        "extracted_triples/birthdeath.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/birthdeath.rdf", uber_graph, "pretty-xml")
    def new_biography(self, id):
        """Create, save and return a new biography belonging to this source.

        Biographies created here describe identified people, so ``id`` must
        be a valid bioport id.
        """
        new_bio = Biography(id=id, source=self)
        # Only a basic skeleton is set up here; strictly speaking the result
        # is not yet a valid biodes document.
        new_bio._set_up_basic_structure()
        new_bio._set_bioport_id(id)
        new_bio.save()
        return new_bio
Beispiel #5
0
def main():
    """Extract "other context" triples for every input file and write them out.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built and filled in by ``extract_other_contexts_data``,
    and a per-person Turtle file is written.  The per-person graphs are
    merged into one graph that is written as Turtle and as pretty RDF/XML.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    ext_type = "Violence, Wealth, Leisure and Society, Other Life Event, Health contexts"
    file_dict = utilities.parse_args(__file__, ext_type)

    uber_graph = utilities.create_graph()

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(file_info)
        print(person_id)
        print("*" * 55)

        gender = culturalForm.get_mapped_term("Gender", utilities.get_sex(soup))
        person = Biography(person_id, soup, gender)
        extract_other_contexts_data(soup, person)

        # Build the graph before writing the file, as the original did.
        graph = person.to_graph()

        temp_path = "extracted_triples/other_contexts_turtle/" + person_id + "_other_contexts.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    print("UberGraph is size:", len(uber_graph))
    utilities.create_extracted_uberfile(
        "extracted_triples/other_contexts.ttl", uber_graph)
    utilities.create_extracted_uberfile(
        "extracted_triples/other_contexts.rdf", uber_graph, "pretty-xml")
Beispiel #6
0
def main():
    """Run the enabled biography extractors over every input file.

    For each file returned by ``utilities.parse_args`` the XML is parsed, a
    ``Biography`` is built and passed through the cultural-form,
    other-contexts, location, occupation, education and birth extractors,
    and a per-person Turtle file is written.  The per-person graphs are
    merged into one graph that is written as Turtle.  Mapping failures,
    the total triple count, and the files producing the most and the fewest
    triples are logged, along with start and completion times.
    """
    file_dict = utilities.parse_args(__file__,
                                     "Majority of biography related data")

    uber_graph = utilities.create_graph()

    highest_triples = 0
    least_triples = 0
    smallest_person = None
    largest_person = None
    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, file_info in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(person_id)
        print(file_info)
        print("*" * 55)
        person = Biography(
            person_id, soup,
            cf.get_mapped_term("Gender", utilities.get_sex(soup)))
        cf.extract_cf_data(soup, person)
        other_contexts.extract_other_contexts_data(soup, person)
        location.extract_location_data(soup, person)
        occupation.extract_occupation_data(soup, person)
        education.extract_education_data(soup, person)

        # Extractors that are currently disabled are kept here for reference.
        # personname.extract_person_name(soup, person)
        birthDeath.extract_birth_data(soup, person)
        # birthDeath.extract_death(soup, person)
        # lifeInfo.extract_cohabitants(soup, person)
        # lifeInfo.extract_family(soup, person)
        # lifeInfo.extract_friends_associates(soup, person)
        # lifeInfo.extract_intimate_relationships(soup, person)
        # lifeInfo.extract_childlessness(soup, person)
        # lifeInfo.extract_children(soup, person)

        graph = person.to_graph()
        triple_count = len(graph)

        if triple_count > highest_triples:
            highest_triples = triple_count
            largest_person = filename
        # "No minimum recorded yet" is detected via smallest_person rather
        # than least_triples == 0, so that a genuine minimum of 0 triples is
        # not overwritten by the next file.
        if smallest_person is None or triple_count < least_triples:
            least_triples = triple_count
            smallest_person = filename

        # triples to files
        temp_path = "extracted_triples/biography_turtle/" + person_id + "_biography.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    # Qualified with ``utilities.`` to match the other calls into that module
    # in this function; the bare name is not otherwise referenced here.
    temp_path = "extracted_triples/biography_triples.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    cf.log_mapping_fails()
    logger.info(str(len(uber_graph)) + " total triples created")
    logger.info(
        str(largest_person) + " produces the most triples(" +
        str(highest_triples) + ")")
    logger.info(
        str(smallest_person) + " produces the least triples(" +
        str(least_triples) + ")")

    logger.info("Time completed: " + utilities.get_current_time())