Exemple #1
0
    def to_graph(self):
        """Assemble and return the graph describing this person.

        The graph receives, in order: the identifying triples (type, name,
        label, gender, primary-topic URL), the output of
        ``create_triples`` for each component list, the death object's
        triples when ``deathObj`` is set, the relationship/name lists, and
        an ``owl:sameAs`` link when ``wd_id`` is truthy.
        """
        g = utilities.create_graph()
        ns = utilities.NS_DICT
        name_literal = Literal(self.name, datatype=rdflib.namespace.XSD.string)

        # Identifying triples for the person.
        g.add((self.uri, RDF.type, ns["cwrc"].NaturalPerson))
        g.add((self.uri, ns["foaf"].name, name_literal))
        g.add((self.uri, RDFS.label, name_literal))
        g.add((self.uri, ns["cwrc"].hasGender, self.gender))
        g.add((self.uri, ns["foaf"].isPrimaryTopicOf, self.url))

        # Component lists merged before the death triples; attributes are
        # looked up lazily so they are read in the same order as before.
        for list_name in ("cf_list", "context_list", "location_list",
                          "event_list", "education_list", "occupation_list",
                          "birth_list"):
            g += self.create_triples(getattr(self, list_name))

        if self.deathObj is not None:
            g += self.deathObj.to_triples()

        # Component lists merged after the death triples.
        for list_name in ("cohabitants_list", "family_list",
                          "friendsAssociates_list",
                          "intimateRelationships_list", "childless_list",
                          "children_list", "name_list"):
            g += self.create_triples(getattr(self, list_name))

        if self.wd_id:
            g.add((self.uri, ns["owl"].sameAs, self.wd_id))

        return g
Exemple #2
0
 def to_triple(self):
     """Return a graph naming, labelling and typing this organization.

     Every entry of ``self.altlabels`` is attached as a ``skos:altLabel``.
     """
     graph = utilities.create_graph()
     namespaces = utilities.NS_DICT
     name_literal = Literal(self.name)

     graph.add((self.uri, namespaces["foaf"].name, name_literal))
     graph.add((self.uri, RDFS.label, name_literal))
     graph.add((self.uri, RDF.type, namespaces["org"].Organization))

     for alt_label in self.altlabels:
         graph.add((self.uri, namespaces["skos"].altLabel,
                    Literal(alt_label)))

     return graph
Exemple #3
0
    def create_multiple_triples(self, graph):
        """Handle a graph holding multiple triples, one triple at a time.

        Each triple of ``graph`` is serialized on its own to Turtle in a
        scratch graph, the line holding the statement is handed to
        ``self.create_ttl_body``, and the graphs that call returns are
        merged into the result.

        Args:
            graph: graph whose triples are processed individually.

        Returns:
            A new ``rdflib.Graph`` containing the merged
            ``create_ttl_body`` output.
        """
        temp_g = utilities.create_graph()
        g = rdflib.Graph()
        for x in graph[:]:
            temp_g.add(x)
            serialized = temp_g.serialize(format="ttl")
            # rdflib < 6 returns bytes from serialize(); rdflib >= 6 returns
            # str, on which .decode() would raise AttributeError.
            if isinstance(serialized, bytes):
                serialized = serialized.decode()
            # The statement is taken from the second-to-last line, i.e. the
            # serialization is expected to end with one blank line.
            triple_str = serialized.splitlines()[-2]
            temp_g.remove(x)
            g += self.create_ttl_body(triple_str)

        return g
Exemple #4
0
def main():
    """Extract occupation data for every input file and write the triples.

    For each file returned by ``utilities.parse_args`` a ``Biography`` is
    built, occupation data is extracted into it, a per-person Turtle file
    is written and the person's graph is merged into a combined graph.
    The combined graph is then written as Turtle and as pretty RDF/XML,
    and the unmapped occupation terms recorded in ``fail_dict`` are logged.

    ``Biography``, ``extract_occupation_data``, ``fail_dict`` and
    ``logger`` are expected to be available at module level.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    file_dict = utilities.parse_args(__file__, "Occupation")

    uber_graph = utilities.create_graph()

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(description)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_occupation_data(soup, person)

        graph = person.to_graph()

        temp_path = "extracted_triples/occupation_turtle/" + person_id + "_occupations.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    print("UberGraph is size:", len(uber_graph))
    temp_path = "extracted_triples/occupations.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/occupations.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")

    # Report the occupation terms that could not be mapped, with the number
    # of times each one was missed.
    occupation_key = 'http://sparql.cwrc.ca/ontologies/cwrc#Occupation'
    if occupation_key in fail_dict:
        job_fail_dict = fail_dict[occupation_key]
        logger.info("Missed Terms: " + str(len(job_fail_dict)))
        count = 0
        for term, misses in job_fail_dict.items():
            logger.info(term + " : " + str(misses))
            count += misses
        logger.info("Total Terms: " + str(count))
Exemple #5
0
    def to_triple(self, person):
        """Return a graph attaching this birth's details to ``person``.

        A birth date is added when ``self.date`` is truthy, one birth
        position per entry of ``self.position``, and a birth place when
        ``self.place`` is truthy.
        """
        graph = utilities.create_graph()
        cwrc = utilities.NS_DICT["cwrc"]

        # Collect (predicate, object) pairs first, then attach them all to
        # the person in the same order the triples were added before.
        statements = []
        if self.date:
            statements.append((cwrc.hasBirthDate, format_date(self.date)))
        for position in self.position:
            statements.append((cwrc.hasBirthPosition, position))
        if self.place:
            statements.append((cwrc.hasBirthPlace, self.place))

        for predicate, value in statements:
            graph.add((person.uri, predicate, value))

        return graph
Exemple #6
0
    def to_triple(self, person):
        """Return a graph attaching this death's details to ``person``.

        A death date, a burial place and a death place are each added only
        when the corresponding attribute (``date``, ``burial``, ``place``)
        is truthy.
        """
        graph = utilities.create_graph()
        cwrc = utilities.NS_DICT["cwrc"]

        # Collect (predicate, object) pairs first, then attach them all to
        # the person in the same order the triples were added before.
        statements = []
        if self.date:
            statements.append((cwrc.hasDeathDate, format_date(self.date)))
        if self.burial:
            statements.append((cwrc.hasBurialPlace, self.burial))
        if self.place:
            statements.append((cwrc.hasDeathPlace, self.place))

        for predicate, value in statements:
            graph.add((person.uri, predicate, value))

        return graph
Exemple #7
0
def main():
    """Extract birth and death data for every input file and write triples.

    For each file returned by ``utilities.parse_args`` a ``Biography`` is
    built, birth and death data are extracted into it, its readable name
    is set, a per-person Turtle file is written and the person's graph is
    merged into a combined graph. The combined graph is finally written as
    Turtle and as pretty RDF/XML.

    ``extract_birth_data`` and ``extract_death_data`` are expected to be
    available at module level.
    """
    from bs4 import BeautifulSoup
    import culturalForm
    from biography import Biography

    file_dict = utilities.parse_args(__file__, "Birth/Death")
    print("-" * 200)

    uber_graph = utilities.create_graph()

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(description)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_birth_data(soup, person)
        extract_death_data(soup, person)
        person.name = utilities.get_readable_name(soup)
        print(person.to_file())

        temp_path = "extracted_triples/birthdeath_turtle/" + person_id + "_birthdeath.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += person.to_graph()
        print("=" * 55)

    print("UberGraph is size:", len(uber_graph))
    temp_path = "extracted_triples/birthdeath.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/birthdeath.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")
def main():
    """Extract the "other context" data for every input file and write it.

    For each file returned by ``utilities.parse_args`` a ``Biography`` is
    built, the other-context data is extracted into it, a per-person
    Turtle file is written and the person's graph is merged into a
    combined graph. The combined graph is finally written as Turtle and as
    pretty RDF/XML.

    ``Biography`` and ``extract_other_contexts_data`` are expected to be
    available at module level.
    """
    from bs4 import BeautifulSoup
    import culturalForm

    ext_type = "Violence, Wealth, Leisure and Society, Other Life Event, Health contexts"
    file_dict = utilities.parse_args(__file__, ext_type)

    uber_graph = utilities.create_graph()

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(filename)
        print(description)
        print(person_id)
        print("*" * 55)

        person = Biography(
            person_id, soup,
            culturalForm.get_mapped_term("Gender", utilities.get_sex(soup)))
        extract_other_contexts_data(soup, person)

        graph = person.to_graph()

        temp_path = "extracted_triples/other_contexts_turtle/" + person_id + "_other_contexts.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    print("UberGraph is size:", len(uber_graph))
    temp_path = "extracted_triples/other_contexts.ttl"
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    temp_path = "extracted_triples/other_contexts.rdf"
    utilities.create_extracted_uberfile(temp_path, uber_graph, "pretty-xml")
Exemple #9
0
    def to_triple(self, person):
        """Return a graph describing this name entity and linking it to ``person``.

        The name entity (``self.value``) is typed with every entry of
        ``self.typeLabels``, labelled with ``self.personName``, given any
        extra predicate/value pairs from ``self.otherTriples``, and attached
        to ``person.uri`` through ``cwrc:hasName``. ``self.spareGraph`` is
        merged in when ``self.hasSpareGraph`` is truthy.
        """
        # NOTE(review): the graph is bound to the module-level name ``g``,
        # not a local — kept as-is; confirm whether anything relies on it.
        global g
        g = utilities.create_graph()

        rdf = utilities.NS_DICT["rdf"]
        name_entity = self.value

        for type_label in self.typeLabels:
            g.add((name_entity, rdf.type, type_label))

        # NOTE(review): the label comes from the "rdf" namespace entry;
        # rdfs:label may have been intended — confirm.
        g.add((name_entity, rdf.label, self.personName))

        if self.otherTriples is not None:
            for extra in self.otherTriples:
                g.add((name_entity, extra["predicate"],
                       Literal(extra["value"])))

        g.add((person.uri, utilities.NS_DICT["cwrc"].hasName, name_entity))

        if self.hasSpareGraph:
            g += self.spareGraph
        return g
Exemple #10
0
    def makeBirthGraph(self, givenNameList, surNameList):
        """Return a graph of the name parts that make up this name entity.

        Each given name becomes a ``cwrc:Forename`` part and each surname a
        ``cwrc:Surname`` part. Parts are numbered with one running sort
        order (given names first, then surnames, starting at 1), labelled
        with the raw name, and attached to the name entity
        (``self.value``), which is itself labelled with ``self.personName``.
        """
        graph = utilities.create_graph()
        name_entity = self.value
        rdf = utilities.NS_DICT["rdf"]
        cwrc = utilities.NS_DICT["cwrc"]

        sort_order = 1
        part_groups = ((givenNameList, "Forename"), (surNameList, "Surname"))
        for names, part_kind in part_groups:
            for raw_name in names:
                part_uri = utilities.make_standard_uri(raw_name)
                graph.add((part_uri, rdf.type, getattr(cwrc, part_kind)))
                graph.add((part_uri, cwrc.hasSortOrder, Literal(sort_order)))
                graph.add((part_uri, rdf.label, Literal(raw_name)))
                graph.add((name_entity, cwrc.hasNamePart, part_uri))
                sort_order += 1

        graph.add((name_entity, rdf.label, self.personName))

        return graph
Exemple #11
0
 def to_triple(self, person):
     """Return a graph holding the triple (person.uri, self.uri, self.value)."""
     graph = utilities.create_graph()
     statement = (person.uri, self.uri, self.value)
     graph.add(statement)
     return graph
Exemple #12
0
from Utils import utilities
from Utils.context import Context
from Utils.event import Event
from Utils.organizations import get_org_uri
"""
Status: ~75%
TODO:
 - review unmapped instances
 - revise method of capturing failed mappings to be similar to culturalforms
"""

# temp log library for debugging
# --> to be eventually replaced with proper logging library

# Module-level logger, configured through the project helper under the
# name "occupation".
logger = utilities.config_logger("occupation")
# Graph created once at import time.
# NOTE(review): presumably accumulates triples across entries — verify
# against the functions that use it.
uber_graph = utilities.create_graph()

# Module-level counters, both starting at zero.
# NOTE(review): presumably count the contexts and events created — confirm.
context_count = 0
event_count = 0


class Occupation(object):
    """Occupation built from a job tag.

    When ``predicate`` is supplied it is stored unchanged and ``value`` is
    set from ``get_mapped_term(job_tag)``; otherwise the predicate is
    derived with ``get_occupation_predicate(job_tag)``.

    NOTE(review): only the beginning of the class is visible here; the
    rest of ``__init__`` (including any use of ``other_attributes``) is not
    described.
    """
    def __init__(self, job_tag, predicate=None, other_attributes=None):
        super(Occupation, self).__init__()
        if predicate:
            # Caller supplied the predicate: only the mapped term is resolved.
            self.predicate = predicate
            self.value = self.get_mapped_term(job_tag)
        else:
            # No predicate supplied: derive it from the job tag.
            self.predicate = self.get_occupation_predicate(job_tag)
Exemple #13
0
def main():
    """Extract the bulk of the biography data for every input file.

    For each file returned by ``utilities.parse_args`` a ``Biography`` is
    built and filled by the cultural-form, other-context, location,
    occupation, education and birth extractors. A per-person Turtle file
    is written and the person's graph is merged into a combined graph,
    which is written out at the end. The total triple count and the files
    producing the most and the fewest triples are logged, together with
    the start and completion times.

    ``BeautifulSoup``, ``Biography``, ``logger`` and the extractor modules
    are expected to be available at module level.
    """
    file_dict = utilities.parse_args(__file__,
                                     "Majority of biography related data")

    uber_graph = utilities.create_graph()

    highest_triples = 0
    # 0 doubles as the "no minimum recorded yet" marker.
    least_triples = 0
    smallest_person = None
    largest_person = None
    logger.info("Time started: " + utilities.get_current_time() + "\n")

    for filename, description in file_dict.items():
        with open(filename) as f:
            soup = BeautifulSoup(f, 'lxml-xml')

        # The person id is the first six characters of the file's base name.
        person_id = filename.split("/")[-1][:6]

        print(person_id)
        print(description)
        print("*" * 55)
        person = Biography(
            person_id, soup,
            cf.get_mapped_term("Gender", utilities.get_sex(soup)))
        cf.extract_cf_data(soup, person)
        other_contexts.extract_other_contexts_data(soup, person)
        location.extract_location_data(soup, person)
        occupation.extract_occupation_data(soup, person)
        education.extract_education_data(soup, person)

        # personname.extract_person_name(soup, person)
        birthDeath.extract_birth_data(soup, person)
        # birthDeath.extract_death(soup, person)
        # lifeInfo.extract_cohabitants(soup, person)
        # lifeInfo.extract_family(soup, person)
        # lifeInfo.extract_friends_associates(soup, person)
        # lifeInfo.extract_intimate_relationships(soup, person)
        # lifeInfo.extract_childlessness(soup, person)
        # lifeInfo.extract_children(soup, person)

        graph = person.to_graph()
        triple_count = len(graph)

        if triple_count > highest_triples:
            highest_triples = triple_count
            largest_person = filename
        if least_triples == 0 or triple_count < least_triples:
            least_triples = triple_count
            smallest_person = filename

        # triples to files
        temp_path = "extracted_triples/biography_turtle/" + person_id + "_biography.ttl"
        utilities.create_extracted_file(temp_path, person)

        uber_graph += graph

    temp_path = "extracted_triples/biography_triples.ttl"
    # Qualified with ``utilities.`` like create_extracted_file above; the
    # bare name was not defined or imported in this function.
    utilities.create_extracted_uberfile(temp_path, uber_graph)

    cf.log_mapping_fails()
    logger.info(str(len(uber_graph)) + " total triples created")
    logger.info(
        str(largest_person) + " produces the most triples(" +
        str(highest_triples) + ")")
    logger.info(
        str(smallest_person) + " produces the least triples(" +
        str(least_triples) + ")")

    logger.info("Time completed: " + utilities.get_current_time())
Exemple #14
0
    def to_triple(self, person=None):
        """Build the annotation graph for this context.

        The graph always contains a textual-body snippet and an
        "identifying" annotation targeting it. When ``self.motivation`` is
        ``oa:describing`` a second, "describing" annotation is added that
        depends on the identifying one and carries the component triples.
        Every ``NAME`` tag found in ``self.tag`` is also declared as a
        natural person.

        Args:
            person: optional biography subject; its ``id``, ``name`` and
                ``uri`` are used for source URLs, labels and
                targets/bodies. When omitted, person-independent labels
                are used and no person-specific triples are added.

        Returns:
            The graph holding all triples created for this context.

        Side effects:
            ``self.subjects`` is extended in place, and ``self.uri`` is
            rebound to the describing URI when the motivation is
            describing.
        """
        g = utilities.create_graph()
        oa = utilities.NS_DICT["oa"]
        cwrc = utilities.NS_DICT["cwrc"]
        dcterms = utilities.NS_DICT["dcterms"]

        # Creating Textual body first
        snippet_uri = rdflib.term.URIRef(str(self.uri) + "_Snippet")

        if person:
            source_url = rdflib.term.URIRef(self.src + person.id + "#" +
                                            self.heading)
            snippet_label = person.name + " - " + self.context_label + " snippet"
        else:
            source_url = rdflib.term.URIRef(self.src + "#FE")
            snippet_label = "FE" + " - " + self.context_label + " snippet"

        g.add((snippet_uri, RDF.type, oa.TextualBody))
        g.add((snippet_uri, RDFS.label, rdflib.term.Literal(snippet_label)))
        g.add((snippet_uri, oa.hasSource, source_url))
        g.add((snippet_uri, dcterms.description,
               rdflib.term.Literal(self.text,
                                   datatype=rdflib.namespace.XSD.string)))

        # Creating identifying context first and always
        if person:
            context_label = person.name + " - " + self.context_label + " identifying annotation"
        else:
            context_label = self.context_label + " identifying annotation"

        identifying_uri = utilities.create_uri("data",
                                               self.id + "_identifying")
        g.add((identifying_uri, RDF.type, self.context_type))
        g.add(
            (identifying_uri, RDFS.label, rdflib.term.Literal(context_label)))
        g.add((identifying_uri, oa.hasTarget, snippet_uri))
        g.add((identifying_uri, oa.motivatedBy, oa.identifying))
        # NOTE(review): self.subjects is extended in place, so it keeps
        # growing if to_triple is called more than once on the same object.
        self.subjects += identifying_motivation(self.tag)
        if self.triples and person:
            self.subjects += self.get_subjects(self.triples, person)
        for x in self.subjects:
            g.add((identifying_uri, oa.hasBody, x))

        if person:
            g.add((identifying_uri, oa.hasBody, person.uri))

        if self.event:
            g.add((identifying_uri, cwrc.hasEvent, self.event))

        # Creating describing context if applicable
        if self.motivation == oa.describing:
            self.uri = utilities.create_uri("data", self.id + "_describing")
            # ``person`` is optional everywhere else in this method, so it
            # is guarded here as well instead of failing on ``None.name``.
            if person:
                context_label = person.name + " - " + self.context_label + " describing annotation"
            else:
                context_label = self.context_label + " describing annotation"
            g.add((self.uri, RDF.type, self.context_type))
            g.add((self.uri, RDFS.label, rdflib.term.Literal(context_label)))
            g.add((self.uri, cwrc.hasIDependencyOn, identifying_uri))
            if person:
                g.add((self.uri, oa.hasTarget, person.uri))
            g.add((self.uri, oa.hasTarget, snippet_uri))
            g.add((self.uri, oa.motivatedBy, self.motivation))

            for x in self.subjects:
                g.add((self.uri, dcterms.subject, x))

            for x in self.triples:
                # Build and serialize the component graph once; it was
                # previously rebuilt for every use.
                component_graph = x.to_triple(person)
                serialized = component_graph.serialize(format="ttl")
                # rdflib < 6 returns bytes from serialize(); rdflib >= 6
                # returns str.
                if isinstance(serialized, bytes):
                    serialized = serialized.decode()
                ttl_lines = serialized.splitlines()
                statement_lines = [
                    y for y in ttl_lines if "@prefix" not in y and y != ''
                ]
                if len(statement_lines) == 1:
                    # Single statement: it sits on the second-to-last line
                    # of the serialization.
                    g += self.create_ttl_body(ttl_lines[-2])
                else:
                    g += self.create_multiple_triples(component_graph)

            if self.event:
                g.add((self.uri, cwrc.hasEvent, self.event))
                g.add((self.event, cwrc.hasContext, self.uri))

        # Creating the mentioned people as natural person
        for x in self.tag.find_all("NAME"):
            standard_name = x.get("STANDARD")
            uri = utilities.make_standard_uri(standard_name)
            name_literal = Literal(standard_name,
                                   datatype=rdflib.namespace.XSD.string)
            g.add((uri, RDF.type, cwrc.NaturalPerson))
            g.add((uri, RDFS.label, name_literal))
            g.add((uri, utilities.NS_DICT["foaf"].name, name_literal))

        return g
Exemple #15
0
    def to_triple(self, person=None):
        """Return the graph describing this event.

        The event is labelled, described (date text followed by the event
        text), typed, and linked to its places, actors, time type and,
        when set, time certainty. When ``person`` is given, the person is
        linked to the event and recorded as an actor. The time stamp is
        attached either through ``self.predicate`` or, depending on
        ``self.time_type``, as a begin/end pair taken from the two
        ":"-separated halves of ``self.date``.
        """
        g = utilities.create_graph()
        sem = utilities.NS_DICT["sem"]
        cwrc = utilities.NS_DICT["cwrc"]

        # Link the biographee to the event and record them as an actor, in
        # case they are not among the mentioned actors.
        if person:
            g.add((person.uri, cwrc.hasEvent, self.uri))
            g.add((self.uri, sem.hasActor, person.uri))

        # Label and description.
        g.add((self.uri, RDFS.label, Literal(self.title)))
        description = self.date_tag.text + ": " + self.text
        g.add((self.uri, utilities.NS_DICT["dcterms"].description,
               Literal(description)))

        # Event typing.
        g.add((self.uri, RDF.type, sem.Event))
        for event_type in self.event_type:
            g.add((self.uri, sem.eventType, event_type))

        # Places and mentioned actors.
        for place in self.place:
            g.add((self.uri, sem.hasPlace, place))
        for actor in self.actors:
            g.add((self.uri, sem.hasActor, actor))

        # Time type and optional certainty.
        g.add((self.uri, sem.timeType,
               utilities.create_cwrc_uri(self.time_type)))
        if self.time_certainty:
            g.add((self.uri, cwrc.hasTimeCertainty,
                   utilities.create_cwrc_uri(self.time_certainty)))

        # Time stamp: an explicit predicate wins; otherwise the time type
        # selects the pair of begin/end predicates.
        if self.predicate:
            g.add((self.uri, self.predicate, self.date))
        else:
            if self.time_type == "PunctiveTime":
                stamp_names = ("hasEarliestBeginTimeStamp",
                               "hasLatestEndTimeStamp")
            elif self.time_type == "IntervalTime":
                stamp_names = ("hasBeginTimeStamp", "hasEndTimeStamp")
            else:
                stamp_names = ()
            for half, stamp_name in enumerate(stamp_names):
                g.add((self.uri, getattr(sem, stamp_name),
                       format_date(self.date.split(":")[half])))

        if self.date_tag.name == "DATESTRUCT":
            # Collapse runs of whitespace in the tag's text.
            collapsed = ' '.join(str(self.date_tag.get_text()).split())
            g.add((self.uri, sem.hasTime, Literal(collapsed)))

        return g