Example #1
 def setUp(self):
     ontology_content = '''
             @prefix : <http://dig.isi.edu/ontologies/dig/> .
             @prefix owl: <http://www.w3.org/2002/07/owl#> .
             @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
             @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
             @prefix schema: <http://schema.org/> .
             @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
             :Person a owl:Class ;
                 rdfs:subClassOf :Actor, :Biological_Object ;
                 :common_properties :label, :title, :religion ; .
             :has_name a owl:DatatypeProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes xsd:string ; .
             :has_child a owl:ObjectProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes :Person ; .
         '''
     ontology = Ontology(ontology_content,
                         validation=False,
                         include_undefined_class=True,
                         quiet=True)
     kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
     etk = ETK(kg_schema=kg_schema,
               ontology=ontology,
               generate_json_ld=True)
     etk2 = ETK(kg_schema=kg_schema,
                ontology=ontology,
                generate_json_ld=False)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[DIG.Person.toPython()])
     self.doc2 = etk2.create_document(dict(),
                                      doc_id='http://xxx/2',
                                      type_=[DIG.Person.toPython()])
Example #2
 def setUp(self):
     ontology_content = '''
         @prefix : <http://dig.isi.edu/ontologies/dig/> .
         @prefix dig: <http://dig.isi.edu/ontologies/dig/> .
         @prefix owl: <http://www.w3.org/2002/07/owl#> .
         @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
         @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
         @prefix schema: <http://schema.org/> .
         @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
         :Person a owl:Class ;
             rdfs:subClassOf :Actor, :Biological_Object ;
             :common_properties :label, :title, :religion ; .
         :has_name a owl:DatatypeProperty ;
             schema:domainIncludes :Person ;
             schema:rangeIncludes xsd:string ; .
         :has_child a owl:ObjectProperty ;
             schema:domainIncludes :Person ;
             schema:rangeIncludes :Person ; .
         '''
     kg_schema = KGSchema()
     kg_schema.add_schema(ontology_content, 'ttl')
     etk = ETK(kg_schema=kg_schema)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[URI('dig:Person')])
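Once a document has been created against the schema, values can be attached to its knowledge graph and the graph can be serialized. A minimal sketch of a continuation of the setUp above (the field name and literal are illustrative, and assume has_name from the ontology is exposed as a KG field):

    # hypothetical continuation of the setUp above; field name and value are illustrative
    self.doc.kg.add_value("has_name", value="Jane Doe")   # attach a literal to the KG
    print(self.doc.kg.get_values("has_name"))             # inspect the stored values
    print(self.doc.kg.serialize("ttl"))                   # dump the graph as Turtle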
Example #3
    def setUp(self):
        sample_doc = {
            "projects": [{
                "name": "etk",
                "description":
                "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098",
                "s": "segment_test_1"
            }, {
                "name": "rltk",
                "description":
                "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32,
                "s": "segment_test_2"
            }]
        }
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema)
        self.doc = etk.create_document(sample_doc)
Example #4
 def test_website_patterns_condition(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc, website_patterns=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc, website_patterns=[".*ABc", ".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example #5
 def test_segment(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     descriptions = doc.select_segments("projects[*].description")
     description_value = [i.value for i in descriptions]
     expected = [
         'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
         'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
     ]
     self.assertEqual(description_value, expected)
Example #6
 def test_json_paths_and_json_paths_regex(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc,
         json_paths=["$.website"],
         json_paths_regex=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc, json_paths=["$.website"], json_paths_regex=[".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
Example #7
    def test_segment(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema)
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
Example #8
 def test_all_condition(self) -> None:
     etk = ETK()
     doc = etk.create_document(sample_input)
     default_doc_selector = DefaultDocumentSelector()
     res_true = default_doc_selector.select_document(
         doc,
         datasets=[".*unittest", ".*abc"],
         url_patterns=[".*unittest", ".*zxc"],
         website_patterns=[".*unittest", ".*abc"],
         json_paths=["$.website"],
         json_paths_regex=[".*unittest", ".*abc"])
     res_false = default_doc_selector.select_document(
         doc,
         datasets=[".*abc", ".*hhhh"],
         url_patterns=[".*ZXc", ".*hhhh"],
         website_patterns=[".*ABc", ".*hhhh"],
         json_paths=["$.website"],
         json_paths_regex=[".*hhhh"])
     self.assertEqual(True, res_true)
     self.assertEqual(False, res_false)
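The selector tests above reference a module-level sample_input that is not shown. Judging from the regexes and the $.website JSON path they use, the CDR presumably carries dataset, url and website fields containing "unittest"; a hypothetical shape:

    # hypothetical sample_input consistent with the selector tests above; all values are illustrative
    sample_input = {
        "dataset": "unittest",                   # matched by datasets=[".*unittest", ...]
        "url": "http://www.test.com/unittest",   # matched by url_patterns
        "website": "www.test.com/unittest",      # matched by website_patterns and $.website
        "projects": []                           # payload used by the other tests
    }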
Example #9
    def test_KnowledgeGraph_provenance(self) -> None:
        sample_doc = {
            "projects": [
                {
                    "name": "etk",
                    "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                    "members": [
                        "dongyu",
                        "amandeep",
                        "sylvia",
                        "Runqi12"
                    ],
                    "date": "2007-12-05",
                    "place": "columbus:georgia:united states:-84.98771:32.46098"
                },
                {
                    "name": "rltk",
                    "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                    "members": [
                        "mayank",
                        "yixiang"
                    ],
                    "date": ["2007-12-05T23:19:00"],
                    "cost": -3213.32
                }
            ]
        }

        kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema)
        doc = etk.create_document(sample_doc)

        try:
            doc.kg.add_value("developer", json_path="projects[*].members[*]")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_date", json_path="projects[*].date[*]")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_add_value_date", value=[date(2018, 3, 28), {}, datetime(2018, 3, 28, 1, 1, 1)],
                             json_path_extraction="projects[0].date")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_location", json_path="projects[*].place")
        except KgValueError:
            pass

        # print (json.dumps(doc.value, indent=2))

        expected_provenances = [
            {
                "@id": 0,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "dongyu",
                "json_path": "projects.[0].members.[0]"
            },
            {
                "@id": 1,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "amandeep",
                "json_path": "projects.[0].members.[1]"
            },
            {
                "@id": 2,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "sylvia",
                "json_path": "projects.[0].members.[2]"
            },
            {
                "@id": 3,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "Runqi12",
                "json_path": "projects.[0].members.[3]"
            },
            {
                "@id": 4,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "mayank",
                "json_path": "projects.[1].members.[0]"
            },
            {
                "@id": 5,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "yixiang",
                "json_path": "projects.[1].members.[1]"
            },
            {
                "@id": 6,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "2007-12-05T00:00:00",
                "json_path": "projects.[0].date.[0]"
            },
            {
                "@id": 7,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "2007-12-05T23:19:00",
                "json_path": "projects.[1].date.[0]"
            },
            {
                "@id": 8,
                "@type": "kg_provenance_record",
                "reference_type": "constant",
                "value": "2018-03-28",
                "json_path": "projects[0].date"
            },
            {
                "@id": 9,
                "@type": "kg_provenance_record",
                "reference_type": "constant",
                "value": "2018-03-28T01:01:01",
                "json_path": "projects[0].date"
            },
            {
                "@id": 10,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "columbus:georgia:united states:-84.98771:32.46098",
                "json_path": "projects.[0].place"
            }
        ]

        self.assertEqual(expected_provenances, doc.value["provenances"])
Example #10
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":

    sample_input = {
        "projects": [{
            "name":
            "etk",
            "description":
            "version  2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        }, {
            "name":
            "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }

    etk = ETK(modules=RuleETKModule)
    doc = etk.create_document(sample_input)

    docs = etk.process_ems(doc)

    print(json.dumps(docs[0].value, indent=2))
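Example #10 only shows the tail of the module it runs; a hypothetical reconstruction of RuleETKModule, modeled on the GlossaryExtractor call in Example #18 and the standard ETKModule hooks, might look like this (the glossary contents are illustrative):

# hypothetical RuleETKModule consistent with the snippet above
from etk.etk_module import ETKModule
from etk.extractors.glossary_extractor import GlossaryExtractor


class RuleETKModule(ETKModule):
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        # glossary of names to look for in the project descriptions (illustrative)
        self.rule_extractor = GlossaryExtractor(
            ['runqi', 'dongyu', 'sylvia', 'amandeep', 'pedro', 'mayank', 'yixiang'],
            'rule_extractor',
            etk.default_tokenizer,
            case_sensitive=False,
            ngrams=1)

    def process_document(self, doc):
        descriptions = doc.select_segments("projects[*].description")
        projects = doc.select_segments("projects[*]")
        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()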
Example #11
import os, sys, json
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))

etk = ETK(kg_schema, ["./ems"])

doc = etk.create_document(json.load(open('sample_html.jl', 'r')))

docs = etk.process_ems(doc)

print(json.dumps(docs[0].value, indent=2))
Example #12
            },
            "matched_sentence": {
                "type": "string"
            },
            "date": {
                "type": "string"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./"])

    # read the news
    news_file = open(
        '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl'
    )
    # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl')
    news_stories = [
        etk.create_document(json.loads(line),
                            url=json.loads(line)['tld'],
                            doc_id=json.loads(line)['doc_id'])
        for line in news_file
    ]
    results = list()
    for news_story in news_stories:
        results.extend(etk.process_ems(news_story))
    o = open('ifp_news_similarity.jl', 'w')
    for result in results:
        o.write(json.dumps(result.value))
        o.write('\n')
Example #13
    def __init__(self, query_server=None, update_server=None):
        self.punctuation_table = str.maketrans(
            dict.fromkeys(string.punctuation))
        if query_server and update_server:
            self.query_server = query_server
            self.update_server = update_server
        else:
            self.query_server = DATAMRT_SERVER
            self.update_server = DATAMRT_SERVER

        # initialize
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = etk.create_document(
            {}, doc_id="http://isi.edu/default-ns/projects")

        # bind prefixes
        self.doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
        self.doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
        self.doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
        self.doc.kg.bind('wdtn',
                         'http://www.wikidata.org/prop/direct-normalized/')
        self.doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
        self.doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
        self.doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
        self.doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
        self.doc.kg.bind('p', 'http://www.wikidata.org/prop/')
        self.doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
        self.doc.kg.bind('prv',
                         'http://www.wikidata.org/prop/reference/value/')
        self.doc.kg.bind(
            'prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
        self.doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
        self.doc.kg.bind('psv',
                         'http://www.wikidata.org/prop/statement/value/')
        self.doc.kg.bind(
            'psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
        self.doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
        self.doc.kg.bind('pqv',
                         'http://www.wikidata.org/prop/qualifier/value/')
        self.doc.kg.bind(
            'pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
        self.doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
        self.doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
        self.doc.kg.bind('schema', 'http://schema.org/')

        # give definitions of the nodes we defined
        p = WDProperty('C2001', Datatype.MonolingualText)
        p.add_label('datamart identifier', lang='en')
        p.add_description('identifier of a dataset in the Datamart system',
                          lang='en')
        p.add_statement('P31', Item('Q19847637'))
        p.add_statement('P1629', Item('Q1172284'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2004', Datatype.StringValue)
        p.add_label('keywords', lang='en')
        p.add_description(
            'keywords associated with an item to facilitate finding the item using text search',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2005', Datatype.StringValue)
        p.add_label('variable measured', lang='en')
        p.add_description('the variables measured in a dataset', lang='en')
        p.add_statement('P31', Item('Q18616576'))
        p.add_statement('P1628',
                        URLValue('http://schema.org/variableMeasured'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2006', Datatype.StringValue)
        p.add_label('values', lang='en')
        p.add_description(
            'the values of a variable represented as a text document',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2007', Datatype.Item)
        p.add_label('data type', lang='en')
        p.add_description(
            'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
            'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2008', Datatype.URLValue)
        p.add_label('semantic type', lang='en')
        p.add_description(
            'a URL that identifies the semantic type of a variable in a dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        # get the starting source id
        sparql_query = """
            prefix wdt: <http://www.wikidata.org/prop/direct/>
            prefix wd: <http://www.wikidata.org/entity/>
            prefix wikibase: <http://wikiba.se/ontology#>
            PREFIX p: <http://www.wikidata.org/prop/>
            PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>
            PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
            PREFIX ps: <http://www.wikidata.org/prop/statement/>
            prefix bd: <http://www.bigdata.com/rdf#>
            prefix bds: <http://www.bigdata.com/rdf/search#>

            select ?x where {
              wd:Z00000 wdt:P1114 ?x .
            }
            """
        try:
            sparql = SPARQLWrapper(self.query_server)
            sparql.setQuery(sparql_query)
            sparql.setReturnFormat(JSON)
            sparql.setMethod(POST)
            sparql.setRequestMethod(URLENCODED)
            results = sparql.query().convert()['results']['bindings']
        except:
            print("Getting query of wiki data failed!")
            raise ValueError("Unable to initialize the datamart query service")
        if not results:
            print(
                "[WARNING] No starting source id found! Will initialize the starting source with D1000001"
            )
            self.resource_id = 1000001
        else:
            self.resource_id = 1000001
Example #14
def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])  # compared against len(heap) below, so it must be an int

    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Wrong mode! Please check the input argument!')
        return

    master_config = {
        "fields": {
            "developer": {
                "type": "string"
            },
            "student_developer": {
                "type": "string"
            },
            "spacy_name": {
                "type": "string"
            },
            "date": {
                "type": "date"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')

    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()

    with open(query_title) as f:
        for line in f:
            orig_ifp_title = line
            # remove date information from query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete date from query term
            if len(res) != 0:
                line = line[:start] + line[end+1:]

            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()
            # extract entities from query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for given query term
    query_docs_mapping = dict()

    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()

    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 \
                    or ds.select_document(document=doc,
                              json_paths=['$.lexisnexis.doc_description'],
                              json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233", ifp_title=proc_query, orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()

            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document

            if len(heap) < top_k:
                heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))
            elif processed_doc['similarity'] > heap[0][0]:
                heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))

        heap.sort(reverse=True)

        # name the output after the current query instead of the last title read from the query file
        output_filename = './resources/output/' + orig_query.strip() + "_result.jl"

        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())
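main() takes its parameters from the command line; a hypothetical entry point and invocation (the script and file names are placeholders):

if __name__ == '__main__':
    # e.g.: python rank_news.py news_docs.jl ifp_titles.txt SENTENCE 20
    main()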
Example #15
    def model_statement(self):
        # initialize KGSchema
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

        # extract files
        self.extract_files()

        # model statement
        inputs = self.data['inputs']
        for k, v in inputs.items():
            if k != 'metadata':
                # construct wikifier instance
                if k == 'wikifier' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A wikifier file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1001',
                        namespace=self.ns))  # an instance of Wikifier
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP3003',
                                    StringValue(v['content']),
                                    namespace=self.ns)  # hasFileContent
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)  # hashValue

                # construct mapping_file instance
                elif k == 'mappingFile' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A mapping file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1002',
                        namespace=self.ns))  # an instance of MappingFile
                    q.add_statement('P170', StringValue('T2WML'))
                    q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                    q.add_statement('SDP3003',
                                    StringValue(json.dumps(v['content'])),
                                    namespace=self.ns)
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                # construct dataset instance
                elif k == 'dataset' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label(v['content']['title'], lang='en')
                    q.add_description(v['content']['description'], lang='en')
                    q.add_statement('P31',
                                    Item('Q1172284'))  # an instance of Dataset
                    q.add_statement('SDP3001',
                                    Item(inputs['wikifier']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a wikifier file
                    q.add_statement('SDP3002',
                                    Item(inputs['mappingFile']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a mapping file
                    q.add_statement('P1476', StringValue(
                        v['content']['title']))  # title
                    q.add_statement(
                        'P921',
                        StringValue(v['content']['description']))  # described
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP2004',
                                    StringValue(', '.join(
                                        v['content']['keywords'])),
                                    namespace=self.ns)  # keywords
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                    if self.data['storeColumnValue']:
                        for data in v['content']['variable_measured']:
                            statement = q.add_statement(
                                'SDP2005',
                                StringValue(data['column_name']),
                                namespace=self.ns)  # variable measured
                            statement.add_qualifier(
                                'SDP2006',
                                StringValue(data['values_of_a_column']),
                                namespace=self.ns)  # the values of a column
                            statement.add_qualifier(
                                'SDP2007',
                                Item(data['data_structure_type']),
                                namespace=self.ns)  # data structure type
                            statement.add_qualifier(
                                'SDP2008',
                                URLValue(data['semantic_type_identifier']),
                                namespace=self.ns)  # semantic type
                            statement.add_qualifier(
                                'P1545',
                                QuantityValue(
                                    data['column_index'],
                                    namespace=self.ns))  # column index

                doc.kg.add_subject(q)

        return doc
Example #16
def model_data() -> None:
    """
	This function generates triples for user defined properties for uploading them to wikidata
	:return:
	"""
    stream = open(Path.cwd().parent /
                  "Datasets/new-property-configuration.yaml",
                  'r',
                  encoding='utf8')
    yaml_data = yaml.safe_load(stream)
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')
    sparql_endpoint = "https://query.wikidata.org/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}
    for k, v in yaml_data.items():
        p = WDProperty(k,
                       type_map[v['type']],
                       creator='http://www.isi.edu/t2wml')
        for lang, value in v['label'].items():
            for val in value:
                p.add_label(val, lang=lang)
        for lang, value in v['description'].items():
            for val in value:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            for item in items:
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type
                if property_type == "WikibaseItem":
                    value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    value = QuantityValue(item['value'])
                elif property_type == "Time":
                    value = TimeValue(
                        str(item['value']), Item(item["calendar"]),
                        translate_precision_to_integer(item["precision"]),
                        item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"],
                                            item["longitude"],
                                            item["precision"])

                p.add_statement(pnode, value)

        doc.kg.add_subject(p)

    with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
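The loop in model_data() expects each entry of new-property-configuration.yaml to supply a type, per-language label and description lists, and a statements map. Expressed as the Python structure yaml.safe_load would return, a single (made-up) entry could look like this:

# hypothetical parsed YAML entry matching the loop above; the property id and values are invented
yaml_data = {
    'P9999': {
        'type': 'quantity',                                   # looked up in type_map
        'label': {'en': ['example property']},                # lang -> list of labels
        'description': {'en': ['an illustrative user-defined property']},
        'statements': {
            'P31': [{'value': 'Q18616576'}]                   # pnode -> list of value dicts
        }
    }
}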
Example #17
        douglas.add_statement('P2048', QuantityValue(1.96,
                                                     unit=Item('Q11573')))

        # official website
        # statement = douglas.add_statement('P856', URLValue('http://douglasadams.com/'))
        statement = douglas.add_truthy_statement(
            'P856', URLValue('http://douglasadams.com/'))

        statement.add_qualifier('P407', Item('Q1860'))

        # Freebase ID
        douglas.add_statement(
            'P646',
            ExternalIdentifier('/m/0282x', URLValue('http://g.co/kg/m/0282x')))

        doc.kg.add_subject(douglas)
        return list()


if __name__ == "__main__":
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    revise(True)

    docs = etk.process_ems(doc)

    print(docs[0].kg.serialize('ttl'))
Example #18
class TestProvenance(unittest.TestCase):
    def test_Provenance(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        self.etk = ETK(kg_schema=kg_schema)
        g = [
            'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep',
            'yixiang'
        ]
        self.name_extractor = GlossaryExtractor(g,
                                                "name_extractor",
                                                self.etk.default_tokenizer,
                                                case_sensitive=False,
                                                ngrams=1)
        doc = self.etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.name_extractor, d)
            p.store(names, "members")

        expected_provenances = [{
            "@id": 0,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 33,
                "end_char": 38
            }
        }, {
            "@id": 1,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 40,
                "end_char": 46
            }
        }, {
            "@id": 2,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 48,
                "end_char": 54
            }
        }, {
            "@id": 3,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 56,
                "end_char": 64
            }
        }, {
            "@id": 4,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[0].members",
            "parent_provenances": {
                "Runqi": 0,
                "Dongyu": 1,
                "Sylvia": 2,
                "Amandeep": 3
            }
        }, {
            "@id": 5,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 39,
                "end_char": 44
            }
        }, {
            "@id": 6,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 46,
                "end_char": 52
            }
        }, {
            "@id": 7,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 54,
                "end_char": 61
            }
        }, {
            "@id": 8,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[1].members",
            "parent_provenances": {
                "Pedro": 5,
                "Mayank": 6,
                "Yixiang": 7
            }
        }]
        expected_projects = [{
            "name":
            "etk",
            "description":
            "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
            "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]
        }, {
            "name": "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
            "members": ["Pedro", "Mayank", "Yixiang"]
        }]
        #print ("hiiiiiiiiiiiiiiiii")
        #print ("projects: " + str(doc.value["projects"]))
        #print ("provenances: " + str(doc.value["provenances"]))
        self.assertEqual(expected_projects, doc.value["projects"])
        self.assertEqual(expected_provenances, doc.value["provenances"])
Example #19
class ETKWorker(object):
    def __init__(self, master_config, em_paths, logger, worker_id,
                 project_name, kafka_input_args=None, kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input
        self.kafka_input_server = config['input_server']
        self.kafka_input_session_timeout = config['input_session_timeout']
        self.kafka_input_group_id = config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(project_name=project_name)
        self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args
        )
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(project_name=project_name)
        self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args
        )

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0

    def process(self):
        # prev_doc_sent_time = None

        while not self.exit_sign:
            # high level api handles batching
            # will exit once timeout
            try:
                for msg in self.kafka_consumer:
                    # force to commit, block till getting response
                    self.kafka_consumer.commit()
                    # get message, clear timeout count
                    self.current_timeout_count = 0

                    cdr = msg.value
                    # TODO better way to add execution profile
                    # cdr['@execution_profile'] = {'@worker_id': self.worker_id}
                    # doc_arrived_time = time.time()
                    # cdr['@execution_profile']['@doc_arrived_time'] = \
                    #     datetime.utcfromtimestamp(doc_arrived_time).isoformat()
                    # cdr['@execution_profile']['@doc_wait_time'] = \
                    #     0.0 if not prev_doc_sent_time \
                    #         else float(doc_arrived_time - prev_doc_sent_time)
                    # cdr['@execution_profile']['@doc_length'] = len(json.dumps(cdr))

                    if 'doc_id' not in cdr or len(cdr['doc_id']) == 0:
                        self.logger.error('invalid cdr: unknown doc_id')
                        continue

                    self.logger.info('processing %s' % cdr['doc_id'])
                    try:
                        # start_run_core_time = time.time()
                        # run etk module

                        doc = self.etk_ins.create_document(cdr, url=cdr['url'], doc_id=cdr['doc_id'])
                        # process_ems returns a list of Documents
                        results = self.etk_ins.process_ems(doc)
                        for result in results:
                            cdr_result = result.cdr_document

                            # indexing
                            # TODO
                            indexed_cdr = index_knowledge_graph_fields(cdr_result)
                            if not indexed_cdr:
                                self.logger.error('indexing in sandpaper failed')
                                continue
                            # cdr = indexed_cdr

                        # cdr['@execution_profile']['@run_core_time'] = \
                        #     float(time.time() - start_run_core_time)
                        # doc_sent_time = time.time()
                        # cdr['@execution_profile']['@doc_sent_time'] = \
                        #     datetime.utcfromtimestamp(doc_sent_time).isoformat()
                        # prev_doc_sent_time = doc_sent_time
                        # cdr['@execution_profile']['@doc_processed_time'] = \
                        #     float(doc_sent_time - doc_arrived_time)

                            # output result
                            r = self.kafka_producer.send(self.kafka_output_topic, indexed_cdr)
                            r.get(timeout=60)  # wait till sent

                            self.logger.info('{} done'.format(indexed_cdr['doc_id']))

                    except Exception as e:
                        self.logger.exception('failed at %s' % cdr['doc_id'])

            except ValueError as e:
                # I/O operation on closed epoll fd
                self.logger.info('consumer closed')
                self.exit_sign = True

            except StopIteration as e:
                # timeout
                self.current_timeout_count += 1
                if self.current_timeout_count >= self.timeout_count:
                    self.exit_sign = True

            except CommitFailedError as e:
                self.exit_sign = True

                # https://github.com/dpkp/kafka-python/blob/535d8f6a85969c4e07de0bc81e14513c677995be/kafka/errors.py#L65
                # if this worker is dead, restart and reattach to the group
                g_restart_worker = True

    def __del__(self):

        self.logger.info('ETK worker {} is exiting...'.format(self.worker_id))

        try:
            self.kafka_consumer.close()
        except:
            pass
        try:
            self.kafka_producer.close()
        except:
            pass
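A hypothetical driver for ETKWorker, matching the constructor signature above; the file names, paths and project name are illustrative, and the module-level config dict used for the Kafka settings is assumed to be defined elsewhere:

# hypothetical driver; names and paths are placeholders
import json
import logging

master_config = json.load(open('master_config.json'))
worker = ETKWorker(master_config,
                   em_paths=['./ems'],                      # directories containing ETK modules
                   logger=logging.getLogger('etk_worker'),
                   worker_id=0,
                   project_name='my_project')               # topics: my_project_in / my_project_out
worker.process()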
Example #20
    def _init_etk():
        # initialize for etk
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id="http://isi.edu/default-ns/projects")

        # bind prefixes
        doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
        doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
        doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
        doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
        doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
        doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
        doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
        doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
        doc.kg.bind('p', 'http://www.wikidata.org/prop/')
        doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
        doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
        doc.kg.bind(
            'prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
        doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
        doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
        doc.kg.bind(
            'psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
        doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
        doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
        doc.kg.bind(
            'pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
        doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
        doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
        doc.kg.bind('schema', 'http://schema.org/')

        # give definitions of the nodes we defined
        p = WDProperty('C2001', Datatype.MonolingualText)
        p.add_label('datamart identifier', lang='en')
        p.add_description('identifier of a dataset in the Datamart system',
                          lang='en')
        p.add_statement('P31', Item('Q19847637'))
        p.add_statement('P1629', Item('Q1172284'))
        doc.kg.add_subject(p)

        p = WDProperty('C2004', Datatype.StringValue)
        p.add_label('keywords', lang='en')
        p.add_description(
            'keywords associated with an item to facilitate finding the item using text search',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2005', Datatype.StringValue)
        p.add_label('variable measured', lang='en')
        p.add_description('the variables measured in a dataset', lang='en')
        p.add_statement('P31', Item('Q18616576'))
        p.add_statement('P1628',
                        URLValue('http://schema.org/variableMeasured'))
        doc.kg.add_subject(p)

        p = WDProperty('C2006', Datatype.StringValue)
        p.add_label('values', lang='en')
        p.add_description(
            'the values of a variable represented as a text document',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2007', Datatype.Item)
        p.add_label('data type', lang='en')
        p.add_description(
            'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
            'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2008', Datatype.URLValue)
        p.add_label('semantic type', lang='en')
        p.add_description(
            'a URL that identifies the semantic type of a variable in a dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2010', Datatype.StringValue)
        p.add_label('extra information', lang='en')
        p.add_description(
            'some extra information that may be needed for this dataset',
            lang='en')
        doc.kg.add_subject(p)

        p = WDProperty('C2011', Datatype.TimeValue)
        p.add_label('start date', lang='en')
        p.add_description(
            'The earliest time that exists in this dataset, only valid when there exists time format data in this dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2012', Datatype.TimeValue)
        p.add_label('end date', lang='en')
        p.add_description(
            'The latest time that exists in this dataset, only valid when there exists time format data in this dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2013', Datatype.QuantityValue)
        p.add_label('time granularity', lang='en')
        p.add_description('time granularity in a dataset', lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2014', Datatype.StringValue)
        p.add_label('uploader information', lang='en')
        p.add_description('information about who uploaded and when uploaded',
                          lang='en')
        doc.kg.add_subject(p)
        return doc
Example #21
            doc.kg.add_value("developer", member.value)
        return list()


if __name__ == "__main__":

    sample_input = {
        "projects": [{
            "name":
            "etk",
            "description":
            "version 2 of etk, implemented by Runqi Shao, Dongyu Li, Sylvia lin, Amandeep and "
            "others."
        }, {
            "name":
            "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }

    kg_schema = KGSchema(json.load(open("master_config.json", "r")))
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document(sample_input,
                              doc_id="http://isi.edu/default-ns/projects")

    docs = etk.process_ems(doc)

    print(json.dumps(docs[0].kg.value, indent=2))
    print(docs[0].kg.get_values('developer'))
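The master_config.json loaded above is not shown; based on the inline master_config dicts in Examples #12 and #14, a minimal configuration supporting the developer field could parse to something like:

# hypothetical contents of master_config.json, shown as the parsed dict
master_config = {
    "fields": {
        "developer": {"type": "string"}
    }
}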
Example #22
class TripleGenerator(Generator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        prop_declaration = kwargs.pop("prop_declaration")
        dest_fp = kwargs.pop("dest_fp")
        truthy = kwargs.pop("truthy")
        use_id = kwargs.pop("use_id")
        prefix_path = kwargs.pop("prefix_path")
        self.datatype_mapping = {
            # nomenclature from https://w.wiki/Tfn
            "item": Item,
            "WikibaseItem": Item,
            "time": TimeValue,
            "Time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "GlobeCoordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "Quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "Monolingualtext": MonolingualText,
            "string": StringValue,
            "String": StringValue,
            "external-identifier": ExternalIdentifier,
            "ExternalId": ExternalIdentifier,
            "url": StringValue,  #TODO bug potentially in rdflib
            "Url": StringValue,
            "property": WDProperty,
            "WikibaseProperty": WDProperty
        }
        self.set_prefix(prefix_path)
        self.prop_declaration = prop_declaration
        self.set_properties(self.prop_file)
        self.fp = dest_fp
        self.truthy = truthy
        self.reset_etk_doc()
        self.serialize_prefix()
        self.use_id = use_id

    def set_prefix(self, prefix_path: str):
        self.prefix_dict = {}
        if prefix_path != "NONE":
            with open(prefix_path, "r") as fp:
                for line_num, edge in enumerate(fp):
                    edge_list = edge.strip("\r\n").split("\t")
                    if line_num == 0:
                        node1_index, node2_index = edge_list.index(
                            "node1"), edge_list.index("node2")
                    else:
                        prefix, expand = edge_list[node1_index], edge_list[
                            node2_index]
                        self.prefix_dict[prefix] = expand

    def read_prop_declaration(self, line_number: int, edge: str):
        node1, node2, prop, e_id = self.parse_edges(edge)
        if prop == "data_type":
            self.prop_types[node1] = self.datatype_mapping[node2.strip()]
        return

    def set_properties(self, prop_file: str):
        self.prop_types = {}
        if prop_file == "NONE":
            return

        with open(prop_file, "r") as fp:
            props = fp.readlines()
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                self.prop_types[node1] = self.datatype_mapping[node2.strip()]
            except:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))

    def _node_2_entity(self, node: str):
        '''
        A node can be Qxxx or Pxxx, return the proper entity.
        '''
        if node in self.prop_types:
            entity = WDProperty(node, self.prop_types[node])
        else:
            entity = WDItem(TripleGenerator.replace_illegal_string(node))
        return entity

    def reset_etk_doc(self,
                      doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        reset the doc object and return it. Called at initialization and after outputting triples.
        """
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.etk.create_document({}, doc_id=doc_id)
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                self.doc.kg.bind(k, self.prefix_dict[k])
            else:
                self.doc.kg.bind(k, v)

    def serialize(self):
        """
        Serialize the triples. Uses a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.fp.flush()
        self.reset()

    def serialize_prefix(self):
        """
        This function should be called only once, after the doc object is initialized.
        To serialize the prefixes at the very beginning, they have to be written out explicitly because of the change from rdflib 4.2.2 to 5.0.0.
        Relevant issue: https://github.com/RDFLib/rdflib/issues/965
        """
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                line = "@prefix " + k + ": <" + self.prefix_dict[k] + "> .\n"
            else:
                line = "@prefix " + k + ": <" + v + "> .\n"
            self.fp.write(line)
        self.fp.write("\n")
        self.fp.flush()
        self.reset()

    def reset(self):
        self.to_append_statement_id = None
        self.to_append_statement = None
        self.read_num_of_lines = 0
        self.reset_etk_doc()

    def generate_label_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_label(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_description_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_description(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_alias_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_alias(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
        # update the known prop_types
        if node1 in self.prop_types:
            if not self.prop_declaration:
                raise KGTKException(
                    "Duplicated property definition of {} found!".format(
                        node1))
        else:
            # store the mapped datatype class so that later edges using this
            # property dispatch correctly in generate_normal_triple
            self.prop_types[node1] = self.datatype_mapping[node2]

        prop = WDProperty(node1, self.datatype_mapping[node2])
        self.doc.kg.add_subject(prop)
        return True

    def generate_normal_triple(self, node1: str, property: str, node2: str,
                               is_qualifier_edge: bool, e_id: str) -> bool:
        if self.use_id:
            e_id = TripleGenerator.replace_illegal_string(e_id)
        entity = self._node_2_entity(node1)
        edge_type = self.prop_types[property]
        if edge_type == Item:
            object = WDItem(TripleGenerator.replace_illegal_string(node2))
        elif edge_type == WDProperty:
            object = WDProperty(TripleGenerator.replace_illegal_string(node2),
                                self.prop_types[node2])

        elif edge_type == TimeValue:
            if self.yyyy_mm_dd_pattern.match(node2):
                try:
                    dateTimeString = node2
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except Exception:
                    return False
            elif self.yyyy_pattern.match(node2):
                try:
                    dateTimeString = node2 + "-01-01"
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except Exception:
                    return False
            else:
                try:
                    # TODO, in future, the two cases above will be dropped in principle to comply with the iso format
                    # now it is iso format
                    assert (node2[0] == "^")
                    node2 = node2[1:]  # remove ^
                    if node2.startswith("+"):
                        node2 = node2[1:]
                    dateTimeString, precision = node2.split("/")
                    dateTimeString = dateTimeString[:-1]  # remove Z
                    object = TimeValue(
                        value=dateTimeString,
                        calendar=Item("Q1985727"),
                        precision=precision,
                        time_zone=0,
                    )
                except Exception:
                    return False

        elif edge_type == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            latitude = float(latitude)
            longitude = float(longitude)
            object = GlobeCoordinate(latitude,
                                     longitude,
                                     0.0001,
                                     globe=Item("Q2"))  # earth

        elif edge_type == QuantityValue:
            # +70[+60,+80]Q743895
            res = self.quantity_pattern.match(node2).groups()
            amount, lower_bound, upper_bound, unit = res

            amount = TripleGenerator.clean_number_string(amount)
            num_type = self.xsd_number_type(amount)

            lower_bound = TripleGenerator.clean_number_string(lower_bound)
            upper_bound = TripleGenerator.clean_number_string(upper_bound)
            if unit is not None:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           type=num_type)
            else:
                if upper_bound is not None and lower_bound is not None:
                    object = QuantityValue(amount,
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount, type=num_type)

        elif edge_type == MonolingualText:
            text_string, lang = TripleGenerator.process_text_string(node2)
            object = MonolingualText(text_string, lang)
        elif edge_type == ExternalIdentifier:
            object = ExternalIdentifier(node2)
        elif edge_type == URLValue:
            if TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
                object = URLValue(node2)
            else:
                return False
        else:
            # treat everything else as stringValue
            object = StringValue(node2)

        if isinstance(object, (WDItem, WDProperty)):
            self.doc.kg.add_subject(object)

        if is_qualifier_edge:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the updated STATEMENT
            self.to_append_statement.add_qualifier(property, object)
            self.doc.kg.add_subject(self.to_append_statement)
        else:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            if self.truthy:
                self.to_append_statement = entity.add_truthy_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_truthy_statement(
                    property, object)
            else:
                self.to_append_statement = entity.add_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_statement(property, object)
            self.doc.kg.add_subject(entity)
        return True

    def entry_point(self, line_number: int, edge: str):
        """
        Determine whether the edge is a label, description, alias, property
        declaration, statement or qualifier edge, and dispatch to the
        corresponding generate_* function.
        """
        if line_number == 1:
            # initialize the order_map
            self.initialize_order_map(edge)
            return

        # use the order_map to map the node
        node1, node2, prop, e_id = self.parse_edges(edge)
        if line_number == 2:
            # by default a statement edge
            is_qualifier_edge = False
        else:
            if node1 != self.to_append_statement_id and node1 != self.corrupted_statement_id:
                is_qualifier_edge = False
                # also a new statement edge
                if self.read_num_of_lines >= self.n:
                    self.serialize()
            else:
                # qualifier edge or property declaration edge
                is_qualifier_edge = True
                if node1 == self.corrupted_statement_id:
                    self.warn_log.write(
                        "QUALIFIER edge at line [{}] associated of corrupted statement edge of id [{}] dropped.\n"
                        .format(line_number, self.corrupted_statement_id))
                    return
        if prop in self.label_set:
            success = self.generate_label_triple(node1, node2)
        elif prop in self.description_set:
            success = self.generate_description_triple(node1, node2)
        elif prop in self.alias_set:
            success = self.generate_alias_triple(node1, node2)
        elif prop == "data_type":
            # special edge of prop declaration
            success = self.generate_prop_declaration_triple(node1, node2)
        else:
            if prop in self.prop_types:
                success = self.generate_normal_triple(node1, prop, node2,
                                                      is_qualifier_edge, e_id)
            else:
                raise KGTKException(
                    "property [{}]'s type is unknown at line [{}].\n".format(
                        prop, line_number))
        if (not success) and self.warning:
            if not is_qualifier_edge:
                self.warn_log.write(
                    "CORRUPTED_STATEMENT edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
                self.corrupted_statement_id = e_id
            else:
                self.warn_log.write(
                    "CORRUPTED_QUALIFIER edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))

        else:
            self.read_num_of_lines += 1
            if not is_qualifier_edge:
                self.to_append_statement_id = e_id

    @staticmethod
    def xsd_number_type(num):
        if isinstance(num, float) and 'e' in str(num).lower():
            return LiteralType.double
        return LiteralType.decimal
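
A minimal driver sketch for the TripleGenerator above: entry_point consumes one raw KGTK edge line at a time, with line 1 treated as the header. The constructor keywords and file names below are assumptions inferred from the attributes the class reads (the real __init__, which also takes the label/alias/description sets, batch size and warning log, is not part of this example), so treat the sketch as illustrative only.

# Hedged usage sketch; constructor keywords and file names are assumptions.
with open("edges.tsv") as edge_file, open("output.ttl", "w") as out:
    generator = TripleGenerator(prop_file="props.tsv",
                                prefix_path="NONE",
                                dest_fp=out,
                                truthy=True,
                                use_id=True,
                                prop_declaration=False)
    for line_number, edge in enumerate(edge_file, start=1):
        # line 1 initializes the column order map; later lines become triples
        generator.entry_point(line_number, edge)
    # flush whatever statements remain in the current ETK document
    generator.serialize()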
Example #23
0
def generate_triples(user_id: str,
                     resolved_excel: list,
                     sparql_endpoint: str,
                     filetype: str = 'ttl',
                     created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :param created_by:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    property_type_map = property_type_dict

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn',
                'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn',
                'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn',
                'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        _item = i["statement"]["item"]
        if _item is not None:
            item = WDItem(_item,
                          creator='http://www.isi.edu/{}'.format(created_by))
            try:
                property_type = property_type_map[i["statement"]["property"]]
            except KeyError:
                property_type = get_property_type(i["statement"]["property"],
                                                  sparql_endpoint)
                if property_type != "Property Not Found" and i["statement"][
                        "property"] not in property_type_map:
                    property_type_map[i["statement"]
                                      ["property"]] = property_type
            if property_type == "WikibaseItem":
                value = Item(str(i["statement"]["value"]))
            elif property_type == "WikibaseProperty":
                value = Property(i["statement"]["value"])
            elif property_type == "String":
                value = StringValue(i["statement"]["value"])
            elif property_type == "Quantity":
                _value = i["statement"]["value"]
                _value = str(_value).replace(',', '')
                value = QuantityValue(_value)
            elif property_type == "Time":
                value = TimeValue(
                    str(i["statement"]["value"]),
                    Item(i["statement"]["calendar"]),
                    translate_precision_to_integer(
                        i["statement"]["precision"]),
                    i["statement"]["time_zone"])
            elif property_type == "Url":
                value = URLValue(i["statement"]["value"])
            elif property_type == "Monolingualtext":
                value = MonolingualText(i["statement"]["value"],
                                        i["statement"]["lang"])
            elif property_type == "ExternalId":
                value = ExternalIdentifier(i["statement"]["value"])
            elif property_type == "GlobeCoordinate":
                value = GlobeCoordinate(i["statement"]["latitude"],
                                        i["statement"]["longitude"],
                                        i["statement"]["precision"])
            elif property_type == "Property Not Found":
                is_error = True
                break
            s = item.add_statement(i["statement"]["property"], value)
            doc.kg.add_subject(item)

            if "qualifier" in i["statement"]:
                for j in i["statement"]["qualifier"]:
                    try:
                        property_type = property_type_map[j["property"]]

                    except KeyError:
                        property_type = get_property_type(
                            j["property"], sparql_endpoint)
                        if property_type != "Property Not Found" and j[
                                "property"] not in property_type_map:
                            property_type_map[j["property"]] = property_type
                    if property_type == "WikibaseItem":
                        value = Item(str(j["value"]))
                    elif property_type == "WikibaseProperty":
                        value = Property(j["value"])
                    elif property_type == "String":
                        value = StringValue(j["value"])
                    elif property_type == "Quantity":
                        value = QuantityValue(j["value"])
                    elif property_type == "Time":
                        value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                          j["precision"], j["time_zone"])
                    elif property_type == "Url":
                        value = URLValue(j["value"])
                    elif property_type == "Monolingualtext":
                        value = MonolingualText(j["value"], j["lang"])
                    elif property_type == "ExternalId":
                        value = ExternalIdentifier(j["value"])
                    elif property_type == "GlobeCoordinate":
                        value = GlobeCoordinate(j["latitude"], j["longitude"],
                                                j["precision"])
                    elif property_type == "Property Not Found":
                        is_error = True
                        value = None  # avoid reusing a stale value from a previous qualifier
                    if value is None:
                        continue
                    else:
                        s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)
    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        # data = "Property Not Found"
        raise Exception('data exception while generating triples')

    return data
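
An illustrative call to the generate_triples function above. The statement layout mirrors the keys the function reads; the item and property IDs, the endpoint URL and the user id are placeholders rather than values taken from the original code, and which datatype branch each property hits depends on property_type_dict or the SPARQL lookup.

# Hedged usage sketch; IDs, endpoint and user id are placeholders.
resolved_excel = [{
    "statement": {
        "item": "Q30",
        "property": "P1082",     # assumed to resolve to a Quantity property
        "value": "328,239,523",  # commas are stripped before QuantityValue
        "qualifier": [
            {"property": "P585",  # assumed to resolve to a Time property
             "value": "2019-01-01T00:00:00Z",
             "calendar": "Q1985727",
             "precision": 9,
             "time_zone": 0}
        ]
    }
}]

ttl_data = generate_triples("user-123",
                            resolved_excel,
                            sparql_endpoint="https://query.wikidata.org/sparql",
                            filetype="ttl")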
Example #24
0
if __name__ == "__main__":

    with open('date_ground_truth.txt', 'r') as f:
        texts = f.readlines()

    etk = ETK(modules=DateETKModule)
    res = []
    for text in texts:
        text = text.strip()
        if text and text[0] != '#':
            temp = text.split('|')
            if len(temp) == 3:
                input_text, expected, format = temp
                doc = etk.create_document({
                    'input': input_text,
                    'expected': expected,
                    'format': format
                })
                docs = etk.process_ems(doc)
                res.append(docs[0].value)

    for r in res:
        extracted = r['extracted_date'][0] if 'extracted_date' in r and r[
            'extracted_date'] else '            '
        expected = r['expected'].replace(
            '@today',
            datetime.datetime.now().isoformat()[:10])
        print(
            'extracted: ',
            extracted,
            '\texpected:',
Example #25
0
    def model_schema(self):
        # read data
        data = self.read_data(self.data['schema'])

        # initialize KGSchema
        custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
        for each in data['prefix']:
            for k, v in each.items():
                custom_dict[k] = v
                if k != 'wd':
                    ns_dict[k] = v + '/entity'
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict)

        type_map = {
            'quantity': Datatype.QuantityValue,
            'url': URLValue,
            'item': Datatype.Item,
            'time': Datatype.TimeValue,
            'string': Datatype.StringValue,
            'text': Datatype.MonolingualText
        }

        # model schema
        for k, v in data.items():
            if ':' in k:
                k = k.split(':')
                if 'Q' in k[1]:
                    p = WDItem(k[1], namespace=k[0], creator=':datamart')
                elif 'P' in k[1]:
                    p = WDProperty(k[1],
                                   type_map[v['type']],
                                   namespace=k[0],
                                   creator=':datamart')
                else:
                    raise Exception('There is no P/Q information.')

                for lang, value in v['description'].items():
                    for val in value:
                        p.add_description(val, lang=lang)

                for lang, value in v['label'].items():
                    for val in value:
                        p.add_label(val, lang=lang)

                for node, value in v['statements'].items():
                    ns = node.split(':')[0] if ':' in node else 'wd'
                    for val in value:
                        prop_type = self.get_property_type(node, ns_dict[ns])
                        if prop_type == 'WikibaseItem':
                            v = Item(str(val['value']))
                        elif prop_type == 'WikibaseProperty':
                            v = Property(val['value'])
                        elif prop_type == 'String':
                            v = StringValue(val['value'])
                        elif prop_type == 'Quantity':
                            v = QuantityValue(val['value'])
                        elif prop_type == 'Url':
                            v = URLValue(val['value'])
                        elif prop_type == 'Monolingualtext':
                            v = MonolingualText(val['value'], val['lang'])
                        p.add_statement(node, v)
                doc.kg.add_subject(p)

        return doc
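
The model_schema method above expects the 'schema' data to carry a 'prefix' list plus one entry per prefixed P/Q node. A sketch of that shape, inferred from the keys the method reads, is shown below; the prefixes, IDs and values are placeholders.

# Hedged sketch of the expected schema structure; all concrete prefixes,
# IDs and values are placeholders inferred from the keys model_schema reads.
schema_data = {
    'prefix': [
        {'wd': 'http://www.wikidata.org/entity/'},
        {'ex': 'http://example.org'},   # ns_dict['ex'] becomes 'http://example.org/entity'
    ],
    'ex:P1000': {                       # 'P' in the local name -> modeled as a WDProperty
        'type': 'quantity',             # looked up in type_map
        'label': {'en': ['example measure']},
        'description': {'en': ['an illustrative numeric property']},
        'statements': {
            'ex:P1001': [{'value': '42'}]   # type fetched via get_property_type
        }
    }
}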
Example #26
0
def generate_triples(user_id: str, resolved_excel: list, sparql_endpoint: str, filetype: str = 'ttl') -> str:
	"""
	This function uses ETK to generate the RDF triples
	:param user_id:
	:param resolved_excel:
	:param sparql_endpoint:
	:param filetype:
	:return:
	"""
	# initialize
	kg_schema = KGSchema()
	kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
	etk = ETK(kg_schema=kg_schema, modules=ETKModule)
	doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

	# bind prefixes
	doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
	doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
	doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
	doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
	doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
	doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
	doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
	doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
	doc.kg.bind('p', 'http://www.wikidata.org/prop/')
	doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
	doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
	doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
	doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
	doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
	doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
	doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
	doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
	doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
	doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
	doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
	doc.kg.bind('schema', 'http://schema.org/')

	# property_type_cache = {}
	is_error = False
	for i in resolved_excel:
		item = WDItem(i["statement"]["item"],  creator='http://www.isi.edu/t2wml')
		try:
			property_type = property_type_map[i["statement"]["property"]]
		except KeyError:
			property_type = get_property_type(i["statement"]["property"], sparql_endpoint)
			property_type_map[i["statement"]["property"]] = property_type
		if property_type == "WikibaseItem":
			value = Item(str(i["statement"]["value"]))
		elif property_type == "WikibaseProperty":
			value = Property(i["statement"]["value"])
		elif property_type == "String":
			value = StringValue(i["statement"]["value"])
		elif property_type == "Quantity":
			value = QuantityValue(i["statement"]["value"])
		elif property_type == "Time":
			value = TimeValue(str(i["statement"]["value"]), Item(i["statement"]["calendar"]), translate_precision_to_integer(i["statement"]["precision"]), i["statement"]["time_zone"])
		elif property_type == "Url":
			value = URLValue(i["statement"]["value"])
		elif property_type == "Monolingualtext":
			value = MonolingualText(i["statement"]["value"], i["statement"]["lang"])
		elif property_type == "ExternalId":
			value = ExternalIdentifier(i["statement"]["value"])
		elif property_type == "GlobeCoordinate":
			value = GlobeCoordinate(i["statement"]["latitude"], i["statement"]["longitude"], i["statement"]["precision"])
		elif property_type == "Property Not Found":
			is_error = True
			break
		s = item.add_statement(i["statement"]["property"], value)
		doc.kg.add_subject(item)

		if "qualifier" in i["statement"]:
			for j in i["statement"]["qualifier"]:
				try:
					property_type = property_type_map[j["property"]]
				except KeyError:
					property_type = get_property_type(j["property"], sparql_endpoint)
					property_type_map[j["property"]] = property_type
				if property_type == "WikibaseItem":
					value = Item(str(j["value"]))
				elif property_type == "WikibaseProperty":
					value = Property(j["value"])
				elif property_type == "String":
					value = StringValue(j["value"])
				elif property_type == "Quantity":
					value = QuantityValue(j["value"])
				elif property_type == "Time":
					value = TimeValue(str(j["value"]), Item(j["calendar"]), j["precision"], j["time_zone"])
				elif property_type == "Url":
					value = URLValue(j["value"])
				elif property_type == "Monolingualtext":
					value = MonolingualText(j["value"], j["lang"])
				elif property_type == "ExternalId":
					value = ExternalIdentifier(j["value"])
				elif property_type == "GlobeCoordinate":
					value = GlobeCoordinate(j["latitude"], j["longitude"], j["precision"])
				elif property_type == "Property Not Found":
					is_error = True
				s.add_qualifier(j["property"], value)
		doc.kg.add_subject(s)
	if not is_error:
		data = doc.kg.serialize(filetype)
	else:
		data = "Property Not Found"
	# os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
	# results_file_name = user_id + "_results.ttl"
	# changes_file_name = user_id + "_changes.tsv"

	# with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
	# 	fp.write(data)
	# with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
	# 	serialize_change_record(fp)
	return data
    parser.add_option("-o",
                      "--output_file",
                      action="store",
                      type="string",
                      dest="output_file")
    (c_options, args) = parser.parse_args()

    input_file = c_options.input_file
    output_file = c_options.output_file

    f = open(input_file, mode='r', encoding='utf-8')
    o = open(output_file, mode='w', encoding='utf-8')
    l = open('{}.log'.format(output_file), mode='w', encoding='utf-8')
    print('Starting to process file: {}'.format(input_file))
    count = 0
    sum = 0
    for line in f:
        if count == 10000:
            sum += count
            l.write('Processed {} lines'.format(str(sum)))
            l.write('\n')
            count = 0
        json_x = json.loads(line)
        doc = etk.create_document(json_x)
        doc.doc_id = json_x['doc_id']
        sentences = etk.process_ems(doc)
        for s in sentences:
            o.write(json.dumps(s.value))
            o.write('\n')
        count += 1
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .
:Event a owl:Class ; .
:Entity a owl:Class ; .
:Organization a owl:Class ; .
:MOVEMENT_TRANSPORT a owl:Class ; .
:GeopoliticalEntity a owl:Class ; .
skos:prefLabel a owl:DatatypeProperty ; 
    schema:domainIncludes :Entity, :Event ;
    rdfs:range xsd:string ; .
:conflict_attack_place a owl:ObjectProperty ;
    schema:domainIncludes :Entity, :Event ;
    schema:rangeIncludes :GeopoliticalEntity ; .
    '''

    ontology = Ontology(ontology_content,
                        validation=False,
                        include_undefined_class=True,
                        quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
    input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
    doc = etk.create_document(input_data)
    docs = etk.process_ems(doc)
    kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
    with open('output.jsonl', 'w') as f:
        f.write('\n'.join(kgs))
    with open('output.nt', 'w') as f:
        f.writelines(map(rdf_generation, kgs))
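
The snippet above writes each document's knowledge graph both as JSON Lines and, via an rdf_generation helper that is not included in this example, as N-Triples. A minimal sketch of one way such a helper could be written, assuming every kg string is a JSON-LD document and an rdflib installation with JSON-LD support:

# Hypothetical sketch of the rdf_generation helper referenced above; the real
# implementation is not shown in this example.
from rdflib import Graph

def rdf_generation(jsonld_str: str) -> str:
    graph = Graph()
    graph.parse(data=jsonld_str, format='json-ld')  # needs JSON-LD support
    return graph.serialize(format='nt')             # str on rdflib >= 6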
Example #29
0
import os, sys, json, codecs
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.extractors.html_content_extractor import HTMLContentExtractor, Strategy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
from etk.extractors.inferlink_extractor import InferlinkExtractor, InferlinkRuleSet

sample_html = json.load(codecs.open('sample_html.json',
                                    'r'))  # read sample file from disk

etk = ETK()
doc = etk.create_document(sample_html,
                          mime_type="text/html",
                          url="http://ex.com/123")

metadata_extractor = HTMLMetadataExtractor()
content_extractor = HTMLContentExtractor()
landmark_extractor = InferlinkExtractor(
    InferlinkRuleSet(
        InferlinkRuleSet.load_rules_file('sample_inferlink_rules.json')))

root = doc.select_segments("$")[0]
raw = doc.select_segments("$.raw_content")[0]

# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_title=True), "title")
# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_meta=True), "metadata")
root.store_extractions(
    doc.invoke_extractor(content_extractor, raw, strategy=Strategy.ALL_TEXT),
    "etk2_text")
root.store_extractions(
                json_path='$.factoid.metadata.file_name')
            extracted_doc.kg.add_value(
                'provenance_sheet', json_path='$.factoid.metadata.sheet_name')
            extracted_doc.kg.add_value('value', json_path='$.factoid.value')
            extracted_doc.kg.add_value('type', json_path='$.factoid.type')
            extracted_doc.kg.add_value('identifier_key',
                                       json_path='$.factoid.identifier_key')
            extracted_doc.kg.add_value('identifier_value',
                                       json_path='$.factoid.identifier_value')

            extracted_docs.append(extracted_doc)

        return extracted_docs


if __name__ == "__main__":
    # elicit_alignment/m9/datasets/orig/structured/west_african_food_composition/example/
    dir_path = sys.argv[1]
    file_name = 'West African Food Composition.xls'
    input_path = os.path.join(dir_path, file_name)
    output_path = os.path.join(dir_path, file_name + '.jl')

    kg_schema = KGSchema(json.load(open('master_config.json')))
    etk = ETK(modules=ElicitWestAmericanFoodModule, kg_schema=kg_schema)
    doc = etk.create_document({'file_path': input_path})

    docs = etk.process_ems(doc)
    with open(output_path, 'w') as f:
        for i in range(1, len(docs)):  # ignore the first
            f.write(json.dumps(docs[i].value) + '\n')