def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
    """
    Reset the doc object and return it. Called at initialization and after
    outputting triples.
    """
    kg_schema = KGSchema()
    kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = self.etk.create_document({}, doc_id=doc_id)
    for k, v in wiki_namespaces.items():
        if k in self.prefix_dict:
            self.doc.kg.bind(k, self.prefix_dict[k])
        else:
            self.doc.kg.bind(k, v)
def setUp(self):
    sample_doc = {
        "projects": [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
            "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
            "date": "2007-12-05",
            "place": "columbus:georgia:united states:-84.98771:32.46098",
            "s": "segment_test_1"
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
            "members": ["mayank", "yixiang"],
            "date": ["2007-12-05T23:19:00"],
            "cost": -3213.32,
            "s": "segment_test_2"
        }]
    }
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema)
    self.doc = etk.create_document(sample_doc)
def test_EmailExtractor(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)
    text = "[email protected] [email protected] " \
           "[email protected] [email protected] E-mail:[email protected] [email protected]"
    email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                     tokenizer=etk.default_tokenizer,
                                     extractor_name="email_extractor")
    extractions = email_extractor.extract(text)
    extracted = []
    for i in extractions:
        extracted_value = {
            "value": i.value,
            "start_char": i.provenance["start_char"],
            "end_char": i.provenance["end_char"],
            "value_from_text": text[i.provenance["start_char"]:i.provenance["end_char"]]
        }
        extracted.append(extracted_value)
        self.assertEqual(extracted_value["value"],
                         extracted_value["value_from_text"])
    expected = [{
        'value': '*****@*****.**',
        'start_char': 97,
        'end_char': 122,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 0,
        'end_char': 16,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 77,
        'end_char': 96,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 17,
        'end_char': 40,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 51,
        'end_char': 68,
        'value_from_text': '*****@*****.**'
    }]
    self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                     sorted(extracted, key=lambda x: x["start_char"]))
def setUp(self):
    ontology_content = '''
        @prefix : <http://dig.isi.edu/ontologies/dig/> .
        @prefix owl: <http://www.w3.org/2002/07/owl#> .
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix schema: <http://schema.org/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        :Person a owl:Class ;
            rdfs:subClassOf :Actor, :Biological_Object ;
            :common_properties :label, :title, :religion ; .
        :has_name a owl:DatatypeProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes xsd:string ; .
        :has_child a owl:ObjectProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes :Person ; .
    '''
    ontology = Ontology(ontology_content,
                        validation=False,
                        include_undefined_class=True,
                        quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=True)
    etk2 = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=False)
    self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                   type_=[DIG.Person.toPython()])
    self.doc2 = etk2.create_document(dict(), doc_id='http://xxx/2',
                                     type_=[DIG.Person.toPython()])
def test_segment(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema)
    doc = etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    description_value = [i.value for i in descriptions]
    expected = [
        'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
        'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
    ]
    self.assertEqual(description_value, expected)
def __init__(
        self,
        propFile: str,
        labelSet: str,
        aliasSet: str,
        descriptionSet: str,
        n: int,
        destFp: TextIO = sys.stdout,
):
    self.propTypes = self.__setPropTypes(propFile)
    self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
        labelSet, aliasSet, descriptionSet)
    # TODO handle standard output
    self.fp = destFp
    self.n = int(n)
    self.read = 0
    # serialize prefix
    kg_schema = KGSchema()
    kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = self.__setDoc()
    self.__serialize_prefix()
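The n and read counters suggest a batched flush of the triple buffer. A minimal sketch of such a helper on the same class; the method name and flush policy are hypothetical additions, while kg.serialize('ttl') matches its use elsewhere in these snippets:

def flush_if_needed(self) -> None:
    # Hypothetical helper (not in the source): after each entity has been
    # modelled into self.doc, write out the accumulated triples once every
    # n entities, using the self.n / self.read counters set up in __init__.
    self.read += 1
    if self.read % self.n == 0:
        self.fp.write(self.doc.kg.serialize('ttl'))  # flush buffered triples
        self.doc = self.__setDoc()                   # start a fresh document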
def __init__(self, master_config, em_paths, logger, worker_id,
             project_name, kafka_input_args=None, kafka_output_args=None):
    self.logger = logger
    self.worker_id = worker_id
    self.check_interval = 1000
    self.exit_sign = False

    try:
        kg_schema = KGSchema(master_config)
        self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
    except Exception as e:
        logger.exception('ETK initialization failed')
        raise e

    # kafka input (`config` is the module-level service configuration,
    # distinct from the master_config parameter)
    self.kafka_input_server = config['input_server']
    self.kafka_input_session_timeout = config['input_session_timeout']
    self.kafka_input_group_id = config['input_group_id']
    self.kafka_input_topic = '{project_name}_in'.format(
        project_name=project_name)
    self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
    self.kafka_consumer = KafkaConsumer(
        bootstrap_servers=self.kafka_input_server,
        group_id=self.kafka_input_group_id,
        consumer_timeout_ms=self.check_interval,
        value_deserializer=lambda v: json.loads(v.decode('utf-8')),
        **self.kafka_input_args)
    self.kafka_consumer.subscribe([self.kafka_input_topic])

    # kafka output
    self.kafka_output_server = config['output_server']
    self.kafka_output_topic = '{project_name}_out'.format(
        project_name=project_name)
    self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
    self.kafka_producer = KafkaProducer(
        bootstrap_servers=self.kafka_output_server,
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        **self.kafka_output_args)

    self.timeout_count = self.kafka_input_session_timeout / self.check_interval
    self.current_timeout_count = 0
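The constructor wires up the consumer, producer, and idle counters but not the loop that drives them. A minimal sketch of such a loop, assuming kafka-python semantics (iteration over a consumer with consumer_timeout_ms stops after that much silence); everything beyond the fields set in __init__ is an assumption:

def run(self):
    # Hypothetical consume/process/produce loop; only the fields it reads
    # come from __init__ above.
    while not self.exit_sign:
        consumed = False
        for record in self.kafka_consumer:  # stops after check_interval ms of silence
            consumed = True
            self.current_timeout_count = 0
            doc = self.etk_ins.create_document(record.value)
            for result in self.etk_ins.process_ems(doc):
                self.kafka_producer.send(self.kafka_output_topic,
                                         value=result.value)
        if not consumed:
            self.current_timeout_count += 1
            if self.current_timeout_count >= self.timeout_count:
                self.exit_sign = True  # idle longer than the session timeout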
import unittest, json

from etk.timeseries_processor import TimeseriesProcessor
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))
etk = ETK(kg_schema=kg_schema)


# python -m unittest etk.unit_tests.test_timeseries_processor to run all unittests
class TestTimeseriesProcessor(unittest.TestCase):
    def test_excel_file(self) -> None:
        annotation = 'etk/timeseries/DIESEL_june_annotation.json'
        spreadsheet = 'etk/unit_tests/ground_truth/DIESEL_june_2017.xlsx'
        timeseriesProcessor = TimeseriesProcessor(etk=etk,
                                                  annotation=annotation,
                                                  spreadsheet=spreadsheet)
        docs = [
            doc.cdr_document
            for doc in timeseriesProcessor.timeseries_extractor()
        ]
        selected_docs = docs[1]
        expected_metadata = {
            "name": "AVERAGE DIESEL (AUTOMATIVE GAS OIL) PRICES/ Litre NGN",
            "granularity": "monthly",
            "provenance": {
                "filename": "DIESEL_june_2017.xlsx",
ontology = """
@prefix : <http://isi.edu/xij-rule-set#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

:Software a owl:Class ;
    rdfs:label "Software" .
:Person a owl:Class ;
    rdfs:label "Person" .
:Developer a owl:Class ;
    rdfs:label "Developer" .
:name a owl:DatatypeProperty ;
    rdfs:domain :Person ;
    rdfs:range xsd:string .
:developer a owl:ObjectProperty ;
    rdfs:label "developer" ;
    rdfs:domain :Software ;
    rdfs:range :Developer .
"""

kg_schema = KGSchema()
kg_schema.add_schema(ontology, 'ttl')
etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
doc = etk.create_document(sample_input,
                          doc_id="http://isi.edu/default-ns/projects")
docs = etk.process_ems(doc)
print(docs[0].kg.serialize('ttl'))
def model_statement(self):
    # initialize KGSchema
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

    # extract files
    self.extract_files()

    # model statement
    inputs = self.data['inputs']
    for k, v in inputs.items():
        if k != 'metadata':
            # construct wikifier instance
            if k == 'wikifier' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A wikifier file for ' +
                            inputs['dataset']['content']['filename'], lang='en')
                q.add_statement('P31', Item('SDQ1001', namespace=self.ns))  # an instance of Wikifier
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP3003', StringValue(v['content']),
                                namespace=self.ns)  # hasFileContent
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)  # hashValue

            # construct mapping_file instance
            elif k == 'mappingFile' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A mapping file for ' +
                            inputs['dataset']['content']['filename'], lang='en')
                q.add_statement('P31', Item('SDQ1002', namespace=self.ns))  # an instance of MappingFile
                q.add_statement('P170', StringValue('T2WML'))
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                q.add_statement('SDP3003', StringValue(json.dumps(v['content'])),
                                namespace=self.ns)
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)

            # construct dataset instance
            elif k == 'dataset' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label(v['content']['title'], lang='en')
                q.add_description(v['content']['description'], lang='en')
                q.add_statement('P31', Item('Q1172284'))  # an instance of Dataset
                q.add_statement('SDP3001',
                                Item(inputs['wikifier']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a wikifier file
                q.add_statement('SDP3002',
                                Item(inputs['mappingFile']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a mapping file
                q.add_statement('P1476', StringValue(v['content']['title']))  # title
                q.add_statement('P921',
                                StringValue(v['content']['description']))  # described
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP2004',
                                StringValue(', '.join(v['content']['keywords'])),
                                namespace=self.ns)  # keywords
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)

                if self.data['storeColumnValue']:
                    for data in v['content']['variable_measured']:
                        statement = q.add_statement(
                            'SDP2005', StringValue(data['column_name']),
                            namespace=self.ns)  # variable measured
                        statement.add_qualifier(
                            'SDP2006', StringValue(data['values_of_a_column']),
                            namespace=self.ns)  # the values of a column
                        statement.add_qualifier(
                            'SDP2007', Item(data['data_structure_type']),
                            namespace=self.ns)  # data structure type
                        statement.add_qualifier(
                            'SDP2008', URLValue(data['semantic_type_identifier']),
                            namespace=self.ns)  # semantic type
                        statement.add_qualifier(
                            'P1545', QuantityValue(data['column_index'],
                                                   namespace=self.ns))  # column index

            doc.kg.add_subject(q)

    return doc
def model_schema(self):
    # read data
    data = self.read_data(self.data['schema'])

    # initialize KGSchema
    custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
    for each in data['prefix']:
        for k, v in each.items():
            custom_dict[k] = v
            if k != 'wd':
                ns_dict[k] = v + '/entity'
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict)

    type_map = {
        'quantity': Datatype.QuantityValue,
        'url': Datatype.URLValue,
        'item': Datatype.Item,
        'time': Datatype.TimeValue,
        'string': Datatype.StringValue,
        'text': Datatype.MonolingualText
    }

    # model schema
    for k, v in data.items():
        if ':' in k:
            k = k.split(':')
            if 'Q' in k[1]:
                p = WDItem(k[1], namespace=k[0], creator=':datamart')
            elif 'P' in k[1]:
                p = WDProperty(k[1], type_map[v['type']],
                               namespace=k[0], creator=':datamart')
            else:
                raise Exception('There is no P/Q information.')

            for lang, value in v['description'].items():
                for val in value:
                    p.add_description(val, lang=lang)
            for lang, value in v['label'].items():
                for val in value:
                    p.add_label(val, lang=lang)

            for node, value in v['statements'].items():
                ns = node.split(':')[0] if ':' in node else 'wd'
                for val in value:
                    prop_type = self.get_property_type(node, ns_dict[ns])
                    # build the typed value object (a separate name avoids
                    # shadowing the outer loop's v)
                    if prop_type == 'WikibaseItem':
                        stmt_value = Item(str(val['value']))
                    elif prop_type == 'WikibaseProperty':
                        stmt_value = Property(val['value'])
                    elif prop_type == 'String':
                        stmt_value = StringValue(val['value'])
                    elif prop_type == 'Quantity':
                        stmt_value = QuantityValue(val['value'])
                    elif prop_type == 'Url':
                        stmt_value = URLValue(val['value'])
                    elif prop_type == 'Monolingualtext':
                        stmt_value = MonolingualText(val['value'], val['lang'])
                    p.add_statement(node, stmt_value)
            doc.kg.add_subject(p)

    return doc
import os, sys, json

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))
etk = ETK(kg_schema, ["./ems"])
doc = etk.create_document(json.load(open('sample_html.jl', 'r')))
docs = etk.process_ems(doc)
print(json.dumps(docs[0].value, indent=2))
def test_KnowledgeGraph_provenance(self) -> None:
    sample_doc = {
        "projects": [
            {
                "name": "etk",
                "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098"
            },
            {
                "name": "rltk",
                "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32
            }
        ]
    }
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema)
    doc = etk.create_document(sample_doc)

    try:
        doc.kg.add_value("developer", json_path="projects[*].members[*]")
    except KgValueError:
        pass

    try:
        doc.kg.add_value("test_date", json_path="projects[*].date[*]")
    except KgValueError:
        pass

    try:
        doc.kg.add_value(
            "test_add_value_date",
            value=[date(2018, 3, 28), {}, datetime(2018, 3, 28, 1, 1, 1)],
            json_path_extraction="projects[0].date")
    except KgValueError:
        pass

    try:
        doc.kg.add_value("test_location", json_path="projects[*].place")
    except KgValueError:
        pass

    # print(json.dumps(doc.value, indent=2))
    expected_provenances = [
        {
            "@id": 0,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "dongyu",
            "json_path": "projects.[0].members.[0]"
        },
        {
            "@id": 1,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "amandeep",
            "json_path": "projects.[0].members.[1]"
        },
        {
            "@id": 2,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "sylvia",
            "json_path": "projects.[0].members.[2]"
        },
        {
            "@id": 3,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "Runqi12",
            "json_path": "projects.[0].members.[3]"
        },
        {
            "@id": 4,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "mayank",
            "json_path": "projects.[1].members.[0]"
        },
        {
            "@id": 5,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "yixiang",
            "json_path": "projects.[1].members.[1]"
        },
        {
            "@id": 6,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "2007-12-05T00:00:00",
            "json_path": "projects.[0].date.[0]"
        },
        {
            "@id": 7,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "2007-12-05T23:19:00",
            "json_path": "projects.[1].date.[0]"
        },
        {
            "@id": 8,
            "@type": "kg_provenance_record",
            "reference_type": "constant",
            "value": "2018-03-28",
            "json_path": "projects[0].date"
        },
        {
            "@id": 9,
            "@type": "kg_provenance_record",
            "reference_type": "constant",
            "value": "2018-03-28T01:01:01",
            "json_path": "projects[0].date"
        },
        {
            "@id": 10,
            "@type": "kg_provenance_record",
            "reference_type": "location",
            "value": "columbus:georgia:united states:-84.98771:32.46098",
            "json_path": "projects.[0].place"
        }
    ]

    self.assertEqual(expected_provenances, doc.value["provenances"])
ontology_content = '''
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .

:Event a owl:Class ; .
:Entity a owl:Class ; .
:Organization a owl:Class ; .
:MOVEMENT_TRANSPORT a owl:Class ; .
:GeopoliticalEntity a owl:Class ; .

skos:prefLabel a owl:DatatypeProperty ;
    schema:domainIncludes :Entity, :Event ;
    rdfs:range xsd:string ; .
:conflict_attack_place a owl:ObjectProperty ;
    schema:domainIncludes :Entity, :Event ;
    schema:rangeIncludes :GeopoliticalEntity ; .
'''

ontology = Ontology(ontology_content,
                    validation=False,
                    include_undefined_class=True,
                    quiet=True)
kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
doc = etk.create_document(input_data)
docs = etk.process_ems(doc)
kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
with open('output.jsonl', 'w') as f:
    f.write('\n'.join(kgs))
with open('output.nt', 'w') as f:
    f.writelines(map(rdf_generation, kgs))
        },
        "news_story": {
            "type": "string"
        },
        "similarity": {
            "type": "number"
        },
        "matched_sentence": {
            "type": "string"
        },
        "date": {
            "type": "string"
        }
    }
}

kg_schema = KGSchema(master_config)
etk = ETK(kg_schema, ["./"])

# read the news
news_file = open(
    '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl'
)
# news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl')
news_stories = [
    etk.create_document(story, url=story['tld'], doc_id=story['doc_id'])
    # parse each line once instead of three times
    for story in (json.loads(line) for line in news_file)
]
results = list()
for news_story in news_stories:
def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])  # parse as int: it is compared against len(heap) below
    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Wrong mode! Please check the input argument!')
        return

    master_config = {
        "fields": {
            "developer": {
                "type": "string"
            },
            "student_developer": {
                "type": "string"
            },
            "spacy_name": {
                "type": "string"
            },
            "date": {
                "type": "date"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')
    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()
    with open(query_title) as f:
        for line in f:
            # strip the trailing newline so it cannot leak into dict keys
            # or the output file name
            line = line.strip()
            orig_ifp_title = line
            # remove date information from query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete date from query term
            if len(res) != 0:
                line = line[:start] + line[end + 1:]
            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()
            # extract entities from query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for given query term
    query_docs_mapping = dict()
    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()
    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 \
                    or ds.select_document(document=doc,
                                          json_paths=['$.lexisnexis.doc_description'],
                                          json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233",
                                            ifp_title=proc_query,
                                            orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()
            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(
                    doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(
                    doc=doc, threshold=0).cdr_document
            entry = (processed_doc['similarity'], processed_doc['date'],
                     processed_doc)
            if len(heap) < top_k:
                heappush(heap, entry)
            elif processed_doc['similarity'] > heap[0][0]:
                # replace the current minimum (heapq.heappushpop) so the
                # heap never grows beyond top_k entries
                heappushpop(heap, entry)
        heap.sort(reverse=True)

        output_filename = './resources/output/' + orig_ifp_title + "_result.jl"
        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())
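For reference, a hypothetical invocation of this script; the script and file names are placeholders, only the argument order comes from main() above:

# python rank_docs.py news_docs.jl ifp_titles.txt SENTENCE 20
# ranks every selected news document against each query title by sentence
# similarity and appends the top 20 per query to ./resources/output/.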
# height: 1.96 metre (Q11573)
douglas.add_statement('P2048', QuantityValue(1.96, unit=Item('Q11573')))

# official website
# statement = douglas.add_statement('P856', URLValue('http://douglasadams.com/'))
statement = douglas.add_truthy_statement('P856',
                                         URLValue('http://douglasadams.com/'))
statement.add_qualifier('P407', Item('Q1860'))  # language of work or name: English

# Freebase ID
douglas.add_statement(
    'P646', ExternalIdentifier('/m/0282x', URLValue('http://g.co/kg/m/0282x')))
doc.kg.add_subject(douglas)
return list()


if __name__ == "__main__":
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")
    revise(True)
    docs = etk.process_ems(doc)
    print(docs[0].kg.serialize('ttl'))
def test_Provenance(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    self.etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)
    g = [
        'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep', 'yixiang'
    ]
    self.name_extractor = GlossaryExtractor(g, "name_extractor",
                                            self.etk.default_tokenizer,
                                            case_sensitive=False, ngrams=1)
    doc = self.etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    projects = doc.select_segments("projects[*]")
    for d, p in zip(descriptions, projects):
        names = doc.extract(self.name_extractor, d)
        p.store(names, "members")

    expected_provenances = [{
        "@id": 0,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 33,
            "end_char": 38
        }
    }, {
        "@id": 1,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 40,
            "end_char": 46
        }
    }, {
        "@id": 2,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 48,
            "end_char": 54
        }
    }, {
        "@id": 3,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[0].description",
            "start_char": 56,
            "end_char": 64
        }
    }, {
        "@id": 4,
        "@type": "storage_provenance_record",
        "doc_id": None,
        "field": None,
        "destination": "projects.[0].members",
        "parent_provenances": {
            "Runqi": 0,
            "Dongyu": 1,
            "Sylvia": 2,
            "Amandeep": 3
        }
    }, {
        "@id": 5,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[1].description",
            "start_char": 39,
            "end_char": 44
        }
    }, {
        "@id": 6,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[1].description",
            "start_char": 46,
            "end_char": 52
        }
    }, {
        "@id": 7,
        "@type": "extraction_provenance_record",
        "method": "name_extractor",
        "confidence": 1.0,
        "origin_record": {
            "path": "projects.[1].description",
            "start_char": 54,
            "end_char": 61
        }
    }, {
        "@id": 8,
        "@type": "storage_provenance_record",
        "doc_id": None,
        "field": None,
        "destination": "projects.[1].members",
        "parent_provenances": {
            "Pedro": 5,
            "Mayank": 6,
            "Yixiang": 7
        }
    }]

    expected_projects = [{
        "name": "etk",
        "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
        "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]
    }, {
        "name": "rltk",
        "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
        "members": ["Pedro", "Mayank", "Yixiang"]
    }]
    self.assertEqual(expected_projects, doc.value["projects"])
    self.assertEqual(expected_provenances, doc.value["provenances"])
def _init_etk():
    # initialize etk
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # give definitions of the nodes we defined
    p = WDProperty('C2001', Datatype.MonolingualText)
    p.add_label('datamart identifier', lang='en')
    p.add_description('identifier of a dataset in the Datamart system', lang='en')
    p.add_statement('P31', Item('Q19847637'))
    p.add_statement('P1629', Item('Q1172284'))
    doc.kg.add_subject(p)

    p = WDProperty('C2004', Datatype.StringValue)
    p.add_label('keywords', lang='en')
    p.add_description('keywords associated with an item to facilitate finding '
                      'the item using text search', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2005', Datatype.StringValue)
    p.add_label('variable measured', lang='en')
    p.add_description('the variables measured in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    p.add_statement('P1628', URLValue('http://schema.org/variableMeasured'))
    doc.kg.add_subject(p)

    p = WDProperty('C2006', Datatype.StringValue)
    p.add_label('values', lang='en')
    p.add_description('the values of a variable represented as a text document',
                      lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2007', Datatype.Item)
    p.add_label('data type', lang='en')
    p.add_description('the data type used to represent the values of a variable, '
                      'integer (Q729138), Boolean (Q520777), Real (Q4385701), '
                      'String (Q184754), Categorical (Q2285707)', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2008', Datatype.URLValue)
    p.add_label('semantic type', lang='en')
    p.add_description('a URL that identifies the semantic type of a variable '
                      'in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2010', Datatype.StringValue)
    p.add_label('extra information', lang='en')
    p.add_description('extra information that may be needed for this dataset',
                      lang='en')
    doc.kg.add_subject(p)

    p = WDProperty('C2011', Datatype.TimeValue)
    p.add_label('start date', lang='en')
    p.add_description('the earliest time in this dataset; only valid when '
                      'time-formatted data exists in this dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2012', Datatype.TimeValue)
    p.add_label('end date', lang='en')
    p.add_description('the latest time in this dataset; only valid when '
                      'time-formatted data exists in this dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2013', Datatype.QuantityValue)
    p.add_label('time granularity', lang='en')
    p.add_description('time granularity in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    doc.kg.add_subject(p)

    p = WDProperty('C2014', Datatype.StringValue)
    p.add_label('uploader information', lang='en')
    p.add_description('information about who uploaded and when it was uploaded',
                      lang='en')
    doc.kg.add_subject(p)

    return doc
def __init__(self, query_server=None, update_server=None):
    self.punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    if query_server and update_server:
        self.query_server = query_server
        self.update_server = update_server
    else:
        self.query_server = DATAMRT_SERVER
        self.update_server = DATAMRT_SERVER

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = etk.create_document({},
                                   doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    self.doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    self.doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    self.doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    self.doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    self.doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    self.doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    self.doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    self.doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    self.doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    self.doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    self.doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    self.doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    self.doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    self.doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    self.doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    self.doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    self.doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    self.doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    self.doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    self.doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    self.doc.kg.bind('schema', 'http://schema.org/')

    # give definitions of the nodes we defined
    p = WDProperty('C2001', Datatype.MonolingualText)
    p.add_label('datamart identifier', lang='en')
    p.add_description('identifier of a dataset in the Datamart system', lang='en')
    p.add_statement('P31', Item('Q19847637'))
    p.add_statement('P1629', Item('Q1172284'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2004', Datatype.StringValue)
    p.add_label('keywords', lang='en')
    p.add_description('keywords associated with an item to facilitate finding '
                      'the item using text search', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2005', Datatype.StringValue)
    p.add_label('variable measured', lang='en')
    p.add_description('the variables measured in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    p.add_statement('P1628', URLValue('http://schema.org/variableMeasured'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2006', Datatype.StringValue)
    p.add_label('values', lang='en')
    p.add_description('the values of a variable represented as a text document',
                      lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2007', Datatype.Item)
    p.add_label('data type', lang='en')
    p.add_description('the data type used to represent the values of a variable, '
                      'integer (Q729138), Boolean (Q520777), Real (Q4385701), '
                      'String (Q184754), Categorical (Q2285707)', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    p = WDProperty('C2008', Datatype.URLValue)
    p.add_label('semantic type', lang='en')
    p.add_description('a URL that identifies the semantic type of a variable '
                      'in a dataset', lang='en')
    p.add_statement('P31', Item('Q18616576'))
    self.doc.kg.add_subject(p)

    # get the starting source id
    sparql_query = """
        prefix wdt: <http://www.wikidata.org/prop/direct/>
        prefix wd: <http://www.wikidata.org/entity/>
        prefix wikibase: <http://wikiba.se/ontology#>
        PREFIX p: <http://www.wikidata.org/prop/>
        PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>
        PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
        PREFIX ps: <http://www.wikidata.org/prop/statement/>
        prefix bd: <http://www.bigdata.com/rdf#>
        prefix bds: <http://www.bigdata.com/rdf/search#>

        select ?x where {
            wd:Z00000 wdt:P1114 ?x .
        }
    """
    try:
        sparql = SPARQLWrapper(self.query_server)
        sparql.setQuery(sparql_query)
        sparql.setReturnFormat(JSON)
        sparql.setMethod(POST)
        sparql.setRequestMethod(URLENCODED)
        results = sparql.query().convert()['results']['bindings']
    except Exception:
        print("Getting query of wiki data failed!")
        raise ValueError("Unable to initialize the datamart query service")
    if not results:
        print("[WARNING] No starting source id found! "
              "Will initialize the starting source with D1000001")
        self.resource_id = 1000001
    else:
        # resume numbering from the counter stored in the triple store
        self.resource_id = int(results[0]['x']['value'])
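Downstream, the counter presumably mints dataset identifiers of the D1000001 form mentioned in the warning above. A hypothetical helper sketching that use; the method name and increment policy are assumptions:

def next_resource_id(self) -> str:
    # Hypothetical: mint the next dataset node id (the 'D1000001' form comes
    # from the warning message in __init__) and advance the counter.
    rid = 'D{}'.format(self.resource_id)
    self.resource_id += 1
    return rid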