def setUp(self):
    ontology_content = '''
        @prefix : <http://dig.isi.edu/ontologies/dig/> .
        @prefix dig: <http://dig.isi.edu/ontologies/dig/> .
        @prefix owl: <http://www.w3.org/2002/07/owl#> .
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix schema: <http://schema.org/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        :Person a owl:Class ;
            rdfs:subClassOf :Actor, :Biological_Object ;
            :common_properties :label, :title, :religion ; .
        :has_name a owl:DatatypeProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes xsd:string ; .
        :has_child a owl:ObjectProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes :Person ; .
    '''
    kg_schema = KGSchema()
    kg_schema.add_schema(ontology_content, 'ttl')
    etk = ETK(kg_schema=kg_schema)
    self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                   type_=[URI('dig:Person')])
def setUp(self):
    sample_doc = {
        "projects": [
            {
                "name": "etk",
                "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098",
                "s": "segment_test_1"
            },
            {
                "name": "rltk",
                "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32,
                "s": "segment_test_2"
            }
        ]
    }
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema)
    self.doc = etk.create_document(sample_doc)
def test_segment(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    description_value = [i.value for i in descriptions]
    expected = [
        'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
        'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
    ]
    self.assertEqual(description_value, expected)
def test_website_patterns_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc, website_patterns=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc, website_patterns=[".*ABc", ".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def test_json_paths_and_json_paths_regex(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc,
        json_paths=["$.website"],
        json_paths_regex=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc,
        json_paths=["$.website"],
        json_paths_regex=[".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
def test_segment(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema)
    doc = etk.create_document(sample_input)
    descriptions = doc.select_segments("projects[*].description")
    description_value = [i.value for i in descriptions]
    expected = [
        'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
        'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
    ]
    self.assertEqual(description_value, expected)
def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
    """
    Reset the doc object and return it. Called at initialization and after
    outputting triples.
    """
    kg_schema = KGSchema()
    kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = self.etk.create_document({}, doc_id=doc_id)
    for k, v in wiki_namespaces.items():
        if k in self.prefix_dict:
            self.doc.kg.bind(k, self.prefix_dict[k])
        else:
            self.doc.kg.bind(k, v)
def __init__(self, master_config, em_paths, logger, worker_id,
             project_name, kafka_input_args=None, kafka_output_args=None):
    self.logger = logger
    self.worker_id = worker_id
    self.check_interval = 1000
    self.exit_sign = False

    try:
        kg_schema = KGSchema(master_config)
        self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
    except Exception as e:
        logger.exception('ETK initialization failed')
        raise e

    # kafka input
    self.kafka_input_server = config['input_server']
    self.kafka_input_session_timeout = config['input_session_timeout']
    self.kafka_input_group_id = config['input_group_id']
    self.kafka_input_topic = '{project_name}_in'.format(
        project_name=project_name)
    self.kafka_input_args = dict() if kafka_input_args is None else kafka_input_args
    self.kafka_consumer = KafkaConsumer(
        bootstrap_servers=self.kafka_input_server,
        group_id=self.kafka_input_group_id,
        consumer_timeout_ms=self.check_interval,
        value_deserializer=lambda v: json.loads(v.decode('utf-8')),
        **self.kafka_input_args)
    self.kafka_consumer.subscribe([self.kafka_input_topic])

    # kafka output
    self.kafka_output_server = config['output_server']
    self.kafka_output_topic = '{project_name}_out'.format(
        project_name=project_name)
    self.kafka_output_args = dict() if kafka_output_args is None else kafka_output_args
    self.kafka_producer = KafkaProducer(
        bootstrap_servers=self.kafka_output_server,
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        **self.kafka_output_args)

    self.timeout_count = self.kafka_input_session_timeout / self.check_interval
    self.current_timeout_count = 0
def test_EmailExtractor(self) -> None:
    kg_schema = KGSchema(
        json.load(open('etk/unit_tests/ground_truth/test_config.json')))
    etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)
    text = "[email protected] [email protected] " \
           "[email protected] [email protected] E-mail:[email protected] [email protected]"
    email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                     tokenizer=etk.default_tokenizer,
                                     extractor_name="email_extractor")
    extractions = email_extractor.extract(text)
    extracted = []
    for i in extractions:
        extracted_value = {
            "value": i.value,
            "start_char": i.provenance["start_char"],
            "end_char": i.provenance["end_char"],
            "value_from_text": text[i.provenance["start_char"]:i.provenance["end_char"]]
        }
        extracted.append(extracted_value)
        self.assertEqual(extracted_value["value"],
                         extracted_value["value_from_text"])
    expected = [{
        'value': '*****@*****.**',
        'start_char': 97,
        'end_char': 122,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 0,
        'end_char': 16,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 77,
        'end_char': 96,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 17,
        'end_char': 40,
        'value_from_text': '*****@*****.**'
    }, {
        'value': '*****@*****.**',
        'start_char': 51,
        'end_char': 68,
        'value_from_text': '*****@*****.**'
    }]
    self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                     sorted(extracted, key=lambda x: x["start_char"]))
def test_all_condition(self) -> None:
    etk = ETK()
    doc = etk.create_document(sample_input)
    default_doc_selector = DefaultDocumentSelector()
    res_true = default_doc_selector.select_document(
        doc,
        datasets=[".*unittest", ".*abc"],
        url_patterns=[".*unittest", ".*zxc"],
        website_patterns=[".*unittest", ".*abc"],
        json_paths=["$.website"],
        json_paths_regex=[".*unittest", ".*abc"])
    res_false = default_doc_selector.select_document(
        doc,
        datasets=[".*abc", ".*hhhh"],
        url_patterns=[".*ZXc", ".*hhhh"],
        website_patterns=[".*ABc", ".*hhhh"],
        json_paths=["$.website"],
        json_paths_regex=[".*hhhh"])
    self.assertEqual(True, res_true)
    self.assertEqual(False, res_false)
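# Hypothetical shape of the `sample_input` used by the selector tests above.
# Assumptions, not taken from the source: DefaultDocumentSelector matches the
# regex lists against the document's dataset/url/website attributes and
# json_paths_regex against values found at the given JSON paths; the field
# values below are illustrative only, chosen to be consistent with the
# expected True/False results.
sample_input = {
    "dataset": "unittest",
    "url": "http://unittest.zxc/doc/1",
    "website": "unittest.abc",
    "projects": []
}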
def setUp(self):
    ontology_content = '''
        @prefix : <http://dig.isi.edu/ontologies/dig/> .
        @prefix owl: <http://www.w3.org/2002/07/owl#> .
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix schema: <http://schema.org/> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        :Person a owl:Class ;
            rdfs:subClassOf :Actor, :Biological_Object ;
            :common_properties :label, :title, :religion ; .
        :has_name a owl:DatatypeProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes xsd:string ; .
        :has_child a owl:ObjectProperty ;
            schema:domainIncludes :Person ;
            schema:rangeIncludes :Person ; .
    '''
    ontology = Ontology(ontology_content,
                        validation=False,
                        include_undefined_class=True,
                        quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=True)
    etk2 = ETK(kg_schema=kg_schema, ontology=ontology, generate_json_ld=False)
    self.doc = etk.create_document(dict(), doc_id='http://xxx/1',
                                   type_=[DIG.Person.toPython()])
    self.doc2 = etk2.create_document(dict(), doc_id='http://xxx/2',
                                     type_=[DIG.Person.toPython()])
def __init__(
    self,
    propFile: str,
    labelSet: str,
    aliasSet: str,
    descriptionSet: str,
    n: int,
    destFp: TextIO = sys.stdout,
):
    self.propTypes = self.__setPropTypes(propFile)
    self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
        labelSet, aliasSet, descriptionSet)
    # TODO handle standard output
    self.fp = destFp
    self.n = int(n)
    self.read = 0
    # serialize prefix
    kg_schema = KGSchema()
    kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
    self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    self.doc = self.__setDoc()
    self.__serialize_prefix()
def test_etk_crf_glossary_extraction(self):
    etk = ETK(use_spacy_tokenizer=False)
    s = time.time()
    city_extractor = GlossaryExtractor(
        ['los angeles', 'new york', 'angeles'],
        'city_extractor',
        etk.default_tokenizer,
        case_sensitive=False,
        ngrams=3)
    doc_json = {
        'text': 'i live in los angeles. my hometown is Beijing. I love New York City.'
    }
    doc = Document(etk,
                   cdr_document=doc_json,
                   mime_type='json',
                   url='',
                   doc_id='1')
    t_segments = doc.select_segments("$.text")
    for t_segment in t_segments:
        extracted_cities = doc.extract(city_extractor, t_segment)
        for extracted_city in extracted_cities:
            self.assertTrue(extracted_city.value in
                            ['los angeles', 'New York', 'angeles'])
from etk.timeseries_processor import TimeseriesProcessor
import pprint


class TimeseriesETKModule(ETKModule):
    """
    Abstract class for extraction module
    """

    def __init__(self, etk):
        ETKModule.__init__(self, etk)

    def process_document(self, doc):
        pass


if __name__ == "__main__":
    etk = ETK(modules=TimeseriesETKModule)
    annotation = './resources/DIESEL_june_annotation.json'
    spreadsheet = './resources/DIESEL_june_2017.xlsx'
    timeseries_processor = TimeseriesProcessor(etk=etk,
                                               annotation=annotation,
                                               spreadsheet=spreadsheet)
    file_name = 'test_file_name'
    data_set = 'test_data_set'
    docs = [
        doc.cdr_document
        for doc in timeseries_processor.timeseries_extractor(
            file_name=file_name, data_set=data_set)
    ]
    pprint.pprint(docs)
# "groundpig, whistlepig, whistler, thickwood badger, " # "Canada marmot, monax, moonack, weenusk, red monk and, " # "among French Canadians in eastern Canada, siffleur" # }, # { # "name": "Test3 - Social Media", # "description": "Parser stress test for tweets", # "text": "Slides onto twitter..... \n" # ".......slippery floor....... \n" # "............slides out the other side..." # } # ], # "doc_id": 42069 # } # etk = ETK(modules=SentenceSplittingETKModule) # doc = etk.create_document(toy_doc) # # split_doc = etk.process_ems(doc) # # print(json.dumps(split_doc[0].value, indent=2)) parser = OptionParser(conflict_handler="resolve") parser.add_option("-i", "--input_file", action="store", type="string", dest="input_file") parser.add_option("-o", "--output_file", action="store", type="string",
def model_statement(self):
    # initialize KGSchema
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

    # extract files
    self.extract_files()

    # model statement
    inputs = self.data['inputs']
    for k, v in inputs.items():
        if k != 'metadata':
            # construct wikifier instance
            if k == 'wikifier' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A wikifier file for ' + inputs['dataset']['content']['filename'],
                            lang='en')
                q.add_statement('P31', Item('SDQ1001', namespace=self.ns))  # an instance of Wikifier
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP3003', StringValue(v['content']),
                                namespace=self.ns)  # hasFileContent
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)  # hashValue

            # construct mapping_file instance
            elif k == 'mappingFile' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label('A mapping file for ' + inputs['dataset']['content']['filename'],
                            lang='en')
                q.add_statement('P31', Item('SDQ1002', namespace=self.ns))  # an instance of MappingFile
                q.add_statement('P170', StringValue('T2WML'))
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                q.add_statement('SDP3003', StringValue(json.dumps(v['content'])),
                                namespace=self.ns)
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)

            # construct dataset instance
            elif k == 'dataset' and not v['existed']:
                q = WDItem(v['qnode'], namespace=self.ns, creator=':datamart')
                q.add_label(v['content']['title'], lang='en')
                q.add_description(v['content']['description'], lang='en')
                q.add_statement('P31', Item('Q1172284'))  # an instance of Dataset
                q.add_statement('SDP3001',
                                Item(inputs['wikifier']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a wikifier file
                q.add_statement('SDP3002',
                                Item(inputs['mappingFile']['qnode'], namespace=self.ns),
                                namespace=self.ns)  # a mapping file
                q.add_statement('P1476', StringValue(v['content']['title']))  # title
                q.add_statement('P921', StringValue(v['content']['description']))  # described
                q.add_statement('P127', Item('SDQ1003', namespace=self.ns))  # belongs to
                q.add_statement('SDP2004',
                                StringValue(', '.join(v['content']['keywords'])),
                                namespace=self.ns)  # keywords
                q.add_statement('SDP3004', StringValue(v['hashcode']),
                                namespace=self.ns)

                if self.data['storeColumnValue']:
                    for data in v['content']['variable_measured']:
                        statement = q.add_statement(
                            'SDP2005',
                            StringValue(data['column_name']),
                            namespace=self.ns)  # variable measured
                        statement.add_qualifier(
                            'SDP2006',
                            StringValue(data['values_of_a_column']),
                            namespace=self.ns)  # the values of a column
                        statement.add_qualifier(
                            'SDP2007',
                            Item(data['data_structure_type']),
                            namespace=self.ns)  # data structure type
                        statement.add_qualifier(
                            'SDP2008',
                            URLValue(data['semantic_type_identifier']),
                            namespace=self.ns)  # semantic type
                        statement.add_qualifier(
                            'P1545',
                            QuantityValue(data['column_index'],
                                          namespace=self.ns))  # column index

            doc.kg.add_subject(q)

    return doc
def model_schema(self):
    # read data
    data = self.read_data(self.data['schema'])

    # initialize KGSchema
    custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
    for each in data['prefix']:
        for k, v in each.items():
            custom_dict[k] = v
            if k != 'wd':
                ns_dict[k] = v + '/entity'
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id='http://isi.edu/default-ns/projects')

    # bind prefix
    doc = create_custom_prefix(doc, custom_dict)

    type_map = {
        'quantity': Datatype.QuantityValue,
        'url': URLValue,
        'item': Datatype.Item,
        'time': Datatype.TimeValue,
        'string': Datatype.StringValue,
        'text': Datatype.MonolingualText
    }

    # model schema
    for k, v in data.items():
        if ':' in k:
            k = k.split(':')
            if 'Q' in k[1]:
                p = WDItem(k[1], namespace=k[0], creator=':datamart')
            elif 'P' in k[1]:
                p = WDProperty(k[1], type_map[v['type']], namespace=k[0],
                               creator=':datamart')
            else:
                raise Exception('There is no P/Q information.')
                return None

            for lang, value in v['description'].items():
                for val in value:
                    p.add_description(val, lang=lang)

            for lang, value in v['label'].items():
                for val in value:
                    p.add_label(val, lang=lang)

            for node, value in v['statements'].items():
                ns = node.split(':')[0] if ':' in node else 'wd'
                for val in value:
                    prop_type = self.get_property_type(node, ns_dict[ns])
                    if prop_type == 'WikibaseItem':
                        v = Item(str(val['value']))
                    elif prop_type == 'WikibaseProperty':
                        v = Property(val['value'])
                    elif prop_type == 'String':
                        v = StringValue(val['value'])
                    elif prop_type == 'Quantity':
                        v = QuantityValue(val['value'])
                    elif prop_type == 'Url':
                        v = URLValue(val['value'])
                    elif prop_type == 'Monolingualtext':
                        v = MonolingualText(val['value'], val['lang'])
                    p.add_statement(node, v)
            doc.kg.add_subject(p)

    return doc
def generate_triples(user_id: str,
                     resolved_excel: list,
                     sparql_endpoint: str,
                     filetype: str = 'ttl',
                     created_by: str = 't2wml') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    property_type_map = property_type_dict

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        _item = i["statement"]["item"]
        if _item is not None:
            item = WDItem(_item, creator='http://www.isi.edu/{}'.format(created_by))
            try:
                property_type = property_type_map[i["statement"]["property"]]
            except KeyError:
                property_type = get_property_type(i["statement"]["property"],
                                                  sparql_endpoint)
                if property_type != "Property Not Found" and \
                        i["statement"]["property"] not in property_type_map:
                    property_type_map[i["statement"]["property"]] = property_type

            if property_type == "WikibaseItem":
                value = Item(str(i["statement"]["value"]))
            elif property_type == "WikibaseProperty":
                value = Property(i["statement"]["value"])
            elif property_type == "String":
                value = StringValue(i["statement"]["value"])
            elif property_type == "Quantity":
                _value = i["statement"]["value"]
                _value = str(_value).replace(',', '')
                value = QuantityValue(_value)
            elif property_type == "Time":
                value = TimeValue(
                    str(i["statement"]["value"]),
                    Item(i["statement"]["calendar"]),
                    translate_precision_to_integer(i["statement"]["precision"]),
                    i["statement"]["time_zone"])
            elif property_type == "Url":
                value = URLValue(i["statement"]["value"])
            elif property_type == "Monolingualtext":
                value = MonolingualText(i["statement"]["value"],
                                        i["statement"]["lang"])
            elif property_type == "ExternalId":
                value = ExternalIdentifier(i["statement"]["value"])
            elif property_type == "GlobeCoordinate":
                value = GlobeCoordinate(i["statement"]["latitude"],
                                        i["statement"]["longitude"],
                                        i["statement"]["precision"])
            elif property_type == "Property Not Found":
                is_error = True
                break

            s = item.add_statement(i["statement"]["property"], value)
            doc.kg.add_subject(item)

            if "qualifier" in i["statement"]:
                for j in i["statement"]["qualifier"]:
                    try:
                        property_type = property_type_map[j["property"]]
                    except KeyError:
                        property_type = get_property_type(j["property"],
                                                          sparql_endpoint)
                        if property_type != "Property Not Found" and \
                                i["statement"]["property"] not in property_type_map:
                            property_type_map[i["statement"]["property"]] = property_type

                    if property_type == "WikibaseItem":
                        value = Item(str(j["value"]))
                    elif property_type == "WikibaseProperty":
                        value = Property(j["value"])
                    elif property_type == "String":
                        value = StringValue(j["value"])
                    elif property_type == "Quantity":
                        value = QuantityValue(j["value"])
                    elif property_type == "Time":
                        value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                          j["precision"], j["time_zone"])
                    elif property_type == "Url":
                        value = URLValue(j["value"])
                    elif property_type == "Monolingualtext":
                        value = MonolingualText(j["value"], j["lang"])
                    elif property_type == "ExternalId":
                        value = ExternalIdentifier(j["value"])
                    elif property_type == "GlobeCoordinate":
                        value = GlobeCoordinate(j["latitude"], j["longitude"],
                                                j["precision"])
                    elif property_type == "Property Not Found":
                        is_error = True

                    if value is None:
                        continue
                    else:
                        s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)

    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        # data = "Property Not Found"
        raise Exception('data exception while generating triples')
    return data
""" def __init__(self, etk): ETKModule.__init__(self, etk) self.inferlink_extractor = InferlinkExtractor( InferlinkRuleSet( InferlinkRuleSet.load_rules_file( '../html_basic/sample_inferlink_rules.json'))) def process_document(self, doc): """ Add your code for processing the document """ raw = doc.select_segments("$.raw_content")[0] extractions = doc.extract(self.inferlink_extractor, raw) doc.store(extractions, "inferlink_extraction") return list() if __name__ == "__main__": sample_html = json.load(codecs.open('../html_basic/sample_html.json', 'r')) # read sample file from disk etk = ETK(modules=InferlinkETKModule) doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123") docs = etk.process_ems(doc) print(json.dumps(docs[0].value, indent=2))
class TripleGenerator(Generator):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        prop_declaration = kwargs.pop("prop_declaration")
        dest_fp = kwargs.pop("dest_fp")
        truthy = kwargs.pop("truthy")
        use_id = kwargs.pop("use_id")
        prefix_path = kwargs.pop("prefix_path")
        self.datatype_mapping = {
            # nomenclature from https://w.wiki/Tfn
            "item": Item,
            "WikibaseItem": Item,
            "time": TimeValue,
            "Time": TimeValue,
            "globe-coordinate": GlobeCoordinate,
            "GlobeCoordinate": GlobeCoordinate,
            "quantity": QuantityValue,
            "Quantity": QuantityValue,
            "monolingualtext": MonolingualText,
            "Monolingualtext": MonolingualText,
            "string": StringValue,
            "String": StringValue,
            "external-identifier": ExternalIdentifier,
            "ExternalId": ExternalIdentifier,
            "url": StringValue,  # TODO bug potentially in rdflib
            "Url": StringValue,
            "property": WDProperty,
            "WikibaseProperty": WDProperty
        }
        self.set_prefix(prefix_path)
        self.prop_declaration = prop_declaration
        self.set_properties(self.prop_file)
        self.fp = dest_fp
        self.truthy = truthy
        self.reset_etk_doc()
        self.serialize_prefix()
        self.use_id = use_id

    def set_prefix(self, prefix_path: str):
        self.prefix_dict = {}
        if prefix_path != "NONE":
            with open(prefix_path, "r") as fp:
                for line_num, edge in enumerate(fp):
                    edge_list = edge.strip("\r\n").split("\t")
                    if line_num == 0:
                        node1_index, node2_index = edge_list.index(
                            "node1"), edge_list.index("node2")
                    else:
                        prefix, expand = edge_list[node1_index], edge_list[node2_index]
                        self.prefix_dict[prefix] = expand

    def read_prop_declaration(self, line_number: int, edge: str):
        node1, node2, prop, e_id = self.parse_edges(edge)
        if prop == "data_type":
            self.prop_types[node1] = self.datatype_mapping[node2.strip()]
        return

    def set_properties(self, prop_file: str):
        self.prop_types = {}
        if prop_file == "NONE":
            return
        with open(prop_file, "r") as fp:
            props = fp.readlines()
        for line in props[1:]:
            node1, _, node2 = line.split("\t")
            try:
                self.prop_types[node1] = self.datatype_mapping[node2.strip()]
            except:
                raise KGTKException(
                    "DataType {} of node {} is not supported.\n".format(
                        node2, node1))

    def _node_2_entity(self, node: str):
        '''
        A node can be Qxxx or Pxxx, return the proper entity.
        '''
        if node in self.prop_types:
            entity = WDProperty(node, self.prop_types[node])
        else:
            entity = WDItem(TripleGenerator.replace_illegal_string(node))
        return entity

    def reset_etk_doc(self, doc_id: str = "http://isi.edu/default-ns/projects"):
        """
        Reset the doc object and return it. Called at initialization and after
        outputting triples.
        """
        kg_schema = KGSchema()
        kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
        self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = self.etk.create_document({}, doc_id=doc_id)
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                self.doc.kg.bind(k, self.prefix_dict[k])
            else:
                self.doc.kg.bind(k, v)

    def serialize(self):
        """
        Serialize the triples. Used a hack to avoid serializing the prefix again.
        """
        docs = self.etk.process_ems(self.doc)
        self.fp.write("\n\n".join(
            docs[0].kg.serialize("ttl").split("\n\n")[1:]))
        self.fp.flush()
        self.reset()

    def serialize_prefix(self):
        """
        This function should be called only once after the doc object is initialized.
        In order to serialize the prefix at the very beginning it has to be printed
        per the change of rdflib 4.2.2 -> 5.0.0.
        Relevant issue: https://github.com/RDFLib/rdflib/issues/965
        """
        for k, v in wiki_namespaces.items():
            if k in self.prefix_dict:
                line = "@prefix " + k + ": <" + self.prefix_dict[k] + "> .\n"
            else:
                line = "@prefix " + k + ": <" + v + "> .\n"
            self.fp.write(line)
        self.fp.write("\n")
        self.fp.flush()
        self.reset()

    def reset(self):
        self.to_append_statement_id = None
        self.to_append_statement = None
        self.read_num_of_lines = 0
        self.reset_etk_doc()

    def generate_label_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_label(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_description_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_description(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_alias_triple(self, node1: str, node2: str) -> bool:
        entity = self._node_2_entity(node1)
        text_string, lang = TripleGenerator.process_text_string(node2)
        entity.add_alias(text_string, lang=lang)
        self.doc.kg.add_subject(entity)
        return True

    def generate_prop_declaration_triple(self, node1: str, node2: str) -> bool:
        # update the known prop_types
        if node1 in self.prop_types:
            if not self.prop_declaration:
                raise KGTKException(
                    "Duplicated property definition of {} found!".format(node1))
        else:
            self.prop_types[node1] = node2

        prop = WDProperty(node1, self.datatype_mapping[node2])
        self.doc.kg.add_subject(prop)
        return True

    def generate_normal_triple(self, node1: str, property: str, node2: str,
                               is_qualifier_edge: bool, e_id: str) -> bool:
        if self.use_id:
            e_id = TripleGenerator.replace_illegal_string(e_id)
        entity = self._node_2_entity(node1)
        edge_type = self.prop_types[property]
        if edge_type == Item:
            object = WDItem(TripleGenerator.replace_illegal_string(node2))
        elif edge_type == WDProperty:
            object = WDProperty(TripleGenerator.replace_illegal_string(node2),
                                self.prop_types[node2])
        elif edge_type == TimeValue:
            if self.yyyy_mm_dd_pattern.match(node2):
                try:
                    dateTimeString = node2
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            elif self.yyyy_pattern.match(node2):
                try:
                    dateTimeString = node2 + "-01-01"
                    object = TimeValue(
                        value=dateTimeString,  # TODO
                        calendar=Item("Q1985727"),
                        precision=Precision.year,
                        time_zone=0,
                    )
                except:
                    return False
            else:
                try:
                    # TODO, in future, the two cases above will be dropped in principle to comply with the iso format
                    # now it is iso format
                    assert (node2[0] == "^")
                    node2 = node2[1:]  # remove ^
                    if node2.startswith("+"):
                        node2 = node2[1:]
                    dateTimeString, precision = node2.split("/")
                    dateTimeString = dateTimeString[:-1]  # remove Z
                    object = TimeValue(
                        value=dateTimeString,
                        calendar=Item("Q1985727"),
                        precision=precision,
                        time_zone=0,
                    )
                except:
                    return False
        elif edge_type == GlobeCoordinate:
            latitude, longitude = node2[1:].split("/")
            latitude = float(latitude)
            longitude = float(longitude)
            object = GlobeCoordinate(latitude, longitude, 0.0001,
                                     globe=Item("Q2"))  # earth
        elif edge_type == QuantityValue:
            # +70[+60,+80]Q743895
            res = self.quantity_pattern.match(node2).groups()
            amount, lower_bound, upper_bound, unit = res
            amount = TripleGenerator.clean_number_string(amount)
            num_type = self.xsd_number_type(amount)
            lower_bound = TripleGenerator.clean_number_string(lower_bound)
            upper_bound = TripleGenerator.clean_number_string(upper_bound)
            if unit != None:
                if upper_bound != None and lower_bound != None:
                    object = QuantityValue(amount,
                                           unit=Item(unit),
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount, unit=Item(unit), type=num_type)
            else:
                if upper_bound != None and lower_bound != None:
                    object = QuantityValue(amount,
                                           upper_bound=upper_bound,
                                           lower_bound=lower_bound,
                                           type=num_type)
                else:
                    object = QuantityValue(amount, type=num_type)
        elif edge_type == MonolingualText:
            text_string, lang = TripleGenerator.process_text_string(node2)
            object = MonolingualText(text_string, lang)
        elif edge_type == ExternalIdentifier:
            object = ExternalIdentifier(node2)
        elif edge_type == URLValue:
            if TripleGenerator.is_valid_uri_with_scheme_and_host(node2):
                object = URLValue(node2)
            else:
                return False
        else:
            # treat everything else as stringValue
            object = StringValue(node2)

        if type(object) == WDItem or type(object) == WDProperty:
            self.doc.kg.add_subject(object)

        if is_qualifier_edge:
            # edge: e8 p9 ^2013-01-01T00:00:00Z/11
            # create qualifier edge on previous STATEMENT and return the updated STATEMENT
            self.to_append_statement.add_qualifier(property, object)
            self.doc.kg.add_subject(self.to_append_statement)
        else:
            # edge: q1 p8 q2 e8
            # create brand new property edge and replace STATEMENT
            if self.truthy:
                self.to_append_statement = entity.add_truthy_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_truthy_statement(property, object)
            else:
                self.to_append_statement = entity.add_statement(
                    property, object, statement_id=e_id
                ) if self.use_id else entity.add_statement(property, object)
            self.doc.kg.add_subject(entity)
        return True

    def entry_point(self, line_number: int, edge: str):
        # print(line_number, edge)
        """
        Generates a list of two: the first element is the determination of the edge
        type using the corresponding edge type; the second element is a bool
        indicating whether this is a valid property edge or qualifier edge.
        Call corresponding downstream functions.
        """
        if line_number == 1:
            # initialize the order_map
            self.initialize_order_map(edge)
            return

        # use the order_map to map the node
        node1, node2, prop, e_id = self.parse_edges(edge)
        if line_number == 2:
            # by default a statement edge
            is_qualifier_edge = False
        else:
            if node1 != self.to_append_statement_id and node1 != self.corrupted_statement_id:
                is_qualifier_edge = False
                # also a new statement edge
                if self.read_num_of_lines >= self.n:
                    self.serialize()
            else:
                # qualifier edge or property declaration edge
                is_qualifier_edge = True
                if node1 == self.corrupted_statement_id:
                    self.warn_log.write(
                        "QUALIFIER edge at line [{}] associated of corrupted statement edge of id [{}] dropped.\n"
                        .format(line_number, self.corrupted_statement_id))
                    return

        if prop in self.label_set:
            success = self.generate_label_triple(node1, node2)
        elif prop in self.description_set:
            success = self.generate_description_triple(node1, node2)
        elif prop in self.alias_set:
            success = self.generate_alias_triple(node1, node2)
        elif prop == "data_type":
            # special edge of prop declaration
            success = self.generate_prop_declaration_triple(node1, node2)
        else:
            if prop in self.prop_types:
                success = self.generate_normal_triple(node1, prop, node2,
                                                      is_qualifier_edge, e_id)
            else:
                raise KGTKException(
                    "property [{}]'s type is unknown at line [{}].\n".format(
                        prop, line_number))

        if (not success) and self.warning:
            if not is_qualifier_edge:
                self.warn_log.write(
                    "CORRUPTED_STATEMENT edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
                self.corrupted_statement_id = e_id
            else:
                self.warn_log.write(
                    "CORRUPTED_QUALIFIER edge at line: [{}] with edge id [{}].\n"
                    .format(line_number, e_id))
        else:
            self.read_num_of_lines += 1
            if not is_qualifier_edge:
                self.to_append_statement_id = e_id

    @staticmethod
    def xsd_number_type(num):
        if isinstance(num, float) and 'e' in str(num).lower():
            return LiteralType.double
        return LiteralType.decimal
def generate_triples(user_id: str,
                     resolved_excel: list,
                     sparql_endpoint: str,
                     filetype: str = 'ttl') -> str:
    """
    This function uses ETK to generate the RDF triples
    :param user_id:
    :param resolved_excel:
    :param sparql_endpoint:
    :param filetype:
    :return:
    """
    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    # property_type_cache = {}
    is_error = False
    for i in resolved_excel:
        item = WDItem(i["statement"]["item"], creator='http://www.isi.edu/t2wml')
        try:
            property_type = property_type_map[i["statement"]["property"]]
        except KeyError:
            property_type = get_property_type(i["statement"]["property"],
                                              sparql_endpoint)
            property_type_map[i["statement"]["property"]] = property_type

        if property_type == "WikibaseItem":
            value = Item(str(i["statement"]["value"]))
        elif property_type == "WikibaseProperty":
            value = Property(i["statement"]["value"])
        elif property_type == "String":
            value = StringValue(i["statement"]["value"])
        elif property_type == "Quantity":
            value = QuantityValue(i["statement"]["value"])
        elif property_type == "Time":
            value = TimeValue(str(i["statement"]["value"]),
                              Item(i["statement"]["calendar"]),
                              translate_precision_to_integer(i["statement"]["precision"]),
                              i["statement"]["time_zone"])
        elif property_type == "Url":
            value = URLValue(i["statement"]["value"])
        elif property_type == "Monolingualtext":
            value = MonolingualText(i["statement"]["value"],
                                    i["statement"]["lang"])
        elif property_type == "ExternalId":
            value = ExternalIdentifier(i["statement"]["value"])
        elif property_type == "GlobeCoordinate":
            value = GlobeCoordinate(i["statement"]["latitude"],
                                    i["statement"]["longitude"],
                                    i["statement"]["precision"])
        elif property_type == "Property Not Found":
            is_error = True
            break

        s = item.add_statement(i["statement"]["property"], value)
        doc.kg.add_subject(item)

        if "qualifier" in i["statement"]:
            for j in i["statement"]["qualifier"]:
                try:
                    property_type = property_type_map[j["property"]]
                except KeyError:
                    property_type = get_property_type(j["property"],
                                                      sparql_endpoint)
                    property_type_map[j["property"]] = property_type

                if property_type == "WikibaseItem":
                    value = Item(str(j["value"]))
                elif property_type == "WikibaseProperty":
                    value = Property(j["value"])
                elif property_type == "String":
                    value = StringValue(j["value"])
                elif property_type == "Quantity":
                    value = QuantityValue(j["value"])
                elif property_type == "Time":
                    value = TimeValue(str(j["value"]), Item(j["calendar"]),
                                      j["precision"], j["time_zone"])
                elif property_type == "Url":
                    value = URLValue(j["value"])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(j["value"], j["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(j["value"])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(j["latitude"], j["longitude"],
                                            j["precision"])
                elif property_type == "Property Not Found":
                    is_error = True

                s.add_qualifier(j["property"], value)
            doc.kg.add_subject(s)

    if not is_error:
        data = doc.kg.serialize(filetype)
    else:
        data = "Property Not Found"

    # os.makedirs(Path.cwd() / "new_properties", exist_ok=True)
    # results_file_name = user_id + "_results.ttl"
    # changes_file_name = user_id + "_changes.tsv"
    # with open(Path(app.config['downloads']) / results_file_name, "w") as fp:
    #     fp.write(data)
    # with open(Path(app.config['downloads']) / changes_file_name, "w") as fp:
    #     serialize_change_record(fp)
    return data
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .

:Event a owl:Class ; .
:Entity a owl:Class ; .
:Organization a owl:Class ; .
:MOVEMENT_TRANSPORT a owl:Class ; .
:GeopoliticalEntity a owl:Class ; .

skos:prefLabel a owl:DatatypeProperty ;
    schema:domainIncludes :Entity, :Event ;
    rdfs:range xsd:string ; .
:conflict_attack_place a owl:ObjectProperty ;
    schema:domainIncludes :Entity, :Event ;
    schema:rangeIncludes :GeopoliticalEntity ; .
'''

ontology = Ontology(ontology_content,
                    validation=False,
                    include_undefined_class=True,
                    quiet=True)
kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
doc = etk.create_document(input_data)
docs = etk.process_ems(doc)
kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
with open('output.jsonl', 'w') as f:
    f.write('\n'.join(kgs))
with open('output.nt', 'w') as f:
    f.writelines(map(rdf_generation, kgs))
"news_story": { "type": "string" }, "similarity": { "type": "number" }, "matched_sentence": { "type": "string" }, "date": { "type": "string" } } } kg_schema = KGSchema(master_config) etk = ETK(kg_schema, ["./"]) # read the news news_file = open( '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl' ) # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl') news_stories = [ etk.create_document(json.loads(line), url=json.loads(line)['tld'], doc_id=json.loads(line)['doc_id']) for line in news_file ] results = list() for news_story in news_stories: results.extend(etk.process_ems(news_story))
import unittest, json

from etk.timeseries_processor import TimeseriesProcessor
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))
etk = ETK(kg_schema=kg_schema)


# python -m unittest etk.unit_tests.test_timeseries_processor to run all unittests
class TestTimeseriesProcessor(unittest.TestCase):
    def test_excel_file(self) -> None:
        annotation = 'etk/timeseries/DIESEL_june_annotation.json'
        spreadsheet = 'etk/unit_tests/ground_truth/DIESEL_june_2017.xlsx'
        timeseriesProcessor = TimeseriesProcessor(etk=etk,
                                                  annotation=annotation,
                                                  spreadsheet=spreadsheet)
        docs = [
            doc.cdr_document
            for doc in timeseriesProcessor.timeseries_extractor()
        ]
        selected_docs = docs[1]
        expected_metadata = {
            "name": "AVERAGE DIESEL (AUTOMATIVE GAS OIL) PRICES/ Litre NGN",
            "granularity": "monthly",
            "provenance": {
                "filename": "DIESEL_june_2017.xlsx",
"projects": [{ "name": "etk", "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep, Anika and others." }] }, { "projects": [{ "name": "rltk", "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students." }] }] etk = ETK(modules=ExampleETKModule) extractions = [] def mapper(sample, _idx): doc = etk.create_document(sample) docs = etk.process_ems(doc) sys.stdout.flush() re = docs[0].value # print(re) return re def collect(extracted): extractions.append(extracted) pp = ParallelProcessor(2, mapper=mapper,
        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main is for testing, and is not used in the DIG pipeline
if __name__ == "__main__":
    # Tell ETK the schema of the fields in the KG, the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any new documents created while
            # processing each doc
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")

        # for result in etk.process_ems(doc):
        #     # print(result.cdr_document["knowledge_graph"])
        #     f.write(json.dumps(result.cdr_document) + "\n")
def model_data() -> None:
    """
    This function generates triples for user defined properties for uploading them to wikidata
    :return:
    """
    stream = open(Path.cwd().parent / "Datasets/new-property-configuration.yaml",
                  'r', encoding='utf8')
    yaml_data = yaml.safe_load(stream)

    # initialize
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    # bind prefixes
    doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
    doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
    doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
    doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
    doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
    doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
    doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
    doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
    doc.kg.bind('p', 'http://www.wikidata.org/prop/')
    doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
    doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
    doc.kg.bind('prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
    doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
    doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
    doc.kg.bind('psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
    doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
    doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
    doc.kg.bind('pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
    doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
    doc.kg.bind('schema', 'http://schema.org/')

    sparql_endpoint = "https://query.wikidata.org/sparql"
    type_map = {'quantity': Datatype.QuantityValue, 'url': URLValue}
    property_type_cache = {}

    for k, v in yaml_data.items():
        p = WDProperty(k, type_map[v['type']], creator='http://www.isi.edu/t2wml')
        for lang, value in v['label'].items():
            for val in value:
                p.add_label(val, lang=lang)
        for lang, value in v['description'].items():
            for val in value:
                p.add_description(val, lang=lang)
        for pnode, items in v['statements'].items():
            for item in items:
                try:
                    property_type = property_type_cache[pnode]
                except KeyError:
                    property_type = get_property_type(pnode, sparql_endpoint)
                    property_type_cache[pnode] = property_type

                if property_type == "WikibaseItem":
                    value = Item(str(item['value']))
                elif property_type == "WikibaseProperty":
                    value = Property(item['value'])
                elif property_type == "String":
                    value = StringValue(item['value'])
                elif property_type == "Quantity":
                    value = QuantityValue(item['value'])
                elif property_type == "Time":
                    value = TimeValue(str(item['value']),
                                      Item(item["calendar"]),
                                      translate_precision_to_integer(item["precision"]),
                                      item["time_zone"])
                elif property_type == "Url":
                    value = URLValue(item['value'])
                elif property_type == "Monolingualtext":
                    value = MonolingualText(item['value'], item["lang"])
                elif property_type == "ExternalId":
                    value = ExternalIdentifier(item['value'])
                elif property_type == "GlobeCoordinate":
                    value = GlobeCoordinate(item["latitude"],
                                            item["longitude"],
                                            item["precision"])

                p.add_statement(pnode, value)
        doc.kg.add_subject(p)

    with open(Path.cwd().parent / "new_properties/result.ttl", "w") as f:
        data = doc.kg.serialize('ttl')
        f.write(data)
import os, sys, json

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))
etk = ETK(kg_schema, ["./ems"])
doc = etk.create_document(json.load(open('sample_html.jl', 'r')))
docs = etk.process_ems(doc)
print(json.dumps(docs[0].value, indent=2))
import os, sys, json, codecs

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from etk.etk import ETK
from etk.extractors.html_content_extractor import HTMLContentExtractor, Strategy
from etk.extractors.html_metadata_extractor import HTMLMetadataExtractor
from etk.extractors.inferlink_extractor import InferlinkExtractor, InferlinkRuleSet

# read sample file from disk
sample_html = json.load(codecs.open('sample_html.json', 'r'))

etk = ETK()
doc = etk.create_document(sample_html, mime_type="text/html", url="http://ex.com/123")

metadata_extractor = HTMLMetadataExtractor()
content_extractor = HTMLContentExtractor()
landmark_extractor = InferlinkExtractor(
    InferlinkRuleSet(
        InferlinkRuleSet.load_rules_file('sample_inferlink_rules.json')))

root = doc.select_segments("$")[0]
raw = doc.select_segments("$.raw_content")[0]

# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_title=True), "title")
# root.store_extractions(doc.invoke_extractor(metadata_extractor, extract_meta=True), "metadata")

root.store_extractions(
    doc.invoke_extractor(content_extractor, raw, strategy=Strategy.ALL_TEXT),
    "etk2_text")
root.store_extractions(
        projects = doc.select_segments("projects[*]")
        for d, p in zip(descriptions, projects):
            names = doc.extract(self.rule_extractor, d)
            p.store(names, "members")
        return list()


if __name__ == "__main__":
    sample_input = {
        "projects": [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }]
    }

    etk = ETK(modules=RuleETKModule)
    doc = etk.create_document(sample_input)
    docs = etk.process_ems(doc)
    print(json.dumps(docs[0].value, indent=2))