Example #1
 def reset_etk_doc(self,
                   doc_id: str = "http://isi.edu/default-ns/projects"):
     """
     reset the doc object and return it. Called at initialization and after outputting triples.
     """
     kg_schema = KGSchema()
     kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
     self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
     self.doc = self.etk.create_document({}, doc_id=doc_id)
     for k, v in wiki_namespaces.items():
         if k in self.prefix_dict:
             self.doc.kg.bind(k, self.prefix_dict[k])
         else:
             self.doc.kg.bind(k, v)
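Example #1 only rebuilds the ETK document; the docstring's "called after outputting triples" implies a serialize-then-reset cycle. A minimal sketch of such a driver, assuming the surrounding class and reusing the doc.kg.serialize('ttl') call that appears in the later examples:

 def flush_triples(self, dest):
     # Hypothetical helper (not in the source): write the accumulated triples,
     # then start a fresh document for the next batch.
     dest.write(self.doc.kg.serialize("ttl"))
     self.reset_etk_doc()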
Example #2
    def setUp(self):
        sample_doc = {
            "projects": [{
                "name": "etk",
                "description":
                "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                "members": ["dongyu", "amandeep", "sylvia", "Runqi12"],
                "date": "2007-12-05",
                "place": "columbus:georgia:united states:-84.98771:32.46098",
                "s": "segment_test_1"
            }, {
                "name": "rltk",
                "description":
                "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                "members": ["mayank", "yixiang"],
                "date": ["2007-12-05T23:19:00"],
                "cost": -3213.32,
                "s": "segment_test_2"
            }]
        }
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema)
        self.doc = etk.create_document(sample_doc)
Example #3
    def test_EmailExtractor(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)

        text = "[email protected] [email protected] " \
               "[email protected] [email protected]  E-mail:[email protected] [email protected]"

        email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                         tokenizer=etk.default_tokenizer,
                                         extractor_name="email_extractor")

        extractions = email_extractor.extract(text)

        extracted = []
        for i in extractions:
            extracted_value = {
                "value":
                i.value,
                "start_char":
                i.provenance["start_char"],
                "end_char":
                i.provenance["end_char"],
                "value_from_text":
                text[i.provenance["start_char"]:i.provenance["end_char"]]
            }
            extracted.append(extracted_value)
            self.assertEqual(extracted_value["value"],
                             extracted_value["value_from_text"])

        expected = [{
            'value': '*****@*****.**',
            'start_char': 97,
            'end_char': 122,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 0,
            'end_char': 16,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 77,
            'end_char': 96,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 17,
            'end_char': 40,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 51,
            'end_char': 68,
            'value_from_text': '*****@*****.**'
        }]

        self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                         sorted(extracted, key=lambda x: x["start_char"]))
Example #4
 def setUp(self):
     ontology_content = '''
             @prefix : <http://dig.isi.edu/ontologies/dig/> .
             @prefix owl: <http://www.w3.org/2002/07/owl#> .
             @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
             @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
             @prefix schema: <http://schema.org/> .
             @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
             :Person a owl:Class ;
                 rdfs:subClassOf :Actor, :Biological_Object ;
                 :common_properties :label, :title, :religion ; .
             :has_name a owl:DatatypeProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes xsd:string ; .
             :has_child a owl:ObjectProperty ;
                 schema:domainIncludes :Person ;
                 schema:rangeIncludes :Person ; .
         '''
     ontology = Ontology(ontology_content,
                         validation=False,
                         include_undefined_class=True,
                         quiet=True)
     kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
     etk = ETK(kg_schema=kg_schema,
               ontology=ontology,
               generate_json_ld=True)
     etk2 = ETK(kg_schema=kg_schema,
                ontology=ontology,
                generate_json_ld=False)
     self.doc = etk.create_document(dict(),
                                    doc_id='http://xxx/1',
                                    type_=[DIG.Person.toPython()])
     self.doc2 = etk2.create_document(dict(),
                                      doc_id='http://xxx/2',
                                      type_=[DIG.Person.toPython()])
Example #5
    def test_segment(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema)
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
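This test (like the provenance test in Example #18 below) reads a module-level sample_input that the snippet does not show. A plausible minimal fixture, inferred from the expected descriptions above, would be:

sample_input = {
    "projects": [{
        "name": "etk",
        "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
    }, {
        "name": "rltk",
        "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
    }]
}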
Example #6
 def __init__(
     self,
     propFile: str,
     labelSet: str,
     aliasSet: str,
     descriptionSet: str,
     n: int,
     destFp: TextIO = sys.stdout,
 ):
     self.propTypes = self.__setPropTypes(propFile)
     self.labelSet, self.aliasSet, self.descriptionSet = self.__setSets(
         labelSet, aliasSet, descriptionSet)
     # TODO handle standard output
     self.fp = destFp
     self.n = int(n)
     self.read = 0
     # serialize prefix
     kg_schema = KGSchema()
     kg_schema.add_schema("@prefix : <http://isi.edu/> .", "ttl")
     self.etk = ETK(kg_schema=kg_schema, modules=ETKModule)
     self.doc = self.__setDoc()
     self.__serialize_prefix()
Example #7
    def __init__(self,
                 master_config,
                 em_paths,
                 logger,
                 worker_id,
                 project_name,
                 kafka_input_args=None,
                 kafka_output_args=None):
        self.logger = logger
        self.worker_id = worker_id
        self.check_interval = 1000
        self.exit_sign = False

        try:
            kg_schema = KGSchema(master_config)
            self.etk_ins = ETK(kg_schema, em_paths, logger=logger)
        except Exception as e:
            logger.exception('ETK initialization failed')
            raise e

        # kafka input
        self.kafka_input_server = config['input_server']
        self.kafka_input_session_timeout = config['input_session_timeout']
        self.kafka_input_group_id = config['input_group_id']
        self.kafka_input_topic = '{project_name}_in'.format(
            project_name=project_name)
        self.kafka_input_args = dict(
        ) if kafka_input_args is None else kafka_input_args
        self.kafka_consumer = KafkaConsumer(
            bootstrap_servers=self.kafka_input_server,
            group_id=self.kafka_input_group_id,
            consumer_timeout_ms=self.check_interval,
            value_deserializer=lambda v: json.loads(v.decode('utf-8')),
            **self.kafka_input_args)
        self.kafka_consumer.subscribe([self.kafka_input_topic])

        # kafka output
        self.kafka_output_server = config['output_server']
        self.kafka_output_topic = '{project_name}_out'.format(
            project_name=project_name)
        self.kafka_output_args = dict(
        ) if kafka_output_args is None else kafka_output_args
        self.kafka_producer = KafkaProducer(
            bootstrap_servers=self.kafka_output_server,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            **self.kafka_output_args)

        self.timeout_count = self.kafka_input_session_timeout / self.check_interval
        self.current_timeout_count = 0
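The worker above pulls its Kafka settings from a module-level config mapping that the snippet does not define. A minimal sketch with placeholder values (the hostnames, timeout, and group id are assumptions, not taken from the source):

config = {
    "input_server": ["localhost:9092"],   # bootstrap servers for the consumer (placeholder)
    "input_session_timeout": 60000,       # session timeout in milliseconds (placeholder)
    "input_group_id": "etk_workers",      # consumer group id (placeholder)
    "output_server": ["localhost:9092"],  # bootstrap servers for the producer (placeholder)
}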
Example #8
import unittest, json
from etk.timeseries_processor import TimeseriesProcessor
from etk.etk import ETK
from etk.knowledge_graph import KGSchema

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))

etk = ETK(kg_schema=kg_schema)

# python -m unittest etk.unit_tests.test_timeseries_processor to run all unittests


class TestTimeseriesProcessor(unittest.TestCase):
    def test_excel_file(self) -> None:
        annotation = 'etk/timeseries/DIESEL_june_annotation.json'
        spreadsheet = 'etk/unit_tests/ground_truth/DIESEL_june_2017.xlsx'

        timeseriesProcessor = TimeseriesProcessor(etk=etk,
                                                  annotation=annotation,
                                                  spreadsheet=spreadsheet)
        docs = [
            doc.cdr_document
            for doc in timeseriesProcessor.timeseries_extractor()
        ]
        selected_docs = docs[1]
        expected_metadata = {
            "name": "AVERAGE DIESEL (AUTOMATIVE GAS OIL) PRICES/ Litre NGN",
            "granularity": "monthly",
            "provenance": {
                "filename": "DIESEL_june_2017.xlsx",
Example #9
    ontology = """
@prefix : <http://isi.edu/xij-rule-set#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
:Software a owl:Class ;
          rdfs:label "Software" .
:Person a owl:Class ;
        rdfs:label "Person" .
:Developer a owl:Class ;
           rdfs:label "Developer" .
:name a owl:DatatypeProperty ;
      rdf:domain :Person ;
      rdf:range xsd:string .
:developer a owl:ObjectProperty ;
           rdfs:label "developer" ;
           rdf:domain :Software ;
           rdf:range :Developer .
    """
    kg_schema = KGSchema()
    kg_schema.add_schema(ontology, 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document(sample_input,
                              doc_id="http://isi.edu/default-ns/projects")

    docs = etk.process_ems(doc)

    print(docs[0].kg.serialize('ttl'))
Example #10
    def model_statement(self):
        # initialize KGSchema
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict={self.ns: self.uri})

        # extract files
        self.extract_files()

        # model statement
        inputs = self.data['inputs']
        for k, v in inputs.items():
            if k != 'metadata':
                # construct wikifier instance
                if k == 'wikifier' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A wikifier file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1001',
                        namespace=self.ns))  # an instance of Wikifier
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP3003',
                                    StringValue(v['content']),
                                    namespace=self.ns)  # hasFileContent
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)  # hashValue

                # construct mapping_file instance
                elif k == 'mappingFile' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label('A mapping file for ' +
                                inputs['dataset']['content']['filename'],
                                lang='en')
                    q.add_statement('P31', Item(
                        'SDQ1002',
                        namespace=self.ns))  # an instance of MappingFile
                    q.add_statement('P170', StringValue('T2WML'))
                    q.add_statement('P127', Item('SDQ1003', namespace=self.ns))
                    q.add_statement('SDP3003',
                                    StringValue(json.dumps(v['content'])),
                                    namespace=self.ns)
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                # construct dataset instance
                elif k == 'dataset' and not v['existed']:
                    q = WDItem(v['qnode'],
                               namespace=self.ns,
                               creator=':datamart')
                    q.add_label(v['content']['title'], lang='en')
                    q.add_description(v['content']['description'], lang='en')
                    q.add_statement('P31',
                                    Item('Q1172284'))  # an instance of Dataset
                    q.add_statement('SDP3001',
                                    Item(inputs['wikifier']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a wikifier file
                    q.add_statement('SDP3002',
                                    Item(inputs['mappingFile']['qnode'],
                                         namespace=self.ns),
                                    namespace=self.ns)  # a mapping file
                    q.add_statement('P1476', StringValue(
                        v['content']['title']))  # title
                    q.add_statement(
                        'P921',
                        StringValue(v['content']['description']))  # described
                    q.add_statement('P127',
                                    Item('SDQ1003',
                                         namespace=self.ns))  # belongs to
                    q.add_statement('SDP2004',
                                    StringValue(', '.join(
                                        v['content']['keywords'])),
                                    namespace=self.ns)  # keywords
                    q.add_statement('SDP3004',
                                    StringValue(v['hashcode']),
                                    namespace=self.ns)

                    if self.data['storeColumnValue']:
                        for data in v['content']['variable_measured']:
                            statement = q.add_statement(
                                'SDP2005',
                                StringValue(data['column_name']),
                                namespace=self.ns)  # variable measured
                            statement.add_qualifier(
                                'SDP2006',
                                StringValue(data['values_of_a_column']),
                                namespace=self.ns)  # the values of a column
                            statement.add_qualifier(
                                'SDP2007',
                                Item(data['data_structure_type']),
                                namespace=self.ns)  # data structure type
                            statement.add_qualifier(
                                'SDP2008',
                                URLValue(data['semantic_type_identifier']),
                                namespace=self.ns)  # semantic type
                            statement.add_qualifier(
                                'P1545',
                                QuantityValue(
                                    data['column_index'],
                                    namespace=self.ns))  # column index

                doc.kg.add_subject(q)

        return doc
Example #11
    def model_schema(self):
        # read data
        data = self.read_data(self.data['schema'])

        # initialize KGSchema
        custom_dict, ns_dict = {}, {'wd': 'http://www.wikidata.org/entity/'}
        for each in data['prefix']:
            for k, v in each.items():
                custom_dict[k] = v
                if k != 'wd':
                    ns_dict[k] = v + '/entity'
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id='http://isi.edu/default-ns/projects')

        # bind prefix
        doc = create_custom_prefix(doc, custom_dict)

        type_map = {
            'quantity': Datatype.QuantityValue,
            'url': URLValue,
            'item': Datatype.Item,
            'time': Datatype.TimeValue,
            'string': Datatype.StringValue,
            'text': Datatype.MonolingualText
        }

        # model schema
        for k, v in data.items():
            if ':' in k:
                k = k.split(':')
                if 'Q' in k[1]:
                    p = WDItem(k[1], namespace=k[0], creator=':datamart')
                elif 'P' in k[1]:
                    p = WDProperty(k[1],
                                   type_map[v['type']],
                                   namespace=k[0],
                                   creator=':datamart')
                else:
                    raise Exception('There is no P/Q information.')

                for lang, value in v['description'].items():
                    for val in value:
                        p.add_description(val, lang=lang)

                for lang, value in v['label'].items():
                    for val in value:
                        p.add_label(val, lang=lang)

                for node, value in v['statements'].items():
                    ns = node.split(':')[0] if ':' in node else 'wd'
                    for val in value:
                        prop_type = self.get_property_type(node, ns_dict[ns])
                        if prop_type == 'WikibaseItem':
                            v = Item(str(val['value']))
                        elif prop_type == 'WikibaseProperty':
                            v = Property(val['value'])
                        elif prop_type == 'String':
                            v = StringValue(val['value'])
                        elif prop_type == 'Quantity':
                            v = QuantityValue(val['value'])
                        elif prop_type == 'Url':
                            v = URLValue(val['value'])
                        elif prop_type == 'Monolingualtext':
                            v = MonolingualText(val['value'], val['lang'])
                        p.add_statement(node, v)
                doc.kg.add_subject(p)

        return doc
Example #12
import os, sys, json
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from etk.etk import ETK
from etk.knowledge_graph import KGSchema
from examples.config_to_em.em_base_generator import EmBaseGenerator

ebg = EmBaseGenerator('template.tpl')
ebg.generate_em_base('master_config.json', 'ems/em_base.py')

kg_schema = KGSchema(json.load(open("master_config.json", "r")))

etk = ETK(kg_schema, ["./ems"])

doc = etk.create_document(json.load(open('sample_html.jl', 'r')))

docs = etk.process_ems(doc)

print(json.dumps(docs[0].value, indent=2))
Example #13
    def test_KnowledgeGraph_provenance(self) -> None:
        sample_doc = {
            "projects": [
                {
                    "name": "etk",
                    "description": "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others.",
                    "members": [
                        "dongyu",
                        "amandeep",
                        "sylvia",
                        "Runqi12"
                    ],
                    "date": "2007-12-05",
                    "place": "columbus:georgia:united states:-84.98771:32.46098"
                },
                {
                    "name": "rltk",
                    "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
                    "members": [
                        "mayank",
                        "yixiang"
                    ],
                    "date": ["2007-12-05T23:19:00"],
                    "cost": -3213.32
                }
            ]
        }

        kg_schema = KGSchema(json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema)
        doc = etk.create_document(sample_doc)

        try:
            doc.kg.add_value("developer", json_path="projects[*].members[*]")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_date", json_path="projects[*].date[*]")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_add_value_date", value=[date(2018, 3, 28), {}, datetime(2018, 3, 28, 1, 1, 1)],
                             json_path_extraction="projects[0].date")
        except KgValueError:
            pass

        try:
            doc.kg.add_value("test_location", json_path="projects[*].place")
        except KgValueError:
            pass

        # print (json.dumps(doc.value, indent=2))

        expeced_provenances = [
            {
                "@id": 0,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "dongyu",
                "json_path": "projects.[0].members.[0]"
            },
            {
                "@id": 1,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "amandeep",
                "json_path": "projects.[0].members.[1]"
            },
            {
                "@id": 2,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "sylvia",
                "json_path": "projects.[0].members.[2]"
            },
            {
                "@id": 3,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "Runqi12",
                "json_path": "projects.[0].members.[3]"
            },
            {
                "@id": 4,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "mayank",
                "json_path": "projects.[1].members.[0]"
            },
            {
                "@id": 5,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "yixiang",
                "json_path": "projects.[1].members.[1]"
            },
            {
                "@id": 6,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "2007-12-05T00:00:00",
                "json_path": "projects.[0].date.[0]"
            },
            {
                "@id": 7,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "2007-12-05T23:19:00",
                "json_path": "projects.[1].date.[0]"
            },
            {
                "@id": 8,
                "@type": "kg_provenance_record",
                "reference_type": "constant",
                "value": "2018-03-28",
                "json_path": "projects[0].date"
            },
            {
                "@id": 9,
                "@type": "kg_provenance_record",
                "reference_type": "constant",
                "value": "2018-03-28T01:01:01",
                "json_path": "projects[0].date"
            },
            {
                "@id": 10,
                "@type": "kg_provenance_record",
                "reference_type": "location",
                "value": "columbus:georgia:united states:-84.98771:32.46098",
                "json_path": "projects.[0].place"
            }
        ]

        self.assertEqual(expeced_provenances, doc.value["provenances"])
Example #14
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix schema: <http://schema.org/> .
:Event a owl:Class ; .
:Entity a owl:Class ; .
:Organization a owl:Class ; .
:MOVEMENT_TRANSPORT a owl:Class ; .
:GeopoliticalEntity a owl:Class ; .
skos:prefLabel a owl:DatatypeProperty ; 
    schema:domainIncludes :Entity, :Event ;
    rdfs:range xsd:string ; .
:conflict_attack_place a owl:ObjectProperty ;
    schema:domainIncludes :Entity, :Event ;
    schema:rangeIncludes :GeopoliticalEntity ; .
    '''

    ontology = Ontology(ontology_content,
                        validation=False,
                        include_undefined_class=True,
                        quiet=True)
    kg_schema = KGSchema(ontology.merge_with_master_config(dict()))
    etk = ETK(modules=ExampleETKModule, kg_schema=kg_schema, ontology=ontology)
    input_data = {'doc_id': '1', 'data': json.loads(sample_input)}
    doc = etk.create_document(input_data)
    docs = etk.process_ems(doc)
    kgs = [json.dumps(doc.kg.value) for doc in docs[1:]]
    with open('output.jsonl', 'w') as f:
        f.write('\n'.join(kgs))
    with open('output.nt', 'w') as f:
        f.writelines(map(rdf_generation, kgs))
Example #15
            },
            "news_story": {
                "type": "string"
            },
            "similarity": {
                "type": "number"
            },
            "matched_sentence": {
                "type": "string"
            },
            "date": {
                "type": "string"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./"])

    # read the news
    news_file = open(
        '/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/new_2018-04-03-first-10000.jl'
    )
    # news_file = open('/Users/amandeep/Github/etk/examples/ranking_pipeline/resources/news_stories_3.jl')
    news_stories = [
        etk.create_document(json.loads(line),
                            url=json.loads(line)['tld'],
                            doc_id=json.loads(line)['doc_id'])
        for line in news_file
    ]
    results = list()
    for news_story in news_stories:
Example #16
def main():
    filename = sys.argv[1]
    query_title = sys.argv[2]
    ranking_criteria = sys.argv[3]
    top_k = int(sys.argv[4])

    if ranking_criteria not in ('TITLE', 'SENTENCE'):
        print('Wrong mode! Please check the input argument!')
        return

    master_config = {
        "fields": {
            "developer": {
                "type": "string"
            },
            "student_developer": {
                "type": "string"
            },
            "spacy_name": {
                "type": "string"
            },
            "date": {
                "type": "date"
            }
        }
    }
    kg_schema = KGSchema(master_config)
    etk = ETK(kg_schema, ["./extraction_modules/"])
    nlp = spacy.load('en_core_web_lg')

    date_extractor = DateExtractor(etk=etk)

    queries = dict()
    queries_ent_map = dict()

    with open(query_title) as f:
        for line in f:
            orig_ifp_title = line
            # remove date information from query term
            res = date_extractor.extract(text=line)
            start, end = float('inf'), -1
            for i in res:
                start = min(start, i.provenance['start_char'])
                end = max(end, i.provenance['end_char'])
            # delete date from query term
            if len(res) != 0:
                line = line[:start] + line[end+1:]

            queries[orig_ifp_title] = line
            queries_ent_map[line] = list()
            # extract entities from query term
            doc = nlp(line)
            for ent in doc.ents:
                queries_ent_map[line].append(re.escape(ent.text.strip()))
            # remove empty entities
            queries_ent_map[line] = list(filter(bool, queries_ent_map[line]))

    # the list of selected docs for given query term
    query_docs_mapping = dict()

    docs = list()
    with open(filename) as f:
        for line in f:
            json_obj = json.loads(line)
            docs.append(etk.create_document(json_obj))

    ds = DefaultDocumentSelector()

    for orig_query, proc_query in queries.items():
        content_regex = queries_ent_map[proc_query]
        query_docs_mapping[proc_query] = list()
        for doc in docs:
            if len(content_regex) == 0 \
                    or ds.select_document(document=doc,
                              json_paths=['$.lexisnexis.doc_description'],
                              json_paths_regex=content_regex):
                query_docs_mapping[proc_query].append(doc)

    # TODO: pass ifp_id in
    for orig_query, proc_query in queries.items():
        # print(len(query_docs_mapping[proc_query]))
        dr_processor = DocRetrieveProcessor(etk=etk, ifp_id="1233", ifp_title=proc_query, orig_ifp_title=orig_query)
        heap = list()
        for doc in query_docs_mapping[proc_query]:
            processed_doc = dict()

            if ranking_criteria == 'SENTENCE':
                processed_doc = dr_processor.process_by_sentence(doc=doc, threshold=0).cdr_document
            elif ranking_criteria == 'TITLE':
                processed_doc = dr_processor.process_by_title(doc=doc, threshold=0).cdr_document

            if len(heap) < top_k:
                heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))
            else:
                if processed_doc['similarity'] > heap[0][0]:
                    heappush(heap, (processed_doc['similarity'], processed_doc['date'], processed_doc))

        heap.sort(reverse=True)

        output_filename = './resources/output/' + orig_query + "_result.jl"

        with open(output_filename, 'a+b') as f:
            for item in heap:
                print(item[0])
                jl_str = json.dumps(item[2]) + '\n'
                f.write(jl_str.encode())
Example #17
        douglas.add_statement('P2048', QuantityValue(1.96,
                                                     unit=Item('Q11573')))

        # official website
        # statement = douglas.add_statement('P856', URLValue('http://douglasadams.com/'))
        statement = douglas.add_truthy_statement(
            'P856', URLValue('http://douglasadams.com/'))

        statement.add_qualifier('P407', Item('Q1860'))

        # Freebase ID
        douglas.add_statement(
            'P646',
            ExternalIdentifier('/m/0282x', URLValue('http://g.co/kg/m/0282x')))

        doc.kg.add_subject(douglas)
        return list()


if __name__ == "__main__":
    kg_schema = KGSchema()
    kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
    etk = ETK(kg_schema=kg_schema, modules=ExampleETKModule)
    doc = etk.create_document({}, doc_id="http://isi.edu/default-ns/projects")

    revise(True)

    docs = etk.process_ems(doc)

    print(docs[0].kg.serialize('ttl'))
Example #18
    def test_Provenance(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        self.etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)
        g = [
            'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep',
            'yixiang'
        ]
        self.name_extractor = GlossaryExtractor(g,
                                                "name_extractor",
                                                self.etk.default_tokenizer,
                                                case_sensitive=False,
                                                ngrams=1)
        doc = self.etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.name_extractor, d)
            p.store(names, "members")

        expected_provenances = [{
            "@id": 0,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 33,
                "end_char": 38
            }
        }, {
            "@id": 1,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 40,
                "end_char": 46
            }
        }, {
            "@id": 2,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 48,
                "end_char": 54
            }
        }, {
            "@id": 3,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 56,
                "end_char": 64
            }
        }, {
            "@id": 4,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[0].members",
            "parent_provenances": {
                "Runqi": 0,
                "Dongyu": 1,
                "Sylvia": 2,
                "Amandeep": 3
            }
        }, {
            "@id": 5,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 39,
                "end_char": 44
            }
        }, {
            "@id": 6,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 46,
                "end_char": 52
            }
        }, {
            "@id": 7,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 54,
                "end_char": 61
            }
        }, {
            "@id": 8,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[1].members",
            "parent_provenances": {
                "Pedro": 5,
                "Mayank": 6,
                "Yixiang": 7
            }
        }]
        expected_projects = [{
            "name":
            "etk",
            "description":
            "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
            "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]
        }, {
            "name": "rltk",
            "description":
            "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
            "members": ["Pedro", "Mayank", "Yixiang"]
        }]
        self.assertEqual(expected_projects, doc.value["projects"])
        self.assertEqual(expected_provenances, doc.value["provenances"])
Example #19
    def _init_etk():
        # initialize for etk
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        doc = etk.create_document({},
                                  doc_id="http://isi.edu/default-ns/projects")

        # bind prefixes
        doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
        doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
        doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
        doc.kg.bind('wdtn', 'http://www.wikidata.org/prop/direct-normalized/')
        doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
        doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
        doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
        doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
        doc.kg.bind('p', 'http://www.wikidata.org/prop/')
        doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
        doc.kg.bind('prv', 'http://www.wikidata.org/prop/reference/value/')
        doc.kg.bind(
            'prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
        doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
        doc.kg.bind('psv', 'http://www.wikidata.org/prop/statement/value/')
        doc.kg.bind(
            'psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
        doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
        doc.kg.bind('pqv', 'http://www.wikidata.org/prop/qualifier/value/')
        doc.kg.bind(
            'pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
        doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
        doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
        doc.kg.bind('schema', 'http://schema.org/')

        # give definitions of the nodes we defined
        p = WDProperty('C2001', Datatype.MonolingualText)
        p.add_label('datamart identifier', lang='en')
        p.add_description('identifier of a dataset in the Datamart system',
                          lang='en')
        p.add_statement('P31', Item('Q19847637'))
        p.add_statement('P1629', Item('Q1172284'))
        doc.kg.add_subject(p)

        p = WDProperty('C2004', Datatype.StringValue)
        p.add_label('keywords', lang='en')
        p.add_description(
            'keywords associated with an item to facilitate finding the item using text search',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2005', Datatype.StringValue)
        p.add_label('variable measured', lang='en')
        p.add_description('the variables measured in a dataset', lang='en')
        p.add_statement('P31', Item('Q18616576'))
        p.add_statement('P1628',
                        URLValue('http://schema.org/variableMeasured'))
        doc.kg.add_subject(p)

        p = WDProperty('C2006', Datatype.StringValue)
        p.add_label('values', lang='en')
        p.add_description(
            'the values of a variable represented as a text document',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2007', Datatype.Item)
        p.add_label('data type', lang='en')
        p.add_description(
            'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
            'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2008', Datatype.URLValue)
        p.add_label('semantic type', lang='en')
        p.add_description(
            'a URL that identifies the semantic type of a variable in a dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2010', Datatype.StringValue)
        p.add_label('extra information', lang='en')
        p.add_description(
            'some extra information that may needed for this dataset',
            lang='en')
        doc.kg.add_subject(p)

        p = WDProperty('C2011', Datatype.TimeValue)
        p.add_label('start date', lang='en')
        p.add_description(
            'The earlist time exist in this dataset, only valid when there exists time format data in this dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2012', Datatype.TimeValue)
        p.add_label('end date', lang='en')
        p.add_description(
            'The latest time exist in this dataset, only valid when there exists time format data in this dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2013', Datatype.QuantityValue)
        p.add_label('time granularity', lang='en')
        p.add_description('time granularity in a dataset', lang='en')
        p.add_statement('P31', Item('Q18616576'))
        doc.kg.add_subject(p)

        p = WDProperty('C2014', Datatype.StringValue)
        p.add_label('uploader information', lang='en')
        p.add_description('information about who uploaded and when uploaded',
                          lang='en')
        doc.kg.add_subject(p)
        return doc
Example #20
    def __init__(self, query_server=None, update_server=None):
        self.punctuation_table = str.maketrans(
            dict.fromkeys(string.punctuation))
        if query_server and update_server:
            self.query_server = query_server
            self.update_server = update_server
        else:
            self.query_server = DATAMRT_SERVER
            self.update_server = DATAMRT_SERVER

        # initialize
        kg_schema = KGSchema()
        kg_schema.add_schema('@prefix : <http://isi.edu/> .', 'ttl')
        etk = ETK(kg_schema=kg_schema, modules=ETKModule)
        self.doc = etk.create_document(
            {}, doc_id="http://isi.edu/default-ns/projects")

        # bind prefixes
        self.doc.kg.bind('wikibase', 'http://wikiba.se/ontology#')
        self.doc.kg.bind('wd', 'http://www.wikidata.org/entity/')
        self.doc.kg.bind('wdt', 'http://www.wikidata.org/prop/direct/')
        self.doc.kg.bind('wdtn',
                         'http://www.wikidata.org/prop/direct-normalized/')
        self.doc.kg.bind('wdno', 'http://www.wikidata.org/prop/novalue/')
        self.doc.kg.bind('wds', 'http://www.wikidata.org/entity/statement/')
        self.doc.kg.bind('wdv', 'http://www.wikidata.org/value/')
        self.doc.kg.bind('wdref', 'http://www.wikidata.org/reference/')
        self.doc.kg.bind('p', 'http://www.wikidata.org/prop/')
        self.doc.kg.bind('pr', 'http://www.wikidata.org/prop/reference/')
        self.doc.kg.bind('prv',
                         'http://www.wikidata.org/prop/reference/value/')
        self.doc.kg.bind(
            'prn', 'http://www.wikidata.org/prop/reference/value-normalized/')
        self.doc.kg.bind('ps', 'http://www.wikidata.org/prop/statement/')
        self.doc.kg.bind('psv',
                         'http://www.wikidata.org/prop/statement/value/')
        self.doc.kg.bind(
            'psn', 'http://www.wikidata.org/prop/statement/value-normalized/')
        self.doc.kg.bind('pq', 'http://www.wikidata.org/prop/qualifier/')
        self.doc.kg.bind('pqv',
                         'http://www.wikidata.org/prop/qualifier/value/')
        self.doc.kg.bind(
            'pqn', 'http://www.wikidata.org/prop/qualifier/value-normalized/')
        self.doc.kg.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
        self.doc.kg.bind('prov', 'http://www.w3.org/ns/prov#')
        self.doc.kg.bind('schema', 'http://schema.org/')

        # give definitions of the nodes we defined
        p = WDProperty('C2001', Datatype.MonolingualText)
        p.add_label('keywords', lang='en')
        p.add_description('identifier of a dataset in the Datamart system',
                          lang='en')
        p.add_statement('P31', Item('Q19847637'))
        p.add_statement('P1629', Item('Q1172284'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2004', Datatype.StringValue)
        p.add_label('datamart identifier', lang='en')
        p.add_description(
            'keywords associated with an item to facilitate finding the item using text search',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2005', Datatype.StringValue)
        p.add_label('variable measured', lang='en')
        p.add_description('the variables measured in a dataset', lang='en')
        p.add_statement('P31', Item('Q18616576'))
        p.add_statement('P1628',
                        URLValue('http://schema.org/variableMeasured'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2006', Datatype.StringValue)
        p.add_label('values', lang='en')
        p.add_description(
            'the values of a variable represented as a text document',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2007', Datatype.Item)
        p.add_label('data type', lang='en')
        p.add_description(
            'the data type used to represent the values of a variable, integer (Q729138), Boolean (Q520777), '
            'Real (Q4385701), String (Q184754), Categorical (Q2285707)',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        p = WDProperty('C2008', Datatype.URLValue)
        p.add_label('semantic type', lang='en')
        p.add_description(
            'a URL that identifies the semantic type of a variable in a dataset',
            lang='en')
        p.add_statement('P31', Item('Q18616576'))
        self.doc.kg.add_subject(p)

        # get the starting source id
        sparql_query = """
            prefix wdt: <http://www.wikidata.org/prop/direct/>
            prefix wd: <http://www.wikidata.org/entity/>
            prefix wikibase: <http://wikiba.se/ontology#>
            PREFIX p: <http://www.wikidata.org/prop/>
            PREFIX pqv: <http://www.wikidata.org/prop/qualifier/value/>
            PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
            PREFIX ps: <http://www.wikidata.org/prop/statement/>
            prefix bd: <http://www.bigdata.com/rdf#>
            prefix bds: <http://www.bigdata.com/rdf/search#>

            select ?x where {
              wd:Z00000 wdt:P1114 ?x .
            }
            """
        try:
            sparql = SPARQLWrapper(self.query_server)
            sparql.setQuery(sparql_query)
            sparql.setReturnFormat(JSON)
            sparql.setMethod(POST)
            sparql.setRequestMethod(URLENCODED)
            results = sparql.query().convert()['results']['bindings']
        except:
            print("Getting query of wiki data failed!")
            raise ValueError("Unable to initialize the datamart query service")
        if not results:
            print(
                "[WARNING] No starting source id found! Will initialize the starting source with D1000001"
            )
            self.resource_id = 1000001
        else:
            self.resource_id = 1000001