Example #1
    def test_EmailExtractor(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema, use_spacy_tokenizer=True)

        text = "[email protected] [email protected] " \
               "[email protected] [email protected]  E-mail:[email protected] [email protected]"

        email_extractor = EmailExtractor(nlp=etk.default_nlp,
                                         tokenizer=etk.default_tokenizer,
                                         extractor_name="email_extractor")

        extractions = email_extractor.extract(text)

        extracted = []
        for i in extractions:
            extracted_value = {
                "value": i.value,
                "start_char": i.provenance["start_char"],
                "end_char": i.provenance["end_char"],
                "value_from_text":
                    text[i.provenance["start_char"]:i.provenance["end_char"]]
            }
            extracted.append(extracted_value)
            self.assertEqual(extracted_value["value"],
                             extracted_value["value_from_text"])

        expected = [{
            'value': '*****@*****.**',
            'start_char': 97,
            'end_char': 122,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 0,
            'end_char': 16,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 77,
            'end_char': 96,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 17,
            'end_char': 40,
            'value_from_text': '*****@*****.**'
        }, {
            'value': '*****@*****.**',
            'start_char': 51,
            'end_char': 68,
            'value_from_text': '*****@*****.**'
        }]

        self.assertEqual(sorted(expected, key=lambda x: x["start_char"]),
                         sorted(extracted, key=lambda x: x["start_char"]))
Example #2
    def test_segment(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        etk = ETK(kg_schema=kg_schema)
        doc = etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        description_value = [i.value for i in descriptions]
        expected = [
            'version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.',
            'record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.'
        ]
        self.assertEqual(description_value, expected)
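Both this test and the provenance test in Example #6 reference a module-level sample_input that is defined outside the snippet; judging from the expected descriptions here and the expected_projects in Example #6, it presumably has roughly this shape (a reconstruction, not the exact fixture):

# Presumed shape of sample_input, reconstructed from the expected values above.
sample_input = {
    "projects": [
        {
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
        },
        {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }
    ]
}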
Example #3
        # Record the country of this actor
        doc.kg.add_value("country", json_path="$.Side")

        # Add a title to the actor document
        doc.kg.add_value("title", json_path="$.Side")

        # Return an empty list because we didn't create new documents
        return []


# The main block is for testing; it is not used in the DIG pipeline
if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG; the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[UCDPModule, UCDPActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the Excel sheet
    cp = CsvProcessor(etk=etk, heading_row=1)

    with open("ucdp.jl", "w") as f:
        # Iterate over all the rows in the spreadsheet
        for doc in cp.tabular_extractor(filename="ucdp_sample.xls", dataset='ucdp'):
            # Each row produces a document, which we send to ETK.
            # Note that each invocation of process_ems will also process any new
            # documents created while processing each doc.
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
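The master_config.json loaded above is the DIG field schema. As a rough illustration only, a minimal schema config for this example might look like the following; the exact structure and field specs shown here are assumptions, not copied from DIG:

# Hypothetical minimal KG schema config; the real DIG master_config defines
# many more fields and options.
minimal_config = {
    "fields": {
        "country": {"type": "string"},
        "title": {"type": "string"}
    }
}
kg_schema = KGSchema(minimal_config)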
Example #4
import unittest, json
from etk.csv_processor import CsvProcessor
from etk.etk import ETK
from etk.knowledge_graph_schema import KGSchema

csv_str = """text,with,Polish,non-Latin,lettes
1,2,3,4,5,6
a,b,c,d,e,f

gęś,zółty,wąż,idzie,wąską,dróżką,
,b,c,s,w,f
"""

kg_schema = KGSchema(
    json.load(open('etk/unit_tests/ground_truth/test_config.json')))

etk = ETK(kg_schema=kg_schema)


class TestCsvProcessor(unittest.TestCase):
    def test_csv_str_with_all_args(self) -> None:
        csv_processor = CsvProcessor(etk=etk,
                                     heading_row=1,
                                     content_start_row=2,
                                     heading_columns=(1, 3),
                                     content_end_row=3,
                                     ends_with_blank_row=True,
                                     remove_leading_empty_rows=True,
                                     required_columns=['text'])

        test_docs = [
Example #5
                    if extractions:
                        path = '$."' + \
                               extractions[0].value + '"[?(@.country == "Italy")]'
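                        # For an extracted team name of, e.g., "Juventus" (hypothetical value),
                        # the composed path is: $."Juventus"[?(@.country == "Italy")]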
                        jsonpath_expr = jex.parse(path)
                        city_match = jsonpath_expr.find(self.city_dataset)
                        if city_match:
                            # add corresponding values of city_dataset into knowledge graph of the doc
                            for field in city_match[0].value:
                                doc.kg.add_value(
                                    field, value=city_match[0].value[field])
                    new_docs.append(doc)
        return new_docs

    def document_selector(self, doc) -> bool:
        return doc.cdr_document.get("dataset") == "italy_team"


if __name__ == "__main__":
    # url = 'https://en.wikipedia.org/wiki/List_of_football_clubs_in_Italy'

    cdr = json.load(
        open('./resources/italy_teams.json', mode='r', encoding='utf-8'))
    kg_schema = KGSchema(json.load(open('./resources/master_config.json')))
    etk = ETK(modules=ItalyTeamsModule, kg_schema=kg_schema)
    etk.parser = jex.parse
    cdr_doc = Document(etk, cdr_document=cdr, mime_type='json', url=cdr['url'])
    results = etk.process_ems(cdr_doc)[1:]
    print('Total docs:', len(results))
    print("Sample result:\n")
    print(json.dumps(results[0].value, indent=2))
Example #6
    def test_Provenance(self) -> None:
        kg_schema = KGSchema(
            json.load(open('etk/unit_tests/ground_truth/test_config.json')))

        self.etk = ETK(kg_schema=kg_schema)
        g = [
            'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep',
            'yixiang'
        ]
        self.name_extractor = GlossaryExtractor(g,
                                                "name_extractor",
                                                self.etk.default_tokenizer,
                                                case_sensitive=False,
                                                ngrams=1)
        doc = self.etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.name_extractor, d)
            p.store(names, "members")

        expected_provenances = [{
            "@id": 0,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 33,
                "end_char": 38
            }
        }, {
            "@id": 1,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 40,
                "end_char": 46
            }
        }, {
            "@id": 2,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 48,
                "end_char": 54
            }
        }, {
            "@id": 3,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 56,
                "end_char": 64
            }
        }, {
            "@id": 4,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[0].members",
            "parent_provenances": {
                "Runqi": 0,
                "Dongyu": 1,
                "Sylvia": 2,
                "Amandeep": 3
            }
        }, {
            "@id": 5,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 39,
                "end_char": 44
            }
        }, {
            "@id": 6,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 46,
                "end_char": 52
            }
        }, {
            "@id": 7,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 54,
                "end_char": 61
            }
        }, {
            "@id": 8,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[1].members",
            "parent_provenances": {
                "Pedro": 5,
                "Mayank": 6,
                "Yixiang": 7
            }
        }]
        expected_projects = [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
            "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
            "members": ["Pedro", "Mayank", "Yixiang"]
        }]
        #print ("hiiiiiiiiiiiiiiiii")
        #print ("projects: " + str(doc.value["projects"]))
        #print ("provenances: " + str(doc.value["provenances"]))
        self.assertEqual(expected_projects, doc.value["projects"])
        self.assertEqual(expected_provenances, doc.value["provenances"])
Example #7
        # Country refers to the affiliation; it is mapped to the actor's country, losing the distinction.
        doc.kg.add_value("country", json_path="$.ActorCountryCode")
        doc.kg.add_value("country",
                         value=doc.extract(
                             self.country_decoder,
                             doc.select_segments("$.ActorCountryCode")[0]))

        # Note: not mapping the Actor Geo codes, because Pedro doesn't understand what they mean.
        return list()


if __name__ == "__main__":

    # Tell ETK the schema of the fields in the KG; the DIG master_config can be used as the schema.
    kg_schema = KGSchema(json.load(open('../events_ucdp/master_config.json')))

    # Instantiate ETK, with the two processing modules and the schema.
    etk = ETK(modules=[GdeltModule, GdeltActorModule], kg_schema=kg_schema)

    # Create a CSV processor to create documents for the relevant rows in the TSV file
    cp = CsvProcessor(etk=etk,
                      heading_columns=(1, len(GdeltModule.header_fields)),
                      column_name_prefix="COL")

    with open("gdelt.jl", "w") as f:
        # Iterate over all the rows in the TSV file
        for d in cp.tabular_extractor(filename="20170912.export_sample.tsv",
                                      dataset='gdelt'):
            for result in etk.process_ems(d):
                # print(d.cdr_document)
                # Presumably each processed document is written out, as in the
                # other examples:
                f.write(json.dumps(result.cdr_document) + "\n")
Example #8
        return doc.cdr_document.get(
            "dataset") == "lake_chad_basin_displaced_victim"

    def process_document(self, doc: Document) -> List[Document]:
        doc.kg.add_value("size", json_path="total")
        doc.kg.add_value("type", json_path="type")
        return list()


if __name__ == "__main__":
    dir_path = sys.argv[1]
    master_config_path = sys.argv[2]
    file_name = 'lake_chad_basin_displaced.csv'
    input_path = os.path.join(dir_path, file_name)
    output_path = os.path.join(dir_path, file_name + '.jl')

    kg_schema = KGSchema(json.load(open(master_config_path)))
    etk = ETK(modules=[
        LakeChadBasinDisplacedModule, LakeChadBasinDisplacedVictimModule,
        LCBPlaceModule
    ],
              kg_schema=kg_schema)
    cp = CsvProcessor(etk=etk, heading_row=1, content_start_row=3)

    with open(output_path, "w") as f:
        print(input_path, output_path)
        for doc in cp.tabular_extractor(filename=input_path,
                                        dataset='lake_chad_basin_displaced'):
            etk.process_and_frame(doc)
            f.write(json.dumps(doc.cdr_document) + "\n")
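Example #8 takes the data directory and the master_config path from sys.argv, so it is run as, for example (the script name here is hypothetical):

python lcb_displaced_module.py /path/to/data /path/to/master_config.json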