Example 1
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        g = ['New York', 'Shanghai', 'Los Angeles', 'Beijing']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, False)
        text = 'i live in los angeles. my hometown is Beijing'
        tokens = t.tokenize(text)
        test_result = [i.value for i in ge.extract(tokens)]
        expected = ["Beijing", "Los Angeles"]
        self.assertEqual(test_result, expected)
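These examples are test methods excerpted from a larger unittest suite; a minimal harness to run Example 1 on its own might look like the sketch below, where the import paths are assumptions based on etk's usual module layout.

import unittest

from etk.tokenizer import Tokenizer                               # assumed path
from etk.extractors.glossary_extractor import GlossaryExtractor   # assumed path


class TestGlossaryExtractor(unittest.TestCase):
    # paste the test methods from Examples 1-3 here
    ...


if __name__ == '__main__':
    unittest.main()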
Example 2
    def test_glossary_extractor(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        ge = GlossaryExtractor(self.glossary_1, 'test_glossary', t, 3, False)
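        # self.glossary_1 is presumably ['Beijing', 'Los Angeles', 'New York', 'Shanghai'], as in Example 6's setUp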
        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'los angeles', 'New York']

        self.assertEqual(results, expected)
Example 3
    def test_case_sensitive(self) -> None:
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)

        g = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        ge = GlossaryExtractor(g, 'test_glossary', t, 2, True)

        results = [i.value for i in ge.extract(tokens)]
        expected = ['Beijing', 'New York']

        self.assertEqual(results, expected)
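The key change from Example 2 is case_sensitive=True: the lowercase 'los angeles' in the text no longer matches the glossary entry 'Los Angeles', so only 'Beijing' and 'New York' are returned.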
Example 4
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.name_extractor = GlossaryExtractor(
            self.etk.load_glossary("./names.txt"),
            "name_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=1)
        self.student_extractor = GlossaryExtractor(
            self.etk.load_glossary("./student.txt"),
            "student_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=1)
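In an ETKModule these extractors are normally applied inside process_document; a minimal sketch, where the '$.text' segment path and the stored field names are hypothetical:

    def process_document(self, doc):
        # run both glossary extractors over each text segment and store the hits
        for segment in doc.select_segments("$.text"):  # assumed JSONPath
            names = doc.extract(self.name_extractor, segment)
            segment.store(names, "names")  # hypothetical field name
            students = doc.extract(self.student_extractor, segment)
            segment.store(students, "students")  # hypothetical field name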
Example 5
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.metadata_extractor = HTMLMetadataExtractor()
        self.content_extractor = HTMLContentExtractor()
        self.date_extractor = DateExtractor(self.etk, 'demo_date_parser')
        self.country_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/countries.txt"),
            "country_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.cities_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/cities.txt"),
            "cities_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
Example 6
    def setUp(self):
        self.text = 'Napoléon Bonaparte was a French statesman and military leader who rose to prominence during the ' \
                    'French Revolution and led several successful campaigns during the French Revolutionary Wars. ' \
                    'As Napoleon, he was Emperor of the French from 1804 until 1814, and again briefly in 1815 during ' \
                    'the Hundred Days. Napoleon dominated European and global affairs for more than a decade while ' \
                    'leading France against a series of coalitions in the Napoleonic Wars. He won most of these wars ' \
                    'and the vast majority of his battles, building a large empire that ruled over continental Europe ' \
                    'before its final collapse in 1815. He is considered one of the greatest commanders in history, ' \
                    'and his wars and campaigns are studied at military schools worldwide. Napoleon\'s political and ' \
                    'cultural legacy has endured as one of the most celebrated and controversial leaders in human history.'
        extractor = SpacyNerExtractor(extractor_name='spacy_ner_extractor')
        self.results = extractor.extract(self.text)

        glossary_1 = ['Beijing', 'Los Angeles', 'New York', 'Shanghai']
        t = Tokenizer()
        text = 'i live in los angeles. my hometown is Beijing. I love New York City.'
        tokens = t.tokenize(text)
        ge = GlossaryExtractor(glossary_1, 'test_glossary', t, 3, False)
        self.results2 = ge.extract(tokens)
Example 7
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.my_table_extractor = TableExtractor()
        self.etk.parser = jex.parse
        file_name = '${GLOSSARY_PATH}/cities_ppl_25000.json'
        with open(file_name, 'r') as f:
            self.city_dataset = json.load(f)
        self.city_list = list(self.city_dataset.keys())

        self.my_glossary_extractor = GlossaryExtractor(
            glossary=self.city_list,
            extractor_name='tutorial_glossary',
            tokenizer=etk.default_tokenizer,
            ngrams=3,
            case_sensitive=False)
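Only the keys of the city dataset feed the glossary; a hypothetical sketch of the assumed file shape (the per-city values are never read by this module):

# hypothetical shape of cities_ppl_25000.json: city name -> attribute dict
city_dataset = {"Los Angeles": {}, "New York": {}}
city_list = list(city_dataset.keys())  # -> ['Los Angeles', 'New York']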
Example 8
    def test_etk_crf_glossary_extraction(self):
        etk = ETK(use_spacy_tokenizer=False)
        city_extractor = GlossaryExtractor(
            ['los angeles', 'new york', 'angeles'],
            'city_extractor',
            etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        doc_json = {
            'text': 'i live in los angeles. my hometown is Beijing. I love New York City.'
        }
        doc = Document(etk,
                       cdr_document=doc_json,
                       mime_type='json',
                       url='',
                       doc_id='1')
        t_segments = doc.select_segments("$.text")
        for t_segment in t_segments:
            extracted_cities = doc.extract(city_extractor, t_segment)
            for extracted_city in extracted_cities:
                self.assertIn(extracted_city.value,
                              ['los angeles', 'New York', 'angeles'])
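Note that the test only asserts membership: the glossary deliberately contains the overlapping entries 'los angeles' and 'angeles', so both can fire on the same span, while 'Beijing' appears in the text but not in the glossary and is never extracted.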
Example 9
    def test_Provenance(self) -> None:
        with open('etk/unit_tests/ground_truth/test_config.json') as f:
            kg_schema = KGSchema(json.load(f))

        self.etk = ETK(kg_schema=kg_schema)
        g = [
            'runqi', 'sylvia', 'dongyu', 'mayank', 'pedro', 'amandeep',
            'yixiang'
        ]
        self.name_extractor = GlossaryExtractor(g,
                                                "name_extractor",
                                                self.etk.default_tokenizer,
                                                case_sensitive=False,
                                                ngrams=1)
        doc = self.etk.create_document(sample_input)
        descriptions = doc.select_segments("projects[*].description")
        projects = doc.select_segments("projects[*]")

        for d, p in zip(descriptions, projects):
            names = doc.extract(self.name_extractor, d)
            p.store(names, "members")

        expected_provenances = [{
            "@id": 0,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 33,
                "end_char": 38
            }
        }, {
            "@id": 1,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 40,
                "end_char": 46
            }
        }, {
            "@id": 2,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 48,
                "end_char": 54
            }
        }, {
            "@id": 3,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[0].description",
                "start_char": 56,
                "end_char": 64
            }
        }, {
            "@id": 4,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[0].members",
            "parent_provenances": {
                "Runqi": 0,
                "Dongyu": 1,
                "Sylvia": 2,
                "Amandeep": 3
            }
        }, {
            "@id": 5,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 39,
                "end_char": 44
            }
        }, {
            "@id": 6,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 46,
                "end_char": 52
            }
        }, {
            "@id": 7,
            "@type": "extraction_provenance_record",
            "method": "name_extractor",
            "confidence": 1.0,
            "origin_record": {
                "path": "projects.[1].description",
                "start_char": 54,
                "end_char": 61
            }
        }, {
            "@id": 8,
            "@type": "storage_provenance_record",
            "doc_id": None,
            "field": None,
            "destination": "projects.[1].members",
            "parent_provenances": {
                "Pedro": 5,
                "Mayank": 6,
                "Yixiang": 7
            }
        }]
        expected_projects = [{
            "name": "etk",
            "description": "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others.",
            "members": ["Runqi", "Dongyu", "Sylvia", "Amandeep"]
        }, {
            "name": "rltk",
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students.",
            "members": ["Pedro", "Mayank", "Yixiang"]
        }]
        #print ("hiiiiiiiiiiiiiiiii")
        #print ("projects: " + str(doc.value["projects"]))
        #print ("provenances: " + str(doc.value["provenances"]))
        self.assertEqual(expected_projects, doc.value["projects"])
        self.assertEqual(expected_provenances, doc.value["provenances"])
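The start_char/end_char values in the extraction provenance records are plain character offsets into the description strings; a quick sanity check for the first project:

description = "version 2 of etk, implemented by Runqi, Dongyu, Sylvia, Amandeep and others."
assert description[33:38] == "Runqi"     # provenance record @id 0
assert description[40:46] == "Dongyu"    # @id 1
assert description[48:54] == "Sylvia"    # @id 2
assert description[56:64] == "Amandeep"  # @id 3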
Example 10
    def __init__(self, etk):
        ETKModule.__init__(self, etk)
        self.date_extractor = DateExtractor(self.etk, 'acled_date_parser')
        self.country_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/countries.json.gz", read_json=True),
            "country_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.states_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/states_usa_canada.json.gz", read_json=True),
            "states_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.cities_extractor = GlossaryExtractor(
            self.etk.load_glossary("${GLOSSARY_PATH}/cities.json.gz", read_json=True),
            "cities_extractor",
            self.etk.default_tokenizer,
            case_sensitive=False,
            ngrams=3)
        self.csv_processor = CsvProcessor(etk=etk, heading_row=1)
        # decode ACLED "interaction" codes into human-readable actor-pair labels
        self.interaction_decoding_dict = {
            "10": "Sole Military Action",
            "11": "Military Versus Military",
            "12": "Military Versus Rebels",
            "13": "Military Versus Political Militia",
            "14": "Military Versus Communal Militia",
            "15": "Military Versus Rioters",
            "16": "Military Versus Protesters",
            "17": "Military Versus Civilians",
            "18": "Military Versus Other",
            "20": "Sole Rebel Action",
            "22": "Rebels Versus Rebels",
            "23": "Rebels Versus Political Militia",
            "24": "Rebels Versus Communal Militia",
            "25": "Rebels Versus Rioters",
            "26": "Rebels Versus Protesters",
            "27": "Rebels Versus Civilians",
            "28": "Rebels Versus Other",
            "30": "Sole Political Militia Action",
            "33": "Political Militia Versus Political Militia",
            "34": "Political Militia Versus Communal Militia",
            "35": "Political Militia Versus Rioters",
            "36": "Political Militia Versus Protesters",
            "37": "Political Militia Versus Civilians",
            "38": "Political Militia Versus Other",
            "40": "Sole Communal Militia Action",
            "44": "Communal Militia Versus Communal Militia",
            "45": "Communal Militia Versus Rioters",
            "46": "Communal Militia Versus Protesters",
            "47": "Communal Militia Versus Civilians",
            "48": "Communal Militia Versus Other",
            "50": "Sole Rioter Action",
            "55": "Rioters Versus Rioters",
            "56": "Rioters Versus Protesters",
            "57": "Rioters Versus Civilians",
            "58": "Rioters Versus Other",
            "60": "Sole Protester Action",
            "66": "Protesters Versus Protesters",
            "68": "Protesters Versus Other",
            "78": "Other Actor Versus Civilians",
            "80": "Sole Other Action"
        }
        self.interaction_decoder = DecodingValueExtractor(
            self.interaction_decoding_dict,
            'default_decoding',
            case_sensitive=True)
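A hypothetical use of the decoder on a single raw cell value, assuming DecodingValueExtractor.extract accepts a string and returns Extraction objects like the glossary extractors above:

# hypothetical usage; "15" is one of the keys in the decoding dict above
decoded = self.interaction_decoder.extract("15")
# decoded[0].value -> "Military Versus Rioters"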
Example 11
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }
    ],
    "members": [
        {
            "name": "Dongyu Li",
            "description": "03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994."
        }
    ]
}

etk = ETK()
doc = etk.create_document(sample_input)

# example for glossary extractor:
name_extractor = GlossaryExtractor(etk.load_glossary("./names.txt"), "name_extractor", etk.default_tokenizer, case_sensitive=False, ngrams=1)

descriptions = doc.select_segments("projects[*].description")
projects = doc.select_segments("projects[*]")

for d, p in zip(descriptions, projects):
    print ("Iam d path: " + d.full_path)
    names = doc.invoke_extractor(name_extractor, d)
    p.store_extractions(names, "members")

# example for date extractor:
date_extractor = DateExtractor(etk, 'test_date_parser')
member_descriptions = doc.select_segments("members[*].description")
members = doc.select_segments("members[*]")

for m_d, m in zip(member_descriptions, members):
    # the source snippet is truncated here; a minimal completion, mirroring
    # the glossary loop above, might be:
    dates = doc.invoke_extractor(date_extractor, m_d)
    m.store_extractions(dates, "dates")  # hypothetical field name