Exemple #1
0
# Build a document from the sample HTML, recording its MIME type and URL.
doc = etk.create_document(
    sample_html, mime_type="text/html", url="http://ex.com/123"
)

my_table_extractor = TableExtractor()

# First segment matching the raw content, and the first segment matching
# the document root ("$"), which receives the extraction results.
d = doc.select_segments("$.raw_content")[0]
root = doc.select_segments("$")[0]

# Stage 1: run the table extractor over the raw content and store each
# result on the root individually, under that extraction's own tag.
# NOTE(review): presumably the tag is "tables", since stage 2 selects
# "$.tables[*]" -- confirm against TableExtractor.
tables = doc.invoke_extractor(my_table_extractor, d)
for t in tables:
    root.store_extractions([t], t.tag, group_by_tags=False)

# Stage 2: configure the table-data extractor with one glossary per field.
# Each entry pairs a glossary file with the name it is registered under;
# glossaries are loaded and registered in the order listed.
table_data_extractor = EntityTableDataExtraction()
glossary_fields = (
    ("./resources/address_dict.txt", "address"),
    ("./resources/calibre_dict.txt", "caliber"),
    ("./resources/capacity_dict.txt", "capacity"),
    ("./resources/manufacturer_dict.txt", "manufacturer"),
    ("./resources/price_dict.txt", "price"),
)
for glossary_path, field_name in glossary_fields:
    table_data_extractor.add_glossary(
        etk.load_glossary(glossary_path), field_name)

# Re-select the stored tables and the root segment, then run the
# table-data extractor on every table and store its output on the root.
tables = doc.select_segments("$.tables[*]")
root = doc.select_segments("$")[0]

for t in tables:
    extractions = doc.invoke_extractor(table_data_extractor, t)
    root.store_extractions(extractions, "table_data_extraction")
Exemple #2
0
            "description": "record linkage toolkit, implemented by Pedro, Mayank, Yixiang and several students."
        }
    ],
    "members": [
        {
            "name": "Dongyu Li",
            "description": "03/05/2018: I went to USC on Aug 20th, 2016 and will graduate on 2018, May 11. My birthday is 29-04-1994."
        }
    ]
}

# Create the toolkit instance and wrap the sample input in a document.
etk = ETK()
doc = etk.create_document(sample_input)

# Glossary extractor example: build an extractor from the glossary in
# ./names.txt, configured as case-insensitive with ngrams=1.
names_glossary = etk.load_glossary("./names.txt")
name_extractor = GlossaryExtractor(
    names_glossary,
    "name_extractor",
    etk.default_tokenizer,
    case_sensitive=False,
    ngrams=1,
)

# Select each project's description together with the project itself.
# The two selections are paired positionally by zip().
descriptions = doc.select_segments("projects[*].description")
projects = doc.select_segments("projects[*]")

for d, p in zip(descriptions, projects):
    print("Iam d path: " + d.full_path)
    # Run the extractor on the description and store the results on the
    # matching project under "members".
    names = doc.invoke_extractor(name_extractor, d)
    p.store_extractions(names, "members")

# Date extractor example: create the extractor, then select each member's
# description together with the member itself.
date_extractor = DateExtractor('test_date_parser')
member_descriptions = doc.select_segments("members[*].description")
members = doc.select_segments("members[*]")

for m_d, m in zip(member_descriptions, members):