def test_ce_readability(self):
    # Runs the readability extractor twice over the same raw_content: once in
    # strict mode (default field "content_strict") and once relaxed into a
    # custom field "content_relaxed", then compares the extracted text exactly.
    e_config = {
        'content_extraction': {
            "input_path": "raw_content",
            "extractors": {
                "readability": [{
                    "strict": "yes",
                    "extraction_policy": "keep_existing"
                }, {
                    "strict": "no",
                    "extraction_policy": "keep_existing",
                    "field_name": "content_relaxed"
                }]
            }
        }
    }
    c = Core(extraction_config=e_config)
    r = c.process(self.doc)
    self.assertTrue('tld' in r)
    self.assertEqual('eroticmugshots.com', r['tld'])
    self.assertTrue("content_extraction" in r)
    self.assertTrue("content_strict" in r["content_extraction"])
    self.assertTrue("content_relaxed" in r["content_extraction"])
    self.assertTrue("title" not in r["content_extraction"])
    self.assertTrue("inferlink_extractions" not in r["content_extraction"])
    # Expected strict-mode text. The "\n" sequences are escape sequences inside
    # the triple-quoted literal (they become real newlines in the value).
    c_s = """\n \n \n \n \n \n smoothlegs24 28 \n \n \n chrissy391 27 \n \n \n My name is Helena height 160cms weight 55 kilos contact me at [email protected] jefferson ave age: 23 HrumpMeNow 28 \n \n \n xxtradition 24 \n \n \n jumblyjumb 26 \n \n \n claudia77 26 \n \n \n gushinPuss 28 \n \n \n Littlexdit 25 \n \n \n PinkSweets2 28 \n \n \n withoutlimit 27 \n \n \n bothOfUs3 28 \n \n \n lovelylips 27 \n \n \n killerbod 27 \n \n \n Littlexdit 27 \n \n \n azneyes 23 \n \n \n \n \n \n Escort's Phone: \n \n \n323-452-2013 \n \n Escort's Location: \nLos Angeles, California \n Escort's Age: 23 Date of Escort Post: Jan 02nd 6:46am \n REVIEWS: \n READ AND CREATE REVIEWS FOR THIS ESCORT \n \n \n \n \n \nThere are 50 girls looking in .\n VIEW GIRLS \n \nHey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text Fetish friendly Fantasy friendly Party friendly 140 Hr SPECIALS 3234522013. Call 323-452-2013 . Me and my friends are on EZsex soooo you can find us all on there if you want... skittlegirl \n \n \n \n \n \n Call me on my cell at 323-452-2013. 
Date of ad: 2017-01-02 06:46:00 \n \n \n"""
    # Expected relaxed-mode text: same body, with one extra leading and one
    # extra trailing "\n" escape compared to the strict version.
    c_r = """\n \n \n \n \n \n \n smoothlegs24 28 \n \n \n chrissy391 27 \n \n \n My name is Helena height 160cms weight 55 kilos contact me at [email protected] jefferson ave age: 23 HrumpMeNow 28 \n \n \n xxtradition 24 \n \n \n jumblyjumb 26 \n \n \n claudia77 26 \n \n \n gushinPuss 28 \n \n \n Littlexdit 25 \n \n \n PinkSweets2 28 \n \n \n withoutlimit 27 \n \n \n bothOfUs3 28 \n \n \n lovelylips 27 \n \n \n killerbod 27 \n \n \n Littlexdit 27 \n \n \n azneyes 23 \n \n \n \n \n \n Escort's Phone: \n \n \n323-452-2013 \n \n Escort's Location: \nLos Angeles, California \n Escort's Age: 23 Date of Escort Post: Jan 02nd 6:46am \n REVIEWS: \n READ AND CREATE REVIEWS FOR THIS ESCORT \n \n \n \n \n \nThere are 50 girls looking in .\n VIEW GIRLS \n \nHey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text Fetish friendly Fantasy friendly Party friendly 140 Hr SPECIALS 3234522013. Call 323-452-2013 . Me and my friends are on EZsex soooo you can find us all on there if you want... skittlegirl \n \n \n \n \n \n Call me on my cell at 323-452-2013. Date of ad: 2017-01-02 06:46:00 \n \n \n \n"""
    # Compare via json.dumps so any unicode/str mismatch surfaces as a diff.
    self.assertEqual(
        json.dumps(r["content_extraction"]["content_strict"]["text"]),
        json.dumps(c_s))
    self.assertEqual(
        json.dumps(r["content_extraction"]["content_relaxed"]["text"]),
        json.dumps(c_r))
class TestExtractionConfig(unittest.TestCase):
    """Tests for Core's extraction-policy and segment resolution helpers."""

    def setUp(self):
        self.c = Core()

    def test_extraction_policy(self):
        """Empty, 'replace' and None all resolve to replace; unknown raises."""
        for cfg in ({}, {'extraction_policy': 'replace'}, None):
            self.assertEqual(self.c.determine_extraction_policy(cfg),
                             core._REPLACE)
        self.assertEqual(
            self.c.determine_extraction_policy(
                {'extraction_policy': 'keep_existing'}),
            core._KEEP_EXISTING)
        with self.assertRaises(ValueError):
            self.c.determine_extraction_policy({'extraction_policy': 'something'})

    def test_determine_segment(self):
        """Known content-extraction paths map to their segment constants."""
        expectations = [
            ('', core._SEGMENT_OTHER),
            ('content_extraction.title', core._SEGMENT_TITLE),
            ('content_extraction.inferlink_extractions.inferlink_description',
             core._SEGMENT_INFERLINK_DESC),
            ('content_extraction.content_relaxed', core._SEGMENT_OTHER),
        ]
        for full_path, expected in expectations:
            self.assertEqual(self.c.determine_segment(full_path), expected)
def test_extractor__no_regex(self):
    """A regex-extractor config missing the 'regex' key must raise KeyError."""
    regex_extractor = {
        "config": {
            "include_context": "true",
            "regex_options": ["IGNORECASE"],
            "pre_filter": [
                "x.replace('\\n', '')",
                "x.replace('\\r', '')"
            ]
        },
        "extraction_policy": "replace"
    }
    e_config = {
        "data_extraction": [{
            "input_path": ["content_extraction.content_strict.text.`parent`"],
            "fields": {
                "name": {"extractors": {"extract_using_regex": regex_extractor}}
            }
        }]
    }
    engine = Core(extraction_config=e_config)
    with self.assertRaises(KeyError):
        engine.process(self.doc)
def test_tld_extraction_from_doc(self):
    """An explicit 'tld' on the document wins over the domain of its url."""
    doc = {
        "url": "https://www.google.com/blah/this/part/doesnt/matter",
        'uri': "uri.1",
        "tld": "xyz.org"
    }
    e_config = {
        "document_id": "uri",
        "content_extraction": {},
        "data_extraction": [{
            "input_path": "content_extraction.url.text.`parent`",
            "fields": {
                "website": {"extractors": {"extract_website_domain": {}}}
            }
        }]
    }
    result = Core(extraction_config=e_config).process(doc)
    self.assertEqual(result['knowledge_graph']['website'][0]['value'],
                     'xyz.org')
def test_document_id_not_present(self):
    """Configuring a document_id field absent from the doc raises KeyError."""
    engine = Core(extraction_config={'document_id': 'blah'})
    with self.assertRaises(KeyError):
        engine.process(self.doc)
def test_add_constants(self):
    """add_constant_kg seeds the knowledge graph with the configured values."""
    constant_extractor = {
        "add_constant_kg": {
            "config": {"constants": ["Type A", "Type B"]}
        }
    }
    e_config = {
        "document_id": "doc_id",
        "kg_enhancement": {
            "fields": {
                "type": {"priority": 0, "extractors": constant_extractor}
            },
            "input_path": "knowledge_graph.`parent`"
        }
    }
    Core(extraction_config=e_config).process(self.doc)
    self.assertTrue('knowledge_graph' in self.doc)
    self.assertTrue('type' in self.doc['knowledge_graph'])
    type_entries = self.doc['knowledge_graph']['type']
    self.assertTrue(len(type_entries) == 2)
    self.assertTrue(type_entries[0]['value'] in ["Type A", "Type B"])
def test_document_id(self):
    """The value of the configured doc_id field becomes 'document_id'."""
    result = Core(extraction_config={'document_id': 'doc_id'}).process(self.doc)
    self.assertTrue('document_id' in result)
    expected_id = ('1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B8324'
                   '5AFCDADDF014455BCF990A21')
    self.assertEqual(result['document_id'], expected_id)
def test_filter_results(self):
    """Only a single full-confidence 'name' extraction survives filtering."""
    Core(extraction_config=self.e_config).process(self.doc)
    self.assertTrue('knowledge_graph' in self.doc)
    self.assertTrue('name' in self.doc['knowledge_graph'])
    names = self.doc['knowledge_graph']['name']
    self.assertTrue(len(names) == 1)
    self.assertTrue(names[0]['confidence'] == 1.0)
def test_filter_results_multiple_filters(self):
    """With a non-matching 'keep' and a matching 'discard', discard wins."""
    doc = {
        "url": "http://www.testurl.com",
        "doc_id": "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E"
    }
    keep_rule = {"field": "url", "action": "keep", "regex": "testt"}
    discard_rule = {"field": "url", "action": "discard", "regex": "test*"}
    e_config = {
        "document_id": "doc_id",
        "filters": {"testurl.com": [keep_rule, discard_rule]}
    }
    doc = Core(extraction_config=e_config).process_doc_filters(doc)
    self.assertTrue('prefilter_filter_outcome' in doc)
    self.assertTrue(doc['prefilter_filter_outcome'] == 'discard')
def test_invalid_json_path(self):
    """A malformed JSONPath ('actors[') in input_path must raise."""
    doc = {
        "url": "http:www.hitman.org",
        "doc_id": "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
        "actors": {
            "name": "agent 47",
            "affiliation": "International Contract Agency"
        }
    }
    node_extractor = {
        "create_kg_node_extractor": {
            "config": {"segment_name": "actor_information"}
        }
    }
    e_config = {
        "document_id": "doc_id",
        "data_extraction": [{
            "input_path": ["actors["],
            "fields": {"actors": {"extractors": node_extractor}}
        }]
    }
    engine = Core(extraction_config=e_config)
    with self.assertRaises(Exception):
        engine.process(doc)
def test_spacy_date(self):
    """extract_using_spacy + parse_date post-filter yields an ISO date string."""
    doc = {
        "url": "http://date.test.com",
        "doc_id": "12344",
        "content_extraction": {
            "useful_text": {
                "text": u"Alert: Tue, 2006-02-07"
            }
        }
    }
    e_config = {
        "document_id": "doc_id",
        'data_extraction': [{
            "fields": {
                "event_date": {
                    "extractors": {
                        "extract_using_spacy": {
                            "config": {
                                "post_filter": "parse_date"
                            }
                        }
                    }
                }
            },
            "input_path": ["content_extraction.useful_text.text.`parent`"]
        }]
    }
    # Fix: the local was named 'core', shadowing the imported 'core' module
    # that other tests in this file reference (e.g. core._REPLACE).
    c = Core(extraction_config=e_config)
    r = c.process(doc)
    kg = r['knowledge_graph']
    self.assertTrue('event_date' in kg)
    self.assertEqual(kg['event_date'][0]['value'], '2006-02-07T00:00:00')
def setUp(self):
    """Build a spacy-enabled Core and load per-extractor ground-truth cases.

    Each ground-truth file is JSON-lines; every non-blank line is parsed into
    a test-case dict stored under its extractor name.
    """
    # Each field gets its own fresh config dict (not a shared object), same
    # as the original literal, in case Core mutates the config in place.
    e_config = {
        'data_extraction': [{
            'input_path': 'text.`parent`',
            'fields': {
                field: {"extractors": {"extract_using_spacy": {"config": {}}}}
                for field in ("posting_date", "age", "social_media", "address")
            }
        }]
    }
    self.c = Core(extraction_config=e_config, load_spacy=True)
    base_dir = os.path.dirname(__file__)
    ground_truth_files = {
        "age": os.path.join(base_dir, "ground_truth/age.jl"),
        "date": os.path.join(base_dir, "ground_truth/date.jl"),
        "social_media": os.path.join(base_dir, "ground_truth/social_media.jl"),
        "address": os.path.join(base_dir, "ground_truth/address.jl")
    }
    self.ground_truth = dict()
    for extractor, file_name in ground_truth_files.items():
        with open(file_name, 'r') as f:
            lines = f.read().split('\n')
        # Fix: skip blank lines (e.g. a trailing newline) — the original fed
        # '' into json.loads, which raises ValueError.
        self.ground_truth[extractor] = [
            json.loads(line) for line in lines if line.strip()
        ]
def test_extractor_regex(self):
    # End-to-end regex extraction over the strict readability text: CR/LF are
    # stripped by pre-filters, the IGNORECASE pattern captures the name token
    # after "my name is", and the result carries a context window with offsets.
    e_config = {
        "data_extraction": [{
            "input_path": ["content_extraction.content_strict.text.`parent`"],
            "fields": {
                "name": {
                    "extractors": {
                        "extract_using_regex": {
                            "config": {
                                "include_context": "true",
                                "regex": "(?:my[\\s]+name[\\s]+is[\\s]+([-a-z0-9@$!]+))",
                                "regex_options": ["IGNORECASE"],
                                "pre_filter": [
                                    "x.replace('\\n', '')",
                                    "x.replace('\\r', '')"
                                ]
                            },
                            "extraction_policy": "replace"
                        }
                    }
                }
            }
        }]
    }
    c = Core(extraction_config=e_config)
    r = c.process(self.doc)
    # The tokenized segment artifacts must all be present alongside the
    # data_extraction results.
    self.assertTrue("content_extraction" in r)
    self.assertTrue("content_strict" in r["content_extraction"])
    self.assertTrue("text" in r["content_extraction"]["content_strict"])
    self.assertTrue("tokens" in r["content_extraction"]["content_strict"])
    self.assertTrue(
        "simple_tokens" in r["content_extraction"]["content_strict"])
    self.assertTrue(
        "data_extraction" in r["content_extraction"]["content_strict"])
    self.assertTrue("name" in r["content_extraction"]["content_strict"]
                    ["data_extraction"])
    self.assertTrue("extract_using_regex" in r["content_extraction"]
                    ["content_strict"]["data_extraction"]["name"])
    extraction = r["content_extraction"]["content_strict"][
        "data_extraction"]["name"]["extract_using_regex"]
    # Expected result: value "Helena" plus its provenance and context span.
    ex = {
        "results": [{
            "origin": {
                "score": 1,
                "segment": "readability_strict",
                "method": "other_method"
            },
            "context": {
                'text': u' 27 \n \n \n My name is Helena height 16',
                "end": 73,
                "start": 56
            },
            "value": "Helena"
        }]
    }
    self.assertEqual(extraction, ex)
def test_guards(self):
    """doc1 passes the guard (event_date extracted); doc2 is guarded out."""
    engine = Core(extraction_config=self.e_config)
    passed = engine.process(self.doc1)
    self.assertTrue("knowledge_graph" in passed)
    self.assertTrue("event_date" in passed['knowledge_graph'])
    blocked = engine.process(self.doc2)
    guarded_out = ("knowledge_graph" not in blocked
                   or "event_date" not in blocked['knowledge_graph'])
    self.assertTrue(guarded_out)
def test_table_extractor_empty_config(self):
    """Table extraction with the shared config matches the expected fixture."""
    result = Core(extraction_config=self.e_config).process(self.doc)
    self.assertTrue("content_extraction" in result)
    self.assertTrue("table" in result["content_extraction"])
    # Round-trip through JSON so both sides compare as plain JSON types.
    tables = result["content_extraction"]["table"]["tables"]
    normalized = json.loads(json.JSONEncoder().encode(tables))
    self.assertEqual(normalized, self.table_ex)
def test_extract_as_is_post_filter_3(self):
    """parse_number keeps numeric sizes ('54', '34.0') and drops 'redme34'."""
    doc = {
        "uri": "1",
        "event_actors": [{
            "description": "Non-State, Internal, No State Sanction",
            "id": "internalnononstatesanctionstate",
            "size": "54"
        }, {
            "description": "Noncombatant Status Asserted",
            "id": "assertedcontestednoncombatantnoncombatantnotstatusstatus",
            "size": "34.0"
        }, {
            "description": "Noncombatant Status Asserted",
            "id": "assertedcontestednoncombatantnoncombatantnotstatusstatus",
            "size": "redme34"
        }]
    }
    size_extractor = {
        "extract_as_is": {
            "extraction_policy": "keep_existing",
            "config": {"post_filter": ["parse_number"]}
        }
    }
    e_config = {
        "extraction_policy": "replace",
        "error_handling": "raise_error",
        "document_id": "uri",
        "content_extraction": {
            "json_content": [{
                "input_path": "event_actors[*].size",
                "segment_name": "actor_size"
            }]
        },
        "data_extraction": [{
            "input_path": "content_extraction.actor_size[*].text.`parent`",
            "fields": {"actor_size": {"extractors": size_extractor}}
        }]
    }
    result = Core(extraction_config=e_config).process(doc)
    sizes = result['knowledge_graph']['actor_size']
    self.assertTrue(len(sizes) == 2)
    self.assertEqual(sizes[0]['value'], '54')
    self.assertEqual(sizes[1]['value'], '34.0')
def test_decode_value_dictionary(self):
    """The decoding dictionary rewrites the matched value to 'Not Margie'."""
    women_name_file_path = os.path.join(
        os.path.dirname(__file__),
        "resources/case_sensitive_female_name.json")
    name_decoding_dict_path = os.path.join(
        os.path.dirname(__file__), "resources/name_decode.json")
    doc = {
        'content_extraction': {
            'content_strict': {
                'text': 'My name is Margie and this is a test for extracting this name using case sensitive '
                        'dictionary'
            }
        },
        'doc_id': 'id',
        'url': 'http://givemeabreak.com'
    }
    dictionary_extractor = {
        "extract_using_dictionary": {
            "config": {
                'case_sensitive': 'True',
                "dictionary": "women_name",
                "ngrams": 1,
                "joiner": " ",
                "pre_filter": ["x"],
                "post_filter": ["isinstance(x, basestring)"],
                "post_filter_s": "decode_value"
            },
            "extraction_policy": "keep_existing"
        }
    }
    e_config = {
        "resources": {
            "dictionaries": {"women_name": women_name_file_path},
            "decoding_dictionary": {"name": name_decoding_dict_path}
        },
        "document_id": "doc_id",
        "data_extraction": [{
            "input_path": "content_extraction.content_strict.text.`parent`",
            "fields": {"name": {"extractors": dictionary_extractor}}
        }]
    }
    result = Core(extraction_config=e_config).process(doc)
    self.assertEqual(result['knowledge_graph']['name'][0]['value'],
                     'Not Margie')
def test_extraction_from_age(self):
    """Extracted age values must equal their matching known-correct ages.

    NOTE(review): the assertion only runs when extracted == correct, so it
    cannot fail on a wrong extraction; consider asserting that each extracted
    age is a member of t['correct'] instead — confirm intent with the author.
    """
    c = Core()
    for t in self.doc:
        extracted_ages = [age['value'] for age in c._extract_age(t['content'])]
        for extracted_age in extracted_ages:
            for correct_age in t['correct']:
                if extracted_age == correct_age:
                    # Fix: the original called assertTrue(extracted_age,
                    # correct_age), where the second argument is only the
                    # failure *message* — assertEqual states the real intent.
                    self.assertEqual(extracted_age, correct_age)
def test_extraction_from_default_spacy(self):
    """Default spacy extraction reproduces the stored ground-truth graphs."""
    c = Core(extraction_config=self.e_config, load_spacy=True)
    for i in range(len(self.ground_truth_input)):
        r = c.process(self.ground_truth_input[i],
                      create_knowledge_graph=True,
                      html_description=False)
        # Fix: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(self.ground_truth_output[i]['knowledge_graph'],
                         r['knowledge_graph'])
def test_table_extractor(self):
    """Extracted tables match the fixture; also dumps them to table_out.jl."""
    result = Core(extraction_config=self.e_config).process(self.doc)
    # Persist the extracted tables as a debug artifact of the test run.
    with open("table_out.jl", "w") as out:
        out.write(json.dumps(result["content_extraction"]["table"]["tables"]))
    self.assertTrue("content_extraction" in result)
    self.assertTrue("table" in result["content_extraction"])
    normalized = json.loads(json.JSONEncoder().encode(
        result["content_extraction"]["table"]["tables"]))
    self.assertEqual(normalized, self.table_ex)
def test_negative_case_sensitive(self):
    """Lower-case 'margie' must not match a case-sensitive dictionary."""
    women_name_file_path = os.path.join(
        os.path.dirname(__file__),
        "resources/case_sensitive_female_name.json")
    doc = {
        'content_extraction': {
            'content_strict': {
                'text': 'My name is margie and this is a test for extracting this name using case sensitive '
                        'dictionary'
            }
        },
        'doc_id': 'id',
        'url': 'http://givemeabreak.com'
    }
    dictionary_extractor = {
        "extract_using_dictionary": {
            "config": {
                'case_sensitive': 'trUe',
                "dictionary": "women_name",
                "ngrams": 1,
                "joiner": " ",
                "pre_filter": ["x"],
                "post_filter": ["isinstance(x, basestring)"]
            },
            "extraction_policy": "keep_existing"
        }
    }
    e_config = {
        "resources": {
            "dictionaries": {"women_name": women_name_file_path}
        },
        "document_id": "doc_id",
        "data_extraction": [{
            "input_path": "content_extraction.content_strict.text.`parent`",
            "fields": {"name": {"extractors": dictionary_extractor}}
        }]
    }
    result = Core(extraction_config=e_config).process(doc)
    strict_segment = result["content_extraction"]["content_strict"]
    # Tokenization still happens, but no extraction results are produced.
    self.assertTrue("simple_tokens" in strict_segment)
    self.assertTrue('simple_tokens_original_case' in strict_segment)
    self.assertTrue("data_extraction" not in strict_segment)
def test_create_kg_node(self):
    """create_kg_node_extractor spawns a nested doc with linked provenance."""
    doc = {
        "url": "http:www.hitman.org",
        "doc_id": "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
        "actors": {
            "name": "agent 47",
            "affiliation": "International Contract Agency"
        }
    }
    e_config = {
        "document_id": "doc_id",
        "data_extraction": [{
            "input_path": ["actors"],
            "fields": {
                "actors": {
                    "extractors": {
                        "create_kg_node_extractor": {
                            "config": {"segment_name": "actor_information"}
                        }
                    }
                }
            }
        }]
    }
    r = Core(extraction_config=e_config).process(doc)
    self.assertTrue('knowledge_graph' in doc)
    self.assertTrue('actors' in doc['knowledge_graph'])
    self.assertTrue(len(doc['knowledge_graph']['actors']) == 1)
    self.assertTrue('nested_docs' in r)
    self.assertTrue(len(r['nested_docs']) == 1)
    nested_doc = r['nested_docs'][0]
    expected_content = {
        "actor_information": {
            "affiliation": "International Contract Agency",
            "name": "agent 47"
        }
    }
    self.assertEqual(nested_doc['content_extraction'], expected_content)
    for key in ('created_by', '@timestamp_created', 'url'):
        self.assertTrue(key in nested_doc)
    # The KG entry's provenance must point back at the nested doc's timestamp.
    provenance = r['knowledge_graph']['actors'][0]['provenance'][0]
    self.assertEqual(provenance['qualifiers']['timestamp_created'],
                     nested_doc['@timestamp_created'])
def test_guard_field_regex_pass(self):
    """A regex guard that matches fieldA lets the enhancement run."""
    self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
        "field": "fieldA",
        "regex": "ach"
    }]
    Core(extraction_config=self.e_config).process(self.doc)
    self.assertTrue('knowledge_graph' in self.doc)
    self.assertTrue('name' in self.doc['knowledge_graph'])
    self.assertTrue(len(self.doc['knowledge_graph']['name']) == 1)
def test_guard_field_stop_value_fail(self):
    """A stop_value guard that does not match still yields one 'name'."""
    self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
        "field": "fieldA",
        "stop_value": "SACHIN"
    }]
    Core(extraction_config=self.e_config).process(self.doc)
    self.assertTrue('knowledge_graph' in self.doc)
    self.assertTrue('name' in self.doc['knowledge_graph'])
    self.assertTrue(len(self.doc['knowledge_graph']['name']) == 1)
def test_extraction_from_default_spacy(self):
    """Default spacy extraction reproduces the stored ground-truth graphs.

    NOTE(review): this method name duplicates another test in this file; if
    both live in the same TestCase class, the later definition silently
    shadows the earlier one — confirm and rename one of them.
    """
    # Fix: removed an unused, never-closed codecs.open('temp', 'w') handle
    # (resource leak) together with the commented-out debug writes that
    # referenced it.
    c = Core(extraction_config=self.e_config, load_spacy=True)
    for i in range(len(self.ground_truth_input)):
        r = c.process(self.ground_truth_input[i],
                      create_knowledge_graph=True,
                      html_description=False)
        # Fix: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(self.ground_truth_output[i]['knowledge_graph'],
                         r['knowledge_graph'])
def test_empty_tokens(self):
    """Dictionary extraction over an empty token list returns None."""
    identity = lambda x: x
    is_string = lambda x: isinstance(x, basestring)
    result = Core()._extract_using_dictionary(
        [],                 # no tokens to scan
        identity,           # pre_process
        trie.CharTrie(),    # empty dictionary trie
        identity,           # pre_filter
        is_string,          # post_filter
        1,                  # ngrams
        ' ')                # joiner
    self.assertEqual(result, None)
def setUp(self):
    # Tokenizer fixtures: a Core instance, one mixed-script input string, and
    # the exact token streams the tokenizer is expected to produce from it.
    self.c = Core()
    # Input deliberately mixes ASCII words, accented Latin, whitespace runs,
    # emoji/symbols, digits, punctuation and Latin-Extended-B codepoints to
    # exercise every token-type branch.
    self.doc = 'Call meorčpžsíáýd at \n\r \t ♥❤⚘sdj,,,?? ?123 fd123-123(123))),345 fdkjf☺☻✌☹♡♥❤⚘❀❃❁✼☀’ŰűŲųŴŵŶŷŸŹźŻżŽžſ0180ƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏ0190ƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟdsfkjhsdf'
    # Full expected token stream: each token carries char offsets and a type
    # tag ('alphabet' | 'break' | 'emoji' | 'digit' | 'punctuation').
    # NOTE(review): char_start/char_end are not globally monotonic across the
    # list — presumably offsets are relative to internal chunks; confirm
    # against the tokenizer before relying on them as absolute positions.
    self.correct_tokens = [{'char_end': 8, 'char_start': 4, 'type': 'alphabet', 'value': u'Call'},
                           {'char_end': 6, 'char_start': 5, 'type': 'break', 'value': u' '},
                           {'char_end': 29, 'char_start': 17, 'type': 'alphabet', 'value': u'meor\u010dp\u017es\xed\xe1\xfdd'},
                           {'char_end': 19, 'char_start': 18, 'type': 'break', 'value': u' '},
                           {'char_end': 22, 'char_start': 20, 'type': 'alphabet', 'value': u'at'},
                           {'char_end': 38, 'char_start': 29, 'type': 'break', 'value': u' \n\r \t '},
                           {'char_end': 35, 'char_start': 32, 'type': 'emoji', 'value': u'\u2665\u2764\u2698'},
                           {'char_end': 38, 'char_start': 35, 'type': 'alphabet', 'value': u'sdj'},
                           {'char_end': 36, 'char_start': 35, 'type': 'punctuation', 'value': u','},
                           {'char_end': 37, 'char_start': 36, 'type': 'punctuation', 'value': u','},
                           {'char_end': 38, 'char_start': 37, 'type': 'punctuation', 'value': u','},
                           {'char_end': 39, 'char_start': 38, 'type': 'punctuation', 'value': u'?'},
                           {'char_end': 40, 'char_start': 39, 'type': 'punctuation', 'value': u'?'},
                           {'char_end': 42, 'char_start': 41, 'type': 'break', 'value': u' '},
                           {'char_end': 42, 'char_start': 41, 'type': 'punctuation', 'value': u'?'},
                           {'char_end': 48, 'char_start': 45, 'type': 'digit', 'value': u'123'},
                           {'char_end': 47, 'char_start': 46, 'type': 'break', 'value': u' '},
                           {'char_end': 50, 'char_start': 48, 'type': 'alphabet', 'value': u'fd'},
                           {'char_end': 54, 'char_start': 51, 'type': 'digit', 'value': u'123'},
                           {'char_end': 52, 'char_start': 51, 'type': 'punctuation', 'value': u'-'},
                           {'char_end': 58, 'char_start': 55, 'type': 'digit', 'value': u'123'},
                           {'char_end': 56, 'char_start': 55, 'type': 'punctuation', 'value': u'('},
                           {'char_end': 62, 'char_start': 59, 'type': 'digit', 'value': u'123'},
                           {'char_end': 60, 'char_start': 59, 'type': 'punctuation', 'value': u')'},
                           {'char_end': 61, 'char_start': 60, 'type': 'punctuation', 'value': u')'},
                           {'char_end': 62, 'char_start': 61, 'type': 'punctuation', 'value': u')'},
                           {'char_end': 63, 'char_start': 62, 'type': 'punctuation', 'value': u','},
                           {'char_end': 69, 'char_start': 66, 'type': 'digit', 'value': u'345'},
                           {'char_end': 68, 'char_start': 67, 'type': 'break', 'value': u' '},
                           {'char_end': 77, 'char_start': 72, 'type': 'alphabet', 'value': u'fdkjf'},
                           {'char_end': 98, 'char_start': 85, 'type': 'emoji', 'value': u'\u263a\u263b\u270c\u2639\u2661\u2665\u2764\u2698\u2740\u2743\u2741\u273c\u2600'},
                           {'char_end': 89, 'char_start': 87, 'type': 'alphabet', 'value': u'\xe2\u20ac'},
                           {'char_end': 89, 'char_start': 88, 'type': 'emoji', 'value': u'\u2122'},
                           {'char_end': 120, 'char_start': 104, 'type': 'alphabet', 'value': u'\u0170\u0171\u0172\u0173\u0174\u0175\u0176\u0177\u0178\u0179\u017a\u017b\u017c\u017d\u017e\u017f'},
                           {'char_end': 112, 'char_start': 108, 'type': 'digit', 'value': u'0180'},
                           {'char_end': 140, 'char_start': 124, 'type': 'alphabet', 'value': u'\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018a\u018b\u018c\u018d\u018e\u018f'},
                           {'char_end': 132, 'char_start': 128, 'type': 'digit', 'value': u'0190'},
                           {'char_end': 177, 'char_start': 152, 'type': 'alphabet', 'value': u'\u0190\u0191\u0192\u0193\u0194\u0195\u0196\u0197\u0198\u0199\u019a\u019b\u019c\u019d\u019e\u019fdsfkjhsdf'}]
    # The same stream reduced to just the token strings, in order.
    self.correct_simple_tokens = [u'Call', u' ', u'meor\u010dp\u017es\xed\xe1\xfdd', u' ', u'at', u' \n\r \t ',
                                  u'\u2665\u2764\u2698', u'sdj', u',', u',', u',', u'?', u'?', u' ', u'?',
                                  u'123', u' ', u'fd', u'123', u'-', u'123', u'(', u'123', u')', u')', u')',
                                  u',', u'345', u' ', u'fdkjf',
                                  u'\u263a\u263b\u270c\u2639\u2661\u2665\u2764\u2698\u2740\u2743\u2741\u273c\u2600',
                                  u'\xe2\u20ac', u'\u2122',
                                  u'\u0170\u0171\u0172\u0173\u0174\u0175\u0176\u0177\u0178\u0179\u017a\u017b\u017c\u017d\u017e\u017f',
                                  u'0180',
                                  u'\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018a\u018b\u018c\u018d\u018e\u018f',
                                  u'0190', u'\u0190\u0191\u0192\u0193\u0194\u0195\u0196\u0197\u0198\u0199\u019a\u019b\u019c\u019d\u019e\u019fdsfkjhsdf']
    # Indices of the tokens that survive filtering — presumably positions in
    # correct_tokens mapping back from correct_filtered_tokens; confirm
    # against the tokenizer's reverse-map contract.
    self.correct_reverse_map = [1, 3, 5, 6, 13, 15, 16, 18, 20, 22, 27, 28, 30, 32, 34, 36]
    # Expected tokens after filtering; only 'break', 'digit' and 'emoji'
    # typed tokens appear in this list.
    self.correct_filtered_tokens = [{'char_end': 6, 'char_start': 5, 'type': 'break', 'value': u' '},
                                    {'char_end': 19, 'char_start': 18, 'type': 'break', 'value': u' '},
                                    {'char_end': 38, 'char_start': 29, 'type': 'break', 'value': u' \n\r \t '},
                                    {'char_end': 35, 'char_start': 32, 'type': 'emoji', 'value': u'\u2665\u2764\u2698'},
                                    {'char_end': 42, 'char_start': 41, 'type': 'break', 'value': u' '},
                                    {'char_end': 48, 'char_start': 45, 'type': 'digit', 'value': u'123'},
                                    {'char_end': 47, 'char_start': 46, 'type': 'break', 'value': u' '},
                                    {'char_end': 54, 'char_start': 51, 'type': 'digit', 'value': u'123'},
                                    {'char_end': 58, 'char_start': 55, 'type': 'digit', 'value': u'123'},
                                    {'char_end': 62, 'char_start': 59, 'type': 'digit', 'value': u'123'},
                                    {'char_end': 69, 'char_start': 66, 'type': 'digit', 'value': u'345'},
                                    {'char_end': 68, 'char_start': 67, 'type': 'break', 'value': u' '},
                                    {'char_end': 98, 'char_start': 85, 'type': 'emoji', 'value': u'\u263a\u263b\u270c\u2639\u2661\u2665\u2764\u2698\u2740\u2743\u2741\u273c\u2600'},
                                    {'char_end': 89, 'char_start': 88, 'type': 'emoji', 'value': u'\u2122'},
                                    {'char_end': 112, 'char_start': 108, 'type': 'digit', 'value': u'0180'},
                                    {'char_end': 132, 'char_start': 128, 'type': 'digit', 'value': u'0190'}]
def test_guard_url_fail(self):
    """A non-matching url guard leaves both 'name' extractions unfiltered."""
    self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
        "field": "url",
        "value": "http://www.testffffffurl.com"
    }]
    Core(extraction_config=self.e_config).process(self.doc)
    self.assertTrue('knowledge_graph' in self.doc)
    self.assertTrue('name' in self.doc['knowledge_graph'])
    names = self.doc['knowledge_graph']['name']
    self.assertTrue(len(names) == 2)
    self.assertTrue(names[0]['confidence'] == 1.0)
def test_extract_as_is_post_filter(self):
    """x.upper() post-filter keeps only the non-empty title, uppercased."""
    doc = {
        "uri": "1",
        "event_actors": [{
            "description": "Non-State, Internal, No State Sanction",
            "id": "internalnononstatesanctionstate",
            "title": ""
        }, {
            "description": "Noncombatant Status Asserted",
            "id": "assertedcontestednoncombatantnoncombatantnotstatusstatus",
            "title": "Noncombatant Status Not Contested"
        }]
    }
    title_extractor = {
        "extract_as_is": {
            "extraction_policy": "keep_existing",
            "config": {"post_filter": ["x.upper()"]}
        }
    }
    e_config = {
        "extraction_policy": "replace",
        "error_handling": "raise_error",
        "document_id": "uri",
        "content_extraction": {
            "json_content": [{
                "input_path": "event_actors[*].title",
                "segment_name": "actor_title"
            }]
        },
        "data_extraction": [{
            "input_path": "content_extraction.actor_title[*].text.`parent`",
            "fields": {"actor_title": {"extractors": title_extractor}}
        }]
    }
    result = Core(extraction_config=e_config).process(doc)
    kg = result['knowledge_graph']
    self.assertTrue('actor_title' in kg)
    self.assertTrue(len(kg['actor_title']) == 1)
    self.assertTrue(kg['actor_title'][0]['value'] ==
                    'noncombatant status not contested'.upper())
def test_guard_field_value_pass(self):
    """A value guard matching fieldA keeps the single filtered 'name'."""
    self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
        "field": "fieldA",
        "value": "sachin"
    }]
    Core(extraction_config=self.e_config).process(self.doc)
    self.assertTrue('knowledge_graph' in self.doc)
    self.assertTrue('name' in self.doc['knowledge_graph'])
    names = self.doc['knowledge_graph']['name']
    self.assertTrue(len(names) == 1)
    self.assertTrue(names[0]['confidence'] == 1.0)
    self.assertTrue(names[0]['value'] == 'Aname')