Exemple #1
0
    def test_ce_readability(self):
        """Run the readability extractor with a strict and a relaxed entry.

        Verifies that the result carries a 'tld', that both the
        'content_strict' and 'content_relaxed' segments exist, that no 'title'
        or 'inferlink_extractions' segment is present for this config, and
        that the text of both segments matches the expected strings.
        """
        # Two readability entries: the first gives no "field_name", the second
        # names its output "content_relaxed".
        e_config = {
            'content_extraction': {
                "input_path": "raw_content",
                "extractors": {
                    "readability": [{
                        "strict": "yes",
                        "extraction_policy": "keep_existing"
                    }, {
                        "strict": "no",
                        "extraction_policy": "keep_existing",
                        "field_name": "content_relaxed"
                    }]
                }
            }
        }
        c = Core(extraction_config=e_config)
        r = c.process(self.doc)
        self.assertTrue('tld' in r)
        self.assertEqual('eroticmugshots.com', r['tld'])
        self.assertTrue("content_extraction" in r)
        self.assertTrue("content_strict" in r["content_extraction"])
        self.assertTrue("content_relaxed" in r["content_extraction"])
        # Only readability was configured, so these segments must be absent.
        self.assertTrue("title" not in r["content_extraction"])
        self.assertTrue("inferlink_extractions" not in r["content_extraction"])

        # Expected text for the strict (c_s) and relaxed (c_r) segments.
        # Whitespace is significant: the comparison below is exact.
        c_s = """\n \n \n \n \n \n smoothlegs24  28 \n \n \n chrissy391  27 \n \n \n My name is Helena height 160cms weight 55 kilos  contact me at [email protected]           jefferson ave         age: 23 HrumpMeNow  28 \n \n \n xxtradition  24 \n \n \n jumblyjumb  26 \n \n \n claudia77  26 \n \n \n gushinPuss  28 \n \n \n Littlexdit  25 \n \n \n PinkSweets2  28 \n \n \n withoutlimit  27 \n \n \n bothOfUs3  28 \n \n \n lovelylips  27 \n \n \n killerbod  27 \n \n \n Littlexdit  27 \n \n \n azneyes  23 \n \n \n \n \n \n Escort's Phone: \n \n \n323-452-2013  \n \n Escort's Location: \nLos Angeles, California  \n Escort's Age:   23   Date of Escort Post:   Jan 02nd 6:46am \n REVIEWS:   \n READ AND CREATE REVIEWS FOR THIS ESCORT   \n \n \n \n \n \nThere are  50  girls looking in  .\n VIEW GIRLS \n \nHey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy  % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text   Fetish friendly   Fantasy friendly   Party friendly 140 Hr SPECIALS 3234522013. Call  323-452-2013 .  Me and my friends are on EZsex  soooo you can find us all on there if you want... skittlegirl \n   \n \n   \n \n   \n Call me on my cell at 323-452-2013. Date of ad: 2017-01-02 06:46:00 \n \n \n"""
        c_r = """\n \n \n \n \n \n \n smoothlegs24  28 \n \n \n chrissy391  27 \n \n \n My name is Helena height 160cms weight 55 kilos  contact me at [email protected]           jefferson ave         age: 23 HrumpMeNow  28 \n \n \n xxtradition  24 \n \n \n jumblyjumb  26 \n \n \n claudia77  26 \n \n \n gushinPuss  28 \n \n \n Littlexdit  25 \n \n \n PinkSweets2  28 \n \n \n withoutlimit  27 \n \n \n bothOfUs3  28 \n \n \n lovelylips  27 \n \n \n killerbod  27 \n \n \n Littlexdit  27 \n \n \n azneyes  23 \n \n \n \n \n \n Escort's Phone: \n \n \n323-452-2013  \n \n Escort's Location: \nLos Angeles, California  \n Escort's Age:   23   Date of Escort Post:   Jan 02nd 6:46am \n REVIEWS:   \n READ AND CREATE REVIEWS FOR THIS ESCORT   \n \n \n \n \n \nThere are  50  girls looking in  .\n VIEW GIRLS \n \nHey I'm luna 3234522013 Let's explore , embrace and indulge in your favorite fantasy  % independent. discreet no drama Firm Thighs and Sexy. My Soft skin & Tight Grip is exactly what you deserve Call or text   Fetish friendly   Fantasy friendly   Party friendly 140 Hr SPECIALS 3234522013. Call  323-452-2013 .  Me and my friends are on EZsex  soooo you can find us all on there if you want... skittlegirl \n   \n \n   \n \n   \n Call me on my cell at 323-452-2013. Date of ad: 2017-01-02 06:46:00 \n \n \n \n"""
        # Both sides are JSON-encoded before comparison, so the assertion
        # compares the serialized form of each string.
        self.assertEqual(
            json.dumps(r["content_extraction"]["content_strict"]["text"]),
            json.dumps(c_s))
        self.assertEqual(
            json.dumps(r["content_extraction"]["content_relaxed"]["text"]),
            json.dumps(c_r))
class TestExtractionConfig(unittest.TestCase):
    """Tests for Core.determine_extraction_policy and Core.determine_segment."""

    def setUp(self):
        """Create a Core without an extraction config for each test."""
        self.c = Core()

    def test_extraction_policy(self):
        """Known configs resolve to a policy constant; an unknown value raises."""
        resolve = self.c.determine_extraction_policy
        # (config, expected policy) pairs, checked in order.
        expectations = [
            ({}, core._REPLACE),
            ({'extraction_policy': 'replace'}, core._REPLACE),
            ({'extraction_policy': 'keep_existing'}, core._KEEP_EXISTING),
            (None, core._REPLACE),
        ]
        for policy_config, expected_policy in expectations:
            self.assertEqual(resolve(policy_config), expected_policy)

        with self.assertRaises(ValueError):
            resolve({'extraction_policy': 'something'})

    def test_determine_segment(self):
        """Each dotted path resolves to the expected segment constant."""
        resolve = self.c.determine_segment
        # (full path, expected segment) pairs, checked in order.
        expectations = [
            ('', core._SEGMENT_OTHER),
            ('content_extraction.title', core._SEGMENT_TITLE),
            ('content_extraction.inferlink_extractions.inferlink_description',
             core._SEGMENT_INFERLINK_DESC),
            ('content_extraction.content_relaxed', core._SEGMENT_OTHER),
        ]
        for full_path, expected_segment in expectations:
            self.assertEqual(resolve(full_path), expected_segment)
 def test_extractor__no_regex(self):
     """Expect KeyError from process() when the regex extractor config has no 'regex' key."""
     regex_settings = {
         "include_context": "true",
         "regex_options": ["IGNORECASE"],
         "pre_filter": [
             "x.replace('\\n', '')",
             "x.replace('\\r', '')"
         ]
     }
     name_field = {
         "extractors": {
             "extract_using_regex": {
                 "config": regex_settings,
                 "extraction_policy": "replace"
             }
         }
     }
     e_config = {
         "data_extraction": [{
             "input_path": ["content_extraction.content_strict.text.`parent`"],
             "fields": {"name": name_field}
         }]
     }
     extractor = Core(extraction_config=e_config)
     with self.assertRaises(KeyError):
         extractor.process(self.doc)
Exemple #4
0
 def test_tld_extraction_from_doc(self):
     """The extracted website should be the 'tld' value already on the doc.

     The doc's url points at google.com while its 'tld' is 'xyz.org'; the
     test expects 'xyz.org' as the first website value in the knowledge graph.
     """
     input_doc = {
         "url": "https://www.google.com/blah/this/part/doesnt/matter",
         'uri': "uri.1",
         "tld": "xyz.org"
     }
     website_field = {"extractors": {"extract_website_domain": {}}}
     e_config = {
         "document_id": "uri",
         "content_extraction": {},
         "data_extraction": [{
             "input_path": "content_extraction.url.text.`parent`",
             "fields": {"website": website_field}
         }]
     }
     result = Core(extraction_config=e_config).process(input_doc)
     websites = result['knowledge_graph']['website']
     self.assertEqual(websites[0]['value'], 'xyz.org')
 def test_document_id_not_present(self):
     """Expect KeyError from process() when 'document_id' is configured as 'blah'."""
     extractor = Core(extraction_config={'document_id': 'blah'})
     with self.assertRaises(KeyError):
         extractor.process(self.doc)
Exemple #6
0
    def test_add_constants(self):
        """The add_constant_kg enhancement should put two values under 'type'.

        The assertions inspect ``self.doc`` rather than the value returned by
        ``process``, so they rely on the document being updated in place.
        """
        e_config = {
            "document_id": "doc_id",
            "kg_enhancement": {
                "fields": {
                    "type": {
                        "priority": 0,
                        "extractors": {
                            "add_constant_kg": {
                                "config": {
                                    "constants": ["Type A", "Type B"]
                                }
                            }
                        }
                    }
                },
                "input_path": "knowledge_graph.`parent`"
            }}

        c = Core(extraction_config=e_config)
        c.process(self.doc)
        # assertIn/assertEqual report the offending values when they fail,
        # unlike assertTrue on a pre-computed boolean.
        self.assertIn('knowledge_graph', self.doc)
        self.assertIn('type', self.doc['knowledge_graph'])
        type_values = self.doc['knowledge_graph']['type']
        self.assertEqual(len(type_values), 2)
        self.assertIn(type_values[0]['value'], ["Type A", "Type B"])
Exemple #7
0
 def test_document_id(self):
     """With 'document_id' configured as 'doc_id', the result carries the expected 'document_id'."""
     expected_id = '1A4A5FF5BD066309C72C8EEE6F7BCCCFD21B83245AFCDADDF014455BCF990A21'
     extractor = Core(extraction_config={'document_id': 'doc_id'})
     result = extractor.process(self.doc)
     self.assertTrue('document_id' in result)
     self.assertEqual(result['document_id'], expected_id)
Exemple #8
0
 def test_filter_results(self):
     """After processing, the doc should hold exactly one 'name' with confidence 1.0.

     The assertions inspect ``self.doc`` rather than the value returned by
     ``process``, so they rely on the document being updated in place.
     """
     c = Core(extraction_config=self.e_config)
     c.process(self.doc)
     # assertIn/assertEqual show the actual values on failure.
     self.assertIn('knowledge_graph', self.doc)
     self.assertIn('name', self.doc['knowledge_graph'])
     names = self.doc['knowledge_graph']['name']
     self.assertEqual(len(names), 1)
     self.assertEqual(names[0]['confidence'], 1.0)
Exemple #9
0
 def test_filter_results_multiple_filters(self):
     """Two filters are configured for testurl.com; the expected outcome is 'discard'."""
     doc = {
         "url":
         "http://www.testurl.com",
         "doc_id":
         "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E"
     }
     e_config = {
         "document_id": "doc_id",
         "filters": {
             "testurl.com": [{
                 "field": "url",
                 "action": "keep",
                 "regex": "testt"
             }, {
                 "field": "url",
                 "action": "discard",
                 "regex": "test*"
             }]
         }
     }
     c = Core(extraction_config=e_config)
     doc = c.process_doc_filters(doc)
     # assertIn/assertEqual show the actual outcome on failure.
     self.assertIn('prefilter_filter_outcome', doc)
     self.assertEqual(doc['prefilter_filter_outcome'], 'discard')
    def test_invalid_json_path(self):
        """Expect process() to raise when the input path is the malformed 'actors['."""
        input_doc = {
            "url": "http:www.hitman.org",
            "doc_id": "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
            "actors": {
                "name": "agent 47",
                "affiliation": "International Contract Agency"
            }
        }
        actors_field = {
            "extractors": {
                "create_kg_node_extractor": {
                    "config": {"segment_name": "actor_information"}
                }
            }
        }
        e_config = {
            "document_id": "doc_id",
            "data_extraction": [{
                "input_path": ["actors["],
                "fields": {"actors": actors_field}
            }]
        }
        extractor = Core(extraction_config=e_config)

        with self.assertRaises(Exception):
            extractor.process(input_doc)
 def test_spacy_date(self):
     """The spaCy extractor with a parse_date post filter should yield an ISO 'event_date'."""
     input_doc = {
         "url": "http://date.test.com",
         "doc_id": "12344",
         "content_extraction": {
             "useful_text": {
                 "text": u"Alert: Tue, 2006-02-07"
             }
         }
     }
     event_date_field = {
         "extractors": {
             "extract_using_spacy": {
                 "config": {"post_filter": "parse_date"}
             }
         }
     }
     e_config = {
         "document_id": "doc_id",
         'data_extraction': [{
             "fields": {"event_date": event_date_field},
             "input_path": ["content_extraction.useful_text.text.`parent`"]
         }]
     }
     extractor = Core(extraction_config=e_config)
     knowledge_graph = extractor.process(input_doc)['knowledge_graph']
     self.assertTrue('event_date' in knowledge_graph)
     first_date = knowledge_graph['event_date'][0]
     self.assertEqual(first_date['value'], '2006-02-07T00:00:00')
Exemple #12
0
    def setUp(self):
        """Build a spaCy-enabled Core and load the ground-truth fixtures.

        Sets ``self.c`` to a Core configured with spaCy extractors for four
        fields, and ``self.ground_truth`` to a dict mapping each fixture name
        to the list of JSON documents read from its ``.jl`` file (one JSON
        document per line).
        """
        # Every field uses the same extractor with an empty config.
        field_names = ("posting_date", "age", "social_media", "address")
        e_config = {
            'data_extraction': [{
                'input_path': 'text.`parent`',
                'fields': {
                    field_name: {
                        "extractors": {
                            "extract_using_spacy": {
                                "config": {}
                            }
                        }
                    }
                    for field_name in field_names
                }
            }]
        }

        self.c = Core(extraction_config=e_config, load_spacy=True)
        self.ground_truth = dict()

        fixture_dir = os.path.join(os.path.dirname(__file__), "ground_truth")
        ground_truth_files = {
            "age": os.path.join(fixture_dir, "age.jl"),
            "date": os.path.join(fixture_dir, "date.jl"),
            "social_media": os.path.join(fixture_dir, "social_media.jl"),
            "address": os.path.join(fixture_dir, "address.jl")
        }

        for extractor, file_name in ground_truth_files.items():
            with open(file_name, 'r') as f:
                # Skip blank lines: a trailing newline at the end of the file
                # would otherwise hand an empty string to json.loads, which
                # raises ValueError.
                self.ground_truth[extractor] = [
                    json.loads(line) for line in f if line.strip()
                ]
 def test_extractor_regex(self):
     """The 'my name is ...' regex should extract 'Helena' with context from the strict text."""
     regex_settings = {
         "include_context": "true",
         "regex": "(?:my[\\s]+name[\\s]+is[\\s]+([-a-z0-9@$!]+))",
         "regex_options": ["IGNORECASE"],
         "pre_filter": [
             "x.replace('\\n', '')",
             "x.replace('\\r', '')"
         ]
     }
     name_field = {
         "extractors": {
             "extract_using_regex": {
                 "config": regex_settings,
                 "extraction_policy": "replace"
             }
         }
     }
     e_config = {
         "data_extraction": [{
             "input_path": ["content_extraction.content_strict.text.`parent`"],
             "fields": {"name": name_field}
         }]
     }
     result = Core(extraction_config=e_config).process(self.doc)

     # Walk down the result, asserting each level before descending into it.
     self.assertTrue("content_extraction" in result)
     self.assertTrue("content_strict" in result["content_extraction"])
     strict = result["content_extraction"]["content_strict"]
     for expected_key in ("text", "tokens", "simple_tokens",
                          "data_extraction"):
         self.assertTrue(expected_key in strict)
     self.assertTrue("name" in strict["data_extraction"])
     name_extractions = strict["data_extraction"]["name"]
     self.assertTrue("extract_using_regex" in name_extractions)

     expected = {
         "results": [{
             "origin": {
                 "score": 1,
                 "segment": "readability_strict",
                 "method": "other_method"
             },
             "context": {
                 'text': u' 27 \n \n \n My name is Helena height 16',
                 "end": 73,
                 "start": 56
             },
             "value": "Helena"
         }]
     }
     self.assertEqual(name_extractions["extract_using_regex"], expected)
Exemple #14
0
 def test_guards(self):
     """doc1 should yield an 'event_date' in the knowledge graph; doc2 should not."""
     extractor = Core(extraction_config=self.e_config)

     first = extractor.process(self.doc1)
     self.assertTrue("knowledge_graph" in first)
     self.assertTrue("event_date" in first['knowledge_graph'])

     second = extractor.process(self.doc2)
     has_event_date = ("knowledge_graph" in second
                       and "event_date" in second['knowledge_graph'])
     self.assertTrue(not has_event_date)
    def test_table_extractor_empty_config(self):
        """Extracted tables, after a JSON round trip, should equal self.table_ex."""
        result = Core(extraction_config=self.e_config).process(self.doc)

        self.assertTrue("content_extraction" in result)
        self.assertTrue("table" in result["content_extraction"])
        tables = result["content_extraction"]["table"]["tables"]
        # Serialize and parse again so the comparison is made on plain JSON types.
        round_tripped = json.loads(json.dumps(tables))
        self.assertEqual(round_tripped, self.table_ex)
Exemple #16
0
    def test_extract_as_is_post_filter_3(self):
        """extract_as_is with a parse_number post filter should keep two of three sizes.

        The doc holds the sizes "54", "34.0" and "redme34"; the test expects
        only '54' and '34.0' to reach the knowledge graph.
        """
        doc = {
            "uri":
            "1",
            "event_actors": [{
                "description": "Non-State, Internal, No State Sanction",
                "id": "internalnononstatesanctionstate",
                "size": "54"
            }, {
                "description": "Noncombatant Status Asserted",
                "id":
                "assertedcontestednoncombatantnoncombatantnotstatusstatus",
                "size": "34.0"
            }, {
                "description": "Noncombatant Status Asserted",
                "id":
                "assertedcontestednoncombatantnoncombatantnotstatusstatus",
                "size": "redme34"
            }]
        }

        e_config = {
            "extraction_policy":
            "replace",
            "error_handling":
            "raise_error",
            "document_id":
            "uri",
            "content_extraction": {
                "json_content": [{
                    "input_path": "event_actors[*].size",
                    "segment_name": "actor_size"
                }]
            },
            "data_extraction": [{
                "input_path": "content_extraction.actor_size[*].text.`parent`",
                "fields": {
                    "actor_size": {
                        "extractors": {
                            "extract_as_is": {
                                "extraction_policy": "keep_existing",
                                "config": {
                                    "post_filter": ["parse_number"]
                                }
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)
        actor_sizes = r['knowledge_graph']['actor_size']
        # assertEqual reports the actual length on failure.
        self.assertEqual(len(actor_sizes), 2)
        self.assertEqual(actor_sizes[0]['value'], '54')
        self.assertEqual(actor_sizes[1]['value'], '34.0')
Exemple #17
0
 def test_decode_value_dictionary(self):
     """A dictionary match on 'Margie' should come out as 'Not Margie' via decode_value."""
     resource_dir = os.path.dirname(__file__)
     women_name_file_path = os.path.join(
         resource_dir, "resources/case_sensitive_female_name.json")
     name_decoding_dict_path = os.path.join(
         resource_dir, "resources/name_decode.json")

     input_doc = {
         'content_extraction': {
             'content_strict': {
                 'text':
                 'My name is Margie and this is a test for extracting this name using case sensitive '
                 'dictionary'
             }
         },
         'doc_id': 'id',
         'url': 'http://givemeabreak.com'
     }
     dictionary_settings = {
         'case_sensitive': 'True',
         "dictionary": "women_name",
         "ngrams": 1,
         "joiner": " ",
         "pre_filter": ["x"],
         "post_filter": ["isinstance(x, basestring)"],
         "post_filter_s": "decode_value"
     }
     name_field = {
         "extractors": {
             "extract_using_dictionary": {
                 "config": dictionary_settings,
                 "extraction_policy": "keep_existing"
             }
         }
     }
     e_config = {
         "resources": {
             "dictionaries": {"women_name": women_name_file_path},
             "decoding_dictionary": {"name": name_decoding_dict_path}
         },
         "document_id": "doc_id",
         "data_extraction": [{
             "input_path": "content_extraction.content_strict.text.`parent`",
             "fields": {"name": name_field}
         }]
     }
     result = Core(extraction_config=e_config).process(input_doc)
     first_name = result['knowledge_graph']['name'][0]
     self.assertEqual(first_name['value'], 'Not Margie')
Exemple #18
0
    def test_extraction_from_age(self):
        """Every age extracted from a fixture's content must be among its correct ages.

        Each entry of ``self.doc`` supplies the text under 'content' and the
        accepted ages under 'correct'.
        """
        c = Core()
        for t in self.doc:
            extracted_ages = [age['value']
                              for age in c._extract_age(t['content'])]
            for extracted_age in extracted_ages:
                # NOTE(review): the previous version called
                # assertTrue(extracted_age, correct_age) only when the two
                # were already equal, and the second argument of assertTrue
                # is the failure message, so the test could never fail.
                # Confirm that "every extracted age is a correct one" is the
                # intended check.
                self.assertIn(extracted_age, t['correct'])
Exemple #19
0
    def test_extraction_from_default_spacy(self):
        """Each ground-truth input should produce its expected knowledge graph."""
        c = Core(extraction_config=self.e_config, load_spacy=True)
        for i, input_doc in enumerate(self.ground_truth_input):
            r = c.process(input_doc,
                          create_knowledge_graph=True,
                          html_description=False)

            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(self.ground_truth_output[i]['knowledge_graph'],
                             r['knowledge_graph'])
    def test_table_extractor(self):
        """Extracted tables, after a JSON round trip, should equal self.table_ex."""
        c = Core(extraction_config=self.e_config)
        r = c.process(self.doc)

        # The earlier version dumped the tables to "table_out.jl" in the
        # working directory before asserting anything; that debug artifact is
        # no longer written, and the keys are checked before being indexed.
        self.assertIn("content_extraction", r)
        self.assertIn("table", r["content_extraction"])
        tables = r["content_extraction"]["table"]["tables"]
        # Serialize and parse again so the comparison is made on plain JSON types.
        ex = json.loads(json.JSONEncoder().encode(tables))
        self.assertEqual(ex, self.table_ex)
Exemple #21
0
    def test_negative_case_sensitive(self):
        """A lower-case 'margie' should produce no data_extraction under case-sensitive matching."""
        women_name_file_path = os.path.join(
            os.path.dirname(__file__),
            "resources/case_sensitive_female_name.json")
        input_doc = {
            'content_extraction': {
                'content_strict': {
                    'text':
                    'My name is margie and this is a test for extracting this name using case sensitive '
                    'dictionary'
                }
            },
            'doc_id': 'id',
            'url': 'http://givemeabreak.com'
        }
        dictionary_settings = {
            'case_sensitive': 'trUe',
            "dictionary": "women_name",
            "ngrams": 1,
            "joiner": " ",
            "pre_filter": ["x"],
            "post_filter": ["isinstance(x, basestring)"]
        }
        name_field = {
            "extractors": {
                "extract_using_dictionary": {
                    "config": dictionary_settings,
                    "extraction_policy": "keep_existing"
                }
            }
        }
        e_config = {
            "resources": {
                "dictionaries": {"women_name": women_name_file_path}
            },
            "document_id": "doc_id",
            "data_extraction": [{
                "input_path": "content_extraction.content_strict.text.`parent`",
                "fields": {"name": name_field}
            }]
        }
        result = Core(extraction_config=e_config).process(input_doc)

        strict = result["content_extraction"]["content_strict"]
        # Both token lists are produced, but nothing is extracted.
        self.assertTrue("simple_tokens" in strict)
        self.assertTrue('simple_tokens_original_case' in strict)
        self.assertTrue("data_extraction" not in strict)
Exemple #22
0
    def test_create_kg_node(self):
        """create_kg_node_extractor should create one nested doc for the 'actors' object.

        Checks that the knowledge graph holds a single 'actors' entry, that
        the result holds a single nested doc whose content_extraction carries
        the actor data under 'actor_information', and that the provenance
        timestamp of the entry equals the nested doc's creation timestamp.
        """
        doc = {
            "url": "http:www.hitman.org",
            "doc_id":
            "19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E",
            "actors": {
                "name": "agent 47",
                "affiliation": "International Contract Agency"
            }
        }

        e_config = {
            "document_id":
            "doc_id",
            "data_extraction": [{
                "input_path": ["actors"],
                "fields": {
                    "actors": {
                        "extractors": {
                            "create_kg_node_extractor": {
                                "config": {
                                    "segment_name": "actor_information"
                                }
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)
        # These checks read the input ``doc``, so they rely on process()
        # updating it in place. assertIn/assertEqual show values on failure.
        self.assertIn('knowledge_graph', doc)
        self.assertIn('actors', doc['knowledge_graph'])
        self.assertEqual(len(doc['knowledge_graph']['actors']), 1)
        self.assertIn('nested_docs', r)
        self.assertEqual(len(r['nested_docs']), 1)
        nested_doc = r['nested_docs'][0]
        ce_expected = {
            "actor_information": {
                "affiliation": "International Contract Agency",
                "name": "agent 47"
            }
        }

        self.assertEqual(nested_doc['content_extraction'], ce_expected)
        # NOTE(review): the parent_doc_id checks below were already disabled;
        # the reason is not visible from this test.
        # self.assertTrue('parent_doc_id' in nested_doc)
        # self.assertEqual(nested_doc['parent_doc_id'],
        #                  '19B0EAB211CD1D3C63063FAB0B2937043EA1F07B5341014A80E7473BA7318D9E')
        self.assertIn('created_by', nested_doc)
        self.assertIn('@timestamp_created', nested_doc)
        self.assertIn('url', nested_doc)

        self.assertEqual(
            r['knowledge_graph']['actors'][0]['provenance'][0]['qualifiers']
            ['timestamp_created'], nested_doc['@timestamp_created'])
Exemple #23
0
    def test_guard_field_regex_pass(self):
        """With a regex guard on fieldA, the doc should end up with exactly one 'name'.

        The assertions inspect ``self.doc`` rather than the value returned by
        ``process``, so they rely on the document being updated in place.
        """
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "fieldA",
            "regex": "ach"
        }]
        c = Core(extraction_config=self.e_config)
        c.process(self.doc)

        # assertIn/assertEqual show the actual values on failure.
        self.assertIn('knowledge_graph', self.doc)
        self.assertIn('name', self.doc['knowledge_graph'])
        self.assertEqual(len(self.doc['knowledge_graph']['name']), 1)
Exemple #24
0
    def test_guard_field_stop_value_fail(self):
        """With a stop_value guard of 'SACHIN' on fieldA, the doc should still hold one 'name'.

        The assertions inspect ``self.doc`` rather than the value returned by
        ``process``, so they rely on the document being updated in place.
        """
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "fieldA",
            "stop_value": "SACHIN"
        }]
        c = Core(extraction_config=self.e_config)
        c.process(self.doc)

        # assertIn/assertEqual show the actual values on failure.
        self.assertIn('knowledge_graph', self.doc)
        self.assertIn('name', self.doc['knowledge_graph'])
        self.assertEqual(len(self.doc['knowledge_graph']['name']), 1)
Exemple #25
0
    def test_extraction_from_default_spacy(self):
        """Each ground-truth input should produce its expected knowledge graph."""
        c = Core(extraction_config=self.e_config, load_spacy=True)
        # The earlier version opened a file named 'temp' for writing here,
        # never wrote to it and never closed it; that leaked handle (and the
        # stray file it created) is gone.
        for i, input_doc in enumerate(self.ground_truth_input):
            r = c.process(input_doc,
                          create_knowledge_graph=True,
                          html_description=False)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(self.ground_truth_output[i]['knowledge_graph'],
                             r['knowledge_graph'])
Exemple #26
0
 def test_empty_tokens(self):
     """Dictionary extraction over an empty token list should return None."""

     def pre_process(x):
         return x

     def pre_filter(x):
         return x

     def post_filter(x):
         return isinstance(x, basestring)

     empty_trie = trie.CharTrie()
     extractor = Core()
     extraction = extractor._extract_using_dictionary(
         [], pre_process, empty_trie, pre_filter, post_filter, 1, ' ')
     self.assertEqual(extraction, None)
Exemple #27
0
    def setUp(self):

        self.c = Core()
        self.doc = 'Call meorčpžsíáýd at \n\r  \t   ♥❤⚘sdj,,,?? ?123 fd123-123(123))),345 fdkjf☺☻✌☹♡♥❤⚘❀❃❁✼☀’ŰűŲųŴŵŶŷŸŹźŻżŽžſ0180ƀƁƂƃƄƅƆƇƈƉƊƋƌƍƎƏ0190ƐƑƒƓƔƕƖƗƘƙƚƛƜƝƞƟdsfkjhsdf'
        self.correct_tokens = [{'char_end': 8, 'char_start': 4, 'type': 'alphabet', 'value': u'Call'}, {'char_end': 6, 'char_start': 5, 'type': 'break', 'value': u' '}, {'char_end': 29, 'char_start': 17, 'type': 'alphabet', 'value': u'meor\u010dp\u017es\xed\xe1\xfdd'}, {'char_end': 19, 'char_start': 18, 'type': 'break', 'value': u' '}, {'char_end': 22, 'char_start': 20, 'type': 'alphabet', 'value': u'at'}, {'char_end': 38, 'char_start': 29, 'type': 'break', 'value': u' \n\r  \t   '}, {'char_end': 35, 'char_start': 32, 'type': 'emoji', 'value': u'\u2665\u2764\u2698'}, {'char_end': 38, 'char_start': 35, 'type': 'alphabet', 'value': u'sdj'}, {'char_end': 36, 'char_start': 35, 'type': 'punctuation', 'value': u','}, {'char_end': 37, 'char_start': 36, 'type': 'punctuation', 'value': u','}, {'char_end': 38, 'char_start': 37, 'type': 'punctuation', 'value': u','}, {'char_end': 39, 'char_start': 38, 'type': 'punctuation', 'value': u'?'}, {'char_end': 40, 'char_start': 39, 'type': 'punctuation', 'value': u'?'}, {'char_end': 42, 'char_start': 41, 'type': 'break', 'value': u' '}, {'char_end': 42, 'char_start': 41, 'type': 'punctuation', 'value': u'?'}, {'char_end': 48, 'char_start': 45, 'type': 'digit', 'value': u'123'}, {'char_end': 47, 'char_start': 46, 'type': 'break', 'value': u' '}, {'char_end': 50, 'char_start': 48, 'type': 'alphabet', 'value': u'fd'}, {'char_end': 54, 'char_start': 51, 'type': 'digit', 'value': u'123'}, {'char_end': 52, 'char_start': 51, 'type': 'punctuation', 'value': u'-'}, {'char_end': 58, 'char_start': 55, 'type': 'digit', 'value': u'123'}, {'char_end': 56,
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                'char_start': 55, 'type': 'punctuation', 'value': u'('}, {'char_end': 62, 'char_start': 59, 'type': 'digit', 'value': u'123'}, {'char_end': 60, 'char_start': 59, 'type': 'punctuation', 'value': u')'}, {'char_end': 61, 'char_start': 60, 'type': 'punctuation', 'value': u')'}, {'char_end': 62, 'char_start': 61, 'type': 'punctuation', 'value': u')'}, {'char_end': 63, 'char_start': 62, 'type': 'punctuation', 'value': 
u','}, {'char_end': 69, 'char_start': 66, 'type': 'digit', 'value': u'345'}, {'char_end': 68, 'char_start': 67, 'type': 'break', 'value': u' '}, {'char_end': 77, 'char_start': 72, 'type': 'alphabet', 'value': u'fdkjf'}, {'char_end': 98, 'char_start': 85, 'type': 'emoji', 'value': u'\u263a\u263b\u270c\u2639\u2661\u2665\u2764\u2698\u2740\u2743\u2741\u273c\u2600'}, {'char_end': 89, 'char_start': 87, 'type': 'alphabet', 'value': u'\xe2\u20ac'}, {'char_end': 89, 'char_start': 88, 'type': 'emoji', 'value': u'\u2122'}, {'char_end': 120, 'char_start': 104, 'type': 'alphabet', 'value': u'\u0170\u0171\u0172\u0173\u0174\u0175\u0176\u0177\u0178\u0179\u017a\u017b\u017c\u017d\u017e\u017f'}, {'char_end': 112, 'char_start': 108, 'type': 'digit', 'value': u'0180'}, {'char_end': 140, 'char_start': 124, 'type': 'alphabet', 'value': u'\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018a\u018b\u018c\u018d\u018e\u018f'}, {'char_end': 132, 'char_start': 128, 'type': 'digit', 'value': u'0190'}, {'char_end': 177, 'char_start': 152, 'type': 'alphabet', 'value': u'\u0190\u0191\u0192\u0193\u0194\u0195\u0196\u0197\u0198\u0199\u019a\u019b\u019c\u019d\u019e\u019fdsfkjhsdf'}]
        self.correct_simple_tokens = [u'Call', u' ', u'meor\u010dp\u017es\xed\xe1\xfdd', u' ', u'at', u' \n\r  \t   ', u'\u2665\u2764\u2698', u'sdj', u',', u',', u',', u'?', u'?', u' ', u'?', u'123', u' ', u'fd', u'123', u'-', u'123',
                                      u'(', u'123', u')', u')', u')', u',', u'345', u' ', u'fdkjf', u'\u263a\u263b\u270c\u2639\u2661\u2665\u2764\u2698\u2740\u2743\u2741\u273c\u2600', u'\xe2\u20ac', u'\u2122', u'\u0170\u0171\u0172\u0173\u0174\u0175\u0176\u0177\u0178\u0179\u017a\u017b\u017c\u017d\u017e\u017f', u'0180', u'\u0180\u0181\u0182\u0183\u0184\u0185\u0186\u0187\u0188\u0189\u018a\u018b\u018c\u018d\u018e\u018f', u'0190', u'\u0190\u0191\u0192\u0193\u0194\u0195\u0196\u0197\u0198\u0199\u019a\u019b\u019c\u019d\u019e\u019fdsfkjhsdf']
        self.correct_reverse_map = [1, 3, 5, 6, 13,
                                    15, 16, 18, 20, 22, 27, 28, 30, 32, 34, 36]
        self.correct_filtered_tokens = [{'char_end': 6, 'char_start': 5, 'type': 'break', 'value': u' '}, {'char_end': 19, 'char_start': 18, 'type': 'break', 'value': u' '}, {'char_end': 38, 'char_start': 29, 'type': 'break', 'value': u' \n\r  \t   '}, {'char_end': 35, 'char_start': 32, 'type': 'emoji', 'value': u'\u2665\u2764\u2698'}, {'char_end': 42, 'char_start': 41, 'type': 'break', 'value': u' '}, {'char_end': 48, 'char_start': 45, 'type': 'digit', 'value': u'123'}, {'char_end': 47, 'char_start': 46, 'type': 'break', 'value': u' '}, {'char_end': 54, 'char_start': 51, 'type': 'digit', 'value': u'123'}, {
            'char_end': 58, 'char_start': 55, 'type': 'digit', 'value': u'123'}, {'char_end': 62, 'char_start': 59, 'type': 'digit', 'value': u'123'}, {'char_end': 69, 'char_start': 66, 'type': 'digit', 'value': u'345'}, {'char_end': 68, 'char_start': 67, 'type': 'break', 'value': u' '}, {'char_end': 98, 'char_start': 85, 'type': 'emoji', 'value': u'\u263a\u263b\u270c\u2639\u2661\u2665\u2764\u2698\u2740\u2743\u2741\u273c\u2600'}, {'char_end': 89, 'char_start': 88, 'type': 'emoji', 'value': u'\u2122'}, {'char_end': 112, 'char_start': 108, 'type': 'digit', 'value': u'0180'}, {'char_end': 132, 'char_start': 128, 'type': 'digit', 'value': u'0190'}]
Exemple #28
0
    def test_guard_url_fail(self):
        # Attach a guard to the `name` field of the kg_enhancement config that
        # keys on the document's `url` field, run the pipeline, and check the
        # resulting `name` entries in the knowledge graph.
        # NOTE(review): the guard value looks deliberately different from the
        # fixture's URL (hence "fail") -- confirm against the fixture setup.
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "url",
            "value": "http://www.testffffffurl.com"
        }]
        c = Core(extraction_config=self.e_config)
        # The return value is not needed: every assertion below inspects
        # self.doc (presumably updated in place by process() -- verify).
        c.process(self.doc)

        # Dedicated assert methods report the offending values on failure,
        # unlike assertTrue(<comparison>), which only says "False is not true".
        self.assertIn('knowledge_graph', self.doc)
        knowledge_graph = self.doc['knowledge_graph']
        self.assertIn('name', knowledge_graph)
        names = knowledge_graph['name']
        self.assertEqual(len(names), 2)
        self.assertEqual(names[0]['confidence'], 1.0)
Exemple #29
0
    def test_extract_as_is_post_filter(self):
        # Run the `extract_as_is` extractor with a "x.upper()" post_filter
        # over the actor titles of a small inline document and check that the
        # knowledge graph ends up with a single, upper-cased `actor_title`.
        doc = {
            "uri": "1",
            "event_actors": [{
                "description": "Non-State, Internal, No State Sanction",
                "id": "internalnononstatesanctionstate",
                # Empty title: the test expects only one actor_title overall,
                # so this entry presumably yields no extraction -- verify.
                "title": ""
            }, {
                "description": "Noncombatant Status Asserted",
                "id": "assertedcontestednoncombatantnoncombatantnotstatusstatus",
                "title": "Noncombatant Status Not Contested"
            }]
        }

        e_config = {
            "extraction_policy": "replace",
            "error_handling": "raise_error",
            "document_id": "uri",
            "content_extraction": {
                "json_content": [{
                    "input_path": "event_actors[*].title",
                    "segment_name": "actor_title"
                }]
            },
            "data_extraction": [{
                "input_path": "content_extraction.actor_title[*].text.`parent`",
                "fields": {
                    "actor_title": {
                        "extractors": {
                            "extract_as_is": {
                                "extraction_policy": "keep_existing",
                                "config": {
                                    "post_filter": ["x.upper()"]
                                }
                            }
                        }
                    }
                }
            }]
        }
        c = Core(extraction_config=e_config)
        r = c.process(doc)

        # Dedicated assert methods report the offending values on failure,
        # unlike assertTrue(<comparison>), which only says "False is not true".
        self.assertIn('actor_title', r['knowledge_graph'])
        actor_titles = r['knowledge_graph']['actor_title']
        self.assertEqual(len(actor_titles), 1)
        # Expected value mirrors the configured post_filter (x.upper()).
        self.assertEqual(actor_titles[0]['value'],
                         'noncombatant status not contested'.upper())
Exemple #30
0
    def test_guard_field_value_pass(self):
        # Attach a guard to the `name` field of the kg_enhancement config that
        # keys on `fieldA` having the value "sachin", run the pipeline, and
        # check the resulting `name` entries in the knowledge graph.
        # NOTE(review): presumably the fixture document satisfies this guard
        # (hence "pass") -- confirm against the fixture setup.
        self.e_config['kg_enhancement']['fields']['name']['guard'] = [{
            "field": "fieldA",
            "value": "sachin"
        }]
        c = Core(extraction_config=self.e_config)
        # The return value is not needed: every assertion below inspects
        # self.doc (presumably updated in place by process() -- verify).
        c.process(self.doc)

        # Dedicated assert methods report the offending values on failure,
        # unlike assertTrue(<comparison>), which only says "False is not true".
        self.assertIn('knowledge_graph', self.doc)
        knowledge_graph = self.doc['knowledge_graph']
        self.assertIn('name', knowledge_graph)
        names = knowledge_graph['name']
        self.assertEqual(len(names), 1)
        self.assertEqual(names[0]['confidence'], 1.0)
        self.assertEqual(names[0]['value'], 'Aname')