def test_spacy_rules():
    """Try out a spaCy rule set against a sample text sent in the request body.

    The JSON request body is used both as the rule definition handed to
    ``SpacyRuleExtractor`` and as the source of the text to test (its
    ``'test_text'`` key).  The same object is sent back, extended with:

    * ``'test_tokens'`` - one entry per token of the tokenized text
      (index, trailing whitespace, text);
    * ``'results'`` - one entry per extraction (confidence, token/char
      offsets from the provenance, rule identifier, value and the
      token-based match mapping).

    Returns:
        A ``(body, status)`` tuple: the JSON-encoded object with status 201
        on success, or ``{"message": "exception: ..."}`` with status 400 if
        anything raises.
    """
    try:
        # force=True: parse the body as JSON regardless of the Content-Type
        # header (assumes a Flask-style ``request`` object - TODO confirm).
        obj = request.get_json(force=True)

        rule_extractor = SpacyRuleExtractor(
            etk.default_nlp,
            obj, "test_extractor")

        tokens = rule_extractor.tokenizer.tokenize_to_spacy_doc(obj['test_text'])
        obj['test_tokens'] = [
            {
                'index': t.i,
                'whitespace': t.whitespace_,
                'text': t.text
            }
            for t in tokens
        ]

        obj['results'] = [
            {
                'confidence': extraction.confidence,
                'start_token': extraction.provenance['start_token'],
                'end_token': extraction.provenance['end_token'],
                'start_char': extraction.provenance['start_char'],
                'end_char': extraction.provenance['end_char'],
                'identifier': extraction.rule_id,
                'text': extraction.value,
                'token_based_match_mapping': extraction.token_based_match_mapping
            }
            for extraction in rule_extractor.extract(obj['test_text'])
        ]

        return json.dumps(obj), 201

    except Exception as e:
        # Top-level boundary of the handler: report any failure to the client.
        print(e)
        # Format the exception itself: Python 3 exceptions have no
        # ``.message`` attribute, so reading it here would raise
        # AttributeError and hide the original error from the caller.
        return json.dumps({'message': 'exception: {}'.format(e)}), 400
 def test_SpacyRuleExtractor_word_2(self) -> None:
     """Apply the ``test_SpacyRuleExtractor_word_2`` rules to a sample sentence.

     The (rule_id, value) pair of every extraction is compared with the
     single expected name match.
     """
     extractor = SpacyRuleExtractor(
         self.nlp, rules["test_SpacyRuleExtractor_word_2"], "test_extractor")
     hits = extractor.extract(
         "version 2 of etk, implemented by Runqi Shao, Dongyu Li, Sylvia lin, Amandeep and others."
     )
     found = [(hit.rule_id, hit.value) for hit in hits]
     self.assertEqual(found, [('rule_0', 'Name: Sylvia lin')])
    def test_SpacyRuleExtractor_number_1(self) -> None:
        """Apply the ``test_SpacyRuleExtractor_number_1`` rules to a string of numbers.

        The (rule_id, value) pair of every extraction is compared with the
        two expected number matches.
        """
        extractor = SpacyRuleExtractor(
            self.nlp, rules["test_SpacyRuleExtractor_number_1"],
            "test_extractor")
        hits = extractor.extract(
            "Extract from the following number: 32 12 54435 23 665.3 34 65.42 23 4545"
        )
        found = [(hit.rule_id, hit.value) for hit in hits]

        wanted = [
            ('rule_0', '665.3'),
            ('rule_0', '4545'),
        ]
        self.assertEqual(found, wanted)
    def test_SpacyRuleExtractor_linebreak_1(self) -> None:
        """Apply the ``test_SpacyRuleExtractor_linebreak_1`` rules to text with line breaks.

        The input contains a run of three newlines; the single expected
        extraction value embeds that run (followed by a space).
        """
        extractor = SpacyRuleExtractor(
            self.nlp, rules["test_SpacyRuleExtractor_linebreak_1"],
            "test_extractor")
        hits = extractor.extract(
            "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, \n\n\n Sylvia-lin, Amandeep and others."
        )
        found = [(hit.rule_id, hit.value) for hit in hits]

        wanted = [('rule_0', 'Length 3 linebreak: \n\n\n ')]
        self.assertEqual(found, wanted)
    def test_SpacyRuleExtractor_shape_1(self) -> None:
        """Apply the ``test_SpacyRuleExtractor_shape_1`` rules to a sample sentence.

        The (rule_id, value) pair of every extraction is compared with the
        two expected token matches.
        """
        extractor = SpacyRuleExtractor(
            self.nlp, rules["test_SpacyRuleExtractor_shape_1"],
            "test_extractor")
        hits = extractor.extract(
            "version 2 of etk, implemented by RqS, DongYu94 Li, Sylvia lin, Amandeep and others."
        )
        found = [(hit.rule_id, hit.value) for hit in hits]

        wanted = [
            ('rule_0', 'RqS'),
            ('rule_0', 'DongYu94'),
        ]
        self.assertEqual(found, wanted)
    def test_SpacyRuleExtractor_punc_1(self) -> None:
        """Apply the ``test_SpacyRuleExtractor_punc_1`` rules to punctuated text.

        The (rule_id, value) pair of every extraction is compared with the
        two expected name matches.
        """
        extractor = SpacyRuleExtractor(
            self.nlp, rules["test_SpacyRuleExtractor_punc_1"],
            "test_extractor")
        hits = extractor.extract(
            "version 2 of etk, implemented by Rq? Shao. DongYu94 Li, Sylvia-lin, Amandeep and others."
        )
        found = [(hit.rule_id, hit.value) for hit in hits]

        wanted = [
            ('rule_0', 'Name: Rq, Shao'),
            ('rule_0', 'Name: Sylvia, lin'),
        ]
        self.assertEqual(found, wanted)
    def test_SpacyRuleExtractor_word_5(self) -> None:
        """Apply the ``test_SpacyRuleExtractor_word_5`` rules to a sample sentence.

        The (rule_id, value) pair of every extraction is compared with the
        two expected formatted first/last/full-name strings.
        """
        extractor = SpacyRuleExtractor(
            self.nlp, rules["test_SpacyRuleExtractor_word_5"],
            "test_extractor")
        hits = extractor.extract(
            "version 2 of etk, implemented by Runqi Shao, DongYu94 Li, Sylvia lin, Amandeep and others."
        )
        found = [(hit.rule_id, hit.value) for hit in hits]

        wanted = [
            ('rule_0', 'First Name: Runqi, Last Name: Shao. Full name: Runqi Shao'),
            ('rule_0', 'First Name: DongYu94, Last Name: Li. Full name: DongYu94 Li'),
        ]
        self.assertEqual(found, wanted)
    def test_SpacyRuleExtractor(self) -> None:
        """Run rules loaded from disk against the title of a sample news page.

        The title is pulled out of ``news.html`` with ``HTMLMetadataExtractor``
        and fed to a ``SpacyRuleExtractor`` built from
        ``sample_spacy_rule.json``; the value of the first extraction must be
        ``'Trump'``.
        """
        hme = HTMLMetadataExtractor()
        with open('etk/unit_tests/ground_truth/news.html', 'r') as f:
            sample_html = f.read()

        # Use a context manager so the rules file is closed deterministically
        # instead of leaking the handle until garbage collection.
        with open('etk/unit_tests/ground_truth/sample_spacy_rule.json') as rules_file:
            sample_rules = json.load(rules_file)

        title_extraction = hme.extract(sample_html,
                                       extract_title=True)[0].value

        sample_rule_extractor = SpacyRuleExtractor(
            spacy.load("en_core_web_sm"), sample_rules, "dummy")
        extractions = sample_rule_extractor.extract(title_extraction)
        expected_extraction = 'Trump'
        self.assertEqual(extractions[0].value, expected_extraction)
Example #9
0
    def test_SpacyRuleExtractor(self) -> None:
        """Run an inline two-rule set over a sample sentence.

        ``rule_3`` is made of two title-case word patterns (the second one
        not required) and ``rule_4`` of a single number pattern with
        minimum "0" and maximum "5".  The (rule_id, value) pair of every
        extraction is compared with a fixed expected list.
        """

        def token_spec(**overrides):
            # Build a brand-new dict (with brand-new lists) on every call so
            # that no two patterns share mutable state.
            spec = {
                "capitalization": [],
                "contain_digit": "true",
                "is_in_output": "true",
                "is_in_vocabulary": "false",
                "is_out_of_vocabulary": "false",
                "is_required": "true",
                "length": [],
                "match_all_forms": "true",
                "maximum": "",
                "minimum": "",
                "numbers": [],
                "part_of_speech": [],
                "prefix": "",
                "shapes": [],
                "suffix": "",
                "token": [],
                "type": "word",
            }
            spec.update(overrides)
            return spec

        def rule_spec(identifier, output_format, pattern):
            # Fields shared by both rules; only the three arguments differ.
            return {
                "dependencies": [],
                "description": "",
                "identifier": identifier,
                "is_active": "true",
                "output_format": output_format,
                "pattern": pattern,
                "polarity": "true",
            }

        name_rule = rule_spec(
            "rule_3",
            "firstName:{1}, lastName:{2}",
            [
                token_spec(capitalization=["title"]),
                token_spec(capitalization=["title"],
                           contain_digit="false",
                           is_required="false"),
            ],
        )
        number_rule = rule_spec(
            "rule_4",
            "number:{1}",
            [token_spec(maximum="5", minimum="0", type="number")],
        )
        sample_rules = {
            "field_name": "test",
            "rules": [name_rule, number_rule],
        }

        extractor = SpacyRuleExtractor(
            spacy.load("en_core_web_sm"), sample_rules, "test_extractor")
        hits = extractor.extract(
            "version 2 of etk, implemented by Runqi12 Shao, Dongyu Li, Sylvia lin, Amandeep and others."
        )
        found = [(hit.rule_id, hit.value) for hit in hits]

        # The last two expected values keep a literal '{2}' placeholder
        # (presumably because the optional second token did not match).
        wanted = [
            ('rule_4', 'number:2'),
            ('rule_3', 'firstName:Runqi12, lastName:Shao'),
            ('rule_3', 'firstName:Dongyu, lastName:Li'),
            ('rule_3', 'firstName:Sylvia, lastName:{2}'),
            ('rule_3', 'firstName:Amandeep, lastName:{2}'),
        ]
        self.assertEqual(found, wanted)