Example #1
0
    def test_04(self):
        """Match with the default config (no normalizer supplied).

        The expected result contains only the span (18, 24), i.e. "radish":
        neither the value key "ReD" itself nor the differently-cased alias
        "scarleTT" is expected to produce a span for this input.
        """
        matcher = GazetteerMatcher({"ReD": ["scarleTT", "radish"]})

        actual = list(matcher.text2span_value_iter("ReD scarlett blue radish"))
        expected = [((18, 24), 'ReD')]

        self.assertEqual(actual, expected)
Example #2
0
    def test_02(self):
        """Match with ``str2lower`` configured as the normalizer.

        With the normalizer in place both aliases are expected to match:
        "scarlett" at (4, 12) and "radish" at (18, 24). Each span is reported
        with the original value key "ReD". The leading "red" is not expected
        to match because the key is not listed among its own aliases.
        """
        value2aliases = {"ReD": ["scarleTT", "radish"]}
        matcher = GazetteerMatcher(value2aliases, config={"normalizer": str2lower})

        actual = list(matcher.text2span_value_iter("red scarlett blue radish"))
        expected = [((4, 12), 'ReD'), ((18, 24), 'ReD')]

        self.assertEqual(actual, expected)
Example #3
0
    def test_03(self):
        """Match after passing the gazetteer through ``DictTool.append_key2values``.

        NOTE(review): ``append_key2values`` presumably adds each key to its
        own alias list -- confirm against DictTool. The expectation is
        consistent with that: "ReD" at (0, 3) is now expected in addition to
        "radish" at (18, 24), while lowercase "scarlett" is still not.
        """
        value2aliases = DictTool.append_key2values({"ReD": ["scarleTT", "radish"]})
        matcher = GazetteerMatcher(value2aliases)

        actual = list(matcher.text2span_value_iter("ReD scarlett blue radish"))
        expected = [((0, 3), 'ReD'), ((18, 24), 'ReD')]

        self.assertEqual(actual, expected)
Example #4
0
    def test_05(self):
        """Two distinct values sharing the alias "black" are both reported.

        The span (0, 5) is expected once per value. Results are compared as
        sets, so the order in which the matcher yields them is not checked.
        """
        value2aliases = {
            "black beauty": ["black"],
            "black ugly": ["black"],
        }
        matcher = GazetteerMatcher(value2aliases)

        actual = set(matcher.text2span_value_iter("black beauty ugly"))
        expected = {
            ((0, 5), 'black ugly'),
            ((0, 5), 'black beauty'),
        }

        self.assertEqual(actual, expected)
Example #5
0
    def _langs2matcher(cls, langs):
        """Build a ``GazetteerMatcher`` over the gazetteer for ``langs``.

        The matcher is configured with ``cls.normalize`` as its normalizer and
        with a custom texts-to-pattern function that compiles a
        case-insensitive, boundary-wrapped alternation of the escaped texts.

        :param langs: languages forwarded to ``cls.langs2gazetteer``.
        :return: the configured ``GazetteerMatcher``.
        """
        logger = HenriqueLogger.func_level2logger(cls._langs2matcher,
                                                  logging.DEBUG)

        gazetteer = cls.langs2gazetteer(langs)

        def texts2pattern(texts):
            """Compile ``texts`` into one case-insensitive bounded regex."""
            # Escape every text so it is matched literally, then OR them.
            rstr_raw = RegexTool.rstr_iter2or(map(re.escape, texts))

            # Raw string: "\d" in a normal literal is an invalid escape
            # sequence (DeprecationWarning, SyntaxWarning since Python 3.12).
            # NOTE(review): the digit-suffixed bounds presumably let a match
            # start right after a number -- confirm against RegexTool.
            left_bounds = lchain(
                RegexTool.bounds2suffixed(RegexTool.left_wordbounds(), r"\d"),
                RegexTool.left_wordbounds(),
            )
            right_bounds = RegexTool.right_wordbounds()

            rstr = RegexTool.rstr2bounded(rstr_raw, left_bounds, right_bounds)
            logger.debug({
                "rstr": rstr,
                "rstr_raw": rstr_raw,
            })
            return re.compile(rstr, re.I)

        config = {
            GazetteerMatcher.Config.Key.TEXTS2PATTERN: texts2pattern,
            GazetteerMatcher.Config.Key.NORMALIZER: cls.normalize,
        }
        matcher = GazetteerMatcher(gazetteer, config=config)
        return matcher
Example #6
0
    def lang2matcher(cls, lang):
        """Build a ``GazetteerMatcher`` mapping port codenames to their aliases.

        Aliases are gathered for every language recognizable from ``lang``.
        The matcher normalizes with ``cls.text2norm`` and builds its pattern
        with ``HenriqueEntity.texts2pattern_port_tradegood``.

        :param lang: language used to look up the recognizable languages.
        :return: the configured ``GazetteerMatcher``.
        """
        recognizable_langs = HenriqueLocale.lang2langs_recognizable(lang)

        # One single-entry dict per port, merged below.
        # NOTE(review): vwrite_no_duplicate_key presumably rejects repeated
        # codenames -- confirm against its definition.
        per_port = [
            {Port.port2codename(port): Port.port_langs2aliases(port, recognizable_langs)}
            for port in Port.list_all()
        ]
        codename2aliases = merge_dicts(per_port, vwrite=vwrite_no_duplicate_key)

        return GazetteerMatcher(codename2aliases, {
            GazetteerMatcher.Config.Key.NORMALIZER: cls.text2norm,
            GazetteerMatcher.Config.Key.TEXTS2PATTERN: HenriqueEntity.texts2pattern_port_tradegood,
        })
Example #7
0
    def matcher_names(cls):
        """Build a ``GazetteerMatcher`` mapping chatroom-user codenames to aliases.

        Only the normalizer (``cls.text2norm``) is configured; no
        TEXTS2PATTERN entry is supplied.

        :return: the configured ``GazetteerMatcher``.
        """
        # One single-entry dict per chatroom user, merged below.
        per_user = [
            {Chatroomuser.chatroomuser2codename(user): Chatroomuser.chatroomuser2aliases(user)}
            for user in Chatroomuser.list_all()
        ]
        codename2aliases = merge_dicts(per_user, vwrite=vwrite_no_duplicate_key)

        # A TEXTS2PATTERN override
        # (HenriqueEntity.texts2rstr_word_with_cardinal_suffix) was left
        # disabled here in the original code.
        return GazetteerMatcher(codename2aliases, {
            GazetteerMatcher.Config.Key.NORMALIZER: cls.text2norm,
        })
Example #8
0
    def lang2matcher(cls, lang):
        """Build a ``GazetteerMatcher`` mapping server codenames to their aliases.

        Aliases are gathered for every language recognizable from ``lang``;
        the matcher normalizes text with ``cls.text2norm``.

        :param lang: language used to look up the recognizable languages.
        :return: the configured ``GazetteerMatcher``.
        """
        recognizable_langs = HenriqueLocale.lang2langs_recognizable(lang)

        def _single_entry(server):
            """Return ``{codename: aliases}`` for one server."""
            aliases = Server.server_langs2aliases(server, recognizable_langs)
            return {Server.server2codename(server): aliases}

        # Lazily produce one dict per server and merge them.
        codename2aliases = merge_dicts(
            (_single_entry(server) for server in Server.list_all()),
            vwrite=vwrite_no_duplicate_key,
        )
        assert_is_not_none(codename2aliases)

        return GazetteerMatcher(
            codename2aliases,
            {GazetteerMatcher.Config.Key.NORMALIZER: cls.text2norm},
        )
Example #9
0
    def lang2matcher(cls, lang):
        """Build a ``GazetteerMatcher`` mapping each codename to its texts.

        For every codename in ``cls.codenames()`` the aliases of all
        languages recognizable from ``lang`` are concatenated, in language
        order. Languages with no (or empty) aliases for a codename contribute
        nothing.

        :param lang: language used to look up the recognizable languages.
        :return: the configured ``GazetteerMatcher``.
        """
        recognizable_langs = HenriqueLocale.lang2langs_recognizable(lang)
        lang2codename2aliases = cls.dict_lang2codename2aliases()

        def _texts_of(codename):
            """Yield the aliases of ``codename`` across the recognizable languages."""
            for _lang in recognizable_langs:
                found = JsonTool.down(lang2codename2aliases, [_lang, codename])
                if found:
                    yield from found

        codename2texts = {
            codename: list(_texts_of(codename))
            for codename in cls.codenames()
        }

        return GazetteerMatcher(
            codename2texts,
            {GazetteerMatcher.Config.Key.NORMALIZER: cls.text2norm},
        )
Example #10
0
    def lang2matcher(cls, lang):
        """Build a ``GazetteerMatcher`` mapping tradegood-type codenames to aliases.

        Each type's aliases are collected across every language recognizable
        from ``lang``. Only the normalizer (``cls.text2norm``) is configured;
        no TEXTS2PATTERN entry is supplied.

        :param lang: language used to look up the recognizable languages.
        :return: the configured ``GazetteerMatcher``.
        """
        all_types = Tradegoodtype.list_all()
        recognizable_langs = HenriqueLocale.lang2langs_recognizable(lang)

        # One single-entry dict per tradegood type; aliases are flattened
        # over the recognizable languages, in language order.
        per_type = [
            {
                Tradegoodtype.tradegoodtype2codename(tgt): [
                    alias
                    for _lang in recognizable_langs
                    for alias in Tradegoodtype.tradegoodtype_lang2aliases(tgt, _lang)
                ]
            }
            for tgt in all_types
        ]
        value2aliases = merge_dicts(per_type, vwrite=vwrite_no_duplicate_key)

        # A TEXTS2PATTERN override
        # (HenriqueEntity.texts2rstr_word_with_cardinal_suffix) was left
        # disabled here in the original code.
        return GazetteerMatcher(value2aliases, {
            GazetteerMatcher.Config.Key.NORMALIZER: cls.text2norm,
        })
Example #11
0
    def lang2matcher(cls, lang):
        """Build a ``GazetteerMatcher`` mapping tradegood codenames to aliases.

        Each tradegood's aliases are collected across every language
        recognizable from ``lang``. The matcher normalizes with
        ``cls.text2norm`` and builds its pattern with
        ``HenriqueEntity.texts2pattern_port_tradegood``.

        :param lang: language used to look up the recognizable languages.
        :return: the configured ``GazetteerMatcher``.
        """
        all_tradegoods = Tradegood.list_all()
        recognizable_langs = HenriqueLocale.lang2langs_recognizable(lang)

        def _aliases_of(tg):
            """Return the aliases of ``tg`` across the recognizable languages."""
            return [
                alias
                for _lang in recognizable_langs
                for alias in Tradegood.tradegood_lang2aliases(tg, _lang)
            ]

        per_tradegood = [
            {Tradegood.tradegood2codename(tg): _aliases_of(tg)}
            for tg in all_tradegoods
        ]
        value2aliases = merge_dicts(per_tradegood, vwrite=vwrite_no_duplicate_key)

        return GazetteerMatcher(value2aliases, {
            GazetteerMatcher.Config.Key.NORMALIZER: cls.text2norm,
            GazetteerMatcher.Config.Key.TEXTS2PATTERN: HenriqueEntity.texts2pattern_port_tradegood,
        })