Example #1
0
    def lang2pattern(cls, lang):
        """Compile the case-insensitive pattern for *lang*, right-bounded.

        The matched name may be directly concatenated with a trend
        expression (e.g. a port/tradegood name followed by a trend), so
        "<word-boundary><trend>" is accepted as a right boundary in
        addition to the plain word boundaries.
        """
        from henrique.main.document.price.trend.trend_entity import TrendEntity
        logger = HenriqueLogger.func_level2logger(cls.lang2pattern,
                                                  logging.DEBUG)

        rstr_trend = TrendEntity.lang2rstr(lang)

        # a trend expression may butt up against the name, so treat a
        # trend-prefixed word boundary as a valid right bound too
        right_bounds = lchain(
            RegexTool.bounds2prefixed(RegexTool.right_wordbounds(),
                                      rstr_trend),
            RegexTool.right_wordbounds(),
        )
        rstr_rightbound = RegexTool.rstr2right_bounded(cls.rstr(),
                                                       right_bounds)

        logger.debug({"rstr_rightbound": rstr_rightbound,
                      })
        return re.compile(rstr_rightbound, re.I)
Example #2
0
        def j2valid_trend(j):
            """Return True iff entity index *j* (a trend) completes a valid
            "<portlike|tradegood> <rate> <trend>" run in `entities_list`.

            Checks, in order: enough preceding entities; a latest entity
            exists for every parameter type; exactly one candidate per
            slot; rate directly before trend; portlike/tradegood directly
            before rate; and only blank text between adjacent spans.
            """
            nonlocal entities_list

            # need at least 3 entities before the trend
            # (portlike/tradegood, rate, trend itself at j)
            if j < 3:
                return False

            # latest entity index per parameter type, looking back from j
            j_tuple = j_param_types2j_latest(j, Param.Type.list())
            if any(map(is_none, j_tuple)):
                return False

            entities_tuple = [
                entities_list[j] if j is not None else None for j in j_tuple
            ]
            # each slot must hold exactly one candidate entity
            if any(map(lambda x: len(x) != 1, entities_tuple)):
                return False

            j_portlike, j_tradegood, j_rate, j_trend = j_tuple
            assert_equal(j_trend, j)
            # the rate must come directly before the trend
            if j_rate != j - 1:
                return False

            # a portlike or tradegood must come directly before the rate
            if j - 2 not in {j_portlike, j_tradegood}:
                return False

            entity_portlike, entity_tradegood, entity_rate, entity_trend = map(
                l_singleton2obj, entities_tuple)

            # the portlike slot must be a real port entity
            if FoxylibEntity.entity2type(
                    entity_portlike) != PortEntity.entity_type(
                    ):  # not culture
                return False

            # whichever of portlike/tradegood appears later in the text
            entity_latter = max([entity_portlike, entity_tradegood],
                                key=FoxylibEntity.entity2span)

            span_latter, span_rate, span_trend = lmap(
                FoxylibEntity.entity2span,
                [entity_latter, entity_rate, entity_trend])

            # only whitespace (or nothing) may separate name and rate
            span_latter_rate = SpanTool.span_pair2between(
                span_latter, span_rate)
            str_between_latter_rate = StringTool.str_span2substr(
                text, span_latter_rate)

            if not RegexTool.pattern_str2match_full(
                    RegexTool.pattern_blank_or_nullstr(),
                    str_between_latter_rate):
                return False

            # likewise between rate and trend
            span_rate_trend = SpanTool.span_pair2between(span_rate, span_trend)
            str_between_rate_trend = StringTool.str_span2substr(
                text, span_rate_trend)
            if not RegexTool.pattern_str2match_full(
                    RegexTool.pattern_blank_or_nullstr(),
                    str_between_rate_trend):
                return False

            return True
Example #3
0
    def pattern_hour(cls):
        """Case-insensitive pattern for an hour number (digits).

        Besides the usual right word boundaries, a trailing ":" is
        accepted (e.g. the hour part of "12:30").
        """
        bounds_left = RegexTool.left_wordbounds()
        bounds_right = [*RegexTool.right_wordbounds(), r":"]

        rstr_bounded = RegexTool.rstr2bounded(r"\d+", bounds_left, bounds_right)
        return re.compile(rstr_bounded, re.I)
Example #4
0
    def test_03(self):
        """A right-bounded rstr matches at a word's end only; a
        left-bounded one at a word's start only."""
        rstr = "asdf"

        bounded_right = RegexTool.rstr2right_bounded(
            rstr, RegexTool.right_wordbounds())
        self.assertTrue(re.search(bounded_right, "ijilijasdf"))
        self.assertFalse(re.search(bounded_right, "asdfuhuef"))

        bounded_left = RegexTool.rstr2left_bounded(
            rstr, RegexTool.left_wordbounds())
        self.assertFalse(re.search(bounded_left, "ijilijasdf"))
        self.assertTrue(re.search(bounded_left, "asdfuhuef"))
Example #5
0
    def pattern_number(cls):
        """Pattern for a 1-2 digit number, word-bounded on the left.

        On the right, any timedelta-unit gazetteer entry is accepted as
        a boundary in addition to the word boundaries, so the number may
        be directly followed by a unit.
        """
        rstr_core = RegexTool.rstr2left_bounded(r"\d{1,2}",
                                                RegexTool.left_wordbounds())

        bounds_right = lchain(
            RegexTool.right_wordbounds(),
            lchain(*TimedeltaEntityUnit.gazetteer_all().values()),
        )
        rstr_bounded = RegexTool.rstr2right_bounded(rstr_core, bounds_right)
        return re.compile(rstr_bounded, re.I)
Example #6
0
    def _pattern_token(cls):
        """Tokenizer pattern: runs of word chars or of non-word chars.

        "n't" is special-cased because splitting on the word boundary
        would yield tokens/morphemes that don't make sense.

        e.g. don't => don / ' / t   ("don" is not a valid token/morpheme)
        e.g. jane's => jane / ' / s  ("jane", "'", "s" are all valid morphemes)

        Correct morphology would be don't => do / n't, but we don't need
        to go that far — treating "don't" as a single token is fine for
        most purposes.
        """
        alternatives = [r"\w+(?:n't)", r"\w+", r"\W+"]
        rstr_or = RegexTool.join(r"|", alternatives)
        return re.compile(RegexTool.rstr2wrapped(rstr_or))
Example #7
0
 def rstr(cls):
     """Regex source for a cardinal number optionally followed by a
     metric prefix, as named groups "cardinal" and "Metricprefix".
     """
     # r"\d+" (raw) — the original plain "\d+" relies on the invalid
     # escape sequence "\d", which is deprecated in Python 3.
     rstr = format_str(
         r"{}\s*{}?",
         RegexTool.name_rstr2named(
             "cardinal",
             r"\d+",
         ),
         RegexTool.name_rstr2named(
             "Metricprefix",
             Metricprefix.rstr(),
         ),
     )
     return rstr
Example #8
0
    def pattern_rate_trend(cls):
        """Word-bounded, case-insensitive pattern for "<rate><arrow>".

        Matches a 2-3 digit rate immediately followed by one of the
        trend arrow characters (escaped literally).
        """
        rstr_arrows = RegexTool.rstr_iter2or(
            map(re.escape,
                Trend.dict_trend2arrow().values()))
        # rate digits and arrow are concatenated with no separator
        rstr_rate_trend = RegexTool.join(r"", [r"\d{2,3}", rstr_arrows])

        pattern = re.compile(RegexTool.rstr2wordbounded(rstr_rate_trend), re.I)
        return pattern
Example #9
0
    def lang2pattern(cls, lang):
        """Compile the case-insensitive pattern for *lang*.

        Left bounds: whitespace, or the final character of a rate
        expression (so this token may directly follow a rate).
        Right bounds: the usual word boundaries.
        """
        from henrique.main.document.price.rate.rate_entity import RateEntity
        logger = HenriqueLogger.func_level2logger(cls.lang2pattern,
                                                  logging.DEBUG)

        bounds_left = [
            RateEntity.rstr_last_char(),
            r"\s",
        ]
        rstr = RegexTool.rstr2bounded(
            cls.lang2rstr(lang), bounds_left, RegexTool.right_wordbounds())

        logger.debug({"left_bounds": bounds_left, "rstr": rstr})
        return re.compile(rstr, re.I)
Example #10
0
    def entity_pair2is_appendable(
        cls,
        text,
        entity_pair,
    ):
        """True iff the two entities are the same "appendable" kind
        (both portlike or both tradegood) and are separated in *text*
        only by a delimiter.
        """
        Param = PriceSkillParameter

        param_types = [
            Param.Type.entity_type2parameter_type(FoxylibEntity.entity2type(e))
            for e in entity_pair
        ]

        # both must be portlike or tradegood ...
        appendable_types = {Param.Type.PORTLIKE, Param.Type.TRADEGOOD}
        if any(t not in appendable_types for t in param_types):
            return False

        # ... and of the same kind
        type_1, type_2 = param_types
        if type_1 != type_2:
            return False

        # only a delimiter may sit between the two spans
        spans = lmap(FoxylibEntity.entity2span, entity_pair)
        text_between = StringTool.str_span2substr(
            text, SpanTool.span_pair2between(*spans))
        if not RegexTool.pattern_str2match_full(
                Param.pattern_delim(), text_between):
            return False

        return True
Example #11
0
    def dict2f_sub(cls, h):
        """Return a function that replaces every key of *h* found in a
        string with its corresponding value.

        Fixes:
        - the original passed an already-joined alternation as the sole
          argument to ``RegexTool.join``, whose convention elsewhere is
          ``join(sep, iterable)``; the alternation is now built directly.
        - keys are alternated longest-first so overlapping keys (e.g.
          "ab" vs "abc") match the longest candidate, regardless of
          dict insertion order.
        """
        # Create a regular expression from all of the dictionary keys.
        # Longest-first makes the alternation deterministic for
        # overlapping keys (re tries alternatives left to right).
        keys = sorted(h, key=len, reverse=True)
        p = re.compile(r"|".join(map(re.escape, keys)))

        # For each match, look up the corresponding value in the dictionary
        return lambda x: p.sub(lambda m: h[m.group(0)], x)
Example #12
0
    def str_span_pattern2match_full(cls, str_in, span, pattern):
        """Full-match *pattern* against the substring of *str_in* at *span*.

        Returns the match object, or None when the span yields no
        substring or the pattern does not fully match it.
        """
        from foxylib.tools.regex.regex_tool import RegexTool

        substr = cls.str_span2substr(str_in, span)
        if substr is None:
            return None
        return RegexTool.pattern_str2match_full(pattern, substr)
Example #13
0
    def str2token_span_list(cls, str_in):
        """Return the (start, end) spans of every non-blank token in
        *str_in*, as produced by the class tokenizer pattern."""
        pattern_token = cls._pattern_token()
        pattern_blank = cls._pattern_blank()

        def is_token(m):
            # drop whitespace-only matches
            return not RegexTool.pattern_str2match_full(pattern_blank,
                                                        m.group())

        return [m.span() for m in pattern_token.finditer(str_in) if is_token(m)]
Example #14
0
    def lang2pattern(cls, lang):
        """Compile a case-insensitive alternation of every phrase known
        for the languages recognizable from *lang*."""
        j_me = cls.j_yaml()

        # collect phrases for every recognizable language, not just `lang`
        langs = HenriqueLocale.lang2langs_recognizable(lang)
        texts = [
            text for lang_each in langs for text in j_me.get(lang_each, [])
        ]
        return re.compile(RegexTool.rstr_iter2or(map(re.escape, texts)), re.I)
Example #15
0
    def test_02(self):
        """pattern_str2match_full returns a match iff the pattern
        consumes the whole string."""
        str_in = "hello world"

        # (rstr, flags, expects a full match?)
        cases = [
            (r"\w+ \w+", 0, True),
            (r"\w+ \w", 0, False),
            (r"\w \w+", 0, False),
            (r"\w* \w*", 0, True),
            (r"H\w* \w*D", re.I, True),
        ]
        for rstr, flags, expected in cases:
            pattern = re.compile(rstr, flags)
            m = RegexTool.pattern_str2match_full(pattern, str_in)
            if expected:
                self.assertIsNotNone(m)
            else:
                self.assertIsNone(m)
Example #16
0
    def str2token_span_list(cls, str_in):
        """Return (start, end) spans of the non-blank tokens of *str_in*.

        Tokenization runs on the "typable" transliteration of the input;
        the length assertion guarantees those spans are also valid for
        the original string.
        """
        str_typable = UnicodeTool.str2typable(str_in)
        assert_equal(len(str_in), len(str_typable))

        pattern_blank = cls._pattern_blank()

        spans = [
            m.span()
            for m in cls._pattern_token().finditer(str_typable)
            # keep only matches that are not purely whitespace
            if not RegexTool.pattern_str2is_fullmatch(pattern_blank, m.group())
        ]
        return spans
Example #17
0
    def test_2(self):
        """Benchmark: the "n't" special-case alternative in the token
        pattern must not slow matching by more than 5x versus the plain
        word/non-word split.

        NOTE(review): timing-sensitive assertion — code left unchanged
        since restructuring could perturb the measured ratio.
        """
        text = """
        What is Lorem Ipsum?
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.

Why do we use it?
It is a long established fact that a reader will be distracted by the readable content of a page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like).


Where does it come from?
Contrary to popular belief, Lorem Ipsum is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32.

The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham.

Where can I get some?
There are many variations of passages of Lorem Ipsum available, but the majority have suffered alteration in some form, by injected humour, or randomised words which don't look even slightly believable. If you are going to use a passage of Lorem Ipsum, you need to be sure there isn't anything embarrassing hidden in the middle of text. All the Lorem Ipsum generators on the Internet tend to repeat predefined chunks as necessary, making this the first true generator on the Internet. It uses a dictionary of over 200 Latin words, combined with a handful of model sentence structures, to generate Lorem Ipsum which looks reasonable. The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc.
"""

        # p1: with the "n't" special case; p2: plain word/non-word split
        p1 = re.compile(RegexTool.join(r"|", [r"\w+(?:n't)", r"\w+", r"\W+"]),
                        re.I)
        p2 = re.compile(RegexTool.join(r"|", [r"\w+", r"\W+"]), re.I)

        def func2time(f):
            # wall-clock seconds spent running f()
            t_start = time()
            f()
            t_end = time()
            return t_end - t_start

        t_dont = func2time(lambda: list(p1.finditer(text)))
        t_simple = func2time(lambda: list(p2.finditer(text)))

        pprint({
            "t_dont": t_dont,
            "t_simple": t_simple,
            "t_dont/t_simple": t_dont / t_simple,
        })
        self.assertLess(t_dont / t_simple, 5)
Example #18
0
        def texts2pattern(texts):
            """Compile a case-insensitive, bounded alternation of *texts*.

            A left word boundary followed by a digit also counts as a
            left boundary, so a text may directly follow a number.
            """
            rstr_raw = RegexTool.rstr_iter2or(map(re.escape, texts))

            # r"\d" (raw) — the original plain "\d" relied on the invalid
            # escape sequence "\d", deprecated in Python 3.
            left_bounds = lchain(
                RegexTool.bounds2suffixed(RegexTool.left_wordbounds(), r"\d"),
                RegexTool.left_wordbounds(),
            )
            right_bounds = RegexTool.right_wordbounds()

            rstr = RegexTool.rstr2bounded(rstr_raw, left_bounds, right_bounds)
            logger.debug({
                "rstr": rstr,
                "rstr_raw": rstr_raw,
            })
            return re.compile(rstr, re.I)
Example #19
0
    def pattern_suffix(cls):
        """Pattern for a number optionally suffixed by "시" (Korean hour
        marker) on its right boundary.

        Bug fix: the original referenced `rstr_rightbound`, which was
        never defined (the variable is `rstr_rightbounded`), so calling
        this method raised NameError. The unused `left_bounds` local has
        also been removed.
        """
        right_bounds = lchain(
            RegexTool.right_wordbounds(),
            [
                RegexTool.bound2prefixed(b, r"시")
                for b in RegexTool.right_wordbounds()
            ],
        )

        rstr_rightbounded = RegexTool.rstr2right_bounded(r"\d+", right_bounds)

        def bound_iter_left():
            # each word boundary, plus that boundary with a {1,2}
            # repetition quantifier appended
            b_list_raw = RegexTool.left_wordbounds()
            for b in b_list_raw:
                yield b
                yield r"{}{}".format(b, r"{1,2}")

        rstr_bound = RegexTool.rstr2left_bounded(rstr_rightbounded,
                                                 list(bound_iter_left()))

        return re.compile(rstr_bound)
Example #20
0
 def texts2pattern_word(cls, texts):
     """Compile a word-bounded pattern matching any of *texts*."""
     rstr = cls.texts2regex(texts)
     rstr_bounded = RegexTool.rstr2bounded(
         rstr, RegexTool.left_wordbounds(), RegexTool.right_wordbounds())
     # re.I can be dealt with normalizer
     return re.compile(rstr_bounded)
Example #21
0
 def bound_iter_left():
     """Yield each left word boundary, then that boundary with a {1,2}
     repetition quantifier appended."""
     for bound in RegexTool.left_wordbounds():
         yield bound
         yield bound + r"{1,2}"
Example #22
0
    def j_yaml2p_command(cls, j_yaml):
        """Compile a case-insensitive, end-anchored pattern matching any
        default action name listed in *j_yaml*, in any language."""
        names = lchain(*j_yaml.get("default_action_names").values())
        rstr = RegexTool.rstr_iter2or(names)
        return re.compile("{}$".format(rstr), re.I)
Example #23
0
 def rstr_last_char(cls):
     """Regex source matching the final character of a rate expression:
     a digit or a metric-prefix character."""
     return RegexTool.rstr_iter2or([r"\d", Metricprefix.rstr()])
Example #24
0
 def gap2valid(span):
     """Match iff the gap at *span* of `text_in` is exactly a colon."""
     gap = SpanTool.list_span2sublist(text_in, span)
     return RegexTool.pattern_str2match_full(cls.pattern_colon(), gap)
Example #25
0
 def rstr(cls):
     """Regex source: alternation of every literal value, escaped."""
     escaped = map(re.escape, cls.Value.set())
     return RegexTool.rstr_iter2or(escaped)
Example #26
0
 def texts2regex(cls, texts):
     """Regex source matching any of *texts*, taken literally."""
     return RegexTool.rstr_iter2or(re.escape(t) for t in texts)
Example #27
0
 def pattern_variation(cls):
     """Pattern matching any single- or double-quote variant string."""
     variations = lchain(cls.string_singlequote(), cls.string_doublequote())
     rstr = RegexTool.rstr_iter2or(map(re.escape, variations))
     return re.compile(rstr)
Example #28
0
    def rstr(cls):
        """Regex source for a non-negative integer without leading zeros.

        The multi-digit alternative comes first so the whole number is
        preferred over a single-digit prefix match.
        """
        return RegexTool.rstr_iter2or([r"[1-9][0-9]+", r"[0-9]"])
Example #29
0
 def str_span2match_blank_or_nullstr(cls, str_in, span):
     """Full-match the substring of *str_in* at *span* against the
     blank-or-empty pattern; return the match or None."""
     from foxylib.tools.regex.regex_tool import RegexTool
     pattern = RegexTool.pattern_blank_or_nullstr()
     return cls.str_span_pattern2match_full(str_in, span, pattern)
Example #30
0
 def pattern_ko(cls):
     """Word-bounded pattern for the Korean alias "육메" with an
     optional trailing 크 or 클."""
     rstr = RegexTool.rstr2wordbounded(r"육메(?:크|클)?")
     return re.compile(rstr)