def make_expr(self, init_state=None, tries=10, test_output=True, skip_re=r"", probability=False):
    """Try up to ``tries`` times to produce an acceptable expression.

    Args:
        init_state: Optional seed text. When truthy it is NFD-normalised;
            its BEGIN-stripped form is prepended to every candidate and its
            last ``self.state_size`` characters (left-padded with BEGIN)
            are handed to ``self.chain.walk``.
        tries: Maximum number of candidates to generate.
        test_output: When true, a candidate must pass
            ``self.test_expr_output`` and, if ``skip_re`` is non-empty,
            must not match it. When false, the first candidate is returned
            and ``skip_re`` is not consulted.
        skip_re: Regular expression (NFD-normalised before use); candidates
            it matches are rejected.
        probability: Forwarded to ``self.chain.walk``. When true, ``walk``
            is expected to return an ``(expr, prob)`` pair and this method
            returns one as well.

    Returns:
        The accepted expression, or ``(expression, prob)`` when
        ``probability`` is true; ``None`` if no candidate was accepted.
    """
    # Derive the prefix and the walk state once. Doing this inside the retry
    # loop would recompute the prefix from the already-truncated state, so
    # retries would drop the leading part of a seed longer than state_size.
    if init_state:
        seed = unicodedata.normalize("NFD", init_state)
        prefix = seed.strip(BEGIN)
        state = seed.rjust(self.state_size, BEGIN)[-self.state_size:]
    else:
        prefix = ''
        state = init_state

    # Normalising the pattern is loop-invariant; the search itself stays lazy.
    skip_pattern = unicodedata.normalize("NFD", skip_re) if skip_re else skip_re

    for _ in range(tries):
        try:
            if probability:
                expr, prob = self.chain.walk(state, probability)
                expr = prefix + expr
            else:
                expr = prefix + self.chain.walk(state, probability)
        except KeyError:
            # A KeyError from the walk yields an empty candidate with zero
            # probability (no prefix), which is then filtered like any other.
            expr, prob = "", 0

        if test_output:
            if not self.test_expr_output(expr):
                continue
            if skip_pattern and re.search(skip_pattern, expr):
                continue

        if probability:
            return expr, prob
        return expr

    return None
def build(self, corpus, state_size):
    """Accumulate weighted character-transition counts from ``corpus``.

    Each ``(run, score)`` pair is NFD-normalised, prefixed with
    ``state_size`` copies of BEGIN and suffixed with END. Every window of
    ``state_size`` characters is a state, and the character right after the
    window has ``score`` added to its tally for that state.

    Returns:
        A plain ``dict`` mapping state -> ``dict`` of following character ->
        summed score.
    """
    tallies = defaultdict(lambda: defaultdict(int))
    for run, score in corpus:
        text = unicodedata.normalize("NFD", run)
        padded = (BEGIN * state_size) + text + END
        # One window per character of the text plus one more that ends on END.
        for start in range(len(text) + 1):
            stop = start + state_size
            window = padded[start:stop]
            successor = padded[stop]
            tallies[window][successor] += score
    # Strip the defaultdict wrappers so missing keys raise for consumers.
    return {window: dict(successors) for window, successors in tallies.items()}
def expr_prob(self, expr):
    """Return the product of ``self.chain.prob`` over every transition in ``expr``.

    ``expr`` is NFD-normalised, prefixed with ``self.state_size`` copies of
    BEGIN and suffixed with END. Each window of ``self.state_size``
    characters, paired with the character that follows it, is passed to
    ``self.chain.prob`` and the results are multiplied together.
    """
    normalized = unicodedata.normalize("NFD", expr)
    prepped_expr = BEGIN * self.state_size + normalized + END
    output = 1
    # Iterate over the *normalised* length: NFD can lengthen the text, and
    # using the raw input length would skip the final transitions,
    # including the one into END.
    for i in range(len(normalized) + 1):
        window_end = i + self.state_size
        output *= self.chain.prob(prepped_expr[i:window_end], prepped_expr[window_end])
    return output
def __init__(self, uid, state_size, expr_score_list, chain=None):
    """Store the identifiers, the normalised expression set and the chain.

    Args:
        uid: Stored unchanged as ``self.uid``.
        state_size: Stored as ``self.state_size`` and passed to ``PLChain``
            when a chain has to be built.
        expr_score_list: Iterable whose items expose the expression text at
            index 0; the NFD-normalised texts are collected into
            ``self.expr_set``.
        chain: Existing chain to reuse. Any falsy value causes a new
            ``PLChain(expr_score_list, state_size)`` to be created.
    """
    self.uid = uid
    self.state_size = state_size

    known = set()
    for entry in expr_score_list:
        known.add(unicodedata.normalize("NFD", entry[0]))
    self.expr_set = known

    if not chain:
        chain = PLChain(expr_score_list, state_size)
    self.chain = chain
def test_bug_834676(self): # Check for bug 834676 unicodedata.normalize('NFC', '\ud55c\uae00')
def test_edge_cases(self): self.assertRaises(TypeError, unicodedata.normalize) self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx') self.assertEqual(unicodedata.normalize('NFKC', ''), '')
def NFKD(str):
    """Return ``str`` normalised to Unicode form NFKD."""
    # The parameter name shadows the builtin; it is kept because callers may
    # pass it by keyword.
    form = "NFKD"
    return unicodedata.normalize(form, str)
def NFC(str):
    """Return ``str`` normalised to Unicode form NFC."""
    # The parameter name shadows the builtin; it is kept because callers may
    # pass it by keyword.
    form = "NFC"
    return unicodedata.normalize(form, str)