def test_new_model_order():
    """An empty model has no order; the test file loads as order 5 with every parser."""
    empty_model = ARPAModelSimple()
    assert empty_model.order() is None

    for parser in PARSERS:
        loaded_models = arpa.loadf(TEST_ARPA, parser=parser)
        first_model = loaded_models[0]
        assert first_model.order() == 5
def cal_A_adapted_arpa(B, f_B_star, B_hist_index, alpha, z_epsilon):
    """Build the adapted ARPA model from the source model ``B``.

    Unigram probabilities are ``p(B, w) * alpha[w[0]] / z_epsilon``. For each
    order ``n >= 2``, every history ``h`` in ``B_hist_index[n - 1]`` gets its
    back-off weight rescaled by ``z(h[1:]) / z(h)``, and every indexed
    continuation ``h + (w,)`` gets the probability returned by
    ``cal_p_A_old``. The n-gram counts of ``B`` are copied over unchanged.

    Args:
        B: Source model; must provide ``_entries``, ``_base``, ``_bos``,
            ``order()`` and ``counts()``.
        f_B_star: Forwarded unchanged to ``cal_z``.
        B_hist_index: Indexable by history length; each item maps a history
            tuple to an iterable of the words that follow it.
        alpha: Scaling factors, indexed by the first element of a unigram.
        z_epsilon: Normaliser used for the empty history (unigram level).

    Returns:
        A new ``ARPAModelSimple`` holding the adapted entries, back-off
        weights and counts.

    Raises:
        AssertionError: If the adapted unigram probabilities do not sum to 1
            within a tolerance of 1e-4.
    """
    # Module-level normaliser caches: zh1 holds values for the previous
    # (shorter) history length, zh2 for the current one.
    # NOTE(review): zh2 is never written inside this function after seeding;
    # presumably cal_z populates it -- confirm against cal_z.
    global zh1, zh2

    A_adapted = ARPAModelSimple()
    base = B._base

    # unigram
    print("processing unigram...")
    check_sum = 0
    for e in B._entries(1):
        w = e[1]
        p_A_w = float(p(B, w)) * alpha[w[0]] / z_epsilon
        A_adapted.add_entry(ngram=w, p=math.log(p_A_w, base))
        check_sum += p_A_w
    # Two-sided check: the previous one-sided comparison silently accepted
    # any total below 1, however far off.
    assert abs(check_sum - 1) < 0.0001, check_sum

    # Seed the cache so that bigram histories find z for the empty suffix.
    zh2[tuple()] = z_epsilon

    # ngram, n >= 2
    for n in range(2, B.order() + 1):
        print("processing %d-gram..." % n)
        progress_count = 0
        # Rotate the caches: drop the oldest level, promote the current one.
        zh1.clear()
        zh1 = zh2
        zh2 = dict()
        len_h = n - 1
        for h, w_list in B_hist_index[len_h].items():
            z_h = cal_z(h, B, f_B_star, B_hist_index, alpha, z_epsilon)
            # Normaliser of the shortened history; fall back to computing it
            # when the previous level did not cache it.
            z_h_prime = zh1.get(h[1:], None)
            if z_h_prime is None:
                z_h_prime = cal_z(h[1:], B, f_B_star, B_hist_index, alpha,
                                  z_epsilon)
            # Back-off weights are stored in log space: leave it, rescale,
            # and convert back.
            bow_A_h = (base ** float(B._bos[h])) * z_h_prime / z_h
            A_adapted._bos[h] = math.log(bow_A_h, base)
            for w in w_list:
                hw = h + (w, )
                p_A_hw = cal_p_A_old(hw, alpha, B, z_h)
                A_adapted.add_entry(ngram=hw, p=math.log(p_A_hw, base))
                progress_count += 1
                if progress_count % 1000000 == 0:
                    print(progress_count)

    for order, count in B.counts():
        A_adapted.add_count(order, count)
    return A_adapted
def test_new_model_log_s():
    """Calling ``log_p`` with an integer on an empty model raises ValueError."""
    # NOTE(review): the test name mentions log_s but the call is log_p --
    # confirm which method was intended.
    model = ARPAModelSimple()
    bad_input = 1
    with pytest.raises(ValueError):
        model.log_p(bad_input)
def test_new_model_counts():
    """A freshly constructed model reports an empty list of counts."""
    model = ARPAModelSimple()
    observed = model.counts()
    assert observed == []
def test_new_model_contains():
    """A unigram added as a one-element list is found via ``in``."""
    model = ARPAModelSimple()
    ngram = ["foo"]
    model.add_entry(ngram, 1.0)
    is_present = "foo" in model
    assert is_present
def test_input_equality():
    """String and tuple forms of an n-gram behave the same in ``p``.

    On an empty model both forms raise KeyError; on the loaded test model
    both forms return equal probabilities.
    """
    # Each pair is (string form, equivalent tuple form), checked in order.
    equivalent_inputs = (
        ('foo', ('foo', )),
        ('xxx', ('xxx', )),
        ('a little', ('a', 'little')),
        ('xxx little', ('xxx', 'little')),
    )

    empty_model = ARPAModelSimple()
    for as_text, as_tuple in equivalent_inputs:
        with pytest.raises(KeyError):
            assert empty_model.p(as_text) == empty_model.p(as_tuple)

    loaded_model = arpa.loadf(TEST_ARPA)[0]
    for as_text, as_tuple in equivalent_inputs:
        assert loaded_model.p(as_text) == loaded_model.p(as_tuple)
def test_log_s_int():
    """``log_s`` rejects an integer argument with ValueError."""
    model = ARPAModelSimple()
    not_a_sentence = 1
    with pytest.raises(ValueError):
        model.log_s(not_a_sentence)
def test_log_p_empty_string():
    """``log_p`` rejects the empty string with ValueError."""
    model = ARPAModelSimple()
    empty_text = ''
    with pytest.raises(ValueError):
        model.log_p(empty_text)
def test_new_model_vocabulary():
    """A freshly constructed model has an empty vocabulary list."""
    model = ARPAModelSimple()
    words = model.vocabulary()
    assert words == []
def test_log_p_raw():
    """``log_p_raw`` raises KeyError for an n-gram the empty model lacks."""
    model = ARPAModelSimple()
    unknown_word = 'UnladenSwallow'
    with pytest.raises(KeyError):
        model.log_p_raw(unknown_word)
def test_new_model_contains():
    """A unigram added as a one-element tuple is found via ``in``."""
    model = ARPAModelSimple()
    ngram = ('foo', )
    model.add_entry(ngram, 1.0)
    is_present = 'foo' in model
    assert is_present
def test_log_p_empty_tuple():
    """``log_p`` rejects an empty tuple with ValueError."""
    model = ARPAModelSimple()
    empty_ngram = ()
    with pytest.raises(ValueError):
        model.log_p(empty_ngram)
def test_new_model_len():
    """A freshly constructed model has length zero."""
    model = ARPAModelSimple()
    size = len(model)
    assert size == 0
def test_new_model_order():
    """A freshly constructed model reports ``None`` as its order."""
    model = ARPAModelSimple()
    reported_order = model.order()
    assert reported_order is None
def test_new_model_contains_not():
    """An empty model does not contain an arbitrary word."""
    model = ARPAModelSimple()
    is_present = "foo" in model
    assert not is_present