def load_adaptation_sample(filename):
    """Load the adaptation-sample model from an ARPA file.

    Args:
        filename: Path to the ARPA file. A name ending in ".gz" is opened
            through ``gzip`` in text mode and handed to ``arpa.load``; any
            other name is passed to ``arpa.loadf``.

    Returns:
        The first model of the list returned by the loader.
    """
    if filename.endswith(".gz"):
        # Use a context manager so the gzip handle is closed once parsing is
        # done (the handle was previously left open).
        with gzip.open(filename, mode='rt') as fp:
            A_models = arpa.load(fp)
    else:
        A_models = arpa.loadf(filename)
    # Only the first model in the file is used.
    return A_models[0]
def test_load_dump():
    """Dumping a freshly loaded model must reproduce the source file text."""
    with open(TEST_ARPA, 'rt') as source:
        model = arpa.load(source)[0]
        # Rewind so the whole file text can be captured for the comparison.
        source.seek(0)
        expected = source.read()

    with tempfile.TemporaryFile(mode='w+t') as sink:
        arpa.dump(model, sink)
        # Rewind before reading back what was just written.
        sink.seek(0)
        produced = sink.read()

    assert expected == produced
def test_load_dump():
    """Check that load followed by dump round-trips the test ARPA file."""
    # NOTE(review): another function named test_load_dump is visible in this
    # source; if both live in the same module, the later definition shadows
    # the earlier one and only one of them is collected — confirm.
    with open(TEST_ARPA, "rt") as original:
        models = arpa.load(original)
        first_model = models[0]
        original.seek(0)  # rewind: the loader consumed the stream
        with tempfile.TemporaryFile(mode="w+t") as roundtrip:
            arpa.dump(first_model, roundtrip)
            roundtrip.seek(0)  # rewind: read back from the start
            original_text = original.read()
            roundtrip_text = roundtrip.read()
            assert original_text == roundtrip_text
def load_background(filename):
    """Load the background model B and derive two lookup structures from it.

    Args:
        filename: Path to the ARPA file. A name ending in ".gz" is opened
            through ``gzip`` in text mode and handed to ``arpa.load``; any
            other name is passed to ``arpa.loadf``.

    Returns:
        A tuple ``(B, f_B_star, B_hist_index)``:

        * ``B`` -- the first model found in the file.
        * ``f_B_star`` -- dict mapping every n-gram ``hw`` of order >= 2 to
          its discounted probability (see the derivation below).
        * ``B_hist_index`` -- list indexed by ``len(h)``; each element is a
          ``defaultdict(list)`` mapping a history ``h`` to the words ``w``
          for which ``hw`` is an entry of the model. Index 0 is left empty
          because only histories with ``len(h) >= 1`` are indexed.

    Side effects:
        Prints "<n>-gram" to stdout once per order in each of the two passes.
    """
    if filename.endswith(".gz"):
        # Use a context manager so the gzip handle is closed once parsing is
        # done (the handle was previously left open).
        with gzip.open(filename, mode='rt') as fp:
            B_models = arpa.load(fp)
    else:
        B_models = arpa.loadf(filename)
    B = B_models[0]  # ARPA files may contain several models.
    max_order = B.order()

    # We can recover f_B_star (i.e., discounted probabilities) from the
    # interpolated probabilities. As B is an interpolated model, i.e.,
    #
    #     p_B(w|h) = f_B_star(w|h) + bow_B(h) * p_B(w|h')
    #
    # we have
    #
    #     f_B_star(w|h) = p_B(w|h) - bow_B(h) * p_B(w|h')
    #
    # where h' = h[1:].
    f_B_star = {}
    for n in range(2, max_order + 1):
        print("%d-gram" % n)
        for e in B._entries(n):
            # entry format: (log10(prob), hw, log10(bow))
            hw = e[1]
            h = hw[:-1]
            h_prime_w = hw[1:]
            # Both terms are stored in the log domain: exponentiate the
            # n-gram's own log-probability, and the sum of the history's
            # log back-off weight and the lower-order log-probability.
            # NOTE(review): the result is expected to be >= 0; the original
            # author left that sanity check disabled.
            f_B_star[hw] = B._base**float(e[0]) - B._base**(
                float(B._bos[h]) + float(log_p(B, h_prime_w)))

    # Index structure:
    #   len(h) --> h --> {w | hw is seen in the corpus}, where len(h) >= 1
    B_hist_index = [defaultdict(list) for _ in range(max_order)]
    for n in range(2, max_order + 1):
        print("%d-gram" % n)
        for e in B._entries(n):
            hw = e[1]
            h = hw[:-1]
            w = hw[-1]
            B_hist_index[len(h)][h].append(w)

    return B, f_B_star, B_hist_index
def test_load_option_parser():
    """``arpa.load`` must raise ValueError for an unrecognised ``parser``."""
    unknown_parser = 'foo'
    with pytest.raises(ValueError):
        arpa.load(None, parser=unknown_parser)
def test_load_option_model():
    """``arpa.load`` must raise ValueError for an unrecognised ``model``."""
    unknown_model = 'foo'
    with pytest.raises(ValueError):
        arpa.load(None, model=unknown_model)