예제 #1
0
파일: text.py 프로젝트: kevdur/bvmm
def read_data(filename, words=False, sep='_', max_length=None):
    """Read a text file and convert it into an indexed symbol sequence.

    The whole file is read and split on whitespace. What counts as one
    symbol depends on ``words``:

    * ``words=True``: every whitespace-separated word, prefixed with
      ``sep``, is one symbol.
    * ``words=False``: the words are joined with ``sep`` and every single
      character of the joined string is one symbol.

    Args:
        filename: Path of the text file to read.
        words: Select word-level (True) or character-level (False) symbols.
        sep: Prefix for each word (word mode) or the string placed between
            words (character mode).
        max_length: If not None, the symbol list is sliced to
            ``[:max_length]`` before indexing.

    Returns:
        The ``(data, alphabet)`` pair returned by ``bvmm.create_index``.
    """
    with open(filename, 'r') as handle:
        tokens = handle.read().split()

    if words:
        symbols = [sep + token for token in tokens]
    else:
        symbols = list(sep.join(tokens))

    if max_length is not None:
        symbols = symbols[:max_length]

    indexed, alphabet = bvmm.create_index(symbols)
    # Report the size of the indexed sequence and of its alphabet.
    print('Sequence characters:', len(indexed))
    print('Distinct characters:', len(alphabet))
    return indexed, alphabet
예제 #2
0
def read_data(filename, sep='-', max_length=None):
    """Read a whitespace-separated pair list and index it with bvmm.

    Every non-blank line contributes one pair built from its first two
    whitespace-separated fields, each prefixed with ``sep``; any further
    fields on the line are ignored. Blank or whitespace-only lines are
    skipped instead of aborting the whole read with an unpacking error.

    Args:
        filename: Path of the file to read.
        sep: Prefix prepended to both fields of every pair.
        max_length: If not None, the pair list is sliced to
            ``[:max_length]`` before indexing.

    Returns:
        The ``(data, alphabet)`` pair returned by
        ``bvmm.create_index(..., kind='network')``.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    pairs = []
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing padding at the end of the file):
                # nothing to parse, so move on rather than fail to unpack.
                continue
            # Still raises ValueError for a line with a single field, so
            # genuinely malformed input is not silently dropped.
            v, w = fields[:2]
            pairs.append((sep + v, sep + w))
    if max_length is not None:
        pairs = pairs[:max_length]
    data, alphabet = bvmm.create_index(pairs, kind='network')
    print('Sequence characters:', len(data))
    print('Distinct characters:', len(alphabet))
    return data, alphabet
예제 #3
0
# Symbol labels for the synthetic example: the strings '-0' through '-24'.
alphabet = ['-{}'.format(x) for x in range(25)]
# Load a previously pickled object into `root`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must only ever come from a trusted source.
with open('dat/synthetic_3.pkl', 'rb') as f:
    root = pickle.load(f)
bvmm.print_tree(root, alphabet, max_counts=4)

# Disabled code below: presumably how the pickle above was produced (a tree
# from bvmm.rand_tree, printed and then dumped) -- confirm before re-enabling,
# since the write would overwrite 'dat/synthetic_3.pkl'.
# root = bvmm.rand_tree(50, alphabet)
# bvmm.print_tree(root, alphabet, max_counts=4)
# # with open('dat/synthetic_3.pkl', 'wb') as f:
# #     pickle.dump(root, f)

# %%
# `n` selects which synthetic data file is read ('dat/synthetic_3_<n>.txt');
# the disabled code below also passes it to bvmm.rand_data.
n = 10_000
with open('dat/synthetic_3_{}.txt'.format(n), 'r') as f:
    # Splitting on '-' removes the separator, so it is re-attached to every
    # piece; `if x` drops the empty strings the split produces (for example
    # the piece before a leading '-').
    data = ['-' + x for x in f.read().split('-') if x]
# Rebinds both names: `data` becomes the indexed sequence and `alphabet` is
# replaced by the one returned from bvmm.create_index.
data, alphabet = bvmm.create_index(data)

# Disabled code below: presumably how the text file above was generated
# (data from bvmm.rand_data, mapped through the alphabet and written out as
# one concatenated string) -- confirm before re-enabling.
# data = bvmm.rand_data(root, n)
# # with open('dat/synthetic_3_{}.txt'.format(n), 'w') as f:
# #     f.write(''.join(bvmm.apply_alphabet(data, alphabet)))

# %%
%%time
# The line above is an IPython cell magic that times the whole cell. It must
# remain the first line of the cell and is not valid in plain Python.
# NOTE(review): the meaning of the positional arguments 100_000 and 10 is not
# visible here -- presumably sampler settings; confirm against bvmm.mcmc.
mcmc, counts = bvmm.mcmc(data, alphabet, 100_000, 10)
# Print the result, then write it to 'dat/synthetic_3_<n>.net'; both calls
# use the same min_samples=.01 setting.
bvmm.print_tree(mcmc, alphabet, min_samples=.01, max_counts=4)
print(counts)
bvmm.write_tree(mcmc, alphabet, 'dat/synthetic_3_{}.net'.format(n),
                min_samples=.01)

# %%
def activate_same(v1, v2):