def test_batch_basic():
    """Batch parsing logs exactly one bump per adjacent token pair.

    The same model instance parses two utterances in turn; for each one the
    parse log returned by ``utils.log_parse`` must contain each
    ``'bump x -> y'`` entry exactly once.
    """
    model = Numila(PARSE='batch', CHUNK_THRESHOLD=2)

    # Counts are compared with ``==``: ``is`` tests object identity and only
    # appears to work for small ints because of a CPython caching detail.
    log = utils.log_parse(model, 'a b c')
    assert log.count('bump a -> b') == 1
    assert log.count('bump b -> c') == 1

    log = utils.log_parse(model, 'a b c d')
    assert log.count('bump a -> b') == 1
    assert log.count('bump b -> c') == 1
    assert log.count('bump c -> d') == 1
def test_inc_basic():
    """Incremental parsing logs the expected number of bumps per edge.

    Expected occurrences of each ``'bump x -> y'`` entry in the parse log:
    2 for a three-token utterance, and 3 for four- and five-token
    utterances.
    NOTE(review): presumably the cap at 3 reflects a bounded parsing
    window in the incremental parser -- confirm against the model.
    """
    model = Numila(PARSE='incremental', CHUNK_THRESHOLD=2)

    # Counts are compared with ``==``: ``is`` tests object identity and only
    # appears to work for small ints because of a CPython caching detail.
    log = utils.log_parse(model, 'a b c')
    assert log.count('bump a -> b') == 2
    assert log.count('bump b -> c') == 2

    log = utils.log_parse(model, 'a b c d')
    assert log.count('bump a -> b') == 3
    assert log.count('bump b -> c') == 3
    assert log.count('bump c -> d') == 3

    log = utils.log_parse(model, 'a b c d e')
    assert log.count('bump a -> b') == 3
    assert log.count('bump b -> c') == 3
    assert log.count('bump c -> d') == 3
    assert log.count('bump d -> e') == 3
def test_easy(model):
    """Check edge weights learned from one simple, repeated utterance.

    ``model`` is supplied by the caller (presumably a pytest fixture --
    confirm) and must expose ``params``, ``parse``, ``fit`` and ``graph``.
    The test verifies that:

    * every adjacent pair has a non-zero forward ('ftp') and backward
      ('btp') weight after a single parse;
    * parsing unrelated utterances leaves an existing weight unchanged
      (within 0.001);
    * more frequent transitions are weighted higher than rarer ones.
    """
    model.params['CHUNK_THRESHOLD'] = 2

    # One simple utterance 50 times.
    utterance = 'a b a c a b d'
    corpus = [utterance] * 50
    model.parse(corpus[0])
    print(utils.log_parse(model, corpus[0]))

    a, b, c, d = (model.graph[x] for x in 'abcd')  # node objects

    def weight(edge, n1, n2):
        """Return the weight of the `edge`-type edge from n1 to n2."""
        return n1.edge_weight(n2, edge)

    # Check that all connections are positive after one utterance.
    # (These are truthiness checks, so any non-zero weight passes.)
    for x, y in utils.neighbors(utterance.split(' ')):
        assert weight('ftp', model.graph[x], model.graph[y])
        assert weight('btp', model.graph[y], model.graph[x])

    # Equal conditional probability, but more evidence
    #assert weight('btp', b, a) < weight('btp', c, a)

    model.fit(corpus)

    # Check that weights don't change when they shouldn't change.
    # The difference is compared in absolute value: a signed comparison
    # would silently accept any *increase* in the weight.
    w1 = weight('ftp', a, b)
    model.parse('b c')
    w2 = weight('ftp', a, b)
    assert abs(w1 - w2) < .001

    w1 = weight('btp', b, a)
    model.parse('d a d a d a d a d a d a')
    w2 = weight('btp', b, a)
    assert abs(w1 - w2) < .001

    # Check that more common edges are more highly weighted.
    # We vary the conditional (ab | a) and raw (ab) probabilities.
    # Reference: a b a c a b d

    # Higher conditional, higher raw.
    assert weight('ftp', a, b) > weight('ftp', a, c)

    # Higher conditional, equal raw.
    assert weight('ftp', c, a) > weight('ftp', b, d)

    return  # TODO

    # NOTE: everything below is unreachable until the early return above is
    # removed. Before enabling it, note that `weight` as defined in this
    # test does not accept a `verbose` keyword, and `assert 0` always fails.

    # Equal conditional, higher raw. But lots of evidence for both.
    print()
    print(weight('btp', c, a, verbose=True))
    assert 0
    assert weight('btp', b, a) - weight('btp', c, a) < 0.001

    # This always fails for vector. The edge weights do not really
    # represent probabilities. They are more sensitive to the raw
    # occurrence counts.
    # p(ab | a) = 0.66
    # p(ca | c) = 1
    # p(ab) = 0.4
    # p(ca) = 0.2
    #assert weight('ftp', c, a) > weight('ftp', a, b)

    assert weight('ftp', a, a) < 0.05
    assert weight('ftp', b, b) < 0.05
    assert weight('ftp', c, c) < 0.05
    assert weight('ftp', b, c) < 0.05