Beispiel #1
0
def t_e_s_t_language_model():
    """Smoke-test ``LanguageModel`` with and without a dictionary.

    Samples a few sentences from a ``FakeTextGenerator``, writes its
    vocabulary and its JSON-serialized dictionary to temporary files in order
    to build ``Vocabulary`` and ``Dictionary`` objects, and then builds and
    evaluates the cost graph of two models: one that is given the dictionary
    (for which the ``def_spans`` variable is also checked) and one that is
    not.

    NOTE(review): the spelled-out ``t_e_s_t_`` prefix presumably keeps test
    runners from collecting this function -- confirm before renaming it.
    """
    V = 50
    gen = FakeTextGenerator(V, 6, 6, 1.0, 0.2)
    n_sentences = 3
    len_sentences = 7
    data = [gen.sample_sentence(len_sentences) for _ in range(n_sentences)]
    vocab_list = '\n'.join(set(gen.vocabulary))
    dict_json = json.dumps(gen.dictionary)
    # Single-argument print(...) yields the same output on Python 2 and 3.
    print("JSON dict: {}".format(dict_json))

    with temporary_content_path(vocab_list) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(dict_json) as path:
        dict_ = Dictionary(path)

    # Use the generator created above: the previous `generator.tok_len`
    # referred to a name that is never defined in this function.
    data = [[str2vec(s, gen.tok_len) for s in row] for row in data]
    data = numpy.array(data)
    print("Data: {}".format(data))

    # All-ones mask over the first two axes of the encoded data.
    mask = numpy.ones((data.shape[0], data.shape[1]))

    # With the dictionary
    lm = LanguageModel(vocab=vocab,
                       dict_=dict_,
                       dim=10,
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()
    costs = lm.apply(tensor.as_tensor_variable(data), mask)
    cg = ComputationGraph(costs)
    # The trailing comma unpacks the single variable named 'def_spans'.
    def_spans, = VariableFilter(name='def_spans')(cg)
    f = theano.function([], [costs, def_spans])
    costs_value, def_spans_value = f()
    assert def_spans_value.tolist() == [[0, 2], [2, 4], [4, 5], [5, 7]]

    # Without the dictionary
    # NOTE(review): lm2.initialize() is never called here, unlike for `lm`;
    # presumably this part only checks that the graph evaluates -- confirm.
    lm2 = LanguageModel(vocab=vocab,
                        dim=10,
                        weights_init=Uniform(width=0.1),
                        biases_init=Uniform(width=0.1))
    costs2 = lm2.apply(tensor.as_tensor_variable(data), mask)
    costs2.eval()
Beispiel #2
0
def _str2vec(word):
    """Encode *word* via ``str2vec`` with ``MAX_NUM_CHARACTERS`` as length."""
    encoded = str2vec(word, MAX_NUM_CHARACTERS)
    return encoded
Beispiel #3
0
def test_str2vec():
    """Check ``str2vec`` on a word shorter and a word longer than the length.

    The shorter word is expected to be followed by zeros up to the requested
    length; the longer word is expected to be cut to its first characters.
    """
    padded = str2vec('def', 5).tolist()
    assert padded == [ord(ch) for ch in 'def'] + [0, 0]

    truncated = str2vec('abcdef', 3).tolist()
    assert truncated == [ord(ch) for ch in 'abc']
 def make_data_and_mask(data):
     """Encode rows of strings with ``str2vec`` and build an all-ones mask.

     Every string in every row of *data* is passed to ``str2vec`` with a
     length of 3 and the results are stacked into one array. The mask is an
     array of ones of dtype ``floatX`` whose shape is the first two
     dimensions of that array.

     Returns the tuple ``(encoded_data, mask)``.
     """
     encoded = np.array([[str2vec(token, 3) for token in row]
                         for row in data])
     shape = encoded.shape
     mask = np.ones((shape[0], shape[1]), dtype=floatX)
     return encoded, mask