Example #1
def test_retrieval():
    with temporary_content_path(TEST_VOCAB, ".txt") as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, ".json") as path:
        dict_ = Dictionary(path)

    # check a super simple case
    batch = [['a']]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 6, 7, 4], [3, 8, 9, 4]]
    assert def_map == [(0, 0, 0), (0, 0, 1)]

    # check that vectors are handled correctly
    batch = numpy.array([ord('d'), ord(' '), ord('c'), 0, 0])[None, None, :]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 5, 6, 4]]
    assert def_map == [(0, 0, 0)]

    # check a complex case
    batch = [['a', 'b', 'b'], ['d c', 'a', 'b']]
    defs, def_map = Retrieval(vocab, dict_).retrieve(batch)
    assert defs == [[3, 6, 7, 4],
                    [3, 8, 9, 4],
                    [3, 9, 8, 4],
                    [3, 5, 6, 4]]
    assert def_map == [(0, 0, 0), (0, 0, 1),
                       (0, 1, 2),
                       (0, 2, 2),
                       (1, 0, 3),
                       (1, 1, 0), (1, 1, 1),
                       (1, 2, 2)]

    # check a complex case with exclude top k
    batch = [['a', 'b', 'c', 'd'], ['a', 'e', 'b']]
    exclude_top_k = 7 # should exclude 'a', 'b', 'c', 'd' and only define 'e'
    defs, def_map = Retrieval(vocab, dict_, exclude_top_k=exclude_top_k).retrieve(batch)
    assert defs == [[3, 6, 7, 8, 4]]
    assert def_map == [(1, 1, 0)]

    # check the op
    retrieval_op = RetrievalOp(Retrieval(vocab, dict_))
    batch = tensor.as_tensor_variable(
        [[[ord('d'), ord(' '), ord('c'), 0, 0],
          [ord('e'), 0, 0, 0, 0]]])
    defs_var, mask_var, def_map_var = retrieval_op(batch)
    assert defs_var.eval().tolist() == [[3, 5, 6, 4, 0],
                                        [3, 6, 7, 8, 4]]
    assert_allclose(mask_var.eval(), [[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
    assert def_map_var.eval().tolist() == [[0, 0, 0], [0, 1, 1]]
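
# The tests on this page rely on a `temporary_content_path` helper that is not
# shown here. A minimal sketch, assuming it only needs to write the given string
# to a temporary file (with an optional suffix) and yield that file's path:
import os
import tempfile
from contextlib import contextmanager


@contextmanager
def temporary_content_path(content, suffix=""):
    # Hypothetical re-implementation for illustration; the project's own helper
    # may differ in details such as encoding or cleanup behaviour.
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        with os.fdopen(fd, 'w') as handle:
            handle.write(content)
        yield path
    finally:
        os.remove(path)
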
def test_language_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON, suffix=".json") as path:
        dict_ = Dictionary(path)

    floatX = theano.config.floatX

    def make_data_and_mask(data):
        data = [[str2vec(s, 3) for s in row] for row in data]
        data = np.array(data)
        mask = np.ones((data.shape[0], data.shape[1]),
                        dtype=floatX)
        return data, mask
    words_val, mask_val = make_data_and_mask([['p', 'e', 'a'], ['a', 'e', 'p']])
    mask_val[1, 2] = 0
    print("data:")
    print(words_val)
    print("mask:")
    print(mask_val)
    mask_def_emb_val = np.asarray([[0, 1], [0, 0]])

    # With the dictionary
    retrieval = Retrieval(vocab, dict_, exclude_top_k=7)
    lm = LanguageModel(7, 5, vocab.size(), vocab.size(),
                       vocab=vocab, retrieval=retrieval,
                       compose_type='transform_and_sum',
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()
    words = tensor.ltensor3('words')
    mask = tensor.matrix('mask', dtype=floatX)
    costs = lm.apply(words, mask)
    cg = ComputationGraph(costs)
    def_mean, = VariableFilter(name='_dict_word_embeddings')(cg)
    def_mean_f = theano.function([words], def_mean)

    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    mask_def_emb, = VariableFilter(name='mask_def_emb')(cg)
    perplexities_f = theano.function([words, mask], perplexities)
    perplexities_v = perplexities_f(words_val, mask_val)
    mask_emb_f = theano.function([words, mask], mask_def_emb)
    mask_def_v = mask_emb_f(words_val, mask_val)
    for v, p in zip(perplexities_v, perplexities):
        print(p.name, ":", v)
    assert np.allclose(mask_def_v, mask_def_emb_val)
Example #3
def t_e_s_t_language_model():  # spelled with underscores so pytest does not collect this (disabled) test
    V = 50
    gen = FakeTextGenerator(V, 6, 6, 1.0, 0.2)
    n_sentences = 3
    len_sentences = 7
    data = [gen.sample_sentence(len_sentences) for i in range(n_sentences)]
    vocab_list = '\n'.join(list(set(gen.vocabulary)))
    dict_json = json.dumps(gen.dictionary)
    print "JSON dict:", dict_json

    with temporary_content_path(vocab_list) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(dict_json) as path:
        dict_ = Dictionary(path)

    data = [[str2vec(s, gen.tok_len) for s in row] for row in data]
    data = numpy.array(data)
    print("Data:", data)

    # With the dictionary
    lm = LanguageModel(vocab=vocab,
                       dict_=dict_,
                       dim=10,
                       weights_init=Uniform(width=0.1),
                       biases_init=Uniform(width=0.1))
    lm.initialize()
    costs = lm.apply(tensor.as_tensor_variable(data),
                     numpy.ones((data.shape[0], data.shape[1])))
    cg = ComputationGraph(costs)
    def_spans, = VariableFilter(name='def_spans')(cg)
    f = theano.function([], [costs, def_spans])
    costs_value, def_spans_value = f()
    assert def_spans_value.tolist() == [[0, 2], [2, 4], [4, 5], [5, 7]]

    # Without the dictionary
    lm2 = LanguageModel(vocab=vocab,
                        dim=10,
                        weights_init=Uniform(width=0.1),
                        biases_init=Uniform(width=0.1))
    costs2 = lm2.apply(tensor.as_tensor_variable(data),
                       numpy.ones((data.shape[0], data.shape[1])))
    costs2.eval()
Example #4
def test_extractive_qa_model():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    with temporary_content_path(TEST_DICT_JSON) as path:
        dict_ = Dictionary(path)

    def make_data_and_mask(data):
        data = [[vocab.word_to_id(s) for s in row] for row in data]
        data = numpy.array(data)
        mask = numpy.ones((data.shape[0], data.shape[1]),
                          dtype=theano.config.floatX)
        return data, mask

    # create some dummy data
    contexts, context_mask = make_data_and_mask([['a', 'a', 'a', 'b'],
                                                 ['b', 'a', 'b', 'a'],
                                                 ['a', 'b', 'b', 'b']])
    questions, question_mask = make_data_and_mask([['a', 'a'], ['b', 'a'],
                                                   ['a', 'b']])
    answer_begins = [0, 0, 1]
    answer_ends = [1, 2, 2]

    for coattention in [False, True]:
        qam = ExtractiveQAModel(vocab=vocab,
                                dim=10,
                                emb_dim=10,
                                num_input_words=10,
                                compose_type='sum',
                                use_definitions=False,
                                reuse_word_embeddings=False,
                                def_reader='LSTMReadDefinitions',
                                coattention=coattention,
                                weights_init=Uniform(width=0.1),
                                biases_init=Uniform(width=0.1))
        qam.initialize()

        costs = qam.apply(tensor.as_tensor_variable(contexts), context_mask,
                          tensor.as_tensor_variable(questions), question_mask,
                          tensor.as_tensor_variable(answer_begins),
                          tensor.as_tensor_variable(answer_ends))
        assert costs.eval().shape == (3, )

def test_vocab_op():
    with temporary_content_path(TEST_VOCAB) as path:
        vocab = Vocabulary(path)
    op = WordToIdOp(vocab)

    input_ = tensor.as_tensor_variable([ord('d'), ord(' '), ord('c'), 0, 0])
    assert op(input_).eval() == 0

    input_ = tensor.as_tensor_variable([ord('a')])
    assert op(input_).eval() == 5

    input_ = tensor.as_tensor_variable([[ord('a'), 0], [ord('b'), 0]])
    assert list(op(input_).eval()) == [5, 6]
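
# The TEST_VOCAB fixture itself is not shown on this page. Purely from the
# assertions above (and from the definition token ids used in test_retrieval),
# the word-to-id mapping appears to be roughly the following; ids 1-4 presumably
# belong to special markers such as beginning/end-of-definition tokens. This is
# an inferred summary, not the project's actual fixture:
ASSUMED_WORD_IDS = {'d c': 0, 'a': 5, 'b': 6, 'c': 7, 'd': 8, 'e': 9}
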
Example #6
def test_text_dataset():
    with temporary_content_path(TEST_TEXT) as path:
        dataset = TextDataset(path, 100)
        stream = dataset.get_example_stream()
        it = stream.get_epoch_iterator()

        d = next(it)
        assert d == (['abc', 'abc', 'def'], )
        pickled_it = cPickle.dumps(it)

        d = next(it)
        assert d == (['def', 'def', 'xyz'], )

        it = cPickle.loads(pickled_it)
        d = next(it)
        assert d == (['def', 'def', 'xyz'], )

        d = next(it)
        assert d == (['xyz'], )
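
# The TEST_TEXT fixture is not shown here either. Assuming TextDataset yields one
# whitespace-split line per example (with 100 acting as a length cap), content
# consistent with the assertions above would be the following three lines. This is
# a hypothetical reconstruction only:
TEST_TEXT = "abc abc def\ndef def xyz\nxyz"
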
def main():
    parser = argparse.ArgumentParser(
        "Generate synthetic data and write it to files")
    parser.add_argument("path",
                        type=str,
                        help="Output directory for the generated data")
    parser.add_argument("n_primes", type=int, help="# of primes")
    parser.add_argument("n_non_primes", type=int, help="# of non-primes")
    parser.add_argument("features_size", type=int, help="Features size")
    parser.add_argument("markov_order", type=int, help="Markov order")
    parser.add_argument("n_sentences", type=int, help="# sentences")
    parser.add_argument("pc_train", type=float, help="% train sentences")
    parser.add_argument("pc_valid", type=float, help="% valid sentences")
    parser.add_argument("sample_temperature",
                        type=float,
                        default=1.0,
                        help="% valid sentences")
    parser.add_argument("min_sentence_len", type=int, default=6)
    parser.add_argument("max_sentence_len", type=int, default=20)
    parser.add_argument("min_def_len", type=int, default=6)
    parser.add_argument("max_def_len", type=int, default=20)

    args = parser.parse_args()

    print "Number of sentences:", args.n_sentences
    assert (0 < args.pc_train + args.pc_valid < 1)
    assert (os.path.exists(args.path) == False)
    os.makedirs(args.path)
    args.pc_test = 1 - (args.pc_train + args.pc_valid)

    gen = FakeTextGenerator(args.n_primes, args.n_non_primes,
                            args.features_size, args.markov_order,
                            args.sample_temperature, args.min_def_len,
                            args.max_def_len)

    data = gen.create_corpus(args.n_sentences, args.min_sentence_len,
                             args.max_sentence_len, args.pc_train,
                             args.pc_valid)

    train_data, valid_data, test_data = data

    concat_sentences = lambda sentences: [' '.join(s) for s in sentences]
    train_data = concat_sentences(train_data)
    test_data = concat_sentences(test_data)
    valid_data = concat_sentences(valid_data)

    all_data = train_data + valid_data + test_data
    with temporary_content_path('\n'.join(all_data)) as path:
        vocab = Vocabulary.build(path, sort_by='lexicographical')
        vocab.save(os.path.join(args.path, "vocab.txt"))

    dict_json = json.dumps(gen.dictionary)
    write_data(os.path.join(args.path, "dict.json"), dict_json)

    write_data(os.path.join(args.path, "train.txt"), '\n'.join(train_data))
    write_data(os.path.join(args.path, "valid.txt"), '\n'.join(valid_data))
    write_data(os.path.join(args.path, "test.txt"), '\n'.join(test_data))

    args_json = json.dumps(vars(args), indent=4, sort_keys=True)
    write_data(os.path.join(args.path, "params.json"), args_json)

    write_data(os.path.join(args.path, "generator.p"), pickle.dumps(gen))
def test_dict():
    with temporary_content_path(TEST_DICT_JSON, ".json") as path:
        dict_ = Dictionary(path)
    assert dict_.get_definitions('a') == [['b', 'c'], ['d', 'e']]
    assert dict_.get_definitions('d c') == [['a', 'b']]
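
# TEST_DICT_JSON is likewise an external fixture. Content consistent with the
# assertions on this page (test_dict above, plus the definitions retrieved for
# 'b' and 'e' in test_retrieval) would look roughly like this; the real fixture
# may contain more entries:
TEST_DICT_JSON = """
{"a": [["b", "c"], ["d", "e"]],
 "b": [["e", "d"]],
 "d c": [["a", "b"]],
 "e": [["b", "c", "d"]]}
"""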