Example #1

import os

import numpy as np

from neon import NervanaObject
from neon.data import Text


def test_text(backend_default):
    text_data = ('Lorem ipsum dolor sit amet, consectetur adipisicing elit, '
                 'sed do eiusmod tempor incididunt ut labore et dolore magna '
                 'aliqua. Ut enim ad minim veniam, quis nostrud exercitation '
                 'ullamco laboris nisi ut aliquip ex ea commodo consequat. '
                 'Duis aute irure dolor in reprehenderit in voluptate velit '
                 'esse cillum dolore eu fugiat nulla pariatur. Excepteur sint '
                 'occaecat cupidatat non proident, sunt in culpa qui officia '
                 'deserunt mollit anim id est laborum.')
    data_path = 'tmp_test_text_data'
    with open(data_path, 'w') as f:
        f.write(text_data)

    NervanaObject.be.bsz = 4
    time_steps = 6
    valid_split = 0.2

    # load data and parse it at the character level
    train_path, valid_path = Text.create_valid_file(data_path,
                                                    valid_split=valid_split)
    train_set = Text(time_steps, train_path)
    valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

    train_set.be = NervanaObject.be
    bsz = train_set.be.bsz

    for i, (X_batch, y_batch) in enumerate(train_set):
        if i > 2:
            break
        chars = [
            train_set.index_to_token[x]
            for x in np.argmax(X_batch.get(), axis=0).tolist()
        ]
        # Each batch position b reads from its own contiguous stream of the
        # text, so sequence b of batch i continues as sequence b of batch i+1.
        for batch in range(bsz):
            sent = ''.join(chars[batch::bsz])
            start = i * time_steps + batch * time_steps * train_set.nbatches
            sent_ref = text_data[start:start + time_steps]
            assert sent == sent_ref

    valid_start = int(len(text_data) * (1 - valid_split))
    for i, (X_batch, y_batch) in enumerate(valid_set):
        if i > 2:
            break
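        # index_to_token is taken from train_set; valid_set shares its vocab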
        chars = [
            train_set.index_to_token[x]
            for x in np.argmax(X_batch.get(), axis=0).tolist()
        ]
        for batch in range(bsz):
            sent = ''.join(chars[batch::bsz])
            start = (i * time_steps
                     + batch * time_steps * valid_set.nbatches
                     + valid_start)
            sent_ref = text_data[start:start + time_steps]
            assert sent == sent_ref

    os.remove(data_path)
    os.remove(train_path)
    os.remove(valid_path)
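
# A minimal sketch (plain Python, independent of neon) of the batch layout the
# assertions above rely on: the text is carved into bsz contiguous streams,
# each batch packs one time_steps-long slice from every stream in time-major
# order, so stream b is recovered from the flattened batch via chars[b::bsz].
text = 'abcdefghijklmnopqrstuvwx'              # 24 characters
bsz, time_steps = 4, 3
nbatches = len(text) // (bsz * time_steps)     # 2 batches
streams = [text[b * time_steps * nbatches:(b + 1) * time_steps * nbatches]
           for b in range(bsz)]
for i in range(nbatches):
    columns = [s[i * time_steps:(i + 1) * time_steps] for s in streams]
    chars = [col[t] for t in range(time_steps) for col in columns]
    for b in range(bsz):
        start = i * time_steps + b * time_steps * nbatches
        assert ''.join(chars[b::bsz]) == text[start:start + time_steps]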
Example #2

from neon.data import Text, load_ptb_train, load_ptb_test
from neon.initializers import Uniform
from neon.transforms import Tanh

# args, time_steps and hidden_size are assumed to be defined earlier in the
# full script (e.g. via a NeonArgparser setup like the one in Example #3)
# download the Penn Treebank dataset
train_path = load_ptb_train(path=args.data_dir)
valid_path = load_ptb_test(path=args.data_dir)


# define a custom function to parse the input into individual tokens; for this
# data it splits on whitespace, mapping newlines to an <eos> token. It is
# passed to the Text object during dataset creation, as seen below.
def tokenizer(s):
    return s.replace('\n', '<eos>').split()
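

# A quick illustration (made-up input): PTB lines are space-padded, so the
# substituted '<eos>' stays a separate token after splitting
assert tokenizer(' the cat sat \n the dog ran \n') == \
    ['the', 'cat', 'sat', '<eos>', 'the', 'dog', 'ran', '<eos>']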


# load data and parse it at the word level
train_set = Text(time_steps,
                 train_path,
                 tokenizer=tokenizer,
                 onehot_input=False)
valid_set = Text(time_steps,
                 valid_path,
                 vocab=train_set.vocab,
                 tokenizer=tokenizer,
                 onehot_input=False)

# weight initialization
init = Uniform(low=-0.1, high=0.1)
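
# With onehot_input=False the iterators above yield integer token indices
# instead of one-hot vectors. A minimal sketch of how such indices are
# typically consumed (an assumption, not shown in this excerpt): an embedding
# layer such as neon's LookupTable, reusing init from above
from neon.layers import LookupTable

embed = LookupTable(vocab_size=len(train_set.vocab),
                    embedding_dim=hidden_size,
                    init=init)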

# model initialization
rlayer_params = {
    "output_size": hidden_size,
    "init": init,
    "activation": Tanh(),
}
Example #3

from neon.backends import gen_backend
from neon.data import Text, load_ptb_train, load_ptb_test
from neon.initializers import Uniform
from neon.layers import Affine, GeneralizedCost, Recurrent
from neon.models import Model
from neon.optimizers import RMSProp
from neon.transforms import CrossEntropyMulti, Softmax, Tanh
from neon.util.argparser import NeonArgparser, extract_valid_args

# parse command line arguments; NeonArgparser provides args.data_dir,
# args.rounding, args.callback_args, etc.
parser = NeonArgparser(__doc__)
args = parser.parse_args()
# these hyperparameters are from the paper
args.batch_size = 50
time_steps = 150
hidden_size = 500
gradient_clip_value = None

# setup backend
be = gen_backend(**extract_valid_args(args, gen_backend))

# download the Penn Treebank dataset
train_path = load_ptb_train(path=args.data_dir)
valid_path = load_ptb_test(path=args.data_dir)

# load data and parse it at the character level
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# weight initialization
init = Uniform(low=-0.08, high=0.08)

# model initialization
layers = [Recurrent(hidden_size, init, activation=Tanh()),
          Affine(len(train_set.vocab), init, bias=init, activation=Softmax())]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(gradient_clip_value=gradient_clip_value,
                    stochastic_round=args.rounding)
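
# The snippet ends here; a typical continuation in neon examples (an
# assumption, not part of this excerpt) trains the model with callbacks for
# progress reporting and evaluation:
from neon.callbacks.callbacks import Callbacks

callbacks = Callbacks(model, eval_set=valid_set, **args.callback_args)
model.fit(train_set, optimizer=optimizer, num_epochs=args.epochs,
          cost=cost, callbacks=callbacks)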