Beispiel #1
0
def _assert_batches_match_text(dataset, index_to_token, text_data, time_steps,
                               bsz, offset=0, max_batches=3):
    """
    Check that the leading batches of a dataset reproduce the source text.

    Each input batch is decoded back to characters with an argmax over axis 0
    and compared against the slice of ``text_data`` it is expected to hold.

    Args:
        dataset : Iterable yielding (X_batch, y_batch) pairs and exposing
                  ``nbatches``.
        index_to_token : Lookup from vocabulary index to character.
        text_data (str) : The text the dataset was built from.
        time_steps (int) : Number of characters per sequence.
        bsz (int) : Number of sequences per batch.
        offset (int) : Position in ``text_data`` where this dataset's text
                       starts.
        max_batches (int) : Number of leading batches to verify.
    """
    for i, (X_batch, _) in enumerate(dataset):
        if i >= max_batches:
            break
        chars = [index_to_token[x]
                 for x in np.argmax(X_batch.get(), axis=0).tolist()]
        # The strided slice below treats the decoded columns as interleaved:
        # every bsz-th entry belongs to the same sequence.  Sequence `batch`
        # of batch `i` is contiguous with sequence `batch` of batch `i + 1`,
        # and consecutive sequences are `time_steps * nbatches` chars apart.
        for batch in range(bsz):
            sent = ''.join(chars[batch::bsz])
            start = offset + i * time_steps + batch * time_steps * dataset.nbatches
            sent_ref = text_data[start:start + time_steps]
            assert sent == sent_ref, (
                'batch %d, sequence %d: %r != %r' % (i, batch, sent, sent_ref))


def test_text(backend_default):
    """
    Check that character-level Text datasets reproduce the original text.

    A small text file is written, split into train/validation files with
    Text.create_valid_file, and the first three batches of each resulting
    dataset are decoded and compared against the source string.

    All files created along the way are removed in a ``finally`` clause so a
    failing assertion does not leave temporary files in the working directory.
    """
    text_data = ('Lorem ipsum dolor sit amet, consectetur adipisicing elit, '
                 'sed do eiusmod tempor incididunt ut labore et dolore magna '
                 'aliqua. Ut enim ad minim veniam, quis nostrud exercitation '
                 'ullamco laboris nisi ut aliquip ex ea commodo consequat. '
                 'Duis aute irure dolor in reprehenderit in voluptate velit '
                 'esse cillum dolore eu fugiat nulla pariatur. Excepteur sint '
                 'occaecat cupidatat non proident, sunt in culpa qui officia '
                 'deserunt mollit anim id est laborum.')
    data_path = 'tmp_test_text_data'
    # Every file this test creates is recorded here so cleanup can find it.
    created_paths = [data_path]
    try:
        with open(data_path, 'w') as f:
            f.write(text_data)

        NervanaObject.be.bsz = 4
        time_steps = 6
        valid_split = 0.2

        # load data and parse on character-level
        train_path, valid_path = Text.create_valid_file(data_path,
                                                        valid_split=valid_split)
        created_paths.extend([train_path, valid_path])
        train_set = Text(time_steps, train_path)
        valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

        train_set.be = NervanaObject.be
        bsz = train_set.be.bsz

        _assert_batches_match_text(train_set, train_set.index_to_token,
                                   text_data, time_steps, bsz)

        # The validation set was built with the training vocabulary, so the
        # training index_to_token mapping is used to decode it as well.
        valid_start = int(len(text_data) * (1 - valid_split))
        _assert_batches_match_text(valid_set, train_set.index_to_token,
                                   text_data, time_steps, bsz,
                                   offset=valid_start)
    finally:
        # Tolerate missing files so cleanup never masks the real failure.
        for path in created_paths:
            if os.path.exists(path):
                os.remove(path)
Beispiel #2
0
def test_text(backend_default):
    """
    Verify that character-level Text datasets round-trip the source text.

    Writes a sample text file, splits it with Text.create_valid_file, builds a
    training and a validation dataset, and compares the decoded contents of
    the first three batches of each with the matching slice of the text.

    Temporary files are deleted in a ``finally`` clause, so they are cleaned
    up even when one of the assertions fails.
    """
    text_data = (
        'Lorem ipsum dolor sit amet, consectetur adipisicing elit, '
        'sed do eiusmod tempor incididunt ut labore et dolore magna '
        'aliqua. Ut enim ad minim veniam, quis nostrud exercitation '
        'ullamco laboris nisi ut aliquip ex ea commodo consequat. '
        'Duis aute irure dolor in reprehenderit in voluptate velit '
        'esse cillum dolore eu fugiat nulla pariatur. Excepteur sint '
        'occaecat cupidatat non proident, sunt in culpa qui officia '
        'deserunt mollit anim id est laborum.'
    )
    data_path = 'tmp_test_text_data'
    # Filled in once the split files exist; None means "nothing to delete".
    train_path = valid_path = None

    try:
        with open(data_path, 'w') as f:
            f.write(text_data)

        NervanaObject.be.bsz = 4
        time_steps = 6
        valid_split = 0.2

        # load data and parse on character-level
        train_path, valid_path = Text.create_valid_file(
            data_path, valid_split=valid_split)
        train_set = Text(time_steps, train_path)
        valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

        train_set.be = NervanaObject.be
        bsz = train_set.be.bsz

        for i, (X_batch, _) in enumerate(train_set):
            if i > 2:
                break
            chars = [train_set.index_to_token[x]
                     for x in np.argmax(X_batch.get(), axis=0).tolist()]
            # First sent of first batch will be contiguous with first sent of
            # next batch
            for batch in range(bsz):
                sent = ''.join(chars[batch::bsz])
                start = i * time_steps + batch * time_steps * train_set.nbatches
                sent_ref = text_data[start:start + time_steps]
                assert sent == sent_ref

        # The validation text starts where the training split ends; it is
        # decoded with the training vocabulary it was constructed with.
        valid_start = int(len(text_data) * (1 - valid_split))
        for i, (X_batch, _) in enumerate(valid_set):
            if i > 2:
                break
            chars = [train_set.index_to_token[x]
                     for x in np.argmax(X_batch.get(), axis=0).tolist()]
            for batch in range(bsz):
                sent = ''.join(chars[batch::bsz])
                start = (i * time_steps +
                         batch * time_steps * valid_set.nbatches +
                         valid_start)
                sent_ref = text_data[start:start + time_steps]
                assert sent == sent_ref
    finally:
        # Skip paths that were never created so cleanup cannot hide the
        # original error.
        for path in (data_path, train_path, valid_path):
            if path is not None and os.path.exists(path):
                os.remove(path)
Beispiel #3
0
    def vectorize_stories(self, data):
        """
        Turn word-level (story, query, answer) samples into numeric arrays.

        Args:
            data (tuple) : Samples, each unpacking into story words, query
                           words and an answer word.

        Returns:
            tuple : (stories, queries, answers). Stories and queries are the
                    outputs of Text.pad_sentences for story_maxlen and
                    query_maxlen; answers is a numpy array built from the
                    one_hot_vector results.
        """
        # Encode each sample in a single pass, keeping the three parts together.
        encoded = [
            (self.words_to_vector(story_words),
             self.words_to_vector(query_words),
             self.one_hot_vector(answer_word))
            for story_words, query_words, answer_word in data
        ]

        stories = Text.pad_sentences([row[0] for row in encoded],
                                     self.story_maxlen)
        queries = Text.pad_sentences([row[1] for row in encoded],
                                     self.query_maxlen)
        answers = np.array([row[2] for row in encoded])
        return (stories, queries, answers)
    def vectorize_stories(self, data):
        """
        Vectorize (story, query, answer) word samples.

        Args:
            data (tuple) : Tuple of story, query, answer word data.

        Returns:
            tuple : Padded story vectors, padded query vectors, and a numpy
                    array of the answer vectors, in that order.
        """
        samples = [
            (self.words_to_vector(story),
             self.words_to_vector(query),
             self.one_hot_vector(answer))
            for story, query, answer in data
        ]

        # Transpose the per-sample triples into three parallel lists; with no
        # samples there is nothing to transpose, so fall back to empty lists.
        if samples:
            story_vecs, query_vecs, answer_vecs = (
                list(column) for column in zip(*samples))
        else:
            story_vecs, query_vecs, answer_vecs = [], [], []

        padded_stories = Text.pad_sentences(story_vecs, self.story_maxlen)
        padded_queries = Text.pad_sentences(query_vecs, self.query_maxlen)
        return (padded_stories, padded_queries, np.array(answer_vecs))
Beispiel #5
0
# Obtain the Penn Treebank file paths from the loaders, both rooted at
# args.data_dir.  Note that the validation path comes from the PTB *test*
# loader.
train_path, valid_path = (load_ptb_train(path=args.data_dir),
                          load_ptb_test(path=args.data_dir))


# define a custom function to parse the input into individual tokens, which for
# this data, splits into individual words.  This can be passed into the Text
# object during dataset creation as seen below.
def tokenizer(s):
    """
    Split text into whitespace-separated tokens, marking line ends.

    Every newline is replaced by the literal ``<eos>`` before splitting.  No
    spaces are inserted around the marker, so it only becomes a token of its
    own when the newline already had whitespace on both sides.

    Args:
        s (str) : Raw text to tokenize.

    Returns:
        list : The tokens, in order of appearance.
    """
    lines = s.split('\n')
    marked = '<eos>'.join(lines)
    return marked.split()


# Build the word-level datasets.  Both use the custom tokenizer and disable
# one-hot input; the validation set is given the training set's vocabulary.
train_set = Text(time_steps, train_path, tokenizer=tokenizer, onehot_input=False)
valid_set = Text(
    time_steps,
    valid_path,
    vocab=train_set.vocab,
    tokenizer=tokenizer,
    onehot_input=False,
)

# Weights are drawn uniformly from [-0.1, 0.1]
init = Uniform(low=-0.1, high=0.1)

# model initialization
rlayer_params = {
    "output_size": hidden_size,
    "init": init,
    "activation": Tanh(),
Beispiel #6
0
# NOTE: this snippet is Python 2 (print statement, raw_input).
print ex_answer

# Interactive loop: read a story and a question, run the model, and list the
# most probable answers.  Runs until interrupted.
while True:
    # ask user for story and question
    story_lines = []
    line = raw_input("\nPlease enter a story:\n")
    # keep reading story lines until the user submits an empty line
    while line != "":
        story_lines.append(line)
        line = raw_input()
    story = ("\n".join(story_lines)).strip()

    question = raw_input("Please enter a question:\n")

    # convert user input into a suitable network input:
    # tokenize -> word vector -> pad to max_len (as a one-sample list) ->
    # backend array
    vectorize = lambda words, max_len: \
        be.array(Text.pad_sentences([babi.words_to_vector(BABI.tokenize(words))], max_len))
    s = vectorize(story, babi.story_maxlen)
    q = vectorize(question, babi.query_maxlen)

    # get prediction probabilities with forward propagation
    probs = model_inference.fprop(x=(s, q), inference=True).get()

    # get top k answers (k is at most 5).  top_k is negative so that the
    # [top_k:] slice keeps the last k entries of the partition, which
    # argpartition guarantees are the indices of the k largest values
    # (in no particular order).
    top_k = -min(5, babi.vocab_size)
    max_indices = np.argpartition(probs, top_k, axis=0)[top_k:]
    # NOTE(review): presumably probs has one column per sample, so these
    # index operations add singleton dimensions that int(idx) collapses below
    # -- confirm against the model output shape.
    max_probs = probs[max_indices]
    # argsort is ascending, hence the reversed() iteration below
    sorted_idx = max_indices[np.argsort(max_probs, axis=0)]

    print "\nAnswer:"
    for idx in reversed(sorted_idx):
        idx = int(idx)
Beispiel #7
0
# NOTE: this snippet is Python 2 (print statement, raw_input).
print ex_answer

# Prompt for a story and a question on every iteration, then report the
# model's most probable answers.  The loop has no exit condition of its own.
while True:
    # ask user for story and question
    story_lines = []
    line = raw_input("\nPlease enter a story:\n")
    # an empty line terminates the story
    while line != "":
        story_lines.append(line)
        line = raw_input()
    story = ("\n".join(story_lines)).strip()

    question = raw_input("Please enter a question:\n")

    # convert user input into a suitable network input: the text is
    # tokenized, turned into a word vector, padded to max_len as a list
    # holding that single sample, and wrapped in a backend array
    vectorize = lambda words, max_len: \
        be.array(Text.pad_sentences([babi.words_to_vector(BABI.tokenize(words))], max_len))
    s = vectorize(story, babi.story_maxlen)
    q = vectorize(question, babi.query_maxlen)

    # get prediction probabilities with forward propagation
    probs = model_inference.fprop(x=(s, q), inference=True).get()

    # get top k answers, with k = min(5, vocab_size).  A negative kth makes
    # argpartition place the k largest entries at the end, so [top_k:]
    # selects their indices (unordered).
    top_k = -min(5, babi.vocab_size)
    max_indices = np.argpartition(probs, top_k, axis=0)[top_k:]
    # NOTE(review): the array shapes here depend on the shape of probs,
    # which is not visible in this snippet -- verify before changing.
    max_probs = probs[max_indices]
    # order the selected indices by ascending probability; the loop below
    # walks them in reverse to start from the most probable one
    sorted_idx = max_indices[np.argsort(max_probs, axis=0)]

    print "\nAnswer:"
    for idx in reversed(sorted_idx):
        idx = int(idx)
Beispiel #8
0
# Hyperparameters (values taken from the paper this example follows)
args.batch_size = 50
time_steps = 150
hidden_size = 500
gradient_clip_value = None

# Create the compute backend; extract_valid_args presumably narrows the
# parsed arguments to those gen_backend accepts.
be = gen_backend(**extract_valid_args(args, gen_backend))

# Obtain the Penn Treebank file paths; the validation path comes from the PTB
# test loader.
train_path = load_ptb_train(path=args.data_dir)
valid_path = load_ptb_test(path=args.data_dir)

# Character-level datasets (default tokenization); the validation set is
# given the training set's vocabulary.
train_set = Text(time_steps, train_path)
valid_set = Text(time_steps, valid_path, vocab=train_set.vocab)

# Weights are drawn uniformly from [-0.08, 0.08]
init = Uniform(low=-0.08, high=0.08)

# Network: one recurrent layer followed by a softmax layer with one output
# per vocabulary entry.
layers = [
    Recurrent(hidden_size, init, activation=Tanh()),
    Affine(len(train_set.vocab), init, bias=init, activation=Softmax()),
]

cost = GeneralizedCost(costfunc=CrossEntropyMulti(usebits=True))

model = Model(layers=layers)

optimizer = RMSProp(
    gradient_clip_value=gradient_clip_value,
    stochastic_round=args.rounding,
)