Example #1
    def addLayer(self, idx):
        """
        :param idx: index of the layer(in the list passed to initialize the network) to be added. Note 0 is the input\
         layers
        :return: return a list of newly created variables that has to initialized
        """
        with tf.variable_scope('forward_variables', reuse=False):
            self.layers = self.layers[:-1]
            print 'layers len', len(self.layers)
            if len(self.layers) == 0:
                inpt = self.input
            else:
                inpt = self.layers[-1].activations
            self.layers.append(
                HiddenLayer(self.layer_dims[idx - 1], self.layer_dims[idx],
                            inpt, 'layer' + str(idx)))
            self.layers.append(
                LinearLayer(self.layer_dims[-2], self.layer_dims[-1],
                            self.layers[-1].activations,
                            str(idx) + 'layerNet_output'))
            self.__buildLossGraph__()

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope='forward_variables/layer' + str(idx))
        params += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='forward_variables_' + str(idx))
        params += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='forward_variables/' + str(idx) +
                                    'layerNet_output')
        print 'params are ', params
        self.buildEvalGraph()
        self.buildSummaryGraph()
        return params
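
A hedged usage sketch: the variables returned by addLayer still have to be initialized before training resumes. In TensorFlow 1.x that could be done with tf.variables_initializer, assuming an instance `net` of the class above and an open session `sess` (both names are assumptions, not taken from the example):

import tensorflow as tf

# `net` and `sess` are assumed to exist; addLayer() is the method shown above.
new_params = net.addLayer(2)                      # grow the network by one layer
sess.run(tf.variables_initializer(new_params))    # initialize only the newly created variables
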
Example #2
    def __buildFullGraph__(self):
        inpt = self.input
        for idx in range(1, len(self.layer_dims) - 1):
            self.layers.append(
                HiddenLayer(self.layer_dims[idx - 1], self.layer_dims[idx],
                            inpt, 'layer' + str(idx)))
            inpt = self.layers[-1].activations
        return inpt
Example #3
    def forward_propagation(self, x):
        """Forward Progation of a single sample."""
        tau = len(x)
        prev_h = sp.zeros(self.n_hiddens)

        cells = [None for i in range(tau)]
        for i in range(tau):
            # Compute the hidden state
            time_input = x[i]
            hidden = HiddenLayer()
            hidden.forward(self.U, time_input, self.W, prev_h, self.b)

            # Compute the output
            prev_h = hidden.h
            output = OutputLayer()
            output.forward(self.V, hidden.h, self.c)

            cells[i] = (hidden, output)
        return cells
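
The HiddenLayer.forward and OutputLayer.forward calls above are not shown. For a vanilla RNN cell they would typically compute h_t = tanh(U.x_t + W.h_prev + b) and a softmax over V.h_t + c; the following NumPy sketch illustrates that assumption (it does not reproduce the example's actual classes):

import numpy as np

def rnn_step(U, x_t, W, h_prev, b, V, c):
    """One recurrence step of a plain RNN cell (sketch only)."""
    h = np.tanh(U.dot(x_t) + W.dot(h_prev) + b)   # new hidden state
    z = V.dot(h) + c
    y = np.exp(z - z.max())
    y /= y.sum()                                  # softmax output distribution
    return h, y
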
Example #4
    def __init_layers(self, layer_spec):
        self.layers = []
        last_index = len(layer_spec) - 1
        for i, size in enumerate(layer_spec):
            if i == 0:
                self.layers.append(InputLayer(size, self.activation_fn))
            elif i == last_index:
                self.layers.append(OutputLayer(size, self.activation_fn))
            else:
                self.layers.append(HiddenLayer(size, self.activation_fn))

        for i in range(len(self.layers) - 1):
            self.__join_layers(self.layers[i], self.layers[i+1])
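
__join_layers is referenced but not shown. Presumably it wires each layer to its successor; a minimal sketch under that assumption (the attribute names are hypothetical, not from the source):

def join_layers(upstream, downstream):
    # Hypothetical wiring: keep forward/backward references so activations
    # can be pushed forward and gradients pulled back during training.
    upstream.next_layer = downstream
    downstream.prev_layer = upstream
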
Example #5
    def __init__(self,
                 architecture=[784, 100, 10],
                 activation='sigmoid',
                 learning_rate=0.1,
                 momentum=0.5,
                 weight_decay=1e-4,
                 dropout=0.5,
                 early_stopping=True,
                 seed=99):
        """
        Neural network model initializer.
        """

        # Attributes
        self.architecture = architecture
        self.activation = activation
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.dropout = dropout
        self.early_stopping = early_stopping
        self.seed = seed

        # Turn `activation` and `learning_rate` into class instances
        if not isinstance(self.activation, Activation):
            self.activation = Activation(self.activation)
        if not isinstance(self.learning_rate, LearningRate):
            self.learning_rate = LearningRate(self.learning_rate)

        # Initialize a list of layers
        self.layers = []
        for i, (n_in,
                n_out) in enumerate(zip(architecture[:-2],
                                        architecture[1:-1])):
            l = HiddenLayer('layer{}'.format(i), n_in, n_out, self.activation,
                            self.learning_rate, self.momentum,
                            self.weight_decay, self.dropout, self.seed + i)
            self.layers.append(l)
        # Output layer
        n_in, n_out = architecture[-2], architecture[-1]
        l = OutputLayer('output_layer', n_in, n_out, self.learning_rate,
                        self.momentum, self.weight_decay, self.dropout,
                        self.seed + i + 1)
        self.layers.append(l)

        # Training updates
        self.epoch = 0
        self.training_error = []
        self.validation_error = []
        self.training_loss = []
        self.validation_loss = []
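
A hedged usage sketch of this constructor, assuming the enclosing class is named NeuralNetwork (the real class name is not shown in the example); the keyword values simply echo the defaults above:

model = NeuralNetwork(architecture=[784, 100, 10],   # 784 inputs, 100 hidden units, 10 classes
                      activation='sigmoid',
                      learning_rate=0.1,
                      momentum=0.5,
                      weight_decay=1e-4,
                      dropout=0.5,
                      early_stopping=True,
                      seed=99)
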
Example #6
    def build_layers(self):
        layers = []
        for i, layer_desc in enumerate(self.layer_descriptions):
            units = self._calc_num_units(i)
            constructor_params = {
                'n_in': units[0],
                'n_out': units[1],
                'batch_size': self.batch_size,
                'k': layer_desc['k'],
                'activation': layer_desc['activation'],
                'name': 'l_layer_%d' % i
            }
            layers.append(HiddenLayer(**constructor_params))

        return layers
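
self.layer_descriptions is consumed but never defined in this snippet; judging by the keys accessed, each entry is a dict with 'k' and 'activation'. A sketch of what such a list might look like (the values are illustrative only, with T standing for theano.tensor as in the related examples):

import theano.tensor as T

layer_descriptions = [
    {'k': 0.1, 'activation': T.tanh},          # hidden layer
    {'k': 1,   'activation': T.nnet.softmax},  # output layer
]
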
Example #7
def process(train_source_file, train_target_file, dev_source_file, dev_target_file, test_source_file, test_target_predictions):
    
    train_source_data = get_data(train_source_file)
    train_target_data = get_data(train_target_file)
    dev_source_data = get_data(dev_source_file)
    dev_target_data = get_data(dev_target_file)
    test_source_data = get_data(test_source_file)
    
    source_words = set(itertools.chain(*(train_source_data + dev_source_data)))
    target_words = set(itertools.chain(*(train_target_data + dev_target_data)))
    
    source_word_to_idx = dict((v, i) for i, v in enumerate(source_words))
    target_word_to_idx = dict((v, i) for i, v in enumerate(target_words))
    target_idx_to_word = dict((i, v) for i, v in enumerate(target_words))
    
    # Preparing data    
    train_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in train_source_data]
    dev_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in dev_source_data]
    train_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in train_target_data]
    dev_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in dev_target_data]
    test_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in test_source_data]
    
    # Changing the input numpy arrays to tensor vectors
    source_sentence = T.ivector()
    target_sentence = T.ivector()
    target_gold = T.ivector()
    
    source_word_embedding = 128
    target_word_embedding = 128
    source_hidden_embedding = 256
    target_hidden_embedding = 256
        
    hyper_params = []
    
    vocab_source_size = len(source_words)
    vocab_target_size = len(target_words)
    
    source_lookup = EmbeddingLayer(vocab_source_size, source_word_embedding) 
    target_lookup = EmbeddingLayer(vocab_target_size, target_word_embedding) 
    hyper_params += source_lookup.params + target_lookup.params

    source_lstm_forward = LSTM(source_word_embedding, source_hidden_embedding, with_batch=False)
    
    target_lstm = LSTM(256, target_hidden_embedding, with_batch=False)
    hyper_params += source_lstm_forward.params + target_lstm.params[:-1] # Removing the last output

    tanh_layer = HiddenLayer(source_hidden_embedding, target_word_embedding, activation='tanh')
    # weighted_attention_vector + target_sentence_embedding + last encoded vector
    softmax_layer = HiddenLayer(source_hidden_embedding + target_hidden_embedding, vocab_target_size, activation='softmax')
    hyper_params += softmax_layer.params

    # Getting the source and target embeddings
    source_sentence_emb = source_lookup.link(source_sentence)
    target_sentence_emb = target_lookup.link(target_sentence)
    last_h = source_lstm_forward.link(source_sentence_emb)

    # Repeat the last encoder output once per target word:
    # first reshape it into a row vector, then repeat it target-sentence-length times
    broadcast_source_context = T.repeat(last_h.dimshuffle('x', 0), target_sentence_emb.shape[0], axis=0)
    broadcast_source_context = tanh_layer.link(broadcast_source_context)
    target_sentence_emb = T.concatenate((target_sentence_emb, broadcast_source_context), axis=1)
    target_lstm.h_0 = last_h
    target_lstm.link(target_sentence_emb)
    
    # Attention
    ht = target_lstm.h.dot(source_lstm_forward.h.transpose())
    # Normalizing across rows to get attention probabilities
    attention_weights = T.nnet.softmax(ht)
    # Weighted source_context_vector based on attention probabilities
    attention_weighted_vector = attention_weights.dot(source_lstm_forward.h)
    # Concatenating the hidden state from lstm and weighted source_context_vector
    pred = T.concatenate([attention_weighted_vector, target_lstm.h], axis=1)
    # Final softmax to get the best translation word
    prediction = softmax_layer.link(pred)
    
    # Computing the cross-entropy loss
    loss = T.nnet.categorical_crossentropy(prediction, target_gold).mean()
    
    updates = LearningMethod(clip=5.0).get_updates('adam', loss, hyper_params)
    
    # For training
    train_function = theano.function(
        inputs=[source_sentence, target_sentence, target_gold],
        outputs=loss,
        updates=updates
    )

    # For prediction
    predict_function = theano.function(
        inputs=[source_sentence, target_sentence],
        outputs=prediction,
    )
        
    def get_translations(source_sentences):
        translated_sentences = []
        for sentence in source_sentences:
            source_sentence = np.array(sentence).astype(np.int32)
            translated_so_far = [target_word_to_idx['<s>']]
            while True:
                next_word = predict_function(source_sentence, translated_so_far).argmax(axis=1)[-1] # Get the last translated word
                translated_so_far.append(next_word)
                if next_word == target_word_to_idx['</s>']:
                    translated_sentences.append([target_idx_to_word[x] for x in translated_so_far])
                    break
        return translated_sentences
    
    iterations = 100
    batch_size = 10000
    c = 0
    best_score = -1.0 * sys.maxint
    dev_preds = []
    test_preds = []
    dev_best_preds = []
    test_best_preds = []
    for i in xrange(iterations):
        print 'Iteration {}'.format(i)
        random_indexes = range(len(train_source_data))
        np.random.shuffle(random_indexes)
        loss = []
        for sent_no, index in enumerate(random_indexes):
            src_vector = np.array(train_source_data[index]).astype(np.int32)
            tgt_vector = np.array(train_target_data[index]).astype(np.int32)
            c = train_function(src_vector, tgt_vector[:-1], tgt_vector[1:])                  
            loss.append(c)
            if sent_no % batch_size == 0 and sent_no > 0:
                dev_preds = get_translations(dev_source_data)
                dev_bleu_score = get_bleu(dev_preds)
                if dev_bleu_score > best_score:
                    best_score = dev_bleu_score
                    dev_best_preds = dev_preds[:]
                    # Decode the test set once the dev BLEU score reaches the baseline
                    if dev_bleu_score >= 28:
                        test_preds = get_translations(test_source_data)
                        test_best_preds = test_preds[:]
                    print 'Dev bleu score {}'.format(dev_bleu_score)
                
        print 'Iteration: {} Loss {}'.format(i, 1.0 * (sum(loss))/len(loss))

            
    dev_output_fp = open('dev_output.txt', 'w')
    test_output_fp = open(test_target_predictions, 'w')
    
    for pred in dev_best_preds:
        dev_output_fp.write(' '.join(pred) + '\n')
    dev_output_fp.close()
    
    for pred in test_best_preds:
        test_output_fp.write(' '.join(pred) + '\n')
    test_output_fp.close()
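
The attention block above (from ht = target_lstm.h.dot(...) down to the final softmax) is plain dot-product attention. A small NumPy sketch of the same computation on concrete arrays, separate from the Theano graph, makes the shapes easier to follow:

import numpy as np

def dot_product_attention(target_h, source_h):
    """target_h: (T_tgt, d), source_h: (T_src, d) -> context: (T_tgt, d)."""
    scores = target_h.dot(source_h.T)              # (T_tgt, T_src)
    scores -= scores.max(axis=1, keepdims=True)    # numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=1, keepdims=True)  # softmax over source positions
    return weights.dot(source_h)                   # attention-weighted source context

In the example this context is concatenated with target_lstm.h and fed through the softmax HiddenLayer to score the next target word.
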
Example #8
def main():
    config = ConfigParser.ConfigParser()
    train_src = load_data(config.get("Data", "train_src"))
    dev_src = load_data(config.get("Data", "dev_src"))
    test_src = load_data(config.get("Data", "test_src"))

    train_tgt = load_data(config.get("Data", "train_tgt"))
    dev_tgt = load_data(config.get("Data", "dev_tgt"))
    test_tgt = load_data(config.get("Data", "test_tgt"))

    assert len(train_src) == len(train_tgt)

    UD_path = config.get("Path", "UD")

    sys.path.append(UD_path + "/")

    words_src = get_words(train_src + dev_src)
    words_tgt = get_words(train_tgt + dev_tgt)

    source_word2ind = {word: ind for ind, word in enumerate(words_src)}
    source_ind2word = {ind: word for ind, word in enumerate(words_src)}
    target_word2ind = {word: ind for ind, word in enumerate(words_tgt)}
    target_ind2word = {ind: word for ind, word in enumerate(words_tgt)}

    # In[24]:

    #
    # Model
    #
    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    proj_dim = 104  # size of the first projection layer
    dropout = 0.5  # dropout rate

    n_src = len(source_word2ind)  # number of words in the source language
    n_tgt = len(target_word2ind)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name="src_lookup")  # lookup table for source words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name="tgt_lookup")  # lookup table for target words
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_for", with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_rev", with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name="tgt_lstm", with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name="proj_layer1", activation="softmax")
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name="proj_layer2", activation="tanh")
    params += proj_layer1.params  # + proj_layer2.params

    # Train status
    is_train = T.iscalar("is_train")
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)
    print "src_sentence_emb", src_sentence_emb.eval({src_sentence: src_sentence_t}).shape
    print "tgt_sentence_emb", tgt_sentence_emb.eval({tgt_sentence: tgt_sentence_t}).shape

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    print "src_lstm_for.h", src_lstm_for.h.eval({src_sentence: src_sentence_t}).shape
    print "src_lstm_rev.h", src_lstm_rev.h.eval({src_sentence: src_sentence_t}).shape

    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)
    print "src_context", src_context.eval({src_sentence: src_sentence_t}).shape

    tgt_lstm.h_0 = src_context[-1]
    print "tgt sentence emb", tgt_sentence_emb.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
    tgt_lstm.link(tgt_sentence_emb)
    print "tgt_lstm.h", tgt_lstm.h.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    print "transition", transition.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
    print "transition_last", transition_last.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    prediction = proj_layer1.link(transition_last)
    print "prediction", prediction.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    cost += beta * T.mean(
        (tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2
    )  # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf

    print "cost", cost.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t, tgt_gold: tgt_gold_t})

    # In[26]:

    updates = LearningMethod(clip=5.0).get_updates("adam", cost, params)

    # In[27]:

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold], outputs=cost, updates=updates)

    # In[28]:

    f_eval = theano.function(inputs=[src_sentence, tgt_sentence], outputs=prediction)
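
The extra term added to the cost penalizes changes in the squared hidden activations between consecutive time steps (the comment points to arXiv:1511.08400). Numerically it amounts to the NumPy sketch below; note that beta itself is never defined in this snippet (the related examples further down use 500):

import numpy as np

def activation_stability_penalty(h, beta):
    """h: (T, d) array of hidden states; penalize step-to-step changes of h**2."""
    return beta * np.mean((h[:-1] ** 2 - h[1:] ** 2) ** 2)
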
Example #9
def main():

    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5  # dropout rate

    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    # lookup table for source words
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    # lookup table for target words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim,
                        src_lstm_hid_dim,
                        name='src_lstm_for',
                        with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim,
                        src_lstm_hid_dim,
                        name='src_lstm_rev',
                        with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim,
                    tgt_lstm_hid_dim,
                    name='tgt_lstm',
                    with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim,
                              n_tgt,
                              name='proj_layer1',
                              activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim,
                              tgt_emb_dim,
                              name='proj_layer2',
                              activation='tanh')
    # proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]],
                                axis=1)

    tgt_lstm.h_0 = src_context[-1]
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0),
                                    tgt_sentence_emb.shape[0],
                                    axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)

    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context),
                                     axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)

    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1]**2 - tgt_lstm.h[1:]**2)**2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold],
                              outputs=cost,
                              updates=updates)

    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None

    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10
    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x]
                          for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x]
                          for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x]
                          for x in train_tgt_batch[j]][1:]).astype(np.int32))
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(source_word2idx,
                                              target_word2idx,
                                              target_idx2word,
                                              f_eval,
                                              mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(source_word2idx,
                                                      target_word2idx,
                                                      target_idx2word,
                                                      f_eval,
                                                      mode="test")
                    print 'Found new best validation score %f ' % (
                        best_valid_score)
                log.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' %
                          (i, j, bleu))

        # Store after epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()

    log.close()
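
The training call f_train(src, tgt[:-1], tgt[1:]) implements teacher forcing: the decoder reads the gold translation shifted by one position and is asked to predict the next token. A tiny sketch of that shift, assuming sentences carry '<s>' and '</s>' markers as in Example #7 (the tokens are made up for illustration):

tokens = ['<s>', 'hello', 'world', '</s>']
decoder_input = tokens[:-1]   # ['<s>', 'hello', 'world']
gold_output   = tokens[1:]    # ['hello', 'world', '</s>']
# After mapping through target_word2idx these become the second and third
# arguments of f_train, exactly as in the loop above.
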
Example #10
    def create_cell(self):
        # FIXME: Should create cells based on the network structure
        hidden = HiddenLayer(self.hidden_size)
        output = OutputLayer(self.hidden_size)

        return (hidden, output)
Example #11
def main():

    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5  # dropout rate

    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    # lookup table for source words
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    # lookup table for target words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name='proj_layer1', activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    # proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    src_context = T.concatenate(
        [src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)

    tgt_lstm.h_0 = src_context[-1]
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0), tgt_sentence_emb.shape[0], axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)

    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)

    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(
        inputs=[src_sentence, tgt_sentence, tgt_gold],
        outputs=cost,
        updates=updates
    )

    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None

    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10
    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][1:]).astype(np.int32)
            )
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(
                    source_word2idx, target_word2idx, target_idx2word, f_eval, mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(
                        source_word2idx, target_word2idx, target_idx2word, f_eval, mode="test")
                    print 'Found new best validation score %f ' % (best_valid_score)
                log.write(
                    'Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, bleu))

        # Store after epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()

    log.close()
Example #12
def test_train_little_only(
    rng,
    batch_size,
    learning_rate,
    n_hids,
    n_epochs=1000,
    L1_reg=0.0,
    L2_reg=0.0001,
    zero_last_layer_params=False,
):
    def summarize_rates():
        print "Learning rate: ", learning_rate.rate

    l_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX),
                             name='learning_rate')

    index = T.lscalar('index')
    l_x = T.matrix('l_x', dtype=config.floatX)
    y = T.ivector('y')

    print "Loading Data"
    print "... MNIST"
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "Building models"
    print "... Building layers"
    # Create network structure
    x_size = train_set_x.shape[1].eval()
    y_size = train_set_y.shape[0].eval()
    n_in = x_size
    n_out = n_hids
    l_layers = []
    b_layers = []
    l_params = None

    # Shared variable used to always activate one block in a layer, as in the
    # input and output layers
    one_block_idxs = shared(np.zeros((batch_size, 1), dtype='int64'),
                            name='one_block_idxs')

    l_layers.append(
        HiddenLayer(n_in,
                    n_out,
                    batch_size,
                    k=0.1,
                    activation=T.tanh,
                    name='l_layer_' + str(len(l_layers))))

    n_in = n_out
    #    l_layers.append(
    #        HiddenLayer(
    #            n_in,
    #            n_out,
    #            batch_size,
    #            k=0.1,
    #            activation=T.tanh,
    #            name='l_layer_' + str(len(l_layers))
    #        )
    #    )

    n_out = 10
    l_layers.append(
        HiddenLayer(n_in,
                    n_out,
                    batch_size,
                    k=1,
                    activation=T.nnet.softmax,
                    name='l_layer_' + str(len(l_layers))))
    if zero_last_layer_params:
        l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value())
        l_layers[-1].b.set_value(0 * l_layers[-1].b.get_value())

    for layer in l_layers:
        print "\t%s" % layer

    #for l_layer in l_layers:
    #    for param in l_layer.params:
    #        param.set_value(np.ones_like(param.get_value()))

    print "... Building top active updates"
    top_active = []
    l_activation = l_x
    for i in range(len(l_layers)):
        l_activation = l_layers[i].output(l_activation)

    print "... Building costs and errors"
    l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y),
                                L1_reg, L2_reg)
    l_error = l_layers[-1].error(l_activation, y)

    print "... Building parameter updates"
    l_grads = []
    l_param_updates = []
    for i in range(len(l_layers)):
        for param in l_layers[i].params:
            gparam = T.grad(l_cost, param)
            l_grads.append(gparam)
            l_param_updates.append((param, param - l_learning_rate * gparam))

    print "... Compiling little net train function"
    l_updates = l_param_updates

    l_train_model = function(
        [index], [l_cost, l_x, y],
        updates=l_updates,
        givens={
            l_x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net test function"
    l_test_model = function(
        [index],
        l_error,
        givens={
            l_x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net validate function"
    l_validate_model = function(
        [index],
        l_error,
        givens={
            l_x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "Training"

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 10  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    this_validation_loss = 0
    this_validation_loss_l = 0
    this_validation_loss_b = 0
    best_validation_loss = np.inf
    best_validation_loss_l = best_validation_loss
    best_validation_loss_b = best_validation_loss
    best_iter = 0
    test_score = 0.
    test_score_l = 0.
    accum_l = 0
    epoch = 0
    train_time_accum_l = 0
    done_looping = False

    timers = ['train', 'valid', 'train']
    ts = TS(['epoch', 'valid'])
    ts_l = TS(timers)

    summarize_rates()

    ts.start()
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        ts.start('epoch')
        for minibatch_index in xrange(n_train_batches):
            ts_l.start('train')
            minibatch_avg_cost_l = l_train_model(minibatch_index)
            ts_l.end('train')

            minibatch_avg_cost_l = minibatch_avg_cost_l[0]
            if np.isnan(minibatch_avg_cost_l):
                print "minibatch_avg_cost_l: %f" % minibatch_avg_cost_l
                ipdb.set_trace()
            accum_l = accum_l + minibatch_avg_cost_l

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                ts.end('epoch')
                ts.reset('epoch')

                ts_l.reset('train')
                accum_l = accum_l / validation_frequency
                l_summary = ("minibatch_avg_cost_l: %f, time: %f" %
                             (accum_l, ts_l.accumed['train'][-1][1]))
                accum_l = 0
                train_time_accum_l = 0

                print "%s" % (l_summary)

                # compute zero-one loss on validation set
                summary = ('epoch %i, minibatch %i/%i' %
                           (epoch, minibatch_index + 1, n_train_batches))

                validation_losses_l = [
                    l_validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss_l = np.mean(validation_losses_l)
                l_summary = ('little validation error %f %% ' %
                             (this_validation_loss_l * 100.))

                print("%s %s" % (summary, l_summary))
                #ipdb.set_trace()

                # if we got the best validation score until now
                this_validation_loss = this_validation_loss_l

                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss_l = this_validation_loss_l
                    best_validation_loss = best_validation_loss_l

                    best_iter = iter

                    # test it on the test set
                    test_losses_l = [
                        l_test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score_l = np.mean(test_losses_l)
                    l_summary = 'little: %f' % (test_score_l * 100.)

                    print(
                        '     epoch %i, minibatch %i/%i,'
                        ' test error of best model %s' %
                        (epoch, minibatch_index + 1, n_train_batches,
                         l_summary))

                learning_rate.update()

                l_learning_rate.set_value(learning_rate.rate)

                summarize_rates()

            if patience <= iter:
                done_looping = True
                break

    ts.end()
    print(
        'Optimization complete. Best validation score of %f %% '
        'obtained at iteration %i, with test performance %f %%' %
        (best_validation_loss_l * 100., best_iter + 1, test_score_l * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %s' % ts)

    return ts.diffs['epoch']
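
The loop above follows the classic patience-based early-stopping recipe: validate every validation_frequency minibatches, and whenever the validation loss improves enough, extend the patience budget. A condensed sketch of just that logic, detached from the Theano machinery:

def should_stop(iteration, this_loss, state,
                patience_increase=10, improvement_threshold=0.995):
    """state is a dict holding 'patience' and 'best_loss'; returns True when patience runs out."""
    if this_loss < state['best_loss']:
        if this_loss < state['best_loss'] * improvement_threshold:
            state['patience'] = max(state['patience'], iteration * patience_increase)
        state['best_loss'] = this_loss
    return state['patience'] <= iteration
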
Example #13
def test_big_and_little_train_big(rng,
                                  batch_size,
                                  learning_rate,
                                  momentum_rate,
                                  n_epochs=1000,
                                  L1_reg=0.0,
                                  L2_reg=0.0001,
                                  restore_parameters=False,
                                  select_top_active=False,
                                  mult_small_net_params=False,
                                  zero_last_layer_params=False,
                                  train_little_net=False,
                                  train_big_net=True):
    def summarize_rates():
        print "Learning rate: ", learning_rate.rate, \
            "Momentum: ", momentum.get_value()

    assert (train_big_net or train_little_net)

    l_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX),
                             name='learning_rate')
    b_learning_rate = shared(np.array(learning_rate.rate, dtype=config.floatX),
                             name='learning_rate')
    momentum = shared(np.array(momentum_rate.rate, dtype=config.floatX),
                      name='momentum')

    index = T.lscalar('index')
    l_x = T.matrix('l_x', dtype=config.floatX)
    b_x = T.tensor3('b_x', dtype=config.floatX)
    y = T.ivector('y')

    print "Loading Data"
    print "... MNIST"
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "Building models"
    print "... Building layers"
    # Create network structure
    x_size = train_set_x.shape[1].eval()
    y_size = train_set_y.shape[0].eval()
    n_in = x_size
    n_units_per = 1
    n_out = 5000
    l_layers = []
    b_layers = []
    l_params = None

    # Shared variable used to always activate one block in a layer, as in the
    # input and output layers
    one_block_idxs = shared(np.zeros((batch_size, 1), dtype='int64'),
                            name='one_block_idxs')

    l_layers.append(
        HiddenLayer(n_in,
                    n_out,
                    batch_size,
                    k=0.1,
                    activation=T.tanh,
                    name='l_layer_' + str(len(l_layers))))

    if mult_small_net_params:
        l_params = l_layers[-1].params

    b_layers.append(
        HiddenBlockLayer((1, x_size), (n_out, n_units_per),
                         one_block_idxs,
                         l_layers[-1].top_active,
                         batch_size,
                         activation=T.tanh,
                         name='b_layer_' + str(len(b_layers)),
                         l_params=l_params,
                         l_param_map=[('x', 1, 0, 'x'), (0, 'x')]))

    n_in = n_out
    l_layers.append(
        HiddenLayer(n_in,
                    n_out,
                    batch_size,
                    k=0.1,
                    activation=T.tanh,
                    name='l_layer_' + str(len(l_layers))))

    if mult_small_net_params:
        l_params = l_layers[-1].params

    b_layers.append(
        HiddenBlockLayer(
            (n_in, n_units_per),
            (n_out, n_units_per),
            l_layers[-2].top_active,
            l_layers[-1].top_active,
            #out_idxs_n,
            batch_size,
            activation=T.tanh,
            name='b_layer_' + str(len(b_layers)),
            l_params=l_params,
            l_param_map=[(0, 1, 'x', 'x'), (0, 'x')]))

    n_out = 10
    l_layers.append(
        HiddenLayer(n_in,
                    n_out,
                    batch_size,
                    k=1,
                    activation=T.nnet.softmax,
                    name='l_layer_' + str(len(l_layers))))
    if zero_last_layer_params:
        l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value())
        l_layers[-1].b.set_value(0 * l_layers[-1].b.get_value())

    if mult_small_net_params:
        l_params = l_layers[-1].params

    b_layers.append(
        HiddenBlockLayer((n_in, n_units_per), (1, n_out),
                         l_layers[-2].top_active,
                         one_block_idxs,
                         batch_size,
                         None,
                         name='b_layer_' + str(len(b_layers)),
                         l_params=l_params,
                         l_param_map=[(0, 'x', 'x', 1), ('x', 0)]))
    if zero_last_layer_params:
        b_layers[-1].W.set_value(0 * b_layers[-1].W.get_value())
        b_layers[-1].b.set_value(0 * b_layers[-1].b.get_value())

    if train_little_net or select_top_active:
        for layer in l_layers:
            print "\t%s" % layer

    if train_big_net:
        for layer in b_layers:
            print layer

    if restore_parameters:
        print "... Restoring weights of little model"
        restore_parameters('parameters_20_20_l1_0.0001_l2_0.0001.pkl',
                           l_layers)

    #for l_layer in l_layers:
    #    for param in l_layer.params:
    #        param.set_value(np.ones_like(param.get_value()))

    print "... Building top active updates"
    top_active = []
    l_activation = l_x
    b_activation = b_x
    b_activations = [b_activation]
    for i in range(len(l_layers)):
        l_activation = l_layers[i].output(l_activation)
        b_activation = b_layers[i].output(b_activation)
        b_activations.append(b_activation)
        top_active.append((l_layers[i].top_active,
                           T.argsort(T.abs_(l_activation))[:, :l_layers[i].k]))

    print "... Building costs and errors"
    l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y),
                                L1_reg, L2_reg)
    l_error = l_layers[-1].error(l_activation, y)

    # T.nnet.softmax takes a matrix not a tensor so we only calculate the
    # linear component at the last layer and here we reshape and then
    # apply the softmax
    #b_activation = T.nnet.softmax(((b_activation*b_activation)**2).sum(axis=2))
    #b_activation = relu_softmax(((b_activation*b_activation)**2).sum(axis=2))
    #b_activation = T.nnet.softmax(T.mean(b_activation, axis=2))
    #b_activation = relu_softmax(T.mean(b_activation, axis=2))
    #b_activation = T.nnet.softmax(T.max(b_activation, axis=2))
    #b_activation = relu_softmax(T.max(b_activation, axis=2))
    b_shp = b_activation.shape
    #b_activation = relu_softmax(b_activation.reshape((b_shp[0], b_shp[2])))
    b_activation = T.nnet.softmax(b_activation.reshape((b_shp[0], b_shp[2])))
    b_activations.append(b_activation)
    b_cost = add_regularization(b_layers, b_layers[-1].cost(b_activation, y),
                                L1_reg, L2_reg)
    b_error = b_layers[-1].error(b_activation, y)

    print "... Building parameter updates"
    l_grads = []
    l_param_updates = []
    b_grads = []
    b_param_updates = []
    for i in range(len(l_layers)):
        for param in l_layers[i].params:
            gparam = T.grad(l_cost, param)
            l_grads.append(gparam)
            l_param_updates.append((param, param - l_learning_rate * gparam))

        for param in b_layers[i].params:
            b_gparam = T.grad(
                b_cost,
                param,
                #consider_constant=[b_layers[i].in_idxs, b_layers[i].out_idxs]
            )
            b_velocity = shared(
                np.zeros_like(param.get_value(), dtype=theano.config.floatX),
                param.name + '_velocity')
            b_param_updates.append(
                (b_velocity,
                 momentum * b_velocity - b_learning_rate * b_gparam))
            b_grads.append(b_gparam)
            b_param_updates.append((param, param + b_velocity))

        #if b_layers[i].l_params is not None:
        #for param in b_layers[i].l_params:
        #l_gparam = T.grad(
        #    b_cost,
        #    param
        #)
        #l_velocity = shared(
        #    np.zeros_like(param.get_value()),
        #    param.name + '_velocity'
        #)
        #b_param_updates.append((
        #    l_velocity, momentum*l_velocity - b_learning_rate*l_gparam
        #))
        #l_grads.append(l_gparam)
        #b_param_updates.append((param, param + l_velocity))
        #b_param_updates.append((
        #    param, param - 0.0001*l_gparam
        #))

    print "... Compiling little net train function"
    l_updates = []
    if select_top_active:
        l_updates = l_updates + top_active

    if train_little_net:
        l_updates = l_updates + l_param_updates

    l_train_model = function(
        [index], [l_cost, l_x, y],
        updates=l_updates,
        givens={
            l_x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net train function"
    temp = train_set_x.get_value(borrow=True, return_internal_type=True)
    train_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True,
                           name='train_set_x_b')

    b_updates = []
    if train_big_net:
        b_updates = b_updates + b_param_updates

    b_train_model = function(
        [index], [b_cost],
        updates=b_updates,
        givens={
            b_x: train_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #theano.printing.debugprint(b_train_model)
    #ipdb.set_trace()

    #    verify_layers(batch_size, b_layers, train_set_x_b, train_set_y)
    #    temp = verify_cost(
    #        b_cost,
    #        b_layers,
    #        b_x,
    #        y,
    #        batch_size,
    #        train_set_x_b,
    #        train_set_y
    #    )
    #    T.verify_grad(
    #        temp,
    #        [b_layers[0].W.get_value(), b_layers[1].W.get_value()],
    #        rng=rng
    #    )

    print "... Compiling little net test function"
    l_test_model = function(
        [index],
        l_error,
        givens={
            l_x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net test function"
    temp = test_set_x.get_value(borrow=True, return_internal_type=True)
    test_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                          borrow=True,
                          name='test_set_x_b')
    b_test_model = function(
        [index],
        b_error,
        givens={
            b_x: test_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net validate function"
    l_validate_model = function(
        [index],
        l_error,
        givens={
            l_x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net validate function"
    temp = valid_set_x.get_value(borrow=True, return_internal_type=True)
    valid_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True,
                           name='valid_set_x_b')
    b_validate_model = function(
        [index],
        b_error,
        givens={
            b_x: valid_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "Training"

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 10  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    this_validation_loss = 0
    this_validation_loss_l = 0
    this_validation_loss_b = 0
    best_validation_loss = np.inf
    best_validation_loss_l = best_validation_loss
    best_validation_loss_b = best_validation_loss
    best_iter = 0
    test_score = 0.
    test_score_l = 0.
    test_score_b = 0.
    accum_l = 0
    accum_b = 0
    epoch = 0
    train_time_accum_l = 0
    train_time_accum_b = 0
    done_looping = False

    timers = ['train', 'valid', 'train']
    ts = TS(['epoch', 'valid'])
    ts_l = TS(timers)
    ts_b = TS(timers)

    summarize_rates()

    ts.start()
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        ts.start('epoch')
        for minibatch_index in xrange(n_train_batches):
            if train_little_net or select_top_active:
                ts_l.start('train')
                minibatch_avg_cost_l = l_train_model(minibatch_index)
                ts_l.end('train')

                minibatch_avg_cost_l = minibatch_avg_cost_l[0]
                if np.isnan(minibatch_avg_cost_l):
                    print "minibatch_avg_cost_l: %f" % minibatch_avg_cost_l
                    ipdb.set_trace()
                accum_l = accum_l + minibatch_avg_cost_l

            if train_big_net:
                ts_b.start('train')
                minibatch_avg_cost_b = b_train_model(minibatch_index)
                ts_b.end('train')

                minibatch_avg_cost_b = minibatch_avg_cost_b[0]
                accum_b = accum_b + minibatch_avg_cost_b

            #print "minibatch_avg_cost: " + str(minibatch_avg_cost) + " minibatch_avg_cost_b: " + str(minibatch_avg_cost_b)
            #print l_layers[0].W.get_value().sum(), l_layers[1].W.get_value().sum(), b_layers[0].W.get_value().sum(), b_layers[1].W.get_value().sum()
            #print "A: ", np.max(np.abs(b_layers[0].W.get_value())), np.max(np.abs(b_layers[0].b.get_value())), np.max(np.abs(b_layers[1].W.get_value())), np.max(np.abs(b_layers[1].b.get_value()))
            #print "B: ", np.abs(b_layers[0].W.get_value()).sum(), np.abs(b_layers[0].b.get_value()).sum(), np.abs(b_layers[1].W.get_value()).sum(), np.abs(b_layers[1].b.get_value()).sum()
            #print "C: ", np.abs(np.array(minibatch_avg_cost_b[1])).sum(), np.abs(np.array(minibatch_avg_cost_b[2])).sum(), np.abs(np.array(minibatch_avg_cost_b[3])).sum(), np.abs(np.array(minibatch_avg_cost_b[4])).sum()

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                ts.end('epoch')
                ts.reset('epoch')

                l_summary = ""
                if train_little_net or select_top_active:
                    ts_l.reset('train')
                    accum_l = accum_l / validation_frequency
                    l_summary = ("minibatch_avg_cost_l: %f, time: %f" %
                                 (accum_l, ts_l.accumed['train'][-1][1]))
                    accum_l = 0
                    train_time_accum_l = 0

                b_summary = ""
                if train_big_net:
                    ts_b.reset('train')
                    accum_b = accum_b / validation_frequency
                    b_summary = ("minibatch_avg_cost_b: %f, time: %f" %
                                 (accum_b, ts_b.accumed['train'][-1][1]))
                    accum_b = 0

                print "%s %s" % (l_summary, b_summary)

                # compute zero-one loss on validation set
                summary = ('epoch %i, minibatch %i/%i' %
                           (epoch, minibatch_index + 1, n_train_batches))

                l_summary = ""
                if train_little_net or select_top_active:
                    validation_losses_l = [
                        l_validate_model(i) for i in xrange(n_valid_batches)
                    ]
                    this_validation_loss_l = np.mean(validation_losses_l)
                    l_summary = ('little validation error %f %% ' %
                                 (this_validation_loss_l * 100.))

                b_summary = ""
                if train_big_net:
                    validation_losses_b = [
                        b_validate_model(i) for i in xrange(n_valid_batches)
                    ]
                    this_validation_loss_b = np.mean(validation_losses_b)
                    #this_validation_loss_b = 0
                    b_summary = ('big validation error %f %% ' %
                                 (this_validation_loss_b * 100.))

                print("%s %s %s" % (summary, l_summary, b_summary))
                #ipdb.set_trace()

                # if we got the best validation score until now
                if train_big_net:
                    this_validation_loss = this_validation_loss_b
                elif train_little_net:
                    this_validation_loss = this_validation_loss_l

                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss_l = this_validation_loss_l
                    best_validation_loss_b = this_validation_loss_b

                    if train_big_net:
                        best_validation_loss = best_validation_loss_b
                    elif train_little_net:
                        best_validation_loss = best_validation_loss_l

                    best_iter = iter

                    # test it on the test set
                    l_summary = ""
                    if train_little_net:
                        test_losses_l = [
                            l_test_model(i) for i in xrange(n_test_batches)
                        ]
                        test_score_l = np.mean(test_losses_l)
                        l_summary = 'little: %f' % (test_score_l * 100.)

                    b_summary = ""
                    if train_big_net:
                        test_losses_b = [
                            b_test_model(i) for i in xrange(n_test_batches)
                        ]
                        test_score_b = np.mean(test_losses_b)
                        #test_score_b = 0
                        b_summary = 'big: %f' % (test_score_b * 100.)

                    print(
                        '     epoch %i, minibatch %i/%i,'
                        ' test error of best model %s %s' %
                        (epoch, minibatch_index + 1, n_train_batches,
                         l_summary, b_summary))

                learning_rate.update()

                if train_little_net:
                    l_learning_rate.set_value(learning_rate.rate)

                if train_big_net:
                    b_learning_rate.set_value(learning_rate.rate)

                momentum_rate.update()
                momentum.set_value(momentum_rate.rate)
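                # both nets share one decaying learning-rate schedule and one
                # momentum schedule; summarize_rates() presumably reports the
                # current values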

                summarize_rates()

            if patience <= iter:
                done_looping = True
                break

    ts.end()
    print(
        'Optimization complete. Best validation score of %f %% (%f %%) '
        'obtained at iteration %i, with test performance %f %% (%f %%)' %
        (best_validation_loss_l * 100., best_validation_loss_b * 100.,
         best_iter + 1, test_score_l * 100., test_score_b * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %s' % ts)
Beispiel #14
0
def test_big_and_little_train_both(rng,
                                   batch_size=1,
                                   learning_rate=0.01,
                                   n_epochs=1000,
                                   L1_reg=0.0,
                                   L2_reg=0.0001):
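    """
    Train the little net (dense HiddenLayer stack) and the big net
    (HiddenBlockLayer stack) side by side on MNIST with plain SGD and
    patience-based early stopping; the little net's top-k active units
    select which blocks of the big net are used.
    """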
    l_learning_rate = learning_rate
    b_learning_rate = 10 * learning_rate

    index = T.lscalar('index')
    l_x = T.matrix('l_x', dtype=config.floatX)
    b_x = T.tensor3('b_x', dtype=config.floatX)
    y = T.ivector('y')

    print "Loading Data"
    dataset = 'mnist.pkl.gz'
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print "Building models"
    print "... Building layers"
    # Create network structure
    x_size = train_set_x.shape[1].eval()
    n_in = x_size
    n_units_per = 32
    n_out = 500
    l_layers = []
    b_layers = []

    l_layers.append(
        HiddenLayer(
            n_in,
            n_out,
            batch_size,
            #k=0.05,
            k=1,
            activation=T.tanh,
            name='l_layer_' + str(len(l_layers))))

    in_idxs_0 = shared(np.zeros((batch_size, 1), dtype='int64'),
                       name='in_idxs_0')
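    # every example's input block index is fixed at 0: the big net's first
    # layer treats the whole input vector as a single block of size x_size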
    b_layers.append(
        HiddenBlockLayer((1, x_size), (n_out, n_units_per),
                         in_idxs_0,
                         l_layers[-1].top_active,
                         batch_size,
                         activation=T.tanh,
                         name='b_layer_' + str(len(b_layers))))

    #n_in = n_out
    #n_out = 100
    #k_activations = 0.12
    #l_layers.append(
    #    HiddenLayer(
    #        n_in,
    #        n_out,
    #        k=k_activations,
    #        name='l_layer_' + str(len(l_layers))
    #    )
    #)
    #b_layers.append(HiddenBlockLayer(n_in, n_out, batch_size))

    n_in = n_out
    n_out = 10
    l_layers.append(
        HiddenLayer(n_in,
                    n_out,
                    batch_size,
                    k=1,
                    activation=T.nnet.softmax,
                    name='l_layer_' + str(len(l_layers))))
    l_layers[-1].W.set_value(0 * l_layers[-1].W.get_value())
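    # starting the softmax layer from zero weights makes the little net's
    # initial predictions (near-)uniform, assuming zero-initialized biases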

    # T.nnet.softmax takes a matrix not a tensor so just calculate the linear
    # component in the layer and apply the softmax later
    #out_idxs_n = shared(
    #    np.repeat(
    #        np.arange(n_out, dtype='int64').reshape(1, n_out),
    #        batch_size,
    #        axis=0
    #    ),
    #    name='out_idxs_' + str(len(l_layers))
    #)
    b_layers.append(
        HiddenBlockLayer(
            (n_in, n_units_per),
            (n_out, n_units_per),
            l_layers[-2].top_active,
            l_layers[-1].top_active,
            #out_idxs_n,
            batch_size,
            None,
            name='b_layer_' + str(len(b_layers))))
    #b_layers[-1].W.set_value(0*b_layers[-1].W.get_value())

    print "... Building top active updates"
    top_active = []
    l_activation = l_x
    b_activation = b_x
    b_activations = [b_activation]
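    # for each layer, record an update that writes the indices chosen from the
    # little net's activations into `top_active`, which in turn gates which
    # blocks of the big net are computed; note that T.argsort sorts in
    # ascending order, so [:, :k] keeps the k smallest-magnitude activations
    # (the k largest would be [:, -k:])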
    for i in range(len(l_layers)):
        l_activation = l_layers[i].output(l_activation)
        b_activation = b_layers[i].output(b_activation)
        b_activations.append(b_activation)
        top_active.append((l_layers[i].top_active,
                           T.argsort(T.abs_(l_activation))[:, :l_layers[i].k]))

    print "... Building costs and errors"
    l_cost = add_regularization(l_layers, l_layers[-1].cost(l_activation, y),
                                L1_reg, L2_reg)
    l_error = l_layers[-1].error(l_activation, y)

    # T.nnet.softmax takes a matrix not a tensor so we only calculate the
    # linear component at the last layer and here we reshape and then
    # apply the softmax
    #b_activation = T.nnet.softmax(((b_activation*b_activation)**2).sum(axis=2))
    #b_activation = relu_softmax(((b_activation*b_activation)**2).sum(axis=2))
    b_activation = T.nnet.softmax(T.mean(b_activation, axis=2))
    #b_activation = relu_softmax(T.mean(b_activation, axis=2))
    #b_activation = T.nnet.softmax(T.max(b_activation, axis=2))
    #b_activation = relu_softmax(T.max(b_activation, axis=2))
    b_activations.append(b_activation)
    b_cost = add_regularization(b_layers, b_layers[-1].cost(b_activation, y),
                                L1_reg, L2_reg)
    b_error = b_layers[-1].error(b_activation, y)

    print "... Building parameter updates"
    l_grads = []
    l_param_updates = []
    b_grads = []
    b_param_updates = []
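    # plain SGD updates for both nets; for the big net the block-index
    # variables are passed as consider_constant so no gradient flows through
    # the sparse gating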
    for i in range(len(l_layers)):
        for param in l_layers[i].params:
            gparam = T.grad(l_cost, param)
            l_grads.append(gparam)
            l_param_updates.append((param, param - l_learning_rate * gparam))

        for param in b_layers[i].params:
            gparam = T.grad(
                b_cost,
                param,
                consider_constant=[b_layers[i].in_idxs, b_layers[i].out_idxs])
            b_grads.append(gparam)
            b_param_updates.append((param, param - b_learning_rate * gparam))

    print "... Compiling little net train function"
    l_train_model = function(
        [index], [l_cost, l_x, y],
        updates=top_active + l_param_updates,
        givens={
            l_x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net train function"
    temp = train_set_x.get_value(borrow=True, return_internal_type=True)
    train_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True,
                           name='train_set_x_b')
    b_train_model = function(
        [index], [b_cost],
        updates=b_param_updates,
        givens={
            b_x: train_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    #theano.printing.debugprint(b_train_model)
    #ipdb.set_trace()

    #    verify_layers(batch_size, b_layers, train_set_x_b, train_set_y)
    #    temp = verify_cost(
    #        b_cost,
    #        b_layers,
    #        b_x,
    #        y,
    #        batch_size,
    #        train_set_x_b,
    #        train_set_y
    #    )
    #    T.verify_grad(
    #        temp,
    #        [b_layers[0].W.get_value(), b_layers[1].W.get_value()],
    #        rng=rng
    #    )

    print "... Compiling little net test function"
    l_test_model = function(
        [index],
        l_error,
        givens={
            l_x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net test function"
    temp = test_set_x.get_value(borrow=True, return_internal_type=True)
    test_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                          borrow=True,
                          name='test_set_x_b')
    b_test_model = function(
        [index],
        b_error,
        givens={
            b_x: test_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling little net validate function"
    l_validate_model = function(
        [index],
        l_error,
        givens={
            l_x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "... Compiling big net validate function"
    temp = valid_set_x.get_value(borrow=True, return_internal_type=True)
    valid_set_x_b = shared(temp.reshape((temp.shape[0], 1, temp.shape[1])),
                           borrow=True,
                           name='valid_set_x_b')
    b_validate_model = function(
        [index],
        b_error,
        givens={
            b_x: valid_set_x_b[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    print "Training"

    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 100  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    accum = 0
    accum_b = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost = l_train_model(minibatch_index)
            # b_train_model is compiled with `index` as its only input, so it
            # must not be passed a learning rate here
            minibatch_avg_cost_b = b_train_model(minibatch_index)

            #print "minibatch_avg_cost: " + str(minibatch_avg_cost) + " minibatch_avg_cost_b: " + str(minibatch_avg_cost_b)
            #print l_layers[0].W.get_value().sum(), l_layers[1].W.get_value().sum(), b_layers[0].W.get_value().sum(), b_layers[1].W.get_value().sum()
            #print "A: ", np.max(np.abs(b_layers[0].W.get_value())), np.max(np.abs(b_layers[0].b.get_value())), np.max(np.abs(b_layers[1].W.get_value())), np.max(np.abs(b_layers[1].b.get_value()))
            #print "B: ", np.abs(b_layers[0].W.get_value()).sum(), np.abs(b_layers[0].b.get_value()).sum(), np.abs(b_layers[1].W.get_value()).sum(), np.abs(b_layers[1].b.get_value()).sum()
            #print "C: ", np.abs(np.array(minibatch_avg_cost_b[1])).sum(), np.abs(np.array(minibatch_avg_cost_b[2])).sum(), np.abs(np.array(minibatch_avg_cost_b[3])).sum(), np.abs(np.array(minibatch_avg_cost_b[4])).sum()
            minibatch_avg_cost = minibatch_avg_cost[0]
            minibatch_avg_cost_b = minibatch_avg_cost_b[0]
            accum = accum + minibatch_avg_cost
            accum_b = accum_b + minibatch_avg_cost_b

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                accum = accum / validation_frequency
                accum_b = accum_b / validation_frequency
                print "minibatch_avg_cost: ", accum, \
                    "minibatch_avg_cost_b: ", accum_b
                accum = 0
                accum_b = 0

                # compute zero-one loss on validation set
                validation_losses = [
                    l_validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                validation_losses_b = [
                    b_validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss_b = np.mean(validation_losses_b)
                #this_validation_loss_b = 0

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %% '
                    '(%f %%)' % (epoch, minibatch_index + 1, n_train_batches,
                                 this_validation_loss * 100.,
                                 this_validation_loss_b * 100.))
                #ipdb.set_trace()

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        l_test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)

                    test_losses_b = [
                        b_test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score_b = np.mean(test_losses_b)
                    #test_score_b = 0

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %% (%f %%)') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100., test_score_b * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
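# A minimal usage sketch (hypothetical invocation; assumes a NumPy
# RandomState is an acceptable `rng`, as the commented-out T.verify_grad
# call suggests):
#
#     rng = np.random.RandomState(1234)
#     test_big_and_little_train_both(rng, batch_size=1, n_epochs=1000)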
Beispiel #15
0
    def setup_hidden_layers(self, activation, n_in=0, n_out=0, n_hiddens=0):
        """
        Setup the hidden layers with the specified number of hidden units.
        """
        act_fn = T.tanh
        if activation == NeuralActivations.Rectifier:
            act_fn = self.rectifier_act

        if n_in == 0:
            n_in = self.n_in
        if n_out == 0:
            n_out = self.n_out
        if n_hiddens == 0:
            n_hiddens = self.n_hiddens

        # Seed the RNG before building each layer so that weight
        # initialization is reproducible, then create the hidden layers.
        self.rng.seed(1985)
        self.hiddenLayers.append(
            HiddenLayer(rng=self.rng,
                        input=self.input,
                        n_in=n_in,
                        n_out=n_hiddens[0],
                        activation=act_fn))

        for i in xrange(1, self.n_hidden_layers):
            self.rng.seed(2012)
            self.hiddenLayers.append(
                HiddenLayer(rng=self.rng,
                            input=self.hiddenLayers[i - 1].output,
                            n_in=n_hiddens[i - 1],
                            n_out=n_hiddens[i],
                            activation=act_fn))

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegressionLayer(
            input=self.hiddenLayers[-1].output,
            n_in=n_hiddens[-1],
            n_out=n_out,
            rng=self.rng)

        self.initialize_regularization()

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood

        # cross-entropy of the MLP is given by the cross-entropy of the
        # output of the model, computed in the logistic regression layer
        self.crossentropy = self.logRegressionLayer.crossentropy
        self.crossentropy_categorical = self.logRegressionLayer.crossentropy_categorical

        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        self.raw_prediction_errors = \
            self.logRegressionLayer.raw_prediction_errors

        self.p_y_given_x = self.logRegressionLayer.p_y_given_x

        # Class memberships
        hidden_outputs = self.hiddenLayers[0].get_outputs(self.input)
        for i in xrange(1, self.n_hidden_layers):
            hidden_outputs = self.hiddenLayers[i].get_outputs(hidden_outputs)

        self.class_memberships = self.logRegressionLayer.get_class_memberships(
            hidden_outputs)
        self.initialize_params()