def __init__(self, num_input=256, num_hidden=512, num_output=256):
    """Build the forward/backward LSTM graph and compile its Theano functions.

    Two LSTM layers read the same input layer (one of them constructed with
    ``go_backwards=True``) and both feed a fully connected layer.  The
    layer output is averaged over axis 0 and passed through ``sigmoid``.

    Args:
        num_input: input size handed to both LSTM layers.
        num_hidden: hidden size of each LSTM layer.
        num_output: output size of the fully connected layer.

    Sets ``self.train`` (inputs: X, Y, eta, alpha; returns the blended cost
    and the last-step cost, applying momentum updates) and ``self.predict``
    (input: X; returns ``[Y_hat[-1]]``).
    """
    self.num_input = num_input
    self.num_hidden = num_hidden
    self.num_output = num_output

    # Symbolic inputs of the compiled functions.
    seq_in = T.matrix()
    targets = T.matrix()
    learn_rate = T.scalar()
    blend = T.scalar()

    # Layer stack; construction order is kept as-is.
    source = InputLayer(seq_in, name="inputs")
    forward_lstm = LSTMLayer(num_input, num_hidden,
                             input_layers=[source], name="lstm1f")
    backward_lstm = LSTMLayer(num_input, num_hidden,
                              input_layers=[source], name="lstm1b",
                              go_backwards=True)
    dense = FullyConnectedLayer(2 * num_hidden, num_output,
                                input_layers=[forward_lstm, backward_lstm],
                                name="yhat")
    estimate = sigmoid(T.mean(dense.output(), axis=0))

    self.layers = source, forward_lstm, backward_lstm, dense
    weights = get_params(self.layers)
    weight_caches = make_caches(weights)

    def cross_entropy(truth, guess):
        # Mean binary cross-entropy between ``truth`` and ``guess``.
        return -T.mean(truth * T.log(guess) + (1 - truth) * T.log(1 - guess))

    whole_cost = cross_entropy(targets, estimate)
    final_cost = cross_entropy(targets[-1], estimate[-1])
    # ``blend`` weights the whole-sequence term against the last-step term.
    total_cost = blend * whole_cost + (1 - blend) * final_cost

    step_updates = momentum(total_cost, weights, weight_caches, learn_rate,
                            clip_at=3.0)

    self.train = theano.function(
        [seq_in, targets, learn_rate, blend],
        [total_cost, final_cost],
        updates=step_updates,
        allow_input_downcast=True)
    self.predict = theano.function(
        [seq_in],
        [estimate[-1]],
        allow_input_downcast=True)
def create_rnn_layer(self, hidden_dim, input_dim, vocab_size, is_encoder):
    """Instantiate the recurrent layer selected by ``self.rnn_type``.

    Args:
        hidden_dim, input_dim, vocab_size: forwarded positionally to the
            layer constructor, in that order.
        is_encoder: forwarded as ``create_init_state``.

    Returns:
        A new ``VanillaRNNLayer``, ``GRULayer`` or ``LSTMLayer``.

    Raises:
        Exception: if ``self.rnn_type`` is not one of ``'vanillarnn'``,
            ``'gru'``, ``'lstm'`` or ``'atnh'``.
    """
    kind = self.rnn_type

    # Resolve the class lazily so only the selected name is looked up.
    if kind == 'vanillarnn':
        layer_cls = VanillaRNNLayer
    elif kind == 'gru':
        layer_cls = GRULayer
    elif kind in ('lstm', 'atnh'):
        # 'atnh' is served by the same class as 'lstm'.
        layer_cls = LSTMLayer
    else:
        raise Exception('Unrecognized rnn_type %s' % self.rnn_type)

    return layer_cls(hidden_dim, input_dim, vocab_size,
                     create_init_state=is_encoder)
def __init__(self):
    """Build a two-layer LSTM with a softmax output and compile its functions.

    Layer sizes are fixed: 256 inputs, 500 hidden units per LSTM layer and
    256 outputs.

    Sets ``self.train`` (inputs: X, Y, eta, temperature; returns the mean
    categorical cross-entropy and applies momentum updates) and
    ``self.predict_char`` (inputs: X, temperature; returns the softmax
    output and applies the updates produced by ``one_step_updates``).
    """
    n_in, n_hid, n_out = 256, 500, 256

    # Symbolic inputs of the compiled functions.
    seq_in = T.matrix()
    targets = T.matrix()
    learn_rate = T.scalar()
    temp = T.scalar()

    # Layer stack: input -> lstm1 -> lstm2 -> softmax.
    source = InputLayer(seq_in, name="inputs")
    first_lstm = LSTMLayer(n_in, n_hid, input_layer=source, name="lstm1")
    second_lstm = LSTMLayer(n_hid, n_hid, input_layer=first_lstm,
                            name="lstm2")
    out_layer = SoftmaxLayer(n_hid, n_out, input_layer=second_lstm,
                             name="yhat", temperature=temp)
    estimate = out_layer.output()

    self.layers = source, first_lstm, second_lstm, out_layer
    weights = get_params(self.layers)
    weight_caches = make_caches(weights)

    loss = T.mean(T.nnet.categorical_crossentropy(estimate, targets))
    train_updates = momentum(loss, weights, weight_caches, learn_rate)

    self.train = theano.function(
        [seq_in, targets, learn_rate, temp],
        loss,
        updates=train_updates,
        allow_input_downcast=True)

    sampling_updates = one_step_updates(self.layers)
    self.predict_char = theano.function(
        [seq_in, temp],
        estimate,
        updates=sampling_updates,
        allow_input_downcast=True)
def __init__(self, num_input=256, num_hidden=[512, 512], num_output=256,
             clip_at=0.0, scale_norm=0.0, drop_prob=0.0):
    """Build a stacked LSTM with dropout and compile its Theano functions.

    One LSTM layer is created per entry of ``num_hidden``; each is followed
    by a ``DropoutLayer`` driven by the symbolic ``dropout_lstm`` scalar.
    A ``SigmoidLayer`` produces the output.

    Args:
        num_input: input size of the first LSTM layer.
        num_hidden: iterable with the hidden size of each LSTM layer.
        num_output: output size of the sigmoid layer.
        clip_at: forwarded to ``momentum`` as ``clip_at``.
        scale_norm: forwarded to ``momentum`` as ``scale_norm``.
        drop_prob: forwarded to every ``LSTMLayer`` as ``drop_prob``.  The
            original body referenced this name without defining it, which
            raised ``NameError``.
            NOTE(review): the default of 0.0 assumes that value is
            accepted by ``LSTMLayer`` -- confirm against its definition.

    Sets ``self.train_func``, ``self.predict_func`` and
    ``self.predict_sequence_func``.
    """
    # Symbolic inputs of the compiled functions.
    X = T.matrix()
    Y = T.matrix()
    eta = T.scalar()
    alpha = T.scalar()
    lambda2 = T.scalar()
    dropout_lstm = T.scalar()

    self.num_input = num_input
    # Copy so the instance never aliases the shared default list.
    self.num_hidden = list(num_hidden)
    self.num_output = num_output
    self.clip_at = clip_at
    self.scale_norm = scale_norm

    inputs = InputLayer(X, name="inputs")
    num_prev = num_input
    prev_layer = inputs
    self.layers = [inputs]
    for i, num_curr in enumerate(self.num_hidden):
        lstm = LSTMLayer(num_prev, num_curr, input_layers=[prev_layer],
                         name="lstm{0}".format(i + 1), drop_prob=drop_prob)
        num_prev = num_curr
        # The dropout layer sits between consecutive layers but, as in the
        # original, only the LSTM itself is recorded in self.layers.
        prev_layer = DropoutLayer(input_layers=[lstm],
                                  dropout_probability=dropout_lstm)
        self.layers.append(lstm)

    # Named output_layer (not ``sigmoid``) to avoid shadowing a module-level
    # function of that name.
    output_layer = SigmoidLayer(num_prev, num_output,
                                input_layers=[prev_layer], name="yhat")
    self.layers.append(output_layer)
    Y_hat = output_layer.output()

    params = get_params(self.layers)
    caches = make_caches(params)

    # Binary cross-entropy over the whole output and over the last row.
    mean_cost = -T.mean(Y * T.log(Y_hat) + (1 - Y) * T.log(1 - Y_hat))
    last_step_cost = -T.mean(Y[-1] * T.log(Y_hat[-1]) +
                             (1 - Y[-1]) * T.log(1 - Y_hat[-1]))
    cost = alpha * mean_cost + (1 - alpha) * last_step_cost

    updates = momentum(cost, params, caches, eta, clip_at=self.clip_at,
                       scale_norm=self.scale_norm, lambda2=lambda2)

    self.train_func = theano.function(
        [X, Y, eta, alpha, lambda2, dropout_lstm],
        [cost, last_step_cost],
        updates=updates,
        allow_input_downcast=True)
    self.predict_func = theano.function(
        [X, dropout_lstm], [Y_hat[-1]], allow_input_downcast=True)
    self.predict_sequence_func = theano.function(
        [X, dropout_lstm], [Y_hat], allow_input_downcast=True)
def main(num_epochs=NUM_EPOCHS, vocab_size=VOCAB_SIZE):
    """Build the recurrent network, compile it, then train and evaluate it.

    The recurrent layer is selected by the module-level ``MODEL_TYPE``.
    After every pass over the training data the model is evaluated; once
    ``current_epoch`` exceeds ``FIXED_EPOCHS`` a growing prefix of every
    test sequence is added to the training set.

    Args:
        num_epochs: number of epochs after which training stops.
        vocab_size: number of output classes of the dense layer.

    Raises:
        ValueError: if ``MODEL_TYPE`` is not one of the supported types.
    """
    logging.info("Building network ...")
    # First, we build the network, starting with an input layer.
    # Recurrent layers expect input of shape
    # (batch size, SEQ_LENGTH, num_features)
    l_in = lasagne.layers.InputLayer(shape=(None, None, NDIM))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))

    # We now build the recurrent layer which takes l_in as the input layer.
    # Gradients are clipped at GRAD_CLIP to prevent exploding gradients.
    l_forward = None
    if MODEL_TYPE == 'LSTM' or MODEL_TYPE == 'LSTM_T':
        l_t = lasagne.layers.InputLayer(
            shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = LSTMLayer(
            l_in,
            time_input=l_t,
            mask_input=l_mask,
            num_units=N_HIDDEN,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=lasagne.layers.Gate(),
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=BN,
            only_return_final=False)
    elif MODEL_TYPE == 'TLSTM1':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM1Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM2':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM2Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM3':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        # No forgetgate argument is passed for this variant.
        l_forward = TLSTM3Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'PLSTM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = PLSTMLayer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            grad_clipping=GRAD_CLIP,
            bn=BN,
            timegate=PLSTMTimeGate())
    else:
        # Fail early instead of handing l_forward=None to DenseLayer.
        raise ValueError('Unsupported MODEL_TYPE {}'.format(MODEL_TYPE))

    # Theano tensor for the targets
    target_values = T.matrix('target_values', dtype='int32')

    # The output of l_forward, of shape (batch_size, time_sequence,
    # N_HIDDEN), is passed through a dense layer; softmax is applied below.
    # The output of this stage is (batch_size, time_sequence, vocab_size).
    l_out = lasagne.layers.DenseLayer(
        l_forward,
        num_units=vocab_size,
        W=lasagne.init.Normal(),
        num_leading_axes=2,
        nonlinearity=None)

    # lasagne.layers.get_output produces a variable for the network output.
    network_output = lasagne.layers.get_output(l_out)

    # The cost is summed over time, so move time to the leading axis:
    # network_output (time_sequence, batch_size, vocab_size)
    network_output = network_output.dimshuffle(1, 0, 2)

    def calculate_softmax(n_input):
        # Softmax over the class axis for one time step.
        return T.nnet.softmax(n_input)

    def merge_cost(n_input, n_target, n_mask, cost_prev):
        # Add the masked cross-entropy of one time step to the running sum.
        n_target = n_target.ravel()
        n_cost = T.nnet.categorical_crossentropy(n_input, n_target)
        n_cost = n_cost * n_mask
        n_cost = n_cost.sum()
        return cost_prev + n_cost

    network_output_softmax, _ = theano.scan(fn=calculate_softmax,
                                            sequences=network_output)

    # The loss is the masked categorical cross-entropy between prediction
    # and target, averaged over the unmasked positions.
    m_cost, _ = theano.scan(
        fn=merge_cost,
        sequences=[
            network_output_softmax, target_values.T, l_mask.input_var.T
        ],
        outputs_info=T.constant(0.))
    m_cost = m_cost[-1]
    cost = m_cost / l_mask.input_var.sum()

    # Convert back to (batch_size, time_sequence, vocab_size).
    network_output_softmax = network_output_softmax.dimshuffle(1, 0, 2)

    # Compute AdaGrad updates for training
    logging.info("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and prediction
    logging.info("Compiling functions ...")
    input_var = [l_in.input_var, l_mask.input_var]
    if USE_TIME_INPUT:
        input_var += [l_t.input_var]
    predict = theano.function(input_var,
                              network_output_softmax,
                              allow_input_downcast=True)

    input_var += [target_values]
    train = theano.function(input_var,
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    # compute_cost returns the cost without applying any update.
    compute_cost = theano.function(input_var,
                                   cost,
                                   allow_input_downcast=True)

    def do_evaluate(test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_t=None,
                    n=100,
                    test_batch=5):
        """Log Recall@10, MRR@10, NDCG@10 and the average rank.

        Predictions are made in batches of ``test_batch``; the output of
        the last time step is scored.  ``lengths`` is accepted for
        interface compatibility but not used here.
        """
        logging.info("Evaluate: Start predicting")
        total_size = test_x.shape[0]
        if total_size == 0:
            # Nothing to score; avoids dividing by zero below.
            logging.info("Evaluate: no test cases, skipping")
            return

        probs_all_time = None
        for start in range(0, total_size, test_batch):
            stop = start + test_batch
            batch_inputs = [test_x[start:stop], test_mask[start:stop]]
            if test_t is not None:
                batch_inputs.append(test_t[start:stop])
            batch_probs = predict(*batch_inputs)
            if probs_all_time is None:
                probs_all_time = np.zeros(
                    (total_size, batch_probs.shape[2]))
            # Store each batch at the offset it was read from.  The
            # original advanced the offset first, shifting every row by
            # test_batch relative to the indices used for scoring.
            probs_all_time[start:start + batch_probs.shape[0], :] = \
                batch_probs[:, -1, :]
        logging.info("Evaluate: End predicting")

        recall10 = 0.
        MRR10_score = 0.
        NDCG_score = 0.
        rate_sum = 0
        sample_time = SAMPLE_TIME
        for idx in range(total_size):
            gnd = test_y[idx]
            probs = probs_all_time[idx, :]
            # Rank of the ground truth among all classes (1 = best).
            prob_index = np.argsort(probs)[-1::-1].tolist()
            gnd_rate = prob_index.index(gnd) + 1
            rate_sum += gnd_rate
            # Sample multiple times to reduce randomness
            for _ in range(sample_time):
                # An int argument samples from np.arange(vocab_size).
                samples = np.random.choice(vocab_size,
                                           n + 1,
                                           replace=False).tolist()
                # Make sure the first element is the ground truth.
                try:
                    samples.remove(gnd)
                    samples.insert(0, gnd)
                except ValueError:
                    samples[0] = gnd
                sample_probs = probs[samples]
                prob_index = np.argsort(sample_probs)[-1::-1].tolist()
                rate = prob_index.index(0) + 1

                # Calculate Recall@10, NDCG@10 and MRR@10
                if rate <= 10:
                    recall10 += 1
                    MRR10_score += 1. / rate
                    NDCG_score += 1. / math.log(rate + 1, 2)
        logging.info("Evaluate: End calculating scores")

        count = total_size * sample_time
        recall10 = recall10 / count
        MRR10_score = MRR10_score / count
        NDCG_score = NDCG_score / count
        avg_rate = float(rate_sum) / total_size

        logging.info('Recall@10 {}'.format(recall10))
        logging.info('MRR@10 1/rate {}'.format(MRR10_score))
        logging.info('NDCG@10 1/rate {}'.format(NDCG_score))
        logging.info('Average rate {}'.format(avg_rate))

    def onehot2int(onehot_vec):
        # Convert each one-hot vector to the index of its 1 entry.
        return [onehot.tolist().index(1) for onehot in onehot_vec]

    def get_short_test_data(length):
        """Return the test set truncated to its first ``length`` steps.

        For sequences longer than ``length`` the target becomes the item
        at position ``length``.
        """
        print("Get short test data")
        test_x = test_data['x'][:, :length]
        test_mask = test_data['mask'][:, :length]
        test_t = test_data['t'][:, :length] if USE_TIME_INPUT else None
        lengths = np.sum(test_mask, axis=1).astype('int')
        test_y = test_data['y'].copy()
        for idx in range(test_y.shape[0]):
            whole_length = test_data['lengths'][idx]
            if length < whole_length:
                test_y[idx] = test_data['x'][idx, length, :].tolist().index(
                    1) if ONE_HOT else test_data['x'][idx, length, 0]
        logging.info("Finished getting short test data")
        return test_x, test_y, test_mask, lengths, test_t

    def evaluate(model, current_epoch, additional_test_length):
        """Evaluate on the full test set and, optionally, on a truncated one.

        Saves the model afterwards unless ``DEBUG`` is set.
        """
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = test_data['mask']
        lengths = test_data['lengths']
        logging.info(
            '-----------Evaluate Normal:{},{},{}-------------------'.format(
                MODEL_TYPE, DATA_TYPE, N_HIDDEN))
        do_evaluate(test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_batch=TEST_BATCH)
        # Evaluate the model on short data
        if additional_test_length > 0:
            logging.info('-----------Evaluate Additional---------------')
            test_x, test_y, test_mask, lengths, test_t = get_short_test_data(
                additional_test_length)
            do_evaluate(test_x,
                        test_y,
                        test_mask,
                        lengths,
                        test_t,
                        test_batch=TEST_BATCH)
        logging.info('-----------Evaluate End----------------------')
        if not DEBUG:
            utils.save_model(
                '{}-{}-{}-{}'.format(MODEL_TYPE, current_epoch, DATA_TYPE,
                                     N_HIDDEN),
                str(datetime.datetime.now()), model, '_new')
            logging.info("Done saving")

    def add_test_to_train(length):
        """Append the first ``length`` steps of every test case to train_data.

        Entries appended by an earlier call are dropped first.
        """
        global train_data
        logging.info('Length {} test cases added to train set'.format(length))
        logging.info('Old train data size {}'.format(len(train_data['x'])))
        # Remove the train data added before.
        train_data['x'] = train_data['x'][:train_data_size]
        train_data['y'] = train_data['y'][:train_data_size]
        if 't' in train_data:
            train_data['t'] = train_data['t'][:train_data_size]

        test_x = test_data['x']
        lengths = test_data['lengths']
        for idx in range(test_x.shape[0]):
            n_length = length
            # Make sure the complete test case is not added to the train set.
            if lengths[idx] <= length:
                n_length = length - 1
            if ONE_HOT:
                # With ONE_HOT, convert the one-hot vectors to ints first.
                new_x = onehot2int(test_x[idx, :n_length, :])
                new_y = onehot2int(test_x[idx, 1:n_length + 1, :])
            else:
                new_x = test_x[idx, :n_length, 0]
                new_y = test_x[idx, 1:n_length + 1, 0]
            train_data['x'].append(new_x)
            train_data['y'].append(new_y)
            if 't' in train_data:
                test_t = test_data['t']
                new_t = test_t[idx, :n_length].tolist()
                train_data['t'].append(new_t)
        logging.info('New train data size {}'.format(len(train_data['x'])))
        logging.info('--Data Added--')

    logging.info("Training ...")
    logging.info('Data size {},Max epoch {},Batch {}'.format(
        train_data_size, num_epochs, BATCH_SIZE))
    p = 0
    current_epoch = 0
    it = 0
    data_size = train_data_size
    last_it = 0
    avg_cost = 0
    avg_seq_len = 0
    try:
        while True:
            batch_data = gen_data(p, train_data, batch_size=BATCH_SIZE)
            x = batch_data['x']
            y = batch_data['y']
            mask = batch_data['mask']
            avg_seq_len += x.shape[1]
            input_var = [x, mask, y]
            if USE_TIME_INPUT:
                # The time input goes between the mask and the targets,
                # matching the input order of the compiled train function.
                t = batch_data['t']
                input_var.insert(2, t)
            avg_cost += train(*input_var)
            it += 1
            p += BATCH_SIZE
            if p >= data_size:
                # One pass over the data is complete.
                p = 0
                last_it = it
                current_epoch += 1
                # First stage: train on the original train data for
                # FIXED_EPOCHS epochs.
                # Second stage: afterwards add part of the test data to the
                # train data.
                # The first stage uses information from users with similar
                # interests, the second stage uses history information.
                additional_length = int(
                    (current_epoch - FIXED_EPOCHS) * test_data_length /
                    (NUM_EPOCHS - FIXED_EPOCHS))
                evaluate(l_out,
                         current_epoch=current_epoch,
                         additional_test_length=additional_length)
                if current_epoch >= num_epochs:
                    break
                if current_epoch > FIXED_EPOCHS:
                    data_size = train_data_size + test_data_size
                    logging.info(
                        '>> length {} test cases added to train set.'.format(
                            additional_length))
                    add_test_to_train(additional_length)
                logging.info('Epoch {} Carriage Return'.format(current_epoch))
            if it % PRINT_FREQ == 0:
                logging.info(
                    "Epoch {}-{},iter {} average seq length = {} average loss = {}"
                    .format(current_epoch,
                            (it - last_it) * 1.0 * BATCH_SIZE / data_size, it,
                            avg_seq_len / PRINT_FREQ, avg_cost / PRINT_FREQ))
                avg_cost = 0
                avg_seq_len = 0
        logging.info('End')
    except KeyboardInterrupt:
        # Ctrl-C is the intended way to stop training early; record it
        # instead of exiting silently.
        logging.info('Training interrupted by user')
def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5,
             dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0,
             tie_weights=False, no_dropout=False, custom_lstm=False):
    """Build the embedding, the stack of recurrent layers and the decoder.

    Args:
        rnn_type: one of ``'LSTM'``, ``'QRNN'`` or ``'GRU'``.
        ntoken: vocabulary size (embedding rows, decoder outputs).
        ninp: embedding size, and input size of the first layer.
        nhid: hidden size of the recurrent layers.
        nlayers: number of recurrent layers.
        dropout: probability of ``self.drop``.
        dropouth: probability of ``self.hdrop``.
        dropouti: probability of ``self.idrop``.
        dropoute: stored on the instance; not used in this method.
        wdrop: dropout passed to ``WeightDrop``; ``None`` counts as 0, and
            it is forced to 0 when ``no_dropout`` is set.
        tie_weights: share the decoder weight with the embedding; the last
            layer then outputs ``ninp`` units instead of ``nhid``.
        no_dropout: disable weight dropout.
        custom_lstm: use the project's ``LSTMLayer`` instead of
            ``torch.nn.LSTM`` (LSTM only).
    """
    super(RNNModel, self).__init__()
    self.lockdrop = LockedDropout()
    self.idrop = nn.Dropout(dropouti)
    self.hdrop = nn.Dropout(dropouth)
    self.drop = nn.Dropout(dropout)
    self.encoder = nn.Embedding(ntoken, ninp)
    self.use_dropout = not no_dropout
    if wdrop is None:
        wdrop = 0
    wdrop = wdrop if self.use_dropout else 0

    assert rnn_type in ['LSTM', 'QRNN', 'GRU'], 'RNN type is not supported'

    def layer_sizes(layer):
        # (input size, output size) of recurrent layer ``layer``.  The last
        # layer outputs ninp units when its output is decoded with the tied
        # embedding weight, nhid otherwise.
        in_size = ninp if layer == 0 else nhid
        if layer != nlayers - 1:
            out_size = nhid
        else:
            out_size = ninp if tie_weights else nhid
        return in_size, out_size

    if rnn_type == 'LSTM':
        # A custom LSTM is needed for second order derivatives.
        if not custom_lstm:
            self.rnns = [
                torch.nn.LSTM(layer_sizes(layer)[0], layer_sizes(layer)[1],
                              1, dropout=0)
                for layer in range(nlayers)
            ]
            self.rnns = [
                WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop)
                for rnn in self.rnns
            ]
        else:
            self.rnns = [
                LSTMLayer(layer_sizes(layer)[0], layer_sizes(layer)[1])
                for layer in range(nlayers)
            ]
            self.rnns = [
                WeightDrop(rnn, ['weight_hh'], dropout=wdrop)
                for rnn in self.rnns
            ]
    elif rnn_type == 'GRU':
        # The last layer size now honours tie_weights like the other
        # branches; previously it was always ninp, which does not match the
        # nhid-wide decoder when the weights are not tied.
        self.rnns = [
            torch.nn.GRU(layer_sizes(layer)[0], layer_sizes(layer)[1],
                         1, dropout=0)
            for layer in range(nlayers)
        ]
        # torch.nn.GRU names its hidden-to-hidden weight 'weight_hh_l0'
        # (same scheme as torch.nn.LSTM above), not 'weight_hh'.
        self.rnns = [
            WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop)
            for rnn in self.rnns
        ]
    elif rnn_type == 'QRNN':
        from torchqrnn import QRNNLayer
        self.rnns = [
            QRNNLayer(input_size=layer_sizes(layer)[0],
                      hidden_size=layer_sizes(layer)[1],
                      save_prev_x=True,
                      zoneout=0,
                      window=2 if layer == 0 else 1,
                      output_gate=True)
            for layer in range(nlayers)
        ]
        for rnn in self.rnns:
            rnn.linear = WeightDrop(rnn.linear, ['weight'], dropout=wdrop)

    print(self.rnns)
    self.rnns = torch.nn.ModuleList(self.rnns)
    self.decoder = nn.Linear(nhid, ntoken)

    # Optionally tie weights as in:
    # "Using the Output Embedding to Improve Language Models"
    # (Press & Wolf 2016) https://arxiv.org/abs/1608.05859
    # and
    # "Tying Word Vectors and Word Classifiers: A Loss Framework for
    # Language Modeling" (Inan et al. 2016)
    # https://arxiv.org/abs/1611.01462
    if tie_weights:
        # nhid need not equal ninp here: the last recurrent layer already
        # outputs ninp units when the weights are tied.
        self.decoder.weight = self.encoder.weight

    self.init_weights()

    self.rnn_type = rnn_type
    self.ninp = ninp
    self.nhid = nhid
    self.nlayers = nlayers
    self.wdrop = wdrop
    self.dropout = dropout
    self.dropouti = dropouti
    self.dropouth = dropouth
    self.dropoute = dropoute
    self.tie_weights = tie_weights
def main(num_epochs=NUM_EPOCHS, vocab_size=VOCAB_SIZE):
    """Build the network, load a saved model and evaluate it per length cap.

    The recurrent layer is selected by the module-level ``MODEL_TYPE``.
    No training loop is run here: a stored model is loaded into the network
    and scored once for each sequence-length cap in ``[25, 50, 100, 200]``.

    Args:
        num_epochs: only reported in the log output.
        vocab_size: number of output classes of the dense layer.

    Raises:
        ValueError: if ``MODEL_TYPE`` is not one of the supported types.
    """
    logging.info("Building network ...")
    # First, we build the network, starting with an input layer.
    # Recurrent layers expect input of shape
    # (batch size, SEQ_LENGTH, num_features)
    l_in = lasagne.layers.InputLayer(shape=(None, None, NDIM))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))

    # We now build the recurrent layer which takes l_in as the input layer.
    # Gradients are clipped at GRAD_CLIP to prevent exploding gradients.
    l_forward = None
    if MODEL_TYPE == 'LSTM' or MODEL_TYPE == 'LSTM_T':
        l_t = lasagne.layers.InputLayer(
            shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = LSTMLayer(
            l_in,
            time_input=l_t,
            mask_input=l_mask,
            num_units=N_HIDDEN,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=lasagne.layers.Gate(),
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=BN,
            only_return_final=False)
    elif MODEL_TYPE == 'TLSTM1':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM1Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM2':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM2Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM3':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        # No forgetgate argument is passed for this variant.
        l_forward = TLSTM3Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(
                W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'PLSTM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = PLSTMLayer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            grad_clipping=GRAD_CLIP,
            bn=BN,
            timegate=PLSTMTimeGate())
    else:
        # Fail early instead of handing l_forward=None to DenseLayer.
        raise ValueError('Unsupported MODEL_TYPE {}'.format(MODEL_TYPE))

    # Theano tensor for the targets
    target_values = T.matrix('target_values', dtype='int32')

    # The output of l_forward, of shape (batch_size, time_sequence,
    # N_HIDDEN), is passed through a dense layer; softmax is applied below.
    # The output of this stage is (batch_size, time_sequence, vocab_size).
    l_out = lasagne.layers.DenseLayer(
        l_forward,
        num_units=vocab_size,
        W=lasagne.init.Normal(),
        num_leading_axes=2,
        nonlinearity=None)

    # lasagne.layers.get_output produces a variable for the network output.
    network_output = lasagne.layers.get_output(l_out)

    # The cost is summed over time, so move time to the leading axis:
    # network_output (time_sequence, batch_size, vocab_size)
    network_output = network_output.dimshuffle(1, 0, 2)

    def calculate_softmax(n_input):
        # Softmax over the class axis for one time step.
        return T.nnet.softmax(n_input)

    def merge_cost(n_input, n_target, n_mask, cost_prev):
        # Add the masked cross-entropy of one time step to the running sum.
        n_target = n_target.ravel()
        n_cost = T.nnet.categorical_crossentropy(n_input, n_target)
        n_cost = n_cost * n_mask
        n_cost = n_cost.sum()
        return cost_prev + n_cost

    network_output_softmax, _ = theano.scan(fn=calculate_softmax,
                                            sequences=network_output)

    # The loss is the masked categorical cross-entropy between prediction
    # and target, averaged over the unmasked positions.
    m_cost, _ = theano.scan(
        fn=merge_cost,
        sequences=[
            network_output_softmax, target_values.T, l_mask.input_var.T
        ],
        outputs_info=T.constant(0.))
    m_cost = m_cost[-1]
    cost = m_cost / l_mask.input_var.sum()

    # Convert back to (batch_size, time_sequence, vocab_size).
    network_output_softmax = network_output_softmax.dimshuffle(1, 0, 2)

    # Compute AdaGrad updates for training
    logging.info("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training and prediction
    logging.info("Compiling functions ...")
    input_var = [l_in.input_var, l_mask.input_var]
    if USE_TIME_INPUT:
        input_var += [l_t.input_var]
    predict = theano.function(input_var,
                              network_output_softmax,
                              allow_input_downcast=True)

    input_var += [target_values]
    train = theano.function(input_var,
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    # compute_cost returns the cost without applying any update.
    compute_cost = theano.function(input_var,
                                   cost,
                                   allow_input_downcast=True)

    def do_evaluate(current_epoch,
                    test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_t=None,
                    n=100,
                    test_batch=5,
                    name=None):
        """Log and record Recall@10, MRR@10, NDCG@10 and the average rank.

        Predictions are made in batches of ``test_batch``; for sample
        ``idx`` the output at time step ``lengths[idx] - 1`` is scored.
        Results are also handed to ``log_results`` together with
        ``current_epoch`` and ``name``.
        """
        total_size = test_x.shape[0]
        if total_size == 0:
            # Nothing to score; avoids dividing by zero below.
            logging.info('Evaluate: no test cases, skipping')
            return

        # Collect the batches and concatenate once instead of re-copying
        # the growing array on every batch.
        batches = []
        for start in range(0, total_size, test_batch):
            stop = start + test_batch
            batch_inputs = [test_x[start:stop], test_mask[start:stop]]
            if test_t is not None:
                batch_inputs.append(test_t[start:stop])
            batches.append(predict(*batch_inputs))
        probs_all_time = np.concatenate(batches, axis=0)

        recall10 = 0.
        MRR10_score = 0.
        NDCG_score = 0.
        rate_sum = 0
        sample_time = SAMPLE_TIME
        for idx in range(total_size):
            gnd = test_y[idx]
            probs = probs_all_time[idx, lengths[idx] - 1, :]
            # Rank of the ground truth among all classes (1 = best).
            prob_index = np.argsort(probs)[-1::-1].tolist()
            gnd_rate = prob_index.index(gnd) + 1
            rate_sum += gnd_rate
            # Sample multiple times to reduce randomness
            for _ in range(sample_time):
                # An int argument samples from np.arange(vocab_size).
                samples = np.random.choice(vocab_size,
                                           n + 1,
                                           replace=False).tolist()
                # Make sure the first element is the ground truth.
                try:
                    samples.remove(gnd)
                    samples.insert(0, gnd)
                except ValueError:
                    samples[0] = gnd
                sample_probs = probs[samples]
                prob_index = np.argsort(sample_probs)[-1::-1].tolist()
                rate = prob_index.index(0) + 1

                # Calculate Recall@10, MRR@10 and NDCG@10
                if rate <= 10:
                    recall10 += 1
                    MRR10_score += 1. / rate
                    NDCG_score += 1. / np.log2(rate + 1)

        count = total_size * sample_time
        recall10 = recall10 / count
        MRR10_score = MRR10_score / count
        NDCG_score = NDCG_score / count
        avg_rate = float(rate_sum) / total_size

        logging.info('Recall@10 {}'.format(recall10))
        logging.info('MRR@10 1/rate {}'.format(MRR10_score))
        logging.info('NDCG@10 {}'.format(NDCG_score))
        logging.info('Average rate {}'.format(avg_rate))

        from log import log_results
        # NOTE(review): ``cost`` here is the symbolic cost expression of the
        # enclosing scope, not a number -- confirm log_results expects that.
        log_results(result_dir, current_epoch, recall10, MRR10_score,
                    NDCG_score, avg_rate, cost, name)

    def onehot2int(onehot_vec):
        # Convert each one-hot vector to the index of its 1 entry.
        return [onehot.tolist().index(1) for onehot in onehot_vec]

    def get_short_test_data(length):
        """Return the test set truncated to its first ``length`` steps.

        For sequences longer than ``length`` the target becomes the item
        at position ``length``.
        """
        test_x = test_data['x'][:, :length]
        test_mask = test_data['mask'][:, :length]
        test_t = test_data['t'][:, :length] if USE_TIME_INPUT else None
        lengths = np.sum(test_mask, axis=1).astype('int')
        test_y = test_data['y'].copy()
        for idx in range(test_y.shape[0]):
            whole_length = test_data['lengths'][idx]
            if length < whole_length:
                test_y[idx] = test_data['x'][idx, length, :].tolist().index(
                    1) if ONE_HOT else test_data['x'][idx, length, 0]
        return test_x, test_y, test_mask, lengths, test_t

    def evaluate(model, current_epoch, additional_test_length):
        """Evaluate on the full test set and, optionally, on a truncated one.

        Saves the model afterwards unless ``DEBUG`` is set.
        """
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = test_data['mask']
        lengths = test_data['lengths']
        logging.info(
            '-----------Evaluate Normal:{},{},{}-------------------'.format(
                MODEL_TYPE, DATA_TYPE, N_HIDDEN))
        do_evaluate(current_epoch,
                    test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_batch=TEST_BATCH,
                    name='additional')
        # Evaluate the model on short data
        if additional_test_length > 0:
            logging.info('-----------Evaluate Additional---------------')
            test_x, test_y, test_mask, lengths, test_t = get_short_test_data(
                additional_test_length)
            do_evaluate(current_epoch,
                        test_x,
                        test_y,
                        test_mask,
                        lengths,
                        test_t,
                        test_batch=TEST_BATCH,
                        name='additional_test')
        logging.info('-----------Evaluate End----------------------')
        if not DEBUG:
            utils.save_model(
                '{}-{}-{}-{}'.format(MODEL_TYPE, current_epoch, DATA_TYPE,
                                     N_HIDDEN),
                str(datetime.datetime.now()), model, '_new')

    def add_test_to_train(length):
        """Append the first ``length`` steps of every test case to train_data.

        Entries appended by an earlier call are dropped first.
        """
        global train_data
        logging.info('Length {} test cases added to train set'.format(length))
        logging.info('Old train data size {}'.format(len(train_data['x'])))
        # Remove the train data added before.
        train_data['x'] = train_data['x'][:train_data_size]
        train_data['y'] = train_data['y'][:train_data_size]
        # ``in`` replaces dict.has_key, which only exists on Python 2.
        if 't' in train_data:
            train_data['t'] = train_data['t'][:train_data_size]

        test_x = test_data['x']
        lengths = test_data['lengths']
        for idx in range(test_x.shape[0]):
            n_length = length
            # Make sure the complete test case is not added to the train set.
            if lengths[idx] <= length:
                n_length = length - 1
            if ONE_HOT:
                # With ONE_HOT, convert the one-hot vectors to ints first.
                new_x = onehot2int(test_x[idx, :n_length, :])
                new_y = onehot2int(test_x[idx, 1:n_length + 1, :])
            else:
                new_x = test_x[idx, :n_length, 0]
                new_y = test_x[idx, 1:n_length + 1, 0]
            train_data['x'].append(new_x)
            train_data['y'].append(new_y)
            if 't' in train_data:
                test_t = test_data['t']
                new_t = test_t[idx, :n_length].tolist()
                train_data['t'].append(new_t)
        logging.info('New train data size {}'.format(len(train_data['x'])))
        logging.info('--Data Added--')

    logging.info("Training ...")
    logging.info('Data size {},Max epoch {},Batch {}'.format(
        train_data_size, num_epochs, BATCH_SIZE))

    logging.info("Load pickle")
    utils.load_model("TLSTM3-9-music-128_2019-10-16 14:00:39.099161", l_out)
    # NOTE(review): presumably the checkpoint of epoch 9, going by the
    # "<MODEL_TYPE>-<epoch>-<DATA_TYPE>-<N_HIDDEN>" name built in evaluate()
    # -- confirm.  Used as the epoch reported to log_results.
    loaded_epoch = 9

    eval_lengths = [25, 50, 100, 200]
    max_length = 200
    for seq_length in eval_lengths:
        # Was ``max_length - lengths`` (int minus list -> TypeError).
        mask_length = max_length - seq_length
        # Evaluate the model
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = np.copy(test_data['mask'])
        # NOTE(review): kept as written; confirm that setting the leading
        # mask columns to 1 (rather than 0) is the intended masking.
        test_mask[:, :mask_length] = 1
        # Kept in its own name so the list being iterated is not rebound.
        capped_lengths = np.minimum(test_data['lengths'], seq_length)
        logging.info(
            '-----------Evaluate length: {}-------------------'.format(
                seq_length))
        # do_evaluate takes the epoch as its first argument; the original
        # call omitted it, shifting every positional argument by one.
        do_evaluate(loaded_epoch,
                    test_x,
                    test_y,
                    test_mask,
                    capped_lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_batch=TEST_BATCH,
                    name='length_{}'.format(seq_length))
# Data loading: batches are drawn through a RandomSampler, so shuffle is
# left off on the loader itself.
sampler = RandomSampler(dataset)
loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler, shuffle=False, num_workers=2)
# Disabled debugging aid for inspecting one batch:
# dataiter = iter(loader)
# images, labels = dataiter.next()
# print (images)
# images=tensor_to_img(images)
# print (labels)
# print (images)

# Model, recurrent layer and loss; all three are project-defined classes.
net = Net(14 * batch_size)
lstm = LSTMLayer(7 * 7 * (16 + 5 * 2), 64, 14 * 14 * (num_class + 5 * 2), 2, batch_size)
lossfunction = Loss(batch_size)
# Two parameter groups: the net group has no 'lr' of its own and so takes
# the top-level lr=0, while the lstm group overrides it with 0.0001.
optimizer = optim.Adam([{ 'params': net.parameters() }, { 'params': lstm.parameters(), 'lr': 0.0001 }], lr=0, weight_decay=0)
# NOTE(review): the original layout was lost, so which of the following
# statements belong to this ``if`` is an assumption -- confirm.
if load_checkpoint:
    net.load_state_dict(torch.load(SAVE_PATH))
net.cuda()
# NOTE(review): this replaces the optimizer built above with one that only
# covers net.parameters(), so the lstm parameters are no longer optimized
# -- confirm this is intended.
optimizer = optim.Adam(net.parameters(), lr=0.0001)
def main(num_epochs=NUM_EPOCHS, vocab_size=VOCAB_SIZE):
    """Build the model selected by ``MODEL_TYPE``, then train and evaluate it.

    The network is an input layer, one recurrent layer chosen by
    ``MODEL_TYPE`` and a dense layer with ``vocab_size`` outputs followed by a
    softmax. The loss is the categorical cross-entropy of every step,
    multiplied by the mask and by the position weights, summed and divided by
    the sum of the mask. Parameters are updated with AdaGrad.

    After every pass over the training data the model is evaluated. Once more
    than ``FIXED_EPOCHS`` epochs have completed, prefixes of the test
    sequences are appended to the training set before the next epoch.

    Args:
        num_epochs: Number of epochs after which training stops. It also sets
            the pace at which the added test prefixes grow.
        vocab_size: Number of output classes of the dense layer, and the size
            of the candidate set ranked during evaluation.
    """
    logging.info("Building network ...")
    # Input is (batch size, sequence length, NDIM); None leaves a dimension
    # unspecified at compile time.
    l_in = lasagne.layers.InputLayer(shape=(None, None, NDIM))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))
    # Per-step weights that are multiplied into the training loss.
    l_pos = lasagne.layers.InputLayer(shape=(None, None))

    # Build the recurrent layer on top of l_in. Gradients are clipped at
    # GRAD_CLIP to limit exploding gradients.
    # NOTE(review): l_t / l_d only exist for the model types that create them;
    # USE_TIME_INPUT / USE_DURATION must match MODEL_TYPE -- confirm the config.
    l_forward = None
    if MODEL_TYPE == 'LSTM' or MODEL_TYPE == 'LSTM_T':
        l_t = lasagne.layers.InputLayer(shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = LSTMLayer(
            l_in,
            time_input=l_t,
            mask_input=l_mask,
            num_units=N_HIDDEN,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=lasagne.layers.Gate(),
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=BN,
            only_return_final=False)
    elif MODEL_TYPE == 'RNN':
        l_t = lasagne.layers.InputLayer(shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = RNNLayer(
            l_in,
            time_input=l_t,
            mask_input=l_mask,
            num_units=N_HIDDEN,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=lasagne.layers.Gate(),
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=BN,
            only_return_final=False)
    elif MODEL_TYPE == 'DTLSTM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_d = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = VDTLSTMLayer(
            l_in,
            time_input=l_t,
            duration_input=l_d,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'DTLSTM_EM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_d = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = VDTLSTMEMLayer(
            l_in,
            time_input=l_t,
            duration_input=l_d,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM2':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = VTLSTM2Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    else:
        # Unknown MODEL_TYPE: log it and stop the program.
        # NOTE(review): this exits with status 0 although it is a
        # configuration error -- consider a non-zero status.
        logging.info('没有这种模型类型')
        exit(0)

    target_values = T.matrix('target_values', dtype='int32')

    # Output layer applied at every time step (num_leading_axes=2):
    # l_forward is (num_batch, sequence_length, num_units) and
    # l_out is (num_batch, sequence_length, vocab_size).
    l_out = lasagne.layers.DenseLayer(l_forward, num_units=vocab_size, W=lasagne.init.Normal(),
                                      num_leading_axes=2, nonlinearity=None)
    network_output = lasagne.layers.get_output(l_out)

    # Reorder to (sequence_length, num_batch, vocab_size) so that theano.scan
    # iterates over time steps.
    network_output = network_output.dimshuffle(1, 0, 2)

    def calculate_softmax(n_input):
        """Return the row-wise softmax of one time step."""
        return T.nnet.softmax(n_input)

    def merge_cost(n_input, n_target, n_mask, n_pos, cost_prev):
        """Add one time step's weighted cross-entropy to the running total."""
        # Flatten the targets of this step before computing the loss.
        n_target = n_target.ravel()
        n_cost = T.nnet.categorical_crossentropy(n_input, n_target)
        # Weight each sample by its mask entry and its position weight.
        n_cost = n_cost * n_mask * n_pos
        n_cost = n_cost.sum()
        return cost_prev + n_cost

    network_output_softmax, _ = theano.scan(fn=calculate_softmax, sequences=network_output)

    # Accumulate the weighted categorical cross-entropy over all time steps.
    m_cost, _ = theano.scan(fn=merge_cost,
                            sequences=[network_output_softmax, target_values.T,
                                       l_mask.input_var.T, l_pos.input_var.T],
                            outputs_info=T.constant(0.))
    # scan returns every partial sum; only the final accumulated value is needed.
    m_cost = m_cost[-1]
    # Normalise by the sum of the mask.
    cost = m_cost / l_mask.input_var.sum()

    # Back to (num_batch, sequence_length, vocab_size) for prediction.
    network_output_softmax = network_output_softmax.dimshuffle(1, 0, 2)

    # Compute AdaGrad updates for training.
    logging.info("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for prediction and training.
    logging.info("Compiling functions ...")
    input_var = [l_in.input_var, l_mask.input_var]
    if USE_TIME_INPUT:
        input_var += [l_t.input_var]
    if USE_DURATION:
        input_var += [l_d.input_var]
    predict = theano.function(input_var, network_output_softmax, allow_input_downcast=True)

    # Training inputs are ordered [x, mask, pos, (t), (d), target].
    input_var += [target_values]
    input_var.insert(2, l_pos.input_var)
    train = theano.function(input_var, cost, updates=updates, allow_input_downcast=True)
    # compute_cost returns the cost without applying any update.
    compute_cost = theano.function(input_var, cost, allow_input_downcast=True)

    def do_evaluate(test_x, test_y, test_mask, lengths, test_t=None, test_d=None, n=1000, test_batch=5):
        """Predict on the test set and log recall, MRR and the average rank.

        Predictions are computed in batches of ``test_batch`` rows. For every
        test case the distribution at step ``lengths[idx] - 1`` is ranked
        against ``test_y[idx]``. ``n`` is not used; it is kept so existing
        calls stay valid.
        """
        total_size = test_x.shape[0]
        if total_size == 0:
            # Nothing to rank; avoid dividing by zero below.
            logging.info('No test data to evaluate')
            return

        # Collect the batches and concatenate once rather than re-copying the
        # growing array after every batch.
        batch_probs = []
        for start in range(0, total_size, test_batch):
            stop = start + test_batch
            batch_inputs = [test_x[start:stop], test_mask[start:stop]]
            if test_t is not None:
                batch_inputs.append(test_t[start:stop])
            if test_d is not None:
                batch_inputs.append(test_d[start:stop])
            batch_probs.append(predict(*batch_inputs))
        probs_all_time = np.concatenate(batch_probs, axis=0)

        recall10 = 0.
        MRR10_score = 0.
        rate_sum = 0
        sample_time = SAMPLE_TIME
        for idx in range(total_size):
            gnd = test_y[idx]
            # Predicted distribution at the last valid step of this test case.
            probs = probs_all_time[idx, lengths[idx] - 1, :]
            # Indices sorted by descending probability.
            prob_index = np.argsort(probs)[-1::-1].tolist()
            # Rank of the ground truth among all candidates.
            gnd_rate = prob_index.index(gnd) + 1
            rate_sum += gnd_rate
            # Sample multiple times to reduce randomness.
            for _ in range(sample_time):
                samples = np.random.choice(range(vocab_size), vocab_size, replace=False).tolist()
                # Make sure the first element is the ground truth, so that its
                # rank is the position of index 0 after sorting.
                try:
                    samples.remove(gnd)
                    samples.insert(0, gnd)
                except ValueError:
                    samples[0] = gnd
                sample_probs = probs[samples]
                prob_index = np.argsort(sample_probs)[-1::-1].tolist()
                rate = prob_index.index(0) + 1
                # Only hits within the top RANK contribute to recall and MRR.
                if rate <= RANK:
                    recall10 += 1
                    MRR10_score += 1. / rate

        count = total_size * sample_time
        recall10 = recall10 / count
        MRR10_score = MRR10_score / count
        avg_rate = float(rate_sum) / total_size
        logging.info('Recall@10 {}'.format(recall10))
        logging.info('MRR@10 1/rate {}'.format(MRR10_score))
        logging.info('Average rate {}'.format(avg_rate))

    def onehot2int(onehot_vec):
        """Convert a sequence of one-hot vectors to a list of indices."""
        return [onehot.tolist().index(1) for onehot in onehot_vec]

    def get_short_test_data(length):
        """Return the test data truncated to its first ``length`` steps.

        For sequences longer than ``length`` the target becomes the item at
        position ``length``; shorter ones keep their original target.
        """
        test_x = test_data['x'][:, :length]
        test_mask = test_data['mask'][:, :length]
        test_t = test_data['t'][:, :length] if USE_TIME_INPUT else None
        test_d = test_data['d'][:, :length] if USE_DURATION else None
        lengths = np.sum(test_mask, axis=1).astype('int')
        test_y = test_data['y'].copy()
        for idx in range(test_y.shape[0]):
            whole_length = test_data['lengths'][idx]
            if length < whole_length:
                if ONE_HOT:
                    test_y[idx] = test_data['x'][idx, length, :].tolist().index(1)
                else:
                    test_y[idx] = test_data['x'][idx, length, 0]
        return test_x, test_y, test_mask, lengths, test_t, test_d

    def evaluate(model, current_epoch, additional_test_length):
        """Evaluate on the full test set and, optionally, on shortened data.

        Unless DEBUG is set, the model is saved afterwards.
        """
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = test_data['mask']
        lengths = test_data['lengths']
        logging.info('-----------Evaluate Normal:{},{},{}-------------------'.format(MODEL_TYPE, DATA_TYPE, N_HIDDEN))
        do_evaluate(test_x, test_y, test_mask, lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_data['d'] if USE_DURATION else None,
                    test_batch=TEST_BATCH)
        # Evaluate the model on short data.
        if additional_test_length > 0:
            logging.info('-----------Evaluate Additional---------------')
            test_x, test_y, test_mask, lengths, test_t, test_d = get_short_test_data(additional_test_length)
            do_evaluate(test_x, test_y, test_mask, lengths, test_t, test_d, test_batch=TEST_BATCH)
        logging.info('-----------Evaluate End----------------------')
        if not DEBUG:
            vutils.save_model('{}-{}-{}-{}'.format(MODEL_TYPE, current_epoch, DATA_TYPE, N_HIDDEN),
                              str(datetime.datetime.now()), model, '_new')

    def add_test_to_train(length):
        """Append the first ``length`` steps of every test case to train_data.

        Entries appended by an earlier call are dropped first, so the training
        set never holds more than one generation of test prefixes.
        """
        logging.info('Length {} test cases added to train set'.format(length))
        global train_data
        logging.info('Old train data size {}'.format(len(train_data['x'])))
        # Remove the entries added by a previous call.
        train_data['x'] = train_data['x'][:train_data_size]
        train_data['y'] = train_data['y'][:train_data_size]
        # `in` works on both Python 2 and 3; dict.has_key is Python 2 only.
        if 't' in train_data:
            train_data['t'] = train_data['t'][:train_data_size]
        if 'd' in train_data:
            train_data['d'] = train_data['d'][:train_data_size]
        test_x = test_data['x']
        lengths = test_data['lengths']
        for idx in range(test_x.shape[0]):
            n_length = length
            # To make sure the complete test case will not be added into train set.
            # NOTE(review): this shortens by one step relative to `length`, not
            # relative to lengths[idx] -- confirm that is the intent.
            if lengths[idx] <= length:
                n_length = length - 1
            if ONE_HOT:
                # If ONE_HOT is used, convert the one-hot vectors to ints first.
                new_x = onehot2int(test_x[idx, :n_length, :])
                new_y = onehot2int(test_x[idx, 1:n_length + 1, :])
            else:
                new_x = test_x[idx, :n_length, 0]
                new_y = test_x[idx, 1:n_length + 1, 0]
            train_data['x'].append(new_x)
            train_data['y'].append(new_y)
            if 't' in train_data:
                test_t = test_data['t']
                new_t = test_t[idx, :n_length].tolist()
                train_data['t'].append(new_t)
            if 'd' in train_data:
                test_d = test_data['d']
                new_d = test_d[idx, :n_length].tolist()
                train_data['d'].append(new_d)
        logging.info('New train data size {}'.format(len(train_data['x'])))
        logging.info('--Data Added--')

    logging.info("Training ...")
    logging.info('Data size {},Max epoch {},Batch {}'.format(train_data_size, num_epochs, BATCH_SIZE))
    p = 0
    current_epoch = 0
    it = 0
    data_size = train_data_size
    last_it = 0  # iteration count at the end of the previous epoch
    avg_cost = 0  # loss accumulated since the last progress log
    avg_seq_len = 0  # sequence length accumulated since the last progress log
    # Batch schedule for the training data.
    # NOTE(review): plist is generated once for the initial data_size; after
    # data_size grows below, p // BATCH_SIZE can exceed the initial number of
    # batches -- confirm that vutils.genPlist / gen_data cope with that.
    plist = vutils.genPlist(data_size, BATCH_SIZE)
    try:
        while True:
            # Floor division keeps the index an int under true division too.
            randP = plist[p // BATCH_SIZE]
            batch_data = gen_data(randP, train_data, batch_size=BATCH_SIZE)
            x = batch_data['x']
            y = batch_data['y']
            mask = batch_data['mask']
            pos = batch_data['pos']
            avg_seq_len += x.shape[1]
            # Same order as the compiled train function:
            # [x, mask, pos, (t), (d), y]. Appending keeps d in front of y even
            # when no time input is used.
            input_var = [x, mask, pos]
            if USE_TIME_INPUT:
                input_var.append(batch_data['t'])
            if USE_DURATION:
                input_var.append(batch_data['d'])
            input_var.append(y)
            avg_cost += train(*input_var)
            it += 1
            p += BATCH_SIZE
            if p >= data_size:
                # One pass over the data is finished.
                p = 0
                last_it = it
                current_epoch += 1
                # First stage: using original train data to train the model for
                # FIXED_EPOCHS epochs.
                # Second stage: after that, add part of the test data to the
                # train data.
                # The first stage uses information from users with similar
                # interest, the second stage uses history information.
                # The schedule follows the num_epochs argument so that it stays
                # consistent with the stop condition below. No test prefix is
                # used when the epoch budget does not exceed FIXED_EPOCHS.
                if num_epochs > FIXED_EPOCHS:
                    additional_length = int(
                        (current_epoch - FIXED_EPOCHS) * test_data_length / (num_epochs - FIXED_EPOCHS))
                else:
                    additional_length = 0
                evaluate(l_out, current_epoch=current_epoch, additional_test_length=additional_length)
                if current_epoch >= num_epochs:
                    break
                if current_epoch > FIXED_EPOCHS:
                    data_size = train_data_size + test_data_size
                    logging.info('>> length {} test cases added to train set.'.format(additional_length))
                    add_test_to_train(additional_length)
                logging.info('Epoch {} Carriage Return'.format(current_epoch))
            if it % PRINT_FREQ == 0:
                # Progress is logged every PRINT_FREQ iterations.
                logging.info("Epoch {}-{},iter {} average seq length = {} average loss = {}".format(
                    current_epoch,
                    (it - last_it) * 1.0 * BATCH_SIZE / data_size,
                    it,
                    avg_seq_len / PRINT_FREQ,
                    avg_cost / PRINT_FREQ))
                avg_cost = 0
                avg_seq_len = 0
        logging.info('End')
    except KeyboardInterrupt:
        logging.info('由于你的自行中断,程序已经停止.')