Example #1
    def __init__(self, num_input=256, num_hidden=512, num_output=256):
        X = T.matrix()
        Y = T.matrix()
        eta = T.scalar()
        alpha = T.scalar()

        self.num_input = num_input
        self.num_hidden = num_hidden
        self.num_output = num_output

        inputs = InputLayer(X, name="inputs")
        lstm1f = LSTMLayer(num_input, num_hidden, input_layers=[inputs], name="lstm1f")
        lstm1b = LSTMLayer(num_input, num_hidden, input_layers=[inputs], name="lstm1b", go_backwards=True)

        fc = FullyConnectedLayer(2*num_hidden, num_output, input_layers=[lstm1f, lstm1b], name="yhat")

        Y_hat = sigmoid(T.mean(fc.output(), axis=0))

        self.layers = inputs, lstm1f, lstm1b, fc

        params = get_params(self.layers)
        caches = make_caches(params)


        mean_cost = -T.mean(Y * T.log(Y_hat) + (1 - Y) * T.log(1 - Y_hat))

        last_step_cost = -T.mean(Y[-1] * T.log(Y_hat[-1]) +
                                 (1 - Y[-1]) * T.log(1 - Y_hat[-1]))

        cost = alpha * mean_cost + (1-alpha) * last_step_cost

        updates = momentum(cost, params, caches, eta, clip_at=3.0)

        self.train = theano.function([X, Y, eta, alpha], [cost, last_step_cost], updates=updates, allow_input_downcast=True)

        self.predict = theano.function([X], [Y_hat[-1]], allow_input_downcast=True)
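For reference, a minimal numpy sketch (toy arrays, not from the source) of the blended cost above: alpha weighs the mean binary cross-entropy over all time steps against the cross-entropy at the final step only.

import numpy as np

def blended_bce(Y, Y_hat, alpha):
    # Y, Y_hat: (time_steps, num_output) targets and predicted probabilities
    eps = 1e-7  # guard against log(0); the Theano snippet above omits this
    Y_hat = np.clip(Y_hat, eps, 1 - eps)
    mean_cost = -np.mean(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat))
    last_cost = -np.mean(Y[-1] * np.log(Y_hat[-1]) +
                         (1 - Y[-1]) * np.log(1 - Y_hat[-1]))
    return alpha * mean_cost + (1 - alpha) * last_cost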
Example #2
  def create_rnn_layer(self, hidden_dim, input_dim, vocab_size, is_encoder):
    if self.rnn_type == 'vanillarnn':
      return VanillaRNNLayer(hidden_dim, input_dim, vocab_size,
                             create_init_state=is_encoder)
    elif self.rnn_type == 'gru':
      return GRULayer(hidden_dim, input_dim, vocab_size,
                      create_init_state=is_encoder)
    elif self.rnn_type == 'lstm':
      return LSTMLayer(hidden_dim, input_dim, vocab_size,
                       create_init_state=is_encoder)
    elif self.rnn_type == 'atnh':
      return LSTMLayer(hidden_dim, input_dim, vocab_size,
                       create_init_state=is_encoder)

    raise Exception('Unrecognized rnn_type %s' % self.rnn_type)
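A hedged refactoring sketch (not from the source): the same dispatch expressed as a lookup table, reusing the layer classes from the example above; 'atnh' deliberately maps to LSTMLayer, matching the branch it replaces.

LAYER_CLASSES = {
    'vanillarnn': VanillaRNNLayer,
    'gru': GRULayer,
    'lstm': LSTMLayer,
    'atnh': LSTMLayer,
}

def create_rnn_layer(self, hidden_dim, input_dim, vocab_size, is_encoder):
    try:
        layer_cls = LAYER_CLASSES[self.rnn_type]
    except KeyError:
        raise ValueError('Unrecognized rnn_type %s' % self.rnn_type)
    return layer_cls(hidden_dim, input_dim, vocab_size,
                     create_init_state=is_encoder)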
Example #3
    def __init__(self):
        X = T.matrix()
        Y = T.matrix()
        eta = T.scalar()
        temperature = T.scalar()

        num_input = 256
        num_hidden = 500
        num_output = 256

        inputs = InputLayer(X, name="inputs")
        lstm1 = LSTMLayer(num_input,
                          num_hidden,
                          input_layer=inputs,
                          name="lstm1")
        lstm2 = LSTMLayer(num_hidden,
                          num_hidden,
                          input_layer=lstm1,
                          name="lstm2")
        softmax = SoftmaxLayer(num_hidden,
                               num_output,
                               input_layer=lstm2,
                               name="yhat",
                               temperature=temperature)

        Y_hat = softmax.output()

        self.layers = inputs, lstm1, lstm2, softmax

        params = get_params(self.layers)
        caches = make_caches(params)

        cost = T.mean(T.nnet.categorical_crossentropy(Y_hat, Y))
        updates = momentum(cost, params, caches, eta)

        self.train = theano.function([X, Y, eta, temperature],
                                     cost,
                                     updates=updates,
                                     allow_input_downcast=True)

        predict_updates = one_step_updates(self.layers)
        self.predict_char = theano.function([X, temperature],
                                            Y_hat,
                                            updates=predict_updates,
                                            allow_input_downcast=True)
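What the temperature scalar does, shown as a small numpy sketch with toy logits (not from the source): dividing the logits by the temperature before the softmax flattens the distribution for temperatures above 1 and sharpens it below 1.

import numpy as np

def softmax_with_temperature(logits, temperature):
    z = logits / temperature
    z = z - z.max()  # subtract the max for numerical stability
    e = np.exp(z)
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])
print(softmax_with_temperature(logits, 0.5))  # sharper than plain softmax
print(softmax_with_temperature(logits, 2.0))  # flatter than plain softmax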
Example #4
    def __init__(self,
                 num_input=256,
                 num_hidden=[512, 512],
                 num_output=256,
                 clip_at=0.0,
                 scale_norm=0.0):
        X = T.matrix()
        Y = T.matrix()
        eta = T.scalar()
        alpha = T.scalar()
        lambda2 = T.scalar()
        dropout_lstm = T.scalar()

        self.num_input = num_input
        self.num_hidden = num_hidden
        self.num_output = num_output
        self.clip_at = clip_at
        self.scale_norm = scale_norm

        inputs = InputLayer(X, name="inputs")
        num_prev = num_input
        prev_layer = inputs

        self.layers = [inputs]
        for i, num_curr in enumerate(num_hidden):
            lstm = LSTMLayer(num_prev,
                             num_curr,
                             input_layers=[prev_layer],
                             name="lstm{0}".format(i + 1),
                             drop_prob=dropout_lstm)
            num_prev = num_curr
            prev_layer = lstm
            prev_layer = DropoutLayer(input_layers=[prev_layer],
                                      dropout_probability=dropout_lstm)
            self.layers.append(lstm)
        sigmoid = SigmoidLayer(num_prev,
                               num_output,
                               input_layers=[prev_layer],
                               name="yhat")
        self.layers.append(sigmoid)
        Y_hat = sigmoid.output()

        params = get_params(self.layers)
        caches = make_caches(params)

        mean_cost = -T.mean(Y * T.log(Y_hat) + (1 - Y) * T.log(1 - Y_hat))

        last_step_cost = -T.mean(Y[-1] * T.log(Y_hat[-1]) +
                                 (1 - Y[-1]) * T.log(1 - Y_hat[-1]))

        cost = alpha * mean_cost + (1 - alpha) * last_step_cost

        updates = momentum(cost,
                           params,
                           caches,
                           eta,
                           clip_at=self.clip_at,
                           scale_norm=self.scale_norm,
                           lambda2=lambda2)

        self.train_func = theano.function(
            [X, Y, eta, alpha, lambda2, dropout_lstm], [cost, last_step_cost],
            updates=updates,
            allow_input_downcast=True)

        self.predict_func = theano.function([X, dropout_lstm], [Y_hat[-1]],
                                            allow_input_downcast=True)

        self.predict_sequence_func = theano.function([X, dropout_lstm],
                                                     [Y_hat],
                                                     allow_input_downcast=True)
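A hedged usage sketch with made-up shapes and hyperparameters (the class name Model and the data are assumptions; the source shows only the constructor):

import numpy as np

model = Model(num_input=256, num_hidden=[512, 512], num_output=256)
X = np.random.rand(100, 256)  # (time_steps, num_input)
Y = (np.random.rand(100, 256) > 0.5).astype('float32')
# eta, alpha, lambda2, dropout probability, in the order the train function expects
cost, last_cost = model.train_func(X, Y, 0.01, 0.5, 1e-4, 0.5)
y_last = model.predict_func(X, 0.0)  # dropout disabled at prediction time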
Example #5
def main(num_epochs=NUM_EPOCHS, vocab_size=VOCAB_SIZE):
    logging.info("Building network ...")

    # First, we build the network, starting with an input layer
    # Recurrent layers expect input of shape
    # (batch size, SEQ_LENGTH, num_features)
    l_in = lasagne.layers.InputLayer(shape=(None, None, NDIM))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))

    # We now build the LSTM layer which takes l_in as the input layer
    # We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients.
    l_forward = None

    if MODEL_TYPE == 'LSTM' or MODEL_TYPE == 'LSTM_T':
        l_t = lasagne.layers.InputLayer(
            shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = LSTMLayer(l_in,
                              time_input=l_t,
                              mask_input=l_mask,
                              num_units=N_HIDDEN,
                              peepholes=True,
                              ingate=lasagne.layers.Gate(),
                              forgetgate=lasagne.layers.Gate(),
                              cell=lasagne.layers.Gate(
                                  W_cell=None,
                                  nonlinearity=lasagne.nonlinearities.tanh),
                              outgate=lasagne.layers.Gate(),
                              cell_init=lasagne.init.Constant(0.),
                              hid_init=lasagne.init.Constant(0.),
                              grad_clipping=GRAD_CLIP,
                              nonlinearity=lasagne.nonlinearities.tanh,
                              bn=BN,
                              only_return_final=False)
    elif MODEL_TYPE == 'TLSTM1':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM1Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM2':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM2Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM3':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM3Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            # forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'PLSTM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = PLSTMLayer(l_in,
                               time_input=l_t,
                               num_units=N_HIDDEN,
                               mask_input=l_mask,
                               grad_clipping=GRAD_CLIP,
                               bn=BN,
                               timegate=PLSTMTimeGate())

    # Theano tensor for the targets
    target_values = T.matrix('target_values', dtype='int32')
    # The output of l_forward, of shape (batch_size, time_sequence, N_HIDDEN),
    # is passed through a softmax nonlinearity to create a probability
    # distribution over the predictions.
    # The output of this stage is (batch_size, time_sequence, vocab_size)
    l_out = lasagne.layers.DenseLayer(l_forward,
                                      num_units=vocab_size,
                                      W=lasagne.init.Normal(),
                                      num_leading_axes=2,
                                      nonlinearity=None)
    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = lasagne.layers.get_output(l_out)
    # We need to sum up the cost through time.
    # network_output: (time_sequence, batch_size, vocab_size)
    network_output = network_output.dimshuffle(1, 0, 2)

    def calculate_softmax(n_input):
        return T.nnet.softmax(n_input)

    def merge_cost(n_input, n_target, n_mask, cost_prev):
        n_target = n_target.ravel()
        n_cost = T.nnet.categorical_crossentropy(n_input, n_target)
        n_cost = n_cost * n_mask
        n_cost = n_cost.sum()

        return cost_prev + n_cost

    network_output_softmax, _ = theano.scan(fn=calculate_softmax,
                                            sequences=network_output)

    # The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
    m_cost, _ = theano.scan(fn=merge_cost,
                            sequences=[
                                network_output_softmax, target_values.T,
                                l_mask.input_var.T
                            ],
                            outputs_info=T.constant(0.))
    m_cost = m_cost[-1]
    cost = m_cost / l_mask.input_var.sum()

    # convert back to: (batch_size, time_sequence, vocab_size)
    network_output_softmax = network_output_softmax.dimshuffle(1, 0, 2)

    # Compute AdaGrad updates for training
    logging.info("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training, predict
    logging.info("Compiling functions ...")
    input_var = [l_in.input_var, l_mask.input_var]
    if USE_TIME_INPUT:
        input_var += [l_t.input_var]

    predict = theano.function(input_var,
                              network_output_softmax,
                              allow_input_downcast=True)
    input_var += [target_values]
    train = theano.function(input_var,
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    # compute_cost returns the cost without applying updates
    compute_cost = theano.function(input_var, cost, allow_input_downcast=True)

    def do_evaluate(test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_t=None,
                    n=100,
                    test_batch=5):
        # evaluate and calculate recall@10, MRR@10

        logging.info("Evaluate: Start predicting")

        p = 0
        probs_all_time = None
        while True:
            input_var = [test_x[p:p + test_batch], test_mask[p:p + test_batch]]
            if test_t is not None:
                input_var += [test_t[p:p + test_batch]]
            batch_probs = predict(*input_var)
            if probs_all_time is None:
                probs_all_time = np.zeros(
                    (test_x.shape[0] + TEST_BATCH, batch_probs.shape[2]))

            # write this batch's last-step predictions, then advance p
            probs_all_time[p:p + batch_probs.shape[0], :] = batch_probs[:, -1, :]
            p += test_batch
            if p >= test_x.shape[0]:
                break

        logging.info("Evaluate: End predicting")
        total_size = test_x.shape[0]
        recall10 = 0.
        MRR10_score = 0.
        NDCG_score = 0.
        rate_sum = 0

        sample_time = SAMPLE_TIME

        for idx in range(total_size):
            gnd = test_y[idx]
            probs = probs_all_time[idx, :]
            prob_index = np.argsort(probs)[-1::-1].tolist()
            gnd_rate = prob_index.index(gnd) + 1
            rate_sum += gnd_rate
            # Sample multiple times to reduce randomness
            for _ in range(sample_time):
                samples = np.random.choice(range(vocab_size),
                                           n + 1,
                                           replace=False).tolist()
                # for i, sample in enumerate(samples):
                #     o = 0
                #     while sample in test_x[idx].tolist() and o < 10:
                #         sample = random.choice(range(vocab_size))
                #         samples[i] = sample
                #         o+=1

                # make sure the first element is gnd
                try:
                    samples.remove(gnd)
                    samples.insert(0, gnd)
                except ValueError:
                    samples[0] = gnd

                sample_probs = probs[samples]
                prob_index = np.argsort(sample_probs)[-1::-1].tolist()
                rate = prob_index.index(0) + 1

                # calculate Recall@10, NDCG@10 and MRR@10
                if rate <= 10:
                    recall10 += 1
                    MRR10_score += 1. / rate
                    NDCG_score += 1. / math.log(rate + 1, 2)

        logging.info("Evaluate: End calculating scores")

        count = total_size * sample_time
        recall10 = recall10 / count
        MRR10_score = MRR10_score / count
        NDCG_score = NDCG_score / count
        avg_rate = float(rate_sum) / total_size

        logging.info('Recall@10 {}'.format(recall10))
        logging.info('MRR@10 1/rate {}'.format(MRR10_score))
        logging.info('NDCG@10 1/rate {}'.format(NDCG_score))
        logging.info('Average rate {}'.format(avg_rate))

    def onehot2int(onehot_vec):
        # convert onehot vector to index
        ret = []
        for onehot in onehot_vec:
            ret.append(onehot.tolist().index(1))
        return ret

    def get_short_test_data(length):
        print("Get short test data")
        # generate short sequence in the test_data.
        test_x = test_data['x'][:, :length]
        test_mask = test_data['mask'][:, :length]
        test_t = test_data['t'][:, :length] if USE_TIME_INPUT else None
        lengths = np.sum(test_mask, axis=1).astype('int')

        test_y = test_data['y'].copy()
        for idx in range(test_y.shape[0]):
            whole_length = test_data['lengths'][idx]
            if length < whole_length:
                test_y[idx] = test_data['x'][idx, length, :].tolist().index(
                    1) if ONE_HOT else test_data['x'][idx, length, 0]
        logging.info("Finished getting short test data")
        return test_x, test_y, test_mask, lengths, test_t

    def evaluate(model, current_epoch, additional_test_length):
        # Evaluate the model
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = test_data['mask']
        lengths = test_data['lengths']
        logging.info(
            '-----------Evaluate Normal:{},{},{}-------------------'.format(
                MODEL_TYPE, DATA_TYPE, N_HIDDEN))
        do_evaluate(test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_batch=TEST_BATCH)
        # Evaluate the model on short data
        if additional_test_length > 0:
            logging.info('-----------Evaluate Additional---------------')
            test_x, test_y, test_mask, lengths, test_t = get_short_test_data(
                additional_test_length)
            do_evaluate(test_x,
                        test_y,
                        test_mask,
                        lengths,
                        test_t,
                        test_batch=TEST_BATCH)
        logging.info('-----------Evaluate End----------------------')
        if not DEBUG:
            utils.save_model(
                '{}-{}-{}-{}'.format(MODEL_TYPE, current_epoch,
                                     DATA_TYPE, N_HIDDEN),
                str(datetime.datetime.now()), model, '_new')

        logging.info("Done saving")

    def add_test_to_train(length):
        logging.info('Length {} test cases added to train set'.format(length))
        global train_data
        logging.info('Old train data size {}'.format(len(train_data['x'])))
        # Remove the train_data added before
        train_data['x'] = train_data['x'][:train_data_size]
        train_data['y'] = train_data['y'][:train_data_size]
        if 't' in train_data:
            train_data['t'] = train_data['t'][:train_data_size]
        test_x = test_data['x']
        lengths = test_data['lengths']
        for idx in range(test_x.shape[0]):
            n_length = length
            # Make sure the complete test case is not added to the train set
            if lengths[idx] <= length:
                n_length = length - 1
            if ONE_HOT:
                # if ONE_HOT is used, we convert one hot vector to int first.
                new_x = onehot2int(test_x[idx, :n_length, :])
                new_y = onehot2int(test_x[idx, 1:n_length + 1, :])
            else:
                new_x = test_x[idx, :n_length, 0]
                new_y = test_x[idx, 1:n_length + 1, 0]
            train_data['x'].append(new_x)
            train_data['y'].append(new_y)
            if 't' in train_data:
                test_t = test_data['t']
                new_t = test_t[idx, :n_length].tolist()
                train_data['t'].append(new_t)
        logging.info('New train data size {}'.format(len(train_data['x'])))
        logging.info('--Data Added--')

    logging.info("Training ...")
    logging.info('Data size {},Max epoch {},Batch {}'.format(
        train_data_size, num_epochs, BATCH_SIZE))

    p = 0
    current_epoch = 0
    it = 0
    data_size = train_data_size
    last_it = 0
    avg_cost = 0
    avg_seq_len = 0
    try:
        while True:
            #logging.info("Load batch")
            batch_data = gen_data(p, train_data, batch_size=BATCH_SIZE)
            x = batch_data['x']
            y = batch_data['y']
            mask = batch_data['mask']
            avg_seq_len += x.shape[1]
            input_var = [x, mask, y]

            #logging.info("Train batch")

            if USE_TIME_INPUT:
                t = batch_data['t']
                input_var.insert(2, t)
            avg_cost += train(*input_var)
            it += 1
            p += BATCH_SIZE
            #logging.info("Done bitch")
            #if True:
            if (p >= data_size):
                p = 0
                last_it = it
                current_epoch += 1
                # First stage: train the model on the original train data for FIXED_EPOCHS.
                # Second stage: after that, add part of the test data to the train data.
                # The first stage uses information from users with similar interests;
                # the second stage uses history information.
                additional_length = int(
                    (current_epoch - FIXED_EPOCHS) * test_data_length /
                    (NUM_EPOCHS - FIXED_EPOCHS))
                #if current_epoch % 2 == 0:
                evaluate(l_out,
                         current_epoch=current_epoch,
                         additional_test_length=additional_length)

                if current_epoch >= num_epochs:
                    break
                if current_epoch > FIXED_EPOCHS:
                    data_size = train_data_size + test_data_size
                    logging.info(
                        '>> length {} test cases added to train set.'.format(
                            additional_length))
                    add_test_to_train(additional_length)
                logging.info('Epoch {} Carriage Return'.format(current_epoch))
            if it % PRINT_FREQ == 0:
                logging.info(
                    "Epoch {}-{}, iter {}, average seq length = {}, average loss = {}"
                    .format(current_epoch,
                            (it - last_it) * 1.0 * BATCH_SIZE / data_size, it,
                            avg_seq_len / PRINT_FREQ, avg_cost / PRINT_FREQ))
                avg_cost = 0
                avg_seq_len = 0
        logging.info('End')
    except KeyboardInterrupt:
        pass
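The scan-based cost in this example amounts to a masked, length-normalized cross-entropy. A numpy sketch with toy shapes (not from the source) of the same computation:

import numpy as np

def masked_mean_xent(probs, targets, mask):
    # probs: (time, batch, vocab) softmax outputs
    # targets, mask: (time, batch); mask is 1 on real steps, 0 on padding
    t_idx, b_idx = np.indices(targets.shape)
    nll = -np.log(probs[t_idx, b_idx, targets])  # per-step cross-entropy
    return (nll * mask).sum() / mask.sum()       # ignore padding, normalize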
Example #6
    def __init__(self,
                 rnn_type,
                 ntoken,
                 ninp,
                 nhid,
                 nlayers,
                 dropout=0.5,
                 dropouth=0.5,
                 dropouti=0.5,
                 dropoute=0.1,
                 wdrop=0,
                 tie_weights=False,
                 no_dropout=False,
                 custom_lstm=False):
        super(RNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.use_dropout = not no_dropout

        if wdrop is None:
            wdrop = 0
        wdrop = wdrop if self.use_dropout else 0
        assert rnn_type in ['LSTM', 'QRNN', 'GRU'], 'RNN type is not supported'
        if rnn_type == 'LSTM':
            # we need to use our own LSTM to support second-order derivatives
            if not custom_lstm:
                self.rnns = [
                    torch.nn.LSTM(ninp if l == 0 else nhid,
                                  nhid if l != nlayers - 1 else
                                  (ninp if tie_weights else nhid),
                                  1,
                                  dropout=0) for l in range(nlayers)
                ]
                self.rnns = [
                    WeightDrop(rnn, ['weight_hh_l0'], dropout=wdrop)
                    for rnn in self.rnns
                ]
            else:
                self.rnns = [
                    LSTMLayer(
                        ninp if l == 0 else nhid, nhid if l != nlayers - 1 else
                        (ninp if tie_weights else nhid))
                    for l in range(nlayers)
                ]
                self.rnns = [
                    WeightDrop(rnn, ['weight_hh'], dropout=wdrop)
                    for rnn in self.rnns
                ]
        elif rnn_type == 'GRU':
            self.rnns = [
                torch.nn.GRU(ninp if l == 0 else nhid,
                             nhid if l != nlayers - 1 else ninp,
                             1,
                             dropout=0) for l in range(nlayers)
            ]
            self.rnns = [
                WeightDrop(rnn, ['weight_hh'], dropout=wdrop)
                for rnn in self.rnns
            ]
        elif rnn_type == 'QRNN':
            from torchqrnn import QRNNLayer
            self.rnns = [
                QRNNLayer(input_size=ninp if l == 0 else nhid,
                          hidden_size=nhid if l != nlayers - 1 else
                          (ninp if tie_weights else nhid),
                          save_prev_x=True,
                          zoneout=0,
                          window=2 if l == 0 else 1,
                          output_gate=True) for l in range(nlayers)
            ]
            for rnn in self.rnns:
                rnn.linear = WeightDrop(rnn.linear, ['weight'], dropout=wdrop)
        print(self.rnns)
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            #if nhid != ninp:
            #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.wdrop = wdrop
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights
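Weight tying in isolation, as a minimal PyTorch sketch with toy sizes (not from the source): the decoder reuses the embedding matrix, which is why the constructor above sizes the last RNN layer's output to ninp whenever tie_weights is set.

import torch.nn as nn

ntoken, ninp = 1000, 128
encoder = nn.Embedding(ntoken, ninp)
decoder = nn.Linear(ninp, ntoken)
decoder.weight = encoder.weight  # both now share one (ntoken, ninp) parameter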
Example #7
def main(num_epochs=NUM_EPOCHS, vocab_size=VOCAB_SIZE):
    logging.info("Building network ...")

    # First, we build the network, starting with an input layer
    # Recurrent layers expect input of shape
    # (batch size, SEQ_LENGTH, num_features)
    l_in = lasagne.layers.InputLayer(shape=(None, None, NDIM))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))

    # We now build the LSTM layer which takes l_in as the input layer
    # We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients.
    l_forward = None

    if MODEL_TYPE == 'LSTM' or MODEL_TYPE == 'LSTM_T':
        l_t = lasagne.layers.InputLayer(
            shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = LSTMLayer(l_in,
                              time_input=l_t,
                              mask_input=l_mask,
                              num_units=N_HIDDEN,
                              peepholes=True,
                              ingate=lasagne.layers.Gate(),
                              forgetgate=lasagne.layers.Gate(),
                              cell=lasagne.layers.Gate(
                                  W_cell=None,
                                  nonlinearity=lasagne.nonlinearities.tanh),
                              outgate=lasagne.layers.Gate(),
                              cell_init=lasagne.init.Constant(0.),
                              hid_init=lasagne.init.Constant(0.),
                              grad_clipping=GRAD_CLIP,
                              nonlinearity=lasagne.nonlinearities.tanh,
                              bn=BN,
                              only_return_final=False)
    elif MODEL_TYPE == 'TLSTM1':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM1Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM2':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM2Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM3':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = TLSTM3Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            # forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None,
                                     nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'PLSTM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = PLSTMLayer(l_in,
                               time_input=l_t,
                               num_units=N_HIDDEN,
                               mask_input=l_mask,
                               grad_clipping=GRAD_CLIP,
                               bn=BN,
                               timegate=PLSTMTimeGate())

    # Theano tensor for the targets
    target_values = T.matrix('target_values', dtype='int32')
    # The output of l_forward, of shape (batch_size, time_sequence, N_HIDDEN),
    # is passed through a softmax nonlinearity to create a probability
    # distribution over the predictions.
    # The output of this stage is (batch_size, time_sequence, vocab_size)
    l_out = lasagne.layers.DenseLayer(l_forward,
                                      num_units=vocab_size,
                                      W=lasagne.init.Normal(),
                                      num_leading_axes=2,
                                      nonlinearity=None)
    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = lasagne.layers.get_output(l_out)
    # We need to sum up the cost through time.
    # network_output: (time_sequence, batch_size, vocab_size)
    network_output = network_output.dimshuffle(1, 0, 2)

    def calculate_softmax(n_input):
        return T.nnet.softmax(n_input)

    def merge_cost(n_input, n_target, n_mask, cost_prev):
        n_target = n_target.ravel()
        n_cost = T.nnet.categorical_crossentropy(n_input, n_target)
        n_cost = n_cost * n_mask
        n_cost = n_cost.sum()

        return cost_prev + n_cost

    network_output_softmax, _ = theano.scan(fn=calculate_softmax,
                                            sequences=network_output)

    # The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
    m_cost, _ = theano.scan(fn=merge_cost,
                            sequences=[
                                network_output_softmax, target_values.T,
                                l_mask.input_var.T
                            ],
                            outputs_info=T.constant(0.))
    m_cost = m_cost[-1]
    cost = m_cost / l_mask.input_var.sum()

    # convert back to: (batch_size, time_sequence, vocab_size)
    network_output_softmax = network_output_softmax.dimshuffle(1, 0, 2)

    # Compute AdaGrad updates for training
    logging.info("Computing updates ...")
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training, predict
    logging.info("Compiling functions ...")
    input_var = [l_in.input_var, l_mask.input_var]
    if USE_TIME_INPUT:
        input_var += [l_t.input_var]

    predict = theano.function(input_var,
                              network_output_softmax,
                              allow_input_downcast=True)
    input_var += [target_values]
    train = theano.function(input_var,
                            cost,
                            updates=updates,
                            allow_input_downcast=True)
    # compute_cost returns the cost without applying updates
    compute_cost = theano.function(input_var, cost, allow_input_downcast=True)

    def do_evaluate(current_epoch,
                    test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_t=None,
                    n=100,
                    test_batch=5,
                    name=None):
        # evaluate and calculate recall@10, MRR@10
        p = 0
        probs_all_time = None
        while True:
            input_var = [test_x[p:p + test_batch], test_mask[p:p + test_batch]]
            if test_t is not None:
                input_var += [test_t[p:p + test_batch]]
            batch_probs = predict(*input_var)
            p += test_batch
            probs_all_time = batch_probs if probs_all_time is None else np.concatenate(
                [probs_all_time, batch_probs], axis=0)
            if p >= test_x.shape[0]:
                break

        total_size = test_x.shape[0]
        recall10 = 0.
        MRR10_score = 0.
        NDCG_score = 0.
        rate_sum = 0

        sample_time = SAMPLE_TIME

        for idx in range(total_size):
            gnd = test_y[idx]
            probs = probs_all_time[idx, lengths[idx] - 1, :]
            prob_index = np.argsort(probs)[-1::-1].tolist()
            gnd_rate = prob_index.index(gnd) + 1
            rate_sum += gnd_rate
            # Sample multiple times to reduce randomness
            for _ in range(sample_time):
                samples = np.random.choice(range(vocab_size),
                                           n + 1,
                                           replace=False).tolist()
                # make sure the first element is gnd
                try:
                    samples.remove(gnd)
                    samples.insert(0, gnd)
                except ValueError:
                    samples[0] = gnd

                sample_probs = probs[samples]
                prob_index = np.argsort(sample_probs)[-1::-1].tolist()
                rate = prob_index.index(0) + 1

                # calculate Recall@10, MRR@10 and NDCG@10
                if rate <= 10:
                    recall10 += 1
                    MRR10_score += 1. / rate
                    NDCG_score += 1. / np.log2(rate + 1)

        count = total_size * sample_time
        recall10 = recall10 / count
        MRR10_score = MRR10_score / count
        NDCG_score = NDCG_score / count
        avg_rate = float(rate_sum) / total_size

        logging.info('Recall@10 {}'.format(recall10))
        logging.info('MRR@10 1/rate {}'.format(MRR10_score))
        logging.info('NDCG@10 {}'.format(NDCG_score))
        logging.info('Average rate {}'.format(avg_rate))

        from log import log_results
        log_results(result_dir, current_epoch, recall10, MRR10_score,
                    NDCG_score, avg_rate, cost, name)

    def onehot2int(onehot_vec):
        # convert onehot vector to index
        ret = []
        for onehot in onehot_vec:
            ret.append(onehot.tolist().index(1))
        return ret

    def get_short_test_data(length):
        # generate short sequence in the test_data.
        test_x = test_data['x'][:, :length]
        test_mask = test_data['mask'][:, :length]
        test_t = test_data['t'][:, :length] if USE_TIME_INPUT else None
        lengths = np.sum(test_mask, axis=1).astype('int')

        test_y = test_data['y'].copy()
        for idx in range(test_y.shape[0]):
            whole_length = test_data['lengths'][idx]
            if length < whole_length:
                test_y[idx] = test_data['x'][idx, length, :].tolist().index(
                    1) if ONE_HOT else test_data['x'][idx, length, 0]

        return test_x, test_y, test_mask, lengths, test_t

    def evaluate(model, current_epoch, additional_test_length):
        # Evaluate the model
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = test_data['mask']
        lengths = test_data['lengths']
        logging.info(
            '-----------Evaluate Normal:{},{},{}-------------------'.format(
                MODEL_TYPE, DATA_TYPE, N_HIDDEN))
        do_evaluate(current_epoch,
                    test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_batch=TEST_BATCH,
                    name='normal')
        # Evaluate the model on short data
        if additional_test_length > 0:
            logging.info('-----------Evaluate Additional---------------')
            test_x, test_y, test_mask, lengths, test_t = get_short_test_data(
                additional_test_length)
            do_evaluate(current_epoch,
                        test_x,
                        test_y,
                        test_mask,
                        lengths,
                        test_t,
                        test_batch=TEST_BATCH,
                        name='additional_test')
        logging.info('-----------Evaluate End----------------------')
        if not DEBUG:
            utils.save_model(
                '{}-{}-{}-{}'.format(MODEL_TYPE, current_epoch,
                                     DATA_TYPE, N_HIDDEN),
                str(datetime.datetime.now()), model, '_new')

    def add_test_to_train(length):
        logging.info('Length {} test cases added to train set'.format(length))
        global train_data
        logging.info('Old train data size {}'.format(len(train_data['x'])))
        # Remove the train_data added before
        train_data['x'] = train_data['x'][:train_data_size]
        train_data['y'] = train_data['y'][:train_data_size]
        if 't' in train_data:
            train_data['t'] = train_data['t'][:train_data_size]
        test_x = test_data['x']
        lengths = test_data['lengths']
        for idx in range(test_x.shape[0]):
            n_length = length
            # Make sure the complete test case is not added to the train set
            if lengths[idx] <= length:
                n_length = length - 1
            if ONE_HOT:
                # if ONE_HOT is used, we convert one hot vector to int first.
                new_x = onehot2int(test_x[idx, :n_length, :])
                new_y = onehot2int(test_x[idx, 1:n_length + 1, :])
            else:
                new_x = test_x[idx, :n_length, 0]
                new_y = test_x[idx, 1:n_length + 1, 0]
            train_data['x'].append(new_x)
            train_data['y'].append(new_y)
            if 't' in train_data:
                test_t = test_data['t']
                new_t = test_t[idx, :n_length].tolist()
                train_data['t'].append(new_t)
        logging.info('New train data size {}'.format(len(train_data['x'])))
        logging.info('--Data Added--')

    logging.info("Training ...")
    logging.info('Data size {},Max epoch {},Batch {}'.format(
        train_data_size, num_epochs, BATCH_SIZE))

    logging.info("Load pickle")
    utils.load_model("TLSTM3-9-music-128_2019-10-16 14:00:39.099161", l_out)

    eval_lengths = [25, 50, 100, 200]
    max_length = 200

    for seq_length in eval_lengths:
        mask_length = max_length - seq_length
        # Evaluate the model
        logging.info('Evaluate')
        test_x = test_data['x']
        test_y = test_data['y']

        test_mask = np.copy(test_data['mask'])
        test_mask[:, :mask_length] = 1
        lengths = np.minimum(test_data['lengths'], seq_length)

        logging.info(
            '-----------Evaluate length: {}-------------------'.format(
                seq_length))
        # no training epoch here; seq_length serves as the run label in the logs
        do_evaluate(seq_length,
                    test_x,
                    test_y,
                    test_mask,
                    lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_batch=TEST_BATCH)
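The sampled ranking metrics from do_evaluate, isolated as a numpy sketch (toy inputs, not from the source): the ground truth is placed at position 0 of the candidate list, so after sorting by probability its rank is prob_index.index(0) + 1.

import numpy as np

def rank_metrics(sample_probs, k=10):
    # sample_probs: scores of the candidates; index 0 is the ground truth
    order = np.argsort(sample_probs)[::-1].tolist()
    rank = order.index(0) + 1
    if rank > k:
        return 0.0, 0.0, 0.0  # recall@k, MRR@k, NDCG@k
    return 1.0, 1.0 / rank, 1.0 / np.log2(rank + 1)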
Example #8
    sampler = RandomSampler(dataset)
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        sampler=sampler,
                        shuffle=False,
                        num_workers=2)

    #    dataiter = iter(loader)
    #    images, labels = dataiter.next()
    #    print (images)
    #    images=tensor_to_img(images)
    #    print (labels)
    #    print (images)

    net = Net(14 * batch_size)
    lstm = LSTMLayer(7 * 7 * (16 + 5 * 2), 64, 14 * 14 * (num_class + 5 * 2),
                     2, batch_size)
    lossfunction = Loss(batch_size)
    optimizer = optim.Adam([{
        'params': net.parameters()
    }, {
        'params': lstm.parameters(),
        'lr': 0.0001
    }],
                           lr=0,
                           weight_decay=0)
    if load_checkpoint:
        net.load_state_dict(torch.load(SAVE_PATH))

    net.cuda()

    # NOTE: this second optimizer replaces the parameter groups built above,
    # so the LSTM parameters are no longer being optimized from here on.
    optimizer = optim.Adam(net.parameters(), lr=0.0001)
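Per-parameter-group learning rates in isolation, a hedged PyTorch sketch with toy modules (not from the source): each dict carries its own hyperparameters and falls back to the keyword defaults, which is what the first optim.Adam call above sets up.

import torch.nn as nn
import torch.optim as optim

net, lstm = nn.Linear(4, 4), nn.LSTM(4, 4)
optimizer = optim.Adam(
    [{'params': net.parameters()},                # uses the default lr below
     {'params': lstm.parameters(), 'lr': 1e-4}],  # overrides lr for the LSTM
    lr=1e-3, weight_decay=0)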
Example #9
def main(num_epochs=NUM_EPOCHS, vocab_size=VOCAB_SIZE):
    logging.info("Building network ...")
    # (batch size, SEQ_LENGTH, num_features)
    # v: None means the size of that dimension is not fixed at compile time.
    # InputLayer can represent the input of the network; the first dimension
    # of the tensor is usually the batch dimension
    l_in = lasagne.layers.InputLayer(shape=(None, None, NDIM))
    l_mask = lasagne.layers.InputLayer(shape=(None, None))

    # addv
    l_pos = lasagne.layers.InputLayer(shape=(None, None))

    # We now build the LSTM layer which takes l_in as the input layer
    # We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients.
    l_forward = None
    if MODEL_TYPE == 'LSTM' or MODEL_TYPE == 'LSTM_T':
        l_t = lasagne.layers.InputLayer(shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = LSTMLayer(
            l_in,
            time_input=l_t,
            mask_input=l_mask,
            num_units=N_HIDDEN,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=lasagne.layers.Gate(),
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=BN,
            only_return_final=False)
    elif MODEL_TYPE == 'RNN':
        l_t = lasagne.layers.InputLayer(shape=(None, None)) if USE_TIME_INPUT else None
        l_forward = RNNLayer(
            l_in,
            time_input=l_t,
            mask_input=l_mask,
            num_units=N_HIDDEN,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=lasagne.layers.Gate(),
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            bn=BN,
            only_return_final=False)
    elif MODEL_TYPE == 'DTLSTM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_d = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = VDTLSTMLayer(
            l_in,
            time_input=l_t,
            duration_input=l_d,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'DTLSTM_EM':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_d = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = VDTLSTMEMLayer(
            l_in,
            time_input=l_t,
            duration_input=l_d,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    elif MODEL_TYPE == 'TLSTM2':
        l_t = lasagne.layers.InputLayer(shape=(None, None))
        l_forward = VTLSTM2Layer(
            l_in,
            time_input=l_t,
            num_units=N_HIDDEN,
            mask_input=l_mask,
            peepholes=True,
            ingate=lasagne.layers.Gate(),
            forgetgate=lasagne.layers.Gate(),
            cell=lasagne.layers.Gate(W_cell=None, nonlinearity=lasagne.nonlinearities.tanh),
            outgate=OutGate(),
            nonlinearity=lasagne.nonlinearities.tanh,
            cell_init=lasagne.init.Constant(0.),
            hid_init=lasagne.init.Constant(0.),
            grad_clipping=GRAD_CLIP,
            only_return_final=False,
            bn=BN,
        )
    else:
        logging.info('No such model type')
        exit(0)

    target_values = T.matrix('target_values', dtype='int32')

    # v: output layer (N_HIDDEN, vocab_size)
    # this calls the get_output_shape_for() method of l_forward
    # l_forward: (num_batch, sequence_length, num_units)
    l_out = lasagne.layers.DenseLayer(l_forward, num_units=vocab_size, W=lasagne.init.Normal(),
                                      num_leading_axes=2, nonlinearity=None)

    # get the output of the output layer (None, None, 500)
    # this calls the get_output_for() method of l_forward
    # l_out: (num_batch, sequence_length, vocab_size)
    network_output = lasagne.layers.get_output(l_out)

    # (2, 0, 1) -> AxBxC to CxAxB
    # (0, 'x', 1) -> AxB to Ax1xB
    # (1, 'x', 0) -> AxB to Bx1xA
    # (sequence_length, num_batch, vocab_size)
    network_output = network_output.dimshuffle(1, 0, 2)

    def calculate_softmax(n_input):
        return T.nnet.softmax(n_input)

    def merge_cost(n_input, n_target, n_mask, n_pos, cost_prev):
        # flatten the target matrix with ravel
        n_target = n_target.ravel()
        # addv
        # n_pos = T.reshape(n_pos, (5, 1))
        # n_input = n_pos - n_input
        # n_pos = (n_pos - 0.5) * 2
        # n_input = n_input * n_pos

        n_cost = T.nnet.categorical_crossentropy(n_input, n_target)
        n_cost = n_cost * n_mask * n_pos   # * (1.0 - n_pos)
        n_cost = n_cost.sum()
        return cost_prev + n_cost

    network_output_softmax, _ = theano.scan(fn=calculate_softmax, sequences=network_output)

    # The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
    # used below to accumulate the sum of the cross-entropy loss
    m_cost, _ = theano.scan(fn=merge_cost,
                            sequences=[network_output_softmax, target_values.T, l_mask.input_var.T, l_pos.input_var.T],
                            outputs_info=T.constant(0.))
    # m_cost is a sequence; we only need the final accumulated value, m_cost[-1]
    m_cost = m_cost[-1]
    # average the cost over the unmasked steps
    cost = m_cost / l_mask.input_var.sum()

    # convert back to: (batch_size, time_sequence, vocab_size)
    network_output_softmax = network_output_softmax.dimshuffle(1, 0, 2)

    # Compute AdaGrad updates for training
    logging.info("Computing updates ...")
    # get_all_params collects every parameter registered via add_param in the layers
    all_params = lasagne.layers.get_all_params(l_out, trainable=True)
    # update all_params according to the cost, with learning rate LEARNING_RATE
    updates = lasagne.updates.adagrad(cost, all_params, LEARNING_RATE)

    # Theano functions for training, predict
    logging.info("Compiling functions ...")
    input_var = [l_in.input_var, l_mask.input_var]
    # add
    if USE_TIME_INPUT:
        input_var += [l_t.input_var]
        # addv
        if USE_DURATION:
            input_var += [l_d.input_var]

    predict = theano.function(input_var, network_output_softmax, allow_input_downcast=True)
    input_var += [target_values]
    # addv
    input_var.insert(2, l_pos.input_var)

    # v: compute the loss value
    # input_var[l_in.input_var, l_mask.input_var, l_pos.input_var,l_t.input_var,l_d.input_var,target_values]
    train = theano.function(input_var, cost, updates=updates, allow_input_downcast=True)
    # compute_cost return cost but without update
    compute_cost = theano.function(input_var, cost, allow_input_downcast=True)

    # v: evaluation routine
    # addv
    def do_evaluate(test_x, test_y, test_mask, lengths, test_t=None, test_d=None, n=1000, test_batch=5):
        # evaluate and calculate recall@10, MRR@10
        p = 0
        probs_all_time = None  # all predictions
        while True:
            input_var = [test_x[p:p + test_batch], test_mask[p:p + test_batch]]
            if test_t is not None:
                input_var += [test_t[p:p + test_batch]]
                # addv
                if test_d is not None:
                    input_var += [test_d[p:p + test_batch]]
            batch_probs = predict(*input_var)
            p += test_batch
            probs_all_time = batch_probs if probs_all_time is None else np.concatenate([probs_all_time, batch_probs],
                                                                                       axis=0)
            if p >= test_x.shape[0]:
                break

        total_size = test_x.shape[0]
        recall10 = 0.
        MRR10_score = 0.
        rate_sum = 0

        sample_time = SAMPLE_TIME

        # addv
        _rank = []

        for idx in range(total_size):
            gnd = test_y[idx]
            probs = probs_all_time[idx, lengths[idx] - 1, :]  # prediction at the last real step (a vocab_size-dim vector)
            prob_index = np.argsort(probs)[-1::-1].tolist()  # argsort returns indices sorted ascending: [3 1 2] --> [1 2 0]
            gnd_rate = prob_index.index(gnd) + 1
            # this is the rank over the whole vocabulary
            rate_sum += gnd_rate
            # Sample multiple times to reduce randomness
            for _ in range(sample_time):

                # addvv
                samples = np.random.choice(range(vocab_size), vocab_size, replace=False).tolist()
                # make sure the first element is gnd
                # v: after shuffling, index(0) directly gives gnd's rank
                try:
                    samples.remove(gnd)
                    samples.insert(0, gnd)
                except ValueError:
                    samples[0] = gnd

                sample_probs = probs[samples]
                prob_index = np.argsort(sample_probs)[-1::-1].tolist()
                # v: the rank within the random sample
                rate = prob_index.index(0) + 1

                # addvv
                # logging.info('rank:{}'.format(rate))

                # calculate Recall@10 and MRR@10
                # addvc
                if rate <= RANK:
                    recall10 += 1
                    MRR10_score += 1. / rate

        count = total_size * sample_time
        recall10 = recall10 / count
        MRR10_score = MRR10_score / count
        avg_rate = float(rate_sum) / total_size

        logging.info('Recall@10 {}'.format(recall10))
        logging.info('MRR@10 1/rate {}'.format(MRR10_score))
        logging.info('Average rate {}'.format(avg_rate))

    def onehot2int(onehot_vec):
        # convert onehot vector to index
        ret = []
        for onehot in onehot_vec:
            ret.append(onehot.tolist().index(1))
        return ret

    def get_short_test_data(length):
        # generate short sequence in the test_data.
        test_x = test_data['x'][:, :length]
        test_mask = test_data['mask'][:, :length]
        # add
        test_t = test_data['t'][:, :length] if USE_TIME_INPUT else None
        # addv
        test_d = test_data['d'][:, :length] if USE_DURATION else None

        lengths = np.sum(test_mask, axis=1).astype('int')

        test_y = test_data['y'].copy()
        for idx in range(test_y.shape[0]):
            whole_length = test_data['lengths'][idx]
            if length < whole_length:
                test_y[idx] = (test_data['x'][idx, length, :].tolist().index(1)
                               if ONE_HOT else test_data['x'][idx, length, 0])

        return test_x, test_y, test_mask, lengths, test_t, test_d

    def evaluate(model, current_epoch, additional_test_length):
        # Evaluate the model
        logging.info('Evaluate')
        # includes the whole test set
        test_x = test_data['x']
        test_y = test_data['y']
        test_mask = test_data['mask']
        lengths = test_data['lengths']
        logging.info('-----------Evaluate Normal:{},{},{}-------------------'.format(MODEL_TYPE, DATA_TYPE, N_HIDDEN))
        do_evaluate(test_x, test_y, test_mask, lengths,
                    test_data['t'] if USE_TIME_INPUT else None,
                    test_data['d'] if USE_DURATION else None,
                    test_batch=TEST_BATCH)
        # Evaluate the model on short data
        if additional_test_length > 0:
            logging.info('-----------Evaluate Additional---------------')
            # addv
            test_x, test_y, test_mask, lengths, test_t, test_d = get_short_test_data(additional_test_length)
            do_evaluate(test_x, test_y, test_mask, lengths, test_t, test_d, test_batch=TEST_BATCH)
        logging.info('-----------Evaluate End----------------------')
        if not DEBUG:
            vutils.save_model('{}-{}-{}-{}'.format(MODEL_TYPE, current_epoch, DATA_TYPE, N_HIDDEN),
                              str(datetime.datetime.now()), model, '_new')

    def add_test_to_train(length):
        logging.info('Length {} test cases added to train set'.format(length))
        global train_data
        logging.info('Old train data size {}'.format(len(train_data['x'])))
        # Remove the train_data added before
        train_data['x'] = train_data['x'][:train_data_size]
        train_data['y'] = train_data['y'][:train_data_size]
        if 't' in train_data:
            train_data['t'] = train_data['t'][:train_data_size]
            # addv
            if 'd' in train_data:
                train_data['d'] = train_data['d'][:train_data_size]

        test_x = test_data['x']
        lengths = test_data['lengths']
        for idx in range(test_x.shape[0]):
            n_length = length
            # Make sure the complete test case is not added to the train set
            if lengths[idx] <= length:
                n_length = length - 1
            if ONE_HOT:
                # if ONE_HOT is used, we convert one hot vector to int first.
                new_x = onehot2int(test_x[idx, :n_length, :])
                new_y = onehot2int(test_x[idx, 1:n_length + 1, :])
            else:
                new_x = test_x[idx, :n_length, 0]
                new_y = test_x[idx, 1:n_length + 1, 0]
            train_data['x'].append(new_x)
            train_data['y'].append(new_y)
            if 't' in train_data:
                test_t = test_data['t']
                new_t = test_t[idx, :n_length].tolist()
                train_data['t'].append(new_t)

                # addv
                if 'd' in train_data:
                    test_d = test_data['d']
                    new_d = test_d[idx, :n_length].tolist()
                    train_data['d'].append(new_d)

        logging.info('New train data size {}'.format(len(train_data['x'])))
        logging.info('--Data Added--')

    logging.info("Training ...")
    logging.info('Data size {},Max epoch {},Batch {}'.format(train_data_size, num_epochs, BATCH_SIZE))
    p = 0
    current_epoch = 0
    it = 0
    data_size = train_data_size
    last_it = 0  # iteration count at the last epoch boundary
    avg_cost = 0  # average loss value
    avg_seq_len = 0  # average sequence length

    # randomization module
    plist = vutils.genPlist(data_size, BATCH_SIZE)


    try:
        while True:
            randP = plist[p // BATCH_SIZE]  # integer division for the batch index
            batch_data = gen_data(randP, train_data, batch_size=BATCH_SIZE)
            # mask: [[1 1 1 1 1...0 0 0 0 0], [1 1 1 ... 0 0]] the number of 1s is the item-sequence length
            # lengths_x: [1519 1596 ...] each number is the sequence length of one user
            # y: list of next game ids [0 0 0 1 0 ...]; 0 is League of Legends
            x = batch_data['x']
            y = batch_data['y']
            mask = batch_data['mask']
            pos = batch_data['pos']
            avg_seq_len += x.shape[1]

            input_var = [x, mask, pos, y]

            # add
            if USE_TIME_INPUT:
                t = batch_data['t']
                # elapsed time
                input_var.insert(3, t)
                # addv
                if USE_DURATION:
                    d = batch_data['d']
                    input_var.insert(4, d)
            # v: main training step
            # input_var = [x, mask, pos, t, d, y]
            avg_cost += train(*input_var)
            it += 1
            # input_var = [x, mask, t, y]
            p += BATCH_SIZE
            if p >= data_size:  # p >= data_size means one pass over the data has finished
                p = 0
                last_it = it
                current_epoch += 1
                # First stage: Using original train data to train model in #FIXED_EPOCHS
                # Second stage: After that add part of test data to train data.
                # The first stage is using user information with similar interest, and the second stage is using history information
                additional_length = int((current_epoch - FIXED_EPOCHS) * test_data_length / (NUM_EPOCHS - FIXED_EPOCHS))
                evaluate(l_out, current_epoch=current_epoch, additional_test_length=additional_length)
                if current_epoch >= num_epochs:
                    break
                if current_epoch > FIXED_EPOCHS:
                    data_size = train_data_size + test_data_size
                    logging.info('>> length {} test cases added to train set.'.format(additional_length))
                    add_test_to_train(additional_length)
                logging.info('Epoch {} Carriage Return'.format(current_epoch))
            if it % PRINT_FREQ == 0:
                # log once every PRINT_FREQ iterations (PRINT_FREQ * BATCH_SIZE samples)
                # current_epoch: number of completed epochs
                logging.info(
                    "Epoch {}-{}, iter {}, average seq length = {}, average loss = {}"
                    .format(current_epoch,
                            (it - last_it) * 1.0 * BATCH_SIZE / data_size, it,
                            avg_seq_len / PRINT_FREQ, avg_cost / PRINT_FREQ))
                avg_cost = 0
                avg_seq_len = 0
        logging.info('End')
    except KeyboardInterrupt:
        logging.info('Stopped by user interrupt.')
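The test-to-train schedule in isolation, as a sketch using the names from the example (the numeric values are assumptions): after FIXED_EPOCHS, the prefix length added to the train set grows linearly until it covers test_data_length at NUM_EPOCHS.

FIXED_EPOCHS, NUM_EPOCHS, test_data_length = 10, 30, 200  # assumed values
for current_epoch in range(FIXED_EPOCHS + 1, NUM_EPOCHS + 1):
    additional_length = int((current_epoch - FIXED_EPOCHS) * test_data_length
                            / (NUM_EPOCHS - FIXED_EPOCHS))
    print(current_epoch, additional_length)  # 10, 20, ..., 200 with these values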