import theano
import theano.tensor as T


class ObliqueForwardNet(object):
    """Runs a GRU over a grid: one pass per predicate row, feeding each
    row's hidden states into the next row as extra input features."""

    def __init__(self, n_h):
        # GRU is the project's recurrent unit; its input is the current row
        # concatenated with the previous row's hidden states (2 * n_h dims).
        self.unit = GRU(n_in=n_h * 2, n_h=n_h)
        self.params = self.unit.params

    def forward_all(self, x, h_prev, h0):
        """
        :param x: 1D: n_prds, 2D: n_words, 3D: batch, 4D: dim_h
        :param h_prev: 1D: n_words, 2D: batch, 3D: dim_h
        :param h0: 1D: batch, 2D: dim_h
        :return: 1D: n_prds, 2D: n_words, 3D: batch, 4D: dim_h
        """
        # Scan over predicates; each step's output row becomes the next
        # step's h_prev.
        h, _ = theano.scan(fn=self.forward_row,
                           sequences=[x],
                           outputs_info=[h_prev],
                           non_sequences=[h0])
        return h

    def forward_row(self, x, h_prev, h0):
        """
        :param x: 1D: n_words, 2D: batch, 3D: dim_h
        :param h_prev: 1D: n_words, 2D: batch, 3D: dim_h
        :param h0: 1D: batch, 2D: dim_h
        :return: 1D: n_words, 2D: batch, 3D: dim_h
        """
        # Concatenate along the feature axis, then run the GRU down the column.
        return self.forward_column(T.concatenate([x, h_prev], axis=2), h0)

    def forward_column(self, x, h):
        """
        :param x: 1D: n_words, 2D: batch, 3D: dim_h * 2
        :param h: 1D: batch, 2D: dim_h
        :return: 1D: n_words, 2D: batch, 3D: dim_h
        """
        return self.unit.forward_all(x, h)
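A minimal symbolic usage sketch, assuming GRU is the project's Theano unit and that its forward_all(x, h0) iterates over the leading (word) axis as the class above expects; shapes follow the docstrings:

import theano
import theano.tensor as T

net = ObliqueForwardNet(n_h=32)

x = T.tensor4('x')            # (n_prds, n_words, batch, dim_h)
h_prev = T.tensor3('h_prev')  # (n_words, batch, dim_h), e.g. zeros for the first pass
h0 = T.matrix('h0')           # (batch, dim_h)

h = net.forward_all(x, h_prev, h0)  # (n_prds, n_words, batch, dim_h)
f = theano.function([x, h_prev, h0], h)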
def __init__(self, rng, embedding, vocab_size, hidden_size, max_length, num_layers=1):
    """
    Model init.

    :param rng: seeded np.random.RandomState.
    :param embedding: decoder embedding.
    :param vocab_size: target vocabulary size.
    :param hidden_size: hidden size of the GRU layer.
    :param max_length: maximum sequence length.
    :param num_layers: number of layers.
    """
    self.embedding = embedding
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.num_layers = num_layers
    self.max_length = max_length
    self.gru_layer = GRU(rng, hidden_size, hidden_size)
    # Output projection from the GRU hidden state to vocabulary logits,
    # initialized with small Gaussian noise.
    self.linear = theano.shared(
        value=(rng.randn(hidden_size, vocab_size) * 0.1).astype(theano.config.floatX),
        name="linear",
        borrow=True)
    self.params = [self.linear]
    self.params += self.gru_layer.params
def __init__(self, rng, embedding, hidden_size, num_layers=1):
    """
    Model init.

    :param rng: seeded np.random.RandomState.
    :param embedding: encoder embedding.
    :param hidden_size: hidden size of the GRU layer.
    :param num_layers: number of layers.
    """
    self.embedding = embedding
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.gru_layer = GRU(rng, hidden_size, hidden_size)
    self.params = []
    self.params += self.gru_layer.params
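A short instantiation sketch for the two constructors above, assuming the enclosing classes are named Encoder and Decoder and that they share one embedding table; the embedding variable here is hypothetical:

import numpy as np
import theano

rng = np.random.RandomState(1234)
vocab_size, hidden_size, max_length = 10000, 256, 50

# Hypothetical shared embedding table, one row per token.
embedding = theano.shared(
    (rng.randn(vocab_size, hidden_size) * 0.1).astype(theano.config.floatX),
    name="embedding", borrow=True)

encoder = Encoder(rng, embedding, hidden_size)
decoder = Decoder(rng, embedding, vocab_size, hidden_size, max_length)
params = encoder.params + decoder.params  # everything the trainer would update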
def __init__(self, rnn_type, input_size, node_fdim, hidden_size, depth):
    super(MPNEncoder, self).__init__()
    self.hidden_size = hidden_size
    self.input_size = input_size
    self.depth = depth
    # Output head: combines raw node features with the aggregated hidden state.
    self.W_o = nn.Sequential(
        nn.Linear(node_fdim + hidden_size, hidden_size),
        nn.ReLU()
    )
    if rnn_type == 'GRU':
        self.rnn = GRU(input_size, hidden_size, depth)
    elif rnn_type == 'LSTM':
        self.rnn = LSTM(input_size, hidden_size, depth)
    else:
        raise ValueError('unsupported rnn cell type ' + rnn_type)
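A brief instantiation sketch with hypothetical feature sizes; GRU and LSTM here are assumed to be the project's message-passing cells (taking a depth argument) rather than torch.nn's built-ins:

node_fdim, edge_fdim = 39, 50
# The cell input is typically the concatenated node and edge feature vector.
encoder = MPNEncoder(rnn_type='GRU',
                     input_size=node_fdim + edge_fdim,
                     node_fdim=node_fdim,
                     hidden_size=250,
                     depth=6)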
def __init__(self, vocab_size, embedding_dim, hidden_dim, n_classes=1,
             bidirectional=False, padding_idx=0, n_layers=1, dropout=0.2):
    super(SentimentGRU, self).__init__()
    self.bridge = nn.Linear(embedding_dim, embedding_dim)
    # The third positional argument of nn.Embedding is padding_idx, so the
    # padding token's embedding is fixed at zero.
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx)
    self.rnn = GRU(embedding_dim, hidden_dim)
    self.out = nn.Linear(hidden_dim * n_layers, n_classes)
    self.n_layers = n_layers
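A quick usage sketch with hypothetical sizes, showing the embedding lookup the constructor sets up (the forward pass itself is not shown in this excerpt):

import torch

model = SentimentGRU(vocab_size=20000, embedding_dim=100, hidden_dim=128)

tokens = torch.randint(1, 20000, (32, 40))  # (batch, seq_len) token ids
emb = model.embedding(tokens)               # (32, 40, 100); id 0 maps to a zero vector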
def __init__(self, word_dim, hidden_dim, output_dim, bptt_truncate=1):
    GRU.__init__(self, word_dim, hidden_dim, output_dim, bptt_truncate)
    # Uniform initialization in [-1/sqrt(word_dim), 1/sqrt(word_dim)].
    bound = np.sqrt(1. / word_dim)
    self.wi = np.random.uniform(-bound, bound, (hidden_dim, word_dim))    # input-to-hidden
    self.wh = np.random.uniform(-bound, bound, (hidden_dim, hidden_dim))  # hidden-to-hidden
    # Draw aw independently; a chained assignment (aw = wh = ...) would bind
    # both names to the same array.
    self.aw = np.random.uniform(-bound, bound, (hidden_dim, hidden_dim))
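The independent draw for aw matters because NumPy arrays are assigned by reference; a tiny sketch of the aliasing pitfall:

import numpy as np

a = b = np.zeros((2, 2))  # one array, two names
a[0, 0] = 1.0
print(b[0, 0])            # 1.0 -- mutating a changes b as well
print(a is b)             # True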
# NOTE ==============================================
# This is where your model code will be called.
if args.model == 'RNN':
    model = RNN(emb_size=args.emb_size, hidden_size=args.hidden_size,
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'GRU':
    model = GRU(emb_size=args.emb_size, hidden_size=args.hidden_size,
                seq_len=args.seq_len, batch_size=args.batch_size,
                vocab_size=vocab_size, num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
elif args.model == 'TRANSFORMER':
    if args.debug:  # use a very small model
        model = TRANSFORMER(vocab_size=vocab_size, n_units=16, n_blocks=2)
    else:
        # Note that we're using num_layers and hidden_size to mean slightly
        # different things here than in the RNNs. The Transformer also has
        # other hyperparameters (such as the number of attention heads)
        # which can change its behavior.
        model = TRANSFORMER(vocab_size=vocab_size, n_units=args.hidden_size,
                            n_blocks=args.num_layers,
                            dropout=1. - args.dp_keep_prob)
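For reference, a hypothetical args namespace that would take the GRU branch above (field names mirror the attributes used; values are purely illustrative):

from argparse import Namespace

args = Namespace(model='GRU', emb_size=200, hidden_size=512,
                 seq_len=35, batch_size=20, num_layers=2,
                 dp_keep_prob=0.65, debug=False)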
def get_lstm(opt):
    print('rnn_type ', opt.rnn_type)
    # LSTM
    if opt.rnn_type == "LSTM":
        core = LSTM.LSTM(opt.input_encoding_size, opt.vocab_size + 1,
                         opt.rnn_size, opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT":
        core = LSTM.LSTM_SOFT_ATT(opt.input_encoding_size, opt.vocab_size + 1,
                                  opt.rnn_size, opt.att_size, opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT":
        core = LSTM.LSTM_DOUBLE_ATT(opt.input_encoding_size, opt.vocab_size + 1,
                                    opt.rnn_size, opt.att_size, opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK":
        core = LSTM.LSTM_SOFT_ATT_STACK(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK":
        core = LSTM.LSTM_DOUBLE_ATT_STACK(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL":
        core = LSTM.LSTM_DOUBLE_ATT_STACK_PARALLEL(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_POLICY":
        core = LSTM.LSTM_DOUBLE_ATT_STACK_PARALLEL_POLICY(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_BN":
        core = LSTM.LSTM_DOUBLE_ATT_STACK_PARALLEL_BN(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_BN_RELU":
        core = LSTM.LSTM_DOUBLE_ATT_STACK_PARALLEL_BN_RELU(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_DROPOUT":
        core = LSTM.LSTM_DOUBLE_ATT_STACK_PARALLEL_DROPOUT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_DROPOUT_SET":
        core = LSTM.LSTM_DOUBLE_ATT_STACK_PARALLEL_DROPOUT_SET(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.rnn_size_list, opt.att_size,
            dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "GRU_DOUBLE_ATT_STACK_PARALLEL_DROPOUT":
        core = GRU.GRU_DOUBLE_ATT_STACK_PARALLEL_DROPOUT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_IT_ATT":
        core = LSTM1.LSTM_IT_ATT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.rnn_size, opt.att_size,
            opt.drop_prob_lm, opt.num_layers, opt.word_input_layer, opt.att_input_layer)
    elif opt.rnn_type == "LSTM_IT_ATT_COMBINE":
        core = LSTM1.LSTM_IT_ATT_COMBINE(
            opt.input_encoding_size, opt.vocab_size + 1, opt.rnn_size, opt.att_size,
            opt.drop_prob_lm, opt.num_layers, opt.word_input_layer, opt.att_input_layer)
    elif opt.rnn_type == "FO_IT_ATT_COMBINE":
        core = LSTM1.FO_IT_ATT_COMBINE(
            opt.input_encoding_size, opt.vocab_size + 1, opt.rnn_size, opt.att_size,
            opt.drop_prob_lm, opt.num_layers, opt.word_input_layer, opt.att_input_layer)
    elif opt.rnn_type == "CONV_IT_ATT_COMBINE":
        core = LSTM1.CONV_IT_ATT_COMBINE(
            opt.input_encoding_size, opt.vocab_size + 1, opt.rnn_size, opt.att_size,
            opt.drop_prob_lm, opt.num_layers, opt.word_input_layer, opt.att_input_layer)
    elif opt.rnn_type == "CONV_LSTM":
        core = LSTM1.CONV_LSTM(
            opt.input_encoding_size, opt.vocab_size + 1, opt.rnn_size,
            opt.drop_prob_lm, opt.num_layers, opt.block_num, opt.use_proj_mul)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_NEW":
        core = LSTM1.LSTM_DOUBLE_ATT_STACK_PARALLEL(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT":
        core = LSTM1.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_NEW":
        core = LSTM1.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_NEW(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT":
        core = LSTM2.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_WITH_BU":
        core = LSTM2.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_WITH_BU(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.bu_size,
            dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_NEW":
        core = LSTM2.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_NEW(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_LSTM_MUL":
        core = LSTM2.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_LSTM_MUL(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.drop_prob_lm,
            opt.block_num)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_A":
        core = LSTM2.LSTM_DOUBLE_ATT_STACK_PARALLEL_A(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL":
        core = LSTM2.LSTM_SOFT_ATT_STACK_PARALLEL(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_WITH_WEIGHT":
        core = LSTM2.LSTM_SOFT_ATT_STACK_PARALLEL_WITH_WEIGHT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_WITH_MUL_WEIGHT":
        core = LSTM2.LSTM_SOFT_ATT_STACK_PARALLEL_WITH_MUL_WEIGHT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_WITH_WEIGHT":
        core = LSTM2.LSTM_DOUBLE_ATT_STACK_PARALLEL_MUL_OUT_ATT_WITH_WEIGHT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_WITH_WEIGHT_SPP":
        core = LSTM3.LSTM_SOFT_ATT_STACK_PARALLEL_WITH_WEIGHT_SPP(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.pool_size,
            opt.spp_num, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_SPP":
        core = LSTM3.LSTM_SOFT_ATT_STACK_PARALLEL_SPP(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.pool_size,
            opt.spp_num, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_MEMORY":
        core = LSTM4.LSTM_SOFT_ATT_STACK_PARALLEL_MEMORY(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.memory_num_hop,
            dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_NO_MEMORY":
        core = LSTM4.LSTM_SOFT_ATT_STACK_PARALLEL_NO_MEMORY(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_WITH_WEIGHT_BU":
        core = LSTM5.LSTM_SOFT_ATT_STACK_PARALLEL_WITH_WEIGHT_BU(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.bu_size,
            dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_C_S_ATT_STACK_PARALLEL_WITH_WEIGHT_BU":
        core = LSTM5.LSTM_C_S_ATT_STACK_PARALLEL_WITH_WEIGHT_BU(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.bu_size,
            dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_WITH_TOP_DOWN_ATTEN":
        core = LSTM6.LSTM_WITH_TOP_DOWN_ATTEN(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, opt.bu_size,
            opt.bu_num, dropout=opt.drop_prob_lm)
    elif opt.rnn_type == "LSTM_SOFT_ATT_STACK_PARALLEL_WITH_FC_WEIGHT":
        core = LSTM2.LSTM_SOFT_ATT_STACK_PARALLEL_WITH_FC_WEIGHT(
            opt.input_encoding_size, opt.vocab_size + 1, opt.num_layers,
            opt.num_parallels, opt.rnn_size, opt.att_size, dropout=opt.drop_prob_lm)
    else:
        raise Exception("rnn type not supported: {}".format(opt.rnn_type))
    return core
def __init__(self, n_h):
    self.unit = GRU(n_in=n_h * 2, n_h=n_h)
    self.params = self.unit.params
loss_log = []

# Select the recurrent cell by name.
if 'ResNestedLSTM' == rnn_type:
    rnn = ResNestedLSTM(x_size, state_size, layer_norm=layer_norm)
elif 'ResLSTM' == rnn_type:
    rnn = ResLSTM(x_size, state_size, layer_norm=layer_norm)
elif 'ResRNN' == rnn_type:
    rnn = ResRNN(x_size, state_size, layer_norm=layer_norm)
elif 'NestedLSTM' == rnn_type:
    rnn = NestedLSTM(x_size, state_size, layer_norm=layer_norm)
elif 'LSTM' == rnn_type:
    rnn = LSTM(x_size, state_size, layer_norm=layer_norm)
elif 'DoubleGRU' == rnn_type:
    rnn = DoubleGRU(x_size, state_size, layer_norm=layer_norm)
elif 'GRU' == rnn_type:
    rnn = GRU(x_size, state_size, layer_norm=layer_norm)
elif 'ResGRU' == rnn_type:
    rnn = ResGRU(x_size, state_size, layer_norm=layer_norm)
else:
    raise ValueError('unsupported rnn type: {}'.format(rnn_type))

# Plain SGD optimizer.
optimizer = optim.SGD(rnn.parameters(), lr=lr)
optimizer.zero_grad()
classifier = Variable(rand_vector.clone(), requires_grad=True)

for i in range(n_epochs):
    # Detach inputs from any previous graph before the new epoch.
    X, Y = Variable(X.data), Variable(Y.data)
    state_vars = [Variable(torch.zeros(batch_size, state_size))
                  for _ in range(rnn.n_state_vars)]
    for j in range(X.shape[1]):
        x, y = X[:, j], Y[:, j]
        prediction, state_vars = pred_fxn(rnn, state_vars, classifier,