def param_init_lstm(options, params, prefix='lstm', nin=None, dim=None):
    """
    Stack the weight matrices for all the gates
    for much cleaner code and slightly faster dot-prods.
    """
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # input weights
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # for the previous hidden activation
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    return params
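# The stacked layout above is what lets the recurrence do a single dot product
# for all four gates and then split the pre-activations per gate. Below is a
# minimal sketch of such a slicing helper and the assumed gate order
# (i, f, o, c); the repository's own slice function may differ in detail.
def _slice(x, n, dim):
    # take the n-th block of width `dim` along the last axis
    if x.ndim == 3:
        return x[:, :, n * dim:(n + 1) * dim]
    return x[:, n * dim:(n + 1) * dim]

# inside the step function, `preact` has shape (..., 4 * dim):
#   i = sigmoid(_slice(preact, 0, dim))   # input gate
#   f = sigmoid(_slice(preact, 1, dim))   # forget gate
#   o = sigmoid(_slice(preact, 2, dim))   # output gate
#   c = tanh(_slice(preact, 3, dim))      # candidate cell state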
def param_init_attention(options, params, prefix='attention'):
    dim_word = options['dim_word']
    params[_p(prefix, 'Wm')] = norm_weight(dim_word)
    params[_p(prefix, 'b')] = numpy.zeros((dim_word,), dtype='float32')
    params[_p(prefix, 'W_att')] = norm_weight(dim_word)
    params[_p(prefix, 'U_att')] = norm_weight(dim_word, 1)
    params[_p(prefix, 'c_att')] = numpy.zeros((1,), dtype='float32')

    return params
def param_init_lstm_cond(options, params, prefix='lstm_cond',
                         nin=None, dim=None, dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']

    # input to LSTM; as above, we stack the matrices for compactness, do one
    # dot product, and use the slice function to get the activations for each "gate"
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: last context -> hidden
    Wct_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wct_att')] = Wct_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # optional "deep" attention
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            params[_p(prefix, 'W_att_%d' % lidx)] = ortho_weight(dimctx)
            params[_p(prefix, 'b_att_%d' % lidx)] = numpy.zeros((dimctx,)).astype('float32')

    # attention: projection to a scalar score per location
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    return params
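# A minimal numpy sketch of how the attention parameters initialised above are
# typically combined into soft attention weights. The wiring follows the
# standard soft-attention formulation; the function name, variable names and
# example shapes are assumptions for illustration, not taken from the layer
# code itself.
import numpy

def soft_attention_sketch(context, h, Wc_att, Wd_att, b_att, U_att, c_att):
    # context: (n_locations, dimctx) annotation vectors, h: (dim,) LSTM state
    pctx = context.dot(Wc_att) + b_att                          # context -> hidden
    pstate = h.dot(Wd_att)                                      # LSTM -> hidden
    e = numpy.tanh(pctx + pstate).dot(U_att)[:, 0] + c_att[0]   # unnormalised scores
    alpha = numpy.exp(e - e.max())
    alpha /= alpha.sum()                                        # softmax over locations
    ctx_ = (context * alpha[:, None]).sum(axis=0)               # expected context vector
    return ctx_, alpha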
def param_init_lstm_cond_nox(options, params, prefix='lstm_cond_nox',
                             dim=None, dimctx=None):
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']

    # LSTM to LSTM
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # from context to gates
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: last context -> hidden (currently unused)
    # Wct_att = norm_weight(dimctx, ortho=False)
    # params[_p(prefix, 'Wct_att')] = Wct_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # attention: projection to a scalar score per location
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_att')] = c_att

    return params
def param_init_fflayer(options, params, prefix='ff',
                       nin=None, nout=None, ortho=True, flag=False):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)

    if flag:
        # data-dependent bias: initialise to the log-odds of the MNIST pixel
        # marginals, so a sigmoid output layer starts out predicting the data mean
        # params[_p(prefix, 'b')] = np.full(nout, -1).astype('float32')
        import gzip
        import pickle
        with gzip.open('mnist.pkl.gz', 'rb') as f:
            train_set, _, _ = pickle.load(f)
        train_x, train_y = train_set
        marginals = np.clip(train_x.mean(axis=0), 1e-7, 1 - 1e-7)
        initial_biases = np.log(marginals / (1 - marginals))
        params[_p(prefix, 'b')] = initial_biases.astype('float32')
    else:
        params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    return params
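# Why the logit of the marginals: with that bias, a sigmoid output unit
# initially reproduces each pixel's training-set mean. A quick check of the
# identity sigmoid(log(p / (1 - p))) == p on synthetic data (pure numpy,
# not part of the model code):
import numpy as np

pixel_means = np.clip(np.random.rand(784).astype('float32'), 1e-7, 1 - 1e-7)
b = np.log(pixel_means / (1 - pixel_means))   # logit of the marginals
recovered = 1.0 / (1.0 + np.exp(-b))          # sigmoid(b)
assert np.allclose(recovered, pixel_means, atol=1e-5)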
def param_init_mlp(options, params, prefix='predictor'):
    dims = options['dims']
    layer_num = len(dims)
    assert layer_num >= 3
    for i in range(layer_num - 1):
        W = norm_weight(dims[i], dims[i + 1])
        params[_p(prefix, i)] = W

    return params
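# A minimal forward-pass sketch for a predictor initialised by param_init_mlp
# (numpy only). The tanh hidden nonlinearity, the final softmax and the
# `_p_sketch` naming helper are assumptions for illustration; the real
# predictor layer may differ.
import numpy

def _p_sketch(pp, name):
    # assumed to mirror the repository's `_p` key-naming convention
    return '%s_%s' % (pp, name)

def mlp_forward_sketch(params, x, dims, prefix='predictor'):
    h = x
    for i in range(len(dims) - 1):
        h = h.dot(params[_p_sketch(prefix, i)])
        if i < len(dims) - 2:
            h = numpy.tanh(h)                      # hidden nonlinearity (assumed)
    e = numpy.exp(h - h.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)       # softmax output (assumed)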
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01)
    params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')

    return params
def param_init_fflayer(options, params, prefix='ff', prefix_bnorm='bnorm',
                       nin=None, nout=None, ortho=True, batch_norm=False):
    if _p(prefix, 'W') in params:
        print 'this layer is already present'
    else:
        params[_p(prefix, 'W')] = norm_weight(nin, nout)
        params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    return params
def param_init_fflayer(options, params, prefix='ff',
                       nin=None, nout=None, ortho=True, flag=False):
    params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[_p(prefix, 'b')] = np.zeros((nout,)).astype('float32')

    return params
def init_params(options):
    params = OrderedDict()
    # embedding: [matrix E in paper]
    params['Wemb'] = norm_weight(options['n_words'], options['dim_word'])
    ctx_dim = options['ctx_dim']

    # potential feature that runs an LSTM over the annotation vectors
    if options['lstm_encoder']:
        # encoder: LSTM
        params = get_layer('lstm')[0](options, params, prefix='encoder',
                                      nin=options['ctx_dim'], dim=options['dim'])
        params = get_layer('lstm')[0](options, params, prefix='encoder_rev',
                                      nin=options['ctx_dim'], dim=options['dim'])
        ctx_dim = options['dim'] * 2

    # init_state, init_cell: [top right on page 4]
    for lidx in xrange(1, options['n_layers_init']):
        params = get_layer('ff')[0](options, params, prefix='ff_init_%d' % lidx,
                                    nin=ctx_dim, nout=ctx_dim)
    params = get_layer('ff')[0](options, params, prefix='ff_state',
                                nin=ctx_dim, nout=options['dim'])
    params = get_layer('ff')[0](options, params, prefix='ff_memory',
                                nin=ctx_dim, nout=options['dim'])

    # decoder: LSTM: [equation (1)/(2)/(3)]
    params = get_layer('lstm_cond')[0](options, params, prefix='decoder',
                                       nin=options['dim_word'], dim=options['dim'],
                                       dimctx=ctx_dim)

    # potentially deep decoder (warning: should work but somewhat untested)
    if options['n_layers_lstm'] > 1:
        for lidx in xrange(1, options['n_layers_lstm']):
            params = get_layer('ff')[0](options, params, prefix='ff_state_%d' % lidx,
                                        nin=options['ctx_dim'], nout=options['dim'])
            params = get_layer('ff')[0](options, params, prefix='ff_memory_%d' % lidx,
                                        nin=options['ctx_dim'], nout=options['dim'])
            params = get_layer('lstm_cond')[0](options, params, prefix='decoder_%d' % lidx,
                                               nin=options['dim'], dim=options['dim'],
                                               dimctx=ctx_dim)

    # readout: [equation (7)]
    params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
                                nin=options['dim'], nout=options['dim_word'])
    if options['ctx2out']:
        params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
                                    nin=ctx_dim, nout=options['dim_word'])
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            params = get_layer('ff')[0](options, params, prefix='ff_logit_h%d' % lidx,
                                        nin=options['dim_word'], nout=options['dim_word'])
    params = get_layer('ff')[0](options, params, prefix='ff_logit',
                                nin=options['dim_word'], nout=options['n_words'])

    return params
def param_init_lstm_cond(options, params, prefix='lstm_cond',
                         nin=None, dim=None, dimctx=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']

    # input to LSTM; as above, we stack the matrices for compactness, do one
    # dot product, and use the slice function to get the activations for each "gate"
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[_p(prefix, 'W')] = W

    # LSTM to LSTM
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[_p(prefix, 'U')] = U

    # bias to LSTM
    params[_p(prefix, 'b')] = numpy.zeros((4 * dim,)).astype('float32')

    # context to LSTM
    Wc = norm_weight(dimctx, dim * 4)
    params[_p(prefix, 'Wc')] = Wc

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx, ortho=False)
    params[_p(prefix, 'Wc_att')] = Wc_att

    # attention: LSTM -> hidden
    Wd_att = norm_weight(dim, dimctx)
    params[_p(prefix, 'Wd_att')] = Wd_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[_p(prefix, 'b_att')] = b_att

    # optional "deep" attention
    if options['n_layers_att'] > 1:
        for lidx in xrange(1, options['n_layers_att']):
            params[_p(prefix, 'W_att_%d' % lidx)] = ortho_weight(dimctx)
            params[_p(prefix, 'b_att_%d' % lidx)] = numpy.zeros((dimctx,)).astype('float32')

    # attention: projection to a scalar score per location
    U_att = norm_weight(dimctx, 1)
    params[_p(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[_p(prefix, 'c_tt')] = c_att

    if options['selector']:
        # attention: selector
        W_sel = norm_weight(dim, 1)
        params[_p(prefix, 'W_sel')] = W_sel
        b_sel = numpy.float32(0.)
        params[_p(prefix, 'b_sel')] = b_sel

    return params
def init_params(options):
    params = OrderedDict()

    # Visual concept embedding
    if not options['with_glove']:
        params['VCemb'] = norm_weight(options['n_words'], options['dim_word'])

    # embedding: [matrix E in paper]
    params['Wemb'] = norm_weight(options['n_words'], options['dim_word'])
    # params = get_layer('ff')[0](options, params, prefix='CNNTrans',
    #                             nin=options['cnn_dim'], nout=options['dim'])
    ctx_dim = options['ctx_dim']

    # potential feature that runs an LSTM over the annotation vectors
    if options['lstm_encoder']:
        # use input attentive encoder
        params = get_layer('lstm_cond_nox')[0](options, params, prefix='encoder',
                                               dim=ctx_dim,
                                               dimctx=options['semantic_dim'])

    # potentially deep decoder (warning: should work but somewhat untested)
    for lidx in range(options['n_layers_lstm']):
        ff_state_prefix = 'CNNTrans_%d' % lidx if lidx > 0 else 'CNNTrans'
        ff_memory_prefix = 'CNN_memory_%d' % lidx if lidx > 0 else 'CNN_memory'
        lstm_prefix = 'decoder_%d' % lidx if lidx > 0 else 'decoder'
        nin_lstm = options['dim'] if lidx > 0 else options['dim_word']
        params = get_layer('ff')[0](options, params, prefix=ff_state_prefix,
                                    nin=options['cnn_dim'], nout=options['dim'])
        params = get_layer('ff')[0](options, params, prefix=ff_memory_prefix,
                                    nin=options['cnn_dim'], nout=options['dim'])
        params = get_layer('lstm_cond')[0](options, params, prefix=lstm_prefix,
                                           nin=nin_lstm, dim=options['dim'],
                                           dimctx=ctx_dim)

    # readout: [equation (7)]
    params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
                                nin=options['dim'], nout=options['dim_word'])
    if options['ctx2out']:
        params = get_layer('ff')[0](options, params, prefix='ff_logit_ctx',
                                    nin=ctx_dim, nout=options['dim_word'])
    if options['n_layers_out'] > 1:
        for lidx in xrange(1, options['n_layers_out']):
            params = get_layer('ff')[0](options, params, prefix='ff_logit_h%d' % lidx,
                                        nin=options['dim_word'], nout=options['dim_word'])
    params = get_layer('ff')[0](options, params, prefix='ff_logit',
                                nin=options['dim_word'], nout=options['n_words'])

    return params
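# In this family of codebases, the OrderedDict of numpy arrays returned by
# init_params is usually converted to Theano shared variables before the
# computation graph is built. A typical helper looks like the sketch below;
# it is assumed, not shown in this file.
import theano
from collections import OrderedDict

def init_tparams(params):
    tparams = OrderedDict()
    for kk, vv in params.iteritems():          # Python 2, matching the code above
        tparams[kk] = theano.shared(vv, name=kk)
    return tparams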