def find_sent_embedding(n_words=21102, img_w=300, img_h=48, feature_maps=200,
                        filter_hs=[3, 4, 5], n_x=300, n_h=600):

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h

    # one convolutional filter shape / pooling size per filter height
    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    # load the trained parameters and copy them into the Theano shared variables
    data = np.load('./bookcorpus_result.npz')
    for kk, pp in params.iteritems():
        params[kk] = data[kk]
    for kk, pp in params.iteritems():
        tparams[kk].set_value(params[kk])

    x = tensor.matrix('x', dtype='int32')

    # look up word embeddings and reshape to (n_samples, 1, n_steps, emb_dim) for the CNN
    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))

    # one CNN encoder per filter size; their outputs are concatenated
    # into the final sentence embedding
    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape, pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_inputs.append(conv_layer)
    layer1_input = tensor.concatenate(layer1_inputs, 1)

    f_embed = theano.function([x], layer1_input, name='f_embed')

    return f_embed, params
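
# A minimal usage sketch for the compiled f_embed function returned by the
# definition directly above (the one without a corpus argument). It is kept in
# a helper so nothing runs on import, and it reuses the prepare_data_for_cnn
# helper called elsewhere in this code; the name embed_batch itself is only
# illustrative.
def embed_batch(sents):
    f_embed, params = find_sent_embedding()
    x = prepare_data_for_cnn(sents)   # pad/index the batch to img_h positions
    return f_embed(x)                 # array of shape (len(sents), 600)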
def find_sent_embedding(whole, n_words=21102, img_w=300, img_h=48, feature_maps=200,
                        filter_hs=[3, 4, 5], n_x=300, n_h=600):

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    # load the trained parameters and copy them into the Theano shared variables
    data = np.load('./bookcorpus_result.npz')
    for kk, pp in params.iteritems():
        params[kk] = data[kk]
    for kk, pp in params.iteritems():
        tparams[kk].set_value(params[kk])

    x = tensor.matrix('x', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape, pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_inputs.append(conv_layer)
    layer1_input = tensor.concatenate(layer1_inputs, 1)

    f_embed = theano.function([x], layer1_input, name='f_embed')

    # embed the whole corpus in minibatches of 100; the minibatches are
    # contiguous (no shuffling), so the slice assignment below is valid
    kf = get_minibatches_idx(len(whole), 100)
    sent_emb = np.zeros((len(whole), 600))
    for i, train_index in kf:
        sents = [whole[t] for t in train_index]
        x = prepare_data_for_cnn(sents)
        sent_emb[train_index[0]:train_index[-1] + 1] = f_embed(x)
        if i % 500 == 0:
            print i,

    np.savez('./bookcorpus_embedding.npz', sent_emb=sent_emb)

    return sent_emb
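
# Sketch of reading the saved corpus embeddings back; the path and the
# sent_emb key match the np.savez call above, the helper name is illustrative.
def load_sent_embedding(path='./bookcorpus_embedding.npz'):
    data = np.load(path)
    return data['sent_emb']   # array of shape (n_sentences, 600)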
def train_model(train, val, test, n_words=21103, img_w=300, max_len=40, feature_maps=200,
                filter_hs=[3, 4, 5], n_x=300, n_h=600, max_epochs=8, lrate=0.0002,
                batch_size=64, valid_batch_size=64, dispFreq=10, validFreq=500,
                saveFreq=1000, saveto='bookcorpus_result.npz'):
    """ train, val, test : datasets
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300
        max_len : the maximum length of a sentence
        feature_maps : the number of feature maps used
        filter_hs : the filter window sizes used
        n_x : word embedding dimension
        n_h : the number of hidden units in the LSTM
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : the batch size used for the validation/test set
        dispFreq : display the training progress on stdout every N updates
        validFreq : compute the validation error after this number of updates
        saveFreq : save the result after this number of updates
        saveto : where to save the result
    """

    # each sentence is padded with (filter_hs[-1] - 1) positions on both sides,
    # so img_h = 40 + 2 * 4 = 48 with the default settings
    img_h = max_len + 2 * (filter_hs[-1] - 1)

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq

    logger.info('Model options {}'.format(options))
    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    use_noise, x, y, y_mask, cost = build_model(tparams, options)

    f_cost = theano.function([x, y, y_mask], cost, name='f_cost')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y, y_mask], lr)

    logger.info('Training model...')

    history_cost = []
    uidx = 0  # the number of updates done
    start_time = time.time()

    kf_valid = get_minibatches_idx(len(val), valid_batch_size)

    # function that resets the embedding of the <pad_zero> token (row 21102) to zero
    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][21102, :],
                                                              zero_vec_tensor))])

    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.)

                sents = [train[t] for t in train_index]

                x = prepare_data_for_cnn(sents)
                y, y_mask = prepare_data_for_rnn(sents)
                n_samples += y.shape[1]

                cost = f_grad_shared(x, y, y_mask)
                f_update(lrate)
                # the special <pad_zero> token does not need to be updated
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, np.exp(cost)))

                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    params = unzip(tparams)
                    np.savez(saveto, history_cost=history_cost, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    valid_cost = calu_cost(f_cost, prepare_data_for_cnn,
                                           prepare_data_for_rnn, val, kf_valid)
                    history_cost.append([valid_cost])
                    logger.info('Valid {}'.format(np.exp(valid_cost)))

            logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    # if best_p is not None:
    #     zipp(best_p, tparams)
    # else:
    #     best_p = unzip(tparams)

    use_noise.set_value(0.)
    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
    logger.info('Valid {}'.format(np.exp(valid_cost)))

    params = unzip(tparams)
    np.savez(saveto, history_cost=history_cost, **params)

    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return valid_cost
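
# A sketch of how the pieces above might be chained together. load_corpus is a
# hypothetical loader standing in for whatever produces the index-encoded
# train/val/test splits used by this code, and the list concatenation assumes
# the splits are plain Python lists of sentences.
if __name__ == '__main__':
    train, val, test = load_corpus()
    valid_cost = train_model(train, val, test)
    # embed every sentence with the trained model
    # (second find_sent_embedding definition above)
    sent_emb = find_sent_embedding(train + val + test)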