def build_network(self, num_labels, features, max_len=None, hidden_units=None, l2=None, use_cnn=None, cnn_filter_size=None, cnn_pool_size=None, cnn_num_filters=None, cnn_filter_sizes=None, embedding_size=None, DEBUG=False):
    """Build the sigmoid-output scoring network applied to both entities.

    A single MLP (one Rectifier hidden layer) and a single output projection
    are shared between the two entity feature vectors, so both entities are
    scored by the same parameters.

    :param num_labels: Number of labels to classify
    :param features: the input features we use
    :param max_len: Configured window-size (unused here)
    :param hidden_units: Number of units in the MLP's hidden layer; falls
        back to ``self._config['hidden_units']`` when not given
    :param DEBUG: when True, wraps the first entity's output in debug_print
    :returns: sigmoid outputs for entity 1 and entity 2, plus the hidden
        (pre-output) representations of both entities
    """
    logger.info(
        'building the network, with one CNN for left and one for right')
    # Fall back to the configured hidden size when the caller passes None.
    hidden_units = hidden_units or self._config['hidden_units']
    logger.info('#hidden units: %d', hidden_units)
    # Feature vectors for both entities come from the same extractor.
    feat_e1, feat_e2, feat_dim = self.build_feature_vector_noMention(features)
    logger.info('feature vector size: %d', feat_dim)
    # One shared hidden layer applied to each entity's features.
    shared_mlp = MLP(activations=[Rectifier()],
                     dims=[feat_dim, hidden_units],
                     seed=self.curSeed)
    initialize([shared_mlp])
    before_out_e1 = shared_mlp.apply(feat_e1)
    before_out_e2 = shared_mlp.apply(feat_e2)
    # Shared output projection to label scores.
    out_proj = Linear(name='hidden_to_output',
                      input_dim=hidden_units,
                      output_dim=num_labels)
    initialize([out_proj])
    logits_e1 = out_proj.apply(before_out_e1)
    logits_e2 = out_proj.apply(before_out_e2)
    logits_e1.name = 'linear_output_e1'
    logits_e2.name = 'linear_output_e2'
    # Element-wise sigmoid turns scores into per-label probabilities.
    y_hat_e1 = Logistic(name='logistic1').apply(logits_e1)
    y_hat_e2 = Logistic(name='logistic2').apply(logits_e2)
    y_hat_e1.name = 'y_hat_e1'
    y_hat_e2.name = 'y_hat_e2'
    y_hat_e1 = debug_print(y_hat_e1, 'y_1', DEBUG)
    return y_hat_e1, y_hat_e2, before_out_e1, before_out_e2
def softmax_layer(h, y, hidden_size, num_targets, cost_fn='cross'):
    """Project hidden state ``h`` to label scores and build the training cost.

    :param h: hidden representation (batch x hidden_size)
    :param y: int matrix of gold labels (batch x num_targets)
    :param hidden_size: dimensionality of ``h``
    :param num_targets: number of output labels
    :param cost_fn: 'ranking' selects the ranking loss; anything else uses
        sigmoid + cross-entropy
    :returns: (cost, precision@1, updates-or-None, misclassification rate)
    """
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    # precision@1: fraction of examples whose top-scored label is a gold label.
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred],
                                     'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = None
    if 'ranking' in cost_fn:
        cost, updates = ranking_loss(linear_output, y)
        # Fixed: was a Python-2 print statement (SyntaxError on Python 3).
        print('using ranking loss function!')
    else:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    # NOTE(review): this thresholds the raw linear scores at 0.5, not the
    # sigmoid outputs (build_model_new uses T.ge(y_hat, 0.5)) — confirm
    # whether the logit threshold here is intentional.
    misclassify_rate = MultiMisclassificationRate().apply(
        y, T.ge(linear_output, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, updates, misclassify_rate
def softmax_layer_old(h, y, hidden_size, num_targets, cost_fn='softmax'):
    """Older output layer: project ``h`` to label scores and build the cost.

    :param h: hidden representation (batch x hidden_size)
    :param y: int matrix of gold labels (batch x num_targets)
    :param hidden_size: dimensionality of ``h``
    :param num_targets: number of output labels
    :param cost_fn: 'softmax' selects sigmoid + cross-entropy; anything else
        uses the ranking loss
    :returns: (cost, precision@1, updates)
    """
    # Output projection shared by both cost variants.
    out_layer = Linear(name='hidden_to_output', input_dim=hidden_size,
                       output_dim=num_targets)
    initialize([out_layer])
    scores = out_layer.apply(h)
    scores.name = 'linear_output'
    # precision@1: how often the top-scored label is one of the gold labels.
    top_label = T.argmax(scores, axis=1)
    gold_at_top = debug_print(y[T.arange(y.shape[0]), top_label],
                              'label_of_predicted', False)
    pat1 = T.mean(gold_at_top)
    updates = {}
    if 'softmax' not in cost_fn:
        # Ranking loss may also produce its own parameter updates.
        cost, updates = ranking_loss(scores, y)
    else:
        probs = Logistic().apply(scores)
        probs.name = 'y_hat'
        cost = cross_entropy_loss(probs, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    return cost, pat1, updates
def softmax_layer(h, y, hidden_size, num_targets, cost_fn='cross'):
    """Project hidden state ``h`` to label scores and build the training cost.

    NOTE(review): this file contains two identical definitions of
    ``softmax_layer``; at import time this later one wins.

    :param h: hidden representation (batch x hidden_size)
    :param y: int matrix of gold labels (batch x num_targets)
    :param hidden_size: dimensionality of ``h``
    :param num_targets: number of output labels
    :param cost_fn: 'ranking' selects the ranking loss; anything else uses
        sigmoid + cross-entropy
    :returns: (cost, precision@1, updates-or-None, misclassification rate)
    """
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    # precision@1: fraction of examples whose top-scored label is a gold label.
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred],
                                     'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = None
    if 'ranking' in cost_fn:
        cost, updates = ranking_loss(linear_output, y)
        # Fixed: was a Python-2 print statement (SyntaxError on Python 3).
        print('using ranking loss function!')
    else:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    # NOTE(review): thresholds raw linear scores at 0.5 rather than the
    # sigmoid outputs — confirm this is intentional.
    misclassify_rate = MultiMisclassificationRate().apply(
        y, T.ge(linear_output, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, updates, misclassify_rate
def build_model_new(fea2obj, num_targets, config, kl_weight, entropy_weight, deterministic=False, test=False):
    """Build the full multi-label classification graph.

    Pipeline: feature vector -> optional dropout -> optional highway layer ->
    one or two ReLU MLP layers -> linear projection + sigmoid (initial guess
    y_hat_init) -> optionally a Gaussian latent-variable refinement (VAE, or a
    prior-network "recursion" over y_hat_init).

    :param fea2obj: mapping of feature names to feature objects, consumed by
        build_feature_vec
    :param num_targets: number of output labels
    :param config: dict-like configuration; keys read here include
        hidden_units, use_highway, use_gaus, use_rec, n_latent, use_noise,
        use_vae, hu_decoder, init_with_type, dropout, dsdir, vae_cond
    :param kl_weight: weight passed to the conditional/basic VAE builders
    :param entropy_weight: weight passed to the conditional VAE builder
    :param deterministic: passed to the latent sampler (disables sampling noise)
    :param test: test-time flag; switches dropout to expectation scaling
    :returns: (cost, p@1, y_hat, mean KLD, mean cross-entropy, p@1 of the
        recognition output, misclassification rate)
    """
    # hidden_units is a whitespace-separated string, e.g. "400" or "400 200".
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(config['use_gaus']) if 'use_gaus' in config else False
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    # NOTE(review): this guards on 'use_gaus' but reads config['n_latent'] —
    # looks like a copy-paste typo for "'n_latent' in config"; it raises
    # KeyError when use_gaus is configured without n_latent. Confirm intent.
    n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0
    use_noise = str_to_bool(config['use_noise']) if 'use_noise' in config else False
    use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False
    # NOTE(review): the fallback leaves hu_decoder as the hidden_size *list*,
    # while the configured path yields an int — confirm downstream handling.
    hu_decoder = int(config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info(
        'use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s',
        use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z,
        hu_decoder, hidden_size)
    init_with_type = str_to_bool(config['init_with_type']) if 'init_with_type' in config else False
    # Gold labels: int32 matrix (batch x num_targets).
    y = T.matrix('targets', dtype='int32')
    drop_prob = float(config['dropout']) if 'dropout' in config else 0
    #build the feature vector with one model, e.g., with cnn or mean or lstm
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)
    #drop out
    if drop_prob > 0:
        mask = T.cast(
            srng.binomial(n=1, p=1 - drop_prob, size=feature_vec.shape),
            'float32')
        if test:
            # Test time: scale by keep-probability instead of sampling a mask.
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask
    #Highway network: z = t * g(x) + (1 - t) * x with a Logistic gate t.
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()],
                    dims=[feature_vec_len, feature_vec_len], name='g_mlp')
        t_mlp = MLP(activations=[Logistic()],
                    dims=[feature_vec_len, feature_vec_len], name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z
    #MLP(s)
    logger.info('feature vec length = %s and hidden layer units = %s',
                feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        #2 MLP on feature fector
        mlp = MLP(
            activations=[Rectifier(), Rectifier()],
            dims=[feature_vec_len, int(hidden_size[0]), int(hidden_size[1])],
            name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        # NOTE(review): rebinds hidden_size from list to int here; in the
        # two-layer case it stays a list when later passed as hu_encoder.
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()],
                  dims=[feature_vec_len, hidden_size], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size
    #compute y_hat initial guess
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=last_hidden_size,
                              output_dim=num_targets)
    typemfile = None
    if init_with_type:
        # Optionally initialize the output layer from a type-matrix file.
        typemfile = config['dsdir'] + '/_typematrix.npy'
        #typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'
    initialize_lasthid(hidden_to_output, typemfile)
    # initialize([hidden_to_output])
    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name = 'y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    # Defaults when no Gaussian latent refinement is used.
    logpy_xz = logpy_xz_init
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0
    if use_gaus:
        if use_vae:
            logger.info('using VAE')
            vae_conditional = str_to_bool(config['vae_cond'])
            if vae_conditional:
                # Conditional VAE also returns its own recognition output.
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(
                    kl_weight, entropy_weight, y_hat_init, feature_vec,
                    feature_vec_len, config, y, test=test,
                    deterministic=deterministic, num_targets=num_targets,
                    n_latent_z=n_latent_z, hidden_size=hidden_size,
                    hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(
                    kl_weight, feature_vec, feature_vec_len, config, y,
                    test=test, deterministic=deterministic,
                    num_targets=num_targets, n_latent_z=n_latent_z,
                    hidden_size=hidden_size, hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
                # Prior network conditions on the features AND the initial
                # guess, then the generated output is averaged with it.
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in, n_input=feature_vec_len + num_targets,
                    hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior,
                                  deterministic=deterministic,
                                  use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(
                    zl, n_latent=n_latent_z + feature_vec_len,
                    hu_decoder=hu_decoder, n_out=num_targets, y=y)
                y_hat = (y_hat + y_hat_init) / 2.
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                # Prior network conditions on the features only.
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in, n_input=feature_vec_len,
                    hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior,
                                  deterministic=deterministic,
                                  use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(
                    zl, n_latent=n_latent_z + feature_vec_len,
                    hu_decoder=hu_decoder, n_out=num_targets, y=y)
            # NOTE(review): reconstructed from a whitespace-mangled source;
            # placement after both branches is the reading taken here.
            y_hat_recog = y_hat
    y_hat = debug_print(y_hat, 'y_hat', False)
    # p@1 on the final output and on the recognition output.
    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    # KLD stays 0 unless the Gaussian latent path set it.
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross
    cost.name = 'cost'
    mean_kld.name = 'kld'
    mean_cross.name = 'cross_entropy_loss'
    pat1.name = 'p@1'
    pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
def build_model_new(fea2obj, num_targets, config, kl_weight, entropy_weight, deterministic=False, test=False):
    """Build the full multi-label classification graph.

    NOTE(review): this is a duplicate definition of ``build_model_new``; at
    import time this later one wins.

    Pipeline: feature vector -> optional dropout -> optional highway layer ->
    one or two ReLU MLP layers -> linear projection + sigmoid (initial guess
    y_hat_init) -> optionally a Gaussian latent-variable refinement (VAE, or a
    prior-network "recursion" over y_hat_init).

    :param fea2obj: mapping of feature names to feature objects, consumed by
        build_feature_vec
    :param num_targets: number of output labels
    :param config: dict-like configuration (hidden_units, use_highway,
        use_gaus, use_rec, n_latent, use_noise, use_vae, hu_decoder,
        init_with_type, dropout, dsdir, vae_cond)
    :param kl_weight: weight passed to the conditional/basic VAE builders
    :param entropy_weight: weight passed to the conditional VAE builder
    :param deterministic: passed to the latent sampler (disables sampling noise)
    :param test: test-time flag; switches dropout to expectation scaling
    :returns: (cost, p@1, y_hat, mean KLD, mean cross-entropy, p@1 of the
        recognition output, misclassification rate)
    """
    # hidden_units is a whitespace-separated string, e.g. "400" or "400 200".
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(config['use_gaus']) if 'use_gaus' in config else False
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    # NOTE(review): guards on 'use_gaus' but reads config['n_latent'] —
    # likely intended "'n_latent' in config"; confirm.
    n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0
    use_noise = str_to_bool(config['use_noise']) if 'use_noise' in config else False
    use_vae=str_to_bool(config['use_vae']) if 'use_vae' in config else False
    # NOTE(review): fallback leaves hu_decoder as the hidden_size *list*.
    hu_decoder = int(config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info('use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s', use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z, hu_decoder, hidden_size)
    init_with_type = str_to_bool(config['init_with_type']) if 'init_with_type' in config else False
    # Gold labels: int32 matrix (batch x num_targets).
    y = T.matrix('targets', dtype='int32')
    drop_prob = float(config['dropout']) if 'dropout' in config else 0
    #build the feature vector with one model, e.g., with cnn or mean or lstm
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)
    #drop out
    if drop_prob > 0:
        mask = T.cast(srng.binomial(n=1, p=1-drop_prob, size=feature_vec.shape), 'float32')
        if test:
            # Test time: scale by keep-probability instead of sampling a mask.
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask
    #Highway network: z = t * g(x) + (1 - t) * x with a Logistic gate t.
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, feature_vec_len], name='g_mlp')
        t_mlp = MLP(activations=[Logistic()], dims=[feature_vec_len, feature_vec_len], name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z
    #MLP(s)
    logger.info('feature vec length = %s and hidden layer units = %s', feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        #2 MLP on feature fector
        mlp = MLP(activations=[Rectifier(), Rectifier()], dims=[feature_vec_len, int(hidden_size[0]), int(hidden_size[1])], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        # NOTE(review): rebinds hidden_size from list to int in this branch.
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, hidden_size], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size
    #compute y_hat initial guess
    hidden_to_output = Linear(name='hidden_to_output', input_dim=last_hidden_size, output_dim=num_targets)
    typemfile = None
    if init_with_type:
        # Optionally initialize the output layer from a type-matrix file.
        typemfile = config['dsdir'] + '/_typematrix.npy'
        #typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'
    initialize_lasthid(hidden_to_output, typemfile)
    # initialize([hidden_to_output])
    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name='y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    # Defaults when no Gaussian latent refinement is used.
    logpy_xz = logpy_xz_init
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0
    if use_gaus:
        if use_vae:
            logger.info('using VAE')
            vae_conditional=str_to_bool(config['vae_cond'])
            if vae_conditional:
                # Conditional VAE also returns its own recognition output.
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(kl_weight, entropy_weight, y_hat_init, feature_vec, feature_vec_len, config, y, test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(kl_weight, feature_vec, feature_vec_len, config, y, test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
                # Prior network conditions on features AND the initial guess;
                # the generated output is then averaged with it.
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(x=prior_in, n_input=feature_vec_len+num_targets, hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z+feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y)
                y_hat = (y_hat + y_hat_init) / 2.
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                # Prior network conditions on the features only.
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(x=prior_in, n_input=feature_vec_len, hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z+feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y)
            # NOTE(review): reconstructed from a whitespace-mangled source;
            # placement after both branches is the reading taken here.
            y_hat_recog = y_hat
    y_hat = debug_print(y_hat, 'y_hat', False)
    # p@1 on the final output and on the recognition output.
    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    # KLD stays 0 unless the Gaussian latent path set it.
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross
    cost.name = 'cost'; mean_kld.name = 'kld'; mean_cross.name = 'cross_entropy_loss'; pat1.name = 'p@1'; pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate