def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d', n_latent, hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)

    mlp1 = MLP(activations=[Rectifier()], dims=[n_latent, hu_decoder],
               name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=hu_decoder,
                        output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')

    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        y_hat = mysigmoid.apply(hid_to_out.apply(mlp1.apply(z)))  # reconstructed x
        # log p(y|x, z) is the negative cross-entropy, matching generation_simple
        agg_logpy_xz += -cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat

    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
def generation_simple(z_list, n_latent, n_out, y):
    logger.info('generate output without MLP')
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=n_latent,
                        output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')

    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        lin_out = hid_to_out.apply(z)
        y_hat = mysigmoid.apply(lin_out)  # reconstructed x
        logpy_xz = -cross_entropy_loss(y_hat, y)
        agg_logpy_xz += logpy_xz
        agg_y_hat += y_hat

    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
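# Minimal usage sketch for the two decoders above (illustrative only: the
# tensor names, dimensions, and the three-sample z_list are assumptions, not
# part of the original module).
def _generation_usage_example():
    from theano import tensor  # assumed available, as in the tests below
    y = tensor.matrix('y')  # binary targets
    z_list = [tensor.matrix('z_%d' % i) for i in range(3)]  # e.g. 3 posterior samples
    # hu_decoder=0 would fall back to generation_simple (no hidden layer)
    y_hat, logpy_xz = generation(z_list, n_latent=50, hu_decoder=200,
                                 n_out=784, y=y)
    return y_hat, logpy_xz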
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    brick2 = Bias(2, name="bias1")
    activation = Logistic(name="sigm")

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by brick classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name="W_norm")
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex="W_no.?m")
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name="h2act")
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex="h2a.?t")
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    input1 = tensor.matrix("input1")
    input2 = tensor.matrix("input2")
    merge = Merge(["input1", "input2"], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x, call_id='brick1_call_id')
    h2 = activation.apply(h1, call_id='act')
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by brick classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    brick1_apply_variables = [cg.variables[1], cg.variables[8]]

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    assert brick1_apply_variables == appli_filter(cg.variables)

    # Testing filtering by unbound application
    unbound_appli_filter = VariableFilter(applications=[Linear.apply])
    assert brick1_apply_variables == unbound_appli_filter(cg.variables)

    # Testing filtering by call identifier
    call_id_filter = VariableFilter(call_id='brick1_call_id')
    assert brick1_apply_variables == call_id_filter(cg.variables)

    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
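# A short sketch of the same filtering machinery outside a test, e.g. to pull
# weight matrices out of a graph for an L2 penalty (the tiny model below is
# hypothetical; WEIGHT is a standard blocks role).
def _variable_filter_usage_example():
    from blocks.bricks import Linear, Logistic
    from blocks.filter import VariableFilter
    from blocks.graph import ComputationGraph
    from blocks.roles import WEIGHT
    from theano import tensor

    x = tensor.matrix('x')
    probs = Logistic().apply(
        Linear(input_dim=10, output_dim=1, name='out').apply(x))
    cg = ComputationGraph(probs)
    # Every variable tagged with the WEIGHT role, ready for a regularizer.
    weights = VariableFilter(roles=[WEIGHT])(cg.variables)
    return weights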
class MeanPoolCombiner(Initializable):
    """Combines word embeddings with mean-pooled definition embeddings.

    Parameters
    ----------
    dim : int
        Dimensionality of the definition embedding.
    dropout_type : str
    dropout : float, default: 0.0
    emb_dim : int
        Dimensionality of word embeddings, as well as of the final output.
    compose_type : str
        If 'sum', the definition and word embeddings are averaged.
        If 'fully_connected_linear', a learned perceptron composes the two
        embeddings linearly.
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...
    """

    def __init__(self, emb_dim, dim, dropout=0.0, def_word_gating="none",
                 dropout_type="per_unit", compose_type="sum",
                 word_dropout_weighting="no_weighting",
                 shortcut_unk_and_excluded=False, num_input_words=-1,
                 exclude_top_k=-1, vocab=None, **kwargs):
        self._dropout = dropout
        self._num_input_words = num_input_words
        self._exclude_top_K = exclude_top_k
        self._dropout_type = dropout_type
        self._compose_type = compose_type
        self._vocab = vocab
        self._shortcut_unk_and_excluded = shortcut_unk_and_excluded
        self._word_dropout_weighting = word_dropout_weighting
        self._def_word_gating = def_word_gating

        if def_word_gating not in {"none", "self_attention"}:
            raise NotImplementedError()
        if word_dropout_weighting not in {"no_weighting"}:
            raise NotImplementedError("Not implemented " + word_dropout_weighting)
        if dropout_type not in {"per_unit", "per_example", "per_word"}:
            raise NotImplementedError()

        children = []

        if self._def_word_gating == "self_attention":
            self._gate_mlp = Linear(dim, dim)
            self._gate_act = Logistic()
            children.extend([self._gate_mlp, self._gate_act])

        if compose_type == 'fully_connected_linear':
            self._def_state_compose = MLP(activations=[None],
                                          dims=[emb_dim + dim, emb_dim])
            children.append(self._def_state_compose)

        if compose_type == "gated_sum" or compose_type == "gated_transform_and_sum":
            if dropout_type == "per_word" or dropout_type == "per_example":
                raise RuntimeError(
                    "I don't think this combination makes much sense")
            self._compose_gate_mlp = Linear(dim + emb_dim, emb_dim,
                                            name='gate_linear')
            self._compose_gate_act = Logistic()
            children.extend([self._compose_gate_mlp, self._compose_gate_act])

        if compose_type == 'sum':
            if not emb_dim == dim:
                raise ValueError(
                    "Embedding has different dim! Cannot use compose_type='sum'")

        if compose_type == 'transform_and_sum' or compose_type == "gated_transform_and_sum":
            self._def_state_transform = Linear(dim, emb_dim,
                                               name='state_transform')
            children.append(self._def_state_transform)

        super(MeanPoolCombiner, self).__init__(children=children, **kwargs)

    @application
    def apply(self, application_call, word_embs, words_mask,
              def_embeddings, def_map, train_phase=False, word_ids=False,
              call_name=""):
        batch_shape = word_embs.shape

        # def_map is (seq_pos, word_pos, def_index)
        # def_embeddings is (id, emb_dim)
        # Index of each word in the flattened batch
        flat_indices = def_map[:, 0] * batch_shape[1] + def_map[:, 1]

        def_sum = T.zeros((batch_shape[0] * batch_shape[1],
                           def_embeddings.shape[1]))
        def_lens = T.zeros_like(def_sum[:, 0])
        def_lens = T.inc_subtensor(def_lens[flat_indices], 1)

        if self._def_word_gating == "none":
            def_sum = T.inc_subtensor(def_sum[flat_indices],
                                      def_embeddings[def_map[:, 2]])
            def_mean = def_sum / T.maximum(def_lens[:, None], 1)
        elif self._def_word_gating == "self_attention":
            gates = def_embeddings[def_map[:, 2]]
            gates = self._gate_mlp.apply(gates)[:, 0]
            application_call.add_auxiliary_variable(gates, name='def_gates')

            # Dima: this is numerically unstable. But maybe it can work.
            # If it can work, we can avoid too much coding.
            def_normalization = T.zeros_like(def_lens)
            def_normalization = T.inc_subtensor(
                def_normalization[flat_indices], T.exp(gates))
            gates = T.exp(gates) / def_normalization[flat_indices]

            def_mean = T.inc_subtensor(
                def_sum[flat_indices],
                gates[:, None] * def_embeddings[def_map[:, 2]])
        else:
            raise NotImplementedError()

        def_mean = def_mean.reshape((batch_shape[0], batch_shape[1], -1))

        application_call.add_auxiliary_variable(
            masked_root_mean_square(def_mean, words_mask),
            name=call_name + '_def_mean_rootmean2')

        if train_phase and self._dropout != 0.0:
            if self._dropout_type == "per_unit":
                logger.info("Adding per_unit drop on dict and normal emb")
                word_embs = apply_dropout(word_embs, drop_prob=self._dropout)
                def_mean = apply_dropout(def_mean, drop_prob=self._dropout)
            elif self._dropout_type == "per_example":
                logger.info("Adding per_example drop on dict and normal emb")
                # We dropout mask
                mask_defs = T.ones((batch_shape[0],))
                mask_we = T.ones((batch_shape[0],))

                # Mask dropout
                mask_defs = apply_dropout(mask_defs, drop_prob=self._dropout)
                mask_we = apply_dropout(mask_we, drop_prob=self._dropout)

                # This reduces variance. If both are 0, select both.
                where_both_zero = T.eq((mask_defs + mask_we), 0)

                mask_defs = (where_both_zero + mask_defs).dimshuffle(0, "x", "x")
                mask_we = (where_both_zero + mask_we).dimshuffle(0, "x", "x")

                def_mean = mask_defs * def_mean
                word_embs = mask_we * word_embs
            elif self._dropout_type == "per_word_independent":
                # TODO: Maybe we also want to have the possibility of
                # including both (like in per_example)
                pass  # TODO: implement
            elif self._dropout_type == "per_word":
                # First select if dropout is performed at all
                mask_higher = T.ones((batch_shape[0], batch_shape[1]))
                mask_higher = apply_dropout(mask_higher, drop_prob=self._dropout)
                mask_higher = mask_higher.dimshuffle(0, 1, "x")

                logger.info("Apply per_word dropout on dict and normal emb")

                # And if yes, just 50% word vs 50% def
                mask = T.ones((batch_shape[0], batch_shape[1]))
                mask = apply_dropout(mask, drop_prob=0.5)
                mask = mask.dimshuffle(0, 1, "x")

                # Competitive
                def_mean = mask_higher * def_mean + \
                    (1 - mask_higher) * mask * def_mean
                word_embs = mask_higher * word_embs + \
                    (1 - mask_higher) * (1 - mask) * word_embs

                # TODO: Smarter weighting (at least like divisor in dropout)
                if not self._compose_type == "sum" and \
                        not self._compose_type == "transform_and_sum":
                    raise NotImplementedError()

        application_call.add_auxiliary_variable(
            def_mean.copy(), name=call_name + '_dict_word_embeddings')

        if self._compose_type == 'sum':
            final_embeddings = word_embs + def_mean
        elif self._compose_type == 'transform_and_sum':
            final_embeddings = (word_embs +
                                self._def_state_transform.apply(def_mean))
        elif self._compose_type in ('gated_sum', 'gated_transform_and_sum'):
            concat = T.concatenate([word_embs, def_mean], axis=2)
            gates = concat.reshape((batch_shape[0] * batch_shape[1], -1))
            gates = self._compose_gate_mlp.apply(gates)
            gates = self._compose_gate_act.apply(gates)
            gates = gates.reshape((batch_shape[0], batch_shape[1], -1))
            if self._compose_type == 'gated_sum':
                final_embeddings = gates * word_embs + (1 - gates) * def_mean
            else:
                final_embeddings = gates * word_embs + \
                    (1 - gates) * self._def_state_transform.apply(def_mean)
            application_call.add_auxiliary_variable(
                masked_root_mean_square(
                    gates.reshape((batch_shape[0], batch_shape[1], -1)),
                    words_mask),
                name=call_name + '_compose_gate_rootmean2')
        elif self._compose_type.startswith('fully_connected'):
            concat = T.concatenate([word_embs, def_mean], axis=2)
            final_embeddings = self._def_state_compose.apply(concat)
        else:
            raise NotImplementedError()

        if self._shortcut_unk_and_excluded:
            # NOTE: It might be better to move this out of Lookup, because it
            # breaks the API a bit, but at the same time it makes sense to
            # share this code.

            # 1. If there is no def, just go with the word emb
            final_embeddings = \
                word_embs * T.lt(word_ids, self._exclude_top_K).dimshuffle(0, 1, "x") + \
                final_embeddings * T.ge(word_ids, self._exclude_top_K).dimshuffle(0, 1, "x")

            # 2. UNKs always get def embeddings (UNK can happen for dev/test
            # set of course)
            final_embeddings = \
                final_embeddings * T.neq(word_ids, self._vocab.unk).dimshuffle(0, 1, "x") + \
                def_mean * T.eq(word_ids, self._vocab.unk).dimshuffle(0, 1, "x")

        application_call.add_auxiliary_variable(
            masked_root_mean_square(final_embeddings, words_mask),
            name=call_name + '_merged_input_rootmean2')

        return final_embeddings
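# A minimal instantiation sketch for MeanPoolCombiner (illustrative: the
# dimensions, initialization scheme, dropout settings, and tensor shapes are
# assumptions; the surrounding code base supplies the real vocab and the
# definition reader that produces def_embeddings/def_map).
def _mean_pool_combiner_usage_example(word_embs, words_mask,
                                      def_embeddings, def_map):
    from blocks.initialization import Constant, IsotropicGaussian

    # word_embs: (batch, seq_len, emb_dim); words_mask: (batch, seq_len);
    # def_embeddings: (num_defs, dim);
    # def_map: (num_matches, 3) rows of (seq_pos, word_pos, def_index).
    combiner = MeanPoolCombiner(
        emb_dim=300, dim=300,
        dropout=0.2, dropout_type='per_word',
        compose_type='transform_and_sum',
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        name='combiner')
    combiner.initialize()
    return combiner.apply(word_embs, words_mask, def_embeddings, def_map,
                          train_phase=True, call_name='enc')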