Example #1
def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d', n_latent,
                hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)
    mlp1 = MLP(activations=[Rectifier()],
               dims=[n_latent, hu_decoder],
               name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output',
                        input_dim=hu_decoder,
                        output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for i, z in enumerate(z_list):
        y_hat = mysigmoid.apply(hid_to_out.apply(
            mlp1.apply(z)))  # reconstructed x
        agg_logpy_xz += cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat

    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
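
The function above decodes every sample in z_list through a shared ReLU layer and a sigmoid output, then averages both the reconstructions and the per-sample cross-entropy. A minimal usage sketch, assuming the definitions above and theano.tensor imported as tensor; the variable names y and z_list, the dimensions, and the number of samples are illustrative assumptions, not taken from the original code:

# Hypothetical usage sketch (names and sizes are illustrative only).
y = tensor.matrix('y')                                    # (batch, n_out) binary targets
z_list = [tensor.matrix('z_%d' % k) for k in range(3)]    # three latent samples
y_hat, logpy_xz = generation(z_list, n_latent=32, hu_decoder=64, n_out=784, y=y)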
Example #2
def generation_simple(z_list, n_latent, n_out, y):
    logger.info('generate output without MLP')
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=n_latent, output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        lin_out = hid_to_out.apply(z)
        y_hat = mysigmoid.apply(lin_out)  # reconstructed x
        logpy_xz = -cross_entropy_loss(y_hat, y)
        agg_logpy_xz += logpy_xz
        agg_y_hat += y_hat
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
Example #3
def generation_simple(z_list, n_latent, n_out, y):
    logger.info('generate output without MLP')
    hid_to_out = Linear(name='hidDecoder_to_output',
                        input_dim=n_latent,
                        output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        lin_out = hid_to_out.apply(z)
        y_hat = mysigmoid.apply(lin_out)  # reconstructed x
        logpy_xz = -cross_entropy_loss(y_hat, y)
        agg_logpy_xz += logpy_xz
        agg_y_hat += y_hat
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
Example #4
def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d', n_latent, hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_latent, hu_decoder], name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=hu_decoder, output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for i, z in enumerate(z_list):
        y_hat = mysigmoid.apply(hid_to_out.apply(mlp1.apply(z)))  # reconstructed x
        agg_logpy_xz += cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat
    
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
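
Examples #1 and #4 consume a list of latent samples; in a variational autoencoder such a z_list is usually drawn from the approximate posterior with the reparameterization trick. A minimal sketch under that assumption; mu, log_sigma, the seed, and the number of samples are illustrative and not part of the original code:

# Hypothetical sketch: draw several z samples from q(z|x) = N(mu, exp(log_sigma)^2)
# so that gradients flow through mu and log_sigma (reparameterization trick).
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1234)
z_list = []
for _ in range(3):
    eps = srng.normal(mu.shape)                  # standard normal noise
    z_list.append(mu + tensor.exp(log_sigma) * eps)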
Example #5
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    brick2 = Bias(2, name="bias1")
    activation = Logistic(name="sigm")

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name="W_norm")
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex="W_no.?m")
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name="h2act")
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex="h2a.?t")
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    input1 = tensor.matrix("input1")
    input2 = tensor.matrix("input2")
    merge = Merge(["input1", "input2"], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
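
Beyond the assertions above, a common use of VariableFilter is to pull a subset of variables out of a ComputationGraph for regularization. A minimal sketch assuming Blocks and Theano are available; the layer sizes and the penalty coefficient are arbitrary illustrations:

from theano import tensor
from blocks.bricks import Linear, Logistic
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.roles import WEIGHT

x = tensor.matrix('x')
layer = Linear(input_dim=10, output_dim=2, name='linear')
cost = Logistic().apply(layer.apply(x)).sum()
cg = ComputationGraph(cost)

# Collect every variable tagged with the WEIGHT role and add an L2 penalty.
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
regularized_cost = cost + 1e-4 * sum((W ** 2).sum() for W in weights)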
Example #6
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x, call_id='brick1_call_id')
    h2 = activation.apply(h1, call_id='act')
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    brick1_apply_variables = [cg.variables[1], cg.variables[8]]
    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    assert brick1_apply_variables == appli_filter(cg.variables)

    # Testing filtering by unbound application
    unbound_appli_filter = VariableFilter(applications=[Linear.apply])
    assert brick1_apply_variables == unbound_appli_filter(cg.variables)

    # Testing filtering by call identifier
    call_id_filter = VariableFilter(call_id='brick1_call_id')
    assert brick1_apply_variables == call_id_filter(cg.variables)

    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
Example #7
class MeanPoolCombiner(Initializable):
    """
    Parameters
    ----------
    dim: int
        Dimensionality of the definition embedding

    dropout_type: str

    dropout: float, default: 0.0

    emb_dim: int
        Dimensionality of word embeddings, as well as final output

    compose_type : str
        If 'sum', the definition and word embeddings are summed
        If 'fully_connected_linear', a learned perceptron composes the two
        embeddings linearly
        If 'fully_connected_relu', ...
        If 'fully_connected_tanh', ...
    """
    def __init__(self,
                 emb_dim,
                 dim,
                 dropout=0.0,
                 def_word_gating="none",
                 dropout_type="per_unit",
                 compose_type="sum",
                 word_dropout_weighting="no_weighting",
                 shortcut_unk_and_excluded=False,
                 num_input_words=-1,
                 exclude_top_k=-1,
                 vocab=None,
                 **kwargs):

        self._dropout = dropout
        self._num_input_words = num_input_words
        self._exclude_top_K = exclude_top_k
        self._dropout_type = dropout_type
        self._compose_type = compose_type
        self._vocab = vocab
        self._shortcut_unk_and_excluded = shortcut_unk_and_excluded
        self._word_dropout_weighting = word_dropout_weighting
        self._def_word_gating = def_word_gating

        if def_word_gating not in {"none", "self_attention"}:
            raise NotImplementedError()

        if word_dropout_weighting not in {"no_weighting"}:
            raise NotImplementedError("Not implemented " +
                                      word_dropout_weighting)

        if dropout_type not in {"per_unit", "per_example", "per_word"}:
            raise NotImplementedError()

        children = []

        if self._def_word_gating == "self_attention":
            self._gate_mlp = Linear(dim, dim)
            self._gate_act = Logistic()
            children.extend([self._gate_mlp, self._gate_act])

        if compose_type == 'fully_connected_linear':
            self._def_state_compose = MLP(activations=[None],
                                          dims=[emb_dim + dim, emb_dim])
            children.append(self._def_state_compose)
        if compose_type == "gated_sum" or compose_type == "gated_transform_and_sum":
            if dropout_type == "per_word" or dropout_type == "per_example":
                raise RuntimeError(
                    "I don't think this combination makes much sense")

            self._compose_gate_mlp = Linear(dim + emb_dim,
                                            emb_dim,
                                            name='gate_linear')
            self._compose_gate_act = Logistic()
            children.extend([self._compose_gate_mlp, self._compose_gate_act])
        if compose_type == 'sum':
            if not emb_dim == dim:
                raise ValueError(
                    "Embedding has different dim! Cannot use compose_type='sum'"
                )
        if compose_type == 'transform_and_sum' or compose_type == "gated_transform_and_sum":
            self._def_state_transform = Linear(dim,
                                               emb_dim,
                                               name='state_transform')
            children.append(self._def_state_transform)

        super(MeanPoolCombiner, self).__init__(children=children, **kwargs)

    @application
    def apply(self,
              application_call,
              word_embs,
              words_mask,
              def_embeddings,
              def_map,
              train_phase=False,
              word_ids=False,
              call_name=""):
        batch_shape = word_embs.shape
        # Index of each word in the flattened (batch * time) layout
        flat_indices = def_map[:, 0] * batch_shape[1] + def_map[:, 1]

        # def_map is (seq_pos, word_pos, def_index)
        # def_embeddings is (id, emb_dim)

        def_sum = T.zeros(
            (batch_shape[0] * batch_shape[1], def_embeddings.shape[1]))
        def_lens = T.zeros_like(def_sum[:, 0])
        def_lens = T.inc_subtensor(def_lens[flat_indices], 1)

        if self._def_word_gating == "none":
            def_sum = T.inc_subtensor(def_sum[flat_indices],
                                      def_embeddings[def_map[:, 2]])
            def_mean = def_sum / T.maximum(def_lens[:, None], 1)
        elif self._def_word_gating == "self_attention":
            gates = def_embeddings[def_map[:, 2]]
            gates = self._gate_mlp.apply(gates)[:, 0]
            application_call.add_auxiliary_variable(gates, name='def_gates')

            # Dima: this is numerically unstable. But maybe it can work.
            # If it can work, we can avoid too much coding.
            def_normalization = T.zeros_like(def_lens)
            def_normalization = T.inc_subtensor(
                def_normalization[flat_indices], T.exp(gates))
            gates = T.exp(gates) / def_normalization[flat_indices]

            def_mean = T.inc_subtensor(
                def_sum[flat_indices],
                gates[:, None] * def_embeddings[def_map[:, 2]])
        else:
            raise NotImplementedError()
        def_mean = def_mean.reshape((batch_shape[0], batch_shape[1], -1))

        application_call.add_auxiliary_variable(
            masked_root_mean_square(def_mean, words_mask),
            name=call_name + '_def_mean_rootmean2')

        if train_phase and self._dropout != 0.0:
            if self._dropout_type == "per_unit":
                logger.info("Adding per_unit dropout on dict and normal emb")
                word_embs = apply_dropout(word_embs, drop_prob=self._dropout)
                def_mean = apply_dropout(def_mean, drop_prob=self._dropout)
            elif self._dropout_type == "per_example":
                logger.info("Adding per_example dropout on dict and normal emb")
                # We drop out whole examples via masks
                mask_defs = T.ones((batch_shape[0], ))
                mask_we = T.ones((batch_shape[0], ))

                # Mask dropout
                mask_defs = apply_dropout(mask_defs, drop_prob=self._dropout)
                mask_we = apply_dropout(mask_we, drop_prob=self._dropout)

                # This reduces variance: if both masks are 0, keep both embeddings
                where_both_zero = T.eq((mask_defs + mask_we), 0)

                mask_defs = (where_both_zero + mask_defs).dimshuffle(
                    0, "x", "x")
                mask_we = (where_both_zero + mask_we).dimshuffle(0, "x", "x")

                def_mean = mask_defs * def_mean
                word_embs = mask_we * word_embs
            elif self._dropout_type == "per_word_independent":
                # TODO: Maybe we also want the possibility of including both (like in per_example)
                pass  # TODO: implement
            elif self._dropout_type == "per_word":
                # First select whether to perform dropout at all
                mask_higher = T.ones((batch_shape[0], batch_shape[1]))
                mask_higher = apply_dropout(mask_higher,
                                            drop_prob=self._dropout)
                mask_higher = mask_higher.dimshuffle(0, 1, "x")

                logger.info("Applying per_word dropout on dict and normal emb")
                # And if so, pick 50% word emb vs 50% def emb
                mask = T.ones((batch_shape[0], batch_shape[1]))
                mask = apply_dropout(mask, drop_prob=0.5)
                mask = mask.dimshuffle(0, 1, "x")

                # Competitive
                def_mean = mask_higher * def_mean + (
                    1 - mask_higher) * mask * def_mean
                word_embs = mask_higher * word_embs + (1 - mask_higher) * (
                    1 - mask) * word_embs

                # TODO: Smarter weighting (at least like divisor in dropout)

                if not self._compose_type == "sum" and not self._compose_type == "transform_and_sum":
                    raise NotImplementedError()

        application_call.add_auxiliary_variable(def_mean.copy(),
                                                name=call_name +
                                                '_dict_word_embeddings')

        if self._compose_type == 'sum':
            final_embeddings = word_embs + def_mean
        elif self._compose_type == 'transform_and_sum':
            final_embeddings = (word_embs +
                                self._def_state_transform.apply(def_mean))
        elif self._compose_type == 'gated_sum' or self._compose_type == 'gated_transform_and_sum':
            concat = T.concatenate([word_embs, def_mean], axis=2)
            gates = concat.reshape((batch_shape[0] * batch_shape[1], -1))
            gates = self._compose_gate_mlp.apply(gates)
            gates = self._compose_gate_act.apply(gates)
            gates = gates.reshape((batch_shape[0], batch_shape[1], -1))

            if self._compose_type == 'gated_sum':
                final_embeddings = gates * word_embs + (1 - gates) * def_mean
            else:
                final_embeddings = gates * word_embs + (
                    1 - gates) * self._def_state_transform.apply(def_mean)

            application_call.add_auxiliary_variable(masked_root_mean_square(
                gates.reshape((batch_shape[0], batch_shape[1], -1)),
                words_mask),
                                                    name=call_name +
                                                    '_compose_gate_rootmean2')
        elif self._compose_type.startswith('fully_connected'):
            concat = T.concatenate([word_embs, def_mean], axis=2)
            final_embeddings = self._def_state_compose.apply(concat)
        else:
            raise NotImplementedError()

        if self._shortcut_unk_and_excluded:
            # NOTE: It might be better to move it out of Lookup, because it breaks the API a bit
            # but at the same time it makes sense to share this code

            # 1. If no def, just go with word emb
            final_embeddings = word_embs * T.lt(word_ids, self._exclude_top_K).dimshuffle(0, 1, "x") + \
                               final_embeddings * T.ge(word_ids, self._exclude_top_K).dimshuffle(0, 1, "x")

            # 2. UNKs always get def embeddings (UNK can happen for dev/test set of course)
            final_embeddings = final_embeddings * T.neq(word_ids, self._vocab.unk).dimshuffle(0, 1, "x") + \
                               def_mean * T.eq(word_ids, self._vocab.unk).dimshuffle(0, 1, "x")

        application_call.add_auxiliary_variable(
            masked_root_mean_square(final_embeddings, words_mask),
            name=call_name + '_merged_input_rootmean2')

        return final_embeddings
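
For intuition, the 'gated_sum' branch of apply computes, per position, gate = sigmoid(W_g [word_emb; def_mean] + b_g) and then gate * word_emb + (1 - gate) * def_mean. A small NumPy sketch of that combination, assuming dim == emb_dim; W_g and b_g stand in for the parameters of _compose_gate_mlp and are purely illustrative:

import numpy as np

def gated_sum(word_emb, def_mean, W_g, b_g):
    # Illustrative NumPy version of compose_type='gated_sum' for one position.
    gate = 1.0 / (1.0 + np.exp(-(W_g @ np.concatenate([word_emb, def_mean]) + b_g)))
    return gate * word_emb + (1.0 - gate) * def_mean

emb_dim = 4
word_emb = np.random.randn(emb_dim)
def_mean = np.random.randn(emb_dim)
W_g = 0.1 * np.random.randn(emb_dim, 2 * emb_dim)
b_g = np.zeros(emb_dim)
print(gated_sum(word_emb, def_mean, W_g, b_g))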