Example 1
class EmbNNRanker(NNRanker):
    """A ranker using MR and tree embeddings in a NN."""

    def __init__(self, cfg):
        super(EmbNNRanker, self).__init__(cfg)
        self.emb_size = cfg.get('emb_size', 20)
        self.nn_shape = cfg.get('nn_shape', 'ff')
        self.normgrad = cfg.get('normgrad', False)

        self.cnn_num_filters = cfg.get('cnn_num_filters', 3)
        self.cnn_filter_length = cfg.get('cnn_filter_length', 3)

        # 'emb' = embeddings for both DA and tree; any other setting
        # ('emb_trees', 'emb_tree', 'emb_prev') = tree-only embeddings with a 1-hot DA
        self.da_embs = cfg.get('nn', 'emb') == 'emb'

        self.tree_embs = TreeEmbeddingExtract(cfg)

        if self.da_embs:
            self.da_embs = DAEmbeddingExtract(cfg)
        else:
            self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
            self.vectorizer = None

    def _init_training(self, das_file, ttree_file, data_portion):
        super(EmbNNRanker, self)._init_training(das_file, ttree_file, data_portion)
        self._init_dict()
        self._init_neural_network()

        self.train_feats = [self._extract_feats(tree, da)
                            for tree, da in zip(self.train_trees, self.train_das)]

        self.w_after_iter = []
        self.update_weights_sum()

    def _init_dict(self):
        """Initialize word -> integer dictionaries, starting from a minimum
        valid value, always adding a new integer to unknown values to prevent
        clashes among different types of inputs."""

        # avoid dictionary clashes between DAs and tree embeddings
        # – remember current highest index number
        dict_ord = None

        # DA embeddings
        if self.da_embs:
            dict_ord = self.da_embs.init_dict(self.train_das)

        # DA one-hot representation
        else:
            X = []
            for da, tree in zip(self.train_das, self.train_trees):
                X.append(self.da_feats.get_features(tree, {'da': da}))

            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.vectorizer.fit(X)

        # tree embeddings
        # remember last dictionary key to initialize embeddings with enough rows
        self.dict_size = self.tree_embs.init_dict(self.train_trees, dict_ord)
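        # Illustration (hypothetical numbers): if the DA vocabulary ends up
        # occupying indices 0..dict_ord-1, tree_embs.init_dict() continues
        # numbering from dict_ord, so both embedding matrices can be allocated
        # with dict_size rows and DA / tree indices never collide.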

    def _score(self, cand_embs):
        return self.nn.score([cand_embs[0]], [cand_embs[1]])[0]

    def _extract_feats(self, tree, da):
        """Extract DA and tree embeddings (return as a pair)."""
        if self.da_embs:
            # DA embeddings
            da_repr = self.da_embs.get_embeddings(da)
        else:
            # DA one-hot representation
            da_repr = self.vectorizer.transform([self.da_feats.get_features(tree, {'da': da})])[0]

        # tree embeddings
        tree_emb_idxs = self.tree_embs.get_embeddings(tree)

        return (da_repr, tree_emb_idxs)

    def _init_neural_network(self):
        # initial layer – tree embeddings & DA 1-hot or embeddings
        # input shapes don't contain the batch dimension, but the input Theano types do!
        if self.da_embs:
            input_shapes = (self.da_embs.get_embeddings_shape(),
                            self.tree_embs.get_embeddings_shape())
            input_types = (T.itensor3, T.itensor3)
            layers = [[Embedding('emb_da', self.dict_size, self.emb_size, 'uniform_005'),
                       Embedding('emb_tree', self.dict_size, self.emb_size, 'uniform_005')]]
        else:
            input_shapes = ([len(self.vectorizer.get_feature_names())],
                            self.tree_embs.get_embeddings_shape())
            input_types = (T.fmatrix, T.itensor3)
            layers = [[Identity('id_da'),
                       Embedding('emb_tree', self.dict_size, self.emb_size, 'uniform_005')]]

        # plain feed-forward networks
        if self.nn_shape.startswith('ff'):

            layers += [[Flatten('flat_da'), Flatten('flat_tree')], [Concat('concat')]]
            num_ff_layers = 2
            if self.nn_shape[-1] in ['3', '4']:
                num_ff_layers = int(self.nn_shape[-1])
            layers += self._ff_layers('ff', num_ff_layers, perc_layer=True)

        # convolution with or without max/avg-pooling
        elif self.nn_shape.startswith('conv'):

            num_conv_layers = 2 if self.nn_shape.startswith('conv2') else 1
            pooling = None
            if 'maxpool' in self.nn_shape:
                pooling = T.max
            elif 'avgpool' in self.nn_shape:
                pooling = T.mean

            if self.da_embs:
                da_layers = self._conv_layers('conv_da', num_conv_layers, pooling=pooling)
            else:
                da_layers = self._id_layers('id_da',
                                            num_conv_layers + (1 if pooling is not None else 0))
            tree_layers = self._conv_layers('conv_tree', num_conv_layers, pooling=pooling)

            for da_layer, tree_layer in zip(da_layers, tree_layers):
                layers.append([da_layer[0], tree_layer[0]])
            layers += [[Flatten('flat_da'), Flatten('flat_tree')], [Concat('concat')]]
            layers += self._ff_layers('ff', 2, perc_layer=True)

        # max-pooling without convolution
        elif 'maxpool-ff' in self.nn_shape:
            layers += [[Pool1D('mp_da') if self.da_embs else Identity('id_da'),
                        Pool1D('mp_trees')],
                       [Concat('concat')], [Flatten('flat')]]
            layers += self._ff_layers('ff', 2, perc_layer=True)

        # dot-product FF network
        elif 'dot' in self.nn_shape:
            # with max or average pooling
            if 'maxpool' in self.nn_shape or 'avgpool' in self.nn_shape:
                pooling = T.mean if 'avgpool' in self.nn_shape else T.max
                layers += [[Pool1D('mp_da', pooling_func=pooling)
                            if self.da_embs else Identity('id_da'),
                            Pool1D('mp_tree', pooling_func=pooling)]]
            layers += [[Flatten('flat_da') if self.da_embs else Identity('id_da'),
                        Flatten('flat_tree')]]

            num_ff_layers = int(self.nn_shape[-1]) if self.nn_shape[-1] in ['2', '3', '4'] else 1
            for da_layer, tree_layer in zip(self._ff_layers('ff_da', num_ff_layers),
                                            self._ff_layers('ff_tree', num_ff_layers)):
                layers.append([da_layer[0], tree_layer[0]])
            layers.append([DotProduct('dot')])

        # input: batch * word * sub-embeddings
        self.nn = RankNN(layers, input_shapes, input_types, self.normgrad)
        log_info("Network shape:\n\n" + str(self.nn))

    def _conv_layers(self, name, num_layers=1, pooling=None):
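        """Build `num_layers` Conv1D layers (each wrapped in a single-element
        list so it can later be paired with the matching DA/tree branch layer),
        optionally followed by a single pooling layer."""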
        ret = []
        for i in xrange(num_layers):
            ret.append([Conv1D(name + str(i + 1),
                               filter_length=self.cnn_filter_length,
                               num_filters=self.cnn_num_filters,
                               init=self.init, activation=T.tanh)])
        if pooling is not None:
            ret.append([Pool1D(name + str(i + 1) + 'pool', pooling_func=pooling)])
        return ret

    def _id_layers(self, name, num_layers):
        ret = []
        for i in xrange(num_layers):
            ret.append([Identity(name + str(i + 1))])
        return ret

    def _update_nn(self, bad_feats, good_feats, rate):
        """Changing the NN update call to support arrays of parameters."""
        # TODO: this is just adding another dimension to fit the parallelized scoring
        # (even if updates are not parallelized). Make it nicer.
        bad_feats = ([bad_feats[0]], [bad_feats[1]])
        good_feats = ([good_feats[0]], [good_feats[1]])

        cost_gcost = self.nn.update(*(bad_feats + good_feats + (rate,)))
        log_debug('Cost:' + str(cost_gcost[0]))
        param_vals = [param.get_value() for param in self.nn.params]
        log_debug('Param norms : ' + str(self._l2s(param_vals)))
        log_debug('Gparam norms: ' + str(self._l2s(cost_gcost[1:])))

    def _embs_to_str(self):
        out = ""
        da_emb = self.nn.layers[0][0].e.get_value()
        tree_emb = self.nn.layers[0][1].e.get_value()
        for idx, emb in enumerate(da_emb):
            for key, val in self.dict_slot.items():
                if val == idx:
                    out += key + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
            for key, val in self.dict_value.items():
                if val == idx:
                    out += key + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
        for idx, emb in enumerate(tree_emb):
            for key, val in self.dict_t_lemma.items():
                if val == idx:
                    out += str(key) + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
            for key, val in self.dict_formeme.items():
                if val == idx:
                    out += str(key) + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
        return out

    def _l2s(self, params):
        """Compute L2-norm of all members of the given list."""
        return [np.linalg.norm(param) for param in params]

    def store_iter_weights(self):
        """Remember the current weights to be used for averaged perceptron."""
        # fh = open('embs.txt', 'a')
        # print >> fh, '---', self._embs_to_str()
        # fh.close()
        self.w_after_iter.append(self.nn.get_param_values())

    def score_all(self, trees, da):
        cand_embs = [self._extract_feats(tree, da) for tree in trees]
        score = self.nn.score([emb[0] for emb in cand_embs], [emb[1] for emb in cand_embs])
        return np.atleast_1d(score[0])
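
A minimal usage sketch for the ranker above (hedged: the training loop and data loading live in the NNRanker parent class, which is not shown here; only the configuration keys read in __init__ are taken from the code itself, and the values are illustrative):

# Hypothetical configuration: only these keys are read by EmbNNRanker.__init__,
# everything else is handled by the (unshown) NNRanker parent class.
cfg = {
    'nn': 'emb',                 # embeddings for both DA and tree
    'nn_shape': 'conv-maxpool',  # convolution + max-pooling branch
    'emb_size': 20,
    'cnn_num_filters': 3,
    'cnn_filter_length': 3,
    'normgrad': False,
}

ranker = EmbNNRanker(cfg)
# After training (driven by the parent class), all candidate trees for a DA
# can be scored in one batch:
# scores = ranker.score_all(candidate_trees, da)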
Example 2
class EmbNNRanker(NNRanker):
    """A ranker using MR and tree embeddings in a NN."""

    def __init__(self, cfg):
        super(EmbNNRanker, self).__init__(cfg)
        self.emb_size = cfg.get('emb_size', 20)
        self.nn_shape = cfg.get('nn_shape', 'ff')
        self.normgrad = cfg.get('normgrad', False)

        self.cnn_num_filters = cfg.get('cnn_num_filters', 3)
        self.cnn_filter_length = cfg.get('cnn_filter_length', 3)

        # 'emb' = embeddings for both DA and tree; any other setting
        # ('emb_trees', 'emb_tree', 'emb_prev') = tree-only embeddings with a 1-hot DA
        self.da_embs = cfg.get('nn', 'emb') == 'emb'

        self.tree_embs = TreeEmbeddingExtract(cfg)

        if self.da_embs:
            self.da_embs = DAEmbeddingExtract(cfg)
        else:
            self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
            self.vectorizer = None

    def _init_training(self, das_file, ttree_file, data_portion):
        super(EmbNNRanker, self)._init_training(das_file, ttree_file, data_portion)
        self._init_dict()
        self._init_neural_network()

        self.train_feats = [self._extract_feats(tree, da)
                            for tree, da in zip(self.train_trees, self.train_das)]

        self.w_after_iter = []
        self.update_weights_sum()

    def _init_dict(self):
        """Initialize word -> integer dictionaries, starting from a minimum
        valid value, always adding a new integer to unknown values to prevent
        clashes among different types of inputs."""

        # avoid dictionary clashes between DAs and tree embeddings
        # – remember current highest index number
        dict_ord = None

        # DA embeddings
        if self.da_embs:
            dict_ord = self.da_embs.init_dict(self.train_das)

        # DA one-hot representation
        else:
            X = []
            for da, tree in zip(self.train_das, self.train_trees):
                X.append(self.da_feats.get_features(tree, {'da': da}))

            self.vectorizer = DictVectorizer(sparse=False, binarize_numeric=True)
            self.vectorizer.fit(X)

        # tree embeddings
        # remember last dictionary key to initialize embeddings with enough rows
        self.dict_size = self.tree_embs.init_dict(self.train_trees, dict_ord)

    def _score(self, cand_embs):
        return self.nn.score([cand_embs[0]], [cand_embs[1]])[0]

    def _extract_feats(self, tree, da):
        """Extract DA and tree embeddings (return as a pair)."""
        if self.da_embs:
            # DA embeddings
            da_repr = self.da_embs.get_embeddings(da)
        else:
            # DA one-hot representation
            da_repr = self.vectorizer.transform([self.da_feats.get_features(tree, {'da': da})])[0]

        # tree embeddings
        tree_emb_idxs = self.tree_embs.get_embeddings(tree)

        return (da_repr, tree_emb_idxs)

    def _init_neural_network(self):
        # initial layer – tree embeddings & DA 1-hot or embeddings
        # input shapes don't contain the batch dimension, but the input Theano types do!
        if self.da_embs:
            input_shapes = (self.da_embs.get_embeddings_shape(),
                            self.tree_embs.get_embeddings_shape())
            input_types = (T.itensor3, T.itensor3)
            layers = [[Embedding('emb_da', self.dict_size, self.emb_size, 'uniform_005'),
                       Embedding('emb_tree', self.dict_size, self.emb_size, 'uniform_005')]]
        else:
            input_shapes = ([len(self.vectorizer.get_feature_names())],
                            self.tree_embs.get_embeddings_shape())
            input_types = (T.fmatrix, T.itensor3)
            layers = [[Identity('id_da'),
                       Embedding('emb_tree', self.dict_size, self.emb_size, 'uniform_005')]]

        # plain feed-forward networks
        if self.nn_shape.startswith('ff'):

            layers += [[Flatten('flat_da'), Flatten('flat_tree')], [Concat('concat')]]
            num_ff_layers = 2
            if self.nn_shape[-1] in ['3', '4']:
                num_ff_layers = int(self.nn_shape[-1])
            layers += self._ff_layers('ff', num_ff_layers, perc_layer=True)

        # convolution with or without max/avg-pooling
        elif self.nn_shape.startswith('conv'):

            num_conv_layers = 2 if self.nn_shape.startswith('conv2') else 1
            pooling = None
            if 'maxpool' in self.nn_shape:
                pooling = T.max
            elif 'avgpool' in self.nn_shape:
                pooling = T.mean

            if self.da_embs:
                da_layers = self._conv_layers('conv_da', num_conv_layers, pooling=pooling)
            else:
                da_layers = self._id_layers('id_da',
                                            num_conv_layers + (1 if pooling is not None else 0))
            tree_layers = self._conv_layers('conv_tree', num_conv_layers, pooling=pooling)

            for da_layer, tree_layer in zip(da_layers, tree_layers):
                layers.append([da_layer[0], tree_layer[0]])
            layers += [[Flatten('flat_da'), Flatten('flat_tree')], [Concat('concat')]]
            layers += self._ff_layers('ff', 2, perc_layer=True)

        # max-pooling without convolution
        elif 'maxpool-ff' in self.nn_shape:
            layers += [[Pool1D('mp_da') if self.da_embs else Identity('id_da'),
                        Pool1D('mp_trees')],
                       [Concat('concat')], [Flatten('flat')]]
            layers += self._ff_layers('ff', 2, perc_layer=True)

        # dot-product FF network
        elif 'dot' in self.nn_shape:
            # with max or average pooling
            if 'maxpool' in self.nn_shape or 'avgpool' in self.nn_shape:
                pooling = T.mean if 'avgpool' in self.nn_shape else T.max
                layers += [[Pool1D('mp_da', pooling_func=pooling)
                            if self.da_embs else Identity('id_da'),
                            Pool1D('mp_tree', pooling_func=pooling)]]
            layers += [[Flatten('flat_da') if self.da_embs else Identity('id_da'),
                        Flatten('flat_tree')]]

            num_ff_layers = int(self.nn_shape[-1]) if self.nn_shape[-1] in ['2', '3', '4'] else 1
            for da_layer, tree_layer in zip(self._ff_layers('ff_da', num_ff_layers),
                                            self._ff_layers('ff_tree', num_ff_layers)):
                layers.append([da_layer[0], tree_layer[0]])
            layers.append([DotProduct('dot')])

        # input: batch * word * sub-embeddings
        self.nn = RankNN(layers, input_shapes, input_types, self.normgrad)
        log_info("Network shape:\n\n" + str(self.nn))

    def _conv_layers(self, name, num_layers=1, pooling=None):
        ret = []
        for i in range(num_layers):
            ret.append([Conv1D(name + str(i + 1),
                               filter_length=self.cnn_filter_length,
                               num_filters=self.cnn_num_filters,
                               init=self.init, activation=T.tanh)])
        if pooling is not None:
            ret.append([Pool1D(name + str(i + 1) + 'pool', pooling_func=pooling)])
        return ret

    def _id_layers(self, name, num_layers):
        ret = []
        for i in range(num_layers):
            ret.append([Identity(name + str(i + 1))])
        return ret

    def _update_nn(self, bad_feats, good_feats, rate):
        """Changing the NN update call to support arrays of parameters."""
        # TODO: this is just adding another dimension to fit the parallelized scoring
        # (even if updates are not parallelized). Make it nicer.
        bad_feats = ([bad_feats[0]], [bad_feats[1]])
        good_feats = ([good_feats[0]], [good_feats[1]])

        cost_gcost = self.nn.update(*(bad_feats + good_feats + (rate,)))
        log_debug('Cost:' + str(cost_gcost[0]))
        param_vals = [param.get_value() for param in self.nn.params]
        log_debug('Param norms : ' + str(self._l2s(param_vals)))
        log_debug('Gparam norms: ' + str(self._l2s(cost_gcost[1:])))

    def _embs_to_str(self):
        out = ""
        da_emb = self.nn.layers[0][0].e.get_value()
        tree_emb = self.nn.layers[0][1].e.get_value()
        for idx, emb in enumerate(da_emb):
            for key, val in list(self.dict_slot.items()):
                if val == idx:
                    out += key + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
            for key, val in list(self.dict_value.items()):
                if val == idx:
                    out += key + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
        for idx, emb in enumerate(tree_emb):
            for key, val in list(self.dict_t_lemma.items()):
                if val == idx:
                    out += str(key) + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
            for key, val in list(self.dict_formeme.items()):
                if val == idx:
                    out += str(key) + ',' + ','.join([("%f" % d) for d in emb]) + "\n"
        return out

    def _l2s(self, params):
        """Compute L2-norm of all members of the given list."""
        return [np.linalg.norm(param) for param in params]

    def store_iter_weights(self):
        """Remember the current weights to be used for averaged perceptron."""
        # fh = open('embs.txt', 'a')
        # print >> fh, '---', self._embs_to_str()
        # fh.close()
        self.w_after_iter.append(self.nn.get_param_values())

    def score_all(self, trees, da):
        cand_embs = [self._extract_feats(tree, da) for tree in trees]
        score = self.nn.score([emb[0] for emb in cand_embs], [emb[1] for emb in cand_embs])
        return np.atleast_1d(score[0])
Example 3
class Reranker(object):

    def __init__(self, cfg):
        self.cfg = cfg
        self.language = cfg.get('language', 'en')
        self.selector = cfg.get('selector', '')

        self.mode = cfg.get('mode', 'tokens' if cfg.get('use_tokens') else 'trees')

        self.da_feats = Features(['dat: dat_presence', 'svp: svp_presence'])
        self.da_vect = DictVectorizer(sparse=False, binarize_numeric=True)

        self.cur_da = None
        self.cur_da_bin = None

        self.delex_slots = cfg.get('delex_slots', None)
        if self.delex_slots:
            self.delex_slots = set(self.delex_slots.split(','))

    @staticmethod
    def get_model_type(cfg):
        """Return the correct model class according to the config."""
        if cfg.get('model') == 'e2e_patterns':
            from tgen.e2e.slot_error import E2EPatternClassifier
            return E2EPatternClassifier
        return RerankingClassifier

    @staticmethod
    def load_from_file(reranker_fname):
        """Detect correct model type and start loading."""
        model_type = RerankingClassifier  # default to classifier
        with file_stream(reranker_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            if isinstance(data, type):
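                # a class object as the first pickled item means the file was
                # written by that model type (e.g. E2EPatternClassifier);
                # delegate loading to it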
                from tgen.e2e.slot_error import E2EPatternClassifier
                model_type = data
        return model_type.load_from_file(reranker_fname)

    def save_to_file(self, reranker_fname):
        raise NotImplementedError()

    def get_all_settings(self):
        raise NotImplementedError()

    def classify(self, trees):
        raise NotImplementedError()

    def train(self, das, trees, data_portion=1.0, valid_das=None, valid_trees=None):
        raise NotImplementedError()

    def _normalize_da(self, da):
        if isinstance(da, tuple):  # if DA is actually context + DA, ignore context
            da = da[1]
        if self.delex_slots:  # delexicalize the DA if needed
            da = da.get_delexicalized(self.delex_slots)
        return da

    def init_run(self, da):
        """Remember the current DA for subsequent runs of `dist_to_cur_da`."""
        self.cur_da = self._normalize_da(da)
        da_bin = self.da_vect.transform([self.da_feats.get_features(None, {'da': self.cur_da})])[0]
        self.cur_da_bin = da_bin != 0

    def dist_to_da(self, da, trees, return_classif=False):
        """Return Hamming distance of given trees to the given DA.

        @param da: the DA as the base of the Hamming distance measure
        @param trees: list of trees to measure the distance
        @return: list of Hamming distances for each tree (+ resulting classification if return_classif)
        """
        self.init_run(da)
        ret = self.dist_to_cur_da(trees, return_classif)
        self.cur_da = None
        self.cur_da_bin = None
        return ret

    def dist_to_cur_da(self, trees, return_classif=False):
        """Return Hamming distance of given trees to the current DA (set in `init_run`).

        @param trees: list of trees to measure the distance
        @return: list of Hamming distances for each tree (+ resulting classification if return_classif)
        """
        da_bin = self.cur_da_bin
        covered = self.classify(trees)
        dist = [sum(abs(c - da_bin)) for c in covered]
        if return_classif:
            return dist, [[f for f, c_ in zip(self.da_vect.feature_names_, c) if c_] for c in covered]
        return dist
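
A self-contained sketch of the distance computed in dist_to_cur_da, with made-up binary vectors standing in for the real DictVectorizer output and classify() predictions:

import numpy as np

# Hypothetical binarized DA features (what init_run stores in cur_da_bin) and
# classifier outputs for two candidate trees (what classify() would return).
cur_da_bin = np.array([1, 0, 1, 1, 0], dtype=bool)
covered = [np.array([1, 0, 1, 0, 0]),  # misses one required feature
           np.array([1, 1, 1, 1, 0])]  # adds one spurious feature

# the same expression as in dist_to_cur_da: per-tree Hamming distance to the DA
dist = [sum(abs(c - cur_da_bin)) for c in covered]
print(dist)  # [1, 1]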