Example #1
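
# Assumed imports for this example (a reconstruction, not part of the original
# listing): SequenceVectorizer, Learner, config, instance, TOKENIZERS,
# COLOR_REPRS, sample, and strip_invalid_tokens are helpers from the
# surrounding project, not standard-library modules.
import numpy as np
import theano
import theano.tensor as T
from scipy.special import logsumexp  # older SciPy: from scipy.misc import logsumexp
from sklearn.linear_model import LogisticRegression
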
class UnigramPrior(object):
    '''
    >>> p = UnigramPrior()
    >>> p.train([instance.Instance('blue')])
    >>> p.sample(3)  # doctest: +ELLIPSIS
    [Instance('...', None), Instance('...', None), Instance('...', None)]
    '''
    def __init__(self):
        self.vec = SequenceVectorizer()
        self.vec.add_all([['</s>'], ['<MASK>']])
        self.counts = theano.shared(np.zeros((self.vec.num_types,), dtype=np.int32))
        self.total = theano.shared(np.array(0, dtype=np.int32))
        # Unigram log probabilities; apply sums these per token, so the
        # counts/total ratio must pass through T.log.
        self.log_probs = T.log(T.cast(self.counts, 'float32') / T.cast(self.total, 'float32'))
        self.mask_index = self.vec.vectorize(['<MASK>'])[0]

    def train(self, training_instances, listener_data=True):
        get_utt = (lambda inst: inst.input) if listener_data else (lambda inst: inst.output)
        tokenized = [get_utt(inst).split() for inst in training_instances]
        self.vec.add_all(tokenized)
        x = self.vec.vectorize_all(self.pad(tokenized, self.vec.max_len))
        vocab_size = self.vec.num_types

        counts = np.bincount(x.flatten(), minlength=vocab_size).astype(np.int32)
        counts[self.mask_index] = 0
        self.counts.set_value(counts)
        self.total.set_value(np.sum(counts))

    def apply(self, input_vars):
        (x,) = input_vars

        token_probs = self.log_probs[x]
        if self.mask_index is not None:
            # Padding positions must contribute nothing to the score.
            # T.switch avoids the NaN that -inf * 0 would produce when the
            # zero-count <MASK> token meets a multiplicative mask.
            token_probs = T.switch(T.eq(x, self.mask_index), T.zeros_like(token_probs), token_probs)
        if token_probs.ndim == 1:
            return token_probs
        else:
            return token_probs.sum(axis=1)

    def sample(self, num_samples=1):
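        # Draw each token index independently from the unigram distribution
        # (`sample` is assumed to be a project helper that draws an index from
        # a probability vector), then detokenize and strip padding tokens.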
        indices = np.array([[sample(self.counts.get_value() * 1.0 / self.total.get_value())
                             for _t in range(self.vec.max_len)]
                            for _s in range(num_samples)], dtype=np.int32)
        return [instance.Instance(' '.join(strip_invalid_tokens(s)))
                for s in self.vec.unvectorize_all(indices)]

    def pad(self, sequences, length):
        '''
        Adds a </s> token followed by zero or more <MASK> tokens to bring the
        total length of each sequence to `length + 1` (the extra one is
        because every sequence receives a </s>, while `length` should be the
        max length of the original sequences).

        >>> UnigramPrior().pad([['blue'], ['very', 'blue']], 2)
        [['blue', '</s>', '<MASK>'], ['very', 'blue', '</s>']]
        '''
        return [seq + ['</s>'] + ['<MASK>'] * (length - len(seq))
                for seq in sequences]
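

# For intuition: UnigramPrior.apply scores a vectorized sequence as the sum of
# its per-token log probabilities, with <MASK> padding positions contributing
# zero. A minimal standalone NumPy sketch (toy vocabulary, made-up indices):
import numpy as np

log_probs = np.log([0.5, 1.0, 0.5])  # 0 = </s>, 1 = <MASK> (unused), 2 = 'blue'
mask_index = 1
x = np.array([[2, 0, 1]])            # 'blue </s> <MASK>'
token_scores = np.where(x == mask_index, 0.0, log_probs[x])
print(token_scores.sum(axis=1))      # [log 0.25] ~= [-1.3863]

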
class LRContextListenerLearner(Learner):
    def train(self,
              training_instances,
              validation_instances=None,
              metrics=None):
        X, y = self._data_to_arrays(training_instances, init_vectorizer=True)
        self.mod = LogisticRegression(solver='lbfgs')
        self.mod.fit(X, y)

    @property
    def num_params(self):
        return np.prod(self.mod.coef_.shape) + np.prod(
            self.mod.intercept_.shape)

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        X, y = self._data_to_arrays(eval_instances)
        y = y.reshape((len(eval_instances), self.context_len))
        all_scores = self.mod.predict_log_proba(X)[:, 1].reshape(
            (len(eval_instances), self.context_len))
        all_scores -= logsumexp(all_scores, axis=1)[:, np.newaxis]

        preds = all_scores.argmax(axis=1)
        scores = np.where(y, all_scores, 0).sum(axis=1)

        return preds.tolist(), scores.tolist()

    def _data_to_arrays(self,
                        instances,
                        inverted=False,
                        init_vectorizer=False):
        self.get_options()

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_desc, get_color = (get_o, get_i) if inverted else (get_i, get_o)
        get_alt_i, get_alt_o = (lambda inst: inst.alt_inputs), (lambda inst: inst.alt_outputs)
        get_alt_colors = get_alt_i if inverted else get_alt_o

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(get_desc(inst)) for inst in instances]
        context_lens = [len(get_alt_colors(inst)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            self.context_len = context_lens[0]

            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        assert all(cl == self.context_len
                   for cl in context_lens), (self.context_len, context_lens)

        padded = [(d + ['</s>'] *
                   (self.seq_vec.max_len - len(d)))[:self.seq_vec.max_len]
                  for d in unk_replaced]
        colors = [c for inst in instances for c in get_alt_colors(inst)]
        labels = np.array([
            int(i == get_color(inst)) for inst in instances
            for i in range(self.context_len)
        ])

        desc_indices = self.seq_vec.vectorize_all(padded)
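        # +/-1 bag-of-words encoding of each description: start at -1
        # everywhere, then set +1 for every vocabulary type that occurs in
        # the padded token sequence.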
        desc_bow = -np.ones((desc_indices.shape[0], self.seq_vec.num_types))
        desc_bow[np.arange(desc_indices.shape[0])[:, np.newaxis],
                 desc_indices] = 1.
        color_feats = self.color_vec.vectorize_all(colors)
        color_feats = color_feats.reshape(
            (desc_indices.shape[0], self.context_len, color_feats.shape[1]))
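        # Conjunction features: for each instance, take the outer product of
        # the description BOW vector with each context color's feature
        # vector, then flatten to one row per (instance, color) pair.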
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((desc_indices.shape[0] * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))

        return feats, labels

    def get_options(self):
        if not hasattr(self, 'options'):
            self.options = config.options()
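
# Sanity check for the feature construction in _data_to_arrays (shapes here
# are made up for illustration): einsum('ij,ick->icjk', ...) forms, for each
# instance i and context color c, the outer product of the +/-1 bag-of-words
# vector with that color's feature vector.
import numpy as np

desc_bow = np.random.choice([-1.0, 1.0], size=(4, 6))  # 4 instances, 6 vocab types
color_feats = np.random.rand(4, 3, 5)                  # 3 context colors, 5 features
feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
assert feats.shape == (4, 3, 6, 5)
# Each (i, c) slice is the outer product of bow_i and color_ic:
assert np.allclose(feats[2, 1], np.outer(desc_bow[2], color_feats[2, 1]))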
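
# The renormalization in predict_and_score turns the per-color positive-class
# log probabilities into a log distribution over the context; a small
# standalone check (numbers are made up):
import numpy as np
from scipy.special import logsumexp

all_scores = np.log(np.array([[0.9, 0.3, 0.3]]))  # P(correct) for each color
all_scores -= logsumexp(all_scores, axis=1)[:, np.newaxis]
assert np.allclose(np.exp(all_scores).sum(axis=1), 1.0)
print(np.exp(all_scores))  # approx. [[0.6 0.2 0.2]]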