Beispiel #1
0
 def __init__(self):
     '''
     Create the vectorizer and the Theano shared variables backing the prior.

     `add_all` is called on the vectorizer with the '</s>' and '<MASK>'
     tokens before anything else, and the count vector starts out as int32
     zeros with one entry per vocabulary type.
     '''
     vectorizer = SequenceVectorizer()
     vectorizer.add_all([['</s>'], ['<MASK>']])
     self.vec = vectorizer

     initial_counts = np.zeros((vectorizer.num_types,), dtype=np.int32)
     self.counts = theano.shared(initial_counts)
     self.total = theano.shared(np.array(0, dtype=np.int32))

     # NOTE(review): despite the attribute name, this expression is
     # counts / total with no logarithm applied -- confirm what callers expect.
     counts_f32 = T.cast(self.counts, 'float32')
     total_f32 = T.cast(self.total, 'float32')
     self.log_probs = counts_f32 / total_f32

     self.mask_index = vectorizer.vectorize(['<MASK>'])[0]
Beispiel #2
0
def count_unks():
    '''
    Print the configured options, then report unknown-token ratios.

    The vectorizer is initialized from the tokenized training data only;
    `print_unk_ratio` is then called for both the training and the eval
    sequences against that vocabulary.
    '''
    opts = config.options()

    print('Data source: {}'.format(opts.data_source))
    print('Unk threshold: {}'.format(opts.unk_threshold))
    print('Tokenizer: {}'.format(opts.tokenizer))

    print('')
    print('Loading data')
    source = color_instances.SOURCES[opts.data_source]
    train_insts = source.train_data(listener=True)
    eval_insts = source.test_data(listener=True)

    tokenize = TOKENIZERS[opts.tokenizer]
    vec = SequenceVectorizer(unk_threshold=opts.unk_threshold)

    def with_boundaries(insts):
        # Wrap every tokenized input in start/end-of-sequence markers.
        wrapped = []
        for inst in insts:
            wrapped.append(['<s>'] + tokenize(inst.input) + ['</s>'])
        return wrapped

    print('Tokenizing training data')
    train_tokenized = with_boundaries(train_insts)
    print('Tokenizing eval data')
    eval_tokenized = with_boundaries(eval_insts)
    print('Initializing vectorizer')
    vec.add_all(train_tokenized)

    print_unk_ratio(train_tokenized, vec, 'Train')
    print_unk_ratio(eval_tokenized, vec, 'Eval')
Beispiel #3
0
 def __init__(self):
     '''
     Start with empty token statistics and a fresh vectorizer.

     The tokenizer name and the unk threshold are both read from the
     global configuration options.
     '''
     opts = config.options()
     self.num_tokens = 0
     self.token_counts = Counter()
     self.tokenizer = opts.speaker_tokenizer
     self.seq_vec = SequenceVectorizer(
         unk_threshold=opts.speaker_unk_threshold)
Beispiel #4
0
 def __init__(self, id=None):
     '''
     Initialize the parent learner, then create the word counter, the
     sequence vectorizer and the bucketed color vectorizer.

     :param id: forwarded unchanged to the parent constructor.
     '''
     super(ListenerLearner, self).__init__(id=id)
     # presumably `self.options` is populated by the parent constructor --
     # it is read here without being assigned in this method.
     opts = self.options
     self.word_counts = Counter()
     self.seq_vec = SequenceVectorizer()
     self.color_vec = BucketsVectorizer(opts.listener_color_resolution,
                                        hsv=opts.listener_hsv)
Beispiel #5
0
 def __init__(self, id=None, context_len=1):
     '''
     Initialize the parent learner and create the sequence and color
     vectorizers used by the speaker.

     :param id: forwarded unchanged to the parent constructor.
     :param context_len: stored on the instance as `self.context_len`.
     '''
     super(SpeakerLearner, self).__init__(id=id)
     # presumably `self.options` is populated by the parent constructor.
     opts = self.options
     self.seq_vec = SequenceVectorizer()
     make_color_vec = COLOR_REPRS[opts.speaker_color_repr]
     self.color_vec = make_color_vec(opts.speaker_color_resolution,
                                     hsv=opts.speaker_hsv)
     self.context_len = context_len
    def _data_to_arrays(self,
                        instances,
                        inverted=False,
                        init_vectorizer=False):
        '''
        Convert instances into a feature matrix and a 0/1 label vector.

        Each instance contributes `self.context_len` rows, one per alternative
        color. A row's features are the flattened outer product of a +1/-1
        bag-of-words vector for the description with that color's features.

        :param instances: instances to convert; iterated several times.
        :param inverted: if true, the description is read from `output` and
            the target and alternatives from `input`/`alt_inputs`; otherwise
            from `input` and `output`/`alt_outputs` respectively.
        :param init_vectorizer: if true, rebuild the sequence vectorizer and
            color vectorizer from this data, dump the unk-replaced
            descriptions, and take the context length from the first instance.
        :return: `(feats, labels)`.
        '''
        self.get_options()

        if inverted:
            desc_attr, target_attr, alts_attr = 'output', 'input', 'alt_inputs'
        else:
            desc_attr, target_attr, alts_attr = 'input', 'output', 'alt_outputs'

        def get_desc(inst):
            return getattr(inst, desc_attr)

        def get_color(inst):
            return getattr(inst, target_attr)

        def get_alt_colors(inst):
            return getattr(inst, alts_attr)

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(get_desc(inst)) for inst in instances]
        context_lens = [len(get_alt_colors(inst)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            self.context_len = context_lens[0]

            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        assert all(cl == self.context_len
                   for cl in context_lens), (self.context_len, context_lens)

        # Pad with '</s>' (or truncate) so every description has max_len tokens.
        max_len = self.seq_vec.max_len
        padded = []
        for desc in unk_replaced:
            filler = ['</s>'] * (max_len - len(desc))
            padded.append((desc + filler)[:max_len])

        colors = []
        for inst in instances:
            colors.extend(get_alt_colors(inst))

        # Label is 1 for the context position equal to the instance's target.
        label_values = []
        for inst in instances:
            target = get_color(inst)
            label_values.extend(int(pos == target)
                                for pos in range(self.context_len))
        labels = np.array(label_values)

        desc_indices = self.seq_vec.vectorize_all(padded)
        num_descs = desc_indices.shape[0]
        # -1 everywhere, +1 in the columns of the token indices that occur.
        desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
        row_ids = np.arange(num_descs)[:, np.newaxis]
        desc_bow[row_ids, desc_indices] = 1.

        color_feats = self.color_vec.vectorize_all(colors)
        color_feats = color_feats.reshape(
            (num_descs, self.context_len, color_feats.shape[1]))
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((num_descs * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))

        return feats, labels
Beispiel #7
0
class UnigramLMSpeakerLearner(Learner):
    '''
    Unigram language model over the tokens of instance outputs.

    Training counts tokens (after unk replacement); scoring sums the log
    relative frequency of each token in an output. Predictions are always
    empty strings.
    '''

    def __init__(self):
        opts = config.options()
        self.num_tokens = 0
        self.token_counts = Counter()
        self.tokenizer = opts.speaker_tokenizer
        self.seq_vec = SequenceVectorizer(
            unk_threshold=opts.speaker_unk_threshold)

    def train(self,
              training_instances,
              validation_instances='ignored',
              metrics='ignored'):
        '''
        Accumulate token counts from the outputs of `training_instances`.

        `validation_instances` and `metrics` are not used.
        '''
        tokenize = TOKENIZERS[self.tokenizer]

        tokenized = []
        for inst in training_instances:
            tokenized.append(tokenize(inst.output) + ['</s>'])
        self.seq_vec.add_all(tokenized)
        utterances = self.seq_vec.unk_replace_all(tokenized)

        progress.start_task('Example', len(training_instances))
        for step, utterance in enumerate(utterances):
            progress.progress(step)
            self.token_counts.update(utterance)
            self.num_tokens += len(utterance)
        progress.end_task()

    @property
    def num_params(self):
        '''Number of distinct tokens that have been counted.'''
        return len(self.token_counts)

    def predict_and_score(self, eval_instances):
        '''
        Return `(predictions, scores)`: one empty-string prediction and one
        log probability of `inst.output` per evaluation instance.
        '''
        num_insts = len(eval_instances)
        predictions = [''] * num_insts
        scores = []
        progress.start_task('Example', num_insts)
        for step, inst in enumerate(eval_instances):
            progress.progress(step)
            scores.append(self._get_log_prob(inst.output))
        progress.end_task()
        return predictions, scores

    def _get_log_prob(self, output):
        '''
        Sum of log(count / total tokens) over the unk-replaced tokens of
        `output`, with a trailing '</s>' appended before replacement.
        '''
        tokens = TOKENIZERS[self.tokenizer](output) + ['</s>']
        total = 0.0
        for token in self.seq_vec.unk_replace(tokens):
            relative_freq = self.token_counts[token] * 1.0 / self.num_tokens
            total += np.log(relative_freq)
        return total
class UnigramLMSpeakerLearner(Learner):
    '''
    Speaker "model" that scores utterances with a unigram token distribution
    estimated from the training outputs. It never generates text: every
    prediction is the empty string.
    '''

    def __init__(self):
        cfg = config.options()
        self.tokenizer = cfg.speaker_tokenizer
        unk_threshold = cfg.speaker_unk_threshold
        self.seq_vec = SequenceVectorizer(unk_threshold=unk_threshold)
        self.token_counts = Counter()
        self.num_tokens = 0

    def train(self, training_instances, validation_instances='ignored', metrics='ignored'):
        '''
        Count the tokens of every training output (plus a closing '</s>').

        The two extra parameters are accepted but unused.
        '''
        tokenizer_fn = TOKENIZERS[self.tokenizer]

        sequences = [tokenizer_fn(inst.output) + ['</s>'] for inst in training_instances]
        self.seq_vec.add_all(sequences)
        replaced = self.seq_vec.unk_replace_all(sequences)

        progress.start_task('Example', len(training_instances))
        for idx, seq in enumerate(replaced):
            progress.progress(idx)
            self.token_counts.update(seq)
            self.num_tokens += len(seq)
        progress.end_task()

    @property
    def num_params(self):
        '''Size of the token count table.'''
        return len(self.token_counts)

    def predict_and_score(self, eval_instances):
        '''
        Score each instance's output under the unigram model.

        :return: a pair of lists: empty-string predictions and log-prob scores.
        '''
        count = len(eval_instances)
        progress.start_task('Example', count)
        log_probs = []
        for idx, inst in enumerate(eval_instances):
            progress.progress(idx)
            log_probs.append(self._get_log_prob(inst.output))
        progress.end_task()
        return [''] * count, log_probs

    def _get_log_prob(self, output):
        '''Log probability of `output` as a sum of per-token log frequencies.'''
        tokenizer_fn = TOKENIZERS[self.tokenizer]
        sequence = tokenizer_fn(output) + ['</s>']
        replaced = self.seq_vec.unk_replace(sequence)
        accumulated = 0.0
        for tok in replaced:
            accumulated += np.log(self.token_counts[tok] * 1.0 / self.num_tokens)
        return accumulated
Beispiel #9
0
class UnigramPrior(object):
    '''
    Unigram prior over whitespace-tokenized utterances, backed by Theano
    shared variables so that `apply` can be used inside a symbolic graph.

    >>> p = UnigramPrior()
    >>> p.train([instance.Instance('blue')])
    >>> p.sample(3)  # doctest: +ELLIPSIS
    [Instance('...', None), Instance('...', None), Instance('...', None)]
    '''
    def __init__(self):
        self.vec = SequenceVectorizer()
        self.vec.add_all([['</s>'], ['<MASK>']])
        self.counts = theano.shared(np.zeros((self.vec.num_types,), dtype=np.int32))
        self.total = theano.shared(np.array(0, dtype=np.int32))
        # Symbolic log relative frequency per token index. It is an expression
        # over the shared variables, so it reflects whatever `train` stores in
        # them. (Previously this held plain probabilities despite its name.)
        self.log_probs = T.log(T.cast(self.counts, 'float32') /
                               T.cast(self.total, 'float32'))
        self.mask_index = self.vec.vectorize(['<MASK>'])[0]

    def train(self, training_instances, listener_data=True):
        '''
        Estimate token counts from the utterances of `training_instances`.

        :param training_instances: instances whose utterance is a string.
        :param listener_data: if true the utterance is `inst.input`,
            otherwise `inst.output`.
        '''
        get_utt = (lambda inst: inst.input) if listener_data else (lambda inst: inst.output)
        tokenized = [get_utt(inst).split() for inst in training_instances]
        self.vec.add_all(tokenized)
        x = self.vec.vectorize_all(self.pad(tokenized, self.vec.max_len))
        vocab_size = self.vec.num_types

        counts = np.bincount(x.flatten(), minlength=vocab_size).astype(np.int32)
        # Padding tokens must not contribute probability mass.
        counts[self.mask_index] = 0
        self.counts.set_value(counts)
        self.total.set_value(np.sum(counts))

    def apply(self, input_vars):
        '''
        Symbolic log probability of the token indices in `input_vars`.

        :param input_vars: a one-element sequence holding the symbolic index
            variable `x`.
        :return: per-token log probabilities if `x` is one-dimensional,
            otherwise their sum over axis 1. Positions equal to the <MASK>
            index contribute 0.
        '''
        (x,) = input_vars

        token_log_probs = self.log_probs[x]
        if self.mask_index is not None:
            # <MASK> has a zero count, so its log probability is -inf. Select
            # with switch rather than multiplying by a 0/1 mask, because
            # -inf * 0 is nan. (The previous code multiplied by
            # eq(x, mask_index), which zeroed every real token instead.)
            token_log_probs = T.switch(T.eq(x, self.mask_index),
                                       T.zeros_like(token_log_probs),
                                       token_log_probs)
        if token_log_probs.ndim == 1:
            return token_log_probs
        else:
            return token_log_probs.sum(axis=1)

    def sample(self, num_samples=1):
        '''
        Draw `num_samples` utterances by sampling `self.vec.max_len` tokens
        independently from the empirical unigram distribution.

        :return: a list of `instance.Instance` objects built from the
            space-joined tokens after `strip_invalid_tokens`.
        '''
        # The distribution is the same for every draw; compute it once.
        # Inside this method `sample` is the module-level function.
        dist = self.counts.get_value() * 1.0 / self.total.get_value()
        indices = np.array([[sample(dist)
                             for _t in range(self.vec.max_len)]
                            for _s in range(num_samples)], dtype=np.int32)
        return [instance.Instance(' '.join(strip_invalid_tokens(s)))
                for s in self.vec.unvectorize_all(indices)]

    def pad(self, sequences, length):
        '''
        Adds </s> tokens followed by zero or more <MASK> tokens to bring the total
        length of all sequences to `length + 1` (the addition of one is because all
        sequences receive a </s>, but `length` should be the max length of the original
        sequences).

        >>> UnigramPrior().pad([['blue'], ['very', 'blue']], 2)
        [['blue', '</s>', '<MASK>'], ['very', 'blue', '</s>']]
        '''
        return [seq + ['</s>'] + ['<MASK>'] * (length - len(seq))
                for seq in sequences]
Beispiel #10
0
    def __init__(self, id=None):
        '''
        Load options, build the submodels, initialize the parent learner and
        create the sequence and color vectorizers.

        :param id: passed to `init_submodels` and to the parent constructor.
        '''
        self.get_options()
        self.init_submodels(id)
        super(RSALearner, self).__init__(id=id)

        opts = self.options
        if opts.listener:
            color_resolution = opts.listener_color_resolution
        else:
            color_resolution = opts.speaker_color_resolution

        self.seq_vec = SequenceVectorizer()
        # NOTE(review): `hsv` always comes from `speaker_hsv`, even when the
        # listener resolution was selected above -- confirm this is intended.
        self.color_vec = BucketsVectorizer(color_resolution,
                                           hsv=opts.speaker_hsv)
Beispiel #11
0
    def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
        '''
        Turn instances into `(feats, labels)` arrays.

        For every instance and every one of its `self.context_len` alternative
        colors there is one feature row (the flattened outer product of a
        +1/-1 bag-of-words description vector with the color's features) and
        one label (1 if that position equals the instance's target, else 0).

        :param instances: instances to convert; iterated several times.
        :param inverted: swap the roles of input and output (description from
            `output`, target from `input`, alternatives from `alt_inputs`).
        :param init_vectorizer: if true, (re)create the vectorizers from this
            data, dump the unk-replaced descriptions and fix the context
            length from the first instance.
        '''
        self.get_options()

        if inverted:
            desc_field, target_field, alts_field = 'output', 'input', 'alt_inputs'
        else:
            desc_field, target_field, alts_field = 'input', 'output', 'alt_outputs'

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(getattr(inst, desc_field)) for inst in instances]
        context_lens = [len(getattr(inst, alts_field)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            self.context_len = context_lens[0]

            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        assert all(cl == self.context_len for cl in context_lens), (self.context_len, context_lens)

        # Bring every description to exactly max_len tokens.
        target_len = self.seq_vec.max_len
        padded = []
        for tokens in unk_replaced:
            extended = tokens + ['</s>'] * (target_len - len(tokens))
            padded.append(extended[:target_len])

        colors = []
        label_values = []
        for inst in instances:
            colors.extend(getattr(inst, alts_field))
        for inst in instances:
            target = getattr(inst, target_field)
            for position in range(self.context_len):
                label_values.append(int(position == target))
        labels = np.array(label_values)

        desc_indices = self.seq_vec.vectorize_all(padded)
        n_rows = desc_indices.shape[0]
        # Start at -1 and set +1 at the token indices each description contains.
        desc_bow = -np.ones((n_rows, self.seq_vec.num_types))
        desc_bow[np.arange(n_rows)[:, np.newaxis], desc_indices] = 1.

        color_feats = self.color_vec.vectorize_all(colors)
        color_feats = color_feats.reshape((n_rows, self.context_len, color_feats.shape[1]))

        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((n_rows * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))

        return feats, labels
Beispiel #12
0
class LRContextListenerLearner(Learner):
    '''
    Listener that picks one color out of a fixed-size context using logistic
    regression over (description bag-of-words x color feature) products.
    '''

    def train(self,
              training_instances,
              validation_instances=None,
              metrics=None):
        '''
        Fit the logistic regression on `training_instances`.

        `validation_instances` and `metrics` are accepted but unused.
        '''
        features, targets = self._data_to_arrays(training_instances,
                                                 init_vectorizer=True)
        self.mod = LogisticRegression(solver='lbfgs')
        self.mod.fit(features, targets)

    @property
    def num_params(self):
        '''Total number of coefficient and intercept entries in the model.'''
        num_coefs = np.prod(self.mod.coef_.shape)
        num_intercepts = np.prod(self.mod.intercept_.shape)
        return num_coefs + num_intercepts

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Predict a context index for each instance and score the true one.

        The positive-class log probabilities are renormalized across each
        instance's context before taking the argmax and the score.
        `random` and `verbosity` are accepted but unused.

        :return: `(predictions, scores)` as Python lists.
        '''
        X, y = self._data_to_arrays(eval_instances)
        grid_shape = (len(eval_instances), self.context_len)
        y = y.reshape(grid_shape)
        all_scores = self.mod.predict_log_proba(X)[:, 1].reshape(grid_shape)
        all_scores -= logsumexp(all_scores, axis=1)[:, np.newaxis]

        predictions = all_scores.argmax(axis=1)
        # Keep only the score at the labelled position of each row.
        true_scores = np.where(y, all_scores, 0).sum(axis=1)

        return predictions.tolist(), true_scores.tolist()

    def _data_to_arrays(self,
                        instances,
                        inverted=False,
                        init_vectorizer=False):
        '''
        Convert instances into a feature matrix and a 0/1 label vector with
        `self.context_len` rows per instance.

        :param instances: instances to convert; iterated several times.
        :param inverted: if true, the description comes from `output` and the
            target and alternatives from `input`/`alt_inputs`.
        :param init_vectorizer: if true, rebuild the vectorizers from this
            data, dump the unk-replaced descriptions and set the context
            length from the first instance.
        :return: `(feats, labels)`.
        '''
        self.get_options()

        if inverted:
            desc_attr, target_attr, alts_attr = 'output', 'input', 'alt_inputs'
        else:
            desc_attr, target_attr, alts_attr = 'input', 'output', 'alt_outputs'

        def get_desc(inst):
            return getattr(inst, desc_attr)

        def get_color(inst):
            return getattr(inst, target_attr)

        def get_alt_colors(inst):
            return getattr(inst, alts_attr)

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(get_desc(inst)) for inst in instances]
        context_lens = [len(get_alt_colors(inst)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            self.context_len = context_lens[0]

            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        assert all(cl == self.context_len
                   for cl in context_lens), (self.context_len, context_lens)

        # Pad with '</s>' (or truncate) so every description has max_len tokens.
        max_len = self.seq_vec.max_len
        padded = []
        for desc in unk_replaced:
            filler = ['</s>'] * (max_len - len(desc))
            padded.append((desc + filler)[:max_len])

        colors = []
        for inst in instances:
            colors.extend(get_alt_colors(inst))

        # Label is 1 for the context position equal to the instance's target.
        label_values = []
        for inst in instances:
            target = get_color(inst)
            label_values.extend(int(pos == target)
                                for pos in range(self.context_len))
        labels = np.array(label_values)

        desc_indices = self.seq_vec.vectorize_all(padded)
        num_descs = desc_indices.shape[0]
        # -1 everywhere, +1 in the columns of the token indices that occur.
        desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
        row_ids = np.arange(num_descs)[:, np.newaxis]
        desc_bow[row_ids, desc_indices] = 1.

        color_feats = self.color_vec.vectorize_all(colors)
        color_feats = color_feats.reshape(
            (num_descs, self.context_len, color_feats.shape[1]))
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((num_descs * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))

        return feats, labels

    def get_options(self):
        '''Load the global options onto `self.options` if not already set.'''
        if hasattr(self, 'options'):
            return
        self.options = config.options()
Beispiel #13
0
class SpeakerLearner(NeuralLearner):
    '''
    A speaker with a feedforward neural net color input passed into an RNN
    to generate a description.
    '''
    def __init__(self, id=None, context_len=1):
        '''
        :param id: forwarded unchanged to the parent constructor.
        :param context_len: number of colors per instance; values above 1
            make `_data_to_arrays` include the alternative colors.
        '''
        super(SpeakerLearner, self).__init__(id=id)
        self.seq_vec = SequenceVectorizer(
            unk_threshold=self.options.speaker_unk_threshold)
        color_repr = COLOR_REPRS[self.options.speaker_color_repr]
        self.color_vec = color_repr(self.options.speaker_color_resolution,
                                    hsv=self.options.speaker_hsv)
        self.context_len = context_len

    @property
    def use_color_mask(self):
        '''Whether the model takes an extra color-mask input (never, here).'''
        return False

    def predict(self, eval_instances, random=False, verbosity=0):
        '''
        Generate one description string per instance, batch by batch.

        :param eval_instances: instances to describe.
        :param random: if true, sample one token per step with a beam of
            size 1; otherwise run `beam_search_step` with a beam of
            `options.speaker_beam_size`.
        :param verbosity: added to `options.verbosity` to decide how much
            progress output to produce.
        :return: list of space-joined token strings, taken from beam
            position 0 of each instance.
        '''
        result = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.speaker_eval_batch_size)
        num_batches = (len(eval_instances) -
                       1) // self.options.speaker_eval_batch_size + 1

        eos_index = self.seq_vec.vectorize(['</s>'])[0]

        if self.options.verbosity + verbosity >= 2:
            print('Predicting')
        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Predict batch', num_batches)
        for batch_num, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(batch_num)
            batch = list(batch)

            if self.use_color_mask:
                (c, cm, _p, mask), (_y, ) = self._data_to_arrays(batch,
                                                                 test=True)
            else:
                (c, _p, mask), (_y, ) = self._data_to_arrays(batch, test=True)
            assert mask.all()  # We shouldn't be masking anything in prediction

            beam_size = 1 if random else self.options.speaker_beam_size
            # Plain `bool`: the `np.bool` alias no longer exists in current
            # NumPy releases.
            done = np.zeros((len(batch), beam_size), dtype=bool)
            beam = np.zeros((len(batch), beam_size, self.seq_vec.max_len),
                            dtype=np.int32)
            beam[:, :, 0] = self.seq_vec.vectorize(['<s>'])[0]
            # All beam entries start at log-probability -inf except the first,
            # which starts at 0 (same values as log(0) without the warning).
            beam_scores = np.full((len(batch), beam_size), -np.inf)
            beam_scores[:, 0] = 0.0

            c = np.repeat(c, beam_size, axis=0)
            mask = np.repeat(mask, beam_size, axis=0)
            if self.use_color_mask:
                cm = np.repeat(cm, beam_size, axis=0)

            for length in range(1, self.seq_vec.max_len):
                if done.all():
                    break
                # Flatten (batch, beam) and drop the last position to form the
                # "previous tokens" input.
                p = beam.reshape(
                    (beam.shape[0] * beam.shape[1], beam.shape[2]))[:, :-1]
                inputs = [c, cm, p, mask
                          ] if self.use_color_mask else [c, p, mask]
                probs = self.model.predict(inputs)
                if random:
                    indices = sample(probs[:, length - 1, :])
                    beam[:, 0, length] = indices
                    done = np.logical_or(done, indices == eos_index)
                else:
                    assert probs.shape[1] == p.shape[1], (probs.shape[1],
                                                          p.shape[1])
                    assert probs.shape[2] == len(
                        self.seq_vec.tokens), (probs.shape[2],
                                               len(self.seq_vec.tokens))
                    scores = np.log(probs)[:, length - 1, :].reshape(
                        (beam.shape[0], beam.shape[1], probs.shape[2]))
                    beam_search_step(scores, length, beam, beam_scores, done,
                                     eos_index)
            outputs = self.seq_vec.unvectorize_all(beam[:, 0, :])
            result.extend([' '.join(strip_invalid_tokens(o)) for o in outputs])
        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return result

    def score(self, eval_instances, verbosity=0):
        '''
        Score the true description of each instance under the model.

        :return: list with, per instance, the sum over unmasked positions of
            the log probability the model gives to the target token.
        '''
        result = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.speaker_eval_batch_size)
        num_batches = (len(eval_instances) -
                       1) // self.options.speaker_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Scoring')
        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Score batch', num_batches)
        for batch_num, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(batch_num)
            batch = list(batch)

            xs, (n, ) = self._data_to_arrays(batch, test=False)
            if self.use_color_mask:
                mask = xs[3]
            else:
                mask = xs[2]

            probs = self.model.predict(xs)
            # Pick, for every (row, position), the probability of the target
            # token index stored in `n`.
            token_probs = probs[np.arange(probs.shape[0])[:, np.newaxis],
                                np.arange(probs.shape[1]), n]
            scores_arr = np.sum(np.log(token_probs) * mask, axis=1)
            scores = scores_arr.tolist()
            result.extend(scores)
        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return result

    def _data_to_arrays(self,
                        training_instances,
                        init_vectorizer=False,
                        test=False,
                        inverted=False):
        '''
        Convert instances into model input and target arrays.

        :param training_instances: instances to convert.
        :param init_vectorizer: if true, add the tokenized descriptions to
            the sequence vectorizer and dump the unk-replaced sequences.
        :param test: if true, ignore the descriptions and use '<s>' followed
            by '</s>' padding as the token sequence.
        :param inverted: if true, colors come from `output` and descriptions
            from `input` instead of the other way round.
        :return: `([c, P, mask], [N])` where `P` holds previous-token
            indices, `N` next-token indices, `mask` is 0 at '<MASK>' target
            positions, and `c` is the color array repeated along a new axis
            of length `max_len - 1`.
        '''
        context_len = self.context_len if hasattr(self, 'context_len') else 1
        use_context = context_len > 1

        def get_multi(val):
            # Unwrap a one-element tuple; pass anything else through.
            if isinstance(val, tuple):
                assert len(val) == 1
                return val[0]
            else:
                return val

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_color, get_desc_simple = (get_o, get_i) if inverted else (get_i,
                                                                      get_o)
        get_desc = lambda inst: get_multi(get_desc_simple(inst))
        get_i_ind, get_o_ind = (
            (lambda inst: inst.alt_inputs[get_multi(inst.input)]),
            (lambda inst: inst.alt_outputs[get_multi(inst.output)]))
        get_color_indexed = get_o_ind if inverted else get_i_ind
        get_alt_i, get_alt_o = (lambda inst: inst.alt_inputs), (
            lambda inst: inst.alt_outputs)
        get_alt_colors = get_alt_o if inverted else get_alt_i

        if hasattr(self.options, 'speaker_tokenizer'):
            tokenize = TOKENIZERS[self.options.speaker_tokenizer]
        else:
            tokenize = TOKENIZERS['whitespace']

        if init_vectorizer:
            tokenized = [['<s>'] + tokenize(get_desc(inst)) + ['</s>']
                         for inst in training_instances]
            self.seq_vec.add_all(tokenized)
            unk_replaced = self.seq_vec.unk_replace_all(tokenized)
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        colors = []
        previous = []
        next_tokens = []
        if self.options.verbosity >= 9:
            print('%s _data_to_arrays:' % self.id)
        for inst in training_instances:
            desc, color = get_desc(inst), get_color(inst)
            if isinstance(color, numbers.Number):
                # The color is given as an index into the alternatives.
                color = get_color_indexed(inst)
            if test:
                full = ['<s>'] + ['</s>'] * (self.seq_vec.max_len - 1)
            else:
                desc = tokenize(desc)
                full = (['<s>'] + desc + ['</s>'] + ['<MASK>'] *
                        (self.seq_vec.max_len - 1 - len(desc)))
            prev = full[:-1]
            next_toks = full[1:]
            if self.options.verbosity >= 9:
                print('%s, %s -> %s' % (repr(color), repr(prev),
                                        repr(next_toks)))
            colors.append(color)
            if use_context:
                # Target color first, then the remaining context colors.
                new_context = get_alt_colors(inst)
                index = get_color(inst)
                if isinstance(index, tuple):
                    assert len(index) == 1
                    index = index[0]
                assert len(new_context) == context_len, \
                    'Inconsistent context lengths: %s' % ((context_len, len(new_context)),)
                colors.extend(
                    [c for j, c in enumerate(new_context) if j != index])
            previous.append(prev)
            next_tokens.append(next_toks)

        P = np.zeros((len(previous), self.seq_vec.max_len - 1), dtype=np.int32)
        mask = np.zeros((len(previous), self.seq_vec.max_len - 1),
                        dtype=np.int32)
        N = np.zeros((len(next_tokens), self.seq_vec.max_len - 1),
                     dtype=np.int32)
        c = self.color_vec.vectorize_all(colors, hsv=True)
        # Floor division keeps the reshape dimension an integer even when
        # `/` is true division.
        num_rows = len(colors) // context_len
        if len(c.shape) == 1:
            c = c.reshape((num_rows, context_len))
        else:
            c = c.reshape((num_rows,
                           context_len * c.shape[1]) + c.shape[2:])
        for i, (prev, next_toks) in enumerate(zip(previous, next_tokens)):
            # Truncate sequences that are longer than the arrays allow.
            if len(prev) > P.shape[1]:
                prev = prev[:P.shape[1]]
            if len(next_toks) > N.shape[1]:
                next_toks = next_toks[:N.shape[1]]
            P[i, :len(prev)] = self.seq_vec.vectorize(prev)
            N[i, :len(next_toks)] = self.seq_vec.vectorize(next_toks)
            for t, token in enumerate(next_toks):
                mask[i, t] = (token != '<MASK>')
        c = np.tile(c[:, np.newaxis, ...],
                    [1, self.seq_vec.max_len - 1] + [1] * (c.ndim - 1))

        if self.options.verbosity >= 9:
            print('c: %s' % (repr(c), ))
            print('P: %s' % (repr(P), ))
            print('mask: %s' % (repr(mask), ))
            print('N: %s' % (repr(N), ))
        return [c, P, mask], [N]

    def _build_model(self, model_class=SimpleLasagneModel):
        '''
        Create the symbolic input/target variables, build the network with
        `_get_l_out` and wrap it in `model_class` as `self.model`.
        '''
        id_tag = (self.id + '/') if self.id else ''

        input_vars = self.color_vec.get_input_vars(
            self.id, recurrent=not self.use_color_mask)
        if self.use_color_mask:
            input_vars.append(T.imatrix(id_tag + 'color_mask'))
        input_vars.extend(
            [T.imatrix(id_tag + 'previous'),
             T.imatrix(id_tag + 'mask')])
        target_var = T.imatrix(id_tag + 'targets')

        self.l_out, self.input_layers = self._get_l_out(input_vars)
        self.model = model_class(
            input_vars, [target_var],
            self.l_out,
            id=self.id,
            loss=self.masked_loss(input_vars),
            optimizer=OPTIMIZERS[self.options.speaker_optimizer],
            learning_rate=self.options.speaker_learning_rate)

    def train_priors(self, training_instances, listener_data=False):
        '''
        Create and train `self.prior_emp` and `self.prior_smooth`, both of
        the class selected by `options.speaker_prior`.
        '''
        prior_class = PRIORS[self.options.speaker_prior]
        self.prior_emp = prior_class(recurrent=True)
        self.prior_smooth = prior_class(recurrent=True)

        self.prior_emp.train(training_instances, listener_data=listener_data)
        self.prior_smooth.train(training_instances,
                                listener_data=listener_data)

    def _get_l_out(self, input_vars):
        '''
        Build the network layers.

        The last two entries of `input_vars` are the previous-token and mask
        variables; everything before them is passed to the color vectorizer.

        :return: `(l_out, input_layers)`: the output layer, reshaped to
            (-1, max_len - 1, number of tokens), and the list of input layers.
        '''
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        prev_output_var, mask_var = input_vars[-2:]
        color_input_vars = input_vars[:-2]

        context_len = self.context_len if hasattr(self, 'context_len') else 1
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=self.seq_vec.max_len - 1,
            cell_size=self.options.speaker_cell_size,
            context_len=context_len,
            id=self.id)
        # Swap the last two axes around the NIN layers and back afterwards.
        l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
        for i in range(1, self.options.speaker_hidden_color_layers + 1):
            l_hidden_color = NINLayer(
                l_hidden_color,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_color%d' % i)
        l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(
            l_prev_out,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.speaker_cell_size,
            name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_hidden_color, l_prev_embed],
                           axis=2,
                           name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var,
                               name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input':
            (None if self.options.speaker_no_mask else l_mask_in),
            'grad_clipping': self.options.speaker_grad_clipping,
            'num_units': self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.speaker_nonlinearity]

        # All recurrent layers but the last may be followed by dropout.
        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec,
                                          p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop,
                     name=id_tag +
                     'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out,
                num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out,
                               num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax,
                               name=id_tag + 'softmax')
        l_out = ReshapeLayer(
            l_softmax,
            (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
            name=id_tag + 'out')

        return l_out, color_inputs + [l_prev_out, l_mask_in]

    def loss_out(self, input_vars=None, target_var=None):
        '''
        Symbolic masked loss of the network output for the given variables,
        defaulting to the model's own input and target variables.
        '''
        if input_vars is None:
            input_vars = self.model.input_vars
        if target_var is None:
            target_var = self.model.target_var
        pred = get_output(self.l_out, dict(zip(self.input_layers, input_vars)))
        loss = self.masked_loss(input_vars)
        return loss(pred, target_var)

    def masked_loss(self, input_vars):
        '''Loss function built from the last input variable (the mask).'''
        return masked_seq_crossentropy(input_vars[-1])

    def sample_prior_smooth(self, num_samples):
        '''Draw `num_samples` samples from `self.prior_smooth`.'''
        return self.prior_smooth.sample(num_samples)
Beispiel #14
0
class ListenerLearner(NeuralLearner):
    '''
    An LSTM-based listener (guesses colors from descriptions).

    Descriptions are tokenized and indexed with a `SequenceVectorizer`;
    colors are mapped to bucket indices with a `BucketsVectorizer`. The
    network outputs a softmax distribution over color buckets. The recurrent
    cell type is actually chosen by the `listener_cell` option.
    '''
    def __init__(self, id=None):
        '''
        Create the description and color vectorizers from `self.options`.
        '''
        super(ListenerLearner, self).__init__(id=id)
        # Counts of whole descriptions seen while initializing the
        # vectorizer; on_iter_end visualizes the most common ones.
        self.word_counts = Counter()
        self.seq_vec = SequenceVectorizer(
            unk_threshold=self.options.listener_unk_threshold)
        self.color_vec = BucketsVectorizer(
            self.options.listener_color_resolution,
            hsv=self.options.listener_hsv)

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Predict a color for each instance and score its true color.

        :param eval_instances: instances to evaluate; must support `len()`.
        :param random: if true, sample a bucket from the predicted
            distribution instead of taking the most probable one.
        :param verbosity: added to `self.options.verbosity` when deciding
            whether to print the 'Testing' banner.
        :return: `(predictions, scores)`, two lists with one entry per
            instance. Each score is the log probability the model assigns to
            the instance's true color bucket plus `bucket_adjustment()`.
        '''
        predictions = []
        scores = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.listener_eval_batch_size)
        # Ceiling division; yields 0 batches for an empty instance list.
        num_batches = (len(eval_instances) -
                       1) // self.options.listener_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)

            xs, (y, ) = self._data_to_arrays(batch, test=True)

            probs = self.model.predict(xs)
            if random:
                indices = sample(probs)
                predictions.extend(self.unvectorize(indices, random=True))
            else:
                predictions.extend(self.unvectorize(probs.argmax(axis=1)))
            # Pick out, for each row, the probability of its true bucket.
            scores_arr = np.log(probs[np.arange(len(batch)),
                                      y]) + self.bucket_adjustment()
            scores.extend(scores_arr.tolist())
        progress.end_task()
        if self.options.verbosity >= 9:
            # The format operator must be applied to the string, not to the
            # return value of print() (which is None when print is a
            # function).
            print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
            for inst, prediction in zip(eval_instances, predictions):
                print('%s -> %s' % (repr(inst.input), repr(prediction)))

        return predictions, scores

    def unvectorize(self, indices, random=False):
        '''
        Convert color bucket indices back into colors via `self.color_vec`,
        always requesting HSV output.
        '''
        return self.color_vec.unvectorize_all(indices, random=random, hsv=True)

    def bucket_adjustment(self):
        '''
        Return minus the log of the volume of one color bucket, where the
        whole color space is taken to have volume 256**3 and is divided
        evenly among `self.color_vec.num_types` buckets.
        '''
        bucket_volume = (256.0**3) / self.color_vec.num_types
        return -np.log(bucket_volume)

    def on_iter_end(self, step, writer):
        '''
        Log images of the predicted color distribution for the 10 most
        common training descriptions, then defer to the parent hook.
        '''
        most_common = [
            desc for desc, _ in self.word_counts.most_common(10)
        ]
        # These instances carry no color, so test=True is needed to let
        # _data_to_arrays substitute a placeholder.
        insts = [instance.Instance(input=desc) for desc in most_common]
        xs, _ = self._data_to_arrays(insts, test=True)
        probs = self.model.predict(xs)
        for i, desc in enumerate(most_common):
            dist = probs[i, :]
            for image, channel in zip(
                    self.color_vec.visualize_distribution(dist), '012'):
                writer.log_image(step, '%s/%s/%s' % (self.id, desc, channel),
                                 image)
        super(ListenerLearner, self).on_iter_end(step, writer)

    def _data_to_arrays(self,
                        training_instances,
                        init_vectorizer=False,
                        test=False,
                        inverted=False):
        '''
        Convert instances into the arrays consumed by the model.

        :param training_instances: instances whose description/color pairs
            are to be vectorized.
        :param init_vectorizer: if true, first add the tokenized descriptions
            to `self.seq_vec`, update `self.word_counts`, and dump the
            unk-replaced sequences to 'unk_replaced.train.jsons'.
        :param test: if true, instances with a falsy color are allowed and
            get the placeholder color (0.0, 0.0, 0.0).
        :param inverted: if true, read the description from `inst.output`
            and the color from `inst.input` instead of the other way round.
        :return: `([x], [y])` where `x` is an int32 array of token indices
            with shape (number of instances, `self.seq_vec.max_len`) and `y`
            is an int32 vector of color bucket indices.
        '''
        def get_multi(val):
            # Unwrap a 1-tuple index; anything else is passed through.
            if isinstance(val, tuple):
                assert len(val) == 1
                return val[0]
            else:
                return val

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_desc, get_color = (get_o, get_i) if inverted else (get_i, get_o)

        # Used when the color field holds an index into the alternatives
        # rather than a color itself.
        get_i_ind, get_o_ind = (
            (lambda inst: inst.alt_inputs[get_multi(inst.input)]),
            (lambda inst: inst.alt_outputs[get_multi(inst.output)]))
        get_color_indexed = get_i_ind if inverted else get_o_ind

        if hasattr(self.options, 'listener_tokenizer'):
            tokenize = TOKENIZERS[self.options.listener_tokenizer]
        else:
            tokenize = TOKENIZERS['whitespace']

        if init_vectorizer:
            tokenized = [['<s>'] + tokenize(get_desc(inst)) + ['</s>']
                         for inst in training_instances]
            self.seq_vec.add_all(tokenized)
            unk_replaced = self.seq_vec.unk_replace_all(tokenized)
            self.word_counts.update(
                [get_desc(inst) for inst in training_instances])
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        sentences = []
        colors = []
        if self.options.verbosity >= 9:
            print('%s _data_to_arrays:' % self.id)
        for inst in training_instances:
            desc = tokenize(get_desc(inst))
            color = get_color(inst)
            if isinstance(color, numbers.Number):
                color = get_color_indexed(inst)
            if not color:
                assert test
                color = (0.0, 0.0, 0.0)
            # Left-pad with '<s>' so the sequence (with its final '</s>')
            # fills max_len; a negative pad count simply yields no padding.
            s = ['<s>'] * (self.seq_vec.max_len - 1 - len(desc)) + desc
            s.append('</s>')
            if self.options.verbosity >= 9:
                print('%s -> %s' % (repr(s), repr(color)))
            sentences.append(s)
            colors.append(color)

        x = np.zeros((len(sentences), self.seq_vec.max_len), dtype=np.int32)
        y = np.zeros((len(sentences), ), dtype=np.int32)
        for i, sentence in enumerate(sentences):
            # Over-long sequences are truncated on the right.
            if len(sentence) > x.shape[1]:
                sentence = sentence[:x.shape[1]]
            x[i, :] = self.seq_vec.vectorize(sentence)
            y[i] = self.color_vec.vectorize(colors[i], hsv=True)

        return [x], [y]

    def _build_model(self, model_class=SimpleLasagneModel):
        '''
        Create the symbolic input/target variables, build the network, and
        wrap it in `model_class` with a categorical cross-entropy loss and
        the configured optimizer and learning rate.
        '''
        id_tag = (self.id + '/') if self.id else ''

        input_var = T.imatrix(id_tag + 'inputs')
        target_var = T.ivector(id_tag + 'targets')

        self.l_out, self.input_layers = self._get_l_out([input_var])
        self.loss = categorical_crossentropy

        self.model = model_class(
            [input_var], [target_var],
            self.l_out,
            loss=self.loss,
            optimizer=OPTIMIZERS[self.options.listener_optimizer],
            learning_rate=self.options.listener_learning_rate,
            id=self.id)

    def train_priors(self, training_instances, listener_data=False):
        '''
        Instantiate and train the empirical and smoothed priors. Both are
        currently built from the same class with the same arguments.
        '''
        prior_class = PRIORS[self.options.listener_prior]
        self.prior_emp = prior_class(
        )  # TODO: accurate values for empirical prior
        self.prior_smooth = prior_class()

        self.prior_emp.train(training_instances, listener_data=listener_data)
        self.prior_smooth.train(training_instances,
                                listener_data=listener_data)

    def _get_l_out(self, input_vars):
        '''
        Build the network: embedding, two recurrent layers, a dense hidden
        layer, and a softmax over color buckets. Dropout layers are inserted
        after each recurrent and hidden layer when `listener_dropout` > 0.

        :param input_vars: list whose first element is the description
            input variable.
        :return: `(l_out, [l_in])`, the output layer and the list of input
            layers.
        '''
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len),
                          input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in,
            input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        # The forget-gate bias is only passed to LSTM cells, and the
        # nonlinearity is passed to every cell type except GRU.
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2,
                                       p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_hidden = DenseLayer(
            l_rec2_drop,
            num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden,
                                         p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden
        # Unnormalized scores are kept as their own layer; the softmax is
        # applied by a separate NonlinearityLayer.
        l_scores = DenseLayer(l_hidden_drop,
                              num_units=self.color_vec.num_types,
                              nonlinearity=None,
                              name=id_tag + 'scores')
        l_out = NonlinearityLayer(l_scores,
                                  nonlinearity=softmax,
                                  name=id_tag + 'out')

        return l_out, [l_in]

    def sample_prior_smooth(self, num_samples):
        '''
        Draw `num_samples` samples from the smoothed prior.
        '''
        return self.prior_smooth.sample(num_samples)
Beispiel #15
0
 def __init__(self, id=None):
     '''
     Create the listener's description counter and its sequence and color
     vectorizers from the configured listener options.
     '''
     super(ListenerLearner, self).__init__(id=id)
     self.word_counts = Counter()
     opts = self.options
     self.seq_vec = SequenceVectorizer(
         unk_threshold=opts.listener_unk_threshold)
     self.color_vec = BucketsVectorizer(
         opts.listener_color_resolution,
         hsv=opts.listener_hsv)
Beispiel #16
0
class ListenerLearner(NeuralLearner):
    '''
    An LSTM-based listener (guesses colors from descriptions).

    Descriptions are tokenized and indexed with a `SequenceVectorizer`;
    colors are mapped to bucket indices with a `BucketsVectorizer`. The
    network outputs a softmax distribution over color buckets. The recurrent
    cell type is actually chosen by the `listener_cell` option.
    '''
    def __init__(self, id=None):
        '''
        Create the description and color vectorizers from `self.options`.
        '''
        super(ListenerLearner, self).__init__(id=id)
        # Counts of whole descriptions seen while initializing the
        # vectorizer; on_iter_end visualizes the most common ones.
        self.word_counts = Counter()
        self.seq_vec = SequenceVectorizer(unk_threshold=self.options.listener_unk_threshold)
        self.color_vec = BucketsVectorizer(self.options.listener_color_resolution,
                                           hsv=self.options.listener_hsv)

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Predict a color for each instance and score its true color.

        :param eval_instances: instances to evaluate; must support `len()`.
        :param random: if true, sample a bucket from the predicted
            distribution instead of taking the most probable one.
        :param verbosity: added to `self.options.verbosity` when deciding
            whether to print the 'Testing' banner.
        :return: `(predictions, scores)`, two lists with one entry per
            instance. Each score is the log probability the model assigns to
            the instance's true color bucket plus `bucket_adjustment()`.
        '''
        predictions = []
        scores = []
        batches = iterators.iter_batches(eval_instances, self.options.listener_eval_batch_size)
        # Ceiling division; yields 0 batches for an empty instance list.
        num_batches = (len(eval_instances) - 1) // self.options.listener_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)

            xs, (y,) = self._data_to_arrays(batch, test=True)

            probs = self.model.predict(xs)
            self.on_predict(xs)
            if random:
                indices = sample(probs)
                predictions.extend(self.unvectorize(indices, random=True))
            else:
                predictions.extend(self.unvectorize(probs.argmax(axis=1)))
            # Pick out, for each row, the probability of its true bucket.
            scores_arr = np.log(probs[np.arange(len(batch)), y]) + self.bucket_adjustment()
            scores.extend(scores_arr.tolist())
        progress.end_task()
        if self.options.verbosity >= 9:
            # The format operator must be applied to the string, not to the
            # return value of print() (which is None when print is a
            # function).
            print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
            for inst, prediction in zip(eval_instances, predictions):
                print('%s -> %s' % (repr(inst.input), repr(prediction)))

        return predictions, scores

    def unvectorize(self, indices, random=False):
        '''
        Convert color bucket indices back into colors via `self.color_vec`,
        always requesting HSV output.
        '''
        return self.color_vec.unvectorize_all(indices, random=random, hsv=True)

    def bucket_adjustment(self):
        '''
        Return minus the log of the volume of one color bucket, where the
        whole color space is taken to have volume 256**3 and is divided
        evenly among `self.color_vec.num_types` buckets.
        '''
        bucket_volume = (256.0 ** 3) / self.color_vec.num_types
        return -np.log(bucket_volume)

    def on_predict(self, xs):
        '''
        Hook called by `predict_and_score` after each batch prediction with
        that batch's input arrays. Does nothing here; presumably meant to be
        overridden by subclasses.
        '''
        pass

    def on_iter_end(self, step, writer):
        '''
        Log images of the predicted color distribution for the 10 most
        common training descriptions, then defer to the parent hook.
        '''
        most_common = [desc for desc, _ in self.word_counts.most_common(10)]
        # These instances carry no color, so test=True is needed to let
        # _data_to_arrays substitute a placeholder.
        insts = [instance.Instance(input=desc) for desc in most_common]
        xs, _ = self._data_to_arrays(insts, test=True)
        probs = self.model.predict(xs)
        for i, desc in enumerate(most_common):
            dist = probs[i, :]
            for image, channel in zip(self.color_vec.visualize_distribution(dist), '012'):
                writer.log_image(step, '%s/%s/%s' % (self.id, desc, channel), image)
        super(ListenerLearner, self).on_iter_end(step, writer)

    def _data_to_arrays(self, training_instances,
                        init_vectorizer=False, test=False, inverted=False):
        '''
        Convert instances into the arrays consumed by the model.

        :param training_instances: instances whose description/color pairs
            are to be vectorized.
        :param init_vectorizer: if true, first add the tokenized descriptions
            to `self.seq_vec`, update `self.word_counts`, and dump the
            unk-replaced sequences to 'unk_replaced.train.jsons'.
        :param test: if true, instances with a falsy color are allowed and
            get the placeholder color (0.0, 0.0, 0.0).
        :param inverted: if true, read the description from `inst.output`
            and the color from `inst.input` instead of the other way round.
        :return: `([x], [y])` where `x` is an int32 array of token indices
            with shape (number of instances, `self.seq_vec.max_len`) and `y`
            is an int32 vector of color bucket indices.
        '''
        def get_multi(val):
            # Unwrap a 1-tuple index; anything else is passed through.
            if isinstance(val, tuple):
                assert len(val) == 1
                return val[0]
            else:
                return val

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_desc, get_color = (get_o, get_i) if inverted else (get_i, get_o)

        # Used when the color field holds an index into the alternatives
        # rather than a color itself.
        get_i_ind, get_o_ind = ((lambda inst: inst.alt_inputs[get_multi(inst.input)]),
                                (lambda inst: inst.alt_outputs[get_multi(inst.output)]))
        get_color_indexed = get_i_ind if inverted else get_o_ind

        if hasattr(self.options, 'listener_tokenizer'):
            tokenize = TOKENIZERS[self.options.listener_tokenizer]
        else:
            tokenize = TOKENIZERS['whitespace']

        if init_vectorizer:
            tokenized = [['<s>'] + tokenize(get_desc(inst)) + ['</s>']
                         for inst in training_instances]
            self.seq_vec.add_all(tokenized)
            unk_replaced = self.seq_vec.unk_replace_all(tokenized)
            self.word_counts.update([get_desc(inst) for inst in training_instances])
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        sentences = []
        colors = []
        if self.options.verbosity >= 9:
            print('%s _data_to_arrays:' % self.id)
        for inst in training_instances:
            desc = tokenize(get_desc(inst))
            color = get_color(inst)
            if isinstance(color, numbers.Number):
                color = get_color_indexed(inst)
            if not color:
                assert test
                color = (0.0, 0.0, 0.0)
            # Left-pad with '<s>' so the sequence (with its final '</s>')
            # fills max_len; a negative pad count simply yields no padding.
            s = ['<s>'] * (self.seq_vec.max_len - 1 - len(desc)) + desc
            s.append('</s>')
            if self.options.verbosity >= 9:
                print('%s -> %s' % (repr(s), repr(color)))
            sentences.append(s)
            colors.append(color)

        x = np.zeros((len(sentences), self.seq_vec.max_len), dtype=np.int32)
        y = np.zeros((len(sentences),), dtype=np.int32)
        for i, sentence in enumerate(sentences):
            # Over-long sequences are truncated on the right.
            if len(sentence) > x.shape[1]:
                sentence = sentence[:x.shape[1]]
            x[i, :] = self.seq_vec.vectorize(sentence)
            y[i] = self.color_vec.vectorize(colors[i], hsv=True)

        return [x], [y]

    def _build_model(self, model_class=SimpleLasagneModel):
        '''
        Create the symbolic input/target variables, build the network, and
        wrap it in `model_class` with a categorical cross-entropy loss and
        the configured optimizer and learning rate.
        '''
        id_tag = (self.id + '/') if self.id else ''

        input_var = T.imatrix(id_tag + 'inputs')
        target_var = T.ivector(id_tag + 'targets')

        self.l_out, self.input_layers = self._get_l_out([input_var])
        self.loss = categorical_crossentropy

        self.model = model_class(
            [input_var], [target_var], self.l_out,
            loss=self.loss, optimizer=OPTIMIZERS[self.options.listener_optimizer],
            learning_rate=self.options.listener_learning_rate,
            id=self.id)

    def train_priors(self, training_instances, listener_data=False):
        '''
        Instantiate and train the empirical and smoothed priors. Both are
        currently built from the same class with the same arguments.
        '''
        prior_class = PRIORS[self.options.listener_prior]
        self.prior_emp = prior_class()  # TODO: accurate values for empirical prior
        self.prior_smooth = prior_class()

        self.prior_emp.train(training_instances, listener_data=listener_data)
        self.prior_smooth.train(training_instances, listener_data=listener_data)

    def _get_l_out(self, input_vars):
        '''
        Build the network: embedding, two recurrent layers, a dense hidden
        layer, and a softmax over color buckets. Dropout layers are inserted
        after each recurrent and hidden layer when `listener_dropout` > 0.

        :param input_vars: list whose first element is the description
            input variable.
        :return: `(l_out, [l_in])`, the output layer and the list of input
            layers.
        '''
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len), input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(l_in, input_size=len(self.seq_vec.tokens),
                                    output_size=self.options.listener_cell_size,
                                    name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        # The forget-gate bias is only passed to LSTM cells, and the
        # nonlinearity is passed to every cell type except GRU.
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1, p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2, p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_hidden = DenseLayer(l_rec2_drop, num_units=self.options.listener_cell_size,
                              nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
                              name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden, p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden
        # Unnormalized scores are kept as their own layer; the softmax is
        # applied by a separate NonlinearityLayer.
        l_scores = DenseLayer(l_hidden_drop, num_units=self.color_vec.num_types, nonlinearity=None,
                              name=id_tag + 'scores')
        l_out = NonlinearityLayer(l_scores, nonlinearity=softmax, name=id_tag + 'out')

        return l_out, [l_in]

    def sample_prior_smooth(self, num_samples):
        '''
        Draw `num_samples` samples from the smoothed prior.
        '''
        return self.prior_smooth.sample(num_samples)
Beispiel #17
0
 def __init__(self):
     '''
     Start with empty token statistics and a sequence vectorizer configured
     from the speaker options.
     '''
     opts = config.options()
     self.num_tokens = 0
     self.token_counts = Counter()
     self.tokenizer = opts.speaker_tokenizer
     self.seq_vec = SequenceVectorizer(
         unk_threshold=opts.speaker_unk_threshold)
Beispiel #18
0
class LRContextListenerLearner(Learner):
    '''
    A logistic-regression listener that picks a color out of a context.

    Each (description, candidate color) pair becomes one feature row; the
    classifier's positive-class log probabilities are normalized across the
    candidates of each instance.
    '''
    def train(self, training_instances, validation_instances=None, metrics=None):
        '''
        Fit the classifier on features built from `training_instances`.
        The other two parameters are accepted but not used.
        '''
        feats, labels = self._data_to_arrays(training_instances, init_vectorizer=True)
        self.mod = LogisticRegression(solver='lbfgs')
        self.mod.fit(feats, labels)

    @property
    def num_params(self):
        '''
        Total number of entries in the fitted coefficients and intercepts.
        '''
        coef_count = np.prod(self.mod.coef_.shape)
        intercept_count = np.prod(self.mod.intercept_.shape)
        return coef_count + intercept_count

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Return `(predictions, scores)` as lists: the index of the best
        candidate per instance, and the normalized log probability of the
        candidate(s) labeled as the target.
        '''
        feats, labels = self._data_to_arrays(eval_instances)
        grid_shape = (len(eval_instances), self.context_len)
        labels = labels.reshape(grid_shape)

        positive_log_probs = self.mod.predict_log_proba(feats)[:, 1]
        all_scores = positive_log_probs.reshape(grid_shape)
        # Normalize so the candidates of each instance form a distribution.
        all_scores -= logsumexp(all_scores, axis=1)[:, np.newaxis]

        best = all_scores.argmax(axis=1)
        target_scores = np.where(labels, all_scores, 0).sum(axis=1)

        return best.tolist(), target_scores.tolist()

    def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
        '''
        Build the feature matrix and 0/1 label vector for `instances`.

        With `init_vectorizer`, the sequence vectorizer, context length and
        color vectorizer are (re)created from these instances first. With
        `inverted`, descriptions come from the outputs and colors from the
        inputs. Returns `(feats, labels)` with one row per (instance,
        candidate color) pair.
        '''
        self.get_options()

        if inverted:
            desc_attr, color_attr, alts_attr = 'output', 'input', 'alt_inputs'
        else:
            desc_attr, color_attr, alts_attr = 'input', 'output', 'alt_outputs'

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(getattr(inst, desc_attr)) for inst in instances]
        context_lens = [len(getattr(inst, alts_attr)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            # The first instance fixes the context size for all the rest.
            self.context_len = context_lens[0]

            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        assert all(cl == self.context_len for cl in context_lens), (self.context_len, context_lens)

        # Right-pad with '</s>' and truncate to exactly max_len tokens.
        max_len = self.seq_vec.max_len
        padded = []
        for desc in unk_replaced:
            filler = ['</s>'] * (max_len - len(desc))
            padded.append((desc + filler)[:max_len])

        colors = []
        for inst in instances:
            colors.extend(getattr(inst, alts_attr))

        # 1 where the candidate index equals the instance's target, else 0.
        labels = np.array([int(idx == getattr(inst, color_attr))
                           for inst in instances
                           for idx in range(self.context_len)])

        desc_indices = self.seq_vec.vectorize_all(padded)
        num_descs = desc_indices.shape[0]
        # Bag of words with -1 for absent and +1 for present token types.
        desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
        row_index = np.arange(num_descs)[:, np.newaxis]
        desc_bow[row_index, desc_indices] = 1.

        color_feats = self.color_vec.vectorize_all(colors)
        color_feats = color_feats.reshape((num_descs,
                                           self.context_len,
                                           color_feats.shape[1]))
        # Outer product of word and color features per candidate.
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((num_descs * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))

        return feats, labels

    def get_options(self):
        '''
        Load the global options into `self.options` unless already set.
        '''
        if hasattr(self, 'options'):
            return
        self.options = config.options()