def __init__(self):
    '''
    Create an untrained prior.

    Registers the `</s>` and `<MASK>` tokens with a fresh vectorizer,
    allocates zeroed shared count storage (one int32 slot per known token
    type, plus a scalar total), and builds the symbolic ratio of counts
    to total.
    '''
    vectorizer = SequenceVectorizer()
    vectorizer.add_all([['</s>'], ['<MASK>']])
    self.vec = vectorizer

    empty_counts = np.zeros((vectorizer.num_types,), dtype=np.int32)
    self.counts = theano.shared(empty_counts)
    self.total = theano.shared(np.array(0, dtype=np.int32))

    counts_float = T.cast(self.counts, 'float32')
    total_float = T.cast(self.total, 'float32')
    # NOTE(review): despite the attribute name, no logarithm is taken
    # here; this is the plain ratio counts / total. Confirm against the
    # code that consumes `log_probs`.
    self.log_probs = counts_float / total_float

    mask_vector = self.vec.vectorize(['<MASK>'])
    self.mask_index = mask_vector[0]
def count_unks():
    '''
    Print the unknown-token ratio of the training and evaluation data.

    Reads the data source, unk threshold and tokenizer from the global
    options, tokenizes the listener inputs of both splits (wrapped in
    `<s>` ... `</s>`), fits a vectorizer on the training split only and
    reports the ratio for each split via `print_unk_ratio`.
    '''
    opts = config.options()

    print('Data source: {}'.format(opts.data_source))
    print('Unk threshold: {}'.format(opts.unk_threshold))
    print('Tokenizer: {}'.format(opts.tokenizer))

    print('')
    print('Loading data')
    source = color_instances.SOURCES[opts.data_source]
    train_insts = source.train_data(listener=True)
    eval_insts = source.test_data(listener=True)

    tokenize = TOKENIZERS[opts.tokenizer]
    vectorizer = SequenceVectorizer(unk_threshold=opts.unk_threshold)

    def with_boundaries(insts):
        # Tokenize each input and surround it with start/end markers.
        wrapped = []
        for inst in insts:
            wrapped.append(['<s>'] + tokenize(inst.input) + ['</s>'])
        return wrapped

    print('Tokenizing training data')
    train_tokenized = with_boundaries(train_insts)
    print('Tokenizing eval data')
    eval_tokenized = with_boundaries(eval_insts)

    print('Initializing vectorizer')
    vectorizer.add_all(train_tokenized)

    for tokenized, split_name in ((train_tokenized, 'Train'),
                                  (eval_tokenized, 'Eval')):
        print_unk_ratio(tokenized, vectorizer, split_name)
def __init__(self):
    '''
    Initialize an empty unigram model from the global options.

    Stores the configured speaker tokenizer name, an empty token counter,
    a vectorizer using the configured speaker unk threshold, and a running
    token total of zero.
    '''
    opts = config.options()
    tokenizer_name = opts.speaker_tokenizer
    threshold = opts.speaker_unk_threshold

    self.tokenizer = tokenizer_name
    self.token_counts = Counter()
    self.seq_vec = SequenceVectorizer(unk_threshold=threshold)
    self.num_tokens = 0
def __init__(self, id=None):
    '''
    Set up the listener's vectorizers after the base class initializer.

    :param id: identifier forwarded unchanged to the parent constructor.
    '''
    super(ListenerLearner, self).__init__(id=id)

    opts = self.options
    self.word_counts = Counter()
    self.seq_vec = SequenceVectorizer()

    resolution = opts.listener_color_resolution
    use_hsv = opts.listener_hsv
    self.color_vec = BucketsVectorizer(resolution, hsv=use_hsv)
def __init__(self, id=None, context_len=1):
    '''
    Set up the speaker's vectorizers after the base class initializer.

    :param id: identifier forwarded unchanged to the parent constructor.
    :param context_len: stored as `self.context_len`.
    '''
    super(SpeakerLearner, self).__init__(id=id)

    opts = self.options
    self.seq_vec = SequenceVectorizer()

    # The colour representation class is selected by name from the options.
    make_color_vec = COLOR_REPRS[opts.speaker_color_repr]
    resolution = opts.speaker_color_resolution
    use_hsv = opts.speaker_hsv
    self.color_vec = make_color_vec(resolution, hsv=use_hsv)

    self.context_len = context_len
def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
    '''
    Convert instances into a feature matrix and a 0/1 label vector.

    Each instance contributes `self.context_len` rows, one per alternative
    colour. A row's features are the flattened outer product of the
    description's bag-of-words vector (+1 for tokens present, -1 otherwise)
    with that alternative's colour features. A row's label is 1 when its
    position within the context equals the instance's target value.

    :param instances: sequence of instances (iterated more than once).
    :param inverted: if true, descriptions come from `output`/`alt_inputs`
        hold the colours; otherwise descriptions come from `input` and
        colours from `alt_outputs`.
    :param init_vectorizer: if true, (re)build the sequence vectorizer,
        dump the unk-replaced sequences, and fix the context length and
        colour vectorizer from this data.
    :return: `(feats, labels)`
    '''
    self.get_options()

    def description_of(inst):
        return inst.output if inverted else inst.input

    def target_of(inst):
        return inst.input if inverted else inst.output

    def context_of(inst):
        return inst.alt_inputs if inverted else inst.alt_outputs

    tokenize = TOKENIZERS[self.options.listener_tokenizer]
    tokenized = [tokenize(description_of(inst)) for inst in instances]
    context_lens = [len(context_of(inst)) for inst in instances]

    if init_vectorizer:
        self.seq_vec = SequenceVectorizer()
        self.seq_vec.add_all(tokenized)

    unk_replaced = self.seq_vec.unk_replace_all(tokenized)

    if init_vectorizer:
        config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        self.context_len = context_lens[0]

        make_color_vec = COLOR_REPRS[self.options.listener_color_repr]
        self.color_vec = make_color_vec(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

    assert all(cl == self.context_len for cl in context_lens), (self.context_len, context_lens)

    # Pad with end tokens (or truncate) so every description has max_len tokens.
    padded = []
    for tokens in unk_replaced:
        filled = tokens + ['</s>'] * (self.seq_vec.max_len - len(tokens))
        padded.append(filled[:self.seq_vec.max_len])

    colors = []
    for inst in instances:
        colors.extend(context_of(inst))

    labels = np.array([int(position == target_of(inst))
                       for inst in instances
                       for position in range(self.context_len)])

    desc_indices = self.seq_vec.vectorize_all(padded)
    num_descs = desc_indices.shape[0]

    # Bag of words in {-1, +1}: start at -1 and flip every token that occurs.
    desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
    row_selector = np.arange(num_descs)[:, np.newaxis]
    desc_bow[row_selector, desc_indices] = 1.

    color_feats = self.color_vec.vectorize_all(colors)
    grouped_shape = (num_descs, self.context_len, color_feats.shape[1])
    color_feats = color_feats.reshape(grouped_shape)

    # Outer product of word features and colour features per alternative.
    feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
    flat_shape = (num_descs * self.context_len,
                  desc_bow.shape[1] * color_feats.shape[2])
    feats = feats.reshape(flat_shape)

    return feats, labels
class UnigramLMSpeakerLearner(Learner):
    '''
    A speaker scored by a unigram language model over output tokens.

    Training counts (unk-replaced) tokens of each output followed by
    `</s>`; scoring sums the log relative frequency of each token.
    Predictions are always empty strings.
    '''

    def __init__(self):
        opts = config.options()
        tokenizer_name = opts.speaker_tokenizer
        threshold = opts.speaker_unk_threshold

        self.tokenizer = tokenizer_name
        self.token_counts = Counter()
        self.seq_vec = SequenceVectorizer(unk_threshold=threshold)
        self.num_tokens = 0

    def train(self, training_instances, validation_instances='ignored', metrics='ignored'):
        '''
        Accumulate token counts from the outputs of `training_instances`.

        `validation_instances` and `metrics` are not used.
        '''
        tokenize = TOKENIZERS[self.tokenizer]
        tokenized = []
        for inst in training_instances:
            tokenized.append(tokenize(inst.output) + ['</s>'])

        self.seq_vec.add_all(tokenized)
        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        progress.start_task('Example', len(training_instances))
        position = 0
        for utterance in unk_replaced:
            progress.progress(position)
            self.token_counts.update(utterance)
            self.num_tokens += len(utterance)
            position += 1
        progress.end_task()

    @property
    def num_params(self):
        '''Number of distinct tokens with a stored count.'''
        return len(self.token_counts)

    def predict_and_score(self, eval_instances):
        '''
        Return `(predictions, scores)`: one empty-string prediction and one
        log probability of `inst.output` per evaluation instance.
        '''
        num_insts = len(eval_instances)
        predictions = [''] * num_insts
        scores = []

        progress.start_task('Example', num_insts)
        for position, inst in enumerate(eval_instances):
            progress.progress(position)
            log_prob = self._get_log_prob(inst.output)
            scores.append(log_prob)
        progress.end_task()

        return predictions, scores

    def _get_log_prob(self, output):
        '''
        Sum of log(count / total tokens) over the unk-replaced tokens of
        `output` followed by `</s>`.
        '''
        tokenize = TOKENIZERS[self.tokenizer]
        tokens = self.seq_vec.unk_replace(tokenize(output) + ['</s>'])

        total = 0.0
        for token in tokens:
            relative_freq = self.token_counts[token] * 1.0 / self.num_tokens
            total += np.log(relative_freq)
        return total
class UnigramLMSpeakerLearner(Learner):
    '''
    A speaker scored by a unigram language model over output tokens.

    Training counts (unk-replaced) tokens of each output followed by
    `</s>`; scoring sums the log relative frequency of each token.
    Predictions are always empty strings.
    '''

    def __init__(self):
        opts = config.options()
        tokenizer_name = opts.speaker_tokenizer
        threshold = opts.speaker_unk_threshold

        self.tokenizer = tokenizer_name
        self.token_counts = Counter()
        self.seq_vec = SequenceVectorizer(unk_threshold=threshold)
        self.num_tokens = 0

    def train(self, training_instances, validation_instances='ignored', metrics='ignored'):
        '''
        Accumulate token counts from the outputs of `training_instances`.

        `validation_instances` and `metrics` are not used.
        '''
        tokenize = TOKENIZERS[self.tokenizer]
        tokenized = []
        for inst in training_instances:
            tokenized.append(tokenize(inst.output) + ['</s>'])

        self.seq_vec.add_all(tokenized)
        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        progress.start_task('Example', len(training_instances))
        position = 0
        for utterance in unk_replaced:
            progress.progress(position)
            self.token_counts.update(utterance)
            self.num_tokens += len(utterance)
            position += 1
        progress.end_task()

    @property
    def num_params(self):
        '''Number of distinct tokens with a stored count.'''
        return len(self.token_counts)

    def predict_and_score(self, eval_instances):
        '''
        Return `(predictions, scores)`: one empty-string prediction and one
        log probability of `inst.output` per evaluation instance.
        '''
        num_insts = len(eval_instances)
        predictions = [''] * num_insts
        scores = []

        progress.start_task('Example', num_insts)
        for position, inst in enumerate(eval_instances):
            progress.progress(position)
            log_prob = self._get_log_prob(inst.output)
            scores.append(log_prob)
        progress.end_task()

        return predictions, scores

    def _get_log_prob(self, output):
        '''
        Sum of log(count / total tokens) over the unk-replaced tokens of
        `output` followed by `</s>`.
        '''
        tokenize = TOKENIZERS[self.tokenizer]
        tokens = self.seq_vec.unk_replace(tokenize(output) + ['</s>'])

        total = 0.0
        for token in tokens:
            relative_freq = self.token_counts[token] * 1.0 / self.num_tokens
            total += np.log(relative_freq)
        return total
class UnigramPrior(object):
    '''
    A prior over token sequences built from unigram token counts.

    >>> p = UnigramPrior()
    >>> p.train([instance.Instance('blue')])
    >>> p.sample(3)  # doctest: +ELLIPSIS
    [Instance('...', None), Instance('...', None), Instance('...', None)]
    '''

    def __init__(self):
        vectorizer = SequenceVectorizer()
        vectorizer.add_all([['</s>'], ['<MASK>']])
        self.vec = vectorizer

        empty_counts = np.zeros((vectorizer.num_types,), dtype=np.int32)
        self.counts = theano.shared(empty_counts)
        self.total = theano.shared(np.array(0, dtype=np.int32))

        counts_float = T.cast(self.counts, 'float32')
        total_float = T.cast(self.total, 'float32')
        # NOTE(review): despite the attribute name, no logarithm is taken;
        # this is the plain ratio counts / total. Confirm intended semantics.
        self.log_probs = counts_float / total_float

        mask_vector = self.vec.vectorize(['<MASK>'])
        self.mask_index = mask_vector[0]

    def train(self, training_instances, listener_data=True):
        '''
        Count whitespace-separated tokens of each instance's utterance and
        store the counts (with the `<MASK>` count forced to zero) and their
        sum in the shared variables.

        :param listener_data: if true, utterances are read from `input`;
            otherwise from `output`.
        '''
        if listener_data:
            def utterance_of(inst):
                return inst.input
        else:
            def utterance_of(inst):
                return inst.output

        tokenized = [utterance_of(inst).split() for inst in training_instances]
        self.vec.add_all(tokenized)

        padded = self.pad(tokenized, self.vec.max_len)
        token_ids = self.vec.vectorize_all(padded)
        num_types = self.vec.num_types

        histogram = np.bincount(token_ids.flatten(), minlength=num_types)
        histogram = histogram.astype(np.int32)
        # Padding must not contribute any mass.
        histogram[self.mask_index] = 0

        self.counts.set_value(histogram)
        self.total.set_value(np.sum(histogram))

    def apply(self, input_vars):
        '''
        Build the symbolic score for a batch of token indices.

        Looks up `self.log_probs` for each index, multiplies by an indicator
        of `x == mask_index`, and sums over axis 1 unless the result is
        one-dimensional.
        '''
        (x,) = input_vars

        per_token = self.log_probs[x]
        if self.mask_index is not None:
            # NOTE(review): this indicator is 1 only where x *is* the mask
            # token, so non-mask positions are zeroed out. That looks
            # inverted (T.neq may have been intended) -- confirm with callers
            # before changing.
            is_mask = T.cast(T.eq(x, self.mask_index), 'float32')
            per_token = per_token * is_mask

        if per_token.ndim == 1:
            return per_token
        return per_token.sum(axis=1)

    def sample(self, num_samples=1):
        '''
        Draw `num_samples` sequences of `self.vec.max_len` token indices,
        each index sampled from counts / total, and wrap the resulting
        strings (after `strip_invalid_tokens`) in instances.
        '''
        rows = []
        for _s in range(num_samples):
            row = []
            for _t in range(self.vec.max_len):
                distribution = self.counts.get_value() * 1.0 / self.total.get_value()
                row.append(sample(distribution))
            rows.append(row)
        indices = np.array(rows, dtype=np.int32)

        samples = []
        for tokens in self.vec.unvectorize_all(indices):
            text = ' '.join(strip_invalid_tokens(tokens))
            samples.append(instance.Instance(text))
        return samples

    def pad(self, sequences, length):
        '''
        Append a </s> token and then zero or more <MASK> tokens to each
        sequence so that sequences no longer than `length` end up with
        `length + 1` tokens (the extra one is the </s> every sequence
        receives; `length` is meant to be the maximum length of the
        original sequences). The input lists are not modified.

        >>> UnigramPrior().pad([['blue'], ['very', 'blue']], 2)
        [['blue', '</s>', '<MASK>'], ['very', 'blue', '</s>']]
        '''
        padded = []
        for seq in sequences:
            num_masks = length - len(seq)
            padded.append(seq + ['</s>'] + ['<MASK>'] * num_masks)
        return padded
def __init__(self, id=None):
    '''
    Load options, build the submodels, then run the base initializer and
    create the sequence and colour vectorizers.

    :param id: identifier passed to `init_submodels` and to the parent
        constructor.
    '''
    self.get_options()
    self.init_submodels(id)
    super(RSALearner, self).__init__(id=id)

    # The bucket resolution follows whichever agent is selected by the
    # `listener` option.
    if self.options.listener:
        color_resolution = self.options.listener_color_resolution
    else:
        color_resolution = self.options.speaker_color_resolution

    self.seq_vec = SequenceVectorizer()
    # NOTE(review): `hsv` always comes from `speaker_hsv`, even when the
    # listener resolution was chosen above -- confirm this is intended.
    use_hsv = self.options.speaker_hsv
    self.color_vec = BucketsVectorizer(color_resolution, hsv=use_hsv)
def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
    '''
    Convert instances into a feature matrix and a 0/1 label vector.

    Each instance contributes `self.context_len` rows, one per alternative
    colour. A row's features are the flattened outer product of the
    description's bag-of-words vector (+1 for tokens present, -1 otherwise)
    with that alternative's colour features. A row's label is 1 when its
    position within the context equals the instance's target value.

    :param instances: sequence of instances (iterated more than once).
    :param inverted: if true, descriptions come from `output` and
        `alt_inputs` hold the colours; otherwise descriptions come from
        `input` and colours from `alt_outputs`.
    :param init_vectorizer: if true, (re)build the sequence vectorizer,
        dump the unk-replaced sequences, and fix the context length and
        colour vectorizer from this data.
    :return: `(feats, labels)`
    '''
    self.get_options()

    def description_of(inst):
        return inst.output if inverted else inst.input

    def target_of(inst):
        return inst.input if inverted else inst.output

    def context_of(inst):
        return inst.alt_inputs if inverted else inst.alt_outputs

    tokenize = TOKENIZERS[self.options.listener_tokenizer]
    tokenized = [tokenize(description_of(inst)) for inst in instances]
    context_lens = [len(context_of(inst)) for inst in instances]

    if init_vectorizer:
        self.seq_vec = SequenceVectorizer()
        self.seq_vec.add_all(tokenized)

    unk_replaced = self.seq_vec.unk_replace_all(tokenized)

    if init_vectorizer:
        config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        self.context_len = context_lens[0]

        make_color_vec = COLOR_REPRS[self.options.listener_color_repr]
        self.color_vec = make_color_vec(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

    assert all(cl == self.context_len for cl in context_lens), (self.context_len, context_lens)

    # Pad with end tokens (or truncate) so every description has max_len tokens.
    padded = []
    for tokens in unk_replaced:
        filled = tokens + ['</s>'] * (self.seq_vec.max_len - len(tokens))
        padded.append(filled[:self.seq_vec.max_len])

    colors = []
    for inst in instances:
        colors.extend(context_of(inst))

    labels = np.array([int(position == target_of(inst))
                       for inst in instances
                       for position in range(self.context_len)])

    desc_indices = self.seq_vec.vectorize_all(padded)
    num_descs = desc_indices.shape[0]

    # Bag of words in {-1, +1}: start at -1 and flip every token that occurs.
    desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
    row_selector = np.arange(num_descs)[:, np.newaxis]
    desc_bow[row_selector, desc_indices] = 1.

    color_feats = self.color_vec.vectorize_all(colors)
    grouped_shape = (num_descs, self.context_len, color_feats.shape[1])
    color_feats = color_feats.reshape(grouped_shape)

    # Outer product of word features and colour features per alternative.
    feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
    flat_shape = (num_descs * self.context_len,
                  desc_bow.shape[1] * color_feats.shape[2])
    feats = feats.reshape(flat_shape)

    return feats, labels
class LRContextListenerLearner(Learner):
    '''
    A listener that picks one colour out of a fixed-size context using
    logistic regression over word-by-colour product features.
    '''

    def train(self, training_instances, validation_instances=None, metrics=None):
        '''
        Fit an L-BFGS logistic regression on features built from
        `training_instances`. The other two arguments are not used.
        '''
        features, targets = self._data_to_arrays(training_instances,
                                                 init_vectorizer=True)
        classifier = LogisticRegression(solver='lbfgs')
        self.mod = classifier
        self.mod.fit(features, targets)

    @property
    def num_params(self):
        '''Total number of coefficient and intercept entries.'''
        num_coefs = np.prod(self.mod.coef_.shape)
        num_intercepts = np.prod(self.mod.intercept_.shape)
        return num_coefs + num_intercepts

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Return `(predictions, scores)` as lists.

        The positive-class log probabilities of each instance's
        alternatives are renormalized across the context; the prediction is
        the argmax position and the score is the sum of normalized log
        probabilities at positions labelled 1. `random` and `verbosity`
        are not used.
        '''
        features, labels = self._data_to_arrays(eval_instances)
        grid_shape = (len(eval_instances), self.context_len)
        labels = labels.reshape(grid_shape)

        positive_log_probs = self.mod.predict_log_proba(features)[:, 1]
        all_scores = positive_log_probs.reshape(grid_shape)
        normalizer = logsumexp(all_scores, axis=1)[:, np.newaxis]
        all_scores = all_scores - normalizer

        predictions = all_scores.argmax(axis=1)
        scores = np.where(labels, all_scores, 0).sum(axis=1)

        return predictions.tolist(), scores.tolist()

    def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
        '''
        Convert instances into a feature matrix and a 0/1 label vector.

        Each instance contributes `self.context_len` rows, one per
        alternative colour. A row's features are the flattened outer
        product of the description's bag-of-words vector (+1 for tokens
        present, -1 otherwise) with that alternative's colour features.
        A row's label is 1 when its position within the context equals the
        instance's target value.

        :param inverted: if true, descriptions come from `output` and
            `alt_inputs` hold the colours; otherwise descriptions come from
            `input` and colours from `alt_outputs`.
        :param init_vectorizer: if true, (re)build the sequence vectorizer,
            dump the unk-replaced sequences, and fix the context length and
            colour vectorizer from this data.
        :return: `(feats, labels)`
        '''
        self.get_options()

        def description_of(inst):
            return inst.output if inverted else inst.input

        def target_of(inst):
            return inst.input if inverted else inst.output

        def context_of(inst):
            return inst.alt_inputs if inverted else inst.alt_outputs

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(description_of(inst)) for inst in instances]
        context_lens = [len(context_of(inst)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            self.context_len = context_lens[0]

            make_color_vec = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = make_color_vec(self.options.listener_color_resolution,
                                            hsv=self.options.listener_hsv)

        assert all(cl == self.context_len for cl in context_lens), (self.context_len, context_lens)

        # Pad with end tokens (or truncate) to exactly max_len tokens.
        padded = []
        for tokens in unk_replaced:
            filled = tokens + ['</s>'] * (self.seq_vec.max_len - len(tokens))
            padded.append(filled[:self.seq_vec.max_len])

        colors = []
        for inst in instances:
            colors.extend(context_of(inst))

        labels = np.array([int(position == target_of(inst))
                           for inst in instances
                           for position in range(self.context_len)])

        desc_indices = self.seq_vec.vectorize_all(padded)
        num_descs = desc_indices.shape[0]

        # Bag of words in {-1, +1}: start at -1, flip every token present.
        desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
        row_selector = np.arange(num_descs)[:, np.newaxis]
        desc_bow[row_selector, desc_indices] = 1.

        color_feats = self.color_vec.vectorize_all(colors)
        grouped_shape = (num_descs, self.context_len, color_feats.shape[1])
        color_feats = color_feats.reshape(grouped_shape)

        # Outer product of word and colour features per alternative.
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        flat_shape = (num_descs * self.context_len,
                      desc_bow.shape[1] * color_feats.shape[2])
        feats = feats.reshape(flat_shape)

        return feats, labels

    def get_options(self):
        '''Load the global options into `self.options` if not yet set.'''
        if hasattr(self, 'options'):
            return
        self.options = config.options()
class SpeakerLearner(NeuralLearner):
    '''
    A speaker with a feedforward neural net color input passed into an RNN
    to generate a description.
    '''

    def __init__(self, id=None, context_len=1):
        '''
        :param id: identifier forwarded to the parent constructor.
        :param context_len: number of colours per instance context; stored
            as `self.context_len`.
        '''
        super(SpeakerLearner, self).__init__(id=id)
        self.seq_vec = SequenceVectorizer(
            unk_threshold=self.options.speaker_unk_threshold)
        color_repr = COLOR_REPRS[self.options.speaker_color_repr]
        self.color_vec = color_repr(self.options.speaker_color_resolution,
                                    hsv=self.options.speaker_hsv)
        self.context_len = context_len

    @property
    def use_color_mask(self):
        '''Whether the model takes an extra colour-mask input (never here).'''
        return False

    def predict(self, eval_instances, random=False, verbosity=0):
        '''
        Generate one description string per evaluation instance.

        :param random: if true, sample one token at a time (beam size 1);
            otherwise run beam search with `speaker_beam_size`.
        :param verbosity: added to the configured verbosity when deciding
            whether to print / report progress.
        :return: list of space-joined token strings.
        '''
        result = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.speaker_eval_batch_size)
        num_batches = (len(eval_instances) - 1) // self.options.speaker_eval_batch_size + 1

        eos_index = self.seq_vec.vectorize(['</s>'])[0]

        if self.options.verbosity + verbosity >= 2:
            print('Predicting')
        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Predict batch', num_batches)
        for batch_num, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(batch_num)
            batch = list(batch)

            if self.use_color_mask:
                (c, cm, _p, mask), (_y,) = self._data_to_arrays(batch, test=True)
            else:
                (c, _p, mask), (_y,) = self._data_to_arrays(batch, test=True)
            assert mask.all()  # We shouldn't be masking anything in prediction

            beam_size = 1 if random else self.options.speaker_beam_size
            # Builtin `bool` instead of the `np.bool` alias, which was
            # removed from NumPy.
            done = np.zeros((len(batch), beam_size), dtype=bool)
            beam = np.zeros((len(batch), beam_size, self.seq_vec.max_len),
                            dtype=np.int32)
            beam[:, :, 0] = self.seq_vec.vectorize(['<s>'])[0]
            # Every hypothesis starts at -inf except the first one per
            # instance (same values as log(0), without the warning).
            beam_scores = np.full((len(batch), beam_size), -np.inf)
            beam_scores[:, 0] = 0.0

            c = np.repeat(c, beam_size, axis=0)
            mask = np.repeat(mask, beam_size, axis=0)
            if self.use_color_mask:
                cm = np.repeat(cm, beam_size, axis=0)

            for length in range(1, self.seq_vec.max_len):
                if done.all():
                    break
                p = beam.reshape(
                    (beam.shape[0] * beam.shape[1], beam.shape[2]))[:, :-1]
                inputs = [c, cm, p, mask] if self.use_color_mask else [c, p, mask]
                probs = self.model.predict(inputs)
                if random:
                    indices = sample(probs[:, length - 1, :])
                    beam[:, 0, length] = indices
                    done = np.logical_or(done, indices == eos_index)
                else:
                    assert probs.shape[1] == p.shape[1], (probs.shape[1], p.shape[1])
                    assert probs.shape[2] == len(self.seq_vec.tokens), \
                        (probs.shape[2], len(self.seq_vec.tokens))
                    scores = np.log(probs)[:, length - 1, :].reshape(
                        (beam.shape[0], beam.shape[1], probs.shape[2]))
                    beam_search_step(scores, length, beam, beam_scores, done, eos_index)
            # Only the first beam entry of each instance is reported.
            outputs = self.seq_vec.unvectorize_all(beam[:, 0, :])
            result.extend([' '.join(strip_invalid_tokens(o)) for o in outputs])
        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return result

    def score(self, eval_instances, verbosity=0):
        '''
        Return, for each evaluation instance, the sum over unmasked
        positions of the log probability the model assigns to the actual
        next token.
        '''
        result = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.speaker_eval_batch_size)
        num_batches = (len(eval_instances) - 1) // self.options.speaker_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Scoring')
        if self.options.verbosity + verbosity >= 1:
            progress.start_task('Score batch', num_batches)
        for batch_num, batch in enumerate(batches):
            if self.options.verbosity + verbosity >= 1:
                progress.progress(batch_num)
            batch = list(batch)
            xs, (n,) = self._data_to_arrays(batch, test=False)
            # The mask is always the last model input.
            if self.use_color_mask:
                mask = xs[3]
            else:
                mask = xs[2]

            probs = self.model.predict(xs)
            # Pick out the probability of the target token at each position.
            token_probs = probs[np.arange(probs.shape[0])[:, np.newaxis],
                                np.arange(probs.shape[1]), n]
            scores_arr = np.sum(np.log(token_probs) * mask, axis=1)
            scores = scores_arr.tolist()
            result.extend(scores)
        if self.options.verbosity + verbosity >= 1:
            progress.end_task()

        return result

    def _data_to_arrays(self, training_instances,
                        init_vectorizer=False, test=False, inverted=False):
        '''
        Build the model inputs `[c, P, mask]` and targets `[N]`.

        :param training_instances: instances to convert.
        :param init_vectorizer: if true, add the tokenized descriptions to
            the sequence vectorizer and dump the unk-replaced sequences.
        :param test: if true, descriptions are ignored and every sequence
            is `<s>` followed by `</s>` padding (nothing is masked).
        :param inverted: if true, colours are read from `output` and
            descriptions from `input`; otherwise the reverse.
        :return: `([c, P, mask], [N])` where `P` holds previous-token
            indices, `N` next-token indices, `mask` is 0 at `<MASK>`
            targets, and `c` is the colour representation repeated along a
            new axis of length `max_len - 1`.
        '''
        context_len = self.context_len if hasattr(self, 'context_len') else 1
        use_context = context_len > 1

        def get_multi(val):
            # Unwrap a 1-tuple; pass anything else through.
            if isinstance(val, tuple):
                assert len(val) == 1
                return val[0]
            else:
                return val

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_color, get_desc_simple = (get_o, get_i) if inverted else (get_i, get_o)

        def get_desc(inst):
            return get_multi(get_desc_simple(inst))

        get_i_ind, get_o_ind = (
            (lambda inst: inst.alt_inputs[get_multi(inst.input)]),
            (lambda inst: inst.alt_outputs[get_multi(inst.output)]))
        get_color_indexed = get_o_ind if inverted else get_i_ind
        get_alt_i, get_alt_o = (lambda inst: inst.alt_inputs), (lambda inst: inst.alt_outputs)
        get_alt_colors = get_alt_o if inverted else get_alt_i

        if hasattr(self.options, 'speaker_tokenizer'):
            tokenize = TOKENIZERS[self.options.speaker_tokenizer]
        else:
            tokenize = TOKENIZERS['whitespace']

        if init_vectorizer:
            tokenized = [['<s>'] + tokenize(get_desc(inst)) + ['</s>']
                         for inst in training_instances]
            self.seq_vec.add_all(tokenized)
            unk_replaced = self.seq_vec.unk_replace_all(tokenized)
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        colors = []
        previous = []
        next_tokens = []
        if self.options.verbosity >= 9:
            print('%s _data_to_arrays:' % self.id)
        for inst in training_instances:
            desc, color = get_desc(inst), get_color(inst)
            # A numeric colour is an index into the instance's alternatives.
            if isinstance(color, numbers.Number):
                color = get_color_indexed(inst)
            if test:
                full = ['<s>'] + ['</s>'] * (self.seq_vec.max_len - 1)
            else:
                desc = tokenize(desc)
                full = (['<s>'] + desc + ['</s>'] +
                        ['<MASK>'] * (self.seq_vec.max_len - 1 - len(desc)))
            prev_seq = full[:-1]
            next_seq = full[1:]
            if self.options.verbosity >= 9:
                print('%s, %s -> %s' % (repr(color), repr(prev_seq), repr(next_seq)))
            colors.append(color)
            if use_context:
                # Target colour first, then the remaining context colours.
                new_context = get_alt_colors(inst)
                index = get_color(inst)
                if isinstance(index, tuple):
                    assert len(index) == 1
                    index = index[0]
                assert len(new_context) == context_len, \
                    'Inconsistent context lengths: %s' % ((context_len, len(new_context)),)
                colors.extend([c for j, c in enumerate(new_context) if j != index])
            previous.append(prev_seq)
            next_tokens.append(next_seq)

        P = np.zeros((len(previous), self.seq_vec.max_len - 1), dtype=np.int32)
        mask = np.zeros((len(previous), self.seq_vec.max_len - 1), dtype=np.int32)
        N = np.zeros((len(next_tokens), self.seq_vec.max_len - 1), dtype=np.int32)
        c = self.color_vec.vectorize_all(colors, hsv=True)
        # Floor division: array dimensions must be integers, and `/` yields
        # a float under true division.
        num_rows = len(colors) // context_len
        if len(c.shape) == 1:
            c = c.reshape((num_rows, context_len))
        else:
            c = c.reshape((num_rows, context_len * c.shape[1]) + c.shape[2:])
        for i, (prev_seq, next_seq) in enumerate(zip(previous, next_tokens)):
            # Truncate sequences longer than the arrays allow.
            if len(prev_seq) > P.shape[1]:
                prev_seq = prev_seq[:P.shape[1]]
            if len(next_seq) > N.shape[1]:
                next_seq = next_seq[:N.shape[1]]
            P[i, :len(prev_seq)] = self.seq_vec.vectorize(prev_seq)
            N[i, :len(next_seq)] = self.seq_vec.vectorize(next_seq)
            for t, token in enumerate(next_seq):
                mask[i, t] = (token != '<MASK>')
        # Repeat the colour representation along a new axis 1.
        c = np.tile(c[:, np.newaxis, ...],
                    [1, self.seq_vec.max_len - 1] + [1] * (c.ndim - 1))

        if self.options.verbosity >= 9:
            print('c: %s' % (repr(c),))
            print('P: %s' % (repr(P),))
            print('mask: %s' % (repr(mask),))
            print('N: %s' % (repr(N),))
        return [c, P, mask], [N]

    def _build_model(self, model_class=SimpleLasagneModel):
        '''
        Create the symbolic inputs/targets, build the network with
        `_get_l_out` and wrap it in `model_class` as `self.model`.
        '''
        id_tag = (self.id + '/') if self.id else ''

        input_vars = self.color_vec.get_input_vars(
            self.id, recurrent=not self.use_color_mask)
        if self.use_color_mask:
            input_vars.append(T.imatrix(id_tag + 'color_mask'))
        input_vars.extend([T.imatrix(id_tag + 'previous'),
                           T.imatrix(id_tag + 'mask')])
        target_var = T.imatrix(id_tag + 'targets')

        self.l_out, self.input_layers = self._get_l_out(input_vars)
        self.model = model_class(
            input_vars, [target_var], self.l_out, id=self.id,
            loss=self.masked_loss(input_vars),
            optimizer=OPTIMIZERS[self.options.speaker_optimizer],
            learning_rate=self.options.speaker_learning_rate)

    def train_priors(self, training_instances, listener_data=False):
        '''
        Instantiate and train the empirical and smoothed priors (both of
        the class selected by `speaker_prior`) on `training_instances`.
        '''
        prior_class = PRIORS[self.options.speaker_prior]
        self.prior_emp = prior_class(recurrent=True)
        self.prior_smooth = prior_class(recurrent=True)

        self.prior_emp.train(training_instances, listener_data=listener_data)
        self.prior_smooth.train(training_instances, listener_data=listener_data)

    def _get_l_out(self, input_vars):
        '''
        Build the network: colour input layers, optional hidden colour
        layers, an embedding of the previous tokens concatenated with the
        colour features, a stack of recurrent cells (with optional dropout
        between them), optional dense layers and a softmax over the
        vocabulary at every time step.

        :param input_vars: colour input variables followed by the
            previous-token variable and the mask variable.
        :return: `(l_out, input_layers)`
        '''
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        prev_output_var, mask_var = input_vars[-2:]
        color_input_vars = input_vars[:-2]

        context_len = self.context_len if hasattr(self, 'context_len') else 1
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=self.seq_vec.max_len - 1,
            cell_size=self.options.speaker_cell_size,
            context_len=context_len,
            id=self.id)
        # Swap the last two axes around the hidden colour layers, then
        # swap them back afterwards.
        l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
        for i in range(1, self.options.speaker_hidden_color_layers + 1):
            l_hidden_color = NINLayer(
                l_hidden_color, num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_color%d' % i)
        l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(
            l_prev_out, input_size=len(self.seq_vec.tokens),
            output_size=self.options.speaker_cell_size,
            name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_hidden_color, l_prev_embed], axis=2,
                           name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var,
                               name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input': (None if self.options.speaker_no_mask else l_mask_in),
            'grad_clipping': self.options.speaker_grad_clipping,
            'num_units': self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.speaker_nonlinearity]

        # All recurrent layers but the last, each optionally followed by
        # dropout.
        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec, p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop,
                     name=id_tag + 'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out, num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out, num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax, name=id_tag + 'softmax')
        l_out = ReshapeLayer(
            l_softmax,
            (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
            name=id_tag + 'out')

        return l_out, color_inputs + [l_prev_out, l_mask_in]

    def loss_out(self, input_vars=None, target_var=None):
        '''
        Symbolic masked loss of the network output; `input_vars` and
        `target_var` default to those of `self.model`.
        '''
        if input_vars is None:
            input_vars = self.model.input_vars
        if target_var is None:
            target_var = self.model.target_var
        pred = get_output(self.l_out, dict(zip(self.input_layers, input_vars)))
        loss = self.masked_loss(input_vars)
        return loss(pred, target_var)

    def masked_loss(self, input_vars):
        '''Sequence cross-entropy masked by the last input variable.'''
        return masked_seq_crossentropy(input_vars[-1])

    def sample_prior_smooth(self, num_samples):
        '''Draw `num_samples` samples from the smoothed prior.'''
        return self.prior_smooth.sample(num_samples)
class ListenerLearner(NeuralLearner):
    '''
    An LSTM-based listener (guesses colors from descriptions).
    '''

    def __init__(self, id=None):
        '''
        :param id: identifier forwarded to the parent constructor.
        '''
        super(ListenerLearner, self).__init__(id=id)
        self.word_counts = Counter()
        self.seq_vec = SequenceVectorizer(
            unk_threshold=self.options.listener_unk_threshold)
        self.color_vec = BucketsVectorizer(
            self.options.listener_color_resolution,
            hsv=self.options.listener_hsv)

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Predict a colour for, and score the true colour of, each instance.

        :param random: if true, sample a bucket from the predicted
            distribution instead of taking the argmax.
        :param verbosity: added to the configured verbosity when deciding
            whether to print 'Testing'.
        :return: `(predictions, scores)`; each score is the log probability
            of the true bucket plus `bucket_adjustment()`.
        '''
        predictions = []
        scores = []
        batches = iterators.iter_batches(eval_instances,
                                         self.options.listener_eval_batch_size)
        num_batches = (len(eval_instances) - 1) // self.options.listener_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)

            xs, (y,) = self._data_to_arrays(batch, test=True)

            probs = self.model.predict(xs)
            if random:
                indices = sample(probs)
                predictions.extend(self.unvectorize(indices, random=True))
            else:
                predictions.extend(self.unvectorize(probs.argmax(axis=1)))
            scores_arr = np.log(probs[np.arange(len(batch)), y]) + self.bucket_adjustment()
            scores.extend(scores_arr.tolist())
        progress.end_task()
        if self.options.verbosity >= 9:
            # Format inside the call: applying `%` to the result of print()
            # (None) raises TypeError.
            print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
            for inst, prediction in zip(eval_instances, predictions):
                print('%s -> %s' % (repr(inst.input), repr(prediction)))

        return predictions, scores

    def unvectorize(self, indices, random=False):
        '''Convert bucket indices back to colours via the colour vectorizer.'''
        return self.color_vec.unvectorize_all(indices, random=random, hsv=True)

    def bucket_adjustment(self):
        '''
        Return -log(256**3 / number of buckets), the constant added to each
        bucket log probability in `predict_and_score`.
        '''
        bucket_volume = (256.0 ** 3) / self.color_vec.num_types
        return -np.log(bucket_volume)

    def on_iter_end(self, step, writer):
        '''
        Log images of the predicted colour distribution for the (up to) ten
        most common training descriptions, then defer to the parent hook.
        '''
        most_common = [desc for desc, count in self.word_counts.most_common(10)]
        insts = [instance.Instance(input=desc) for desc in most_common]
        xs, (y,) = self._data_to_arrays(insts, test=True)
        probs = self.model.predict(xs)
        for i, desc in enumerate(most_common):
            dist = probs[i, :]
            # One image per channel, tagged '0', '1', '2'.
            for image, channel in zip(self.color_vec.visualize_distribution(dist), '012'):
                writer.log_image(step, '%s/%s/%s' % (self.id, desc, channel), image)
        super(ListenerLearner, self).on_iter_end(step, writer)

    def _data_to_arrays(self, training_instances,
                        init_vectorizer=False, test=False, inverted=False):
        '''
        Build the model input `[x]` (token indices, left-padded with `<s>`
        and terminated by `</s>`) and target `[y]` (colour bucket indices).

        :param init_vectorizer: if true, add the tokenized descriptions to
            the sequence vectorizer, update `word_counts` with the raw
            descriptions and dump the unk-replaced sequences.
        :param test: must be true for instances with a missing (falsy)
            colour, which is then replaced by `(0.0, 0.0, 0.0)`.
        :param inverted: if true, descriptions are read from `output` and
            colours from `input`; otherwise the reverse.
        '''
        def get_multi(val):
            # Unwrap a 1-tuple; pass anything else through.
            if isinstance(val, tuple):
                assert len(val) == 1
                return val[0]
            else:
                return val

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_desc, get_color = (get_o, get_i) if inverted else (get_i, get_o)
        get_i_ind, get_o_ind = (
            (lambda inst: inst.alt_inputs[get_multi(inst.input)]),
            (lambda inst: inst.alt_outputs[get_multi(inst.output)]))
        get_color_indexed = get_i_ind if inverted else get_o_ind

        if hasattr(self.options, 'listener_tokenizer'):
            tokenize = TOKENIZERS[self.options.listener_tokenizer]
        else:
            tokenize = TOKENIZERS['whitespace']

        if init_vectorizer:
            tokenized = [['<s>'] + tokenize(get_desc(inst)) + ['</s>']
                         for inst in training_instances]
            self.seq_vec.add_all(tokenized)
            unk_replaced = self.seq_vec.unk_replace_all(tokenized)
            self.word_counts.update([get_desc(inst) for inst in training_instances])
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        sentences = []
        colors = []
        if self.options.verbosity >= 9:
            print('%s _data_to_arrays:' % self.id)
        for inst in training_instances:
            desc = tokenize(get_desc(inst))
            color = get_color(inst)
            # A numeric colour is an index into the instance's alternatives.
            if isinstance(color, numbers.Number):
                color = get_color_indexed(inst)
            if not color:
                assert test
                color = (0.0, 0.0, 0.0)
            s = ['<s>'] * (self.seq_vec.max_len - 1 - len(desc)) + desc
            s.append('</s>')
            if self.options.verbosity >= 9:
                print('%s -> %s' % (repr(s), repr(color)))
            sentences.append(s)
            colors.append(color)

        x = np.zeros((len(sentences), self.seq_vec.max_len), dtype=np.int32)
        y = np.zeros((len(sentences),), dtype=np.int32)
        for i, sentence in enumerate(sentences):
            # Truncate sentences longer than the input width.
            if len(sentence) > x.shape[1]:
                sentence = sentence[:x.shape[1]]
            x[i, :] = self.seq_vec.vectorize(sentence)
            y[i] = self.color_vec.vectorize(colors[i], hsv=True)

        return [x], [y]

    def _build_model(self, model_class=SimpleLasagneModel):
        '''
        Create the symbolic input/target, build the network with
        `_get_l_out` and wrap it in `model_class` as `self.model`.
        '''
        id_tag = (self.id + '/') if self.id else ''

        input_var = T.imatrix(id_tag + 'inputs')
        target_var = T.ivector(id_tag + 'targets')

        self.l_out, self.input_layers = self._get_l_out([input_var])
        self.loss = categorical_crossentropy

        self.model = model_class(
            [input_var], [target_var], self.l_out,
            loss=self.loss,
            optimizer=OPTIMIZERS[self.options.listener_optimizer],
            learning_rate=self.options.listener_learning_rate,
            id=self.id)

    def train_priors(self, training_instances, listener_data=False):
        '''
        Instantiate and train the empirical and smoothed priors (both of
        the class selected by `listener_prior`) on `training_instances`.
        '''
        prior_class = PRIORS[self.options.listener_prior]
        self.prior_emp = prior_class()  # TODO: accurate values for empirical prior
        self.prior_smooth = prior_class()

        self.prior_emp.train(training_instances, listener_data=listener_data)
        self.prior_smooth.train(training_instances, listener_data=listener_data)

    def _get_l_out(self, input_vars):
        '''
        Build the network: token embedding, two recurrent layers, a dense
        hidden layer (each optionally followed by dropout), and a softmax
        over colour buckets.

        :param input_vars: list whose first element is the token-index
            input variable.
        :return: `(l_out, [l_in])`
        '''
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len), input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(
            l_in, input_size=len(self.seq_vec.tokens),
            output_size=self.options.listener_cell_size,
            name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(
                b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[
                self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1, p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2, p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_hidden = DenseLayer(
            l_rec2_drop, num_units=self.options.listener_cell_size,
            nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
            name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden, p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden

        l_scores = DenseLayer(l_hidden_drop, num_units=self.color_vec.num_types,
                              nonlinearity=None, name=id_tag + 'scores')
        l_out = NonlinearityLayer(l_scores, nonlinearity=softmax,
                                  name=id_tag + 'out')

        return l_out, [l_in]

    def sample_prior_smooth(self, num_samples):
        '''Draw `num_samples` samples from the smoothed prior.'''
        return self.prior_smooth.sample(num_samples)
def __init__(self, id=None):
    '''
    Set up the listener's word counter and vectorizers after the base
    class initializer.

    :param id: identifier forwarded unchanged to the parent constructor.
    '''
    super(ListenerLearner, self).__init__(id=id)

    opts = self.options
    self.word_counts = Counter()

    threshold = opts.listener_unk_threshold
    self.seq_vec = SequenceVectorizer(unk_threshold=threshold)

    resolution = opts.listener_color_resolution
    use_hsv = opts.listener_hsv
    self.color_vec = BucketsVectorizer(resolution, hsv=use_hsv)
class ListenerLearner(NeuralLearner):
    '''
    An LSTM-based listener (guesses colors from descriptions).

    The recurrent cell type is actually looked up from the
    `listener_cell` option, so other entries of CELLS can be used too.
    '''
    def __init__(self, id=None):
        '''
        Create the word counter, the description vectorizer and the color
        bucket vectorizer from the listener options.
        '''
        super(ListenerLearner, self).__init__(id=id)
        self.word_counts = Counter()
        self.seq_vec = SequenceVectorizer(unk_threshold=self.options.listener_unk_threshold)
        self.color_vec = BucketsVectorizer(self.options.listener_color_resolution,
                                           hsv=self.options.listener_hsv)

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Run the model over `eval_instances` in batches.

        :param eval_instances: instances to evaluate; must support `len()`.
        :param random: if true, sample a color bucket from the predicted
            distribution instead of taking the argmax.
        :param verbosity: added to `self.options.verbosity` when deciding
            whether to print the 'Testing' banner.
        :return: `(predictions, scores)`, two lists with one entry per
            instance. Each score is the log probability assigned to the
            instance's true bucket plus `bucket_adjustment()`.
        '''
        predictions = []
        scores = []
        batches = iterators.iter_batches(eval_instances, self.options.listener_eval_batch_size)
        # Ceiling division: number of batches needed to cover all instances.
        num_batches = (len(eval_instances) - 1) // self.options.listener_eval_batch_size + 1

        if self.options.verbosity + verbosity >= 2:
            print('Testing')
        progress.start_task('Eval batch', num_batches)
        for batch_num, batch in enumerate(batches):
            progress.progress(batch_num)
            batch = list(batch)

            xs, (y,) = self._data_to_arrays(batch, test=True)

            probs = self.model.predict(xs)
            self.on_predict(xs)
            if random:
                indices = sample(probs)
                predictions.extend(self.unvectorize(indices, random=True))
            else:
                predictions.extend(self.unvectorize(probs.argmax(axis=1)))
            # Log probability of the true bucket for every row of the batch.
            scores_arr = np.log(probs[np.arange(len(batch)), y]) + self.bucket_adjustment()
            scores.extend(scores_arr.tolist())
        progress.end_task()
        if self.options.verbosity >= 9:
            # The format operator has to be applied to the string inside the
            # call: with the print function, `print(...) % (...)` evaluates
            # `None % tuple` and raises TypeError.
            print('%s %ss:' % (self.id, 'sample' if random else 'prediction'))
            for inst, prediction in zip(eval_instances, predictions):
                print('%s -> %s' % (repr(inst.input), repr(prediction)))

        return predictions, scores

    def unvectorize(self, indices, random=False):
        '''
        Convert bucket indices back to colors via the color vectorizer
        (always requesting HSV output).
        '''
        return self.color_vec.unvectorize_all(indices, random=random, hsv=True)

    def bucket_adjustment(self):
        '''
        Return minus the log of the volume of one color bucket, where the
        volume is 256^3 divided by the number of buckets.
        '''
        bucket_volume = (256.0 ** 3) / self.color_vec.num_types
        return -np.log(bucket_volume)

    def on_predict(self, xs):
        '''
        Hook called after every prediction batch; does nothing here.
        '''
        pass

    def on_iter_end(self, step, writer):
        '''
        Log the predicted color distributions for the ten most common
        training descriptions as images, then defer to the superclass hook.
        '''
        most_common = [desc for desc, count in self.word_counts.most_common(10)]
        insts = [instance.Instance(input=desc) for desc in most_common]
        # The targets are placeholders here (test=True); only xs is used.
        xs, (_,) = self._data_to_arrays(insts, test=True)
        probs = self.model.predict(xs)
        for i, desc in enumerate(most_common):
            dist = probs[i, :]
            # One image per entry returned by visualize_distribution, tagged
            # with the channel names '0', '1', '2'.
            for image, channel in zip(self.color_vec.visualize_distribution(dist), '012'):
                writer.log_image(step, '%s/%s/%s' % (self.id, desc, channel), image)
        super(ListenerLearner, self).on_iter_end(step, writer)

    def _data_to_arrays(self, training_instances,
                        init_vectorizer=False, test=False, inverted=False):
        '''
        Convert instances into index arrays for the model.

        :param training_instances: instances to convert.
        :param init_vectorizer: if true, first add the tokenized
            descriptions to the sequence vectorizer, update `word_counts`
            and dump the unk-replaced descriptions to
            'unk_replaced.train.jsons'.
        :param test: if true, an instance with a falsy color is allowed and
            gets the placeholder color (0.0, 0.0, 0.0); otherwise such an
            instance fails an assertion.
        :param inverted: if true, descriptions are read from `inst.output`
            and colors from `inst.input` instead of the other way around.
        :return: `([x], [y])` where `x` is an int32 array of shape
            (num instances, seq_vec.max_len) and `y` is an int32 vector of
            color bucket indices.
        '''
        def get_multi(val):
            # Unwrap a one-element tuple; pass anything else through.
            if isinstance(val, tuple):
                assert len(val) == 1
                return val[0]
            else:
                return val

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_desc, get_color = (get_o, get_i) if inverted else (get_i, get_o)
        get_i_ind, get_o_ind = ((lambda inst: inst.alt_inputs[get_multi(inst.input)]),
                                (lambda inst: inst.alt_outputs[get_multi(inst.output)]))
        get_color_indexed = get_i_ind if inverted else get_o_ind

        if hasattr(self.options, 'listener_tokenizer'):
            tokenize = TOKENIZERS[self.options.listener_tokenizer]
        else:
            tokenize = TOKENIZERS['whitespace']

        if init_vectorizer:
            tokenized = [['<s>'] + tokenize(get_desc(inst)) + ['</s>']
                         for inst in training_instances]
            self.seq_vec.add_all(tokenized)
            unk_replaced = self.seq_vec.unk_replace_all(tokenized)
            self.word_counts.update([get_desc(inst) for inst in training_instances])
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

        sentences = []
        colors = []
        if self.options.verbosity >= 9:
            print('%s _data_to_arrays:' % self.id)
        for i, inst in enumerate(training_instances):
            desc = tokenize(get_desc(inst))
            color = get_color(inst)
            if isinstance(color, numbers.Number):
                # A numeric color is an index into the instance's alternatives.
                color = get_color_indexed(inst)
            if not color:
                assert test
                color = (0.0, 0.0, 0.0)
            # Left-pad with '<s>' so the sentence ends at the last position.
            s = ['<s>'] * (self.seq_vec.max_len - 1 - len(desc)) + desc
            s.append('</s>')
            if self.options.verbosity >= 9:
                print('%s -> %s' % (repr(s), repr(color)))
            sentences.append(s)
            colors.append(color)

        x = np.zeros((len(sentences), self.seq_vec.max_len), dtype=np.int32)
        y = np.zeros((len(sentences),), dtype=np.int32)
        for i, sentence in enumerate(sentences):
            if len(sentence) > x.shape[1]:
                # Over-long sentences are cut on the right, which also drops
                # the trailing '</s>'.
                sentence = sentence[:x.shape[1]]
            x[i, :] = self.seq_vec.vectorize(sentence)
            y[i] = self.color_vec.vectorize(colors[i], hsv=True)

        return [x], [y]

    def _build_model(self, model_class=SimpleLasagneModel):
        '''
        Create the Theano input/target variables, build the network with
        `_get_l_out` and wrap it in `model_class`, stored as `self.model`.
        '''
        id_tag = (self.id + '/') if self.id else ''

        input_var = T.imatrix(id_tag + 'inputs')
        target_var = T.ivector(id_tag + 'targets')

        self.l_out, self.input_layers = self._get_l_out([input_var])
        self.loss = categorical_crossentropy

        self.model = model_class(
            [input_var], [target_var], self.l_out,
            loss=self.loss, optimizer=OPTIMIZERS[self.options.listener_optimizer],
            learning_rate=self.options.listener_learning_rate,
            id=self.id)

    def train_priors(self, training_instances, listener_data=False):
        '''
        Instantiate and train the two priors (`prior_emp`, `prior_smooth`).
        Both currently use the same class, selected by `listener_prior`.
        '''
        prior_class = PRIORS[self.options.listener_prior]
        self.prior_emp = prior_class()  # TODO: accurate values for empirical prior
        self.prior_smooth = prior_class()

        self.prior_emp.train(training_instances, listener_data=listener_data)
        self.prior_smooth.train(training_instances, listener_data=listener_data)

    def _get_l_out(self, input_vars):
        '''
        Build the network: embedding, two recurrent layers, one dense hidden
        layer and a softmax over color buckets, with a dropout layer after
        each of the recurrent and hidden layers when `listener_dropout` > 0.

        :param input_vars: list whose first element is the symbolic input
            of token indices.
        :return: `(l_out, [l_in])`, the output layer and the list of input
            layers.
        '''
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        input_var = input_vars[0]

        l_in = InputLayer(shape=(None, self.seq_vec.max_len), input_var=input_var,
                          name=id_tag + 'desc_input')
        l_in_embed = EmbeddingLayer(l_in, input_size=len(self.seq_vec.tokens),
                                    output_size=self.options.listener_cell_size,
                                    name=id_tag + 'desc_embed')

        cell = CELLS[self.options.listener_cell]
        cell_kwargs = {
            'grad_clipping': self.options.listener_grad_clipping,
            'num_units': self.options.listener_cell_size,
        }
        if self.options.listener_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.listener_forget_bias))
        if self.options.listener_cell != 'GRU':
            # The nonlinearity keyword is passed to every cell type but GRU.
            cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.listener_nonlinearity]

        l_rec1 = cell(l_in_embed, name=id_tag + 'rec1', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec1_drop = DropoutLayer(l_rec1, p=self.options.listener_dropout,
                                       name=id_tag + 'rec1_drop')
        else:
            l_rec1_drop = l_rec1
        l_rec2 = cell(l_rec1_drop, name=id_tag + 'rec2', **cell_kwargs)
        if self.options.listener_dropout > 0.0:
            l_rec2_drop = DropoutLayer(l_rec2, p=self.options.listener_dropout,
                                       name=id_tag + 'rec2_drop')
        else:
            l_rec2_drop = l_rec2

        l_hidden = DenseLayer(l_rec2_drop, num_units=self.options.listener_cell_size,
                              nonlinearity=NONLINEARITIES[self.options.listener_nonlinearity],
                              name=id_tag + 'hidden')
        if self.options.listener_dropout > 0.0:
            l_hidden_drop = DropoutLayer(l_hidden, p=self.options.listener_dropout,
                                         name=id_tag + 'hidden_drop')
        else:
            l_hidden_drop = l_hidden
        # Linear scores first, then a separate softmax layer on top.
        l_scores = DenseLayer(l_hidden_drop, num_units=self.color_vec.num_types,
                              nonlinearity=None, name=id_tag + 'scores')
        l_out = NonlinearityLayer(l_scores, nonlinearity=softmax, name=id_tag + 'out')

        return l_out, [l_in]

    def sample_prior_smooth(self, num_samples):
        '''
        Draw `num_samples` samples from the smoothed prior.
        '''
        return self.prior_smooth.sample(num_samples)
def __init__(self):
    '''
    Start with empty token statistics and a sequence vectorizer configured
    from the global speaker options.
    '''
    opts = config.options()
    self.tokenizer = opts.speaker_tokenizer

    # No tokens have been counted yet.
    self.num_tokens = 0
    self.token_counts = Counter()

    self.seq_vec = SequenceVectorizer(unk_threshold=opts.speaker_unk_threshold)
class LRContextListenerLearner(Learner):
    '''
    Listener that scores each color in an instance's context with a
    logistic regression over products of description and color features.
    '''
    def train(self, training_instances, validation_instances=None, metrics=None):
        '''
        Fit an lbfgs logistic regression on the training instances.
        `validation_instances` and `metrics` are accepted but not used.
        '''
        features, labels = self._data_to_arrays(training_instances, init_vectorizer=True)
        self.mod = LogisticRegression(solver='lbfgs')
        self.mod.fit(features, labels)

    @property
    def num_params(self):
        '''
        Total number of entries in the fitted coefficient and intercept
        arrays.
        '''
        coef_count = np.prod(self.mod.coef_.shape)
        intercept_count = np.prod(self.mod.intercept_.shape)
        return coef_count + intercept_count

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        '''
        Return `(predictions, scores)` as lists, one entry per instance.

        A prediction is the index of the highest-scoring context color.
        A score is the sum of the normalized log scores at the positions
        where the label is nonzero. `random` and `verbosity` are not used.
        '''
        features, labels = self._data_to_arrays(eval_instances)
        grid_shape = (len(eval_instances), self.context_len)
        labels = labels.reshape(grid_shape)

        # Column 1 of the log-probability output, one row per
        # (instance, context color) pair, regrouped per instance.
        log_scores = self.mod.predict_log_proba(features)[:, 1].reshape(grid_shape)
        # Renormalize so that each instance's scores sum to one in
        # probability space.
        log_scores = log_scores - logsumexp(log_scores, axis=1)[:, np.newaxis]

        predictions = log_scores.argmax(axis=1)
        true_scores = np.where(labels, log_scores, 0).sum(axis=1)
        return predictions.tolist(), true_scores.tolist()

    def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
        '''
        Build the feature matrix and the 0/1 label vector, with one row
        per (instance, context color) pair.

        :param instances: instances to convert.
        :param inverted: if true, descriptions come from `inst.output`, the
            target from `inst.input` and the context colors from
            `inst.alt_inputs`; otherwise input, output and `alt_outputs`.
        :param init_vectorizer: if true, (re)create the sequence
            vectorizer, dump the unk-replaced descriptions to
            'unk_replaced.train.jsons', and record the context length and
            the color vectorizer.
        :return: `(feats, labels)`.
        '''
        self.get_options()

        if inverted:
            desc_attr, color_attr, alts_attr = 'output', 'input', 'alt_inputs'
        else:
            desc_attr, color_attr, alts_attr = 'input', 'output', 'alt_outputs'

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(getattr(inst, desc_attr)) for inst in instances]
        context_lens = [len(getattr(inst, alts_attr)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)

        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)

            self.context_len = context_lens[0]

            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        # Every instance must have the same number of context colors.
        assert all(cl == self.context_len for cl in context_lens), (self.context_len, context_lens)

        # Right-pad each description with '</s>' and clip it to max_len.
        max_len = self.seq_vec.max_len
        padded = []
        for tokens in unk_replaced:
            filler = ['</s>'] * (max_len - len(tokens))
            padded.append((tokens + filler)[:max_len])

        colors = []
        label_list = []
        for inst in instances:
            colors.extend(getattr(inst, alts_attr))
        for inst in instances:
            target = getattr(inst, color_attr)
            # 1 at the context position equal to the target, 0 elsewhere.
            label_list.extend(int(pos == target) for pos in range(self.context_len))
        labels = np.array(label_list)

        desc_indices = self.seq_vec.vectorize_all(padded)
        num_descs = desc_indices.shape[0]
        # +1/-1 bag of words: -1 everywhere, 1 for each token index present.
        desc_bow = -np.ones((num_descs, self.seq_vec.num_types))
        row_index = np.arange(num_descs)[:, np.newaxis]
        desc_bow[row_index, desc_indices] = 1.

        color_feats = self.color_vec.vectorize_all(colors)
        color_dim = color_feats.shape[1]
        color_feats = color_feats.reshape((num_descs, self.context_len, color_dim))

        # Outer product of the description features with each context
        # color's features, then flattened to one row per pair.
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((num_descs * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))

        return feats, labels

    def get_options(self):
        '''
        Load the global options into `self.options` unless already set.
        '''
        if hasattr(self, 'options'):
            return
        self.options = config.options()