import numpy as np
import theano
import theano.tensor as T
from scipy.special import logsumexp  # lives in scipy.misc in older SciPy
from sklearn.linear_model import LogisticRegression

# Assumed to be provided by the surrounding project (stanza.research-style
# experiment code): Learner, SequenceVectorizer, COLOR_REPRS, TOKENIZERS,
# config, instance, sample, and strip_invalid_tokens.


class UnigramPrior(object):
    '''
    >>> p = UnigramPrior()
    >>> p.train([instance.Instance('blue')])
    >>> p.sample(3)  # doctest: +ELLIPSIS
    [Instance('...', None), Instance('...', None), Instance('...', None)]
    '''
    def __init__(self):
        self.vec = SequenceVectorizer()
        self.vec.add_all([['</s>'], ['<MASK>']])
        self.counts = theano.shared(np.zeros((self.vec.num_types,),
                                             dtype=np.int32))
        self.total = theano.shared(np.array(0, dtype=np.int32))
        # Unigram log-probability of each token type.
        self.log_probs = T.log(T.cast(self.counts, 'float32') /
                               T.cast(self.total, 'float32'))
        self.mask_index = self.vec.vectorize(['<MASK>'])[0]

    def train(self, training_instances, listener_data=True):
        get_utt = ((lambda inst: inst.input) if listener_data else
                   (lambda inst: inst.output))
        tokenized = [get_utt(inst).split() for inst in training_instances]
        self.vec.add_all(tokenized)
        x = self.vec.vectorize_all(self.pad(tokenized, self.vec.max_len))
        vocab_size = self.vec.num_types
        counts = np.bincount(x.flatten(), minlength=vocab_size).astype(np.int32)
        # Padding tokens should carry no probability mass.
        counts[self.mask_index] = 0
        self.counts.set_value(counts)
        self.total.set_value(np.sum(counts, dtype=np.int32))

    def apply(self, input_vars):
        (x,) = input_vars
        token_probs = self.log_probs[x]
        if self.mask_index is not None:
            # Zero out the log-probs at <MASK> (padding) positions.
            token_probs = token_probs * T.cast(T.neq(x, self.mask_index),
                                               'float32')
        if token_probs.ndim == 1:
            return token_probs
        else:
            # Summing token log-probs gives the log-prob of each sequence.
            return token_probs.sum(axis=1)

    def sample(self, num_samples=1):
        # `sample` here is the module-level categorical sampling helper, not
        # this method.
        probs = self.counts.get_value() * 1.0 / self.total.get_value()
        indices = np.array([[sample(probs) for _t in range(self.vec.max_len)]
                            for _s in range(num_samples)], dtype=np.int32)
        return [instance.Instance(' '.join(strip_invalid_tokens(s)))
                for s in self.vec.unvectorize_all(indices)]

    def pad(self, sequences, length):
        '''
        Add a </s> token followed by zero or more <MASK> tokens to bring the
        total length of each sequence to `length + 1` (the extra one is
        because every sequence receives a </s>; `length` should be the
        maximum length of the original sequences).

        >>> UnigramPrior().pad([['blue'], ['very', 'blue']], 2)
        [['blue', '</s>', '<MASK>'], ['very', 'blue', '</s>']]
        '''
        return [seq + ['</s>'] + ['<MASK>'] * (length - len(seq))
                for seq in sequences]
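# Hedged usage sketch (hypothetical, not part of the original module;
# _demo_unigram_prior is an invented name). It mirrors the calls train()
# makes above to build a padded index matrix, then compiles apply() into a
# Theano function returning one summed token log-probability per sequence.
def _demo_unigram_prior():
    p = UnigramPrior()
    p.train([instance.Instance('blue'), instance.Instance('very blue')])
    x_var = T.imatrix('x')
    score = theano.function([x_var], p.apply([x_var]))
    padded = p.pad([['blue']], p.vec.max_len)
    return score(p.vec.vectorize_all(padded))  # array of sequence log-probs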
class LRContextListenerLearner(Learner):
    def train(self, training_instances, validation_instances=None, metrics=None):
        X, y = self._data_to_arrays(training_instances, init_vectorizer=True)
        self.mod = LogisticRegression(solver='lbfgs')
        self.mod.fit(X, y)

    @property
    def num_params(self):
        return np.prod(self.mod.coef_.shape) + np.prod(self.mod.intercept_.shape)

    def predict_and_score(self, eval_instances, random=False, verbosity=0):
        X, y = self._data_to_arrays(eval_instances)
        y = y.reshape((len(eval_instances), self.context_len))
        # Log-probability of the positive class for each (instance, color)
        # pair, renormalized over the colors in each context.
        all_scores = self.mod.predict_log_proba(X)[:, 1].reshape(
            (len(eval_instances), self.context_len))
        all_scores -= logsumexp(all_scores, axis=1)[:, np.newaxis]
        preds = all_scores.argmax(axis=1)
        # Score of each instance = normalized log-prob of its true color.
        scores = np.where(y, all_scores, 0).sum(axis=1)
        return preds.tolist(), scores.tolist()

    def _data_to_arrays(self, instances, inverted=False, init_vectorizer=False):
        self.get_options()

        get_i, get_o = (lambda inst: inst.input), (lambda inst: inst.output)
        get_desc, get_color = (get_o, get_i) if inverted else (get_i, get_o)
        get_alt_i, get_alt_o = ((lambda inst: inst.alt_inputs),
                                (lambda inst: inst.alt_outputs))
        get_alt_colors = get_alt_i if inverted else get_alt_o

        tokenize = TOKENIZERS[self.options.listener_tokenizer]
        tokenized = [tokenize(get_desc(inst)) for inst in instances]
        context_lens = [len(get_alt_colors(inst)) for inst in instances]

        if init_vectorizer:
            self.seq_vec = SequenceVectorizer()
            self.seq_vec.add_all(tokenized)

        unk_replaced = self.seq_vec.unk_replace_all(tokenized)
        if init_vectorizer:
            config.dump(unk_replaced, 'unk_replaced.train.jsons', lines=True)
            self.context_len = context_lens[0]
            color_repr = COLOR_REPRS[self.options.listener_color_repr]
            self.color_vec = color_repr(self.options.listener_color_resolution,
                                        hsv=self.options.listener_hsv)

        assert all(cl == self.context_len for cl in context_lens), \
            (self.context_len, context_lens)

        padded = [(d + ['</s>'] * (self.seq_vec.max_len - len(d)))[:self.seq_vec.max_len]
                  for d in unk_replaced]
        colors = [c for inst in instances for c in get_alt_colors(inst)]
        labels = np.array([int(i == get_color(inst))
                           for inst in instances
                           for i in range(self.context_len)])

        desc_indices = self.seq_vec.vectorize_all(padded)
        # Signed bag of words: -1 everywhere, +1 for token types present in
        # the description.
        desc_bow = -np.ones((desc_indices.shape[0], self.seq_vec.num_types))
        desc_bow[np.arange(desc_indices.shape[0])[:, np.newaxis], desc_indices] = 1.
        color_feats = self.color_vec.vectorize_all(colors)
        color_feats = color_feats.reshape(
            (desc_indices.shape[0], self.context_len, color_feats.shape[1]))
        # Outer product of description features with each color's features:
        # one flattened feature vector per (instance, color) pair.
        feats = np.einsum('ij,ick->icjk', desc_bow, color_feats)
        feats = feats.reshape((desc_indices.shape[0] * self.context_len,
                               desc_bow.shape[1] * color_feats.shape[2]))
        return feats, labels

    def get_options(self):
        if not hasattr(self, 'options'):
            self.options = config.options()
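# Hedged usage sketch (hypothetical, not part of the original module;
# _demo_lr_listener is an invented name). It assumes the project's config
# system has been initialized with the listener_* options this class reads,
# and that Instance takes the fields used above: input = description string,
# output = index of the target color, alt_outputs = the context colors (in
# whatever representation the configured color vectorizer expects).
def _demo_lr_listener():
    context = [(255, 0, 0), (0, 0, 255), (0, 255, 0)]
    data = [instance.Instance('red', 0, alt_outputs=context),
            instance.Instance('blue', 1, alt_outputs=context)]
    learner = LRContextListenerLearner()
    learner.train(data)
    # preds: argmax color index per instance; scores: log-prob of true color.
    preds, scores = learner.predict_and_score(data)
    return preds, scores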