def _create_neg(self):
    data = []
    for _ in range(self.n_neg):
        vec1 = randvec(self.embed_dim)
        # Start from a copy so the while loop runs at least once,
        # guaranteeing that the returned `vec2` differs from `vec1`.
        vec2 = vec1.copy()
        while np.array_equal(vec1, vec2):
            vec2 = randvec(self.embed_dim)
        rep = (vec1, vec2)
        data.append((rep, self.NEG_LABEL))
    return data

def _create_pos(self):
    data = []
    for _ in range(self.n_pos):
        vec = randvec(self.embed_dim)
        rep = (vec, vec)
        data.append((rep, self.POS_LABEL))
    return data

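# Illustration (not from the source): together, the two builders above
# guarantee a simple invariant -- positive pairs are identical, negative
# pairs are not. `ds` below is a hypothetical instance of the dataset
# class that owns these methods.

for (v1, v2), label in ds._create_pos():
    assert np.array_equal(v1, v2) and label == ds.POS_LABEL

for (v1, v2), label in ds._create_neg():
    assert not np.array_equal(v1, v2) and label == ds.NEG_LABEL
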
def create_example_dataset(group_size=100, vec_dim=2):
    """Creates simple datasets in which the inputs are three-vector
    sequences and the outputs are simple character sequences, with the
    range of values in the final vector in the input determining the
    output sequence. For example, a single input/output pair will look
    like this:

    [[0.44, 0.51], [0.87, 0.89], [0.1, 0.2]], ['<s>', 'A', '</s>']

    The sequences are meaningless, as are their lengths (which were
    chosen only to be different from each other).
    """
    import random

    groups = ((0.0, 0.2), (0.4, 0.6), (0.8, 1.0))
    vocab = ['<s>', '</s>', 'A', 'B', '$UNK']
    seqs = [
        ['<s>', 'A', '</s>'],
        ['<s>', 'A', 'B', '</s>'],
        ['<s>', 'B', 'A', 'B', 'A', '</s>']]
    color_seqs = []
    word_seqs = []
    for i, ((l, u), seq) in enumerate(zip(groups, seqs)):
        # The two distractor ranges are the other two groups,
        # in random order.
        dis_indices = list(range(len(groups)))
        dis_indices.remove(i)
        random.shuffle(dis_indices)
        disl1, disu1 = groups[dis_indices[0]]
        disl2, disu2 = groups[dis_indices[1]]
        for _ in range(group_size):
            target = utils.randvec(vec_dim, l, u)
            dis1 = utils.randvec(vec_dim, disl1, disu1)
            dis2 = utils.randvec(vec_dim, disl2, disu2)
            context = [dis1, dis2, target]
            color_seqs.append(context)
        word_seqs += [seq for _ in range(group_size)]
    return color_seqs, word_seqs, vocab

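# Illustration (not from the source): a quick smoke test of the generator,
# assuming `utils.randvec` is importable as in the function body. With
# three groups, the outputs have length 3 * group_size, and every input
# context holds exactly three vectors.

color_seqs, word_seqs, vocab = create_example_dataset(group_size=5, vec_dim=2)

assert len(color_seqs) == len(word_seqs) == 15  # 3 groups x 5 examples each
assert all(len(context) == 3 for context in color_seqs)
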
def glove_middle_featurizer(kbt, corpus, np_func=np.sum):
    reps = []
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split():
            rep = glove_lookup.get(word)
            if rep is not None:
                reps.append(rep)
    # A random representation of the right dimensionality if the
    # example happens not to overlap with GloVe's vocabulary:
    if len(reps) == 0:
        dim = len(next(iter(glove_lookup.values())))
        return utils.randvec(n=dim)
    else:
        return np_func(reps, axis=0)

def test_premack_create_same_same(dataset_class):
    n_pos = 20
    dataset = dataset_class(n_pos=n_pos, flatten_root=True, flatten_leaves=False)
    if 'Trained' in dataset_class.__name__:
        examples = [utils.randvec(10) for _ in range(n_pos)]
        result = dataset._create_same_same(examples)
    else:
        result = dataset._create_same_same()
    assert len(result) == dataset.n_same_same
    for (p1, p2), label in result:
        assert label == dataset.POS_LABEL
        assert np.array_equal(p1[0], p1[1])
        assert np.array_equal(p2[0], p2[1])

def test_np_model(X_sequence):
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X_train, X_test, y_train, y_test, vocab = X_sequence
    embedding = np.array([utils.randvec(10) for _ in vocab])
    mod = RNNClassifier(
        vocab=vocab,
        embedding=embedding,
        hidden_dim=20,
        max_iter=100)
    mod.fit(X_train, y_train)
    mod.predict(X_test)
    mod.predict_proba(X_test)
    mod.predict_one(X_test[0])
    mod.predict_one_proba(X_test[0])

def test_premack_create_same_diff(dataset_class):
    n_neg = 20
    vecs_needed = 30
    dataset = dataset_class(n_neg=n_neg, flatten_root=True, flatten_leaves=False)
    if 'Trained' in dataset_class.__name__:
        examples = [utils.randvec(10) for _ in range(vecs_needed)]
        result = dataset._create_same_diff(examples)
    else:
        result = dataset._create_same_diff()
    assert len(result) == dataset.n_same_diff
    for (p1, p2), label in result:
        assert label == dataset.NEG_LABEL
        assert np.array_equal(p1[0], p1[1])
        assert not np.array_equal(p2[0], p2[1])

def glove_featurizer(tweet, glove_lookup, np_func=np.sum):
    """Get a vector representation of one tweet."""
    reps = []
    tokens = tweet.split()
    for word in tokens:
        rep = glove_lookup.get(word)
        if rep is not None:
            reps.append(rep)
    # A random representation of the right dimensionality if the
    # example happens not to overlap with GloVe's vocabulary:
    if len(reps) == 0:
        dim = len(next(iter(glove_lookup.values())))
        return randvec(n=dim)
    else:
        return np_func(reps, axis=0)

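# Illustration (not from the source): the default summing behavior, shown
# with a hypothetical two-word lookup. Tokens outside the lookup ('day'
# here) are simply skipped, and the known words are summed elementwise.

import numpy as np

toy_lookup = {
    'happy': np.array([1.0, 2.0]),
    'sad': np.array([-1.0, 0.5])}

rep = glove_featurizer('happy sad day', toy_lookup)

print(rep)  # [0.  2.5]
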
def _train_embedding(self):
    embedding = np.array(
        [randvec(self.embed_dim) for _ in range(self.vocab_size)])
    mod = RepLearner(
        self.vocab_size,
        embed_dim=self.embed_dim,
        embedding=embedding,
        hidden_dim=self.hidden_dim,
        n_tasks=self.n_tasks,
        max_iter=self.max_iter)
    X = list(range(self.vocab_size))
    ys = []
    for _ in range(self.n_tasks):
        y = np.random.choice((0, 1), size=self.vocab_size, replace=True)
        ys.append(y)
    # Transpose so that each vocab item is paired with its labels
    # across all tasks.
    ys = list(zip(*ys))
    mod.fit(X, ys)
    self.embedding = mod.embedding
    self.embedding_labels = ys

def load_data_embedding(glove6B=False):
    X_train, y_train = build_dataset('train', max_sen_length)
    X_dev, y_dev = build_dataset('dev', max_sen_length)
    embedding_weights = np.zeros((vocab_size, vocab_dim))
    if vocab_dim == 50:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))
    elif vocab_dim == 100:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.100d.txt'))
    elif vocab_dim == 200:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.200d.txt'))
    elif vocab_dim == 300:
        if glove6B:
            GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.300d.txt'))
        else:
            # Use a separate name here: assigning to `glove_home` would
            # make it local throughout this function and break the
            # earlier reads of the module-level `glove_home`.
            glove_840b_home = 'glove_dir/glove.840B'
            GLOVE = utils.glove2dict(
                os.path.join(glove_840b_home, 'glove.840B.300d.txt'))
    else:
        raise ValueError("Unsupported vocab_dim: {}".format(vocab_dim))
    for word, index in wordMap.items():
        if word in GLOVE:
            embedding_weights[index, :] = GLOVE[word]
        else:
            embedding_weights[index, :] = utils.randvec(vocab_dim)
    return X_train, y_train, X_dev, y_dev, embedding_weights

def featurizer(kbt, corpus):
    return utils.randvec(10)

def dummy_nonvectorizing_feature_function(kbt, corpus):
    return utils.randvec(10)

def test_randvec():
    x = utils.randvec(10)
    assert len(x) == 10

def bias_init(n):
    """Uses the current PyTorch default initialization for `nn.Linear`
    biases: uniform on [-1/sqrt(n), 1/sqrt(n)], where `n` is the fan-in.
    """
    x = np.sqrt(1.0 / n)
    return randvec(n, lower=-x, upper=x)

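# Illustration (not from the source): a sanity check of the bound,
# assuming `randvec` returns a length-`n` vector drawn uniformly from
# [lower, upper]. Every entry should land in [-1/sqrt(n), 1/sqrt(n)],
# matching PyTorch's default bias initialization for `nn.Linear`.

n = 16
b = bias_init(n)
bound = np.sqrt(1.0 / n)

assert len(b) == n
assert all(-bound <= v <= bound for v in b)
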
""" self._forward_propagation(seq) return np.argmax(self.y) ###################################################################### if __name__ == '__main__': T = 'T' F = 'F' train = [ # p q XOR ([T, T], [1., 0.]), ([T, F], [0., 1.]), ([F, T], [0., 1.]), ([F, F], [1., 0.]) ] vocab = [T, F] embedding = np.array([randvec(10) for _ in vocab]) mod = ClassifierRNN(vocab=vocab, embedding=embedding, maxiter=1000) mod.fit(copy.copy(train)) for x, y in train: p = mod.predict(x) print(p == np.argmax(y), mod.y, y)
def randvec(w, n=50, lower=-1.0, upper=1.0):
    """Returns a random vector of length `n`. `w` is ignored."""
    return utils.randvec(n=n, lower=lower, upper=upper)

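# Illustration (not from the source): the ignored leading argument lets
# this wrapper stand in wherever a word-to-vector function is expected.
# A hypothetical call site:

vec = randvec('hello')        # 'hello' is ignored; 50 random values
assert len(vec) == 50

vec = randvec('world', n=25)  # dimensionality is controlled by `n`
assert len(vec) == 25
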
def _create_same_pair(self):
    vec = randvec(self.embed_dim)
    return (vec, vec)

def _create_diff_pair(self):
    vec1 = randvec(self.embed_dim)
    vec2 = randvec(self.embed_dim)
    # Two independently sampled real-valued vectors are equal with
    # probability ~0, so this assert is just a cheap safety net.
    assert not np.array_equal(vec1, vec2)
    return (vec1, vec2)
