def _create_neg(self):
    data = []
    for _ in range(self.n_neg):
        vec1 = randvec(self.embed_dim)
        # Start from an identical copy so the loop below runs at least
        # once, then resample until the two vectors actually differ.
        vec2 = vec1.copy()
        while np.array_equal(vec1, vec2):
            vec2 = randvec(self.embed_dim)
        rep = (vec1, vec2)
        data.append((rep, self.NEG_LABEL))
    return data

def _create_pos(self):
    data = []
    for _ in range(self.n_pos):
        # Positive instances pair a vector with itself.
        vec = randvec(self.embed_dim)
        rep = (vec, vec)
        data.append((rep, self.POS_LABEL))
    return data
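The two builders above read naturally as methods of a small equality-dataset class. Here is a minimal sketch of such a host class; the class name, constructor signature, and `create` method are illustrative assumptions, not part of the original:

import random

class EqualityDataset:
    """Hypothetical host class for the two builders above."""
    POS_LABEL = 1
    NEG_LABEL = 0

    def __init__(self, embed_dim=10, n_pos=50, n_neg=50):
        self.embed_dim = embed_dim
        self.n_pos = n_pos
        self.n_neg = n_neg

    # _create_pos and _create_neg exactly as defined above.

    def create(self):
        # Concatenate positives and negatives, then shuffle so the
        # two labels are not blocked together.
        data = self._create_pos() + self._create_neg()
        random.shuffle(data)
        return data

dataset = EqualityDataset(embed_dim=10, n_pos=5, n_neg=5)
# With the two methods attached, this yields 10 labeled pairs:
# examples = dataset.create()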
def create_example_dataset(group_size=100, vec_dim=2):
    """
    Creates simple datasets in which the inputs are three-vector
    sequences and the outputs are simple character sequences, with
    the range of values in the final vector in the input determining
    the output sequence. For example, a single input/output pair
    will look like this:

    [[0.44, 0.51], [0.87, 0.89], [0.1, 0.2]],  ['<s>', 'A', '</s>']

    The sequences are meaningless, as are their lengths (which were
    chosen only to be different from each other).

    """
    import random

    groups = ((0.0, 0.2), (0.4, 0.6), (0.8, 1.0))
    vocab = ['<s>', '</s>', 'A', 'B', '$UNK']
    seqs = [
        ['<s>', 'A', '</s>'],
        ['<s>', 'A', 'B', '</s>'],
        ['<s>', 'B', 'A', 'B', 'A', '</s>']]

    color_seqs = []
    word_seqs = []
    for i, ((l, u), seq) in enumerate(zip(groups, seqs)):

        dis_indices = list(range(len(groups)))
        dis_indices.remove(i)
        random.shuffle(dis_indices)
        disl1, disu1 = groups[dis_indices[0]]
        disl2, disu2 = groups[dis_indices[1]]

        for _ in range(group_size):
            target = utils.randvec(vec_dim, l, u)
            dis1 = utils.randvec(vec_dim, disl1, disu1)
            dis2 = utils.randvec(vec_dim, disl2, disu2)
            context = [dis1, dis2, target]
            color_seqs.append(context)

        word_seqs += [seq for _ in range(group_size)]

    return color_seqs, word_seqs, vocab
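A quick shape check on the return values (a minimal usage sketch; it only assumes `utils.randvec` is importable):

color_seqs, word_seqs, vocab = create_example_dataset(group_size=3, vec_dim=2)

assert len(color_seqs) == len(word_seqs) == 9       # 3 groups x group_size
assert all(len(context) == 3 for context in color_seqs)  # 3 vectors per input
assert all(seq[0] == '<s>' and seq[-1] == '</s>' for seq in word_seqs)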
Example #4
def glove_middle_featurizer(kbt, corpus, np_func=np.sum):
    reps = []
    for ex in corpus.get_examples_for_entities(kbt.sbj, kbt.obj):
        for word in ex.middle.split():
            rep = glove_lookup.get(word)
            if rep is not None:
                reps.append(rep)
    # A random representation of the right dimensionality if the
    # example happens not to overlap with GloVe's vocabulary:
    if len(reps) == 0:
        dim = len(next(iter(glove_lookup.values())))
        return utils.randvec(n=dim)
    else:
        return np_func(reps, axis=0)
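The pool-or-fall-back pattern is easy to exercise in isolation. A toy sketch with an invented two-word lookup table:

import numpy as np

toy_lookup = {'cat': np.array([1.0, 2.0]), 'dog': np.array([3.0, 4.0])}

# Pool the vectors for the words the lookup knows about:
reps = [toy_lookup[w] for w in 'cat dog emu'.split() if w in toy_lookup]
print(np.sum(reps, axis=0))                # [4. 6.]

# Dimensionality for the random fallback when nothing overlaps:
dim = len(next(iter(toy_lookup.values())))
print(dim)                                 # 2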
Example #5
def test_premack_create_same_same(dataset_class):
    n_pos = 20
    dataset = dataset_class(n_pos=n_pos,
                            flatten_root=True,
                            flatten_leaves=False)
    if 'Trained' in dataset_class.__name__:
        examples = [utils.randvec(10) for _ in range(n_pos)]
        result = dataset._create_same_same(examples)
    else:
        result = dataset._create_same_same()
    assert len(result) == dataset.n_same_same
    for (p1, p2), label in result:
        assert label == dataset.POS_LABEL
        assert np.array_equal(p1[0], p1[1])
        assert np.array_equal(p2[0], p2[1])
Example #6
def test_np_model(X_sequence):
    """Just makes sure that this code will run; it doesn't check that
    it is creating good models.
    """
    X_train, X_test, y_train, y_test, vocab = X_sequence
    embedding = np.array([utils.randvec(10) for _ in vocab])
    mod = RNNClassifier(vocab=vocab,
                        embedding=embedding,
                        hidden_dim=20,
                        max_iter=100)
    mod.fit(X_train, y_train)
    mod.predict(X_test)
    mod.predict_proba(X_test)
    mod.predict_one(X_test[0])
    mod.predict_one_proba(X_test[0])
Example #7
def test_premack_create_same_diff(dataset_class):
    n_neg = 20
    vecs_needed = 30
    dataset = dataset_class(n_neg=n_neg,
                            flatten_root=True,
                            flatten_leaves=False)
    if 'Trained' in dataset_class.__name__:
        examples = [utils.randvec(10) for _ in range(vecs_needed)]
        result = dataset._create_same_diff(examples)
    else:
        result = dataset._create_same_diff()
    assert len(result) == dataset.n_same_diff
    for (p1, p2), label in result:
        assert label == dataset.NEG_LABEL
        assert np.array_equal(p1[0], p1[1])
        assert not np.array_equal(p2[0], p2[1])
Example #8
def glove_featurizer(tweet, glove_lookup, np_func=np.sum):
    """Get vector representation of one tweet.
    """
    reps = []
    tokens = tweet.split()
    for word in tokens:
        rep = glove_lookup.get(word)
        if rep is not None:
            reps.append(rep)
    # A random representation of the right dimensionality if the
    # example happens not to overlap with GloVe's vocabulary:
    if len(reps) == 0:
        dim = len(next(iter(glove_lookup.values())))
        return randvec(n=dim)
    else:
        return np_func(reps, axis=0)
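Since the pooling function is a parameter, switching from summing to averaging is a one-argument change. A usage sketch with an invented tweet and lookup (assumes `glove_featurizer` above is in scope):

import numpy as np

toy_lookup = {'nice': np.array([1.0, 1.0]), 'day': np.array([3.0, 5.0])}

print(glove_featurizer('nice day', toy_lookup, np_func=np.sum))   # [4. 6.]
print(glove_featurizer('nice day', toy_lookup, np_func=np.mean))  # [2. 3.]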
Example #9
def _train_embedding(self):
    # Random initial embedding, one row per vocabulary item.
    embedding = np.array(
        [randvec(self.embed_dim) for _ in range(self.vocab_size)])
    mod = RepLearner(self.vocab_size,
                     embed_dim=self.embed_dim,
                     embedding=embedding,
                     hidden_dim=self.hidden_dim,
                     n_tasks=self.n_tasks,
                     max_iter=self.max_iter)
    X = list(range(self.vocab_size))
    ys = []
    for _ in range(self.n_tasks):
        # One random binary labeling of the full vocabulary per task.
        y = np.random.choice((0, 1), size=self.vocab_size, replace=True)
        ys.append(y)
    # Transpose so each vocabulary item carries one label per task.
    ys = list(zip(*ys))
    mod.fit(X, ys)
    self.embedding = mod.embedding
    self.embedding_labels = ys
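The `list(zip(*ys))` transpose can be seen in isolation with a tiny sketch (the numbers here are invented):

# Two binary tasks over a three-item vocabulary:
ys = [[0, 1, 1], [1, 1, 0]]

# One (task-1 label, task-2 label) tuple per vocabulary item:
print(list(zip(*ys)))   # [(0, 1), (1, 1), (1, 0)]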
Example #10
def load_data_embedding(glove6B=False):
    X_train, y_train = build_dataset('train', max_sen_length)
    X_dev, y_dev = build_dataset('dev', max_sen_length)
    embedding_weights = np.zeros((vocab_size, vocab_dim))
    if vocab_dim == 50:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))
    elif vocab_dim == 100:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.100d.txt'))
    elif vocab_dim == 200:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.200d.txt'))
    elif vocab_dim == 300:
        if glove6B:
            GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.300d.txt'))
        else:
            # Use a new name here: assigning to `glove_home` would make it
            # local to the whole function and raise UnboundLocalError in
            # the branches above.
            glove_840B_home = 'glove_dir/glove.840B'
            GLOVE = utils.glove2dict(
                os.path.join(glove_840B_home, 'glove.840B.300d.txt'))
    else:
        raise ValueError('vocab_dim must be one of 50, 100, 200, 300')
    for word, index in wordMap.items():
        if word in GLOVE:
            embedding_weights[index, :] = GLOVE[word]
        else:
            # Random vector for words outside GloVe's vocabulary.
            embedding_weights[index, :] = utils.randvec(vocab_dim)
    return X_train, y_train, X_dev, y_dev, embedding_weights
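`utils.glove2dict` is assumed throughout to turn a GloVe text file into a `{word: vector}` dict. A minimal reader along those lines (a sketch, not the original implementation; the real 840B files need more careful token handling):

import numpy as np

def glove2dict(src_filename):
    """Map each line of a GloVe file to word -> np.array."""
    lookup = {}
    with open(src_filename, encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            lookup[parts[0]] = np.array(parts[1:], dtype=np.float64)
    return lookup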
Example #11
def featurizer(kbt, corpus):
    return utils.randvec(10)
Example #12
def dummy_nonvectorizing_feature_function(kbt, corpus):
    return utils.randvec(10)
Example #13
def test_randvec():
    x = utils.randvec(10)
    assert len(x) == 10
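For reference, here is a plausible implementation consistent with how `utils.randvec` is called across these examples; the exact defaults are an assumption:

import random

import numpy as np

def randvec(n=50, lower=-0.5, upper=0.5):
    """Length-`n` vector with entries drawn uniformly from [lower, upper]."""
    return np.array([random.uniform(lower, upper) for _ in range(n)])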
Example #14
def bias_init(n):
    """Matches the current PyTorch default initialization for `nn.Linear`."""
    x = np.sqrt(1.0 / n)
    return randvec(n, lower=-x, upper=x)
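As of recent PyTorch versions, `nn.Linear` draws its default bias from U(-1/sqrt(fan_in), 1/sqrt(fan_in)), which is the bound `x` above. A quick check (assumes PyTorch is installed):

import numpy as np
import torch.nn as nn

n = 100
bound = np.sqrt(1.0 / n)
layer = nn.Linear(n, 1)
assert layer.bias.abs().max().item() <= bound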
Example #15
        
        """
        self._forward_propagation(seq)
        return np.argmax(self.y)


######################################################################

if __name__ == '__main__':

    T = 'T'
    F = 'F'

    train = [
        # p  q      XOR
        ([T, T], [1., 0.]),
        ([T, F], [0., 1.]),
        ([F, T], [0., 1.]),
        ([F, F], [1., 0.])
    ]

    vocab = [T, F]
    embedding = np.array([randvec(10) for _ in vocab])

    mod = ClassifierRNN(vocab=vocab, embedding=embedding, maxiter=1000)
    mod.fit(copy.copy(train))

    for x, y in train:
        p = mod.predict(x)
        print(p == np.argmax(y), mod.y, y)
Example #16
def randvec(w, n=50, lower=-1.0, upper=1.0):
    """Returns a random vector of length `n`. `w` is ignored."""
    return utils.randvec(n=n, lower=lower, upper=upper)
Example #17
def _create_same_pair(self):
    # Positive instance: a vector paired with itself.
    vec = randvec(self.embed_dim)
    return (vec, vec)
Example #18
def _create_diff_pair(self):
    # Negative instance: two independent draws. For continuous random
    # vectors these differ with probability 1, hence the bare assert.
    vec1 = randvec(self.embed_dim)
    vec2 = randvec(self.embed_dim)
    assert not np.array_equal(vec1, vec2)
    return (vec1, vec2)
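If a guarantee rather than an assertion is wanted, `_create_diff_pair` could resample the way `_create_neg` at the top of this page does; a sketch of that variant:

def _create_diff_pair(self):
    vec1 = randvec(self.embed_dim)
    vec2 = vec1.copy()
    # Resample until the second vector differs from the first.
    while np.array_equal(vec1, vec2):
        vec2 = randvec(self.embed_dim)
    return (vec1, vec2)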