Example #1
def main(we_file, w2i_file, use_brown=True, n_files=100):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = [] # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)

    # alternating least squares method
    model.fit(sentences, cc_matrix=cc_matrix)
    model.save(we_file)
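The entry point above expects output paths for the trained embeddings and the word2idx mapping; a minimal driver sketch is shown below. The file names are illustrative assumptions, not taken from the original script.

if __name__ == '__main__':
    # Illustrative paths only; the original script's defaults are not shown in this snippet.
    we_file = 'glove_model_brown.npz'
    w2i_file = 'glove_word2idx_brown.json'
    main(we_file, w2i_file, use_brown=True)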
Example #2
def main(we_file, w2i_file, n_files=50):
    cc_matrix = "cc_matrix_%s.npy" % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=20)
    # model.fit(
    #     sentences=sentences,
    #     cc_matrix=cc_matrix,
    #     learning_rate=3*10e-5,
    #     reg=0.01,
    #     epochs=2000,
    #     gd=True,
    #     use_theano=False,
    # )
    model.save(we_file)
Example #3
def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = [] # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(100, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=200)
    model.save(we_file)
Example #4
def main():
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    V = len(word2idx)
    model = Model(10, V, 2)
    # fp = open('/Users/macuser/Code/word2vec-proto/wiki.en.text')
    # model.fit(fp)
    model.fit(sentences)
    model.save('w2v_model.npz')
Example #5
def main():
    sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(80, V, 10)
    model.fitt(sentences, learning_rate=10e-4, mu=0, epochs=5)  # fitt appears to be the Theano-based fit (see Example #18)
    model.save('w2v_model.npz')
Example #7
def main(use_brown=True):
    if use_brown:
        # sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
        sentences, word2idx = get_sentences_with_word2idx()
    else:
        sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(50, V, 5)
    model.fit(sentences, learning_rate=1e-3, mu=0, epochs=3, num_neg_samples=5)
    model.save('w2v_model.npz')
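The save fragment at the top of Example #18 suggests the word2vec Model writes W1 and W2 with np.savez; under that assumption (the array keys and shapes below are guesses, not confirmed by these snippets), the saved embeddings could be reloaded and combined roughly like this:

import numpy as np

# np.savez(fn, W1, W2) stores positional arrays under the keys arr_0 and arr_1.
npz = np.load('w2v_model.npz')
W1, W2 = npz['arr_0'], npz['arr_1']
# Assuming W1 is V x D and W2 is D x V, a common choice is to average the
# input and output embeddings into a single word-vector matrix.
We = (W1 + W2.T) / 2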
Example #8
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10,
                                             n_vocab=1500,
                                             by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)
    V = len(word2idx)
    N = len(sentences)
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")
Example #9
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10,
                                             n_vocab=1500,
                                             by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print "finished getting raw counts"

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print "type(A):", type(A)
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
Example #10
def main():
    sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print "finished getting raw counts"

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print "type(A):", type(A)
    # exit()
    A = A.toarray()

    idx2word = {v:k for k, v in word2idx.items()}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except:
            print("bad string:", idx2word[i])
    plt.show()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z
    find_analogies('king', 'man', 'woman', We, word2idx)
    find_analogies('france', 'paris', 'london', We, word2idx)
    find_analogies('france', 'paris', 'rome', We, word2idx)
    find_analogies('paris', 'france', 'italy', We, word2idx)
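find_analogies is a helper from the surrounding course code and is not included in these snippets; a plausible cosine-similarity version matching the five-argument calls above might look like the following. This is a sketch, not the original implementation.

import numpy as np

def find_analogies(w1, w2, w3, We, word2idx):
    # king - man + woman ~ queen: look for the word whose vector is
    # closest (by cosine similarity) to We[w1] - We[w2] + We[w3].
    idx2word = {v: k for k, v in word2idx.items()}
    v0 = We[word2idx[w1]] - We[word2idx[w2]] + We[word2idx[w3]]
    norms = np.linalg.norm(We, axis=1) * np.linalg.norm(v0) + 1e-10
    similarities = We.dot(v0) / norms
    for idx in (-similarities).argsort():
        word = idx2word[idx]
        if word not in (w1, w2, w3):
            print("%s - %s + %s = %s" % (w1, w2, w3, word))
            break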
Example #11
def main(we_file, w2i_file, n_files=50):
    # co-occurrence matrix
    cc_matrix = 'cc_matrix_%s.npy' % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file, 'r') as f:
            word2idx = json.load(f)
        sentences = []
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(sentences=sentences,
              cc_matrix=cc_matrix,
              learning_rate=3 * 10e-5,
              reg=0.01,
              epochs=20,
              gd=False,
              use_theano=False)
    model.save(we_file)
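The learning_rate, reg, and gd arguments above presumably control how Glove.fit minimizes the standard weighted least-squares GloVe objective. A minimal numpy evaluation of that cost, under the usual weighting f(X_ij) = min(1, (X_ij / 100)**0.75), is sketched below; the exact form inside Glove.fit may differ.

import numpy as np

def glove_cost(X, W, U, b, c, reg):
    # X: V x V co-occurrence counts; W, U: V x D word/context embeddings;
    # b, c: V-dimensional bias vectors; reg: L2 regularization strength.
    fX = np.minimum((X / 100.0) ** 0.75, 1.0)   # usual GloVe weighting
    logX = np.log(X + 1)                        # +1 avoids log(0), as some implementations do
    delta = W.dot(U.T) + b[:, None] + c[None, :] - logX
    cost = (fX * delta * delta).sum()
    cost += reg * ((W * W).sum() + (U * U).sum())
    return cost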
Example #12
def main(we_file, w2i_file, n_files=75, use_brown=False):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []
    else:
        if use_brown:
            keep_words = set([
                'king',
                'man',
                'woman',
                'france',
                'paris',
                'london',
                'rome',
                'italy',
                'britain',
                'england',
                'french',
                'english',
                'japan',
                'japanese',
                'chinese',
                'italian',
                'australia',
                'australian',
                'december',
                'november',
                'june',
                'january',
                'february',
                'march',
                'april',
                'may',
                'july',
                'august',
                'september',
                'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(
                n_vocab=10000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files,
                                                     n_vocab=10000)

        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(137, V, 10)
    model.fit(sentences, cc_matrix=cc_matrix, epochs=100)
    # model.fit(
    #     sentences=sentences,
    #     cc_matrix=cc_matrix,
    #     learning_rate=3*10e-5,
    #     reg=0.01,
    #     epochs=2000,
    #     gd=True,
    #     use_theano=False,
    # )
    model.save(we_file)
Example #13
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size")
                notfound = True
    if notfound:
        exit()


    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v:k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show() # pause script until plot is closed
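Option 3 in the block above is left commented out; one concrete, classic alternative is truncated SVD (LSA-style), which can run directly on the TF-IDF matrix and is much cheaper than a 3-component t-SNE on the full data. A sketch, not part of the original script:

from sklearn.decomposition import TruncatedSVD

# Works on either the dense A used above or the sparse matrix returned by
# TfidfTransformer before the call to .toarray().
svd = TruncatedSVD(n_components=20)
We = svd.fit_transform(A)   # V x 20 word embeddings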
Example #14
def main():
    # import ipdb; ipdb.set_trace()  # leftover debugging breakpoint
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    ### choose a data source ###
    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=3, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size" % w)
                notfound = True
    if notfound:
        exit()


    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    print("V:", V, "N:", N)
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A.T).T

    # tsne requires a dense array
    A = A.toarray()

    # map back to word in plot
    idx2word = {v:k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    ### multiple ways to create vectors for each word ###
    # 1) simply set it to the TF-IDF matrix
    # We = A

    # 2) create a higher-D word embedding
    tsne = TSNE(n_components=3)
    We = tsne.fit_transform(A)

    # 3) use a classic dimensionality reduction technique
    # svd = KernelPCA(n_components=20, kernel='rbf')
    # We = svd.fit_transform(A)

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx, idx2word)

    plt.show() # pause script until plot is closed
Example #15
def main():
    sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    # sentences, word2idx = get_wikipedia_data(n_files=10, n_vocab=1500, by_paragraph=True)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)
Example #16
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20, n_vocab=2000, by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size")
                notfound = True
    if notfound:
        exit()


    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i,j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print("type(A):", type(A))
    # exit()
    A = A.toarray()

    idx2word = {v:k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:,0], Z[:,1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"), xy=(Z[i,0], Z[i,1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show() # pause script until plot is closed
Example #17
def main():
    analogies_to_try = (
        ('king', 'man', 'woman'),
        ('france', 'paris', 'london'),
        ('france', 'paris', 'rome'),
        ('paris', 'france', 'italy'),
    )

    # sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=1500)
    sentences, word2idx = get_wikipedia_data(n_files=20,
                                             n_vocab=2000,
                                             by_paragraph=True)
    # with open('tfidf_word2idx.json', 'w') as f:
    #     json.dump(word2idx, f)

    notfound = False
    for word_list in analogies_to_try:
        for w in word_list:
            if w not in word2idx:
                print("%s not found in vocab, remove it from \
                    analogies to try or increase vocab size")
                notfound = True
    if notfound:
        exit()

    # build term document matrix
    V = len(word2idx)
    N = len(sentences)

    # create raw counts first
    A = np.zeros((V, N))
    j = 0
    for sentence in sentences:
        for i in sentence:
            A[i, j] += 1
        j += 1
    print("finished getting raw counts")

    transformer = TfidfTransformer()
    A = transformer.fit_transform(A)
    # print("type(A):", type(A))
    # exit()
    A = A.toarray()

    idx2word = {v: k for k, v in iteritems(word2idx)}

    # plot the data in 2-D
    tsne = TSNE()
    Z = tsne.fit_transform(A)
    plt.scatter(Z[:, 0], Z[:, 1])
    for i in range(V):
        try:
            plt.annotate(idx2word[i].encode("utf8").decode("utf8"),
                         xy=(Z[i, 0], Z[i, 1]))
        except:
            print("bad string:", idx2word[i])
    plt.draw()

    # create a higher-D word embedding, try word analogies
    # tsne = TSNE(n_components=3)
    # We = tsne.fit_transform(A)
    We = Z

    for word_list in analogies_to_try:
        w1, w2, w3 = word_list
        find_analogies(w1, w2, w3, We, word2idx)

    plt.show()  # pause script until plot is closed
Example #18
        # tail of the model's save() method as captured in this snippet
        arrays = [self.W1, self.W2]
        np.savez(fn, *arrays)


def main(use_brown=True):
    if use_brown:
        sentences, word2idx = get_sentences_with_word2idx_limit_vocab()
        # sentences, word2idx = get_sentences_with_word2idx()
        # sentences, word2idx = get_text8()
    else:
        sentences, word2idx = get_wikipedia_data(n_files=1, n_vocab=2000)
    with open('w2v_word2idx.json', 'w') as f:
        json.dump(word2idx, f)

    V = len(word2idx)
    model = Model(50, V, 5)

    # use numpy
    # model.fit(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)

    # use theano
    model.fitt(sentences, learning_rate=1e-3, mu=0, epochs=5, num_neg_samples=5)