Example 1
import numpy as np
from scipy import sparse

import bnpy

# Params and selectAnchorWords are assumed to be defined elsewhere in this module.

def findAnchorTopics_Orig(Data,
                          K=10,
                          loss='L2',
                          seed=0,
                          lowerDim=1000,
                          minDocPerWord=0,
                          eps=1e-4,
                          doRecover=1):
    """ Estimate and return K topics using anchor word method

        Returns
        -------
        topics : numpy 2D array, size K x V

    """
    from Q_matrix import generate_Q_matrix
    from fastRecover import do_recovery

    params = Params(seed=seed,
                    lowerDim=lowerDim,
                    minDocPerWord=minDocPerWord,
                    eps=eps)

    assert isinstance(Data, bnpy.data.DataObj)
    DocWordMat = Data.getSparseDocTypeCountMatrix()

    if not sparse.isspmatrix_csr(DocWordMat):
        raise NotImplementedError('Need CSR matrix')

    # Build the V x V word-word co-occurrence matrix Q from the transposed
    # (word-by-document) count matrix.
    Q = generate_Q_matrix(DocWordMat.copy().T)

    anchors = selectAnchorWords(DocWordMat.tocsc(), Q, K, params)
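    # `anchors` identifies the K selected anchor words, one per topic.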

    if doRecover:
        topics, topic_likelihoods = do_recovery(Q, anchors, loss, params)
        topics = topics.T
        topics = topics / topics.sum(axis=1)[:, np.newaxis]
        return topics
    else:
        return Q, anchors
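
# Hypothetical usage sketch (added for illustration; not part of the original
# snippet). `Data` is assumed to be any bnpy.data.DataObj whose
# getSparseDocTypeCountMatrix() returns a scipy CSR matrix, as required above.
def example_usage(Data, K=25):
    topics = findAnchorTopics_Orig(Data, K=K, loss='L2', seed=42)
    assert topics.shape[0] == K                     # one row per topic, V columns
    assert np.allclose(topics.sum(axis=1), 1.0)     # each row is a distribution over words
    return topics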
Example 2
# Row-normalize the empirical co-occurrence matrix Q_emp into Q_bar and rescale
# the per-entry variance estimates accordingly (Q_emp and var_analytical come
# from earlier in the original script).
Q_bar = np.zeros(Q_emp.shape)
var = np.zeros(Q_emp.shape)
row_sums = Q_emp.sum(1)
for i in xrange(Q_emp.shape[0]):
    Q_bar[i, :] = Q_emp[i, :] / float(row_sums[i])
    var[i, :] = var_analytical[i, :] / (float(row_sums[i]) ** 2)


# find anchors
(anchors, anchor_indices) = gs.Projection_Find(Q_bar, K, candidate_anchors, var)
anchor_indices = [int(a) for a in anchor_indices]
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[int(a)]

# recover topics
A, topic_likelihoods, R = do_recovery(Q_emp, anchors, loss, params)
print "done recovering"

np.savetxt(outfile+".A", A)
np.savetxt(outfile+".topic_likelihoods", topic_likelihoods)
np.savetxt(outfile+".R", R)
# np.savetxt(outfile+".Q", Q)

# display: for each topic, list the words with probability above 0.01, best first
f = file(outfile+".topwords", 'w')
for k in xrange(K):
    mask = A[:, k] > 0.01
    topwords = [x for x in np.argsort(A[:, k]) if mask[x]][::-1]
    # topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
    print vocab[anchor_indices[k]], ':',
    print >>f, vocab[anchor_indices[k]], ':',
    for w in topwords:
        print vocab[w],
        print >>f, vocab[w],
    print ""
    print >>f, ""
f.close()
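
# Illustration (added; not the repo's code): the anchor search above follows the
# random-projection idea these scripts mention: project the rows of the
# row-normalized Q into a low-dimensional space, then greedily pick the row
# farthest from the span of the rows chosen so far. A minimal sketch of that
# greedy step, ignoring details such as the candidate_anchors restriction and
# the per-entry variances passed to Projection_Find:
import numpy as np

def greedy_anchor_sketch(Q_bar, K, lower_dim=1000, seed=0):
    rng = np.random.RandomState(seed)
    P = Q_bar.dot(rng.randn(Q_bar.shape[1], lower_dim))      # random projection, V x lower_dim
    anchors = [int(np.argmax((P * P).sum(axis=1)))]          # farthest row from the origin
    for _ in range(1, K):
        v = P[anchors[-1]] / np.linalg.norm(P[anchors[-1]])
        P = P - np.outer(P.dot(v), v)                         # remove the last anchor's direction
        anchors.append(int(np.argmax((P * P).sum(axis=1))))  # farthest from the current span
    return anchors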
Example 3
vocab = file(vocab_file).read().strip().split()

#check that Q sum is 1 or close to it
print "Q sum is", Q.sum()
V = Q.shape[0]
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]

#recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params) 
print "done recovering"

np.savetxt(outfile+".A", A)
np.savetxt(outfile+".topic_likelihoods", topic_likelihoods)

#display
f = file(outfile+".topwords", 'w')
for k in xrange(K):
    topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
    print vocab[anchors[k]], ':',
    print >>f, vocab[anchors[k]], ':',
    for w in topwords:
        print vocab[w],
        print >>f, vocab[w],
    print ""
Example 4
def anchor_words(D, loss='L2', params=config.default_config()):
    Q = generate_Q_matrix(D * 100)
    anchors = findAnchors(Q, params['T'], params)
    W, topic_likelihoods = do_recovery(Q, anchors, loss, params)
    return W
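
# Illustration (added; not the repo's generate_Q_matrix): Q is the empirical
# word-word co-occurrence matrix. For a words-by-documents count matrix M
# (V x n_docs) it is commonly estimated as the average over documents of
# (w_d * w_d^T - diag(w_d)) / (n_d * (n_d - 1)), where w_d is the count vector
# of document d and n_d its length. (The `D * 100` above looks like a rescaling
# of fractional frequencies into count-like values; that reading is an assumption.)
import numpy as np

def generate_Q_sketch(M):
    M = np.asarray(M, dtype=float)        # dense V x n_docs counts, for illustration only
    V, n_docs = M.shape
    Q = np.zeros((V, V))
    for d in range(n_docs):
        w = M[:, d]
        n = w.sum()
        if n < 2:
            continue                      # a document with fewer than two tokens has no word pairs
        Q += (np.outer(w, w) - np.diag(w)) / (n * (n - 1.0))
    return Q / n_docs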
Example 5
    # A method of a larger class; the rest of the class is not shown.
    def run(self):
        params = self.params

        if isinstance(params.infile, str):
            M = scipy.io.loadmat(params.infile)['M']
        else:
            M = params.infile
        assert sparse.isspmatrix_csc(M), "Must provide a sparse CSC matrix"

        print("Input matrix shape: {}".format(M.shape))

        if isinstance(params.vocab_file, str):
            with open(params.vocab_file) as f:
                vocab = f.read().strip().split()
        else:
            vocab = params.vocab_file
        assert np.iterable(vocab), "Must provide an iterable vocab"

        assert M.shape[0] == len(vocab), \
            "Number of rows must correspond to vocab size: {} rows vs {} vocab words" \
            .format(M.shape[0], len(vocab))
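        # M is words-by-documents: rows index the vocabulary, columns index documents.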

        #only accept anchors that appear in a significant number of docs
        print("identifying candidate anchors")
        candidate_anchors = []
        for i in range(M.shape[0]):
            if len(np.nonzero(M[i, :])[1]) > params.anchor_thresh:
                candidate_anchors.append(i)

        print(len(candidate_anchors), "candidates")

        #forms Q matrix from document-word matrix
        Q = generate_Q_matrix(M)

        # Save copy of unnormalized Q, before any normalizations happen
        self.Q_unnormalized = Q.copy()

        #check that Q sum is 1 or close to it
        print("Q sum is", Q.sum())
        V = Q.shape[0]
        print("done reading documents")

        #find anchors- this step uses a random projection
        #into low dimensional space
        anchors = findAnchors(Q, params, candidate_anchors)
        print("anchors are:")
        for i, a in enumerate(anchors):
            print(i, vocab[a])

        #recover topics
        A, topic_likelihoods = do_recovery(Q, anchors, params)
        print("done recovering")

        output_streams = [sys.stdout]
        output_file_handle = None
        if params.outfile is not None:
            np.savetxt(params.outfile+".A", A)
            np.savetxt(params.outfile+".topic_likelihoods", topic_likelihoods)
            output_file_handle = open(params.outfile+".topwords", 'w')
            output_streams.append(output_file_handle)

        def print_multiple(*args, **kwargs):
            # Print the same info to multiple output streams
            for f in output_streams:
                print(*args, file=f, **kwargs)

        # Display top words per topic
        all_topwords = []
        for k in range(params.K):
            topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
            print_multiple(vocab[anchors[k]], ':', end=' ')
            for w in topwords:
                print_multiple(vocab[w], end=' ')
            print_multiple("")
            all_topwords.append(TopWordsSummary(
                topic_index = k,
                anchor_word_index = anchors[k],
                anchor_word = vocab[anchors[k]],
                top_word_indices = topwords,
                top_words = [vocab[w] for w in topwords]))

        if params.outfile is not None:
            output_file_handle.close()

        # make some results available as attributes of "self"
        self.Q = Q
        self.M = M
        self.A = A
        self._R = None
        self.topic_likelihoods = topic_likelihoods
        self.candidate_anchors = candidate_anchors
        self.anchors = anchors
        self.vocab = vocab
        self.all_topwords = all_topwords
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space

anchor_logfile = file(params.log_prefix + '.anchors', 'w')
anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile)
print "anchors are:"
print >> anchor_logfile, "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]
    print >> anchor_logfile, i, vocab[a]
anchor_logfile.close()

#recover topics
A, topic_likelihoods, objective = do_recovery(Q, anchors, loss, params)
print "done recovering"
print "avg objective function during recovery using", K, "topics:", objective

np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)

#display
f = file(outfile + ".topwords", 'w')
for k in xrange(K):
    topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
    print vocab[anchors[k]], ':',
    print >> f, vocab[anchors[k]], ':',
    for w in topwords:
        print vocab[w],
        print >> f, vocab[w],
    print ""
    print >> f, ""
f.close()
print "done reading documents"

#find anchors- this step uses a random projection
#into low dimensional space

anchor_logfile = file(params.log_prefix+'.anchors', 'w')
anchors = findAnchors(Q, K, params, candidate_anchors, anchor_logfile)
print "anchors are:"
print >>anchor_logfile, "anchors are:"
for i, a in enumerate(anchors):
    print i, vocab[a]
    print >>anchor_logfile, i, vocab[a]
anchor_logfile.close()

#recover topics
A, topic_likelihoods,objective = do_recovery(Q, anchors, loss, params) 
print "done recovering"
print "avg objective function during recovery using", K, "topics:", objective

np.savetxt(outfile+".A", A)
np.savetxt(outfile+".topic_likelihoods", topic_likelihoods)

#display
f = file(outfile+".topwords", 'w')
for k in xrange(K):
    topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
    print vocab[anchors[k]], ':',
    print >>f, vocab[anchors[k]], ':',
    for w in topwords:
        print vocab[w],
        print >>f, vocab[w],
Example 8
vocab = open(vocab_file).read().strip().split()

# check that Q sum is 1 or close to it
print("Q sum is", Q.sum())
V = Q.shape[0]
print("done reading documents")

# find anchors- this step uses a random projection
# into low dimensional space
anchors = findAnchors(Q, K, params, candidate_anchors)
print("anchors are:")
for i, a in enumerate(anchors):
    print(i, vocab[a])

# recover topics
A, topic_likelihoods = do_recovery(Q, anchors, loss, params)
print("done recovering")

np.savetxt(outfile + ".A", A)
np.savetxt(outfile + ".topic_likelihoods", topic_likelihoods)

# display
with open(outfile + ".topwords", "w") as f:
    for k in range(K):
        topwords = np.argsort(A[:, k])[-params.top_words:][::-1]
        print(vocab[anchors[k]], ":", end=" ")
        print(vocab[anchors[k]], ":", end=" ", file=f)
        for w in topwords:
            print(vocab[w], end=" ")
            print(vocab[w], end=" ", file=f)
        print("")