# Beispiel #1 (Example #1)
# Example 1: KOS dataset — for each trained model, extract the 20
# highest-probability words (and their vocabulary indices) per topic.
# NOTE(review): qdata_K2 and qdata_K3 are used below but not loaded in this
# excerpt — confirm they are defined earlier in the file.
qdata_K10 = np.loadtxt('data/kos_q_a1p0_b0p5_K10.dat')
ndata_K19 = np.loadtxt('data/kos_n_a1p0_b0p5_K19.dat')
qdata_K19 = np.loadtxt('data/kos_q_a1p0_b0p5_K19.dat')

# import the doc,vocab indices
kosdata = np.loadtxt('data/kosdata.dat')
doc_idx = kosdata[:, 0]
voc_idx = kosdata[:, 1]

# Context manager guarantees the file is closed even if readlines() fails.
with open('data/vocab.kos.txt', 'r') as f:
    vocabulary = f.readlines()

V = 6906  # vocabulary size of the KOS corpus

beta = 0.5 * np.ones(V)
phiK2 = ml.calc_phi(qdata_K2, beta, voc_idx, V)
phiK3 = ml.calc_phi(qdata_K3, beta, voc_idx, V)
phiK10 = ml.calc_phi(qdata_K10, beta, voc_idx, V)
phiK19 = ml.calc_phi(qdata_K19, beta, voc_idx, V)


def _top_words(phi, n=20):
    """Return (probs, indices) of the n largest entries of each row of phi,
    sorted in descending order of probability."""
    probs = np.sort(phi, axis=1)[:, -n:][:, ::-1]
    inds = np.argsort(phi, axis=1)[:, -n:][:, ::-1]
    return probs, inds


# get the top 20 words and their probabilities
topK2, topK2_ind = _top_words(phiK2)
topK3, topK3_ind = _top_words(phiK3)
topK10, topK10_ind = _top_words(phiK10)
topK19, topK19_ind = _top_words(phiK19)

# now print the top words and their probabilities to file
# Beispiel #2 (Example #2)
# Example 2: classic400 dataset — compare runs with beta = 0.1 / 1.0 / 10.0
# (K = 3 topics) by inspecting the per-topic maximum word probabilities.
ndata_b1p0 = np.loadtxt("data/c400_n_a1p0_b1p0_K3.dat")
qdata_b1p0 = np.loadtxt("data/c400_q_a1p0_b1p0_K3.dat")
ndata_b10p0 = np.loadtxt("data/c400_n_a1p0_b10p0_K3.dat")
qdata_b10p0 = np.loadtxt("data/c400_q_a1p0_b10p0_K3.dat")

# import doc,vocab indices from the nonzero entries of the doc-term matrix
c400data = scipy.io.loadmat("data/classic400.mat")
doc_idx, voc_idx = c400data["classic400"].nonzero()

K = 3     # number of topics
V = 6205  # vocabulary size of the classic400 corpus

beta0p1 = 0.1 * np.ones(V)
beta1p0 = 1.0 * np.ones(V)
beta10p0 = 10.0 * np.ones(V)
# NOTE(review): qdata_b0p1 is never loaded in this excerpt — confirm it is
# defined earlier in the file, otherwise this raises NameError.
phi0p1 = ml.calc_phi(qdata_b0p1, beta0p1, voc_idx, V)
phi1p0 = ml.calc_phi(qdata_b1p0, beta1p0, voc_idx, V)
phi10p0 = ml.calc_phi(qdata_b10p0, beta10p0, voc_idx, V)

# get the top 10 word probabilities of topic 0, descending
# print() instead of a Python 2 print statement: with a single argument this
# behaves identically under Python 2 and is required under Python 3.
top1 = np.sort(phi0p1[0])[-10:][::-1]
print(top1)

# find the maximum word probability of each topic
phimax0p1 = np.max(phi0p1, axis=1)
phimax1p0 = np.max(phi1p0, axis=1)
phimax10p0 = np.max(phi10p0, axis=1)

print(phimax0p1)
print(phimax1p0)
print(phimax10p0)
# Beispiel #3 (Example #3)
# Example 3: classic400 dataset — top-20 words per topic for five runs that
# used different values of the beta hyperparameter.
c400data = scipy.io.loadmat('data/classic400.mat')
doc_idx, voc_idx = c400data['classic400'].nonzero()

# import vocabulary: each MATLAB cell wraps the word in a 1x1 nested array,
# hence the double indexing entry[0][0]
vocabulary = [entry[0][0] for entry in c400data['classicwordlist']]

K = 3     # number of topics
V = 6205  # vocabulary size of the classic400 corpus

beta1 = 10000.0 * np.ones(V)
beta2 = 0.1 * np.ones(V)
beta3 = 0.1 * np.ones(V)
beta4 = 0.1 * np.ones(V)
beta5 = 100.0 * np.ones(V)
# NOTE(review): qdata_1..qdata_5 are not loaded in this excerpt — confirm
# they are defined earlier in the file.
phi1 = ml.calc_phi(qdata_1, beta1, voc_idx, V)
phi2 = ml.calc_phi(qdata_2, beta2, voc_idx, V)
phi3 = ml.calc_phi(qdata_3, beta3, voc_idx, V)
phi4 = ml.calc_phi(qdata_4, beta4, voc_idx, V)
phi5 = ml.calc_phi(qdata_5, beta5, voc_idx, V)

# get the top 20 words: probabilities (np.sort) and the matching vocabulary
# indices (np.argsort), both reversed into descending order
top1 = np.sort(phi1, axis=1)[:, -20:][:, ::-1]
top1_ind = np.argsort(phi1, axis=1)[:, -20:][:, ::-1]
top2 = np.sort(phi2, axis=1)[:, -20:][:, ::-1]
top2_ind = np.argsort(phi2, axis=1)[:, -20:][:, ::-1]
top3 = np.sort(phi3, axis=1)[:, -20:][:, ::-1]
top3_ind = np.argsort(phi3, axis=1)[:, -20:][:, ::-1]
top4 = np.sort(phi4, axis=1)[:, -20:][:, ::-1]
top4_ind = np.argsort(phi4, axis=1)[:, -20:][:, ::-1]
top5 = np.sort(phi5, axis=1)[:, -20:][:, ::-1]