def lda(data, K, it, alpha, beta, dict_=True, verbose=True, randomness=1, PATH="", algo='cgs'): #** 1. Define Internal Parameters alpha = 0.5 beta = alpha #** 2. Random topics and dictionate data = np.asarray(data) if randomness > 0: data = addrandomcol(data, K, -1, randomness) #K if dict_: data, idx2vals, vals2idx, _ = dictionate(data) else: idx2vals = None vals2idx = None data = data.astype(float) data = data.astype(np.int) z_d = join2(data[:][:, [0, 2]]) w_z = join2(data[:][:, [2, 1]]) z_ = join2(data[:][:, [2]]) if algo == "motion": data = map(lambda row: [row[0], row[1], toDistribution(row[2], K)], data) #** 3. Inference if PATH != "": np.save(PATH + "w_z_lda" + algo + "_" + str(K) + "_0", w_z) np.save(PATH + "z_d_lda" + algo + "_" + str(K) + "_0", z_d) for i in range(it): start = time.time() if algo == "cgs": data, z_d, w_z = sampling(data, z_d, w_z, z_, alpha, beta) elif algo == "motion": data, z_d, w_z = motion(data, z_d, w_z, z_, alpha, beta) else: print "Only cgs and motion are implemented " assert (False) if PATH != "": np.save(PATH + "w_z_lda" + algo + "_" + str(K) + "_" + str(i + 1), w_z) np.save(PATH + "z_d_lda" + algo + "_" + str(K) + "_" + str(i + 1), z_d) print "Iteration", i, "took", time.time() - start return data, w_z, z_d, idx2vals, vals2idx
def slda(data, K, it, a, alpha, beta, eta, dict_=True, verbose=True, randomness=1, compressed=False, batch=0, PATH="", form='standard', algo='cgs'): #** 1. Random topics and dictionate if randomness > 0: data = addrandomcol(data, K, 3, randomness) if dict_: data, idx2vals, vals2idx, _ = dictionate(data, cols=[0, 1]) else: idx2vals = None vals2idx = None if algo == "cgs" or algo == "cgsgpu": dz = join2(data[:][:, [0, 3]]) wz = join2(data[:][:, [1, 3]]) z = join2(data[:][:, [3]]) #** TODO: UPDATE rating range from 0-5 for all methods except cool and herongpu #** 2. Inference if algo == "cgs": print "cgs ---------------------------------------------" print data[:10] afterdata, D, W = cgs(data, dz, wz, z, K, it, a, alpha, beta, eta, PATH) elif algo == "heron": print "heron ----------------------------------------------" herondata, D, W, Z = preprocessData_old(data, K, compressed) herondata, D, W, Z = fixedp(g, herondata, D, W, Z, K, a, alpha, beta, eta, PATH, maxiter=it) elif algo == "cgsgpu": print "cgs gpu ------------------------------------------" afterdata, D, W, Z = SLDACGSGPU(data, wz, dz, z, K, it, a, alpha, beta, eta, PATH) elif algo == "herongpu": print "heron gpu ----------------------------------------" if batch > 0: # and compressed data, pz, D, W, Z = preprocessData(data, K, compressed) if batch > len(data): print "Batch size=", batch, "> len(data)=", len(data) batch = len(data) from_ = list(xrange(0, len(data), batch)) to_ = from_[1:] + [from_[-1] + batch] Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C') print data for i in range(it): print "Iteration", i, "------------------------------------------" fullD = np.zeros(np.shape(D), dtype=np.float32) fullW = np.zeros(np.shape(W), dtype=np.float32) fullZ = np.zeros(np.shape(Z), dtype=np.float32) for f, t in zip(from_, to_): data_batch = data[f:t] pz_batch = pz[f:t] _, partD, partW, partZ = SLDAHERONGPU( data_batch[:, [0, 1, 2, 3]], W, D, Z, pz_batch, K, 1, a, alpha, beta, eta, PATH="") fullD += 
partD fullW += partW fullZ += partZ del _, data_batch D = fullD W = fullW Z = fullZ if PATH != "": if (i + 1) % 5 == 0: np.save( PATH + "wz_slda" + "_".join(map(str, [algo, K, alpha, beta, (i + 1)])), fullW) np.save( PATH + "dz_slda" + "_".join(map(str, [algo, K, alpha, beta, (i + 1)])), fullD) else: herondata, pz, D, W, Z = preprocessData(data, K, compressed) Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C') afterdata, D, W, Z = SLDAHERONGPU(herondata[:][:, [0, 1, 2, 3]], W, D, Z, pz, K, it, a, alpha, beta, eta, PATH) elif algo == "cool": print "cool ----------------------------------------" if batch > 0: # and compressed data, pz, D, W, Z = preprocessData(data, K, compressed) if batch > len(data): print "Batch size=", batch, "> len(data)=", len(data) batch = len(data) from_ = list(xrange(0, len(data), batch)) to_ = from_[1:] + [from_[-1] + batch] Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C') del pz for i in range(it): print "Iteration", i, "------------------------------------------" fullD = np.zeros(np.shape(D), dtype=np.float32) fullW = np.zeros(np.shape(W), dtype=np.float32) fullZ = np.zeros(np.shape(Z), dtype=np.float32) for f, t in zip(from_, to_): data_batch = data[f:t].copy() _, partD, partW, partZ = SLDACOOLGPU( data_batch[:, [0, 1, 2, 3]], W, D, Z, K, 1, a, alpha, beta, eta, PATH="") fullD += partD fullW += partW fullZ += partZ del _, data_batch D = fullD W = fullW Z = fullZ if PATH != "": if (i + 1) % 5 == 0: np.save( PATH + "wz_slda" + "_".join(map(str, [algo, K, alpha, beta, (i + 1)])), fullW) np.save( PATH + "dz_slda" + "_".join(map(str, [algo, K, alpha, beta, (i + 1)])), fullD) else: herondata, pz, D, W, Z = preprocessData_old(data, K, compressed) del pz Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C') afterdata, D, W, Z = SLDACOOLGPU(herondata[:][:, [0, 1, 2, 3]], W, D, Z, pz, K, it, a, alpha, beta, eta, PATH) else: print "Inference method not supported" assert (0) return data, D, W, idx2vals, vals2idx
def torchLDA(data, K, it, alpha, beta, dict_=True, verbose=True, randomness=1, PATH=""): #** 2. Random topics and dictionate data = np.asarray(data) if randomness > 0: data = addrandomcol(data, K, -1, randomness) #K dict_ = False if dict_: data, idx2vals, vals2idx, _ = dictionate(data) else: idx2vals = None vals2idx = None data = data.astype(np.int) data = data.copy() d_z = join2(data[:][:, [0, 2]]) w_z = join2(data[:][:, [1, 2]]) z = join2(data[:][:, [2]]) datamotion = map(lambda row: (row[0], row[1], toDistribution(row[2], K)), data) pz = np.array([row[2] for row in datamotion]) D, W, I = len(d_z), len(w_z), len(data) i_hot, w_hot, d_hot = np.eye(I), np.eye(W), np.eye(D) dz = from_numpy(d_z) wz = from_numpy(w_z) z = from_numpy(z) pz = from_numpy(pz) if PATH != "": np.save(PATH + "w_z_ldatorch_" + str(K) + "_0", w_z) np.save(PATH + "z_d_ldatorch_" + str(K) + "_0", d_z.T) lda = LDA(alpha, beta) for j in range(it): start = time.time() for idx, row in enumerate(datamotion): curr_d, curr_w, _ = row d = from_numpy(d_hot[None, curr_d]) w = from_numpy(w_hot[None, curr_w]) i = from_numpy(i_hot[None, idx]) dz, wz, pz, z = lda.forward(dz, wz, z, pz, d, w, i) if PATH != "": w_z = tonumpy(wz).T z_d = tonumpy(dz) np.save(PATH + "w_z_ldatorch_" + str(K) + "_" + str(idx + 1), w_z) np.save(PATH + "z_d_ldatorch_" + str(K) + "_" + str(idx + 1), z_d) print "Iteration", j, "took", time.time() - start return data, wz, dz, idx2vals, vals2idx
else:
    # Driver path taken when no pre-initialization is requested (the
    # matching `if` is earlier in the file): report the run configuration,
    # then dispatch to the model selected on the command line.
    print "\nRunning ", options.inference, options.model, "[" + str(options.K) + " topics] - NO initialization\nHyperparameters: alpha:", options.alpha, "beta:", options.beta, "eta:", options.eta, "a:", options.a, "\nNumber of iterations:", options.iteration, "; number of tuples in a batch is ", options.batch, "\n"
    print "The parameters are being saved at:", options.path, "\n"
    if options.model == "LDA":
        import LDA.lda as lda
        # Load the raw tuples, remap doc/word columns to contiguous ids,
        # then hold out 30% of the data as a test split.
        data = np.load(options.filename)
        data, _, _, _ = dictionate(data, cols=[0, 1])
        train, test = splitTrainTestRepeated(data, 0.7)
        it = options.iteration
        path = options.path
        #** Initialize outside the method call for fair comparison between models
        train = addrandomcol(train, options.K, -1, 1)
        # dict_=False because the data was already dictionated above.
        # NOTE(review): confirm lda.lda() accepts the batch=/compressed=
        # keywords passed here.
        data, dz, wz, idx2vals, vals2idx = lda.lda(train, options.K, options.iteration, options.alpha, options.beta, batch=options.batch, randomness=options.randomness, dict_=False, PATH=options.path, algo=options.inference, compressed=options.compression)
    elif options.model == "RTM":
        import RTM.rtm as rtm
        # RTM additionally needs the document-document link matrix, which
        # is expected to live next to the input file as dd.npy.
        path, _ = os.path.split(options.filename)
        dd = np.load(path + "/dd.npy")
        train = np.load(options.filename)
        train = addrandomcol(train, options.K, -1, 1)  # append random topic column (K topics)
        data, dz, wz, idx2vals, vals2idx = rtm.rtm(train, dd, options.K, options.iteration, options.alpha, options.beta, options.eta, randomness=options.randomness, dict_=False, batch=options.batch, PATH=options.path, algo=options.inference, compressed=options.compression)