Beispiel #1
0
def lda(data,
        K,
        it,
        alpha,
        beta,
        dict_=True,
        verbose=True,
        randomness=1,
        PATH="",
        algo='cgs'):
    """Run `it` iterations of LDA inference over `data`.

    Parameters:
        data       -- array-like of rows; converted with np.asarray.
                      Columns 0, 1 and 2 are the ones read below
                      (presumably document, word and topic -- TODO confirm).
        K          -- number of topics; forwarded to addrandomcol /
                      toDistribution and used in the checkpoint file names.
        it         -- number of inference iterations.
        alpha,beta -- accepted for interface compatibility; see the NOTE in
                      step 1, they are currently overridden.
        dict_      -- when true, `data` is passed through dictionate() and the
                      resulting index/value mappings are returned.
        verbose    -- currently unused.
        randomness -- when > 0, addrandomcol() is applied to `data` first.
        PATH       -- file-name prefix for np.save checkpoints; "" disables
                      saving.
        algo       -- "cgs" (uses sampling) or "motion" (uses motion).

    Returns:
        (data, w_z, z_d, idx2vals, vals2idx); the two mappings are None when
        `dict_` is false.

    Raises:
        ValueError -- if `algo` is neither "cgs" nor "motion".
    """
    # Fail fast: previously an unknown algorithm was only noticed inside the
    # loop (after preprocessing and the first checkpoint) through an assert,
    # which is stripped under -O and never reached when it == 0.
    if algo not in ("cgs", "motion"):
        raise ValueError(
            "Only cgs and motion are implemented, got %r" % (algo,))

    #** 1. Define Internal Parameters
    # NOTE(review): the caller-supplied alpha/beta are discarded here and both
    # hyperparameters are fixed to 0.5. Kept as-is to preserve results --
    # confirm whether this override is intentional.
    alpha = 0.5
    beta = alpha

    #** 2. Random topics and dictionate
    data = np.asarray(data)
    if randomness > 0:
        data = addrandomcol(data, K, -1, randomness)  #K

    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data)
    else:
        idx2vals = None
        vals2idx = None

    # Go through float first, then to the builtin int (np.int was only an
    # alias of it and no longer exists in current NumPy releases).
    data = data.astype(float).astype(int)

    z_d = join2(data[:, [0, 2]])
    w_z = join2(data[:, [2, 1]])
    z_ = join2(data[:, [2]])

    if algo == "motion":
        # A list comprehension yields a list on both Python 2 and 3.
        data = [[row[0], row[1], toDistribution(row[2], K)] for row in data]

    #** 3. Inference
    saving = PATH != ""
    w_z_prefix = PATH + "w_z_lda" + algo + "_" + str(K) + "_"
    z_d_prefix = PATH + "z_d_lda" + algo + "_" + str(K) + "_"

    if saving:
        # Checkpoint 0 is the state before any inference step.
        np.save(w_z_prefix + "0", w_z)
        np.save(z_d_prefix + "0", z_d)

    # Only the selected name is evaluated, so the other routine does not need
    # to be importable.
    step = sampling if algo == "cgs" else motion

    for i in range(it):
        start = time.time()
        data, z_d, w_z = step(data, z_d, w_z, z_, alpha, beta)

        if saving:
            np.save(w_z_prefix + str(i + 1), w_z)
            np.save(z_d_prefix + str(i + 1), z_d)
        print("Iteration %s took %s" % (i, time.time() - start))

    return data, w_z, z_d, idx2vals, vals2idx
Beispiel #2
0
    def __init__(self, data, alpha, beta):
        """Encode `data` with dictionate() and set up empty model state.

        `alpha` and `beta` are stored unchanged. All running counters start
        at zero and the count matrices start with zero topic columns.
        """
        #** Preprocess the data
        encoded, index_to_value, _value_to_index, tallies = dictionate(data)
        self.data = encoded  # dictionated data
        self.counts = tallies

        # Vocabulary sizes, taken from the first two index->value mappings.
        self.V = len(index_to_value[0])  # total observed variables in V
        self.W = len(index_to_value[1])  # total observed variables in W

        # Hyperparameters
        self.alpha = alpha
        self.beta = beta

        # Global parameters: nothing has been observed yet.
        self.K = 0  # current number of existing K
        self.currV = 0  # current number of observed variables in V
        self.currW = 0  # current number of observed variables in W
        self.Vs = set()  # set of Vs
        self.Ws = set()  # set of Ws
        self.sum_N = 0

        # Count tables; K is 0 here, so each has zero topic columns.
        self.nvk_ = np.zeros((self.V, self.K))
        self.n_kw = np.zeros((self.W, self.K))
        self.n_k_ = np.zeros(self.K)

        self.P_new = self.alpha
Beispiel #3
0
def _slda_batch_bounds(n_rows, batch):
    """Return the (start, stop) row ranges splitting n_rows rows into batches.

    The last stop may exceed n_rows; slicing clips it.
    """
    return [(lo, lo + batch) for lo in range(0, n_rows, batch)]


def _slda_checkpoint(PATH, algo, K, alpha, beta, iteration, W, D):
    """Save W and D every fifth iteration; do nothing when PATH is empty."""
    if PATH == "" or iteration % 5 != 0:
        return
    tag = "_".join(map(str, [algo, K, alpha, beta, iteration]))
    np.save(PATH + "wz_slda" + tag, W)
    np.save(PATH + "dz_slda" + tag, D)


def _slda_batched(run_batch, n_rows, D, W, Z, batch, it, PATH, algo, K, alpha,
                  beta):
    """Run `it` passes over all batches, summing the per-batch D/W/Z results.

    `run_batch(lo, hi, W, D, Z)` must return (partD, partW, partZ) for the
    rows lo:hi. Within one pass every batch sees the D/W/Z of the previous
    pass; the summed results replace them only once the pass is complete.
    """
    bounds = _slda_batch_bounds(n_rows, batch)

    for i in range(it):
        print("Iteration %s ------------------------------------------" % i)

        fullD = np.zeros(np.shape(D), dtype=np.float32)
        fullW = np.zeros(np.shape(W), dtype=np.float32)
        fullZ = np.zeros(np.shape(Z), dtype=np.float32)

        for lo, hi in bounds:
            partD, partW, partZ = run_batch(lo, hi, W, D, Z)
            fullD += partD
            fullW += partW
            fullZ += partZ

        D, W, Z = fullD, fullW, fullZ
        _slda_checkpoint(PATH, algo, K, alpha, beta, i + 1, fullW, fullD)

    return D, W, Z


def slda(data,
         K,
         it,
         a,
         alpha,
         beta,
         eta,
         dict_=True,
         verbose=True,
         randomness=1,
         compressed=False,
         batch=0,
         PATH="",
         form='standard',
         algo='cgs'):
    """Run supervised-LDA inference with the selected algorithm.

    Parameters:
        data       -- 2-D array of rows. Columns 0, 1 and 3 are read directly
                      here; the GPU back ends receive columns 0-3.
        K          -- number of topics.
        it         -- number of iterations.
        a, alpha, beta, eta -- hyperparameters, forwarded untouched to the
                      back end.
        dict_      -- when true, columns 0 and 1 are dictionated and the
                      resulting mappings are returned.
        verbose    -- currently unused.
        randomness -- when > 0, addrandomcol() is applied first.
        compressed -- forwarded to preprocessData / preprocessData_old.
        batch      -- rows per batch for "herongpu" and "cool"; 0 processes
                      everything in one call.
        PATH       -- output prefix for saved parameters; "" disables saving.
        form       -- currently unused.
        algo       -- one of "cgs", "heron", "cgsgpu", "herongpu", "cool".

    Returns:
        (data, D, W, idx2vals, vals2idx). `data` is the input after the
        optional random column / dictionate steps; in the batched modes it is
        the output of preprocessData. The mappings are None when `dict_` is
        false.

    Raises:
        ValueError -- if `algo` is not one of the supported names.
    """
    # Reject unknown algorithms before doing any preprocessing; the previous
    # assert ran late and is removed entirely under -O.
    if algo not in ("cgs", "heron", "cgsgpu", "herongpu", "cool"):
        raise ValueError("Inference method not supported: %r" % (algo,))

    #** 1. Random topics and dictionate
    if randomness > 0:
        data = addrandomcol(data, K, 3, randomness)

    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data, cols=[0, 1])
    else:
        idx2vals = None
        vals2idx = None

    if algo in ("cgs", "cgsgpu"):
        dz = join2(data[:, [0, 3]])
        wz = join2(data[:, [1, 3]])
        z = join2(data[:, [3]])

    #** TODO: UPDATE rating range from 0-5 for all methods except cool and herongpu

    #** 2. Inference
    if algo == "cgs":
        print("cgs ---------------------------------------------")
        print(data[:10])

        _, D, W = cgs(data, dz, wz, z, K, it, a, alpha, beta, eta, PATH)

    elif algo == "heron":
        print("heron ----------------------------------------------")
        herondata, D, W, Z = preprocessData_old(data, K, compressed)
        herondata, D, W, Z = fixedp(g,
                                    herondata,
                                    D,
                                    W,
                                    Z,
                                    K,
                                    a,
                                    alpha,
                                    beta,
                                    eta,
                                    PATH,
                                    maxiter=it)

    elif algo == "cgsgpu":
        print("cgs gpu ------------------------------------------")
        _, D, W, Z = SLDACGSGPU(data, wz, dz, z, K, it, a, alpha, beta, eta,
                                PATH)

    elif algo == "herongpu":
        print("heron gpu ----------------------------------------")

        if batch > 0:  # and compressed
            data, pz, D, W, Z = preprocessData(data, K, compressed)

            if batch > len(data):
                print("Batch size= %s > len(data)= %s" % (batch, len(data)))
                batch = len(data)

            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')

            print(data)

            def run_heron_batch(lo, hi, W_cur, D_cur, Z_cur):
                # One single-iteration call on rows lo:hi; nothing is saved
                # per batch (PATH="").
                _, partD, partW, partZ = SLDAHERONGPU(
                    data[lo:hi][:, [0, 1, 2, 3]],
                    W_cur,
                    D_cur,
                    Z_cur,
                    pz[lo:hi],
                    K,
                    1,
                    a,
                    alpha,
                    beta,
                    eta,
                    PATH="")
                return partD, partW, partZ

            D, W, Z = _slda_batched(run_heron_batch, len(data), D, W, Z,
                                    batch, it, PATH, algo, K, alpha, beta)

        else:
            herondata, pz, D, W, Z = preprocessData(data, K, compressed)
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            _, D, W, Z = SLDAHERONGPU(herondata[:, [0, 1, 2, 3]], W, D, Z, pz,
                                      K, it, a, alpha, beta, eta, PATH)

    else:  # algo == "cool"
        print("cool ----------------------------------------")

        if batch > 0:  # and compressed
            # The second return value is not needed by this back end.
            # (It is not bound to `pz` and deleted, because Python 2 refuses
            # to `del` a name that a nested function refers to.)
            data, _, D, W, Z = preprocessData(data, K, compressed)

            if batch > len(data):
                print("Batch size= %s > len(data)= %s" % (batch, len(data)))
                batch = len(data)

            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')

            def run_cool_batch(lo, hi, W_cur, D_cur, Z_cur):
                # One single-iteration call on a copy of rows lo:hi; nothing
                # is saved per batch (PATH="").
                _, partD, partW, partZ = SLDACOOLGPU(
                    data[lo:hi].copy()[:, [0, 1, 2, 3]],
                    W_cur,
                    D_cur,
                    Z_cur,
                    K,
                    1,
                    a,
                    alpha,
                    beta,
                    eta,
                    PATH="")
                return partD, partW, partZ

            D, W, Z = _slda_batched(run_cool_batch, len(data), D, W, Z, batch,
                                    it, PATH, algo, K, alpha, beta)

        else:
            # NOTE(review): five values are unpacked from preprocessData_old
            # here while the "heron" branch unpacks four from the same
            # function -- confirm which helper this branch should call.
            herondata, _, D, W, Z = preprocessData_old(data, K, compressed)
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            # The earlier code deleted `pz` and then passed it to this call,
            # which always raised. The argument list now matches the batched
            # call above, which takes no pz.
            _, D, W, Z = SLDACOOLGPU(herondata[:, [0, 1, 2, 3]], W, D, Z, K,
                                     it, a, alpha, beta, eta, PATH)

    return data, D, W, idx2vals, vals2idx
Beispiel #4
0
if (options.model=="cgs" and options.batch>0) or (options.model=="heron" and options.batch>0):
	print "CGS, neither heron support batches please try cool or herongpu."

#**3. Run Inference Algorithm
if options.randomness:
	print "\nRunning ",options.inference,options.model,"["+str(options.K)+" topics] - Uniformly initialized\nHyperparameters: alpha:",options.alpha,"beta:",options.beta,"eta:",options.eta,"a:",options.a,"\nNumber of iterations:",options.iteration,"; number of tuples in a batch is ",options.batch,"\n"
else:
	print "\nRunning ",options.inference,options.model,"["+str(options.K)+" topics] - NO initialization\nHyperparameters: alpha:",options.alpha,"beta:",options.beta,"eta:",options.eta,"a:",options.a,"\nNumber of iterations:",options.iteration,"; number of tuples in a batch is ",options.batch,"\n"

print "The parameters are being saved at:",options.path,"\n"
	
if options.model == "LDA":
	import LDA.lda as lda
	data=np.load(options.filename)
	data,_,_,_=dictionate(data,cols=[0,1])
	train,test=splitTrainTestRepeated(data,0.7)		
	
	it=options.iteration
	path=options.path
	
	#** Initialize outside the method call for fair comparison between models 
	train=addrandomcol(train,options.K,-1,1)

	data,dz,wz,idx2vals,vals2idx=lda.lda(train,options.K,options.iteration,options.alpha,options.beta,batch=options.batch,randomness=options.randomness,dict_=False,PATH=options.path,algo=options.inference,compressed=options.compression)

elif options.model == "RTM":
	import RTM.rtm as rtm

	path,_=os.path.split(options.filename)
	dd=np.load(path+"/dd.npy")
Beispiel #5
0
def dt2b(dt2bdata,Ku,Kp,n,alpha,beta_row,beta_column,it,verbose=True):
	"""Run dt2b inference on two-column data and return the count tables.

	Parameters:
		dt2bdata -- array-like with exactly two columns (row item, column item).
		Ku, Kp   -- topic counts passed to addrandomtopic for the two topic
		            columns that are appended to the data.
		n        -- how many top entries to print per topic when verbose.
		alpha, beta_row, beta_column -- accepted but not used in this function.
		it       -- iteration count passed to dt2b_c.inference.
		verbose  -- when true, print the topics after inference.

	Returns:
		(columns_w_z, rows_w_z, joint, idx2vals, vals2idx), where the first
		three are join2() of column pairs [3,1], [2,0] and [2,3] of the
		inferred data.

	Raises:
		ValueError -- if the input does not have exactly two columns.
	"""

	def init(data,Ku,Kp):
		# Build a 4-column string table: the two input columns followed by
		# two topic columns filled in by addrandomtopic.
		data=np.array(data)
		if data.shape[1]!=2:
			# Explicit check instead of assert, which is stripped under -O.
			raise ValueError("dt2b expects data with exactly 2 columns, got %s" % data.shape[1])

		# NOTE: '|S20' stores at most 20 bytes per cell.
		datadt2b=np.zeros((len(data),4),dtype='|S20')
		for row,source in zip(datadt2b,data):
			row[0]=source[0]
			row[1]=source[1]

		datadt2b=addrandomtopic(datadt2b,Ku,-2)
		datadt2b=addrandomtopic(datadt2b,Kp,-1)

		return datadt2b

	#--printAxisTopics--------------------------------------------
	# Print the top 't' entries of every topic.
	# p_z    = per-topic evidence rows
	# t      = number of top entries to print
	# labels = index -> original value mapping for the printed axis
	#-------------------------------------------------------------
	def printAxisTopics(p_z,t,labels):
		for idx,z in enumerate(p_z):
			print("Topic %s - evidence" % idx)
			# Highest evidence first, paired with the matching sorted values.
			for topic,evidence in zip(np.argsort(z)[::-1][:t],np.sort(z)[::-1][:t]):
				print("%s %s" % (labels[topic],int(evidence)))
			print("")

	def printTopics(mdata,verbose):
		# Works on the data it is given instead of reaching for the
		# enclosing function's variable. `verbose` is unused here.
		print("Row Topics\n")
		printAxisTopics(join2(mdata[:,[2,0]]),n,idx2vals[0])
		print("------------------------------")

		print("Column Topics\n")
		printAxisTopics(join2(mdata[:,[3,1]]),n,idx2vals[1])
		print("------------------------------")

		print("Topic Interrelation\n Evidence of the relationship between the row-topics and column-topics\n")
		print(join2(mdata[:,[2,3]]).astype(int))

		print("------------------------------")

	#-----------------*
	#                 *
	#      MAIN       *
	#                 *
	#-----------------*

	dt2bdata=np.asarray(dt2bdata)
	dt2bdata=init(dt2bdata,Ku,Kp)

	print("Processing the data ...")
	# NOTE(review): three values are unpacked from dictionate here, while other
	# call sites shown alongside this function unpack four -- confirm which
	# dictionate version this code is paired with.
	dt2bdata,idx2vals,vals2idx=dictionate(dt2bdata)
	# Builtin int: np.int was only an alias and is gone from current NumPy.
	dt2bdata=dt2bdata.astype(int)

	print("Running the inference process ...")

	start=time.time()

	processed_data = dt2b_c.inference(dt2bdata,it)

	print("Inference Took: %s seconds" % (time.time()-start))

	if verbose:
		printTopics(processed_data,verbose)

	columns_w_z=join2(processed_data[:,[3,1]])
	rows_w_z=join2(processed_data[:,[2,0]])
	joint=join2(processed_data[:,[2,3]])

	return columns_w_z,rows_w_z,joint,idx2vals,vals2idx