Beispiel #1
0
def lda(data,
        K,
        it,
        alpha,
        beta,
        dict_=True,
        verbose=True,
        randomness=1,
        PATH="",
        algo='cgs'):
    """Run ``it`` sweeps of LDA inference over ``data``.

    Rows of ``data`` are read by column position: columns (0, 2), (2, 1) and
    (2,) are passed to ``join2`` to build ``z_d``, ``w_z`` and ``z_``.
    Judging by those names the columns are (document, word, topic) --
    confirm against ``join2`` and the callers.

    Args:
        data: array-like of rows; converted with ``np.asarray``.
        K: number of topics; forwarded to ``addrandomcol`` and
            ``toDistribution`` and embedded in checkpoint file names.
        it: number of sweeps to run.
        alpha: hyperparameter forwarded to the sampler. ``None`` selects
            the former hard-coded value 0.5.
        beta: hyperparameter forwarded to the sampler. ``None`` selects
            the value of ``alpha``.
        dict_: when true, ``data`` is passed through ``dictionate`` and the
            two mappings it returns are handed back to the caller.
        verbose: when true, print the wall-clock time of every sweep.
        randomness: when > 0, ``addrandomcol(data, K, -1, randomness)`` is
            applied before anything else.
        PATH: string prefix for ``np.save`` checkpoints (plain string
            concatenation, not a path join); ``""`` disables saving.
        algo: ``"cgs"`` (uses ``sampling``) or ``"motion"`` (uses ``motion``).

    Returns:
        Tuple ``(data, w_z, z_d, idx2vals, vals2idx)``. The two mappings are
        ``None`` when ``dict_`` is false. For ``algo="motion"`` ``data`` is a
        list of ``[col0, col1, toDistribution(col2, K)]`` rows.

    Raises:
        ValueError: if ``algo`` is neither ``"cgs"`` nor ``"motion"``.
    """
    # Fail fast: the original only noticed an unsupported algorithm inside
    # the sweep loop (after checkpoints had been written, and never when
    # it == 0), and did so with an assert that -O strips.
    if algo not in ("cgs", "motion"):
        raise ValueError(
            "Only cgs and motion are implemented (got %r)" % (algo,))

    #** 1. Hyperparameters
    # The arguments used to be overwritten unconditionally with 0.5, which
    # silently discarded whatever the caller passed. Explicit values are
    # honoured now; None keeps the old constants available.
    if alpha is None:
        alpha = 0.5
    if beta is None:
        beta = alpha

    #** 2. Random topics and dictionate
    data = np.asarray(data)
    if randomness > 0:
        data = addrandomcol(data, K, -1, randomness)  #K

    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data)
    else:
        idx2vals = None
        vals2idx = None

    # Two-step cast kept from the original (float first, then integer);
    # presumably so that float-like entries can be converted -- confirm.
    # np.int was only an alias of the builtin int and no longer exists in
    # current NumPy releases.
    data = data.astype(float).astype(int)

    z_d = join2(data[:, [0, 2]])
    w_z = join2(data[:, [2, 1]])
    z_ = join2(data[:, [2]])

    if algo == "motion":
        # A real list (not a lazy map object) so the rows survive repeated
        # iteration on Python 3 as well.
        data = [[row[0], row[1], toDistribution(row[2], K)] for row in data]

    #** 3. Inference
    saving = PATH != ""
    w_z_prefix = PATH + "w_z_lda" + algo + "_" + str(K) + "_"
    z_d_prefix = PATH + "z_d_lda" + algo + "_" + str(K) + "_"

    if saving:
        # Checkpoint 0 holds the state before the first sweep.
        np.save(w_z_prefix + "0", w_z)
        np.save(z_d_prefix + "0", z_d)

    step = sampling if algo == "cgs" else motion
    for i in range(it):
        start = time.time()
        data, z_d, w_z = step(data, z_d, w_z, z_, alpha, beta)

        if saving:
            np.save(w_z_prefix + str(i + 1), w_z)
            np.save(z_d_prefix + str(i + 1), z_d)
        if verbose:
            print("Iteration %s took %s" % (i, time.time() - start))

    return data, w_z, z_d, idx2vals, vals2idx
Beispiel #2
0
def _slda_batch_slices(n_rows, batch):
    """Return the ``(start, stop)`` row ranges that cut ``n_rows`` into batches."""
    if n_rows == 0:
        raise ValueError("cannot run batched inference on empty data")
    if batch > n_rows:
        print("Batch size= %s > len(data)= %s" % (batch, n_rows))
        batch = n_rows
    starts = list(range(0, n_rows, batch))
    # The last stop may exceed n_rows; slicing clamps it.
    stops = starts[1:] + [starts[-1] + batch]
    return list(zip(starts, stops))


def _slda_batched_passes(slices, D, W, Z, it, run_batch, tag, PATH, verbose):
    """Run ``it`` passes, summing the per-batch D/W/Z returned by ``run_batch``."""
    for i in range(it):
        if verbose:
            print("Iteration %s ------------------------------------------"
                  % i)

        # Every pass accumulates into fresh float32 zero arrays.
        fullD = np.zeros(np.shape(D), dtype=np.float32)
        fullW = np.zeros(np.shape(W), dtype=np.float32)
        fullZ = np.zeros(np.shape(Z), dtype=np.float32)

        for lo, hi in slices:
            # All batches of one pass see the D/W/Z of the previous pass.
            partD, partW, partZ = run_batch(lo, hi, W, D, Z)
            fullD += partD
            fullW += partW
            fullZ += partZ

        D, W, Z = fullD, fullW, fullZ

        # Checkpoint every fifth pass only.
        if PATH != "" and (i + 1) % 5 == 0:
            suffix = tag + "_" + str(i + 1)
            np.save(PATH + "wz_slda" + suffix, W)
            np.save(PATH + "dz_slda" + suffix, D)

    return D, W, Z


def slda(data,
         K,
         it,
         a,
         alpha,
         beta,
         eta,
         dict_=True,
         verbose=True,
         randomness=1,
         compressed=False,
         batch=0,
         PATH="",
         form='standard',
         algo='cgs'):
    """Run supervised-LDA inference with the back end selected by ``algo``.

    Args:
        data: 2-D array of rows; indexed by column position
            (``data[:, [0, 3]]`` etc.), so it must already be an ndarray.
        K: number of topics, forwarded to every back end.
        it: number of iterations / passes.
        a, alpha, beta, eta: hyperparameters forwarded unchanged to the
            selected back end.
        dict_: when true, ``dictionate(data, cols=[0, 1])`` is applied and
            its two mappings are returned.
        verbose: when true, print the banner, progress and debug output.
        randomness: when > 0, ``addrandomcol(data, K, 3, randomness)`` is
            applied first.
        compressed: forwarded to ``preprocessData`` / ``preprocessData_old``.
        batch: when > 0 and ``algo`` is ``"herongpu"`` or ``"cool"``, rows
            are processed in slices of this size, one back-end iteration
            per slice per pass.
        PATH: string prefix for saved arrays; ``""`` disables saving.
        form: accepted for interface compatibility; not used here.
        algo: one of ``"cgs"``, ``"heron"``, ``"cgsgpu"``, ``"herongpu"``,
            ``"cool"``.

    Returns:
        Tuple ``(data, D, W, idx2vals, vals2idx)``. In the batched modes
        ``data`` is the array returned by ``preprocessData``; otherwise it
        is the (possibly dictionated) input. The mappings are ``None`` when
        ``dict_`` is false.

    Raises:
        ValueError: if ``algo`` is not supported, or if a batched mode is
            requested on empty data.
    """
    # Reject unknown back ends before doing any preprocessing (the original
    # printed a message and hit assert(0), which -O removes).
    if algo not in ("cgs", "heron", "cgsgpu", "herongpu", "cool"):
        raise ValueError("Inference method not supported: %r" % (algo,))

    #** 1. Random topics and dictionate
    if randomness > 0:
        data = addrandomcol(data, K, 3, randomness)

    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data, cols=[0, 1])
    else:
        idx2vals = None
        vals2idx = None

    if algo == "cgs" or algo == "cgsgpu":
        dz = join2(data[:, [0, 3]])
        wz = join2(data[:, [1, 3]])
        z = join2(data[:, [3]])

    #** TODO: UPDATE rating range from 0-5 for all methods except cool and herongpu

    # Shared part of the checkpoint file names used by the batched modes.
    tag = "_".join(map(str, [algo, K, alpha, beta]))

    #** 2. Inference
    if algo == "cgs":
        if verbose:
            print("cgs ---------------------------------------------")
            print(data[:10])

        _, D, W = cgs(data, dz, wz, z, K, it, a, alpha, beta, eta, PATH)

    elif algo == "heron":
        if verbose:
            print("heron ----------------------------------------------")
        herondata, D, W, Z = preprocessData_old(data, K, compressed)
        # `g` is resolved from the enclosing module scope.
        herondata, D, W, Z = fixedp(g,
                                    herondata,
                                    D,
                                    W,
                                    Z,
                                    K,
                                    a,
                                    alpha,
                                    beta,
                                    eta,
                                    PATH,
                                    maxiter=it)

    elif algo == "cgsgpu":
        if verbose:
            print("cgs gpu ------------------------------------------")
        # NOTE(review): wz comes before dz here, unlike the cgs() call --
        # confirm against SLDACGSGPU's signature.
        _, D, W, Z = SLDACGSGPU(data, wz, dz, z, K, it, a, alpha, beta, eta,
                                PATH)

    elif algo == "herongpu":
        if verbose:
            print("heron gpu ----------------------------------------")

        if batch > 0:  # and compressed
            data, pz, D, W, Z = preprocessData(data, K, compressed)
            slices = _slda_batch_slices(len(data), batch)
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')

            if verbose:
                print(data)

            def run_heron_batch(lo, hi, W_, D_, Z_):
                # One iteration on rows lo:hi; saving is disabled per batch.
                _, partD, partW, partZ = SLDAHERONGPU(
                    data[lo:hi][:, [0, 1, 2, 3]],
                    W_,
                    D_,
                    Z_,
                    pz[lo:hi],
                    K,
                    1,
                    a,
                    alpha,
                    beta,
                    eta,
                    PATH="")
                return partD, partW, partZ

            D, W, Z = _slda_batched_passes(slices, D, W, Z, it,
                                           run_heron_batch, tag, PATH,
                                           verbose)

        else:
            herondata, pz, D, W, Z = preprocessData(data, K, compressed)
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            _, D, W, Z = SLDAHERONGPU(herondata[:, [0, 1, 2, 3]], W, D, Z, pz,
                                      K, it, a, alpha, beta, eta, PATH)

    elif algo == "cool":
        if verbose:
            print("cool ----------------------------------------")

        if batch > 0:  # and compressed
            data, pz, D, W, Z = preprocessData(data, K, compressed)
            slices = _slda_batch_slices(len(data), batch)
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            del pz  # not needed by SLDACOOLGPU; release it before the loop

            def run_cool_batch(lo, hi, W_, D_, Z_):
                # One iteration on a private copy of rows lo:hi.
                _, partD, partW, partZ = SLDACOOLGPU(
                    data[lo:hi].copy()[:, [0, 1, 2, 3]],
                    W_,
                    D_,
                    Z_,
                    K,
                    1,
                    a,
                    alpha,
                    beta,
                    eta,
                    PATH="")
                return partD, partW, partZ

            D, W, Z = _slda_batched_passes(slices, D, W, Z, it,
                                           run_cool_batch, tag, PATH, verbose)

        else:
            # Fixed: this branch used to delete `pz` and then pass it to
            # SLDACOOLGPU (NameError). The call now has the same argument
            # list as the batched call above, which takes no pz.
            # NOTE(review): the original unpacked five values from
            # preprocessData_old, but the "heron" branch unpacks four from
            # that same function; preprocessData is used here because it is
            # the five-value variant everywhere else -- confirm.
            herondata, pz, D, W, Z = preprocessData(data, K, compressed)
            del pz
            Z = np.array(Z[:, np.newaxis], dtype=np.float32, order='C')
            _, D, W, Z = SLDACOOLGPU(herondata[:, [0, 1, 2, 3]], W, D, Z, K,
                                     it, a, alpha, beta, eta, PATH)

    return data, D, W, idx2vals, vals2idx
Beispiel #3
0
def _one_hot_row(size, index):
    """Return a ``(1, size)`` float array that is 1.0 at ``index``, else 0.0."""
    row = np.zeros((1, size))
    row[0, index] = 1.0
    return row


def torchLDA(data,
             K,
             it,
             alpha,
             beta,
             dict_=True,
             verbose=True,
             randomness=1,
             PATH=""):
    """Run ``it`` sweeps of the ``LDA`` module over every row of ``data``.

    Each row is turned into three one-hot row vectors (for its column-0
    value, its column-1 value and its position in ``data``) and fed to
    ``LDA(alpha, beta).forward`` together with the running state.

    Args:
        data: array-like of rows; converted with ``np.asarray`` and cast to
            integers. Columns 0, 1 and 2 are used.
        K: number of topics; forwarded to ``addrandomcol`` and
            ``toDistribution`` and embedded in checkpoint names.
        it: number of sweeps over the data.
        alpha, beta: passed to the ``LDA`` constructor.
        dict_: currently ignored (forced to ``False`` below).
        verbose: when true, print the wall-clock time of every sweep.
        randomness: when > 0, ``addrandomcol(data, K, -1, randomness)`` is
            applied first.
        PATH: string prefix for ``np.save`` checkpoints; ``""`` disables
            saving.

    Returns:
        Tuple ``(data, wz, dz, idx2vals, vals2idx)`` where ``wz`` and ``dz``
        are whatever ``LDA.forward`` last returned and the two mappings are
        ``None``.
    """
    #** 2. Random topics and dictionate
    data = np.asarray(data)
    if randomness > 0:
        data = addrandomcol(data, K, -1, randomness)  #K

    # NOTE(review): the argument is overridden, so dictionate() below is
    # never reached. Left as is because enabling it would change results --
    # confirm whether this is intentional.
    dict_ = False
    if dict_:
        data, idx2vals, vals2idx, _ = dictionate(data)
    else:
        idx2vals = None
        vals2idx = None

    # astype already returns a new array, so no extra copy is needed.
    # np.int was an alias of the builtin int and is gone from current NumPy.
    data = data.astype(int)

    d_z = join2(data[:, [0, 2]])
    w_z = join2(data[:, [1, 2]])
    z = join2(data[:, [2]])

    # Must be a real list: it is iterated once here for pz and again in
    # every sweep, which a lazy Python 3 map object would not survive.
    datamotion = [(row[0], row[1], toDistribution(row[2], K)) for row in data]
    pz = np.array([row[2] for row in datamotion])

    D, W, I = len(d_z), len(w_z), len(data)

    dz = from_numpy(d_z)
    wz = from_numpy(w_z)
    z = from_numpy(z)
    pz = from_numpy(pz)

    if PATH != "":
        np.save(PATH + "w_z_ldatorch_" + str(K) + "_0", w_z)
        np.save(PATH + "z_d_ldatorch_" + str(K) + "_0", d_z.T)

    lda = LDA(alpha, beta)

    for j in range(it):
        start = time.time()
        for idx, (curr_d, curr_w, _) in enumerate(datamotion):
            # One-hot rows are built on demand; the original sliced them out
            # of np.eye(D), np.eye(W) and np.eye(I), which needs memory
            # quadratic in the number of rows for the I x I identity.
            d = from_numpy(_one_hot_row(D, curr_d))
            w = from_numpy(_one_hot_row(W, curr_w))
            i = from_numpy(_one_hot_row(I, idx))

            dz, wz, pz, z = lda.forward(dz, wz, z, pz, d, w, i)

        if PATH != "":
            # Fixed: files are numbered by sweep (j + 1). The original used
            # the inner row index, so every sweep overwrote the same file
            # and empty data raised NameError.
            np.save(PATH + "w_z_ldatorch_" + str(K) + "_" + str(j + 1),
                    tonumpy(wz).T)
            np.save(PATH + "z_d_ldatorch_" + str(K) + "_" + str(j + 1),
                    tonumpy(dz))
        if verbose:
            print("Iteration %s took %s" % (j, time.time() - start))
    return data, wz, dz, idx2vals, vals2idx
Beispiel #4
0
else:
	print "\nRunning ",options.inference,options.model,"["+str(options.K)+" topics] - NO initialization\nHyperparameters: alpha:",options.alpha,"beta:",options.beta,"eta:",options.eta,"a:",options.a,"\nNumber of iterations:",options.iteration,"; number of tuples in a batch is ",options.batch,"\n"

# Report the output location (options.path) before any model is run.
print "The parameters are being saved at:",options.path,"\n"
	
# Dispatch on options.model; each branch imports its model module lazily,
# prepares `train` and calls the model's entry point.
if options.model == "LDA":
	import LDA.lda as lda
	# Load the array at options.filename, run dictionate on columns 0 and 1
	# (the returned mappings are discarded) and split it with ratio 0.7.
	# `test` is not used in the lines shown here.
	data=np.load(options.filename)
	data,_,_,_=dictionate(data,cols=[0,1])
	train,test=splitTrainTestRepeated(data,0.7)		
	
	# `it` and `path` are not referenced in the lines shown here; the call
	# below reads options.iteration / options.path directly.
	it=options.iteration
	path=options.path
	
	#** Initialize outside the method call for fair comparison between models 
	train=addrandomcol(train,options.K,-1,1)

	# NOTE(review): randomness=options.randomness is forwarded although a
	# column was already added above; if the callee adds its own column when
	# randomness > 0 the data would be initialised twice -- confirm.
	# NOTE(review): `batch` and `compressed` are passed as keywords; verify
	# that LDA.lda.lda accepts them.
	data,dz,wz,idx2vals,vals2idx=lda.lda(train,options.K,options.iteration,options.alpha,options.beta,batch=options.batch,randomness=options.randomness,dict_=False,PATH=options.path,algo=options.inference,compressed=options.compression)

elif options.model == "RTM":
	import RTM.rtm as rtm

	# dd.npy is loaded from the directory that contains options.filename.
	path,_=os.path.split(options.filename)
	dd=np.load(path+"/dd.npy")
	train=np.load(options.filename)

	# Same external initialisation as in the LDA branch.
	train=addrandomcol(train,options.K,-1,1)#K

	data,dz,wz,idx2vals,vals2idx=rtm.rtm(train,dd,options.K,options.iteration,options.alpha,options.beta,options.eta,randomness=options.randomness,dict_=False,batch=options.batch,PATH=options.path,algo=options.inference,compressed=options.compression)