Example #1
import time

from numpy import sqrt
from numpy.linalg import eig

#loadMatrix is a project helper that reads a matrix from the named table

def EigenEmbedding(dataTable, finalDims = 3):
    t0 = time.time()

    #eigendecompose the (assumed symmetric) matrix
    e = eig(loadMatrix(dataTable))
    #scale each eigenvector by the square root of its eigenvalue's magnitude
    e = [(e[1].real.T[i]*sqrt(abs(e[0].real[i]))).tolist() for i in xrange(len(e[0]))]
    #e.reverse()
    #transpose so each row holds one point's coordinates, keeping finalDims axes
    e = [list(l) for l in zip(*e[:finalDims])]

    print time.time()-t0, " seconds to compute eigenvalue embedding."

    return e
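Note that eig does not sort its eigenpairs, so the slice above keeps the first finalDims pairs in whatever order they come back (the commented-out e.reverse() hints that ordering matters). Below is a minimal, self-contained sketch of the same scaling idea in plain NumPy, with the eigenpairs explicitly sorted by eigenvalue magnitude; the toy matrix is a stand-in for whatever loadMatrix returns:

import numpy as np

def eigen_embedding_sketch(m, final_dims=3):
    #eigendecompose a symmetric matrix and sort by |eigenvalue|, largest first
    vals, vecs = np.linalg.eig(m)
    order = np.argsort(-np.abs(vals.real))
    vals, vecs = vals.real[order], vecs.real[:, order]
    #scale each kept eigenvector by sqrt(|eigenvalue|); one row per input point
    return vecs[:, :final_dims] * np.sqrt(np.abs(vals[:final_dims]))

#usage on a toy symmetric matrix
m = np.array([[0.0, 1.0, 2.0], [1.0, 0.0, 1.0], [2.0, 1.0, 0.0]])
print(eigen_embedding_sketch(m, final_dims=2))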
Example #3
import time
import random

import numpy
from numpy import array, amax, amin
import pycuda.autoinit #creates a default CUDA context
import pycuda.driver as drv
from pycuda.compiler import SourceModule

#KMeansConfig, loadMatrix and KernelLocation are project helpers defined elsewhere

def KMeans(dataTable, k, epsilon=0.00001, srcDims=1000000000000000, iters=20, normData=False):
    """
    Get the best of iters tries of k-means, terminating a try when delta k < epsilon.
    """
    #load up the configuration
    kmOptions = KMeansConfig(dataTable, k, epsilon, srcDims)

    #load and format the table for use.
    data = loadMatrix(dataTable)[:, :kmOptions['sourceDims']]
    
    #optionally min-max normalise the data (quick and dirty; per-feature scaling would be better)
    if normData:
        dmax = amax(data)
        dmin = amin(data)
        data = (data - dmin) / (dmax - dmin + 0.00000001)

    #make our starting point solutions from the dataset
    solutions = [array(random.sample(data, k)) for i in xrange(iters)]
    
    #chunk solutions if necessary
    for i in xrange(len(solutions)):
        sol = []
        while len(solutions[i]) > kmOptions['chunkSize']:
            sol.append(solutions[i][:kmOptions['chunkSize']])
            solutions[i] = solutions[i][kmOptions['chunkSize']:]
        sol.append(solutions[i])
        solutions[i] = sol
    
    #create our chunked problem data
    dataChunks = []
    while len(data) > kmOptions['chunkSize']:
        dataChunks.append(data[:kmOptions['chunkSize']])
        data = data[kmOptions['chunkSize']:]
    dataChunks.append(data)
    #effective number of data chunks (the last chunk weighted by its fill fraction);
    #used below to renormalise the summed centroid updates
    kNorm = (len(dataChunks)-1)+len(dataChunks[-1])/float(len(dataChunks[0]))
    
    #create the CUDA kernels
    program = SourceModule(open(KernelLocation+"KMEANS_LABEL.nvcc").read())
    prg = program.get_function("KMEANS_LABEL")
    program = SourceModule(open(KernelLocation+"KMEANS_UPDATE.nvcc").read())
    prg2 = program.get_function("KMEANS_UPDATE")
    t0 = time.time()
    
    #store the resultant performance of each solution here
    results = []
    finalSols = []
    
    #make GPU allocations and support variables
    total = 0.
    dists = [numpy.zeros(kmOptions['chunkSize']).astype(numpy.float32)+10000000000000000. for i in xrange(len(dataChunks))] #this is used as an intermediate step
    labels = [numpy.zeros(kmOptions['chunkSize']).astype(numpy.uint32) for i in xrange(len(dataChunks))] #this is used as an intermediate step
    data_gpu = drv.mem_alloc(dataChunks[0].nbytes)
    k_gpu = drv.mem_alloc(solutions[0][0].nbytes)
    labels_gpu = drv.mem_alloc(labels[0].nbytes)
    dists_gpu = drv.mem_alloc(dists[0].nbytes)
    
    #calculate KMeans
    for sol in solutions:
        t0 = time.time()
        for iteration in xrange(10000):
            #Step 1: find the closest centroid label for every data point
            #reset the working distances so labels are recomputed from scratch
            for j in xrange(len(dataChunks)):
                dists[j].fill(10000000000000000.)
            for i in xrange(len(sol)):
                #copy in the current distances, labels, and the centroid coordinates
                drv.memcpy_htod(k_gpu, sol[i])
                for j in xrange(len(dataChunks)):
                    drv.memcpy_htod(data_gpu, dataChunks[j])
                    drv.memcpy_htod(labels_gpu, labels[j])
                    drv.memcpy_htod(dists_gpu, dists[j])
                    prg(k_gpu,
                        data_gpu,
                        kmOptions["dimensions"],
                        labels_gpu,
                        dists_gpu,
                        kmOptions['k'],
                        kmOptions['dataSize'],
                        kmOptions['chunkSize'],
                        numpy.int64(i*kmOptions['chunkSize']), #k offset
                        numpy.int64(j*kmOptions['chunkSize']), #data offset
                        kmOptions['maxThreads'],
                        block=kmOptions['block'],
                        grid=kmOptions['grid'])
                    #copy the partial results back so the next centroid chunk can refine them
                    drv.memcpy_dtoh(labels[j], labels_gpu)
                    drv.memcpy_dtoh(dists[j], dists_gpu)
            #Step 2: find the new averages
            old_sol = [s.copy() for s in sol]
            for i in xrange(len(sol)):
                #load up a blank set of k matrices
                drv.memcpy_htod(k_gpu, sol[i]*0.)
                for j in xrange(len(dataChunks)):
                    drv.memcpy_htod(data_gpu, dataChunks[j])
                    drv.memcpy_htod(labels_gpu, labels[j])
                    prg2(k_gpu,
                        data_gpu,
                        kmOptions["dimensions"],
                        labels_gpu,
                        kmOptions['k'],
                        kmOptions['dataSize'],
                        kmOptions['chunkSize'],
                        numpy.int64(i*kmOptions['chunkSize']), #label offset
                        numpy.int64(j*kmOptions['chunkSize']), #data offset
                        kmOptions['maxThreads'],
                        block=kmOptions['block'],
                        grid=kmOptions['grid'])
                drv.memcpy_dtoh(sol[i], k_gpu)
                sol[i] /= kNorm #final normalisation
            #Step 3: stop this try once the mean centroid update falls below epsilon
            total = 0.
            for j in xrange(len(sol)):
                tmp = sol[j]-old_sol[j]
                tmp = tmp*tmp
                total += sum([sum(t**0.5) for t in tmp])
            if total/kmOptions['dataSize'] < kmOptions['eps']:
                break
        print "solution done in ",time.time()-t0
        results.append((total,len(results)))
        finalSols.append(numpy.concatenate(sol)[:kmOptions['dataSize']])
    results.sort()
    return finalSols[results[0][1]]
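For reference, the termination rule used above is easier to read in a plain-NumPy k-means. This is a sketch under the same delta-k < epsilon criterion, not the project's GPU path; all names here are made up for illustration:

import numpy as np

def kmeans_sketch(data, k, epsilon=0.00001, max_iters=10000, seed=0):
    rng = np.random.default_rng(seed)
    #start from k random points of the dataset, as the GPU version does
    centroids = data[rng.choice(len(data), size=k, replace=False)].copy()
    for _ in range(max_iters):
        #Step 1: label every point with its nearest centroid
        dists = ((data[:, None, :] - centroids[None, :, :])**2).sum(axis=2)
        labels = dists.argmin(axis=1)
        #Step 2: move each centroid to the mean of its assigned points
        old = centroids.copy()
        for c in range(k):
            members = data[labels == c]
            if len(members):
                centroids[c] = members.mean(axis=0)
        #Step 3: stop once the mean centroid movement drops below epsilon
        if np.abs(centroids - old).sum() / len(data) < epsilon:
            break
    return centroids, labels

#usage on random 2-D points
points = np.random.default_rng(1).normal(size=(200, 2))
centroids, labels = kmeans_sketch(points, k=3)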
Example #5
nonmetric = False
for o in optlist:
    if o[0].strip('-') == 'nonmetric':
        nonmetric = True
    
for o in optlist:
    if o[0].strip('-') == 'help' or o[0].strip('-') == 'h':
        print "The following commands are available:"
        print "\t--if=inputfile\tDefaults to embedding.csv"
        print "\t--of=outputfile\tDefaults to embedding.ps"
        print "\t--k=k_nearest_neighbours\tDefaults to 12"
        print "\t--outdims=embedding_dimensions\tDefaults to 3"
        print "\t--indims=input_dimensions\tDefaults to all in the input file"
        print "\t--nonmetric\tEnables non-metric MDS embeddings"

result = []

graph_distances = array(loadMatrix(distfile)).flatten()


for dim in xrange(minDims, maxDims):
    embedding = loadMatrix(infile)

    #build the flattened pairwise distance matrix, reusing symmetric entries
    embedding_distances = []
    for i in xrange(len(embedding)):
        ei = embedding[i][:dim]
        for j in xrange(i):
            #entry (i,j) was already computed as entry (j,i)
            embedding_distances.append(embedding_distances[i+j*len(embedding)])
        for j in xrange(i, len(embedding)):
            e = ei-embedding[j][:dim]
            embedding_distances.append(dot(e, e))
    #correlate the embedding distances with the graph distances
    embedding_distances = corrcoef(sqrt(array(embedding_distances)), graph_distances)
    #residual variance: 1 - r^2, taken from the off-diagonal correlation entry
    residual = (1 - embedding_distances*embedding_distances)[0][1]
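The last two lines compute the residual variance 1 - r^2, the usual goodness-of-fit measure for Isomap-style embeddings. The same quantity in a compact, self-contained form (a sketch; pdist returns condensed pairwise distances rather than the full flattened matrix built above, so the target must be condensed the same way):

import numpy as np
from scipy.spatial.distance import pdist

def residual_variance(embedding, target_distances):
    #correlate the embedding's pairwise distances with the target distances
    d = pdist(embedding)  #condensed vector, one entry per point pair
    r = np.corrcoef(d, target_distances)[0][1]
    return 1.0 - r*r

#usage with random stand-in data
emb = np.random.default_rng(0).normal(size=(50, 3))
target = pdist(np.random.default_rng(1).normal(size=(50, 2)))
print(residual_variance(emb, target))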