Example #1
0
def get_big_module(nDim, nPts, nClusters,
                    blocksize_step4, seqcount_step4, gridsize_step4, 
                    blocksize_step4part2, useTextureForData):
    # module to calculate distances between each cluster and half distance to closest
    
    modString = """

#define NCLUSTERS         """ + str(nClusters)                    + """
#define NDIM              """ + str(nDim)                         + """
#define NPTS              """ + str(nPts)                         + """
//#define CLUSTERS_SIZE     """ + str(nClusters*nDim)               + """

#define SHARED_CLUSTERS   """ + str((4000-32)/nDim)              + """

#define THREADS4          """ + str(blocksize_step4)             + """
#define BLOCKS4           """ + str(gridsize_step4)              + """
#define SEQ_COUNT4        """ + str(seqcount_step4)              + """
#define RED_OUT_WIDTH     """ + str(gridsize_step4*nClusters)    + """
#define THREADS4PART2     """ + str(blocksize_step4part2)        + """

texture<float, 2, cudaReadModeElementType>texData;


//-----------------------------------------------------------------------
//                          misc functions
//-----------------------------------------------------------------------

// calculate the distance beteen two clusters
__device__ float calc_dist(float *clusterA, float *clusterB)
{
    float dist = (clusterA[0]-clusterB[0]) * (clusterA[0]-clusterB[0]);

    for (int i=NCLUSTERS; i<NDIM*NCLUSTERS; i+= NCLUSTERS) {
        float diff = clusterA[i] - clusterB[i];
        dist += diff*diff;
    }


//------------------------------------------------------------------------
// + loop(1, nDim, 16, 
//        dist += (clusterA[{0}*NCLUSTERS] - clusterB[{0}*NCLUSTERS])
//                *(clusterA[{0}*NCLUSTERS] - clusterB[{0}*NCLUSTERS]);
//        ) + 
//------------------------------------------------------------------------


    return sqrt(dist);
}


__device__ float calc_dist_shared(float *clusterA, float *clusterB, int nFit)
{
    float dist = (clusterA[0]-clusterB[0]) * (clusterA[0]-clusterB[0]);

    for (int i=1; i<NDIM; i++) {
        float diff = clusterA[i*nFit] - clusterB[i*NCLUSTERS];
        dist += diff*diff;
    }
    return sqrt(dist);
}
    

// calculate the distance from a data point to a cluster
__device__ float dc_dist(float *data, float *cluster)
{
    float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]);

//------------------------------------------------------------------------
""" + meta.loop(1, nDim, 16, """ 
        dist += (data[{0}*NPTS] - cluster[{0}*NCLUSTERS])
                *(data[{0}*NPTS] - cluster[{0}*NCLUSTERS]);
"""        ) + """
//------------------------------------------------------------------------

    return sqrt(dist);
}

// calculate the distance from a data point in texture to a cluster
__device__ float dc_dist_tex(int pt, float *cluster)
{
    float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]);
    for(int i=1; i<NDIM; i++){
        float diff = tex2D(texData, i, pt) - cluster[i*NCLUSTERS];
        dist += diff * diff;
    }
    return sqrt(dist);
}


//-----------------------------------------------------------------------
//                             ccdist
//-----------------------------------------------------------------------

#define NFIT 4


// **TODO**  need to loop through clusters if all of them don't fit into shared memory

// Calculate cluster - cluster distances
__global__ void ccdist(float *clusters, float *cc_dists, float *hdClosest)
{
""" + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """

    // calculate distance between this cluster and all lower clusters
    // then store the distance in the table in two places: (this, lower) and (lower, this)
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if(idx >= NCLUSTERS) return;
    for(int c=0; c<idx; c++){
        float d = 0.5f * calc_dist(s_clusters+c, s_clusters + idx); // store 1/2 distance
        cc_dists[c * NCLUSTERS + idx] = d;
        cc_dists[idx * NCLUSTERS + c] = d;
    }
}


//-----------------------------------------------------------------------
//                           calc_hdClosest
//-----------------------------------------------------------------------

// Determination of hdClosest
__global__ void calc_hdclosest(float *cc_dists, float *hdClosest)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    hdClosest[idx] = 1.0e10;
    for(int c=0; c<NCLUSTERS; c++){
        if(c == idx) continue;
        float d = cc_dists[c*NCLUSTERS + idx];      // cc_dists contains 1/2 distance
        if(d < hdClosest[idx]) hdClosest[idx] = d;
    }
}


//-----------------------------------------------------------------------
//                              init
//-----------------------------------------------------------------------

// **TODO**  need to loop through clusters if all of them don't fit into shared memory

// Assign data points to the nearest cluster

"""
    if useTextureForData:
        modString += "__global__ void init(float *clusters,\n"
    else:
        modString += "__global__ void init(float *data, float *clusters,\n"
    modString += """
                                float *ccdist, float *hdClosest, int *assignments, 
                                float *lower, float *upper)
{
""" + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """

    // calculate distance to each cluster
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= NPTS) return;
    
    // start with cluster 0 as the closest
"""
    if useTextureForData:
        modString += "float min_dist = dc_dist_tex(idx, s_clusters);\n"
    else:
        modString += "float min_dist = dc_dist(data+idx, s_clusters);\n"
    modString += """
    lower[idx] = min_dist;
    int closest = 0;
    
    for(int c=1; c<NCLUSTERS; c++){
    // **TODO**  see if this test to skip some calculations is really worth it on the gpu versus cpu
        if(min_dist <= ccdist[closest * NCLUSTERS + c]) continue;
"""
    if useTextureForData:
        modString += "float d = dc_dist_tex(idx, s_clusters + c);\n"
    else:
        modString += "float d = dc_dist(data + idx, s_clusters + c);\n"
    modString += """
        lower[c*NPTS + idx] = d;
        if(d < min_dist){
            min_dist = d;
            closest = c;
        }
    }
    assignments[idx] = closest;
    upper[idx] = min_dist;
}


//-----------------------------------------------------------------------
//                                step3
//-----------------------------------------------------------------------

// **TODO**  need to loop through clusters if all of them don't fit into shared memory


// Step 3 of the algorithm

"""
    if useTextureForData:
        modString += "__global__ void step3(float *clusters,\n"
    else:
        modString += "__global__ void step3(float *data, float *clusters,\n"
    modString += """
                                     float *ccdist, float *hdClosest, int *assignments, 
                                     float *lower, float *upper, int *badUpper, 
                                     int *cluster_changed)
{
""" + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """
    
    // idx ranges over the data points
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx >= NPTS) return;
    
    float ux = upper[idx];
    int cx = assignments[idx];
    float rx = badUpper[idx];

    if(ux <= hdClosest[cx]) return; // step 2 condition

    for(int c=0; c<NCLUSTERS; c++){
        // step 3 conditions...
        if(c == cx || ux <= lower[c*NPTS + idx] || ux <= ccdist[cx*NCLUSTERS + c])
             continue;
             
        // Step 3a: check if upper bound needs to be recalculated
        float d_x_cx;
        if(rx){

        // distance between point idx and its currently assigned center needs to be calculated
"""
    if useTextureForData:
        modString += "d_x_cx = dc_dist_tex(idx, s_clusters+cx);\n"
    else:
        modString += "d_x_cx = dc_dist(data+idx, s_clusters+cx);\n"
    modString += """
            ux = d_x_cx;
            lower[c*NPTS + idx] = d_x_cx;
            rx = 0;
        }else{
            d_x_cx = ux;
        }
        
        // Step 3b: compute distance between x and c change x's assignment if necessary
        if(d_x_cx > lower[c*NPTS + idx] || d_x_cx > ccdist[cx*NCLUSTERS + c]){
"""
    if useTextureForData:
        modString += "float d_x_c = dc_dist_tex(idx, s_clusters+c);\n"
    else:
        modString += "float d_x_c = dc_dist(data+idx, s_clusters+c);\n"
    modString += """
            lower[c*NPTS + idx] = d_x_c;
            if(d_x_c < d_x_cx){
                // assign x to c
                // mark both c and cx as having changed
                ux = d_x_c;
                cx = c;
                rx = 0;
            }
        }
    }
    __syncthreads();

    upper[idx] = ux;
    
    // check for new assignment and flag old and new cluster as changed
    if(cx != assignments[idx]){
        cluster_changed[cx] = 1;
        cluster_changed[assignments[idx]] = 1;
        assignments[idx] = cx;
    }
    badUpper[idx] = rx;
    
    __syncthreads();
}

//-----------------------------------------------------------------------
//                                step4
//-----------------------------------------------------------------------

// Calculate the new cluster centers
"""
    if useTextureForData:
        modString += "__global__ void step4(\n"
    else:
        modString += "__global__ void step4(float *data,\n"
    modString += """
                        int *cluster_changed, float *reduction_out,
                        int *reduction_counts, int *assignments)
{
    __shared__ float s_data[THREADS4];
    __shared__ int s_count[THREADS4];

    int idx = threadIdx.x;
    int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + idx;
    
    int dim = blockIdx.y;
    
    for(int c=0; c<NCLUSTERS; c++){
        if(cluster_changed[c]){
            float tot = 0.0f;
            int count = 0;
            for(int s=0; s<SEQ_COUNT4; s++){
                if(iData >= NPTS) break;
                if(assignments[iData] == c){
                    count += 1;

"""
    if useTextureForData:
        modString += "tot += tex2D(texData, dim, iData);\n"
    else:
        modString += "tot += data[dim*NPTS + iData];\n"
    modString += """
                }
            }
            s_data[idx] = tot;
            s_count[idx] = count;
"""

    modString += meta.reduction2("s_data", "s_count", blocksize_step4) + """

            if(idx == 0){
                reduction_out[dim * RED_OUT_WIDTH + blockIdx.x * NCLUSTERS + c] = s_data[0];
                reduction_counts[blockIdx.x * NCLUSTERS + c] = s_count[0];
            }
        }
    }
}


//-----------------------------------------------------------------------
//                           step4part2
//-----------------------------------------------------------------------

// Calculate new cluster centers using reduction, part 2

__global__ void step4part2(int *cluster_changed, float *reduction_out, int *reduction_counts,
                            float *new_clusters, float *clusters)
{
    __shared__ float s_data[THREADS4PART2];
    __shared__ int s_count[THREADS4PART2];
    
    int idx = threadIdx.x;
    
    int dim = blockIdx.y;

    for(int c=0; c<NCLUSTERS; c++){
        s_data[idx] = 0.0f;
        s_count[idx] = 0;
        if(cluster_changed[c]){
            if(idx < BLOCKS4){
                // straight copy of data into shared memory
                s_data[idx] = reduction_out[dim*RED_OUT_WIDTH + idx*NCLUSTERS + c];
                s_count[idx] = reduction_counts[idx*NCLUSTERS + c];
            }
"""

    modString += meta.reduction2("s_data", "s_count", blocksize_step4part2) + """
    
        }

        // calculate the new cluster, or copy the old one has no values or didn't change
        if(idx == 0){
            if(s_count[0] == 0){
                new_clusters[dim * NCLUSTERS + c] = clusters[dim*NCLUSTERS + c];
            }else{
                new_clusters[dim * NCLUSTERS + c] = s_data[0] / s_count[0];
            }
        }
    }
}
    

//-----------------------------------------------------------------------
//                                calc movement
//-----------------------------------------------------------------------
__global__ void calc_movement(float *clusters, float *new_clusters, float *cluster_movement, 
                                int *cluster_changed)
{
""" + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """
    
    int cluster = threadIdx.x + blockDim.x*blockIdx.x;
    if(cluster_changed[cluster])
        cluster_movement[cluster] = calc_dist(s_clusters + cluster, new_clusters + cluster);
}


//-----------------------------------------------------------------------
//                                step56
//-----------------------------------------------------------------------
// **TODO**  need to loop through clusters if all of them don't fit into shared memory

// Assign data points to the nearest cluster
__global__ void step56(int *assignment, 
                        float *lower, float * upper, 
                        float *cluster_movement, int *badUpper)
{
""" + meta.copy_to_shared("float", "cluster_movement", "s_cluster_movement", nClusters) + """

    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= NPTS) return;
    
    // loop through all clusters and update the lower bound
    for(int c=0; c < NCLUSTERS; c++){
        if(s_cluster_movement[c] > 0.0f){
            if(s_cluster_movement[c] < lower[c * NPTS + idx]){
                lower[c*NPTS + idx] -= s_cluster_movement[c];
            }else{
                lower[c*NPTS + idx] = 0.0f;
            }
        }
    }
    
    // update the upper bound for this data point
    if(s_cluster_movement[assignment[idx]] > 0.f){
        upper[idx] += s_cluster_movement[assignment[idx]];
        badUpper[idx] = 1;
    }
}

"""
    #print modString
    return SourceModule(modString)
Example #2
0
def get_cuda_module(
    nDim, nPts, nClusters, blocksize_calc, seqcount_calc, gridsize_calc, blocksize_calc_part2, useTextureForData, bounds
):

    modString = (
        """

#define NCLUSTERS         """
        + str(nClusters)
        + """
#define NDIM              """
        + str(nDim)
        + """
#define NPTS              """
        + str(nPts)
        + """

#define THREADS4          """
        + str(blocksize_calc)
        + """
#define BLOCKS4           """
        + str(gridsize_calc)
        + """
#define SEQ_COUNT4        """
        + str(seqcount_calc)
        + """
#define RED_OUT_WIDTH     """
        + str(gridsize_calc * nClusters)
        + """
#define THREADS4PART2     """
        + str(blocksize_calc_part2)
        + """

#define BOUNDS     (float)"""
        + str(bounds)
        + """

texture<float, 2, cudaReadModeElementType>texData;


//-----------------------------------------------------------------------
//                          misc functions
//-----------------------------------------------------------------------


// calculate the distance squared from a data point to a cluster
__device__ float dc_dist(float *data, float *cluster)
{
    float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]);

//------------------------------------------------------------------------
"""
        + meta.loop(
            1,
            nDim,
            16,
            """ 
        dist += (data[{0}*NPTS] - cluster[{0}*NCLUSTERS])
                *(data[{0}*NPTS] - cluster[{0}*NCLUSTERS]);
""",
        )
        + """
//------------------------------------------------------------------------

    return dist;
}

// calculate the distance squared from a data point to a cluster
__device__ float dc_dist2(float *data, float *cluster)
{
    float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]);
    float *pData = data;
    for(float *pCluster = cluster + NCLUSTERS; 
            pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){
        pData += NPTS;
        dist +=((*pData) - (*pCluster)) * ((*pData) - (*pCluster));
    }

    return dist;
}


// calculate the distance squared from a data point in texture to a cluster
__device__ float dc_dist_tex(int pt, float *cluster)
{
    float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]);
    for(int i=1; i<NDIM; i++){
        float diff = tex2D(texData, i, pt) - cluster[i*NCLUSTERS];
        dist += diff * diff;
    }
    return dist;
}


// calculate the distance squared from a data point in texture to a cluster
__device__ float dc_dist_tex2(int pt, float *cluster)
{
    float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]);
    int i = 0;
    for(float *pCluster = cluster + NCLUSTERS;
            pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){
        i += 1;
        float diff = tex2D(texData, i, pt) - *pCluster;
        dist += diff * diff;
    }
    return dist;
}


//-----------------------------------------------------------------------
//                              assign
//-----------------------------------------------------------------------

// Assign data points to the nearest cluster

"""
    )
    if useTextureForData:
        modString += "__global__ void assign(float *clusters,\n"
    else:
        modString += "__global__ void assign(float *data, float *clusters,\n"
    modString += (
        """
                                int *assignments)
{
"""
        + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters * nDim)
        + """

    // calculate distance to each cluster
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= NPTS) return;
    
    // start with cluster 0 as the closest
"""
    )
    if useTextureForData:
        modString += "float min_dist = dc_dist_tex(idx, s_clusters);\n"
    else:
        modString += "float min_dist = dc_dist(data+idx, s_clusters);\n"
    modString += """
    int closest = 0;
    
    for(int c=1; c<NCLUSTERS; c++){
"""
    if useTextureForData:
        modString += "float d = dc_dist_tex2(idx, s_clusters + c);\n"
    else:
        modString += "float d = dc_dist(data + idx, s_clusters + c);\n"
    modString += """
        if(d < min_dist){
            min_dist = d;
            closest = c;
        }
    }
    assignments[idx] = closest;
}



//-----------------------------------------------------------------------
//                                calc
//-----------------------------------------------------------------------

// Calculate the new cluster centers
"""
    if useTextureForData:
        modString += "__global__ void calc(\n"
    else:
        modString += "__global__ void calc(float *data,\n"
    modString += """
                        float *reduction_out, int *reduction_counts, 
                        int *assignments)
{
    __shared__ float s_data[THREADS4];
    __shared__ int s_count[THREADS4];

    int idx = threadIdx.x;
//    int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + idx;
    
    int dim = blockIdx.y;
    
    for(int c=0; c<NCLUSTERS; c++){
        float tot = 0.0f;
        int count = 0;
        for(int s=0; s<SEQ_COUNT4; s++){
            int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + s * blockDim.x + idx;
            if(iData >= NPTS) break;
            if(assignments[iData] == c){
                count += 1;
"""
    if useTextureForData:
        modString += "tot += tex2D(texData, dim, iData);\n"
    else:
        modString += "tot += data[dim*NPTS + iData];\n"
    modString += """
            }
        }
        s_data[idx] = tot;
        s_count[idx] = count;
"""

    modString += (
        meta.reduction2("s_data", "s_count", blocksize_calc)
        + """

        if(idx == 0){
            reduction_out[dim * RED_OUT_WIDTH + blockIdx.x * NCLUSTERS + c] = s_data[0];
            reduction_counts[blockIdx.x * NCLUSTERS + c] = s_count[0];
        }
    }
}


//-----------------------------------------------------------------------
//                           calc_part2
//-----------------------------------------------------------------------

// Calculate new cluster centers using reduction, part 2

__global__ void calc_part2(float *reduction_out, int *reduction_counts,
                            float *new_clusters, float *clusters)
{
    __shared__ float s_data[THREADS4PART2];
    __shared__ int s_count[THREADS4PART2];
    
    int idx = threadIdx.x;
    
    int dim = blockIdx.y;

    for(int c=0; c<NCLUSTERS; c++){
        s_data[idx] = 0.0f;
        s_count[idx] = 0;
        if(idx < BLOCKS4){
            // straight copy of data into shared memory
            s_data[idx] = reduction_out[dim*RED_OUT_WIDTH + idx*NCLUSTERS + c];
            s_count[idx] = reduction_counts[idx*NCLUSTERS + c];
        }
"""
    )

    modString += (
        meta.reduction2("s_data", "s_count", blocksize_calc_part2)
        + """

        // calculate the new cluster, or copy the old one if has no values or didn't change
        if(idx == 0){
            if(s_count[0] == 0){
                new_clusters[dim * NCLUSTERS + c] = clusters[dim*NCLUSTERS + c];
            }else{
                new_clusters[dim * NCLUSTERS + c] = s_data[0] / s_count[0];
            }
        }
    }
}
    

"""
    )
    # print modString
    return SourceModule(modString)
def get_cuda_module(nDim, nPts, nClusters, blocksize_calc, seqcount_calc,
                    gridsize_calc, blocksize_calc_part2, useTextureForData,
                    bounds):

    modString = """

#define NCLUSTERS         """ + str(nClusters) + """
#define NDIM              """ + str(nDim) + """
#define NPTS              """ + str(nPts) + """

#define THREADS4          """ + str(blocksize_calc) + """
#define BLOCKS4           """ + str(gridsize_calc) + """
#define SEQ_COUNT4        """ + str(seqcount_calc) + """
#define RED_OUT_WIDTH     """ + str(gridsize_calc * nClusters) + """
#define THREADS4PART2     """ + str(blocksize_calc_part2) + """

#define BOUNDS     (float)""" + str(bounds) + """

texture<float, 2, cudaReadModeElementType>texData;


//-----------------------------------------------------------------------
//                          misc functions
//-----------------------------------------------------------------------


// calculate the distance squared from a data point to a cluster
__device__ float dc_dist(float *data, float *cluster)
{
    float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]);

//------------------------------------------------------------------------
""" + meta.loop(
        1, nDim, 16, """ 
        dist += (data[{0}*NPTS] - cluster[{0}*NCLUSTERS])
                *(data[{0}*NPTS] - cluster[{0}*NCLUSTERS]);
""") + """
//------------------------------------------------------------------------

    return dist;
}

// calculate the distance squared from a data point to a cluster
__device__ float dc_dist2(float *data, float *cluster)
{
    float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]);
    float *pData = data;
    for(float *pCluster = cluster + NCLUSTERS; 
            pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){
        pData += NPTS;
        dist +=((*pData) - (*pCluster)) * ((*pData) - (*pCluster));
    }

    return dist;
}


// calculate the distance squared from a data point in texture to a cluster
__device__ float dc_dist_tex(int pt, float *cluster)
{
    float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]);
    for(int i=1; i<NDIM; i++){
        float diff = tex2D(texData, i, pt) - cluster[i*NCLUSTERS];
        dist += diff * diff;
    }
    return dist;
}


// calculate the distance squared from a data point in texture to a cluster
__device__ float dc_dist_tex2(int pt, float *cluster)
{
    float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]);
    int i = 0;
    for(float *pCluster = cluster + NCLUSTERS;
            pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){
        i += 1;
        float diff = tex2D(texData, i, pt) - *pCluster;
        dist += diff * diff;
    }
    return dist;
}


//-----------------------------------------------------------------------
//                              assign
//-----------------------------------------------------------------------

// Assign data points to the nearest cluster

"""
    if useTextureForData:
        modString += "__global__ void assign(float *clusters,\n"
    else:
        modString += "__global__ void assign(float *data, float *clusters,\n"
    modString += """
                                int *assignments)
{
""" + meta.copy_to_shared("float", "clusters", "s_clusters",
                          nClusters * nDim) + """

    // calculate distance to each cluster
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= NPTS) return;
    
    // start with cluster 0 as the closest
"""
    if useTextureForData:
        modString += "float min_dist = dc_dist_tex(idx, s_clusters);\n"
    else:
        modString += "float min_dist = dc_dist(data+idx, s_clusters);\n"
    modString += """
    int closest = 0;
    
    for(int c=1; c<NCLUSTERS; c++){
"""
    if useTextureForData:
        modString += "float d = dc_dist_tex2(idx, s_clusters + c);\n"
    else:
        modString += "float d = dc_dist(data + idx, s_clusters + c);\n"
    modString += """
        if(d < min_dist){
            min_dist = d;
            closest = c;
        }
    }
    assignments[idx] = closest;
}



//-----------------------------------------------------------------------
//                                calc
//-----------------------------------------------------------------------

// Calculate the new cluster centers
"""
    if useTextureForData:
        modString += "__global__ void calc(\n"
    else:
        modString += "__global__ void calc(float *data,\n"
    modString += """
                        float *reduction_out, int *reduction_counts, 
                        int *assignments)
{
    __shared__ float s_data[THREADS4];
    __shared__ int s_count[THREADS4];

    int idx = threadIdx.x;
//    int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + idx;
    
    int dim = blockIdx.y;
    
    for(int c=0; c<NCLUSTERS; c++){
        float tot = 0.0f;
        int count = 0;
        for(int s=0; s<SEQ_COUNT4; s++){
            int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + s * blockDim.x + idx;
            if(iData >= NPTS) break;
            if(assignments[iData] == c){
                count += 1;
"""
    if useTextureForData:
        modString += "tot += tex2D(texData, dim, iData);\n"
    else:
        modString += "tot += data[dim*NPTS + iData];\n"
    modString += """
            }
        }
        s_data[idx] = tot;
        s_count[idx] = count;
"""

    modString += meta.reduction2("s_data", "s_count", blocksize_calc) + """

        if(idx == 0){
            reduction_out[dim * RED_OUT_WIDTH + blockIdx.x * NCLUSTERS + c] = s_data[0];
            reduction_counts[blockIdx.x * NCLUSTERS + c] = s_count[0];
        }
    }
}


//-----------------------------------------------------------------------
//                           calc_part2
//-----------------------------------------------------------------------

// Calculate new cluster centers using reduction, part 2

__global__ void calc_part2(float *reduction_out, int *reduction_counts,
                            float *new_clusters, float *clusters)
{
    __shared__ float s_data[THREADS4PART2];
    __shared__ int s_count[THREADS4PART2];
    
    int idx = threadIdx.x;
    
    int dim = blockIdx.y;

    for(int c=0; c<NCLUSTERS; c++){
        s_data[idx] = 0.0f;
        s_count[idx] = 0;
        if(idx < BLOCKS4){
            // straight copy of data into shared memory
            s_data[idx] = reduction_out[dim*RED_OUT_WIDTH + idx*NCLUSTERS + c];
            s_count[idx] = reduction_counts[idx*NCLUSTERS + c];
        }
"""

    modString += meta.reduction2("s_data", "s_count",
                                 blocksize_calc_part2) + """

        // calculate the new cluster, or copy the old one if has no values or didn't change
        if(idx == 0){
            if(s_count[0] == 0){
                new_clusters[dim * NCLUSTERS + c] = clusters[dim*NCLUSTERS + c];
            }else{
                new_clusters[dim * NCLUSTERS + c] = s_data[0] / s_count[0];
            }
        }
    }
}
    

"""
    #print modString
    return SourceModule(modString)