def get_big_module(nDim, nPts, nClusters, blocksize_step4, seqcount_step4, gridsize_step4, blocksize_step4part2, useTextureForData): # module to calculate distances between each cluster and half distance to closest modString = """ #define NCLUSTERS """ + str(nClusters) + """ #define NDIM """ + str(nDim) + """ #define NPTS """ + str(nPts) + """ //#define CLUSTERS_SIZE """ + str(nClusters*nDim) + """ #define SHARED_CLUSTERS """ + str((4000-32)/nDim) + """ #define THREADS4 """ + str(blocksize_step4) + """ #define BLOCKS4 """ + str(gridsize_step4) + """ #define SEQ_COUNT4 """ + str(seqcount_step4) + """ #define RED_OUT_WIDTH """ + str(gridsize_step4*nClusters) + """ #define THREADS4PART2 """ + str(blocksize_step4part2) + """ texture<float, 2, cudaReadModeElementType>texData; //----------------------------------------------------------------------- // misc functions //----------------------------------------------------------------------- // calculate the distance beteen two clusters __device__ float calc_dist(float *clusterA, float *clusterB) { float dist = (clusterA[0]-clusterB[0]) * (clusterA[0]-clusterB[0]); for (int i=NCLUSTERS; i<NDIM*NCLUSTERS; i+= NCLUSTERS) { float diff = clusterA[i] - clusterB[i]; dist += diff*diff; } //------------------------------------------------------------------------ // + loop(1, nDim, 16, // dist += (clusterA[{0}*NCLUSTERS] - clusterB[{0}*NCLUSTERS]) // *(clusterA[{0}*NCLUSTERS] - clusterB[{0}*NCLUSTERS]); // ) + //------------------------------------------------------------------------ return sqrt(dist); } __device__ float calc_dist_shared(float *clusterA, float *clusterB, int nFit) { float dist = (clusterA[0]-clusterB[0]) * (clusterA[0]-clusterB[0]); for (int i=1; i<NDIM; i++) { float diff = clusterA[i*nFit] - clusterB[i*NCLUSTERS]; dist += diff*diff; } return sqrt(dist); } // calculate the distance from a data point to a cluster __device__ float dc_dist(float *data, float *cluster) { float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]); //------------------------------------------------------------------------ """ + meta.loop(1, nDim, 16, """ dist += (data[{0}*NPTS] - cluster[{0}*NCLUSTERS]) *(data[{0}*NPTS] - cluster[{0}*NCLUSTERS]); """ ) + """ //------------------------------------------------------------------------ return sqrt(dist); } // calculate the distance from a data point in texture to a cluster __device__ float dc_dist_tex(int pt, float *cluster) { float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]); for(int i=1; i<NDIM; i++){ float diff = tex2D(texData, i, pt) - cluster[i*NCLUSTERS]; dist += diff * diff; } return sqrt(dist); } //----------------------------------------------------------------------- // ccdist //----------------------------------------------------------------------- #define NFIT 4 // **TODO** need to loop through clusters if all of them don't fit into shared memory // Calculate cluster - cluster distances __global__ void ccdist(float *clusters, float *cc_dists, float *hdClosest) { """ + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """ // calculate distance between this cluster and all lower clusters // then store the distance in the table in two places: (this, lower) and (lower, this) int idx = threadIdx.x + blockDim.x * blockIdx.x; if(idx >= NCLUSTERS) return; for(int c=0; c<idx; c++){ float d = 0.5f * calc_dist(s_clusters+c, s_clusters + idx); // store 1/2 distance cc_dists[c * NCLUSTERS + idx] = d; cc_dists[idx * NCLUSTERS + c] = d; } } //----------------------------------------------------------------------- // calc_hdClosest //----------------------------------------------------------------------- // Determination of hdClosest __global__ void calc_hdclosest(float *cc_dists, float *hdClosest) { int idx = threadIdx.x + blockIdx.x * blockDim.x; hdClosest[idx] = 1.0e10; for(int c=0; c<NCLUSTERS; c++){ if(c == idx) continue; float d = cc_dists[c*NCLUSTERS + idx]; // cc_dists contains 1/2 distance if(d < hdClosest[idx]) hdClosest[idx] = d; } } //----------------------------------------------------------------------- // init //----------------------------------------------------------------------- // **TODO** need to loop through clusters if all of them don't fit into shared memory // Assign data points to the nearest cluster """ if useTextureForData: modString += "__global__ void init(float *clusters,\n" else: modString += "__global__ void init(float *data, float *clusters,\n" modString += """ float *ccdist, float *hdClosest, int *assignments, float *lower, float *upper) { """ + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """ // calculate distance to each cluster int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= NPTS) return; // start with cluster 0 as the closest """ if useTextureForData: modString += "float min_dist = dc_dist_tex(idx, s_clusters);\n" else: modString += "float min_dist = dc_dist(data+idx, s_clusters);\n" modString += """ lower[idx] = min_dist; int closest = 0; for(int c=1; c<NCLUSTERS; c++){ // **TODO** see if this test to skip some calculations is really worth it on the gpu versus cpu if(min_dist <= ccdist[closest * NCLUSTERS + c]) continue; """ if useTextureForData: modString += "float d = dc_dist_tex(idx, s_clusters + c);\n" else: modString += "float d = dc_dist(data + idx, s_clusters + c);\n" modString += """ lower[c*NPTS + idx] = d; if(d < min_dist){ min_dist = d; closest = c; } } assignments[idx] = closest; upper[idx] = min_dist; } //----------------------------------------------------------------------- // step3 //----------------------------------------------------------------------- // **TODO** need to loop through clusters if all of them don't fit into shared memory // Step 3 of the algorithm """ if useTextureForData: modString += "__global__ void step3(float *clusters,\n" else: modString += "__global__ void step3(float *data, float *clusters,\n" modString += """ float *ccdist, float *hdClosest, int *assignments, float *lower, float *upper, int *badUpper, int *cluster_changed) { """ + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """ // idx ranges over the data points int idx = threadIdx.x + blockIdx.x * blockDim.x; if(idx >= NPTS) return; float ux = upper[idx]; int cx = assignments[idx]; float rx = badUpper[idx]; if(ux <= hdClosest[cx]) return; // step 2 condition for(int c=0; c<NCLUSTERS; c++){ // step 3 conditions... if(c == cx || ux <= lower[c*NPTS + idx] || ux <= ccdist[cx*NCLUSTERS + c]) continue; // Step 3a: check if upper bound needs to be recalculated float d_x_cx; if(rx){ // distance between point idx and its currently assigned center needs to be calculated """ if useTextureForData: modString += "d_x_cx = dc_dist_tex(idx, s_clusters+cx);\n" else: modString += "d_x_cx = dc_dist(data+idx, s_clusters+cx);\n" modString += """ ux = d_x_cx; lower[c*NPTS + idx] = d_x_cx; rx = 0; }else{ d_x_cx = ux; } // Step 3b: compute distance between x and c change x's assignment if necessary if(d_x_cx > lower[c*NPTS + idx] || d_x_cx > ccdist[cx*NCLUSTERS + c]){ """ if useTextureForData: modString += "float d_x_c = dc_dist_tex(idx, s_clusters+c);\n" else: modString += "float d_x_c = dc_dist(data+idx, s_clusters+c);\n" modString += """ lower[c*NPTS + idx] = d_x_c; if(d_x_c < d_x_cx){ // assign x to c // mark both c and cx as having changed ux = d_x_c; cx = c; rx = 0; } } } __syncthreads(); upper[idx] = ux; // check for new assignment and flag old and new cluster as changed if(cx != assignments[idx]){ cluster_changed[cx] = 1; cluster_changed[assignments[idx]] = 1; assignments[idx] = cx; } badUpper[idx] = rx; __syncthreads(); } //----------------------------------------------------------------------- // step4 //----------------------------------------------------------------------- // Calculate the new cluster centers """ if useTextureForData: modString += "__global__ void step4(\n" else: modString += "__global__ void step4(float *data,\n" modString += """ int *cluster_changed, float *reduction_out, int *reduction_counts, int *assignments) { __shared__ float s_data[THREADS4]; __shared__ int s_count[THREADS4]; int idx = threadIdx.x; int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + idx; int dim = blockIdx.y; for(int c=0; c<NCLUSTERS; c++){ if(cluster_changed[c]){ float tot = 0.0f; int count = 0; for(int s=0; s<SEQ_COUNT4; s++){ if(iData >= NPTS) break; if(assignments[iData] == c){ count += 1; """ if useTextureForData: modString += "tot += tex2D(texData, dim, iData);\n" else: modString += "tot += data[dim*NPTS + iData];\n" modString += """ } } s_data[idx] = tot; s_count[idx] = count; """ modString += meta.reduction2("s_data", "s_count", blocksize_step4) + """ if(idx == 0){ reduction_out[dim * RED_OUT_WIDTH + blockIdx.x * NCLUSTERS + c] = s_data[0]; reduction_counts[blockIdx.x * NCLUSTERS + c] = s_count[0]; } } } } //----------------------------------------------------------------------- // step4part2 //----------------------------------------------------------------------- // Calculate new cluster centers using reduction, part 2 __global__ void step4part2(int *cluster_changed, float *reduction_out, int *reduction_counts, float *new_clusters, float *clusters) { __shared__ float s_data[THREADS4PART2]; __shared__ int s_count[THREADS4PART2]; int idx = threadIdx.x; int dim = blockIdx.y; for(int c=0; c<NCLUSTERS; c++){ s_data[idx] = 0.0f; s_count[idx] = 0; if(cluster_changed[c]){ if(idx < BLOCKS4){ // straight copy of data into shared memory s_data[idx] = reduction_out[dim*RED_OUT_WIDTH + idx*NCLUSTERS + c]; s_count[idx] = reduction_counts[idx*NCLUSTERS + c]; } """ modString += meta.reduction2("s_data", "s_count", blocksize_step4part2) + """ } // calculate the new cluster, or copy the old one has no values or didn't change if(idx == 0){ if(s_count[0] == 0){ new_clusters[dim * NCLUSTERS + c] = clusters[dim*NCLUSTERS + c]; }else{ new_clusters[dim * NCLUSTERS + c] = s_data[0] / s_count[0]; } } } } //----------------------------------------------------------------------- // calc movement //----------------------------------------------------------------------- __global__ void calc_movement(float *clusters, float *new_clusters, float *cluster_movement, int *cluster_changed) { """ + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters*nDim) + """ int cluster = threadIdx.x + blockDim.x*blockIdx.x; if(cluster_changed[cluster]) cluster_movement[cluster] = calc_dist(s_clusters + cluster, new_clusters + cluster); } //----------------------------------------------------------------------- // step56 //----------------------------------------------------------------------- // **TODO** need to loop through clusters if all of them don't fit into shared memory // Assign data points to the nearest cluster __global__ void step56(int *assignment, float *lower, float * upper, float *cluster_movement, int *badUpper) { """ + meta.copy_to_shared("float", "cluster_movement", "s_cluster_movement", nClusters) + """ int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= NPTS) return; // loop through all clusters and update the lower bound for(int c=0; c < NCLUSTERS; c++){ if(s_cluster_movement[c] > 0.0f){ if(s_cluster_movement[c] < lower[c * NPTS + idx]){ lower[c*NPTS + idx] -= s_cluster_movement[c]; }else{ lower[c*NPTS + idx] = 0.0f; } } } // update the upper bound for this data point if(s_cluster_movement[assignment[idx]] > 0.f){ upper[idx] += s_cluster_movement[assignment[idx]]; badUpper[idx] = 1; } } """ #print modString return SourceModule(modString)
def get_cuda_module( nDim, nPts, nClusters, blocksize_calc, seqcount_calc, gridsize_calc, blocksize_calc_part2, useTextureForData, bounds ): modString = ( """ #define NCLUSTERS """ + str(nClusters) + """ #define NDIM """ + str(nDim) + """ #define NPTS """ + str(nPts) + """ #define THREADS4 """ + str(blocksize_calc) + """ #define BLOCKS4 """ + str(gridsize_calc) + """ #define SEQ_COUNT4 """ + str(seqcount_calc) + """ #define RED_OUT_WIDTH """ + str(gridsize_calc * nClusters) + """ #define THREADS4PART2 """ + str(blocksize_calc_part2) + """ #define BOUNDS (float)""" + str(bounds) + """ texture<float, 2, cudaReadModeElementType>texData; //----------------------------------------------------------------------- // misc functions //----------------------------------------------------------------------- // calculate the distance squared from a data point to a cluster __device__ float dc_dist(float *data, float *cluster) { float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]); //------------------------------------------------------------------------ """ + meta.loop( 1, nDim, 16, """ dist += (data[{0}*NPTS] - cluster[{0}*NCLUSTERS]) *(data[{0}*NPTS] - cluster[{0}*NCLUSTERS]); """, ) + """ //------------------------------------------------------------------------ return dist; } // calculate the distance squared from a data point to a cluster __device__ float dc_dist2(float *data, float *cluster) { float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]); float *pData = data; for(float *pCluster = cluster + NCLUSTERS; pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){ pData += NPTS; dist +=((*pData) - (*pCluster)) * ((*pData) - (*pCluster)); } return dist; } // calculate the distance squared from a data point in texture to a cluster __device__ float dc_dist_tex(int pt, float *cluster) { float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]); for(int i=1; i<NDIM; i++){ float diff = tex2D(texData, i, pt) - cluster[i*NCLUSTERS]; dist += diff * diff; } return dist; } // calculate the distance squared from a data point in texture to a cluster __device__ float dc_dist_tex2(int pt, float *cluster) { float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]); int i = 0; for(float *pCluster = cluster + NCLUSTERS; pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){ i += 1; float diff = tex2D(texData, i, pt) - *pCluster; dist += diff * diff; } return dist; } //----------------------------------------------------------------------- // assign //----------------------------------------------------------------------- // Assign data points to the nearest cluster """ ) if useTextureForData: modString += "__global__ void assign(float *clusters,\n" else: modString += "__global__ void assign(float *data, float *clusters,\n" modString += ( """ int *assignments) { """ + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters * nDim) + """ // calculate distance to each cluster int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= NPTS) return; // start with cluster 0 as the closest """ ) if useTextureForData: modString += "float min_dist = dc_dist_tex(idx, s_clusters);\n" else: modString += "float min_dist = dc_dist(data+idx, s_clusters);\n" modString += """ int closest = 0; for(int c=1; c<NCLUSTERS; c++){ """ if useTextureForData: modString += "float d = dc_dist_tex2(idx, s_clusters + c);\n" else: modString += "float d = dc_dist(data + idx, s_clusters + c);\n" modString += """ if(d < min_dist){ min_dist = d; closest = c; } } assignments[idx] = closest; } //----------------------------------------------------------------------- // calc //----------------------------------------------------------------------- // Calculate the new cluster centers """ if useTextureForData: modString += "__global__ void calc(\n" else: modString += "__global__ void calc(float *data,\n" modString += """ float *reduction_out, int *reduction_counts, int *assignments) { __shared__ float s_data[THREADS4]; __shared__ int s_count[THREADS4]; int idx = threadIdx.x; // int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + idx; int dim = blockIdx.y; for(int c=0; c<NCLUSTERS; c++){ float tot = 0.0f; int count = 0; for(int s=0; s<SEQ_COUNT4; s++){ int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + s * blockDim.x + idx; if(iData >= NPTS) break; if(assignments[iData] == c){ count += 1; """ if useTextureForData: modString += "tot += tex2D(texData, dim, iData);\n" else: modString += "tot += data[dim*NPTS + iData];\n" modString += """ } } s_data[idx] = tot; s_count[idx] = count; """ modString += ( meta.reduction2("s_data", "s_count", blocksize_calc) + """ if(idx == 0){ reduction_out[dim * RED_OUT_WIDTH + blockIdx.x * NCLUSTERS + c] = s_data[0]; reduction_counts[blockIdx.x * NCLUSTERS + c] = s_count[0]; } } } //----------------------------------------------------------------------- // calc_part2 //----------------------------------------------------------------------- // Calculate new cluster centers using reduction, part 2 __global__ void calc_part2(float *reduction_out, int *reduction_counts, float *new_clusters, float *clusters) { __shared__ float s_data[THREADS4PART2]; __shared__ int s_count[THREADS4PART2]; int idx = threadIdx.x; int dim = blockIdx.y; for(int c=0; c<NCLUSTERS; c++){ s_data[idx] = 0.0f; s_count[idx] = 0; if(idx < BLOCKS4){ // straight copy of data into shared memory s_data[idx] = reduction_out[dim*RED_OUT_WIDTH + idx*NCLUSTERS + c]; s_count[idx] = reduction_counts[idx*NCLUSTERS + c]; } """ ) modString += ( meta.reduction2("s_data", "s_count", blocksize_calc_part2) + """ // calculate the new cluster, or copy the old one if has no values or didn't change if(idx == 0){ if(s_count[0] == 0){ new_clusters[dim * NCLUSTERS + c] = clusters[dim*NCLUSTERS + c]; }else{ new_clusters[dim * NCLUSTERS + c] = s_data[0] / s_count[0]; } } } } """ ) # print modString return SourceModule(modString)
def get_cuda_module(nDim, nPts, nClusters, blocksize_calc, seqcount_calc, gridsize_calc, blocksize_calc_part2, useTextureForData, bounds): modString = """ #define NCLUSTERS """ + str(nClusters) + """ #define NDIM """ + str(nDim) + """ #define NPTS """ + str(nPts) + """ #define THREADS4 """ + str(blocksize_calc) + """ #define BLOCKS4 """ + str(gridsize_calc) + """ #define SEQ_COUNT4 """ + str(seqcount_calc) + """ #define RED_OUT_WIDTH """ + str(gridsize_calc * nClusters) + """ #define THREADS4PART2 """ + str(blocksize_calc_part2) + """ #define BOUNDS (float)""" + str(bounds) + """ texture<float, 2, cudaReadModeElementType>texData; //----------------------------------------------------------------------- // misc functions //----------------------------------------------------------------------- // calculate the distance squared from a data point to a cluster __device__ float dc_dist(float *data, float *cluster) { float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]); //------------------------------------------------------------------------ """ + meta.loop( 1, nDim, 16, """ dist += (data[{0}*NPTS] - cluster[{0}*NCLUSTERS]) *(data[{0}*NPTS] - cluster[{0}*NCLUSTERS]); """) + """ //------------------------------------------------------------------------ return dist; } // calculate the distance squared from a data point to a cluster __device__ float dc_dist2(float *data, float *cluster) { float dist = (data[0]-cluster[0]) * (data[0]-cluster[0]); float *pData = data; for(float *pCluster = cluster + NCLUSTERS; pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){ pData += NPTS; dist +=((*pData) - (*pCluster)) * ((*pData) - (*pCluster)); } return dist; } // calculate the distance squared from a data point in texture to a cluster __device__ float dc_dist_tex(int pt, float *cluster) { float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]); for(int i=1; i<NDIM; i++){ float diff = tex2D(texData, i, pt) - cluster[i*NCLUSTERS]; dist += diff * diff; } return dist; } // calculate the distance squared from a data point in texture to a cluster __device__ float dc_dist_tex2(int pt, float *cluster) { float dist = (tex2D(texData, 0, pt)-cluster[0]) * (tex2D(texData, 0, pt)-cluster[0]); int i = 0; for(float *pCluster = cluster + NCLUSTERS; pCluster < cluster + NCLUSTERS * NDIM; pCluster += NCLUSTERS){ i += 1; float diff = tex2D(texData, i, pt) - *pCluster; dist += diff * diff; } return dist; } //----------------------------------------------------------------------- // assign //----------------------------------------------------------------------- // Assign data points to the nearest cluster """ if useTextureForData: modString += "__global__ void assign(float *clusters,\n" else: modString += "__global__ void assign(float *data, float *clusters,\n" modString += """ int *assignments) { """ + meta.copy_to_shared("float", "clusters", "s_clusters", nClusters * nDim) + """ // calculate distance to each cluster int idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx >= NPTS) return; // start with cluster 0 as the closest """ if useTextureForData: modString += "float min_dist = dc_dist_tex(idx, s_clusters);\n" else: modString += "float min_dist = dc_dist(data+idx, s_clusters);\n" modString += """ int closest = 0; for(int c=1; c<NCLUSTERS; c++){ """ if useTextureForData: modString += "float d = dc_dist_tex2(idx, s_clusters + c);\n" else: modString += "float d = dc_dist(data + idx, s_clusters + c);\n" modString += """ if(d < min_dist){ min_dist = d; closest = c; } } assignments[idx] = closest; } //----------------------------------------------------------------------- // calc //----------------------------------------------------------------------- // Calculate the new cluster centers """ if useTextureForData: modString += "__global__ void calc(\n" else: modString += "__global__ void calc(float *data,\n" modString += """ float *reduction_out, int *reduction_counts, int *assignments) { __shared__ float s_data[THREADS4]; __shared__ int s_count[THREADS4]; int idx = threadIdx.x; // int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + idx; int dim = blockIdx.y; for(int c=0; c<NCLUSTERS; c++){ float tot = 0.0f; int count = 0; for(int s=0; s<SEQ_COUNT4; s++){ int iData = blockIdx.x * THREADS4 * SEQ_COUNT4 + s * blockDim.x + idx; if(iData >= NPTS) break; if(assignments[iData] == c){ count += 1; """ if useTextureForData: modString += "tot += tex2D(texData, dim, iData);\n" else: modString += "tot += data[dim*NPTS + iData];\n" modString += """ } } s_data[idx] = tot; s_count[idx] = count; """ modString += meta.reduction2("s_data", "s_count", blocksize_calc) + """ if(idx == 0){ reduction_out[dim * RED_OUT_WIDTH + blockIdx.x * NCLUSTERS + c] = s_data[0]; reduction_counts[blockIdx.x * NCLUSTERS + c] = s_count[0]; } } } //----------------------------------------------------------------------- // calc_part2 //----------------------------------------------------------------------- // Calculate new cluster centers using reduction, part 2 __global__ void calc_part2(float *reduction_out, int *reduction_counts, float *new_clusters, float *clusters) { __shared__ float s_data[THREADS4PART2]; __shared__ int s_count[THREADS4PART2]; int idx = threadIdx.x; int dim = blockIdx.y; for(int c=0; c<NCLUSTERS; c++){ s_data[idx] = 0.0f; s_count[idx] = 0; if(idx < BLOCKS4){ // straight copy of data into shared memory s_data[idx] = reduction_out[dim*RED_OUT_WIDTH + idx*NCLUSTERS + c]; s_count[idx] = reduction_counts[idx*NCLUSTERS + c]; } """ modString += meta.reduction2("s_data", "s_count", blocksize_calc_part2) + """ // calculate the new cluster, or copy the old one if has no values or didn't change if(idx == 0){ if(s_count[0] == 0){ new_clusters[dim * NCLUSTERS + c] = clusters[dim*NCLUSTERS + c]; }else{ new_clusters[dim * NCLUSTERS + c] = s_data[0] / s_count[0]; } } } } """ #print modString return SourceModule(modString)