__local DTYPE sdata[blockSize]; const int our_index = get_global_id(1); //The index of the vector that we will sum const int offset = get_local_id(0); sdata[offset] = 0; if(our_index < other_size) { //Load the values for(int i = offset; i<sum_size;i+=blockSize ) { sdata[offset] += from_matrix(in,i,our_index); } } NOWARPBLOCK """+opencl_tools.get_inkernal_reduction('sdata','blockSize','offset')+ """ if(our_index < other_size && offset == 0) { out[our_index] = sdata[0]; } } """ class SumKernal(object): def __init__(self,matrix,axis,queue=None): assert axis >= 0 and axis <= 1
const uint column_index = indices[c]; #ifdef INPUT_COLUMN_MAJOR const float in_value = in[column_index+out_col*out_rows]; #else const float in_value = in[column_index*max_cols+out_col]; #endif sdata[offset] += data[c]*in_value; } } #ifdef WARPSPEED if (blockSize > 32) #endif barrier(CLK_LOCAL_MEM_FENCE); """ + opencl_tools.get_inkernal_reduction("sdata", "blockSize", "offset") + """ //Return the output if (offset == 0 && row < num_rows && out_col < max_cols) { #ifdef OUTPUT_COLUMN_MAJOR out[row+out_col*num_rows] = sdata[0]; #else out[row*max_cols+out_col] = sdata[0]; #endif } } """ )