def init_buffers(self):
    """Allocate local buffers and exchange CUDA IPC handles with all peers.

    Two zero-filled GPU arrays (output and scratch) with the shape and dtype
    of ``self.op.args[0]`` are allocated and registered under this device's
    id.  IPC handles for both buffers and for ``self.event`` are then
    broadcast to the peers, and every peer's handles are opened and stored in
    ``output_buff_dict`` / ``scratch_buff_dict`` / ``event_buff_dict`` under
    the peer's id.

    ``self.comm.bcast`` is used as a collective (assumed mpi4py-style
    communicator whose ranks equal the device ids -- TODO confirm), so every
    participant has to issue the broadcasts in the same root order.  The
    roots are therefore visited in the order of ``self.op.device_ids`` on
    every device; this device sends when it is the root and receives
    otherwise.  ``self.device_id`` is expected to be one of
    ``self.op.device_ids``.
    """
    tensor_desc = self.op.args[0].tensor_description()
    shape = tensor_desc.shape
    dtype = tensor_desc.dtype

    # Allocate output and scratch buffers
    self.output_buff = gpuarray.zeros(shape, dtype)
    self.scratch_buff = gpuarray.zeros(shape, dtype)
    self.output_buff_dict[self.device_id] = self.output_buff.gpudata
    self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

    # Export IPC handles for the local buffers and event
    msg = (self.device_id,
           drv.mem_get_ipc_handle(self.output_buff.gpudata),
           drv.mem_get_ipc_handle(self.scratch_buff.gpudata),
           self.event.ipc_handle())

    # One broadcast per device, issued in the same order everywhere so the
    # collective calls match up across participants.
    for root in self.op.device_ids:
        if root == self.device_id:
            self.comm.bcast(msg, root=root)
            continue

        (peer_id, output_ipc_hdl, scratch_ipc_hdl, event_ipc_hdl) = \
            self.comm.bcast(None, root=root)

        # Open all three peer handles before registering any of them
        output_hdl = drv.IPCMemoryHandle(output_ipc_hdl)
        scratch_hdl = drv.IPCMemoryHandle(scratch_ipc_hdl)
        event_hdl = drv.Event.from_ipc_handle(event_ipc_hdl)
        self.output_buff_dict[peer_id] = output_hdl
        self.scratch_buff_dict[peer_id] = scratch_hdl
        self.event_buff_dict[peer_id] = event_hdl
def init_buffers(self):
    """Set up this device's buffers and trade IPC handles with the peers.

    The output buffer has the shape of ``self.op.args[0]``; the scratch
    buffer is a flat array of ``segment_size * n_devs`` elements, where
    ``segment_size`` comes from ``calculate_segment_size``.  Handles for both
    buffers and for ``self.event`` are broadcast once per entry of
    ``self.device_ids``: this device sends when it is the root, otherwise it
    receives the root's handles, opens them and stores them under the peer id.
    """
    describe = self.op.args[0].tensor_description
    shape = describe().shape
    dtype = describe().dtype
    n_devs = len(self.op.device_ids)
    segment_size = calculate_segment_size(describe().axes.size, n_devs)

    # Local buffers, registered under our own device id
    self.output_buff = gpuarray.zeros(shape, dtype)
    self.scratch_buff = gpuarray.zeros(segment_size * n_devs, dtype)
    self.output_buff_dict[self.device_id] = self.output_buff.gpudata
    self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

    # What we announce to everybody else
    own_output_ipc = drv.mem_get_ipc_handle(self.output_buff.gpudata)
    own_scratch_ipc = drv.mem_get_ipc_handle(self.scratch_buff.gpudata)
    own_event_ipc = self.event.ipc_handle()
    announcement = (self.device_id, own_output_ipc, own_scratch_ipc,
                    own_event_ipc)

    for root in self.device_ids:
        if root != self.device_id:
            # Somebody else is the root: take their handles and open them
            received = self.comm.bcast(None, root=root)
            peer_id, peer_output_ipc, peer_scratch_ipc, peer_event_ipc = \
                received
            opened_output = drv.IPCMemoryHandle(peer_output_ipc)
            opened_scratch = drv.IPCMemoryHandle(peer_scratch_ipc)
            opened_event = drv.Event.from_ipc_handle(peer_event_ipc)
            self.output_buff_dict[peer_id] = opened_output
            self.scratch_buff_dict[peer_id] = opened_scratch
            self.event_buff_dict[peer_id] = opened_event
        else:
            # Our turn to be the root
            self.comm.bcast(announcement, root=root)
def init_buffers(self):
    """Allocate local buffers and swap IPC handles through shared queues.

    Output and scratch arrays matching ``self.op.args[0]`` are created and
    registered under this device's id.  A tuple of
    ``(device_id, output handle, scratch handle, event handle)`` is put on
    the queue of every other entry in ``self.op.shared_queues``; afterwards
    one such tuple per other entry is read from this device's own queue, and
    the handles it carries are opened and stored under the sender's id.
    """
    describe = self.op.args[0].tensor_description
    shape = describe().shape
    dtype = describe().dtype

    # Local buffers
    self.output_buff = gpuarray.zeros(shape, dtype)
    self.scratch_buff = gpuarray.zeros(shape, dtype)
    self.output_buff_dict[self.device_id] = self.output_buff.gpudata
    self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

    # Exportable handles for the local buffers and event
    announcement = (
        self.device_id,
        drv.mem_get_ipc_handle(self.output_buff.gpudata),
        drv.mem_get_ipc_handle(self.scratch_buff.gpudata),
        self.event.ipc_handle(),
    )

    # Post our handles to everybody but ourselves
    for dev_id in self.op.shared_queues.keys():
        if dev_id == self.device_id:
            continue
        self.op.shared_queues[dev_id].put(announcement)

    # Collect one announcement from each of the other devices
    inbox = self.op.shared_queues[self.device_id]
    n_peers = len(self.op.shared_queues) - 1
    for _ in range(n_peers):
        peer_id, peer_output_ipc, peer_scratch_ipc, peer_event_ipc = \
            inbox.get()
        opened_output = drv.IPCMemoryHandle(peer_output_ipc)
        opened_scratch = drv.IPCMemoryHandle(peer_scratch_ipc)
        opened_event = drv.Event.from_ipc_handle(peer_event_ipc)
        self.output_buff_dict[peer_id] = opened_output
        self.scratch_buff_dict[peer_id] = opened_scratch
        self.event_buff_dict[peer_id] = opened_event
def open_ipc_handle(shared_queue):
    """Wait for a (buffer, lock) IPC handle pair on the queue and open both.

    The queue is polled with a timeout of ``SLEEP_S``; an ``Empty`` timeout
    simply triggers another poll, while any other exception propagates to
    the caller.

    Returns:
        tuple: ``(buf_hdl, lock)``, both ``drv.IPCMemoryHandle`` objects.
    """
    while True:
        try:
            buf_ipc_hdl, lock_ipc_hdl = shared_queue.get(timeout=SLEEP_S)
            opened_buf = drv.IPCMemoryHandle(buf_ipc_hdl)
            opened_lock = drv.IPCMemoryHandle(lock_ipc_hdl)
            return (opened_buf, opened_lock)
        except Empty:
            # Nothing arrived within the timeout; poll again
            continue
def bcast_ipc_handle(comm, handle=None):
    """Broadcast an IPC memory handle over ``comm``.

    With ``handle`` given, an IPC handle is obtained for it via
    ``drv.mem_get_ipc_handle`` and broadcast; the result of ``comm.bcast``
    is returned as is.  Without ``handle``, the value delivered by
    ``comm.bcast`` is opened with ``drv.IPCMemoryHandle`` and the opened
    handle is returned.
    """
    if handle is None:
        # Receiving side: open whatever was broadcast to us
        received = comm.bcast(None)
        return drv.IPCMemoryHandle(received)

    # Sending side: export the handle and broadcast it
    exported = drv.mem_get_ipc_handle(handle)
    return comm.bcast(exported)
def fun_load(config, sock_data=5000):
    """Loader loop: read hkl batches and copy them into a shared GPU buffer.

    ``config['queue_t2l']`` is only read from and ``config['queue_l2t']`` is
    only written to (both are multiprocessing queues according to the
    original comments).  The shape, dtype and IPC handle of the remote buffer
    arrive as one pickled object on a zmq PAIR socket bound to port
    ``sock_data``.  The function then loops forever: it loads a file, does
    the optional crop/mirror, stages the data on the GPU, waits for
    ``'calc_finished'``, copies into the remote buffer and reports
    ``'copy_finished'``.
    """
    to_trainer = config['queue_l2t']
    from_trainer = config['queue_t2l']

    # Random crop / mirror is only done here when no data layer is used
    do_rand_proc = not config['use_data_layer']
    batch_mode = config['batch_crop_mirror']

    # CUDA context on the GPU named by the last character of config['gpu']
    drv.init()
    gpu_index = int(config['gpu'][-1])
    ctx = drv.Device(gpu_index).make_context()

    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    shape, dtype, h = sock.recv_pyobj()
    print('shared_x information received')

    # Remote buffer opened through IPC, plus a local staging buffer
    remote_handle = drv.IPCMemoryHandle(h)
    gpu_data_remote = gpuarray.GPUArray(shape, dtype, gpudata=remote_handle)
    gpu_data = gpuarray.GPUArray(shape, dtype)

    img_mean = from_trainer.get()
    print('img_mean received')

    while True:
        # Name of the next hkl file to load
        hkl_name = from_trainer.get()
        data = hkl.load(hkl_name) - img_mean

        if do_rand_proc:
            param_rand = from_trainer.get()
            data = crop_and_mirror(data, param_rand, flag_batch=batch_mode)

        gpu_data.set(data)

        # wait for computation on last minibatch to finish
        msg = from_trainer.get()
        assert msg == 'calc_finished'

        n_bytes = gpu_data.dtype.itemsize * gpu_data.size
        drv.memcpy_peer(gpu_data_remote.ptr, gpu_data.ptr, n_bytes, ctx, ctx)
        ctx.synchronize()

        to_trainer.put('copy_finished')
def bind_buffers(self):
    """
    Bind the kernel's tensor and open the sender's buffer through IPC.

    A ``TensorDescription`` in ``self.tensor`` is first replaced by the view
    returned from ``tensor_view_from_td``.  After the parent class has bound
    its buffers, the IPC handle sent by ``self.source`` (tag ``TAG_IPC``) is
    received over ``self.comm`` and opened into ``self.sender_buf``.
    """
    tensor = self.tensor
    if isinstance(tensor, TensorDescription):
        self.tensor = self.tensor_view_from_td(tensor)

    super(CudaRecvKernel, self).bind_buffers()

    self.sender_buf = drv.IPCMemoryHandle(
        self.comm.recv(source=self.source, tag=TAG_IPC))
def fun_load(config, sock_data_2=5001): send_queue = config['queue_l2t'] recv_queue = config['queue_t2l'] # recv_queue and send_queue are multiprocessing.Queue # recv_queue is only for receiving # send_queue is only for sending num_timesteps = config['num_timesteps'] num_seq = config['num_seq'] img_scale_x = config['img_scale_x'] img_scale_y = config['img_scale_y'] drv.init() dev = drv.Device(int(config['gpu'][-1])) ctx_2 = dev.make_context() sock_2 = zmq.Context().socket(zmq.PAIR) sock_2.bind('tcp://*:{0}'.format(sock_data_2)) shape_temporal, dtype_temporal, h_temporal = sock_2.recv_pyobj() print 'shared_x information received', shape_temporal gpu_data_remote_temporal = gpuarray.GPUArray( shape_temporal, dtype_temporal, gpudata=drv.IPCMemoryHandle(h_temporal)) gpu_data_temporal = gpuarray.GPUArray(shape_temporal, dtype_temporal) # print 'img_mean received' # The first time, do the set ups and other stuff # receive information for loading while True: video_name_temporal = recv_queue.get() rand_param = recv_queue.get() if config['modal'] == 'rgb': data_temporal = prepare_data_rgb(video_name_temporal, num_timesteps, num_seq, rand_param, data_shape=(img_scale_x, img_scale_y, 3)) else: data_temporal = prepare_data_flow(video_name_temporal, num_timesteps, num_seq, rand_param, data_shape=(img_scale_x, img_scale_y)) gpu_data_temporal.set(data_temporal) # wait for computation on last minibatch to finish msg = recv_queue.get() assert msg == 'calc_finished' drv.memcpy_peer( gpu_data_remote_temporal.ptr, gpu_data_temporal.ptr, gpu_data_temporal.dtype.itemsize * gpu_data_temporal.size, ctx_2, ctx_2) ctx_2.synchronize() send_queue.put('copy_finished')
def controller(h):
    """Open the IPC memory handle ``h`` on GPU 0 and print its contents.

    A CUDA context is created on device 0, ``h`` is converted with
    ``bytearray(bytes(h))`` and opened as an IPC memory handle, and the
    memory is viewed as a ``(1, 32)`` ``int8`` GPU array whose host copy is
    printed.  The context is popped even when one of these steps raises, so
    a failed request does not leave a context on the stack.

    Args:
        h: IPC handle received from the peer; anything ``bytes()`` accepts.

    Returns:
        bytes: the fixed reply ``b'hi back!'``.
    """
    drv.init()
    dev = drv.Device(0)
    ctx_gpu = dev.make_context()
    try:
        print('receive handler')
        raw_handle = bytearray(bytes(h))
        print(raw_handle)

        x_ptr = drv.IPCMemoryHandle(raw_handle)
        print('server gpu type: ')
        print(type(x_ptr))

        x_gpu = gpuarray.GPUArray((1, 32), numpy.int8, gpudata=x_ptr)
        print('gpu: ', x_gpu.get())
    finally:
        # Always release the context created above
        ctx_gpu.pop()
    return b'hi back!'
def proc2():
    """Receive an IPC handle on port 5000 and print the shared GPU array.

    A zmq REP socket is bound on port 5000 and a CUDA context is created on
    device 0.  The received ``(shape, dtype, handle)`` object is answered
    with an empty string, the handle is opened and wrapped in a ``GPUArray``
    and the array is printed.  The context is detached even when one of
    these steps raises.
    """
    sock = zmq.Context().socket(zmq.REP)
    sock.bind('tcp://*:5000')

    drv.init()
    dev = drv.Device(0)
    ctx = dev.make_context()
    try:
        shape, dtype, h = sock.recv_pyobj()
        # REP sockets must reply before the next receive; content is unused
        sock.send_pyobj('')

        x_gpu = gpuarray.GPUArray(shape, dtype,
                                  gpudata=drv.IPCMemoryHandle(h))
        print(x_gpu)
    finally:
        # Always release the context created above
        ctx.detach()
def setup_ipc_handle(op, comm, cmd, handle=None, dest=None):
    """Send a buffer handle to each destination, or receive one.

    With ``cmd == 'send'`` a ``(local, handle)`` pair is sent to every entry
    of ``dest`` with tag ``TAG_IPC``.  A destination equal to
    ``op.metadata['device_id']`` gets ``int(handle)`` with ``local=True``;
    any other gets an IPC handle from ``drv.mem_get_ipc_handle`` with
    ``local=False``.  Nothing is returned in this mode.

    With any other ``cmd`` a pair is received from ``op.source_id``.  The
    received value is returned unchanged when ``local`` is set, otherwise it
    is opened with ``drv.IPCMemoryHandle`` first.
    """
    if cmd != 'send':
        local, buf_ipc_hdl = comm.recv(source=op.source_id, tag=TAG_IPC)
        if not local:
            return drv.IPCMemoryHandle(buf_ipc_hdl)
        return buf_ipc_hdl

    for d in dest:
        target = int(d)
        local = bool(op.metadata['device_id'] == target)
        if local:
            # Same device: the plain integer handle is enough
            payload = int(handle)
        else:
            payload = drv.mem_get_ipc_handle(handle)
        comm.send((local, payload), dest=target, tag=TAG_IPC)
def func2():
    """Receive an IPC handle on port 6000 and print the shared GPU memory.

    A CUDA context is created on device 0 and a zmq REP socket is bound on
    port 6000.  The received object is opened as an IPC memory handle and
    viewed as a ``(1, 32)`` ``int8`` GPU array whose host copy is printed.
    The context is popped even when one of these steps raises.
    """
    drv.init()
    dev = drv.Device(0)
    ctx_gpu = dev.make_context()
    try:
        ctx = zmq.Context()
        sock = ctx.socket(zmq.REP)
        sock.bind('tcp://*:6000')

        h = sock.recv_pyobj()
        x_ptr = drv.IPCMemoryHandle(h)
        x_gpu = gpuarray.GPUArray((1, 32), numpy.int8, gpudata=x_ptr)
        print('gpu: ', x_gpu.get())
    finally:
        # Always release the context created above
        ctx_gpu.pop()
def get(self):
    """Build a GPUArray described by this object's IPC handle metadata.

    An empty array of ``self.dtype`` is created and its ``shape``, ``size``
    and ``mem_size`` are overwritten with the stored values; the raw handle
    is attached as ``ipc_handle``.

    Returns:
        ~pycuda.gpuarray.GPUArray: Array carrying the recorded shape, size,
        memory size and IPC handle.

    .. note::
        Nothing here guards against data races between the processes
        sharing the memory.
    """
    # NOTE(review): the opened handle is not kept or attached to the array
    # here -- confirm that discarding the result is intended.
    drv.IPCMemoryHandle(self.handle)

    shared = gpuarray.GPUArray((0, ), dtype=self.dtype)
    shared.shape = self.shape
    shared.size = self.size
    shared.mem_size = self.mem_size
    shared.ipc_handle = self.handle
    return shared
def ipc_handle_wrap(self, handle):
    """Open ``handle`` with ``cuda.IPCMemoryHandle`` and return the result."""
    opened = cuda.IPCMemoryHandle(handle)
    return opened
def train_net(config, private_config): # UNPACK CONFIGS (train_videos_spatial_jhmdb,val_videos_spatial_jhmdb,train_videos_temporal_jhmdb,val_videos_temporal_jhmdb, train_targets,val_targets, train_labels_jhmdb,val_labels_jhmdb) = unpack_configs_jhmdb(config,gpu_id=private_config['gpu_id']) # print('val_len',len(val_videos_spatial_jhmdb),'train_len',len(train_videos_spatial_jhmdb)) if config['modal']=='rgb': train_videos = list(train_videos_spatial_jhmdb) test_videos = list(val_videos_spatial_jhmdb) else: train_videos = list(train_videos_temporal_jhmdb) test_videos = list(val_videos_temporal_jhmdb) print('jhmdb_len',len(train_videos),len(train_labels_jhmdb))#,len(tr_video_length_jhmdb)) flag_para_load =config['para_load'] gpu_send_queue = private_config['queue_gpu_send'] gpu_recv_queue = private_config['queue_gpu_recv'] # pycuda and zmq set up drv.init() dev = drv.Device(int(private_config['gpu'][-1])) ctx = dev.make_context() sock_gpu = zmq.Context().socket(zmq.PAIR) if private_config['flag_client']: sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij,'cost_nll:',cost_nll,'cost_attention:',cost_att if config['print_train_error']: error_ij = train_error() gpu_send_queue.put(error_ij) that_error = gpu_recv_queue.get() error_ij = (error_ij + that_error) / 2. 
if private_config['flag_verbose']: print 'training error rate:', error_ij if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') if count%20 == 0: e = time.time() print "time per 20 iter:", (e - s) # ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_val_error, this_val_loss = get_test_error(config, shared_x, shared_mask, shared_y,shared_target,shared_use_noise, shared_conv,test_videos, val_labels_jhmdb, flag_para_load, batch_size,num_seq, validate_model_lstm,train_model, send_queue=load_send_queue, recv_queue=load_recv_queue) # report validation stats gpu_send_queue.put(this_val_error) that_val_error = gpu_recv_queue.get() this_val_error = (this_val_error + that_val_error) / 2. gpu_send_queue.put(this_val_loss) that_val_loss = gpu_recv_queue.get() this_val_loss = (this_val_loss + that_val_loss) / 2. if private_config['flag_verbose']: print('epoch %i: test loss of jhmdb %f ' % (epoch, this_val_loss)) print('epoch %i: test error of jhmdb %f %%' % (epoch, this_val_error * 100.)) val_record.append([this_val_error, this_val_loss]) if private_config['flag_save']: np.save(config['weights_dir'] + 'test_record_jhmdb.npy', val_record) DropoutLayer.SetDropoutOn() ########################################### # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save Weights, only one of them will do if private_config['flag_save'] : if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
def main():
    """Run a sample query through the connector and print the result.

    Steps: connect the ``PyConnector`` client, create table ``nation``,
    export two sample device buffers as IPC handles (data and validity),
    run ``select id > 5 from main.nation`` against them, print the result
    metadata and each result column (opened through its IPC handle), free
    the result and close the connection.

    ``Error`` exceptions raised by the client are printed and execution
    continues, as before.  The CUDA context created at the start is popped
    even when an unexpected exception escapes.
    """
    client = PyConnector('/tmp/orchestrator.socket', '/tmp/ral.socket')

    cuda.init()
    dev = cuda.Device(0)
    ctx_gpu = dev.make_context()
    try:
        try:
            client.connect()
        except Error as err:
            print(err)

        try:
            client.run_ddl_create_table('nation', ['id'], ['GDF_INT8'],
                                        'main')
        except Error as err:
            print(err)

        # Sample column data and validity buffers, exported as raw bytes
        data_gpu, data_sz = create_sample_device_data()
        data_handler = bytes(cuda.mem_get_ipc_handle(data_gpu))
        valid_gpu, data_sz = create_sample_device_data()
        valid_handler = bytes(cuda.mem_get_ipc_handle(valid_gpu))

        try:
            tableGroup = {
                'tables': [{
                    'name': 'main.nation',
                    'columns': [{
                        'data': data_handler,
                        'valid': valid_handler,
                        'size': data_sz,
                        'dtype': 1,
                        'null_count': 0,
                        'dtype_info': 0
                    }],
                    'columnNames': ['id']
                }],
                'name': 'main',
            }
            resultSet = client.run_dml_query(
                'select id > 5 from main.nation', tableGroup)

            print("#RESULT_SET:")
            print('GetResult Response')
            print(' metadata:')
            print(' status: %s' % resultSet.metadata.status)
            print(' message: %s' % resultSet.metadata.message)
            print(' time: %s' % resultSet.metadata.time)
            print(' rows: %s' % resultSet.metadata.rows)
            print(' columnNames: %s' % list(resultSet.columnNames))

            for column in resultSet.columns:
                # x_ptr: device raw pointer
                x_ptr = cuda.IPCMemoryHandle(column.data)
                x_gpu = gpuarray.GPUArray((1, column.size), numpy.int8,
                                          gpudata=x_ptr)
                print('\tgpu: ', x_gpu.get())

            # NOTE(review): placed after the column loop; the collapsed
            # source does not show the nesting -- confirm.
            print("#RESULT_SET:")
            resultSet = client.free_result(123456)
        except Error as err:
            print(err)

        # try:
        #     client.run_ddl_drop_table('User', 'main')
        # except Error as err:
        #     print(err)

        client.close_connection()
    finally:
        # Always release the context created above
        ctx_gpu.pop()
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    Train an MLP in one of two cooperating GPU processes, averaging the
    parameters with the other process after each local training step.

    shared_args contains neural network parameters
    (keys read here: 'learning_rate', 'n_epochs', 'dataset', 'batch_size',
    'L1_reg', 'L2_reg', 'n_hidden')

    private_args contains parameters for process run on each gpu
    (keys read here: 'ind_gpu', 'flag_client', 'gpu', 'mod', 'verbose')

    this_queue and that_queue are used for synchronization between
    processes: this process puts on this_queue and gets from that_queue.

    NOTE(review): this block was recovered from source whose indentation
    was lost; the nesting of the training loop below is a reconstruction
    and should be confirmed against the original file.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    # One process connects and the other binds, both on port 5000
    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    # (done inside the function, after the pycuda context is created and
    # the device is selected with theano.sandbox.cuda.use)
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils
    ####

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    # NOTE(review): relies on Python 2 integer division; under Python 3 `/`
    # would yield a float here.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28 * 28,
                     n_hidden=n_hidden, n_out=10)

    # Negative log likelihood plus L1 and squared-L2 regularisation terms
    cost = (classifier.negative_log_likelihood(y) +
            L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)

    # Error on one validation minibatch selected by `index`
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    gparams = [T.grad(cost, param) for param in classifier.params]

    # Plain gradient-descent step for every parameter
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    # One training step on the minibatch selected by `index`; returns cost
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ####
    # setting pycuda and
    # pass handles, only done once

    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared
    # variable on this gpu

    param_other_list = []
    # a list of theano shared variables that are used to store values of
    # theano shared variable from the other gpu

    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in
    # param_other_list

    h_list = []
    # a list of pycuda IPC handles

    shape_list = []
    # a list containing shapes of variables in param_ga_list

    dtype_list = []
    # a list containing dtypes of variables in param_ga_list

    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        # Local copy that will receive the other process's value
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        # IPC handle for this parameter's memory, sent to the peer below
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        # Replaces param with the mean of param and param_other
        average_fun = \
            theano.function([], updates=[(param,
                                          (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # Handshake with the other process before timing starts
    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0

    # NOTE(review): nesting of the statements inside this loop is
    # reconstructed -- confirm which ones belong under the `if` / `for`.
    while epoch < n_epochs:
        epoch = epoch + 1

        for minibatch_index in xrange(n_train_batches):

            # Each process trains only on the minibatches whose index
            # parity equals its private_args['mod']
            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                this_queue.put('')
                that_queue.get()

                # exchanging weights
                # (copy each remote parameter into the local param_other
                # buffer)
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list,
                            param_ga_remote_list):

                    drv.memcpy_peer(
                        param_ga_other.ptr,
                        param_ga_remote.ptr,
                        param_ga_remote.dtype.itemsize *
                        param_ga_remote.size,
                        ctx, ctx)

                ctx.synchronize()

                this_queue.put('')
                that_queue.get()

                # Average local parameters with the copies just fetched
                for average_fun in average_fun_list:
                    average_fun()

        if private_args['verbose']:
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    # Final handshake with the other process
    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.1fs' % ((end_time - start_time)))
def train_net(config, private_config): # UNPACK CONFIGS (flag_para_load, flag_datalayer, train_filenames, val_filenames, train_labels, val_labels, img_mean) = \ unpack_configs(config, ext_data=private_config['ext_data'], ext_label=private_config['ext_label']) gpu_send_queue = private_config['queue_gpu_send'] gpu_recv_queue = private_config['queue_gpu_recv'] # pycuda and zmq set up drv.init() dev = drv.Device(int(private_config['gpu'][-1])) ctx = dev.make_context() sock_gpu = zmq.Context().socket(zmq.PAIR) if private_config['flag_client']: sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter print 'training cost:', cost_ij if config['print_train_error']: error_ij = train_error() gpu_send_queue.put(error_ij) that_error = gpu_recv_queue.get() error_ij = (error_ij + that_error) / 2. if private_config['flag_verbose']: print 'training error rate:', error_ij if flag_para_load and (count < len(minibatch_range)): load_send_queue.put('calc_finished') ############### Test on Validation Set ################## DropoutLayer.SetDropoutOff() this_val_error, this_val_loss = get_val_error_loss( rand_arr, shared_x, shared_y, val_filenames, val_labels, flag_datalayer, flag_para_load, batch_size, validate_model, send_queue=load_send_queue, recv_queue=load_recv_queue) # report validation stats gpu_send_queue.put(this_val_error) that_val_error = gpu_recv_queue.get() this_val_error = (this_val_error + that_val_error) / 2. gpu_send_queue.put(this_val_loss) that_val_loss = gpu_recv_queue.get() this_val_loss = (this_val_loss + that_val_loss) / 2. 
if private_config['flag_verbose']: print('epoch %i: validation loss %f ' % (epoch, this_val_loss)) print('epoch %i: validation error %f %%' % (epoch, this_val_error * 100.)) val_record.append([this_val_error, this_val_loss]) if private_config['flag_save']: np.save(config['weights_dir'] + 'val_record.npy', val_record) DropoutLayer.SetDropoutOn() ############################################ # Adapt Learning Rate step_idx = adjust_learning_rate(config, epoch, step_idx, val_record, learning_rate) # Save Weights, only one of them will do if private_config['flag_save']: if epoch % config['snapshot_freq'] == 0: save_weights(layers, config['weights_dir'], epoch) np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy', learning_rate.get_value()) save_momentums(vels, config['weights_dir'], epoch) print('Optimization complete.')
sock = zmq.Context().socket(zmq.PAIR) try: sock.bind('tcp://*:{0}'.format(config['sock_data'])) except zmq.error.ZMQError: print '[load] rank %d port %d zmq error' % (rank,config['sock_data']) sock.close() zmq.Context().term() raise finally: pass shape, dtype, h = sock.recv_pyobj() if verbose: print '[load] 1. shared_x information received' gpu_data_remote = gpuarray.GPUArray(shape, dtype, gpudata=drv.IPCMemoryHandle(h)) gpu_data = gpuarray.GPUArray(shape, dtype) img_mean = icomm.recv(source=MPI.ANY_SOURCE, tag=66) if verbose: print '[load] 2. img_mean received' count=0 mode=None import time while True: # 3. load the very first filename in 'train' or 'val' mode message = icomm.recv(source=0, tag=40) if message == 'stop': break