def init_buffers(self):
        shape = self.op.args[0].tensor_description().shape
        dtype = self.op.args[0].tensor_description().dtype

        # Allocate output and scratch buffers
        self.output_buff = gpuarray.zeros(shape, dtype)
        self.scratch_buff = gpuarray.zeros(shape, dtype)

        self.output_buff_dict[self.device_id] = self.output_buff.gpudata
        self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

        # Allocate IPC handles
        output_ipc_hdl = drv.mem_get_ipc_handle(self.output_buff.gpudata)
        scratch_ipc_hdl = drv.mem_get_ipc_handle(self.scratch_buff.gpudata)
        event_ipc_hdl = self.event.ipc_handle()

        # Broadcast handles to others and collect theirs; every rank must
        # call bcast in the same device order for the collectives to match
        msg = (self.device_id, output_ipc_hdl, scratch_ipc_hdl, event_ipc_hdl)
        for i in self.op.device_ids:
            if i == self.device_id:
                self.comm.bcast(msg, root=i)
            else:
                (peer_id, output_ipc_hdl, scratch_ipc_hdl, event_ipc_hdl) = \
                    self.comm.bcast(None, root=i)
                output_hdl = drv.IPCMemoryHandle(output_ipc_hdl)
                scratch_hdl = drv.IPCMemoryHandle(scratch_ipc_hdl)
                event_hdl = drv.Event.from_ipc_handle(event_ipc_hdl)
                self.output_buff_dict[peer_id] = output_hdl
                self.scratch_buff_dict[peer_id] = scratch_hdl
                self.event_buff_dict[peer_id] = event_hdl
Example 2
    def init_buffers(self):
        shape = self.op.args[0].tensor_description().shape
        dtype = self.op.args[0].tensor_description().dtype

        n_devs = len(self.op.device_ids)
        size = self.op.args[0].tensor_description().axes.size
        segment_size = calculate_segment_size(size, n_devs)

        # Allocate output and scratch buffers
        self.output_buff = gpuarray.zeros(shape, dtype)
        self.scratch_buff = gpuarray.zeros(segment_size * n_devs, dtype)

        self.output_buff_dict[self.device_id] = self.output_buff.gpudata
        self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

        # Allocate IPC handles
        output_ipc_hdl = drv.mem_get_ipc_handle(self.output_buff.gpudata)
        scratch_ipc_hdl = drv.mem_get_ipc_handle(self.scratch_buff.gpudata)
        event_ipc_hdl = self.event.ipc_handle()

        # Broadcast handles to others
        msg = (self.device_id, output_ipc_hdl, scratch_ipc_hdl, event_ipc_hdl)
        for i in self.device_ids:
            if i == self.device_id:
                self.comm.bcast(msg, root=i)
            else:
                (peer_id, output_ipc_hdl, scratch_ipc_hdl,
                 event_ipc_hdl) = self.comm.bcast(None, root=i)

                output_hdl = drv.IPCMemoryHandle(output_ipc_hdl)
                scratch_hdl = drv.IPCMemoryHandle(scratch_ipc_hdl)
                event_hdl = drv.Event.from_ipc_handle(event_ipc_hdl)
                self.output_buff_dict[peer_id] = output_hdl
                self.scratch_buff_dict[peer_id] = scratch_hdl
                self.event_buff_dict[peer_id] = event_hdl
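The helper calculate_segment_size is not shown in this snippet; a plausible stand-in, assuming it simply splits the flattened tensor into equal per-device segments (an assumption, not the original implementation):

import math

def calculate_segment_size(size, n_devs):
    # Assumed behaviour: ceil-divide so that segment_size * n_devs >= size,
    # which is why the scratch buffer above is sized segment_size * n_devs.
    return int(math.ceil(float(size) / n_devs))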
Example 3
    def init_buffers(self):
        shape = self.op.args[0].tensor_description().shape
        dtype = self.op.args[0].tensor_description().dtype

        # Allocate output and scratch buffers
        self.output_buff = gpuarray.zeros(shape, dtype)
        self.scratch_buff = gpuarray.zeros(shape, dtype)

        self.output_buff_dict[self.device_id] = self.output_buff.gpudata
        self.scratch_buff_dict[self.device_id] = self.scratch_buff.gpudata

        # Allocate IPC handles
        output_ipc_hdl = drv.mem_get_ipc_handle(self.output_buff.gpudata)
        scratch_ipc_hdl = drv.mem_get_ipc_handle(self.scratch_buff.gpudata)
        event_ipc_hdl = self.event.ipc_handle()

        # Put handles in queues
        for i in self.op.shared_queues.keys():
            if i != self.device_id:
                self.op.shared_queues[i].put((self.device_id, output_ipc_hdl,
                                              scratch_ipc_hdl, event_ipc_hdl))

        # Get handles from others
        q = self.op.shared_queues[self.device_id]
        for i in range(len(self.op.shared_queues) - 1):
            peer_id, output_ipc_hdl, scratch_ipc_hdl, event_ipc_hdl = q.get()
            output_hdl = drv.IPCMemoryHandle(output_ipc_hdl)
            scratch_hdl = drv.IPCMemoryHandle(scratch_ipc_hdl)
            event_hdl = drv.Event.from_ipc_handle(event_ipc_hdl)
            self.output_buff_dict[peer_id] = output_hdl
            self.scratch_buff_dict[peer_id] = scratch_hdl
            self.event_buff_dict[peer_id] = event_hdl
Example 4
def set_ipc_handle(op, shared_queue, handle):
    lock = drv.mem_alloc(1)
    drv.memset_d8(lock, 0, 1)
    buf_ipc_hdl = drv.mem_get_ipc_handle(handle)
    lock_ipc_hdl = drv.mem_get_ipc_handle(lock)
    shared_queue.put((buf_ipc_hdl, lock_ipc_hdl))
    return lock
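A possible consumer-side counterpart to the helper above (a sketch only; it assumes the peer process reads the same shared_queue, and get_ipc_handle is not part of the original source):

import pycuda.driver as drv

def get_ipc_handle(shared_queue):
    # Open the buffer and the one-byte lock exported by set_ipc_handle().
    buf_ipc_hdl, lock_ipc_hdl = shared_queue.get()
    buf = drv.IPCMemoryHandle(buf_ipc_hdl)
    lock = drv.IPCMemoryHandle(lock_ipc_hdl)
    return buf, lock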
Example 5
def set_ipc_handle(op, shared_queue, handle, local=False):
    lock = drv.mem_alloc(1)
    drv.memset_d8(lock, 0, 1)
    if local:
        buf_ipc_hdl = int(handle)
        lock_ipc_hdl = int(lock)
    else:
        buf_ipc_hdl = drv.mem_get_ipc_handle(handle)
        lock_ipc_hdl = drv.mem_get_ipc_handle(lock)
    shared_queue.put((local, buf_ipc_hdl, lock_ipc_hdl))
    return lock
Example 6
def bcast_ipc_handle(comm, handle=None):
    if handle is not None:
        buffer_ipc_handle = drv.mem_get_ipc_handle(handle)
        return comm.bcast(buffer_ipc_handle)
    else:
        handle = comm.bcast(handle)
        return drv.IPCMemoryHandle(handle)
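For orientation, a minimal usage sketch of bcast_ipc_handle, assuming an mpi4py communicator with the buffer owned by rank 0 and one GPU per rank (the names below are illustrative):

from mpi4py import MPI
import pycuda.driver as drv

comm = MPI.COMM_WORLD
drv.init()
ctx = drv.Device(comm.rank).make_context()

if comm.rank == 0:
    dev_buf = drv.mem_alloc(1024)              # allocation to be shared
    bcast_ipc_handle(comm, handle=dev_buf)     # root broadcasts its IPC handle
else:
    peer_buf = bcast_ipc_handle(comm)          # peers open the shared memory

ctx.pop()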
Example 7
    def para_load_init(self):
        
        # 0. send config dict (can't carry any special objects) to loading process
        
        self.icomm.isend(self.config, dest=0, tag=99)

        drv = self.drv
        shared_x = self.model.shared_x
        img_mean = self.data[4]

        sock_data = self.config['sock_data']
        
        import zmq
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://localhost:{0}'.format(sock_data))
        
        #import theano.sandbox.cuda
        #theano.sandbox.cuda.use(config.device)
        import theano.misc.pycuda_init
        import theano.misc.pycuda_utils
        # pass ipc handle and related information
        gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
            shared_x.container.value)
        h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
        # 1. send ipc handle of shared_x
        sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))

        # 2. send img_mean
        self.icomm.send(img_mean, dest=0, tag=66)
Example 8
    def para_load_init(self):

        # 0. send config dict (can't carry any special objects) to loading process

        self.icomm.isend(self.config, dest=0, tag=99)

        drv = self.drv
        shared_x = self.model.shared_x
        img_mean = self.data[4]

        sock_data = self.config['sock_data']

        import zmq
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://localhost:{0}'.format(sock_data))

        #import theano.sandbox.cuda
        #theano.sandbox.cuda.use(config.device)
        import theano.misc.pycuda_init
        import theano.misc.pycuda_utils
        # pass ipc handle and related information
        gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
            shared_x.container.value)
        h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
        # 1. send ipc handle of shared_x
        sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))

        # 2. send img_mean
        self.icomm.send(img_mean, dest=0, tag=66)
Example 9
    def getGdfHandles(self, df):

        dev = drv.Device(0)

        gdf_handles = []
        for name, series in df._cols.items():
            # WSM TODO add if statement for valid != nullptr
            gdf_handles.append(
                gdfHandles(
                    drv.mem_get_ipc_handle(series._column.cffi_view.data),
                    drv.mem_get_ipc_handle(series._column.cffi_view.valid),
                    series._column.cffi_view.size,
                    series._column.cffi_view.null_count,
                    series._column.cffi_view.dtype,
                    series._column.cffi_view.dtype_info, name, None))

        return gdf_handles
Example 10
def proc1():
    sock = zmq.Context().socket(zmq.REQ)
    sock.connect('tcp://localhost:5000')

    drv.init()
    dev = drv.Device(0)
    ctx = dev.make_context()

    x_gpu = gpuarray.to_gpu(np.random.rand(8))
    h = drv.mem_get_ipc_handle(x_gpu.ptr)
    sock.send_pyobj((x_gpu.shape, x_gpu.dtype, h))
    sock.recv_pyobj()

    ctx.detach()
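The matching receiver process is not shown; a hedged sketch of what it could look like (the name proc2 is an assumption, while the port and handshake mirror the snippet above):

import zmq
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

def proc2():
    sock = zmq.Context().socket(zmq.REP)
    sock.bind('tcp://*:5000')

    drv.init()
    ctx = drv.Device(0).make_context()       # must run in a different process than proc1

    shape, dtype, h = sock.recv_pyobj()      # (shape, dtype, ipc handle) from proc1
    x_ptr = drv.IPCMemoryHandle(h)           # map proc1's allocation into this context
    x_gpu = gpuarray.GPUArray(shape, dtype, gpudata=x_ptr)
    print(x_gpu.get())                       # read the shared array

    sock.send_pyobj('done')                  # unblock proc1's recv_pyobj()
    ctx.detach()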
Example 11
def func1():
    drv.init()
    dev = drv.Device(0)
    ctx_gpu = dev.make_context()

    ctx = zmq.Context()
    sock = ctx.socket(zmq.REQ)
    sock.connect('tcp://localhost:6000')

    x_gpu = create_sample_device_data()
    h = drv.mem_get_ipc_handle(x_gpu)
    sock.send_pyobj(h)

    ctx_gpu.pop()
Example 12
def setup_ipc_handle(op, comm, cmd, handle=None, dest=None):
    if cmd == 'send':
        for d in dest:
            if op.metadata['device_id'] == int(d):
                local = True
                buf_ipc_hdl = int(handle)
            else:
                local = False
                buf_ipc_hdl = drv.mem_get_ipc_handle(handle)
            comm.send((local, buf_ipc_hdl), dest=int(d), tag=TAG_IPC)
    else:
        (local, buf_ipc_hdl) = comm.recv(source=op.source_id, tag=TAG_IPC)
        if local:
            return (buf_ipc_hdl)
        else:
            return (drv.IPCMemoryHandle(buf_ipc_hdl))
Example 13
    def __init__(self, array):
        """Creates an IPC memory handle of the device array.

        Args:
            array (~pycuda.gpuarray.GPUArray): GPU array to be shared
                across processes.

        """
        if isinstance(array, drv.IPCMemoryHandle):
            # do not doubly extract IPC memory handle
            self.handle = array.ipc_handle
        else:
            self.handle = drv.mem_get_ipc_handle(array.ptr)

        self.shape = array.shape
        self.dtype = array.dtype
        self.size = array.size
        self.mem_size = array.mem_size
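A consumer-side helper for this wrapper might look like the following (a sketch; open_ipc_array is hypothetical and not a method of the original class):

import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

def open_ipc_array(ipc_obj):
    # ipc_obj is assumed to carry .handle, .shape and .dtype as set in
    # __init__ above; it must be opened from a different process than the
    # one that exported the handle.
    mem = drv.IPCMemoryHandle(ipc_obj.handle)
    return gpuarray.GPUArray(ipc_obj.shape, ipc_obj.dtype, gpudata=mem)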
Example 14
def client():
    drv.init()
    dev = drv.Device(0)
    ctx_gpu = dev.make_context()
    connection = blazingdb.protocol.UnixSocketConnection(unix_path)
    sock = blazingdb.protocol.Client(connection)

    x_gpu = create_sample_device_data()
    print('gpu type: ')
    print(type(x_gpu))
    h = drv.mem_get_ipc_handle(x_gpu)

    print('send handler')
    print(h)

    res = sock.send(bytes(h))
    print(res)
    ctx_gpu.pop()
Example 15
    def __init__(self, array):
        """Creates an IPC memory handle of the device array.

        Args:
            array (~pycuda.gpuarray.GPUArray): GPU array to be shared
                across processes.

        """
        if isinstance(array, drv.IPCMemoryHandle):
            # do not doubly extract IPC memory handle
            self.handle = array.ipc_handle
        else:
            self.handle = drv.mem_get_ipc_handle(array.ptr)

        self.shape    = array.shape
        self.dtype    = array.dtype
        self.size     = array.size
        self.mem_size = array.mem_size
Example 16
def para_load_init(queue_dict, drv, shared_x, img_mean):
    
    sock_data = queue_dict['sock_data']
    load_send_queue = queue_dict['queue_t2l']
    load_recv_queue = queue_dict['queue_l2t']
    
    import zmq
    sock = zmq.Context().socket(zmq.PAIR)
    sock.connect('tcp://localhost:{0}'.format(sock_data))
    
    #import theano.sandbox.cuda
    #theano.sandbox.cuda.use(config.device)
    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils
    # pass ipc handle and related information
    gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
        shared_x.container.value)
    h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
    # 1. send ipc handle of shared_x
    sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))

    # 2. send img_mean
    load_send_queue.put(img_mean)
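A hedged sketch of the loading-process side of this handshake (it assumes the same zmq PAIR port and a separate CUDA context on the same physical GPU; para_load_recv and the zero-filled batch are illustrative only):

import numpy as np
import zmq
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

def para_load_recv(sock_data):
    sock = zmq.Context().socket(zmq.PAIR)
    sock.bind('tcp://*:{0}'.format(sock_data))

    drv.init()
    ctx = drv.Device(0).make_context()

    shape, dtype, h = sock.recv_pyobj()        # handle of shared_x sent above
    gpu_batch = gpuarray.GPUArray(shape, dtype,
                                  gpudata=drv.IPCMemoryHandle(h))

    batch = np.zeros(shape, dtype)             # stands in for a batch loaded from disk
    gpu_batch.set(batch)                       # write straight into shared_x on the GPU
    ctx.pop()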
Example 17
    def ipc_handle(self, addr):
        return cuda.mem_get_ipc_handle(addr)
Example 18
def train_net(config, private_config):

    # UNPACK CONFIGS
    (train_videos_spatial_jhmdb,val_videos_spatial_jhmdb,train_videos_temporal_jhmdb,val_videos_temporal_jhmdb,
     train_targets,val_targets,
           train_labels_jhmdb,val_labels_jhmdb) = unpack_configs_jhmdb(config,gpu_id=private_config['gpu_id'])
    # print('val_len',len(val_videos_spatial_jhmdb),'train_len',len(train_videos_spatial_jhmdb))
    if config['modal']=='rgb':
        train_videos = list(train_videos_spatial_jhmdb)
        test_videos = list(val_videos_spatial_jhmdb)
    else:
        train_videos = list(train_videos_temporal_jhmdb)
        test_videos = list(val_videos_temporal_jhmdb)
    print('jhmdb_len',len(train_videos),len(train_labels_jhmdb))#,len(tr_video_length_jhmdb))
    flag_para_load =config['para_load']
    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                        print 'training cost:', cost_ij,'cost_nll:',cost_nll,'cost_attention:',cost_att

                    if config['print_train_error']:
                        error_ij = train_error()

                        gpu_send_queue.put(error_ij)
                        that_error = gpu_recv_queue.get()
                        error_ij = (error_ij + that_error) / 2.

                        if private_config['flag_verbose']:
                            print 'training error rate:', error_ij

                if flag_para_load and (count < len(minibatch_range)):
                    load_send_queue.put('calc_finished')

                if count%20 == 0:
                    e = time.time()
                    print "time per 20 iter:", (e - s)
            # ############### Test on Validation Set ##################
            DropoutLayer.SetDropoutOff()
            this_val_error, this_val_loss = get_test_error(config,
                 shared_x, shared_mask, shared_y,shared_target,shared_use_noise,
                 shared_conv,test_videos,  val_labels_jhmdb,
                flag_para_load,
                batch_size,num_seq, validate_model_lstm,train_model,
                send_queue=load_send_queue, recv_queue=load_recv_queue)

            # report validation stats
            gpu_send_queue.put(this_val_error)
            that_val_error = gpu_recv_queue.get()
            this_val_error = (this_val_error + that_val_error) / 2.

            gpu_send_queue.put(this_val_loss)
            that_val_loss = gpu_recv_queue.get()
            this_val_loss = (this_val_loss + that_val_loss) / 2.

            if private_config['flag_verbose']:
                print('epoch %i: test loss of jhmdb %f ' %
                      (epoch, this_val_loss))
                print('epoch %i: test error of jhmdb %f %%' %
                      (epoch, this_val_error * 100.))
            val_record.append([this_val_error, this_val_loss])
            if private_config['flag_save']:
                np.save(config['weights_dir'] + 'test_record_jhmdb.npy', val_record)

            DropoutLayer.SetDropoutOn()
            ###########################################
            # Adapt Learning Rate
            step_idx = adjust_learning_rate(config, epoch, step_idx,
                                            val_record, learning_rate)
            # Save Weights, only one of them will do
            if private_config['flag_save'] :
                if epoch % config['snapshot_freq'] == 0:
                    save_weights(layers, config['weights_dir'], epoch)
                    np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                            learning_rate.get_value())
                    save_momentums(vels, config['weights_dir'], epoch)
        print('Optimization complete.')
Example 19
def main():

    client = PyConnector('/tmp/orchestrator.socket', '/tmp/ral.socket')

    cuda.init()
    dev = cuda.Device(0)
    ctx_gpu = dev.make_context()

    try:
        client.connect()
    except Error as err:
        print(err)

    try:
        client.run_ddl_create_table('nation', ['id'], ['GDF_INT8'], 'main')
    except Error as err:
        print(err)

    data_gpu, data_sz = create_sample_device_data()
    data_handler = bytes(cuda.mem_get_ipc_handle(data_gpu))
    valid_gpu, data_sz = create_sample_device_data()
    valid_handler = bytes(cuda.mem_get_ipc_handle(valid_gpu))

    try:
        tableGroup = {
            'tables': [{
                'name':
                'main.nation',
                'columns': [{
                    'data': data_handler,
                    'valid': valid_handler,
                    'size': data_sz,
                    'dtype': 1,
                    'null_count': 0,
                    'dtype_info': 0
                }],
                'columnNames': ['id']
            }],
            'name':
            'main',
        }
        resultSet = client.run_dml_query('select id > 5 from main.nation',
                                         tableGroup)

        print("#RESULT_SET:")
        print('GetResult Response')
        print('  metadata:')
        print('     status: %s' % resultSet.metadata.status)
        print('    message: %s' % resultSet.metadata.message)
        print('       time: %s' % resultSet.metadata.time)
        print('       rows: %s' % resultSet.metadata.rows)
        print('  columnNames: %s' % list(resultSet.columnNames))
        for i, column in enumerate(resultSet.columns):
            x_ptr = cuda.IPCMemoryHandle(
                column.data)  # x_ptr: device raw pointer
            x_gpu = gpuarray.GPUArray((1, column.size),
                                      numpy.int8,
                                      gpudata=x_ptr)
            print('\tgpu:  ', x_gpu.get())
        print("#RESULT_SET:")

        resultSet = client.free_result(123456)

    except Error as err:
        print(err)

    # try:
    #   client.run_ddl_drop_table('User', 'main')
    # except Error as err:
    #   print(err)

    client.close_connection()
    ctx_gpu.pop()
Example 20
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    shared_args 
    contains neural network parameters

    private_args
    contains parameters for process run on each gpu

    this_queue and that_queue are used for synchronization between processes.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    ####


    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng, input=x, n_in=28 * 28,
                     n_hidden=n_hidden, n_out=10)

    cost = (classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]}
    )

    gparams = [T.grad(cost, param) for param in classifier.params]

    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})
    ####
    # setting pycuda and
    # pass handles, only done once
    
    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu
    
    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu

    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list

    h_list = []
    # a list of pycuda IPC handles

    shape_list = []
    # a list containing shapes of variables in param_ga_list

    dtype_list = []
    # a list containing dtypes of variables in param_ga_list
    
    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        average_fun = \
            theano.function([], updates=[(param,
                                          (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####


    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0

    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)
                
                this_queue.put('')
                that_queue.get()

                # exchanging weights
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list,
                            param_ga_remote_list):

                    drv.memcpy_peer(param_ga_other.ptr,
                                    param_ga_remote.ptr,
                                    param_ga_remote.dtype.itemsize *
                                    param_ga_remote.size,
                                    ctx, ctx)                
                
                ctx.synchronize()
                this_queue.put('')
                that_queue.get()
                    
                for average_fun in average_fun_list:
                    average_fun()



        if private_args['verbose']:
            validation_losses = [validate_model(i) for i
                                 in xrange(n_valid_batches)]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.1fs' % ((end_time - start_time)))
Example 21
def fun_mlp(shared_args, private_args, this_queue, that_queue):
    '''
    shared_args 
    contains neural network parameters

    private_args
    contains parameters for process run on each gpu

    this_queue and that_queue are used for synchronization between processes.
    '''

    learning_rate = shared_args['learning_rate']
    n_epochs = shared_args['n_epochs']
    dataset = shared_args['dataset']
    batch_size = shared_args['batch_size']
    L1_reg = shared_args['L1_reg']
    L2_reg = shared_args['L2_reg']
    n_hidden = shared_args['n_hidden']

    ####
    # pycuda and zmq environment
    drv.init()
    dev = drv.Device(private_args['ind_gpu'])
    ctx = dev.make_context()
    sock = zmq.Context().socket(zmq.PAIR)

    if private_args['flag_client']:
        sock.connect('tcp://localhost:5000')
    else:
        sock.bind('tcp://*:5000')
    ####

    ####
    # import theano related
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(private_args['gpu'])

    import theano
    import theano.tensor as T

    from logistic_sgd import load_data
    from mlp import MLP

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    ####

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(1234)

    classifier = MLP(rng=rng,
                     input=x,
                     n_in=28 * 28,
                     n_hidden=n_hidden,
                     n_out=10)

    cost = (classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 +
            L2_reg * classifier.L2_sqr)

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    gparams = [T.grad(cost, param) for param in classifier.params]

    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(classifier.params, gparams)]

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    ####
    # setting pycuda and
    # pass handles, only done once

    param_ga_list = []
    # a list of pycuda gpuarrays which point to value of theano shared variable on this gpu

    param_other_list = []
    # a list of theano shared variables that are used to store values of theano shared variable from the other gpu

    param_ga_other_list = []
    # a list of pycuda gpuarrays which point to theano shared variables in param_other_list

    h_list = []
    # a list of pycuda IPC handles

    shape_list = []
    # a list containing shapes of variables in param_ga_list

    dtype_list = []
    # a list containing dtypes of variables in param_ga_list

    average_fun_list = []
    # a list containing theano functions for averaging parameters

    for param in classifier.params:
        param_other = theano.shared(param.get_value())
        param_ga = \
            theano.misc.pycuda_utils.to_gpuarray(param.container.value)
        param_ga_other = \
            theano.misc.pycuda_utils.to_gpuarray(
                param_other.container.value)
        h = drv.mem_get_ipc_handle(param_ga.ptr)
        average_fun = \
            theano.function([], updates=[(param,
                                          (param + param_other) / 2.)])

        param_other_list.append(param_other)
        param_ga_list.append(param_ga)
        param_ga_other_list.append(param_ga_other)
        h_list.append(h)
        shape_list.append(param_ga.shape)
        dtype_list.append(param_ga.dtype)
        average_fun_list.append(average_fun)

    # pass shape, dtype and handles
    sock.send_pyobj((shape_list, dtype_list, h_list))
    shape_other_list, dtype_other_list, h_other_list = sock.recv_pyobj()

    param_ga_remote_list = []

    # create gpuarray point to the other gpu use the passed information
    for shape_other, dtype_other, h_other in zip(shape_other_list,
                                                 dtype_other_list,
                                                 h_other_list):
        param_ga_remote = \
            gpuarray.GPUArray(shape_other, dtype_other,
                              gpudata=drv.IPCMemoryHandle(h_other))

        param_ga_remote_list.append(param_ga_remote)
    ####

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    this_queue.put('')
    that_queue.get()
    start_time = time.time()

    epoch = 0

    while epoch < n_epochs:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            if minibatch_index % 2 == private_args['mod']:
                train_model(minibatch_index)

                this_queue.put('')
                that_queue.get()

                # exchanging weights
                for param_ga, param_ga_other, param_ga_remote in \
                        zip(param_ga_list, param_ga_other_list,
                            param_ga_remote_list):

                    drv.memcpy_peer(
                        param_ga_other.ptr, param_ga_remote.ptr,
                        param_ga_remote.dtype.itemsize * param_ga_remote.size,
                        ctx, ctx)

                ctx.synchronize()
                this_queue.put('')
                that_queue.get()

                for average_fun in average_fun_list:
                    average_fun()

        if private_args['verbose']:
            validation_losses = [
                validate_model(i) for i in xrange(n_valid_batches)
            ]
            this_validation_loss = np.mean(validation_losses)

            print('epoch %i, minibatch %i/%i, validation error %f %%' %
                  (epoch, minibatch_index + 1, n_train_batches,
                   this_validation_loss * 100.))

    end_time = time.time()

    this_queue.put('')
    that_queue.get()

    if private_args['verbose']:
        print 'The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] + ' ran for %.1fs' %
                              ((end_time - start_time)))
Example 22
def validate_performance(config):

    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames, train_labels, val_labels,
     img_mean) = unpack_configs(config)

    if flag_para_load:
        # pycuda and zmq set up
        drv.init()
        dev = drv.Device(int(config['gpu'][-1]))
        ctx = dev.make_context()
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://localhost:{0}'.format(config['sock_data']))

        load_send_queue = config['queue_t2l']
        load_recv_queue = config['queue_l2t']
    else:
        load_send_queue = None
        load_recv_queue = None

    import theano.sandbox.cuda
    theano.sandbox.cuda.use(config['gpu'])
    import theano
    theano.config.on_unused_input = 'warn'

    from layers import DropoutLayer
    from alex_net import AlexNet, compile_models

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    # # BUILD NETWORK ##
    model = AlexNet(config)
    layers = model.layers
    batch_size = model.batch_size

    # # COMPILE FUNCTIONS ##
    (train_model, validate_model, train_error, learning_rate, shared_x,
     shared_y, rand_arr, vels) = compile_models(model, config, flag_top_5=True)

    print '... training'

    if flag_para_load:
        # pass ipc handle and related information
        gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
            shared_x.container.value)
        h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
        sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))
        load_send_queue.put(img_mean)

    load_epoch = config['load_epoch']
    load_weights(layers, config['weights_dir'], load_epoch)

    DropoutLayer.SetDropoutOff()


    this_validation_error, this_validation_error_top_5, this_validation_loss = \
        get_val_error_loss(rand_arr, shared_x, shared_y,
                           val_filenames, val_labels,
                           flag_para_load,img_mean,
                           batch_size, validate_model,
                           send_queue=load_send_queue,
                           recv_queue=load_recv_queue,
                           flag_top_5=True)

    print('validation error %f %%' % (this_validation_error * 100.))
    print('top 5 validation error %f %%' %
          (this_validation_error_top_5 * 100.))
    print('validation loss %f ' % (this_validation_loss))

    return this_validation_error, this_validation_loss
Example 23
def train_net(config):

    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = unpack_configs(config)

    # pycuda set up
    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()
    
    if flag_para_load:
        #  zmq set up
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://*****:*****@ iter = ', num_iter
                print 'training cost:', cost_ij
                if config['print_train_error']:
                    print 'training error rate:', train_error()

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_validation_error, this_validation_loss = get_val_error_loss(
            rand_arr, shared_x, shared_y,
            val_filenames, val_labels,
            flag_para_load, img_mean,
            batch_size, validate_model,
            send_queue=load_send_queue, recv_queue=load_recv_queue)


        print('epoch %i: validation loss %f ' %
              (epoch, this_validation_loss))
        print('epoch %i: validation error %f %%' %
              (epoch, this_validation_error * 100.))
        val_record.append([this_validation_error, this_validation_loss])
        np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx,
                                        val_record, learning_rate)

        # Save weights
        if epoch % config['snapshot_freq'] == 0:
            save_weights(layers, config['weights_dir'], epoch)
            np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                       learning_rate.get_value())
            save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
Example 24
def train_net(config, private_config):

    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])


    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                    log_iter.write("%d\n" % num_iter)
                    log_iter.flush()
                    print 'training cost:', cost_ij
                    log_err_cost.write("%f\n" % cost_ij)
                    log_err_cost.flush()

                if config['print_train_error']:
                    error_ij = train_error()

                    gpu_send_queue.put(error_ij)
                    that_error = gpu_recv_queue.get()
                    error_ij = (error_ij + that_error) / 2.

                    if private_config['flag_verbose']:
                        print 'training error rate:', error_ij
                        log_err_rate.write("%f\n" % error_ij)
                        log_err_rate.flush()


            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

            if count%20 == 0:
                e = time.time()
                print "time per 20 iter:", (e - s)
                
        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr, shared_x, shared_y,
            val_filenames, val_labels,
            flag_para_load, img_mean,
            batch_size, validate_model,
            send_queue=load_send_queue, recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' %
                  (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))
        val_record.append([this_val_error, this_val_loss])

        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)
            np.savetxt(config['weights_dir'] + 'val_record_txt.txt', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx,
                                        val_record, learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                        learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
Example 25
    def bind_buffers(self):
        if isinstance(self.tensor, TensorDescription):
            self.tensor = self.tensor_view_from_td(self.tensor)
        super(CudaSendKernel, self).bind_buffers()
        buf_ipc_hdl = drv.mem_get_ipc_handle(self.tensor.tensor.gpudata)
        self.comm.send(buf_ipc_hdl, dest=self.destination, tag=TAG_IPC)
Example 26
def validate_performance(config):

    # UNPACK CONFIGS
    (flag_para_load,  train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = unpack_configs(config)

    if flag_para_load:
        # pycuda and zmq set up
        drv.init()
        dev = drv.Device(int(config['gpu'][-1]))
        ctx = dev.make_context()
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://localhost:{0}'.format(config['sock_data']))

        load_send_queue = config['queue_t2l']
        load_recv_queue = config['queue_l2t']
    else:
        load_send_queue = None
        load_recv_queue = None

    import theano.sandbox.cuda
    theano.sandbox.cuda.use(config['gpu'])
    import theano
    theano.config.on_unused_input = 'warn'

    from layers import DropoutLayer
    from alex_net import AlexNet, compile_models

    import theano.misc.pycuda_init
    import theano.misc.pycuda_utils

    # # BUILD NETWORK ##
    model = AlexNet(config)
    layers = model.layers
    batch_size = model.batch_size

    # # COMPILE FUNCTIONS ##
    (train_model, validate_model, train_error, learning_rate,
        shared_x, shared_y, rand_arr, vels) = compile_models(model, config,
                                                             flag_top_5=True)

    print '... training'

    if flag_para_load:
        # pass ipc handle and related information
        gpuarray_batch = theano.misc.pycuda_utils.to_gpuarray(
            shared_x.container.value)
        h = drv.mem_get_ipc_handle(gpuarray_batch.ptr)
        sock.send_pyobj((gpuarray_batch.shape, gpuarray_batch.dtype, h))
        load_send_queue.put(img_mean)
    

    load_epoch = config['load_epoch']
    load_weights(layers, config['weights_dir'], load_epoch)

    DropoutLayer.SetDropoutOff()

    
    this_validation_error, this_validation_error_top_5, this_validation_loss = \
        get_val_error_loss(rand_arr, shared_x, shared_y,
                           val_filenames, val_labels,
                           flag_para_load,img_mean,
                           batch_size, validate_model,
                           send_queue=load_send_queue,
                           recv_queue=load_recv_queue,
                           flag_top_5=True)

    print('validation error %f %%' %
          (this_validation_error * 100.))
    print('top 5 validation error %f %%' %
          (this_validation_error_top_5 * 100.))
    print('validation loss %f ' %
          (this_validation_loss))

    return this_validation_error, this_validation_loss
Example 27
def train_net(config):

    # UNPACK CONFIGS
    (flag_para_load, train_filenames, val_filenames, train_labels, val_labels,
     img_mean) = unpack_configs(config)

    # pycuda set up
    drv.init()
    dev = drv.Device(int(config['gpu'][-1]))
    ctx = dev.make_context()

    if flag_para_load:
        #  zmq set up
        sock = zmq.Context().socket(zmq.PAIR)
        sock.connect('tcp://*****:*****@ iter = ', num_iter
                print 'training cost:', cost_ij
                if config['print_train_error']:
                    print 'training error rate:', train_error()

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_validation_error, this_validation_loss = get_val_error_loss(
            rand_arr,
            shared_x,
            shared_y,
            val_filenames,
            val_labels,
            flag_para_load,
            img_mean,
            batch_size,
            validate_model,
            send_queue=load_send_queue,
            recv_queue=load_recv_queue)

        print('epoch %i: validation loss %f ' % (epoch, this_validation_loss))
        print('epoch %i: validation error %f %%' %
              (epoch, this_validation_error * 100.))
        val_record.append([this_validation_error, this_validation_loss])
        np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx, val_record,
                                        learning_rate)

        # Save weights
        if epoch % config['snapshot_freq'] == 0:
            save_weights(layers, config['weights_dir'], epoch)
            np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                    learning_rate.get_value())
            save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')
Example 28
def train_net(config, private_config):

    # UNPACK CONFIGS
    (flag_para_load, flag_datalayer, train_filenames, val_filenames,
     train_labels, val_labels, img_mean) = \
        unpack_configs(config, ext_data=private_config['ext_data'],
                       ext_label=private_config['ext_label'])

    gpu_send_queue = private_config['queue_gpu_send']
    gpu_recv_queue = private_config['queue_gpu_recv']

    # pycuda and zmq set up
    drv.init()
    dev = drv.Device(int(private_config['gpu'][-1]))
    ctx = dev.make_context()

    sock_gpu = zmq.Context().socket(zmq.PAIR)
    if private_config['flag_client']:
        sock_gpu.connect('tcp://*****:*****@ iter = ', num_iter
                    print 'training cost:', cost_ij

                if config['print_train_error']:
                    error_ij = train_error()

                    gpu_send_queue.put(error_ij)
                    that_error = gpu_recv_queue.get()
                    error_ij = (error_ij + that_error) / 2.

                    if private_config['flag_verbose']:
                        print 'training error rate:', error_ij

            if flag_para_load and (count < len(minibatch_range)):
                load_send_queue.put('calc_finished')

        ############### Test on Validation Set ##################

        DropoutLayer.SetDropoutOff()

        this_val_error, this_val_loss = get_val_error_loss(
            rand_arr,
            shared_x,
            shared_y,
            val_filenames,
            val_labels,
            flag_datalayer,
            flag_para_load,
            batch_size,
            validate_model,
            send_queue=load_send_queue,
            recv_queue=load_recv_queue)

        # report validation stats
        gpu_send_queue.put(this_val_error)
        that_val_error = gpu_recv_queue.get()
        this_val_error = (this_val_error + that_val_error) / 2.

        gpu_send_queue.put(this_val_loss)
        that_val_loss = gpu_recv_queue.get()
        this_val_loss = (this_val_loss + that_val_loss) / 2.

        if private_config['flag_verbose']:
            print('epoch %i: validation loss %f ' % (epoch, this_val_loss))
            print('epoch %i: validation error %f %%' %
                  (epoch, this_val_error * 100.))
        val_record.append([this_val_error, this_val_loss])

        if private_config['flag_save']:
            np.save(config['weights_dir'] + 'val_record.npy', val_record)

        DropoutLayer.SetDropoutOn()
        ############################################

        # Adapt Learning Rate
        step_idx = adjust_learning_rate(config, epoch, step_idx, val_record,
                                        learning_rate)

        # Save Weights, only one of them will do
        if private_config['flag_save']:
            if epoch % config['snapshot_freq'] == 0:
                save_weights(layers, config['weights_dir'], epoch)
                np.save(config['weights_dir'] + 'lr_' + str(epoch) + '.npy',
                        learning_rate.get_value())
                save_momentums(vels, config['weights_dir'], epoch)

    print('Optimization complete.')