def __init__(self, gpuID=None, stream=None):
    # optionally bind to a specific GPU; otherwise keep the current device
    if gpuID is not None:
        if 0 <= gpuID < len(cuda.list_devices()):
            cuda.close()
            cuda.select_device(gpuID)
        else:
            raise ValueError('GPU ID out of range')
    # reuse a caller-supplied stream, or create a fresh one
    if stream is None:
        self.stream = cuda.stream()
    else:
        assert isinstance(stream, numba.cuda.cudadrv.driver.Stream)
        self.stream = stream
    # cuBLAS handle bound to this stream, plus default launch block sizes
    self.blas = numbapro.cudalib.cublas.Blas(stream=self.stream)
    self.blockdim = 32
    self.blockdim2 = (32, 32)
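# Usage sketch (hedged): __init__ above presumably belongs to a GPU helper
# class; the class name `GPUHandler` and the gemm call below are illustrative
# assumptions, not from the source. The point is that device selection, the
# stream, and the cuBLAS handle are all tied together in one object.
#
#   import numpy as np
#   from numba import cuda
#
#   handler = GPUHandler(gpuID=0)     # select device 0, create stream + cuBLAS handle
#   a = cuda.to_device(np.ones((64, 64), dtype=np.float32), stream=handler.stream)
#   b = cuda.to_device(np.ones((64, 64), dtype=np.float32), stream=handler.stream)
#   c = cuda.device_array((64, 64), dtype=np.float32, stream=handler.stream)
#   handler.blas.gemm('N', 'N', 64, 64, 64, 1.0, a, b, 0.0, c)  # C = A * B on the GPU
#   handler.stream.synchronize()      # wait before reading c back on the host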
def main(params):
    batch_size = params['batch_size']
    dataset = params['dataset']
    word_count_threshold = params['word_count_threshold']
    do_grad_check = params['do_grad_check']
    max_epochs = params['max_epochs']
    host = socket.gethostname()  # get computer hostname
    params['mode'] = 'CPU'

    # fetch the data provider
    dp = getDataProvider(dataset)

    misc = {}  # stores various misc items that need to be passed around the framework

    # go over all training sentences and find the vocabulary we want to use,
    # i.e. the words that occur at least word_count_threshold number of times
    misc['wordtoix'], misc['ixtoword'], bias_init_vector = preProBuildWordVocab(
        dp.iterSentences('train'), word_count_threshold)

    # delegate the initialization of the model to the Generator class
    BatchGenerator = decodeGenerator(params)
    init_struct = BatchGenerator.init(params, misc)
    model, misc['update'], misc['regularize'] = (init_struct['model'],
                                                 init_struct['update'],
                                                 init_struct['regularize'])

    if params['mode'] == 'GPU':
        # force overwrite here. This is a bit of a hack, not happy about it
        model['bd'] = gp.garray(bias_init_vector.reshape(1, bias_init_vector.size))
    else:
        model['bd'] = bias_init_vector.reshape(1, bias_init_vector.size)

    print 'model init done.'
    print 'model has keys: ' + ', '.join(model.keys())
    print 'updating: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                   for k in misc['update'])
    print 'regularizing: ' + ', '.join('%s [%dx%d]' % (k, model[k].shape[0], model[k].shape[1])
                                       for k in misc['regularize'])
    print 'number of learnable parameters total: %d' % (
        sum(model[k].shape[0] * model[k].shape[1] for k in misc['update']), )

    # initialize the Solver and the cost function
    solver = Solver()

    def costfun(batch, model):
        # wrap the cost function to abstract some things away from the Solver
        return RNNGenCost(batch, model, params, misc)

    # calculate how many iterations we need
    num_sentences_total = dp.getSplitSize('train', ofwhat='sentences')
    num_iters_one_epoch = num_sentences_total / batch_size
    max_iters = max_epochs * num_iters_one_epoch
    eval_period_in_epochs = params['eval_period']
    eval_period_in_iters = max(1, int(num_iters_one_epoch * eval_period_in_epochs))
    abort = False
    top_val_ppl2 = -1
    smooth_train_ppl2 = len(misc['ixtoword'])  # initially size of dictionary of confusion
    val_ppl2 = len(misc['ixtoword'])
    last_status_write_time = 0  # for writing worker job status reports
    json_worker_status = {}
    json_worker_status['params'] = params
    json_worker_status['history'] = []
    max_iters = 1  # NOTE: debug override; remove this line to train for the full max_epochs
    for it in xrange(max_iters):
        if abort:
            break
        t0 = time.time()
        # fetch a batch of data
        batch = [dp.sampleImageSentencePair() for i in xrange(batch_size)]
        # evaluate cost, gradient and perform parameter update
        step_struct = solver.step(batch, model, costfun, **params)
        cost = step_struct['cost']
        dt = time.time() - t0

        # print training statistics
        train_ppl2 = step_struct['stats']['ppl2']
        # exponentially decaying moving average of the training perplexity
        smooth_train_ppl2 = 0.99 * smooth_train_ppl2 + 0.01 * train_ppl2
        if it == 0:
            smooth_train_ppl2 = train_ppl2  # start out where we start out
        epoch = it * 1.0 / num_iters_one_epoch
        print '%d/%d batch done in %.3fs. at epoch %.2f. loss cost = %f, reg cost = %f, ppl2 = %.2f (smooth %.2f)' \
            % (it, max_iters, dt, epoch, cost['loss_cost'], cost['reg_cost'],
               train_ppl2, smooth_train_ppl2)

        # perform gradient check if desired, with a bit of a burnin time (10 iterations)
        #if it == 10 and do_grad_check:
        #  solver.gradCheck(batch, model, costfun)
        #  print 'done gradcheck. continue?'
        #  raw_input()
        #
        ## detect if loss is exploding and kill the job if so
        #total_cost = cost['total_cost']
        #if it == 0:
        #  total_cost0 = total_cost  # store this initial cost
        #if total_cost > total_cost0 * 2:
        #  print 'Aborting, cost seems to be exploding. Run gradcheck? Lower the learning rate?'
        #  abort = True  # set the abort flag, we'll break out
        #
        ## logging: write JSON files for visual inspection of the training
        #tnow = time.time()
        #if tnow > last_status_write_time + 60*1:  # every now and then lets write a report
        #  last_status_write_time = tnow
        #  jstatus = {}
        #  jstatus['time'] = datetime.datetime.now().isoformat()
        #  jstatus['iter'] = (it, max_iters)
        #  jstatus['epoch'] = (epoch, max_epochs)
        #  jstatus['time_per_batch'] = dt
        #  jstatus['smooth_train_ppl2'] = smooth_train_ppl2
        #  jstatus['val_ppl2'] = val_ppl2  # just write the last available one
        #  jstatus['train_ppl2'] = train_ppl2
        #  json_worker_status['history'].append(jstatus)
        #  status_file = os.path.join(params['worker_status_output_directory'], host + '_status.json')
        #  try:
        #    json.dump(json_worker_status, open(status_file, 'w'))
        #  except Exception, e:  # todo be more clever here
        #    print 'tried to write worker status into %s but got error:' % (status_file, )
        #    print e
        #
        ## perform perplexity evaluation on the validation set and save a model checkpoint if it's good
        #is_last_iter = (it+1) == max_iters
        #if (((it+1) % eval_period_in_iters) == 0 and it < max_iters - 5) or is_last_iter:
        #  val_ppl2 = eval_split('val', dp, model, params, misc)  # perform the evaluation on VAL set
        #  print 'validation perplexity = %f' % (val_ppl2, )
        #  write_checkpoint_ppl_threshold = params['write_checkpoint_ppl_threshold']
        #  if val_ppl2 < top_val_ppl2 or top_val_ppl2 < 0:
        #    if val_ppl2 < write_checkpoint_ppl_threshold or write_checkpoint_ppl_threshold < 0:
        #      # if we beat a previous record or if this is the first time
        #      # AND we also beat the user-defined threshold or it doesn't exist
        #      top_val_ppl2 = val_ppl2
        #      filename = 'model_checkpoint_%s_%s_%s_%.2f.p' % (dataset, host, params['fappend'], val_ppl2)
        #      filepath = os.path.join(params['checkpoint_output_directory'], filename)
        #      checkpoint = {}
        #      checkpoint['it'] = it
        #      checkpoint['epoch'] = epoch
        #      checkpoint['model'] = model
        #      checkpoint['params'] = params
        #      checkpoint['perplexity'] = val_ppl2
        #      checkpoint['wordtoix'] = misc['wordtoix']
        #      checkpoint['ixtoword'] = misc['ixtoword']
        #      try:
        #        pickle.dump(checkpoint, open(filepath, "wb"))
        #        print 'saved checkpoint in %s' % (filepath, )
        #      except Exception, e:  # todo be more clever here
        #        print 'tried to write checkpoint into %s but got error: ' % (filepath, )
        #        print e

    cuda.close()
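# Sketch: the exponential moving average used for smooth_train_ppl2 in the loop
# above, isolated as a helper. The function and variable names here are
# illustrative, not from the source; the 0.99/0.01 decay and the seeding with
# the first observation mirror the training loop.
def ema(prev, new, decay=0.99):
    # seed with the first observation, then decay towards each new value
    return new if prev is None else decay * prev + (1.0 - decay) * new

# e.g. smoothing a noisy per-batch perplexity series:
#   smooth = None
#   for ppl2 in (310.0, 280.0, 295.0):
#       smooth = ema(smooth, ppl2)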
# Assumed imports for this fragment (not shown in the original); the curand
# convenience functions below come from the (now deprecated) numbapro library.
import time
import numpy as np
from numba import cuda
from numbapro.cudalib import curand


def block_increment(start, n):
    # generate two length-n vectors of standard-normal randoms directly on the device
    m_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    n_dev = curand.normal(0, 1, n, dtype=np.float32, device=True)
    a_host = np.zeros(n, dtype=np.float32)
    a_dev = cuda.device_array_like(a_host)
    cuda_div[griddim, blockdim, stream](m_dev, n_dev, a_dev, n)
    # a_dev stays on the device for the next kernel, so the host has no access
    # to that memory at this point. What happens to m_dev and n_dev afterwards
    # is unclear; the best guess is that Python garbage collection translates
    # into deallocation on the device.
    b_dev = curand.uniform((n * n), dtype=np.float32, device=True)
    # host buffer for the result; assumed shape n*n to match b_dev (the original
    # fragment used c_host here without defining it)
    c_host = np.zeros(n * n, dtype=np.float32)
    c_dev = cuda.device_array_like(c_host, stream)
    block_kernel[griddim, blockdim, stream](start, n, a_dev, b_dev, c_dev)
    c_dev.copy_to_host(c_host, stream)
    stream.synchronize()
    return c_host


if __name__ == '__main__':
    t0 = time.time()
    n = 8000
    stream = cuda.stream()
    blockdim = 256
    griddim = n // 256 + 1
    c_host = block_increment(0, n)
    stream.synchronize()
    cuda.close()
    print(c_host)
    print(time.time() - t0)
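# The kernels launched above (cuda_div, block_kernel) are defined elsewhere and
# not part of this fragment. A minimal sketch of what cuda_div might look like,
# assuming from its name and arguments that it does elementwise division of the
# two random vectors into the output array:
#
#   @cuda.jit
#   def cuda_div(m, n, out, size):
#       i = cuda.grid(1)          # global 1D thread index
#       if i < size:              # guard threads beyond the array length
#           out[i] = m[i] / n[i]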