def gen_folds( dataset, options, nrun ): nids = len( set( dataset[0,:] ) ) ids = options['numpy_rng'].permutation(nids) # train / test ids trainsizeElem = round( options['trainsize']*nids ) train_ids = ids[0:trainsizeElem] test_ids = ids[trainsizeElem+1:nids] print >> sys.stderr, test_ids if options['verbose']> 2: print >> sys.stderr, "Train IDS" print >> sys.stderr, train_ids print >> sys.stderr, "Test IDS" print >> sys.stderr, test_ids # val ids val_ids = numpy.copy(train_ids) nitems = len(val_ids)/options['folds'] val_ids.resize((options['folds'],nitems)) folds = range(0,options['folds']) trainval = [] valval = [] testval = [] for k in folds: others = list( set([k]).symmetric_difference(set(folds)) ) #print val_ids #print others #kk train = val_ids[k].flatten() val = val_ids[others[0]] test = val_ids[others[1]] xtrain,ytrain,minv,maxv = get_data( dataset, train, options ) xval,yval = get_data( dataset, val, options, isFirst=False, minvalue=minv, maxvalue=maxv )[0:2] xtest,ytest = get_data( dataset, test, options, isFirst=False, minvalue=minv, maxvalue=maxv )[0:2] trainval.append( (xtrain,ytrain) ) valval.append( (xval,yval) ) testval.append( (xtest,ytest) ) if options['verbose'] > 0: print 'Train set with size %d for fold %d' % (ytrain.shape.eval(),k) print 'Test set with size %d for fold %d' % (ytest.shape.eval(),k) if options['verbose'] > 5: for cls in range(0,2): print >>sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format(cls,sum(ytrain.eval() == cls)) print >>sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format(cls,sum(ytest.eval() == cls)) # final ids final_ids = numpy.copy(train_ids) nitems = len(final_ids)/2 final_ids.resize((2,nitems)) trainfinal_ids = final_ids[0] valfinal_ids = final_ids[1] xtrain,ytrain,minv,maxv = get_data( dataset, trainfinal_ids, options ) xval,yval = get_data( dataset, valfinal_ids, options, isFirst = False, minvalue=minv, maxvalue=maxv )[0:2] xtest,ytest = get_data( dataset, 
test_ids , options, isFirst = True , minvalue=minv, maxvalue=maxv )[0:2] trainFinal = (xtrain,ytrain) valFinal = (xval,yval) testFinal = (xtest,ytest) print >> sys.stderr, test_ids if options['verbose'] > 0: print 'Train set with size %d ' % (ytrain.shape.eval()) print 'Val set with size %d ' % (yval.shape.eval()) print 'Test set with size %d ' % (ytest.shape.eval()) if options['verbose'] > 5: for cls in range(0,2): print >>sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format(cls,sum(ytrain.eval() == cls)) print >>sys.stderr, "\tNumber of validation elements for cls {0:02d} is {1:05d}".format(cls,sum(yval.eval() == cls)) print >>sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format(cls,sum(ytest.eval() == cls)) basefilename = '{0:s}/{1:05d}_{2:03d}_'.format(options['outputfolder'],nrun,string.atoi(options['resolution'])) trainfilename = basefilename + 'train_ids.pkl.gz' valfilename = basefilename + 'val_ids.pkl.gz' trainfinalfilename = basefilename + 'trainfinal_ids.pkl.gz' valfinalfilename = basefilename + 'valfinal_ids.pkl.gz' testfilename = basefilename + 'test_ids.pkl.gz' save_gzdata(trainfilename,train_ids) save_gzdata(valfilename,val_ids) save_gzdata(trainfinalfilename,trainfinal_ids) save_gzdata(valfinalfilename,valfinal_ids) save_gzdata(testfilename,test_ids) if options['verbose'] > 0: print 'Train set with size %d' % (trainFinal[1].shape.eval()) print 'Val set with size %d' % (valFinal[1].shape.eval()) print 'Test set with size %d' % (testFinal[1].shape.eval()) rval = [trainval, valval, testval, trainFinal, valFinal, testFinal ] return rval
def pretrain_finetune_model(sda, pretraining_fns, train_set, test_set, options):
    """Pre-train ``sda`` layer-wise, then fine-tune it with early stopping.

    NOTE(review): this file contains two definitions of this function;
    the later one shadows this one at import time.

    Parameters
    ----------
    sda : stacked denoising autoencoder model (project type)
    pretraining_fns : per-layer theano training functions
    train_set, test_set : (x, y) pairs; x is a theano shared variable
    options : dict of hyper-parameters (batchsize, pretraining_epochs,
        training_epochs, pretrain_lr, finetune_lr, corruptlevels,
        threshold, retrain, retrain_ft_layers, verbose, ...)

    Returns (best_validation_loss, best_model).
    """
    train_set_x, train_set_y = train_set
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= options['batchsize']

    # BUGFIX: initialise the best model unconditionally.  The original
    # only did this on the retrain == 0 path, so a retraining run that
    # never reached a validation step raised NameError at the return.
    bestmodelsda = copy.copy(sda)

    if options['retrain'] == 0:
        # -----------------------------------------------
        # PRETRAINING (layer-wise, denoising)
        # -----------------------------------------------
        if options['verbose'] > 5:
            print >> sys.stderr, ('... pre-training the model')
        start_time = time.clock()
        corruption_levels = options['corruptlevels']
        for i in xrange(sda.n_layers):
            for epoch in xrange(options['pretraining_epochs']):
                # one pass over the training set for this layer
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                corruption=corruption_levels[i],
                                                lr=options['pretrain_lr']))
                if epoch % 100 == 0 and options['verbose'] > 5:
                    print >> sys.stderr, ('Pre-training layer %02i, epoch %04d, cost ' % (i, epoch)),
                    print >> sys.stderr, (numpy.mean(c))
        end_time = time.clock()

        if options['savetimes']:
            filename = '{0:s}/times_pr_{1:03d}_{2:03d}.pkl.gz'.format(
                options['outputfolderres'], options['nrun'],
                string.atoi(options['resolution']))
            save_gzdata(filename, end_time - start_time)

        if options['verbose'] > 4:
            print >> sys.stderr, ('The pretraining code for file ' +
                                  os.path.split(__file__)[1] +
                                  ' ran for %.2fm' % ((end_time - start_time) / 60.))

        # build the fine-tuning functions for the freshly pre-trained model
        dataset = [train_set, test_set]
        if options['verbose'] > 5:
            print >> sys.stderr, ('... getting the finetuning functions')
        train_fn, validate_model = sda.build_finetune_functions(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'])
    else:
        # reuse a pre-trained model, optionally updating only some layers
        dataset = [train_set, test_set]
        train_fn, validate_model = sda.build_finetune_functions_reuse(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'],
            update_layerwise=options['retrain_ft_layers'])

    # -----------------------------------------------
    # FINETUNE with early stopping
    # -----------------------------------------------
    if options['verbose'] > 5:
        print >> sys.stderr, ('... finetunning the model')

    # early-stopping parameters
    patience = 10 * n_train_batches    # look at this many examples regardless
    patience_increase = 2.             # wait this much longer when a new best is found
    improvement_threshold = 0.995      # a relative improvement of this much is
                                       # considered significant
    # check the validation set after this many minibatches (here: each epoch)
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < options['training_epochs']) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                (y_valid, y_pred, y_pred_prob) = validate_model()
                # derive hard predictions, optionally thresholding the
                # probability of class 0 instead of taking the argmax
                if options['threshold'] is not None:
                    y_pred = numpy.array(y_pred_prob[:, 0] < options['threshold'],
                                         dtype=numpy.uint8)
                else:
                    y_pred = numpy.argmax(y_pred_prob, axis=1)
                this_validation_loss = evaluate_error(y_valid, y_pred, options)

                if epoch % 30 == 0 and options['verbose'] > 5:
                    print >> sys.stderr, (
                        'epoch %04i, minibatch %04i/%04i, validation error %03f %%' %
                        (epoch, minibatch_index + 1, n_train_batches,
                         this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestmodelsda = copy.copy(sda)
                    if options['oneclass'] == True:
                        options['nclasses'] = 2
                        # debug leftovers kept for behavioural parity: the
                        # randint call advances numpy's global RNG state
                        pos = numpy.random.randint(len(y_pred), size=(10, ))
                        cm = confusion_matrix(y_valid, y_pred, options['nclasses'])
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

            if patience <= iter:
                done_looping = True
                break
    end_time = time.clock()

    if options['savetimes']:
        filename = '{0:s}/times_fn_{1:03d}_{2:03d}.pkl.gz'.format(
            options['outputfolderres'], options['nrun'],
            string.atoi(options['resolution']))
        save_gzdata(filename, end_time - start_time)

    print >> sys.stderr, ("Stopped at epoch %04i" % epoch)
    return (best_validation_loss, bestmodelsda)
def do_experiment(folds, options, nrun, sda_reuse_model):
    """Grid-search hyper-parameters with cross-validation, then train and
    evaluate a final model using the best combination.

    NOTE(review): this file contains two definitions of do_experiment; the
    later one shadows this one at import time.

    folds -- [trainval, valval, testval, trainFinal, valFinal, testFinal]
    Returns the evaluate_model() result tuple extended with
    (pretrain_time, finetune_time).
    """
    modeloptions = {}
    # every combination of the hyper-parameter grid
    param = list(itertools.product(options['nneurons'],
                                   options['hlayers'],
                                   options['pretraining_epochs'],
                                   options['training_epochs'],
                                   options['pretrain_lr'],
                                   options['finetune_lr'],
                                   options['batchsize'],
                                   options['threshold'],
                                   options['corruptlevels']))
    print >> sys.stderr, param
    print >> sys.stderr, ('Number of combinations {0:03d}'.format(len(param)))

    step = 0
    besterror = numpy.inf

    # ---------------------------------------------------------------
    # cross validation over every grid combination
    # ---------------------------------------------------------------
    for combo_idx, combo in enumerate(param):
        (nneurons, hlayers, pretraining_epochs, training_epochs,
         pretrain_lr, finetune_lr, batchsize, threshold, corruptlevels) = combo
        # option set handed to build/pretrain/evaluate for this combination
        modeloptions = {
            'savetimes': False,
            'outputfolder': options['outputfolder'],
            'outputfolderres': options['outputfolderres'],
            'resolution': options['resolution'],
            'retrain': options['retrain'],
            'verbose': options['verbose'],
            'ndim': options['ndim'],
            'nclasses_source': options['nclasses_source'],
            'nclasses': options['nclasses'],
            'numpy_rng': options['numpy_rng'],
            'theano_rng': options['theano_rng'],
            'measure': options['measure'],
            'oneclass': options['oneclass'],
            'batchsize': batchsize,
            'hlayers': nneurons * numpy.ones((hlayers, )),
            'corruptlevels': corruptlevels * numpy.ones((hlayers, ), dtype=numpy.float32),
            'pretraining_epochs': pretraining_epochs,
            'training_epochs': training_epochs,
            'pretrain_lr': pretrain_lr,
            'finetune_lr': finetune_lr,
            'threshold': threshold,
            'sda_reuse_model': sda_reuse_model,
            'retrain_ft_layers': options['retrain_ft_layers'],
            'weight': options['weight'],
        }
        if combo_idx == 0:
            bestmodeloptions = copy.copy(modeloptions)
        if modeloptions['verbose'] > 2:
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, " CROSS-VAL "
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, modeloptions

        err_sum = 0
        err_fold = 0
        for cv in range(0, options['folds']):
            # progress over all (combination, fold) steps
            counter = step / (len(param) * options['folds'] * 1.)
            print >> sys.stderr, ('###### {t:0{format}.1f}% ({e:0.2f})'.format(
                format=5, t=counter * 100, e=besterror))
            trainset = folds[0]
            valset = folds[1]
            testset = folds[2]
            (sda, pretraining_fns) = build_model(trainset[cv], modeloptions)
            sda = pretrain_finetune_model(sda, pretraining_fns, trainset[cv],
                                          valset[cv], modeloptions)[1]
            err_fold = evaluate_model(sda, testset[cv], modeloptions)[0]
            err_sum = err_sum + err_fold
            step = step + 1

        # keep the combination with the lowest mean fold error
        err_sum = err_sum / options['folds']
        if err_sum < besterror:
            besterror = err_sum
            bestmodeloptions = copy.copy(modeloptions)

    # -------------------------------------------------------------------
    # end of cross validation: final train/test with the best options
    if modeloptions['verbose'] > 0:
        print >> sys.stderr, "######################################################"
        print >> sys.stderr, " TRAIN/TEST "
        print >> sys.stderr, "######################################################"
        print >> sys.stderr, (bestmodeloptions)

    trainset = folds[3]
    valset = folds[4]
    testset = folds[5]
    bestmodeloptions['savetimes'] = True
    bestmodeloptions['nrun'] = nrun

    start_time = time.clock()
    (sda, pretraining_fns) = build_model(trainset, bestmodeloptions)
    end_time = time.clock()
    pretrain_time = end_time - start_time

    start_time = time.clock()
    sda = pretrain_finetune_model(sda, pretraining_fns, trainset, valset,
                                  bestmodeloptions)[1]
    end_time = time.clock()
    finetune_time = end_time - start_time

    result = evaluate_model(sda, testset, bestmodeloptions)
    print >> sys.stderr, "time pretrain: {0:f} | time fine-tune: {1:f}".format(
        pretrain_time, finetune_time)
    result = result + (pretrain_time, finetune_time)

    filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(
        options['outputfolder'], nrun, string.atoi(options['resolution']))
    save_gzdata(filename, sda)
    filename = '{0:s}/{1:05d}_{2:03d}_options.pkl.gz'.format(
        options['outputfolder'], nrun, string.atoi(options['resolution']))
    save_gzdata(filename, bestmodeloptions)
    return result
if ypred[i] == 0: # red, SdA cv2.circle(img, (int(pti[0]), int(pti[1])), 30, (0, 0, 255), 5) filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.jpg".format( imgspath[ids[count]], rd, th, nrunImg, cv) print >> sys.stderr, "Saving image..:" + filename cv2.imwrite(filename, img) filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}_LoG.pkl.gz".format( imgspath[ids[count]], rd, th, nrunImg, cv) print >> sys.stderr, "(LoG) Precision: {0:05f} | Recall: {1:05f} ".format( Precision_LoG_, Recall_LoG_) save_gzdata(filename, [Precision_LoG_, Recall_LoG_]) filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.pkl.gz".format( imgspath[ids[count]], rd, th, nrunImg, cv) print >> sys.stderr, "(SdA) Precision: {0:05f} | Recall: {1:05f} ".format( Precision_, Recall_) save_gzdata(filename, [Precision_, Recall_]) print >> sys.stderr, ( "Ann: {0:05d} | Nano (SdA): {1:05d}| Back (SdA): {2:05d}| LoG: {3:05d} " ).format(nmbrAnn, sum(numpy.array(ypred) == 0), sum(numpy.array(ypred) == 1), nelem_x) print >> sys.stderr, "-------------------------" # average over all files Precision = Precision / (nfiles / 2)
def main(resolution, method, pathRes): # load results from LoG imgpathsae = '../../imgs_nanoparticles/{0:03d}/db2/resultado_sae/'.format( string.atoi(resolution)) if method == 'baseline': basepath = './{0:s}/{1:05d}/models/res_baseline_resized_{1:05d}_111111/'.format( pathRes, string.atoi(resolution)) elif method == 'tl': basepath = './{0:s}/{1:05d}/models/res_tl_resized_50000_{1:05d}_111111/'.format( pathRes, string.atoi(resolution)) # annotations annbasepath = '../../imgs_nanoparticles/{0:03d}/db2/annotation/user/'.format( string.atoi(resolution)) annfiles = [ f for f in os.listdir(annbasepath) if re.match(r'[\w\W]*csv', f) ] annfiles = sorted(annfiles) # imgs base paths imgsbasepath = '../../imgs_nanoparticles/{0:03d}/db2/'.format( string.atoi(resolution)) imgspath = os.listdir(imgsbasepath) imgspath = sorted(imgspath) # ------------------------------------------------------------------------------------------------ # TEST DATA PrecisionAll = [] RecallAll = [] PrecisionLoGAll = [] RecallLoGAll = [] nDetectionsAll = [] for nrun in range(1, 21): # print >> sys.stderr, "\n**************************\n" print >> sys.stderr, "NRUN {0:05d}/20 ".format(nrun) filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format( basepath, nrun, string.atoi(resolution)) print >> sys.stderr, "Loading " + filename model = load_savedgzdata(filename) # get ids pathids = '{0:s}/{1:05d}_{2:05d}_test_ids.pkl.gz'.format( basepath, nrun, string.atoi(resolution)) print >> sys.stderr, 'Loading ' + pathids + '...' 
ids = load_savedgzdata(pathids) print >> sys.stderr, ids reg = 'detectedNanoParticlesDetectionResult_log_detector_test_{0:03d}_'.format( nrun) files = [f for f in os.listdir(imgpathsae) if re.match(reg, f)] # order data files = sorted(files) nfiles = len(files) (Precision, Recall, PrecisionLoG, RecallLoG, nDetections) = getPrecisionRecall(nfiles, files, ids, imgpathsae, imgsbasepath, imgspath, annbasepath, annfiles, model, (0, 0, nrun, 0), printImg=True) print >> sys.stderr, "Precision LoG: {0:05f} | Recall LoG: {1:05f}".format( PrecisionLoG, RecallLoG) print >> sys.stderr, "Precision SdA: {0:05f} | Recall SdA: {1:05f}".format( Precision, Recall) # kaka PrecisionAll.append(Precision) RecallAll.append(Recall) PrecisionLoGAll.append(PrecisionLoG) RecallLoGAll.append(RecallLoG) nDetectionsAll.append(nDetections) # --------------------------------------------------------- PrecisionAll = numpy.array(PrecisionAll) RecallAll = numpy.array(RecallAll) PrecisionLoGAll = numpy.array(PrecisionLoGAll) RecallLoGAll = numpy.array(RecallLoGAll) nDetectionsAll = numpy.array(nDetectionsAll) print "--------------------------------------------\n" print "Precision LoG: {0:03f} ({1:03f}) | Recall LoG: {2:03f} ({3:03f})".format( numpy.mean(PrecisionLoGAll), numpy.std(PrecisionLoGAll), numpy.mean(RecallLoGAll), numpy.std(RecallLoGAll)) print "Precision SdA: {0:03f} ({1:03f}) | Recall SdA: {2:03f} ({3:03f})".format( numpy.mean(PrecisionAll), numpy.std(PrecisionAll), numpy.mean(RecallAll), numpy.std(RecallAll)) print "number detections: {0:03f} ({1:03f})".format( numpy.mean(nDetectionsAll), numpy.std(nDetectionsAll)) PrecisionRecall = numpy.c_[PrecisionAll, RecallAll] filename = 'results/sae_{0:s}_{1:s}_test_all.pkl.gz'.format( method, resolution) save_gzdata(filename, PrecisionRecall) PrecisionRecallLoG = numpy.c_[PrecisionLoGAll, RecallLoGAll] filename = 'results/log_{0:s}_{1:s}_test_all.pkl.gz'.format( method, resolution) save_gzdata(filename, PrecisionRecallLoG) PrecisionRecall = 
numpy.r_[numpy.mean(PrecisionAll), numpy.mean(RecallAll)] filename = 'results/sae_{0:s}_{1:s}_test.pkl.gz'.format(method, resolution) save_gzdata(filename, PrecisionRecall) PrecisionRecallLoG = numpy.r_[numpy.mean(PrecisionLoGAll), numpy.mean(RecallLoGAll)] filename = 'results/log_{0:s}_{1:s}_test.pkl.gz'.format(method, resolution) save_gzdata(filename, PrecisionRecallLoG) filename = 'results/ndetections_{0:s}_{1:s}_test.pkl.gz'.format( method, resolution) save_gzdata(filename, nDetectionsAll)
def do_experiment( folds, options, nrun, sda_reuse_model ):
    """Hyper-parameter grid search with cross-validation followed by a
    final train/test run using the best combination.

    NOTE(review): this is the second definition of do_experiment in this
    file; being defined later, it shadows the earlier one.

    folds -- [trainval, valval, testval, trainFinal, valFinal, testFinal]
             as produced by gen_folds
    Returns the evaluate_model() tuple extended with
    (pretrain_time, finetune_time).
    """
    modeloptions = {}
    # every combination of the hyper-parameter grid
    param = list(itertools.product( options['nneurons'], options['hlayers'], options['pretraining_epochs'], options['training_epochs'], options['pretrain_lr'], options['finetune_lr'], options['batchsize'], options['threshold'], options['corruptlevels'] ) )
    print >> sys.stderr, param
    print >> sys.stderr, ('Number of combinations {0:03d}'.format(len(param)))
    step = 0
    besterror = numpy.inf
    # ---------------------------------------------------------------
    # cross validation
    # ---------------------------------------------------------------
    for k in range(0,len(param)):
        (nneurons, hlayers, pretraining_epochs, training_epochs, pretrain_lr, finetune_lr, batchsize, threshold, corruptlevels) = param[k]
        # option set handed to build/pretrain/evaluate for this combination
        modeloptions = {
            'savetimes' : False,
            'outputfolder' : options['outputfolder'],
            'outputfolderres' : options['outputfolderres'],
            'resolution' : options['resolution'],
            'retrain' : options['retrain'],
            'verbose' : options['verbose'],
            'ndim' : options['ndim'],
            'nclasses_source' : options['nclasses_source'],
            'nclasses' : options['nclasses'],
            'numpy_rng' : options['numpy_rng'],
            'theano_rng' : options['theano_rng'],
            'measure' : options['measure'],
            'oneclass' : options['oneclass'],
            'batchsize' : batchsize,
            # one entry per hidden layer, all of width nneurons
            'hlayers' : nneurons * numpy.ones((hlayers,)),
            'corruptlevels' : corruptlevels*numpy.ones((hlayers,),dtype=numpy.float32),
            'pretraining_epochs' : pretraining_epochs,
            'training_epochs' : training_epochs,
            'pretrain_lr' : pretrain_lr,
            'finetune_lr' : finetune_lr,
            'threshold' : threshold,
            'sda_reuse_model' : sda_reuse_model,
            'retrain_ft_layers' : options['retrain_ft_layers'],
            'weight' : options['weight'],
        }
        if k == 0:
            bestmodeloptions = copy.copy( modeloptions )
        if modeloptions['verbose'] > 2:
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, " CROSS-VAL "
            print >> sys.stderr, "######################################################"
            print >> sys.stderr, modeloptions
        merror = 0
        merrori = 0
        for cv in range(0,options['folds']):
            # progress over all (combination, fold) steps
            counter = step/(len(param)*options['folds']*1.)
            print >> sys.stderr, ('###### {t:0{format}.1f}% ({e:0.2f})'.format(format=5,t=counter*100,e=besterror) )
            trainset = folds[0]
            valset = folds[1]
            testset = folds[2]
            (sda,pretraining_fns) = build_model(trainset[cv],modeloptions)
            sda = pretrain_finetune_model(sda,pretraining_fns, trainset[cv], valset[cv], modeloptions)[1]
            merrori = evaluate_model(sda,testset[cv],modeloptions)[0]
            merror = merror + merrori
            step = step + 1
        # mean fold error for this combination; keep the best one
        merror = merror / options['folds']
        if merror < besterror:
            besterror = merror
            bestmodeloptions = copy.copy( modeloptions )
    # -------------------------------------------------------------------
    # end of cross validation: final train/test with the best options
    if modeloptions['verbose'] > 0:
        print >> sys.stderr, "######################################################"
        print >> sys.stderr, " TRAIN/TEST "
        print >> sys.stderr, "######################################################"
        print >> sys.stderr, (bestmodeloptions)
    trainset = folds[3]
    valset = folds[4]
    testset = folds[5]
    bestmodeloptions['savetimes'] = True
    bestmodeloptions['nrun'] = nrun
    start_time = time.clock()
    (sda,pretraining_fns) = build_model(trainset, bestmodeloptions)
    end_time = time.clock()
    pretrain_time = end_time - start_time
    start_time = time.clock()
    sda = pretrain_finetune_model(sda, pretraining_fns, trainset, valset, bestmodeloptions)[1]
    end_time = time.clock()
    finetune_time = end_time - start_time
    result = evaluate_model( sda, testset, bestmodeloptions )
    print >> sys.stderr, "time pretrain: {0:f} | time fine-tune: {1:f}".format(pretrain_time, finetune_time)
    result = result + ( pretrain_time, finetune_time )
    filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(options['outputfolder'],nrun,string.atoi(options['resolution']))
    save_gzdata(filename, sda)
    filename = '{0:s}/{1:05d}_{2:03d}_options.pkl.gz'.format(options['outputfolder'],nrun,string.atoi(options['resolution']))
    save_gzdata(filename, bestmodeloptions)
    return result
def pretrain_finetune_model(sda,pretraining_fns,train_set,test_set,options):
    """Pre-train ``sda`` layer-wise, then fine-tune it with early stopping.

    NOTE(review): this is the second definition of this function in this
    file; being defined later, it is the one callers actually get.

    Parameters
    ----------
    sda : stacked denoising autoencoder model (project type)
    pretraining_fns : per-layer theano training functions
    train_set, test_set : (x, y) pairs; x is a theano shared variable
    options : dict of hyper-parameters (batchsize, pretraining_epochs,
        training_epochs, pretrain_lr, finetune_lr, corruptlevels,
        threshold, retrain, retrain_ft_layers, verbose, ...)

    Returns (best_validation_loss, best_model).
    """
    train_set_x, train_set_y = train_set
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= options['batchsize']

    # BUGFIX: initialise the best model unconditionally.  The original
    # only did this on the retrain == 0 path, so a retraining run that
    # never reached a validation step raised NameError at the return.
    bestmodelsda = copy.copy(sda)

    if options['retrain'] == 0:
        # -----------------------------------------------
        # PRETRAINING (layer-wise, denoising)
        # -----------------------------------------------
        if options['verbose'] > 5:
            print >> sys.stderr, ('... pre-training the model')
        start_time = time.clock()
        corruption_levels = options['corruptlevels']
        for i in xrange(sda.n_layers):
            for epoch in xrange(options['pretraining_epochs']):
                # one pass over the training set for this layer
                c = []
                for batch_index in xrange(n_train_batches):
                    c.append(pretraining_fns[i](index=batch_index,
                                                corruption=corruption_levels[i],
                                                lr=options['pretrain_lr']))
                if epoch % 100 == 0 and options['verbose'] > 5:
                    print >> sys.stderr, ('Pre-training layer %02i, epoch %04d, cost ' % (i, epoch)),
                    print >> sys.stderr, (numpy.mean(c))
        end_time = time.clock()

        if options['savetimes']:
            filename = '{0:s}/times_pr_{1:03d}_{2:03d}.pkl.gz'.format(
                options['outputfolderres'], options['nrun'],
                string.atoi(options['resolution']))
            save_gzdata(filename, end_time - start_time)

        if options['verbose'] > 4:
            print >> sys.stderr, ('The pretraining code for file ' +
                                  os.path.split(__file__)[1] +
                                  ' ran for %.2fm' % ((end_time - start_time) / 60.))

        # build the fine-tuning functions for the freshly pre-trained model
        dataset = [train_set, test_set]
        if options['verbose'] > 5:
            print >> sys.stderr, ('... getting the finetuning functions')
        train_fn, validate_model = sda.build_finetune_functions(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'])
    else:
        # reuse a pre-trained model, optionally updating only some layers
        dataset = [train_set, test_set]
        train_fn, validate_model = sda.build_finetune_functions_reuse(
            datasets=dataset,
            batch_size=options['batchsize'],
            learning_rate=options['finetune_lr'],
            update_layerwise=options['retrain_ft_layers'])

    # -----------------------------------------------
    # FINETUNE with early stopping
    # -----------------------------------------------
    if options['verbose'] > 5:
        print >> sys.stderr, ('... finetunning the model')

    # early-stopping parameters
    patience = 10 * n_train_batches    # look at this many examples regardless
    patience_increase = 2.             # wait this much longer when a new best is found
    improvement_threshold = 0.995      # a relative improvement of this much is
                                       # considered significant
    # check the validation set after this many minibatches (here: each epoch)
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < options['training_epochs']) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                (y_valid, y_pred, y_pred_prob) = validate_model()
                # derive hard predictions, optionally thresholding the
                # probability of class 0 instead of taking the argmax
                if options['threshold'] is not None:
                    y_pred = numpy.array(y_pred_prob[:,0] < options['threshold'],
                                         dtype=numpy.uint8)
                else:
                    y_pred = numpy.argmax(y_pred_prob, axis=1)
                this_validation_loss = evaluate_error(y_valid, y_pred, options)

                if epoch % 30 == 0 and options['verbose'] > 5:
                    print >> sys.stderr, (
                        'epoch %04i, minibatch %04i/%04i, validation error %03f %%' %
                        (epoch, minibatch_index + 1, n_train_batches,
                         this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestmodelsda = copy.copy(sda)
                    if options['oneclass'] == True:
                        options['nclasses'] = 2
                        # debug leftovers kept for behavioural parity: the
                        # randint call advances numpy's global RNG state
                        pos = numpy.random.randint(len(y_pred), size=(10,))
                        cm = confusion_matrix(y_valid, y_pred, options['nclasses'])
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

            if patience <= iter:
                done_looping = True
                break
    end_time = time.clock()

    if options['savetimes']:
        filename = '{0:s}/times_fn_{1:03d}_{2:03d}.pkl.gz'.format(
            options['outputfolderres'], options['nrun'],
            string.atoi(options['resolution']))
        save_gzdata(filename, end_time - start_time)

    print >> sys.stderr, ("Stopped at epoch %04i" % epoch)
    return (best_validation_loss, bestmodelsda)
def main(resolution,method,pathRes): # load results from LoG imgpathsae = '../../imgs_nanoparticles/{0:03d}/db2/resultado_sae/'.format(string.atoi(resolution)) if method == 'baseline': basepath = './{0:s}/{1:05d}/models/res_baseline_resized_{1:05d}_111111/'.format(pathRes,string.atoi(resolution)) elif method == 'tl': basepath = './{0:s}/{1:05d}/models/res_tl_resized_50000_{1:05d}_111111/'.format(pathRes,string.atoi(resolution)) # annotations annbasepath = '../../imgs_nanoparticles/{0:03d}/db2/annotation/user/'.format(string.atoi(resolution)) annfiles = [f for f in os.listdir(annbasepath) if re.match(r'[\w\W]*csv', f)] annfiles = sorted( annfiles ) # imgs base paths imgsbasepath = '../../imgs_nanoparticles/{0:03d}/db2/'.format(string.atoi(resolution)) imgspath = os.listdir(imgsbasepath) imgspath = sorted( imgspath ) # ------------------------------------------------------------------------------------------------ # TEST DATA PrecisionAll = [] RecallAll = [] PrecisionLoGAll = [] RecallLoGAll = [] nDetectionsAll = [] for nrun in range(1,21): # print >> sys.stderr, "\n**************************\n" print >> sys.stderr, "NRUN {0:05d}/20 ".format(nrun) filename = '{0:s}/{1:05d}_{2:03d}_model.pkl.gz'.format(basepath,nrun,string.atoi(resolution)) print >> sys.stderr, "Loading " + filename model = load_savedgzdata(filename) # get ids pathids = '{0:s}/{1:05d}_{2:05d}_test_ids.pkl.gz'.format(basepath,nrun,string.atoi(resolution)) print >> sys.stderr, 'Loading ' + pathids + '...' 
ids = load_savedgzdata(pathids) print >> sys.stderr, ids reg = 'detectedNanoParticlesDetectionResult_log_detector_test_{0:03d}_'.format(nrun) files = [f for f in os.listdir(imgpathsae) if re.match(reg, f)] # order data files = sorted( files ) nfiles = len(files) (Precision, Recall, PrecisionLoG,RecallLoG,nDetections) = getPrecisionRecall(nfiles,files,ids,imgpathsae,imgsbasepath,imgspath,annbasepath,annfiles,model,(0,0,nrun,0),printImg=True) print >> sys.stderr, "Precision LoG: {0:05f} | Recall LoG: {1:05f}".format(PrecisionLoG, RecallLoG) print >> sys.stderr, "Precision SdA: {0:05f} | Recall SdA: {1:05f}".format(Precision, Recall) # kaka PrecisionAll.append( Precision ) RecallAll.append( Recall ) PrecisionLoGAll.append( PrecisionLoG ) RecallLoGAll.append( RecallLoG ) nDetectionsAll.append( nDetections ) # --------------------------------------------------------- PrecisionAll = numpy.array( PrecisionAll ) RecallAll = numpy.array( RecallAll ) PrecisionLoGAll = numpy.array( PrecisionLoGAll ) RecallLoGAll = numpy.array( RecallLoGAll ) nDetectionsAll = numpy.array( nDetectionsAll ) print "--------------------------------------------\n" print "Precision LoG: {0:03f} ({1:03f}) | Recall LoG: {2:03f} ({3:03f})".format(numpy.mean(PrecisionLoGAll),numpy.std(PrecisionLoGAll),numpy.mean(RecallLoGAll),numpy.std(RecallLoGAll)) print "Precision SdA: {0:03f} ({1:03f}) | Recall SdA: {2:03f} ({3:03f})".format(numpy.mean(PrecisionAll),numpy.std(PrecisionAll),numpy.mean(RecallAll),numpy.std(RecallAll)) print "number detections: {0:03f} ({1:03f})".format(numpy.mean(nDetectionsAll),numpy.std(nDetectionsAll)) PrecisionRecall = numpy.c_[PrecisionAll,RecallAll] filename = 'results/sae_{0:s}_{1:s}_test_all.pkl.gz'.format(method,resolution) save_gzdata(filename, PrecisionRecall ) PrecisionRecallLoG = numpy.c_[PrecisionLoGAll,RecallLoGAll] filename = 'results/log_{0:s}_{1:s}_test_all.pkl.gz'.format(method,resolution) save_gzdata(filename, PrecisionRecallLoG ) PrecisionRecall = 
numpy.r_[numpy.mean(PrecisionAll),numpy.mean(RecallAll)] filename = 'results/sae_{0:s}_{1:s}_test.pkl.gz'.format(method,resolution) save_gzdata(filename, PrecisionRecall ) PrecisionRecallLoG = numpy.r_[numpy.mean(PrecisionLoGAll),numpy.mean(RecallLoGAll)] filename = 'results/log_{0:s}_{1:s}_test.pkl.gz'.format(method,resolution) save_gzdata(filename, PrecisionRecallLoG ) filename = 'results/ndetections_{0:s}_{1:s}_test.pkl.gz'.format(method,resolution) save_gzdata(filename, nDetectionsAll )
# NOTE(review): interior fragment of getPrecisionRecall(); the enclosing
# function definition and loop headers are outside this chunk, so the dedent
# levels below are a best-effort reconstruction — confirm against the
# original file.  The two `if`s run per detection i; the save/print section
# runs once per image; the final averaging runs once per call.
if ypredlog[i] == 0: # blue, log
    # LoG detection kept as a nanoparticle: blue circle (BGR), radius 20
    cv2.circle(img,(int(pti[0]),int(pti[1])),20,(255,0,0),5)
if ypred[i] == 0: # red, SdA
    # SdA classified the detection as a nanoparticle: red circle, radius 30
    cv2.circle(img,(int(pti[0]),int(pti[1])),30,(0,0,255),5)

# save the annotated debug image for this image / radius / threshold / cv run
filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.jpg".format(imgspath[ids[count]],rd,th,nrunImg,cv)
print >> sys.stderr, "Saving image..:" + filename
cv2.imwrite(filename,img)

# persist the per-image LoG precision/recall pair
filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}_LoG.pkl.gz".format(imgspath[ids[count]],rd,th,nrunImg,cv)
print >> sys.stderr, "(LoG) Precision: {0:05f} | Recall: {1:05f} ".format(Precision_LoG_, Recall_LoG_)
save_gzdata(filename,[Precision_LoG_,Recall_LoG_])

# persist the per-image SdA precision/recall pair
filename = "imgs_debug/{0:s}_r={1:d}_th={2:d}_{3:03d}_cv={4:d}.pkl.gz".format(imgspath[ids[count]],rd,th,nrunImg,cv)
print >> sys.stderr, "(SdA) Precision: {0:05f} | Recall: {1:05f} ".format(Precision_, Recall_)
save_gzdata(filename,[Precision_,Recall_])

# summary: annotated count vs SdA nano/background counts vs LoG detections
print >> sys.stderr, ("Ann: {0:05d} | Nano (SdA): {1:05d}| Back (SdA): {2:05d}| LoG: {3:05d} ").format(nmbrAnn, sum(numpy.array(ypred)==0), sum(numpy.array(ypred)==1), nelem_x)
print >> sys.stderr, "-------------------------"

# average over all files
# NOTE(review): divides by nfiles/2 — presumably the file list pairs two
# files per image; verify against the caller.
Precision = Precision / (nfiles/2)
Recall = Recall / (nfiles/2)
Precision_LoG = Precision_LoG / (nfiles/2)
Recall_LoG = Recall_LoG / (nfiles/2)
def gen_folds(dataset, options, nrun): nids = len(set(dataset[0, :])) ids = options['numpy_rng'].permutation(nids) # train / test ids trainsizeElem = round(options['trainsize'] * nids) train_ids = ids[0:trainsizeElem] test_ids = ids[trainsizeElem + 1:nids] print >> sys.stderr, test_ids if options['verbose'] > 2: print >> sys.stderr, "Train IDS" print >> sys.stderr, train_ids print >> sys.stderr, "Test IDS" print >> sys.stderr, test_ids # val ids val_ids = numpy.copy(train_ids) nitems = len(val_ids) / options['folds'] val_ids.resize((options['folds'], nitems)) folds = range(0, options['folds']) trainval = [] valval = [] testval = [] for k in folds: others = list(set([k]).symmetric_difference(set(folds))) #print val_ids #print others #kk train = val_ids[k].flatten() val = val_ids[others[0]] test = val_ids[others[1]] xtrain, ytrain, minv, maxv = get_data(dataset, train, options) xval, yval = get_data(dataset, val, options, isFirst=False, minvalue=minv, maxvalue=maxv)[0:2] xtest, ytest = get_data(dataset, test, options, isFirst=False, minvalue=minv, maxvalue=maxv)[0:2] trainval.append((xtrain, ytrain)) valval.append((xval, yval)) testval.append((xtest, ytest)) if options['verbose'] > 0: print 'Train set with size %d for fold %d' % (ytrain.shape.eval(), k) print 'Test set with size %d for fold %d' % (ytest.shape.eval(), k) if options['verbose'] > 5: for cls in range(0, 2): print >> sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format( cls, sum(ytrain.eval() == cls)) print >> sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format( cls, sum(ytest.eval() == cls)) # final ids final_ids = numpy.copy(train_ids) nitems = len(final_ids) / 2 final_ids.resize((2, nitems)) trainfinal_ids = final_ids[0] valfinal_ids = final_ids[1] xtrain, ytrain, minv, maxv = get_data(dataset, trainfinal_ids, options) xval, yval = get_data(dataset, valfinal_ids, options, isFirst=False, minvalue=minv, maxvalue=maxv)[0:2] xtest, ytest = get_data(dataset, 
test_ids, options, isFirst=True, minvalue=minv, maxvalue=maxv)[0:2] trainFinal = (xtrain, ytrain) valFinal = (xval, yval) testFinal = (xtest, ytest) print >> sys.stderr, test_ids if options['verbose'] > 0: print 'Train set with size %d ' % (ytrain.shape.eval()) print 'Val set with size %d ' % (yval.shape.eval()) print 'Test set with size %d ' % (ytest.shape.eval()) if options['verbose'] > 5: for cls in range(0, 2): print >> sys.stderr, "\tNumber of training elements for cls {0:02d} is {1:05d}".format( cls, sum(ytrain.eval() == cls)) print >> sys.stderr, "\tNumber of validation elements for cls {0:02d} is {1:05d}".format( cls, sum(yval.eval() == cls)) print >> sys.stderr, "\tNumber of testing elements for cls {0:02d} is {1:05d}".format( cls, sum(ytest.eval() == cls)) basefilename = '{0:s}/{1:05d}_{2:03d}_'.format( options['outputfolder'], nrun, string.atoi(options['resolution'])) trainfilename = basefilename + 'train_ids.pkl.gz' valfilename = basefilename + 'val_ids.pkl.gz' trainfinalfilename = basefilename + 'trainfinal_ids.pkl.gz' valfinalfilename = basefilename + 'valfinal_ids.pkl.gz' testfilename = basefilename + 'test_ids.pkl.gz' save_gzdata(trainfilename, train_ids) save_gzdata(valfilename, val_ids) save_gzdata(trainfinalfilename, trainfinal_ids) save_gzdata(valfinalfilename, valfinal_ids) save_gzdata(testfilename, test_ids) if options['verbose'] > 0: print 'Train set with size %d' % (trainFinal[1].shape.eval()) print 'Val set with size %d' % (valFinal[1].shape.eval()) print 'Test set with size %d' % (testFinal[1].shape.eval()) rval = [trainval, valval, testval, trainFinal, valFinal, testFinal] return rval