def __init__(self, configfile, fillername, identity, ipaddress, port=0, batchsize=None, verbosity=0):
    """Set up a worker that serves LArCV2 thread-filler batches.

    Parameters
    ----------
    configfile : str
        Path to the larcv thread-processor configuration file.
    fillername : str
        Name of the filler/processor block inside the config.
    identity, ipaddress : routing parameters passed to the WorkerService base.
    port : int
        Accepted for interface compatibility; not used here.
    batchsize : int or None
        If given, the data loader is started immediately.
    verbosity : int
        Accepted for interface compatibility; not used here.
    """
    super(LArCV2ThreadIOWorker, self).__init__(identity, ipaddress)
    self.configfile = configfile
    self.fillername = fillername
    self.batchsize = batchsize
    self.larcvloader = LArCVDataset(self.configfile, fillername)
    self.products = {}
    self.compression_level = 4   # zlib level used when packing replies
    self.print_msg_size = False  # enable to log compressed/uncompressed sizes
    self.num_reads = 0
    # defer loader start-up until a batch size is known
    if self.batchsize is not None:
        self.start_dataloader(self.batchsize)
    # parenthesized print: valid on both Python 2 and 3
    print("LArCV2ThreadIOWorker[{}] is loaded.".format(self._identity))
def load_pre_cropped_data(larcvdataset_configfile, batchsize=1):
    """Write a hard-coded ThreadProcessor config and return a LArCVDataset on it.

    NOTE(review): both parameters are currently unused -- the config below is
    fixed (input file, producers) and the caller starts the loader itself.
    They are kept so existing callers keep working.
    """
    larcvdataset_config = """ThreadProcessor: {
      Verbosity:3
      NumThreads: 2
      NumBatchStorage: 2
      RandomAccess: false
      InputFiles: ["/mnt/disk1/nutufts/kmason/data/crop_test.root"]
      ProcessName: ["ADC_valid","ADCmasked_valid","weights_valid","labelsbasic_valid"]
      ProcessType: ["BatchFillerImage2D","BatchFillerImage2D","BatchFillerImage2D","BatchFillerImage2D"]
      ProcessList: {
        weights_valid: {
          Verbosity:3
          ImageProducer: "Weights"
          Channels: [0]
          EnableMirror: false
        }
        ADC_valid: {
          Verbosity:3
          ImageProducer: "ADC"
          Channels: [0]
          EnableMirror: false
        }
        labelsbasic_valid: {
          Verbosity:3
          ImageProducer: "Labels"
          Channels: [0]
          EnableMirror: false
        }
        ADCmasked_valid: {
          Verbosity:3
          ImageProducer: "ADCMasked"
          Channels: [0]
          EnableMirror: false
        }
      }
    }
    """
    with open("larcv_dataloader.cfg", 'w') as f:
        # was `print >> f, ...` (Python-2-only); write + trailing newline is equivalent
        f.write(larcvdataset_config + "\n")
    iotest = LArCVDataset("larcv_dataloader.cfg", "ThreadProcessor")  # , store_eventids=True
    return iotest
def load_pre_cropped_data(larcvdataset_configfile, batchsize=1):
    """We can just use the normal larcvdataset.

    Writes a hard-coded ThreadProcessor config and returns a LArCVDataset on it.
    NOTE(review): both parameters are currently unused (config is fixed and the
    caller starts the loader); kept for interface compatibility.
    """
    larcvdataset_config = """ThreadProcessor: {
      Verbosity:3
      NumThreads: 2
      NumBatchStorage: 2
      RandomAccess: false
      InputFiles: ["test_crops.root"]
      ProcessName: ["target_valid","wire_valid","weights_valid"]
      ProcessType: ["BatchFillerImage2D","BatchFillerImage2D","BatchFillerImage2D"]
      ProcessList: {
        target_valid: {
          Verbosity:3
          ImageProducer: "Target"
          Channels: [1]
          EnableMirror: false
        }
        wire_valid: {
          Verbosity:3
          ImageProducer: "wire"
          Channels: [1]
          EnableMirror: false
        }
        weights_valid: {
          Verbosity:3
          ImageProducer: "Weights"
          Channels: [1]
          EnableMirror: false
        }
      }
    }
    """
    with open("larcv_dataloader.cfg", 'w') as f:
        # was `print >> f, ...` (Python-2-only); write + trailing newline is equivalent
        f.write(larcvdataset_config + "\n")
    iotest = LArCVDataset("larcv_dataloader.cfg", "ThreadProcessor")  # , store_eventids=True
    return iotest
class LArCV2ThreadIOWorker(WorkerService):
    """Worker that serves LArCV2 thread-filler batches over the network.

    On each request it pulls the next batch from a larcv thread filler,
    msgpack-encodes and zlib-compresses each product array, and replies with
    an [identity, key1, blob1, key2, blob2, ...] multipart message.
    """

    def __init__(self, configfile, fillername, identity, ipaddress, port=0, batchsize=None, verbosity=0):
        """Create the worker; if batchsize is given, start the loader now.

        port and verbosity are accepted for interface compatibility but unused.
        """
        super(LArCV2ThreadIOWorker, self).__init__(identity, ipaddress)
        self.configfile = configfile
        self.fillername = fillername
        self.batchsize = batchsize
        self.larcvloader = LArCVDataset(self.configfile, fillername)
        self.products = {}
        self.compression_level = 4   # zlib level used when packing replies
        self.print_msg_size = False  # enable to log message-size diagnostics
        self.num_reads = 0
        if self.batchsize is not None:
            self.start_dataloader(self.batchsize)
        print("LArCV2ThreadIOWorker[{}] is loaded.".format(self._identity))

    def process_message(self, frames):
        """Just a request. Nothing to parse."""
        return True

    def fetch_data(self):
        """Load up the next data set.

        We've already sent out the message, so here we try to hide latency
        while the GPU is running.
        """
        tstart = time.time()
        # wait for any in-flight fill to finish before grabbing the batch
        while self.larcvloader.io._proc.thread_running():
            time.sleep(0.001)
        self.products = self.larcvloader[0]
        # wait again so the next fill (kicked off by the read) completes
        while self.larcvloader.io._proc.thread_running():
            time.sleep(0.001)
        self.num_reads += 1
        print("LArCV2ThreadIOWorker[{}] fetched data. time={} secs. nreads={}".format(
            self._identity, time.time() - tstart, self.num_reads))
        return

    def generate_reply(self):
        """Our job is to return our data set, then load another."""
        self.fetch_data()
        reply = [self._identity]
        totmsgsize = 0.0
        totcompsize = 0.0
        tstart = time.time()
        for key, arr in self.products.items():
            # encode then compress; compression usually shrinks to ~1% of original
            x_enc = msgpack.packb(arr, default=m.encode)
            x_comp = zlib.compress(x_enc, self.compression_level)
            # for debug: inspect compression gains
            if self.print_msg_size:
                encframe = zmq.Frame(x_enc)
                comframe = zmq.Frame(x_comp)
                totmsgsize += len(encframe.bytes)
                totcompsize += len(comframe.bytes)
            reply.append(key.encode('utf-8'))
            reply.append(x_comp)
        if self.print_msg_size:
            print("LArCV2ThreadIOWorker[{}]: size of array portion={} MB (uncompressed {} MB)".format(
                self._identity, totcompsize / 1.0e6, totmsgsize / 1.0e6))
        print("LArCV2ThreadIOWorker[{}]: generate msg in {} secs".format(
            self._identity, time.time() - tstart))
        return reply

    def start_dataloader(self, batchsize):
        """Start the larcv thread filler and block until its manager is up."""
        # assign before printing (original printed the stale/None value)
        self.batchsize = batchsize
        print("LArCV2ThreadIOWorker[{}] starting loader w/ batchsize={}".format(
            self._identity, self.batchsize))
        self.larcvloader.start(self.batchsize)
        print("LArCV2ThreadIOWorker[{}] dataloader ready, loading first product set".format(self._identity))
        while not self.larcvloader.io._proc.manager_started():
            time.sleep(1.0)
            print("LArCV2ThreadIOWorker[{}] waiting for larcv_threadio".format(self._identity))
        #self.post_reply() # get first batch
        print("LArCV2ThreadIOWorker[{}] manager started. syncing with client".format(self._identity))
def main():
    """Train LArFlowUResNet: build model/loss/optimizer and larcv feeders,
    then run the iteration loop with periodic validation and checkpointing.

    Relies on module-level configuration globals (GPUMODE, USE_DATA_PARALLEL,
    DEVICE_IDS, DEVICE_IDS_2, RESUME_FROM_CHECKPOINT, CHECKPOINT_FILE,
    CHECKPOINT_MAP_LOCATIONS, USE_VISI, VISI_WEIGHT, TRAIN_LARCV_CONFIG,
    VALID_LARCV_CONFIG, RUNPROFILER, start_iter, ...).
    """
    global best_prec1
    global writer

    # create model, mark it to run on the GPU
    model = LArFlowUResNet(inplanes=22, input_channels=1, num_classes=2,
                           showsizes=False, use_visi=USE_VISI)
    if GPUMODE:
        if USE_DATA_PARALLEL:
            # split the net: encoder on one device group, decoders on another
            model.encoder = nn.DataParallel(model.encoder, device_ids=DEVICE_IDS)
            model.decoder1 = nn.DataParallel(model.decoder1, device_ids=DEVICE_IDS_2)
            model.decoder2 = nn.DataParallel(model.decoder2, device_ids=DEVICE_IDS_2)
            model.encoder.cuda(DEVICE_IDS[0])
            model.decoder1.cuda(DEVICE_IDS_2[0])
            model.decoder2.cuda(DEVICE_IDS_2[0])
        else:
            model.cuda(DEVICE_IDS[0])

    # Resume training option
    if RESUME_FROM_CHECKPOINT:
        print("RESUMING FROM CHECKPOINT FILE {}".format(CHECKPOINT_FILE))
        checkpoint = torch.load(CHECKPOINT_FILE, map_location=CHECKPOINT_MAP_LOCATIONS)  # load weights to gpuid
        best_prec1 = checkpoint["best_prec1"]
        #best_prec1 = 0.158
        model.load_state_dict(checkpoint["state_dict"])

    # define loss function (criterion); the original if/else built the same
    # object in both branches, so the branch is dropped
    criterion = LArFlowLoss(VISI_WEIGHT)

    # training parameters
    lr = 1.0e-4
    momentum = 0.9        # unused with Adam; retained from the SGD variant
    weight_decay = 1.0e-4

    # training length
    batchsize_train = 12  # *len(DEVICE_IDS)
    batchsize_valid = 6   # *len(DEVICE_IDS)
    start_epoch = 0
    epochs = 10
    num_iters = 10000
    iter_per_epoch = None  # determined later
    iter_per_valid = 10
    iter_per_checkpoint = 500

    nbatches_per_itertrain = 10
    itersize_train = batchsize_train * nbatches_per_itertrain
    trainbatches_per_print = 100

    nbatches_per_itervalid = 20
    itersize_valid = batchsize_valid * nbatches_per_itervalid
    validbatches_per_print = 100

    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay)

    # optimize algorithms based on input size (good if input size is constant)
    cudnn.benchmark = True

    # LOAD THE DATASET
    iotrain = LArCVDataset(TRAIN_LARCV_CONFIG, "ThreadProcessorTrain")
    iovalid = LArCVDataset(VALID_LARCV_CONFIG, "ThreadProcessorValid")
    iotrain.start(batchsize_train)
    iovalid.start(batchsize_valid)

    NENTRIES = len(iotrain)
    print("Number of entries in training set: {}".format(NENTRIES))
    if NENTRIES > 0:
        # `//` keeps the original Python-2 integer-division behavior
        iter_per_epoch = NENTRIES // itersize_train
        if num_iters is None:
            # we set it by the number of requested epochs
            num_iters = (epochs - start_epoch) * NENTRIES
        else:
            epochs = num_iters // NENTRIES
    else:
        iter_per_epoch = 1
    print("Number of epochs: {}".format(epochs))
    print("Iter per epoch: {}".format(iter_per_epoch))

    with torch.autograd.profiler.profile(enabled=RUNPROFILER) as prof:
        # NOTE(review): start_iter is assumed to be a module-level global -- confirm
        for ii in range(start_iter, num_iters):
            adjust_learning_rate(optimizer, ii, lr)
            # sys.stdout.write replaces the py2 trailing-comma prints
            sys.stdout.write("MainLoop Iter:%d Epoch:%d.%d " % (ii, ii // iter_per_epoch, ii % iter_per_epoch))
            for param_group in optimizer.param_groups:
                sys.stdout.write("lr=%.3e " % (param_group['lr']))
            print("")

            # train for one iteration
            try:
                train_ave_loss, train_ave_acc = train(
                    iotrain, batchsize_train, model, criterion, optimizer,
                    nbatches_per_itertrain, ii, trainbatches_per_print)
                #mem_report()
            except Exception as e:
                print("Error in training routine!")
                print(str(e))
                print(e.__class__.__name__)
                traceback.print_exc()  # fixed: print_exc takes no exception argument
                break
            print("Train Iter:%d Epoch:%d.%d train aveloss=%.3f aveacc=%.3f" % (
                ii, ii // iter_per_epoch, ii % iter_per_epoch,
                train_ave_loss, train_ave_acc))

            # evaluate on validation set
            if ii % iter_per_valid == 0:
                try:
                    prec1 = validate(iovalid, batchsize_valid, model, criterion,
                                     nbatches_per_itervalid, validbatches_per_print, ii)
                except Exception as e:
                    print("Error in validation routine!")
                    print(str(e))
                    print(e.__class__.__name__)
                    traceback.print_exc()
                    break

                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)

                # check point for best model
                if is_best:
                    print("Saving best model")
                    save_checkpoint({
                        'iter': ii,
                        'epoch': ii // iter_per_epoch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, -1)

            # periodic checkpoint
            if ii > 0 and ii % iter_per_checkpoint == 0:
                print("saving periodic checkpoint")
                save_checkpoint({
                    'iter': ii,
                    'epoch': ii // iter_per_epoch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, False, ii)

            # flush the print buffer after iteration
            sys.stdout.flush()
import os, sys
import ROOT as rt
from larcv import larcv
from uresnet import UResNet
from larcvdataset import LArCVDataset

# Smoke test: pull a single batch through the larcv thread filler and dump it.
#net = UResNet( num_classes=3, input_channels=1, inplanes=16 )
# we load in a test image
#iotest = LArCVDataset("test_dataloader.cfg", "ThreadProcessorTest")
iotest = LArCVDataset("test_threadfiller.cfg", "ThreadProcessorTest")
iotest.start(1)      # batch size of one entry
data = iotest[0]     # fetch one batch
print(data)
#print net
iotest.stop()        # shut down the filler threads
def main():
    """Train UResNet for pixel-wise classification.

    Builds model/loss/optimizer and larcv feeders, then runs the iteration
    loop with periodic validation and checkpointing.  Relies on module-level
    configuration globals (GPUMODE, DEVICE, DEVICE_IDS, NCLASSES,
    RESUME_FROM_CHECKPOINT, CHECKPOINT_FILE, CHECKPOINT_MAP_LOCATIONS,
    CHECKPOINT_FROM_DATA_PARALLEL, TRAIN_LARCV_CONFIG, VALID_LARCV_CONFIG,
    RUNPROFILER, start_iter, ...).
    """
    global best_prec1
    global writer

    # create model, mark it to run on the GPU
    if GPUMODE:
        model = UResNet(inplanes=32, input_channels=1, num_classes=NCLASSES, showsizes=False)
        model.to(device=torch.device(DEVICE))  # put onto gpuid
    else:
        model = UResNet(inplanes=32, input_channels=1, num_classes=NCLASSES)

    # Resume training option
    if RESUME_FROM_CHECKPOINT:
        print("RESUMING FROM CHECKPOINT FILE {}".format(CHECKPOINT_FILE))
        checkpoint = torch.load(CHECKPOINT_FILE, map_location=CHECKPOINT_MAP_LOCATIONS)  # load weights to gpuid
        best_prec1 = checkpoint["best_prec1"]
        if CHECKPOINT_FROM_DATA_PARALLEL:
            # checkpoint keys carry the DataParallel prefix, so wrap first
            model = nn.DataParallel(model, device_ids=DEVICE_IDS)
        model.load_state_dict(checkpoint["state_dict"])

    if not CHECKPOINT_FROM_DATA_PARALLEL and len(DEVICE_IDS) > 1:
        model = nn.DataParallel(model, device_ids=DEVICE_IDS)  # distribute across device_ids

    # uncomment to dump model
    print("Loaded model: {}".format(model))

    # define loss function (criterion) and optimizer
    criterion = PixelWiseNLLLoss()
    if GPUMODE:
        criterion.to(device=torch.device(DEVICE))

    # training parameters
    lr = 1.0e-5
    momentum = 0.9        # unused with Adam; retained from the SGD variant below
    weight_decay = 1.0e-4

    # training length
    if "cuda" in DEVICE:
        batchsize_train = 4 * len(DEVICE_IDS)
        batchsize_valid = 2 * len(DEVICE_IDS)
    else:
        batchsize_train = 4
        batchsize_valid = 2
    start_epoch = 0
    epochs = 10
    num_iters = 30000
    iter_per_epoch = None  # determined later
    iter_per_valid = 10
    iter_per_checkpoint = 500

    nbatches_per_itertrain = 20
    itersize_train = batchsize_train * nbatches_per_itertrain
    trainbatches_per_print = 100

    nbatches_per_itervalid = 40
    itersize_valid = batchsize_valid * nbatches_per_itervalid
    validbatches_per_print = 100

    # SETUP OPTIMIZER
    # SGD w/ momentum
    #optimizer = torch.optim.SGD(model.parameters(), lr,
    #                            momentum=momentum,
    #                            weight_decay=weight_decay)
    # ADAM: betas default (0.9, 0.999) for (grad, grad^2) --
    # smoothing coefficient for grad magnitude calc.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=weight_decay)

    # optimize algorithms based on input size (good if input size is constant)
    cudnn.benchmark = True

    # LOAD THE DATASET
    iotrain = LArCVDataset(TRAIN_LARCV_CONFIG, "ThreadProcessorTrain")
    iovalid = LArCVDataset(VALID_LARCV_CONFIG, "ThreadProcessorValid")
    iotrain.start(batchsize_train)
    iovalid.start(batchsize_valid)
    iosample = {"valid": iovalid, "train": iotrain}

    NENTRIES = len(iotrain)
    print("Number of entries in training set: {}".format(NENTRIES))
    if NENTRIES > 0:
        # `//` keeps the original Python-2 integer-division behavior
        iter_per_epoch = NENTRIES // itersize_train
        if num_iters is None:
            # we set it by the number of requested epochs
            num_iters = (epochs - start_epoch) * NENTRIES
        else:
            epochs = num_iters // NENTRIES
    else:
        iter_per_epoch = 1
    print("Number of epochs: {}".format(epochs))
    print("Iter per epoch: {}".format(iter_per_epoch))

    if False:
        # for debugging/testing the data feed: dump one batch as pngs
        sample = "train"
        print("TEST BATCH: sample={}".format(sample))
        adc_t, label_t, weight_t = prep_data(iosample[sample], sample, batchsize_train,
                                             IMAGE_WIDTH, IMAGE_HEIGHT, ADC_THRESH)
        print("adc shape: {}".format(adc_t.shape))
        print("label shape: {}".format(label_t.shape))
        print("weight shape: {}".format(weight_t.shape))
        # load opencv, to dump png of image
        import cv2 as cv
        cv.imwrite("testout_adc.png", adc_t.numpy()[0, 0, :, :])
        cv.imwrite("testout_label.png", label_t.numpy()[0, :, :])
        cv.imwrite("testout_weight.png", weight_t.numpy()[0, 0, :, :])
        print("STOP FOR DEBUGGING")
        iotrain.stop()
        iovalid.stop()
        sys.exit(-1)

    with torch.autograd.profiler.profile(enabled=RUNPROFILER) as prof:
        # NOTE(review): start_iter is assumed to be a module-level global -- confirm
        for ii in range(start_iter, num_iters):
            adjust_learning_rate(optimizer, ii, lr)
            # sys.stdout.write replaces the py2 trailing-comma prints
            sys.stdout.write("MainLoop Iter:%d Epoch:%d.%d " % (ii, ii // iter_per_epoch, ii % iter_per_epoch))
            for param_group in optimizer.param_groups:
                sys.stdout.write("lr=%.3e " % (param_group['lr']))
            print("")

            # train for one iteration
            try:
                train_ave_loss, train_ave_acc = train(
                    iotrain, batchsize_train, model, criterion, optimizer,
                    nbatches_per_itertrain, ii, NCLASSES, trainbatches_per_print)
            except Exception as e:
                print("Error in training routine!")
                print(str(e))
                print(e.__class__.__name__)
                traceback.print_exc()  # fixed: print_exc takes no exception argument
                break
            print("Train Iter:%d Epoch:%d.%d train aveloss=%.3f aveacc=%.3f" % (
                ii, ii // iter_per_epoch, ii % iter_per_epoch,
                train_ave_loss, train_ave_acc))

            # evaluate on validation set
            if ii % iter_per_valid == 0:
                try:
                    prec1 = validate(iovalid, batchsize_valid, model, criterion,
                                     nbatches_per_itervalid, validbatches_per_print, ii)
                except Exception as e:
                    print("Error in validation routine!")
                    print(str(e))
                    print(e.__class__.__name__)
                    traceback.print_exc()
                    break

                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)

                # check point for best model
                if is_best:
                    print("Saving best model")
                    save_checkpoint({
                        'iter': ii,
                        'epoch': ii // iter_per_epoch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, -1)

            # periodic checkpoint
            if ii > 0 and ii % iter_per_checkpoint == 0:
                print("saving periodic checkpoint")
                save_checkpoint({
                    'iter': ii,
                    'epoch': ii // iter_per_epoch,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                    'optimizer': optimizer.state_dict(),
                }, False, ii)

            # flush the print buffer after iteration
            sys.stdout.flush()
def main():
    """Iteration-based training of the flow/visibility model.

    Combines a pixel-wise flow loss and a visibility NLL loss (mixed by
    lmbd), tracks separate best-precision metrics for each, and checkpoints
    on the flow metric.  Relies on module-level globals (network, myfunc,
    train, validate, best_prec1_vis, best_prec1_flow, writer).
    """
    global best_prec1_vis
    global best_prec1_flow
    global writer

    model = network.mymodel(num_classes=1, input_channels=1, showsizes=False)
    model.cuda()
    #print "Loaded model: ",model

    # define loss functions (criteria): flow regression + visibility classification
    criterion1 = myfunc.PixelWiseFlowLoss(minval=4).cuda()
    criterion2 = myfunc.PixelWiseNLLLoss().cuda()

    # training parameters
    lmbd = 0.5               # weight mixing the two losses
    lr = 1.0e-4  # -3
    momentum = 0.9
    weight_decay = 1.0e-3
    batchsize_train = 8
    batchsize_valid = 8
    start_epoch = 0
    epochs = 50  # 1500
    nbatches_per_iter = 25
    if len(sys.argv) > 1:
        epochs = int(sys.argv[1])  # optional CLI override
    print("Number of epochs: {}".format(epochs))
    print("Train batch: {}".format(batchsize_train))
    print("# batch per iter: {}".format(nbatches_per_iter))

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)
    cudnn.benchmark = True

    # dataset
    #iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor", loadallinmem=True)
    iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor")
    iovalid = LArCVDataset("valid_dataloader.cfg", "ThreadProcessorTest")
    iotrain.start(batchsize_train)
    iovalid.start(batchsize_valid)

    # nbatch per epoch
    NENTRIES = iotrain.io.fetch_n_entries()
    #NENTRIES=0
    if NENTRIES > 0:
        # `//` keeps the original Python-2 integer-division behavior
        nbatches_per_epoch = NENTRIES // batchsize_train
        nbatches_per_valid = NENTRIES // batchsize_valid
    else:
        nbatches_per_epoch = 1
        nbatches_per_valid = 1
    iter_per_epoch = nbatches_per_epoch // nbatches_per_iter
    iter_per_valid = 5
    iter_per_checkpoint = 150
    num_iters = iter_per_epoch * epochs
    print("Iterations: {}".format(num_iters))

    # Resume training option (disabled)
    if False:
        checkpoint = torch.load("checkpoint.pth.p01.tar")
        best_prec1 = checkpoint["best_prec1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint['optimizer'])

    for ii in range(0, num_iters):
        myfunc.adjust_learning_rate(optimizer, ii, lr)
        # sys.stdout.write replaces the py2 trailing-comma prints
        sys.stdout.write("Iter:%d Epoch:%d.%d " % (ii, ii // iter_per_epoch, ii % iter_per_epoch))
        for param_group in optimizer.param_groups:
            sys.stdout.write("lr=%.3e " % (param_group['lr']))
        print("")

        # train for one iteration (nbatches_per_iter batches)
        try:
            train_ave_loss, train_ave_acc_vis, train_ave_acc_flow = train(
                iotrain, model, criterion1, criterion2, lmbd, optimizer,
                nbatches_per_iter, ii, 10)
        except Exception as e:
            print("Error in training routine!")
            print(str(e))
            print(e.__class__.__name__)
            traceback.print_exc()  # fixed: print_exc takes no exception argument
            break
        print("Iter:%d Epoch [%d.%d] train aveloss=%.3f aveacc_vis=%.3f aveacc_flow=%.3f" % (
            ii, ii // iter_per_epoch, ii % iter_per_epoch,
            train_ave_loss, train_ave_acc_vis, train_ave_acc_flow))

        # evaluate on validation set
        if ii % iter_per_valid == 0:
            try:
                prec1_vis, prec1_flow = validate(iovalid, model, criterion1, criterion2,
                                                 lmbd, nbatches_per_iter, ii, 10)
            except Exception as e:
                print("Error in validation routine!")
                print(str(e))
                print(e.__class__.__name__)
                traceback.print_exc()
                break

            # remember best prec@1 and save checkpoint
            is_best_flow = prec1_flow > best_prec1_flow
            best_prec1_flow = max(prec1_flow, best_prec1_flow)
            is_best_vis = prec1_vis > best_prec1_vis
            best_prec1_vis = max(prec1_vis, best_prec1_vis)

            # check point for best model (keyed on flow accuracy)
            if is_best_flow:
                print("Saving best model")
                myfunc.save_checkpoint({
                    'iter': ii,
                    'epoch': ii // iter_per_epoch,
                    'state_dict': model.state_dict(),
                    'best_prec1_vis': best_prec1_vis,
                    'best_prec1_flow': best_prec1_flow,
                    'optimizer': optimizer.state_dict(),
                }, is_best_flow, -1)

        # periodic checkpoint
        if ii > 0 and ii % iter_per_checkpoint == 0:
            print("saving periodic checkpoint")
            myfunc.save_checkpoint({
                'iter': ii,
                'epoch': ii // iter_per_epoch,
                'state_dict': model.state_dict(),
                'best_prec1_vis': best_prec1_vis,
                'best_prec1_flow': best_prec1_flow,
                'optimizer': optimizer.state_dict(),
            }, False, ii)
def main():
    """Epoch-based training of a resnet14 5-class classifier on larcv data.

    Relies on module-level globals (resnet_example, train, validate,
    adjust_learning_rate, padandcrop, best_prec1).
    """
    global best_prec1

    # create model: loading resnet as defined in the local resnet_example module
    #model = resnet_example.resnet18(pretrained=False, num_classes=5, input_channels=1)
    model = resnet_example.resnet14(pretrained=False, num_classes=5, input_channels=1)
    model.cuda()
    print("Loaded model: {}".format(model))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # training parameters
    lr = 1.0e-3
    momentum = 0.9
    weight_decay = 1.0e-3
    batchsize = 50
    batchsize_valid = 500
    start_epoch = 0
    epochs = 1500
    # `//` keeps the original Python-2 integer-division behavior
    nbatches_per_epoch = 10000 // batchsize
    nbatches_per_valid = 1000 // batchsize_valid

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)
    cudnn.benchmark = True

    # dataset
    iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor", loadallinmem=True)
    iovalid = LArCVDataset("valid_dataloader.cfg", "ThreadProcessorTest")
    iotrain.start(batchsize)
    iovalid.start(batchsize_valid)

    # Resume training option (disabled)
    if False:
        checkpoint = torch.load("checkpoint.pth.p01.tar")
        best_prec1 = checkpoint["best_prec1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint['optimizer'])

    # data-feed debug block (disabled)
    if False:
        data = iotrain[0]
        img = data["image"]
        lbl = data["label"]
        img_np = np.zeros((img.shape[0], 1, 256, 256), dtype=np.float32)
        lbl_np = np.zeros((lbl.shape[0]), dtype=np.int)  # NOTE(review): np.int is removed in numpy>=1.24
        for j in range(img.shape[0]):
            imgtemp = img[j].reshape((256, 256))
            print(imgtemp.shape)
            img_np[j, 0, :, :] = padandcrop(imgtemp)
            lbl_np[j] = np.argmax(lbl[j])
        print("Train label")
        print(lbl_np)
        datatest = iovalid[0]
        imgtest = data["image"]  # NOTE(review): probably meant datatest["image"] -- confirm
        print("Test image shape")
        print(imgtest.shape)
        iotrain.stop()
        iovalid.stop()
        return

    for epoch in range(start_epoch, epochs):
        adjust_learning_rate(optimizer, epoch, lr)
        # sys.stdout.write replaces the py2 trailing-comma prints
        sys.stdout.write("Epoch [%d]: " % (epoch))
        for param_group in optimizer.param_groups:
            sys.stdout.write("lr=%.3e " % (param_group['lr']))
        print("")

        # train for one epoch
        try:
            train_ave_loss, train_ave_acc = train(iotrain, model, criterion, optimizer,
                                                  nbatches_per_epoch, epoch, 50)
        except Exception as e:
            print("Error in training routine!")
            print(str(e))
            print(e.__class__.__name__)
            traceback.print_exc()  # fixed: print_exc takes no exception argument
            break
        print("Epoch [%d] train aveloss=%.3f aveacc=%.3f" % (
            epoch, train_ave_loss, train_ave_acc))

        # evaluate on validation set
        try:
            prec1 = validate(iovalid, model, criterion, nbatches_per_valid, 1)
        except Exception as e:
            print("Error in validation routine!")
            print(str(e))
            print(e.__class__.__name__)
            traceback.print_exc()
            break
def main():
    """Epoch-based training of the flow/visibility model.

    Combines a pixel-wise flow loss and a visibility NLL loss (mixed by
    lmbd).  Relies on module-level globals (network, myfunc, train,
    validate, cv, best_prec1_vis, best_prec1_flow, writer).
    """
    global best_prec1_vis
    global best_prec1_flow
    global writer

    model = network.mymodel(num_classes=1, input_channels=1, showsizes=False)
    model.cuda()
    #print "Loaded model: ",model

    # define loss functions (criteria): flow regression + visibility classification
    criterion1 = myfunc.PixelWiseFlowLoss(minval=4).cuda()
    criterion2 = myfunc.PixelWiseNLLLoss().cuda()

    # training parameters
    lmbd = 0.5               # weight mixing the two losses
    lr = 1.0e-4  # -3
    momentum = 0.9
    weight_decay = 1.0e-3
    batchsize = 8
    batchsize_valid = 8
    start_epoch = 0
    epochs = 50  # 1500
    if len(sys.argv) > 1:
        epochs = int(sys.argv[1])  # optional CLI override
    print("Number of epochs: {}".format(epochs))
    print("Train batch: {}".format(batchsize))

    optimizer = torch.optim.SGD(model.parameters(), lr,
                                momentum=momentum,
                                weight_decay=weight_decay)
    cudnn.benchmark = True

    # dataset
    #iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor", loadallinmem=True)
    iotrain = LArCVDataset("train_dataloader.cfg", "ThreadProcessor")
    iovalid = LArCVDataset("valid_dataloader.cfg", "ThreadProcessorTest")
    iotrain.start(batchsize)
    iovalid.start(batchsize_valid)

    # nbatch per epoch
    NENTRIES = iotrain.io.fetch_n_entries()
    #NENTRIES=0
    if NENTRIES > 0:
        # `//` keeps the original Python-2 integer-division behavior
        nbatches_per_epoch = NENTRIES // batchsize
        nbatches_per_valid = NENTRIES // batchsize_valid
    else:
        nbatches_per_epoch = 1
        nbatches_per_valid = 1

    # Resume training option (disabled)
    if False:
        checkpoint = torch.load("checkpoint.pth.p01.tar")
        best_prec1 = checkpoint["best_prec1"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer.load_state_dict(checkpoint['optimizer'])

    # debug block: dump one entry of each product to pngs (disabled)
    if False:
        data = iotrain[0]
        img = data["imageY"]
        img2 = data["imageU"]
        lbl = data["label"]
        vis = data["match"]
        # (a dead triple-quoted variant that filled (N,1,512,512) batch
        #  buffers was removed here)
        img_np = np.zeros((512, 512), dtype=np.float32)
        img2_np = np.zeros((512, 512), dtype=np.float32)
        lbl_np = np.zeros((512, 512), dtype=np.int)   # NOTE(review): np.int is removed in numpy>=1.24
        vis_np = np.zeros((512, 512), dtype=np.int)
        fvis_np = np.zeros((512, 512), dtype=np.float32)
        for j in range(1):  # img.shape[0]
            img_np[:, :] = img[j].reshape((512, 512))
            img2_np[:, :] = img2[j].reshape((512, 512))
            lbl_np[:, :] = lbl[j].reshape((512, 512))
            vis_np[:, :] = vis[j].reshape((512, 512))
            fvis_np[:, :] = vis[j].reshape((512, 512))
        tar_x_visi = np.multiply(lbl_np, fvis_np)
        abs_tar_x_visi = np.fabs(tar_x_visi)
        thresh = abs_tar_x_visi > 0
        threshint = thresh.astype(int)
        datatest = iovalid[0]
        imgtest = datatest["imageYtest"]
        print("Test image shape")
        print(imgtest.shape)
        cv.imwrite("testout_srcY.png", img_np)
        cv.imwrite("testout_srcU.png", img2_np)
        cv.imwrite("testout_tar.png", lbl_np)
        cv.imwrite("testout_vis.png", fvis_np * 100)
        cv.imwrite("testout_tarXvis.png", tar_x_visi)
        cv.imwrite("testout_abs_tarXvis.png", abs_tar_x_visi * 100)
        cv.imwrite("testout_thresh_tarXvis.png", threshint * 100)
        iotrain.stop()
        iovalid.stop()
        return

    #data = iotrain[0]
    #data2 = iovalid[0]
    for epoch in range(start_epoch, epochs):
        myfunc.adjust_learning_rate(optimizer, epoch, lr)
        # sys.stdout.write replaces the py2 trailing-comma prints
        sys.stdout.write("Epoch [%d]: " % (epoch))
        for param_group in optimizer.param_groups:
            sys.stdout.write("lr=%.3e " % (param_group['lr']))
        print("")

        # train for one epoch
        try:
            train_ave_loss, train_ave_acc_vis, train_ave_acc_flow = train(
                iotrain, model, criterion1, criterion2, lmbd, optimizer,
                nbatches_per_epoch, epoch, 100)
            #train_ave_loss, train_ave_acc_vis, train_ave_acc_flow = train(data, model, criterion1, criterion2, lmbd, optimizer, nbatches_per_epoch, epoch, 50)
        except Exception as e:
            print("Error in training routine!")
            print(str(e))
            print(e.__class__.__name__)
            traceback.print_exc()  # fixed: print_exc takes no exception argument
            break
        print("Epoch [%d] train aveloss=%.3f aveacc_vis=%.3f aveacc_flow=%.3f" % (
            epoch, train_ave_loss, train_ave_acc_vis, train_ave_acc_flow))

        # evaluate on validation set
        try:
            prec1_vis, prec1_flow = validate(iovalid, model, criterion1, criterion2,
                                             lmbd, nbatches_per_valid, epoch, 100)
            #prec1_vis, prec1_flow = validate(data2, model, criterion1, criterion2, lmbd, nbatches_per_valid, epoch, 50)
        except Exception as e:
            print("Error in validation routine!")
            print(str(e))
            print(e.__class__.__name__)
            traceback.print_exc()
            break