checkpoint = 100 #Create master socket to recieve information from clients socket = masterSocket() #Step 1: Both Queen and Clients make their own Peaknet instances. peaknet = Peaknet() lr = 0.001 #Step 2: Queen loads DN weights peaknet.loadCfg("/reg/neh/home/liponan/ai/pytorch-yolo2/cfg/newpeaksv10-asic.cfg") # FIXME: load latest weights peaknet.init_model() #peaknet.model = torch.load("/reg/d/psdm/cxi/cxic0415/res/liponan/antfarm_backup/api_demo_psana_model_000086880") if runMasterOnGPU: peaknet.model.cuda() peaknet.set_optimizer(adagrad=True, lr=lr) #Communication with clients begins. kk = 0 outdir = "/reg/d/psdm/cxi/cxic0415/res/liponan/antfarm_backup" while 1: try: print("waiting for worker...") val = socket.pull() print("#### master pulled: ", val) except zmq.error.Again: break print("^^^^^^^^^^^^^^^^^^^^: ", val)
# NOTE(review): fragment of a training loop — the enclosing `for u ...` /
# `for t ...` headers are outside this view, so the indentation below is a
# best-effort reconstruction; confirm against the full file.
my_r = y_label[u] % 185  # presumably row within a 185-pixel ASIC panel — TODO confirm
my_c = x_label[u] % 388  # presumably column within a 388-pixel ASIC panel — TODO confirm
s[u] = my_s
r[u] = my_r
c[u] = my_c
# Per-event label tuple: (class, panel index, row, col, box height, box width) — verify
labels = (cls, s, r, c, bh, bw)
#print(labels)
# Accumulate images until a macro batch is full.
if t % batch_size == 0:
    batch_imgs = imgs
else:
    batch_imgs = np.concatenate( (batch_imgs, imgs), axis=0 )
batch_labels.append( labels )
t2 = time.time()
#print("data proceessing time", t2-t1)
# Train once per full batch, or on the final (possibly short) batch.
if t % batch_size == (batch_size-1) or t == (dataset_hits-1):
    pn.set_optimizer(adagrad=set_algo=="ada", lr=set_lr )
    pn.train( batch_imgs, batch_labels, mini_batch_size=32*3, use_cuda=True )
    # pn.optimize( optimizer )
    t5 = time.time()
    print("time per event", 1.0*(t5-t0)/batch_size)
# NOTE(review): the statements below look like end-of-epoch work (they use
# `ep`), so they may belong after the event loop — confirm placement.
pn.model.save_weights( "results/weights/" + project + "_ep"+str(ep+1)+".weights" )
img, label = load_from_cxi( filename_valid, idx_valid )
# pn.valid( #TODO
#model.load_weights( "results/cxic0415_0091_ep"+str(ep)+".weights" )
#model_dict = dict( model.named_parameters() )
#for key, value in model_dict.items():
#    #model_dict[key].grad.data = grad[key].data
#    print(key)
### Peaknet setup ### net = Peaknet() net.loadCfg("/reg/neh/home/liponan/ai/pytorch-yolo2/cfg/newpeaksv10-asic.cfg") net.init_model() net.model print("done model setup") ##################### context = zmq.Context() socket = context.socket(zmq.REP) socket.bind("tcp://*:5556") while True: # Wait for next request from client message = socket.recv_pyobj() grads, delta = message # 'messsage' always has two components print("Received request. delta:", delta) # let's not to print out the grads if delta > 0: # delta = net.set_optimizer( adagrad=True) # number of images trained in the last iteration net.updateGrad(grads=grads, delta=delta, useGPU=False) net.optimize() print("imgs seen:", net.model.seen) if net.model.seen % n_validate == 0 and net.model.seen > 0: socket.send_pyobj(["validate", net.model]) else: socket.send_pyobj(["train", net.model])
class Trainer(object):
    """Queen-side trainer: samples experiment runs from a training list,
    streams detector images via psana, trains a Peaknet model, and validates
    against a fixed event list.

    NOTE(review): relies on module-level helpers defined elsewhere in this
    file (Peaknet, get_train_df, load_cxi_labels_yxhw, psana_img_loader) and
    on the psana / pandas / numpy / SummaryWriter imports.
    """

    def __init__(self, params):
        # params: dict of run configuration (macro_batch_size, optim, lr,
        # n_save, n_policy, skip_trained, n_train_push, project_name, ...).
        self.params = params
        # get val list
        self.get_val_list()
        # get training list
        self.get_train_list()
        # set-up Peaknet
        self.setup_peaknet()
        self.grad = None          # last gradient pulled from the net
        self.delta = 0            # images trained in the last push
        self.psana_ready = False  # a usable psana run is currently open
        self.cxi_ready = False    # labels for the current run are loaded
        self.writer = SummaryWriter("runs/" + params["project_name"])

    def get_train_list(self):
        """Load (or rebuild) the training-run DataFrame into self.df_train."""
        if self.params["build_train_list"]:
            self.df_train = get_train_df(
                cxi_path="/reg/d/psdm/cxi/cxitut13/res/autosfx",
                val_csv="/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_val.csv",
                test_csv="/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_test.csv")
        else:
            self.df_train = pd.read_csv(
                "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_train.csv",
                index_col=0)
        print("training list", len(self.df_train))

    def get_val_list(self, n=1000):
        """Load the fixed validation event list and derive its unique runs.

        NOTE(review): `n` is currently unused — the CSV is pre-cut to 1000
        events; the parameter is kept for interface compatibility.
        """
        self.df_val = pd.read_csv(
            "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_val_events_1000.csv")
        self.df_val = self.df_val.sort_values(by=["exp", "run", "event"])
        print("validation list", len(self.df_val), "events")
        self.df_val_runs = self.df_val[["exp", "run", "path"]].drop_duplicates()
        print("validation list", len(self.df_val_runs), "runs")

    def setup_peaknet(self, model=None):
        """Create the Peaknet; adopt `model` if given, else init fresh weights."""
        self.net = Peaknet()
        self.net.loadCfg(
            "/reg/neh/home/liponan/ai/pytorch-yolo2/cfg/newpeaksv10-asic.cfg")
        if model is None:
            self.net.init_model()
        else:
            self.net.model = model
        self.net.model.cuda()

    def get_grads(self):
        """Return the most recent gradients held by the network."""
        return self.net.getGrad()

    def validate(self):
        """Evaluate the model over all validation runs; log weighted mean recall."""
        print(
            "=========================================== VAL ==========================================="
        )
        macro_batch_size = self.params["macro_batch_size"]
        seen = 0
        overall_recall = 0
        for i in range(len(self.df_val_runs)):
            exp, run, path = self.df_val_runs.iloc[i][["exp", "run", "path"]]
            try:
                ds = psana.DataSource("exp=" + exp + ":run=" + str(run) + ":idx")
                det = psana.Detector('DscCsPad')
                this_run = ds.runs().next()  # psana stack is Python 2; .next() intended
                times = this_run.times()
                print("*********************** {}-{} OKAY ***********************"
                      .format(exp, run))
            except Exception:  # narrowed from bare except: keep SystemExit/KeyboardInterrupt
                print("{}-{} not avaiable".format(exp, run))
                continue
            sub_events = self.df_val.query(
                "exp == '{}' and run == '{}'".format(exp, run))["event"]
            labels, eventIdxs = load_cxi_labels_yxhw(path, total_size=-1)
            # Keep only events that belong to the validation subset.
            labels = [labels[k] for k in range(len(labels))
                      if eventIdxs[k] in sub_events]
            eventIdxs = [eventIdxs[k] for k in range(len(eventIdxs))
                         if eventIdxs[k] in sub_events]
            print("labels", len(labels), "eventIdxs", len(eventIdxs))
            n_iters = int(np.ceil(len(labels) / float(macro_batch_size)))
            print("# iterations", n_iters)
            for j in range(n_iters):
                idx_offset = j * macro_batch_size
                if j == (n_iters - 1):
                    # final (possibly short) macro batch
                    n = len(labels) - j * macro_batch_size
                    batch_imgs = psana_img_loader(eventIdxs, idx_offset, n,
                                                  det, this_run, times)
                    batch_labels = labels[(j * macro_batch_size):]
                else:
                    n = macro_batch_size
                    batch_imgs = psana_img_loader(eventIdxs, idx_offset,
                                                  macro_batch_size, det,
                                                  this_run, times)
                    batch_labels = labels[j * macro_batch_size:(j + 1) * macro_batch_size]
                batch_imgs[batch_imgs < 0] = 0              # clamp negative pixels
                batch_imgs = batch_imgs / batch_imgs.max()  # scale to [0, 1]
                my_recall = self.net.validate(
                    batch_imgs, batch_labels,
                    mini_batch_size=macro_batch_size * 32)
                print("my recall", my_recall)
                overall_recall += n * my_recall  # weight recall by batch size
                seen += n
        # NOTE(review): raises ZeroDivisionError if no validation run was readable.
        overall_recall /= (1.0 * seen)
        self.writer.add_scalar('recall_val', overall_recall, self.net.model.seen)
        print(
            "----------------------------------------- END VAL -----------------------------------------"
        )

    def train(self):
        """Train on one pushed chunk (`n_train_push` macro-batches) of a run,
        opening a fresh psana run / label set whenever the previous one is
        exhausted or unreadable.
        """
        # params
        macro_batch_size = self.params["macro_batch_size"]
        algo = self.params["optim"]
        my_lr = self.params["lr"]
        n_check = self.params["n_check"]    # NOTE(review): currently unused
        n_save = self.params["n_save"]
        n_policy = self.params["n_policy"]  # seen-counts at which lr drops 10x
        skip_trained = self.params["skip_trained"]
        p_skip = self.params["p_skip"]      # NOTE(review): currently unused
        n_train_push = self.params["n_train_push"]
        # Sample random runs until psana can actually open one.
        while not self.psana_ready:
            self.exp, self.run, self.path = self.df_train.sample(1).iloc[0][[
                "exp", "run", "path"
            ]]
            time.sleep(1)
            try:
                self.ds = psana.DataSource("exp=" + self.exp + ":run=" +
                                           str(self.run) + ":idx")
                self.det = psana.Detector(
                    'DscCsPad')  # FIXME: could be other CsPad?
                self.this_run = self.ds.runs().next()
                self.times = self.this_run.times()
                print("*********************** {}-{} OKAY ***********************"
                      .format(self.exp, self.run))
            except Exception:  # narrowed from bare except
                print("{}-{} not avaiable".format(self.exp, self.run))
                continue
            if skip_trained:
                # BUG FIX: this block previously used bare `exp`/`run`, which
                # are undefined locals in this method (NameError); the values
                # live on self.
                log_filename = os.path.join(
                    "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/train_log",
                    "{}_{}".format(self.exp, self.run))
                if os.path.isfile(log_filename):
                    continue  # already trained on this run — pick another
                else:
                    # touch a marker file so this run is not retrained later
                    with open(log_filename, 'a'):
                        os.utime(log_filename, None)
            self.psana_ready = True
            self.j_iter = 0
        print("end of psana test")
        if not self.cxi_ready:
            self.labels, self.eventIdxs = load_cxi_labels_yxhw(self.path,
                                                               total_size=-1)
            print("labels", len(self.labels), "eventIdxs", len(self.eventIdxs))
            self.n_iters = int(
                np.floor(len(self.labels) / float(macro_batch_size)))
            print("# iterations", self.n_iters)
        self.net.set_optimizer(adagrad=(algo == "adagrad"), lr=my_lr)
        for j in range(self.j_iter, self.j_iter + n_train_push):  # was n_iters
            self.delta = n_train_push
            if self.j_iter == self.n_iters - 1:
                # run exhausted -> force a new run + labels on the next call
                self.psana_ready = False
                self.cxi_ready = False
            idx_offset = j * macro_batch_size
            n = macro_batch_size
            batch_imgs = psana_img_loader(self.eventIdxs, idx_offset,
                                          macro_batch_size, self.det,
                                          self.this_run, self.times)
            batch_labels = self.labels[j * macro_batch_size:(j + 1) * macro_batch_size]
            self.net.set_optimizer(adagrad=(algo == "adagrad"), lr=my_lr)
            batch_imgs[batch_imgs < 0] = 0              # clamp negative pixels
            batch_imgs = batch_imgs / batch_imgs.max()  # scale to [0, 1]
            self.net.train(batch_imgs, batch_labels,
                           mini_batch_size=macro_batch_size * 32)
            self.grad = self.net.getGrad()
            if self.net.model.seen % n_save == 0:
                self.net.snapshot(batch_imgs, batch_labels,
                                  tag="antfarm_zmq_trainer")
                print("snapshot saved")
            if self.net.model.seen in n_policy:
                my_lr /= 10.0
                # NOTE(review): uses self.net.writer here but self.writer in
                # validate() — confirm Peaknet carries its own writer.
                self.net.writer.add_scalar('lr', my_lr, self.net.model.seen)
            self.j_iter += 1