Example #1
0
# NOTE(review): this file is a concatenation of example snippets; names such as
# masterSocket, Peaknet, runMasterOnGPU and zmq are defined elsewhere.
checkpoint = 100

#Create master socket to receive information from clients
socket = masterSocket()

#Step 1: Both Queen and Clients make their own Peaknet instances.
peaknet = Peaknet()
lr = 0.001  # learning rate for the master optimizer

#Step 2: Queen loads DN weights
peaknet.loadCfg("/reg/neh/home/liponan/ai/pytorch-yolo2/cfg/newpeaksv10-asic.cfg") # FIXME: load latest weights
peaknet.init_model()
#peaknet.model = torch.load("/reg/d/psdm/cxi/cxic0415/res/liponan/antfarm_backup/api_demo_psana_model_000086880")

if runMasterOnGPU: peaknet.model.cuda()
peaknet.set_optimizer(adagrad=True, lr=lr)

#Communication with clients begins.
kk = 0
outdir = "/reg/d/psdm/cxi/cxic0415/res/liponan/antfarm_backup"

# Pull worker messages until the socket times out (zmq.error.Again).
while 1:
    try:
        print("waiting for worker...")
        val = socket.pull()
        print("#### master pulled: ", val)
    except zmq.error.Again:
        # no worker message within the socket timeout -- stop serving
        break

    print("^^^^^^^^^^^^^^^^^^^^: ", val)
# NOTE(review): the lines below are an orphaned fragment from a different
# snippet (mixed tab/space indentation, references to names never defined in
# this view: y_label, x_label, u, pn, ep, project, t, batch_size, imgs, ...).
# They do not parse as-is and are preserved verbatim for reference only.
	    my_r = y_label[u] % 185
	    my_c = x_label[u] % 388
	    s[u] = my_s
	    r[u] = my_r
	    c[u] = my_c
        labels = (cls, s, r, c, bh, bw)
        #print(labels)
        if t % batch_size == 0:
            batch_imgs = imgs
        else:
            batch_imgs = np.concatenate( (batch_imgs, imgs), axis=0 )
        batch_labels.append( labels )
        t2 = time.time()
        #print("data proceessing time", t2-t1)
        if t % batch_size == (batch_size-1) or t == (dataset_hits-1):   
            pn.set_optimizer(adagrad=set_algo=="ada", lr=set_lr )
        pn.train( batch_imgs, batch_labels, mini_batch_size=32*3, use_cuda=True )
#         pn.optimize( optimizer )
        
        t5 = time.time()
        print("time per event", 1.0*(t5-t0)/batch_size)

    pn.model.save_weights( "results/weights/" + project + "_ep"+str(ep+1)+".weights" )

    img, label = load_from_cxi( filename_valid, idx_valid )
#     pn.valid( #TODO 
            #model.load_weights( "results/cxic0415_0091_ep"+str(ep)+".weights" )
            #model_dict = dict( model.named_parameters() )
            #for key, value in model_dict.items():
            #    #model_dict[key].grad.data = grad[key].data
            #     print(key)
### Peaknet setup ###

net = Peaknet()
net.loadCfg("/reg/neh/home/liponan/ai/pytorch-yolo2/cfg/newpeaksv10-asic.cfg")
net.init_model()
net.model  # NOTE(review): no-op attribute access -- likely a REPL leftover
print("done model setup")

#####################

# REP socket: each worker request carries (grads, delta); the reply tells the
# worker whether to train or validate next and ships the current model back.
context = zmq.Context()
socket = context.socket(zmq.REP)
socket.bind("tcp://*:5556")

while True:
    #  Wait for next request from client
    message = socket.recv_pyobj()
    grads, delta = message  # 'message' always has two components
    print("Received request. delta:",
          delta)  # let's not print out the grads
    if delta > 0:  # delta > 0 means the worker actually trained on images
        net.set_optimizer(
            adagrad=True)
        net.updateGrad(grads=grads, delta=delta, useGPU=False)
        net.optimize()
    print("imgs seen:", net.model.seen)
    # every n_validate images (n_validate defined elsewhere), ask the worker
    # to validate instead of train
    if net.model.seen % n_validate == 0 and net.model.seen > 0:
        socket.send_pyobj(["validate", net.model])
    else:
        socket.send_pyobj(["train", net.model])
Example #4
0
class Trainer(object):
    """Antfarm trainer: samples experiment runs via psana, trains a Peaknet
    model on CXI peak labels, and exposes the accumulated gradients."""
    def __init__(self, params):
        """Store hyper-parameters, build val/train lists, and set up Peaknet.

        params: dict of settings read by the other methods, e.g.
        "macro_batch_size", "optim", "lr", "n_save", "n_policy",
        "skip_trained", "n_train_push", "project_name", "build_train_list".
        """
        self.params = params
        # get val list
        self.get_val_list()
        # get training list
        self.get_train_list()
        # set-up Peaknet
        self.setup_peaknet()
        self.grad = None  # latest gradients from the model (set in train())
        self.delta = 0  # size of the last training push (set in train())
        self.psana_ready = False  # True once a psana run is open and cached
        self.cxi_ready = False  # True once CXI labels for that run are loaded
        self.writer = SummaryWriter("runs/" + params["project_name"])

    def get_train_list(self):
        """Populate ``self.df_train`` with the training list.

        Either builds it via ``get_train_df`` (when
        ``params["build_train_list"]`` is truthy) or loads the pre-built CSV.
        """
        build = self.params["build_train_list"]
        if not build:
            # fast path: load the previously generated training list
            self.df_train = pd.read_csv(
                "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_train.csv",
                index_col=0)
        else:
            # rebuild from the autosfx CXI directory, with val/test CSVs
            # supplied so those events can be handled separately
            self.df_train = get_train_df(
                cxi_path="/reg/d/psdm/cxi/cxitut13/res/autosfx",
                val_csv=
                "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_val.csv",
                test_csv=
                "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_test.csv"
            )
        print("training list", len(self.df_train))

    def get_val_list(self, n=1000):
        """Load the fixed validation event list and derive its unique runs.

        Sets ``self.df_val`` (sorted by exp/run/event) and
        ``self.df_val_runs`` (one row per distinct exp/run/path).
        ``n`` is accepted for backward compatibility but is not used.
        """
        csv_path = "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/df_val_events_1000.csv"
        events = pd.read_csv(csv_path).sort_values(by=["exp", "run", "event"])
        self.df_val = events
        print("validation list", len(events), "events")
        runs = events[["exp", "run", "path"]].drop_duplicates()
        self.df_val_runs = runs
        print("validation list", len(runs), "runs")

    def setup_peaknet(self, model=None):
        """Create the Peaknet wrapper and move its model to the GPU.

        model: optional pre-built model to attach; when None a fresh model is
        initialized from the cfg file.
        """
        net = Peaknet()
        net.loadCfg(
            "/reg/neh/home/liponan/ai/pytorch-yolo2/cfg/newpeaksv10-asic.cfg")
        if model is not None:
            net.model = model
        else:
            net.init_model()
        net.model.cuda()
        self.net = net
        #self.net.set_writer(project_name=self.params["project_name"], parameters=self.params)

    def get_grads(self):
        return self.net.getGrad()

    def validate(self):
        """Run validation over every run in ``self.df_val_runs``.

        Streams detector images via psana, scores recall per macro-batch with
        ``self.net.validate``, and logs the event-weighted mean recall to
        the writer under 'recall_val'.  Runs psana cannot open are skipped.
        Raises ZeroDivisionError if no event could be validated (seen == 0).
        """
        print(
            "=========================================== VAL ==========================================="
        )
        macro_batch_size = self.params["macro_batch_size"]
        seen = 0  # total number of validated events
        overall_recall = 0  # recall accumulator, weighted by batch size
        # validation
        for i in range(len(self.df_val_runs)):
            exp, run, path = self.df_val_runs.iloc[i][["exp", "run", "path"]]
            try:
                ds = psana.DataSource("exp=" + exp + ":run=" + str(run) +
                                      ":idx")
                det = psana.Detector('DscCsPad')
                # NOTE(review): .next() is Python-2 style iteration
                this_run = ds.runs().next()
                times = this_run.times()
                print(
                    "*********************** {}-{} OKAY ***********************"
                    .format(exp, run))

            except:
                # run not readable via psana -- skip it
                print("{}-{} not avaiable".format(exp, run))
                continue
            # events of this run that belong to the validation list
            sub_events = self.df_val.query(
                "exp == '{}' and run == '{}'".format(exp, run))["event"]
            #             print(sub_events)
            #             print("path", path)
            labels, eventIdxs = load_cxi_labels_yxhw(path, total_size=-1)
            # keep only labels/indices whose event is in the validation subset
            # NOTE(review): comprehension variable i shadows the outer loop
            # index (harmless: i is reassigned at the top of each iteration)
            labels = [
                labels[i] for i in range(len(labels))
                if eventIdxs[i] in sub_events
            ]
            eventIdxs = [
                eventIdxs[i] for i in range(len(eventIdxs))
                if eventIdxs[i] in sub_events
            ]
            print("labels", len(labels), "eventIdxs", len(eventIdxs))
            n_iters = int(np.ceil(len(labels) / float(macro_batch_size)))
            print("# iterations", n_iters)

            for j in range(n_iters):
                idx_offset = j * macro_batch_size
                if j == (n_iters - 1):
                    # last (possibly short) batch takes the remainder
                    n = len(labels) - j * macro_batch_size
                    batch_imgs = psana_img_loader(eventIdxs, idx_offset, n,
                                                  det, this_run, times)
                    batch_labels = labels[(j * macro_batch_size):]
                else:
                    n = macro_batch_size
                    batch_imgs = psana_img_loader(eventIdxs, idx_offset,
                                                  macro_batch_size, det,
                                                  this_run, times)
                    batch_labels = labels[j * macro_batch_size:(j + 1) *
                                          macro_batch_size]
                # clamp negatives, then scale to [0, 1] by the batch max
                batch_imgs[batch_imgs < 0] = 0
                batch_imgs = batch_imgs / batch_imgs.max()
                my_recall = self.net.validate(
                    batch_imgs,
                    batch_labels,
                    mini_batch_size=macro_batch_size * 32)
                print("my recall", my_recall)
                overall_recall += n * my_recall
                seen += n
        overall_recall /= (1.0 * seen)
        self.writer.add_scalar('recall_val', overall_recall,
                               self.net.model.seen)
        print(
            "----------------------------------------- END VAL -----------------------------------------"
        )

    def train(self):
        """Run one training push against a sampled experiment run.

        Samples runs until psana can open one (cached across calls via
        ``self.psana_ready``), loads CXI peak labels once per run (cached via
        ``self.cxi_ready``), then trains ``n_train_push`` macro-batches and
        stores the accumulated gradients in ``self.grad``.

        Side effects: updates self.grad, self.delta, self.j_iter and the
        caching flags; may save a snapshot and log scalars to the writer.
        """
        # -- hyper-parameters -------------------------------------------------
        macro_batch_size = self.params["macro_batch_size"]
        algo = self.params["optim"]
        my_lr = self.params["lr"]
        n_check = self.params["n_check"]  # NOTE(review): read but unused here
        n_save = self.params["n_save"]
        n_policy = self.params["n_policy"]
        skip_trained = self.params["skip_trained"]
        p_skip = self.params["p_skip"]  # NOTE(review): read but unused here
        n_train_push = self.params["n_train_push"]
        # training
        #self.nets[0].set_writer(project_name=self.params["project_name"], parameters=self.params)
        #self.nets[0].writer.add_scalar('lr', my_lr, self.nets[0].model.seen)

        # -- pick a run that psana can actually open --------------------------
        while not self.psana_ready:
            self.exp, self.run, self.path = self.df_train.sample(1).iloc[0][[
                "exp", "run", "path"
            ]]
            time.sleep(1)
            try:
                self.ds = psana.DataSource("exp=" + self.exp + ":run=" +
                                           str(self.run) + ":idx")
                self.det = psana.Detector(
                    'DscCsPad')  #FIXME: could be other CsPad?
                self.this_run = self.ds.runs().next()
                self.times = self.this_run.times()
                print(
                    "*********************** {}-{} OKAY ***********************"
                    .format(self.exp, self.run))
            except:
                print("{}-{} not avaiable".format(self.exp, self.run))
                continue
            if skip_trained:
                # BUG FIX: was `format(exp, run)`, which raised NameError --
                # the sampled values live on self.
                log_filename = os.path.join(
                    "/reg/d/psdm/cxi/cxic0415/res/liponan/peaknet4antfarm/train_log",
                    "{}_{}".format(self.exp, self.run))
                if os.path.isfile(log_filename):
                    continue  # already trained on this run; sample another
                else:
                    # touch the log file to mark this run as claimed
                    with open(log_filename, 'a'):
                        os.utime(log_filename, None)
            self.psana_ready = True
            self.j_iter = 0
            print("end of psana test")

        #self.net.writer.add_text("EXP-RUN", "{}-{}".format(exp, run), self.net.model.seen)

        # -- load CXI labels once per run -------------------------------------
        if not self.cxi_ready:
            self.labels, self.eventIdxs = load_cxi_labels_yxhw(self.path,
                                                               total_size=-1)
            print("labels", len(self.labels), "eventIdxs", len(self.eventIdxs))
            self.n_iters = int(
                np.floor(len(self.labels) / float(macro_batch_size)))
            print("# iterations", self.n_iters)
            # BUG FIX: the flag was never set, so the labels were re-parsed on
            # every call even though the run had not changed.
            self.cxi_ready = True

        self.net.set_optimizer(adagrad=(algo == "adagrad"), lr=my_lr)

        # -- train n_train_push macro-batches ---------------------------------
        for j in range(self.j_iter, self.j_iter + n_train_push):  # was n_iters
            self.delta = n_train_push
            if self.j_iter == self.n_iters - 1:
                # run exhausted: force a fresh sample + label reload next call
                self.psana_ready = False
                self.cxi_ready = False

            idx_offset = j * macro_batch_size
            batch_imgs = psana_img_loader(self.eventIdxs, idx_offset,
                                          macro_batch_size, self.det,
                                          self.this_run, self.times)
            batch_labels = self.labels[j * macro_batch_size:(j + 1) *
                                       macro_batch_size]
            # NOTE(review): the optimizer is rebuilt per batch -- kept as-is
            # in case resetting optimizer state is intentional.
            self.net.set_optimizer(adagrad=(algo == "adagrad"), lr=my_lr)
            # clamp negatives, then scale to [0, 1] by the batch max
            batch_imgs[batch_imgs < 0] = 0
            batch_imgs = batch_imgs / batch_imgs.max()
            self.net.train(batch_imgs,
                           batch_labels,
                           mini_batch_size=macro_batch_size * 32)
            self.grad = self.net.getGrad()

            if self.net.model.seen % n_save == 0:
                self.net.snapshot(batch_imgs,
                                  batch_labels,
                                  tag="antfarm_zmq_trainer")
                print("snapshot saved")

            if self.net.model.seen in n_policy:
                my_lr /= 10.0  # step the learning-rate schedule
                self.net.writer.add_scalar('lr', my_lr, self.net.model.seen)
            self.j_iter += 1