Example #1
0
def main(argv=None):
    """Entry point: dispatch train / save_pb / eval in single- or
    multi-GPU flavour, as selected by FLAGS.multigpu ('n' / 'y').
    """
    check_args(FLAGS)

    # Reject anything other than the two supported flag values up front.
    if FLAGS.multigpu not in ('n', 'y'):
        raise Exception('Please use true option of multigpu')
    use_multigpu = FLAGS.multigpu == 'y'

    mode = FLAGS.mode
    if mode == 'train':
        # Imports stay lazy so only the selected implementation is loaded.
        if use_multigpu:
            from multigpu_train import train_model
        else:
            from train import train_model
        train_model(FLAGS)
    elif mode == 'save_pb':
        if use_multigpu:
            from save_multigpu_model import load_weights_save_pb
        else:
            from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif mode == 'eval':
        # The eval implementation is shared by both GPU modes.
        from eval import eval_model
        eval_model(FLAGS)
Example #2
0
def evaluate(ema, dl_eval):
    """Evaluate both the EMA weights and the raw weights on `dl_eval`.

    Returns:
        (acc_1, acc_5, acc_1_ema, acc_5_ema): top-1/top-5 accuracy of the
        raw model followed by the same metrics for the EMA model.
    """
    # EMA weights are evaluated first, matching the original call order.
    ema_top1, ema_top5 = eval_model(ema.ema_model, dl_eval)
    raw_top1, raw_top5 = eval_model(ema.model, dl_eval)
    # Release cached GPU memory accumulated during the two eval passes.
    torch.cuda.empty_cache()
    return raw_top1, raw_top5, ema_top1, ema_top5
Example #3
0
def train(meta_files):
    """Train an LBPH face recognizer and report train/val/test accuracy.

    Args:
        meta_files: sequence of three dataset meta files
            (train, validation, test). If fewer than three are supplied,
            dataset metadata is (re)initialized via `idm`.
    """
    if len(meta_files) < 3:
        meta_files = idm.init_dataset_meta()
    if not os.path.exists(config.OUTPUT_DIR):
        os.mkdir(config.OUTPUT_DIR)

    recognizer = cv2.face.LBPHFaceRecognizer_create()

    [list_img, list_label], num_sample = load_batch(meta_files[0])
    if num_sample < 1:
        # Bail out instead of calling recognizer.train() on an empty sample
        # set (the sibling train() implementation in this file returns here
        # too; previously this fell through and trained on nothing).
        print('Err: 0 sample found')
        return

    recognizer.train(list_img, np.array(list_label))

    train_accuracy = eval_model(recognizer, meta_files[0])
    val_accuracy = eval_model(recognizer, meta_files[1])
    test_accuracy = eval_model(recognizer, meta_files[2])

    print('Train accuracy =  %f' % (train_accuracy))
    print('Test accuracy = %f' % (test_accuracy))
    print('Validate accuracy = %f' % (val_accuracy))

    recognizer.write(os.path.join(config.OUTPUT_DIR, config.OUTPUT_MODEL_FILE))
Example #4
0
def train(meta_files):
    """Train an LBPH face recognizer and print a train/test accuracy summary.

    Args:
        meta_files: sequence of three dataset meta files
            (train, validation, test).
    """
    if not os.path.exists(config.OUTPUT_DIR):
        os.mkdir(config.OUTPUT_DIR)

    # LBPH was chosen over the Eigen/Fisher recognizer variants.
    recognizer = cv2.face.LBPHFaceRecognizer_create()

    print('\nProcessing...')

    [list_img, list_label], num_sample = load_batch(meta_files[0])
    if num_sample < 1:
        return

    recognizer.train(list_img, np.array(list_label))
    train_accuracy = eval_model(recognizer, meta_files[0])
    # Validation accuracy is computed but intentionally not part of the
    # summary line below; kept so the validation split is still exercised.
    val_accuracy = eval_model(recognizer, meta_files[1])
    test_accuracy = eval_model(recognizer, meta_files[2])

    print(
        '\nRESULT: Number of images: %d, Train accuracy: %g, Test accuracy: %g'
        % (num_sample, train_accuracy, test_accuracy))

    recognizer.save(os.path.join(config.OUTPUT_DIR, config.OUTPUT_MODEL_FILE))
Example #5
0
def main(argv=None):
    """Entry point: stage data between OBS ('s3://') storage and local cache
    directories, then dispatch to train / save_pb / eval via FLAGS.mode.

    Mutates FLAGS in place, adding data_local / train_local /
    test_data_local / tmp aliases that the downstream code reads.
    """
    check_args(FLAGS)

    # Create some local cache directories used for transfer data between local path and OBS path
    if not FLAGS.data_url.startswith('s3://'):
        FLAGS.data_local = FLAGS.data_url
    else:
        FLAGS.data_local = os.path.join(FLAGS.local_data_root, 'train_data/')
        if not os.path.exists(FLAGS.data_local):
            mox.file.copy_parallel(FLAGS.data_url, FLAGS.data_local)

            # If your model needs pretrained weight files, first download them
            # manually to your own machine and upload them to OBS. Then use
            # code like the commented lines below to copy the pretrained files
            # from OBS into the directory holding the training code on the
            # ModelArts platform. The copy call is
            # mox.file.copy(src_path, dst_path), where dst_path must be a
            # concrete file name, not a directory.
            # mox.file.copy('s3://your_obs_path/imagenet_class_index.json',
            #               os.path.dirname(os.path.abspath(__file__)) + '/models/imagenet_class_index.json')
            # mox.file.copy('s3://your_obs_path/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
            #               os.path.dirname(os.path.abspath(__file__)) + '/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
        else:
            print('FLAGS.data_local: %s is already exist, skip copy' %
                  FLAGS.data_local)

    if not FLAGS.train_url.startswith('s3://'):
        FLAGS.train_local = FLAGS.train_url
    else:
        FLAGS.train_local = os.path.join(FLAGS.local_data_root,
                                         'model_snapshots/')
        if not os.path.exists(FLAGS.train_local):
            os.mkdir(FLAGS.train_local)

    if not FLAGS.test_data_url.startswith('s3://'):
        FLAGS.test_data_local = FLAGS.test_data_url
    else:
        FLAGS.test_data_local = os.path.join(FLAGS.local_data_root,
                                             'test_data/')
        if not os.path.exists(FLAGS.test_data_local):
            mox.file.copy_parallel(FLAGS.test_data_url, FLAGS.test_data_local)
        else:
            print('FLAGS.test_data_local: %s is already exist, skip copy' %
                  FLAGS.test_data_local)

    # Scratch directory for intermediate artifacts.
    FLAGS.tmp = os.path.join(FLAGS.local_data_root, 'tmp/')
    if not os.path.exists(FLAGS.tmp):
        os.mkdir(FLAGS.tmp)

    # Lazy imports: only the module for the selected mode is loaded.
    if FLAGS.mode == 'train':
        from train import train_model
        train_model(FLAGS)
    elif FLAGS.mode == 'save_pb':
        from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif FLAGS.mode == 'eval':
        from eval import eval_model
        eval_model(FLAGS)
def evaluate(ema, dl_eval):
    """Evaluate the raw and the EMA model on `dl_eval`.

    Returns:
        One dict: raw-model metrics under their original keys and EMA-model
        metrics under the same keys with an '_ema' suffix.
    """
    # Raw (non-averaged) weights -> unsuffixed keys.
    model = ema.model
    metric_dict = eval_model(model, dl_eval, cfg.metric)
    # EMA weights -> '_ema'-suffixed keys. (Previously these two were
    # swapped, so raw metrics were labelled '_ema' and vice versa; the
    # sibling evaluate() in this file shows the intended labelling.)
    model = ema.ema_model
    metric_dict_ema = eval_model(model, dl_eval, cfg.metric)
    metric_dict_ema = {f'{k}_ema': v for k, v in metric_dict_ema.items()}

    metric_dict.update(metric_dict_ema)

    torch.cuda.empty_cache()
    return metric_dict
Example #7
0
def main(argv=None):
    """Entry point: resolve local cache paths for the OBS-style *_url flags
    (remote copies are disabled here), then dispatch to train / save_pb /
    eval via FLAGS.mode.

    Mutates FLAGS in place, adding data_local / train_local /
    test_data_local / tmp aliases that the downstream code reads.
    """
    check_args(FLAGS)

    # Create some local cache directories used for transfer data between local path and OBS path
    if not FLAGS.data_url.startswith('s3://'):
        FLAGS.data_local = FLAGS.data_url
    else:
        FLAGS.data_local = os.path.join(FLAGS.local_data_root, 'train_data/')
        if not os.path.exists(FLAGS.data_local):
            # Remote -> local copy is intentionally disabled in this variant.
            pass
            # file.copy_parallel(FLAGS.data_url, FLAGS.data_local)
        else:
            print('FLAGS.data_local: %s is already exist, skip copy' %
                  FLAGS.data_local)

    if not FLAGS.train_url.startswith('s3://'):
        FLAGS.train_local = FLAGS.train_url
    else:
        FLAGS.train_local = os.path.join(FLAGS.local_data_root,
                                         'model_snapshots/')
        if not os.path.exists(FLAGS.train_local):
            os.mkdir(FLAGS.train_local)

    if not FLAGS.test_data_url.startswith('s3://'):
        FLAGS.test_data_local = FLAGS.test_data_url
    else:
        FLAGS.test_data_local = os.path.join(FLAGS.local_data_root,
                                             'test_data/')
        if not os.path.exists(FLAGS.test_data_local):
            # Remote -> local copy is intentionally disabled in this variant.
            pass
            #file.copy_parallel(FLAGS.test_data_url, FLAGS.test_data_local)
        else:
            print('FLAGS.test_data_local: %s is already exist, skip copy' %
                  FLAGS.test_data_local)

    # Scratch directory for intermediate artifacts.
    FLAGS.tmp = os.path.join(FLAGS.local_data_root, 'tmp/')
    if not os.path.exists(FLAGS.tmp):
        os.mkdir(FLAGS.tmp)

    # Lazy imports: only the module for the selected mode is loaded.
    # Note this variant trains via train_eval, not train.
    if FLAGS.mode == 'train':
        from train_eval import train_model
        train_model(FLAGS)
    elif FLAGS.mode == 'save_pb':
        from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif FLAGS.mode == 'eval':
        from eval import eval_model
        eval_model(FLAGS)
Example #8
0
def classification_model(raw_data_file, metric_col, categorical_col,
                         target_col, test_perc, hyperopt_iterations,
                         const_params, use_predefined_params, k_fold,
                         tuning_metric):
    """Preprocess the raw data, tune and fit a CatBoost classifier,
    score it on the held-out split, and persist the fitted model.

    Args:
        raw_data_file: path to the raw input data.
        metric_col / categorical_col / target_col: column roles.
        test_perc: fraction held out for testing.
        hyperopt_iterations: budget for hyperparameter search.
        const_params: parameters held fixed during tuning.
        use_predefined_params: skip tuning and use known-good params.
        k_fold: cross-validation folds used by tuning.
        tuning_metric: metric optimized during tuning.
    """
    # Split/encode the raw file into train and test matrices.
    print('preprocess data:')
    dataset = Preproc(raw_data_file, metric_col, categorical_col, target_col,
                      test_perc)

    # Hyperparameter search, then refit with the winning configuration.
    print('hyperparams tuning and model fitting:')
    best_model, best_params = train_best_model(dataset.X_train,
                                               dataset.y_train, const_params,
                                               hyperopt_iterations, k_fold,
                                               tuning_metric,
                                               use_predefined_params)
    print('best params are {}'.format(best_params), file=sys.stdout)

    # Score on the held-out split.
    auc = eval_model(dataset.X_test, dataset.y_test, best_model)

    # Persist the fitted model together with its training pool so the
    # categorical-feature metadata is preserved in the saved artifact.
    cat_feature_ix = np.where(dataset.X_train.dtypes == object)[0]
    train_pool = cb.Pool(dataset.X_train,
                         dataset.y_train,
                         cat_features=cat_feature_ix)
    best_model.save_model(save_model_dir, format="json", pool=train_pool)
Example #9
0
def train():
    """Run the full training loop and print validation MRR per epoch.

    Relies on module-level globals: `args` (CLI flags), `model`, `data`
    (batch provider), `criteria` (loss), `solver` (optimizer), `tqdm`.
    """
    for epoch in range(args.epochs):
        model.train()
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        train_iter = enumerate(data.get_batches('train'))
        if not args.no_tqdm:
            # Wrap the enumerated batch iterator in a progress bar; the
            # total must be set manually since enumerate() has no length.
            train_iter = tqdm(train_iter)
            train_iter.set_description_str('Training')
            train_iter.total = len(data.train)

        for it, mb in train_iter:
            # Minibatch unpacking — presumably context/response tensors,
            # their masks, and the label y; confirm against
            # data.get_batches. (NOTE(review))
            c, c_u_m, c_m, r, r_u_m, r_m, y = mb
            # getting predictions
            pred = model(c, c_u_m, c_m, r, r_u_m, r_m)

            loss = criteria(pred, y)

            loss.backward()
            # Gradients are cleared after the step rather than before
            # backward(); equivalent here since they are zeroed every
            # iteration.
            solver.step()
            solver.zero_grad()

        val_mrr = eval_model(model, data, 'valid')
        print('Validation MRR for this epoch:' + str(val_mrr))
Example #10
0
def train_model(model: nn.Module, optimizer, loss_func,
                data_loader: DataLoader, eval_data_loader: DataLoader,
                eval_tgt_id2word, device: str, train_params: AttributeDict,
                enc_params: AttributeDict, dec_params: AttributeDict,
                epoch: int):
    """Run one training epoch, then evaluate on the validation loader.

    Returns:
        (avg_loss, val_loss): mean training loss over the epoch, and the
        validation loss reported by eval_model.
    """
    # Enable training-mode behaviour (dropout, batch-norm updates).
    model.train()
    n_epochs = train_params.n_epochs
    batch_losses = []
    n_batches = len(data_loader)

    with tqdm(data_loader, total=n_batches,
              desc=f'Epoch {epoch:03d}') as progress:
        for step, batch in enumerate(progress):
            step_loss = train_step(model, device, batch, optimizer, loss_func)
            batch_losses.append(step_loss)
            progress.set_postfix_str(f'loss: {step_loss:05.3f}')

    avg_loss = np.mean(batch_losses)
    print(f'Epochs [{epoch}/{n_epochs}] avg losses: {avg_loss:05.3f}')

    val_loss = eval_model(model, loss_func, eval_data_loader, device,
                          eval_tgt_id2word)

    return avg_loss, val_loss
Example #11
0
def main(argv=None):
    """Entry point: alias the *_url flags straight to local paths (no OBS
    staging in this variant) and dispatch to train / save_pb / eval
    according to FLAGS.mode.
    """
    check_args(FLAGS)

    # Data is assumed to already be local, so the local aliases simply
    # mirror the URL flags.
    FLAGS.data_local = FLAGS.data_url
    FLAGS.train_local = FLAGS.train_url
    FLAGS.test_data_local = FLAGS.test_data_url

    # Imports stay lazy so only the selected implementation is loaded.
    mode = FLAGS.mode
    if mode == 'train':
        from train import train_model
        train_model(FLAGS)
    elif mode == 'save_pb':
        from save_model import load_weights_save_pb
        load_weights_save_pb(FLAGS)
    elif mode == 'eval':
        from eval import eval_model
        eval_model(FLAGS)
Example #12
0
def main(_):
    """Train the NMS network and periodically evaluate it on the test set.

    Builds the experiment config from FLAGS, loads train/test frame data,
    constructs the NMSNetwork graph, restores the latest checkpoint unless
    config.start_from_scratch, then runs config.n_epochs epochs of
    per-frame training with summary writing, periodic evaluation, result
    bookkeeping and checkpointing.
    """
    config = expconf.ExperimentConfig(data_dir=FLAGS.data_dir,
                                      root_log_dir=FLAGS.log_dir,
                                      config_path=FLAGS.config_path)

    learning_rate = config.learning_rate_nms
    softmax_ini_scores = False
    class_of_interest = config.config['general']['class_of_interest']

    if class_of_interest == 'all':
        # Multi-class setup: one output per non-background class, and the
        # initial detector scores are softmaxed.
        is_one_class = False
        class_ix = 0
        n_classes = TOTAL_NUMBER_OF_CLASSES - 1
        softmax_ini_scores = True
    else:
        # Single-class setup for the requested class only.
        is_one_class = True
        class_ix = CLASSES.index(class_of_interest)
        n_classes = 1

    config.save_results()

    logging.info("config : %s" % yaml.dump(config.config))

    logging.info('loading data..')
    logging.info('train..')
    frames_data_train = load_data(
        config.train_data_dir,
        n_bboxes=config.n_bboxes,
        use_short_features=config.use_reduced_fc_features,
        one_class=is_one_class,
        class_id=class_ix)

    train_class_instances = 0
    for fid in frames_data_train.keys():
        train_class_instances += len(frames_data_train[fid]['gt_labels'])
    logging.info("number of gt objects of class %s in train : %d" %
                 (class_of_interest, train_class_instances))

    logging.info('test..')
    frames_data_test = load_data(
        config.test_data_dir,
        n_bboxes=config.n_bboxes,
        use_short_features=config.use_reduced_fc_features,
        one_class=is_one_class,
        class_id=class_ix)
    test_class_instances = 0
    for fid in frames_data_test.keys():
        test_class_instances += len(frames_data_test[fid]['gt_labels'])
    logging.info("number of gt objects of class %s in test : %d" %
                 (class_of_interest, test_class_instances))

    if config.shuffle_train_test:
        frames_data_train, frames_data_test = shuffle_train_test(
            frames_data_train, frames_data_test)

    n_frames_train = len(frames_data_train.keys())
    n_frames_test = len(frames_data_test.keys())

    logging.info("number of bboxes per image : %d" % config.n_bboxes)

    logging.info('building model graph..')

    # Feature width is taken from the first training frame.
    n_dt_features = frames_data_train[0][nms_net.DT_FEATURES].shape[1]

    in_ops = input_ops(n_classes=n_classes, n_dt_features=n_dt_features)

    nnms_model = nms_net.NMSNetwork(n_classes=n_classes,
                                    input_ops=in_ops,
                                    gt_match_iou_thr=0.5,
                                    class_ix=class_ix,
                                    softmax_ini_scores=softmax_ini_scores,
                                    **config.nms_network_config)
    lr_decay_applied = False

    # Evaluation on the TRAIN set is disabled (see the commented block in
    # the eval section below). Keep neutral placeholders so the summary
    # writing and config.update_results() below stay well-defined; without
    # these the first eval step raised NameError on train_loss_opt.
    train_loss_opt = 0.0
    train_loss_final = 0.0

    with tf.Session() as sess:

        step_id = 0

        sess.run(nnms_model.init_op)

        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0)

        if not config.start_from_scratch:
            ckpt_path = tf.train.latest_checkpoint(config.log_dir)
            if ckpt_path is not None:
                logging.info('model exists, restoring..')
                # The global step is encoded in the checkpoint file name.
                ckpt_name = ntpath.basename(ckpt_path)
                step_id = int(ckpt_name.split('-')[1])
                saver.restore(sess, ckpt_path)

        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)

        logging.info('training started..')
        for epoch_id in range(0, config.n_epochs):

            step_times = []

            for fid in shuffle_samples(n_frames_train):

                frame_data = frames_data_train[fid]

                if n_classes == 1:
                    dt_probs_ini = frame_data[nms_net.DT_SCORES]
                    gt_labels = frame_data[nms_net.GT_LABELS]
                else:
                    # Drop the background column and shift labels so class
                    # indices are zero-based.
                    dt_probs_ini = softmax(frame_data[nms_net.DT_SCORES])[:,
                                                                          1:]
                    gt_labels = frame_data[nms_net.GT_LABELS] - 1

                feed_dict = {
                    nnms_model.dt_coords: frame_data[nms_net.DT_COORDS],
                    nnms_model.dt_features: frame_data[nms_net.DT_FEATURES],
                    nnms_model.dt_probs_ini: dt_probs_ini,
                    nnms_model.gt_coords: frame_data[nms_net.GT_COORDS],
                    nnms_model.gt_labels: gt_labels,
                    nnms_model.keep_prob: config.keep_prob_train
                }

                start_step = timer()

                # Pick the training op matching the model's loss mode.
                if nnms_model.loss_type == 'nms':
                    summary, _ = sess.run(
                        [nnms_model.merged_summaries,
                         nnms_model.nms_train_step],
                        feed_dict=feed_dict)
                else:
                    summary, _ = sess.run(
                        [nnms_model.merged_summaries,
                         nnms_model.det_train_step],
                        feed_dict=feed_dict)

                end_step = timer()

                step_times.append(end_step - start_step)

                summary_writer.add_summary(summary, global_step=step_id)
                summary_writer.flush()

                step_id += 1

                if step_id % config.eval_step == 0:

                    logging.info('step : %d, mean time for step : %s' %
                                 (step_id, str(np.mean(step_times))))

                    full_eval = step_id % config.full_eval_step == 0

                    # logging.info('evaluating on TRAIN..')
                    # train_out_dir = os.path.join(config.log_dir, 'train')
                    # logging.info('full evaluation : %d' % full_eval)
                    # train_loss_opt, train_loss_final = eval.eval_model(sess, nnms_model,
                    #                                                 frames_data_train,
                    #                                                 global_step=step_id,
                    #                                                 n_eval_frames=config.n_eval_frames,
                    #                                                 out_dir=train_out_dir,
                    #                                                 full_eval=full_eval,
                    #                                                 nms_thres=config.nms_thres,
                    #                                                 one_class=is_one_class,
                    #                                                 class_ix=class_ix)

                    write_scalar_summary(train_loss_opt, 'train_loss_opt',
                                         summary_writer, step_id)
                    logging.info('evaluating on TEST..')
                    test_out_dir = os.path.join(config.log_dir, 'test')
                    logging.info('full evaluation : %d' % full_eval)
                    test_loss_opt, test_loss_final = eval.eval_model(
                        sess,
                        nnms_model,
                        frames_data_test,
                        global_step=step_id,
                        n_eval_frames=config.n_eval_frames,
                        out_dir=test_out_dir,
                        full_eval=full_eval,
                        nms_thres=config.nms_thres,
                        one_class=is_one_class,
                        class_ix=class_ix)
                    write_scalar_summary(test_loss_opt, 'test_loss_opt',
                                         summary_writer, step_id)

                    config.update_results(step_id, train_loss_opt,
                                          train_loss_final, test_loss_opt,
                                          test_loss_final, np.mean(step_times))
                    config.save_results()

                    saver.save(sess, config.model_file, global_step=step_id)

    logging.info('all done.')
    return
Example #13
0
        # grad_norm=0.5,
        # grad_clipping=1
    )
    metrics = trainer.train()

    # evaluate model after training
    print("loading best model for evaluation")
    model = init_model(args, args.model, num_authors, vocab, args.encoder,
                       args.max_vocab_size, date_span, args.ignore_time,
                       args.num_sk)
    with open(args.snapshot_path + "best.th", 'rb') as f:
        model.load_state_dict(torch.load(f))
    if args.cuda:
        model.cuda(args.device)
    print("Evaluation in validation data.")
    eval_model(model, vocab, val_ds, args.batch_size,
               args.device if args.cuda else -1)
    print("Evaluation in testing data.")
    eval_model(model, vocab, test_ds, args.batch_size,
               args.device if args.cuda else -1)

elif args.test:  # test the single model
    # Evaluation
    print("Evaluation ...")

    if not args.snapshot:
        print("No snapshot is provided!")
        exit(0)

    # auto-fill snapshot path
    args.snapshot = complete_snapshot(args.snapshot_path, args.snapshot)
Example #14
0
def run_training(data_type="screw",
                 model_dir="models",
                 epochs=256,
                 pretrained=True,
                 test_epochs=10,
                 freeze_resnet=20,
                 learninig_rate=0.03,
                 optim_name="SGD",
                 batch_size=64,
                 head_layer=8):
    """Train a CutPaste ProjectionNet on one MVTec class.

    Args:
        data_type: MVTec defect category to train on.
        model_dir: directory the final state_dict is written to.
        epochs: number of "epochs"; one epoch is 256 parameter updates.
        pretrained: start from a pretrained backbone.
        test_epochs: run AUC evaluation every this many epochs (<=0 off).
        freeze_resnet: epochs to keep the resnet backbone frozen.
        learninig_rate: optimizer learning rate (misspelled name kept for
            backward compatibility with existing callers).
        optim_name: "SGD" or "adam" (now compared case-insensitively — the
            old code only matched lowercase, so the default "SGD" fell into
            the error branch and later crashed on an undefined optimizer).
        batch_size: training batch size.
        head_layer: number of 512-wide layers in the projection head.
    """
    torch.multiprocessing.freeze_support()
    # Temperature hyperparameter of the (currently disabled) NT-Xent loss.
    temperature = 0.2
    # NOTE(review): device is hard-coded; training requires a CUDA GPU.
    device = "cuda"

    weight_decay = 0.00003
    momentum = 0.9
    # Timestamped run name, e.g. "model-screw-2021-01-01_12_00_00".
    model_name = (f"model-{data_type}"
                  f"-{datetime.datetime.now():%Y-%m-%d_%H_%M_%S}")

    # Augmentation parameters.
    size = 256
    min_scale = 0.5

    # Create training dataset and dataloader.
    after_cutpaste_transform = transforms.Compose([])
    after_cutpaste_transform.transforms.append(transforms.ToTensor())
    after_cutpaste_transform.transforms.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]))

    train_transform = transforms.Compose([])
    train_transform.transforms.append(transforms.Resize((256, 256)))
    train_transform.transforms.append(
        CutPaste(transform=after_cutpaste_transform))

    # Load images larger than `size` so CutPaste can crop within them.
    train_data = MVTecAT("Data",
                         data_type,
                         transform=train_transform,
                         size=int(size * (1 / min_scale)))
    dataloader = DataLoader(train_data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=8,
                            collate_fn=cut_paste_collate_fn,
                            persistent_workers=True,
                            pin_memory=True,
                            prefetch_factor=5)

    # Writer will output to ./runs/ directory by default.
    writer = SummaryWriter(Path("logdirs") / model_name)

    # Create model: projection head of `head_layer` 512-wide layers + 128.
    head_layers = [512] * head_layer + [128]
    print(head_layers)
    model = ProjectionNet(pretrained=pretrained, head_layers=head_layers)
    model.to(device)

    if freeze_resnet > 0:
        model.freeze_resnet()

    loss_fn = torch.nn.CrossEntropyLoss()
    optim_key = optim_name.lower()
    if optim_key == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learninig_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, epochs)
    elif optim_key == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learninig_rate,
                               weight_decay=weight_decay)
        scheduler = None
    else:
        # Fail fast: previously this only printed and then crashed later on
        # an undefined `optimizer`/`scheduler`.
        raise ValueError(f"ERROR unkown optimizer: {optim_name}")

    step = 0
    num_batches = len(dataloader)

    def get_data_inf():
        # Endless batch generator so the step-based loop below never runs
        # out of data.
        while True:
            for out in enumerate(dataloader):
                yield out

    dataloader_inf = get_data_inf()
    # From paper: "Note that, unlike conventional definition for an epoch,
    #              we define 256 parameter update steps as one epoch.
    for step in tqdm(range(epochs * 256)):
        epoch = int(step / 256)
        if epoch == freeze_resnet:
            model.unfreeze()

        batch_embeds = []
        batch_idx, data = next(dataloader_inf)
        x1, x2 = data
        x1 = x1.to(device)
        x2 = x2.to(device)

        # Zero the parameter gradients.
        optimizer.zero_grad()

        # One forward pass over the concatenated normal/augmented batch.
        xc = torch.cat((x1, x2), axis=0)
        embeds, logits = model(xc)

        # Labels: first half (x1) is class 0, second half (x2) is class 1.
        y = torch.tensor([0, 1], device=device)
        y = y.repeat_interleave(x1.size(0))
        loss = loss_fn(logits, y)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step(epoch + batch_idx / num_batches)

        writer.add_scalar('loss', loss.item(), step)

        predicted = torch.argmax(logits, axis=1)
        accuracy = torch.true_divide(torch.sum(predicted == y),
                                     predicted.size(0))
        writer.add_scalar('acc', accuracy, step)
        if scheduler is not None:
            writer.add_scalar('lr', scheduler.get_last_lr()[0], step)

        # Save embeddings for validation.
        if test_epochs > 0 and epoch % test_epochs == 0:
            batch_embeds.append(embeds.cpu().detach())

        writer.add_scalar('epoch', epoch, step)

        # Periodic AUC evaluation.
        if test_epochs > 0 and epoch % test_epochs == 0:
            model.eval()
            roc_auc = eval_model(model_name,
                                 data_type,
                                 device=device,
                                 save_plots=False,
                                 size=size,
                                 show_training_data=False,
                                 model=model)
            model.train()
            writer.add_scalar('eval_auc', roc_auc, step)

    # Use pathlib here: `model_dir` is a str, and `str / str` raised a
    # TypeError in the original code.
    torch.save(model.state_dict(), Path(model_dir) / f"{model_name}.tch")
Example #15
0
def train_eval_model(model,
                     criterion,
                     optimizer,
                     dataloader,
                     num_epochs,
                     resume=False,
                     start_epoch=0):
    """Train a graph-matching model, evaluating on the test set each epoch.

    Args:
        model: matching network; must already be on its target device.
        criterion: loss applied per (prediction, gt permutation) pair.
        optimizer: optimizer over the model's parameters.
        dataloader: dict with "train" and "test" DataLoaders.
        num_epochs: train up to this epoch index (exclusive).
        resume: if True, load model/optimizer state from cfg.warmstart_path.
        start_epoch: epoch index to resume counting from.

    Returns:
        (model, acc_dict) where acc_dict holds per-class accuracy/F1 plus
        overall "matching_accuracy" and "f1_score" from the most recent
        evaluation. If cfg.evaluate_only is set, evaluation runs once with
        no training and returns immediately.

    NOTE(review): if the epoch loop body never runs (num_epochs <=
    start_epoch) the final `return acc_dict` references an undefined name.
    """
    print("Start training...")

    since = time.time()
    dataloader["train"].dataset.set_num_graphs(
        cfg.TRAIN.num_graphs_in_matching_instance)
    dataset_size = len(dataloader["train"].dataset)

    device = next(model.parameters()).device
    print("model on device: {}".format(device))

    checkpoint_path = Path(cfg.model_dir) / "params"
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        # Warm-start both model weights and optimizer state.
        params_path = os.path.join(cfg.warmstart_path, f"params.pt")
        print("Loading model parameters from {}".format(params_path))
        model.load_state_dict(torch.load(params_path))

        optim_path = os.path.join(cfg.warmstart_path, f"optim.pt")
        print("Loading optimizer state from {}".format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    # Evaluation only
    if cfg.evaluate_only:
        # assert resume
        print(f"Evaluating without training...")
        accs, f1_scores = eval_model(model, dataloader["test"])
        # Per-class metrics keyed by the class names of the train dataset.
        acc_dict = {
            "acc_{}".format(cls): single_acc
            for cls, single_acc in zip(dataloader["train"].dataset.classes,
                                       accs)
        }
        f1_dict = {
            "f1_{}".format(cls): single_f1_score
            for cls, single_f1_score in zip(
                dataloader["train"].dataset.classes, f1_scores)
        }
        acc_dict.update(f1_dict)
        acc_dict["matching_accuracy"] = torch.mean(accs)
        acc_dict["f1_score"] = torch.mean(f1_scores)

        time_elapsed = time.time() - since
        print("Evaluation complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600, (time_elapsed // 60) % 60,
            time_elapsed % 60))
        return model, acc_dict

    # Step-wise LR decay according to the configured schedule.
    _, lr_milestones, lr_decay = lr_schedules[cfg.TRAIN.lr_schedule]
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=lr_milestones,
                                               gamma=lr_decay)

    if cfg.log_dir:
        os.makedirs(cfg.log_dir, exist_ok=True)
        writer = SummaryWriter(cfg.log_dir)

    for epoch in range(start_epoch, num_epochs):
        print("Epoch {}/{}".format(epoch, num_epochs - 1))
        print("-" * 10)

        model.train()  # Set model to training mode

        print("lr = " + ", ".join(
            ["{:.2e}".format(x["lr"]) for x in optimizer.param_groups]))

        # running_* accumulate between STATISTIC_STEP reports;
        # epoch_* accumulate over the whole epoch.
        epoch_loss = 0.0
        running_loss = 0.0
        running_acc = 0.0
        epoch_acc = 0.0
        running_f1 = 0.0
        epoch_f1 = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader["train"]:
            data_list = [_.cuda() for _ in inputs["images"]]
            points_gt_list = [_.cuda() for _ in inputs["Ps"]]
            n_points_gt_list = [_.cuda() for _ in inputs["ns"]]
            edges_list = [_.to("cuda") for _ in inputs["edges"]]
            perm_mat_list = [
                perm_mat.cuda() for perm_mat in inputs["gt_perm_mat"]
            ]

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                s_pred_list = model(data_list, points_gt_list, edges_list,
                                    n_points_gt_list, perm_mat_list)

                # Average the loss over all predicted matchings.
                loss = sum([
                    criterion(s_pred, perm_mat)
                    for s_pred, perm_mat in zip(s_pred_list, perm_mat_list)
                ])
                loss /= len(s_pred_list)

                # backward + optimize
                loss.backward()
                optimizer.step()

                tp, fp, fn = get_pos_neg_from_lists(s_pred_list, perm_mat_list)
                f1 = f1_score(tp, fp, fn)
                acc, _, __ = matching_accuracy_from_lists(
                    s_pred_list, perm_mat_list)

                # statistics — weighted by batch size so epoch averages are
                # per-sample means.
                bs = perm_mat_list[0].size(0)
                running_loss += loss.item() * bs  # multiply with batch size
                epoch_loss += loss.item() * bs
                running_acc += acc.item() * bs
                epoch_acc += acc.item() * bs
                running_f1 += f1.item() * bs
                epoch_f1 += f1.item() * bs

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * bs / (time.time() -
                                                               running_since)
                    loss_avg = running_loss / cfg.STATISTIC_STEP / bs
                    acc_avg = running_acc / cfg.STATISTIC_STEP / bs
                    f1_avg = running_f1 / cfg.STATISTIC_STEP / bs
                    print(
                        "Epoch {:<4} Iter {:<4} {:>4.2f}sample/s Loss={:<8.4f} Accuracy={:<2.3} F1={:<2.3}"
                        .format(epoch, iter_num, running_speed, loss_avg,
                                acc_avg, f1_avg))

                    running_acc = 0.0
                    running_f1 = 0.0
                    running_loss = 0.0
                    running_since = time.time()

        epoch_loss = epoch_loss / dataset_size
        epoch_acc = epoch_acc / dataset_size
        epoch_f1 = epoch_f1 / dataset_size

        if cfg.save_checkpoint:
            # Checkpoint directories are 1-based and zero-padded: 0001, ...
            base_path = Path(checkpoint_path / "{:04}".format(epoch + 1))
            Path(base_path).mkdir(parents=True, exist_ok=True)
            path = str(base_path / "params.pt")
            torch.save(model.state_dict(), path)
            torch.save(optimizer.state_dict(), str(base_path / "optim.pt"))

        print(
            "Over whole epoch {:<4} -------- Loss: {:.4f} Accuracy: {:.3f} F1: {:.3f}"
            .format(epoch, epoch_loss, epoch_acc, epoch_f1))
        print()

        # Eval in each epoch
        accs, f1_scores = eval_model(model, dataloader["test"])
        acc_dict = {
            "acc_{}".format(cls): single_acc
            for cls, single_acc in zip(dataloader["train"].dataset.classes,
                                       accs)
        }
        f1_dict = {
            "f1_{}".format(cls): single_f1_score
            for cls, single_f1_score in zip(
                dataloader["train"].dataset.classes, f1_scores)
        }
        acc_dict.update(f1_dict)
        val_acc = torch.mean(accs)
        val_f1 = torch.mean(f1_scores)
        acc_dict["matching_accuracy"] = val_acc
        acc_dict["f1_score"] = val_f1

        # Tensorboard
        if cfg.log_dir:
            writer.add_scalar('Loss/train', epoch_loss, epoch)
            writer.add_scalar('Acc/train', epoch_acc, epoch)
            writer.add_scalar('F1/train', epoch_f1, epoch)
            writer.add_scalar('Acc/val', val_acc, epoch)
            writer.add_scalar('F1/val', val_f1, epoch)
            lr = optimizer.param_groups[0]["lr"]
            writer.add_scalar('lr', lr, epoch)

        scheduler.step()

    # Close TensorBoard writer
    if cfg.log_dir:
        writer.close()
    time_elapsed = time.time() - since
    print("Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model, acc_dict
Example #16
0
def train_model(log_dict,data,model,loss_fn,optimizer,lr_scheduler,writer,save_home):
    """Train `model`, validating and testing after every epoch, keeping the best.

    Args:
        log_dict: attribute-dict carrying hyperparameters (`log_dict.param`)
            and collecting the best-epoch metrics written to log.json.
        data: tuple of (train_iter, valid_iter, test_iter) data iterators.
        model: network to train.
        loss_fn: loss criterion forwarded to the train/eval helpers.
        optimizer: optimizer whose state is checkpointed with the model.
        lr_scheduler: stepped once per epoch when `param.step_size` is set.
        writer: TensorBoard SummaryWriter receiving train/val curves.
        save_home: directory for model_best.pth.tar and log.json.
    """
    best_acc1 = 0
    patience_flag = 0  # epochs elapsed since the last validation improvement
    train_iter, valid_iter, test_iter = data[0], data[1], data[2]

    for epoch in range(0, log_dict.param.nepoch):
        # Train one epoch, then evaluate on the validation split.
        train_loss, train_acc = train_epoch(model, train_iter, epoch, loss_fn, optimizer, log_dict)
        val_loss, val_acc, val_f1_score, val_w_f1_score, val_top3_acc = eval_model(model, valid_iter, loss_fn, log_dict)
        print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')

        # Test-set metrics are reported each epoch but never drive selection.
        test_loss, test_acc, test_f1_score, test_w_f1_score, test_top3_acc = eval_model(model, test_iter, loss_fn, log_dict)
        print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}% Test F1 score: {test_f1_score:.4f}')

        # Model selection is driven by validation accuracy.
        is_best = val_acc > best_acc1
        os.makedirs(save_home, exist_ok=True)
        save_checkpoint({'epoch': epoch + 1, 'arch': log_dict.param.arch_name, 'state_dict': model.state_dict(), 'train_acc': train_acc, "val_acc": val_acc, 'param': dict(log_dict.param), 'optimizer': optimizer.state_dict()}, is_best, save_home + "/model_best.pth.tar")

        best_acc1 = max(val_acc, best_acc1)
        # Fix: compare against None with `is not`, not `!=`.
        if log_dict.param.step_size is not None:
            lr_scheduler.step()

        # TensorBoard curves.
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Accuracy/train', train_acc, epoch)
        writer.add_scalar('Loss/val', val_loss, epoch)
        writer.add_scalar('Accuracy/val', val_acc, epoch)

        if is_best:
            # Record the best-epoch metrics and persist them to log.json.
            patience_flag = 0
            log_dict.train_acc = train_acc
            log_dict.test_acc = test_acc
            log_dict.valid_acc = val_acc

            log_dict.test_f1_score = test_f1_score
            log_dict.valid_f1_score = val_f1_score

            log_dict.valid_top3_acc = val_top3_acc
            log_dict.test_top3_acc = test_top3_acc

            log_dict.train_loss = train_loss
            log_dict.test_loss = test_loss
            log_dict.valid_loss = val_loss

            log_dict.epoch = epoch + 1
            log_dict.weighted_test_f1_score = test_w_f1_score
            log_dict.weighted_valid_f1_score = val_w_f1_score

            # `with` already closes the file; the original's extra
            # fp.close() was redundant and has been dropped.
            with open(save_home + "/log.json", 'w') as fp:
                json.dump(dict(log_dict), fp, indent=4)
        else:
            patience_flag += 1

        # Early stopping: patience exhausted, or final epoch reached.
        if patience_flag == log_dict.param.patience or epoch == log_dict.param.nepoch - 1:
            print(log_dict)
            break
Example #17
0
from train import train_gan
from eval import eval_model
from models import *
from config import *

if __name__ == '__main__':
    # Train the decomposition model on the Break QDMR training split and
    # persist the resulting seq2seq weights.
    decomposer = train_gan('/content/Break-dataset/QDMR/train.csv')
    decomposer.save_internal(seq2seq_path='model.dat')

    # Score the saved checkpoint on the dev split, writing the original and
    # predicted decompositions to CSV files.
    eval_model(
        [('model.dat', RobertaDecomposer, SEQ_LENGTH)],
        '/content/Break-dataset/QDMR/dev.csv',
        orig_filenames=["orig.csv"],
        pred_filenames=["pred.csv"],
    )
Example #18
0
def main(_):
    """Train the NMS network on pre-computed detections and evaluate it.

    Loads KITTI-style labels/detections from FLAGS.data_dir, trains for
    config.n_epochs epochs over the frames listed in train.txt, evaluates
    both splits every config.eval_step steps, and checkpoints the model
    and results under the experiment's log directory.
    """

    config = expconf.ExperimentConfig(data_dir=FLAGS.data_dir,
                                      root_log_dir=FLAGS.root_log_dir,
                                      config_path=FLAGS.config_path)

    logging.info("config info : %s" % config.config)

    # Ground-truth labels and pre-computed detections, one file per frame.
    labels_dir = os.path.join(FLAGS.data_dir, 'label_2')

    detections_dir = os.path.join(FLAGS.data_dir, 'detection_2')

    # Frame ids come from the label file names (e.g. 000123.txt -> 123).
    frames_ids = np.asarray([
        int(ntpath.basename(path).split('.')[0])
        for path in os.listdir(labels_dir)
    ])

    n_frames = len(frames_ids)
    n_bboxes_test = config.n_bboxes
    n_classes = 1
    class_name = config.general_params.get('class_of_interest', 'Car')
    # NOTE(review): true division yields a float on Python 3; `half` is only
    # referenced by the commented-out random split below — confirm before reuse.
    half = n_frames / 2
    learning_rate = config.learning_rate_det

    # shuffled_samples = shuffle_samples(n_frames)
    # train_frames = frames_ids[shuffled_samples[0:half]]

    # test_frames = frames_ids[shuffled_samples[half:]]

    # Fixed train/val splits are read from text files instead of shuffling.
    train_frames_path = os.path.join(FLAGS.data_dir, 'train.txt')
    train_frames = np.loadtxt(train_frames_path, dtype=int)

    test_frames_path = os.path.join(FLAGS.data_dir, 'val.txt')
    test_frames = np.loadtxt(test_frames_path, dtype=int)

    train_out_dir = os.path.join(config.log_dir, 'train')
    test_out_dir = os.path.join(config.log_dir, 'test')
    n_train_samples = len(train_frames)
    n_test_samples = len(test_frames)

    logging.info('building model graph..')

    in_ops = input_ops(config.n_dt_features, n_classes)

    nnms_model = nms_net.NMSNetwork(n_classes=1,
                                    input_ops=in_ops,
                                    class_ix=0,
                                    **config.nms_network_config)

    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=1.0)

    config.save_results()

    logging.info('training started..')

    with tf.Session() as sess:

        sess.run(nnms_model.init_op)

        step_id = 0
        step_times = []  # wall time per optimization step
        data_times = []  # portion of each step spent loading frame data

        # loss_mode = 'nms'
        # nnms_model.switch_loss('nms')
        # logging.info("current loss mode : %s" % loss_mode)

        summary_writer = tf.summary.FileWriter(config.log_dir, sess.graph)

        for epoch_id in range(0, config.n_epochs):

            # Re-shuffle the training frames each epoch.
            epoch_frames = train_frames[shuffle_samples(n_train_samples)]

            for fid in epoch_frames:

                # if step_id == config.loss_change_step:
                #     learning_rate = config.learning_rate_det
                #     loss_mode = 'detection'
                #     nnms_model.switch_loss('detection')
                #     logging.info('switching loss to actual detection loss..')
                #     logging.info('learning rate to %f' % learning_rate)

                start_step = timer()

                frame_data = get_frame_data_fixed(
                    frame_id=fid,
                    labels_dir=labels_dir,
                    detections_dir=detections_dir,
                    n_detections=config.n_bboxes,
                    class_name=class_name,
                    n_features=config.n_dt_features)
                data_step = timer()

                feed_dict = {
                    nnms_model.dt_coords: frame_data['dt_coords'],
                    nnms_model.dt_features: frame_data['dt_features'],
                    nnms_model.dt_probs_ini: frame_data['dt_probs'],
                    nnms_model.gt_coords: frame_data['gt_coords'],
                    nnms_model.gt_labels: frame_data['gt_labels'],
                    nnms_model.keep_prob: config.keep_prob_train
                }

                # Run the train op matching the model's current loss mode.
                if nnms_model.loss_type == 'nms':
                    summary, _ = sess.run([
                        nnms_model.merged_summaries, nnms_model.nms_train_step
                    ],
                                          feed_dict=feed_dict)
                else:
                    summary, _ = sess.run([
                        nnms_model.merged_summaries, nnms_model.det_train_step
                    ],
                                          feed_dict=feed_dict)

                step_id += 1

                summary_writer.add_summary(summary, global_step=step_id)
                summary_writer.flush()

                end_step = timer()
                step_times.append(end_step - start_step)
                data_times.append(data_step - start_step)

                # Periodic evaluation on both splits plus checkpointing.
                if step_id % config.eval_step == 0:

                    logging.info("learning rate %s" %
                                 str(nnms_model.learning_rate_det.eval()))

                    logging.info(
                        'curr step : %d, mean time for step : %s, for getting data : %s'
                        % (step_id, str(
                            np.mean(step_times)), str(np.mean(data_times))))

                    logging.info("eval on TRAIN..")
                    train_loss_opt, train_loss_fin = eval.eval_model(
                        sess,
                        nnms_model,
                        detections_dir=detections_dir,
                        labels_dir=labels_dir,
                        eval_frames=train_frames,
                        n_bboxes=config.n_bboxes,
                        n_features=config.n_dt_features,
                        global_step=step_id,
                        out_dir=train_out_dir,
                        nms_thres=config.nms_thres,
                        class_name=class_name)

                    logging.info("eval on TEST..")
                    test_loss_opt, test_loss_fin = eval.eval_model(
                        sess,
                        nnms_model,
                        detections_dir=detections_dir,
                        labels_dir=labels_dir,
                        eval_frames=test_frames,
                        n_bboxes=config.n_bboxes,
                        n_features=config.n_dt_features,
                        global_step=step_id,
                        out_dir=test_out_dir,
                        nms_thres=config.nms_thres,
                        class_name=class_name)

                    config.update_results(step_id, train_loss_opt,
                                          train_loss_fin, test_loss_opt,
                                          test_loss_fin, np.mean(step_times))

                    config.save_results()

                    saver.save(sess, config.model_file, global_step=step_id)

        # Final evaluation and checkpoint after the last epoch.
        train_loss_opt, train_loss_fin = eval.eval_model(
            sess,
            nnms_model,
            detections_dir=detections_dir,
            labels_dir=labels_dir,
            eval_frames=train_frames,
            n_bboxes=config.n_bboxes,
            n_features=config.n_dt_features,
            global_step=step_id,
            out_dir=train_out_dir,
            nms_thres=config.nms_thres,
            class_name=class_name)

        test_loss_opt, test_loss_fin = eval.eval_model(
            sess,
            nnms_model,
            detections_dir=detections_dir,
            labels_dir=labels_dir,
            eval_frames=test_frames,
            n_bboxes=config.n_bboxes,
            n_features=config.n_dt_features,
            global_step=step_id,
            out_dir=test_out_dir,
            nms_thres=config.nms_thres,
            class_name=class_name)

        config.update_results(step_id, train_loss_opt,
                              train_loss_fin, test_loss_opt, test_loss_fin,
                              np.mean(step_times))

        config.save_results()
        saver.save(sess, config.model_file, global_step=step_id)
    return
Example #19
0
def run_model(config, device):
    """Train the configured classifier and evaluate it on train/dev/test.

    Args:
        config: experiment configuration exposing programsettings,
            hyperparams and modelconfig sections.
        device: torch device the model runs on.

    Returns:
        (train_inputs, train_labels, train_preds, train_loss,
         dev_inputs, dev_labels, dev_loss, dev_preds,
         test_inputs, test_preds, test_labels, test_loss),
        in the same order as the original implementation.

    Raises:
        ValueError: if config.programsettings.MODEL_NAME is not recognised.
    """
    model_config1 = model_config(config.modelconfig)

    # Load data and create features; cached features are reused when present.
    dataprocessor = MultiClassificationProcessor()
    train_dataloader, data_len, num_labels, num_train_optimization_steps, all_label_ids = dataprocessor.get_data_loader(
        config)

    # Select the architecture. The original used a broken if/if/elif chain
    # that left `model` undefined (NameError later) for unknown names;
    # fail fast instead.
    model_name = config.programsettings.MODEL_NAME
    if model_name == "BioBERT_fc":
        model = Biobert_fc(device, model_config1)
    elif model_name == "BioBERT_CNN_fc":
        model = Biobert_cnn_fc(device, model_config1)
    elif model_name == "BERT_Sequence":
        model = BertForSequenceClassification.from_pretrained(
            config.programsettings.BERT_MODEL,
            cache_dir=config.programsettings.CACHE_DIR,
            num_labels=num_labels)
    else:
        raise ValueError("Unknown MODEL_NAME: {}".format(model_name))

    # Freeze child modules 1..NUM_BERT_LAYERS_FREEZE-1 so they are not
    # fine-tuned (module 0 and the head stay trainable).
    if config.hyperparams.NUM_BERT_LAYERS_FREEZE >= 0:
        for count, child in enumerate(model.children()):
            if 0 < count < config.hyperparams.NUM_BERT_LAYERS_FREEZE:
                for param in child.parameters():
                    param.requires_grad = False

    # Optimizer with a linear warmup schedule.
    optimizer = optim.AdamW(model.parameters(),
                            lr=config.hyperparams.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.hyperparams.NUM_WARMP_STEPS,
        num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

    # Run the training loop.
    model = train_model(config,
                        model,
                        optimizer,
                        scheduler,
                        train_dataloader,
                        num_labels,
                        data_len,
                        device=device,
                        model_save_path=config.programsettings.OUTPUT_DIR,
                        model_name=config.programsettings.MODEL_NAME,
                        num_epochs=config.hyperparams.NUM_TRAIN_EPOCHS)

    # Evaluate on the training data.
    train_inputs, train_preds, train_labels, train_loss = eval_model(
        config, model, train_dataloader, device, num_labels)

    # Evaluate on the dev split.
    # NOTE(review): the *train* num_labels is passed here, not
    # dev_num_labels — presumably the label sets match; confirm.
    dev_dataloader, dev_data_len, dev_num_labels, dev_num_train_optimization_steps, all_dev_label_ids = dataprocessor.get_data_loader(
        config, source='dev')
    dev_inputs, dev_preds, dev_labels, dev_loss = eval_model(
        config, model, dev_dataloader, device, num_labels)

    # Evaluate on the test split.
    dataprocessor = MultiClassificationProcessor()
    test_dataloader, dev_data_len, dev_num_labels, dev_num_train_optimization_steps, all_dev_label_ids = dataprocessor.get_data_loader(
        config, source='test')

    test_inputs, test_preds, test_labels, test_loss = eval_model(
        config, model, test_dataloader, device, num_labels)

    return train_inputs, train_labels, train_preds, train_loss, dev_inputs, dev_labels, dev_loss, dev_preds, test_inputs, test_preds, test_labels, test_loss
Example #20
0
def train_eval_model(model,
                     permLoss,
                     optimizer,
                     dataloader,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0,
                     viz=None,
                     savefiletime='time'):
    """Train a point-matching model with a permutation loss, evaluating on
    the 'val' and 'test' splits after every epoch.

    Args:
        model: matching network; its parameters define the training device.
        permLoss: permutation loss comparing predicted and GT perm matrices.
        optimizer: optimizer; its state is checkpointed every epoch.
        dataloader: dict with 'train', 'val' and 'test' DataLoaders.
        num_epochs: total number of epochs to run.
        resume: when True, load model/optimizer state saved for `start_epoch`.
        start_epoch: epoch to resume from (must be nonzero when resuming).
        viz: optional visualizer with an update(tag, epoch, dict) method.
        savefiletime: suffix used when dumping training metrics to disk.

    Returns:
        The trained model.
    """
    print('**************************************')
    print('Start training...')
    dataset_size = len(dataloader['train'].dataset)
    print('train datasize: {}'.format(dataset_size))

    since = time.time()
    lap_solver = hungarian  # LAP solver turning scores into a permutation
    optimal_acc = 0.0
    optimal_rot = np.inf
    device = next(model.parameters()).device

    print('model on device: {}'.format(device))

    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    # Optionally restore model/optimizer state from a previous run.
    if resume:
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=cfg.TRAIN.LR_STEP,
        gamma=cfg.TRAIN.LR_DECAY,
        last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        iter_num = 0
        running_since = time.time()
        all_train_metrics_np = defaultdict(list)

        # Iterate over data3d.
        for inputs in dataloader['train']:
            P1_gt, P2_gt = [_.cuda()
                            for _ in inputs['Ps']]  # keypoint coordinates
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]  # keypoint counts
            A1_gt, A2_gt = [_.cuda()
                            for _ in inputs['As']]  # edge connectivity matrices
            perm_mat = inputs['gt_perm_mat'].cuda()  # GT permutation matrix
            T1_gt, T2_gt = [_.cuda() for _ in inputs['Ts']]
            Inlier_src_gt, Inlier_ref_gt = [_.cuda() for _ in inputs['Ins']]

            batch_cur_size = perm_mat.size(0)
            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                s_pred, Inlier_src_pre, Inlier_ref_pre = model(
                    P1_gt, P2_gt, A1_gt, A2_gt, n1_gt, n2_gt)

                # multi_loss = []
                if cfg.DATASET.NOISE_TYPE == 'clean':
                    permloss = permLoss(s_pred, perm_mat, n1_gt, n2_gt)
                    loss = permloss
                else:
                    # With noisy data, optionally gate the score matrix by
                    # the predicted inlier indicators.
                    if cfg.PGM.USEINLIERRATE:
                        s_pred = Inlier_src_pre * s_pred * Inlier_ref_pre.transpose(
                            2, 1).contiguous()
                    permloss = permLoss(s_pred, perm_mat, n1_gt, n2_gt)
                    loss = permloss

                # backward + optimize
                loss.backward()
                optimizer.step()

                # training accuracy statistic
                s_perm_mat = lap_solver(s_pred, n1_gt, n2_gt, Inlier_src_pre,
                                        Inlier_ref_pre)
                match_metrics = matching_accuracy(s_perm_mat, perm_mat, n1_gt)
                perform_metrics = compute_metrics(s_perm_mat, P1_gt[:, :, :3],
                                                  P2_gt[:, :, :3],
                                                  T1_gt[:, :3, :3],
                                                  T1_gt[:, :3, 3])

                for k in match_metrics:
                    all_train_metrics_np[k].append(match_metrics[k])
                for k in perform_metrics:
                    all_train_metrics_np[k].append(perform_metrics[k])
                all_train_metrics_np['loss'].append(np.repeat(loss.item(), 4))

                # Progress report averaged over the last STATISTIC_STEP batches.
                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * batch_cur_size / (
                        time.time() - running_since)
                    # globalstep = epoch * dataset_size + iter_num * batch_cur_size
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f} GT-Acc:{:.4f} Pred-Acc:{:.4f}'
                        .format(
                            epoch, iter_num, running_speed,
                            np.mean(
                                np.concatenate(all_train_metrics_np['loss'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:]),
                            np.mean(
                                np.concatenate(all_train_metrics_np['acc_gt'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:]),
                            np.mean(
                                np.concatenate(
                                    all_train_metrics_np['acc_pred'])
                                [-cfg.STATISTIC_STEP * batch_cur_size:])))
                    running_since = time.time()

        # Flatten the per-batch metric lists and summarize the epoch.
        all_train_metrics_np = {
            k: np.concatenate(all_train_metrics_np[k])
            for k in all_train_metrics_np
        }
        summary_metrics = summarize_metrics(all_train_metrics_np)
        print('Epoch {:<4} Mean-Loss: {:.4f} GT-Acc:{:.4f} Pred-Acc:{:.4f}'.
              format(epoch, summary_metrics['loss'], summary_metrics['acc_gt'],
                     summary_metrics['acc_pred']))
        print_metrics(summary_metrics)

        # Checkpoint model and optimizer state every epoch.
        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        # to save values during training (disabled by default)
        metric_is_save = False
        if metric_is_save:
            np.save(
                str(
                    Path(cfg.OUTPUT_PATH) /
                    ('train_log_' + savefiletime + '_metric')),
                all_train_metrics_np)

        if viz is not None:
            viz.update('train_loss', epoch, {'loss': summary_metrics['loss']})
            viz.update('train_acc', epoch, {'acc': summary_metrics['acc_gt']})
            viz.update(
                'train_metric', epoch, {
                    'r_mae': summary_metrics['r_mae'],
                    't_mae': summary_metrics['t_mae']
                })

        # Eval in each epoch; track the best accuracy / rotation-error epochs.
        val_metrics = eval_model(model, dataloader['val'])
        if viz is not None:
            viz.update('val_acc', epoch, {'acc': val_metrics['acc_gt']})
            viz.update('val_metric', epoch, {
                'r_mae': val_metrics['r_mae'],
                't_mae': val_metrics['t_mae']
            })
        if optimal_acc < val_metrics['acc_gt']:
            optimal_acc = val_metrics['acc_gt']
            print('Current best acc model is {}'.format(epoch + 1))
        if optimal_rot > val_metrics['r_mae']:
            optimal_rot = val_metrics['r_mae']
            print('Current best rotation model is {}'.format(epoch + 1))

        # Test in each epoch (reporting only; does not drive selection).
        test_metrics = eval_model(model, dataloader['test'])
        if viz is not None:
            viz.update('test_acc', epoch, {'acc': test_metrics['acc_gt']})
            viz.update('test_metric', epoch, {
                'r_mae': test_metrics['r_mae'],
                't_mae': test_metrics['t_mae']
            })

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
Example #21
0
def train(cmd_opt, word2vec_matrix, dataset_loader, num_classes, image_size,
            word2vec_embedding_size, image_branch, validation_examples):
    """Train the joint image/word-embedding model and report validation accuracy.

    Builds train/validation input batches, the training and evaluation
    graphs, then runs cmd_opt.numIters optimization steps, printing the
    training loss every cmd_opt.displayIters iterations.
    """
    global_step_tensor = tf.Variable(0, trainable=False, name="global_step")

    # Input pipelines: half-size batches for training, full-size for validation.
    train_image_tensor, train_vector_tensor, train_label_tensor = \
        create_batches(dataset_loader, cmd_opt.batchSize // 2, image_size,
                       word2vec_embedding_size,)
    valid_image_tensor, valid_vector_tensor, valid_label_tensor = \
        create_batches(dataset_loader, cmd_opt.batchSize, image_size,
                       word2vec_embedding_size, is_train=False,
                       is_valid=True)

    image_placeholder, wordvec_placeholder, groundtruth_placeholder, \
        matrix_placeholder = create_placeholders(cmd_opt.batchSize, image_size,
                                                 word2vec_embedding_size, num_classes)

    # Training graph: both embeddings, the loss and the train op.
    im_em, word_em, loss_tensor, train_op = train_model(
        image=image_placeholder,
        word_vec=wordvec_placeholder,
        groundtruth=groundtruth_placeholder,
        embedding_size=cmd_opt.embeddingSize,
        learning_rate=cmd_opt.learningRate,
        global_step=global_step_tensor,
        loss_margin=cmd_opt.margin,
        num_classes=num_classes,
        batch_size=cmd_opt.batchSize)

    # Evaluation graph; cmd_opt.validation selects word vs. image retrieval.
    label_tensor, _, embedding_inversion = eval_model(
        image_embedding=im_em,
        wordvec_embedding=word_em,
        matrix=matrix_placeholder,
        batch_size=cmd_opt.batchSize,
        num_classes=num_classes,
        word_validation=cmd_opt.validation == 0,
        image_validation=cmd_opt.validation == 1)

    supervisor_saver = tf.train.Saver()

    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True

    session = tf.Session(config=config_proto)
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord, sess=session)
    # NOTE(review): debug wrapper left enabled — drops into the TF CLI
    # debugger on every run; remove for unattended training.
    session = tf_debug.LocalCLIDebugWrapperSession(session)

    session.run(tf.global_variables_initializer())
    global_step = session.run(global_step_tensor)
    start_time = time.time()
    # `step` replaces the original loop variable `iter`, which shadowed
    # the builtin of the same name.
    for step in range(global_step, cmd_opt.numIters):
        loss, global_step = train_iter(session, train_op, loss_tensor,
                                       global_step_tensor, train_image_tensor,
                                       train_vector_tensor, train_label_tensor,
                                       dataset_loader.word2vec_matrix,
                                       dataset_loader.distance_matrix,
                                       image_placeholder, wordvec_placeholder,
                                       groundtruth_placeholder, permutation=True)
        if (step + 1) % cmd_opt.displayIters == 0:
            end_time = time.time()
            print("Time per iteration: ", str((end_time - start_time) / cmd_opt.displayIters))
            print("Training Loss at ", step + 1, ": ", str(loss))
            start_time = time.time()

        # NOTE(review): runs every iteration; the `(iter+1) % validIters`
        # guard is commented out upstream — confirm whether this is intended.
        print("Validation accuracy: ",
              compute_word_accuracy(session, valid_image_tensor,
                                    valid_label_tensor, label_tensor,
                                    image_placeholder,
                                    cmd_opt.batchSize, word2vec_matrix,
                                    matrix_placeholder, validation_examples))
Example #22
0
def train_model(model,
                optimizer,
                train_loader,
                epochs,
                scheduler,
                early_stopping=None,
                test_loader=None,
                eval_loader=None,
                device='cpu',
                t=1):
    """Train `model` for up to `epochs` epochs with optional early stopping.

    Args:
        model: network to optimize (moved to `device`).
        optimizer: optimizer stepped once per batch.
        train_loader: iterable of (x, y) training batches.
        epochs: maximum number of epochs.
        scheduler: per-epoch LR scheduler, or None.
        early_stopping: object with reset() and step(metric) returning <0 to
            stop, >0 on improvement; None disables early stopping.
        test_loader: loader scored each epoch for reporting.
            NOTE(review): despite the None default it is passed to
            eval_model unconditionally — None will fail; confirm usage.
        eval_loader: optional validation loader driving early stopping.
        device: torch device string.
        t: number of stochastic forward passes averaged per batch.

    Returns:
        (best_model_state_dict, per-epoch (train, eval, test) scores,
        scores at the best epoch, per-epoch mean losses).
    """
    scores = []
    mean_losses = []

    best_model = model.state_dict()
    best_model_i = 0
    model.to(device)

    if early_stopping is not None:
        early_stopping.reset()

    model.train()
    bar = tqdm(range(epochs), leave=True)
    for epoch in bar:
        model.train()
        losses = []
        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            if t > 1:
                # Average t stochastic forward passes (e.g. MC dropout).
                pred = torch.stack([model(x) for _ in range(t)], dim=0)
                pred = pred.mean(0)
            else:
                pred = model(x)
            # Keep per-sample losses for the epoch-mean statistic.
            loss = torch.nn.functional.cross_entropy(pred, y, reduction='none')
            losses.extend(loss.tolist())
            loss = loss.mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Fix: the original isinstance(StepLR/MultiStepLR) branch and the
        # hasattr branch were redundant — both just called scheduler.step().
        if scheduler is not None and hasattr(scheduler, 'step'):
            scheduler.step()

        if eval_loader is not None:
            eval_scores, _ = eval_model(model,
                                        eval_loader,
                                        topk=[1, 5],
                                        device=device)
        else:
            eval_scores = 0

        mean_loss = sum(losses) / len(losses)
        mean_losses.append(mean_loss)

        if early_stopping is not None:
            # Monitor top-1 eval accuracy when available, else the train loss.
            r = early_stopping.step(eval_scores[1]) if eval_loader is not None \
                else early_stopping.step(mean_loss)

            if r < 0:  # patience exhausted
                break
            elif r > 0:  # improvement: snapshot the model
                best_model = model.state_dict()
                best_model_i = epoch
        else:
            best_model = model.state_dict()
            best_model_i = epoch

        train_scores, _ = eval_model(model, train_loader, device=device)
        test_scores, _ = eval_model(model, test_loader, device=device)

        bar.set_postfix({
            'Train score': train_scores[1],
            'Test score': test_scores[1],
            'Eval score': eval_scores[1] if eval_scores != 0 else 0,
            'Mean loss': mean_loss
        })
        scores.append((train_scores, eval_scores, test_scores))

    return best_model, scores, scores[best_model_i], mean_losses
Example #23
0
def train_eval_model(model,
                     criterion,
                     optimizer,
                     image_dataset,
                     dataloader,
                     tfboard_writer,
                     benchmark,
                     num_epochs=25,
                     start_epoch=0,
                     xls_wb=None):
    """Train a graph-matching model and evaluate it on the benchmark each epoch.

    Args:
        model: matching network; assumed wrapped (``model.module`` is accessed),
            e.g. by ``DataParallel`` — TODO confirm against caller.
        criterion: loss callable selected via ``cfg.TRAIN.LOSS_FUNC``.
        optimizer: torch optimizer; its state is checkpointed every epoch.
        image_dataset: dict with a ``'train'`` dataset used to rebuild the
            training loader at the start of every epoch.
        dataloader: dict with ``'train'`` and ``'test'`` DataLoaders.
        tfboard_writer: TensorBoard SummaryWriter for loss/accuracy/speed/lr.
        benchmark: dict with a ``'test'`` benchmark object passed to ``eval_model``.
        num_epochs: exclusive upper bound on the epoch counter.
        start_epoch: epoch to resume from; selects which checkpoint files to load.
        xls_wb: xlwt-style workbook; one evaluation sheet is added per epoch
            and the workbook is saved after each evaluation.

    Returns:
        The trained model.
    """
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)
    displacement = Displacement()

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    # Checkpoints (model params + optimizer state) live under OUTPUT_PATH/params.
    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    model_path, optim_path = '', ''
    if start_epoch != 0:
        model_path = str(checkpoint_path / 'params_{:04}.pt'.format(start_epoch))
        optim_path = str(checkpoint_path / 'optim_{:04}.pt'.format(start_epoch))
    if len(cfg.PRETRAINED_PATH) > 0:
        # An explicit pretrained path overrides the resume checkpoint (weights only;
        # the optimizer state from start_epoch is still restored below).
        model_path = cfg.PRETRAINED_PATH
    if len(model_path) > 0:
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path, strict=False)
    if len(optim_path) > 0:
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=cfg.TRAIN.LR_STEP,
                                               gamma=cfg.TRAIN.LR_DECAY,
                                               last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        # Reset seed after evaluation per epoch
        torch.manual_seed(cfg.RANDOM_SEED + epoch + 1)
        dataloader['train'] = get_dataloader(image_dataset['train'], shuffle=True, fix_seed=False)
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            if iter_num >= cfg.TRAIN.EPOCH_ITERS:
                break
            if model.module.device != torch.device('cpu'):
                inputs = data_to_cuda(inputs)

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                outputs = model(inputs)

                if cfg.PROBLEM.TYPE == '2GM':
                    assert 'ds_mat' in outputs
                    assert 'perm_mat' in outputs
                    assert 'gt_perm_mat' in outputs

                    # compute loss
                    if cfg.TRAIN.LOSS_FUNC == 'offset':
                        d_gt, grad_mask = displacement(outputs['gt_perm_mat'], *outputs['Ps'], outputs['ns'][0])
                        d_pred, _ = displacement(outputs['ds_mat'], *outputs['Ps'], outputs['ns'][0])
                        loss = criterion(d_pred, d_gt, grad_mask)
                    elif cfg.TRAIN.LOSS_FUNC in ['perm', 'ce', 'hung']:
                        loss = criterion(outputs['ds_mat'], outputs['gt_perm_mat'], *outputs['ns'])
                    elif cfg.TRAIN.LOSS_FUNC == 'hamming':
                        loss = criterion(outputs['perm_mat'], outputs['gt_perm_mat'])
                    elif cfg.TRAIN.LOSS_FUNC == 'custom':
                        loss = torch.sum(outputs['loss'])
                    else:
                        raise ValueError(
                            'Unsupported loss function {} for problem type {}'.format(cfg.TRAIN.LOSS_FUNC,
                                                                                      cfg.PROBLEM.TYPE))

                    # compute accuracy
                    acc = matching_accuracy(outputs['perm_mat'], outputs['gt_perm_mat'], outputs['ns'][0])

                elif cfg.PROBLEM.TYPE in ['MGM', 'MGM3']:
                    assert 'ds_mat_list' in outputs
                    assert 'graph_indices' in outputs
                    assert 'perm_mat_list' in outputs
                    if 'gt_perm_mat_list' not in outputs:
                        assert 'gt_perm_mat' in outputs
                        gt_perm_mat_list = [outputs['gt_perm_mat'][idx] for idx in outputs['graph_indices']]
                    else:
                        gt_perm_mat_list = outputs['gt_perm_mat_list']

                    # compute loss & accuracy
                    # BUG FIX: was ['perm', 'ce' 'hung'] — the missing comma made
                    # implicit string concatenation produce 'cehung', so the 'ce'
                    # and 'hung' losses fell through to the ValueError branch.
                    if cfg.TRAIN.LOSS_FUNC in ['perm', 'ce', 'hung']:
                        loss = torch.zeros(1, device=model.module.device)
                        ns = outputs['ns']
                        for s_pred, x_gt, (idx_src, idx_tgt) in \
                                zip(outputs['ds_mat_list'], gt_perm_mat_list, outputs['graph_indices']):
                            l = criterion(s_pred, x_gt, ns[idx_src], ns[idx_tgt])
                            loss += l
                        loss /= len(outputs['ds_mat_list'])
                    elif cfg.TRAIN.LOSS_FUNC == 'plain':
                        loss = torch.sum(outputs['loss'])
                    else:
                        raise ValueError(
                            'Unsupported loss function {} for problem type {}'.format(cfg.TRAIN.LOSS_FUNC,
                                                                                      cfg.PROBLEM.TYPE))

                    # compute accuracy, averaged over all graph pairs in the cycle
                    acc = torch.zeros(1, device=model.module.device)
                    for x_pred, x_gt, (idx_src, idx_tgt) in \
                            zip(outputs['perm_mat_list'], gt_perm_mat_list, outputs['graph_indices']):
                        a = matching_accuracy(x_pred, x_gt, ns[idx_src])
                        acc += torch.sum(a)
                    acc /= len(outputs['perm_mat_list'])
                else:
                    raise ValueError('Unknown problem type {}'.format(cfg.PROBLEM.TYPE))

                # backward + optimize
                if cfg.FP16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()

                batch_num = inputs['batch_size']

                # tfboard writer
                loss_dict = dict()
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars('loss', loss_dict, epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)

                accdict = dict()
                accdict['matching accuracy'] = torch.mean(acc)
                tfboard_writer.add_scalars(
                    'training accuracy',
                    accdict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                )

                # statistics
                running_loss += loss.item() * batch_num
                epoch_loss += loss.item() * batch_num

                if iter_num % cfg.STATISTIC_STEP == 0:
                    running_speed = cfg.STATISTIC_STEP * batch_num / (time.time() - running_since)
                    print('Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                          .format(epoch, iter_num, running_speed, running_loss / cfg.STATISTIC_STEP / batch_num))
                    tfboard_writer.add_scalars(
                        'speed',
                        {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )

                    tfboard_writer.add_scalars(
                        'learning rate',
                        {'lr_{}'.format(i): x['lr'] for i, x in enumerate(optimizer.param_groups)},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )

                    running_loss = 0.0
                    running_since = time.time()

        # NOTE(review): assumes the train loader yields at least one batch
        # (otherwise batch_num is unbound here) — matches original behavior.
        epoch_loss = epoch_loss / cfg.TRAIN.EPOCH_ITERS / batch_num

        save_model(model, str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(), str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        if dataloader['test'].dataset.cls not in ['none', 'all', None]:
            clss = [dataloader['test'].dataset.cls]
        else:
            clss = dataloader['test'].dataset.bm.classes
        l_e = (epoch == (num_epochs - 1))
        accs = eval_model(model, clss, benchmark['test'], l_e,
                          xls_sheet=xls_wb.add_sheet('epoch{}'.format(epoch + 1)))
        acc_dict = {"{}".format(cls): single_acc for cls, single_acc in zip(dataloader['test'].dataset.classes, accs)}
        acc_dict['average'] = torch.mean(accs)
        tfboard_writer.add_scalars(
            'Eval acc',
            acc_dict,
            (epoch + 1) * cfg.TRAIN.EPOCH_ITERS
        )
        # BUG FIX: previously `wb.save(wb.__save_path)` — `wb` is not defined in
        # this scope; the workbook is the `xls_wb` parameter used just above.
        xls_wb.save(xls_wb.__save_path)

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
def main():
    """Entry point: train a compositional recognizer and track MAP per epoch.

    Parses hyper-parameters from the command line, builds the dataset and
    model, runs the training loop (optionally regenerating the training set
    every epoch), and — if ``--write-to`` is given — appends one JSON line
    containing the parameters, per-epoch losses, and MAP scores to that file.
    """
    parser = argparse.ArgumentParser(description='Train a compositional recognizer model.')
    parser.add_argument('-c', '--class-count', type=int, default=10)
    parser.add_argument('-l', '--seq-length', type=int, default=10)
    parser.add_argument('-o', '--overlap', type=int, default=2)
    parser.add_argument('--noise', type=float, default=None)
    parser.add_argument('-s', '--hidden-size', type=int, default=512)
    parser.add_argument('-b', '--batch-size', type=int, default=8)
    parser.add_argument('-n', '--epoch-size', type=int, default=5120)
    parser.add_argument('-v', '--validation-size', type=int, default=5120)
    parser.add_argument('-e', '--epoch-count', type=int, default=100)
    parser.add_argument('-g', '--gpu_id', type=int, default=0)
    parser.add_argument('-d', '--dropout', type=float, default=0)
    parser.add_argument('-r', '--regenerate', action='store_true')
    parser.add_argument('-w', '--write-to', default=None)

    args = parser.parse_args()

    dataset = CompositionalDataset(args.class_count, args.seq_length, args.overlap, args.noise)
    model = CompositionalRecognizer(args.class_count, args.hidden_size, args.dropout)
    optimizer = torch.optim.Adam(model.parameters())
    device = torch.device(f'cuda:{args.gpu_id}')

    # With --regenerate the training set is rebuilt inside the epoch loop
    # instead of once up front.
    train_loader = None
    if not args.regenerate:
        print('Generating training dataset...')
        train_loader = DataLoader(dataset.generate_dataset(args.epoch_size), args.batch_size, drop_last=True)
    print('Generating validation dataset...')
    val_loader = DataLoader(dataset.generate_dataset(args.validation_size), 256, drop_last=True)

    losses, scores = [], []
    batches_per_epoch = args.epoch_size // args.batch_size

    model = model.to(device)
    for epoch in range(1, args.epoch_count + 1):
        model.train()
        if args.regenerate:
            print(f'Generating training dataset for epoch {epoch}...')
            train_loader = DataLoader(dataset.generate_dataset(args.epoch_size), args.batch_size, drop_last=True)
        print(f'Starting epoch {epoch}...')

        running_total = 0
        pbar = tqdm(total=args.epoch_size, desc='Batch - (Loss = -)')
        for step, (x, labels) in enumerate(train_loader, start=1):
            x, labels = x.to(device), labels.to(device)
            optimizer.zero_grad()

            loss = model.forward_loss(x, labels)
            step_loss = loss.item()
            running_total += step_loss

            loss.backward()
            optimizer.step()

            pbar.update(args.batch_size)
            pbar.set_description(f'Batch {step} (Loss = {round(step_loss, 3)})')
        pbar.close()

        epoch_loss = running_total / batches_per_epoch
        losses.append(epoch_loss)
        print('Average epoch loss:', epoch_loss)

        print('Evaluating MAP score...')
        model.eval()
        map_score = eval_model(model, val_loader, device)
        scores.append(map_score)
        print('Epoch MAP score:', map_score)

    if args.write_to is not None:
        with open(args.write_to, 'a') as f:
            f.write(json.dumps(dict(params=vars(args), losses=losses, scores=scores)) + '\n')
def train_eval_model(model,
                     criterion,
                     optimizer,
                     dataloader,
                     tfboard_writer,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0):
    """Train a matching model with margin losses and evaluate every epoch.

    Args:
        model: matching network; called with image pairs, keypoints and a
            ground-truth permutation matrix (see the forward call below).
        criterion: weighted permutation loss over the predicted score matrix.
        optimizer: torch optimizer; its state is checkpointed every epoch.
        dataloader: dict with 'train' and 'test' DataLoaders.
        tfboard_writer: TensorBoard SummaryWriter for loss/speed curves.
        num_epochs: exclusive upper bound on the epoch counter.
        resume: when True, reload model and optimizer state from the
            checkpoints written at `start_epoch` (which must be non-zero).
        start_epoch: epoch to resume from.

    Returns:
        The trained model.

    Side effects: writes 'params_NNNN.pt' / 'optim_NNNN.pt' checkpoints under
    cfg.OUTPUT_PATH/params, logs to TensorBoard, and prints progress.
    """
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    # Checkpoints (model params + optimizer state) live under OUTPUT_PATH/params.
    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    #model_path = str(checkpoint_path / 'params_{:04}.pt'.format(2))
    #print('Loading model parameters from {}'.format(model_path))
    #load_model(model, model_path)
    if resume:
        assert start_epoch != 0
        model_path = str(checkpoint_path /
                         'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path /
                         'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    # Two margin losses: one for node embeddings, one for edge embeddings
    # (margins 30 and 1 respectively; 0.3 presumably a secondary margin/scale
    # of MarginLoss — confirm against its definition).
    margin_loss = MarginLoss(30)
    marginedge_loss = MarginLoss(1, 0.3)
    scheduler = optim.lr_scheduler.ExponentialLR(
        optimizer,
        gamma=cfg.TRAIN.LR_DECAY,
        last_epoch=cfg.TRAIN.START_EPOCH - 1)
    #scheduler.step()
    for epoch in range(start_epoch, num_epochs):
        # Curriculum-style threshold: grows 0.0, 0.1, ... and saturates at 0.5.
        score_thresh = min(epoch * 0.1, 0.5)
        print('Epoch {}/{},score_thresh {}'.format(epoch, num_epochs - 1,
                                                   score_thresh))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(
            ['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            # Image pair, keypoint coordinates and per-graph node counts.
            data1, data2 = [_.cuda() for _ in inputs['images']]

            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]

            weights = inputs['ws'].cuda()
            perm_mat = inputs['gt_perm_mat'].cuda()
            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                # NOTE: the model returns (possibly filtered) perm_mat and node
                # counts in addition to predictions, overwriting the inputs.
                s_pred, d_pred,match_emb1,match_emb2,match_edgeemb1,match_edgeemb2,perm_mat,n1_gt,n2_gt = \
                    model(data1, data2, P1_gt, P2_gt, n1_gt, n2_gt,perm_mat=perm_mat,score_thresh=score_thresh)

                # multi_loss is never populated here; the tfboard dict below
                # therefore only ever contains the total 'loss'.
                multi_loss = []
                loss_lsm = criterion(s_pred, perm_mat, n1_gt, n2_gt, weights)

                loss_marg = margin_loss(match_emb1, match_emb2, perm_mat,
                                        n1_gt, n2_gt)
                loss_edgemarg = marginedge_loss(match_edgeemb1, match_edgeemb2,
                                                perm_mat, n1_gt, n2_gt)
                # Total: down-weighted margin terms plus the permutation loss.
                loss = (loss_marg + loss_edgemarg
                        ) * 0.25 + loss_lsm  #(loss_marg)*0.5+loss_pca
                # backward + optimize
                loss.backward()
                optimizer.step()

                # tfboard writer
                loss_dict = {
                    'loss_{}'.format(i): l.item()
                    for i, l in enumerate(multi_loss)
                }
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars(
                    'loss', loss_dict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                # statistics
                running_loss += loss.item() * perm_mat.size(0)
                epoch_loss += loss.item() * perm_mat.size(0)

                if iter_num % cfg.STATISTIC_STEP == 0:
                    # Throughput over the last STATISTIC_STEP iterations.
                    running_speed = cfg.STATISTIC_STEP * perm_mat.size(0) / (
                        time.time() - running_since)
                    print(
                        'Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                        .format(
                            epoch, iter_num, running_speed, running_loss /
                            cfg.STATISTIC_STEP / perm_mat.size(0)))
                    tfboard_writer.add_scalars(
                        'speed', {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                    running_loss = 0.0
                    running_since = time.time()

        # Sample-weighted mean loss over the full training set.
        epoch_loss = epoch_loss / dataset_size

        save_model(model,
                   str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(),
                   str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        accs = eval_model(model, dataloader['test'], train_epoch=epoch)
        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
Example #26
0
        learning_rate=args.learning_rate,
        cuda=cuda)
    start_epoch, best_accuracy = load_model(model, cuda)

    for epoch in range(start_epoch, args.epochs):
        train_model(model=model,
                    optimizer=optimizer,
                    train_loader=train_loader,
                    train_dataset=train_dataset,
                    loss_fn=loss_fn,
                    num_epochs=args.epochs,
                    epoch=epoch,
                    batch_size=args.batch_size,
                    notify=args.notify,
                    cuda=cuda)
        accuracy = eval_model(model=model, test_loader=test_loader, cuda=cuda)
        accuracy = 100. * accuracy / len(test_loader.dataset)

        logging.info('Test Accuracy: {:.2f}%'.format(accuracy))
        send_metrics(accuracy=accuracy)

        # Save checkpoint logic
        accuracy = torch.FloatTensor([accuracy])
        best_accuracy = torch.FloatTensor(max(accuracy.numpy(), best_accuracy.numpy()))
        if bool(accuracy.numpy() > best_accuracy.numpy()):
            logging.info('Saving new state for epoch {}'.format(epoch))
            state = {
                'epoch': epoch + 1,
                'state': model.state_dict(),
                'accuracy': best_accuracy
            }
Example #27
0
def train_n_epochs(model,
                   hyper,
                   data,
                   data_valid,
                   evals,
                   n_epochs,
                   feedbacks_per_epoch=10,
                   alpha_decay=1.0):
    """ Train the model for a desired amount of epochs.
        Automatically takes snapshots of the parameters after each epoch, and
        monitors the progress.

    Args:
        model:
        hyper:      (dict) hyperparameters dictionary
        data:       (str) Training data
        data_valid: (str) Validation data
        evals:      (dict of lists)
                    The dict that stores the losses and times for each epoch
        n_epochs:   (int) Number of epochs to train for
        feedbacks_per_epoch: (int) Max number of progress printouts per epoch
        alpha_decay: (float)(default=1.0)
                    How much to decay the alpha by after each epoch.

    Returns: (dict)
        - evals - the dictionary that monitors the losses, and times

    NOTE(review): relies on module-level globals (char2id, MODEL_NAME,
    SNAPSHOTS_DIR, MODELS_DIR, EVALS_FILE, HYPERPARAMS_FILE,
    LEARNING_CURVES_FILE) defined elsewhere in this file.
    """
    timer = Timer()
    timer.start()

    # CALCULATE NUMBER OF STEPS NEEDED
    # Technically the following calculation for `samples_per_epoch` is incorrect,
    # since we are randomly sampling windows, and not dividing the data into an
    # even number of chunks.
    # But it is still a useful approximation, that allows us to have more variation
    # in the training data.
    # BUG FIX: previously referenced the undefined global `data_train`;
    # the training data is the `data` parameter.
    samples_per_epoch = int(len(data) // hyper["SAMPLE_LENGTH"])
    steps_per_epoch = int(samples_per_epoch / hyper["BATCH_SIZE"])
    feedback_every = int(steps_per_epoch / feedbacks_per_epoch)

    try:
        for i in range(n_epochs):
            print()
            print("=" * 60)
            print("EPOCH {}/{} ({:0.2f}%) alpha={}".format(
                i + 1, n_epochs, 100 * (i / n_epochs), model.alpha))
            print("=" * 60)

            # TRAIN OVER A SINGLE EPOCH
            # BUG FIX: pass the `data` parameter, not the undefined `data_train`.
            train_loss, epoch_time = train_n_steps(
                model,
                hyper,
                data,
                n_steps=steps_per_epoch,
                batch_size=hyper["BATCH_SIZE"],
                feedback_every=feedback_every)

            evals["train_loss"].append(train_loss)
            evals["train_time"].append(epoch_time)
            evals["alpha"].append(model.alpha)

            # EVALUATE ON VALIDATION DATA
            eval_loss, eval_time = eval_model(
                model,
                data_valid,
                char2id,
                seq_length=hyper["SAMPLE_LENGTH"],
                batch_size=hyper["BATCH_SIZE"])
            evals["valid_loss"].append(eval_loss)
            evals["valid_time"].append(eval_time)

            # PREPARE MODEL FOR NEXT EPOCH
            model.update_learning_rate(model.alpha * alpha_decay)
            hyper["LAST_ALPHA"] = model.alpha

            # TAKE SNAPSHOTS - of parameters and evaluation dictionary
            global_epoch = len(evals["train_loss"])
            epoch_snapshot(model,
                           epoch=global_epoch,
                           loss=eval_loss,
                           name=MODEL_NAME,
                           dir=SNAPSHOTS_DIR)
            obj2pickle(evals, EVALS_FILE)
            save_hyper_params(hyper, HYPERPARAMS_FILE)

            # FEEDBACK PRINTOUTS
            # TODO: Save a sample numerous generated strings to files at each epoch
            # Print a sample of generated text
            print_sample_generation(model, char2id, exploration=0.85)
            epoch_template = "({}) TRAIN_LOSS={: 7.3f} VALID_LOSS={: 7.3f}"
            print(
                epoch_template.format(timer.elapsed_string(), train_loss,
                                      eval_loss))

            # UPDATE LEARNING CURVE PLOT
            plot_learning_curves(evals,
                                 file=LEARNING_CURVES_FILE,
                                 model_name=MODEL_NAME)

        print("- DONE")
        return evals

    # HANDLE EARLY TERMINATION
    except KeyboardInterrupt:
        print("\n A keyboard interrupt was triggered at",
              timer.elapsed_string())

        # Save parameters as a recovery file
        print("Storing Recovery parameters")
        file = os.path.join(SNAPSHOTS_DIR, MODEL_NAME + ".recovery_params")
        take_snapshot(model, file)

        # Save evals as a recovery file
        print("Storing Recovery evals")
        file = os.path.join(MODELS_DIR, MODEL_NAME + ".recovery_evals")
        obj2pickle(evals, file)

        # Save hyper parameters
        print("Saving Hyper Params")
        hyper["LAST_ALPHA"] = model.alpha
        save_hyper_params(hyper, HYPERPARAMS_FILE)

        print("OK DONE")
        return evals
Example #28
0
def train_eval_model(model,
                     criterion,
                     optimizer,
                     dataloader,
                     tfboard_writer,
                     num_epochs=25,
                     resume=False,
                     start_epoch=0):
    """Train a graph-matching model ('offset' or 'perm' loss) with per-epoch eval.

    Args:
        model: matching network; called with image or feature pairs, keypoints,
            and (when present) graph connectivity tensors.
        criterion: loss selected via cfg.TRAIN.LOSS_FUNC ('offset' or 'perm').
        optimizer: torch optimizer; its state is checkpointed every epoch.
        dataloader: dict with 'train' and 'test' DataLoaders.
        tfboard_writer: TensorBoard SummaryWriter for loss/accuracy/speed curves.
        num_epochs: exclusive upper bound on the epoch counter.
        resume: when True, reload model and optimizer state from the
            checkpoints written at `start_epoch` (which must be non-zero).
        start_epoch: epoch to resume from.

    Returns:
        The trained model.

    Side effects: writes 'params_NNNN.pt' / 'optim_NNNN.pt' checkpoints under
    cfg.OUTPUT_PATH/params, logs to TensorBoard, and prints progress.
    """
    print('Start training...')

    since = time.time()
    dataset_size = len(dataloader['train'].dataset)
    displacement = Displacement()
    # Hungarian algorithm discretizes the soft score matrix for accuracy stats.
    lap_solver = hungarian

    device = next(model.parameters()).device
    print('model on device: {}'.format(device))

    # Checkpoints (model params + optimizer state) live under OUTPUT_PATH/params.
    checkpoint_path = Path(cfg.OUTPUT_PATH) / 'params'
    if not checkpoint_path.exists():
        checkpoint_path.mkdir(parents=True)

    if resume:
        assert start_epoch != 0
        model_path = str(checkpoint_path / 'params_{:04}.pt'.format(start_epoch))
        print('Loading model parameters from {}'.format(model_path))
        load_model(model, model_path)

        optim_path = str(checkpoint_path / 'optim_{:04}.pt'.format(start_epoch))
        print('Loading optimizer state from {}'.format(optim_path))
        optimizer.load_state_dict(torch.load(optim_path))

    scheduler = optim.lr_scheduler.MultiStepLR(optimizer,
                                               milestones=cfg.TRAIN.LR_STEP,
                                               gamma=cfg.TRAIN.LR_DECAY,
                                               last_epoch=cfg.TRAIN.START_EPOCH - 1)

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        model.train()  # Set model to training mode

        print('lr = ' + ', '.join(['{:.2e}'.format(x['lr']) for x in optimizer.param_groups]))

        epoch_loss = 0.0
        running_loss = 0.0
        running_since = time.time()
        iter_num = 0

        # Iterate over data.
        for inputs in dataloader['train']:
            # Batches carry either raw images or precomputed features.
            if 'images' in inputs:
                data1, data2 = [_.cuda() for _ in inputs['images']]
                inp_type = 'img'
            elif 'features' in inputs:
                data1, data2 = [_.cuda() for _ in inputs['features']]
                inp_type = 'feat'
            else:
                raise ValueError('no valid data key (\'images\' or \'features\') found from dataloader!')
            P1_gt, P2_gt = [_.cuda() for _ in inputs['Ps']]
            n1_gt, n2_gt = [_.cuda() for _ in inputs['ns']]
            if 'es' in inputs:
                # Graph structure: edges, incidence matrices and affinity kernels.
                e1_gt, e2_gt = [_.cuda() for _ in inputs['es']]
                G1_gt, G2_gt = [_.cuda() for _ in inputs['Gs']]
                H1_gt, H2_gt = [_.cuda() for _ in inputs['Hs']]
                KG, KH = [_.cuda() for _ in inputs['Ks']]
            perm_mat = inputs['gt_perm_mat'].cuda()

            iter_num = iter_num + 1

            # zero the parameter gradients
            optimizer.zero_grad()

            with torch.set_grad_enabled(True):
                # forward
                if 'es' in inputs:
                    s_pred, d_pred = \
                        model(data1, data2, P1_gt, P2_gt, G1_gt, G2_gt, H1_gt, H2_gt, n1_gt, n2_gt, KG, KH, inp_type)
                else:
                    s_pred, d_pred = \
                    model(data1, data2, P1_gt, P2_gt, n1_gt, n2_gt)

                # multi_loss is never populated here; the tfboard dict below
                # therefore only ever contains the total 'loss'.
                multi_loss = []
                if cfg.TRAIN.LOSS_FUNC == 'offset':
                    # Displacement ('offset') loss on predicted keypoint shifts.
                    d_gt, grad_mask = displacement(perm_mat, P1_gt, P2_gt, n1_gt)
                    loss = criterion(d_pred, d_gt, grad_mask)
                elif cfg.TRAIN.LOSS_FUNC == 'perm':
                    # Permutation loss on the soft score matrix.
                    loss = criterion(s_pred, perm_mat, n1_gt, n2_gt)
                else:
                    raise ValueError('Unknown loss function {}'.format(cfg.TRAIN.LOSS_FUNC))

                # backward + optimize
                loss.backward()
                optimizer.step()

                if cfg.MODULE == 'NGM.hypermodel':
                    # Log the hypergraph model's learned mixing weights.
                    tfboard_writer.add_scalars(
                        'weight',
                        {'w2': model.module.weight2, 'w3': model.module.weight3},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )

                # training accuracy statistic
                acc, _, __ = matching_accuracy(lap_solver(s_pred, n1_gt, n2_gt), perm_mat, n1_gt)

                # tfboard writer
                loss_dict = {'loss_{}'.format(i): l.item() for i, l in enumerate(multi_loss)}
                loss_dict['loss'] = loss.item()
                tfboard_writer.add_scalars('loss', loss_dict, epoch * cfg.TRAIN.EPOCH_ITERS + iter_num)
                accdict = dict()
                accdict['matching accuracy'] = acc
                tfboard_writer.add_scalars(
                    'training accuracy',
                    accdict,
                    epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                )

                # statistics
                running_loss += loss.item() * perm_mat.size(0)
                epoch_loss += loss.item() * perm_mat.size(0)

                if iter_num % cfg.STATISTIC_STEP == 0:
                    # Throughput over the last STATISTIC_STEP iterations.
                    running_speed = cfg.STATISTIC_STEP * perm_mat.size(0) / (time.time() - running_since)
                    print('Epoch {:<4} Iteration {:<4} {:>4.2f}sample/s Loss={:<8.4f}'
                          .format(epoch, iter_num, running_speed, running_loss / cfg.STATISTIC_STEP / perm_mat.size(0)))
                    tfboard_writer.add_scalars(
                        'speed',
                        {'speed': running_speed},
                        epoch * cfg.TRAIN.EPOCH_ITERS + iter_num
                    )
                    running_loss = 0.0
                    running_since = time.time()

        # Sample-weighted mean loss over the full training set.
        epoch_loss = epoch_loss / dataset_size

        save_model(model, str(checkpoint_path / 'params_{:04}.pt'.format(epoch + 1)))
        torch.save(optimizer.state_dict(), str(checkpoint_path / 'optim_{:04}.pt'.format(epoch + 1)))

        print('Epoch {:<4} Loss: {:.4f}'.format(epoch, epoch_loss))
        print()

        # Eval in each epoch
        accs = eval_model(model, dataloader['test'])
        acc_dict = {"{}".format(cls): single_acc for cls, single_acc in zip(dataloader['train'].dataset.classes, accs)}
        acc_dict['average'] = torch.mean(accs)
        tfboard_writer.add_scalars(
            'Eval acc',
            acc_dict,
            (epoch + 1) * cfg.TRAIN.EPOCH_ITERS
        )

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed // 60) % 60, time_elapsed % 60))

    return model
Example #29
0
        '/home/dcg-adlr-mranzinger-data.cosmos1100/scene-text/icdar/incidental_text/',
        help='Path to the images to test against')

    args = parser.parse_args()

    model = EAST(False)

    paths = []

    for dirpath, dirnames, filenames in os.walk(args.root):
        for dirname in dirnames:
            if dirname == 'checkpoints':
                experiment = os.path.join(dirpath, dirname)

                try:
                    chk = resolve_checkpoint_path(experiment, load_best=True)
                    paths.append(chk)
                except:
                    pass

    paths.sort()

    for dataset in ['val', 'relabeled_val']:
        dataset = os.path.join(args.dataset, dataset)
        print(f'\n\n----------------------\nDataset: {dataset}')
        for chk in paths:
            print(f'\nUsing checkpoint: {chk}')

            submit_path = './submit'
            eval_model(model, chk, dataset, submit_path)
Example #30
0
def be_model_training(model,
                      optimizer,
                      train_loader,
                      epochs,
                      scheduler,
                      early_stopping=None,
                      test_loader=None,
                      eval_loader=None,
                      device='cpu',
                      w=1):
    """Train a masked-ensemble model with an inverse-MMD diversity regularizer.

    Every wrapped layer (``EnsembleMaskedWrapper`` /
    ``BatchEnsembleMaskedWrapper``) exposes a set of mask distributions;
    the loss adds ``w / (sum of pairwise MMDs + eps)`` so the optimizer is
    pushed to keep the distributions far apart.

    Args:
        model: network containing masked-wrapper modules.
        optimizer: optimizer over ``model``'s parameters.
        train_loader: iterable of ``(x, y)`` training batches.
        epochs: number of training epochs.
        scheduler: LR scheduler; only ``StepLR``/``MultiStepLR`` are stepped.
        early_stopping: optional object with ``reset()`` and ``step(metric)``
            returning <0 (stop), 0 (continue) or >0 (new best).
        test_loader: optional test batches; skipped when ``None``.
        eval_loader: optional validation batches; skipped when ``None``.
        device: torch device string.
        w: weight of the diversity regularizer.

    Returns:
        Tuple ``(best_state_dict, scores, best_score, mean_losses)`` where
        ``scores`` is a list of ``(train, eval, test)`` per-epoch entries
        (an entry is ``0`` when its loader is ``None``) and ``best_score``
        is ``None`` if no epoch completed.
    """
    # Local import keeps the enclosing file's import block untouched.
    import copy

    def divergence():
        # Sum MMD over all unordered pairs of mask distributions of every
        # wrapped module (j > i avoids double counting and self-pairs).
        d = torch.tensor(0.0, device=device)

        for _, module in model.named_modules():
            if isinstance(module,
                          (EnsembleMaskedWrapper, BatchEnsembleMaskedWrapper)):
                distr = module.distributions
                for i, d1 in enumerate(distr):
                    for j, d2 in enumerate(distr):
                        if j <= i:
                            continue
                        d += MMD(d1, d2)

        return d

    model.to(device)

    # Activate all mask distributions before training.
    for _, module in model.named_modules():
        if isinstance(module, EnsembleMaskedWrapper):
            module.set_distribution('all')

    scores = []
    mean_losses = []

    # BUG FIX: state_dict() returns live references to the parameter
    # tensors, so without a deep copy the "best" snapshot silently tracks
    # the final weights instead of the best epoch's.
    best_model = copy.deepcopy(model.state_dict())
    best_model_i = 0

    if early_stopping is not None:
        early_stopping.reset()

    model.train()
    bar = tqdm(range(epochs), leave=True, desc='Mask training')

    for epoch in bar:
        model.train()
        losses = []
        kl_losses = []

        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            pred = model(x)
            loss = torch.nn.functional.cross_entropy(pred, y, reduction='none')
            losses.extend(loss.tolist())
            loss = loss.mean()

            # Inverse divergence: small pairwise MMD -> large penalty,
            # pushing the mask distributions apart. eps avoids div-by-zero.
            kl = divergence()
            kl = 1 / (kl + 1e-12)
            kl *= w
            kl_losses.append(kl.item())
            loss += kl

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if scheduler is not None:
            # Only epoch-wise schedulers are supported here.
            if isinstance(scheduler, (StepLR, MultiStepLR)):
                scheduler.step()

        if eval_loader is not None:
            eval_scores, _ = eval_model(model,
                                        eval_loader,
                                        topk=[1, 5],
                                        device=device)
        else:
            eval_scores = 0

        mean_loss = sum(losses) / len(losses)
        mean_losses.append(mean_loss)

        if early_stopping is not None:
            # Track top-1-style score when a validation set exists,
            # otherwise fall back to the mean training loss.
            r = early_stopping.step(
                eval_scores[1]
            ) if eval_loader is not None else early_stopping.step(mean_loss)

            if r < 0:
                break
            elif r > 0:
                best_model = copy.deepcopy(model.state_dict())
                best_model_i = epoch

        train_scores, _ = eval_model(model, train_loader, device=device)
        # BUG FIX: test_loader defaults to None but was previously passed to
        # eval_model unconditionally; guard it the same way as eval_loader.
        if test_loader is not None:
            test_scores, _ = eval_model(model, test_loader, device=device)
        else:
            test_scores = 0

        mean_kl_loss = sum(kl_losses) / len(kl_losses)
        bar.set_postfix({
            'Train score': train_scores[1],
            'Test score': test_scores[1] if test_scores != 0 else 0,
            'Eval score': eval_scores[1] if eval_scores != 0 else 0,
            'Mean loss': mean_loss,
            'Kl loss': mean_kl_loss
        })

        scores.append((train_scores, eval_scores, test_scores))

    # BUG FIX: scores[best_model_i] raised IndexError when epochs == 0.
    best_score = scores[best_model_i] if scores else None
    return best_model, scores, best_score, mean_losses