def train_model(use_tfboard=False):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    if use_tfboard:
        from c2board.writer import SummaryWriter
        tblogger = SummaryWriter(output_dir)
        tblogger.write_graph(model)

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model, tblogger if use_tfboard else None)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

        if use_tfboard:
            for gpu_id in range(cfg.NUM_GPUS):
                tblogger.append_image("gpu_{}/data".format(gpu_id))
            tblogger.write_summaries(cur_iter)

    if use_tfboard:
        tblogger.close()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
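
The lr value passed to model.UpdateWorkspaceLr above comes from lr_policy.get_lr_at_iter. A minimal sketch, assuming a step-decay policy with linear warm-up (the hyperparameter names mirror Detectron's SOLVER options, but this is an illustrative stand-in, not the library code):

# Illustrative sketch (not Detectron's lr_policy module): step decay with
# linear warm-up, driven by SOLVER-style hyperparameters.
def get_lr_at_iter_sketch(cur_iter, base_lr=0.02, gamma=0.1,
                          steps=(0, 60000, 80000),
                          warmup_iters=500, warmup_factor=1.0 / 3):
    # Step decay: multiply the base LR by gamma once per step boundary passed.
    step_index = sum(1 for s in steps[1:] if cur_iter >= s)
    lr = base_lr * (gamma ** step_index)
    # Linear warm-up over the first few hundred iterations.
    if cur_iter < warmup_iters:
        alpha = cur_iter / float(warmup_iters)
        lr *= warmup_factor * (1 - alpha) + alpha
    return lr
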
Example #2
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)
        if cur_iter == start_iter:
            # Debug hook (disabled): blobs that can be fetched and pickled for
            # offline inspection on the first iteration.
            data_to_save = [
                'gpu_0/data', 'gpu_0/keypoint_rois',
                'gpu_0/inter_keypoint_int32', 'gpu_0/keypoint_locations_int32'
            ]
            # data = [workspace.FetchBlob(k) for k in data_to_save]
            # cPickle.dump(data, open('inter_kps_data.pkl', 'wb'))

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
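
The commented-out hook above fetches intermediate blobs and pickles them. A small self-contained version of that idea, using caffe2's workspace.FetchBlob (the helper name and output path are illustrative, not part of the example):

# Illustrative helper: dump a list of Caffe2 workspace blobs to a pickle file
# for offline inspection.
import pickle
from caffe2.python import workspace

def dump_blobs(blob_names, out_path):
    # FetchBlob returns each blob as a numpy array (or a serialized string).
    data = {name: workspace.FetchBlob(name) for name in blob_names}
    with open(out_path, 'wb') as f:
        pickle.dump(data, f)

# Example usage inside the training loop, mirroring the commented-out hook:
# dump_blobs(['gpu_0/data', 'gpu_0/keypoint_rois'], 'inter_kps_data.pkl')
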
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        # Debug hooks (disabled): dump input data and conv1 activations.
        # np.save('DensePoseData/image.npy', workspace.FetchBlob('gpu_0/data'))
        # np.save('DensePoseData/output.npy', workspace.FetchBlob('conv1'))
        # np.save('DensePoseData/outputgpu.npy', workspace.FetchBlob('gpu_0/conv1'))

        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            logger.info('Checkpoint period reached; saving model')
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
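
create_model returns start_iter and any existing checkpoints so that interrupted runs can resume. A minimal sketch, as an illustrative helper rather than Detectron's actual logic, of how the newest 'model_iter{N}.pkl' file in output_dir could be located:

# Illustrative helper (assumed name): find the most recent checkpoint so
# training can resume from start_iter = N + 1.
import os
import re

def find_latest_checkpoint(output_dir):
    latest_iter, latest_path = -1, None
    pattern = re.compile(r'^model_iter(\d+)\.pkl$')
    for fname in os.listdir(output_dir):
        m = pattern.match(fname)
        if m and int(m.group(1)) > latest_iter:
            latest_iter = int(m.group(1))
            latest_path = os.path.join(output_dir, fname)
    return latest_iter, latest_path
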
Example #4
def train_model(max_iters, roidb, pretrained_weight):
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model(
        pretrained_weight)
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir, roidb)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, max_iters):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints[max_iters - 1] = os.path.join(
        output_dir, 'model_iter{}.pkl'.format(max_iters - 1))
    nu.save_model_to_weights_file(checkpoints[max_iters - 1], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
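
This variant takes the roidb, iteration budget, and pretrained weights explicitly. A hypothetical call site, assuming the roidb is built with Detectron's combined_roidb_for_training and the weights come from cfg.TRAIN.WEIGHTS (the surrounding wiring is an assumption for illustration):

# Hypothetical call site for the variant above (illustrative, not from the source).
from detectron.core.config import cfg
from detectron.datasets.roidb import combined_roidb_for_training

roidb = combined_roidb_for_training(cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
checkpoints = train_model(max_iters=cfg.SOLVER.MAX_ITER, roidb=roidb,
                          pretrained_weight=cfg.TRAIN.WEIGHTS)
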
Example #5
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter, lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter)
            )
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
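
All of these variants rely on TrainingStats for iteration timing and for the smoothed iter_total_loss that feeds the NaN check. A simplified stand-in showing the windowed-median idea (not Detectron's actual class):

# Minimal sketch of smoothed-loss bookkeeping: keep a window of recent losses
# and report their median so single noisy iterations do not dominate the log.
from collections import deque
import numpy as np

class SmoothedLoss(object):
    def __init__(self, window_size=20):
        self.series = deque(maxlen=window_size)

    def update(self, value):
        self.series.append(value)

    def median(self):
        return np.median(list(self.series)) if self.series else float('nan')
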
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    try:
        model, weights_file, start_iter, checkpoints, output_dir = \
            create_model()
        if 'final' in checkpoints:
            # The final model was found in the output directory, so nothing to do
            return checkpoints

        setup_model_for_training(model, weights_file, output_dir)
        training_stats = TrainingStats(model, cfg.TRAIN.LOG_PERIOD)
        CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)

        # Initialize an empty per-iteration log; serialized to JSON at the end
        json_train_log = []

        # Early-exit hook used during setup/debugging (disabled): shut down the
        # loader and return before any training iterations run.
        # model.roi_data_loader.shutdown()
        # return 0
    except Exception as e:
        with open("/output/prep_log.txt", "a") as f:
            f.write("\n" + output_dir + " failed to start training \n" +
                    str(e))
        exit()

    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr, json_train_log)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)

    # Save training log-file
    log_path = os.path.join(output_dir, 'train_log.json')
    json_train_log = {
        'info': {
            'batch_size': cfg.TRAIN.IMS_PER_BATCH,
            'num_gpus': cfg.NUM_GPUS,
            'max_iterations': cfg.SOLVER.MAX_ITER,
            'datasets': cfg.TRAIN.DATASETS
        },
        'data': json_train_log
    }
    with open(log_path, 'w') as f:
        json.dump(json_train_log, f)

    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
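
The variant above serializes json_train_log to train_log.json with an 'info' header and a 'data' list of per-iteration entries. A minimal sketch of reading it back for plotting, assuming each entry carries at least an iteration index and a loss (the field names are assumptions, not confirmed by the source):

# Illustrative helper (assumed log entry fields): load train_log.json and pull
# out per-iteration losses.
import json

def load_train_log(log_path):
    with open(log_path) as f:
        log = json.load(f)
    info, data = log['info'], log['data']
    iters = [entry.get('iter') for entry in data]
    losses = [entry.get('loss') for entry in data]
    return info, iters, losses
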
Example #7
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints

    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    # Optional debug: render the network graph to a PNG.
    # graph = net_drawer.GetPydotGraph(model.net.Proto().op, "mnist", rankdir="LR")
    # graph.write_png('graph.png')

    # Run until either cfg.SOLVER.MAX_ITER iterations (when MAX_EPOCH == -1) or
    # cfg.SOLVER.MAX_EPOCH epochs have completed; see the break conditions at
    # the bottom of the loop.
    cur_iter = start_iter
    while True:
        training_stats.IterTic()
        if cfg.SOLVER.MAX_EPOCH == -1:
            lr = model.UpdateWorkspaceLr(cur_iter,
                                         lr_policy.get_lr_at_iter(cur_iter))
        else:
            lr = model.UpdateWorkspaceLr(
                training_stats.cur_epoch,
                lr_policy.get_lr_at_epoch(training_stats))

        workspace.RunNet(model.net.Proto().name)
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        if (cur_iter + 1) % CHECKPOINT_PERIOD == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

        if cfg.SOLVER.MAX_EPOCH == -1 and cur_iter == cfg.SOLVER.MAX_ITER:
            break

        if cfg.SOLVER.MAX_EPOCH != -1 and training_stats.cur_epoch == cfg.SOLVER.MAX_EPOCH + 1:
            break

        cur_iter += 1

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
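
This variant can also run for a fixed number of epochs, driving the learning rate through lr_policy.get_lr_at_epoch. A minimal sketch of an epoch-based step-decay schedule in that spirit (a hypothetical helper with assumed hyperparameter names, not the project's code):

# Illustrative sketch: decay the learning rate by gamma at listed epoch boundaries.
def get_lr_at_epoch_sketch(cur_epoch, base_lr=0.02, gamma=0.1,
                           decay_epochs=(8, 11)):
    step_index = sum(1 for e in decay_epochs if cur_epoch >= e)
    return base_lr * (gamma ** step_index)
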
Example #8
def train_model():
    """Model training loop."""
    logger = logging.getLogger(__name__)
    model, weights_file, start_iter, checkpoints, output_dir = create_model()
    if 'final' in checkpoints:
        # The final model was found in the output directory, so nothing to do
        return checkpoints
    if 0:
        # Debug override (disabled): force a specific output directory.
        output_dir = '/home/icubic/daily_work/code/Detectron/train/coco_2014_train_ET_PH_part/generalized_rcnn_multi/'
    setup_model_for_training(model, weights_file, output_dir)
    training_stats = TrainingStats(model)
    # Name of the data loader's blobs queue; fetched below for debugging.
    blobs_queue_name = model.roi_data_loader._blobs_queue_name
    CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS)
    for cur_iter in range(start_iter, cfg.SOLVER.MAX_ITER):
        training_stats.IterTic()
        lr = model.UpdateWorkspaceLr(cur_iter,
                                     lr_policy.get_lr_at_iter(cur_iter))
        #aaa_debug = workspace.FetchBlob('gpu_0/data')
        #bbb_debug = workspace.FetchBlob('gpu_0/conv1_w')
        #ccc_debug = workspace.FetchBlob('gpu_0/'+uuuu)
        try:
            workspace.RunNet(model.net.Proto().name)

            if 0:
                # Disabled debug path: re-collect the FPN RPN proposals from
                # the workspace and rebuild the Fast R-CNN training blobs on
                # the Python side.
                inputs = [
                    workspace.FetchBlob('gpu_0/rpn_rois_fpn{}'.format(lvl))
                    for lvl in range(2, 7)
                ] + [
                    workspace.FetchBlob('gpu_0/rpn_roi_probs_fpn{}'.format(lvl))
                    for lvl in range(2, 7)
                ] + [
                    workspace.FetchBlob('gpu_0/roidb'),
                    workspace.FetchBlob('gpu_0/im_info'),
                ]
                rois = collect(inputs, True)
                im_info = inputs[-1]
                im_scales = im_info[:, 2]
                roidb = blob_utils.deserialize(inputs[-2])
                # For historical consistency with the original Faster R-CNN
                # implementation we are *not* filtering crowd proposals.
                # This choice should be investigated in the future (it likely
                # does not matter).
                json_dataset.add_proposals(roidb, rois, im_scales, crowd_thresh=0)
                roidb_utils.add_bbox_regression_targets(roidb)
                # Compute training labels for the RPN proposals; also handles
                # distributing the proposals over FPN levels.
                output_blob_names = fast_rcnn_roi_data.get_fast_rcnn_blob_names()
                blobs = {k: [] for k in output_blob_names}
                fast_rcnn_roi_data.add_fast_rcnn_blobs(blobs, im_scales, roidb)
                # NOTE: 'outputs' is undefined here; this block mirrors the
                # Python op it was copied from and is kept disabled.
                for i, k in enumerate(output_blob_names):
                    blob_utils.py_op_copy_blob(blobs[k], outputs[i])
        except Exception:
            # Fetch a few blobs to aid post-mortem debugging of the failed step.
            rpn_rois_fpn2 = workspace.FetchBlob('gpu_0/rpn_rois_fpn2')
            data_blob = workspace.FetchBlob('gpu_0/data')
            logger.exception('workspace.RunNet failed at iteration %d', cur_iter)
        #print("blobs:\n{}".format(workspace.Blobs()))
        #print('train.py   aaaaaaaa_debug')
        if 1:

            aaa = workspace.FetchBlob("gpu_0/data")  # nchw
            #img = aaa[1].copy()
            # BGR HWC -> CHW  12
            #transform_img = img.swapaxes(0, 1).swapaxes(1, 2)

            #cv2.imshow("image0 ", transform_img[:, :, (2, 1, 0)])

            #cv2.waitKey(0)
            #cv2.destroyAllWindows()
            #cv2.imshow('/home/icubic/daily_work/code/Detectron/aaa.png', aaa[0])
            aaa_debug = workspace.FetchBlob('gpu_0/data')
            bbb_debug = workspace.FetchBlob('gpu_0/conv1_w')
            ccc_debug = workspace.FetchBlob('gpu_0/' + uuuu)
            ddd_debug = workspace.FetchBlob('gpu_0/roidb')
            eee_debug = workspace.FetchBlob('gpu_0/im_info')
            #print("Fetched data:\n{}".format(workspace.FetchBlob("gpu_0/data")))
        if cur_iter == start_iter:
            nu.print_net(model)
        training_stats.IterToc()
        training_stats.UpdateIterStats()
        training_stats.LogIterStats(cur_iter, lr)

        # Checkpoint four times as often as cfg.TRAIN.SNAPSHOT_ITERS would imply.
        if (cur_iter + 1) % (CHECKPOINT_PERIOD // 4) == 0 and cur_iter > start_iter:
            checkpoints[cur_iter] = os.path.join(
                output_dir, 'model_iter_50_{}.pkl'.format(cur_iter))
            nu.save_model_to_weights_file(checkpoints[cur_iter], model)

        if cur_iter == start_iter + training_stats.LOG_PERIOD:
            # Reset the iteration timer to remove outliers from the first few
            # SGD iterations
            training_stats.ResetIterTimer()

        if np.isnan(training_stats.iter_total_loss):
            logger.critical('Loss is NaN, exiting...')
            model.roi_data_loader.shutdown()
            envu.exit_on_error()

    # Save the final model
    checkpoints['final'] = os.path.join(output_dir, 'model_final_50.pkl')
    nu.save_model_to_weights_file(checkpoints['final'], model)
    # Shutdown data loading threads
    model.roi_data_loader.shutdown()
    return checkpoints
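
The debug fetches in this example pull the raw 'gpu_0/data' blob, which is NCHW, BGR, and mean-subtracted. A minimal sketch of converting one such image back to a displayable array, assuming Detectron's default pixel means (an illustrative helper, not part of the example):

# Illustrative helper (assumed pixel means): recover a cv2-displayable BGR
# image from the mean-subtracted NCHW data blob.
import numpy as np

def blob_to_image(data_blob, index=0,
                  pixel_means=np.array([102.9801, 115.9465, 122.7717])):
    img = data_blob[index]                    # CHW, BGR, float32
    img = img.transpose(1, 2, 0)              # -> HWC
    img = img + pixel_means                   # undo mean subtraction
    return np.clip(img, 0, 255).astype(np.uint8)  # ready for cv2.imshow (BGR)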