Example #1
def collect_network_statistics(net):

    status_net = []
    net_grads_graph = []

    state = net.state_dict()
    for param_name, tensor in state.items():

        # record the parameter name, its L2 norm, and its shape
        status_net.append([
            param_name, tensor.norm().item(),
            list(tensor.size())
        ])

        # flatten the tensor to a 1-D list so a single min and max can
        # be taken whatever its rank (works for 4-D conv weights and
        # 0-D tensors alike)
        flat_net_grads = tensor.cpu().numpy().ravel().tolist()

        net_grads_graph.append([min(flat_net_grads), max(flat_net_grads)])

    return net_grads_graph
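
A minimal usage sketch (the model and layer sizes below are illustrative assumptions, not part of the example; note that status_net is collected but not returned):

import torch.nn as nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
ranges = collect_network_statistics(net)
for lo, hi in ranges:
    print('min %f  max %f' % (lo, hi))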
Example #2
def evaluate():

    total = 0
    correct = 0

    with torch.no_grad():
        for data in testloader:
            inputs, labels = data
            outputs = net(inputs)
            # argmax per sample (dim=1), not over the flattened batch
            predictions = torch.argmax(outputs, dim=1)
            total += labels.size(0)
            correct += (predictions == labels).sum().item()

    accuracy = correct / total
    print('Accuracy: %f' % accuracy)
    os.makedirs('saved_states', exist_ok=True)  # avoid a crash if the folder is missing
    path = os.path.join(os.getcwd(), 'saved_states', str(accuracy) + '.pt')
    torch.save(net.state_dict(), path)
    print('Saved State')
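
evaluate() reads net and testloader from module scope; a minimal sketch of that surrounding setup (the dataset, transform, and model here are assumptions for illustration, not the original script's):

import os
import torch
import torchvision
import torchvision.transforms as transforms

transform = transforms.ToTensor()
testset = torchvision.datasets.MNIST('.', train=False, download=True,
                                     transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64)
net = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10))

evaluate()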
Example #3
def get_network_grads(net):
    weight_string = "weight"
    bias_string = "bias"
    output_gradients = []
    output_names = []

    # keep only parameter names that contain "weight" or "bias"
    parameters_names = [
        name for name in net.state_dict().keys()
        if re.search(weight_string, name) or re.search(bias_string, name)
    ]

    for name, param in net.named_parameters():
        # skip filtered-out parameters and those with no gradient yet
        if name in parameters_names and param.grad is not None:
            # flatten the gradient to a 1-D list so min/median/max can
            # be taken whatever the tensor's rank
            flat_net_grads = param.grad.cpu().numpy().ravel().tolist()

            output_gradients.append([
                min(flat_net_grads),
                np.median(flat_net_grads),
                max(flat_net_grads)
            ])
            output_names.append(name)

    return output_gradients, output_names
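
get_network_grads() expects gradients to be populated, so call it after a backward pass; a minimal sketch (the model and dummy loss are illustrative assumptions; the imports also cover the re and np names the function uses):

import re
import numpy as np
import torch
import torch.nn as nn

net = nn.Linear(3, 2)                # hypothetical model
loss = net(torch.randn(5, 3)).sum()  # dummy forward pass
loss.backward()                      # populates param.grad

grads, names = get_network_grads(net)
for name, (g_min, g_med, g_max) in zip(names, grads):
    print('%s: min %f  median %f  max %f' % (name, g_min, g_med, g_max))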
Example #4
def run_train():

    out_dir = RESULTS_DIR + '/mask-rcnn-gray-011a-debug'
    initial_checkpoint = \
        RESULTS_DIR + '/mask-rcnn-gray-011a-debug/checkpoint/00072200_model.pth'

    pretrain_file = None  # ImageNet pretrain weights
    ## setup  -----------------
    os.makedirs(out_dir + '/checkpoint', exist_ok=True)
    os.makedirs(out_dir + '/train', exist_ok=True)
    os.makedirs(out_dir + '/backup', exist_ok=True)
    backup_project_as_zip(PROJECT_PATH,
                          out_dir + '/backup/code.train.%s.zip' % IDENTIFIER)

    log = Logger()
    log.open(out_dir + '/log.train.txt', mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('** some experiment setting **\n')
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\tPROJECT_PATH = %s\n' % PROJECT_PATH)
    log.write('\tout_dir      = %s\n' % out_dir)
    log.write('\n')

    ## net ----------------------
    log.write('** net setting **\n')
    cfg = Configuration()
    net = MaskRcnnNet(cfg).cuda()

    if initial_checkpoint is not None:
        log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint)
        net.load_state_dict(
            torch.load(initial_checkpoint,
                       map_location=lambda storage, loc: storage))

    elif pretrain_file is not None:
        log.write('\tpretrained_file = %s\n' % pretrain_file)
        #load_pretrain_file(net, pretrain_file)

    log.write('%s\n\n' % (type(net)))
    log.write('\n')

    ## optimiser ----------------------------------
    iter_accum = 1
    batch_size = 4  ##NUM_CUDA_DEVICES*512 #256//iter_accum #512 #2*288//iter_accum

    num_iters = 1000 * 1000
    iter_smooth = 20
    iter_log = 50
    iter_valid = 100
    iter_save = [0, num_iters - 1] + list(range(0, num_iters, 100))

    LR = None  #LR = StepLR([ (0, 0.01),  (200, 0.001),  (300, -1)])
    optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                          lr=0.001 / iter_accum,
                          momentum=0.9,
                          weight_decay=0.0001)

    start_iter = 0
    start_epoch = 0.
    if initial_checkpoint is not None:
        checkpoint = torch.load(
            initial_checkpoint.replace('_model.pth', '_optimizer.pth'))
        start_iter = checkpoint['iter']
        start_epoch = checkpoint['epoch']
        #optimizer.load_state_dict(checkpoint['optimizer'])

    ## dataset ----------------------------------------
    log.write('** dataset setting **\n')

    train_dataset = XXXXXDataset(
        #'train1_ids_gray_only1_500', mode='train',
        'valid1_ids_gray_only1_43',
        mode='train',
        transform=train_augment)
    train_loader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        #sampler = ConstantSampler(train_dataset,list(range(16))),
        batch_size=batch_size,
        drop_last=True,
        num_workers=4,
        pin_memory=True,
        collate_fn=train_collate)

    valid_dataset = ScienceDataset(
        'valid1_ids_gray_only1_43',
        mode='train',
        #'debug1_ids_gray_only1_10', mode='train',
        transform=valid_augment)
    valid_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=batch_size,
                              drop_last=False,
                              num_workers=4,
                              pin_memory=True,
                              collate_fn=train_collate)

    log.write('\ttrain_dataset.split = %s\n' % (train_dataset.split))
    log.write('\tvalid_dataset.split = %s\n' % (valid_dataset.split))
    log.write('\tlen(train_dataset)  = %d\n' % (len(train_dataset)))
    log.write('\tlen(valid_dataset)  = %d\n' % (len(valid_dataset)))
    log.write('\tlen(train_loader)   = %d\n' % (len(train_loader)))
    log.write('\tlen(valid_loader)   = %d\n' % (len(valid_loader)))
    log.write('\tbatch_size  = %d\n' % (batch_size))
    log.write('\titer_accum  = %d\n' % (iter_accum))
    log.write('\tbatch_size*iter_accum  = %d\n' % (batch_size * iter_accum))
    log.write('\n')

    #log.write(inspect.getsource(train_augment)+'\n')
    #log.write(inspect.getsource(valid_augment)+'\n')
    #log.write('\n')

    if 0:  #<debug>
        for inputs, truth_boxes, truth_labels, truth_instances, indices in valid_loader:

            batch_size, C, H, W = inputs.size()
            print(batch_size)

            images = inputs.cpu().numpy()
            for b in range(batch_size):
                image = (images[b].transpose((1, 2, 0)) * 255)
                image = np.clip(image.astype(np.float32) * 3, 0, 255)

                image1 = image.copy()

                truth_box = truth_boxes[b]
                truth_label = truth_labels[b]
                truth_instance = truth_instances[b]
                if truth_box is not None:
                    for box, label, instance in zip(truth_box, truth_label,
                                                    truth_instance):
                        x0, y0, x1, y1 = box.astype(np.int32)
                        cv2.rectangle(image, (x0, y0), (x1, y1), (0, 0, 255),
                                      1)
                        print(label)

                        thresh = instance > 0.5
                        contour = thresh_to_inner_contour(thresh)
                        contour = contour.astype(np.float32) * 0.5

                        overlay = contour[:, :, np.newaxis]
                        image1 = overlay * np.array((0, 255, 0)) \
                                 + (1 - overlay) * image1

                    print('')

                image_show('image', image)
                image_show('image1', image1)
                cv2.waitKey(0)

    ## start training here! ##############################################
    log.write('** start training here! **\n')
    log.write(' optimizer=%s\n' % str(optimizer))
    log.write(' momentum=%f\n' % optimizer.param_groups[0]['momentum'])
    log.write(' LR=%s\n\n' % str(LR))

    log.write(' images_per_epoch = %d\n\n' % len(train_dataset))
    log.write(
        ' rate    iter   epoch  num   | valid_loss                           | train_loss                           | batch_loss                           |  time    \n'
    )
    log.write(
        '------------------------------------------------------------------------------------------------------------------------------------------------------------------\n'
    )

    train_loss = np.zeros(6, np.float32)
    train_acc = 0.0
    valid_loss = np.zeros(6, np.float32)
    valid_acc = 0.0
    batch_loss = np.zeros(6, np.float32)
    batch_acc = 0.0
    rate = 0

    start = timer()
    j = 0
    i = 0

    while i < num_iters:  # loop over the dataset multiple times
        sum_train_loss = np.zeros(6, np.float32)
        sum_train_acc = 0.0
        num_batches = 0  # batches accumulated since the last smoothing reset

        net.set_mode('train')
        optimizer.zero_grad()
        for inputs, truth_boxes, truth_labels, truth_instances, indices in train_loader:
            batch_size = len(indices)
            i = j // iter_accum + start_iter  # global iteration index (integer division)
            epoch = (i - start_iter) * batch_size * iter_accum / len(
                train_dataset) + start_epoch
            num_products = epoch * len(train_dataset)

            if i % iter_valid == 0:
                net.set_mode('valid')
                valid_loss, valid_acc = evaluate(net, valid_loader)
                net.set_mode('train')

                print('\r', end='', flush=True)
                log.write('%0.4f %5.1f k %6.2f %4.1f m | %0.3f   %0.2f %0.2f   %0.2f %0.2f   %0.2f | %0.3f   %0.2f %0.2f   %0.2f %0.2f   %0.2f | %0.3f   %0.2f %0.2f   %0.2f %0.2f   %0.2f | %s\n' % (\
                         rate, i/1000, epoch, num_products/1000000,
                         valid_loss[0], valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4], valid_loss[5], #valid_acc,
                         train_loss[0], train_loss[1], train_loss[2], train_loss[3], train_loss[4], train_loss[5], #train_acc,
                         batch_loss[0], batch_loss[1], batch_loss[2], batch_loss[3], batch_loss[4], batch_loss[5], #batch_acc,
                         time_to_str((timer() - start)/60)))
                time.sleep(0.01)

            #if 1:
            if i in iter_save:
                torch.save(net.state_dict(),
                           out_dir + '/checkpoint/%08d_model.pth' % (i))
                torch.save(
                    {
                        'optimizer': optimizer.state_dict(),
                        'iter': i,
                        'epoch': epoch,
                    }, out_dir + '/checkpoint/%08d_optimizer.pth' % (i))

            # learning rate scheduler -------------
            if LR is not None:
                lr = LR.get_rate(i)
                if lr < 0: break
                adjust_learning_rate(optimizer, lr / iter_accum)
            rate = get_learning_rate(optimizer)[0] * iter_accum

            # one iteration update  -------------
            inputs = Variable(inputs).cuda()
            net(inputs, truth_boxes, truth_labels, truth_instances)
            loss = net.loss(inputs, truth_boxes, truth_labels, truth_instances)

            if 1:  #<debug>
                debug_and_draw(net,
                               inputs,
                               truth_boxes,
                               truth_labels,
                               truth_instances,
                               mode='test')

            # masks  = (probs>0.5).float()
            # acc    = dice_loss(masks, labels)

            # accumulated update
            loss.backward()
            if j % iter_accum == 0:
                #torch.nn.utils.clip_grad_norm(net.parameters(), 1)
                optimizer.step()
                optimizer.zero_grad()

            # print statistics  ------------
            batch_acc = 0  #acc[0][0]
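            # each loss term is a 1-element tensor in the pre-0.4
            # PyTorch API this script targets (hence the Variable
            # wrapper above and the [0] indexing below)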
            batch_loss = np.array((
                loss.cpu().data.numpy()[0],
                net.rpn_cls_loss.cpu().data.numpy()[0],
                net.rpn_reg_loss.cpu().data.numpy()[0],
                net.rcnn_cls_loss.cpu().data.numpy()[0],
                net.rcnn_reg_loss.cpu().data.numpy()[0],
                net.mask_cls_loss.cpu().data.numpy()[0],
            ))
            sum_train_loss += batch_loss
            sum_train_acc += batch_acc
            num_batches += 1
            if i % iter_smooth == 0:
                train_loss = sum_train_loss / num_batches
                train_acc = sum_train_acc / num_batches
                sum_train_loss = np.zeros(6, np.float32)
                sum_train_acc = 0.
                num_batches = 0

            print('\r%0.4f %5.1f k %6.2f %4.1f m | %0.3f   %0.2f %0.2f   %0.2f %0.2f   %0.2f | %0.3f   %0.2f %0.2f   %0.2f %0.2f   %0.2f | %0.3f   %0.2f %0.2f   %0.2f %0.2f   %0.2f | %s  %d,%d,%s' % (\
                         rate, i/1000, epoch, num_products/1000000,
                         valid_loss[0], valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4], valid_loss[5], #valid_acc,
                         train_loss[0], train_loss[1], train_loss[2], train_loss[3], train_loss[4], train_loss[5], #train_acc,
                         batch_loss[0], batch_loss[1], batch_loss[2], batch_loss[3], batch_loss[4], batch_loss[5], #batch_acc,
                         time_to_str((timer() - start)/60) ,i,j, str(inputs.size())), end='',flush=True)
            j = j + 1

        pass  #-- end of one data loader --
    pass  #-- end of all iterations --

    if 1:  #save last
        torch.save(net.state_dict(),
                   out_dir + '/checkpoint/%d_model.pth' % (i))
        torch.save(
            {
                'optimizer': optimizer.state_dict(),
                'iter': i,
                'epoch': epoch,
            }, out_dir + '/checkpoint/%d_optimizer.pth' % (i))

    log.write('\n')
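
run_train() depends on module-level constants and helpers (RESULTS_DIR, IDENTIFIER, Logger, the dataset and net classes, and so on); assuming those are defined in the same script, a typical entry point would be:

if __name__ == '__main__':
    run_train()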